经验首页 前端设计 程序设计 Java相关 移动开发 数据库/运维 软件/图像 大数据/云计算 其他经验
当前位置:技术经验 » 数据库/运维 » Spark » 查看文章
Spark随机森林实战
来源:cnblogs  作者:云山之巅  时间:2018/9/25 19:12:08  对本文有异议
  1. 1 package big.data.analyse.ml.randomforest
  2. 2
  3. 3 import org.apache.spark.ml.Pipeline
  4. 4 import org.apache.spark.ml.classification.{RandomForestClassificationModel, RandomForestClassifier}
  5. 5 import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
  6. 6 import org.apache.spark.ml.feature.{IndexToString, VectorIndexer, StringIndexer}
  7. 7 import org.apache.spark.sql.SparkSession
  8. 8
  9. 9 /**
  10. 10 * 随机森林
  11. 11 * Created by zhen on 2018/9/20.
  12. 12 */
  /**
   * Random forest classification demo built on the Spark ML Pipeline API.
   *
   * Loads a LIBSVM file, indexes labels and features, trains a 10-tree
   * random forest on a random 70% of the rows, then prints sample
   * predictions, the test error on the remaining 30% and the learned trees.
   */
  object RandomForest {

    /**
     * Program entry point.
     *
     * @param args command-line arguments; not used
     */
    def main(args: Array[String]): Unit = {
      // Create the Spark session (local mode, two worker threads).
      val spark = SparkSession.builder()
        .appName("RandomForest")
        .master("local[2]")
        .getOrCreate()

      // Everything else runs inside try/finally so the session is stopped
      // even when loading, training or evaluation throws.
      try {
        // Load the data set; the stages below read its "label" and "features" columns.
        val data = spark.read.format("libsvm")
          .load("src/big/data/analyse/ml/randomforest/randomforest.txt")

        // Index the label column. Fitted on the whole data set so that every
        // label value seen in either split gets an index.
        val labelIndexer = new StringIndexer()
          .setInputCol("label")
          .setOutputCol("indexedLabel")
          .fit(data)

        // Index the feature vector. setMaxCategories(4) is NOT a tree-depth
        // setting: features with at most 4 distinct values are treated as
        // categorical, all others as continuous.
        val featureIndexer = new VectorIndexer()
          .setInputCol("features")
          .setOutputCol("indexedFeatures")
          .setMaxCategories(4)
          .fit(data)

        // Split into training and test sets (7:3). No seed is passed, so the
        // split and therefore the reported error vary from run to run.
        val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3))
        testData.show(5)

        // Configure the forest: 10 trees. maxDepth is not set here, so Spark's
        // default applies (5 according to the Spark ML documentation).
        val randomForest = new RandomForestClassifier()
          .setLabelCol("indexedLabel")
          .setFeaturesCol("indexedFeatures")
          .setNumTrees(10)

        // Map predicted label indices back to the original label values.
        val labelConverter = new IndexToString()
          .setInputCol("prediction")
          .setOutputCol("predictedLabel")
          .setLabels(labelIndexer.labels)

        // Chain the indexers, the classifier and the label converter in one pipeline.
        val pipeline = new Pipeline()
          .setStages(Array(labelIndexer, featureIndexer, randomForest, labelConverter))

        // Train the model.
        val model = pipeline.fit(trainingData)

        // Predict on the held-out test set.
        val predictions = model.transform(testData)

        // Show a few predictions.
        predictions.select("predictedLabel", "label", "features").show(5)

        // Evaluate accuracy on the test set and report the error rate.
        val evaluator = new MulticlassClassificationEvaluator()
          .setLabelCol("indexedLabel")
          .setPredictionCol("prediction")
          .setMetricName("accuracy")

        val accuracy = evaluator.evaluate(predictions)
        println(s"test error = ${1.0 - accuracy}")

        // Find the trained forest by type instead of casting a positional
        // stage, so reordering the pipeline stages cannot cause a ClassCastException.
        model.stages
          .collectFirst { case forest: RandomForestClassificationModel => forest }
          .foreach { forest =>
            println(s"learned classification forest model:\n${forest.toDebugString}")
          }
      } finally {
        spark.stop()
      }
    }
  }

使用数据(LIBSVM 格式,每行为“标签 特征索引:特征值 …”):

  1. 0 128:51 129:159 130:253 131:159 132:50 155:48 156:238 157:252 158:252
  2. 1 159:124 160:253 161:255 162:63 186:96 187:244 188:251 189:253 190:62
  3. 1 125:145 126:255 127:211 128:31 152:32 153:237 154:253 155:252 156:71
  4. 1 153:5 154:63 155:197 181:20 182:254 183:230 184:24 209:20 210:254
  5. 1 152:1 153:168 154:242 155:28 180:10 181:228 182:254 183:100 209:190
  6. 0 130:64 131:253 132:255 133:63 157:96 158:205 159:251 160:253 161:205
  7. 1 159:121 160:254 161:136 186:13 187:230 188:253 189:248 190:99 213:4
  8. 1 100:166 101:222 102:55 128:197 129:254 130:218 131:5 155:29 156:249
  9. 0 155:53 156:255 157:253 158:253 159:253 160:124 183:180 184:253 185:25
  10. 0 128:73 129:253 130:227 131:73 132:21 156:73 157:251 158:251 159:251
  11. 1 155:178 156:255 157:105 182:6 183:188 184:253 185:216 186:14 210:14
  12. 0 154:46 155:105 156:254 157:254 158:254 159:254 160:255 161:239 162:41
  13. 0 152:56 153:105 154:220 155:254 156:63 178:18 179:166 180:233 181:253
  14. 1 130:7 131:176 132:254 133:224 158:51 159:253 160:253 161:223 185:4
  15. 0 155:21 156:176 157:253 158:253 159:124 182:105 183:176 184:251 185:25
  16. 1 151:68 152:45 153:131 154:131 155:131 156:101 157:68 158:92 159:44
  17. 0 125:29 126:170 127:255 128:255 129:141 151:29 152:198 153:255 154:255
  18. 0 153:203 154:254 155:252 156:252 157:252 158:214 159:51 160:20 180:62
  19. 1 98:64 99:191 100:70 125:68 126:243 127:253 128:249 129:63 152:30
  20. 1 125:26 126:240 127:72 153:25 154:238 155:208 182:209 183:226 184:14
  21. 0 155:62 156:91 157:213 158:255 159:228 160:91 161:12 182:70 183:230
  22. 1 157:42 158:228 159:253 160:253 185:144 186:251 187:251 188:251 212:89
  23. 1 128:62 129:254 130:213 156:102 157:253 158:252 159:102 160:20 184:102
  24. 0 154:28 155:195 156:254 157:254 158:254 159:254 160:254 161:255 162:61
  25. 0 123:8 124:76 125:202 126:254 127:255 128:163 129:37 130:2 150:13
  26. 0 127:68 128:254 129:255 130:254 131:107 153:11 154:176 155:230 156:253
  27. 1 157:85 158:255 159:103 160:1 185:205 186:253 187:253 188:30 213:205
  28. 1 126:94 127:132 154:250 155:250 156:4 182:250 183:254 184:95 210:250
  29. 1 124:32 125:253 126:31 152:32 153:251 154:149 180:32 181:251 182:188
  30. 1 129:39 130:254 131:255 132:254 133:140 157:136 158:253 159:253 160:22
  31. 0 123:59 124:55 149:71 150:192 151:254 152:250 153:147 154:17 176:123
  32. 1 128:58 129:139 156:247 157:247 158:25 183:121 184:253 185:156 186:3
  33. 1 129:28 130:247 131:255 132:165 156:47 157:221 158:252 159:252 160:164
  34. 0 156:13 157:6 181:10 182:77 183:145 184:253 185:190 186:67 207:11
  35. 0 127:28 128:164 129:254 130:233 131:148 132:11 154:3 155:164 156:254
  36. 0 129:105 130:255 131:219 132:67 133:67 134:52 156:20 157:181 158:253
  37. 0 125:22 126:183 127:252 128:254 129:252 130:252 131:252 132:76 151:85
  38. 1 155:114 156:206 157:25 183:238 184:252 185:55 211:222 212:252 213:55
  39. 1 127:73 128:253 129:253 130:63 155:115 156:252 157:252 158:144 183:217
  40. 1 120:85 121:253 122:132 123:9 147:82 148:241 149:251 150:251 151:128
  41. 1 126:15 127:200 128:255 129:90 154:42 155:254 156:254 157:173 182:42
  42. 0 182:32 183:57 184:57 185:57 186:57 187:57 188:57 189:57 208:67 209:18
  43. 0 127:42 128:235 129:255 130:84 153:15 154:132 155:208 156:253 157:253
  44. 1 156:202 157:253 158:69 184:253 185:252 186:121 212:253 213:252 214:69
  45. 1 156:73 157:253 158:253 159:253 160:124 184:73 185:251 186:251 187:251
  46. 1 124:111 125:255 126:48 152:162 153:253 154:237 155:63 180:206 181:253
  47. 0 99:70 100:255 101:165 102:114 127:122 128:253 129:253 130:253 131:120
  48. 1 124:29 125:197 126:255 127:84 152:85 153:251 154:253 155:83 180:86
  49. 1 159:31 160:210 161:253 162:163 187:198 188:252 189:252 190:162 213:10
  50. 1 131:159 132:255 133:122 158:167 159:228 160:253 161:121 185:64 186:23
  51. 0 153:92 154:191 155:178 156:253 157:242 158:141 159:104 160:29 180:26
  52. 1 128:53 129:250 130:255 131:25 156:167 157:253 158:253 159:25 182:3
  53. 0 122:63 123:176 124:253 125:253 126:159 127:113 128:63 150:140 151:253
  54. 0 153:12 154:136 155:254 156:255 157:195 158:115 159:3 180:6 181:175
  55. 1 128:255 129:253 130:57 156:253 157:251 158:225 159:56 183:169 184:254
  56. 0 151:23 152:167 153:208 154:254 155:255 156:129 157:19 179:151 180:253
  57. 1 130:24 131:150 132:233 133:38 156:14 157:89 158:253 159:254 160:254
  58. 0 125:120 126:253 127:253 128:63 151:38 152:131 153:246 154:252 155:252
  59. 1 127:155 128:253 129:126 155:253 156:251 157:141 158:4 183:253 184:251
  60. 0 101:88 102:127 103:5 126:19 127:58 128:20 129:14 130:217 131:19 152:7
  61. 0 127:37 128:141 129:156 130:156 131:194 132:194 133:47 153:11 154:132
  62. 0 154:32 155:134 156:218 157:254 158:254 159:254 160:217 161:84 176:44
  63. 1 124:102 125:252 126:252 127:41 152:102 153:250 154:250 155:202 180:10
  64. 0 124:20 125:121 126:197 127:253 128:64 151:23 152:200 153:252 154:252
  65. 1 127:20 128:254 129:255 130:37 155:19 156:253 157:253 158:134 183:19
  66. 0 235:40 236:37 238:7 239:77 240:137 241:136 242:136 243:136 244:136
  67. 1 128:166 129:255 130:187 131:6 156:165 157:253 158:253 159:13 183:15
  68. 1 128:117 129:128 155:2 156:199 157:127 183:81 184:254 185:87 211:116
  69. 1 129:159 130:142 156:11 157:220 158:141 184:78 185:254 186:141 212:111
  70. 0 124:66 125:254 126:254 127:58 128:60 129:59 130:59 131:50 151:73
  71. 1 129:101 130:222 131:84 157:225 158:252 159:84 184:89 185:246 186:208
  72. 0 124:41 125:254 126:254 127:157 128:34 129:34 130:218 131:255 132:206
  73. 0 96:56 97:247 98:121 124:24 125:242 126:245 127:122 153:231 154:253
  74. 0 125:19 126:164 127:253 128:255 129:253 130:118 131:59 132:36 153:78
  75. 1 129:232 130:255 131:107 156:58 157:244 158:253 159:106 184:95 185:253
  76. 1 127:63 128:128 129:2 155:63 156:254 157:123 183:63 184:254 185:179
  77. 1 130:131 131:255 132:184 133:15 157:99 158:247 159:253 160:182 161:15
  78. 0 125:57 126:255 127:253 128:198 129:85 153:168 154:253 155:251 156:253
  79. 0 127:12 128:105 129:224 130:255 131:247 132:22 155:131 156:254 157:254
  80. 1 130:226 131:247 132:55 157:99 158:248 159:254 160:230 161:30 185:125
  81. 1 130:166 131:253 132:124 133:53 158:140 159:251 160:251 161:180 185:12
  82. 1 129:17 130:206 131:229 132:44 157:2 158:125 159:254 160:123 185:95
  83. 1 130:218 131:253 132:124 157:84 158:236 159:251 160:251 184:63 185:236
  84. 1 124:102 125:180 126:1 152:140 153:254 154:130 180:140 181:254 182:204
  85. 0 128:87 129:208 130:249 155:27 156:212 157:254 158:195 182:118 183:225
  86. 1 126:134 127:230 154:133 155:231 156:10 182:133 183:253 184:96 210:133
  87. 1 125:29 126:85 127:255 128:139 153:197 154:251 155:253 156:251 181:254
  88. 1 125:149 126:255 127:254 128:58 153:215 154:253 155:183 156:2 180:41
  89. 1 130:79 131:203 132:141 157:51 158:240 159:240 160:140 185:88 186:252
  90. 1 126:94 127:254 128:75 154:166 155:253 156:231 182:208 183:253 184:147
  91. 0 127:46 128:105 129:254 130:254 131:224 132:59 133:59 134:9 155:196
  92. 1 125:42 126:232 127:254 128:58 153:86 154:253 155:253 156:58 181:86
  93. 1 156:60 157:229 158:38 184:187 185:254 186:78 211:121 212:252 213:254
  94. 1 101:11 102:150 103:72 129:37 130:251 131:71 157:63 158:251 159:71
  95. 0 127:45 128:254 129:254 130:254 131:148 132:24 133:9 154:43 155:254
  96. 0 125:218 126:253 127:253 128:255 129:149 130:62 151:42 152:144 153:236
  97. 0 127:60 128:96 129:96 130:48 153:16 154:171 155:228 156:253 157:251
  98. 0 126:32 127:202 128:255 129:253 130:253 131:175 132:21 152:84 153:144
  99. 1 130:218 131:170 132:108 157:32 158:227 159:252 160:232 185:129 186:25
  100. 1 130:116 131:255 132:123 157:29 158:213 159:253 160:122 185:189 186:25

结果(测试集&预测集):

  

 内部决策树结构:

  1. test error = 0.34375
  2. learned classification forest model:
  3. RandomForestClassificationModel (uid=rfc_0487ba2e1907) with 10 trees
  4. Tree 0 (weight 1.0):
  5. If (feature 185 <= 0.0)
  6. If (feature 157 <= 253.0)
  7. If (feature 149 <= 0.0)
  8. If (feature 210 in {3.0})
  9. Predict: 0.0
  10. Else (feature 210 not in {3.0})
  11. If (feature 208 in {2.0})
  12. Predict: 0.0
  13. Else (feature 208 not in {2.0})
  14. Predict: 0.0
  15. Else (feature 149 > 0.0)
  16. Predict: 1.0
  17. Else (feature 157 > 253.0)
  18. Predict: 1.0
  19. Else (feature 185 > 0.0)
  20. If (feature 160 <= 0.0)
  21. If (feature 180 <= 0.0)
  22. Predict: 0.0
  23. Else (feature 180 > 0.0)
  24. Predict: 1.0
  25. Else (feature 160 > 0.0)
  26. Predict: 0.0
  27. Tree 1 (weight 1.0):
  28. If (feature 156 <= 253.0)
  29. If (feature 187 <= 0.0)
  30. If (feature 133 in {2.0})
  31. Predict: 1.0
  32. Else (feature 133 not in {2.0})
  33. If (feature 100 <= 11.0)
  34. If (feature 128 <= 139.0)
  35. Predict: 0.0
  36. Else (feature 128 > 139.0)
  37. Predict: 1.0
  38. Else (feature 100 > 11.0)
  39. Predict: 1.0
  40. Else (feature 187 > 0.0)
  41. Predict: 0.0
  42. Else (feature 156 > 253.0)
  43. Predict: 1.0
  44. Tree 2 (weight 1.0):
  45. If (feature 158 <= 51.0)
  46. If (feature 182 <= 0.0)
  47. If (feature 127 <= 58.0)
  48. If (feature 129 <= 142.0)
  49. If (feature 154 <= 253.0)
  50. Predict: 0.0
  51. Else (feature 154 > 253.0)
  52. Predict: 1.0
  53. Else (feature 129 > 142.0)
  54. Predict: 1.0
  55. Else (feature 127 > 58.0)
  56. Predict: 1.0
  57. Else (feature 182 > 0.0)
  58. Predict: 0.0
  59. Else (feature 158 > 51.0)
  60. If (feature 127 <= 62.0)
  61. Predict: 0.0
  62. Else (feature 127 > 62.0)
  63. Predict: 1.0
  64. Tree 3 (weight 1.0):
  65. If (feature 100 <= 11.0)
  66. If (feature 127 <= 0.0)
  67. If (feature 151 <= 162.0)
  68. If (feature 159 <= 0.0)
  69. If (feature 125 <= 48.0)
  70. Predict: 0.0
  71. Else (feature 125 > 48.0)
  72. Predict: 0.0
  73. Else (feature 159 > 0.0)
  74. Predict: 0.0
  75. Else (feature 151 > 162.0)
  76. Predict: 1.0
  77. Else (feature 127 > 0.0)
  78. If (feature 131 <= 0.0)
  79. If (feature 153 <= 42.0)
  80. Predict: 0.0
  81. Else (feature 153 > 42.0)
  82. If (feature 154 <= 228.0)
  83. Predict: 1.0
  84. Else (feature 154 > 228.0)
  85. Predict: 0.0
  86. Else (feature 131 > 0.0)
  87. Predict: 1.0
  88. Else (feature 100 > 11.0)
  89. Predict: 1.0
  90. Tree 4 (weight 1.0):
  91. If (feature 152 <= 0.0)
  92. If (feature 158 <= 0.0)
  93. If (feature 151 <= 0.0)
  94. Predict: 1.0
  95. Else (feature 151 > 0.0)
  96. Predict: 0.0
  97. Else (feature 158 > 0.0)
  98. If (feature 182 <= 15.0)
  99. If (feature 153 <= 0.0)
  100. Predict: 0.0
  101. Else (feature 153 > 0.0)
  102. Predict: 1.0
  103. Else (feature 182 > 15.0)
  104. Predict: 1.0
  105. Else (feature 152 > 0.0)
  106. If (feature 124 <= 0.0)
  107. Predict: 1.0
  108. Else (feature 124 > 0.0)
  109. If (feature 123 <= 24.0)
  110. If (feature 125 <= 232.0)
  111. Predict: 0.0
  112. Else (feature 125 > 232.0)
  113. Predict: 1.0
  114. Else (feature 123 > 24.0)
  115. Predict: 0.0
  116. Tree 5 (weight 1.0):
  117. If (feature 157 <= 0.0)
  118. If (feature 101 <= 0.0)
  119. If (feature 129 <= 0.0)
  120. If (feature 183 <= 0.0)
  121. If (feature 152 <= 231.0)
  122. Predict: 0.0
  123. Else (feature 152 > 231.0)
  124. Predict: 0.0
  125. Else (feature 183 > 0.0)
  126. Predict: 0.0
  127. Else (feature 129 > 0.0)
  128. Predict: 1.0
  129. Else (feature 101 > 0.0)
  130. Predict: 1.0
  131. Else (feature 157 > 0.0)
  132. If (feature 155 <= 165.0)
  133. Predict: 0.0
  134. Else (feature 155 > 165.0)
  135. Predict: 1.0
  136. Tree 6 (weight 1.0):
  137. If (feature 153 <= 253.0)
  138. If (feature 125 <= 240.0)
  139. If (feature 158 <= 3.0)
  140. If (feature 182 <= 0.0)
  141. If (feature 179 <= 6.0)
  142. Predict: 1.0
  143. Else (feature 179 > 6.0)
  144. Predict: 0.0
  145. Else (feature 182 > 0.0)
  146. If (feature 128 <= 139.0)
  147. Predict: 0.0
  148. Else (feature 128 > 139.0)
  149. Predict: 0.0
  150. Else (feature 158 > 3.0)
  151. If (feature 155 <= 58.0)
  152. Predict: 0.0
  153. Else (feature 155 > 58.0)
  154. If (feature 175 in {1.0})
  155. Predict: 1.0
  156. Else (feature 175 not in {1.0})
  157. Predict: 0.0
  158. Else (feature 125 > 240.0)
  159. If (feature 129 <= 0.0)
  160. If (feature 154 <= 0.0)
  161. Predict: 1.0
  162. Else (feature 154 > 0.0)
  163. Predict: 0.0
  164. Else (feature 129 > 0.0)
  165. Predict: 1.0
  166. Else (feature 153 > 253.0)
  167. Predict: 1.0
  168. Tree 7 (weight 1.0):
  169. If (feature 131 <= 67.0)
  170. If (feature 155 <= 102.0)
  171. If (feature 129 <= 226.0)
  172. If (feature 129 <= 62.0)
  173. If (feature 127 <= 58.0)
  174. Predict: 0.0
  175. Else (feature 127 > 58.0)
  176. Predict: 1.0
  177. Else (feature 129 > 62.0)
  178. Predict: 0.0
  179. Else (feature 129 > 226.0)
  180. Predict: 1.0
  181. Else (feature 155 > 102.0)
  182. If (feature 128 <= 224.0)
  183. If (feature 184 <= 25.0)
  184. If (feature 157 <= 0.0)
  185. Predict: 1.0
  186. Else (feature 157 > 0.0)
  187. Predict: 1.0
  188. Else (feature 184 > 25.0)
  189. Predict: 0.0
  190. Else (feature 128 > 224.0)
  191. If (feature 131 <= 0.0)
  192. Predict: 0.0
  193. Else (feature 131 > 0.0)
  194. Predict: 1.0
  195. Else (feature 131 > 67.0)
  196. Predict: 0.0
  197. Tree 8 (weight 1.0):
  198. If (feature 182 <= 180.0)
  199. If (feature 179 <= 62.0)
  200. If (feature 128 <= 101.0)
  201. If (feature 156 <= 225.0)
  202. If (feature 149 <= 0.0)
  203. Predict: 0.0
  204. Else (feature 149 > 0.0)
  205. Predict: 1.0
  206. Else (feature 156 > 225.0)
  207. If (feature 155 <= 202.0)
  208. Predict: 0.0
  209. Else (feature 155 > 202.0)
  210. Predict: 1.0
  211. Else (feature 128 > 101.0)
  212. If (feature 183 <= 0.0)
  213. If (feature 128 <= 254.0)
  214. Predict: 1.0
  215. Else (feature 128 > 254.0)
  216. Predict: 0.0
  217. Else (feature 183 > 0.0)
  218. Predict: 0.0
  219. Else (feature 179 > 62.0)
  220. Predict: 0.0
  221. Else (feature 182 > 180.0)
  222. If (feature 156 <= 105.0)
  223. Predict: 0.0
  224. Else (feature 156 > 105.0)
  225. Predict: 1.0
  226. Tree 9 (weight 1.0):
  227. If (feature 96 in {1.0})
  228. Predict: 1.0
  229. Else (feature 96 not in {1.0})
  230. If (feature 185 <= 67.0)
  231. If (feature 160 <= 12.0)
  232. If (feature 178 in {1.0})
  233. Predict: 1.0
  234. Else (feature 178 not in {1.0})
  235. If (feature 126 <= 0.0)
  236. Predict: 0.0
  237. Else (feature 126 > 0.0)
  238. Predict: 1.0
  239. Else (feature 160 > 12.0)
  240. If (feature 155 <= 0.0)
  241. Predict: 0.0
  242. Else (feature 155 > 0.0)
  243. Predict: 1.0
  244. Else (feature 185 > 67.0)
  245. Predict: 0.0

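总结:可知该随机森林由10棵树组成(setNumTrees(10)),预测结果以10棵树的投票为准。代码中没有调用 setMaxDepth,因此每棵树的最大深度取 Spark 的默认值5,上面输出的树结构最深也正好是5层;限制树的深度是为了避免层次过高带来的计算压力和过拟合。需要注意,代码里的 setMaxCategories(4) 并不是树的最大层次,而是 VectorIndexer 的参数:取值种类不超过4的特征会被当作类别特征处理,其余特征按连续特征处理。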

 

 友情链接:直通硅谷  点职佳  北美留学生论坛

本站QQ群:前端 618073944 | Java 606181507 | Python 626812652 | C/C++ 612253063 | 微信 634508462 | 苹果 692586424 | C#/.net 182808419 | PHP 305140648 | 运维 608723728

W3xue 的所有内容仅供测试,对任何法律问题及风险不承担任何责任。通过使用本站内容随之而来的风险与本站无关。
关于我们  |  意见建议  |  捐助我们  |  报错有奖  |  广告合作、友情链接(目前9元/月)请联系QQ:27243702 沸活量
皖ICP备17017327号-2 皖公网安备34020702000426号