#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Jun 8 09:27:08 2018
@author: luogan
"""
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import SparkSession
# Location of the LIBSVM-formatted sample data set that the script reads.
DATA_PATH = "/home/luogan/lg/softinstall/spark-2.2.0-bin-hadoop2.7/data/mllib/sample_libsvm_data.txt"

# Obtain the Spark session (created if none exists) used by every step below.
spark = SparkSession.builder.appName("dataFrame").getOrCreate()

try:
    # Load and parse the data file, converting it to a DataFrame.
    data = spark.read.format("libsvm").load(DATA_PATH)

    # Automatically identify categorical features, and index them.
    # Set maxCategories so features with > 4 distinct values are treated as
    # continuous.
    # NOTE(review): the indexer is fitted on the full data set, i.e. before the
    # train/test split below -- confirm this is intended.
    featureIndexer = VectorIndexer(
        inputCol="features",
        outputCol="indexedFeatures",
        maxCategories=4,
    ).fit(data)

    # Split the data into training and test sets (30% held out for testing).
    # NOTE: no seed is passed, so the split (and therefore the reported RMSE)
    # is not expected to be identical from one run to the next.
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Random forest regressor that reads the indexed feature vector.
    rf = RandomForestRegressor(featuresCol="indexedFeatures")

    # Chain indexer and forest in a Pipeline.
    pipeline = Pipeline(stages=[featureIndexer, rf])

    # Train model. This also runs the indexer.
    model = pipeline.fit(trainingData)

    # Make predictions on the held-out rows.
    predictions = model.transform(testData)

    # Select example rows to display.
    predictions.select("prediction", "label", "features").show(5)

    # Compare (prediction, true label) pairs and compute the test error.
    evaluator = RegressionEvaluator(
        labelCol="label", predictionCol="prediction", metricName="rmse")
    rmse = evaluator.evaluate(predictions)
    print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

    # stages[1] is the fitted counterpart of `rf`, the second pipeline stage.
    rfModel = model.stages[1]
    print(rfModel)  # summary only
finally:
    # Always release the Spark session, including when a step above fails;
    # the original script never stopped it.
    spark.stop()
结果:
+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       0.0|  0.0|(692,[95,96,97,12...|
|       0.3|  0.0|(692,[100,101,102...|
|       0.0|  0.0|(692,[123,124,125...|
|      0.05|  0.0|(692,[124,125,126...|
|       0.0|  0.0|(692,[124,125,126...|
+----------+-----+--------------------+
only showing top 5 rows

Root Mean Squared Error (RMSE) on test data = 0.127949
RandomForestRegressionModel (uid=RandomForestRegressor_4acc9ab165e4f84f7169) with 20 trees
原文:https://blog.csdn.net/luoganttcc/article/details/80618336
PySpark 分类模型训练 参考:
https://blog.csdn.net/u013719780/article/details/51792097
pyspark RandomForestRegressor 随机森林回归
原文:https://www.cnblogs.com/Allen-rg/p/10046583.html