Import the required libraries
%matplotlib inline
import numpy as np
#pandas for reading/writing .csv files
import pandas as pd
from sklearn.datasets import load_boston
#train_test_split is used to split the dataset
from sklearn.model_selection import train_test_split
Inspect the dataset
#load_boston loads the dataset
boston=load_boston()
#Print the dataset description
print(boston.DESCR)
#X holds the features of the boston dataset, y the corresponding house prices
X=boston.data
y=boston.target
Split the dataset
#Split into a 75% training set and a 25% test set; random_state fixes the randomness
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=33,test_size=0.25)
#Check the shapes of the splits
print("Train dataset's shape:",X_train.shape)
print("Test dataset's shape:",X_test.shape)
Data preprocessing
#Examine the spread of the target values. House prices differ widely, so standardize both the features and the target first
#Print the maximum, minimum and mean of the target values
print("The maximum target value is:",np.max(boston.target))
print("The minimum target value is:",np.min(boston.target))
print("The mean target value is:",np.mean(boston.target))
#Import the standardization module
from sklearn.preprocessing import StandardScaler
#Standardize the data: create scaler objects and use their fit_transform method
ss_X=StandardScaler()
ss_y=StandardScaler()
#Use fit_transform to standardize the features and the target. In Python 3, y_train is a 1-D np.array of shape (N,); reshape(-1,1) turns it into an (N,1) 2-D np.array: -1 means any number of rows, 1 means one column
X_train=ss_X.fit_transform(X_train)
X_test=ss_X.transform(X_test)
y_train = ss_y.fit_transform(y_train.reshape(-1, 1))
y_test = ss_y.transform(y_test.reshape(-1,1))
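As a quick sanity check (a minimal sketch, not part of the original walkthrough), the standardized features should now have roughly zero mean and unit variance per column, and y_train should be 2-D after the reshape:

#Standardized columns: mean ~0, std ~1
print(np.allclose(X_train.mean(axis=0), 0))  # True
print(np.allclose(X_train.std(axis=0), 1))   # True
print(y_train.shape)  # (379, 1) for the 506-sample Boston data with a 75/25 split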
Build the models
#Import the linear regression model
from sklearn.linear_model import LinearRegression
#Initialize LinearRegression with its default configuration to build a simple linear model
lr=LinearRegression()
lr.fit(X_train,y_train)
#做预测
lr_y_predict=lr.predict(X_test)
#Model evaluation: use the score method built into LinearRegression and print the result
print("lr_score:",lr.score(X_test,y_test))
#Import R-squared, MSE and MAE from sklearn.metrics; use R-squared, MSE (mean squared error) and MAE (mean absolute error) to evaluate the regression performance
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error
#Compute r2_score
print("r2_score:",r2_score(y_test,lr_y_predict))
#Compute the mean squared error; inverse_transform maps the standardized values back to the original target scale
print("MSE:",mean_squared_error(ss_y.inverse_transform(y_test),ss_y.inverse_transform(lr_y_predict)))
#Compute the mean absolute error
print("MAE:",mean_absolute_error(ss_y.inverse_transform(y_test),ss_y.inverse_transform(lr_y_predict)))
Output:
lr_score: 0.6763403831
r2_score: 0.6763403831
MSE: 25.0969856921
MAE: 3.5261239964
#Initialize an SGDRegressor linear model (max_iter=5 caps the number of training epochs)
from sklearn.linear_model import SGDRegressor
sgdr=SGDRegressor(max_iter=5)
#Fit the model; y_train.ravel() flattens the (N,1) 2-D np.array into a 1-D np.array
sgdr.fit(X_train,y_train.ravel())
sgdr_y_predict=sgdr.predict(X_test)
#Use the model's built-in score method and print the result
print("sgdr_score:",sgdr.score(X_test,y_test))
print("r2_score:",r2_score(y_test,sgdr_y_predict))
print("r2_score:",r2_score(ss_y.inverse_transform(y_test),ss_y.inverse_transform(sgdr_y_predict)))
print("MSE:",mean_squared_error(ss_y.inverse_transform(y_test),ss_y.inverse_transform(sgdr_y_predict)))
print("MAE:",mean_absolute_error(ss_y.inverse_transform(y_test),ss_y.inverse_transform(sgdr_y_predict)))
Output:
sgdr_score: 0.657641746361
r2_score: 0.657641746361
r2_score: 0.657641746361
MSE: 26.5469021914
MAE: 3.50598660941
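Note that sgdr_score and the two r2_score values agree even though one is computed on the standardized scale and one on the original scale: R-squared is invariant under an affine rescaling of the target, because the residual and total sums of squares are scaled by the same factor. A minimal numpy sketch (illustrative, not from the original notebook):

#R^2 is unchanged when y_true and y_pred go through the same affine map
y_true = np.array([1.0, 2.0, 3.0, 4.0])
y_pred = np.array([1.1, 1.9, 3.2, 3.8])
print(r2_score(y_true, y_pred))              # 0.98
print(r2_score(5*y_true + 2, 5*y_pred + 2))  # 0.98 as well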
#Support vector regression from sklearn.svm
from sklearn.svm import SVR
#Linear kernel
linear_svr=SVR(kernel='linear')
linear_svr.fit(X_train,y_train.ravel())
linear_svr_y_predict=linear_svr.predict(X_test)
print("linear_svm_score:",linear_svr.score(X_test,y_test.ravel()))
print("r2_score:",r2_score(y_test,linear_svr_y_predict))
print("MSE:",mean_squared_error(ss_y.inverse_transform(y_test),ss_y.inverse_transform(linear_svr_y_predict)))
print("MAE:",mean_absolute_error(ss_y.inverse_transform(y_test),ss_y.inverse_transform(linear_svr_y_predict)))
Output:
linear_svm_score: 0.65171709743
r2_score: 0.65171709743
MSE: 27.0063071393
MAE: 3.42667291687
#Polynomial kernel
poly_svr=SVR(kernel='poly')
poly_svr.fit(X_train,y_train.ravel())
poly_svr_y_predict=poly_svr.predict(X_test)
print("poly_svm_score:",poly_svr.score(X_test,y_test.ravel()))
print("r2_score:",r2_score(y_test,poly_svr_y_predict))
print("MSE:",mean_squared_error(ss_y.inverse_transform(y_test),ss_y.inverse_transform(poly_svr_y_predict)))
print("MAE:",mean_absolute_error(ss_y.inverse_transform(y_test),ss_y.inverse_transform(poly_svr_y_predict)))
Output:
poly_svm_score: 0.404454058003
r2_score: 0.404454058003
MSE: 46.179403314
MAE: 3.75205926674
#RBF (radial basis function) kernel
rbf_svr=SVR(kernel='rbf')
rbf_svr.fit(X_train,y_train.ravel())
rbf_svr_y_predict=rbf_svr.predict(X_test)
print("rbf_svm_score:",rbf_svr.score(X_test,y_test.ravel()))
print("r2_score:",r2_score(y_test,rbf_svr_y_predict))
print("MSE:",mean_squared_error(ss_y.inverse_transform(y_test),ss_y.inverse_transform(rbf_svr_y_predict)))
print("MAE:",mean_absolute_error(ss_y.inverse_transform(y_test),ss_y.inverse_transform(rbf_svr_y_predict)))
Output:
rbf_svm_score: 0.756406891227
r2_score: 0.756406891227
MSE: 18.8885250008
MAE: 2.60756329798
#K-nearest-neighbor regression
#Import KNeighborsRegressor from sklearn.neighbors
from sklearn.neighbors import KNeighborsRegressor
#Initialize the KNN regressor configured to predict by plain averaging: weights='uniform'
uni_knr = KNeighborsRegressor(weights='uniform')
uni_knr.fit(X_train, y_train.ravel())
uni_knr_y_predict = uni_knr.predict(X_test)
print("uni_knr_score:",uni_knr.score(X_test,y_test.ravel()))
print("r2_score:",r2_score(y_test,uni_knr_y_predict))
print("MSE:",mean_squared_error(ss_y.inverse_transform(y_test),ss_y.inverse_transform(uni_knr_y_predict)))
print("MAE:",mean_absolute_error(ss_y.inverse_transform(y_test),ss_y.inverse_transform(uni_knr_y_predict)))
Output:
uni_knr_score: 0.690345456461
r2_score: 0.690345456461
MSE: 24.0110141732
MAE: 2.96803149606
#Initialize the KNN regressor configured to predict by distance-weighted averaging: weights='distance'
dis_knr = KNeighborsRegressor(weights='distance')
dis_knr.fit(X_train, y_train.ravel())
dis_knr_y_predict = dis_knr.predict(X_test)
print("dis_knr_score:",dis_knr.score(X_test,y_test.ravel()))
print("r2_score:",r2_score(y_test,dis_knr_y_predict))
print("MSE:",mean_squared_error(ss_y.inverse_transform(y_test),ss_y.inverse_transform(dis_knr_y_predict)))
print("MAE:",mean_absolute_error(ss_y.inverse_transform(y_test),ss_y.inverse_transform(dis_knr_y_predict)))
Output:
dis_knr_score: 0.719758997016
r2_score: 0.719758997016
MSE: 21.7302501609
MAE: 2.80505687851
# Decision tree regression: import DecisionTreeRegressor from sklearn.tree
from sklearn.tree import DecisionTreeRegressor
# Initialize DecisionTreeRegressor with its default configuration
dtr = DecisionTreeRegressor()
dtr.fit(X_train, y_train.ravel())
dtr_y_predict = dtr.predict(X_test)
print("dtr_score:",dtr.score(X_test,y_test.ravel()))
print("r2_score:",r2_score(y_test,dtr_y_predict))
print("MSE:",mean_squared_error(ss_y.inverse_transform(y_test),ss_y.inverse_transform(dtr_y_predict)))
print("MAE:",mean_absolute_error(ss_y.inverse_transform(y_test),ss_y.inverse_transform(dtr_y_predict)))
Output:
dtr_score: 0.724186586586
r2_score: 0.724186586586
MSE: 21.3869291339
MAE: 2.90866141732
#Random forest regression (RFR), extra trees regression (ETR) and gradient boosted regression trees (GBRT)
# Import RandomForestRegressor, ExtraTreesRegressor and GradientBoostingRegressor from sklearn.ensemble
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
# Train a RandomForestRegressor and store the test-set predictions in rfr_y_predict
rfr = RandomForestRegressor()
rfr.fit(X_train, y_train.ravel())
rfr_y_predict = rfr.predict(X_test)
print("rfr_score:",rfr.score(X_test,y_test.ravel()))
print("r2_score:",r2_score(y_test,rfr_y_predict))
print("MSE:",mean_squared_error(ss_y.inverse_transform(y_test),ss_y.inverse_transform(rfr_y_predict)))
print("MAE:",mean_absolute_error(ss_y.inverse_transform(y_test),ss_y.inverse_transform(rfr_y_predict)))
Output:
rfr_score: 0.809357176431
r2_score: 0.809357176431
MSE: 14.7826913386
MAE: 2.45228346457
# Train an ExtraTreesRegressor with its default configuration
etr = ExtraTreesRegressor()
etr.fit(X_train, y_train.ravel())
etr_y_predict = etr.predict(X_test)
print("etr_score:",etr.score(X_test,y_test.ravel()))
print("r2_score:",r2_score(y_test,etr_y_predict))
print("MSE:",mean_squared_error(ss_y.inverse_transform(y_test),ss_y.inverse_transform(etr_y_predict)))
print("MAE:",mean_absolute_error(ss_y.inverse_transform(y_test),ss_y.inverse_transform(etr_y_predict)))
Output:
etr_score: 0.788650201156
r2_score: 0.788650201156
MSE: 16.3883370079
MAE: 2.47952755906
# Train a GradientBoostingRegressor (GBRT)
gbr = GradientBoostingRegressor()
gbr.fit(X_train, y_train.ravel())
gbr_y_predict = gbr.predict(X_test)
print("gbr_score:",gbr.score(X_test,y_test.ravel()))
print("r2_score:",r2_score(y_test,gbr_y_predict))
print("MSE:",mean_squared_error(ss_y.inverse_transform(y_test),ss_y.inverse_transform(gbr_y_predict)))
print("MAE:",mean_absolute_error(ss_y.inverse_transform(y_test),ss_y.inverse_transform(gbr_y_predict)))
Output:
gbr_score: 0.840064661194
r2_score: 0.840064661194
MSE: 12.4015932173
MAE: 2.28207298939
#Train an AdaBoostRegressor with its default configuration
from sklearn.ensemble import AdaBoostRegressor
abr = AdaBoostRegressor()
abr.fit(X_train,y_train.ravel())
abr_y_predict = abr.predict(X_test)
print("abr_score:",abr.score(X_test,y_test.ravel()))
print("r2_score:",r2_score(y_test,abr_y_predict))
print("MSE:",mean_squared_error(ss_y.inverse_transform(y_test),ss_y.inverse_transform(abr_y_predict)))
print("MAE:",mean_absolute_error(ss_y.inverse_transform(y_test),ss_y.inverse_transform(abr_y_predict)))
Output:
abr_score: 0.783785004026
r2_score: 0.783785004026
MSE: 16.7655906917
MAE: 2.89216618687
#Train an XGBRegressor with its default configuration
from xgboost import XGBRegressor
xgb = XGBRegressor()
xgb.fit(X_train,y_train.ravel())
xgb_y_predict = xgb.predict(X_test)
print("xgbr_score:",xgb.score(X_test,y_test.ravel()))
print("r2_score:",r2_score(y_test,xgb_y_predict))
print("MSE:",mean_squared_error(ss_y.inverse_transform(y_test),ss_y.inverse_transform(xgb_y_predict)))
print("MAE:",mean_absolute_error(ss_y.inverse_transform(y_test),ss_y.inverse_transform(xgb_y_predict)))
Output:
xgbr_score: 0.843513197537
r2_score: 0.843513197537
MSE: 12.1341891225
MAE: 2.29020682132
#Build a lightgbm model
import lightgbm as lgb
params = {
    'num_leaves': 6,
    'objective': 'regression',
    'min_data_in_leaf': 16,
    'learning_rate': 0.04,
    'feature_fraction': 0.97,
    'bagging_fraction': 0.97,
    'bagging_freq': 1,
    'metric': 'l2',
    'num_threads': 1
}
dtrain = lgb.Dataset(X_train, label=y_train.ravel())
#Train the model
lgb_ = lgb.train(params,dtrain,1000)
#Make predictions
lgb_y_predict = lgb_.predict(X_test)
print("r2_score:",r2_score(y_test,lgb_y_predict))
print("MSE:",mean_squared_error(ss_y.inverse_transform(y_test),ss_y.inverse_transform(lgb_y_predict)))
print("MAE:",mean_absolute_error(ss_y.inverse_transform(y_test),ss_y.inverse_transform(lgb_y_predict)))
Output:
r2_score: 0.845939419188
MSE: 11.9460568773
MAE: 2.1404921479
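lightgbm also ships a scikit-learn-style wrapper, which makes it interchangeable with the estimators above. A minimal sketch using LGBMRegressor with approximately the same configuration; the parameter mapping is an assumption based on lightgbm's documented aliases:

#Assumed aliases: min_child_samples=min_data_in_leaf, colsample_bytree=feature_fraction,
#subsample=bagging_fraction, subsample_freq=bagging_freq
from lightgbm import LGBMRegressor
lgbm = LGBMRegressor(num_leaves=6, learning_rate=0.04, n_estimators=1000,
                     min_child_samples=16, colsample_bytree=0.97,
                     subsample=0.97, subsample_freq=1, n_jobs=1)
lgbm.fit(X_train, y_train.ravel())
print("r2_score:", r2_score(y_test, lgbm.predict(X_test)))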
Feature importances
#Collect the feature importances from each tree-based model
dtrimp = dtr.feature_importances_
rfrimp = rfr.feature_importances_
etrimp = etr.feature_importances_
gbrimp = gbr.feature_importances_
abrimp = abr.feature_importances_
xgbimp = xgb.feature_importances_
#Build a dict of the importances
d = {'Decision Tree':dtrimp, 'Random Forest':rfrimp, 'Gradient Boost':gbrimp,'Ada boost':abrimp, 'Extra Tree':etrimp,'Xgb boost':xgbimp}
#Put the dict into a DataFrame so it can be written out as .csv later
features = pd.DataFrame(data = d)
# Preview the first few rows
features.head()
#Add two new columns to the DataFrame: "mean" and "names"
features['mean'] = features.mean(axis= 1)
features['names'] = boston.feature_names
#Write the DataFrame to a csv file
features.to_csv("features_2.csv")
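With %matplotlib inline already enabled, the mean importances can also be plotted directly. A minimal sketch (the 'mean' and 'names' columns are the ones built above; assumes a matplotlib version that accepts string x-values in bar):

import matplotlib.pyplot as plt
#Bar chart of the mean feature importance across the six tree-based models
features_sorted = features.sort_values('mean', ascending=False)
plt.figure(figsize=(10, 4))
plt.bar(features_sorted['names'], features_sorted['mean'])
plt.ylabel('mean importance')
plt.title('Mean feature importance (Boston housing)')
plt.show()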
Summary
GBRT, LightGBM, RFR and XGBR perform best.
Almost every model here uses its default configuration with no hyperparameter tuning; cross-validation could be used to select parameters (a sketch follows below).
How to write a np.array to a csv file: build a dict, convert it to a DataFrame, then call to_csv; the same DataFrame also shows how to add new columns before writing.
Food for thought: why do the ensemble models perform best?
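As an example of the cross-validated parameter selection mentioned above, here is a minimal sketch tuning the rbf SVR with GridSearchCV; the grid values are illustrative assumptions, not tuned choices:

from sklearn.model_selection import GridSearchCV
#Illustrative grid; the C and gamma ranges are assumptions
param_grid = {'C': [0.1, 1, 10], 'gamma': [0.01, 0.1, 1]}
gs = GridSearchCV(SVR(kernel='rbf'), param_grid, cv=5, scoring='r2')
gs.fit(X_train, y_train.ravel())
print("best params:", gs.best_params_)
print("best CV r2:", gs.best_score_)
print("test r2:", gs.score(X_test, y_test.ravel()))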