df_train.shape

(10886, 12)

准备训练集数据,测试集数据:
1. df_train_target：目标,也就是count字段。
2. df_train_data：用于产出特征的数据

# Target vector: the `count` column, i.e. the value the models must predict.
df_train_target = df_train['count'].values
print(df_train_target.shape)
# Feature matrix: every remaining column of df_train, as a plain array.
df_train_data = df_train.drop(['count'], axis=1).values
print(df_train_data.shape)

(10886,)
(10886, 11)

算法
咱们依旧会使用交叉验证的方式（交叉验证集约占全部数据的20%）来看看模型的效果,
我们会试 支持向量回归/Support Vector Regression, 岭回归/Ridge Regression 和
随机森林回归/Random Forest Regressor。每个模型会跑3趟看平均的结果。

from sklearn import linear_model
from sklearn import cross_validation
from sklearn import svm
from sklearn.ensemble import RandomForestRegressor
from sklearn.learning_curve import learning_curve
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import explained_variance_score

# Split the data into train/validation folds: 3 random shuffles, each holding
# out 20% of the rows for validation.
# NOTE: this uses the legacy sklearn.cross_validation API (removed in
# scikit-learn 0.20); on newer versions the equivalent lives in
# sklearn.model_selection.
cv = cross_validation.ShuffleSplit(len(df_train_data), n_iter=3, test_size=0.2,
    random_state=0)

# Try each candidate model in turn.
# Each entry is (heading to print, zero-argument factory). A factory is used
# so that a fresh, unfitted estimator is built for every split.
candidates = [
    ("岭回归", linear_model.Ridge),
    ("支持向量回归/SVR(kernel=‘rbf‘,C=10,gamma=.001)",
     lambda: svm.SVR(kernel='rbf', C=10, gamma=.001)),
    ("随机森林回归/Random Forest(n_estimators = 100)",
     lambda: RandomForestRegressor(n_estimators=100)),
]

for title, make_model in candidates:
    print(title)
    # The same `cv` object is iterated once per model; `train` / `test` are
    # row-index arrays into df_train_data / df_train_target.
    for train, test in cv:
        model = make_model().fit(df_train_data[train], df_train_target[train])
        # .score() on a regressor reports R^2, on the training rows and on
        # the held-out rows respectively.
        print("train score: {0:.3f}, test score: {1:.3f}\n".format(
            model.score(df_train_data[train], df_train_target[train]),
            model.score(df_train_data[test], df_train_target[test])))

岭回归
train score: 0.339, test score: 0.332

train score: 0.330, test score: 0.370

train score: 0.342, test score: 0.320

支持向量回归/SVR(kernel=‘rbf‘,C=10,gamma=.001)
train score: 0.417, test score: 0.408

train score: 0.406, test score: 0.452

train score: 0.419, test score: 0.390

随机森林回归/Random Forest(n_estimators = 100)
train score: 0.981, test score: 0.867

train score: 0.981, test score: 0.880

train score: 0.981, test score: 0.869

随机森林回归获得了最佳结果
不过,参数设置得是不是最好的,这个我们可以用GridSearch来帮助测试,找最好的参数

X = df_train_data
y = df_train_target

# Hold out 20% of the rows; the grid search below only sees the other 80%.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X, y, test_size=0.2, random_state=0)

# Candidate forest sizes to compare.
tuned_parameters = [{'n_estimators': [10, 100, 500, 550]}]

# Scoring metrics to run the search with (one full search per metric).
scores = ['r2']

for score in scores:

    print(score)

    # 5-fold cross-validated search over tuned_parameters using `score`.
    clf = GridSearchCV(RandomForestRegressor(), tuned_parameters, cv=5, scoring=score)
    clf.fit(X_train, y_train)

    print("最佳参数找到了：")
    print("")
    # best_estimator_ returns the best estimator chosen by the search
    print(clf.best_estimator_)
    print("")
    print("得分分别是:")
    print("")
    # Each entry of grid_scores_ unpacks into:
    #    * a dict of parameter settings
    #    * the mean score over the cross-validation folds
    #    * the list of scores for each fold
    # The per-fold array is named `fold_scores` so it does not shadow the
    # outer `scores` list that this loop is nested inside.
    for params, mean_score, fold_scores in clf.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r"
              % (mean_score, fold_scores.std() / 2, params))
    print("")

r2
最佳参数找到了：

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=550, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

得分分别是:

0.846 (+/-0.006) for {'n_estimators': 10}
0.862 (+/-0.005) for {'n_estimators': 100}
0.863 (+/-0.005) for {'n_estimators': 500}
0.864 (+/-0.005) for {'n_estimators': 550}

Grid Search帮挑参数还是蛮方便的, 而且还要看看模型的状态, 是不是过拟合or欠拟合。
我们发现n_estimators=500,550时,拟合得最好。

	datetime	season	weather	temp	atemp	humidity	casual	registered	count	hour	day	month
0	2011-01-01 00:00:00	1	1	9.84	14.395	81	3	13	16	0	5	1
1	2011-01-01 01:00:00	1	1	9.02	13.635	80	8	32	40	1	5	1
2	2011-01-01 02:00:00	1	1	9.02	13.635	80	5	27	32	2	5	1
3	2011-01-01 03:00:00	1	1	9.84	14.395	75	3	10	13	3	5	1
4	2011-01-01 04:00:00	1	1	9.84	14.395	75	0	1	1	4	5	1

	season	weather	temp	atemp	humidity	count	month	day	hour
0	1	1	9.84	14.395	81	16	1	5	0
1	1	1	9.02	13.635	80	40	1	5	1
2	1	1	9.02	13.635	80	32	1	5	2
3	1	1	9.84	14.395	75	13	1	5	3
4	1	1	9.84	14.395	75	1	1	5	4

Kaggle 自行车租赁预测比赛项目实现