码迷,mamicode.com
首页 > 其他好文 > 详细

房价预测《进阶版,测试》

时间:2017-10-23 21:39:16      阅读:351      评论:0      收藏:0      [点我收藏+]

标签:learn   div   and   .sh   ima   adaboost   span   index   作图   

#coding=utf8

import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, AdaBoostRegressor
from xgboost import XGBRegressor

# Load the training and test tables. The first CSV column (the id) is used as
# the row index instead of being kept as a feature.
train_df = pd.read_csv('./input/train.csv', index_col=0)
test_df = pd.read_csv('./input/test.csv', index_col=0)

# Raw target next to its log1p transform; useful for eyeballing the two
# distributions, e.g. with prices.hist().
prices = pd.DataFrame({
    'price': train_df['SalePrice'],
    'log(price + 1)': np.log1p(train_df['SalePrice']),
})

# The models are fitted on log1p(SalePrice). pop() also removes the target
# column from train_df, so only feature columns remain in it.
y_train = np.log1p(train_df.pop('SalePrice'))

# Stack train and test rows so every feature transformation below is applied
# to both sets in one go.
all_df = pd.concat((train_df, test_df), axis=0)

# Show both indexes; they are used later to split the combined frame back
# into its train and test parts.
print(train_df.index)
print(test_df.index)

# --- Feature transformation ---
# MSSubClass holds category codes written as numbers. Numbers carry an order
# and a magnitude that would mislead the models, so cast the column to str to
# have it treated as categorical.
all_df['MSSubClass'] = all_df['MSSubClass'].astype(str)

# One-hot encode every categorical column in a single call.
all_dummy_df = pd.get_dummies(all_df)

# Missing values: fill each column with its own mean.
mean_cols = all_dummy_df.mean()
all_dummy_df = all_dummy_df.fillna(mean_cols)

# Standardize only the columns that were numeric to begin with; the 0/1
# one-hot columns are left untouched.
# select_dtypes is used instead of comparing dtypes against `object` so that
# string columns are excluded whatever dtype pandas uses to store them.
numeric_cols = all_df.select_dtypes(include=[np.number]).columns
numeric_col_means = all_dummy_df.loc[:, numeric_cols].mean()
numeric_col_std = all_dummy_df.loc[:, numeric_cols].std()
# Plain column assignment replaces the columns outright, so integer columns
# can take the float z-scores without an in-place dtype conflict.
all_dummy_df[numeric_cols] = (
    all_dummy_df.loc[:, numeric_cols] - numeric_col_means
) / numeric_col_std

# Split the combined frame back into train and test rows via the original
# indexes.
dummy_train_df = all_dummy_df.loc[train_df.index]
dummy_test_df = all_dummy_df.loc[test_df.index]
#print train_df.index
#print test_df.index
#print dummy_train_df.shape
#print dummy_test_df.shape
#print type(dummy_train_df)

# Plain NumPy matrices for the estimators.
X_train = dummy_train_df.values
X_test = dummy_test_df.values

print(X_train.shape)

# Ridge regression: scan 50 alphas on a log grid from 1e-3 to 1e2 and record
# the mean 10-fold cross-validated RMSE (on the log1p target) for each.
alphas = np.logspace(-3, 2, 50)
test_scores = []
for alpha in alphas:
    clf = Ridge(alpha=alpha)
    # The scorer returns negated MSE, hence the minus sign before the sqrt.
    test_score = np.sqrt(-cross_val_score(
        clf, X_train, y_train, cv=10, scoring='neg_mean_squared_error'))
    test_scores.append(np.mean(test_score))

# Draw on a fresh figure so this curve is not overlaid on any other plot.
plt.figure()
plt.plot(alphas, test_scores)
plt.title('Alpha vs CV Error')

# Random forest: scan the fraction of features considered at each split and
# record the mean 5-fold cross-validated RMSE (on the log1p target) for each.
max_features = [.1, .3, .5, .7, .9, .99]
test_scores = []
for max_feat in max_features:
    clf = RandomForestRegressor(n_estimators=200, max_features=max_feat)
    # The scorer returns negated MSE, hence the minus sign before the sqrt.
    test_score = np.sqrt(-cross_val_score(
        clf, X_train, y_train, cv=5, scoring='neg_mean_squared_error'))
    test_scores.append(np.mean(test_score))

# Draw on a fresh figure so this curve is not overlaid on any other plot.
plt.figure()
plt.plot(max_features, test_scores)
plt.title("Max Features vs CV Error")

# --- More advanced ensembles ---
# The ensemble classes can be used without an explicit base model (they fall
# back to their built-in default), but according to the original author the
# results are worse than with the tuned Ridge below; plotting both confirms it.
ridge = Ridge(alpha=15)

# Bagging: scan the number of bagged Ridge models and record the mean 10-fold
# cross-validated RMSE (on the log1p target) for each.
params = [1, 10, 15, 20, 25, 30, 40]
test_scores = []
for param in params:
    # The base model is passed positionally because its keyword name differs
    # between scikit-learn releases (`base_estimator` in older ones,
    # `estimator` in newer ones); the first positional slot is the same.
    clf = BaggingRegressor(ridge, n_estimators=param)
    # The scorer returns negated MSE, hence the minus sign before the sqrt.
    test_score = np.sqrt(-cross_val_score(
        clf, X_train, y_train, cv=10, scoring='neg_mean_squared_error'))
    test_scores.append(np.mean(test_score))

# Draw on a fresh figure so this curve is not overlaid on any other plot.
plt.figure()
plt.plot(params, test_scores)
plt.title("n_estimator vs CV Error")

# Boosting: scan the number of AdaBoost rounds on top of the tuned Ridge model
# and record the mean 10-fold cross-validated RMSE (on the log1p target).
params = [10, 15, 20, 25, 30, 35, 40, 45, 50]
test_scores = []
for param in params:
    # The base model is passed positionally because its keyword name differs
    # between scikit-learn releases (`base_estimator` in older ones,
    # `estimator` in newer ones); the first positional slot is the same.
    clf = AdaBoostRegressor(ridge, n_estimators=param)
    # The scorer returns negated MSE, hence the minus sign before the sqrt.
    test_score = np.sqrt(-cross_val_score(
        clf, X_train, y_train, cv=10, scoring='neg_mean_squared_error'))
    test_scores.append(np.mean(test_score))

# Draw on a fresh figure so this curve is not overlaid on any other plot.
plt.figure()
plt.plot(params, test_scores)
plt.title("n_estimator vs CV Error")

# XGBoost: scan the maximum tree depth and record the mean 10-fold
# cross-validated RMSE (on the log1p target) for each.
params = [1, 2, 3, 4, 5, 6]
test_scores = []
for param in params:
    clf = XGBRegressor(max_depth=param)
    # The scorer returns negated MSE, hence the minus sign before the sqrt.
    test_score = np.sqrt(-cross_val_score(
        clf, X_train, y_train, cv=10, scoring='neg_mean_squared_error'))
    test_scores.append(np.mean(test_score))

# Draw on a fresh figure so this curve is not overlaid on any other plot.
plt.figure()
plt.plot(params, test_scores)
plt.title("max_depth vs CV Error")

# When run as a script nothing is displayed until show() is called.
plt.show()


# The string below is a disabled snippet, not executed code: it fits the tuned
# Ridge and a random forest on the full training set, maps both predictions
# back from log space with expm1 and averages them into the final prediction.
"""
rf = RandomForestRegressor(n_estimators=500, max_features=.3)

ridge.fit(X_train, y_train)
rf.fit(X_train, y_train)

y_ridge = np.expm1(ridge.predict(X_test))
y_rf = np.expm1(rf.predict(X_test))
y_final = (y_ridge + y_rf) / 2
"""

 

房价预测《进阶版,测试》

标签:learn   div   and   .sh   ima   adaboost   span   index   作图   

原文地址:http://www.cnblogs.com/pengwang57/p/7718848.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!