标签:
# coding=utf8
%matplotlib inline
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier as rf
from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import LogisticRegression as lr
import matplotlib.pyplot as plt
def show_importance_of_sub_model(total_size, factor_size, special_percent):
    """Score one-hot encoding against a sub-model feature on synthetic data.

    A synthetic binary-classification set is generated with five uniform
    random numeric columns and one categorical column ``type`` that has
    ``factor_size`` levels (``'type100'``, ``'type101'``, ...). The label is
    1 unless the sum of the first three numeric columns exceeds a threshold,
    in which case it is -1. The threshold is 0.6 for the first
    ``int(factor_size * special_percent)`` levels and 1.5 for the others.

    Two feature sets are scored with a 1000-tree random forest under
    10-fold cross-validation:

    * "dummy": the numeric columns plus the one-hot columns of ``type``;
    * "sub-model": the numeric columns plus one ``coef`` column holding,
      for each row, the coefficient its level received from an
      L1-penalised logistic regression fitted on the one-hot columns alone.

    Args:
        total_size: Requested number of rows. Each level gets
            ``total_size // factor_size`` rows, so the actual total can be
            slightly smaller when the division is not exact.
        factor_size: Number of levels of the categorical column.
        special_percent: Fraction of levels that use the 0.6 threshold.

    Returns:
        A two-element list ``[dummy_score, sub_model_score]`` with the mean
        cross-validation score of each feature set.
    """
    # Floor division: true division yields a float on Python 3, which is
    # not a valid array dimension.
    rows_per_level = total_size // factor_size
    special_count = int(factor_size * special_percent)

    frames = []
    labels = []
    for i in range(factor_size):
        features = pd.DataFrame(np.random.random((rows_per_level, 5)))
        features['type'] = 'type' + str(100 + i)
        threshold = 0.6 if i < special_count else 1.5
        signal = features[0] + features[1] + features[2]
        labels.append(np.where(signal > threshold, -1.0, 1.0))
        frames.append(features)

    # One concat with a fresh index instead of growing the frame in the loop.
    dt = pd.concat(frames, ignore_index=True)
    y = np.concatenate(labels)

    level_dummies = pd.get_dummies(dt['type'])
    numeric = dt.drop('type', axis=1).values
    # Plain float arrays are handed to scikit-learn: the frame would mix
    # integer and string column names, which recent versions reject.
    dummy_matrix = level_dummies.values.astype(float)
    X_ori = np.hstack((numeric, dummy_matrix))

    # liblinear supports the L1 penalty; the current default solver does not.
    clf = lr(penalty='l1', solver='liblinear')
    clf.fit(dummy_matrix, y)
    coef_by_level = dict(zip(level_dummies.columns, clf.coef_[0]))
    sub_feature = dt['type'].map(coef_by_level).values
    X_sub = np.column_stack((numeric, sub_feature))

    clf = rf(n_estimators=1000)
    return [
        cross_val_score(clf, X_ori, y, cv=10).mean(),
        cross_val_score(clf, X_sub, y, cv=10).mean(),
    ]
def performance_ev(factor_size=100, special_percent=0.2, total_size=10000):
    """Plot the scores of both encodings while sweeping one parameter.

    Exactly one of ``factor_size`` / ``special_percent`` is expected to be a
    list (or tuple) of values to sweep; the swept values become the x axis.
    For every value ``show_importance_of_sub_model`` is called and its two
    scores are drawn as the lines ``dummy`` and ``sub_model`` on the current
    matplotlib axes.

    Args:
        factor_size: Number of categorical levels, or a list/tuple of such
            numbers to sweep. When it is a sequence it takes precedence and
            ``special_percent`` must be a single number.
        special_percent: Fraction of "special" levels, or a list/tuple of
            fractions to sweep. A single number is treated as a one-point
            sweep.
        total_size: Requested number of rows per generated data set.

    Returns:
        None. The function only draws on the current matplotlib figure.
    """
    if isinstance(factor_size, (list, tuple)):
        x = list(factor_size)
        outcome = [
            show_importance_of_sub_model(total_size, size, special_percent)
            for size in x
        ]
    else:
        # A bare number (e.g. the default 0.2) used to be iterated directly
        # and raised TypeError; wrap it so the call still produces a result.
        if isinstance(special_percent, (list, tuple)):
            x = list(special_percent)
        else:
            x = [special_percent]
        outcome = [
            show_importance_of_sub_model(total_size, factor_size, percent)
            for percent in x
        ]

    performance = pd.DataFrame(outcome, columns=['dummy', 'sub_model'])
    # Markers keep a one-point sweep visible; the legend tells the lines apart.
    plt.plot(x, performance.values, marker='o')
    plt.legend(list(performance.columns))
# Notebook cell: sweep the number of levels of the categorical feature while
# special_percent and total_size keep their defaults.
# `%matplotlib inline` is an IPython magic, so this only runs under
# IPython/Jupyter, not as a plain Python script.
%matplotlib inline
performance_ev(factor_size = [5,10,20,40,80])
# Notebook cell: sweep the fraction of "special" levels while factor_size and
# total_size keep their defaults.
%matplotlib inline
performance_ev(special_percent = [0.02,0.04,0.08,0.16,0.32])
对离散特征，使用子模型建模（先用 L1 逻辑回归拟合哑变量，再以各取值的系数替代哑变量作为特征）能提升效果。
离散特征越重要（重要的取值越多，对应 special_percent 越高），使用子模型带来的提升越大。
离散特征的取值越多，RandomForest 的预测效果越差。
标签:
原文地址:http://www.cnblogs.com/porco/p/5148527.html