码迷,mamicode.com
首页 > 其他好文 > 详细

信用卡欺诈检测分析案例

时间:2017-11-21 19:48:10      阅读:743      评论:0      收藏:0      [点我收藏+]

标签:standard   ssi   return   rsa   ram   tca   dict   concat   set   

技术分享图片
data = pd.read_csv("creditcard.csv")
data.head()

count_classes = pd.value_counts(data[‘class‘],sort = True).sort_index() # value_count:计算数值的个数
count_classes.plot(kind = ‘bar‘) # 绘制条形图
plt.title("Fraud class hiistogram")
plt.xlabel("class")
plt.ylabel("Frequency")
样本不平衡解决方案
from
sklearn.preprocessing import StandardScaler data[normAmount] = StandardScaler().fit_transform(data[Amount].reshape(-1,1)) # fit_transform对数据进行变换,amount数据值较大,机器算法会把它的数值看得比较
# 重要,所以要进行转换,让它的每一列数据的重要程度看起来一样
# [2,3].reshape(-1,2), -1:2*3/2 1:变成列向量 data
= data.drop([Time,Amount],axis=1) # 删除Time,Amount这两列数据 data.head()
下采样策略:
X = data.ix[:,data.columns != Class] y = data.ix[:,data.columns == Class] number_records_fraud = len(data[data.Class == 1]) # class=1的样本函数 fraud_indices = np.array(data[data.Class == 1].index) # 样本等于1的索引值 normal_indices = data[data.Class == 0].index # 样本等于0的索引值 random_normal_indices = np.random.choice(normal_indeces,number_records_fraud,replace = False)
random_normal_indices = np.array(random_normal_indices)

under_sample_indices = np.concatenate([fraud_indices,random_normal_indices]) # Appending the 2 indices

under_sample_data = data.iloc[under_sample_indices,:] # Under sample dataset

X_undersample = under_sample_data.ix[:,under_sample_data.columns != ‘Class‘]
y_undersample = under_sample_data.ix[:,under_sample_data.columns == ‘Class‘]
交叉验证
from sklearn.cross_validation import train_test_split
# While dataset
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.3,random_state = 0) # random_state 
# Undersampled dataset
X_train_undersample,X_test_undersample,y_train_undersample,y_test_undersample = train_test_split(X_undersample,y_undersample,test_size=0.3,random_state=0)
模型评估方法
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import KFold,cross_val_score
from sklearn.metrics import confusion_matrix,reclll_score,classification_report

def printing_Kfold_scores(x_train_data,y_train_data):
fold = KFold(len(y_train_data),5,shuffle=False) # 将原始的数据进行切分

c_param_range = [0.01,0.1,1,10,100] # 惩罚力度参数
results_table = pd.DataFrame(index = range(len(c_param_range),2),columns = ["C_parameter‘,‘Mean recall s’]
results_table[‘C_parameter‘] = c_param_range

j = 0
for c_param in c_param_range:
recall_accs = []
for iteration,indices in enumerate(fold,start=1):
lr = LogisticRegression(C=c_param,penalty=‘l1‘)# 建立逻辑回归模型,l1是w的绝对值惩罚,l2是w2的惩罚
lr.fit(x_train_data.iloc[indices[0],:],y_train_data.iloc[indices[0],:].values.ravel())
y_pred_undersample = lr.predict(x_train_data.iloc[indices[1],:].values)
recall_acc = recall_score(y_train_data.iloc[indices[1],:].values,y_pred_undersample) # 召回率
recall_accs.append(recall_acc)
print("Iteration",iteration,‘:recall score = ‘,recall_acc)
result_table.ix[j,‘Mean recall score‘] = np.mean(recall_accs)
j += 1
print(‘Mean recall score‘,np.mean(recall_accs))

best_c = results_table.loc[results_table[‘Mean recall score‘].idxmax()][‘C_parameter‘]
print(“Best model”,best_c)
return best_c

混淆矩阵
逻辑回归阈值对结果的影响
lr = LogisticRegression(C = 0.01,penalty=l1) lr.fit(X_train_undersample,y_train_undersample.values.ravel()) y_pred_undersample_proba = lr.predict_proba(X_test_undersample.values) thresholds=[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9] plt.figure(figsize=(10,10)) j= 1 for i in thresholds: y_test_predicttions_high_recall = y_pred_undersample_proba[:,1]> i
plt.subplot(3,3,j)
j += 1
cnf_matrix = confusion_matrix(y_test_undersample,y_test_predictions_high_recall)
np.set_printoptions(precision=2)
print("Recall metric in the testing dataset:",cnf_matrix[1,1]/(cnf_matrix[1,0]+cnf_matrix[1,1]))
class_names = [0,1]
plot_confusion_matrix(cnf_matrix,classes=class_names,title="Threshold >= %s‘%i)
SMOTE样本生成策略(过采样)
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

credit_cards = pd.read_csv(‘creditcard.csv‘)
columns = credit_cards.columns
features_columns = columns.delete(len(columns)-1)

features = credit_cards[features_columns]
labels = credit_cards[‘Class‘]

features_train,features_test,labels_train,labels_test = train_test_split(features,labels,test_size=0.2,random_state=0)

oversampler = SMOTE(random_state=0)
os_features,os_labels = oversampler.fit_sample(features_train,labels_train)

len(os_labels[os_labels==1])

os_features = pd.DataFrame(os_features)
os_labels = pd.DataFrame(os_labels)
best_c = printing_Kfold_scores(os_features,os_labels)
lr = LogisticRegression(C = best_c,penalty=l1)
lr.fit(os_features,os_labels.values.ravel())
y_pred = lr.predict(features_test.values)

cnf_matrix = confusiion_matrix(labels_test,y_pred)
np.set_printopotions(precision=2)
print(recall metric:,cnf_matrix[1,1]/(cnf_matrix[1,0]+cnf_matrix[1,1]))
class_names[0,1]
plt.figure()
plot_confusion_matrix(cnf_matrix,classes=class_names,title=‘Confusion matrix‘)
plt.show()

 

信用卡欺诈检测分析案例

标签:standard   ssi   return   rsa   ram   tca   dict   concat   set   

原文地址:http://www.cnblogs.com/panjie123pakho/p/7874796.html

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!