# Tags (from the source blog post): sheet, from, word segmentation, pyplot, pen, cpi, form, label, rom
"""Train an SVM text classifier on TF-IDF features read from Excel sheets.

The training/test texts and their category labels are read from column 0 of
"Sheet1" in four ``.xls`` workbooks, vectorized with ``TfidfVectorizer``,
and classified with ``sklearn.svm.SVC``.  Accuracy and weighted
precision/recall/F1 are printed.
"""
import xlrd
import jieba
import sys
import importlib
import os  # built-in package for file/directory operations (e.g. os.listdir)
import pickle  # object persistence
import random

import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from pylab import mpl
from sklearn.naive_bayes import MultinomialNB  # multinomial naive Bayes
from sklearn import svm
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer

try:
    # Public location of Bunch.
    from sklearn.utils import Bunch
except ImportError:
    # Fallback for old scikit-learn releases that only expose the private
    # module path used by the original script.
    from sklearn.datasets.base import Bunch

# NOTE(review): looks like a Python 2 leftover (reload(sys) before
# setdefaultencoding); kept so module-level behavior is unchanged.
importlib.reload(sys)

# Convert the contents and the categories into vector form.
trainContentdatasave = []  # stores the segmented words of all train/test data
testContentdatasave = []
trainContentdata = []
testContentdata = []
trainlabeldata = []
testlabeldata = []


def _read_first_column(file, sheet_name="Sheet1"):
    """Return the values of column 0 for every row of *sheet_name* in *file*."""
    ws = xlrd.open_workbook(file).sheet_by_name(sheet_name)
    return [ws.cell(r, 0).value for r in range(ws.nrows)]


# Load the training and test data of the text descriptions.
def importTrainContentdata(file='20180716_train.xls'):
    """Append column 0 of the training-text workbook to ``trainContentdata``."""
    trainContentdata.extend(_read_first_column(file))


def importTestContentdata(file='20180716_test.xls'):
    """Append column 0 of the test-text workbook to ``testContentdata``."""
    testContentdata.extend(_read_first_column(file))


# Load the training and test data of the categories.
def importTrainlabeldata(file='20180716_train_label.xls'):
    """Append column 0 of the training-label workbook to ``trainlabeldata``."""
    trainlabeldata.extend(_read_first_column(file))


def importTestlabeldata(file='20180716_test_label.xls'):
    """Append column 0 of the test-label workbook to ``testlabeldata``."""
    testlabeldata.extend(_read_first_column(file))


def _to_int_label(value):
    """Convert a label such as ``3``, ``3.0`` or ``"3.0"`` to the int ``3``."""
    return int(float(value))


def metrics_result(actual, predict):
    """Print weighted precision, recall and F1 of *predict* against *actual*."""
    print('precision: {0:.3f}'.format(
        metrics.precision_score(actual, predict, average='weighted')))
    print('recall: {0:.3f}'.format(
        metrics.recall_score(actual, predict, average='weighted')))
    print('f1-score: {0:.3f}'.format(
        metrics.f1_score(actual, predict, average='weighted')))


def main():
    """Load the four workbooks, train the SVM and print evaluation scores."""
    importTrainContentdata()
    importTestContentdata()
    importTrainlabeldata()
    importTestlabeldata()

    # Alternative classifiers that were left disabled in the original script:
    #   MultinomialNB(alpha=0.052)
    #   svm.SVC(C=0.7, kernel='poly', gamma=10, decision_function_shape='ovr')
    #   LogisticRegression(C=3)

    # NOTE(review): jieba is imported but the texts are not segmented here;
    # presumably the workbook cells already contain space-separated words.
    tv = TfidfVectorizer()
    train_data = tv.fit_transform(trainContentdata)
    test_data = tv.transform(testContentdata)

    # Only the `svm` module is imported, so SVC must be referenced through it.
    clf = svm.SVC(C=1500)
    clf.fit(train_data, trainlabeldata)
    print(clf.score(test_data, testlabeldata))

    # The predictions have to be computed before they can be evaluated.
    predicted = clf.predict(test_data)
    a = [_to_int_label(label) for label in testlabeldata]
    b = [_to_int_label(label) for label in predicted]

    # The original script also carried a disabled snippet that wrote every
    # value of `b` on its own line to 'F:/goverment/ArticleMining/predict.txt'.

    metrics_result(a, b)


if __name__ == "__main__":
    main()
# Tags (from the source blog post): sheet, from, word segmentation, pyplot, pen, cpi, form, label, rom
# Original article: https://www.cnblogs.com/caiyishuai/p/9354035.html