码迷,mamicode.com
首页 > 其他好文 > 详细

Python中文本挖掘精简版

时间:2018-07-23 15:05:15      阅读:238      评论:0      收藏:0      [点我收藏+]

标签:sheet   from   分词   pyplot   pen   cpi   形式   label   rom   

import xlrd
import jieba
import sys  
import importlib
import os         #python内置的包,用于进行文件目录操作,我们将会用到os.listdir函数  
import pickle    #导入cPickle包并且取一个别名pickle #持久化类
import random
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from pylab import mpl  
from sklearn.naive_bayes import MultinomialNB # 导入多项式贝叶斯算法包
from sklearn import svm

from sklearn import metrics 
from sklearn.datasets.base import Bunch
from sklearn.feature_extraction.text import TfidfVectorizer
importlib.reload(sys)


# Module-level containers that hold the content and the category of every
# sample as flat lists (one entry per spreadsheet row).

# Tokenized text of all training / test samples.
trainContentdatasave, testContentdatasave = [], []

# Raw text descriptions (training, test).
trainContentdata, testContentdata = [], []

# Category labels (training, test).
trainlabeldata, testlabeldata = [], []

#导入文本描述的训练和测试数据
def importTrainContentdata(file="20180716_train.xls"):
    """Load the training text descriptions into ``trainContentdata``.

    Opens the workbook at *file* with ``xlrd``, reads column 0 of every row
    of the sheet named "Sheet1" and appends each cell value to the
    module-level ``trainContentdata`` list.

    Args:
        file: Path of the workbook to read. Defaults to the file name the
            script was originally hard-coded to use.
    """
    # The original assigned the bare token 20180716_train.xls, which is not
    # valid Python; the file name has to be a string.
    wb = xlrd.open_workbook(file)
    ws = wb.sheet_by_name("Sheet1")
    trainContentdata.extend(ws.cell(r, 0).value for r in range(ws.nrows))

def importTestContentdata(file="20180716_test.xls"):
    """Load the test text descriptions into ``testContentdata``.

    Opens the workbook at *file* with ``xlrd``, reads column 0 of every row
    of the sheet named "Sheet1" and appends each cell value to the
    module-level ``testContentdata`` list.

    Args:
        file: Path of the workbook to read. Defaults to the file name the
            script was originally hard-coded to use.
    """
    # The original assigned the bare token 20180716_test.xls, which is not
    # valid Python; the file name has to be a string.
    wb = xlrd.open_workbook(file)
    ws = wb.sheet_by_name("Sheet1")
    testContentdata.extend(ws.cell(r, 0).value for r in range(ws.nrows))

#导入类别的训练和测试数据
def importTrainlabeldata(file="20180716_train_label.xls"):
    """Load the training category labels into ``trainlabeldata``.

    Opens the workbook at *file* with ``xlrd``, reads column 0 of every row
    of the sheet named "Sheet1" and appends each cell value to the
    module-level ``trainlabeldata`` list.

    Args:
        file: Path of the workbook to read. Defaults to the file name the
            script was originally hard-coded to use.
    """
    # The original assigned the bare token 20180716_train_label.xls, which is
    # not valid Python; the file name has to be a string.
    wb = xlrd.open_workbook(file)
    ws = wb.sheet_by_name("Sheet1")
    trainlabeldata.extend(ws.cell(r, 0).value for r in range(ws.nrows))
        
def importTestlabeldata(file="20180716_test_label.xls"):
    """Load the test category labels into ``testlabeldata``.

    Opens the workbook at *file* with ``xlrd``, reads column 0 of every row
    of the sheet named "Sheet1" and appends each cell value to the
    module-level ``testlabeldata`` list.

    Args:
        file: Path of the workbook to read. Defaults to the file name the
            script was originally hard-coded to use.
    """
    # The original assigned the bare token 20180716_test_label.xls, which is
    # not valid Python; the file name has to be a string.
    wb = xlrd.open_workbook(file)
    ws = wb.sheet_by_name("Sheet1")
    testlabeldata.extend(ws.cell(r, 0).value for r in range(ws.nrows))


if __name__ == "__main__":
    # Script entry point: load the spreadsheets, vectorize the texts with
    # TF-IDF, train an SVM, then report accuracy and per-class metrics.

    # Fill the module-level text and label lists from the .xls files.
    importTrainContentdata()
    importTestContentdata()
    importTrainlabeldata()
    importTestlabeldata()

    # Alternative classifiers tried earlier, kept for reference only. They
    # refer to train_set / test_set / X_train / X_test objects that this
    # script does not build.
    #
    # Naive Bayes
    # clf = MultinomialNB(alpha=0.052).fit(train_set.tdm, train_set.label)
    # #clf = svm.SVC(C=0.7, kernel='poly', gamma=10, decision_function_shape='ovr')
    # clf.fit(train_set.tdm, train_set.label)
    # predicted = clf.predict(test_set.tdm)
    #
    # Logistic regression
    # tv = TfidfVectorizer()
    # train_data = tv.fit_transform(X_train)
    # test_data = tv.transform(X_test)
    #
    # lr = LogisticRegression(C=3)
    # lr.fit(train_set.tdm, train_set.label)
    # predicted = lr.predict(test_set.tdm)
    # print(lr.score(test_set.tdm, test_set.label))
    # #print(test_set.tdm)
    #
    # SVM
    # clf = SVC(C=1500)
    # clf.fit(train_set.tdm, train_set.label)
    # predicted = clf.predict(test_set.tdm)
    # print(clf.score(test_set.tdm, test_set.label))

    # NOTE(review): TfidfVectorizer's default tokenizer splits on word
    # boundaries; this presumably expects the texts to be word-segmented
    # already (jieba is imported but unused here) -- confirm.
    tv = TfidfVectorizer()
    train_data = tv.fit_transform(trainContentdata)
    test_data = tv.transform(testContentdata)

    # Only the ``svm`` module is imported at the top of the file, so the
    # classifier has to be reached as svm.SVC; a bare SVC is a NameError.
    clf = svm.SVC(C=1500)
    clf.fit(train_data, trainlabeldata)
    print(clf.score(test_data, testlabeldata))

    # The original never assigned ``predicted`` and read the true labels from
    # an undefined ``test_set``; predict explicitly and use testlabeldata.
    predicted = clf.predict(test_data)

    # Normalize both label sequences to ints. Going through float() accepts
    # numeric cells as well as strings such as "3" or "3.0".
    a = [int(float(label)) for label in testlabeldata]
    b = [int(float(label)) for label in predicted]

    # Optional dump of the predictions to a text file (disabled).
    # f = open('F:/goverment/ArticleMining/predict.txt', 'w')
    # for i in range(len(predicted)):
    #     f.write(str(b[i]))
    #     f.write('\n')
    # f.write("写好了")
    # f.close()
    # #for i in range(len(predicted)):
    #     #print(b[i])

    # ``metrics_result`` is not defined anywhere in this file, so the summary
    # is produced directly with sklearn.metrics (weighted across classes).
    print("precision: {0:.3f}".format(
        metrics.precision_score(a, b, average="weighted")))
    print("recall: {0:.3f}".format(
        metrics.recall_score(a, b, average="weighted")))
    print("f1-score: {0:.3f}".format(
        metrics.f1_score(a, b, average="weighted")))

 

Python中文本挖掘精简版

标签:sheet   from   分词   pyplot   pen   cpi   形式   label   rom   

原文地址:https://www.cnblogs.com/caiyishuai/p/9354035.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!