标签:evo 代码 error 邮件 sts range form fulltext voc
心得体会
1. 交叉验证:从全部样本中随机抽取一部分作为测试集,其余样本作为训练集。
# Listing 4-6: naive Bayes spam filter, evaluated with hold-out cross-validation.
#
# NOTE(review): createVocabList, setOfWords2Vec, trainNBO, classifyNB, array and
# random are not defined in this snippet and must already be in scope
# (presumably the chapter's earlier listings plus numpy -- TODO confirm).
import re


def textParse(bigString):
    """Split a raw string into lowercase tokens.

    The string is split on runs of non-word characters; tokens of two
    characters or fewer (including the empty strings produced by leading or
    trailing separators) are dropped.

    Args:
        bigString: The text to tokenize.

    Returns:
        A list of lowercase tokens, each longer than two characters.
    """
    return [tok.lower() for tok in re.split(r'\W+', bigString) if len(tok) > 2]


def _loadTokens(path):
    """Read the file at *path* and return its tokens via textParse."""
    # The context manager closes the handle even if reading/parsing fails.
    with open(path) as f:
        return textParse(f.read())


def spamTest(dataDir='E:/Python/《机器学习实战》代码/Ch04', numPerClass=25, testSize=10):
    """Train and evaluate the naive Bayes spam classifier on a random split.

    Loads ``<dataDir>/spam/<i>.txt`` and ``<dataDir>/ham/<i>.txt`` for
    ``i`` in ``1..numPerClass``, randomly holds out ``testSize`` documents as
    the test set, trains on the rest, and prints the test error rate.

    Args:
        dataDir: Directory containing the ``spam`` and ``ham`` sub-folders.
        numPerClass: Number of numbered files to read from each sub-folder.
        testSize: Number of documents to hold out for testing.

    Returns:
        The fraction of held-out documents that were misclassified.

    Raises:
        ValueError: If ``testSize`` is not between 1 and the number of
            loaded documents.
    """
    totalDocs = 2 * numPerClass
    if not 0 < testSize <= totalDocs:
        raise ValueError(
            "testSize must be between 1 and %d, got %r" % (totalDocs, testSize))

    docList = []
    classList = []
    # Load the data: spam documents are labelled 1, ham documents 0.
    for i in range(1, numPerClass + 1):
        docList.append(_loadTokens('%s/spam/%d.txt' % (dataDir, i)))
        classList.append(1)
        docList.append(_loadTokens('%s/ham/%d.txt' % (dataDir, i)))
        classList.append(0)

    vocabList = createVocabList(docList)

    # Randomly split document indices into a training set and a test set.
    trainingSet = list(range(len(docList)))
    testSet = []
    for _ in range(testSize):
        # Pick a random position inside what is left of trainingSet ...
        randIndex = int(random.uniform(0, len(trainingSet)))
        # ... and move that document to the test set, so that it is never
        # used for both training and testing.
        testSet.append(trainingSet.pop(randIndex))

    # Convert every training document into a word vector, keeping its label.
    trainMat = [setOfWords2Vec(vocabList, docList[docIndex])
                for docIndex in trainingSet]
    trainClasses = [classList[docIndex] for docIndex in trainingSet]
    # NOTE(review): the helper is spelled trainNBO (letter O) here; verify
    # that this matches the name it is actually defined under.
    p0V, p1V, pSpam = trainNBO(array(trainMat), array(trainClasses))

    # Classify the held-out documents and count the mismatches.
    errorCount = 0
    for docIndex in testSet:
        wordVector = setOfWords2Vec(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1

    errorRate = float(errorCount) / len(testSet)
    print("the error rate is :", errorRate)
    return errorRate


if __name__ == "__main__":
    spamTest()
标签:evo 代码 error 邮件 sts range form fulltext voc
原文地址:https://www.cnblogs.com/LPworld/p/13272628.html