改进特征提取往往能显著提升分类的 accuracy(以及 precision 和 recall)。在本文中,我将评估对 word_feats 特征提取方法的两种修改:
import collections
import nltk.classify.util, nltk.metrics
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews
def evaluate_classifier(featx):
    """Train and evaluate a Naive Bayes classifier on the movie_reviews corpus.

    The first three quarters of the negative reviews and of the positive
    reviews are used for training; the remaining quarter of each is used for
    testing. Accuracy, precision and recall for the 'pos' and 'neg' labels,
    and the classifier's most informative features are printed.

    Args:
        featx: Callable that receives the words of a single review and
            returns the feature dict handed to the classifier.
    """
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')

    negfeats = [(featx(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
    posfeats = [(featx(movie_reviews.words(fileids=[f])), 'pos') for f in posids]

    # Floor division keeps the cutoffs integral on Python 3 as well; a float
    # cutoff would make the slices below raise TypeError.
    negcutoff = len(negfeats) * 3 // 4
    poscutoff = len(posfeats) * 3 // 4

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

    classifier = NaiveBayesClassifier.train(trainfeats)

    # refsets: test-item indices grouped by their true label.
    # testsets: the same indices grouped by the label the classifier assigned.
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    for i, (feats, label) in enumerate(testfeats):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)

    # Each report line is a single pre-formatted string so that print(...)
    # emits the same text under both Python 2 and Python 3.
    print('accuracy: %s' % nltk.classify.util.accuracy(classifier, testfeats))
    for label in ('pos', 'neg'):
        print('%s precision: %s'
              % (label, nltk.metrics.precision(refsets[label], testsets[label])))
        print('%s recall: %s'
              % (label, nltk.metrics.recall(refsets[label], testsets[label])))
    classifier.show_most_informative_features()
def word_feats(words):
    """Return a bag-of-words feature dict that maps every word to True.

    Args:
        words: Iterable of hashable tokens.

    Returns:
        dict with one key per distinct token in *words*, each mapped to True.
    """
    # dict.fromkeys builds the mapping directly, without the intermediate
    # list of (word, True) pairs.
    return dict.fromkeys(words, True)
evaluate_classifier(word_feats)
accuracy: 0.728
pos precision: 0.651595744681
pos recall: 0.98
neg precision: 0.959677419355
neg recall: 0.476
Most Informative Features
magnificent = True pos : neg = 15.0 : 1.0
outstanding = True pos : neg = 13.6 : 1.0
insulting = True neg : pos = 13.0 : 1.0
vulnerable = True pos : neg = 12.3 : 1.0
ludicrous = True neg : pos = 11.8 : 1.0
avoids = True pos : neg = 11.7 : 1.0
uninvolving = True neg : pos = 11.7 : 1.0
astounding = True pos : neg = 10.3 : 1.0
fascination = True pos : neg = 10.3 : 1.0
idiotic = True neg : pos = 9.8 : 1.0
from nltk.corpus import stopwords
# Build the English stopword collection once, as a set, because the stopword
# filter defined below performs a membership test for every word.
stopset = set(stopwords.words('english'))
def stopword_filtered_word_feats(words, stop_words=None):
    """Return a bag-of-words feature dict that leaves out stopwords.

    Args:
        words: Iterable of hashable tokens.
        stop_words: Optional container of tokens to exclude. When None (the
            default), the module-level ``stopset`` is used.

    Returns:
        dict mapping each distinct token of *words* that is not a stopword
        to True.
    """
    if stop_words is None:
        stop_words = stopset
    # A generator feeds dict.fromkeys directly, so no intermediate list of
    # (word, True) pairs is built.
    return dict.fromkeys(
        (word for word in words if word not in stop_words), True)
evaluate_classifier(stopword_filtered_word_feats)
accuracy: 0.726
pos precision: 0.649867374005
pos recall: 0.98
neg precision: 0.959349593496
neg recall: 0.472
accuracy 下降了 0.2 个百分点,pos precision 和 neg recall 也下降了!显然,停用词为情感分析分类提供了有用的信息。我没有列出最具信息量的特征,因为它们并没有变化。
import itertools
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
def bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    """Return a feature dict of all words plus the n best-scoring bigrams.

    Args:
        words: Iterable of tokens for one document.
        score_fn: Bigram association measure passed to the collocation
            finder's ``nbest``; defaults to chi-squared.
        n: Number of top-scoring bigrams to keep.

    Returns:
        dict mapping every token and every selected bigram to True.
    """
    # The tokens are consumed twice (once by the collocation finder, once
    # for the unigram features), so materialise them first; otherwise a
    # one-shot iterator would be exhausted before the second pass.
    words = list(words)
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return dict.fromkeys(itertools.chain(words, bigrams), True)
# Evaluate the classifier using every word plus the best-scoring bigrams as features.
evaluate_classifier(bigram_word_feats)
经过一些实验,我发现对每个文件使用最好的 200 个二元组(bigram)效果非常好:
accuracy: 0.816
pos precision: 0.753205128205
pos recall: 0.94
neg precision: 0.920212765957
neg recall: 0.692
Most Informative Features
magnificent = True pos : neg = 15.0 : 1.0
outstanding = True pos : neg = 13.6 : 1.0
insulting = True neg : pos = 13.0 : 1.0
vulnerable = True pos : neg = 12.3 : 1.0
('matt', 'damon') = True pos : neg = 12.3 : 1.0
('give', 'us') = True neg : pos = 12.3 : 1.0
ludicrous = True neg : pos = 11.8 : 1.0
uninvolving = True neg : pos = 11.7 : 1.0
avoids = True pos : neg = 11.7 : 1.0
('absolutely', 'no') = True neg : pos = 10.6 : 1.0
是的,你没有看错,Matt Damon 显然是电影评论中正向情绪的最佳指标之一。但是,尽管如此,这也是值得的结果。
原文:http://streamhacker.com/2010/05/24/text-classification-sentiment-analysis-stopwords-collocations/
原文地址:http://blog.csdn.net/laozhaokun/article/details/37954769