Following the beginner tutorial for Kaggle's Bag of Words Meets Bags of Popcorn competition, I typed out the code once through. The core tool is CountVectorizer, which turns each review into a word-frequency vector; a random forest is then trained on those vectors to predict the sentiment of new reviews. After submitting, the score was about 0.84.
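Before the full script, here is a minimal sketch of what CountVectorizer actually produces. This is my own toy example with two made-up reviews, not part of the original tutorial: each document becomes a vector of word counts over a shared, alphabetically sorted vocabulary.

from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus for illustration only (not from the tutorial data).
docs = ["the movie was great", "the movie was terrible"]
vectorizer = CountVectorizer(analyzer="word")
counts = vectorizer.fit_transform(docs)  # sparse document-term count matrix

# get_feature_names() matches the scikit-learn version used in the post;
# newer releases rename it to get_feature_names_out().
print(vectorizer.get_feature_names())  # ['great', 'movie', 'terrible', 'the', 'was']
print(counts.toarray())
# [[1 1 0 1 1]
#  [0 1 1 1 1]]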
import re
import logging

import pandas as pd
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer

logging.basicConfig(level=logging.ERROR)

train = pd.read_csv('/Users/meitu/Downloads/labeledTrainData.tsv',
                    header=0, delimiter="\t", quoting=3)
print(train['sentiment'].head(10))
# print(train.shape)
# print(train.columns.values)
# print(train['review'][0])

# Walk through the cleaning steps on a single review first.
example1 = BeautifulSoup(train['review'][0], "html.parser")  # strip HTML tags
# print(example1.get_text())
letters_only = re.sub('[^a-zA-Z]', ' ', example1.get_text())  # keep letters only
print(letters_only)
lower_case = letters_only.lower()
words = lower_case.split()
print(words)

# print(stopwords.words("english"))
words = [w for w in words if w not in stopwords.words('english')]
print(words)


def review_to_words(raw_review):
    # 1. Remove HTML
    review_text = BeautifulSoup(raw_review, "html.parser").get_text()
    # 2. Remove non-letters
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    # 3. Convert to lower case and split into individual words
    words = letters_only.lower().split()
    # 4. In Python, searching a set is much faster than searching a list
    stops = set(stopwords.words('english'))
    # 5. Remove stop words
    meaningful_words = [w for w in words if w not in stops]
    # 6. Join the words back into one space-separated string and return it
    return " ".join(meaningful_words)


if __name__ == '__main__':
    # Clean every training review.
    num_reviews = train['review'].size
    clean_train_reviews = []
    for i in range(num_reviews):
        clean_train_reviews.append(review_to_words(train['review'][i]))
        if (i + 1) % 1000 == 0:
            print("Review %d of %d" % (i + 1, num_reviews))

    print("Creating the bag of words...\n")
    # Build a vocabulary of the 5000 most frequent words and turn each
    # review into a vector of word counts over that vocabulary.
    vectorizer = CountVectorizer(analyzer="word", tokenizer=None,
                                 stop_words=None, max_features=5000)
    train_data_features = vectorizer.fit_transform(clean_train_reviews)
    vocab = vectorizer.get_feature_names()  # get_feature_names_out() in newer scikit-learn
    print(vocab)
    train_data_features = train_data_features.toarray()
    print(train_data_features)

    print("Training the random forest...")
    forest = RandomForestClassifier(n_estimators=100)
    forest = forest.fit(train_data_features, train['sentiment'])

    # Apply the same cleaning and vectorization to the test set,
    # then predict and write a Kaggle submission file.
    test = pd.read_csv('/Users/meitu/Downloads/testData.tsv',
                       header=0, delimiter="\t", quoting=3)
    print(test.shape)
    num_reviews = len(test['review'])
    clean_test_reviews = []
    for i in range(num_reviews):
        if (i + 1) % 1000 == 0:
            print("Review %d of %d" % (i + 1, num_reviews))
        clean_test_reviews.append(review_to_words(test['review'][i]))

    test_data_features = vectorizer.transform(clean_test_reviews)  # transform only, no refitting
    test_data_features = test_data_features.toarray()
    result = forest.predict(test_data_features)
    output = pd.DataFrame(data={'id': test['id'], 'sentiment': result})
    output.to_csv('bag_of_word_model.csv', index=False, quoting=3)
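One way to sanity-check the reported ~0.84 before submitting is local cross-validation. The snippet below is my own addition, not part of the original post; it assumes train_data_features and train are already built as in the script above.

from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

# My addition (not in the original post): 5-fold cross-validation on the
# training bag-of-words features to estimate accuracy before submitting.
forest = RandomForestClassifier(n_estimators=100)
scores = cross_val_score(forest, train_data_features, train['sentiment'], cv=5)
print("Mean CV accuracy: %.3f" % scores.mean())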
Original post: http://www.cnblogs.com/fall12/p/7722084.html