Download the NLTK data:
import nltk
nltk.download()
Test whether the installation succeeded:
from nltk.corpus import brown
print(brown.words())
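If the corpus data was downloaded correctly, this prints a token view that begins roughly like this (output reproduced from memory, so treat it as approximate):

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]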
Two functions are used most often:
from nltk.tokenize import word_tokenize
# "I'm super man" tokenizes to: ['I', "'m", 'super', 'man']
from nltk.stem import WordNetLemmatizer
# Lemmatization reduces any inflected form of an English word to its base
# (dictionary) form; this differs from stemming, which only extracts a word's root.
They are called like this:
words = word_tokenize(line.lower())
lemmatizer = WordNetLemmatizer()
lex = [lemmatizer.lemmatize(word) for word in lex]
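To make the lemmatizer/stemmer distinction concrete, here is a small sketch; the example words are my own and not from the original post:

from nltk.stem import WordNetLemmatizer, PorterStemmer

lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

print(lemmatizer.lemmatize('cats'))             # 'cat'  -- maps to a dictionary form
print(lemmatizer.lemmatize('better', pos='a'))  # 'good' -- with a part-of-speech hint
print(stemmer.stem('running'))                  # 'run'
print(stemmer.stem('studies'))                  # 'studi' -- a stem, not necessarily a real word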
import numpy as np
import tensorflow as tf
import random
import pickle
from collections import Counter
import nltk
from nltk.tokenize import word_tokenize
# "I'm super man" tokenizes to: ['I', "'m", 'super', 'man']
from nltk.stem import WordNetLemmatizer
# Lemmatization reduces an English word to its base form, unlike stemming,
# which only extracts the word's root.
import os

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

pos_file = 'pos.txt'
neg_file = 'neg.txt'
Build the vocabulary (lexicon):
def creat_lexicon(pos_file, neg_file):
    '''Build the vocabulary.'''
    lex = []

    def process_file(txt_file):
        lex = []
        with open(txt_file, 'r') as f:   # open the file passed in, not always pos_file
            lines = f.readlines()
            for line in lines:
                words = word_tokenize(line.lower())
                lex += words
        return lex

    lex += process_file(pos_file)
    lex += process_file(neg_file)

    lemmatizer = WordNetLemmatizer()
    lex = [lemmatizer.lemmatize(word) for word in lex]

    word_count = Counter(lex)
    # print(word_count)
    # {'.': 13944, ',': 10536, 'the': 10120, 'a': 9444, 'and': 7108, 'of': 6624, 'it': 4748, 'to': 3940 ...}
    # Keep only words that are neither too common nor too rare.
    lex = []
    for word in word_count:
        if word_count[word] < 2000 and word_count[word] > 20:
            lex.append(word)
    return lex

lex = creat_lexicon(pos_file, neg_file)
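As a quick sanity check (not part of the original post), you can inspect the filtered lexicon right after building it:

print(len(lex))    # vocabulary size after frequency filtering
print(lex[:10])    # a handful of the surviving words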
At this point the idea is clear: treat the lexicon as a set of counters. Each review produces a different count distribution over the lexicon, and that distribution is what the network learns from.
# Convert each review into a vector.
# Principle: suppose lex is ['woman', 'great', 'feel', 'actually', 'looking', 'latest', 'seen', 'is']
# (in practice it is much larger). The review 'i think this movie is great' becomes
# [0, 1 (one 'great'), 0, 0, 0, 0, 0, 1 (one 'is')]: each review word found in lex
# is counted at its position, every other position stays 0.
def normalize_dataset(lex):
    dataset = []

    def string_to_vector(lex, review):
        words = word_tokenize(review.lower())
        lemmatizer = WordNetLemmatizer()
        words = [lemmatizer.lemmatize(word) for word in words]

        features = np.zeros(len(lex))
        for word in words:
            if word in lex:
                features[lex.index(word)] += 1
        return features

    with open(pos_file, 'r') as f:
        lines = f.readlines()
        for line in lines:
            one_sample = [string_to_vector(lex, line), [1, 0]]   # positive label
            dataset.append(one_sample)
    with open(neg_file, 'r') as f:
        lines = f.readlines()
        for line in lines:
            one_sample = [string_to_vector(lex, line), [0, 1]]   # negative label
            dataset.append(one_sample)

    return dataset

dataset = normalize_dataset(lex)
random.shuffle(dataset)
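A quick look at one converted sample (my own check, not part of the original):

print(dataset[0][0].shape)   # (len(lex),) -- the bag-of-words count vector
print(dataset[0][1])         # one-hot label: [1, 0] = positive, [0, 1] = negative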
Nothing much needs to be said about the network itself: two fully connected hidden layers feeding a 2-way output.
n_input_layer = len(lex)   # input layer size
n_layer_1 = 1000           # hidden layer 1
n_layer_2 = 1000           # hidden layer 2
n_output_layer = 2         # output layer size

def neural_network(data):
    # Weights and biases of the first hidden layer.
    layer_1_w_b = {'w_': tf.Variable(tf.random_normal([n_input_layer, n_layer_1])),
                   'b_': tf.Variable(tf.random_normal([n_layer_1]))}
    # Weights and biases of the second hidden layer.
    layer_2_w_b = {'w_': tf.Variable(tf.random_normal([n_layer_1, n_layer_2])),
                   'b_': tf.Variable(tf.random_normal([n_layer_2]))}
    # Weights and biases of the output layer.
    layer_output_w_b = {'w_': tf.Variable(tf.random_normal([n_layer_2, n_output_layer])),
                        'b_': tf.Variable(tf.random_normal([n_output_layer]))}

    # w·x + b
    layer_1 = tf.add(tf.matmul(data, layer_1_w_b['w_']), layer_1_w_b['b_'])
    layer_1 = tf.nn.relu(layer_1)   # activation
    layer_2 = tf.add(tf.matmul(layer_1, layer_2_w_b['w_']), layer_2_w_b['b_'])
    layer_2 = tf.nn.relu(layer_2)   # activation
    layer_output = tf.add(tf.matmul(layer_2, layer_output_w_b['w_']), layer_output_w_b['b_'])
    return layer_output
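A minimal shape sanity check (x_demo is my own placeholder name, not from the post; running it adds an extra copy of the variables to the default graph, so it is only meant as a one-off check):

x_demo = tf.placeholder(tf.float32, [None, len(lex)])
logits = neural_network(x_demo)
print(logits.shape)   # (?, 2) -- one unnormalized score per class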
When feeding, the batch is wrapped in list(); without it an error is raised. As far as I remember, the reason is that the batch is a slice of an object-dtype array rather than a standalone numeric array, so list() is needed to turn it into a structure TensorFlow can stack. Nothing else in this part needs special attention.
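A small illustration of why list() helps; the demo array below is my own construction, not the real dataset:

import numpy as np

# dataset was built as [feature_vector, label] pairs, so np.array(dataset) is object-dtype.
demo = np.array([[np.zeros(3), [1, 0]],
                 [np.ones(3), [0, 1]]], dtype=object)
batch = demo[:, 0][0:2]               # a slice of the feature column
print(batch.dtype)                    # object -- TensorFlow cannot feed this directly
print(np.array(list(batch)).shape)    # (2, 3) -- list() lets it stack into a float matrix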
test_size = int(len(dataset) * 0.1)
dataset = np.array(dataset)

train_dataset = dataset[:-test_size]
test_dataset = dataset[-test_size:]

batch_size = 50

X = tf.placeholder(tf.float32, [None, len(train_dataset[0][0])])
Y = tf.placeholder(tf.float32)

def train_neural_network(X, Y):
    predict = neural_network(X)
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=predict, labels=Y))
    optimizer = tf.train.AdamOptimizer().minimize(loss)

    epochs = 13
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        np.random.shuffle(train_dataset)   # shuffle rows in place (safe on a NumPy array)
        train_x = train_dataset[:, 0]
        train_y = train_dataset[:, 1]
        for epoch in range(epochs):
            epoch_loss = 0
            i = 0   # reset per epoch; otherwise only the first epoch actually trains
            while i < len(train_y):
                start = i
                end = i + batch_size
                batch_x = train_x[start:end]
                batch_y = train_y[start:end]
                _, l = sess.run([optimizer, loss],
                                feed_dict={X: list(batch_x), Y: list(batch_y)})
                epoch_loss += l
                i += batch_size
            print(epoch, ':', epoch_loss)

        test_x = test_dataset[:, 0]
        test_y = test_dataset[:, 1]
        correct = tf.equal(tf.argmax(predict, axis=1), tf.argmax(Y, axis=1))
        accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
        print('accuracy:', accuracy.eval({X: list(test_x), Y: list(test_y)}))

train_neural_network(X, Y)
Results:
0 : 62576.5236931
1 : 62576.5236931
2 : 62576.5236931
3 : 62576.5236931
4 : 62576.5236931
5 : 62576.5236931
6 : 62576.5236931
7 : 62576.5236931
8 : 62576.5236931
9 : 62576.5236931
10 : 62576.5236931
11 : 62576.5236931
12 : 62576.5236931
accuracy: 0.603189
The loss printed above never changes because, in the run that produced it, the batch index and accumulated loss were not reset at the start of each epoch, so only the first epoch actually updated the weights; with the per-epoch reset, the loss decreases across epochs. As it stands, the result is mediocre, only a bit better than random guessing.
Original post: http://www.cnblogs.com/hellcat/p/7400373.html