I recently experimented with sentiment analysis on Chinese text.
The main tools are GloVe word vectors and an LSTM network. The dataset is the Chinese hotel-review corpus (ChnSentiCorp).
1. First, train GloVe to obtain word vectors (300-dimensional here). This step uses jieba for word segmentation and the Chinese Wikipedia as the training corpus (a corpus-preparation sketch follows this list).
2. Clean the hotel-review corpus and segment it with jieba, then convert each segmented review into its word-vector representation.
3. Train an LSTM network on these vector sequences.
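The GloVe training itself is not shown in this post. As a rough sketch of step 1, a Chinese Wikipedia dump that has already been extracted to plain text (e.g. with WikiExtractor) can be segmented with jieba and then fed to the standard GloVe toolkit; the file names below are placeholders, not files from the original post.

# Sketch only: segment a plain-text Chinese Wikipedia dump with jieba so it can
# be used as the GloVe training corpus. 'zhwiki_plain.txt' and 'zhwiki_seg.txt'
# are hypothetical file names.
import jieba

with open('zhwiki_plain.txt', encoding='utf-8') as fin, \
     open('zhwiki_seg.txt', 'w', encoding='utf-8') as fout:
    for line in fin:
        words = jieba.cut(line.strip())        # accurate-mode segmentation
        fout.write(' '.join(words) + '\n')     # space-separated tokens, one line per document

# The segmented file is then passed to the GloVe tools (vocab_count / cooccur /
# shuffle / glove) to produce the 300-d vectors, e.g. zhs_wiki_glove.vectors.300d.txt,
# which the script below loads.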
The final accuracy is around 91%.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed May 30 13:52:23 2018
@author: xyli
Process the hotel-review corpus: segment the text and convert it to GloVe vectors.
"""
import sys
import os
import chardet
import jieba
import re
import gensim
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Masking
from keras.layers import Dense, Input, Flatten, Activation
from keras.layers import Conv1D, GlobalMaxPooling1D, Embedding, Merge, Dropout, LSTM, GRU, Bidirectional, Reshape
from keras.models import Sequential, Model
from Attention_layer import Attention_layer  # local module, only needed for the commented attention lines
from keras.layers import Convolution2D, MaxPooling2D
from keras.utils import np_utils


def loadGLoveModel(filename):
    """Load pre-trained GloVe vectors into a dict of word -> vector."""
    embeddings_index = {}
    f = open(filename)
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()
    return embeddings_index


def word2Glovec(List, model):
    """Map a list of words to GloVe vectors; unknown words get a zero vector."""
    vec = []
    insert = [float(0) for i in range(300)]  # 300 is the vector dimension
    insert = np.asarray(insert, dtype='float32')
    for w in List:
        v = model.get(w)
        if v is None:
            vec.append(insert)
        else:
            vec.append(v)
    return vec


def clean_str(string):
    """
    Tokenization/string cleaning for the dataset:
    strip punctuation (ASCII and full-width) and line breaks.
    """
    # string = string.decode('utf-8')
    string = re.sub(r"\\", "", string)
    string = re.sub(r"\'", "", string)
    string = re.sub(r"\"", "", string)
    string = re.sub(r"\r\n", "", string)
    string = re.sub(r"\r", "", string)
    string = re.sub(r"\,", "", string)
    string = re.sub(r"\.", "", string)
    string = re.sub(r"\，", "", string)
    string = re.sub(r"\。", "", string)
    string = re.sub(r"\(", "", string)
    string = re.sub(r"\)", "", string)
    string = re.sub(r"\（", "", string)
    string = re.sub(r"\）", "", string)
    string = re.sub(r"\“", "", string)
    string = re.sub(r"\”", "", string)
    return string.strip()


def fitList(List, n):
    """Pad or truncate a token list to exactly n tokens."""
    L = len(List)
    # insert = [0 for i in range(300)]
    insert = '!'
    if L < n:
        d = n - L
        appList = [insert for i in range(d)]
        List += appList
    else:
        if L > n:
            List = List[0:n]
    return List


def readData(filename):
    """Read one review file, clean it, and return the jieba-segmented tokens."""
    with open(filename, 'rb') as f:
        data = f.read()
    data = data.decode('gb18030', 'ignore')
    data = clean_str(data)
    seg_list = jieba.cut(data)  # accurate mode (the default)
    segList = []
    for s in seg_list:
        s = clean_str(s)
        segList.append(s)
    return segList


def loadData():
    """Load negative and positive reviews with one-hot labels."""
    Corpus_DIR = "data/ChnSentiCorp_htl_unba_10000"
    DIR = ['/neg', '/pos']
    commentList = []

    rootdir = Corpus_DIR + DIR[0]
    filelist = os.listdir(rootdir)  # list all entries in the folder
    labelList = [[0.0, 1.0] for i in range(0, len(filelist))]
    for i in range(0, len(filelist)):
        path = os.path.join(rootdir, filelist[i])
        if os.path.isfile(path):
            templist = readData(path)
            commentList.append(templist)

    rootdir = Corpus_DIR + DIR[1]
    filelist = os.listdir(rootdir)  # list all entries in the folder
    labelList2 = [[1.0, 0.0] for i in range(0, len(filelist))]
    for i in range(0, len(filelist)):
        path = os.path.join(rootdir, filelist[i])
        if os.path.isfile(path):
            templist = readData(path)
            commentList.append(templist)

    labelList += labelList2
    return commentList, labelList


if __name__ == '__main__':
    List, labelList = loadData()  # load the corpus
    gloveModel = loadGLoveModel('model/zhs_wiki_glove.vectors.300d.txt')  # load the GloVe vectors

    countList = []
    commentVecList = []
    n = 100
    for c in List:
        countList.append(len(c))
        glovec = word2Glovec(fitList(c, n), gloveModel)
        commentVecList.append(glovec)

    VALIDATION_SPLIT = 0.2
    commentVecList = np.array(commentVecList)
    labelList = np.array(labelList)
    indices = np.arange(commentVecList.shape[0])
    np.random.shuffle(indices)
    data = commentVecList[indices]
    labels = labelList[indices]
    nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

    x_train = data[:-nb_validation_samples]
    y_train = labels[:-nb_validation_samples]
    x_val = data[-nb_validation_samples:]
    y_val = labels[-nb_validation_samples:]

    model = Sequential()
    model.add(LSTM(120, input_shape=(x_train.shape[1], x_train.shape[2]), return_sequences=True))
    # model.add(Activation('relu'))    # activation layer
    # model.add(Attention_layer())
    model.add(Bidirectional(LSTM(60, return_sequences=True)))
    # model.add(Attention_layer())
    # model.add(Activation('relu'))    # activation layer
    model.add(Dropout(0.3))            # random neuron dropout
    model.add(Bidirectional(LSTM(30, return_sequences=False)))
    model.add(Dropout(0.3))            # random neuron dropout
    model.add(Dense(y_train.shape[1], activation='softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    model.summary()
    model.fit(x_train, y_train, validation_data=(x_val, y_val),
              epochs=25, batch_size=200)
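As a quick usage sketch (not part of the original script), a new review can be classified by reusing clean_str, fitList, word2Glovec, the loaded gloveModel, and the trained model from above; the example sentence is made up, and the length 100 matches the padding used during training.

# Sketch: classify one new review with the model trained above.
new_comment = "房间很干净,服务态度也不错"  # hypothetical example review
tokens = [clean_str(t) for t in jieba.cut(clean_str(new_comment))]
vec = np.array([word2Glovec(fitList(tokens, 100), gloveModel)])  # shape (1, 100, 300)
prob = model.predict(vec)[0]  # [P(pos), P(neg)], matching the label layout above
print('positive' if prob[0] > prob[1] else 'negative')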
This post is still a work in progress...
Original post: https://www.cnblogs.com/xyli09/p/9183354.html