import jieba
from jieba import analyse
import numpy
import gensim
import codecs
import pandas as pd
import jieba.posseg as pog
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
# Build the training corpus: concatenate every comment into one string
def data_handle(data):
    n = data.shape[0]
    data_str = ''
    for i in numpy.arange(n):
        data_str += str(data.loc[i, 'comment'])
    return data_str
def fenci(data_str, stop_property, stopfile):
    # Stop words
    stop_word = [word.strip() for word in open(stopfile, encoding='utf-8').readlines()]
    # POS-tagged word segmentation
    word_cut = pog.cut(data_str)
    with open('weibo.txt', 'w', encoding='utf-8') as f:
        for word, flag in word_cut:
            if flag not in stop_property:
                if word not in stop_word:
                    f.write(word + '\n')
# Turn the raw training corpus into an iterator of sentences; each iteration yields one sentence as a list of words (UTF-8)
def vctor_word():
    wiki_news = open('weibo.txt', 'r', encoding='utf-8')
    sentences = LineSentence(wiki_news)
    model = Word2Vec(sentences, sg=0, size=100, window=5, min_count=5, workers=9)
    model.save('weibo.word2vec')
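Once weibo.txt has been written, the training step can be sanity-checked by loading the saved model and querying it. A minimal sketch, assuming the gensim 3.x-style API used throughout this post and that the example word occurs at least min_count times in the corpus (otherwise it is not in the vocabulary):

import gensim

model = gensim.models.Word2Vec.load('weibo.word2vec')
print(model.wv.vector_size)                     # 100, as set by size=100
# '宝贝' is only an illustrative token; substitute any word from your own corpus
if '宝贝' in model.wv:
    print(model.wv.most_similar('宝贝', topn=5))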
# Given an arbitrary string, return the position of every occurrence of a character (the list length is the total count)
def get_char_pos(string, char):
    chPos = []
    try:
        chPos = list(((pos, char) for pos, val in enumerate(string) if val == char))
    except Exception:
        pass
    return chPos
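For example, on a '/'-joined keyword string the helper returns the index of every separator, which is exactly what get_vector() relies on below (illustrative call, not part of the original script):

print(get_char_pos('农农/陈利农/宝贝', '/'))   # [(2, '/'), (6, '/')]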
# Use the trained word vectors to get vectors for the keywords
def cut_data(data, stopfile):
    data.fillna(0, inplace=True)
    stop_word = [word.strip() for word in open(stopfile, encoding='utf-8').readlines()]
    charater = ['a', 'nr', 'ns', 'nt', 'ng', 'vn', 'vi', 'l', 'n', 'v']
    m = data.shape[0]
    with open('seg_word.txt', 'w', encoding='utf-8') as f:
        for i in range(m):
            str_cut = ''
            text = data.loc[i, 'comment']
            if text != 0:
                segs = pog.cut(text)
                for word, flag in segs:
                    if flag in charater:
                        if word not in stop_word:
                            str_cut += word + '/'
                f.write(str_cut)
            # one line per comment; a missing comment leaves an empty line
            f.write('\n')
def get_vector(data, model):
    # Sum the vectors of every '/'-separated keyword that is in the model's vocabulary
    wordvec_size = 100
    word_vec_all = numpy.zeros(wordvec_size)
    space_pos = get_char_pos(data, '/')
    first_word = data[0:space_pos[0][0]]
    print('first_word', first_word)
    if first_word in model:
        word_vec_all = word_vec_all + model[first_word]
    for i in range(len(space_pos) - 1):
        # skip the '/' itself when slicing out the next word
        word = data[space_pos[i][0] + 1:space_pos[i + 1][0]]
        print('word', word)
        if word in model:
            word_vec_all = word_vec_all + model[word]
    print('word_vec_all', word_vec_all)
    return word_vec_all
# Score every keyword line in file_name against the query string by cosine similarity
def word2vec(file_name, model, query):
    DataFile = codecs.open(file_name, 'r', encoding='utf-8')
    DataSet = DataFile.readlines()[:-1]
    score_list = []
    str_vector = get_vector(query, model)
    for data in DataSet:
        if data.strip() != '':
            word_vec_all = get_vector(data, model)
            score = simlarityCalu(word_vec_all, str_vector)
        else:
            score = 0
        score_list.append(score)
    print('score_list', score_list)
    return score_list
# Word-vector similarity: cosine
def simlarityCalu(vector1, vector2):
    vector1Mod = numpy.sqrt(vector1.dot(vector1))
    vector2Mod = numpy.sqrt(vector2.dot(vector2))
    if vector2Mod != 0 and vector1Mod != 0:
        simlarity = (vector1.dot(vector2)) / (vector1Mod * vector2Mod)
    else:
        simlarity = 0
    return simlarity
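As a quick check of the cosine formula, two vectors pointing in the same direction score 1.0 and orthogonal vectors score 0.0 (a standalone example, not from the original post):

import numpy

a = numpy.array([1.0, 2.0, 3.0])
b = numpy.array([2.0, 4.0, 6.0])    # same direction as a
c = numpy.array([-2.0, 1.0, 0.0])   # orthogonal to a (a.dot(c) == 0)
print(simlarityCalu(a, b))          # 1.0 (up to floating-point error)
print(simlarityCalu(a, c))          # 0.0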
if __name__ == '__main__':
    stop_property = ['b', 'c', 'd', 'e', 'f', 'm', 'o', 'p', 'q', 'r', 't', 'u', 'x', 'y', 'z', 'uj', 'nrt', 'eng',
                     'zg', 'ul']
    stop_file = 'stop.txt'
    # Read the data
    data = pd.read_excel('C:/E/weibo.xlsx')
    data.rename(columns={'粉丝ID': 'fans_id', '粉丝': 'fans_name', '微博账户id': 'weibo_user_id', '微博名': 'weibo_name',
                         '微博id': 'weibo_id', '评论id': 'comment_id', '评论': 'comment'}, inplace=True)
    # Concatenate the comments into one string
    comment_str = data_handle(data)
    # Build the corpus
    fenci(comment_str, stop_property, stop_file)
    # Train the model
    vctor_word()
    # Extract keywords
    cut_data(data, stop_file)
    p1_keywords = 'seg_word.txt'
    str1 = '农农/陈利农/宝贝'
    # model = gensim.models.Word2Vec.load('weibo.word2vec')
    model = gensim.models.Word2Vec.load('zhiwiki_news.word2vec')
    p1_vec = word2vec(p1_keywords, model, str1)
    str2 = '舒蔻 尤妮佳 买'
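str2 is defined but never scored in the original post. Because get_vector() splits its input on '/', a second query would have to use the same separator before being passed to word2vec(); a possible continuation, purely my own sketch:

    # hypothetical follow-up, not in the source: score the same keyword file against a second query
    str2_slash = str2.replace(' ', '/')          # '舒蔻/尤妮佳/买'
    p2_vec = word2vec(p1_keywords, model, str2_slash)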
Original post: https://www.cnblogs.com/hapyygril/p/9982215.html