标签:
本文是讲述如何使用word2vec的基础教程,文章比较基础,希望对你有所帮助!
#encoding=utf-8
import sys
import re
import codecs
import os
import shutil
import jieba
import jieba.analyse
# Load the custom user dictionary into jieba before any segmentation is done
jieba.load_userdict("dict_all.txt")
#Read file and cut
def read_file_cut(source_dir="BaiduSpiderCountry",
                  result_name="Result_Country.txt",
                  file_count=100):
    """Segment numbered text files with jieba and merge them into one corpus.

    Files ``0001.txt`` through ``<file_count>.txt`` (zero-padded to four
    digits) are read from ``source_dir``. Every line is cut with jieba in
    accurate mode (``cut_all=False``) and its tokens are written to
    ``result_name`` separated by single spaces. Line breaks inside a source
    file are replaced by a space, so each source file becomes exactly one
    CRLF-terminated line of the result.

    Args:
        source_dir: Directory holding the numbered input files.
        result_name: Path of the UTF-8 output file; an existing file is
            overwritten.
        file_count: Number of input files to process. The original note
            reads "5A 200, others 100" -- presumably 200 files for the
            "5A" corpus and 100 for the other corpora; confirm per dataset.

    Raises:
        IOError: If an input file cannot be opened or the result cannot be
            written (reported as ``OSError`` on Python 3).
    """
    # Mode 'w' truncates an existing result file, so no explicit delete is
    # needed. The context managers close both files even if a read fails.
    with codecs.open(result_name, 'w', 'utf-8') as result:
        for num in range(1, file_count + 1):
            # os.path.join keeps the path valid on non-Windows systems too.
            file_name = os.path.join(source_dir, "%04d.txt" % num)
            # Read raw bytes and let jieba decode them, as the original
            # Python 2 code did by passing undecoded lines (jieba's README
            # documents UTF-8 and GBK byte strings as accepted input).
            with open(file_name, 'rb') as source:
                for line in source:
                    line = line.rstrip(b'\r\n')
                    seglist = jieba.cut(line, cut_all=False)  # accurate mode
                    # A space replaces the line break so that the whole
                    # source file ends up on a single output line.
                    result.write(' '.join(seglist) + ' ')
            print('End file: ' + str(num))
            result.write('\r\n')
    print('End Baidu')
# Run the segmentation only when this file is executed as a script
if __name__ == '__main__':
    read_file_cut()
# demo-word.sh: build the word2vec tools, train on the segmented corpus,
# then run the distance tool on the resulting vectors.
make
# The download of the default text8 corpus stays commented out because
# training below uses Result_Country.txt instead.
#if [ ! -e text8 ]; then
#  wget http://mattmahoney.net/dc/text8.zip -O text8.gz
#  gzip -d text8.gz -f
#fi
# Train on Result_Country.txt and write the vectors to vectors.bin.
time ./word2vec -train Result_Country.txt -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15
# Run the distance tool against the trained vectors.
./distance vectors.bin
# Change to the word2vec directory, run the demo script, then run the
# distance tool on the generated vectors (one command per line).
cd C:/Users/dell/Desktop/word2vec
sh demo-word.sh
./distance vectors.bin
最后希望文章对你有所帮助,本文主要介绍的是word2vec的使用方法,更多的应用还需要你自己去研究学习。
word2vec源码、语料下载地址:
(By:Eastmount 2016-02-18 深夜1点
http://blog.csdn.net/eastmount/ )
标签:
原文地址:http://blog.csdn.net/eastmount/article/details/50637476