Jotting this down so I don't have to hunt for it again.
The corpus is a classical-Chinese corpus annotated for NER; the code is adapted from other blog posts.
The pipeline: first I pull the annotated entities out of the corpus and add them to jieba as a custom dictionary, then load the stopwords, segment the text, and finally train the word vectors. How well it works remains to be seen; I'll follow up later. A sketch of the entity-extraction step is below, followed by the full script.
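The original post never shows how cidian.txt gets built, so here is a minimal sketch of that step. It assumes the NER annotations are in two-column BIO format (one character and its tag per line, blank lines between sentences); the file name guwen_ner.txt and the helper extract_entities are my placeholders, not from the original.

```python
# Sketch only: assumes BIO-tagged input like "字\tB-PER", blank line = sentence break.
def extract_entities(ner_file="guwen_ner.txt", dict_file="cidian.txt"):
    entities = set()
    current = []
    with open(ner_file, encoding="utf8") as f:
        for line in f:
            line = line.strip()
            if not line:                      # sentence boundary: flush any open entity
                if current:
                    entities.add("".join(current))
                    current = []
                continue
            char, tag = line.split()[:2]
            if tag.startswith("B-"):          # start of a new entity
                if current:
                    entities.add("".join(current))
                current = [char]
            elif tag.startswith("I-") and current:
                current.append(char)          # continuation of the current entity
            else:                             # "O" tag closes any open entity
                if current:
                    entities.add("".join(current))
                    current = []
    if current:
        entities.add("".join(current))
    # jieba's user-dictionary format is one word per line
    # (frequency and POS tag are optional), so this is enough:
    with open(dict_file, "w", encoding="utf8") as f:
        for e in sorted(entities):
            f.write(e + "\n")

extract_entities()
```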
With the dictionary and stopword list in place, the full pipeline is:

```python
import logging
import os.path
import sys

import jieba
from gensim.models import Word2Vec
from gensim.models.word2vec import PathLineSentences

# Load the custom dictionary built from the annotated entities
jieba.load_userdict("cidian.txt")

# Load stopwords, one per line
def getStopwords():
    stopwords = set()
    with open("stopwords.txt", "r", encoding="utf8") as f:
        for line in f:
            stopwords.add(line.strip())
    return stopwords

stopwords = getStopwords()

# Segment the corpus: one sentence per line, tokens separated by spaces
def segment():
    # mode "w" (not "a") so re-running the script doesn't append duplicates
    with open("guwen.txt", encoding="utf8") as f, \
         open("fenci.txt", "w", encoding="utf8") as segment_file:
        for sentence in f:
            words = jieba.cut(sentence.strip())
            sentence_segment = [w for w in words if w not in stopwords]
            segment_file.write(" ".join(sentence_segment) + "\n")

segment()

# Train the word vectors
if __name__ == "__main__":
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format="%(asctime)s: %(levelname)s: %(message)s")
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % " ".join(sys.argv))

    input_file = "fenci.txt"
    outp1 = "guwen.model"
    outp2 = "guwen_word2vec_format"
    # Embedding size 256, co-occurrence window 10, drop words that occur
    # fewer than 5 times, 10 training iterations. These are gensim 3.x
    # parameter names; gensim 4.x renames size -> vector_size, iter -> epochs.
    model = Word2Vec(PathLineSentences(input_file), size=256, window=10,
                     min_count=5, iter=10)
    model.save(outp1)
    model.wv.save_word2vec_format(outp2, binary=False)
```
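Once training finishes, a quick way to sanity-check the vectors is gensim's similarity queries. The query words below are just examples; a word must appear at least min_count times in the corpus, otherwise the lookup raises a KeyError.

```python
from gensim.models import Word2Vec

# Reload the saved model and inspect nearest neighbours in vector space
model = Word2Vec.load("guwen.model")
print(model.wv.most_similar("孔子", topn=10))  # 10 most similar words
print(model.wv.similarity("孔子", "孟子"))      # cosine similarity of two words
```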
Original post: https://www.cnblogs.com/harbin-ho/p/13311600.html