标签:fun __name__ pat jieba config thread inf app pre
源文件有4列:id、title、content、label(以制表符分隔,无表头)。
# Segment the title and content columns of a 4-column TSV file with jieba,
# fanning the rows out to joblib workers, and append the segmented rows to
# the file whose path is stored under the 'train_cut' key of config.yaml.
import os
import sys

import jieba
import pandas as pd
import yaml
from joblib import Parallel, delayed

# yaml.load() without an explicit Loader is unsafe and is rejected by
# PyYAML >= 6; safe_load parses plain configuration data. The `with` block
# also closes the file handle, which the bare open() call never did.
with open('config.yaml', 'r', encoding='utf-8') as _config_file:
    config = yaml.safe_load(_config_file)


def read_df(trainfile):
    """Load the first 60000 rows of a headerless, tab-separated file.

    Args:
        trainfile: Path to a UTF-8 encoded file with four columns per row.

    Returns:
        A DataFrame with the columns 'id', 'title', 'content' and 'label'.
    """
    return pd.read_csv(
        trainfile,
        # Two-character regex matching a tab. pandas treats separators longer
        # than one character as regular expressions and can only handle them
        # with the python engine, so the engine is named explicitly instead
        # of relying on the fallback (which emits a ParserWarning).
        sep='\\t',
        engine='python',
        header=None,
        nrows=60000,
        encoding='utf-8',
        names=['id', 'title', 'content', 'label'],
    )


def _format_row(row):
    """Return the segmented output line (without newline) for one row."""
    return '\t'.join([
        # id and label may have been parsed as numbers by pandas; str.join
        # only accepts strings.
        str(row[0]),
        ' '.join(jieba.cut(row[1])),
        ' '.join(jieba.cut(row[2])),
        str(row[3]),
    ])


def word_cut(df):
    """Segment one row and append it to the 'train_cut' output file.

    Args:
        df: A single row, indexable as (id, title, content, label).
    """
    # One write per record: this function runs in several workers that all
    # append to the same file, so the line and its newline are emitted
    # together rather than as two separate writes.
    # NOTE(review): the order of lines in the output is presumably not
    # guaranteed to match the input order -- confirm if order matters.
    with open(config['train_cut'], 'a', encoding='utf-8') as f:
        f.write(_format_row(df) + '\n')


def applyParallel(content, func, n_thread):
    """Call ``func`` on every item of ``content`` through joblib.

    Args:
        content: Iterable of items; each one is passed to ``func``.
        func: Callable taking a single item. Its return value is discarded.
        n_thread: Forwarded to joblib as ``n_jobs``.
    """
    with Parallel(n_jobs=n_thread) as parallel:
        parallel(delayed(func)(c) for c in content)


def main():
    """Segment data/train_fusai.tsv into the configured output file."""
    overwrite = True
    # word_cut appends, so a stale output file has to be removed first.
    if overwrite and os.path.exists(config['train_cut']):
        os.remove(config['train_cut'])

    trainfile = 'data/train_fusai.tsv'
    df = read_df(trainfile)
    content = df.values
    applyParallel(content, word_cut, 22)


if __name__ == '__main__':
    main()
标签:fun __name__ pat jieba config thread inf app pre
原文地址:https://www.cnblogs.com/zle1992/p/8967644.html