码迷,mamicode.com
首页 > 其他好文 > 详细

结巴并行分词

时间:2018-04-28 15:45:21      阅读:146      评论:0      收藏:0      [点我收藏+]

标签:fun   __name__   pat   jieba   config   thread   inf   app   pre   

 

源文件有4列(id、title、content、label),以制表符分隔

 

import os
import sys


import pandas as pd
from joblib import Parallel, delayed
import jieba

import yaml
# Load the run settings once at import time; the functions below read the
# 'train_cut' key from it as the output path for the segmented text.
# safe_load is used instead of yaml.load so the YAML file cannot construct
# arbitrary Python objects (and because yaml.load without an explicit Loader
# is rejected by recent PyYAML releases). The `with` block closes the handle.
with open('config.yaml', 'r') as _config_file:
    config = yaml.safe_load(_config_file)


def read_df(trainfile):
    """Read the first 60000 rows of a headerless, tab-separated training file.

    Args:
        trainfile: Path to the TSV file to read.

    Returns:
        A pandas DataFrame with the columns ``id``, ``title``, ``content``
        and ``label`` (assigned here because the file has no header row).
    """
    # The two-character separator '\\t' is interpreted by pandas as a regular
    # expression matching a tab, which only the python engine supports.
    # Selecting that engine explicitly keeps the parsing behavior and avoids
    # the ParserWarning emitted when pandas has to fall back on its own.
    data = pd.read_csv(trainfile, sep='\\t', header=None, nrows=60000,
                       encoding='utf-8', engine='python',
                       names=['id', 'title', 'content', 'label'])
    return data


def word_cut(df):
    """Segment one record with jieba and append it to the output file.

    Args:
        df: A single row indexed positionally as (id, title, content, label).
            Despite the name this is one row, not a whole DataFrame.

    The title and content are tokenized with ``jieba.cut``; the four fields
    are then joined with tabs and appended as one line to the file named by
    ``config['train_cut']``.

    NOTE(review): the token separator was lost when this snippet was
    extracted from the web page; a single space is assumed here -- confirm
    against the original post.
    NOTE(review): this function is run from several workers that all append
    to the same file, so the order of output lines is presumably not
    guaranteed to match the input order -- confirm if ordering matters.
    """
    title_tokens = ' '.join(jieba.cut(df[1]))
    content_tokens = ' '.join(jieba.cut(df[2]))
    # id and label may be parsed as numbers by pandas; str.join only accepts
    # strings, so convert them explicitly.
    line = '\t'.join([str(df[0]), title_tokens, content_tokens, str(df[3])])
    with open(config['train_cut'], 'a+', encoding='utf-8') as f:
        # Emit the record and its newline in one call so a line is never
        # split across two separate writes.
        f.write(line + '\n')


def applyParallel(content, func, n_thread):
    """Call ``func`` on every element of ``content`` through joblib.

    Args:
        content: Iterable of items; each one is passed to ``func`` as its
            single argument.
        func: Callable applied to each item. Its return values are discarded.
        n_thread: Value forwarded to ``Parallel`` as ``n_jobs``.
    """
    # Lazily build the task stream; nothing runs until the pool consumes it.
    tasks = (delayed(func)(item) for item in content)
    with Parallel(n_jobs=n_thread) as pool:
        pool(tasks)


def main():
    """Segment the training file and write the result to the configured path.

    Removes any existing output file first (while ``overwrite`` is True),
    reads ``data/train_fusai.tsv`` with ``read_df`` and passes each row to
    ``word_cut`` through ``applyParallel`` with 22 jobs.
    """
    overwrite = True
    out_path = config['train_cut']
    # word_cut opens the output in append mode, so a leftover file from a
    # previous run has to be deleted or the new lines would be added to it.
    if overwrite and os.path.exists(out_path):
        os.remove(out_path)

    trainfile = 'data/train_fusai.tsv'
    df = read_df(trainfile)
    # .values yields one array per row, indexed positionally by word_cut.
    content = df.values
    applyParallel(content, word_cut, 22)
# Run the pipeline only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

 

结巴并行分词

标签:fun   __name__   pat   jieba   config   thread   inf   app   pre   

原文地址:https://www.cnblogs.com/zle1992/p/8967644.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!