标签:lin nta htm div items range 英文 UNC text
"""Render a word cloud of the English words found in a text file.

The script reads a UTF-8 text file, keeps the tokens that start with an
ASCII letter, counts how often each one occurs (case-sensitively) and
writes a pentagon-shaped word cloud to an HTML file.
"""
import re
from collections import Counter

import jieba
import nltk
from nltk.tokenize import WordPunctTokenizer
from pyecharts import WordCloud

# Input text and rendered output locations.
SOURCE_PATH = r'F:\算法\others\merry.txt'
OUTPUT_PATH = 'test.html'

# Compiled once: used to test whether a line contains any ASCII letter.
_ASCII_LETTER = re.compile('([a-zA-Z])')


def extract_english_words(lines):
    """Return the tokens in *lines* that begin with an ASCII letter.

    Args:
        lines: Iterable of text lines (an open text file works).

    Returns:
        A list of tokens in order of appearance, duplicates included and
        original letter case preserved.
    """
    tokenizer = WordPunctTokenizer()
    words = []
    for line in lines:
        stripped = line.strip()
        # Lines without a single ASCII letter cannot contribute a word.
        if not _ASCII_LETTER.search(stripped):
            continue
        # Keep only code points below 256 before tokenizing; this removes
        # CJK characters that are mixed into the same line.
        latin_only = "".join(ch for ch in stripped if ord(ch) < 256)
        words.extend(
            token
            for token in tokenizer.tokenize(latin_only)
            # Discard punctuation, digits and accented-letter tokens.
            if "a" <= token[0].lower() <= "z"
        )
    return words


def main():
    """Count the English words in SOURCE_PATH and render the word cloud."""
    with open(SOURCE_PATH, 'r', encoding='utf-8') as f:
        # Counter tallies every word in one pass instead of calling
        # list.count() once per distinct word.
        counts = Counter(extract_english_words(f))

    names = list(counts)
    # Frequencies are scaled by 100 before being used as word weights.
    values = [count * 100 for count in counts.values()]

    wordcloud = WordCloud(width=1300, height=620)
    wordcloud.add("", names, values, word_size_range=[20, 100], shape='pentagon')
    wordcloud.render(OUTPUT_PATH)


if __name__ == "__main__":
    main()
标签:lin nta htm div items range 英文 UNC text
原文地址:https://www.cnblogs.com/pandaboy1123/p/10175229.html