"""Print the most frequent words of ``new.txt`` after jieba segmentation.

Digits and punctuation are blanked out, the text is segmented with jieba,
words that occur only once or that are on the stop-word list are dropped,
and the remaining ``(word, count)`` pairs are printed most frequent first.
"""

from collections import Counter
from typing import Iterable, List, Tuple

INPUT_PATH = "new.txt"
TOP_N = 19

# Every character listed here is replaced by a single space before
# segmentation: ASCII digits, CJK/ASCII punctuation and the newline.
# NOTE(review): the set contains the CJK character "一" (one); it looks like
# it may have been intended as a dash, and blanking it means "一个"/"一点" in
# STOP_WORDS can never match. Kept as-is to preserve results -- confirm.
STRIP_CHARS = "1234567890一!!“”,。?、;’\"',.、:()()\n‘’"
_STRIP_TABLE = str.maketrans({ch: " " for ch in STRIP_CHARS})

# Particles and other high-frequency filler tokens that are never reported.
# The plain space is listed because blanked-out characters come back from the
# segmenter as space tokens.
STOP_WORDS = frozenset({
    '说', '有', '得', '没', '的', '他', '了', '她', '是', '在', '—', '你',
    '走', '对', '他们', '着', '把', '不', '也', '我', '人', '而', '与', '就',
    '可是', '那', '要', '又', '想', '和', '一个', ' ', '呢', '很', '一点',
    '都', '去', '没有', '个', '上', '给', '来', '还', '到', '这', '\u3000',
    '点', '小', '看',
})


def strip_noise(text: str) -> str:
    """Return *text* with every character of ``STRIP_CHARS`` turned into a space."""
    return text.translate(_STRIP_TABLE)


def rank_words(
    tokens: Iterable[str],
    stop_words: Iterable[str] = STOP_WORDS,
    min_count: int = 2,
) -> List[Tuple[str, int]]:
    """Return ``(word, count)`` pairs sorted by descending count.

    Words occurring fewer than *min_count* times are dropped (a word that
    appears only once is generally not meaningful), as are all *stop_words*.
    Words with equal counts keep the order of their first occurrence.
    """
    counts = Counter(tokens)  # single pass instead of list.count() per token
    skipped = set(stop_words)
    kept = [
        (word, freq)
        for word, freq in counts.items()
        if freq >= min_count and word not in skipped
    ]
    # sorted() is stable, so ties stay in first-occurrence order.
    kept.sort(key=lambda pair: pair[1], reverse=True)
    return kept


def segment(text: str) -> List[str]:
    """Split *text* into words with ``jieba.lcut``."""
    # Imported here so the pure helpers above remain usable (and testable)
    # when the third-party segmenter is not installed.
    import jieba

    return list(jieba.lcut(text))


def main() -> None:
    """Read ``INPUT_PATH`` and print its ``TOP_N`` most frequent words."""
    # The context manager closes the file even if reading fails.
    with open(INPUT_PATH, 'r', encoding='utf-8') as source:
        text = source.read()

    ranking = rank_words(segment(strip_noise(text)))
    # Slicing (rather than indexing 0..TOP_N-1) avoids an IndexError when
    # fewer than TOP_N words survive the filtering.
    for entry in ranking[:TOP_N]:
        print(entry)


if __name__ == "__main__":
    main()
运行结果:
('工会', 17)
('日', 16)
('月', 12)
('清明节', 11)
('经费', 10)
('不准', 8)
('元', 7)
('将', 7)
('上调', 6)
('节日', 6)
('假期', 6)
('规定', 5)
('基层', 5)
('号', 5)
('汽油', 5)
('每升', 4)
('福利', 4)
('标准', 4)
('发放', 4)