标签:
#!/usr/bin/python
# coding=utf-8
"""Segment a Chinese text file with jieba and print keyword/frequency stats.

Steps: read the input file, strip all whitespace, segment the text with
jieba, print the top-10 keywords from TF-IDF (before and after loading a
stop-word list) and from TextRank, then print how many distinct words occur
at each frequency.

Usage: ``python script.py [input_file]`` (the input defaults to ``10.txt``).
``userdict.txt``, ``extra_dict/idf.txt.big`` and
``extra_dict/cn_stop_words.txt`` are resolved relative to the working
directory.

The code avoids Python-2-only syntax so it runs on Python 2.7 and Python 3.
On Python 2, set ``PYTHONIOENCODING=utf-8`` when redirecting the output,
because the ``sys.setdefaultencoding`` hack is no longer used.
"""
import collections
import operator
import os  # noqa: F401 -- retained from the original script
import re
import sys
import time

try:
    import urllib2  # noqa: F401 -- Python 2 only; retained from the original
except ImportError:  # Python 3 has no urllib2
    pass

DEFAULT_INPUT = "10.txt"
USER_DICT = "userdict.txt"
IDF_PATH = "extra_dict/idf.txt.big"
STOP_WORDS_PATH = "extra_dict/cn_stop_words.txt"
TOP_K = 10


def read_text(path):
    """Return the content of *path* decoded to text.

    UTF-8 is tried first; on failure the bytes are decoded as GBK with
    undecodable bytes dropped, so a file in either encoding can be read.
    """
    with open(path, "rb") as handle:
        raw = handle.read()
    try:
        return raw.decode("utf-8")
    except UnicodeDecodeError:
        return raw.decode("gbk", "ignore")


def count_words(words):
    """Count occurrences of each word.

    Returns ``(counts, top_count, top_word)`` where *counts* maps each word
    to its number of occurrences, *top_count* is the highest count and
    *top_word* is a word having that count. For empty input the result is
    ``({}, 0, "")``.
    """
    counts = {}
    top_count = 0
    top_word = u""
    for word in words:
        counts[word] = counts.get(word, 0) + 1
        # ">=" keeps the original tie-break: among words sharing the highest
        # count, the one that reached it last is reported.
        if counts[word] >= top_count:
            top_count = counts[word]
            top_word = word
    return counts, top_count, top_word


def summarize_frequencies(counts):
    """Summarise a word -> count mapping by frequency level.

    Returns ``(freq_table, distinct_total, token_total)``:

    * *freq_table*: ``(frequency, number_of_distinct_words)`` pairs sorted by
      ascending frequency;
    * *distinct_total*: number of distinct words;
    * *token_total*: total number of word occurrences.
    """
    levels = collections.Counter(counts.values())
    freq_table = sorted(levels.items(), key=operator.itemgetter(0))
    # The distinct total is the number of keys. The original summed the
    # frequency levels themselves, which is a different quantity.
    return freq_table, len(counts), sum(counts.values())


def format_frequency_table(freq_table, per_row=4):
    """Render *freq_table* as text lines holding *per_row* entries each."""
    lines = []
    for start in range(0, len(freq_table), per_row):
        cells = [
            u"频率:{0:<4} 词数:{1:>6}".format(freq, n_words)
            for freq, n_words in freq_table[start:start + per_row]
        ]
        lines.append(u"    ".join(cells))
    return lines


def main(path=DEFAULT_INPUT):
    """Run the whole analysis on *path* and print the report to stdout."""
    # jieba is imported here rather than at module level so the pure helpers
    # above can be imported without jieba being installed or initialised.
    import jieba
    import jieba.analyse
    import jieba.posseg as pseg  # noqa: F401 -- retained from the original

    jieba.load_userdict(USER_DICT)
    jieba.initialize()

    # As in the original, the timer starts after jieba has been initialised.
    started = time.time()

    content = read_text(path)
    print(u"文章长度: {0}".format(len(content)))

    # Remove every whitespace character; re.UNICODE makes Python 2 agree with
    # Python 3 on what counts as whitespace in text.
    text = re.sub(r"\s", u"", content, flags=re.UNICODE)
    print(u"用正则干掉所有的空白后,字符长度 {0}".format(len(text)))

    # Precise-mode segmentation (cut_all=False).
    words = list(jieba.cut(text, cut_all=False))
    print(u"分词的总数: {0}".format(len(words)))
    print(u"不重复的单词数: {0}".format(len(set(words))))

    # TF-IDF keywords, first without and then with the stop-word list.
    jieba.analyse.set_idf_path(IDF_PATH)
    print(u"TF-IDF 未去除停用词, 获取10个关键词")
    print(u",".join(jieba.analyse.extract_tags(text, topK=TOP_K)))

    jieba.analyse.set_stop_words(STOP_WORDS_PATH)
    print(u"TF-IDF 去除停用词")
    print(u",".join(jieba.analyse.extract_tags(text, topK=TOP_K)))

    # topK is passed explicitly so the result matches the "10 keywords" label
    # whatever the installed jieba version uses as its default.
    print(u"TextRank, 获取10个关键词")
    print(u",".join(jieba.analyse.textrank(text, topK=TOP_K)))

    counts, top_count, top_word = count_words(words)
    print(u"分词最多重复的次数: {0} 分词是: {1}".format(top_count, top_word))

    freq_table, distinct_total, token_total = summarize_frequencies(counts)
    print(u"\t\t频率是单词出现的次数, 次数是出现对应次数的所有不同单词的总和")
    for line in format_frequency_table(freq_table):
        print(line)
    print(u"")

    print(u"一共有 {0} 个不同的词或词组".format(distinct_total))
    print(u"一共有 {0} 个词或词组".format(token_total))
    print(u"")
    print(u"")

    print(u"运行时间 {0}".format(time.time() - started))


if __name__ == "__main__":
    main(sys.argv[1] if len(sys.argv) > 1 else DEFAULT_INPUT)
Building prefix dict from C:\Python27\lib\site-packages\jieba-0.36.2-py2.7.egg\jieba\dict.txt ...
Dumping model to file cache c:\users\og\appdata\local\temp\jieba.cache
Loading model cost 2.16899991035 seconds.
Prefix dict has been built succesfully.
标签:
原文地址:http://www.cnblogs.com/hgonlywj/p/4842689.html