标签:spl single 初始 soft https nsf sha 最简 ret
#!/usr/bin/env python
# encoding: utf-8

"""
@author: zkjiang
@site: https://www.github.com
@software: PyCharm
@file: TFIDF.py
@time: 2019/2/2 12:33
"""

import numpy as np


class TFIDF(object):
    """Minimal hand-written TF-IDF vectorizer.

    Attributes:
        vob: dict mapping each token to its total number of occurrences
            in the whole corpus.
        word_id: dict mapping each token to its column index, assigned in
            order of first appearance.
        smooth_idf: smoothing constant added to both the document count and
            the document frequency when computing idf.
        corpus: the documents to vectorize.
    """

    def __init__(self, corpus):
        """Store the corpus; nothing is computed until a method is called.

        :param corpus: iterable of documents. Each document is either a
            sequence of tokens or a string of space-separated tokens.
        """
        self.word_id = {}
        self.vob = {}
        self.corpus = corpus
        self.smooth_idf = 0.01

    @staticmethod
    def _tokenize(document):
        """Return the token sequence of one document.

        Strings have leading/trailing newlines stripped and are split on
        single spaces; any other document is returned unchanged.
        """
        if isinstance(document, str):
            return document.strip("\n").split(" ")
        return document

    def fit_transform(self, corpus):
        """Replace the stored corpus with ``corpus`` and vectorize it.

        :param corpus: same format as the constructor argument.
        :return: the matrix produced by :meth:`get_tf_idf`.
        """
        self.corpus = corpus
        return self.get_tf_idf()

    def get_vob_fre(self):
        """Build the vocabulary and the raw term-count matrix.

        Rebuilds ``self.vob`` and ``self.word_id`` from scratch on every
        call, so calling this method repeatedly yields the same result
        instead of accumulating counts.

        Counts are raw (not divided by the number of tokens); the scaling
        is applied later in :meth:`get_tf_idf`.

        :return: integer array of shape (n_documents, n_tokens) whose
            ``[i, j]`` cell is the number of times token ``j`` occurs in
            document ``i``.
        """
        # Tokenize once and reuse for both the vocabulary and matrix passes.
        documents = [self._tokenize(document) for document in self.corpus]

        self.vob = {}
        self.word_id = {}
        for tokens in documents:
            for word in tokens:
                if word not in self.word_id:
                    # Column indices follow order of first appearance.
                    self.word_id[word] = len(self.word_id)
                self.vob[word] = self.vob.get(word, 0) + 1

        counts = np.zeros((len(documents), len(self.word_id)), dtype=int)
        for row, tokens in enumerate(documents):
            for word in tokens:
                counts[row, self.word_id[word]] += 1
        return counts

    def get_tf_idf(self):
        """Compute the TF-IDF matrix for the stored corpus.

        idf is ``log((n_documents + s) / (df + s)) + 1`` where ``df`` is the
        number of documents containing the token and ``s`` is
        ``self.smooth_idf``. The weighted counts are divided by the
        vocabulary size, which is a constant scale factor.

        :return: float array of shape (n_documents, n_tokens).
        """
        counts = self.get_vob_fre()
        n_samples, n_features = counts.shape
        if n_features == 0:
            # Empty vocabulary: there is nothing to weight.
            return counts.astype(float)

        # Document frequency: number of documents with a non-zero count.
        df = np.count_nonzero(counts, axis=0)

        # Smooth in floating point; casting the constant to int would
        # truncate 0.01 to 0 and silently disable smoothing.
        smooth = float(self.smooth_idf)
        idf = np.log((n_samples + smooth) / (df + smooth)) + 1  # core formula
        return counts * idf / n_features


if __name__ == '__main__':
    corpus = [["我","a","e"],["我","a","c"],["我","a","b"]]
    test = TFIDF(corpus)
    print(test.get_tf_idf())
标签:spl single 初始 soft https nsf sha 最简 ret
原文地址:https://www.cnblogs.com/smartisn/p/12459801.html