标签:port 频率 简单 == def axis line from log
tfidf原理的简单描述:
以一个图书馆为例,
tf: 该单词在图书馆所有书中出现的总次数 ÷ 图书馆所有书的总词数(即整个语料库上的词频,而不是单本书内的词频)
idf: $\mathrm{idf}(w) = \log\left(\frac{N + s}{n_w + s} + 1\right)$,其中 $N$ 为图书馆所有书的数量,$n_w$ 为出现过该单词的书的数量,$s$ 为平滑系数
tfidf = tf*idf
import numpy as np
from collections import defaultdict
class TFIDF(object):
    """Corpus-level TF-IDF weighting.

    For every word ``w`` in the vocabulary:

    * ``tf(w)``  = occurrences of ``w`` in the whole corpus / total number
      of tokens in the corpus (a corpus-wide frequency, not per document);
    * ``idf(w)`` = ``log((N + s) / (n_w + s) + 1)`` where ``N`` is the number
      of documents, ``n_w`` the number of documents containing ``w`` and
      ``s`` is ``smooth_value``.

    Cell ``[i, j]`` of the result is ``tf(w_j) * idf(w_j)`` when document
    ``i`` contains word ``w_j`` and ``0`` otherwise.

    Args:
        corpus: list of documents. Each document is either a string (split
            on ``word_sep``) or an already tokenized sequence of words.
        word_sep: separator used to split string documents.
        smooth_value: smoothing constant added to both document counts in
            the idf ratio.
        scale: when true, each row of the result is standardized to zero
            mean and unit standard deviation.

    Raises:
        AssertionError: if ``corpus`` is not a list.
    """

    def __init__(self, corpus, word_sep=' ', smooth_value=0.01, scale=False):
        assert isinstance(corpus, list), 'Not support this type corpus.'
        self.corpus = corpus
        # word -> total number of occurrences in the corpus
        self.vob = defaultdict(int)
        self.word_sep = word_sep
        self.smooth_value = smooth_value
        # word -> set of indices of the documents that contain it
        self.doc_cnt = defaultdict(set)
        self.scale = scale

    def _tokenize(self, line):
        """Return the words of one document, splitting it if it is a string."""
        if isinstance(line, str):
            return line.split(self.word_sep)
        return line

    def get_tf_idf(self):
        """Build the TF-IDF matrix for ``self.corpus``.

        Returns:
            numpy.ndarray of shape ``(len(corpus), vocabulary size)``.
            Columns follow the order in which words are first seen in the
            corpus (the key order of ``self.vob``).
        """
        # Start from clean statistics so that calling this method more than
        # once does not keep inflating ``self.vob``.
        self.vob = defaultdict(int)
        self.doc_cnt = defaultdict(set)

        docs = [self._tokenize(line) for line in self.corpus]
        for doc_idx, words in enumerate(docs):
            for w in words:
                self.vob[w] += 1
                self.doc_cnt[w].add(doc_idx)

        n_docs = len(docs)
        vocab = list(self.vob)
        output = np.zeros((n_docs, len(vocab)))
        if not vocab:
            # No tokens at all: nothing to weight, and the normalisation
            # below would otherwise divide by zero.
            return output

        # The weight depends only on the word, so compute it once per word
        # instead of once per (document, word) pair.
        smooth = self.smooth_value
        column = {}
        weight = {}
        for j, w in enumerate(vocab):
            column[w] = j
            weight[w] = self.vob[w] * np.log(
                (smooth + n_docs) / (smooth + len(self.doc_cnt[w])) + 1
            )

        for doc_idx, words in enumerate(docs):
            # Visit each distinct word of the document once (set lookup
            # instead of scanning the token list for every vocabulary word).
            for w in set(words):
                output[doc_idx, column[w]] = weight[w]

        # Turn raw corpus counts into corpus frequencies.
        output = output / sum(self.vob.values())

        if self.scale:
            mean = output.mean(axis=1, keepdims=True)
            std = output.std(axis=1, keepdims=True)
            # A constant row has zero deviation; divide by 1 so it becomes
            # all zeros instead of NaN.
            std[std == 0] = 1.0
            output = (output - mean) / std
        return output
# Demo: print the TF-IDF matrix of a tiny, already tokenized corpus.
if __name__ == '__main__':
    corpus = [
        ['this', 'is', 'a', 'simple', 'tfidf', 'code'],
        ['python', 'is', 'a', 'code', 'language'],
        ['learning', 'python', 'make', 'things', 'simple'],
    ]
    result = TFIDF(corpus)
    # One row per document, one column per distinct word.
    print(result.get_tf_idf())
标签:port 频率 简单 == def axis line from log
原文地址:https://www.cnblogs.com/laresh/p/12440051.html