标签:port 频率 简单 == def axis line from log
tfidf原理的简单描述:
以一个图书馆为例,
tf: 该单词在图书馆所有书里出现的频率(即该单词在所有书中出现的总次数 ÷ 所有书的总词数)
idf: log((图书馆所有书的数量+平滑系数)/(该单词出现过的书的数量+平滑系数)+1)
tfidf = tf*idf
import numpy as np
from collections import defaultdict


class TFIDF(object):
    """Corpus-level TF-IDF scorer.

    The score of word ``w`` in every document that contains it is::

        tf(w)  = (occurrences of w in the whole corpus) / (total words in corpus)
        idf(w) = log((n_docs + smooth) / (docs_containing_w + smooth) + 1)
        score  = tf(w) * idf(w)

    Note that ``tf`` is computed over the whole corpus, not per document, so
    a word has the same score in every document in which it appears.

    Args:
        corpus: list of documents. Each document is either a list of tokens
            or a string that is split on ``word_sep``.
        word_sep: separator used to split string documents.
        smooth_value: smoothing constant added to both document counts in
            the idf ratio.
        scale: if true, each output row is standardized (zero mean, unit
            standard deviation).
    """

    def __init__(self, corpus, word_sep=' ', smooth_value=0.01, scale=False):
        assert isinstance(corpus, list), 'Not support this type corpus.'
        self.corpus = corpus
        self.vob = defaultdict(int)       # word -> occurrences in the corpus
        self.word_sep = word_sep
        self.smooth_value = smooth_value
        self.doc_cnt = defaultdict(set)   # word -> indices of docs containing it
        self.scale = scale

    def _tokenize(self, line):
        """Return the tokens of one document, splitting it if it is a string."""
        if isinstance(line, str):
            return line.split(self.word_sep)
        return line

    def get_tf_idf(self):
        """Compute the TF-IDF matrix for the corpus.

        Returns:
            numpy.ndarray of shape ``(len(corpus), vocabulary_size)``.
            Columns follow the order in which words are first seen; entry
            ``[i, j]`` is the score of word ``j`` if document ``i`` contains
            it, otherwise 0 (before optional row scaling).
        """
        # Rebuild the statistics from scratch so that calling this method
        # repeatedly does not keep inflating the stored counts.
        self.vob = defaultdict(int)
        self.doc_cnt = defaultdict(set)
        for i, line in enumerate(self.corpus):
            for w in self._tokenize(line):
                self.vob[w] += 1
                self.doc_cnt[w].add(i)

        n_docs = len(self.corpus)
        vocab = list(self.vob)
        output = np.zeros((n_docs, len(vocab)))
        if not vocab:
            # No words at all: nothing to score (and nothing to divide by).
            return output

        for j, w in enumerate(vocab):
            docs_with_w = self.doc_cnt[w]
            idf = np.log(
                (self.smooth_value + n_docs)
                / (self.smooth_value + len(docs_with_w))
                + 1
            )
            # doc_cnt already records which documents contain w, so the
            # column is filled directly instead of re-scanning every document.
            output[sorted(docs_with_w), j] = self.vob[w] * idf

        # Turn raw corpus counts into corpus-level frequencies.
        output = output / sum(self.vob.values())

        if self.scale:
            mean = output.mean(axis=1, keepdims=True)
            std = output.std(axis=1, keepdims=True)
            # A constant row (e.g. an empty document) has zero deviation;
            # dividing by 1 leaves it at 0 instead of producing NaN.
            std[std == 0] = 1.0
            output = (output - mean) / std
        return output


if __name__ == '__main__':
    corpus = [['this', 'is', 'a', 'simple', 'tfidf', 'code'],
              ['python', 'is', 'a', 'code', 'language'],
              ['learning', 'python', 'make', 'things', 'simple']]
    result = TFIDF(corpus)
    print(result.get_tf_idf())
标签:port 频率 简单 == def axis line from log
原文地址:https://www.cnblogs.com/laresh/p/12440051.html