现在做的项目都是用python开发,需要做关键词检查和过滤。之前用c语言做过这样的事情,用的是字典树,蛮高效的:内存占用小,检查速度快。
到了python上,第一想法是在pip上找一个基于c语言的python字典树模块,可惜没找到合适的。如果我会用c写python模块的话,我就自己写一个了,可惜我还不具备这个能力,
只能用python写了,性能差一点就差点吧,内存多一点也无所谓了。
用搜索引擎看了CSDN上网友用python实现的DFA,再对照自己以前用c语言写过的字典树,觉得有些地方不大对,就自己写了一个。可以想象,如果用C语言实现会非常高效,而且占用的空间也特别小。
下面是python代码:
class cNode(object):
    """A single trie node.

    ``children`` stays ``None`` until the first child is attached; after that
    it is a dict mapping one character to a ``(child_node, is_word_end)``
    tuple, where ``is_word_end`` tells whether a keyword ends on that edge.
    """

    def __init__(self):
        self.children = None


class cDfa(object):
    """Keyword detector / masker backed by a trie.

    Keywords and messages must be of the same string type (both text or both
    byte strings); matching is done element by element, so with byte strings
    a multi-byte character is matched and masked byte by byte.
    """

    def __init__(self, lWords):
        """Build the trie from the iterable of keywords ``lWords``."""
        self.root = cNode()
        for sWord in lWords:
            self.addWord(sWord)

    def addWord(self, word):
        """Insert ``word`` into the trie; an empty word is ignored."""
        node = self.root
        iEnd = len(word) - 1
        for i, ch in enumerate(word):
            if node.children is None:
                node.children = {}
            bLast = (i == iEnd)
            if ch in node.children:
                child, bWord = node.children[ch]
                # Never clear an existing end-of-word flag, only set it.
                if bLast and not bWord:
                    node.children[ch] = (child, True)
            else:
                child = cNode()
                node.children[ch] = (child, bLast)
            node = child

    def _matchEnd(self, sMsg, iStart):
        """Return the index just past the shortest keyword at ``iStart``.

        Returns -1 when no keyword starts at ``iStart``.
        """
        p = self.root
        j = iStart
        iLen = len(sMsg)
        while j < iLen and p.children is not None and sMsg[j] in p.children:
            p, bWord = p.children[sMsg[j]]
            j += 1
            if bWord:
                return j
        return -1

    def isContain(self, sMsg):
        """Return True if any keyword occurs anywhere inside ``sMsg``."""
        return any(self._matchEnd(sMsg, i) >= 0 for i in range(len(sMsg)))

    def filter(self, sMsg):
        """Return ``sMsg`` with every keyword occurrence replaced by ``*``.

        Scanning is left to right and the shortest keyword starting at a
        position wins. One ``*`` is emitted per matched element, and the
        result has the same string type as ``sMsg``.
        """
        # Build the mask in the message's own type: mixing a unicode mask
        # with UTF-8 byte strings makes the final join fail on Python 2.
        sStar = u'*' if isinstance(sMsg, type(u'')) else b'*'
        lNew = []
        i = 0
        iLen = len(sMsg)
        while i < iLen:
            iStop = self._matchEnd(sMsg, i)
            if iStop < 0:
                # Slice instead of index so byte strings stay byte strings.
                lNew.append(sMsg[i:i + 1])
                i += 1
            else:
                lNew.append(sStar * (iStop - i))
                i = iStop
        return sMsg[:0].join(lNew)
下面是c语言代码,先是头文件trie_tree.h,随后是对应的实现文件:
#ifndef _TRIE_TREE_H_INCLUDED_
#define _TRIE_TREE_H_INCLUDED_

/* Fan-out of every node: one child slot per possible byte value. */
#define WORD_NUM 256

/* One node of the byte-indexed trie. */
struct trie_node {
    struct trie_node *node[WORD_NUM]; /* children, indexed by the next byte */
    int value;                        /* value given when the node was created */
    int exist;                        /* 1 when a stored word ends at this node */
};

/* Allocate a zero-initialised node carrying `value`. */
struct trie_node *create_trie_node(int value);

/* Store the NUL-terminated byte string `word` in the trie rooted at `root`. */
void trie_tree_insert_word(struct trie_node *root, unsigned char *word);

/* Return 1 if a stored word is found in `word`, return 0 if not. */
int tire_word_is_exist(struct trie_node *root, unsigned char *word);

/*
 * Free the whole trie rooted at `root`.
 * The name is spelled to match the definition in the implementation file;
 * the previous spelling declared here had no definition to link against.
 */
void destroy_trie_tree(struct trie_node *root);

/* Rebuild the trie at `*root` from the word list stored in `filename`. */
void update_trie_tree(struct trie_node **root, const char *filename);

#endif
#include <stdio.h> #include <stdlib.h> #include <string.h> #include <trie_tree.h> struct trie_node *create_trie_node(int value) { struct trie_node * node = calloc(1, sizeof(struct trie_node)); node->value = value; return node; } int tire_word_is_exist(struct trie_node *root, unsigned char *word) { struct trie_node *n = NULL; unsigned char *p = NULL; if (root == NULL) { return 0; } while (*word != 0) { p = word++; n = root; while (*p != 0) { n = n->node[*p]; if (n == NULL) { break; } else if (n->exist == 1) { return 1; } p++; } } return 0; } void trie_tree_insert_word(struct trie_node *root, unsigned char *word) { struct trie_node *n; while (*word != 0) { n = root->node[*word]; if (n == NULL) { n = create_trie_node(*word); root->node[*word] = n; } root = n; word++; } root->exist = 1; } void destroy_trie_tree(struct trie_node *root) { int i; if (root == NULL) { return; } for (i = 0; i < WORD_NUM; i++) { destroy_trie_tree(root->node[i]); } free(root); } void update_trie_tree(struct trie_node **root, const char *filename) { char word[1024]; FILE *fp; char *p; if (*root != NULL) { destroy_trie_tree(*root); } *root = calloc(sizeof(**root),1); fp = fopen(filename, "r"); if (fp == NULL) { printf("file can't open %s\n", filename); return; } while (fgets(word, sizeof(word), fp)) { p = word; while (*p != 0) { if (*p == '\r' || *p == '\n' || *p == ' ') { *p = 0; break; } p++; } trie_tree_insert_word(*root, (unsigned char *)word); } }
DFA和trie字典树实现敏感词过滤(python和c语言),布布扣,bubuko.com
DFA和trie字典树实现敏感词过滤(python和c语言)
原文地址:http://blog.csdn.net/gamesofsailing/article/details/36421539