DFA和trie字典树实现敏感词过滤（python和c语言）

时间：2014-07-03 17:31:03 阅读：402 评论：0 收藏：0 [点我收藏+]

现在做的项目都是用python开发，需要用做关键词检查，过滤关键词，之前用c语言做过这样的事情，用字典树，蛮高效的，内存小，检查快。

到了python上，第一想法是在pip上找一个基于c语言的python字典树模块，可惜没找到合适的，如果我会用c写python模块的话，我就自己写一个了，可惜我还不具备这个能力，

只能用python写了，性能差一点就差点吧，内存多一点也无所谓了。

用搜索引擎看CSDN上的网友的用python实现的DFA，再参照自己以前用c语言写过的字典树，有些不大对，就自己写了一个。想象一下如果用C语言是会非常高效，而且空间也特别小。

某位网友的：DFA 算法实现敏感词过滤（python 实现)

下面是python代码：

class cNode(object):
	def __init__(self):
		self.children = None
		
# The encode of word is UTF-8
# The encode of message is UTF-8
class cDfa(object):
	def __init__(self,lWords):
		self.root=None
		self.root=cNode()
		for sWord in lWords:
			self.addWord(sWord)

	# The encode of word is UTF-8
	def addWord(self,word):
		node = self.root
		iEnd=len(word)-1
		for i in xrange(len(word)):
			if node.children == None:
				node.children = {}
				if i!=iEnd:
					node.children[word[i]]=(cNode(),False)
				else:
					node.children[word[i]]=(cNode(),True)

			elif word[i] not in node.children:
				if i!=iEnd:
					node.children[word[i]]=(cNode(),False)
				else:
					node.children[word[i]]=(cNode(),True)
			else: #word[i] in node.children:
				if i==iEnd:
					Next,bWord=node.children[word[i]]
					node.children[word[i]]=(Next,True)

			node=node.children[word[i]][0]

	def isContain(self,sMsg):
		root=self.root
		iLen=len(sMsg)
		for i in xrange(iLen):
			p = root
			j = i
			while (j<iLen and p.children!=None and sMsg[j] in p.children):
				(p,bWord) = p.children[sMsg[j]]
				if bWord:
					return True
				j = j + 1
		return False

	def filter(self,sMsg):
		lNew=[]
		root=self.root
		iLen=len(sMsg)
		i=0
		bContinue=False
		while i<iLen:
			p=root
			j=i
			while (j<iLen and p.children!=None and sMsg[j] in p.children):
				(p,bWord) = p.children[sMsg[j]]
				if bWord:
					#print sMsg[i:j+1]
					lNew.append(u'*'*(j-i+1))#关键字替换
					i=j+1
					bContinue=True
					break
				j=j+1
			if bContinue:
				bContinue=False
				continue
			lNew.append(sMsg[i])
			i=i+1
		return ''.join(lNew)

下面是c语言代码trie_tree.h：

#ifndef _TRIE_TREE_H_INCLUDED_
#define _TRIE_TREE_H_INCLUDED_


#define WORD_NUM          256
struct trie_node {
	struct trie_node *node[WORD_NUM];
	int value;
	int exist;
};

struct trie_node *create_trie_node(int value);
void trie_tree_insert_word(struct trie_node *root, unsigned char *word);
/* return 1 表示存在， return 0表示不存在 */
int tire_word_is_exist(struct trie_node *root, unsigned char *word);
void destory_trie_tree(struct trie_node *root);
void update_trie_tree(struct trie_node **root, const char *filename);

#endif

trie_tree.c:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <trie_tree.h>




struct trie_node *create_trie_node(int value)
{
	struct trie_node * node = calloc(1, sizeof(struct trie_node));
	node->value = value;
	return node;
}

int tire_word_is_exist(struct trie_node *root, unsigned char *word)
{
	struct trie_node *n = NULL;
	unsigned char *p = NULL;
	
	if (root == NULL) {
		return 0;
	}
	
	while (*word != 0) {
		p = word++;
		n = root;
		while (*p != 0) {
			n = n->node[*p];
			if (n == NULL) {
				break;
			}
			else if (n->exist == 1) {
				return 1;
			}
			p++;
		}
	}
		
	return 0;
}

void trie_tree_insert_word(struct trie_node *root, unsigned char *word) 
{
	struct trie_node *n;
	while (*word != 0) {
		n = root->node[*word];
		if (n == NULL) {
			n = create_trie_node(*word);
			root->node[*word] = n;
		}
		root = n;
		word++;
	}
	root->exist = 1;
}

void destroy_trie_tree(struct trie_node *root) 
{
	int i;
	if (root == NULL) {
		return;
	}
	for (i = 0; i < WORD_NUM; i++) {
		destroy_trie_tree(root->node[i]);
	}
	free(root);
}


void update_trie_tree(struct trie_node **root, const char *filename)
{
	char word[1024];
	FILE *fp;
	char *p;
	
	if (*root != NULL) {
		destroy_trie_tree(*root);
	}

	*root = calloc(sizeof(**root),1);

	fp = fopen(filename, "r");
	if (fp == NULL) {
		printf("file can't open %s\n", filename);
		return;
	}

	while (fgets(word, sizeof(word), fp)) {
		p = word;

		while (*p != 0) {
			if (*p == '\r' || *p == '\n' || *p == ' ') {
				*p = 0;
				break;
			}
			p++;
		}
		trie_tree_insert_word(*root, (unsigned char *)word);
	}
}

DFA和trie字典树实现敏感词过滤（python和c语言）,布布扣,bubuko.com

DFA和trie字典树实现敏感词过滤（python和c语言）

标签：trie dfa python

原文地址：http://blog.csdn.net/gamesofsailing/article/details/36421539

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行