码迷,mamicode.com
首页 > 编程语言 > 详细

C++ 敏感字符过滤

时间:2015-05-12 21:07:38      阅读:169      评论:0      收藏:0      [点我收藏+]

标签:c++ 敏感字符

WordNode.h
#ifndef __TOOLS_WORDNODE_H_INCLUDE__
#define __TOOLS_WORDNODE_H_INCLUDE__


#include <map>


// One node of the sensitive-word trie.
//
// A node stores the character it represents (m_cWord, possibly several bytes),
// an end-of-word marker (m_nEndTag, 1 when a sensitive word ends here) and its
// children keyed by their character. The node OWNS its children: they are
// deleted by the destructor and by Reset().
class CWordNode
{
    typedef std::map<std::string, CWordNode*> umap;

public:
    // Creates a childless node for the character `word`.
    CWordNode(const std::string& word) : m_nEndTag(0) { Reset(word); }

    // Destroys the node together with its whole subtree.
    ~CWordNode()
    {
        DeleteChildren();
        m_nEndTag = 0;
    }

    // Re-initialises the node to represent `word`, dropping the end marker
    // and releasing every child. The children are deleted, not merely
    // forgotten, otherwise resetting a populated node leaks its subtree.
    void Reset(const std::string& word)
    {
        m_cWord   = word;
        m_nEndTag = 0;
        DeleteChildren();
    }

public:
    std::string m_cWord;        // character represented by this node
    int         m_nEndTag;      // 1 when a sensitive word ends at this node
    umap        m_mapWordNodes; // owned children, keyed by their character

private:
    // Deletes every owned child (recursively, through their destructors)
    // and empties the map.
    void DeleteChildren()
    {
        for (umap::iterator it = m_mapWordNodes.begin(); it != m_mapWordNodes.end(); ++it)
        {
            delete it->second;
        }
        m_mapWordNodes.clear();
    }

    // Not copyable: a copy would share the child pointers and both copies
    // would delete them. Declared but intentionally left undefined.
    CWordNode(const CWordNode&);
    CWordNode& operator=(const CWordNode&);
};


#endif // __TOOLS_WORDNODE_H_INCLUDE__


WordsFilter.h
#ifndef __TOOLS_WORDSFILTER_H_INCLUDE__
#define __TOOLS_WORDSFILTER_H_INCLUDE__


#include <list>
#include "WordNode.h"


class CWordsFilter
{
typedef std::map<std::string, CWordNode*> umap;
private:
std::list<std::string> m_lsAllSensitiveWords; // 所有敏感词列表
CWordNode*             m_rootWordNode;
    bool                   m_bIsInit;


public:
CWordsFilter();
~CWordsFilter();


static CWordsFilter&  GetInstance(); // 获取共享实例


void                   InitSensitiveWords(std::string strWord);                        // 初始化敏感词集
void                   InitSensitiveWords(std::list<std::string> lsAllSensitiveWords); // 初始化敏感词集
std::string            FilterSensitiveWords(const std::string& strContent);      // 过滤敏感词


private:
void                   BuildWordTree(); // 构建敏感词树
void                   InsertNode(CWordNode* pNode, const std::string& strContent, int nIndex);
CWordNode*             FindNode(CWordNode* pNode, const std::string& word);
int                    GetFirstBytes(const std::string& str); // 获取字符串中的第一个字符字节长度


};


#endif // __TOOLS_WORDSFILTER_H_INCLUDE__


WordsFilter.cpp
#include <cmath>
#include <fstream>
#include <iostream>
#include <list>
#include <sstream>
#include <string>
#include <vector>
#include "WordsFilter.h"


// Byte length of a single character; used as shared scratch state by code in
// this file (starts at 2).
// NOTE(review): this is a mutable global with external linkage, so any
// function that assigns it is neither reentrant nor thread-safe; a local
// variable in each user would be safer.
int nStep = 2;


typedef std::vector<std::string> Tokens;

// Splits `src` at every character that occurs in `sep` and returns the
// non-empty pieces in their original order. Consecutive, leading and trailing
// separators never produce empty tokens; an empty `sep` yields `src` as a
// single token (or nothing when `src` is empty).
Tokens StrSplit(const std::string &src, const std::string &sep)
{
    Tokens pieces;

    // Jump from the start of one token to the next, skipping separator runs.
    std::string::size_type first = src.find_first_not_of(sep);
    while (first != std::string::npos)
    {
        const std::string::size_type stop = src.find_first_of(sep, first);
        if (stop == std::string::npos)
        {
            // Last token runs to the end of the input.
            pieces.push_back(src.substr(first));
            break;
        }
        pieces.push_back(src.substr(first, stop - first));
        first = src.find_first_not_of(sep, stop);
    }

    return pieces;
}


// Returns the number of bytes occupied by the first character of `str`,
// judged from its leading byte using the UTF-8 length prefixes (1 to 6 bytes).
//
// Only the first byte is inspected. A byte that cannot start a character
// (a continuation byte 10xx xxxx, or 0xFE/0xFF) is reported as a 1-byte
// character so that callers advance past it and resynchronise, instead of
// being handed the length of some later character. The result never exceeds
// str.size() for a non-empty string, so a sequence truncated at the end of
// the input cannot push callers out of range. An empty string yields 1.
int CWordsFilter::GetFirstBytes(const std::string& str)
{
    if (str.empty())
    {
        return 1;
    }

    const unsigned char lead = static_cast<unsigned char>(str[0]);

    int nBytes = 1; // 0xxx xxxx (ASCII), or a byte that cannot start a character
    if ((lead >> 5) == 0x06)      // 110x xxxx
    {
        nBytes = 2;
    }
    else if ((lead >> 4) == 0x0E) // 1110 xxxx
    {
        nBytes = 3;
    }
    else if ((lead >> 3) == 0x1E) // 1111 0xxx
    {
        nBytes = 4;
    }
    else if ((lead >> 2) == 0x3E) // 1111 10xx
    {
        nBytes = 5;
    }
    else if ((lead >> 1) == 0x7E) // 1111 110x
    {
        nBytes = 6;
    }

    // Clamp a truncated trailing sequence to the bytes that actually exist.
    const std::string::size_type nAvailable = str.size();
    if (static_cast<std::string::size_type>(nBytes) > nAvailable)
    {
        nBytes = static_cast<int>(nAvailable);
    }
    return nBytes;
}


// Creates an uninitialised filter: no words, no trie, init flag cleared.
// Initialisers are listed in member declaration order.
CWordsFilter::CWordsFilter()
    : m_lsAllSensitiveWords(),
      m_rootWordNode(NULL),
      m_bIsInit(false)
{
}


// Releases the trie root and empties the stored word list.
CWordsFilter::~CWordsFilter()
{
    delete m_rootWordNode;
    m_rootWordNode = NULL;
    m_lsAllSensitiveWords.clear();
}


void CWordsFilter::InitSensitiveWords(std::string strWord)
{
Tokens token = StrSplit(strWord, ",");
std::list<std::string> lsAllSensitiveWords;
Tokens::iterator Ite = token.begin();
while (Ite != token.end())
{
lsAllSensitiveWords.push_back(*Ite);
++Ite;
}
InitSensitiveWords(lsAllSensitiveWords);
}


void CWordsFilter::InitSensitiveWords(std::list<std::string> lsAllSensitiveWords)
{
std::cout << "start init sensitive words" << std::endl;
this->m_lsAllSensitiveWords.clear();
this->m_lsAllSensitiveWords = lsAllSensitiveWords;


    BuildWordTree();
this->m_bIsInit = true;
}


// Returns a copy of `strContent` in which every sensitive word is replaced by
// one '*' per character of the word.
//
// The text is scanned left to right. At each position the trie is walked
// character by character; the first node carrying an end marker ends the
// match (shortest word starting there). If no word starts at the position,
// its first character is copied unchanged and scanning resumes at the next
// character. This also holds when the input ends in the middle of a partial
// match, so a word that begins inside that partial match is still found.
//
// If the filter has not been initialised, a message is logged to stdout and
// an empty string is returned.
std::string CWordsFilter::FilterSensitiveWords(const std::string& strContent)
{
    if (!this->m_bIsInit || NULL == this->m_rootWordNode)
    {
        std::cout << "the sensitive words is not init" << std::endl;
        return "";
    }

    typedef std::string::size_type size_type;

    // Longest character GetFirstBytes can report; only this many bytes are
    // copied per lookup instead of the whole remaining text.
    const size_type kMaxCharBytes = 6;
    const size_type nTotal = strContent.size();

    std::string strBuffer;
    strBuffer.reserve(nTotal);

    size_type nStart = 0;
    while (nStart < nTotal)
    {
        CWordNode* pNode        = this->m_rootWordNode;
        size_type  nCursor      = nStart;
        size_type  nFirstLen    = 0;     // byte length of the character at nStart
        size_type  nMatchedChars = 0;    // characters consumed by the current match
        bool       bMatched     = false;

        // Walk the trie from nStart until a word ends or the path breaks.
        while (nCursor < nTotal)
        {
            const std::string strWindow = strContent.substr(nCursor, kMaxCharBytes);

            // Keep the step within [1, bytes available] so the scan always
            // advances and never runs past the end of the input.
            const int nReported = GetFirstBytes(strWindow);
            size_type nLen = (nReported < 1) ? 1 : static_cast<size_type>(nReported);
            if (nLen > strWindow.size())
            {
                nLen = strWindow.size();
            }
            if (nCursor == nStart)
            {
                nFirstLen = nLen;
            }

            pNode = FindNode(pNode, strWindow.substr(0, nLen));
            if (NULL == pNode)
            {
                break;
            }

            nCursor += nLen;
            ++nMatchedChars;
            if (pNode->m_nEndTag == 1)
            {
                bMatched = true;
                break;
            }
        }

        if (bMatched)
        {
            strBuffer.append(nMatchedChars, '*');
            nStart = nCursor;
        }
        else
        {
            // No word starts here: keep this character and retry from the next.
            strBuffer.append(strContent, nStart, nFirstLen);
            nStart += nFirstLen;
        }
    }

    return strBuffer;
}


void CWordsFilter::BuildWordTree()
{
if ( this->m_rootWordNode == NULL )
{
this->m_rootWordNode = new CWordNode("R");
if (NULL == this->m_rootWordNode)
{
return;
}
}
this->m_rootWordNode->Reset("R");


std::list<std::string>::const_iterator cIte = this->m_lsAllSensitiveWords.begin();
while (cIte != this->m_lsAllSensitiveWords.end())
{
std::string strTmp = (*cIte);


if (strTmp.size() > 0)
{
InsertNode(this->m_rootWordNode, strTmp, 0);
}


++cIte;
}
}


// Inserts the characters of `strContent`, starting at byte offset `nIndex`,
// as a path below `pNode`, creating missing nodes on the way. The node of the
// last character is flagged as the end of a word (m_nEndTag = 1).
//
// Each node is keyed by the bytes of its own character. Nothing is inserted
// when `pNode` is NULL, `nIndex` is negative, or `nIndex` lies at or beyond
// the end of `strContent`.
void CWordsFilter::InsertNode(CWordNode* pNode, const std::string& strContent, int nIndex)
{
    if (NULL == pNode || nIndex < 0)
    {
        return;
    }

    typedef std::string::size_type size_type;

    // Longest character GetFirstBytes can report.
    const size_type kMaxCharBytes = 6;
    const size_type nTotal = strContent.size();

    CWordNode* pCurrent = pNode;
    size_type  nPos     = static_cast<size_type>(nIndex);
    while (nPos < nTotal)
    {
        const std::string strWindow = strContent.substr(nPos, kMaxCharBytes);

        // Keep the step within [1, bytes available] so a truncated trailing
        // character cannot move the position past the end of the word.
        const int nReported = GetFirstBytes(strWindow);
        size_type nLen = (nReported < 1) ? 1 : static_cast<size_type>(nReported);
        if (nLen > strWindow.size())
        {
            nLen = strWindow.size();
        }

        const std::string strKey = strWindow.substr(0, nLen);

        // operator[] creates a NULL slot when the key is new; filling the slot
        // afterwards means the map already owns the entry if `new` throws.
        CWordNode*& pSlot = pCurrent->m_mapWordNodes[strKey];
        if (NULL == pSlot)
        {
            pSlot = new CWordNode(strKey);
        }

        nPos += nLen;
        if (nPos == nTotal)
        {
            pSlot->m_nEndTag = 1;
        }
        pCurrent = pSlot;
    }
}


// Looks up the direct child of `pNode` keyed by `word`.
// Returns the child, or NULL when `pNode` is NULL or has no such child.
CWordNode* CWordsFilter::FindNode(CWordNode* pNode, const std::string& word)
{
    if (pNode != NULL)
    {
        const umap::iterator found = pNode->m_mapWordNodes.find(word);
        if (found != pNode->m_mapWordNodes.end())
        {
            return found->second;
        }
    }
    return NULL;
}


// Returns the shared filter, a function-local static created on first call.
CWordsFilter& CWordsFilter::GetInstance()
{
    static CWordsFilter s_sharedFilter;
    return s_sharedFilter;
}

C++ 敏感字符过滤

标签:c++ 敏感字符

原文地址:http://blog.csdn.net/ttan1215225/article/details/45673661

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!