// Tags:
/// <summary>
/// Sensitive-word ("bad word") filter.
/// Words are indexed into per-character bit tables so that most positions of a
/// text can be rejected without building a substring or probing the hash set.
/// </summary>
public class BadWordFilter
{
    #region Fields

    // The tables are indexed directly by char, so they need one slot for every
    // UTF-16 code unit, including char.MaxValue (U+FFFF) itself.
    private const int CharTableSize = char.MaxValue + 1;

    // All registered words of length >= 2 (single-character words live in charCheck).
    private readonly HashSet<string> hash = new HashSet<string>();

    // fastCheck[c]: bit i (0..6) is set when c occurs at position i of some word;
    // bit 7 is set when c occurs at position 7 or later.
    private readonly byte[] fastCheck = new byte[CharTableSize];

    // fastLength[c]: for words of length >= 2 starting with c, bit k (0..6) is set
    // when such a word has length k + 2; bit 7 is set for length 9 or more.
    private readonly byte[] fastLength = new byte[CharTableSize];

    // charCheck[c]: c alone is a registered one-character word.
    private readonly BitArray charCheck = new BitArray(CharTableSize);

    // endCheck[c]: c is the last character of some word of length >= 2.
    private readonly BitArray endCheck = new BitArray(CharTableSize);

    // Longest / shortest registered word length; minWordLength stays at
    // int.MaxValue until the first word is registered.
    private int maxWordLength = 0;
    private int minWordLength = int.MaxValue;

    // Only the first character is used: it is the mask written over matched words.
    private readonly string _replaceString = "*";

    #endregion

    #region Singleton

    // Built by the static field initializer, so CreateBadWordsFilter never
    // constructs a second instance.
    private static readonly BadWordFilter badWordFilter = new BadWordFilter();

    /// <summary>
    /// Private constructor; obtain the instance through <see cref="CreateBadWordsFilter"/>.
    /// </summary>
    private BadWordFilter() { }

    /// <summary>
    /// Returns the single shared filter instance.
    /// </summary>
    /// <returns>The shared <see cref="BadWordFilter"/>.</returns>
    public static BadWordFilter CreateBadWordsFilter()
    {
        return badWordFilter;
    }

    #endregion

    #region Initialisation

    /// <summary>
    /// Registers every word of <paramref name="badwords"/> with the filter.
    /// Null entities and null or empty words are skipped.
    /// </summary>
    /// <param name="badwords">Entities carrying the words to register.</param>
    /// <exception cref="ArgumentNullException"><paramref name="badwords"/> is null.</exception>
    public void Init(List<BadWordEntity> badwords)
    {
        if (badwords == null)
        {
            throw new ArgumentNullException("badwords");
        }

        foreach (BadWordEntity entity in badwords)
        {
            if (entity != null)
            {
                AddWord(entity.BadWord);
            }
        }
    }

    /// <summary>
    /// Registers every word of <paramref name="badwords"/> with the filter.
    /// Null or empty words are skipped.
    /// </summary>
    /// <param name="badwords">The words to register.</param>
    /// <exception cref="ArgumentNullException"><paramref name="badwords"/> is null.</exception>
    private void Init(string[] badwords)
    {
        if (badwords == null)
        {
            throw new ArgumentNullException("badwords");
        }

        foreach (string word in badwords)
        {
            AddWord(word);
        }
    }

    /// <summary>
    /// Indexes a single word into the lookup tables.
    /// </summary>
    /// <param name="word">The word to register; null or empty is ignored.</param>
    private void AddWord(string word)
    {
        // An empty word has no first character to index and would otherwise
        // drag minWordLength down to 0.
        if (string.IsNullOrEmpty(word))
        {
            return;
        }

        maxWordLength = Math.Max(maxWordLength, word.Length);
        minWordLength = Math.Min(minWordLength, word.Length);

        // Positions 0..6 each get their own bit.
        for (int i = 0; i < 7 && i < word.Length; i++)
        {
            fastCheck[word[i]] |= (byte)(1 << i);
        }

        // Positions 7 and beyond all share the top bit.
        for (int i = 7; i < word.Length; i++)
        {
            fastCheck[word[i]] |= 0x80;
        }

        if (word.Length == 1)
        {
            charCheck[word[0]] = true;
            return;
        }

        fastLength[word[0]] |= (byte)(1 << Math.Min(7, word.Length - 2));
        endCheck[word[word.Length - 1]] = true;
        hash.Add(word);
    }

    #endregion

    #region Matching

    /// <summary>
    /// Finds the shortest registered word that starts exactly at
    /// <paramref name="index"/> in <paramref name="text"/>.
    /// </summary>
    /// <param name="text">Text being scanned; must not be null.</param>
    /// <param name="index">Start position; must be inside <paramref name="text"/>.</param>
    /// <returns>The length of the match, or 0 when no word starts here.</returns>
    private int FindMatchLength(string text, int index)
    {
        char begin = text[index];

        // Bit 0 means "some word starts with this character".
        if ((fastCheck[begin] & 1) == 0)
        {
            return 0;
        }

        if (charCheck[begin])
        {
            return 1;
        }

        // j is the offset of the candidate's last character, so it can be at
        // most (longest word length - 1) and must stay inside the text.
        int maxOffset = Math.Min(maxWordLength, text.Length - index) - 1;
        for (int j = 1; j <= maxOffset; j++)
        {
            char current = text[index + j];

            // No word has this character at offset j: no longer candidate can match either.
            if ((fastCheck[current] & (1 << Math.Min(j, 7))) == 0)
            {
                return 0;
            }

            if (j + 1 < minWordLength)
            {
                continue;
            }

            // Cheap necessary conditions before paying for Substring + hashing:
            // a word of this length starts with 'begin' and some word ends with 'current'.
            bool lengthPossible = (fastLength[begin] & (1 << Math.Min(j - 1, 7))) != 0;
            if (lengthPossible && endCheck[current] && hash.Contains(text.Substring(index, j + 1)))
            {
                return j + 1;
            }
        }

        return 0;
    }

    /// <summary>
    /// Checks whether <paramref name="text"/> contains any registered word.
    /// </summary>
    /// <param name="text">Text to inspect; null or empty yields false.</param>
    /// <returns>True when at least one registered word occurs in the text.</returns>
    public bool HasBadWord(string text)
    {
        if (string.IsNullOrEmpty(text))
        {
            return false;
        }

        // Every position is tried as a possible word start; positions whose
        // character cannot start a word are rejected by one table lookup.
        for (int index = 0; index < text.Length; index++)
        {
            if (FindMatchLength(text, index) > 0)
            {
                return true;
            }
        }

        return false;
    }

    /// <summary>
    /// Masks registered words in <paramref name="text"/> with the replacement character.
    /// At each position the shortest matching word is chosen, and every occurrence
    /// of that word in the text is masked, so the result has the same length as the input.
    /// </summary>
    /// <param name="text">Text to clean; null or empty is returned unchanged.</param>
    /// <returns>The text with matched words overwritten by the mask character.</returns>
    public string ReplaceBadWord(string text)
    {
        if (string.IsNullOrEmpty(text))
        {
            return text;
        }

        char mask = _replaceString[0];
        for (int index = 0; index < text.Length; index++)
        {
            int length = FindMatchLength(text, index);
            if (length == 0)
            {
                continue;
            }

            if (length == 1)
            {
                // One-character word: mask every occurrence of that character.
                text = text.Replace(text[index], mask);
                continue;
            }

            string sub = text.Substring(index, length);
            text = text.Replace(sub, new string(mask, length));

            // Skip the rest of the masked word; the loop increment does the final step.
            index += length - 1;
        }

        return text;
    }

    #endregion
}

#region Bad word entity

/// <summary>
/// Carrier object for a single sensitive word.
/// </summary>
public class BadWordEntity
{
    /// <summary>
    /// The sensitive word.
    /// </summary>
    public string BadWord { get; set; }
}

#endregion
// Tags:
// Original article: http://www.cnblogs.com/liuyl/p/4276467.html