实现一个简单的布隆过滤器

时间：2016-05-30 16:01:50 阅读：200 评论：0 收藏：0 [点我收藏+]

标签：空间过滤器字符串

哈希表这种数据结构能够非常快速的查找元素

但它的缺点是空间效率不高

位图提高了空间效率

但位图只能用来判断元素是否存在

关于位图的实现，在这里

http://zhweizhi.blog.51cto.com/10800691/1784383

下面简单介绍一下布隆过滤器。

现在，

假如通过哈希算法，将一个字符串转化成int类型的数据，

通过简单的线性探测（为什么不用其他方式，比如二次探测或者开链法呢？应该还是考虑了效率问题）存放在位图中，就能实现对字符串的查找了。

不过考虑到哈希冲突，用一个哈希算法肯定是不够的。

可以考虑用多个哈希算法，将转化出的int类型数据映射到多个地方。

这样，在查找时，再通过之前用过的若干个哈希算法查找相应位置是否为‘1’

如果均为‘1’，那么很大概率上，这个字符串是存在的，

（因此布隆过滤器的判断存在并不是百分之百靠谱的，存在误判的概率，特别是随着元素数量越来越接近容量的时候，误判概率也会越来越高）

如果有一个为‘0’，那么就能肯定这个字符串是不存在的。

实现的代码如下：
我采用了5个哈希算法，可以在网上找，找一些存活率高的。

然后将它们用仿函数实现：

// Hash functor #1: adapts the SDBM string hash to std::string keys.
struct __HashFunc1
{
	// Hashes the NUL-terminated string `str`.
	// Each character is folded in as hash = c + hash * 65599; the multiply is
	// written as (hash << 6) + (hash << 16) - hash. Only the low 31 bits of
	// the final value are returned.
	// Characters are read as plain `char`, so bytes >= 0x80 contribute a value
	// that depends on the platform's char signedness.
	size_t SDBMHash(const char *str) const
	{
		size_t hash = 0;
		while (*str)
		{
			hash = (*str++) + (hash << 6) + (hash << 16) - hash;
		}
		return (hash & 0x7FFFFFFF);
	}

	// Returns the SDBM hash of `key`. Hashing stops at the first '\0' in the
	// key because the helper walks the c_str() buffer.
	// const-qualified so the functor can be invoked through const references.
	size_t operator()(const std::string &key) const
	{
		return SDBMHash(key.c_str());
	}
};
// Hash functor #2: adapts the RS string hash to std::string keys.
struct __HashFunc2
{
	// Hashes the NUL-terminated string `str`.
	// The running hash is multiplied by `a` and the character added; `a` is
	// itself multiplied by `b` after every character, so each position uses a
	// different multiplier. Arithmetic wraps in unsigned int, and only the low
	// 31 bits of the final value are returned.
	// Characters are read as plain `char`, so bytes >= 0x80 contribute a value
	// that depends on the platform's char signedness.
	unsigned int RSHash(const char *str) const
	{
		const unsigned int b = 378551;
		unsigned int a = 63689;
		unsigned int hash = 0;

		while (*str)
		{
			hash = hash * a + (*str++);
			a *= b;
		}

		return (hash & 0x7FFFFFFF);
	}

	// Returns the RS hash of `key`. Hashing stops at the first '\0' in the key
	// because the helper walks the c_str() buffer.
	// const-qualified so the functor can be invoked through const references.
	size_t operator()(const std::string &key) const
	{
		return RSHash(key.c_str());
	}
};
// Hash functor #3: adapts the BKDR string hash to std::string keys.
// Deliberately a different algorithm from the RS hash used by __HashFunc2:
// two functors that compute the same value always select the same bit, which
// would leave the filter with one fewer effective hash function.
struct __HashFunc3
{
	// Hashes the NUL-terminated string `str`.
	// Each character is folded in as hash = hash * 131 + c. Arithmetic wraps
	// in unsigned int, and only the low 31 bits of the final value are
	// returned.
	// Characters are read as plain `char`, so bytes >= 0x80 contribute a value
	// that depends on the platform's char signedness.
	unsigned int BKDRHash(const char *str) const
	{
		const unsigned int seed = 131;
		unsigned int hash = 0;
		while (*str)
		{
			hash = hash * seed + (*str++);
		}
		return (hash & 0x7FFFFFFF);
	}

	// Returns the BKDR hash of `key`. Hashing stops at the first '\0' in the
	// key because the helper walks the c_str() buffer.
	// const-qualified so the functor can be invoked through const references.
	size_t operator()(const std::string &key) const
	{
		return BKDRHash(key.c_str());
	}
};
// Hash functor #4: adapts the JS string hash to std::string keys.
struct __HashFunc4
{
	// Hashes the NUL-terminated string `str`.
	// Starting from the seed 1315423911, each character is mixed in by
	// XOR-ing the hash with (hash << 5) + c + (hash >> 2). Arithmetic wraps in
	// unsigned int, and only the low 31 bits of the final value are returned.
	// Characters are read as plain `char`, so bytes >= 0x80 contribute a value
	// that depends on the platform's char signedness.
	unsigned int JSHash(const char *str) const
	{
		unsigned int hash = 1315423911;

		while (*str)
		{
			hash ^= ((hash << 5) + (*str++) + (hash >> 2));
		}

		return (hash & 0x7FFFFFFF);
	}

	// Returns the JS hash of `key`. Hashing stops at the first '\0' in the key
	// because the helper walks the c_str() buffer.
	// const-qualified so the functor can be invoked through const references.
	size_t operator()(const std::string &key) const
	{
		return JSHash(key.c_str());
	}
};
// Hash functor #5: adapts the PJW string hash to std::string keys.
struct __HashFunc5
{
	// Hashes the NUL-terminated string `str`.
	// The hash is shifted left by one eighth of the width of unsigned int and
	// the character added. Whenever any of the top one-eighth bits become set,
	// they are XOR-ed back into the hash after shifting right by three
	// quarters of the width, and then cleared. Only the low 31 bits of the
	// final value are returned.
	// Characters are read as plain `char`, so bytes >= 0x80 contribute a value
	// that depends on the platform's char signedness.
	unsigned int PJWHash(const char *str) const
	{
		// All shift amounts are derived from the width of unsigned int.
		const unsigned int bitsInUnsignedInt = static_cast<unsigned int>(sizeof(unsigned int) * 8);
		const unsigned int threeQuarters = static_cast<unsigned int>((bitsInUnsignedInt * 3) / 4);
		const unsigned int oneEighth = static_cast<unsigned int>(bitsInUnsignedInt / 8);
		// Mask selecting the top `oneEighth` bits.
		const unsigned int highBits = static_cast<unsigned int>(0xFFFFFFFF) << (bitsInUnsignedInt - oneEighth);
		unsigned int hash = 0;
		unsigned int test = 0;

		while (*str)
		{
			hash = (hash << oneEighth) + (*str++);
			if ((test = hash & highBits) != 0)
			{
				// Fold the overflowing top bits back in, then clear them.
				hash = ((hash ^ (test >> threeQuarters)) & (~highBits));
			}
		}

		return (hash & 0x7FFFFFFF);
	}

	// Returns the PJW hash of `key`. Hashing stops at the first '\0' in the
	// key because the helper walks the c_str() buffer.
	// const-qualified so the functor can be invoked through const references.
	size_t operator()(const std::string &key) const
	{
		return PJWHash(key.c_str());
	}
};

布隆过滤器：

//布隆过滤器
template<
class HashFunc1 = __HashFunc1,
class HashFunc2 = __HashFunc2,
class HashFunc3 = __HashFunc3,
class HashFunc4 = __HashFunc4,
class HashFunc5 = __HashFunc5
>
class BloomFillter
{
public:
	BloomFillter(size_t n)
		:_capacity(_GetNextPrime(n))
		,_bm(_capacity)
	{}

	void Set(const string &key)
	{
		size_t hash1 = HashFunc1()(key);
		size_t hash2 = HashFunc2()(key);
		size_t hash3 = HashFunc3()(key);
		size_t hash4 = HashFunc4()(key);
		size_t hash5 = HashFunc5()(key);
		_bm.Set(hash1 % _capacity);
		_bm.Set(hash2 % _capacity);
		_bm.Set(hash3 % _capacity);
		_bm.Set(hash4 % _capacity);
		_bm.Set(hash5 % _capacity);

	}

	bool Test(const string &key)
	{
		size_t hash1 = HashFunc1()(key);
		if (!_bm.Test(hash1 % _capacity))
		{
			return false;
		}
		size_t hash2 = HashFunc2()(key);
		if (!_bm.Test(hash2 % _capacity))
		{
			return false;
		}
		size_t hash3 = HashFunc3()(key);
		if (!_bm.Test(hash3 % _capacity))
		{
			return false;
		}
		size_t hash4 = HashFunc4()(key);
		if (!_bm.Test(hash4 % _capacity))
		{
			return false;
		}
		size_t hash5 = HashFunc5()(key);
		if (!_bm.Test(hash5 % _capacity))
		{
			return false;
		}
		return true;
	}


protected:
	size_t _GetNextPrime(size_t n)
	{
		const int _PrimeSize = 28;
		static const unsigned long _PrimeList[_PrimeSize] =
		{
			53ul,         97ul,         193ul,       389ul,       769ul,
			1543ul,       3079ul,       6151ul,      12289ul,     24593ul,
			49157ul,      98317ul,      196613ul,    393241ul,    786433ul,
			1572869ul,    3145739ul,    6291469ul,   12582917ul,  25165843ul,
			50331653ul,   100663319ul,  201326611ul, 402653189ul, 805306457ul,
			1610612741ul, 3221225473ul, 4294967291ul
		};
		for (int i = 0; i < _PrimeSize; ++i)
		{
			if (_PrimeList[i] > n)
			{
				return _PrimeList[i];
			}
		}
		return n;
	}
protected:
	size_t _capacity;
	BitMap _bm;
};

实现一个简单的布隆过滤器

标签：空间过滤器字符串

原文地址：http://zhweizhi.blog.51cto.com/10800691/1784384

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行