【数据结构】布隆过滤器

布隆过滤器

原理

　　如果要判断一个数是不是在一个集合里，一半想到的是将所有的元素保存起来，然后通过比较确定。但是随着集合中元素的增加，需要的存储空间越来越大，检索速度自然会变慢。这时会有人想到使用哈希表，将元素通过哈希函数映射到一个位阵列中，将相应的比特位置为1，这样就可以判断这个元素是不是在集合之中了。
　　但是哈希有一个很严重的问题，那就是哈希冲突。针对这个问题，我们的解决方法是使用多个哈希函数，用多个位表示一个值，如果它们之中有一个说元素不在集合中，那么这个元素势必就不在；但是反过来，它们都说在，却是不一定在集合之中，因为有可能它们在说谎。

优缺点

　　布隆过滤器就是用于检索一个元素是否字一个集合之中，它的优点是空间效率和查询时间都由于其他一般的算法，缺点是有一定的几率识别错误，并且删除困难。
　　
　　之所以会出现删除困难，是因为由于哈希冲突，可能一个位被多次置1，如果我们直接删除，那么就会出现错误。如果一定要实现删除功能的话，可以想到将位数组换成一般的数组。将其初始化为0，然后每增加一个元素，相应的位置加1，删除的时候相应的位置减1就可以了。

算法实现

　　1. 我们实现的布隆过滤器，底层搭载的是位图，关于位图的概念及实现，参考这里。
　　2. 关于哈希函数们可以参考这篇文章 http://www.cnblogs.com/-clq/archive/2012/05/31/2528153.html，这里介绍了很多哈希函数。

//comm.hpp
//这里的哈希函数都来自上面的文章中
#pragma once
#include <string>
using namespace std;
class KeyToInt1  
{
public:
    size_t operator()(const string& s)
    {
        return BKDRHash(s.c_str());
    }
private:
    size_t BKDRHash(const char *str)
    {
        register size_t hash = 0;
        while (size_t ch = (size_t)*str++)
        {
            hash = hash * 131 + ch;
        }
        return hash;
    }
};

class KeyToInt2
{
public:
    size_t operator()(const string& s)
    {
        return SDBMHash(s.c_str());
    }
private:
    size_t SDBMHash(const char *str)
    {
        register size_t hash = 0;
        while (size_t ch = (size_t)*str++)
        {
            hash = 65599 * hash + ch;
            //hash = (size_t)ch + (hash << 6) + (hash << 16) - hash; 
        }
        return hash;
    }
};


class KeyToInt3
{
public:
    size_t operator()(const string& s)
    {
        return RSHash(s.c_str());
    }
private:
    size_t RSHash(const char *str)
    {
        register size_t hash = 0;
        size_t magic = 63689;
        while (size_t ch = (size_t)*str++)
        {
            hash = hash * magic + ch;
            magic *= 378551;
        }
        return hash;
    }
};


class KeyToInt4
{
public:
    size_t operator()(const string & s)
    {
        return APHash(s.c_str());
    }
private:
    size_t APHash(const char *str)
    {
        register size_t hash = 0;
        size_t ch;
        for (long i = 0; ch = (size_t)*str++; i++)
        {
            if ((i & 1) == 0)
            {
                hash ^= ((hash << 7) ^ ch ^ (hash >> 3));
            }
            else
            {
                hash ^= (~((hash << 11) ^ ch ^ (hash >> 5)));
            }
        }
        return hash;
    }
};


class KeyToInt5
{
public:
    size_t operator()(const string& s)
    {
        return JSHash(s.c_str());
    }
private:
    size_t JSHash(const char *str)
    {
        if (!*str)        // 这是由本人添加，以保证空字符串返回哈希值0 
            return 0;
        register size_t hash = 1315423911;
        while (size_t ch = (size_t)*str++)
        {
            hash ^= ((hash << 5) + ch + (hash >> 2));
        }
        return hash;
    }
};

//具体实现
#include "comm.hpp"
#include "bitMap.hpp"
#include <iostream>

template <class K,class KToInt1 = KeyToInt1,class KToInt2 = KeyToInt2,class KToInt3 = KeyToInt3,class KToInt4 = KeyToInt4,class KToInt5 = KeyToInt5>
class BloomFilter
{
public:
    BloomFilter(size_t size)//size表示存储的元素个数，5个比特位表示一个元素
        : _bmp(size * 10),_size(0)
    {}

    bool Insert(const K& key)//插入元素
    {
        size_t bitCount = _bmp.Size();//位图能存储bitCount个比特位
        size_t index1 = KToInt1()(key) % bitCount;//字符串转换成整型，转换后的数字可能大于位图的位数，所以要取模
        size_t index2 = KToInt2()(key) % bitCount;
        size_t index3 = KToInt3()(key) % bitCount;
        size_t index4 = KToInt4()(key) % bitCount;
        size_t index5 = KToInt5()(key) % bitCount;

        _bmp.Set(index1);//整型插入到位图中
        _bmp.Set(index2);
        _bmp.Set(index3);
        _bmp.Set(index4);
        _bmp.Set(index5);
        _size++;

        cout << index1 << " " << index2 << " " << index3 << " " << index4 << " " << index5;//打印索引，做测试 
        cout << endl;
        return true;
    }

    bool IsInBloomFilter(const K& key)//判断一个元素是否在集合中
    {
        size_t bitCount = _bmp.Size();
        size_t index1 = KToInt1()(key) % bitCount;
        size_t index2 = KToInt2()(key) % bitCount;
        size_t index3 = KToInt3()(key) % bitCount;
        size_t index4 = KToInt4()(key) % bitCount;
        size_t index5 = KToInt5()(key) % bitCount;
        if (!_bmp.Test(index1))
            return false;//index1表明key不在位图中，说明key一定不在其中
        if (!_bmp.Test(index2))
            return false;
        if (!_bmp.Test(index3))
            return false;
        if (!_bmp.Test(index4))
            return false;
        if (!_bmp.Test(index5))
            return false;
        return true;//五个索引表明都在，key才有可能在其中
    }

private:
    BitMap _bmp;//位图存储元素
    size_t _size;//表示存了几个元素
};


void test()
{
    BloomFilter<string> b(5);
    b.Insert("小明");
    b.Insert("小黄");
    b.Insert("小花");
    b.Insert("小兰");
    b.Insert("小熊");
    b.Insert("小泽");
    if (b.IsInBloomFilter("小熊"))
        cout << "小熊在名单之中" << endl;
    if (!b.IsInBloomFilter("小王"))
        cout << "小王不在名单之中" << endl;
}

测试结果：

【数据结构】布隆过滤器

布隆过滤器

原理

优缺点

算法实现

猜你在找的数据结构相关文章