布隆过滤器 (Bloom Filter)的实现

布隆过滤器,先看看维基百科的解释:是1970年由布隆提出的。它实际上是一个很长的二进制向量和一系列随机映射函数。布隆过滤器可以用于检索一个元素是否在一个集合中。它的优点是空间效率和查询时间都远远超过一般的算法,缺点是有一定的误识别率和删除困难。

几个重要参数:
1.n为可能存入的元素数,
2.m为布隆过滤器或哈希表的slot数
3.k为布隆过滤器重hash function数。

k = 0.7(m/n)时候误判率最低,参考的是http://hi.baidu.com/waxiga/item/cddfe4fb7e324c19e2e3bdfb 这个孩子的推算!
不过看到Cassandra中的实现是: k = (Integer.MAX_VALUE – 20) / n)和15的最小值,也就是Cassandra最大的hash个数是15个,所以自己也采用了15作为最大值.

选择的要点
1.当k很大时,设计k个独立的hash function是不现实并且困难的。对于一个输出范围很大的hash function(例如MD5产生的128 bits数),如果不同bit位的相关性很小,则可把此输出分割为k份。或者可将k个不同的初始值(例如0,1,2, … ,k-1)结合元素,feed给一个hash function从而产生k个不同的数。
2.n/m过大时候会存在一些误报情况
3.算法复杂度是O(K)

代码实现:

</pre>
package com.dianping.user.admin.DCurrency.domain;

import java.util.BitSet;

import com.dianping.avatar.log.AvatarLogger;
import com.dianping.avatar.log.AvatarLoggerFactory;

/**
 * 布隆过滤器
 *
 * @author zhaoming
 */
public class BloomFIlter {

private static final AvatarLogger LOGGER = AvatarLoggerFactory.getLogger(BloomFIlter.class);

// 元素个数 n 当前有问题的数量是1600,积累了10年的时间,所以设置为3000,基本能够满足近十年的需求
 private int numElements = 3000;

// 最大的bit位数 ,布隆过滤器的匹配槽数
 private int maxBitSize = 30000;

// hash个数因子,一个比较理想的计算需要hash的槽数
 private static float bucketsFactor = 0.7f;

// 对象hash生成的桶数目 k = 0.7 * (m/n) , Cassandra中的实现是: k = (Integer.MAX_VALUE - 20) /
 // n)和15的最小值,也就是Cassandra最大的hash个数是15个,所以自己也采用了15作为最大值.见MAX_TARGET_BUCKETS_PERELEM
 private int targetBucketsPerElem = Float.valueOf(bucketsFactor
 * (maxBitSize / numElements)).intValue();

// 最大的hash槽数 即k<=15
 private static final int MAX_TARGET_BUCKETS_PERELEM = 15;

// 所有数据的映射地址
 // private CopyOnWriteBitSet bitSet = new CopyOnWriteBitSet(maxBitSize);
 private BitSet bitSet = new BitSet(maxBitSize);

// 哈希的工具类
 private static Hasher hasher = new Hasher();

public BloomFIlter() {

super();

logInfo();
 }

private void logInfo() {
 System.out.println("n : " + numElements);
 System.out.println("m : " + maxBitSize);
 System.out.println("k : " + targetBucketsPerElem);
 }

public BloomFIlter(int numElements, int maxBitSize, int targetBucketsPerElem) {

this.numElements = numElements;
 this.maxBitSize = maxBitSize;
 this.targetBucketsPerElem = targetBucketsPerElem;

logInfo();
 }

public void add(Object object) {

System.out.println("--------------------------------------------");

// 第一次hash
 // int hash = hasher.hash(object.hashCode());
 // System.out.println("first has value is : " + hash);

int buckersPerElem = Math.min(targetBucketsPerElem, MAX_TARGET_BUCKETS_PERELEM);
 for (int i = 1; i <= buckersPerElem; i++) {

// int bucketHash = hasher.hash(hash, i); // 第二次hash,尽量保证散列
 byte[] data = object.toString().getBytes();
 int bucketHash = hasher.murmurHash32(data, data.length, i);
 int indexHash = hasher.indexMod(bucketHash, maxBitSize);// 落位到具体数组的位置上

System.out.println("second slot value is :" + bucketHash + " 。 fall into slot index is :" + indexHash);
 bitSet.set(indexHash, true);
 }
 System.out.println("--------------------------------------------");
 }

/**
 * 验证是否匹配数据
 *
 * @param value 需要校验的值
 * @return
 */
 public boolean isMatch(Object value) {

boolean isMatch = false;
 // int h = hasher.hash(value.hashCode());

int buckersPerElem = Math.min(targetBucketsPerElem, MAX_TARGET_BUCKETS_PERELEM);
 for (int i = 1; i <= buckersPerElem; i++) {

// int bucketHash = hasher.hash(h, i);

byte[] data = value.toString().getBytes();
 int bucketHash = hasher.murmurHash32(data, data.length, i);
 int indexHash = hasher.indexMod(bucketHash, maxBitSize);
 boolean bitMatch = bitSet.get(indexHash);

// 任意一位没有匹配上,就表明没有匹配上。
 if (!bitMatch) {
 return isMatch;
 }
 }
 return true;
 }

public int getNumElements() {
 return numElements;
 }

public int getTargetBucketsPerElem() {
 return targetBucketsPerElem;
 }

public static void main(String[] args) {

String invalidValue1 = "江苏省连云港市东海县牛山镇文工巷8号1-2-11";
 String invalidValue2 = "江苏省连云港市东海县牛山镇利民东路飞扬烫染造型";
 String invalidValue3 = "15122777345";
 String validValues4 = "25122777345";// 第一位同第三条记录不同
 String validValues5 = "江苏省连云港市东海县牛山镇利民东路飞扬烫染造"; // 比第二条少了最后一个字

BloomFIlter domain = new BloomFIlter();

domain.add(invalidValue1);
 domain.add(invalidValue2);
 domain.add(invalidValue3);

boolean match = domain.isMatch(invalidValue2);
 System.out.println("是否匹配中了:" + match);

boolean match2 = domain.isMatch(validValues4);
 System.out.println("是否匹配中了:" + match2);

boolean match3 = domain.isMatch(validValues5);
 System.out.println("是否匹配中了:" + match3);
 }

/**
 * @author zhaoming
 */
 static class Hasher {

/**
 * 这个是hashmap中的hash实现
 *
 * @param h
 * @return
 */
 public int hash(int h) {

h += (h << 15) ^ 0xffffcd7d;
 h ^= (h >>> 10);
 h += (h << 3);
 h ^= (h >>> 6);
 h += (h << 2) + (h << 14);
 return h ^ (h >>> 16);
 }

/**
 * @param h 第一次hash的值
 * @param seed 多次hash的种子
 * @return
 */
 public int hash(int h, int seed) {

long id = h;
 int hash = (((int) (id ^ (id >>> 32))) ^ 0x811c9dc5) * 0x01000193;

int nbits = (((0xfffffc00 >> seed) & 4) | // Compute ceil(log2(m+1))
 ((0x000001f8 >>> seed) & 2) | // The constants hold
 ((0xffff00f2 >>> seed) & 1)); // a lookup table

hash ^= hash >>> seed;
 hash ^= nbits;
 return hash;
 }

/**
 * MurmurHash 64位实现
 *
 * @param data hash的对象字节数组
 * @param length 字节数组长度
 * @param seed 随即的种子
 * @return
 */
 public long murmurHash64(byte[] data, int length, int seed) {

final long m = 0xc6a4a7935bd1e995L;
 final int r = 47;

long h = (seed & 0xffffffffl) ^ (length * m);

int length8 = length / 8;

for (int i = 0; i < length8; i++) {
 final int i8 = i * 8;
 long k = ((long) data[i8 + 0] & 0xff) + (((long) data[i8 + 1] & 0xff) << 8)
 + (((long) data[i8 + 2] & 0xff) << 16) + (((long) data[i8 + 3] & 0xff) << 24)
 + (((long) data[i8 + 4] & 0xff) << 32) + (((long) data[i8 + 5] & 0xff) << 40)
 + (((long) data[i8 + 6] & 0xff) << 48) + (((long) data[i8 + 7] & 0xff) << 56);

k *= m;
 k ^= k >>> r;
 k *= m;

h ^= k;
 h *= m;
 }

switch (length % 8) {
 case 7:
 h ^= (long) (data[(length & ~7) + 6] & 0xff) << 48;
 case 6:
 h ^= (long) (data[(length & ~7) + 5] & 0xff) << 40;
 case 5:
 h ^= (long) (data[(length & ~7) + 4] & 0xff) << 32;
 case 4:
 h ^= (long) (data[(length & ~7) + 3] & 0xff) << 24;
 case 3:
 h ^= (long) (data[(length & ~7) + 2] & 0xff) << 16;
 case 2:
 h ^= (long) (data[(length & ~7) + 1] & 0xff) << 8;
 case 1:
 h ^= (long) (data[length & ~7] & 0xff);
 h *= m;
 }
 ;

h ^= h >>> r;
 h *= m;
 h ^= h >>> r;
 return h;

}

/**
 * MurmurHash 32位实现
 *
 * @param data
 * @param length
 * @param seed
 * @return
 */
 private int murmurHash32(byte[] data, int length, int seed) {

final int m = 0x5bd1e995;
 final int r = 24;
 // Initialize the hash to a random value
 int h = seed ^ length;
 int length4 = length / 4;

for (int i = 0; i < length4; i++) {
 final int i4 = i * 4;
 int k = (data[i4 + 0] & 0xff) + ((data[i4 + 1] & 0xff) << 8) + ((data[i4 + 2] & 0xff) << 16)
 + ((data[i4 + 3] & 0xff) << 24);
 k *= m;
 k ^= k >>> r;
 k *= m;
 h *= m;
 h ^= k;
 }

// Handle the last few bytes of the input array
 switch (length % 4) {
 case 3:
 h ^= (data[(length & ~3) + 2] & 0xff) << 16;
 case 2:
 h ^= (data[(length & ~3) + 1] & 0xff) << 8;
 case 1:
 h ^= (data[length & ~3] & 0xff);
 h *= m;
 }

h ^= h >>> 13;
 h *= m;
 h ^= h >>> 15;

return h;
 }

int indexMod(int h, int length) {
 return h & (length - 1);
 }

}

}
<pre>

结果:

n : 3000
m : 30000
k : 7
——————————————–
second slot value is :-126888208 。 fall into slot index is :21536
second slot value is :1684162830 。 fall into slot index is :16654
second slot value is :-384078105 。 fall into slot index is :25639
second slot value is :-1583787929 。 fall into slot index is :20519
second slot value is :973903817 。 fall into slot index is :5385
second slot value is :191465886 。 fall into slot index is :270
second slot value is :-434737902 。 fall into slot index is :25858
——————————————–
——————————————–
second slot value is :1321661739 。 fall into slot index is :29995
second slot value is :-772734986 。 fall into slot index is :29990
second slot value is :-1883191967 。 fall into slot index is :17697
second slot value is :-2068638865 。 fall into slot index is :4399
second slot value is :-653632621 。 fall into slot index is :20739
second slot value is :-277885801 。 fall into slot index is :17415
second slot value is :-1306019914 。 fall into slot index is :13606
——————————————–
——————————————–
second slot value is :1927800852 。 fall into slot index is :24580
second slot value is :859061377 。 fall into slot index is :13313
second slot value is :-928664486 。 fall into slot index is :13322
second slot value is :1540514158 。 fall into slot index is :24878
second slot value is :2036950474 。 fall into slot index is :25866
second slot value is :-846007063 。 fall into slot index is :29737
second slot value is :-247103133 。 fall into slot index is :291
——————————————–
是否匹配中了:true
是否匹配中了:false
是否匹配中了:false

从结果hash还是较为散列的,基本能够满足要求了!

为了提高hash的散列,将原来的两次hash改用了单次murmurHash。在google收编了Austin Appleby后,又发布了CityHash。具体实现如下,留着以后研究下:

https://gist.github.com/tamtam180/2316537

https://github.com/tamtam180/CityHash-For-Java/blob/master/src/main/java/at/orz/hash/CityHash.java

作者: inter12

在这苦短的人生中,追求点自己的简单快乐

发表评论

电子邮件地址不会被公开。 必填项已用*标注