您的位置:首页 > 大数据

大数据量的过滤 (用于爬虫,蜘蛛) Bloom Filter 布隆过滤器

2012-11-28 10:18 483 查看
来自:http://www.cnblogs.com/lovebanyi/archive/2007/07/06/808736.html

原文:Bloom Filters in C#
http://www.devsource.com/article2/0,1895,2113495,00.asp
想像一下.如果你有一个非常大的无序的数据(url连接) 并且你要保证同样的一条连接不会在其它地方再次出现

你实时地收集这些数据,却没有办法预防两个相同的 url 出现在不断增加的数据当中.

当这些数据量少的时候,你可以轻松地创建一个 list(dictionary、hashtable 或者你自己的数据结构),然后遍历它们,看它是不是已经存在于这个 list 当中,

遍历所花的时间是非常多的,

但是 当这些数据的长度超过可用的内存的时候? hashtable可以帮我加快速度,但是存...

上面的废话比较多..看起来累、拗口,继续不下去了.



得出的方案是用 bit数组来节省空间

a hash table capable of holding m items will take m/8 bytes of memory

接下来是实际的解决方法

.NET 里面有一个类 BitArray,它可以帮我们很容易地创建出一个 hashtable

创建一个简单的 HashingTable


/// <summary>
/// A single-hash membership table backed by a fixed-size <see cref="BitArray"/>.
/// Each string maps to exactly one bit, so two different strings that land on
/// the same bit are indistinguishable (a false positive).
/// </summary>
public class SimpleHashTable
{
    private readonly BitArray hashbits;

    /// <summary>Creates a table with <paramref name="tableSize"/> bits, all cleared.</summary>
    /// <param name="tableSize">Number of bits in the table; must be positive.</param>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="tableSize"/> is zero or negative.
    /// </exception>
    public SimpleHashTable(int tableSize)
    {
        // A zero-length table would make every lookup divide by zero.
        if (tableSize <= 0)
        {
            throw new ArgumentOutOfRangeException("tableSize", "Table size must be positive.");
        }

        hashbits = new BitArray(tableSize, false);
    }

    /// <summary>Reports whether the bit for <paramref name="str"/> is set.</summary>
    /// <param name="str">The string to look up.</param>
    /// <returns><c>true</c> if the bit is set; <c>false</c> if it is clear.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="str"/> is null.</exception>
    public bool Test(string str)
    {
        return hashbits[IndexOf(str)];
    }

    /// <summary>Sets the bit for <paramref name="str"/>.</summary>
    /// <param name="str">The string to record.</param>
    /// <returns>
    /// <c>true</c> if the bit was already set before this call, <c>false</c> if
    /// this call set it.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="str"/> is null.</exception>
    public bool Add(string str)
    {
        int index = IndexOf(str);
        bool alreadySet = hashbits[index];
        if (!alreadySet)
        {
            hashbits[index] = true;
        }

        return alreadySet;
    }

    /// <summary>Maps a string to a bit index in the range [0, Count).</summary>
    private int IndexOf(string str)
    {
        if (str == null)
        {
            throw new ArgumentNullException("str");
        }

        // Take the remainder first: its magnitude is always below Count, so
        // Math.Abs cannot overflow. Math.Abs(hash) % Count throws
        // OverflowException when the hash code is int.MinValue. For every other
        // hash code both forms give the same index.
        int remainder = str.GetHashCode() % hashbits.Count;
        return Math.Abs(remainder);
    }
}

Add 里面没有这个值的时候是返回 false ,

Test 里面有值的话是返回 true;


/// <summary>
/// Demo driver: feeds every line of "urls.txt" into a one-million-bit
/// SimpleHashTable and reports how often Add says the bit was already set.
/// </summary>
class Program
{
    static void Main(string[] args)
    {
        int total = 0;
        int alreadySeen = 0;

        SimpleHashTable table = new SimpleHashTable(1000000);

        using (StreamReader reader = new StreamReader("urls.txt"))
        {
            for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
            {
                total++;

                if (table.Add(line))
                {
                    alreadySeen++;
                }

                // Progress line only once per 10,000 inputs.
                if ((total % 10000) != 0)
                {
                    continue;
                }

                double runningPercent = 100 * (double)alreadySeen / total;
                Console.WriteLine("{0} {1} {2}%", total, alreadySeen, runningPercent);
            }
        }

        // Final summary.
        double finalPercent = 100 * (double)alreadySeen / total;
        Console.WriteLine("{0} urls read", total);
        Console.WriteLine("{0} collisions", alreadySeen);
        Console.WriteLine("False positive rate = {0}%", finalPercent);
    }
}



The output from that program, run against the 100,000 unique URLs in my file, is:

10000  44  0.44%
20000  187  0.935%
30000  423  1.41%
40000  753  1.8825%
50000  1200  2.4%
60000  1753  2.92166666666667%
70000  2375  3.39285714285714%
80000  3123  3.90375%
90000  3946  4.38444444444444%
100000  4834  4.834%
100000 urls read
4834 collisions
False positive rate = 4.834%

然后写了一个简单的测试类,我们可以看到它的碰撞(冲突)还是比较明显的

接下来就是如何继续去解决这样的问题

创建一个新的 Hash算法函数 HashString

$h_i(x) = (f_1(x) + i \cdot f_2(x)) \bmod m$


然后提供了一个 防止碰撞的结构. hashkeys 保存这个hash的三个位置


/// <summary>
/// A Bloom filter over strings. Each string is mapped to <c>nKeys</c> bit
/// positions derived from two base hashes, h_i = (hash1 + i * hash2) mod size.
/// A string is reported as present only when all of its positions are set.
/// </summary>
/// <remarks>
/// The computed positions are kept in a per-instance scratch array that every
/// <see cref="Test"/> and <see cref="Add"/> call overwrites, so one instance
/// must not be used by concurrent callers without external synchronization.
/// </remarks>
public class BloomFilter
{
    private readonly BitArray hashbits;
    private readonly int numKeys;
    private readonly int[] hashKeys;

    /// <summary>Creates an empty filter.</summary>
    /// <param name="tableSize">Number of bits in the filter; must be positive.</param>
    /// <param name="nKeys">Number of bit positions per item; must be at least 1.</param>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="tableSize"/> or <paramref name="nKeys"/> is not positive.
    /// </exception>
    public BloomFilter(int tableSize, int nKeys)
    {
        // A zero-bit table makes the modulo in CreateHashes divide by zero.
        if (tableSize <= 0)
        {
            throw new ArgumentOutOfRangeException("tableSize", "Table size must be positive.");
        }

        // CreateHashes always writes hashKeys[0], so at least one key is required.
        if (nKeys <= 0)
        {
            throw new ArgumentOutOfRangeException("nKeys", "At least one hash key is required.");
        }

        numKeys = nKeys;
        hashKeys = new int[numKeys];
        hashbits = new BitArray(tableSize);
    }

    /// <summary>
    /// Second base hash: a shift/add/xor mix over the characters of
    /// <paramref name="s"/>.
    /// </summary>
    private int HashString(string s)
    {
        // The mixing depends on 32-bit wraparound; make that explicit so the
        // code behaves the same when the project enables overflow checking.
        unchecked
        {
            int hash = 0;

            for (int i = 0; i < s.Length; i++)
            {
                hash += s[i];
                hash += (hash << 10);
                hash ^= (hash >> 6);
            }

            hash += (hash << 3);
            hash ^= (hash >> 11);
            hash += (hash << 15);
            return hash;
        }
    }

    /// <summary>Fills <c>hashKeys</c> with the bit positions for <paramref name="str"/>.</summary>
    private void CreateHashes(string str)
    {
        if (str == null)
        {
            throw new ArgumentNullException("str");
        }

        int hash1 = str.GetHashCode();
        int hash2 = HashString(str);
        int size = hashbits.Count;

        // The remainder's magnitude is below size, so Math.Abs cannot overflow.
        hashKeys[0] = Math.Abs(hash1 % size);

        for (int i = 1; i < numKeys; i++)
        {
            // The combined hash is allowed to wrap around.
            int combined = unchecked(hash1 + (i * hash2));
            hashKeys[i] = Math.Abs(combined % size);
        }
    }

    /// <summary>Reports whether every bit position for <paramref name="str"/> is set.</summary>
    /// <param name="str">The string to look up.</param>
    /// <returns>
    /// <c>false</c> if any of the positions is clear; <c>true</c> if all are set.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="str"/> is null.</exception>
    public bool Test(string str)
    {
        CreateHashes(str);

        foreach (int hash in hashKeys)
        {
            if (!hashbits[hash])
            {
                return false;
            }
        }

        return true;
    }

    /// <summary>Sets every bit position for <paramref name="str"/>.</summary>
    /// <param name="str">The string to record.</param>
    /// <returns>
    /// <c>true</c> if all positions were already set before this call;
    /// <c>false</c> if at least one had to be set.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="str"/> is null.</exception>
    public bool Add(string str)
    {
        bool allAlreadySet = true;
        CreateHashes(str);

        foreach (int hash in hashKeys)
        {
            if (!hashbits[hash])
            {
                // A clear bit proves the item was not recorded before.
                allAlreadySet = false;
                hashbits[hash] = true;
            }
        }

        return allAlreadySet;
    }
}

测试:


/// <summary>
/// Comparison driver: feeds every line of "urls.txt" into both a
/// SimpleHashTable and a three-key BloomFilter and prints, side by side, how
/// often each one reports the line as already present.
/// </summary>
class Program
{
    static void Main(string[] args)
    {
        int total = 0;
        int hashHits = 0;
        int bloomHits = 0;

        SimpleHashTable table = new SimpleHashTable(1000000);
        BloomFilter filter = new BloomFilter(480833, 3);

        using (StreamReader reader = new StreamReader("urls.txt"))
        {
            for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
            {
                total++;

                if (table.Add(line))
                {
                    hashHits++;
                }

                if (filter.Add(line))
                {
                    bloomHits++;
                }

                // Progress line only once per 10,000 inputs.
                if ((total % 10000) != 0)
                {
                    continue;
                }

                double hashPercent = 100 * (double)hashHits / total;
                double bloomPercent = 100 * (double)bloomHits / total;
                Console.WriteLine("{0} {1} {2}% {3} {4}%", total,
                    hashHits, hashPercent,
                    bloomHits, bloomPercent);
            }
        }

        // Final summary for both structures.
        Console.WriteLine("{0} urls read", total);

        Console.WriteLine("{0} hash collisions", hashHits);
        Console.WriteLine("False positive rate (hash) = {0}%",
            100 * (double)hashHits / total);

        Console.WriteLine("{0} Bloom collisions", bloomHits);
        Console.WriteLine("False positive rate (Bloom) = {0}%",
            100 * (double)bloomHits / total);
    }
}



10000  44  0.44%  1  0.01%
20000  187  0.935%  10  0.05%
30000  423  1.41%  38  0.126666666666667%
40000  753  1.8825%  118  0.295%
50000  1200  2.4%  262  0.524%
60000  1753  2.92166666666667%  517  0.861666666666667%
70000  2375  3.39285714285714%  866  1.23714285714286%
80000  3123  3.90375%  1352  1.69%
90000  3946  4.38444444444444%  2118  2.35333333333333%
100000  4834  4.834%  2966  2.966%
100000 urls read
4834 hash collisions
False positive rate (hash) = 4.834%
2966 Bloom collisions
False positive rate (Bloom) = 2.966%


在添加数据的时候,它会把这个值对应的几个 hash 位置全部置为 1;只有当这几个位置在添加之前就已经全部被置位时,才算发生碰撞(误判).

实际总的容量是有限的.

还有泛型和它的例子


/// <summary>
/// Generic Bloom filter base class. Subclasses supply two base hashes for
/// <typeparamref name="TValue"/>; the filter derives <c>nKeys</c> bit positions
/// from them as h_i = (hash1 + i * hash2) mod size.
/// </summary>
/// <typeparam name="TValue">The type of item stored in the filter.</typeparam>
/// <remarks>
/// The computed positions are kept in a per-instance scratch array that every
/// <see cref="Test"/> and <see cref="Add"/> call overwrites, so one instance
/// must not be used by concurrent callers without external synchronization.
/// </remarks>
public abstract class BloomFilter<TValue>
{
    private readonly BitArray hashbits;
    private readonly int numKeys;

    /// <summary>Scratch buffer holding the bit positions of the most recently hashed value.</summary>
    protected int[] hashKeys;

    /// <summary>Creates an empty filter.</summary>
    /// <param name="tableSize">Number of bits in the filter; must be positive.</param>
    /// <param name="nKeys">Number of bit positions per item; must be at least 1.</param>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="tableSize"/> or <paramref name="nKeys"/> is not positive.
    /// </exception>
    public BloomFilter(int tableSize, int nKeys)
    {
        // A zero-bit table makes the modulo in CreateHashes divide by zero.
        if (tableSize <= 0)
        {
            throw new ArgumentOutOfRangeException("tableSize", "Table size must be positive.");
        }

        // CreateHashes always writes hashKeys[0], so at least one key is required.
        if (nKeys <= 0)
        {
            throw new ArgumentOutOfRangeException("nKeys", "At least one hash key is required.");
        }

        numKeys = nKeys;
        hashKeys = new int[numKeys];
        hashbits = new BitArray(tableSize);
    }

    /// <summary>Reports whether every bit position for <paramref name="val"/> is set.</summary>
    /// <param name="val">The value to look up.</param>
    /// <returns>
    /// <c>false</c> if any of the positions is clear; <c>true</c> if all are set.
    /// </returns>
    public bool Test(TValue val)
    {
        CreateHashes(val);

        foreach (int hash in hashKeys)
        {
            if (!hashbits[hash])
            {
                return false;
            }
        }

        return true;
    }

    /// <summary>Sets every bit position for <paramref name="val"/>.</summary>
    /// <param name="val">The value to record.</param>
    /// <returns>
    /// <c>true</c> if all positions were already set before this call;
    /// <c>false</c> if at least one had to be set.
    /// </returns>
    public bool Add(TValue val)
    {
        bool allAlreadySet = true;
        CreateHashes(val);

        foreach (int hash in hashKeys)
        {
            if (!hashbits[hash])
            {
                // A clear bit proves the item was not recorded before.
                allAlreadySet = false;
                hashbits[hash] = true;
            }
        }

        return allAlreadySet;
    }

    /// <summary>
    /// Fills <see cref="hashKeys"/> with the bit positions for
    /// <paramref name="val"/>, combining the two base hashes.
    /// </summary>
    protected virtual void CreateHashes(TValue val)
    {
        int hash1 = CreateHash1(val);
        int hash2 = CreateHash2(val);
        int size = hashbits.Count;

        // The remainder's magnitude is below size, so Math.Abs cannot overflow.
        hashKeys[0] = Math.Abs(hash1 % size);

        for (int i = 1; i < numKeys; i++)
        {
            // The combined hash is allowed to wrap around.
            int combined = unchecked(hash1 + (i * hash2));
            hashKeys[i] = Math.Abs(combined % size);
        }
    }

    /// <summary>First base hash of <paramref name="val"/>.</summary>
    protected abstract int CreateHash1(TValue val);

    /// <summary>Second base hash of <paramref name="val"/>.</summary>
    protected abstract int CreateHash2(TValue val);
}

/// <summary>
/// Bloom filter over strings: the first base hash is
/// <see cref="string.GetHashCode()"/>, the second is a shift/add/xor mix over
/// the characters.
/// </summary>
class StringBloomFilter : BloomFilter<string>
{
    /// <summary>Creates an empty string filter.</summary>
    /// <param name="tableSize">Number of bits in the filter; must be positive.</param>
    /// <param name="nKeys">Number of bit positions per item; must be at least 1.</param>
    public StringBloomFilter(int tableSize, int nKeys)
        : base(tableSize, nKeys)
    {
    }

    /// <summary>Returns the string's own hash code.</summary>
    /// <exception cref="ArgumentNullException"><paramref name="val"/> is null.</exception>
    protected override int CreateHash1(string val)
    {
        if (val == null)
        {
            throw new ArgumentNullException("val");
        }

        return val.GetHashCode();
    }

    /// <summary>Returns a character-by-character shift/add/xor hash of the string.</summary>
    /// <exception cref="ArgumentNullException"><paramref name="val"/> is null.</exception>
    protected override int CreateHash2(string val)
    {
        if (val == null)
        {
            throw new ArgumentNullException("val");
        }

        // The mixing depends on 32-bit wraparound; make that explicit so the
        // code behaves the same when the project enables overflow checking.
        unchecked
        {
            int hash = 0;

            for (int i = 0; i < val.Length; i++)
            {
                hash += val[i];
                hash += (hash << 10);
                hash ^= (hash >> 6);
            }

            hash += (hash << 3);
            hash ^= (hash >> 11);
            hash += (hash << 15);
            return hash;
        }
    }
}





让我们的蜘蛛跑得更快吧

Google 数学之美那边也介绍到了.

http://googlechinablog.com/2007/07/bloom-filter.html
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: