您的位置：首页 > 大数据

大数据量的过滤 (用于爬虫,蜘蛛) Bloom Filter 布隆过滤器

2008-03-27 21:01 447 查看

导读：
　　原文:Bloom Filters in C#
　　http://www.devsource.com/article2/0,1895,2113495,00.asp
　　想象一下：你有一个非常大的无序数据集（URL 链接），并且你要保证同一条链接不会在其它地方再次出现。
　　这些数据是实时收集的，在不断增加的数据当中，你没有办法事先防止两个相同的 URL 出现。
　　当数据量比较少的时候，你可以轻松地创建一个 list（dictionary、hashtable 或者你自己的数据结构），然后遍历它，看这条链接是不是已经存在于这个 list 当中，
　　不过遍历所花的时间是非常多的。
　　但是当这些数据的大小超过可用内存的时候呢？hashtable 可以帮我们加快查找速度，但是存储空间的问题依然存在……
　　上面的废话比较多……看起来太累，就不继续逐句翻译了。
　　得出的方案是用 bit数组来节省空间
　　a hash table capable of holding m items will take m/8 bytes of memory
　　接下来是实际的解决方法
　　.NET 里面有一个类 BitArray，它可以帮我们很容易地创建出这样一个基于 bit 数组的 hash table
　　创建一个简单的 HashingTable
　　

　　
　　/// <summary>
　　/// A fixed-size bit table that records membership of strings using a single
　　/// hash function. Each string maps to exactly one bit, so two different
　　/// strings that hash to the same bit are indistinguishable (false positive).
　　/// </summary>
　　public class SimpleHashTable
　　{
　　    // One bit per slot; a set bit means "some string hashing here was added".
　　    private readonly BitArray hashbits;

　　    /// <summary>Creates a table with <paramref name="tableSize"/> bits, all cleared.</summary>
　　    /// <param name="tableSize">Number of bits in the table; must be positive.</param>
　　    /// <exception cref="ArgumentOutOfRangeException">
　　    /// <paramref name="tableSize"/> is zero or negative.
　　    /// </exception>
　　    public SimpleHashTable(int tableSize)
　　    {
　　        // A zero-sized table would make every lookup divide by zero.
　　        if (tableSize <= 0)
　　        {
　　            throw new ArgumentOutOfRangeException("tableSize", "Table size must be positive.");
　　        }

　　        hashbits = new BitArray(tableSize, false);
　　    }

　　    /// <summary>Checks whether the bit for <paramref name="str"/> is set.</summary>
　　    /// <param name="str">The string to look up.</param>
　　    /// <returns>
　　    /// true if the bit is set (the string, or another string with the same
　　    /// slot, was added before); false if the bit is clear.
　　    /// </returns>
　　    /// <exception cref="ArgumentNullException"><paramref name="str"/> is null.</exception>
　　    public bool Test(string str)
　　    {
　　        return hashbits[SlotFor(str)];
　　    }

　　    /// <summary>Sets the bit for <paramref name="str"/>.</summary>
　　    /// <param name="str">The string to add.</param>
　　    /// <returns>
　　    /// The state of the bit before the call: false if the slot was empty
　　    /// (a new entry), true if it was already set (duplicate or collision).
　　    /// </returns>
　　    /// <exception cref="ArgumentNullException"><paramref name="str"/> is null.</exception>
　　    public bool Add(string str)
　　    {
　　        int slot = SlotFor(str);
　　        bool alreadySet = hashbits[slot];
　　        if (!alreadySet)
　　        {
　　            hashbits[slot] = true;
　　        }

　　        return alreadySet;
　　    }

　　    // Maps a string to its bit index in [0, hashbits.Count).
　　    private int SlotFor(string str)
　　    {
　　        if (str == null)
　　        {
　　            throw new ArgumentNullException("str");
　　        }

　　        // Reduce first, then take the absolute value: the remainder is always
　　        // strictly smaller in magnitude than Count, so Math.Abs cannot overflow.
　　        // Math.Abs(hash) % Count yields the same index but throws
　　        // OverflowException when the hash code is int.MinValue.
　　        return Math.Abs(str.GetHashCode() % hashbits.Count);
　　    }
　　}
　　Add：表里原先没有这个值（对应的位还没有被置位）时返回 false，否则返回 true；
　　Test：表里有这个值（对应的位已被置位）时返回 true，否则返回 false。
　　

　　
　　/// <summary>
　　/// Demo driver: feeds every line of "urls.txt" into a SimpleHashTable and
　　/// reports how many lines landed on an already-set bit.
　　/// </summary>
　　class Program
　　{
　　    /// <summary>
　　    /// Reads "urls.txt" line by line, adds each line to a 1,000,000-bit
　　    /// SimpleHashTable, and prints running and final collision statistics.
　　    /// </summary>
　　    /// <param name="args">Command-line arguments (unused).</param>
　　    static void Main(string[] args)
　　    {
　　        int urlsRead = 0;
　　        int collisions = 0;
　　        SimpleHashTable hashTable = new SimpleHashTable(1000000);

　　        using (StreamReader sr = new StreamReader("urls.txt"))
　　        {
　　            string url;
　　            while ((url = sr.ReadLine()) != null)
　　            {
　　                urlsRead++;

　　                // Add returns true when the bit was already set.
　　                bool rslt = hashTable.Add(url);
　　                if (rslt)
　　                {
　　                    collisions++;
　　                }

　　                // Progress line every 10,000 URLs.
　　                if ((urlsRead % 10000) == 0)
　　                {
　　                    Console.WriteLine("{0} {1} {2}%",
　　                        urlsRead, collisions, 100 * (double)collisions / urlsRead);
　　                }
　　            }
　　        }

　　        // Avoid printing NaN when the input file is empty.
　　        double rate = 0.0;
　　        if (urlsRead > 0)
　　        {
　　            rate = 100 * (double)collisions / urlsRead;
　　        }

　　        Console.WriteLine("{0} urls read", urlsRead);
　　        Console.WriteLine("{0} collisions", collisions);
　　        Console.WriteLine("False positive rate = {0}%", rate);
　　    }
　　}
　　

　　
　　The output from that program, run against the 100,000 unique URLs in my file, is:
　　10000 44 0.44%
　　20000 187 0.935%
　　30000 423 1.41%
　　40000 753 1.8825%
　　50000 1200 2.4%
　　60000 1753 2.92166666666667%
　　70000 2375 3.39285714285714%
　　80000 3123 3.90375%
　　90000 3946 4.38444444444444%
　　100000 4834 4.834%
　　100000 urls read
　　4834 collisions
　　False positive rate = 4.834%
　　然后写了一个简单的测试类,我们可以看到它的碰撞(冲突)还是比较明显的
　　
　　接下来就是如何继续去解决这样的问题
　　
　　创建一个新的 Hash算法函数 HashString
　　
　　$h_i(x) = (f_1(x) + i \cdot f_2(x)) \bmod m$
　　然后提供了一个减少碰撞的结构：hashKeys 保存每个字符串经过哈希得到的几个位置（本例中是三个）。
　　

　　
　　/// <summary>
　　/// A Bloom filter over strings. Each string is mapped to several bit
　　/// positions derived from two hash values (double hashing); the string is
　　/// reported as present only when every one of those bits is set.
　　/// </summary>
　　public class BloomFilter
　　{
　　    private readonly BitArray hashbits;
　　    private readonly int numKeys;

　　    // Scratch buffer holding the bit positions of the most recently hashed string.
　　    private readonly int[] hashKeys;

　　    /// <summary>Creates an empty filter.</summary>
　　    /// <param name="tableSize">Number of bits in the filter; must be positive.</param>
　　    /// <param name="nKeys">Number of bit positions computed per string; must be at least 1.</param>
　　    /// <exception cref="ArgumentOutOfRangeException">
　　    /// <paramref name="tableSize"/> or <paramref name="nKeys"/> is less than 1.
　　    /// </exception>
　　    public BloomFilter(int tableSize, int nKeys)
　　    {
　　        // A zero-sized table would make every lookup divide by zero.
　　        if (tableSize <= 0)
　　        {
　　            throw new ArgumentOutOfRangeException("tableSize", "Table size must be positive.");
　　        }

　　        // CreateHashes always writes hashKeys[0], so at least one key is required.
　　        if (nKeys < 1)
　　        {
　　            throw new ArgumentOutOfRangeException("nKeys", "At least one hash key is required.");
　　        }

　　        numKeys = nKeys;
　　        hashKeys = new int[numKeys];
　　        hashbits = new BitArray(tableSize);
　　    }

　　    // Second hash function: add/shift/xor mixing over the characters of s.
　　    private int HashString(string s)
　　    {
　　        // Wrap-around is intended; unchecked keeps that true in checked builds.
　　        unchecked
　　        {
　　            int hash = 0;
　　            for (int i = 0; i < s.Length; i++)
　　            {
　　                hash += s[i];
　　                hash += (hash << 10);
　　                hash ^= (hash >> 6);
　　            }

　　            hash += (hash << 3);
　　            hash ^= (hash >> 11);
　　            hash += (hash << 15);
　　            return hash;
　　        }
　　    }

　　    // Fills hashKeys with the bit positions for str:
　　    // key[i] = |(hash1 + i * hash2) % tableSize|.
　　    private void CreateHashes(string str)
　　    {
　　        if (str == null)
　　        {
　　            throw new ArgumentNullException("str");
　　        }

　　        int hash1 = str.GetHashCode();
　　        int hash2 = HashString(str);
　　        int size = hashbits.Count;

　　        // The remainder is taken before Math.Abs, so its magnitude is below
　　        // size and Math.Abs cannot overflow.
　　        hashKeys[0] = Math.Abs(hash1 % size);
　　        for (int i = 1; i < numKeys; i++)
　　        {
　　            // The sum may wrap around; that is intended.
　　            hashKeys[i] = Math.Abs(unchecked(hash1 + (i * hash2)) % size);
　　        }
　　    }

　　    /// <summary>Checks whether <paramref name="str"/> may have been added.</summary>
　　    /// <param name="str">The string to look up.</param>
　　    /// <returns>
　　    /// false if any of the string's bits is clear; true if all of them are set.
　　    /// </returns>
　　    /// <exception cref="ArgumentNullException"><paramref name="str"/> is null.</exception>
　　    public bool Test(string str)
　　    {
　　        CreateHashes(str);

　　        // Return false as soon as one of the bits is not set.
　　        foreach (int hash in hashKeys)
　　        {
　　            if (!hashbits[hash])
　　            {
　　                return false;
　　            }
　　        }

　　        // All bits set. The item is there.
　　        return true;
　　    }

　　    /// <summary>Sets every bit belonging to <paramref name="str"/>.</summary>
　　    /// <param name="str">The string to add.</param>
　　    /// <returns>
　　    /// true if all of the string's bits were already set before the call;
　　    /// false if at least one bit had to be set.
　　    /// </returns>
　　    /// <exception cref="ArgumentNullException"><paramref name="str"/> is null.</exception>
　　    public bool Add(string str)
　　    {
　　        // Initially assume that the item is in the table.
　　        bool rslt = true;
　　        CreateHashes(str);

　　        foreach (int hash in hashKeys)
　　        {
　　            if (!hashbits[hash])
　　            {
　　                // One of the bits wasn't set, so show that the item
　　                // wasn't in the table, and set that bit.
　　                rslt = false;
　　                hashbits[hash] = true;
　　            }
　　        }

　　        return rslt;
　　    }
　　}
　　测试:
　　

　　
　　/// <summary>
　　/// Demo driver: feeds every line of "urls.txt" into both a SimpleHashTable
　　/// and a BloomFilter and compares how often each reports an already-seen entry.
　　/// </summary>
　　class Program
　　{
　　    /// <summary>
　　    /// Reads "urls.txt" line by line, adds each line to a 1,000,000-bit
　　    /// SimpleHashTable and to a 480,833-bit BloomFilter with 3 keys, and
　　    /// prints running and final statistics for both.
　　    /// </summary>
　　    /// <param name="args">Command-line arguments (unused).</param>
　　    static void Main(string[] args)
　　    {
　　        int urlsRead = 0;
　　        int hashCollisions = 0;
　　        int bloomCollisions = 0;
　　        SimpleHashTable hashTable = new SimpleHashTable(1000000);
　　        BloomFilter bloom = new BloomFilter(480833, 3);

　　        using (StreamReader sr = new StreamReader("urls.txt"))
　　        {
　　            string url;
　　            while ((url = sr.ReadLine()) != null)
　　            {
　　                urlsRead++;

　　                // Add returns true when the entry was reported as already present.
　　                bool rslt = hashTable.Add(url);
　　                if (rslt)
　　                {
　　                    hashCollisions++;
　　                }

　　                rslt = bloom.Add(url);
　　                if (rslt)
　　                {
　　                    bloomCollisions++;
　　                }

　　                // Progress line every 10,000 URLs.
　　                if ((urlsRead % 10000) == 0)
　　                {
　　                    Console.WriteLine("{0} {1} {2}% {3} {4}%", urlsRead,
　　                        hashCollisions, 100 * (double)hashCollisions / urlsRead,
　　                        bloomCollisions, 100 * (double)bloomCollisions / urlsRead);
　　                }
　　            }
　　        }

　　        // Avoid printing NaN when the input file is empty.
　　        double hashRate = 0.0;
　　        double bloomRate = 0.0;
　　        if (urlsRead > 0)
　　        {
　　            hashRate = 100 * (double)hashCollisions / urlsRead;
　　            bloomRate = 100 * (double)bloomCollisions / urlsRead;
　　        }

　　        Console.WriteLine("{0} urls read", urlsRead);
　　        Console.WriteLine("{0} hash collisions", hashCollisions);
　　        Console.WriteLine("False positive rate (hash) = {0}%", hashRate);
　　        Console.WriteLine("{0} Bloom collisions", bloomCollisions);
　　        Console.WriteLine("False positive rate (Bloom) = {0}%", bloomRate);
　　    }
　　}
　　

　　
　　10000 44 0.44% 1 0.01%
　　20000 187 0.935% 10 0.05%
　　30000 423 1.41% 38 0.126666666666667%
　　40000 753 1.8825% 118 0.295%
　　50000 1200 2.4% 262 0.524%
　　60000 1753 2.92166666666667% 517 0.861666666666667%
　　70000 2375 3.39285714285714% 866 1.23714285714286%
　　80000 3123 3.90375% 1352 1.69%
　　90000 3946 4.38444444444444% 2118 2.35333333333333%
　　100000 4834 4.834% 2966 2.966%
　　100000 urls read
　　4834 hash collisions
　　False positive rate (hash) = 4.834%
　　2966 Bloom collisions
　　False positive rate (Bloom) = 2.966%
　　在添加数据的时候，它会把这条数据对应的几个位置（本例中是三个）全部置位；判断是否已存在时，只有这几个位置全部已经被置位才认为数据已存在。如果这几个位置恰好都被别的数据占用了，那就发生碰撞（误判）了。
　　实际总的容量是有限的.
　　还有泛型和它的例子
　　

　　
　　/// <summary>
　　/// Generic Bloom filter base class. Derived classes supply two hash
　　/// functions for <typeparamref name="TValue"/>; the base class combines
　　/// them into several bit positions (double hashing) and maintains the bit table.
　　/// </summary>
　　/// <typeparam name="TValue">Type of the items stored in the filter.</typeparam>
　　public abstract class BloomFilter<TValue>
　　{
　　    private BitArray hashbits;
　　    private int numKeys;

　　    /// <summary>
　　    /// Scratch buffer holding the bit positions of the most recently hashed
　　    /// value; filled by <see cref="CreateHashes"/>.
　　    /// </summary>
　　    protected int[] hashKeys;

　　    /// <summary>Creates an empty filter.</summary>
　　    /// <param name="tableSize">Number of bits in the filter; must be positive.</param>
　　    /// <param name="nKeys">Number of bit positions computed per value; must be at least 1.</param>
　　    /// <exception cref="ArgumentOutOfRangeException">
　　    /// <paramref name="tableSize"/> or <paramref name="nKeys"/> is less than 1.
　　    /// </exception>
　　    public BloomFilter(int tableSize, int nKeys)
　　    {
　　        // A zero-sized table would make every lookup divide by zero.
　　        if (tableSize <= 0)
　　        {
　　            throw new ArgumentOutOfRangeException("tableSize", "Table size must be positive.");
　　        }

　　        // CreateHashes always writes hashKeys[0], so at least one key is required.
　　        if (nKeys < 1)
　　        {
　　            throw new ArgumentOutOfRangeException("nKeys", "At least one hash key is required.");
　　        }

　　        numKeys = nKeys;
　　        hashKeys = new int[numKeys];
　　        hashbits = new BitArray(tableSize);
　　    }

　　    /// <summary>Checks whether <paramref name="val"/> may have been added.</summary>
　　    /// <param name="val">The value to look up.</param>
　　    /// <returns>
　　    /// false if any of the value's bits is clear; true if all of them are set.
　　    /// </returns>
　　    public bool Test(TValue val)
　　    {
　　        CreateHashes(val);

　　        // Return false as soon as one of the bits is not set.
　　        foreach (int hash in hashKeys)
　　        {
　　            if (!hashbits[hash])
　　            {
　　                return false;
　　            }
　　        }

　　        // All bits set. The item is there.
　　        return true;
　　    }

　　    /// <summary>Sets every bit belonging to <paramref name="val"/>.</summary>
　　    /// <param name="val">The value to add.</param>
　　    /// <returns>
　　    /// true if all of the value's bits were already set before the call;
　　    /// false if at least one bit had to be set.
　　    /// </returns>
　　    public bool Add(TValue val)
　　    {
　　        // Initially assume that the item is in the table.
　　        bool rslt = true;
　　        CreateHashes(val);

　　        foreach (int hash in hashKeys)
　　        {
　　            if (!hashbits[hash])
　　            {
　　                // One of the bits wasn't set, so show that the item
　　                // wasn't in the table, and set that bit.
　　                rslt = false;
　　                hashbits[hash] = true;
　　            }
　　        }

　　        return rslt;
　　    }

　　    /// <summary>
　　    /// Fills <see cref="hashKeys"/> with the bit positions for
　　    /// <paramref name="val"/>: key[i] = |(hash1 + i * hash2) % tableSize|.
　　    /// </summary>
　　    /// <param name="val">The value to hash.</param>
　　    protected virtual void CreateHashes(TValue val)
　　    {
　　        int hash1 = CreateHash1(val);
　　        int hash2 = CreateHash2(val);
　　        int size = hashbits.Count;

　　        // The remainder is taken before Math.Abs, so its magnitude is below
　　        // size and Math.Abs cannot overflow.
　　        hashKeys[0] = Math.Abs(hash1 % size);
　　        for (int i = 1; i < numKeys; i++)
　　        {
　　            // The sum may wrap around; that is intended.
　　            hashKeys[i] = Math.Abs(unchecked(hash1 + (i * hash2)) % size);
　　        }
　　    }

　　    /// <summary>Computes the first hash value of <paramref name="val"/>.</summary>
　　    protected abstract int CreateHash1(TValue val);

　　    /// <summary>Computes the second hash value of <paramref name="val"/>.</summary>
　　    protected abstract int CreateHash2(TValue val);
　　}

　　/// <summary>
　　/// Bloom filter for strings: the first hash is string.GetHashCode, the
　　/// second is an add/shift/xor mix over the characters.
　　/// </summary>
　　class StringBloomFilter : BloomFilter<string>
　　{
　　    /// <summary>Creates an empty string filter.</summary>
　　    /// <param name="tableSize">Number of bits in the filter.</param>
　　    /// <param name="nKeys">Number of bit positions computed per string.</param>
　　    public StringBloomFilter(int tableSize, int nKeys)
　　        : base(tableSize, nKeys)
　　    {
　　    }

　　    /// <summary>First hash: the string's own hash code.</summary>
　　    /// <exception cref="ArgumentNullException"><paramref name="val"/> is null.</exception>
　　    protected override int CreateHash1(string val)
　　    {
　　        if (val == null)
　　        {
　　            throw new ArgumentNullException("val");
　　        }

　　        return val.GetHashCode();
　　    }

　　    /// <summary>Second hash: add/shift/xor mixing over the characters.</summary>
　　    /// <exception cref="ArgumentNullException"><paramref name="val"/> is null.</exception>
　　    protected override int CreateHash2(string val)
　　    {
　　        if (val == null)
　　        {
　　            throw new ArgumentNullException("val");
　　        }

　　        // Wrap-around is intended; unchecked keeps that true in checked builds.
　　        unchecked
　　        {
　　            int hash = 0;
　　            for (int i = 0; i < val.Length; i++)
　　            {
　　                hash += val[i];
　　                hash += (hash << 10);
　　                hash ^= (hash >> 6);
　　            }

　　            hash += (hash << 3);
　　            hash ^= (hash >> 11);
　　            hash += (hash << 15);
　　            return hash;
　　        }
　　    }
　　}
　　

　　
　　让我们的蜘蛛跑得更快吧
　　Google 黑板报的《数学之美》系列那边也介绍到了布隆过滤器。
　　http://googlechinablog.com/2007/07/bloom-filter.html

本文转自
http://www.cnblogs.com/lovebanyi/archive/2007/07/06/808736.html

内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理

标签：

相关文章推荐

新的分享

章节导航