大数据量的过滤 (用于爬虫,蜘蛛) Bloom Filter 布隆过滤器
2012-11-28 10:18
483 查看
来自:http://www.cnblogs.com/lovebanyi/archive/2007/07/06/808736.html
原文:Bloom Filters in C#
http://www.devsource.com/article2/0,1895,2113495,00.asp
想像一下.如果你有一个非常大的无序的数据(url连接) 并且你要保证同样的一条连接不会在其它地方再次出现
你实时地收集这些数据,在不断增加的数据当中,你没有办法预防两个相同的 url 出现.
当这些数据比较少的时候,你可以轻松地创建一个 list(dictionary 或 hashtable,或者你自己的数据结构),然后遍历它们,看它是不是已经存在于这个 list 当中,
遍历所花的时间是非常多的,
但是 当这些数据的长度超过可用的内存的时候? hashtable可以帮我加快速度,但是存...
上面的废话比较多..看起来累,又拗口,继续不下去了.
得出的方案是用 bit数组来节省空间
A hash table capable of holding m items will take m/8 bytes of memory (one bit per item).
接下来是实际的解决方法
.NET 里面有一个类 BitArray,它可以帮我们很容易地创建出一个 hashtable
创建一个简单的 HashingTable
// A fixed-size bit table: each string is mapped, via its hash code, to a single
// bit. Different strings can map to the same bit, so a "present" answer may be a
// false positive; bits are never cleared, so an added string is always found.
public class SimpleHashTable
{
    private readonly BitArray hashbits;

    // Creates a table of tableSize bits, all initially cleared.
    // Throws ArgumentOutOfRangeException when tableSize is less than 1, because
    // an empty table would make every later lookup divide by zero.
    public SimpleHashTable(int tableSize)
    {
        if (tableSize < 1)
            throw new ArgumentOutOfRangeException("tableSize", "Table size must be at least 1.");
        hashbits = new BitArray(tableSize, false);
    }

    // Returns true when the bit selected by str is already set.
    public bool Test(string str)
    {
        return hashbits[IndexOf(str)];
    }

    // Sets the bit selected by str. Returns the bit's previous state:
    // false when the bit was clear (new entry), true when it was already set.
    public bool Add(string str)
    {
        int index = IndexOf(str);
        bool rslt = hashbits[index];
        if (!rslt)
            hashbits[index] = true;
        return rslt;
    }

    // Maps a string to a bit position in [0, hashbits.Count).
    private int IndexOf(string str)
    {
        if (str == null)
            throw new ArgumentNullException("str");
        // Take the remainder BEFORE the absolute value. Math.Abs(int.MinValue)
        // throws OverflowException, whereas the remainder always lies strictly
        // between -Count and Count, so Math.Abs is safe on it. For every other
        // hash value the result equals Math.Abs(hash) % Count.
        return Math.Abs(str.GetHashCode() % hashbits.Count);
    }
}
Add 里面没有这个值的时候是返回 false ,
Test 里面有值的话是返回 true;
// Reads "urls.txt" line by line, feeds every line to a SimpleHashTable of
// 1,000,000 bits and counts how often Add reports the bit as already set.
// The count is printed as a "false positive rate", which presumably relies on
// the input lines all being distinct -- confirm against the data file.
class Program
{
    static void Main(string[] args)
    {
        SimpleHashTable table = new SimpleHashTable(1000000);
        int total = 0;
        int hits = 0;

        using (StreamReader reader = new StreamReader("urls.txt"))
        {
            for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
            {
                total++;
                if (table.Add(line))
                {
                    hits++;
                }

                // Progress report every 10,000 lines.
                if (total % 10000 != 0)
                {
                    continue;
                }
                Console.WriteLine("{0} {1} {2}%", total, hits, Percent(hits, total));
            }
        }

        Console.WriteLine("{0} urls read", total);
        Console.WriteLine("{0} collisions", hits);
        Console.WriteLine("False positive rate = {0}%", Percent(hits, total));
    }

    // part as a percentage of whole, computed in floating point.
    private static double Percent(int part, int whole)
    {
        return 100 * (double)part / whole;
    }
}
The output from that program, run against the 100,000 unique URLs in my file, is:
然后提供了一个 防止碰撞的结构. hashkeys 保存这个hash的三个位置
// A Bloom filter over strings: every string selects numKeys bit positions in a
// fixed-size bit table. A string is reported as present only when all of its
// positions are set. Bits are never cleared, so an added string is always found;
// an absent string may still be reported present when other strings set its bits.
public class BloomFilter
{
    private readonly BitArray hashbits;
    private readonly int numKeys;
    // Scratch buffer holding the positions of the string currently being
    // processed. It is overwritten by every Test/Add call.
    private readonly int[] hashKeys;

    // tableSize: number of bits in the table (must be at least 1).
    // nKeys: number of bit positions derived per string (must be at least 1).
    // Throws ArgumentOutOfRangeException for values below 1; previously such
    // values were accepted here and failed later, on the first Test/Add call.
    public BloomFilter(int tableSize, int nKeys)
    {
        if (tableSize < 1)
            throw new ArgumentOutOfRangeException("tableSize", "Table size must be at least 1.");
        if (nKeys < 1)
            throw new ArgumentOutOfRangeException("nKeys", "At least one hash key is required.");
        numKeys = nKeys;
        hashKeys = new int[numKeys];
        hashbits = new BitArray(tableSize);
    }

    // Second hash function: a shift/add/xor mix over the characters of s.
    private int HashString(string s)
    {
        // The mixing steps are meant to wrap around on overflow; say so
        // explicitly so that compiling with /checked does not turn them into
        // OverflowException.
        unchecked
        {
            int hash = 0;
            for (int i = 0; i < s.Length; i++)
            {
                hash += s[i];
                hash += (hash << 10);
                hash ^= (hash >> 6);
            }
            hash += (hash << 3);
            hash ^= (hash >> 11);
            hash += (hash << 15);
            return hash;
        }
    }

    // Fills hashKeys with the positions for str: position i is
    // |(hash1 + i * hash2) mod tableSize|, so position 0 depends on hash1 alone.
    private void CreateHashes(string str)
    {
        if (str == null)
            throw new ArgumentNullException("str");
        int hash1 = str.GetHashCode();
        int hash2 = HashString(str);
        int size = hashbits.Count;
        for (int i = 0; i < numKeys; i++)
        {
            // The sum may wrap; the remainder is then strictly between -size
            // and size, so Math.Abs cannot overflow.
            hashKeys[i] = Math.Abs(unchecked(hash1 + (i * hash2)) % size);
        }
    }

    // Returns true when every bit selected by str is set.
    public bool Test(string str)
    {
        CreateHashes(str);
        foreach (int hash in hashKeys)
        {
            // A single clear bit proves the string was never added.
            if (!hashbits[hash])
                return false;
        }
        return true;
    }

    // Sets every bit selected by str. Returns true when all of them were
    // already set before the call, false when at least one bit was newly set.
    public bool Add(string str)
    {
        bool rslt = true;
        CreateHashes(str);
        foreach (int hash in hashKeys)
        {
            if (!hashbits[hash])
            {
                rslt = false;
                hashbits[hash] = true;
            }
        }
        return rslt;
    }
}
测试:
// Reads "urls.txt" line by line and feeds every line to both a SimpleHashTable
// (1,000,000 bits) and a BloomFilter (480,833 bits, 3 keys), counting for each
// how often Add reports the entry as already present. The counts are printed as
// "false positive" rates, which presumably relies on the input lines all being
// distinct -- confirm against the data file.
class Program
{
    static void Main(string[] args)
    {
        SimpleHashTable table = new SimpleHashTable(1000000);
        BloomFilter filter = new BloomFilter(480833, 3);
        int total = 0;
        int tableHits = 0;
        int filterHits = 0;

        using (StreamReader reader = new StreamReader("urls.txt"))
        {
            for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
            {
                total++;
                if (table.Add(line))
                {
                    tableHits++;
                }
                if (filter.Add(line))
                {
                    filterHits++;
                }

                // Progress report every 10,000 lines.
                if (total % 10000 != 0)
                {
                    continue;
                }
                Console.WriteLine("{0} {1} {2}% {3} {4}%", total,
                    tableHits, Percent(tableHits, total),
                    filterHits, Percent(filterHits, total));
            }
        }

        Console.WriteLine("{0} urls read", total);
        Console.WriteLine("{0} hash collisions", tableHits);
        Console.WriteLine("False positive rate (hash) = {0}%", Percent(tableHits, total));
        Console.WriteLine("{0} Bloom collisions", filterHits);
        Console.WriteLine("False positive rate (Bloom) = {0}%", Percent(filterHits, total));
    }

    // part as a percentage of whole, computed in floating point.
    private static double Percent(int part, int whole)
    {
        return 100 * (double)part / whole;
    }
}
在添加数据的时候,它会用多个 hash 算出多个位置(这里是三个),并把这些位置上的位全部置为 1. 只有当这些位置在添加之前就已经全部为 1 时,才认为这条数据已经存在,也就是发生了碰撞(误判).
实际总的容量是有限的.
还有泛型和它的例子
// Generic Bloom filter base class. Subclasses supply two hash functions for
// TValue; every value then selects numKeys bit positions in a fixed-size bit
// table. A value is reported as present only when all of its positions are set.
// (The <TValue> type parameter had been lost from the published listing, which
// left TValue undefined; it is restored here.)
public abstract class BloomFilter<TValue>
{
    private readonly BitArray hashbits;
    private readonly int numKeys;
    // Scratch buffer holding the positions of the value currently being
    // processed. It is overwritten by every Test/Add call.
    protected int[] hashKeys;

    // tableSize: number of bits in the table (must be at least 1).
    // nKeys: number of bit positions derived per value (must be at least 1).
    // Throws ArgumentOutOfRangeException for values below 1, which would
    // otherwise only fail later, on the first Test/Add call.
    public BloomFilter(int tableSize, int nKeys)
    {
        if (tableSize < 1)
            throw new ArgumentOutOfRangeException("tableSize", "Table size must be at least 1.");
        if (nKeys < 1)
            throw new ArgumentOutOfRangeException("nKeys", "At least one hash key is required.");
        numKeys = nKeys;
        hashKeys = new int[numKeys];
        hashbits = new BitArray(tableSize);
    }

    // Returns true when every bit selected by val is set.
    public bool Test(TValue val)
    {
        if (val == null)
            throw new ArgumentNullException("val");
        CreateHashes(val);
        foreach (int hash in hashKeys)
        {
            // A single clear bit proves the value was never added.
            if (!hashbits[hash])
                return false;
        }
        return true;
    }

    // Sets every bit selected by val. Returns true when all of them were
    // already set before the call, false when at least one bit was newly set.
    public bool Add(TValue val)
    {
        if (val == null)
            throw new ArgumentNullException("val");
        bool rslt = true;
        CreateHashes(val);
        foreach (int hash in hashKeys)
        {
            if (!hashbits[hash])
            {
                rslt = false;
                hashbits[hash] = true;
            }
        }
        return rslt;
    }

    // Fills hashKeys with the positions for val: position i is
    // |(hash1 + i * hash2) mod tableSize|, so position 0 depends on hash1 alone.
    protected virtual void CreateHashes(TValue val)
    {
        int hash1 = CreateHash1(val);
        int hash2 = CreateHash2(val);
        int size = hashbits.Count;
        for (int i = 0; i < numKeys; i++)
        {
            // The sum may wrap; the remainder is then strictly between -size
            // and size, so Math.Abs cannot overflow.
            hashKeys[i] = Math.Abs(unchecked(hash1 + (i * hash2)) % size);
        }
    }

    // First hash function for val.
    protected abstract int CreateHash1(TValue val);

    // Second hash function for val; it is combined with the first one to
    // derive the additional positions.
    protected abstract int CreateHash2(TValue val);
}

// Bloom filter over strings: uses string.GetHashCode as the first hash and a
// shift/add/xor mix over the characters as the second.
class StringBloomFilter : BloomFilter<string>
{
    public StringBloomFilter(int tableSize, int nKeys)
        : base(tableSize, nKeys)
    {
    }

    protected override int CreateHash1(string val)
    {
        return val.GetHashCode();
    }

    protected override int CreateHash2(string val)
    {
        // The mixing steps are meant to wrap around on overflow; say so
        // explicitly so that compiling with /checked does not turn them into
        // OverflowException.
        unchecked
        {
            int hash = 0;
            for (int i = 0; i < val.Length; i++)
            {
                hash += val[i];
                hash += (hash << 10);
                hash ^= (hash >> 6);
            }
            hash += (hash << 3);
            hash ^= (hash >> 11);
            hash += (hash << 15);
            return hash;
        }
    }
}
让我们的蜘蛛跑得更快吧
Google 数学之美那边也介绍到了.
http://googlechinablog.com/2007/07/bloom-filter.html
原文:Bloom Filters in C#
http://www.devsource.com/article2/0,1895,2113495,00.asp
想像一下.如果你有一个非常大的无序的数据(url连接) 并且你要保证同样的一条连接不会在其它地方再次出现
你实时地收集这些数据,在不断增加的数据当中,你没有办法预防两个相同的 url 出现.
当这些数据比较少的时候,你可以轻松地创建一个 list(dictionary 或 hashtable,或者你自己的数据结构),然后遍历它们,看它是不是已经存在于这个 list 当中,
遍历所花的时间是非常多的,
但是 当这些数据的长度超过可用的内存的时候? hashtable可以帮我加快速度,但是存...
上面的废话比较多..看起来累,又拗口,继续不下去了.
得出的方案是用 bit数组来节省空间
A hash table capable of holding m items will take m/8 bytes of memory (one bit per item).
接下来是实际的解决方法
.NET 里面有一个类 BitArray,它可以帮我们很容易地创建出一个 hashtable
创建一个简单的 HashingTable
// A fixed-size bit table: each string is mapped, via its hash code, to a single
// bit. Different strings can map to the same bit, so a "present" answer may be a
// false positive; bits are never cleared, so an added string is always found.
public class SimpleHashTable
{
    private readonly BitArray hashbits;

    // Creates a table of tableSize bits, all initially cleared.
    // Throws ArgumentOutOfRangeException when tableSize is less than 1, because
    // an empty table would make every later lookup divide by zero.
    public SimpleHashTable(int tableSize)
    {
        if (tableSize < 1)
            throw new ArgumentOutOfRangeException("tableSize", "Table size must be at least 1.");
        hashbits = new BitArray(tableSize, false);
    }

    // Returns true when the bit selected by str is already set.
    public bool Test(string str)
    {
        return hashbits[IndexOf(str)];
    }

    // Sets the bit selected by str. Returns the bit's previous state:
    // false when the bit was clear (new entry), true when it was already set.
    public bool Add(string str)
    {
        int index = IndexOf(str);
        bool rslt = hashbits[index];
        if (!rslt)
            hashbits[index] = true;
        return rslt;
    }

    // Maps a string to a bit position in [0, hashbits.Count).
    private int IndexOf(string str)
    {
        if (str == null)
            throw new ArgumentNullException("str");
        // Take the remainder BEFORE the absolute value. Math.Abs(int.MinValue)
        // throws OverflowException, whereas the remainder always lies strictly
        // between -Count and Count, so Math.Abs is safe on it. For every other
        // hash value the result equals Math.Abs(hash) % Count.
        return Math.Abs(str.GetHashCode() % hashbits.Count);
    }
}
Add 里面没有这个值的时候是返回 false ,
Test 里面有值的话是返回 true;
// Reads "urls.txt" line by line, feeds every line to a SimpleHashTable of
// 1,000,000 bits and counts how often Add reports the bit as already set.
// The count is printed as a "false positive rate", which presumably relies on
// the input lines all being distinct -- confirm against the data file.
class Program
{
    static void Main(string[] args)
    {
        SimpleHashTable table = new SimpleHashTable(1000000);
        int total = 0;
        int hits = 0;

        using (StreamReader reader = new StreamReader("urls.txt"))
        {
            for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
            {
                total++;
                if (table.Add(line))
                {
                    hits++;
                }

                // Progress report every 10,000 lines.
                if (total % 10000 != 0)
                {
                    continue;
                }
                Console.WriteLine("{0} {1} {2}%", total, hits, Percent(hits, total));
            }
        }

        Console.WriteLine("{0} urls read", total);
        Console.WriteLine("{0} collisions", hits);
        Console.WriteLine("False positive rate = {0}%", Percent(hits, total));
    }

    // part as a percentage of whole, computed in floating point.
    private static double Percent(int part, int whole)
    {
        return 100 * (double)part / whole;
    }
}
The output from that program, run against the 100,000 unique URLs in my file, is:
10000 44 0.44% 20000 187 0.935% 30000 423 1.41% 40000 753 1.8825% 50000 1200 2.4% 60000 1753 2.92166666666667% 70000 2375 3.39285714285714% 80000 3123 3.90375% 90000 3946 4.38444444444444% 100000 4834 4.834% 100000 urls read 4834 collisions False positive rate = 4.834%
然后写了一个简单的测试类,我们可以看到它的碰撞(冲突)还是比较明显的
接下来就是如何继续去解决这样的问题:创建一个新的 Hash 算法函数 HashString,并用两个 hash 组合出第 i 个位置 $h_i(x) = (f_1(x) + i \cdot f_2(x)) \bmod m$
然后提供了一个 防止碰撞的结构. hashkeys 保存这个hash的三个位置
// A Bloom filter over strings: every string selects numKeys bit positions in a
// fixed-size bit table. A string is reported as present only when all of its
// positions are set. Bits are never cleared, so an added string is always found;
// an absent string may still be reported present when other strings set its bits.
public class BloomFilter
{
    private readonly BitArray hashbits;
    private readonly int numKeys;
    // Scratch buffer holding the positions of the string currently being
    // processed. It is overwritten by every Test/Add call.
    private readonly int[] hashKeys;

    // tableSize: number of bits in the table (must be at least 1).
    // nKeys: number of bit positions derived per string (must be at least 1).
    // Throws ArgumentOutOfRangeException for values below 1; previously such
    // values were accepted here and failed later, on the first Test/Add call.
    public BloomFilter(int tableSize, int nKeys)
    {
        if (tableSize < 1)
            throw new ArgumentOutOfRangeException("tableSize", "Table size must be at least 1.");
        if (nKeys < 1)
            throw new ArgumentOutOfRangeException("nKeys", "At least one hash key is required.");
        numKeys = nKeys;
        hashKeys = new int[numKeys];
        hashbits = new BitArray(tableSize);
    }

    // Second hash function: a shift/add/xor mix over the characters of s.
    private int HashString(string s)
    {
        // The mixing steps are meant to wrap around on overflow; say so
        // explicitly so that compiling with /checked does not turn them into
        // OverflowException.
        unchecked
        {
            int hash = 0;
            for (int i = 0; i < s.Length; i++)
            {
                hash += s[i];
                hash += (hash << 10);
                hash ^= (hash >> 6);
            }
            hash += (hash << 3);
            hash ^= (hash >> 11);
            hash += (hash << 15);
            return hash;
        }
    }

    // Fills hashKeys with the positions for str: position i is
    // |(hash1 + i * hash2) mod tableSize|, so position 0 depends on hash1 alone.
    private void CreateHashes(string str)
    {
        if (str == null)
            throw new ArgumentNullException("str");
        int hash1 = str.GetHashCode();
        int hash2 = HashString(str);
        int size = hashbits.Count;
        for (int i = 0; i < numKeys; i++)
        {
            // The sum may wrap; the remainder is then strictly between -size
            // and size, so Math.Abs cannot overflow.
            hashKeys[i] = Math.Abs(unchecked(hash1 + (i * hash2)) % size);
        }
    }

    // Returns true when every bit selected by str is set.
    public bool Test(string str)
    {
        CreateHashes(str);
        foreach (int hash in hashKeys)
        {
            // A single clear bit proves the string was never added.
            if (!hashbits[hash])
                return false;
        }
        return true;
    }

    // Sets every bit selected by str. Returns true when all of them were
    // already set before the call, false when at least one bit was newly set.
    public bool Add(string str)
    {
        bool rslt = true;
        CreateHashes(str);
        foreach (int hash in hashKeys)
        {
            if (!hashbits[hash])
            {
                rslt = false;
                hashbits[hash] = true;
            }
        }
        return rslt;
    }
}
测试:
// Reads "urls.txt" line by line and feeds every line to both a SimpleHashTable
// (1,000,000 bits) and a BloomFilter (480,833 bits, 3 keys), counting for each
// how often Add reports the entry as already present. The counts are printed as
// "false positive" rates, which presumably relies on the input lines all being
// distinct -- confirm against the data file.
class Program
{
    static void Main(string[] args)
    {
        SimpleHashTable table = new SimpleHashTable(1000000);
        BloomFilter filter = new BloomFilter(480833, 3);
        int total = 0;
        int tableHits = 0;
        int filterHits = 0;

        using (StreamReader reader = new StreamReader("urls.txt"))
        {
            for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
            {
                total++;
                if (table.Add(line))
                {
                    tableHits++;
                }
                if (filter.Add(line))
                {
                    filterHits++;
                }

                // Progress report every 10,000 lines.
                if (total % 10000 != 0)
                {
                    continue;
                }
                Console.WriteLine("{0} {1} {2}% {3} {4}%", total,
                    tableHits, Percent(tableHits, total),
                    filterHits, Percent(filterHits, total));
            }
        }

        Console.WriteLine("{0} urls read", total);
        Console.WriteLine("{0} hash collisions", tableHits);
        Console.WriteLine("False positive rate (hash) = {0}%", Percent(tableHits, total));
        Console.WriteLine("{0} Bloom collisions", filterHits);
        Console.WriteLine("False positive rate (Bloom) = {0}%", Percent(filterHits, total));
    }

    // part as a percentage of whole, computed in floating point.
    private static double Percent(int part, int whole)
    {
        return 100 * (double)part / whole;
    }
}
10000 44 0.44% 1 0.01% 20000 187 0.935% 10 0.05% 30000 423 1.41% 38 0.126666666666667% 40000 753 1.8825% 118 0.295% 50000 1200 2.4% 262 0.524% 60000 1753 2.92166666666667% 517 0.861666666666667% 70000 2375 3.39285714285714% 866 1.23714285714286% 80000 3123 3.90375% 1352 1.69% 90000 3946 4.38444444444444% 2118 2.35333333333333% 100000 4834 4.834% 2966 2.966% 100000 urls read 4834 hash collisions False positive rate (hash) = 4.834% 2966 Bloom collisions False positive rate (Bloom) = 2.966%
在添加数据的时候,它会用多个 hash 算出多个位置(这里是三个),并把这些位置上的位全部置为 1. 只有当这些位置在添加之前就已经全部为 1 时,才认为这条数据已经存在,也就是发生了碰撞(误判).
实际总的容量是有限的.
还有泛型和它的例子
// Generic Bloom filter base class. Subclasses supply two hash functions for
// TValue; every value then selects numKeys bit positions in a fixed-size bit
// table. A value is reported as present only when all of its positions are set.
// (The <TValue> type parameter had been lost from the published listing, which
// left TValue undefined; it is restored here.)
public abstract class BloomFilter<TValue>
{
    private readonly BitArray hashbits;
    private readonly int numKeys;
    // Scratch buffer holding the positions of the value currently being
    // processed. It is overwritten by every Test/Add call.
    protected int[] hashKeys;

    // tableSize: number of bits in the table (must be at least 1).
    // nKeys: number of bit positions derived per value (must be at least 1).
    // Throws ArgumentOutOfRangeException for values below 1, which would
    // otherwise only fail later, on the first Test/Add call.
    public BloomFilter(int tableSize, int nKeys)
    {
        if (tableSize < 1)
            throw new ArgumentOutOfRangeException("tableSize", "Table size must be at least 1.");
        if (nKeys < 1)
            throw new ArgumentOutOfRangeException("nKeys", "At least one hash key is required.");
        numKeys = nKeys;
        hashKeys = new int[numKeys];
        hashbits = new BitArray(tableSize);
    }

    // Returns true when every bit selected by val is set.
    public bool Test(TValue val)
    {
        if (val == null)
            throw new ArgumentNullException("val");
        CreateHashes(val);
        foreach (int hash in hashKeys)
        {
            // A single clear bit proves the value was never added.
            if (!hashbits[hash])
                return false;
        }
        return true;
    }

    // Sets every bit selected by val. Returns true when all of them were
    // already set before the call, false when at least one bit was newly set.
    public bool Add(TValue val)
    {
        if (val == null)
            throw new ArgumentNullException("val");
        bool rslt = true;
        CreateHashes(val);
        foreach (int hash in hashKeys)
        {
            if (!hashbits[hash])
            {
                rslt = false;
                hashbits[hash] = true;
            }
        }
        return rslt;
    }

    // Fills hashKeys with the positions for val: position i is
    // |(hash1 + i * hash2) mod tableSize|, so position 0 depends on hash1 alone.
    protected virtual void CreateHashes(TValue val)
    {
        int hash1 = CreateHash1(val);
        int hash2 = CreateHash2(val);
        int size = hashbits.Count;
        for (int i = 0; i < numKeys; i++)
        {
            // The sum may wrap; the remainder is then strictly between -size
            // and size, so Math.Abs cannot overflow.
            hashKeys[i] = Math.Abs(unchecked(hash1 + (i * hash2)) % size);
        }
    }

    // First hash function for val.
    protected abstract int CreateHash1(TValue val);

    // Second hash function for val; it is combined with the first one to
    // derive the additional positions.
    protected abstract int CreateHash2(TValue val);
}

// Bloom filter over strings: uses string.GetHashCode as the first hash and a
// shift/add/xor mix over the characters as the second.
class StringBloomFilter : BloomFilter<string>
{
    public StringBloomFilter(int tableSize, int nKeys)
        : base(tableSize, nKeys)
    {
    }

    protected override int CreateHash1(string val)
    {
        return val.GetHashCode();
    }

    protected override int CreateHash2(string val)
    {
        // The mixing steps are meant to wrap around on overflow; say so
        // explicitly so that compiling with /checked does not turn them into
        // OverflowException.
        unchecked
        {
            int hash = 0;
            for (int i = 0; i < val.Length; i++)
            {
                hash += val[i];
                hash += (hash << 10);
                hash ^= (hash >> 6);
            }
            hash += (hash << 3);
            hash ^= (hash >> 11);
            hash += (hash << 15);
            return hash;
        }
    }
}
让我们的蜘蛛跑得更快吧
Google 数学之美那边也介绍到了.
http://googlechinablog.com/2007/07/bloom-filter.html
相关文章推荐
- 大数据量的过滤 (用于爬虫,蜘蛛) Bloom Filter 布隆过滤器
- 大数据量的过滤 (用于爬虫,蜘蛛) Bloom Filter 布隆过滤器
- 大数据量的过滤 (用于爬虫,蜘蛛) Bloom Filter 布隆过滤器
- 大数据量的过滤 (用于爬虫,蜘蛛) Bloom Filter 布隆过滤器
- 一个用于白名单服务的布隆过滤器(bloom filter)
- [大数据量]布隆过滤器(Bloom Filter)适用类型以及具体示例
- 爬虫 —— 布隆过滤器算法(Bloom Filter)
- 流过滤-Bloom Filter布隆过滤器
- Bloom Filter(布隆过滤器)用于 检查一个元素是否在集合中
- 线段相交算法——平面扫描(可用于空间连接查询过滤)
- 布隆过滤器 (Bloom Filter) 详解
- 正则表达式__【匹配、切割、替换】【获取:Pattern & Matcher】【网页爬虫(蜘蛛)】
- 布隆过滤器(Bloom Filter)
- 布隆过滤器(Bloom Filter)详解
- 用于过滤空白字符的几种常见的js正则表达式pattern
- Bloom Filter 布隆过滤器
- Crawljax - 支持Ajax的网络爬虫,可以用于WEB自动化测试
- Bloom Filter布隆过滤器
- robot.txt 搜索引擎 蜘蛛爬虫 搜索规则
- 热点_C#实现蜘蛛_爬虫程序的多线程控制