您的位置:首页 > 其它

AC自动机(多模式串“KMP”)模版

2016-09-22 22:31 489 查看
参考博客:kuangbin《AC自动机小结》;《AC自动机算法》;《海量数据处理之Trie树(字典树)》

AC自动机(Aho-Corasick automaton)是建立在字典树(Trie)上的多模式串快速匹配算法。

一个典型的例子是:给出N个单词和一篇文章,统计文章中出现了多少种(个)给定的单词。

要想理解AC自动机必须先学Trie。Trie是一棵k叉树,除根节点之外,每个节点都储存了一个字符(字母),于是从根节点往下看,从根到每个单词结尾节点的路径都对应一个单词。

AC自动机就是建立在Trie数据结构上的一个算法,类似于在Trie树上做KMP。与KMP的next指针类似,它有一个fail指针,用来加快匹配速度。

它可以判断某单词是否在文章中出现(可重叠),以及出现的次数。

具体原理不赘述,网上各种解释遍地开花。对于偷懒者来说,求个模版就算了……

AC自动机算法主要有三个步骤

1)建Trie树

2)构造fail指针

3)匹配

【模版】(带详细解释)

const int MAXN = 500*200;    // capacity of the node pool: number of patterns * maximum pattern length
const int MAXL = 10000+10;   // maximum length of the text to be searched
const int MAXM = 128;        // branching factor of the trie, i.e. number of distinct characters

// Aho-Corasick automaton stored in fixed-size arrays.
//
// Usage: init(), insert() every pattern, build() once, then query() each text.
// insert() must not be called after build(): build() overwrites the -1
// "no child" markers that insert() relies on.
// The next[][] table alone is MAXN * MAXM ints (about 51 MB), so an instance
// should not be created on the stack.
struct Trie
{
    enum { NUM_CAP = 501 };   // capacity of num[]; pattern ids must lie in [0, NUM_CAP)

    int next[MAXN][MAXM];     // child / goto table; -1 means "no child" until build() runs
    int fail[MAXN];           // failure link of every node
    int end[MAXN];            // id of the pattern ending at the node, -1 if none
    int root, L;              // root node index and number of nodes allocated so far

    // Maps a text byte to a column of next[], or -1 when the byte lies outside
    // the alphabet. Going through unsigned char keeps bytes >= 0x80 from
    // becoming negative indices on platforms where plain char is signed.
    static int symbol(char c)
    {
        const unsigned char u = static_cast<unsigned char>(c);
        return u < MAXM ? static_cast<int>(u) : -1;
    }

    // Allocates the next node from the pool, clears it, and returns its index.
    int newnode()
    {
        for (int i = 0; i < MAXM; i++)
            next[L][i] = -1;
        end[L] = -1;          // no pattern ends here yet
        return L++;
    }

    // Resets the automaton to a single empty root node.
    void init()
    {
        L = 0;
        root = newnode();
    }

    // Adds the NUL-terminated pattern `buf` and tags its last node with `id`.
    // A pattern containing a byte outside the alphabet is ignored: query()
    // restarts at the root on such bytes, so the pattern could never match.
    void insert(char buf[], int id)
    {
        const int len = static_cast<int>(strlen(buf));
        for (int i = 0; i < len; i++)
            if (symbol(buf[i]) < 0)
                return;

        int now = root;
        for (int i = 0; i < len; i++)
        {
            const int c = symbol(buf[i]);
            if (next[now][c] == -1)
                next[now][c] = newnode();
            now = next[now][c];
        }
        end[now] = id;        // a later pattern with the same text overwrites the id
    }

    // Computes the failure links breadth-first and fills every missing
    // transition with the transition of the failure node, so that query()
    // can follow next[][] without ever backtracking.
    void build()
    {
        std::queue<int> pending;
        fail[root] = root;
        for (int i = 0; i < MAXM; i++)
        {
            if (next[root][i] == -1)
                next[root][i] = root;
            else
            {
                fail[next[root][i]] = root;
                pending.push(next[root][i]);
            }
        }
        while (!pending.empty())
        {
            const int now = pending.front();
            pending.pop();
            for (int i = 0; i < MAXM; i++)
            {
                if (next[now][i] == -1)
                    next[now][i] = next[fail[now]][i];
                else
                {
                    fail[next[now][i]] = next[fail[now]][i];
                    pending.push(next[now][i]);
                }
            }
        }
    }

    int num[NUM_CAP];         // after query(): distinct ids found, in order of first occurrence

    // Scans the NUL-terminated text `buf` and stores the id of every pattern
    // that occurs in it into num[], each id once. Unused slots of num[] are 0.
    // `n` (number of patterns) and `id` (number of the text) are part of the
    // template's signature for problem-specific variations and are not used here.
    // Returns 1 if at least one pattern occurs, otherwise 0.
    int query(char buf[], int n, int id)
    {
        (void)n;
        (void)id;
        bool seen[NUM_CAP] = { false };
        memset(num, 0, sizeof(num));
        const int len = static_cast<int>(strlen(buf));
        int now = root;
        int ct = 0;
        for (int i = 0; i < len; i++)
        {
            const int c = symbol(buf[i]);
            if (c < 0)
            {
                now = root;   // no pattern can span a byte outside the alphabet
                continue;
            }
            now = next[now][c];
            // Every node on the failure chain is a suffix of the text read so far.
            for (int temp = now; temp != root; temp = fail[temp])
            {
                const int pid = end[temp];
                if (pid < 0 || pid >= NUM_CAP || seen[pid])
                    continue;
                seen[pid] = true;   // record each id once so ct can never exceed NUM_CAP
                num[ct++] = pid;
                //end[temp] = -1;   // enable to remove a pattern after its first hit
            }
        }
        return ct > 0 ? 1 : 0;
    }
};


三道入门题,注意字符种类总数。

1、HDU 2222 Keywords Search  

题意:求出现了多少种单词

【代码】

/* ***********************************************
Author        :angon

************************************************ */
#include <stdio.h>
#include <string.h>
#include <iostream>
#include <algorithm>
#include <stack>
#include <vector>
#include <queue>
#include <set>
#include <map>
#include <string>
#include <math.h>
#include <stdlib.h>
#include <time.h>
using namespace std;
#define showtime fprintf(stderr,"time = %.15f\n",clock() / (double)CLOCKS_PER_SEC)
#define lld %I64d
#define REP(i,k,n) for(int i=k;i<n;i++)
#define REPP(i,k,n) for(int i=k;i<=n;i++)
#define scan(d) scanf("%d",&d)
#define scanl(d) scanf("%I64d",&d)
#define scann(n,m) scanf("%d%d",&n,&m)
#define scannl(n,m) scanf("%I64d%I64d",&n,&m)
#define mst(a,k)  memset(a,k,sizeof(a))
#define LL long long
#define N 1005
#define mod 1000000007
// Reads the next run of decimal digits from stdin and returns its value.
// Everything before the first digit is skipped (so a '-' sign is ignored) and
// the character following the number is consumed. Returns 0 when end of input
// is reached before any digit. The character is kept in an int so that EOF
// stays distinguishable and the skip loop terminates at end of input.
inline int read()
{
    int ch = getchar();
    while (ch != EOF && (ch < '0' || ch > '9'))
        ch = getchar();
    int s = 0;
    for (; ch >= '0' && ch <= '9'; ch = getchar())
        s = s * 10 + (ch - '0');
    return s;
}

const int MAXN = 500010;    // capacity of the node pool (upper bound on the total pattern length)
const int MAXL = 1000010;   // maximum length of the text to be searched
const int MAXM = 26;        // branching factor of the trie: lowercase letters 'a'..'z'

// Aho-Corasick automaton over lowercase letters that counts how many inserted
// keywords occur in a text (HDU 2222).
//
// Usage: init(), insert() every keyword, build() once, then query().
// insert() must not be called after build(): build() overwrites the -1
// "no child" markers that insert() relies on.
// The next[][] table alone is MAXN * MAXM ints (about 52 MB), so an instance
// should not be created on the stack.
struct Trie
{
    int next[MAXN][MAXM];   // child / goto table; -1 means "no child" until build() runs
    int fail[MAXN];         // failure link of every node
    int end[MAXN];          // number of keywords ending at the node
    int root, L;            // root node index and number of nodes allocated so far

    // Maps a character to a column of next[], or -1 when it is not in 'a'..'z'.
    static int symbol(char c)
    {
        const int idx = c - 'a';
        return (idx >= 0 && idx < MAXM) ? idx : -1;
    }

    // Allocates the next node from the pool, clears it, and returns its index.
    int newnode()
    {
        for (int i = 0; i < MAXM; i++)
            next[L][i] = -1;
        end[L] = 0;
        return L++;
    }

    // Resets the automaton to a single empty root node.
    void init()
    {
        L = 0;
        root = newnode();
    }

    // Adds the NUL-terminated keyword `buf`; inserting the same keyword twice
    // makes it count twice. A keyword containing a character outside the
    // alphabet is ignored: query() restarts at the root on such characters,
    // so the keyword could never match.
    void insert(char buf[])
    {
        const int len = static_cast<int>(strlen(buf));
        for (int i = 0; i < len; i++)
            if (symbol(buf[i]) < 0)
                return;

        int now = root;
        for (int i = 0; i < len; i++)
        {
            const int c = symbol(buf[i]);
            if (next[now][c] == -1)
                next[now][c] = newnode();
            now = next[now][c];
        }
        end[now]++;
    }

    // Computes the failure links breadth-first and fills every missing
    // transition with the transition of the failure node.
    void build()
    {
        std::queue<int> pending;
        fail[root] = root;
        for (int i = 0; i < MAXM; i++)
        {
            if (next[root][i] == -1)
                next[root][i] = root;
            else
            {
                fail[next[root][i]] = root;
                pending.push(next[root][i]);
            }
        }
        while (!pending.empty())
        {
            const int now = pending.front();
            pending.pop();
            for (int i = 0; i < MAXM; i++)
            {
                if (next[now][i] == -1)
                    next[now][i] = next[fail[now]][i];
                else
                {
                    fail[next[now][i]] = next[fail[now]][i];
                    pending.push(next[now][i]);
                }
            }
        }
    }

    // Returns how many inserted keywords occur in the NUL-terminated text `buf`.
    // The call is destructive: the counter of every matched node is zeroed so
    // that a keyword is counted once, which also means a second query() on the
    // same automaton no longer sees keywords that were already counted.
    int query(char buf[])
    {
        const int len = static_cast<int>(strlen(buf));
        int now = root;
        int res = 0;
        for (int i = 0; i < len; i++)
        {
            const int c = symbol(buf[i]);
            if (c < 0)
            {
                now = root;   // no keyword can span a character outside the alphabet
                continue;
            }
            now = next[now][c];
            // Every node on the failure chain is a suffix of the text read so far.
            for (int temp = now; temp != root; temp = fail[temp])
            {
                res += end[temp];
                end[temp] = 0;
            }
        }
        return res;
    }

    // Dumps every allocated node (failure link, counter, transition row) to stdout.
    void debug()
    {
        for (int i = 0; i < L; i++)
        {
            printf("id = %3d,fail = %3d,end = %3d,chi = [", i, fail[i], end[i]);
            for (int j = 0; j < MAXM; j++)
                printf("%2d", next[i][j]);
            printf("]\n");
        }
    }
};
char buf[MAXL];   // input buffer shared by the keywords and the text
Trie ac;          // kept at namespace scope because of its size

// HDU 2222 driver: for each of t test cases reads n keywords and one text and
// prints how many of the keywords occur in the text. Stops quietly as soon as
// the input ends or is malformed instead of working on uninitialised values.
int main()
{
    //freopen("in.txt","r",stdin);
    //freopen("out.txt","w",stdout);
    int t;
    if (scanf("%d", &t) != 1)
        return 0;
    while (t-- > 0)
    {
        int n;
        if (scanf("%d", &n) != 1)
            return 0;
        ac.init();
        for (int i = 0; i < n; i++)
        {
            if (scanf("%s", buf) != 1)
                return 0;
            ac.insert(buf);
        }
        ac.build();
        if (scanf("%s", buf) != 1)
            return 0;
        printf("%d\n", ac.query(buf));
    }

    return 0;
}


2、HDU 3065 病毒侵袭持续中

题意:要求输出每个单词出现的次数

【代码】

/* ***********************************************
Author        :angon

************************************************ */
#include <stdio.h>
#include <string.h>
#include <iostream>
#include <algorithm>
#include <stack>
#include <vector>
#include <queue>
#include <set>
#include <map>
#include <string>
#include <math.h>
#include <stdlib.h>
#include <time.h>
using namespace std;
#define showtime fprintf(stderr,"time = %.15f\n",clock() / (double)CLOCKS_PER_SEC)
#define lld %I64d
#define REP(i,k,n) for(int i=k;i<n;i++)
#define REPP(i,k,n) for(int i=k;i<=n;i++)
#define scan(d) scanf("%d",&d)
#define scanl(d) scanf("%I64d",&d)
#define scann(n,m) scanf("%d%d",&n,&m)
#define scannl(n,m) scanf("%I64d%I64d",&n,&m)
#define mst(a,k)  memset(a,k,sizeof(a))
#define LL long long
#define N 1005
#define mod 1000000007
// Reads the next run of decimal digits from stdin and returns its value.
// Everything before the first digit is skipped (so a '-' sign is ignored) and
// the character following the number is consumed. Returns 0 when end of input
// is reached before any digit. The character is kept in an int so that EOF
// stays distinguishable and the skip loop terminates at end of input.
inline int read()
{
    int ch = getchar();
    while (ch != EOF && (ch < '0' || ch > '9'))
        ch = getchar();
    int s = 0;
    for (; ch >= '0' && ch <= '9'; ch = getchar())
        s = s * 10 + (ch - '0');
    return s;
}

const int MAXN = 1010*50;      // capacity of the node pool: number of patterns * maximum pattern length
const int MAXL = 2000000+10;   // maximum length of the text to be searched
const int MAXM = 128;          // branching factor of the trie, i.e. number of distinct characters
char str[1005][100];           // the patterns themselves; str[i] is printed for pattern id i

// Aho-Corasick automaton that counts how often each pattern occurs in a text
// (HDU 3065).
//
// Usage: init(), insert() every pattern, build() once, then query().
// insert() must not be called after build(): build() overwrites the -1
// "no child" markers that insert() relies on.
// The next[][] table alone is MAXN * MAXM ints (about 26 MB), so an instance
// should not be created on the stack.
struct Trie
{
    enum { NUM_CAP = 1001 };  // capacity of num[]; pattern ids must lie in [0, NUM_CAP)

    int next[MAXN][MAXM];     // child / goto table; -1 means "no child" until build() runs
    int fail[MAXN];           // failure link of every node
    int end[MAXN];            // id of the pattern ending at the node, -1 if none
    int root, L;              // root node index and number of nodes allocated so far

    // Maps a text byte to a column of next[], or -1 when the byte lies outside
    // the alphabet. Going through unsigned char keeps bytes >= 0x80 from
    // becoming negative indices on platforms where plain char is signed.
    static int symbol(char c)
    {
        const unsigned char u = static_cast<unsigned char>(c);
        return u < MAXM ? static_cast<int>(u) : -1;
    }

    // Allocates the next node from the pool, clears it, and returns its index.
    int newnode()
    {
        for (int i = 0; i < MAXM; i++)
            next[L][i] = -1;
        end[L] = -1;
        return L++;
    }

    // Resets the automaton to a single empty root node.
    void init()
    {
        L = 0;
        root = newnode();
    }

    // Adds the NUL-terminated pattern `buf` and tags its last node with `id`.
    // A pattern containing a byte outside the alphabet is ignored: query()
    // restarts at the root on such bytes, so the pattern could never match.
    void insert(char buf[], int id)
    {
        const int len = static_cast<int>(strlen(buf));
        for (int i = 0; i < len; i++)
            if (symbol(buf[i]) < 0)
                return;

        int now = root;
        for (int i = 0; i < len; i++)
        {
            const int c = symbol(buf[i]);
            if (next[now][c] == -1)
                next[now][c] = newnode();
            now = next[now][c];
        }
        end[now] = id;
    }

    // Computes the failure links breadth-first and fills every missing
    // transition with the transition of the failure node.
    void build()
    {
        std::queue<int> pending;
        fail[root] = root;
        for (int i = 0; i < MAXM; i++)
        {
            if (next[root][i] == -1)
                next[root][i] = root;
            else
            {
                fail[next[root][i]] = root;
                pending.push(next[root][i]);
            }
        }
        while (!pending.empty())
        {
            const int now = pending.front();
            pending.pop();
            for (int i = 0; i < MAXM; i++)
            {
                if (next[now][i] == -1)
                    next[now][i] = next[fail[now]][i];
                else
                {
                    fail[next[now][i]] = next[fail[now]][i];
                    pending.push(next[now][i]);
                }
            }
        }
    }

    int num[NUM_CAP];         // after query(): num[id] = occurrences of pattern id

    // Counts the (possibly overlapping) occurrences of every pattern in the
    // NUL-terminated text `buf`, leaves the counts in num[], and prints one
    // "pattern: count" line for each of the first `n` patterns that occurs.
    void query(char buf[], int n)
    {
        memset(num, 0, sizeof(num));
        const int len = static_cast<int>(strlen(buf));
        int now = root;
        for (int i = 0; i < len; i++)
        {
            const int c = symbol(buf[i]);
            if (c < 0)
            {
                now = root;   // no pattern can span a byte outside the alphabet
                continue;
            }
            now = next[now][c];
            // Every node on the failure chain is a suffix of the text read so far.
            for (int temp = now; temp != root; temp = fail[temp])
            {
                const int pid = end[temp];
                if (pid >= 0 && pid < NUM_CAP)
                    num[pid]++;
            }
        }
        // Never read past num[] even if the caller passes a larger n.
        const int shown = n < NUM_CAP ? n : static_cast<int>(NUM_CAP);
        for (int i = 0; i < shown; i++)
            if (num[i])
                printf("%s: %d\n", str[i], num[i]);
    }
};
char buf[MAXL];
Trie ac;
int main()
{
//freopen("in.txt","r",stdin);
//freopen("out.txt","w",stdout);
int n;
while(~scan(n))
{
ac.init();
REP(i,0,n)
{
scanf("%s",str[i]);
ac.insert(str[i],i);
}
ac.build();
scanf("%s",buf);
ac.query(buf,n);
}

return 0;
}


3、HDU 2896 病毒侵袭 

题意:输出出现过的单词的编号

【代码】

/* ***********************************************
Author :angon

************************************************ */
#include <stdio.h>
#include <string.h>
#include <iostream>
#include <algorithm>
#include <stack>
#include <vector>
#include <queue>
#include <set>
#include <map>
#include <string>
#include <math.h>
#include <stdlib.h>
#include <time.h>
using namespace std;
#define showtime fprintf(stderr,"time = %.15f\n",clock() / (double)CLOCKS_PER_SEC)
#define lld %I64d
#define REP(i,k,n) for(int i=k;i<n;i++)
#define REPP(i,k,n) for(int i=k;i<=n;i++)
#define scan(d) scanf("%d",&d)
#define scanl(d) scanf("%I64d",&d)
#define scann(n,m) scanf("%d%d",&n,&m)
#define scannl(n,m) scanf("%I64d%I64d",&n,&m)
#define mst(a,k) memset(a,k,sizeof(a))
#define LL long long
#define N 1005
#define mod 1000000007
// Reads the next run of decimal digits from stdin and returns its value.
// Everything before the first digit is skipped (so a '-' sign is ignored) and
// the character following the number is consumed. Returns 0 when end of input
// is reached before any digit. The character is kept in an int so that EOF
// stays distinguishable and the skip loop terminates at end of input.
inline int read()
{
    int ch = getchar();
    while (ch != EOF && (ch < '0' || ch > '9'))
        ch = getchar();
    int s = 0;
    for (; ch >= '0' && ch <= '9'; ch = getchar())
        s = s * 10 + (ch - '0');
    return s;
}

const int MAXN = 500*200; // capacity of the node pool: number of patterns * maximum pattern length
const int MAXL = 10000+10; // maximum length of the text to be searched
const int MAXM = 128; // branching factor of the trie, i.e. number of distinct characters

// Aho-Corasick automaton that reports which patterns occur in a text
// (HDU 2896).
//
// Usage: init(), insert() every pattern, build() once, then query() each text.
// insert() must not be called after build(): build() overwrites the -1
// "no child" markers that insert() relies on.
// The next[][] table alone is MAXN * MAXM ints (about 51 MB), so an instance
// should not be created on the stack.
struct Trie
{
    enum { NUM_CAP = 501 };   // capacity of num[]; pattern ids must lie in [0, NUM_CAP)

    int next[MAXN][MAXM];     // child / goto table; -1 means "no child" until build() runs
    int fail[MAXN];           // failure link of every node
    int end[MAXN];            // id of the pattern ending at the node, -1 if none
    int root, L;              // root node index and number of nodes allocated so far

    // Maps a text byte to a column of next[], or -1 when the byte lies outside
    // the alphabet. Going through unsigned char keeps bytes >= 0x80 from
    // becoming negative indices on platforms where plain char is signed.
    static int symbol(char c)
    {
        const unsigned char u = static_cast<unsigned char>(c);
        return u < MAXM ? static_cast<int>(u) : -1;
    }

    // Allocates the next node from the pool, clears it, and returns its index.
    int newnode()
    {
        for (int i = 0; i < MAXM; i++)
            next[L][i] = -1;
        end[L] = -1;
        return L++;
    }

    // Resets the automaton to a single empty root node.
    void init()
    {
        L = 0;
        root = newnode();
    }

    // Adds the NUL-terminated pattern `buf` and tags its last node with `id`.
    // A pattern containing a byte outside the alphabet is ignored: query()
    // restarts at the root on such bytes, so the pattern could never match.
    void insert(char buf[], int id)
    {
        const int len = static_cast<int>(strlen(buf));
        for (int i = 0; i < len; i++)
            if (symbol(buf[i]) < 0)
                return;

        int now = root;
        for (int i = 0; i < len; i++)
        {
            const int c = symbol(buf[i]);
            if (next[now][c] == -1)
                next[now][c] = newnode();
            now = next[now][c];
        }
        end[now] = id;
    }

    // Computes the failure links breadth-first and fills every missing
    // transition with the transition of the failure node.
    void build()
    {
        std::queue<int> pending;
        fail[root] = root;
        for (int i = 0; i < MAXM; i++)
        {
            if (next[root][i] == -1)
                next[root][i] = root;
            else
            {
                fail[next[root][i]] = root;
                pending.push(next[root][i]);
            }
        }
        while (!pending.empty())
        {
            const int now = pending.front();
            pending.pop();
            for (int i = 0; i < MAXM; i++)
            {
                if (next[now][i] == -1)
                    next[now][i] = next[fail[now]][i];
                else
                {
                    fail[next[now][i]] = next[fail[now]][i];
                    pending.push(next[now][i]);
                }
            }
        }
    }

    int num[NUM_CAP];         // after query(): distinct ids found, sorted ascending

    // Scans the NUL-terminated text `buf`. If any pattern occurs, prints
    // "web <id>: " followed by the 1-based numbers of the distinct patterns
    // found in ascending order, and returns 1; otherwise prints nothing and
    // returns 0. `id` is the number of the text; `n` (number of patterns) is
    // not used. The 0-based ids found are left in num[], unused slots are 0.
    int query(char buf[], int n, int id)
    {
        (void)n;
        bool seen[NUM_CAP] = { false };
        memset(num, 0, sizeof(num));
        const int len = static_cast<int>(strlen(buf));
        int now = root;
        int ct = 0;
        for (int i = 0; i < len; i++)
        {
            const int c = symbol(buf[i]);
            if (c < 0)
            {
                now = root;   // no pattern can span a byte outside the alphabet
                continue;
            }
            now = next[now][c];
            // Every node on the failure chain is a suffix of the text read so far.
            for (int temp = now; temp != root; temp = fail[temp])
            {
                const int pid = end[temp];
                if (pid < 0 || pid >= NUM_CAP || seen[pid])
                    continue;
                seen[pid] = true;   // record each id once so ct can never exceed NUM_CAP
                num[ct++] = pid;
            }
        }
        if (ct == 0)
            return 0;
        printf("web %d: ", id);
        std::sort(num, num + ct);
        for (int i = 0; i < ct; i++)
            printf("%d%c", num[i] + 1, i == ct - 1 ? '\n' : ' ');
        return 1;
    }
};
char buf[MAXL];
Trie ac;
int main()
{
//freopen("in.txt","r",stdin);
//freopen("out.txt","w",stdout);
int n;
while(~scan(n))
{
ac.init();
REP(i,0,n)
{
scanf("%s",buf);
ac.insert(buf,i);
}
ac.build();
int m; scan(m);
int total = 0;
REPP(i,1,m)
{
scanf("%s",buf);
if(ac.query(buf,n,i))
total++;
}
printf("total: %d\n",total);

}

return 0;
}

以上~
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: