您的位置:首页 > 其它

后缀数组应用4: 求不可重叠最长重复子串

2012-09-26 16:16 489 查看
View Code

#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<iostream>
#include<vector>
#include<string>
#include<math.h>
#include<map>
#include<set>
#include<algorithm>
using namespace std;
// Capacity of every working table: maximum text length plus the sentinel.
#define MAXN 10010

// sa[i]     : start position of the i-th smallest suffix.
// rank[p]   : position of the suffix starting at p inside sa (inverse of sa).
// sum       : counting-sort buckets for the first (single character) pass.
// height[i] : longest common prefix of the suffixes sa[i-1] and sa[i].
int sa[MAXN], rank[MAXN], sum[MAXN], height[MAXN];
// Scratch buffers for the doubling passes of get_sa.
int wa[MAXN], wb[MAXN], wx[MAXN], wsum[MAXN];
// Input text (the sentinel is stored right after the last character).
char str[MAXN];
// Sparse table over height[]. It is indexed by positions 0..len, so it needs
// MAXN rows like the other tables; 20 levels cover any length below 2^20.
int dp[MAXN][20];
/*
RMQ (sparse table over height[], range MINIMUM):
dp[i][j] = min(dp[i][j-1], dp[i + 2^(j-1)][j-1])
dp[i][0] = height[i]

Minimum over the closed interval [i, j]:
int L = floor(log2(j - i + 1))
return min(dp[i][L], dp[j + 1 - 2^L][L]);
*/
// Builds the range-minimum sparse table dp over height[].
//
// After the call, dp[i][j] holds min(height[i .. i + 2^j - 1]) for every
// i >= 1 whose window ends at or before index n. Level 0 is also filled for
// i == 0.
//
// n: last valid index of height[] (the text length). Values that do not fit
//    into dp are clamped so the table is never written out of bounds.
void pre(int n)
{
    const int rows = static_cast<int>(sizeof(dp) / sizeof(dp[0]));
    const int levels = static_cast<int>(sizeof(dp[0]) / sizeof(dp[0][0]));
    if (n > rows - 1)
        n = rows - 1;

    for (int i = 0; i <= n; i++)
        dp[i][0] = height[i];

    // Integer replacement for (int)log2(n): iterate while a window of 2^j
    // elements still fits. This is also well defined for n <= 0.
    for (int j = 1; j < levels && (1 << j) <= n; j++)
    {
        const int half = 1 << (j - 1);
        for (int i = 1; i + (1 << j) - 1 <= n; i++)
            dp[i][j] = min(dp[i][j-1], dp[i + half][j-1]);
    }
}

// Returns the minimum of height[a..b] (both ends inclusive) using the sparse
// table filled by pre().
//
// The window size is floor(log2(b - a + 1)), computed with integer shifts so
// the result does not depend on floating-point rounding. When b < a the level
// is 0 and the smaller of dp[a][0] and dp[b][0] is returned.
int get_min( int a, int b)
{
    const int len = b - a + 1;
    int L = 0;
    while ((1 << (L + 1)) <= len)
        L++;
    return min(dp[a][L], dp[b + 1 - (1<<L)][L]);
}

// Tells whether two key pairs are equal: returns 1 when r[a] == r[b] and
// r[a+l] == r[b+l], otherwise 0. The second pair is only read when the first
// pair already matches.
int cmp( int *r, int a, int b, int l)
{
    if (r[a] != r[b])
        return 0;
    return (r[a + l] == r[b + l]) ? 1 : 0;
}

// Builds the suffix array of r[0..n-1] with the prefix-doubling algorithm
// (one counting sort for single characters, then radix sorts on pairs of
// ranks for prefix lengths 2, 4, 8, ...).
//
// r  : the text. r[n-1] should be a sentinel that is smaller than every other
//      character and occurs only once; the rank comparison relies on it to
//      stay inside the first n entries.
// sa : output, sa[i] is the start of the i-th smallest suffix.
// n  : number of characters in r, sentinel included.
// m  : size of the alphabet (every character value must be < m). If a larger
//      character value is found the bucket count is widened to fit it.
void get_sa(char *r, int *sa, int n, int m)
{
    int i, j, p, *x = wa, *y = wb, *t;

    // Characters are read as unsigned so that bytes >= 0x80 cannot become
    // negative bucket indices.
    for (i = 0; i < n; i++)
    {
        x[i] = static_cast<unsigned char>(r[i]);
        if (x[i] >= m)
            m = x[i] + 1;
    }

    // Counting sort of the suffixes by their first character.
    for (i = 0; i < m; i++)
        sum[i] = 0;
    for (i = 0; i < n; i++)
        sum[x[i]]++;
    for (i = 1; i < m; i++)
        sum[i] += sum[i-1];
    for (i = n - 1; i >= 0; i--)
        sa[--sum[x[i]]] = i;

    // Each round sorts by the first 2*j characters. After a round the keys are
    // ranks in [0, p), so the bucket count for the next round must become p
    // (m = p); keeping the original alphabet size mis-sorts as soon as there
    // are more rank classes than buckets.
    for (j = 1, p = 1; p < n && j <= n; j *= 2, m = p)
    {
        // Order by the second key: suffixes whose second half lies beyond the
        // end come first, the rest follow in the order of the previous sa.
        p = 0;
        for (i = n - j; i < n; i++)
            y[p++] = i;
        for (i = 0; i < n; i++)
            if (sa[i] >= j)
                y[p++] = sa[i] - j;

        // Stable counting sort by the first key, walking y in second-key order.
        for (i = 0; i < n; i++)
            wx[i] = x[y[i]];
        for (i = 0; i < m; i++)
            wsum[i] = 0;
        for (i = 0; i < n; i++)
            wsum[wx[i]]++;
        for (i = 1; i < m; i++)
            wsum[i] += wsum[i-1];
        for (i = n - 1; i >= 0; i--)
            sa[--wsum[wx[i]]] = y[i];

        // Recompute the ranks: neighbours in sa share a rank exactly when both
        // of their keys are equal. p ends up as the number of distinct ranks.
        t = x, x = y, y = t;
        x[sa[0]] = 0;
        p = 1;
        for (i = 1; i < n; i++)
            x[sa[i]] = cmp(y, sa[i-1], sa[i], j) ? p - 1 : p++;
    }
}

//h[i] = height[rank[i]], h[i] >= h[i-1] - 1
void get_height(char *r, int n)
{
int i, j, k = 0;//sa[0] = len 就是我们补的那个0
for(i = 1; i <= n; i++)
rank[sa[i]] = i;
for(i = 0; i < n ; height[rank[i++]] = k )
for( k ? k-- : 0, j = sa[rank[i]-1]; r[i+k] == r[j+k]; k++);
}

// Decides whether some substring of length x occurs twice without overlapping.
//
// Consecutive entries of sa whose height is >= x form a group of suffixes that
// all share a common prefix of at least x characters. Two occurrences do not
// overlap when their start positions differ by at least x, so a group
// qualifies when (largest start - smallest start) >= x. The extremes are
// accumulated over the whole group: comparing only neighbouring entries
// misses cases such as "aaaaa", where positions 0 and 2 share "aa" but are
// not adjacent in sa.
//
// x : candidate length.
// n : text length; sa[0..n] and height[1..n] are read.
// Returns 1 when such a pair exists, 0 otherwise.
int jugde( int x, int n)
{
    int Min = sa[0];
    int Max = sa[0];
    for (int i = 1; i <= n; i++)
    {
        if (height[i] >= x)
        {
            // Same group: widen the range of start positions seen so far.
            Max = max(Max, sa[i]);
            Min = min(Min, sa[i]);
            if (Max - Min >= x)
                return 1;
        }
        else
        {
            // The common prefix dropped below x: a new group starts here.
            Min = sa[i];
            Max = sa[i];
        }
    }
    return 0;
}

// Binary search over the candidate length: returns the largest value in
// [l, r] accepted by jugde(), or 0 when no probed value is accepted.
// n is forwarded to jugde() unchanged.
int find(int n, int l, int r)
{
    int best = 0;
    for (int lo = l, hi = r; lo <= hi; )
    {
        const int probe = (lo + hi) / 2;
        if (!jugde(probe, n))
        {
            hi = probe - 1;
            continue;
        }
        best = probe;
        lo = probe + 1;
    }
    return best;
}

int main( )
{
int a, b, n, m;
while( scanf("%s",str) != EOF )
{
int len = strlen(str);
str[len] = '0';
str[len+1] = 0;
memset(wa,0,sizeof(wa));
memset(wb,0,sizeof(wb));
memset(sa,0,sizeof(sa));
memset(height,0,sizeof(height));
get_sa(str, sa, len + 1, 255);
get_height( str, len );
pre(len);
printf("%d\n",find(len, 0, len-1));
}
return 0;
}


求不可重叠最长重复子串:后缀数组 + 二分答案。
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: