您的位置：首页 > 其它

一种基于Sunday算法的单模式字符串匹配算法

2014-12-11 12:35 281 查看

最近看了几个字符串匹配算法，偶然想到对Sunday算法的一种可能的改进（改变）

问题描述：

单模式字符串匹配

算法综述：

Sunday算法是目前最快的字符串匹配算法，本文在Sunday算法的基础上做了一些改进（或者说改变），能在一定程度上提高Sunday算法的运行效率，提升较有限，但也算是一种思路。

Sunday算法：

从前往后匹配，每次匹配不成功，关注的是尾部的后一字符（设该字符为x）是否在模式串中出现。若出现，移动模式串，使x与模式串中最后一次出现的该字符（下标最大）对齐；若没出现过，移动模式串使其头字符与x的下一字符对齐。总体思想是尽可能大步长地往后移动模式串。

本文新提出SundayEvening算法：

总体思想依然是尽可能大步长后移模式串。SundayEvening算法每次从后往前匹配，每次匹配不成功，不只关注尾部后一字符（设为x），也关注当前匹配不成功的字符（设为y）。因为：当x在模式串中，而y不在模式串中时，Sunday算法可能没有找到最大移动步长。

SundayEvening的做法是，每当y匹配不成功，若y在模式串中，则计算移动模式串使其相应字符与y对齐时的步长；若y不在模式串中，则计算移动模式串使其头部与y后一字符对齐时的步长。总之，可以计算出按y来移动模式串的步长。再来看x，当x不在模式串中时，操作与Sunday算法一致；若x在模式串中，计算按x移动的步长，与上述按y移动的步长比较，取较大的来进行移动。这样就尽可能地获得了较大的移动步长。而SundayEvening从后往前匹配显然也有助于获得更大步长（y靠后的概率大）。

Sunday和SundayEvening的比较：

SundayEvening算法通过两种措施①从后向前匹配②比较步长选较大者，提高了获得最大步长的可能性，但是会增加一些比较操作。当模式串长度极大或极小时，效率与Sunday基本无差别，但是在中间某一区域内，SundayEvening算法可以获得约百分之10到百分之15的效率提升。（测试数据为随机字符串，字符范围为ASCII码中96个可打印字符）。

代码实现：

代码中写了三个算法：Sunday、KMP、SundayEvening（为了比较，将KMP一同写出）

测试数据：随机字符串（字符范围为96个可打印字符）

测试串长度：1000000000

模式串长度：10

测试结果与模式串长度、文本串长度、字符范围均有关系，文本串较小或模式串较大时效率与Sunday算法基本无差（但貌似不会低于Sunday）。

一次测试的结果：效率提升百分之14

具体代码：

#pragma once
#include<iostream>
#include<Windows.h>
#include<stdlib.h>
#include<time.h>
using namespace std;
#define random(x) (rand()%x)

//*********************KMP算法：***********************

/**
 * Builds the KMP failure ("next") table for a pattern.
 *
 * After the call, next[0] == -1 and, for 1 <= i < shortLength, next[i] is the
 * length of the longest proper prefix of shortList[0..i-1] that is also a
 * suffix of it, i.e. the pattern index to resume from after a mismatch at i.
 *
 * @param shortList   pattern bytes, at least shortLength elements
 * @param next        output table, at least shortLength elements
 * @param shortLength number of bytes in the pattern; nothing is written
 *                    when it is <= 0
 */
void getNext(const unsigned char shortList[], int next[], int shortLength)
{
    // An empty pattern has no table entries; writing next[0] would be out of bounds.
    if (shortLength <= 0)
        return;

    next[0] = -1;
    int pointer = -1; // length of the currently matched prefix, -1 means "restart"
    int index = 0;    // last table slot that has been filled

    // Each successful step fills next[index + 1], so the loop must stop once
    // index reaches shortLength - 1; otherwise next[shortLength] is written,
    // one element past the end of the table.
    while (index < shortLength - 1)
    {
        if (pointer == -1 || shortList[index] == shortList[pointer])
        {
            ++pointer;
            ++index;
            // Plain (unoptimised) table: the variant that stores next[pointer]
            // when shortList[index] == shortList[pointer] is intentionally not used.
            next[index] = pointer;
        }
        else
        {
            // Fall back to the next shorter prefix that is also a suffix.
            pointer = next[pointer];
        }
    }
}

/**
 * Knuth-Morris-Pratt search for the first occurrence of a pattern in a text.
 *
 * @param longList    text bytes, longLength elements
 * @param shortList   pattern bytes, shortLength elements
 * @param next        failure table for the pattern (next[0] == -1), read-only here
 * @param longLength  number of bytes in the text
 * @param shortLength number of bytes in the pattern
 * @return index in the text where the pattern starts, or -1 if it does not occur
 */
int KMP(const unsigned char longList[], const unsigned char shortList[], int next[], long longLength, int shortLength)
{
    int textPos = 0; // cursor into longList
    int patPos = 0;  // cursor into shortList; -1 means "restart the pattern"

    while (textPos < longLength && patPos < shortLength)
    {
        // The -1 check must come first so shortList[-1] is never read.
        const bool mismatch = (patPos != -1) && (longList[textPos] != shortList[patPos]);
        if (mismatch)
        {
            // Keep the text cursor in place and retry with a shorter prefix.
            patPos = next[patPos];
            continue;
        }
        ++textPos;
        ++patPos;
    }

    // The whole pattern was consumed only if the pattern cursor ran off its end.
    return (patPos == shortLength) ? (textPos - shortLength) : -1;
}

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////

//*********************Sunday算法：***********************

/**
 * Fills a 256-entry table with the last position of every byte value in the pattern.
 *
 * @param maxindex    output table with 256 slots; maxindex[c] becomes the largest
 *                    index at which byte c occurs in shortList, or -1 if it never occurs
 * @param shortList   pattern bytes, shortLength elements
 * @param shortLength number of bytes in the pattern
 */
void GetMaxIndex(int maxindex[], const unsigned char* shortList, const int shortLength)
{
    // Start with every byte value marked as absent.
    int slot = 256;
    while (slot-- > 0)
        maxindex[slot] = -1;

    // Positions are visited in ascending order, so a later occurrence always
    // carries the larger index and can simply overwrite the earlier one.
    for (int pos = 0; pos < shortLength; ++pos)
    {
        const unsigned char symbol = shortList[pos];
        maxindex[symbol] = pos;
    }
}

/**
 * Sunday (Quick Search) string matching: finds the first occurrence of a
 * pattern in a text, comparing left to right inside each window and shifting
 * by the text character that follows the current window.
 *
 * @param shortList   pattern bytes, shortLength elements
 * @param longList    text bytes, longLength elements
 * @param maxindex    256-entry table: last index of each byte value in the
 *                    pattern, or -1 if absent (as produced by GetMaxIndex)
 * @param shortLength number of bytes in the pattern
 * @param longLength  number of bytes in the text
 * @return index in the text where the pattern starts, or -1 if it does not
 *         occur; an empty pattern yields 0, the same answer KMP gives
 */
int Sunday(const unsigned char shortList[], const unsigned char longList[], int maxindex[], const int shortLength, const long longLength)
{
    // Without this guard an empty pattern would read shortList[0] and beyond.
    if (shortLength <= 0)
        return shortLength == 0 ? 0 : -1;

    // Positions are kept as long so they share longLength's range.
    long lhead = 0;                        // text index aligned with pattern[0]
    long lrear = lhead + shortLength - 1;  // text index aligned with the last pattern byte
    long lp = lhead;                       // text cursor inside the window
    int sp = 0;                            // pattern cursor

    while (lrear < longLength)
    {
        if (shortList[sp] == longList[lp])
        {
            if (lp == lrear)
                return static_cast<int>(lhead); // whole window matched
            ++lp;
            ++sp;
            continue;
        }

        // Mismatch: the shift is decided by the character just past the window.
        const long following = lrear + 1;
        if (following >= longLength)
            break; // no character after the window, so no further alignment fits

        // Align the last occurrence of that character in the pattern with it.
        // When the character is absent the table holds -1, which moves the
        // head to following + 1, i.e. the window skips the character entirely.
        const int last = maxindex[longList[following]];
        lhead = following - last;
        lrear = lhead + shortLength - 1;
        lp = lhead;
        sp = 0;
    }
    return -1;
}

/////////////////////////////////////////////////////////////////////
//***********************Sunday改进算法*************************

/**
 * "SundayEvening" string matching: a Sunday variant that compares each window
 * from right to left and, on a mismatch, advances by the larger of two safe shifts:
 *   - the Sunday shift, driven by x, the text character just past the window;
 *   - a bad-character shift, driven by y, the text character that mismatched.
 *
 * The earlier version compared an absolute text position with a relative step
 * and moved the window head past y without checking whether y occurs in the
 * pattern, which skipped real matches (pattern "ab" in "xabb" returned -1).
 *
 * @param shortList   pattern bytes, shortLength elements
 * @param longList    text bytes, longLength elements
 * @param maxindex    256-entry table: last index of each byte value in the
 *                    pattern, or -1 if absent (as produced by GetMaxIndex)
 * @param shortLength number of bytes in the pattern
 * @param longLength  number of bytes in the text
 * @return index in the text where the pattern starts, or -1 if it does not
 *         occur; an empty pattern yields 0, the same answer KMP gives
 */
int SundayEvening(const unsigned char shortList[], const unsigned char longList[], int maxindex[], const int shortLength, const long longLength)
{
    // Without this guard an empty pattern would read shortList[-1].
    if (shortLength <= 0)
        return shortLength == 0 ? 0 : -1;

    long lhead = 0;                        // text index aligned with pattern[0]
    long lrear = lhead + shortLength - 1;  // text index aligned with the last pattern byte
    long lp = lrear;                       // text cursor, moves right to left
    int sp = shortLength - 1;              // pattern cursor, moves right to left

    while (lrear < longLength)
    {
        if (shortList[sp] == longList[lp])
        {
            if (lp == lhead)
                return static_cast<int>(lhead); // whole window matched
            --lp;
            --sp;
            continue;
        }

        if (lrear + 1 >= longLength)
            break; // no character after the window, so no further alignment fits

        // Shift by x: align the last occurrence of x in the pattern with x.
        // An absent x has table value -1, giving shortLength + 1 (skip past x).
        const int lastX = maxindex[longList[lrear + 1]];
        const long shiftByNext = static_cast<long>(shortLength) - lastX;

        // Shift by y: align the last occurrence of y in the pattern with y.
        // An absent y (-1) gives sp + 1, i.e. the head lands just after y.
        // If the last occurrence lies to the right of sp the table cannot
        // justify a jump, so fall back to the minimal shift of 1.
        const int lastY = maxindex[longList[lp]];
        const long shiftByBad = (lastY < sp) ? static_cast<long>(sp - lastY) : 1L;

        // Both shifts are individually safe, so the larger one is safe too.
        lhead += (shiftByNext > shiftByBad) ? shiftByNext : shiftByBad;
        lrear = lhead + shortLength - 1;
        lp = lrear;
        sp = shortLength - 1;
    }
    return -1;
}

void main()
{
srand((int)time(0));

int shortLength = 10;
long longLength = 1000000000;
unsigned char* shortList = new unsigned char[shortLength];
unsigned char* longList = new unsigned char[longLength];
for (long i = 0; i < shortLength; i++)
{
int a = random(96);
unsigned char c = a;
shortList[i] = c;
}
for (long i = 0; i < longLength - shortLength; i++)
{
char c = random(96);
longList[i] = c;
}
for (long i = 0; i < shortLength; i++)
longList[i + longLength - shortLength] = shortList[i];

DWORD start_time = GetTickCount();

int maxindex[256];//存shortList中出现有的字符在shortList中最后一次出现的位置，同时也标记那些字符出现在了shortList中
GetMaxIndex(maxindex, shortList, shortLength);
int Sunday_result = Sunday(shortList, longList, maxindex, shortLength, longLength);
DWORD Sunday_end_time = GetTickCount();

if (Sunday_result < 0)
cout << "未在文本串中找到模式串！" << endl;
else
cout << "模式串在文本串中的起始下标为：" << Sunday_result << endl;

cout << "The Sunday run time is:" << (Sunday_end_time - start_time) << "ms!" << endl;//输出运行时间

int *next = new int[shortLength];
//next[index]存index前面子串中前缀后的第一个元素位置，不匹配时换这个位置的元素看是否匹配
//next是对应子串最长前缀长度数组的右移（左端补-1）
getNext(shortList, next, shortLength);
int KMP_result = KMP(longList, shortList, next, longLength, shortLength);
if (KMP_result < 0)
cout << "未在文本串中找到模式串！" << endl;
else
cout << "模式串在文本串中的起始下标为：" << KMP_result << endl;

DWORD KMP_end_time = GetTickCount();
cout << "The KMP run time is:" << (KMP_end_time - Sunday_end_time) << "ms!" << endl;//输出运行时间

GetMaxIndex(maxindex, shortList, shortLength);
int Sunday2_result = SundayEvening(shortList, longList, maxindex, shortLength, longLength);
DWORD Sunday2_end_time = GetTickCount();

if (Sunday2_result < 0)
cout << "未在文本串中找到模式串！" << endl;
else
cout << "模式串在文本串中的起始下标为：" << Sunday2_result << endl;

cout << "The SundayEvening run time is:" << (Sunday2_end_time - KMP_end_time) << "ms!" << endl;//输出运行时间

cout << "提高了：" << 1 - (double)(Sunday2_end_time - KMP_end_time) / (Sunday_end_time - start_time) << endl;

system("pause");
}

内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理

标签：

相关文章推荐

新的分享

章节导航