您的位置:首页 > 编程语言

double-array trie c代码 - an efficient implementation of trie structures

2013-01-03 19:30 507 查看
这些代码已经通过了 stress test,但是因为 set_list 函数还有待改进,所以 insert_word 仍会占用很长的时间,后续会进一步改进。算法来源于论文 An Efficient Implementation of Trie Structures,作者貌似是个日本人。先贴上代码,以便供喜欢 double-array trie 的人研究。我看了 libdatrie 的源码,比这个复杂,但是原理是一样的。近期会给出原文章的翻译。http://blog.csdn.net/zzran/article/details/8462002

#include<cstdio>
#include<cstdlib>
#include<cstring>
#include<iostream>
#include<string>
using namespace std;

// Tunable constants for the double-array trie.
// Smallest character code scanned when listing a node's children (set_list).
#define MIN_CODE 1
// Upper end of the character-code range; also used to size child-list buffers.
#define MAX_CODE 255
// Initial capacity of BC, and the number of node slots added on each growth.
#define BC_INC 10
// Initial size of TAIL in bytes, and the number of bytes added on each growth.
#define TAIL_INC 10
// Initial size of TEMP in bytes.
#define TEMP_INC 5
// NOTE(review): not referenced anywhere in the visible code.
#define CHAR_NUM 26

// Interleaved double array: BC[2 * n] holds base(n), BC[2 * n + 1] holds check(n).
int *BC;
// Character storage for word suffixes; read_tail treats '#' as the end of a suffix.
char *TAIL;
// Scratch buffer that read_tail fills with one suffix taken from TAIL.
char *TEMP;
// Highest node index written so far; base()/check() return 0 for indices above it.
int BC_POS;
// Position in TAIL handed out for the next newly stored suffix.
int TAIL_POS;
// Capacity of BC in node slots (the array holds 2 * BC_MAX ints).
int BC_MAX;
// Size of TAIL in bytes.
int TAIL_MAX;
// Size of TEMP in bytes.
int TEMP_MAX;

// Forward declarations; the definitions appear further down in this file.
void realloc_bc();
void separate(int s, char *b, int tail_pos);
int  change_bc(int current, int s, char *list, char ch);

// Returns the BASE value stored for node n.
// Indices above BC_POS (the highest index written so far) are reported as 0
// without touching the array. Every real read is logged to stdout.
int base(int n) {
    if (n <= BC_POS) {
        const int slot = 2 * n;  // even slots of BC hold base values
        std::cout << "read base index=" << n << ":value="<< BC[slot] << std::endl;
        return BC[slot];
    }
    return 0;
}

// Returns the CHECK value stored for node n.
// Indices above BC_POS (the highest index written so far) are reported as 0
// without touching the array. Every real read is logged to stdout.
int check(int n) {
    if (n <= BC_POS) {
        const int slot = 2 * n + 1;  // odd slots of BC hold check values
        std::cout << "read check index=" << n << ":value="<< BC[slot] << std::endl;
        return BC[slot];
    }
    return 0;
}

// Stores `node` as the BASE value of index n.
// Grows BC first if n does not fit, raises BC_POS when n is the highest index
// written so far, and logs the write to stdout.
void w_base(int n, int node) {
    while (BC_MAX <= n) {
        realloc_bc();
    }
    const int slot = 2 * n;  // even slots of BC hold base values
    BC[slot] = node;
    if (BC_POS < n) {
        BC_POS = n;
    }
    std::cout << "write base index=" << n << ":value="<< BC[slot] << std::endl;
}

// Stores `node` as the CHECK value of index n.
// Grows BC first if n does not fit, raises BC_POS when n is the highest index
// written so far, and logs the write to stdout.
void w_check(int n, int node) {
    while (BC_MAX <= n) {
        realloc_bc();
    }
    const int slot = 2 * n + 1;  // odd slots of BC hold check values
    BC[slot] = node;
    if (BC_POS < n) {
        BC_POS = n;
    }
    std::cout << "write check index=" << n << ":value="<< BC[slot] << std::endl;
}

// Allocates a zero-filled character buffer.
//   area_name: label printed in the error message if allocation fails.
//              Taken as const char* so string literals can be passed legally;
//              existing callers passing char* are unaffected.
//   max:       out-parameter; receives the buffer size in bytes (== init).
//   init:      number of bytes to allocate.
// Returns the new buffer, or NULL (after logging) when malloc fails.
char *mem_str(const char *area_name, int *max, int init) {
    *max = init;
    char *area = (char*)malloc(sizeof(char) * (*max));
    if(area == NULL) {
        std::cout << area_name << " malloc error!" << std::endl;
        // Do not fall through: memset on a NULL pointer is undefined.
        return NULL;
    }

    // memset takes (destination, fill value, byte count). The previous call
    // passed the count and the fill value in swapped positions, which filled
    // zero bytes and left the buffer uninitialised.
    memset(area, '\0', *max);
    return area;
}
// Maps a character to an arc number: 'a' becomes 2, 'b' becomes 3, and so on.
int arc_index(char ch) {
    const int distance_from_a = ch - 'a';
    return distance_from_a + 2;
}
// Grows the BC array by BC_INC node slots and zero-fills the new slots.
// On allocation failure the program is terminated with exit(-1), the same way
// realloc_str handles failure: the callers loop until the array is large enough
// and then index into it, so returning with a NULL array cannot be recovered.
void realloc_bc() {
    const int pre_bc = BC_MAX;
    const int new_max = BC_MAX + BC_INC;
    // Keep the old pointer until realloc succeeds; assigning the result straight
    // to BC would lose (leak) the existing array on failure.
    int *grown = (int*)realloc(BC, sizeof(int) * 2 * new_max);
    if(grown == NULL) {
        std::cout << "realloc bc error!" << std::endl;
        exit(-1);
    }
    BC = grown;
    BC_MAX = new_max;
    // Each node owns two ints (base, check), hence the factor 2.
    for(int i = 2 * pre_bc; i < 2 * BC_MAX; i++) {
        BC[i] = 0;
    }
    std::cout << "realloc bc!" << std::endl;
}

// Enlarges a character buffer by `inc` bytes and zero-fills the added bytes.
//   area_name: label used in the log messages.
//   area:      buffer to enlarge.
//   max:       in/out; current size on entry, new size on return.
//   inc:       number of bytes to add.
// Returns the (possibly moved) buffer. Terminates the program with exit(-1)
// if the reallocation fails.
char *realloc_str(char *area_name, char *area, int *max, int inc) {
    const int old_size = *max;
    *max = old_size + inc;

    area = (char*) realloc(area, sizeof(char) * (*max));
    if (area == NULL) {
        std::cout << area_name << " realloc error!" << std::endl;
        exit(-1);
    }

    // Clear only the newly added region.
    int idx = old_size;
    while (idx < *max) {
        area[idx] = '\0';
        ++idx;
    }

    std::cout << area_name << " realloc ok!" << std::endl;
    return area;
}

void read_tail(int p) {
int i = 0;
while(TAIL[p] != '#') TEMP[i++] = TAIL[p++];
TEMP[i++] = '#';
TEMP[i] = '\0';
cout << "read tail!" << endl;
}

// Copies the string `temp` (without its terminator) into TAIL starting at
// position p, growing TAIL first if needed, and advances TAIL_POS when the
// write reaches or passes the current end position.
void write_tail(char *temp, int p) {
    // Grow TAIL until the whole string fits with spare room at the end.
    while ((p + strlen(temp)) >= TAIL_MAX - 1) {
        TAIL = realloc_str("TAIL", TAIL, &TAIL_MAX, TAIL_INC);
    }

    int dst = p;
    for (const char *src = temp; *src != '\0'; ++src) {
        TAIL[dst++] = *src;
    }

    // dst is now the first position after the copied characters.
    if (dst + 1 > TAIL_POS) {
        TAIL_POS = dst;
    }

    std::cout << "write tail!" << std::endl;
}

// Finds the smallest base value (starting from 1) such that, for every
// character c in `list`, slot base + c is free, i.e. check(base + c) == 0.
//   list: '\0'-terminated set of character codes that need a free slot.
// Returns the base value found. An empty list places no constraint and
// yields 1; the previous do-while read list[1] past the terminator in that case.
int x_check(char *list) {
    int base_pos = 1;
    std::cout << "x_check start:" << std::endl;
    bool all_free = false;
    while(!all_free) {
        all_free = true;
        for(int i = 0; list[i] != '\0'; ++i) {
            // Treat the code as unsigned so bytes above 127 give a positive offset.
            const unsigned char ch = list[i];
            if(check(base_pos + ch) != 0) {
                // Collision: try the next base and rescan the list from the start.
                ++base_pos;
                all_free = false;
                break;
            }
        }
    }
    std::cout << "x_check end!" << std::endl;
    return base_pos;
}

char *set_list(int s) {
char *list = (char*)malloc(MAX_CODE + 1 + 1); // 256个字符 + 1 '\0'
int i, j = 0, t;
for(i = MIN_CODE; i < MAX_CODE; i++) {
t = base(s) + i;
if(check(t) == s) {
list[j] = (unsigned char)i; //change
j++;
}
}
list[j] = '\0';
cout << "set_list:" << list << endl;
return list;
}

// Adds an arc from node s labelled with the first character of b, and stores
// the rest of b in TAIL at tail_pos. The new node's base is set to -tail_pos,
// so a negative base marks a node whose remainder lives in TAIL.
void separate(int s, char *b, int tail_pos) {
    const unsigned char arc_code = *b;
    const int t = base(s) + arc_code;
    w_check(t, s);
    w_base(t, -tail_pos);
    write_tail(b + 1, tail_pos);
}

// Inserts the remaining string b below node s when the slot for its first
// character is not an arc of s.
// If the slot is occupied by another node's arc, the node with fewer arcs
// (s, counting the new arc, or the slot's owner) is relocated via change_bc
// before the new arc is written.
//   s: node from which the new arc starts.
//   b: remaining characters of the word, starting with the arc label.
void bc_insert(int s, char *b) {
    int t;
    char list_s[MAX_CODE + 2];
    char list_t[MAX_CODE + 2];
    std::cout << "bc_insert start:" << std::endl;
    t = base(s) + (unsigned char)(*b);
    std::cout << "t=" << t << " check(t)=" << check(t) << std::endl;
    if(check(t) != 0) {
        // set_list returns a malloc'ed buffer: copy it locally, then release it.
        // The buffers used to be leaked on every collision.
        char *arcs = set_list(s);
        strcpy(list_s, arcs);
        free(arcs);
        arcs = set_list(check(t));
        strcpy(list_t, arcs);
        free(arcs);
        if(strlen(list_s) + 1 < strlen(list_t)) {
            // Cheaper to move s (its arcs plus the new one).
            std::cout << "list_s=" << list_s << std::endl;
            s = change_bc(s, s, list_s, *b);
        } else {
            // Cheaper to move the node that currently owns slot t.
            std::cout << "list_t=" << list_t << std::endl;
            s = change_bc(s, check(t), list_t, '\0');
        }
    }

    separate(s, b, TAIL_POS);
    std::cout << "bc_insert end." << std::endl;
}

// Relocates the arcs of node s to a new base where all of them fit.
//   current: node the caller is working on; if it is one of the moved
//            children, its new index is returned instead.
//   s:       node whose base is changed.
//   list:    '\0'-terminated codes of the existing arcs of s.
//   ch:      extra code that must also fit at the new base, or '\0' for none.
// Returns the (possibly updated) index of `current`.
int  change_bc(int current, int s, char *list, char ch) {
    char a_list[MAX_CODE + 2];
    const int old_base = base(s);

    // a_list = existing arcs plus the optional extra code.
    strcpy(a_list, list);
    if(ch != '\0') {
        const size_t len = strlen(a_list);
        a_list[len] = ch;
        a_list[len + 1] = '\0';
    }
    // Nothing to place and nothing to move.
    if(a_list[0] == '\0') {
        return current;
    }

    w_base(s, x_check(a_list));

    // A plain loop instead of do-while: with an empty `list` the old code moved
    // a phantom child with code 0 and then walked past the terminator.
    for(; *list != '\0'; ++list) {
        const unsigned char code = *list;
        const int old_node = old_base + code;
        const int new_node = base(s) + code;
        std::cout << "old_node=" << old_node << ",new_node=" << new_node << std::endl;
        w_base(new_node, base(old_node));
        w_check(new_node, s);

        // Re-parent the grandchildren. A positive base means old_node has arcs
        // of its own; they can only sit at child_base + [1, MAX_CODE] and never
        // above BC_POS, so the scan is limited to that window. The former `||`
        // condition walked the whole array up to BC_POS for every moved node.
        const int child_base = base(old_node);
        if(child_base > 0) {
            for(int k = child_base + 1; k - child_base <= MAX_CODE && k <= BC_POS; ++k) {
                if(check(k) == old_node) {
                    w_check(k, new_node);
                }
            }
        }

        if(current != s && old_node == current) {
            current = new_node;
        }
        // Free the old slot.
        w_base(old_node, 0);
        w_check(old_node, 0);
    }
    return current;
}

// Splits the suffix stored for node s so that a second, different suffix can
// share its common prefix.
//   s: node whose base refers to a TAIL position (stored as a negative value).
//   a: the suffix currently stored for s.
//   b: the suffix of the word being inserted.
// One node is created per common leading character; the two differing
// remainders are then attached with separate(). The caller must ensure a and
// b differ somewhere, otherwise the prefix scan does not stop at the terminator.
void tail_insert(int s, char *a, char *b) {
    const int old_tail_pos = -base(s);
    std::cout << "tail_insert:" << "s=" << s << "a=" << a << " b=" << b << std::endl;

    // Length of the common prefix of a and b.
    int common = 0;
    while (a[common] == b[common]) {
        ++common;
    }

    char list[3];
    // Build a single-arc chain for the shared prefix.
    for (int i = 0; i < common; ++i) {
        const unsigned char ch = a[i];
        list[0] = ch;
        list[1] = '\0';
        w_base(s, x_check(list));
        const int t = base(s) + ch;
        w_check(t, s);
        s = t;
    }

    // Branch on the first differing characters.
    list[0] = a[common];
    list[1] = b[common];
    list[2] = '\0';
    w_base(s, x_check(list));
    separate(s, a + common, old_tail_pos);  // reuse the old TAIL position
    separate(s, b + common, TAIL_POS);      // new suffix goes to the next free position
}

// Looks up p_word in the trie.
// Returns the index of the node where the match ends, or -1 if the word is
// not stored. The lookup follows arcs from node 1 until it reaches a node
// with a negative base, then compares the rest of the word with the suffix
// read from TAIL.
int search_word(char *p_word) {
    int s = 1;
    int t = 0;
    int pos = 0;
    std::cout << "begin-search word: " << p_word << std::endl;

    for (;; ++pos) {
        const unsigned char ch = p_word[pos];
        t = base(s) + ch;
        if (check(t) != s) {
            // No arc labelled ch leaves s.
            std::cout << "end-search word:" << p_word << std::endl;
            return -1;
        }
        if (base(t) < 0) {
            break;  // the remainder of the word is stored in TAIL
        }
        s = t;
        if (p_word[pos] == '\0') {
            break;
        }
    }

    const bool at_marker = (p_word[pos] == '#');
    if (!at_marker) {
        read_tail(-base(t));
    }
    const bool matched = at_marker || strcmp(TEMP, p_word + pos + 1) == 0;
    std::cout << "end-search word: " << p_word << std::endl;
    return matched ? t : -1;
}

// Removes p_word from the trie.
// Returns 1 if the word was found and removed, 0 if it was not stored.
// Only the final node is cleared (base and check set to 0); the suffix bytes
// in TAIL are left in place.
int delete_word(char *p_word) {
    const int t = search_word(p_word);
    // Comparison, not assignment: `t = -1` was always true, so nothing was
    // ever deleted and the function always returned 0.
    if(t == -1) {
        return 0;
    }
    w_base(t, 0);
    w_check(t, 0);
    return 1;
}

// Inserts p_word into the trie. Always returns 1.
// The word is modified in place: a '#' end marker is appended, so the buffer
// behind p_word must have room for one more character.
int insert_word(char *p_word) {
    std::cout << "begin-insert word :" << p_word << std::endl;
    strcat(p_word, "#");

    int s = 1;
    int t = 0;
    int pos = 0;
    for (;; ++pos) {
        const unsigned char ch = p_word[pos];
        t = base(s) + ch;
        if (check(t) != s) {
            // No arc for ch leaves s: add the rest of the word below s.
            std::cout << "s=" << s << ",t=" << t << ",check(t)=" << check(t) << std::endl;
            bc_insert(s, p_word + pos);
            std::cout << "end-insert word:" << p_word << std::endl;
            return 1;
        }
        if (base(t) < 0) {
            break;  // the remainder of an existing word is stored in TAIL
        }
        s = t;
        if (p_word[pos] == '\0') {
            break;
        }
    }

    const bool at_marker = (p_word[pos] == '#');
    if (!at_marker) {
        read_tail(-base(t));
    }
    // Word already present: nothing to do.
    if (at_marker || strcmp(TEMP, p_word + pos + 1) == 0) {
        return 1;
    }
    if (base(t) != 0) {
        // Shares a prefix with a stored suffix: split that suffix.
        tail_insert(t, TEMP, p_word + pos + 1);
        std::cout << "end-insert word :" << p_word << std::endl;
    }
    return 1;
}

// Sets up an empty trie: allocates and clears BC, creates the root node
// (index 1, base 1), and allocates the TAIL and TEMP buffers.
// Terminates the program with exit(-1) if any allocation fails; previously a
// failed allocation was either dereferenced right away or left the globals
// NULL for the first insert to crash on.
void initialize() {
    BC_MAX = BC_INC;
    BC_POS = 1;
    TAIL_POS = 1;
    BC = (int*)malloc(sizeof(int) * 2 * BC_MAX);
    if(BC == NULL) {
        std::cout << "BC malloc error!" << std::endl;
        exit(-1);
    }
    // Two ints (base, check) per node slot.
    for(int i = 0; i < 2 * BC_MAX; i++) {
        BC[i] = 0;
    }
    // Root node; BC_POS stays at 1 because index 1 is not above it.
    w_base(1, 1);

    TAIL = mem_str("TAIL", &TAIL_MAX, TAIL_INC);
    TEMP = mem_str("TEMP", &TEMP_MAX, TEMP_INC);
    if(TAIL == NULL || TEMP == NULL) {
        // mem_str has already printed which buffer failed.
        exit(-1);
    }
    // Position 0 is never handed out (TAIL_POS starts at 1); mark it with '#'.
    TAIL[0] = '#';
}
// Demo driver: loads whitespace-separated words from key_words.txt into the
// trie, then looks up "Beijing".
// Returns 0 on success, 1 if the key file cannot be opened.
int main() {
    char word[30] = {'\0'};
    initialize();
    FILE *key_file = fopen("key_words.txt", "r");
    if(key_file == NULL) {
        std::cout << "open key file error!" << std::endl;
        return 1;
    }
    // Width 28 keeps two bytes of word[30] free: one for the '#' that
    // insert_word appends and one for the terminator. An unbounded %s let a
    // long token overflow the buffer. Tokens longer than 28 characters are
    // read in pieces.
    while(fscanf(key_file, "%28s", word) == 1) {
        insert_word(word);
        std::cout << std::endl;
    }
    fclose(key_file);

    // search_word expects the '#' end marker to be part of the query.
    strcpy(word, "Beijing#");
    if(search_word(word) > 0) {
        printf("find word!\n");
    } else {
        printf("not find word!\n");
    }

    return 0;
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: