您的位置：首页 > 其它

利用二叉搜索树来实现输入文本的单词统计

2012-05-01 16:45 405 查看

这里有个题目，输入一个文本（纯英文），我们要能够得到文本中每个单词出现的个数，并且按照字典顺序输出。

解决这个题目可以利用二叉搜索树，这是一个比较好理解的方法：从根节点（root）开始，把文本中的单词逐个插入树中。当输入的

单词按字典序比当前节点的单词大，就连接到此节点的右边；若小，则连接到左边；若相同，则只把该节点的计数加一。这样对整棵树做中序遍历，就可以轻易地按字典顺序输出所有单词了。

首先,我们需要一个结构体来提供储存单词和单词出现的次数，定义如下：

/* One node of the binary search tree used to count word occurrences. */
struct tnode
{
    char *word;          /* the word stored at this node */
    int count;           /* number of times the word has been seen */
    struct tnode *left;  /* left child: words that compare less than this one */
    struct tnode *right; /* right child: words that compare greater than this one */
};

然后我们需要一个插入函数，这是一个递归版本的插入函数，talloc只是malloc的替代，是一个自定义函数，后面会给出

其实现代码。

/*
 * addTree: record one occurrence of word w in the tree rooted at p.
 *
 * If w is already in the tree its counter is incremented; otherwise a new
 * leaf holding a private copy of w is attached at the position given by
 * strcmp() ordering (smaller words to the left, larger to the right).
 *
 * Returns the (possibly new) root of the subtree, so callers write
 *     root = addTree(root, w);
 * If memory for a new node cannot be obtained, a message is written to
 * stderr, the word is skipped and the subtree is returned unchanged
 * (NULL when the subtree was empty).
 */
struct tnode *addTree(struct tnode *p,char *w)
{
    int cond;

    if (p == NULL) {
        /* w is not in the tree yet: build a new leaf for it */
        p = talloc();
        if (p != NULL) {
            p->word = strdup(w);
            if (p->word == NULL) {
                /* do not keep a node without a key */
                free(p);
                p = NULL;
            }
        }
        if (p == NULL) {
            fprintf(stderr, "addTree: out of memory, skipping \"%s\"\n", w);
            return NULL;
        }
        p->count = 1;
        p->left = p->right = NULL;
        return p;
    }

    cond = strcmp(w, p->word);
    if (cond == 0)
        p->count++;                       /* repeated word */
    else if (cond < 0)
        p->left = addTree(p->left, w);    /* w sorts before this node */
    else
        p->right = addTree(p->right, w);  /* w sorts after this node */
    return p;
}

在插入函数中我们有一个strdup函数，它用于将字符串复制给一个动态字符串并返回此动态字符串，

由于strlen只能算出字符串除了‘\0’之外的字符数，所以申请动态内存时应该用strlen(s)+1。下面给出

此函数的实现：

/*
 * strdup: return a newly malloc'ed copy of the string s, or NULL when
 * malloc fails.  The caller owns the returned memory.
 */
char *strdup(char *s)
{
    size_t size = strlen(s) + 1;    /* +1 for the terminating '\0' */
    char *copy = (char *)malloc(size);

    if (copy == NULL)
        return NULL;
    memcpy(copy, s, size);          /* copies the terminator as well */
    return copy;
}

然后我们需要一个打印这个二叉树的函数，同样采用递归;

/*
 * treePrint: print every node of the tree rooted at p in order
 * (left subtree, node, right subtree), one "count word" line per node.
 * An empty tree (p == NULL) prints nothing.
 */
void treePrint(struct tnode *p)
{
    /* recurse into the left subtree, then walk down the right spine */
    for (; p != NULL; p = p->right) {
        treePrint(p->left);
        printf("%4d %s\n", p->count, p->word);
    }
}

大家应该注意到了，我们还有getword、getch、ungetch函数未实现，下面将对其一一解释：

第一个：getch函数和ungetch函数

顾名思义，这两个的作用一个是获取字符，一个是压回字符，其实这是堆栈的结构，我们需要

一个足够大的数组来实现静态数组的堆栈：

#define BUFSIZE 100 /* capacity of the pushback buffer */

/*
 * Pushback storage shared by getch() and ungetch().  The elements are int
 * rather than char so that every value getch() can return, EOF included,
 * is stored and handed back unchanged regardless of whether plain char is
 * signed or unsigned.
 */
int buf[BUFSIZE];

int bufp = 0; /* index of the next free slot in buf */

下面给出这两个函数的实现：

/*
 * getch: return the next input character.  Characters handed back with
 * ungetch() are returned first, most recent first; when none are pending
 * the character comes from getchar().
 */
int getch(void)
{
    if (bufp <= 0)
        return getchar();

    bufp--;
    return buf[bufp];
}

/*
 * ungetch: push character c back so that the next getch() returns it.
 * When the pushback buffer already holds BUFSIZE characters, c is dropped
 * and a diagnostic is printed instead.
 */
void ungetch(int c)
{
    if (bufp < BUFSIZE) {
        buf[bufp] = c;
        bufp++;
        return;
    }
    printf("ungetch: too many characters\n");
}

最后，我们来看一看getword函数，这个函数将从输入中读取下一个单词：它先跳过空白字符，单词必须以字母开头，后面可以跟字母或数字；如果第一个非空白字符不是字母，则只把这个字符本身存入word。

它的返回值是所读入内容的第一个字符，到达输入末尾时返回EOF，代码如下：

/*
 * getword: read the next token from the input into word.
 *
 * Leading white space is skipped.  If the first remaining character is a
 * letter, it and the letters/digits that follow form the token; otherwise
 * the token is just that single character.
 *
 * word - destination buffer of at least lim bytes
 * lim  - size of word; at most lim-1 characters are stored and, when
 *        lim > 0, the result is always NUL-terminated.  Characters of an
 *        over-long word that do not fit are left unread for the next call.
 *
 * Returns the first character of the token, or EOF at end of input.
 */
int getword(char *word,int lim)
{
    int c;
    int first;
    char *w = word;
    int room = lim - 1;     /* characters that still fit in front of the NUL */

    /* <ctype.h> functions need an unsigned char value, hence the casts */
    do {
        c = getch();
    } while (c != EOF && isspace((unsigned char)c));
    first = c;

    if (c != EOF && room > 0) {
        *w++ = (char)c;
        room--;
    }
    if (c == EOF || !isalpha((unsigned char)c)) {
        /* end of input or a single non-letter character */
        if (lim > 0)
            *w = '\0';
        return c;
    }

    while (room > 0) {
        c = getch();
        if (c == EOF || !isalnum((unsigned char)c)) {
            ungetch(c);     /* belongs to the next token */
            break;
        }
        *w++ = (char)c;
        room--;
    }
    if (lim > 0)
        *w = '\0';
    return first;
}

这样我们就可以轻易地实现文本中的单词统计了。

附录（完整代码）：

#include <stdio.h>
#include <ctype.h>
#include <string.h>
#include <stdlib.h>

#define BUFSIZE 100 /* capacity of the pushback buffer */

/*
 * Pushback storage shared by getch() and ungetch().  The elements are int
 * rather than char so that every value getch() can return, EOF included,
 * is stored and handed back unchanged regardless of whether plain char is
 * signed or unsigned.
 */
int buf[BUFSIZE];
int bufp = 0; /* index of the next free slot in buf */

/* Binary search tree node: one distinct word together with its tally. */
struct tnode {
    char *word;                 /* the word stored at this node */
    int count;                  /* how many times the word has occurred */
    struct tnode *left, *right; /* subtrees of smaller / larger words */
};

/* Input helpers: character reader with pushback, and the tokenizer built on it. */
int getch(void);
void ungetch(int c);
int getword(char *word,int lim);
/* Word-count tree: insertion, node allocation, string copy, in-order printing. */
struct tnode *addTree(struct tnode *p,char *w);
struct tnode *talloc(void);
/* NOTE(review): <string.h> on POSIX systems may already declare
 * strdup(const char *); this declaration could then conflict -- confirm
 * on the target toolchain. */
char *strdup(char *s);
void treePrint(struct tnode *p);

/*
 * main: read words from standard input, count how often each one occurs
 * and print the counts in dictionary order.  Always returns 0.
 */
int main(void)
{
    struct tnode *root = NULL;
    char word[100];

    while (getword(word, (int)sizeof word) != EOF) {
        /* getword also hands back single non-letter characters; skip them */
        if (isalpha((unsigned char)word[0]))
            root = addTree(root, word);
    }

    treePrint(root);

#ifdef _WIN32
    /* "pause" is a Windows shell command; presumably used here to keep the
     * console window open after the output has been printed */
    system("pause");
#endif
    return 0;
}

/*
 * getword: read the next token from the input into word.
 *
 * Leading white space is skipped.  If the first remaining character is a
 * letter, it and the letters/digits that follow form the token; otherwise
 * the token is just that single character.
 *
 * word - destination buffer of at least lim bytes
 * lim  - size of word; at most lim-1 characters are stored and, when
 *        lim > 0, the result is always NUL-terminated.  Characters of an
 *        over-long word that do not fit are left unread for the next call.
 *
 * Returns the first character of the token, or EOF at end of input.
 */
int getword(char *word,int lim)
{
    int c;
    int first;
    char *w = word;
    int room = lim - 1;     /* characters that still fit in front of the NUL */

    /* <ctype.h> functions need an unsigned char value, hence the casts */
    do {
        c = getch();
    } while (c != EOF && isspace((unsigned char)c));
    first = c;

    if (c != EOF && room > 0) {
        *w++ = (char)c;
        room--;
    }
    if (c == EOF || !isalpha((unsigned char)c)) {
        /* end of input or a single non-letter character */
        if (lim > 0)
            *w = '\0';
        return c;
    }

    while (room > 0) {
        c = getch();
        if (c == EOF || !isalnum((unsigned char)c)) {
            ungetch(c);     /* belongs to the next token */
            break;
        }
        *w++ = (char)c;
        room--;
    }
    if (lim > 0)
        *w = '\0';
    return first;
}

/*
 * addTree: record one occurrence of word w in the tree rooted at p.
 *
 * If w is already in the tree its counter is incremented; otherwise a new
 * leaf holding a private copy of w is attached at the position given by
 * strcmp() ordering (smaller words to the left, larger to the right).
 *
 * Returns the (possibly new) root of the subtree, so callers write
 *     root = addTree(root, w);
 * If memory for a new node cannot be obtained, a message is written to
 * stderr, the word is skipped and the subtree is returned unchanged
 * (NULL when the subtree was empty).
 */
struct tnode *addTree(struct tnode *p,char *w)
{
    int cond;

    if (p == NULL) {
        /* w is not in the tree yet: build a new leaf for it */
        p = talloc();
        if (p != NULL) {
            p->word = strdup(w);
            if (p->word == NULL) {
                /* do not keep a node without a key */
                free(p);
                p = NULL;
            }
        }
        if (p == NULL) {
            fprintf(stderr, "addTree: out of memory, skipping \"%s\"\n", w);
            return NULL;
        }
        p->count = 1;
        p->left = p->right = NULL;
        return p;
    }

    cond = strcmp(w, p->word);
    if (cond == 0)
        p->count++;                       /* repeated word */
    else if (cond < 0)
        p->left = addTree(p->left, w);    /* w sorts before this node */
    else
        p->right = addTree(p->right, w);  /* w sorts after this node */
    return p;
}

/*
 * talloc: obtain uninitialized storage for one tree node from malloc.
 * Returns NULL when malloc fails.
 */
struct tnode *talloc(void)
{
    struct tnode *node = malloc(sizeof *node);

    return node;
}

/*
 * strdup: return a newly malloc'ed copy of the string s, or NULL when
 * malloc fails.  The caller owns the returned memory.
 */
char *strdup(char *s)
{
    size_t size = strlen(s) + 1;    /* +1 for the terminating '\0' */
    char *copy = (char *)malloc(size);

    if (copy == NULL)
        return NULL;
    memcpy(copy, s, size);          /* copies the terminator as well */
    return copy;
}

/*
 * treePrint: print every node of the tree rooted at p in order
 * (left subtree, node, right subtree), one "count word" line per node.
 * An empty tree (p == NULL) prints nothing.
 */
void treePrint(struct tnode *p)
{
    /* recurse into the left subtree, then walk down the right spine */
    for (; p != NULL; p = p->right) {
        treePrint(p->left);
        printf("%4d %s\n", p->count, p->word);
    }
}

/*
 * getch: return the next input character.  Characters handed back with
 * ungetch() are returned first, most recent first; when none are pending
 * the character comes from getchar().
 */
int getch(void)
{
    if (bufp <= 0)
        return getchar();

    bufp--;
    return buf[bufp];
}

/*
 * ungetch: push character c back so that the next getch() returns it.
 * When the pushback buffer already holds BUFSIZE characters, c is dropped
 * and a diagnostic is printed instead.
 */
void ungetch(int c)
{
    if (bufp < BUFSIZE) {
        buf[bufp] = c;
        bufp++;
        return;
    }
    printf("ungetch: too many characters\n");
}

内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理

标签：

相关文章推荐

新的分享

章节导航