您的位置:首页 > 运维架构

百度题目--TOP K问题

2012-03-21 21:09 169 查看
某天有1千万条查询,其中大部分是重复的,去重后可能只有300万条不同的查询,每条查询的长度为1-255字节。请设计算法找出最热门(出现次数最多)的10条查询。

哈希 + 最小堆:先用哈希表统计每条查询的出现次数(O(N),N 为查询总数),再用大小为 k 的最小堆从 n 条不同的查询中选出出现次数最多的 k 条,时间复杂度为 O(n lg k);其中 k 为需要返回的热门查询条数,这里为 10。

#include <stdio.h>
#include <cstring>
#include <algorithm>
using namespace std;
// Number of buckets in the chained hash table `head`.
#define HASHLEN 2807303
// Size in bytes of a word buffer, including the terminating NUL.
// Queries are 1-255 bytes long, so 256 bytes are needed to hold the longest
// one; the previous value of 30 was too small for most of that range.
#define CHARLEN 256
typedef struct node_no_space* ptr_no_space;
typedef struct node_has_space * ptr_has_space;
// Bucket heads of the hash table; each bucket is a singly linked chain of
// node_no_space entries (NULL when the bucket is empty).
ptr_no_space  head[HASHLEN];

// One entry of a hash-bucket chain: a distinct word together with the number
// of times it has been seen so far.
struct node_no_space
{
// NUL-terminated copy of the word; allocated with new[] by addwordToTable.
char* word;
// Number of occurrences recorded for `word`.
int count;
// Next entry in the same bucket, or NULL at the end of the chain.
node_no_space * next;
};
// A (word, count) pair with inline storage for the word; main() uses an array
// of these as the top-10 heap.
struct node_has_space
{
// NUL-terminated word, stored in place (at most CHARLEN - 1 bytes plus NUL).
char word[CHARLEN];
// Number of occurrences of `word`.
int count;
};
// Heap ordering predicate: true when `a` occurs strictly more often than `b`.
// Used with the <algorithm> heap functions this keeps the entry with the
// smallest count at the front (a min-heap on count).
bool cmp(const node_has_space a ,const node_has_space b )
{
    const bool a_is_more_frequent = b.count < a.count;
    return a_is_more_frequent;
}
// Maps a NUL-terminated string to a bucket index in the range [0, HASHLEN).
//
// Polynomial hash with multiplier 31, reduced modulo HASHLEN after every
// character so the accumulator cannot overflow.
//
// p: NUL-terminated string to hash (not modified).
// Returns the bucket index; always a valid index into `head`.
int hash_funtion(char *p)
{
    // Bytes are read as unsigned so that characters >= 0x80 cannot make the
    // accumulator (and therefore the returned index) negative.
    const unsigned char *s = (const unsigned char *)p;
    unsigned int value = 0;

    for (; *s != '\0'; ++s)
    {
        // Reduce unconditionally: comparing with `>` before reducing would
        // let a value equal to HASHLEN through, one past the last bucket.
        value = (value * 31u + *s) % HASHLEN;
    }
    return (int)value;
}
void addwordToTable(char * str)
{
int index = hash_funtion(str);
ptr_no_space temp = head[index]; //判断头结点
while ( temp != NULL )
{
if ( !strcmp(temp->word,str))
{
temp->count ++;
return ;
}
temp = temp->next;
}
//不在任意的index里面,新开一条记录
ptr_no_space new_list = new node_no_space;
new_list->count =1;
new_list->word = new char[strlen(str ) +1 ];
strcpy(new_list->word , str);
new_list->next = head[index];
head[index] = new_list;
}
// Returns non-zero when `c` is an ASCII digit or letter, i.e. a character
// that handle_symbol keeps at the edges of a word.
static int is_word_char(char c)
{
    return (c >= '0' && c <= '9') ||
           (c >= 'A' && c <= 'Z') ||
           (c >= 'a' && c <= 'z');
}

// Strips leading and trailing special symbols from `str`, in place.
//
// A "special symbol" is any character that is not an ASCII digit or letter.
// Characters in the interior of the word are left untouched.
//
// str: writable NUL-terminated string.
// n:   index of the last character of `str`, i.e. strlen(str) - 1.
//
// If every character is a special symbol the result is the empty string.
// A NULL `str` or a negative `n` leaves the input unchanged.
void handle_symbol(char *str, int n)
{
    if (str == NULL || n < 0)
    {
        return;
    }

    // Find the last character to keep; the bound check stops the scan at the
    // start of the buffer when the word consists of symbols only.
    int last = n;
    while (last >= 0 && !is_word_char(str[last]))
    {
        last--;
    }

    // Find the first character to keep. When last >= 0, str[last] is a word
    // character, so this scan cannot run past it.
    int first = 0;
    while (first < last && !is_word_char(str[first]))
    {
        first++;
    }

    // Move the kept range to the front in a single pass (the regions may
    // overlap, hence memmove) and terminate the string.
    const int kept = last - first + 1;
    if (first > 0 && kept > 0)
    {
        memmove(str, str + first, (size_t)kept);
    }
    str[kept] = '\0';
}
// Dumps every entry of the hash table to "result.txt", one line per entry in
// the form "<word> <count>".
//
// Buckets are written in index order and entries in chain order. If the file
// cannot be opened, or closing it reports a write error, a diagnostic is
// printed to stderr; in the open-failure case nothing is written.
void write_to_file()
{
    FILE *fp = fopen("result.txt","w");
    if (fp == NULL)
    {
        perror("result.txt");
        return;
    }

    for (int i = 0; i < HASHLEN; i++)
    {
        for (ptr_no_space tmp = head[i]; tmp != NULL; tmp = tmp->next)
        {
            fprintf(fp,"%s %d\n" ,tmp->word , tmp->count);
        }
    }

    // fclose flushes buffered output, so a failure here means data was lost.
    if (fclose(fp) != 0)
    {
        perror("result.txt");
    }
}
int main()
{
FILE *fp_read  = fopen("string.txt","r");

char str[CHARLEN];
for ( int i = 0 ; i < HASHLEN ; i++)
head[i] = NULL;
while ( fscanf(fp_read,"%s" , &str) != EOF)
{
int n = strlen(str) - 1;
if (n > 0)
handle_symbol(str, n);
addwordToTable(str);//往哈希表中添加str
}
fclose(fp_read);
write_to_file();//写入文件
ptr_has_space heap = new node_has_space [10];
FILE *fp_result = fopen("result.txt","r");
int c;
for ( int i = 0 ; i < 10 ; i++)
{
fscanf(fp_result,"%s %d" ,&str  ,&c);
heap[i].count = c;
strcpy(heap[i].word , str);
}
//建立最小堆
make_heap(heap,heap+10,cmp);
ptr_has_space p = new node_has_space;
//不断读入result.txt中数据 , 维护最小堆
while ( fscanf(fp_result,"%s %d" ,&p->word , &p->count) != EOF)
{
if ( p->count > heap[0].count)
{
heap[0].count = p->count;
strcpy(heap[0].word , p->word);
make_heap(heap , heap+10 , cmp);
}
}
fclose(fp_result);
//输出堆中结果
sort_heap(heap,heap+10 ,cmp);
for ( int i = 0 ; i < 10  ; i++)
printf("%s %d\n", heap[i].word , heap[i].count);
return 0 ;
}


  
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: