您的位置:首页 > 其它

LibSVM文本分类之工程中调用LibSVM进行文本分类

2010-09-04 10:44 519 查看
作者:finallyliuyu 转载使用等请注明出处

首先介绍libsvm 中主要的文件svm.h,svm.c ,这个两个文件实现了svm的算法。 svm-train.c,svm-predict.c 分别完成训练和预测功能。

本来我参照svm-train,svm-predict中的 main函数,将train功能,和predict功能直接在程序中整合,结果,调了一天都有异常。。(我还是太菜了)最后在同学的建议下 工程中改用系统调用的方式。为了获得准确率(将分类准确率输出到文本文件),将svm-predict函数做了如下修改:

注意 accuracy_file部分对应的修改。

void predict(FILE *input, FILE *output, FILE *accuracy_file)
{
int correct = 0;
int total = 0;
double error = 0;
double sump = 0, sumt = 0, sumpp = 0, sumtt = 0, sumpt = 0;

int svm_type=svm_get_svm_type(model);
int nr_class=svm_get_nr_class(model);
double *prob_estimates=NULL;
int j;

if(predict_probability)
{
if (svm_type==NU_SVR || svm_type==EPSILON_SVR)
printf("Prob. model for test data: target value = predicted value + z,\nz: Laplace distribution e^(-|z|/sigma)/(2sigma),sigma=%g\n",svm_get_svr_probability(model));
else
{
int *labels=(int *) malloc(nr_class*sizeof(int));
svm_get_labels(model,labels);
prob_estimates = (double *) malloc(nr_class*sizeof(double));
fprintf(output,"labels");
for(j=0;j<nr_class;j++)
fprintf(output," %d",labels[j]);
fprintf(output,"\n");
free(labels);
}
}

max_line_len = 1024;
line = (char *)malloc(max_line_len*sizeof(char));
while(readline(input) != NULL)
{
int i = 0;
double target_label, predict_label;
char *idx, *val, *label, *endptr;
int inst_max_index = -1; // strtol gives 0 if wrong format, and precomputed kernel has <index> start from 0

label = strtok(line," \t");
target_label = strtod(label,&endptr);
if(endptr == label)
exit_input_error(total+1);

while(1)
{
if(i>=max_nr_attr-1)	// need one more for index = -1
{
max_nr_attr *= 2;
x = (struct svm_node *) realloc(x,max_nr_attr*sizeof(struct svm_node));
}

idx = strtok(NULL,":");
val = strtok(NULL," \t");

if(val == NULL)
break;
errno = 0;
x[i].index = (int) strtol(idx,&endptr,10);
if(endptr == idx || errno != 0 || *endptr != '\0' || x[i].index <= inst_max_index)
exit_input_error(total+1);
else
inst_max_index = x[i].index;

errno = 0;
x[i].value = strtod(val,&endptr);
if(endptr == val || errno != 0 || (*endptr != '\0' && !isspace(*endptr)))
exit_input_error(total+1);

++i;
}
x[i].index = -1;

if (predict_probability && (svm_type==C_SVC || svm_type==NU_SVC))
{
predict_label = svm_predict_probability(model,x,prob_estimates);
fprintf(output,"%g",predict_label);
for(j=0;j<nr_class;j++)
fprintf(output," %g",prob_estimates[j]);
fprintf(output,"\n");
}
else
{
predict_label = svm_predict(model,x);
fprintf(output,"%g\n",predict_label);
}

if(predict_label == target_label)
++correct;
error += (predict_label-target_label)*(predict_label-target_label);
sump += predict_label;
sumt += target_label;
sumpp += predict_label*predict_label;
sumtt += target_label*target_label;
sumpt += predict_label*target_label;
++total;
}
if (svm_type==NU_SVR || svm_type==EPSILON_SVR)
{
printf("Mean squared error = %g (regression)\n",error/total);
printf("Squared correlation coefficient = %g (regression)\n",
((total*sumpt-sump*sumt)*(total*sumpt-sump*sumt))/
((total*sumpp-sump*sump)*(total*sumtt-sumt*sumt))
);
}
else
{
float accuracy_rate = (float)correct/total*100;
fprintf(accuracy_file, "%f", accuracy_rate);

printf("Accuracy = %g%% (%d/%d) (classification)\n",
(double)correct/total*100,correct,total);
}

if(predict_probability)
free(prob_estimates);
}


int main(int argc, char **argv)
{

FILE *input, *output, *accuracy_file;
int i;

////@@debug
//fprintf(stdout, "my svm_predict...");

// parse options
for(i=1;i<argc;i++)
{
if(argv[i][0] != '-') break;
++i;
switch(argv[i-1][1])
{
case 'b':
predict_probability = atoi(argv[i]);
break;
default:
fprintf(stderr,"Unknown option: -%c\n", argv[i-1][1]);
exit_with_help();
}
}
//if(i>=argc-2)
if(i>=argc-3)
exit_with_help();

input = fopen(argv[i],"r");
if(input == NULL)
{
fprintf(stderr,"can't open input file %s\n",argv[i]);
exit(1);
}

if((model=svm_load_model(argv[i+1]))==0)
{
fprintf(stderr,"can't open model file %s\n",argv[i+1]);
exit(1);
}

output = fopen(argv[i+2],"w");
if(output == NULL)
{
fprintf(stderr,"can't open output file %s\n",argv[i+2]);
exit(1);
}

accuracy_file = fopen(argv[i+3], "w");
if(accuracy_file == NULL)
{
fprintf(stderr,"can't open output file %s\n",argv[i+3]);
exit(1);
}

x = (struct svm_node *) malloc(max_nr_attr*sizeof(struct svm_node));
if(predict_probability)
{
if(svm_check_probability_model(model)==0)
{
fprintf(stderr,"Model does not support probabiliy estimates\n");
exit(1);
}
}
else
{
if(svm_check_probability_model(model)!=0)
printf("Model supports probability estimates, but disabled in prediction.\n");
}
predict(input,output, accuracy_file);
svm_destroy_model(model);
free(x);
free(line);
fclose(input);
fclose(output);
return 0;
}


 

调用Libsvm完成分类,准确率计算的主程序。(我的代码)

 

 

include "stdio.h"
#include "stdlib.h"
#include "memory.h"
#include "string.h"

#define MAX_COMMAND_LINE_LENGTH 2048

int svm_train(char *command_path, char *train_libsvm, char *model_libsvm)
{
// 生成命令行
char command_line[MAX_COMMAND_LINE_LENGTH] = {'\0'};
sprintf(command_line, "%s -t 0 %s %s", command_path, train_libsvm, model_libsvm);
// 执行命令行
system(command_line);
return 1;
}
int svm_predict(char *command_path, char *test_libsvm, char *model_libsvm, char *result_path, char *accuracy_path)
{
// 生成命令行
char command_line[MAX_COMMAND_LINE_LENGTH] = {'\0'};
sprintf(command_line, "%s %s %s %s %s", command_path, test_libsvm, model_libsvm, result_path, accuracy_path);
// 执行命令行
system(command_line);
return 1;
}
int main()
{
void AccuracyFormation();
int LibSvm();
int end;
AccuracyFormation();
//LibSvm();

printf("finalfinish,congratulations!");
scanf("%d",&end);
return 1;

}

//char command_line[MAX_COMMAND_LINE_LENGTH] = {'\0'};
//// train
//sprintf(command_line, "..\\Release\\svm_train.exe -t 0 D:\\libsvmdata\\500\\0\\100\\train.libsvm D:\\libsvmdata\\500\\0\\100\\model.libsvm");
//system(command_line);

//// predict
////command_line[0] = '\0';
//memset(command_line, 0, sizeof(command_line[0])*MAX_COMMAND_LINE_LENGTH);
//
//sprintf(command_line, "..\\Release\\svm_predict.exe D:\\libsvmdata\\500\\0\\100\\test.libsvm D:\\libsvmdata\\500\\0\\100\\model.libsvm D:\\libsvmdata\\500\\0\\100\\result.txt D:\\libsvmdata\\500\\0\\100\\accuracy.txt");
//system(command_line);
int LibSvm()
{
int vfold=5;
char *train_path_part="\\data\\train.libsvm";
char *test_path_part="\\data\\test.libsvm";
char *result_path_part="\\data\\result.txt";
char *model_path_part="\\data\\model.libsvm";
char *accuracy_path_part="\\data\\accuracy.txt";
char  featureDimensions[15][20]={"10","20","30","40","50","60","70","80","90","100","110","120","130","140","150"};//特征维数
char done_research_times[5][10]={"0","1","2","3","4"};//已经进行了几次试验
char N_corpus[4][20]={"100","500","1000","1500"};//文档集规模
char command_path_train[] = "..\\Release\\svm_train.exe";
char command_path_predict[] = "..\\Release\\svm_predict.exe";
/*char train_libsvm[] = "D:\\1_100\\TextCategorization_1_100_100\\data\\train.libsvm";
char test_libsvm[] = "D:\\1_100\\TextCategorization_1_100_100\\data\\test.libsvm";
char model_libsvm[] = "D:\\1_100\\TextCategorization_1_100_100\\data\\model.libsvm";
char result_path_1[] = "D:\\1_100\\TextCategorization_1_100_100\\data\\result.txt";
char accuracy_path_1[] = "D:\\1_100\\TextCategorization_1_100_100\\data\\accuracy.txt";*/

char file_address[300][5000];
char *temp=(char*) malloc(10000);
char *train_path=(char*) malloc(10000);
char *test_path=(char*) malloc(10000);
char *result_path=(char*) malloc(10000);
char  *model_path=(char *)malloc(10000);
char *accuracy_path=(char *)malloc(10000);
int i,j,k;

memset(temp,0,10000);
memset(train_path,0,10000);
memset(result_path,0,10000);
memset(test_path,0,10000);
memset(model_path,0,10000);
memset(accuracy_path,0,10000);

/*	// train
svm_train(command_path_train, train_libsvm, model_libsvm);
// predict
svm_predict(command_path_predict, test_libsvm, model_libsvm, result_path_1, accuracy_path_1);

// 停住
system("pause");

return 1;*/

/*************************************生成文件名****************************************************/
for(i=0;i<5;i++)//指征 done_research_times
{
for(j=0;j<4;j++)//指征文档集规模
{
for( k=0;k<15;k++)//指征特征词维数
{
strcat(temp,"D:\\");
strcat(temp,done_research_times[i]);
strcat(temp,"_");
strcat(temp,N_corpus[j]);
strcat(temp,"_rfinish");
strcat(temp,"\\TextCategorization_");
strcat(temp,done_research_times[i]);
strcat(temp,"_");
strcat(temp,N_corpus[j]);
strcat(temp,"_");
strcat(temp,featureDimensions[k]);
strcpy(file_address[i*60+j*15+k],temp);
//printf("%s\n",temp);
memset(temp,0,10000);

}

}
}

free(temp);
for(i=0;i<300;i++)
{
//printf("%s\n", file_address[i]);
strcat(train_path,file_address[i]);
strcat(train_path,train_path_part);
strcat(test_path,file_address[i]);
strcat(test_path,test_path_part);
strcat(result_path,file_address[i]);
strcat(result_path,result_path_part);
strcat(model_path,file_address[i]);
strcat(model_path,model_path_part);
strcat(accuracy_path,file_address[i]);
strcat(accuracy_path,accuracy_path_part);
// train
svm_train(command_path_train, train_path, model_path);
// predict
svm_predict(command_path_predict, test_path, model_path, result_path, accuracy_path);
printf("\n%s路径下的LibSVM分类完成\n",file_address[i]);
memset(train_path,0,10000);
memset(result_path,0,10000);
memset(test_path,0,10000);
memset(model_path,0,10000);
memset(accuracy_path,0,10000);

}
free(train_path);
free(test_path);
free(model_path);
free(result_path);
free(accuracy_path);
printf("试验完成\n");

return 1;

}
void AccuracyFormation()
{
char *accuracy_path_part="\\data\\accuracy.txt";
//char  featureDimensions[11][20]={"100","500","1000","1500","2000","2500","3000","3500","4000","4500","5000"};//特征维数
char featureDimensions[15][20]={"10","20","30","40","50","60","70","80","90","100","110","120","130","140","150"};//特征维数
char done_research_times[5][10]={"0","1","2","3","4"};//已经进行了几次试验
char N_corpus[4][20]={"100","500","1000","1500"};//文档集规模
char *accuracy_path=(char *)malloc(10000);
char dest_accuracy[5][20]={"0.txt","1.txt","2.txt","3.txt","4.txt"};
int i,j,k;
int reallen=0;
FILE *fp=NULL;
char *temp=(char *)malloc(1000);
memset(accuracy_path,0,10000);
memset(temp,0,100);
for(i=0;i<5;i++)//指征 done_research_times
{
for(j=0;j<4;j++)//指征文档集规模
{
for( k=0;k<15;k++)//指征特征词维数
{   //构造路径
strcat(accuracy_path,"D:\\");
strcat(accuracy_path,done_research_times[i]);
strcat(accuracy_path,"_");
strcat(accuracy_path,N_corpus[j]);
strcat(accuracy_path,"_r1");
strcat(accuracy_path,"\\TextCategorization_");
strcat(accuracy_path,done_research_times[i]);
strcat(accuracy_path,"_");
strcat(accuracy_path,N_corpus[j]);
strcat(accuracy_path,"_");
strcat(accuracy_path,featureDimensions[k]);
strcat(accuracy_path,accuracy_path_part);

fp=fopen(accuracy_path,"r");
if(fp==NULL)
{
printf("FILENAEM ERROR");
exit(0);
}

fread(temp,1,100,fp);
fclose(fp);
fp=fopen(dest_accuracy[i],"a");
if(fp==NULL)
{
printf("FILENAEM ERROR");
exit(0);

}
if(k<14)//添加逗号
{
strcat(temp,",");
}
fwrite(temp,1,strlen(temp),fp);
fclose(fp);
printf("%s处理完毕\n",accuracy_path);
memset(accuracy_path,0,10000);
memset(temp,0,1000);

}
fp=fopen(dest_accuracy[i],"a");
if(fp==NULL)
{
printf("FILENAEM ERROR");
exit(0);

}
strcat(temp,"\r\n");

fwrite(temp,1,strlen(temp),fp);
fclose(fp);
printf("一行处理完毕\n");
memset(temp,0,1000);

}

printf("%s填写完毕\n",dest_accuracy[i]);
}

free(temp);
free(accuracy_path);

}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: