您的位置:首页 > 编程语言

决策树分类器+C代码

2015-07-29 10:01 309 查看
关于决策树的理解和计算过程,http://www.tuicool.com/articles/3EZJBz 这篇文章上有很详细的介绍。

就我完成决策树代码的一些步骤进行介绍:(ID3算法)

1. 获取样本,计算样本的增益值,选择增益值最大的作为下一步分支的根节点;

2. 将被选中的样本属性删除(我这里是将该属性列的值设置为99,即认为删除);

3. 对被选中的属性列中的子属性进行分类:如:在本例代码中,最先被选中的属性为outlook,将outlook中的子属性:sunny,overcast,rain分为三类s1,s2,s3;

4. 对s1,s2,s3分别进行递归,即分别以s1,s2,s3为样本重新执行步骤1;

5. 递归结束的标志为:当前样本全为反例,即全为NO,输出NO并return;或 当前样本全为正例,即全为yes,输出yes并return;当前分支结束遍历。

递归的算法还是很容易编写的,难的是如何记录它的遍历路径以及子属性的选择。本例代码在路径记录和选择输出上我感觉还不够完善,也未必完全正确,希望大家多提意见,帮忙指出并纠正代码中的不足。


本代码用例如下:



根据上述表格,对应得到以下数据(每行一个样本,五列依次为:天气、温度、湿度、风力、去/不去;最后一列中1表示去,0表示不去):

0 0 0 0 0

0 0 0 1 0

1 0 0 0 1

2 1 0 0 1

2 2 1 0 1

2 2 1 1 0

1 2 1 1 1

0 1 0 0 0

0 2 1 0 1

2 1 1 0 1

0 1 1 1 1

1 1 1 1 1

1 0 1 0 1

2 1 0 1 0

C++代码如下(代码中使用了vector,需按C++编译):

#include "stdio.h"

#include "stdlib.h"

#include "math.h"

#include "string.h"

#include "vector"

using namespace std;

// Sentinel value: stored in an attribute column once that attribute has been
// used for a split, and in path fields that are not in use. The program's own
// banner describes 99 as "not considered".
#define INF 99

#define dimNum 5 // number of values per sample row (the last one is the yes/no label)

// A list of ints; used both for one sample row and for small result lists.
typedef vector<int> intVector;

vector<intVector> getFileInf(char *File); // read the samples from a file

void ID3(vector<intVector> sample); // entry point of the ID3 decision-tree construction

intVector getYESorNOnum(vector<intVector> sample); // count the "yes" and "no" samples

vector<double> getEntropy(vector<intVector> sample); // conditional entropy of every attribute column

int getGainLaber(double HD, vector<double> Entropy, int num); // index of the column with the largest gain

void Iter(vector<intVector> sample, int laber); // recursive split on a column

void output(); // print the recorded paths

void save(); // store the current path

// One step of a decision path, as recorded by getGainLaber() and stored by save().
struct pathInf

{

int att; // attribute (column) index, or INF when the slot is unused

int num; // branch value recorded for that attribute (the `num` argument of getGainLaber)

int ID; // sequence number giving the order in which the slot was filled

int r; // leaf result; save() writes it into the last attribute slot only

};

// Scratch record of the path currently being built: filled by getGainLaber(),
// copied into savePath and reset to INF by save().
pathInf path[dimNum];

// Next sequence number handed to a path slot; save() resets it to 1.
int ID = 1;

// Result of the leaf just reached; Iter() sets it (0 = no "yes" samples,
// 1 = no "no" samples) right before calling save().
int result;

typedef vector<pathInf> pathVector;

// One stored path per leaf, in the order save() was called.
vector<pathVector> savePath;

// Program entry point.
//
// Resets the attribute slots of the global path record, prints the column
// header, loads the samples from "样本.txt", builds the tree with ID3() and
// prints the recorded paths with output().
//
// Returns 0 on normal completion. getFileInf() and ID3() may terminate the
// program themselves through exit().
int main()
{
    // A writable array rather than a pointer to a string literal:
    // getFileInf() takes a non-const char*, and binding a literal to char*
    // is not valid in standard C++.
    char File[] = "样本.txt";

    // Mark every attribute slot of the path record as unused.
    for (int i = 0; i < dimNum - 1; i++)
    {
        path[i].att = INF;
        path[i].num = INF;
        path[i].ID = INF;
    }

    printf("天气 温度 湿度 风力 Y/N (注:99表示未考虑。)\n");

    vector<intVector> sample = getFileInf(File);
    ID3(sample);
    output();

    return 0;
}

// Entry point of the ID3 decision-tree construction.
//
// Counts the "yes"/"no" labels of `sample`. If one of the two classes is
// absent, a message is printed and the program ends through exit(0).
// Otherwise the entropy of the whole sample set is computed, getGainLaber()
// picks the column with the largest gain, and Iter() starts the recursive
// split on that column.
void ID3(vector<intVector> sample)
{
    const intVector labelCounts = getYESorNOnum(sample);
    const int yesCount = labelCounts.at(0);
    const int noCount = labelCounts.at(1);

    if (yesCount == 0)
    {
        printf("样本全为不去!!!\n");
        exit(0);
    }
    if (noCount == 0)
    {
        printf("样本全为去!!!\n");
        exit(0);
    }

    // Class probabilities over the full sample set.
    const double pYes = static_cast<double>(yesCount) / sample.size();
    const double pNo = static_cast<double>(noCount) / sample.size();

    // Entropy in bits; log2(x) is written as log10(x) / log10(2).
    const double totalEntropy = -pYes * log10(pYes) / log10(2)
                                - pNo * log10(pNo) / log10(2);

    const vector<double> columnEntropy = getEntropy(sample);

    // Column with the largest gain becomes the root split.
    const int rootColumn = getGainLaber(totalEntropy, columnEntropy, 0);
    Iter(sample, rootColumn);
}

// Recursive split step.
//
// sample: rows still belonging to the current node.
// laber:  column index to split on.
//
// If the node contains no "yes" rows (result = 0) or no "no" rows
// (result = 1), the current path is stored with save() and the recursion
// stops. Otherwise the rows are partitioned by the value (0, 1 or 2) found in
// column `laber`, that column is overwritten with INF in every copied row, and
// each non-empty partition is processed in value order: its entropy is
// computed, getGainLaber() chooses the next column (and records the branch
// value in the global path), and Iter() recurses.
void Iter(vector<intVector> sample, int laber)
{
    const intVector nodeCounts = getYESorNOnum(sample);

    if (nodeCounts[0] == 0)
    {
        result = 0;
        save();
        return;
    }
    if (nodeCounts[1] == 0)
    {
        result = 1;
        save();
        return;
    }

    // branches[v] collects the rows whose split column holds the value v.
    vector<intVector> branches[3];
    for (size_t row = 0; row < sample.size(); ++row)
    {
        const int value = sample[row][laber];
        if (value < 0 || value > 2)
            continue; // rows with any other value belong to no branch

        intVector reduced = sample[row];
        reduced[laber] = INF; // mark the split column as used
        branches[value].push_back(reduced);
    }

    for (int value = 0; value < 3; ++value)
    {
        const vector<intVector> &subset = branches[value];
        if (subset.size() == 0)
            continue;

        const intVector counts = getYESorNOnum(subset);
        const double pYes = static_cast<double>(counts.at(0)) / subset.size();
        const double pNo = static_cast<double>(counts.at(1)) / subset.size();

        // Entropy of the subset in bits. For a pure subset this evaluates to
        // NaN (0 * log10(0)); getGainLaber() then returns column 0 and the
        // recursive call ends immediately at one of the leaf checks above.
        const double subsetEntropy = -pYes * log10(pYes) / log10(2)
                                     - pNo * log10(pNo) / log10(2);

        const vector<double> columnEntropy = getEntropy(subset);
        const int nextColumn = getGainLaber(subsetEntropy, columnEntropy, value);
        Iter(subset, nextColumn);
    }
}

// Chooses the column with the largest gain and records used columns in the
// global path.
//
// HD:      entropy of the current sample set.
// Entropy: one entry per attribute column, as produced by getEntropy().
// num:     branch value to store in any path slot filled by this call.
//
// Returns the index of the first column whose gain (HD - Entropy[i]) is
// strictly largest. Entropy must not be empty.
//
// Side effect: every column whose entry is greater than 10 (getEntropy()
// stores 99 for a column already marked with the sentinel) and whose path
// slot is still unused gets that slot filled with the column index, the next
// sequence number from the global ID counter, and `num`.
int getGainLaber(double HD, vector<double> Entropy, int num)
{
    const size_t columnCount = Entropy.size();

    int bestColumn = 0;
    double bestGain = HD - Entropy[0];
    for (size_t col = 1; col < columnCount; ++col)
    {
        const double gain = HD - Entropy[col];
        if (bestGain < gain)
        {
            bestGain = gain;
            bestColumn = static_cast<int>(col);
        }
    }

    for (size_t col = 0; col < columnCount; ++col)
    {
        if (!(Entropy[col] > 10))
            continue;
        if (path[col].att != INF)
            continue;

        path[col].att = static_cast<int>(col);
        path[col].ID = ID;
        path[col].num = num;
        ID++;
    }

    return bestColumn;
}

// Counts the "yes" and "no" samples.
//
// sample: rows whose last element is the class label (1 = yes).
//
// Returns a two-element list: [0] = number of rows labelled 1,
// [1] = number of all other rows, so the two counts always add up to
// sample.size().
//
// The label is read from each row's own last element. The previous version
// used the width of row 0 for every row, which wrapped around for an empty
// row and read out of bounds for a row shorter than row 0. An empty row has
// no label and is counted as "no", in line with the existing rule that
// everything other than 1 is a "no".
std::vector<int> getYESorNOnum(std::vector<std::vector<int> > sample)
{
    int yesNum = 0;
    int noNum = 0;

    for (size_t row = 0; row < sample.size(); ++row)
    {
        if (!sample[row].empty() && sample[row].back() == 1)
            yesNum++;
        else
            noNum++;
    }

    std::vector<int> dst;
    dst.push_back(yesNum);
    dst.push_back(noNum);
    return dst;
}

// Entropy (in bits) of a group holding `yes` positive and `no` negative rows
// out of `total`. An empty or pure group has entropy 0.
static double branchEntropy(int yes, int no, int total)
{
    if (total == 0 || yes == 0 || no == 0)
        return 0;

    const double pYes = static_cast<double>(yes) / total;
    const double pNo = static_cast<double>(no) / total;

    // log2(x) is written as log10(x) / log10(2).
    return -pYes * log10(pYes) / log10(2) - pNo * log10(pNo) / log10(2);
}

// Computes the conditional entropy of every attribute column.
//
// sample: rows of equal width; the last element of each row is the class
//         label (1 = yes, 0 = no), all preceding elements are attributes.
//
// Returns one value per attribute column (row width minus one). For a column
// whose first row holds 99 (the sentinel Iter() writes into a used column)
// the value is 99. Otherwise it is the sum, over the attribute values 0, 1
// and 2, of (rows with that value / all rows) * entropy of those rows. Rows
// whose attribute value is not 0, 1 or 2 are not counted in any group.
//
// An empty sample, or one whose first row is empty, yields an empty result;
// previously this case read sample[0] out of bounds.
std::vector<double> getEntropy(std::vector<std::vector<int> > sample)
{
    std::vector<double> Entropy;
    if (sample.empty() || sample[0].empty())
        return Entropy;

    const size_t labelCol = sample[0].size() - 1;

    for (size_t col = 0; col < labelCol; ++col)
    {
        // Column already used for a split: report the sentinel.
        if (sample[0][col] == 99)
        {
            Entropy.push_back(99);
            continue;
        }

        // Per attribute value: number of rows, and how many are yes / no.
        int total[3] = {0, 0, 0};
        int yes[3] = {0, 0, 0};
        int no[3] = {0, 0, 0};

        for (size_t row = 0; row < sample.size(); ++row)
        {
            const int value = sample[row][col];
            if (value < 0 || value > 2)
                continue;

            ++total[value];

            const int label = sample[row][labelCol];
            if (label == 1)
                ++yes[value];
            else if (label == 0)
                ++no[value];
        }

        // Weighted sum, accumulated in value order 0, 1, 2.
        double weighted = 0;
        for (int value = 0; value < 3; ++value)
        {
            weighted += (static_cast<double>(total[value]) / sample.size())
                        * branchEntropy(yes[value], no[value], total[value]);
        }
        Entropy.push_back(weighted);
    }

    return Entropy;
}

// Reads the samples from a text file.
//
// File: path of a file containing whitespace-separated integers.
//
// Returns the integers grouped into rows of dimNum values, in file order.
// Reading stops at end of file or at the first token that is not an integer;
// a trailing group with fewer than dimNum values is discarded.
//
// If the file cannot be opened, a message is printed and the program ends
// with a failure exit status.
vector<intVector> getFileInf(char *File)
{
    vector<intVector> samples;

    FILE *fp = fopen(File, "r");
    if (fp == NULL)
    {
        printf("Open file error!\n");
        exit(EXIT_FAILURE);
    }

    intVector row;
    int value;

    // Compare against 1 (one item converted) rather than EOF: on a
    // non-numeric token fscanf returns 0 without consuming input, which would
    // otherwise loop forever.
    while (fscanf(fp, "%d", &value) == 1)
    {
        row.push_back(value);
        if (static_cast<int>(row.size()) == dimNum)
        {
            samples.push_back(row);
            row.clear();
        }
    }

    fclose(fp); // the stream was previously never closed
    return samples;
}

//保存路径

void save()

{

pathVector temp;

pathInf swp;

int i, j, l;

int min;

ID = 1;

for(i=0; i<dimNum-1; i++)

{

l=i;

min = path[i].ID;

for(j=i+1; j<dimNum-1; j++)

if(min>path[j].ID)

{

min = path[j].ID;

l = j;

}

swp = path[i];

path[i] = path[l];

path[l] = swp;

}

path[dimNum-2].r = result;

for(i=0; i<dimNum-1; i++)

temp.push_back(path[i]);

savePath.push_back(temp);

temp.clear();

for(i=0; i<dimNum-1; i++)

{

path[i].att = INF;

path[i].num = INF;

path[i].ID = INF;

}

}

//结果输出

void output()

{

int i, j;

int root;

int maxPro;

int Pro[dimNum] = {0}; //优先级

intVector temp(dimNum, INF);

vector<intVector> saveResult(savePath.size(), temp);

for(i=0; i<dimNum; i++)

for(j=0; j<dimNum; j++)

saveResult[i][j] = INF;

//计算优先级的大小

for(i=0; i<savePath.size(); i++)

{

for(j=0; j<dimNum-1; j++)

{

if(savePath[i][j].att==INF)

continue;

Pro[savePath[i][j].att] += abs(j-dimNum); //累加值越大,说明该节点越深

}

}

//训练结果

for(i=0; i<savePath.size(); i++)

{

for(j=0; j<dimNum-1; j++)

{

if(savePath[i][j].att == INF)

break;

if(j==0)

saveResult[i][savePath[i][j].att] = savePath[i][j].num;

if(j>0)

{ //因为程序是递归的,根据节点深度确定程序的遍历顺序和当前节点所处的节点位置,判断当前子树是否遍历完成

if(Pro[savePath[i][j].att] > Pro[savePath[i][j-1].att])

saveResult[i][savePath[i][j].att] = saveResult[i-1][savePath[i][j].att];

else

saveResult[i][savePath[i][j].att] = savePath[i][j].num;

}

}

saveResult[i][dimNum-1] = savePath[i][dimNum-2].r;

}

//结果输出

for(i=0; i<saveResult.size(); i++)

{

for(j=0; j<dimNum; j++)

printf("%4d ", saveResult[i][j]);

printf("\n");

}

}

运行结果如下:




内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: