您的位置:首页 > 其它

8.2 - 《机器学习基石》Home Work 2 Q.19-20

2014-04-08 17:46 281 查看







这一题把16题中的 decision stump 拓展到多维,要求找出E-in最小的那一维并在测试数据上计算对应维度的E-out:





#include <fstream>
#include <iostream>
#include <ctime>
#include <cmath>
#include <vector>
#include <algorithm>

using namespace std;

#define DEMENSION 9           // number of input features (dimensions) per record

// Paths of the training and test data files, opened relative to the working
// directory. They are pointers to const because string literals are immutable;
// binding a literal to a plain `char *` is ill-formed since C++11.
const char *file = "training.txt";
const char *file_test = "testing.txt";

// One labelled sample as stored in the data files: DEMENSION feature values
// followed by an integer label.
struct record
{
    double input[DEMENSION];   // feature values, one per dimension
    int output;                // label attached to this sample
};

// Projection of a record onto a single dimension: one feature value together
// with the sample's label.
struct singleDemensionRecord
{
    double input;   // value of the selected feature
    int output;     // label of the sample the value came from
};

// Parameters of a one-dimensional decision stump; calErr predicts a label as
// coef * sign(x - threshold).
struct Hypothesis
{
    int coef;           // multiplier applied to the sign (decisionStump uses +1 or -1)
    double threshold;   // cut point subtracted from the feature value
};

// Sign of x as used by the stump: +1 when x is strictly positive, -1 for
// every other value (zero is mapped to -1, not 0).
int sign(double x)
{
    return (x > 0) ? 1 : -1;
}

// Reads whitespace-separated records from dataFile and appends them to data.
// Each record consists of DEMENSION floating-point inputs followed by one
// integer label. Reading stops at the first record that cannot be extracted
// completely (end of file, trailing whitespace, or malformed text), so a
// partially-filled record is never stored. The stream is closed on return.
void getData(ifstream & dataFile, vector<record> &data)
{
    record curRecord;

    while(true){
        for(int i=0;i<DEMENSION;++i){ dataFile>>curRecord.input[i]; }
        dataFile>>curRecord.output;

        // Check the stream state AFTER extracting. Testing eof() before the
        // read (as a loop condition) only turns true once a read has already
        // failed, which would append one bogus record after the last real one.
        if(!dataFile) break;

        data.push_back(curRecord);
    }
    dataFile.close();
}

// Fraction of samples in singleDemensionVec that the stump stored at
// hypo[demension-1] misclassifies. `demension` is 1-based. A sample counts as
// an error when its label differs from coef * sign(input - threshold).
double calErr(vector<singleDemensionRecord>& singleDemensionVec, vector<Hypothesis>& hypo, int demension)
{
    const int total = singleDemensionVec.size();
    const int slot = demension-1;   // convert the 1-based dimension to a vector index
    int mistakes = 0;

    for(int k=0;k<total;++k){
        const singleDemensionRecord& sample = singleDemensionVec[k];
        const int predicted = hypo[slot].coef*sign(sample.input-hypo[slot].threshold);
        if(predicted != sample.output){
            ++mistakes;
        }
    }

    return double(mistakes)/double(total);
}

//single demension record的比较函数
bool recCompare(singleDemensionRecord & a, singleDemensionRecord & b)
{
    return a.input<b.input; 
}

// Projects every record of dataSet onto one dimension (`demension` is 1-based),
// appends the resulting (value, label) pairs to singleDemensionVec, and then
// sorts the whole of singleDemensionVec by ascending value using recCompare.
void getInputByDemension(vector<record>& dataSet, vector<singleDemensionRecord>& singleDemensionVec, int demension)
{
    const int column = demension-1;   // 1-based dimension -> array index
    const int total = dataSet.size();

    for(int row=0;row<total;++row){
        singleDemensionRecord projected;
        projected.input = dataSet[row].input[column];
        projected.output = dataSet[row].output;
        singleDemensionVec.push_back(projected);
    }

    sort(singleDemensionVec.begin(),singleDemensionVec.end(),recCompare);
}

// Scans candidate thresholds for the stump of dimension `demension` (1-based)
// and returns the smallest in-sample error rate found; the threshold achieving
// it is written to bestThres. The sign coefficient hypo[demension-1].coef is
// taken as given and must be set by the caller beforehand.
//
// Candidates are the midpoints of adjacent samples (singleDemensionVec is
// expected to be sorted ascending by input, as getInputByDemension leaves it)
// plus one threshold below every sample, which yields the constant prediction
// `coef` for all finite inputs. The latter is tried last, so it is selected
// only when it is strictly better than every midpoint.
//
// bestThres is always assigned: if no candidate has an error below 1.0, or the
// vector is empty, it is -HUGE_VAL and the return value is 1.0.
// hypo[demension-1].threshold is used as scratch space and holds the last
// candidate tried on return; the caller is expected to overwrite it.
double getMinErrIn(vector<singleDemensionRecord> & singleDemensionVec, vector<Hypothesis>& hypo, int demension, double & bestThres)
{
    double minErrIn = 1.0;
    double curErrIn;
    int recordSize = singleDemensionVec.size();

    // Give the out-parameter a defined value up front so the caller never
    // reads an indeterminate double when no candidate improves on 1.0.
    bestThres = -HUGE_VAL;
    if(recordSize==0){
        return minErrIn;   // nothing to evaluate; also avoids 0/0 in calErr
    }

    for(int i=0;i<recordSize-1;++i){
        hypo[demension-1].threshold = double(singleDemensionVec[i].input+singleDemensionVec[i+1].input)/2.0;
        curErrIn = calErr(singleDemensionVec,hypo,demension);
        if(curErrIn<minErrIn){
            minErrIn = curErrIn;
            bestThres = hypo[demension-1].threshold;
        }
    }

    // Threshold below all samples: x - (-HUGE_VAL) is positive for every
    // finite x, so the stump predicts `coef` everywhere.
    hypo[demension-1].threshold = -HUGE_VAL;
    curErrIn = calErr(singleDemensionVec,hypo,demension);
    if(curErrIn<minErrIn){
        minErrIn = curErrIn;
        bestThres = hypo[demension-1].threshold;
    }

    return minErrIn;
}

// Multi-dimensional decision stump.
// For every dimension the best threshold is searched once with coef = +1 and
// once with coef = -1; the better of the two is stored in hypo[dem] (on a tie
// the -1 variant is kept). The dimension with the lowest in-sample error is
// then reported on stdout together with that error and the error of the same
// stump on testSet. The first dimension reaching the minimum wins ties.
//
// hypo must contain at least DEMENSION entries; all of them are overwritten.
void decisionStump(vector<record>& trainingSet, vector<record>& testSet, vector<Hypothesis>& hypo)
{
    int minErrInDem = 1;      // 1-based index of the best dimension so far
    double minErrIn = 1.1;    // above any error rate, so the first dimension always replaces it

    for(int dem=0;dem<DEMENSION;++dem){

        vector<singleDemensionRecord> singleDemensionVec;
        double curMinErrIn;
        // Initialised so that no indeterminate value is ever copied into the
        // hypothesis, whatever getMinErrIn does with its out-parameter.
        double bestThresPositive = 0.0;
        double bestThresNegtive = 0.0;
        double minErrInPositive;
        double minErrInNegtive;

        // The helper functions take 1-based dimension numbers.
        getInputByDemension(trainingSet,singleDemensionVec,dem+1);

        hypo[dem].coef = 1;
        minErrInPositive = getMinErrIn(singleDemensionVec,hypo,dem+1,bestThresPositive);

        hypo[dem].coef = -1;
        minErrInNegtive = getMinErrIn(singleDemensionVec,hypo,dem+1,bestThresNegtive);

        // Keep whichever sign gave the lower in-sample error for this dimension.
        if(minErrInPositive<minErrInNegtive){
            hypo[dem].coef = 1;
            curMinErrIn = minErrInPositive;
            hypo[dem].threshold = bestThresPositive;
        }else{
            hypo[dem].coef = -1;
            curMinErrIn = minErrInNegtive;
            hypo[dem].threshold = bestThresNegtive;
        }

        if(minErrIn>curMinErrIn){
            minErrIn = curMinErrIn;
            minErrInDem = dem+1;
        }
    }

    cout<<"The demension with min error is : "<<minErrInDem<<endl;
    cout<<"min E-in = "<<minErrIn<<endl;

    // Evaluate the winning stump on the same dimension of the test data.
    vector<singleDemensionRecord> singleDemensionTestVec;
    getInputByDemension(testSet,singleDemensionTestVec,minErrInDem);
    cout<<"min E-out = "<<calErr(singleDemensionTestVec,hypo,minErrInDem)<<endl<<endl;
}

// Program entry point: loads the training and test files, runs the
// multi-dimensional decision stump and prints the results.
// Returns 0 on success and 1 when either data file cannot be opened.
// (main must return int in standard C++; `void main` is rejected by
// conforming compilers.)
int main()
{
    vector<record> trainingSet;                      // training data
    vector<record> testSet;                          // test data
    vector<Hypothesis> hypoVec(DEMENSION);           // one hypothesis per dimension

    ifstream dataFile(file);
    ifstream testDataFile(file_test);

    if( !dataFile.is_open() || !testDataFile.is_open() ){
        cerr<<"ERROR ---> 文件打开失败"<<endl;
        return 1;
    }

    getData(dataFile,trainingSet);
    getData(testDataFile,testSet);

    decisionStump(trainingSet,testSet,hypoVec);
    return 0;
}




内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: