8.2 - 《机器学习基石》Homework 2 Q.19-20
2014-04-08 17:46
281 查看
这一题把第16题中的 decision stump 拓展到多维:对每一维分别训练一个 decision stump,找出 E-in 最小的那一维,并用该维对应的假设在测试数据上计算 E-out:
#include <fstream>
#include <iostream>
#include <ctime>
#include <cmath>
#include <vector>
#include <algorithm>

using namespace std;

#define DEMENSION 9 // number of input features per record

// Input file names. String literals are immutable, hence pointer-to-const.
const char *file = "training.txt";
const char *file_test = "testing.txt";

// One labelled sample: DEMENSION feature values followed by the label.
struct record {
    double input[DEMENSION];
    int output;
};

// A sample projected onto a single feature.
struct singleDemensionRecord {
    double input;
    int output;
};

// One-dimensional decision stump: h(x) = coef * sign(x - threshold).
struct Hypothesis {
    int coef;
    double threshold;
};

// Sign of x; zero (and NaN) map to -1, as in the original implementation.
int sign(double x)
{
    return x > 0 ? 1 : -1;
}

// Reads whitespace-separated records (DEMENSION inputs followed by one
// output) from dataFile, appends them to data, then closes the stream.
//
// Reading stops at the first record that cannot be fully extracted. Testing
// the extraction itself instead of eof() avoids appending a bogus record when
// the file ends with a trailing newline or is truncated.
void getData(ifstream & dataFile, vector<record> &data)
{
    record curRecord;
    while (true) {
        bool complete = true;
        for (int i = 0; i < DEMENSION && complete; ++i) {
            if (!(dataFile >> curRecord.input[i])) {
                complete = false;
            }
        }
        if (!complete || !(dataFile >> curRecord.output)) {
            break;
        }
        data.push_back(curRecord);
    }
    dataFile.close();
}

// Fraction of samples in singleDemensionVec misclassified by the stump stored
// at hypo[demension-1] (demension is 1-based).
// Returns 0.0 for an empty sample set instead of dividing by zero.
double calErr(const vector<singleDemensionRecord>& singleDemensionVec, const vector<Hypothesis>& hypo, int demension)
{
    const vector<singleDemensionRecord>::size_type length = singleDemensionVec.size();
    if (length == 0) {
        return 0.0;
    }

    const Hypothesis &h = hypo[demension - 1];
    int errCount = 0;
    for (vector<singleDemensionRecord>::size_type i = 0; i < length; ++i) {
        const int predicted = h.coef * sign(singleDemensionVec[i].input - h.threshold);
        if (singleDemensionVec[i].output != predicted) {
            ++errCount;
        }
    }
    return double(errCount) / double(length);
}

// Strict-weak ordering on the feature value, used for sorting.
// Takes const references: std::sort comparators must not modify their arguments.
bool recCompare(const singleDemensionRecord & a, const singleDemensionRecord & b)
{
    return a.input < b.input;
}

// Appends the (feature, label) pairs of the given 1-based dimension from
// dataSet to singleDemensionVec, then sorts the vector ascending by feature.
void getInputByDemension(const vector<record>& dataSet, vector<singleDemensionRecord>& singleDemensionVec, int demension)
{
    singleDemensionVec.reserve(singleDemensionVec.size() + dataSet.size());
    for (vector<record>::size_type i = 0; i < dataSet.size(); ++i) {
        singleDemensionRecord curRec;
        curRec.input = dataSet[i].input[demension - 1];
        curRec.output = dataSet[i].output;
        singleDemensionVec.push_back(curRec);
    }
    sort(singleDemensionVec.begin(), singleDemensionVec.end(), recCompare);
}

// Searches candidate thresholds for the stump of the given 1-based dimension,
// using the coef already stored in hypo[demension-1], and returns the smallest
// in-sample error. The matching threshold is written to bestThres.
//
// singleDemensionVec is expected to be sorted ascending by input (as produced
// by getInputByDemension). hypo[demension-1].threshold is used as scratch
// space and holds the last candidate tried on return.
//
// For an empty sample set the function returns 1.0 and sets bestThres to 0.0.
double getMinErrIn(const vector<singleDemensionRecord> & singleDemensionVec, vector<Hypothesis>& hypo, int demension, double & bestThres)
{
    const int recordSize = int(singleDemensionVec.size());
    if (recordSize == 0) {
        bestThres = 0.0;
        return 1.0;
    }

    Hypothesis &h = hypo[demension - 1];

    // Start above any reachable error rate so the first candidate is always
    // accepted and bestThres is guaranteed to be assigned.
    double minErrIn = 2.0;

    // Midpoints between neighbouring samples.
    for (int i = 0; i < recordSize - 1; ++i) {
        h.threshold = (singleDemensionVec[i].input + singleDemensionVec[i + 1].input) / 2.0;
        const double curErrIn = calErr(singleDemensionVec, hypo, demension);
        if (curErrIn < minErrIn) {
            minErrIn = curErrIn;
            bestThres = h.threshold;
        }
    }

    // A threshold below every sample yields the "all points labelled coef"
    // stump, which the midpoints alone cannot express. It is tried last so
    // that ties keep the midpoint chosen above.
    h.threshold = singleDemensionVec.front().input - 1.0;
    const double allSameErrIn = calErr(singleDemensionVec, hypo, demension);
    if (allSameErrIn < minErrIn) {
        minErrIn = allSameErrIn;
        bestThres = h.threshold;
    }

    return minErrIn;
}

// Multi-dimensional decision stump.
//
// For every dimension, fits the best stump (sign s and threshold theta) on
// trainingSet and stores it in hypo. Then prints the 1-based dimension with
// the lowest E-in, that E-in, and the E-out of the same stump on testSet.
// hypo must hold at least DEMENSION elements.
void decisionStump(vector<record>& trainingSet, vector<record>& testSet, vector<Hypothesis>& hypo)
{
    int minErrInDem = 1;   // 1-based index of the best dimension so far
    double minErrIn = 1.1; // above any reachable error rate

    for (int dem = 0; dem < DEMENSION; ++dem) {
        vector<singleDemensionRecord> singleDemensionVec;
        getInputByDemension(trainingSet, singleDemensionVec, dem + 1);

        double bestThresPositive = 0.0;
        double bestThresNegtive = 0.0;

        hypo[dem].coef = 1;
        const double minErrInPositive = getMinErrIn(singleDemensionVec, hypo, dem + 1, bestThresPositive);
        hypo[dem].coef = -1;
        const double minErrInNegtive = getMinErrIn(singleDemensionVec, hypo, dem + 1, bestThresNegtive);

        // Keep whichever sign fits better; ties go to s = -1.
        double curMinErrIn;
        if (minErrInPositive < minErrInNegtive) {
            hypo[dem].coef = 1;
            hypo[dem].threshold = bestThresPositive;
            curMinErrIn = minErrInPositive;
        } else {
            hypo[dem].coef = -1;
            hypo[dem].threshold = bestThresNegtive;
            curMinErrIn = minErrInNegtive;
        }

        if (minErrIn > curMinErrIn) {
            minErrIn = curMinErrIn;
            minErrInDem = dem + 1;
        }
    }

    cout << "The demension with min error is : " << minErrInDem << endl;
    cout << "min E-in = " << minErrIn << endl;

    vector<singleDemensionRecord> singleDemensionTestVec;
    getInputByDemension(testSet, singleDemensionTestVec, minErrInDem);
    cout << "min E-out = " << calErr(singleDemensionTestVec, hypo, minErrInDem) << endl << endl;
}

// Loads the training and test sets and runs the multi-dimensional stump.
// Returns 0 on success, 1 if a file cannot be opened or contains no records.
int main()
{
    vector<record> trainingSet;            // training data
    vector<record> testSet;                // test data
    vector<Hypothesis> hypoVec(DEMENSION); // one hypothesis per dimension

    ifstream dataFile(file);
    ifstream testDataFile(file_test);

    if (!dataFile.is_open() || !testDataFile.is_open()) {
        cerr << "ERROR ---> 文件打开失败" << endl;
        return 1;
    }

    getData(dataFile, trainingSet);
    getData(testDataFile, testSet);

    if (trainingSet.empty() || testSet.empty()) {
        cerr << "ERROR ---> no records could be read from the data files" << endl;
        return 1;
    }

    decisionStump(trainingSet, testSet, hypoVec);
    return 0;
}
相关文章推荐
- 机器学习技法作业二题目19-20
- 机器学习技法作业三题目19-20
- Python实现基于朴素贝叶斯的垃圾邮件分类 标签: python朴素贝叶斯垃圾邮件分类 2016-04-20 15:09 2750人阅读 评论(1) 收藏 举报 分类: 机器学习(19) 听说
- Coursera课程-机器学习基石作业一Q18-Q20(pocket on D算法 for PLA/C++ edition)
- 機器學習基石 机器学习基石 (Machine Learning Foundations) 作业二 Q19-20 C++实现
- 机器学习基石 8.2 Error Measure
- 機器學習基石(Machine Learning Foundations) 机器学习基石 作业四 Q13-20 MATLAB实现
- 機器學習基石(Machine Learning Foundations) 机器学习基石 作业三 Q18-20 C++实现
- 機器學習基石(Machine Learning Foundations) 机器学习基石 作业四 Q13-20 MATLAB实现
- 机器学习基石 3-1 Learning with different output space
- 台大机器学习基石笔记(四)——VC 维3
- 林轩田机器学习基石笔记4:机器学习的可行性
- 机器学习基石-05-4-Break Point
- 林轩田--机器学习基石&机器学习技法
- 机器学习基石-07-1-Definition of VC Dimension
- 机器学习基石-08-2-Error Measure
- 机器学习基石-08-3-Algorithmic Error Measure
- 【面试系列】编写一个程序,列出1-19所有相加为20的组合
- 机器学习基石PLA算法c++语言实现
- 计算1/2-2/3+3/4-…+19/20的值.