机器学习实战-knn-1
2016-06-14 12:17
323 查看
# coding=utf-8
from numpy import *
import operator
import string
import matplotlib.pyplot as plt
import numpy as np


# Build the toy data set.
def createDataSet():
    """Return a small hand-made sample set and its class labels.

    Returns:
        A ``(group, labels)`` pair: ``group`` is a 4x2 array of points and
        ``labels`` is the list of class names, one per row of ``group``.
    """
    group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels


# kNN algorithm implementation.
def knn_classify0(inx, dataSet, labels, k):
    """Classify ``inx`` by majority vote among its ``k`` nearest samples.

    Args:
        inx: Input vector to classify.
        dataSet: Training samples, one row per sample (a NumPy array).
        labels: Class label of each row of ``dataSet``.
        k: Number of nearest neighbours that vote.

    Returns:
        The label with the most votes among the ``k`` samples closest to
        ``inx`` by Euclidean distance.
    """
    dataSetSize = dataSet.shape[0]
    # tile() repeats inx once per training row so the subtraction is row-wise.
    diffmat = tile(inx, (dataSetSize, 1)) - dataSet
    sqdiffmat = diffmat ** 2
    # Sum each row to get the squared distance to every sample.
    sqdistance = sqdiffmat.sum(axis=1)
    distance = sqdistance ** 0.5
    # Indices of the samples ordered from nearest to farthest.
    sortedDisIndex = distance.argsort(axis=0)
    # Tally the labels of the k nearest samples.
    classCount = {}
    for i in range(k):
        votelabel = labels[sortedDisIndex[i]]
        # dict.get(key, default) yields the running count, or 0 the first time.
        classCount[votelabel] = classCount.get(votelabel, 0) + 1
    # items() exists on both Python 2 and 3; iteritems() is Python 2 only.
    sortedclassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedclassCount[0][0]
def file2matrix(filename):
    """Load a tab-separated data file into a feature matrix and a label list.

    Each non-blank line supplies one sample: its first three fields are the
    numeric features and its last field is the class label.

    Args:
        filename: Path of the text file to read.

    Returns:
        A ``(returnMat, classLabelVector)`` pair: an N x 3 float array of
        features and a list of the N label strings, in file order.

    Raises:
        ValueError: If a feature field cannot be parsed as a float, or a
            line has fewer than three feature fields.
    """
    # The with-block closes the file even if parsing fails later.
    with open(filename) as fr:
        # Strip surrounding whitespace/newlines, drop blank lines (such as a
        # trailing empty line), then split each record on TAB.
        stripped = (line.strip() for line in fr)
        rows = [line.split('\t') for line in stripped if line]
    # Size the matrix from the records actually kept.
    returnMat = zeros((len(rows), 3))
    classLabelVector = []
    for index, listFromLine in enumerate(rows):
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        classLabelVector.append(listFromLine[-1])
    return returnMat, classLabelVector
# DataMat,DataLabel=file2matrix('D:\learn\Ch02\datingTestSet.txt')
# group,label=createDataSet()
# result0=knn_classify0([3,3.5], group, label, 3)
# Visualisation
# Raw string: the backslashes in the Windows path are literal characters,
# not escape sequences (the path value is unchanged).
DataMat, DataLabel = file2matrix(r'D:\learn\Ch02\datingTestSet.txt')
fig = plt.figure()
ax = fig.add_subplot(211)
# Scatter the second feature column against the third.
ax.scatter(DataMat[:, 1], DataMat[:, 2])
# plt.show()
# Normalisation function.
def autoNorm(dataset):
    """Rescale every column of ``dataset`` to the range [0, 1].

    Args:
        dataset: 2-D NumPy array with one sample per row and one feature
            per column.

    Returns:
        A ``(normData, rangeValue, minValue)`` triple: the rescaled data,
        the per-column ``max - min`` and the per-column minimum.  A column
        whose values are all equal has range 0 and is rescaled to all zeros.
    """
    minValue = dataset.min(0)
    maxValue = dataset.max(0)
    rangeValue = maxValue - minValue
    m = dataset.shape[0]
    # Dividing by a zero range would fill the column with NaN, so use 1 as
    # the divisor there; the returned rangeValue itself is left untouched.
    divisor = np.where(rangeValue == 0, 1, rangeValue)
    normData = dataset - tile(minValue, (m, 1))
    normData = normData / tile(divisor, (m, 1))
    return normData, rangeValue, minValue
def datingClassTest():
    """Print the kNN error rate on a hold-out slice of the dating data.

    The first 10% of the rows are used as test samples and the remaining
    rows as training samples, with k = 3.  For every test sample the
    predicted and the actual label are printed, followed by the overall
    error rate.
    """
    # Fraction of the samples used as the test set.
    hoRatio = 0.1
    # Raw string keeps the Windows path's backslashes literal.
    DataMat, DataLabel = file2matrix(r'D:\learn\Ch02\datingTestSet.txt')
    normMat, ranges, minvalues = autoNorm(DataMat)
    m = normMat.shape[0]
    numOfTest = int(m * hoRatio)
    errorCount = 0.0
    for i in range(numOfTest):
        classiferResult = knn_classify0(normMat[i, :],
                                        normMat[numOfTest:m, :],
                                        DataLabel[numOfTest:m], 3)
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("%s %s" % (classiferResult, DataLabel[i]))
        if classiferResult != DataLabel[i]:
            errorCount = errorCount + 1
    # Fewer than ten samples leaves the test set empty; avoid dividing by 0.
    errorRate = errorCount / float(numOfTest) if numOfTest else 0.0
    print("the total error rate is : %f" % errorRate)


# datingClassTest()
# Build a small system: the user enters flier miles, gaming time and ice
# cream consumption, and gets back the matching level of interest.
def classifyPerson():
    """Prompt for three feature values and print the predicted class label.

    The training data is read from the hard-coded dating data file and
    normalised; the user's values are normalised with the same minimums
    and ranges and classified with k = 3.
    """
    # raw_input() was renamed to input() in Python 3; support both.
    try:
        read_line = raw_input
    except NameError:
        read_line = input
    ffmile = float(read_line("flier miles per year?"))
    percentGame = float(read_line("percentage of time spent playing video game?"))
    iceCream = float(read_line("ice cream consumed per year?"))
    # Raw string keeps the Windows path's backslashes literal.
    DataMat, DataLabel = file2matrix(r'D:\learn\Ch02\datingTestSet.txt')
    normMat, ranges, minvalues = autoNorm(DataMat)
    inx = array([ffmile, percentGame, iceCream])
    # A constant feature has range 0; divide by 1 there instead of by zero.
    safeRanges = np.where(ranges == 0, 1, ranges)
    classiferResult = knn_classify0((inx - minvalues) / safeRanges,
                                    normMat, DataLabel, 3)
    print(classiferResult)


classifyPerson()
def file2matrix(filename):
    """Load a tab-separated data file into a feature matrix and a label list.

    Each non-blank line supplies one sample: its first three fields are the
    numeric features and its last field is the class label.

    Args:
        filename: Path of the text file to read.

    Returns:
        A ``(returnMat, classLabelVector)`` pair: an N x 3 float array of
        features and a list of the N label strings, in file order.

    Raises:
        ValueError: If a feature field cannot be parsed as a float, or a
            line has fewer than three feature fields.
    """
    # The with-block closes the file even if parsing fails later.
    with open(filename) as fr:
        # Strip surrounding whitespace/newlines, drop blank lines (such as a
        # trailing empty line), then split each record on TAB.
        stripped = (line.strip() for line in fr)
        rows = [line.split('\t') for line in stripped if line]
    # Size the matrix from the records actually kept.
    returnMat = zeros((len(rows), 3))
    classLabelVector = []
    for index, listFromLine in enumerate(rows):
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        classLabelVector.append(listFromLine[-1])
    return returnMat, classLabelVector
# DataMat,DataLabel=file2matrix('D:\learn\Ch02\datingTestSet.txt')
# group,label=createDataSet()
# result0=knn_classify0([3,3.5], group, label, 3)
# Visualisation
# Raw string: the backslashes in the Windows path are literal characters,
# not escape sequences (the path value is unchanged).
DataMat, DataLabel = file2matrix(r'D:\learn\Ch02\datingTestSet.txt')
fig = plt.figure()
ax = fig.add_subplot(211)
# Scatter the second feature column against the third.
ax.scatter(DataMat[:, 1], DataMat[:, 2])
# plt.show()
# Normalisation function.
def autoNorm(dataset):
    """Rescale every column of ``dataset`` to the range [0, 1].

    Args:
        dataset: 2-D NumPy array with one sample per row and one feature
            per column.

    Returns:
        A ``(normData, rangeValue, minValue)`` triple: the rescaled data,
        the per-column ``max - min`` and the per-column minimum.  A column
        whose values are all equal has range 0 and is rescaled to all zeros.
    """
    minValue = dataset.min(0)
    maxValue = dataset.max(0)
    rangeValue = maxValue - minValue
    m = dataset.shape[0]
    # Dividing by a zero range would fill the column with NaN, so use 1 as
    # the divisor there; the returned rangeValue itself is left untouched.
    divisor = np.where(rangeValue == 0, 1, rangeValue)
    normData = dataset - tile(minValue, (m, 1))
    normData = normData / tile(divisor, (m, 1))
    return normData, rangeValue, minValue
def datingClassTest():
    """Print the kNN error rate on a hold-out slice of the dating data.

    The first 10% of the rows are used as test samples and the remaining
    rows as training samples, with k = 3.  For every test sample the
    predicted and the actual label are printed, followed by the overall
    error rate.
    """
    # Fraction of the samples used as the test set.
    hoRatio = 0.1
    # Raw string keeps the Windows path's backslashes literal.
    DataMat, DataLabel = file2matrix(r'D:\learn\Ch02\datingTestSet.txt')
    normMat, ranges, minvalues = autoNorm(DataMat)
    m = normMat.shape[0]
    numOfTest = int(m * hoRatio)
    errorCount = 0.0
    for i in range(numOfTest):
        classiferResult = knn_classify0(normMat[i, :],
                                        normMat[numOfTest:m, :],
                                        DataLabel[numOfTest:m], 3)
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("%s %s" % (classiferResult, DataLabel[i]))
        if classiferResult != DataLabel[i]:
            errorCount = errorCount + 1
    # Fewer than ten samples leaves the test set empty; avoid dividing by 0.
    errorRate = errorCount / float(numOfTest) if numOfTest else 0.0
    print("the total error rate is : %f" % errorRate)


# datingClassTest()
# Build a small system: the user enters flier miles, gaming time and ice
# cream consumption, and gets back the matching level of interest.
def classifyPerson():
    """Prompt for three feature values and print the predicted class label.

    The training data is read from the hard-coded dating data file and
    normalised; the user's values are normalised with the same minimums
    and ranges and classified with k = 3.
    """
    # raw_input() was renamed to input() in Python 3; support both.
    try:
        read_line = raw_input
    except NameError:
        read_line = input
    ffmile = float(read_line("flier miles per year?"))
    percentGame = float(read_line("percentage of time spent playing video game?"))
    iceCream = float(read_line("ice cream consumed per year?"))
    # Raw string keeps the Windows path's backslashes literal.
    DataMat, DataLabel = file2matrix(r'D:\learn\Ch02\datingTestSet.txt')
    normMat, ranges, minvalues = autoNorm(DataMat)
    inx = array([ffmile, percentGame, iceCream])
    # A constant feature has range 0; divide by 1 there instead of by zero.
    safeRanges = np.where(ranges == 0, 1, ranges)
    classiferResult = knn_classify0((inx - minvalues) / safeRanges,
                                    normMat, DataLabel, 3)
    print(classiferResult)


classifyPerson()
相关文章推荐
- python 文本处理
- KNN-1 python 实现
- Python中的sorted函数以及operator.itemgetter函数 (转)
- KNN-1 可能用到的python 知识点
- SWT第一个程序测试
- AngularJS 1
- JAVA 线程初步
- .whl 安装方法(附带pip 安装)
- JAVA 事件监听
- JDBC
- HTML 常用文档
- jQuery 语法
- 事件冒泡或事件捕获
- json
- Java - Java概述
- XX图文扩容配置说明
- 建立一个Point类,包含数据成员x,y,实现需要的成员函数,并设计main函数完成测试
- java.lang.NoClassDefFoundError: org/w3c/dom/ElementTraversal
- canvas元素大小与绘图表面大小
- java.lang.NoClassDefFoundError: org/w3c/dom/ElementTraversal