您的位置:首页 > 编程语言 > Python开发

【风马一族_Python】 实施kNN算法

2016-06-08 11:07 417 查看
一、在PyCharm 5.0.4(编写python程序的IDE) 编写kNN.py文件的代码

--------------------------

1、 kNN.py 运算符模块

--------------------------

from numpy import *
import operator

#运算符模块   创建数据集和标签
def createDataSet():
    """Build the tiny four-point demo data set used to try out kNN.

    Returns:
        tuple: ``(group, labels)`` where ``group`` is a 4x2 NumPy array of
        sample points and ``labels`` is a list with the class label
        ('A' or 'B') of each row of ``group``.
    """
    group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels


1)、打开命令行

  先进入kNN.py所在的文件夹,再对kNN.py进行程序处理,效果如下图所示



--------------------------

2、 kNN.py k-近邻算法

--------------------------

from numpy import *
import operator

#运算符模块
def createDataSet():
    """Build the tiny four-point demo data set used to try out kNN.

    Returns:
        tuple: ``(group, labels)`` where ``group`` is a 4x2 NumPy array of
        sample points and ``labels`` is a list with the class label
        ('A' or 'B') of each row of ``group``.
    """
    group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels

#k-近邻算法  此模块需要使用运算符模块的group/labels
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its k nearest training rows.

    Distances are Euclidean, computed between ``inX`` and every row of
    ``dataSet``.

    Args:
        inX: Feature vector to classify; it must broadcast against the rows
            of ``dataSet`` (same number of features).
        dataSet: 2-D NumPy array of training samples, one sample per row.
        labels: Sequence of class labels; ``labels[i]`` is the label of
            ``dataSet[i]``.
        k: Number of nearest neighbours that vote. A value larger than the
            number of training rows is clamped to that number.

    Returns:
        The label that received the most votes among the nearest neighbours.
    """
    dataSetSize = dataSet.shape[0]
    # Broadcasting subtracts inX from every row; no explicit tiling needed.
    diffMat = dataSet - inX
    sqDistances = (diffMat ** 2).sum(axis=1)
    distances = sqDistances ** 0.5
    sortedDistIndicies = distances.argsort()

    # Never ask for more voters than there are training samples.
    voters = k if k < dataSetSize else dataSetSize
    classCount = {}
    for i in range(voters):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # items() exists on both Python 2 and Python 3 (iteritems() does not).
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1),
                              reverse=True)
    return sortedClassCount[0][0]


2) 在1)的基础上,进行2)的内容,可以修改参数
  


  


--------------------------

3、 kNN.py 准备数据:从文本文件中解析数据

--------------------------

# 将文本记录转换为NumPy的解析程序
def file2matrix(filename):
    """Parse a tab-separated text file into a feature matrix and label list.

    Every non-blank line is split on tabs; its first three fields are stored
    as one row of features and its last field is parsed as an integer class
    label. Blank lines are skipped.

    Args:
        filename: Path of the text file to read.

    Returns:
        tuple: ``(returnMat, classLabelVector)`` where ``returnMat`` is an
        N x 3 NumPy float array and ``classLabelVector`` is a list of N ints.
    """
    # Read the file once and make sure the handle is closed afterwards.
    with open(filename, 'r') as fr:
        records = [line.strip() for line in fr]
    records = [record for record in records if record]

    returnMat = zeros((len(records), 3))  # one row per non-blank line
    classLabelVector = []
    for index, record in enumerate(records):
        listFromLine = record.split('\t')
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector


datingTestSet2.txt文件可以从(博客园)文件下载



--------------------------

4、 kNN.py 使用Matplotlib创建散点图

--------------------------



datingTestSet2.txt 文件的数据通过matplotlib,图形化的表现出来



--------------------------

5、 kNN.py 使用Matplotlib创建散点图  表示不同属性的点,使用不同颜色进行表示

    警告:import os

       from numpy import *

       这两句必须加上,否则会报如下错误提示信息:

        Traceback (most recent call last):
          File "<stdin>", line 1, in <module>
           NameError: name 'array' is not defined

--------------------------



让点出现颜色划分的关键代码是:

# Plot column 1 against column 2 of datingDataMat. The 3rd and 4th positional
# arguments are both the class labels scaled by 15.0.
# NOTE(review): assuming ax is a matplotlib Axes, those two arguments are the
# marker size and the marker colour, so each class is drawn differently - confirm.
ax.scatter(datingDataMat[:,1],datingDataMat[:,2],15.0*array(datingLabels),15.0*array(datingLabels))



--------------------------

6、 kNN.py  归一化特征值

--------------------------

# Normalise feature values
def autoNorm(dataSet):
    """Rescale every column of ``dataSet`` linearly onto the range [0, 1].

    Each value becomes ``(value - column_min) / (column_max - column_min)``.
    A column whose values are all equal has a range of zero; it is mapped
    to 0 instead of producing NaN from a division by zero.

    Args:
        dataSet: 2-D NumPy array, one sample per row.

    Returns:
        tuple: ``(normDataSet, ranges, minVals)`` - the normalised array,
        the per-column ranges (max - min) and the per-column minima.
    """
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # Divide constant columns by 1 so they normalise to 0 rather than NaN.
    divisors = where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column min/divisor to every row.
    normDataSet = (dataSet - minVals) / divisors
    return normDataSet, ranges, minVals




--------------------------

7、 kNN.py   分类器针对约会网站的测试代码

--------------------------

# 分类器针对约会网站的测试代码
def datingClassTest():
    """Hold-out test of the kNN classifier on the dating-site data.

    Loads 'datingTestSet2.txt', normalises the features, classifies the
    first 10% of the rows against the remaining 90% with k=3, prints every
    prediction next to the true label and finally prints the error rate.
    """
    hoRatio = 0.10  # fraction of rows held out for testing
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :], normMat[numTestVecs:m, :],
                                     datingLabels[numTestVecs:m], 3)
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("the classifier came back with: %d, the real answer is : %d" % (classifierResult, datingLabels[i]))

        if classifierResult != datingLabels[i]:
            errorCount += 1.0

    # With fewer than 10 rows there are no test vectors; avoid dividing by zero.
    errorRate = errorCount / float(numTestVecs) if numTestVecs else 0.0
    print("the total error rate is : %f " % errorRate)




--------------------------

8、 kNN.py   使用算法:构建完整可用系统

--------------------------

#! /usr/bin/env python
# -*- coding: gbk -*-        用来解决中文乱码的注解

from numpy import *
import operator

# 运算符模块
def createDataSet():
    """Build the tiny four-point demo data set used to try out kNN.

    Returns:
        tuple: ``(group, labels)`` where ``group`` is a 4x2 NumPy array of
        sample points and ``labels`` is a list with the class label
        ('A' or 'B') of each row of ``group``.
    """
    group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels

# k-近邻算法
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its k nearest training rows.

    Distances are Euclidean, computed between ``inX`` and every row of
    ``dataSet``.

    Args:
        inX: Feature vector to classify; it must broadcast against the rows
            of ``dataSet`` (same number of features).
        dataSet: 2-D NumPy array of training samples, one sample per row.
        labels: Sequence of class labels; ``labels[i]`` is the label of
            ``dataSet[i]``.
        k: Number of nearest neighbours that vote. A value larger than the
            number of training rows is clamped to that number.

    Returns:
        The label that received the most votes among the nearest neighbours.
    """
    dataSetSize = dataSet.shape[0]
    # Broadcasting subtracts inX from every row; no explicit tiling needed.
    diffMat = dataSet - inX
    sqDistances = (diffMat ** 2).sum(axis=1)
    distances = sqDistances ** 0.5
    sortedDistIndicies = distances.argsort()

    # Never ask for more voters than there are training samples.
    voters = k if k < dataSetSize else dataSetSize
    classCount = {}
    for i in range(voters):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # items() exists on both Python 2 and Python 3 (iteritems() does not).
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1),
                              reverse=True)
    return sortedClassCount[0][0]

# 将文本记录转换为NumPy的解析程序
def file2matrix(filename):
    """Parse a tab-separated text file into a feature matrix and label list.

    Every non-blank line is split on tabs; its first three fields are stored
    as one row of features and its last field is parsed as an integer class
    label. Blank lines are skipped.

    Args:
        filename: Path of the text file to read.

    Returns:
        tuple: ``(returnMat, classLabelVector)`` where ``returnMat`` is an
        N x 3 NumPy float array and ``classLabelVector`` is a list of N ints.
    """
    # Read the file once and make sure the handle is closed afterwards.
    with open(filename, 'r') as fr:
        records = [line.strip() for line in fr]
    records = [record for record in records if record]

    returnMat = zeros((len(records), 3))  # one row per non-blank line
    classLabelVector = []
    for index, record in enumerate(records):
        listFromLine = record.split('\t')
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector

# 归一化特征值
def autoNorm(dataSet):
    """Rescale every column of ``dataSet`` linearly onto the range [0, 1].

    Each value becomes ``(value - column_min) / (column_max - column_min)``.
    A column whose values are all equal has a range of zero; it is mapped
    to 0 instead of producing NaN from a division by zero.

    Args:
        dataSet: 2-D NumPy array, one sample per row.

    Returns:
        tuple: ``(normDataSet, ranges, minVals)`` - the normalised array,
        the per-column ranges (max - min) and the per-column minima.
    """
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # Divide constant columns by 1 so they normalise to 0 rather than NaN.
    divisors = where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column min/divisor to every row.
    normDataSet = (dataSet - minVals) / divisors
    return normDataSet, ranges, minVals

# 分类器针对约会网站的测试代码
def datingClassTest():
    """Hold-out test of the kNN classifier on the dating-site data.

    Loads 'datingTestSet2.txt', normalises the features, classifies the
    first 10% of the rows against the remaining 90% with k=3, prints every
    prediction next to the true label and finally prints the error rate.
    """
    hoRatio = 0.10  # fraction of rows held out for testing
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :], normMat[numTestVecs:m, :],
                                     datingLabels[numTestVecs:m], 3)
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("分类器的回复是:%d,真正的答案是:%d" % (classifierResult, datingLabels[i]))

        if classifierResult != datingLabels[i]:
            errorCount += 1.0

    # With fewer than 10 rows there are no test vectors; avoid dividing by zero.
    errorRate = errorCount / float(numTestVecs) if numTestVecs else 0.0
    print("总误差率  : %f " % errorRate)

# 约会网预测函数
def classifyPerson():
    """Interactively predict how much the user would like a person.

    Prompts for three feature values, normalises them with the minima and
    ranges of the data in 'datingTestSet2.txt', classifies them with k=3
    and prints the matching entry of the result list.
    """
    resultList = ['完全没有兴趣', '有一点吧', '特别感兴趣']

    # raw_input only exists on Python 2; Python 3 renamed it to input.
    try:
        readLine = raw_input
    except NameError:
        readLine = input

    percentTats = float(readLine("玩电子游戏的时间百分比?"))
    ffMiles = float(readLine("每年的飞行里程数是多少?"))
    iceCream = float(readLine("每年的冰淇淋消费量是多少?"))
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    norMat, ranges, minVals = autoNorm(datingDataMat)
    inArr = array([ffMiles, percentTats, iceCream])
    # Scale the query with the same minima/ranges as the training data.
    classifierResult = classify0((inArr - minVals) / ranges, norMat, datingLabels, 3)
    # Labels are used as 1-based positions into resultList, hence the -1.
    # One formatted argument prints the same text on Python 2 and 3.
    print("你可能会喜欢这个人 : %s" % resultList[classifierResult - 1])




--------------------------

9、 kNN.py   准备数据:将图像转换为测试向量

--------------------------

# 准备数据:将图像转换为测试向量
def img2vector(filename):
    """Flatten a 32x32 text image of digits into a 1x1024 row vector.

    The first 32 characters of each of the first 32 lines are converted
    to integers and stored row after row.

    Args:
        filename: Path of the text file holding the image.

    Returns:
        A NumPy array of shape (1, 1024).
    """
    returnVect = zeros((1, 1024))
    # The with-block closes the file even if a line is malformed.
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            for j in range(32):
                returnVect[0, 32 * i + j] = int(lineStr[j])
    return returnVect




--------------------------

10、 kNN.py   测试算法:使用k-近邻算法识别手写数字

      注意:本文需要使用 from os import listdir

      数据digits.zip 存放在博客园的文件夹中,或者下载《机器学习实战》的源代码,里面有

--------------------------

#! /usr/bin/env python
# -*- coding: gbk -*-

from numpy import *
import operator
from os import listdir

# 测试算法:使用K-近邻算法识别手写数字
def handwritingClassTest():
    """Evaluate the kNN classifier on the handwritten-digit text images.

    Every file in 'trainingDigits' becomes one training row; the digit is
    the part of the file name before the first underscore. Every file in
    'testDigits' is then classified with k=3 and compared with the digit
    encoded in its own file name. Each prediction, the number of errors
    and the error rate are printed.
    """
    hwLabels = []
    trainingFileList = listdir('trainingDigits')
    m = len(trainingFileList)
    trainingMat = zeros((m, 1024))
    for i in range(m):
        fileNameStr = trainingFileList[i]
        fileStr = fileNameStr.split('.')[0]
        hwLabels.append(int(fileStr.split('_')[0]))
        trainingMat[i, :] = img2vector('trainingDigits/%s' % fileNameStr)

    testFileList = listdir('testDigits')
    errorCount = 0.0
    mTest = len(testFileList)
    for i in range(mTest):
        fileNameStr = testFileList[i]
        fileStr = fileNameStr.split('.')[0]
        # True digit of THIS test file. It must be the value compared below,
        # not a label left over from the training loop.
        classNumStr = int(fileStr.split('_')[0])
        vectorUnderTest = img2vector('testDigits/%s' % fileNameStr)
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3)
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("\n分类器的回复是:%d,真正的答案是:%d" % (classifierResult, classNumStr))

        if classifierResult != classNumStr:
            errorCount += 1.0

    print("\t 错误的总数是 : %d " % errorCount)
    # An empty test directory would otherwise divide by zero.
    errorRate = errorCount / float(mTest) if mTest else 0.0
    print("\t 总误差率是 : %f" % errorRate)




  |

  |

  图片太长,其中截断了,读者可以自行测试看看效果

  |

  |



-------------------------------------------------------------------------------------------------

总结:以上就是机器学习实战的第二章的代码内容,没想到,三月份开始学习的内容,等到六月份才开始能够成功实现,主要是Numpy的安装,太狗了!

  其间,学习安装Numpy与.whl类型的文件,会使用基本的matplotlib。k-近邻算法的模样还没有认清楚。接下来,进行决策树,过一段时间就可以

  认识k-近邻算法了吧
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: