用knn算法来预测喜爱程度
2016-08-29 20:17
309 查看
能根据对方的一些特征判断他(她)对你的吸引程度,是不喜欢,还是一般喜欢,还是很喜欢。以此改进约会配对效果。
1、有一千组数据,前200作为测试数据,后800个作为样本数据,然后训练模型
2、然后把特征变量归一化去增加数据的可靠性,同时调整k值的参数来提高预测的准确度
3、准确度达到一定程度后,然后输入用户数据来进行匹配最优的人
#! /usr/bin/env python
# -*- coding=utf-8 -*-
from numpy import *
import operator
from os import listdir
import matplotlib
import matplotlib.pyplot as plt
import time
import pdb
def classify0(inX, dataSet, labels, k=5):
    """Classify one sample with the k-nearest-neighbours majority vote.

    Parameters
    ----------
    inX : 1-D array of feature values for the sample to classify.
    dataSet : 2-D array, one training sample per row.
    labels : sequence of class labels, one per row of dataSet.
    k : number of nearest neighbours that vote (default 5).

    Returns
    -------
    The label that received the most votes among the k training rows
    closest to inX by Euclidean distance.
    """
    # Broadcasting subtracts inX from every row at once, replacing the
    # original tile() copy.
    diffMat = dataSet - inX
    sqDistances = (diffMat ** 2).sum(axis=1)  # per-row squared distance
    distances = sqDistances ** 0.5
    # Indices of the training rows, nearest first.
    sortedDistIndicies = distances.argsort()
    # Majority vote among the k closest neighbours.
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # items() instead of the Python-2-only iteritems() keeps this running
    # on both Python 2 and 3; sort by vote count, descending.
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
def file2matrix(filename):
    """Parse a tab-separated dating-data file into features and labels.

    Each line must hold three numeric feature columns followed by an
    integer class label.

    Parameters
    ----------
    filename : path of the tab-separated data file.

    Returns
    -------
    (returnMat, classLabelVector) : the N x 3 float feature matrix and
    the list of N integer labels.
    """
    # 'with' guarantees the file is closed even if parsing raises.
    with open(filename) as fr:
        lines = fr.readlines()
    numberOfLines = len(lines)
    returnMat = zeros((numberOfLines, 3))
    classLabelVector = []
    for index, line in enumerate(lines):
        listFromLine = line.strip().split('\t')
        # numpy converts the string fields to float on assignment.
        returnMat[index, :] = listFromLine[0:3]
        classLabelVector.append(int(listFromLine[-1]))
    # The original also dumped returnMat to "file.txt" via file.write(),
    # which raises TypeError (write() needs a string, not an ndarray);
    # that broken debug dump is removed.
    return returnMat, classLabelVector
#特征变量归一化
def autoNorm(dataSet):
    """Scale every feature column of dataSet into [0, 1].

    norm = (value - columnMin) / (columnMax - columnMin)

    Parameters
    ----------
    dataSet : 2-D numeric array, one sample per row.

    Returns
    -------
    (normDataSet, ranges, minVals) : the normalised matrix plus the
    per-column range and minimum needed to normalise new samples the
    same way.
    """
    minVals = dataSet.min(0)  # per-column minimum
    maxVals = dataSet.max(0)  # per-column maximum
    ranges = maxVals - minVals
    # Guard against a constant column: dividing by a zero range would
    # fill that column with NaNs, so divide by 1 there instead (all its
    # values normalise to 0). The true ranges are still returned.
    safeRanges = where(ranges == 0, 1, ranges)
    # Broadcasting handles the row-wise subtract/divide; no tile() copies.
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals
#分类器测试
def datingClassTest(hoRatio=0.20):
    """Estimate the classifier's error rate by hold-out validation.

    The first hoRatio fraction of 'datingTestSet2.txt' serves as the
    test set; the remaining rows are the training set. Prints one line
    per test sample and a final error-rate summary.

    Parameters
    ----------
    hoRatio : fraction of the data held out for testing (default 0.20,
        i.e. the first 200 of 1000 rows). The original comment said
        "hold out 10%", which contradicted this default.
    """
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    # Normalise so no single feature dominates the Euclidean distance.
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    errorCount = 0.0
    for i in range(numTestVecs):
        # Classify test row i against only the training rows.
        classifierResult = classify0(normMat[i, :], normMat[numTestVecs:m, :], datingLabels[numTestVecs:m], 5)
        print("the classifier came back with: %d, the real answer is: %d" % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print("the total error rate is: %.2f%%" % (100 * errorCount / float(numTestVecs)))
    print('testcount is %s, errorCount is %s' % (numTestVecs, errorCount))
def classifyPerson():
    """Interactively classify one person and append them to the data file.

    Prompts for the three features, normalises them with the statistics
    of the existing data set, predicts the like-level with kNN (k=3),
    prints the verdict, then appends the raw (un-normalised) answers
    plus the predicted label to 'datingTestSet2.txt'.
    """
    resultlist = ['not at all', 'little doses', 'large doses']
    # Prompt typo fixed: "person'" -> "person's".
    percentTats = float(raw_input("input the person's percentage of time playing video games:"))
    ffMiles = float(raw_input('flier miles in a year:'))
    iceCream = float(raw_input('amount of iceCream consumed per year:'))
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    # Feature order must match the file's columns: miles, game-time %,
    # ice cream -- NOT the prompt order above.
    normPerson = (array([ffMiles, percentTats, iceCream]) - minVals) / ranges
    result = classify0(normPerson, normMat, datingLabels, 3)
    # Labels are 1-based; the result list is 0-based.
    print('you will probably like this guy in: %s' % resultlist[result - 1])
    print('update dating DB')
    tmp = '\t'.join([repr(ffMiles), repr(percentTats), repr(iceCream), repr(result)]) + '\n'
    with open('datingTestSet2.txt', 'a') as fr:
        fr.write(tmp)
if __name__ == '__main__':
    # Entry point: first report the hold-out error rate, then classify
    # a new person interactively.
    datingClassTest()
    classifyPerson()
    # handwritingClassTest()
1、有一千组数据,前200作为测试数据,后800个作为样本数据,然后训练模型
2、然后把特征变量归一化去增加数据的可靠性,同时调整k值的参数来提高预测的准确度
3、准确度达到一定程度后,然后输入用户数据来进行匹配最优的人
#! /usr/bin/env python
# -*- coding=utf-8 -*-
from numpy import *
import operator
from os import listdir
import matplotlib
import matplotlib.pyplot as plt
import time
import pdb
def classify0(inX, dataSet, labels, k=5):
    """Classify one sample with the k-nearest-neighbours majority vote.

    Parameters
    ----------
    inX : 1-D array of feature values for the sample to classify.
    dataSet : 2-D array, one training sample per row.
    labels : sequence of class labels, one per row of dataSet.
    k : number of nearest neighbours that vote (default 5).

    Returns
    -------
    The label that received the most votes among the k training rows
    closest to inX by Euclidean distance.
    """
    # Broadcasting subtracts inX from every row at once, replacing the
    # original tile() copy.
    diffMat = dataSet - inX
    sqDistances = (diffMat ** 2).sum(axis=1)  # per-row squared distance
    distances = sqDistances ** 0.5
    # Indices of the training rows, nearest first.
    sortedDistIndicies = distances.argsort()
    # Majority vote among the k closest neighbours.
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # items() instead of the Python-2-only iteritems() keeps this running
    # on both Python 2 and 3; sort by vote count, descending.
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
def file2matrix(filename):
    """Parse a tab-separated dating-data file into features and labels.

    Each line must hold three numeric feature columns followed by an
    integer class label.

    Parameters
    ----------
    filename : path of the tab-separated data file.

    Returns
    -------
    (returnMat, classLabelVector) : the N x 3 float feature matrix and
    the list of N integer labels.
    """
    # 'with' guarantees the file is closed even if parsing raises.
    with open(filename) as fr:
        lines = fr.readlines()
    numberOfLines = len(lines)
    returnMat = zeros((numberOfLines, 3))
    classLabelVector = []
    for index, line in enumerate(lines):
        listFromLine = line.strip().split('\t')
        # numpy converts the string fields to float on assignment.
        returnMat[index, :] = listFromLine[0:3]
        classLabelVector.append(int(listFromLine[-1]))
    # The original also dumped returnMat to "file.txt" via file.write(),
    # which raises TypeError (write() needs a string, not an ndarray);
    # that broken debug dump is removed.
    return returnMat, classLabelVector
#特征变量归一化
def autoNorm(dataSet):
    """Scale every feature column of dataSet into [0, 1].

    norm = (value - columnMin) / (columnMax - columnMin)

    Parameters
    ----------
    dataSet : 2-D numeric array, one sample per row.

    Returns
    -------
    (normDataSet, ranges, minVals) : the normalised matrix plus the
    per-column range and minimum needed to normalise new samples the
    same way.
    """
    minVals = dataSet.min(0)  # per-column minimum
    maxVals = dataSet.max(0)  # per-column maximum
    ranges = maxVals - minVals
    # Guard against a constant column: dividing by a zero range would
    # fill that column with NaNs, so divide by 1 there instead (all its
    # values normalise to 0). The true ranges are still returned.
    safeRanges = where(ranges == 0, 1, ranges)
    # Broadcasting handles the row-wise subtract/divide; no tile() copies.
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals
#分类器测试
def datingClassTest(hoRatio=0.20):
    """Estimate the classifier's error rate by hold-out validation.

    The first hoRatio fraction of 'datingTestSet2.txt' serves as the
    test set; the remaining rows are the training set. Prints one line
    per test sample and a final error-rate summary.

    Parameters
    ----------
    hoRatio : fraction of the data held out for testing (default 0.20,
        i.e. the first 200 of 1000 rows). The original comment said
        "hold out 10%", which contradicted this default.
    """
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    # Normalise so no single feature dominates the Euclidean distance.
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    errorCount = 0.0
    for i in range(numTestVecs):
        # Classify test row i against only the training rows.
        classifierResult = classify0(normMat[i, :], normMat[numTestVecs:m, :], datingLabels[numTestVecs:m], 5)
        print("the classifier came back with: %d, the real answer is: %d" % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print("the total error rate is: %.2f%%" % (100 * errorCount / float(numTestVecs)))
    print('testcount is %s, errorCount is %s' % (numTestVecs, errorCount))
def classifyPerson():
    """Interactively classify one person and append them to the data file.

    Prompts for the three features, normalises them with the statistics
    of the existing data set, predicts the like-level with kNN (k=3),
    prints the verdict, then appends the raw (un-normalised) answers
    plus the predicted label to 'datingTestSet2.txt'.
    """
    resultlist = ['not at all', 'little doses', 'large doses']
    # Prompt typo fixed: "person'" -> "person's".
    percentTats = float(raw_input("input the person's percentage of time playing video games:"))
    ffMiles = float(raw_input('flier miles in a year:'))
    iceCream = float(raw_input('amount of iceCream consumed per year:'))
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    # Feature order must match the file's columns: miles, game-time %,
    # ice cream -- NOT the prompt order above.
    normPerson = (array([ffMiles, percentTats, iceCream]) - minVals) / ranges
    result = classify0(normPerson, normMat, datingLabels, 3)
    # Labels are 1-based; the result list is 0-based.
    print('you will probably like this guy in: %s' % resultlist[result - 1])
    print('update dating DB')
    tmp = '\t'.join([repr(ffMiles), repr(percentTats), repr(iceCream), repr(result)]) + '\n'
    with open('datingTestSet2.txt', 'a') as fr:
        fr.write(tmp)
if __name__ == '__main__':
    # Entry point: first report the hold-out error rate, then classify
    # a new person interactively.
    datingClassTest()
    classifyPerson()
    # handwritingClassTest()
相关文章推荐
- 机器学习分类之结合实际应用介绍KNN算法原理以及利用sklearn进行分类预测
- KNN算法预测泰坦尼克号乘客生存率
- 使用kNN算法预测价格
- K邻近(KNN)分类和预测算法
- K邻近(KNN)分类和预测算法的原理及实现
- [机器学习篇]基于Scikit learn库中KNN,SVM算法的笔迹识别
- Knn算法实现
- KNN算法简述
- kNN:k-nearest neighbor classification(K最近邻分类算法,KNN)
- [置顶] KNN及其改进算法的python实现
- 常见的预测算法
- 朴素贝叶斯分类和预测算法的原理及实现
- KNN-临近算法
- 推荐/预测算法
- KNN算法介绍
- KNN分类算法及其改进
- K-近邻算法(kNN)
- 《机器学习实战》学习笔记二:kNN算法实例 -- 手写体识别
- ML算法-kNN
- 数据挖掘学习札记:KNN算法(一)