K-Means算法
2016-07-20 09:03
169 查看
from numpy import *  # retained: other code in this file may rely on the star import
import numpy as np


def loadDataSet(fileName):
    """Read a tab-separated numeric text file into a list of float rows.

    Args:
        fileName: Path of the text file; every non-blank line holds
            tab-separated numbers.

    Returns:
        A list with one list of floats per non-blank line.

    Raises:
        ValueError: If a field cannot be converted to float.
    """
    dataSet = []
    # The context manager closes the file even if a conversion fails.
    with open(fileName) as fr:
        for line in fr:
            line = line.strip()
            if not line:
                # A blank line would otherwise fail on float('').
                continue
            # Build a real list: on Python 3 map() returns a lazy iterator.
            dataSet.append([float(field) for field in line.split('\t')])
    return dataSet


def distEclud(vecA, vecB):
    """Return the Euclidean distance between two equally shaped vectors."""
    return np.sqrt(np.sum(np.power(vecA - vecB, 2)))


def randCent(dataSet, k):
    """Create k random centroids inside the bounding box of dataSet.

    Args:
        dataSet: 2-D data, one sample per row (matrix, array or nested list).
        k: Number of centroids to generate.

    Returns:
        A k x n matrix; column j is drawn uniformly between the minimum
        and maximum of column j of dataSet.
    """
    dataSet = np.asmatrix(dataSet)
    n = np.shape(dataSet)[1]
    centroids = np.asmatrix(np.zeros((k, n)))
    for j in range(n):
        # Minimum and range must both come from column j itself.
        minJ = dataSet[:, j].min()
        rangeJ = float(dataSet[:, j].max() - minJ)
        centroids[:, j] = minJ + rangeJ * np.random.rand(k, 1)
    return centroids


def kMeans(dataSet, k, distMeas=distEclud, createCent=randCent):
    """Cluster dataSet into k groups with the standard k-means iteration.

    Args:
        dataSet: 2-D data, one sample per row (matrix, array or nested list).
        k: Number of clusters.
        distMeas: Callable(vecA, vecB) returning the distance between two rows.
        createCent: Callable(dataSet, k) returning the initial k x n
            centroid matrix.

    Returns:
        A tuple (centroids, clusterAssment). centroids is a k x n matrix.
        clusterAssment is an m x 2 matrix whose first column is the index of
        the assigned centroid and whose second column is the squared distance
        to that centroid.
    """
    dataSet = np.asmatrix(dataSet)
    m = np.shape(dataSet)[0]
    clusterAssment = np.asmatrix(np.zeros((m, 2)))
    # Start from an impossible label so that a first pass which puts points
    # into cluster 0 still counts as a change and triggers another pass.
    clusterAssment[:, 0] = -1
    centroids = createCent(dataSet, k)
    clusterChanged = True
    while clusterChanged:
        clusterChanged = False
        # Assignment step: attach every point to its nearest centroid.
        for i in range(m):
            minDist = np.inf
            minIndex = -1
            for j in range(k):
                distJI = distMeas(centroids[j, :], dataSet[i, :])
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            if clusterAssment[i, 0] != minIndex:
                clusterChanged = True
            clusterAssment[i, :] = minIndex, minDist ** 2
        # Update step: move every centroid to the mean of its members.
        for cent in range(k):
            members = np.nonzero(clusterAssment[:, 0].A == cent)[0]
            ptsInClust = dataSet[members]
            if ptsInClust.shape[0] > 0:
                # axis=0 yields a per-column mean; without it the whole
                # cluster collapses to a single scalar.
                centroids[cent, :] = np.mean(ptsInClust, axis=0)
            # An empty cluster keeps its previous centroid instead of NaN.
    return centroids, clusterAssment


def biKmeans(dataSet, k, distMeans=distEclud):
    """Cluster dataSet into k groups with bisecting k-means.

    All points start in one cluster. Until k clusters exist, every current
    cluster is tentatively split in two with kMeans and the split giving the
    lowest total squared error is kept.

    Args:
        dataSet: 2-D data, one sample per row (matrix, array or nested list).
        k: Target number of clusters.
        distMeans: Callable(vecA, vecB) returning the distance between two
            rows.

    Returns:
        A tuple (centroids, clusterAssment) laid out as in kMeans.

    Raises:
        ValueError: If no existing cluster contains a point to split.
    """
    dataSet = np.asmatrix(dataSet)
    m = np.shape(dataSet)[0]
    clusterAssment = np.asmatrix(np.zeros((m, 2)))
    # The initial single cluster is centred on the mean of all points.
    centroid0 = np.mean(dataSet, axis=0).tolist()[0]
    centList = [centroid0]
    for j in range(m):
        clusterAssment[j, 1] = distMeans(np.asmatrix(centroid0),
                                         dataSet[j, :]) ** 2
    while len(centList) < k:
        lowestSSE = np.inf
        bestCentToSplit = None
        for i in range(len(centList)):
            inCluster = np.nonzero(clusterAssment[:, 0].A == i)[0]
            ptsInCurrCluster = dataSet[inCluster, :]
            if ptsInCurrCluster.shape[0] == 0:
                # An empty cluster cannot be split.
                continue
            centroidMat, splitClustAss = kMeans(ptsInCurrCluster, 2, distMeans)
            sseSplit = np.sum(splitClustAss[:, 1])
            notInCluster = np.nonzero(clusterAssment[:, 0].A != i)[0]
            sseNotSplit = np.sum(clusterAssment[notInCluster, 1])
            # Single-argument print() works on both Python 2 and Python 3.
            print('sseSplit, and notSplit:  %s %s' % (sseSplit, sseNotSplit))
            if (sseSplit + sseNotSplit) < lowestSSE:
                bestCentToSplit = i
                bestNewCents = centroidMat
                bestClustAss = splitClustAss.copy()
                lowestSSE = sseSplit + sseNotSplit
        if bestCentToSplit is None:
            raise ValueError('biKmeans: no cluster can be split further')
        # Relabel the two halves: half 1 becomes a brand-new cluster, half 0
        # keeps the index of the cluster that was split. Half 1 is relabelled
        # first so its new index cannot be confused with half 0.
        bestClustAss[np.nonzero(bestClustAss[:, 0].A == 1)[0], 0] = len(centList)
        bestClustAss[np.nonzero(bestClustAss[:, 0].A == 0)[0], 0] = bestCentToSplit
        print('the bestCentToSplit is:  %s' % bestCentToSplit)
        print('the len of bestClustAss is:  %s' % len(bestClustAss))
        centList[bestCentToSplit] = bestNewCents[0, :].tolist()[0]
        centList.append(bestNewCents[1, :].tolist()[0])
        # Overwrite the rows of the split cluster with their new assignments.
        splitRows = np.nonzero(clusterAssment[:, 0].A == bestCentToSplit)[0]
        clusterAssment[splitRows, :] = bestClustAss
    return np.asmatrix(centList), clusterAssment
相关文章推荐
- Apriori算法
- Hadoop 2环境配置
- AngularJS与Velocity一起发生奇怪的错误(挖坑)
- Tomcat webapps目录的ROOT
- Java 动态代理
- cvFindContours
- Servlet生命周期
- malloc calloc realloc 作用、用法、区别、实现原理
- 一维数组
- What is Vertical Align?
- Wifi小车之 - 无线路由挂摄像头篇
- python学习——协程
- DOM 文档对象模型
- 守护进程
- F - The MAX(sort)
- HDU 5011-game -nim游戏变种
- hdu 1384逆序数 暴力法
- quartz定时器demo类
- javascript特效实现――当前时间和倒计时效果的简单实例
- What is Vertical Align?