您的位置:首页 > 编程语言

集体智慧编程:第三章 聚类

2014-03-03 17:11 507 查看
1.分层聚类

后边代码有些不是自己写的。

__author__='li'
from math import sqrt
def readfiles(filenames):
    """Load a tab-separated numeric matrix with a header row and a label column.

    The first line supplies the column names (its first cell is ignored).
    Every following non-blank line supplies a row name in its first cell and
    float values in the remaining cells.

    :param filenames: path of the tab-separated file to read.
    :return: tuple ``(rownames, colnames, data)`` where ``data`` is a list of
        lists of floats, one inner list per row name.
    :raises IndexError: if the file contains no lines at all.
    :raises ValueError: if a data cell cannot be converted to ``float``.
    """
    # The context manager guarantees the handle is closed even when parsing fails.
    with open(filenames) as f1:
        lines = f1.readlines()
    colnames = lines[0].strip().split('\t')[1:]
    rownames = []
    data = []
    for line in lines[1:]:
        stripped = line.strip()
        if not stripped:
            # Blank lines (e.g. trailing newlines) would otherwise yield a row
            # with an empty name and an empty vector.
            continue
        p = stripped.split('\t')
        rownames.append(p[0])
        data.append([float(item) for item in p[1:]])
    return rownames, colnames, data

def pearson(v1,v2):
    """Return the Pearson correlation distance ``1 - r`` between two vectors.

    The result is 0.0 for perfectly correlated vectors and 2.0 for perfectly
    anti-correlated ones.

    :param v1: sequence of numbers.
    :param v2: sequence of numbers, indexed over ``range(len(v1))``.
    :return: ``1.0 - r``; ``0`` when the correlation is undefined (empty
        input, or either vector has no variance).
    """
    # A float length keeps every division true division, even for integer
    # inputs under Python 2.
    n = float(len(v1))
    if n == 0:
        return 0

    sum1 = sum(v1)
    sum2 = sum(v2)
    sum1Sq = sum(item * item for item in v1)
    sum2Sq = sum(item * item for item in v2)
    pSum = sum(v1[i] * v2[i] for i in range(len(v1)))

    num = pSum - sum1 * sum2 / n
    var1 = sum1Sq - sum1 * sum1 / n
    var2 = sum2Sq - sum2 * sum2 / n
    # Rounding can push the variance of a constant vector slightly below
    # zero; treat that the same as exactly zero rather than letting sqrt()
    # raise or dividing by a meaningless value.
    if var1 <= 0 or var2 <= 0:
        return 0
    return 1.0 - num / sqrt(var1 * var2)
class bicluster(object):
    """Node of a binary clustering tree.

    :param vec: the vector stored at this node.
    :param left: first child node, or ``None``.
    :param right: second child node, or ``None``.
    :param distance: distance value recorded for this node (default 0.0).
    :param id: identifier of the node, or ``None``.
    """

    def __init__(self, vec, left=None, right=None, distance = 0.0,id = None):
        self.vec = vec
        self.left = left
        self.right = right
        self.distance = distance
        self.id = id

    def __repr__(self):
        # Children are left out so printing the root does not dump the whole tree.
        return 'bicluster(id=%r, distance=%r)' % (self.id, self.distance)
def hcluster(rows, distance=pearson):
    """Build a binary cluster tree by repeatedly merging the closest pair.

    Each row starts as its own ``bicluster`` whose id is its index in
    ``rows``. On every pass the pair with the strictly smallest distance is
    replaced by a new node whose vector is the element-wise mean of the two;
    merged nodes get ids -1, -2, ... in creation order.

    :param rows: list of equal-length numeric vectors.
    :param distance: callable ``distance(vec_a, vec_b)``; smaller is closer.
    :return: the single remaining ``bicluster`` (the root of the tree).
    """
    cache = {}      # (id_a, id_b) -> distance, so each pair is measured once
    next_id = -1
    nodes = [bicluster(vec, id=idx) for idx, vec in enumerate(rows)]

    while len(nodes) > 1:
        # Seed the search with the first pair; later pairs must beat it strictly.
        best_a, best_b = 0, 1
        best_d = distance(nodes[0].vec, nodes[1].vec)

        for a, first in enumerate(nodes):
            for b in range(a + 1, len(nodes)):
                second = nodes[b]
                key = (first.id, second.id)
                if key not in cache:
                    cache[key] = distance(first.vec, second.vec)
                candidate = cache[key]
                if candidate < best_d:
                    best_d = candidate
                    best_a, best_b = a, b

        near, far = nodes[best_a], nodes[best_b]
        width = len(nodes[0].vec)
        merged = [(near.vec[k] + far.vec[k]) / 2.0 for k in range(width)]

        # The higher-indexed member becomes the left child, the lower the right.
        parent = bicluster(merged, left=far, right=near,
                           distance=best_d, id=next_id)
        next_id -= 1

        # Remove the higher index first so the lower index stays valid.
        del nodes[best_b]
        del nodes[best_a]
        nodes.append(parent)

    return nodes[0]

def printclust(clust,labels=None,n=0):
    """Print a cluster tree to stdout, one node per line, indented by depth.

    Nodes with a negative id are printed as ``-``; other nodes are printed as
    their id, or as ``labels[id]`` when ``labels`` is given. Each depth level
    adds two spaces of indentation.

    :param clust: node exposing ``id``, ``left`` and ``right`` attributes.
    :param labels: optional sequence indexed by node id.
    :param n: depth of ``clust`` in the tree (0 for the root).
    """
    if clust.id < 0:
        # negative id means that this is a branch
        text = '-'
    elif labels is None:
        # non-negative id means that this is an endpoint
        text = clust.id
    else:
        text = labels[clust.id]

    # A single-argument print(...) call behaves the same on Python 2 and 3.
    print('%s%s' % (' ' * (2 * n), text))

    # now print the left and then the right branch
    if clust.left is not None:
        printclust(clust.left, labels=labels, n=n + 1)
    if clust.right is not None:
        printclust(clust.right, labels=labels, n=n + 1)

if __name__ == '__main__':
    # Load the tab-separated matrix, cluster its rows hierarchically and
    # print the resulting tree using row indices as labels.
    rownames, colnames, data = readfiles('blogdata.txt')
    cluster = hcluster(rows=data)
    printclust(clust=cluster)


2.K-均值聚类

import random

def kCluster(rows,distance = pearson,k = 5):
    """Partition rows into ``k`` groups with k-means.

    Centroids start at random positions inside the per-column value range of
    ``rows``. Each pass assigns every row to its nearest centroid and then
    moves each non-empty centroid to the mean of its rows. The loop stops
    when the assignment repeats or after 100 passes.

    :param rows: non-empty list of equal-length numeric vectors.
    :param distance: callable ``distance(centroid, row)``; smaller is closer.
    :param k: number of clusters.
    :return: list of ``k`` lists; entry ``c`` holds the indices into ``rows``
        assigned to centroid ``c`` on the last pass.
    """
    dims = len(rows[0])

    # Per-column (min, max): the outer loop walks columns, the inner one rows.
    ranges = [(min(row[i] for row in rows), max(row[i] for row in rows))
              for i in range(dims)]

    # k random centroids, each coordinate drawn uniformly from its column range.
    clusters = [[random.random() * (hi - lo) + lo for lo, hi in ranges]
                for j in range(k)]

    lastmatches = None
    for t in range(100):
        print("ITERATON:%d" % t)
        bestmatches = [[] for i in range(k)]

        for i, row in enumerate(rows):
            # Track the best distance so far instead of recomputing it for
            # every candidate centroid; ties keep the lowest centroid index.
            best = 0
            best_d = distance(clusters[0], row)
            for j in range(1, k):
                d = distance(clusters[j], row)
                if d < best_d:
                    best = j
                    best_d = d
            bestmatches[best].append(i)

        # Same assignment as the previous pass: converged.
        if bestmatches == lastmatches:
            break
        lastmatches = bestmatches

        # Move every non-empty centroid to the mean of its member rows;
        # centroids with no members stay where they are.
        for i in range(k):
            members = lastmatches[i]
            if members:
                avgs = [0.0] * dims
                for rowid in members:
                    for j in range(dims):
                        avgs[j] += rows[rowid][j]
                clusters[i] = [total / len(members) for total in avgs]

    return bestmatches
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: