
Clustering Algorithms (Hierarchical Clustering, K-Means)

2013-08-01 00:55
#!/usr/bin/python
import sys
sys.path.append('..')
import modules.similarity as sim # http://hi.baidu.com/bithigher/item/6ca6844ac29d0128fb896026
import random
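
# NOTE: the similarity module imported above is not included in the post. As an
# assumption (not the original modules.similarity code), a Pearson similarity
# with the call signature used below, sim.pearson(v1, v2), returning a value in
# [-1, 1] so that (1 - similarity) can serve as a distance, might look like the
# sketch below. It is given only for reference and is not used directly.
def pearson(v1, v2):
    n = float(len(v1))
    sum1, sum2 = sum(v1), sum(v2)
    sumSq1, sumSq2 = sum(x * x for x in v1), sum(x * x for x in v2)
    pSum = sum(v1[i] * v2[i] for i in range(len(v1)))
    num = pSum - sum1 * sum2 / n
    den = ((sumSq1 - sum1 * sum1 / n) * (sumSq2 - sum2 * sum2 / n)) ** 0.5
    if den == 0:
        return 0
    return num / den
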
words = []
blogs = []
data = []
def getData(file='blogdata.txt'):
    global words
    f = open(file)
    line = f.readline()
    words = line.strip().split('\t')[1:]   # header row: the word list

    for line in f:
        p = line.strip().split('\t')
        blogs.append(p[0])                        # first column: blog name
        data.append([float(x) for x in p[1:]])    # remaining columns: word counts
    f.close()

def hcluster(similarity=sim.pearson):
    global data
    clusters = [bicluster(data[i], id=i) for i in range(len(data))]

    currentId = -1  # negative ids mark merged (internal) biclusters
    dis = {}        # cache of distances computed before, keyed by (id, id)
    while len(clusters) > 1:
        closestIndex = (0, 1)
        closestKey = (clusters[0].id, clusters[1].id)
        if closestKey not in dis:
            dis[closestKey] = 1 - similarity(clusters[0].vec, clusters[1].vec)
        closest = dis[closestKey]

        # find the closest pair of clusters
        for i in range(len(clusters)):
            for j in range(i + 1, len(clusters)):
                key = (clusters[i].id, clusters[j].id)
                if key not in dis:
                    dis[key] = 1 - similarity(clusters[i].vec, clusters[j].vec)
                if dis[key] < closest:
                    closest = dis[key]
                    closestKey = key
                    closestIndex = (i, j)

        # merge the closest pair into a new bicluster whose vector is the
        # element-wise average of the pair, then delete the pair
        mergedVec = [(clusters[closestIndex[0]].vec[i] + clusters[closestIndex[1]].vec[i]) / 2.0
                     for i in range(len(clusters[0].vec))]
        clusters.append(bicluster(mergedVec,
                                  clusters[closestIndex[0]],
                                  clusters[closestIndex[1]],
                                  closest,
                                  currentId))
        currentId -= 1
        del dis[closestKey]

        # ATTENTION: remove the later index first; removing the former first
        # would shift the index of the latter
        del clusters[closestIndex[1]]
        del clusters[closestIndex[0]]
        print closestIndex, closestKey  # trace each merge

    return clusters[0]

def rotateData():
    # transpose data in place (rows become columns), e.g. so that words
    # (columns) can be clustered instead of blogs (rows)
    global data
    newData = []
    for i in range(len(data[0])):
        newData.append([data[j][i] for j in range(len(data))])

    data = newData

def printCluster(cluster, labels, n=0):
    if cluster is None: return
    for i in range(n):
        print ' ',                     # indent by depth in the tree
    if cluster.id >= 0:
        print '-', labels[cluster.id]  # leaf: an original row, print its label
    else:
        print '+', cluster.id          # internal node produced by a merge
    printCluster(cluster.left, labels, n + 1)
    printCluster(cluster.right, labels, n + 1)

class bicluster:
    def __init__(self, vec, left=None, right=None, distance=0, id=None):
        self.vec = vec
        self.left = left
        self.right = right
        self.distance = distance
        self.id = id

def kmeans(k=5, iterations=100, similarity=sim.pearson):
    global data
    if len(data) <= k: return data   # fewer rows than clusters: nothing to split

    clusters = [[] for i in range(k)]
    length = len(data[0])
    ranges = [(min(row[i] for row in data), max(row[i] for row in data))
              for i in range(length)]
    # k random centers, each coordinate drawn from its column's range
    centers = [[random.random() * (ranges[i][1] - ranges[i][0]) + ranges[i][0]
                for i in range(length)]
               for j in range(k)]

    # cap the number of iterations in case the loop never converges
    while iterations > 0:
        iterations -= 1
        clusters = [[] for i in range(k)]
        newCenters = [[0 for i in range(length)] for j in range(k)]
        for i in range(len(data)):
            # assign row i to the nearest center (distance = 1 - similarity)
            minVal = 1 - similarity(data[i], centers[0])
            minIndex = 0
            for j in range(1, k):
                curMin = 1 - similarity(data[i], centers[j])
                if curMin < minVal:
                    minVal = curMin
                    minIndex = j

            clusters[minIndex].append(i)
            newCenters[minIndex] = [newCenters[minIndex][j] + data[i][j] for j in range(length)]

        # move each center to the mean of its assigned rows
        for i in range(k):
            num = float(len(clusters[i]))
            if num > 0:
                newCenters[i] = [newCenters[i][j] / num for j in range(length)]

        if newCenters == centers:   # converged: assignments will not change
            return clusters
        centers = newCenters

    return clusters   # iteration limit reached without full convergence
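
Usage is not shown in the original post. As a sketch only, assuming a tab-separated blogdata.txt (a header row of words, then one row per blog, as getData expects) sits in the working directory, the functions above could be driven like this:

if __name__ == '__main__':
    getData('blogdata.txt')              # fill words, blogs, data

    # hierarchical clustering of the blogs, printed as an indented tree
    root = hcluster()
    printCluster(root, blogs)

    # K-Means on the same rows; each cluster is a list of row indices
    groups = kmeans(k=5)
    for i in range(len(groups)):
        print i, [blogs[x] for x in groups[i]]

    # to cluster words instead of blogs:
    # rotateData(); printCluster(hcluster(), words)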