您的位置:首页 > 运维架构

聚类算法效果评估:熵(entropy)、纯度(purity)与归一化互信息(NMI)

2012-02-21 13:03 633 查看
1. 数据管理脚本:原始文件每行的格式为 id\tclusterId\tgoldstandardId(三个以制表符分隔的整数字段);脚本按 clusterId 和 goldstandardId 分别对文档 id 分组,并把两个分组结果各写入一个 pickle 文件。

DataManagement.py

#!/usr/bin/python
import cPickle as p;
import sys;
import re;
def group_ids_by_label(lines):
    """Group document ids by cluster id and by gold-standard id.

    Each usable line holds three tab-separated integer fields, in the order
    ``id``, ``clusterId``, ``goldstandardId``. Surrounding whitespace
    (including the line terminator) is stripped first; a line that does not
    split into exactly three fields is skipped.

    Args:
        lines: any iterable of text lines (an open file works).

    Returns:
        A ``(clusters, goldstandards)`` tuple. Each element is a dict that
        maps a label id to the list of document ids carrying that label, in
        input order.

    Raises:
        ValueError: if a three-field line contains a non-integer field.
    """
    clusters = {}
    goldstandards = {}
    for line in lines:
        fields = line.strip().split('\t')
        if len(fields) != 3:
            continue
        doc_id, cluster_id, gold_id = [int(field) for field in fields]
        clusters.setdefault(cluster_id, []).append(doc_id)
        goldstandards.setdefault(gold_id, []).append(doc_id)
    return clusters, goldstandards


if __name__ == "__main__":
    # Usage: <script> SOURCE_FILE CLUSTERS_PICKLE GOLDSTANDARDS_PICKLE
    if len(sys.argv) < 4:
        sys.exit('usage: %s SOURCE_FILE CLUSTERS_PICKLE GOLDSTANDARDS_PICKLE'
                 % sys.argv[0])
    # ``with`` guarantees every handle is closed even when parsing fails.
    with open(sys.argv[1], 'r') as fidsrc:
        clusters, goldstandards = group_ids_by_label(fidsrc)
    # Text mode and the default pickle protocol are kept as in the original
    # script so that existing readers of these files keep working.
    with open(sys.argv[2], 'w') as fidclusters:
        p.dump(clusters, fidclusters)
    with open(sys.argv[3], 'w') as fidgoldstandards:
        p.dump(goldstandards, fidgoldstandards)
    print('%s has finished!' % sys.argv[0])

EvaluationClusterAlgorithm.py

#!/usr/bin/python
#-*- coding:cp936 -*-
import re;
import cPickle as mypickle;
import sys;
import math;
class Evaluation:
    """Score a clustering against gold-standard labels.

    Both inputs are pickled dicts that map a group id to the list of
    document ids in that group. The class computes purity, entropy, mutual
    information and normalized mutual information (NMI) from the overlap
    counts between every (cluster, gold-standard group) pair.

    Attributes:
        clusters: dict, cluster id -> list of document ids.
        goldstandards: dict, gold-standard id -> list of document ids.
        k: number of clusters.
        q: number of gold-standard groups.
        minclusterId, maxclusterId: smallest / largest cluster id.
        mingoldstandardId, maxgoldstandardId: smallest / largest gold id.
        coocurrence: dict, (cluster id, gold id) -> number of shared ids.
        N: total number of documents.
    """

    def __init__(self, clusterfid, goldstandardfid):
        """Load both pickles and derive the basic counts.

        Args:
            clusterfid: path of the pickled clustering result.
            goldstandardfid: path of the pickled gold-standard answer.

        Raises:
            ValueError: if the two files do not hold the same number of
                documents.
        """
        # NOTE(review): unpickling executes arbitrary code from the file;
        # only load files produced by a trusted source.
        with open(clusterfid) as fid:
            self.clusters = mypickle.load(fid)  # clustering results
        with open(goldstandardfid) as fid:
            self.goldstandards = mypickle.load(fid)  # gold-standard answers
        tempclusterkeys = sorted(self.clusters.keys())
        tempgoldstandardkeys = sorted(self.goldstandards.keys())
        self.k = len(tempclusterkeys)
        self.q = len(tempgoldstandardkeys)
        self.minclusterId = tempclusterkeys[0]  # smallest cluster id
        self.maxclusterId = tempclusterkeys[self.k - 1]  # largest cluster id
        self.mingoldstandardId = tempgoldstandardkeys[0]
        self.maxgoldstandardId = tempgoldstandardkeys[self.q - 1]
        # (clusterId, goldstandardId) -> number of documents shared by both
        self.coocurrence = {}
        N1 = sum(len(self.clusters[m]) for m in tempclusterkeys)
        N2 = sum(len(self.goldstandards[m]) for m in tempgoldstandardkeys)
        if N1 != N2:
            # Without a consistent N every metric below is meaningless, so
            # fail here instead of leaving self.N undefined.
            raise ValueError(
                'there is a error N1=%d,N2=%d,please reexamine the data source'
                % (N1, N2))
        self.N = N1  # number of documents

    def GenerateCoocurrence(self):
        """Fill ``self.coocurrence`` with the overlap size of every pair.

        Pairs that already have an entry are left untouched.
        """
        # Build each gold-standard set once rather than once per cluster.
        gold_sets = dict(
            (key_gold, set(members))
            for key_gold, members in self.goldstandards.items())
        for key_cluster, members in self.clusters.items():
            cluster_set = set(members)
            for key_gold, gold_set in gold_sets.items():
                pair = (key_cluster, key_gold)
                if pair not in self.coocurrence:
                    self.coocurrence[pair] = len(cluster_set & gold_set)

    def _EnsureCoocurrence(self):
        """Build the overlap table on first use if the caller has not."""
        if not self.coocurrence:
            self.GenerateCoocurrence()

    def CalPurityForPerCluster(self, clusterId):
        """Return the share of ``clusterId`` held by its dominant gold group."""
        self._EnsureCoocurrence()
        # Iterate the real gold ids: they are not required to be contiguous.
        largest = max(self.coocurrence[(clusterId, goldId)]
                      for goldId in self.goldstandards)
        return float(largest) / float(len(self.clusters[clusterId]))

    def CalPurity(self):
        """Return the overall purity: per-cluster purity weighted by size."""
        result = 0.0
        for clusterId in sorted(self.clusters):
            purityPer = self.CalPurityForPerCluster(clusterId)
            result = result + float(len(self.clusters[clusterId])) * purityPer / float(self.N)
        return result

    def CalEntropyFormula(self, seq):
        """Return the base-2 entropy of the probabilities in ``seq``.

        Non-positive entries are skipped (0 * log 0 is treated as 0).
        """
        result = 0.0
        for elemP in seq:
            if elemP > 0:
                result = result + elemP * math.log(elemP, 2)
        return -result

    def CalEntropyForPerCluster(self, clusterId):
        """Return the entropy of the gold-label distribution in one cluster."""
        self._EnsureCoocurrence()
        size = float(len(self.clusters[clusterId]))
        seq = [float(self.coocurrence[(clusterId, goldId)]) / size
               for goldId in sorted(self.goldstandards)]
        return self.CalEntropyFormula(seq)

    def CalEntropy(self):
        """Return the overall entropy: per-cluster entropy weighted by size."""
        result = 0
        for clusterId in sorted(self.clusters):
            entropyPer = self.CalEntropyForPerCluster(clusterId)
            result = result + float(len(self.clusters[clusterId])) * entropyPer / float(self.N)
        return result

    def CalMutualInformation(self):
        """Return the mutual information (base 2) between the two partitions."""
        self._EnsureCoocurrence()
        result = 0.0
        for clusterId in sorted(self.clusters):
            N_c = len(self.clusters[clusterId])
            for goldId in sorted(self.goldstandards):
                N_g = len(self.goldstandards[goldId])
                N_cg = self.coocurrence[(clusterId, goldId)]
                part = float(self.N) * float(N_cg) / (N_c * N_g)
                # Pairs with no shared document contribute nothing.
                if part > 0:
                    result = result + (float(N_cg) / float(self.N)) * math.log(part, 2)
        return result

    def CalNMI(self):
        """Return the normalized mutual information ``2*I / (H1 + H2)``.

        When both partitions consist of a single group, both entropies are
        zero; the partitions are then identical and 1.0 is returned instead
        of dividing by zero.
        """
        # Entropy of the automatic clusters.
        seq1 = [float(len(self.clusters[clusterId])) / float(self.N)
                for clusterId in sorted(self.clusters)]
        # Entropy of the gold-standard groups.
        seq2 = [float(len(self.goldstandards[goldId])) / float(self.N)
                for goldId in sorted(self.goldstandards)]
        H1 = self.CalEntropyFormula(seq1)
        H2 = self.CalEntropyFormula(seq2)
        if H1 + H2 == 0:
            return 1.0
        IG = self.CalMutualInformation()
        return 2 * IG / (H1 + H2)

if __name__ == "__main__":
    # Command line: <clusters pickle> <gold-standard pickle>
    clusterAddress = str(sys.argv[1])
    goldAddress = str(sys.argv[2])
    e = Evaluation(clusterAddress, goldAddress)

    # Report the basic counts first. Each attribute is read only when its
    # line is printed, so output order matches the attribute order below.
    summary = (
        ('聚类算法产生簇个数%d', 'k'),
        ('人工标注的标准答案中簇个数%d', 'q'),
        ('文档总数%d', 'N'),
        ('最小聚类ID标号%d', 'minclusterId'),
        ('最大聚类ID标号%d', 'maxclusterId'),
        ('标准答案中最小聚类ID标号%d', 'mingoldstandardId'),
        ('标准答案中最大聚类ID标号%d', 'maxgoldstandardId'),
    )
    for template, attribute in summary:
        print(template % getattr(e, attribute))

    # Build the overlap table, then compute and print one metric at a time.
    e.GenerateCoocurrence()
    print('纯度为%f' % e.CalPurity())
    print('熵为%f' % e.CalEntropy())
    print('归一化互信息为%f' % e.CalNMI())

代码调用示意图



内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: