您的位置:首页 > 其它

聚类算法——K-means(下)

2012-03-08 21:39 190 查看
  K-means的源码实现

  一般情况下,我们可以用C++/Matlab/Python等语言来实现K-means算法。结合我近期刚学的C++,先从C++的实现谈起。在C++中,我们一般直接使用OpenCV库中已经写好的K-means函数,即cvKMeans2。首先来看函数原型:
  从OpenCV manual看到的是:
int cvKMeans2(const CvArr* samples, int nclusters,
        CvArr* labels, CvTermCriteria termcrit,
        int attempts=1, CvRNG* rng=0,int flags=0,
        CvArr* centers=0,double* compactness=0);
除去已有默认值的参数,我们自己需要输入的参数为:
void cvKMeans2(
  const CvArr* samples, //输入样本的浮点矩阵,每个样本一行。
  int cluster_count, //所给定的聚类数目
  CvArr* labels, //输出整数向量:每个样本对应的类别标识
  CvTermCriteria termcrit //指定聚类的最大迭代次数和/或精度(两次迭代引起的聚类中心的移动距离)
);
其使用例程为:

#ifdef _CH_
#pragma package <opencv>
#endif

#define CV_NO_BACKWARD_COMPATIBILITY

#ifndef _EiC
#include "cv.h"
#include "highgui.h"
#include <stdio.h>
#endif

int main( int argc, char** argv )
{
#define MAX_CLUSTERS 5    //设置类别的颜色,个数(《=5)
CvScalar color_tab[MAX_CLUSTERS];
IplImage* img = cvCreateImage( cvSize( 500, 500 ), 8, 3 );
CvRNG rng = cvRNG(-1);
CvPoint ipt;

color_tab[0] = CV_RGB(255,0,0);
color_tab[1] = CV_RGB(0,255,0);
color_tab[2] = CV_RGB(100,100,255);
color_tab[3] = CV_RGB(255,0,255);
color_tab[4] = CV_RGB(255,255,0);

cvNamedWindow( "clusters", 1 );

for(;;)
{
char key;
int k, cluster_count = cvRandInt(&rng)%MAX_CLUSTERS + 1;
int i, sample_count = cvRandInt(&rng)%1000 + 1;
CvMat* points = cvCreateMat( sample_count, 1, CV_32FC2 );
CvMat* clusters = cvCreateMat( sample_count, 1, CV_32SC1 );
cluster_count = MIN(cluster_count, sample_count);

/** generate random sample from multigaussian distribution */
for( k = 0; k < cluster_count; k++ )
{
CvPoint center;
CvMat point_chunk;
center.x = cvRandInt(&rng)%img->width;
center.y = cvRandInt(&rng)%img->height;
cvGetRows( points, &point_chunk, k*sample_count/cluster_count,
k == cluster_count - 1 ? sample_count :
(k+1)*sample_count/cluster_count, 1 );

cvRandArr( &rng, &point_chunk, CV_RAND_NORMAL,
cvScalar(center.x,center.y,0,0),
cvScalar(img->width*0.1,img->height*0.1,0,0));
}

/** shuffle samples */
for( i = 0; i < sample_count/2; i++ )
{
CvPoint2D32f* pt1 = (CvPoint2D32f*)points->data.fl + cvRandInt(&rng)%sample_count;
CvPoint2D32f* pt2 = (CvPoint2D32f*)points->data.fl + cvRandInt(&rng)%sample_count;
CvPoint2D32f temp;
CV_SWAP( *pt1, *pt2, temp );
}

printf( "iterations=%d\n", cvKMeans2( points, cluster_count, clusters,
cvTermCriteria( CV_TERMCRIT_EPS+CV_TERMCRIT_ITER, 10, 1.0 ),
5, 0, 0, 0, 0 ));

cvZero( img );

for( i = 0; i < sample_count; i++ )
{
int cluster_idx = clusters->data.i[i];
ipt.x = (int)points->data.fl[i*2];
ipt.y = (int)points->data.fl[i*2+1];
cvCircle( img, ipt, 2, color_tab[cluster_idx], CV_FILLED, CV_AA, 0 );
}

cvReleaseMat( &points );
cvReleaseMat( &clusters );

cvShowImage( "clusters", img );

key = (char) cvWaitKey(0);
if( key == 27 || key == 'q' || key == 'Q' ) // 'ESC'
break;
}

cvDestroyWindow( "clusters" );
return 0;
}

#ifdef _EiC
main(1,"kmeans.c");
#endif


  至于cvKMeans2函数的具体实现细节,可参见OpenCV源码。

  下面是Python的实现代码(网上所找):

#!/usr/bin/python

from __future__ import with_statement
import cPickle as pickle
from matplotlib import pyplot
from numpy import zeros, array, tile
from scipy.linalg import norm
import numpy.matlib as ml
import random

def kmeans(X, k, observer=None, threshold=1e-15, maxiter=300):
    """Cluster the rows of X into k groups with Lloyd's k-means iteration.

    Parameters:
        X: array-like of shape (N, d), one sample per row. It is copied into
            a float ndarray, so the caller's data is never modified.
        k: number of clusters. The initial centers are k distinct rows of X
            chosen with ``random.sample``.
        observer: optional callable ``observer(iteration, labels, centers)``.
            It is called at the start of every iteration and once more after
            the loop has finished.
        threshold: the loop stops as soon as the objective decreases by less
            than this amount between two consecutive iterations.
        maxiter: hard upper bound on the number of iterations.

    Returns:
        (labels, centers): ``labels[i]`` is the index of the center nearest
        to ``X[i]``; ``centers`` is a (k, d) float array.

    Raises:
        ValueError: from ``random.sample`` when k is larger than len(X).
    """
    X = array(X, dtype=float)
    N = len(X)
    labels = zeros(N, dtype=int)
    # Sample row indices rather than rows: random.sample needs a real
    # sequence, which an ndarray is not on current Python 3.
    seed_rows = random.sample(range(N), k)
    centers = array([X[row] for row in seed_rows])
    n_iter = 0

    def calc_J():
        # Objective: sum of Euclidean distances from each sample to the
        # center it is currently assigned to.
        total = 0
        for i in range(N):
            total += norm(X[i] - centers[labels[i]])
        return total

    def distmat(A, B):
        # Pairwise squared distances via |a|^2 + |b|^2 - 2 a.b; entry (i, j)
        # relates row i of A to row j of B.
        aa = (A * A).sum(axis=1)
        bb = (B * B).sum(axis=1)
        ab = A.dot(B.T)
        return aa[:, None] + bb[None, :] - 2 * ab

    Jprev = calc_J()
    while True:
        # notify the observer
        if observer is not None:
            observer(n_iter, labels, centers)

        # assign every sample to its nearest center
        dist = distmat(X, centers)
        labels = dist.argmin(axis=1)

        # re-calculate each center as the mean of its members
        for j in range(k):
            members = X[labels == j]
            # An empty cluster keeps its previous center; the mean of zero
            # rows would be NaN and poison every later distance.
            if len(members) > 0:
                centers[j] = members.mean(axis=0)

        J = calc_J()
        n_iter += 1

        if Jprev - J < threshold:
            break
        Jprev = J
        if n_iter >= maxiter:
            break

    # final notification
    if observer is not None:
        observer(n_iter, labels, centers)

    return labels, centers

if __name__ == '__main__':
    # Load previously generated points. Each entry of `samples` is indexed
    # with [0] for its x coordinates and [1] for its y coordinates.
    # Pickle data is binary, so the file is opened in 'rb' mode.
    # NOTE(review): pickle.load can execute code embedded in the file; only
    # run this against a cluster.pkl you generated yourself.
    with open('cluster.pkl', 'rb') as inf:
        samples = pickle.load(inf)

    # Stack all groups into one (N, 2) array of points.
    N = 0
    for smp in samples:
        N += len(smp[0])
    X = zeros((N, 2))
    idxfrm = 0
    for i in range(len(samples)):
        idxto = idxfrm + len(samples[i][0])
        X[idxfrm:idxto, 0] = samples[i][0]
        X[idxfrm:idxto, 1] = samples[i][1]
        idxfrm = idxto

    def observer(iter, labels, centers):
        """Plot the current assignment and save it as one PNG per iteration."""
        # Parenthesised single argument: prints the same text on Python 2 and 3.
        print("iter %d." % iter)
        # One RGB colour per cluster; three entries because k is 3 below.
        colors = array([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
        # Clear the previous plot. The hold() API used for this before was
        # removed in Matplotlib 3.0.
        pyplot.cla()

        # draw points
        data_colors = [colors[lbl] for lbl in labels]
        pyplot.scatter(X[:, 0], X[:, 1], c=data_colors, alpha=0.5)
        # draw centers
        pyplot.scatter(centers[:, 0], centers[:, 1], s=200, c=colors)

        # NOTE(review): nothing here creates the 'kmeans' directory, so it
        # presumably has to exist already -- confirm before running.
        pyplot.savefig('kmeans/iter_%02d.png' % iter, format='png')

    kmeans(X, 3, observer=observer)


  matlab的kmeans实现代码可直接参照其kmeans(X,k)函数的实现源码。
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: