聚类算法——K-means(下)
2012-03-08 21:39
190 查看
K-means的源码实现
一般情况下,我们可以用C++/Matlab/Python等语言来实现K-means算法。结合近期我刚学的C++,先从C++的实现谈起:在C++中,我们一般直接使用OpenCV库中已经写好的K-means函数,即cvKMeans2。首先来看函数原型:
从OpenCV manual看到的是:
int cvKMeans2(const CvArr* samples, int nclusters,
CvArr* labels, CvTermCriteria termcrit,
int attempts=1, CvRNG* rng=0,int flags=0,
CvArr* centers=0,double* compactness=0);
由于除去已经确定的参数,我们自己需要输入的为:
int cvKMeans2(
const CvArr* samples, //输入样本的浮点矩阵,每个样本一行。
int cluster_count, //所给定的聚类数目
CvArr* labels, //输出整数向量:每个样本对应的类别标识
CvTermCriteria termcrit //指定聚类的最大迭代次数和/或精度(两次迭代引起的聚类中心的移动距离)
);
其使用例程为:
至于cvKmeans2函数的具体实现细节,可参见OpenCV源码
下面是Python的实现代码(网上所找):
matlab的kmeans实现代码可直接参照其kmeans(X,k)函数的实现源码。
一般情况下,我们可以用C++/Matlab/Python等语言来实现K-means算法。结合近期我刚学的C++,先从C++的实现谈起:在C++中,我们一般直接使用OpenCV库中已经写好的K-means函数,即cvKMeans2。首先来看函数原型:
从OpenCV manual看到的是:
int cvKMeans2(const CvArr* samples, int nclusters,
CvArr* labels, CvTermCriteria termcrit,
int attempts=1, CvRNG* rng=0,int flags=0,
CvArr* centers=0,double* compactness=0);
由于除去已经确定的参数,我们自己需要输入的为:
int cvKMeans2(
const CvArr* samples, //输入样本的浮点矩阵,每个样本一行。
int cluster_count, //所给定的聚类数目
CvArr* labels, //输出整数向量:每个样本对应的类别标识
CvTermCriteria termcrit //指定聚类的最大迭代次数和/或精度(两次迭代引起的聚类中心的移动距离)
);
其使用例程为:
#ifdef _CH_ #pragma package <opencv> #endif #define CV_NO_BACKWARD_COMPATIBILITY #ifndef _EiC #include "cv.h" #include "highgui.h" #include <stdio.h> #endif int main( int argc, char** argv ) { #define MAX_CLUSTERS 5 //设置类别的颜色,个数(《=5) CvScalar color_tab[MAX_CLUSTERS]; IplImage* img = cvCreateImage( cvSize( 500, 500 ), 8, 3 ); CvRNG rng = cvRNG(-1); CvPoint ipt; color_tab[0] = CV_RGB(255,0,0); color_tab[1] = CV_RGB(0,255,0); color_tab[2] = CV_RGB(100,100,255); color_tab[3] = CV_RGB(255,0,255); color_tab[4] = CV_RGB(255,255,0); cvNamedWindow( "clusters", 1 ); for(;;) { char key; int k, cluster_count = cvRandInt(&rng)%MAX_CLUSTERS + 1; int i, sample_count = cvRandInt(&rng)%1000 + 1; CvMat* points = cvCreateMat( sample_count, 1, CV_32FC2 ); CvMat* clusters = cvCreateMat( sample_count, 1, CV_32SC1 ); cluster_count = MIN(cluster_count, sample_count); /** generate random sample from multigaussian distribution */ for( k = 0; k < cluster_count; k++ ) { CvPoint center; CvMat point_chunk; center.x = cvRandInt(&rng)%img->width; center.y = cvRandInt(&rng)%img->height; cvGetRows( points, &point_chunk, k*sample_count/cluster_count, k == cluster_count - 1 ? 
sample_count : (k+1)*sample_count/cluster_count, 1 ); cvRandArr( &rng, &point_chunk, CV_RAND_NORMAL, cvScalar(center.x,center.y,0,0), cvScalar(img->width*0.1,img->height*0.1,0,0)); } /** shuffle samples */ for( i = 0; i < sample_count/2; i++ ) { CvPoint2D32f* pt1 = (CvPoint2D32f*)points->data.fl + cvRandInt(&rng)%sample_count; CvPoint2D32f* pt2 = (CvPoint2D32f*)points->data.fl + cvRandInt(&rng)%sample_count; CvPoint2D32f temp; CV_SWAP( *pt1, *pt2, temp ); } printf( "iterations=%d\n", cvKMeans2( points, cluster_count, clusters, cvTermCriteria( CV_TERMCRIT_EPS+CV_TERMCRIT_ITER, 10, 1.0 ), 5, 0, 0, 0, 0 )); cvZero( img ); for( i = 0; i < sample_count; i++ ) { int cluster_idx = clusters->data.i[i]; ipt.x = (int)points->data.fl[i*2]; ipt.y = (int)points->data.fl[i*2+1]; cvCircle( img, ipt, 2, color_tab[cluster_idx], CV_FILLED, CV_AA, 0 ); } cvReleaseMat( &points ); cvReleaseMat( &clusters ); cvShowImage( "clusters", img ); key = (char) cvWaitKey(0); if( key == 27 || key == 'q' || key == 'Q' ) // 'ESC' break; } cvDestroyWindow( "clusters" ); return 0; } #ifdef _EiC main(1,"kmeans.c"); #endif
至于cvKMeans2函数的具体实现细节,可参见OpenCV源码。
下面是Python的实现代码(网上所找):
#!/usr/bin/python
"""K-means clustering with an optional per-iteration observer callback.

Run as a script, it loads the point sets stored in ``cluster.pkl``, clusters
them into three groups and saves one scatter plot per iteration under
``kmeans/``.
"""
import random

from numpy import array, asarray, dot, zeros
from scipy.linalg import norm


def kmeans(X, k, observer=None, threshold=1e-15, maxiter=300):
    """Cluster the rows of ``X`` into ``k`` groups with Lloyd's algorithm.

    Parameters:
        X: 2-D array-like, one sample per row; converted to a float array.
        k: number of clusters; initial centers are ``k`` distinct rows of
            ``X`` chosen with ``random.sample``.
        observer: optional callable ``observer(iteration, labels, centers)``
            invoked before every iteration and once more after the last one.
        threshold: stop as soon as the objective improves by less than this.
        maxiter: upper bound on the number of iterations.

    Returns:
        ``(labels, centers)``: the cluster index of every sample and the
        final cluster centers.

    Raises:
        ValueError: from ``random.sample`` when ``k`` exceeds the number of
            samples.
    """
    # Float storage keeps the recomputed means from being truncated when the
    # caller passes integer data.
    X = asarray(X, dtype=float)
    N = len(X)
    labels = zeros(N, dtype=int)
    # Sample row indices rather than rows: random.sample() only accepts real
    # sequences on Python 3.11+, and an ndarray is not one.
    centers = X[random.sample(range(N), k)]
    iteration = 0

    def calc_J():
        # k-means objective: sum of squared distances of every sample to its
        # assigned center (squared Frobenius norm of the residual matrix).
        # The assignment/update steps never increase this quantity, which is
        # what makes the "improvement < threshold" stopping rule sound.
        return norm(X - centers[labels]) ** 2

    def distmat(A, B):
        # Squared Euclidean distance between every row of A and every row
        # of B, via |a - b|^2 = |a|^2 + |b|^2 - 2 a.b
        aa = (A * A).sum(axis=1)
        bb = (B * B).sum(axis=1)
        return aa[:, None] + bb[None, :] - 2 * dot(A, B.T)

    Jprev = calc_J()
    while True:
        # notify the observer
        if observer is not None:
            observer(iteration, labels, centers)

        # assign every sample to its nearest center
        labels = distmat(X, centers).argmin(axis=1)

        # re-calculate each center as the mean of its members
        for j in range(k):
            members = X[labels == j]
            # An empty cluster has no mean (it would become NaN); keep the
            # previous center in that case.
            if len(members) > 0:
                centers[j] = members.mean(axis=0)

        J = calc_J()
        iteration += 1

        if Jprev - J < threshold:
            break
        Jprev = J
        if iteration >= maxiter:
            break

    # final notification
    if observer is not None:
        observer(iteration, labels, centers)

    return labels, centers


if __name__ == '__main__':
    # Plotting and unpickling are only needed by this demo, so they are
    # imported here; kmeans() itself can be imported without matplotlib.
    from matplotlib import pyplot
    try:
        import cPickle as pickle  # Python 2
    except ImportError:
        import pickle  # Python 3

    # Load previously generated points.
    # NOTE(review): unpickling executes arbitrary code -- only load a
    # cluster.pkl that comes from a trusted source.
    with open('cluster.pkl', 'rb') as inf:
        samples = pickle.load(inf)

    # Each entry of `samples` is indexed as (x values, y values); stack all
    # of them into a single (N, 2) array.
    N = 0
    for smp in samples:
        N += len(smp[0])
    X = zeros((N, 2))
    idxfrm = 0
    for i in range(len(samples)):
        idxto = idxfrm + len(samples[i][0])
        X[idxfrm:idxto, 0] = samples[i][0]
        X[idxfrm:idxto, 1] = samples[i][1]
        idxfrm = idxto

    def observer(iteration, labels, centers):
        print("iter %d." % iteration)
        colors = array([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
        # Clear the previous plot (pyplot.hold() was removed in matplotlib 3.0).
        pyplot.cla()

        # draw points
        data_colors = [colors[lbl] for lbl in labels]
        pyplot.scatter(X[:, 0], X[:, 1], c=data_colors, alpha=0.5)
        # draw centers
        pyplot.scatter(centers[:, 0], centers[:, 1], s=200, c=colors)

        # The kmeans/ directory must already exist.
        pyplot.savefig('kmeans/iter_%02d.png' % iteration, format='png')

    kmeans(X, 3, observer=observer)
matlab的kmeans实现代码可直接参照其kmeans(X,k)函数的实现源码。
相关文章推荐
- 聚类算法——K-means(上)
- 聚类算法——K-means(下)
- 聚类算法——K-means
- 聚类算法——K-means(上)
- 【opencv、机器学习】聚类算法——K-means
- 经典聚类算法——K-means
- 聚类算法——K-means(上)
- delphi pascal 写的 fcm kmeans 模糊C-均值算法
- Whatever Love Means
- 漫谈 Clustering (1): k-means
- k-means简介
- 基于SIFT+Kmeans+LDA的图片分类器的实现
- K-Means 算法
- K-Means 算法
- Maciej Pacula » k-means clustering example (Python)
- K-means
- Kmeans && Kmeans++ && Davies-Bouldin && Dunn index
- 聚类算法之K-means
- Online K-means, SGD variant and Mini-batch K-means
- Weka -- 聚类算法之K-means