您的位置:首页 > 编程语言

K-Means源代码

2014-05-10 23:30 375 查看
K-Means源代码

        作者:liangdas

        出处:简单点儿,通俗点儿,机器学习 http://write.blog.csdn.net/postedit/25512617

        下面是我写的K-Means的源代码,共有三个文件:K-Means.h头文件、K-Means.c文件和Main.cpp文件(注意:代码中是以 #include "K_Means.h" 引用头文件的,保存时头文件名需与之保持一致)。K-Means.h和K-Means.c文件中引用了系统的stdio.h、stdlib.h文件,是因为里面用到了printf和exit函数;如果去掉这两个函数,也就可以去掉对stdio.h、stdlib.h文件的引用,这样K-Means.h和K-Means.c文件就完全是用C语言自己实现的,可移植性也就更强了!

 
      后面的main.cpp是介绍怎么使用的,输入是按txt格式存贮的,存贮格式是:

sample number(样本总数)

feature number(特征维数)

intend class number(待分类的类别)

feature list as(特征列表):

feature1 feature2 ...

feature1 feature2 ...

......

        当然也可以自己定义数据的格式,并重新编写LoadPatterns()函数。

K-Means.h
/***********************************
*	Author: liangdas
*	Time: 20140504
*	Version: 0_20140504
*	Contaction:
*		QQ: 358536026  Email: liangdas1986@163.com
*	Working place: Beijing Samsuang Telecom&Technology Institute
************************************/
#ifndef __K_MEAN_H__
#define __K_MEAN_H__

#ifdef __cplusplus
extern "C"{
#endif

/* Status codes returned by LoadPatterns(). */
#define         SUCCESS         1
#define         FAILURE         0
/* Boolean values; CalcNewClustCenters() returns one of them as its convergence flag. */
#define         TRUE            1
#define         FALSE           0
/* Fixed capacities used to size the arrays inside CLASSCLUSTER. */
#define         MAX_DIM			20		// maximum number of feature dimensions (size of Center[])
#define         MAX_SAMPLES		2000		// maximum number of samples one cluster can hold (size of Member[])
#define         MAX_CLUSTER		10		// maximum number of clusters

/* One cluster: its centre plus the list of samples currently assigned to it. */
typedef struct stCluster
{
double Center[MAX_DIM];		// centre coordinates, one entry per feature dimension
int Member[MAX_SAMPLES];	// sample indices (rows of the pattern table) assigned to this cluster
int NumMembers;			// number of valid entries in Member[]
}CLASSCLUSTER, *PCLASSCLUSTER;

/***************************************************************
* Function: ReClassify()
* Description: Reassign every sample to the cluster whose centre is
*   nearest to it.
* Input&Output:
*   Pattern    - sample table, indexed as Pattern[sample][dimension]
*   NumSamples - number of samples to assign
*   Cluster    - cluster array; the member lists are cleared and rebuilt
*   ClusterNum - number of clusters in Cluster
*   NumDim     - number of feature dimensions per sample
* Returns: nothing
****************************************************************/
void ReClassify(double** Pattern, int NumSamples, PCLASSCLUSTER Cluster,
int ClusterNum, int NumDim);

/***************************************************************
* Function: CalcNewClustCenters()
* Description: Recompute each cluster centre as the mean of the samples
*   in its member list.
* Input&Output:
*   Pattern    - sample table, indexed as Pattern[sample][dimension]
*   NumSamples - number of samples in Pattern
*   Cluster    - cluster array; member lists are read, centres are updated
*   ClusterNum - number of clusters in Cluster
*   NumDim     - number of feature dimensions per sample
* Returns: TRUE when no centre coordinate changed, FALSE otherwise
****************************************************************/
int CalcNewClustCenters(double** Pattern, int NumSamples, PCLASSCLUSTER Cluster,
int ClusterNum, int NumDim);

/***************************************************************
* Function: CalcuDistance()
* Description: Distance measure between sample sampleID and the centre
*   of cluster clusterID. The value is the sum of squared coordinate
*   differences (no square root is taken).
* Input&Output:
*   sampleID   - row index into Pattern
*   clusterID  - index into Cluster
*   Pattern    - sample table, indexed as Pattern[sample][dimension]
*   NumSamples - not used by the implementation
*   Cluster    - cluster array whose centres are read
*   ClusterNum - not used by the implementation
*   NumDim     - number of feature dimensions to compare
* Returns: the squared Euclidean distance
****************************************************************/
double CalcuDistance(int sampleID, int clusterID, double** Pattern, int NumSamples,
PCLASSCLUSTER Cluster, int ClusterNum, int NumDim);

/***************************************************************
* Function: FindClosestCluster()
* Description: Find the cluster whose centre is closest to sample
*   sampleID, as measured by CalcuDistance(). On a tie the cluster
*   with the lowest index wins.
* Input&Output:
*   sampleID   - row index into Pattern
*   Pattern    - sample table, indexed as Pattern[sample][dimension]
*   NumSamples - number of samples in Pattern
*   Cluster    - cluster array whose centres are read
*   ClusterNum - number of clusters in Cluster
*   NumDim     - number of feature dimensions per sample
* Returns: index of the closest cluster. The process is terminated with
*   exit() when no cluster can be selected (e.g. ClusterNum <= 0).
****************************************************************/
int FindClosestCluster(int sampleID, double** Pattern, int NumSamples,
PCLASSCLUSTER Cluster, int ClusterNum, int NumDim);

/***************************************************************
* Function: LoadPatterns()
* Description: Load the sample table from the text file named fname.
* Input&Output:
*   fname        - path of the file to read
*   Pattern      - [out] filled as Pattern[sample][dimension]; the rows
*                  must already be allocated by the caller
*   pNumPatterns - [out] number of samples declared in the file
*   Cluster      - not used by the implementation
*   pClusterNum  - [out] number of clusters requested in the file
*   pNumDim      - [out] number of feature dimensions per sample
* Return: SUCCESS when the file was read, FAILURE on error (e.g. the
*   file cannot be opened)
* File format: sample count, feature dimension, cluster count, then the
*   feature values of every sample
****************************************************************/
int  LoadPatterns(char *fname, double** Pattern, int* pNumPatterns,
PCLASSCLUSTER Cluster, int* pClusterNum, int* pNumDim);

/***************************************************************
* Function: InitClusters()
* Description: Choose the initial cluster centres. The first ClusterNum
*   samples of Pattern are copied as the starting centres, and the
*   chosen centres are printed to stdout.
* Input&Output:
*   Pattern    - sample table, indexed as Pattern[sample][dimension]
*   NumSamples - number of samples in Pattern
*   Cluster    - [out] cluster array whose centres are initialised
*   ClusterNum - number of clusters to initialise
*   NumDim     - number of feature dimensions per sample
* Returns: nothing
****************************************************************/
void InitClusters(double** Pattern, int NumSamples, PCLASSCLUSTER Cluster,
int ClusterNum, int NumDim);

/***************************************************************
* Function: RunKMeans()
* Description: Run the K-Means iteration. ReClassify() and
*   CalcNewClustCenters() are repeated until the centres stop changing
*   or nIterTimes iterations have been performed; at least one
*   iteration is always executed.
* Input&Output:
*   Pattern    - sample table, indexed as Pattern[sample][dimension]
*   NumSamples - number of samples in Pattern
*   Cluster    - cluster array; centres must be initialised on entry,
*                centres and member lists hold the result on return
*   ClusterNum - number of clusters in Cluster
*   NumDim     - number of feature dimensions per sample
*   nIterTimes - upper bound on the number of iterations
* Returns: nothing
****************************************************************/
void RunKMeans(double** Pattern, int NumSamples, PCLASSCLUSTER Cluster,
int ClusterNum, int NumDim, int nIterTimes);

/***************************************************************
* Function: SaveCenters()
* Description: Write the cluster centres to a text file: a header line,
*   then one line per cluster holding its NumDim tab-separated centre
*   coordinates followed by the cluster index. A message is printed
*   and nothing is written when the file cannot be opened.
* Input&Output:
*   pFilePath  - path of the file to create
*   Cluster    - cluster array whose centres are written
*   ClusterNum - number of clusters in Cluster
*   NumDim     - number of feature dimensions per centre
* Returns: nothing
****************************************************************/
void SaveCenters(char* pFilePath, PCLASSCLUSTER Cluster, int ClusterNum, int NumDim);

/***************************************************************
* Function: SaveClusters()
* Description: Write the clustered samples to a text file: a header
*   line, then one line per sample holding its NumDim tab-separated
*   feature values followed by the index of the cluster it belongs to.
*   Samples are grouped by cluster. A message is printed and nothing is
*   written when the file cannot be opened.
* Input&Output:
*   pFilePath  - path of the file to create
*   Pattern    - sample table, indexed as Pattern[sample][dimension]
*   NumSamples - number of samples in Pattern
*   Cluster    - cluster array whose member lists are read
*   ClusterNum - number of clusters in Cluster
*   NumDim     - number of feature dimensions per sample
* Returns: nothing
****************************************************************/
void SaveClusters(char* pFilePath, double** Pattern, int NumSamples, PCLASSCLUSTER Cluster,
int ClusterNum, int NumDim);

#ifdef __cplusplus
}
#endif

#endif


K-Means.c

/***********************************
*	Author: liangdas
*	Time: 20140504
*	Version: 0_20140504
*	Contaction:
*		QQ: 358536026  Email: liangdas1986@163.com
*	Working place: Beijing Samsuang Telecom&Technology Institute
************************************/
#include <stdlib.h>
#include <io.h>
#include <stdio.h>
#include "K_Means.h"

#ifdef __cplusplus
extern "C"{
#endif

/********************************************************
* Function: LoadPatterns()
* Description: Load the sample table from the text file named fname.
* Input&OutPut:
*   fname        - path of the file to read
*   Pattern      - [out] filled as Pattern[sample][dimension]; the rows
*                  must already be allocated by the caller
*   pNumPatterns - [out] number of samples declared in the file
*   Cluster      - not used by this function
*   pClusterNum  - [out] number of clusters requested in the file
*   pNumDim      - [out] number of feature dimensions per sample
* Return: SUCCESS when the whole file was read. FAILURE when the file
*   cannot be opened, is truncated or malformed, or declares counts
*   that are not positive or exceed the MAX_* limits of K_Means.h.
*   On FAILURE the contents of Pattern are unspecified.
* File format: sample count, feature dimension, cluster count, then the
*   whitespace-separated feature values of every sample
*********************************************************/
int LoadPatterns(char *fname, double** Pattern, int* pNumPatterns,
PCLASSCLUSTER Cluster, int* pClusterNum, int* pNumDim)
{
    FILE* InFilePtr;
    int i, j;
    int status = FAILURE;
    double x;

    (void)Cluster;  /* kept in the signature for interface compatibility */

    InFilePtr = fopen(fname, "r");
    if (InFilePtr == NULL)
    {
        return FAILURE;
    }

    /* Header: number of samples, feature dimension, number of clusters. */
    if (fscanf(InFilePtr, "%d", pNumPatterns) != 1 ||
        fscanf(InFilePtr, "%d", pNumDim) != 1 ||
        fscanf(InFilePtr, "%d", pClusterNum) != 1)
    {
        goto done;
    }

    if (*pNumPatterns <= 0 || *pNumDim <= 0 || *pClusterNum <= 0)
    {
        goto done;
    }

#if defined(MAX_SAMPLES) && defined(MAX_DIM) && defined(MAX_CLUSTER)
    /* Reject files that would not fit the fixed-size buffers sized by the
       limits published in K_Means.h, before anything is written to Pattern. */
    if (*pNumPatterns > MAX_SAMPLES || *pNumDim > MAX_DIM || *pClusterNum > MAX_CLUSTER)
    {
        goto done;
    }
#endif

    for (i = 0; i < *pNumPatterns; i++)
    {
        for (j = 0; j < *pNumDim; j++)
        {
            /* Stop at the first missing or non-numeric value instead of
               storing an indeterminate x. */
            if (fscanf(InFilePtr, "%lg", &x) != 1)
            {
                goto done;
            }
            Pattern[i][j] = x;
        }
    }
    status = SUCCESS;

done:
    /* Single exit point so the stream is closed on every path. */
    fclose(InFilePtr);
    return status;
}

/***************************************************************
* Function: InitClusters()
* Description: Choose the initial cluster centres: sample i becomes the
*   centre (and the single member) of cluster i for the first
*   ClusterNum samples. The chosen centres are printed to stdout.
* Input&Output:
*   Pattern    - sample table, indexed as Pattern[sample][dimension]
*   NumSamples - number of samples in Pattern
*   Cluster    - [out] cluster array to initialise
*   ClusterNum - number of clusters to initialise
*   NumDim     - number of feature dimensions per sample
* Returns: nothing. When ClusterNum exceeds NumSamples a message is
*   printed and Cluster is left untouched.
****************************************************************/
void InitClusters(double** Pattern, int NumSamples, PCLASSCLUSTER Cluster, int ClusterNum, int NumDim)
{
    int i, j;

    printf("Initial cluster centers:\n");
    if (ClusterNum > NumSamples)
    {
        /* There are not enough samples to seed every cluster; stop here
           rather than read rows of Pattern that hold no sample. */
        printf("class number exceed to sample number\n");
        return;
    }

    for (i = 0; i < ClusterNum; i++)
    {
        Cluster[i].Member[0] = i;
        Cluster[i].NumMembers = 1;  /* keep the count consistent with Member[0] */
        for (j = 0; j < NumDim; j++)
        {
            Cluster[i].Center[j] = Pattern[i][j];
        }
    }

    /* Print every coordinate; for NumDim == 2 the output is "(x,y)". */
    for (i = 0; i < ClusterNum; i++)
    {
        printf("ClusterCenter[%d]=(", i);
        for (j = 0; j < NumDim; j++)
        {
            if (j > 0)
            {
                printf(",");
            }
            printf("%f", Cluster[i].Center[j]);
        }
        printf(")\n");
    }
    printf("\n");
}

/***************************************************************
* Function: RunKMeans()
* Description: Run the K-Means iteration. Each pass reassigns the
*   samples (ReClassify) and then recomputes the centres
*   (CalcNewClustCenters). The loop ends when the centres stop changing
*   or after nIterTimes passes; at least one pass is always executed.
* Input&Output:
*   Pattern    - sample table, indexed as Pattern[sample][dimension]
*   NumSamples - number of samples in Pattern
*   Cluster    - cluster array; centres must be initialised on entry
*   ClusterNum - number of clusters in Cluster
*   NumDim     - number of feature dimensions per sample
*   nIterTimes - upper bound on the number of passes
* Returns: nothing
****************************************************************/
void RunKMeans(double** Pattern, int NumSamples, PCLASSCLUSTER Cluster, int ClusterNum, int NumDim, int nIterTimes)
{
    int passesDone = 0;
    int finished;

    do
    {
        printf("iteration time = %d\n", passesDone);
        ReClassify(Pattern, NumSamples, Cluster, ClusterNum, NumDim);
        finished = CalcNewClustCenters(Pattern, NumSamples, Cluster, ClusterNum, NumDim);
        passesDone++;
        /* The iteration cap overrides the convergence flag. */
        if (passesDone >= nIterTimes)
        {
            finished = TRUE;
        }
    } while (finished == FALSE);
}

/***************************************************************
* Function: CalcuDistance()
* Description: Squared Euclidean distance between sample sampleID and
*   the centre of cluster clusterID. No square root is taken; the
*   squared value orders candidates the same way.
* Input&Output:
*   sampleID   - row index into Pattern
*   clusterID  - index into Cluster
*   Pattern    - sample table, indexed as Pattern[sample][dimension]
*   NumSamples - not used
*   Cluster    - cluster array whose centres are read
*   ClusterNum - not used
*   NumDim     - number of feature dimensions to compare
* Returns: sum over all dimensions of (centre - sample)^2
****************************************************************/
double CalcuDistance(int sampleID, int clusterID, double** Pattern, int NumSamples, PCLASSCLUSTER Cluster, int ClusterNum, int NumDim)
{
    double dist = 0;
    double diff;
    int i;

    (void)NumSamples;
    (void)ClusterNum;

    for (i = 0; i < NumDim; i++)
    {
        diff = Cluster[clusterID].Center[i] - Pattern[sampleID][i];
        dist += diff * diff;
    }
    return dist;
}

/***************************************************************
* Function: FindClosestCluster()
* Description: Find the cluster whose centre is closest to sample
*   sampleID, as measured by CalcuDistance(). On a tie the cluster
*   with the lowest index wins. Clusters whose distance is NaN are
*   never selected.
* Input&Output:
*   sampleID   - row index into Pattern
*   Pattern    - sample table, indexed as Pattern[sample][dimension]
*   NumSamples - number of samples in Pattern
*   Cluster    - cluster array whose centres are read
*   ClusterNum - number of clusters in Cluster
*   NumDim     - number of feature dimensions per sample
* Returns: index of the closest cluster. When no cluster can be
*   selected (ClusterNum <= 0, or every distance is NaN) an error is
*   written to stderr and the process exits with EXIT_FAILURE, because
*   callers index Cluster[] with the result unchecked.
****************************************************************/
int FindClosestCluster(int sampleID, double** Pattern, int NumSamples, PCLASSCLUSTER Cluster, int ClusterNum, int NumDim)
{
    int i;
    int ClustID = -1;
    double MinDist = 0.0;
    double d;

    for (i = 0; i < ClusterNum; i++)
    {
        d = CalcuDistance(sampleID, i, Pattern, NumSamples, Cluster, ClusterNum, NumDim);
        /* d == d is false only for NaN. The first usable distance is taken
           unconditionally, so no "infinitely large" sentinel is needed. */
        if (d == d && (ClustID < 0 || d < MinDist))
        {
            MinDist = d;
            ClustID = i;
        }
    }

    if (ClustID < 0)
    {
        fprintf(stderr, "FindClosestCluster: no usable cluster for sample %d\n", sampleID);
        exit(EXIT_FAILURE);
    }
    return ClustID;
}

/***************************************************************
* Function: ReClassify()
* Description: Reassign every sample to the cluster whose centre is
*   nearest to it. All member lists are emptied first and then rebuilt
*   in ascending sample order.
* Input&Output:
*   Pattern    - sample table, indexed as Pattern[sample][dimension]
*   NumSamples - number of samples to assign; Member[] holds at most
*                MAX_SAMPLES entries and no bounds check is made here
*   Cluster    - cluster array; centres are read, member lists rebuilt
*   ClusterNum - number of clusters in Cluster
*   NumDim     - number of feature dimensions per sample
* Returns: nothing
****************************************************************/
void ReClassify(double** Pattern, int NumSamples, PCLASSCLUSTER Cluster, int ClusterNum, int NumDim)
{
    int clusterIdx = 0;
    int sampleIdx = 0;
    int nearest;

    /* Empty every membership list. */
    while (clusterIdx < ClusterNum)
    {
        Cluster[clusterIdx].NumMembers = 0;
        clusterIdx++;
    }

    /* Append each sample to the list of its nearest cluster. */
    while (sampleIdx < NumSamples)
    {
        nearest = FindClosestCluster(sampleIdx, Pattern, NumSamples, Cluster, ClusterNum, NumDim);
        Cluster[nearest].Member[Cluster[nearest].NumMembers] = sampleIdx;
        Cluster[nearest].NumMembers += 1;
        sampleIdx++;
    }
}

/***************************************************************
* Function: CalcNewClustCenters()
* Description: Recompute each cluster centre as the mean of the samples
*   in its member list. A cluster with no members keeps its previous
*   centre and does not count as a change.
* Input&Output:
*   Pattern    - sample table, indexed as Pattern[sample][dimension]
*   NumSamples - number of samples in Pattern (not used)
*   Cluster    - cluster array; member lists are read, centres updated
*   ClusterNum - number of clusters in Cluster
*   NumDim     - number of feature dimensions per sample; must not
*                exceed MAX_DIM
* Returns: TRUE when no centre coordinate changed in this pass,
*   FALSE otherwise
****************************************************************/
int  CalcNewClustCenters(double** Pattern, int NumSamples, PCLASSCLUSTER Cluster, int ClusterNum, int NumDim)
{
    int ConvFlag = TRUE;
    int VectID, i, j, k;
    double tmp[MAX_DIM];

    (void)NumSamples;

    for (i = 0; i < ClusterNum; i++)
    {
        /* Dividing by a zero member count would turn the centre into NaN.
           A NaN centre can never be selected as nearest again and never
           compares equal, so the run would not converge. */
        if (Cluster[i].NumMembers <= 0)
        {
            continue;
        }

        /* Sum the member samples per dimension. */
        for (k = 0; k < NumDim; k++)
        {
            tmp[k] = 0.0;
        }
        for (j = 0; j < Cluster[i].NumMembers; j++)
        {
            VectID = Cluster[i].Member[j];
            for (k = 0; k < NumDim; k++)
            {
                tmp[k] += Pattern[VectID][k];
            }
        }

        /* Turn the sums into means and record whether anything moved. */
        for (k = 0; k < NumDim; k++)
        {
            tmp[k] = tmp[k] / Cluster[i].NumMembers;
            if (tmp[k] != Cluster[i].Center[k])
            {
                ConvFlag = FALSE;
            }
            Cluster[i].Center[k] = tmp[k];
        }
    }
    return ConvFlag;
}

/***************************************************************
* Function: SaveCenters()
* Description: Write the cluster centres to a text file: the header
*   line "x\ty\tlabel\t", then one line per cluster holding its NumDim
*   tab-separated centre coordinates followed by the cluster index.
*   The cluster array is only read, never modified.
* Input&Output:
*   pFilePath  - path of the file to create or overwrite
*   Cluster    - cluster array whose centres are written
*   ClusterNum - number of clusters in Cluster
*   NumDim     - number of feature dimensions per centre
* Returns: nothing; a message is printed to stdout when the file cannot
*   be opened or closed
****************************************************************/
void SaveCenters(char* pFilePath, PCLASSCLUSTER Cluster, int ClusterNum, int NumDim)
{
    int i, j;
    FILE* fpResultFile;

    fpResultFile = fopen(pFilePath, "w");
    if (fpResultFile == NULL)
    {
        printf("open file %s error\n", pFilePath);
        return;
    }

    fprintf(fpResultFile, "x\ty\tlabel\t\n");
    for (i = 0; i < ClusterNum; i++)
    {
        /* Saving must not touch Cluster[i].Member[]; it holds the
           clustering result. */
        for (j = 0; j < NumDim; j++)
        {
            fprintf(fpResultFile, "%f\t", Cluster[i].Center[j]);
        }
        fprintf(fpResultFile, "%d\n", i);
    }

    /* fclose flushes buffered output, so a failure here means lost data. */
    if (fclose(fpResultFile) != 0)
    {
        printf("close file %s error\n", pFilePath);
    }
}

/***************************************************************
* Function: SaveClusters()
* Description: Write the clustered samples to a text file: the header
*   line "x\ty\tlabel\t", then one line per sample holding its NumDim
*   tab-separated feature values followed by the index of the cluster
*   it belongs to. Samples are grouped by cluster.
* Input&Output:
*   pFilePath  - path of the file to create or overwrite
*   Pattern    - sample table, indexed as Pattern[sample][dimension]
*   NumSamples - number of samples in Pattern (not used)
*   Cluster    - cluster array whose member lists are read
*   ClusterNum - number of clusters in Cluster
*   NumDim     - number of feature dimensions per sample
* Returns: nothing; a message is printed to stdout when the file cannot
*   be opened or closed
****************************************************************/
void SaveClusters(char* pFilePath, double** Pattern, int NumSamples, PCLASSCLUSTER Cluster,
int ClusterNum, int NumDim)
{
    FILE* fpResultFile;
    int i, j, k;
    int nSampleID;

    (void)NumSamples;

    fpResultFile = fopen(pFilePath, "w");
    if (fpResultFile == NULL)
    {
        printf("open file %s error\n", pFilePath);
        return;
    }

    fprintf(fpResultFile, "x\ty\tlabel\t\n");
    for (i = 0; i < ClusterNum; i++)
    {
        for (j = 0; j < Cluster[i].NumMembers; j++)
        {
            nSampleID = Cluster[i].Member[j];
            for (k = 0; k < NumDim; k++)
            {
                /* "%g" is the conversion for a double; the h length
                   modifier is only defined for integer conversions. */
                fprintf(fpResultFile, "%g\t", Pattern[nSampleID][k]);
            }
            fprintf(fpResultFile, "%d\n", i);  /* cluster the sample belongs to */
        }
    }

    /* fclose flushes buffered output, so a failure here means lost data. */
    if (fclose(fpResultFile) != 0)
    {
        printf("close file %s error\n", pFilePath);
    }
}
#ifdef __cplusplus
}
#endif


Main.cpp

#include <stdlib.h>
#include <stdio.h>
#include <io.h>
#include <string.h>
#include "K_Means.h"

#define MAX_ITER_TIMES  1000

int main(int argc, char *argv[])
{	//main procedure
//System kmeans;
//double       Pattern[MAX_SAMPLES][MAX_DIM+1];
double**	Pattern;
CLASSCLUSTER Cluster[MAX_CLUSTER];
int NumSamples;			// Number of patterns
int NumDim;				// Number of dimensions in vector
int ClusterNum;			// Number of clusters

int i = 0;
Pattern = (double**)malloc(sizeof(double*)*MAX_SAMPLES);
for(i=0; i<MAX_SAMPLES; i++)
{
Pattern[i] = (double*)malloc(sizeof(double)*MAX_DIM);
}

char* pFilePath = argv[1];
if (argc<2)
{
printf("usage: intput k_means file\n");
exit(0);
}
if (LoadPatterns(pFilePath, (double**)Pattern, &NumSamples, Cluster, &ClusterNum, &NumDim) == FAILURE)
{
printf("read file %s error\n", pFilePath);
exit(0);
}
//
#if 0
InitClusters((double**)Pattern, NumSamples, Cluster, ClusterNum, NumDim);
#else
Cluster[0].Center[0] = 21;
Cluster[0].Center[1] = 377;

Cluster[1].Center[0] = 20;
Cluster[1].Center[1] = 377;

Cluster[2].Center[0] = 20;
Cluster[2].Center[1] = 376;
#endif

RunKMeans((double**)Pattern, NumSamples, Cluster, ClusterNum, NumDim, MAX_ITER_TIMES);

SaveClusters("cluster.txt", (double**)Pattern, NumSamples, Cluster, ClusterNum, NumDim);
SaveCenters("center.txt", Cluster, ClusterNum, NumDim);
//ShowClusters();

//delete memory
for(i=0; i<MAX_SAMPLES; i++)
{
free(Pattern[i]);
}
free(Pattern);
}



ps:使用或者转载请标明出处,禁止以商业为目的的使用。
如果有需要word版,或者是pdf版的,请与我联系,QQ:358536026
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  Source Code