您的位置:首页 > 大数据

大数据挖掘算法篇之K-Means实例

2013-12-19 12:00 501 查看
一、引言

  K-Means算法是聚类算法中,应用最为广泛的一种。本文基于欧几里得距离公式:d = sqrt((x1-x2)^+(y1-y2)^)计算二维向量间的距离,作为聚类划分的依据,输入数据为二维数据两列数据,输出结果为聚类中心和元素划分结果。输入数据格式如下:

1 18
2 2
3 2
4 0.0 0.0
5 1.0 0.0
6 0.0 1.0
7 2.0 1.0
8 1.0 2.0
9 2.0 2.0
10 2.0 0.0
11 0.0 2.0
12 7.0 6.0
13 7.0 7.0
14 7.0 8.0
15 8.0 6.0
16 8.0 7.0
17 8.0 8.0
18 8.0 9.0
19 9.0 7.0
20 9.0 8.0
21 9.0 9.0
22


二、欧几里得距离:

欧几里得距离定义: 欧几里得距离( Euclidean distance)也称欧氏距离,在n维空间内,最短的线的长度即为其欧氏距离。它是一个通常采用的距离定义,它是在m维空间中两个点之间的真实距离。
在二维和三维空间中的欧式距离的就是两点之间的距离,二维的公式是
d = sqrt((x1-x2)^+(y1-y2)^)
三维的公式是
d=sqrt((x1-x2)^+(y1-y2)^+(z1-z2)^)
推广到n维空间,欧式距离的公式是
d=sqrt( ∑(xi1-xi2)^ ) 这里i=1,2..n
xi1表示第一个点的第i维坐标,xi2表示第二个点的第i维坐标
n维欧氏空间是一个点集,它的每个点可以表示为(x(1),x(2),...x(n)),其中x(i)(i=1,2...n)是实数,称为x的第i个坐标,两个点x和y=(y(1),y(2)...y(n))之间的距离d(x,y)定义为上面的公式.
欧氏距离看作信号的相似程度。 距离越近就越相似,就越容易相互干扰,误码率就越高。
三、代码示例

1 /****************************************************************************
2 *                                                                           *
3 *  KMEANS                                                                   *
4 *                                                                           *
5 *****************************************************************************/
6
7 #include <stdio.h>
8 #include <stdlib.h>
9 #include <string.h>
10 #include <conio.h>
11 #include <math.h>
12
13 // FUNCTION PROTOTYPES
14
15
16 // DEFINES
17 #define         SUCCESS         1
18 #define         FAILURE         0
19 #define         TRUE            1
20 #define         FALSE           0
21 #define         MAXVECTDIM      20
22 #define         MAXPATTERN      20
23 #define         MAXCLUSTER      10
24
25
26
27
28
29 char *f2a(double x, int width){
30    char cbuf[255];
31    char *cp;
32    int i,k;
33    int d,s;
34     cp=fcvt(x,width,&d,&s);
35     if (s) {
36        strcpy(cbuf,"-");
37      }
38      else {
39        strcpy(cbuf," ");
40        } /* endif */
41     if (d>0) {
42        for (i=0; i<d; i++) {
43           cbuf[i+1]=cp[i];
44           } /* endfor */
45        cbuf[d+1]=0;
46        cp+=d;
47        strcat(cbuf,".");
48        strcat(cbuf,cp);
49        } else {
50           if (d==0) {
51              strcat(cbuf,".");
52              strcat(cbuf,cp);
53              }
54            else {
55              k=-d;
56              strcat(cbuf,".");
57              for (i=0; i<k; i++) {
58                 strcat(cbuf,"0");
59                 } /* endfor */
60              strcat(cbuf,cp);
61              } /* endif */
62        } /* endif */
63     cp=&cbuf[0];
64     return cp;
65 }
66
67
68
69
70 // ***** Defined structures & classes *****
71 struct aCluster {
72    double       Center[MAXVECTDIM];
73    int          Member[MAXPATTERN];  //Index of Vectors belonging to this cluster
74    int          NumMembers;
75 };
76
77 struct aVector {
78    double       Center[MAXVECTDIM];
79    int          Size;
80 };
81
82 class System {
83 private:
84    double       Pattern[MAXPATTERN][MAXVECTDIM+1];
85    aCluster     Cluster[MAXCLUSTER];
86    int          NumPatterns;          // Number of patterns
87    int          SizeVector;           // Number of dimensions in vector
88    int          NumClusters;          // Number of clusters
89    void         DistributeSamples();  // Step 2 of K-means algorithm
90    int          CalcNewClustCenters();// Step 3 of K-means algorithm
91    double       EucNorm(int, int);   // Calc Euclidean norm vector
92    int          FindClosestCluster(int); //ret indx of clust closest to pattern
93                                          //whose index is arg
94 public:
95    void system();
96    int LoadPatterns(char *fname);      // Get pattern data to be clustered
97    void InitClusters();                // Step 1 of K-means algorithm
98    void RunKMeans();                   // Overall control K-means process
99    void ShowClusters();                // Show results on screen
100    void SaveClusters(char *fname);     // Save results to file
101    void ShowCenters();
102 };
103 //输出聚类中心
104 void System::ShowCenters(){
105     int i,j;
106     printf("Cluster centers:\n");
107     for (i=0; i<NumClusters; i++) {
108        Cluster[i].Member[0]=i;
109        printf("ClusterCenter[%d]=(%f,%f)\n",i,Cluster[i].Center[0],Cluster[i].Center[1]);
110        } /* endfor */
111     printf("\n");
112     getchar();
113 }
114
115 //读取文件
116 int System::LoadPatterns(char *fname)
117 {
118    FILE *InFilePtr;
119    int    i,j;
120    double x;
121     if((InFilePtr = fopen(fname, "r")) == NULL)
122         return FAILURE;
123     fscanf(InFilePtr, "%d", &NumPatterns);  // Read # of patterns                18数据量
124     fscanf(InFilePtr, "%d", &SizeVector);   // Read dimension of vector            2维度
125     fscanf(InFilePtr, "%d", &NumClusters);  // Read # of clusters for K-Means    2簇
126     for (i=0; i<NumPatterns; i++) {         // For each vector
127        for (j=0; j<SizeVector; j++) {       // create a pattern
128           fscanf(InFilePtr,"%lg",&x);       // consisting of all elements
129           Pattern[i][j]=x;
130           } /* endfor */
131        } /* endfor */
132     //输出所有数据元素
133     printf("Input patterns:\n");
134     for (i=0; i<NumPatterns; i++) {
135        printf("Pattern[%d]=(%2.3f,%2.3f)\n",i,Pattern[i][0],Pattern[i][1]);
136        } /* endfor */
137     printf("\n--------------------\n");
138     getchar();
139     return SUCCESS;
140 }
141 //***************************************************************************
142 // InitClusters                                                             *
143 //   Arbitrarily assign a vector to each of the K clusters                  *
144 //   We choose the first K vectors to do this                               *
145 //***************************************************************************
146 //初始化聚类中心
147 void System::InitClusters(){
148     int i,j;
149     printf("Initial cluster centers:\n");
150     for (i=0; i<NumClusters; i++) {
151        Cluster[i].Member[0]=i;
152        for (j=0; j<SizeVector; j++) {
153           Cluster[i].Center[j]=Pattern[i][j];
154           } /* endfor */
155        } /* endfor */
156     for (i=0; i<NumClusters; i++) {
157         printf("ClusterCenter[%d]=(%f,%f)\n",i,Cluster[i].Center[0],Cluster[i].Center[1]);                //untransplant
158        } /* endfor */
159     printf("\n");
160     getchar();
161 }
162 //运行KMeans
163 void System::RunKMeans(){
164       int converged;
165       int pass;
166     pass=1;
167     converged=FALSE;
168     //第N次聚类
169     while (converged==FALSE) {
170        printf("PASS=%d\n",pass++);
171        DistributeSamples();
172        converged=CalcNewClustCenters();
173        ShowCenters();
174        getchar();
175        } /* endwhile */
176 }
177 //在二维和三维空间中的欧式距离的就是两点之间的距离,二维的公式是
178 //d = sqrt((x1-x2)^+(y1-y2)^)
179 //通过这种运算,就可以把所有列的属性都纳入进来
180 double System::EucNorm(int p, int c){        // Calc Euclidean norm of vector difference
181     double dist,x;                          // between pattern vector, p, and cluster
182     int i;                                  // center, c.
183     char zout[128];
184     char znum[40];
185     char *pnum;
186     //
187     pnum=&znum[0];
188     strcpy(zout,"d=sqrt(");
189     printf("The distance from pattern %d to cluster %d is calculated as:\n",p,c);
190     dist=0;
191     for (i=0; i<SizeVector ;i++){
192        //拼写字符串
193        x=(Cluster[c].Center[i]-Pattern[p][i])*(Cluster[c].Center[i]-Pattern[p][i]);
194        strcat(zout,f2a(x,4));
195        if (i==0)
196           strcat(zout,"+");
197         //计算距离
198        dist += (Cluster[c].Center[i]-Pattern[p][i])*(Cluster[c].Center[i]-Pattern[p][i]);
199        } /* endfor */
200     printf("%s)\n",zout);
201     return dist;
202 }
203 //查找最近的群集
204 int System::FindClosestCluster(int pat){
205    int i, ClustID;
206    double MinDist, d;
207     MinDist =9.9e+99;
208     ClustID=-1;
209     for (i=0; i<NumClusters; i++) {
210        d=EucNorm(pat,i);
211        printf("Distance from pattern %d to cluster %d is %f\n\n",pat,i,sqrt(d));
212        if (d<MinDist) {
213           MinDist=d;
214           ClustID=i;
215           } /* endif */
216        } /* endfor */
217     if (ClustID<0) {
218        printf("Aaargh");
219        exit(0);
220        } /* endif */
221     return ClustID;
222 }
223 //
224 void System::DistributeSamples(){
225     int i,pat,Clustid,MemberIndex;
226     //Clear membership list for all current clusters
227     for (i=0; i<NumClusters;i++){
228        Cluster[i].NumMembers=0;
229        }
230     for (pat=0; pat<NumPatterns; pat++) {
231        //Find cluster center to which the pattern is closest
232        Clustid= FindClosestCluster(pat);//查找最近的聚类中心
233        printf("patern %d assigned to cluster %d\n\n",pat,Clustid);
234        //post this pattern to the cluster
235        MemberIndex=Cluster[Clustid].NumMembers;
236        Cluster[Clustid].Member[MemberIndex]=pat;
237        Cluster[Clustid].NumMembers++;
238        } /* endfor */
239 }
240 //计算新的群集中心
241 int  System::CalcNewClustCenters(){
242    int ConvFlag,VectID,i,j,k;
243    double tmp[MAXVECTDIM];
244    char xs[255];
245    char ys[255];
246    char nc1[20];
247    char nc2[20];
248    char *pnc1;
249    char *pnc2;
250    char *fpv;
251
252     pnc1=&nc1[0];
253     pnc2=&nc2[0];
254     ConvFlag=TRUE;
255     printf("The new cluster centers are now calculated as:\n");
256     for (i=0; i<NumClusters; i++) {              //for each cluster
257        pnc1=itoa(Cluster[i].NumMembers,nc1,10);
258        pnc2=itoa(i,nc2,10);
259        strcpy(xs,"Cluster Center");
260        strcat(xs,nc2);
261        strcat(xs,"(1/");
262        strcpy(ys,"(1/");
263        strcat(xs,nc1);
264        strcat(ys,nc1);
265        strcat(xs,")(");
266        strcat(ys,")(");
267        for (j=0; j<SizeVector; j++) {            // clear workspace
268           tmp[j]=0.0;
269           } /* endfor */
270        for (j=0; j<Cluster[i].NumMembers; j++) { //traverse member vectors
271           VectID=Cluster[i].Member[j];
272           for (k=0; k<SizeVector; k++) {         //traverse elements of vector
273                  tmp[k] += Pattern[VectID][k];       // add (member) pattern elmnt into temp
274                  if (k==0) {
275                       strcat(xs,f2a(Pattern[VectID][k],3));
276                     } else {
277                       strcat(ys,f2a(Pattern[VectID][k],3));
278                       } /* endif */
279             } /* endfor */
280           if(j<Cluster[i].NumMembers-1){
281              strcat(xs,"+");
282              strcat(ys,"+");
283              }
284             else {
285              strcat(xs,")");
286              strcat(ys,")");
287              }
288           } /* endfor */
289        for (k=0; k<SizeVector; k++) {            //traverse elements of vector
290           tmp[k]=tmp[k]/Cluster[i].NumMembers;
291           if (tmp[k] != Cluster[i].Center[k])
292              ConvFlag=FALSE;
293           Cluster[i].Center[k]=tmp[k];
294           } /* endfor */
295        printf("%s,\n",xs);
296        printf("%s\n",ys);
297        } /* endfor */
298     return ConvFlag;
299 }
300 //输出聚类
301 void System::ShowClusters(){
302    int cl;
303     for (cl=0; cl<NumClusters; cl++) {
304        printf("\nCLUSTER %d ==>[%f,%f]\n", cl,Cluster[cl].Center[0],Cluster[cl].Center[1]);
305        } /* endfor */
306 }
307
308 void System::SaveClusters(char *fname){
309 }


四、主调程序

1 void main(int argc, char *argv[])
2 {
3
4    System kmeans;
5    /*
6     if (argc<2) {
7        printf("USAGE: KMEANS PATTERN_FILE\n");
8        exit(0);
9        }*/
10     if (kmeans.LoadPatterns("KM2.DAT")==FAILURE ){
11        printf("UNABLE TO READ PATTERN_FILE:%s\n",argv[1]);
12        exit(0);
13         }
14
15     kmeans.InitClusters();
16     kmeans.RunKMeans();
17     kmeans.ShowClusters();
18 }


五、输出结果

1 Input patterns:
2 Pattern[0]=(0.000,0.000)
3 Pattern[1]=(1.000,0.000)
4 Pattern[2]=(0.000,1.000)
5 Pattern[3]=(2.000,1.000)
6 Pattern[4]=(1.000,2.000)
7 Pattern[5]=(2.000,2.000)
8 Pattern[6]=(2.000,0.000)
9 Pattern[7]=(0.000,2.000)
10 Pattern[8]=(7.000,6.000)
11 Pattern[9]=(7.000,7.000)
12 Pattern[10]=(7.000,8.000)
13 Pattern[11]=(8.000,6.000)
14 Pattern[12]=(8.000,7.000)
15 Pattern[13]=(8.000,8.000)
16 Pattern[14]=(8.000,9.000)
17 Pattern[15]=(9.000,7.000)
18 Pattern[16]=(9.000,8.000)
19 Pattern[17]=(9.000,9.000)
20
21 --------------------
22
23 Initial cluster centers:
24 ClusterCenter[0]=(0.000000,0.000000)
25 ClusterCenter[1]=(1.000000,0.000000)
26
27
28 PASS=1
29 The distance from pattern 0 to cluster 0 is calculated as:
30 d=sqrt( .0000+ .0000)
31 Distance from pattern 0 to cluster 0 is 0.000000
32
33 The distance from pattern 0 to cluster 1 is calculated as:
34 d=sqrt( 1.0000+ .0000)
35 Distance from pattern 0 to cluster 1 is 1.000000
36
37 patern 0 assigned to cluster 0
38
39 The distance from pattern 1 to cluster 0 is calculated as:
40 d=sqrt( 1.0000+ .0000)
41 Distance from pattern 1 to cluster 0 is 1.000000
42
43 The distance from pattern 1 to cluster 1 is calculated as:
44 d=sqrt( .0000+ .0000)
45 Distance from pattern 1 to cluster 1 is 0.000000
46
47 patern 1 assigned to cluster 1
48
49 The distance from pattern 2 to cluster 0 is calculated as:
50 d=sqrt( .0000+ 1.0000)
51 Distance from pattern 2 to cluster 0 is 1.000000
52
53 The distance from pattern 2 to cluster 1 is calculated as:
54 d=sqrt( 1.0000+ 1.0000)
55 Distance from pattern 2 to cluster 1 is 1.414214
56
57 patern 2 assigned to cluster 0
58
59 The distance from pattern 3 to cluster 0 is calculated as:
60 d=sqrt( 4.0000+ 1.0000)
61 Distance from pattern 3 to cluster 0 is 2.236068
62
63 The distance from pattern 3 to cluster 1 is calculated as:
64 d=sqrt( 1.0000+ 1.0000)
65 Distance from pattern 3 to cluster 1 is 1.414214
66
67 patern 3 assigned to cluster 1
68
69 The distance from pattern 4 to cluster 0 is calculated as:
70 d=sqrt( 1.0000+ 4.0000)
71 Distance from pattern 4 to cluster 0 is 2.236068
72
73 The distance from pattern 4 to cluster 1 is calculated as:
74 d=sqrt( .0000+ 4.0000)
75 Distance from pattern 4 to cluster 1 is 2.000000
76
77 patern 4 assigned to cluster 1
78
79 The distance from pattern 5 to cluster 0 is calculated as:
80 d=sqrt( 4.0000+ 4.0000)
81 Distance from pattern 5 to cluster 0 is 2.828427
82
83 The distance from pattern 5 to cluster 1 is calculated as:
84 d=sqrt( 1.0000+ 4.0000)
85 Distance from pattern 5 to cluster 1 is 2.236068
86
87 patern 5 assigned to cluster 1
88
89 The distance from pattern 6 to cluster 0 is calculated as:
90 d=sqrt( 4.0000+ .0000)
91 Distance from pattern 6 to cluster 0 is 2.000000
92
93 The distance from pattern 6 to cluster 1 is calculated as:
94 d=sqrt( 1.0000+ .0000)
95 Distance from pattern 6 to cluster 1 is 1.000000
96
97 patern 6 assigned to cluster 1
98
99 The distance from pattern 7 to cluster 0 is calculated as:
100 d=sqrt( .0000+ 4.0000)
101 Distance from pattern 7 to cluster 0 is 2.000000
102
103 The distance from pattern 7 to cluster 1 is calculated as:
104 d=sqrt( 1.0000+ 4.0000)
105 Distance from pattern 7 to cluster 1 is 2.236068
106
107 patern 7 assigned to cluster 0
108
109 The distance from pattern 8 to cluster 0 is calculated as:
110 d=sqrt( 49.0000+ 36.0000)
111 Distance from pattern 8 to cluster 0 is 9.219544
112
113 The distance from pattern 8 to cluster 1 is calculated as:
114 d=sqrt( 36.0000+ 36.0000)
115 Distance from pattern 8 to cluster 1 is 8.485281
116
117 patern 8 assigned to cluster 1
118
119 The distance from pattern 9 to cluster 0 is calculated as:
120 d=sqrt( 49.0000+ 49.0000)
121 Distance from pattern 9 to cluster 0 is 9.899495
122
123 The distance from pattern 9 to cluster 1 is calculated as:
124 d=sqrt( 36.0000+ 49.0000)
125 Distance from pattern 9 to cluster 1 is 9.219544
126
127 patern 9 assigned to cluster 1
128
129 The distance from pattern 10 to cluster 0 is calculated as:
130 d=sqrt( 49.0000+ 64.0000)
131 Distance from pattern 10 to cluster 0 is 10.630146
132
133 The distance from pattern 10 to cluster 1 is calculated as:
134 d=sqrt( 36.0000+ 64.0000)
135 Distance from pattern 10 to cluster 1 is 10.000000
136
137 patern 10 assigned to cluster 1
138
139 The distance from pattern 11 to cluster 0 is calculated as:
140 d=sqrt( 64.0000+ 36.0000)
141 Distance from pattern 11 to cluster 0 is 10.000000
142
143 The distance from pattern 11 to cluster 1 is calculated as:
144 d=sqrt( 49.0000+ 36.0000)
145 Distance from pattern 11 to cluster 1 is 9.219544
146
147 patern 11 assigned to cluster 1
148
149 The distance from pattern 12 to cluster 0 is calculated as:
150 d=sqrt( 64.0000+ 49.0000)
151 Distance from pattern 12 to cluster 0 is 10.630146
152
153 The distance from pattern 12 to cluster 1 is calculated as:
154 d=sqrt( 49.0000+ 49.0000)
155 Distance from pattern 12 to cluster 1 is 9.899495
156
157 patern 12 assigned to cluster 1
158
159 The distance from pattern 13 to cluster 0 is calculated as:
160 d=sqrt( 64.0000+ 64.0000)
161 Distance from pattern 13 to cluster 0 is 11.313708
162
163 The distance from pattern 13 to cluster 1 is calculated as:
164 d=sqrt( 49.0000+ 64.0000)
165 Distance from pattern 13 to cluster 1 is 10.630146
166
167 patern 13 assigned to cluster 1
168
169 The distance from pattern 14 to cluster 0 is calculated as:
170 d=sqrt( 64.0000+ 81.0000)
171 Distance from pattern 14 to cluster 0 is 12.041595
172
173 The distance from pattern 14 to cluster 1 is calculated as:
174 d=sqrt( 49.0000+ 81.0000)
175 Distance from pattern 14 to cluster 1 is 11.401754
176
177 patern 14 assigned to cluster 1
178
179 The distance from pattern 15 to cluster 0 is calculated as:
180 d=sqrt( 81.0000+ 49.0000)
181 Distance from pattern 15 to cluster 0 is 11.401754
182
183 The distance from pattern 15 to cluster 1 is calculated as:
184 d=sqrt( 64.0000+ 49.0000)
185 Distance from pattern 15 to cluster 1 is 10.630146
186
187 patern 15 assigned to cluster 1
188
189 The distance from pattern 16 to cluster 0 is calculated as:
190 d=sqrt( 81.0000+ 64.0000)
191 Distance from pattern 16 to cluster 0 is 12.041595
192
193 The distance from pattern 16 to cluster 1 is calculated as:
194 d=sqrt( 64.0000+ 64.0000)
195 Distance from pattern 16 to cluster 1 is 11.313708
196
197 patern 16 assigned to cluster 1
198
199 The distance from pattern 17 to cluster 0 is calculated as:
200 d=sqrt( 81.0000+ 81.0000)
201 Distance from pattern 17 to cluster 0 is 12.727922
202
203 The distance from pattern 17 to cluster 1 is calculated as:
204 d=sqrt( 64.0000+ 81.0000)
205 Distance from pattern 17 to cluster 1 is 12.041595
206
207 patern 17 assigned to cluster 1
208
209 The new cluster centers are now calculated as:
210 Cluster Center0(1/3)( .000+ .000+ .000),
211 (1/3)( .000+ 1.000+ 2.000)
212 Cluster Center1(1/15)( 1.000+ 2.000+ 1.000+ 2.000+ 2.000+ 7.000+ 7.000+ 7.000+ 8
213 .000+ 8.000+ 8.000+ 8.000+ 9.000+ 9.000+ 9.000),
214 (1/15)( .000+ 1.000+ 2.000+ 2.000+ .000+ 6.000+ 7.000+ 8.000+ 6.000+ 7.000+ 8.00
215 0+ 9.000+ 7.000+ 8.000+ 9.000)
216 Cluster centers:
217 ClusterCenter[0]=(0.000000,1.000000)
218 ClusterCenter[1]=(5.866667,5.333333)


内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: