模式识别c均值算法的实现(C++实现)
一、实验目的:
通过c均值算法的具体实现,掌握算法的思想,理解动态聚类方法
二、实验原理:
看教程吧!
三、实验内容:
写程序实现c均值算法,并用表中的三维数据进行测试,下面给出了每种测试的类别数目和初始值。
的结果与(3)中的结果进行比较,并解释差别,包括迭代次数的差别。
样本(3个维X20个样本)
1
-7.82
-4.58
-3.97
2
-6.68
3.16
2.71
3
4.36
-2.19
2.09
4
6.72
0.88
2.80
5
-8.64
3.06
3.50
6
-6.87
0.57
-5.45
7
4.47
-2.62
5.76
8
6.73
-2.01
4.18
9
-7.71
2.34
-6.33
10
-6.91
-0.49
-5.68
11
6.18
2.81
5.82
12
6.72
-0.93
-4.04
13
-6.25
-0.26
0.56
14
-6.94
-1.22
1.13
15
8.09
0.20
2.25
16
6.18
0.17
-4.15
17
-5.19
4.24
4.04
18
-6.38
-1.74
1.43
19
4.08
1.30
5.33
20
6.27
0.93
-2.78
四、实验代码:
头函数:
#pragma once
#include <list>
#include <vector>
#include <iostream.h>
using namespace std;
#define DATANUM 20
#define MAXDIST 333333
struct CData
{
double x1;
double x2;
double x3;
};
class CCMean
{
public:
CCMean(CData *pdata);
CCMean(CData *pdata,CData *pmean);
~CCMean(void);
void init();
void work(int InitClassNum);
private:
void CalcuMean( int i );//计算第i类的均值
void CalcuJc(int i);//计算第i类的误差
void CalcuJe();//计算总误差
void InitDeploy();//初始化分类
//将第i类样本移动到第k类中,如果返回true这,总误差变小,否则不移动
bool MoveItoK( const CData& da, int i, int &k );
//计算某个样本与某类重心的平方差
double dist( const CData& mean, const CData& da);
void OutPut();//打印输出结果
int iClassNum;//定义类别数
CData *pData;//指针指向样本数据地址
CData *pMean;//指针指向初始化分类重心数据地址
//存储分类样本重心,最多的分类为总样本个数
CData mean[DATANUM];
double jc[DATANUM];//各样本的误差君方根
double je; //总误差
// 指针指向已经分类的样本地址
list< CData >* pcla[DATANUM];
};
实现函数:
#include "assert.h"
#include "cmean.h"
CCMean::CCMean(CData *pdata)
{
pData = pdata;
for(int i = 0; i < DATANUM; i ++ )
{
pcla[i] = new list< CData >;
assert( pcla[i] != 0 );
}
je = 0.0;
}
CCMean::CCMean(CData *pdata,CData *pmean)
{
pData = pdata;
pMean = pmean;
for(int i = 0; i < DATANUM; i ++ )
{
pcla[i] = new list< CData >;
assert( pcla[i] != 0 );
}
je = 0.0;
}
CCMean::~CCMean()
{
for(int i = 0; i < DATANUM; i ++ )
delete pcla[i];
}
void CCMean::init()
{
for(int i = 0; i < DATANUM; i ++ )
{
pcla[i]->clear();
mean[i].x1 = 0.0;
mean[i].x2 = 0.0;
mean[i].x3 = 0.0;
}
je = 0.0;
}
void CCMean::CalcuMean(int ii)
{
double sum1 = 0.0, sum2 = 0.0,sum3 = 0.0;
int si = (int)pcla[ii]->size();
list< CData >::iterator iter = pcla[ii]->begin();
for(int i = 0; i < si; i ++ )
{
sum1 += iter->x1;
sum2 += iter->x2;
sum3 += iter->x3;
iter++;
}
mean[ii].x1 = (double)sum1 / si;
mean[ii].x2 = (double)sum2 / si;
mean[ii].x3 = (double)sum3 / si;
}
void CCMean::CalcuJe()
{
for( int i = 0; i < iClassNum ; i ++ )
{
CalcuJc( i );
je += jc[i];
}
}
void CCMean::CalcuJc( int index )
{
list< CData >::iterator iter = pcla[index]->begin();
int si = (int)pcla[index]->size();
jc[index] = 0;
for( int i = 0; i < si; i ++)
{
jc[index] += dist( mean[index], *iter );
iter ++;
}
}
double CCMean::dist(const CData& mean, const CData& da)
{
return (mean.x1 - da.x1)*(mean.x1 - da.x1) + (mean.x2 - da.x2)*(mean.x2 - da.x2) + (mean.x3 - da.x3)*(mean.x3 - da.x3);
}
void CCMean::InitDeploy()
{
CData *pmean = pMean;
for( int ii = 0; ii < iClassNum; ii ++ )
{
mean[ii] = *pmean;//初始化类别重心数组
pmean++;
}
// //如果是没有给定初始化的分类重心,可以加上下面的这段代码
// CData *ptem = pData;
// for( int i = 0; i < iClassNum; i ++ )
// {
// mean[i] = *ptem;
// pcla[i]->push_back( *ptem );
// ptem++;
// }
for( int i = 0; i < DATANUM; i ++ )
{
double mindis = MAXDIST;
int pos = 0;
//计算各样本与各类别重心的的最短距离
for( int j = 0; j < iClassNum; j ++ )
{
double curdis = dist( pData[i], mean[j] );
if( curdis < mindis )
{
mindis = curdis;
pos = j;
}
}
// 将pData加入到pos类别中
pcla[pos]->push_back( pData[i] );
}
for( int j = 0; j < iClassNum ; j ++ )
CalcuMean( j );
CalcuJe();
}
bool CCMean::MoveItoK( const CData &da, int i , int& k )
{
double Pk = MAXDIST;
double Pj = 0.0;
int temk = 0;
for( int j = 0; j < iClassNum; j ++ )
{
int si = (int)pcla[j]->size();
if( j == i )
Pj = dist( mean[j], da ) * si/(si - 1);
else
Pj = dist( mean[j], da ) * si/(si + 1);
if( Pj < Pk )
{
Pk = Pj;
temk = j;
}
else if ( Pj == Pk && j == i )
{
// 当 Pj == Pk && j == i, 移动
temk = i;
}
}
if( i == temk )
return false; // 不移动
k = temk;
// 将da加到第k类
pcla[k]->push_back( da );
//从第i类中删除da,但是首先从链表中找到他的位置
list< CData >::iterator iter = pcla[i]->begin();
while( iter != pcla[i]->end() )
{
if( iter->x1 == da.x1 && iter->x2 == da.x2 && iter->x3 == da.x3 )
break;
iter++;
}
pcla[i]->erase( iter );
return true;
}
void CCMean::OutPut()
{
for( int i = 0; i < iClassNum ; i ++ )
{
cout<<"类别: "<<i<<endl;
cout<<"重心点为: ("<<mean[i].x1<<","<<mean[i].x2<<","<<mean[i].x3<<")"<<endl;
list< CData >::iterator iter = pcla[i]->begin();
int j = 1;
while( iter != pcla[i]->end() )
{
cout<<"("<<iter->x1<<","<<iter->x2<<","<<iter->x3<<") "<<endl;
iter ++;
if( j++ % 5 == 0)
cout<<endl;
}
cout<<endl;
}
}
void CCMean::work(int InitClassNum)
{
iClassNum = InitClassNum;
InitDeploy();
int count = 0;//计算迭代次数
int counter = 0;//用来判断迭代是否停止
Again:
for( int i = 0; i < iClassNum ; i ++ )
{
int si = (int)pcla[i]->size();
if( si == 1 )
continue;
list< CData >::iterator iter = pcla[i]->begin();
for(int j = 0; j < (int)pcla[i]->size(); j++)
{
int k = 0;
CData da = *iter;
iter ++;
if( MoveItoK( da , i, k ) == true )
{
double OldJe = je;
je -= jc[i];
je -= jc[k];
CalcuMean( i );
CalcuMean( k );
CalcuJc( i );
CalcuJc( k );
je += jc[i];
je += jc[k];
if( OldJe > je )
{
count++;
counter = 0;
goto Again;
}
}
counter++;
if( counter == DATANUM )
goto end;
}
}
end:
cout<<" 最后总误差 Je 为: "<<je<<endl;
cout<<"迭代次数是:"<<count<<endl;
OutPut();
}
测试主函数:
#include "cmean.h"
#include "process.h"
CData yy[DATANUM] =
{
{-7.82,-4.58,-3.97},{-6.68,3.16,2.71},{4.36,-2.19,2.09},{6.72,0.88,2.80},{-8.64,3.06,3.50},
{-6.87,0.57,-5.45},{4.47,-2.62,5.76},{6.73,-2.01,4.18},{-7.71,2.34,-6.33},{-6.91,-0.49,-5.68},
{6.18,2.81,5.82},{6.72,-0.93,-4.04},{-6.25,-0.26,0.56},{-6.94,-1.22,1.13},{8.09,0.20,2.25},
{6.18,0.17,-4.15},{-5.19,4.24,4.04},{-6.38,-1.74,1.43},{4.08,1.30,5.33},{6.27,0.93,-2.78}
};
CData m11[2] = {{1,1,1},{-1,1,-1}};
CData m12[2] = {{0,0,0},{1,1,-1}};
CData m21[3] = {{0,0,0},{-1,1,-1},{-1,0,2}};
CData m22[3] = {{-0.1,0,0.1},{0,-0.1,0.1},{-0.1,-0.1,0.1}};
CData m31[4] = {{-0.1,0,0.1},{0,-0.1,0.1},{-0.1,-0.1,0.1},{0.2,1,0}};
int main(int argc, char* argv[])
{
cout<<"测试1:"<<endl;
CCMean cmean11( yy,m11 );
cmean11.work(2);
cout<<"测试2:"<<endl;
CCMean cmean12( yy,m12 );
cmean12.work(2);
cout<<"测试3:"<<endl;
CCMean cmean21( yy,m21 );
cmean21.work(3);
cout<<"测试4:"<<endl;
CCMean cmean22( yy,m22 );
cmean22.work(3);
system("pause");
return 0;
}
五、实验结果:
测试1:
最后总误差 Je 为: 409.498
迭代次数是:0
类别: 0
重心点为: (5.98,-0.146,1.726)
(4.36,-2.19,2.09)
(6.72,0.88,2. 
(4.47,-2.62,5.76)
(6.73,-2.01,4.1 
(6.18,2.81,5.82)
(6.72,-0.93,-4.04)
(8.09,0.2,2.25)
(6.18,0.17,-4.15)
(4.08,1.3,5.33)
(6.27,0.93,-2.7 
类别: 1
重心点为: (-6.939,0.508,-0.806)
(-7.82,-4.58,-3.97)
(-6.68,3.16,2.71)
(-8.64,3.06,3.5)
(-6.87,0.57,-5.45)
(-7.71,2.34,-6.33)
(-6.91,-0.49,-5.6 
(-6.25,-0.26,0.56)
(-6.94,-1.22,1.13)
(-5.19,4.24,4.04)
(-6.38,-1.74,1.43)
测试2:
最后总误差 Je 为: 409.498
迭代次数是:4
类别: 0
重心点为: (-6.939,0.508,-0.806)
(-7.82,-4.58,-3.97)
(-6.68,3.16,2.71)
(-8.64,3.06,3.5)
(-6.87,0.57,-5.45)
(-7.71,2.34,-6.33)
(-6.91,-0.49,-5.6 
(-6.25,-0.26,0.56)
(-6.94,-1.22,1.13)
(-5.19,4.24,4.04)
(-6.38,-1.74,1.43)
类别: 1
重心点为: (5.98,-0.146,1.726)
(6.72,0.88,2. 
(6.18,2.81,5.82)
(6.72,-0.93,-4.04)
(8.09,0.2,2.25)
(6.18,0.17,-4.15)
(6.27,0.93,-2.7 
(4.36,-2.19,2.09)
(4.47,-2.62,5.76)
(6.73,-2.01,4.1 
(4.08,1.3,5.33)
测试3:
最后总误差 Je 为: 263.062
迭代次数是:3
类别: 0
重心点为: (5.98,-0.146,1.726)
(4.36,-2.19,2.09)
(6.72,0.88,2. 
(6.73,-2.01,4.1 
(6.72,-0.93,-4.04)
(8.09,0.2,2.25)
(6.18,0.17,-4.15)
(6.27,0.93,-2.7 
(4.47,-2.62,5.76)
(6.18,2.81,5.82)
(4.08,1.3,5.33)
类别: 1
重心点为: (-7.3275,-0.54,-5.3575)
(-7.82,-4.58,-3.97)
(-6.87,0.57,-5.45)
(-7.71,2.34,-6.33)
(-6.91,-0.49,-5.6 
类别: 2
重心点为: (-6.68,1.20667,2.22833)
(-6.68,3.16,2.71)
(-8.64,3.06,3.5)
(-6.25,-0.26,0.56)
(-6.94,-1.22,1.13)
(-5.19,4.24,4.04)
(-6.38,-1.74,1.43)
测试4:
最后总误差 Je 为: 295.01
迭代次数是:2
类别: 0
重心点为: (-6.83667,3.48667,3.41667)
(-6.68,3.16,2.71)
(-8.64,3.06,3.5)
(-5.19,4.24,4.04)
类别: 1
重心点为: (5.98,-0.146,1.726)
(4.36,-2.19,2.09)
(6.72,0.88,2. 
(4.47,-2.62,5.76)
(6.73,-2.01,4.1 
(6.18,2.81,5.82)
(6.72,-0.93,-4.04)
(8.09,0.2,2.25)
(6.18,0.17,-4.15)
(4.08,1.3,5.33)
(6.27,0.93,-2.7 
类别: 2
重心点为: (-6.98286,-0.768571,-2.61571)
(-7.82,-4.58,-3.97)
(-6.91,-0.49,-5.6 
(-6.25,-0.26,0.56)
(-6.94,-1.22,1.13)
(-6.38,-1.74,1.43)
(-6.87,0.57,-5.45)
(-7.71,2.34,-6.33)
请按任意键继续. . .
结果分析:初始的类别重心选择越与实际的的重心接近,则迭代次数越少,划分的类别越多则迭代的次数也越少,当分成N类,则迭代次数是零。