clustermanager.cpp

Go to the documentation of this file.
00001 //----------------------------------------------------------------------
00002 //      File:           clustermanager.cpp
00003 //      Description:    k-means Clustering Algorithm
00004 //----------------------------------------------------------------------
00005 
00006 //-----------k-means Clustering Algorithm--------------------
00007 /* input : database D of m records r(1) to r(m) and a desired number of clusters k
00008 
00009    output : Set of k clusters (i.e. their centers) with minimum squared error
00010      begin:
00011         1. Randomly choose k records as centroids for k clusters
00012         2. repeat
00013             2.1 Assign each record r(i), to a cluster such that the distance between r(i) and the cluster centroid( mean) is the smallest among the k clusters;
00014             2.2 Recalculate the centroid (mean) for each cluster based on the records assigned to the cluster;  until no change.  end
00015 
00016 
00017 Distance(r(j),r(k)) = sqrt(sqr(|r(j1)-r(k1)|)+
00018                 sqr(|r(j2)-r(k2)|)+
00019                 sqr(|r(j3)-r(k3)|)+....+
00020                 sqr(|r(jn)-r(kn)|))
00021 */
00022 
00023 using namespace std;   // so as to make std available
00024 
00025 //include files
00026 
#include <iostream>   // for enabling input output streams
#include <unistd.h>   // for file IO
#include <stdio.h>    // for standard IO
#include <cstring>    // for string functionality
#include <cstdlib>    // C standard library routines
#include <fcntl.h>    // manipulate the file descriptor
#include <cmath>      // math routines
#include <ctime>      // time(), used to seed the random number generator
#include <string>     // string manipulations
#include <vector>     // vector manipulations
#include <iterator>   // using iterators for STL support
00037 
00038 #ifndef FRAMEMFCCTABLEINCLUDED
00039 #include "framemfcctable.cpp"
00040 #endif
00041 
00042 #ifndef REPOSITORYMANAGERINCLUDED
00043 #include "repositorymanager.cpp"
00044 #endif
00045 
00046 #ifndef FRAMEINCLUDED
00047 #include "frame.cpp"
00048 #endif
00049 
00050 #ifndef PARAMETERSINCLUDED
00051 #include "parameters.cpp"
00052 #endif
00053 
00054 #define CLUSTERMANAGERINCLUDED
00055 
00056 /* The class clustermanager contains all the data structures needed to store the data points and the cluster centroids and also the functionality that implements the k-means clustering algorithm*/
00057 
00058 class clustermanager
00059 {
00060         // data members
00061 private:
00062         long int current;
00063         vector<double> dist;                      // distance of each cluster center from the current data point
00064         vector<double> centroid;                  //stores the MFCC parameters of a particular cluster center.
00065         vector<vector<double> > cluster_centers;  // centers of the clusters
00066         vector<unsigned long int> indices;        //indices of frames (in mfcc table) to be added to the repository
00067         vector<int> count;                        // count of members in each cluster currently
00068 
00069 public :
00070 
00071   framemfcctable fmtbl; //avoiding wrapper functions
00072 
00073   // the class functionality
00074 
00075   clustermanager() // constuctor for the clustermanager class
00076   {
00077    int i;
00078     for(i=0;i<k;i++)
00079     {
00080        dist.push_back(0); // initialising the distance array
00081        count.push_back(0);    
00082     }
00083        current=0;
00084   }
00085 
00086 
00087   void showcenters()
00088   {
00089     int i,j;
00090     cout<<"\n Cluster centers are ";
00091     for(i=0;i<k;i++)
00092     {
00093       for(j=0;j<MAX_DIM;j++)
00094       {
00095         cout<< "\t" << cluster_centers[i][j];
00096       }
00097     cout<< "\n";
00098     }
00099   }
00100 
00101 
00102 
00103 // This function 'initialize()' selects the first k data points as the random cluster centroids.
00104 
00105   int initcentroids(int iter)
00106   {
00107     unsigned int i,j;
00108     vector<double> temp;
00109         int status=SUCCESS;
00110     //    vector<int>::iterator  i1=count.begin();
00111     srand(i);
00112     for(i=0;i<k;i++) 
00113     {
00114       if(!iter)
00115         {
00116  j=(int) ((float)fmtbl.nFrames()*rand()/(RAND_MAX+1.0));
00117            temp=fmtbl.getFrameMFCC(j,&status);
00118         cluster_centers.push_back(temp);
00119         }
00120 count[i]=0;//(iter==0)?1:0;
00121 
00122     }
00123     current=0;//(iter==0)?k:0;
00124     return SUCCESS;
00125   }
00126 
00127   int start()
00128   {
00129     int iter;
00130     for(iter=0;iter<NO_OF_ITER;iter++)
00131     {
00132       initcentroids(iter);
00133       while(distance())
00134       {
00135         recalculate1(minimum());
00136         current++; 
00137       }
00138     }
00139     return SUCCESS;
00140   }
00141 
00142   // The function 'distance()' calculates the distance between the current data point and the k cluster centroids.
00143   //m is the index of the data point in the input file
00144 
00145   int  distance()
00146   {
00147     int j,i;
00148     int status=SUCCESS;
00149     double temp=0.0;
00150     vector<double> cur_mfcc; 
00151     cur_mfcc=fmtbl.getFrameMFCC(current,&status);
00152     if(!status)
00153       return FAILURE;
00154     for(i=0;i<k;i++)
00155     {
00156       temp=0;
00157       for(j=0;j<MAX_DIM;j++)
00158       {
00159         temp+=(cur_mfcc[j]-cluster_centers[i][j])*(cur_mfcc[j]-cluster_centers[i][j]);
00160       }
00161       temp=sqrt(temp);
00162       dist[i]=temp;
00163     }
00164     return SUCCESS;
00165   }
00166 
00167 
00168 
00169   int  distance(vector<double> mfcc)
00170   {
00171     int j,i;
00172     int status=SUCCESS;
00173     double temp=0.0; 
00174     for(i=0;i<k;i++)
00175     {
00176       temp=0;
00177       for(j=0;j<MAX_DIM;j++)
00178       {
00179         temp+=(mfcc[j]-cluster_centers[i][j])*(mfcc[j]-cluster_centers[i][j]);
00180       }
00181       temp=sqrt(temp);
00182       dist[i]=temp;
00183     }
00184     return SUCCESS;
00185   }
00186 
00187 
00188 
00189   int minimum()
00190   {
00191     int i,pos=0;
00192     double least=dist[0];
00193     for(i=0;i<k;i++)
00194       if(dist[i]<least)
00195       {
00196         least=dist[i];
00197         pos=i;
00198       }
00199     return pos;
00200   }
00201 
00202   int recalculate1(int min)
00203   {
00204     int i;
00205     int status=SUCCESS;
00206     vector<double> cur_mfcc; 
00207     cur_mfcc=fmtbl.getFrameMFCC(current,&status);
00208     if(!status)
00209       return FAILURE;
00210     count[min]++;
00211     for(i=0;i<MAX_DIM;i++)
00212     {
00213     cluster_centers[min][i]=(cluster_centers[min][i]*(count[min]-1)+cur_mfcc[i])/(float)count[min];
00214     }
00215     return SUCCESS;
00216   }
00217 
00218   vector<unsigned long int> getIndices(void)
00219   {
00220     unsigned long int pos=0;
00221     vector<double> cur_mfcc;
00222     int status=SUCCESS;
00223     double temp=0,oldtemp;
00224     int i;
00225     frame  frptr;
00226     string framename;
00227     for(i=0;i<k;i++)
00228     {
00229       pos=0;
00230       oldtemp=VERY_HIGH_VALUE;
00231       indices.push_back(i);
00232       cur_mfcc= fmtbl.getFrameMFCC(pos,&status);
00233       while(status)
00234       {
00235         temp=0; 
00236         for(int j=0;j<MAX_DIM;j++)
00237         {
00238           temp+=(cur_mfcc[j]-cluster_centers[i][j])*(cur_mfcc[j]-cluster_centers[i][j]);
00239         }
00240         temp=sqrt(temp);
00241         if(temp<oldtemp)
00242         {
00243           indices[i]=pos;
00244           oldtemp=temp;
00245         }
00246         pos++;
00247         cur_mfcc= fmtbl.getFrameMFCC(pos,&status);
00248       }
00249     }
00250     return indices;
00251   }
00252     //indices is now filled with the indices of the framemfcctable for the frames to be made permanent in the repository.
00253 
00254   vector<vector<double> >  getcentroids(void)
00255   {
00256      return cluster_centers; //theoretical value of cluster centers
00257   }
00258 
00259   //gets the cluster centers from the codebook which is being managed by repositorymanager
00260   int getallclustercenters(string email)
00261   { 
00262     repositorymanager repmgr(email);
00263     for(unsigned int i=0;i<k;i++)
00264     {
00265       cluster_centers.push_back(repmgr.getClusterCenter(i));
00266     }
00267     return SUCCESS;
00268   }
00269 
00270   unsigned int compare(vector<double> mfcc)
00271   {
00272     distance(mfcc);
00273     return  minimum();
00274   }
00275 
00276 
00277 };

Best viewed on Get Firefox!. Generated on Mon Mar 28 22:09:09 2005 for VoX by  doxygen 1.4.2 . This project hosted by SourceForge.net Logo .