SHOGUN  6.0.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules
KMeans.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2014 Parijat Mazumdar
8  * Written (W) 2016 Saurabh Mahindre
9  */
10 
16 #include <shogun/io/SGIO.h>
18 
19 using namespace Eigen;
20 using namespace shogun;
21 
22 
23 namespace shogun
24 {
25 
26 CKMeans::CKMeans():CKMeansBase()
27 {
28 }
29 
30 CKMeans::CKMeans(int32_t k_i, CDistance* d_i, bool use_kmpp_i):CKMeansBase(k_i, d_i, use_kmpp_i)
31 {
32 }
33 
34 CKMeans::CKMeans(int32_t k_i, CDistance* d_i, SGMatrix<float64_t> centers_i):CKMeansBase(k_i, d_i, centers_i)
35 {
36 }
37 
39 {
40 }
41 
42 void CKMeans::Lloyd_KMeans(SGMatrix<float64_t> centers, int32_t num_centers)
43 {
46 
47  int32_t lhs_size=lhs->get_num_vectors();
48  int32_t dim=lhs->get_num_features();
49 
51  CFeatures* rhs_cache=distance->replace_rhs(rhs_mus);
52 
53  SGVector<int32_t> cluster_assignments=SGVector<int32_t>(lhs_size);
54  cluster_assignments.zero();
55 
56  /* Weights : Number of points in each cluster */
57  SGVector<int64_t> weights_set(num_centers);
58  weights_set.zero();
59  /* Initially set all weights for zeroth cluster, Changes in assignement step */
60  weights_set[0]=lhs_size;
61 
63 
64  int32_t changed=1;
65  int32_t iter;
66 
67  for(iter=0; iter<max_iter; iter++)
68  {
69  if (iter==max_iter-1)
70  SG_SWARNING("KMeans clustering has reached maximum number of ( %d ) iterations without having converged. \
71  Terminating. \n", iter)
72 
73  changed=0;
74  rhs_mus->set_feature_matrix(centers.clone());
75  rhs_mus->initialize_cache();
76 
77  distance->precompute_rhs();
78 
79 #pragma omp parallel for firstprivate(lhs_size, dim, num_centers) \
80  shared(centers, cluster_assignments, weights_set) \
81  reduction(+:changed) if (!fixed_centers)
82  /* Assigment step : Assign each point to nearest cluster */
83  for (int32_t i=0; i<lhs_size; i++)
84  {
85  const int32_t cluster_assignments_i=cluster_assignments[i];
86  int32_t min_cluster, j;
87  float64_t min_dist, dist;
88 
89  min_cluster=0;
90  min_dist=distance->distance(i,0);
91  for (j=1; j<num_centers; j++)
92  {
93  dist=distance->distance(i,j);
94  if (dist<min_dist)
95  {
96  min_dist=dist;
97  min_cluster=j;
98  }
99  }
100 
101  if (min_cluster!=cluster_assignments_i)
102  {
103  changed++;
104 #pragma omp atomic
105  ++weights_set[min_cluster];
106 #pragma omp atomic
107  --weights_set[cluster_assignments_i];
108 
109  if(fixed_centers)
110  {
112  float64_t temp_min = 1.0 / weights_set[min_cluster];
113 
114  /* mu_new = mu_old + (x - mu_old)/(w) */
115  for (j=0; j<dim; j++)
116  {
117  centers(j, min_cluster)+=
118  (vec[j]-centers(j, min_cluster))*temp_min;
119  }
120 
121  lhs->free_feature_vector(vec, i);
122 
123  /* mu_new = mu_old - (x - mu_old)/(w-1) */
124  /* if weights_set(j)~=0 */
125  if (weights_set[cluster_assignments_i]!=0)
126  {
127  float64_t temp_i = 1.0 / weights_set[cluster_assignments_i];
129 
130  for (j=0; j<dim; j++)
131  {
132  centers(j, cluster_assignments_i)-=
133  (vec1[j]-centers(j, cluster_assignments_i))*temp_i;
134  }
135  lhs->free_feature_vector(vec1, i);
136  }
137  else
138  {
139  /* mus(:,j)=zeros(dim,1) ; */
140  for (j=0; j<dim; j++)
141  centers(j, cluster_assignments_i)=0;
142  }
143 
144  }
145 
146  cluster_assignments[i] = min_cluster;
147  }
148  }
149  if(changed==0)
150  break;
151 
152  /* Update Step : Calculate new means */
153  if (!fixed_centers)
154  {
155  /* mus=zeros(dim, num_centers) ; */
156  centers.zero();
157  Map<MatrixXd> map_centers(centers.matrix, centers.num_rows, centers.num_cols);
158 
159  for (int32_t i=0; i<lhs_size; i++)
160  {
161  int32_t cluster_i=cluster_assignments[i];
162 
164  Map<VectorXd> map_vec(vec.vector, vec.size());
165 
166  map_centers.col(cluster_i) += map_vec;
167 
168  lhs->free_feature_vector(vec, i);
169  }
170 
171  for (int32_t i=0; i<num_centers; i++)
172  {
173  if (weights_set[i]!=0)
174  map_centers.col(i)*=1.0/weights_set[i];
175  }
176  }
177  if (iter%(max_iter/10) == 0)
178  SG_SINFO("Iteration[%d/%d]: Assignment of %i patterns changed.\n", iter, max_iter, changed)
179  }
181  distance->replace_rhs(rhs_cache);
182  delete rhs_mus;
183  SG_UNREF(lhs);
184 }
185 
186 bool CKMeans::train_machine(CFeatures* data)
187 {
188  initialize_training(data);
189  Lloyd_KMeans(mus, k);
191  return true;
192 }
193 
194 }
195 
ST * get_feature_vector(int32_t num, int32_t &len, bool &dofree)
Class Distance, a base class for all the distances used in the Shogun toolbox.
Definition: Distance.h:87
int32_t get_num_features() const
virtual void reset_precompute()
Definition: Distance.h:150
CFeatures * get_lhs()
Definition: Distance.h:218
virtual CSGObject * clone()
Definition: SGObject.cpp:729
#define SG_SWARNING(...)
Definition: SGIO.h:177
Definition: SGMatrix.h:24
index_t num_cols
Definition: SGMatrix.h:465
SGMatrix< float64_t > mus
Definition: KMeansBase.h:199
index_t num_rows
Definition: SGMatrix.h:463
virtual ~CKMeans()
Definition: KMeans.cpp:38
int32_t size() const
Definition: SGVector.h:136
void compute_cluster_variances()
Definition: KMeansBase.cpp:90
void initialize_training(CFeatures *data=NULL)
Definition: KMeansBase.cpp:138
virtual int32_t get_num_vectors() const
double float64_t
Definition: common.h:60
virtual CFeatures * replace_rhs(CFeatures *rhs)
Definition: Distance.cpp:147
virtual float64_t distance(int32_t idx_a, int32_t idx_b)
Definition: Distance.cpp:183
#define SG_UNREF(x)
Definition: SGObject.h:53
all of classes and functions are contained in the shogun namespace
Definition: class_list.h:18
void free_feature_vector(ST *feat_vec, int32_t num, bool dofree)
The class Features is the base class of all feature objects.
Definition: Features.h:68
#define SG_SINFO(...)
Definition: SGIO.h:172
virtual void precompute_lhs()
Definition: Distance.h:143
static CDenseFeatures * obtain_from_generic(CFeatures *const base_features)

SHOGUN Machine Learning Toolbox - Documentation