SHOGUN  4.2.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
KMeans.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 1999-2008 Gunnar Raetsch
8  * Written (W) 2007-2009 Soeren Sonnenburg
9  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
10  */
11 
17 #include <shogun/labels/Labels.h>
20 #include <shogun/base/Parallel.h>
22 
23 #ifdef HAVE_PTHREAD
24 #include <pthread.h>
25 #endif
26 
27 #ifdef HAVE_LINALG_LIB
29 #endif
30 
31 using namespace shogun;
32 using namespace Eigen;
33 
36 {
37  init();
38 }
39 
42 {
43  init();
44  k=k_;
45  set_distance(d);
46  train_method=f;
47 }
48 
49 CKMeans::CKMeans(int32_t k_, CDistance* d, bool use_kmpp, EKMeansMethod f)
51 {
52  init();
53  k=k_;
54  set_distance(d);
55  use_kmeanspp=use_kmpp;
56  train_method=f;
57 }
58 
61 {
62  init();
63  k = k_i;
64  set_distance(d_i);
65  set_initial_centers(centers_i);
66  train_method=f;
67 }
68 
70 {
71 }
72 
74 {
76  dimensions=lhs->get_num_features();
77  REQUIRE(centers.num_cols == k,
78  "Expected %d initial cluster centers, got %d", k, centers.num_cols);
79  REQUIRE(centers.num_rows == dimensions,
80  "Expected %d dimensionional cluster centers, got %d", dimensions, centers.num_rows);
81  mus_initial = centers;
82  SG_UNREF(lhs);
83 }
84 
85 void CKMeans::set_random_centers()
86 {
87  mus.zero();
90  int32_t lhs_size=lhs->get_num_vectors();
91 
92  SGVector<int32_t> temp=SGVector<int32_t>(lhs_size);
93  SGVector<int32_t>::range_fill_vector(temp, lhs_size, 0);
94  CMath::permute(temp);
95 
96  for (int32_t i=0; i<k; i++)
97  {
98  const int32_t cluster_center_i=temp[i];
99  SGVector<float64_t> vec=lhs->get_feature_vector(cluster_center_i);
100 
101  for (int32_t j=0; j<dimensions; j++)
102  mus(j,i)=vec[j];
103 
104  lhs->free_feature_vector(vec, cluster_center_i);
105  }
106 
107  SG_UNREF(lhs);
108 
109 }
110 
111 void CKMeans::compute_cluster_variances()
112 {
113  /* compute the ,,variances'' of the clusters */
114  for (int32_t i=0; i<k; i++)
115  {
116  float64_t rmin1=0;
117  float64_t rmin2=0;
118 
119  bool first_round=true;
120 
121  for (int32_t j=0; j<k; j++)
122  {
123  if (j!=i)
124  {
125  int32_t l;
126  float64_t dist = 0;
127 
128  for (l=0; l<dimensions; l++)
129  {
130  dist+=CMath::sq(
131  mus.matrix[i*dimensions+l]
132  -mus.matrix[j*dimensions+l]);
133  }
134 
135  if (first_round)
136  {
137  rmin1=dist;
138  rmin2=dist;
139  first_round=false;
140  }
141  else
142  {
143  if ((dist<rmin2) && (dist>=rmin1))
144  rmin2=dist;
145 
146  if (dist<rmin1)
147  {
148  rmin2=rmin1;
149  rmin1=dist;
150  }
151  }
152  }
153  }
154 
155  R.vector[i]=(0.7*CMath::sqrt(rmin1)+0.3*CMath::sqrt(rmin2));
156  }
157 }
158 
159 bool CKMeans::train_machine(CFeatures* data)
160 {
161  REQUIRE(distance, "Distance is not provided")
162  REQUIRE(distance->get_feature_type()==F_DREAL, "Distance's features type (%d) should be of type REAL (%d)")
163 
164  if (data)
165  distance->init(data, data);
166 
168  CDenseFeatures<float64_t>::obtain_from_generic(distance->get_lhs());
169 
170  REQUIRE(lhs, "Lhs features of distance not provided");
171  int32_t lhs_size=lhs->get_num_vectors();
172  dimensions=lhs->get_num_features();
173  const int32_t centers_size=dimensions*k;
174 
175  REQUIRE(lhs_size>0, "Lhs features should not be empty");
176  REQUIRE(dimensions>0, "Lhs features should have more than zero dimensions");
177 
178  /* if kmeans++ to be used */
179  if (use_kmeanspp)
180  {
181 #ifdef HAVE_LINALG_LIB
182  mus_initial=kmeanspp();
183 #endif
184  }
185 
186  R=SGVector<float64_t>(k);
187 
188  mus=SGMatrix<float64_t>(dimensions, k);
189  /* cluster_centers=zeros(dimensions, k) ; */
190  memset(mus.matrix, 0, sizeof(float64_t)*centers_size);
191 
192 
193  if (mus_initial.matrix)
194  mus = mus_initial;
195  else
196  set_random_centers();
197 
198  if (train_method==KMM_MINI_BATCH)
199  {
200  CKMeansMiniBatchImpl::minibatch_KMeans(k, distance, batch_size, minib_iter, mus);
201  }
202  else
203  {
204  CKMeansLloydImpl::Lloyd_KMeans(k, distance, max_iter, mus, fixed_centers);
205  }
206 
207  compute_cluster_variances();
208  SG_UNREF(lhs);
209  return true;
210 }
211 
212 bool CKMeans::load(FILE* srcfile)
213 {
216  return false;
217 }
218 
219 bool CKMeans::save(FILE* dstfile)
220 {
223  return false;
224 }
225 
227 {
228  use_kmeanspp=kmpp;
229 }
230 
232 {
233  return use_kmeanspp;
234 }
235 
236 void CKMeans::set_k(int32_t p_k)
237 {
238  REQUIRE(p_k>0, "number of clusters should be > 0");
239  this->k=p_k;
240 }
241 
242 int32_t CKMeans::get_k()
243 {
244  return k;
245 }
246 
247 void CKMeans::set_max_iter(int32_t iter)
248 {
249  REQUIRE(iter>0, "number of clusters should be > 0");
250  max_iter=iter;
251 }
252 
254 {
255  return max_iter;
256 }
257 
259 {
260  train_method=f;
261 }
262 
264 {
265  return train_method;
266 }
267 
269 {
270  REQUIRE(b>0, "Parameter bach size should be > 0");
271  batch_size=b;
272 }
273 
275 {
276  return batch_size;
277 }
278 
280 {
281  REQUIRE(i>0, "Parameter number of iterations should be > 0");
282  minib_iter=i;
283 }
284 
286 {
287  return minib_iter;
288 }
289 
290 void CKMeans::set_mbKMeans_params(int32_t b, int32_t t)
291 {
292  REQUIRE(b>0, "Parameter bach size should be > 0");
293  REQUIRE(t>0, "Parameter number of iterations should be > 0");
294  batch_size=b;
295  minib_iter=t;
296 }
297 
299 {
300  return R;
301 }
302 
304 {
305  if (!R.vector)
306  return SGMatrix<float64_t>();
307 
310  SGMatrix<float64_t> centers=lhs->get_feature_matrix();
311  SG_UNREF(lhs);
312  return centers;
313 }
314 
316 {
317  return dimensions;
318 }
319 
321 {
322  fixed_centers=fixed;
323 }
324 
326 {
327  return fixed_centers;
328 }
329 
330 void CKMeans::store_model_features()
331 {
332  /* set lhs of underlying distance to cluster centers */
334  mus);
335 
336  /* store cluster centers in lhs of distance variable */
337  CFeatures* rhs=distance->get_rhs();
338  distance->init(cluster_centers, rhs);
339  SG_UNREF(rhs);
340 }
341 
342 SGMatrix<float64_t> CKMeans::kmeanspp()
343 {
344  int32_t lhs_size;
346  lhs_size=lhs->get_num_vectors();
347 
348  SGMatrix<float64_t> centers=SGMatrix<float64_t>(dimensions, k);
349  centers.zero();
350  SGVector<float64_t> min_dist=SGVector<float64_t>(lhs_size);
351  min_dist.zero();
352 
353  /* First center is chosen at random */
354  int32_t mu=CMath::random((int32_t) 0, lhs_size-1);
355  SGVector<float64_t> mu_first=lhs->get_feature_vector(mu);
356  for(int32_t j=0; j<dimensions; j++)
357  centers(j, 0)=mu_first[j];
358 
361 #pragma omp parallel for shared(min_dist)
362  for(int32_t i=0; i<lhs_size; i++)
363  min_dist[i]=CMath::sq(distance->distance(i, mu));
364 #ifdef HAVE_LINALG
365  float64_t sum=linalg::vector_sum(min_dist);
366 #else //HAVE_LINALG
367  Map<VectorXd> eigen_min_dist(min_dist.vector, min_dist.vlen);
368  float64_t sum=eigen_min_dist.sum();
369 #endif //HAVE_LINALG
370  int32_t n_rands=2 + int32_t(CMath::log(k));
371 
372  /* Choose centers with weighted probability */
373  for(int32_t i=1; i<k; i++)
374  {
375  int32_t best_center=0;
376  float64_t best_sum=-1.0;
377  SGVector<float64_t> best_min_dist=SGVector<float64_t>(lhs_size);
378 
379  /* local tries for best center */
380  for(int32_t trial=0; trial<n_rands; trial++)
381  {
382  float64_t temp_sum=0.0;
383  float64_t temp_dist=0.0;
384  SGVector<float64_t> temp_min_dist=SGVector<float64_t>(lhs_size);
385  int32_t new_center=0;
386  float64_t prob=CMath::random(0.0, 1.0);
387  prob=prob*sum;
388 
389  for(int32_t j=0; j<lhs_size; j++)
390  {
391  temp_sum+=min_dist[j];
392  if (prob <= temp_sum)
393  {
394  new_center=j;
395  break;
396  }
397  }
398 
399 #pragma omp parallel for firstprivate(lhs_size) \
400  shared(temp_min_dist)
401  for(int32_t j=0; j<lhs_size; j++)
402  {
403  temp_dist=CMath::sq(distance->distance(j, new_center));
404  temp_min_dist[j]=CMath::min(temp_dist, min_dist[j]);
405  }
406 
407 #ifdef HAVE_LINALG
408  temp_sum=linalg::vector_sum(temp_min_dist);
409 #else //HAVE_LINALG
410  Map<VectorXd> eigen_temp_sum(temp_min_dist.vector, temp_min_dist.vlen);
411  temp_sum=eigen_temp_sum.sum();
412 #endif //HAVE_LINALG
413  if ((temp_sum<best_sum) || (best_sum<0))
414  {
415  best_sum=temp_sum;
416  best_min_dist=temp_min_dist;
417  best_center=new_center;
418  }
419  }
420 
421  SGVector<float64_t> vec=lhs->get_feature_vector(best_center);
422  for(int32_t j=0; j<dimensions; j++)
423  centers(j, i)=vec[j];
424  sum=best_sum;
425  min_dist=best_min_dist;
426  }
427 
429  SG_UNREF(lhs);
430  return centers;
431 }
432 
433 void CKMeans::init()
434 {
435  max_iter=10000;
436  k=3;
437  dimensions=0;
438  fixed_centers=false;
439  use_kmeanspp=false;
440  train_method=KMM_LLOYD;
441  batch_size=-1;
442  minib_iter=-1;
443  SG_ADD(&max_iter, "max_iter", "Maximum number of iterations", MS_AVAILABLE);
444  SG_ADD(&k, "k", "k, the number of clusters", MS_AVAILABLE);
445  SG_ADD(&dimensions, "dimensions", "Dimensions of data", MS_NOT_AVAILABLE);
446  SG_ADD(&R, "R", "Cluster radiuses", MS_NOT_AVAILABLE);
447 }
448 
int32_t get_mbKMeans_batch_size() const
Definition: KMeans.cpp:274
static void permute(SGVector< T > v, CRandom *rand=NULL)
Definition: Math.h:1144
#define SG_RESET_LOCALE
Definition: SGIO.h:86
int32_t get_mbKMeans_iter() const
Definition: KMeans.cpp:285
virtual bool save(FILE *dstfile)
Definition: KMeans.cpp:219
void set_mbKMeans_params(int32_t b, int32_t t)
Definition: KMeans.cpp:290
ST * get_feature_vector(int32_t num, int32_t &len, bool &dofree)
Class Distance, a base class for all the distances used in the Shogun toolbox.
Definition: Distance.h:87
int32_t get_num_features() const
virtual void reset_precompute()
Definition: Distance.h:150
CFeatures * get_lhs()
Definition: Distance.h:224
void set_mbKMeans_batch_size(int32_t b)
Definition: KMeans.cpp:268
void set_mbKMeans_iter(int32_t t)
Definition: KMeans.cpp:279
static T sq(T x)
Definition: Math.h:450
EKMeansMethod
Definition: KMeans.h:28
void set_use_kmeanspp(bool kmpp)
Definition: KMeans.cpp:226
Definition: SGMatrix.h:20
#define REQUIRE(x,...)
Definition: SGIO.h:206
CFeatures * get_rhs()
Definition: Distance.h:230
void set_k(int32_t p_k)
Definition: KMeans.cpp:236
index_t num_cols
Definition: SGMatrix.h:376
int32_t get_dimensions()
Definition: KMeans.cpp:315
#define SG_SET_LOCALE_C
Definition: SGIO.h:85
A generic DistanceMachine interface.
index_t num_rows
Definition: SGMatrix.h:374
static uint64_t random()
Definition: Math.h:1019
virtual ~CKMeans()
Definition: KMeans.cpp:69
index_t vlen
Definition: SGVector.h:492
static void Lloyd_KMeans(int32_t k, CDistance *distance, int32_t max_iter, SGMatrix< float64_t > mus, bool fixed_centers)
bool get_use_kmeanspp() const
Definition: KMeans.cpp:231
SGVector< float64_t > get_radiuses()
Definition: KMeans.cpp:298
virtual int32_t get_num_vectors() const
float64_t get_max_iter()
Definition: KMeans.cpp:253
double float64_t
Definition: common.h:50
virtual bool load(FILE *srcfile)
Definition: KMeans.cpp:212
static void range_fill_vector(T *vec, int32_t len, T start=0)
Definition: SGVector.cpp:228
bool get_fixed_centers()
Definition: KMeans.cpp:325
void set_max_iter(int32_t iter)
Definition: KMeans.cpp:247
void set_fixed_centers(bool fixed)
Definition: KMeans.cpp:320
virtual float64_t distance(int32_t idx_a, int32_t idx_b)
Definition: Distance.cpp:189
#define SG_UNREF(x)
Definition: SGObject.h:52
all of classes and functions are contained in the shogun namespace
Definition: class_list.h:18
virtual void set_initial_centers(SGMatrix< float64_t > centers)
Definition: KMeans.cpp:73
void free_feature_vector(ST *feat_vec, int32_t num, bool dofree)
The class Features is the base class of all feature objects.
Definition: Features.h:68
static T min(T a, T b)
Definition: Math.h:157
void set_distance(CDistance *d)
void set_train_method(EKMeansMethod f)
Definition: KMeans.cpp:258
virtual void precompute_lhs()
Definition: Distance.h:143
static float64_t log(float64_t v)
Definition: Math.h:922
static CDenseFeatures * obtain_from_generic(CFeatures *const base_features)
Vector::Scalar vector_sum(Vector a)
Definition: Redux.h:81
virtual void precompute_rhs()
Definition: Distance.h:135
EKMeansMethod get_train_method() const
Definition: KMeans.cpp:263
#define SG_ADD(...)
Definition: SGObject.h:81
static float32_t sqrt(float32_t x)
Definition: Math.h:459
static void minibatch_KMeans(int32_t k, CDistance *distance, int32_t batch_size, int32_t minib_iter, SGMatrix< float64_t > mus)
int32_t get_k()
Definition: KMeans.cpp:242
virtual bool init(CFeatures *lhs, CFeatures *rhs)
Definition: Distance.cpp:78
SGMatrix< float64_t > get_cluster_centers()
Definition: KMeans.cpp:303

SHOGUN Machine Learning Toolbox - Documentation