SHOGUN  3.2.1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
KNN.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2006 Christian Gehl
8  * Written (W) 2006-2009 Soeren Sonnenburg
9  * Written (W) 2011 Sergey Lisitsyn
10  * Written (W) 2012 Fernando José Iglesias García, cover tree support
11  * Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society
12  */
13 
14 #include <shogun/multiclass/KNN.h>
15 #include <shogun/labels/Labels.h>
18 #include <shogun/lib/Signal.h>
19 #include <shogun/lib/JLCoverTree.h>
20 #include <shogun/lib/Time.h>
21 #include <shogun/base/Parameter.h>
22 
23 //#define BENCHMARK_KNN
24 //#define DEBUG_KNN
25 
26 using namespace shogun;
27 
30 {
31  init();
32 }
33 
34 CKNN::CKNN(int32_t k, CDistance* d, CLabels* trainlab)
36 {
37  init();
38 
39  m_k=k;
40 
41  ASSERT(d)
42  ASSERT(trainlab)
43 
44  set_distance(d);
45  set_labels(trainlab);
47 }
48 
49 void CKNN::init()
50 {
51  /* do not store model features by default (CDistanceMachine::apply(...) is
52  * overwritten */
54 
55  m_k=3;
56  m_q=1.0;
57  m_use_covertree=false;
58  m_num_classes=0;
59 
60  /* use the method classify_multiply_k to experiment with different values
61  * of k */
62  SG_ADD(&m_k, "m_k", "Parameter k", MS_NOT_AVAILABLE);
63  SG_ADD(&m_q, "m_q", "Parameter q", MS_AVAILABLE);
64  SG_ADD(&m_use_covertree, "m_use_covertree", "Parameter use_covertree", MS_NOT_AVAILABLE);
65  SG_ADD(&m_num_classes, "m_num_classes", "Number of classes", MS_NOT_AVAILABLE);
66 }
67 
69 {
70 }
71 
73 {
76 
77  if (data)
78  {
79  if (m_labels->get_num_labels() != data->get_num_vectors())
80  SG_ERROR("Number of training vectors does not match number of labels\n")
81  distance->init(data, data);
82  }
83 
84  SGVector<int32_t> lab=((CMulticlassLabels*) m_labels)->get_int_labels();
85  m_train_labels=lab.clone();
87 
88  int32_t max_class=m_train_labels[0];
89  int32_t min_class=m_train_labels[0];
90 
91  for (int32_t i=1; i<m_train_labels.vlen; i++)
92  {
93  max_class=CMath::max(max_class, m_train_labels[i]);
94  min_class=CMath::min(min_class, m_train_labels[i]);
95  }
96 
97  for (int32_t i=0; i<m_train_labels.vlen; i++)
98  m_train_labels[i]-=min_class;
99 
100  m_min_label=min_class;
101  m_num_classes=max_class-min_class+1;
102 
103  SG_INFO("m_num_classes: %d (%+d to %+d) num_train: %d\n", m_num_classes,
104  min_class, max_class, m_train_labels.vlen);
105 
106  return true;
107 }
108 
110 {
111  //number of examples to which kNN is applied
112  int32_t n=distance->get_num_vec_rhs();
113  //distances to train data
114  float64_t* dists=SG_MALLOC(float64_t, m_train_labels.vlen);
115  //indices to train data
116  index_t* train_idxs=SG_MALLOC(index_t, m_train_labels.vlen);
117  //pre-allocation of the nearest neighbors
118  SGMatrix<index_t> NN(m_k, n);
119 
120  //for each test example
121  for (int32_t i=0; i<n && (!CSignal::cancel_computations()); i++)
122  {
123  SG_PROGRESS(i, 0, n)
124 
125  //lhs idx 0..num train examples-1 (i.e., all train examples) and rhs idx i
126  distances_lhs(dists,0,m_train_labels.vlen-1,i);
127 
128  //fill in an array with 0..num train examples-1
129  for (int32_t j=0; j<m_train_labels.vlen; j++)
130  train_idxs[j]=j;
131 
132  //sort the distance vector between test example i and all train examples
133  CMath::qsort_index(dists, train_idxs, m_train_labels.vlen);
134 
135 #ifdef DEBUG_KNN
136  SG_PRINT("\nQuick sort query %d\n", i)
137  for (int32_t j=0; j<m_k; j++)
138  SG_PRINT("%d ", train_idxs[j])
139  SG_PRINT("\n")
140 #endif
141 
142  //fill in the output the indices of the nearest neighbors
143  for (int32_t j=0; j<m_k; j++)
144  NN(j,i) = train_idxs[j];
145  }
146 
147  SG_FREE(train_idxs);
148  SG_FREE(dists);
149 
150  return NN;
151 }
152 
154 {
155  if (data)
156  init_distance(data);
157 
158  //redirecting to fast (without sorting) classify if k==1
159  if (m_k == 1)
160  return classify_NN();
161 
165 
166  int32_t num_lab=distance->get_num_vec_rhs();
167  ASSERT(m_k<=distance->get_num_vec_lhs())
168 
169  CMulticlassLabels* output=new CMulticlassLabels(num_lab);
170 
171  //labels of the k nearest neighbors
172  int32_t* train_lab=SG_MALLOC(int32_t, m_k);
173 
174  SG_INFO("%d test examples\n", num_lab)
176 
177  //histogram of classes and returned output
178  float64_t* classes=SG_MALLOC(float64_t, m_num_classes);
179 
180 #ifdef BENCHMARK_KNN
181  CTime tstart;
182  float64_t tfinish, tparsed, tcreated, tqueried;
183 #endif
184 
185  if ( ! m_use_covertree )
186  {
187  //get the k nearest neighbors of each example
189 
190  //from the indices to the nearest neighbors, compute the class labels
191  for (int32_t i=0; i<num_lab && (!CSignal::cancel_computations()); i++)
192  {
193  //write the labels of the k nearest neighbors from theirs indices
194  for (int32_t j=0; j<m_k; j++)
195  train_lab[j] = m_train_labels[ NN(j,i) ];
196 
197  //get the index of the 'nearest' class
198  int32_t out_idx = choose_class(classes, train_lab);
199  //write the label of 'nearest' in the output
200  output->set_label(i, out_idx + m_min_label);
201  }
202 
203 #ifdef BENCHMARK_KNN
204  SG_PRINT(">>>> Quick sort applied in %9.4f\n",
205  (tfinish = tstart.cur_time_diff(false)));
206 #endif
207  }
208  else // Use cover tree
209  {
210  // m_q != 1.0 not supported with cover tree because the neighbors
211  // are not retrieved in increasing order of distance to the query
212  float64_t old_q = m_q;
213  if ( old_q != 1.0 )
214  SG_INFO("q != 1.0 not supported with cover tree, using q = 1\n")
215 
216  // From the sets of features (lhs and rhs) stored in distance,
217  // build arrays of cover tree points
218  v_array< CJLCoverTreePoint > set_of_points =
220  v_array< CJLCoverTreePoint > set_of_queries =
222 
223 #ifdef BENCHMARK_KNN
224  SG_PRINT(">>>> JL parsed in %9.4f\n",
225  ( tparsed = tstart.cur_time_diff(false) ) - tfinish);
226 #endif
227  // Build the cover trees, one for the test vectors (rhs features)
228  // and another for the training vectors (lhs features)
230  node< CJLCoverTreePoint > top = batch_create(set_of_points);
231  CFeatures* l = distance->replace_lhs(r);
232  distance->replace_rhs(r);
233  node< CJLCoverTreePoint > top_query = batch_create(set_of_queries);
234 
235 #ifdef BENCHMARK_KNN
236  SG_PRINT(">>>> Cover trees created in %9.4f\n",
237  (tcreated = tstart.cur_time_diff(false)) - tparsed);
238 #endif
239 
240  // Get the k nearest neighbors to all the test vectors (batch method)
241  distance->replace_lhs(l);
243  k_nearest_neighbor(top, top_query, res, m_k);
244 
245 #ifdef BENCHMARK_KNN
246  SG_PRINT(">>>> Query finished in %9.4f\n",
247  (tqueried = tstart.cur_time_diff(false)) - tcreated);
248 #endif
249 
250 #ifdef DEBUG_KNN
251  SG_PRINT("\nJL Results:\n")
252  for ( int32_t i = 0 ; i < res.index ; ++i )
253  {
254  for ( int32_t j = 0 ; j < res[i].index ; ++j )
255  {
256  printf("%d ", res[i][j].m_index);
257  }
258  printf("\n");
259  }
260  SG_PRINT("\n")
261 #endif
262 
263  for ( int32_t i = 0 ; i < res.index ; ++i )
264  {
265  // Translate from indices to labels of the nearest neighbors
266  for ( int32_t j = 0; j < m_k; ++j )
267  // The first index in res[i] points to the test vector
268  train_lab[j] = m_train_labels.vector[ res[i][j+1].m_index ];
269 
270  // Get the index of the 'nearest' class
271  int32_t out_idx = choose_class(classes, train_lab);
272  output->set_label(res[i][0].m_index, out_idx+m_min_label);
273  }
274 
275  m_q = old_q;
276 
277 #ifdef BENCHMARK_KNN
278  SG_PRINT(">>>> JL applied in %9.4f\n", tstart.cur_time_diff(false))
279 #endif
280  }
281 
282  SG_FREE(classes);
283  SG_FREE(train_lab);
284 
285  return output;
286 }
287 
289 {
292 
293  int32_t num_lab = distance->get_num_vec_rhs();
294  ASSERT(num_lab)
295 
296  CMulticlassLabels* output = new CMulticlassLabels(num_lab);
297  float64_t* distances = SG_MALLOC(float64_t, m_train_labels.vlen);
298 
299  SG_INFO("%d test examples\n", num_lab)
301 
302  // for each test example
303  for (int32_t i=0; i<num_lab && (!CSignal::cancel_computations()); i++)
304  {
305  SG_PROGRESS(i,0,num_lab)
306 
307  // get distances from i-th test example to 0..num_m_train_labels-1 train examples
308  distances_lhs(distances,0,m_train_labels.vlen-1,i);
309  int32_t j;
310 
311  // assuming 0th train examples as nearest to i-th test example
312  int32_t out_idx = 0;
313  float64_t min_dist = distances[0];
314 
315  // searching for nearest neighbor by comparing distances
316  for (j=0; j<m_train_labels.vlen; j++)
317  {
318  if (distances[j]<min_dist)
319  {
320  min_dist = distances[j];
321  out_idx = j;
322  }
323  }
324 
325  // label i-th test example with label of nearest neighbor with out_idx index
326  output->set_label(i,m_train_labels.vector[out_idx]+m_min_label);
327  }
328 
329  SG_FREE(distances);
330  return output;
331 }
332 
334 {
338 
339  int32_t num_lab=distance->get_num_vec_rhs();
340  ASSERT(m_k<=num_lab)
341 
342  int32_t* output=SG_MALLOC(int32_t, m_k*num_lab);
343 
344  //working buffer of m_train_labels
345  int32_t* train_lab=SG_MALLOC(int32_t, m_k);
346 
347  //histogram of classes and returned output
348  int32_t* classes=SG_MALLOC(int32_t, m_num_classes);
349 
350  SG_INFO("%d test examples\n", num_lab)
352 
353  if ( ! m_use_covertree )
354  {
355  //get the k nearest neighbors of each example
357 
358  for (int32_t i=0; i<num_lab && (!CSignal::cancel_computations()); i++)
359  {
360  //write the labels of the k nearest neighbors from theirs indices
361  for (int32_t j=0; j<m_k; j++)
362  train_lab[j] = m_train_labels[ NN(j,i) ];
363 
364  choose_class_for_multiple_k(output+i, classes, train_lab, num_lab);
365  }
366  }
367  else // Use cover tree
368  {
369  //allocation for distances to nearest neighbors
370  float64_t* dists=SG_MALLOC(float64_t, m_k);
371 
372  // From the sets of features (lhs and rhs) stored in distance,
373  // build arrays of cover tree points
374  v_array< CJLCoverTreePoint > set_of_points =
376  v_array< CJLCoverTreePoint > set_of_queries =
378 
379  // Build the cover trees, one for the test vectors (rhs features)
380  // and another for the training vectors (lhs features)
382  node< CJLCoverTreePoint > top = batch_create(set_of_points);
383  CFeatures* l = distance->replace_lhs(r);
384  distance->replace_rhs(r);
385  node< CJLCoverTreePoint > top_query = batch_create(set_of_queries);
386 
387  // Get the k nearest neighbors to all the test vectors (batch method)
388  distance->replace_lhs(l);
390  k_nearest_neighbor(top, top_query, res, m_k);
391 
392  for ( int32_t i = 0 ; i < res.index ; ++i )
393  {
394  // Handle the fact that cover tree doesn't return neighbors
395  // ordered by distance
396 
397  for ( int32_t j = 0 ; j < m_k ; ++j )
398  {
399  // The first index in res[i] points to the test vector
400  dists[j] = distance->distance(res[i][j+1].m_index,
401  res[i][0].m_index);
402  train_lab[j] = m_train_labels.vector[
403  res[i][j+1].m_index ];
404  }
405 
406  // Now we get the indices to the neighbors sorted by distance
407  CMath::qsort_index(dists, train_lab, m_k);
408 
409  choose_class_for_multiple_k(output+res[i][0].m_index, classes,
410  train_lab, num_lab);
411  }
412 
413  SG_FREE(dists);
414  }
415 
416  SG_FREE(train_lab);
417  SG_FREE(classes);
418 
419  return SGMatrix<int32_t>(output,num_lab,m_k,true);
420 }
421 
423 {
424  if (!distance)
425  SG_ERROR("No distance assigned!\n")
426  CFeatures* lhs=distance->get_lhs();
427  if (!lhs || !lhs->get_num_vectors())
428  {
429  SG_UNREF(lhs);
430  SG_ERROR("No vectors on left hand side\n")
431  }
432  distance->init(lhs, data);
433  SG_UNREF(lhs);
434 }
435 
436 bool CKNN::load(FILE* srcfile)
437 {
440  return false;
441 }
442 
443 bool CKNN::save(FILE* dstfile)
444 {
447  return false;
448 }
449 
451 {
452  CFeatures* d_lhs=distance->get_lhs();
453  CFeatures* d_rhs=distance->get_rhs();
454 
455  /* copy lhs of underlying distance */
456  distance->init(d_lhs->duplicate(), d_rhs);
457 
458  SG_UNREF(d_lhs); // because of d_lhs->duplicate()
459  SG_UNREF(d_lhs); // because of distance->get_lhs()
460  SG_UNREF(d_rhs);
461 }
462 
463 int32_t CKNN::choose_class(float64_t* classes, int32_t* train_lab)
464 {
465  memset(classes, 0, sizeof(float64_t)*m_num_classes);
466 
467  float64_t multiplier = m_q;
468  for (int32_t j=0; j<m_k; j++)
469  {
470  classes[train_lab[j]]+= multiplier;
471  multiplier*= multiplier;
472  }
473 
474  //choose the class that got 'outputted' most often
475  int32_t out_idx=0;
476  float64_t out_max=0;
477 
478  for (int32_t j=0; j<m_num_classes; j++)
479  {
480  if (out_max< classes[j])
481  {
482  out_idx= j;
483  out_max= classes[j];
484  }
485  }
486 
487  return out_idx;
488 }
489 
490 void CKNN::choose_class_for_multiple_k(int32_t* output, int32_t* classes, int32_t* train_lab, int32_t step)
491 {
492  //compute histogram of class outputs of the first k nearest neighbours
493  memset(classes, 0, sizeof(int32_t)*m_num_classes);
494 
495  for (int32_t j=0; j<m_k; j++)
496  {
497  classes[train_lab[j]]++;
498 
499  //choose the class that got 'outputted' most often
500  int32_t out_idx=0;
501  int32_t out_max=0;
502 
503  for (int32_t c=0; c<m_num_classes; c++)
504  {
505  if (out_max< classes[c])
506  {
507  out_idx= c;
508  out_max= classes[c];
509  }
510  }
511 
512  output[j*step]=out_idx+m_min_label;
513  }
514 }

SHOGUN Machine Learning Toolbox - Documentation