SHOGUN  6.0.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules
Hierarchical.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2007-2009 Soeren Sonnenburg
8  * Copyright (C) 2007-2009 Fraunhofer Institute FIRST and Max-Planck-Society
9  */
10 
13 #include <shogun/labels/Labels.h>
16 #include <shogun/base/Parallel.h>
17 
18 using namespace shogun;
19 
20 #ifndef DOXYGEN_SHOULD_SKIP_THIS
21 struct pair
22 {
24  int32_t idx1;
26  int32_t idx2;
27 };
28 #endif // DOXYGEN_SHOULD_SKIP_THIS
29 
31 : CDistanceMachine(), merges(3), dimensions(0), assignment(NULL),
32  table_size(0), pairs(NULL), merge_distance(NULL)
33 {
34 }
35 
37 : CDistanceMachine(), merges(merges_), dimensions(0), assignment(NULL),
38  table_size(0), pairs(NULL), merge_distance(NULL)
39 {
40  set_distance(d);
41 }
42 
44 {
45  SG_FREE(merge_distance);
46  SG_FREE(assignment);
47  SG_FREE(pairs);
48 }
49 
51 {
52  return CT_HIERARCHICAL;
53 }
54 
56 {
58 
59  if (data)
60  distance->init(data, data);
61 
62  CFeatures* lhs=distance->get_lhs();
63  ASSERT(lhs)
64 
65  int32_t num=lhs->get_num_vectors();
66  ASSERT(num>0)
67 
68  const int32_t num_pairs=num*(num-1)/2;
69 
70  SG_FREE(merge_distance);
71  merge_distance=SG_MALLOC(float64_t, num);
73 
74  SG_FREE(assignment);
75  assignment=SG_MALLOC(int32_t, num);
77 
78  SG_FREE(pairs);
79  pairs=SG_MALLOC(int32_t, 2*num);
81 
82  pair* index=SG_MALLOC(pair, num_pairs);
83  float64_t* distances=SG_MALLOC(float64_t, num_pairs);
84 
85  int32_t offs=0;
86  for (int32_t i=0; i<num; i++)
87  {
88  for (int32_t j=i+1; j<num; j++)
89  {
90  distances[offs]=distance->distance(i,j);
91  index[offs].idx1=i;
92  index[offs].idx2=j;
93  offs++; //offs=i*(i+1)/2+j
94  }
95  SG_PROGRESS(i, 0, num-1)
96  }
97 
98  CMath::qsort_index<float64_t,pair>(distances, index, (num-1)*num/2);
99  //CMath::display_vector(distances, (num-1)*num/2, "dists");
100 
101  int32_t k=-1;
102  int32_t l=0;
103  for (; l<num && (num-l)>=merges && k<num_pairs-1; l++)
104  {
105  while (k<num_pairs-1)
106  {
107  k++;
108 
109  int32_t i=index[k].idx1;
110  int32_t j=index[k].idx2;
111  int32_t c1=assignment[i];
112  int32_t c2=assignment[j];
113 
114  if (c1==c2)
115  continue;
116 
117  SG_PROGRESS(k, 0, num_pairs-1)
118 
119  if (c1<c2)
120  {
121  pairs[2*l]=c1;
122  pairs[2*l+1]=c2;
123  }
124  else
125  {
126  pairs[2*l]=c2;
127  pairs[2*l+1]=c1;
128  }
129  merge_distance[l]=distances[k];
130 
131  int32_t c=num+l;
132  for (int32_t m=0; m<num; m++)
133  {
134  if (assignment[m] == c1 || assignment[m] == c2)
135  assignment[m] = c;
136  }
137 #ifdef DEBUG_HIERARCHICAL
138  SG_PRINT("l=%04i i=%04i j=%04i c1=%+04d c2=%+04d c=%+04d dist=%6.6f\n", l,i,j, c1,c2,c, merge_distance[l])
139 #endif
140  break;
141  }
142  }
143 
144  assignment_size=num;
145  table_size=l-1;
146  ASSERT(table_size>0)
147  SG_FREE(distances);
148  SG_FREE(index);
149  SG_UNREF(lhs)
150 
151  return true;
152 }
153 
154 bool CHierarchical::load(FILE* srcfile)
155 {
158  return false;
159 }
160 
161 bool CHierarchical::save(FILE* dstfile)
162 {
165  return false;
166 }
167 
168 
170 {
171  return merges;
172 }
173 
175 {
176  return SGVector<int32_t>(assignment,table_size, false);
177 }
178 
180 {
182 }
183 
185 {
186  return SGMatrix<int32_t>(pairs,2,merges, false);
187 }
188 
189 
191 {
192  /* TODO. Currently does nothing since apply methods are not implemented. */
193 }
EMachineType
Definition: Machine.h:33
virtual bool save(FILE *dstfile)
#define SG_RESET_LOCALE
Definition: SGIO.h:85
static void fill_vector(T *vec, int32_t len, T value)
Definition: SGVector.cpp:264
SGVector< float64_t > get_merge_distances()
Class Distance, a base class for all the distances used in the Shogun toolbox.
Definition: Distance.h:87
#define SG_PROGRESS(...)
Definition: SGIO.h:141
CFeatures * get_lhs()
Definition: Distance.h:218
virtual int32_t get_num_vectors() const =0
int32_t merges
the number of merges in hierarchical clustering
Definition: Hierarchical.h:129
#define SG_SET_LOCALE_C
Definition: SGIO.h:84
int32_t * assignment
cluster assignment for the num_points
Definition: Hierarchical.h:138
A generic DistanceMachine interface.
#define SG_PRINT(...)
Definition: SGIO.h:136
#define ASSERT(x)
Definition: SGIO.h:200
virtual bool train_machine(CFeatures *data=NULL)
SGMatrix< int32_t > get_cluster_pairs()
int32_t table_size
size of the below tables
Definition: Hierarchical.h:141
double float64_t
Definition: common.h:60
int32_t assignment_size
size of assignment table
Definition: Hierarchical.h:135
static void range_fill_vector(T *vec, int32_t len, T start=0)
Definition: SGVector.cpp:271
virtual float64_t distance(int32_t idx_a, int32_t idx_b)
Definition: Distance.cpp:183
#define SG_UNREF(x)
Definition: SGObject.h:53
all of classes and functions are contained in the shogun namespace
Definition: class_list.h:18
SGVector< int32_t > get_assignment()
virtual void store_model_features()
virtual bool load(FILE *srcfile)
The class Features is the base class of all feature objects.
Definition: Features.h:68
void set_distance(CDistance *d)
float64_t * merge_distance
distance at which pair i/j was added
Definition: Hierarchical.h:147
virtual bool init(CFeatures *lhs, CFeatures *rhs)
Definition: Distance.cpp:55
virtual EMachineType get_classifier_type()
int32_t * pairs
tuples of i/j
Definition: Hierarchical.h:144

SHOGUN Machine Learning Toolbox - Documentation