SHOGUN  5.0.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules
BaggingMachine.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2013 Viktor Gal
8  * Copyright (C) 2013 Viktor Gal
9  */
10 
14 
15 using namespace shogun;
16 
18  : CMachine()
19 {
20  init();
22 }
23 
25  : CMachine()
26 {
27  init();
29 
30  set_labels(labels);
31 
32  SG_REF(features);
33  m_features = features;
34 }
35 
37 {
43 }
44 
46 {
47  SGVector<float64_t> combined_vector = apply_get_outputs(data);
48 
49  CBinaryLabels* pred = new CBinaryLabels(combined_vector);
50  return pred;
51 }
52 
54 {
55  SGVector<float64_t> combined_vector = apply_get_outputs(data);
56 
57  CMulticlassLabels* pred = new CMulticlassLabels(combined_vector);
58  return pred;
59 }
60 
62 {
63  SGVector<float64_t> combined_vector = apply_get_outputs(data);
64 
65  CRegressionLabels* pred = new CRegressionLabels(combined_vector);
66 
67  return pred;
68 }
69 
71 {
72  ASSERT(data != NULL);
73  REQUIRE(m_combination_rule != NULL, "Combination rule is not set!");
75 
77  output.zero();
78 
79 
80  #pragma omp parallel for
81  for (int32_t i = 0; i < m_num_bags; ++i)
82  {
83  CMachine* m = dynamic_cast<CMachine*>(m_bags->get_element(i));
84  CLabels* l = m->apply(data);
86  if (l!=NULL)
87  lv = dynamic_cast<CDenseLabels*>(l)->get_labels();
88  else
89  SG_ERROR("NULL returned by apply method\n");
90 
91  float64_t* bag_results = output.get_column_vector(i);
92  memcpy(bag_results, lv.vector, lv.vlen*sizeof(float64_t));
93 
94  SG_UNREF(l);
95  SG_UNREF(m);
96  }
97 
99 
100  return combined;
101 }
102 
104 {
105  REQUIRE(m_machine != NULL, "Machine is not set!");
106  REQUIRE(m_num_bags > 0, "Number of bag is not set!");
107 
108  if (data)
109  {
110  SG_REF(data);
112  m_features = data;
113 
115  }
116 
117  // if bag size is not provided, set it equal to number of training vectors
118  if (m_bag_size==0)
120 
121  // clear the array, if previously trained
122  m_bags->reset_array();
123 
124  // reset the oob index vector
127 
130 
131  /*
132  TODO: enable multi-threaded learning. This requires views support
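133  on CFeatures. NOTE(review): the loop below already carries "#pragma omp parallel for", so this TODO may be stale -- confirm. */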
134  #pragma omp parallel for
135  for (int32_t i = 0; i < m_num_bags; ++i)
136  {
137  CMachine* c=dynamic_cast<CMachine*>(m_machine->clone());
138  ASSERT(c != NULL);
140  idx.random(0, m_features->get_num_vectors()-1);
141 
142  CFeatures* features;
143  CLabels* labels;
144 
146  {
147  features = m_features;
148  labels = m_labels;
149  }
150  else
151  {
152  features = m_features->shallow_subset_copy();
153  labels = m_labels->shallow_subset_copy();
154  }
155 
156  labels->add_subset(idx);
157  /* TODO:
158  if it's a binary labeling, ensure that
159  there are always samples of both classes
160  if ((m_labels->get_label_type() == LT_BINARY))
161  {
162  while (true) {
163  if (!m_labels->ensure_valid()) {
164  m_labels->remove_subset();
165  idx.random(0, m_features->get_num_vectors());
166  m_labels->add_subset(idx);
167  continue;
168  }
169  break;
170  }
171  }
172  */
173  features->add_subset(idx);
174  set_machine_parameters(c,idx);
175  c->set_labels(labels);
176  c->train(features);
177  features->remove_subset();
178  labels->remove_subset();
179 
180  #pragma omp critical
181  {
182  // get out of bag indexes
184  m_oob_indices->push_back(oob);
185 
186  // add trained machine to bag array
187  m_bags->push_back(c);
188  }
189 
190  if (get_global_parallel()->get_num_threads()!=1)
191  {
192  SG_UNREF(features);
193  SG_UNREF(labels);
194  }
195 
196  SG_UNREF(c);
197  }
198 
199  return true;
200 }
201 
203 {
204 }
205 
207 {
208  SG_ADD((CSGObject**)&m_features, "features", "Train features for bagging",
210  SG_ADD(&m_num_bags, "num_bags", "Number of bags", MS_AVAILABLE);
211  SG_ADD(&m_bag_size, "bag_size", "Number of vectors per bag", MS_AVAILABLE);
212  SG_ADD((CSGObject**)&m_bags, "bags", "Bags array", MS_NOT_AVAILABLE);
213  SG_ADD((CSGObject**)&m_combination_rule, "combination_rule",
214  "Combination rule to use for aggregating", MS_AVAILABLE);
215  SG_ADD(&m_all_oob_idx, "all_oob_idx", "Indices of all oob vectors",
217  SG_ADD((CSGObject**)&m_oob_indices, "oob_indices",
218  "OOB indices for each machine", MS_NOT_AVAILABLE);
219 }
220 
221 void CBaggingMachine::set_num_bags(int32_t num_bags) // Set the number of bags; the training loop in this file clones and trains one machine per bag.
222 {
223  m_num_bags = num_bags; // stored unchecked here; training later REQUIREs m_num_bags > 0
224 }
225 
227 {
228  return m_num_bags;
229 }
230 
231 void CBaggingMachine::set_bag_size(int32_t bag_size) // Set the number of vectors per bag.
232 {
233  m_bag_size = bag_size; // stored unchecked here; training treats 0 as "not provided" (see its m_bag_size==0 check)
234 }
235 
237 {
238  return m_bag_size;
239 }
240 
242 {
243  SG_REF(m_machine);
244  return m_machine;
245 }
246 
248 {
249  SG_REF(machine);
251  m_machine = machine;
252 }
253 
255 {
256  m_bags = new CDynamicObjectArray();
257  m_machine = NULL;
258  m_features = NULL;
259  m_combination_rule = NULL;
260  m_labels = NULL;
261  m_num_bags = 0;
262  m_bag_size = 0;
264  m_oob_indices = NULL;
265 }
266 
268 {
269  SG_REF(rule);
271  m_combination_rule = rule;
272 }
273 
275 {
277  return m_combination_rule;
278 }
279 
281 {
282  REQUIRE(m_combination_rule != NULL, "Combination rule is not set!");
283  REQUIRE(m_bags->get_num_elements() > 0, "BaggingMachine is not trained!");
284 
287  output.zero();
288  else
289  output.set_const(NAN);
290 
291  /* TODO: add parallel support of applying the OOBs
292  only possible when add_subset is thread-safe
293  #pragma omp parallel for num_threads(parallel->get_num_threads())
294  */
295  for (index_t i = 0; i < m_bags->get_num_elements(); i++)
296  {
297  CMachine* m = dynamic_cast<CMachine*>(m_bags->get_element(i));
298  CDynamicArray<index_t>* current_oob
299  = dynamic_cast<CDynamicArray<index_t>*>(m_oob_indices->get_element(i));
300 
301  SGVector<index_t> oob(current_oob->get_array(), current_oob->get_num_elements(), false);
302  m_features->add_subset(oob);
303 
304  CLabels* l = m->apply(m_features);
306  if (l!=NULL)
307  lv = dynamic_cast<CDenseLabels*>(l)->get_labels();
308  else
309  SG_ERROR("NULL returned by apply method\n");
310 
311  // store this machine's predictions for its out-of-bag vectors; entries of in-bag vectors keep the fill value (zero or NAN) set above
312  for (index_t j = 0; j < oob.vlen; j++)
313  output(oob[j], i) = lv[j];
314 
316  SG_UNREF(current_oob);
317  SG_UNREF(m);
318  SG_UNREF(l);
319  }
320 
321  DynArray<index_t> idx;
322  for (index_t i = 0; i < m_features->get_num_vectors(); i++)
323  {
324  if (m_all_oob_idx[i])
325  idx.push_back(i);
326  }
327 
328  SGVector<float64_t> combined = m_combination_rule->combine(output);
330  for (int32_t i=0;i<lab.vlen;i++)
331  lab[i]=combined[idx.get_element(i)];
332 
333  CLabels* predicted = NULL;
334  switch (m_labels->get_label_type())
335  {
336  case LT_BINARY:
337  predicted = new CBinaryLabels(lab);
338  break;
339 
340  case LT_MULTICLASS:
341  predicted = new CMulticlassLabels(lab);
342  break;
343 
344  case LT_REGRESSION:
345  predicted = new CRegressionLabels(lab);
346  break;
347 
348  default:
349  SG_ERROR("Unsupported label type\n");
350  }
351 
353  float64_t res = eval->evaluate(predicted, m_labels);
355 
356  SG_UNREF(predicted);
357  return res;
358 }
359 
361 {
363  out_of_bag.set_const(true);
364 
365  // mark the ones that are in_bag
366  index_t oob_count = m_features->get_num_vectors();
367  for (index_t i = 0; i < in_bag.vlen; i++)
368  {
369  if (out_of_bag[in_bag[i]])
370  {
371  out_of_bag[in_bag[i]] = false;
372  oob_count--;
373  }
374  }
375 
377  // store the indices of vectors that are out of the bag
378  for (index_t i = 0; i < out_of_bag.vlen; i++)
379  {
380  if (out_of_bag[i])
381  {
382  oob->push_back(i);
383  m_all_oob_idx[i] = true;
384  }
385  }
386 
387  return oob;
388 }
389 
virtual CFeatures * shallow_subset_copy()
Definition: Features.h:336
virtual CRegressionLabels * apply_regression(CFeatures *data=NULL)
T get_element(int32_t index) const
Definition: DynArray.h:142
Parallel * get_global_parallel()
Definition: SGObject.cpp:310
void set_combination_rule(CCombinationRule *rule)
virtual ELabelType get_label_type() const =0
binary labels +1/-1
Definition: LabelTypes.h:18
Real Labels are real-valued labels.
virtual CLabels * shallow_subset_copy()
Definition: Labels.h:127
CCombinationRule * m_combination_rule
int32_t get_num_threads() const
Definition: Parallel.cpp:78
CCombinationRule * get_combination_rule() const
int32_t index_t
Definition: common.h:62
The class Labels models labels, i.e. class assignments of objects.
Definition: Labels.h:43
virtual CSGObject * clone()
Definition: SGObject.cpp:747
virtual int32_t get_num_labels() const =0
real valued labels (e.g. for regression, classifier outputs)
Definition: LabelTypes.h:22
virtual bool train_machine(CFeatures *data=NULL)
multi-class labels 0,1,...
Definition: LabelTypes.h:20
virtual void set_machine_parameters(CMachine *m, SGVector< index_t > idx)
virtual int32_t get_bag_size() const
virtual float64_t evaluate(CLabels *predicted, CLabels *ground_truth)=0
virtual int32_t get_num_vectors() const =0
T * get_array() const
Definition: DynamicArray.h:408
CLabels * m_labels
Definition: Machine.h:361
void random(T min_value, T max_value)
Definition: SGVector.cpp:179
#define SG_ERROR(...)
Definition: SGIO.h:129
#define REQUIRE(x,...)
Definition: SGIO.h:206
int32_t get_num_bags() const
int32_t get_num_elements() const
Definition: DynArray.h:130
CDynamicArray< index_t > * get_oob_indices(const SGVector< index_t > &in_bag)
SGVector< float64_t > apply_get_outputs(CFeatures *data)
Template Dynamic array class that creates an array that can be used like a list or an array...
Definition: DynArray.h:22
virtual CMulticlassLabels * apply_multiclass(CFeatures *data=NULL)
#define SG_REF(x)
Definition: SGObject.h:54
A generic learning machine interface.
Definition: Machine.h:143
SGVector< bool > m_all_oob_idx
CMachine * get_machine() const
virtual SGVector< float64_t > combine(const SGMatrix< float64_t > &ensemble_result) const =0
Multiclass Labels for multi-class classification.
index_t vlen
Definition: SGVector.h:494
#define ASSERT(x)
Definition: SGIO.h:201
Class SGObject is the base class of all shogun objects.
Definition: SGObject.h:115
void push_back(T element)
Definition: DynArray.h:254
virtual void set_machine(CMachine *machine)
Template Dynamic array class that creates an array that can be used like a list or an array...
Definition: DynArray.h:32
double float64_t
Definition: common.h:50
virtual void remove_subset()
Definition: Labels.cpp:49
CDynamicObjectArray * m_bags
virtual CLabels * get_labels()
Definition: Machine.cpp:76
virtual void add_subset(SGVector< index_t > subset)
Definition: Labels.cpp:39
Dynamic array class for CSGObject pointers that creates an array that can be used like a list or an array...
void set_num_bags(int32_t num_bags)
CDynamicObjectArray * m_oob_indices
#define SG_UNREF(x)
Definition: SGObject.h:55
all of classes and functions are contained in the shogun namespace
Definition: class_list.h:18
virtual void remove_subset()
Definition: Features.cpp:322
The class Features is the base class of all feature objects.
Definition: Features.h:68
CombinationRule abstract class. The CombinationRule defines an interface for how to combine the classif...
T * get_array() const
Definition: DynArray.h:372
virtual bool train(CFeatures *data=NULL)
Definition: Machine.cpp:39
int32_t get_num_elements() const
Definition: DynamicArray.h:200
Binary Labels for binary classification.
Definition: BinaryLabels.h:37
CSGObject * get_element(int32_t index) const
#define SG_ADD(...)
Definition: SGObject.h:84
#define NAN
Definition: Math.cpp:26
Dense integer or floating point labels.
Definition: DenseLabels.h:35
virtual CBinaryLabels * apply_binary(CFeatures *data=NULL)
float64_t get_oob_error(CEvaluation *eval) const
virtual void set_labels(CLabels *lab)
Definition: Machine.cpp:65
Class Evaluation, a base class for other classes used to evaluate labels, e.g. accuracy of classifica...
Definition: Evaluation.h:40
void set_const(T const_elem)
Definition: SGVector.cpp:150
virtual void add_subset(SGVector< index_t > subset)
Definition: Features.cpp:310
virtual void set_bag_size(int32_t bag_size)
virtual CLabels * apply(CFeatures *data=NULL)
Definition: Machine.cpp:152

SHOGUN Machine Learning Toolbox - Documentation