SHOGUN  4.2.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
GaussianNaiveBayes.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2011 Sergey Lisitsyn
8  * Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society
9  */
10 
#include <shogun/multiclass/GaussianNaiveBayes.h>
#include <shogun/machine/NativeMulticlassMachine.h>
#include <shogun/features/DotFeatures.h>
#include <shogun/labels/Labels.h>
#include <shogun/labels/MulticlassLabels.h>
#include <shogun/mathematics/Math.h>
#include <shogun/lib/Signal.h>

19 using namespace shogun;
20 
22  m_min_label(0), m_num_classes(0), m_dim(0), m_means(), m_variances(),
23  m_label_prob(), m_rates()
24 {
25 
26 };
27 
29  CLabels* train_labels) : CNativeMulticlassMachine(), m_features(NULL),
30  m_min_label(0), m_num_classes(0), m_dim(0), m_means(),
31  m_variances(), m_label_prob(), m_rates()
32 {
33  ASSERT(train_examples->get_num_vectors() == train_labels->get_num_labels())
34  set_labels(train_labels);
35 
36  if (!train_examples->has_property(FP_DOT))
37  SG_ERROR("Specified features are not of type CDotFeatures\n")
38 
39  set_features((CDotFeatures*)train_examples);
40 };
41 
43 {
45 };
46 
48 {
50  return m_features;
51 }
52 
54 {
55  if (!features->has_property(FP_DOT))
56  SG_ERROR("Specified features are not of type CDotFeatures\n")
57 
58  SG_REF(features);
60  m_features = (CDotFeatures*)features;
61 }
62 
64 {
65  // init features with data if necessary and assure type is correct
66  if (data)
67  {
68  if (!data->has_property(FP_DOT))
69  SG_ERROR("Specified features are not of type CDotFeatures\n")
70  set_features((CDotFeatures*) data);
71  }
72 
73  // get int labels to train_labels and check length equality
76  SGVector<int32_t> train_labels = ((CMulticlassLabels*) m_labels)->get_int_labels();
77  ASSERT(m_features->get_num_vectors()==train_labels.vlen)
78 
79  // init min_label, max_label and loop variables
80  int32_t min_label = train_labels.vector[0];
81  int32_t max_label = train_labels.vector[0];
82  int i,j;
83 
84  // find minimal and maximal label
85  for (i=1; i<train_labels.vlen; i++)
86  {
87  min_label = CMath::min(min_label, train_labels.vector[i]);
88  max_label = CMath::max(max_label, train_labels.vector[i]);
89  }
90 
91  // subtract minimal label from all labels
92  for (i=0; i<train_labels.vlen; i++)
93  train_labels.vector[i]-= min_label;
94 
95  // get number of classes, minimal label and dimensionality
96  m_num_classes = max_label-min_label+1;
97  m_min_label = min_label;
99 
100  // allocate memory for distributions' parameters and a priori probability
104 
105  // allocate memory for label rates
107 
108  // make arrays filled by zeros before using
109  m_means.zero();
110  m_variances.zero();
111  m_label_prob.zero();
112  m_rates.zero();
113 
114  // number of iterations in all cycles
115  int32_t max_progress = 2 * train_labels.vlen + 2 * m_num_classes;
116 
117  // current progress
118  int32_t progress = 0;
119  SG_PROGRESS(progress, 0, max_progress)
120 
121  // get sum of features among labels
122  for (i=0; i<train_labels.vlen; i++)
123  {
125  for (j=0; j<m_dim; j++)
126  m_means(j, train_labels.vector[i]) += fea.vector[j];
127 
128  m_label_prob.vector[train_labels.vector[i]]+=1.0;
129 
130  progress++;
131  SG_PROGRESS(progress, 0, max_progress)
132  }
133 
134  // get means of features of labels
135  for (i=0; i<m_num_classes; i++)
136  {
137  for (j=0; j<m_dim; j++)
138  m_means(j, i) /= m_label_prob.vector[i];
139 
140  progress++;
141  SG_PROGRESS(progress, 0, max_progress)
142  }
143 
144  // compute squared residuals with means available
145  for (i=0; i<train_labels.vlen; i++)
146  {
148  for (j=0; j<m_dim; j++)
149  {
150  m_variances(j, train_labels.vector[i]) +=
151  CMath::sq(fea[j]-m_means(j, train_labels.vector[i]));
152  }
153 
154  progress++;
155  SG_PROGRESS(progress, 0, max_progress)
156  }
157 
158  // get variance of features of labels
159  for (i=0; i<m_num_classes; i++)
160  {
161  for (j=0; j<m_dim; j++)
162  m_variances(j, i) /= m_label_prob.vector[i] > 1 ? m_label_prob.vector[i]-1 : 1;
163 
164  // get a priori probabilities of labels
165  m_label_prob.vector[i]/= m_num_classes;
166 
167  progress++;
168  SG_PROGRESS(progress, 0, max_progress)
169  }
170  SG_DONE()
171 
172  return true;
173 }
174 
176 {
177  if (data)
178  set_features(data);
179 
181 
182  // init number of vectors
183  int32_t num_vectors = m_features->get_num_vectors();
184 
185  // init result labels
186  CMulticlassLabels* result = new CMulticlassLabels(num_vectors);
187 
188  // classify each example of data
189  SG_PROGRESS(0, 0, num_vectors)
190  for (int i = 0; i < num_vectors; i++)
191  {
192  result->set_label(i,apply_one(i));
193  SG_PROGRESS(i + 1, 0, num_vectors)
194  }
195  SG_DONE()
196  return result;
197 };
198 
200 {
201  // get [idx] feature vector
203 
204  // init loop variables
205  int i,k;
206 
207  // rate all labels
208  for (i=0; i<m_num_classes; i++)
209  {
210  // set rate to 0.0 if a priori probability is 0.0 and continue
211  if (m_label_prob.vector[i]==0.0)
212  {
213  m_rates.vector[i] = 0.0;
214  continue;
215  }
216  else
218 
219  // product all conditional gaussian probabilities
220  for (k=0; k<m_dim; k++)
221  if (m_variances(k,i)!=0.0)
222  m_rates.vector[i]+= CMath::log(0.39894228/CMath::sqrt(m_variances(k, i))) -
223  0.5*CMath::sq(feature_vector.vector[k]-m_means(k, i))/(m_variances(k, i));
224  }
225 
226  // find label with maximum rate
227  int32_t max_label_idx = 0;
228 
229  for (i=0; i<m_num_classes; i++)
230  {
231  if (m_rates.vector[i]>m_rates.vector[max_label_idx])
232  max_label_idx = i;
233  }
234 
235  return max_label_idx+m_min_label;
236 };
#define SG_DONE()
Definition: SGIO.h:157
virtual ELabelType get_label_type() const =0
SGVector< float64_t > m_label_prob
a priori probabilities of labels
experimental abstract native multiclass machine class
#define SG_PROGRESS(...)
Definition: SGIO.h:142
The class Labels models labels, i.e. class assignments of objects.
Definition: Labels.h:43
virtual int32_t get_num_labels() const =0
multi-class labels 0,1,...
Definition: LabelTypes.h:20
static T sq(T x)
Definition: Math.h:450
virtual CMulticlassLabels * apply_multiclass(CFeatures *data=NULL)
virtual int32_t get_num_vectors() const =0
CLabels * m_labels
Definition: Machine.h:361
#define SG_ERROR(...)
Definition: SGIO.h:129
Features that support dot products among other operations.
Definition: DotFeatures.h:44
#define SG_REF(x)
Definition: SGObject.h:51
SGMatrix< float64_t > m_variances
variances for normal distributions of features
SGVector< float64_t > m_rates
label rates
virtual int32_t get_dim_feature_space() const =0
bool set_label(int32_t idx, float64_t label)
Multiclass Labels for multi-class classification.
#define ASSERT(x)
Definition: SGIO.h:201
int32_t m_num_classes
number of different classes (labels)
double float64_t
Definition: common.h:50
int32_t m_min_label
minimal label
virtual void set_features(CFeatures *features)
static T max(T a, T b)
Definition: Math.h:168
int32_t m_dim
dimensionality of feature space
virtual bool train_machine(CFeatures *data=NULL)
#define SG_UNREF(x)
Definition: SGObject.h:52
all of classes and functions are contained in the shogun namespace
Definition: class_list.h:18
SGMatrix< float64_t > m_means
means for normal distributions of features
The class Features is the base class of all feature objects.
Definition: Features.h:68
static T min(T a, T b)
Definition: Math.h:157
static float64_t log(float64_t v)
Definition: Math.h:922
SGVector< float64_t > get_computed_dot_feature_vector(int32_t num)
static float32_t sqrt(float32_t x)
Definition: Math.h:459
virtual CFeatures * get_features()
bool has_property(EFeatureProperty p) const
Definition: Features.cpp:295
virtual float64_t apply_one(int32_t idx)
CDotFeatures * m_features
features for training or classifying
virtual void set_labels(CLabels *lab)

SHOGUN Machine Learning Toolbox - Documentation