SHOGUN  3.2.1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
PCA.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 1999-2008 Gunnar Raetsch
8  * Written (W) 1999-2008,2011 Soeren Sonnenburg
9  * Written (W) 2014 Parijat Mazumdar
10  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
11  * Copyright (C) 2011 Berlin Institute of Technology
12  */
13 #include <shogun/lib/config.h>
14 
15 #ifdef HAVE_EIGEN3
20 #include <shogun/io/SGIO.h>
22 
23 using namespace shogun;
24 using namespace Eigen;
25 
// Full constructor: sets whitening, dimension-cutoff mode and threshold, the
// eigen-decomposition method, and the memory mode, after registering all
// parameters via init().
// NOTE(review): embedded line 27 (presumably the base-class initializer list,
// e.g. ": CDimensionReductionPreprocessor()") is missing from this extraction —
// confirm against the full source.
26 CPCA::CPCA(bool do_whitening, EPCAMode mode, float64_t thresh, EPCAMethod method, EPCAMemoryMode mem_mode)
28 {
29  init();
30  m_whitening = do_whitening;
31  m_mode = mode;
32  m_thresh = thresh;
33  m_mem_mode = mem_mode;
34  m_method = method;
35 }
36 
// Convenience constructor: chooses only the method, whitening flag and memory
// mode; m_mode and m_thresh keep the defaults set in init().
// NOTE(review): embedded line 38 (presumably the base-class initializer list)
// is missing from this extraction — confirm against the full source.
37 CPCA::CPCA(EPCAMethod method, bool do_whitening, EPCAMemoryMode mem_mode)
39 {
40  init();
41  m_whitening = do_whitening;
42  m_mem_mode = mem_mode;
43  m_method = method;
44 }
45 
// Initializes default member values and registers all model parameters with
// the Shogun parameter framework (SG_ADD) for serialization/model selection.
// NOTE(review): embedded lines 48-50, 54, 56 and 58 are missing from this
// extraction — presumably the default assignments for m_mode, m_mem_mode and
// m_eigenvalue_zero_tolerance; likewise lines 62 and 67 (the trailing
// MS_NOT_AVAILABLE arguments of two SG_ADD calls). Confirm against full source.
// NOTE(review): the registered parameter name "initalized" (below) is a typo
// for "initialized"; it is a runtime string, so renaming it would break
// serialized-model compatibility — flagging only.
46 void CPCA::init()
47 {
51  num_dim = 0;
52  m_initialized = false;
53  m_whitening = false;
55  m_thresh = 1e-6;
57  m_method = AUTO;
59 
60  SG_ADD(&m_transformation_matrix, "transformation_matrix",
61  "Transformation matrix (Eigenvectors of covariance matrix).",
63  SG_ADD(&m_mean_vector, "mean_vector", "Mean Vector.", MS_NOT_AVAILABLE);
64  SG_ADD(&m_eigenvalues_vector, "eigenvalues_vector",
65  "Vector with Eigenvalues.", MS_NOT_AVAILABLE);
66  SG_ADD(&m_initialized, "initalized", "True when initialized.",
68  SG_ADD(&m_whitening, "whitening", "Whether data shall be whitened.",
69  MS_AVAILABLE);
70  SG_ADD((machine_int_t*) &m_mode, "mode", "PCA Mode.", MS_AVAILABLE);
71  SG_ADD(&m_thresh, "m_thresh", "Cutoff threshold.", MS_AVAILABLE);
72  SG_ADD((machine_int_t*) &m_mem_mode, "m_mem_mode",
73  "Memory mode (in-place or reallocation).", MS_NOT_AVAILABLE);
74  SG_ADD((machine_int_t*) &m_method, "m_method",
75  "Method used for PCA calculation", MS_NOT_AVAILABLE);
76  SG_ADD(&m_eigenvalue_zero_tolerance, "eigenvalue_zero_tolerance", "zero tolerance"
77  " for determining zero eigenvalues during whitening to avoid numerical issues", MS_NOT_AVAILABLE);
78 }
79 
// NOTE(review): the signature for this empty body (embedded line 80) is
// missing from this extraction — presumably the CPCA destructor; confirm
// against the full source.
81 {
82 }
83 
// Fits the PCA model on dense real-valued features: centers the data, computes
// eigenvalues/eigenvectors of the covariance matrix either via EVD (covariance
// eigendecomposition, chosen when N > D) or via SVD of the centered data,
// selects the target dimension according to m_mode, builds the
// transformation matrix, optionally whitens it, then restores the feature
// matrix and marks the preprocessor initialized.
// Returns true on a fresh fit, false if already initialized.
// NOTE(review): this extraction is missing several embedded lines (134, 154,
// 163, 176, 206, 226, 236, 245, 250) — presumably the FIXED_NUMBER
// `num_dim = m_target_dim;` assignments, the THRESHOLD `if` conditions, the
// m_transformation_matrix allocations, and the closing
// `m_eigenvalue_zero_tolerance)` arguments of the fequals_abs calls.
// Confirm against the full source before relying on any detail below.
84 bool CPCA::init(CFeatures* features)
85 {
86  if (!m_initialized)
87  {
88  REQUIRE(features->get_feature_class()==C_DENSE, "PCA only works with dense features")
89  REQUIRE(features->get_feature_type()==F_DREAL, "PCA only works with real features")
90 
91  SGMatrix<float64_t> feature_matrix = ((CDenseFeatures<float64_t>*)features)
92  ->get_feature_matrix();
93  int32_t num_vectors = feature_matrix.num_cols;
94  int32_t num_features = feature_matrix.num_rows;
95  SG_INFO("num_examples: %ld num_features: %ld \n", num_vectors, num_features)
96 
97  // max target dim allowed
98  int32_t max_dim_allowed = CMath::min(num_vectors, num_features);
99  num_dim=0;
100 
101  REQUIRE(m_target_dim<=max_dim_allowed,
102  "target dimension should be less or equal to than minimum of N and D")
103 
104  // center data
105  Map<MatrixXd> fmatrix(feature_matrix.matrix, num_features, num_vectors);
106  m_mean_vector = SGVector<float64_t>(num_features);
107  Map<VectorXd> data_mean(m_mean_vector.vector, num_features);
108  data_mean = fmatrix.rowwise().sum()/(float64_t) num_vectors;
109  fmatrix = fmatrix.colwise()-data_mean;
110 
111  m_eigenvalues_vector = SGVector<float64_t>(max_dim_allowed);
112  Map<VectorXd> eigenValues(m_eigenvalues_vector.vector, max_dim_allowed);
113 
// AUTO picks EVD when there are more samples than features (DxD covariance
// is the smaller problem), otherwise SVD of the data matrix.
114  if (m_method == AUTO)
115  m_method = (num_vectors>num_features) ? EVD : SVD;
116 
117  if (m_method == EVD)
118  {
119  // covariance matrix
120  MatrixXd cov_mat(num_features, num_features);
121  cov_mat = fmatrix*fmatrix.transpose();
122  cov_mat /= (num_vectors-1);
123 
124  SG_INFO("Computing Eigenvalues ... ")
// SelfAdjointEigenSolver returns eigenvalues in INCREASING order, so the
// largest ones sit at the tail of the spectrum.
125  // eigen value computed
126  SelfAdjointEigenSolver<MatrixXd> eigenSolve =
127  SelfAdjointEigenSolver<MatrixXd>(cov_mat);
128  eigenValues = eigenSolve.eigenvalues().tail(max_dim_allowed);
129 
130  // target dimension
131  switch (m_mode)
132  {
133  case FIXED_NUMBER :
// NOTE(review): embedded line 134 (presumably `num_dim = m_target_dim;`)
// is missing from this extraction.
135  break;
136 
137  case VARIANCE_EXPLAINED :
138  {
139  float64_t eig_sum = eigenValues.sum();
140  float64_t com_sum = 0;
// NOTE(review): the condition `i<-1` with `i++` never executes for
// num_features>=1 — almost certainly `i>-1; i--` in the real source
// (largest-to-smallest scan), mangled by HTML `&gt;` escaping. Verify.
141  for (int32_t i=num_features-1; i<-1; i++)
142  {
143  num_dim++;
144  com_sum += m_eigenvalues_vector.vector[i];
145  if (com_sum/eig_sum>=m_thresh)
146  break;
147  }
148  }
149  break;
150 
151  case THRESHOLD :
// NOTE(review): same mangled loop header as above; embedded line 154
// (presumably `if (eigenValues[i]>m_thresh)`) is also missing here.
152  for (int32_t i=num_features-1; i<-1; i++)
153  {
155  num_dim++;
156  else
157  break;
158  }
159  break;
160  };
161  SG_INFO("Done\nReducing from %i to %i features..", num_features, num_dim)
162 
// NOTE(review): embedded line 163 (presumably the allocation
// `m_transformation_matrix = SGMatrix<float64_t>(num_features, num_dim);`)
// is missing from this extraction.
164  Map<MatrixXd> transformMatrix(m_transformation_matrix.matrix,
165  num_features, num_dim);
166  num_old_dim = num_features;
167 
// Keep the LAST num_dim eigenvector columns — those belong to the
// largest eigenvalues given the solver's increasing ordering.
168  // eigenvector matrix
169  transformMatrix = eigenSolve.eigenvectors().block(0,
170  num_features-num_dim, num_features,num_dim);
171  if (m_whitening)
172  {
173  for (int32_t i=0; i<num_dim; i++)
174  {
// NOTE(review): embedded line 176 (presumably the trailing
// `m_eigenvalue_zero_tolerance))` argument) is missing here.
175  if (CMath::fequals_abs<float64_t>(0.0, eigenValues[i+max_dim_allowed-num_dim],
177  {
// A near-zero eigenvalue would divide by ~0 below; zero the column
// instead and warn the user.
178  SG_WARNING("Covariance matrix has almost zero Eigenvalue (ie "
179  "Eigenvalue within a tolerance of %E around 0) at "
180  "dimension %d. Consider reducing its dimension.",
181  m_eigenvalue_zero_tolerance, i+max_dim_allowed-num_dim+1)
182 
183  transformMatrix.col(i) = MatrixXd::Zero(num_features,1);
184  continue;
185  }
186 
// Whitening: scale each component to unit variance.
187  transformMatrix.col(i) /=
188  CMath::sqrt(eigenValues[i+max_dim_allowed-num_dim]*(num_vectors-1));
189  }
190  }
191  }
192 
193  else
194  {
195  // compute SVD of data matrix
196  JacobiSVD<MatrixXd> svd(fmatrix.transpose(), ComputeThinU | ComputeThinV);
197 
// Singular values of the centered data relate to covariance eigenvalues
// by lambda_i = s_i^2 / (N-1); they come out in DECREASING order here.
198  // compute non-negative eigen values from singular values
199  eigenValues = svd.singularValues();
200  eigenValues = eigenValues.cwiseProduct(eigenValues)/(num_vectors-1);
201 
202  // target dimension
203  switch (m_mode)
204  {
205  case FIXED_NUMBER :
// NOTE(review): embedded line 206 (presumably `num_dim = m_target_dim;`)
// is missing from this extraction.
207  break;
208 
209  case VARIANCE_EXPLAINED :
210  {
211  float64_t eig_sum = eigenValues.sum();
212  float64_t com_sum = 0;
213  for (int32_t i=0; i<num_features; i++)
214  {
215  num_dim++;
216  com_sum += m_eigenvalues_vector.vector[i];
217  if (com_sum/eig_sum>=m_thresh)
218  break;
219  }
220  }
221  break;
222 
223  case THRESHOLD :
// NOTE(review): embedded line 226 (presumably
// `if (eigenValues[i]>m_thresh)`) is missing from this extraction.
224  for (int32_t i=0; i<num_features; i++)
225  {
227  num_dim++;
228  else
229  break;
230  }
231  break;
232  };
233  SG_INFO("Done\nReducing from %i to %i features..", num_features, num_dim)
234 
// NOTE(review): embedded line 236 (presumably the allocation of
// m_transformation_matrix) is missing from this extraction.
235  // right singular vectors form eigenvectors
237  Map<MatrixXd> transformMatrix(m_transformation_matrix.matrix, num_features, num_dim);
238  num_old_dim = num_features;
239  transformMatrix = svd.matrixV().block(0, 0, num_features, num_dim);
240  if (m_whitening)
241  {
242  for (int32_t i=0; i<num_dim; i++)
243  {
// NOTE(review): embedded lines 245 and 250 (the trailing
// `m_eigenvalue_zero_tolerance))` argument and the SG_WARNING
// argument list) are missing from this extraction.
244  if (CMath::fequals_abs<float64_t>(0.0, eigenValues[i],
246  {
247  SG_WARNING("Covariance matrix has almost zero Eigenvalue (ie "
248  "Eigenvalue within a tolerance of %E around 0) at "
249  "dimension %d. Consider reducing its dimension.",
251 
252  transformMatrix.col(i) = MatrixXd::Zero(num_features,1);
253  continue;
254  }
255 
256  transformMatrix.col(i) /= CMath::sqrt(eigenValues[i]*(num_vectors-1));
257  }
258  }
259  }
260 
// Undo the in-place centering so the caller's feature matrix is unchanged.
261  // restore feature matrix
262  fmatrix = fmatrix.colwise()+data_mean;
263  m_initialized = true;
264  return true;
265  }
266 
267  return false;
268 }
269 
// NOTE(review): the signature for this body (embedded lines 269-274) is
// missing from this extraction — presumably CPCA::cleanup(), which resets the
// fitted flag so init() can be run again; confirm against the full source.
271 {
275  m_initialized = false;
276 }
277 
// NOTE(review): the signature for this body (embedded line 278) is missing
// from this extraction — presumably
// SGMatrix<float64_t> CPCA::apply_to_feature_matrix(CFeatures* features);
// confirm against the full source.
// Projects a dense feature matrix onto the fitted PCA subspace. In
// MEM_IN_PLACE mode the input matrix storage is reused and repacked to
// num_dim rows; otherwise a fresh (num_dim x num_vectors) matrix is
// allocated and returned.
// NOTE(review): both branches recompute the mean from the incoming matrix
// instead of using the stored training mean m_mean_vector — this matches the
// visible code but looks inconsistent with apply_to_feature_vector; verify
// against upstream intent.
279 {
281  ASSERT(features != NULL)
282  SGMatrix<float64_t> m = ((CDenseFeatures<float64_t>*) features)->get_feature_matrix();
283  int32_t num_vectors = m.num_cols;
284  int32_t num_features = m.num_rows;
285 
286  SG_INFO("Transforming feature matrix\n")
// NOTE(review): embedded line 288 (the Map's dimension arguments, presumably
// `num_features, num_dim);`) is missing from this extraction.
287  Map<MatrixXd> transform_matrix(m_transformation_matrix.matrix,
289 
290  if (m_mem_mode == MEM_IN_PLACE)
291  {
292  if (m.matrix)
293  {
294  SG_INFO("Preprocessing feature matrix\n")
295  Map<MatrixXd> feature_matrix(m.matrix, num_features, num_vectors);
296  VectorXd data_mean = feature_matrix.rowwise().sum()/(float64_t) num_vectors;
297  feature_matrix = feature_matrix.colwise()-data_mean;
298 
// Overwrite the top num_dim rows with the projected data, then compact the
// buffer column-by-column to the new (num_dim x num_vectors) layout.
299  feature_matrix.block(0,0,num_dim,num_vectors) =
300  transform_matrix.transpose()*feature_matrix;
301 
302  SG_INFO("Form matrix of target dimension")
303  for (int32_t col=0; col<num_vectors; col++)
304  {
305  for (int32_t row=0; row<num_dim; row++)
306  m.matrix[col*num_dim+row] = feature_matrix(row,col);
307  }
308  m.num_rows = num_dim;
309  m.num_cols = num_vectors;
310  }
311 
312  ((CDenseFeatures<float64_t>*) features)->set_feature_matrix(m);
313  return m;
314  }
315  else
316  {
// Reallocation mode: project into a newly allocated matrix.
317  SGMatrix<float64_t> ret(num_dim, num_vectors);
318  Map<MatrixXd> ret_matrix(ret.matrix, num_dim, num_vectors);
319  if (m.matrix)
320  {
321  SG_INFO("Preprocessing feature matrix\n")
322  Map<MatrixXd> feature_matrix(m.matrix, num_features, num_vectors);
323  VectorXd data_mean = feature_matrix.rowwise().sum()/(float64_t) num_vectors;
324  feature_matrix = feature_matrix.colwise()-data_mean;
325 
326  ret_matrix = transform_matrix.transpose()*feature_matrix;
327  }
328  ((CDenseFeatures<float64_t>*) features)->set_feature_matrix(ret);
329  return ret;
330  }
331 }
332 
// NOTE(review): the signature for this body (embedded line 333) is missing
// from this extraction — presumably
// SGVector<float64_t> CPCA::apply_to_feature_vector(SGVector<float64_t> vector);
// embedded line 335 (presumably `SGVector<float64_t> result(num_dim);`) is
// missing too; confirm against the full source.
// Projects a single vector: subtracts the stored training mean, applies the
// transformation matrix, and restores the input vector before returning.
334 {
336  Map<VectorXd> resultVec(result.vector, num_dim);
337  Map<VectorXd> inputVec(vector.vector, vector.vlen);
338 
339  Map<VectorXd> mean(m_mean_vector.vector, m_mean_vector.vlen);
// NOTE(review): embedded line 341 (the Map's dimension arguments) is missing
// from this extraction.
340  Map<MatrixXd> transformMat(m_transformation_matrix.matrix,
342 
// Center in place, project, then add the mean back so the caller's vector
// is left unmodified.
343  inputVec = inputVec-mean;
344  resultVec = transformMat.transpose()*inputVec;
345  inputVec = inputVec+mean;
346 
347  return result;
348 }
349 
// NOTE(review): both the signature (embedded line 350, presumably
// SGMatrix<float64_t> CPCA::get_transformation_matrix()) and the body's
// return statement (embedded line 352) are missing from this extraction —
// confirm against the full source.
351 {
353 }
354 
// NOTE(review): the signature (embedded line 355) is missing from this
// extraction — presumably SGVector<float64_t> CPCA::get_eigenvalues().
// Returns the eigenvalue vector populated by init(CFeatures*).
356 {
357  return m_eigenvalues_vector;
358 }
359 
// NOTE(review): the signature (embedded line 360) is missing from this
// extraction — presumably SGVector<float64_t> CPCA::get_mean().
// Returns the per-feature mean vector computed during fitting.
361 {
362  return m_mean_vector;
363 }
364 
// NOTE(review): the signature (embedded line 365) is missing from this
// extraction — presumably EPCAMemoryMode CPCA::get_memory_mode() const.
// Returns the configured memory mode (in-place vs reallocation).
366 {
367  return m_mem_mode;
368 }
369 
// NOTE(review): the signature (embedded line 370) is missing from this
// extraction — presumably void CPCA::set_memory_mode(EPCAMemoryMode e).
// Sets the memory mode used by apply_to_feature_matrix.
371 {
372  m_mem_mode = e;
373 }
374 
375 void CPCA::set_eigenvalue_zero_tolerance(float64_t eigenvalue_zero_tolerance)
376 {
377  m_eigenvalue_zero_tolerance = eigenvalue_zero_tolerance;
378 }
379 
// NOTE(review): both the signature (embedded line 380, presumably
// float64_t CPCA::get_eigenvalue_zero_tolerance() const) and the return
// statement (embedded line 382, presumably
// `return m_eigenvalue_zero_tolerance;`) are missing from this extraction —
// confirm against the full source.
381 {
383 }
384 
385 #endif // HAVE_EIGEN3

SHOGUN Machine Learning Toolbox - Documentation