SHOGUN  5.0.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules
Statistics.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2014 Wu Lin
8  * Written (W) 2011-2016 Heiko Strathmann
9  * Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society
10  *
11  * Most cdf routines are wrappers for CDFLIB, part of the public domain NETLIB.
12  * https://people.sc.fsu.edu/~jburkardt/f_src/cdflib/cdflib.html
13  */
14 
15 #include <cmath>
18 #include <shogun/lib/SGMatrix.h>
19 #include <shogun/lib/SGVector.h>
22 #include <shogun/lib/external/cdflib.hpp>
24 
25 using namespace Eigen;
26 
27 using namespace shogun;
28 
29 float64_t CStatistics::variance(SGVector<float64_t> values)
30 {
31  REQUIRE(values.vlen>1, "Number of observations (%d) needs to be at least 1.\n",
32  values.vlen);
33 
34  float64_t mean=CStatistics::mean(values);
35 
36  float64_t sum_squared_diff=0;
37  for (index_t i=0; i<values.vlen; ++i)
38  sum_squared_diff+=CMath::pow(values.vector[i]-mean, 2);
39 
40  return sum_squared_diff/(values.vlen-1);
41 }
42 
43 SGVector<float64_t> CStatistics::matrix_mean(SGMatrix<float64_t> values,
44  bool col_wise)
45 {
46  ASSERT(values.num_rows>0)
47  ASSERT(values.num_cols>0)
48  ASSERT(values.matrix)
49 
50  SGVector<float64_t> result;
51 
52  if (col_wise)
53  {
54  result=SGVector<float64_t>(values.num_cols);
55  for (index_t j=0; j<values.num_cols; ++j)
56  {
57  result[j]=0;
58  for (index_t i=0; i<values.num_rows; ++i)
59  result[j]+=values(i,j);
60 
61  result[j]/=values.num_rows;
62  }
63  }
64  else
65  {
66  result=SGVector<float64_t>(values.num_rows);
67  for (index_t j=0; j<values.num_rows; ++j)
68  {
69  result[j]=0;
70  for (index_t i=0; i<values.num_cols; ++i)
71  result[j]+=values(j,i);
72 
73  result[j]/=values.num_cols;
74  }
75  }
76 
77  return result;
78 }
79 
80 SGVector<float64_t> CStatistics::matrix_variance(SGMatrix<float64_t> values,
81  bool col_wise)
82 {
83  ASSERT(values.num_rows>0)
84  ASSERT(values.num_cols>0)
85  ASSERT(values.matrix)
86 
87  /* first compute mean */
88  SGVector<float64_t> mean=CStatistics::matrix_mean(values, col_wise);
89 
90  SGVector<float64_t> result;
91 
92  if (col_wise)
93  {
94  result=SGVector<float64_t>(values.num_cols);
95  for (index_t j=0; j<values.num_cols; ++j)
96  {
97  result[j]=0;
98  for (index_t i=0; i<values.num_rows; ++i)
99  result[j]+=CMath::pow(values(i,j)-mean[j], 2);
100 
101  result[j]/=(values.num_rows-1);
102  }
103  }
104  else
105  {
106  result=SGVector<float64_t>(values.num_rows);
107  for (index_t j=0; j<values.num_rows; ++j)
108  {
109  result[j]=0;
110  for (index_t i=0; i<values.num_cols; ++i)
111  result[j]+=CMath::pow(values(j,i)-mean[j], 2);
112 
113  result[j]/=(values.num_cols-1);
114  }
115  }
116 
117  return result;
118 }
119 
120 float64_t CStatistics::std_deviation(SGVector<float64_t> values)
121 {
122  return CMath::sqrt(variance(values));
123 }
124 
125 SGVector<float64_t> CStatistics::matrix_std_deviation(
126  SGMatrix<float64_t> values, bool col_wise)
127 {
128  SGVector<float64_t> var=CStatistics::matrix_variance(values, col_wise);
129  for (index_t i=0; i<var.vlen; ++i)
130  var[i]=CMath::sqrt(var[i]);
131 
132  return var;
133 }
134 
135 SGMatrix<float64_t> CStatistics::covariance_matrix(
136  SGMatrix<float64_t> observations, bool in_place)
137 {
138  int32_t D = observations.num_rows;
139  int32_t N = observations.num_cols;
140  SG_SDEBUG("%d observations in %d dimensions\n", N, D)
141 
142  REQUIRE(N>1, "Number of observations (%d) must be at least 2.\n", N);
143  REQUIRE(D>0, "Number of dimensions (%d) must be at least 1.\n", D);
144 
145  SGMatrix<float64_t> centered=in_place ? observations :
146  SGMatrix<float64_t>(D, N);
147 
148  /* center observations, potentially in-place */
149  if (!in_place)
150  {
151  int64_t num_elements = N*D;
152  memcpy(centered.matrix, observations.matrix,
153  sizeof(float64_t)*num_elements);
154  }
155  SG_SDEBUG("Centering observations\n");
156  Map<MatrixXd> eigen_centered(centered.matrix, D, N);
157  eigen_centered.colwise() -= eigen_centered.rowwise().mean();
158 
159  /* compute and store 1/(N-1) * X * X.T */
160  SG_SDEBUG("Computing squared differences\n");
161  SGMatrix<float64_t> cov(D, D);
162  Map<MatrixXd> eigen_cov(cov.matrix, D, D);
163  eigen_cov = (eigen_centered * eigen_centered.adjoint()) / double(N - 1);
164 
165  return cov;
166 }
167 
168 SGVector<float64_t> CStatistics::fishers_exact_test_for_multiple_2x3_tables(
169  SGMatrix<float64_t> tables)
170 {
171  SGMatrix<float64_t> table(NULL, 2, 3, false);
172  int32_t len=tables.num_cols/3;
173 
174  SGVector<float64_t> v(len);
175  for (int32_t i=0; i<len; i++)
176  {
177  table.matrix=&tables.matrix[2*3*i];
178  v.vector[i]=fishers_exact_test_for_2x3_table(table);
179  }
180  return v;
181 }
182 
183 float64_t CStatistics::fishers_exact_test_for_2x3_table(
184  SGMatrix<float64_t> table)
185 {
186  ASSERT(table.num_rows==2)
187  ASSERT(table.num_cols==3)
188 
189  int32_t m_len=3+2;
190  float64_t* m=SG_MALLOC(float64_t, 3+2);
191  m[0]=table.matrix[0]+table.matrix[2]+table.matrix[4];
192  m[1]=table.matrix[1]+table.matrix[3]+table.matrix[5];
193  m[2]=table.matrix[0]+table.matrix[1];
194  m[3]=table.matrix[2]+table.matrix[3];
195  m[4]=table.matrix[4]+table.matrix[5];
196 
197  float64_t n=SGVector<float64_t>::sum(m, m_len)/2.0;
198  int32_t x_len=2*3*CMath::sq(CMath::max(m, m_len));
199  float64_t* x=SG_MALLOC(float64_t, x_len);
200  SGVector<float64_t>::fill_vector(x, x_len, 0.0);
201 
202  float64_t log_nom=0.0;
203  for (int32_t i=0; i<3+2; i++)
204  log_nom+=lgamma(m[i]+1);
205  log_nom-=lgamma(n+1.0);
206 
207  float64_t log_denomf=0;
208  floatmax_t log_denom=0;
209 
210  for (int32_t i=0; i<3*2; i++)
211  {
212  log_denom+=lgammal((floatmax_t)table.matrix[i]+1);
213  log_denomf+=lgammal((floatmax_t)table.matrix[i]+1);
214  }
215 
216  floatmax_t prob_table_log=log_nom-log_denom;
217 
218  int32_t dim1=CMath::min(m[0], m[2]);
219 
220  //traverse all possible tables with given m
221  int32_t counter=0;
222  for (int32_t k=0; k<=dim1; k++)
223  {
224  for (int32_t l=CMath::max(0.0, m[0]-m[4]-k);
225  l<=CMath::min(m[0]-k, m[3]); l++)
226  {
227  x[0+0*2+counter*2*3]=k;
228  x[0+1*2+counter*2*3]=l;
229  x[0+2*2+counter*2*3]=m[0]-x[0+0*2+counter*2*3]-x[0+1*2+counter*2*3];
230  x[1+0*2+counter*2*3]=m[2]-x[0+0*2+counter*2*3];
231  x[1+1*2+counter*2*3]=m[3]-x[0+1*2+counter*2*3];
232  x[1+2*2+counter*2*3]=m[4]-x[0+2*2+counter*2*3];
233 
234  counter++;
235  }
236  }
237 
238 //#define DEBUG_FISHER_TABLE
239 #ifdef DEBUG_FISHER_TABLE
240  SG_SPRINT("counter=%d\n", counter)
241  SG_SPRINT("dim1=%d\n", dim1)
242  SG_SPRINT("l=%g...%g\n", CMath::max(0.0,m[0]-m[4]-0), CMath::min(m[0]-0, m[3]))
243  SG_SPRINT("n=%g\n", n)
244  SG_SPRINT("prob_table_log=%.18Lg\n", prob_table_log)
245  SG_SPRINT("log_denomf=%.18g\n", log_denomf)
246  SG_SPRINT("log_denom=%.18Lg\n", log_denom)
247  SG_SPRINT("log_nom=%.18g\n", log_nom)
248  display_vector(m, m_len, "marginals");
249  display_vector(x, 2*3*counter, "x");
250 #endif // DEBUG_FISHER_TABLE
251 
252  floatmax_t* log_denom_vec=SG_MALLOC(floatmax_t, counter);
253  SGVector<floatmax_t>::fill_vector(log_denom_vec, counter, (floatmax_t)0.0);
254 
255  for (int32_t k=0; k<counter; k++)
256  {
257  for (int32_t j=0; j<3; j++)
258  {
259  for (int32_t i=0; i<2; i++)
260  log_denom_vec[k]+=lgammal(x[i+j*2+k*2*3]+1.0);
261  }
262  }
263 
264  for (int32_t i=0; i<counter; i++)
265  log_denom_vec[i]=log_nom-log_denom_vec[i];
266 
267 #ifdef DEBUG_FISHER_TABLE
268  display_vector(log_denom_vec, counter, "log_denom_vec");
269 #endif // DEBUG_FISHER_TABLE
270 
271  float64_t nonrand_p=-CMath::INFTY;
272  for (int32_t i=0; i<counter; i++)
273  {
274  if (log_denom_vec[i]<=prob_table_log)
275  nonrand_p=CMath::logarithmic_sum(nonrand_p, log_denom_vec[i]);
276  }
277 
278 #ifdef DEBUG_FISHER_TABLE
279  SG_SPRINT("nonrand_p=%.18g\n", nonrand_p)
280  SG_SPRINT("exp_nonrand_p=%.18g\n", CMath::exp(nonrand_p))
281 #endif // DEBUG_FISHER_TABLE
282  nonrand_p=CMath::exp(nonrand_p);
283 
284  SG_FREE(log_denom_vec);
285  SG_FREE(x);
286  SG_FREE(m);
287 
288  return nonrand_p;
289 }
290 
291 float64_t CStatistics::mutual_info(float64_t* p1, float64_t* p2, int32_t len)
292 {
293  double e=0;
294 
295  for (int32_t i=0; i<len; i++)
296  for (int32_t j=0; j<len; j++)
297  e+=exp(p2[j*len+i])*(p2[j*len+i]-p1[i]-p1[j]);
298 
299  return (float64_t)e;
300 }
301 
302 float64_t CStatistics::relative_entropy(float64_t* p, float64_t* q, int32_t len)
303 {
304  double e=0;
305 
306  for (int32_t i=0; i<len; i++)
307  e+=exp(p[i])*(p[i]-q[i]);
308 
309  return (float64_t)e;
310 }
311 
312 float64_t CStatistics::entropy(float64_t* p, int32_t len)
313 {
314  double e=0;
315 
316  for (int32_t i=0; i<len; i++)
317  e-=exp(p[i])*p[i];
318 
319  return (float64_t)e;
320 }
321 
322 SGVector<int32_t> CStatistics::sample_indices(int32_t sample_size, int32_t N)
323 {
324  REQUIRE(sample_size<N,
325  "sample size should be less than number of indices\n");
326  int32_t* idxs=SG_MALLOC(int32_t,N);
327  int32_t i, rnd;
328  int32_t* permuted_idxs=SG_MALLOC(int32_t,sample_size);
329 
330  // reservoir sampling
331  for (i=0; i<N; i++)
332  idxs[i]=i;
333  for (i=0; i<sample_size; i++)
334  permuted_idxs[i]=idxs[i];
335  for (i=sample_size; i<N; i++)
336  {
337  rnd=CMath::random(1, i);
338  if (rnd<sample_size)
339  permuted_idxs[rnd]=idxs[i];
340  }
341  SG_FREE(idxs);
342 
343  SGVector<int32_t> result=SGVector<int32_t>(permuted_idxs, sample_size);
344  CMath::qsort(result);
345  return result;
346 }
347 
348 float64_t CStatistics::inverse_normal_cdf(float64_t p, float64_t mean,
349  float64_t std_deviation)
350 {
351  REQUIRE(p>=0, "p (%f); must be greater or equal to 0.\n", p);
352  REQUIRE(p<=1, "p (%f); must be greater or equal to 1.\n", p);
353  REQUIRE(std_deviation>0, "Standard deviation (%f); must be positive\n",
354  std_deviation);
355 
356  // invserse normal cdf case, see cdflib.cpp for details
357  int which=2;
358  float64_t output_x;
359  float64_t q=1-p;
360  float64_t output_bound;
361  int output_status;
362 
363  cdfnor(&which, &p, &q, &output_x, &mean, &std_deviation, &output_status, &output_bound);
364 
365  if (output_status!=0)
366  SG_SERROR("Error %d while calling cdflib::cdfnor\n", output_status);
367 
368  return output_x;
369 
370  //void cdfnor ( int *which, double *p, double *q, double *x, double *mean,
371  // double *sd, int *status, double *bound )
372 }
373 
374 float64_t CStatistics::chi2_cdf(float64_t x, float64_t k)
375 {
376  REQUIRE(x>=0, "x (%f) has to be greater or equal to 0.\n", x);
377  REQUIRE(k>0, "Degrees of freedom (%f) has to be positive.\n", k);
378 
379  // chi2 cdf case, see cdflib.cpp for details
380  int which=1;
381  float64_t df=k;
382  float64_t output_q;
383  float64_t output_p;
384  float64_t output_bound;
385  int output_status;
386 
387  cdfchi(&which, &output_p, &output_q, &x, &df, &output_status, &output_bound);
388 
389  if (output_status!=0)
390  SG_SERROR("Error %d while calling cdflib::cdfchi\n", output_status);
391 
392  return output_p;
393 }
394 
395 float64_t CStatistics::gamma_cdf(float64_t x, float64_t a, float64_t b)
396 {
397  REQUIRE(x>=0, "x (%f) has to be greater or equal to 0.\n", x);
398  REQUIRE(a>=0, "a (%f) has to be greater or equal to 0.\n", a);
399  REQUIRE(b>=0, "b (%f) has to be greater or equal to 0.\n", b);
400 
401  // inverse gamma cdf case, see cdflib.cpp for details
402  float64_t shape=a;
403  float64_t scale=b;
404  int which=1;
405  float64_t output_p;
406  float64_t output_q;
407  float64_t output_bound;
408  int output_error_code;
409 
410  cdfgam(&which, &output_p, &output_q, &x, &shape, &scale, &output_error_code, &output_bound);
411 
412  if (output_error_code!=0)
413  SG_SERROR("Error %d while calling cdflib::cdfgam\n", output_error_code);
414 
415  return output_p;
416 }
417 
418 float64_t CStatistics::lnormal_cdf(float64_t x)
419 {
420  /* Loosely based on logphi.m in
421  * Gaussian Process Machine Learning Toolbox file logphi.m
422  * http://www.gaussianprocess.org/gpml/code/matlab/doc/
423  * Under FreeBSD license
424  */
425 
426  const float64_t sqrt_of_2=1.41421356237309514547;
427  const float64_t log_of_2=0.69314718055994528623;
428  const float64_t sqrt_of_pi=1.77245385090551588192;
429 
430  const index_t c_len=14;
431  static float64_t c_array[c_len]=
432  {
433  0.00048204,
434  -0.00142906,
435  0.0013200243174,
436  0.0009461589032,
437  -0.0045563339802,
438  0.00556964649138,
439  0.00125993961762116,
440  -0.01621575378835404,
441  0.02629651521057465,
442  -0.001829764677455021,
443  2.0*(1.0-CMath::PI/3.0),
444  (4.0-CMath::PI)/3.0,
445  1.0,
446  1.0
447  };
448 
449  if (x*x<ERFC_CASE1)
450  {
451  float64_t f = 0.0;
452  float64_t lp0 = -x/(sqrt_of_2*sqrt_of_pi);
453  for (index_t i=0; i<c_len; i++)
454  f=lp0*(c_array[i]+f);
455  return -2.0*f-log_of_2;
456  }
457  else if (x<ERFC_CASE2)
458  return CMath::log(erfc8_weighted_sum(x))-log_of_2-x*x*0.5;
459 
460  //id3 = ~id2 & ~id1; lp(id3) = log(erfc(-z(id3)/sqrt(2))/2);
461  return CMath::log(normal_cdf(x));
462 }
463 
464 float64_t CStatistics::erfc8_weighted_sum(float64_t x)
465 {
466  /* This is based on index 5725 in Hart et al */
467 
468  const float64_t sqrt_of_2=1.41421356237309514547;
469 
470  static float64_t P[]=
471  {
472  0.5641895835477550741253201704,
473  1.275366644729965952479585264,
474  5.019049726784267463450058,
475  6.1602098531096305440906,
476  7.409740605964741794425,
477  2.97886562639399288862
478  };
479 
480  static float64_t Q[]=
481  {
482  1.0,
483  2.260528520767326969591866945,
484  9.396034016235054150430579648,
485  12.0489519278551290360340491,
486  17.08144074746600431571095,
487  9.608965327192787870698,
488  3.3690752069827527677
489  };
490 
491  float64_t num=0.0, den=0.0;
492 
493  num = P[0];
494  for (index_t i=1; i<6; i++)
495  {
496  num=-x*num/sqrt_of_2+P[i];
497  }
498 
499  den = Q[0];
500  for (index_t i=1; i<7; i++)
501  {
502  den=-x*den/sqrt_of_2+Q[i];
503  }
504 
505  return num/den;
506 }
507 
508 float64_t CStatistics::normal_cdf(float64_t x, float64_t std_dev)
509 {
510  return 0.5*(erfc(-x*M_SQRT1_2/std_dev));
511 }
512 
513 float64_t CStatistics::gamma_inverse_cdf(float64_t p, float64_t a,
514  float64_t b)
515 {
516  REQUIRE(p>=0, "p (%f) has to be greater or equal to 0.\n", p);
517  REQUIRE(a>=0, "a (%f) has to be greater or equal to 0.\n", a);
518  REQUIRE(b>=0, "b (%f) has to be greater or equal to 0.\n", b);
519 
520  float64_t shape=a;
521  float64_t scale=b;
522  float64_t q = 1-p;
523  int which=2;
524  float64_t output_x=0;
525  float64_t output_bound;
526  int output_error_code=0;
527 
528  // inverse gamma cdf case, see cdflib.cpp for details
529  cdfgam(&which, &p, &q, &output_x, &shape, &scale, &output_error_code, &output_bound);
530 
531  if (output_error_code!=0)
532  SG_SERROR("Error %d while calling cdflib::beta_inc\n", output_error_code);
533 
534  return output_x;
535 }
536 
537 float64_t CStatistics::fdistribution_cdf(float64_t x, float64_t d1, float64_t d2)
538 {
539  REQUIRE(x>=0, "x (%f) has to be greater or equal to 0.\n", x);
540  REQUIRE(d1>0, "d1 (%f) has to be positive.\n", d1);
541  REQUIRE(d2>0, "d2 (%f) has to be positive.\n", d2);
542 
543  // fcdf case, see cdflib.cpp for details
544  int which=1;
545  float64_t output_p;
546  float64_t output_q;
547  float64_t output_bound;
548  int output_error_code;
549 
550  cdff(&which, &output_p, &output_q, &x, &d1, &d2, &output_error_code, &output_bound);
551 
552  if (output_error_code!=0)
553  SG_SERROR("Error %d while calling cdflib::cdff\n", output_error_code);
554 
555  return output_p;
556 }
557 
558 float64_t CStatistics::dlgamma(float64_t x)
559 {
560  float64_t result=0.0;
561 
562  if (x<0.0)
563  {
564  // use reflection formula
565  x=1.0-x;
566  result=CMath::PI/CMath::tan(CMath::PI*x);
567  }
568 
569  // make x>7 for approximation
570  // (use reccurent formula: psi(x+1) = psi(x) + 1/x)
571  while (x<=7.0)
572  {
573  result-=1.0/x;
574  x++;
575  }
576 
577  // perform approximation
578  x-=0.5;
579  result+=log(x);
580 
581  float64_t coeff[10]={
582  0.04166666666666666667,
583  -0.00729166666666666667,
584  0.00384424603174603175,
585  -0.00413411458333333333,
586  0.00756096117424242424,
587  -0.02108249687595390720,
588  0.08332316080729166666,
589  -0.44324627670587277880,
590  3.05393103044765369366,
591  -26.45616165999210241989};
592 
593  float64_t power=1.0;
594  float64_t ix2=1.0/CMath::sq(x);
595 
596  // perform approximation
597  for (index_t i=0; i<10; i++)
598  {
599  power*=ix2;
600  result+=coeff[i]*power;
601  }
602 
603  return result;
604 }
605 
606 float64_t CStatistics::log_det_general(const SGMatrix<float64_t> A)
607 {
608  Map<MatrixXd> eigen_A(A.matrix, A.num_rows, A.num_cols);
609  REQUIRE(eigen_A.rows()==eigen_A.cols(),
610  "Input matrix should be a sqaure matrix row(%d) col(%d)\n",
611  eigen_A.rows(), eigen_A.cols());
612 
613  PartialPivLU<MatrixXd> lu(eigen_A);
614  VectorXd tmp(eigen_A.rows());
615 
616  for (index_t idx=0; idx<tmp.rows(); idx++)
617  tmp[idx]=idx+1;
618 
619  VectorXd p=lu.permutationP()*tmp;
620  int32_t detP=1;
621 
622  for (index_t idx=0; idx<p.rows(); idx++)
623  {
624  if (p[idx]!=idx+1)
625  {
626  detP*=-1;
627  index_t j=idx+1;
628  while(j<p.rows())
629  {
630  if (p[j]==idx+1)
631  break;
632  j++;
633  }
634  p[j]=p[idx];
635  }
636  }
637 
638  VectorXd u=lu.matrixLU().diagonal();
639  int32_t check_u=1;
640 
641  for (index_t idx=0; idx<u.rows(); idx++)
642  {
643  if (u[idx]<0)
644  check_u*=-1;
645  else if (u[idx]==0)
646  {
647  check_u=0;
648  break;
649  }
650  }
651 
652  float64_t result=CMath::INFTY;
653 
654  if (check_u==detP)
655  result=u.array().abs().log().sum();
656 
657  return result;
658 }
659 
660 float64_t CStatistics::log_det(SGMatrix<float64_t> m)
661 {
662  /* map the matrix to eigen3 to perform cholesky */
664 
665  /* computing the cholesky decomposition */
666  LLT<MatrixXd> llt;
667  llt.compute(M);
668 
669  /* the lower triangular matrix */
670  MatrixXd l = llt.matrixL();
671 
672  /* calculate the log-determinant */
673  VectorXd diag = l.diagonal();
674  float64_t retval = 0.0;
675  for( int32_t i = 0; i < diag.rows(); ++i ) {
676  retval += log(diag(i));
677  }
678  retval *= 2;
679 
680  return retval;
681 }
682 
683 float64_t CStatistics::log_det(const SGSparseMatrix<float64_t> m)
684 {
685  typedef SparseMatrix<float64_t> MatrixType;
686  const MatrixType &M=EigenSparseUtil<float64_t>::toEigenSparse(m);
687 
688  SimplicialLLT<MatrixType> llt;
689 
690  // factorize using cholesky with amd permutation
691  llt.compute(M);
692  MatrixType L=llt.matrixL();
693 
694  // calculate the log-determinant
695  float64_t retval=0.0;
696  for( index_t i=0; i<M.rows(); ++i )
697  retval+=log(L.coeff(i,i));
698  retval*=2;
699 
700  return retval;
701 }
702 
703 SGMatrix<float64_t> CStatistics::sample_from_gaussian(SGVector<float64_t> mean,
704  SGMatrix<float64_t> cov, int32_t N, bool precision_matrix)
705 {
706  REQUIRE(cov.num_rows>0, "Number of covariance rows must be positive!\n");
707  REQUIRE(cov.num_cols>0,"Number of covariance cols must be positive!\n");
708  REQUIRE(cov.matrix, "Covariance is not initialized!\n");
709  REQUIRE(cov.num_rows==cov.num_cols, "Covariance should be square matrix!\n");
710  REQUIRE(mean.vlen==cov.num_rows, "Mean and covariance dimension mismatch!\n");
711 
712  int32_t dim=mean.vlen;
713  Map<VectorXd> mu(mean.vector, mean.vlen);
714  Map<MatrixXd> c(cov.matrix, cov.num_rows, cov.num_cols);
715 
716  // generate samples, z, from N(0, I), DxN
717  SGMatrix<float64_t> S(dim, N);
718  for( int32_t j=0; j<N; ++j )
719  for( int32_t i=0; i<dim; ++i )
720  S(i,j)=CMath::randn_double();
721 
722  // the cholesky factorization c=L*U
723  MatrixXd U=c.llt().matrixU();
724 
725  // generate samples, x, from N(mean, cov) or N(mean, cov^-1)
726  // return samples of dimension NxD
727  if( precision_matrix )
728  {
729  // here we have U*x=z, to solve this, we use cholesky again
731  LDLT<MatrixXd> ldlt;
732  ldlt.compute(U);
733  s=ldlt.solve(s);
734  }
735 
737 
738  if( !precision_matrix )
739  {
740  // here we need to find x=L*z, so, x'=z'*L' i.e. x'=z'*U
742  s=s*U;
743  }
744 
745  // add the mean
747  for( int32_t i=0; i<N; ++i )
748  x.row(i)+=mu;
749 
750  return S;
751 }
752 
753 SGMatrix<float64_t> CStatistics::sample_from_gaussian(SGVector<float64_t> mean,
754  SGSparseMatrix<float64_t> cov, int32_t N, bool precision_matrix)
755 {
756  REQUIRE(cov.num_vectors>0,
757  "CStatistics::sample_from_gaussian(): \
758  Number of covariance rows must be positive!\n");
759  REQUIRE(cov.num_features>0,
760  "CStatistics::sample_from_gaussian(): \
761  Number of covariance cols must be positive!\n");
763  "CStatistics::sample_from_gaussian(): \
764  Covariance is not initialized!\n");
766  "CStatistics::sample_from_gaussian(): \
767  Covariance should be square matrix!\n");
768  REQUIRE(mean.vlen==cov.num_vectors,
769  "CStatistics::sample_from_gaussian(): \
770  Mean and covariance dimension mismatch!\n");
771 
772  int32_t dim=mean.vlen;
773  Map<VectorXd> mu(mean.vector, mean.vlen);
774 
775  typedef SparseMatrix<float64_t> MatrixType;
776  const MatrixType &c=EigenSparseUtil<float64_t>::toEigenSparse(cov);
777 
778  SimplicialLLT<MatrixType> llt;
779 
780  // generate samples, z, from N(0, I), DxN
781  SGMatrix<float64_t> S(dim, N);
782  for( int32_t j=0; j<N; ++j )
783  for( int32_t i=0; i<dim; ++i )
784  S(i,j)=CMath::randn_double();
785 
787 
788  // the cholesky factorization P*c*P^-1 = LP*UP, with LP=P*L, UP=U*P^-1
789  llt.compute(c);
790  MatrixType LP=llt.matrixL();
791  MatrixType UP=llt.matrixU();
792 
793  // generate samples, x, from N(mean, cov) or N(mean, cov^-1)
794  // return samples of dimension NxD
795  if( precision_matrix )
796  {
797  // here we have UP*xP=z, to solve this, we use cholesky again
798  SimplicialLLT<MatrixType> lltUP;
799  lltUP.compute(UP);
800  s=lltUP.solve(s);
801  }
802  else
803  {
804  // here we need to find xP=LP*z
805  s=LP*s;
806  }
807 
808  // permute the samples back with x=P^-1*xP
809  s=llt.permutationPinv()*s;
810 
812  // add the mean
814  for( int32_t i=0; i<N; ++i )
815  x.row(i)+=mu;
816 
817  return S;
818 }
819 
820 
821 CStatistics::SigmoidParamters CStatistics::fit_sigmoid(SGVector<float64_t> scores)
822 {
823  SG_SDEBUG("entering CStatistics::fit_sigmoid()\n")
824 
825  REQUIRE(scores.vector, "CStatistics::fit_sigmoid() requires "
826  "scores vector!\n");
827 
828  /* count prior0 and prior1 if needed */
829  int32_t prior0=0;
830  int32_t prior1=0;
831  SG_SDEBUG("counting number of positive and negative labels\n")
832  {
833  for (index_t i=0; i<scores.vlen; ++i)
834  {
835  if (scores[i]>0)
836  prior1++;
837  else
838  prior0++;
839  }
840  }
841  SG_SDEBUG("%d pos; %d neg\n", prior1, prior0)
842 
843  /* parameter setting */
844  /* maximum number of iterations */
845  index_t maxiter=100;
846 
847  /* minimum step taken in line search */
848  float64_t minstep=1E-10;
849 
850  /* for numerically strict pd of hessian */
851  float64_t sigma=1E-12;
852  float64_t eps=1E-5;
853 
854  /* construct target support */
855  float64_t hiTarget=(prior1+1.0)/(prior1+2.0);
856  float64_t loTarget=1/(prior0+2.0);
857  index_t length=prior1+prior0;
858 
859  SGVector<float64_t> t(length);
860  for (index_t i=0; i<length; ++i)
861  {
862  if (scores[i]>0)
863  t[i]=hiTarget;
864  else
865  t[i]=loTarget;
866  }
867 
868  /* initial Point and Initial Fun Value */
869  /* result parameters of sigmoid */
870  float64_t a=0;
871  float64_t b=CMath::log((prior0+1.0)/(prior1+1.0));
872  float64_t fval=0.0;
873 
874  for (index_t i=0; i<length; ++i)
875  {
876  float64_t fApB=scores[i]*a+b;
877  if (fApB>=0)
878  fval+=t[i]*fApB+CMath::log(1+CMath::exp(-fApB));
879  else
880  fval+=(t[i]-1)*fApB+CMath::log(1+CMath::exp(fApB));
881  }
882 
883  index_t it;
884  float64_t g1;
885  float64_t g2;
886  for (it=0; it<maxiter; ++it)
887  {
888  SG_SDEBUG("Iteration %d, a=%f, b=%f, fval=%f\n", it, a, b, fval)
889 
890  /* Update Gradient and Hessian (use H' = H + sigma I) */
891  float64_t h11=sigma; //Numerically ensures strict PD
892  float64_t h22=h11;
893  float64_t h21=0;
894  g1=0;
895  g2=0;
896 
897  for (index_t i=0; i<length; ++i)
898  {
899  float64_t fApB=scores[i]*a+b;
900  float64_t p;
901  float64_t q;
902  if (fApB>=0)
903  {
904  p=CMath::exp(-fApB)/(1.0+CMath::exp(-fApB));
905  q=1.0/(1.0+CMath::exp(-fApB));
906  }
907  else
908  {
909  p=1.0/(1.0+CMath::exp(fApB));
910  q=CMath::exp(fApB)/(1.0+CMath::exp(fApB));
911  }
912 
913  float64_t d2=p*q;
914  h11+=scores[i]*scores[i]*d2;
915  h22+=d2;
916  h21+=scores[i]*d2;
917  float64_t d1=t[i]-p;
918  g1+=scores[i]*d1;
919  g2+=d1;
920  }
921 
922  /* Stopping Criteria */
923  if (CMath::abs(g1)<eps && CMath::abs(g2)<eps)
924  break;
925 
926  /* Finding Newton direction: -inv(H') * g */
927  float64_t det=h11*h22-h21*h21;
928  float64_t dA=-(h22*g1-h21*g2)/det;
929  float64_t dB=-(-h21*g1+h11*g2)/det;
930  float64_t gd=g1*dA+g2*dB;
931 
932  /* Line Search */
933  float64_t stepsize=1;
934 
935  while (stepsize>=minstep)
936  {
937  float64_t newA=a+stepsize*dA;
938  float64_t newB=b+stepsize*dB;
939 
940  /* New function value */
941  float64_t newf=0.0;
942  for (index_t i=0; i<length; ++i)
943  {
944  float64_t fApB=scores[i]*newA+newB;
945  if (fApB>=0)
946  newf+=t[i]*fApB+CMath::log(1+CMath::exp(-fApB));
947  else
948  newf+=(t[i]-1)*fApB+CMath::log(1+CMath::exp(fApB));
949  }
950 
951  /* Check sufficient decrease */
952  if (newf<fval+0.0001*stepsize*gd)
953  {
954  a=newA;
955  b=newB;
956  fval=newf;
957  break;
958  }
959  else
960  stepsize=stepsize/2.0;
961  }
962 
963  if (stepsize<minstep)
964  {
965  SG_SWARNING("CStatistics::fit_sigmoid(): line search fails, A=%f, "
966  "B=%f, g1=%f, g2=%f, dA=%f, dB=%f, gd=%f\n",
967  a, b, g1, g2, dA, dB, gd);
968  }
969  }
970 
971  if (it>=maxiter-1)
972  {
973  SG_SWARNING("CStatistics::fit_sigmoid(): reaching maximal iterations,"
974  " g1=%f, g2=%f\n", g1, g2);
975  }
976 
977  SG_SDEBUG("fitted sigmoid: a=%f, b=%f\n", a, b)
978 
980  result.a=a;
981  result.b=b;
982 
983  SG_SDEBUG("leaving CStatistics::fit_sigmoid()\n")
984  return result;
985 }
986 
987 const float64_t CStatistics::ERFC_CASE1=0.0492;
988 
989 const float64_t CStatistics::ERFC_CASE2=-11.3137;
int32_t index_t
Definition: common.h:62
#define SG_SWARNING(...)
Definition: SGIO.h:178
Definition: SGMatrix.h:20
#define REQUIRE(x,...)
Definition: SGIO.h:206
index_t num_cols
Definition: SGMatrix.h:376
index_t num_rows
Definition: SGMatrix.h:374
index_t vlen
Definition: SGVector.h:494
#define SG_SPRINT(...)
Definition: SGIO.h:180
#define ASSERT(x)
Definition: SGIO.h:201
index_t num_features
total number of features
double float64_t
Definition: common.h:50
long double floatmax_t
Definition: common.h:51
SGSparseVector< T > * sparse_matrix
array of sparse vectors of size num_vectors
This class contains some utilities for Eigen3 Sparse Matrix integration with shogun. Currently it provides a method for converting SGSparseMatrix to Eigen3 SparseMatrix.
Definition: eigen3.h:87
all of classes and functions are contained in the shogun namespace
Definition: class_list.h:18
#define SG_SDEBUG(...)
Definition: SGIO.h:168
#define SG_SERROR(...)
Definition: SGIO.h:179
void scale(Matrix A, Matrix B, typename Matrix::Scalar alpha)
Definition: Core.h:94
Matrix::Scalar max(Matrix m)
Definition: Redux.h:68
index_t num_vectors
total number of vectors

SHOGUN Machine Learning Toolbox - Documentation