SHOGUN  6.0.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules
Statistics.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2014 Wu Lin
8  * Written (W) 2011-2016 Heiko Strathmann
9  * Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society
10  *
11  * Most cdf routines are wrappers for CDFLIB, part of the public domain NETLIB.
12  * https://people.sc.fsu.edu/~jburkardt/f_src/cdflib/cdflib.html
13  */
14 
17 #include <shogun/lib/SGMatrix.h>
18 #include <shogun/lib/SGVector.h>
21 #include <shogun/lib/external/cdflib.hpp>
23 
24 using namespace Eigen;
25 
26 using namespace shogun;
27 
28 float64_t CStatistics::variance(SGVector<float64_t> values)
29 {
30  REQUIRE(values.vlen>1, "Number of observations (%d) needs to be at least 1.\n",
31  values.vlen);
32 
34 
35  float64_t sum_squared_diff=0;
36  for (index_t i=0; i<values.vlen; ++i)
37  sum_squared_diff+=CMath::pow(values.vector[i]-mean, 2);
38 
39  return sum_squared_diff/(values.vlen-1);
40 }
41 
42 SGVector<float64_t> CStatistics::matrix_mean(SGMatrix<float64_t> values,
43  bool col_wise)
44 {
45  ASSERT(values.num_rows>0)
46  ASSERT(values.num_cols>0)
47  ASSERT(values.matrix)
48 
49  SGVector<float64_t> result;
50 
51  if (col_wise)
52  {
53  result=SGVector<float64_t>(values.num_cols);
54  for (index_t j=0; j<values.num_cols; ++j)
55  {
56  result[j]=0;
57  for (index_t i=0; i<values.num_rows; ++i)
58  result[j]+=values(i,j);
59 
60  result[j]/=values.num_rows;
61  }
62  }
63  else
64  {
65  result=SGVector<float64_t>(values.num_rows);
66  for (index_t j=0; j<values.num_rows; ++j)
67  {
68  result[j]=0;
69  for (index_t i=0; i<values.num_cols; ++i)
70  result[j]+=values(j,i);
71 
72  result[j]/=values.num_cols;
73  }
74  }
75 
76  return result;
77 }
78 
79 SGVector<float64_t> CStatistics::matrix_variance(SGMatrix<float64_t> values,
80  bool col_wise)
81 {
82  ASSERT(values.num_rows>0)
83  ASSERT(values.num_cols>0)
84  ASSERT(values.matrix)
85 
86  /* first compute mean */
87  SGVector<float64_t> mean=CStatistics::matrix_mean(values, col_wise);
88 
89  SGVector<float64_t> result;
90 
91  if (col_wise)
92  {
93  result=SGVector<float64_t>(values.num_cols);
94  for (index_t j=0; j<values.num_cols; ++j)
95  {
96  result[j]=0;
97  for (index_t i=0; i<values.num_rows; ++i)
98  result[j]+=CMath::pow(values(i,j)-mean[j], 2);
99 
100  result[j]/=(values.num_rows-1);
101  }
102  }
103  else
104  {
105  result=SGVector<float64_t>(values.num_rows);
106  for (index_t j=0; j<values.num_rows; ++j)
107  {
108  result[j]=0;
109  for (index_t i=0; i<values.num_cols; ++i)
110  result[j]+=CMath::pow(values(j,i)-mean[j], 2);
111 
112  result[j]/=(values.num_cols-1);
113  }
114  }
115 
116  return result;
117 }
118 
119 float64_t CStatistics::std_deviation(SGVector<float64_t> values)
120 {
121  return CMath::sqrt(variance(values));
122 }
123 
124 SGVector<float64_t> CStatistics::matrix_std_deviation(
125  SGMatrix<float64_t> values, bool col_wise)
126 {
127  SGVector<float64_t> var=CStatistics::matrix_variance(values, col_wise);
128  for (index_t i=0; i<var.vlen; ++i)
129  var[i]=CMath::sqrt(var[i]);
130 
131  return var;
132 }
133 
134 SGMatrix<float64_t> CStatistics::covariance_matrix(
135  SGMatrix<float64_t> observations, bool in_place)
136 {
137  int32_t D = observations.num_rows;
138  int32_t N = observations.num_cols;
139  SG_SDEBUG("%d observations in %d dimensions\n", N, D)
140 
141  REQUIRE(N>1, "Number of observations (%d) must be at least 2.\n", N);
142  REQUIRE(D>0, "Number of dimensions (%d) must be at least 1.\n", D);
143 
144  SGMatrix<float64_t> centered=in_place ? observations :
145  SGMatrix<float64_t>(D, N);
146 
147  /* center observations, potentially in-place */
148  if (!in_place)
149  {
150  int64_t num_elements = N*D;
151  sg_memcpy(centered.matrix, observations.matrix,
152  sizeof(float64_t)*num_elements);
153  }
154  SG_SDEBUG("Centering observations\n");
155  Map<MatrixXd> eigen_centered(centered.matrix, D, N);
156  eigen_centered.colwise() -= eigen_centered.rowwise().mean();
157 
158  /* compute and store 1/(N-1) * X * X.T */
159  SG_SDEBUG("Computing squared differences\n");
160  SGMatrix<float64_t> cov(D, D);
161  Map<MatrixXd> eigen_cov(cov.matrix, D, D);
162  eigen_cov = (eigen_centered * eigen_centered.adjoint()) / double(N - 1);
163 
164  return cov;
165 }
166 
167 SGVector<float64_t> CStatistics::fishers_exact_test_for_multiple_2x3_tables(
168  SGMatrix<float64_t> tables)
169 {
/* Runs the 2x3 Fisher exact test on each 2x3 sub-table stacked side by
 * side in "tables" (2 rows, 3*k columns) and returns one p-value per
 * sub-table. */
/* ref_counting=false: "table" is a non-owning view whose data pointer is
 * re-aimed into "tables" below; it must never free that memory */
170  SGMatrix<float64_t> table(NULL, 2, 3, false);
171  int32_t len=tables.num_cols/3;
172 
173  SGVector<float64_t> v(len);
174  for (int32_t i=0; i<len; i++)
175  {
/* point the view at the i-th 2x3 block (6 column-major entries each) */
176  table.matrix=&tables.matrix[2*3*i];
177  v.vector[i]=fishers_exact_test_for_2x3_table(table);
178  }
179  return v;
180 }
181 
182 float64_t CStatistics::fishers_exact_test_for_2x3_table(
183  SGMatrix<float64_t> table)
184 {
/* Fisher's exact test p-value for a single 2x3 contingency table:
 * enumerates every table with the same row/column marginals and sums
 * (in log-space) the hypergeometric probabilities of all tables that are
 * at most as probable as the observed one. */
185  ASSERT(table.num_rows==2)
186  ASSERT(table.num_cols==3)
187 
/* m[0..1] are the row sums, m[2..4] the column sums of the table */
188  int32_t m_len=3+2;
189  float64_t* m=SG_MALLOC(float64_t, 3+2);
190  m[0]=table.matrix[0]+table.matrix[2]+table.matrix[4];
191  m[1]=table.matrix[1]+table.matrix[3]+table.matrix[5];
192  m[2]=table.matrix[0]+table.matrix[1];
193  m[3]=table.matrix[2]+table.matrix[3];
194  m[4]=table.matrix[4]+table.matrix[5];
195 
/* n = grand total (marginals sum to 2n); x is scratch space for all
 * candidate tables — the bound 6*max(m)^2 over-allocates on purpose */
196  float64_t n=SGVector<float64_t>::sum(m, m_len)/2.0;
197  int32_t x_len=2*3*CMath::sq(CMath::max(m, m_len));
198  float64_t* x=SG_MALLOC(float64_t, x_len);
199  SGVector<float64_t>::fill_vector(x, x_len, 0.0);
200 
/* log numerator of the hypergeometric probability:
 * sum_i log(m_i!) - log(n!) */
201  float64_t log_nom=0.0;
202  for (int32_t i=0; i<3+2; i++)
203  log_nom+=lgamma(m[i]+1);
204  log_nom-=lgamma(n+1.0);
205 
/* log denominator for the OBSERVED table: sum of log(cell!); the long
 * double (floatmax_t) accumulator is the one actually used, log_denomf
 * only feeds the debug printout below */
206  float64_t log_denomf=0;
207  floatmax_t log_denom=0;
208 
209  for (int32_t i=0; i<3*2; i++)
210  {
211  log_denom+=lgammal((floatmax_t)table.matrix[i]+1);
212  log_denomf+=lgammal((floatmax_t)table.matrix[i]+1);
213  }
214 
/* log-probability of the observed table under fixed marginals */
215  floatmax_t prob_table_log=log_nom-log_denom;
216 
217  int32_t dim1=CMath::min(m[0], m[2]);
218 
219  //traverse all possible tables with given m
/* free parameters are the two upper-left cells (k, l); the remaining
 * four cells follow from the marginals */
220  int32_t counter=0;
221  for (int32_t k=0; k<=dim1; k++)
222  {
223  for (int32_t l=CMath::max(0.0, m[0]-m[4]-k);
224  l<=CMath::min(m[0]-k, m[3]); l++)
225  {
226  x[0+0*2+counter*2*3]=k;
227  x[0+1*2+counter*2*3]=l;
228  x[0+2*2+counter*2*3]=m[0]-x[0+0*2+counter*2*3]-x[0+1*2+counter*2*3];
229  x[1+0*2+counter*2*3]=m[2]-x[0+0*2+counter*2*3];
230  x[1+1*2+counter*2*3]=m[3]-x[0+1*2+counter*2*3];
231  x[1+2*2+counter*2*3]=m[4]-x[0+2*2+counter*2*3];
232 
233  counter++;
234  }
235  }
236 
237 //#define DEBUG_FISHER_TABLE
238 #ifdef DEBUG_FISHER_TABLE
239  SG_SPRINT("counter=%d\n", counter)
240  SG_SPRINT("dim1=%d\n", dim1)
241  SG_SPRINT("l=%g...%g\n", CMath::max(0.0,m[0]-m[4]-0), CMath::min(m[0]-0, m[3]))
242  SG_SPRINT("n=%g\n", n)
243  SG_SPRINT("prob_table_log=%.18Lg\n", prob_table_log)
244  SG_SPRINT("log_denomf=%.18g\n", log_denomf)
245  SG_SPRINT("log_denom=%.18Lg\n", log_denom)
246  SG_SPRINT("log_nom=%.18g\n", log_nom)
247  display_vector(m, m_len, "marginals");
248  display_vector(x, 2*3*counter, "x");
249 #endif // DEBUG_FISHER_TABLE
250 
/* per-candidate-table log denominator: sum of log(cell!) */
251  floatmax_t* log_denom_vec=SG_MALLOC(floatmax_t, counter);
252  SGVector<floatmax_t>::fill_vector(log_denom_vec, counter, (floatmax_t)0.0);
253 
254  for (int32_t k=0; k<counter; k++)
255  {
256  for (int32_t j=0; j<3; j++)
257  {
258  for (int32_t i=0; i<2; i++)
259  log_denom_vec[k]+=lgammal(x[i+j*2+k*2*3]+1.0);
260  }
261  }
262 
/* turn the denominators into per-table log-probabilities */
263  for (int32_t i=0; i<counter; i++)
264  log_denom_vec[i]=log_nom-log_denom_vec[i];
265 
266 #ifdef DEBUG_FISHER_TABLE
267  display_vector(log_denom_vec, counter, "log_denom_vec");
268 #endif // DEBUG_FISHER_TABLE
269 
/* p-value: log-sum-exp over all tables no more probable than the
 * observed one (starting from log(0) = -infinity) */
270  float64_t nonrand_p=-CMath::INFTY;
271  for (int32_t i=0; i<counter; i++)
272  {
273  if (log_denom_vec[i]<=prob_table_log)
274  nonrand_p=CMath::logarithmic_sum(nonrand_p, log_denom_vec[i]);
275  }
276 
277 #ifdef DEBUG_FISHER_TABLE
278  SG_SPRINT("nonrand_p=%.18g\n", nonrand_p)
279  SG_SPRINT("exp_nonrand_p=%.18g\n", CMath::exp(nonrand_p))
280 #endif // DEBUG_FISHER_TABLE
/* leave log-space before returning */
281  nonrand_p=CMath::exp(nonrand_p);
282 
283  SG_FREE(log_denom_vec);
284  SG_FREE(x);
285  SG_FREE(m);
286 
287  return nonrand_p;
288 }
289 
290 float64_t CStatistics::mutual_info(float64_t* p1, float64_t* p2, int32_t len)
291 {
292  double e=0;
293 
294  for (int32_t i=0; i<len; i++)
295  for (int32_t j=0; j<len; j++)
296  e+=exp(p2[j*len+i])*(p2[j*len+i]-p1[i]-p1[j]);
297 
298  return (float64_t)e;
299 }
300 
301 float64_t CStatistics::relative_entropy(float64_t* p, float64_t* q, int32_t len)
302 {
303  double e=0;
304 
305  for (int32_t i=0; i<len; i++)
306  e+=exp(p[i])*(p[i]-q[i]);
307 
308  return (float64_t)e;
309 }
310 
311 float64_t CStatistics::entropy(float64_t* p, int32_t len)
312 {
313  double e=0;
314 
315  for (int32_t i=0; i<len; i++)
316  e-=exp(p[i])*p[i];
317 
318  return (float64_t)e;
319 }
320 
321 SGVector<int32_t> CStatistics::sample_indices(int32_t sample_size, int32_t N)
322 {
323  REQUIRE(sample_size<N,
324  "sample size should be less than number of indices\n");
325  int32_t* idxs=SG_MALLOC(int32_t,N);
326  int32_t i, rnd;
327  int32_t* permuted_idxs=SG_MALLOC(int32_t,sample_size);
328 
329  // reservoir sampling
330  for (i=0; i<N; i++)
331  idxs[i]=i;
332  for (i=0; i<sample_size; i++)
333  permuted_idxs[i]=idxs[i];
334  for (i=sample_size; i<N; i++)
335  {
336  rnd=CMath::random(1, i);
337  if (rnd<sample_size)
338  permuted_idxs[rnd]=idxs[i];
339  }
340  SG_FREE(idxs);
341 
342  SGVector<int32_t> result=SGVector<int32_t>(permuted_idxs, sample_size);
343  CMath::qsort(result);
344  return result;
345 }
346 
347 float64_t CStatistics::inverse_normal_cdf(float64_t p, float64_t mean,
348  float64_t std_deviation)
349 {
350  REQUIRE(p>=0, "p (%f); must be greater or equal to 0.\n", p);
351  REQUIRE(p<=1, "p (%f); must be greater or equal to 1.\n", p);
352  REQUIRE(std_deviation>0, "Standard deviation (%f); must be positive\n",
353  std_deviation);
354 
355  // invserse normal cdf case, see cdflib.cpp for details
356  int which=2;
357  float64_t output_x;
358  float64_t q=1-p;
359  float64_t output_bound;
360  int output_status;
361 
362  cdfnor(&which, &p, &q, &output_x, &mean, &std_deviation, &output_status, &output_bound);
363 
364  if (output_status!=0)
365  SG_SERROR("Error %d while calling cdflib::cdfnor\n", output_status);
366 
367  return output_x;
368 
369  //void cdfnor ( int *which, double *p, double *q, double *x, double *mean,
370  // double *sd, int *status, double *bound )
371 }
372 
373 float64_t CStatistics::chi2_cdf(float64_t x, float64_t k)
374 {
375  REQUIRE(x>=0, "x (%f) has to be greater or equal to 0.\n", x);
376  REQUIRE(k>0, "Degrees of freedom (%f) has to be positive.\n", k);
377 
378  // chi2 cdf case, see cdflib.cpp for details
379  int which=1;
380  float64_t df=k;
381  float64_t output_q;
382  float64_t output_p;
383  float64_t output_bound;
384  int output_status;
385 
386  cdfchi(&which, &output_p, &output_q, &x, &df, &output_status, &output_bound);
387 
388  if (output_status!=0)
389  SG_SERROR("Error %d while calling cdflib::cdfchi\n", output_status);
390 
391  return output_p;
392 }
393 
394 float64_t CStatistics::gamma_cdf(float64_t x, float64_t a, float64_t b)
395 {
396  REQUIRE(x>=0, "x (%f) has to be greater or equal to 0.\n", x);
397  REQUIRE(a>=0, "a (%f) has to be greater or equal to 0.\n", a);
398  REQUIRE(b>=0, "b (%f) has to be greater or equal to 0.\n", b);
399 
400  // inverse gamma cdf case, see cdflib.cpp for details
401  float64_t shape=a;
402  float64_t scale=b;
403  int which=1;
404  float64_t output_p;
405  float64_t output_q;
406  float64_t output_bound;
407  int output_error_code;
408 
409  cdfgam(&which, &output_p, &output_q, &x, &shape, &scale, &output_error_code, &output_bound);
410 
411  if (output_error_code!=0)
412  SG_SERROR("Error %d while calling cdflib::cdfgam\n", output_error_code);
413 
414  return output_p;
415 }
416 
417 float64_t CStatistics::lnormal_cdf(float64_t x)
418 {
/* Numerically robust log of the standard normal cdf, log(Phi(x)),
 * split into three regimes selected by ERFC_CASE1/ERFC_CASE2. */
419  /* Loosely based on logphi.m in
420  * Gaussian Process Machine Learning Toolbox file logphi.m
421  * http://www.gaussianprocess.org/gpml/code/matlab/doc/
422  * Under FreeBSD license
423  */
424 
425  const float64_t sqrt_of_2=1.41421356237309514547;
426  const float64_t log_of_2=0.69314718055994528623;
427  const float64_t sqrt_of_pi=1.77245385090551588192;
428 
/* series coefficients for the small-|x| expansion (from logphi.m) */
429  const index_t c_len=14;
430  static float64_t c_array[c_len]=
431  {
432  0.00048204,
433  -0.00142906,
434  0.0013200243174,
435  0.0009461589032,
436  -0.0045563339802,
437  0.00556964649138,
438  0.00125993961762116,
439  -0.01621575378835404,
440  0.02629651521057465,
441  -0.001829764677455021,
442  2.0*(1.0-CMath::PI/3.0),
443  (4.0-CMath::PI)/3.0,
444  1.0,
445  1.0
446  };
447 
/* regime 1: |x| small — evaluate the polynomial series in
 * lp0 = -x/sqrt(2*pi) by Horner's scheme */
448  if (x*x<ERFC_CASE1)
449  {
450  float64_t f = 0.0;
451  float64_t lp0 = -x/(sqrt_of_2*sqrt_of_pi);
452  for (index_t i=0; i<c_len; i++)
453  f=lp0*(c_array[i]+f);
454  return -2.0*f-log_of_2;
455  }
/* regime 2: x very negative — rational erfc approximation keeps the
 * result finite where erfc itself would underflow */
456  else if (x<ERFC_CASE2)
457  return CMath::log(erfc8_weighted_sum(x))-log_of_2-x*x*0.5;
458 
459  //id3 = ~id2 & ~id1; lp(id3) = log(erfc(-z(id3)/sqrt(2))/2);
/* regime 3: everything else — direct log of the normal cdf is safe */
460  return CMath::log(normal_cdf(x));
461 }
462 
463 float64_t CStatistics::erfc8_weighted_sum(float64_t x)
464 {
465  /* This is based on index 5725 in Hart et al */
466 
467  const float64_t sqrt_of_2=1.41421356237309514547;
468 
469  static float64_t P[]=
470  {
471  0.5641895835477550741253201704,
472  1.275366644729965952479585264,
473  5.019049726784267463450058,
474  6.1602098531096305440906,
475  7.409740605964741794425,
476  2.97886562639399288862
477  };
478 
479  static float64_t Q[]=
480  {
481  1.0,
482  2.260528520767326969591866945,
483  9.396034016235054150430579648,
484  12.0489519278551290360340491,
485  17.08144074746600431571095,
486  9.608965327192787870698,
487  3.3690752069827527677
488  };
489 
490  float64_t num=0.0, den=0.0;
491 
492  num = P[0];
493  for (index_t i=1; i<6; i++)
494  {
495  num=-x*num/sqrt_of_2+P[i];
496  }
497 
498  den = Q[0];
499  for (index_t i=1; i<7; i++)
500  {
501  den=-x*den/sqrt_of_2+Q[i];
502  }
503 
504  return num/den;
505 }
506 
507 float64_t CStatistics::normal_cdf(float64_t x, float64_t std_dev)
508 {
509  return 0.5*(erfc(-x*M_SQRT1_2/std_dev));
510 }
511 
512 float64_t CStatistics::gamma_inverse_cdf(float64_t p, float64_t a,
513  float64_t b)
514 {
515  REQUIRE(p>=0, "p (%f) has to be greater or equal to 0.\n", p);
516  REQUIRE(a>=0, "a (%f) has to be greater or equal to 0.\n", a);
517  REQUIRE(b>=0, "b (%f) has to be greater or equal to 0.\n", b);
518 
519  float64_t shape=a;
520  float64_t scale=b;
521  float64_t q = 1-p;
522  int which=2;
523  float64_t output_x=0;
524  float64_t output_bound;
525  int output_error_code=0;
526 
527  // inverse gamma cdf case, see cdflib.cpp for details
528  cdfgam(&which, &p, &q, &output_x, &shape, &scale, &output_error_code, &output_bound);
529 
530  if (output_error_code!=0)
531  SG_SERROR("Error %d while calling cdflib::beta_inc\n", output_error_code);
532 
533  return output_x;
534 }
535 
536 float64_t CStatistics::fdistribution_cdf(float64_t x, float64_t d1, float64_t d2)
537 {
538  REQUIRE(x>=0, "x (%f) has to be greater or equal to 0.\n", x);
539  REQUIRE(d1>0, "d1 (%f) has to be positive.\n", d1);
540  REQUIRE(d2>0, "d2 (%f) has to be positive.\n", d2);
541 
542  // fcdf case, see cdflib.cpp for details
543  int which=1;
544  float64_t output_p;
545  float64_t output_q;
546  float64_t output_bound;
547  int output_error_code;
548 
549  cdff(&which, &output_p, &output_q, &x, &d1, &d2, &output_error_code, &output_bound);
550 
551  if (output_error_code!=0)
552  SG_SERROR("Error %d while calling cdflib::cdff\n", output_error_code);
553 
554  return output_p;
555 }
556 
557 float64_t CStatistics::dlgamma(float64_t x)
558 {
559  float64_t result=0.0;
560 
561  if (x<0.0)
562  {
563  // use reflection formula
564  x=1.0-x;
565  result=CMath::PI/CMath::tan(CMath::PI*x);
566  }
567 
568  // make x>7 for approximation
569  // (use reccurent formula: psi(x+1) = psi(x) + 1/x)
570  while (x<=7.0)
571  {
572  result-=1.0/x;
573  x++;
574  }
575 
576  // perform approximation
577  x-=0.5;
578  result+=log(x);
579 
580  float64_t coeff[10]={
581  0.04166666666666666667,
582  -0.00729166666666666667,
583  0.00384424603174603175,
584  -0.00413411458333333333,
585  0.00756096117424242424,
586  -0.02108249687595390720,
587  0.08332316080729166666,
588  -0.44324627670587277880,
589  3.05393103044765369366,
590  -26.45616165999210241989};
591 
592  float64_t power=1.0;
593  float64_t ix2=1.0/CMath::sq(x);
594 
595  // perform approximation
596  for (index_t i=0; i<10; i++)
597  {
598  power*=ix2;
599  result+=coeff[i]*power;
600  }
601 
602  return result;
603 }
604 
605 float64_t CStatistics::log_det_general(const SGMatrix<float64_t> A)
606 {
607  Map<MatrixXd> eigen_A(A.matrix, A.num_rows, A.num_cols);
608  REQUIRE(eigen_A.rows()==eigen_A.cols(),
609  "Input matrix should be a sqaure matrix row(%d) col(%d)\n",
610  eigen_A.rows(), eigen_A.cols());
611 
612  PartialPivLU<MatrixXd> lu(eigen_A);
613  VectorXd tmp(eigen_A.rows());
614 
615  for (index_t idx=0; idx<tmp.rows(); idx++)
616  tmp[idx]=idx+1;
617 
618  VectorXd p=lu.permutationP()*tmp;
619  int32_t detP=1;
620 
621  for (index_t idx=0; idx<p.rows(); idx++)
622  {
623  if (p[idx]!=idx+1)
624  {
625  detP*=-1;
626  index_t j=idx+1;
627  while(j<p.rows())
628  {
629  if (p[j]==idx+1)
630  break;
631  j++;
632  }
633  p[j]=p[idx];
634  }
635  }
636 
637  VectorXd u=lu.matrixLU().diagonal();
638  int32_t check_u=1;
639 
640  for (index_t idx=0; idx<u.rows(); idx++)
641  {
642  if (u[idx]<0)
643  check_u*=-1;
644  else if (u[idx]==0)
645  {
646  check_u=0;
647  break;
648  }
649  }
650 
651  float64_t result=CMath::INFTY;
652 
653  if (check_u==detP)
654  result=u.array().abs().log().sum();
655 
656  return result;
657 }
658 
659 float64_t CStatistics::log_det(SGMatrix<float64_t> m)
660 {
/* log(det(m)) for a symmetric positive definite matrix: with m=LL^T,
 * log det = 2 * sum_i log(L_ii).
 * NOTE(review): this doc listing elides source line 662, which by the
 * surrounding code presumably declares the Eigen Map "M" over m.matrix —
 * confirm against the original file. */
661  /* map the matrix to eigen3 to perform cholesky */
663 
664  /* computing the cholesky decomposition */
665  LLT<MatrixXd> llt;
666  llt.compute(M);
667 
668  /* the lower triangular matrix */
669  MatrixXd l = llt.matrixL();
670 
671  /* calculate the log-determinant */
672  VectorXd diag = l.diagonal();
673  float64_t retval = 0.0;
674  for( int32_t i = 0; i < diag.rows(); ++i ) {
675  retval += log(diag(i));
676  }
/* double it: det(LL^T) = det(L)^2 */
677  retval *= 2;
678 
679  return retval;
680 }
681 
682 float64_t CStatistics::log_det(const SGSparseMatrix<float64_t> m)
683 {
684  typedef SparseMatrix<float64_t> MatrixType;
685  const MatrixType &M=EigenSparseUtil<float64_t>::toEigenSparse(m);
686 
687  SimplicialLLT<MatrixType> llt;
688 
689  // factorize using cholesky with amd permutation
690  llt.compute(M);
691  MatrixType L=llt.matrixL();
692 
693  // calculate the log-determinant
694  float64_t retval=0.0;
695  for( index_t i=0; i<M.rows(); ++i )
696  retval+=log(L.coeff(i,i));
697  retval*=2;
698 
699  return retval;
700 }
701 
702 SGMatrix<float64_t> CStatistics::sample_from_gaussian(SGVector<float64_t> mean,
703  SGMatrix<float64_t> cov, int32_t N, bool precision_matrix)
704 {
/* Draws N samples from N(mean, cov), or from N(mean, cov^-1) when cov is
 * passed as a precision matrix, by transforming standard normal draws
 * with the Cholesky factor of cov.
 * NOTE(review): this doc listing elides source lines 729, 735, 740 and
 * 745 — presumably the Eigen Map declarations for "s"/"x" over S.matrix
 * and a transpose of S between the two branches; confirm against the
 * original file before relying on the exact statements. */
705  REQUIRE(cov.num_rows>0, "Number of covariance rows must be positive!\n");
706  REQUIRE(cov.num_cols>0,"Number of covariance cols must be positive!\n");
707  REQUIRE(cov.matrix, "Covariance is not initialized!\n");
708  REQUIRE(cov.num_rows==cov.num_cols, "Covariance should be square matrix!\n");
709  REQUIRE(mean.vlen==cov.num_rows, "Mean and covariance dimension mismatch!\n");
710 
711  int32_t dim=mean.vlen;
712  Map<VectorXd> mu(mean.vector, mean.vlen);
713  Map<MatrixXd> c(cov.matrix, cov.num_rows, cov.num_cols);
714 
715  // generate samples, z, from N(0, I), DxN
716  SGMatrix<float64_t> S(dim, N);
717  for( int32_t j=0; j<N; ++j )
718  for( int32_t i=0; i<dim; ++i )
719  S(i,j)=CMath::randn_double();
720 
721  // the cholesky factorization c=L*U
722  MatrixXd U=c.llt().matrixU();
723 
724  // generate samples, x, from N(mean, cov) or N(mean, cov^-1)
725  // return samples of dimension NxD
726  if( precision_matrix )
727  {
728  // here we have U*x=z, to solve this, we use cholesky again
730  LDLT<MatrixXd> ldlt;
731  ldlt.compute(U);
732  s=ldlt.solve(s);
733  }
734 
736 
737  if( !precision_matrix )
738  {
739  // here we need to find x=L*z, so, x'=z'*L' i.e. x'=z'*U
741  s=s*U;
742  }
743 
744  // add the mean
746  for( int32_t i=0; i<N; ++i )
747  x.row(i)+=mu;
748 
749  return S;
750 }
751 
752 SGMatrix<float64_t> CStatistics::sample_from_gaussian(SGVector<float64_t> mean,
753  SGSparseMatrix<float64_t> cov, int32_t N, bool precision_matrix)
754 {
/* Sparse-covariance variant of sample_from_gaussian: uses a permuted
 * sparse Cholesky (SimplicialLLT with AMD ordering), transforms standard
 * normal draws and undoes the permutation afterwards.
 * NOTE(review): this doc listing elides source lines 761/764 (the
 * condition arguments of two REQUIREs), 785 (presumably the Map "s" over
 * S.matrix), and 810/812 (presumably a transpose of S and the Map "x") —
 * confirm against the original file. */
755  REQUIRE(cov.num_vectors>0,
756  "CStatistics::sample_from_gaussian(): \
757  Number of covariance rows must be positive!\n");
758  REQUIRE(cov.num_features>0,
759  "CStatistics::sample_from_gaussian(): \
760  Number of covariance cols must be positive!\n");
762  "CStatistics::sample_from_gaussian(): \
763  Covariance is not initialized!\n");
765  "CStatistics::sample_from_gaussian(): \
766  Covariance should be square matrix!\n");
767  REQUIRE(mean.vlen==cov.num_vectors,
768  "CStatistics::sample_from_gaussian(): \
769  Mean and covariance dimension mismatch!\n");
770 
771  int32_t dim=mean.vlen;
772  Map<VectorXd> mu(mean.vector, mean.vlen);
773 
774  typedef SparseMatrix<float64_t> MatrixType;
775  const MatrixType &c=EigenSparseUtil<float64_t>::toEigenSparse(cov);
776 
777  SimplicialLLT<MatrixType> llt;
778 
779  // generate samples, z, from N(0, I), DxN
780  SGMatrix<float64_t> S(dim, N);
781  for( int32_t j=0; j<N; ++j )
782  for( int32_t i=0; i<dim; ++i )
783  S(i,j)=CMath::randn_double();
784 
786 
787  // the cholesky factorization P*c*P^-1 = LP*UP, with LP=P*L, UP=U*P^-1
788  llt.compute(c);
789  MatrixType LP=llt.matrixL();
790  MatrixType UP=llt.matrixU();
791 
792  // generate samples, x, from N(mean, cov) or N(mean, cov^-1)
793  // return samples of dimension NxD
794  if( precision_matrix )
795  {
796  // here we have UP*xP=z, to solve this, we use cholesky again
797  SimplicialLLT<MatrixType> lltUP;
798  lltUP.compute(UP);
799  s=lltUP.solve(s);
800  }
801  else
802  {
803  // here we need to find xP=LP*z
804  s=LP*s;
805  }
806 
807  // permute the samples back with x=P^-1*xP
808  s=llt.permutationPinv()*s;
809 
811  // add the mean
813  for( int32_t i=0; i<N; ++i )
814  x.row(i)+=mu;
815 
816  return S;
817 }
818 
819 
820 CStatistics::SigmoidParamters CStatistics::fit_sigmoid(SGVector<float64_t> scores)
821 {
/* Platt scaling: fits sigmoid parameters (a,b) so that
 * 1/(1+exp(a*score+b)) approximates the positive-class probability, by
 * minimising the regularised cross-entropy with Newton's method plus a
 * backtracking line search (Lin/Weng/Platt-style algorithm).
 * Labels are taken from the sign of each score. */
822  SG_SDEBUG("entering CStatistics::fit_sigmoid()\n")
823 
824  REQUIRE(scores.vector, "CStatistics::fit_sigmoid() requires "
825  "scores vector!\n");
826 
827  /* count prior0 and prior1 if needed */
828  int32_t prior0=0;
829  int32_t prior1=0;
830  SG_SDEBUG("counting number of positive and negative labels\n")
831  {
832  for (index_t i=0; i<scores.vlen; ++i)
833  {
834  if (scores[i]>0)
835  prior1++;
836  else
837  prior0++;
838  }
839  }
840  SG_SDEBUG("%d pos; %d neg\n", prior1, prior0)
841 
842  /* parameter setting */
843  /* maximum number of iterations */
844  index_t maxiter=100;
845 
846  /* minimum step taken in line search */
847  float64_t minstep=1E-10;
848 
849  /* for numerically strict pd of hessian */
850  float64_t sigma=1E-12;
851  float64_t eps=1E-5;
852 
/* regularised targets (soft labels) instead of hard 0/1, as in Platt's
 * original formulation */
853  /* construct target support */
854  float64_t hiTarget=(prior1+1.0)/(prior1+2.0);
855  float64_t loTarget=1/(prior0+2.0);
856  index_t length=prior1+prior0;
857 
858  SGVector<float64_t> t(length);
859  for (index_t i=0; i<length; ++i)
860  {
861  if (scores[i]>0)
862  t[i]=hiTarget;
863  else
864  t[i]=loTarget;
865  }
866 
867  /* initial Point and Initial Fun Value */
868  /* result parameters of sigmoid */
869  float64_t a=0;
870  float64_t b=CMath::log((prior0+1.0)/(prior1+1.0));
871  float64_t fval=0.0;
872 
/* initial objective: cross-entropy evaluated in a numerically stable
 * form, branching on the sign of fApB=a*score+b */
873  for (index_t i=0; i<length; ++i)
874  {
875  float64_t fApB=scores[i]*a+b;
876  if (fApB>=0)
877  fval+=t[i]*fApB+CMath::log(1+CMath::exp(-fApB));
878  else
879  fval+=(t[i]-1)*fApB+CMath::log(1+CMath::exp(fApB));
880  }
881 
882  index_t it;
883  float64_t g1;
884  float64_t g2;
885  for (it=0; it<maxiter; ++it)
886  {
887  SG_SDEBUG("Iteration %d, a=%f, b=%f, fval=%f\n", it, a, b, fval)
888 
889  /* Update Gradient and Hessian (use H' = H + sigma I) */
890  float64_t h11=sigma; //Numerically ensures strict PD
891  float64_t h22=h11;
892  float64_t h21=0;
893  g1=0;
894  g2=0;
895 
896  for (index_t i=0; i<length; ++i)
897  {
898  float64_t fApB=scores[i]*a+b;
899  float64_t p;
900  float64_t q;
/* p=sigmoid output, q=1-p, computed stably for either sign of fApB */
901  if (fApB>=0)
902  {
903  p=CMath::exp(-fApB)/(1.0+CMath::exp(-fApB));
904  q=1.0/(1.0+CMath::exp(-fApB));
905  }
906  else
907  {
908  p=1.0/(1.0+CMath::exp(fApB));
909  q=CMath::exp(fApB)/(1.0+CMath::exp(fApB));
910  }
911 
912  float64_t d2=p*q;
913  h11+=scores[i]*scores[i]*d2;
914  h22+=d2;
915  h21+=scores[i]*d2;
916  float64_t d1=t[i]-p;
917  g1+=scores[i]*d1;
918  g2+=d1;
919  }
920 
921  /* Stopping Criteria */
922  if (CMath::abs(g1)<eps && CMath::abs(g2)<eps)
923  break;
924 
/* 2x2 Newton system solved in closed form via the determinant */
925  /* Finding Newton direction: -inv(H') * g */
926  float64_t det=h11*h22-h21*h21;
927  float64_t dA=-(h22*g1-h21*g2)/det;
928  float64_t dB=-(-h21*g1+h11*g2)/det;
929  float64_t gd=g1*dA+g2*dB;
930 
/* backtracking line search with Armijo-style sufficient decrease */
931  /* Line Search */
932  float64_t stepsize=1;
933 
934  while (stepsize>=minstep)
935  {
936  float64_t newA=a+stepsize*dA;
937  float64_t newB=b+stepsize*dB;
938 
939  /* New function value */
940  float64_t newf=0.0;
941  for (index_t i=0; i<length; ++i)
942  {
943  float64_t fApB=scores[i]*newA+newB;
944  if (fApB>=0)
945  newf+=t[i]*fApB+CMath::log(1+CMath::exp(-fApB));
946  else
947  newf+=(t[i]-1)*fApB+CMath::log(1+CMath::exp(fApB));
948  }
949 
950  /* Check sufficient decrease */
951  if (newf<fval+0.0001*stepsize*gd)
952  {
953  a=newA;
954  b=newB;
955  fval=newf;
956  break;
957  }
958  else
959  stepsize=stepsize/2.0;
960  }
961 
962  if (stepsize<minstep)
963  {
964  SG_SWARNING("CStatistics::fit_sigmoid(): line search fails, A=%f, "
965  "B=%f, g1=%f, g2=%f, dA=%f, dB=%f, gd=%f\n",
966  a, b, g1, g2, dA, dB, gd);
967  }
968  }
969 
970  if (it>=maxiter-1)
971  {
972  SG_SWARNING("CStatistics::fit_sigmoid(): reaching maximal iterations,"
973  " g1=%f, g2=%f\n", g1, g2);
974  }
975 
976  SG_SDEBUG("fitted sigmoid: a=%f, b=%f\n", a, b)
977 
/* NOTE(review): this doc listing elides source line 978 — presumably
 * the declaration of the SigmoidParamters "result" assigned below;
 * confirm against the original file. */
979  result.a=a;
980  result.b=b;
981 
982  SG_SDEBUG("leaving CStatistics::fit_sigmoid()\n")
983  return result;
984 }
985 
/* Regime thresholds used by lnormal_cdf: ERFC_CASE1 bounds x*x for the
 * small-argument series; ERFC_CASE2 is the cut-off below which the
 * rational erfc approximation (erfc8_weighted_sum) is used. */
986 const float64_t CStatistics::ERFC_CASE1=0.0492;
987 
988 const float64_t CStatistics::ERFC_CASE2=-11.3137;
int32_t index_t
Definition: common.h:72
#define SG_SWARNING(...)
Definition: SGIO.h:177
void scale(SGVector< T > &a, SGVector< T > &result, T alpha=1)
Definition: SGMatrix.h:24
#define REQUIRE(x,...)
Definition: SGIO.h:205
index_t num_cols
Definition: SGMatrix.h:465
std::enable_if<!std::is_same< T, complex128_t >::value, float64_t >::type mean(const Container< T > &a)
index_t num_rows
Definition: SGMatrix.h:463
index_t vlen
Definition: SGVector.h:545
#define SG_SPRINT(...)
Definition: SGIO.h:179
#define ASSERT(x)
Definition: SGIO.h:200
index_t num_features
total number of features
double float64_t
Definition: common.h:60
long double floatmax_t
Definition: common.h:61
SGSparseVector< T > * sparse_matrix
array of sparse vectors of size num_vectors
This class contains some utilities for Eigen3 Sparse Matrix integration with shogun. Currently it provides a method for converting SGSparseMatrix to Eigen3 SparseMatrix.
Definition: eigen3.h:92
All classes and functions are contained in the shogun namespace
Definition: class_list.h:18
T sum(const Container< T > &a, bool no_diag=false)
#define SG_SDEBUG(...)
Definition: SGIO.h:167
#define SG_SERROR(...)
Definition: SGIO.h:178
T max(const Container< T > &a)
index_t num_vectors
total number of vectors

SHOGUN Machine Learning Toolbox - Documentation