SHOGUN  4.2.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
EPInferenceMethod.cpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) The Shogun Machine Learning Toolbox
3  * Written (W) 2013 Roman Votyakov
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions are met:
8  *
9  * 1. Redistributions of source code must retain the above copyright notice, this
10  * list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright notice,
12  * this list of conditions and the following disclaimer in the documentation
13  * and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  *
26  * The views and conclusions contained in the software and documentation are those
27  * of the authors and should not be interpreted as representing official policies,
28  * either expressed or implied, of the Shogun Development Team.
29  *
30  *
31  * Based on ideas from GAUSSIAN PROCESS REGRESSION AND CLASSIFICATION Toolbox
32  * http://www.gaussianprocess.org/gpml/code/matlab/doc/
33  */
35 
36 
42 
44 
45 using namespace shogun;
46 using namespace Eigen;
47 
48 // try to use previously allocated memory for SGVector
49 #define CREATE_SGVECTOR(vec, len, sg_type) \
50  { \
51  if (!vec.vector || vec.vlen!=len) \
52  vec=SGVector<sg_type>(len); \
53  }
54 
55 // try to use previously allocated memory for SGMatrix
56 #define CREATE_SGMATRIX(mat, rows, cols, sg_type) \
57  { \
58  if (!mat.matrix || mat.num_rows!=rows || mat.num_cols!=cols) \
59  mat=SGMatrix<sg_type>(rows, cols); \
60  }
61 
63 {
64  init();
65 }
66 
68  CMeanFunction* mean, CLabels* labels, CLikelihoodModel* model)
69  : CInference(kernel, features, mean, labels, model)
70 {
71  init();
72 }
73 
75 {
76 }
77 
79 {
80  SG_WARNING("The method does not require a minimizer. The provided minimizer will not be used.\n");
81 }
82 
83 void CEPInferenceMethod::init()
84 {
85  m_max_sweep=15;
86  m_min_sweep=2;
87  m_tol=1e-4;
88 }
89 
91  CInference* inference)
92 {
93  if (inference==NULL)
94  return NULL;
95 
96  if (inference->get_inference_type()!=INF_EP)
97  SG_SERROR("Provided inference is not of type CEPInferenceMethod!\n")
98 
99  SG_REF(inference);
100  return (CEPInferenceMethod*)inference;
101 }
102 
104 {
106  update();
107 
108  return m_nlZ;
109 }
110 
112 {
114  update();
115 
117 }
118 
120 {
122  update();
123 
124  return SGMatrix<float64_t>(m_L);
125 }
126 
128 {
130  update();
131 
132  return SGVector<float64_t>(m_sttau);
133 }
134 
136 {
138 
139  return SGVector<float64_t>(m_mu);
140 }
141 
143 {
145 
146  return SGMatrix<float64_t>(m_Sigma);
147 }
148 
150 {
152 
153  if (!m_gradient_update)
154  {
155  // update matrices to compute derivatives
156  update_deriv();
157  m_gradient_update=true;
159  }
160 }
161 
163 {
164  SG_DEBUG("entering\n");
165 
166  // update kernel and feature matrix
168 
169  // get number of labels (trainig examples)
171 
172  // try to use tilde values from previous call
173  if (m_ttau.vlen==n)
174  {
175  update_chol();
179  }
180 
181  // get mean vector
183 
184  // get and scale diagonal of the kernel matrix
186  ktrtr_diag.scale(CMath::exp(m_log_scale*2.0));
187 
188  // marginal likelihood for ttau = tnu = 0
190  mean, ktrtr_diag, m_labels));
191 
192  // use zero values if we have no better guess or it's better
193  if (m_ttau.vlen!=n || m_nlZ>nlZ0)
194  {
195  CREATE_SGVECTOR(m_ttau, n, float64_t);
196  m_ttau.zero();
197 
198  CREATE_SGVECTOR(m_sttau, n, float64_t);
199  m_sttau.zero();
200 
201  CREATE_SGVECTOR(m_tnu, n, float64_t);
202  m_tnu.zero();
203 
205 
206  // copy data manually, since we don't have appropriate method
207  for (index_t i=0; i<m_ktrtr.num_rows; i++)
208  for (index_t j=0; j<m_ktrtr.num_cols; j++)
209  m_Sigma(i,j)=m_ktrtr(i,j)*CMath::exp(m_log_scale);
210 
211  CREATE_SGVECTOR(m_mu, n, float64_t);
212  m_mu.zero();
213 
214  // set marginal likelihood
215  m_nlZ=nlZ0;
216  }
217 
218  // create vector of the random permutation
219  SGVector<index_t> v(n);
220  v.range_fill();
221 
222  // cavity tau and nu vectors
223  SGVector<float64_t> tau_n(n);
224  SGVector<float64_t> nu_n(n);
225 
226  // cavity mu and s2 vectors
227  SGVector<float64_t> mu_n(n);
228  SGVector<float64_t> s2_n(n);
229 
230  float64_t nlZ_old=CMath::INFTY;
231  uint32_t sweep=0;
232 
233  while ((CMath::abs(m_nlZ-nlZ_old)>m_tol && sweep<m_max_sweep) ||
234  sweep<m_min_sweep)
235  {
236  nlZ_old=m_nlZ;
237  sweep++;
238 
239  // shuffle random permutation
240  CMath::permute(v);
241 
242  for (index_t j=0; j<n; j++)
243  {
244  index_t i=v[j];
245 
246  // find cavity paramters
247  tau_n[i]=1.0/m_Sigma(i,i)-m_ttau[i];
248  nu_n[i]=m_mu[i]/m_Sigma(i,i)+mean[i]*tau_n[i]-m_tnu[i];
249 
250  // compute cavity mean and variance
251  mu_n[i]=nu_n[i]/tau_n[i];
252  s2_n[i]=1.0/tau_n[i];
253 
254  // get moments
255  float64_t mu=m_model->get_first_moment(mu_n, s2_n, m_labels, i);
256  float64_t s2=m_model->get_second_moment(mu_n, s2_n, m_labels, i);
257 
258  // save old value of ttau
259  float64_t ttau_old=m_ttau[i];
260 
261  // compute ttau and sqrt(ttau)
262  m_ttau[i]=CMath::max(1.0/s2-tau_n[i], 0.0);
263  m_sttau[i]=CMath::sqrt(m_ttau[i]);
264 
265  // compute tnu
266  m_tnu[i]=mu/s2-nu_n[i];
267 
268  // compute difference ds2=ttau_new-ttau_old
269  float64_t ds2=m_ttau[i]-ttau_old;
270 
271  // create eigen representation of Sigma, tnu and mu
272  Map<MatrixXd> eigen_Sigma(m_Sigma.matrix, m_Sigma.num_rows,
273  m_Sigma.num_cols);
274  Map<VectorXd> eigen_tnu(m_tnu.vector, m_tnu.vlen);
275  Map<VectorXd> eigen_mu(m_mu.vector, m_mu.vlen);
276 
277  VectorXd eigen_si=eigen_Sigma.col(i);
278 
279  // rank-1 update Sigma
280  eigen_Sigma=eigen_Sigma-ds2/(1.0+ds2*eigen_si(i))*eigen_si*
281  eigen_si.adjoint();
282 
283  // update mu
284  eigen_mu=eigen_Sigma*eigen_tnu;
285  }
286 
287  // update upper triangular factor (L^T) of Cholesky decomposition of
288  // matrix B, approximate posterior covariance and mean, negative
289  // marginal likelihood
290  update_chol();
294  }
295 
296  if (sweep==m_max_sweep && CMath::abs(m_nlZ-nlZ_old)>m_tol)
297  {
298  SG_ERROR("Maximum number (%d) of sweeps reached, but tolerance (%f) was "
299  "not yet reached. You can manually set maximum number of sweeps "
300  "or tolerance to fix this problem.\n", m_max_sweep, m_tol);
301  }
302 
303  // update vector alpha
304  update_alpha();
305 
306  m_gradient_update=false;
307 
308  // update hash of the parameters
310 
311  SG_DEBUG("leaving\n");
312 }
313 
315 {
316  // create eigen representations kernel matrix, L^T, sqrt(ttau) and tnu
318  Map<VectorXd> eigen_tnu(m_tnu.vector, m_tnu.vlen);
319  Map<VectorXd> eigen_sttau(m_sttau.vector, m_sttau.vlen);
321 
322  // create shogun and eigen representation of the alpha vector
324  Map<VectorXd> eigen_alpha(m_alpha.vector, m_alpha.vlen);
325 
326  // solve LL^T * v = tS^(1/2) * K * tnu
327  VectorXd eigen_v=eigen_L.triangularView<Upper>().adjoint().solve(
328  eigen_sttau.cwiseProduct(eigen_K*CMath::exp(m_log_scale*2.0)*eigen_tnu));
329  eigen_v=eigen_L.triangularView<Upper>().solve(eigen_v);
330 
331  // compute alpha = (I - tS^(1/2) * B^(-1) * tS(1/2) * K) * tnu =
332  // tnu - tS(1/2) * (L^T)^(-1) * L^(-1) * tS^(1/2) * K * tnu =
333  // tnu - tS(1/2) * v
334  eigen_alpha=eigen_tnu-eigen_sttau.cwiseProduct(eigen_v);
335 }
336 
338 {
339  // create eigen representations of kernel matrix and sqrt(ttau)
341  Map<VectorXd> eigen_sttau(m_sttau.vector, m_sttau.vlen);
342 
343  // create shogun and eigen representation of the upper triangular factor
344  // (L^T) of the Cholesky decomposition of the matrix B
347 
348  // compute upper triangular factor L^T of the Cholesky decomposion of the
349  // matrix: B = tS^(1/2) * K * tS^(1/2) + I
350  LLT<MatrixXd> eigen_chol((eigen_sttau*eigen_sttau.adjoint()).cwiseProduct(
351  eigen_K*CMath::exp(m_log_scale*2.0))+
352  MatrixXd::Identity(m_L.num_rows, m_L.num_cols));
353 
354  eigen_L=eigen_chol.matrixU();
355 }
356 
358 {
359  // create eigen representations of kernel matrix, L^T matrix and sqrt(ttau)
362  Map<VectorXd> eigen_sttau(m_sttau.vector, m_sttau.vlen);
363 
364  // create shogun and eigen representation of the approximate covariance
365  // matrix
367  Map<MatrixXd> eigen_Sigma(m_Sigma.matrix, m_Sigma.num_rows, m_Sigma.num_cols);
368 
369  // compute V = L^(-1) * tS^(1/2) * K, using upper triangular factor L^T
370  MatrixXd eigen_V=eigen_L.triangularView<Upper>().adjoint().solve(
371  eigen_sttau.asDiagonal()*eigen_K*CMath::exp(m_log_scale*2.0));
372 
373  // compute covariance matrix of the posterior:
374  // Sigma = K - K * tS^(1/2) * (L * L^T)^(-1) * tS^(1/2) * K =
375  // K - (K * tS^(1/2)) * (L^T)^(-1) * L^(-1) * tS^(1/2) * K =
376  // K - (tS^(1/2) * K)^T * (L^(-1))^T * L^(-1) * tS^(1/2) * K = K - V^T * V
377  eigen_Sigma=eigen_K*CMath::exp(m_log_scale*2.0)-eigen_V.adjoint()*eigen_V;
378 }
379 
381 {
382  // create eigen representation of posterior covariance matrix and tnu
383  Map<MatrixXd> eigen_Sigma(m_Sigma.matrix, m_Sigma.num_rows, m_Sigma.num_cols);
384  Map<VectorXd> eigen_tnu(m_tnu.vector, m_tnu.vlen);
385 
386  // create shogun and eigen representation of the approximate mean vector
387  CREATE_SGVECTOR(m_mu, m_tnu.vlen, float64_t);
388  Map<VectorXd> eigen_mu(m_mu.vector, m_mu.vlen);
389 
390  // compute mean vector of the approximate posterior: mu = Sigma * tnu
391  eigen_mu=eigen_Sigma*eigen_tnu;
392 }
393 
395 {
396  // create eigen representation of Sigma, L, mu, tnu, ttau
397  Map<MatrixXd> eigen_Sigma(m_Sigma.matrix, m_Sigma.num_rows, m_Sigma.num_cols);
399  Map<VectorXd> eigen_mu(m_mu.vector, m_mu.vlen);
400  Map<VectorXd> eigen_tnu(m_tnu.vector, m_tnu.vlen);
401  Map<VectorXd> eigen_ttau(m_ttau.vector, m_ttau.vlen);
402 
403  // get and create eigen representation of the mean vector
405  Map<VectorXd> eigen_m(m.vector, m.vlen);
406 
407  // compute vector of cavity parameter tau_n
408  VectorXd eigen_tau_n=(VectorXd::Ones(m_ttau.vlen)).cwiseQuotient(
409  eigen_Sigma.diagonal())-eigen_ttau;
410 
411  // compute vector of cavity parameter nu_n
412  VectorXd eigen_nu_n=eigen_mu.cwiseQuotient(eigen_Sigma.diagonal())-
413  eigen_tnu+eigen_m.cwiseProduct(eigen_tau_n);
414 
415  // compute cavity mean: mu_n=nu_n/tau_n
416  SGVector<float64_t> mu_n(m_ttau.vlen);
417  Map<VectorXd> eigen_mu_n(mu_n.vector, mu_n.vlen);
418 
419  eigen_mu_n=eigen_nu_n.cwiseQuotient(eigen_tau_n);
420 
421  // compute cavity variance: s2_n=1.0/tau_n
422  SGVector<float64_t> s2_n(m_ttau.vlen);
423  Map<VectorXd> eigen_s2_n(s2_n.vector, s2_n.vlen);
424 
425  eigen_s2_n=(VectorXd::Ones(m_ttau.vlen)).cwiseQuotient(eigen_tau_n);
426 
428  m_model->get_log_zeroth_moments(mu_n, s2_n, m_labels));
429 
430  // compute nlZ_part1=sum(log(diag(L)))-sum(lZ)-tnu'*Sigma*tnu/2
431  float64_t nlZ_part1=eigen_L.diagonal().array().log().sum()-lZ-
432  (eigen_tnu.adjoint()*eigen_Sigma).dot(eigen_tnu)/2.0;
433 
434  // compute nlZ_part2=sum(tnu.^2./(tau_n+ttau))/2-sum(log(1+ttau./tau_n))/2
435  float64_t nlZ_part2=(eigen_tnu.array().square()/
436  (eigen_tau_n+eigen_ttau).array()).sum()/2.0-(1.0+eigen_ttau.array()/
437  eigen_tau_n.array()).log().sum()/2.0;
438 
439  // compute nlZ_part3=-(nu_n-m.*tau_n)'*((ttau./tau_n.*(nu_n-m.*tau_n)-2*tnu)
440  // ./(ttau+tau_n))/2
441  float64_t nlZ_part3=-(eigen_nu_n-eigen_m.cwiseProduct(eigen_tau_n)).dot(
442  ((eigen_ttau.array()/eigen_tau_n.array()*(eigen_nu_n.array()-
443  eigen_m.array()*eigen_tau_n.array())-2*eigen_tnu.array())/
444  (eigen_ttau.array()+eigen_tau_n.array())).matrix())/2.0;
445 
446  // compute nlZ=nlZ_part1+nlZ_part2+nlZ_part3
447  m_nlZ=nlZ_part1+nlZ_part2+nlZ_part3;
448 }
449 
451 {
452  // create eigen representation of L, sstau, alpha
454  Map<VectorXd> eigen_sttau(m_sttau.vector, m_sttau.vlen);
455  Map<VectorXd> eigen_alpha(m_alpha.vector, m_alpha.vlen);
456 
457  // create shogun and eigen representation of F
459  Map<MatrixXd> eigen_F(m_F.matrix, m_F.num_rows, m_F.num_cols);
460 
461  // solve L*L^T * V = diag(sqrt(ttau))
462  MatrixXd V=eigen_L.triangularView<Upper>().adjoint().solve(
463  MatrixXd(eigen_sttau.asDiagonal()));
464  V=eigen_L.triangularView<Upper>().solve(V);
465 
466  // compute F=alpha*alpha'-repmat(sW,1,n).*solve_chol(L,diag(sW))
467  eigen_F=eigen_alpha*eigen_alpha.adjoint()-eigen_sttau.asDiagonal()*V;
468 }
469 
471  const TParameter* param)
472 {
473  REQUIRE(!strcmp(param->m_name, "log_scale"), "Can't compute derivative of "
474  "the nagative log marginal likelihood wrt %s.%s parameter\n",
475  get_name(), param->m_name)
476 
478  Map<MatrixXd> eigen_F(m_F.matrix, m_F.num_rows, m_F.num_cols);
479 
480  SGVector<float64_t> result(1);
481 
482  // compute derivative wrt kernel scale: dnlZ=-sum(F.*K*scale*2)/2
483  result[0]=-(eigen_F.cwiseProduct(eigen_K)).sum();
484  result[0]*=CMath::exp(m_log_scale*2.0);
485 
486  return result;
487 }
488 
490  const TParameter* param)
491 {
493  return SGVector<float64_t>();
494 }
495 
497  const TParameter* param)
498 {
499  // create eigen representation of the matrix Q
500  Map<MatrixXd> eigen_F(m_F.matrix, m_F.num_rows, m_F.num_cols);
501 
502  REQUIRE(param, "Param not set\n");
503  SGVector<float64_t> result;
504  int64_t len=const_cast<TParameter *>(param)->m_datatype.get_num_elements();
505  result=SGVector<float64_t>(len);
506 
507  for (index_t i=0; i<result.vlen; i++)
508  {
510 
511  if (result.vlen==1)
512  dK=m_kernel->get_parameter_gradient(param);
513  else
514  dK=m_kernel->get_parameter_gradient(param, i);
515 
516  Map<MatrixXd> eigen_dK(dK.matrix, dK.num_rows, dK.num_cols);
517 
518  // compute derivative wrt kernel parameter: dnlZ=-sum(F.*dK*scale^2)/2.0
519  result[i]=-(eigen_F.cwiseProduct(eigen_dK)).sum();
520  result[i]*=CMath::exp(m_log_scale*2.0)/2.0;
521  }
522 
523  return result;
524 }
525 
527  const TParameter* param)
528 {
530  return SGVector<float64_t>();
531 }
532 
float64_t m_log_scale
Definition: Inference.h:490
void range_fill(T start=0)
Definition: SGVector.cpp:171
virtual void update()
Definition: Inference.cpp:316
static void permute(SGVector< T > v, CRandom *rand=NULL)
Definition: Math.h:1144
virtual SGVector< float64_t > get_diagonal_vector()
virtual void update_parameter_hash()
Definition: SGObject.cpp:281
virtual SGVector< float64_t > get_alpha()
int32_t index_t
Definition: common.h:62
static CEPInferenceMethod * obtain_from_generic(CInference *inference)
Vector::Scalar dot(Vector a, Vector b)
Definition: Redux.h:58
The class Labels models labels, i.e. class assignments of objects.
Definition: Labels.h:43
static const float64_t INFTY
infinity
Definition: Math.h:2048
virtual EInferenceType get_inference_type() const
Definition: Inference.h:104
virtual SGMatrix< float64_t > get_posterior_covariance()
virtual int32_t get_num_labels() const =0
CKernel * m_kernel
Definition: Inference.h:469
Definition: SGMatrix.h:20
parameter struct
#define SG_ERROR(...)
Definition: SGIO.h:129
virtual SGVector< float64_t > get_log_zeroth_moments(SGVector< float64_t > mu, SGVector< float64_t > s2, const CLabels *lab) const =0
#define REQUIRE(x,...)
Definition: SGIO.h:206
#define SG_NOTIMPLEMENTED
Definition: SGIO.h:139
index_t num_cols
Definition: SGMatrix.h:376
virtual SGVector< float64_t > get_mean_vector(const CFeatures *features) const =0
#define CREATE_SGMATRIX(mat, rows, cols, sg_type)
virtual float64_t get_negative_log_marginal_likelihood()
virtual float64_t get_second_moment(SGVector< float64_t > mu, SGVector< float64_t > s2, const CLabels *lab, index_t i) const =0
An abstract class of the mean function.
Definition: MeanFunction.h:49
void scale(T alpha)
Scale vector inplace.
Definition: SGVector.cpp:841
virtual SGVector< float64_t > get_derivative_wrt_kernel(const TParameter *param)
#define SG_REF(x)
Definition: SGObject.h:54
index_t num_rows
Definition: SGMatrix.h:374
CFeatures * m_features
Definition: Inference.h:478
SGMatrix< float64_t > m_ktrtr
Definition: Inference.h:493
virtual SGVector< float64_t > get_posterior_mean()
CMeanFunction * m_mean
Definition: Inference.h:472
SGVector< T > get_diagonal_vector() const
Definition: SGMatrix.cpp:1095
index_t vlen
Definition: SGVector.h:492
virtual SGVector< float64_t > get_derivative_wrt_mean(const TParameter *param)
CLabels * m_labels
Definition: Inference.h:481
double float64_t
Definition: common.h:50
virtual void compute_gradient()
Definition: Inference.cpp:343
#define CREATE_SGVECTOR(vec, len, sg_type)
static T sum(T *vec, int32_t len)
Return sum(vec)
Definition: SGVector.h:352
SGMatrix< float64_t > m_L
Definition: Inference.h:487
static T max(T a, T b)
Definition: Math.h:168
Matrix< float64_t,-1,-1, 0,-1,-1 > MatrixXd
Definition: KLInference.h:52
virtual SGVector< float64_t > get_derivative_wrt_likelihood_model(const TParameter *param)
virtual void register_minimizer(Minimizer *minimizer)
#define SG_DEBUG(...)
Definition: SGIO.h:107
all of classes and functions are contained in the shogun namespace
Definition: class_list.h:18
virtual SGVector< float64_t > get_derivative_wrt_inference_method(const TParameter *param)
The Inference Method base class.
Definition: Inference.h:81
Class of the Expectation Propagation (EP) posterior approximation inference method.
The class Features is the base class of all feature objects.
Definition: Features.h:68
#define SG_SERROR(...)
Definition: SGIO.h:179
static float64_t exp(float64_t x)
Definition: Math.h:621
virtual SGMatrix< float64_t > get_parameter_gradient(const TParameter *param, index_t index=-1)
Definition: Kernel.h:851
The Kernel base class.
Definition: Kernel.h:159
The minimizer base class.
Definition: Minimizer.h:43
#define SG_WARNING(...)
Definition: SGIO.h:128
virtual SGMatrix< float64_t > get_cholesky()
static float32_t sqrt(float32_t x)
Definition: Math.h:459
virtual float64_t get_first_moment(SGVector< float64_t > mu, SGVector< float64_t > s2, const CLabels *lab, index_t i) const =0
virtual const char * get_name() const
CLikelihoodModel * m_model
Definition: Inference.h:475
virtual bool parameter_hash_changed()
Definition: SGObject.cpp:295
The Likelihood model base class.
SGVector< float64_t > m_alpha
Definition: Inference.h:484
static T abs(T a)
Definition: Math.h:179

SHOGUN Machine Learning Toolbox - Documentation