SHOGUN  3.2.1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
lbfgs.cpp
Go to the documentation of this file.
1 /*
2  * Limited memory BFGS (L-BFGS).
3  *
4  * Copyright (c) 1990, Jorge Nocedal
5  * Copyright (c) 2007-2010 Naoaki Okazaki
6  * All rights reserved.
7  *
8  * Permission is hereby granted, free of charge, to any person obtaining a copy
9  * of this software and associated documentation files (the "Software"), to deal
10  * in the Software without restriction, including without limitation the rights
11  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12  * copies of the Software, and to permit persons to whom the Software is
13  * furnished to do so, subject to the following conditions:
14  *
15  * The above copyright notice and this permission notice shall be included in
16  * all copies or substantial portions of the Software.
17  *
18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24  * THE SOFTWARE.
25  */
26 
27 /* $Id$ */
28 
29 /*
30 This library is a C port of the FORTRAN implementation of Limited-memory
31 Broyden-Fletcher-Goldfarb-Shanno (L-BFGS) method written by Jorge Nocedal.
32 The original FORTRAN source code is available at:
33 http://www.ece.northwestern.edu/~nocedal/lbfgs.html
34 
35 The L-BFGS algorithm is described in:
36  - Jorge Nocedal.
37  Updating Quasi-Newton Matrices with Limited Storage.
38  <i>Mathematics of Computation</i>, Vol. 35, No. 151, pp. 773--782, 1980.
39  - Dong C. Liu and Jorge Nocedal.
40  On the limited memory BFGS method for large scale optimization.
41  <i>Mathematical Programming</i> B, Vol. 45, No. 3, pp. 503-528, 1989.
42 
43 The line search algorithms used in this implementation are described in:
44  - John E. Dennis and Robert B. Schnabel.
45  <i>Numerical Methods for Unconstrained Optimization and Nonlinear
46  Equations</i>, Englewood Cliffs, 1983.
47  - Jorge J. More and David J. Thuente.
48  Line search algorithm with guaranteed sufficient decrease.
49  <i>ACM Transactions on Mathematical Software (TOMS)</i>, Vol. 20, No. 3,
50  pp. 286-307, 1994.
51 
52 This library also implements Orthant-Wise Limited-memory Quasi-Newton (OWL-QN)
53 method presented in:
54  - Galen Andrew and Jianfeng Gao.
55  Scalable training of L1-regularized log-linear models.
56  In <i>Proceedings of the 24th International Conference on Machine
57  Learning (ICML 2007)</i>, pp. 33-40, 2007.
58 
59 I would like to thank the original author, Jorge Nocedal, who has been
60 distributing the efficient and explanatory implementation in an open source
61 licence.
62 */
63 
64 #include <algorithm>
65 #include <cstdio>
66 #include <cmath>
67 #include <string.h>
68 
70 #include <shogun/lib/SGVector.h>
71 #include <shogun/lib/common.h>
72 #include <shogun/lib/memory.h>
73 
74 namespace shogun
75 {
76 
/* Two- and three-way min/max helpers. Arguments are parenthesized but may
 * be evaluated more than once -- do not pass expressions with side effects. */
#define min2(a, b) ((a) <= (b) ? (a) : (b))
#define max2(a, b) ((a) >= (b) ? (a) : (b))
/* FIX(review): removed the stray trailing ';' from the max3 expansion.
 * The old definition expanded `s = max3(p, q, r);` to `s = ...;;` (an
 * extra empty statement) and made max3 unusable inside an expression. */
#define max3(a, b, c) max2(max2((a), (b)), (c))
80 
82  int32_t n;
83  void *instance;
86 };
88 
91  float64_t *s; /* [n] */
92  float64_t *y; /* [n] */
93  float64_t ys; /* vecdot(y, s) */
94 };
96 
97 static const lbfgs_parameter_t _defparam = {
98  6, 1e-5, 0, 0.0,
100  1e-20, 1e20, 1e-4, 0.9, 0.9, 1.0e-16,
101  0.0, 0, 1,
102 };
103 
104 /* Forward function declarations. */
105 
106 typedef int32_t (*line_search_proc)(
107  int32_t n,
108  float64_t *x,
109  float64_t *f,
110  float64_t *g,
111  float64_t *s,
112  float64_t *stp,
113  const float64_t* xp,
114  const float64_t* gp,
115  float64_t *wa,
116  callback_data_t *cd,
117  const lbfgs_parameter_t *param
118  );
119 
120 static int32_t line_search_backtracking(
121  int32_t n,
122  float64_t *x,
123  float64_t *f,
124  float64_t *g,
125  float64_t *s,
126  float64_t *stp,
127  const float64_t* xp,
128  const float64_t* gp,
129  float64_t *wa,
130  callback_data_t *cd,
131  const lbfgs_parameter_t *param
132  );
133 
134 static int32_t line_search_backtracking_owlqn(
135  int32_t n,
136  float64_t *x,
137  float64_t *f,
138  float64_t *g,
139  float64_t *s,
140  float64_t *stp,
141  const float64_t* xp,
142  const float64_t* gp,
143  float64_t *wp,
144  callback_data_t *cd,
145  const lbfgs_parameter_t *param
146  );
147 
148 static int32_t line_search_morethuente(
149  int32_t n,
150  float64_t *x,
151  float64_t *f,
152  float64_t *g,
153  float64_t *s,
154  float64_t *stp,
155  const float64_t* xp,
156  const float64_t* gp,
157  float64_t *wa,
158  callback_data_t *cd,
159  const lbfgs_parameter_t *param
160  );
161 
162 static int32_t update_trial_interval(
163  float64_t *x,
164  float64_t *fx,
165  float64_t *dx,
166  float64_t *y,
167  float64_t *fy,
168  float64_t *dy,
169  float64_t *t,
170  float64_t *ft,
171  float64_t *dt,
172  const float64_t tmin,
173  const float64_t tmax,
174  int32_t *brackt
175  );
176 
177 static float64_t owlqn_x1norm(
178  const float64_t* x,
179  const int32_t start,
180  const int32_t n
181  );
182 
183 static void owlqn_pseudo_gradient(
184  float64_t* pg,
185  const float64_t* x,
186  const float64_t* g,
187  const int32_t n,
188  const float64_t c,
189  const int32_t start,
190  const int32_t end
191  );
192 
193 static void owlqn_project(
194  float64_t* d,
195  const float64_t* sign,
196  const int32_t start,
197  const int32_t end
198  );
199 
200 
/* lbfgs_parameter_init(param): initialize *param with the library defaults
 * by copying the static _defparam structure into it.
 * NOTE(review): the function's signature line is missing from this
 * extraction (presumably `void lbfgs_parameter_init(lbfgs_parameter_t *param)`
 * -- confirm against the original source); body left byte-identical. */
202 {
203  memcpy(param, &_defparam, sizeof(*param));
204 }
205 
/*
 * lbfgs(): main driver of the L-BFGS / OWL-QN optimizer.
 *
 * Minimizes the objective reported by proc_evaluate, starting from x
 * (length n), writing the final objective value to *ptr_fx if non-NULL.
 * proc_progress (optional) is invoked once per iteration and may abort the
 * run by returning non-zero. _param == NULL selects the built-in defaults.
 * Returns LBFGS_SUCCESS / LBFGS_STOP on success or a negative LBFGSERR_*
 * code on failure.
 *
 * NOTE(review): this extraction has dropped several physical lines --
 * most error `return LBFGSERR_*;` statements, the `case` labels of the two
 * line-search `switch` blocks, and trailing arguments of the
 * owlqn_pseudo_gradient() calls. All remaining tokens are byte-identical
 * to the original; do not compile this text as-is.
 */
206 int32_t lbfgs(
207  int32_t n,
208  float64_t *x,
209  float64_t *ptr_fx,
210  lbfgs_evaluate_t proc_evaluate,
211  lbfgs_progress_t proc_progress,
212  void *instance,
213  lbfgs_parameter_t *_param
214  )
215 {
216  int32_t ret;
217  int32_t i, j, k, ls, end, bound;
218  float64_t step;
219 
220  /* Constant parameters and their default values. */
221  lbfgs_parameter_t param = (_param != NULL) ? (*_param) : _defparam;
222  const int32_t m = param.m;
223 
224  float64_t *xp = NULL;
225  float64_t *g = NULL, *gp = NULL, *pg = NULL;
226  float64_t *d = NULL, *w = NULL, *pf = NULL;
227  iteration_data_t *lm = NULL, *it = NULL;
228  float64_t ys, yy;
229  float64_t xnorm, gnorm, beta;
230  float64_t fx = 0.;
231  float64_t rate = 0.;
233 
234  /* Construct a callback data. */
235  callback_data_t cd;
236  cd.n = n;
237  cd.instance = instance;
238  cd.proc_evaluate = proc_evaluate;
239  cd.proc_progress = proc_progress;
240 
241  /* Check the input parameters for errors. */
242  if (n <= 0) {
243  return LBFGSERR_INVALID_N;
244  }
/* NOTE(review): the bodies of several validation branches below lost their
 * `return LBFGSERR_*;` lines in extraction. */
245  if (param.epsilon < 0.) {
247  }
248  if (param.past < 0) {
250  }
251  if (param.delta < 0.) {
252  return LBFGSERR_INVALID_DELTA;
253  }
254  if (param.min_step < 0.) {
256  }
257  if (param.max_step < param.min_step) {
259  }
260  if (param.ftol < 0.) {
261  return LBFGSERR_INVALID_FTOL;
262  }
265  if (param.wolfe <= param.ftol || 1. <= param.wolfe) {
266  return LBFGSERR_INVALID_WOLFE;
267  }
268  }
269  if (param.gtol < 0.) {
270  return LBFGSERR_INVALID_GTOL;
271  }
272  if (param.xtol < 0.) {
273  return LBFGSERR_INVALID_XTOL;
274  }
275  if (param.max_linesearch <= 0) {
277  }
278  if (param.orthantwise_c < 0.) {
280  }
281  if (param.orthantwise_start < 0 || n < param.orthantwise_start) {
283  }
284  if (param.orthantwise_end < 0) {
285  param.orthantwise_end = n;
286  }
287  if (n < param.orthantwise_end) {
289  }
/* Select the line-search routine: OWL-QN (orthantwise_c != 0) only supports
 * backtracking; otherwise More-Thuente or plain backtracking is chosen from
 * param.linesearch. NOTE(review): `case` labels of both switches are missing
 * from this extraction. */
290  if (param.orthantwise_c != 0.) {
291  switch (param.linesearch) {
293  linesearch = line_search_backtracking_owlqn;
294  break;
295  default:
296  /* Only the backtracking method is available. */
298  }
299  } else {
300  switch (param.linesearch) {
302  linesearch = line_search_morethuente;
303  break;
307  linesearch = line_search_backtracking;
308  break;
309  default:
311  }
312  }
313 
314  /* Allocate working space. */
315  xp = SG_CALLOC(float64_t, n);
316  g = SG_CALLOC(float64_t, n);
317  gp = SG_CALLOC(float64_t, n);
318  d = SG_CALLOC(float64_t, n);
319  w = SG_CALLOC(float64_t, n);
320  if (xp == NULL || g == NULL || gp == NULL || d == NULL || w == NULL) {
321  ret = LBFGSERR_OUTOFMEMORY;
322  goto lbfgs_exit;
323  }
324 
325  if (param.orthantwise_c != 0.) {
326  /* Allocate working space for OW-LQN. */
327  pg = SG_CALLOC(float64_t, n);
328  if (pg == NULL) {
329  ret = LBFGSERR_OUTOFMEMORY;
330  goto lbfgs_exit;
331  }
332  }
333 
334  /* Allocate limited memory storage. */
335  lm = SG_CALLOC(iteration_data_t, m);
336  if (lm == NULL) {
337  ret = LBFGSERR_OUTOFMEMORY;
338  goto lbfgs_exit;
339  }
340 
341  /* Initialize the limited memory. */
342  for (i = 0;i < m;++i) {
343  it = &lm[i];
344  it->alpha = 0;
345  it->ys = 0;
346  it->s = SG_CALLOC(float64_t, n);
347  it->y = SG_CALLOC(float64_t, n);
348  if (it->s == NULL || it->y == NULL) {
349  ret = LBFGSERR_OUTOFMEMORY;
350  goto lbfgs_exit;
351  }
352  }
353 
354  /* Allocate an array for storing previous values of the objective function. */
355  if (0 < param.past) {
356  pf = SG_CALLOC(float64_t, param.past);
357  }
358 
359  /* Evaluate the function value and its gradient. */
360  fx = cd.proc_evaluate(cd.instance, x, g, cd.n, 0);
361  if (0. != param.orthantwise_c) {
362  /* Compute the L1 norm of the variable and add it to the object value. */
363  xnorm = owlqn_x1norm(x, param.orthantwise_start, param.orthantwise_end);
364  fx += xnorm * param.orthantwise_c;
/* NOTE(review): the owlqn_pseudo_gradient(...) call head and its trailing
 * arguments are missing from this extraction. */
366  pg, x, g, n,
368  );
369  }
370 
371  /* Store the initial value of the objective function. */
372  if (pf != NULL) {
373  pf[0] = fx;
374  }
375 
376  /*
377  Compute the direction;
378  we assume the initial hessian matrix H_0 as the identity matrix.
379  */
380  if (param.orthantwise_c == 0.) {
381  std::copy(g,g+n,d);
383  } else {
384  std::copy(pg,pg+n,d);
386  }
387 
388  /*
389  Make sure that the initial variables are not a minimizer.
390  */
391  xnorm = SGVector<float64_t>::twonorm(x, n);
392  if (param.orthantwise_c == 0.) {
393  gnorm = SGVector<float64_t>::twonorm(g, n);
394  } else {
395  gnorm = SGVector<float64_t>::twonorm(pg, n);
396  }
397  if (xnorm < 1.0) xnorm = 1.0;
398  if (gnorm / xnorm <= param.epsilon) {
400  goto lbfgs_exit;
401  }
402 
403  /* Compute the initial step:
404  step = 1.0 / sqrt(vecdot(d, d, n))
405  */
406  step = 1.0 / SGVector<float64_t>::twonorm(d, n);
407 
408  k = 1;
409  end = 0;
410  for (;;) {
411  /* Store the current position and gradient vectors. */
412  std::copy(x,x+n,xp);
413  std::copy(g,g+n,gp);
414 
415  /* Search for an optimal step. */
416  if (param.orthantwise_c == 0.) {
417  ls = linesearch(n, x, &fx, g, d, &step, xp, gp, w, &cd, &param);
418  } else {
419  ls = linesearch(n, x, &fx, g, d, &step, xp, pg, w, &cd, &param);
421  pg, x, g, n,
423  );
424  }
425  if (ls < 0) {
426  /* Revert to the previous point. */
427  std::copy(xp,xp+n,x);
428  std::copy(gp,gp+n,g);
429  ret = ls;
430  goto lbfgs_exit;
431  }
432 
433  /* Compute x and g norms. */
434  xnorm = SGVector<float64_t>::twonorm(x, n);
435  if (param.orthantwise_c == 0.) {
436  gnorm = SGVector<float64_t>::twonorm(g, n);
437  } else {
438  gnorm = SGVector<float64_t>::twonorm(pg, n);
439  }
440 
441  /* Report the progress. */
442  if (cd.proc_progress) {
443  if ((ret = cd.proc_progress(cd.instance, x, g, fx, xnorm, gnorm, step, cd.n, k, ls))) {
444  goto lbfgs_exit;
445  }
446  }
447 
448  /*
449  Convergence test.
450  The criterion is given by the following formula:
451  |g(x)| / \max2(1, |x|) < \epsilon
452  */
453  if (xnorm < 1.0) xnorm = 1.0;
454  if (gnorm / xnorm <= param.epsilon) {
455  /* Convergence. */
456  ret = LBFGS_SUCCESS;
457  break;
458  }
459 
460  /*
461  Test for stopping criterion.
462  The criterion is given by the following formula:
463  (f(past_x) - f(x)) / f(x) < \delta
464  */
465  if (pf != NULL) {
466  /* We don't test the stopping criterion while k < past. */
467  if (param.past <= k) {
468  /* Compute the relative improvement from the past. */
469  rate = (pf[k % param.past] - fx) / fx;
470 
471  /* The stopping criterion. */
472  if (rate < param.delta) {
473  ret = LBFGS_STOP;
474  break;
475  }
476  }
477 
478  /* Store the current value of the objective function. */
479  pf[k % param.past] = fx;
480  }
481 
482  if (param.max_iterations != 0 && param.max_iterations < k+1) {
483  /* Maximum number of iterations. */
485  break;
486  }
487 
488  /*
489  Update vectors s and y:
490  s_{k+1} = x_{k+1} - x_{k} = \step * d_{k}.
491  y_{k+1} = g_{k+1} - g_{k}.
492  */
493  it = &lm[end];
494  SGVector<float64_t>::add(it->s, 1, x, -1, xp, n);
495  SGVector<float64_t>::add(it->y, 1, g, -1, gp, n);
496 
497  /*
498  Compute scalars ys and yy:
499  ys = y^t \cdot s = 1 / \rho.
500  yy = y^t \cdot y.
501  Notice that yy is used for scaling the hessian matrix H_0 (Cholesky factor).
502  */
503  ys = SGVector<float64_t>::dot(it->y, it->s, n);
504  yy = SGVector<float64_t>::dot(it->y, it->y, n);
505  it->ys = ys;
506 
507  /*
508  Recursive formula to compute dir = -(H \cdot g).
509  This is described in page 779 of:
510  Jorge Nocedal.
511  Updating Quasi-Newton Matrices with Limited Storage.
512  Mathematics of Computation, Vol. 35, No. 151,
513  pp. 773--782, 1980.
514  */
515  bound = (m <= k) ? m : k;
516  ++k;
517  end = (end + 1) % m;
518 
519  /* Compute the steepest direction. */
520  if (param.orthantwise_c == 0.) {
521  /* Compute the negative of gradients. */
522  std::copy(g, g+n, d);
524  } else {
525  std::copy(pg, pg+n, d);
527  }
528 
/* First loop of the two-loop recursion: walk the history backwards,
 * computing alpha_j and updating q. */
529  j = end;
530  for (i = 0;i < bound;++i) {
531  j = (j + m - 1) % m; /* if (--j == -1) j = m-1; */
532  it = &lm[j];
533  /* \alpha_{j} = \rho_{j} s^{t}_{j} \cdot q_{k+1}. */
534  it->alpha = SGVector<float64_t>::dot(it->s, d, n);
535  it->alpha /= it->ys;
536  /* q_{i} = q_{i+1} - \alpha_{i} y_{i}. */
537  SGVector<float64_t>::add(d, 1, d, -it->alpha, it->y, n);
538  }
539 
/* Scale by ys/yy, the standard H_0 = (ys/yy) I initial Hessian guess. */
540  SGVector<float64_t>::scale_vector(ys / yy, d, n);
541 
/* Second loop of the two-loop recursion: walk forwards, applying the
 * beta corrections. */
542  for (i = 0;i < bound;++i) {
543  it = &lm[j];
544  /* \beta_{j} = \rho_{j} y^t_{j} \cdot \gamma_{i}. */
545  beta = SGVector<float64_t>::dot(it->y, d, n);
546  beta /= it->ys;
547  /* \gamma_{i+1} = \gamma_{i} + (\alpha_{j} - \beta_{j}) s_{j}. */
548  SGVector<float64_t>::add(d, 1, d, it->alpha-beta, it->s, n);
549  j = (j + 1) % m; /* if (++j == m) j = 0; */
550  }
551 
552  /*
553  Constrain the search direction for orthant-wise updates.
554  */
555  if (param.orthantwise_c != 0.) {
556  for (i = param.orthantwise_start;i < param.orthantwise_end;++i) {
557  if (d[i] * pg[i] >= 0) {
558  d[i] = 0;
559  }
560  }
561  }
562 
563  /*
564  Now the search direction d is ready. We try step = 1 first.
565  */
566  step = 1.0;
567  }
568 
/* Single exit point: report fx and release all working storage. */
569 lbfgs_exit:
570  /* Return the final value of the objective function. */
571  if (ptr_fx != NULL) {
572  *ptr_fx = fx;
573  }
574 
575  SG_FREE(pf);
576 
577  /* Free memory blocks used by this function. */
578  if (lm != NULL) {
579  for (i = 0;i < m;++i) {
580  SG_FREE(lm[i].s);
581  SG_FREE(lm[i].y);
582  }
583  SG_FREE(lm);
584  }
585  SG_FREE(pg);
586  SG_FREE(w);
587  SG_FREE(d);
588  SG_FREE(gp);
589  SG_FREE(g);
590  SG_FREE(xp);
591 
592  return ret;
593 }
594 
595 
596 
/*
 * line_search_backtracking: backtracking line search. Starting from *stp,
 * repeatedly shrinks (width=0.5) or grows (width=2.1) the step until the
 * Armijo / Wolfe / strong-Wolfe condition (depending on param->linesearch)
 * holds; returns the number of evaluations on success or LBFGSERR_* on
 * failure. Updates x, *f, g and *stp in place.
 * NOTE(review): the signature line (`static int32_t line_search_backtracking(`),
 * the two `if (param->linesearch == ...)` condition lines, and some error
 * `return` lines are missing from this extraction; code left byte-identical.
 */
598  int32_t n,
599  float64_t *x,
600  float64_t *f,
601  float64_t *g,
602  float64_t *s,
603  float64_t *stp,
604  const float64_t* xp,
605  const float64_t* gp,
606  float64_t *wp,
607  callback_data_t *cd,
608  const lbfgs_parameter_t *param
609  )
610 {
611  int32_t count = 0;
612  float64_t width, dg;
613  float64_t finit, dginit = 0., dgtest;
614  const float64_t dec = 0.5, inc = 2.1;
615 
616  /* Check the input parameters for errors. */
617  if (*stp <= 0.) {
619  }
620 
621  /* Compute the initial gradient in the search direction. */
622  dginit = SGVector<float64_t>::dot(g, s, n);
623 
624  /* Make sure that s points to a descent direction. */
625  if (0 < dginit) {
627  }
628 
629  /* The initial value of the objective function. */
630  finit = *f;
631  dgtest = param->ftol * dginit;
632 
633  for (;;) {
/* Trial point: x = xp + (*stp) * s. */
634  std::copy(xp,xp+n,x);
635  SGVector<float64_t>::add(x, 1, x, *stp, s, n);
636 
637  /* Evaluate the function and gradient values. */
638  *f = cd->proc_evaluate(cd->instance, x, g, cd->n, *stp);
639 
640  ++count;
641 
642  if (*f > finit + *stp * dgtest) {
643  width = dec;
644  } else {
645  /* The sufficient decrease condition (Armijo condition). */
647  /* Exit with the Armijo condition. */
648  return count;
649  }
650 
651  /* Check the Wolfe condition. */
652  dg = SGVector<float64_t>::dot(g, s, n);
653  if (dg < param->wolfe * dginit) {
654  width = inc;
655  } else {
657  /* Exit with the regular Wolfe condition. */
658  return count;
659  }
660 
661  /* Check the strong Wolfe condition. */
662  if(dg > -param->wolfe * dginit) {
663  width = dec;
664  } else {
665  /* Exit with the strong Wolfe condition. */
666  return count;
667  }
668  }
669  }
670 
671  if (*stp < param->min_step) {
672  /* The step is the minimum value. */
673  return LBFGSERR_MINIMUMSTEP;
674  }
675  if (*stp > param->max_step) {
676  /* The step is the maximum value. */
677  return LBFGSERR_MAXIMUMSTEP;
678  }
679  if (param->max_linesearch <= count) {
680  /* Maximum number of iteration. */
682  }
683 
684  (*stp) *= width;
685  }
686 }
687 
688 
689 
/*
 * line_search_backtracking_owlqn: backtracking line search for the OWL-QN
 * (L1-regularized) case. Each trial point is projected onto the orthant
 * chosen from xp/gp (stored in wp), the L1 penalty is added to *f, and the
 * step is halved until a sufficient-decrease condition on the projected
 * path holds. Returns evaluation count on success or LBFGSERR_* on failure.
 * NOTE(review): the signature line (`static int32_t
 * line_search_backtracking_owlqn(`) and some error `return` lines are
 * missing from this extraction; code left byte-identical.
 */
691  int32_t n,
692  float64_t *x,
693  float64_t *f,
694  float64_t *g,
695  float64_t *s,
696  float64_t *stp,
697  const float64_t* xp,
698  const float64_t* gp,
699  float64_t *wp,
700  callback_data_t *cd,
701  const lbfgs_parameter_t *param
702  )
703 {
704  int32_t i, count = 0;
705  float64_t width = 0.5, norm = 0.;
706  float64_t finit = *f, dgtest;
707 
708  /* Check the input parameters for errors. */
709  if (*stp <= 0.) {
711  }
712 
713  /* Choose the orthant for the new point. */
714  for (i = 0;i < n;++i) {
715  wp[i] = (xp[i] == 0.) ? -gp[i] : xp[i];
716  }
717 
718  for (;;) {
719  /* Update the current point. */
720  std::copy(xp,xp+n,x);
721  SGVector<float64_t>::add(x, 1, x, *stp, s, n);
722 
723  /* The current point is projected onto the orthant. */
724  owlqn_project(x, wp, param->orthantwise_start, param->orthantwise_end);
725 
726  /* Evaluate the function and gradient values. */
727  *f = cd->proc_evaluate(cd->instance, x, g, cd->n, *stp);
728 
729  /* Compute the L1 norm of the variables and add it to the object value. */
730  norm = owlqn_x1norm(x, param->orthantwise_start, param->orthantwise_end);
731  *f += norm * param->orthantwise_c;
732 
733  ++count;
734 
/* Directional-derivative proxy along the actual (projected) move. */
735  dgtest = 0.;
736  for (i = 0;i < n;++i) {
737  dgtest += (x[i] - xp[i]) * gp[i];
738  }
739 
740  if (*f <= finit + param->ftol * dgtest) {
741  /* The sufficient decrease condition. */
742  return count;
743  }
744 
745  if (*stp < param->min_step) {
746  /* The step is the minimum value. */
747  return LBFGSERR_MINIMUMSTEP;
748  }
749  if (*stp > param->max_step) {
750  /* The step is the maximum value. */
751  return LBFGSERR_MAXIMUMSTEP;
752  }
753  if (param->max_linesearch <= count) {
754  /* Maximum number of iteration. */
756  }
757 
758  (*stp) *= width;
759  }
760 }
761 
762 
763 
/*
 * line_search_morethuente: Moré-Thuente line search (TOMS 20(3), 1994).
 * Maintains an interval of uncertainty [stx, sty] and iteratively refines a
 * trial step *stp via update_trial_interval() until the sufficient-decrease
 * and curvature (strong Wolfe) conditions hold. Updates x, *f, g, *stp in
 * place; returns the number of evaluations or a negative LBFGSERR_* code.
 * NOTE(review): several error `return LBFGSERR_*;` lines are missing from
 * this extraction; code left byte-identical.
 */
764 static int32_t line_search_morethuente(
765  int32_t n,
766  float64_t *x,
767  float64_t *f,
768  float64_t *g,
769  float64_t *s,
770  float64_t *stp,
771  const float64_t* xp,
772  const float64_t* gp,
773  float64_t *wa,
774  callback_data_t *cd,
775  const lbfgs_parameter_t *param
776  )
777 {
778  int32_t count = 0;
779  int32_t brackt, stage1, uinfo = 0;
780  float64_t dg;
781  float64_t stx, fx, dgx;
782  float64_t sty, fy, dgy;
783  float64_t fxm, dgxm, fym, dgym, fm, dgm;
784  float64_t finit, ftest1, dginit, dgtest;
785  float64_t width, prev_width;
786  float64_t stmin, stmax;
787 
788  /* Check the input parameters for errors. */
789  if (*stp <= 0.) {
791  }
792 
793  /* Compute the initial gradient in the search direction. */
794  dginit = SGVector<float64_t>::dot(g, s, n);
795 
796  /* Make sure that s points to a descent direction. */
797  if (0 < dginit) {
799  }
800 
801  /* Initialize local variables. */
802  brackt = 0;
803  stage1 = 1;
804  finit = *f;
805  dgtest = param->ftol * dginit;
806  width = param->max_step - param->min_step;
807  prev_width = 2.0 * width;
808 
809  /*
810  The variables stx, fx, dgx contain the values of the step,
811  function, and directional derivative at the best step.
812  The variables sty, fy, dgy contain the value of the step,
813  function, and derivative at the other endpoint of
814  the interval of uncertainty.
815  The variables stp, f, dg contain the values of the step,
816  function, and derivative at the current step.
817  */
818  stx = sty = 0.;
819  fx = fy = finit;
820  dgx = dgy = dginit;
821 
822  for (;;) {
823  /*
824  Set the minimum and maximum steps to correspond to the
825  present interval of uncertainty.
826  */
827  if (brackt) {
828  stmin = min2(stx, sty);
829  stmax = max2(stx, sty);
830  } else {
831  stmin = stx;
832  stmax = *stp + 4.0 * (*stp - stx);
833  }
834 
835  /* Clip the step in the range of [stpmin, stpmax]. */
836  if (*stp < param->min_step) *stp = param->min_step;
837  if (param->max_step < *stp) *stp = param->max_step;
838 
839  /*
840  If an unusual termination is to occur then let
841  stp be the lowest point obtained so far.
842  */
843  if ((brackt && ((*stp <= stmin || stmax <= *stp) || param->max_linesearch <= count + 1 || uinfo != 0)) || (brackt && (stmax - stmin <= param->xtol * stmax))) {
844  *stp = stx;
845  }
846 
847  /*
848  Compute the current value of x:
849  x <- x + (*stp) * s.
850  */
851  std::copy(xp,xp+n,x);
852  SGVector<float64_t>::add(x, 1, x, *stp, s, n);
853 
854  /* Evaluate the function and gradient values. */
855  *f = cd->proc_evaluate(cd->instance, x, g, cd->n, *stp);
856  dg = SGVector<float64_t>::dot(g, s, n);
857 
858  ftest1 = finit + *stp * dgtest;
859  ++count;
860 
861  /* Test for errors and convergence. */
862  if (brackt && ((*stp <= stmin || stmax <= *stp) || uinfo != 0)) {
863  /* Rounding errors prevent further progress. */
865  }
866  if (*stp == param->max_step && *f <= ftest1 && dg <= dgtest) {
867  /* The step is the maximum value. */
868  return LBFGSERR_MAXIMUMSTEP;
869  }
870  if (*stp == param->min_step && (ftest1 < *f || dgtest <= dg)) {
871  /* The step is the minimum value. */
872  return LBFGSERR_MINIMUMSTEP;
873  }
874  if (brackt && (stmax - stmin) <= param->xtol * stmax) {
875  /* Relative width of the interval of uncertainty is at most xtol. */
876  return LBFGSERR_WIDTHTOOSMALL;
877  }
878  if (param->max_linesearch <= count) {
879  /* Maximum number of iteration. */
881  }
882  if (*f <= ftest1 && fabs(dg) <= param->gtol * (-dginit)) {
883  /* The sufficient decrease condition and the directional derivative condition hold. */
884  return count;
885  }
886 
887  /*
888  In the first stage we seek a step for which the modified
889  function has a nonpositive value and nonnegative derivative.
890  */
891  if (stage1 && *f <= ftest1 && min2(param->ftol, param->gtol) * dginit <= dg) {
892  stage1 = 0;
893  }
894 
895  /*
896  A modified function is used to predict the step only if
897  we have not obtained a step for which the modified
898  function has a nonpositive function value and nonnegative
899  derivative, and if a lower function value has been
900  obtained but the decrease is not sufficient.
901  */
902  if (stage1 && ftest1 < *f && *f <= fx) {
903  /* Define the modified function and derivative values. */
904  fm = *f - *stp * dgtest;
905  fxm = fx - stx * dgtest;
906  fym = fy - sty * dgtest;
907  dgm = dg - dgtest;
908  dgxm = dgx - dgtest;
909  dgym = dgy - dgtest;
910 
911  /*
912  Call update_trial_interval() to update the interval of
913  uncertainty and to compute the new step.
914  */
915  uinfo = update_trial_interval(
916  &stx, &fxm, &dgxm,
917  &sty, &fym, &dgym,
918  stp, &fm, &dgm,
919  stmin, stmax, &brackt
920  );
921 
922  /* Reset the function and gradient values for f. */
923  fx = fxm + stx * dgtest;
924  fy = fym + sty * dgtest;
925  dgx = dgxm + dgtest;
926  dgy = dgym + dgtest;
927  } else {
928  /*
929  Call update_trial_interval() to update the interval of
930  uncertainty and to compute the new step.
931  */
932  uinfo = update_trial_interval(
933  &stx, &fx, &dgx,
934  &sty, &fy, &dgy,
935  stp, f, &dg,
936  stmin, stmax, &brackt
937  );
938  }
939 
940  /*
941  Force a sufficient decrease in the interval of uncertainty.
942  */
943  if (brackt) {
944  if (0.66 * prev_width <= fabs(sty - stx)) {
945  *stp = stx + 0.5 * (sty - stx);
946  }
947  prev_width = width;
948  width = fabs(sty - stx);
949  }
950  }
951 
/* Unreachable: the loop above always returns. */
952  return LBFGSERR_LOGICERROR;
953 }
954 
955 
956 
/* Declares the scratch variables shared by the minimizer macros below.
 * Must appear once in any function that uses them. */
960 #define USES_MINIMIZER \
961  float64_t a, d, gamma, theta, p, q, r, s;
962 
/* CUBIC_MINIMIZER(cm, u, fu, du, v, fv, dv):
 * store in cm the minimizer of the cubic interpolating f(u)=fu, f'(u)=du,
 * f(v)=fv, f'(v)=dv. Multi-statement macro; uses the USES_MINIMIZER
 * scratch variables, so only safe in statement position. */
973 #define CUBIC_MINIMIZER(cm, u, fu, du, v, fv, dv) \
974  d = (v) - (u); \
975  theta = ((fu) - (fv)) * 3 / d + (du) + (dv); \
976  p = fabs(theta); \
977  q = fabs(du); \
978  r = fabs(dv); \
979  s = max3(p, q, r); \
980  /* gamma = s*sqrt((theta/s)**2 - (du/s) * (dv/s)) */ \
981  a = theta / s; \
982  gamma = s * sqrt(a * a - ((du) / s) * ((dv) / s)); \
983  if ((v) < (u)) gamma = -gamma; \
984  p = gamma - (du) + theta; \
985  q = gamma - (du) + gamma + (dv); \
986  r = p / q; \
987  (cm) = (u) + r * d;
988 
/* CUBIC_MINIMIZER2(cm, u, fu, du, v, fv, dv, xmin, xmax):
 * as CUBIC_MINIMIZER, but clamps the sqrt argument at 0 and falls back to
 * the bound xmin/xmax when the cubic has no interior minimizer. */
1001 #define CUBIC_MINIMIZER2(cm, u, fu, du, v, fv, dv, xmin, xmax) \
1002  d = (v) - (u); \
1003  theta = ((fu) - (fv)) * 3 / d + (du) + (dv); \
1004  p = fabs(theta); \
1005  q = fabs(du); \
1006  r = fabs(dv); \
1007  s = max3(p, q, r); \
1008  /* gamma = s*sqrt((theta/s)**2 - (du/s) * (dv/s)) */ \
1009  a = theta / s; \
1010  gamma = s * sqrt(max2(0, a * a - ((du) / s) * ((dv) / s))); \
1011  if ((u) < (v)) gamma = -gamma; \
1012  p = gamma - (dv) + theta; \
1013  q = gamma - (dv) + gamma + (du); \
1014  r = p / q; \
1015  if (r < 0. && gamma != 0.) { \
1016  (cm) = (v) - r * d; \
1017  } else if (a < 0) { \
1018  (cm) = (xmax); \
1019  } else { \
1020  (cm) = (xmin); \
1021  }
1022 
/* QUARD_MINIMIZER(qm, u, fu, du, v, fv): minimizer of the quadratic through
 * (u, fu) with slope du and through (v, fv). ("QUARD" is the original
 * liblbfgs spelling of "quadratic"; kept for API fidelity.) */
1032 #define QUARD_MINIMIZER(qm, u, fu, du, v, fv) \
1033  a = (v) - (u); \
1034  (qm) = (u) + (du) / (((fu) - (fv)) / a + (du)) / 2 * a;
1035 
/* QUARD_MINIMIZER2(qm, u, du, v, dv): minimizer of the quadratic (secant)
 * defined by the two slopes du at u and dv at v. */
1044 #define QUARD_MINIMIZER2(qm, u, du, v, dv) \
1045  a = (u) - (v); \
1046  (qm) = (v) + (dv) / ((dv) - (du)) * a;
1047 
/* fsigndiff(x, y): nonzero iff *x and *y have opposite signs
 * (divides by fabs(*y), so *y must be nonzero). */
1048 #define fsigndiff(x, y) (*(x) * (*(y) / fabs(*(y))) < 0.)
1049 
/*
 * update_trial_interval: one step of the Moré-Thuente interval update.
 * (x, fx, dx) is the best endpoint, (y, fy, dy) the other endpoint, and
 * (t, ft, dt) the current trial step with its function value/derivative.
 * Selects a new trial value via cubic/quadratic interpolation (cases 1-4),
 * updates the interval and *brackt, clips to [tmin, tmax], and writes the
 * result back to *t. Returns 0 on success, negative LBFGSERR_* on error.
 * NOTE(review): two error `return LBFGSERR_*;` lines in the validation
 * section are missing from this extraction; code left byte-identical.
 */
1079 static int32_t update_trial_interval(
1080  float64_t *x,
1081  float64_t *fx,
1082  float64_t *dx,
1083  float64_t *y,
1084  float64_t *fy,
1085  float64_t *dy,
1086  float64_t *t,
1087  float64_t *ft,
1088  float64_t *dt,
1089  const float64_t tmin,
1090  const float64_t tmax,
1091  int32_t *brackt
1092  )
1093 {
1094  int32_t bound;
1095  int32_t dsign = fsigndiff(dt, dx);
1096  float64_t mc; /* minimizer of an interpolated cubic. */
1097  float64_t mq; /* minimizer of an interpolated quadratic. */
1098  float64_t newt; /* new trial value. */
1099  USES_MINIMIZER; /* for CUBIC_MINIMIZER and QUARD_MINIMIZER. */
1100 
1101  /* Check the input parameters for errors. */
1102  if (*brackt) {
1103  if (*t <= min2(*x, *y) || max2(*x, *y) <= *t) {
1104  /* The trial value t is out of the interval. */
1105  return LBFGSERR_OUTOFINTERVAL;
1106  }
1107  if (0. <= *dx * (*t - *x)) {
1108  /* The function must decrease from x. */
1110  }
1111  if (tmax < tmin) {
1112  /* Incorrect tmin and tmax specified. */
1114  }
1115  }
1116 
1117  /*
1118  Trial value selection.
1119  */
1120  if (*fx < *ft) {
1121  /*
1122  Case 1: a higher function value.
1123  The minimum is brackt. If the cubic minimizer is closer
1124  to x than the quadratic one, the cubic one is taken, else
1125  the average of the minimizers is taken.
1126  */
1127  *brackt = 1;
1128  bound = 1;
1129  CUBIC_MINIMIZER(mc, *x, *fx, *dx, *t, *ft, *dt);
1130  QUARD_MINIMIZER(mq, *x, *fx, *dx, *t, *ft);
1131  if (fabs(mc - *x) < fabs(mq - *x)) {
1132  newt = mc;
1133  } else {
1134  newt = mc + 0.5 * (mq - mc);
1135  }
1136  } else if (dsign) {
1137  /*
1138  Case 2: a lower function value and derivatives of
1139  opposite sign. The minimum is brackt. If the cubic
1140  minimizer is closer to x than the quadratic (secant) one,
1141  the cubic one is taken, else the quadratic one is taken.
1142  */
1143  *brackt = 1;
1144  bound = 0;
1145  CUBIC_MINIMIZER(mc, *x, *fx, *dx, *t, *ft, *dt);
1146  QUARD_MINIMIZER2(mq, *x, *dx, *t, *dt);
1147  if (fabs(mc - *t) > fabs(mq - *t)) {
1148  newt = mc;
1149  } else {
1150  newt = mq;
1151  }
1152  } else if (fabs(*dt) < fabs(*dx)) {
1153  /*
1154  Case 3: a lower function value, derivatives of the
1155  same sign, and the magnitude of the derivative decreases.
1156  The cubic minimizer is only used if the cubic tends to
1157  infinity in the direction of the minimizer or if the minimum
1158  of the cubic is beyond t. Otherwise the cubic minimizer is
1159  defined to be either tmin or tmax. The quadratic (secant)
1160  minimizer is also computed and if the minimum is brackt
1161  then the the minimizer closest to x is taken, else the one
1162  farthest away is taken.
1163  */
1164  bound = 1;
1165  CUBIC_MINIMIZER2(mc, *x, *fx, *dx, *t, *ft, *dt, tmin, tmax);
1166  QUARD_MINIMIZER2(mq, *x, *dx, *t, *dt);
1167  if (*brackt) {
1168  if (fabs(*t - mc) < fabs(*t - mq)) {
1169  newt = mc;
1170  } else {
1171  newt = mq;
1172  }
1173  } else {
1174  if (fabs(*t - mc) > fabs(*t - mq)) {
1175  newt = mc;
1176  } else {
1177  newt = mq;
1178  }
1179  }
1180  } else {
1181  /*
1182  Case 4: a lower function value, derivatives of the
1183  same sign, and the magnitude of the derivative does
1184  not decrease. If the minimum is not brackt, the step
1185  is either tmin or tmax, else the cubic minimizer is taken.
1186  */
1187  bound = 0;
1188  if (*brackt) {
1189  CUBIC_MINIMIZER(newt, *t, *ft, *dt, *y, *fy, *dy);
1190  } else if (*x < *t) {
1191  newt = tmax;
1192  } else {
1193  newt = tmin;
1194  }
1195  }
1196 
1197  /*
1198  Update the interval of uncertainty. This update does not
1199  depend on the new step or the case analysis above.
1200 
1201  - Case a: if f(x) < f(t),
1202  x <- x, y <- t.
1203  - Case b: if f(t) <= f(x) && f'(t)*f'(x) > 0,
1204  x <- t, y <- y.
1205  - Case c: if f(t) <= f(x) && f'(t)*f'(x) < 0,
1206  x <- t, y <- x.
1207  */
1208  if (*fx < *ft) {
1209  /* Case a */
1210  *y = *t;
1211  *fy = *ft;
1212  *dy = *dt;
1213  } else {
1214  /* Case c */
1215  if (dsign) {
1216  *y = *x;
1217  *fy = *fx;
1218  *dy = *dx;
1219  }
1220  /* Cases b and c */
1221  *x = *t;
1222  *fx = *ft;
1223  *dx = *dt;
1224  }
1225 
1226  /* Clip the new trial value in [tmin, tmax]. */
1227  if (tmax < newt) newt = tmax;
1228  if (newt < tmin) newt = tmin;
1229 
1230  /*
1231  Redefine the new trial value if it is close to the upper bound
1232  of the interval.
1233  */
1234  if (*brackt && bound) {
1235  mq = *x + 0.66 * (*y - *x);
1236  if (*x < *y) {
1237  if (mq < newt) newt = mq;
1238  } else {
1239  if (newt < mq) newt = mq;
1240  }
1241  }
1242 
1243  /* Return the new trial value. */
1244  *t = newt;
1245  return 0;
1246 }
1247 
1248 
1249 
1250 
1251 
/*
 * owlqn_x1norm: L1 norm of x over the index range [start, n).
 * NOTE(review): the signature line (`static float64_t owlqn_x1norm(`) is
 * missing from this extraction; body left byte-identical.
 */
1253  const float64_t* x,
1254  const int32_t start,
1255  const int32_t n
1256  )
1257 {
1258  int32_t i;
1259  float64_t norm = 0.;
1260 
1261  for (i = start;i < n;++i) {
1262  norm += fabs(x[i]);
1263  }
1264 
1265  return norm;
1266 }
1267 
/*
 * owlqn_pseudo_gradient: compute the OWL-QN pseudo-gradient pg of
 * f(x) + c * |x|_1 from the plain gradient g. Outside [start, end) the
 * gradient passes through unchanged; inside, the L1 subgradient is
 * resolved per the sign of x[i] (and, at x[i] == 0, per which one-sided
 * derivative is descent-feasible).
 * NOTE(review): the signature line (`static void owlqn_pseudo_gradient(`)
 * is missing from this extraction; body left byte-identical.
 */
1269  float64_t* pg,
1270  const float64_t* x,
1271  const float64_t* g,
1272  const int32_t n,
1273  const float64_t c,
1274  const int32_t start,
1275  const int32_t end
1276  )
1277 {
1278  int32_t i;
1279 
1280  /* Gradient components before `start` pass through unregularized. */
1281  for (i = 0;i < start;++i) {
1282  pg[i] = g[i];
1283  }
1284 
1285  /* Compute the pseudo-gradients. */
1286  for (i = start;i < end;++i) {
1287  if (x[i] < 0.) {
1288  /* Differentiable. */
1289  pg[i] = g[i] - c;
1290  } else if (0. < x[i]) {
1291  /* Differentiable. */
1292  pg[i] = g[i] + c;
1293  } else {
1294  if (g[i] < -c) {
1295  /* Take the right partial derivative. */
1296  pg[i] = g[i] + c;
1297  } else if (c < g[i]) {
1298  /* Take the left partial derivative. */
1299  pg[i] = g[i] - c;
1300  } else {
1301  pg[i] = 0.;
1302  }
1303  }
1304  }
1305 
/* Gradient components at/after `end` also pass through unregularized. */
1306  for (i = end;i < n;++i) {
1307  pg[i] = g[i];
1308  }
1309 }
1310 
1311 static void owlqn_project(
1312  float64_t* d,
1313  const float64_t* sign,
1314  const int32_t start,
1315  const int32_t end
1316  )
1317 {
1318  int32_t i;
1319 
1320  for (i = start;i < end;++i) {
1321  if (d[i] * sign[i] <= 0) {
1322  d[i] = 0;
1323  }
1324  }
1325 }
1326 
1327 }

SHOGUN Machine Learning Toolbox - Documentation