SHOGUN  3.2.1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
lbfgs.cpp
Go to the documentation of this file.
1 /*
2  * Limited memory BFGS (L-BFGS).
3  *
4  * Copyright (c) 1990, Jorge Nocedal
5  * Copyright (c) 2007-2010 Naoaki Okazaki
6  * All rights reserved.
7  *
8  * Permission is hereby granted, free of charge, to any person obtaining a copy
9  * of this software and associated documentation files (the "Software"), to deal
10  * in the Software without restriction, including without limitation the rights
11  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12  * copies of the Software, and to permit persons to whom the Software is
13  * furnished to do so, subject to the following conditions:
14  *
15  * The above copyright notice and this permission notice shall be included in
16  * all copies or substantial portions of the Software.
17  *
18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24  * THE SOFTWARE.
25  */
26 
27 /* $Id$ */
28 
29 /*
30 This library is a C port of the FORTRAN implementation of Limited-memory
31 Broyden-Fletcher-Goldfarb-Shanno (L-BFGS) method written by Jorge Nocedal.
32 The original FORTRAN source code is available at:
33 http://www.ece.northwestern.edu/~nocedal/lbfgs.html
34 
35 The L-BFGS algorithm is described in:
36  - Jorge Nocedal.
37  Updating Quasi-Newton Matrices with Limited Storage.
38  <i>Mathematics of Computation</i>, Vol. 35, No. 151, pp. 773--782, 1980.
39  - Dong C. Liu and Jorge Nocedal.
40  On the limited memory BFGS method for large scale optimization.
41  <i>Mathematical Programming</i> B, Vol. 45, No. 3, pp. 503-528, 1989.
42 
43 The line search algorithms used in this implementation are described in:
44  - John E. Dennis and Robert B. Schnabel.
45  <i>Numerical Methods for Unconstrained Optimization and Nonlinear
46  Equations</i>, Englewood Cliffs, 1983.
47  - Jorge J. More and David J. Thuente.
48  Line search algorithm with guaranteed sufficient decrease.
49  <i>ACM Transactions on Mathematical Software (TOMS)</i>, Vol. 20, No. 3,
50  pp. 286-307, 1994.
51 
52 This library also implements Orthant-Wise Limited-memory Quasi-Newton (OWL-QN)
53 method presented in:
54  - Galen Andrew and Jianfeng Gao.
55  Scalable training of L1-regularized log-linear models.
56  In <i>Proceedings of the 24th International Conference on Machine
57  Learning (ICML 2007)</i>, pp. 33-40, 2007.
58 
59 I would like to thank the original author, Jorge Nocedal, who has been
60 distributing the efficient and explanatory implementation in an open source
61 licence.
62 */
63 
64 #include <algorithm>
65 #include <cstdio>
66 #include <cmath>
67 #include <string.h>
68 
70 #include <shogun/lib/SGVector.h>
71 #include <shogun/lib/common.h>
72 #include <shogun/lib/memory.h>
74 
75 namespace shogun
76 {
77 
/* Minimum / maximum helper macros. Arguments are fully parenthesized but may
 * be evaluated more than once -- do not pass expressions with side effects. */
#define min2(a, b) ((a) <= (b) ? (a) : (b))
#define max2(a, b) ((a) >= (b) ? (a) : (b))
/* Fixed: removed the stray trailing semicolon so max3() can be used inside
 * expressions (the old form only worked as a standalone statement). */
#define max3(a, b, c) max2(max2((a), (b)), (c))
81 
83  int32_t n;
84  void *instance;
88 };
90 
93  float64_t *s; /* [n] */
94  float64_t *y; /* [n] */
95  float64_t ys; /* vecdot(y, s) */
96 };
98 
/* Library-default optimization parameters, copied into the caller's struct by
 * lbfgs_parameter_init() and used when lbfgs() receives a NULL parameter set.
 * First row: m = 6, epsilon = 1e-5, past = 0, delta = 0.0.
 * NOTE(review): the doc extraction dropped the initializer line numbered 101
 * (in upstream liblbfgs this row carries the max_iterations and linesearch
 * defaults) -- restore it from the original source before compiling. */
99 static const lbfgs_parameter_t _defparam = {
100  6, 1e-5, 0, 0.0,
102  1e-20, 1e20, 1e-4, 0.9, 0.9, 1.0e-16,
103  0.0, 0, 1,
104 };
105 
106 /* Forward function declarations. */
107 
108 typedef int32_t (*line_search_proc)(
109  int32_t n,
110  float64_t *x,
111  float64_t *f,
112  float64_t *g,
113  float64_t *s,
114  float64_t *stp,
115  const float64_t* xp,
116  const float64_t* gp,
117  float64_t *wa,
118  callback_data_t *cd,
119  const lbfgs_parameter_t *param
120  );
121 
122 static int32_t line_search_backtracking(
123  int32_t n,
124  float64_t *x,
125  float64_t *f,
126  float64_t *g,
127  float64_t *s,
128  float64_t *stp,
129  const float64_t* xp,
130  const float64_t* gp,
131  float64_t *wa,
132  callback_data_t *cd,
133  const lbfgs_parameter_t *param
134  );
135 
136 static int32_t line_search_backtracking_owlqn(
137  int32_t n,
138  float64_t *x,
139  float64_t *f,
140  float64_t *g,
141  float64_t *s,
142  float64_t *stp,
143  const float64_t* xp,
144  const float64_t* gp,
145  float64_t *wp,
146  callback_data_t *cd,
147  const lbfgs_parameter_t *param
148  );
149 
150 static int32_t line_search_morethuente(
151  int32_t n,
152  float64_t *x,
153  float64_t *f,
154  float64_t *g,
155  float64_t *s,
156  float64_t *stp,
157  const float64_t* xp,
158  const float64_t* gp,
159  float64_t *wa,
160  callback_data_t *cd,
161  const lbfgs_parameter_t *param
162  );
163 
164 static int32_t update_trial_interval(
165  float64_t *x,
166  float64_t *fx,
167  float64_t *dx,
168  float64_t *y,
169  float64_t *fy,
170  float64_t *dy,
171  float64_t *t,
172  float64_t *ft,
173  float64_t *dt,
174  const float64_t tmin,
175  const float64_t tmax,
176  int32_t *brackt
177  );
178 
179 static float64_t owlqn_x1norm(
180  const float64_t* x,
181  const int32_t start,
182  const int32_t n
183  );
184 
185 static void owlqn_pseudo_gradient(
186  float64_t* pg,
187  const float64_t* x,
188  const float64_t* g,
189  const int32_t n,
190  const float64_t c,
191  const int32_t start,
192  const int32_t end
193  );
194 
195 static void owlqn_project(
196  float64_t* d,
197  const float64_t* sign,
198  const int32_t start,
199  const int32_t end
200  );
201 
202 
204 {
205  memcpy(param, &_defparam, sizeof(*param));
206 }
207 
208 int32_t lbfgs(
209  int32_t n,
210  float64_t *x,
211  float64_t *ptr_fx,
212  lbfgs_evaluate_t proc_evaluate,
213  lbfgs_progress_t proc_progress,
214  void *instance,
215  lbfgs_parameter_t *_param,
216  lbfgs_adjust_step_t proc_adjust_step
217  )
218 {
219  int32_t ret;
220  int32_t i, j, k, ls, end, bound;
221  float64_t step;
222 
223  /* Constant parameters and their default values. */
224  lbfgs_parameter_t param = (_param != NULL) ? (*_param) : _defparam;
225  const int32_t m = param.m;
226 
227  float64_t *xp = NULL;
228  float64_t *g = NULL, *gp = NULL, *pg = NULL;
229  float64_t *d = NULL, *w = NULL, *pf = NULL;
230  iteration_data_t *lm = NULL, *it = NULL;
231  float64_t ys, yy;
232  float64_t xnorm, gnorm, beta;
233  float64_t fx = 0.;
234  float64_t rate = 0.;
236 
237  /* Construct a callback data. */
238  callback_data_t cd;
239  cd.n = n;
240  cd.instance = instance;
241  cd.proc_evaluate = proc_evaluate;
242  cd.proc_progress = proc_progress;
243  cd.proc_adjust_step=proc_adjust_step;
244 
245  /* Check the input parameters for errors. */
246  if (n <= 0) {
247  return LBFGSERR_INVALID_N;
248  }
249  if (param.epsilon < 0.) {
251  }
252  if (param.past < 0) {
254  }
255  if (param.delta < 0.) {
256  return LBFGSERR_INVALID_DELTA;
257  }
258  if (param.min_step < 0.) {
260  }
261  if (param.max_step < param.min_step) {
263  }
264  if (param.ftol < 0.) {
265  return LBFGSERR_INVALID_FTOL;
266  }
269  if (param.wolfe <= param.ftol || 1. <= param.wolfe) {
270  return LBFGSERR_INVALID_WOLFE;
271  }
272  }
273  if (param.gtol < 0.) {
274  return LBFGSERR_INVALID_GTOL;
275  }
276  if (param.xtol < 0.) {
277  return LBFGSERR_INVALID_XTOL;
278  }
279  if (param.max_linesearch <= 0) {
281  }
282  if (param.orthantwise_c < 0.) {
284  }
285  if (param.orthantwise_start < 0 || n < param.orthantwise_start) {
287  }
288  if (param.orthantwise_end < 0) {
289  param.orthantwise_end = n;
290  }
291  if (n < param.orthantwise_end) {
293  }
294  if (param.orthantwise_c != 0.) {
295  switch (param.linesearch) {
297  linesearch = line_search_backtracking_owlqn;
298  break;
299  default:
300  /* Only the backtracking method is available. */
302  }
303  } else {
304  switch (param.linesearch) {
306  linesearch = line_search_morethuente;
307  break;
311  linesearch = line_search_backtracking;
312  break;
313  default:
315  }
316  }
317 
318  /* Allocate working space. */
319  xp = SG_CALLOC(float64_t, n);
320  g = SG_CALLOC(float64_t, n);
321  gp = SG_CALLOC(float64_t, n);
322  d = SG_CALLOC(float64_t, n);
323  w = SG_CALLOC(float64_t, n);
324  if (xp == NULL || g == NULL || gp == NULL || d == NULL || w == NULL) {
325  ret = LBFGSERR_OUTOFMEMORY;
326  goto lbfgs_exit;
327  }
328 
329  if (param.orthantwise_c != 0.) {
330  /* Allocate working space for OW-LQN. */
331  pg = SG_CALLOC(float64_t, n);
332  if (pg == NULL) {
333  ret = LBFGSERR_OUTOFMEMORY;
334  goto lbfgs_exit;
335  }
336  }
337 
338  /* Allocate limited memory storage. */
339  lm = SG_CALLOC(iteration_data_t, m);
340  if (lm == NULL) {
341  ret = LBFGSERR_OUTOFMEMORY;
342  goto lbfgs_exit;
343  }
344 
345  /* Initialize the limited memory. */
346  for (i = 0;i < m;++i) {
347  it = &lm[i];
348  it->alpha = 0;
349  it->ys = 0;
350  it->s = SG_CALLOC(float64_t, n);
351  it->y = SG_CALLOC(float64_t, n);
352  if (it->s == NULL || it->y == NULL) {
353  ret = LBFGSERR_OUTOFMEMORY;
354  goto lbfgs_exit;
355  }
356  }
357 
358  /* Allocate an array for storing previous values of the objective function. */
359  if (0 < param.past) {
360  pf = SG_CALLOC(float64_t, param.past);
361  }
362 
363  /* Evaluate the function value and its gradient. */
364  fx = cd.proc_evaluate(cd.instance, x, g, cd.n, 0);
365  if (0. != param.orthantwise_c) {
366  /* Compute the L1 norm of the variable and add it to the object value. */
367  xnorm = owlqn_x1norm(x, param.orthantwise_start, param.orthantwise_end);
368  fx += xnorm * param.orthantwise_c;
370  pg, x, g, n,
372  );
373  }
374 
375  /* Store the initial value of the objective function. */
376  if (pf != NULL) {
377  pf[0] = fx;
378  }
379 
380  /*
381  Compute the direction;
382  we assume the initial hessian matrix H_0 as the identity matrix.
383  */
384  if (param.orthantwise_c == 0.) {
385  std::copy(g,g+n,d);
387  } else {
388  std::copy(pg,pg+n,d);
390  }
391 
392  /*
393  Make sure that the initial variables are not a minimizer.
394  */
395  xnorm = SGVector<float64_t>::twonorm(x, n);
396  if (param.orthantwise_c == 0.) {
397  gnorm = SGVector<float64_t>::twonorm(g, n);
398  } else {
399  gnorm = SGVector<float64_t>::twonorm(pg, n);
400  }
401  if (xnorm < 1.0) xnorm = 1.0;
402  if (gnorm / xnorm <= param.epsilon) {
404  goto lbfgs_exit;
405  }
406 
407  /* Compute the initial step:
408  step = 1.0 / sqrt(vecdot(d, d, n))
409  */
410  step = 1.0 / SGVector<float64_t>::twonorm(d, n);
411 
412  k = 1;
413  end = 0;
414  for (;;) {
415  /* Store the current position and gradient vectors. */
416  std::copy(x,x+n,xp);
417  std::copy(g,g+n,gp);
418 
419  /* Search for an optimal step. */
420  if (param.orthantwise_c == 0.) {
421  ls = linesearch(n, x, &fx, g, d, &step, xp, gp, w, &cd, &param);
422  } else {
423  ls = linesearch(n, x, &fx, g, d, &step, xp, pg, w, &cd, &param);
425  pg, x, g, n,
427  );
428  }
429  if (ls < 0) {
430  /* Revert to the previous point. */
431  std::copy(xp,xp+n,x);
432  std::copy(gp,gp+n,g);
433  ret = ls;
434 
435  /* Roll back */
436  if (ls==LBFGSERR_INVALID_VALUE)
437  fx = cd.proc_evaluate(cd.instance, x, g, cd.n, step);
438 
439  goto lbfgs_exit;
440  }
441 
442  /* Compute x and g norms. */
443  xnorm = SGVector<float64_t>::twonorm(x, n);
444  if (param.orthantwise_c == 0.) {
445  gnorm = SGVector<float64_t>::twonorm(g, n);
446  } else {
447  gnorm = SGVector<float64_t>::twonorm(pg, n);
448  }
449 
450  /* Report the progress. */
451  if (cd.proc_progress) {
452  if ((ret = cd.proc_progress(cd.instance, x, g, fx, xnorm, gnorm, step, cd.n, k, ls))) {
453  goto lbfgs_exit;
454  }
455  }
456 
457  /*
458  Convergence test.
459  The criterion is given by the following formula:
460  |g(x)| / \max2(1, |x|) < \epsilon
461  */
462  if (xnorm < 1.0) xnorm = 1.0;
463  if (gnorm / xnorm <= param.epsilon) {
464  /* Convergence. */
465  ret = LBFGS_SUCCESS;
466  break;
467  }
468 
469  /*
470  Test for stopping criterion.
471  The criterion is given by the following formula:
472  (f(past_x) - f(x)) / f(x) < \delta
473  */
474  if (pf != NULL) {
475  /* We don't test the stopping criterion while k < past. */
476  if (param.past <= k) {
477  /* Compute the relative improvement from the past. */
478  rate = (pf[k % param.past] - fx) / fx;
479 
480  /* The stopping criterion. */
481  if (rate < param.delta) {
482  ret = LBFGS_STOP;
483  break;
484  }
485  }
486 
487  /* Store the current value of the objective function. */
488  pf[k % param.past] = fx;
489  }
490 
491  if (param.max_iterations != 0 && param.max_iterations < k+1) {
492  /* Maximum number of iterations. */
494  break;
495  }
496 
497  /*
498  Update vectors s and y:
499  s_{k+1} = x_{k+1} - x_{k} = \step * d_{k}.
500  y_{k+1} = g_{k+1} - g_{k}.
501  */
502  it = &lm[end];
503  SGVector<float64_t>::add(it->s, 1, x, -1, xp, n);
504  SGVector<float64_t>::add(it->y, 1, g, -1, gp, n);
505 
506  /*
507  Compute scalars ys and yy:
508  ys = y^t \cdot s = 1 / \rho.
509  yy = y^t \cdot y.
510  Notice that yy is used for scaling the hessian matrix H_0 (Cholesky factor).
511  */
512  ys = SGVector<float64_t>::dot(it->y, it->s, n);
513  yy = SGVector<float64_t>::dot(it->y, it->y, n);
514  it->ys = ys;
515 
516  /*
517  Recursive formula to compute dir = -(H \cdot g).
518  This is described in page 779 of:
519  Jorge Nocedal.
520  Updating Quasi-Newton Matrices with Limited Storage.
521  Mathematics of Computation, Vol. 35, No. 151,
522  pp. 773--782, 1980.
523  */
524  bound = (m <= k) ? m : k;
525  ++k;
526  end = (end + 1) % m;
527 
528  /* Compute the steepest direction. */
529  if (param.orthantwise_c == 0.) {
530  /* Compute the negative of gradients. */
531  std::copy(g, g+n, d);
533  } else {
534  std::copy(pg, pg+n, d);
536  }
537 
538  j = end;
539  for (i = 0;i < bound;++i) {
540  j = (j + m - 1) % m; /* if (--j == -1) j = m-1; */
541  it = &lm[j];
542  /* \alpha_{j} = \rho_{j} s^{t}_{j} \cdot q_{k+1}. */
543  it->alpha = SGVector<float64_t>::dot(it->s, d, n);
544  it->alpha /= it->ys;
545  /* q_{i} = q_{i+1} - \alpha_{i} y_{i}. */
546  SGVector<float64_t>::add(d, 1, d, -it->alpha, it->y, n);
547  }
548 
549  SGVector<float64_t>::scale_vector(ys / yy, d, n);
550 
551  for (i = 0;i < bound;++i) {
552  it = &lm[j];
553  /* \beta_{j} = \rho_{j} y^t_{j} \cdot \gamma_{i}. */
554  beta = SGVector<float64_t>::dot(it->y, d, n);
555  beta /= it->ys;
556  /* \gamma_{i+1} = \gamma_{i} + (\alpha_{j} - \beta_{j}) s_{j}. */
557  SGVector<float64_t>::add(d, 1, d, it->alpha-beta, it->s, n);
558  j = (j + 1) % m; /* if (++j == m) j = 0; */
559  }
560 
561  /*
562  Constrain the search direction for orthant-wise updates.
563  */
564  if (param.orthantwise_c != 0.) {
565  for (i = param.orthantwise_start;i < param.orthantwise_end;++i) {
566  if (d[i] * pg[i] >= 0) {
567  d[i] = 0;
568  }
569  }
570  }
571 
572  /*
573  Now the search direction d is ready. We try step = 1 first.
574  */
575  step = 1.0;
576  }
577 
578 lbfgs_exit:
579  /* Return the final value of the objective function. */
580  if (ptr_fx != NULL) {
581  *ptr_fx = fx;
582  }
583 
584  SG_FREE(pf);
585 
586  /* Free memory blocks used by this function. */
587  if (lm != NULL) {
588  for (i = 0;i < m;++i) {
589  SG_FREE(lm[i].s);
590  SG_FREE(lm[i].y);
591  }
592  SG_FREE(lm);
593  }
594  SG_FREE(pg);
595  SG_FREE(w);
596  SG_FREE(d);
597  SG_FREE(gp);
598  SG_FREE(g);
599  SG_FREE(xp);
600 
601  return ret;
602 }
603 
604 
605 
607  int32_t n,
608  float64_t *x,
609  float64_t *f,
610  float64_t *g,
611  float64_t *s,
612  float64_t *stp,
613  const float64_t* xp,
614  const float64_t* gp,
615  float64_t *wp,
616  callback_data_t *cd,
617  const lbfgs_parameter_t *param
618  )
619 {
620  int32_t count = 0;
621  float64_t width, dg;
622  float64_t finit, dginit = 0., dgtest;
623  const float64_t dec = 0.5, inc = 2.1;
624 
625  /* Check the input parameters for errors. */
626  if (*stp <= 0.) {
628  }
629 
630  /* Compute the initial gradient in the search direction. */
631  dginit = SGVector<float64_t>::dot(g, s, n);
632 
633  /* Make sure that s points to a descent direction. */
634  if (0 < dginit) {
636  }
637 
638  /* The initial value of the objective function. */
639  finit = *f;
640  dgtest = param->ftol * dginit;
641  const index_t max_iter = 20;
642 
643  for (;;) {
644  std::copy(xp,xp+n,x);
645  if (cd->proc_adjust_step)
646  *stp=cd->proc_adjust_step(cd->instance, x, s, cd->n, *stp);
647 
648  for(index_t j=0; j<n; j++)
649  {
650  if (CMath::is_nan(s[j]) || CMath::is_infinity(s[j]))
651  return LBFGSERR_INVALID_VALUE;
652  }
653 
654  SGVector<float64_t>::add(x, 1, x, *stp, s, n);
655  float64_t decay=0.5;
656  index_t iter=0;
657 
658  while(true)
659  {
660  /* Evaluate the function and gradient values. */
661  *f = cd->proc_evaluate(cd->instance, x, g, cd->n, *stp);
662  ++count;
663  if (CMath::is_nan(*f) || CMath::is_infinity(*f))
664  *stp*=decay;
665  else
666  break;
667  SGVector<float64_t>::add(x, 1, x, -1.0*(*stp), s, n);
668  iter++;
669  if (iter>max_iter)
670  return LBFGSERR_INVALID_VALUE;
671  }
672 
673 
674  if (*f > finit + *stp * dgtest) {
675  width = dec;
676  } else {
677  /* The sufficient decrease condition (Armijo condition). */
679  /* Exit with the Armijo condition. */
680  return count;
681  }
682 
683  /* Check the Wolfe condition. */
684  dg = SGVector<float64_t>::dot(g, s, n);
685  if (dg < param->wolfe * dginit) {
686  width = inc;
687  } else {
689  /* Exit with the regular Wolfe condition. */
690  return count;
691  }
692 
693  /* Check the strong Wolfe condition. */
694  if(dg > -param->wolfe * dginit) {
695  width = dec;
696  } else {
697  /* Exit with the strong Wolfe condition. */
698  return count;
699  }
700  }
701  }
702 
703  if (*stp < param->min_step) {
704  /* The step is the minimum value. */
705  return LBFGSERR_MINIMUMSTEP;
706  }
707  if (*stp > param->max_step) {
708  /* The step is the maximum value. */
709  return LBFGSERR_MAXIMUMSTEP;
710  }
711  if (param->max_linesearch <= count) {
712  /* Maximum number of iteration. */
714  }
715 
716  (*stp) *= width;
717  }
718 }
719 
720 
721 
723  int32_t n,
724  float64_t *x,
725  float64_t *f,
726  float64_t *g,
727  float64_t *s,
728  float64_t *stp,
729  const float64_t* xp,
730  const float64_t* gp,
731  float64_t *wp,
732  callback_data_t *cd,
733  const lbfgs_parameter_t *param
734  )
735 {
736  int32_t i, count = 0;
737  float64_t width = 0.5, norm = 0.;
738  float64_t finit = *f, dgtest;
739 
740  /* Check the input parameters for errors. */
741  if (*stp <= 0.) {
743  }
744 
745  /* Choose the orthant for the new point. */
746  for (i = 0;i < n;++i) {
747  wp[i] = (xp[i] == 0.) ? -gp[i] : xp[i];
748  }
749 
750  for (;;) {
751  /* Update the current point. */
752  std::copy(xp,xp+n,x);
753  if (cd->proc_adjust_step)
754  *stp=cd->proc_adjust_step(cd->instance, x, s, cd->n, *stp);
755  SGVector<float64_t>::add(x, 1, x, *stp, s, n);
756 
757  /* The current point is projected onto the orthant. */
758  owlqn_project(x, wp, param->orthantwise_start, param->orthantwise_end);
759 
760  /* Evaluate the function and gradient values. */
761  *f = cd->proc_evaluate(cd->instance, x, g, cd->n, *stp);
762 
763  /* Compute the L1 norm of the variables and add it to the object value. */
764  norm = owlqn_x1norm(x, param->orthantwise_start, param->orthantwise_end);
765  *f += norm * param->orthantwise_c;
766 
767  ++count;
768 
769  dgtest = 0.;
770  for (i = 0;i < n;++i) {
771  dgtest += (x[i] - xp[i]) * gp[i];
772  }
773 
774  if (*f <= finit + param->ftol * dgtest) {
775  /* The sufficient decrease condition. */
776  return count;
777  }
778 
779  if (*stp < param->min_step) {
780  /* The step is the minimum value. */
781  return LBFGSERR_MINIMUMSTEP;
782  }
783  if (*stp > param->max_step) {
784  /* The step is the maximum value. */
785  return LBFGSERR_MAXIMUMSTEP;
786  }
787  if (param->max_linesearch <= count) {
788  /* Maximum number of iteration. */
790  }
791 
792  (*stp) *= width;
793  }
794 }
795 
796 
797 
798 static int32_t line_search_morethuente(
799  int32_t n,
800  float64_t *x,
801  float64_t *f,
802  float64_t *g,
803  float64_t *s,
804  float64_t *stp,
805  const float64_t* xp,
806  const float64_t* gp,
807  float64_t *wa,
808  callback_data_t *cd,
809  const lbfgs_parameter_t *param
810  )
811 {
812  int32_t count = 0;
813  int32_t brackt, stage1, uinfo = 0;
814  float64_t dg;
815  float64_t stx, fx, dgx;
816  float64_t sty, fy, dgy;
817  float64_t fxm, dgxm, fym, dgym, fm, dgm;
818  float64_t finit, ftest1, dginit, dgtest;
819  float64_t width, prev_width;
820  float64_t stmin, stmax;
821 
822  /* Check the input parameters for errors. */
823  if (*stp <= 0.) {
825  }
826 
827  /* Compute the initial gradient in the search direction. */
828  dginit = SGVector<float64_t>::dot(g, s, n);
829 
830  /* Make sure that s points to a descent direction. */
831  if (0 < dginit) {
833  }
834 
835  /* Initialize local variables. */
836  brackt = 0;
837  stage1 = 1;
838  finit = *f;
839  dgtest = param->ftol * dginit;
840  width = param->max_step - param->min_step;
841  prev_width = 2.0 * width;
842 
843  /*
844  The variables stx, fx, dgx contain the values of the step,
845  function, and directional derivative at the best step.
846  The variables sty, fy, dgy contain the value of the step,
847  function, and derivative at the other endpoint of
848  the interval of uncertainty.
849  The variables stp, f, dg contain the values of the step,
850  function, and derivative at the current step.
851  */
852  stx = sty = 0.;
853  fx = fy = finit;
854  dgx = dgy = dginit;
855 
856  for (;;) {
857  /*
858  Set the minimum and maximum steps to correspond to the
859  present interval of uncertainty.
860  */
861  if (brackt) {
862  stmin = min2(stx, sty);
863  stmax = max2(stx, sty);
864  } else {
865  stmin = stx;
866  stmax = *stp + 4.0 * (*stp - stx);
867  }
868 
869  /* Clip the step in the range of [stpmin, stpmax]. */
870  if (*stp < param->min_step) *stp = param->min_step;
871  if (param->max_step < *stp) *stp = param->max_step;
872 
873  /*
874  If an unusual termination is to occur then let
875  stp be the lowest point obtained so far.
876  */
877  if ((brackt && ((*stp <= stmin || stmax <= *stp) || param->max_linesearch <= count + 1 || uinfo != 0)) || (brackt && (stmax - stmin <= param->xtol * stmax))) {
878  *stp = stx;
879  }
880 
881  /*
882  Compute the current value of x:
883  x <- x + (*stp) * s.
884  */
885  std::copy(xp,xp+n,x);
886  if (cd->proc_adjust_step)
887  *stp=cd->proc_adjust_step(cd->instance, x, s, cd->n, *stp);
888 
889  SGVector<float64_t>::add(x, 1, x, *stp, s, n);
890 
891  /* Evaluate the function and gradient values. */
892  *f = cd->proc_evaluate(cd->instance, x, g, cd->n, *stp);
893 
894  dg = SGVector<float64_t>::dot(g, s, n);
895 
896  ftest1 = finit + *stp * dgtest;
897  ++count;
898 
899  /* Test for errors and convergence. */
900  if (brackt && ((*stp <= stmin || stmax <= *stp) || uinfo != 0)) {
901  /* Rounding errors prevent further progress. */
903  }
904  if (*stp == param->max_step && *f <= ftest1 && dg <= dgtest) {
905  /* The step is the maximum value. */
906  return LBFGSERR_MAXIMUMSTEP;
907  }
908  if (*stp == param->min_step && (ftest1 < *f || dgtest <= dg)) {
909  /* The step is the minimum value. */
910  return LBFGSERR_MINIMUMSTEP;
911  }
912  if (brackt && (stmax - stmin) <= param->xtol * stmax) {
913  /* Relative width of the interval of uncertainty is at most xtol. */
914  return LBFGSERR_WIDTHTOOSMALL;
915  }
916  if (param->max_linesearch <= count) {
917  /* Maximum number of iteration. */
919  }
920  if (*f <= ftest1 && fabs(dg) <= param->gtol * (-dginit)) {
921  /* The sufficient decrease condition and the directional derivative condition hold. */
922  return count;
923  }
924 
925  /*
926  In the first stage we seek a step for which the modified
927  function has a nonpositive value and nonnegative derivative.
928  */
929  if (stage1 && *f <= ftest1 && min2(param->ftol, param->gtol) * dginit <= dg) {
930  stage1 = 0;
931  }
932 
933  /*
934  A modified function is used to predict the step only if
935  we have not obtained a step for which the modified
936  function has a nonpositive function value and nonnegative
937  derivative, and if a lower function value has been
938  obtained but the decrease is not sufficient.
939  */
940  if (stage1 && ftest1 < *f && *f <= fx) {
941  /* Define the modified function and derivative values. */
942  fm = *f - *stp * dgtest;
943  fxm = fx - stx * dgtest;
944  fym = fy - sty * dgtest;
945  dgm = dg - dgtest;
946  dgxm = dgx - dgtest;
947  dgym = dgy - dgtest;
948 
949  /*
950  Call update_trial_interval() to update the interval of
951  uncertainty and to compute the new step.
952  */
953  uinfo = update_trial_interval(
954  &stx, &fxm, &dgxm,
955  &sty, &fym, &dgym,
956  stp, &fm, &dgm,
957  stmin, stmax, &brackt
958  );
959 
960  /* Reset the function and gradient values for f. */
961  fx = fxm + stx * dgtest;
962  fy = fym + sty * dgtest;
963  dgx = dgxm + dgtest;
964  dgy = dgym + dgtest;
965  } else {
966  /*
967  Call update_trial_interval() to update the interval of
968  uncertainty and to compute the new step.
969  */
970  uinfo = update_trial_interval(
971  &stx, &fx, &dgx,
972  &sty, &fy, &dgy,
973  stp, f, &dg,
974  stmin, stmax, &brackt
975  );
976  }
977 
978  /*
979  Force a sufficient decrease in the interval of uncertainty.
980  */
981  if (brackt) {
982  if (0.66 * prev_width <= fabs(sty - stx)) {
983  *stp = stx + 0.5 * (sty - stx);
984  }
985  prev_width = width;
986  width = fabs(sty - stx);
987  }
988  }
989 
990  return LBFGSERR_LOGICERROR;
991 }
992 
993 
994 
/** Declare the scratch variables required by the *_MINIMIZER macros below. */
#define USES_MINIMIZER \
    float64_t a, d, gamma, theta, p, q, r, s;

/**
 * Find a minimizer of an interpolated cubic function.
 *  @param  cm  Receives the minimizer of the interpolated cubic.
 *  @param  u   The value of one point, u.
 *  @param  fu  The value of f(u).
 *  @param  du  The value of f'(u).
 *  @param  v   The value of another point, v.
 *  @param  fv  The value of f(v).
 *  @param  dv  The value of f'(v).
 */
#define CUBIC_MINIMIZER(cm, u, fu, du, v, fv, dv) \
    d = (v) - (u); \
    theta = ((fu) - (fv)) * 3 / d + (du) + (dv); \
    p = fabs(theta); \
    q = fabs(du); \
    r = fabs(dv); \
    s = max3(p, q, r); \
    /* gamma = s*sqrt((theta/s)**2 - (du/s) * (dv/s)) */ \
    a = theta / s; \
    gamma = s * sqrt(a * a - ((du) / s) * ((dv) / s)); \
    if ((v) < (u)) gamma = -gamma; \
    p = gamma - (du) + theta; \
    q = gamma - (du) + gamma + (dv); \
    r = p / q; \
    (cm) = (u) + r * d;

/**
 * Find a minimizer of an interpolated cubic function, falling back to the
 * interval endpoints [xmin, xmax] when the interpolation is untrustworthy.
 *  @param  cm            Receives the minimizer.
 *  @param  u, fu, du     One point: location, f(u), f'(u).
 *  @param  v, fv, dv     Another point: location, f(v), f'(v).
 *  @param  xmin          The minimum value.
 *  @param  xmax          The maximum value.
 */
#define CUBIC_MINIMIZER2(cm, u, fu, du, v, fv, dv, xmin, xmax) \
    d = (v) - (u); \
    theta = ((fu) - (fv)) * 3 / d + (du) + (dv); \
    p = fabs(theta); \
    q = fabs(du); \
    r = fabs(dv); \
    s = max3(p, q, r); \
    /* gamma = s*sqrt((theta/s)**2 - (du/s) * (dv/s)) */ \
    a = theta / s; \
    gamma = s * sqrt(max2(0, a * a - ((du) / s) * ((dv) / s))); \
    if ((u) < (v)) gamma = -gamma; \
    p = gamma - (dv) + theta; \
    q = gamma - (dv) + gamma + (du); \
    r = p / q; \
    if (r < 0. && gamma != 0.) { \
        (cm) = (v) - r * d; \
    } else if (a < 0) { \
        (cm) = (xmax); \
    } else { \
        (cm) = (xmin); \
    }

/**
 * Find a minimizer of an interpolated quadratic through (u, fu) with slope
 * du at u and passing through (v, fv).
 *  @param  qm  Receives the minimizer of the interpolated quadratic.
 */
#define QUARD_MINIMIZER(qm, u, fu, du, v, fv) \
    a = (v) - (u); \
    (qm) = (u) + (du) / (((fu) - (fv)) / a + (du)) / 2 * a;

/**
 * Find a minimizer of an interpolated quadratic (secant) determined by the
 * slopes du at u and dv at v.
 *  @param  qm  Receives the minimizer of the interpolated quadratic.
 */
#define QUARD_MINIMIZER2(qm, u, du, v, dv) \
    a = (u) - (v); \
    (qm) = (v) + (dv) / ((dv) - (du)) * a;

/* Nonzero iff the values pointed to by x and y have opposite signs. */
#define fsigndiff(x, y) (*(x) * (*(y) / fabs(*(y))) < 0.)
1087 
1117 static int32_t update_trial_interval(
1118  float64_t *x,
1119  float64_t *fx,
1120  float64_t *dx,
1121  float64_t *y,
1122  float64_t *fy,
1123  float64_t *dy,
1124  float64_t *t,
1125  float64_t *ft,
1126  float64_t *dt,
1127  const float64_t tmin,
1128  const float64_t tmax,
1129  int32_t *brackt
1130  )
1131 {
1132  int32_t bound;
1133  int32_t dsign = fsigndiff(dt, dx);
1134  float64_t mc; /* minimizer of an interpolated cubic. */
1135  float64_t mq; /* minimizer of an interpolated quadratic. */
1136  float64_t newt; /* new trial value. */
1137  USES_MINIMIZER; /* for CUBIC_MINIMIZER and QUARD_MINIMIZER. */
1138 
1139  /* Check the input parameters for errors. */
1140  if (*brackt) {
1141  if (*t <= min2(*x, *y) || max2(*x, *y) <= *t) {
1142  /* The trival value t is out of the interval. */
1143  return LBFGSERR_OUTOFINTERVAL;
1144  }
1145  if (0. <= *dx * (*t - *x)) {
1146  /* The function must decrease from x. */
1148  }
1149  if (tmax < tmin) {
1150  /* Incorrect tmin and tmax specified. */
1152  }
1153  }
1154 
1155  /*
1156  Trial value selection.
1157  */
1158  if (*fx < *ft) {
1159  /*
1160  Case 1: a higher function value.
1161  The minimum is brackt. If the cubic minimizer is closer
1162  to x than the quadratic one, the cubic one is taken, else
1163  the average of the minimizers is taken.
1164  */
1165  *brackt = 1;
1166  bound = 1;
1167  CUBIC_MINIMIZER(mc, *x, *fx, *dx, *t, *ft, *dt);
1168  QUARD_MINIMIZER(mq, *x, *fx, *dx, *t, *ft);
1169  if (fabs(mc - *x) < fabs(mq - *x)) {
1170  newt = mc;
1171  } else {
1172  newt = mc + 0.5 * (mq - mc);
1173  }
1174  } else if (dsign) {
1175  /*
1176  Case 2: a lower function value and derivatives of
1177  opposite sign. The minimum is brackt. If the cubic
1178  minimizer is closer to x than the quadratic (secant) one,
1179  the cubic one is taken, else the quadratic one is taken.
1180  */
1181  *brackt = 1;
1182  bound = 0;
1183  CUBIC_MINIMIZER(mc, *x, *fx, *dx, *t, *ft, *dt);
1184  QUARD_MINIMIZER2(mq, *x, *dx, *t, *dt);
1185  if (fabs(mc - *t) > fabs(mq - *t)) {
1186  newt = mc;
1187  } else {
1188  newt = mq;
1189  }
1190  } else if (fabs(*dt) < fabs(*dx)) {
1191  /*
1192  Case 3: a lower function value, derivatives of the
1193  same sign, and the magnitude of the derivative decreases.
1194  The cubic minimizer is only used if the cubic tends to
1195  infinity in the direction of the minimizer or if the minimum
1196  of the cubic is beyond t. Otherwise the cubic minimizer is
1197  defined to be either tmin or tmax. The quadratic (secant)
1198  minimizer is also computed and if the minimum is brackt
1199  then the the minimizer closest to x is taken, else the one
1200  farthest away is taken.
1201  */
1202  bound = 1;
1203  CUBIC_MINIMIZER2(mc, *x, *fx, *dx, *t, *ft, *dt, tmin, tmax);
1204  QUARD_MINIMIZER2(mq, *x, *dx, *t, *dt);
1205  if (*brackt) {
1206  if (fabs(*t - mc) < fabs(*t - mq)) {
1207  newt = mc;
1208  } else {
1209  newt = mq;
1210  }
1211  } else {
1212  if (fabs(*t - mc) > fabs(*t - mq)) {
1213  newt = mc;
1214  } else {
1215  newt = mq;
1216  }
1217  }
1218  } else {
1219  /*
1220  Case 4: a lower function value, derivatives of the
1221  same sign, and the magnitude of the derivative does
1222  not decrease. If the minimum is not brackt, the step
1223  is either tmin or tmax, else the cubic minimizer is taken.
1224  */
1225  bound = 0;
1226  if (*brackt) {
1227  CUBIC_MINIMIZER(newt, *t, *ft, *dt, *y, *fy, *dy);
1228  } else if (*x < *t) {
1229  newt = tmax;
1230  } else {
1231  newt = tmin;
1232  }
1233  }
1234 
1235  /*
1236  Update the interval of uncertainty. This update does not
1237  depend on the new step or the case analysis above.
1238 
1239  - Case a: if f(x) < f(t),
1240  x <- x, y <- t.
1241  - Case b: if f(t) <= f(x) && f'(t)*f'(x) > 0,
1242  x <- t, y <- y.
1243  - Case c: if f(t) <= f(x) && f'(t)*f'(x) < 0,
1244  x <- t, y <- x.
1245  */
1246  if (*fx < *ft) {
1247  /* Case a */
1248  *y = *t;
1249  *fy = *ft;
1250  *dy = *dt;
1251  } else {
1252  /* Case c */
1253  if (dsign) {
1254  *y = *x;
1255  *fy = *fx;
1256  *dy = *dx;
1257  }
1258  /* Cases b and c */
1259  *x = *t;
1260  *fx = *ft;
1261  *dx = *dt;
1262  }
1263 
1264  /* Clip the new trial value in [tmin, tmax]. */
1265  if (tmax < newt) newt = tmax;
1266  if (newt < tmin) newt = tmin;
1267 
1268  /*
1269  Redefine the new trial value if it is close to the upper bound
1270  of the interval.
1271  */
1272  if (*brackt && bound) {
1273  mq = *x + 0.66 * (*y - *x);
1274  if (*x < *y) {
1275  if (mq < newt) newt = mq;
1276  } else {
1277  if (newt < mq) newt = mq;
1278  }
1279  }
1280 
1281  /* Return the new trial value. */
1282  *t = newt;
1283  return 0;
1284 }
1285 
1286 
1287 
1288 
1289 
1291  const float64_t* x,
1292  const int32_t start,
1293  const int32_t n
1294  )
1295 {
1296  int32_t i;
1297  float64_t norm = 0.;
1298 
1299  for (i = start;i < n;++i) {
1300  norm += fabs(x[i]);
1301  }
1302 
1303  return norm;
1304 }
1305 
1307  float64_t* pg,
1308  const float64_t* x,
1309  const float64_t* g,
1310  const int32_t n,
1311  const float64_t c,
1312  const int32_t start,
1313  const int32_t end
1314  )
1315 {
1316  int32_t i;
1317 
1318  /* Compute the negative of gradients. */
1319  for (i = 0;i < start;++i) {
1320  pg[i] = g[i];
1321  }
1322 
1323  /* Compute the psuedo-gradients. */
1324  for (i = start;i < end;++i) {
1325  if (x[i] < 0.) {
1326  /* Differentiable. */
1327  pg[i] = g[i] - c;
1328  } else if (0. < x[i]) {
1329  /* Differentiable. */
1330  pg[i] = g[i] + c;
1331  } else {
1332  if (g[i] < -c) {
1333  /* Take the right partial derivative. */
1334  pg[i] = g[i] + c;
1335  } else if (c < g[i]) {
1336  /* Take the left partial derivative. */
1337  pg[i] = g[i] - c;
1338  } else {
1339  pg[i] = 0.;
1340  }
1341  }
1342  }
1343 
1344  for (i = end;i < n;++i) {
1345  pg[i] = g[i];
1346  }
1347 }
1348 
1349 static void owlqn_project(
1350  float64_t* d,
1351  const float64_t* sign,
1352  const int32_t start,
1353  const int32_t end
1354  )
1355 {
1356  int32_t i;
1357 
1358  for (i = start;i < end;++i) {
1359  if (d[i] * sign[i] <= 0) {
1360  d[i] = 0;
1361  }
1362  }
1363 }
1364 
1365 }

SHOGUN Machine Learning Toolbox - Documentation