changelog shortlog tags changeset files revisions annotate raw

scripts/optimization/fminunc.m

changeset 10289: 4b124317dc38
parent:9f25290a35e8
author: John W. Eaton <jwe@octave.org>
date: Tue Feb 09 20:58:55 2010 -0500 (55 minutes ago)
permissions: -rw-r--r--
description: base_properties::set_children: account for hidden children
1## Copyright (C) 2008, 2009 VZLU Prague, a.s.
2##
3## This file is part of Octave.
4##
5## Octave is free software; you can redistribute it and/or modify it
6## under the terms of the GNU General Public License as published by
7## the Free Software Foundation; either version 3 of the License, or (at
8## your option) any later version.
9##
10## Octave is distributed in the hope that it will be useful, but
11## WITHOUT ANY WARRANTY; without even the implied warranty of
12## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13## General Public License for more details.
14##
15## You should have received a copy of the GNU General Public License
16## along with Octave; see the file COPYING. If not, see
17## <http://www.gnu.org/licenses/>.
18##
19## Author: Jaroslav Hajek <highegg@gmail.com>
20
21## -*- texinfo -*-
22## @deftypefn{Function File} {} fminunc (@var{fcn}, @var{x0}, @var{options})
23## @deftypefnx{Function File} {[@var{x}, @var{fvec}, @var{info}, @var{output}, @var{grad}, @var{hess}]} = fminunc (@var{fcn}, @dots{})
24## Solve a unconstrained optimization problem defined by the function @var{fcn}.
25## @var{fcn} should accepts a vector (array) defining the unknown variables,
26## and return the objective function value, optionally with gradient.
27## In other words, this function attempts to determine a vector @var{x} such
28## that @code{@var{fcn} (@var{x})} is a local minimum.
29## @var{x0} determines a starting guess. The shape of @var{x0} is preserved
30## in all calls to @var{fcn}, but otherwise it is treated as a column vector.
31## @var{options} is a structure specifying additional options.
32## Currently, @code{fminunc} recognizes these options:
33## @code{"FunValCheck"}, @code{"OutputFcn"}, @code{"TolX"},
34## @code{"TolFun"}, @code{"MaxIter"}, @code{"MaxFunEvals"},
35## @code{"GradObj"}, @code{"FinDiffType"},
36## @code{"TypicalX"}, @code{"AutoScaling"}.
37##
38## If @code{"GradObj"} is @code{"on"}, it specifies that @var{fcn},
39## called with 2 output arguments, also returns the Jacobian matrix
40## of right-hand sides at the requested point. @code{"TolX"} specifies
41## the termination tolerance in the unknown variables, while
42## @code{"TolFun"} is a tolerance for equations. Default is @code{1e-7}
43## for both @code{"TolX"} and @code{"TolFun"}.
44##
45## For description of the other options, see @code{optimset}.
46##
47## On return, @var{fval} contains the value of the function @var{fcn}
48## evaluated at @var{x}, and @var{info} may be one of the following values:
49##
50## @table @asis
51## @item 1
52## Converged to a solution point. Relative gradient error is less than specified
53## by TolFun.
54## @item 2
55## Last relative step size was less that TolX.
56## @item 3
57## Last relative decrease in func value was less than TolF.
58## @item 0
59## Iteration limit exceeded.
60## @item -3
61## The trust region radius became excessively small.
62## @end table
63##
64## Optionally, fminunc can also yield a structure with convergence statistics
65## (@var{output}), the output gradient (@var{grad}) and approximate hessian
66## (@var{hess}).
67##
68## Note: If you only have a single nonlinear equation of one variable, using
69## @code{fminbnd} is usually a much better idea.
70## @seealso{fminbnd, optimset}
71## @end deftypefn
72
73## PKG_ADD: __all_opts__ ("fminunc");
74
75function [x, fval, info, output, grad, hess] = fminunc (fcn, x0, options = struct ())
76
77 ## Get default options if requested.
78 if (nargin == 1 && ischar (fcn) && strcmp (fcn, 'defaults'))
79 x = optimset ("MaxIter", 400, "MaxFunEvals", Inf, \
80 "GradObj", "off", "TolX", 1e-7, "TolFun", 1e-7,
81 "OutputFcn", [], "FunValCheck", "off",
82 "FinDiffType", "central",
83 "TypicalX", [], "AutoScaling", "off");
84 return;
85 endif
86
87 if (nargin < 2 || nargin > 3 || ! ismatrix (x0))
88 print_usage ();
89 endif
90
91 if (ischar (fcn))
92 fcn = str2func (fcn, "global");
93 endif
94
95 xsiz = size (x0);
96 n = numel (x0);
97
98 has_grad = strcmpi (optimget (options, "GradObj", "off"), "on");
99 cdif = strcmpi (optimget (options, "FinDiffType", "central"), "central");
100 maxiter = optimget (options, "MaxIter", 400);
101 maxfev = optimget (options, "MaxFunEvals", Inf);
102 outfcn = optimget (options, "OutputFcn");
103
104 ## Get scaling matrix using the TypicalX option. If set to "auto", the
105 ## scaling matrix is estimated using the jacobian.
106 typicalx = optimget (options, "TypicalX");
107 if (isempty (typicalx))
108 typicalx = ones (n, 1);
109 endif
110 autoscale = strcmpi (optimget (options, "AutoScaling", "off"), "on");
111 if (! autoscale)
112 dg = 1 ./ typicalx;
113 endif
114
115 funvalchk = strcmpi (optimget (options, "FunValCheck", "off"), "on");
116
117 if (funvalchk)
118 ## Replace fcn with a guarded version.
119 fcn = @(x) guarded_eval (fcn, x);
120 endif
121
122 ## These defaults are rather stringent. I think that normally, user
123 ## prefers accuracy to performance.
124
125 macheps = eps (class (x0));
126
127 tolx = optimget (options, "TolX", 1e-7);
128 tolf = optimget (options, "TolFun", 1e-7);
129
130 factor = 0.1;
131 ## FIXME: TypicalX corresponds to user scaling (???)
132 autodg = true;
133
134 niter = 1;
135 nfev = 0;
136
137 x = x0(:);
138 info = 0;
139
140 ## Initial evaluation.
141 fval = fcn (reshape (x, xsiz));
142 n = length (x);
143
144 if (! isempty (outfcn))
145 optimvalues.iter = niter;
146 optimvalues.funccount = nfev;
147 optimvalues.fval = fval;
148 optimvalues.searchdirection = zeros (n, 1);
149 state = 'init';
150 stop = outfcn (x, optimvalues, state);
151 if (stop)
152 info = -1;
153 break;
154 endif
155 endif
156
157 nsuciter = 0;
158 lastratio = 0;
159
160 grad = [];
161
162 ## Outer loop.
163 while (niter < maxiter && nfev < maxfev && ! info)
164
165 grad0 = grad;
166
167 ## Calculate function value and gradient (possibly via FD).
168 if (has_grad)
169 [fval, grad] = fcn (reshape (x, xsiz));
170 grad = grad(:);
171 nfev ++;
172 else
173 grad = __fdjac__ (fcn, reshape (x, xsiz), fval, typicalx, cdif)(:);
174 nfev += (1 + cdif) * length (x);
175 endif
176
177 if (niter == 1)
178 ## Initialize by identity matrix.
179 hesr = eye (n);
180 else
181 ## Use the damped BFGS formula.
182 y = grad - grad0;
183 sBs = sumsq (w);
184 Bs = hesr'*w;
185 sy = y'*s;
186 theta = 0.8 / max (1 - sy / sBs, 0.8);
187 r = theta * y + (1-theta) * Bs;
188 hesr = cholupdate (hesr, r / sqrt (s'*r), "+");
189 [hesr, info] = cholupdate (hesr, Bs / sqrt (sBs), "-");
190 if (info)
191 hesr = eye (n);
192 endif
193 endif
194
195 if (autoscale)
196 ## Second derivatives approximate the hessian.
197 d2f = norm (hesr, 'columns').';
198 if (niter == 1)
199 dg = d2f;
200 else
201 ## FIXME: maybe fixed lower and upper bounds?
202 dg = max (0.1*dg, d2f);
203 endif
204 endif
205
206 if (niter == 1)
207 xn = norm (dg .* x);
208 ## FIXME: something better?
209 delta = factor * max (xn, 1);
210 endif
211
212 ## FIXME -- why tolf*n*xn? If abs (e) ~ abs(x) * eps is a vector
213 ## of perturbations of x, then norm (hesr*e) <= eps*xn, i.e. by
214 ## tolf ~ eps we demand as much accuracy as we can expect.
215 if (norm (grad) <= tolf*n*xn)
216 info = 1;
217 break;
218 endif
219
220 suc = false;
221 decfac = 0.5;
222
223 ## Inner loop.
224 while (! suc && niter <= maxiter && nfev < maxfev && ! info)
225
226 s = - __doglegm__ (hesr, grad, dg, delta);
227
228 sn = norm (dg .* s);
229 if (niter == 1)
230 delta = min (delta, sn);
231 endif
232
233 fval1 = fcn (reshape (x + s, xsiz)) (:);
234 nfev ++;
235
236 if (fval1 < fval)
237 ## Scaled actual reduction.
238 actred = (fval - fval1) / (abs (fval1) + abs (fval));
239 else
240 actred = -1;
241 endif
242
243 w = hesr*s;
244 ## Scaled predicted reduction, and ratio.
245 t = 1/2 * sumsq (w) + grad'*s;
246 if (t < 0)
247 prered = -t/(abs (fval) + abs (fval + t));
248 ratio = actred / prered;
249 else
250 prered = 0;
251 ratio = 0;
252 endif
253
254 ## Update delta.
255 if (ratio < min(max(0.1, 0.8*lastratio), 0.9))
256 delta *= decfac;
257 decfac ^= 1.4142;
258 if (delta <= 1e1*macheps*xn)
259 ## Trust region became uselessly small.
260 info = -3;
261 break;
262 endif
263 else
264 lastratio = ratio;
265 decfac = 0.5;
266 if (abs (1-ratio) <= 0.1)
267 delta = 1.4142*sn;
268 elseif (ratio >= 0.5)
269 delta = max (delta, 1.4142*sn);
270 endif
271 endif
272
273 if (ratio >= 1e-4)
274 ## Successful iteration.
275 x += s;
276 xn = norm (dg .* x);
277 fval = fval1;
278 nsuciter ++;
279 suc = true;
280 endif
281
282 niter ++;
283
284 ## FIXME: should outputfcn be only called after a successful iteration?
285 if (! isempty (outfcn))
286 optimvalues.iter = niter;
287 optimvalues.funccount = nfev;
288 optimvalues.fval = fval;
289 optimvalues.searchdirection = s;
290 state = 'iter';
291 stop = outfcn (x, optimvalues, state);
292 if (stop)
293 info = -1;
294 break;
295 endif
296 endif
297
298 ## Tests for termination conditions. A mysterious place, anything
299 ## can happen if you change something here...
300
301 ## The rule of thumb (which I'm not sure M*b is quite following)
302 ## is that for a tolerance that depends on scaling, only 0 makes
303 ## sense as a default value. But 0 usually means uselessly long
304 ## iterations, so we need scaling-independent tolerances wherever
305 ## possible.
306
307 ## The following tests done only after successful step.
308 if (ratio >= 1e-4)
309 ## This one is classic. Note that we use scaled variables again,
310 ## but compare to scaled step, so nothing bad.
311 if (sn <= tolx*xn)
312 info = 2;
313 ## Again a classic one.
314 elseif (actred < tolf)
315 info = 3;
316 endif
317 endif
318
319 endwhile
320 endwhile
321
322 ## Restore original shapes.
323 x = reshape (x, xsiz);
324
325 output.iterations = niter;
326 output.successful = nsuciter;
327 output.funcCount = nfev;
328
329 if (nargout > 5)
330 hess = hesr'*hesr;
331 endif
332
333endfunction
334
335## An assistant function that evaluates a function handle and checks for
336## bad results.
337function [fx, gx] = guarded_eval (fun, x)
338 if (nargout > 1)
339 [fx, gx] = fun (x);
340 else
341 fx = fun (x);
342 gx = [];
343 endif
344
345 if (! (isreal (fx) && isreal (jx)))
346 error ("fminunc:notreal", "fminunc: non-real value encountered");
347 elseif (complexeqn && ! (isnumeric (fx) && isnumeric(jx)))
348 error ("fminunc:notnum", "fminunc: non-numeric value encountered");
349 elseif (any (isnan (fx(:))))
350 error ("fminunc:isnan", "fminunc: NaN value encountered");
351 endif
352endfunction
353
354%!function f = rosenb (x)
355%! n = length (x);
356%! f = sumsq (1 - x(1:n-1)) + 100 * sumsq (x(2:n) - x(1:n-1).^2);
357%!test
358%! [x, fval, info, out] = fminunc (@rosenb, [5, -5]);
359%! tol = 2e-5;
360%! assert (info > 0);
361%! assert (x, ones (1, 2), tol);
362%! assert (fval, 0, tol);
363%!test
364%! [x, fval, info, out] = fminunc (@rosenb, zeros (1, 4));
365%! tol = 2e-5;
366%! assert (info > 0);
367%! assert (x, ones (1, 4), tol);
368%! assert (fval, 0, tol);
369
370## Solve the double dogleg trust-region minimization problem:
371## Minimize 1/2*norm(r*x)^2 subject to the constraint norm(d.*x) <= delta,
372## x being a convex combination of the gauss-newton and scaled gradient.
373
374## TODO: error checks
375## TODO: handle singularity, or leave it up to mldivide?
376
377function x = __doglegm__ (r, g, d, delta)
378 ## Get Gauss-Newton direction.
379 b = r' \ g;
380 x = r \ b;
381 xn = norm (d .* x);
382 if (xn > delta)
383 ## GN is too big, get scaled gradient.
384 s = g ./ d;
385 sn = norm (s);
386 if (sn > 0)
387 ## Normalize and rescale.
388 s = (s / sn) ./ d;
389 ## Get the line minimizer in s direction.
390 tn = norm (r*s);
391 snm = (sn / tn) / tn;
392 if (snm < delta)
393 ## Get the dogleg path minimizer.
394 bn = norm (b);
395 dxn = delta/xn; snmd = snm/delta;
396 t = (bn/sn) * (bn/xn) * snmd;
397 t -= dxn * snmd^2 - sqrt ((t-dxn)^2 + (1-dxn^2)*(1-snmd^2));
398 alpha = dxn*(1-snmd^2) / t;
399 else
400 alpha = 0;
401 endif
402 else
403 alpha = delta / xn;
404 snm = 0;
405 endif
406 ## Form the appropriate convex combination.
407 x = alpha * x + ((1-alpha) * min (snm, delta)) * s;
408 endif
409endfunction