1## Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008,
4## This file is part of Octave.
6## Octave is free software; you can redistribute it and/or modify it
7## under the terms of the GNU General Public License as published by
8## the Free Software Foundation; either version 3 of the License, or (at
9## your option) any later version.
11## Octave is distributed in the hope that it will be useful, but
12## WITHOUT ANY WARRANTY; without even the implied warranty of
13## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14## General Public License for more details.
16## You should have received a copy of the GNU General Public License
17## along with Octave; see the file COPYING. If not, see
18## <http://www.gnu.org/licenses/>.
21## @deftypefn {Function File} {} speed (@var{f}, @var{init}, @var{max_n}, @var{f2}, @var{tol})
22## @deftypefnx {Function File} {[@var{order}, @var{n}, @var{T_f}, @var{T_f2}] =} speed (@dots{})
24## Determine the execution time of an expression for various @var{n}.
25## The @var{n} are log-spaced from 1 to @var{max_n}. For each @var{n},
26## an initialization expression is computed to create whatever data
27## are needed for the test. If a second expression is given, the
28## execution times of the two expressions will be compared. Called
29## without output arguments the results are presented graphically.
33## The expression to evaluate.
36## The maximum test length to run. Default value is 100. Alternatively,
37## use @code{[min_n,max_n]} or for complete control, @code{[n1,n2,@dots{},nk]}.
40## Initialization expression for function argument values. Use @var{k}
41## for the test number and @var{n} for the size of the test. This should
42## compute values for all variables listed in args. Note that init will
43## be evaluated first for @math{k = 0}, so things which are constant throughout
44## the test can be computed then. The default value is @code{@var{x} =
45## randn (@var{n}, 1);}.
48## An alternative expression to evaluate, so the speed of the two
49## can be compared. Default is @code{[]}.
52## If @var{tol} is @code{Inf}, then no comparison will be made between the
53## results of expression @var{f} and expression @var{f2}. Otherwise,
54## expression @var{f} should produce a value @var{v} and expression @var{f2}
55## should produce a value @var{v2}, and these shall be compared using
56## @code{assert(@var{v},@var{v2},@var{tol})}. If @var{tol} is positive,
57## the tolerance is assumed to be absolute. If @var{tol} is negative,
58## the tolerance is assumed to be relative. The default is @code{eps}.
61## The time complexity of the expression @code{O(a n^p)}. This
62## is a structure with fields @code{a} and @code{p}.
65## The values @var{n} for which the expression was calculated and
66## the execution time was greater than zero.
69## The nonzero execution times recorded for the expression @var{f} in seconds.
72## The nonzero execution times recorded for the expression @var{f2} in seconds.
73## If it is needed, the mean time ratio is just @code{mean(T_f./T_f2)}.
77## The slope of the execution time graph shows the approximate
78## power of the asymptotic running time @code{O(n^p)}. This
79## power is plotted for the region over which it is approximated
80## (the latter half of the graph). The estimated power is not
81## very accurate, but should be sufficient to determine the
82## general order of your algorithm. It should indicate if for
83## example your implementation is unexpectedly @code{O(n^2)}
84## rather than @code{O(n)} because it extends a vector each
85## time through the loop rather than preallocating one which is
86## big enough. For example, in the current version of Octave,
87## the following is not the expected @code{O(n)}:
90## speed ("for i = 1:n, y@{i@} = x(i); end", "", [1000,10000])
93## but it is if you preallocate the cell array @code{y}:
97## speed ("for i = 1:n, y@{i@} = x(i); end", ...
98## "x = rand (n, 1); y = cell (size (x));", [1000, 10000])
102## An attempt is made to approximate the cost of the individual
103## operations, but it is wildly inaccurate. You can improve the
104## stability somewhat by doing more work for each @code{n}. For
108## speed ("airy(x)", "x = rand (n, 10)", [10000, 100000])
111## When comparing a new and original expression, the line on the
112## speedup ratio graph should be larger than 1 if the new expression
113## is faster. Better algorithms have a shallow slope. Generally,
114## vectorizing an algorithm will not change the slope of the execution
115## time graph, but it will shift it relative to the original. For
120## speed ("v = sum (x)", "", [10000, 100000], ...
121## "v = 0; for i = 1:length (x), v += x(i); end")
125## A more complex example, if you had an original version of @code{xcorr}
126## using for loops and another version using an FFT, you could compare the
127## run speed for various lags as follows, or for a fixed lag with varying
128## vector lengths as follows:
132## speed ("v = xcorr (x, n)", "x = rand (128, 1);", 100,
133## "v2 = xcorr_orig (x, n)", -100*eps)
134## speed ("v = xcorr (x, 15)", "x = rand (20+n, 1);", 100,
135## "v2 = xcorr_orig (x, n)", -100*eps)
139## Assuming one of the two versions is in @var{xcorr_orig}, this
140## would compare their speed and their output values. Note that the
141## FFT version is not exact, so we specify an acceptable tolerance on
142## the comparison @code{100*eps}, and the errors should be computed
143## relatively, as @code{abs((@var{x} - @var{y})./@var{y})} rather than
144## absolutely as @code{abs(@var{x} - @var{y})}.
146## Type @code{example('speed')} to see some real examples. Note for
147## obscure reasons, you can't run examples 1 and 2 directly using
148## @code{demo('speed')}. Instead use, @code{eval(example('speed',1))}
149## and @code{eval(example('speed',2))}.
152## FIXME: consider two dimensional speedup surfaces for functions like kron.
153function [__order, __test_n, __tnew, __torig] ...
154 = speed (__f1, __init, __max_n, __f2, __tol)
156 if (nargin < 1 || nargin > 6)
160 if (nargin < 2 || isempty (__init))
161 __init = "x = randn(n, 1);";
164 if (nargin < 3 || isempty (__max_n))
172 if (nargin < 5 || isempty (__tol))
178 ## Let user specify range of n.
179 if (isscalar (__max_n))
181 assert (__max_n > __min_n);
182 __test_n = logspace (0, log10 (__max_n), __numtests);
183 elseif (length (__max_n) == 2)
184 __min_n = __max_n(1);
185 __max_n = __max_n(2);
186 assert (__min_n >= 1);
187 __test_n = logspace (log10 (__min_n), log10 (__max_n), __numtests);
191 ## Force n to be an integer.
192 __test_n = unique (round (__test_n));
193 assert (__test_n >= 1);
195 __torig = __tnew = zeros (size (__test_n));
197 disp (cstrcat ("testing ", __f1, "\ninit: ", __init));
199 ## Make sure the functions are freshly loaded by evaluating them at
200 ## test_n(1); first have to initialize the args though.
203 eval (cstrcat (__init, ";"));
204 if (! isempty (__f2))
205 eval (cstrcat (__f2, ";"));
207 eval (cstrcat (__f1, ";"));
210 for k = 1:length (__test_n)
212 eval (cstrcat (__init, ";"));
214 printf ("n%i = %i ",k, n);
216 eval (cstrcat ("__t = time();", __f1, "; __v1=ans; __t = time()-__t;"));
218 eval (cstrcat ("__t2 = time();", __f1, "; __t2 = time()-__t2;"));
219 eval (cstrcat ("__t3 = time();", __f1, "; __t3 = time()-__t3;"));
220 __t = min ([__t, __t2, __t3]);
224 if (! isempty (__f2))
225 eval (cstrcat ("__t = time();", __f2, "; __v2=ans; __t = time()-__t;"));
227 eval (cstrcat ("__t2 = time();", __f2, "; __t2 = time()-__t2;"));
228 eval (cstrcat ("__t3 = time();", __f2, "; __t3 = time()-__t3;"));
232 assert (__v1, __v2, __tol);
237 ## Drop times of zero.
238 if (! isempty (__f2))
239 zidx = (__tnew < 100*eps | __torig < 100*eps);
244 zidx = (__tnew < 100*eps);
249 ## Approximate time complexity and return it if requested.
250 tailidx = ceil(length(__test_n)/2):length(__test_n);
251 p = polyfit (log (__test_n(tailidx)), log (__tnew(tailidx)), 1);
254 __order.a = exp (p(2));
257 ## Plot the data if no output is requested.
258 doplot = (nargout == 0);
264 if (doplot && ! isempty (__f2))
266 semilogx (__test_n, __torig./__tnew,
267 cstrcat ("-*r;", strrep (__f1, ";", "."), "/",
268 strrep (__f2, ";", "."), ";"),
269 __test_n, __tnew./__torig,
270 cstrcat ("-*g;", strrep (__f2, ";", "."), "/",
271 strrep (__f1, ";", "."), ";"));
272 xlabel ("test length");
274 ylabel ("speedup ratio");
277 loglog (__test_n, __tnew*1000,
278 cstrcat ("*-g;", strrep (__f1, ";", "."), ";"),
279 __test_n, __torig*1000,
280 cstrcat ("*-r;", strrep (__f2,";","."), ";"));
282 xlabel ("test length");
283 ylabel ("best execution time (ms)");
284 title (cstrcat ("init: ", __init));
286 ratio = mean (__torig ./ __tnew);
287 printf ("\n\nMean runtime ratio = %.3g for '%s' vs '%s'\n",
292 loglog (__test_n, __tnew*1000, "*-g;execution time;");
293 xlabel ("test length");
294 ylabel ("best execution time (ms)");
295 title (cstrcat (__f1, " init: ", __init));
301 ## Plot time complexity approximation (using milliseconds).
302 order = sprintf ("O(n^%g)", round (10*p(1))/10);
303 v = polyval (p, log (__test_n(tailidx)));
305 loglog (__test_n(tailidx), exp(v)*1000, sprintf ("b;%s;", order));
307 ## Get base time to 1 digit of accuracy.
309 dt = floor (dt/10^floor(log10(dt)))*10^floor(log10(dt));
310 if (log10 (dt) >= -0.5)
311 time = sprintf ("%g s", dt);
312 elseif (log10 (dt) >= -3.5)
313 time = sprintf ("%g ms", dt*1e3);
314 elseif (log10 (dt) >= -6.5)
315 time = sprintf ("%g us", dt*1e6);
317 time = sprintf ("%g ns", dt*1e9);
320 ## Display nicely formatted complexity.
321 printf ("\nFor %s:\n", __f1);
322 printf (" asymptotic power: %s\n", order);
323 printf (" approximate time per operation: %s\n", time);
330%! function x = build_orig(n)
331%! ## extend the target vector on the fly
332%! for i=0:n-1, x([1:10]+i*10) = 1:10; endfor
334%! function x = build(n)
335%! ## preallocate the target vector
336%! x = zeros(1, n*10);
338%! if (prefer_column_vectors), x = x.'; endif
341%! for i=0:n-1, x([1:10]+i*10) = 1:10; endfor
344%! disp("-----------------------");
346%! disp("-----------------------");
348%! disp("-----------------------");
350%! disp("Preallocated vector test.\nThis takes a little while...");
351%! speed('build(n)', '', 1000, 'build_orig(n)');
352%! clear build build_orig
353%! disp("Note how much faster it is to pre-allocate a vector.");
354%! disp("Notice the peak speedup ratio.");
358%! function x = build_orig(n)
359%! for i=0:n-1, x([1:10]+i*10) = 1:10; endfor
361%! function x = build(n)
363%! x = idx(:,ones(1,n));
364%! x = reshape(x, 1, n*10);
366%! if (prefer_column_vectors), x = x.'; endif
371%! disp("-----------------------");
373%! disp("-----------------------");
375%! disp("-----------------------");
377%! disp("Vectorized test. This takes a little while...");
378%! speed('build(n)', '', 1000, 'build_orig(n)');
379%! clear build build_orig
380%! disp("-----------------------");
381%! disp("This time, the for loop is done away with entirely.");
382%! disp("Notice how much bigger the speedup is then in example 1.");