changelog shortlog tags changeset files revisions annotate raw

scripts/statistics/base/quantile.m

changeset 10289: 4b124317dc38
parent:634274aaa183
author: John W. Eaton <jwe@octave.org>
date: Tue Feb 09 20:58:55 2010 -0500 (34 minutes ago)
permissions: -rw-r--r--
description: base_properties::set_children: account for hidden children
1## Copyright (C) 2008, 2009 Ben Abbott and Jaroslav Hajek
2##
3## This file is part of Octave.
4##
5## Octave is free software; you can redistribute it and/or modify it
6## under the terms of the GNU General Public License as published by
7## the Free Software Foundation; either version 3 of the License, or (at
8## your option) any later version.
9##
10## Octave is distributed in the hope that it will be useful, but
11## WITHOUT ANY WARRANTY; without even the implied warranty of
12## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13## General Public License for more details.
14##
15## You should have received a copy of the GNU General Public License
16## along with Octave; see the file COPYING. If not, see
17## <http://www.gnu.org/licenses/>.
18
19## -*- texinfo -*-
20## @deftypefn {Function File} {@var{q} =} quantile (@var{x}, @var{p})
21## @deftypefnx {Function File} {@var{q} =} quantile (@var{x}, @var{p}, @var{dim})
22## @deftypefnx {Function File} {@var{q} =} quantile (@var{x}, @var{p}, @var{dim}, @var{method})
23## For a sample, @var{x}, calculate the quantiles, @var{q}, corresponding to
24## the cumulative probability values in @var{p}. All non-numeric values (NaNs) of
25## @var{x} are ignored.
26##
27## If @var{x} is a matrix, compute the quantiles for each column and
28## return them in a matrix, such that the i-th row of @var{q} contains
29## the @var{p}(i)th quantiles of each column of @var{x}.
30##
31## The optional argument @var{dim} determines the dimension along which
32## the percentiles are calculated. If @var{dim} is omitted, and @var{x} is
33## a vector or matrix, it defaults to 1 (column wise quantiles). In the
34## instance that @var{x} is an N-d array, @var{dim} defaults to the first
35## dimension whose size is greater than unity.
36##
37## The methods available to calculate sample quantiles are the nine methods
38## used by R (http://www.r-project.org/). The default value is METHOD = 5.
39##
40## Discontinuous sample quantile methods 1, 2, and 3
41##
42## @enumerate 1
43## @item Method 1: Inverse of empirical distribution function.
44## @item Method 2: Similar to method 1 but with averaging at discontinuities.
45## @item Method 3: SAS definition: nearest even order statistic.
46## @end enumerate
47##
48## Continuous sample quantile methods 4 through 9, where p(k) is the linear
49## interpolation function respecting each method's representative cdf.
50##
51## @enumerate 4
52## @item Method 4: p(k) = k / n. That is, linear interpolation of the empirical cdf.
53## @item Method 5: p(k) = (k - 0.5) / n. That is a piecewise linear function where
54## the knots are the values midway through the steps of the empirical cdf.
55## @item Method 6: p(k) = k / (n + 1).
56## @item Method 7: p(k) = (k - 1) / (n - 1).
57## @item Method 8: p(k) = (k - 1/3) / (n + 1/3). The resulting quantile estimates
58## are approximately median-unbiased regardless of the distribution of @var{x}.
59## @item Method 9: p(k) = (k - 3/8) / (n + 1/4). The resulting quantile estimates
60## are approximately unbiased for the expected order statistics if @var{x} is
61## normally distributed.
62## @end enumerate
63##
64## Hyndman and Fan (1996) recommend method 8. Maxima, S, and R
65## (versions prior to 2.0.0) use 7 as their default. Minitab and SPSS
66## use method 6. @sc{matlab} uses method 5.
67##
68## References:
69##
70## @itemize @bullet
71## @item Becker, R. A., Chambers, J. M. and Wilks, A. R. (1988) The New
72## S Language. Wadsworth & Brooks/Cole.
73##
74## @item Hyndman, R. J. and Fan, Y. (1996) Sample quantiles in
75## statistical packages, American Statistician, 50, 361--365.
76##
77## @item R: A Language and Environment for Statistical Computing;
78## @url{http://cran.r-project.org/doc/manuals/fullrefman.pdf}.
79## @end itemize
80## @end deftypefn
81
82## Author: Ben Abbott <bpabbott@mac.com>
83## Description: Matlab style quantile function of a discrete/continuous distribution
84
## Compute the quantiles of X along dimension DIM.
##
## Inputs:
##   x      - sample data (vector, matrix or N-d array).
##   p      - cumulative probabilities; defaults to [0, 0.25, 0.5, 0.75, 1].
##   dim    - dimension to operate along.  When omitted it is 1 for vectors
##            and matrices, and the first dimension whose size is greater
##            than one for N-d arrays.
##   method - quantile method number passed on to __quantile__; default 5.
##
## Output:
##   q      - array shaped like x, except that size (q, dim) == numel (p).
function q = quantile (x, p, dim, method)

  if (nargin < 1 || nargin > 4)
    print_usage ();
  endif

  if (nargin < 2)
    p = [0.00, 0.25, 0.50, 0.75, 1.00];
  endif

  if (nargin < 3)
    ## Vectors and matrices are processed column-wise.  For N-d arrays the
    ## help text promises the first non-singleton dimension.
    dim = 1;
    if (ndims (x) > 2)
      first_nonsingleton = find (size (x) > 1, 1);
      if (! isempty (first_nonsingleton))
        dim = first_nonsingleton;
      endif
    endif
  endif

  if (nargin < 4)
    method = 5;
  endif

  ## DIM is used as an index into the permutation vector below, so it must
  ## be a scalar positive integer no larger than the number of dimensions.
  if (! (isscalar (dim) && dim == fix (dim) && dim >= 1 && dim <= ndims (x)))
    error ("quantile: invalid dimension");
  endif

  ## Set the permutation vector: swap DIM with the first dimension.
  ## A swap is its own inverse, so ipermute below restores the layout.
  perm = 1:ndims (x);
  perm(1) = dim;
  perm(dim) = 1;

  ## Permute dim to the 1st index.
  x = permute (x, perm);

  ## Save the size of the permuted x N-d array.
  sx = size (x);

  ## Reshape to a 2-d array so that every column is one sample.
  x = reshape (x, [sx(1), prod(sx(2:end))]);

  ## Calculate the quantiles, one row per probability in p.
  q = __quantile__ (x, p, method);

  ## Return the shape to the original N-d array (first dimension is now
  ## numel (p) long).
  q = reshape (q, [numel(p), sx(2:end)]);

  ## Permute the 1st index back to dim.
  q = ipermute (q, perm);

endfunction
131
132%!test
133%! p = 0.5;
134%! x = sort (rand (11));
135%! q = quantile (x, p);
136%! assert (q, x(6,:))
137%! x = x.';
138%! q = quantile (x, p, 2);
139%! assert (q, x(:,6));
140
141%!test
142%! p = [0.00, 0.25, 0.50, 0.75, 1.00];
143%! x = [1; 2; 3; 4];
144%! a = [1.0000 1.0000 2.0000 3.0000 4.0000
145%! 1.0000 1.5000 2.5000 3.5000 4.0000
146%! 1.0000 1.0000 2.0000 3.0000 4.0000
147%! 1.0000 1.0000 2.0000 3.0000 4.0000
148%! 1.0000 1.5000 2.5000 3.5000 4.0000
149%! 1.0000 1.2500 2.5000 3.7500 4.0000
150%! 1.0000 1.7500 2.5000 3.2500 4.0000
151%! 1.0000 1.4167 2.5000 3.5833 4.0000
152%! 1.0000 1.4375 2.5000 3.5625 4.0000];
153%! for m = (1:9)
154%! q = quantile (x, p, 1, m).';
155%! assert (q, a(m,:), 0.0001)
156%! endfor
157
158%!test
159%! p = [0.00, 0.25, 0.50, 0.75, 1.00];
160%! x = [1; 2; 3; 4; 5];
161%! a = [1.0000 2.0000 3.0000 4.0000 5.0000
162%! 1.0000 2.0000 3.0000 4.0000 5.0000
163%! 1.0000 1.0000 2.0000 4.0000 5.0000
164%! 1.0000 1.2500 2.5000 3.7500 5.0000
165%! 1.0000 1.7500 3.0000 4.2500 5.0000
166%! 1.0000 1.5000 3.0000 4.5000 5.0000
167%! 1.0000 2.0000 3.0000 4.0000 5.0000
168%! 1.0000 1.6667 3.0000 4.3333 5.0000
169%! 1.0000 1.6875 3.0000 4.3125 5.0000];
170%! for m = (1:9)
171%! q = quantile (x, p, 1, m).';
172%! assert (q, a(m,:), 0.0001)
173%! endfor
174
175%!test
176%! p = [0.00, 0.25, 0.50, 0.75, 1.00];
177%! x = [1; 2; 5; 9];
178%! a = [1.0000 1.0000 2.0000 5.0000 9.0000
179%! 1.0000 1.5000 3.5000 7.0000 9.0000
180%! 1.0000 1.0000 2.0000 5.0000 9.0000
181%! 1.0000 1.0000 2.0000 5.0000 9.0000
182%! 1.0000 1.5000 3.5000 7.0000 9.0000
183%! 1.0000 1.2500 3.5000 8.0000 9.0000
184%! 1.0000 1.7500 3.5000 6.0000 9.0000
185%! 1.0000 1.4167 3.5000 7.3333 9.0000
186%! 1.0000 1.4375 3.5000 7.2500 9.0000];
187%! for m = (1:9)
188%! q = quantile (x, p, 1, m).';
189%! assert (q, a(m,:), 0.0001)
190%! endfor
191
192%!test
193%! p = [0.00, 0.25, 0.50, 0.75, 1.00];
194%! x = [1; 2; 5; 9; 11];
195%! a = [1.0000 2.0000 5.0000 9.0000 11.0000
196%! 1.0000 2.0000 5.0000 9.0000 11.0000
197%! 1.0000 1.0000 2.0000 9.0000 11.0000
198%! 1.0000 1.2500 3.5000 8.0000 11.0000
199%! 1.0000 1.7500 5.0000 9.5000 11.0000
200%! 1.0000 1.5000 5.0000 10.0000 11.0000
201%! 1.0000 2.0000 5.0000 9.0000 11.0000
202%! 1.0000 1.6667 5.0000 9.6667 11.0000
203%! 1.0000 1.6875 5.0000 9.6250 11.0000];
204%! for m = (1:9)
205%! q = quantile (x, p, 1, m).';
206%! assert (q, a(m,:), 0.0001)
207%! endfor
208
209%!test
210%! p = [0.00, 0.25, 0.50, 0.75, 1.00];
211%! x = [16; 11; 15; 12; 15; 8; 11; 12; 6; 10];
212%! a = [6.0000 10.0000 11.0000 15.0000 16.0000
213%! 6.0000 10.0000 11.5000 15.0000 16.0000
214%! 6.0000 8.0000 11.0000 15.0000 16.0000
215%! 6.0000 9.0000 11.0000 13.5000 16.0000
216%! 6.0000 10.0000 11.5000 15.0000 16.0000
217%! 6.0000 9.5000 11.5000 15.0000 16.0000
218%! 6.0000 10.2500 11.5000 14.2500 16.0000
219%! 6.0000 9.8333 11.5000 15.0000 16.0000
220%! 6.0000 9.8750 11.5000 15.0000 16.0000];
221%! for m = (1:9)
222%! q = quantile (x, p, 1, m).';
223%! assert (q, a(m,:), 0.0001)
224%! endfor
225
226%!test
227%! p = [0.00, 0.25, 0.50, 0.75, 1.00];
228%! x = [-0.58851; 0.40048; 0.49527; -2.551500; -0.52057; ...
229%! -0.17841; 0.057322; -0.62523; 0.042906; 0.12337];
230%! a = [-2.551474 -0.588505 -0.178409 0.123366 0.495271
231%! -2.551474 -0.588505 -0.067751 0.123366 0.495271
232%! -2.551474 -0.625231 -0.178409 0.123366 0.495271
233%! -2.551474 -0.606868 -0.178409 0.090344 0.495271
234%! -2.551474 -0.588505 -0.067751 0.123366 0.495271
235%! -2.551474 -0.597687 -0.067751 0.192645 0.495271
236%! -2.551474 -0.571522 -0.067751 0.106855 0.495271
237%! -2.551474 -0.591566 -0.067751 0.146459 0.495271
238%! -2.551474 -0.590801 -0.067751 0.140686 0.495271];
239%! for m = (1:9)
240%! q = quantile (x, p, 1, m).';
241%! assert (q, a(m,:), 0.0001)
242%! endfor
243
244%!test
245%! p = 0.5;
246%! x = [0.112600, 0.114800, 0.052100, 0.236400, 0.139300
247%! 0.171800, 0.727300, 0.204100, 0.453100, 0.158500
248%! 0.279500, 0.797800, 0.329600, 0.556700, 0.730700
249%! 0.428800, 0.875300, 0.647700, 0.628700, 0.816500
250%! 0.933100, 0.931200, 0.963500, 0.779600, 0.846100];
251%! tol = 0.00001;
252%! x(5,5) = NaN;
253%! assert (quantile(x, p, 1), [0.27950, 0.79780, 0.32960, 0.55670, 0.44460], tol);
254%! x(1,1) = NaN;
255%! assert (quantile(x, p, 1), [0.35415, 0.79780, 0.32960, 0.55670, 0.44460], tol);
256%! x(3,3) = NaN;
257%! assert (quantile(x, p, 1), [0.35415, 0.79780, 0.42590, 0.55670, 0.44460], tol);
258
259%!test
260%! sx = [2, 3, 4];
261%! x = rand (sx);
262%! dim = 2;
263%! p = 0.5;
264%! yobs = quantile (x, p, dim);
265%! yexp = median (x, dim);
266%! assert (yobs, yexp);
267
## For the cumulative probability values in @var{p}, compute the
## quantiles, @var{q} (the inverse of the cdf), for the sample, @var{x}.
##
## Each column of @var{x} is treated as a separate sample and NaN entries
## are not counted.  Row i of the result holds the @var{p}(i) quantile of
## every column.  Probabilities below 0 yield -Inf and probabilities above
## 1 yield Inf.
##
## The optional input, @var{method}, refers to nine methods available in R
## (http://www.r-project.org/).  The default is @var{method} = 5.  For more
## detail, see `help quantile'.
## @seealso{prctile, quantile, statistics}

## Author: Ben Abbott <bpabbott@mac.com>
## Vectorized version: Jaroslav Hajek <highegg@gmail.com>
## Description: Quantile function of empirical samples

function inv = __quantile__ (x, p, method = 5)

  if (nargin < 2 || nargin > 3)
    print_usage ();
  endif

  if (! ismatrix (x))
    error ("quantile: x must be a matrix");
  endif

  ## Work with the probabilities as a column vector.
  p = p(:);

  ## Sort every column and count its non-NaN entries.
  ## The indexing below relies on sort (ascending) placing NaNs after all
  ## other values, so that the first m(j) rows of column j are the data.
  x = sort (x);
  m = sum (! isnan (x), 1);
  mx = size (x, 1);
  nx = size (x, 2);

  ## Initialize output values: -Inf where p < 0 and Inf where p > 1.
  ## Entries for p in [0, 1] start out as NaN (Inf*0) and are filled below.
  inv = Inf*(-(p < 0) + (p > 1));
  inv = repmat (inv, 1, nx);

  ## Do the work for the probabilities that lie in [0, 1].
  k = find ((p >= 0) & (p <= 1));
  if (! isempty (k))
    n = length (k);
    p = p(k);

    ## Special case: with a single row every quantile is that row.
    if (mx == 1)
      inv(k,:) = repmat (x, n, 1);
      return
    endif

    ## The column-distribution indices: linear-index offset of each column
    ## of x, replicated for every probability.
    pcd = kron (ones (n, 1), mx*(0:nx-1));
    ## Number of valid samples per column, replicated the same way.
    mm = kron (ones (n, 1), m);

    switch method
      case {1, 2, 3}
        switch method
          case 1
            pos = max (ceil (kron (p, m)), 1);
            inv(k,:) = x(pos + pcd);

          case 2
            pos = kron (p, m);
            pos_lo = max (ceil (pos), 1);
            ## Clamp to at least 1: for a column without valid samples
            ## (m == 0) the upper bound alone would give index 0.
            pos_hi = max (min (floor (pos + 1), mm), 1);
            inv(k,:) = (x(pos_lo + pcd) + x(pos_hi + pcd))/2;

          case 3
            ## Used by SAS, method PCTLDEF=2.
            ## http://support.sas.com/onlinedoc/913/getDoc/en/statug.hlp/stdize_sect14.htm
            pos = max (kron (p, m), 1);
            pos = roundb (pos);
            inv(k,:) = x(pos + pcd);
        endswitch

      otherwise
        ## Continuous methods: compute a fractional position in the sorted
        ## sample, then interpolate linearly between its neighbours.
        switch method
          case 4
            pos = kron (p, m);

          case 5
            ## Used by Matlab.
            pos = kron (p, m) + 0.5;

          case 6
            ## Used by Minitab and SPSS.
            pos = kron (p, m+1);

          case 7
            ## Used by S and R.
            pos = kron (p, m-1) + 1;

          case 8
            ## Median unbiased.
            pos = kron (p, m+1/3) + 1/3;

          case 9
            ## Approximately unbiased respecting order statistics.
            pos = kron (p, m+0.25) + 0.375;

          otherwise
            error ("quantile: Unknown method, '%d'", method);
        endswitch

        ## Duplicate single values so that columns with exactly one valid
        ## sample interpolate between two equal numbers.
        single = (mm == 1);
        x(2,single) = x(1,single);

        ## Interval indices: lower neighbour clamped to [1, m-1] and the
        ## interpolation weight clamped to [0, 1].
        idx = max (min (floor (pos), mm-1), 1);
        frac = max (min (pos - idx, 1), 0);
        idx += pcd;
        inv(k,:) = (1-frac) .* x(idx) + frac .* x(idx+1);
    endswitch
  endif

endfunction