001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one or more
003     * contributor license agreements.  See the NOTICE file distributed with
004     * this work for additional information regarding copyright ownership.
005     * The ASF licenses this file to You under the Apache License, Version 2.0
006     * (the "License"); you may not use this file except in compliance with
007     * the License.  You may obtain a copy of the License at
008     *
009     *      http://www.apache.org/licenses/LICENSE-2.0
010     *
011     * Unless required by applicable law or agreed to in writing, software
012     * distributed under the License is distributed on an "AS IS" BASIS,
013     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     * See the License for the specific language governing permissions and
015     * limitations under the License.
016     */
017    
018    package org.apache.commons.math.stat.regression;
019    import java.io.Serializable;
020    
021    import org.apache.commons.math.MathException;
022    import org.apache.commons.math.MathRuntimeException;
023    import org.apache.commons.math.distribution.TDistribution;
024    import org.apache.commons.math.distribution.TDistributionImpl;
025    import org.apache.commons.math.exception.util.LocalizedFormats;
026    import org.apache.commons.math.util.FastMath;
027    
028    /**
029     * Estimates an ordinary least squares regression model
030     * with one independent variable.
031     * <p>
032     * <code> y = intercept + slope * x  </code></p>
033     * <p>
034     * Standard errors for <code>intercept</code> and <code>slope</code> are
035     * available as well as ANOVA, r-square and Pearson's r statistics.</p>
036     * <p>
037     * Observations (x,y pairs) can be added to the model one at a time or they
038     * can be provided in a 2-dimensional array.  The observations are not stored
039     * in memory, so there is no limit to the number of observations that can be
040     * added to the model.</p>
041     * <p>
042     * <strong>Usage Notes</strong>: <ul>
043     * <li> When there are fewer than two observations in the model, or when
044     * there is no variation in the x values (i.e. all x values are the same)
045     * all statistics return <code>NaN</code>. At least two observations with
046     * different x coordinates are requred to estimate a bivariate regression
047     * model.
048     * </li>
049     * <li> getters for the statistics always compute values based on the current
050     * set of observations -- i.e., you can get statistics, then add more data
051     * and get updated statistics without using a new instance.  There is no
052     * "compute" method that updates all statistics.  Each of the getters performs
053     * the necessary computations to return the requested statistic.</li>
054     * </ul></p>
055     *
056     * @version $Revision: 1042336 $ $Date: 2010-12-05 13:40:48 +0100 (dim. 05 d??c. 2010) $
057     */
058    public class SimpleRegression implements Serializable {
059    
060        /** Serializable version identifier */
061        private static final long serialVersionUID = -3004689053607543335L;
062    
063        /** the distribution used to compute inference statistics. */
064        private TDistribution distribution;
065    
066        /** sum of x values */
067        private double sumX = 0d;
068    
069        /** total variation in x (sum of squared deviations from xbar) */
070        private double sumXX = 0d;
071    
072        /** sum of y values */
073        private double sumY = 0d;
074    
075        /** total variation in y (sum of squared deviations from ybar) */
076        private double sumYY = 0d;
077    
078        /** sum of products */
079        private double sumXY = 0d;
080    
081        /** number of observations */
082        private long n = 0;
083    
084        /** mean of accumulated x values, used in updating formulas */
085        private double xbar = 0;
086    
087        /** mean of accumulated y values, used in updating formulas */
088        private double ybar = 0;
089    
090        // ---------------------Public methods--------------------------------------
091    
092        /**
093         * Create an empty SimpleRegression instance
094         */
095        public SimpleRegression() {
096            this(new TDistributionImpl(1.0));
097        }
098    
099        /**
100         * Create an empty SimpleRegression using the given distribution object to
101         * compute inference statistics.
102         * @param t the distribution used to compute inference statistics.
103         * @since 1.2
104         * @deprecated in 2.2 (to be removed in 3.0). Please use the {@link
105         * #SimpleRegression(int) other constructor} instead.
106         */
107        @Deprecated
108        public SimpleRegression(TDistribution t) {
109            super();
110            setDistribution(t);
111        }
112    
113        /**
114         * Create an empty SimpleRegression.
115         *
116         * @param degrees Number of degrees of freedom of the distribution
117         * used to compute inference statistics.
118         * @since 2.2
119         */
120        public SimpleRegression(int degrees) {
121            setDistribution(new TDistributionImpl(degrees));
122        }
123    
124        /**
125         * Adds the observation (x,y) to the regression data set.
126         * <p>
127         * Uses updating formulas for means and sums of squares defined in
128         * "Algorithms for Computing the Sample Variance: Analysis and
129         * Recommendations", Chan, T.F., Golub, G.H., and LeVeque, R.J.
130         * 1983, American Statistician, vol. 37, pp. 242-247, referenced in
131         * Weisberg, S. "Applied Linear Regression". 2nd Ed. 1985.</p>
132         *
133         *
134         * @param x independent variable value
135         * @param y dependent variable value
136         */
137        public void addData(double x, double y) {
138            if (n == 0) {
139                xbar = x;
140                ybar = y;
141            } else {
142                double dx = x - xbar;
143                double dy = y - ybar;
144                sumXX += dx * dx * (double) n / (n + 1d);
145                sumYY += dy * dy * (double) n / (n + 1d);
146                sumXY += dx * dy * (double) n / (n + 1d);
147                xbar += dx / (n + 1.0);
148                ybar += dy / (n + 1.0);
149            }
150            sumX += x;
151            sumY += y;
152            n++;
153    
154            if (n > 2) {
155                distribution.setDegreesOfFreedom(n - 2);
156            }
157        }
158    
159    
160        /**
161         * Removes the observation (x,y) from the regression data set.
162         * <p>
163         * Mirrors the addData method.  This method permits the use of
164         * SimpleRegression instances in streaming mode where the regression
165         * is applied to a sliding "window" of observations, however the caller is
166         * responsible for maintaining the set of observations in the window.</p>
167         *
168         * The method has no effect if there are no points of data (i.e. n=0)
169         *
170         * @param x independent variable value
171         * @param y dependent variable value
172         */
173        public void removeData(double x, double y) {
174            if (n > 0) {
175                double dx = x - xbar;
176                double dy = y - ybar;
177                sumXX -= dx * dx * (double) n / (n - 1d);
178                sumYY -= dy * dy * (double) n / (n - 1d);
179                sumXY -= dx * dy * (double) n / (n - 1d);
180                xbar -= dx / (n - 1.0);
181                ybar -= dy / (n - 1.0);
182                sumX -= x;
183                sumY -= y;
184                n--;
185    
186                if (n > 2) {
187                    distribution.setDegreesOfFreedom(n - 2);
188                }
189            }
190        }
191    
192        /**
193         * Adds the observations represented by the elements in
194         * <code>data</code>.
195         * <p>
196         * <code>(data[0][0],data[0][1])</code> will be the first observation, then
197         * <code>(data[1][0],data[1][1])</code>, etc.</p>
198         * <p>
199         * This method does not replace data that has already been added.  The
200         * observations represented by <code>data</code> are added to the existing
201         * dataset.</p>
202         * <p>
203         * To replace all data, use <code>clear()</code> before adding the new
204         * data.</p>
205         *
206         * @param data array of observations to be added
207         */
208        public void addData(double[][] data) {
209            for (int i = 0; i < data.length; i++) {
210                addData(data[i][0], data[i][1]);
211            }
212        }
213    
214    
215        /**
216         * Removes observations represented by the elements in <code>data</code>.
217          * <p>
218         * If the array is larger than the current n, only the first n elements are
219         * processed.  This method permits the use of SimpleRegression instances in
220         * streaming mode where the regression is applied to a sliding "window" of
221         * observations, however the caller is responsible for maintaining the set
222         * of observations in the window.</p>
223         * <p>
224         * To remove all data, use <code>clear()</code>.</p>
225         *
226         * @param data array of observations to be removed
227         */
228        public void removeData(double[][] data) {
229            for (int i = 0; i < data.length && n > 0; i++) {
230                removeData(data[i][0], data[i][1]);
231            }
232        }
233    
234        /**
235         * Clears all data from the model.
236         */
237        public void clear() {
238            sumX = 0d;
239            sumXX = 0d;
240            sumY = 0d;
241            sumYY = 0d;
242            sumXY = 0d;
243            n = 0;
244        }
245    
246        /**
247         * Returns the number of observations that have been added to the model.
248         *
249         * @return n number of observations that have been added.
250         */
251        public long getN() {
252            return n;
253        }
254    
255        /**
256         * Returns the "predicted" <code>y</code> value associated with the
257         * supplied <code>x</code> value,  based on the data that has been
258         * added to the model when this method is activated.
259         * <p>
260         * <code> predict(x) = intercept + slope * x </code></p>
261         * <p>
262         * <strong>Preconditions</strong>: <ul>
263         * <li>At least two observations (with at least two different x values)
264         * must have been added before invoking this method. If this method is
265         * invoked before a model can be estimated, <code>Double,NaN</code> is
266         * returned.
267         * </li></ul></p>
268         *
269         * @param x input <code>x</code> value
270         * @return predicted <code>y</code> value
271         */
272        public double predict(double x) {
273            double b1 = getSlope();
274            return getIntercept(b1) + b1 * x;
275        }
276    
277        /**
278         * Returns the intercept of the estimated regression line.
279         * <p>
280         * The least squares estimate of the intercept is computed using the
281         * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>.
282         * The intercept is sometimes denoted b0.</p>
283         * <p>
284         * <strong>Preconditions</strong>: <ul>
285         * <li>At least two observations (with at least two different x values)
286         * must have been added before invoking this method. If this method is
287         * invoked before a model can be estimated, <code>Double,NaN</code> is
288         * returned.
289         * </li></ul></p>
290         *
291         * @return the intercept of the regression line
292         */
293        public double getIntercept() {
294            return getIntercept(getSlope());
295        }
296    
297        /**
298        * Returns the slope of the estimated regression line.
299        * <p>
300        * The least squares estimate of the slope is computed using the
301        * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>.
302        * The slope is sometimes denoted b1.</p>
303        * <p>
304        * <strong>Preconditions</strong>: <ul>
305        * <li>At least two observations (with at least two different x values)
306        * must have been added before invoking this method. If this method is
307        * invoked before a model can be estimated, <code>Double.NaN</code> is
308        * returned.
309        * </li></ul></p>
310        *
311        * @return the slope of the regression line
312        */
313        public double getSlope() {
314            if (n < 2) {
315                return Double.NaN; //not enough data
316            }
317            if (FastMath.abs(sumXX) < 10 * Double.MIN_VALUE) {
318                return Double.NaN; //not enough variation in x
319            }
320            return sumXY / sumXX;
321        }
322    
323        /**
324         * Returns the <a href="http://www.xycoon.com/SumOfSquares.htm">
325         * sum of squared errors</a> (SSE) associated with the regression
326         * model.
327         * <p>
328         * The sum is computed using the computational formula</p>
329         * <p>
330         * <code>SSE = SYY - (SXY * SXY / SXX)</code></p>
331         * <p>
332         * where <code>SYY</code> is the sum of the squared deviations of the y
333         * values about their mean, <code>SXX</code> is similarly defined and
334         * <code>SXY</code> is the sum of the products of x and y mean deviations.
335         * </p><p>
336         * The sums are accumulated using the updating algorithm referenced in
337         * {@link #addData}.</p>
338         * <p>
339         * The return value is constrained to be non-negative - i.e., if due to
340         * rounding errors the computational formula returns a negative result,
341         * 0 is returned.</p>
342         * <p>
343         * <strong>Preconditions</strong>: <ul>
344         * <li>At least two observations (with at least two different x values)
345         * must have been added before invoking this method. If this method is
346         * invoked before a model can be estimated, <code>Double,NaN</code> is
347         * returned.
348         * </li></ul></p>
349         *
350         * @return sum of squared errors associated with the regression model
351         */
352        public double getSumSquaredErrors() {
353            return FastMath.max(0d, sumYY - sumXY * sumXY / sumXX);
354        }
355    
356        /**
357         * Returns the sum of squared deviations of the y values about their mean.
358         * <p>
359         * This is defined as SSTO
360         * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a>.</p>
361         * <p>
362         * If <code>n < 2</code>, this returns <code>Double.NaN</code>.</p>
363         *
364         * @return sum of squared deviations of y values
365         */
366        public double getTotalSumSquares() {
367            if (n < 2) {
368                return Double.NaN;
369            }
370            return sumYY;
371        }
372    
373        /**
374         * Returns the sum of squared deviations of the x values about their mean.
375         *
376         * If <code>n < 2</code>, this returns <code>Double.NaN</code>.</p>
377         *
378         * @return sum of squared deviations of x values
379         */
380        public double getXSumSquares() {
381            if (n < 2) {
382                return Double.NaN;
383            }
384            return sumXX;
385        }
386    
387        /**
388         * Returns the sum of crossproducts, x<sub>i</sub>*y<sub>i</sub>.
389         *
390         * @return sum of cross products
391         */
392        public double getSumOfCrossProducts() {
393            return sumXY;
394        }
395    
396        /**
397         * Returns the sum of squared deviations of the predicted y values about
398         * their mean (which equals the mean of y).
399         * <p>
400         * This is usually abbreviated SSR or SSM.  It is defined as SSM
401         * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a></p>
402         * <p>
403         * <strong>Preconditions</strong>: <ul>
404         * <li>At least two observations (with at least two different x values)
405         * must have been added before invoking this method. If this method is
406         * invoked before a model can be estimated, <code>Double.NaN</code> is
407         * returned.
408         * </li></ul></p>
409         *
410         * @return sum of squared deviations of predicted y values
411         */
412        public double getRegressionSumSquares() {
413            return getRegressionSumSquares(getSlope());
414        }
415    
416        /**
417         * Returns the sum of squared errors divided by the degrees of freedom,
418         * usually abbreviated MSE.
419         * <p>
420         * If there are fewer than <strong>three</strong> data pairs in the model,
421         * or if there is no variation in <code>x</code>, this returns
422         * <code>Double.NaN</code>.</p>
423         *
424         * @return sum of squared deviations of y values
425         */
426        public double getMeanSquareError() {
427            if (n < 3) {
428                return Double.NaN;
429            }
430            return getSumSquaredErrors() / (n - 2);
431        }
432    
433        /**
434         * Returns <a href="http://mathworld.wolfram.com/CorrelationCoefficient.html">
435         * Pearson's product moment correlation coefficient</a>,
436         * usually denoted r.
437         * <p>
438         * <strong>Preconditions</strong>: <ul>
439         * <li>At least two observations (with at least two different x values)
440         * must have been added before invoking this method. If this method is
441         * invoked before a model can be estimated, <code>Double,NaN</code> is
442         * returned.
443         * </li></ul></p>
444         *
445         * @return Pearson's r
446         */
447        public double getR() {
448            double b1 = getSlope();
449            double result = FastMath.sqrt(getRSquare());
450            if (b1 < 0) {
451                result = -result;
452            }
453            return result;
454        }
455    
456        /**
457         * Returns the <a href="http://www.xycoon.com/coefficient1.htm">
458         * coefficient of determination</a>,
459         * usually denoted r-square.
460         * <p>
461         * <strong>Preconditions</strong>: <ul>
462         * <li>At least two observations (with at least two different x values)
463         * must have been added before invoking this method. If this method is
464         * invoked before a model can be estimated, <code>Double,NaN</code> is
465         * returned.
466         * </li></ul></p>
467         *
468         * @return r-square
469         */
470        public double getRSquare() {
471            double ssto = getTotalSumSquares();
472            return (ssto - getSumSquaredErrors()) / ssto;
473        }
474    
475        /**
476         * Returns the <a href="http://www.xycoon.com/standarderrorb0.htm">
477         * standard error of the intercept estimate</a>,
478         * usually denoted s(b0).
479         * <p>
480         * If there are fewer that <strong>three</strong> observations in the
481         * model, or if there is no variation in x, this returns
482         * <code>Double.NaN</code>.</p>
483         *
484         * @return standard error associated with intercept estimate
485         */
486        public double getInterceptStdErr() {
487            return FastMath.sqrt(
488                getMeanSquareError() * ((1d / (double) n) + (xbar * xbar) / sumXX));
489        }
490    
491        /**
492         * Returns the <a href="http://www.xycoon.com/standerrorb(1).htm">standard
493         * error of the slope estimate</a>,
494         * usually denoted s(b1).
495         * <p>
496         * If there are fewer that <strong>three</strong> data pairs in the model,
497         * or if there is no variation in x, this returns <code>Double.NaN</code>.
498         * </p>
499         *
500         * @return standard error associated with slope estimate
501         */
502        public double getSlopeStdErr() {
503            return FastMath.sqrt(getMeanSquareError() / sumXX);
504        }
505    
506        /**
507         * Returns the half-width of a 95% confidence interval for the slope
508         * estimate.
509         * <p>
510         * The 95% confidence interval is</p>
511         * <p>
512         * <code>(getSlope() - getSlopeConfidenceInterval(),
513         * getSlope() + getSlopeConfidenceInterval())</code></p>
514         * <p>
515         * If there are fewer that <strong>three</strong> observations in the
516         * model, or if there is no variation in x, this returns
517         * <code>Double.NaN</code>.</p>
518         * <p>
519         * <strong>Usage Note</strong>:<br>
520         * The validity of this statistic depends on the assumption that the
521         * observations included in the model are drawn from a
522         * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
523         * Bivariate Normal Distribution</a>.</p>
524         *
525         * @return half-width of 95% confidence interval for the slope estimate
526         * @throws MathException if the confidence interval can not be computed.
527         */
528        public double getSlopeConfidenceInterval() throws MathException {
529            return getSlopeConfidenceInterval(0.05d);
530        }
531    
532        /**
533         * Returns the half-width of a (100-100*alpha)% confidence interval for
534         * the slope estimate.
535         * <p>
536         * The (100-100*alpha)% confidence interval is </p>
537         * <p>
538         * <code>(getSlope() - getSlopeConfidenceInterval(),
539         * getSlope() + getSlopeConfidenceInterval())</code></p>
540         * <p>
541         * To request, for example, a 99% confidence interval, use
542         * <code>alpha = .01</code></p>
543         * <p>
544         * <strong>Usage Note</strong>:<br>
545         * The validity of this statistic depends on the assumption that the
546         * observations included in the model are drawn from a
547         * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
548         * Bivariate Normal Distribution</a>.</p>
549         * <p>
550         * <strong> Preconditions:</strong><ul>
551         * <li>If there are fewer that <strong>three</strong> observations in the
552         * model, or if there is no variation in x, this returns
553         * <code>Double.NaN</code>.
554         * </li>
555         * <li><code>(0 < alpha < 1)</code>; otherwise an
556         * <code>IllegalArgumentException</code> is thrown.
557         * </li></ul></p>
558         *
559         * @param alpha the desired significance level
560         * @return half-width of 95% confidence interval for the slope estimate
561         * @throws MathException if the confidence interval can not be computed.
562         */
563        public double getSlopeConfidenceInterval(double alpha)
564            throws MathException {
565            if (alpha >= 1 || alpha <= 0) {
566                throw MathRuntimeException.createIllegalArgumentException(
567                      LocalizedFormats.OUT_OF_BOUND_SIGNIFICANCE_LEVEL,
568                      alpha, 0.0, 1.0);
569            }
570            return getSlopeStdErr() *
571                distribution.inverseCumulativeProbability(1d - alpha / 2d);
572        }
573    
574        /**
575         * Returns the significance level of the slope (equiv) correlation.
576         * <p>
577         * Specifically, the returned value is the smallest <code>alpha</code>
578         * such that the slope confidence interval with significance level
579         * equal to <code>alpha</code> does not include <code>0</code>.
580         * On regression output, this is often denoted <code>Prob(|t| > 0)</code>
581         * </p><p>
582         * <strong>Usage Note</strong>:<br>
583         * The validity of this statistic depends on the assumption that the
584         * observations included in the model are drawn from a
585         * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
586         * Bivariate Normal Distribution</a>.</p>
587         * <p>
588         * If there are fewer that <strong>three</strong> observations in the
589         * model, or if there is no variation in x, this returns
590         * <code>Double.NaN</code>.</p>
591         *
592         * @return significance level for slope/correlation
593         * @throws MathException if the significance level can not be computed.
594         */
595        public double getSignificance() throws MathException {
596            return 2d * (1.0 - distribution.cumulativeProbability(
597                        FastMath.abs(getSlope()) / getSlopeStdErr()));
598        }
599    
600        // ---------------------Private methods-----------------------------------
601    
602        /**
603        * Returns the intercept of the estimated regression line, given the slope.
604        * <p>
605        * Will return <code>NaN</code> if slope is <code>NaN</code>.</p>
606        *
607        * @param slope current slope
608        * @return the intercept of the regression line
609        */
610        private double getIntercept(double slope) {
611            return (sumY - slope * sumX) / n;
612        }
613    
614        /**
615         * Computes SSR from b1.
616         *
617         * @param slope regression slope estimate
618         * @return sum of squared deviations of predicted y values
619         */
620        private double getRegressionSumSquares(double slope) {
621            return slope * slope * sumXX;
622        }
623    
624        /**
625         * Modify the distribution used to compute inference statistics.
626         * @param value the new distribution
627         * @since 1.2
628         * @deprecated in 2.2 (to be removed in 3.0).
629         */
630        @Deprecated
631        public void setDistribution(TDistribution value) {
632            distribution = value;
633    
634            // modify degrees of freedom
635            if (n > 2) {
636                distribution.setDegreesOfFreedom(n - 2);
637            }
638        }
639    }