001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one or more
003     * contributor license agreements.  See the NOTICE file distributed with
004     * this work for additional information regarding copyright ownership.
005     * The ASF licenses this file to You under the Apache License, Version 2.0
006     * (the "License"); you may not use this file except in compliance with
007     * the License.  You may obtain a copy of the License at
008     *
009     *      http://www.apache.org/licenses/LICENSE-2.0
010     *
011     * Unless required by applicable law or agreed to in writing, software
012     * distributed under the License is distributed on an "AS IS" BASIS,
013     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     * See the License for the specific language governing permissions and
015     * limitations under the License.
016     */
017    package org.apache.commons.math3.stat.clustering;
018    
019    import java.util.ArrayList;
020    import java.util.Collection;
021    import java.util.HashMap;
022    import java.util.HashSet;
023    import java.util.List;
024    import java.util.Map;
025    import java.util.Set;
026    
027    import org.apache.commons.math3.exception.NotPositiveException;
028    import org.apache.commons.math3.exception.NullArgumentException;
029    import org.apache.commons.math3.util.MathUtils;
030    
031    /**
032     * DBSCAN (density-based spatial clustering of applications with noise) algorithm.
033     * <p>
034     * The DBSCAN algorithm forms clusters based on the idea of density connectivity, i.e.
035     * a point p is density connected to another point q, if there exists a chain of
036     * points p<sub>i</sub>, with i = 1 .. n and p<sub>1</sub> = p and p<sub>n</sub> = q,
037     * such that each pair &lt;p<sub>i</sub>, p<sub>i+1</sub>&gt; is directly density-reachable.
038     * A point q is directly density-reachable from point p if it is in the &epsilon;-neighborhood
039     * of this point.
040     * <p>
041     * Any point that is not density-reachable from a formed cluster is treated as noise, and
042     * will thus not be present in the result.
043     * <p>
044     * The algorithm requires two parameters:
045     * <ul>
046     *   <li>eps: the distance that defines the &epsilon;-neighborhood of a point
047     *   <li>minPoints: the minimum number of density-connected points required to form a cluster
048     * </ul>
049     * <p>
050     * <b>Note:</b> as DBSCAN is not a centroid-based clustering algorithm, the resulting
051     * {@link Cluster} objects will have no defined center, i.e. {@link Cluster#getCenter()} will
052     * return {@code null}.
053     *
054     * @param <T> type of the points to cluster
055     * @see <a href="http://en.wikipedia.org/wiki/DBSCAN">DBSCAN (wikipedia)</a>
056     * @see <a href="http://www.dbs.ifi.lmu.de/Publikationen/Papers/KDD-96.final.frame.pdf">
057     * A Density-Based Algorithm for Discovering Clusters in Large Spatial Databases with Noise</a>
058     * @version $Id: DBSCANClusterer.java 1410882 2012-11-18 12:49:49Z tn $
059     * @since 3.1
060     */
061    public class DBSCANClusterer<T extends Clusterable<T>> {
062    
063        /** Maximum radius of the neighborhood to be considered. */
064        private final double              eps;
065    
066        /** Minimum number of points needed for a cluster. */
067        private final int                 minPts;
068    
069        /** Status of a point during the clustering process. */
070        private enum PointStatus {
071            /** The point has is considered to be noise. */
072            NOISE,
073            /** The point is already part of a cluster. */
074            PART_OF_CLUSTER
075        }
076    
077        /**
078         * Creates a new instance of a DBSCANClusterer.
079         *
080         * @param eps maximum radius of the neighborhood to be considered
081         * @param minPts minimum number of points needed for a cluster
082         * @throws NotPositiveException if {@code eps < 0.0} or {@code minPts < 0}
083         */
084        public DBSCANClusterer(final double eps, final int minPts)
085            throws NotPositiveException {
086            if (eps < 0.0d) {
087                throw new NotPositiveException(eps);
088            }
089            if (minPts < 0) {
090                throw new NotPositiveException(minPts);
091            }
092            this.eps = eps;
093            this.minPts = minPts;
094        }
095    
096        /**
097         * Returns the maximum radius of the neighborhood to be considered.
098         *
099         * @return maximum radius of the neighborhood
100         */
101        public double getEps() {
102            return eps;
103        }
104    
105        /**
106         * Returns the minimum number of points needed for a cluster.
107         *
108         * @return minimum number of points needed for a cluster
109         */
110        public int getMinPts() {
111            return minPts;
112        }
113    
114        /**
115         * Performs DBSCAN cluster analysis.
116         * <p>
117         * <b>Note:</b> as DBSCAN is not a centroid-based clustering algorithm, the resulting
118         * {@link Cluster} objects will have no defined center, i.e. {@link Cluster#getCenter()} will
119         * return {@code null}.
120         *
121         * @param points the points to cluster
122         * @return the list of clusters
123         * @throws NullArgumentException if the data points are null
124         */
125        public List<Cluster<T>> cluster(final Collection<T> points) throws NullArgumentException {
126    
127            // sanity checks
128            MathUtils.checkNotNull(points);
129    
130            final List<Cluster<T>> clusters = new ArrayList<Cluster<T>>();
131            final Map<Clusterable<T>, PointStatus> visited = new HashMap<Clusterable<T>, PointStatus>();
132    
133            for (final T point : points) {
134                if (visited.get(point) != null) {
135                    continue;
136                }
137                final List<T> neighbors = getNeighbors(point, points);
138                if (neighbors.size() >= minPts) {
139                    // DBSCAN does not care about center points
140                    final Cluster<T> cluster = new Cluster<T>(null);
141                    clusters.add(expandCluster(cluster, point, neighbors, points, visited));
142                } else {
143                    visited.put(point, PointStatus.NOISE);
144                }
145            }
146    
147            return clusters;
148        }
149    
150        /**
151         * Expands the cluster to include density-reachable items.
152         *
153         * @param cluster Cluster to expand
154         * @param point Point to add to cluster
155         * @param neighbors List of neighbors
156         * @param points the data set
157         * @param visited the set of already visited points
158         * @return the expanded cluster
159         */
160        private Cluster<T> expandCluster(final Cluster<T> cluster,
161                                         final T point,
162                                         final List<T> neighbors,
163                                         final Collection<T> points,
164                                         final Map<Clusterable<T>, PointStatus> visited) {
165            cluster.addPoint(point);
166            visited.put(point, PointStatus.PART_OF_CLUSTER);
167    
168            List<T> seeds = new ArrayList<T>(neighbors);
169            int index = 0;
170            while (index < seeds.size()) {
171                final T current = seeds.get(index);
172                PointStatus pStatus = visited.get(current);
173                // only check non-visited points
174                if (pStatus == null) {
175                    final List<T> currentNeighbors = getNeighbors(current, points);
176                    if (currentNeighbors.size() >= minPts) {
177                        seeds = merge(seeds, currentNeighbors);
178                    }
179                }
180    
181                if (pStatus != PointStatus.PART_OF_CLUSTER) {
182                    visited.put(current, PointStatus.PART_OF_CLUSTER);
183                    cluster.addPoint(current);
184                }
185    
186                index++;
187            }
188            return cluster;
189        }
190    
191        /**
192         * Returns a list of density-reachable neighbors of a {@code point}.
193         *
194         * @param point the point to look for
195         * @param points possible neighbors
196         * @return the List of neighbors
197         */
198        private List<T> getNeighbors(final T point, final Collection<T> points) {
199            final List<T> neighbors = new ArrayList<T>();
200            for (final T neighbor : points) {
201                if (point != neighbor && neighbor.distanceFrom(point) <= eps) {
202                    neighbors.add(neighbor);
203                }
204            }
205            return neighbors;
206        }
207    
208        /**
209         * Merges two lists together.
210         *
211         * @param one first list
212         * @param two second list
213         * @return merged lists
214         */
215        private List<T> merge(final List<T> one, final List<T> two) {
216            final Set<T> oneSet = new HashSet<T>(one);
217            for (T item : two) {
218                if (!oneSet.contains(item)) {
219                    one.add(item);
220                }
221            }
222            return one;
223        }
224    }