001 /* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 package org.apache.commons.math3.stat.clustering; 018 019 import java.util.ArrayList; 020 import java.util.Collection; 021 import java.util.HashMap; 022 import java.util.HashSet; 023 import java.util.List; 024 import java.util.Map; 025 import java.util.Set; 026 027 import org.apache.commons.math3.exception.NotPositiveException; 028 import org.apache.commons.math3.exception.NullArgumentException; 029 import org.apache.commons.math3.util.MathUtils; 030 031 /** 032 * DBSCAN (density-based spatial clustering of applications with noise) algorithm. 033 * <p> 034 * The DBSCAN algorithm forms clusters based on the idea of density connectivity, i.e. 035 * a point p is density connected to another point q, if there exists a chain of 036 * points p<sub>i</sub>, with i = 1 .. n and p<sub>1</sub> = p and p<sub>n</sub> = q, 037 * such that each pair <p<sub>i</sub>, p<sub>i+1</sub>> is directly density-reachable. 038 * A point q is directly density-reachable from point p if it is in the ε-neighborhood 039 * of this point. 040 * <p> 041 * Any point that is not density-reachable from a formed cluster is treated as noise, and 042 * will thus not be present in the result. 043 * <p> 044 * The algorithm requires two parameters: 045 * <ul> 046 * <li>eps: the distance that defines the ε-neighborhood of a point 047 * <li>minPoints: the minimum number of density-connected points required to form a cluster 048 * </ul> 049 * <p> 050 * <b>Note:</b> as DBSCAN is not a centroid-based clustering algorithm, the resulting 051 * {@link Cluster} objects will have no defined center, i.e. {@link Cluster#getCenter()} will 052 * return {@code null}. 053 * 054 * @param <T> type of the points to cluster 055 * @see <a href="http://en.wikipedia.org/wiki/DBSCAN">DBSCAN (wikipedia)</a> 056 * @see <a href="http://www.dbs.ifi.lmu.de/Publikationen/Papers/KDD-96.final.frame.pdf"> 057 * A Density-Based Algorithm for Discovering Clusters in Large Spatial Databases with Noise</a> 058 * @version $Id: DBSCANClusterer.java 1410882 2012-11-18 12:49:49Z tn $ 059 * @since 3.1 060 */ 061 public class DBSCANClusterer<T extends Clusterable<T>> { 062 063 /** Maximum radius of the neighborhood to be considered. */ 064 private final double eps; 065 066 /** Minimum number of points needed for a cluster. */ 067 private final int minPts; 068 069 /** Status of a point during the clustering process. */ 070 private enum PointStatus { 071 /** The point has is considered to be noise. */ 072 NOISE, 073 /** The point is already part of a cluster. */ 074 PART_OF_CLUSTER 075 } 076 077 /** 078 * Creates a new instance of a DBSCANClusterer. 079 * 080 * @param eps maximum radius of the neighborhood to be considered 081 * @param minPts minimum number of points needed for a cluster 082 * @throws NotPositiveException if {@code eps < 0.0} or {@code minPts < 0} 083 */ 084 public DBSCANClusterer(final double eps, final int minPts) 085 throws NotPositiveException { 086 if (eps < 0.0d) { 087 throw new NotPositiveException(eps); 088 } 089 if (minPts < 0) { 090 throw new NotPositiveException(minPts); 091 } 092 this.eps = eps; 093 this.minPts = minPts; 094 } 095 096 /** 097 * Returns the maximum radius of the neighborhood to be considered. 098 * 099 * @return maximum radius of the neighborhood 100 */ 101 public double getEps() { 102 return eps; 103 } 104 105 /** 106 * Returns the minimum number of points needed for a cluster. 107 * 108 * @return minimum number of points needed for a cluster 109 */ 110 public int getMinPts() { 111 return minPts; 112 } 113 114 /** 115 * Performs DBSCAN cluster analysis. 116 * <p> 117 * <b>Note:</b> as DBSCAN is not a centroid-based clustering algorithm, the resulting 118 * {@link Cluster} objects will have no defined center, i.e. {@link Cluster#getCenter()} will 119 * return {@code null}. 120 * 121 * @param points the points to cluster 122 * @return the list of clusters 123 * @throws NullArgumentException if the data points are null 124 */ 125 public List<Cluster<T>> cluster(final Collection<T> points) throws NullArgumentException { 126 127 // sanity checks 128 MathUtils.checkNotNull(points); 129 130 final List<Cluster<T>> clusters = new ArrayList<Cluster<T>>(); 131 final Map<Clusterable<T>, PointStatus> visited = new HashMap<Clusterable<T>, PointStatus>(); 132 133 for (final T point : points) { 134 if (visited.get(point) != null) { 135 continue; 136 } 137 final List<T> neighbors = getNeighbors(point, points); 138 if (neighbors.size() >= minPts) { 139 // DBSCAN does not care about center points 140 final Cluster<T> cluster = new Cluster<T>(null); 141 clusters.add(expandCluster(cluster, point, neighbors, points, visited)); 142 } else { 143 visited.put(point, PointStatus.NOISE); 144 } 145 } 146 147 return clusters; 148 } 149 150 /** 151 * Expands the cluster to include density-reachable items. 152 * 153 * @param cluster Cluster to expand 154 * @param point Point to add to cluster 155 * @param neighbors List of neighbors 156 * @param points the data set 157 * @param visited the set of already visited points 158 * @return the expanded cluster 159 */ 160 private Cluster<T> expandCluster(final Cluster<T> cluster, 161 final T point, 162 final List<T> neighbors, 163 final Collection<T> points, 164 final Map<Clusterable<T>, PointStatus> visited) { 165 cluster.addPoint(point); 166 visited.put(point, PointStatus.PART_OF_CLUSTER); 167 168 List<T> seeds = new ArrayList<T>(neighbors); 169 int index = 0; 170 while (index < seeds.size()) { 171 final T current = seeds.get(index); 172 PointStatus pStatus = visited.get(current); 173 // only check non-visited points 174 if (pStatus == null) { 175 final List<T> currentNeighbors = getNeighbors(current, points); 176 if (currentNeighbors.size() >= minPts) { 177 seeds = merge(seeds, currentNeighbors); 178 } 179 } 180 181 if (pStatus != PointStatus.PART_OF_CLUSTER) { 182 visited.put(current, PointStatus.PART_OF_CLUSTER); 183 cluster.addPoint(current); 184 } 185 186 index++; 187 } 188 return cluster; 189 } 190 191 /** 192 * Returns a list of density-reachable neighbors of a {@code point}. 193 * 194 * @param point the point to look for 195 * @param points possible neighbors 196 * @return the List of neighbors 197 */ 198 private List<T> getNeighbors(final T point, final Collection<T> points) { 199 final List<T> neighbors = new ArrayList<T>(); 200 for (final T neighbor : points) { 201 if (point != neighbor && neighbor.distanceFrom(point) <= eps) { 202 neighbors.add(neighbor); 203 } 204 } 205 return neighbors; 206 } 207 208 /** 209 * Merges two lists together. 210 * 211 * @param one first list 212 * @param two second list 213 * @return merged lists 214 */ 215 private List<T> merge(final List<T> one, final List<T> two) { 216 final Set<T> oneSet = new HashSet<T>(one); 217 for (T item : two) { 218 if (!oneSet.contains(item)) { 219 one.add(item); 220 } 221 } 222 return one; 223 } 224 }