001/* 002 * Copyright (C) 2011 The Guava Authors 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except 005 * in compliance with the License. You may obtain a copy of the License at 006 * 007 * http://www.apache.org/licenses/LICENSE-2.0 008 * 009 * Unless required by applicable law or agreed to in writing, software distributed under the License 010 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 011 * or implied. See the License for the specific language governing permissions and limitations under 012 * the License. 013 */ 014 015package com.google.common.hash; 016 017import static com.google.common.base.Preconditions.checkArgument; 018import static com.google.common.base.Preconditions.checkNotNull; 019 020import com.google.common.annotations.Beta; 021import com.google.common.annotations.VisibleForTesting; 022import com.google.common.base.Objects; 023import com.google.common.base.Predicate; 024import com.google.common.hash.BloomFilterStrategies.BitArray; 025import com.google.common.primitives.SignedBytes; 026import com.google.common.primitives.UnsignedBytes; 027 028import java.io.DataInputStream; 029import java.io.DataOutputStream; 030import java.io.IOException; 031import java.io.InputStream; 032import java.io.OutputStream; 033import java.io.Serializable; 034 035import javax.annotation.CheckReturnValue; 036import javax.annotation.Nullable; 037 038/** 039 * A Bloom filter for instances of {@code T}. A Bloom filter offers an approximate containment test 040 * with one-sided error: if it claims that an element is contained in it, this might be in error, 041 * but if it claims that an element is <i>not</i> contained in it, then this is definitely true. 042 * 043 * <p>If you are unfamiliar with Bloom filters, this nice 044 * <a href="http://llimllib.github.com/bloomfilter-tutorial/">tutorial</a> may help you understand 045 * how they work. 046 * 047 * <p>The false positive probability ({@code FPP}) of a bloom filter is defined as the probability 048 * that {@linkplain #mightContain(Object)} will erroneously return {@code true} for an object that 049 * has not actually been put in the {@code BloomFilter}. 050 * 051 * <p>Bloom filters are serializable. They also support a more compact serial representation via 052 * the {@link #writeTo} and {@link #readFrom} methods. Both serialized forms will continue to be 053 * supported by future versions of this library. However, serial forms generated by newer versions 054 * of the code may not be readable by older versions of the code (e.g., a serialized bloom filter 055 * generated today may <i>not</i> be readable by a binary that was compiled 6 months ago). 056 * 057 * @param <T> the type of instances that the {@code BloomFilter} accepts 058 * @author Dimitris Andreou 059 * @author Kevin Bourrillion 060 * @since 11.0 061 */ 062@Beta 063public final class BloomFilter<T> implements Predicate<T>, Serializable { 064 /** 065 * A strategy to translate T instances, to {@code numHashFunctions} bit indexes. 066 * 067 * <p>Implementations should be collections of pure functions (i.e. stateless). 068 */ 069 interface Strategy extends java.io.Serializable { 070 071 /** 072 * Sets {@code numHashFunctions} bits of the given bit array, by hashing a user element. 073 * 074 * <p>Returns whether any bits changed as a result of this operation. 075 */ 076 <T> boolean put(T object, Funnel<? super T> funnel, int numHashFunctions, BitArray bits); 077 078 /** 079 * Queries {@code numHashFunctions} bits of the given bit array, by hashing a user element; 080 * returns {@code true} if and only if all selected bits are set. 081 */ 082 <T> boolean mightContain( 083 T object, Funnel<? super T> funnel, int numHashFunctions, BitArray bits); 084 085 /** 086 * Identifier used to encode this strategy, when marshalled as part of a BloomFilter. 087 * Only values in the [-128, 127] range are valid for the compact serial form. 088 * Non-negative values are reserved for enums defined in BloomFilterStrategies; 089 * negative values are reserved for any custom, stateful strategy we may define 090 * (e.g. any kind of strategy that would depend on user input). 091 */ 092 int ordinal(); 093 } 094 095 /** The bit set of the BloomFilter (not necessarily power of 2!)*/ 096 private final BitArray bits; 097 098 /** Number of hashes per element */ 099 private final int numHashFunctions; 100 101 /** The funnel to translate Ts to bytes */ 102 private final Funnel<? super T> funnel; 103 104 /** 105 * The strategy we employ to map an element T to {@code numHashFunctions} bit indexes. 106 */ 107 private final Strategy strategy; 108 109 /** 110 * Creates a BloomFilter. 111 */ 112 private BloomFilter( 113 BitArray bits, int numHashFunctions, Funnel<? super T> funnel, Strategy strategy) { 114 checkArgument(numHashFunctions > 0, "numHashFunctions (%s) must be > 0", numHashFunctions); 115 checkArgument( 116 numHashFunctions <= 255, "numHashFunctions (%s) must be <= 255", numHashFunctions); 117 this.bits = checkNotNull(bits); 118 this.numHashFunctions = numHashFunctions; 119 this.funnel = checkNotNull(funnel); 120 this.strategy = checkNotNull(strategy); 121 } 122 123 /** 124 * Creates a new {@code BloomFilter} that's a copy of this instance. The new instance is equal to 125 * this instance but shares no mutable state. 126 * 127 * @since 12.0 128 */ 129 @CheckReturnValue 130 public BloomFilter<T> copy() { 131 return new BloomFilter<T>(bits.copy(), numHashFunctions, funnel, strategy); 132 } 133 134 /** 135 * Returns {@code true} if the element <i>might</i> have been put in this Bloom filter, 136 * {@code false} if this is <i>definitely</i> not the case. 137 */ 138 @CheckReturnValue 139 public boolean mightContain(T object) { 140 return strategy.mightContain(object, funnel, numHashFunctions, bits); 141 } 142 143 /** 144 * @deprecated Provided only to satisfy the {@link Predicate} interface; use {@link #mightContain} 145 * instead. 146 */ 147 @Deprecated 148 @Override 149 @CheckReturnValue 150 public boolean apply(T input) { 151 return mightContain(input); 152 } 153 154 /** 155 * Puts an element into this {@code BloomFilter}. Ensures that subsequent invocations of 156 * {@link #mightContain(Object)} with the same element will always return {@code true}. 157 * 158 * @return true if the bloom filter's bits changed as a result of this operation. If the bits 159 * changed, this is <i>definitely</i> the first time {@code object} has been added to the 160 * filter. If the bits haven't changed, this <i>might</i> be the first time {@code object} 161 * has been added to the filter. Note that {@code put(t)} always returns the 162 * <i>opposite</i> result to what {@code mightContain(t)} would have returned at the time 163 * it is called." 164 * @since 12.0 (present in 11.0 with {@code void} return type}) 165 */ 166 public boolean put(T object) { 167 return strategy.put(object, funnel, numHashFunctions, bits); 168 } 169 170 /** 171 * Returns the probability that {@linkplain #mightContain(Object)} will erroneously return 172 * {@code true} for an object that has not actually been put in the {@code BloomFilter}. 173 * 174 * <p>Ideally, this number should be close to the {@code fpp} parameter 175 * passed in {@linkplain #create(Funnel, int, double)}, or smaller. If it is 176 * significantly higher, it is usually the case that too many elements (more than 177 * expected) have been put in the {@code BloomFilter}, degenerating it. 178 * 179 * @since 14.0 (since 11.0 as expectedFalsePositiveProbability()) 180 */ 181 @CheckReturnValue 182 public double expectedFpp() { 183 // You down with FPP? (Yeah you know me!) Who's down with FPP? (Every last homie!) 184 return Math.pow((double) bits.bitCount() / bitSize(), numHashFunctions); 185 } 186 187 /** 188 * Returns the number of bits in the underlying bit array. 189 */ 190 @VisibleForTesting 191 long bitSize() { 192 return bits.bitSize(); 193 } 194 195 /** 196 * Determines whether a given bloom filter is compatible with this bloom filter. For two 197 * bloom filters to be compatible, they must: 198 * 199 * <ul> 200 * <li>not be the same instance 201 * <li>have the same number of hash functions 202 * <li>have the same bit size 203 * <li>have the same strategy 204 * <li>have equal funnels 205 * <ul> 206 * 207 * @param that The bloom filter to check for compatibility. 208 * @since 15.0 209 */ 210 @CheckReturnValue 211 public boolean isCompatible(BloomFilter<T> that) { 212 checkNotNull(that); 213 return (this != that) 214 && (this.numHashFunctions == that.numHashFunctions) 215 && (this.bitSize() == that.bitSize()) 216 && (this.strategy.equals(that.strategy)) 217 && (this.funnel.equals(that.funnel)); 218 } 219 220 /** 221 * Combines this bloom filter with another bloom filter by performing a bitwise OR of the 222 * underlying data. The mutations happen to <b>this</b> instance. Callers must ensure the 223 * bloom filters are appropriately sized to avoid saturating them. 224 * 225 * @param that The bloom filter to combine this bloom filter with. It is not mutated. 226 * @throws IllegalArgumentException if {@code isCompatible(that) == false} 227 * 228 * @since 15.0 229 */ 230 public void putAll(BloomFilter<T> that) { 231 checkNotNull(that); 232 checkArgument(this != that, "Cannot combine a BloomFilter with itself."); 233 checkArgument( 234 this.numHashFunctions == that.numHashFunctions, 235 "BloomFilters must have the same number of hash functions (%s != %s)", 236 this.numHashFunctions, 237 that.numHashFunctions); 238 checkArgument( 239 this.bitSize() == that.bitSize(), 240 "BloomFilters must have the same size underlying bit arrays (%s != %s)", 241 this.bitSize(), 242 that.bitSize()); 243 checkArgument( 244 this.strategy.equals(that.strategy), 245 "BloomFilters must have equal strategies (%s != %s)", 246 this.strategy, 247 that.strategy); 248 checkArgument( 249 this.funnel.equals(that.funnel), 250 "BloomFilters must have equal funnels (%s != %s)", 251 this.funnel, 252 that.funnel); 253 this.bits.putAll(that.bits); 254 } 255 256 @Override 257 public boolean equals(@Nullable Object object) { 258 if (object == this) { 259 return true; 260 } 261 if (object instanceof BloomFilter) { 262 BloomFilter<?> that = (BloomFilter<?>) object; 263 return this.numHashFunctions == that.numHashFunctions 264 && this.funnel.equals(that.funnel) 265 && this.bits.equals(that.bits) 266 && this.strategy.equals(that.strategy); 267 } 268 return false; 269 } 270 271 @Override 272 public int hashCode() { 273 return Objects.hashCode(numHashFunctions, funnel, strategy, bits); 274 } 275 276 /** 277 * Creates a {@link BloomFilter BloomFilter<T>} with the expected number of 278 * insertions and expected false positive probability. 279 * 280 * <p>Note that overflowing a {@code BloomFilter} with significantly more elements 281 * than specified, will result in its saturation, and a sharp deterioration of its 282 * false positive probability. 283 * 284 * <p>The constructed {@code BloomFilter<T>} will be serializable if the provided 285 * {@code Funnel<T>} is. 286 * 287 * <p>It is recommended that the funnel be implemented as a Java enum. This has the 288 * benefit of ensuring proper serialization and deserialization, which is important 289 * since {@link #equals} also relies on object identity of funnels. 290 * 291 * @param funnel the funnel of T's that the constructed {@code BloomFilter<T>} will use 292 * @param expectedInsertions the number of expected insertions to the constructed 293 * {@code BloomFilter<T>}; must be positive 294 * @param fpp the desired false positive probability (must be positive and less than 1.0) 295 * @return a {@code BloomFilter} 296 */ 297 @CheckReturnValue 298 public static <T> BloomFilter<T> create( 299 Funnel<? super T> funnel, int expectedInsertions, double fpp) { 300 return create(funnel, (long) expectedInsertions, fpp); 301 } 302 303 /** 304 * Creates a {@link BloomFilter BloomFilter<T>} with the expected number of 305 * insertions and expected false positive probability. 306 * 307 * <p>Note that overflowing a {@code BloomFilter} with significantly more elements 308 * than specified, will result in its saturation, and a sharp deterioration of its 309 * false positive probability. 310 * 311 * <p>The constructed {@code BloomFilter<T>} will be serializable if the provided 312 * {@code Funnel<T>} is. 313 * 314 * <p>It is recommended that the funnel be implemented as a Java enum. This has the 315 * benefit of ensuring proper serialization and deserialization, which is important 316 * since {@link #equals} also relies on object identity of funnels. 317 * 318 * @param funnel the funnel of T's that the constructed {@code BloomFilter<T>} will use 319 * @param expectedInsertions the number of expected insertions to the constructed 320 * {@code BloomFilter<T>}; must be positive 321 * @param fpp the desired false positive probability (must be positive and less than 1.0) 322 * @return a {@code BloomFilter} 323 * @since 19.0 324 */ 325 @CheckReturnValue 326 public static <T> BloomFilter<T> create( 327 Funnel<? super T> funnel, long expectedInsertions, double fpp) { 328 return create(funnel, expectedInsertions, fpp, BloomFilterStrategies.MURMUR128_MITZ_64); 329 } 330 331 @VisibleForTesting 332 static <T> BloomFilter<T> create( 333 Funnel<? super T> funnel, long expectedInsertions, double fpp, Strategy strategy) { 334 checkNotNull(funnel); 335 checkArgument( 336 expectedInsertions >= 0, "Expected insertions (%s) must be >= 0", expectedInsertions); 337 checkArgument(fpp > 0.0, "False positive probability (%s) must be > 0.0", fpp); 338 checkArgument(fpp < 1.0, "False positive probability (%s) must be < 1.0", fpp); 339 checkNotNull(strategy); 340 341 if (expectedInsertions == 0) { 342 expectedInsertions = 1; 343 } 344 /* 345 * TODO(user): Put a warning in the javadoc about tiny fpp values, 346 * since the resulting size is proportional to -log(p), but there is not 347 * much of a point after all, e.g. optimalM(1000, 0.0000000000000001) = 76680 348 * which is less than 10kb. Who cares! 349 */ 350 long numBits = optimalNumOfBits(expectedInsertions, fpp); 351 int numHashFunctions = optimalNumOfHashFunctions(expectedInsertions, numBits); 352 try { 353 return new BloomFilter<T>(new BitArray(numBits), numHashFunctions, funnel, strategy); 354 } catch (IllegalArgumentException e) { 355 throw new IllegalArgumentException("Could not create BloomFilter of " + numBits + " bits", e); 356 } 357 } 358 359 /** 360 * Creates a {@link BloomFilter BloomFilter<T>} with the expected number of 361 * insertions and a default expected false positive probability of 3%. 362 * 363 * <p>Note that overflowing a {@code BloomFilter} with significantly more elements 364 * than specified, will result in its saturation, and a sharp deterioration of its 365 * false positive probability. 366 * 367 * <p>The constructed {@code BloomFilter<T>} will be serializable if the provided 368 * {@code Funnel<T>} is. 369 * 370 * <p>It is recommended that the funnel be implemented as a Java enum. This has the 371 * benefit of ensuring proper serialization and deserialization, which is important 372 * since {@link #equals} also relies on object identity of funnels. 373 * 374 * @param funnel the funnel of T's that the constructed {@code BloomFilter<T>} will use 375 * @param expectedInsertions the number of expected insertions to the constructed 376 * {@code BloomFilter<T>}; must be positive 377 * @return a {@code BloomFilter} 378 */ 379 @CheckReturnValue 380 public static <T> BloomFilter<T> create(Funnel<? super T> funnel, int expectedInsertions) { 381 return create(funnel, (long) expectedInsertions); 382 } 383 384 /** 385 * Creates a {@link BloomFilter BloomFilter<T>} with the expected number of 386 * insertions and a default expected false positive probability of 3%. 387 * 388 * <p>Note that overflowing a {@code BloomFilter} with significantly more elements 389 * than specified, will result in its saturation, and a sharp deterioration of its 390 * false positive probability. 391 * 392 * <p>The constructed {@code BloomFilter<T>} will be serializable if the provided 393 * {@code Funnel<T>} is. 394 * 395 * <p>It is recommended that the funnel be implemented as a Java enum. This has the 396 * benefit of ensuring proper serialization and deserialization, which is important 397 * since {@link #equals} also relies on object identity of funnels. 398 * 399 * @param funnel the funnel of T's that the constructed {@code BloomFilter<T>} will use 400 * @param expectedInsertions the number of expected insertions to the constructed 401 * {@code BloomFilter<T>}; must be positive 402 * @return a {@code BloomFilter} 403 * @since 19.0 404 */ 405 @CheckReturnValue 406 public static <T> BloomFilter<T> create(Funnel<? super T> funnel, long expectedInsertions) { 407 return create(funnel, expectedInsertions, 0.03); // FYI, for 3%, we always get 5 hash functions 408 } 409 410 /* 411 * Cheat sheet: 412 * 413 * m: total bits 414 * n: expected insertions 415 * b: m/n, bits per insertion 416 * p: expected false positive probability 417 * 418 * 1) Optimal k = b * ln2 419 * 2) p = (1 - e ^ (-kn/m))^k 420 * 3) For optimal k: p = 2 ^ (-k) ~= 0.6185^b 421 * 4) For optimal k: m = -nlnp / ((ln2) ^ 2) 422 */ 423 424 /** 425 * Computes the optimal k (number of hashes per element inserted in Bloom filter), given the 426 * expected insertions and total number of bits in the Bloom filter. 427 * 428 * See http://en.wikipedia.org/wiki/File:Bloom_filter_fp_probability.svg for the formula. 429 * 430 * @param n expected insertions (must be positive) 431 * @param m total number of bits in Bloom filter (must be positive) 432 */ 433 @VisibleForTesting 434 static int optimalNumOfHashFunctions(long n, long m) { 435 // (m / n) * log(2), but avoid truncation due to division! 436 return Math.max(1, (int) Math.round((double) m / n * Math.log(2))); 437 } 438 439 /** 440 * Computes m (total bits of Bloom filter) which is expected to achieve, for the specified 441 * expected insertions, the required false positive probability. 442 * 443 * See http://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives for the formula. 444 * 445 * @param n expected insertions (must be positive) 446 * @param p false positive rate (must be 0 < p < 1) 447 */ 448 @VisibleForTesting 449 static long optimalNumOfBits(long n, double p) { 450 if (p == 0) { 451 p = Double.MIN_VALUE; 452 } 453 return (long) (-n * Math.log(p) / (Math.log(2) * Math.log(2))); 454 } 455 456 private Object writeReplace() { 457 return new SerialForm<T>(this); 458 } 459 460 private static class SerialForm<T> implements Serializable { 461 final long[] data; 462 final int numHashFunctions; 463 final Funnel<? super T> funnel; 464 final Strategy strategy; 465 466 SerialForm(BloomFilter<T> bf) { 467 this.data = bf.bits.data; 468 this.numHashFunctions = bf.numHashFunctions; 469 this.funnel = bf.funnel; 470 this.strategy = bf.strategy; 471 } 472 473 Object readResolve() { 474 return new BloomFilter<T>(new BitArray(data), numHashFunctions, funnel, strategy); 475 } 476 477 private static final long serialVersionUID = 1; 478 } 479 480 /** 481 * Writes this {@code BloomFilter} to an output stream, with a custom format (not Java 482 * serialization). This has been measured to save at least 400 bytes compared to regular 483 * serialization. 484 * 485 * <p>Use {@linkplain #readFrom(InputStream, Funnel)} to reconstruct the written BloomFilter. 486 */ 487 public void writeTo(OutputStream out) throws IOException { 488 /* 489 * Serial form: 490 * 1 signed byte for the strategy 491 * 1 unsigned byte for the number of hash functions 492 * 1 big endian int, the number of longs in our bitset 493 * N big endian longs of our bitset 494 */ 495 DataOutputStream dout = new DataOutputStream(out); 496 dout.writeByte(SignedBytes.checkedCast(strategy.ordinal())); 497 dout.writeByte(UnsignedBytes.checkedCast(numHashFunctions)); // note: checked at the c'tor 498 dout.writeInt(bits.data.length); 499 for (long value : bits.data) { 500 dout.writeLong(value); 501 } 502 } 503 504 /** 505 * Reads a byte stream, which was written by {@linkplain #writeTo(OutputStream)}, into 506 * a {@code BloomFilter<T>}. 507 * 508 * The {@code Funnel} to be used is not encoded in the stream, so it must be provided here. 509 * <b>Warning:</b> the funnel provided <b>must</b> behave identically to the one used to 510 * populate the original Bloom filter! 511 * 512 * @throws IOException if the InputStream throws an {@code IOException}, or if its data does 513 * not appear to be a BloomFilter serialized using the 514 * {@linkplain #writeTo(OutputStream)} method. 515 */ 516 @CheckReturnValue 517 public static <T> BloomFilter<T> readFrom(InputStream in, Funnel<T> funnel) throws IOException { 518 checkNotNull(in, "InputStream"); 519 checkNotNull(funnel, "Funnel"); 520 int strategyOrdinal = -1; 521 int numHashFunctions = -1; 522 int dataLength = -1; 523 try { 524 DataInputStream din = new DataInputStream(in); 525 // currently this assumes there is no negative ordinal; will have to be updated if we 526 // add non-stateless strategies (for which we've reserved negative ordinals; see 527 // Strategy.ordinal()). 528 strategyOrdinal = din.readByte(); 529 numHashFunctions = UnsignedBytes.toInt(din.readByte()); 530 dataLength = din.readInt(); 531 532 Strategy strategy = BloomFilterStrategies.values()[strategyOrdinal]; 533 long[] data = new long[dataLength]; 534 for (int i = 0; i < data.length; i++) { 535 data[i] = din.readLong(); 536 } 537 return new BloomFilter<T>(new BitArray(data), numHashFunctions, funnel, strategy); 538 } catch (RuntimeException e) { 539 IOException ioException = new IOException( 540 "Unable to deserialize BloomFilter from InputStream." 541 + " strategyOrdinal: " + strategyOrdinal 542 + " numHashFunctions: " + numHashFunctions 543 + " dataLength: " + dataLength); 544 ioException.initCause(e); 545 throw ioException; 546 } 547 } 548}