001 /* 002 * Copyright (C) 2009 The Guava Authors 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016 017 package com.google.common.net; 018 019 import static com.google.common.base.Preconditions.checkArgument; 020 import static com.google.common.base.Preconditions.checkNotNull; 021 import static com.google.common.base.Preconditions.checkState; 022 023 import com.google.common.annotations.Beta; 024 import com.google.common.annotations.GwtCompatible; 025 import com.google.common.base.Ascii; 026 import com.google.common.base.CharMatcher; 027 import com.google.common.base.Joiner; 028 import com.google.common.base.Objects; 029 import com.google.common.base.Splitter; 030 import com.google.common.collect.ImmutableList; 031 032 import java.util.List; 033 034 import javax.annotation.Nullable; 035 036 /** 037 * An immutable well-formed internet domain name, such as {@code com} or {@code 038 * foo.co.uk}. Only syntactic analysis is performed; no DNS lookups or other 039 * network interactions take place. Thus there is no guarantee that the domain 040 * actually exists on the internet. 041 * 042 * <p>One common use of this class is to determine whether a given string is 043 * likely to represent an addressable domain on the web -- that is, for a 044 * candidate string {@code "xxx"}, might browsing to {@code "http://xxx/"} 045 * result in a webpage being displayed? In the past, this test was frequently 046 * done by determining whether the domain ended with a {@linkplain 047 * #isPublicSuffix() public suffix} but was not itself a public suffix. However, 048 * this test is no longer accurate. There are many domains which are both public 049 * suffixes and addressable as hosts; {@code "uk.com"} is one example. As a 050 * result, the only useful test to determine if a domain is a plausible web host 051 * is {@link #hasPublicSuffix()}. This will return {@code true} for many domains 052 * which (currently) are not hosts, such as {@code "com"}), but given that any 053 * public suffix may become a host without warning, it is better to err on the 054 * side of permissiveness and thus avoid spurious rejection of valid sites. 055 * 056 * <p>During construction, names are normalized in two ways: 057 * <ol> 058 * <li>ASCII uppercase characters are converted to lowercase. 059 * <li>Unicode dot separators other than the ASCII period ({@code '.'}) are 060 * converted to the ASCII period. 061 * </ol> 062 * The normalized values will be returned from {@link #name()} and 063 * {@link #parts()}, and will be reflected in the result of 064 * {@link #equals(Object)}. 065 * 066 * <p><a href="http://en.wikipedia.org/wiki/Internationalized_domain_name"> 067 * internationalized domain names</a> such as {@code 网络.cn} are supported, as 068 * are the equivalent <a 069 * href="http://en.wikipedia.org/wiki/Internationalized_domain_name">IDNA 070 * Punycode-encoded</a> versions. 071 * 072 * @author Craig Berry 073 * @since 5.0 074 */ 075 @Beta 076 @GwtCompatible(emulated = true) 077 public final class InternetDomainName { 078 079 private static final CharMatcher DOTS_MATCHER = 080 CharMatcher.anyOf(".\u3002\uFF0E\uFF61"); 081 private static final Splitter DOT_SPLITTER = Splitter.on('.'); 082 private static final Joiner DOT_JOINER = Joiner.on('.'); 083 084 /** 085 * Value of {@link #publicSuffixIndex} which indicates that no public suffix 086 * was found. 087 */ 088 private static final int NO_PUBLIC_SUFFIX_FOUND = -1; 089 090 private static final String DOT_REGEX = "\\."; 091 092 /** 093 * Maximum parts (labels) in a domain name. This value arises from 094 * the 255-octet limit described in 095 * <a href="http://www.ietf.org/rfc/rfc2181.txt">RFC 2181</a> part 11 with 096 * the fact that the encoding of each part occupies at least two bytes 097 * (dot plus label externally, length byte plus label internally). Thus, if 098 * all labels have the minimum size of one byte, 127 of them will fit. 099 */ 100 private static final int MAX_PARTS = 127; 101 102 /** 103 * Maximum length of a full domain name, including separators, and 104 * leaving room for the root label. See 105 * <a href="http://www.ietf.org/rfc/rfc2181.txt">RFC 2181</a> part 11. 106 */ 107 private static final int MAX_LENGTH = 253; 108 109 /** 110 * Maximum size of a single part of a domain name. See 111 * <a href="http://www.ietf.org/rfc/rfc2181.txt">RFC 2181</a> part 11. 112 */ 113 private static final int MAX_DOMAIN_PART_LENGTH = 63; 114 115 /** 116 * The full domain name, converted to lower case. 117 */ 118 private final String name; 119 120 /** 121 * The parts of the domain name, converted to lower case. 122 */ 123 private final ImmutableList<String> parts; 124 125 /** 126 * The index in the {@link #parts()} list at which the public suffix begins. 127 * For example, for the domain name {@code www.google.co.uk}, the value would 128 * be 2 (the index of the {@code co} part). The value is negative 129 * (specifically, {@link #NO_PUBLIC_SUFFIX_FOUND}) if no public suffix was 130 * found. 131 */ 132 private final int publicSuffixIndex; 133 134 /** 135 * Constructor used to implement {@link #from(String)}, and from subclasses. 136 */ 137 InternetDomainName(String name) { 138 // Normalize: 139 // * ASCII characters to lowercase 140 // * All dot-like characters to '.' 141 // * Strip trailing '.' 142 143 name = Ascii.toLowerCase(DOTS_MATCHER.replaceFrom(name, '.')); 144 145 if (name.endsWith(".")) { 146 name = name.substring(0, name.length() - 1); 147 } 148 149 checkArgument(name.length() <= MAX_LENGTH, "Domain name too long: '%s':", name); 150 this.name = name; 151 152 this.parts = ImmutableList.copyOf(DOT_SPLITTER.split(name)); 153 checkArgument(parts.size() <= MAX_PARTS, "Domain has too many parts: '%s'", name); 154 checkArgument(validateSyntax(parts), "Not a valid domain name: '%s'", name); 155 156 this.publicSuffixIndex = findPublicSuffix(); 157 } 158 159 /** 160 * Returns the index of the leftmost part of the public suffix, or -1 if not 161 * found. Note that the value defined as the "public suffix" may not be a 162 * public suffix according to {@link #isPublicSuffix()} if the domain ends 163 * with an excluded domain pattern such as {@code "nhs.uk"}. 164 */ 165 private int findPublicSuffix() { 166 final int partsSize = parts.size(); 167 168 for (int i = 0; i < partsSize; i++) { 169 String ancestorName = DOT_JOINER.join(parts.subList(i, partsSize)); 170 171 if (TldPatterns.EXACT.contains(ancestorName)) { 172 return i; 173 } 174 175 // Excluded domains (e.g. !nhs.uk) use the next highest 176 // domain as the effective public suffix (e.g. uk). 177 178 if (TldPatterns.EXCLUDED.contains(ancestorName)) { 179 return i + 1; 180 } 181 182 if (matchesWildcardPublicSuffix(ancestorName)) { 183 return i; 184 } 185 } 186 187 return NO_PUBLIC_SUFFIX_FOUND; 188 } 189 190 /** 191 * A deprecated synonym for {@link #from(String)}. 192 * 193 * @param domain A domain name (not IP address) 194 * @throws IllegalArgumentException if {@code name} is not syntactically valid 195 * according to {@link #isValidLenient} 196 * @since 8.0 (previously named {@code from}) 197 * @deprecated Use {@link #from(String)} 198 */ 199 @Deprecated 200 public static InternetDomainName fromLenient(String domain) { 201 return from(domain); 202 } 203 204 /** 205 * Returns an instance of {@link InternetDomainName} after lenient 206 * validation. Specifically, validation against <a 207 * href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a> 208 * ("Internationalizing Domain Names in Applications") is skipped, while 209 * validation against <a 210 * href="http://www.ietf.org/rfc/rfc1035.txt">RFC 1035</a> is relaxed in 211 * the following ways: 212 * <ul> 213 * <li>Any part containing non-ASCII characters is considered valid. 214 * <li>Underscores ('_') are permitted wherever dashes ('-') are permitted. 215 * <li>Parts other than the final part may start with a digit. 216 * </ul> 217 * 218 * @param domain A domain name (not IP address) 219 * @throws IllegalArgumentException if {@code name} is not syntactically valid 220 * according to {@link #isValidLenient} 221 * @since 10.0 (previously named {@code fromLenient}) 222 */ 223 public static InternetDomainName from(String domain) { 224 return new InternetDomainName(checkNotNull(domain)); 225 } 226 227 /** 228 * Validation method used by {@from} to ensure that the domain name is 229 * syntactically valid according to RFC 1035. 230 * 231 * @return Is the domain name syntactically valid? 232 */ 233 private static boolean validateSyntax(List<String> parts) { 234 final int lastIndex = parts.size() - 1; 235 236 // Validate the last part specially, as it has different syntax rules. 237 238 if (!validatePart(parts.get(lastIndex), true)) { 239 return false; 240 } 241 242 for (int i = 0; i < lastIndex; i++) { 243 String part = parts.get(i); 244 if (!validatePart(part, false)) { 245 return false; 246 } 247 } 248 249 return true; 250 } 251 252 private static final CharMatcher DASH_MATCHER = CharMatcher.anyOf("-_"); 253 254 private static final CharMatcher PART_CHAR_MATCHER = 255 CharMatcher.JAVA_LETTER_OR_DIGIT.or(DASH_MATCHER); 256 257 /** 258 * Helper method for {@link #validateSyntax(List)}. Validates that one part of 259 * a domain name is valid. 260 * 261 * @param part The domain name part to be validated 262 * @param isFinalPart Is this the final (rightmost) domain part? 263 * @return Whether the part is valid 264 */ 265 private static boolean validatePart(String part, boolean isFinalPart) { 266 267 // These tests could be collapsed into one big boolean expression, but 268 // they have been left as independent tests for clarity. 269 270 if (part.length() < 1 || part.length() > MAX_DOMAIN_PART_LENGTH) { 271 return false; 272 } 273 274 /* 275 * GWT claims to support java.lang.Character's char-classification methods, 276 * but it actually only works for ASCII. So for now, assume any non-ASCII 277 * characters are valid. The only place this seems to be documented is here: 278 * http://osdir.com/ml/GoogleWebToolkitContributors/2010-03/msg00178.html 279 * 280 * <p>ASCII characters in the part are expected to be valid per RFC 1035, 281 * with underscore also being allowed due to widespread practice. 282 */ 283 284 String asciiChars = CharMatcher.ASCII.retainFrom(part); 285 286 if (!PART_CHAR_MATCHER.matchesAllOf(asciiChars)) { 287 return false; 288 } 289 290 // No initial or final dashes or underscores. 291 292 if (DASH_MATCHER.matches(part.charAt(0)) 293 || DASH_MATCHER.matches(part.charAt(part.length() - 1))) { 294 return false; 295 } 296 297 /* 298 * Note that we allow (in contravention of a strict interpretation of the 299 * relevant RFCs) domain parts other than the last may begin with a digit 300 * (for example, "3com.com"). It's important to disallow an initial digit in 301 * the last part; it's the only thing that stops an IPv4 numeric address 302 * like 127.0.0.1 from looking like a valid domain name. 303 */ 304 305 if (isFinalPart && CharMatcher.DIGIT.matches(part.charAt(0))) { 306 return false; 307 } 308 309 return true; 310 } 311 312 /** 313 * Returns the domain name, normalized to all lower case. 314 */ 315 public String name() { 316 return name; 317 } 318 319 /** 320 * Returns the individual components of this domain name, normalized to all 321 * lower case. For example, for the domain name {@code mail.google.com}, this 322 * method returns the list {@code ["mail", "google", "com"]}. 323 */ 324 public ImmutableList<String> parts() { 325 return parts; 326 } 327 328 /** 329 * Indicates whether this domain name represents a <i>public suffix</i>, as 330 * defined by the Mozilla Foundation's 331 * <a href="http://publicsuffix.org/">Public Suffix List</a> (PSL). A public 332 * suffix is one under which Internet users can directly register names, such 333 * as {@code com}, {@code co.uk} or {@code pvt.k12.wy.us}. Examples of domain 334 * names that are <i>not</i> public suffixes include {@code google}, {@code 335 * google.com} and {@code foo.co.uk}. 336 * 337 * @return {@code true} if this domain name appears exactly on the public 338 * suffix list 339 * @since 6.0 340 */ 341 public boolean isPublicSuffix() { 342 return publicSuffixIndex == 0; 343 } 344 345 /** 346 * Indicates whether this domain name ends in a {@linkplain #isPublicSuffix() 347 * public suffix}, including if it is a public suffix itself. For example, 348 * returns {@code true} for {@code www.google.com}, {@code foo.co.uk} and 349 * {@code com}, but not for {@code google} or {@code google.foo}. This is 350 * the recommended method for determining whether a domain is potentially an 351 * addressable host. 352 * 353 * @since 6.0 354 */ 355 public boolean hasPublicSuffix() { 356 return publicSuffixIndex != NO_PUBLIC_SUFFIX_FOUND; 357 } 358 359 /** 360 * Returns the {@linkplain #isPublicSuffix() public suffix} portion of the 361 * domain name, or {@code null} if no public suffix is present. 362 * 363 * @since 6.0 364 */ 365 public InternetDomainName publicSuffix() { 366 return hasPublicSuffix() ? ancestor(publicSuffixIndex) : null; 367 } 368 369 /** 370 * Indicates whether this domain name ends in a {@linkplain #isPublicSuffix() 371 * public suffix}, while not being a public suffix itself. For example, 372 * returns {@code true} for {@code www.google.com}, {@code foo.co.uk} and 373 * {@code bar.ca.us}, but not for {@code google}, {@code com}, or {@code 374 * google.foo}. 375 * 376 * <p><b>Warning:</b> a {@code false} result from this method does not imply 377 * that the domain does not represent an addressable host, as many public 378 * suffixes are also addressable hosts. Use {@link #hasPublicSuffix()} for 379 * that test. 380 * 381 * <p>This method can be used to determine whether it will probably be 382 * possible to set cookies on the domain, though even that depends on 383 * individual browsers' implementations of cookie controls. See 384 * <a href="http://www.ietf.org/rfc/rfc2109.txt">RFC 2109</a> for details. 385 * 386 * @since 6.0 387 */ 388 public boolean isUnderPublicSuffix() { 389 return publicSuffixIndex > 0; 390 } 391 392 /** 393 * Indicates whether this domain name is composed of exactly one subdomain 394 * component followed by a {@linkplain #isPublicSuffix() public suffix}. For 395 * example, returns {@code true} for {@code google.com} and {@code foo.co.uk}, 396 * but not for {@code www.google.com} or {@code co.uk}. 397 * 398 * <p><b>Warning:</b> A {@code true} result from this method does not imply 399 * that the domain is at the highest level which is addressable as a host, as 400 * many public suffixes are also addressable hosts. For example, the domain 401 * {@code bar.uk.com} has a public suffix of {@code uk.com}, so it would 402 * return {@code true} from this method. But {@code uk.com} is itself an 403 * addressable host. 404 * 405 * <p>This method can be used to determine whether a domain is probably the 406 * highest level for which cookies may be set, though even that depends on 407 * individual browsers' implementations of cookie controls. See 408 * <a href="http://www.ietf.org/rfc/rfc2109.txt">RFC 2109</a> for details. 409 * 410 * @since 6.0 411 */ 412 public boolean isTopPrivateDomain() { 413 return publicSuffixIndex == 1; 414 } 415 416 /** 417 * Returns the portion of this domain name that is one level beneath the 418 * public suffix. For example, for {@code x.adwords.google.co.uk} it returns 419 * {@code google.co.uk}, since {@code co.uk} is a public suffix. 420 * 421 * <p>If {@link #isTopPrivateDomain()} is true, the current domain name 422 * instance is returned. 423 * 424 * <p>This method should not be used to determine the topmost parent domain 425 * which is addressable as a host, as many public suffixes are also 426 * addressable hosts. For example, the domain {@code foo.bar.uk.com} has 427 * a public suffix of {@code uk.com}, so it would return {@code bar.uk.com} 428 * from this method. But {@code uk.com} is itself an addressable host. 429 * 430 * <p>This method can be used to determine the probable highest level parent 431 * domain for which cookies may be set, though even that depends on individual 432 * browsers' implementations of cookie controls. 433 * 434 * @throws IllegalStateException if this domain does not end with a 435 * public suffix 436 * @since 6.0 437 */ 438 public InternetDomainName topPrivateDomain() { 439 if (isTopPrivateDomain()) { 440 return this; 441 } 442 checkState(isUnderPublicSuffix(), "Not under a public suffix: %s", name); 443 return ancestor(publicSuffixIndex - 1); 444 } 445 446 /** 447 * Indicates whether this domain is composed of two or more parts. 448 */ 449 public boolean hasParent() { 450 return parts.size() > 1; 451 } 452 453 /** 454 * Returns an {@code InternetDomainName} that is the immediate ancestor of 455 * this one; that is, the current domain with the leftmost part removed. For 456 * example, the parent of {@code www.google.com} is {@code google.com}. 457 * 458 * @throws IllegalStateException if the domain has no parent, as determined 459 * by {@link #hasParent} 460 */ 461 public InternetDomainName parent() { 462 checkState(hasParent(), "Domain '%s' has no parent", name); 463 return ancestor(1); 464 } 465 466 /** 467 * Returns the ancestor of the current domain at the given number of levels 468 * "higher" (rightward) in the subdomain list. The number of levels must be 469 * non-negative, and less than {@code N-1}, where {@code N} is the number of 470 * parts in the domain. 471 * 472 * <p>TODO: Reasonable candidate for addition to public API. 473 */ 474 private InternetDomainName ancestor(int levels) { 475 return from(DOT_JOINER.join(parts.subList(levels, parts.size()))); 476 } 477 478 /** 479 * Creates and returns a new {@code InternetDomainName} by prepending the 480 * argument and a dot to the current name. For example, {@code 481 * InternetDomainName.fromLenient("foo.com").child("www.bar")} returns a new 482 * {@code InternetDomainName} with the value {@code www.bar.foo.com}. Only 483 * lenient validation is performed, as described {@link #from(String) here}. 484 * 485 * @throws NullPointerException if leftParts is null 486 * @throws IllegalArgumentException if the resulting name is not valid 487 */ 488 public InternetDomainName child(String leftParts) { 489 return from(checkNotNull(leftParts) + "." + name); 490 } 491 492 /** 493 * A deprecated synonym for {@link #isValid(String)}. 494 * 495 * @since 8.0 (previously named {@code isValid}) 496 * @deprecated Use {@link #isValid(String)} instead 497 */ 498 @Deprecated 499 public static boolean isValidLenient(String name) { 500 return isValid(name); 501 } 502 503 /** 504 * Indicates whether the argument is a syntactically valid domain name using 505 * lenient validation. Specifically, validation against <a 506 * href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a> 507 * ("Internationalizing Domain Names in Applications") is skipped. 508 * 509 * <p>The following two code snippets are equivalent: 510 * 511 * <pre> {@code 512 * 513 * domainName = InternetDomainName.isValid(name) 514 * ? InternetDomainName.from(name) 515 * : DEFAULT_DOMAIN; 516 * }</pre> 517 * 518 * <pre> {@code 519 * 520 * try { 521 * domainName = InternetDomainName.from(name); 522 * } catch (IllegalArgumentException e) { 523 * domainName = DEFAULT_DOMAIN; 524 * }}</pre> 525 * 526 * @since 8.0 (previously named {@code isValidLenient}) 527 */ 528 public static boolean isValid(String name) { 529 try { 530 from(name); 531 return true; 532 } catch (IllegalArgumentException e) { 533 return false; 534 } 535 } 536 537 /** 538 * Does the domain name match one of the "wildcard" patterns (e.g. 539 * {@code "*.ar"})? 540 */ 541 private static boolean matchesWildcardPublicSuffix(String domain) { 542 final String[] pieces = domain.split(DOT_REGEX, 2); 543 return pieces.length == 2 && TldPatterns.UNDER.contains(pieces[1]); 544 } 545 546 // TODO: specify this to return the same as name(); remove name() 547 @Override 548 public String toString() { 549 return Objects.toStringHelper(this).add("name", name).toString(); 550 } 551 552 @Override 553 public boolean equals(@Nullable Object object) { 554 if (object == this) { 555 return true; 556 } 557 558 if (object instanceof InternetDomainName) { 559 InternetDomainName that = (InternetDomainName) object; 560 return this.name.equals(that.name); 561 } 562 563 return false; 564 } 565 566 @Override 567 public int hashCode() { 568 return name.hashCode(); 569 } 570 }