001    /*
002     * Copyright (C) 2009 The Guava Authors
003     *
004     * Licensed under the Apache License, Version 2.0 (the "License");
005     * you may not use this file except in compliance with the License.
006     * You may obtain a copy of the License at
007     *
008     * http://www.apache.org/licenses/LICENSE-2.0
009     *
010     * Unless required by applicable law or agreed to in writing, software
011     * distributed under the License is distributed on an "AS IS" BASIS,
012     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013     * See the License for the specific language governing permissions and
014     * limitations under the License.
015     */
016    
017    package com.google.common.net;
018    
019    import static com.google.common.base.Preconditions.checkArgument;
020    import static com.google.common.base.Preconditions.checkNotNull;
021    import static com.google.common.base.Preconditions.checkState;
022    
023    import com.google.common.annotations.Beta;
024    import com.google.common.annotations.GwtCompatible;
025    import com.google.common.base.Ascii;
026    import com.google.common.base.CharMatcher;
027    import com.google.common.base.Joiner;
028    import com.google.common.base.Objects;
029    import com.google.common.base.Splitter;
030    import com.google.common.collect.ImmutableList;
031    
032    import java.util.List;
033    
034    import javax.annotation.Nullable;
035    
036    /**
037     * An immutable well-formed internet domain name, such as {@code com} or {@code
038     * foo.co.uk}. Only syntactic analysis is performed; no DNS lookups or other
039     * network interactions take place. Thus there is no guarantee that the domain
040     * actually exists on the internet.
041     *
042     * <p>One common use of this class is to determine whether a given string is
043     * likely to represent an addressable domain on the web -- that is, for a
044     * candidate string {@code "xxx"}, might browsing to {@code "http://xxx/"}
045     * result in a webpage being displayed? In the past, this test was frequently
046     * done by determining whether the domain ended with a {@linkplain
047     * #isPublicSuffix() public suffix} but was not itself a public suffix. However,
048     * this test is no longer accurate. There are many domains which are both public
049     * suffixes and addressable as hosts; {@code "uk.com"} is one example. As a
050     * result, the only useful test to determine if a domain is a plausible web host
051     * is {@link #hasPublicSuffix()}. This will return {@code true} for many domains
052     * which (currently) are not hosts, such as {@code "com"}), but given that any
053     * public suffix may become a host without warning, it is better to err on the
054     * side of permissiveness and thus avoid spurious rejection of valid sites.
055     *
056     * <p>During construction, names are normalized in two ways:
057     * <ol>
058     * <li>ASCII uppercase characters are converted to lowercase.
059     * <li>Unicode dot separators other than the ASCII period ({@code '.'}) are
060     * converted to the ASCII period.
061     * </ol>
062     * The normalized values will be returned from {@link #name()} and
063     * {@link #parts()}, and will be reflected in the result of
064     * {@link #equals(Object)}.
065     *
066     * <p><a href="http://en.wikipedia.org/wiki/Internationalized_domain_name">
067     * internationalized domain names</a> such as {@code 网络.cn} are supported, as
068     * are the equivalent <a
069     * href="http://en.wikipedia.org/wiki/Internationalized_domain_name">IDNA
070     * Punycode-encoded</a> versions.
071     *
072     * @author Craig Berry
073     * @since 5.0
074     */
075    @Beta
076    @GwtCompatible(emulated = true)
077    public final class InternetDomainName {
078    
079      private static final CharMatcher DOTS_MATCHER =
080          CharMatcher.anyOf(".\u3002\uFF0E\uFF61");
081      private static final Splitter DOT_SPLITTER = Splitter.on('.');
082      private static final Joiner DOT_JOINER = Joiner.on('.');
083    
084      /**
085       * Value of {@link #publicSuffixIndex} which indicates that no public suffix
086       * was found.
087       */
088      private static final int NO_PUBLIC_SUFFIX_FOUND = -1;
089    
090      private static final String DOT_REGEX = "\\.";
091    
092      /**
093       * Maximum parts (labels) in a domain name. This value arises from
094       * the 255-octet limit described in
095       * <a href="http://www.ietf.org/rfc/rfc2181.txt">RFC 2181</a> part 11 with
096       * the fact that the encoding of each part occupies at least two bytes
097       * (dot plus label externally, length byte plus label internally). Thus, if
098       * all labels have the minimum size of one byte, 127 of them will fit.
099       */
100      private static final int MAX_PARTS = 127;
101    
102      /**
103       * Maximum length of a full domain name, including separators, and
104       * leaving room for the root label. See
105       * <a href="http://www.ietf.org/rfc/rfc2181.txt">RFC 2181</a> part 11.
106       */
107      private static final int MAX_LENGTH = 253;
108    
109      /**
110       * Maximum size of a single part of a domain name. See
111       * <a href="http://www.ietf.org/rfc/rfc2181.txt">RFC 2181</a> part 11.
112       */
113      private static final int MAX_DOMAIN_PART_LENGTH = 63;
114    
115      /**
116       * The full domain name, converted to lower case.
117       */
118      private final String name;
119    
120      /**
121       * The parts of the domain name, converted to lower case.
122       */
123      private final ImmutableList<String> parts;
124    
125      /**
126       * The index in the {@link #parts()} list at which the public suffix begins.
127       * For example, for the domain name {@code www.google.co.uk}, the value would
128       * be 2 (the index of the {@code co} part). The value is negative
129       * (specifically, {@link #NO_PUBLIC_SUFFIX_FOUND}) if no public suffix was
130       * found.
131       */
132      private final int publicSuffixIndex;
133    
134      /**
135       * Constructor used to implement {@link #from(String)}, and from subclasses.
136       */
137      InternetDomainName(String name) {
138        // Normalize:
139        // * ASCII characters to lowercase
140        // * All dot-like characters to '.'
141        // * Strip trailing '.'
142    
143        name = Ascii.toLowerCase(DOTS_MATCHER.replaceFrom(name, '.'));
144    
145        if (name.endsWith(".")) {
146          name = name.substring(0, name.length() - 1);
147        }
148    
149        checkArgument(name.length() <= MAX_LENGTH, "Domain name too long: '%s':", name);
150        this.name = name;
151    
152        this.parts = ImmutableList.copyOf(DOT_SPLITTER.split(name));
153        checkArgument(parts.size() <= MAX_PARTS, "Domain has too many parts: '%s'", name);
154        checkArgument(validateSyntax(parts), "Not a valid domain name: '%s'", name);
155    
156        this.publicSuffixIndex = findPublicSuffix();
157      }
158    
159      /**
160       * Returns the index of the leftmost part of the public suffix, or -1 if not
161       * found. Note that the value defined as the "public suffix" may not be a
162       * public suffix according to {@link #isPublicSuffix()} if the domain ends
163       * with an excluded domain pattern such as {@code "nhs.uk"}.
164       */
165      private int findPublicSuffix() {
166        final int partsSize = parts.size();
167    
168        for (int i = 0; i < partsSize; i++) {
169          String ancestorName = DOT_JOINER.join(parts.subList(i, partsSize));
170    
171          if (TldPatterns.EXACT.contains(ancestorName)) {
172            return i;
173          }
174    
175          // Excluded domains (e.g. !nhs.uk) use the next highest
176          // domain as the effective public suffix (e.g. uk).
177    
178          if (TldPatterns.EXCLUDED.contains(ancestorName)) {
179            return i + 1;
180          }
181    
182          if (matchesWildcardPublicSuffix(ancestorName)) {
183            return i;
184          }
185        }
186    
187        return NO_PUBLIC_SUFFIX_FOUND;
188      }
189    
190      /**
191       * A deprecated synonym for {@link #from(String)}.
192       *
193       * @param domain A domain name (not IP address)
194       * @throws IllegalArgumentException if {@code name} is not syntactically valid
195       *     according to {@link #isValidLenient}
196       * @since 8.0 (previously named {@code from})
197       * @deprecated Use {@link #from(String)}
198       */
199      @Deprecated
200      public static InternetDomainName fromLenient(String domain) {
201        return from(domain);
202      }
203    
204      /**
205       * Returns an instance of {@link InternetDomainName} after lenient
206       * validation.  Specifically, validation against <a
207       * href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>
208       * ("Internationalizing Domain Names in Applications") is skipped, while
209       * validation against <a
210       * href="http://www.ietf.org/rfc/rfc1035.txt">RFC 1035</a> is relaxed in
211       * the following ways:
212       * <ul>
213       * <li>Any part containing non-ASCII characters is considered valid.
214       * <li>Underscores ('_') are permitted wherever dashes ('-') are permitted.
215       * <li>Parts other than the final part may start with a digit.
216       * </ul>
217       *
218       * @param domain A domain name (not IP address)
219       * @throws IllegalArgumentException if {@code name} is not syntactically valid
220       *     according to {@link #isValidLenient}
221       * @since 10.0 (previously named {@code fromLenient})
222       */
223      public static InternetDomainName from(String domain) {
224        return new InternetDomainName(checkNotNull(domain));
225      }
226    
227      /**
228       * Validation method used by {@from} to ensure that the domain name is
229       * syntactically valid according to RFC 1035.
230       *
231       * @return Is the domain name syntactically valid?
232       */
233      private static boolean validateSyntax(List<String> parts) {
234        final int lastIndex = parts.size() - 1;
235    
236        // Validate the last part specially, as it has different syntax rules.
237    
238        if (!validatePart(parts.get(lastIndex), true)) {
239          return false;
240        }
241    
242        for (int i = 0; i < lastIndex; i++) {
243          String part = parts.get(i);
244          if (!validatePart(part, false)) {
245            return false;
246          }
247        }
248    
249        return true;
250      }
251    
252      private static final CharMatcher DASH_MATCHER = CharMatcher.anyOf("-_");
253    
254      private static final CharMatcher PART_CHAR_MATCHER =
255          CharMatcher.JAVA_LETTER_OR_DIGIT.or(DASH_MATCHER);
256    
257      /**
258       * Helper method for {@link #validateSyntax(List)}. Validates that one part of
259       * a domain name is valid.
260       *
261       * @param part The domain name part to be validated
262       * @param isFinalPart Is this the final (rightmost) domain part?
263       * @return Whether the part is valid
264       */
265      private static boolean validatePart(String part, boolean isFinalPart) {
266    
267        // These tests could be collapsed into one big boolean expression, but
268        // they have been left as independent tests for clarity.
269    
270        if (part.length() < 1 || part.length() > MAX_DOMAIN_PART_LENGTH) {
271          return false;
272        }
273    
274        /*
275         * GWT claims to support java.lang.Character's char-classification methods,
276         * but it actually only works for ASCII. So for now, assume any non-ASCII
277         * characters are valid. The only place this seems to be documented is here:
278         * http://osdir.com/ml/GoogleWebToolkitContributors/2010-03/msg00178.html
279         *
280         * <p>ASCII characters in the part are expected to be valid per RFC 1035,
281         * with underscore also being allowed due to widespread practice.
282         */
283    
284        String asciiChars = CharMatcher.ASCII.retainFrom(part);
285    
286        if (!PART_CHAR_MATCHER.matchesAllOf(asciiChars)) {
287          return false;
288        }
289    
290        // No initial or final dashes or underscores.
291    
292        if (DASH_MATCHER.matches(part.charAt(0))
293            || DASH_MATCHER.matches(part.charAt(part.length() - 1))) {
294          return false;
295        }
296    
297        /*
298         * Note that we allow (in contravention of a strict interpretation of the
299         * relevant RFCs) domain parts other than the last may begin with a digit
300         * (for example, "3com.com"). It's important to disallow an initial digit in
301         * the last part; it's the only thing that stops an IPv4 numeric address
302         * like 127.0.0.1 from looking like a valid domain name.
303         */
304    
305        if (isFinalPart && CharMatcher.DIGIT.matches(part.charAt(0))) {
306          return false;
307        }
308    
309        return true;
310      }
311    
312      /**
313       * Returns the domain name, normalized to all lower case.
314       */
315      public String name() {
316        return name;
317      }
318    
319      /**
320       * Returns the individual components of this domain name, normalized to all
321       * lower case. For example, for the domain name {@code mail.google.com}, this
322       * method returns the list {@code ["mail", "google", "com"]}.
323       */
324      public ImmutableList<String> parts() {
325        return parts;
326      }
327    
328      /**
329       * Indicates whether this domain name represents a <i>public suffix</i>, as
330       * defined by the Mozilla Foundation's
331       * <a href="http://publicsuffix.org/">Public Suffix List</a> (PSL). A public
332       * suffix is one under which Internet users can directly register names, such
333       * as {@code com}, {@code co.uk} or {@code pvt.k12.wy.us}. Examples of domain
334       * names that are <i>not</i> public suffixes include {@code google}, {@code
335       * google.com} and {@code foo.co.uk}.
336       *
337       * @return {@code true} if this domain name appears exactly on the public
338       *     suffix list
339       * @since 6.0
340       */
341      public boolean isPublicSuffix() {
342        return publicSuffixIndex == 0;
343      }
344    
345      /**
346       * Indicates whether this domain name ends in a {@linkplain #isPublicSuffix()
347       * public suffix}, including if it is a public suffix itself. For example,
348       * returns {@code true} for {@code www.google.com}, {@code foo.co.uk} and
349       * {@code com}, but not for {@code google} or {@code google.foo}. This is
350       * the recommended method for determining whether a domain is potentially an
351       * addressable host.
352       *
353       * @since 6.0
354       */
355      public boolean hasPublicSuffix() {
356        return publicSuffixIndex != NO_PUBLIC_SUFFIX_FOUND;
357      }
358    
359      /**
360       * Returns the {@linkplain #isPublicSuffix() public suffix} portion of the
361       * domain name, or {@code null} if no public suffix is present.
362       *
363       * @since 6.0
364       */
365      public InternetDomainName publicSuffix() {
366        return hasPublicSuffix() ? ancestor(publicSuffixIndex) : null;
367      }
368    
369      /**
370       * Indicates whether this domain name ends in a {@linkplain #isPublicSuffix()
371       * public suffix}, while not being a public suffix itself. For example,
372       * returns {@code true} for {@code www.google.com}, {@code foo.co.uk} and
373       * {@code bar.ca.us}, but not for {@code google}, {@code com}, or {@code
374       * google.foo}.
375       *
376       * <p><b>Warning:</b> a {@code false} result from this method does not imply
377       * that the domain does not represent an addressable host, as many public
378       * suffixes are also addressable hosts. Use {@link #hasPublicSuffix()} for
379       * that test.
380       *
381       * <p>This method can be used to determine whether it will probably be
382       * possible to set cookies on the domain, though even that depends on
383       * individual browsers' implementations of cookie controls. See
384       * <a href="http://www.ietf.org/rfc/rfc2109.txt">RFC 2109</a> for details.
385       *
386       * @since 6.0
387       */
388      public boolean isUnderPublicSuffix() {
389        return publicSuffixIndex > 0;
390      }
391    
392      /**
393       * Indicates whether this domain name is composed of exactly one subdomain
394       * component followed by a {@linkplain #isPublicSuffix() public suffix}. For
395       * example, returns {@code true} for {@code google.com} and {@code foo.co.uk},
396       * but not for {@code www.google.com} or {@code co.uk}.
397       *
398       * <p><b>Warning:</b> A {@code true} result from this method does not imply
399       * that the domain is at the highest level which is addressable as a host, as
400       * many public suffixes are also addressable hosts. For example, the domain
401       * {@code bar.uk.com} has a public suffix of {@code uk.com}, so it would
402       * return {@code true} from this method. But {@code uk.com} is itself an
403       * addressable host.
404       *
405       * <p>This method can be used to determine whether a domain is probably the
406       * highest level for which cookies may be set, though even that depends on
407       * individual browsers' implementations of cookie controls. See
408       * <a href="http://www.ietf.org/rfc/rfc2109.txt">RFC 2109</a> for details.
409       *
410       * @since 6.0
411       */
412      public boolean isTopPrivateDomain() {
413        return publicSuffixIndex == 1;
414      }
415    
416      /**
417       * Returns the portion of this domain name that is one level beneath the
418       * public suffix. For example, for {@code x.adwords.google.co.uk} it returns
419       * {@code google.co.uk}, since {@code co.uk} is a public suffix.
420       *
421       * <p>If {@link #isTopPrivateDomain()} is true, the current domain name
422       * instance is returned.
423       *
424       * <p>This method should not be used to determine the topmost parent domain
425       * which is addressable as a host, as many public suffixes are also
426       * addressable hosts. For example, the domain {@code foo.bar.uk.com} has
427       * a public suffix of {@code uk.com}, so it would return {@code bar.uk.com}
428       * from this method. But {@code uk.com} is itself an addressable host.
429       *
430       * <p>This method can be used to determine the probable highest level parent
431       * domain for which cookies may be set, though even that depends on individual
432       * browsers' implementations of cookie controls.
433       *
434       * @throws IllegalStateException if this domain does not end with a
435       *     public suffix
436       * @since 6.0
437       */
438      public InternetDomainName topPrivateDomain() {
439        if (isTopPrivateDomain()) {
440          return this;
441        }
442        checkState(isUnderPublicSuffix(), "Not under a public suffix: %s", name);
443        return ancestor(publicSuffixIndex - 1);
444      }
445    
446      /**
447       * Indicates whether this domain is composed of two or more parts.
448       */
449      public boolean hasParent() {
450        return parts.size() > 1;
451      }
452    
453      /**
454       * Returns an {@code InternetDomainName} that is the immediate ancestor of
455       * this one; that is, the current domain with the leftmost part removed. For
456       * example, the parent of {@code www.google.com} is {@code google.com}.
457       *
458       * @throws IllegalStateException if the domain has no parent, as determined
459       *     by {@link #hasParent}
460       */
461      public InternetDomainName parent() {
462        checkState(hasParent(), "Domain '%s' has no parent", name);
463        return ancestor(1);
464      }
465    
466      /**
467       * Returns the ancestor of the current domain at the given number of levels
468       * "higher" (rightward) in the subdomain list. The number of levels must be
469       * non-negative, and less than {@code N-1}, where {@code N} is the number of
470       * parts in the domain.
471       *
472       * <p>TODO: Reasonable candidate for addition to public API.
473       */
474      private InternetDomainName ancestor(int levels) {
475        return from(DOT_JOINER.join(parts.subList(levels, parts.size())));
476      }
477    
478      /**
479       * Creates and returns a new {@code InternetDomainName} by prepending the
480       * argument and a dot to the current name. For example, {@code
481       * InternetDomainName.fromLenient("foo.com").child("www.bar")} returns a new
482       * {@code InternetDomainName} with the value {@code www.bar.foo.com}. Only
483       * lenient validation is performed, as described {@link #from(String) here}.
484       *
485       * @throws NullPointerException if leftParts is null
486       * @throws IllegalArgumentException if the resulting name is not valid
487       */
488      public InternetDomainName child(String leftParts) {
489        return from(checkNotNull(leftParts) + "." + name);
490      }
491    
492      /**
493       * A deprecated synonym for {@link #isValid(String)}.
494       *
495       * @since 8.0 (previously named {@code isValid})
496       * @deprecated Use {@link #isValid(String)} instead
497       */
498      @Deprecated
499      public static boolean isValidLenient(String name) {
500        return isValid(name);
501      }
502    
503      /**
504       * Indicates whether the argument is a syntactically valid domain name using
505       * lenient validation. Specifically, validation against <a
506       * href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>
507       * ("Internationalizing Domain Names in Applications") is skipped.
508       *
509       * <p>The following two code snippets are equivalent:
510       *
511       * <pre>   {@code
512       *
513       *   domainName = InternetDomainName.isValid(name)
514       *       ? InternetDomainName.from(name)
515       *       : DEFAULT_DOMAIN;
516       *   }</pre>
517       *
518       * <pre>   {@code
519       *
520       *   try {
521       *     domainName = InternetDomainName.from(name);
522       *   } catch (IllegalArgumentException e) {
523       *     domainName = DEFAULT_DOMAIN;
524       *   }}</pre>
525       *
526       * @since 8.0 (previously named {@code isValidLenient})
527       */
528      public static boolean isValid(String name) {
529        try {
530          from(name);
531          return true;
532        } catch (IllegalArgumentException e) {
533          return false;
534        }
535      }
536    
537      /**
538       * Does the domain name match one of the "wildcard" patterns (e.g.
539       * {@code "*.ar"})?
540       */
541      private static boolean matchesWildcardPublicSuffix(String domain) {
542        final String[] pieces = domain.split(DOT_REGEX, 2);
543        return pieces.length == 2 && TldPatterns.UNDER.contains(pieces[1]);
544      }
545    
546      // TODO: specify this to return the same as name(); remove name()
547      @Override
548      public String toString() {
549        return Objects.toStringHelper(this).add("name", name).toString();
550      }
551    
552      @Override
553      public boolean equals(@Nullable Object object) {
554        if (object == this) {
555          return true;
556        }
557    
558        if (object instanceof InternetDomainName) {
559          InternetDomainName that = (InternetDomainName) object;
560          return this.name.equals(that.name);
561        }
562    
563        return false;
564      }
565    
566      @Override
567      public int hashCode() {
568        return name.hashCode();
569      }
570    }