001package com.nimbusds.langtag; 002 003 004import java.util.LinkedList; 005import java.util.List; 006 007 008/** 009 * Language tag according to <a href="http://tools.ietf.org/html/rfc5646">RFC 5646</a>. 010 * 011 * <p>Supports normal language tags. Special private language tags beginning 012 * with "x" and grandfathered tags beginning with "i" are not supported. 013 * 014 * <p>To construct a new language tag from scratch: 015 * 016 * <pre> 017 * // English as used in the United States 018 * LangTag tag = new LangTag("en"); 019 * tag.setRegion("US"); 020 * 021 * // Returns "en-US" 022 * tag.toString(); 023 * </pre> 024 * 025 * <p>To parse a language tag: 026 * 027 * <pre> 028 * // Chinese, Mandarin, Simplified script, as used in China 029 * LangTag tag = LangTag.parse("zh-cmn-Hans-CN"); 030 * 031 * // Returns "zh" 032 * tag.getPrimaryLanguage(); 033 * 034 * // Returns "cmn" 035 * tag.getExtendedLanguageSubtags()[0]; 036 * 037 * // Returns "zh-cmn" 038 * tag.getLanguage(); 039 * 040 * // Returns "Hans" 041 * tag.getScript(); 042 * 043 * // Returns "CN" 044 * tag.getRegion(); 045 * </pre> 046 * 047 * <p>See <a href="http://tools.ietf.org/html/rfc5646">RFC 5646</a>. 048 */ 049public class LangTag implements ReadOnlyLangTag { 050 051 052 /** 053 * The primary language, as the shortest ISO 639 code (2*3ALPHA). Must 054 * always be defined, unless sufficient language subtags exist. 055 */ 056 private String primaryLanguage; 057 058 059 /** 060 * Optional extended language subtags, as three-letter ISO-639-3 codes. 061 */ 062 private String[] languageSubtags; 063 064 065 /** 066 * Optional script, (4ALPHA) ISO 15924 code. 067 */ 068 private String script = null; 069 070 071 /** 072 * Optional region, (2ALPHA) ISO 3166-1 code or (3DIGIT) UN M.49 code. 073 */ 074 private String region = null; 075 076 077 /** 078 * Optional variants, (5*8alphanum) or (DIGIT 3alphanum). 079 */ 080 private String[] variants = null; 081 082 083 /** 084 * Optional extensions. 085 */ 086 private String[] extensions = null; 087 088 089 /** 090 * Optional private use subtag. 091 */ 092 private String privateUse = null; 093 094 095 /** 096 * Ensures the specified subtag has a valid maximum length of eight 097 * characters. 098 * 099 * @param subtag The sub tag to check. Must not be {@code null}. 100 * 101 * @throws LangTagException If the subtag has length greater than eight 102 * characters. 103 */ 104 private static void ensureMaxLength(final String subtag) 105 throws LangTagException { 106 107 if (subtag.length() > 8) 108 109 // extension or private use subtag? 110 if (subtag.charAt(1) != '-' && subtag.length() > 10) 111 112 throw new LangTagException("Invalid subtag syntax: Max character length exceeded"); 113 } 114 115 116 /** 117 * Creates a new simple language tag. 118 * 119 * <p>Use for simple language tags such as "en" (English), "fr" 120 * (French) or "pt" (Portuguese). 121 * 122 * @param primaryLanguage The primary language, as the shortest two or 123 * three-letter ISO 639 code. Must not be 124 * {@code null}. 125 * 126 * @throws LangTagException If the primary language syntax is invalid. 127 */ 128 public LangTag(final String primaryLanguage) 129 throws LangTagException { 130 131 this(primaryLanguage, new String[]{}); 132 } 133 134 135 /** 136 * Creates a new extended language tag. 137 * 138 * <p>Use for extended language tags such as "zh-cmn" (Mandarin 139 * Chinese) or "zh-yue" (Cantonese Chinese). 140 * 141 * @param primaryLanguage The primary language, as the shortest two or 142 * three-letter ISO 639 code. May be {@code null} 143 * if the subtags are sufficient to identify the 144 * language. 145 * @param languageSubtags One or more extended language subtags, as 146 * three-letter ISO 639-3 codes. {@code null} if 147 * none. 148 * 149 * @throws LangTagException If the primary or extended language syntax 150 * is invalid. 151 */ 152 public LangTag(final String primaryLanguage, final String... languageSubtags) 153 throws LangTagException { 154 155 if (primaryLanguage == null && 156 (languageSubtags == null || languageSubtags.length == 0)) 157 throw new LangTagException("Either the primary language or the extended language subtags, or both must be defined"); 158 159 setPrimaryLanguage(primaryLanguage); 160 setExtendedLanguageSubtags(languageSubtags); 161 } 162 163 164 @Override 165 public String getLanguage() { 166 167 StringBuilder sb = new StringBuilder(); 168 169 if (primaryLanguage != null) 170 sb.append(primaryLanguage); 171 172 if (languageSubtags != null && languageSubtags.length > 0) { 173 174 for (String tag: languageSubtags) { 175 176 if (sb.length() > 0) 177 sb.append('-'); 178 179 sb.append(tag); 180 } 181 } 182 183 return sb.toString(); 184 } 185 186 187 @Override 188 public String getPrimaryLanguage() { 189 190 return primaryLanguage; 191 } 192 193 194 /** 195 * Checks if the specified string has a valid primary language subtag 196 * syntax. 197 * 198 * @param s The string to check. Must not be {@code null}. 199 * 200 * @return {@code true} if the syntax is correct, else {@code false}. 201 */ 202 private static boolean isPrimaryLanguage(final String s) { 203 204 return s.matches("[a-zA-Z]{2,3}"); 205 } 206 207 208 /** 209 * Sets the primary language subtag. 210 * 211 * <p>See RFC 5646 section 2.2.1. 212 * 213 * @param primaryLanguage The primary language, as the shortest two or 214 * three-letter ISO 639 code. May be 215 * {@code null}. 216 * 217 * @throws LangTagException If the primary language syntax is invalid. 218 */ 219 private void setPrimaryLanguage(final String primaryLanguage) 220 throws LangTagException { 221 222 if (primaryLanguage == null) { 223 this.primaryLanguage = null; 224 return; 225 } 226 227 ensureMaxLength(primaryLanguage); 228 229 if (! isPrimaryLanguage(primaryLanguage)) 230 throw new LangTagException("Invalid primary language subtag: Must be a two or three-letter ISO 639 code"); 231 232 this.primaryLanguage = primaryLanguage.toLowerCase(); 233 } 234 235 236 @Override 237 public String[] getExtendedLanguageSubtags() { 238 239 return languageSubtags; 240 } 241 242 243 /** 244 * Checks if the specified string has a valid extended language subtag 245 * syntax. 246 * 247 * @param s The string to check. Must not be {@code null}. 248 * 249 * @return {@code true} if the syntax is correct, else {@code false}. 250 */ 251 private static boolean isExtendedLanguageSubtag(final String s) { 252 253 return s.matches("[a-zA-Z]{3}"); 254 } 255 256 257 /** 258 * Sets the extended language subtags. 259 * 260 * <p>See RFC 5646 section 2.2.2. 261 * 262 * @param languageSubtags The extended language subtags, as three-letter 263 * ISO 639-3 codes. {@code null} if none. 264 */ 265 private void setExtendedLanguageSubtags(final String... languageSubtags) 266 throws LangTagException { 267 268 if (languageSubtags == null || languageSubtags.length == 0) { 269 this.languageSubtags = null; 270 return; 271 } 272 273 this.languageSubtags = new String[languageSubtags.length]; 274 275 for (int i=0; i < languageSubtags.length; i++) { 276 277 ensureMaxLength(languageSubtags[i]); 278 279 if (! isExtendedLanguageSubtag(languageSubtags[i])) 280 throw new LangTagException("Invalid extended language subtag: Must be a three-letter ISO 639-3 code"); 281 282 this.languageSubtags[i] = languageSubtags[i].toLowerCase(); 283 } 284 } 285 286 287 @Override 288 public String getScript() { 289 290 return script; 291 } 292 293 294 /** 295 * Checks if the specified string has a valid script subtag syntax. 296 * 297 * @param s The string to check. Must not be {@code null}. 298 * 299 * @return {@code true} if the syntax is correct, else {@code false}. 300 */ 301 private static boolean isScript(final String s) { 302 303 return s.matches("[a-zA-Z]{4}"); 304 } 305 306 307 /** 308 * Sets the script. 309 * 310 * <p>See RFC 5646 section 2.2.3. 311 * 312 * @param script The script, as a four-letter ISO 15924 code. 313 * {@code null} if not defined. 314 * 315 * @throws LangTagException If the script syntax is invalid. 316 */ 317 public void setScript(final String script) 318 throws LangTagException { 319 320 if (script == null) { 321 this.script = null; 322 return; 323 } 324 325 ensureMaxLength(script); 326 327 if (! isScript(script)) 328 throw new LangTagException("Invalid script subtag: Must be a four-letter ISO 15924 code"); 329 330 this.script = script.substring(0, 1).toUpperCase() + 331 script.substring(1).toLowerCase(); 332 } 333 334 335 @Override 336 public String getRegion() { 337 338 return region; 339 } 340 341 342 /** 343 * Checks if the specified string has a valid region subtag syntax. 344 * 345 * @param s The string to check. Must not be {@code null}. 346 * 347 * @return {@code true} if the syntax is correct, else {@code false}. 348 */ 349 private static boolean isRegion(final String s) { 350 351 return s.matches("[a-zA-Z]{2}|\\d{3}"); 352 } 353 354 355 /** 356 * Sets the region. 357 * 358 * <p>See RFC 5646 section 2.2.4. 359 * 360 * @param region The region, as a two-letter ISO 3166-1 code or a three- 361 * digit UN M.49 code. {@code null} if not defined. 362 * 363 * @throws LangTagException If the region syntax is invalid. 364 */ 365 public void setRegion(final String region) 366 throws LangTagException { 367 368 if (region == null) { 369 this.region = null; 370 return; 371 } 372 373 ensureMaxLength(region); 374 375 if (! isRegion(region)) 376 throw new LangTagException("Invalid region subtag: Must be a two-letter ISO 3166-1 code or a three-digit UN M.49 code"); 377 378 this.region = region.toUpperCase(); 379 } 380 381 382 @Override 383 public String[] getVariants() { 384 385 return variants; 386 } 387 388 389 /** 390 * Checks if the specified string has a valid variant subtag syntax. 391 * 392 * @param s The string to check. Must not be {@code null}. 393 * 394 * @return {@code true} if the syntax is correct, else {@code false}. 395 */ 396 private static boolean isVariant(final String s) { 397 398 return s.matches("[a-zA-Z][a-zA-Z0-9]{4,}|[0-9][a-zA-Z0-9]{3,}"); 399 } 400 401 402 /** 403 * Sets the variants. 404 * 405 * <p>See RFC 5646 section 2.2.5. 406 * 407 * @param variants The variants. {@code null} if not defined. 408 * 409 * @throws LangTagException If the variant syntax is invalid. 410 */ 411 public void setVariants(final String... variants) 412 throws LangTagException { 413 414 if (variants == null || variants.length == 0) { 415 this.variants = null; 416 return; 417 } 418 419 this.variants = new String[variants.length]; 420 421 for (int i=0; i < variants.length; i++) { 422 423 ensureMaxLength(variants[i]); 424 425 if (! isVariant(variants[i])) 426 throw new LangTagException("Invalid variant subtag"); 427 428 this.variants[i] = variants[i].toLowerCase(); 429 } 430 } 431 432 433 @Override 434 public String[] getExtensions() { 435 436 return extensions; 437 } 438 439 440 /** 441 * Checks if the specified string has a valid extension singleton 442 * syntax. 443 * 444 * @param s The string to check. Must not be {@code null}. 445 * 446 * @return {@code true} if the syntax is correct, else {@code false}. 447 */ 448 private static boolean isExtensionSingleton(final String s) { 449 450 return s.matches("[0-9a-wA-Wy-zY-Z]"); 451 } 452 453 454 /** 455 * Checks if the specified string has a valid extension subtag syntax. 456 * 457 * @param s The string to check. Must not be {@code null}. 458 * 459 * @return {@code true} if the syntax is correct, else {@code false}. 460 */ 461 private static boolean isExtension(final String s) { 462 463 return s.matches("[0-9a-wA-Wy-zY-Z]-[0-9a-zA-Z]+"); 464 } 465 466 467 /** 468 * Sets the extensions. 469 * 470 * <p>See RFC 5646 section 2.2.6. 471 * 472 * @param extensions The extensions. {@code null} if not defined. 473 * 474 * @throws LangTagException If the extension syntax is invalid. 475 */ 476 public void setExtensions(final String... extensions) 477 throws LangTagException { 478 479 if (extensions == null || extensions.length == 0) { 480 this.extensions = null; 481 return; 482 } 483 484 this.extensions = new String[extensions.length]; 485 486 for (int i=0; i < extensions.length; i++) { 487 488 ensureMaxLength(extensions[i]); 489 490 if (! isExtension(extensions[i])) 491 throw new LangTagException("Invalid extension subtag"); 492 493 this.extensions[i] = extensions[i].toLowerCase(); 494 } 495 } 496 497 498 @Override 499 public String getPrivateUse() { 500 501 return privateUse; 502 } 503 504 505 /** 506 * Checks if the specified string has a valid private use subtag syntax. 507 * 508 * @param s The string to check. Must not be {@code null}. 509 * 510 * @return {@code true} if the syntax is correct, else {@code false}. 511 */ 512 private static boolean isPrivateUse(final String s) { 513 514 return s.matches("x-[0-9a-zA-Z]+"); 515 } 516 517 518 /** 519 * Sets the private use. 520 * 521 * <p>See RFC 5646 section 2.2.7. 522 * 523 * @param privateUse The private use. {@code null} if not defined. 524 * 525 * @throws LangTagException If the extension syntax is invalid. 526 */ 527 public void setPrivateUse(final String privateUse) 528 throws LangTagException { 529 530 if (privateUse == null) { 531 this.privateUse = null; 532 return; 533 } 534 535 ensureMaxLength(privateUse); 536 537 if (! isPrivateUse(privateUse)) 538 throw new LangTagException("Invalid private use subtag"); 539 540 this.privateUse = privateUse.toLowerCase(); 541 } 542 543 544 @Override 545 public String toString() { 546 547 StringBuilder sb = new StringBuilder(getLanguage()); 548 549 if (script != null) { 550 sb.append('-'); 551 sb.append(script); 552 } 553 554 if (region != null) { 555 sb.append('-'); 556 sb.append(region); 557 } 558 559 if (variants != null) { 560 561 for (String v: variants) { 562 sb.append('-'); 563 sb.append(v); 564 } 565 } 566 567 if (extensions != null) { 568 569 for (String e: extensions) { 570 sb.append('-'); 571 sb.append(e); 572 } 573 } 574 575 if (privateUse != null) { 576 577 sb.append('-'); 578 sb.append(privateUse); 579 } 580 581 return sb.toString(); 582 } 583 584 585 /** 586 * Overrides {@code Object.hashCode()}. 587 * 588 * @return The object hash code. 589 */ 590 @Override 591 public int hashCode() { 592 593 return toString().hashCode(); 594 } 595 596 597 /** 598 * Overrides {@code Object.equals()}. 599 * 600 * @param object The object to compare to. 601 * 602 * @return {@code true} if the objects have the same value, otherwise 603 * {@code false}. 604 */ 605 @Override 606 public boolean equals(Object object) { 607 608 return object != null && 609 object instanceof LangTag && 610 this.toString().equals(object.toString()); 611 } 612 613 614 /** 615 * Parses the specified string representation of a language tag. 616 * 617 * @param s The string to parse. May be {@code null}. 618 * 619 * @return The language tag. {@code null} if the string was empty or 620 * {@code null}. 621 * 622 * @throws LangTagException If the string has invalid language tag 623 * syntax. 624 */ 625 public static LangTag parse(final String s) 626 throws LangTagException { 627 628 if (s == null || s.trim().isEmpty()) 629 return null; 630 631 final String[] subtags = s.split("-"); 632 633 int pos = 0; 634 635 // Parse primary lang + ext lang subtags 636 String primaryLang = null; 637 List<String> extLangSubtags = new LinkedList<String>(); 638 639 if (isPrimaryLanguage(subtags[0])) 640 primaryLang = subtags[pos++]; 641 642 // Multiple ext lang subtags possible 643 while (pos < subtags.length && isExtendedLanguageSubtag(subtags[pos])) 644 extLangSubtags.add(subtags[pos++]); 645 646 LangTag langTag = new LangTag(primaryLang, extLangSubtags.toArray(new String[]{})); 647 648 649 // Parse script 650 if (pos < subtags.length && isScript(subtags[pos])) 651 langTag.setScript(subtags[pos++]); 652 653 // Parse region 654 if (pos < subtags.length && isRegion(subtags[pos])) 655 langTag.setRegion(subtags[pos++]); 656 657 // Parse variants 658 List<String> variantSubtags = new LinkedList<String>(); 659 660 while (pos < subtags.length && isVariant(subtags[pos])) 661 variantSubtags.add(subtags[pos++]); 662 663 if (! variantSubtags.isEmpty()) 664 langTag.setVariants(variantSubtags.toArray(new String[]{})); 665 666 // Parse extensions, e.g. u-usercal 667 List<String> extSubtags = new LinkedList<String>(); 668 669 while (pos < subtags.length && isExtensionSingleton(subtags[pos])) { 670 671 String singleton = subtags[pos++]; 672 673 if (pos == subtags.length) 674 throw new LangTagException("Invalid extension subtag"); 675 676 extSubtags.add(singleton + "-" + subtags[pos++]); 677 } 678 679 if (! extSubtags.isEmpty()) 680 langTag.setExtensions(extSubtags.toArray(new String[]{})); 681 682 683 // Parse private use, e.g. x-abc 684 if (pos < subtags.length && subtags[pos].equals("x")) { 685 686 if (++pos == subtags.length) 687 throw new LangTagException("Invalid private use subtag"); 688 689 langTag.setPrivateUse("x-" + subtags[pos++]); 690 } 691 692 // End of tag? 693 if (pos < subtags.length) 694 throw new LangTagException("Invalid language tag: Unexpected subtag"); 695 696 return langTag; 697 } 698}