/*
 * lang-tag
 *
 * Copyright 2012-2016, Connect2id Ltd.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
 * this file except in compliance with the License. You may obtain a copy of the
 * License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed
 * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations under the License.
 */

package com.nimbusds.langtag;


import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;


/**
 * Language tag according to <a href="http://tools.ietf.org/html/rfc5646">RFC 5646</a>.
 *
 * <p>Supports normal language tags. Special private language tags beginning
 * with "x" and grandfathered tags beginning with "i" are not supported.
 *
 * <p>To construct a new language tag from scratch:
 *
 * <pre>
 * // English as used in the United States
 * LangTag tag = new LangTag("en");
 * tag.setRegion("US");
 *
 * // Returns "en-US"
 * tag.toString();
 * </pre>
 *
 * <p>To parse a language tag:
 *
 * <pre>
 * // Chinese, Mandarin, Simplified script, as used in China
 * LangTag tag = LangTag.parse("zh-cmn-Hans-CN");
 *
 * // Returns "zh"
 * tag.getPrimaryLanguage();
 *
 * // Returns "cmn"
 * tag.getExtendedLanguageSubtags()[0];
 *
 * // Returns "zh-cmn"
 * tag.getLanguage();
 *
 * // Returns "Hans"
 * tag.getScript();
 *
 * // Returns "CN"
 * tag.getRegion();
 * </pre>
 *
 * <p>All subtags are stored in a normalised letter case (language, variants,
 * extensions and private use in lower case, script in title case, region in
 * upper case). Case conversion is locale-independent.
 *
 * <p>See <a href="http://tools.ietf.org/html/rfc5646">RFC 5646</a>.
 */
public class LangTag implements ReadOnlyLangTag {


    /**
     * Maximum number of characters in a plain subtag.
     */
    private static final int MAX_SUBTAG_LENGTH = 8;


    /**
     * Maximum number of characters in an extension or private use value,
     * which is made up of a one-character singleton, a hyphen and one
     * subtag of up to {@link #MAX_SUBTAG_LENGTH} characters.
     */
    private static final int MAX_PREFIXED_SUBTAG_LENGTH = MAX_SUBTAG_LENGTH + 2;


    /**
     * Primary language subtag syntax, two or three letters.
     */
    private static final Pattern PRIMARY_LANGUAGE_PATTERN = Pattern.compile("[a-zA-Z]{2,3}");


    /**
     * Extended language subtag syntax, three letters.
     */
    private static final Pattern EXTENDED_LANGUAGE_PATTERN = Pattern.compile("[a-zA-Z]{3}");


    /**
     * Script subtag syntax, four letters.
     */
    private static final Pattern SCRIPT_PATTERN = Pattern.compile("[a-zA-Z]{4}");


    /**
     * Region subtag syntax, two letters or three digits.
     */
    private static final Pattern REGION_PATTERN = Pattern.compile("[a-zA-Z]{2}|\\d{3}");


    /**
     * Variant subtag syntax. The upper length bound is enforced separately
     * by {@link #ensureMaxLength}.
     */
    private static final Pattern VARIANT_PATTERN = Pattern.compile("[a-zA-Z][a-zA-Z0-9]{4,}|[0-9][a-zA-Z0-9]{3,}");


    /**
     * Extension singleton syntax, any alphanumeric character except "x",
     * which is reserved for private use.
     */
    private static final Pattern EXTENSION_SINGLETON_PATTERN = Pattern.compile("[0-9a-wA-Wy-zY-Z]");


    /**
     * Extension syntax, a singleton followed by a hyphen and one subtag.
     */
    private static final Pattern EXTENSION_PATTERN = Pattern.compile("[0-9a-wA-Wy-zY-Z]-[0-9a-zA-Z]+");


    /**
     * Private use syntax, "x" (in either case) followed by a hyphen and
     * one subtag.
     */
    private static final Pattern PRIVATE_USE_PATTERN = Pattern.compile("[xX]-[0-9a-zA-Z]+");


    /**
     * The primary language, as the shortest ISO 639 code (2*3ALPHA). Must
     * always be defined, unless sufficient language subtags exist.
     */
    private String primaryLanguage;


    /**
     * Optional extended language subtags, as three-letter ISO-639-3 codes.
     */
    private String[] languageSubtags;


    /**
     * Optional script, (4ALPHA) ISO 15924 code.
     */
    private String script = null;


    /**
     * Optional region, (2ALPHA) ISO 3166-1 code or (3DIGIT) UN M.49 code.
     */
    private String region = null;


    /**
     * Optional variants, (5*8alphanum) or (DIGIT 3alphanum).
     */
    private String[] variants = null;


    /**
     * Optional extensions.
     */
    private String[] extensions = null;


    /**
     * Optional private use subtag.
     */
    private String privateUse = null;


    /**
     * Ensures the specified subtag does not exceed the maximum permitted
     * length: eight characters for a plain subtag, or ten characters for
     * an extension / private use value whose second character is the
     * hyphen that follows the singleton (for example "u-islamcal").
     *
     * @param subtag The subtag to check.
     *
     * @throws LangTagException If the subtag is {@code null} or longer
     *                          than permitted.
     */
    private static void ensureMaxLength(final String subtag)
        throws LangTagException {

        if (subtag == null) {
            throw new LangTagException("Invalid subtag syntax: Subtag must not be null");
        }

        final int length = subtag.length();

        if (length <= MAX_SUBTAG_LENGTH) {
            return;
        }

        // Longer than eight characters, so charAt(1) is always in range.
        // Only singleton-prefixed values may use the two extra characters.
        final boolean singletonPrefixed = subtag.charAt(1) == '-';

        if (! singletonPrefixed || length > MAX_PREFIXED_SUBTAG_LENGTH) {
            throw new LangTagException("Invalid subtag syntax: Max character length exceeded");
        }
    }


    /**
     * Returns a copy of the specified array, so that callers cannot alter
     * the state of a language tag through a returned array.
     *
     * @param array The array to copy. May be {@code null}.
     *
     * @return The copy, {@code null} if the array was {@code null}.
     */
    private static String[] copyOrNull(final String[] array) {

        return array == null ? null : array.clone();
    }


    /**
     * Creates a new simple language tag.
     *
     * <p>Use for simple language tags such as "en" (English), "fr"
     * (French) or "pt" (Portuguese).
     *
     * @param primaryLanguage The primary language, as the shortest two or
     *                        three-letter ISO 639 code. Must not be
     *                        {@code null}.
     *
     * @throws LangTagException If the primary language syntax is invalid.
     */
    public LangTag(final String primaryLanguage)
        throws LangTagException {

        this(primaryLanguage, new String[0]);
    }


    /**
     * Creates a new extended language tag.
     *
     * <p>Use for extended language tags such as "zh-cmn" (Mandarin
     * Chinese) or "zh-yue" (Cantonese Chinese).
     *
     * @param primaryLanguage The primary language, as the shortest two or
     *                        three-letter ISO 639 code. May be {@code null}
     *                        if the subtags are sufficient to identify the
     *                        language.
     * @param languageSubtags One or more extended language subtags, as
     *                        three-letter ISO 639-3 codes. {@code null} if
     *                        none.
     *
     * @throws LangTagException If the primary or extended language syntax
     *                          is invalid, or if neither is defined.
     */
    public LangTag(final String primaryLanguage, final String... languageSubtags)
        throws LangTagException {

        final boolean noSubtags = languageSubtags == null || languageSubtags.length == 0;

        if (primaryLanguage == null && noSubtags) {
            throw new LangTagException("Either the primary language or the extended language subtags, or both must be defined");
        }

        setPrimaryLanguage(primaryLanguage);
        setExtendedLanguageSubtags(languageSubtags);
    }


    /**
     * Returns the primary language followed by the extended language
     * subtags, if any, joined with hyphens (for example "zh-cmn").
     */
    @Override
    public String getLanguage() {

        final StringBuilder sb = new StringBuilder();

        if (primaryLanguage != null) {
            sb.append(primaryLanguage);
        }

        if (languageSubtags != null) {

            for (String subtag: languageSubtags) {

                // No leading hyphen when the primary language is absent
                if (sb.length() > 0) {
                    sb.append('-');
                }

                sb.append(subtag);
            }
        }

        return sb.toString();
    }


    /**
     * Returns the primary language subtag in lower case, {@code null} if
     * not defined.
     */
    @Override
    public String getPrimaryLanguage() {

        return primaryLanguage;
    }


    /**
     * Checks if the specified string has a valid primary language subtag
     * syntax.
     *
     * @param s The string to check. Must not be {@code null}.
     *
     * @return {@code true} if the syntax is correct, else {@code false}.
     */
    private static boolean isPrimaryLanguage(final String s) {

        return PRIMARY_LANGUAGE_PATTERN.matcher(s).matches();
    }


    /**
     * Sets the primary language subtag.
     *
     * <p>See RFC 5646 section 2.2.1.
     *
     * @param primaryLanguage The primary language, as the shortest two or
     *                        three-letter ISO 639 code. May be
     *                        {@code null}.
     *
     * @throws LangTagException If the primary language syntax is invalid.
     */
    private void setPrimaryLanguage(final String primaryLanguage)
        throws LangTagException {

        if (primaryLanguage == null) {
            this.primaryLanguage = null;
            return;
        }

        ensureMaxLength(primaryLanguage);

        if (! isPrimaryLanguage(primaryLanguage)) {
            throw new LangTagException("Invalid primary language subtag: Must be a two or three-letter ISO 639 code");
        }

        this.primaryLanguage = primaryLanguage.toLowerCase(Locale.ROOT);
    }


    /**
     * Returns a copy of the extended language subtags in lower case,
     * {@code null} if none are defined.
     */
    @Override
    public String[] getExtendedLanguageSubtags() {

        return copyOrNull(languageSubtags);
    }


    /**
     * Checks if the specified string has a valid extended language subtag
     * syntax.
     *
     * @param s The string to check. Must not be {@code null}.
     *
     * @return {@code true} if the syntax is correct, else {@code false}.
     */
    private static boolean isExtendedLanguageSubtag(final String s) {

        return EXTENDED_LANGUAGE_PATTERN.matcher(s).matches();
    }


    /**
     * Sets the extended language subtags.
     *
     * <p>See RFC 5646 section 2.2.2.
     *
     * @param languageSubtags The extended language subtags, as three-letter
     *                        ISO 639-3 codes. {@code null} if none.
     *
     * @throws LangTagException If the extended language syntax is invalid.
     *                          The previously set subtags are left
     *                          unchanged in that case.
     */
    private void setExtendedLanguageSubtags(final String... languageSubtags)
        throws LangTagException {

        if (languageSubtags == null || languageSubtags.length == 0) {
            this.languageSubtags = null;
            return;
        }

        // Validate into a local array so a failure cannot leave a partly
        // filled field behind
        final String[] normalised = new String[languageSubtags.length];

        for (int i = 0; i < languageSubtags.length; i++) {

            final String subtag = languageSubtags[i];

            ensureMaxLength(subtag);

            if (! isExtendedLanguageSubtag(subtag)) {
                throw new LangTagException("Invalid extended language subtag: Must be a three-letter ISO 639-3 code");
            }

            normalised[i] = subtag.toLowerCase(Locale.ROOT);
        }

        this.languageSubtags = normalised;
    }


    /**
     * Returns the script subtag in title case (for example "Hans"),
     * {@code null} if not defined.
     */
    @Override
    public String getScript() {

        return script;
    }


    /**
     * Checks if the specified string has a valid script subtag syntax.
     *
     * @param s The string to check. Must not be {@code null}.
     *
     * @return {@code true} if the syntax is correct, else {@code false}.
     */
    private static boolean isScript(final String s) {

        return SCRIPT_PATTERN.matcher(s).matches();
    }


    /**
     * Sets the script.
     *
     * <p>See RFC 5646 section 2.2.3.
     *
     * @param script The script, as a four-letter ISO 15924 code.
     *               {@code null} if not defined.
     *
     * @throws LangTagException If the script syntax is invalid.
     */
    public void setScript(final String script)
        throws LangTagException {

        if (script == null) {
            this.script = null;
            return;
        }

        ensureMaxLength(script);

        if (! isScript(script)) {
            throw new LangTagException("Invalid script subtag: Must be a four-letter ISO 15924 code");
        }

        // Title case: first letter upper, remaining letters lower
        this.script = script.substring(0, 1).toUpperCase(Locale.ROOT) +
                      script.substring(1).toLowerCase(Locale.ROOT);
    }


    /**
     * Returns the region subtag in upper case, {@code null} if not
     * defined.
     */
    @Override
    public String getRegion() {

        return region;
    }


    /**
     * Checks if the specified string has a valid region subtag syntax.
     *
     * @param s The string to check. Must not be {@code null}.
     *
     * @return {@code true} if the syntax is correct, else {@code false}.
     */
    private static boolean isRegion(final String s) {

        return REGION_PATTERN.matcher(s).matches();
    }


    /**
     * Sets the region.
     *
     * <p>See RFC 5646 section 2.2.4.
     *
     * @param region The region, as a two-letter ISO 3166-1 code or a three-
     *               digit UN M.49 code. {@code null} if not defined.
     *
     * @throws LangTagException If the region syntax is invalid.
     */
    public void setRegion(final String region)
        throws LangTagException {

        if (region == null) {
            this.region = null;
            return;
        }

        ensureMaxLength(region);

        if (! isRegion(region)) {
            throw new LangTagException("Invalid region subtag: Must be a two-letter ISO 3166-1 code or a three-digit UN M.49 code");
        }

        this.region = region.toUpperCase(Locale.ROOT);
    }


    /**
     * Returns a copy of the variant subtags in lower case, {@code null}
     * if none are defined.
     */
    @Override
    public String[] getVariants() {

        return copyOrNull(variants);
    }


    /**
     * Checks if the specified string has a valid variant subtag syntax.
     *
     * @param s The string to check. Must not be {@code null}.
     *
     * @return {@code true} if the syntax is correct, else {@code false}.
     */
    private static boolean isVariant(final String s) {

        return VARIANT_PATTERN.matcher(s).matches();
    }


    /**
     * Sets the variants.
     *
     * <p>See RFC 5646 section 2.2.5.
     *
     * @param variants The variants. {@code null} if not defined.
     *
     * @throws LangTagException If the variant syntax is invalid. The
     *                          previously set variants are left unchanged
     *                          in that case.
     */
    public void setVariants(final String... variants)
        throws LangTagException {

        if (variants == null || variants.length == 0) {
            this.variants = null;
            return;
        }

        // Validate into a local array so a failure cannot leave a partly
        // filled field behind
        final String[] normalised = new String[variants.length];

        for (int i = 0; i < variants.length; i++) {

            final String variant = variants[i];

            ensureMaxLength(variant);

            if (! isVariant(variant)) {
                throw new LangTagException("Invalid variant subtag");
            }

            normalised[i] = variant.toLowerCase(Locale.ROOT);
        }

        this.variants = normalised;
    }


    /**
     * Returns a copy of the extensions in lower case, each in the form
     * singleton, hyphen, subtag (for example "u-islamcal"), {@code null}
     * if none are defined.
     */
    @Override
    public String[] getExtensions() {

        return copyOrNull(extensions);
    }


    /**
     * Checks if the specified string has a valid extension singleton
     * syntax.
     *
     * @param s The string to check. Must not be {@code null}.
     *
     * @return {@code true} if the syntax is correct, else {@code false}.
     */
    private static boolean isExtensionSingleton(final String s) {

        return EXTENSION_SINGLETON_PATTERN.matcher(s).matches();
    }


    /**
     * Checks if the specified string has a valid extension subtag syntax.
     *
     * @param s The string to check. Must not be {@code null}.
     *
     * @return {@code true} if the syntax is correct, else {@code false}.
     */
    private static boolean isExtension(final String s) {

        return EXTENSION_PATTERN.matcher(s).matches();
    }


    /**
     * Sets the extensions.
     *
     * <p>See RFC 5646 section 2.2.6.
     *
     * @param extensions The extensions. {@code null} if not defined.
     *
     * @throws LangTagException If the extension syntax is invalid. The
     *                          previously set extensions are left
     *                          unchanged in that case.
     */
    public void setExtensions(final String... extensions)
        throws LangTagException {

        if (extensions == null || extensions.length == 0) {
            this.extensions = null;
            return;
        }

        // Validate into a local array so a failure cannot leave a partly
        // filled field behind
        final String[] normalised = new String[extensions.length];

        for (int i = 0; i < extensions.length; i++) {

            final String extension = extensions[i];

            ensureMaxLength(extension);

            if (! isExtension(extension)) {
                throw new LangTagException("Invalid extension subtag");
            }

            normalised[i] = extension.toLowerCase(Locale.ROOT);
        }

        this.extensions = normalised;
    }


    /**
     * Returns the private use subtag in lower case, including its "x-"
     * prefix, {@code null} if not defined.
     */
    @Override
    public String getPrivateUse() {

        return privateUse;
    }


    /**
     * Checks if the specified string has a valid private use subtag syntax.
     * The leading "x" is accepted in either letter case.
     *
     * @param s The string to check. Must not be {@code null}.
     *
     * @return {@code true} if the syntax is correct, else {@code false}.
     */
    private static boolean isPrivateUse(final String s) {

        return PRIVATE_USE_PATTERN.matcher(s).matches();
    }


    /**
     * Sets the private use.
     *
     * <p>See RFC 5646 section 2.2.7.
     *
     * @param privateUse The private use, with its "x-" prefix.
     *                   {@code null} if not defined.
     *
     * @throws LangTagException If the private use syntax is invalid.
     */
    public void setPrivateUse(final String privateUse)
        throws LangTagException {

        if (privateUse == null) {
            this.privateUse = null;
            return;
        }

        ensureMaxLength(privateUse);

        if (! isPrivateUse(privateUse)) {
            throw new LangTagException("Invalid private use subtag");
        }

        this.privateUse = privateUse.toLowerCase(Locale.ROOT);
    }


    /**
     * Returns the canonical string form of this language tag: language,
     * script, region, variants, extensions and private use, in that order,
     * joined with hyphens and omitting undefined parts.
     *
     * @return The language tag string.
     */
    @Override
    public String toString() {

        final StringBuilder sb = new StringBuilder(getLanguage());

        if (script != null) {
            sb.append('-').append(script);
        }

        if (region != null) {
            sb.append('-').append(region);
        }

        if (variants != null) {

            for (String variant: variants) {
                sb.append('-').append(variant);
            }
        }

        if (extensions != null) {

            for (String extension: extensions) {
                sb.append('-').append(extension);
            }
        }

        if (privateUse != null) {
            sb.append('-').append(privateUse);
        }

        return sb.toString();
    }


    /**
     * Overrides {@code Object.hashCode()}.
     *
     * @return The object hash code, derived from the string form.
     */
    @Override
    public int hashCode() {

        return toString().hashCode();
    }


    /**
     * Overrides {@code Object.equals()}.
     *
     * @param object The object to compare to.
     *
     * @return {@code true} if the objects have the same value, otherwise
     *         {@code false}.
     */
    @Override
    public boolean equals(Object object) {

        // instanceof is false for null, so no separate null check is needed
        return object instanceof LangTag &&
               toString().equals(object.toString());
    }


    /**
     * Parses the specified string representation of a language tag.
     *
     * @param s The string to parse. May be {@code null}.
     *
     * @return The language tag. {@code null} if the string was empty,
     *         blank or {@code null}.
     *
     * @throws LangTagException If the string has invalid language tag
     *                          syntax, including empty subtags such as
     *                          those produced by a leading, trailing or
     *                          doubled hyphen.
     */
    public static LangTag parse(final String s)
        throws LangTagException {

        if (s == null || s.trim().isEmpty()) {
            return null;
        }

        // The negative limit keeps trailing empty strings, so input such
        // as "en-" or "-" yields an empty subtag that is rejected below
        // instead of being dropped (or leaving an empty array)
        final String[] subtags = s.split("-", -1);

        int pos = 0;

        // Parse primary lang + ext lang subtags
        String primaryLang = null;

        if (pos < subtags.length && isPrimaryLanguage(subtags[pos])) {
            primaryLang = subtags[pos++];
        }

        // Multiple ext lang subtags possible
        final List<String> extLangSubtags = new LinkedList<String>();

        while (pos < subtags.length && isExtendedLanguageSubtag(subtags[pos])) {
            extLangSubtags.add(subtags[pos++]);
        }

        final LangTag langTag = new LangTag(primaryLang, extLangSubtags.toArray(new String[0]));

        // Parse script
        if (pos < subtags.length && isScript(subtags[pos])) {
            langTag.setScript(subtags[pos++]);
        }

        // Parse region
        if (pos < subtags.length && isRegion(subtags[pos])) {
            langTag.setRegion(subtags[pos++]);
        }

        // Parse variants
        final List<String> variantSubtags = new LinkedList<String>();

        while (pos < subtags.length && isVariant(subtags[pos])) {
            variantSubtags.add(subtags[pos++]);
        }

        if (! variantSubtags.isEmpty()) {
            langTag.setVariants(variantSubtags.toArray(new String[0]));
        }

        // Parse extensions, e.g. u-usercal
        final List<String> extSubtags = new LinkedList<String>();

        while (pos < subtags.length && isExtensionSingleton(subtags[pos])) {

            final String singleton = subtags[pos++];

            // A singleton must be followed by a subtag
            if (pos == subtags.length) {
                throw new LangTagException("Invalid extension subtag");
            }

            extSubtags.add(singleton + "-" + subtags[pos++]);
        }

        if (! extSubtags.isEmpty()) {
            langTag.setExtensions(extSubtags.toArray(new String[0]));
        }

        // Parse private use, e.g. x-abc; the singleton is case-insensitive
        if (pos < subtags.length && subtags[pos].equalsIgnoreCase("x")) {

            if (++pos == subtags.length) {
                throw new LangTagException("Invalid private use subtag");
            }

            langTag.setPrivateUse("x-" + subtags[pos++]);
        }

        // End of tag?
        if (pos < subtags.length) {
            throw new LangTagException("Invalid language tag: Unexpected subtag");
        }

        return langTag;
    }
}