001/*
002 * lang-tag
003 *
004 * Copyright 2012-2016, Connect2id Ltd.
005 *
006 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
007 * this file except in compliance with the License. You may obtain a copy of the
008 * License at
009 *
010 *    http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software distributed
013 * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
014 * CONDITIONS OF ANY KIND, either express or implied. See the License for the
015 * specific language governing permissions and limitations under the License.
016 */
017
018package com.nimbusds.langtag;
019
020
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;
023
024
025/**
026 * Language tag according to <a href="http://tools.ietf.org/html/rfc5646">RFC 5646</a>.
027 *
028 * <p>Supports normal language tags. Special private language tags beginning 
029 * with "x" and grandfathered tags beginning with "i" are not supported.
030 *
031 * <p>To construct a new language tag from scratch:
032 *
033 * <pre>
034 * // English as used in the United States
035 * LangTag tag = new LangTag("en");
036 * tag.setRegion("US");
037 *
038 * // Returns "en-US"
039 * tag.toString();
040 * </pre>
041 *
042 * <p>To parse a language tag:
043 *
044 * <pre>
045 * // Chinese, Mandarin, Simplified script, as used in China
046 * LangTag tag = LangTag.parse("zh-cmn-Hans-CN");
047 *
048 * // Returns "zh"
049 * tag.getPrimaryLanguage();
050 *
051 * // Returns "cmn"
052 * tag.getExtendedLanguageSubtags()[0];
053 *
054 * // Returns "zh-cmn"
055 * tag.getLanguage();
056 *
057 * // Returns "Hans"
058 * tag.getScript();
059 *
060 * // Returns "CN"
061 * tag.getRegion();
062 * </pre>
063 *
064 * <p>See <a href="http://tools.ietf.org/html/rfc5646">RFC 5646</a>.
065 */
066public class LangTag implements ReadOnlyLangTag {
067
068
069        /**
070         * The primary language, as the shortest ISO 639 code (2*3ALPHA). Must
071         * always be defined, unless sufficient language subtags exist.
072         */
073        private String primaryLanguage;
074        
075        
076        /**
077         * Optional extended language subtags, as three-letter ISO-639-3 codes.
078         */
079        private String[] languageSubtags;
080        
081        
082        /**
083         * Optional script, (4ALPHA) ISO 15924 code.
084         */
085        private String script = null;
086
087
088        /**
089         * Optional region, (2ALPHA) ISO 3166-1 code or (3DIGIT) UN M.49 code.
090         */
091        private String region = null;
092        
093        
094        /**
095         * Optional variants, (5*8alphanum) or (DIGIT 3alphanum).
096         */
097        private String[] variants = null;
098        
099        
100        /**
101         * Optional extensions.
102         */
103        private String[] extensions = null;
104        
105        
106        /**
107         * Optional private use subtag.
108         */
109        private String privateUse = null;
110        
111        
112        /**
113         * Ensures the specified subtag has a valid maximum length of eight
114         * characters.
115         *
116         * @param subtag The sub tag to check. Must not be {@code null}.
117         *
118         * @throws LangTagException If the subtag has length greater than eight
119         *                          characters.
120         */
121        private static void ensureMaxLength(final String subtag)
122                throws LangTagException {
123                
124                if (subtag.length() > 8)
125                
126                        // extension or private use subtag?
127                        if (subtag.charAt(1) != '-' && subtag.length() > 10)
128                        
129                                throw new LangTagException("Invalid subtag syntax: Max character length exceeded");
130        }
131        
132        
133        /**
134         * Creates a new simple language tag.
135         *
136         * <p>Use for simple language tags such as "en" (English), "fr" 
137         * (French) or "pt" (Portuguese).
138         *
139         * @param primaryLanguage The primary language, as the shortest two or 
140         *                        three-letter ISO 639 code. Must not be 
141         *                        {@code null}.
142         *
143         * @throws LangTagException If the primary language syntax is invalid.
144         */
145        public LangTag(final String primaryLanguage)
146                throws LangTagException {
147        
148                this(primaryLanguage, new String[]{});
149        }
150        
151        
152        /**
153         * Creates a new extended language tag.
154         *
155         * <p>Use for extended language tags such as "zh-cmn" (Mandarin 
156         * Chinese) or "zh-yue" (Cantonese Chinese).
157         *
158         * @param primaryLanguage The primary language, as the shortest two or 
159         *                        three-letter ISO 639 code. May be {@code null}
160         *                        if the subtags are sufficient to identify the
161         *                        language.
162         * @param languageSubtags One or more extended language subtags, as
163         *                        three-letter ISO 639-3 codes. {@code null} if
164         *                        none.
165         *
166         * @throws LangTagException If the primary or extended language syntax 
167         *                          is invalid.
168         */
169        public LangTag(final String primaryLanguage, final String... languageSubtags)
170                throws LangTagException {
171                
172                if (primaryLanguage == null && 
173                    (languageSubtags == null || languageSubtags.length == 0))
174                        throw new LangTagException("Either the primary language or the extended language subtags, or both must be defined");
175                
176                setPrimaryLanguage(primaryLanguage);
177                setExtendedLanguageSubtags(languageSubtags);
178        }
179        
180        
181        @Override
182        public String getLanguage() {
183        
184                StringBuilder sb = new StringBuilder();
185                
186                if (primaryLanguage != null)
187                        sb.append(primaryLanguage);
188                        
189                if (languageSubtags != null && languageSubtags.length > 0) {
190                
191                        for (String tag: languageSubtags) {
192                        
193                                if (sb.length() > 0)
194                                        sb.append('-');
195                                
196                                sb.append(tag);
197                        }
198                }
199                        
200                return sb.toString();
201        }
202        
203        
204        @Override
205        public String getPrimaryLanguage() {
206        
207                return primaryLanguage;
208        }
209        
210        
211        /**
212         * Checks if the specified string has a valid primary language subtag 
213         * syntax.
214         *
215         * @param s The string to check. Must not be {@code null}.
216         *
217         * @return {@code true} if the syntax is correct, else {@code false}.
218         */
219        private static boolean isPrimaryLanguage(final String s) {
220        
221                return s.matches("[a-zA-Z]{2,3}");
222        }
223        
224        
225        /**
226         * Sets the primary language subtag.
227         *
228         * <p>See RFC 5646 section 2.2.1.
229         *
230         * @param primaryLanguage The primary language, as the shortest two or
231         *                        three-letter ISO 639 code. May be 
232         *                        {@code null}.
233         *
234         * @throws LangTagException If the primary language syntax is invalid.
235         */
236        private void setPrimaryLanguage(final String primaryLanguage)
237                throws LangTagException {
238                
239                if (primaryLanguage == null) {
240                        this.primaryLanguage = null;
241                        return;
242                }
243                
244                ensureMaxLength(primaryLanguage);
245                
246                if (! isPrimaryLanguage(primaryLanguage))
247                        throw new LangTagException("Invalid primary language subtag: Must be a two or three-letter ISO 639 code");
248                
249                this.primaryLanguage = primaryLanguage.toLowerCase();
250        }
251        
252        
253        @Override
254        public String[] getExtendedLanguageSubtags() {
255        
256                return languageSubtags;
257        }
258        
259        
260        /**
261         * Checks if the specified string has a valid extended language subtag 
262         * syntax.
263         *
264         * @param s The string to check. Must not be {@code null}.
265         *
266         * @return {@code true} if the syntax is correct, else {@code false}.
267         */
268        private static boolean isExtendedLanguageSubtag(final String s) {
269        
270                return s.matches("[a-zA-Z]{3}");
271        }
272        
273        
274        /**
275         * Sets the extended language subtags.
276         *
277         * <p>See RFC 5646 section 2.2.2.
278         *
279         * @param languageSubtags The extended language subtags, as three-letter
280         *                        ISO 639-3 codes. {@code null} if none.
281         */
282        private void setExtendedLanguageSubtags(final String... languageSubtags)
283                throws LangTagException {
284                
285                if (languageSubtags == null || languageSubtags.length == 0) {
286                        this.languageSubtags = null;
287                        return;
288                }
289
290                this.languageSubtags = new String[languageSubtags.length];
291                
292                for (int i=0; i < languageSubtags.length; i++) {
293                
294                        ensureMaxLength(languageSubtags[i]);
295
296                        if (! isExtendedLanguageSubtag(languageSubtags[i]))
297                                throw new LangTagException("Invalid extended language subtag: Must be a three-letter ISO 639-3 code");
298
299                        this.languageSubtags[i] = languageSubtags[i].toLowerCase();
300                }
301        }
302        
303        
304        @Override
305        public String getScript() {
306        
307                return script;
308        }
309        
310        
311        /**
312         * Checks if the specified string has a valid script subtag syntax.
313         *
314         * @param s The string to check. Must not be {@code null}.
315         *
316         * @return {@code true} if the syntax is correct, else {@code false}.
317         */
318        private static boolean isScript(final String s) {
319                
320                return s.matches("[a-zA-Z]{4}");
321        }
322        
323        
324        /**
325         * Sets the script.
326         *
327         * <p>See RFC 5646 section 2.2.3.
328         *
329         * @param script The script, as a four-letter ISO 15924 code. 
330         *               {@code null} if not defined.
331         *
332         * @throws LangTagException If the script syntax is invalid.
333         */
334        public void setScript(final String script)
335                throws LangTagException {
336        
337                if (script == null) {
338                        this.script = null;
339                        return;
340                }
341                
342                ensureMaxLength(script);
343                
344                if (! isScript(script))
345                        throw new LangTagException("Invalid script subtag: Must be a four-letter ISO 15924 code");
346                
347                this.script = script.substring(0, 1).toUpperCase() + 
348                              script.substring(1).toLowerCase();
349        }
350        
351        
352        @Override
353        public String getRegion() {
354        
355                return region;
356        }
357        
358        
359        /**
360         * Checks if the specified string has a valid region subtag syntax.
361         *
362         * @param s The string to check. Must not be {@code null}.
363         *
364         * @return {@code true} if the syntax is correct, else {@code false}.
365         */
366        private static boolean isRegion(final String s) {
367        
368                return s.matches("[a-zA-Z]{2}|\\d{3}");
369        }
370        
371        
372        /**
373         * Sets the region.
374         *
375         * <p>See RFC 5646 section 2.2.4.
376         *
377         * @param region The region, as a two-letter ISO 3166-1 code or a three-
378         *               digit UN M.49 code. {@code null} if not defined.
379         *
380         * @throws LangTagException If the region syntax is invalid.
381         */
382        public void setRegion(final String region)
383                throws LangTagException {
384                
385                if (region == null) {
386                        this.region = null;
387                        return;
388                }
389                
390                ensureMaxLength(region);
391                
392                if (! isRegion(region))
393                        throw new LangTagException("Invalid region subtag: Must be a two-letter ISO 3166-1 code or a three-digit UN M.49 code");
394        
395                this.region = region.toUpperCase();
396        }
397        
398        
399        @Override
400        public String[] getVariants() {
401        
402                return variants;
403        }
404        
405        
406        /**
407         * Checks if the specified string has a valid variant subtag syntax.
408         *
409         * @param s The string to check. Must not be {@code null}.
410         *
411         * @return {@code true} if the syntax is correct, else {@code false}.
412         */
413        private static boolean isVariant(final String s) {
414        
415                return s.matches("[a-zA-Z][a-zA-Z0-9]{4,}|[0-9][a-zA-Z0-9]{3,}");
416        }
417        
418        
419        /**
420         * Sets the variants.
421         *
422         * <p>See RFC 5646 section 2.2.5.
423         *
424         * @param variants The variants. {@code null} if not defined.
425         *
426         * @throws LangTagException If the variant syntax is invalid.
427         */
428        public void setVariants(final String... variants)
429                throws LangTagException {
430                
431                if (variants == null || variants.length == 0) {
432                        this.variants = null;
433                        return;
434                }
435        
436                this.variants = new String[variants.length];
437                
438                for (int i=0; i < variants.length; i++) {
439                
440                        ensureMaxLength(variants[i]);
441
442                        if (! isVariant(variants[i]))
443                                throw new LangTagException("Invalid variant subtag");
444
445                        this.variants[i] = variants[i].toLowerCase();
446                }
447        }
448        
449        
450        @Override
451        public String[] getExtensions() {
452        
453                return extensions;
454        }
455        
456        
457        /**
458         * Checks if the specified string has a valid extension singleton 
459         * syntax.
460         *
461         * @param s The string to check. Must not be {@code null}.
462         *
463         * @return {@code true} if the syntax is correct, else {@code false}.
464         */
465        private static boolean isExtensionSingleton(final String s) {
466        
467                return s.matches("[0-9a-wA-Wy-zY-Z]");
468        }
469        
470        
471        /**
472         * Checks if the specified string has a valid extension subtag syntax.
473         *
474         * @param s The string to check. Must not be {@code null}.
475         *
476         * @return {@code true} if the syntax is correct, else {@code false}.
477         */
478        private static boolean isExtension(final String s) {
479        
480                return s.matches("[0-9a-wA-Wy-zY-Z]-[0-9a-zA-Z]+");
481        }
482        
483        
484        /**
485         * Sets the extensions.
486         *
487         * <p>See RFC 5646 section 2.2.6.
488         *
489         * @param extensions The extensions. {@code null} if not defined.
490         *
491         * @throws LangTagException If the extension syntax is invalid.
492         */
493        public void setExtensions(final String... extensions)
494                throws LangTagException {
495                
496                if (extensions == null || extensions.length == 0) {
497                        this.extensions = null;
498                        return;
499                }
500        
501                this.extensions = new String[extensions.length];
502                
503                for (int i=0; i < extensions.length; i++) {
504                
505                        ensureMaxLength(extensions[i]);
506
507                        if (! isExtension(extensions[i]))
508                                throw new LangTagException("Invalid extension subtag");
509
510                        this.extensions[i] = extensions[i].toLowerCase();
511                }
512        }
513        
514        
515        @Override
516        public String getPrivateUse() {
517        
518                return privateUse;
519        }
520        
521        
522        /**
523         * Checks if the specified string has a valid private use subtag syntax.
524         *
525         * @param s The string to check. Must not be {@code null}.
526         *
527         * @return {@code true} if the syntax is correct, else {@code false}.
528         */
529        private static boolean isPrivateUse(final String s) {
530        
531                return s.matches("x-[0-9a-zA-Z]+");
532        }
533        
534        
535        /**
536         * Sets the private use.
537         *
538         * <p>See RFC 5646 section 2.2.7.
539         *
540         * @param privateUse The private use. {@code null} if not defined.
541         *
542         * @throws LangTagException If the extension syntax is invalid.
543         */
544        public void setPrivateUse(final String privateUse)
545                throws LangTagException {
546        
547                if (privateUse == null) {
548                        this.privateUse = null;
549                        return;
550                }
551                
552                ensureMaxLength(privateUse);
553                
554                if (! isPrivateUse(privateUse))
555                        throw new LangTagException("Invalid private use subtag");
556        
557                this.privateUse = privateUse.toLowerCase();
558        }
559        
560        
561        @Override
562        public String toString() {
563        
564                StringBuilder sb = new StringBuilder(getLanguage());
565                
566                if (script != null) {
567                        sb.append('-');
568                        sb.append(script);
569                }
570                
571                if (region != null) {
572                        sb.append('-');
573                        sb.append(region);
574                }
575                
576                if (variants != null) {
577                
578                        for (String v: variants) {
579                                sb.append('-');
580                                sb.append(v);
581                        }
582                }
583                
584                if (extensions != null) {
585                
586                        for (String e: extensions) {
587                                sb.append('-');
588                                sb.append(e);
589                        }
590                }
591                
592                if (privateUse != null) {
593                
594                        sb.append('-');
595                        sb.append(privateUse);
596                }
597                
598                return sb.toString();
599        }
600        
601        
602        /**
603         * Overrides {@code Object.hashCode()}.
604         *
605         * @return The object hash code.
606         */
607        @Override
608        public int hashCode() {
609        
610                return toString().hashCode();
611        }
612        
613        
614        /**
615         * Overrides {@code Object.equals()}.
616         *
617         * @param object The object to compare to.
618         *
619         * @return {@code true} if the objects have the same value, otherwise
620         *         {@code false}.
621         */
622        @Override
623        public boolean equals(Object object) {
624        
625                return object != null &&
626                       object instanceof LangTag && 
627                       this.toString().equals(object.toString());
628        }
629        
630        
631        /**
632         * Parses the specified string representation of a language tag.
633         *
634         * @param s The string to parse. May be {@code null}.
635         *
636         * @return The language tag. {@code null} if the string was empty or
637         *         {@code null}.
638         *
639         * @throws LangTagException If the string has invalid language tag 
640         *                          syntax.
641         */
642        public static LangTag parse(final String s)
643                throws LangTagException {
644                
645                if (s == null || s.trim().isEmpty())
646                        return null;
647                        
648                final String[] subtags = s.split("-");
649                
650                int pos = 0;
651                
652                // Parse primary lang + ext lang subtags
653                String primaryLang = null;
654                List<String> extLangSubtags = new LinkedList<String>();
655                
656                if (isPrimaryLanguage(subtags[0]))
657                        primaryLang = subtags[pos++];
658                
659                // Multiple ext lang subtags possible
660                while (pos < subtags.length && isExtendedLanguageSubtag(subtags[pos]))
661                        extLangSubtags.add(subtags[pos++]);
662                
663                LangTag langTag = new LangTag(primaryLang, extLangSubtags.toArray(new String[]{}));
664                
665                
666                // Parse script
667                if (pos < subtags.length && isScript(subtags[pos]))
668                        langTag.setScript(subtags[pos++]);
669                                
670                // Parse region
671                if (pos < subtags.length && isRegion(subtags[pos]))
672                        langTag.setRegion(subtags[pos++]);
673                                
674                // Parse variants
675                List<String> variantSubtags = new LinkedList<String>();
676                        
677                while (pos < subtags.length && isVariant(subtags[pos]))
678                        variantSubtags.add(subtags[pos++]);
679                        
680                if (! variantSubtags.isEmpty())
681                        langTag.setVariants(variantSubtags.toArray(new String[]{}));
682                        
683                // Parse extensions, e.g. u-usercal
684                List<String> extSubtags = new LinkedList<String>();
685                
686                while (pos < subtags.length && isExtensionSingleton(subtags[pos])) {
687                        
688                        String singleton = subtags[pos++];
689                        
690                        if (pos == subtags.length)
691                                throw new LangTagException("Invalid extension subtag");
692                        
693                        extSubtags.add(singleton + "-" + subtags[pos++]);
694                }
695                        
696                if (! extSubtags.isEmpty())
697                        langTag.setExtensions(extSubtags.toArray(new String[]{}));
698                        
699                        
700                // Parse private use, e.g. x-abc
701                if (pos < subtags.length && subtags[pos].equals("x")) {
702                
703                        if (++pos == subtags.length)
704                                throw new LangTagException("Invalid private use subtag");
705                        
706                        langTag.setPrivateUse("x-" + subtags[pos++]);
707                }
708                
709                // End of tag?
710                if (pos < subtags.length)
711                        throw new LangTagException("Invalid language tag: Unexpected subtag");
712                
713                return langTag;
714        }
715}