001    /**
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     *     http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing, software
013     * distributed under the License is distributed on an "AS IS" BASIS,
014     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015     * See the License for the specific language governing permissions and
016     * limitations under the License.
017     */
018    
019    package org.apache.hadoop.io;
020    
021    import java.io.IOException;
022    import java.io.DataInput;
023    import java.io.DataOutput;
024    import java.nio.ByteBuffer;
025    import java.nio.CharBuffer;
026    import java.nio.charset.CharacterCodingException;
027    import java.nio.charset.Charset;
028    import java.nio.charset.CharsetDecoder;
029    import java.nio.charset.CharsetEncoder;
030    import java.nio.charset.CodingErrorAction;
031    import java.nio.charset.MalformedInputException;
032    import java.text.CharacterIterator;
033    import java.text.StringCharacterIterator;
034    import java.util.Arrays;
035    
036    import org.apache.avro.reflect.Stringable;
037    
038    import org.apache.hadoop.classification.InterfaceAudience;
039    import org.apache.hadoop.classification.InterfaceStability;
040    
041    /** This class stores text using standard UTF8 encoding.  It provides methods
042     * to serialize, deserialize, and compare texts at byte level.  The type of
043     * length is integer and is serialized using zero-compressed format.  <p>In
044     * addition, it provides methods for string traversal without converting the
045     * byte array to a string.  <p>Also includes utilities for
046     * serializing/deserialing a string, coding/decoding a string, checking if a
047     * byte array contains valid UTF8 code, calculating the length of an encoded
048     * string.
049     */
050    @Stringable
051    @InterfaceAudience.Public
052    @InterfaceStability.Stable
053    public class Text extends BinaryComparable
054        implements WritableComparable<BinaryComparable> {
055      
056      private static ThreadLocal<CharsetEncoder> ENCODER_FACTORY =
057        new ThreadLocal<CharsetEncoder>() {
058          @Override
059          protected CharsetEncoder initialValue() {
060            return Charset.forName("UTF-8").newEncoder().
061                   onMalformedInput(CodingErrorAction.REPORT).
062                   onUnmappableCharacter(CodingErrorAction.REPORT);
063        }
064      };
065      
066      private static ThreadLocal<CharsetDecoder> DECODER_FACTORY =
067        new ThreadLocal<CharsetDecoder>() {
068        @Override
069        protected CharsetDecoder initialValue() {
070          return Charset.forName("UTF-8").newDecoder().
071                 onMalformedInput(CodingErrorAction.REPORT).
072                 onUnmappableCharacter(CodingErrorAction.REPORT);
073        }
074      };
075      
076      private static final byte [] EMPTY_BYTES = new byte[0];
077      
078      private byte[] bytes;
079      private int length;
080    
081      public Text() {
082        bytes = EMPTY_BYTES;
083      }
084    
085      /** Construct from a string. 
086       */
087      public Text(String string) {
088        set(string);
089      }
090    
091      /** Construct from another text. */
092      public Text(Text utf8) {
093        set(utf8);
094      }
095    
096      /** Construct from a byte array.
097       */
098      public Text(byte[] utf8)  {
099        set(utf8);
100      }
101      
102      /**
103       * Get a copy of the bytes that is exactly the length of the data.
104       * See {@link #getBytes()} for faster access to the underlying array.
105       */
106      public byte[] copyBytes() {
107        byte[] result = new byte[length];
108        System.arraycopy(bytes, 0, result, 0, length);
109        return result;
110      }
111      
112      /**
113       * Returns the raw bytes; however, only data up to {@link #getLength()} is
114       * valid. Please use {@link #copyBytes()} if you
115       * need the returned array to be precisely the length of the data.
116       */
117      @Override
118      public byte[] getBytes() {
119        return bytes;
120      }
121    
122      /** Returns the number of bytes in the byte array */ 
123      @Override
124      public int getLength() {
125        return length;
126      }
127      
128      /**
129       * Returns the Unicode Scalar Value (32-bit integer value)
130       * for the character at <code>position</code>. Note that this
131       * method avoids using the converter or doing String instantiation
132       * @return the Unicode scalar value at position or -1
133       *          if the position is invalid or points to a
134       *          trailing byte
135       */
136      public int charAt(int position) {
137        if (position > this.length) return -1; // too long
138        if (position < 0) return -1; // duh.
139          
140        ByteBuffer bb = (ByteBuffer)ByteBuffer.wrap(bytes).position(position);
141        return bytesToCodePoint(bb.slice());
142      }
143      
144      public int find(String what) {
145        return find(what, 0);
146      }
147      
148      /**
149       * Finds any occurence of <code>what</code> in the backing
150       * buffer, starting as position <code>start</code>. The starting
151       * position is measured in bytes and the return value is in
152       * terms of byte position in the buffer. The backing buffer is
153       * not converted to a string for this operation.
154       * @return byte position of the first occurence of the search
155       *         string in the UTF-8 buffer or -1 if not found
156       */
157      public int find(String what, int start) {
158        try {
159          ByteBuffer src = ByteBuffer.wrap(this.bytes,0,this.length);
160          ByteBuffer tgt = encode(what);
161          byte b = tgt.get();
162          src.position(start);
163              
164          while (src.hasRemaining()) {
165            if (b == src.get()) { // matching first byte
166              src.mark(); // save position in loop
167              tgt.mark(); // save position in target
168              boolean found = true;
169              int pos = src.position()-1;
170              while (tgt.hasRemaining()) {
171                if (!src.hasRemaining()) { // src expired first
172                  tgt.reset();
173                  src.reset();
174                  found = false;
175                  break;
176                }
177                if (!(tgt.get() == src.get())) {
178                  tgt.reset();
179                  src.reset();
180                  found = false;
181                  break; // no match
182                }
183              }
184              if (found) return pos;
185            }
186          }
187          return -1; // not found
188        } catch (CharacterCodingException e) {
189          // can't get here
190          e.printStackTrace();
191          return -1;
192        }
193      }  
194      /** Set to contain the contents of a string. 
195       */
196      public void set(String string) {
197        try {
198          ByteBuffer bb = encode(string, true);
199          bytes = bb.array();
200          length = bb.limit();
201        }catch(CharacterCodingException e) {
202          throw new RuntimeException("Should not have happened ", e); 
203        }
204      }
205    
206      /** Set to a utf8 byte array
207       */
208      public void set(byte[] utf8) {
209        set(utf8, 0, utf8.length);
210      }
211      
212      /** copy a text. */
213      public void set(Text other) {
214        set(other.getBytes(), 0, other.getLength());
215      }
216    
217      /**
218       * Set the Text to range of bytes
219       * @param utf8 the data to copy from
220       * @param start the first position of the new string
221       * @param len the number of bytes of the new string
222       */
223      public void set(byte[] utf8, int start, int len) {
224        setCapacity(len, false);
225        System.arraycopy(utf8, start, bytes, 0, len);
226        this.length = len;
227      }
228    
229      /**
230       * Append a range of bytes to the end of the given text
231       * @param utf8 the data to copy from
232       * @param start the first position to append from utf8
233       * @param len the number of bytes to append
234       */
235      public void append(byte[] utf8, int start, int len) {
236        setCapacity(length + len, true);
237        System.arraycopy(utf8, start, bytes, length, len);
238        length += len;
239      }
240    
241      /**
242       * Clear the string to empty.
243       *
244       * <em>Note</em>: For performance reasons, this call does not clear the
245       * underlying byte array that is retrievable via {@link #getBytes()}.
246       * In order to free the byte-array memory, call {@link #set(byte[])}
247       * with an empty byte array (For example, <code>new byte[0]</code>).
248       */
249      public void clear() {
250        length = 0;
251      }
252    
253      /*
254       * Sets the capacity of this Text object to <em>at least</em>
255       * <code>len</code> bytes. If the current buffer is longer,
256       * then the capacity and existing content of the buffer are
257       * unchanged. If <code>len</code> is larger
258       * than the current capacity, the Text object's capacity is
259       * increased to match.
260       * @param len the number of bytes we need
261       * @param keepData should the old data be kept
262       */
263      private void setCapacity(int len, boolean keepData) {
264        if (bytes == null || bytes.length < len) {
265          if (bytes != null && keepData) {
266            bytes = Arrays.copyOf(bytes, Math.max(len,length << 1));
267          } else {
268            bytes = new byte[len];
269          }
270        }
271      }
272       
273      /** 
274       * Convert text back to string
275       * @see java.lang.Object#toString()
276       */
277      @Override
278      public String toString() {
279        try {
280          return decode(bytes, 0, length);
281        } catch (CharacterCodingException e) { 
282          throw new RuntimeException("Should not have happened " , e); 
283        }
284      }
285      
286      /** deserialize 
287       */
288      @Override
289      public void readFields(DataInput in) throws IOException {
290        int newLength = WritableUtils.readVInt(in);
291        setCapacity(newLength, false);
292        in.readFully(bytes, 0, newLength);
293        length = newLength;
294      }
295      
296      public void readFields(DataInput in, int maxLength) throws IOException {
297        int newLength = WritableUtils.readVInt(in);
298        if (newLength < 0) {
299          throw new IOException("tried to deserialize " + newLength +
300              " bytes of data!  newLength must be non-negative.");
301        } else if (newLength >= maxLength) {
302          throw new IOException("tried to deserialize " + newLength +
303              " bytes of data, but maxLength = " + maxLength);
304        }
305        setCapacity(newLength, false);
306        in.readFully(bytes, 0, newLength);
307        length = newLength;
308      }
309    
310      /** Skips over one Text in the input. */
311      public static void skip(DataInput in) throws IOException {
312        int length = WritableUtils.readVInt(in);
313        WritableUtils.skipFully(in, length);
314      }
315    
316      /** serialize
317       * write this object to out
318       * length uses zero-compressed encoding
319       * @see Writable#write(DataOutput)
320       */
321      @Override
322      public void write(DataOutput out) throws IOException {
323        WritableUtils.writeVInt(out, length);
324        out.write(bytes, 0, length);
325      }
326    
327      public void write(DataOutput out, int maxLength) throws IOException {
328        if (length > maxLength) {
329          throw new IOException("data was too long to write!  Expected " +
330              "less than or equal to " + maxLength + " bytes, but got " +
331              length + " bytes.");
332        }
333        WritableUtils.writeVInt(out, length);
334        out.write(bytes, 0, length);
335      }
336    
337      /** Returns true iff <code>o</code> is a Text with the same contents.  */
338      @Override
339      public boolean equals(Object o) {
340        if (o instanceof Text)
341          return super.equals(o);
342        return false;
343      }
344    
345      @Override
346      public int hashCode() {
347        return super.hashCode();
348      }
349    
350      /** A WritableComparator optimized for Text keys. */
351      public static class Comparator extends WritableComparator {
352        public Comparator() {
353          super(Text.class);
354        }
355    
356        @Override
357        public int compare(byte[] b1, int s1, int l1,
358                           byte[] b2, int s2, int l2) {
359          int n1 = WritableUtils.decodeVIntSize(b1[s1]);
360          int n2 = WritableUtils.decodeVIntSize(b2[s2]);
361          return compareBytes(b1, s1+n1, l1-n1, b2, s2+n2, l2-n2);
362        }
363      }
364    
365      static {
366        // register this comparator
367        WritableComparator.define(Text.class, new Comparator());
368      }
369    
370      /// STATIC UTILITIES FROM HERE DOWN
371      /**
372       * Converts the provided byte array to a String using the
373       * UTF-8 encoding. If the input is malformed,
374       * replace by a default value.
375       */
376      public static String decode(byte[] utf8) throws CharacterCodingException {
377        return decode(ByteBuffer.wrap(utf8), true);
378      }
379      
380      public static String decode(byte[] utf8, int start, int length) 
381        throws CharacterCodingException {
382        return decode(ByteBuffer.wrap(utf8, start, length), true);
383      }
384      
385      /**
386       * Converts the provided byte array to a String using the
387       * UTF-8 encoding. If <code>replace</code> is true, then
388       * malformed input is replaced with the
389       * substitution character, which is U+FFFD. Otherwise the
390       * method throws a MalformedInputException.
391       */
392      public static String decode(byte[] utf8, int start, int length, boolean replace) 
393        throws CharacterCodingException {
394        return decode(ByteBuffer.wrap(utf8, start, length), replace);
395      }
396      
397      private static String decode(ByteBuffer utf8, boolean replace) 
398        throws CharacterCodingException {
399        CharsetDecoder decoder = DECODER_FACTORY.get();
400        if (replace) {
401          decoder.onMalformedInput(
402              java.nio.charset.CodingErrorAction.REPLACE);
403          decoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
404        }
405        String str = decoder.decode(utf8).toString();
406        // set decoder back to its default value: REPORT
407        if (replace) {
408          decoder.onMalformedInput(CodingErrorAction.REPORT);
409          decoder.onUnmappableCharacter(CodingErrorAction.REPORT);
410        }
411        return str;
412      }
413    
414      /**
415       * Converts the provided String to bytes using the
416       * UTF-8 encoding. If the input is malformed,
417       * invalid chars are replaced by a default value.
418       * @return ByteBuffer: bytes stores at ByteBuffer.array() 
419       *                     and length is ByteBuffer.limit()
420       */
421    
422      public static ByteBuffer encode(String string)
423        throws CharacterCodingException {
424        return encode(string, true);
425      }
426    
427      /**
428       * Converts the provided String to bytes using the
429       * UTF-8 encoding. If <code>replace</code> is true, then
430       * malformed input is replaced with the
431       * substitution character, which is U+FFFD. Otherwise the
432       * method throws a MalformedInputException.
433       * @return ByteBuffer: bytes stores at ByteBuffer.array() 
434       *                     and length is ByteBuffer.limit()
435       */
436      public static ByteBuffer encode(String string, boolean replace)
437        throws CharacterCodingException {
438        CharsetEncoder encoder = ENCODER_FACTORY.get();
439        if (replace) {
440          encoder.onMalformedInput(CodingErrorAction.REPLACE);
441          encoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
442        }
443        ByteBuffer bytes = 
444          encoder.encode(CharBuffer.wrap(string.toCharArray()));
445        if (replace) {
446          encoder.onMalformedInput(CodingErrorAction.REPORT);
447          encoder.onUnmappableCharacter(CodingErrorAction.REPORT);
448        }
449        return bytes;
450      }
451    
452      static final public int DEFAULT_MAX_LEN = 1024 * 1024;
453    
454      /** Read a UTF8 encoded string from in
455       */
456      public static String readString(DataInput in) throws IOException {
457        int length = WritableUtils.readVInt(in);
458        byte [] bytes = new byte[length];
459        in.readFully(bytes, 0, length);
460        return decode(bytes);
461      }
462      
463      /** Read a UTF8 encoded string with a maximum size
464       */
465      public static String readString(DataInput in, int maxLength)
466          throws IOException {
467        int length = WritableUtils.readVIntInRange(in, 0, maxLength);
468        byte [] bytes = new byte[length];
469        in.readFully(bytes, 0, length);
470        return decode(bytes);
471      }
472      
473      /** Write a UTF8 encoded string to out
474       */
475      public static int writeString(DataOutput out, String s) throws IOException {
476        ByteBuffer bytes = encode(s);
477        int length = bytes.limit();
478        WritableUtils.writeVInt(out, length);
479        out.write(bytes.array(), 0, length);
480        return length;
481      }
482    
483      /** Write a UTF8 encoded string with a maximum size to out
484       */
485      public static int writeString(DataOutput out, String s, int maxLength)
486          throws IOException {
487        ByteBuffer bytes = encode(s);
488        int length = bytes.limit();
489        if (length > maxLength) {
490          throw new IOException("string was too long to write!  Expected " +
491              "less than or equal to " + maxLength + " bytes, but got " +
492              length + " bytes.");
493        }
494        WritableUtils.writeVInt(out, length);
495        out.write(bytes.array(), 0, length);
496        return length;
497      }
498    
499      ////// states for validateUTF8
500      
501      private static final int LEAD_BYTE = 0;
502    
503      private static final int TRAIL_BYTE_1 = 1;
504    
505      private static final int TRAIL_BYTE = 2;
506    
507      /** 
508       * Check if a byte array contains valid utf-8
509       * @param utf8 byte array
510       * @throws MalformedInputException if the byte array contains invalid utf-8
511       */
512      public static void validateUTF8(byte[] utf8) throws MalformedInputException {
513        validateUTF8(utf8, 0, utf8.length);     
514      }
515      
516      /**
517       * Check to see if a byte array is valid utf-8
518       * @param utf8 the array of bytes
519       * @param start the offset of the first byte in the array
520       * @param len the length of the byte sequence
521       * @throws MalformedInputException if the byte array contains invalid bytes
522       */
523      public static void validateUTF8(byte[] utf8, int start, int len)
524        throws MalformedInputException {
525        int count = start;
526        int leadByte = 0;
527        int length = 0;
528        int state = LEAD_BYTE;
529        while (count < start+len) {
530          int aByte = utf8[count] & 0xFF;
531    
532          switch (state) {
533          case LEAD_BYTE:
534            leadByte = aByte;
535            length = bytesFromUTF8[aByte];
536    
537            switch (length) {
538            case 0: // check for ASCII
539              if (leadByte > 0x7F)
540                throw new MalformedInputException(count);
541              break;
542            case 1:
543              if (leadByte < 0xC2 || leadByte > 0xDF)
544                throw new MalformedInputException(count);
545              state = TRAIL_BYTE_1;
546              break;
547            case 2:
548              if (leadByte < 0xE0 || leadByte > 0xEF)
549                throw new MalformedInputException(count);
550              state = TRAIL_BYTE_1;
551              break;
552            case 3:
553              if (leadByte < 0xF0 || leadByte > 0xF4)
554                throw new MalformedInputException(count);
555              state = TRAIL_BYTE_1;
556              break;
557            default:
558              // too long! Longest valid UTF-8 is 4 bytes (lead + three)
559              // or if < 0 we got a trail byte in the lead byte position
560              throw new MalformedInputException(count);
561            } // switch (length)
562            break;
563    
564          case TRAIL_BYTE_1:
565            if (leadByte == 0xF0 && aByte < 0x90)
566              throw new MalformedInputException(count);
567            if (leadByte == 0xF4 && aByte > 0x8F)
568              throw new MalformedInputException(count);
569            if (leadByte == 0xE0 && aByte < 0xA0)
570              throw new MalformedInputException(count);
571            if (leadByte == 0xED && aByte > 0x9F)
572              throw new MalformedInputException(count);
573            // falls through to regular trail-byte test!!
574          case TRAIL_BYTE:
575            if (aByte < 0x80 || aByte > 0xBF)
576              throw new MalformedInputException(count);
577            if (--length == 0) {
578              state = LEAD_BYTE;
579            } else {
580              state = TRAIL_BYTE;
581            }
582            break;
583          } // switch (state)
584          count++;
585        }
586      }
587    
588      /**
589       * Magic numbers for UTF-8. These are the number of bytes
590       * that <em>follow</em> a given lead byte. Trailing bytes
591       * have the value -1. The values 4 and 5 are presented in
592       * this table, even though valid UTF-8 cannot include the
593       * five and six byte sequences.
594       */
595      static final int[] bytesFromUTF8 =
596      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
597        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
598        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
599        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
600        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
601        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
602        0, 0, 0, 0, 0, 0, 0,
603        // trail bytes
604        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
605        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
606        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
607        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1,
608        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
609        1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3,
610        3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5 };
611    
612      /**
613       * Returns the next code point at the current position in
614       * the buffer. The buffer's position will be incremented.
615       * Any mark set on this buffer will be changed by this method!
616       */
617      public static int bytesToCodePoint(ByteBuffer bytes) {
618        bytes.mark();
619        byte b = bytes.get();
620        bytes.reset();
621        int extraBytesToRead = bytesFromUTF8[(b & 0xFF)];
622        if (extraBytesToRead < 0) return -1; // trailing byte!
623        int ch = 0;
624    
625        switch (extraBytesToRead) {
626        case 5: ch += (bytes.get() & 0xFF); ch <<= 6; /* remember, illegal UTF-8 */
627        case 4: ch += (bytes.get() & 0xFF); ch <<= 6; /* remember, illegal UTF-8 */
628        case 3: ch += (bytes.get() & 0xFF); ch <<= 6;
629        case 2: ch += (bytes.get() & 0xFF); ch <<= 6;
630        case 1: ch += (bytes.get() & 0xFF); ch <<= 6;
631        case 0: ch += (bytes.get() & 0xFF);
632        }
633        ch -= offsetsFromUTF8[extraBytesToRead];
634    
635        return ch;
636      }
637    
638      
639      static final int offsetsFromUTF8[] =
640      { 0x00000000, 0x00003080,
641        0x000E2080, 0x03C82080, 0xFA082080, 0x82082080 };
642    
643      /**
644       * For the given string, returns the number of UTF-8 bytes
645       * required to encode the string.
646       * @param string text to encode
647       * @return number of UTF-8 bytes required to encode
648       */
649      public static int utf8Length(String string) {
650        CharacterIterator iter = new StringCharacterIterator(string);
651        char ch = iter.first();
652        int size = 0;
653        while (ch != CharacterIterator.DONE) {
654          if ((ch >= 0xD800) && (ch < 0xDC00)) {
655            // surrogate pair?
656            char trail = iter.next();
657            if ((trail > 0xDBFF) && (trail < 0xE000)) {
658              // valid pair
659              size += 4;
660            } else {
661              // invalid pair
662              size += 3;
663              iter.previous(); // rewind one
664            }
665          } else if (ch < 0x80) {
666            size++;
667          } else if (ch < 0x800) {
668            size += 2;
669          } else {
670            // ch < 0x10000, that is, the largest char value
671            size += 3;
672          }
673          ch = iter.next();
674        }
675        return size;
676      }
677    }