001    /**
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     *     http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing, software
013     * distributed under the License is distributed on an "AS IS" BASIS,
014     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015     * See the License for the specific language governing permissions and
016     * limitations under the License.
017     */
018    
019    package org.apache.hadoop.io;
020    
021    import java.io.IOException;
022    import java.io.DataInput;
023    import java.io.DataOutput;
024    import java.nio.ByteBuffer;
025    import java.nio.CharBuffer;
026    import java.nio.charset.CharacterCodingException;
027    import java.nio.charset.Charset;
028    import java.nio.charset.CharsetDecoder;
029    import java.nio.charset.CharsetEncoder;
030    import java.nio.charset.CodingErrorAction;
031    import java.nio.charset.MalformedInputException;
032    import java.text.CharacterIterator;
033    import java.text.StringCharacterIterator;
034    import java.util.Arrays;
035    
036    import org.apache.avro.reflect.Stringable;
037    
038    import org.apache.hadoop.classification.InterfaceAudience;
039    import org.apache.hadoop.classification.InterfaceStability;
040    
041    /** This class stores text using standard UTF8 encoding.  It provides methods
042     * to serialize, deserialize, and compare texts at byte level.  The type of
043     * length is integer and is serialized using zero-compressed format.  <p>In
044     * addition, it provides methods for string traversal without converting the
045     * byte array to a string.  <p>Also includes utilities for
046     * serializing/deserialing a string, coding/decoding a string, checking if a
047     * byte array contains valid UTF8 code, calculating the length of an encoded
048     * string.
049     */
050    @Stringable
051    @InterfaceAudience.Public
052    @InterfaceStability.Stable
053    public class Text extends BinaryComparable
054        implements WritableComparable<BinaryComparable> {
055      
056      private static ThreadLocal<CharsetEncoder> ENCODER_FACTORY =
057        new ThreadLocal<CharsetEncoder>() {
058          protected CharsetEncoder initialValue() {
059            return Charset.forName("UTF-8").newEncoder().
060                   onMalformedInput(CodingErrorAction.REPORT).
061                   onUnmappableCharacter(CodingErrorAction.REPORT);
062        }
063      };
064      
065      private static ThreadLocal<CharsetDecoder> DECODER_FACTORY =
066        new ThreadLocal<CharsetDecoder>() {
067        protected CharsetDecoder initialValue() {
068          return Charset.forName("UTF-8").newDecoder().
069                 onMalformedInput(CodingErrorAction.REPORT).
070                 onUnmappableCharacter(CodingErrorAction.REPORT);
071        }
072      };
073      
074      private static final byte [] EMPTY_BYTES = new byte[0];
075      
076      private byte[] bytes;
077      private int length;
078    
079      public Text() {
080        bytes = EMPTY_BYTES;
081      }
082    
083      /** Construct from a string. 
084       */
085      public Text(String string) {
086        set(string);
087      }
088    
089      /** Construct from another text. */
090      public Text(Text utf8) {
091        set(utf8);
092      }
093    
094      /** Construct from a byte array.
095       */
096      public Text(byte[] utf8)  {
097        set(utf8);
098      }
099      
100      /**
101       * Get a copy of the bytes that is exactly the length of the data.
102       * See {@link #getBytes()} for faster access to the underlying array.
103       */
104      public byte[] copyBytes() {
105        byte[] result = new byte[length];
106        System.arraycopy(bytes, 0, result, 0, length);
107        return result;
108      }
109      
110      /**
111       * Returns the raw bytes; however, only data up to {@link #getLength()} is
112       * valid. Please use {@link #copyBytes()} if you
113       * need the returned array to be precisely the length of the data.
114       */
115      public byte[] getBytes() {
116        return bytes;
117      }
118    
119      /** Returns the number of bytes in the byte array */ 
120      public int getLength() {
121        return length;
122      }
123      
124      /**
125       * Returns the Unicode Scalar Value (32-bit integer value)
126       * for the character at <code>position</code>. Note that this
127       * method avoids using the converter or doing String instatiation
128       * @return the Unicode scalar value at position or -1
129       *          if the position is invalid or points to a
130       *          trailing byte
131       */
132      public int charAt(int position) {
133        if (position > this.length) return -1; // too long
134        if (position < 0) return -1; // duh.
135          
136        ByteBuffer bb = (ByteBuffer)ByteBuffer.wrap(bytes).position(position);
137        return bytesToCodePoint(bb.slice());
138      }
139      
140      public int find(String what) {
141        return find(what, 0);
142      }
143      
144      /**
145       * Finds any occurence of <code>what</code> in the backing
146       * buffer, starting as position <code>start</code>. The starting
147       * position is measured in bytes and the return value is in
148       * terms of byte position in the buffer. The backing buffer is
149       * not converted to a string for this operation.
150       * @return byte position of the first occurence of the search
151       *         string in the UTF-8 buffer or -1 if not found
152       */
153      public int find(String what, int start) {
154        try {
155          ByteBuffer src = ByteBuffer.wrap(this.bytes,0,this.length);
156          ByteBuffer tgt = encode(what);
157          byte b = tgt.get();
158          src.position(start);
159              
160          while (src.hasRemaining()) {
161            if (b == src.get()) { // matching first byte
162              src.mark(); // save position in loop
163              tgt.mark(); // save position in target
164              boolean found = true;
165              int pos = src.position()-1;
166              while (tgt.hasRemaining()) {
167                if (!src.hasRemaining()) { // src expired first
168                  tgt.reset();
169                  src.reset();
170                  found = false;
171                  break;
172                }
173                if (!(tgt.get() == src.get())) {
174                  tgt.reset();
175                  src.reset();
176                  found = false;
177                  break; // no match
178                }
179              }
180              if (found) return pos;
181            }
182          }
183          return -1; // not found
184        } catch (CharacterCodingException e) {
185          // can't get here
186          e.printStackTrace();
187          return -1;
188        }
189      }  
190      /** Set to contain the contents of a string. 
191       */
192      public void set(String string) {
193        try {
194          ByteBuffer bb = encode(string, true);
195          bytes = bb.array();
196          length = bb.limit();
197        }catch(CharacterCodingException e) {
198          throw new RuntimeException("Should not have happened ", e); 
199        }
200      }
201    
202      /** Set to a utf8 byte array
203       */
204      public void set(byte[] utf8) {
205        set(utf8, 0, utf8.length);
206      }
207      
208      /** copy a text. */
209      public void set(Text other) {
210        set(other.getBytes(), 0, other.getLength());
211      }
212    
213      /**
214       * Set the Text to range of bytes
215       * @param utf8 the data to copy from
216       * @param start the first position of the new string
217       * @param len the number of bytes of the new string
218       */
219      public void set(byte[] utf8, int start, int len) {
220        setCapacity(len, false);
221        System.arraycopy(utf8, start, bytes, 0, len);
222        this.length = len;
223      }
224    
225      /**
226       * Append a range of bytes to the end of the given text
227       * @param utf8 the data to copy from
228       * @param start the first position to append from utf8
229       * @param len the number of bytes to append
230       */
231      public void append(byte[] utf8, int start, int len) {
232        setCapacity(length + len, true);
233        System.arraycopy(utf8, start, bytes, length, len);
234        length += len;
235      }
236    
237      /**
238       * Clear the string to empty.
239       */
240      public void clear() {
241        length = 0;
242      }
243    
244      /*
245       * Sets the capacity of this Text object to <em>at least</em>
246       * <code>len</code> bytes. If the current buffer is longer,
247       * then the capacity and existing content of the buffer are
248       * unchanged. If <code>len</code> is larger
249       * than the current capacity, the Text object's capacity is
250       * increased to match.
251       * @param len the number of bytes we need
252       * @param keepData should the old data be kept
253       */
254      private void setCapacity(int len, boolean keepData) {
255        if (bytes == null || bytes.length < len) {
256          if (bytes != null && keepData) {
257            bytes = Arrays.copyOf(bytes, Math.max(len,length << 1));
258          } else {
259            bytes = new byte[len];
260          }
261        }
262      }
263       
264      /** 
265       * Convert text back to string
266       * @see java.lang.Object#toString()
267       */
268      @Override
269      public String toString() {
270        try {
271          return decode(bytes, 0, length);
272        } catch (CharacterCodingException e) { 
273          throw new RuntimeException("Should not have happened " , e); 
274        }
275      }
276      
277      /** deserialize 
278       */
279      public void readFields(DataInput in) throws IOException {
280        int newLength = WritableUtils.readVInt(in);
281        setCapacity(newLength, false);
282        in.readFully(bytes, 0, newLength);
283        length = newLength;
284      }
285    
286      /** Skips over one Text in the input. */
287      public static void skip(DataInput in) throws IOException {
288        int length = WritableUtils.readVInt(in);
289        WritableUtils.skipFully(in, length);
290      }
291    
292      /** serialize
293       * write this object to out
294       * length uses zero-compressed encoding
295       * @see Writable#write(DataOutput)
296       */
297      public void write(DataOutput out) throws IOException {
298        WritableUtils.writeVInt(out, length);
299        out.write(bytes, 0, length);
300      }
301    
302      /** Returns true iff <code>o</code> is a Text with the same contents.  */
303      public boolean equals(Object o) {
304        if (o instanceof Text)
305          return super.equals(o);
306        return false;
307      }
308    
309      @Override
310      public int hashCode() {
311        return super.hashCode();
312      }
313    
314      /** A WritableComparator optimized for Text keys. */
315      public static class Comparator extends WritableComparator {
316        public Comparator() {
317          super(Text.class);
318        }
319    
320        public int compare(byte[] b1, int s1, int l1,
321                           byte[] b2, int s2, int l2) {
322          int n1 = WritableUtils.decodeVIntSize(b1[s1]);
323          int n2 = WritableUtils.decodeVIntSize(b2[s2]);
324          return compareBytes(b1, s1+n1, l1-n1, b2, s2+n2, l2-n2);
325        }
326      }
327    
328      static {
329        // register this comparator
330        WritableComparator.define(Text.class, new Comparator());
331      }
332    
333      /// STATIC UTILITIES FROM HERE DOWN
334      /**
335       * Converts the provided byte array to a String using the
336       * UTF-8 encoding. If the input is malformed,
337       * replace by a default value.
338       */
339      public static String decode(byte[] utf8) throws CharacterCodingException {
340        return decode(ByteBuffer.wrap(utf8), true);
341      }
342      
343      public static String decode(byte[] utf8, int start, int length) 
344        throws CharacterCodingException {
345        return decode(ByteBuffer.wrap(utf8, start, length), true);
346      }
347      
348      /**
349       * Converts the provided byte array to a String using the
350       * UTF-8 encoding. If <code>replace</code> is true, then
351       * malformed input is replaced with the
352       * substitution character, which is U+FFFD. Otherwise the
353       * method throws a MalformedInputException.
354       */
355      public static String decode(byte[] utf8, int start, int length, boolean replace) 
356        throws CharacterCodingException {
357        return decode(ByteBuffer.wrap(utf8, start, length), replace);
358      }
359      
360      private static String decode(ByteBuffer utf8, boolean replace) 
361        throws CharacterCodingException {
362        CharsetDecoder decoder = DECODER_FACTORY.get();
363        if (replace) {
364          decoder.onMalformedInput(
365              java.nio.charset.CodingErrorAction.REPLACE);
366          decoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
367        }
368        String str = decoder.decode(utf8).toString();
369        // set decoder back to its default value: REPORT
370        if (replace) {
371          decoder.onMalformedInput(CodingErrorAction.REPORT);
372          decoder.onUnmappableCharacter(CodingErrorAction.REPORT);
373        }
374        return str;
375      }
376    
377      /**
378       * Converts the provided String to bytes using the
379       * UTF-8 encoding. If the input is malformed,
380       * invalid chars are replaced by a default value.
381       * @return ByteBuffer: bytes stores at ByteBuffer.array() 
382       *                     and length is ByteBuffer.limit()
383       */
384    
385      public static ByteBuffer encode(String string)
386        throws CharacterCodingException {
387        return encode(string, true);
388      }
389    
390      /**
391       * Converts the provided String to bytes using the
392       * UTF-8 encoding. If <code>replace</code> is true, then
393       * malformed input is replaced with the
394       * substitution character, which is U+FFFD. Otherwise the
395       * method throws a MalformedInputException.
396       * @return ByteBuffer: bytes stores at ByteBuffer.array() 
397       *                     and length is ByteBuffer.limit()
398       */
399      public static ByteBuffer encode(String string, boolean replace)
400        throws CharacterCodingException {
401        CharsetEncoder encoder = ENCODER_FACTORY.get();
402        if (replace) {
403          encoder.onMalformedInput(CodingErrorAction.REPLACE);
404          encoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
405        }
406        ByteBuffer bytes = 
407          encoder.encode(CharBuffer.wrap(string.toCharArray()));
408        if (replace) {
409          encoder.onMalformedInput(CodingErrorAction.REPORT);
410          encoder.onUnmappableCharacter(CodingErrorAction.REPORT);
411        }
412        return bytes;
413      }
414    
415      /** Read a UTF8 encoded string from in
416       */
417      public static String readString(DataInput in) throws IOException {
418        int length = WritableUtils.readVInt(in);
419        byte [] bytes = new byte[length];
420        in.readFully(bytes, 0, length);
421        return decode(bytes);
422      }
423    
424      /** Write a UTF8 encoded string to out
425       */
426      public static int writeString(DataOutput out, String s) throws IOException {
427        ByteBuffer bytes = encode(s);
428        int length = bytes.limit();
429        WritableUtils.writeVInt(out, length);
430        out.write(bytes.array(), 0, length);
431        return length;
432      }
433    
434      ////// states for validateUTF8
435      
436      private static final int LEAD_BYTE = 0;
437    
438      private static final int TRAIL_BYTE_1 = 1;
439    
440      private static final int TRAIL_BYTE = 2;
441    
442      /** 
443       * Check if a byte array contains valid utf-8
444       * @param utf8 byte array
445       * @throws MalformedInputException if the byte array contains invalid utf-8
446       */
447      public static void validateUTF8(byte[] utf8) throws MalformedInputException {
448        validateUTF8(utf8, 0, utf8.length);     
449      }
450      
451      /**
452       * Check to see if a byte array is valid utf-8
453       * @param utf8 the array of bytes
454       * @param start the offset of the first byte in the array
455       * @param len the length of the byte sequence
456       * @throws MalformedInputException if the byte array contains invalid bytes
457       */
458      public static void validateUTF8(byte[] utf8, int start, int len)
459        throws MalformedInputException {
460        int count = start;
461        int leadByte = 0;
462        int length = 0;
463        int state = LEAD_BYTE;
464        while (count < start+len) {
465          int aByte = ((int) utf8[count] & 0xFF);
466    
467          switch (state) {
468          case LEAD_BYTE:
469            leadByte = aByte;
470            length = bytesFromUTF8[aByte];
471    
472            switch (length) {
473            case 0: // check for ASCII
474              if (leadByte > 0x7F)
475                throw new MalformedInputException(count);
476              break;
477            case 1:
478              if (leadByte < 0xC2 || leadByte > 0xDF)
479                throw new MalformedInputException(count);
480              state = TRAIL_BYTE_1;
481              break;
482            case 2:
483              if (leadByte < 0xE0 || leadByte > 0xEF)
484                throw new MalformedInputException(count);
485              state = TRAIL_BYTE_1;
486              break;
487            case 3:
488              if (leadByte < 0xF0 || leadByte > 0xF4)
489                throw new MalformedInputException(count);
490              state = TRAIL_BYTE_1;
491              break;
492            default:
493              // too long! Longest valid UTF-8 is 4 bytes (lead + three)
494              // or if < 0 we got a trail byte in the lead byte position
495              throw new MalformedInputException(count);
496            } // switch (length)
497            break;
498    
499          case TRAIL_BYTE_1:
500            if (leadByte == 0xF0 && aByte < 0x90)
501              throw new MalformedInputException(count);
502            if (leadByte == 0xF4 && aByte > 0x8F)
503              throw new MalformedInputException(count);
504            if (leadByte == 0xE0 && aByte < 0xA0)
505              throw new MalformedInputException(count);
506            if (leadByte == 0xED && aByte > 0x9F)
507              throw new MalformedInputException(count);
508            // falls through to regular trail-byte test!!
509          case TRAIL_BYTE:
510            if (aByte < 0x80 || aByte > 0xBF)
511              throw new MalformedInputException(count);
512            if (--length == 0) {
513              state = LEAD_BYTE;
514            } else {
515              state = TRAIL_BYTE;
516            }
517            break;
518          } // switch (state)
519          count++;
520        }
521      }
522    
523      /**
524       * Magic numbers for UTF-8. These are the number of bytes
525       * that <em>follow</em> a given lead byte. Trailing bytes
526       * have the value -1. The values 4 and 5 are presented in
527       * this table, even though valid UTF-8 cannot include the
528       * five and six byte sequences.
529       */
530      static final int[] bytesFromUTF8 =
531      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
532        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
533        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
534        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
535        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
536        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
537        0, 0, 0, 0, 0, 0, 0,
538        // trail bytes
539        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
540        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
541        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
542        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1,
543        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
544        1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3,
545        3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5 };
546    
547      /**
548       * Returns the next code point at the current position in
549       * the buffer. The buffer's position will be incremented.
550       * Any mark set on this buffer will be changed by this method!
551       */
552      public static int bytesToCodePoint(ByteBuffer bytes) {
553        bytes.mark();
554        byte b = bytes.get();
555        bytes.reset();
556        int extraBytesToRead = bytesFromUTF8[(b & 0xFF)];
557        if (extraBytesToRead < 0) return -1; // trailing byte!
558        int ch = 0;
559    
560        switch (extraBytesToRead) {
561        case 5: ch += (bytes.get() & 0xFF); ch <<= 6; /* remember, illegal UTF-8 */
562        case 4: ch += (bytes.get() & 0xFF); ch <<= 6; /* remember, illegal UTF-8 */
563        case 3: ch += (bytes.get() & 0xFF); ch <<= 6;
564        case 2: ch += (bytes.get() & 0xFF); ch <<= 6;
565        case 1: ch += (bytes.get() & 0xFF); ch <<= 6;
566        case 0: ch += (bytes.get() & 0xFF);
567        }
568        ch -= offsetsFromUTF8[extraBytesToRead];
569    
570        return ch;
571      }
572    
573      
574      static final int offsetsFromUTF8[] =
575      { 0x00000000, 0x00003080,
576        0x000E2080, 0x03C82080, 0xFA082080, 0x82082080 };
577    
578      /**
579       * For the given string, returns the number of UTF-8 bytes
580       * required to encode the string.
581       * @param string text to encode
582       * @return number of UTF-8 bytes required to encode
583       */
584      public static int utf8Length(String string) {
585        CharacterIterator iter = new StringCharacterIterator(string);
586        char ch = iter.first();
587        int size = 0;
588        while (ch != CharacterIterator.DONE) {
589          if ((ch >= 0xD800) && (ch < 0xDC00)) {
590            // surrogate pair?
591            char trail = iter.next();
592            if ((trail > 0xDBFF) && (trail < 0xE000)) {
593              // valid pair
594              size += 4;
595            } else {
596              // invalid pair
597              size += 3;
598              iter.previous(); // rewind one
599            }
600          } else if (ch < 0x80) {
601            size++;
602          } else if (ch < 0x800) {
603            size += 2;
604          } else {
605            // ch < 0x10000, that is, the largest char value
606            size += 3;
607          }
608          ch = iter.next();
609        }
610        return size;
611      }
612    }