001    /**
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     *     http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing, software
013     * distributed under the License is distributed on an "AS IS" BASIS,
014     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015     * See the License for the specific language governing permissions and
016     * limitations under the License.
017     */
018    
019    package org.apache.hadoop.io;
020    
021    import java.io.IOException;
022    import java.io.DataInput;
023    import java.io.DataOutput;
024    import java.io.UTFDataFormatException;
025    
026    import org.apache.hadoop.util.StringUtils;
027    
028    import org.apache.commons.logging.*;
029    import org.apache.hadoop.classification.InterfaceAudience;
030    import org.apache.hadoop.classification.InterfaceStability;
031    
032    /** A WritableComparable for strings that uses the UTF8 encoding.
033     * 
034     * <p>Also includes utilities for efficiently reading and writing UTF-8.
035     *
036     * Note that this decodes UTF-8 but actually encodes CESU-8, a variant of
037     * UTF-8: see http://en.wikipedia.org/wiki/CESU-8
038     *
039     * @deprecated replaced by Text
040     */
041    @Deprecated
042    @InterfaceAudience.LimitedPrivate({"HDFS", "MapReduce"})
043    @InterfaceStability.Stable
044    public class UTF8 implements WritableComparable<UTF8> {
045      private static final Log LOG= LogFactory.getLog(UTF8.class);
046      private static final DataInputBuffer IBUF = new DataInputBuffer();
047    
048      private static final ThreadLocal<DataOutputBuffer> OBUF_FACTORY =
049        new ThreadLocal<DataOutputBuffer>(){
050        @Override
051        protected DataOutputBuffer initialValue() {
052          return new DataOutputBuffer();
053        }
054      };
055    
056      private static final byte[] EMPTY_BYTES = new byte[0];
057    
058      private byte[] bytes = EMPTY_BYTES;
059      private int length;
060    
061      public UTF8() {
062        //set("");
063      }
064    
065      /** Construct from a given string. */
066      public UTF8(String string) {
067        set(string);
068      }
069    
070      /** Construct from a given string. */
071      public UTF8(UTF8 utf8) {
072        set(utf8);
073      }
074    
075      /** The raw bytes. */
076      public byte[] getBytes() {
077        return bytes;
078      }
079    
080      /** The number of bytes in the encoded string. */
081      public int getLength() {
082        return length;
083      }
084    
085      /** Set to contain the contents of a string. */
086      public void set(String string) {
087        if (string.length() > 0xffff/3) {             // maybe too long
088          LOG.warn("truncating long string: " + string.length()
089                   + " chars, starting with " + string.substring(0, 20));
090          string = string.substring(0, 0xffff/3);
091        }
092    
093        length = utf8Length(string);                  // compute length
094        if (length > 0xffff)                          // double-check length
095          throw new RuntimeException("string too long!");
096    
097        if (bytes == null || length > bytes.length)   // grow buffer
098          bytes = new byte[length];
099    
100        try {                                         // avoid sync'd allocations
101          DataOutputBuffer obuf = OBUF_FACTORY.get();
102          obuf.reset();
103          writeChars(obuf, string, 0, string.length());
104          System.arraycopy(obuf.getData(), 0, bytes, 0, length);
105        } catch (IOException e) {
106          throw new RuntimeException(e);
107        }
108      }
109    
110      /** Set to contain the contents of a string. */
111      public void set(UTF8 other) {
112        length = other.length;
113        if (bytes == null || length > bytes.length)   // grow buffer
114          bytes = new byte[length];
115        System.arraycopy(other.bytes, 0, bytes, 0, length);
116      }
117    
118      @Override
119      public void readFields(DataInput in) throws IOException {
120        length = in.readUnsignedShort();
121        if (bytes == null || bytes.length < length)
122          bytes = new byte[length];
123        in.readFully(bytes, 0, length);
124      }
125    
126      /** Skips over one UTF8 in the input. */
127      public static void skip(DataInput in) throws IOException {
128        int length = in.readUnsignedShort();
129        WritableUtils.skipFully(in, length);
130      }
131    
132      @Override
133      public void write(DataOutput out) throws IOException {
134        out.writeShort(length);
135        out.write(bytes, 0, length);
136      }
137    
138      /** Compare two UTF8s. */
139      @Override
140      public int compareTo(UTF8 o) {
141        return WritableComparator.compareBytes(bytes, 0, length,
142                                               o.bytes, 0, o.length);
143      }
144    
145      /** Convert to a String. */
146      @Override
147      public String toString() {
148        StringBuilder buffer = new StringBuilder(length);
149        try {
150          synchronized (IBUF) {
151            IBUF.reset(bytes, length);
152            readChars(IBUF, buffer, length);
153          }
154        } catch (IOException e) {
155          throw new RuntimeException(e);
156        }
157        return buffer.toString();
158      }
159      
160      /**
161       * Convert to a string, checking for valid UTF8.
162       * @return the converted string
163       * @throws UTFDataFormatException if the underlying bytes contain invalid
164       * UTF8 data.
165       */
166      public String toStringChecked() throws IOException {
167        StringBuilder buffer = new StringBuilder(length);
168        synchronized (IBUF) {
169          IBUF.reset(bytes, length);
170          readChars(IBUF, buffer, length);
171        }
172        return buffer.toString();
173      }
174    
175      /** Returns true iff <code>o</code> is a UTF8 with the same contents.  */
176      @Override
177      public boolean equals(Object o) {
178        if (!(o instanceof UTF8))
179          return false;
180        UTF8 that = (UTF8)o;
181        if (this.length != that.length)
182          return false;
183        else
184          return WritableComparator.compareBytes(bytes, 0, length,
185                                                 that.bytes, 0, that.length) == 0;
186      }
187    
188      @Override
189      public int hashCode() {
190        return WritableComparator.hashBytes(bytes, length);
191      }
192    
193      /** A WritableComparator optimized for UTF8 keys. */
194      public static class Comparator extends WritableComparator {
195        public Comparator() {
196          super(UTF8.class);
197        }
198    
199        @Override
200        public int compare(byte[] b1, int s1, int l1,
201                           byte[] b2, int s2, int l2) {
202          int n1 = readUnsignedShort(b1, s1);
203          int n2 = readUnsignedShort(b2, s2);
204          return compareBytes(b1, s1+2, n1, b2, s2+2, n2);
205        }
206      }
207    
208      static {                                        // register this comparator
209        WritableComparator.define(UTF8.class, new Comparator());
210      }
211    
212      /// STATIC UTILITIES FROM HERE DOWN
213    
214      /// These are probably not used much anymore, and might be removed...
215    
216      /** Convert a string to a UTF-8 encoded byte array.
217       * @see String#getBytes(String)
218       */
219      public static byte[] getBytes(String string) {
220        byte[] result = new byte[utf8Length(string)];
221        try {                                         // avoid sync'd allocations
222          DataOutputBuffer obuf = OBUF_FACTORY.get();
223          obuf.reset();
224          writeChars(obuf, string, 0, string.length());
225          System.arraycopy(obuf.getData(), 0, result, 0, obuf.getLength());
226        } catch (IOException e) {
227          throw new RuntimeException(e);
228        }
229        return result;
230      }
231    
232      /**
233       * Convert a UTF-8 encoded byte array back into a string.
234       *
235       * @throws IOException if the byte array is invalid UTF8
236       */
237      public static String fromBytes(byte[] bytes) throws IOException {
238        DataInputBuffer dbuf = new DataInputBuffer();
239        dbuf.reset(bytes, 0, bytes.length);
240        StringBuilder buf = new StringBuilder(bytes.length);
241        readChars(dbuf, buf, bytes.length);
242        return buf.toString();
243      }
244    
245      /** Read a UTF-8 encoded string.
246       *
247       * @see DataInput#readUTF()
248       */
249      public static String readString(DataInput in) throws IOException {
250        int bytes = in.readUnsignedShort();
251        StringBuilder buffer = new StringBuilder(bytes);
252        readChars(in, buffer, bytes);
253        return buffer.toString();
254      }
255    
256      private static void readChars(DataInput in, StringBuilder buffer, int nBytes)
257        throws UTFDataFormatException, IOException {
258        DataOutputBuffer obuf = OBUF_FACTORY.get();
259        obuf.reset();
260        obuf.write(in, nBytes);
261        byte[] bytes = obuf.getData();
262        int i = 0;
263        while (i < nBytes) {
264          byte b = bytes[i++];
265          if ((b & 0x80) == 0) {
266            // 0b0xxxxxxx: 1-byte sequence
267            buffer.append((char)(b & 0x7F));
268          } else if ((b & 0xE0) == 0xC0) {
269            if (i >= nBytes) {
270              throw new UTFDataFormatException("Truncated UTF8 at " +
271                  StringUtils.byteToHexString(bytes, i - 1, 1));
272            }
273            // 0b110xxxxx: 2-byte sequence
274            buffer.append((char)(((b & 0x1F) << 6)
275                | (bytes[i++] & 0x3F)));
276          } else if ((b & 0xF0) == 0xE0) {
277            // 0b1110xxxx: 3-byte sequence
278            if (i + 1 >= nBytes) {
279              throw new UTFDataFormatException("Truncated UTF8 at " +
280                  StringUtils.byteToHexString(bytes, i - 1, 2));
281            }
282            buffer.append((char)(((b & 0x0F) << 12)
283                | ((bytes[i++] & 0x3F) << 6)
284                |  (bytes[i++] & 0x3F)));
285          } else if ((b & 0xF8) == 0xF0) {
286            if (i + 2 >= nBytes) {
287              throw new UTFDataFormatException("Truncated UTF8 at " +
288                  StringUtils.byteToHexString(bytes, i - 1, 3));
289            }
290            // 0b11110xxx: 4-byte sequence
291            int codepoint =
292                ((b & 0x07) << 18)
293              | ((bytes[i++] & 0x3F) <<  12)
294              | ((bytes[i++] & 0x3F) <<  6)
295              | ((bytes[i++] & 0x3F));
296            buffer.append(highSurrogate(codepoint))
297                  .append(lowSurrogate(codepoint));
298          } else {
299            // The UTF8 standard describes 5-byte and 6-byte sequences, but
300            // these are no longer allowed as of 2003 (see RFC 3629)
301    
302            // Only show the next 6 bytes max in the error code - in case the
303            // buffer is large, this will prevent an exceedingly large message.
304            int endForError = Math.min(i + 5, nBytes);
305            throw new UTFDataFormatException("Invalid UTF8 at " +
306                StringUtils.byteToHexString(bytes, i - 1, endForError));
307          }
308        }
309      }
310    
311      private static char highSurrogate(int codePoint) {
312        return (char) ((codePoint >>> 10)
313            + (Character.MIN_HIGH_SURROGATE - (Character.MIN_SUPPLEMENTARY_CODE_POINT >>> 10)));
314      }
315    
316      private static char lowSurrogate(int codePoint) {
317        return (char) ((codePoint & 0x3ff) + Character.MIN_LOW_SURROGATE);
318      }
319    
320      /** Write a UTF-8 encoded string.
321       *
322       * @see DataOutput#writeUTF(String)
323       */
324      public static int writeString(DataOutput out, String s) throws IOException {
325        if (s.length() > 0xffff/3) {         // maybe too long
326          LOG.warn("truncating long string: " + s.length()
327                   + " chars, starting with " + s.substring(0, 20));
328          s = s.substring(0, 0xffff/3);
329        }
330    
331        int len = utf8Length(s);
332        if (len > 0xffff)                             // double-check length
333          throw new IOException("string too long!");
334          
335        out.writeShort(len);
336        writeChars(out, s, 0, s.length());
337        return len;
338      }
339    
340      /** Returns the number of bytes required to write this. */
341      private static int utf8Length(String string) {
342        int stringLength = string.length();
343        int utf8Length = 0;
344        for (int i = 0; i < stringLength; i++) {
345          int c = string.charAt(i);
346          if (c <= 0x007F) {
347            utf8Length++;
348          } else if (c > 0x07FF) {
349            utf8Length += 3;
350          } else {
351            utf8Length += 2;
352          }
353        }
354        return utf8Length;
355      }
356    
357      private static void writeChars(DataOutput out,
358                                     String s, int start, int length)
359        throws IOException {
360        final int end = start + length;
361        for (int i = start; i < end; i++) {
362          int code = s.charAt(i);
363          if (code <= 0x7F) {
364            out.writeByte((byte)code);
365          } else if (code <= 0x07FF) {
366            out.writeByte((byte)(0xC0 | ((code >> 6) & 0x1F)));
367            out.writeByte((byte)(0x80 |   code       & 0x3F));
368          } else {
369            out.writeByte((byte)(0xE0 | ((code >> 12) & 0X0F)));
370            out.writeByte((byte)(0x80 | ((code >>  6) & 0x3F)));
371            out.writeByte((byte)(0x80 |  (code        & 0x3F)));
372          }
373        }
374      }
375    
376    }