001    /**
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     *     http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing, software
013     * distributed under the License is distributed on an "AS IS" BASIS,
014     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015     * See the License for the specific language governing permissions and
016     * limitations under the License.
017     */
018    
019    package org.apache.hadoop.io;
020    
021    import java.io.IOException;
022    import java.io.DataInput;
023    import java.io.DataOutput;
024    
025    
026    import org.apache.commons.logging.*;
027    import org.apache.hadoop.classification.InterfaceAudience;
028    import org.apache.hadoop.classification.InterfaceStability;
029    
030    /** A WritableComparable for strings that uses the UTF8 encoding.
031     * 
032     * <p>Also includes utilities for efficiently reading and writing UTF-8.
033     *
034     * @deprecated replaced by Text
035     */
036    @Deprecated
037    @InterfaceAudience.LimitedPrivate({"HDFS", "MapReduce"})
038    @InterfaceStability.Stable
039    public class UTF8 implements WritableComparable<UTF8> {
040      private static final Log LOG= LogFactory.getLog(UTF8.class);
041      private static final DataInputBuffer IBUF = new DataInputBuffer();
042    
043      private static final ThreadLocal<DataOutputBuffer> OBUF_FACTORY =
044        new ThreadLocal<DataOutputBuffer>(){
045        @Override
046        protected DataOutputBuffer initialValue() {
047          return new DataOutputBuffer();
048        }
049      };
050    
051      private static final byte[] EMPTY_BYTES = new byte[0];
052    
053      private byte[] bytes = EMPTY_BYTES;
054      private int length;
055    
056      public UTF8() {
057        //set("");
058      }
059    
060      /** Construct from a given string. */
061      public UTF8(String string) {
062        set(string);
063      }
064    
065      /** Construct from a given string. */
066      public UTF8(UTF8 utf8) {
067        set(utf8);
068      }
069    
070      /** The raw bytes. */
071      public byte[] getBytes() {
072        return bytes;
073      }
074    
075      /** The number of bytes in the encoded string. */
076      public int getLength() {
077        return length;
078      }
079    
080      /** Set to contain the contents of a string. */
081      public void set(String string) {
082        if (string.length() > 0xffff/3) {             // maybe too long
083          LOG.warn("truncating long string: " + string.length()
084                   + " chars, starting with " + string.substring(0, 20));
085          string = string.substring(0, 0xffff/3);
086        }
087    
088        length = utf8Length(string);                  // compute length
089        if (length > 0xffff)                          // double-check length
090          throw new RuntimeException("string too long!");
091    
092        if (bytes == null || length > bytes.length)   // grow buffer
093          bytes = new byte[length];
094    
095        try {                                         // avoid sync'd allocations
096          DataOutputBuffer obuf = OBUF_FACTORY.get();
097          obuf.reset();
098          writeChars(obuf, string, 0, string.length());
099          System.arraycopy(obuf.getData(), 0, bytes, 0, length);
100        } catch (IOException e) {
101          throw new RuntimeException(e);
102        }
103      }
104    
105      /** Set to contain the contents of a string. */
106      public void set(UTF8 other) {
107        length = other.length;
108        if (bytes == null || length > bytes.length)   // grow buffer
109          bytes = new byte[length];
110        System.arraycopy(other.bytes, 0, bytes, 0, length);
111      }
112    
113      @Override
114      public void readFields(DataInput in) throws IOException {
115        length = in.readUnsignedShort();
116        if (bytes == null || bytes.length < length)
117          bytes = new byte[length];
118        in.readFully(bytes, 0, length);
119      }
120    
121      /** Skips over one UTF8 in the input. */
122      public static void skip(DataInput in) throws IOException {
123        int length = in.readUnsignedShort();
124        WritableUtils.skipFully(in, length);
125      }
126    
127      @Override
128      public void write(DataOutput out) throws IOException {
129        out.writeShort(length);
130        out.write(bytes, 0, length);
131      }
132    
133      /** Compare two UTF8s. */
134      @Override
135      public int compareTo(UTF8 o) {
136        return WritableComparator.compareBytes(bytes, 0, length,
137                                               o.bytes, 0, o.length);
138      }
139    
140      /** Convert to a String. */
141      @Override
142      public String toString() {
143        StringBuilder buffer = new StringBuilder(length);
144        try {
145          synchronized (IBUF) {
146            IBUF.reset(bytes, length);
147            readChars(IBUF, buffer, length);
148          }
149        } catch (IOException e) {
150          throw new RuntimeException(e);
151        }
152        return buffer.toString();
153      }
154    
155      /** Returns true iff <code>o</code> is a UTF8 with the same contents.  */
156      @Override
157      public boolean equals(Object o) {
158        if (!(o instanceof UTF8))
159          return false;
160        UTF8 that = (UTF8)o;
161        if (this.length != that.length)
162          return false;
163        else
164          return WritableComparator.compareBytes(bytes, 0, length,
165                                                 that.bytes, 0, that.length) == 0;
166      }
167    
168      @Override
169      public int hashCode() {
170        return WritableComparator.hashBytes(bytes, length);
171      }
172    
173      /** A WritableComparator optimized for UTF8 keys. */
174      public static class Comparator extends WritableComparator {
175        public Comparator() {
176          super(UTF8.class);
177        }
178    
179        @Override
180        public int compare(byte[] b1, int s1, int l1,
181                           byte[] b2, int s2, int l2) {
182          int n1 = readUnsignedShort(b1, s1);
183          int n2 = readUnsignedShort(b2, s2);
184          return compareBytes(b1, s1+2, n1, b2, s2+2, n2);
185        }
186      }
187    
188      static {                                        // register this comparator
189        WritableComparator.define(UTF8.class, new Comparator());
190      }
191    
192      /// STATIC UTILITIES FROM HERE DOWN
193    
194      /// These are probably not used much anymore, and might be removed...
195    
196      /** Convert a string to a UTF-8 encoded byte array.
197       * @see String#getBytes(String)
198       */
199      public static byte[] getBytes(String string) {
200        byte[] result = new byte[utf8Length(string)];
201        try {                                         // avoid sync'd allocations
202          DataOutputBuffer obuf = OBUF_FACTORY.get();
203          obuf.reset();
204          writeChars(obuf, string, 0, string.length());
205          System.arraycopy(obuf.getData(), 0, result, 0, obuf.getLength());
206        } catch (IOException e) {
207          throw new RuntimeException(e);
208        }
209        return result;
210      }
211    
212      /** Read a UTF-8 encoded string.
213       *
214       * @see DataInput#readUTF()
215       */
216      public static String readString(DataInput in) throws IOException {
217        int bytes = in.readUnsignedShort();
218        StringBuilder buffer = new StringBuilder(bytes);
219        readChars(in, buffer, bytes);
220        return buffer.toString();
221      }
222    
223      private static void readChars(DataInput in, StringBuilder buffer, int nBytes)
224        throws IOException {
225        DataOutputBuffer obuf = OBUF_FACTORY.get();
226        obuf.reset();
227        obuf.write(in, nBytes);
228        byte[] bytes = obuf.getData();
229        int i = 0;
230        while (i < nBytes) {
231          byte b = bytes[i++];
232          if ((b & 0x80) == 0) {
233            buffer.append((char)(b & 0x7F));
234          } else if ((b & 0xE0) != 0xE0) {
235            buffer.append((char)(((b & 0x1F) << 6)
236                | (bytes[i++] & 0x3F)));
237          } else {
238            buffer.append((char)(((b & 0x0F) << 12)
239                | ((bytes[i++] & 0x3F) << 6)
240                |  (bytes[i++] & 0x3F)));
241          }
242        }
243      }
244    
245      /** Write a UTF-8 encoded string.
246       *
247       * @see DataOutput#writeUTF(String)
248       */
249      public static int writeString(DataOutput out, String s) throws IOException {
250        if (s.length() > 0xffff/3) {         // maybe too long
251          LOG.warn("truncating long string: " + s.length()
252                   + " chars, starting with " + s.substring(0, 20));
253          s = s.substring(0, 0xffff/3);
254        }
255    
256        int len = utf8Length(s);
257        if (len > 0xffff)                             // double-check length
258          throw new IOException("string too long!");
259          
260        out.writeShort(len);
261        writeChars(out, s, 0, s.length());
262        return len;
263      }
264    
265      /** Returns the number of bytes required to write this. */
266      private static int utf8Length(String string) {
267        int stringLength = string.length();
268        int utf8Length = 0;
269        for (int i = 0; i < stringLength; i++) {
270          int c = string.charAt(i);
271          if (c <= 0x007F) {
272            utf8Length++;
273          } else if (c > 0x07FF) {
274            utf8Length += 3;
275          } else {
276            utf8Length += 2;
277          }
278        }
279        return utf8Length;
280      }
281    
282      private static void writeChars(DataOutput out,
283                                     String s, int start, int length)
284        throws IOException {
285        final int end = start + length;
286        for (int i = start; i < end; i++) {
287          int code = s.charAt(i);
288          if (code <= 0x7F) {
289            out.writeByte((byte)code);
290          } else if (code <= 0x07FF) {
291            out.writeByte((byte)(0xC0 | ((code >> 6) & 0x1F)));
292            out.writeByte((byte)(0x80 |   code       & 0x3F));
293          } else {
294            out.writeByte((byte)(0xE0 | ((code >> 12) & 0X0F)));
295            out.writeByte((byte)(0x80 | ((code >>  6) & 0x3F)));
296            out.writeByte((byte)(0x80 |  (code        & 0x3F)));
297          }
298        }
299      }
300    
301    }