001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018
019package org.apache.hadoop.io;
020
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.io.UTFDataFormatException;

import org.apache.commons.logging.*;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
029
030/** A WritableComparable for strings that uses the UTF8 encoding.
031 * 
032 * <p>Also includes utilities for efficiently reading and writing UTF-8.
033 *
034 * @deprecated replaced by Text
035 */
036@Deprecated
037@InterfaceAudience.LimitedPrivate({"HDFS", "MapReduce"})
038@InterfaceStability.Stable
039public class UTF8 implements WritableComparable<UTF8> {
040  private static final Log LOG= LogFactory.getLog(UTF8.class);
041  private static final DataInputBuffer IBUF = new DataInputBuffer();
042
043  private static final ThreadLocal<DataOutputBuffer> OBUF_FACTORY =
044    new ThreadLocal<DataOutputBuffer>(){
045    @Override
046    protected DataOutputBuffer initialValue() {
047      return new DataOutputBuffer();
048    }
049  };
050
051  private static final byte[] EMPTY_BYTES = new byte[0];
052
053  private byte[] bytes = EMPTY_BYTES;
054  private int length;
055
056  public UTF8() {
057    //set("");
058  }
059
060  /** Construct from a given string. */
061  public UTF8(String string) {
062    set(string);
063  }
064
065  /** Construct from a given string. */
066  public UTF8(UTF8 utf8) {
067    set(utf8);
068  }
069
070  /** The raw bytes. */
071  public byte[] getBytes() {
072    return bytes;
073  }
074
075  /** The number of bytes in the encoded string. */
076  public int getLength() {
077    return length;
078  }
079
080  /** Set to contain the contents of a string. */
081  public void set(String string) {
082    if (string.length() > 0xffff/3) {             // maybe too long
083      LOG.warn("truncating long string: " + string.length()
084               + " chars, starting with " + string.substring(0, 20));
085      string = string.substring(0, 0xffff/3);
086    }
087
088    length = utf8Length(string);                  // compute length
089    if (length > 0xffff)                          // double-check length
090      throw new RuntimeException("string too long!");
091
092    if (bytes == null || length > bytes.length)   // grow buffer
093      bytes = new byte[length];
094
095    try {                                         // avoid sync'd allocations
096      DataOutputBuffer obuf = OBUF_FACTORY.get();
097      obuf.reset();
098      writeChars(obuf, string, 0, string.length());
099      System.arraycopy(obuf.getData(), 0, bytes, 0, length);
100    } catch (IOException e) {
101      throw new RuntimeException(e);
102    }
103  }
104
105  /** Set to contain the contents of a string. */
106  public void set(UTF8 other) {
107    length = other.length;
108    if (bytes == null || length > bytes.length)   // grow buffer
109      bytes = new byte[length];
110    System.arraycopy(other.bytes, 0, bytes, 0, length);
111  }
112
113  public void readFields(DataInput in) throws IOException {
114    length = in.readUnsignedShort();
115    if (bytes == null || bytes.length < length)
116      bytes = new byte[length];
117    in.readFully(bytes, 0, length);
118  }
119
120  /** Skips over one UTF8 in the input. */
121  public static void skip(DataInput in) throws IOException {
122    int length = in.readUnsignedShort();
123    WritableUtils.skipFully(in, length);
124  }
125
126  public void write(DataOutput out) throws IOException {
127    out.writeShort(length);
128    out.write(bytes, 0, length);
129  }
130
131  /** Compare two UTF8s. */
132  @Override
133  public int compareTo(UTF8 o) {
134    return WritableComparator.compareBytes(bytes, 0, length,
135                                           o.bytes, 0, o.length);
136  }
137
138  /** Convert to a String. */
139  @Override
140  public String toString() {
141    StringBuilder buffer = new StringBuilder(length);
142    try {
143      synchronized (IBUF) {
144        IBUF.reset(bytes, length);
145        readChars(IBUF, buffer, length);
146      }
147    } catch (IOException e) {
148      throw new RuntimeException(e);
149    }
150    return buffer.toString();
151  }
152
153  /** Returns true iff <code>o</code> is a UTF8 with the same contents.  */
154  @Override
155  public boolean equals(Object o) {
156    if (!(o instanceof UTF8))
157      return false;
158    UTF8 that = (UTF8)o;
159    if (this.length != that.length)
160      return false;
161    else
162      return WritableComparator.compareBytes(bytes, 0, length,
163                                             that.bytes, 0, that.length) == 0;
164  }
165
166  @Override
167  public int hashCode() {
168    return WritableComparator.hashBytes(bytes, length);
169  }
170
171  /** A WritableComparator optimized for UTF8 keys. */
172  public static class Comparator extends WritableComparator {
173    public Comparator() {
174      super(UTF8.class);
175    }
176
177    @Override
178    public int compare(byte[] b1, int s1, int l1,
179                       byte[] b2, int s2, int l2) {
180      int n1 = readUnsignedShort(b1, s1);
181      int n2 = readUnsignedShort(b2, s2);
182      return compareBytes(b1, s1+2, n1, b2, s2+2, n2);
183    }
184  }
185
186  static {                                        // register this comparator
187    WritableComparator.define(UTF8.class, new Comparator());
188  }
189
190  /// STATIC UTILITIES FROM HERE DOWN
191
192  /// These are probably not used much anymore, and might be removed...
193
194  /** Convert a string to a UTF-8 encoded byte array.
195   * @see String#getBytes(String)
196   */
197  public static byte[] getBytes(String string) {
198    byte[] result = new byte[utf8Length(string)];
199    try {                                         // avoid sync'd allocations
200      DataOutputBuffer obuf = OBUF_FACTORY.get();
201      obuf.reset();
202      writeChars(obuf, string, 0, string.length());
203      System.arraycopy(obuf.getData(), 0, result, 0, obuf.getLength());
204    } catch (IOException e) {
205      throw new RuntimeException(e);
206    }
207    return result;
208  }
209
210  /** Read a UTF-8 encoded string.
211   *
212   * @see DataInput#readUTF()
213   */
214  public static String readString(DataInput in) throws IOException {
215    int bytes = in.readUnsignedShort();
216    StringBuilder buffer = new StringBuilder(bytes);
217    readChars(in, buffer, bytes);
218    return buffer.toString();
219  }
220
221  private static void readChars(DataInput in, StringBuilder buffer, int nBytes)
222    throws IOException {
223    DataOutputBuffer obuf = OBUF_FACTORY.get();
224    obuf.reset();
225    obuf.write(in, nBytes);
226    byte[] bytes = obuf.getData();
227    int i = 0;
228    while (i < nBytes) {
229      byte b = bytes[i++];
230      if ((b & 0x80) == 0) {
231        buffer.append((char)(b & 0x7F));
232      } else if ((b & 0xE0) != 0xE0) {
233        buffer.append((char)(((b & 0x1F) << 6)
234            | (bytes[i++] & 0x3F)));
235      } else {
236        buffer.append((char)(((b & 0x0F) << 12)
237            | ((bytes[i++] & 0x3F) << 6)
238            |  (bytes[i++] & 0x3F)));
239      }
240    }
241  }
242
243  /** Write a UTF-8 encoded string.
244   *
245   * @see DataOutput#writeUTF(String)
246   */
247  public static int writeString(DataOutput out, String s) throws IOException {
248    if (s.length() > 0xffff/3) {         // maybe too long
249      LOG.warn("truncating long string: " + s.length()
250               + " chars, starting with " + s.substring(0, 20));
251      s = s.substring(0, 0xffff/3);
252    }
253
254    int len = utf8Length(s);
255    if (len > 0xffff)                             // double-check length
256      throw new IOException("string too long!");
257      
258    out.writeShort(len);
259    writeChars(out, s, 0, s.length());
260    return len;
261  }
262
263  /** Returns the number of bytes required to write this. */
264  private static int utf8Length(String string) {
265    int stringLength = string.length();
266    int utf8Length = 0;
267    for (int i = 0; i < stringLength; i++) {
268      int c = string.charAt(i);
269      if (c <= 0x007F) {
270        utf8Length++;
271      } else if (c > 0x07FF) {
272        utf8Length += 3;
273      } else {
274        utf8Length += 2;
275      }
276    }
277    return utf8Length;
278  }
279
280  private static void writeChars(DataOutput out,
281                                 String s, int start, int length)
282    throws IOException {
283    final int end = start + length;
284    for (int i = start; i < end; i++) {
285      int code = s.charAt(i);
286      if (code <= 0x7F) {
287        out.writeByte((byte)code);
288      } else if (code <= 0x07FF) {
289        out.writeByte((byte)(0xC0 | ((code >> 6) & 0x1F)));
290        out.writeByte((byte)(0x80 |   code       & 0x3F));
291      } else {
292        out.writeByte((byte)(0xE0 | ((code >> 12) & 0X0F)));
293        out.writeByte((byte)(0x80 | ((code >>  6) & 0x3F)));
294        out.writeByte((byte)(0x80 |  (code        & 0x3F)));
295      }
296    }
297  }
298
299}