001 /**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements. See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership. The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License. You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018
019 package org.apache.hadoop.io;
020
021 import java.io.IOException;
022 import java.io.DataInput;
023 import java.io.DataOutput;
024 import java.io.UTFDataFormatException;
025
026 import org.apache.hadoop.util.StringUtils;
027
028 import org.apache.commons.logging.*;
029 import org.apache.hadoop.classification.InterfaceAudience;
030 import org.apache.hadoop.classification.InterfaceStability;
031
032 /** A WritableComparable for strings that uses the UTF8 encoding.
033 *
034 * <p>Also includes utilities for efficiently reading and writing UTF-8.
035 *
036 * Note that this decodes UTF-8 but actually encodes CESU-8, a variant of
037 * UTF-8: see http://en.wikipedia.org/wiki/CESU-8
038 *
039 * @deprecated replaced by Text
040 */
041 @Deprecated
042 @InterfaceAudience.LimitedPrivate({"HDFS", "MapReduce"})
043 @InterfaceStability.Stable
044 public class UTF8 implements WritableComparable<UTF8> {
045 private static final Log LOG= LogFactory.getLog(UTF8.class);
046 private static final DataInputBuffer IBUF = new DataInputBuffer();
047
048 private static final ThreadLocal<DataOutputBuffer> OBUF_FACTORY =
049 new ThreadLocal<DataOutputBuffer>(){
050 @Override
051 protected DataOutputBuffer initialValue() {
052 return new DataOutputBuffer();
053 }
054 };
055
056 private static final byte[] EMPTY_BYTES = new byte[0];
057
058 private byte[] bytes = EMPTY_BYTES;
059 private int length;
060
061 public UTF8() {
062 //set("");
063 }
064
065 /** Construct from a given string. */
066 public UTF8(String string) {
067 set(string);
068 }
069
070 /** Construct from a given string. */
071 public UTF8(UTF8 utf8) {
072 set(utf8);
073 }
074
075 /** The raw bytes. */
076 public byte[] getBytes() {
077 return bytes;
078 }
079
080 /** The number of bytes in the encoded string. */
081 public int getLength() {
082 return length;
083 }
084
085 /** Set to contain the contents of a string. */
086 public void set(String string) {
087 if (string.length() > 0xffff/3) { // maybe too long
088 LOG.warn("truncating long string: " + string.length()
089 + " chars, starting with " + string.substring(0, 20));
090 string = string.substring(0, 0xffff/3);
091 }
092
093 length = utf8Length(string); // compute length
094 if (length > 0xffff) // double-check length
095 throw new RuntimeException("string too long!");
096
097 if (bytes == null || length > bytes.length) // grow buffer
098 bytes = new byte[length];
099
100 try { // avoid sync'd allocations
101 DataOutputBuffer obuf = OBUF_FACTORY.get();
102 obuf.reset();
103 writeChars(obuf, string, 0, string.length());
104 System.arraycopy(obuf.getData(), 0, bytes, 0, length);
105 } catch (IOException e) {
106 throw new RuntimeException(e);
107 }
108 }
109
110 /** Set to contain the contents of a string. */
111 public void set(UTF8 other) {
112 length = other.length;
113 if (bytes == null || length > bytes.length) // grow buffer
114 bytes = new byte[length];
115 System.arraycopy(other.bytes, 0, bytes, 0, length);
116 }
117
118 @Override
119 public void readFields(DataInput in) throws IOException {
120 length = in.readUnsignedShort();
121 if (bytes == null || bytes.length < length)
122 bytes = new byte[length];
123 in.readFully(bytes, 0, length);
124 }
125
126 /** Skips over one UTF8 in the input. */
127 public static void skip(DataInput in) throws IOException {
128 int length = in.readUnsignedShort();
129 WritableUtils.skipFully(in, length);
130 }
131
132 @Override
133 public void write(DataOutput out) throws IOException {
134 out.writeShort(length);
135 out.write(bytes, 0, length);
136 }
137
138 /** Compare two UTF8s. */
139 @Override
140 public int compareTo(UTF8 o) {
141 return WritableComparator.compareBytes(bytes, 0, length,
142 o.bytes, 0, o.length);
143 }
144
145 /** Convert to a String. */
146 @Override
147 public String toString() {
148 StringBuilder buffer = new StringBuilder(length);
149 try {
150 synchronized (IBUF) {
151 IBUF.reset(bytes, length);
152 readChars(IBUF, buffer, length);
153 }
154 } catch (IOException e) {
155 throw new RuntimeException(e);
156 }
157 return buffer.toString();
158 }
159
160 /**
161 * Convert to a string, checking for valid UTF8.
162 * @return the converted string
163 * @throws UTFDataFormatException if the underlying bytes contain invalid
164 * UTF8 data.
165 */
166 public String toStringChecked() throws IOException {
167 StringBuilder buffer = new StringBuilder(length);
168 synchronized (IBUF) {
169 IBUF.reset(bytes, length);
170 readChars(IBUF, buffer, length);
171 }
172 return buffer.toString();
173 }
174
175 /** Returns true iff <code>o</code> is a UTF8 with the same contents. */
176 @Override
177 public boolean equals(Object o) {
178 if (!(o instanceof UTF8))
179 return false;
180 UTF8 that = (UTF8)o;
181 if (this.length != that.length)
182 return false;
183 else
184 return WritableComparator.compareBytes(bytes, 0, length,
185 that.bytes, 0, that.length) == 0;
186 }
187
188 @Override
189 public int hashCode() {
190 return WritableComparator.hashBytes(bytes, length);
191 }
192
193 /** A WritableComparator optimized for UTF8 keys. */
194 public static class Comparator extends WritableComparator {
195 public Comparator() {
196 super(UTF8.class);
197 }
198
199 @Override
200 public int compare(byte[] b1, int s1, int l1,
201 byte[] b2, int s2, int l2) {
202 int n1 = readUnsignedShort(b1, s1);
203 int n2 = readUnsignedShort(b2, s2);
204 return compareBytes(b1, s1+2, n1, b2, s2+2, n2);
205 }
206 }
207
208 static { // register this comparator
209 WritableComparator.define(UTF8.class, new Comparator());
210 }
211
212 /// STATIC UTILITIES FROM HERE DOWN
213
214 /// These are probably not used much anymore, and might be removed...
215
216 /** Convert a string to a UTF-8 encoded byte array.
217 * @see String#getBytes(String)
218 */
219 public static byte[] getBytes(String string) {
220 byte[] result = new byte[utf8Length(string)];
221 try { // avoid sync'd allocations
222 DataOutputBuffer obuf = OBUF_FACTORY.get();
223 obuf.reset();
224 writeChars(obuf, string, 0, string.length());
225 System.arraycopy(obuf.getData(), 0, result, 0, obuf.getLength());
226 } catch (IOException e) {
227 throw new RuntimeException(e);
228 }
229 return result;
230 }
231
232 /**
233 * Convert a UTF-8 encoded byte array back into a string.
234 *
235 * @throws IOException if the byte array is invalid UTF8
236 */
237 public static String fromBytes(byte[] bytes) throws IOException {
238 DataInputBuffer dbuf = new DataInputBuffer();
239 dbuf.reset(bytes, 0, bytes.length);
240 StringBuilder buf = new StringBuilder(bytes.length);
241 readChars(dbuf, buf, bytes.length);
242 return buf.toString();
243 }
244
245 /** Read a UTF-8 encoded string.
246 *
247 * @see DataInput#readUTF()
248 */
249 public static String readString(DataInput in) throws IOException {
250 int bytes = in.readUnsignedShort();
251 StringBuilder buffer = new StringBuilder(bytes);
252 readChars(in, buffer, bytes);
253 return buffer.toString();
254 }
255
256 private static void readChars(DataInput in, StringBuilder buffer, int nBytes)
257 throws UTFDataFormatException, IOException {
258 DataOutputBuffer obuf = OBUF_FACTORY.get();
259 obuf.reset();
260 obuf.write(in, nBytes);
261 byte[] bytes = obuf.getData();
262 int i = 0;
263 while (i < nBytes) {
264 byte b = bytes[i++];
265 if ((b & 0x80) == 0) {
266 // 0b0xxxxxxx: 1-byte sequence
267 buffer.append((char)(b & 0x7F));
268 } else if ((b & 0xE0) == 0xC0) {
269 if (i >= nBytes) {
270 throw new UTFDataFormatException("Truncated UTF8 at " +
271 StringUtils.byteToHexString(bytes, i - 1, 1));
272 }
273 // 0b110xxxxx: 2-byte sequence
274 buffer.append((char)(((b & 0x1F) << 6)
275 | (bytes[i++] & 0x3F)));
276 } else if ((b & 0xF0) == 0xE0) {
277 // 0b1110xxxx: 3-byte sequence
278 if (i + 1 >= nBytes) {
279 throw new UTFDataFormatException("Truncated UTF8 at " +
280 StringUtils.byteToHexString(bytes, i - 1, 2));
281 }
282 buffer.append((char)(((b & 0x0F) << 12)
283 | ((bytes[i++] & 0x3F) << 6)
284 | (bytes[i++] & 0x3F)));
285 } else if ((b & 0xF8) == 0xF0) {
286 if (i + 2 >= nBytes) {
287 throw new UTFDataFormatException("Truncated UTF8 at " +
288 StringUtils.byteToHexString(bytes, i - 1, 3));
289 }
290 // 0b11110xxx: 4-byte sequence
291 int codepoint =
292 ((b & 0x07) << 18)
293 | ((bytes[i++] & 0x3F) << 12)
294 | ((bytes[i++] & 0x3F) << 6)
295 | ((bytes[i++] & 0x3F));
296 buffer.append(highSurrogate(codepoint))
297 .append(lowSurrogate(codepoint));
298 } else {
299 // The UTF8 standard describes 5-byte and 6-byte sequences, but
300 // these are no longer allowed as of 2003 (see RFC 3629)
301
302 // Only show the next 6 bytes max in the error code - in case the
303 // buffer is large, this will prevent an exceedingly large message.
304 int endForError = Math.min(i + 5, nBytes);
305 throw new UTFDataFormatException("Invalid UTF8 at " +
306 StringUtils.byteToHexString(bytes, i - 1, endForError));
307 }
308 }
309 }
310
311 private static char highSurrogate(int codePoint) {
312 return (char) ((codePoint >>> 10)
313 + (Character.MIN_HIGH_SURROGATE - (Character.MIN_SUPPLEMENTARY_CODE_POINT >>> 10)));
314 }
315
316 private static char lowSurrogate(int codePoint) {
317 return (char) ((codePoint & 0x3ff) + Character.MIN_LOW_SURROGATE);
318 }
319
320 /** Write a UTF-8 encoded string.
321 *
322 * @see DataOutput#writeUTF(String)
323 */
324 public static int writeString(DataOutput out, String s) throws IOException {
325 if (s.length() > 0xffff/3) { // maybe too long
326 LOG.warn("truncating long string: " + s.length()
327 + " chars, starting with " + s.substring(0, 20));
328 s = s.substring(0, 0xffff/3);
329 }
330
331 int len = utf8Length(s);
332 if (len > 0xffff) // double-check length
333 throw new IOException("string too long!");
334
335 out.writeShort(len);
336 writeChars(out, s, 0, s.length());
337 return len;
338 }
339
340 /** Returns the number of bytes required to write this. */
341 private static int utf8Length(String string) {
342 int stringLength = string.length();
343 int utf8Length = 0;
344 for (int i = 0; i < stringLength; i++) {
345 int c = string.charAt(i);
346 if (c <= 0x007F) {
347 utf8Length++;
348 } else if (c > 0x07FF) {
349 utf8Length += 3;
350 } else {
351 utf8Length += 2;
352 }
353 }
354 return utf8Length;
355 }
356
357 private static void writeChars(DataOutput out,
358 String s, int start, int length)
359 throws IOException {
360 final int end = start + length;
361 for (int i = start; i < end; i++) {
362 int code = s.charAt(i);
363 if (code <= 0x7F) {
364 out.writeByte((byte)code);
365 } else if (code <= 0x07FF) {
366 out.writeByte((byte)(0xC0 | ((code >> 6) & 0x1F)));
367 out.writeByte((byte)(0x80 | code & 0x3F));
368 } else {
369 out.writeByte((byte)(0xE0 | ((code >> 12) & 0X0F)));
370 out.writeByte((byte)(0x80 | ((code >> 6) & 0x3F)));
371 out.writeByte((byte)(0x80 | (code & 0x3F)));
372 }
373 }
374 }
375
376 }