001/** 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018 019package org.apache.hadoop.io; 020 021import java.io.IOException; 022import java.io.DataInput; 023import java.io.DataOutput; 024 025 026import org.apache.commons.logging.*; 027import org.apache.hadoop.classification.InterfaceAudience; 028import org.apache.hadoop.classification.InterfaceStability; 029 030/** A WritableComparable for strings that uses the UTF8 encoding. 031 * 032 * <p>Also includes utilities for efficiently reading and writing UTF-8. 033 * 034 * @deprecated replaced by Text 035 */ 036@Deprecated 037@InterfaceAudience.LimitedPrivate({"HDFS", "MapReduce"}) 038@InterfaceStability.Stable 039public class UTF8 implements WritableComparable<UTF8> { 040 private static final Log LOG= LogFactory.getLog(UTF8.class); 041 private static final DataInputBuffer IBUF = new DataInputBuffer(); 042 043 private static final ThreadLocal<DataOutputBuffer> OBUF_FACTORY = 044 new ThreadLocal<DataOutputBuffer>(){ 045 @Override 046 protected DataOutputBuffer initialValue() { 047 return new DataOutputBuffer(); 048 } 049 }; 050 051 private static final byte[] EMPTY_BYTES = new byte[0]; 052 053 private byte[] bytes = EMPTY_BYTES; 054 private int length; 055 056 public UTF8() { 057 //set(""); 058 } 059 060 /** Construct from a given string. */ 061 public UTF8(String string) { 062 set(string); 063 } 064 065 /** Construct from a given string. */ 066 public UTF8(UTF8 utf8) { 067 set(utf8); 068 } 069 070 /** The raw bytes. */ 071 public byte[] getBytes() { 072 return bytes; 073 } 074 075 /** The number of bytes in the encoded string. */ 076 public int getLength() { 077 return length; 078 } 079 080 /** Set to contain the contents of a string. */ 081 public void set(String string) { 082 if (string.length() > 0xffff/3) { // maybe too long 083 LOG.warn("truncating long string: " + string.length() 084 + " chars, starting with " + string.substring(0, 20)); 085 string = string.substring(0, 0xffff/3); 086 } 087 088 length = utf8Length(string); // compute length 089 if (length > 0xffff) // double-check length 090 throw new RuntimeException("string too long!"); 091 092 if (bytes == null || length > bytes.length) // grow buffer 093 bytes = new byte[length]; 094 095 try { // avoid sync'd allocations 096 DataOutputBuffer obuf = OBUF_FACTORY.get(); 097 obuf.reset(); 098 writeChars(obuf, string, 0, string.length()); 099 System.arraycopy(obuf.getData(), 0, bytes, 0, length); 100 } catch (IOException e) { 101 throw new RuntimeException(e); 102 } 103 } 104 105 /** Set to contain the contents of a string. */ 106 public void set(UTF8 other) { 107 length = other.length; 108 if (bytes == null || length > bytes.length) // grow buffer 109 bytes = new byte[length]; 110 System.arraycopy(other.bytes, 0, bytes, 0, length); 111 } 112 113 public void readFields(DataInput in) throws IOException { 114 length = in.readUnsignedShort(); 115 if (bytes == null || bytes.length < length) 116 bytes = new byte[length]; 117 in.readFully(bytes, 0, length); 118 } 119 120 /** Skips over one UTF8 in the input. */ 121 public static void skip(DataInput in) throws IOException { 122 int length = in.readUnsignedShort(); 123 WritableUtils.skipFully(in, length); 124 } 125 126 public void write(DataOutput out) throws IOException { 127 out.writeShort(length); 128 out.write(bytes, 0, length); 129 } 130 131 /** Compare two UTF8s. */ 132 @Override 133 public int compareTo(UTF8 o) { 134 return WritableComparator.compareBytes(bytes, 0, length, 135 o.bytes, 0, o.length); 136 } 137 138 /** Convert to a String. */ 139 @Override 140 public String toString() { 141 StringBuilder buffer = new StringBuilder(length); 142 try { 143 synchronized (IBUF) { 144 IBUF.reset(bytes, length); 145 readChars(IBUF, buffer, length); 146 } 147 } catch (IOException e) { 148 throw new RuntimeException(e); 149 } 150 return buffer.toString(); 151 } 152 153 /** Returns true iff <code>o</code> is a UTF8 with the same contents. */ 154 @Override 155 public boolean equals(Object o) { 156 if (!(o instanceof UTF8)) 157 return false; 158 UTF8 that = (UTF8)o; 159 if (this.length != that.length) 160 return false; 161 else 162 return WritableComparator.compareBytes(bytes, 0, length, 163 that.bytes, 0, that.length) == 0; 164 } 165 166 @Override 167 public int hashCode() { 168 return WritableComparator.hashBytes(bytes, length); 169 } 170 171 /** A WritableComparator optimized for UTF8 keys. */ 172 public static class Comparator extends WritableComparator { 173 public Comparator() { 174 super(UTF8.class); 175 } 176 177 @Override 178 public int compare(byte[] b1, int s1, int l1, 179 byte[] b2, int s2, int l2) { 180 int n1 = readUnsignedShort(b1, s1); 181 int n2 = readUnsignedShort(b2, s2); 182 return compareBytes(b1, s1+2, n1, b2, s2+2, n2); 183 } 184 } 185 186 static { // register this comparator 187 WritableComparator.define(UTF8.class, new Comparator()); 188 } 189 190 /// STATIC UTILITIES FROM HERE DOWN 191 192 /// These are probably not used much anymore, and might be removed... 193 194 /** Convert a string to a UTF-8 encoded byte array. 195 * @see String#getBytes(String) 196 */ 197 public static byte[] getBytes(String string) { 198 byte[] result = new byte[utf8Length(string)]; 199 try { // avoid sync'd allocations 200 DataOutputBuffer obuf = OBUF_FACTORY.get(); 201 obuf.reset(); 202 writeChars(obuf, string, 0, string.length()); 203 System.arraycopy(obuf.getData(), 0, result, 0, obuf.getLength()); 204 } catch (IOException e) { 205 throw new RuntimeException(e); 206 } 207 return result; 208 } 209 210 /** Read a UTF-8 encoded string. 211 * 212 * @see DataInput#readUTF() 213 */ 214 public static String readString(DataInput in) throws IOException { 215 int bytes = in.readUnsignedShort(); 216 StringBuilder buffer = new StringBuilder(bytes); 217 readChars(in, buffer, bytes); 218 return buffer.toString(); 219 } 220 221 private static void readChars(DataInput in, StringBuilder buffer, int nBytes) 222 throws IOException { 223 DataOutputBuffer obuf = OBUF_FACTORY.get(); 224 obuf.reset(); 225 obuf.write(in, nBytes); 226 byte[] bytes = obuf.getData(); 227 int i = 0; 228 while (i < nBytes) { 229 byte b = bytes[i++]; 230 if ((b & 0x80) == 0) { 231 buffer.append((char)(b & 0x7F)); 232 } else if ((b & 0xE0) != 0xE0) { 233 buffer.append((char)(((b & 0x1F) << 6) 234 | (bytes[i++] & 0x3F))); 235 } else { 236 buffer.append((char)(((b & 0x0F) << 12) 237 | ((bytes[i++] & 0x3F) << 6) 238 | (bytes[i++] & 0x3F))); 239 } 240 } 241 } 242 243 /** Write a UTF-8 encoded string. 244 * 245 * @see DataOutput#writeUTF(String) 246 */ 247 public static int writeString(DataOutput out, String s) throws IOException { 248 if (s.length() > 0xffff/3) { // maybe too long 249 LOG.warn("truncating long string: " + s.length() 250 + " chars, starting with " + s.substring(0, 20)); 251 s = s.substring(0, 0xffff/3); 252 } 253 254 int len = utf8Length(s); 255 if (len > 0xffff) // double-check length 256 throw new IOException("string too long!"); 257 258 out.writeShort(len); 259 writeChars(out, s, 0, s.length()); 260 return len; 261 } 262 263 /** Returns the number of bytes required to write this. */ 264 private static int utf8Length(String string) { 265 int stringLength = string.length(); 266 int utf8Length = 0; 267 for (int i = 0; i < stringLength; i++) { 268 int c = string.charAt(i); 269 if (c <= 0x007F) { 270 utf8Length++; 271 } else if (c > 0x07FF) { 272 utf8Length += 3; 273 } else { 274 utf8Length += 2; 275 } 276 } 277 return utf8Length; 278 } 279 280 private static void writeChars(DataOutput out, 281 String s, int start, int length) 282 throws IOException { 283 final int end = start + length; 284 for (int i = start; i < end; i++) { 285 int code = s.charAt(i); 286 if (code <= 0x7F) { 287 out.writeByte((byte)code); 288 } else if (code <= 0x07FF) { 289 out.writeByte((byte)(0xC0 | ((code >> 6) & 0x1F))); 290 out.writeByte((byte)(0x80 | code & 0x3F)); 291 } else { 292 out.writeByte((byte)(0xE0 | ((code >> 12) & 0X0F))); 293 out.writeByte((byte)(0x80 | ((code >> 6) & 0x3F))); 294 out.writeByte((byte)(0x80 | (code & 0x3F))); 295 } 296 } 297 } 298 299}