001 /** 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018 019 package org.apache.hadoop.io; 020 021 import java.io.IOException; 022 import java.io.DataInput; 023 import java.io.DataOutput; 024 025 026 import org.apache.commons.logging.*; 027 import org.apache.hadoop.classification.InterfaceAudience; 028 import org.apache.hadoop.classification.InterfaceStability; 029 030 /** A WritableComparable for strings that uses the UTF8 encoding. 031 * 032 * <p>Also includes utilities for efficiently reading and writing UTF-8. 033 * 034 * @deprecated replaced by Text 035 */ 036 @Deprecated 037 @InterfaceAudience.LimitedPrivate({"HDFS", "MapReduce"}) 038 @InterfaceStability.Stable 039 public class UTF8 implements WritableComparable<UTF8> { 040 private static final Log LOG= LogFactory.getLog(UTF8.class); 041 private static final DataInputBuffer IBUF = new DataInputBuffer(); 042 043 private static final ThreadLocal<DataOutputBuffer> OBUF_FACTORY = 044 new ThreadLocal<DataOutputBuffer>(){ 045 @Override 046 protected DataOutputBuffer initialValue() { 047 return new DataOutputBuffer(); 048 } 049 }; 050 051 private static final byte[] EMPTY_BYTES = new byte[0]; 052 053 private byte[] bytes = EMPTY_BYTES; 054 private int length; 055 056 public UTF8() { 057 //set(""); 058 } 059 060 /** Construct from a given string. */ 061 public UTF8(String string) { 062 set(string); 063 } 064 065 /** Construct from a given string. */ 066 public UTF8(UTF8 utf8) { 067 set(utf8); 068 } 069 070 /** The raw bytes. */ 071 public byte[] getBytes() { 072 return bytes; 073 } 074 075 /** The number of bytes in the encoded string. */ 076 public int getLength() { 077 return length; 078 } 079 080 /** Set to contain the contents of a string. */ 081 public void set(String string) { 082 if (string.length() > 0xffff/3) { // maybe too long 083 LOG.warn("truncating long string: " + string.length() 084 + " chars, starting with " + string.substring(0, 20)); 085 string = string.substring(0, 0xffff/3); 086 } 087 088 length = utf8Length(string); // compute length 089 if (length > 0xffff) // double-check length 090 throw new RuntimeException("string too long!"); 091 092 if (bytes == null || length > bytes.length) // grow buffer 093 bytes = new byte[length]; 094 095 try { // avoid sync'd allocations 096 DataOutputBuffer obuf = OBUF_FACTORY.get(); 097 obuf.reset(); 098 writeChars(obuf, string, 0, string.length()); 099 System.arraycopy(obuf.getData(), 0, bytes, 0, length); 100 } catch (IOException e) { 101 throw new RuntimeException(e); 102 } 103 } 104 105 /** Set to contain the contents of a string. */ 106 public void set(UTF8 other) { 107 length = other.length; 108 if (bytes == null || length > bytes.length) // grow buffer 109 bytes = new byte[length]; 110 System.arraycopy(other.bytes, 0, bytes, 0, length); 111 } 112 113 @Override 114 public void readFields(DataInput in) throws IOException { 115 length = in.readUnsignedShort(); 116 if (bytes == null || bytes.length < length) 117 bytes = new byte[length]; 118 in.readFully(bytes, 0, length); 119 } 120 121 /** Skips over one UTF8 in the input. */ 122 public static void skip(DataInput in) throws IOException { 123 int length = in.readUnsignedShort(); 124 WritableUtils.skipFully(in, length); 125 } 126 127 @Override 128 public void write(DataOutput out) throws IOException { 129 out.writeShort(length); 130 out.write(bytes, 0, length); 131 } 132 133 /** Compare two UTF8s. */ 134 @Override 135 public int compareTo(UTF8 o) { 136 return WritableComparator.compareBytes(bytes, 0, length, 137 o.bytes, 0, o.length); 138 } 139 140 /** Convert to a String. */ 141 @Override 142 public String toString() { 143 StringBuilder buffer = new StringBuilder(length); 144 try { 145 synchronized (IBUF) { 146 IBUF.reset(bytes, length); 147 readChars(IBUF, buffer, length); 148 } 149 } catch (IOException e) { 150 throw new RuntimeException(e); 151 } 152 return buffer.toString(); 153 } 154 155 /** Returns true iff <code>o</code> is a UTF8 with the same contents. */ 156 @Override 157 public boolean equals(Object o) { 158 if (!(o instanceof UTF8)) 159 return false; 160 UTF8 that = (UTF8)o; 161 if (this.length != that.length) 162 return false; 163 else 164 return WritableComparator.compareBytes(bytes, 0, length, 165 that.bytes, 0, that.length) == 0; 166 } 167 168 @Override 169 public int hashCode() { 170 return WritableComparator.hashBytes(bytes, length); 171 } 172 173 /** A WritableComparator optimized for UTF8 keys. */ 174 public static class Comparator extends WritableComparator { 175 public Comparator() { 176 super(UTF8.class); 177 } 178 179 @Override 180 public int compare(byte[] b1, int s1, int l1, 181 byte[] b2, int s2, int l2) { 182 int n1 = readUnsignedShort(b1, s1); 183 int n2 = readUnsignedShort(b2, s2); 184 return compareBytes(b1, s1+2, n1, b2, s2+2, n2); 185 } 186 } 187 188 static { // register this comparator 189 WritableComparator.define(UTF8.class, new Comparator()); 190 } 191 192 /// STATIC UTILITIES FROM HERE DOWN 193 194 /// These are probably not used much anymore, and might be removed... 195 196 /** Convert a string to a UTF-8 encoded byte array. 197 * @see String#getBytes(String) 198 */ 199 public static byte[] getBytes(String string) { 200 byte[] result = new byte[utf8Length(string)]; 201 try { // avoid sync'd allocations 202 DataOutputBuffer obuf = OBUF_FACTORY.get(); 203 obuf.reset(); 204 writeChars(obuf, string, 0, string.length()); 205 System.arraycopy(obuf.getData(), 0, result, 0, obuf.getLength()); 206 } catch (IOException e) { 207 throw new RuntimeException(e); 208 } 209 return result; 210 } 211 212 /** Read a UTF-8 encoded string. 213 * 214 * @see DataInput#readUTF() 215 */ 216 public static String readString(DataInput in) throws IOException { 217 int bytes = in.readUnsignedShort(); 218 StringBuilder buffer = new StringBuilder(bytes); 219 readChars(in, buffer, bytes); 220 return buffer.toString(); 221 } 222 223 private static void readChars(DataInput in, StringBuilder buffer, int nBytes) 224 throws IOException { 225 DataOutputBuffer obuf = OBUF_FACTORY.get(); 226 obuf.reset(); 227 obuf.write(in, nBytes); 228 byte[] bytes = obuf.getData(); 229 int i = 0; 230 while (i < nBytes) { 231 byte b = bytes[i++]; 232 if ((b & 0x80) == 0) { 233 buffer.append((char)(b & 0x7F)); 234 } else if ((b & 0xE0) != 0xE0) { 235 buffer.append((char)(((b & 0x1F) << 6) 236 | (bytes[i++] & 0x3F))); 237 } else { 238 buffer.append((char)(((b & 0x0F) << 12) 239 | ((bytes[i++] & 0x3F) << 6) 240 | (bytes[i++] & 0x3F))); 241 } 242 } 243 } 244 245 /** Write a UTF-8 encoded string. 246 * 247 * @see DataOutput#writeUTF(String) 248 */ 249 public static int writeString(DataOutput out, String s) throws IOException { 250 if (s.length() > 0xffff/3) { // maybe too long 251 LOG.warn("truncating long string: " + s.length() 252 + " chars, starting with " + s.substring(0, 20)); 253 s = s.substring(0, 0xffff/3); 254 } 255 256 int len = utf8Length(s); 257 if (len > 0xffff) // double-check length 258 throw new IOException("string too long!"); 259 260 out.writeShort(len); 261 writeChars(out, s, 0, s.length()); 262 return len; 263 } 264 265 /** Returns the number of bytes required to write this. */ 266 private static int utf8Length(String string) { 267 int stringLength = string.length(); 268 int utf8Length = 0; 269 for (int i = 0; i < stringLength; i++) { 270 int c = string.charAt(i); 271 if (c <= 0x007F) { 272 utf8Length++; 273 } else if (c > 0x07FF) { 274 utf8Length += 3; 275 } else { 276 utf8Length += 2; 277 } 278 } 279 return utf8Length; 280 } 281 282 private static void writeChars(DataOutput out, 283 String s, int start, int length) 284 throws IOException { 285 final int end = start + length; 286 for (int i = start; i < end; i++) { 287 int code = s.charAt(i); 288 if (code <= 0x7F) { 289 out.writeByte((byte)code); 290 } else if (code <= 0x07FF) { 291 out.writeByte((byte)(0xC0 | ((code >> 6) & 0x1F))); 292 out.writeByte((byte)(0x80 | code & 0x3F)); 293 } else { 294 out.writeByte((byte)(0xE0 | ((code >> 12) & 0X0F))); 295 out.writeByte((byte)(0x80 | ((code >> 6) & 0x3F))); 296 out.writeByte((byte)(0x80 | (code & 0x3F))); 297 } 298 } 299 } 300 301 }