/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.io.compress;

import java.util.*;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.ReflectionUtils;

/**
 * A factory that will find the correct codec for a given filename.
 */
@InterfaceAudience.Public
@InterfaceStability.Evolving
public class CompressionCodecFactory {

  public static final Log LOG =
    LogFactory.getLog(CompressionCodecFactory.class.getName());

  private static final ServiceLoader<CompressionCodec> CODEC_PROVIDERS =
    ServiceLoader.load(CompressionCodec.class);

  /** Configuration key holding the comma-separated codec class names. */
  private static final String CODECS_KEY = "io.compression.codecs";

  /** Class-name ending that is stripped to form the short codec alias. */
  private static final String CODEC_SUFFIX = "Codec";

  /** Size of the copy buffer used by the {@link #main(String[])} test program. */
  private static final int COPY_BUFFER_SIZE = 4096;

  /**
   * A map from the reversed filename suffixes to the codecs.
   * Reversing turns "is a suffix of the filename" into "is a prefix of the
   * reversed filename", which a sorted map can answer by walking down from
   * the reversed filename; the first prefix found is the longest one.
   */
  private final SortedMap<String, CompressionCodec> codecs =
    new TreeMap<String, CompressionCodec>();

  /**
   * A map from lower-cased codec aliases (the simple class name, with and
   * without a trailing 'Codec') to the codecs.
   */
  private final Map<String, CompressionCodec> codecsByName =
    new HashMap<String, CompressionCodec>();

  /**
   * A map from canonical class names to the codecs.
   */
  private final Map<String, CompressionCodec> codecsByClassName =
    new HashMap<String, CompressionCodec>();

  /**
   * Lower-cases a codec alias independently of the JVM default locale, so
   * that registration and lookup always agree (e.g. 'I' must not become a
   * dotless 'i' under a Turkish default locale).
   */
  private static String toAlias(String name) {
    return name.toLowerCase(Locale.ROOT);
  }

  /**
   * Registers a codec under its reversed default extension, its canonical
   * class name, and its alias(es). A later registration with the same key
   * replaces the earlier one.
   */
  private void addCodec(CompressionCodec codec) {
    String suffix = codec.getDefaultExtension();
    codecs.put(new StringBuilder(suffix).reverse().toString(), codec);
    codecsByClassName.put(codec.getClass().getCanonicalName(), codec);

    String codecName = codec.getClass().getSimpleName();
    codecsByName.put(toAlias(codecName), codec);
    if (codecName.endsWith(CODEC_SUFFIX)) {
      codecName = removeSuffix(codecName, CODEC_SUFFIX);
      codecsByName.put(toAlias(codecName), codec);
    }
  }

  /**
   * Print the extension map out as a string. The keys are printed as they
   * are stored, i.e. as reversed suffixes.
   */
  @Override
  public String toString() {
    StringBuilder buf = new StringBuilder("{ ");
    String separator = "";
    for (Map.Entry<String, CompressionCodec> entry : codecs.entrySet()) {
      buf.append(separator);
      buf.append(entry.getKey());
      buf.append(": ");
      buf.append(entry.getValue().getClass().getName());
      separator = ", ";
    }
    buf.append(" }");
    return buf.toString();
  }

  /**
   * Get the list of codecs discovered via a Java ServiceLoader, or
   * listed in the configuration. Codecs specified in configuration come
   * later in the returned list, and are considered to override those
   * from the ServiceLoader.
   * <p>
   * Whitespace around the comma-separated class names in the configuration
   * value is ignored, as are empty entries.
   *
   * @param conf the configuration to look in
   * @return a list of the {@link CompressionCodec} classes
   * @throws IllegalArgumentException if a configured class cannot be found
   *         or is not a {@link CompressionCodec}
   */
  public static List<Class<? extends CompressionCodec>> getCodecClasses(Configuration conf) {
    List<Class<? extends CompressionCodec>> result
      = new ArrayList<Class<? extends CompressionCodec>>();
    // Add codec classes discovered via service loading.
    // The loader is a shared, lazily populated static and ServiceLoader is
    // not safe for concurrent use, so iteration must be serialized.
    synchronized (CODEC_PROVIDERS) {
      for (CompressionCodec codec : CODEC_PROVIDERS) {
        result.add(codec.getClass());
      }
    }
    // Add codec classes from configuration
    String codecsString = conf.get(CODECS_KEY);
    if (codecsString != null) {
      StringTokenizer codecSplit = new StringTokenizer(codecsString, ",");
      while (codecSplit.hasMoreTokens()) {
        // Trim so that "a.B, c.D" is accepted as well as "a.B,c.D".
        String codecSubstring = codecSplit.nextToken().trim();
        if (codecSubstring.length() != 0) {
          try {
            Class<?> cls = conf.getClassByName(codecSubstring);
            if (!CompressionCodec.class.isAssignableFrom(cls)) {
              throw new IllegalArgumentException("Class " + codecSubstring +
                                                 " is not a CompressionCodec");
            }
            result.add(cls.asSubclass(CompressionCodec.class));
          } catch (ClassNotFoundException ex) {
            throw new IllegalArgumentException("Compression codec " +
                                               codecSubstring + " not found.",
                                               ex);
          }
        }
      }
    }
    return result;
  }

  /**
   * Sets a list of codec classes in the configuration. In addition to any
   * classes specified using this method, {@link CompressionCodec} classes on
   * the classpath are discovered using a Java ServiceLoader.
   *
   * @param conf the configuration to modify
   * @param classes the list of classes to set
   */
  public static void setCodecClasses(Configuration conf,
                                     List<Class> classes) {
    StringBuilder buf = new StringBuilder();
    for (Class cls : classes) {
      if (buf.length() > 0) {
        buf.append(',');
      }
      buf.append(cls.getName());
    }
    conf.set(CODECS_KEY, buf.toString());
  }

  /**
   * Find the codecs specified in the config value io.compression.codecs
   * (plus those discovered via the ServiceLoader) and register them.
   * Defaults to gzip and deflate when none are found.
   *
   * @param conf the configuration to read the codec list from
   */
  public CompressionCodecFactory(Configuration conf) {
    List<Class<? extends CompressionCodec>> codecClasses = getCodecClasses(conf);
    if (codecClasses == null || codecClasses.isEmpty()) {
      addCodec(new GzipCodec());
      addCodec(new DefaultCodec());
    } else {
      for (Class<? extends CompressionCodec> codecClass : codecClasses) {
        addCodec(ReflectionUtils.newInstance(codecClass, conf));
      }
    }
  }

  /**
   * Find the relevant compression codec for the given file based on its
   * filename suffix. When several registered suffixes match, the longest
   * one wins. A filename that consists of nothing but the suffix itself is
   * not considered a match.
   *
   * @param file the filename to check
   * @return the codec object, or null if no registered suffix matches
   */
  public CompressionCodec getCodec(Path file) {
    String filename = file.getName();
    String reversedFilename = new StringBuilder(filename).reverse().toString();
    // Every registered suffix of the filename is, reversed, a proper prefix of
    // reversedFilename and therefore sorts strictly below it. Walk downwards
    // from the greatest such key: unrelated keys (e.g. "zg.rat." when looking
    // up "zg.txt.oof") may sort in between and must be skipped rather than
    // ending the search.
    SortedMap<String, CompressionCodec> candidates =
      codecs.headMap(reversedFilename);
    while (!candidates.isEmpty()) {
      String potentialSuffix = candidates.lastKey();
      if (reversedFilename.startsWith(potentialSuffix)) {
        return candidates.get(potentialSuffix);
      }
      candidates = candidates.headMap(potentialSuffix);
    }
    return null;
  }

  /**
   * Find the relevant compression codec for the codec's canonical class name.
   *
   * @param classname the canonical class name of the codec
   * @return the codec object, or null if no such codec is registered
   */
  public CompressionCodec getCodecByClassName(String classname) {
    return codecsByClassName.get(classname);
  }

  /**
   * Find the relevant compression codec for the codec's canonical class name
   * or by codec alias.
   * <p>
   * Codec aliases are case insensitive.
   * <p>
   * The codec alias is the short class name (without the package name).
   * If the short class name ends with 'Codec', then there are two aliases for
   * the codec, the complete short class name and the short class name without
   * the 'Codec' ending. For example for the 'GzipCodec' codec class name the
   * aliases are 'gzip' and 'gzipcodec'.
   *
   * @param codecName the canonical class name of the codec, or an alias
   * @return the codec object, or null if no such codec is registered
   */
  public CompressionCodec getCodecByName(String codecName) {
    CompressionCodec codec = getCodecByClassName(codecName);
    if (codec == null) {
      // trying to get the codec by name in case the name was specified
      // instead of a class
      codec = codecsByName.get(toAlias(codecName));
    }
    return codec;
  }

  /**
   * Find the relevant compression codec for the codec's canonical class name
   * or by codec alias and returns its implementation class.
   * <p>
   * Codec aliases are case insensitive.
   * <p>
   * The codec alias is the short class name (without the package name).
   * If the short class name ends with 'Codec', then there are two aliases for
   * the codec, the complete short class name and the short class name without
   * the 'Codec' ending. For example for the 'GzipCodec' codec class name the
   * aliases are 'gzip' and 'gzipcodec'.
   *
   * @param codecName the canonical class name of the codec, or an alias
   * @return the codec class, or null if no such codec is registered
   */
  public Class<? extends CompressionCodec> getCodecClassByName(String codecName) {
    CompressionCodec codec = getCodecByName(codecName);
    if (codec == null) {
      return null;
    }
    return codec.getClass();
  }

  /**
   * Removes a suffix from a filename, if it has it.
   *
   * @param filename the filename to strip
   * @param suffix the suffix to remove
   * @return the shortened filename
   */
  public static String removeSuffix(String filename, String suffix) {
    if (filename.endsWith(suffix)) {
      return filename.substring(0, filename.length() - suffix.length());
    }
    return filename;
  }

  /**
   * Copies everything readable from one stream to the other. Neither stream
   * is closed.
   */
  private static void copy(java.io.InputStream in, java.io.OutputStream out)
      throws java.io.IOException {
    byte[] buffer = new byte[COPY_BUFFER_SIZE];
    int len = in.read(buffer);
    while (len > 0) {
      out.write(buffer, 0, len);
      len = in.read(buffer);
    }
  }

  /**
   * Compresses the file named like compressedName minus the codec's default
   * extension into compressedName.
   */
  private static void compressFile(CompressionCodec codec, String compressedName)
      throws java.io.IOException {
    String inFilename = removeSuffix(compressedName,
                                     codec.getDefaultExtension());
    // Open the input first so that a missing input file does not leave an
    // empty output file behind.
    java.io.InputStream in = new java.io.FileInputStream(inFilename);
    try {
      CompressionOutputStream out = codec.createOutputStream(
          new java.io.FileOutputStream(compressedName));
      try {
        copy(in, out);
      } finally {
        out.close();
      }
    } finally {
      // Runs even when closing the output stream fails.
      in.close();
    }
  }

  /**
   * Decompresses the named file to standard output.
   */
  private static void decompressToStdout(CompressionCodec codec, String compressedName)
      throws java.io.IOException {
    CompressionInputStream in = codec.createInputStream(
        new java.io.FileInputStream(compressedName));
    try {
      copy(in, System.out);
      System.out.flush();
    } finally {
      in.close();
    }
  }

  /**
   * A little test program. Arguments are processed in order: "-in" switches
   * to compress mode and "-out" (the initial mode) to decompress mode; any
   * other argument is a compressed filename whose codec is chosen by suffix.
   * In compress mode the file named by the argument minus the codec
   * extension is compressed into the argument; in decompress mode the
   * argument is decompressed to standard output.
   *
   * @param args mode switches and compressed filenames
   */
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    boolean encode = false;
    for (int i = 0; i < args.length; ++i) {
      if ("-in".equals(args[i])) {
        encode = true;
      } else if ("-out".equals(args[i])) {
        encode = false;
      } else {
        CompressionCodec codec = factory.getCodec(new Path(args[i]));
        if (codec == null) {
          System.out.println("Codec for " + args[i] + " not found.");
        } else if (encode) {
          compressFile(codec, args[i]);
        } else {
          decompressToStdout(codec, args[i]);
        }
      }
    }
  }
}