001 /*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements. See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership. The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License. You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018 package org.apache.hadoop.io.compress;
019
020 import java.util.*;
021
022 import org.apache.commons.logging.Log;
023 import org.apache.commons.logging.LogFactory;
024 import org.apache.hadoop.classification.InterfaceAudience;
025 import org.apache.hadoop.classification.InterfaceStability;
026 import org.apache.hadoop.conf.Configuration;
027 import org.apache.hadoop.fs.Path;
028 import org.apache.hadoop.util.ReflectionUtils;
029
030 /**
031 * A factory that will find the correct codec for a given filename.
032 */
033 @InterfaceAudience.Public
034 @InterfaceStability.Evolving
035 public class CompressionCodecFactory {
036
037 public static final Log LOG =
038 LogFactory.getLog(CompressionCodecFactory.class.getName());
039
040 private static final ServiceLoader<CompressionCodec> CODEC_PROVIDERS =
041 ServiceLoader.load(CompressionCodec.class);
042
043 /**
044 * A map from the reversed filename suffixes to the codecs.
045 * This is probably overkill, because the maps should be small, but it
046 * automatically supports finding the longest matching suffix.
047 */
048 private SortedMap<String, CompressionCodec> codecs = null;
049
050 /**
051 * A map from the reversed filename suffixes to the codecs.
052 * This is probably overkill, because the maps should be small, but it
053 * automatically supports finding the longest matching suffix.
054 */
055 private Map<String, CompressionCodec> codecsByName = null;
056
057 /**
058 * A map from class names to the codecs
059 */
060 private HashMap<String, CompressionCodec> codecsByClassName = null;
061
062 private void addCodec(CompressionCodec codec) {
063 String suffix = codec.getDefaultExtension();
064 codecs.put(new StringBuilder(suffix).reverse().toString(), codec);
065 codecsByClassName.put(codec.getClass().getCanonicalName(), codec);
066
067 String codecName = codec.getClass().getSimpleName();
068 codecsByName.put(codecName.toLowerCase(), codec);
069 if (codecName.endsWith("Codec")) {
070 codecName = codecName.substring(0, codecName.length() - "Codec".length());
071 codecsByName.put(codecName.toLowerCase(), codec);
072 }
073 }
074
075 /**
076 * Print the extension map out as a string.
077 */
078 public String toString() {
079 StringBuilder buf = new StringBuilder();
080 Iterator<Map.Entry<String, CompressionCodec>> itr =
081 codecs.entrySet().iterator();
082 buf.append("{ ");
083 if (itr.hasNext()) {
084 Map.Entry<String, CompressionCodec> entry = itr.next();
085 buf.append(entry.getKey());
086 buf.append(": ");
087 buf.append(entry.getValue().getClass().getName());
088 while (itr.hasNext()) {
089 entry = itr.next();
090 buf.append(", ");
091 buf.append(entry.getKey());
092 buf.append(": ");
093 buf.append(entry.getValue().getClass().getName());
094 }
095 }
096 buf.append(" }");
097 return buf.toString();
098 }
099
100 /**
101 * Get the list of codecs discovered via a Java ServiceLoader, or
102 * listed in the configuration. Codecs specified in configuration come
103 * later in the returned list, and are considered to override those
104 * from the ServiceLoader.
105 * @param conf the configuration to look in
106 * @return a list of the {@link CompressionCodec} classes
107 */
108 public static List<Class<? extends CompressionCodec>> getCodecClasses(Configuration conf) {
109 List<Class<? extends CompressionCodec>> result
110 = new ArrayList<Class<? extends CompressionCodec>>();
111 // Add codec classes discovered via service loading
112 for (CompressionCodec codec : CODEC_PROVIDERS) {
113 result.add(codec.getClass());
114 }
115 // Add codec classes from configuration
116 String codecsString = conf.get("io.compression.codecs");
117 if (codecsString != null) {
118 StringTokenizer codecSplit = new StringTokenizer(codecsString, ",");
119 while (codecSplit.hasMoreElements()) {
120 String codecSubstring = codecSplit.nextToken();
121 if (codecSubstring.length() != 0) {
122 try {
123 Class<?> cls = conf.getClassByName(codecSubstring);
124 if (!CompressionCodec.class.isAssignableFrom(cls)) {
125 throw new IllegalArgumentException("Class " + codecSubstring +
126 " is not a CompressionCodec");
127 }
128 result.add(cls.asSubclass(CompressionCodec.class));
129 } catch (ClassNotFoundException ex) {
130 throw new IllegalArgumentException("Compression codec " +
131 codecSubstring + " not found.",
132 ex);
133 }
134 }
135 }
136 }
137 return result;
138 }
139
140 /**
141 * Sets a list of codec classes in the configuration. In addition to any
142 * classes specified using this method, {@link CompressionCodec} classes on
143 * the classpath are discovered using a Java ServiceLoader.
144 * @param conf the configuration to modify
145 * @param classes the list of classes to set
146 */
147 public static void setCodecClasses(Configuration conf,
148 List<Class> classes) {
149 StringBuilder buf = new StringBuilder();
150 Iterator<Class> itr = classes.iterator();
151 if (itr.hasNext()) {
152 Class cls = itr.next();
153 buf.append(cls.getName());
154 while(itr.hasNext()) {
155 buf.append(',');
156 buf.append(itr.next().getName());
157 }
158 }
159 conf.set("io.compression.codecs", buf.toString());
160 }
161
162 /**
163 * Find the codecs specified in the config value io.compression.codecs
164 * and register them. Defaults to gzip and deflate.
165 */
166 public CompressionCodecFactory(Configuration conf) {
167 codecs = new TreeMap<String, CompressionCodec>();
168 codecsByClassName = new HashMap<String, CompressionCodec>();
169 codecsByName = new HashMap<String, CompressionCodec>();
170 List<Class<? extends CompressionCodec>> codecClasses = getCodecClasses(conf);
171 if (codecClasses == null || codecClasses.isEmpty()) {
172 addCodec(new GzipCodec());
173 addCodec(new DefaultCodec());
174 } else {
175 for (Class<? extends CompressionCodec> codecClass : codecClasses) {
176 addCodec(ReflectionUtils.newInstance(codecClass, conf));
177 }
178 }
179 }
180
181 /**
182 * Find the relevant compression codec for the given file based on its
183 * filename suffix.
184 * @param file the filename to check
185 * @return the codec object
186 */
187 public CompressionCodec getCodec(Path file) {
188 CompressionCodec result = null;
189 if (codecs != null) {
190 String filename = file.getName();
191 String reversedFilename = new StringBuilder(filename).reverse().toString();
192 SortedMap<String, CompressionCodec> subMap =
193 codecs.headMap(reversedFilename);
194 if (!subMap.isEmpty()) {
195 String potentialSuffix = subMap.lastKey();
196 if (reversedFilename.startsWith(potentialSuffix)) {
197 result = codecs.get(potentialSuffix);
198 }
199 }
200 }
201 return result;
202 }
203
204 /**
205 * Find the relevant compression codec for the codec's canonical class name.
206 * @param classname the canonical class name of the codec
207 * @return the codec object
208 */
209 public CompressionCodec getCodecByClassName(String classname) {
210 if (codecsByClassName == null) {
211 return null;
212 }
213 return codecsByClassName.get(classname);
214 }
215
216 /**
217 * Find the relevant compression codec for the codec's canonical class name
218 * or by codec alias.
219 * <p/>
220 * Codec aliases are case insensitive.
221 * <p/>
222 * The code alias is the short class name (without the package name).
223 * If the short class name ends with 'Codec', then there are two aliases for
224 * the codec, the complete short class name and the short class name without
225 * the 'Codec' ending. For example for the 'GzipCodec' codec class name the
226 * alias are 'gzip' and 'gzipcodec'.
227 *
228 * @param codecName the canonical class name of the codec
229 * @return the codec object
230 */
231 public CompressionCodec getCodecByName(String codecName) {
232 if (codecsByClassName == null) {
233 return null;
234 }
235 CompressionCodec codec = getCodecByClassName(codecName);
236 if (codec == null) {
237 // trying to get the codec by name in case the name was specified instead a class
238 codec = codecsByName.get(codecName.toLowerCase());
239 }
240 return codec;
241 }
242
243 /**
244 * Find the relevant compression codec for the codec's canonical class name
245 * or by codec alias and returns its implemetation class.
246 * <p/>
247 * Codec aliases are case insensitive.
248 * <p/>
249 * The code alias is the short class name (without the package name).
250 * If the short class name ends with 'Codec', then there are two aliases for
251 * the codec, the complete short class name and the short class name without
252 * the 'Codec' ending. For example for the 'GzipCodec' codec class name the
253 * alias are 'gzip' and 'gzipcodec'.
254 *
255 * @param codecName the canonical class name of the codec
256 * @return the codec class
257 */
258 public Class<? extends CompressionCodec> getCodecClassByName(String codecName) {
259 CompressionCodec codec = getCodecByName(codecName);
260 if (codec == null) {
261 return null;
262 }
263 return codec.getClass();
264 }
265
266 /**
267 * Removes a suffix from a filename, if it has it.
268 * @param filename the filename to strip
269 * @param suffix the suffix to remove
270 * @return the shortened filename
271 */
272 public static String removeSuffix(String filename, String suffix) {
273 if (filename.endsWith(suffix)) {
274 return filename.substring(0, filename.length() - suffix.length());
275 }
276 return filename;
277 }
278
279 /**
280 * A little test program.
281 * @param args
282 */
283 public static void main(String[] args) throws Exception {
284 Configuration conf = new Configuration();
285 CompressionCodecFactory factory = new CompressionCodecFactory(conf);
286 boolean encode = false;
287 for(int i=0; i < args.length; ++i) {
288 if ("-in".equals(args[i])) {
289 encode = true;
290 } else if ("-out".equals(args[i])) {
291 encode = false;
292 } else {
293 CompressionCodec codec = factory.getCodec(new Path(args[i]));
294 if (codec == null) {
295 System.out.println("Codec for " + args[i] + " not found.");
296 } else {
297 if (encode) {
298 CompressionOutputStream out = null;
299 java.io.InputStream in = null;
300 try {
301 out = codec.createOutputStream(
302 new java.io.FileOutputStream(args[i]));
303 byte[] buffer = new byte[100];
304 String inFilename = removeSuffix(args[i],
305 codec.getDefaultExtension());
306 in = new java.io.FileInputStream(inFilename);
307 int len = in.read(buffer);
308 while (len > 0) {
309 out.write(buffer, 0, len);
310 len = in.read(buffer);
311 }
312 } finally {
313 if(out != null) { out.close(); }
314 if(in != null) { in.close(); }
315 }
316 } else {
317 CompressionInputStream in = null;
318 try {
319 in = codec.createInputStream(
320 new java.io.FileInputStream(args[i]));
321 byte[] buffer = new byte[100];
322 int len = in.read(buffer);
323 while (len > 0) {
324 System.out.write(buffer, 0, len);
325 len = in.read(buffer);
326 }
327 } finally {
328 if(in != null) { in.close(); }
329 }
330 }
331 }
332 }
333 }
334 }
335 }