001 /*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements. See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership. The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License. You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018 package org.apache.hadoop.io.compress;
019
020 import java.util.*;
021
022 import org.apache.commons.logging.Log;
023 import org.apache.commons.logging.LogFactory;
024 import org.apache.hadoop.classification.InterfaceAudience;
025 import org.apache.hadoop.classification.InterfaceStability;
026 import org.apache.hadoop.conf.Configuration;
027 import org.apache.hadoop.fs.Path;
028 import org.apache.hadoop.util.ReflectionUtils;
029
030 /**
031 * A factory that will find the correct codec for a given filename.
032 */
033 @InterfaceAudience.Public
034 @InterfaceStability.Evolving
035 public class CompressionCodecFactory {
036
037 public static final Log LOG =
038 LogFactory.getLog(CompressionCodecFactory.class.getName());
039
040 /**
041 * A map from the reversed filename suffixes to the codecs.
042 * This is probably overkill, because the maps should be small, but it
043 * automatically supports finding the longest matching suffix.
044 */
045 private SortedMap<String, CompressionCodec> codecs = null;
046
047 /**
048 * A map from the reversed filename suffixes to the codecs.
049 * This is probably overkill, because the maps should be small, but it
050 * automatically supports finding the longest matching suffix.
051 */
052 private Map<String, CompressionCodec> codecsByName = null;
053
054 /**
055 * A map from class names to the codecs
056 */
057 private HashMap<String, CompressionCodec> codecsByClassName = null;
058
059 private void addCodec(CompressionCodec codec) {
060 String suffix = codec.getDefaultExtension();
061 codecs.put(new StringBuilder(suffix).reverse().toString(), codec);
062 codecsByClassName.put(codec.getClass().getCanonicalName(), codec);
063
064 String codecName = codec.getClass().getSimpleName();
065 codecsByName.put(codecName.toLowerCase(), codec);
066 if (codecName.endsWith("Codec")) {
067 codecName = codecName.substring(0, codecName.length() - "Codec".length());
068 codecsByName.put(codecName.toLowerCase(), codec);
069 }
070 }
071
072 /**
073 * Print the extension map out as a string.
074 */
075 public String toString() {
076 StringBuilder buf = new StringBuilder();
077 Iterator<Map.Entry<String, CompressionCodec>> itr =
078 codecs.entrySet().iterator();
079 buf.append("{ ");
080 if (itr.hasNext()) {
081 Map.Entry<String, CompressionCodec> entry = itr.next();
082 buf.append(entry.getKey());
083 buf.append(": ");
084 buf.append(entry.getValue().getClass().getName());
085 while (itr.hasNext()) {
086 entry = itr.next();
087 buf.append(", ");
088 buf.append(entry.getKey());
089 buf.append(": ");
090 buf.append(entry.getValue().getClass().getName());
091 }
092 }
093 buf.append(" }");
094 return buf.toString();
095 }
096
097 /**
098 * Get the list of codecs listed in the configuration
099 * @param conf the configuration to look in
100 * @return a list of the Configuration classes or null if the attribute
101 * was not set
102 */
103 public static List<Class<? extends CompressionCodec>> getCodecClasses(Configuration conf) {
104 String codecsString = conf.get("io.compression.codecs");
105 if (codecsString != null) {
106 List<Class<? extends CompressionCodec>> result
107 = new ArrayList<Class<? extends CompressionCodec>>();
108 StringTokenizer codecSplit = new StringTokenizer(codecsString, ",");
109 while (codecSplit.hasMoreElements()) {
110 String codecSubstring = codecSplit.nextToken();
111 if (codecSubstring.length() != 0) {
112 try {
113 Class<?> cls = conf.getClassByName(codecSubstring);
114 if (!CompressionCodec.class.isAssignableFrom(cls)) {
115 throw new IllegalArgumentException("Class " + codecSubstring +
116 " is not a CompressionCodec");
117 }
118 result.add(cls.asSubclass(CompressionCodec.class));
119 } catch (ClassNotFoundException ex) {
120 throw new IllegalArgumentException("Compression codec " +
121 codecSubstring + " not found.",
122 ex);
123 }
124 }
125 }
126 return result;
127 } else {
128 return null;
129 }
130 }
131
132 /**
133 * Sets a list of codec classes in the configuration.
134 * @param conf the configuration to modify
135 * @param classes the list of classes to set
136 */
137 public static void setCodecClasses(Configuration conf,
138 List<Class> classes) {
139 StringBuilder buf = new StringBuilder();
140 Iterator<Class> itr = classes.iterator();
141 if (itr.hasNext()) {
142 Class cls = itr.next();
143 buf.append(cls.getName());
144 while(itr.hasNext()) {
145 buf.append(',');
146 buf.append(itr.next().getName());
147 }
148 }
149 conf.set("io.compression.codecs", buf.toString());
150 }
151
152 /**
153 * Find the codecs specified in the config value io.compression.codecs
154 * and register them. Defaults to gzip and zip.
155 */
156 public CompressionCodecFactory(Configuration conf) {
157 codecs = new TreeMap<String, CompressionCodec>();
158 codecsByClassName = new HashMap<String, CompressionCodec>();
159 codecsByName = new HashMap<String, CompressionCodec>();
160 List<Class<? extends CompressionCodec>> codecClasses = getCodecClasses(conf);
161 if (codecClasses == null) {
162 addCodec(new GzipCodec());
163 addCodec(new DefaultCodec());
164 } else {
165 Iterator<Class<? extends CompressionCodec>> itr = codecClasses.iterator();
166 while (itr.hasNext()) {
167 CompressionCodec codec = ReflectionUtils.newInstance(itr.next(), conf);
168 addCodec(codec);
169 }
170 }
171 }
172
173 /**
174 * Find the relevant compression codec for the given file based on its
175 * filename suffix.
176 * @param file the filename to check
177 * @return the codec object
178 */
179 public CompressionCodec getCodec(Path file) {
180 CompressionCodec result = null;
181 if (codecs != null) {
182 String filename = file.getName();
183 String reversedFilename = new StringBuilder(filename).reverse().toString();
184 SortedMap<String, CompressionCodec> subMap =
185 codecs.headMap(reversedFilename);
186 if (!subMap.isEmpty()) {
187 String potentialSuffix = subMap.lastKey();
188 if (reversedFilename.startsWith(potentialSuffix)) {
189 result = codecs.get(potentialSuffix);
190 }
191 }
192 }
193 return result;
194 }
195
196 /**
197 * Find the relevant compression codec for the codec's canonical class name.
198 * @param classname the canonical class name of the codec
199 * @return the codec object
200 */
201 public CompressionCodec getCodecByClassName(String classname) {
202 if (codecsByClassName == null) {
203 return null;
204 }
205 return codecsByClassName.get(classname);
206 }
207
208 /**
209 * Find the relevant compression codec for the codec's canonical class name
210 * or by codec alias.
211 * <p/>
212 * Codec aliases are case insensitive.
213 * <p/>
214 * The code alias is the short class name (without the package name).
215 * If the short class name ends with 'Codec', then there are two aliases for
216 * the codec, the complete short class name and the short class name without
217 * the 'Codec' ending. For example for the 'GzipCodec' codec class name the
218 * alias are 'gzip' and 'gzipcodec'.
219 *
220 * @param codecName the canonical class name of the codec
221 * @return the codec object
222 */
223 public CompressionCodec getCodecByName(String codecName) {
224 if (codecsByClassName == null) {
225 return null;
226 }
227 CompressionCodec codec = getCodecByClassName(codecName);
228 if (codec == null) {
229 // trying to get the codec by name in case the name was specified instead a class
230 codec = codecsByName.get(codecName.toLowerCase());
231 }
232 return codec;
233 }
234
235 /**
236 * Find the relevant compression codec for the codec's canonical class name
237 * or by codec alias and returns its implemetation class.
238 * <p/>
239 * Codec aliases are case insensitive.
240 * <p/>
241 * The code alias is the short class name (without the package name).
242 * If the short class name ends with 'Codec', then there are two aliases for
243 * the codec, the complete short class name and the short class name without
244 * the 'Codec' ending. For example for the 'GzipCodec' codec class name the
245 * alias are 'gzip' and 'gzipcodec'.
246 *
247 * @param codecName the canonical class name of the codec
248 * @return the codec class
249 */
250 public Class<? extends CompressionCodec> getCodecClassByName(String codecName) {
251 CompressionCodec codec = getCodecByName(codecName);
252 if (codec == null) {
253 return null;
254 }
255 return codec.getClass();
256 }
257
258 /**
259 * Removes a suffix from a filename, if it has it.
260 * @param filename the filename to strip
261 * @param suffix the suffix to remove
262 * @return the shortened filename
263 */
264 public static String removeSuffix(String filename, String suffix) {
265 if (filename.endsWith(suffix)) {
266 return filename.substring(0, filename.length() - suffix.length());
267 }
268 return filename;
269 }
270
271 /**
272 * A little test program.
273 * @param args
274 */
275 public static void main(String[] args) throws Exception {
276 Configuration conf = new Configuration();
277 CompressionCodecFactory factory = new CompressionCodecFactory(conf);
278 boolean encode = false;
279 for(int i=0; i < args.length; ++i) {
280 if ("-in".equals(args[i])) {
281 encode = true;
282 } else if ("-out".equals(args[i])) {
283 encode = false;
284 } else {
285 CompressionCodec codec = factory.getCodec(new Path(args[i]));
286 if (codec == null) {
287 System.out.println("Codec for " + args[i] + " not found.");
288 } else {
289 if (encode) {
290 CompressionOutputStream out = null;
291 java.io.InputStream in = null;
292 try {
293 out = codec.createOutputStream(
294 new java.io.FileOutputStream(args[i]));
295 byte[] buffer = new byte[100];
296 String inFilename = removeSuffix(args[i],
297 codec.getDefaultExtension());
298 in = new java.io.FileInputStream(inFilename);
299 int len = in.read(buffer);
300 while (len > 0) {
301 out.write(buffer, 0, len);
302 len = in.read(buffer);
303 }
304 } finally {
305 if(out != null) { out.close(); }
306 if(in != null) { in.close(); }
307 }
308 } else {
309 CompressionInputStream in = null;
310 try {
311 in = codec.createInputStream(
312 new java.io.FileInputStream(args[i]));
313 byte[] buffer = new byte[100];
314 int len = in.read(buffer);
315 while (len > 0) {
316 System.out.write(buffer, 0, len);
317 len = in.read(buffer);
318 }
319 } finally {
320 if(in != null) { in.close(); }
321 }
322 }
323 }
324 }
325 }
326 }
327 }