001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     *     http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing, software
013     * distributed under the License is distributed on an "AS IS" BASIS,
014     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015     * See the License for the specific language governing permissions and
016     * limitations under the License.
017     */
018    package org.apache.hadoop.io.compress;
019    
020    import java.util.*;
021    
022    import org.apache.commons.logging.Log;
023    import org.apache.commons.logging.LogFactory;
024    import org.apache.hadoop.classification.InterfaceAudience;
025    import org.apache.hadoop.classification.InterfaceStability;
026    import org.apache.hadoop.conf.Configuration;
027    import org.apache.hadoop.fs.Path;
028    import org.apache.hadoop.util.ReflectionUtils;
029    
030    /**
031     * A factory that will find the correct codec for a given filename.
032     */
033    @InterfaceAudience.Public
034    @InterfaceStability.Evolving
035    public class CompressionCodecFactory {
036    
037      public static final Log LOG =
038        LogFactory.getLog(CompressionCodecFactory.class.getName());
039    
040      /**
041       * A map from the reversed filename suffixes to the codecs.
042       * This is probably overkill, because the maps should be small, but it 
043       * automatically supports finding the longest matching suffix. 
044       */
045      private SortedMap<String, CompressionCodec> codecs = null;
046    
047        /**
048         * A map from the reversed filename suffixes to the codecs.
049         * This is probably overkill, because the maps should be small, but it
050         * automatically supports finding the longest matching suffix.
051         */
052        private Map<String, CompressionCodec> codecsByName = null;
053    
054      /**
055       * A map from class names to the codecs
056       */
057      private HashMap<String, CompressionCodec> codecsByClassName = null;
058    
059      private void addCodec(CompressionCodec codec) {
060        String suffix = codec.getDefaultExtension();
061        codecs.put(new StringBuilder(suffix).reverse().toString(), codec);
062        codecsByClassName.put(codec.getClass().getCanonicalName(), codec);
063    
064        String codecName = codec.getClass().getSimpleName();
065        codecsByName.put(codecName.toLowerCase(), codec);
066        if (codecName.endsWith("Codec")) {
067          codecName = codecName.substring(0, codecName.length() - "Codec".length());
068          codecsByName.put(codecName.toLowerCase(), codec);
069        }
070      }
071    
072      /**
073       * Print the extension map out as a string.
074       */
075      public String toString() {
076        StringBuilder buf = new StringBuilder();
077        Iterator<Map.Entry<String, CompressionCodec>> itr = 
078          codecs.entrySet().iterator();
079        buf.append("{ ");
080        if (itr.hasNext()) {
081          Map.Entry<String, CompressionCodec> entry = itr.next();
082          buf.append(entry.getKey());
083          buf.append(": ");
084          buf.append(entry.getValue().getClass().getName());
085          while (itr.hasNext()) {
086            entry = itr.next();
087            buf.append(", ");
088            buf.append(entry.getKey());
089            buf.append(": ");
090            buf.append(entry.getValue().getClass().getName());
091          }
092        }
093        buf.append(" }");
094        return buf.toString();
095      }
096    
097      /**
098       * Get the list of codecs listed in the configuration
099       * @param conf the configuration to look in
100       * @return a list of the Configuration classes or null if the attribute
101       *         was not set
102       */
103      public static List<Class<? extends CompressionCodec>> getCodecClasses(Configuration conf) {
104        String codecsString = conf.get("io.compression.codecs");
105        if (codecsString != null) {
106          List<Class<? extends CompressionCodec>> result
107            = new ArrayList<Class<? extends CompressionCodec>>();
108          StringTokenizer codecSplit = new StringTokenizer(codecsString, ",");
109          while (codecSplit.hasMoreElements()) {
110            String codecSubstring = codecSplit.nextToken().trim();
111            if (codecSubstring.length() != 0) {
112              try {
113                Class<?> cls = conf.getClassByName(codecSubstring);
114                if (!CompressionCodec.class.isAssignableFrom(cls)) {
115                  throw new IllegalArgumentException("Class " + codecSubstring +
116                                                     " is not a CompressionCodec");
117                }
118                result.add(cls.asSubclass(CompressionCodec.class));
119              } catch (ClassNotFoundException ex) {
120                throw new IllegalArgumentException("Compression codec " + 
121                                                   codecSubstring + " not found.",
122                                                   ex);
123              }
124            }
125          }
126          return result;
127        } else {
128          return null;
129        }
130      }
131      
132      /**
133       * Sets a list of codec classes in the configuration.
134       * @param conf the configuration to modify
135       * @param classes the list of classes to set
136       */
137      public static void setCodecClasses(Configuration conf,
138                                         List<Class> classes) {
139        StringBuilder buf = new StringBuilder();
140        Iterator<Class> itr = classes.iterator();
141        if (itr.hasNext()) {
142          Class cls = itr.next();
143          buf.append(cls.getName());
144          while(itr.hasNext()) {
145            buf.append(',');
146            buf.append(itr.next().getName());
147          }
148        }
149        conf.set("io.compression.codecs", buf.toString());   
150      }
151      
152      /**
153       * Find the codecs specified in the config value io.compression.codecs 
154       * and register them. Defaults to gzip and zip.
155       */
156      public CompressionCodecFactory(Configuration conf) {
157        codecs = new TreeMap<String, CompressionCodec>();
158        codecsByClassName = new HashMap<String, CompressionCodec>();
159        codecsByName = new HashMap<String, CompressionCodec>();
160        List<Class<? extends CompressionCodec>> codecClasses = getCodecClasses(conf);
161        if (codecClasses == null) {
162          addCodec(new GzipCodec());
163          addCodec(new DefaultCodec());      
164        } else {
165          Iterator<Class<? extends CompressionCodec>> itr = codecClasses.iterator();
166          while (itr.hasNext()) {
167            CompressionCodec codec = ReflectionUtils.newInstance(itr.next(), conf);
168            addCodec(codec);     
169          }
170        }
171      }
172      
173      /**
174       * Find the relevant compression codec for the given file based on its
175       * filename suffix.
176       * @param file the filename to check
177       * @return the codec object
178       */
179      public CompressionCodec getCodec(Path file) {
180        CompressionCodec result = null;
181        if (codecs != null) {
182          String filename = file.getName();
183          String reversedFilename = new StringBuilder(filename).reverse().toString();
184          SortedMap<String, CompressionCodec> subMap = 
185            codecs.headMap(reversedFilename);
186          if (!subMap.isEmpty()) {
187            String potentialSuffix = subMap.lastKey();
188            if (reversedFilename.startsWith(potentialSuffix)) {
189              result = codecs.get(potentialSuffix);
190            }
191          }
192        }
193        return result;
194      }
195      
196      /**
197       * Find the relevant compression codec for the codec's canonical class name.
198       * @param classname the canonical class name of the codec
199       * @return the codec object
200       */
201      public CompressionCodec getCodecByClassName(String classname) {
202        if (codecsByClassName == null) {
203          return null;
204        }
205        return codecsByClassName.get(classname);
206      }
207    
208        /**
209         * Find the relevant compression codec for the codec's canonical class name
210         * or by codec alias.
211         * <p/>
212         * Codec aliases are case insensitive.
213         * <p/>
214         * The code alias is the short class name (without the package name).
215         * If the short class name ends with 'Codec', then there are two aliases for
216         * the codec, the complete short class name and the short class name without
217         * the 'Codec' ending. For example for the 'GzipCodec' codec class name the
218         * alias are 'gzip' and 'gzipcodec'.
219         *
220         * @param codecName the canonical class name of the codec
221         * @return the codec object
222         */
223        public CompressionCodec getCodecByName(String codecName) {
224          if (codecsByClassName == null) {
225            return null;
226          }
227          CompressionCodec codec = getCodecByClassName(codecName);
228          if (codec == null) {
229            // trying to get the codec by name in case the name was specified instead a class
230            codec = codecsByName.get(codecName.toLowerCase());
231          }
232          return codec;
233        }
234    
235        /**
236         * Find the relevant compression codec for the codec's canonical class name
237         * or by codec alias and returns its implemetation class.
238         * <p/>
239         * Codec aliases are case insensitive.
240         * <p/>
241         * The code alias is the short class name (without the package name).
242         * If the short class name ends with 'Codec', then there are two aliases for
243         * the codec, the complete short class name and the short class name without
244         * the 'Codec' ending. For example for the 'GzipCodec' codec class name the
245         * alias are 'gzip' and 'gzipcodec'.
246         *
247         * @param codecName the canonical class name of the codec
248         * @return the codec class
249         */
250        public Class<? extends CompressionCodec> getCodecClassByName(String codecName) {
251          CompressionCodec codec = getCodecByName(codecName);
252          if (codec == null) {
253            return null;
254          }
255          return codec.getClass();
256        }
257    
258      /**
259       * Removes a suffix from a filename, if it has it.
260       * @param filename the filename to strip
261       * @param suffix the suffix to remove
262       * @return the shortened filename
263       */
264      public static String removeSuffix(String filename, String suffix) {
265        if (filename.endsWith(suffix)) {
266          return filename.substring(0, filename.length() - suffix.length());
267        }
268        return filename;
269      }
270      
271      /**
272       * A little test program.
273       * @param args
274       */
275      public static void main(String[] args) throws Exception {
276        Configuration conf = new Configuration();
277        CompressionCodecFactory factory = new CompressionCodecFactory(conf);
278        boolean encode = false;
279        for(int i=0; i < args.length; ++i) {
280          if ("-in".equals(args[i])) {
281            encode = true;
282          } else if ("-out".equals(args[i])) {
283            encode = false;
284          } else {
285            CompressionCodec codec = factory.getCodec(new Path(args[i]));
286            if (codec == null) {
287              System.out.println("Codec for " + args[i] + " not found.");
288            } else { 
289              if (encode) {
290                CompressionOutputStream out = null;
291                java.io.InputStream in = null;
292                try {
293                  out = codec.createOutputStream(
294                      new java.io.FileOutputStream(args[i]));
295                  byte[] buffer = new byte[100];
296                  String inFilename = removeSuffix(args[i], 
297                      codec.getDefaultExtension());
298                  in = new java.io.FileInputStream(inFilename);
299                  int len = in.read(buffer);
300                  while (len > 0) {
301                    out.write(buffer, 0, len);
302                    len = in.read(buffer);
303                  }
304                } finally {
305                  if(out != null) { out.close(); }
306                  if(in  != null) { in.close(); }
307                }
308              } else {
309                CompressionInputStream in = null;
310                try {
311                  in = codec.createInputStream(
312                      new java.io.FileInputStream(args[i]));
313                  byte[] buffer = new byte[100];
314                  int len = in.read(buffer);
315                  while (len > 0) {
316                    System.out.write(buffer, 0, len);
317                    len = in.read(buffer);
318                  }
319                } finally {
320                  if(in != null) { in.close(); }
321                }
322              }
323            }
324          }
325        }
326      }
327    }