/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.io.compress;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;

import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.fs.Seekable;
import org.apache.hadoop.io.compress.bzip2.BZip2Constants;
import org.apache.hadoop.io.compress.bzip2.BZip2DummyCompressor;
import org.apache.hadoop.io.compress.bzip2.BZip2DummyDecompressor;
import org.apache.hadoop.io.compress.bzip2.CBZip2InputStream;
import org.apache.hadoop.io.compress.bzip2.CBZip2OutputStream;

/**
 * This class provides CompressionOutputStream and CompressionInputStream for
 * compression and decompression. Currently we don't have an implementation of
 * the Compressor and Decompressor interfaces, so those methods of
 * CompressionCodec which have a Compressor or Decompressor type argument
 * throw UnsupportedOperationException.
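 *
 * <p>For illustration, a minimal sketch of round-tripping data through this
 * codec (the file names and surrounding stream wiring are assumptions for the
 * example, not part of this class):
 *
 * <pre>{@code
 * BZip2Codec codec = new BZip2Codec();
 *
 * // Compress: the codec writes the "BZ" header followed by bzip2 blocks.
 * OutputStream raw = new java.io.FileOutputStream("data.bz2");
 * CompressionOutputStream cout = codec.createOutputStream(raw);
 * cout.write("hello world".getBytes());
 * cout.close();
 *
 * // Decompress: read the uncompressed bytes back.
 * InputStream compressed = new java.io.FileInputStream("data.bz2");
 * CompressionInputStream cin = codec.createInputStream(compressed);
 * int b;
 * while ((b = cin.read()) != -1) {
 *   // consume one uncompressed byte
 * }
 * cin.close();
 * }</pre>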
 */
@InterfaceAudience.Public
@InterfaceStability.Evolving
public class BZip2Codec implements SplittableCompressionCodec {

  private static final String HEADER = "BZ";
  private static final int HEADER_LEN = HEADER.length();
  private static final String SUB_HEADER = "h9";
  private static final int SUB_HEADER_LEN = SUB_HEADER.length();

  /**
   * Creates a new instance of BZip2Codec.
   */
  public BZip2Codec() { }

  /**
   * Creates a CompressionOutputStream for BZip2.
   *
   * @param out the output stream
   * @return the BZip2 CompressionOutputStream
   * @throws java.io.IOException if an I/O error occurs
   */
  public CompressionOutputStream createOutputStream(OutputStream out)
      throws IOException {
    return new BZip2CompressionOutputStream(out);
  }

  /**
   * Creates a CompressionOutputStream for BZip2 that writes to the given
   * OutputStream. The given {@link Compressor} is ignored, since a native
   * compressor is not currently supported.
   *
   * @param out the output stream
   * @param compressor the compressor to use (ignored)
   * @return the BZip2 CompressionOutputStream
   * @throws java.io.IOException if an I/O error occurs
   */
  public CompressionOutputStream createOutputStream(OutputStream out,
      Compressor compressor) throws IOException {
    return createOutputStream(out);
  }

  /**
   * This functionality is currently not supported.
   *
   * @return BZip2DummyCompressor.class
   */
  public Class<? extends org.apache.hadoop.io.compress.Compressor> getCompressorType() {
    return BZip2DummyCompressor.class;
  }

  /**
   * This functionality is currently not supported.
   *
   * @return Compressor
   */
  public Compressor createCompressor() {
    return new BZip2DummyCompressor();
  }

  /**
   * Creates a CompressionInputStream to be used to read off uncompressed data.
   *
   * @param in the input stream
   * @return a CompressionInputStream for BZip2
   * @throws java.io.IOException if an I/O error occurs
   */
  public CompressionInputStream createInputStream(InputStream in)
      throws IOException {
    return new BZip2CompressionInputStream(in);
  }

  /**
   * Creates a CompressionInputStream for BZip2. The given {@link Decompressor}
   * is ignored, since a native decompressor is not currently supported.
   *
   * @param in the input stream
   * @param decompressor the decompressor to use (ignored)
   * @return a CompressionInputStream for BZip2
   * @throws java.io.IOException if an I/O error occurs
   */
  public CompressionInputStream createInputStream(InputStream in,
      Decompressor decompressor) throws IOException {
    return createInputStream(in);
  }

  /**
   * Creates a CompressionInputStream to be used to read off uncompressed data
   * in one of the two reading modes, i.e. continuous (CONTINUOUS) or
   * block-aligned (BYBLOCK).
   *
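   * <p>For illustration, a rough sketch of how a split reader might call this
   * method (the {@code fileIn} stream and the split offsets are assumptions
   * for the example; any stream that implements Seekable, such as an
   * FSDataInputStream, will do):
   *
   * <pre>{@code
   * BZip2Codec codec = new BZip2Codec();
   * SplitCompressionInputStream cin = codec.createInputStream(
   *     fileIn, codec.createDecompressor(), splitStart, splitEnd,
   *     SplittableCompressionCodec.READ_MODE.BYBLOCK);
   *
   * // The stream aligns itself at a BZip2 block boundary, so the reported
   * // starting position may differ from splitStart.
   * long adjustedStart = cin.getPos();
   * }</pre>
   *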
   * @param seekableIn the input stream; must also implement {@link Seekable}
   * @param decompressor the decompressor to use (ignored)
   * @param start the start offset into the compressed stream
   * @param end the end offset into the compressed stream
   * @param readMode controls whether progress is reported continuously or
   *                 only at block boundaries
   *
   * @return a CompressionInputStream for BZip2 aligned at a block boundary
   * @throws java.io.IOException if seekableIn does not implement Seekable or
   *                             an I/O error occurs
   */
  public SplitCompressionInputStream createInputStream(InputStream seekableIn,
      Decompressor decompressor, long start, long end, READ_MODE readMode)
      throws IOException {

    if (!(seekableIn instanceof Seekable)) {
      throw new IOException("seekableIn must be an instance of " +
          Seekable.class.getName());
    }

    // find the position of the first BZip2 start-of-block marker
    ((Seekable)seekableIn).seek(0);

    // BZip2 start-of-block markers are 6 bytes long.  The very first block is
    // also preceded by the "BZh9" stream header, making it 10 bytes.  This is
    // the common case, but at times the stream might start without a leading
    // "BZ".
    final long FIRST_BZIP2_BLOCK_MARKER_POSITION =
      CBZip2InputStream.numberOfBytesTillNextMarker(seekableIn);
    long adjStart = Math.max(0L, start - FIRST_BZIP2_BLOCK_MARKER_POSITION);

    ((Seekable)seekableIn).seek(adjStart);
    SplitCompressionInputStream in =
      new BZip2CompressionInputStream(seekableIn, adjStart, end, readMode);

    // The following if clause handles the case illustrated below.
    // Assume the following scenario in a BZip2 compressed stream, where
    // "." represents compressed data.
    // .....[48 bit Block].....[48 bit   Block].....[48 bit Block]...
    // ........................[47 bits][1 bit].....[48 bit Block]...
    // ................................^[Assume a byte alignment here]
    // ........................................^^[current position of stream]
    // .....................^^[We go back 10 bytes in the stream and find a block marker]
    // ........................................^^[We would align at the wrong position!]
    // ...........................................................^^[while this position is correct]

    if (in.getPos() <= start) {
      ((Seekable)seekableIn).seek(start);
      in = new BZip2CompressionInputStream(seekableIn, start, end, readMode);
    }

    return in;
  }

  /**
   * This functionality is currently not supported.
   *
   * @return BZip2DummyDecompressor.class
   */
  public Class<? extends org.apache.hadoop.io.compress.Decompressor> getDecompressorType() {
    return BZip2DummyDecompressor.class;
  }

  /**
   * This functionality is currently not supported.
   *
   * @return Decompressor
   */
  public Decompressor createDecompressor() {
    return new BZip2DummyDecompressor();
  }

  /**
   * .bz2 is recognized as the default extension for compressed BZip2 files.
   *
   * @return the default BZip2 file extension, ".bz2"
   */
  public String getDefaultExtension() {
    return ".bz2";
  }

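  /**
   * A CompressionOutputStream that wraps {@link CBZip2OutputStream}.  The
   * leading "BZ" stream header is written lazily on the first write (or on
   * finish/close when nothing has been written), so the output is always
   * recognizable by BZip2CompressionInputStream.
   */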
  private static class BZip2CompressionOutputStream extends
      CompressionOutputStream {

    // class data starts here//
    private CBZip2OutputStream output;
    private boolean needsReset;
    // class data ends here//

    public BZip2CompressionOutputStream(OutputStream out)
        throws IOException {
      super(out);
      needsReset = true;
    }

    private void writeStreamHeader() throws IOException {
      if (super.out != null) {
        // The compressed bzip2 stream should start with the
        // identifying characters BZ.  The caller of CBZip2OutputStream,
        // i.e. this class, must write these characters.
        out.write(HEADER.getBytes());
      }
    }

    public void finish() throws IOException {
      if (needsReset) {
        // In the case that nothing is written to this stream, we still need to
        // write out the header before closing, otherwise the stream won't be
        // recognized by BZip2CompressionInputStream.
        internalReset();
      }
      this.output.finish();
      needsReset = true;
    }

    private void internalReset() throws IOException {
      if (needsReset) {
        needsReset = false;
        writeStreamHeader();
        this.output = new CBZip2OutputStream(out);
      }
    }

    public void resetState() throws IOException {
      // Cannot write to out at this point because out might not be ready
      // yet, as in SequenceFile.Writer implementation.
      needsReset = true;
    }

    public void write(int b) throws IOException {
      if (needsReset) {
        internalReset();
      }
      this.output.write(b);
    }

    public void write(byte[] b, int off, int len) throws IOException {
      if (needsReset) {
        internalReset();
      }
      this.output.write(b, off, len);
    }

    public void close() throws IOException {
      if (needsReset) {
        // In the case that nothing is written to this stream, we still need to
        // write out the header before closing, otherwise the stream won't be
        // recognized by BZip2CompressionInputStream.
        internalReset();
      }
      this.output.flush();
      this.output.close();
      needsReset = true;
    }

  }// end of class BZip2CompressionOutputStream

  /**
   * This class is capable of decompressing BZip2 data in two modes,
   * CONTINUOUS and BYBLOCK.  BYBLOCK mode makes it possible to start
   * decompression at an arbitrary position in the stream.
   *
   * This facility can therefore be used to parallelize decompression
   * of a large BZip2 file for performance reasons, which is exactly
   * what the Hadoop framework does (see LineRecordReader for an
   * example).  One can break the file, logically, into chunks for
   * parallel processing.  These "splits" should be like default Hadoop
   * splits (e.g. as in the FileInputFormat getSplits method).  This
   * code is designed and tested for FileInputFormat's way of splitting
   * only.
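   *
   * <p>A rough sketch of the intended splitting pattern (the split size and
   * file length below are assumptions for the example; the real split
   * computation lives in FileInputFormat):
   *
   * <pre>{@code
   * long fileLength = 1024L * 1024 * 1024;   // assumed compressed file length
   * long splitSize  = 128L * 1024 * 1024;    // assumed split size
   * for (long off = 0; off < fileLength; off += splitSize) {
   *   long len = Math.min(splitSize, fileLength - off);
   *   // The range [off, off + len) becomes one split, handed to one task,
   *   // which opens it via BZip2Codec.createInputStream(..., off, off + len,
   *   // READ_MODE.BYBLOCK).
   * }
   * }</pre>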
   */
  private static class BZip2CompressionInputStream extends
      SplitCompressionInputStream {

    // class data starts here//
    private CBZip2InputStream input;
    boolean needsReset;
    private BufferedInputStream bufferedIn;
    private boolean isHeaderStripped = false;
    private boolean isSubHeaderStripped = false;
    private READ_MODE readMode = READ_MODE.CONTINUOUS;
    private long startingPos = 0L;

    // The following state machine tracks the state of the compressed stream
    // position:
    // HOLD : don't advertise the compressed stream position
    // ADVERTISE : read one more byte and advertise the stream position
    // See the comments before the updatePos method.
    private enum POS_ADVERTISEMENT_STATE_MACHINE {
      HOLD, ADVERTISE
    };

    POS_ADVERTISEMENT_STATE_MACHINE posSM = POS_ADVERTISEMENT_STATE_MACHINE.HOLD;
    long compressedStreamPosition = 0;

    // class data ends here//

    public BZip2CompressionInputStream(InputStream in) throws IOException {
      this(in, 0L, Long.MAX_VALUE, READ_MODE.CONTINUOUS);
    }

    public BZip2CompressionInputStream(InputStream in, long start, long end,
        READ_MODE readMode) throws IOException {
      super(in, start, end);
      needsReset = false;
      bufferedIn = new BufferedInputStream(super.in);
      this.startingPos = super.getPos();
      this.readMode = readMode;
      if (this.startingPos == 0) {
        // We only strip the header if we are at the start of the file
        bufferedIn = readStreamHeader();
      }
      input = new CBZip2InputStream(bufferedIn, readMode);
      if (this.isHeaderStripped) {
        input.updateReportedByteCount(HEADER_LEN);
      }

      if (this.isSubHeaderStripped) {
        input.updateReportedByteCount(SUB_HEADER_LEN);
      }

      this.updatePos(false);
    }

    private BufferedInputStream readStreamHeader() throws IOException {
      // We are flexible enough to allow the compressed stream not to
      // start with the "BZ" header, so this works whether the header
      // is present or not.
      if (super.in != null) {
        bufferedIn.mark(HEADER_LEN);
        byte[] headerBytes = new byte[HEADER_LEN];
        int actualRead = bufferedIn.read(headerBytes, 0, HEADER_LEN);
        if (actualRead != -1) {
          String header = new String(headerBytes);
          if (header.compareTo(HEADER) != 0) {
            bufferedIn.reset();
          } else {
            this.isHeaderStripped = true;
            // In BYBLOCK mode, we also want to strip off the remaining
            // two characters of the header.
            if (this.readMode == READ_MODE.BYBLOCK) {
              actualRead = bufferedIn.read(headerBytes, 0,
                  SUB_HEADER_LEN);
              if (actualRead != -1) {
                this.isSubHeaderStripped = true;
              }
            }
          }
        }
      }

      if (bufferedIn == null) {
        throw new IOException("Failed to read bzip2 stream.");
      }

      return bufferedIn;

    }// end of method

    public void close() throws IOException {
      if (!needsReset) {
        input.close();
        needsReset = true;
      }
    }

    /**
     * This method updates the compressed stream position exactly when the
     * client of this code has read at least one byte past any BZip2
     * end-of-block marker.
     *
     * This mechanism is very helpful for dealing with record boundaries in
     * the data.  Please see the constructor and the next() method of
     * org.apache.hadoop.mapred.LineRecordReader for an example usage of this
     * feature.  We elaborate with an example in the following:
     *
     * Assume two different scenarios of a BZip2 compressed stream, where
     * [m] represents an end-of-block marker, \n is the line delimiter and
     * . represents compressed data.
     *
     * ............[m]......\n.......
     *
     * ..........\n[m]......\n.......
     *
     * Assume that end is right after [m].  In the first case reading stops
     * at \n and there is no need to read one more line.  (The reason for
     * reading one more line in the next() method is explained in
     * LineRecordReader.)  In the second case LineRecordReader needs to read
     * one more line (up to the second \n).  Since BZip2Codec only updates
     * the position once at least one byte past a marker has been read, it is
     * straightforward to differentiate between the two cases.
     *
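     * <p>A minimal sketch of how a caller might rely on this behaviour (the
     * stream {@code cin} and the split end offset {@code end} are assumptions
     * for the example; LineRecordReader is the real consumer):
     *
     * <pre>{@code
     * byte[] buf = new byte[64 * 1024];
     * // Because getPos() only moves past an end-of-block marker after at
     * // least one extra byte has been read, this loop also consumes the
     * // data that straddles 'end'.
     * while (cin.getPos() <= end) {
     *   int n = cin.read(buf, 0, buf.length);   // advances getPos()
     *   if (n < 0) {
     *     break;                                // end of stream
     *   }
     *   // process the uncompressed bytes in buf[0..n)
     * }
     * }</pre>
     *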
     */
    public int read(byte[] b, int off, int len) throws IOException {
      if (needsReset) {
        internalReset();
      }

      int result = 0;
      result = this.input.read(b, off, len);
      if (result == BZip2Constants.END_OF_BLOCK) {
        this.posSM = POS_ADVERTISEMENT_STATE_MACHINE.ADVERTISE;
      }

      if (this.posSM == POS_ADVERTISEMENT_STATE_MACHINE.ADVERTISE) {
        // Read exactly one more byte past the end-of-block marker, per the
        // ADVERTISE contract described above.
        result = this.input.read(b, off, 1);
        // This is the precise time to update the compressed stream position
        // for the client of this code.
        this.updatePos(true);
        this.posSM = POS_ADVERTISEMENT_STATE_MACHINE.HOLD;
      }

      return result;
    }

    public int read() throws IOException {
      byte b[] = new byte[1];
      int result = this.read(b, 0, 1);
      return (result < 0) ? result : (b[0] & 0xff);
    }

    private void internalReset() throws IOException {
      if (needsReset) {
        needsReset = false;
        BufferedInputStream bufferedIn = readStreamHeader();
        input = new CBZip2InputStream(bufferedIn, this.readMode);
      }
    }

    public void resetState() throws IOException {
      // Cannot read from bufferedIn at this point because bufferedIn
      // might not be ready yet, as in SequenceFile.Reader implementation.
      needsReset = true;
    }

    public long getPos() {
      return this.compressedStreamPosition;
    }

    /*
     * As the comment before the read method explains, the compressed stream
     * position is advertised only once at least one byte past an end-of-block
     * (EOB) marker has been read.  There is one exception to this rule: when
     * the stream is constructed, we advertise the position exactly at the
     * EOB.  The shouldAddOn flag in the following method captures this
     * exception.
     */
    private void updatePos(boolean shouldAddOn) {
      int addOn = shouldAddOn ? 1 : 0;
      this.compressedStreamPosition = this.startingPos
          + this.input.getProcessedByteCount() + addOn;
    }

  }// end of BZip2CompressionInputStream

}