001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing,
013 * software distributed under the License is distributed on an
014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015 * KIND, either express or implied.  See the License for the
016 * specific language governing permissions and limitations
017 * under the License.
018 */
019package org.apache.commons.compress.compressors.gzip;
020
021import java.io.ByteArrayOutputStream;
022import java.io.IOException;
023import java.io.EOFException;
024import java.io.InputStream;
025import java.io.DataInput;
026import java.io.DataInputStream;
027import java.io.BufferedInputStream;
028import java.util.zip.DataFormatException;
029import java.util.zip.Deflater;
030import java.util.zip.Inflater;
031import java.util.zip.CRC32;
032
033import org.apache.commons.compress.compressors.CompressorInputStream;
034import org.apache.commons.compress.utils.ByteUtils;
035import org.apache.commons.compress.utils.CharsetNames;
036
037/**
038 * Input stream that decompresses .gz files.
039 *
040 * <p>This supports decompressing concatenated .gz files which is important
041 * when decompressing standalone .gz files.</p>
042 *
043 * <p>
044 * {@link java.util.zip.GZIPInputStream} doesn't decompress concatenated .gz
045 * files: it stops after the first member and silently ignores the rest.
046 * It doesn't leave the read position to point to the beginning of the next
047 * member, which makes it difficult workaround the lack of concatenation
048 * support.
049 * </p>
050 *
051 * <p>
052 * Instead of using <code>GZIPInputStream</code>, this class has its own .gz
053 * container format decoder. The actual decompression is done with
054 * {@link java.util.zip.Inflater}.
055 * </p>
056 *
057 * <p>If you use the constructor {@code GzipCompressorInputStream(in)}
058 * or {@code GzipCompressorInputStream(in, false)} with some {@code
059 * InputStream} {@code in} then {@link #read} will return -1 as soon
060 * as the first internal member has been read completely. The stream
061 * {@code in} will be positioned at the start of the second gzip
062 * member if there is one.</p>
063 *
064 * <p>If you use the constructor {@code GzipCompressorInputStream(in,
065 * true)} with some {@code InputStream} {@code in} then {@link #read}
066 * will return -1 once the stream {@code in} has been exhausted. The
067 * data read from a stream constructed this way will consist of the
068 * concatenated data of all gzip members contained inside {@code
069 * in}.</p>
070 *
071 * @see "https://tools.ietf.org/html/rfc1952"
072 */
073public class GzipCompressorInputStream extends CompressorInputStream {
074    // Header flags
075    // private static final int FTEXT = 0x01; // Uninteresting for us
076    private static final int FHCRC = 0x02;
077    private static final int FEXTRA = 0x04;
078    private static final int FNAME = 0x08;
079    private static final int FCOMMENT = 0x10;
080    private static final int FRESERVED = 0xE0;
081
082    // Compressed input stream, possibly wrapped in a BufferedInputStream
083    private final InputStream in;
084
085    // True if decompressing multi member streams.
086    private final boolean decompressConcatenated;
087
088    // Buffer to hold the input data
089    private final byte[] buf = new byte[8192];
090
091    // Amount of data in buf.
092    private int bufUsed;
093
094    // Decompressor
095    private Inflater inf = new Inflater(true);
096
097    // CRC32 from uncompressed data
098    private final CRC32 crc = new CRC32();
099
100    // True once everything has been decompressed
101    private boolean endReached = false;
102
103    // used in no-arg read method
104    private final byte[] oneByte = new byte[1];
105
106    private final GzipParameters parameters = new GzipParameters();
107
108    /**
109     * Constructs a new input stream that decompresses gzip-compressed data
110     * from the specified input stream.
111     * <p>
112     * This is equivalent to
113     * <code>GzipCompressorInputStream(inputStream, false)</code> and thus
114     * will not decompress concatenated .gz files.
115     *
116     * @param inputStream  the InputStream from which this object should
117     *                     be created of
118     *
119     * @throws IOException if the stream could not be created
120     */
121    public GzipCompressorInputStream(final InputStream inputStream)
122            throws IOException {
123        this(inputStream, false);
124    }
125
126    /**
127     * Constructs a new input stream that decompresses gzip-compressed data
128     * from the specified input stream.
129     * <p>
130     * If <code>decompressConcatenated</code> is {@code false}:
131     * This decompressor might read more input than it will actually use.
132     * If <code>inputStream</code> supports <code>mark</code> and
133     * <code>reset</code>, then the input position will be adjusted
134     * so that it is right after the last byte of the compressed stream.
135     * If <code>mark</code> isn't supported, the input position will be
136     * undefined.
137     *
138     * @param inputStream  the InputStream from which this object should
139     *                     be created of
140     * @param decompressConcatenated
141     *                     if true, decompress until the end of the input;
142     *                     if false, stop after the first .gz member
143     *
144     * @throws IOException if the stream could not be created
145     */
146    public GzipCompressorInputStream(final InputStream inputStream,
147                                     final boolean decompressConcatenated)
148            throws IOException {
149        // Mark support is strictly needed for concatenated files only,
150        // but it's simpler if it is always available.
151        if (inputStream.markSupported()) {
152            in = inputStream;
153        } else {
154            in = new BufferedInputStream(inputStream);
155        }
156
157        this.decompressConcatenated = decompressConcatenated;
158        init(true);
159    }
160
161    /**
162     * Provides the stream's meta data - may change with each stream
163     * when decompressing concatenated streams.
164     * @return the stream's meta data
165     * @since 1.8
166     */
167    public GzipParameters getMetaData() {
168        return parameters;
169    }
170
171    private boolean init(final boolean isFirstMember) throws IOException {
172        assert isFirstMember || decompressConcatenated;
173
174        // Check the magic bytes without a possibility of EOFException.
175        final int magic0 = in.read();
176        final int magic1 = in.read();
177
178        // If end of input was reached after decompressing at least
179        // one .gz member, we have reached the end of the file successfully.
180        if (magic0 == -1 && !isFirstMember) {
181            return false;
182        }
183
184        if (magic0 != 31 || magic1 != 139) {
185            throw new IOException(isFirstMember
186                                  ? "Input is not in the .gz format"
187                                  : "Garbage after a valid .gz stream");
188        }
189
190        // Parsing the rest of the header may throw EOFException.
191        final DataInput inData = new DataInputStream(in);
192        final int method = inData.readUnsignedByte();
193        if (method != Deflater.DEFLATED) {
194            throw new IOException("Unsupported compression method "
195                                  + method + " in the .gz header");
196        }
197
198        final int flg = inData.readUnsignedByte();
199        if ((flg & FRESERVED) != 0) {
200            throw new IOException(
201                    "Reserved flags are set in the .gz header");
202        }
203
204        parameters.setModificationTime(ByteUtils.fromLittleEndian(inData, 4) * 1000);
205        switch (inData.readUnsignedByte()) { // extra flags
206        case 2:
207            parameters.setCompressionLevel(Deflater.BEST_COMPRESSION);
208            break;
209        case 4:
210            parameters.setCompressionLevel(Deflater.BEST_SPEED);
211            break;
212        default:
213            // ignored for now
214            break;
215        }
216        parameters.setOperatingSystem(inData.readUnsignedByte());
217
218        // Extra field, ignored
219        if ((flg & FEXTRA) != 0) {
220            int xlen = inData.readUnsignedByte();
221            xlen |= inData.readUnsignedByte() << 8;
222
223            // This isn't as efficient as calling in.skip would be,
224            // but it's lazier to handle unexpected end of input this way.
225            // Most files don't have an extra field anyway.
226            while (xlen-- > 0) {
227                inData.readUnsignedByte();
228            }
229        }
230
231        // Original file name
232        if ((flg & FNAME) != 0) {
233            parameters.setFilename(new String(readToNull(inData),
234                                              CharsetNames.ISO_8859_1));
235        }
236
237        // Comment
238        if ((flg & FCOMMENT) != 0) {
239            parameters.setComment(new String(readToNull(inData),
240                                             CharsetNames.ISO_8859_1));
241        }
242
243        // Header "CRC16" which is actually a truncated CRC32 (which isn't
244        // as good as real CRC16). I don't know if any encoder implementation
245        // sets this, so it's not worth trying to verify it. GNU gzip 1.4
246        // doesn't support this field, but zlib seems to be able to at least
247        // skip over it.
248        if ((flg & FHCRC) != 0) {
249            inData.readShort();
250        }
251
252        // Reset
253        inf.reset();
254        crc.reset();
255
256        return true;
257    }
258
259    private static byte[] readToNull(final DataInput inData) throws IOException {
260        final ByteArrayOutputStream bos = new ByteArrayOutputStream();
261        int b = 0;
262        while ((b = inData.readUnsignedByte()) != 0x00) { // NOPMD
263            bos.write(b);
264        }
265        return bos.toByteArray();
266    }
267
268    @Override
269    public int read() throws IOException {
270        return read(oneByte, 0, 1) == -1 ? -1 : oneByte[0] & 0xFF;
271    }
272
273    /**
274     * {@inheritDoc}
275     *
276     * @since 1.1
277     */
278    @Override
279    public int read(final byte[] b, int off, int len) throws IOException {
280        if (endReached) {
281            return -1;
282        }
283
284        int size = 0;
285
286        while (len > 0) {
287            if (inf.needsInput()) {
288                // Remember the current position because we may need to
289                // rewind after reading too much input.
290                in.mark(buf.length);
291
292                bufUsed = in.read(buf);
293                if (bufUsed == -1) {
294                    throw new EOFException();
295                }
296
297                inf.setInput(buf, 0, bufUsed);
298            }
299
300            int ret;
301            try {
302                ret = inf.inflate(b, off, len);
303            } catch (final DataFormatException e) {
304                throw new IOException("Gzip-compressed data is corrupt");
305            }
306
307            crc.update(b, off, ret);
308            off += ret;
309            len -= ret;
310            size += ret;
311            count(ret);
312
313            if (inf.finished()) {
314                // We may have read too many bytes. Rewind the read
315                // position to match the actual amount used.
316                //
317                // NOTE: The "if" is there just in case. Since we used
318                // in.mark earlier, it should always skip enough.
319                in.reset();
320
321                final int skipAmount = bufUsed - inf.getRemaining();
322                if (in.skip(skipAmount) != skipAmount) {
323                    throw new IOException();
324                }
325
326                bufUsed = 0;
327
328                final DataInput inData = new DataInputStream(in);
329
330                // CRC32
331                final long crcStored = ByteUtils.fromLittleEndian(inData, 4);
332
333                if (crcStored != crc.getValue()) {
334                    throw new IOException("Gzip-compressed data is corrupt "
335                                          + "(CRC32 error)");
336                }
337
338                // Uncompressed size modulo 2^32 (ISIZE in the spec)
339                final long isize = ByteUtils.fromLittleEndian(inData, 4);
340
341                if (isize != (inf.getBytesWritten() & 0xffffffffL)) {
342                    throw new IOException("Gzip-compressed data is corrupt"
343                                          + "(uncompressed size mismatch)");
344                }
345
346                // See if this is the end of the file.
347                if (!decompressConcatenated || !init(false)) {
348                    inf.end();
349                    inf = null;
350                    endReached = true;
351                    return size == 0 ? -1 : size;
352                }
353            }
354        }
355
356        return size;
357    }
358
359    /**
360     * Checks if the signature matches what is expected for a .gz file.
361     *
362     * @param signature the bytes to check
363     * @param length    the number of bytes to check
364     * @return          true if this is a .gz stream, false otherwise
365     *
366     * @since 1.1
367     */
368    public static boolean matches(final byte[] signature, final int length) {
369        return length >= 2 && signature[0] == 31 && signature[1] == -117;
370    }
371
372    /**
373     * Closes the input stream (unless it is System.in).
374     *
375     * @since 1.2
376     */
377    @Override
378    public void close() throws IOException {
379        if (inf != null) {
380            inf.end();
381            inf = null;
382        }
383
384        if (this.in != System.in) {
385            this.in.close();
386        }
387    }
388}