001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing,
013 * software distributed under the License is distributed on an
014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015 * KIND, either express or implied.  See the License for the
016 * specific language governing permissions and limitations
017 * under the License.
018 */
019package org.apache.commons.compress.compressors.snappy;
020
021import java.io.IOException;
022import java.io.InputStream;
023
024import org.apache.commons.compress.compressors.lz77support.AbstractLZ77CompressorInputStream;
025import org.apache.commons.compress.utils.ByteUtils;
026
027/**
028 * CompressorInputStream for the raw Snappy format.
029 *
030 * <p>This implementation uses an internal buffer in order to handle
031 * the back-references that are at the heart of the LZ77 algorithm.
032 * The size of the buffer must be at least as big as the biggest
033 * offset used in the compressed stream.  The current version of the
034 * Snappy algorithm as defined by Google works on 32k blocks and
035 * doesn't contain offsets bigger than 32k which is the default block
036 * size used by this class.</p>
037 *
038 * @see <a href="https://github.com/google/snappy/blob/master/format_description.txt">Snappy compressed format description</a>
039 * @since 1.7
040 */
041public class SnappyCompressorInputStream extends AbstractLZ77CompressorInputStream {
042
043    /** Mask used to determine the type of "tag" is being processed */
044    private static final int TAG_MASK = 0x03;
045
046    /** Default block size */
047    public static final int DEFAULT_BLOCK_SIZE = 32768;
048
049    /** The size of the uncompressed data */
050    private final int size;
051
052    /** Number of uncompressed bytes still to be read. */
053    private int uncompressedBytesRemaining;
054
055    /** Current state of the stream */
056    private State state = State.NO_BLOCK;
057
058    private boolean endReached = false;
059
060    /**
061     * Constructor using the default buffer size of 32k.
062     *
063     * @param is
064     *            An InputStream to read compressed data from
065     *
066     * @throws IOException if reading fails
067     */
068    public SnappyCompressorInputStream(final InputStream is) throws IOException {
069        this(is, DEFAULT_BLOCK_SIZE);
070    }
071
072    /**
073     * Constructor using a configurable buffer size.
074     *
075     * @param is
076     *            An InputStream to read compressed data from
077     * @param blockSize
078     *            The block size used in compression
079     *
080     * @throws IOException if reading fails
081     * @throws IllegalArgumentException if blockSize is not bigger than 0
082     */
083    public SnappyCompressorInputStream(final InputStream is, final int blockSize)
084            throws IOException {
085        super(is, blockSize);
086        uncompressedBytesRemaining = size = (int) readSize();
087    }
088
089    /**
090     * {@inheritDoc}
091     */
092    @Override
093    public int read(final byte[] b, final int off, final int len) throws IOException {
094        if (len == 0) {
095            return 0;
096        }
097        if (endReached) {
098            return -1;
099        }
100        switch (state) {
101        case NO_BLOCK:
102            fill();
103            return read(b, off, len);
104        case IN_LITERAL:
105            int litLen = readLiteral(b, off, len);
106            if (!hasMoreDataInBlock()) {
107                state = State.NO_BLOCK;
108            }
109            return litLen > 0 ? litLen : read(b, off, len);
110        case IN_BACK_REFERENCE:
111            int backReferenceLen = readBackReference(b, off, len);
112            if (!hasMoreDataInBlock()) {
113                state = State.NO_BLOCK;
114            }
115            return backReferenceLen > 0 ? backReferenceLen : read(b, off, len);
116        default:
117            throw new IOException("Unknown stream state " + state);
118        }
119    }
120
121    /**
122     * Try to fill the buffer with the next block of data.
123     */
124    private void fill() throws IOException {
125        if (uncompressedBytesRemaining == 0) {
126            endReached = true;
127            return;
128        }
129
130        int b = readOneByte();
131        if (b == -1) {
132            throw new IOException("Premature end of stream reading block start");
133        }
134        int length = 0;
135        int offset = 0;
136
137        switch (b & TAG_MASK) {
138
139        case 0x00:
140
141            length = readLiteralLength(b);
142            if (length < 0) {
143                throw new IOException("Illegal block with a negative literal size found");
144            }
145            uncompressedBytesRemaining -= length;
146            startLiteral(length);
147            state = State.IN_LITERAL;
148            break;
149
150        case 0x01:
151
152            /*
153             * These elements can encode lengths between [4..11] bytes and
154             * offsets between [0..2047] bytes. (len-4) occupies three bits
155             * and is stored in bits [2..4] of the tag byte. The offset
156             * occupies 11 bits, of which the upper three are stored in the
157             * upper three bits ([5..7]) of the tag byte, and the lower
158             * eight are stored in a byte following the tag byte.
159             */
160
161            length = 4 + ((b >> 2) & 0x07);
162            if (length < 0) {
163                throw new IOException("Illegal block with a negative match length found");
164            }
165            uncompressedBytesRemaining -= length;
166            offset = (b & 0xE0) << 3;
167            b = readOneByte();
168            if (b == -1) {
169                throw new IOException("Premature end of stream reading back-reference length");
170            }
171            offset |= b;
172
173            try {
174                startBackReference(offset, length);
175            } catch (IllegalArgumentException ex) {
176                throw new IOException("Illegal block with bad offset found", ex);
177            }
178            state = State.IN_BACK_REFERENCE;
179            break;
180
181        case 0x02:
182
183            /*
184             * These elements can encode lengths between [1..64] and offsets
185             * from [0..65535]. (len-1) occupies six bits and is stored in
186             * the upper six bits ([2..7]) of the tag byte. The offset is
187             * stored as a little-endian 16-bit integer in the two bytes
188             * following the tag byte.
189             */
190
191            length = (b >> 2) + 1;
192            if (length < 0) {
193                throw new IOException("Illegal block with a negative match length found");
194            }
195            uncompressedBytesRemaining -= length;
196
197            offset = (int) ByteUtils.fromLittleEndian(supplier, 2);
198
199            try {
200                startBackReference(offset, length);
201            } catch (IllegalArgumentException ex) {
202                throw new IOException("Illegal block with bad offset found", ex);
203            }
204            state = State.IN_BACK_REFERENCE;
205            break;
206
207        case 0x03:
208
209            /*
210             * These are like the copies with 2-byte offsets (see previous
211             * subsection), except that the offset is stored as a 32-bit
212             * integer instead of a 16-bit integer (and thus will occupy
213             * four bytes).
214             */
215
216            length = (b >> 2) + 1;
217            if (length < 0) {
218                throw new IOException("Illegal block with a negative match length found");
219            }
220            uncompressedBytesRemaining -= length;
221
222            offset = (int) ByteUtils.fromLittleEndian(supplier, 4) & 0x7fffffff;
223
224            try {
225                startBackReference(offset, length);
226            } catch (IllegalArgumentException ex) {
227                throw new IOException("Illegal block with bad offset found", ex);
228            }
229            state = State.IN_BACK_REFERENCE;
230            break;
231        default:
232            // impossible as TAG_MASK is two bits and all four possible cases have been covered
233            break;
234        }
235    }
236
237    /*
238     * For literals up to and including 60 bytes in length, the
239     * upper six bits of the tag byte contain (len-1). The literal
240     * follows immediately thereafter in the bytestream. - For
241     * longer literals, the (len-1) value is stored after the tag
242     * byte, little-endian. The upper six bits of the tag byte
243     * describe how many bytes are used for the length; 60, 61, 62
244     * or 63 for 1-4 bytes, respectively. The literal itself follows
245     * after the length.
246     */
247    private int readLiteralLength(final int b) throws IOException {
248        int length;
249        switch (b >> 2) {
250        case 60:
251            length = readOneByte();
252            if (length == -1) {
253                throw new IOException("Premature end of stream reading literal length");
254            }
255            break;
256        case 61:
257            length = (int) ByteUtils.fromLittleEndian(supplier, 2);
258            break;
259        case 62:
260            length = (int) ByteUtils.fromLittleEndian(supplier, 3);
261            break;
262        case 63:
263            length = (int) ByteUtils.fromLittleEndian(supplier, 4);
264            break;
265        default:
266            length = b >> 2;
267            break;
268        }
269
270        return length + 1;
271    }
272
273    /**
274     * The stream starts with the uncompressed length (up to a maximum of 2^32 -
275     * 1), stored as a little-endian varint. Varints consist of a series of
276     * bytes, where the lower 7 bits are data and the upper bit is set iff there
277     * are more bytes to be read. In other words, an uncompressed length of 64
278     * would be stored as 0x40, and an uncompressed length of 2097150 (0x1FFFFE)
279     * would be stored as 0xFE 0xFF 0x7F.
280     *
281     * @return The size of the uncompressed data
282     *
283     * @throws IOException
284     *             Could not read a byte
285     */
286    private long readSize() throws IOException {
287        int index = 0;
288        long sz = 0;
289        int b = 0;
290
291        do {
292            b = readOneByte();
293            if (b == -1) {
294                throw new IOException("Premature end of stream reading size");
295            }
296            sz |= (b & 0x7f) << (index++ * 7);
297        } while (0 != (b & 0x80));
298        return sz;
299    }
300
301    /**
302     * Get the uncompressed size of the stream
303     *
304     * @return the uncompressed size
305     */
306    @Override
307    public int getSize() {
308        return size;
309    }
310
311    private enum State {
312        NO_BLOCK, IN_LITERAL, IN_BACK_REFERENCE
313    }
314}