001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing,
013 * software distributed under the License is distributed on an
014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015 * KIND, either express or implied.  See the License for the
016 * specific language governing permissions and limitations
017 * under the License.
018 */
019package org.apache.commons.compress.compressors.lz77support;
020
021import java.io.IOException;
022import java.io.InputStream;
023import java.util.Arrays;
024
025import org.apache.commons.compress.compressors.CompressorInputStream;
026import org.apache.commons.compress.utils.ByteUtils;
027import org.apache.commons.compress.utils.CountingInputStream;
028import org.apache.commons.compress.utils.IOUtils;
029import org.apache.commons.compress.utils.InputStreamStatistics;
030
031/**
032 * Encapsulates code common to LZ77 decompressors.
033 *
034 * <p>Assumes the stream consists of blocks of literal data and
035 * back-references (called copies) in any order. Of course the first
036 * block must be a literal block for the scheme to work - unless the
037 * {@link #prefill prefill} method has been used to provide initial
038 * data that is never returned by {@link #read read} but only used for
039 * back-references.</p>
040 *
041 * <p>Subclasses must override the three-arg {@link #read read} method
042 * as the no-arg version delegates to it and the default
043 * implementation delegates to the no-arg version, leading to infinite
044 * mutual recursion and a {@code StackOverflowError} otherwise.</p>
045 *
046 * <p>The contract for subclasses' {@code read} implementation is:</p>
047 * <ul>
048 *
049 *  <li>keep track of the current state of the stream. Is it inside a
050 *  literal block or a back-reference or in-between blocks?</li>
051 *
052 *  <li>Use {@link #readOneByte} to access the underlying stream
053 *  directly.</li>
054 *
055 *  <li>If a new literal block starts, use {@link #startLiteral} to
056 *  tell this class about it and read the literal data using {@link
057 *  #readLiteral} until it returns {@code 0}. {@link
058 *  #hasMoreDataInBlock} will return {@code false} before the next
059 *  call to {@link #readLiteral} would return {@code 0}.</li>
060 *
061 *  <li>If a new back-reference starts, use {@link #startBackReference} to
 *  tell this class about it and read the back-reference data using {@link
063 *  #readBackReference} until it returns {@code 0}. {@link
064 *  #hasMoreDataInBlock} will return {@code false} before the next
065 *  call to {@link #readBackReference} would return {@code 0}.</li>
066 *
067 *  <li>If the end of the stream has been reached, return {@code -1}
068 *  as this class' methods will never do so themselves.</li>
069 *
070 * </ul>
071 *
072 * <p>{@link #readOneByte} and {@link #readLiteral} update the counter
073 * for bytes read.</p>
074 *
075 * @since 1.14
076 */
077public abstract class AbstractLZ77CompressorInputStream extends CompressorInputStream
078    implements InputStreamStatistics {
079
080    /** Size of the window - must be bigger than the biggest offset expected. */
081    private final int windowSize;
082
083    /**
084     * Buffer to write decompressed bytes to for back-references, will
085     * be three times windowSize big.
086     *
087     * <p>Three times so we can slide the whole buffer a windowSize to
088     * the left once we've read twice windowSize and still have enough
089     * data inside of it to satisfy back-references.</p>
090     */
091    private final byte[] buf;
092
093    /** One behind the index of the last byte in the buffer that was written, i.e. the next position to write to */
094    private int writeIndex;
095
096    /** Index of the next byte to be read. */
097    private int readIndex;
098
099    /** The underlying stream to read compressed data from */
100    private final CountingInputStream in;
101
102    /** Number of bytes still to be read from the current literal or back-reference. */
103    private long bytesRemaining;
104
105    /** Offset of the current back-reference. */
106    private int backReferenceOffset;
107
108    /** uncompressed size */
109    private int size = 0;
110
111    // used in no-arg read method
112    private final byte[] oneByte = new byte[1];
113
114    /**
115     * Supplier that delegates to {@link #readOneByte}.
116     */
117    protected final ByteUtils.ByteSupplier supplier = new ByteUtils.ByteSupplier() {
118        @Override
119        public int getAsByte() throws IOException {
120            return readOneByte();
121        }
122    };
123
124    /**
125     * Creates a new LZ77 input stream.
126     *
127     * @param is
128     *            An InputStream to read compressed data from
129     * @param windowSize
130     *            Size of the window kept for back-references, must be bigger than the biggest offset expected.
131     *
132     * @throws IOException if reading fails
133     * @throws IllegalArgumentException if windowSize is not bigger than 0
134     */
135    public AbstractLZ77CompressorInputStream(final InputStream is, int windowSize) throws IOException {
136        this.in = new CountingInputStream(is);
137        if (windowSize <= 0) {
138            throw new IllegalArgumentException("windowSize must be bigger than 0");
139        }
140        this.windowSize = windowSize;
141        buf = new byte[3 * windowSize];
142        writeIndex = readIndex = 0;
143        bytesRemaining = 0;
144    }
145
146    /** {@inheritDoc} */
147    @Override
148    public int read() throws IOException {
149        return read(oneByte, 0, 1) == -1 ? -1 : oneByte[0] & 0xFF;
150    }
151
152    /** {@inheritDoc} */
153    @Override
154    public void close() throws IOException {
155        in.close();
156    }
157
158    /** {@inheritDoc} */
159    @Override
160    public int available() {
161        return writeIndex - readIndex;
162    }
163
164    /**
165     * Get the uncompressed size of the stream
166     *
167     * @return the uncompressed size
168     */
169    public int getSize() {
170        return size;
171    }
172
173    /**
174     * Adds some initial data to fill the window with.
175     *
176     * <p>This is used if the stream has been cut into blocks and
177     * back-references of one block may refer to data of the previous
178     * block(s). One such example is the LZ4 frame format using block
179     * dependency.</p>
180     *
181     * @param data the data to fill the window with.
182     * @throws IllegalStateException if the stream has already started to read data
183     */
184    public void prefill(byte[] data) {
185        if (writeIndex != 0) {
186            throw new IllegalStateException("The stream has already been read from, can't prefill anymore");
187        }
188        // we don't need more data than the big offset could refer to, so cap it
189        int len = Math.min(windowSize, data.length);
190        // we need the last data as we are dealing with *back*-references
191        System.arraycopy(data, data.length - len, buf, 0, len);
192        writeIndex += len;
193        readIndex += len;
194    }
195
196    /**
197     * @since 1.17
198     */
199    @Override
200    public long getCompressedCount() {
201        return in.getBytesRead();
202    }
203
204    /**
205     * Used by subclasses to signal the next block contains the given
206     * amount of literal data.
207     * @param length the length of the block
208     * @throws IllegalArgumentException if length is negative
209     */
210    protected final void startLiteral(long length) {
211        if (length < 0) {
212            throw new IllegalArgumentException("length must not be negative");
213        }
214        bytesRemaining = length;
215    }
216
217    /**
218     * Is there still data remaining inside the current block?
219     * @return true if there is still data remaining inside the current block.
220     */
221    protected final boolean hasMoreDataInBlock() {
222        return bytesRemaining > 0;
223    }
224
225    /**
226     * Reads data from the current literal block.
227     * @param b buffer to write data to
228     * @param off offset to start writing to
229     * @param len maximum amount of data to read
230     * @return number of bytes read, may be 0. Will never return -1 as
231     * EOF-detection is the responsibility of the subclass
232     * @throws IOException if the underlying stream throws or signals
233     * an EOF before the amount of data promised for the block have
234     * been read
235     * @throws NullPointerException if <code>b</code> is null
236     * @throws IndexOutOfBoundsException if <code>off</code> is
237     * negative, <code>len</code> is negative, or <code>len</code> is
238     * greater than <code>b.length - off</code>
239     */
240    protected final int readLiteral(final byte[] b, final int off, final int len) throws IOException {
241        final int avail = available();
242        if (len > avail) {
243            tryToReadLiteral(len - avail);
244        }
245        return readFromBuffer(b, off, len);
246    }
247
248    private void tryToReadLiteral(int bytesToRead) throws IOException {
249        // min of "what is still inside the literal", "what does the user want" and "how much can fit into the buffer"
250        final int reallyTryToRead = Math.min((int) Math.min(bytesToRead, bytesRemaining),
251                                             buf.length - writeIndex);
252        final int bytesRead = reallyTryToRead > 0
253            ? IOUtils.readFully(in, buf, writeIndex, reallyTryToRead)
254            : 0 /* happens for bytesRemaining == 0 */;
255        count(bytesRead);
256        if (reallyTryToRead != bytesRead) {
257            throw new IOException("Premature end of stream reading literal");
258        }
259        writeIndex += reallyTryToRead;
260        bytesRemaining -= reallyTryToRead;
261    }
262
263    private int readFromBuffer(final byte[] b, final int off, final int len) {
264        final int readable = Math.min(len, available());
265        if (readable > 0) {
266            System.arraycopy(buf, readIndex, b, off, readable);
267            readIndex += readable;
268            if (readIndex > 2 * windowSize) {
269                slideBuffer();
270            }
271        }
272        size += readable;
273        return readable;
274    }
275
276    private void slideBuffer() {
277        System.arraycopy(buf, windowSize, buf, 0, windowSize * 2);
278        writeIndex -= windowSize;
279        readIndex -= windowSize;
280    }
281
282    /**
283     * Used by subclasses to signal the next block contains a back-reference with the given coordinates.
284     * @param offset the offset of the back-reference
285     * @param length the length of the back-reference
286     * @throws IllegalArgumentException if offset not bigger than 0 or
287     * bigger than the number of bytes available for back-references
288     * or if length is negative
289     */
290    protected final void startBackReference(int offset, long length) {
291        if (offset <= 0 || offset > writeIndex) {
292            throw new IllegalArgumentException("offset must be bigger than 0 but not bigger than the number"
293                + " of bytes available for back-references");
294        }
295        if (length < 0) {
296            throw new IllegalArgumentException("length must not be negative");
297        }
298        backReferenceOffset = offset;
299        bytesRemaining = length;
300    }
301
302    /**
303     * Reads data from the current back-reference.
304     * @param b buffer to write data to
305     * @param off offset to start writing to
306     * @param len maximum amount of data to read
307     * @return number of bytes read, may be 0. Will never return -1 as
308     * EOF-detection is the responsibility of the subclass
309     * @throws NullPointerException if <code>b</code> is null
310     * @throws IndexOutOfBoundsException if <code>off</code> is
311     * negative, <code>len</code> is negative, or <code>len</code> is
312     * greater than <code>b.length - off</code>
313     */
314    protected final int readBackReference(final byte[] b, final int off, final int len) {
315        final int avail = available();
316        if (len > avail) {
317            tryToCopy(len - avail);
318        }
319        return readFromBuffer(b, off, len);
320    }
321
322    private void tryToCopy(int bytesToCopy) {
323        // this will fit into the buffer without sliding and not
324        // require more than is available inside the back-reference
325        int copy = Math.min((int) Math.min(bytesToCopy, bytesRemaining),
326                            buf.length - writeIndex);
327        if (copy == 0) {
328            // NOP
329        } else if (backReferenceOffset == 1) { // pretty common special case
330            final byte last = buf[writeIndex - 1];
331            Arrays.fill(buf, writeIndex, writeIndex + copy, last);
332            writeIndex += copy;
333        } else if (copy < backReferenceOffset) {
334            System.arraycopy(buf, writeIndex - backReferenceOffset, buf, writeIndex, copy);
335            writeIndex += copy;
336        } else {
337            // back-reference overlaps with the bytes created from it
338            // like go back two bytes and then copy six (by copying
339            // the last two bytes three time).
340            final int fullRots = copy / backReferenceOffset;
341            for (int i = 0; i < fullRots; i++) {
342                System.arraycopy(buf, writeIndex - backReferenceOffset, buf, writeIndex, backReferenceOffset);
343                writeIndex += backReferenceOffset;
344            }
345
346            final int pad = copy - (backReferenceOffset * fullRots);
347            if (pad > 0) {
348                System.arraycopy(buf, writeIndex - backReferenceOffset, buf, writeIndex, pad);
349                writeIndex += pad;
350            }
351        }
352        bytesRemaining -= copy;
353    }
354
355    /**
356     * Reads a single byte from the real input stream and ensures the data is accounted for.
357     *
358     * @return the byte read as value between 0 and 255 or -1 if EOF has been reached.
359     * @throws IOException if the underlying stream throws
360     */
361    protected final int readOneByte() throws IOException {
362        final int b = in.read();
363        if (b != -1) {
364            count(1);
365            return b & 0xFF;
366        }
367        return -1;
368    }
369}