001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing,
013 * software distributed under the License is distributed on an
014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015 * KIND, either express or implied.  See the License for the
016 * specific language governing permissions and limitations
017 * under the License.
018 */
019package org.apache.commons.compress.compressors.lz77support;
020
021import java.io.IOException;
022import java.util.Arrays;
023import java.util.Objects;
024
025/**
026 * Helper class for compression algorithms that use the ideas of LZ77.
027 *
 * <p>Most LZ77 derived algorithms split input data into blocks of
 * uncompressed data (called literal blocks) and back-references
 * (pairs of offsets and lengths) that state "add <code>length</code>
 * bytes that are the same as those already written starting
 * <code>offset</code> bytes before the current position". The details
 * of how those blocks and back-references are encoded are quite
 * different between the algorithms and some algorithms perform
 * additional steps (Huffman encoding in the case of DEFLATE for
 * example).</p>
037 *
 * <p>This class attempts to extract the core logic - finding
 * back-references - so it can be re-used. It follows the algorithm
 * explained in section 4 of RFC 1951 (DEFLATE) and applies the
 * "lazy match" optimization only if the parameters enable it. The
 * three-byte hash function used in this class is the same as the one
 * used by zlib and InfoZIP's ZIP implementation of DEFLATE. The whole
 * class is strongly inspired by InfoZIP's implementation.</p>
045 *
046 * <p>LZ77 is used vaguely here (as well as many other places that
047 * talk about it :-), LZSS would likely be closer to the truth but
048 * LZ77 has become the synonym for a whole family of algorithms.</p>
049 *
 * <p>The API consists of a compressor that is fed <code>byte</code>s
 * and emits {@link Block}s to a registered callback where the blocks
 * represent either {@link LiteralBlock literal blocks}, {@link
 * BackReference back-references} or {@link EOD end of data
 * markers}. In order to ensure the callback receives all information,
 * the {@link #finish} method must be used once all data has been fed
 * into the compressor.</p>
057 *
058 * <p>Several parameters influence the outcome of the "compression":</p>
059 * <dl>
060 *
 *  <dt><code>windowSize</code></dt> <dd>the size of the sliding
 *  window, must be a power of two - this determines the maximum
 *  offset a back-reference can take. The compressor maintains a
 *  buffer twice the size of <code>windowSize</code> - real world
 *  values are in the area of 32k.</dd>
066 *
 *  <dt><code>minBackReferenceLength</code></dt>
 *  <dd>Minimal length of a back-reference found. A true minimum of 3 is
 *  hard-coded inside of this implementation but bigger lengths can be
 *  configured.</dd>
071 *
072 *  <dt><code>maxBackReferenceLength</code></dt>
073 *  <dd>Maximal length of a back-reference found.</dd>
074 *
075 *  <dt><code>maxOffset</code></dt>
076 *  <dd>Maximal offset of a back-reference.</dd>
077 *
078 *  <dt><code>maxLiteralLength</code></dt>
079 *  <dd>Maximal length of a literal block.</dd>
080 * </dl>
081 *
082 * @see "https://tools.ietf.org/html/rfc1951#section-4"
083 * @since 1.14
084 * @NotThreadSafe
085 */
086public class LZ77Compressor {
087
088    /**
089     * Base class representing blocks the compressor may emit.
090     *
091     * <p>This class is not supposed to be subclassed by classes
092     * outside of Commons Compress so it is considered internal and
093     * changed that would break subclasses may get introduced with
094     * future releases.</p>
095     */
096    public static abstract class Block {
097        /** Enumeration of the block types the compressor may emit. */
098        public enum BlockType {
099            LITERAL, BACK_REFERENCE, EOD
100        }
101        public abstract BlockType getType();
102    }
103
104    /**
105     * Represents a literal block of data.
106     *
107     * <p>For performance reasons this encapsulates the real data, not
108     * a copy of it. Don't modify the data and process it inside of
109     * {@link Callback#accept} immediately as it will get overwritten
110     * sooner or later.</p>
111     */
112    public static final class LiteralBlock extends Block {
113        private final byte[] data;
114        private final int offset, length;
115        public LiteralBlock(byte[] data, int offset, int length) {
116            this.data = data;
117            this.offset = offset;
118            this.length = length;
119        }
120        /**
121         * The literal data.
122         *
123         * <p>This returns a life view of the actual data in order to
124         * avoid copying, modify the array at your own risk.</p>
125         * @return the data
126         */
127        public byte[] getData() {
128            return data;
129        }
130        /**
131         * Offset into data where the literal block starts.
132         * @return the offset
133         */
134        public int getOffset() {
135            return offset;
136        }
137        /**
138         * Length of literal block.
139         * @return the length
140         */
141        public int getLength() {
142            return length;
143        }
144        @Override
145        public BlockType getType() {
146            return BlockType.LITERAL;
147        }
148        @Override
149        public String toString() {
150            return "LiteralBlock starting at " + offset + " with length " + length;
151        }
152    }
153
154    /**
155     * Represents a back-reference.
156     */
157    public static final class BackReference extends Block {
158        private final int offset, length;
159        public BackReference(int offset, int length) {
160            this.offset = offset;
161            this.length = length;
162        }
163        /**
164         * Provides the offset of the back-reference.
165         * @return the offset
166         */
167        public int getOffset() {
168            return offset;
169        }
170        /**
171         * Provides the length of the back-reference.
172         * @return the length
173         */
174        public int getLength() {
175            return length;
176        }
177        @Override
178        public BlockType getType() {
179            return BlockType.BACK_REFERENCE;
180        }
181        @Override
182        public String toString() {
183            return "BackReference with offset " + offset + " and length " + length;
184        }
185    }
186
187    /** A simple "we are done" marker. */
188    public static final class EOD extends Block {
189        @Override
190        public BlockType getType() {
191            return BlockType.EOD;
192        }
193    }
194
    // the one end-of-data marker instance, handed to the callback by finish()
    private static final Block THE_EOD = new EOD();
196
    /**
     * Callback invoked while the compressor processes data.
     *
     * <p>The callback is invoked on the same thread that receives the
     * bytes to compress and may be invoked multiple times during the
     * execution of {@link LZ77Compressor#compress(byte[], int, int)
     * compress} or {@link LZ77Compressor#finish finish}.</p>
     */
    public interface Callback {
        /**
         * Consumes a block.
         *
         * <p>An exception thrown here is not caught by the compressor
         * and surfaces from the method that triggered the
         * callback.</p>
         * @param b the block to consume
         * @throws IOException in case of an error
         */
        void accept(Block b) throws IOException;
    }
212
    // number of consecutive bytes that make up one hashed string
    static final int NUMBER_OF_BYTES_IN_HASH = 3;
    // marker stored in head/prev/matchStart when there is no position
    private static final int NO_MATCH = -1;

    // the configuration this compressor has been created with
    private final Parameters params;
    // receiver of all blocks emitted by this compressor
    private final Callback callback;

    // the sliding window, twice as big as "windowSize" parameter
    private final byte[] window;
    // the head of hash-chain - indexed by hash-code, points to the
    // location inside of window of the latest sequence of bytes with
    // the given hash.
    private final int[] head;
    // for each window-location points to the latest earlier location
    // with the same hash. Only stores values for the latest
    // "windowSize" elements, the index is "window location modulo
    // windowSize".
    private final int[] prev;

    // bit mask used when indexing into prev
    private final int wMask;

    // whether initialize() has seeded insertHash already
    private boolean initialized = false;
    // the position inside of window that shall be encoded right now
    private int currentPosition;
    // the number of bytes available to compress including the one at
    // currentPosition
    private int lookahead = 0;
    // the hash of the three bytes starting at the current position
    private int insertHash = 0;
    // the position inside of the window where the current literal
    // block starts (in case we are inside of a literal block).
    private int blockStart = 0;
    // position of the current match
    private int matchStart = NO_MATCH;
    // number of missed insertString calls for the up to three last
    // bytes of the last match that can only be performed once more
    // data has been read
    private int missedInserts = 0;
251
252    /**
253     * Initializes a compressor with parameters and a callback.
254     * @param params the parameters
255     * @param callback the callback
256     * @throws NullPointerException if either parameter is <code>null</code>
257     */
258    public LZ77Compressor(Parameters params, Callback callback) {
259        Objects.requireNonNull(params, "params");
260        Objects.requireNonNull(callback, "callback");
261        
262        this.params = params;
263        this.callback = callback;
264
265        final int wSize = params.getWindowSize();
266        window = new byte[wSize * 2];
267        wMask = wSize - 1;
268        head = new int[HASH_SIZE];
269        Arrays.fill(head, NO_MATCH);
270        prev = new int[wSize];
271    }
272
273    /**
274     * Feeds bytes into the compressor which in turn may emit zero or
275     * more blocks to the callback during the execution of this
276     * method.
277     * @param data the data to compress - must not be null
278     * @throws IOException if the callback throws an exception
279     */
280    public void compress(byte[] data) throws IOException {
281        compress(data, 0, data.length);
282    }
283
284    /**
285     * Feeds bytes into the compressor which in turn may emit zero or
286     * more blocks to the callback during the execution of this
287     * method.
288     * @param data the data to compress - must not be null
289     * @param off the start offset of the data
290     * @param len the number of bytes to compress
291     * @throws IOException if the callback throws an exception
292     */
293    public void compress(byte[] data, int off, int len) throws IOException {
294        final int wSize = params.getWindowSize();
295        while (len > wSize) { // chop into windowSize sized chunks
296            doCompress(data, off, wSize);
297            off += wSize;
298            len -= wSize;
299        }
300        if (len > 0) {
301            doCompress(data, off, len);
302        }
303    }
304
305    /**
306     * Tells the compressor to process all remaining data and signal
307     * end of data to the callback.
308     *
309     * <p>The compressor will in turn emit at least one block ({@link
310     * EOD}) but potentially multiple blocks to the callback during
311     * the execution of this method.</p>
312     * @throws IOException if the callback throws an exception
313     */
314    public void finish() throws IOException {
315        if (blockStart != currentPosition || lookahead > 0) {
316            currentPosition += lookahead;
317            flushLiteralBlock();
318        }
319        callback.accept(THE_EOD);
320    }
321
322    /**
323     * Adds some initial data to fill the window with.
324     *
325     * <p>This is used if the stream has been cut into blocks and
326     * back-references of one block may refer to data of the previous
327     * block(s). One such example is the LZ4 frame format using block
328     * dependency.</p>
329     *
330     * @param data the data to fill the window with.
331     * @throws IllegalStateException if the compressor has already started to accept data
332     */
333    public void prefill(byte[] data) {
334        if (currentPosition != 0 || lookahead != 0) {
335            throw new IllegalStateException("The compressor has already started to accept data, can't prefill anymore");
336        }
337
338        // don't need more than windowSize for back-references
339        final int len = Math.min(params.getWindowSize(), data.length);
340        System.arraycopy(data, data.length - len, window, 0, len);
341
342        if (len >= NUMBER_OF_BYTES_IN_HASH) {
343            initialize();
344            final int stop = len - NUMBER_OF_BYTES_IN_HASH + 1;
345            for (int i = 0; i < stop; i++) {
346                insertString(i);
347            }
348            missedInserts = NUMBER_OF_BYTES_IN_HASH - 1;
349        } else { // not enough data to hash anything
350            missedInserts = len;
351        }
352        blockStart = currentPosition = len;
353    }
354
    // we use a 15 bit hashcode as calculated in nextHash
    // number of distinct hash values, also the length of the head array
    private static final int HASH_SIZE = 1 << 15;
    // keeps the hash inside of the range [0, HASH_SIZE)
    private static final int HASH_MASK = HASH_SIZE - 1;
    // number of bits the old hash is shifted by for every new byte
    private static final int H_SHIFT = 5;
359
360    /**
361     * Assumes we are calculating the hash for three consecutive bytes
362     * as a rolling hash, i.e. for bytes ABCD if H is the hash of ABC
363     * the new hash for BCD is nextHash(H, D).
364     *
365     * <p>The hash is shifted by five bits on each update so all
366     * effects of A have been swapped after the third update.</p>
367     */
368    private int nextHash(int oldHash, byte nextByte) {
369        final int nextVal = nextByte & 0xFF;
370        return ((oldHash << H_SHIFT) ^ nextVal) & HASH_MASK;
371    }
372
373    // performs the actual algorithm with the pre-condition len <= windowSize
374    private void doCompress(byte[] data, int off, int len) throws IOException {
375        int spaceLeft = window.length - currentPosition - lookahead;
376        if (len > spaceLeft) {
377            slide();
378        }
379        System.arraycopy(data, off, window, currentPosition + lookahead, len);
380        lookahead += len;
381        if (!initialized && lookahead >= params.getMinBackReferenceLength()) {
382            initialize();
383        }
384        if (initialized) {
385            compress();
386        }
387    }
388
389    private void slide() throws IOException {
390        final int wSize = params.getWindowSize();
391        if (blockStart != currentPosition && blockStart < wSize) {
392            flushLiteralBlock();
393            blockStart = currentPosition;
394        }
395        System.arraycopy(window, wSize, window, 0, wSize);
396        currentPosition -= wSize;
397        matchStart -= wSize;
398        blockStart -= wSize;
399        for (int i = 0; i < HASH_SIZE; i++) {
400            int h = head[i];
401            head[i] = h >= wSize ? h - wSize : NO_MATCH;
402        }
403        for (int i = 0; i < wSize; i++) {
404            int p = prev[i];
405            prev[i] = p >= wSize ? p - wSize : NO_MATCH;
406        }
407    }
408
409    private void initialize() {
410        for (int i = 0; i < NUMBER_OF_BYTES_IN_HASH - 1; i++) {
411            insertHash = nextHash(insertHash, window[i]);
412        }
413        initialized = true;
414    }
415
416    private void compress() throws IOException {
417        final int minMatch = params.getMinBackReferenceLength();
418        final boolean lazy = params.getLazyMatching();
419        final int lazyThreshold = params.getLazyMatchingThreshold();
420
421        while (lookahead >= minMatch) {
422            catchUpMissedInserts();
423            int matchLength = 0;
424            int hashHead = insertString(currentPosition);
425            if (hashHead != NO_MATCH && hashHead - currentPosition <= params.getMaxOffset()) {
426                // sets matchStart as a side effect
427                matchLength = longestMatch(hashHead);
428
429                if (lazy && matchLength <= lazyThreshold && lookahead > minMatch) {
430                    // try to find a longer match using the next position
431                    matchLength = longestMatchForNextPosition(matchLength);
432                }
433            }
434            if (matchLength >= minMatch) {
435                if (blockStart != currentPosition) {
436                    // emit preceeding literal block
437                    flushLiteralBlock();
438                    blockStart = NO_MATCH;
439                }
440                flushBackReference(matchLength);
441                insertStringsInMatch(matchLength);
442                lookahead -= matchLength;
443                currentPosition += matchLength;
444                blockStart = currentPosition;
445            } else {
446                // no match, append to current or start a new literal
447                lookahead--;
448                currentPosition++;
449                if (currentPosition - blockStart >= params.getMaxLiteralLength()) {
450                    flushLiteralBlock();
451                    blockStart = currentPosition;
452                }
453            }
454        }
455    }
456
457    /**
458     * Inserts the current three byte sequence into the dictionary and
459     * returns the previous head of the hash-chain.
460     *
461     * <p>Updates <code>insertHash</code> and <code>prev</code> as a
462     * side effect.</p>
463     */
464    private int insertString(int pos) {
465        insertHash = nextHash(insertHash, window[pos - 1 + NUMBER_OF_BYTES_IN_HASH]);
466        int hashHead = head[insertHash];
467        prev[pos & wMask] = hashHead;
468        head[insertHash] = pos;
469        return hashHead;
470    }
471
472    private int longestMatchForNextPosition(final int prevMatchLength) {
473        // save a bunch of values to restore them if the next match isn't better than the current one
474        final int prevMatchStart = matchStart;
475        final int prevInsertHash = insertHash;
476
477        lookahead--;
478        currentPosition++;
479        int hashHead = insertString(currentPosition);
480        final int prevHashHead = prev[currentPosition & wMask];
481        int matchLength = longestMatch(hashHead);
482
483        if (matchLength <= prevMatchLength) {
484            // use the first match, as the next one isn't any better
485            matchLength = prevMatchLength;
486            matchStart = prevMatchStart;
487
488            // restore modified values
489            head[insertHash] = prevHashHead;
490            insertHash = prevInsertHash;
491            currentPosition--;
492            lookahead++;
493        }
494        return matchLength;
495    }
496
497    private void insertStringsInMatch(int matchLength) {
498        // inserts strings contained in current match
499        // insertString inserts the byte 2 bytes after position, which may not yet be available -> missedInserts
500        final int stop = Math.min(matchLength - 1, lookahead - NUMBER_OF_BYTES_IN_HASH);
501        // currentPosition has been inserted already
502        for (int i = 1; i <= stop; i++) {
503            insertString(currentPosition + i);
504        }
505        missedInserts = matchLength - stop - 1;
506    }
507
508    private void catchUpMissedInserts() {
509        while (missedInserts > 0) {
510            insertString(currentPosition - missedInserts--);
511        }
512    }
513
514    private void flushBackReference(int matchLength) throws IOException {
515        callback.accept(new BackReference(currentPosition - matchStart, matchLength));
516    }
517
518    private void flushLiteralBlock() throws IOException {
519        callback.accept(new LiteralBlock(window, blockStart, currentPosition - blockStart));
520    }
521
522    /**
523     * Searches the hash chain for real matches and returns the length
524     * of the longest match (0 if none were found) that isn't too far
525     * away (WRT maxOffset).
526     *
527     * <p>Sets matchStart to the index of the start position of the
528     * longest match as a side effect.</p>
529     */
530    private int longestMatch(int matchHead) {
531        final int minLength = params.getMinBackReferenceLength();
532        int longestMatchLength = minLength - 1;
533        final int maxPossibleLength = Math.min(params.getMaxBackReferenceLength(), lookahead);
534        final int minIndex = Math.max(0, currentPosition - params.getMaxOffset());
535        final int niceBackReferenceLength = Math.min(maxPossibleLength, params.getNiceBackReferenceLength());
536        final int maxCandidates = params.getMaxCandidates();
537        for (int candidates = 0; candidates < maxCandidates && matchHead >= minIndex; candidates++) {
538            int currentLength = 0;
539            for (int i = 0; i < maxPossibleLength; i++) {
540                if (window[matchHead + i] != window[currentPosition + i]) {
541                    break;
542                }
543                currentLength++;
544            }
545            if (currentLength > longestMatchLength) {
546                longestMatchLength = currentLength;
547                matchStart = matchHead;
548                if (currentLength >= niceBackReferenceLength) {
549                    // no need to search any further
550                    break;
551                }
552            }
553            matchHead = prev[matchHead & wMask];
554        }
555        return longestMatchLength; // < minLength if no matches have been found, will be ignored in compress()
556    }
557}