001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, 013 * software distributed under the License is distributed on an 014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 015 * KIND, either express or implied. See the License for the 016 * specific language governing permissions and limitations 017 * under the License. 018 */ 019package org.apache.commons.compress.compressors.lz77support; 020 021import java.io.IOException; 022import java.util.Arrays; 023import java.util.Objects; 024 025/** 026 * Helper class for compression algorithms that use the ideas of LZ77. 027 * 028 * <p>Most LZ77 derived algorithms split input data into blocks of 029 * uncompressed data (called literal blocks) and back-references 030 * (pairs of offsets and lengths) that state "add <code>length</code> 031 * bytes that are the same as those already written starting 032 * <code>offset</code> bytes before the current position. The details 033 * of how those blocks and back-references are encoded are quite 034 * different between the algorithms and some algorithms perform 035 * additional steps (Huffman encoding in the case of DEFLATE for 036 * example).</p> 037 * 038 * <p>This class attempts to extract the core logic - finding 039 * back-references - so it can be re-used. It follows the algorithm 040 * explained in section 4 of RFC 1951 (DEFLATE) and currently doesn't 041 * implement the "lazy match" optimization. The three-byte hash 042 * function used in this class is the same as the one used by zlib and 043 * InfoZIP's ZIP implementation of DEFLATE. The whole class is 044 * strongly inspired by InfoZIP's implementation.</p> 045 * 046 * <p>LZ77 is used vaguely here (as well as many other places that 047 * talk about it :-), LZSS would likely be closer to the truth but 048 * LZ77 has become the synonym for a whole family of algorithms.</p> 049 * 050 * <p>The API consists of a compressor that is fed <code>byte</code>s 051 * and emits {@link Block}s to a registered callback where the blocks 052 * represent either {@link LiteralBlock literal blocks}, {@link 053 * BackReference back-references} or {@link EOD end of data 054 * markers}. In order to ensure the callback receives all information, 055 * the {@code #finish} method must be used once all data has been fed 056 * into the compressor.</p> 057 * 058 * <p>Several parameters influence the outcome of the "compression":</p> 059 * <dl> 060 * 061 * <dt><code>windowSize</code></dt> <dd>the size of the sliding 062 * window, must be a power of two - this determines the maximum 063 * offset a back-reference can take. The compressor maintains a 064 * buffer of twice of <code>windowSize</code> - real world values are 065 * in the area of 32k.</dd> 066 * 067 * <dt><code>minBackReferenceLength</code></dt> 068 * <dd>Minimal length of a back-reference found. A true minimum of 3 is 069 * hard-coded inside of this implemention but bigger lengths can be 070 * configured.</dd> 071 * 072 * <dt><code>maxBackReferenceLength</code></dt> 073 * <dd>Maximal length of a back-reference found.</dd> 074 * 075 * <dt><code>maxOffset</code></dt> 076 * <dd>Maximal offset of a back-reference.</dd> 077 * 078 * <dt><code>maxLiteralLength</code></dt> 079 * <dd>Maximal length of a literal block.</dd> 080 * </dl> 081 * 082 * @see "https://tools.ietf.org/html/rfc1951#section-4" 083 * @since 1.14 084 * @NotThreadSafe 085 */ 086public class LZ77Compressor { 087 088 /** 089 * Base class representing blocks the compressor may emit. 090 * 091 * <p>This class is not supposed to be subclassed by classes 092 * outside of Commons Compress so it is considered internal and 093 * changed that would break subclasses may get introduced with 094 * future releases.</p> 095 */ 096 public static abstract class Block { 097 /** Enumeration of the block types the compressor may emit. */ 098 public enum BlockType { 099 LITERAL, BACK_REFERENCE, EOD 100 } 101 public abstract BlockType getType(); 102 } 103 104 /** 105 * Represents a literal block of data. 106 * 107 * <p>For performance reasons this encapsulates the real data, not 108 * a copy of it. Don't modify the data and process it inside of 109 * {@link Callback#accept} immediately as it will get overwritten 110 * sooner or later.</p> 111 */ 112 public static final class LiteralBlock extends Block { 113 private final byte[] data; 114 private final int offset, length; 115 public LiteralBlock(byte[] data, int offset, int length) { 116 this.data = data; 117 this.offset = offset; 118 this.length = length; 119 } 120 /** 121 * The literal data. 122 * 123 * <p>This returns a life view of the actual data in order to 124 * avoid copying, modify the array at your own risk.</p> 125 * @return the data 126 */ 127 public byte[] getData() { 128 return data; 129 } 130 /** 131 * Offset into data where the literal block starts. 132 * @return the offset 133 */ 134 public int getOffset() { 135 return offset; 136 } 137 /** 138 * Length of literal block. 139 * @return the length 140 */ 141 public int getLength() { 142 return length; 143 } 144 @Override 145 public BlockType getType() { 146 return BlockType.LITERAL; 147 } 148 @Override 149 public String toString() { 150 return "LiteralBlock starting at " + offset + " with length " + length; 151 } 152 } 153 154 /** 155 * Represents a back-reference. 156 */ 157 public static final class BackReference extends Block { 158 private final int offset, length; 159 public BackReference(int offset, int length) { 160 this.offset = offset; 161 this.length = length; 162 } 163 /** 164 * Provides the offset of the back-reference. 165 * @return the offset 166 */ 167 public int getOffset() { 168 return offset; 169 } 170 /** 171 * Provides the length of the back-reference. 172 * @return the length 173 */ 174 public int getLength() { 175 return length; 176 } 177 @Override 178 public BlockType getType() { 179 return BlockType.BACK_REFERENCE; 180 } 181 @Override 182 public String toString() { 183 return "BackReference with offset " + offset + " and length " + length; 184 } 185 } 186 187 /** A simple "we are done" marker. */ 188 public static final class EOD extends Block { 189 @Override 190 public BlockType getType() { 191 return BlockType.EOD; 192 } 193 } 194 195 private static final Block THE_EOD = new EOD(); 196 197 /** 198 * Callback invoked while the compressor processes data. 199 * 200 * <p>The callback is invoked on the same thread that receives the 201 * bytes to compress and may be invoked multiple times during the 202 * execution of {@link #compress} or {@link #finish}.</p> 203 */ 204 public interface Callback { 205 /** 206 * Consumes a block. 207 * @param b the block to consume 208 * @throws IOException in case of an error 209 */ 210 void accept(Block b) throws IOException; 211 } 212 213 static final int NUMBER_OF_BYTES_IN_HASH = 3; 214 private static final int NO_MATCH = -1; 215 216 private final Parameters params; 217 private final Callback callback; 218 219 // the sliding window, twice as big as "windowSize" parameter 220 private final byte[] window; 221 // the head of hash-chain - indexed by hash-code, points to the 222 // location inside of window of the latest sequence of bytes with 223 // the given hash. 224 private final int[] head; 225 // for each window-location points to the latest earlier location 226 // with the same hash. Only stores values for the latest 227 // "windowSize" elements, the index is "window location modulo 228 // windowSize". 229 private final int[] prev; 230 231 // bit mask used when indexing into prev 232 private final int wMask; 233 234 private boolean initialized = false; 235 // the position inside of window that shall be encoded right now 236 private int currentPosition; 237 // the number of bytes available to compress including the one at 238 // currentPosition 239 private int lookahead = 0; 240 // the hash of the three bytes stating at the current position 241 private int insertHash = 0; 242 // the position inside of the window where the current literal 243 // block starts (in case we are inside of a literal block). 244 private int blockStart = 0; 245 // position of the current match 246 private int matchStart = NO_MATCH; 247 // number of missed insertString calls for the up to three last 248 // bytes of the last match that can only be performed once more 249 // data has been read 250 private int missedInserts = 0; 251 252 /** 253 * Initializes a compressor with parameters and a callback. 254 * @param params the parameters 255 * @param callback the callback 256 * @throws NullPointerException if either parameter is <code>null</code> 257 */ 258 public LZ77Compressor(Parameters params, Callback callback) { 259 Objects.requireNonNull(params, "params"); 260 Objects.requireNonNull(callback, "callback"); 261 262 this.params = params; 263 this.callback = callback; 264 265 final int wSize = params.getWindowSize(); 266 window = new byte[wSize * 2]; 267 wMask = wSize - 1; 268 head = new int[HASH_SIZE]; 269 Arrays.fill(head, NO_MATCH); 270 prev = new int[wSize]; 271 } 272 273 /** 274 * Feeds bytes into the compressor which in turn may emit zero or 275 * more blocks to the callback during the execution of this 276 * method. 277 * @param data the data to compress - must not be null 278 * @throws IOException if the callback throws an exception 279 */ 280 public void compress(byte[] data) throws IOException { 281 compress(data, 0, data.length); 282 } 283 284 /** 285 * Feeds bytes into the compressor which in turn may emit zero or 286 * more blocks to the callback during the execution of this 287 * method. 288 * @param data the data to compress - must not be null 289 * @param off the start offset of the data 290 * @param len the number of bytes to compress 291 * @throws IOException if the callback throws an exception 292 */ 293 public void compress(byte[] data, int off, int len) throws IOException { 294 final int wSize = params.getWindowSize(); 295 while (len > wSize) { // chop into windowSize sized chunks 296 doCompress(data, off, wSize); 297 off += wSize; 298 len -= wSize; 299 } 300 if (len > 0) { 301 doCompress(data, off, len); 302 } 303 } 304 305 /** 306 * Tells the compressor to process all remaining data and signal 307 * end of data to the callback. 308 * 309 * <p>The compressor will in turn emit at least one block ({@link 310 * EOD}) but potentially multiple blocks to the callback during 311 * the execution of this method.</p> 312 * @throws IOException if the callback throws an exception 313 */ 314 public void finish() throws IOException { 315 if (blockStart != currentPosition || lookahead > 0) { 316 currentPosition += lookahead; 317 flushLiteralBlock(); 318 } 319 callback.accept(THE_EOD); 320 } 321 322 /** 323 * Adds some initial data to fill the window with. 324 * 325 * <p>This is used if the stream has been cut into blocks and 326 * back-references of one block may refer to data of the previous 327 * block(s). One such example is the LZ4 frame format using block 328 * dependency.</p> 329 * 330 * @param data the data to fill the window with. 331 * @throws IllegalStateException if the compressor has already started to accept data 332 */ 333 public void prefill(byte[] data) { 334 if (currentPosition != 0 || lookahead != 0) { 335 throw new IllegalStateException("The compressor has already started to accept data, can't prefill anymore"); 336 } 337 338 // don't need more than windowSize for back-references 339 final int len = Math.min(params.getWindowSize(), data.length); 340 System.arraycopy(data, data.length - len, window, 0, len); 341 342 if (len >= NUMBER_OF_BYTES_IN_HASH) { 343 initialize(); 344 final int stop = len - NUMBER_OF_BYTES_IN_HASH + 1; 345 for (int i = 0; i < stop; i++) { 346 insertString(i); 347 } 348 missedInserts = NUMBER_OF_BYTES_IN_HASH - 1; 349 } else { // not enough data to hash anything 350 missedInserts = len; 351 } 352 blockStart = currentPosition = len; 353 } 354 355 // we use a 15 bit hashcode as calculated in updateHash 356 private static final int HASH_SIZE = 1 << 15; 357 private static final int HASH_MASK = HASH_SIZE - 1; 358 private static final int H_SHIFT = 5; 359 360 /** 361 * Assumes we are calculating the hash for three consecutive bytes 362 * as a rolling hash, i.e. for bytes ABCD if H is the hash of ABC 363 * the new hash for BCD is nextHash(H, D). 364 * 365 * <p>The hash is shifted by five bits on each update so all 366 * effects of A have been swapped after the third update.</p> 367 */ 368 private int nextHash(int oldHash, byte nextByte) { 369 final int nextVal = nextByte & 0xFF; 370 return ((oldHash << H_SHIFT) ^ nextVal) & HASH_MASK; 371 } 372 373 // performs the actual algorithm with the pre-condition len <= windowSize 374 private void doCompress(byte[] data, int off, int len) throws IOException { 375 int spaceLeft = window.length - currentPosition - lookahead; 376 if (len > spaceLeft) { 377 slide(); 378 } 379 System.arraycopy(data, off, window, currentPosition + lookahead, len); 380 lookahead += len; 381 if (!initialized && lookahead >= params.getMinBackReferenceLength()) { 382 initialize(); 383 } 384 if (initialized) { 385 compress(); 386 } 387 } 388 389 private void slide() throws IOException { 390 final int wSize = params.getWindowSize(); 391 if (blockStart != currentPosition && blockStart < wSize) { 392 flushLiteralBlock(); 393 blockStart = currentPosition; 394 } 395 System.arraycopy(window, wSize, window, 0, wSize); 396 currentPosition -= wSize; 397 matchStart -= wSize; 398 blockStart -= wSize; 399 for (int i = 0; i < HASH_SIZE; i++) { 400 int h = head[i]; 401 head[i] = h >= wSize ? h - wSize : NO_MATCH; 402 } 403 for (int i = 0; i < wSize; i++) { 404 int p = prev[i]; 405 prev[i] = p >= wSize ? p - wSize : NO_MATCH; 406 } 407 } 408 409 private void initialize() { 410 for (int i = 0; i < NUMBER_OF_BYTES_IN_HASH - 1; i++) { 411 insertHash = nextHash(insertHash, window[i]); 412 } 413 initialized = true; 414 } 415 416 private void compress() throws IOException { 417 final int minMatch = params.getMinBackReferenceLength(); 418 final boolean lazy = params.getLazyMatching(); 419 final int lazyThreshold = params.getLazyMatchingThreshold(); 420 421 while (lookahead >= minMatch) { 422 catchUpMissedInserts(); 423 int matchLength = 0; 424 int hashHead = insertString(currentPosition); 425 if (hashHead != NO_MATCH && hashHead - currentPosition <= params.getMaxOffset()) { 426 // sets matchStart as a side effect 427 matchLength = longestMatch(hashHead); 428 429 if (lazy && matchLength <= lazyThreshold && lookahead > minMatch) { 430 // try to find a longer match using the next position 431 matchLength = longestMatchForNextPosition(matchLength); 432 } 433 } 434 if (matchLength >= minMatch) { 435 if (blockStart != currentPosition) { 436 // emit preceeding literal block 437 flushLiteralBlock(); 438 blockStart = NO_MATCH; 439 } 440 flushBackReference(matchLength); 441 insertStringsInMatch(matchLength); 442 lookahead -= matchLength; 443 currentPosition += matchLength; 444 blockStart = currentPosition; 445 } else { 446 // no match, append to current or start a new literal 447 lookahead--; 448 currentPosition++; 449 if (currentPosition - blockStart >= params.getMaxLiteralLength()) { 450 flushLiteralBlock(); 451 blockStart = currentPosition; 452 } 453 } 454 } 455 } 456 457 /** 458 * Inserts the current three byte sequence into the dictionary and 459 * returns the previous head of the hash-chain. 460 * 461 * <p>Updates <code>insertHash</code> and <code>prev</code> as a 462 * side effect.</p> 463 */ 464 private int insertString(int pos) { 465 insertHash = nextHash(insertHash, window[pos - 1 + NUMBER_OF_BYTES_IN_HASH]); 466 int hashHead = head[insertHash]; 467 prev[pos & wMask] = hashHead; 468 head[insertHash] = pos; 469 return hashHead; 470 } 471 472 private int longestMatchForNextPosition(final int prevMatchLength) { 473 // save a bunch of values to restore them if the next match isn't better than the current one 474 final int prevMatchStart = matchStart; 475 final int prevInsertHash = insertHash; 476 477 lookahead--; 478 currentPosition++; 479 int hashHead = insertString(currentPosition); 480 final int prevHashHead = prev[currentPosition & wMask]; 481 int matchLength = longestMatch(hashHead); 482 483 if (matchLength <= prevMatchLength) { 484 // use the first match, as the next one isn't any better 485 matchLength = prevMatchLength; 486 matchStart = prevMatchStart; 487 488 // restore modified values 489 head[insertHash] = prevHashHead; 490 insertHash = prevInsertHash; 491 currentPosition--; 492 lookahead++; 493 } 494 return matchLength; 495 } 496 497 private void insertStringsInMatch(int matchLength) { 498 // inserts strings contained in current match 499 // insertString inserts the byte 2 bytes after position, which may not yet be available -> missedInserts 500 final int stop = Math.min(matchLength - 1, lookahead - NUMBER_OF_BYTES_IN_HASH); 501 // currentPosition has been inserted already 502 for (int i = 1; i <= stop; i++) { 503 insertString(currentPosition + i); 504 } 505 missedInserts = matchLength - stop - 1; 506 } 507 508 private void catchUpMissedInserts() { 509 while (missedInserts > 0) { 510 insertString(currentPosition - missedInserts--); 511 } 512 } 513 514 private void flushBackReference(int matchLength) throws IOException { 515 callback.accept(new BackReference(currentPosition - matchStart, matchLength)); 516 } 517 518 private void flushLiteralBlock() throws IOException { 519 callback.accept(new LiteralBlock(window, blockStart, currentPosition - blockStart)); 520 } 521 522 /** 523 * Searches the hash chain for real matches and returns the length 524 * of the longest match (0 if none were found) that isn't too far 525 * away (WRT maxOffset). 526 * 527 * <p>Sets matchStart to the index of the start position of the 528 * longest match as a side effect.</p> 529 */ 530 private int longestMatch(int matchHead) { 531 final int minLength = params.getMinBackReferenceLength(); 532 int longestMatchLength = minLength - 1; 533 final int maxPossibleLength = Math.min(params.getMaxBackReferenceLength(), lookahead); 534 final int minIndex = Math.max(0, currentPosition - params.getMaxOffset()); 535 final int niceBackReferenceLength = Math.min(maxPossibleLength, params.getNiceBackReferenceLength()); 536 final int maxCandidates = params.getMaxCandidates(); 537 for (int candidates = 0; candidates < maxCandidates && matchHead >= minIndex; candidates++) { 538 int currentLength = 0; 539 for (int i = 0; i < maxPossibleLength; i++) { 540 if (window[matchHead + i] != window[currentPosition + i]) { 541 break; 542 } 543 currentLength++; 544 } 545 if (currentLength > longestMatchLength) { 546 longestMatchLength = currentLength; 547 matchStart = matchHead; 548 if (currentLength >= niceBackReferenceLength) { 549 // no need to search any further 550 break; 551 } 552 } 553 matchHead = prev[matchHead & wMask]; 554 } 555 return longestMatchLength; // < minLength if no matches have been found, will be ignored in compress() 556 } 557}