001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, 013 * software distributed under the License is distributed on an 014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 015 * KIND, either express or implied. See the License for the 016 * specific language governing permissions and limitations 017 * under the License. 018 */ 019package org.apache.commons.compress.compressors.lz77support; 020 021import java.io.IOException; 022import java.io.InputStream; 023import java.util.Arrays; 024 025import org.apache.commons.compress.compressors.CompressorInputStream; 026import org.apache.commons.compress.utils.ByteUtils; 027import org.apache.commons.compress.utils.CountingInputStream; 028import org.apache.commons.compress.utils.IOUtils; 029import org.apache.commons.compress.utils.InputStreamStatistics; 030 031/** 032 * Encapsulates code common to LZ77 decompressors. 033 * 034 * <p>Assumes the stream consists of blocks of literal data and 035 * back-references (called copies) in any order. Of course the first 036 * block must be a literal block for the scheme to work - unless the 037 * {@link #prefill prefill} method has been used to provide initial 038 * data that is never returned by {@link #read read} but only used for 039 * back-references.</p> 040 * 041 * <p>Subclasses must override the three-arg {@link #read read} method 042 * as the no-arg version delegates to it and the default 043 * implementation delegates to the no-arg version, leading to infinite 044 * mutual recursion and a {@code StackOverflowError} otherwise.</p> 045 * 046 * <p>The contract for subclasses' {@code read} implementation is:</p> 047 * <ul> 048 * 049 * <li>keep track of the current state of the stream. Is it inside a 050 * literal block or a back-reference or in-between blocks?</li> 051 * 052 * <li>Use {@link #readOneByte} to access the underlying stream 053 * directly.</li> 054 * 055 * <li>If a new literal block starts, use {@link #startLiteral} to 056 * tell this class about it and read the literal data using {@link 057 * #readLiteral} until it returns {@code 0}. {@link 058 * #hasMoreDataInBlock} will return {@code false} before the next 059 * call to {@link #readLiteral} would return {@code 0}.</li> 060 * 061 * <li>If a new back-reference starts, use {@link #startBackReference} to 062 * tell this class about it and read the literal data using {@link 063 * #readBackReference} until it returns {@code 0}. {@link 064 * #hasMoreDataInBlock} will return {@code false} before the next 065 * call to {@link #readBackReference} would return {@code 0}.</li> 066 * 067 * <li>If the end of the stream has been reached, return {@code -1} 068 * as this class' methods will never do so themselves.</li> 069 * 070 * </ul> 071 * 072 * <p>{@link #readOneByte} and {@link #readLiteral} update the counter 073 * for bytes read.</p> 074 * 075 * @since 1.14 076 */ 077public abstract class AbstractLZ77CompressorInputStream extends CompressorInputStream 078 implements InputStreamStatistics { 079 080 /** Size of the window - must be bigger than the biggest offset expected. */ 081 private final int windowSize; 082 083 /** 084 * Buffer to write decompressed bytes to for back-references, will 085 * be three times windowSize big. 086 * 087 * <p>Three times so we can slide the whole buffer a windowSize to 088 * the left once we've read twice windowSize and still have enough 089 * data inside of it to satisfy back-references.</p> 090 */ 091 private final byte[] buf; 092 093 /** One behind the index of the last byte in the buffer that was written, i.e. the next position to write to */ 094 private int writeIndex; 095 096 /** Index of the next byte to be read. */ 097 private int readIndex; 098 099 /** The underlying stream to read compressed data from */ 100 private final CountingInputStream in; 101 102 /** Number of bytes still to be read from the current literal or back-reference. */ 103 private long bytesRemaining; 104 105 /** Offset of the current back-reference. */ 106 private int backReferenceOffset; 107 108 /** uncompressed size */ 109 private int size = 0; 110 111 // used in no-arg read method 112 private final byte[] oneByte = new byte[1]; 113 114 /** 115 * Supplier that delegates to {@link #readOneByte}. 116 */ 117 protected final ByteUtils.ByteSupplier supplier = new ByteUtils.ByteSupplier() { 118 @Override 119 public int getAsByte() throws IOException { 120 return readOneByte(); 121 } 122 }; 123 124 /** 125 * Creates a new LZ77 input stream. 126 * 127 * @param is 128 * An InputStream to read compressed data from 129 * @param windowSize 130 * Size of the window kept for back-references, must be bigger than the biggest offset expected. 131 * 132 * @throws IOException if reading fails 133 * @throws IllegalArgumentException if windowSize is not bigger than 0 134 */ 135 public AbstractLZ77CompressorInputStream(final InputStream is, int windowSize) throws IOException { 136 this.in = new CountingInputStream(is); 137 if (windowSize <= 0) { 138 throw new IllegalArgumentException("windowSize must be bigger than 0"); 139 } 140 this.windowSize = windowSize; 141 buf = new byte[3 * windowSize]; 142 writeIndex = readIndex = 0; 143 bytesRemaining = 0; 144 } 145 146 /** {@inheritDoc} */ 147 @Override 148 public int read() throws IOException { 149 return read(oneByte, 0, 1) == -1 ? -1 : oneByte[0] & 0xFF; 150 } 151 152 /** {@inheritDoc} */ 153 @Override 154 public void close() throws IOException { 155 in.close(); 156 } 157 158 /** {@inheritDoc} */ 159 @Override 160 public int available() { 161 return writeIndex - readIndex; 162 } 163 164 /** 165 * Get the uncompressed size of the stream 166 * 167 * @return the uncompressed size 168 */ 169 public int getSize() { 170 return size; 171 } 172 173 /** 174 * Adds some initial data to fill the window with. 175 * 176 * <p>This is used if the stream has been cut into blocks and 177 * back-references of one block may refer to data of the previous 178 * block(s). One such example is the LZ4 frame format using block 179 * dependency.</p> 180 * 181 * @param data the data to fill the window with. 182 * @throws IllegalStateException if the stream has already started to read data 183 */ 184 public void prefill(byte[] data) { 185 if (writeIndex != 0) { 186 throw new IllegalStateException("The stream has already been read from, can't prefill anymore"); 187 } 188 // we don't need more data than the big offset could refer to, so cap it 189 int len = Math.min(windowSize, data.length); 190 // we need the last data as we are dealing with *back*-references 191 System.arraycopy(data, data.length - len, buf, 0, len); 192 writeIndex += len; 193 readIndex += len; 194 } 195 196 /** 197 * @since 1.17 198 */ 199 @Override 200 public long getCompressedCount() { 201 return in.getBytesRead(); 202 } 203 204 /** 205 * Used by subclasses to signal the next block contains the given 206 * amount of literal data. 207 * @param length the length of the block 208 * @throws IllegalArgumentException if length is negative 209 */ 210 protected final void startLiteral(long length) { 211 if (length < 0) { 212 throw new IllegalArgumentException("length must not be negative"); 213 } 214 bytesRemaining = length; 215 } 216 217 /** 218 * Is there still data remaining inside the current block? 219 * @return true if there is still data remaining inside the current block. 220 */ 221 protected final boolean hasMoreDataInBlock() { 222 return bytesRemaining > 0; 223 } 224 225 /** 226 * Reads data from the current literal block. 227 * @param b buffer to write data to 228 * @param off offset to start writing to 229 * @param len maximum amount of data to read 230 * @return number of bytes read, may be 0. Will never return -1 as 231 * EOF-detection is the responsibility of the subclass 232 * @throws IOException if the underlying stream throws or signals 233 * an EOF before the amount of data promised for the block have 234 * been read 235 * @throws NullPointerException if <code>b</code> is null 236 * @throws IndexOutOfBoundsException if <code>off</code> is 237 * negative, <code>len</code> is negative, or <code>len</code> is 238 * greater than <code>b.length - off</code> 239 */ 240 protected final int readLiteral(final byte[] b, final int off, final int len) throws IOException { 241 final int avail = available(); 242 if (len > avail) { 243 tryToReadLiteral(len - avail); 244 } 245 return readFromBuffer(b, off, len); 246 } 247 248 private void tryToReadLiteral(int bytesToRead) throws IOException { 249 // min of "what is still inside the literal", "what does the user want" and "how much can fit into the buffer" 250 final int reallyTryToRead = Math.min((int) Math.min(bytesToRead, bytesRemaining), 251 buf.length - writeIndex); 252 final int bytesRead = reallyTryToRead > 0 253 ? IOUtils.readFully(in, buf, writeIndex, reallyTryToRead) 254 : 0 /* happens for bytesRemaining == 0 */; 255 count(bytesRead); 256 if (reallyTryToRead != bytesRead) { 257 throw new IOException("Premature end of stream reading literal"); 258 } 259 writeIndex += reallyTryToRead; 260 bytesRemaining -= reallyTryToRead; 261 } 262 263 private int readFromBuffer(final byte[] b, final int off, final int len) { 264 final int readable = Math.min(len, available()); 265 if (readable > 0) { 266 System.arraycopy(buf, readIndex, b, off, readable); 267 readIndex += readable; 268 if (readIndex > 2 * windowSize) { 269 slideBuffer(); 270 } 271 } 272 size += readable; 273 return readable; 274 } 275 276 private void slideBuffer() { 277 System.arraycopy(buf, windowSize, buf, 0, windowSize * 2); 278 writeIndex -= windowSize; 279 readIndex -= windowSize; 280 } 281 282 /** 283 * Used by subclasses to signal the next block contains a back-reference with the given coordinates. 284 * @param offset the offset of the back-reference 285 * @param length the length of the back-reference 286 * @throws IllegalArgumentException if offset not bigger than 0 or 287 * bigger than the number of bytes available for back-references 288 * or if length is negative 289 */ 290 protected final void startBackReference(int offset, long length) { 291 if (offset <= 0 || offset > writeIndex) { 292 throw new IllegalArgumentException("offset must be bigger than 0 but not bigger than the number" 293 + " of bytes available for back-references"); 294 } 295 if (length < 0) { 296 throw new IllegalArgumentException("length must not be negative"); 297 } 298 backReferenceOffset = offset; 299 bytesRemaining = length; 300 } 301 302 /** 303 * Reads data from the current back-reference. 304 * @param b buffer to write data to 305 * @param off offset to start writing to 306 * @param len maximum amount of data to read 307 * @return number of bytes read, may be 0. Will never return -1 as 308 * EOF-detection is the responsibility of the subclass 309 * @throws NullPointerException if <code>b</code> is null 310 * @throws IndexOutOfBoundsException if <code>off</code> is 311 * negative, <code>len</code> is negative, or <code>len</code> is 312 * greater than <code>b.length - off</code> 313 */ 314 protected final int readBackReference(final byte[] b, final int off, final int len) { 315 final int avail = available(); 316 if (len > avail) { 317 tryToCopy(len - avail); 318 } 319 return readFromBuffer(b, off, len); 320 } 321 322 private void tryToCopy(int bytesToCopy) { 323 // this will fit into the buffer without sliding and not 324 // require more than is available inside the back-reference 325 int copy = Math.min((int) Math.min(bytesToCopy, bytesRemaining), 326 buf.length - writeIndex); 327 if (copy == 0) { 328 // NOP 329 } else if (backReferenceOffset == 1) { // pretty common special case 330 final byte last = buf[writeIndex - 1]; 331 Arrays.fill(buf, writeIndex, writeIndex + copy, last); 332 writeIndex += copy; 333 } else if (copy < backReferenceOffset) { 334 System.arraycopy(buf, writeIndex - backReferenceOffset, buf, writeIndex, copy); 335 writeIndex += copy; 336 } else { 337 // back-reference overlaps with the bytes created from it 338 // like go back two bytes and then copy six (by copying 339 // the last two bytes three time). 340 final int fullRots = copy / backReferenceOffset; 341 for (int i = 0; i < fullRots; i++) { 342 System.arraycopy(buf, writeIndex - backReferenceOffset, buf, writeIndex, backReferenceOffset); 343 writeIndex += backReferenceOffset; 344 } 345 346 final int pad = copy - (backReferenceOffset * fullRots); 347 if (pad > 0) { 348 System.arraycopy(buf, writeIndex - backReferenceOffset, buf, writeIndex, pad); 349 writeIndex += pad; 350 } 351 } 352 bytesRemaining -= copy; 353 } 354 355 /** 356 * Reads a single byte from the real input stream and ensures the data is accounted for. 357 * 358 * @return the byte read as value between 0 and 255 or -1 if EOF has been reached. 359 * @throws IOException if the underlying stream throws 360 */ 361 protected final int readOneByte() throws IOException { 362 final int b = in.read(); 363 if (b != -1) { 364 count(1); 365 return b & 0xFF; 366 } 367 return -1; 368 } 369}