/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.commons.compress.compressors.gzip;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.EOFException;
import java.io.InputStream;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.BufferedInputStream;
import java.util.zip.DataFormatException;
import java.util.zip.Deflater;
import java.util.zip.Inflater;
import java.util.zip.CRC32;

import org.apache.commons.compress.compressors.CompressorInputStream;
import org.apache.commons.compress.utils.ByteUtils;
import org.apache.commons.compress.utils.CharsetNames;
import org.apache.commons.compress.utils.CountingInputStream;
import org.apache.commons.compress.utils.IOUtils;
import org.apache.commons.compress.utils.InputStreamStatistics;

/**
 * Input stream that decompresses .gz files.
 *
 * <p>This supports decompressing concatenated .gz files which is important
 * when decompressing standalone .gz files.</p>
 *
 * <p>
 * {@link java.util.zip.GZIPInputStream} doesn't decompress concatenated .gz
 * files: it stops after the first member and silently ignores the rest.
 * It doesn't leave the read position to point to the beginning of the next
 * member, which makes it difficult to work around the lack of concatenation
 * support.
 * </p>
 *
 * <p>
 * Instead of using <code>GZIPInputStream</code>, this class has its own .gz
 * container format decoder. The actual decompression is done with
 * {@link java.util.zip.Inflater}.
 * </p>
 *
 * <p>If you use the constructor {@code GzipCompressorInputStream(in)}
 * or {@code GzipCompressorInputStream(in, false)} with some {@code
 * InputStream} {@code in} then {@link #read} will return -1 as soon
 * as the first internal member has been read completely. The stream
 * {@code in} will be positioned at the start of the second gzip
 * member if there is one.</p>
 *
 * <p>If you use the constructor {@code GzipCompressorInputStream(in,
 * true)} with some {@code InputStream} {@code in} then {@link #read}
 * will return -1 once the stream {@code in} has been exhausted. The
 * data read from a stream constructed this way will consist of the
 * concatenated data of all gzip members contained inside {@code
 * in}.</p>
 *
 * @see "https://tools.ietf.org/html/rfc1952"
 */
public class GzipCompressorInputStream extends CompressorInputStream
    implements InputStreamStatistics {

    // Header flags
    // private static final int FTEXT = 0x01; // Uninteresting for us
    private static final int FHCRC = 0x02;
    private static final int FEXTRA = 0x04;
    private static final int FNAME = 0x08;
    private static final int FCOMMENT = 0x10;
    private static final int FRESERVED = 0xE0;

    private final CountingInputStream countingStream;

    // Compressed input stream, possibly wrapped in a
    // BufferedInputStream, always wrapped in countingStream above
    private final InputStream in;

    // True if decompressing multi member streams.
    private final boolean decompressConcatenated;

    // Buffer to hold the input data
    private final byte[] buf = new byte[8192];

    // Amount of data in buf.
    private int bufUsed;

    // Decompressor
    private Inflater inf = new Inflater(true);

    // CRC32 from uncompressed data
    private final CRC32 crc = new CRC32();

    // True once everything has been decompressed
    private boolean endReached = false;

    // used in no-arg read method
    private final byte[] oneByte = new byte[1];

    // Meta data of the member currently being decompressed. Replaced by a
    // fresh instance for every member so that values of an earlier member
    // (file name, comment, compression level) never leak into a later one.
    private GzipParameters parameters = new GzipParameters();

    /**
     * Constructs a new input stream that decompresses gzip-compressed data
     * from the specified input stream.
     * <p>
     * This is equivalent to
     * <code>GzipCompressorInputStream(inputStream, false)</code> and thus
     * will not decompress concatenated .gz files.
     *
     * @param inputStream  the InputStream from which this object should
     *                     be created
     *
     * @throws IOException if the stream could not be created
     */
    public GzipCompressorInputStream(final InputStream inputStream)
            throws IOException {
        this(inputStream, false);
    }

    /**
     * Constructs a new input stream that decompresses gzip-compressed data
     * from the specified input stream.
     * <p>
     * If <code>decompressConcatenated</code> is {@code false}:
     * This decompressor might read more input than it will actually use.
     * If <code>inputStream</code> supports <code>mark</code> and
     * <code>reset</code>, then the input position will be adjusted
     * so that it is right after the last byte of the compressed stream.
     * If <code>mark</code> isn't supported, the input position will be
     * undefined.
     *
     * @param inputStream  the InputStream from which this object should
     *                     be created
     * @param decompressConcatenated
     *                     if true, decompress until the end of the input;
     *                     if false, stop after the first .gz member
     *
     * @throws IOException if the stream could not be created
     */
    public GzipCompressorInputStream(final InputStream inputStream,
                                     final boolean decompressConcatenated)
            throws IOException {
        countingStream = new CountingInputStream(inputStream);
        // Mark support is strictly needed for concatenated files only,
        // but it's simpler if it is always available.
        if (countingStream.markSupported()) {
            in = countingStream;
        } else {
            in = new BufferedInputStream(countingStream);
        }

        this.decompressConcatenated = decompressConcatenated;
        init(true);
    }

    /**
     * Provides the stream's meta data - may change with each stream
     * when decompressing concatenated streams.
     *
     * <p>The returned object describes the member whose header was parsed
     * most recently; a new instance is created for every member, so a
     * reference obtained earlier keeps describing the earlier member.</p>
     *
     * @return the stream's meta data
     * @since 1.8
     */
    public GzipParameters getMetaData() {
        return parameters;
    }

    /**
     * Parses the header of the next .gz member and prepares the inflater
     * and the CRC for decompressing its payload.
     *
     * @param isFirstMember whether this is the very first member of the
     *                      stream; only for later members is a clean end of
     *                      input accepted
     * @return {@code true} if a header has been parsed, {@code false} if the
     *         end of the input was reached before a later member started
     * @throws IOException if the header is malformed, uses an unsupported
     *                     compression method or reserved flags, or if the
     *                     input ends prematurely ({@link EOFException})
     */
    private boolean init(final boolean isFirstMember) throws IOException {
        assert isFirstMember || decompressConcatenated;

        // Check the magic bytes without a possibility of EOFException.
        final int magic0 = in.read();

        // If end of input was reached after decompressing at least
        // one .gz member, we have reached the end of the file successfully.
        if (magic0 == -1 && !isFirstMember) {
            return false;
        }

        if (magic0 != 31 || in.read() != 139) {
            throw new IOException(isFirstMember
                                  ? "Input is not in the .gz format"
                                  : "Garbage after a valid .gz stream");
        }

        // Start with clean meta data: optional header fields that are absent
        // in this member must not keep the values of the previous member.
        parameters = new GzipParameters();

        // Parsing the rest of the header may throw EOFException.
        final DataInput inData = new DataInputStream(in);
        final int method = inData.readUnsignedByte();
        if (method != Deflater.DEFLATED) {
            throw new IOException("Unsupported compression method "
                                  + method + " in the .gz header");
        }

        final int flg = inData.readUnsignedByte();
        if ((flg & FRESERVED) != 0) {
            throw new IOException(
                    "Reserved flags are set in the .gz header");
        }

        // MTIME is stored in seconds, the parameters expect milliseconds.
        parameters.setModificationTime(ByteUtils.fromLittleEndian(inData, 4) * 1000);
        switch (inData.readUnsignedByte()) { // extra flags
        case 2:
            parameters.setCompressionLevel(Deflater.BEST_COMPRESSION);
            break;
        case 4:
            parameters.setCompressionLevel(Deflater.BEST_SPEED);
            break;
        default:
            // ignored for now
            break;
        }
        parameters.setOperatingSystem(inData.readUnsignedByte());

        // Extra field, ignored
        if ((flg & FEXTRA) != 0) {
            int xlen = inData.readUnsignedByte();
            xlen |= inData.readUnsignedByte() << 8;

            // This isn't as efficient as calling in.skip would be,
            // but it's lazier to handle unexpected end of input this way.
            // Most files don't have an extra field anyway.
            while (xlen-- > 0) {
                inData.readUnsignedByte();
            }
        }

        // Original file name
        if ((flg & FNAME) != 0) {
            parameters.setFilename(new String(readToNull(inData),
                                              CharsetNames.ISO_8859_1));
        }

        // Comment
        if ((flg & FCOMMENT) != 0) {
            parameters.setComment(new String(readToNull(inData),
                                             CharsetNames.ISO_8859_1));
        }

        // Header "CRC16" which is actually a truncated CRC32 (which isn't
        // as good as real CRC16). I don't know if any encoder implementation
        // sets this, so it's not worth trying to verify it. GNU gzip 1.4
        // doesn't support this field, but zlib seems to be able to at least
        // skip over it.
        if ((flg & FHCRC) != 0) {
            inData.readShort();
        }

        // Reset
        inf.reset();
        crc.reset();

        return true;
    }

    /**
     * Reads bytes up to (and consuming) the next zero byte.
     *
     * @param inData the input to read from
     * @return the bytes read, without the terminating zero byte
     * @throws IOException if reading fails or the input ends before a zero
     *                     byte has been seen
     */
    private static byte[] readToNull(final DataInput inData) throws IOException {
        try (final ByteArrayOutputStream bos = new ByteArrayOutputStream()) {
            int b;
            while ((b = inData.readUnsignedByte()) != 0x00) { // NOPMD NOSONAR
                bos.write(b);
            }
            return bos.toByteArray();
        }
    }

    /**
     * Reads a single decompressed byte.
     *
     * @return the byte as a value between 0 and 255, or -1 at the end of
     *         the decompressed data
     * @throws IOException if reading or decompressing fails
     */
    @Override
    public int read() throws IOException {
        return read(oneByte, 0, 1) == -1 ? -1 : oneByte[0] & 0xFF;
    }

    /**
     * {@inheritDoc}
     *
     * @since 1.1
     */
    @Override
    public int read(final byte[] b, int off, int len) throws IOException {
        if (len == 0) {
            return 0;
        }
        if (endReached) {
            return -1;
        }

        int size = 0;

        while (len > 0) {
            if (inf.needsInput()) {
                // Remember the current position because we may need to
                // rewind after reading too much input.
                in.mark(buf.length);

                bufUsed = in.read(buf);
                if (bufUsed == -1) {
                    throw new EOFException();
                }

                inf.setInput(buf, 0, bufUsed);
            }

            int ret;
            try {
                ret = inf.inflate(b, off, len);
            } catch (final DataFormatException e) { // NOSONAR
                // Keep the inflater's exception as cause so that the actual
                // reason of the corruption is not lost.
                throw new IOException("Gzip-compressed data is corrupt", e);
            }

            crc.update(b, off, ret);
            off += ret;
            len -= ret;
            size += ret;
            count(ret);

            if (inf.finished()) {
                // We may have read too many bytes. Rewind the read
                // position to match the actual amount used.
                in.reset();

                final int skipAmount = bufUsed - inf.getRemaining();
                if (IOUtils.skip(in, skipAmount) != skipAmount) {
                    throw new IOException("Unable to reposition the input"
                                          + " after the end of the compressed data");
                }

                bufUsed = 0;

                final DataInput inData = new DataInputStream(in);

                // CRC32
                final long crcStored = ByteUtils.fromLittleEndian(inData, 4);

                if (crcStored != crc.getValue()) {
                    throw new IOException("Gzip-compressed data is corrupt "
                                          + "(CRC32 error)");
                }

                // Uncompressed size modulo 2^32 (ISIZE in the spec)
                final long isize = ByteUtils.fromLittleEndian(inData, 4);

                if (isize != (inf.getBytesWritten() & 0xffffffffL)) {
                    throw new IOException("Gzip-compressed data is corrupt "
                                          + "(uncompressed size mismatch)");
                }

                // See if this is the end of the file.
                if (!decompressConcatenated || !init(false)) {
                    inf.end();
                    inf = null;
                    endReached = true;
                    return size == 0 ? -1 : size;
                }
            }
        }

        return size;
    }

    /**
     * Checks if the signature matches what is expected for a .gz file.
     *
     * @param signature the bytes to check
     * @param length    the number of bytes to check
     * @return          true if this is a .gz stream, false otherwise
     *
     * @since 1.1
     */
    public static boolean matches(final byte[] signature, final int length) {
        // -117 is the second magic byte 139 (0x8B) as a signed byte.
        return length >= 2 && signature[0] == 31 && signature[1] == -117;
    }

    /**
     * Closes the input stream (unless it is System.in).
     *
     * @since 1.2
     */
    @Override
    public void close() throws IOException {
        if (inf != null) {
            inf.end();
            inf = null;
        }

        if (this.in != System.in) {
            this.in.close();
        }
    }

    /**
     * @since 1.17
     */
    @Override
    public long getCompressedCount() {
        return countingStream.getBytesRead();
    }
}