001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, 013 * software distributed under the License is distributed on an 014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 015 * KIND, either express or implied. See the License for the 016 * specific language governing permissions and limitations 017 * under the License. 018 */ 019package org.apache.commons.compress.compressors.gzip; 020 021import java.io.ByteArrayOutputStream; 022import java.io.IOException; 023import java.io.EOFException; 024import java.io.InputStream; 025import java.io.DataInput; 026import java.io.DataInputStream; 027import java.io.BufferedInputStream; 028import java.util.zip.DataFormatException; 029import java.util.zip.Deflater; 030import java.util.zip.Inflater; 031import java.util.zip.CRC32; 032 033import org.apache.commons.compress.compressors.CompressorInputStream; 034import org.apache.commons.compress.utils.ByteUtils; 035import org.apache.commons.compress.utils.CharsetNames; 036 037/** 038 * Input stream that decompresses .gz files. 039 * 040 * <p>This supports decompressing concatenated .gz files which is important 041 * when decompressing standalone .gz files.</p> 042 * 043 * <p> 044 * {@link java.util.zip.GZIPInputStream} doesn't decompress concatenated .gz 045 * files: it stops after the first member and silently ignores the rest. 046 * It doesn't leave the read position to point to the beginning of the next 047 * member, which makes it difficult workaround the lack of concatenation 048 * support. 049 * </p> 050 * 051 * <p> 052 * Instead of using <code>GZIPInputStream</code>, this class has its own .gz 053 * container format decoder. The actual decompression is done with 054 * {@link java.util.zip.Inflater}. 055 * </p> 056 * 057 * <p>If you use the constructor {@code GzipCompressorInputStream(in)} 058 * or {@code GzipCompressorInputStream(in, false)} with some {@code 059 * InputStream} {@code in} then {@link #read} will return -1 as soon 060 * as the first internal member has been read completely. The stream 061 * {@code in} will be positioned at the start of the second gzip 062 * member if there is one.</p> 063 * 064 * <p>If you use the constructor {@code GzipCompressorInputStream(in, 065 * true)} with some {@code InputStream} {@code in} then {@link #read} 066 * will return -1 once the stream {@code in} has been exhausted. The 067 * data read from a stream constructed this way will consist of the 068 * concatenated data of all gzip members contained inside {@code 069 * in}.</p> 070 * 071 * @see "https://tools.ietf.org/html/rfc1952" 072 */ 073public class GzipCompressorInputStream extends CompressorInputStream { 074 // Header flags 075 // private static final int FTEXT = 0x01; // Uninteresting for us 076 private static final int FHCRC = 0x02; 077 private static final int FEXTRA = 0x04; 078 private static final int FNAME = 0x08; 079 private static final int FCOMMENT = 0x10; 080 private static final int FRESERVED = 0xE0; 081 082 // Compressed input stream, possibly wrapped in a BufferedInputStream 083 private final InputStream in; 084 085 // True if decompressing multi member streams. 086 private final boolean decompressConcatenated; 087 088 // Buffer to hold the input data 089 private final byte[] buf = new byte[8192]; 090 091 // Amount of data in buf. 092 private int bufUsed; 093 094 // Decompressor 095 private Inflater inf = new Inflater(true); 096 097 // CRC32 from uncompressed data 098 private final CRC32 crc = new CRC32(); 099 100 // True once everything has been decompressed 101 private boolean endReached = false; 102 103 // used in no-arg read method 104 private final byte[] oneByte = new byte[1]; 105 106 private final GzipParameters parameters = new GzipParameters(); 107 108 /** 109 * Constructs a new input stream that decompresses gzip-compressed data 110 * from the specified input stream. 111 * <p> 112 * This is equivalent to 113 * <code>GzipCompressorInputStream(inputStream, false)</code> and thus 114 * will not decompress concatenated .gz files. 115 * 116 * @param inputStream the InputStream from which this object should 117 * be created of 118 * 119 * @throws IOException if the stream could not be created 120 */ 121 public GzipCompressorInputStream(final InputStream inputStream) 122 throws IOException { 123 this(inputStream, false); 124 } 125 126 /** 127 * Constructs a new input stream that decompresses gzip-compressed data 128 * from the specified input stream. 129 * <p> 130 * If <code>decompressConcatenated</code> is {@code false}: 131 * This decompressor might read more input than it will actually use. 132 * If <code>inputStream</code> supports <code>mark</code> and 133 * <code>reset</code>, then the input position will be adjusted 134 * so that it is right after the last byte of the compressed stream. 135 * If <code>mark</code> isn't supported, the input position will be 136 * undefined. 137 * 138 * @param inputStream the InputStream from which this object should 139 * be created of 140 * @param decompressConcatenated 141 * if true, decompress until the end of the input; 142 * if false, stop after the first .gz member 143 * 144 * @throws IOException if the stream could not be created 145 */ 146 public GzipCompressorInputStream(final InputStream inputStream, 147 final boolean decompressConcatenated) 148 throws IOException { 149 // Mark support is strictly needed for concatenated files only, 150 // but it's simpler if it is always available. 151 if (inputStream.markSupported()) { 152 in = inputStream; 153 } else { 154 in = new BufferedInputStream(inputStream); 155 } 156 157 this.decompressConcatenated = decompressConcatenated; 158 init(true); 159 } 160 161 /** 162 * Provides the stream's meta data - may change with each stream 163 * when decompressing concatenated streams. 164 * @return the stream's meta data 165 * @since 1.8 166 */ 167 public GzipParameters getMetaData() { 168 return parameters; 169 } 170 171 private boolean init(final boolean isFirstMember) throws IOException { 172 assert isFirstMember || decompressConcatenated; 173 174 // Check the magic bytes without a possibility of EOFException. 175 final int magic0 = in.read(); 176 final int magic1 = in.read(); 177 178 // If end of input was reached after decompressing at least 179 // one .gz member, we have reached the end of the file successfully. 180 if (magic0 == -1 && !isFirstMember) { 181 return false; 182 } 183 184 if (magic0 != 31 || magic1 != 139) { 185 throw new IOException(isFirstMember 186 ? "Input is not in the .gz format" 187 : "Garbage after a valid .gz stream"); 188 } 189 190 // Parsing the rest of the header may throw EOFException. 191 final DataInput inData = new DataInputStream(in); 192 final int method = inData.readUnsignedByte(); 193 if (method != Deflater.DEFLATED) { 194 throw new IOException("Unsupported compression method " 195 + method + " in the .gz header"); 196 } 197 198 final int flg = inData.readUnsignedByte(); 199 if ((flg & FRESERVED) != 0) { 200 throw new IOException( 201 "Reserved flags are set in the .gz header"); 202 } 203 204 parameters.setModificationTime(ByteUtils.fromLittleEndian(inData, 4) * 1000); 205 switch (inData.readUnsignedByte()) { // extra flags 206 case 2: 207 parameters.setCompressionLevel(Deflater.BEST_COMPRESSION); 208 break; 209 case 4: 210 parameters.setCompressionLevel(Deflater.BEST_SPEED); 211 break; 212 default: 213 // ignored for now 214 break; 215 } 216 parameters.setOperatingSystem(inData.readUnsignedByte()); 217 218 // Extra field, ignored 219 if ((flg & FEXTRA) != 0) { 220 int xlen = inData.readUnsignedByte(); 221 xlen |= inData.readUnsignedByte() << 8; 222 223 // This isn't as efficient as calling in.skip would be, 224 // but it's lazier to handle unexpected end of input this way. 225 // Most files don't have an extra field anyway. 226 while (xlen-- > 0) { 227 inData.readUnsignedByte(); 228 } 229 } 230 231 // Original file name 232 if ((flg & FNAME) != 0) { 233 parameters.setFilename(new String(readToNull(inData), 234 CharsetNames.ISO_8859_1)); 235 } 236 237 // Comment 238 if ((flg & FCOMMENT) != 0) { 239 parameters.setComment(new String(readToNull(inData), 240 CharsetNames.ISO_8859_1)); 241 } 242 243 // Header "CRC16" which is actually a truncated CRC32 (which isn't 244 // as good as real CRC16). I don't know if any encoder implementation 245 // sets this, so it's not worth trying to verify it. GNU gzip 1.4 246 // doesn't support this field, but zlib seems to be able to at least 247 // skip over it. 248 if ((flg & FHCRC) != 0) { 249 inData.readShort(); 250 } 251 252 // Reset 253 inf.reset(); 254 crc.reset(); 255 256 return true; 257 } 258 259 private static byte[] readToNull(final DataInput inData) throws IOException { 260 final ByteArrayOutputStream bos = new ByteArrayOutputStream(); 261 int b = 0; 262 while ((b = inData.readUnsignedByte()) != 0x00) { // NOPMD 263 bos.write(b); 264 } 265 return bos.toByteArray(); 266 } 267 268 @Override 269 public int read() throws IOException { 270 return read(oneByte, 0, 1) == -1 ? -1 : oneByte[0] & 0xFF; 271 } 272 273 /** 274 * {@inheritDoc} 275 * 276 * @since 1.1 277 */ 278 @Override 279 public int read(final byte[] b, int off, int len) throws IOException { 280 if (endReached) { 281 return -1; 282 } 283 284 int size = 0; 285 286 while (len > 0) { 287 if (inf.needsInput()) { 288 // Remember the current position because we may need to 289 // rewind after reading too much input. 290 in.mark(buf.length); 291 292 bufUsed = in.read(buf); 293 if (bufUsed == -1) { 294 throw new EOFException(); 295 } 296 297 inf.setInput(buf, 0, bufUsed); 298 } 299 300 int ret; 301 try { 302 ret = inf.inflate(b, off, len); 303 } catch (final DataFormatException e) { 304 throw new IOException("Gzip-compressed data is corrupt"); 305 } 306 307 crc.update(b, off, ret); 308 off += ret; 309 len -= ret; 310 size += ret; 311 count(ret); 312 313 if (inf.finished()) { 314 // We may have read too many bytes. Rewind the read 315 // position to match the actual amount used. 316 // 317 // NOTE: The "if" is there just in case. Since we used 318 // in.mark earlier, it should always skip enough. 319 in.reset(); 320 321 final int skipAmount = bufUsed - inf.getRemaining(); 322 if (in.skip(skipAmount) != skipAmount) { 323 throw new IOException(); 324 } 325 326 bufUsed = 0; 327 328 final DataInput inData = new DataInputStream(in); 329 330 // CRC32 331 final long crcStored = ByteUtils.fromLittleEndian(inData, 4); 332 333 if (crcStored != crc.getValue()) { 334 throw new IOException("Gzip-compressed data is corrupt " 335 + "(CRC32 error)"); 336 } 337 338 // Uncompressed size modulo 2^32 (ISIZE in the spec) 339 final long isize = ByteUtils.fromLittleEndian(inData, 4); 340 341 if (isize != (inf.getBytesWritten() & 0xffffffffL)) { 342 throw new IOException("Gzip-compressed data is corrupt" 343 + "(uncompressed size mismatch)"); 344 } 345 346 // See if this is the end of the file. 347 if (!decompressConcatenated || !init(false)) { 348 inf.end(); 349 inf = null; 350 endReached = true; 351 return size == 0 ? -1 : size; 352 } 353 } 354 } 355 356 return size; 357 } 358 359 /** 360 * Checks if the signature matches what is expected for a .gz file. 361 * 362 * @param signature the bytes to check 363 * @param length the number of bytes to check 364 * @return true if this is a .gz stream, false otherwise 365 * 366 * @since 1.1 367 */ 368 public static boolean matches(final byte[] signature, final int length) { 369 return length >= 2 && signature[0] == 31 && signature[1] == -117; 370 } 371 372 /** 373 * Closes the input stream (unless it is System.in). 374 * 375 * @since 1.2 376 */ 377 @Override 378 public void close() throws IOException { 379 if (inf != null) { 380 inf.end(); 381 inf = null; 382 } 383 384 if (this.in != System.in) { 385 this.in.close(); 386 } 387 } 388}