001/*
002 *  Licensed to the Apache Software Foundation (ASF) under one or more
003 *  contributor license agreements.  See the NOTICE file distributed with
004 *  this work for additional information regarding copyright ownership.
005 *  The ASF licenses this file to You under the Apache License, Version 2.0
006 *  (the "License"); you may not use this file except in compliance with
007 *  the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 *  Unless required by applicable law or agreed to in writing, software
012 *  distributed under the License is distributed on an "AS IS" BASIS,
013 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 *  See the License for the specific language governing permissions and
015 *  limitations under the License.
016 *
017 */
018
019/*
 * This package is based on the work done by Timothy Gerard Endres
 * (time@ice.com) to whom the Ant project is very grateful for his great code.
022 */
023
024package org.apache.commons.compress.archivers.tar;
025
026import java.io.ByteArrayOutputStream;
027import java.io.IOException;
028import java.io.InputStream;
029import java.util.ArrayList;
030import java.util.Collections;
031import java.util.Comparator;
032import java.util.HashMap;
033import java.util.List;
034import java.util.Map;
035
036import org.apache.commons.compress.archivers.ArchiveEntry;
037import org.apache.commons.compress.archivers.ArchiveInputStream;
038import org.apache.commons.compress.archivers.zip.ZipEncoding;
039import org.apache.commons.compress.archivers.zip.ZipEncodingHelper;
040import org.apache.commons.compress.utils.ArchiveUtils;
041import org.apache.commons.compress.utils.BoundedInputStream;
042import org.apache.commons.compress.utils.CharsetNames;
043import org.apache.commons.compress.utils.IOUtils;
044
045/**
 * The TarInputStream reads a UNIX tar archive as an InputStream.
 * Methods are provided to position at each successive entry in
 * the archive, and then read each entry as a normal input stream
 * using read().
050 * @NotThreadSafe
051 */
052public class TarArchiveInputStream extends ArchiveInputStream {
053
054    private static final int SMALL_BUFFER_SIZE = 256;
055
056    private final byte[] smallBuf = new byte[SMALL_BUFFER_SIZE];
057
058    /** The size the TAR header */
059    private final int recordSize;
060
061    /** The size of a block */
062    private final int blockSize;
063
064    /** True if file has hit EOF */
065    private boolean hasHitEOF;
066
067    /** Size of the current entry */
068    private long entrySize;
069
070    /** How far into the entry the stream is at */
071    private long entryOffset;
072
073    /** An input stream to read from */
074    private final InputStream inputStream;
075
076    /** Input streams for reading sparse entries **/
077    private List<InputStream> sparseInputStreams;
078
079    /** the index of current input stream being read when reading sparse entries */
080    private int currentSparseInputStreamIndex;
081
082    /** The meta-data about the current entry */
083    private TarArchiveEntry currEntry;
084
085    /** The encoding of the file */
086    private final ZipEncoding zipEncoding;
087
088    // the provided encoding (for unit tests)
089    final String encoding;
090
091    // the global PAX header
092    private Map<String, String> globalPaxHeaders = new HashMap<>();
093
094    // the global sparse headers, this is only used in PAX Format 0.X
095    private final List<TarArchiveStructSparse> globalSparseHeaders = new ArrayList<>();
096
097    private final boolean lenient;
098
099    /**
100     * Constructor for TarInputStream.
101     * @param is the input stream to use
102     */
103    public TarArchiveInputStream(final InputStream is) {
104        this(is, TarConstants.DEFAULT_BLKSIZE, TarConstants.DEFAULT_RCDSIZE);
105    }
106
107    /**
108     * Constructor for TarInputStream.
109     * @param is the input stream to use
110     * @param lenient when set to true illegal values for group/userid, mode, device numbers and timestamp will be
111     * ignored and the fields set to {@link TarArchiveEntry#UNKNOWN}. When set to false such illegal fields cause an
112     * exception instead.
113     * @since 1.19
114     */
115    public TarArchiveInputStream(final InputStream is, boolean lenient) {
116        this(is, TarConstants.DEFAULT_BLKSIZE, TarConstants.DEFAULT_RCDSIZE, null, lenient);
117    }
118
119    /**
120     * Constructor for TarInputStream.
121     * @param is the input stream to use
122     * @param encoding name of the encoding to use for file names
123     * @since 1.4
124     */
125    public TarArchiveInputStream(final InputStream is, final String encoding) {
126        this(is, TarConstants.DEFAULT_BLKSIZE, TarConstants.DEFAULT_RCDSIZE,
127             encoding);
128    }
129
130    /**
131     * Constructor for TarInputStream.
132     * @param is the input stream to use
133     * @param blockSize the block size to use
134     */
135    public TarArchiveInputStream(final InputStream is, final int blockSize) {
136        this(is, blockSize, TarConstants.DEFAULT_RCDSIZE);
137    }
138
139    /**
140     * Constructor for TarInputStream.
141     * @param is the input stream to use
142     * @param blockSize the block size to use
143     * @param encoding name of the encoding to use for file names
144     * @since 1.4
145     */
146    public TarArchiveInputStream(final InputStream is, final int blockSize,
147                                 final String encoding) {
148        this(is, blockSize, TarConstants.DEFAULT_RCDSIZE, encoding);
149    }
150
151    /**
152     * Constructor for TarInputStream.
153     * @param is the input stream to use
154     * @param blockSize the block size to use
155     * @param recordSize the record size to use
156     */
157    public TarArchiveInputStream(final InputStream is, final int blockSize, final int recordSize) {
158        this(is, blockSize, recordSize, null);
159    }
160
161    /**
162     * Constructor for TarInputStream.
163     * @param is the input stream to use
164     * @param blockSize the block size to use
165     * @param recordSize the record size to use
166     * @param encoding name of the encoding to use for file names
167     * @since 1.4
168     */
169    public TarArchiveInputStream(final InputStream is, final int blockSize, final int recordSize,
170                                 final String encoding) {
171        this(is, blockSize, recordSize, encoding, false);
172    }
173
174    /**
175     * Constructor for TarInputStream.
176     * @param is the input stream to use
177     * @param blockSize the block size to use
178     * @param recordSize the record size to use
179     * @param encoding name of the encoding to use for file names
180     * @param lenient when set to true illegal values for group/userid, mode, device numbers and timestamp will be
181     * ignored and the fields set to {@link TarArchiveEntry#UNKNOWN}. When set to false such illegal fields cause an
182     * exception instead.
183     * @since 1.19
184     */
185    public TarArchiveInputStream(final InputStream is, final int blockSize, final int recordSize,
186                                 final String encoding, boolean lenient) {
187        this.inputStream = is;
188        this.hasHitEOF = false;
189        this.encoding = encoding;
190        this.zipEncoding = ZipEncodingHelper.getZipEncoding(encoding);
191        this.recordSize = recordSize;
192        this.blockSize = blockSize;
193        this.lenient = lenient;
194    }
195
196    /**
197     * Closes this stream. Calls the TarBuffer's close() method.
198     * @throws IOException on error
199     */
200    @Override
201    public void close() throws IOException {
202        // Close all the input streams in sparseInputStreams
203        if(sparseInputStreams != null) {
204            for (InputStream inputStream : sparseInputStreams) {
205                inputStream.close();
206            }
207        }
208
209        inputStream.close();
210    }
211
212    /**
213     * Get the record size being used by this stream's buffer.
214     *
215     * @return The TarBuffer record size.
216     */
217    public int getRecordSize() {
218        return recordSize;
219    }
220
221    /**
222     * Get the available data that can be read from the current
223     * entry in the archive. This does not indicate how much data
224     * is left in the entire archive, only in the current entry.
225     * This value is determined from the entry's size header field
226     * and the amount of data already read from the current entry.
227     * Integer.MAX_VALUE is returned in case more than Integer.MAX_VALUE
228     * bytes are left in the current entry in the archive.
229     *
230     * @return The number of available bytes for the current entry.
231     * @throws IOException for signature
232     */
233    @Override
234    public int available() throws IOException {
235        if (isDirectory()) {
236            return 0;
237        }
238
239        if (currEntry.getRealSize() - entryOffset > Integer.MAX_VALUE) {
240            return Integer.MAX_VALUE;
241        }
242        return (int) (currEntry.getRealSize() - entryOffset);
243    }
244
245
246    /**
247     * Skips over and discards <code>n</code> bytes of data from this input
248     * stream. The <code>skip</code> method may, for a variety of reasons, end
249     * up skipping over some smaller number of bytes, possibly <code>0</code>.
250     * This may result from any of a number of conditions; reaching end of file
251     * or end of entry before <code>n</code> bytes have been skipped; are only
252     * two possibilities. The actual number of bytes skipped is returned. If
253     * <code>n</code> is negative, no bytes are skipped.
254     *
255     *
256     * @param n
257     *            the number of bytes to be skipped.
258     * @return the actual number of bytes skipped.
259     * @throws IOException
260     *                if some other I/O error occurs.
261     */
262    @Override
263    public long skip(final long n) throws IOException {
264        if (n <= 0 || isDirectory()) {
265            return 0;
266        }
267
268        final long available = currEntry.getRealSize() - entryOffset;
269        final long skipped;
270        if (!currEntry.isSparse()) {
271            skipped = IOUtils.skip(inputStream, Math.min(n, available));
272        } else {
273            skipped = skipSparse(Math.min(n, available));
274        }
275        count(skipped);
276        entryOffset += skipped;
277        return skipped;
278    }
279
280    /**
281     * Skip n bytes from current input stream, if the current input stream doesn't have enough data to skip,
282     * jump to the next input stream and skip the rest bytes, keep doing this until total n bytes are skipped
283     * or the input streams are all skipped
284     *
285     * @param n bytes of data to skip
286     * @return actual bytes of data skipped
287     * @throws IOException
288     */
289    private long skipSparse(final long n) throws IOException {
290        if (sparseInputStreams == null || sparseInputStreams.size() == 0) {
291            return inputStream.skip(n);
292        }
293
294        long bytesSkipped = 0;
295
296        while (bytesSkipped < n && currentSparseInputStreamIndex < sparseInputStreams.size()) {
297            final InputStream  currentInputStream = sparseInputStreams.get(currentSparseInputStreamIndex);
298            bytesSkipped += currentInputStream.skip(n - bytesSkipped);
299
300            if (bytesSkipped < n) {
301                currentSparseInputStreamIndex++;
302            }
303        }
304
305        return bytesSkipped;
306    }
307
308    /**
309     * Since we do not support marking just yet, we return false.
310     *
311     * @return False.
312     */
313    @Override
314    public boolean markSupported() {
315        return false;
316    }
317
318    /**
319     * Since we do not support marking just yet, we do nothing.
320     *
321     * @param markLimit The limit to mark.
322     */
323    @Override
324    public synchronized void mark(final int markLimit) {
325    }
326
327    /**
328     * Since we do not support marking just yet, we do nothing.
329     */
330    @Override
331    public synchronized void reset() {
332    }
333
334    /**
335     * Get the next entry in this tar archive. This will skip
336     * over any remaining data in the current entry, if there
337     * is one, and place the input stream at the header of the
338     * next entry, and read the header and instantiate a new
339     * TarEntry from the header bytes and return that entry.
340     * If there are no more entries in the archive, null will
341     * be returned to indicate that the end of the archive has
342     * been reached.
343     *
344     * @return The next TarEntry in the archive, or null.
345     * @throws IOException on error
346     */
347    public TarArchiveEntry getNextTarEntry() throws IOException {
348        if (isAtEOF()) {
349            return null;
350        }
351
352        if (currEntry != null) {
353            /* Skip will only go to the end of the current entry */
354            IOUtils.skip(this, Long.MAX_VALUE);
355
356            /* skip to the end of the last record */
357            skipRecordPadding();
358        }
359
360        final byte[] headerBuf = getRecord();
361
362        if (headerBuf == null) {
363            /* hit EOF */
364            currEntry = null;
365            return null;
366        }
367
368        try {
369            currEntry = new TarArchiveEntry(headerBuf, zipEncoding, lenient);
370        } catch (final IllegalArgumentException e) {
371            throw new IOException("Error detected parsing the header", e);
372        }
373
374        entryOffset = 0;
375        entrySize = currEntry.getSize();
376
377        if (currEntry.isGNULongLinkEntry()) {
378            final byte[] longLinkData = getLongNameData();
379            if (longLinkData == null) {
380                // Bugzilla: 40334
381                // Malformed tar file - long link entry name not followed by
382                // entry
383                return null;
384            }
385            currEntry.setLinkName(zipEncoding.decode(longLinkData));
386        }
387
388        if (currEntry.isGNULongNameEntry()) {
389            final byte[] longNameData = getLongNameData();
390            if (longNameData == null) {
391                // Bugzilla: 40334
392                // Malformed tar file - long entry name not followed by
393                // entry
394                return null;
395            }
396            currEntry.setName(zipEncoding.decode(longNameData));
397        }
398
399        if (currEntry.isGlobalPaxHeader()){ // Process Global Pax headers
400            readGlobalPaxHeaders();
401        }
402
403        if (currEntry.isPaxHeader()){ // Process Pax headers
404            paxHeaders();
405        } else if (!globalPaxHeaders.isEmpty()) {
406            applyPaxHeadersToCurrentEntry(globalPaxHeaders, globalSparseHeaders);
407        }
408
409        if (currEntry.isOldGNUSparse()){ // Process sparse files
410            readOldGNUSparse();
411        }
412
413        // If the size of the next element in the archive has changed
414        // due to a new size being reported in the posix header
415        // information, we update entrySize here so that it contains
416        // the correct value.
417        entrySize = currEntry.getSize();
418
419        return currEntry;
420    }
421
422    /**
423     * The last record block should be written at the full size, so skip any
424     * additional space used to fill a record after an entry
425     */
426    private void skipRecordPadding() throws IOException {
427        if (!isDirectory() && this.entrySize > 0 && this.entrySize % this.recordSize != 0) {
428            final long numRecords = (this.entrySize / this.recordSize) + 1;
429            final long padding = (numRecords * this.recordSize) - this.entrySize;
430            final long skipped = IOUtils.skip(inputStream, padding);
431            count(skipped);
432        }
433    }
434
435    /**
436     * Get the next entry in this tar archive as longname data.
437     *
438     * @return The next entry in the archive as longname data, or null.
439     * @throws IOException on error
440     */
441    protected byte[] getLongNameData() throws IOException {
442        // read in the name
443        final ByteArrayOutputStream longName = new ByteArrayOutputStream();
444        int length = 0;
445        while ((length = read(smallBuf)) >= 0) {
446            longName.write(smallBuf, 0, length);
447        }
448        getNextEntry();
449        if (currEntry == null) {
450            // Bugzilla: 40334
451            // Malformed tar file - long entry name not followed by entry
452            return null;
453        }
454        byte[] longNameData = longName.toByteArray();
455        // remove trailing null terminator(s)
456        length = longNameData.length;
457        while (length > 0 && longNameData[length - 1] == 0) {
458            --length;
459        }
460        if (length != longNameData.length) {
461            final byte[] l = new byte[length];
462            System.arraycopy(longNameData, 0, l, 0, length);
463            longNameData = l;
464        }
465        return longNameData;
466    }
467
468    /**
469     * Get the next record in this tar archive. This will skip
470     * over any remaining data in the current entry, if there
471     * is one, and place the input stream at the header of the
472     * next entry.
473     *
474     * <p>If there are no more entries in the archive, null will be
475     * returned to indicate that the end of the archive has been
476     * reached.  At the same time the {@code hasHitEOF} marker will be
477     * set to true.</p>
478     *
479     * @return The next header in the archive, or null.
480     * @throws IOException on error
481     */
482    private byte[] getRecord() throws IOException {
483        byte[] headerBuf = readRecord();
484        setAtEOF(isEOFRecord(headerBuf));
485        if (isAtEOF() && headerBuf != null) {
486            tryToConsumeSecondEOFRecord();
487            consumeRemainderOfLastBlock();
488            headerBuf = null;
489        }
490        return headerBuf;
491    }
492
493    /**
494     * Determine if an archive record indicate End of Archive. End of
495     * archive is indicated by a record that consists entirely of null bytes.
496     *
497     * @param record The record data to check.
498     * @return true if the record data is an End of Archive
499     */
500    protected boolean isEOFRecord(final byte[] record) {
501        return record == null || ArchiveUtils.isArrayZero(record, recordSize);
502    }
503
504    /**
505     * Read a record from the input stream and return the data.
506     *
507     * @return The record data or null if EOF has been hit.
508     * @throws IOException on error
509     */
510    protected byte[] readRecord() throws IOException {
511
512        final byte[] record = new byte[recordSize];
513
514        final int readNow = IOUtils.readFully(inputStream, record);
515        count(readNow);
516        if (readNow != recordSize) {
517            return null;
518        }
519
520        return record;
521    }
522
523    private void readGlobalPaxHeaders() throws IOException {
524        globalPaxHeaders = parsePaxHeaders(this, globalSparseHeaders);
525        getNextEntry(); // Get the actual file entry
526    }
527
528    /**
529     * For PAX Format 0.0, the sparse headers(GNU.sparse.offset and GNU.sparse.numbytes)
530     * may appear multi times, and they look like:
531     *
532     * GNU.sparse.size=size
533     * GNU.sparse.numblocks=numblocks
534     * repeat numblocks times
535     *   GNU.sparse.offset=offset
536     *   GNU.sparse.numbytes=numbytes
537     * end repeat
538     *
539     *
540     * For PAX Format 0.1, the sparse headers are stored in a single variable : GNU.sparse.map
541     *
542     * GNU.sparse.map
543     *    Map of non-null data chunks. It is a string consisting of comma-separated values "offset,size[,offset-1,size-1...]"
544     *
545     *
546     * For PAX Format 1.X:
547     * The sparse map itself is stored in the file data block, preceding the actual file data.
548     * It consists of a series of decimal numbers delimited by newlines. The map is padded with nulls to the nearest block boundary.
549     * The first number gives the number of entries in the map. Following are map entries, each one consisting of two numbers
550     * giving the offset and size of the data block it describes.
551     * @throws IOException
552     */
553    private void paxHeaders() throws IOException{
554        List<TarArchiveStructSparse> sparseHeaders = new ArrayList<>();
555        final Map<String, String> headers = parsePaxHeaders(this, sparseHeaders);
556
557        // for 0.1 PAX Headers
558        if (headers.containsKey("GNU.sparse.map")) {
559            sparseHeaders = parsePAX01SparseHeaders(headers.get("GNU.sparse.map"));
560        }
561        getNextEntry(); // Get the actual file entry
562        applyPaxHeadersToCurrentEntry(headers, sparseHeaders);
563
564        // for 1.0 PAX Format, the sparse map is stored in the file data block
565        if (currEntry.isPaxGNU1XSparse()) {
566            sparseHeaders = parsePAX1XSparseHeaders();
567            currEntry.setSparseHeaders(sparseHeaders);
568        }
569
570        // sparse headers are all done reading, we need to build
571        // sparse input streams using these sparse headers
572        buildSparseInputStreams();
573    }
574
575    /**
576     * For PAX Format 0.1, the sparse headers are stored in a single variable : GNU.sparse.map
577     * GNU.sparse.map
578     *    Map of non-null data chunks. It is a string consisting of comma-separated values "offset,size[,offset-1,size-1...]"
579     *
580     * @param sparseMap the sparse map string consisting of comma-separated values "offset,size[,offset-1,size-1...]"
581     * @return sparse headers parsed from sparse map
582     * @throws IOException
583     */
584    private List<TarArchiveStructSparse> parsePAX01SparseHeaders(String sparseMap) throws IOException {
585        List<TarArchiveStructSparse> sparseHeaders = new ArrayList<>();
586        String[] sparseHeaderStrings = sparseMap.split(",");
587
588        for (int i = 0; i < sparseHeaderStrings.length;i += 2) {
589            long sparseOffset = Long.parseLong(sparseHeaderStrings[i]);
590            long sparseNumbytes = Long.parseLong(sparseHeaderStrings[i + 1]);
591            sparseHeaders.add(new TarArchiveStructSparse(sparseOffset, sparseNumbytes));
592        }
593
594        return sparseHeaders;
595    }
596
597    /**
598     * For PAX Format 1.X:
599     * The sparse map itself is stored in the file data block, preceding the actual file data.
600     * It consists of a series of decimal numbers delimited by newlines. The map is padded with nulls to the nearest block boundary.
601     * The first number gives the number of entries in the map. Following are map entries, each one consisting of two numbers
602     * giving the offset and size of the data block it describes.
603     * @return sparse headers
604     * @throws IOException
605     */
606    private List<TarArchiveStructSparse> parsePAX1XSparseHeaders() throws IOException {
607        // for 1.X PAX Headers
608        List<TarArchiveStructSparse> sparseHeaders = new ArrayList<>();
609        long bytesRead = 0;
610
611        long[] readResult = readLineOfNumberForPax1X(inputStream);
612        long sparseHeadersCount = readResult[0];
613        bytesRead += readResult[1];
614        while (sparseHeadersCount-- > 0) {
615            readResult = readLineOfNumberForPax1X(inputStream);
616            long sparseOffset = readResult[0];
617            bytesRead += readResult[1];
618
619            readResult = readLineOfNumberForPax1X(inputStream);
620            long sparseNumbytes = readResult[0];
621            bytesRead += readResult[1];
622            sparseHeaders.add(new TarArchiveStructSparse(sparseOffset, sparseNumbytes));
623        }
624
625        // skip the rest of this record data
626        long bytesToSkip = recordSize - bytesRead % recordSize;
627        IOUtils.skip(inputStream, bytesToSkip);
628        return sparseHeaders;
629    }
630
631    /**
632     * For 1.X PAX Format, the sparse headers are stored in the file data block, preceding the actual file data.
633     * It consists of a series of decimal numbers delimited by newlines.
634     *
635     * @param inputStream the input stream of the tar file
636     * @return the decimal number delimited by '\n', and the bytes read from input stream
637     * @throws IOException
638     */
639    private long[] readLineOfNumberForPax1X(InputStream inputStream) throws IOException {
640        int number;
641        long result = 0;
642        long bytesRead = 0;
643
644        while((number = inputStream.read()) != '\n') {
645            bytesRead += 1;
646            if(number == -1) {
647                throw new IOException("Unexpected EOF when reading parse information of 1.X PAX format");
648            }
649            result = result * 10 + (number - '0');
650        }
651        bytesRead += 1;
652
653        return new long[] {result, bytesRead};
654    }
655
656    /**
657     * For PAX Format 0.0, the sparse headers(GNU.sparse.offset and GNU.sparse.numbytes)
658     * may appear multi times, and they look like:
659     *
660     * GNU.sparse.size=size
661     * GNU.sparse.numblocks=numblocks
662     * repeat numblocks times
663     *   GNU.sparse.offset=offset
664     *   GNU.sparse.numbytes=numbytes
665     * end repeat
666     *
667     * For PAX Format 0.1, the sparse headers are stored in a single variable : GNU.sparse.map
668     *
669     * GNU.sparse.map
670     *    Map of non-null data chunks. It is a string consisting of comma-separated values "offset,size[,offset-1,size-1...]"
671     *
672     * @param inputstream inputstream to read keys and values
673     * @param sparseHeaders used in PAX Format 0.0 &amp; 0.1, as it may appear multi times,
674     *                      the sparse headers need to be stored in an array, not a map
675     * @return map of PAX headers values found inside of the current (local or global) PAX headers tar entry.
676     * @throws IOException
677     */
678    Map<String, String> parsePaxHeaders(final InputStream inputStream, List<TarArchiveStructSparse> sparseHeaders)
679        throws IOException {
680        final Map<String, String> headers = new HashMap<>(globalPaxHeaders);
681        Long offset = null;
682        // Format is "length keyword=value\n";
683        while(true) { // get length
684            int ch;
685            int len = 0;
686            int read = 0;
687            while((ch = inputStream.read()) != -1) {
688                read++;
689                if (ch == '\n') { // blank line in header
690                    break;
691                } else if (ch == ' '){ // End of length string
692                    // Get keyword
693                    final ByteArrayOutputStream coll = new ByteArrayOutputStream();
694                    while((ch = inputStream.read()) != -1) {
695                        read++;
696                        if (ch == '='){ // end of keyword
697                            final String keyword = coll.toString(CharsetNames.UTF_8);
698                            // Get rest of entry
699                            final int restLen = len - read;
700                            if (restLen == 1) { // only NL
701                                headers.remove(keyword);
702                            } else {
703                                final byte[] rest = new byte[restLen];
704                                final int got = IOUtils.readFully(inputStream, rest);
705                                if (got != restLen) {
706                                    throw new IOException("Failed to read "
707                                                          + "Paxheader. Expected "
708                                                          + restLen
709                                                          + " bytes, read "
710                                                          + got);
711                                }
712                                // Drop trailing NL
713                                final String value = new String(rest, 0,
714                                                          restLen - 1, CharsetNames.UTF_8);
715                                headers.put(keyword, value);
716
717                                // for 0.0 PAX Headers
718                                if (keyword.equals("GNU.sparse.offset")) {
719                                    if (offset != null) {
720                                        // previous GNU.sparse.offset header but but no numBytes
721                                        sparseHeaders.add(new TarArchiveStructSparse(offset, 0));
722                                    }
723                                    offset = Long.valueOf(value);
724                                }
725
726                                // for 0.0 PAX Headers
727                                if (keyword.equals("GNU.sparse.numbytes")) {
728                                    if (offset == null) {
729                                        throw new IOException("Failed to read Paxheader." +
730                                                "GNU.sparse.offset is expected before GNU.sparse.numbytes shows up.");
731                                    }
732                                    sparseHeaders.add(new TarArchiveStructSparse(offset, Long.parseLong(value)));
733                                    offset = null;
734                                }
735                            }
736                            break;
737                        }
738                        coll.write((byte) ch);
739                    }
740                    break; // Processed single header
741                }
742                len *= 10;
743                len += ch - '0';
744            }
745            if (ch == -1){ // EOF
746                break;
747            }
748        }
749        if (offset != null) {
750            // offset but no numBytes
751            sparseHeaders.add(new TarArchiveStructSparse(offset, 0));
752        }
753        return headers;
754    }
755
756    private void applyPaxHeadersToCurrentEntry(final Map<String, String> headers, final List<TarArchiveStructSparse> sparseHeaders) {
757        currEntry.updateEntryFromPaxHeaders(headers);
758        currEntry.setSparseHeaders(sparseHeaders);
759    }
760
761    /**
762     * Adds the sparse chunks from the current entry to the sparse chunks,
763     * including any additional sparse entries following the current entry.
764     *
765     * @throws IOException on error
766     */
767    private void readOldGNUSparse() throws IOException {
768        if (currEntry.isExtended()) {
769            TarArchiveSparseEntry entry;
770            do {
771                final byte[] headerBuf = getRecord();
772                if (headerBuf == null) {
773                    currEntry = null;
774                    break;
775                }
776                entry = new TarArchiveSparseEntry(headerBuf);
777                currEntry.getSparseHeaders().addAll(entry.getSparseHeaders());
778            } while (entry.isExtended());
779        }
780
781        // sparse headers are all done reading, we need to build
782        // sparse input streams using these sparse headers
783        buildSparseInputStreams();
784    }
785
786    private boolean isDirectory() {
787        return currEntry != null && currEntry.isDirectory();
788    }
789
    /**
     * Returns the next Archive Entry in this Stream.
     *
     * <p>This implementation simply delegates to {@code getNextTarEntry()}.</p>
     *
     * @return the next entry,
     *         or {@code null} if there are no more entries
     * @throws IOException if the next entry could not be read
     */
    @Override
    public ArchiveEntry getNextEntry() throws IOException {
        return getNextTarEntry();
    }
801
802    /**
803     * Tries to read the next record rewinding the stream if it is not a EOF record.
804     *
805     * <p>This is meant to protect against cases where a tar
806     * implementation has written only one EOF record when two are
807     * expected.  Actually this won't help since a non-conforming
808     * implementation likely won't fill full blocks consisting of - by
809     * default - ten records either so we probably have already read
810     * beyond the archive anyway.</p>
811     */
812    private void tryToConsumeSecondEOFRecord() throws IOException {
813        boolean shouldReset = true;
814        final boolean marked = inputStream.markSupported();
815        if (marked) {
816            inputStream.mark(recordSize);
817        }
818        try {
819            shouldReset = !isEOFRecord(readRecord());
820        } finally {
821            if (shouldReset && marked) {
822                pushedBackBytes(recordSize);
823                inputStream.reset();
824            }
825        }
826    }
827
828    /**
829     * Reads bytes from the current tar archive entry.
830     *
831     * This method is aware of the boundaries of the current
832     * entry in the archive and will deal with them as if they
833     * were this stream's start and EOF.
834     *
835     * @param buf The buffer into which to place bytes read.
836     * @param offset The offset at which to place bytes read.
837     * @param numToRead The number of bytes to read.
838     * @return The number of bytes read, or -1 at EOF.
839     * @throws IOException on error
840     */
841    @Override
842    public int read(final byte[] buf, final int offset, int numToRead) throws IOException {
843        if (numToRead == 0) {
844            return 0;
845        }
846        int totalRead = 0;
847
848        if (isAtEOF() || isDirectory()) {
849            return -1;
850        }
851
852        if (currEntry == null) {
853            throw new IllegalStateException("No current tar entry");
854        }
855
856        if (!currEntry.isSparse()) {
857            if (entryOffset >= entrySize) {
858                return -1;
859            }
860        } else {
861            // for sparse entries, there are actually currEntry.getRealSize() bytes to read
862            if (entryOffset >= currEntry.getRealSize()) {
863                return -1;
864            }
865        }
866
867        numToRead = Math.min(numToRead, available());
868
869        if (currEntry.isSparse()) {
870            // for sparse entries, we need to read them in another way
871            totalRead = readSparse(buf, offset, numToRead);
872        } else {
873            totalRead = inputStream.read(buf, offset, numToRead);
874        }
875
876        if (totalRead == -1) {
877            if (numToRead > 0) {
878                throw new IOException("Truncated TAR archive");
879            }
880            setAtEOF(true);
881        } else {
882            count(totalRead);
883            entryOffset += totalRead;
884        }
885
886        return totalRead;
887    }
888
889    /**
890     * For sparse tar entries, there are many "holes"(consisting of all 0) in the file. Only the non-zero data is
891     * stored in tar files, and they are stored separately. The structure of non-zero data is introduced by the
892     * sparse headers using the offset, where a block of non-zero data starts, and numbytes, the length of the
893     * non-zero data block.
894     * When reading sparse entries, the actual data is read out with "holes" and non-zero data combined together
895     * according to the sparse headers.
896     *
897     * @param buf The buffer into which to place bytes read.
898     * @param offset The offset at which to place bytes read.
899     * @param numToRead The number of bytes to read.
900     * @return The number of bytes read, or -1 at EOF.
901     * @throws IOException on error
902     */
903    private int readSparse(final byte[] buf, final int offset, int numToRead) throws IOException {
904        // if there are no actual input streams, just read from the original input stream
905        if (sparseInputStreams == null || sparseInputStreams.size() == 0) {
906            return inputStream.read(buf, offset, numToRead);
907        }
908
909        if(currentSparseInputStreamIndex >= sparseInputStreams.size()) {
910            return -1;
911        }
912
913        InputStream currentInputStream = sparseInputStreams.get(currentSparseInputStreamIndex);
914        int readLen = currentInputStream.read(buf, offset, numToRead);
915
916        // if the current input stream is the last input stream,
917        // just return the number of bytes read from current input stream
918        if (currentSparseInputStreamIndex == sparseInputStreams.size() - 1) {
919            return readLen;
920        }
921
922        // if EOF of current input stream is meet, open a new input stream and recursively call read
923        if (readLen == -1) {
924            currentSparseInputStreamIndex++;
925            return readSparse(buf, offset, numToRead);
926        }
927
928        // if the rest data of current input stream is not long enough, open a new input stream
929        // and recursively call read
930        if (readLen < numToRead) {
931            currentSparseInputStreamIndex++;
932            int readLenOfNext = readSparse(buf, offset + readLen, numToRead - readLen);
933            if (readLenOfNext == -1) {
934                return readLen;
935            }
936
937            return readLen + readLenOfNext;
938        }
939
940        // if the rest data of current input stream is enough(which means readLen == len), just return readLen
941        return readLen;
942    }
943
944    /**
945     * Whether this class is able to read the given entry.
946     *
947     * <p>May return false if the current entry is a sparse file.</p>
948     */
949    @Override
950    public boolean canReadEntryData(final ArchiveEntry ae) {
951        if (ae instanceof TarArchiveEntry) {
952            final TarArchiveEntry te = (TarArchiveEntry) ae;
953            return !te.isSparse();
954        }
955        return false;
956    }
957
    /**
     * Gets the current TAR Archive Entry that this input stream is processing.
     *
     * @return the current Archive Entry; may be {@code null} when no entry is current
     */
    public TarArchiveEntry getCurrentEntry() {
        return currEntry;
    }
966
    /**
     * Replaces the entry this stream treats as its current entry.
     *
     * @param e the entry to store as the current entry
     */
    protected final void setCurrentEntry(final TarArchiveEntry e) {
        currEntry = e;
    }
970
    /**
     * Reports whether this stream has been flagged as having reached EOF.
     *
     * @return the current value of the EOF flag
     */
    protected final boolean isAtEOF() {
        return hasHitEOF;
    }
974
    /**
     * Sets the flag that records whether this stream has reached EOF.
     *
     * @param b the new value of the EOF flag
     */
    protected final void setAtEOF(final boolean b) {
        hasHitEOF = b;
    }
978
979    /**
980     * This method is invoked once the end of the archive is hit, it
981     * tries to consume the remaining bytes under the assumption that
982     * the tool creating this archive has padded the last block.
983     */
984    private void consumeRemainderOfLastBlock() throws IOException {
985        final long bytesReadOfLastBlock = getBytesRead() % blockSize;
986        if (bytesReadOfLastBlock > 0) {
987            final long skipped = IOUtils.skip(inputStream, blockSize - bytesReadOfLastBlock);
988            count(skipped);
989        }
990    }
991
992    /**
993     * Checks if the signature matches what is expected for a tar file.
994     *
995     * @param signature
996     *            the bytes to check
997     * @param length
998     *            the number of bytes to check
999     * @return true, if this stream is a tar archive stream, false otherwise
1000     */
1001    public static boolean matches(final byte[] signature, final int length) {
1002        if (length < TarConstants.VERSION_OFFSET+TarConstants.VERSIONLEN) {
1003            return false;
1004        }
1005
1006        if (ArchiveUtils.matchAsciiBuffer(TarConstants.MAGIC_POSIX,
1007                signature, TarConstants.MAGIC_OFFSET, TarConstants.MAGICLEN)
1008            &&
1009            ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_POSIX,
1010                signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN)
1011                ){
1012            return true;
1013        }
1014        if (ArchiveUtils.matchAsciiBuffer(TarConstants.MAGIC_GNU,
1015                signature, TarConstants.MAGIC_OFFSET, TarConstants.MAGICLEN)
1016            &&
1017            (
1018             ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_GNU_SPACE,
1019                signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN)
1020            ||
1021            ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_GNU_ZERO,
1022                signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN)
1023            )
1024                ){
1025            return true;
1026        }
1027        // COMPRESS-107 - recognise Ant tar files
1028        return ArchiveUtils.matchAsciiBuffer(TarConstants.MAGIC_ANT,
1029                signature, TarConstants.MAGIC_OFFSET, TarConstants.MAGICLEN)
1030                &&
1031                ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_ANT,
1032                        signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN);
1033    }
1034
1035    /**
1036     * Build the input streams consisting of all-zero input streams and non-zero input streams.
1037     * When reading from the non-zero input streams, the data is actually read from the original input stream.
1038     * The size of each input stream is introduced by the sparse headers.
1039     *
1040     * NOTE : Some all-zero input streams and non-zero input streams have the size of 0. We DO NOT store the
1041     *        0 size input streams because they are meaningless.
1042     */
1043    private void buildSparseInputStreams() throws IOException {
1044        currentSparseInputStreamIndex = -1;
1045        sparseInputStreams = new ArrayList<>();
1046
1047        final List<TarArchiveStructSparse> sparseHeaders = currEntry.getSparseHeaders();
1048        // sort the sparse headers in case they are written in wrong order
1049        if (sparseHeaders != null && sparseHeaders.size() > 1) {
1050            final Comparator<TarArchiveStructSparse> sparseHeaderComparator = new Comparator<TarArchiveStructSparse>() {
1051                @Override
1052                public int compare(final TarArchiveStructSparse p, final TarArchiveStructSparse q) {
1053                    Long pOffset = p.getOffset();
1054                    Long qOffset = q.getOffset();
1055                    return pOffset.compareTo(qOffset);
1056                }
1057            };
1058            Collections.sort(sparseHeaders, sparseHeaderComparator);
1059        }
1060
1061        if (sparseHeaders != null) {
1062            // Stream doesn't need to be closed at all as it doesn't use any resources
1063            final InputStream zeroInputStream = new TarArchiveSparseZeroInputStream(); //NOSONAR
1064            long offset = 0;
1065            for (TarArchiveStructSparse sparseHeader : sparseHeaders) {
1066                if (sparseHeader.getOffset() == 0 && sparseHeader.getNumbytes() == 0) {
1067                    break;
1068                }
1069
1070                if ((sparseHeader.getOffset() - offset) < 0) {
1071                    throw new IOException("Corrupted struct sparse detected");
1072                }
1073
1074                // only store the input streams with non-zero size
1075                if ((sparseHeader.getOffset() - offset) > 0) {
1076                    sparseInputStreams.add(new BoundedInputStream(zeroInputStream, sparseHeader.getOffset() - offset));
1077                }
1078
1079                // only store the input streams with non-zero size
1080                if (sparseHeader.getNumbytes() > 0) {
1081                    sparseInputStreams.add(new BoundedInputStream(inputStream, sparseHeader.getNumbytes()));
1082                }
1083
1084                offset = sparseHeader.getOffset() + sparseHeader.getNumbytes();
1085            }
1086        }
1087
1088        if (sparseInputStreams.size() > 0) {
1089            currentSparseInputStreamIndex = 0;
1090        }
1091    }
1092
1093    /**
1094     * This is an inputstream that always return 0,
1095     * this is used when reading the "holes" of a sparse file
1096     */
1097    private static class TarArchiveSparseZeroInputStream extends InputStream {
1098        /**
1099         * Just return 0
1100         * @return
1101         * @throws IOException
1102         */
1103        @Override
1104        public int read() throws IOException {
1105            return 0;
1106        }
1107
1108        /**
1109         * these's nothing need to do when skipping
1110         *
1111         * @param n bytes to skip
1112         * @return bytes actually skipped
1113         */
1114        @Override
1115        public long skip(final long n) {
1116            return n;
1117        }
1118    }
1119}