001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 * 017 */ 018 019/* 020 * This package is based on the work done by Timothy Gerard Endres 021 * ([email protected]) to whom the Ant project is very grateful for his great code. 022 */ 023 024package org.apache.commons.compress.archivers.tar; 025 026import java.io.ByteArrayOutputStream; 027import java.io.IOException; 028import java.io.InputStream; 029import java.util.ArrayList; 030import java.util.Collections; 031import java.util.Comparator; 032import java.util.HashMap; 033import java.util.List; 034import java.util.Map; 035 036import org.apache.commons.compress.archivers.ArchiveEntry; 037import org.apache.commons.compress.archivers.ArchiveInputStream; 038import org.apache.commons.compress.archivers.zip.ZipEncoding; 039import org.apache.commons.compress.archivers.zip.ZipEncodingHelper; 040import org.apache.commons.compress.utils.ArchiveUtils; 041import org.apache.commons.compress.utils.BoundedInputStream; 042import org.apache.commons.compress.utils.CharsetNames; 043import org.apache.commons.compress.utils.IOUtils; 044 045/** 046 * The TarInputStream reads a UNIX tar archive as an InputStream. 
047 * methods are provided to position at each successive entry in 048 * the archive, and the read each entry as a normal input stream 049 * using read(). 050 * @NotThreadSafe 051 */ 052public class TarArchiveInputStream extends ArchiveInputStream { 053 054 private static final int SMALL_BUFFER_SIZE = 256; 055 056 private final byte[] smallBuf = new byte[SMALL_BUFFER_SIZE]; 057 058 /** The size the TAR header */ 059 private final int recordSize; 060 061 /** The size of a block */ 062 private final int blockSize; 063 064 /** True if file has hit EOF */ 065 private boolean hasHitEOF; 066 067 /** Size of the current entry */ 068 private long entrySize; 069 070 /** How far into the entry the stream is at */ 071 private long entryOffset; 072 073 /** An input stream to read from */ 074 private final InputStream inputStream; 075 076 /** Input streams for reading sparse entries **/ 077 private List<InputStream> sparseInputStreams; 078 079 /** the index of current input stream being read when reading sparse entries */ 080 private int currentSparseInputStreamIndex; 081 082 /** The meta-data about the current entry */ 083 private TarArchiveEntry currEntry; 084 085 /** The encoding of the file */ 086 private final ZipEncoding zipEncoding; 087 088 // the provided encoding (for unit tests) 089 final String encoding; 090 091 // the global PAX header 092 private Map<String, String> globalPaxHeaders = new HashMap<>(); 093 094 // the global sparse headers, this is only used in PAX Format 0.X 095 private final List<TarArchiveStructSparse> globalSparseHeaders = new ArrayList<>(); 096 097 private final boolean lenient; 098 099 /** 100 * Constructor for TarInputStream. 101 * @param is the input stream to use 102 */ 103 public TarArchiveInputStream(final InputStream is) { 104 this(is, TarConstants.DEFAULT_BLKSIZE, TarConstants.DEFAULT_RCDSIZE); 105 } 106 107 /** 108 * Constructor for TarInputStream. 
109 * @param is the input stream to use 110 * @param lenient when set to true illegal values for group/userid, mode, device numbers and timestamp will be 111 * ignored and the fields set to {@link TarArchiveEntry#UNKNOWN}. When set to false such illegal fields cause an 112 * exception instead. 113 * @since 1.19 114 */ 115 public TarArchiveInputStream(final InputStream is, boolean lenient) { 116 this(is, TarConstants.DEFAULT_BLKSIZE, TarConstants.DEFAULT_RCDSIZE, null, lenient); 117 } 118 119 /** 120 * Constructor for TarInputStream. 121 * @param is the input stream to use 122 * @param encoding name of the encoding to use for file names 123 * @since 1.4 124 */ 125 public TarArchiveInputStream(final InputStream is, final String encoding) { 126 this(is, TarConstants.DEFAULT_BLKSIZE, TarConstants.DEFAULT_RCDSIZE, 127 encoding); 128 } 129 130 /** 131 * Constructor for TarInputStream. 132 * @param is the input stream to use 133 * @param blockSize the block size to use 134 */ 135 public TarArchiveInputStream(final InputStream is, final int blockSize) { 136 this(is, blockSize, TarConstants.DEFAULT_RCDSIZE); 137 } 138 139 /** 140 * Constructor for TarInputStream. 141 * @param is the input stream to use 142 * @param blockSize the block size to use 143 * @param encoding name of the encoding to use for file names 144 * @since 1.4 145 */ 146 public TarArchiveInputStream(final InputStream is, final int blockSize, 147 final String encoding) { 148 this(is, blockSize, TarConstants.DEFAULT_RCDSIZE, encoding); 149 } 150 151 /** 152 * Constructor for TarInputStream. 153 * @param is the input stream to use 154 * @param blockSize the block size to use 155 * @param recordSize the record size to use 156 */ 157 public TarArchiveInputStream(final InputStream is, final int blockSize, final int recordSize) { 158 this(is, blockSize, recordSize, null); 159 } 160 161 /** 162 * Constructor for TarInputStream. 
163 * @param is the input stream to use 164 * @param blockSize the block size to use 165 * @param recordSize the record size to use 166 * @param encoding name of the encoding to use for file names 167 * @since 1.4 168 */ 169 public TarArchiveInputStream(final InputStream is, final int blockSize, final int recordSize, 170 final String encoding) { 171 this(is, blockSize, recordSize, encoding, false); 172 } 173 174 /** 175 * Constructor for TarInputStream. 176 * @param is the input stream to use 177 * @param blockSize the block size to use 178 * @param recordSize the record size to use 179 * @param encoding name of the encoding to use for file names 180 * @param lenient when set to true illegal values for group/userid, mode, device numbers and timestamp will be 181 * ignored and the fields set to {@link TarArchiveEntry#UNKNOWN}. When set to false such illegal fields cause an 182 * exception instead. 183 * @since 1.19 184 */ 185 public TarArchiveInputStream(final InputStream is, final int blockSize, final int recordSize, 186 final String encoding, boolean lenient) { 187 this.inputStream = is; 188 this.hasHitEOF = false; 189 this.encoding = encoding; 190 this.zipEncoding = ZipEncodingHelper.getZipEncoding(encoding); 191 this.recordSize = recordSize; 192 this.blockSize = blockSize; 193 this.lenient = lenient; 194 } 195 196 /** 197 * Closes this stream. Calls the TarBuffer's close() method. 198 * @throws IOException on error 199 */ 200 @Override 201 public void close() throws IOException { 202 // Close all the input streams in sparseInputStreams 203 if(sparseInputStreams != null) { 204 for (InputStream inputStream : sparseInputStreams) { 205 inputStream.close(); 206 } 207 } 208 209 inputStream.close(); 210 } 211 212 /** 213 * Get the record size being used by this stream's buffer. 214 * 215 * @return The TarBuffer record size. 
     */
    public int getRecordSize() {
        return recordSize;
    }

    /**
     * Get the available data that can be read from the current
     * entry in the archive. This does not indicate how much data
     * is left in the entire archive, only in the current entry.
     * This value is determined from the entry's size header field
     * and the amount of data already read from the current entry.
     * Integer.MAX_VALUE is returned in case more than Integer.MAX_VALUE
     * bytes are left in the current entry in the archive.
     *
     * @return The number of available bytes for the current entry.
     * @throws IOException for signature
     */
    @Override
    public int available() throws IOException {
        if (isDirectory()) {
            return 0;
        }

        // NOTE(review): assumes a current entry exists; calling this before
        // getNextTarEntry() has positioned on an entry would NPE on
        // currEntry -- confirm callers always position first.
        if (currEntry.getRealSize() - entryOffset > Integer.MAX_VALUE) {
            return Integer.MAX_VALUE;
        }
        return (int) (currEntry.getRealSize() - entryOffset);
    }


    /**
     * Skips over and discards <code>n</code> bytes of data from this input
     * stream. The <code>skip</code> method may, for a variety of reasons, end
     * up skipping over some smaller number of bytes, possibly <code>0</code>.
     * This may result from any of a number of conditions; reaching end of file
     * or end of entry before <code>n</code> bytes have been skipped; are only
     * two possibilities. The actual number of bytes skipped is returned. If
     * <code>n</code> is negative, no bytes are skipped.
     *
     *
     * @param n
     *            the number of bytes to be skipped.
     * @return the actual number of bytes skipped.
     * @throws IOException
     *             if some other I/O error occurs.
     */
    @Override
    public long skip(final long n) throws IOException {
        if (n <= 0 || isDirectory()) {
            return 0;
        }

        // never skip past the end of the current entry; getRealSize() is the
        // expanded size for sparse entries, the header size otherwise
        final long available = currEntry.getRealSize() - entryOffset;
        final long skipped;
        if (!currEntry.isSparse()) {
            skipped = IOUtils.skip(inputStream, Math.min(n, available));
        } else {
            skipped = skipSparse(Math.min(n, available));
        }
        // keep the archive byte counter and the entry offset in sync
        count(skipped);
        entryOffset += skipped;
        return skipped;
    }

    /**
     * Skip n bytes from current input stream, if the current input stream doesn't have enough data to skip,
     * jump to the next input stream and skip the rest bytes, keep doing this until total n bytes are skipped
     * or the input streams are all skipped
     *
     * @param n bytes of data to skip
     * @return actual bytes of data skipped
     * @throws IOException on error
     */
    private long skipSparse(final long n) throws IOException {
        // no per-hole streams were built for this entry: fall back to the raw stream
        if (sparseInputStreams == null || sparseInputStreams.size() == 0) {
            return inputStream.skip(n);
        }

        long bytesSkipped = 0;

        while (bytesSkipped < n && currentSparseInputStreamIndex < sparseInputStreams.size()) {
            final InputStream currentInputStream = sparseInputStreams.get(currentSparseInputStreamIndex);
            bytesSkipped += currentInputStream.skip(n - bytesSkipped);

            // a short skip is treated as "this stream is exhausted":
            // advance to the next sparse region's stream
            if (bytesSkipped < n) {
                currentSparseInputStreamIndex++;
            }
        }

        return bytesSkipped;
    }

    /**
     * Since we do not support marking just yet, we return false.
     *
     * @return False.
     */
    @Override
    public boolean markSupported() {
        return false;
    }

    /**
     * Since we do not support marking just yet, we do nothing.
     *
     * @param markLimit The limit to mark.
     */
    @Override
    public synchronized void mark(final int markLimit) {
    }

    /**
     * Since we do not support marking just yet, we do nothing.
     */
    @Override
    public synchronized void reset() {
    }

    /**
     * Get the next entry in this tar archive. This will skip
     * over any remaining data in the current entry, if there
     * is one, and place the input stream at the header of the
     * next entry, and read the header and instantiate a new
     * TarEntry from the header bytes and return that entry.
     * If there are no more entries in the archive, null will
     * be returned to indicate that the end of the archive has
     * been reached.
     *
     * @return The next TarEntry in the archive, or null.
     * @throws IOException on error
     */
    public TarArchiveEntry getNextTarEntry() throws IOException {
        if (isAtEOF()) {
            return null;
        }

        if (currEntry != null) {
            /* Skip will only go to the end of the current entry */
            IOUtils.skip(this, Long.MAX_VALUE);

            /* skip to the end of the last record */
            skipRecordPadding();
        }

        final byte[] headerBuf = getRecord();

        if (headerBuf == null) {
            /* hit EOF */
            currEntry = null;
            return null;
        }

        try {
            currEntry = new TarArchiveEntry(headerBuf, zipEncoding, lenient);
        } catch (final IllegalArgumentException e) {
            // surface malformed headers as an I/O problem, preserving the cause
            throw new IOException("Error detected parsing the header", e);
        }

        entryOffset = 0;
        entrySize = currEntry.getSize();

        // GNU long-link / long-name entries are meta entries whose *data* holds
        // the real link target / file name of the entry that follows them
        if (currEntry.isGNULongLinkEntry()) {
            final byte[] longLinkData = getLongNameData();
            if (longLinkData == null) {
                // Bugzilla: 40334
                // Malformed tar file - long link entry name not followed by
                // entry
                return null;
            }
            currEntry.setLinkName(zipEncoding.decode(longLinkData));
        }

        if (currEntry.isGNULongNameEntry()) {
            final byte[] longNameData = getLongNameData();
            if (longNameData == null) {
                // Bugzilla: 40334
                // Malformed tar file - long entry name not followed by
                // entry
                return null;
            }
            currEntry.setName(zipEncoding.decode(longNameData));
        }

        if (currEntry.isGlobalPaxHeader()){ // Process Global Pax headers
            readGlobalPaxHeaders();
        }

        if (currEntry.isPaxHeader()){ // Process Pax headers
            paxHeaders();
        } else if (!globalPaxHeaders.isEmpty()) {
            // no local PAX header, but global headers still apply to this entry
            applyPaxHeadersToCurrentEntry(globalPaxHeaders, globalSparseHeaders);
        }

        if (currEntry.isOldGNUSparse()){ // Process sparse files
            readOldGNUSparse();
        }

        // If the size of the next element in the archive has changed
        // due to a new size being reported in the posix header
        // information, we update entrySize here so that it contains
        // the correct value.
        entrySize = currEntry.getSize();

        return currEntry;
    }

    /**
     * The last record block should be written at the full size, so skip any
     * additional space used to fill a record after an entry.
     *
     * @throws IOException on error
     */
    private void skipRecordPadding() throws IOException {
        // only needed when the entry's data does not end on a record boundary
        if (!isDirectory() && this.entrySize > 0 && this.entrySize % this.recordSize != 0) {
            final long numRecords = (this.entrySize / this.recordSize) + 1;
            final long padding = (numRecords * this.recordSize) - this.entrySize;
            final long skipped = IOUtils.skip(inputStream, padding);
            count(skipped);
        }
    }

    /**
     * Get the next entry in this tar archive as longname data.
     *
     * @return The next entry in the archive as longname data, or null.
     * @throws IOException on error
     */
    protected byte[] getLongNameData() throws IOException {
        // read in the name
        final ByteArrayOutputStream longName = new ByteArrayOutputStream();
        int length = 0;
        while ((length = read(smallBuf)) >= 0) {
            longName.write(smallBuf, 0, length);
        }
        // advance to the real entry the long name belongs to
        getNextEntry();
        if (currEntry == null) {
            // Bugzilla: 40334
            // Malformed tar file - long entry name not followed by entry
            return null;
        }
        byte[] longNameData = longName.toByteArray();
        // remove trailing null terminator(s)
        length = longNameData.length;
        while (length > 0 && longNameData[length - 1] == 0) {
            --length;
        }
        if (length != longNameData.length) {
            final byte[] l = new byte[length];
            System.arraycopy(longNameData, 0, l, 0, length);
            longNameData = l;
        }
        return longNameData;
    }

    /**
     * Get the next record in this tar archive. This will skip
     * over any remaining data in the current entry, if there
     * is one, and place the input stream at the header of the
     * next entry.
     *
     * <p>If there are no more entries in the archive, null will be
     * returned to indicate that the end of the archive has been
     * reached. At the same time the {@code hasHitEOF} marker will be
     * set to true.</p>
     *
     * @return The next header in the archive, or null.
     * @throws IOException on error
     */
    private byte[] getRecord() throws IOException {
        byte[] headerBuf = readRecord();
        setAtEOF(isEOFRecord(headerBuf));
        if (isAtEOF() && headerBuf != null) {
            // an all-zero record marks end of archive; drain the trailer
            tryToConsumeSecondEOFRecord();
            consumeRemainderOfLastBlock();
            headerBuf = null;
        }
        return headerBuf;
    }

    /**
     * Determine if an archive record indicate End of Archive. End of
     * archive is indicated by a record that consists entirely of null bytes.
     *
     * @param record The record data to check.
     * @return true if the record data is an End of Archive
     */
    protected boolean isEOFRecord(final byte[] record) {
        return record == null || ArchiveUtils.isArrayZero(record, recordSize);
    }

    /**
     * Read a record from the input stream and return the data.
     *
     * @return The record data or null if EOF has been hit.
     * @throws IOException on error
     */
    protected byte[] readRecord() throws IOException {

        final byte[] record = new byte[recordSize];

        final int readNow = IOUtils.readFully(inputStream, record);
        count(readNow);
        // a partial record at EOF is treated the same as no record at all
        if (readNow != recordSize) {
            return null;
        }

        return record;
    }

    /**
     * Reads the data of a global PAX header entry and merges it into
     * {@code globalPaxHeaders}, then positions on the following real entry.
     *
     * @throws IOException on error
     */
    private void readGlobalPaxHeaders() throws IOException {
        globalPaxHeaders = parsePaxHeaders(this, globalSparseHeaders);
        getNextEntry(); // Get the actual file entry
    }

    /**
     * For PAX Format 0.0, the sparse headers(GNU.sparse.offset and GNU.sparse.numbytes)
     * may appear multi times, and they look like:
     *
     * GNU.sparse.size=size
     * GNU.sparse.numblocks=numblocks
     * repeat numblocks times
     *   GNU.sparse.offset=offset
     *   GNU.sparse.numbytes=numbytes
     * end repeat
     *
     *
     * For PAX Format 0.1, the sparse headers are stored in a single variable : GNU.sparse.map
     *
     * GNU.sparse.map
     *    Map of non-null data chunks. It is a string consisting of comma-separated values "offset,size[,offset-1,size-1...]"
     *
     *
     * For PAX Format 1.X:
     * The sparse map itself is stored in the file data block, preceding the actual file data.
     * It consists of a series of decimal numbers delimited by newlines. The map is padded with nulls to the nearest block boundary.
     * The first number gives the number of entries in the map. Following are map entries, each one consisting of two numbers
     * giving the offset and size of the data block it describes.
551 * @throws IOException 552 */ 553 private void paxHeaders() throws IOException{ 554 List<TarArchiveStructSparse> sparseHeaders = new ArrayList<>(); 555 final Map<String, String> headers = parsePaxHeaders(this, sparseHeaders); 556 557 // for 0.1 PAX Headers 558 if (headers.containsKey("GNU.sparse.map")) { 559 sparseHeaders = parsePAX01SparseHeaders(headers.get("GNU.sparse.map")); 560 } 561 getNextEntry(); // Get the actual file entry 562 applyPaxHeadersToCurrentEntry(headers, sparseHeaders); 563 564 // for 1.0 PAX Format, the sparse map is stored in the file data block 565 if (currEntry.isPaxGNU1XSparse()) { 566 sparseHeaders = parsePAX1XSparseHeaders(); 567 currEntry.setSparseHeaders(sparseHeaders); 568 } 569 570 // sparse headers are all done reading, we need to build 571 // sparse input streams using these sparse headers 572 buildSparseInputStreams(); 573 } 574 575 /** 576 * For PAX Format 0.1, the sparse headers are stored in a single variable : GNU.sparse.map 577 * GNU.sparse.map 578 * Map of non-null data chunks. 
It is a string consisting of comma-separated values "offset,size[,offset-1,size-1...]" 579 * 580 * @param sparseMap the sparse map string consisting of comma-separated values "offset,size[,offset-1,size-1...]" 581 * @return sparse headers parsed from sparse map 582 * @throws IOException 583 */ 584 private List<TarArchiveStructSparse> parsePAX01SparseHeaders(String sparseMap) throws IOException { 585 List<TarArchiveStructSparse> sparseHeaders = new ArrayList<>(); 586 String[] sparseHeaderStrings = sparseMap.split(","); 587 588 for (int i = 0; i < sparseHeaderStrings.length;i += 2) { 589 long sparseOffset = Long.parseLong(sparseHeaderStrings[i]); 590 long sparseNumbytes = Long.parseLong(sparseHeaderStrings[i + 1]); 591 sparseHeaders.add(new TarArchiveStructSparse(sparseOffset, sparseNumbytes)); 592 } 593 594 return sparseHeaders; 595 } 596 597 /** 598 * For PAX Format 1.X: 599 * The sparse map itself is stored in the file data block, preceding the actual file data. 600 * It consists of a series of decimal numbers delimited by newlines. The map is padded with nulls to the nearest block boundary. 601 * The first number gives the number of entries in the map. Following are map entries, each one consisting of two numbers 602 * giving the offset and size of the data block it describes. 
603 * @return sparse headers 604 * @throws IOException 605 */ 606 private List<TarArchiveStructSparse> parsePAX1XSparseHeaders() throws IOException { 607 // for 1.X PAX Headers 608 List<TarArchiveStructSparse> sparseHeaders = new ArrayList<>(); 609 long bytesRead = 0; 610 611 long[] readResult = readLineOfNumberForPax1X(inputStream); 612 long sparseHeadersCount = readResult[0]; 613 bytesRead += readResult[1]; 614 while (sparseHeadersCount-- > 0) { 615 readResult = readLineOfNumberForPax1X(inputStream); 616 long sparseOffset = readResult[0]; 617 bytesRead += readResult[1]; 618 619 readResult = readLineOfNumberForPax1X(inputStream); 620 long sparseNumbytes = readResult[0]; 621 bytesRead += readResult[1]; 622 sparseHeaders.add(new TarArchiveStructSparse(sparseOffset, sparseNumbytes)); 623 } 624 625 // skip the rest of this record data 626 long bytesToSkip = recordSize - bytesRead % recordSize; 627 IOUtils.skip(inputStream, bytesToSkip); 628 return sparseHeaders; 629 } 630 631 /** 632 * For 1.X PAX Format, the sparse headers are stored in the file data block, preceding the actual file data. 633 * It consists of a series of decimal numbers delimited by newlines. 
634 * 635 * @param inputStream the input stream of the tar file 636 * @return the decimal number delimited by '\n', and the bytes read from input stream 637 * @throws IOException 638 */ 639 private long[] readLineOfNumberForPax1X(InputStream inputStream) throws IOException { 640 int number; 641 long result = 0; 642 long bytesRead = 0; 643 644 while((number = inputStream.read()) != '\n') { 645 bytesRead += 1; 646 if(number == -1) { 647 throw new IOException("Unexpected EOF when reading parse information of 1.X PAX format"); 648 } 649 result = result * 10 + (number - '0'); 650 } 651 bytesRead += 1; 652 653 return new long[] {result, bytesRead}; 654 } 655 656 /** 657 * For PAX Format 0.0, the sparse headers(GNU.sparse.offset and GNU.sparse.numbytes) 658 * may appear multi times, and they look like: 659 * 660 * GNU.sparse.size=size 661 * GNU.sparse.numblocks=numblocks 662 * repeat numblocks times 663 * GNU.sparse.offset=offset 664 * GNU.sparse.numbytes=numbytes 665 * end repeat 666 * 667 * For PAX Format 0.1, the sparse headers are stored in a single variable : GNU.sparse.map 668 * 669 * GNU.sparse.map 670 * Map of non-null data chunks. It is a string consisting of comma-separated values "offset,size[,offset-1,size-1...]" 671 * 672 * @param inputstream inputstream to read keys and values 673 * @param sparseHeaders used in PAX Format 0.0 & 0.1, as it may appear multi times, 674 * the sparse headers need to be stored in an array, not a map 675 * @return map of PAX headers values found inside of the current (local or global) PAX headers tar entry. 
     * @throws IOException on error
     */
    Map<String, String> parsePaxHeaders(final InputStream inputStream, List<TarArchiveStructSparse> sparseHeaders)
        throws IOException {
        // local headers are layered on top of any global PAX headers seen so far
        final Map<String, String> headers = new HashMap<>(globalPaxHeaders);
        // pending GNU.sparse.offset waiting for its matching GNU.sparse.numbytes
        Long offset = null;
        // Format is "length keyword=value\n";
        while(true) { // get length
            int ch;
            int len = 0;
            int read = 0;
            while((ch = inputStream.read()) != -1) {
                read++;
                if (ch == '\n') { // blank line in header
                    break;
                } else if (ch == ' '){ // End of length string
                    // Get keyword
                    final ByteArrayOutputStream coll = new ByteArrayOutputStream();
                    while((ch = inputStream.read()) != -1) {
                        read++;
                        if (ch == '='){ // end of keyword
                            final String keyword = coll.toString(CharsetNames.UTF_8);
                            // Get rest of entry
                            // len is the total record length, read counts what
                            // was already consumed of it
                            final int restLen = len - read;
                            if (restLen == 1) { // only NL
                                // "length keyword=\n" deletes the keyword
                                headers.remove(keyword);
                            } else {
                                final byte[] rest = new byte[restLen];
                                final int got = IOUtils.readFully(inputStream, rest);
                                if (got != restLen) {
                                    throw new IOException("Failed to read "
                                                          + "Paxheader. Expected "
                                                          + restLen
                                                          + " bytes, read "
                                                          + got);
                                }
                                // Drop trailing NL
                                final String value = new String(rest, 0,
                                                                restLen - 1, CharsetNames.UTF_8);
                                headers.put(keyword, value);

                                // for 0.0 PAX Headers
                                if (keyword.equals("GNU.sparse.offset")) {
                                    if (offset != null) {
                                        // previous GNU.sparse.offset header but no numBytes
                                        sparseHeaders.add(new TarArchiveStructSparse(offset, 0));
                                    }
                                    offset = Long.valueOf(value);
                                }

                                // for 0.0 PAX Headers
                                if (keyword.equals("GNU.sparse.numbytes")) {
                                    if (offset == null) {
                                        throw new IOException("Failed to read Paxheader." +
                                            "GNU.sparse.offset is expected before GNU.sparse.numbytes shows up.");
                                    }
                                    sparseHeaders.add(new TarArchiveStructSparse(offset, Long.parseLong(value)));
                                    offset = null;
                                }
                            }
                            break;
                        }
                        coll.write((byte) ch);
                    }
                    break; // Processed single header
                }
                // accumulate the decimal length prefix digit by digit
                len *= 10;
                len += ch - '0';
            }
            if (ch == -1){ // EOF
                break;
            }
        }
        if (offset != null) {
            // offset but no numBytes
            sparseHeaders.add(new TarArchiveStructSparse(offset, 0));
        }
        return headers;
    }

    /**
     * Copies the parsed PAX headers and sparse headers onto the current entry.
     */
    private void applyPaxHeadersToCurrentEntry(final Map<String, String> headers, final List<TarArchiveStructSparse> sparseHeaders) {
        currEntry.updateEntryFromPaxHeaders(headers);
        currEntry.setSparseHeaders(sparseHeaders);
    }

    /**
     * Adds the sparse chunks from the current entry to the sparse chunks,
     * including any additional sparse entries following the current entry.
     *
     * @throws IOException on error
     */
    private void readOldGNUSparse() throws IOException {
        if (currEntry.isExtended()) {
            // old GNU sparse format chains extra header records while the
            // "extended" bit stays set
            TarArchiveSparseEntry entry;
            do {
                final byte[] headerBuf = getRecord();
                if (headerBuf == null) {
                    currEntry = null;
                    break;
                }
                entry = new TarArchiveSparseEntry(headerBuf);
                currEntry.getSparseHeaders().addAll(entry.getSparseHeaders());
            } while (entry.isExtended());
        }

        // sparse headers are all done reading, we need to build
        // sparse input streams using these sparse headers
        buildSparseInputStreams();
    }

    /** True if a current entry exists and is a directory. */
    private boolean isDirectory() {
        return currEntry != null && currEntry.isDirectory();
    }

    /**
     * Returns the next Archive Entry in this Stream.
     *
     * @return the next entry,
     *         or {@code null} if there are no more entries
     * @throws IOException if the next entry could not be read
     */
    @Override
    public ArchiveEntry getNextEntry() throws IOException {
        return getNextTarEntry();
    }

    /**
     * Tries to read the next record rewinding the stream if it is not a EOF record.
     *
     * <p>This is meant to protect against cases where a tar
     * implementation has written only one EOF record when two are
     * expected. Actually this won't help since a non-conforming
     * implementation likely won't fill full blocks consisting of - by
     * default - ten records either so we probably have already read
     * beyond the archive anyway.</p>
     *
     * @throws IOException on error
     */
    private void tryToConsumeSecondEOFRecord() throws IOException {
        boolean shouldReset = true;
        final boolean marked = inputStream.markSupported();
        if (marked) {
            inputStream.mark(recordSize);
        }
        try {
            shouldReset = !isEOFRecord(readRecord());
        } finally {
            // only rewind when the peeked record was NOT an EOF record and
            // the stream actually supports mark/reset
            if (shouldReset && marked) {
                // undo the byte count added by readRecord() before rewinding
                pushedBackBytes(recordSize);
                inputStream.reset();
            }
        }
    }

    /**
     * Reads bytes from the current tar archive entry.
     *
     * This method is aware of the boundaries of the current
     * entry in the archive and will deal with them as if they
     * were this stream's start and EOF.
     *
     * @param buf The buffer into which to place bytes read.
     * @param offset The offset at which to place bytes read.
     * @param numToRead The number of bytes to read.
     * @return The number of bytes read, or -1 at EOF.
     * @throws IOException on error
     */
    @Override
    public int read(final byte[] buf, final int offset, int numToRead) throws IOException {
        if (numToRead == 0) {
            return 0;
        }
        int totalRead = 0;

        if (isAtEOF() || isDirectory()) {
            return -1;
        }

        if (currEntry == null) {
            throw new IllegalStateException("No current tar entry");
        }

        if (!currEntry.isSparse()) {
            if (entryOffset >= entrySize) {
                return -1;
            }
        } else {
            // for sparse entries, there are actually currEntry.getRealSize() bytes to read
            if (entryOffset >= currEntry.getRealSize()) {
                return -1;
            }
        }

        // never read past the end of the current entry
        numToRead = Math.min(numToRead, available());

        if (currEntry.isSparse()) {
            // for sparse entries, we need to read them in another way
            totalRead = readSparse(buf, offset, numToRead);
        } else {
            totalRead = inputStream.read(buf, offset, numToRead);
        }

        if (totalRead == -1) {
            // underlying stream ended while entry data was still expected
            if (numToRead > 0) {
                throw new IOException("Truncated TAR archive");
            }
            setAtEOF(true);
        } else {
            // keep archive byte counter and entry offset in sync
            count(totalRead);
            entryOffset += totalRead;
        }

        return totalRead;
    }

    /**
     * For sparse tar entries, there are many "holes"(consisting of all 0) in the file. Only the non-zero data is
     * stored in tar files, and they are stored separately. The structure of non-zero data is introduced by the
     * sparse headers using the offset, where a block of non-zero data starts, and numbytes, the length of the
     * non-zero data block.
     * When reading sparse entries, the actual data is read out with "holes" and non-zero data combined together
     * according to the sparse headers.
     *
     * @param buf The buffer into which to place bytes read.
     * @param offset The offset at which to place bytes read.
     * @param numToRead The number of bytes to read.
     * @return The number of bytes read, or -1 at EOF.
     * @throws IOException on error
     */
    private int readSparse(final byte[] buf, final int offset, int numToRead) throws IOException {
        // if there are no actual input streams, just read from the original input stream
        if (sparseInputStreams == null || sparseInputStreams.size() == 0) {
            return inputStream.read(buf, offset, numToRead);
        }

        // all sparse-region streams consumed: entry is at EOF
        if(currentSparseInputStreamIndex >= sparseInputStreams.size()) {
            return -1;
        }

        InputStream currentInputStream = sparseInputStreams.get(currentSparseInputStreamIndex);
        int readLen = currentInputStream.read(buf, offset, numToRead);

        // if the current input stream is the last input stream,
        // just return the number of bytes read from current input stream
        if (currentSparseInputStreamIndex == sparseInputStreams.size() - 1) {
            return readLen;
        }

        // if EOF of current input stream is meet, open a new input stream and recursively call read
        if (readLen == -1) {
            currentSparseInputStreamIndex++;
            return readSparse(buf, offset, numToRead);
        }

        // if the rest data of current input stream is not long enough, open a new input stream
        // and recursively call read
        if (readLen < numToRead) {
            currentSparseInputStreamIndex++;
            // continue filling the buffer right after the bytes already read
            int readLenOfNext = readSparse(buf, offset + readLen, numToRead - readLen);
            if (readLenOfNext == -1) {
                return readLen;
            }

            return readLen + readLenOfNext;
        }

        // if the rest data of current input stream is enough(which means readLen == len), just return readLen
        return readLen;
    }

    /**
     * Whether this class is able to read the given entry.
946 * 947 * <p>May return false if the current entry is a sparse file.</p> 948 */ 949 @Override 950 public boolean canReadEntryData(final ArchiveEntry ae) { 951 if (ae instanceof TarArchiveEntry) { 952 final TarArchiveEntry te = (TarArchiveEntry) ae; 953 return !te.isSparse(); 954 } 955 return false; 956 } 957 958 /** 959 * Get the current TAR Archive Entry that this input stream is processing 960 * 961 * @return The current Archive Entry 962 */ 963 public TarArchiveEntry getCurrentEntry() { 964 return currEntry; 965 } 966 967 protected final void setCurrentEntry(final TarArchiveEntry e) { 968 currEntry = e; 969 } 970 971 protected final boolean isAtEOF() { 972 return hasHitEOF; 973 } 974 975 protected final void setAtEOF(final boolean b) { 976 hasHitEOF = b; 977 } 978 979 /** 980 * This method is invoked once the end of the archive is hit, it 981 * tries to consume the remaining bytes under the assumption that 982 * the tool creating this archive has padded the last block. 983 */ 984 private void consumeRemainderOfLastBlock() throws IOException { 985 final long bytesReadOfLastBlock = getBytesRead() % blockSize; 986 if (bytesReadOfLastBlock > 0) { 987 final long skipped = IOUtils.skip(inputStream, blockSize - bytesReadOfLastBlock); 988 count(skipped); 989 } 990 } 991 992 /** 993 * Checks if the signature matches what is expected for a tar file. 
994 * 995 * @param signature 996 * the bytes to check 997 * @param length 998 * the number of bytes to check 999 * @return true, if this stream is a tar archive stream, false otherwise 1000 */ 1001 public static boolean matches(final byte[] signature, final int length) { 1002 if (length < TarConstants.VERSION_OFFSET+TarConstants.VERSIONLEN) { 1003 return false; 1004 } 1005 1006 if (ArchiveUtils.matchAsciiBuffer(TarConstants.MAGIC_POSIX, 1007 signature, TarConstants.MAGIC_OFFSET, TarConstants.MAGICLEN) 1008 && 1009 ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_POSIX, 1010 signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN) 1011 ){ 1012 return true; 1013 } 1014 if (ArchiveUtils.matchAsciiBuffer(TarConstants.MAGIC_GNU, 1015 signature, TarConstants.MAGIC_OFFSET, TarConstants.MAGICLEN) 1016 && 1017 ( 1018 ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_GNU_SPACE, 1019 signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN) 1020 || 1021 ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_GNU_ZERO, 1022 signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN) 1023 ) 1024 ){ 1025 return true; 1026 } 1027 // COMPRESS-107 - recognise Ant tar files 1028 return ArchiveUtils.matchAsciiBuffer(TarConstants.MAGIC_ANT, 1029 signature, TarConstants.MAGIC_OFFSET, TarConstants.MAGICLEN) 1030 && 1031 ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_ANT, 1032 signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN); 1033 } 1034 1035 /** 1036 * Build the input streams consisting of all-zero input streams and non-zero input streams. 1037 * When reading from the non-zero input streams, the data is actually read from the original input stream. 1038 * The size of each input stream is introduced by the sparse headers. 1039 * 1040 * NOTE : Some all-zero input streams and non-zero input streams have the size of 0. We DO NOT store the 1041 * 0 size input streams because they are meaningless. 
1042 */ 1043 private void buildSparseInputStreams() throws IOException { 1044 currentSparseInputStreamIndex = -1; 1045 sparseInputStreams = new ArrayList<>(); 1046 1047 final List<TarArchiveStructSparse> sparseHeaders = currEntry.getSparseHeaders(); 1048 // sort the sparse headers in case they are written in wrong order 1049 if (sparseHeaders != null && sparseHeaders.size() > 1) { 1050 final Comparator<TarArchiveStructSparse> sparseHeaderComparator = new Comparator<TarArchiveStructSparse>() { 1051 @Override 1052 public int compare(final TarArchiveStructSparse p, final TarArchiveStructSparse q) { 1053 Long pOffset = p.getOffset(); 1054 Long qOffset = q.getOffset(); 1055 return pOffset.compareTo(qOffset); 1056 } 1057 }; 1058 Collections.sort(sparseHeaders, sparseHeaderComparator); 1059 } 1060 1061 if (sparseHeaders != null) { 1062 // Stream doesn't need to be closed at all as it doesn't use any resources 1063 final InputStream zeroInputStream = new TarArchiveSparseZeroInputStream(); //NOSONAR 1064 long offset = 0; 1065 for (TarArchiveStructSparse sparseHeader : sparseHeaders) { 1066 if (sparseHeader.getOffset() == 0 && sparseHeader.getNumbytes() == 0) { 1067 break; 1068 } 1069 1070 if ((sparseHeader.getOffset() - offset) < 0) { 1071 throw new IOException("Corrupted struct sparse detected"); 1072 } 1073 1074 // only store the input streams with non-zero size 1075 if ((sparseHeader.getOffset() - offset) > 0) { 1076 sparseInputStreams.add(new BoundedInputStream(zeroInputStream, sparseHeader.getOffset() - offset)); 1077 } 1078 1079 // only store the input streams with non-zero size 1080 if (sparseHeader.getNumbytes() > 0) { 1081 sparseInputStreams.add(new BoundedInputStream(inputStream, sparseHeader.getNumbytes())); 1082 } 1083 1084 offset = sparseHeader.getOffset() + sparseHeader.getNumbytes(); 1085 } 1086 } 1087 1088 if (sparseInputStreams.size() > 0) { 1089 currentSparseInputStreamIndex = 0; 1090 } 1091 } 1092 1093 /** 1094 * This is an inputstream that always return 
0, 1095 * this is used when reading the "holes" of a sparse file 1096 */ 1097 private static class TarArchiveSparseZeroInputStream extends InputStream { 1098 /** 1099 * Just return 0 1100 * @return 1101 * @throws IOException 1102 */ 1103 @Override 1104 public int read() throws IOException { 1105 return 0; 1106 } 1107 1108 /** 1109 * these's nothing need to do when skipping 1110 * 1111 * @param n bytes to skip 1112 * @return bytes actually skipped 1113 */ 1114 @Override 1115 public long skip(final long n) { 1116 return n; 1117 } 1118 } 1119}