/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.fs;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.LineReader;
import org.apache.hadoop.util.Progressable;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URLDecoder;
import java.util.*;

/**
 * This is an implementation of the Hadoop Archive
 * Filesystem. This archive Filesystem has index files
 * of the form _index* and has contents of the form
 * part-*. The index files store the indexes of the
 * real files. The index files are of the form _masterindex
 * and _index. The master index is a level of indirection
 * into the index file to make lookups faster. The index
 * file is sorted by the hash code of the paths that it contains,
 * and the master index contains pointers to the positions in
 * the index for ranges of hashcodes.
 */

public class HarFileSystem extends FileSystem {

  private static final Log LOG = LogFactory.getLog(HarFileSystem.class);

  public static final String METADATA_CACHE_ENTRIES_KEY = "fs.har.metadatacache.entries";
  public static final int METADATA_CACHE_ENTRIES_DEFAULT = 10;

  public static final int VERSION = 3;

  private static Map<URI, HarMetaData> harMetaCache;

  // uri representation of this Har filesystem
  private URI uri;
  // the top level path of the archive
  // in the underlying file system
  private Path archivePath;
  // the har auth
  private String harAuth;

  // pointer into the static metadata cache
  private HarMetaData metadata;

  private FileSystem fs;

  /**
   * Public construction of HarFileSystem.
   */
  public HarFileSystem() {
    // Must call #initialize() method to set the underlying file system
  }

  /**
   * Return the protocol scheme for the FileSystem.
   * <p/>
   *
   * @return <code>har</code>
   */
  @Override
  public String getScheme() {
    return "har";
  }

  /**
   * Constructor to create a HarFileSystem with an
   * underlying filesystem.
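   * Note that {@link #initialize(URI, Configuration)} must still be called
   * afterwards to load the archive index before the instance can be used.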
   * @param fs underlying file system
   */
  public HarFileSystem(FileSystem fs) {
    this.fs = fs;
    this.statistics = fs.statistics;
  }

  private synchronized void initializeMetadataCache(Configuration conf) {
    if (harMetaCache == null) {
      int cacheSize = conf.getInt(METADATA_CACHE_ENTRIES_KEY, METADATA_CACHE_ENTRIES_DEFAULT);
      harMetaCache = Collections.synchronizedMap(new LruCache<URI, HarMetaData>(cacheSize));
    }
  }

  /**
   * Initialize a Har filesystem per har archive. The
   * archive home directory is the top level directory
   * in the filesystem that contains the HAR archive.
   * Be careful with this method, you do not want to go
   * on creating new Filesystem instances per call to
   * path.getFileSystem().
   * The uri of Har is
   * har://underlyingfsscheme-host:port/archivepath
   * or
   * har:///archivepath. The latter assumes the default
   * underlying filesystem when none is specified.
   */
  @Override
  public void initialize(URI name, Configuration conf) throws IOException {
    // initialize the metadata cache, if needed
    initializeMetadataCache(conf);

    // decode the name
    URI underLyingURI = decodeHarURI(name, conf);
    // we got the right har Path- now check if this is
    // truly a har filesystem
    Path harPath = archivePath(
      new Path(name.getScheme(), name.getAuthority(), name.getPath()));
    if (harPath == null) {
      throw new IOException("Invalid path for the Har Filesystem. " +
                           name.toString());
    }
    if (fs == null) {
      fs = FileSystem.get(underLyingURI, conf);
    }
    uri = harPath.toUri();
    archivePath = new Path(uri.getPath());
    harAuth = getHarAuth(underLyingURI);
    // check for the underlying fs containing
    // the index file
    Path masterIndexPath = new Path(archivePath, "_masterindex");
    Path archiveIndexPath = new Path(archivePath, "_index");
    if (!fs.exists(masterIndexPath) || !fs.exists(archiveIndexPath)) {
      throw new IOException("Invalid path for the Har Filesystem. " +
          "No index file in " + harPath);
    }

    metadata = harMetaCache.get(uri);
    if (metadata != null) {
      FileStatus mStat = fs.getFileStatus(masterIndexPath);
      FileStatus aStat = fs.getFileStatus(archiveIndexPath);
      if (mStat.getModificationTime() != metadata.getMasterIndexTimestamp() ||
          aStat.getModificationTime() != metadata.getArchiveIndexTimestamp()) {
        // the archive has been overwritten since we last read it
        // remove the entry from the meta data cache
        metadata = null;
        harMetaCache.remove(uri);
      }
    }
    if (metadata == null) {
      metadata = new HarMetaData(fs, masterIndexPath, archiveIndexPath);
      metadata.parseMetaData();
      harMetaCache.put(uri, metadata);
    }
  }

  @Override
  public Configuration getConf() {
    return fs.getConf();
  }

  // get the version of the filesystem from the masterindex file
  // the version is currently not useful since it is the first version
  // of archives
  public int getHarVersion() throws IOException {
    if (metadata != null) {
      return metadata.getVersion();
    }
    else {
      throw new IOException("Invalid meta data for the Har Filesystem");
    }
  }

  /*
   * find the parent path that is the
   * archive path in the path. The last
   * path segment that ends with .har is
   * the path that will be returned.
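   * For example (illustrative path only): for
   * /user/foo/data.har/dir/file this returns /user/foo/data.har.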
   */
  private Path archivePath(Path p) {
    Path retPath = null;
    Path tmp = p;
    for (int i=0; i< p.depth(); i++) {
      if (tmp.toString().endsWith(".har")) {
        retPath = tmp;
        break;
      }
      tmp = tmp.getParent();
    }
    return retPath;
  }

  /**
   * decode the raw URI to get the underlying URI
   * @param rawURI raw Har URI
   * @return filtered URI of the underlying fileSystem
   */
  private URI decodeHarURI(URI rawURI, Configuration conf) throws IOException {
    String tmpAuth = rawURI.getAuthority();
    // we are using the default file
    // system in the config
    // so create an underlying uri and
    // return it
    if (tmpAuth == null) {
      // create a path
      return FileSystem.getDefaultUri(conf);
    }
    String authority = rawURI.getAuthority();
    if (authority == null) {
      throw new IOException("URI: " + rawURI
          + " is an invalid Har URI since authority==null."
          + " Expecting har://<scheme>-<host>/<path>.");
    }

    int i = authority.indexOf('-');
    if (i < 0) {
      throw new IOException("URI: " + rawURI
          + " is an invalid Har URI since '-' not found."
          + " Expecting har://<scheme>-<host>/<path>.");
    }

    if (rawURI.getQuery() != null) {
      // query component not allowed
      throw new IOException("query component in Path not supported " + rawURI);
    }

    URI tmp;
    try {
      // convert <scheme>-<host> to <scheme>://<host>
      URI baseUri = new URI(authority.replaceFirst("-", "://"));

      tmp = new URI(baseUri.getScheme(), baseUri.getAuthority(),
            rawURI.getPath(), rawURI.getQuery(), rawURI.getFragment());
    } catch (URISyntaxException e) {
      throw new IOException("URI: " + rawURI
          + " is an invalid Har URI. Expecting har://<scheme>-<host>/<path>.");
    }
    return tmp;
  }

  private static String decodeString(String str)
    throws UnsupportedEncodingException {
    return URLDecoder.decode(str, "UTF-8");
  }

  private String decodeFileName(String fname)
    throws UnsupportedEncodingException {
    int version = metadata.getVersion();
    if (version == 2 || version == 3){
      return decodeString(fname);
    }
    return fname;
  }

  /**
   * return the top level archive.
   */
  @Override
  public Path getWorkingDirectory() {
    return new Path(uri.toString());
  }

  @Override
  public Path getInitialWorkingDirectory() {
    return getWorkingDirectory();
  }

  @Override
  public FsStatus getStatus(Path p) throws IOException {
    return fs.getStatus(p);
  }

  /**
   * Create a har specific auth
   * har-underlyingfs:port
   * @param underLyingUri the uri of underlying
   * filesystem
   * @return har specific auth
   */
  private String getHarAuth(URI underLyingUri) {
    String auth = underLyingUri.getScheme() + "-";
    if (underLyingUri.getHost() != null) {
      if (underLyingUri.getUserInfo() != null) {
        auth += underLyingUri.getUserInfo();
        auth += "@";
      }
      auth += underLyingUri.getHost();
      if (underLyingUri.getPort() != -1) {
        auth += ":";
        auth += underLyingUri.getPort();
      }
    }
    else {
      auth += ":";
    }
    return auth;
  }

  /**
   * Used for delegation token related functionality. Must delegate to
   * underlying file system.
   */
  @Override
  protected URI getCanonicalUri() {
    return fs.getCanonicalUri();
  }

  @Override
  protected URI canonicalizeUri(URI uri) {
    return fs.canonicalizeUri(uri);
  }

  /**
   * Returns the uri of this filesystem.
   * The uri is of the form
   * har://underlyingfsschema-host:port/pathintheunderlyingfs
   */
  @Override
  public URI getUri() {
    return this.uri;
  }

  @Override
  protected void checkPath(Path path) {
    fs.checkPath(path);
  }

  @Override
  public Path resolvePath(Path p) throws IOException {
    return fs.resolvePath(p);
  }

  /**
   * This method returns the relative path
   * inside the har filesystem.
   * @param path the fully qualified path in the har filesystem.
   * @return relative path in the filesystem.
   */
  private Path getPathInHar(Path path) {
    Path harPath = new Path(path.toUri().getPath());
    if (archivePath.compareTo(harPath) == 0)
      return new Path(Path.SEPARATOR);
    Path tmp = new Path(harPath.getName());
    Path parent = harPath.getParent();
    while (!(parent.compareTo(archivePath) == 0)) {
      if (parent.toString().equals(Path.SEPARATOR)) {
        tmp = null;
        break;
      }
      tmp = new Path(parent.getName(), tmp);
      parent = parent.getParent();
    }
    if (tmp != null)
      tmp = new Path(Path.SEPARATOR, tmp);
    return tmp;
  }

  // the relative path of p. basically
  // getting rid of /. Parsing and doing
  // string manipulation is not good - so
  // just use the path api to do it.
  private Path makeRelative(String initial, Path p) {
    String scheme = this.uri.getScheme();
    String authority = this.uri.getAuthority();
    Path root = new Path(Path.SEPARATOR);
    if (root.compareTo(p) == 0)
      return new Path(scheme, authority, initial);
    Path retPath = new Path(p.getName());
    Path parent = p.getParent();
    for (int i=0; i < p.depth()-1; i++) {
      retPath = new Path(parent.getName(), retPath);
      parent = parent.getParent();
    }
    return new Path(new Path(scheme, authority, initial),
      retPath.toString());
  }

  /* this makes a path qualified in the har filesystem
   * (non-Javadoc)
   * @see org.apache.hadoop.fs.FilterFileSystem#makeQualified(
   * org.apache.hadoop.fs.Path)
   */
  @Override
  public Path makeQualified(Path path) {
    // make sure that we just get the
    // path component
    Path fsPath = path;
    if (!path.isAbsolute()) {
      fsPath = new Path(archivePath, path);
    }

    URI tmpURI = fsPath.toUri();
    // change this to Har uri
    return new Path(uri.getScheme(), harAuth, tmpURI.getPath());
  }

  /**
   * Fix offset and length of block locations.
   * Note that this method modifies the original array.
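   * For illustration (hypothetical numbers): with fileOffsetInHar=1000 and a
   * part-file block at offset 512 of length 1024, the block covers file
   * offsets [-488, 536); for a requested range start=0, len=300 the block's
   * offset becomes 0 and its length is trimmed to 300.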
   * @param locations block locations of har part file
   * @param start the start of the desired range in the contained file
   * @param len the length of the desired range
   * @param fileOffsetInHar the offset of the desired file in the har part file
   * @return block locations with fixed offset and length
   */
  static BlockLocation[] fixBlockLocations(BlockLocation[] locations,
                                          long start,
                                          long len,
                                          long fileOffsetInHar) {
    // offset 1 past last byte of desired range
    long end = start + len;

    for (BlockLocation location : locations) {
      // offset of part block relative to beginning of desired file
      // (may be negative if file starts in this part block)
      long harBlockStart = location.getOffset() - fileOffsetInHar;
      // offset 1 past last byte of har block relative to beginning of
      // desired file
      long harBlockEnd = harBlockStart + location.getLength();

      if (start > harBlockStart) {
        // desired range starts after beginning of this har block
        // fix offset to beginning of relevant range (relative to desired file)
        location.setOffset(start);
        // fix length to relevant portion of har block
        location.setLength(location.getLength() - (start - harBlockStart));
      } else {
        // desired range includes beginning of this har block
        location.setOffset(harBlockStart);
      }

      if (harBlockEnd > end) {
        // range ends before end of this har block
        // fix length to remove irrelevant portion at the end
        location.setLength(location.getLength() - (harBlockEnd - end));
      }
    }

    return locations;
  }

  /**
   * Get block locations from the underlying fs and fix their
   * offsets and lengths.
   * @param file the input file status to get block locations
   * @param start the start of the desired range in the contained file
   * @param len the length of the desired range
   * @return block locations for this segment of file
   * @throws IOException
   */
  @Override
  public BlockLocation[] getFileBlockLocations(FileStatus file, long start,
                                               long len) throws IOException {
    HarStatus hstatus = getFileHarStatus(file.getPath());
    Path partPath = new Path(archivePath, hstatus.getPartName());
    FileStatus partStatus = metadata.getPartFileStatus(partPath);

    // get all part blocks that overlap with the desired file blocks
    BlockLocation[] locations =
      fs.getFileBlockLocations(partStatus,
                               hstatus.getStartIndex() + start, len);

    return fixBlockLocations(locations, start, len, hstatus.getStartIndex());
  }

  /**
   * the hash of the path p inside the filesystem
   * @param p the path in the harfilesystem
   * @return the hash code of the path.
   */
  public static int getHarHash(Path p) {
    return (p.toString().hashCode() & 0x7fffffff);
  }

  static class Store {
    public Store() {
      begin = end = startHash = endHash = 0;
    }
    public Store(long begin, long end, int startHash, int endHash) {
      this.begin = begin;
      this.end = end;
      this.startHash = startHash;
      this.endHash = endHash;
    }
    public long begin;
    public long end;
    public int startHash;
    public int endHash;
  }

  /**
   * Get filestatuses of all the children of a given directory. This just reads
   * through the index file, line by line, to get all statuses for the children
   * of a directory. It is a brute-force way of getting all such filestatuses.
   *
   * @param parent
   *          the parent path directory
   * @param statuses
   *          the list to add the children filestatuses to
   */
  private void fileStatusesInIndex(HarStatus parent, List<FileStatus> statuses)
      throws IOException {
    String parentString = parent.getName();
    if (!parentString.endsWith(Path.SEPARATOR)){
      parentString += Path.SEPARATOR;
    }
    Path harPath = new Path(parentString);
    int harlen = harPath.depth();
    final Map<String, FileStatus> cache = new TreeMap<String, FileStatus>();

    for (HarStatus hstatus : metadata.archive.values()) {
      String child = hstatus.getName();
      if ((child.startsWith(parentString))) {
        Path thisPath = new Path(child);
        if (thisPath.depth() == harlen + 1) {
          statuses.add(toFileStatus(hstatus, cache));
        }
      }
    }
  }

  /**
   * Combine the status stored in the index and the underlying status.
   * @param h status stored in the index
   * @param cache caching the underlying file statuses
   * @return the combined file status
   * @throws IOException
   */
  private FileStatus toFileStatus(HarStatus h,
      Map<String, FileStatus> cache) throws IOException {
    FileStatus underlying = null;
    if (cache != null) {
      underlying = cache.get(h.partName);
    }
    if (underlying == null) {
      final Path p = h.isDir? archivePath: new Path(archivePath, h.partName);
      underlying = fs.getFileStatus(p);
      if (cache != null) {
        cache.put(h.partName, underlying);
      }
    }

    long modTime = 0;
    int version = metadata.getVersion();
    if (version < 3) {
      modTime = underlying.getModificationTime();
    } else if (version == 3) {
      modTime = h.getModificationTime();
    }

    return new FileStatus(
        h.isDir()? 0L: h.getLength(),
        h.isDir(),
        underlying.getReplication(),
        underlying.getBlockSize(),
        modTime,
        underlying.getAccessTime(),
        underlying.getPermission(),
        underlying.getOwner(),
        underlying.getGroup(),
        makeRelative(this.uri.getPath(), new Path(h.name)));
  }

  // a single line parser for hadoop archives status
  // stored in a single line in the index files
  // the format is of the form
  // filename "dir"/"file" partFileName startIndex length
  // <space separated children>
  private class HarStatus {
    boolean isDir;
    String name;
    List<String> children;
    String partName;
    long startIndex;
    long length;
    long modificationTime = 0;

    public HarStatus(String harString) throws UnsupportedEncodingException {
      String[] splits = harString.split(" ");
      this.name = decodeFileName(splits[0]);
      this.isDir = "dir".equals(splits[1]) ? true: false;
      // this is equal to "none" if it's a directory
      this.partName = splits[2];
      this.startIndex = Long.parseLong(splits[3]);
      this.length = Long.parseLong(splits[4]);

      int version = metadata.getVersion();
      String[] propSplits = null;
      // propSplits is used to retrieve the metainformation that Har versions
      // 1 & 2 missed (modification time, permission, owner group).
      // These fields are stored in an encoded string placed in different
      // locations depending on whether it's a file or directory entry.
      // If it's a directory, the string will be placed at the partName
      // location (directories have no partName because they don't have data
      // to be stored). This is done because the number of fields in a
      // directory entry is unbounded (all children are listed at the end).
      // If it's a file, the string will be the last field.
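      // As an illustration only (not a verbatim index line), a decoded
      // version-3 file entry carries fields like
      //   <encoded-name> file <part-file-name> <startIndex> <length> <encoded-properties>
      // while a directory entry carries the encoded properties in the partName
      // slot, followed by the names of its children.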
      if (isDir) {
        if (version == 3){
          propSplits = decodeString(this.partName).split(" ");
        }
        children = new ArrayList<String>();
        for (int i = 5; i < splits.length; i++) {
          children.add(decodeFileName(splits[i]));
        }
      } else if (version == 3) {
        propSplits = decodeString(splits[5]).split(" ");
      }

      if (propSplits != null && propSplits.length >= 4) {
        modificationTime = Long.parseLong(propSplits[0]);
        // the fields below are stored in the file but are currently not used
        // by HarFileSystem
        // permission = new FsPermission(Short.parseShort(propSplits[1]));
        // owner = decodeString(propSplits[2]);
        // group = decodeString(propSplits[3]);
      }
    }
    public boolean isDir() {
      return isDir;
    }

    public String getName() {
      return name;
    }
    public String getPartName() {
      return partName;
    }
    public long getStartIndex() {
      return startIndex;
    }
    public long getLength() {
      return length;
    }
    public long getModificationTime() {
      return modificationTime;
    }
  }

  /**
   * return the filestatus of files in har archive.
   * The permissions returned are those of the archive
   * index files. The permissions are not persisted
   * while creating a hadoop archive.
   * @param f the path in har filesystem
   * @return filestatus.
   * @throws IOException
   */
  @Override
  public FileStatus getFileStatus(Path f) throws IOException {
    HarStatus hstatus = getFileHarStatus(f);
    return toFileStatus(hstatus, null);
  }

  private HarStatus getFileHarStatus(Path f) throws IOException {
    // get the fs DataInputStream for the underlying file
    // look up the index.
    Path p = makeQualified(f);
    Path harPath = getPathInHar(p);
    if (harPath == null) {
      throw new IOException("Invalid file name: " + f + " in " + uri);
    }
    HarStatus hstatus = metadata.archive.get(harPath);
    if (hstatus == null) {
      throw new FileNotFoundException("File: " + f + " does not exist in " + uri);
    }
    return hstatus;
  }

  /**
   * @return null since no checksum algorithm is implemented.
   */
  @Override
  public FileChecksum getFileChecksum(Path f) {
    return null;
  }

  /**
   * Returns a har input stream which fakes end of
   * file. It reads the index files to get the part
   * file name and the size and start of the file.
   */
  @Override
  public FSDataInputStream open(Path f, int bufferSize) throws IOException {
    // get the fs DataInputStream for the underlying file
    HarStatus hstatus = getFileHarStatus(f);
    if (hstatus.isDir()) {
      throw new FileNotFoundException(f + " : not a file in " +
                archivePath);
    }
    return new HarFSDataInputStream(fs, new Path(archivePath,
        hstatus.getPartName()),
        hstatus.getStartIndex(), hstatus.getLength(), bufferSize);
  }

  /**
   * Used for delegation token related functionality. Must delegate to
   * underlying file system.
   */
  @Override
  public FileSystem[] getChildFileSystems() {
    return new FileSystem[]{fs};
  }

  @Override
  public FSDataOutputStream create(Path f, FsPermission permission,
      boolean overwrite, int bufferSize, short replication, long blockSize,
      Progressable progress) throws IOException {
    throw new IOException("Har: create not allowed.");
  }

  @SuppressWarnings("deprecation")
  @Override
  public FSDataOutputStream createNonRecursive(Path f, boolean overwrite,
      int bufferSize, short replication, long blockSize, Progressable progress)
      throws IOException {
    throw new IOException("Har: create not allowed.");
  }

  @Override
  public FSDataOutputStream append(Path f, int bufferSize, Progressable progress) throws IOException {
    throw new IOException("Har: append not allowed.");
  }

  @Override
  public void close() throws IOException {
    super.close();
    if (fs != null) {
      try {
        fs.close();
      } catch(IOException ie) {
        // this might already be closed
        // ignore
      }
    }
  }

  /**
   * Not implemented.
   */
  @Override
  public boolean setReplication(Path src, short replication) throws IOException{
    throw new IOException("Har: setReplication not allowed");
  }

  @Override
  public boolean rename(Path src, Path dst) throws IOException {
    throw new IOException("Har: rename not allowed");
  }

  @Override
  public FSDataOutputStream append(Path f) throws IOException {
    throw new IOException("Har: append not allowed");
  }

  /**
   * Not implemented.
   */
  @Override
  public boolean delete(Path f, boolean recursive) throws IOException {
    throw new IOException("Har: delete not allowed");
  }

  /**
   * liststatus returns the children of a directory
   * after looking up the index files.
   */
  @Override
  public FileStatus[] listStatus(Path f) throws IOException {
    // need to see if the file is an index in file
    // get the filestatus of the archive directory
    // we will create fake filestatuses to return
    // to the client
    List<FileStatus> statuses = new ArrayList<FileStatus>();
    Path tmpPath = makeQualified(f);
    Path harPath = getPathInHar(tmpPath);
    HarStatus hstatus = metadata.archive.get(harPath);
    if (hstatus == null) {
      throw new FileNotFoundException("File " + f + " not found in " + archivePath);
    }
    if (hstatus.isDir()) {
      fileStatusesInIndex(hstatus, statuses);
    } else {
      statuses.add(toFileStatus(hstatus, null));
    }

    return statuses.toArray(new FileStatus[statuses.size()]);
  }

  /**
   * return the top level archive path.
   */
  @Override
  public Path getHomeDirectory() {
    return new Path(uri.toString());
  }

  @Override
  public void setWorkingDirectory(Path newDir) {
    // does nothing.
  }

  /**
   * not implemented.
   */
  @Override
  public boolean mkdirs(Path f, FsPermission permission) throws IOException {
    throw new IOException("Har: mkdirs not allowed");
  }

  /**
   * not implemented.
   */
  @Override
  public void copyFromLocalFile(boolean delSrc, boolean overwrite,
      Path src, Path dst) throws IOException {
    throw new IOException("Har: copyfromlocalfile not allowed");
  }

  @Override
  public void copyFromLocalFile(boolean delSrc, boolean overwrite,
      Path[] srcs, Path dst) throws IOException {
    throw new IOException("Har: copyfromlocalfile not allowed");
  }

  /**
   * copies the file in the har filesystem to a local file.
   */
  @Override
  public void copyToLocalFile(boolean delSrc, Path src, Path dst)
    throws IOException {
    FileUtil.copy(this, src, getLocal(getConf()), dst, false, getConf());
  }

  /**
   * not implemented.
   */
  @Override
  public Path startLocalOutput(Path fsOutputFile, Path tmpLocalFile)
    throws IOException {
    throw new IOException("Har: startLocalOutput not allowed");
  }

  /**
   * not implemented.
   */
  @Override
  public void completeLocalOutput(Path fsOutputFile, Path tmpLocalFile)
    throws IOException {
    throw new IOException("Har: completeLocalOutput not allowed");
  }

  /**
   * not implemented.
   */
  @Override
  public void setOwner(Path p, String username, String groupname)
    throws IOException {
    throw new IOException("Har: setowner not allowed");
  }

  @Override
  public void setTimes(Path p, long mtime, long atime) throws IOException {
    throw new IOException("Har: setTimes not allowed");
  }

  /**
   * Not implemented.
   */
  @Override
  public void setPermission(Path p, FsPermission permission)
    throws IOException {
    throw new IOException("Har: setPermission not allowed");
  }

  /**
   * Hadoop archives input stream. This input stream fakes EOF
   * since archive files are part of bigger part files.
   */
  private static class HarFSDataInputStream extends FSDataInputStream {
    /**
     * Create an input stream that fakes all the reads/positions/seeking.
     */
    private static class HarFsInputStream extends FSInputStream
        implements CanSetDropBehind, CanSetReadahead {
      private long position, start, end;
      // The underlying data input stream that the
      // underlying filesystem will return.
      private final FSDataInputStream underLyingStream;
      // one byte buffer
      private final byte[] oneBytebuff = new byte[1];

      HarFsInputStream(FileSystem fs, Path path, long start,
          long length, int bufferSize) throws IOException {
        if (length < 0) {
          throw new IllegalArgumentException("Negative length ["+length+"]");
        }
        underLyingStream = fs.open(path, bufferSize);
        underLyingStream.seek(start);
        // the start of this file in the part file
        this.start = start;
        // the position pointer in the part file
        this.position = start;
        // the end pointer in the part file
        this.end = start + length;
      }

      @Override
      public synchronized int available() throws IOException {
        long remaining = end - underLyingStream.getPos();
        if (remaining > Integer.MAX_VALUE) {
          return Integer.MAX_VALUE;
        }
        return (int) remaining;
      }

      @Override
      public synchronized void close() throws IOException {
        underLyingStream.close();
        super.close();
      }

      // not implemented
      @Override
      public void mark(int readLimit) {
        // do nothing
      }

      /**
       * reset is not implemented
       */
      @Override
      public void reset() throws IOException {
        throw new IOException("reset not implemented.");
      }

      @Override
      public synchronized int read() throws IOException {
        int ret = read(oneBytebuff, 0, 1);
        return (ret <= 0) ? -1: (oneBytebuff[0] & 0xff);
      }

      // NB: currently this method is actually never executed because
      // java.io.DataInputStream.read(byte[]) directly delegates to
      // method java.io.InputStream.read(byte[], int, int).
      // However, potentially it can be invoked, so leave it intact for now.
      @Override
      public synchronized int read(byte[] b) throws IOException {
        final int ret = read(b, 0, b.length);
        if (ret > 0) {
          position += ret;
        }
        return ret;
      }

      /**
       *
       */
      @Override
      public synchronized int read(byte[] b, int offset, int len)
        throws IOException {
        int newlen = len;
        int ret = -1;
        if (position + len > end) {
          newlen = (int) (end - position);
        }
        // end case
        if (newlen == 0)
          return ret;
        ret = underLyingStream.read(b, offset, newlen);
        position += ret;
        return ret;
      }

      @Override
      public synchronized long skip(long n) throws IOException {
        long tmpN = n;
        if (tmpN > 0) {
          final long actualRemaining = end - position;
          if (tmpN > actualRemaining) {
            tmpN = actualRemaining;
          }
          underLyingStream.seek(tmpN + position);
          position += tmpN;
          return tmpN;
        }
        // NB: the contract is described in java.io.InputStream.skip(long):
        // this method returns the number of bytes actually skipped, so,
        // the return value should never be negative.
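        // n was non-positive, so nothing was skipped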
        return 0;
      }

      @Override
      public synchronized long getPos() throws IOException {
        return (position - start);
      }

      @Override
      public synchronized void seek(final long pos) throws IOException {
        validatePosition(pos);
        position = start + pos;
        underLyingStream.seek(position);
      }

      private void validatePosition(final long pos) throws IOException {
        if (pos < 0) {
          throw new IOException("Negative position: "+pos);
        }
        final long length = end - start;
        if (pos > length) {
          throw new IOException("Position behind the end " +
              "of the stream (length = "+length+"): " + pos);
        }
      }

      @Override
      public boolean seekToNewSource(long targetPos) throws IOException {
        // do not need to implement this
        // hdfs in itself does seektonewsource
        // while reading.
        return false;
      }

      /**
       * implementing position readable.
       */
      @Override
      public int read(long pos, byte[] b, int offset, int length)
        throws IOException {
        int nlength = length;
        if (start + nlength + pos > end) {
          // length corrected to the real remaining length:
          nlength = (int) (end - start - pos);
        }
        if (nlength <= 0) {
          // EOS:
          return -1;
        }
        return underLyingStream.read(pos + start, b, offset, nlength);
      }

      /**
       * position readable again.
       */
      @Override
      public void readFully(long pos, byte[] b, int offset, int length)
        throws IOException {
        if (start + length + pos > end) {
          throw new IOException("Not enough bytes to read.");
        }
        underLyingStream.readFully(pos + start, b, offset, length);
      }

      @Override
      public void readFully(long pos, byte[] b) throws IOException {
        readFully(pos, b, 0, b.length);
      }

      @Override
      public void setReadahead(Long readahead) throws IOException {
        underLyingStream.setReadahead(readahead);
      }

      @Override
      public void setDropBehind(Boolean dropBehind) throws IOException {
        underLyingStream.setDropBehind(dropBehind);
      }
    }

    /**
     * Constructor for har input stream.
     * @param fs the underlying filesystem
     * @param p The path in the underlying filesystem
     * @param start the start position in the part file
     * @param length the length of valid data in the part file
     * @param bufsize the buffer size
     * @throws IOException
     */
    public HarFSDataInputStream(FileSystem fs, Path p, long start,
        long length, int bufsize) throws IOException {
      super(new HarFsInputStream(fs, p, start, length, bufsize));
    }
  }

  private class HarMetaData {
    private FileSystem fs;
    private int version;
    // the masterIndex of the archive
    private Path masterIndexPath;
    // the index file
    private Path archiveIndexPath;

    private long masterIndexTimestamp;
    private long archiveIndexTimestamp;

    List<Store> stores = new ArrayList<Store>();
    Map<Path, HarStatus> archive = new HashMap<Path, HarStatus>();
    private Map<Path, FileStatus> partFileStatuses = new HashMap<Path, FileStatus>();

    public HarMetaData(FileSystem fs, Path masterIndexPath, Path archiveIndexPath) {
      this.fs = fs;
      this.masterIndexPath = masterIndexPath;
      this.archiveIndexPath = archiveIndexPath;
    }

    public FileStatus getPartFileStatus(Path partPath) throws IOException {
      FileStatus status;
      status = partFileStatuses.get(partPath);
      if (status == null) {
        status = fs.getFileStatus(partPath);
        partFileStatuses.put(partPath, status);
      }
      return status;
    }

    public long getMasterIndexTimestamp() {
      return masterIndexTimestamp;
    }

    public long getArchiveIndexTimestamp() {
      return archiveIndexTimestamp;
    }

    private int getVersion() {
      return version;
    }

    private void parseMetaData() throws IOException {
      Text line = new Text();
      long read;
      FSDataInputStream in = null;
      LineReader lin = null;

      try {
        in = fs.open(masterIndexPath);
        FileStatus masterStat = fs.getFileStatus(masterIndexPath);
        masterIndexTimestamp = masterStat.getModificationTime();
        lin = new LineReader(in, getConf());
        read = lin.readLine(line);

        // the first line contains the version of the index file
        String versionLine = line.toString();
        String[] arr = versionLine.split(" ");
        version = Integer.parseInt(arr[0]);
        // make it always backwards-compatible
        if (this.version > HarFileSystem.VERSION) {
          throw new IOException("Invalid version " +
              this.version + " expected " + HarFileSystem.VERSION);
        }

        // each line contains a hashcode range and the index file name
        String[] readStr;
        while(read < masterStat.getLen()) {
          int b = lin.readLine(line);
          read += b;
          readStr = line.toString().split(" ");
          int startHash = Integer.parseInt(readStr[0]);
          int endHash = Integer.parseInt(readStr[1]);
          stores.add(new Store(Long.parseLong(readStr[2]),
              Long.parseLong(readStr[3]), startHash,
              endHash));
          line.clear();
        }
      } catch (IOException ioe) {
        LOG.warn("Encountered exception ", ioe);
        throw ioe;
      } finally {
        IOUtils.cleanup(LOG, lin, in);
      }

      FSDataInputStream aIn = fs.open(archiveIndexPath);
      try {
        FileStatus archiveStat = fs.getFileStatus(archiveIndexPath);
        archiveIndexTimestamp = archiveStat.getModificationTime();
        LineReader aLin;

        // now start reading the real index file
        for (Store s: stores) {
          read = 0;
          aIn.seek(s.begin);
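          // read every archive index line that falls inside this store's
          // [begin, end) byte range and record the parsed HarStatus by path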
          aLin = new LineReader(aIn, getConf());
          while (read + s.begin < s.end) {
            int tmp = aLin.readLine(line);
            read += tmp;
            String lineFeed = line.toString();
            String[] parsed = lineFeed.split(" ");
            parsed[0] = decodeFileName(parsed[0]);
            archive.put(new Path(parsed[0]), new HarStatus(lineFeed));
            line.clear();
          }
        }
      } finally {
        IOUtils.cleanup(LOG, aIn);
      }
    }
  }

  /*
   * testing purposes only:
   */
  HarMetaData getMetadata() {
    return metadata;
  }

  private static class LruCache<K, V> extends LinkedHashMap<K, V> {
    private final int MAX_ENTRIES;

    public LruCache(int maxEntries) {
      super(maxEntries + 1, 1.0f, true);
      MAX_ENTRIES = maxEntries;
    }

    @Override
    protected boolean removeEldestEntry(Map.Entry<K, V> eldest) {
      return size() > MAX_ENTRIES;
    }
  }

  @SuppressWarnings("deprecation")
  @Override
  public FsServerDefaults getServerDefaults() throws IOException {
    return fs.getServerDefaults();
  }

  @Override
  public FsServerDefaults getServerDefaults(Path f) throws IOException {
    return fs.getServerDefaults(f);
  }

  @Override
  public long getUsed() throws IOException{
    return fs.getUsed();
  }

  @SuppressWarnings("deprecation")
  @Override
  public long getDefaultBlockSize() {
    return fs.getDefaultBlockSize();
  }

  @SuppressWarnings("deprecation")
  @Override
  public long getDefaultBlockSize(Path f) {
    return fs.getDefaultBlockSize(f);
  }

  @SuppressWarnings("deprecation")
  @Override
  public short getDefaultReplication() {
    return fs.getDefaultReplication();
  }

  @Override
  public short getDefaultReplication(Path f) {
    return fs.getDefaultReplication(f);
  }
}