/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.fs;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.LineReader;
import org.apache.hadoop.util.Progressable;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URLDecoder;
import java.util.*;

/**
 * This is an implementation of the Hadoop Archive
 * Filesystem. This archive Filesystem has index files
 * of the form _index* and has contents of the form
 * part-*. The index files store the indexes of the
 * real files. The index files are of the form _masterindex
 * and _index. The master index is a level of indirection
 * into the index file to make the lookups faster. The index
 * file is sorted by the hash code of the paths that it contains,
 * and the master index contains pointers to the positions in
 * the index for ranges of hash codes.
 */
public class HarFileSystem extends FileSystem {

  private static final Log LOG = LogFactory.getLog(HarFileSystem.class);

  public static final String METADATA_CACHE_ENTRIES_KEY = "fs.har.metadatacache.entries";
  public static final int METADATA_CACHE_ENTRIES_DEFAULT = 10;

  public static final int VERSION = 3;

  private static Map<URI, HarMetaData> harMetaCache;

  // uri representation of this Har filesystem
  private URI uri;
  // the top level path of the archive
  // in the underlying file system
  private Path archivePath;
  // the har auth
  private String harAuth;

  // pointer into the static metadata cache
  private HarMetaData metadata;

  private FileSystem fs;

  /**
   * Public constructor for HarFileSystem.
   */
  public HarFileSystem() {
    // Must call #initialize() method to set the underlying file system
  }

  /**
   * Return the protocol scheme for the FileSystem.
   * <p/>
   *
   * @return <code>har</code>
   */
  @Override
  public String getScheme() {
    return "har";
  }
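
  /*
   * Illustrative usage only (hypothetical paths): a HAR is addressed through
   * the normal FileSystem API, e.g.
   *
   *   Path p = new Path("har://hdfs-namenode:8020/user/foo/data.har/dir/file.txt");
   *   FileSystem harFs = p.getFileSystem(conf);   // returns a HarFileSystem
   *   FSDataInputStream in = harFs.open(p);       // reads from the part-* file
   *
   * All mutating operations (create, append, rename, delete, ...) throw
   * IOException, since a Hadoop archive is read-only once created.
   */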

  /**
   * Constructor to create a HarFileSystem with an
   * underlying filesystem.
   * @param fs underlying file system
   */
  public HarFileSystem(FileSystem fs) {
    this.fs = fs;
    this.statistics = fs.statistics;
  }

  private synchronized void initializeMetadataCache(Configuration conf) {
    if (harMetaCache == null) {
      int cacheSize = conf.getInt(METADATA_CACHE_ENTRIES_KEY, METADATA_CACHE_ENTRIES_DEFAULT);
      harMetaCache = Collections.synchronizedMap(new LruCache<URI, HarMetaData>(cacheSize));
    }
  }

  /**
   * Initialize a Har filesystem per har archive. The
   * archive home directory is the top level directory
   * in the filesystem that contains the HAR archive.
   * Be careful with this method: you do not want to go
   * on creating new Filesystem instances per call to
   * path.getFileSystem().
   * The URI of a Har filesystem is
   * har://underlyingfsscheme-host:port/archivepath
   * or
   * har:///archivepath. The latter form assumes the default
   * filesystem as the underlying filesystem.
   */
  @Override
  public void initialize(URI name, Configuration conf) throws IOException {
    // initialize the metadata cache, if needed
    initializeMetadataCache(conf);

    // decode the name
    URI underLyingURI = decodeHarURI(name, conf);
    // we got the right har Path- now check if this is
    // truly a har filesystem
    Path harPath = archivePath(
        new Path(name.getScheme(), name.getAuthority(), name.getPath()));
    if (harPath == null) {
      throw new IOException("Invalid path for the Har Filesystem. " +
          name.toString());
    }
    if (fs == null) {
      fs = FileSystem.get(underLyingURI, conf);
    }
    uri = harPath.toUri();
    archivePath = new Path(uri.getPath());
    harAuth = getHarAuth(underLyingURI);
    // check that the underlying fs contains
    // the index files
    Path masterIndexPath = new Path(archivePath, "_masterindex");
    Path archiveIndexPath = new Path(archivePath, "_index");
    if (!fs.exists(masterIndexPath) || !fs.exists(archiveIndexPath)) {
      throw new IOException("Invalid path for the Har Filesystem. " +
          "No index file in " + harPath);
    }

    metadata = harMetaCache.get(uri);
    if (metadata != null) {
      FileStatus mStat = fs.getFileStatus(masterIndexPath);
      FileStatus aStat = fs.getFileStatus(archiveIndexPath);
      if (mStat.getModificationTime() != metadata.getMasterIndexTimestamp() ||
          aStat.getModificationTime() != metadata.getArchiveIndexTimestamp()) {
        // the archive has been overwritten since we last read it
        // remove the entry from the metadata cache
        metadata = null;
        harMetaCache.remove(uri);
      }
    }
    if (metadata == null) {
      metadata = new HarMetaData(fs, masterIndexPath, archiveIndexPath);
      metadata.parseMetaData();
      harMetaCache.put(uri, metadata);
    }
  }

  @Override
  public Configuration getConf() {
    return fs.getConf();
  }

  // get the version of the filesystem from the masterindex file
  // the version is currently not useful since it is the first version
  // of archives
  public int getHarVersion() throws IOException {
    if (metadata != null) {
      return metadata.getVersion();
    } else {
      throw new IOException("Invalid meta data for the Har Filesystem");
    }
  }
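
  /*
   * For illustration only (hypothetical paths): an archive created as
   * /user/foo/data.har in the underlying filesystem typically contains
   *
   *   /user/foo/data.har/_masterindex   - version line + hash ranges into _index
   *   /user/foo/data.har/_index         - one line per archived file/directory
   *   /user/foo/data.har/part-0         - the concatenated file data
   *
   * initialize() rejects any har:// path whose archive directory is missing
   * the _masterindex or _index file.
   */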

  /*
   * Find the parent path that is the
   * archive path in the path. The last
   * path segment that ends with .har is
   * the path that will be returned.
   */
  private Path archivePath(Path p) {
    Path retPath = null;
    Path tmp = p;
    for (int i = 0; i < p.depth(); i++) {
      if (tmp.toString().endsWith(".har")) {
        retPath = tmp;
        break;
      }
      tmp = tmp.getParent();
    }
    return retPath;
  }

  /**
   * Decode the raw URI to get the underlying URI.
   * @param rawURI raw Har URI
   * @return filtered URI of the underlying fileSystem
   */
  private URI decodeHarURI(URI rawURI, Configuration conf) throws IOException {
    String tmpAuth = rawURI.getAuthority();
    // we are using the default file
    // system in the config
    // so create an underlying uri and
    // return it
    if (tmpAuth == null) {
      // create a path
      return FileSystem.getDefaultUri(conf);
    }
    String authority = rawURI.getAuthority();
    if (authority == null) {
      throw new IOException("URI: " + rawURI
          + " is an invalid Har URI since authority==null."
          + " Expecting har://<scheme>-<host>/<path>.");
    }

    int i = authority.indexOf('-');
    if (i < 0) {
      throw new IOException("URI: " + rawURI
          + " is an invalid Har URI since '-' not found."
          + " Expecting har://<scheme>-<host>/<path>.");
    }

    if (rawURI.getQuery() != null) {
      // query component not allowed
      throw new IOException("query component in Path not supported " + rawURI);
    }

    URI tmp;
    try {
      // convert <scheme>-<host> to <scheme>://<host>
      URI baseUri = new URI(authority.replaceFirst("-", "://"));

      tmp = new URI(baseUri.getScheme(), baseUri.getAuthority(),
          rawURI.getPath(), rawURI.getQuery(), rawURI.getFragment());
    } catch (URISyntaxException e) {
      throw new IOException("URI: " + rawURI
          + " is an invalid Har URI. Expecting har://<scheme>-<host>/<path>.");
    }
    return tmp;
  }

  private static String decodeString(String str)
      throws UnsupportedEncodingException {
    return URLDecoder.decode(str, "UTF-8");
  }

  private String decodeFileName(String fname)
      throws UnsupportedEncodingException {
    int version = metadata.getVersion();
    if (version == 2 || version == 3) {
      return decodeString(fname);
    }
    return fname;
  }

  /**
   * Return the top level archive.
   */
  @Override
  public Path getWorkingDirectory() {
    return new Path(uri.toString());
  }

  /**
   * Create a har specific auth of the form
   * underlyingfsscheme-host:port.
   * @param underLyingUri the uri of underlying
   * filesystem
   * @return har specific auth
   */
  private String getHarAuth(URI underLyingUri) {
    String auth = underLyingUri.getScheme() + "-";
    if (underLyingUri.getHost() != null) {
      auth += underLyingUri.getHost();
      if (underLyingUri.getPort() != -1) {
        auth += ":";
        auth += underLyingUri.getPort();
      }
    } else {
      auth += ":";
    }
    return auth;
  }

  @Override
  protected URI getCanonicalUri() {
    return fs.canonicalizeUri(getUri());
  }

  /**
   * Returns the uri of this filesystem.
   * The uri is of the form
   * har://underlyingfsschema-host:port/pathintheunderlyingfs
   */
  @Override
  public URI getUri() {
    return this.uri;
  }
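
  /*
   * Worked example (hypothetical host): for an archive stored on
   * hdfs://nn.example.com:8020, the har authority produced by getHarAuth()
   * is "hdfs-nn.example.com:8020", so the same archive is addressed as
   *
   *   underlying: hdfs://nn.example.com:8020/user/foo/data.har
   *   har:        har://hdfs-nn.example.com:8020/user/foo/data.har
   *
   * decodeHarURI() reverses the mapping by replacing the first '-' in the
   * authority with "://".
   */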

  /**
   * This method returns the path
   * inside the har filesystem.
   * This is the relative path inside
   * the har filesystem.
   * @param path the fully qualified path in the har filesystem.
   * @return relative path in the filesystem.
   */
  private Path getPathInHar(Path path) {
    Path harPath = new Path(path.toUri().getPath());
    if (archivePath.compareTo(harPath) == 0)
      return new Path(Path.SEPARATOR);
    Path tmp = new Path(harPath.getName());
    Path parent = harPath.getParent();
    while (!(parent.compareTo(archivePath) == 0)) {
      if (parent.toString().equals(Path.SEPARATOR)) {
        tmp = null;
        break;
      }
      tmp = new Path(parent.getName(), tmp);
      parent = parent.getParent();
    }
    if (tmp != null)
      tmp = new Path(Path.SEPARATOR, tmp);
    return tmp;
  }

  // the relative path of p. basically
  // getting rid of /. Parsing and doing
  // string manipulation is not good - so
  // just use the path api to do it.
  private Path makeRelative(String initial, Path p) {
    String scheme = this.uri.getScheme();
    String authority = this.uri.getAuthority();
    Path root = new Path(Path.SEPARATOR);
    if (root.compareTo(p) == 0)
      return new Path(scheme, authority, initial);
    Path retPath = new Path(p.getName());
    Path parent = p.getParent();
    for (int i = 0; i < p.depth() - 1; i++) {
      retPath = new Path(parent.getName(), retPath);
      parent = parent.getParent();
    }
    return new Path(new Path(scheme, authority, initial),
        retPath.toString());
  }

  /* this makes a path qualified in the har filesystem
   * (non-Javadoc)
   * @see org.apache.hadoop.fs.FilterFileSystem#makeQualified(
   * org.apache.hadoop.fs.Path)
   */
  @Override
  public Path makeQualified(Path path) {
    // make sure that we just get the
    // path component
    Path fsPath = path;
    if (!path.isAbsolute()) {
      fsPath = new Path(archivePath, path);
    }

    URI tmpURI = fsPath.toUri();
    // change this to Har uri
    return new Path(uri.getScheme(), harAuth, tmpURI.getPath());
  }
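
  /*
   * Worked example (hypothetical paths): with archivePath = /user/foo/data.har,
   *
   *   getPathInHar(har://hdfs-host/user/foo/data.har/dir/file)  -> /dir/file
   *   getPathInHar(har://hdfs-host/user/foo/data.har)           -> /
   *   makeRelative("/user/foo/data.har", new Path("/dir/file"))
   *       -> har://<harAuth>/user/foo/data.har/dir/file
   *
   * i.e. getPathInHar strips the archive prefix and makeRelative re-attaches it.
   */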

  /**
   * Fix offset and length of block locations.
   * Note that this method modifies the original array.
   * @param locations block locations of har part file
   * @param start the start of the desired range in the contained file
   * @param len the length of the desired range
   * @param fileOffsetInHar the offset of the desired file in the har part file
   * @return block locations with fixed offset and length
   */
  static BlockLocation[] fixBlockLocations(BlockLocation[] locations,
                                           long start,
                                           long len,
                                           long fileOffsetInHar) {
    // offset 1 past last byte of desired range
    long end = start + len;

    for (BlockLocation location : locations) {
      // offset of part block relative to beginning of desired file
      // (may be negative if file starts in this part block)
      long harBlockStart = location.getOffset() - fileOffsetInHar;
      // offset 1 past last byte of har block relative to beginning of
      // desired file
      long harBlockEnd = harBlockStart + location.getLength();

      if (start > harBlockStart) {
        // desired range starts after beginning of this har block
        // fix offset to beginning of relevant range (relative to desired file)
        location.setOffset(start);
        // fix length to relevant portion of har block
        location.setLength(location.getLength() - (start - harBlockStart));
      } else {
        // desired range includes beginning of this har block
        location.setOffset(harBlockStart);
      }

      if (harBlockEnd > end) {
        // desired range ends before end of this har block
        // fix length to remove irrelevant portion at the end
        location.setLength(location.getLength() - (harBlockEnd - end));
      }
    }

    return locations;
  }

  /**
   * Get block locations from the underlying fs and fix their
   * offsets and lengths.
   * @param file the input file status to get block locations
   * @param start the start of the desired range in the contained file
   * @param len the length of the desired range
   * @return block locations for this segment of file
   * @throws IOException
   */
  @Override
  public BlockLocation[] getFileBlockLocations(FileStatus file, long start,
      long len) throws IOException {
    HarStatus hstatus = getFileHarStatus(file.getPath());
    Path partPath = new Path(archivePath, hstatus.getPartName());
    FileStatus partStatus = metadata.getPartFileStatus(partPath);

    // get all part blocks that overlap with the desired file blocks
    BlockLocation[] locations =
        fs.getFileBlockLocations(partStatus,
            hstatus.getStartIndex() + start, len);

    return fixBlockLocations(locations, start, len, hstatus.getStartIndex());
  }

  /**
   * The hash of the path p inside the filesystem.
   * @param p the path in the harfilesystem
   * @return the hash code of the path.
   */
  public static int getHarHash(Path p) {
    return (p.toString().hashCode() & 0x7fffffff);
  }

  static class Store {
    public Store() {
      begin = end = startHash = endHash = 0;
    }

    public Store(long begin, long end, int startHash, int endHash) {
      this.begin = begin;
      this.end = end;
      this.startHash = startHash;
      this.endHash = endHash;
    }

    public long begin;
    public long end;
    public int startHash;
    public int endHash;
  }
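
  /*
   * Worked example for fixBlockLocations() above (made-up numbers): suppose a
   * file is stored at offset 1000 of its part file (fileOffsetInHar = 1000)
   * and the caller asks for start = 0, len = 500 of that file. A part-file
   * block at offset 512 with length 1024 covers part bytes [512, 1536), i.e.
   * harBlockStart = -488 and harBlockEnd = 536 relative to the file. The loop
   * first clips the front (offset -> 0, length -> 536) and then the tail
   * (length -> 500), so the reported location is exactly the requested range.
   */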

  /**
   * Get filestatuses of all the children of a given directory. This just reads
   * through the index file, line by line, to get all statuses for children
   * of a directory. It is a brute-force way of getting all such filestatuses.
   *
   * @param parent
   *          the parent path directory
   * @param statuses
   *          the list to add the children filestatuses to
   */
  private void fileStatusesInIndex(HarStatus parent, List<FileStatus> statuses)
      throws IOException {
    String parentString = parent.getName();
    if (!parentString.endsWith(Path.SEPARATOR)) {
      parentString += Path.SEPARATOR;
    }
    Path harPath = new Path(parentString);
    int harlen = harPath.depth();
    final Map<String, FileStatus> cache = new TreeMap<String, FileStatus>();

    for (HarStatus hstatus : metadata.archive.values()) {
      String child = hstatus.getName();
      if ((child.startsWith(parentString))) {
        Path thisPath = new Path(child);
        if (thisPath.depth() == harlen + 1) {
          statuses.add(toFileStatus(hstatus, cache));
        }
      }
    }
  }

  /**
   * Combine the status stored in the index and the underlying status.
   * @param h status stored in the index
   * @param cache caching the underlying file statuses
   * @return the combined file status
   * @throws IOException
   */
  private FileStatus toFileStatus(HarStatus h,
      Map<String, FileStatus> cache) throws IOException {
    FileStatus underlying = null;
    if (cache != null) {
      underlying = cache.get(h.partName);
    }
    if (underlying == null) {
      final Path p = h.isDir ? archivePath : new Path(archivePath, h.partName);
      underlying = fs.getFileStatus(p);
      if (cache != null) {
        cache.put(h.partName, underlying);
      }
    }

    long modTime = 0;
    int version = metadata.getVersion();
    if (version < 3) {
      modTime = underlying.getModificationTime();
    } else if (version == 3) {
      modTime = h.getModificationTime();
    }

    return new FileStatus(
        h.isDir() ? 0L : h.getLength(),
        h.isDir(),
        underlying.getReplication(),
        underlying.getBlockSize(),
        modTime,
        underlying.getAccessTime(),
        underlying.getPermission(),
        underlying.getOwner(),
        underlying.getGroup(),
        makeRelative(this.uri.getPath(), new Path(h.name)));
  }

  // a single line parser for hadoop archives status
  // stored in a single line in the index files
  // the format is of the form
  // filename "dir"/"file" partFileName startIndex length
  // <space separated children>
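  //
  // For illustration only (made-up values): after URL-decoding, a version-3
  // file entry looks like
  //   /dir/file.txt file part-0 0 1024 <modtime perm owner group>
  // while a version-3 directory entry carries its properties in the partName
  // slot and lists its children at the end:
  //   /dir dir <modtime perm owner group> 0 0 file.txt subdir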
  private class HarStatus {
    boolean isDir;
    String name;
    List<String> children;
    String partName;
    long startIndex;
    long length;
    long modificationTime = 0;

    public HarStatus(String harString) throws UnsupportedEncodingException {
      String[] splits = harString.split(" ");
      this.name = decodeFileName(splits[0]);
      this.isDir = "dir".equals(splits[1]);
      // this is equal to "none" if it's a directory
      this.partName = splits[2];
      this.startIndex = Long.parseLong(splits[3]);
      this.length = Long.parseLong(splits[4]);

      int version = metadata.getVersion();
      String[] propSplits = null;
      // propSplits is used to retrieve the metainformation that Har versions
      // 1 & 2 missed (modification time, permission, owner group).
      // These fields are stored in an encoded string placed in different
      // locations depending on whether it's a file or directory entry.
      // If it's a directory, the string is placed at the partName
      // location (directories have no partName because they have no data
      // to be stored). This is done because the number of fields in a
      // directory entry is unbounded (all children are listed at the end).
      // If it's a file, the string is the last field.
      if (isDir) {
        if (version == 3) {
          propSplits = decodeString(this.partName).split(" ");
        }
        children = new ArrayList<String>();
        for (int i = 5; i < splits.length; i++) {
          children.add(decodeFileName(splits[i]));
        }
      } else if (version == 3) {
        propSplits = decodeString(splits[5]).split(" ");
      }

      if (propSplits != null && propSplits.length >= 4) {
        modificationTime = Long.parseLong(propSplits[0]);
        // the fields below are stored in the file but are currently not used
        // by HarFileSystem
        // permission = new FsPermission(Short.parseShort(propSplits[1]));
        // owner = decodeString(propSplits[2]);
        // group = decodeString(propSplits[3]);
      }
    }

    public boolean isDir() {
      return isDir;
    }

    public String getName() {
      return name;
    }

    public String getPartName() {
      return partName;
    }

    public long getStartIndex() {
      return startIndex;
    }

    public long getLength() {
      return length;
    }

    public long getModificationTime() {
      return modificationTime;
    }
  }

  /**
   * Return the filestatus of files in the har archive.
   * The permissions returned are those of the archive
   * index files. The permissions are not persisted
   * while creating a hadoop archive.
   * @param f the path in har filesystem
   * @return filestatus.
   * @throws IOException
   */
  @Override
  public FileStatus getFileStatus(Path f) throws IOException {
    HarStatus hstatus = getFileHarStatus(f);
    return toFileStatus(hstatus, null);
  }

  private HarStatus getFileHarStatus(Path f) throws IOException {
    // get the fs DataInputStream for the underlying file
    // look up the index.
    Path p = makeQualified(f);
    Path harPath = getPathInHar(p);
    if (harPath == null) {
      throw new IOException("Invalid file name: " + f + " in " + uri);
    }
    HarStatus hstatus = metadata.archive.get(harPath);
    if (hstatus == null) {
      throw new FileNotFoundException("File: " + f + " does not exist in " + uri);
    }
    return hstatus;
  }

  /**
   * @return null since no checksum algorithm is implemented.
   */
  @Override
  public FileChecksum getFileChecksum(Path f) {
    return null;
  }

  /**
   * Returns a har input stream which fakes end of
   * file. It reads the index files to get the part
   * file name and the size and start of the file.
   */
  @Override
  public FSDataInputStream open(Path f, int bufferSize) throws IOException {
    // get the fs DataInputStream for the underlying file
    HarStatus hstatus = getFileHarStatus(f);
    if (hstatus.isDir()) {
      throw new FileNotFoundException(f + " : not a file in " +
          archivePath);
    }
    return new HarFSDataInputStream(fs, new Path(archivePath,
        hstatus.getPartName()),
        hstatus.getStartIndex(), hstatus.getLength(), bufferSize);
  }

  @Override
  public FSDataOutputStream create(Path f,
      FsPermission permission,
      boolean overwrite,
      int bufferSize,
      short replication,
      long blockSize,
      Progressable progress) throws IOException {
    throw new IOException("Har: create not allowed.");
  }

  @Override
  public FSDataOutputStream append(Path f, int bufferSize, Progressable progress) throws IOException {
    throw new IOException("Har: append not allowed.");
  }

  @Override
  public void close() throws IOException {
    if (fs != null) {
      try {
        fs.close();
      } catch (IOException ie) {
        // this might already be closed
        // ignore
      }
    }
  }

  /**
   * Not implemented.
   */
  @Override
  public boolean setReplication(Path src, short replication) throws IOException {
    throw new IOException("Har: setReplication not allowed");
  }

  @Override
  public boolean rename(Path src, Path dst) throws IOException {
    throw new IOException("Har: rename not allowed");
  }

  @Override
  public FSDataOutputStream append(Path f) throws IOException {
    throw new IOException("Har: append not allowed");
  }

  /**
   * Not implemented.
   */
  @Override
  public boolean delete(Path f, boolean recursive) throws IOException {
    throw new IOException("Har: delete not allowed");
  }

  /**
   * liststatus returns the children of a directory
   * after looking up the index files.
   */
  @Override
  public FileStatus[] listStatus(Path f) throws IOException {
    // need to see if the file is an index in file
    // get the filestatus of the archive directory
    // we will create fake filestatuses to return
    // to the client
    List<FileStatus> statuses = new ArrayList<FileStatus>();
    Path tmpPath = makeQualified(f);
    Path harPath = getPathInHar(tmpPath);
    HarStatus hstatus = metadata.archive.get(harPath);
    if (hstatus == null) {
      throw new FileNotFoundException("File " + f + " not found in " + archivePath);
    }
    if (hstatus.isDir()) {
      fileStatusesInIndex(hstatus, statuses);
    } else {
      statuses.add(toFileStatus(hstatus, null));
    }

    return statuses.toArray(new FileStatus[statuses.size()]);
  }

  /**
   * Return the top level archive path.
   */
  @Override
  public Path getHomeDirectory() {
    return new Path(uri.toString());
  }

  @Override
  public void setWorkingDirectory(Path newDir) {
    // does nothing.
  }

  /**
   * Not implemented.
   */
  @Override
  public boolean mkdirs(Path f, FsPermission permission) throws IOException {
    throw new IOException("Har: mkdirs not allowed");
  }

  /**
   * Not implemented.
   */
  @Override
  public void copyFromLocalFile(boolean delSrc, Path src, Path dst) throws
      IOException {
    throw new IOException("Har: copyfromlocalfile not allowed");
  }

  /**
   * Copies the file in the har filesystem to a local file.
   */
  @Override
  public void copyToLocalFile(boolean delSrc, Path src, Path dst)
      throws IOException {
    FileUtil.copy(this, src, getLocal(getConf()), dst, false, getConf());
  }

  /**
   * Not implemented.
   */
  @Override
  public Path startLocalOutput(Path fsOutputFile, Path tmpLocalFile)
      throws IOException {
    throw new IOException("Har: startLocalOutput not allowed");
  }

  /**
   * Not implemented.
   */
  @Override
  public void completeLocalOutput(Path fsOutputFile, Path tmpLocalFile)
      throws IOException {
    throw new IOException("Har: completeLocalOutput not allowed");
  }

  /**
   * Not implemented.
   */
  @Override
  public void setOwner(Path p, String username, String groupname)
      throws IOException {
    throw new IOException("Har: setowner not allowed");
  }

  /**
   * Not implemented.
   */
  @Override
  public void setPermission(Path p, FsPermission permission)
      throws IOException {
    throw new IOException("Har: setPermission not allowed");
  }

  /**
   * Hadoop archives input stream. This input stream fakes EOF
   * since archive files are part of bigger part files.
   */
  private static class HarFSDataInputStream extends FSDataInputStream {
    /**
     * Create an input stream that fakes all the reads/positions/seeking.
     */
    private static class HarFsInputStream extends FSInputStream
        implements CanSetDropBehind, CanSetReadahead {
      private long position, start, end;
      // The underlying data input stream that the
      // underlying filesystem will return.
      private FSDataInputStream underLyingStream;
      // one byte buffer
      private byte[] oneBytebuff = new byte[1];

      HarFsInputStream(FileSystem fs, Path path, long start,
          long length, int bufferSize) throws IOException {
        underLyingStream = fs.open(path, bufferSize);
        underLyingStream.seek(start);
        // the start of this file in the part file
        this.start = start;
        // the position pointer in the part file
        this.position = start;
        // the end pointer in the part file
        this.end = start + length;
      }
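
      /*
       * Worked example (made-up numbers): a 100-byte file stored at offset
       * 5000 of its part file gives start = 5000, end = 5100. position always
       * holds the absolute offset in the part file, so getPos() reports
       * position - start, and read() never returns more than end - position
       * bytes, faking EOF at the logical end of the archived file.
       */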

      @Override
      public synchronized int available() throws IOException {
        long remaining = end - underLyingStream.getPos();
        if (remaining > (long) Integer.MAX_VALUE) {
          return Integer.MAX_VALUE;
        }
        return (int) remaining;
      }

      @Override
      public synchronized void close() throws IOException {
        underLyingStream.close();
        super.close();
      }

      // not implemented
      @Override
      public void mark(int readLimit) {
        // do nothing
      }

      /**
       * reset is not implemented
       */
      @Override
      public void reset() throws IOException {
        throw new IOException("reset not implemented.");
      }

      @Override
      public synchronized int read() throws IOException {
        int ret = read(oneBytebuff, 0, 1);
        return (ret <= 0) ? -1 : (oneBytebuff[0] & 0xff);
      }

      @Override
      public synchronized int read(byte[] b) throws IOException {
        // delegate to the ranged read below, which already advances position;
        // incrementing it here as well would double-count the bytes read
        return read(b, 0, b.length);
      }

      /**
       * Read up to len bytes, clipped so that the read never goes past the
       * logical end of the archived file.
       */
      @Override
      public synchronized int read(byte[] b, int offset, int len)
          throws IOException {
        int newlen = len;
        int ret = -1;
        if (position + len > end) {
          newlen = (int) (end - position);
        }
        // end case
        if (newlen == 0)
          return ret;
        ret = underLyingStream.read(b, offset, newlen);
        if (ret > 0) {
          // only advance for bytes actually read; ret may be -1 at EOF
          position += ret;
        }
        return ret;
      }

      @Override
      public synchronized long skip(long n) throws IOException {
        long tmpN = n;
        if (tmpN > 0) {
          if (position + tmpN > end) {
            tmpN = end - position;
          }
          underLyingStream.seek(tmpN + position);
          position += tmpN;
          return tmpN;
        }
        return (tmpN < 0) ? -1 : 0;
      }

      @Override
      public synchronized long getPos() throws IOException {
        return (position - start);
      }

      @Override
      public synchronized void seek(long pos) throws IOException {
        if (pos < 0 || (start + pos > end)) {
          throw new IOException("Failed to seek: EOF");
        }
        position = start + pos;
        underLyingStream.seek(position);
      }

      @Override
      public boolean seekToNewSource(long targetPos) throws IOException {
        // do not need to implement this
        // hdfs in itself does seektonewsource
        // while reading.
        return false;
      }

      /**
       * Implementing position readable.
       */
      @Override
      public int read(long pos, byte[] b, int offset, int length)
          throws IOException {
        int nlength = length;
        if (start + nlength + pos > end) {
          nlength = (int) (end - (start + pos));
        }
        return underLyingStream.read(pos + start, b, offset, nlength);
      }

      /**
       * Position readable again.
       */
      @Override
      public void readFully(long pos, byte[] b, int offset, int length)
          throws IOException {
        if (start + length + pos > end) {
          throw new IOException("Not enough bytes to read.");
        }
        underLyingStream.readFully(pos + start, b, offset, length);
      }

      @Override
      public void readFully(long pos, byte[] b) throws IOException {
        readFully(pos, b, 0, b.length);
      }

      @Override
      public void setReadahead(Long readahead) throws IOException {
        underLyingStream.setReadahead(readahead);
      }

      @Override
      public void setDropBehind(Boolean dropBehind) throws IOException {
        underLyingStream.setDropBehind(dropBehind);
      }
    }

    /**
     * Constructor for har input stream.
     * @param fs the underlying filesystem
     * @param p The path in the underlying filesystem
     * @param start the start position in the part file
     * @param length the length of valid data in the part file
     * @param bufsize the buffer size
     * @throws IOException
     */
    public HarFSDataInputStream(FileSystem fs, Path p, long start,
        long length, int bufsize) throws IOException {
      super(new HarFsInputStream(fs, p, start, length, bufsize));
    }
  }

  private class HarMetaData {
    private FileSystem fs;
    private int version;
    // the masterIndex of the archive
    private Path masterIndexPath;
    // the index file
    private Path archiveIndexPath;

    private long masterIndexTimestamp;
    private long archiveIndexTimestamp;

    List<Store> stores = new ArrayList<Store>();
    Map<Path, HarStatus> archive = new HashMap<Path, HarStatus>();
    private Map<Path, FileStatus> partFileStatuses = new HashMap<Path, FileStatus>();

    public HarMetaData(FileSystem fs, Path masterIndexPath, Path archiveIndexPath) {
      this.fs = fs;
      this.masterIndexPath = masterIndexPath;
      this.archiveIndexPath = archiveIndexPath;
    }

    public FileStatus getPartFileStatus(Path partPath) throws IOException {
      FileStatus status;
      status = partFileStatuses.get(partPath);
      if (status == null) {
        status = fs.getFileStatus(partPath);
        partFileStatuses.put(partPath, status);
      }
      return status;
    }

    public long getMasterIndexTimestamp() {
      return masterIndexTimestamp;
    }

    public long getArchiveIndexTimestamp() {
      return archiveIndexTimestamp;
    }

    private int getVersion() {
      return version;
    }

    private void parseMetaData() throws IOException {
      Text line = new Text();
      long read;
      FSDataInputStream in = null;
      LineReader lin = null;

      try {
        in = fs.open(masterIndexPath);
        FileStatus masterStat = fs.getFileStatus(masterIndexPath);
        masterIndexTimestamp = masterStat.getModificationTime();
        lin = new LineReader(in, getConf());
        read = lin.readLine(line);

        // the first line contains the version of the index file
        String versionLine = line.toString();
        String[] arr = versionLine.split(" ");
        version = Integer.parseInt(arr[0]);
        // make it always backwards-compatible
        if (this.version > HarFileSystem.VERSION) {
          throw new IOException("Invalid version " +
              this.version + " expected " + HarFileSystem.VERSION);
        }

        // each remaining line contains a hashcode range and the byte offsets
        // of the corresponding section of the index file
        String[] readStr;
        while (read < masterStat.getLen()) {
          int b = lin.readLine(line);
          read += b;
          readStr = line.toString().split(" ");
          int startHash = Integer.parseInt(readStr[0]);
          int endHash = Integer.parseInt(readStr[1]);
          stores.add(new Store(Long.parseLong(readStr[2]),
              Long.parseLong(readStr[3]), startHash,
              endHash));
          line.clear();
        }
      } catch (IOException ioe) {
        LOG.warn("Encountered exception ", ioe);
        throw ioe;
      } finally {
        IOUtils.cleanup(LOG, lin, in);
      }
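
      // For illustration only (made-up numbers): a _masterindex for a small
      // archive could read
      //
      //   3
      //   0 1756908385 0 1740
      //
      // i.e. a version line followed by "<startHash> <endHash> <begin> <end>"
      // lines, where begin/end are byte offsets of the corresponding section
      // of the _index file parsed below.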

      FSDataInputStream aIn = fs.open(archiveIndexPath);
      try {
        FileStatus archiveStat = fs.getFileStatus(archiveIndexPath);
        archiveIndexTimestamp = archiveStat.getModificationTime();
        LineReader aLin;

        // now start reading the real index file
        for (Store s : stores) {
          read = 0;
          aIn.seek(s.begin);
          aLin = new LineReader(aIn, getConf());
          while (read + s.begin < s.end) {
            int tmp = aLin.readLine(line);
            read += tmp;
            String lineFeed = line.toString();
            String[] parsed = lineFeed.split(" ");
            parsed[0] = decodeFileName(parsed[0]);
            archive.put(new Path(parsed[0]), new HarStatus(lineFeed));
            line.clear();
          }
        }
      } finally {
        IOUtils.cleanup(LOG, aIn);
      }
    }
  }

  /*
   * testing purposes only:
   */
  HarMetaData getMetadata() {
    return metadata;
  }

  private static class LruCache<K, V> extends LinkedHashMap<K, V> {
    private final int MAX_ENTRIES;

    public LruCache(int maxEntries) {
      super(maxEntries + 1, 1.0f, true);
      MAX_ENTRIES = maxEntries;
    }

    @Override
    protected boolean removeEldestEntry(Map.Entry<K, V> eldest) {
      return size() > MAX_ENTRIES;
    }
  }
}