/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.fs;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.TreeMap;
import java.util.HashMap;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.LineReader;
import org.apache.hadoop.util.Progressable;

/**
 * This is an implementation of the Hadoop Archive
 * filesystem. An archive filesystem has index files
 * of the form _index* and data files of the form
 * part-*. The index files record where the archived
 * files live inside the part files. There are two of
 * them: _masterindex and _index. The _index file is
 * sorted by the hash code of the paths it contains,
 * and the _masterindex is a level of indirection into
 * it: for ranges of hash codes it stores pointers to
 * positions in the _index file, which makes look-ups
 * faster.
 */
public class HarFileSystem extends FilterFileSystem {

  private static final Log LOG = LogFactory.getLog(HarFileSystem.class);

  public static final String METADATA_CACHE_ENTRIES_KEY = "fs.har.metadatacache.entries";
  public static final int METADATA_CACHE_ENTRIES_DEFAULT = 10;

  public static final int VERSION = 3;

  private static Map<URI, HarMetaData> harMetaCache;

  // URI representation of this har filesystem
  private URI uri;
  // the top level path of the archive
  // in the underlying file system
  private Path archivePath;
  // the har auth
  private String harAuth;

  // pointer into the static metadata cache
  private HarMetaData metadata;

  /**
   * Public, no-argument constructor.
   */
  public HarFileSystem() {
  }

  /**
   * Return the protocol scheme for the FileSystem.
   * <p/>
   *
   * @return <code>har</code>
   */
  @Override
  public String getScheme() {
    return "har";
  }
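  // Illustrative usage sketch (the archive path and file names here are
  // hypothetical): clients normally reach this class through a "har" URI
  // rather than by constructing it directly, e.g.
  //
  //   Configuration conf = new Configuration();
  //   Path p = new Path("har:///user/alice/logs.har/2012/part-00000");
  //   FileSystem harFs = p.getFileSystem(conf); // a HarFileSystem, assuming
  //                                             // the "har" scheme maps here
  //   FSDataInputStream in = harFs.open(p);     // served from a part-* file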
  /**
   * Constructor to create a HarFileSystem with an
   * underlying filesystem.
   * @param fs the underlying filesystem
   */
  public HarFileSystem(FileSystem fs) {
    super(fs);
  }

  private synchronized void initializeMetadataCache(Configuration conf) {
    if (harMetaCache == null) {
      int cacheSize = conf.getInt(METADATA_CACHE_ENTRIES_KEY, METADATA_CACHE_ENTRIES_DEFAULT);
      harMetaCache = Collections.synchronizedMap(new LruCache<URI, HarMetaData>(cacheSize));
    }
  }

  /**
   * Initialize a Har filesystem per har archive. The
   * archive home directory is the top level directory
   * in the filesystem that contains the HAR archive.
   * Be careful with this method: you do not want to go
   * on creating new filesystem instances per call to
   * path.getFileSystem().
   * The URI of a har filesystem is either
   * har://underlyingfsscheme-host:port/archivepath or
   * har:///archivepath; the latter form uses the configured
   * default filesystem as the underlying filesystem.
   */
  @Override
  public void initialize(URI name, Configuration conf) throws IOException {
    // initialize the metadata cache, if needed
    initializeMetadataCache(conf);

    // decode the name
    URI underLyingURI = decodeHarURI(name, conf);
    // we got the right har path - now check if this
    // is truly a har filesystem
    Path harPath = archivePath(
      new Path(name.getScheme(), name.getAuthority(), name.getPath()));
    if (harPath == null) {
      throw new IOException("Invalid path for the Har Filesystem. " +
                            name.toString());
    }
    if (fs == null) {
      fs = FileSystem.get(underLyingURI, conf);
    }
    uri = harPath.toUri();
    archivePath = new Path(uri.getPath());
    harAuth = getHarAuth(underLyingURI);
    // check that the underlying fs contains
    // the index files
    Path masterIndexPath = new Path(archivePath, "_masterindex");
    Path archiveIndexPath = new Path(archivePath, "_index");
    if (!fs.exists(masterIndexPath) || !fs.exists(archiveIndexPath)) {
      throw new IOException("Invalid path for the Har Filesystem. " +
          "No index file in " + harPath);
    }

    metadata = harMetaCache.get(uri);
    if (metadata != null) {
      FileStatus mStat = fs.getFileStatus(masterIndexPath);
      FileStatus aStat = fs.getFileStatus(archiveIndexPath);
      if (mStat.getModificationTime() != metadata.getMasterIndexTimestamp() ||
          aStat.getModificationTime() != metadata.getArchiveIndexTimestamp()) {
        // the archive has been overwritten since we last read it;
        // remove the entry from the metadata cache
        metadata = null;
        harMetaCache.remove(uri);
      }
    }
    if (metadata == null) {
      metadata = new HarMetaData(fs, masterIndexPath, archiveIndexPath);
      metadata.parseMetaData();
      harMetaCache.put(uri, metadata);
    }
  }
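  // Illustrative URI mapping (host, port and paths are hypothetical):
  //   har://hdfs-namenode:8020/user/alice/logs.har
  // is decoded by decodeHarURI() below into the underlying URI
  //   hdfs://namenode:8020/user/alice/logs.har
  // while har:///user/alice/logs.har falls back to the default filesystem
  // configured for this cluster.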
  // get the version of the filesystem from the masterindex file
  // the version is currently not useful since it's the first version
  // of archives
  public int getHarVersion() throws IOException {
    if (metadata != null) {
      return metadata.getVersion();
    } else {
      throw new IOException("Invalid meta data for the Har Filesystem");
    }
  }

  /*
   * Find the parent path that is the archive path within
   * the given path: the last (deepest) path prefix that ends
   * with .har is the path that will be returned.
   */
  private Path archivePath(Path p) {
    Path retPath = null;
    Path tmp = p;
    for (int i = 0; i < p.depth(); i++) {
      if (tmp.toString().endsWith(".har")) {
        retPath = tmp;
        break;
      }
      tmp = tmp.getParent();
    }
    return retPath;
  }

  /**
   * Decode the raw URI to get the underlying URI.
   * @param rawURI raw Har URI
   * @param conf the configuration used to resolve the default filesystem
   * @return filtered URI of the underlying fileSystem
   */
  private URI decodeHarURI(URI rawURI, Configuration conf) throws IOException {
    String tmpAuth = rawURI.getAuthority();
    // if no authority is given, we are using the default
    // filesystem in the config, so create an underlying
    // uri from it and return that
    if (tmpAuth == null) {
      //create a path
      return FileSystem.getDefaultUri(conf);
    }
    String authority = rawURI.getAuthority();
    if (authority == null) {
      throw new IOException("URI: " + rawURI
          + " is an invalid Har URI since authority==null."
          + " Expecting har://<scheme>-<host>/<path>.");
    }

    int i = authority.indexOf('-');
    if (i < 0) {
      throw new IOException("URI: " + rawURI
          + " is an invalid Har URI since '-' not found."
          + " Expecting har://<scheme>-<host>/<path>.");
    }

    if (rawURI.getQuery() != null) {
      // query component not allowed
      throw new IOException("query component in Path not supported " + rawURI);
    }

    URI tmp = null;

    try {
      // convert <scheme>-<host> to <scheme>://<host>
      URI baseUri = new URI(authority.replaceFirst("-", "://"));

      tmp = new URI(baseUri.getScheme(), baseUri.getAuthority(),
            rawURI.getPath(), rawURI.getQuery(), rawURI.getFragment());
    } catch (URISyntaxException e) {
      throw new IOException("URI: " + rawURI
          + " is an invalid Har URI. Expecting har://<scheme>-<host>/<path>.");
    }
    return tmp;
  }

  private static String decodeString(String str)
    throws UnsupportedEncodingException {
    return URLDecoder.decode(str, "UTF-8");
  }

  private String decodeFileName(String fname)
    throws UnsupportedEncodingException {
    int version = metadata.getVersion();
    if (version == 2 || version == 3) {
      return decodeString(fname);
    }
    return fname;
  }

  /**
   * Return the top level archive.
   */
  @Override
  public Path getWorkingDirectory() {
    return new Path(uri.toString());
  }

  /**
   * Create a har specific auth of the form
   * underlyingfsscheme-host:port.
   * @param underLyingUri the URI of the underlying filesystem
   * @return har specific auth
   */
  private String getHarAuth(URI underLyingUri) {
    String auth = underLyingUri.getScheme() + "-";
    if (underLyingUri.getHost() != null) {
      auth += underLyingUri.getHost() + ":";
      if (underLyingUri.getPort() != -1) {
        auth += underLyingUri.getPort();
      }
    } else {
      auth += ":";
    }
    return auth;
  }
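  // Illustrative auth strings (host and port hypothetical): an underlying URI
  // of hdfs://namenode:8020 yields "hdfs-namenode:8020"; a URI without a host,
  // such as file:///, yields "file-:".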
  /**
   * Returns the uri of this filesystem.
   * The uri is of the form
   * har://underlyingfsscheme-host:port/pathintheunderlyingfs
   */
  @Override
  public URI getUri() {
    return this.uri;
  }

  /**
   * Returns the path inside the har filesystem, i.e. the
   * path relative to the root of the archive.
   * @param path the fully qualified path in the har filesystem.
   * @return relative path in the filesystem.
   */
  private Path getPathInHar(Path path) {
    Path harPath = new Path(path.toUri().getPath());
    if (archivePath.compareTo(harPath) == 0)
      return new Path(Path.SEPARATOR);
    Path tmp = new Path(harPath.getName());
    Path parent = harPath.getParent();
    while (!(parent.compareTo(archivePath) == 0)) {
      if (parent.toString().equals(Path.SEPARATOR)) {
        tmp = null;
        break;
      }
      tmp = new Path(parent.getName(), tmp);
      parent = parent.getParent();
    }
    if (tmp != null)
      tmp = new Path(Path.SEPARATOR, tmp);
    return tmp;
  }

  // Re-root the path p under the given initial path, keeping this
  // filesystem's scheme and authority. Parsing and string manipulation
  // are error prone, so just use the Path API to do it.
  private Path makeRelative(String initial, Path p) {
    String scheme = this.uri.getScheme();
    String authority = this.uri.getAuthority();
    Path root = new Path(Path.SEPARATOR);
    if (root.compareTo(p) == 0)
      return new Path(scheme, authority, initial);
    Path retPath = new Path(p.getName());
    Path parent = p.getParent();
    for (int i = 0; i < p.depth() - 1; i++) {
      retPath = new Path(parent.getName(), retPath);
      parent = parent.getParent();
    }
    return new Path(new Path(scheme, authority, initial),
      retPath.toString());
  }

  /* This makes a path qualified in the har filesystem
   * (non-Javadoc)
   * @see org.apache.hadoop.fs.FilterFileSystem#makeQualified(
   * org.apache.hadoop.fs.Path)
   */
  @Override
  public Path makeQualified(Path path) {
    // make sure that we just get the
    // path component
    Path fsPath = path;
    if (!path.isAbsolute()) {
      fsPath = new Path(archivePath, path);
    }

    URI tmpURI = fsPath.toUri();
    // change this to a har URI
    return new Path(uri.getScheme(), harAuth, tmpURI.getPath());
  }
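  // Illustrative path mapping (paths and host hypothetical): with
  // archivePath = /user/alice/logs.har, makeQualified() turns the relative
  // path "2012/part-00000" into
  // har://hdfs-namenode:8020/user/alice/logs.har/2012/part-00000, and
  // getPathInHar() maps that qualified path to /2012/part-00000, which is the
  // key used to look the entry up in the parsed _index.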
  /**
   * Fix offset and length of block locations.
   * Note that this method modifies the original array.
   * @param locations block locations of har part file
   * @param start the start of the desired range in the contained file
   * @param len the length of the desired range
   * @param fileOffsetInHar the offset of the desired file in the har part file
   * @return block locations with fixed offset and length
   */
  static BlockLocation[] fixBlockLocations(BlockLocation[] locations,
                                          long start,
                                          long len,
                                          long fileOffsetInHar) {
    // offset 1 past last byte of desired range
    long end = start + len;

    for (BlockLocation location : locations) {
      // offset of part block relative to beginning of desired file
      // (may be negative if file starts in this part block)
      long harBlockStart = location.getOffset() - fileOffsetInHar;
      // offset 1 past last byte of har block relative to beginning of
      // desired file
      long harBlockEnd = harBlockStart + location.getLength();

      if (start > harBlockStart) {
        // desired range starts after beginning of this har block
        // fix offset to beginning of relevant range (relative to desired file)
        location.setOffset(start);
        // fix length to relevant portion of har block
        location.setLength(location.getLength() - (start - harBlockStart));
      } else {
        // desired range includes beginning of this har block
        location.setOffset(harBlockStart);
      }

      if (harBlockEnd > end) {
        // desired range ends before the end of this har block
        // fix length to remove the irrelevant portion at the end
        location.setLength(location.getLength() - (harBlockEnd - end));
      }
    }

    return locations;
  }

  /**
   * Get block locations from the underlying fs and fix their
   * offsets and lengths.
   * @param file the input file status whose block locations are wanted
   * @param start the start of the desired range in the contained file
   * @param len the length of the desired range
   * @return block locations for this segment of file
   * @throws IOException
   */
  @Override
  public BlockLocation[] getFileBlockLocations(FileStatus file, long start,
                                               long len) throws IOException {
    HarStatus hstatus = getFileHarStatus(file.getPath());
    Path partPath = new Path(archivePath, hstatus.getPartName());
    FileStatus partStatus = metadata.getPartFileStatus(partPath);

    // get all part blocks that overlap with the desired file blocks
    BlockLocation[] locations =
      fs.getFileBlockLocations(partStatus,
                               hstatus.getStartIndex() + start, len);

    return fixBlockLocations(locations, start, len, hstatus.getStartIndex());
  }
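  // Worked example for fixBlockLocations() (all numbers hypothetical): suppose
  // the archived file begins at fileOffsetInHar = 1000 in the part file and the
  // caller asks for start = 0, len = 500 (so end = 500). A part-file block at
  // offset 768 with length 512 has harBlockStart = -232 and harBlockEnd = 280,
  // so it is clipped to offset 0, length 280; the next block at offset 1280
  // with length 512 has harBlockStart = 280 and harBlockEnd = 792, so it is
  // clipped to offset 280, length 220. Together the two locations cover
  // exactly bytes [0, 500) of the archived file.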
  /**
   * The hash of the path p inside the filesystem.
   * @param p the path in the har filesystem
   * @return the hash code of the path.
   */
  public static int getHarHash(Path p) {
    return (p.toString().hashCode() & 0x7fffffff);
  }

  static class Store {
    public Store() {
      begin = end = startHash = endHash = 0;
    }
    public Store(long begin, long end, int startHash, int endHash) {
      this.begin = begin;
      this.end = end;
      this.startHash = startHash;
      this.endHash = endHash;
    }
    public long begin;
    public long end;
    public int startHash;
    public int endHash;
  }

  /**
   * Get the file statuses of all the children of a given directory. This reads
   * through the archive index line by line to collect the statuses of the
   * children of a directory; it is a brute force way of getting them.
   *
   * @param parent
   *          the parent path directory
   * @param statuses
   *          the list to add the children file statuses to
   * @param children
   *          the string list of children for this parent
   */
  private void fileStatusesInIndex(HarStatus parent, List<FileStatus> statuses,
      List<String> children) throws IOException {
    String parentString = parent.getName();
    if (!parentString.endsWith(Path.SEPARATOR)) {
      parentString += Path.SEPARATOR;
    }
    Path harPath = new Path(parentString);
    int harlen = harPath.depth();
    final Map<String, FileStatus> cache = new TreeMap<String, FileStatus>();

    for (HarStatus hstatus : metadata.archive.values()) {
      String child = hstatus.getName();
      if ((child.startsWith(parentString))) {
        Path thisPath = new Path(child);
        if (thisPath.depth() == harlen + 1) {
          statuses.add(toFileStatus(hstatus, cache));
        }
      }
    }
  }

  /**
   * Combine the status stored in the index and the underlying status.
   * @param h status stored in the index
   * @param cache caching the underlying file statuses
   * @return the combined file status
   * @throws IOException
   */
  private FileStatus toFileStatus(HarStatus h,
      Map<String, FileStatus> cache) throws IOException {
    FileStatus underlying = null;
    if (cache != null) {
      underlying = cache.get(h.partName);
    }
    if (underlying == null) {
      final Path p = h.isDir ? archivePath : new Path(archivePath, h.partName);
      underlying = fs.getFileStatus(p);
      if (cache != null) {
        cache.put(h.partName, underlying);
      }
    }

    long modTime = 0;
    int version = metadata.getVersion();
    if (version < 3) {
      modTime = underlying.getModificationTime();
    } else if (version == 3) {
      modTime = h.getModificationTime();
    }

    return new FileStatus(
        h.isDir() ? 0L : h.getLength(),
        h.isDir(),
        underlying.getReplication(),
        underlying.getBlockSize(),
        modTime,
        underlying.getAccessTime(),
        underlying.getPermission(),
        underlying.getOwner(),
        underlying.getGroup(),
        makeRelative(this.uri.getPath(), new Path(h.name)));
  }

  // A single-line parser for hadoop archive statuses as stored,
  // one per line, in the index files.
  // The format of a line is:
  // filename "dir"/"file" partFileName startIndex length
  // <space separated children>
  private class HarStatus {
    boolean isDir;
    String name;
    List<String> children;
    String partName;
    long startIndex;
    long length;
    long modificationTime = 0;

    public HarStatus(String harString) throws UnsupportedEncodingException {
      String[] splits = harString.split(" ");
      this.name = decodeFileName(splits[0]);
      this.isDir = "dir".equals(splits[1]);
      // this is equal to "none" if it's a directory
      this.partName = splits[2];
      this.startIndex = Long.parseLong(splits[3]);
      this.length = Long.parseLong(splits[4]);

      int version = metadata.getVersion();
      String[] propSplits = null;
      // propSplits is used to retrieve the metainformation that Har versions
      // 1 & 2 missed (modification time, permission, owner group).
      // These fields are stored in an encoded string placed in different
      // locations depending on whether it's a file or directory entry.
      // If it's a directory, the string is placed in the partName
      // slot (directories have no partName because they have no data
      // to store). This is done because the number of fields in a
      // directory entry is unbounded (all children are listed at the end).
      // If it's a file, the string is the last field.
      if (isDir) {
        if (version == 3) {
          propSplits = decodeString(this.partName).split(" ");
        }
        children = new ArrayList<String>();
        for (int i = 5; i < splits.length; i++) {
          children.add(decodeFileName(splits[i]));
        }
      } else if (version == 3) {
        propSplits = decodeString(splits[5]).split(" ");
      }

      if (propSplits != null && propSplits.length >= 4) {
        modificationTime = Long.parseLong(propSplits[0]);
        // the fields below are stored in the file but are currently not used
        // by HarFileSystem
        // permission = new FsPermission(Short.parseShort(propSplits[1]));
        // owner = decodeString(propSplits[2]);
        // group = decodeString(propSplits[3]);
      }
    }

    public boolean isDir() {
      return isDir;
    }

    public String getName() {
      return name;
    }

    public String getPartName() {
      return partName;
    }

    public long getStartIndex() {
      return startIndex;
    }

    public long getLength() {
      return length;
    }

    public long getModificationTime() {
      return modificationTime;
    }
  }
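  // Illustrative _index lines (names and numbers hypothetical; fields are
  // space separated and names are URL-encoded, so "/" appears as %2F).
  // A version-3 file entry looks roughly like
  //   %2F2012%2Fpart-00000 file part-0 0 1048576 <encoded modtime/perm/owner/group>
  // meaning the archived file /2012/part-00000 lives in part-0 starting at
  // offset 0 with length 1048576. A directory entry carries the encoded
  // property string in the partName slot and lists its children at the end:
  //   %2F2012 dir <encoded modtime/perm/owner/group> 0 0 part-00000 part-00001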
  /**
   * Return the file status of a file in the har archive.
   * The permissions returned are those of the archive
   * index files; permissions are not persisted
   * while creating a hadoop archive.
   * @param f the path in har filesystem
   * @return filestatus.
   * @throws IOException
   */
  @Override
  public FileStatus getFileStatus(Path f) throws IOException {
    HarStatus hstatus = getFileHarStatus(f);
    return toFileStatus(hstatus, null);
  }

  private HarStatus getFileHarStatus(Path f) throws IOException {
    // get the fs DataInputStream for the underlying file
    // look up the index.
    Path p = makeQualified(f);
    Path harPath = getPathInHar(p);
    if (harPath == null) {
      throw new IOException("Invalid file name: " + f + " in " + uri);
    }
    HarStatus hstatus = metadata.archive.get(harPath);
    if (hstatus == null) {
      throw new FileNotFoundException("File: " + f + " does not exist in " + uri);
    }
    return hstatus;
  }

  /**
   * @return null since no checksum algorithm is implemented.
   */
  @Override
  public FileChecksum getFileChecksum(Path f) {
    return null;
  }
  /**
   * Returns a har input stream which fakes end of
   * file. It reads the index files to get the part
   * file name and the size and start of the file.
   */
  @Override
  public FSDataInputStream open(Path f, int bufferSize) throws IOException {
    // get the fs DataInputStream for the underlying file
    HarStatus hstatus = getFileHarStatus(f);
    // we got it.. woo hooo!!!
    if (hstatus.isDir()) {
      throw new FileNotFoundException(f + " : not a file in " +
                archivePath);
    }
    return new HarFSDataInputStream(fs, new Path(archivePath,
        hstatus.getPartName()),
        hstatus.getStartIndex(), hstatus.getLength(), bufferSize);
  }

  @Override
  public FSDataOutputStream create(Path f,
      FsPermission permission,
      boolean overwrite,
      int bufferSize,
      short replication,
      long blockSize,
      Progressable progress) throws IOException {
    throw new IOException("Har: create not allowed.");
  }

  @Override
  public void close() throws IOException {
    if (fs != null) {
      try {
        fs.close();
      } catch (IOException ie) {
        // this might already be closed
        // ignore
      }
    }
  }

  /**
   * Not implemented.
   */
  @Override
  public boolean setReplication(Path src, short replication) throws IOException {
    throw new IOException("Har: setreplication not allowed");
  }

  /**
   * Not implemented.
   */
  @Override
  public boolean delete(Path f, boolean recursive) throws IOException {
    throw new IOException("Har: delete not allowed");
  }

  /**
   * listStatus returns the children of a directory
   * after looking up the index files.
   */
  @Override
  public FileStatus[] listStatus(Path f) throws IOException {
    // look the path up in the parsed index and create
    // fake file statuses to return to the client
    List<FileStatus> statuses = new ArrayList<FileStatus>();
    Path tmpPath = makeQualified(f);
    Path harPath = getPathInHar(tmpPath);
    HarStatus hstatus = metadata.archive.get(harPath);
    if (hstatus == null) {
      throw new FileNotFoundException("File " + f + " not found in " + archivePath);
    }
    if (hstatus.isDir()) {
      fileStatusesInIndex(hstatus, statuses, hstatus.children);
    } else {
      statuses.add(toFileStatus(hstatus, null));
    }

    return statuses.toArray(new FileStatus[statuses.size()]);
  }

  /**
   * Return the top level archive path.
   */
  @Override
  public Path getHomeDirectory() {
    return new Path(uri.toString());
  }

  @Override
  public void setWorkingDirectory(Path newDir) {
    // does nothing.
  }

  /**
   * Not implemented.
   */
  @Override
  public boolean mkdirs(Path f, FsPermission permission) throws IOException {
    throw new IOException("Har: mkdirs not allowed");
  }

  /**
   * Not implemented.
   */
  @Override
  public void copyFromLocalFile(boolean delSrc, Path src, Path dst) throws
        IOException {
    throw new IOException("Har: copyfromlocalfile not allowed");
  }

  /**
   * Copies a file in the har filesystem to a local file.
   */
  @Override
  public void copyToLocalFile(boolean delSrc, Path src, Path dst)
    throws IOException {
    FileUtil.copy(this, src, getLocal(getConf()), dst, false, getConf());
  }

  /**
   * Not implemented.
   */
  @Override
  public Path startLocalOutput(Path fsOutputFile, Path tmpLocalFile)
    throws IOException {
    throw new IOException("Har: startLocalOutput not allowed");
  }

  /**
   * Not implemented.
   */
  @Override
  public void completeLocalOutput(Path fsOutputFile, Path tmpLocalFile)
    throws IOException {
    throw new IOException("Har: completeLocalOutput not allowed");
  }
  /**
   * Not implemented.
   */
  @Override
  public void setOwner(Path p, String username, String groupname)
    throws IOException {
    throw new IOException("Har: setowner not allowed");
  }

  /**
   * Not implemented.
   */
  @Override
  public void setPermission(Path p, FsPermission permission)
    throws IOException {
    throw new IOException("Har: setPermission not allowed");
  }

  /**
   * Hadoop archives input stream. This input stream fakes EOF
   * since archive files are part of bigger part files.
   */
  private static class HarFSDataInputStream extends FSDataInputStream {
    /**
     * Create an input stream that fakes all the reads/positions/seeking.
     */
    private static class HarFsInputStream extends FSInputStream {
      private long position, start, end;
      // The underlying data input stream that the
      // underlying filesystem will return.
      private FSDataInputStream underLyingStream;
      // one byte buffer
      private byte[] oneBytebuff = new byte[1];

      HarFsInputStream(FileSystem fs, Path path, long start,
          long length, int bufferSize) throws IOException {
        underLyingStream = fs.open(path, bufferSize);
        underLyingStream.seek(start);
        // the start of this file in the part file
        this.start = start;
        // the position pointer in the part file
        this.position = start;
        // the end pointer in the part file
        this.end = start + length;
      }

      @Override
      public synchronized int available() throws IOException {
        long remaining = end - underLyingStream.getPos();
        if (remaining > (long)Integer.MAX_VALUE) {
          return Integer.MAX_VALUE;
        }
        return (int) remaining;
      }

      @Override
      public synchronized void close() throws IOException {
        underLyingStream.close();
        super.close();
      }

      // not implemented
      @Override
      public void mark(int readLimit) {
        // do nothing
      }

      /**
       * reset is not implemented
       */
      @Override
      public void reset() throws IOException {
        throw new IOException("reset not implemented.");
      }

      @Override
      public synchronized int read() throws IOException {
        int ret = read(oneBytebuff, 0, 1);
        return (ret <= 0) ? -1 : (oneBytebuff[0] & 0xff);
      }

      @Override
      public synchronized int read(byte[] b) throws IOException {
        // delegate to the ranged read below, which already advances position;
        // advancing it here as well would count the bytes twice
        return read(b, 0, b.length);
      }

      @Override
      public synchronized int read(byte[] b, int offset, int len)
        throws IOException {
        int newlen = len;
        int ret = -1;
        if (position + len > end) {
          newlen = (int) (end - position);
        }
        // end case
        if (newlen == 0)
          return ret;
        ret = underLyingStream.read(b, offset, newlen);
        // only advance the pointer when bytes were actually read
        if (ret > 0) {
          position += ret;
        }
        return ret;
      }
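      // Illustrative numbers: for an archived file stored at start = 4096 with
      // length = 1000, end is 5096. If position is 5000 and a caller asks for
      // 500 bytes, newlen becomes 96, so the read is clipped at the logical
      // end of the archived file even though the part file continues past it.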
      @Override
      public synchronized long skip(long n) throws IOException {
        long tmpN = n;
        if (tmpN > 0) {
          if (position + tmpN > end) {
            tmpN = end - position;
          }
          underLyingStream.seek(tmpN + position);
          position += tmpN;
          return tmpN;
        }
        return (tmpN < 0) ? -1 : 0;
      }

      @Override
      public synchronized long getPos() throws IOException {
        return (position - start);
      }

      @Override
      public synchronized void seek(long pos) throws IOException {
        if (pos < 0 || (start + pos > end)) {
          throw new IOException("Failed to seek: EOF");
        }
        position = start + pos;
        underLyingStream.seek(position);
      }

      @Override
      public boolean seekToNewSource(long targetPos) throws IOException {
        // do not need to implement this;
        // hdfs itself does seekToNewSource
        // while reading.
        return false;
      }

      /**
       * Implement the PositionedReadable interface.
       */
      @Override
      public int read(long pos, byte[] b, int offset, int length)
        throws IOException {
        int nlength = length;
        if (start + nlength + pos > end) {
          nlength = (int) (end - (start + pos));
        }
        return underLyingStream.read(pos + start, b, offset, nlength);
      }

      /**
       * PositionedReadable again.
       */
      @Override
      public void readFully(long pos, byte[] b, int offset, int length)
        throws IOException {
        if (start + length + pos > end) {
          throw new IOException("Not enough bytes to read.");
        }
        underLyingStream.readFully(pos + start, b, offset, length);
      }

      @Override
      public void readFully(long pos, byte[] b) throws IOException {
        readFully(pos, b, 0, b.length);
      }

    }

    /**
     * Constructor for a har input stream.
     * @param fs the underlying filesystem
     * @param p the path in the underlying filesystem
     * @param start the start position in the part file
     * @param length the length of valid data in the part file
     * @param bufsize the buffer size
     * @throws IOException
     */
    public HarFSDataInputStream(FileSystem fs, Path p, long start,
        long length, int bufsize) throws IOException {
        super(new HarFsInputStream(fs, p, start, length, bufsize));
    }
    /**
     * Constructor for a har input stream.
     * @param fs the underlying filesystem
     * @param p the path in the underlying file system
     * @param start the start position in the part file
     * @param length the length of valid data in the part file.
     * @throws IOException
     */
    public HarFSDataInputStream(FileSystem fs, Path p, long start, long length)
      throws IOException {
      super(new HarFsInputStream(fs, p, start, length, 0));
    }
  }

  private class HarMetaData {
    private FileSystem fs;
    private int version;
    // the masterindex of the archive
    private Path masterIndexPath;
    // the index file
    private Path archiveIndexPath;

    private long masterIndexTimestamp;
    private long archiveIndexTimestamp;

    List<Store> stores = new ArrayList<Store>();
    Map<Path, HarStatus> archive = new HashMap<Path, HarStatus>();
    private Map<Path, FileStatus> partFileStatuses = new HashMap<Path, FileStatus>();

    public HarMetaData(FileSystem fs, Path masterIndexPath, Path archiveIndexPath) {
      this.fs = fs;
      this.masterIndexPath = masterIndexPath;
      this.archiveIndexPath = archiveIndexPath;
    }

    public FileStatus getPartFileStatus(Path partPath) throws IOException {
      FileStatus status;
      status = partFileStatuses.get(partPath);
      if (status == null) {
        status = fs.getFileStatus(partPath);
        partFileStatuses.put(partPath, status);
      }
      return status;
    }

    public long getMasterIndexTimestamp() {
      return masterIndexTimestamp;
    }

    public long getArchiveIndexTimestamp() {
      return archiveIndexTimestamp;
    }

    private int getVersion() {
      return version;
    }

    private void parseMetaData() throws IOException {
      Text line;
      long read;
      FSDataInputStream in = null;
      LineReader lin = null;

      try {
        in = fs.open(masterIndexPath);
        FileStatus masterStat = fs.getFileStatus(masterIndexPath);
        masterIndexTimestamp = masterStat.getModificationTime();
        lin = new LineReader(in, getConf());
        line = new Text();
        read = lin.readLine(line);

        // the first line contains the version of the index file
        String versionLine = line.toString();
        String[] arr = versionLine.split(" ");
        version = Integer.parseInt(arr[0]);
        // make it always backwards-compatible
        if (this.version > HarFileSystem.VERSION) {
          throw new IOException("Invalid version " +
              this.version + " expected " + HarFileSystem.VERSION);
        }

        // each remaining line contains a hashcode range and the
        // corresponding byte range in the index file
        String[] readStr = null;
        while (read < masterStat.getLen()) {
          int b = lin.readLine(line);
          read += b;
          readStr = line.toString().split(" ");
          int startHash = Integer.parseInt(readStr[0]);
          int endHash = Integer.parseInt(readStr[1]);
          stores.add(new Store(Long.parseLong(readStr[2]),
              Long.parseLong(readStr[3]), startHash,
              endHash));
          line.clear();
        }
      } finally {
        IOUtils.cleanup(LOG, lin, in);
      }
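      // Illustrative _masterindex content (numbers hypothetical): the first
      // token of the first line is the version; every following line is
      // "startHash endHash begin end", where [begin, end) is the byte range
      // of the _index file holding entries whose path hashes fall in that
      // hash range:
      //
      //   3
      //   0 924648698 0 204
      //   924648698 2147483646 204 411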
      FSDataInputStream aIn = fs.open(archiveIndexPath);
      try {
        FileStatus archiveStat = fs.getFileStatus(archiveIndexPath);
        archiveIndexTimestamp = archiveStat.getModificationTime();
        LineReader aLin;

        // now start reading the real index file
        for (Store s : stores) {
          read = 0;
          aIn.seek(s.begin);
          aLin = new LineReader(aIn, getConf());
          while (read + s.begin < s.end) {
            int tmp = aLin.readLine(line);
            read += tmp;
            String lineFeed = line.toString();
            String[] parsed = lineFeed.split(" ");
            parsed[0] = decodeFileName(parsed[0]);
            archive.put(new Path(parsed[0]), new HarStatus(lineFeed));
            line.clear();
          }
        }
      } finally {
        IOUtils.cleanup(LOG, aIn);
      }
    }
  }

  /*
   * testing purposes only:
   */
  HarMetaData getMetadata() {
    return metadata;
  }

  private static class LruCache<K, V> extends LinkedHashMap<K, V> {
    private final int MAX_ENTRIES;

    public LruCache(int maxEntries) {
      super(maxEntries + 1, 1.0f, true);
      MAX_ENTRIES = maxEntries;
    }

    @Override
    protected boolean removeEldestEntry(Map.Entry<K, V> eldest) {
      return size() > MAX_ENTRIES;
    }
  }
}