/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.fs;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.TreeMap;
import java.util.HashMap;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.LineReader;
import org.apache.hadoop.util.Progressable;

/**
 * This is an implementation of the Hadoop Archive (HAR)
 * filesystem. An archive contains index files of the form
 * _index* and data files of the form part-*. The index files
 * store the metadata of the archived files: the _index file is
 * sorted by the hash code of the paths it contains, and the
 * _masterindex file is a level of indirection into _index that
 * maps ranges of hash codes to positions in _index, which makes
 * lookups faster.
 */
public class HarFileSystem extends FilterFileSystem {

  private static final Log LOG = LogFactory.getLog(HarFileSystem.class);

  public static final String METADATA_CACHE_ENTRIES_KEY = "fs.har.metadatacache.entries";
  public static final int METADATA_CACHE_ENTRIES_DEFAULT = 10;

  public static final int VERSION = 3;

  private static Map<URI, HarMetaData> harMetaCache;

  // URI representation of this har filesystem
  private URI uri;
  // the top-level path of the archive
  // in the underlying filesystem
  private Path archivePath;
  // the har authority
  private String harAuth;

  // pointer into the static metadata cache
  private HarMetaData metadata;

  /**
   * Public constructor.
   */
  public HarFileSystem() {
  }

  /**
   * Constructor to create a HarFileSystem with an
   * underlying filesystem.
   * @param fs the underlying filesystem
   */
  public HarFileSystem(FileSystem fs) {
    super(fs);
  }

  private synchronized void initializeMetadataCache(Configuration conf) {
    if (harMetaCache == null) {
      int cacheSize = conf.getInt(METADATA_CACHE_ENTRIES_KEY, METADATA_CACHE_ENTRIES_DEFAULT);
      harMetaCache = Collections.synchronizedMap(new LruCache<URI, HarMetaData>(cacheSize));
    }
  }

  /**
   * Initialize a Har filesystem per har archive. The
   * archive home directory is the top-level directory
   * in the filesystem that contains the HAR archive.
   * Be careful with this method: you do not want to go
   * on creating new filesystem instances per call to
   * path.getFileSystem().
   * The URI of a har filesystem is either
   * har://underlyingfsscheme-host:port/archivepath
   * or
   * har:///archivepath, in which case the default underlying
   * filesystem from the configuration is used.
   */
  public void initialize(URI name, Configuration conf) throws IOException {
    // initialize the metadata cache, if needed
    initializeMetadataCache(conf);

    // decode the name
    URI underLyingURI = decodeHarURI(name, conf);
    // we got the right har path - now check if this is
    // truly a har filesystem
    Path harPath = archivePath(
      new Path(name.getScheme(), name.getAuthority(), name.getPath()));
    if (harPath == null) {
      throw new IOException("Invalid path for the Har Filesystem. " +
                            name.toString());
    }
    if (fs == null) {
      fs = FileSystem.get(underLyingURI, conf);
    }
    uri = harPath.toUri();
    archivePath = new Path(uri.getPath());
    harAuth = getHarAuth(underLyingURI);
    // check that the underlying fs contains the index files
    Path masterIndexPath = new Path(archivePath, "_masterindex");
    Path archiveIndexPath = new Path(archivePath, "_index");
    if (!fs.exists(masterIndexPath) || !fs.exists(archiveIndexPath)) {
      throw new IOException("Invalid path for the Har Filesystem. " +
          "No index file in " + harPath);
    }

    metadata = harMetaCache.get(uri);
    if (metadata != null) {
      FileStatus mStat = fs.getFileStatus(masterIndexPath);
      FileStatus aStat = fs.getFileStatus(archiveIndexPath);
      if (mStat.getModificationTime() != metadata.getMasterIndexTimestamp() ||
          aStat.getModificationTime() != metadata.getArchiveIndexTimestamp()) {
        // the archive has been overwritten since we last read it;
        // remove the entry from the metadata cache
        metadata = null;
        harMetaCache.remove(uri);
      }
    }
    if (metadata == null) {
      metadata = new HarMetaData(fs, masterIndexPath, archiveIndexPath);
      metadata.parseMetaData();
      harMetaCache.put(uri, metadata);
    }
  }

  // get the version of the filesystem from the masterindex file.
  // the version is currently not useful since this is the first version
  // of archives
  public int getHarVersion() throws IOException {
    if (metadata != null) {
      return metadata.getVersion();
    } else {
      throw new IOException("Invalid meta data for the Har Filesystem");
    }
  }

  /*
   * find the parent path that is the
   * archive path in the path. The last
   * path segment that ends with .har is
   * the path that will be returned.
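   * For example, for the illustrative input /user/foo/bar.har/dir/file,
   * the returned archive path is /user/foo/bar.har.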
   */
  private Path archivePath(Path p) {
    Path retPath = null;
    Path tmp = p;
    for (int i = 0; i < p.depth(); i++) {
      if (tmp.toString().endsWith(".har")) {
        retPath = tmp;
        break;
      }
      tmp = tmp.getParent();
    }
    return retPath;
  }

  /**
   * decode the raw URI to get the underlying URI
   * @param rawURI raw Har URI
   * @return filtered URI of the underlying fileSystem
   */
  private URI decodeHarURI(URI rawURI, Configuration conf) throws IOException {
    String tmpAuth = rawURI.getAuthority();
    // we are using the default filesystem in the config,
    // so create an underlying uri and return it
    if (tmpAuth == null) {
      // create a path
      return FileSystem.getDefaultUri(conf);
    }
    String host = rawURI.getHost();
    if (host == null) {
      throw new IOException("URI: " + rawURI
          + " is an invalid Har URI since host==null."
          + " Expecting har://<scheme>-<host>/<path>.");
    }
    int i = host.indexOf('-');
    if (i < 0) {
      throw new IOException("URI: " + rawURI
          + " is an invalid Har URI since '-' not found."
          + " Expecting har://<scheme>-<host>/<path>.");
    }
    final String underLyingScheme = host.substring(0, i);
    i++;
    final String underLyingHost = i == host.length() ? null : host.substring(i);
    int underLyingPort = rawURI.getPort();
    String auth = (underLyingHost == null && underLyingPort == -1) ?
                  null : (underLyingHost + ":" + underLyingPort);
    URI tmp = null;
    if (rawURI.getQuery() != null) {
      // query component not allowed
      throw new IOException("query component in Path not supported " + rawURI);
    }
    try {
      tmp = new URI(underLyingScheme, auth, rawURI.getPath(),
            rawURI.getQuery(), rawURI.getFragment());
    } catch (URISyntaxException e) {
      // do nothing; should not happen
    }
    return tmp;
  }

  private static String decodeString(String str)
    throws UnsupportedEncodingException {
    return URLDecoder.decode(str, "UTF-8");
  }

  private String decodeFileName(String fname)
    throws UnsupportedEncodingException {
    int version = metadata.getVersion();
    if (version == 2 || version == 3) {
      return decodeString(fname);
    }
    return fname;
  }

  /**
   * return the top-level archive.
   */
  public Path getWorkingDirectory() {
    return new Path(uri.toString());
  }

  /**
   * Create a har specific auth of the form
   * underlyingfsscheme-host:port.
   * @param underLyingUri the uri of the underlying filesystem
   * @return har specific auth
   */
  private String getHarAuth(URI underLyingUri) {
    String auth = underLyingUri.getScheme() + "-";
    if (underLyingUri.getHost() != null) {
      auth += underLyingUri.getHost() + ":";
      if (underLyingUri.getPort() != -1) {
        auth += underLyingUri.getPort();
      }
    } else {
      auth += ":";
    }
    return auth;
  }

  /**
   * Returns the uri of this filesystem.
   * The uri is of the form
   * har://underlyingfsscheme-host:port/pathintheunderlyingfs
   */
  @Override
  public URI getUri() {
    return this.uri;
  }

  /**
   * Returns the path inside the har filesystem,
   * i.e. the path relative to the archive root.
   * @param path the fully qualified path in the har filesystem.
   * @return relative path in the filesystem.
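   *         (for example, with an archive at /user/foo/bar.har, the path
   *         har:///user/foo/bar.har/dir/file maps to /dir/file; the paths
   *         here are illustrative only)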
   */
  private Path getPathInHar(Path path) {
    Path harPath = new Path(path.toUri().getPath());
    if (archivePath.compareTo(harPath) == 0)
      return new Path(Path.SEPARATOR);
    Path tmp = new Path(harPath.getName());
    Path parent = harPath.getParent();
    while (!(parent.compareTo(archivePath) == 0)) {
      if (parent.toString().equals(Path.SEPARATOR)) {
        tmp = null;
        break;
      }
      tmp = new Path(parent.getName(), tmp);
      parent = parent.getParent();
    }
    if (tmp != null)
      tmp = new Path(Path.SEPARATOR, tmp);
    return tmp;
  }

  // the relative path of p, basically
  // getting rid of the leading /. Parsing and doing
  // string manipulation is not good - so
  // just use the path api to do it.
  private Path makeRelative(String initial, Path p) {
    String scheme = this.uri.getScheme();
    String authority = this.uri.getAuthority();
    Path root = new Path(Path.SEPARATOR);
    if (root.compareTo(p) == 0)
      return new Path(scheme, authority, initial);
    Path retPath = new Path(p.getName());
    Path parent = p.getParent();
    for (int i = 0; i < p.depth() - 1; i++) {
      retPath = new Path(parent.getName(), retPath);
      parent = parent.getParent();
    }
    return new Path(new Path(scheme, authority, initial),
      retPath.toString());
  }

  /* this makes a path qualified in the har filesystem
   * (non-Javadoc)
   * @see org.apache.hadoop.fs.FilterFileSystem#makeQualified(
   * org.apache.hadoop.fs.Path)
   */
  @Override
  public Path makeQualified(Path path) {
    // make sure that we just get the
    // path component
    Path fsPath = path;
    if (!path.isAbsolute()) {
      fsPath = new Path(archivePath, path);
    }

    URI tmpURI = fsPath.toUri();
    // change this to a Har uri
    return new Path(uri.getScheme(), harAuth, tmpURI.getPath());
  }

  /**
   * Fix offset and length of block locations.
   * Note that this method modifies the original array.
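   * <p>
   * A worked example with illustrative numbers: for a part block covering
   * part-file offsets [0, 128) and a file that starts at
   * fileOffsetInHar = 100, the block covers [-100, 28) relative to the
   * file; a request with start = 0 and len = 10 therefore fixes that
   * block to offset 0 and length 10.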
   * @param locations block locations of har part file
   * @param start the start of the desired range in the contained file
   * @param len the length of the desired range
   * @param fileOffsetInHar the offset of the desired file in the har part file
   * @return block locations with fixed offset and length
   */
  static BlockLocation[] fixBlockLocations(BlockLocation[] locations,
                                           long start,
                                           long len,
                                           long fileOffsetInHar) {
    // offset 1 past last byte of desired range
    long end = start + len;

    for (BlockLocation location : locations) {
      // offset of part block relative to beginning of desired file
      // (may be negative if file starts in this part block)
      long harBlockStart = location.getOffset() - fileOffsetInHar;
      // offset 1 past last byte of har block relative to beginning of
      // desired file
      long harBlockEnd = harBlockStart + location.getLength();

      if (start > harBlockStart) {
        // desired range starts after beginning of this har block
        // fix offset to beginning of relevant range (relative to desired file)
        location.setOffset(start);
        // fix length to relevant portion of har block
        location.setLength(location.getLength() - (start - harBlockStart));
      } else {
        // desired range includes beginning of this har block
        location.setOffset(harBlockStart);
      }

      if (harBlockEnd > end) {
        // range ends before end of this har block
        // fix length to remove irrelevant portion at the end
        location.setLength(location.getLength() - (harBlockEnd - end));
      }
    }

    return locations;
  }

  /**
   * Get block locations from the underlying fs and fix their
   * offsets and lengths.
   * @param file the input file status to get block locations for
   * @param start the start of the desired range in the contained file
   * @param len the length of the desired range
   * @return block locations for this segment of file
   * @throws IOException
   */
  @Override
  public BlockLocation[] getFileBlockLocations(FileStatus file, long start,
      long len) throws IOException {
    HarStatus hstatus = getFileHarStatus(file.getPath());
    Path partPath = new Path(archivePath, hstatus.getPartName());
    FileStatus partStatus = metadata.getPartFileStatus(partPath);

    // get all part blocks that overlap with the desired file blocks
    BlockLocation[] locations =
      fs.getFileBlockLocations(partStatus,
          hstatus.getStartIndex() + start, len);

    return fixBlockLocations(locations, start, len, hstatus.getStartIndex());
  }

  /**
   * the hash of the path p inside the filesystem
   * @param p the path in the har filesystem
   * @return the hash code of the path.
   */
  public static int getHarHash(Path p) {
    return (p.toString().hashCode() & 0x7fffffff);
  }

  static class Store {
    public Store() {
      begin = end = startHash = endHash = 0;
    }
    public Store(long begin, long end, int startHash, int endHash) {
      this.begin = begin;
      this.end = end;
      this.startHash = startHash;
      this.endHash = endHash;
    }
    public long begin;
    public long end;
    public int startHash;
    public int endHash;
  }

  /**
   * Get filestatuses of all the children of a given directory. This just reads
   * through the index file, line by line, to get all statuses for children
   * of a directory.
   * It's a brute-force way of getting all such filestatuses.
   *
   * @param parent
   *          the parent path directory
   * @param statuses
   *          the list to add the children filestatuses to
   * @param children
   *          the string list of children for this parent
   */
  private void fileStatusesInIndex(HarStatus parent, List<FileStatus> statuses,
      List<String> children) throws IOException {
    String parentString = parent.getName();
    if (!parentString.endsWith(Path.SEPARATOR)) {
      parentString += Path.SEPARATOR;
    }
    Path harPath = new Path(parentString);
    int harlen = harPath.depth();
    final Map<String, FileStatus> cache = new TreeMap<String, FileStatus>();

    for (HarStatus hstatus : metadata.archive.values()) {
      String child = hstatus.getName();
      if ((child.startsWith(parentString))) {
        Path thisPath = new Path(child);
        if (thisPath.depth() == harlen + 1) {
          statuses.add(toFileStatus(hstatus, cache));
        }
      }
    }
  }

  /**
   * Combine the status stored in the index and the underlying status.
   * @param h status stored in the index
   * @param cache caching the underlying file statuses
   * @return the combined file status
   * @throws IOException
   */
  private FileStatus toFileStatus(HarStatus h,
      Map<String, FileStatus> cache) throws IOException {
    FileStatus underlying = null;
    if (cache != null) {
      underlying = cache.get(h.partName);
    }
    if (underlying == null) {
      final Path p = h.isDir ? archivePath : new Path(archivePath, h.partName);
      underlying = fs.getFileStatus(p);
      if (cache != null) {
        cache.put(h.partName, underlying);
      }
    }

    long modTime = 0;
    int version = metadata.getVersion();
    if (version < 3) {
      modTime = underlying.getModificationTime();
    } else if (version == 3) {
      modTime = h.getModificationTime();
    }

    return new FileStatus(
        h.isDir() ? 0L : h.getLength(),
        h.isDir(),
        underlying.getReplication(),
        underlying.getBlockSize(),
        modTime,
        underlying.getAccessTime(),
        underlying.getPermission(),
        underlying.getOwner(),
        underlying.getGroup(),
        makeRelative(this.uri.getPath(), new Path(h.name)));
  }

  // a single-line parser for hadoop archive statuses,
  // each stored in a single line in the index files.
  // the format is of the form
  // filename "dir"/"file" partFileName startIndex length
  // <space-separated children>
  private class HarStatus {
    boolean isDir;
    String name;
    List<String> children;
    String partName;
    long startIndex;
    long length;
    long modificationTime = 0;

    public HarStatus(String harString) throws UnsupportedEncodingException {
      String[] splits = harString.split(" ");
      this.name = decodeFileName(splits[0]);
      this.isDir = "dir".equals(splits[1]);
      // this is equal to "none" if it's a directory
      this.partName = splits[2];
      this.startIndex = Long.parseLong(splits[3]);
      this.length = Long.parseLong(splits[4]);

      int version = metadata.getVersion();
      String[] propSplits = null;
      // propSplits is used to retrieve the metainformation that Har versions
      // 1 & 2 missed (modification time, permission, owner group).
      // These fields are stored in an encoded string placed in different
      // locations depending on whether it's a file or directory entry.
      // If it's a directory, the string will be placed at the partName
      // location (directories have no partName because they don't have data
      // to be stored). This is done because the number of fields in a
      // directory entry is unbounded (all children are listed at the end).
      // If it's a file, the string will be the last field.
      if (isDir) {
        if (version == 3) {
          propSplits = decodeString(this.partName).split(" ");
        }
        children = new ArrayList<String>();
        for (int i = 5; i < splits.length; i++) {
          children.add(decodeFileName(splits[i]));
        }
      } else if (version == 3) {
        propSplits = decodeString(splits[5]).split(" ");
      }

      if (propSplits != null && propSplits.length >= 4) {
        modificationTime = Long.parseLong(propSplits[0]);
        // the fields below are stored in the file but are currently not used
        // by HarFileSystem
        // permission = new FsPermission(Short.parseShort(propSplits[1]));
        // owner = decodeString(propSplits[2]);
        // group = decodeString(propSplits[3]);
      }
    }

    public boolean isDir() {
      return isDir;
    }

    public String getName() {
      return name;
    }
    public String getPartName() {
      return partName;
    }
    public long getStartIndex() {
      return startIndex;
    }
    public long getLength() {
      return length;
    }
    public long getModificationTime() {
      return modificationTime;
    }
  }

  /**
   * return the filestatus of files in the har archive.
   * The permissions returned are those of the archive
   * index files; the original permissions are not persisted
   * while creating a hadoop archive.
   * @param f the path in the har filesystem
   * @return filestatus.
   * @throws IOException
   */
  @Override
  public FileStatus getFileStatus(Path f) throws IOException {
    Path p = makeQualified(f);
    if (p.toUri().getPath().length() < archivePath.toString().length()) {
      // still in the source file system
      return fs.getFileStatus(new Path(p.toUri().getPath()));
    }

    HarStatus hstatus = getFileHarStatus(p);
    return toFileStatus(hstatus, null);
  }

  private HarStatus getFileHarStatus(Path f) throws IOException {
    // get the fs DataInputStream for the underlying file
    // look up the index.
    Path harPath = getPathInHar(f);
    if (harPath == null) {
      throw new IOException("Invalid file name: " + f + " in " + uri);
    }
    HarStatus hstatus = metadata.archive.get(harPath);
    if (hstatus == null) {
      throw new FileNotFoundException("File: " + f + " does not exist in " + uri);
    }
    return hstatus;
  }

  /**
   * @return null since no checksum algorithm is implemented.
   */
  public FileChecksum getFileChecksum(Path f) {
    return null;
  }

  /**
   * Returns a har input stream which fakes end of
   * file. It reads the index files to get the part
   * file name and the size and start of the file.
   */
  @Override
  public FSDataInputStream open(Path f, int bufferSize) throws IOException {
    // get the fs DataInputStream for the underlying file
    HarStatus hstatus = getFileHarStatus(f);
    // we got it.. woo hooo!!!
    if (hstatus.isDir()) {
      throw new FileNotFoundException(f + " : not a file in " +
                archivePath);
    }
    return new HarFSDataInputStream(fs, new Path(archivePath,
        hstatus.getPartName()),
        hstatus.getStartIndex(), hstatus.getLength(), bufferSize);
  }

  public FSDataOutputStream create(Path f,
      FsPermission permission,
      boolean overwrite,
      int bufferSize,
      short replication,
      long blockSize,
      Progressable progress) throws IOException {
    throw new IOException("Har: create not allowed.");
  }

  @Override
  public void close() throws IOException {
    if (fs != null) {
      try {
        fs.close();
      } catch (IOException ie) {
        // this might already be closed; ignore
      }
    }
  }

  /**
   * Not implemented.
   */
  @Override
  public boolean setReplication(Path src, short replication) throws IOException {
    throw new IOException("Har: setreplication not allowed");
  }

  /**
   * Not implemented.
   */
  @Override
  public boolean delete(Path f, boolean recursive) throws IOException {
    throw new IOException("Har: delete not allowed");
  }

  @Override
  public RemoteIterator<LocatedFileStatus> listLocatedStatus(Path f)
      throws IOException {
    // Use FileSystem's implementation
    return listLocatedStatus(f, DEFAULT_FILTER);
  }

  /**
   * liststatus returns the children of a directory
   * after looking up the index files.
   */
  @Override
  public FileStatus[] listStatus(Path f) throws IOException {
    // need to see if the file is an index in file;
    // get the filestatus of the archive directory;
    // we will create fake filestatuses to return
    // to the client
    List<FileStatus> statuses = new ArrayList<FileStatus>();
    Path tmpPath = makeQualified(f);
    if (tmpPath.toUri().getPath().length() < archivePath.toString().length()) {
      // still in the source file system
      return fs.listStatus(new Path(tmpPath.toUri().getPath()));
    }

    Path harPath = getPathInHar(tmpPath);
    HarStatus hstatus = metadata.archive.get(harPath);
    if (hstatus == null) {
      throw new FileNotFoundException("File " + f + " not found in " + archivePath);
    }
    if (hstatus.isDir()) {
      fileStatusesInIndex(hstatus, statuses, hstatus.children);
    } else {
      statuses.add(toFileStatus(hstatus, null));
    }

    return statuses.toArray(new FileStatus[statuses.size()]);
  }

  /**
   * return the top-level archive path.
   */
  public Path getHomeDirectory() {
    return new Path(uri.toString());
  }

  public void setWorkingDirectory(Path newDir) {
    // does nothing.
  }

  /**
   * Not implemented.
   */
  public boolean mkdirs(Path f, FsPermission permission) throws IOException {
    throw new IOException("Har: mkdirs not allowed");
  }

  /**
   * Not implemented.
   */
  public void copyFromLocalFile(boolean delSrc, Path src, Path dst) throws
        IOException {
    throw new IOException("Har: copyfromlocalfile not allowed");
  }

  /**
   * copies the file in the har filesystem to a local file.
   */
  public void copyToLocalFile(boolean delSrc, Path src, Path dst)
    throws IOException {
    FileUtil.copy(this, src, getLocal(getConf()), dst, false, getConf());
  }

  /**
   * Not implemented.
   */
  public Path startLocalOutput(Path fsOutputFile, Path tmpLocalFile)
    throws IOException {
    throw new IOException("Har: startLocalOutput not allowed");
  }

  /**
   * Not implemented.
   */
  public void completeLocalOutput(Path fsOutputFile, Path tmpLocalFile)
    throws IOException {
    throw new IOException("Har: completeLocalOutput not allowed");
  }

  /**
   * Not implemented.
   */
  public void setOwner(Path p, String username, String groupname)
    throws IOException {
    throw new IOException("Har: setowner not allowed");
  }

  /**
   * Not implemented.
   */
  public void setPermission(Path p, FsPermission permission)
    throws IOException {
    throw new IOException("Har: setPermission not allowed");
  }

  /**
   * Hadoop archives input stream. This input stream fakes EOF
   * since archive files are part of bigger part files.
   */
  private static class HarFSDataInputStream extends FSDataInputStream {
    /**
     * Create an input stream that fakes all the reads/positions/seeking.
     */
    private static class HarFsInputStream extends FSInputStream {
      private long position, start, end;
      // The underlying data input stream that the
      // underlying filesystem will return.
      private FSDataInputStream underLyingStream;
      // one-byte buffer
      private byte[] oneBytebuff = new byte[1];
      HarFsInputStream(FileSystem fs, Path path, long start,
          long length, int bufferSize) throws IOException {
        underLyingStream = fs.open(path, bufferSize);
        underLyingStream.seek(start);
        // the start of this file in the part file
        this.start = start;
        // the position pointer in the part file
        this.position = start;
        // the end pointer in the part file
        this.end = start + length;
      }

      public synchronized int available() throws IOException {
        long remaining = end - underLyingStream.getPos();
        if (remaining > (long) Integer.MAX_VALUE) {
          return Integer.MAX_VALUE;
        }
        return (int) remaining;
      }

      public synchronized void close() throws IOException {
        underLyingStream.close();
        super.close();
      }

      // not implemented
      @Override
      public void mark(int readLimit) {
        // do nothing
      }

      /**
       * reset is not implemented
       */
      public void reset() throws IOException {
        throw new IOException("reset not implemented.");
      }

      public synchronized int read() throws IOException {
        int ret = read(oneBytebuff, 0, 1);
        return (ret <= 0) ? -1 : (oneBytebuff[0] & 0xff);
      }

      public synchronized int read(byte[] b) throws IOException {
        // delegate; read(byte[], int, int) already advances the position,
        // so do not advance it a second time here
        return read(b, 0, b.length);
      }

      public synchronized int read(byte[] b, int offset, int len)
        throws IOException {
        int newlen = len;
        int ret = -1;
        if (position + len > end) {
          newlen = (int) (end - position);
        }
        // end case
        if (newlen == 0)
          return ret;
        ret = underLyingStream.read(b, offset, newlen);
        // only advance the position when bytes were actually read
        if (ret > 0) {
          position += ret;
        }
        return ret;
      }

      public synchronized long skip(long n) throws IOException {
        long tmpN = n;
        if (tmpN > 0) {
          if (position + tmpN > end) {
            tmpN = end - position;
          }
          underLyingStream.seek(tmpN + position);
          position += tmpN;
          return tmpN;
        }
        return (tmpN < 0) ?
            -1 : 0;
      }

      public synchronized long getPos() throws IOException {
        return (position - start);
      }

      public synchronized void seek(long pos) throws IOException {
        if (pos < 0 || (start + pos > end)) {
          throw new IOException("Failed to seek: EOF");
        }
        position = start + pos;
        underLyingStream.seek(position);
      }

      public boolean seekToNewSource(long targetPos) throws IOException {
        // no need to implement this;
        // hdfs itself does seekToNewSource
        // while reading.
        return false;
      }

      /**
       * implementing positioned readable.
       */
      public int read(long pos, byte[] b, int offset, int length)
        throws IOException {
        int nlength = length;
        if (start + nlength + pos > end) {
          nlength = (int) (end - (start + pos));
        }
        return underLyingStream.read(pos + start, b, offset, nlength);
      }

      /**
       * positioned readable again.
       */
      public void readFully(long pos, byte[] b, int offset, int length)
        throws IOException {
        if (start + length + pos > end) {
          throw new IOException("Not enough bytes to read.");
        }
        underLyingStream.readFully(pos + start, b, offset, length);
      }

      public void readFully(long pos, byte[] b) throws IOException {
        readFully(pos, b, 0, b.length);
      }

    }

    /**
     * constructor for har input stream.
     * @param fs the underlying filesystem
     * @param p the path in the underlying filesystem
     * @param start the start position in the part file
     * @param length the length of valid data in the part file
     * @param bufsize the buffer size
     * @throws IOException
     */
    public HarFSDataInputStream(FileSystem fs, Path p, long start,
        long length, int bufsize) throws IOException {
      super(new HarFsInputStream(fs, p, start, length, bufsize));
    }

    /**
     * constructor for har input stream.
     * @param fs the underlying filesystem
     * @param p the path in the underlying file system
     * @param start the start position in the part file
     * @param length the length of valid data in the part file.
     * @throws IOException
     */
    public HarFSDataInputStream(FileSystem fs, Path p, long start, long length)
      throws IOException {
      super(new HarFsInputStream(fs, p, start, length, 0));
    }
  }

  private class HarMetaData {
    private FileSystem fs;
    private int version;
    // the masterIndex of the archive
    private Path masterIndexPath;
    // the index file
    private Path archiveIndexPath;

    private long masterIndexTimestamp;
    private long archiveIndexTimestamp;

    List<Store> stores = new ArrayList<Store>();
    Map<Path, HarStatus> archive = new HashMap<Path, HarStatus>();
    private Map<Path, FileStatus> partFileStatuses = new HashMap<Path, FileStatus>();

    public HarMetaData(FileSystem fs, Path masterIndexPath, Path archiveIndexPath) {
      this.fs = fs;
      this.masterIndexPath = masterIndexPath;
      this.archiveIndexPath = archiveIndexPath;
    }

    public FileStatus getPartFileStatus(Path partPath) throws IOException {
      FileStatus status;
      status = partFileStatuses.get(partPath);
      if (status == null) {
        status = fs.getFileStatus(partPath);
        partFileStatuses.put(partPath, status);
      }
      return status;
    }

    public long getMasterIndexTimestamp() {
      return masterIndexTimestamp;
    }

    public long getArchiveIndexTimestamp() {
      return archiveIndexTimestamp;
    }

    private int getVersion() {
      return version;
    }

    private void parseMetaData() throws IOException {
      Text line;
      long read;
      FSDataInputStream in = null;
      LineReader lin = null;

      try {
        in = fs.open(masterIndexPath);
        FileStatus masterStat = fs.getFileStatus(masterIndexPath);
        masterIndexTimestamp = masterStat.getModificationTime();
        lin = new LineReader(in, getConf());
        line = new Text();
        read = lin.readLine(line);

        // the first line contains the version of the index file
        String versionLine = line.toString();
        String[] arr = versionLine.split(" ");
        version = Integer.parseInt(arr[0]);
        // make it always backwards-compatible
        if (this.version > HarFileSystem.VERSION) {
          throw new IOException("Invalid version " +
              this.version + " expected " + HarFileSystem.VERSION);
        }

        // each line contains a hashcode range and the index file name
        String[] readStr = null;
        while (read < masterStat.getLen()) {
          int b = lin.readLine(line);
          read += b;
          readStr = line.toString().split(" ");
          int startHash = Integer.parseInt(readStr[0]);
          int endHash = Integer.parseInt(readStr[1]);
          stores.add(new Store(Long.parseLong(readStr[2]),
              Long.parseLong(readStr[3]), startHash,
              endHash));
          line.clear();
        }
      } finally {
        IOUtils.cleanup(LOG, lin, in);
      }

      FSDataInputStream aIn = fs.open(archiveIndexPath);
      try {
        FileStatus archiveStat = fs.getFileStatus(archiveIndexPath);
        archiveIndexTimestamp = archiveStat.getModificationTime();
        LineReader aLin;

        // now start reading the real index file
        for (Store s : stores) {
          read = 0;
          aIn.seek(s.begin);
          aLin = new LineReader(aIn, getConf());
          while (read + s.begin < s.end) {
            int tmp = aLin.readLine(line);
            read += tmp;
            String lineFeed = line.toString();
            String[] parsed = lineFeed.split(" ");
            parsed[0] = decodeFileName(parsed[0]);
            archive.put(new Path(parsed[0]), new HarStatus(lineFeed));
            line.clear();
          }
        }
      } finally
      {
        IOUtils.cleanup(LOG, aIn);
      }
    }
  }

  /*
   * for testing purposes only:
   */
  HarMetaData getMetadata() {
    return metadata;
  }

  private static class LruCache<K, V> extends LinkedHashMap<K, V> {
    private final int MAX_ENTRIES;

    public LruCache(int maxEntries) {
      super(maxEntries + 1, 1.0f, true);
      MAX_ENTRIES = maxEntries;
    }

    @Override
    protected boolean removeEldestEntry(Map.Entry<K, V> eldest) {
      return size() > MAX_ENTRIES;
    }
  }
}
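
// ---------------------------------------------------------------------------
// Usage sketch (illustrative only, not part of the class): how a file stored
// in a HAR archive is typically read through this filesystem. The archive and
// file names below are hypothetical, and resolving the har:// scheme to
// HarFileSystem assumes the usual fs.har.impl mapping in the configuration.
//
//   Configuration conf = new Configuration();
//   Path file = new Path("har:///user/alice/data.har/logs/part-0.txt");
//   FileSystem harFs = file.getFileSystem(conf);
//   FSDataInputStream in = harFs.open(file);
//   try {
//     // read from 'in' as from any other FileSystem stream
//   } finally {
//     in.close();
//   }
//
// The archive is read-only: create, delete, mkdirs and the other mutating
// operations above throw IOException.
// ---------------------------------------------------------------------------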