001 /** 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018 package org.apache.hadoop.fs; 019 020 import java.io.FileNotFoundException; 021 import java.io.IOException; 022 import java.io.UnsupportedEncodingException; 023 import java.net.URI; 024 import java.net.URISyntaxException; 025 import java.net.URLDecoder; 026 import java.util.ArrayList; 027 import java.util.List; 028 import java.util.Map; 029 import java.util.TreeMap; 030 import java.util.HashMap; 031 import java.util.concurrent.ConcurrentHashMap; 032 033 import org.apache.commons.logging.Log; 034 import org.apache.commons.logging.LogFactory; 035 import org.apache.hadoop.conf.Configuration; 036 import org.apache.hadoop.fs.permission.FsPermission; 037 import org.apache.hadoop.io.IOUtils; 038 import org.apache.hadoop.io.Text; 039 import org.apache.hadoop.util.LineReader; 040 import org.apache.hadoop.util.Progressable; 041 042 /** 043 * This is an implementation of the Hadoop Archive 044 * Filesystem. This archive Filesystem has index files 045 * of the form _index* and has contents of the form 046 * part-*. The index files store the indexes of the 047 * real files. The index files are of the form _masterindex 048 * and _index. The master index is a level of indirection 049 * in to the index file to make the look ups faster. the index 050 * file is sorted with hash code of the paths that it contains 051 * and the master index contains pointers to the positions in 052 * index for ranges of hashcodes. 053 */ 054 055 public class HarFileSystem extends FilterFileSystem { 056 057 private static final Log LOG = LogFactory.getLog(HarFileSystem.class); 058 059 public static final int VERSION = 3; 060 061 private static final Map<URI, HarMetaData> harMetaCache = 062 new ConcurrentHashMap<URI, HarMetaData>(); 063 064 // uri representation of this Har filesystem 065 private URI uri; 066 // the top level path of the archive 067 // in the underlying file system 068 private Path archivePath; 069 // the har auth 070 private String harAuth; 071 072 // pointer into the static metadata cache 073 private HarMetaData metadata; 074 075 /** 076 * public construction of harfilesystem 077 * 078 */ 079 public HarFileSystem() { 080 } 081 082 /** 083 * Return the protocol scheme for the FileSystem. 084 * <p/> 085 * 086 * @return <code>har</code> 087 */ 088 @Override 089 public String getScheme() { 090 return "har"; 091 } 092 093 /** 094 * Constructor to create a HarFileSystem with an 095 * underlying filesystem. 096 * @param fs 097 */ 098 public HarFileSystem(FileSystem fs) { 099 super(fs); 100 } 101 102 /** 103 * Initialize a Har filesystem per har archive. The 104 * archive home directory is the top level directory 105 * in the filesystem that contains the HAR archive. 106 * Be careful with this method, you do not want to go 107 * on creating new Filesystem instances per call to 108 * path.getFileSystem(). 109 * the uri of Har is 110 * har://underlyingfsscheme-host:port/archivepath. 111 * or 112 * har:///archivepath. This assumes the underlying filesystem 113 * to be used in case not specified. 114 */ 115 @Override 116 public void initialize(URI name, Configuration conf) throws IOException { 117 // decode the name 118 URI underLyingURI = decodeHarURI(name, conf); 119 // we got the right har Path- now check if this is 120 // truly a har filesystem 121 Path harPath = archivePath( 122 new Path(name.getScheme(), name.getAuthority(), name.getPath())); 123 if (harPath == null) { 124 throw new IOException("Invalid path for the Har Filesystem. " + 125 name.toString()); 126 } 127 if (fs == null) { 128 fs = FileSystem.get(underLyingURI, conf); 129 } 130 uri = harPath.toUri(); 131 archivePath = new Path(uri.getPath()); 132 harAuth = getHarAuth(underLyingURI); 133 //check for the underlying fs containing 134 // the index file 135 Path masterIndexPath = new Path(archivePath, "_masterindex"); 136 Path archiveIndexPath = new Path(archivePath, "_index"); 137 if (!fs.exists(masterIndexPath) || !fs.exists(archiveIndexPath)) { 138 throw new IOException("Invalid path for the Har Filesystem. " + 139 "No index file in " + harPath); 140 } 141 142 metadata = harMetaCache.get(uri); 143 if (metadata != null) { 144 FileStatus mStat = fs.getFileStatus(masterIndexPath); 145 FileStatus aStat = fs.getFileStatus(archiveIndexPath); 146 if (mStat.getModificationTime() != metadata.getMasterIndexTimestamp() || 147 aStat.getModificationTime() != metadata.getArchiveIndexTimestamp()) { 148 // the archive has been overwritten since we last read it 149 // remove the entry from the meta data cache 150 metadata = null; 151 harMetaCache.remove(uri); 152 } 153 } 154 if (metadata == null) { 155 metadata = new HarMetaData(fs, masterIndexPath, archiveIndexPath); 156 metadata.parseMetaData(); 157 harMetaCache.put(uri, metadata); 158 } 159 } 160 161 // get the version of the filesystem from the masterindex file 162 // the version is currently not useful since its the first version 163 // of archives 164 public int getHarVersion() throws IOException { 165 if (metadata != null) { 166 return metadata.getVersion(); 167 } 168 else { 169 throw new IOException("Invalid meta data for the Har Filesystem"); 170 } 171 } 172 173 /* 174 * find the parent path that is the 175 * archive path in the path. The last 176 * path segment that ends with .har is 177 * the path that will be returned. 178 */ 179 private Path archivePath(Path p) { 180 Path retPath = null; 181 Path tmp = p; 182 for (int i=0; i< p.depth(); i++) { 183 if (tmp.toString().endsWith(".har")) { 184 retPath = tmp; 185 break; 186 } 187 tmp = tmp.getParent(); 188 } 189 return retPath; 190 } 191 192 /** 193 * decode the raw URI to get the underlying URI 194 * @param rawURI raw Har URI 195 * @return filtered URI of the underlying fileSystem 196 */ 197 private URI decodeHarURI(URI rawURI, Configuration conf) throws IOException { 198 String tmpAuth = rawURI.getAuthority(); 199 //we are using the default file 200 //system in the config 201 //so create a underlying uri and 202 //return it 203 if (tmpAuth == null) { 204 //create a path 205 return FileSystem.getDefaultUri(conf); 206 } 207 String host = rawURI.getHost(); 208 if (host == null) { 209 throw new IOException("URI: " + rawURI 210 + " is an invalid Har URI since host==null." 211 + " Expecting har://<scheme>-<host>/<path>."); 212 } 213 int i = host.indexOf('-'); 214 if (i < 0) { 215 throw new IOException("URI: " + rawURI 216 + " is an invalid Har URI since '-' not found." 217 + " Expecting har://<scheme>-<host>/<path>."); 218 } 219 final String underLyingScheme = host.substring(0, i); 220 i++; 221 final String underLyingHost = i == host.length()? null: host.substring(i); 222 int underLyingPort = rawURI.getPort(); 223 String auth = (underLyingHost == null && underLyingPort == -1)? 224 null:(underLyingHost+ 225 (underLyingPort == -1 ? "" : ":"+underLyingPort)); 226 URI tmp = null; 227 if (rawURI.getQuery() != null) { 228 // query component not allowed 229 throw new IOException("query component in Path not supported " + rawURI); 230 } 231 try { 232 tmp = new URI(underLyingScheme, auth, rawURI.getPath(), 233 rawURI.getQuery(), rawURI.getFragment()); 234 } catch (URISyntaxException e) { 235 // do nothing should not happen 236 } 237 return tmp; 238 } 239 240 private static String decodeString(String str) 241 throws UnsupportedEncodingException { 242 return URLDecoder.decode(str, "UTF-8"); 243 } 244 245 private String decodeFileName(String fname) 246 throws UnsupportedEncodingException { 247 int version = metadata.getVersion(); 248 if (version == 2 || version == 3){ 249 return decodeString(fname); 250 } 251 return fname; 252 } 253 254 /** 255 * return the top level archive. 256 */ 257 @Override 258 public Path getWorkingDirectory() { 259 return new Path(uri.toString()); 260 } 261 262 /** 263 * Create a har specific auth 264 * har-underlyingfs:port 265 * @param underLyingURI the uri of underlying 266 * filesystem 267 * @return har specific auth 268 */ 269 private String getHarAuth(URI underLyingUri) { 270 String auth = underLyingUri.getScheme() + "-"; 271 if (underLyingUri.getHost() != null) { 272 auth += underLyingUri.getHost() + ":"; 273 if (underLyingUri.getPort() != -1) { 274 auth += underLyingUri.getPort(); 275 } 276 } 277 else { 278 auth += ":"; 279 } 280 return auth; 281 } 282 283 /** 284 * Returns the uri of this filesystem. 285 * The uri is of the form 286 * har://underlyingfsschema-host:port/pathintheunderlyingfs 287 */ 288 @Override 289 public URI getUri() { 290 return this.uri; 291 } 292 293 /** 294 * this method returns the path 295 * inside the har filesystem. 296 * this is relative path inside 297 * the har filesystem. 298 * @param path the fully qualified path in the har filesystem. 299 * @return relative path in the filesystem. 300 */ 301 private Path getPathInHar(Path path) { 302 Path harPath = new Path(path.toUri().getPath()); 303 if (archivePath.compareTo(harPath) == 0) 304 return new Path(Path.SEPARATOR); 305 Path tmp = new Path(harPath.getName()); 306 Path parent = harPath.getParent(); 307 while (!(parent.compareTo(archivePath) == 0)) { 308 if (parent.toString().equals(Path.SEPARATOR)) { 309 tmp = null; 310 break; 311 } 312 tmp = new Path(parent.getName(), tmp); 313 parent = parent.getParent(); 314 } 315 if (tmp != null) 316 tmp = new Path(Path.SEPARATOR, tmp); 317 return tmp; 318 } 319 320 //the relative path of p. basically 321 // getting rid of /. Parsing and doing 322 // string manipulation is not good - so 323 // just use the path api to do it. 324 private Path makeRelative(String initial, Path p) { 325 String scheme = this.uri.getScheme(); 326 String authority = this.uri.getAuthority(); 327 Path root = new Path(Path.SEPARATOR); 328 if (root.compareTo(p) == 0) 329 return new Path(scheme, authority, initial); 330 Path retPath = new Path(p.getName()); 331 Path parent = p.getParent(); 332 for (int i=0; i < p.depth()-1; i++) { 333 retPath = new Path(parent.getName(), retPath); 334 parent = parent.getParent(); 335 } 336 return new Path(new Path(scheme, authority, initial), 337 retPath.toString()); 338 } 339 340 /* this makes a path qualified in the har filesystem 341 * (non-Javadoc) 342 * @see org.apache.hadoop.fs.FilterFileSystem#makeQualified( 343 * org.apache.hadoop.fs.Path) 344 */ 345 @Override 346 public Path makeQualified(Path path) { 347 // make sure that we just get the 348 // path component 349 Path fsPath = path; 350 if (!path.isAbsolute()) { 351 fsPath = new Path(archivePath, path); 352 } 353 354 URI tmpURI = fsPath.toUri(); 355 //change this to Har uri 356 return new Path(uri.getScheme(), harAuth, tmpURI.getPath()); 357 } 358 359 /** 360 * Fix offset and length of block locations. 361 * Note that this method modifies the original array. 362 * @param locations block locations of har part file 363 * @param start the start of the desired range in the contained file 364 * @param len the length of the desired range 365 * @param fileOffsetInHar the offset of the desired file in the har part file 366 * @return block locations with fixed offset and length 367 */ 368 static BlockLocation[] fixBlockLocations(BlockLocation[] locations, 369 long start, 370 long len, 371 long fileOffsetInHar) { 372 // offset 1 past last byte of desired range 373 long end = start + len; 374 375 for (BlockLocation location : locations) { 376 // offset of part block relative to beginning of desired file 377 // (may be negative if file starts in this part block) 378 long harBlockStart = location.getOffset() - fileOffsetInHar; 379 // offset 1 past last byte of har block relative to beginning of 380 // desired file 381 long harBlockEnd = harBlockStart + location.getLength(); 382 383 if (start > harBlockStart) { 384 // desired range starts after beginning of this har block 385 // fix offset to beginning of relevant range (relative to desired file) 386 location.setOffset(start); 387 // fix length to relevant portion of har block 388 location.setLength(location.getLength() - (start - harBlockStart)); 389 } else { 390 // desired range includes beginning of this har block 391 location.setOffset(harBlockStart); 392 } 393 394 if (harBlockEnd > end) { 395 // range ends before end of this har block 396 // fix length to remove irrelevant portion at the end 397 location.setLength(location.getLength() - (harBlockEnd - end)); 398 } 399 } 400 401 return locations; 402 } 403 404 /** 405 * Get block locations from the underlying fs and fix their 406 * offsets and lengths. 407 * @param file the input filestatus to get block locations 408 * @param start the start of the desired range in the contained file 409 * @param len the length of the desired range 410 * @return block locations for this segment of file 411 * @throws IOException 412 */ 413 @Override 414 public BlockLocation[] getFileBlockLocations(FileStatus file, long start, 415 long len) throws IOException { 416 HarStatus hstatus = getFileHarStatus(file.getPath()); 417 Path partPath = new Path(archivePath, hstatus.getPartName()); 418 FileStatus partStatus = metadata.getPartFileStatus(partPath); 419 420 // get all part blocks that overlap with the desired file blocks 421 BlockLocation[] locations = 422 fs.getFileBlockLocations(partStatus, 423 hstatus.getStartIndex() + start, len); 424 425 return fixBlockLocations(locations, start, len, hstatus.getStartIndex()); 426 } 427 428 /** 429 * the hash of the path p inside iniside 430 * the filesystem 431 * @param p the path in the harfilesystem 432 * @return the hash code of the path. 433 */ 434 public static int getHarHash(Path p) { 435 return (p.toString().hashCode() & 0x7fffffff); 436 } 437 438 static class Store { 439 public Store() { 440 begin = end = startHash = endHash = 0; 441 } 442 public Store(long begin, long end, int startHash, int endHash) { 443 this.begin = begin; 444 this.end = end; 445 this.startHash = startHash; 446 this.endHash = endHash; 447 } 448 public long begin; 449 public long end; 450 public int startHash; 451 public int endHash; 452 } 453 454 /** 455 * Get filestatuses of all the children of a given directory. This just reads 456 * through index file and reads line by line to get all statuses for children 457 * of a directory. Its a brute force way of getting all such filestatuses 458 * 459 * @param parent 460 * the parent path directory 461 * @param statuses 462 * the list to add the children filestatuses to 463 * @param children 464 * the string list of children for this parent 465 * @param archiveIndexStat 466 * the archive index filestatus 467 */ 468 private void fileStatusesInIndex(HarStatus parent, List<FileStatus> statuses, 469 List<String> children) throws IOException { 470 String parentString = parent.getName(); 471 if (!parentString.endsWith(Path.SEPARATOR)){ 472 parentString += Path.SEPARATOR; 473 } 474 Path harPath = new Path(parentString); 475 int harlen = harPath.depth(); 476 final Map<String, FileStatus> cache = new TreeMap<String, FileStatus>(); 477 478 for (HarStatus hstatus : metadata.archive.values()) { 479 String child = hstatus.getName(); 480 if ((child.startsWith(parentString))) { 481 Path thisPath = new Path(child); 482 if (thisPath.depth() == harlen + 1) { 483 statuses.add(toFileStatus(hstatus, cache)); 484 } 485 } 486 } 487 } 488 489 /** 490 * Combine the status stored in the index and the underlying status. 491 * @param h status stored in the index 492 * @param cache caching the underlying file statuses 493 * @return the combined file status 494 * @throws IOException 495 */ 496 private FileStatus toFileStatus(HarStatus h, 497 Map<String, FileStatus> cache) throws IOException { 498 FileStatus underlying = null; 499 if (cache != null) { 500 underlying = cache.get(h.partName); 501 } 502 if (underlying == null) { 503 final Path p = h.isDir? archivePath: new Path(archivePath, h.partName); 504 underlying = fs.getFileStatus(p); 505 if (cache != null) { 506 cache.put(h.partName, underlying); 507 } 508 } 509 510 long modTime = 0; 511 int version = metadata.getVersion(); 512 if (version < 3) { 513 modTime = underlying.getModificationTime(); 514 } else if (version == 3) { 515 modTime = h.getModificationTime(); 516 } 517 518 return new FileStatus( 519 h.isDir()? 0L: h.getLength(), 520 h.isDir(), 521 underlying.getReplication(), 522 underlying.getBlockSize(), 523 modTime, 524 underlying.getAccessTime(), 525 underlying.getPermission(), 526 underlying.getOwner(), 527 underlying.getGroup(), 528 makeRelative(this.uri.getPath(), new Path(h.name))); 529 } 530 531 // a single line parser for hadoop archives status 532 // stored in a single line in the index files 533 // the format is of the form 534 // filename "dir"/"file" partFileName startIndex length 535 // <space seperated children> 536 private class HarStatus { 537 boolean isDir; 538 String name; 539 List<String> children; 540 String partName; 541 long startIndex; 542 long length; 543 long modificationTime = 0; 544 545 public HarStatus(String harString) throws UnsupportedEncodingException { 546 String[] splits = harString.split(" "); 547 this.name = decodeFileName(splits[0]); 548 this.isDir = "dir".equals(splits[1]) ? true: false; 549 // this is equal to "none" if its a directory 550 this.partName = splits[2]; 551 this.startIndex = Long.parseLong(splits[3]); 552 this.length = Long.parseLong(splits[4]); 553 554 int version = metadata.getVersion(); 555 String[] propSplits = null; 556 // propSplits is used to retrieve the metainformation that Har versions 557 // 1 & 2 missed (modification time, permission, owner group). 558 // These fields are stored in an encoded string placed in different 559 // locations depending on whether it's a file or directory entry. 560 // If it's a directory, the string will be placed at the partName 561 // location (directories have no partName because they don't have data 562 // to be stored). This is done because the number of fields in a 563 // directory entry is unbounded (all children are listed at the end) 564 // If it's a file, the string will be the last field. 565 if (isDir) { 566 if (version == 3){ 567 propSplits = decodeString(this.partName).split(" "); 568 } 569 children = new ArrayList<String>(); 570 for (int i = 5; i < splits.length; i++) { 571 children.add(decodeFileName(splits[i])); 572 } 573 } else if (version == 3) { 574 propSplits = decodeString(splits[5]).split(" "); 575 } 576 577 if (propSplits != null && propSplits.length >= 4) { 578 modificationTime = Long.parseLong(propSplits[0]); 579 // the fields below are stored in the file but are currently not used 580 // by HarFileSystem 581 // permission = new FsPermission(Short.parseShort(propSplits[1])); 582 // owner = decodeString(propSplits[2]); 583 // group = decodeString(propSplits[3]); 584 } 585 } 586 public boolean isDir() { 587 return isDir; 588 } 589 590 public String getName() { 591 return name; 592 } 593 public String getPartName() { 594 return partName; 595 } 596 public long getStartIndex() { 597 return startIndex; 598 } 599 public long getLength() { 600 return length; 601 } 602 public long getModificationTime() { 603 return modificationTime; 604 } 605 } 606 607 /** 608 * return the filestatus of files in har archive. 609 * The permission returned are that of the archive 610 * index files. The permissions are not persisted 611 * while creating a hadoop archive. 612 * @param f the path in har filesystem 613 * @return filestatus. 614 * @throws IOException 615 */ 616 @Override 617 public FileStatus getFileStatus(Path f) throws IOException { 618 HarStatus hstatus = getFileHarStatus(f); 619 return toFileStatus(hstatus, null); 620 } 621 622 private HarStatus getFileHarStatus(Path f) throws IOException { 623 // get the fs DataInputStream for the underlying file 624 // look up the index. 625 Path p = makeQualified(f); 626 Path harPath = getPathInHar(p); 627 if (harPath == null) { 628 throw new IOException("Invalid file name: " + f + " in " + uri); 629 } 630 HarStatus hstatus = metadata.archive.get(harPath); 631 if (hstatus == null) { 632 throw new FileNotFoundException("File: " + f + " does not exist in " + uri); 633 } 634 return hstatus; 635 } 636 637 /** 638 * @return null since no checksum algorithm is implemented. 639 */ 640 @Override 641 public FileChecksum getFileChecksum(Path f) { 642 return null; 643 } 644 645 /** 646 * Returns a har input stream which fakes end of 647 * file. It reads the index files to get the part 648 * file name and the size and start of the file. 649 */ 650 @Override 651 public FSDataInputStream open(Path f, int bufferSize) throws IOException { 652 // get the fs DataInputStream for the underlying file 653 HarStatus hstatus = getFileHarStatus(f); 654 // we got it.. woo hooo!!! 655 if (hstatus.isDir()) { 656 throw new FileNotFoundException(f + " : not a file in " + 657 archivePath); 658 } 659 return new HarFSDataInputStream(fs, new Path(archivePath, 660 hstatus.getPartName()), 661 hstatus.getStartIndex(), hstatus.getLength(), bufferSize); 662 } 663 664 @Override 665 public FSDataOutputStream create(Path f, 666 FsPermission permission, 667 boolean overwrite, 668 int bufferSize, 669 short replication, 670 long blockSize, 671 Progressable progress) throws IOException { 672 throw new IOException("Har: create not allowed."); 673 } 674 675 @Override 676 public void close() throws IOException { 677 if (fs != null) { 678 try { 679 fs.close(); 680 } catch(IOException ie) { 681 //this might already be closed 682 // ignore 683 } 684 } 685 } 686 687 /** 688 * Not implemented. 689 */ 690 @Override 691 public boolean setReplication(Path src, short replication) throws IOException{ 692 throw new IOException("Har: setreplication not allowed"); 693 } 694 695 /** 696 * Not implemented. 697 */ 698 @Override 699 public boolean delete(Path f, boolean recursive) throws IOException { 700 throw new IOException("Har: delete not allowed"); 701 } 702 703 /** 704 * liststatus returns the children of a directory 705 * after looking up the index files. 706 */ 707 @Override 708 public FileStatus[] listStatus(Path f) throws IOException { 709 //need to see if the file is an index in file 710 //get the filestatus of the archive directory 711 // we will create fake filestatuses to return 712 // to the client 713 List<FileStatus> statuses = new ArrayList<FileStatus>(); 714 Path tmpPath = makeQualified(f); 715 Path harPath = getPathInHar(tmpPath); 716 HarStatus hstatus = metadata.archive.get(harPath); 717 if (hstatus == null) { 718 throw new FileNotFoundException("File " + f + " not found in " + archivePath); 719 } 720 if (hstatus.isDir()) { 721 fileStatusesInIndex(hstatus, statuses, hstatus.children); 722 } else { 723 statuses.add(toFileStatus(hstatus, null)); 724 } 725 726 return statuses.toArray(new FileStatus[statuses.size()]); 727 } 728 729 /** 730 * return the top level archive path. 731 */ 732 @Override 733 public Path getHomeDirectory() { 734 return new Path(uri.toString()); 735 } 736 737 @Override 738 public void setWorkingDirectory(Path newDir) { 739 //does nothing. 740 } 741 742 /** 743 * not implemented. 744 */ 745 @Override 746 public boolean mkdirs(Path f, FsPermission permission) throws IOException { 747 throw new IOException("Har: mkdirs not allowed"); 748 } 749 750 /** 751 * not implemented. 752 */ 753 @Override 754 public void copyFromLocalFile(boolean delSrc, Path src, Path dst) throws 755 IOException { 756 throw new IOException("Har: copyfromlocalfile not allowed"); 757 } 758 759 /** 760 * copies the file in the har filesystem to a local file. 761 */ 762 @Override 763 public void copyToLocalFile(boolean delSrc, Path src, Path dst) 764 throws IOException { 765 FileUtil.copy(this, src, getLocal(getConf()), dst, false, getConf()); 766 } 767 768 /** 769 * not implemented. 770 */ 771 @Override 772 public Path startLocalOutput(Path fsOutputFile, Path tmpLocalFile) 773 throws IOException { 774 throw new IOException("Har: startLocalOutput not allowed"); 775 } 776 777 /** 778 * not implemented. 779 */ 780 @Override 781 public void completeLocalOutput(Path fsOutputFile, Path tmpLocalFile) 782 throws IOException { 783 throw new IOException("Har: completeLocalOutput not allowed"); 784 } 785 786 /** 787 * not implemented. 788 */ 789 @Override 790 public void setOwner(Path p, String username, String groupname) 791 throws IOException { 792 throw new IOException("Har: setowner not allowed"); 793 } 794 795 /** 796 * Not implemented. 797 */ 798 @Override 799 public void setPermission(Path p, FsPermission permisssion) 800 throws IOException { 801 throw new IOException("Har: setPermission not allowed"); 802 } 803 804 /** 805 * Hadoop archives input stream. This input stream fakes EOF 806 * since archive files are part of bigger part files. 807 */ 808 private static class HarFSDataInputStream extends FSDataInputStream { 809 /** 810 * Create an input stream that fakes all the reads/positions/seeking. 811 */ 812 private static class HarFsInputStream extends FSInputStream { 813 private long position, start, end; 814 //The underlying data input stream that the 815 // underlying filesystem will return. 816 private FSDataInputStream underLyingStream; 817 //one byte buffer 818 private byte[] oneBytebuff = new byte[1]; 819 HarFsInputStream(FileSystem fs, Path path, long start, 820 long length, int bufferSize) throws IOException { 821 underLyingStream = fs.open(path, bufferSize); 822 underLyingStream.seek(start); 823 // the start of this file in the part file 824 this.start = start; 825 // the position pointer in the part file 826 this.position = start; 827 // the end pointer in the part file 828 this.end = start + length; 829 } 830 831 @Override 832 public synchronized int available() throws IOException { 833 long remaining = end - underLyingStream.getPos(); 834 if (remaining > (long)Integer.MAX_VALUE) { 835 return Integer.MAX_VALUE; 836 } 837 return (int) remaining; 838 } 839 840 @Override 841 public synchronized void close() throws IOException { 842 underLyingStream.close(); 843 super.close(); 844 } 845 846 //not implemented 847 @Override 848 public void mark(int readLimit) { 849 // do nothing 850 } 851 852 /** 853 * reset is not implemented 854 */ 855 @Override 856 public void reset() throws IOException { 857 throw new IOException("reset not implemented."); 858 } 859 860 @Override 861 public synchronized int read() throws IOException { 862 int ret = read(oneBytebuff, 0, 1); 863 return (ret <= 0) ? -1: (oneBytebuff[0] & 0xff); 864 } 865 866 @Override 867 public synchronized int read(byte[] b) throws IOException { 868 int ret = read(b, 0, b.length); 869 if (ret != -1) { 870 position += ret; 871 } 872 return ret; 873 } 874 875 /** 876 * 877 */ 878 @Override 879 public synchronized int read(byte[] b, int offset, int len) 880 throws IOException { 881 int newlen = len; 882 int ret = -1; 883 if (position + len > end) { 884 newlen = (int) (end - position); 885 } 886 // end case 887 if (newlen == 0) 888 return ret; 889 ret = underLyingStream.read(b, offset, newlen); 890 position += ret; 891 return ret; 892 } 893 894 @Override 895 public synchronized long skip(long n) throws IOException { 896 long tmpN = n; 897 if (tmpN > 0) { 898 if (position + tmpN > end) { 899 tmpN = end - position; 900 } 901 underLyingStream.seek(tmpN + position); 902 position += tmpN; 903 return tmpN; 904 } 905 return (tmpN < 0)? -1 : 0; 906 } 907 908 @Override 909 public synchronized long getPos() throws IOException { 910 return (position - start); 911 } 912 913 @Override 914 public synchronized void seek(long pos) throws IOException { 915 if (pos < 0 || (start + pos > end)) { 916 throw new IOException("Failed to seek: EOF"); 917 } 918 position = start + pos; 919 underLyingStream.seek(position); 920 } 921 922 @Override 923 public boolean seekToNewSource(long targetPos) throws IOException { 924 //do not need to implement this 925 // hdfs in itself does seektonewsource 926 // while reading. 927 return false; 928 } 929 930 /** 931 * implementing position readable. 932 */ 933 @Override 934 public int read(long pos, byte[] b, int offset, int length) 935 throws IOException { 936 int nlength = length; 937 if (start + nlength + pos > end) { 938 nlength = (int) (end - (start + pos)); 939 } 940 return underLyingStream.read(pos + start , b, offset, nlength); 941 } 942 943 /** 944 * position readable again. 945 */ 946 @Override 947 public void readFully(long pos, byte[] b, int offset, int length) 948 throws IOException { 949 if (start + length + pos > end) { 950 throw new IOException("Not enough bytes to read."); 951 } 952 underLyingStream.readFully(pos + start, b, offset, length); 953 } 954 955 @Override 956 public void readFully(long pos, byte[] b) throws IOException { 957 readFully(pos, b, 0, b.length); 958 } 959 960 } 961 962 /** 963 * constructors for har input stream. 964 * @param fs the underlying filesystem 965 * @param p The path in the underlying filesystem 966 * @param start the start position in the part file 967 * @param length the length of valid data in the part file 968 * @param bufsize the buffer size 969 * @throws IOException 970 */ 971 public HarFSDataInputStream(FileSystem fs, Path p, long start, 972 long length, int bufsize) throws IOException { 973 super(new HarFsInputStream(fs, p, start, length, bufsize)); 974 } 975 976 /** 977 * constructor for har input stream. 978 * @param fs the underlying filesystem 979 * @param p the path in the underlying file system 980 * @param start the start position in the part file 981 * @param length the length of valid data in the part file. 982 * @throws IOException 983 */ 984 public HarFSDataInputStream(FileSystem fs, Path p, long start, long length) 985 throws IOException { 986 super(new HarFsInputStream(fs, p, start, length, 0)); 987 } 988 } 989 990 private class HarMetaData { 991 private FileSystem fs; 992 private int version; 993 // the masterIndex of the archive 994 private Path masterIndexPath; 995 // the index file 996 private Path archiveIndexPath; 997 998 private long masterIndexTimestamp; 999 private long archiveIndexTimestamp; 1000 1001 List<Store> stores = new ArrayList<Store>(); 1002 Map<Path, HarStatus> archive = new HashMap<Path, HarStatus>(); 1003 private Map<Path, FileStatus> partFileStatuses = new HashMap<Path, FileStatus>(); 1004 1005 public HarMetaData(FileSystem fs, Path masterIndexPath, Path archiveIndexPath) { 1006 this.fs = fs; 1007 this.masterIndexPath = masterIndexPath; 1008 this.archiveIndexPath = archiveIndexPath; 1009 } 1010 1011 public FileStatus getPartFileStatus(Path partPath) throws IOException { 1012 FileStatus status; 1013 status = partFileStatuses.get(partPath); 1014 if (status == null) { 1015 status = fs.getFileStatus(partPath); 1016 partFileStatuses.put(partPath, status); 1017 } 1018 return status; 1019 } 1020 1021 public long getMasterIndexTimestamp() { 1022 return masterIndexTimestamp; 1023 } 1024 1025 public long getArchiveIndexTimestamp() { 1026 return archiveIndexTimestamp; 1027 } 1028 1029 private int getVersion() { 1030 return version; 1031 } 1032 1033 private void parseMetaData() throws IOException { 1034 Text line; 1035 long read; 1036 FSDataInputStream in = null; 1037 LineReader lin = null; 1038 1039 try { 1040 in = fs.open(masterIndexPath); 1041 FileStatus masterStat = fs.getFileStatus(masterIndexPath); 1042 masterIndexTimestamp = masterStat.getModificationTime(); 1043 lin = new LineReader(in, getConf()); 1044 line = new Text(); 1045 read = lin.readLine(line); 1046 1047 // the first line contains the version of the index file 1048 String versionLine = line.toString(); 1049 String[] arr = versionLine.split(" "); 1050 version = Integer.parseInt(arr[0]); 1051 // make it always backwards-compatible 1052 if (this.version > HarFileSystem.VERSION) { 1053 throw new IOException("Invalid version " + 1054 this.version + " expected " + HarFileSystem.VERSION); 1055 } 1056 1057 // each line contains a hashcode range and the index file name 1058 String[] readStr = null; 1059 while(read < masterStat.getLen()) { 1060 int b = lin.readLine(line); 1061 read += b; 1062 readStr = line.toString().split(" "); 1063 int startHash = Integer.parseInt(readStr[0]); 1064 int endHash = Integer.parseInt(readStr[1]); 1065 stores.add(new Store(Long.parseLong(readStr[2]), 1066 Long.parseLong(readStr[3]), startHash, 1067 endHash)); 1068 line.clear(); 1069 } 1070 } finally { 1071 IOUtils.cleanup(LOG, lin, in); 1072 } 1073 1074 FSDataInputStream aIn = fs.open(archiveIndexPath); 1075 try { 1076 FileStatus archiveStat = fs.getFileStatus(archiveIndexPath); 1077 archiveIndexTimestamp = archiveStat.getModificationTime(); 1078 LineReader aLin; 1079 1080 // now start reading the real index file 1081 for (Store s: stores) { 1082 read = 0; 1083 aIn.seek(s.begin); 1084 aLin = new LineReader(aIn, getConf()); 1085 while (read + s.begin < s.end) { 1086 int tmp = aLin.readLine(line); 1087 read += tmp; 1088 String lineFeed = line.toString(); 1089 String[] parsed = lineFeed.split(" "); 1090 parsed[0] = decodeFileName(parsed[0]); 1091 archive.put(new Path(parsed[0]), new HarStatus(lineFeed)); 1092 line.clear(); 1093 } 1094 } 1095 } finally { 1096 IOUtils.cleanup(LOG, aIn); 1097 } 1098 } 1099 } 1100 1101 /* 1102 * testing purposes only: 1103 */ 1104 HarMetaData getMetadata() { 1105 return metadata; 1106 } 1107 }