/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.fs;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.HashMap;
import java.util.concurrent.ConcurrentHashMap;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.LineReader;
import org.apache.hadoop.util.Progressable;

/**
 * This is an implementation of the Hadoop Archive
 * filesystem. This archive filesystem has index files
 * of the form _index* and contents of the form
 * part-*. The index files store the indexes of the
 * real files. The index files are of the form _masterindex
 * and _index. The master index is a level of indirection
 * into the index file that makes lookups faster: the index
 * file is sorted by the hash code of the paths that it contains,
 * and the master index contains pointers to the positions in
 * the index for ranges of hash codes.
 */
public class HarFileSystem extends FilterFileSystem {

  private static final Log LOG = LogFactory.getLog(HarFileSystem.class);

  public static final int VERSION = 3;

  private static final Map<URI, HarMetaData> harMetaCache =
      new ConcurrentHashMap<URI, HarMetaData>();

  // URI representation of this Har filesystem
  private URI uri;
  // the top level path of the archive
  // in the underlying file system
  private Path archivePath;
  // the har auth
  private String harAuth;

  // pointer into the static metadata cache
  private HarMetaData metadata;

  /**
   * Public constructor for the Har filesystem.
   */
  public HarFileSystem() {
  }

  /**
   * Return the protocol scheme for the FileSystem.
   * <p/>
   *
   * @return <code>har</code>
   */
  @Override
  public String getScheme() {
    return "har";
  }

  /**
   * Constructor to create a HarFileSystem with an
   * underlying filesystem.
   * @param fs the underlying filesystem
   */
  public HarFileSystem(FileSystem fs) {
    super(fs);
  }

  /**
   * Initialize a Har filesystem per har archive. The
   * archive home directory is the top level directory
   * in the filesystem that contains the HAR archive.
   * Be careful with this method, you do not want to go
   * on creating new filesystem instances per call to
   * path.getFileSystem().
   * The uri of a Har filesystem is
   * har://underlyingfsscheme-host:port/archivepath
   * or
   * har:///archivepath. The latter assumes the default
   * underlying filesystem is to be used.
   */
  @Override
  public void initialize(URI name, Configuration conf) throws IOException {
    // decode the name
    URI underLyingURI = decodeHarURI(name, conf);
    // we got the right har Path- now check if this is
    // truly a har filesystem
    Path harPath = archivePath(
        new Path(name.getScheme(), name.getAuthority(), name.getPath()));
    if (harPath == null) {
      throw new IOException("Invalid path for the Har Filesystem. " +
          name.toString());
    }
    if (fs == null) {
      fs = FileSystem.get(underLyingURI, conf);
    }
    uri = harPath.toUri();
    archivePath = new Path(uri.getPath());
    harAuth = getHarAuth(underLyingURI);
    // check for the underlying fs containing
    // the index file
    Path masterIndexPath = new Path(archivePath, "_masterindex");
    Path archiveIndexPath = new Path(archivePath, "_index");
    if (!fs.exists(masterIndexPath) || !fs.exists(archiveIndexPath)) {
      throw new IOException("Invalid path for the Har Filesystem. " +
          "No index file in " + harPath);
    }

    metadata = harMetaCache.get(uri);
    if (metadata != null) {
      FileStatus mStat = fs.getFileStatus(masterIndexPath);
      FileStatus aStat = fs.getFileStatus(archiveIndexPath);
      if (mStat.getModificationTime() != metadata.getMasterIndexTimestamp() ||
          aStat.getModificationTime() != metadata.getArchiveIndexTimestamp()) {
        // the archive has been overwritten since we last read it
        // remove the entry from the metadata cache
        metadata = null;
        harMetaCache.remove(uri);
      }
    }
    if (metadata == null) {
      metadata = new HarMetaData(fs, masterIndexPath, archiveIndexPath);
      metadata.parseMetaData();
      harMetaCache.put(uri, metadata);
    }
  }

  // get the version of the filesystem from the masterindex file
  // the version is currently not useful since it is the first version
  // of archives
  public int getHarVersion() throws IOException {
    if (metadata != null) {
      return metadata.getVersion();
    } else {
      throw new IOException("Invalid meta data for the Har Filesystem");
    }
  }

  /*
   * find the parent path that is the
   * archive path in the path. The last
   * path segment that ends with .har is
   * the path that will be returned.
   */
  private Path archivePath(Path p) {
    Path retPath = null;
    Path tmp = p;
    for (int i = 0; i < p.depth(); i++) {
      if (tmp.toString().endsWith(".har")) {
        retPath = tmp;
        break;
      }
      tmp = tmp.getParent();
    }
    return retPath;
  }

  /**
   * Decode the raw URI to get the underlying URI.
   * @param rawURI raw Har URI
   * @return filtered URI of the underlying fileSystem
   */
  private URI decodeHarURI(URI rawURI, Configuration conf) throws IOException {
    String tmpAuth = rawURI.getAuthority();
    // we are using the default file
    // system in the config
    // so create an underlying uri and
    // return it
    if (tmpAuth == null) {
      // create a path
      return FileSystem.getDefaultUri(conf);
    }
    String authority = rawURI.getAuthority();
    if (authority == null) {
      throw new IOException("URI: " + rawURI
          + " is an invalid Har URI since authority==null."
          + " Expecting har://<scheme>-<host>/<path>.");
    }

    int i = authority.indexOf('-');
    if (i < 0) {
      throw new IOException("URI: " + rawURI
          + " is an invalid Har URI since '-' not found."
          + " Expecting har://<scheme>-<host>/<path>.");
    }

    if (rawURI.getQuery() != null) {
      // query component not allowed
      throw new IOException("query component in Path not supported " + rawURI);
    }

    URI tmp = null;

    try {
      // convert <scheme>-<host> to <scheme>://<host>
      URI baseUri = new URI(authority.replaceFirst("-", "://"));

      tmp = new URI(baseUri.getScheme(), baseUri.getAuthority(),
          rawURI.getPath(), rawURI.getQuery(), rawURI.getFragment());
    } catch (URISyntaxException e) {
      throw new IOException("URI: " + rawURI
          + " is an invalid Har URI. Expecting har://<scheme>-<host>/<path>.");
    }
    return tmp;
  }

  private static String decodeString(String str)
      throws UnsupportedEncodingException {
    return URLDecoder.decode(str, "UTF-8");
  }

  private String decodeFileName(String fname)
      throws UnsupportedEncodingException {
    int version = metadata.getVersion();
    if (version == 2 || version == 3) {
      return decodeString(fname);
    }
    return fname;
  }

  /**
   * return the top level archive.
   */
  @Override
  public Path getWorkingDirectory() {
    return new Path(uri.toString());
  }

  /**
   * Create a har specific auth of the form
   * har-underlyingfs:port
   * @param underLyingUri the uri of the underlying
   * filesystem
   * @return har specific auth
   */
  private String getHarAuth(URI underLyingUri) {
    String auth = underLyingUri.getScheme() + "-";
    if (underLyingUri.getHost() != null) {
      auth += underLyingUri.getHost() + ":";
      if (underLyingUri.getPort() != -1) {
        auth += underLyingUri.getPort();
      }
    } else {
      auth += ":";
    }
    return auth;
  }

  /**
   * Returns the uri of this filesystem.
   * The uri is of the form
   * har://underlyingfsschema-host:port/pathintheunderlyingfs
   */
  @Override
  public URI getUri() {
    return this.uri;
  }

  /**
   * This method returns the relative path
   * inside the har filesystem.
   * @param path the fully qualified path in the har filesystem.
   * @return relative path in the filesystem.
   */
  private Path getPathInHar(Path path) {
    Path harPath = new Path(path.toUri().getPath());
    if (archivePath.compareTo(harPath) == 0)
      return new Path(Path.SEPARATOR);
    Path tmp = new Path(harPath.getName());
    Path parent = harPath.getParent();
    while (!(parent.compareTo(archivePath) == 0)) {
      if (parent.toString().equals(Path.SEPARATOR)) {
        tmp = null;
        break;
      }
      tmp = new Path(parent.getName(), tmp);
      parent = parent.getParent();
    }
    if (tmp != null)
      tmp = new Path(Path.SEPARATOR, tmp);
    return tmp;
  }

  // the relative path of p. basically
  // getting rid of /. Parsing and doing
  // string manipulation is not good - so
  // just use the path api to do it.
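  // For illustration (hypothetical values): with initial = "/user/out.har"
  // and p = "/dir1/file1", the result is the fully qualified har path
  // "har://<harAuth>/user/out.har/dir1/file1"; toFileStatus() uses this to
  // re-root an in-archive name under the archive's own qualified path.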
  private Path makeRelative(String initial, Path p) {
    String scheme = this.uri.getScheme();
    String authority = this.uri.getAuthority();
    Path root = new Path(Path.SEPARATOR);
    if (root.compareTo(p) == 0)
      return new Path(scheme, authority, initial);
    Path retPath = new Path(p.getName());
    Path parent = p.getParent();
    for (int i = 0; i < p.depth() - 1; i++) {
      retPath = new Path(parent.getName(), retPath);
      parent = parent.getParent();
    }
    return new Path(new Path(scheme, authority, initial),
        retPath.toString());
  }

  /* this makes a path qualified in the har filesystem
   * (non-Javadoc)
   * @see org.apache.hadoop.fs.FilterFileSystem#makeQualified(
   * org.apache.hadoop.fs.Path)
   */
  @Override
  public Path makeQualified(Path path) {
    // make sure that we just get the
    // path component
    Path fsPath = path;
    if (!path.isAbsolute()) {
      fsPath = new Path(archivePath, path);
    }

    URI tmpURI = fsPath.toUri();
    // change this to Har uri
    return new Path(uri.getScheme(), harAuth, tmpURI.getPath());
  }

  /**
   * Fix offset and length of block locations.
   * Note that this method modifies the original array.
   * @param locations block locations of har part file
   * @param start the start of the desired range in the contained file
   * @param len the length of the desired range
   * @param fileOffsetInHar the offset of the desired file in the har part file
   * @return block locations with fixed offset and length
   */
  static BlockLocation[] fixBlockLocations(BlockLocation[] locations,
                                           long start,
                                           long len,
                                           long fileOffsetInHar) {
    // offset 1 past last byte of desired range
    long end = start + len;

    for (BlockLocation location : locations) {
      // offset of part block relative to beginning of desired file
      // (may be negative if file starts in this part block)
      long harBlockStart = location.getOffset() - fileOffsetInHar;
      // offset 1 past last byte of har block relative to beginning of
      // desired file
      long harBlockEnd = harBlockStart + location.getLength();

      if (start > harBlockStart) {
        // desired range starts after beginning of this har block
        // fix offset to beginning of relevant range (relative to desired file)
        location.setOffset(start);
        // fix length to relevant portion of har block
        location.setLength(location.getLength() - (start - harBlockStart));
      } else {
        // desired range includes beginning of this har block
        location.setOffset(harBlockStart);
      }

      if (harBlockEnd > end) {
        // range ends before end of this har block
        // fix length to remove irrelevant portion at the end
        location.setLength(location.getLength() - (harBlockEnd - end));
      }
    }

    return locations;
  }

  /**
   * Get block locations from the underlying fs and fix their
   * offsets and lengths.
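   *
   * As a sketch of the coordinate shift: a contained file that begins at
   * offset F within its part file maps the requested range
   * [start, start + len) to the part-file range [F + start, F + start + len);
   * the block locations returned by the underlying filesystem are then
   * translated back into file-relative offsets by fixBlockLocations().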
   * @param file the input filestatus to get block locations
   * @param start the start of the desired range in the contained file
   * @param len the length of the desired range
   * @return block locations for this segment of file
   * @throws IOException
   */
  @Override
  public BlockLocation[] getFileBlockLocations(FileStatus file, long start,
      long len) throws IOException {
    HarStatus hstatus = getFileHarStatus(file.getPath());
    Path partPath = new Path(archivePath, hstatus.getPartName());
    FileStatus partStatus = metadata.getPartFileStatus(partPath);

    // get all part blocks that overlap with the desired file blocks
    BlockLocation[] locations =
        fs.getFileBlockLocations(partStatus,
            hstatus.getStartIndex() + start, len);

    return fixBlockLocations(locations, start, len, hstatus.getStartIndex());
  }

  /**
   * The hash of the path p inside the filesystem.
   * @param p the path in the har filesystem
   * @return the hash code of the path.
   */
  public static int getHarHash(Path p) {
    return (p.toString().hashCode() & 0x7fffffff);
  }

  static class Store {
    public Store() {
      begin = end = startHash = endHash = 0;
    }
    public Store(long begin, long end, int startHash, int endHash) {
      this.begin = begin;
      this.end = end;
      this.startHash = startHash;
      this.endHash = endHash;
    }
    public long begin;
    public long end;
    public int startHash;
    public int endHash;
  }

  /**
   * Get filestatuses of all the children of a given directory. This just reads
   * through the index file, line by line, to get all statuses for the children
   * of a directory. It is a brute force way of getting all such filestatuses.
   *
   * @param parent
   *          the parent path directory
   * @param statuses
   *          the list to add the children filestatuses to
   * @param children
   *          the string list of children for this parent
   */
  private void fileStatusesInIndex(HarStatus parent, List<FileStatus> statuses,
      List<String> children) throws IOException {
    String parentString = parent.getName();
    if (!parentString.endsWith(Path.SEPARATOR)) {
      parentString += Path.SEPARATOR;
    }
    Path harPath = new Path(parentString);
    int harlen = harPath.depth();
    final Map<String, FileStatus> cache = new TreeMap<String, FileStatus>();

    for (HarStatus hstatus : metadata.archive.values()) {
      String child = hstatus.getName();
      if ((child.startsWith(parentString))) {
        Path thisPath = new Path(child);
        if (thisPath.depth() == harlen + 1) {
          statuses.add(toFileStatus(hstatus, cache));
        }
      }
    }
  }

  /**
   * Combine the status stored in the index and the underlying status.
   * @param h status stored in the index
   * @param cache caching the underlying file statuses
   * @return the combined file status
   * @throws IOException
   */
  private FileStatus toFileStatus(HarStatus h,
      Map<String, FileStatus> cache) throws IOException {
    FileStatus underlying = null;
    if (cache != null) {
      underlying = cache.get(h.partName);
    }
    if (underlying == null) {
      final Path p = h.isDir ?
          archivePath : new Path(archivePath, h.partName);
      underlying = fs.getFileStatus(p);
      if (cache != null) {
        cache.put(h.partName, underlying);
      }
    }

    long modTime = 0;
    int version = metadata.getVersion();
    if (version < 3) {
      modTime = underlying.getModificationTime();
    } else if (version == 3) {
      modTime = h.getModificationTime();
    }

    return new FileStatus(
        h.isDir() ? 0L : h.getLength(),
        h.isDir(),
        underlying.getReplication(),
        underlying.getBlockSize(),
        modTime,
        underlying.getAccessTime(),
        underlying.getPermission(),
        underlying.getOwner(),
        underlying.getGroup(),
        makeRelative(this.uri.getPath(), new Path(h.name)));
  }

  // a single line parser for hadoop archives status
  // stored in a single line in the index files
  // the format is of the form
  // filename "dir"/"file" partFileName startIndex length
  // <space separated children>
  private class HarStatus {
    boolean isDir;
    String name;
    List<String> children;
    String partName;
    long startIndex;
    long length;
    long modificationTime = 0;

    public HarStatus(String harString) throws UnsupportedEncodingException {
      String[] splits = harString.split(" ");
      this.name = decodeFileName(splits[0]);
      this.isDir = "dir".equals(splits[1]);
      // this is equal to "none" if it is a directory
      this.partName = splits[2];
      this.startIndex = Long.parseLong(splits[3]);
      this.length = Long.parseLong(splits[4]);

      int version = metadata.getVersion();
      String[] propSplits = null;
      // propSplits is used to retrieve the metainformation that Har versions
      // 1 & 2 missed (modification time, permission, owner group).
      // These fields are stored in an encoded string placed in different
      // locations depending on whether it's a file or directory entry.
      // If it's a directory, the string will be placed at the partName
      // location (directories have no partName because they don't have data
      // to be stored). This is done because the number of fields in a
      // directory entry is unbounded (all children are listed at the end).
      // If it's a file, the string will be the last field.
      if (isDir) {
        if (version == 3) {
          propSplits = decodeString(this.partName).split(" ");
        }
        children = new ArrayList<String>();
        for (int i = 5; i < splits.length; i++) {
          children.add(decodeFileName(splits[i]));
        }
      } else if (version == 3) {
        propSplits = decodeString(splits[5]).split(" ");
      }

      if (propSplits != null && propSplits.length >= 4) {
        modificationTime = Long.parseLong(propSplits[0]);
        // the fields below are stored in the file but are currently not used
        // by HarFileSystem
        // permission = new FsPermission(Short.parseShort(propSplits[1]));
        // owner = decodeString(propSplits[2]);
        // group = decodeString(propSplits[3]);
      }
    }
    public boolean isDir() {
      return isDir;
    }

    public String getName() {
      return name;
    }
    public String getPartName() {
      return partName;
    }
    public long getStartIndex() {
      return startIndex;
    }
    public long getLength() {
      return length;
    }
    public long getModificationTime() {
      return modificationTime;
    }
  }

  /**
   * Return the filestatus of files in the har archive.
   * The permissions returned are those of the archive
   * index files. The permissions are not persisted
   * while creating a hadoop archive.
   * @param f the path in har filesystem
   * @return filestatus.
   * @throws IOException
   */
  @Override
  public FileStatus getFileStatus(Path f) throws IOException {
    HarStatus hstatus = getFileHarStatus(f);
    return toFileStatus(hstatus, null);
  }

  private HarStatus getFileHarStatus(Path f) throws IOException {
    // get the fs DataInputStream for the underlying file
    // look up the index.
    Path p = makeQualified(f);
    Path harPath = getPathInHar(p);
    if (harPath == null) {
      throw new IOException("Invalid file name: " + f + " in " + uri);
    }
    HarStatus hstatus = metadata.archive.get(harPath);
    if (hstatus == null) {
      throw new FileNotFoundException("File: " + f + " does not exist in " + uri);
    }
    return hstatus;
  }

  /**
   * @return null since no checksum algorithm is implemented.
   */
  @Override
  public FileChecksum getFileChecksum(Path f) {
    return null;
  }

  /**
   * Returns a har input stream which fakes end of
   * file. It reads the index files to get the part
   * file name and the size and start of the file.
   */
  @Override
  public FSDataInputStream open(Path f, int bufferSize) throws IOException {
    // get the fs DataInputStream for the underlying file
    HarStatus hstatus = getFileHarStatus(f);
    // we got it.. woo hooo!!!
    if (hstatus.isDir()) {
      throw new FileNotFoundException(f + " : not a file in " +
          archivePath);
    }
    return new HarFSDataInputStream(fs, new Path(archivePath,
        hstatus.getPartName()),
        hstatus.getStartIndex(), hstatus.getLength(), bufferSize);
  }

  @Override
  public FSDataOutputStream create(Path f,
      FsPermission permission,
      boolean overwrite,
      int bufferSize,
      short replication,
      long blockSize,
      Progressable progress) throws IOException {
    throw new IOException("Har: create not allowed.");
  }

  @Override
  public void close() throws IOException {
    if (fs != null) {
      try {
        fs.close();
      } catch (IOException ie) {
        // this might already be closed
        // ignore
      }
    }
  }

  /**
   * Not implemented.
   */
  @Override
  public boolean setReplication(Path src, short replication) throws IOException {
    throw new IOException("Har: setreplication not allowed");
  }

  /**
   * Not implemented.
   */
  @Override
  public boolean delete(Path f, boolean recursive) throws IOException {
    throw new IOException("Har: delete not allowed");
  }

  /**
   * listStatus returns the children of a directory
   * after looking up the index files.
   */
  @Override
  public FileStatus[] listStatus(Path f) throws IOException {
    // need to see if the file is an index in file
    // get the filestatus of the archive directory
    // we will create fake filestatuses to return
    // to the client
    List<FileStatus> statuses = new ArrayList<FileStatus>();
    Path tmpPath = makeQualified(f);
    Path harPath = getPathInHar(tmpPath);
    HarStatus hstatus = metadata.archive.get(harPath);
    if (hstatus == null) {
      throw new FileNotFoundException("File " + f + " not found in " + archivePath);
    }
    if (hstatus.isDir()) {
      fileStatusesInIndex(hstatus, statuses, hstatus.children);
    } else {
      statuses.add(toFileStatus(hstatus, null));
    }

    return statuses.toArray(new FileStatus[statuses.size()]);
  }

  /**
   * return the top level archive path.
   */
  @Override
  public Path getHomeDirectory() {
    return new Path(uri.toString());
  }

  @Override
  public void setWorkingDirectory(Path newDir) {
    // does nothing.
  }

  /**
   * Not implemented.
   */
  @Override
  public boolean mkdirs(Path f, FsPermission permission) throws IOException {
    throw new IOException("Har: mkdirs not allowed");
  }

  /**
   * Not implemented.
   */
  @Override
  public void copyFromLocalFile(boolean delSrc, Path src, Path dst)
      throws IOException {
    throw new IOException("Har: copyfromlocalfile not allowed");
  }

  /**
   * Copies the file in the har filesystem to a local file.
   */
  @Override
  public void copyToLocalFile(boolean delSrc, Path src, Path dst)
      throws IOException {
    FileUtil.copy(this, src, getLocal(getConf()), dst, false, getConf());
  }

  /**
   * Not implemented.
   */
  @Override
  public Path startLocalOutput(Path fsOutputFile, Path tmpLocalFile)
      throws IOException {
    throw new IOException("Har: startLocalOutput not allowed");
  }

  /**
   * Not implemented.
   */
  @Override
  public void completeLocalOutput(Path fsOutputFile, Path tmpLocalFile)
      throws IOException {
    throw new IOException("Har: completeLocalOutput not allowed");
  }

  /**
   * Not implemented.
   */
  @Override
  public void setOwner(Path p, String username, String groupname)
      throws IOException {
    throw new IOException("Har: setowner not allowed");
  }

  /**
   * Not implemented.
   */
  @Override
  public void setPermission(Path p, FsPermission permission)
      throws IOException {
    throw new IOException("Har: setPermission not allowed");
  }

  /**
   * Hadoop archives input stream. This input stream fakes EOF
   * since archive files are part of bigger part files.
   */
  private static class HarFSDataInputStream extends FSDataInputStream {
    /**
     * Create an input stream that fakes all the reads/positions/seeking.
     */
    private static class HarFsInputStream extends FSInputStream {
      private long position, start, end;
      // The underlying data input stream that the
      // underlying filesystem will return.
      private FSDataInputStream underLyingStream;
      // one byte buffer
      private byte[] oneBytebuff = new byte[1];

      HarFsInputStream(FileSystem fs, Path path, long start,
          long length, int bufferSize) throws IOException {
        underLyingStream = fs.open(path, bufferSize);
        underLyingStream.seek(start);
        // the start of this file in the part file
        this.start = start;
        // the position pointer in the part file
        this.position = start;
        // the end pointer in the part file
        this.end = start + length;
      }

      @Override
      public synchronized int available() throws IOException {
        long remaining = end - underLyingStream.getPos();
        if (remaining > (long) Integer.MAX_VALUE) {
          return Integer.MAX_VALUE;
        }
        return (int) remaining;
      }

      @Override
      public synchronized void close() throws IOException {
        underLyingStream.close();
        super.close();
      }

      // not implemented
      @Override
      public void mark(int readLimit) {
        // do nothing
      }

      /**
       * reset is not implemented
       */
      @Override
      public void reset() throws IOException {
        throw new IOException("reset not implemented.");
      }

      @Override
      public synchronized int read() throws IOException {
        int ret = read(oneBytebuff, 0, 1);
        return (ret <= 0) ? -1 : (oneBytebuff[0] & 0xff);
      }

      @Override
      public synchronized int read(byte[] b) throws IOException {
        // delegate to the ranged read; it already advances the position
        // pointer, so it must not be advanced a second time here
        return read(b, 0, b.length);
      }

      @Override
      public synchronized int read(byte[] b, int offset, int len)
          throws IOException {
        int newlen = len;
        int ret = -1;
        if (position + len > end) {
          newlen = (int) (end - position);
        }
        // end case
        if (newlen == 0)
          return ret;
        ret = underLyingStream.read(b, offset, newlen);
        // only advance the pointer for bytes actually read
        if (ret > 0) {
          position += ret;
        }
        return ret;
      }

      @Override
      public synchronized long skip(long n) throws IOException {
        long tmpN = n;
        if (tmpN > 0) {
          if (position + tmpN > end) {
            tmpN = end - position;
          }
          underLyingStream.seek(tmpN + position);
          position += tmpN;
          return tmpN;
        }
        return (tmpN < 0) ? -1 : 0;
      }

      @Override
      public synchronized long getPos() throws IOException {
        return (position - start);
      }

      @Override
      public synchronized void seek(long pos) throws IOException {
        if (pos < 0 || (start + pos > end)) {
          throw new IOException("Failed to seek: EOF");
        }
        position = start + pos;
        underLyingStream.seek(position);
      }

      @Override
      public boolean seekToNewSource(long targetPos) throws IOException {
        // do not need to implement this
        // hdfs in itself does seekToNewSource
        // while reading.
        return false;
      }

      /**
       * implementing position readable.
       */
      @Override
      public int read(long pos, byte[] b, int offset, int length)
          throws IOException {
        int nlength = length;
        if (start + nlength + pos > end) {
          nlength = (int) (end - (start + pos));
        }
        return underLyingStream.read(pos + start, b, offset, nlength);
      }

      /**
       * position readable again.
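       * A note on the bounds check below: the archived file occupies
       * [start, end) of the part file, so reading length bytes at
       * file-relative pos is only possible when start + pos + length <= end;
       * otherwise an IOException is thrown.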
       */
      @Override
      public void readFully(long pos, byte[] b, int offset, int length)
          throws IOException {
        if (start + length + pos > end) {
          throw new IOException("Not enough bytes to read.");
        }
        underLyingStream.readFully(pos + start, b, offset, length);
      }

      @Override
      public void readFully(long pos, byte[] b) throws IOException {
        readFully(pos, b, 0, b.length);
      }

    }

    /**
     * constructors for har input stream.
     * @param fs the underlying filesystem
     * @param p the path in the underlying filesystem
     * @param start the start position in the part file
     * @param length the length of valid data in the part file
     * @param bufsize the buffer size
     * @throws IOException
     */
    public HarFSDataInputStream(FileSystem fs, Path p, long start,
        long length, int bufsize) throws IOException {
      super(new HarFsInputStream(fs, p, start, length, bufsize));
    }

    /**
     * constructor for har input stream.
     * @param fs the underlying filesystem
     * @param p the path in the underlying file system
     * @param start the start position in the part file
     * @param length the length of valid data in the part file.
     * @throws IOException
     */
    public HarFSDataInputStream(FileSystem fs, Path p, long start, long length)
        throws IOException {
      super(new HarFsInputStream(fs, p, start, length, 0));
    }
  }

  private class HarMetaData {
    private FileSystem fs;
    private int version;
    // the masterIndex of the archive
    private Path masterIndexPath;
    // the index file
    private Path archiveIndexPath;

    private long masterIndexTimestamp;
    private long archiveIndexTimestamp;

    List<Store> stores = new ArrayList<Store>();
    Map<Path, HarStatus> archive = new HashMap<Path, HarStatus>();
    private Map<Path, FileStatus> partFileStatuses = new HashMap<Path, FileStatus>();

    public HarMetaData(FileSystem fs, Path masterIndexPath, Path archiveIndexPath) {
      this.fs = fs;
      this.masterIndexPath = masterIndexPath;
      this.archiveIndexPath = archiveIndexPath;
    }

    public FileStatus getPartFileStatus(Path partPath) throws IOException {
      FileStatus status;
      status = partFileStatuses.get(partPath);
      if (status == null) {
        status = fs.getFileStatus(partPath);
        partFileStatuses.put(partPath, status);
      }
      return status;
    }

    public long getMasterIndexTimestamp() {
      return masterIndexTimestamp;
    }

    public long getArchiveIndexTimestamp() {
      return archiveIndexTimestamp;
    }

    private int getVersion() {
      return version;
    }

    private void parseMetaData() throws IOException {
      Text line;
      long read;
      FSDataInputStream in = null;
      LineReader lin = null;

      try {
        in = fs.open(masterIndexPath);
        FileStatus masterStat = fs.getFileStatus(masterIndexPath);
        masterIndexTimestamp = masterStat.getModificationTime();
        lin = new LineReader(in, getConf());
        line = new Text();
        read = lin.readLine(line);

        // the first line contains the version of the index file
        String versionLine = line.toString();
        String[] arr = versionLine.split(" ");
        version = Integer.parseInt(arr[0]);
        // make it always backwards-compatible
        if (this.version > HarFileSystem.VERSION) {
          throw new IOException("Invalid version " +
              this.version + " expected " + HarFileSystem.VERSION);
        }

        // each remaining line contains a hashcode range and the start/end
        // offsets of that range within the index file
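        // For illustration, a master index line might look like (hypothetical
        // values): "1215034164 1780454348 68 118", meaning that entries whose
        // path hashes fall in [1215034164, 1780454348] live between byte
        // offsets 68 and 118 of the _index file.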
        String[] readStr = null;
        while (read < masterStat.getLen()) {
          int b = lin.readLine(line);
          read += b;
          readStr = line.toString().split(" ");
          int startHash = Integer.parseInt(readStr[0]);
          int endHash = Integer.parseInt(readStr[1]);
          stores.add(new Store(Long.parseLong(readStr[2]),
              Long.parseLong(readStr[3]), startHash,
              endHash));
          line.clear();
        }
      } finally {
        IOUtils.cleanup(LOG, lin, in);
      }

      FSDataInputStream aIn = fs.open(archiveIndexPath);
      try {
        FileStatus archiveStat = fs.getFileStatus(archiveIndexPath);
        archiveIndexTimestamp = archiveStat.getModificationTime();
        LineReader aLin;

        // now start reading the real index file
        for (Store s : stores) {
          read = 0;
          aIn.seek(s.begin);
          aLin = new LineReader(aIn, getConf());
          while (read + s.begin < s.end) {
            int tmp = aLin.readLine(line);
            read += tmp;
            String lineFeed = line.toString();
            String[] parsed = lineFeed.split(" ");
            parsed[0] = decodeFileName(parsed[0]);
            archive.put(new Path(parsed[0]), new HarStatus(lineFeed));
            line.clear();
          }
        }
      } finally {
        IOUtils.cleanup(LOG, aIn);
      }
    }
  }

  /*
   * testing purposes only:
   */
  HarMetaData getMetadata() {
    return metadata;
  }
}