/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.fs;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.HashMap;
import java.util.concurrent.ConcurrentHashMap;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.LineReader;
import org.apache.hadoop.util.Progressable;

/**
 * This is an implementation of the Hadoop Archive
 * Filesystem. This archive Filesystem has index files
 * of the form _index* and has contents of the form
 * part-*. The index files store the indexes of the
 * real files. The index files are of the form _masterindex
 * and _index. The master index is a level of indirection
 * into the index file to make the lookups faster. The index
 * file is sorted by the hash code of the paths that it contains
 * and the master index contains pointers to the positions in
 * the index for ranges of hash codes.
 */
public class HarFileSystem extends FilterFileSystem {
  public static final int VERSION = 3;

  private static final Map<URI, HarMetaData> harMetaCache =
      new ConcurrentHashMap<URI, HarMetaData>();

  // uri representation of this Har filesystem
  private URI uri;
  // the top level path of the archive
  // in the underlying file system
  private Path archivePath;
  // the har auth
  private String harAuth;

  // pointer into the static metadata cache
  private HarMetaData metadata;

  /**
   * Public no-argument constructor for the Har filesystem.
   */
  public HarFileSystem() {
  }

  /**
   * Return the protocol scheme for the FileSystem.
   * <p/>
   *
   * @return <code>har</code>
   */
  @Override
  public String getScheme() {
    return "har";
  }

  /**
   * Constructor to create a HarFileSystem with an
   * underlying filesystem.
   * @param fs the underlying filesystem
   */
  public HarFileSystem(FileSystem fs) {
    super(fs);
  }

  /**
   * Initialize a Har filesystem per har archive. The
   * archive home directory is the top level directory
   * in the filesystem that contains the HAR archive.
   * Be careful with this method, you do not want to go
   * on creating new Filesystem instances per call to
   * path.getFileSystem().
   * The uri of a Har filesystem is of the form
   * har://underlyingfsscheme-host:port/archivepath
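   * (for example, har://hdfs-namenode:8020/user/foo/sample.har, where the
   * host, port and archive path are hypothetical placeholders),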
   * or har:///archivepath, which assumes the default underlying filesystem
   * when none is specified.
   */
  @Override
  public void initialize(URI name, Configuration conf) throws IOException {
    // decode the name
    URI underLyingURI = decodeHarURI(name, conf);
    // we got the right har Path - now check if this is
    // truly a har filesystem
    Path harPath = archivePath(
        new Path(name.getScheme(), name.getAuthority(), name.getPath()));
    if (harPath == null) {
      throw new IOException("Invalid path for the Har Filesystem. " +
          name.toString());
    }
    if (fs == null) {
      fs = FileSystem.get(underLyingURI, conf);
    }
    uri = harPath.toUri();
    archivePath = new Path(uri.getPath());
    harAuth = getHarAuth(underLyingURI);
    // check that the underlying fs contains
    // the index files
    Path masterIndexPath = new Path(archivePath, "_masterindex");
    Path archiveIndexPath = new Path(archivePath, "_index");
    if (!fs.exists(masterIndexPath) || !fs.exists(archiveIndexPath)) {
      throw new IOException("Invalid path for the Har Filesystem. " +
          "No index file in " + harPath);
    }

    metadata = harMetaCache.get(uri);
    if (metadata != null) {
      FileStatus mStat = fs.getFileStatus(masterIndexPath);
      FileStatus aStat = fs.getFileStatus(archiveIndexPath);
      if (mStat.getModificationTime() != metadata.getMasterIndexTimestamp() ||
          aStat.getModificationTime() != metadata.getArchiveIndexTimestamp()) {
        // the archive has been overwritten since we last read it;
        // remove the entry from the metadata cache
        metadata = null;
        harMetaCache.remove(uri);
      }
    }
    if (metadata == null) {
      metadata = new HarMetaData(fs, masterIndexPath, archiveIndexPath);
      metadata.parseMetaData();
      harMetaCache.put(uri, metadata);
    }
  }

  // get the version of the filesystem from the masterindex file
  // the version is currently not useful since it's the first version
  // of archives
  public int getHarVersion() throws IOException {
    if (metadata != null) {
      return metadata.getVersion();
    } else {
      throw new IOException("Invalid meta data for the Har Filesystem");
    }
  }

  /*
   * find the parent path that is the
   * archive path in the path. The last
   * path segment that ends with .har is
   * the path that will be returned.
   */
  private Path archivePath(Path p) {
    Path retPath = null;
    Path tmp = p;
    for (int i = 0; i < p.depth(); i++) {
      if (tmp.toString().endsWith(".har")) {
        retPath = tmp;
        break;
      }
      tmp = tmp.getParent();
    }
    return retPath;
  }

  /**
   * decode the raw URI to get the underlying URI
   * @param rawURI raw Har URI
   * @param conf the configuration of this filesystem
   * @return filtered URI of the underlying fileSystem
   */
  private URI decodeHarURI(URI rawURI, Configuration conf) throws IOException {
    String tmpAuth = rawURI.getAuthority();
    // we are using the default file
    // system in the config
    // so create an underlying uri and
    // return it
    if (tmpAuth == null) {
      // create a path
      return FileSystem.getDefaultUri(conf);
    }
    String host = rawURI.getHost();
    if (host == null) {
      throw new IOException("URI: " + rawURI
          + " is an invalid Har URI since host==null."
          + " Expecting har://<scheme>-<host>/<path>.");
    }
    int i = host.indexOf('-');
    if (i < 0) {
      throw new IOException("URI: " + rawURI
          + " is an invalid Har URI since '-' not found."
211 + " Expecting har://<scheme>-<host>/<path>."); 212 } 213 final String underLyingScheme = host.substring(0, i); 214 i++; 215 final String underLyingHost = i == host.length()? null: host.substring(i); 216 int underLyingPort = rawURI.getPort(); 217 String auth = (underLyingHost == null && underLyingPort == -1)? 218 null:(underLyingHost+ 219 (underLyingPort == -1 ? "" : ":"+underLyingPort)); 220 URI tmp = null; 221 if (rawURI.getQuery() != null) { 222 // query component not allowed 223 throw new IOException("query component in Path not supported " + rawURI); 224 } 225 try { 226 tmp = new URI(underLyingScheme, auth, rawURI.getPath(), 227 rawURI.getQuery(), rawURI.getFragment()); 228 } catch (URISyntaxException e) { 229 // do nothing should not happen 230 } 231 return tmp; 232 } 233 234 private static String decodeString(String str) 235 throws UnsupportedEncodingException { 236 return URLDecoder.decode(str, "UTF-8"); 237 } 238 239 private String decodeFileName(String fname) 240 throws UnsupportedEncodingException { 241 int version = metadata.getVersion(); 242 if (version == 2 || version == 3){ 243 return decodeString(fname); 244 } 245 return fname; 246 } 247 248 /** 249 * return the top level archive. 250 */ 251 @Override 252 public Path getWorkingDirectory() { 253 return new Path(uri.toString()); 254 } 255 256 /** 257 * Create a har specific auth 258 * har-underlyingfs:port 259 * @param underLyingURI the uri of underlying 260 * filesystem 261 * @return har specific auth 262 */ 263 private String getHarAuth(URI underLyingUri) { 264 String auth = underLyingUri.getScheme() + "-"; 265 if (underLyingUri.getHost() != null) { 266 auth += underLyingUri.getHost() + ":"; 267 if (underLyingUri.getPort() != -1) { 268 auth += underLyingUri.getPort(); 269 } 270 } 271 else { 272 auth += ":"; 273 } 274 return auth; 275 } 276 277 /** 278 * Returns the uri of this filesystem. 279 * The uri is of the form 280 * har://underlyingfsschema-host:port/pathintheunderlyingfs 281 */ 282 @Override 283 public URI getUri() { 284 return this.uri; 285 } 286 287 /** 288 * this method returns the path 289 * inside the har filesystem. 290 * this is relative path inside 291 * the har filesystem. 292 * @param path the fully qualified path in the har filesystem. 293 * @return relative path in the filesystem. 294 */ 295 private Path getPathInHar(Path path) { 296 Path harPath = new Path(path.toUri().getPath()); 297 if (archivePath.compareTo(harPath) == 0) 298 return new Path(Path.SEPARATOR); 299 Path tmp = new Path(harPath.getName()); 300 Path parent = harPath.getParent(); 301 while (!(parent.compareTo(archivePath) == 0)) { 302 if (parent.toString().equals(Path.SEPARATOR)) { 303 tmp = null; 304 break; 305 } 306 tmp = new Path(parent.getName(), tmp); 307 parent = parent.getParent(); 308 } 309 if (tmp != null) 310 tmp = new Path(Path.SEPARATOR, tmp); 311 return tmp; 312 } 313 314 //the relative path of p. basically 315 // getting rid of /. Parsing and doing 316 // string manipulation is not good - so 317 // just use the path api to do it. 
  private Path makeRelative(String initial, Path p) {
    String scheme = this.uri.getScheme();
    String authority = this.uri.getAuthority();
    Path root = new Path(Path.SEPARATOR);
    if (root.compareTo(p) == 0)
      return new Path(scheme, authority, initial);
    Path retPath = new Path(p.getName());
    Path parent = p.getParent();
    for (int i = 0; i < p.depth() - 1; i++) {
      retPath = new Path(parent.getName(), retPath);
      parent = parent.getParent();
    }
    return new Path(new Path(scheme, authority, initial),
        retPath.toString());
  }

  /* this makes a path qualified in the har filesystem
   * (non-Javadoc)
   * @see org.apache.hadoop.fs.FilterFileSystem#makeQualified(
   * org.apache.hadoop.fs.Path)
   */
  @Override
  public Path makeQualified(Path path) {
    // make sure that we just get the
    // path component
    Path fsPath = path;
    if (!path.isAbsolute()) {
      fsPath = new Path(archivePath, path);
    }

    URI tmpURI = fsPath.toUri();
    // change this to a Har uri
    return new Path(uri.getScheme(), harAuth, tmpURI.getPath());
  }

  /**
   * Fix offset and length of block locations.
   * Note that this method modifies the original array.
   * @param locations block locations of the har part file
   * @param start the start of the desired range in the contained file
   * @param len the length of the desired range
   * @param fileOffsetInHar the offset of the desired file in the har part file
   * @return block locations with fixed offset and length
   */
  static BlockLocation[] fixBlockLocations(BlockLocation[] locations,
                                           long start,
                                           long len,
                                           long fileOffsetInHar) {
    // offset 1 past last byte of desired range
    long end = start + len;

    for (BlockLocation location : locations) {
      // offset of part block relative to beginning of desired file
      // (may be negative if file starts in this part block)
      long harBlockStart = location.getOffset() - fileOffsetInHar;
      // offset 1 past last byte of har block relative to beginning of
      // desired file
      long harBlockEnd = harBlockStart + location.getLength();

      if (start > harBlockStart) {
        // desired range starts after beginning of this har block
        // fix offset to beginning of relevant range (relative to desired file)
        location.setOffset(start);
        // fix length to relevant portion of har block
        location.setLength(location.getLength() - (start - harBlockStart));
      } else {
        // desired range includes beginning of this har block
        location.setOffset(harBlockStart);
      }

      if (harBlockEnd > end) {
        // range ends before end of this har block
        // fix length to remove irrelevant portion at the end
        location.setLength(location.getLength() - (harBlockEnd - end));
      }
    }

    return locations;
  }

  /**
   * Get block locations from the underlying fs and fix their
   * offsets and lengths.
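   * For example (illustrative numbers only): if the contained file starts at
   * offset 1000 of its part file and a part-file block covers bytes
   * [512, 1536), then a request for range [0, 300) of the contained file
   * causes fixBlockLocations to rewrite that block to offset 0 and length
   * 300, relative to the contained file.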
   * @param file the input file status to get block locations for
   * @param start the start of the desired range in the contained file
   * @param len the length of the desired range
   * @return block locations for this segment of the file
   * @throws IOException
   */
  @Override
  public BlockLocation[] getFileBlockLocations(FileStatus file, long start,
      long len) throws IOException {
    HarStatus hstatus = getFileHarStatus(file.getPath());
    Path partPath = new Path(archivePath, hstatus.getPartName());
    FileStatus partStatus = metadata.getPartFileStatus(partPath);

    // get all part blocks that overlap with the desired file blocks
    BlockLocation[] locations =
        fs.getFileBlockLocations(partStatus,
            hstatus.getStartIndex() + start, len);

    return fixBlockLocations(locations, start, len, hstatus.getStartIndex());
  }

  /**
   * the hash of the path p inside
   * the filesystem
   * @param p the path in the har filesystem
   * @return the hash code of the path.
   */
  public static int getHarHash(Path p) {
    return (p.toString().hashCode() & 0x7fffffff);
  }

  static class Store {
    public Store() {
      begin = end = startHash = endHash = 0;
    }

    public Store(long begin, long end, int startHash, int endHash) {
      this.begin = begin;
      this.end = end;
      this.startHash = startHash;
      this.endHash = endHash;
    }

    public long begin;
    public long end;
    public int startHash;
    public int endHash;
  }

  /**
   * Get the file statuses of all the children of a given directory. This just
   * reads through the index file line by line to get all statuses for the
   * children of a directory. It is a brute-force way of getting these
   * file statuses.
   *
   * @param parent
   *          the parent path directory
   * @param statuses
   *          the list to add the children file statuses to
   * @param children
   *          the string list of children for this parent
   */
  private void fileStatusesInIndex(HarStatus parent, List<FileStatus> statuses,
      List<String> children) throws IOException {
    String parentString = parent.getName();
    if (!parentString.endsWith(Path.SEPARATOR)) {
      parentString += Path.SEPARATOR;
    }
    Path harPath = new Path(parentString);
    int harlen = harPath.depth();
    final Map<String, FileStatus> cache = new TreeMap<String, FileStatus>();

    for (HarStatus hstatus : metadata.archive.values()) {
      String child = hstatus.getName();
      if ((child.startsWith(parentString))) {
        Path thisPath = new Path(child);
        if (thisPath.depth() == harlen + 1) {
          statuses.add(toFileStatus(hstatus, cache));
        }
      }
    }
  }

  /**
   * Combine the status stored in the index and the underlying status.
   * @param h status stored in the index
   * @param cache caching the underlying file statuses
   * @return the combined file status
   * @throws IOException
   */
  private FileStatus toFileStatus(HarStatus h,
      Map<String, FileStatus> cache) throws IOException {
    FileStatus underlying = null;
    if (cache != null) {
      underlying = cache.get(h.partName);
    }
    if (underlying == null) {
      final Path p = h.isDir ?
          archivePath : new Path(archivePath, h.partName);
      underlying = fs.getFileStatus(p);
      if (cache != null) {
        cache.put(h.partName, underlying);
      }
    }

    long modTime = 0;
    int version = metadata.getVersion();
    if (version < 3) {
      modTime = underlying.getModificationTime();
    } else if (version == 3) {
      modTime = h.getModificationTime();
    }

    return new FileStatus(
        h.isDir() ? 0L : h.getLength(),
        h.isDir(),
        underlying.getReplication(),
        underlying.getBlockSize(),
        modTime,
        underlying.getAccessTime(),
        underlying.getPermission(),
        underlying.getOwner(),
        underlying.getGroup(),
        makeRelative(this.uri.getPath(), new Path(h.name)));
  }

  // a single line parser for hadoop archives status
  // stored in a single line in the index files
  // the format is of the form
  // filename "dir"/"file" partFileName startIndex length
  // <space separated children>
  private class HarStatus {
    boolean isDir;
    String name;
    List<String> children;
    String partName;
    long startIndex;
    long length;
    long modificationTime = 0;

    public HarStatus(String harString) throws UnsupportedEncodingException {
      String[] splits = harString.split(" ");
      this.name = decodeFileName(splits[0]);
      this.isDir = "dir".equals(splits[1]) ? true : false;
      // this is equal to "none" if it's a directory
      this.partName = splits[2];
      this.startIndex = Long.parseLong(splits[3]);
      this.length = Long.parseLong(splits[4]);

      int version = metadata.getVersion();
      String[] propSplits = null;
      // propSplits is used to retrieve the metainformation that Har versions
      // 1 & 2 missed (modification time, permission, owner group).
      // These fields are stored in an encoded string placed in different
      // locations depending on whether it's a file or directory entry.
      // If it's a directory, the string will be placed at the partName
      // location (directories have no partName because they don't have data
      // to be stored). This is done because the number of fields in a
      // directory entry is unbounded (all children are listed at the end).
      // If it's a file, the string will be the last field.
      if (isDir) {
        if (version == 3) {
          propSplits = decodeString(this.partName).split(" ");
        }
        children = new ArrayList<String>();
        for (int i = 5; i < splits.length; i++) {
          children.add(decodeFileName(splits[i]));
        }
      } else if (version == 3) {
        propSplits = decodeString(splits[5]).split(" ");
      }

      if (propSplits != null && propSplits.length >= 4) {
        modificationTime = Long.parseLong(propSplits[0]);
        // the fields below are stored in the file but are currently not used
        // by HarFileSystem
        // permission = new FsPermission(Short.parseShort(propSplits[1]));
        // owner = decodeString(propSplits[2]);
        // group = decodeString(propSplits[3]);
      }
    }

    public boolean isDir() {
      return isDir;
    }

    public String getName() {
      return name;
    }

    public List<String> getChildren() {
      return children;
    }

    public String getFileName() {
      return name;
    }

    public String getPartName() {
      return partName;
    }

    public long getStartIndex() {
      return startIndex;
    }

    public long getLength() {
      return length;
    }

    public long getModificationTime() {
      return modificationTime;
    }
  }

  /**
   * return the filestatus of files in a har archive.
   * The permissions returned are those of the archive
   * index files. The permissions are not persisted
   * while creating a hadoop archive.
   * @param f the path in the har filesystem
   * @return filestatus.
   * @throws IOException
   */
  @Override
  public FileStatus getFileStatus(Path f) throws IOException {
    HarStatus hstatus = getFileHarStatus(f);
    return toFileStatus(hstatus, null);
  }

  private HarStatus getFileHarStatus(Path f) throws IOException {
    // get the fs DataInputStream for the underlying file
    // look up the index.
    Path p = makeQualified(f);
    Path harPath = getPathInHar(p);
    if (harPath == null) {
      throw new IOException("Invalid file name: " + f + " in " + uri);
    }
    HarStatus hstatus = metadata.archive.get(harPath);
    if (hstatus == null) {
      throw new FileNotFoundException("File: " + f + " does not exist in " + uri);
    }
    return hstatus;
  }

  /**
   * @return null since no checksum algorithm is implemented.
   */
  @Override
  public FileChecksum getFileChecksum(Path f) {
    return null;
  }

  /**
   * Returns a har input stream which fakes end of
   * file. It reads the index files to get the part
   * file name and the size and start of the file.
   */
  @Override
  public FSDataInputStream open(Path f, int bufferSize) throws IOException {
    // get the fs DataInputStream for the underlying file
    HarStatus hstatus = getFileHarStatus(f);
    // we got it.. woo hooo!!!
    if (hstatus.isDir()) {
      throw new FileNotFoundException(f + " : not a file in " +
          archivePath);
    }
    return new HarFSDataInputStream(fs, new Path(archivePath,
        hstatus.getPartName()),
        hstatus.getStartIndex(), hstatus.getLength(), bufferSize);
  }

  /*
   * create throws an exception in the Har filesystem.
   * The archive, once created, cannot be changed.
   */
  public FSDataOutputStream create(Path f, int bufferSize)
      throws IOException {
    throw new IOException("Har: Create not allowed");
  }

  @Override
  public FSDataOutputStream create(Path f,
      FsPermission permission,
      boolean overwrite,
      int bufferSize,
      short replication,
      long blockSize,
      Progressable progress) throws IOException {
    throw new IOException("Har: create not allowed.");
  }

  @Override
  public void close() throws IOException {
    if (fs != null) {
      try {
        fs.close();
      } catch (IOException ie) {
        // this might already be closed
        // ignore
      }
    }
  }

  /**
   * Not implemented.
   */
  @Override
  public boolean setReplication(Path src, short replication) throws IOException {
    throw new IOException("Har: setreplication not allowed");
  }

  /**
   * Not implemented.
   */
  @Override
  public boolean delete(Path f, boolean recursive) throws IOException {
    throw new IOException("Har: delete not allowed");
  }

  /**
   * listStatus returns the children of a directory
   * after looking up the index files.
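   * For example, listing the root of a hypothetical archive
   * har:///user/foo/sample.har returns one FileStatus per immediate child
   * recorded in the _index file; deeper descendants are filtered out by
   * comparing path depths.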
   */
  @Override
  public FileStatus[] listStatus(Path f) throws IOException {
    // need to see if the file is an index in file
    // get the filestatus of the archive directory
    // we will create fake filestatuses to return
    // to the client
    List<FileStatus> statuses = new ArrayList<FileStatus>();
    Path tmpPath = makeQualified(f);
    Path harPath = getPathInHar(tmpPath);
    HarStatus hstatus = metadata.archive.get(harPath);
    if (hstatus == null) {
      throw new FileNotFoundException("File " + f + " not found in " + archivePath);
    }
    if (hstatus.isDir()) {
      fileStatusesInIndex(hstatus, statuses, hstatus.children);
    } else {
      statuses.add(toFileStatus(hstatus, null));
    }

    return statuses.toArray(new FileStatus[statuses.size()]);
  }

  /**
   * return the top level archive path.
   */
  @Override
  public Path getHomeDirectory() {
    return new Path(uri.toString());
  }

  @Override
  public void setWorkingDirectory(Path newDir) {
    // does nothing.
  }

  /**
   * Not implemented.
   */
  @Override
  public boolean mkdirs(Path f, FsPermission permission) throws IOException {
    throw new IOException("Har: mkdirs not allowed");
  }

  /**
   * Not implemented.
   */
  @Override
  public void copyFromLocalFile(boolean delSrc, Path src, Path dst)
      throws IOException {
    throw new IOException("Har: copyfromlocalfile not allowed");
  }

  /**
   * Copies the file in the har filesystem to a local file.
   */
  @Override
  public void copyToLocalFile(boolean delSrc, Path src, Path dst)
      throws IOException {
    FileUtil.copy(this, src, getLocal(getConf()), dst, false, getConf());
  }

  /**
   * Not implemented.
   */
  @Override
  public Path startLocalOutput(Path fsOutputFile, Path tmpLocalFile)
      throws IOException {
    throw new IOException("Har: startLocalOutput not allowed");
  }

  /**
   * Not implemented.
   */
  @Override
  public void completeLocalOutput(Path fsOutputFile, Path tmpLocalFile)
      throws IOException {
    throw new IOException("Har: completeLocalOutput not allowed");
  }

  /**
   * Not implemented.
   */
  @Override
  public void setOwner(Path p, String username, String groupname)
      throws IOException {
    throw new IOException("Har: setowner not allowed");
  }

  /**
   * Not implemented.
   */
  @Override
  public void setPermission(Path p, FsPermission permission)
      throws IOException {
    throw new IOException("Har: setPermission not allowed");
  }

  /**
   * Hadoop archives input stream. This input stream fakes EOF
   * since archive files are part of bigger part files.
   */
  private static class HarFSDataInputStream extends FSDataInputStream {
    /**
     * Create an input stream that fakes all the reads/positions/seeking.
     */
    private static class HarFsInputStream extends FSInputStream {
      private long position, start, end;
      // The underlying data input stream that the
      // underlying filesystem will return.
      private FSDataInputStream underLyingStream;
      // one byte buffer
      private byte[] oneBytebuff = new byte[1];

      HarFsInputStream(FileSystem fs, Path path, long start,
          long length, int bufferSize) throws IOException {
        underLyingStream = fs.open(path, bufferSize);
        underLyingStream.seek(start);
        // the start of this file in the part file
        this.start = start;
        // the position pointer in the part file
        this.position = start;
        // the end pointer in the part file
        this.end = start + length;
      }

      @Override
      public synchronized int available() throws IOException {
        long remaining = end - underLyingStream.getPos();
        if (remaining > (long) Integer.MAX_VALUE) {
          return Integer.MAX_VALUE;
        }
        return (int) remaining;
      }

      @Override
      public synchronized void close() throws IOException {
        underLyingStream.close();
        super.close();
      }

      // not implemented
      @Override
      public void mark(int readLimit) {
        // do nothing
      }

      /**
       * reset is not implemented
       */
      @Override
      public void reset() throws IOException {
        throw new IOException("reset not implemented.");
      }

      @Override
      public synchronized int read() throws IOException {
        int ret = read(oneBytebuff, 0, 1);
        return (ret <= 0) ? -1 : (oneBytebuff[0] & 0xff);
      }

      @Override
      public synchronized int read(byte[] b) throws IOException {
        // the three-argument read below already advances position,
        // so it is not adjusted again here
        return read(b, 0, b.length);
      }

      /**
       * Read at most len bytes, never reading past the faked end of file.
       */
      @Override
      public synchronized int read(byte[] b, int offset, int len)
          throws IOException {
        int newlen = len;
        int ret = -1;
        if (position + len > end) {
          newlen = (int) (end - position);
        }
        // end case
        if (newlen == 0)
          return ret;
        ret = underLyingStream.read(b, offset, newlen);
        // only advance the position for bytes actually read
        if (ret > 0) {
          position += ret;
        }
        return ret;
      }

      @Override
      public synchronized long skip(long n) throws IOException {
        long tmpN = n;
        if (tmpN > 0) {
          if (position + tmpN > end) {
            tmpN = end - position;
          }
          underLyingStream.seek(tmpN + position);
          position += tmpN;
          return tmpN;
        }
        return (tmpN < 0) ? -1 : 0;
      }

      @Override
      public synchronized long getPos() throws IOException {
        return (position - start);
      }

      @Override
      public synchronized void seek(long pos) throws IOException {
        if (pos < 0 || (start + pos > end)) {
          throw new IOException("Failed to seek: EOF");
        }
        position = start + pos;
        underLyingStream.seek(position);
      }

      @Override
      public boolean seekToNewSource(long targetPos) throws IOException {
        // do not need to implement this
        // hdfs in itself does seekToNewSource
        // while reading.
        return false;
      }

      /**
       * implementing position readable.
       */
      @Override
      public int read(long pos, byte[] b, int offset, int length)
          throws IOException {
        int nlength = length;
        if (start + nlength + pos > end) {
          nlength = (int) (end - (start + pos));
        }
        return underLyingStream.read(pos + start, b, offset, nlength);
      }

      /**
       * position readable again.
       */
      @Override
      public void readFully(long pos, byte[] b, int offset, int length)
          throws IOException {
        if (start + length + pos > end) {
          throw new IOException("Not enough bytes to read.");
        }
        underLyingStream.readFully(pos + start, b, offset, length);
      }

      @Override
      public void readFully(long pos, byte[] b) throws IOException {
        readFully(pos, b, 0, b.length);
      }

    }

    /**
     * constructor for har input stream.
     * @param fs the underlying filesystem
     * @param p the path in the underlying filesystem
     * @param start the start position in the part file
     * @param length the length of valid data in the part file
     * @param bufsize the buffer size
     * @throws IOException
     */
    public HarFSDataInputStream(FileSystem fs, Path p, long start,
        long length, int bufsize) throws IOException {
      super(new HarFsInputStream(fs, p, start, length, bufsize));
    }

    /**
     * constructor for har input stream.
     * @param fs the underlying filesystem
     * @param p the path in the underlying file system
     * @param start the start position in the part file
     * @param length the length of valid data in the part file.
     * @throws IOException
     */
    public HarFSDataInputStream(FileSystem fs, Path p, long start, long length)
        throws IOException {
      super(new HarFsInputStream(fs, p, start, length, 0));
    }
  }

  private class HarMetaData {
    private FileSystem fs;
    private int version;
    // the masterIndex of the archive
    private Path masterIndexPath;
    // the index file
    private Path archiveIndexPath;

    private long masterIndexTimestamp;
    private long archiveIndexTimestamp;

    List<Store> stores = new ArrayList<Store>();
    Map<Path, HarStatus> archive = new HashMap<Path, HarStatus>();
    private Map<Path, FileStatus> partFileStatuses = new HashMap<Path, FileStatus>();

    public HarMetaData(FileSystem fs, Path masterIndexPath, Path archiveIndexPath) {
      this.fs = fs;
      this.masterIndexPath = masterIndexPath;
      this.archiveIndexPath = archiveIndexPath;
    }

    public FileStatus getPartFileStatus(Path partPath) throws IOException {
      FileStatus status;
      status = partFileStatuses.get(partPath);
      if (status == null) {
        status = fs.getFileStatus(partPath);
        partFileStatuses.put(partPath, status);
      }
      return status;
    }

    public long getMasterIndexTimestamp() {
      return masterIndexTimestamp;
    }

    public long getArchiveIndexTimestamp() {
      return archiveIndexTimestamp;
    }

    private int getVersion() {
      return version;
    }

    private void parseMetaData() throws IOException {
      FSDataInputStream in = fs.open(masterIndexPath);
      FileStatus masterStat = fs.getFileStatus(masterIndexPath);
      masterIndexTimestamp = masterStat.getModificationTime();
      LineReader lin = new LineReader(in, getConf());
      Text line = new Text();
      long read = lin.readLine(line);

      // the first line contains the version of the index file
      String versionLine = line.toString();
      String[] arr = versionLine.split(" ");
      version = Integer.parseInt(arr[0]);
      // make it always backwards-compatible
      if (this.version > HarFileSystem.VERSION) {
        throw new IOException("Invalid version " +
            this.version + " expected " + HarFileSystem.VERSION);
      }

      // each line contains a hashcode range and the index file name
      String[] readStr = null;
      while (read < masterStat.getLen()) {
        int b = lin.readLine(line);
        read += b;
        readStr = line.toString().split(" ");
        int startHash = Integer.parseInt(readStr[0]);
        int endHash = Integer.parseInt(readStr[1]);
        stores.add(new Store(Long.parseLong(readStr[2]),
            Long.parseLong(readStr[3]), startHash,
            endHash));
        line.clear();
      }
      try {
        // close the master index
        lin.close();
      } catch (IOException io) {
        // do nothing, just a read.
      }

      FSDataInputStream aIn = fs.open(archiveIndexPath);
      FileStatus archiveStat = fs.getFileStatus(archiveIndexPath);
      archiveIndexTimestamp = archiveStat.getModificationTime();
      LineReader aLin;

      // now start reading the real index file
      for (Store s : stores) {
        read = 0;
        aIn.seek(s.begin);
        aLin = new LineReader(aIn, getConf());
        while (read + s.begin < s.end) {
          int tmp = aLin.readLine(line);
          read += tmp;
          String lineFeed = line.toString();
          String[] parsed = lineFeed.split(" ");
          parsed[0] = decodeFileName(parsed[0]);
          archive.put(new Path(parsed[0]), new HarStatus(lineFeed));
          line.clear();
        }
      }
      try {
        // close the archive index
        aIn.close();
      } catch (IOException io) {
        // do nothing, just a read.
      }
    }
  }
}