/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.fs;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.EnumSet;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.HashMap;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.LineReader;
import org.apache.hadoop.util.Progressable;

/**
 * This is an implementation of the Hadoop Archive
 * Filesystem. This archive Filesystem has index files
 * of the form _index* and has contents of the form
 * part-*. The index files store the indexes of the
 * real files. The index files are of the form _masterindex
 * and _index. The master index is a level of indirection
 * into the index file to make the lookups faster. The index
 * file is sorted by the hash code of the paths that it contains
 * and the master index contains pointers to the positions in
 * the index for ranges of hashcodes.
 */

public class HarFileSystem extends FilterFileSystem {
  public static final int VERSION = 3;

  private static final Map<URI, HarMetaData> harMetaCache =
      new HashMap<URI, HarMetaData>();

  // uri representation of this Har filesystem
  private URI uri;
  // the top level path of the archive
  // in the underlying file system
  private Path archivePath;
  // the har auth
  private String harAuth;

  // pointer into the static metadata cache
  private HarMetaData metadata;

  /**
   * Public no-argument constructor for the Har filesystem.
   */
  public HarFileSystem() {
  }

  /**
   * Return the protocol scheme for the FileSystem.
   * <p/>
   *
   * @return <code>har</code>
   */
  @Override
  public String getScheme() {
    return "har";
  }

  /**
   * Constructor to create a HarFileSystem with an
   * underlying filesystem.
   * @param fs the underlying filesystem
   */
  public HarFileSystem(FileSystem fs) {
    super(fs);
  }

  /**
   * Initialize a Har filesystem per har archive. The
   * archive home directory is the top level directory
   * in the filesystem that contains the HAR archive.
   * Be careful with this method, you do not want to go
   * on creating new Filesystem instances per call to
   * path.getFileSystem().
   * The uri of Har is
   * har://underlyingfsscheme-host:port/archivepath
   * or
   * har:///archivepath.
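   * For example (illustrative URIs, not taken from any particular cluster):
   * <pre>
   *   har://hdfs-namenode:8020/user/joe/foo.har/dir/file
   *   har:///user/joe/foo.har/dir/file
   * </pre>
   * A typical way to obtain an instance is through the generic FileSystem
   * API rather than by calling this method directly; a minimal sketch,
   * assuming an archive at the illustrative path /user/joe/foo.har:
   * <pre>
   *   Configuration conf = new Configuration();
   *   FileSystem harFs =
   *       FileSystem.get(URI.create("har:///user/joe/foo.har"), conf);
   *   FileStatus[] top = harFs.listStatus(new Path("har:///user/joe/foo.har"));
   * </pre>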
   * The latter form assumes the underlying filesystem configured as the
   * default when none is specified.
   */
  public void initialize(URI name, Configuration conf) throws IOException {
    // decode the name
    URI underLyingURI = decodeHarURI(name, conf);
    // we got the right har Path - now check if this is
    // truly a har filesystem
    Path harPath = archivePath(
        new Path(name.getScheme(), name.getAuthority(), name.getPath()));
    if (harPath == null) {
      throw new IOException("Invalid path for the Har Filesystem. " +
          name.toString());
    }
    if (fs == null) {
      fs = FileSystem.get(underLyingURI, conf);
    }
    uri = harPath.toUri();
    archivePath = new Path(uri.getPath());
    harAuth = getHarAuth(underLyingURI);
    // check for the underlying fs containing
    // the index file
    Path masterIndexPath = new Path(archivePath, "_masterindex");
    Path archiveIndexPath = new Path(archivePath, "_index");
    if (!fs.exists(masterIndexPath) || !fs.exists(archiveIndexPath)) {
      throw new IOException("Invalid path for the Har Filesystem. " +
          "No index file in " + harPath);
    }

    metadata = harMetaCache.get(uri);
    if (metadata != null) {
      FileStatus mStat = fs.getFileStatus(masterIndexPath);
      FileStatus aStat = fs.getFileStatus(archiveIndexPath);
      if (mStat.getModificationTime() != metadata.getMasterIndexTimestamp() ||
          aStat.getModificationTime() != metadata.getArchiveIndexTimestamp()) {
        // the archive has been overwritten since we last read it
        // remove the entry from the metadata cache
        metadata = null;
        harMetaCache.remove(uri);
      }
    }
    if (metadata == null) {
      metadata = new HarMetaData(fs, masterIndexPath, archiveIndexPath);
      metadata.parseMetaData();
      harMetaCache.put(uri, metadata);
    }
  }

  // get the version of the filesystem from the masterindex file
  // the version is currently not useful since it's the first version
  // of archives
  public int getHarVersion() throws IOException {
    if (metadata != null) {
      return metadata.getVersion();
    } else {
      throw new IOException("Invalid meta data for the Har Filesystem");
    }
  }

  /*
   * find the parent path that is the
   * archive path in the path. The last
   * path segment that ends with .har is
   * the path that will be returned.
   */
  private Path archivePath(Path p) {
    Path retPath = null;
    Path tmp = p;
    for (int i = 0; i < p.depth(); i++) {
      if (tmp.toString().endsWith(".har")) {
        retPath = tmp;
        break;
      }
      tmp = tmp.getParent();
    }
    return retPath;
  }

  /**
   * decode the raw URI to get the underlying URI
   * @param rawURI raw Har URI
   * @param conf the configuration of this filesystem
   * @return filtered URI of the underlying fileSystem
   */
  private URI decodeHarURI(URI rawURI, Configuration conf) throws IOException {
    String tmpAuth = rawURI.getAuthority();
    // we are using the default file
    // system in the config
    // so create an underlying uri and
    // return it
    if (tmpAuth == null) {
      // create a path
      return FileSystem.getDefaultUri(conf);
    }
    String host = rawURI.getHost();
    if (host == null) {
      throw new IOException("URI: " + rawURI
          + " is an invalid Har URI since host==null."
          + " Expecting har://<scheme>-<host>/<path>.");
    }
    int i = host.indexOf('-');
    if (i < 0) {
      throw new IOException("URI: " + rawURI
          + " is an invalid Har URI since '-' not found."
209 + " Expecting har://<scheme>-<host>/<path>."); 210 } 211 final String underLyingScheme = host.substring(0, i); 212 i++; 213 final String underLyingHost = i == host.length()? null: host.substring(i); 214 int underLyingPort = rawURI.getPort(); 215 String auth = (underLyingHost == null && underLyingPort == -1)? 216 null:(underLyingHost+ 217 (underLyingPort == -1 ? "" : ":"+underLyingPort)); 218 URI tmp = null; 219 if (rawURI.getQuery() != null) { 220 // query component not allowed 221 throw new IOException("query component in Path not supported " + rawURI); 222 } 223 try { 224 tmp = new URI(underLyingScheme, auth, rawURI.getPath(), 225 rawURI.getQuery(), rawURI.getFragment()); 226 } catch (URISyntaxException e) { 227 // do nothing should not happen 228 } 229 return tmp; 230 } 231 232 private static String decodeString(String str) 233 throws UnsupportedEncodingException { 234 return URLDecoder.decode(str, "UTF-8"); 235 } 236 237 private String decodeFileName(String fname) 238 throws UnsupportedEncodingException { 239 int version = metadata.getVersion(); 240 if (version == 2 || version == 3){ 241 return decodeString(fname); 242 } 243 return fname; 244 } 245 246 /** 247 * return the top level archive. 248 */ 249 public Path getWorkingDirectory() { 250 return new Path(uri.toString()); 251 } 252 253 /** 254 * Create a har specific auth 255 * har-underlyingfs:port 256 * @param underLyingURI the uri of underlying 257 * filesystem 258 * @return har specific auth 259 */ 260 private String getHarAuth(URI underLyingUri) { 261 String auth = underLyingUri.getScheme() + "-"; 262 if (underLyingUri.getHost() != null) { 263 auth += underLyingUri.getHost() + ":"; 264 if (underLyingUri.getPort() != -1) { 265 auth += underLyingUri.getPort(); 266 } 267 } 268 else { 269 auth += ":"; 270 } 271 return auth; 272 } 273 274 /** 275 * Returns the uri of this filesystem. 276 * The uri is of the form 277 * har://underlyingfsschema-host:port/pathintheunderlyingfs 278 */ 279 @Override 280 public URI getUri() { 281 return this.uri; 282 } 283 284 /** 285 * this method returns the path 286 * inside the har filesystem. 287 * this is relative path inside 288 * the har filesystem. 289 * @param path the fully qualified path in the har filesystem. 290 * @return relative path in the filesystem. 291 */ 292 private Path getPathInHar(Path path) { 293 Path harPath = new Path(path.toUri().getPath()); 294 if (archivePath.compareTo(harPath) == 0) 295 return new Path(Path.SEPARATOR); 296 Path tmp = new Path(harPath.getName()); 297 Path parent = harPath.getParent(); 298 while (!(parent.compareTo(archivePath) == 0)) { 299 if (parent.toString().equals(Path.SEPARATOR)) { 300 tmp = null; 301 break; 302 } 303 tmp = new Path(parent.getName(), tmp); 304 parent = parent.getParent(); 305 } 306 if (tmp != null) 307 tmp = new Path(Path.SEPARATOR, tmp); 308 return tmp; 309 } 310 311 //the relative path of p. basically 312 // getting rid of /. Parsing and doing 313 // string manipulation is not good - so 314 // just use the path api to do it. 
  private Path makeRelative(String initial, Path p) {
    String scheme = this.uri.getScheme();
    String authority = this.uri.getAuthority();
    Path root = new Path(Path.SEPARATOR);
    if (root.compareTo(p) == 0)
      return new Path(scheme, authority, initial);

    Path retPath = new Path(p.getName());
    Path parent = p.getParent();
    for (int i = 0; i < p.depth() - 1; i++) {
      retPath = new Path(parent.getName(), retPath);
      parent = parent.getParent();
    }
    return new Path(new Path(scheme, authority, initial),
        retPath.toString());
  }

  /* this makes a path qualified in the har filesystem
   * (non-Javadoc)
   * @see org.apache.hadoop.fs.FilterFileSystem#makeQualified(
   * org.apache.hadoop.fs.Path)
   */
  @Override
  public Path makeQualified(Path path) {
    // make sure that we just get the
    // path component
    Path fsPath = path;
    if (!path.isAbsolute()) {
      fsPath = new Path(archivePath, path);
    }

    URI tmpURI = fsPath.toUri();
    // change this to Har uri
    return new Path(uri.getScheme(), harAuth, tmpURI.getPath());
  }

  /**
   * Fix offset and length of block locations.
   * Note that this method modifies the original array.
   * @param locations block locations of har part file
   * @param start the start of the desired range in the contained file
   * @param len the length of the desired range
   * @param fileOffsetInHar the offset of the desired file in the har part file
   * @return block locations with fixed offset and length
   */
  static BlockLocation[] fixBlockLocations(BlockLocation[] locations,
                                           long start,
                                           long len,
                                           long fileOffsetInHar) {
    // offset 1 past last byte of desired range
    long end = start + len;

    for (BlockLocation location : locations) {
      // offset of part block relative to beginning of desired file
      // (may be negative if file starts in this part block)
      long harBlockStart = location.getOffset() - fileOffsetInHar;
      // offset 1 past last byte of har block relative to beginning of
      // desired file
      long harBlockEnd = harBlockStart + location.getLength();

      if (start > harBlockStart) {
        // desired range starts after beginning of this har block
        // fix offset to beginning of relevant range (relative to desired file)
        location.setOffset(start);
        // fix length to relevant portion of har block
        location.setLength(location.getLength() - (start - harBlockStart));
      } else {
        // desired range includes beginning of this har block
        location.setOffset(harBlockStart);
      }

      if (harBlockEnd > end) {
        // range ends before end of this har block
        // fix length to remove irrelevant portion at the end
        location.setLength(location.getLength() - (harBlockEnd - end));
      }
    }

    return locations;
  }

  /**
   * Get block locations from the underlying fs and fix their
   * offsets and lengths.
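   * For example (illustrative numbers): if the contained file starts at
   * offset 100 of the part file and one part block covers bytes [64, 192),
   * then a request for range [0, 50) of the file reads part bytes
   * [100, 150), and that block is rewritten to offset 0 and length 50
   * relative to the contained file.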
   * @param file the input filestatus to get block locations
   * @param start the start of the desired range in the contained file
   * @param len the length of the desired range
   * @return block locations for this segment of file
   * @throws IOException
   */
  @Override
  public BlockLocation[] getFileBlockLocations(FileStatus file, long start,
      long len) throws IOException {
    HarStatus hstatus = getFileHarStatus(file.getPath());
    Path partPath = new Path(archivePath, hstatus.getPartName());
    FileStatus partStatus = metadata.getPartFileStatus(partPath);

    // get all part blocks that overlap with the desired file blocks
    BlockLocation[] locations =
        fs.getFileBlockLocations(partStatus,
            hstatus.getStartIndex() + start, len);

    return fixBlockLocations(locations, start, len, hstatus.getStartIndex());
  }

  /**
   * the hash of the path p inside the filesystem
   * @param p the path in the harfilesystem
   * @return the hash code of the path.
   */
  public static int getHarHash(Path p) {
    return (p.toString().hashCode() & 0x7fffffff);
  }

  static class Store {
    public Store() {
      begin = end = startHash = endHash = 0;
    }
    public Store(long begin, long end, int startHash, int endHash) {
      this.begin = begin;
      this.end = end;
      this.startHash = startHash;
      this.endHash = endHash;
    }
    public long begin;
    public long end;
    public int startHash;
    public int endHash;
  }

  /**
   * Get filestatuses of all the children of a given directory. This just reads
   * through the index file, line by line, to get all statuses for the children
   * of a directory. It is a brute force way of getting all such filestatuses.
   *
   * @param parent
   *          the parent path directory
   * @param statuses
   *          the list to add the children filestatuses to
   * @param children
   *          the string list of children for this parent (currently unused)
   */
  private void fileStatusesInIndex(HarStatus parent, List<FileStatus> statuses,
      List<String> children) throws IOException {
    String parentString = parent.getName();
    if (!parentString.endsWith(Path.SEPARATOR)) {
      parentString += Path.SEPARATOR;
    }
    Path harPath = new Path(parentString);
    int harlen = harPath.depth();
    final Map<String, FileStatus> cache = new TreeMap<String, FileStatus>();

    for (HarStatus hstatus : metadata.archive.values()) {
      String child = hstatus.getName();
      if ((child.startsWith(parentString))) {
        Path thisPath = new Path(child);
        if (thisPath.depth() == harlen + 1) {
          statuses.add(toFileStatus(hstatus, cache));
        }
      }
    }
  }

  /**
   * Combine the status stored in the index and the underlying status.
   * @param h status stored in the index
   * @param cache caching the underlying file statuses
   * @return the combined file status
   * @throws IOException
   */
  private FileStatus toFileStatus(HarStatus h,
      Map<String, FileStatus> cache) throws IOException {
    FileStatus underlying = null;
    if (cache != null) {
      underlying = cache.get(h.partName);
    }
    if (underlying == null) {
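      // Directories have no part file of their own, so fall back to the
      // status of the archive directory itself; files use their part file.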
      final Path p = h.isDir ? archivePath : new Path(archivePath, h.partName);
      underlying = fs.getFileStatus(p);
      if (cache != null) {
        cache.put(h.partName, underlying);
      }
    }

    long modTime = 0;
    int version = metadata.getVersion();
    if (version < 3) {
      modTime = underlying.getModificationTime();
    } else if (version == 3) {
      modTime = h.getModificationTime();
    }

    return new FileStatus(
        h.isDir() ? 0L : h.getLength(),
        h.isDir(),
        underlying.getReplication(),
        underlying.getBlockSize(),
        modTime,
        underlying.getAccessTime(),
        underlying.getPermission(),
        underlying.getOwner(),
        underlying.getGroup(),
        makeRelative(this.uri.getPath(), new Path(h.name)));
  }

  // a single line parser for hadoop archives status
  // stored in a single line in the index files
  // the format is of the form
  // filename "dir"/"file" partFileName startIndex length
  // <space separated children>
  private class HarStatus {
    boolean isDir;
    String name;
    List<String> children;
    String partName;
    long startIndex;
    long length;
    long modificationTime = 0;

    public HarStatus(String harString) throws UnsupportedEncodingException {
      String[] splits = harString.split(" ");
      this.name = decodeFileName(splits[0]);
      this.isDir = "dir".equals(splits[1]);
      // this is equal to "none" if it's a directory
      this.partName = splits[2];
      this.startIndex = Long.parseLong(splits[3]);
      this.length = Long.parseLong(splits[4]);

      int version = metadata.getVersion();
      String[] propSplits = null;
      // propSplits is used to retrieve the metainformation that Har versions
      // 1 & 2 missed (modification time, permission, owner group).
      // These fields are stored in an encoded string placed in different
      // locations depending on whether it's a file or directory entry.
      // If it's a directory, the string will be placed at the partName
      // location (directories have no partName because they don't have data
      // to be stored). This is done because the number of fields in a
      // directory entry is unbounded (all children are listed at the end).
      // If it's a file, the string will be the last field.
      if (isDir) {
        if (version == 3) {
          propSplits = decodeString(this.partName).split(" ");
        }
        children = new ArrayList<String>();
        for (int i = 5; i < splits.length; i++) {
          children.add(decodeFileName(splits[i]));
        }
      } else if (version == 3) {
        propSplits = decodeString(splits[5]).split(" ");
      }

      if (propSplits != null && propSplits.length >= 4) {
        modificationTime = Long.parseLong(propSplits[0]);
        // the fields below are stored in the file but are currently not used
        // by HarFileSystem
        // permission = new FsPermission(Short.parseShort(propSplits[1]));
        // owner = decodeString(propSplits[2]);
        // group = decodeString(propSplits[3]);
      }
    }

    public boolean isDir() {
      return isDir;
    }

    public String getName() {
      return name;
    }

    public List<String> getChildren() {
      return children;
    }

    public String getFileName() {
      return name;
    }

    public String getPartName() {
      return partName;
    }

    public long getStartIndex() {
      return startIndex;
    }

    public long getLength() {
      return length;
    }

    public long getModificationTime() {
      return modificationTime;
    }
  }

  /**
   * return the filestatus of files in har archive.
   * The permissions returned are those of the archive
   * index files. The permissions are not persisted
   * while creating a hadoop archive.
   * @param f the path in har filesystem
   * @return filestatus.
   * @throws IOException
   */
  @Override
  public FileStatus getFileStatus(Path f) throws IOException {
    HarStatus hstatus = getFileHarStatus(f);
    return toFileStatus(hstatus, null);
  }

  private HarStatus getFileHarStatus(Path f) throws IOException {
    // get the fs DataInputStream for the underlying file
    // look up the index.
    Path p = makeQualified(f);
    Path harPath = getPathInHar(p);
    if (harPath == null) {
      throw new IOException("Invalid file name: " + f + " in " + uri);
    }
    HarStatus hstatus = metadata.archive.get(harPath);
    if (hstatus == null) {
      throw new FileNotFoundException("File: " + f + " does not exist in " + uri);
    }
    return hstatus;
  }

  /**
   * @return null since no checksum algorithm is implemented.
   */
  public FileChecksum getFileChecksum(Path f) {
    return null;
  }

  /**
   * Returns a har input stream which fakes end of
   * file. It reads the index files to get the part
   * file name and the size and start of the file.
   */
  @Override
  public FSDataInputStream open(Path f, int bufferSize) throws IOException {
    // get the fs DataInputStream for the underlying file
    HarStatus hstatus = getFileHarStatus(f);
    // we got it.. woo hooo!!!
    if (hstatus.isDir()) {
      throw new FileNotFoundException(f + " : not a file in " +
          archivePath);
    }
    return new HarFSDataInputStream(fs, new Path(archivePath,
        hstatus.getPartName()),
        hstatus.getStartIndex(), hstatus.getLength(), bufferSize);
  }

  /*
   * create throws an exception in Har filesystem.
   * The archive once created cannot be changed.
   */
  public FSDataOutputStream create(Path f, int bufferSize)
      throws IOException {
    throw new IOException("Har: Create not allowed");
  }

  public FSDataOutputStream create(Path f,
      FsPermission permission,
      boolean overwrite,
      int bufferSize,
      short replication,
      long blockSize,
      Progressable progress) throws IOException {
    throw new IOException("Har: create not allowed.");
  }

  @Override
  public void close() throws IOException {
    if (fs != null) {
      try {
        fs.close();
      } catch (IOException ie) {
        // this might already be closed
        // ignore
      }
    }
  }

  /**
   * Not implemented.
   */
  @Override
  public boolean setReplication(Path src, short replication) throws IOException {
    throw new IOException("Har: setreplication not allowed");
  }

  /**
   * Not implemented.
   */
  @Override
  public boolean delete(Path f, boolean recursive) throws IOException {
    throw new IOException("Har: delete not allowed");
  }

  /**
   * liststatus returns the children of a directory
   * after looking up the index files.
   */
  @Override
  public FileStatus[] listStatus(Path f) throws IOException {
    // need to see if the file is an index in file
    // get the filestatus of the archive directory
    // we will create fake filestatuses to return
    // to the client
    List<FileStatus> statuses = new ArrayList<FileStatus>();
    Path tmpPath = makeQualified(f);
    Path harPath = getPathInHar(tmpPath);
    HarStatus hstatus = metadata.archive.get(harPath);
    if (hstatus == null) {
      throw new FileNotFoundException("File " + f + " not found in " + archivePath);
    }
    if (hstatus.isDir()) {
      fileStatusesInIndex(hstatus, statuses, hstatus.children);
    } else {
      statuses.add(toFileStatus(hstatus, null));
    }

    return statuses.toArray(new FileStatus[statuses.size()]);
  }

  /**
   * return the top level archive path.
   */
  public Path getHomeDirectory() {
    return new Path(uri.toString());
  }

  public void setWorkingDirectory(Path newDir) {
    // does nothing.
  }

  /**
   * not implemented.
   */
  public boolean mkdirs(Path f, FsPermission permission) throws IOException {
    throw new IOException("Har: mkdirs not allowed");
  }

  /**
   * not implemented.
   */
  public void copyFromLocalFile(boolean delSrc, Path src, Path dst) throws
      IOException {
    throw new IOException("Har: copyfromlocalfile not allowed");
  }

  /**
   * copies the file in the har filesystem to a local file.
   */
  public void copyToLocalFile(boolean delSrc, Path src, Path dst)
      throws IOException {
    FileUtil.copy(this, src, getLocal(getConf()), dst, false, getConf());
  }

  /**
   * not implemented.
   */
  public Path startLocalOutput(Path fsOutputFile, Path tmpLocalFile)
      throws IOException {
    throw new IOException("Har: startLocalOutput not allowed");
  }

  /**
   * not implemented.
   */
  public void completeLocalOutput(Path fsOutputFile, Path tmpLocalFile)
      throws IOException {
    throw new IOException("Har: completeLocalOutput not allowed");
  }

  /**
   * not implemented.
   */
  public void setOwner(Path p, String username, String groupname)
      throws IOException {
    throw new IOException("Har: setowner not allowed");
  }

  /**
   * Not implemented.
   */
  public void setPermission(Path p, FsPermission permission)
      throws IOException {
    throw new IOException("Har: setPermission not allowed");
  }

  /**
   * Hadoop archives input stream. This input stream fakes EOF
   * since archive files are part of bigger part files.
   */
  private static class HarFSDataInputStream extends FSDataInputStream {
    /**
     * Create an input stream that fakes all the reads/positions/seeking.
     */
    private static class HarFsInputStream extends FSInputStream {
      private long position, start, end;
      // The underlying data input stream that the
      // underlying filesystem will return.
      private FSDataInputStream underLyingStream;
      // one byte buffer
      private byte[] oneBytebuff = new byte[1];

      HarFsInputStream(FileSystem fs, Path path, long start,
          long length, int bufferSize) throws IOException {
        underLyingStream = fs.open(path, bufferSize);
        underLyingStream.seek(start);
        // the start of this file in the part file
        this.start = start;
        // the position pointer in the part file
        this.position = start;
        // the end pointer in the part file
        this.end = start + length;
      }

      public synchronized int available() throws IOException {
        long remaining = end - underLyingStream.getPos();
        if (remaining > (long) Integer.MAX_VALUE) {
          return Integer.MAX_VALUE;
        }
        return (int) remaining;
      }

      public synchronized void close() throws IOException {
        underLyingStream.close();
        super.close();
      }

      // not implemented
      @Override
      public void mark(int readLimit) {
        // do nothing
      }

      /**
       * reset is not implemented
       */
      public void reset() throws IOException {
        throw new IOException("reset not implemented.");
      }

      public synchronized int read() throws IOException {
        int ret = read(oneBytebuff, 0, 1);
        return (ret <= 0) ? -1 : (oneBytebuff[0] & 0xff);
      }

      public synchronized int read(byte[] b) throws IOException {
        // the three-argument read below already advances position,
        // so it must not be advanced a second time here
        return read(b, 0, b.length);
      }

      /**
       * Read up to len bytes, but never past the faked end of file.
       */
      public synchronized int read(byte[] b, int offset, int len)
          throws IOException {
        int newlen = len;
        int ret = -1;
        if (position + len > end) {
          newlen = (int) (end - position);
        }
        // end case
        if (newlen == 0)
          return ret;
        ret = underLyingStream.read(b, offset, newlen);
        // only advance the position for bytes actually read
        if (ret > 0) {
          position += ret;
        }
        return ret;
      }

      public synchronized long skip(long n) throws IOException {
        long tmpN = n;
        if (tmpN > 0) {
          if (position + tmpN > end) {
            tmpN = end - position;
          }
          underLyingStream.seek(tmpN + position);
          position += tmpN;
          return tmpN;
        }
        return (tmpN < 0) ? -1 : 0;
      }

      public synchronized long getPos() throws IOException {
        return (position - start);
      }

      public synchronized void seek(long pos) throws IOException {
        if (pos < 0 || (start + pos > end)) {
          throw new IOException("Failed to seek: EOF");
        }
        position = start + pos;
        underLyingStream.seek(position);
      }

      public boolean seekToNewSource(long targetPos) throws IOException {
        // do not need to implement this
        // hdfs in itself does seektonewsource
        // while reading.
        return false;
      }

      /**
       * implementing position readable.
       */
      public int read(long pos, byte[] b, int offset, int length)
          throws IOException {
        int nlength = length;
        if (start + nlength + pos > end) {
          nlength = (int) (end - (start + pos));
        }
        return underLyingStream.read(pos + start, b, offset, nlength);
      }

      /**
       * position readable again.
       */
      public void readFully(long pos, byte[] b, int offset, int length)
          throws IOException {
        if (start + length + pos > end) {
          throw new IOException("Not enough bytes to read.");
        }
        underLyingStream.readFully(pos + start, b, offset, length);
      }

      public void readFully(long pos, byte[] b) throws IOException {
        readFully(pos, b, 0, b.length);
      }

    }

    /**
     * Constructor for har input stream.
     * @param fs the underlying filesystem
     * @param p the path in the underlying filesystem
     * @param start the start position in the part file
     * @param length the length of valid data in the part file
     * @param bufsize the buffer size
     * @throws IOException
     */
    public HarFSDataInputStream(FileSystem fs, Path p, long start,
        long length, int bufsize) throws IOException {
      super(new HarFsInputStream(fs, p, start, length, bufsize));
    }

    /**
     * Constructor for har input stream.
     * @param fs the underlying filesystem
     * @param p the path in the underlying file system
     * @param start the start position in the part file
     * @param length the length of valid data in the part file.
     * @throws IOException
     */
    public HarFSDataInputStream(FileSystem fs, Path p, long start, long length)
        throws IOException {
      super(new HarFsInputStream(fs, p, start, length, 0));
    }
  }

  private class HarMetaData {
    private FileSystem fs;
    private int version;
    // the masterIndex of the archive
    private Path masterIndexPath;
    // the index file
    private Path archiveIndexPath;

    private long masterIndexTimestamp;
    private long archiveIndexTimestamp;

    List<Store> stores = new ArrayList<Store>();
    Map<Path, HarStatus> archive = new HashMap<Path, HarStatus>();
    private Map<Path, FileStatus> partFileStatuses = new HashMap<Path, FileStatus>();

    public HarMetaData(FileSystem fs, Path masterIndexPath, Path archiveIndexPath) {
      this.fs = fs;
      this.masterIndexPath = masterIndexPath;
      this.archiveIndexPath = archiveIndexPath;
    }

    public FileStatus getPartFileStatus(Path partPath) throws IOException {
      FileStatus status;
      status = partFileStatuses.get(partPath);
      if (status == null) {
        status = fs.getFileStatus(partPath);
        partFileStatuses.put(partPath, status);
      }
      return status;
    }

    public long getMasterIndexTimestamp() {
      return masterIndexTimestamp;
    }

    public long getArchiveIndexTimestamp() {
      return archiveIndexTimestamp;
    }

    private int getVersion() {
      return version;
    }

    private void parseMetaData() throws IOException {
      FSDataInputStream in = fs.open(masterIndexPath);
      FileStatus masterStat = fs.getFileStatus(masterIndexPath);
      masterIndexTimestamp = masterStat.getModificationTime();
      LineReader lin = new LineReader(in, getConf());
      Text line = new Text();
      long read = lin.readLine(line);

      // the first line contains the version of the index file
      String versionLine = line.toString();
      String[] arr = versionLine.split(" ");
      version = Integer.parseInt(arr[0]);
      // make it always backwards-compatible
      if (this.version > HarFileSystem.VERSION) {
        throw new IOException("Invalid version " +
            this.version + " expected " + HarFileSystem.VERSION);
      }

      // each line contains a hashcode range and the index file name
      String[] readStr = null;
      while (read < masterStat.getLen()) {
        int b = lin.readLine(line);
        read += b;
        readStr = line.toString().split(" ");
        int startHash = Integer.parseInt(readStr[0]);
        int endHash = Integer.parseInt(readStr[1]);
        stores.add(new Store(Long.parseLong(readStr[2]),
            Long.parseLong(readStr[3]), startHash,
            endHash));
        line.clear();
      }
      try {
        // close the master index
        lin.close();
      } catch (IOException io) {
        // do nothing, just a read.
      }

      FSDataInputStream aIn = fs.open(archiveIndexPath);
      FileStatus archiveStat = fs.getFileStatus(archiveIndexPath);
      archiveIndexTimestamp = archiveStat.getModificationTime();
      LineReader aLin;
      String retStr = null;
      // now start reading the real index file
      for (Store s : stores) {
        read = 0;
        aIn.seek(s.begin);
        aLin = new LineReader(aIn, getConf());
        while (read + s.begin < s.end) {
          int tmp = aLin.readLine(line);
          read += tmp;
          String lineFeed = line.toString();
          String[] parsed = lineFeed.split(" ");
          parsed[0] = decodeFileName(parsed[0]);
          archive.put(new Path(parsed[0]), new HarStatus(lineFeed));
          line.clear();
        }
      }
      try {
        // close the archive index
        aIn.close();
      } catch (IOException io) {
        // do nothing, just a read.
      }
    }
  }
}