/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.fs;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.HashMap;
import java.util.concurrent.ConcurrentHashMap;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.LineReader;
import org.apache.hadoop.util.Progressable;

/**
 * This is an implementation of the Hadoop Archive
 * Filesystem. This archive Filesystem has index files
 * of the form _index* and has contents of the form
 * part-*. The index files store the indexes of the
 * real files. The index files are of the form _masterindex
 * and _index. The master index is a level of indirection
 * into the index file to make lookups faster. The index
 * file is sorted by the hash code of the paths that it contains
 * and the master index contains pointers to the positions in
 * the index for ranges of hash codes.
 */
public class HarFileSystem extends FilterFileSystem {
  public static final int VERSION = 3;

  private static final Map<URI, HarMetaData> harMetaCache =
      new ConcurrentHashMap<URI, HarMetaData>();

  // uri representation of this Har filesystem
  private URI uri;
  // the top level path of the archive
  // in the underlying file system
  private Path archivePath;
  // the har auth
  private String harAuth;

  // pointer into the static metadata cache
  private HarMetaData metadata;

  /**
   * Public no-argument constructor.
   */
  public HarFileSystem() {
  }

  /**
   * Constructor to create a HarFileSystem with an
   * underlying filesystem.
   * @param fs the underlying filesystem
   */
  public HarFileSystem(FileSystem fs) {
    super(fs);
  }

  /**
   * Initialize a Har filesystem per har archive. The
   * archive home directory is the top level directory
   * in the filesystem that contains the HAR archive.
   * Be careful with this method: you do not want to go
   * on creating new filesystem instances per call to
   * path.getFileSystem().
   * The uri of a Har filesystem is
   * har://underlyingfsscheme-host:port/archivepath
   * or
   * har:///archivepath. The latter form assumes the default
   * underlying filesystem when none is specified.
   */
  public void initialize(URI name, Configuration conf) throws IOException {
    // decode the name
    URI underLyingURI = decodeHarURI(name, conf);
    // we got the right har path - now check if this is
    // truly a har filesystem
    Path harPath = archivePath(
        new Path(name.getScheme(), name.getAuthority(), name.getPath()));
    if (harPath == null) {
      throw new IOException("Invalid path for the Har Filesystem. " +
          name.toString());
    }
    if (fs == null) {
      fs = FileSystem.get(underLyingURI, conf);
    }
    uri = harPath.toUri();
    archivePath = new Path(uri.getPath());
    harAuth = getHarAuth(underLyingURI);
    // check that the underlying fs contains
    // the index files
    Path masterIndexPath = new Path(archivePath, "_masterindex");
    Path archiveIndexPath = new Path(archivePath, "_index");
    if (!fs.exists(masterIndexPath) || !fs.exists(archiveIndexPath)) {
      throw new IOException("Invalid path for the Har Filesystem. " +
          "No index file in " + harPath);
    }

    metadata = harMetaCache.get(uri);
    if (metadata != null) {
      FileStatus mStat = fs.getFileStatus(masterIndexPath);
      FileStatus aStat = fs.getFileStatus(archiveIndexPath);
      if (mStat.getModificationTime() != metadata.getMasterIndexTimestamp() ||
          aStat.getModificationTime() != metadata.getArchiveIndexTimestamp()) {
        // the archive has been overwritten since we last read it;
        // remove the entry from the metadata cache
        metadata = null;
        harMetaCache.remove(uri);
      }
    }
    if (metadata == null) {
      metadata = new HarMetaData(fs, masterIndexPath, archiveIndexPath);
      metadata.parseMetaData();
      harMetaCache.put(uri, metadata);
    }
  }

  // get the version of the filesystem from the masterindex file
  // the version is currently not useful since it is the first version
  // of archives
  public int getHarVersion() throws IOException {
    if (metadata != null) {
      return metadata.getVersion();
    } else {
      throw new IOException("Invalid meta data for the Har Filesystem");
    }
  }

  /*
   * Find the parent path that is the archive path within the
   * given path. The last path segment that ends with .har is
   * the path that will be returned.
   */
  private Path archivePath(Path p) {
    Path retPath = null;
    Path tmp = p;
    for (int i = 0; i < p.depth(); i++) {
      if (tmp.toString().endsWith(".har")) {
        retPath = tmp;
        break;
      }
      tmp = tmp.getParent();
    }
    return retPath;
  }
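
  // A worked example of the URI decoding done below (host names and ports
  // are illustrative, not taken from this file): the archive URI
  //   har://hdfs-namenode:8020/user/foo/data.har/dir/file
  // carries the underlying scheme and host fused into its authority
  // ("hdfs-namenode"), so decodeHarURI() splits the host at the first '-'
  // and reconstructs the underlying URI
  //   hdfs://namenode:8020/user/foo/data.har/dir/file
  // whereas har:///user/foo/data.har/dir/file (no authority) falls back to
  // the default filesystem URI from the configuration.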

  /**
   * Decode the raw URI to get the underlying URI.
   * @param rawURI raw Har URI
   * @return filtered URI of the underlying fileSystem
   */
  private URI decodeHarURI(URI rawURI, Configuration conf) throws IOException {
    String tmpAuth = rawURI.getAuthority();
    // no authority means we are using the default
    // filesystem in the config, so create an
    // underlying uri from it and return that
    if (tmpAuth == null) {
      return FileSystem.getDefaultUri(conf);
    }
    String host = rawURI.getHost();
    if (host == null) {
      throw new IOException("URI: " + rawURI
          + " is an invalid Har URI since host==null."
          + " Expecting har://<scheme>-<host>/<path>.");
    }
    int i = host.indexOf('-');
    if (i < 0) {
      throw new IOException("URI: " + rawURI
          + " is an invalid Har URI since '-' not found."
          + " Expecting har://<scheme>-<host>/<path>.");
    }
    final String underLyingScheme = host.substring(0, i);
    i++;
    final String underLyingHost = i == host.length() ? null : host.substring(i);
    int underLyingPort = rawURI.getPort();
    String auth = (underLyingHost == null && underLyingPort == -1) ?
        null : (underLyingHost + ":" + underLyingPort);
    URI tmp = null;
    if (rawURI.getQuery() != null) {
      // query component not allowed
      throw new IOException("query component in Path not supported " + rawURI);
    }
    try {
      tmp = new URI(underLyingScheme, auth, rawURI.getPath(),
          rawURI.getQuery(), rawURI.getFragment());
    } catch (URISyntaxException e) {
      // should not happen; the components come from a valid URI
    }
    return tmp;
  }

  private static String decodeString(String str)
      throws UnsupportedEncodingException {
    return URLDecoder.decode(str, "UTF-8");
  }

  private String decodeFileName(String fname)
      throws UnsupportedEncodingException {
    int version = metadata.getVersion();
    if (version == 2 || version == 3) {
      return decodeString(fname);
    }
    return fname;
  }

  /**
   * Return the top level archive.
   */
  public Path getWorkingDirectory() {
    return new Path(uri.toString());
  }

  /**
   * Create a har specific auth of the form
   * underlyingfsscheme-host:port.
   * @param underLyingUri the uri of the underlying
   * filesystem
   * @return har specific auth
   */
  private String getHarAuth(URI underLyingUri) {
    String auth = underLyingUri.getScheme() + "-";
    if (underLyingUri.getHost() != null) {
      auth += underLyingUri.getHost() + ":";
      if (underLyingUri.getPort() != -1) {
        auth += underLyingUri.getPort();
      }
    } else {
      auth += ":";
    }
    return auth;
  }

  /**
   * Returns the uri of this filesystem.
   * The uri is of the form
   * har://underlyingfsschema-host:port/pathintheunderlyingfs
   */
  @Override
  public URI getUri() {
    return this.uri;
  }

  /**
   * Returns the path inside the har filesystem, i.e. the
   * path relative to the archive root.
   * @param path the fully qualified path in the har filesystem.
   * @return relative path in the filesystem.
   */
  private Path getPathInHar(Path path) {
    Path harPath = new Path(path.toUri().getPath());
    if (archivePath.compareTo(harPath) == 0)
      return new Path(Path.SEPARATOR);
    Path tmp = new Path(harPath.getName());
    Path parent = harPath.getParent();
    while (!(parent.compareTo(archivePath) == 0)) {
      if (parent.toString().equals(Path.SEPARATOR)) {
        tmp = null;
        break;
      }
      tmp = new Path(parent.getName(), tmp);
      parent = parent.getParent();
    }
    if (tmp != null)
      tmp = new Path(Path.SEPARATOR, tmp);
    return tmp;
  }
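
  // Illustrative example of getPathInHar() above and makeRelative() below
  // (the archive location is made up): with archivePath = /user/foo/data.har,
  // getPathInHar() maps /user/foo/data.har/dir/file to /dir/file, while
  // makeRelative("/user/foo/data.har", new Path("/dir/file")) maps it back
  // to a fully qualified har path such as
  //   har://hdfs-namenode:8020/user/foo/data.har/dir/file
  // using this filesystem's scheme and authority.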

  // Make p relative by dropping the leading "/" and resolve it against
  // 'initial' (the archive path), qualified with the har scheme and
  // authority. Doing this with string manipulation is error prone, so
  // just use the Path api to do it.
  private Path makeRelative(String initial, Path p) {
    String scheme = this.uri.getScheme();
    String authority = this.uri.getAuthority();
    Path root = new Path(Path.SEPARATOR);
    if (root.compareTo(p) == 0)
      return new Path(scheme, authority, initial);
    Path retPath = new Path(p.getName());
    Path parent = p.getParent();
    for (int i = 0; i < p.depth() - 1; i++) {
      retPath = new Path(parent.getName(), retPath);
      parent = parent.getParent();
    }
    return new Path(new Path(scheme, authority, initial),
        retPath.toString());
  }

  /* This makes a path qualified in the har filesystem.
   * (non-Javadoc)
   * @see org.apache.hadoop.fs.FilterFileSystem#makeQualified(
   * org.apache.hadoop.fs.Path)
   */
  @Override
  public Path makeQualified(Path path) {
    // make sure that we just get the
    // path component
    Path fsPath = path;
    if (!path.isAbsolute()) {
      fsPath = new Path(archivePath, path);
    }

    URI tmpURI = fsPath.toUri();
    // change this to a Har uri
    return new Path(uri.getScheme(), harAuth, tmpURI.getPath());
  }
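
  // A worked numeric example for fixBlockLocations() below (all numbers are
  // illustrative): say the archived file begins at byte 1000 of its part
  // file (fileOffsetInHar = 1000) and the caller asks for start = 0,
  // len = 500. A part-file block at offset 512 with length 1024 spans
  // bytes -488..536 of the archived file, so its offset is moved up to 0,
  // its length shrinks to 536, and the trailing 36 bytes beyond end = 500
  // are trimmed, leaving offset 0 and length 500 in file-relative terms.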

  /**
   * Fix offset and length of block locations.
   * Note that this method modifies the original array.
   * @param locations block locations of har part file
   * @param start the start of the desired range in the contained file
   * @param len the length of the desired range
   * @param fileOffsetInHar the offset of the desired file in the har part file
   * @return block locations with fixed offset and length
   */
  static BlockLocation[] fixBlockLocations(BlockLocation[] locations,
                                           long start,
                                           long len,
                                           long fileOffsetInHar) {
    // offset 1 past last byte of desired range
    long end = start + len;

    for (BlockLocation location : locations) {
      // offset of part block relative to beginning of desired file
      // (may be negative if file starts in this part block)
      long harBlockStart = location.getOffset() - fileOffsetInHar;
      // offset 1 past last byte of har block relative to beginning of
      // desired file
      long harBlockEnd = harBlockStart + location.getLength();

      if (start > harBlockStart) {
        // desired range starts after beginning of this har block
        // fix offset to beginning of relevant range (relative to desired file)
        location.setOffset(start);
        // fix length to relevant portion of har block
        location.setLength(location.getLength() - (start - harBlockStart));
      } else {
        // desired range includes beginning of this har block
        location.setOffset(harBlockStart);
      }

      if (harBlockEnd > end) {
        // range ends before end of this har block
        // fix length to remove irrelevant portion at the end
        location.setLength(location.getLength() - (harBlockEnd - end));
      }
    }

    return locations;
  }

  /**
   * Get block locations from the underlying fs and fix their
   * offsets and lengths.
   * @param file the input filestatus to get block locations
   * @param start the start of the desired range in the contained file
   * @param len the length of the desired range
   * @return block locations for this segment of file
   * @throws IOException
   */
  @Override
  public BlockLocation[] getFileBlockLocations(FileStatus file, long start,
      long len) throws IOException {
    HarStatus hstatus = getFileHarStatus(file.getPath());
    Path partPath = new Path(archivePath, hstatus.getPartName());
    FileStatus partStatus = metadata.getPartFileStatus(partPath);

    // get all part blocks that overlap with the desired file blocks
    BlockLocation[] locations =
        fs.getFileBlockLocations(partStatus,
            hstatus.getStartIndex() + start, len);

    return fixBlockLocations(locations, start, len, hstatus.getStartIndex());
  }

  /**
   * The hash of the path p inside the filesystem.
   * @param p the path in the har filesystem
   * @return the hash code of the path.
   */
  public static int getHarHash(Path p) {
    return (p.toString().hashCode() & 0x7fffffff);
  }

  static class Store {
    public Store() {
      begin = end = startHash = endHash = 0;
    }
    public Store(long begin, long end, int startHash, int endHash) {
      this.begin = begin;
      this.end = end;
      this.startHash = startHash;
      this.endHash = endHash;
    }
    public long begin;
    public long end;
    public int startHash;
    public int endHash;
  }

  /**
   * Get filestatuses of all the children of a given directory. This just
   * reads through the index file line by line to get all statuses for the
   * children of a directory. It is a brute force way of getting all such
   * filestatuses.
   *
   * @param parent
   *          the parent path directory
   * @param statuses
   *          the list to add the children filestatuses to
   * @param children
   *          the string list of children for this parent
   */
  private void fileStatusesInIndex(HarStatus parent, List<FileStatus> statuses,
      List<String> children) throws IOException {
    String parentString = parent.getName();
    if (!parentString.endsWith(Path.SEPARATOR)) {
      parentString += Path.SEPARATOR;
    }
    Path harPath = new Path(parentString);
    int harlen = harPath.depth();
    final Map<String, FileStatus> cache = new TreeMap<String, FileStatus>();

    for (HarStatus hstatus : metadata.archive.values()) {
      String child = hstatus.getName();
      if ((child.startsWith(parentString))) {
        Path thisPath = new Path(child);
        if (thisPath.depth() == harlen + 1) {
          statuses.add(toFileStatus(hstatus, cache));
        }
      }
    }
  }

  /**
   * Combine the status stored in the index and the underlying status.
   * @param h status stored in the index
   * @param cache caching the underlying file statuses
   * @return the combined file status
   * @throws IOException
   */
  private FileStatus toFileStatus(HarStatus h,
      Map<String, FileStatus> cache) throws IOException {
    FileStatus underlying = null;
    if (cache != null) {
      underlying = cache.get(h.partName);
    }
    if (underlying == null) {
      final Path p = h.isDir ? archivePath : new Path(archivePath, h.partName);
      underlying = fs.getFileStatus(p);
      if (cache != null) {
        cache.put(h.partName, underlying);
      }
    }

    long modTime = 0;
    int version = metadata.getVersion();
    if (version < 3) {
      modTime = underlying.getModificationTime();
    } else if (version == 3) {
      modTime = h.getModificationTime();
    }

    return new FileStatus(
        h.isDir() ? 0L : h.getLength(),
        h.isDir(),
        underlying.getReplication(),
        underlying.getBlockSize(),
        modTime,
        underlying.getAccessTime(),
        underlying.getPermission(),
        underlying.getOwner(),
        underlying.getGroup(),
        makeRelative(this.uri.getPath(), new Path(h.name)));
  }
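
  // Illustrative index entries for the parser below (paths and sizes are
  // made up). A directory entry lists its children after the first five
  // fields, while a file entry names the part file and byte range holding
  // its data:
  //   /dir dir none 0 0 childFile anotherChild
  //   /dir/childFile file part-0 0 1024
  // In version 3 an additional URL-encoded properties field (modification
  // time, permission, owner, group) is appended as the last field for
  // files and stored in the partName slot for directories.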

  // A single-line parser for hadoop archive statuses,
  // stored one per line in the index files.
  // The format is of the form:
  // filename "dir"/"file" partFileName startIndex length
  // <space separated children>
  private class HarStatus {
    boolean isDir;
    String name;
    List<String> children;
    String partName;
    long startIndex;
    long length;
    long modificationTime = 0;

    public HarStatus(String harString) throws UnsupportedEncodingException {
      String[] splits = harString.split(" ");
      this.name = decodeFileName(splits[0]);
      this.isDir = "dir".equals(splits[1]) ? true : false;
      // this is equal to "none" if it's a directory
      this.partName = splits[2];
      this.startIndex = Long.parseLong(splits[3]);
      this.length = Long.parseLong(splits[4]);

      int version = metadata.getVersion();
      String[] propSplits = null;
      // propSplits is used to retrieve the metainformation that Har versions
      // 1 & 2 missed (modification time, permission, owner, group).
      // These fields are stored in an encoded string placed in different
      // locations depending on whether it's a file or directory entry.
      // If it's a directory, the string will be placed at the partName
      // location (directories have no partName because they don't have data
      // to be stored). This is done because the number of fields in a
      // directory entry is unbounded (all children are listed at the end).
      // If it's a file, the string will be the last field.
      if (isDir) {
        if (version == 3) {
          propSplits = decodeString(this.partName).split(" ");
        }
        children = new ArrayList<String>();
        for (int i = 5; i < splits.length; i++) {
          children.add(decodeFileName(splits[i]));
        }
      } else if (version == 3) {
        propSplits = decodeString(splits[5]).split(" ");
      }

      if (propSplits != null && propSplits.length >= 4) {
        modificationTime = Long.parseLong(propSplits[0]);
        // the fields below are stored in the file but are currently not used
        // by HarFileSystem
        // permission = new FsPermission(Short.parseShort(propSplits[1]));
        // owner = decodeString(propSplits[2]);
        // group = decodeString(propSplits[3]);
      }
    }
    public boolean isDir() {
      return isDir;
    }

    public String getName() {
      return name;
    }
    public String getPartName() {
      return partName;
    }
    public long getStartIndex() {
      return startIndex;
    }
    public long getLength() {
      return length;
    }
    public long getModificationTime() {
      return modificationTime;
    }
  }

  /**
   * Return the filestatus of files in the har archive.
   * The permissions returned are those of the archive
   * index files. The permissions are not persisted
   * while creating a hadoop archive.
   * @param f the path in har filesystem
   * @return filestatus.
   * @throws IOException
   */
  @Override
  public FileStatus getFileStatus(Path f) throws IOException {
    HarStatus hstatus = getFileHarStatus(f);
    return toFileStatus(hstatus, null);
  }

  private HarStatus getFileHarStatus(Path f) throws IOException {
    // get the fs DataInputStream for the underlying file
    // look up the index.
    Path p = makeQualified(f);
    Path harPath = getPathInHar(p);
    if (harPath == null) {
      throw new IOException("Invalid file name: " + f + " in " + uri);
    }
    HarStatus hstatus = metadata.archive.get(harPath);
    if (hstatus == null) {
      throw new FileNotFoundException("File: " + f + " does not exist in " + uri);
    }
    return hstatus;
  }

  /**
   * @return null since no checksum algorithm is implemented.
   */
  public FileChecksum getFileChecksum(Path f) {
    return null;
  }

  /**
   * Returns a har input stream which fakes end of
   * file. It reads the index files to get the part
   * file name and the size and start of the file.
   */
  @Override
  public FSDataInputStream open(Path f, int bufferSize) throws IOException {
    // get the fs DataInputStream for the underlying file
    HarStatus hstatus = getFileHarStatus(f);
    // we got it.. woo hooo!!!
    if (hstatus.isDir()) {
      throw new FileNotFoundException(f + " : not a file in " +
          archivePath);
    }
    return new HarFSDataInputStream(fs, new Path(archivePath,
        hstatus.getPartName()),
        hstatus.getStartIndex(), hstatus.getLength(), bufferSize);
  }

  public FSDataOutputStream create(Path f,
      FsPermission permission,
      boolean overwrite,
      int bufferSize,
      short replication,
      long blockSize,
      Progressable progress) throws IOException {
    throw new IOException("Har: create not allowed.");
  }

  @Override
  public void close() throws IOException {
    if (fs != null) {
      try {
        fs.close();
      } catch (IOException ie) {
        // this might already be closed
        // ignore
      }
    }
  }

  /**
   * Not implemented.
   */
  @Override
  public boolean setReplication(Path src, short replication) throws IOException {
    throw new IOException("Har: setreplication not allowed");
  }

  /**
   * Not implemented.
   */
  @Override
  public boolean delete(Path f, boolean recursive) throws IOException {
    throw new IOException("Har: delete not allowed");
  }

  /**
   * listStatus returns the children of a directory
   * after looking up the index files.
   */
  @Override
  public FileStatus[] listStatus(Path f) throws IOException {
    // look the path up in the index and get the filestatus
    // of the archive directory; we will create fake
    // filestatuses to return to the client
    List<FileStatus> statuses = new ArrayList<FileStatus>();
    Path tmpPath = makeQualified(f);
    Path harPath = getPathInHar(tmpPath);
    HarStatus hstatus = metadata.archive.get(harPath);
    if (hstatus == null) {
      throw new FileNotFoundException("File " + f + " not found in " + archivePath);
    }
    if (hstatus.isDir()) {
      fileStatusesInIndex(hstatus, statuses, hstatus.children);
    } else {
      statuses.add(toFileStatus(hstatus, null));
    }

    return statuses.toArray(new FileStatus[statuses.size()]);
  }

  /**
   * Return the top level archive path.
   */
  public Path getHomeDirectory() {
    return new Path(uri.toString());
  }

  public void setWorkingDirectory(Path newDir) {
    // does nothing.
  }

  /**
   * Not implemented.
   */
  public boolean mkdirs(Path f, FsPermission permission) throws IOException {
    throw new IOException("Har: mkdirs not allowed");
  }

  /**
   * Not implemented.
   */
  public void copyFromLocalFile(boolean delSrc, Path src, Path dst) throws
      IOException {
    throw new IOException("Har: copyfromlocalfile not allowed");
  }

  /**
   * Copies the file in the har filesystem to a local file.
   */
  public void copyToLocalFile(boolean delSrc, Path src, Path dst)
      throws IOException {
    FileUtil.copy(this, src, getLocal(getConf()), dst, false, getConf());
  }

  /**
   * Not implemented.
   */
  public Path startLocalOutput(Path fsOutputFile, Path tmpLocalFile)
      throws IOException {
    throw new IOException("Har: startLocalOutput not allowed");
  }

  /**
   * Not implemented.
   */
  public void completeLocalOutput(Path fsOutputFile, Path tmpLocalFile)
      throws IOException {
    throw new IOException("Har: completeLocalOutput not allowed");
  }

  /**
   * Not implemented.
   */
  public void setOwner(Path p, String username, String groupname)
      throws IOException {
    throw new IOException("Har: setowner not allowed");
  }

  /**
   * Not implemented.
   */
  public void setPermission(Path p, FsPermission permission)
      throws IOException {
    throw new IOException("Har: setPermission not allowed");
  }
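
  // Note on coordinates in the stream classes below: callers work in
  // positions relative to the archived file (0 .. length), while the
  // underlying stream reads the enclosing part file. Every read and seek is
  // therefore shifted by 'start' and clamped at 'end' so that EOF is faked
  // at the archived file's boundary.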

  /**
   * Hadoop archives input stream. This input stream fakes EOF
   * since archive files are part of bigger part files.
   */
  private static class HarFSDataInputStream extends FSDataInputStream {
    /**
     * Create an input stream that fakes all the reads/positions/seeking.
     */
    private static class HarFsInputStream extends FSInputStream {
      private long position, start, end;
      // The underlying data input stream that the
      // underlying filesystem will return.
      private FSDataInputStream underLyingStream;
      // one byte buffer
      private byte[] oneBytebuff = new byte[1];
      HarFsInputStream(FileSystem fs, Path path, long start,
          long length, int bufferSize) throws IOException {
        underLyingStream = fs.open(path, bufferSize);
        underLyingStream.seek(start);
        // the start of this file in the part file
        this.start = start;
        // the position pointer in the part file
        this.position = start;
        // the end pointer in the part file
        this.end = start + length;
      }

      public synchronized int available() throws IOException {
        long remaining = end - underLyingStream.getPos();
        if (remaining > (long) Integer.MAX_VALUE) {
          return Integer.MAX_VALUE;
        }
        return (int) remaining;
      }

      public synchronized void close() throws IOException {
        underLyingStream.close();
        super.close();
      }

      // not implemented
      @Override
      public void mark(int readLimit) {
        // do nothing
      }

      /**
       * reset is not implemented
       */
      public void reset() throws IOException {
        throw new IOException("reset not implemented.");
      }

      public synchronized int read() throws IOException {
        int ret = read(oneBytebuff, 0, 1);
        return (ret <= 0) ? -1 : (oneBytebuff[0] & 0xff);
      }

      public synchronized int read(byte[] b) throws IOException {
        // read(b, 0, b.length) already advances 'position', so do not add
        // ret again here; doing so would double count the bytes read.
        return read(b, 0, b.length);
      }

      /**
       * Read up to len bytes, clamped so that the read never goes past the
       * faked end of file.
       */
      public synchronized int read(byte[] b, int offset, int len)
          throws IOException {
        int newlen = len;
        int ret = -1;
        if (position + len > end) {
          newlen = (int) (end - position);
        }
        // end case
        if (newlen == 0)
          return ret;
        ret = underLyingStream.read(b, offset, newlen);
        // only advance the position when bytes were actually read
        if (ret > 0) {
          position += ret;
        }
        return ret;
      }

      public synchronized long skip(long n) throws IOException {
        long tmpN = n;
        if (tmpN > 0) {
          if (position + tmpN > end) {
            tmpN = end - position;
          }
          underLyingStream.seek(tmpN + position);
          position += tmpN;
          return tmpN;
        }
        return (tmpN < 0) ? -1 : 0;
      }

      public synchronized long getPos() throws IOException {
        return (position - start);
      }

      public synchronized void seek(long pos) throws IOException {
        if (pos < 0 || (start + pos > end)) {
          throw new IOException("Failed to seek: EOF");
        }
        position = start + pos;
        underLyingStream.seek(position);
      }

      public boolean seekToNewSource(long targetPos) throws IOException {
        // do not need to implement this;
        // hdfs itself does seekToNewSource
        // while reading.
        return false;
      }

      /**
       * Implementing position readable.
       */
      public int read(long pos, byte[] b, int offset, int length)
          throws IOException {
        int nlength = length;
        if (start + nlength + pos > end) {
          nlength = (int) (end - (start + pos));
        }
        return underLyingStream.read(pos + start, b, offset, nlength);
      }

      /**
       * Position readable again.
       */
      public void readFully(long pos, byte[] b, int offset, int length)
          throws IOException {
        if (start + length + pos > end) {
          throw new IOException("Not enough bytes to read.");
        }
        underLyingStream.readFully(pos + start, b, offset, length);
      }

      public void readFully(long pos, byte[] b) throws IOException {
        readFully(pos, b, 0, b.length);
      }

    }

    /**
     * Constructor for har input stream.
     * @param fs the underlying filesystem
     * @param p the path in the underlying filesystem
     * @param start the start position in the part file
     * @param length the length of valid data in the part file
     * @param bufsize the buffer size
     * @throws IOException
     */
    public HarFSDataInputStream(FileSystem fs, Path p, long start,
        long length, int bufsize) throws IOException {
      super(new HarFsInputStream(fs, p, start, length, bufsize));
    }

    /**
     * Constructor for har input stream.
     * @param fs the underlying filesystem
     * @param p the path in the underlying file system
     * @param start the start position in the part file
     * @param length the length of valid data in the part file.
     * @throws IOException
     */
    public HarFSDataInputStream(FileSystem fs, Path p, long start, long length)
        throws IOException {
      super(new HarFsInputStream(fs, p, start, length, 0));
    }
  }

  private class HarMetaData {
    private FileSystem fs;
    private int version;
    // the masterIndex of the archive
    private Path masterIndexPath;
    // the index file
    private Path archiveIndexPath;

    private long masterIndexTimestamp;
    private long archiveIndexTimestamp;

    List<Store> stores = new ArrayList<Store>();
    Map<Path, HarStatus> archive = new HashMap<Path, HarStatus>();
    private Map<Path, FileStatus> partFileStatuses = new HashMap<Path, FileStatus>();

    public HarMetaData(FileSystem fs, Path masterIndexPath, Path archiveIndexPath) {
      this.fs = fs;
      this.masterIndexPath = masterIndexPath;
      this.archiveIndexPath = archiveIndexPath;
    }

    public FileStatus getPartFileStatus(Path partPath) throws IOException {
      FileStatus status;
      status = partFileStatuses.get(partPath);
      if (status == null) {
        status = fs.getFileStatus(partPath);
        partFileStatuses.put(partPath, status);
      }
      return status;
    }

    public long getMasterIndexTimestamp() {
      return masterIndexTimestamp;
    }

    public long getArchiveIndexTimestamp() {
      return archiveIndexTimestamp;
    }

    private int getVersion() {
      return version;
    }
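
    // The master index read by parseMetaData() below is expected to look
    // roughly like this (offsets and hash values are illustrative): a
    // version line followed by lines of
    //   startHash endHash indexStartOffset indexEndOffset
    // e.g.
    //   3
    //   0 2147483647 0 1865
    // Each range line tells us which byte range of the _index file holds
    // the entries whose path hashes fall in [startHash, endHash].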

    private void parseMetaData() throws IOException {
      FSDataInputStream in = fs.open(masterIndexPath);
      FileStatus masterStat = fs.getFileStatus(masterIndexPath);
      masterIndexTimestamp = masterStat.getModificationTime();
      LineReader lin = new LineReader(in, getConf());
      Text line = new Text();
      long read = lin.readLine(line);

      // the first line contains the version of the index file
      String versionLine = line.toString();
      String[] arr = versionLine.split(" ");
      version = Integer.parseInt(arr[0]);
      // make it always backwards-compatible
      if (this.version > HarFileSystem.VERSION) {
        throw new IOException("Invalid version " +
            this.version + " expected " + HarFileSystem.VERSION);
      }

      // each subsequent line contains a hashcode range and the
      // corresponding byte range in the index file
      String[] readStr = null;
      while (read < masterStat.getLen()) {
        int b = lin.readLine(line);
        read += b;
        readStr = line.toString().split(" ");
        int startHash = Integer.parseInt(readStr[0]);
        int endHash = Integer.parseInt(readStr[1]);
        stores.add(new Store(Long.parseLong(readStr[2]),
            Long.parseLong(readStr[3]), startHash,
            endHash));
        line.clear();
      }
      try {
        // close the master index
        lin.close();
      } catch (IOException io) {
        // do nothing, just a read
      }

      FSDataInputStream aIn = fs.open(archiveIndexPath);
      FileStatus archiveStat = fs.getFileStatus(archiveIndexPath);
      archiveIndexTimestamp = archiveStat.getModificationTime();
      LineReader aLin;

      // now start reading the real index file
      for (Store s : stores) {
        read = 0;
        aIn.seek(s.begin);
        aLin = new LineReader(aIn, getConf());
        while (read + s.begin < s.end) {
          int tmp = aLin.readLine(line);
          read += tmp;
          String lineFeed = line.toString();
          String[] parsed = lineFeed.split(" ");
          parsed[0] = decodeFileName(parsed[0]);
          archive.put(new Path(parsed[0]), new HarStatus(lineFeed));
          line.clear();
        }
      }
      try {
        // close the archive index
        aIn.close();
      } catch (IOException io) {
        // do nothing, just a read
      }
    }
  }

  /*
   * testing purposes only:
   */
  HarMetaData getMetadata() {
    return metadata;
  }
}