001 /** 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018 package org.apache.hadoop.fs; 019 020 import java.io.FileNotFoundException; 021 import java.io.IOException; 022 import java.io.UnsupportedEncodingException; 023 import java.net.URI; 024 import java.net.URISyntaxException; 025 import java.net.URLDecoder; 026 import java.util.ArrayList; 027 import java.util.EnumSet; 028 import java.util.List; 029 import java.util.Map; 030 import java.util.TreeMap; 031 import java.util.HashMap; 032 033 import org.apache.hadoop.conf.Configuration; 034 import org.apache.hadoop.fs.permission.FsPermission; 035 import org.apache.hadoop.io.Text; 036 import org.apache.hadoop.util.LineReader; 037 import org.apache.hadoop.util.Progressable; 038 039 /** 040 * This is an implementation of the Hadoop Archive 041 * Filesystem. This archive Filesystem has index files 042 * of the form _index* and has contents of the form 043 * part-*. The index files store the indexes of the 044 * real files. The index files are of the form _masterindex 045 * and _index. The master index is a level of indirection 046 * in to the index file to make the look ups faster. the index 047 * file is sorted with hash code of the paths that it contains 048 * and the master index contains pointers to the positions in 049 * index for ranges of hashcodes. 050 */ 051 052 public class HarFileSystem extends FilterFileSystem { 053 public static final int VERSION = 3; 054 055 private static final Map<URI, HarMetaData> harMetaCache = new HashMap<URI, HarMetaData>(); 056 057 // uri representation of this Har filesystem 058 private URI uri; 059 // the top level path of the archive 060 // in the underlying file system 061 private Path archivePath; 062 // the har auth 063 private String harAuth; 064 065 // pointer into the static metadata cache 066 private HarMetaData metadata; 067 068 /** 069 * public construction of harfilesystem 070 * 071 */ 072 public HarFileSystem() { 073 } 074 075 /** 076 * Constructor to create a HarFileSystem with an 077 * underlying filesystem. 078 * @param fs 079 */ 080 public HarFileSystem(FileSystem fs) { 081 super(fs); 082 } 083 084 /** 085 * Initialize a Har filesystem per har archive. The 086 * archive home directory is the top level directory 087 * in the filesystem that contains the HAR archive. 088 * Be careful with this method, you do not want to go 089 * on creating new Filesystem instances per call to 090 * path.getFileSystem(). 091 * the uri of Har is 092 * har://underlyingfsscheme-host:port/archivepath. 093 * or 094 * har:///archivepath. This assumes the underlying filesystem 095 * to be used in case not specified. 096 */ 097 public void initialize(URI name, Configuration conf) throws IOException { 098 // decode the name 099 URI underLyingURI = decodeHarURI(name, conf); 100 // we got the right har Path- now check if this is 101 // truly a har filesystem 102 Path harPath = archivePath( 103 new Path(name.getScheme(), name.getAuthority(), name.getPath())); 104 if (harPath == null) { 105 throw new IOException("Invalid path for the Har Filesystem. " + 106 name.toString()); 107 } 108 if (fs == null) { 109 fs = FileSystem.get(underLyingURI, conf); 110 } 111 uri = harPath.toUri(); 112 archivePath = new Path(uri.getPath()); 113 harAuth = getHarAuth(underLyingURI); 114 //check for the underlying fs containing 115 // the index file 116 Path masterIndexPath = new Path(archivePath, "_masterindex"); 117 Path archiveIndexPath = new Path(archivePath, "_index"); 118 if (!fs.exists(masterIndexPath) || !fs.exists(archiveIndexPath)) { 119 throw new IOException("Invalid path for the Har Filesystem. " + 120 "No index file in " + harPath); 121 } 122 123 metadata = harMetaCache.get(uri); 124 if (metadata != null) { 125 FileStatus mStat = fs.getFileStatus(masterIndexPath); 126 FileStatus aStat = fs.getFileStatus(archiveIndexPath); 127 if (mStat.getModificationTime() != metadata.getMasterIndexTimestamp() || 128 aStat.getModificationTime() != metadata.getArchiveIndexTimestamp()) { 129 // the archive has been overwritten since we last read it 130 // remove the entry from the meta data cache 131 metadata = null; 132 harMetaCache.remove(uri); 133 } 134 } 135 if (metadata == null) { 136 metadata = new HarMetaData(fs, masterIndexPath, archiveIndexPath); 137 metadata.parseMetaData(); 138 harMetaCache.put(uri, metadata); 139 } 140 } 141 142 // get the version of the filesystem from the masterindex file 143 // the version is currently not useful since its the first version 144 // of archives 145 public int getHarVersion() throws IOException { 146 if (metadata != null) { 147 return metadata.getVersion(); 148 } 149 else { 150 throw new IOException("Invalid meta data for the Har Filesystem"); 151 } 152 } 153 154 /* 155 * find the parent path that is the 156 * archive path in the path. The last 157 * path segment that ends with .har is 158 * the path that will be returned. 159 */ 160 private Path archivePath(Path p) { 161 Path retPath = null; 162 Path tmp = p; 163 for (int i=0; i< p.depth(); i++) { 164 if (tmp.toString().endsWith(".har")) { 165 retPath = tmp; 166 break; 167 } 168 tmp = tmp.getParent(); 169 } 170 return retPath; 171 } 172 173 /** 174 * decode the raw URI to get the underlying URI 175 * @param rawURI raw Har URI 176 * @return filtered URI of the underlying fileSystem 177 */ 178 private URI decodeHarURI(URI rawURI, Configuration conf) throws IOException { 179 String tmpAuth = rawURI.getAuthority(); 180 //we are using the default file 181 //system in the config 182 //so create a underlying uri and 183 //return it 184 if (tmpAuth == null) { 185 //create a path 186 return FileSystem.getDefaultUri(conf); 187 } 188 String host = rawURI.getHost(); 189 if (host == null) { 190 throw new IOException("URI: " + rawURI 191 + " is an invalid Har URI since host==null." 192 + " Expecting har://<scheme>-<host>/<path>."); 193 } 194 int i = host.indexOf('-'); 195 if (i < 0) { 196 throw new IOException("URI: " + rawURI 197 + " is an invalid Har URI since '-' not found." 198 + " Expecting har://<scheme>-<host>/<path>."); 199 } 200 final String underLyingScheme = host.substring(0, i); 201 i++; 202 final String underLyingHost = i == host.length()? null: host.substring(i); 203 int underLyingPort = rawURI.getPort(); 204 String auth = (underLyingHost == null && underLyingPort == -1)? 205 null:(underLyingHost+":"+underLyingPort); 206 URI tmp = null; 207 if (rawURI.getQuery() != null) { 208 // query component not allowed 209 throw new IOException("query component in Path not supported " + rawURI); 210 } 211 try { 212 tmp = new URI(underLyingScheme, auth, rawURI.getPath(), 213 rawURI.getQuery(), rawURI.getFragment()); 214 } catch (URISyntaxException e) { 215 // do nothing should not happen 216 } 217 return tmp; 218 } 219 220 private static String decodeString(String str) 221 throws UnsupportedEncodingException { 222 return URLDecoder.decode(str, "UTF-8"); 223 } 224 225 private String decodeFileName(String fname) 226 throws UnsupportedEncodingException { 227 int version = metadata.getVersion(); 228 if (version == 2 || version == 3){ 229 return decodeString(fname); 230 } 231 return fname; 232 } 233 234 /** 235 * return the top level archive. 236 */ 237 public Path getWorkingDirectory() { 238 return new Path(uri.toString()); 239 } 240 241 /** 242 * Create a har specific auth 243 * har-underlyingfs:port 244 * @param underLyingURI the uri of underlying 245 * filesystem 246 * @return har specific auth 247 */ 248 private String getHarAuth(URI underLyingUri) { 249 String auth = underLyingUri.getScheme() + "-"; 250 if (underLyingUri.getHost() != null) { 251 auth += underLyingUri.getHost() + ":"; 252 if (underLyingUri.getPort() != -1) { 253 auth += underLyingUri.getPort(); 254 } 255 } 256 else { 257 auth += ":"; 258 } 259 return auth; 260 } 261 262 /** 263 * Returns the uri of this filesystem. 264 * The uri is of the form 265 * har://underlyingfsschema-host:port/pathintheunderlyingfs 266 */ 267 @Override 268 public URI getUri() { 269 return this.uri; 270 } 271 272 /** 273 * this method returns the path 274 * inside the har filesystem. 275 * this is relative path inside 276 * the har filesystem. 277 * @param path the fully qualified path in the har filesystem. 278 * @return relative path in the filesystem. 279 */ 280 private Path getPathInHar(Path path) { 281 Path harPath = new Path(path.toUri().getPath()); 282 if (archivePath.compareTo(harPath) == 0) 283 return new Path(Path.SEPARATOR); 284 Path tmp = new Path(harPath.getName()); 285 Path parent = harPath.getParent(); 286 while (!(parent.compareTo(archivePath) == 0)) { 287 if (parent.toString().equals(Path.SEPARATOR)) { 288 tmp = null; 289 break; 290 } 291 tmp = new Path(parent.getName(), tmp); 292 parent = parent.getParent(); 293 } 294 if (tmp != null) 295 tmp = new Path(Path.SEPARATOR, tmp); 296 return tmp; 297 } 298 299 //the relative path of p. basically 300 // getting rid of /. Parsing and doing 301 // string manipulation is not good - so 302 // just use the path api to do it. 303 private Path makeRelative(String initial, Path p) { 304 String scheme = this.uri.getScheme(); 305 String authority = this.uri.getAuthority(); 306 Path root = new Path(Path.SEPARATOR); 307 if (root.compareTo(p) == 0) 308 return new Path(scheme, authority, initial); 309 Path retPath = new Path(p.getName()); 310 Path parent = p.getParent(); 311 for (int i=0; i < p.depth()-1; i++) { 312 retPath = new Path(parent.getName(), retPath); 313 parent = parent.getParent(); 314 } 315 return new Path(new Path(scheme, authority, initial), 316 retPath.toString()); 317 } 318 319 /* this makes a path qualified in the har filesystem 320 * (non-Javadoc) 321 * @see org.apache.hadoop.fs.FilterFileSystem#makeQualified( 322 * org.apache.hadoop.fs.Path) 323 */ 324 @Override 325 public Path makeQualified(Path path) { 326 // make sure that we just get the 327 // path component 328 Path fsPath = path; 329 if (!path.isAbsolute()) { 330 fsPath = new Path(archivePath, path); 331 } 332 333 URI tmpURI = fsPath.toUri(); 334 //change this to Har uri 335 return new Path(uri.getScheme(), harAuth, tmpURI.getPath()); 336 } 337 338 /** 339 * Fix offset and length of block locations. 340 * Note that this method modifies the original array. 341 * @param locations block locations of har part file 342 * @param start the start of the desired range in the contained file 343 * @param len the length of the desired range 344 * @param fileOffsetInHar the offset of the desired file in the har part file 345 * @return block locations with fixed offset and length 346 */ 347 static BlockLocation[] fixBlockLocations(BlockLocation[] locations, 348 long start, 349 long len, 350 long fileOffsetInHar) { 351 // offset 1 past last byte of desired range 352 long end = start + len; 353 354 for (BlockLocation location : locations) { 355 // offset of part block relative to beginning of desired file 356 // (may be negative if file starts in this part block) 357 long harBlockStart = location.getOffset() - fileOffsetInHar; 358 // offset 1 past last byte of har block relative to beginning of 359 // desired file 360 long harBlockEnd = harBlockStart + location.getLength(); 361 362 if (start > harBlockStart) { 363 // desired range starts after beginning of this har block 364 // fix offset to beginning of relevant range (relative to desired file) 365 location.setOffset(start); 366 // fix length to relevant portion of har block 367 location.setLength(location.getLength() - (start - harBlockStart)); 368 } else { 369 // desired range includes beginning of this har block 370 location.setOffset(harBlockStart); 371 } 372 373 if (harBlockEnd > end) { 374 // range ends before end of this har block 375 // fix length to remove irrelevant portion at the end 376 location.setLength(location.getLength() - (harBlockEnd - end)); 377 } 378 } 379 380 return locations; 381 } 382 383 /** 384 * Get block locations from the underlying fs and fix their 385 * offsets and lengths. 386 * @param file the input filestatus to get block locations 387 * @param start the start of the desired range in the contained file 388 * @param len the length of the desired range 389 * @return block locations for this segment of file 390 * @throws IOException 391 */ 392 @Override 393 public BlockLocation[] getFileBlockLocations(FileStatus file, long start, 394 long len) throws IOException { 395 HarStatus hstatus = getFileHarStatus(file.getPath()); 396 Path partPath = new Path(archivePath, hstatus.getPartName()); 397 FileStatus partStatus = metadata.getPartFileStatus(partPath); 398 399 // get all part blocks that overlap with the desired file blocks 400 BlockLocation[] locations = 401 fs.getFileBlockLocations(partStatus, 402 hstatus.getStartIndex() + start, len); 403 404 return fixBlockLocations(locations, start, len, hstatus.getStartIndex()); 405 } 406 407 /** 408 * the hash of the path p inside iniside 409 * the filesystem 410 * @param p the path in the harfilesystem 411 * @return the hash code of the path. 412 */ 413 public static int getHarHash(Path p) { 414 return (p.toString().hashCode() & 0x7fffffff); 415 } 416 417 static class Store { 418 public Store() { 419 begin = end = startHash = endHash = 0; 420 } 421 public Store(long begin, long end, int startHash, int endHash) { 422 this.begin = begin; 423 this.end = end; 424 this.startHash = startHash; 425 this.endHash = endHash; 426 } 427 public long begin; 428 public long end; 429 public int startHash; 430 public int endHash; 431 } 432 433 /** 434 * Get filestatuses of all the children of a given directory. This just reads 435 * through index file and reads line by line to get all statuses for children 436 * of a directory. Its a brute force way of getting all such filestatuses 437 * 438 * @param parent 439 * the parent path directory 440 * @param statuses 441 * the list to add the children filestatuses to 442 * @param children 443 * the string list of children for this parent 444 * @param archiveIndexStat 445 * the archive index filestatus 446 */ 447 private void fileStatusesInIndex(HarStatus parent, List<FileStatus> statuses, 448 List<String> children) throws IOException { 449 String parentString = parent.getName(); 450 if (!parentString.endsWith(Path.SEPARATOR)){ 451 parentString += Path.SEPARATOR; 452 } 453 Path harPath = new Path(parentString); 454 int harlen = harPath.depth(); 455 final Map<String, FileStatus> cache = new TreeMap<String, FileStatus>(); 456 457 for (HarStatus hstatus : metadata.archive.values()) { 458 String child = hstatus.getName(); 459 if ((child.startsWith(parentString))) { 460 Path thisPath = new Path(child); 461 if (thisPath.depth() == harlen + 1) { 462 statuses.add(toFileStatus(hstatus, cache)); 463 } 464 } 465 } 466 } 467 468 /** 469 * Combine the status stored in the index and the underlying status. 470 * @param h status stored in the index 471 * @param cache caching the underlying file statuses 472 * @return the combined file status 473 * @throws IOException 474 */ 475 private FileStatus toFileStatus(HarStatus h, 476 Map<String, FileStatus> cache) throws IOException { 477 FileStatus underlying = null; 478 if (cache != null) { 479 underlying = cache.get(h.partName); 480 } 481 if (underlying == null) { 482 final Path p = h.isDir? archivePath: new Path(archivePath, h.partName); 483 underlying = fs.getFileStatus(p); 484 if (cache != null) { 485 cache.put(h.partName, underlying); 486 } 487 } 488 489 long modTime = 0; 490 int version = metadata.getVersion(); 491 if (version < 3) { 492 modTime = underlying.getModificationTime(); 493 } else if (version == 3) { 494 modTime = h.getModificationTime(); 495 } 496 497 return new FileStatus( 498 h.isDir()? 0L: h.getLength(), 499 h.isDir(), 500 underlying.getReplication(), 501 underlying.getBlockSize(), 502 modTime, 503 underlying.getAccessTime(), 504 underlying.getPermission(), 505 underlying.getOwner(), 506 underlying.getGroup(), 507 makeRelative(this.uri.getPath(), new Path(h.name))); 508 } 509 510 // a single line parser for hadoop archives status 511 // stored in a single line in the index files 512 // the format is of the form 513 // filename "dir"/"file" partFileName startIndex length 514 // <space seperated children> 515 private class HarStatus { 516 boolean isDir; 517 String name; 518 List<String> children; 519 String partName; 520 long startIndex; 521 long length; 522 long modificationTime = 0; 523 524 public HarStatus(String harString) throws UnsupportedEncodingException { 525 String[] splits = harString.split(" "); 526 this.name = decodeFileName(splits[0]); 527 this.isDir = "dir".equals(splits[1]) ? true: false; 528 // this is equal to "none" if its a directory 529 this.partName = splits[2]; 530 this.startIndex = Long.parseLong(splits[3]); 531 this.length = Long.parseLong(splits[4]); 532 533 int version = metadata.getVersion(); 534 String[] propSplits = null; 535 // propSplits is used to retrieve the metainformation that Har versions 536 // 1 & 2 missed (modification time, permission, owner group). 537 // These fields are stored in an encoded string placed in different 538 // locations depending on whether it's a file or directory entry. 539 // If it's a directory, the string will be placed at the partName 540 // location (directories have no partName because they don't have data 541 // to be stored). This is done because the number of fields in a 542 // directory entry is unbounded (all children are listed at the end) 543 // If it's a file, the string will be the last field. 544 if (isDir) { 545 if (version == 3){ 546 propSplits = decodeString(this.partName).split(" "); 547 } 548 children = new ArrayList<String>(); 549 for (int i = 5; i < splits.length; i++) { 550 children.add(decodeFileName(splits[i])); 551 } 552 } else if (version == 3) { 553 propSplits = decodeString(splits[5]).split(" "); 554 } 555 556 if (propSplits != null && propSplits.length >= 4) { 557 modificationTime = Long.parseLong(propSplits[0]); 558 // the fields below are stored in the file but are currently not used 559 // by HarFileSystem 560 // permission = new FsPermission(Short.parseShort(propSplits[1])); 561 // owner = decodeString(propSplits[2]); 562 // group = decodeString(propSplits[3]); 563 } 564 } 565 public boolean isDir() { 566 return isDir; 567 } 568 569 public String getName() { 570 return name; 571 } 572 573 public List<String> getChildren() { 574 return children; 575 } 576 public String getFileName() { 577 return name; 578 } 579 public String getPartName() { 580 return partName; 581 } 582 public long getStartIndex() { 583 return startIndex; 584 } 585 public long getLength() { 586 return length; 587 } 588 public long getModificationTime() { 589 return modificationTime; 590 } 591 } 592 593 /** 594 * return the filestatus of files in har archive. 595 * The permission returned are that of the archive 596 * index files. The permissions are not persisted 597 * while creating a hadoop archive. 598 * @param f the path in har filesystem 599 * @return filestatus. 600 * @throws IOException 601 */ 602 @Override 603 public FileStatus getFileStatus(Path f) throws IOException { 604 HarStatus hstatus = getFileHarStatus(f); 605 return toFileStatus(hstatus, null); 606 } 607 608 private HarStatus getFileHarStatus(Path f) throws IOException { 609 // get the fs DataInputStream for the underlying file 610 // look up the index. 611 Path p = makeQualified(f); 612 Path harPath = getPathInHar(p); 613 if (harPath == null) { 614 throw new IOException("Invalid file name: " + f + " in " + uri); 615 } 616 HarStatus hstatus = metadata.archive.get(harPath); 617 if (hstatus == null) { 618 throw new FileNotFoundException("File: " + f + " does not exist in " + uri); 619 } 620 return hstatus; 621 } 622 623 /** 624 * @return null since no checksum algorithm is implemented. 625 */ 626 public FileChecksum getFileChecksum(Path f) { 627 return null; 628 } 629 630 /** 631 * Returns a har input stream which fakes end of 632 * file. It reads the index files to get the part 633 * file name and the size and start of the file. 634 */ 635 @Override 636 public FSDataInputStream open(Path f, int bufferSize) throws IOException { 637 // get the fs DataInputStream for the underlying file 638 HarStatus hstatus = getFileHarStatus(f); 639 // we got it.. woo hooo!!! 640 if (hstatus.isDir()) { 641 throw new FileNotFoundException(f + " : not a file in " + 642 archivePath); 643 } 644 return new HarFSDataInputStream(fs, new Path(archivePath, 645 hstatus.getPartName()), 646 hstatus.getStartIndex(), hstatus.getLength(), bufferSize); 647 } 648 649 /* 650 * create throws an exception in Har filesystem. 651 * The archive once created cannot be changed. 652 */ 653 public FSDataOutputStream create(Path f, int bufferSize) 654 throws IOException { 655 throw new IOException("Har: Create not allowed"); 656 } 657 658 public FSDataOutputStream create(Path f, 659 FsPermission permission, 660 boolean overwrite, 661 int bufferSize, 662 short replication, 663 long blockSize, 664 Progressable progress) throws IOException { 665 throw new IOException("Har: create not allowed."); 666 } 667 668 @Override 669 public void close() throws IOException { 670 if (fs != null) { 671 try { 672 fs.close(); 673 } catch(IOException ie) { 674 //this might already be closed 675 // ignore 676 } 677 } 678 } 679 680 /** 681 * Not implemented. 682 */ 683 @Override 684 public boolean setReplication(Path src, short replication) throws IOException{ 685 throw new IOException("Har: setreplication not allowed"); 686 } 687 688 /** 689 * Not implemented. 690 */ 691 @Override 692 public boolean delete(Path f, boolean recursive) throws IOException { 693 throw new IOException("Har: delete not allowed"); 694 } 695 696 /** 697 * liststatus returns the children of a directory 698 * after looking up the index files. 699 */ 700 @Override 701 public FileStatus[] listStatus(Path f) throws IOException { 702 //need to see if the file is an index in file 703 //get the filestatus of the archive directory 704 // we will create fake filestatuses to return 705 // to the client 706 List<FileStatus> statuses = new ArrayList<FileStatus>(); 707 Path tmpPath = makeQualified(f); 708 Path harPath = getPathInHar(tmpPath); 709 HarStatus hstatus = metadata.archive.get(harPath); 710 if (hstatus == null) { 711 throw new FileNotFoundException("File " + f + " not found in " + archivePath); 712 } 713 if (hstatus.isDir()) { 714 fileStatusesInIndex(hstatus, statuses, hstatus.children); 715 } else { 716 statuses.add(toFileStatus(hstatus, null)); 717 } 718 719 return statuses.toArray(new FileStatus[statuses.size()]); 720 } 721 722 /** 723 * return the top level archive path. 724 */ 725 public Path getHomeDirectory() { 726 return new Path(uri.toString()); 727 } 728 729 public void setWorkingDirectory(Path newDir) { 730 //does nothing. 731 } 732 733 /** 734 * not implemented. 735 */ 736 public boolean mkdirs(Path f, FsPermission permission) throws IOException { 737 throw new IOException("Har: mkdirs not allowed"); 738 } 739 740 /** 741 * not implemented. 742 */ 743 public void copyFromLocalFile(boolean delSrc, Path src, Path dst) throws 744 IOException { 745 throw new IOException("Har: copyfromlocalfile not allowed"); 746 } 747 748 /** 749 * copies the file in the har filesystem to a local file. 750 */ 751 public void copyToLocalFile(boolean delSrc, Path src, Path dst) 752 throws IOException { 753 FileUtil.copy(this, src, getLocal(getConf()), dst, false, getConf()); 754 } 755 756 /** 757 * not implemented. 758 */ 759 public Path startLocalOutput(Path fsOutputFile, Path tmpLocalFile) 760 throws IOException { 761 throw new IOException("Har: startLocalOutput not allowed"); 762 } 763 764 /** 765 * not implemented. 766 */ 767 public void completeLocalOutput(Path fsOutputFile, Path tmpLocalFile) 768 throws IOException { 769 throw new IOException("Har: completeLocalOutput not allowed"); 770 } 771 772 /** 773 * not implemented. 774 */ 775 public void setOwner(Path p, String username, String groupname) 776 throws IOException { 777 throw new IOException("Har: setowner not allowed"); 778 } 779 780 /** 781 * Not implemented. 782 */ 783 public void setPermission(Path p, FsPermission permisssion) 784 throws IOException { 785 throw new IOException("Har: setPermission not allowed"); 786 } 787 788 /** 789 * Hadoop archives input stream. This input stream fakes EOF 790 * since archive files are part of bigger part files. 791 */ 792 private static class HarFSDataInputStream extends FSDataInputStream { 793 /** 794 * Create an input stream that fakes all the reads/positions/seeking. 795 */ 796 private static class HarFsInputStream extends FSInputStream { 797 private long position, start, end; 798 //The underlying data input stream that the 799 // underlying filesystem will return. 800 private FSDataInputStream underLyingStream; 801 //one byte buffer 802 private byte[] oneBytebuff = new byte[1]; 803 HarFsInputStream(FileSystem fs, Path path, long start, 804 long length, int bufferSize) throws IOException { 805 underLyingStream = fs.open(path, bufferSize); 806 underLyingStream.seek(start); 807 // the start of this file in the part file 808 this.start = start; 809 // the position pointer in the part file 810 this.position = start; 811 // the end pointer in the part file 812 this.end = start + length; 813 } 814 815 public synchronized int available() throws IOException { 816 long remaining = end - underLyingStream.getPos(); 817 if (remaining > (long)Integer.MAX_VALUE) { 818 return Integer.MAX_VALUE; 819 } 820 return (int) remaining; 821 } 822 823 public synchronized void close() throws IOException { 824 underLyingStream.close(); 825 super.close(); 826 } 827 828 //not implemented 829 @Override 830 public void mark(int readLimit) { 831 // do nothing 832 } 833 834 /** 835 * reset is not implemented 836 */ 837 public void reset() throws IOException { 838 throw new IOException("reset not implemented."); 839 } 840 841 public synchronized int read() throws IOException { 842 int ret = read(oneBytebuff, 0, 1); 843 return (ret <= 0) ? -1: (oneBytebuff[0] & 0xff); 844 } 845 846 public synchronized int read(byte[] b) throws IOException { 847 int ret = read(b, 0, b.length); 848 if (ret != -1) { 849 position += ret; 850 } 851 return ret; 852 } 853 854 /** 855 * 856 */ 857 public synchronized int read(byte[] b, int offset, int len) 858 throws IOException { 859 int newlen = len; 860 int ret = -1; 861 if (position + len > end) { 862 newlen = (int) (end - position); 863 } 864 // end case 865 if (newlen == 0) 866 return ret; 867 ret = underLyingStream.read(b, offset, newlen); 868 position += ret; 869 return ret; 870 } 871 872 public synchronized long skip(long n) throws IOException { 873 long tmpN = n; 874 if (tmpN > 0) { 875 if (position + tmpN > end) { 876 tmpN = end - position; 877 } 878 underLyingStream.seek(tmpN + position); 879 position += tmpN; 880 return tmpN; 881 } 882 return (tmpN < 0)? -1 : 0; 883 } 884 885 public synchronized long getPos() throws IOException { 886 return (position - start); 887 } 888 889 public synchronized void seek(long pos) throws IOException { 890 if (pos < 0 || (start + pos > end)) { 891 throw new IOException("Failed to seek: EOF"); 892 } 893 position = start + pos; 894 underLyingStream.seek(position); 895 } 896 897 public boolean seekToNewSource(long targetPos) throws IOException { 898 //do not need to implement this 899 // hdfs in itself does seektonewsource 900 // while reading. 901 return false; 902 } 903 904 /** 905 * implementing position readable. 906 */ 907 public int read(long pos, byte[] b, int offset, int length) 908 throws IOException { 909 int nlength = length; 910 if (start + nlength + pos > end) { 911 nlength = (int) (end - (start + pos)); 912 } 913 return underLyingStream.read(pos + start , b, offset, nlength); 914 } 915 916 /** 917 * position readable again. 918 */ 919 public void readFully(long pos, byte[] b, int offset, int length) 920 throws IOException { 921 if (start + length + pos > end) { 922 throw new IOException("Not enough bytes to read."); 923 } 924 underLyingStream.readFully(pos + start, b, offset, length); 925 } 926 927 public void readFully(long pos, byte[] b) throws IOException { 928 readFully(pos, b, 0, b.length); 929 } 930 931 } 932 933 /** 934 * constructors for har input stream. 935 * @param fs the underlying filesystem 936 * @param p The path in the underlying filesystem 937 * @param start the start position in the part file 938 * @param length the length of valid data in the part file 939 * @param bufsize the buffer size 940 * @throws IOException 941 */ 942 public HarFSDataInputStream(FileSystem fs, Path p, long start, 943 long length, int bufsize) throws IOException { 944 super(new HarFsInputStream(fs, p, start, length, bufsize)); 945 } 946 947 /** 948 * constructor for har input stream. 949 * @param fs the underlying filesystem 950 * @param p the path in the underlying file system 951 * @param start the start position in the part file 952 * @param length the length of valid data in the part file. 953 * @throws IOException 954 */ 955 public HarFSDataInputStream(FileSystem fs, Path p, long start, long length) 956 throws IOException { 957 super(new HarFsInputStream(fs, p, start, length, 0)); 958 } 959 } 960 961 private class HarMetaData { 962 private FileSystem fs; 963 private int version; 964 // the masterIndex of the archive 965 private Path masterIndexPath; 966 // the index file 967 private Path archiveIndexPath; 968 969 private long masterIndexTimestamp; 970 private long archiveIndexTimestamp; 971 972 List<Store> stores = new ArrayList<Store>(); 973 Map<Path, HarStatus> archive = new HashMap<Path, HarStatus>(); 974 private Map<Path, FileStatus> partFileStatuses = new HashMap<Path, FileStatus>(); 975 976 public HarMetaData(FileSystem fs, Path masterIndexPath, Path archiveIndexPath) { 977 this.fs = fs; 978 this.masterIndexPath = masterIndexPath; 979 this.archiveIndexPath = archiveIndexPath; 980 } 981 982 public FileStatus getPartFileStatus(Path partPath) throws IOException { 983 FileStatus status; 984 status = partFileStatuses.get(partPath); 985 if (status == null) { 986 status = fs.getFileStatus(partPath); 987 partFileStatuses.put(partPath, status); 988 } 989 return status; 990 } 991 992 public long getMasterIndexTimestamp() { 993 return masterIndexTimestamp; 994 } 995 996 public long getArchiveIndexTimestamp() { 997 return archiveIndexTimestamp; 998 } 999 1000 private int getVersion() { 1001 return version; 1002 } 1003 1004 private void parseMetaData() throws IOException { 1005 FSDataInputStream in = fs.open(masterIndexPath); 1006 FileStatus masterStat = fs.getFileStatus(masterIndexPath); 1007 masterIndexTimestamp = masterStat.getModificationTime(); 1008 LineReader lin = new LineReader(in, getConf()); 1009 Text line = new Text(); 1010 long read = lin.readLine(line); 1011 1012 // the first line contains the version of the index file 1013 String versionLine = line.toString(); 1014 String[] arr = versionLine.split(" "); 1015 version = Integer.parseInt(arr[0]); 1016 // make it always backwards-compatible 1017 if (this.version > HarFileSystem.VERSION) { 1018 throw new IOException("Invalid version " + 1019 this.version + " expected " + HarFileSystem.VERSION); 1020 } 1021 1022 // each line contains a hashcode range and the index file name 1023 String[] readStr = null; 1024 while(read < masterStat.getLen()) { 1025 int b = lin.readLine(line); 1026 read += b; 1027 readStr = line.toString().split(" "); 1028 int startHash = Integer.parseInt(readStr[0]); 1029 int endHash = Integer.parseInt(readStr[1]); 1030 stores.add(new Store(Long.parseLong(readStr[2]), 1031 Long.parseLong(readStr[3]), startHash, 1032 endHash)); 1033 line.clear(); 1034 } 1035 try { 1036 // close the master index 1037 lin.close(); 1038 } catch(IOException io){ 1039 // do nothing just a read. 1040 } 1041 1042 FSDataInputStream aIn = fs.open(archiveIndexPath); 1043 FileStatus archiveStat = fs.getFileStatus(archiveIndexPath); 1044 archiveIndexTimestamp = archiveStat.getModificationTime(); 1045 LineReader aLin; 1046 String retStr = null; 1047 // now start reading the real index file 1048 for (Store s: stores) { 1049 read = 0; 1050 aIn.seek(s.begin); 1051 aLin = new LineReader(aIn, getConf()); 1052 while (read + s.begin < s.end) { 1053 int tmp = aLin.readLine(line); 1054 read += tmp; 1055 String lineFeed = line.toString(); 1056 String[] parsed = lineFeed.split(" "); 1057 parsed[0] = decodeFileName(parsed[0]); 1058 archive.put(new Path(parsed[0]), new HarStatus(lineFeed)); 1059 line.clear(); 1060 } 1061 } 1062 try { 1063 // close the archive index 1064 aIn.close(); 1065 } catch(IOException io) { 1066 // do nothing just a read. 1067 } 1068 } 1069 } 1070 }