001 /** 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018 package org.apache.hadoop.fs; 019 020 import java.io.FileNotFoundException; 021 import java.io.IOException; 022 import java.io.UnsupportedEncodingException; 023 import java.net.URI; 024 import java.net.URISyntaxException; 025 import java.net.URLDecoder; 026 import java.util.ArrayList; 027 import java.util.List; 028 import java.util.Map; 029 import java.util.TreeMap; 030 import java.util.HashMap; 031 import java.util.concurrent.ConcurrentHashMap; 032 033 import org.apache.commons.logging.Log; 034 import org.apache.commons.logging.LogFactory; 035 import org.apache.hadoop.conf.Configuration; 036 import org.apache.hadoop.fs.permission.FsPermission; 037 import org.apache.hadoop.io.IOUtils; 038 import org.apache.hadoop.io.Text; 039 import org.apache.hadoop.util.LineReader; 040 import org.apache.hadoop.util.Progressable; 041 042 /** 043 * This is an implementation of the Hadoop Archive 044 * Filesystem. This archive Filesystem has index files 045 * of the form _index* and has contents of the form 046 * part-*. The index files store the indexes of the 047 * real files. The index files are of the form _masterindex 048 * and _index. The master index is a level of indirection 049 * in to the index file to make the look ups faster. the index 050 * file is sorted with hash code of the paths that it contains 051 * and the master index contains pointers to the positions in 052 * index for ranges of hashcodes. 053 */ 054 055 public class HarFileSystem extends FilterFileSystem { 056 057 private static final Log LOG = LogFactory.getLog(HarFileSystem.class); 058 059 public static final int VERSION = 3; 060 061 private static final Map<URI, HarMetaData> harMetaCache = 062 new ConcurrentHashMap<URI, HarMetaData>(); 063 064 // uri representation of this Har filesystem 065 private URI uri; 066 // the top level path of the archive 067 // in the underlying file system 068 private Path archivePath; 069 // the har auth 070 private String harAuth; 071 072 // pointer into the static metadata cache 073 private HarMetaData metadata; 074 075 /** 076 * public construction of harfilesystem 077 * 078 */ 079 public HarFileSystem() { 080 } 081 082 /** 083 * Constructor to create a HarFileSystem with an 084 * underlying filesystem. 085 * @param fs 086 */ 087 public HarFileSystem(FileSystem fs) { 088 super(fs); 089 } 090 091 /** 092 * Initialize a Har filesystem per har archive. The 093 * archive home directory is the top level directory 094 * in the filesystem that contains the HAR archive. 095 * Be careful with this method, you do not want to go 096 * on creating new Filesystem instances per call to 097 * path.getFileSystem(). 098 * the uri of Har is 099 * har://underlyingfsscheme-host:port/archivepath. 100 * or 101 * har:///archivepath. This assumes the underlying filesystem 102 * to be used in case not specified. 103 */ 104 public void initialize(URI name, Configuration conf) throws IOException { 105 // decode the name 106 URI underLyingURI = decodeHarURI(name, conf); 107 // we got the right har Path- now check if this is 108 // truly a har filesystem 109 Path harPath = archivePath( 110 new Path(name.getScheme(), name.getAuthority(), name.getPath())); 111 if (harPath == null) { 112 throw new IOException("Invalid path for the Har Filesystem. " + 113 name.toString()); 114 } 115 if (fs == null) { 116 fs = FileSystem.get(underLyingURI, conf); 117 } 118 uri = harPath.toUri(); 119 archivePath = new Path(uri.getPath()); 120 harAuth = getHarAuth(underLyingURI); 121 //check for the underlying fs containing 122 // the index file 123 Path masterIndexPath = new Path(archivePath, "_masterindex"); 124 Path archiveIndexPath = new Path(archivePath, "_index"); 125 if (!fs.exists(masterIndexPath) || !fs.exists(archiveIndexPath)) { 126 throw new IOException("Invalid path for the Har Filesystem. " + 127 "No index file in " + harPath); 128 } 129 130 metadata = harMetaCache.get(uri); 131 if (metadata != null) { 132 FileStatus mStat = fs.getFileStatus(masterIndexPath); 133 FileStatus aStat = fs.getFileStatus(archiveIndexPath); 134 if (mStat.getModificationTime() != metadata.getMasterIndexTimestamp() || 135 aStat.getModificationTime() != metadata.getArchiveIndexTimestamp()) { 136 // the archive has been overwritten since we last read it 137 // remove the entry from the meta data cache 138 metadata = null; 139 harMetaCache.remove(uri); 140 } 141 } 142 if (metadata == null) { 143 metadata = new HarMetaData(fs, masterIndexPath, archiveIndexPath); 144 metadata.parseMetaData(); 145 harMetaCache.put(uri, metadata); 146 } 147 } 148 149 // get the version of the filesystem from the masterindex file 150 // the version is currently not useful since its the first version 151 // of archives 152 public int getHarVersion() throws IOException { 153 if (metadata != null) { 154 return metadata.getVersion(); 155 } 156 else { 157 throw new IOException("Invalid meta data for the Har Filesystem"); 158 } 159 } 160 161 /* 162 * find the parent path that is the 163 * archive path in the path. The last 164 * path segment that ends with .har is 165 * the path that will be returned. 166 */ 167 private Path archivePath(Path p) { 168 Path retPath = null; 169 Path tmp = p; 170 for (int i=0; i< p.depth(); i++) { 171 if (tmp.toString().endsWith(".har")) { 172 retPath = tmp; 173 break; 174 } 175 tmp = tmp.getParent(); 176 } 177 return retPath; 178 } 179 180 /** 181 * decode the raw URI to get the underlying URI 182 * @param rawURI raw Har URI 183 * @return filtered URI of the underlying fileSystem 184 */ 185 private URI decodeHarURI(URI rawURI, Configuration conf) throws IOException { 186 String tmpAuth = rawURI.getAuthority(); 187 //we are using the default file 188 //system in the config 189 //so create a underlying uri and 190 //return it 191 if (tmpAuth == null) { 192 //create a path 193 return FileSystem.getDefaultUri(conf); 194 } 195 String host = rawURI.getHost(); 196 if (host == null) { 197 throw new IOException("URI: " + rawURI 198 + " is an invalid Har URI since host==null." 199 + " Expecting har://<scheme>-<host>/<path>."); 200 } 201 int i = host.indexOf('-'); 202 if (i < 0) { 203 throw new IOException("URI: " + rawURI 204 + " is an invalid Har URI since '-' not found." 205 + " Expecting har://<scheme>-<host>/<path>."); 206 } 207 final String underLyingScheme = host.substring(0, i); 208 i++; 209 final String underLyingHost = i == host.length()? null: host.substring(i); 210 int underLyingPort = rawURI.getPort(); 211 String auth = (underLyingHost == null && underLyingPort == -1)? 212 null:(underLyingHost+":"+underLyingPort); 213 URI tmp = null; 214 if (rawURI.getQuery() != null) { 215 // query component not allowed 216 throw new IOException("query component in Path not supported " + rawURI); 217 } 218 try { 219 tmp = new URI(underLyingScheme, auth, rawURI.getPath(), 220 rawURI.getQuery(), rawURI.getFragment()); 221 } catch (URISyntaxException e) { 222 // do nothing should not happen 223 } 224 return tmp; 225 } 226 227 private static String decodeString(String str) 228 throws UnsupportedEncodingException { 229 return URLDecoder.decode(str, "UTF-8"); 230 } 231 232 private String decodeFileName(String fname) 233 throws UnsupportedEncodingException { 234 int version = metadata.getVersion(); 235 if (version == 2 || version == 3){ 236 return decodeString(fname); 237 } 238 return fname; 239 } 240 241 /** 242 * return the top level archive. 243 */ 244 public Path getWorkingDirectory() { 245 return new Path(uri.toString()); 246 } 247 248 /** 249 * Create a har specific auth 250 * har-underlyingfs:port 251 * @param underLyingURI the uri of underlying 252 * filesystem 253 * @return har specific auth 254 */ 255 private String getHarAuth(URI underLyingUri) { 256 String auth = underLyingUri.getScheme() + "-"; 257 if (underLyingUri.getHost() != null) { 258 auth += underLyingUri.getHost() + ":"; 259 if (underLyingUri.getPort() != -1) { 260 auth += underLyingUri.getPort(); 261 } 262 } 263 else { 264 auth += ":"; 265 } 266 return auth; 267 } 268 269 /** 270 * Returns the uri of this filesystem. 271 * The uri is of the form 272 * har://underlyingfsschema-host:port/pathintheunderlyingfs 273 */ 274 @Override 275 public URI getUri() { 276 return this.uri; 277 } 278 279 /** 280 * this method returns the path 281 * inside the har filesystem. 282 * this is relative path inside 283 * the har filesystem. 284 * @param path the fully qualified path in the har filesystem. 285 * @return relative path in the filesystem. 286 */ 287 private Path getPathInHar(Path path) { 288 Path harPath = new Path(path.toUri().getPath()); 289 if (archivePath.compareTo(harPath) == 0) 290 return new Path(Path.SEPARATOR); 291 Path tmp = new Path(harPath.getName()); 292 Path parent = harPath.getParent(); 293 while (!(parent.compareTo(archivePath) == 0)) { 294 if (parent.toString().equals(Path.SEPARATOR)) { 295 tmp = null; 296 break; 297 } 298 tmp = new Path(parent.getName(), tmp); 299 parent = parent.getParent(); 300 } 301 if (tmp != null) 302 tmp = new Path(Path.SEPARATOR, tmp); 303 return tmp; 304 } 305 306 //the relative path of p. basically 307 // getting rid of /. Parsing and doing 308 // string manipulation is not good - so 309 // just use the path api to do it. 310 private Path makeRelative(String initial, Path p) { 311 String scheme = this.uri.getScheme(); 312 String authority = this.uri.getAuthority(); 313 Path root = new Path(Path.SEPARATOR); 314 if (root.compareTo(p) == 0) 315 return new Path(scheme, authority, initial); 316 Path retPath = new Path(p.getName()); 317 Path parent = p.getParent(); 318 for (int i=0; i < p.depth()-1; i++) { 319 retPath = new Path(parent.getName(), retPath); 320 parent = parent.getParent(); 321 } 322 return new Path(new Path(scheme, authority, initial), 323 retPath.toString()); 324 } 325 326 /* this makes a path qualified in the har filesystem 327 * (non-Javadoc) 328 * @see org.apache.hadoop.fs.FilterFileSystem#makeQualified( 329 * org.apache.hadoop.fs.Path) 330 */ 331 @Override 332 public Path makeQualified(Path path) { 333 // make sure that we just get the 334 // path component 335 Path fsPath = path; 336 if (!path.isAbsolute()) { 337 fsPath = new Path(archivePath, path); 338 } 339 340 URI tmpURI = fsPath.toUri(); 341 //change this to Har uri 342 return new Path(uri.getScheme(), harAuth, tmpURI.getPath()); 343 } 344 345 /** 346 * Fix offset and length of block locations. 347 * Note that this method modifies the original array. 348 * @param locations block locations of har part file 349 * @param start the start of the desired range in the contained file 350 * @param len the length of the desired range 351 * @param fileOffsetInHar the offset of the desired file in the har part file 352 * @return block locations with fixed offset and length 353 */ 354 static BlockLocation[] fixBlockLocations(BlockLocation[] locations, 355 long start, 356 long len, 357 long fileOffsetInHar) { 358 // offset 1 past last byte of desired range 359 long end = start + len; 360 361 for (BlockLocation location : locations) { 362 // offset of part block relative to beginning of desired file 363 // (may be negative if file starts in this part block) 364 long harBlockStart = location.getOffset() - fileOffsetInHar; 365 // offset 1 past last byte of har block relative to beginning of 366 // desired file 367 long harBlockEnd = harBlockStart + location.getLength(); 368 369 if (start > harBlockStart) { 370 // desired range starts after beginning of this har block 371 // fix offset to beginning of relevant range (relative to desired file) 372 location.setOffset(start); 373 // fix length to relevant portion of har block 374 location.setLength(location.getLength() - (start - harBlockStart)); 375 } else { 376 // desired range includes beginning of this har block 377 location.setOffset(harBlockStart); 378 } 379 380 if (harBlockEnd > end) { 381 // range ends before end of this har block 382 // fix length to remove irrelevant portion at the end 383 location.setLength(location.getLength() - (harBlockEnd - end)); 384 } 385 } 386 387 return locations; 388 } 389 390 /** 391 * Get block locations from the underlying fs and fix their 392 * offsets and lengths. 393 * @param file the input filestatus to get block locations 394 * @param start the start of the desired range in the contained file 395 * @param len the length of the desired range 396 * @return block locations for this segment of file 397 * @throws IOException 398 */ 399 @Override 400 public BlockLocation[] getFileBlockLocations(FileStatus file, long start, 401 long len) throws IOException { 402 HarStatus hstatus = getFileHarStatus(file.getPath()); 403 Path partPath = new Path(archivePath, hstatus.getPartName()); 404 FileStatus partStatus = metadata.getPartFileStatus(partPath); 405 406 // get all part blocks that overlap with the desired file blocks 407 BlockLocation[] locations = 408 fs.getFileBlockLocations(partStatus, 409 hstatus.getStartIndex() + start, len); 410 411 return fixBlockLocations(locations, start, len, hstatus.getStartIndex()); 412 } 413 414 /** 415 * the hash of the path p inside iniside 416 * the filesystem 417 * @param p the path in the harfilesystem 418 * @return the hash code of the path. 419 */ 420 public static int getHarHash(Path p) { 421 return (p.toString().hashCode() & 0x7fffffff); 422 } 423 424 static class Store { 425 public Store() { 426 begin = end = startHash = endHash = 0; 427 } 428 public Store(long begin, long end, int startHash, int endHash) { 429 this.begin = begin; 430 this.end = end; 431 this.startHash = startHash; 432 this.endHash = endHash; 433 } 434 public long begin; 435 public long end; 436 public int startHash; 437 public int endHash; 438 } 439 440 /** 441 * Get filestatuses of all the children of a given directory. This just reads 442 * through index file and reads line by line to get all statuses for children 443 * of a directory. Its a brute force way of getting all such filestatuses 444 * 445 * @param parent 446 * the parent path directory 447 * @param statuses 448 * the list to add the children filestatuses to 449 * @param children 450 * the string list of children for this parent 451 * @param archiveIndexStat 452 * the archive index filestatus 453 */ 454 private void fileStatusesInIndex(HarStatus parent, List<FileStatus> statuses, 455 List<String> children) throws IOException { 456 String parentString = parent.getName(); 457 if (!parentString.endsWith(Path.SEPARATOR)){ 458 parentString += Path.SEPARATOR; 459 } 460 Path harPath = new Path(parentString); 461 int harlen = harPath.depth(); 462 final Map<String, FileStatus> cache = new TreeMap<String, FileStatus>(); 463 464 for (HarStatus hstatus : metadata.archive.values()) { 465 String child = hstatus.getName(); 466 if ((child.startsWith(parentString))) { 467 Path thisPath = new Path(child); 468 if (thisPath.depth() == harlen + 1) { 469 statuses.add(toFileStatus(hstatus, cache)); 470 } 471 } 472 } 473 } 474 475 /** 476 * Combine the status stored in the index and the underlying status. 477 * @param h status stored in the index 478 * @param cache caching the underlying file statuses 479 * @return the combined file status 480 * @throws IOException 481 */ 482 private FileStatus toFileStatus(HarStatus h, 483 Map<String, FileStatus> cache) throws IOException { 484 FileStatus underlying = null; 485 if (cache != null) { 486 underlying = cache.get(h.partName); 487 } 488 if (underlying == null) { 489 final Path p = h.isDir? archivePath: new Path(archivePath, h.partName); 490 underlying = fs.getFileStatus(p); 491 if (cache != null) { 492 cache.put(h.partName, underlying); 493 } 494 } 495 496 long modTime = 0; 497 int version = metadata.getVersion(); 498 if (version < 3) { 499 modTime = underlying.getModificationTime(); 500 } else if (version == 3) { 501 modTime = h.getModificationTime(); 502 } 503 504 return new FileStatus( 505 h.isDir()? 0L: h.getLength(), 506 h.isDir(), 507 underlying.getReplication(), 508 underlying.getBlockSize(), 509 modTime, 510 underlying.getAccessTime(), 511 underlying.getPermission(), 512 underlying.getOwner(), 513 underlying.getGroup(), 514 makeRelative(this.uri.getPath(), new Path(h.name))); 515 } 516 517 // a single line parser for hadoop archives status 518 // stored in a single line in the index files 519 // the format is of the form 520 // filename "dir"/"file" partFileName startIndex length 521 // <space seperated children> 522 private class HarStatus { 523 boolean isDir; 524 String name; 525 List<String> children; 526 String partName; 527 long startIndex; 528 long length; 529 long modificationTime = 0; 530 531 public HarStatus(String harString) throws UnsupportedEncodingException { 532 String[] splits = harString.split(" "); 533 this.name = decodeFileName(splits[0]); 534 this.isDir = "dir".equals(splits[1]) ? true: false; 535 // this is equal to "none" if its a directory 536 this.partName = splits[2]; 537 this.startIndex = Long.parseLong(splits[3]); 538 this.length = Long.parseLong(splits[4]); 539 540 int version = metadata.getVersion(); 541 String[] propSplits = null; 542 // propSplits is used to retrieve the metainformation that Har versions 543 // 1 & 2 missed (modification time, permission, owner group). 544 // These fields are stored in an encoded string placed in different 545 // locations depending on whether it's a file or directory entry. 546 // If it's a directory, the string will be placed at the partName 547 // location (directories have no partName because they don't have data 548 // to be stored). This is done because the number of fields in a 549 // directory entry is unbounded (all children are listed at the end) 550 // If it's a file, the string will be the last field. 551 if (isDir) { 552 if (version == 3){ 553 propSplits = decodeString(this.partName).split(" "); 554 } 555 children = new ArrayList<String>(); 556 for (int i = 5; i < splits.length; i++) { 557 children.add(decodeFileName(splits[i])); 558 } 559 } else if (version == 3) { 560 propSplits = decodeString(splits[5]).split(" "); 561 } 562 563 if (propSplits != null && propSplits.length >= 4) { 564 modificationTime = Long.parseLong(propSplits[0]); 565 // the fields below are stored in the file but are currently not used 566 // by HarFileSystem 567 // permission = new FsPermission(Short.parseShort(propSplits[1])); 568 // owner = decodeString(propSplits[2]); 569 // group = decodeString(propSplits[3]); 570 } 571 } 572 public boolean isDir() { 573 return isDir; 574 } 575 576 public String getName() { 577 return name; 578 } 579 public String getPartName() { 580 return partName; 581 } 582 public long getStartIndex() { 583 return startIndex; 584 } 585 public long getLength() { 586 return length; 587 } 588 public long getModificationTime() { 589 return modificationTime; 590 } 591 } 592 593 /** 594 * return the filestatus of files in har archive. 595 * The permission returned are that of the archive 596 * index files. The permissions are not persisted 597 * while creating a hadoop archive. 598 * @param f the path in har filesystem 599 * @return filestatus. 600 * @throws IOException 601 */ 602 @Override 603 public FileStatus getFileStatus(Path f) throws IOException { 604 HarStatus hstatus = getFileHarStatus(f); 605 return toFileStatus(hstatus, null); 606 } 607 608 private HarStatus getFileHarStatus(Path f) throws IOException { 609 // get the fs DataInputStream for the underlying file 610 // look up the index. 611 Path p = makeQualified(f); 612 Path harPath = getPathInHar(p); 613 if (harPath == null) { 614 throw new IOException("Invalid file name: " + f + " in " + uri); 615 } 616 HarStatus hstatus = metadata.archive.get(harPath); 617 if (hstatus == null) { 618 throw new FileNotFoundException("File: " + f + " does not exist in " + uri); 619 } 620 return hstatus; 621 } 622 623 /** 624 * @return null since no checksum algorithm is implemented. 625 */ 626 public FileChecksum getFileChecksum(Path f) { 627 return null; 628 } 629 630 /** 631 * Returns a har input stream which fakes end of 632 * file. It reads the index files to get the part 633 * file name and the size and start of the file. 634 */ 635 @Override 636 public FSDataInputStream open(Path f, int bufferSize) throws IOException { 637 // get the fs DataInputStream for the underlying file 638 HarStatus hstatus = getFileHarStatus(f); 639 // we got it.. woo hooo!!! 640 if (hstatus.isDir()) { 641 throw new FileNotFoundException(f + " : not a file in " + 642 archivePath); 643 } 644 return new HarFSDataInputStream(fs, new Path(archivePath, 645 hstatus.getPartName()), 646 hstatus.getStartIndex(), hstatus.getLength(), bufferSize); 647 } 648 649 public FSDataOutputStream create(Path f, 650 FsPermission permission, 651 boolean overwrite, 652 int bufferSize, 653 short replication, 654 long blockSize, 655 Progressable progress) throws IOException { 656 throw new IOException("Har: create not allowed."); 657 } 658 659 @Override 660 public void close() throws IOException { 661 if (fs != null) { 662 try { 663 fs.close(); 664 } catch(IOException ie) { 665 //this might already be closed 666 // ignore 667 } 668 } 669 } 670 671 /** 672 * Not implemented. 673 */ 674 @Override 675 public boolean setReplication(Path src, short replication) throws IOException{ 676 throw new IOException("Har: setreplication not allowed"); 677 } 678 679 /** 680 * Not implemented. 681 */ 682 @Override 683 public boolean delete(Path f, boolean recursive) throws IOException { 684 throw new IOException("Har: delete not allowed"); 685 } 686 687 /** 688 * liststatus returns the children of a directory 689 * after looking up the index files. 690 */ 691 @Override 692 public FileStatus[] listStatus(Path f) throws IOException { 693 //need to see if the file is an index in file 694 //get the filestatus of the archive directory 695 // we will create fake filestatuses to return 696 // to the client 697 List<FileStatus> statuses = new ArrayList<FileStatus>(); 698 Path tmpPath = makeQualified(f); 699 Path harPath = getPathInHar(tmpPath); 700 HarStatus hstatus = metadata.archive.get(harPath); 701 if (hstatus == null) { 702 throw new FileNotFoundException("File " + f + " not found in " + archivePath); 703 } 704 if (hstatus.isDir()) { 705 fileStatusesInIndex(hstatus, statuses, hstatus.children); 706 } else { 707 statuses.add(toFileStatus(hstatus, null)); 708 } 709 710 return statuses.toArray(new FileStatus[statuses.size()]); 711 } 712 713 /** 714 * return the top level archive path. 715 */ 716 public Path getHomeDirectory() { 717 return new Path(uri.toString()); 718 } 719 720 public void setWorkingDirectory(Path newDir) { 721 //does nothing. 722 } 723 724 /** 725 * not implemented. 726 */ 727 public boolean mkdirs(Path f, FsPermission permission) throws IOException { 728 throw new IOException("Har: mkdirs not allowed"); 729 } 730 731 /** 732 * not implemented. 733 */ 734 public void copyFromLocalFile(boolean delSrc, Path src, Path dst) throws 735 IOException { 736 throw new IOException("Har: copyfromlocalfile not allowed"); 737 } 738 739 /** 740 * copies the file in the har filesystem to a local file. 741 */ 742 public void copyToLocalFile(boolean delSrc, Path src, Path dst) 743 throws IOException { 744 FileUtil.copy(this, src, getLocal(getConf()), dst, false, getConf()); 745 } 746 747 /** 748 * not implemented. 749 */ 750 public Path startLocalOutput(Path fsOutputFile, Path tmpLocalFile) 751 throws IOException { 752 throw new IOException("Har: startLocalOutput not allowed"); 753 } 754 755 /** 756 * not implemented. 757 */ 758 public void completeLocalOutput(Path fsOutputFile, Path tmpLocalFile) 759 throws IOException { 760 throw new IOException("Har: completeLocalOutput not allowed"); 761 } 762 763 /** 764 * not implemented. 765 */ 766 public void setOwner(Path p, String username, String groupname) 767 throws IOException { 768 throw new IOException("Har: setowner not allowed"); 769 } 770 771 /** 772 * Not implemented. 773 */ 774 public void setPermission(Path p, FsPermission permisssion) 775 throws IOException { 776 throw new IOException("Har: setPermission not allowed"); 777 } 778 779 /** 780 * Hadoop archives input stream. This input stream fakes EOF 781 * since archive files are part of bigger part files. 782 */ 783 private static class HarFSDataInputStream extends FSDataInputStream { 784 /** 785 * Create an input stream that fakes all the reads/positions/seeking. 786 */ 787 private static class HarFsInputStream extends FSInputStream { 788 private long position, start, end; 789 //The underlying data input stream that the 790 // underlying filesystem will return. 791 private FSDataInputStream underLyingStream; 792 //one byte buffer 793 private byte[] oneBytebuff = new byte[1]; 794 HarFsInputStream(FileSystem fs, Path path, long start, 795 long length, int bufferSize) throws IOException { 796 underLyingStream = fs.open(path, bufferSize); 797 underLyingStream.seek(start); 798 // the start of this file in the part file 799 this.start = start; 800 // the position pointer in the part file 801 this.position = start; 802 // the end pointer in the part file 803 this.end = start + length; 804 } 805 806 public synchronized int available() throws IOException { 807 long remaining = end - underLyingStream.getPos(); 808 if (remaining > (long)Integer.MAX_VALUE) { 809 return Integer.MAX_VALUE; 810 } 811 return (int) remaining; 812 } 813 814 public synchronized void close() throws IOException { 815 underLyingStream.close(); 816 super.close(); 817 } 818 819 //not implemented 820 @Override 821 public void mark(int readLimit) { 822 // do nothing 823 } 824 825 /** 826 * reset is not implemented 827 */ 828 public void reset() throws IOException { 829 throw new IOException("reset not implemented."); 830 } 831 832 public synchronized int read() throws IOException { 833 int ret = read(oneBytebuff, 0, 1); 834 return (ret <= 0) ? -1: (oneBytebuff[0] & 0xff); 835 } 836 837 public synchronized int read(byte[] b) throws IOException { 838 int ret = read(b, 0, b.length); 839 if (ret != -1) { 840 position += ret; 841 } 842 return ret; 843 } 844 845 /** 846 * 847 */ 848 public synchronized int read(byte[] b, int offset, int len) 849 throws IOException { 850 int newlen = len; 851 int ret = -1; 852 if (position + len > end) { 853 newlen = (int) (end - position); 854 } 855 // end case 856 if (newlen == 0) 857 return ret; 858 ret = underLyingStream.read(b, offset, newlen); 859 position += ret; 860 return ret; 861 } 862 863 public synchronized long skip(long n) throws IOException { 864 long tmpN = n; 865 if (tmpN > 0) { 866 if (position + tmpN > end) { 867 tmpN = end - position; 868 } 869 underLyingStream.seek(tmpN + position); 870 position += tmpN; 871 return tmpN; 872 } 873 return (tmpN < 0)? -1 : 0; 874 } 875 876 public synchronized long getPos() throws IOException { 877 return (position - start); 878 } 879 880 public synchronized void seek(long pos) throws IOException { 881 if (pos < 0 || (start + pos > end)) { 882 throw new IOException("Failed to seek: EOF"); 883 } 884 position = start + pos; 885 underLyingStream.seek(position); 886 } 887 888 public boolean seekToNewSource(long targetPos) throws IOException { 889 //do not need to implement this 890 // hdfs in itself does seektonewsource 891 // while reading. 892 return false; 893 } 894 895 /** 896 * implementing position readable. 897 */ 898 public int read(long pos, byte[] b, int offset, int length) 899 throws IOException { 900 int nlength = length; 901 if (start + nlength + pos > end) { 902 nlength = (int) (end - (start + pos)); 903 } 904 return underLyingStream.read(pos + start , b, offset, nlength); 905 } 906 907 /** 908 * position readable again. 909 */ 910 public void readFully(long pos, byte[] b, int offset, int length) 911 throws IOException { 912 if (start + length + pos > end) { 913 throw new IOException("Not enough bytes to read."); 914 } 915 underLyingStream.readFully(pos + start, b, offset, length); 916 } 917 918 public void readFully(long pos, byte[] b) throws IOException { 919 readFully(pos, b, 0, b.length); 920 } 921 922 } 923 924 /** 925 * constructors for har input stream. 926 * @param fs the underlying filesystem 927 * @param p The path in the underlying filesystem 928 * @param start the start position in the part file 929 * @param length the length of valid data in the part file 930 * @param bufsize the buffer size 931 * @throws IOException 932 */ 933 public HarFSDataInputStream(FileSystem fs, Path p, long start, 934 long length, int bufsize) throws IOException { 935 super(new HarFsInputStream(fs, p, start, length, bufsize)); 936 } 937 938 /** 939 * constructor for har input stream. 940 * @param fs the underlying filesystem 941 * @param p the path in the underlying file system 942 * @param start the start position in the part file 943 * @param length the length of valid data in the part file. 944 * @throws IOException 945 */ 946 public HarFSDataInputStream(FileSystem fs, Path p, long start, long length) 947 throws IOException { 948 super(new HarFsInputStream(fs, p, start, length, 0)); 949 } 950 } 951 952 private class HarMetaData { 953 private FileSystem fs; 954 private int version; 955 // the masterIndex of the archive 956 private Path masterIndexPath; 957 // the index file 958 private Path archiveIndexPath; 959 960 private long masterIndexTimestamp; 961 private long archiveIndexTimestamp; 962 963 List<Store> stores = new ArrayList<Store>(); 964 Map<Path, HarStatus> archive = new HashMap<Path, HarStatus>(); 965 private Map<Path, FileStatus> partFileStatuses = new HashMap<Path, FileStatus>(); 966 967 public HarMetaData(FileSystem fs, Path masterIndexPath, Path archiveIndexPath) { 968 this.fs = fs; 969 this.masterIndexPath = masterIndexPath; 970 this.archiveIndexPath = archiveIndexPath; 971 } 972 973 public FileStatus getPartFileStatus(Path partPath) throws IOException { 974 FileStatus status; 975 status = partFileStatuses.get(partPath); 976 if (status == null) { 977 status = fs.getFileStatus(partPath); 978 partFileStatuses.put(partPath, status); 979 } 980 return status; 981 } 982 983 public long getMasterIndexTimestamp() { 984 return masterIndexTimestamp; 985 } 986 987 public long getArchiveIndexTimestamp() { 988 return archiveIndexTimestamp; 989 } 990 991 private int getVersion() { 992 return version; 993 } 994 995 private void parseMetaData() throws IOException { 996 Text line; 997 long read; 998 FSDataInputStream in = null; 999 LineReader lin = null; 1000 1001 try { 1002 in = fs.open(masterIndexPath); 1003 FileStatus masterStat = fs.getFileStatus(masterIndexPath); 1004 masterIndexTimestamp = masterStat.getModificationTime(); 1005 lin = new LineReader(in, getConf()); 1006 line = new Text(); 1007 read = lin.readLine(line); 1008 1009 // the first line contains the version of the index file 1010 String versionLine = line.toString(); 1011 String[] arr = versionLine.split(" "); 1012 version = Integer.parseInt(arr[0]); 1013 // make it always backwards-compatible 1014 if (this.version > HarFileSystem.VERSION) { 1015 throw new IOException("Invalid version " + 1016 this.version + " expected " + HarFileSystem.VERSION); 1017 } 1018 1019 // each line contains a hashcode range and the index file name 1020 String[] readStr = null; 1021 while(read < masterStat.getLen()) { 1022 int b = lin.readLine(line); 1023 read += b; 1024 readStr = line.toString().split(" "); 1025 int startHash = Integer.parseInt(readStr[0]); 1026 int endHash = Integer.parseInt(readStr[1]); 1027 stores.add(new Store(Long.parseLong(readStr[2]), 1028 Long.parseLong(readStr[3]), startHash, 1029 endHash)); 1030 line.clear(); 1031 } 1032 } finally { 1033 IOUtils.cleanup(LOG, lin, in); 1034 } 1035 1036 FSDataInputStream aIn = fs.open(archiveIndexPath); 1037 try { 1038 FileStatus archiveStat = fs.getFileStatus(archiveIndexPath); 1039 archiveIndexTimestamp = archiveStat.getModificationTime(); 1040 LineReader aLin; 1041 1042 // now start reading the real index file 1043 for (Store s: stores) { 1044 read = 0; 1045 aIn.seek(s.begin); 1046 aLin = new LineReader(aIn, getConf()); 1047 while (read + s.begin < s.end) { 1048 int tmp = aLin.readLine(line); 1049 read += tmp; 1050 String lineFeed = line.toString(); 1051 String[] parsed = lineFeed.split(" "); 1052 parsed[0] = decodeFileName(parsed[0]); 1053 archive.put(new Path(parsed[0]), new HarStatus(lineFeed)); 1054 line.clear(); 1055 } 1056 } 1057 } finally { 1058 IOUtils.cleanup(LOG, aIn); 1059 } 1060 } 1061 } 1062 1063 /* 1064 * testing purposes only: 1065 */ 1066 HarMetaData getMetadata() { 1067 return metadata; 1068 } 1069 }