/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.fs;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.HashMap;
import java.util.concurrent.ConcurrentHashMap;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.LineReader;
import org.apache.hadoop.util.Progressable;

/**
 * This is an implementation of the Hadoop Archive
 * filesystem. This archive filesystem has index files
 * of the form _index* and contents of the form
 * part-*. The index files store the indexes of the
 * real files. The index files are of the form _masterindex
 * and _index. The master index is a level of indirection
 * into the index file that makes lookups faster: the index
 * file is sorted by the hash code of the paths that it contains,
 * and the master index contains pointers to the positions in
 * the index for ranges of hash codes.
 */

public class HarFileSystem extends FilterFileSystem {
  public static final int VERSION = 3;

  private static final Map<URI, HarMetaData> harMetaCache =
      new ConcurrentHashMap<URI, HarMetaData>();

  // URI representation of this Har filesystem
  private URI uri;
  // the top level path of the archive
  // in the underlying file system
  private Path archivePath;
  // the har auth
  private String harAuth;

  // pointer into the static metadata cache
  private HarMetaData metadata;

  /**
   * Public constructor for HarFileSystem.
   */
  public HarFileSystem() {
  }

  /**
   * Constructor to create a HarFileSystem with an
   * underlying filesystem.
   * @param fs the underlying filesystem
   */
  public HarFileSystem(FileSystem fs) {
    super(fs);
  }

  /**
   * Initialize a Har filesystem per har archive. The
   * archive home directory is the top level directory
   * in the filesystem that contains the HAR archive.
   * Be careful with this method: you do not want to go
   * on creating new filesystem instances per call to
   * path.getFileSystem().
   * The URI of a Har filesystem is
   * har://underlyingfsscheme-host:port/archivepath
   * or
   * har:///archivepath. The latter form uses the default underlying
   * filesystem from the configuration when none is specified.
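   * For example (illustrative paths only), a file stored in an archive
   * might be addressed as
   * har://hdfs-namenode:8020/user/foo/bar.har/dir/file or, using the
   * default filesystem, as har:///user/foo/bar.har/dir/file.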
   */
  public void initialize(URI name, Configuration conf) throws IOException {
    // decode the name
    URI underLyingURI = decodeHarURI(name, conf);
    // we got the right har Path - now check if this is
    // truly a har filesystem
    Path harPath = archivePath(
        new Path(name.getScheme(), name.getAuthority(), name.getPath()));
    if (harPath == null) {
      throw new IOException("Invalid path for the Har Filesystem. " +
          name.toString());
    }
    if (fs == null) {
      fs = FileSystem.get(underLyingURI, conf);
    }
    uri = harPath.toUri();
    archivePath = new Path(uri.getPath());
    harAuth = getHarAuth(underLyingURI);
    // check that the underlying fs contains
    // the index files
    Path masterIndexPath = new Path(archivePath, "_masterindex");
    Path archiveIndexPath = new Path(archivePath, "_index");
    if (!fs.exists(masterIndexPath) || !fs.exists(archiveIndexPath)) {
      throw new IOException("Invalid path for the Har Filesystem. " +
          "No index file in " + harPath);
    }

    metadata = harMetaCache.get(uri);
    if (metadata != null) {
      FileStatus mStat = fs.getFileStatus(masterIndexPath);
      FileStatus aStat = fs.getFileStatus(archiveIndexPath);
      if (mStat.getModificationTime() != metadata.getMasterIndexTimestamp() ||
          aStat.getModificationTime() != metadata.getArchiveIndexTimestamp()) {
        // the archive has been overwritten since we last read it
        // remove the entry from the metadata cache
        metadata = null;
        harMetaCache.remove(uri);
      }
    }
    if (metadata == null) {
      metadata = new HarMetaData(fs, masterIndexPath, archiveIndexPath);
      metadata.parseMetaData();
      harMetaCache.put(uri, metadata);
    }
  }

  // get the version of the filesystem from the masterindex file
  // the version is currently not useful since it is the first version
  // of archives
  public int getHarVersion() throws IOException {
    if (metadata != null) {
      return metadata.getVersion();
    } else {
      throw new IOException("Invalid meta data for the Har Filesystem");
    }
  }

  /*
   * Find the parent path that is the
   * archive path in the path. The last
   * path segment that ends with .har is
   * the path that will be returned.
   */
  private Path archivePath(Path p) {
    Path retPath = null;
    Path tmp = p;
    for (int i = 0; i < p.depth(); i++) {
      if (tmp.toString().endsWith(".har")) {
        retPath = tmp;
        break;
      }
      tmp = tmp.getParent();
    }
    return retPath;
  }

  /**
   * Decode the raw URI to get the underlying URI.
   * @param rawURI raw Har URI
   * @param conf the configuration used to look up the default filesystem
   * @return filtered URI of the underlying fileSystem
   */
  private URI decodeHarURI(URI rawURI, Configuration conf) throws IOException {
    String tmpAuth = rawURI.getAuthority();
    // we are using the default filesystem
    // in the config, so create an underlying
    // URI and return it
    if (tmpAuth == null) {
      // create a path
      return FileSystem.getDefaultUri(conf);
    }
    String host = rawURI.getHost();
    if (host == null) {
      throw new IOException("URI: " + rawURI
          + " is an invalid Har URI since host==null."
          + " Expecting har://<scheme>-<host>/<path>.");
    }
    int i = host.indexOf('-');
    if (i < 0) {
      throw new IOException("URI: " + rawURI
          + " is an invalid Har URI since '-' not found."
          + " Expecting har://<scheme>-<host>/<path>.");
    }
    final String underLyingScheme = host.substring(0, i);
    i++;
    final String underLyingHost = i == host.length() ? null : host.substring(i);
    int underLyingPort = rawURI.getPort();
    String auth = (underLyingHost == null && underLyingPort == -1) ?
        null : (underLyingHost + ":" + underLyingPort);
    URI tmp = null;
    if (rawURI.getQuery() != null) {
      // query component not allowed
      throw new IOException("query component in Path not supported " + rawURI);
    }
    try {
      tmp = new URI(underLyingScheme, auth, rawURI.getPath(),
          rawURI.getQuery(), rawURI.getFragment());
    } catch (URISyntaxException e) {
      // do nothing; should not happen
    }
    return tmp;
  }

  private static String decodeString(String str)
      throws UnsupportedEncodingException {
    return URLDecoder.decode(str, "UTF-8");
  }

  private String decodeFileName(String fname)
      throws UnsupportedEncodingException {
    int version = metadata.getVersion();
    if (version == 2 || version == 3) {
      return decodeString(fname);
    }
    return fname;
  }

  /**
   * Return the top level archive.
   */
  public Path getWorkingDirectory() {
    return new Path(uri.toString());
  }

  /**
   * Create a har specific auth of the form
   * underlyingscheme-host:port.
   * @param underLyingUri the URI of the underlying
   * filesystem
   * @return har specific auth
   */
  private String getHarAuth(URI underLyingUri) {
    String auth = underLyingUri.getScheme() + "-";
    if (underLyingUri.getHost() != null) {
      auth += underLyingUri.getHost() + ":";
      if (underLyingUri.getPort() != -1) {
        auth += underLyingUri.getPort();
      }
    } else {
      auth += ":";
    }
    return auth;
  }

  /**
   * Returns the URI of this filesystem.
   * The URI is of the form
   * har://underlyingfsscheme-host:port/pathintheunderlyingfs
   */
  @Override
  public URI getUri() {
    return this.uri;
  }

  /**
   * This method returns the path inside the har filesystem,
   * i.e. the path relative to the archive root.
   * @param path the fully qualified path in the har filesystem.
   * @return relative path in the filesystem.
   */
  private Path getPathInHar(Path path) {
    Path harPath = new Path(path.toUri().getPath());
    if (archivePath.compareTo(harPath) == 0)
      return new Path(Path.SEPARATOR);
    Path tmp = new Path(harPath.getName());
    Path parent = harPath.getParent();
    while (!(parent.compareTo(archivePath) == 0)) {
      if (parent.toString().equals(Path.SEPARATOR)) {
        tmp = null;
        break;
      }
      tmp = new Path(parent.getName(), tmp);
      parent = parent.getParent();
    }
    if (tmp != null)
      tmp = new Path(Path.SEPARATOR, tmp);
    return tmp;
  }

  // the relative path of p: basically
  // getting rid of the leading /. Parsing and doing
  // string manipulation is not good - so
  // just use the Path API to do it.
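  // For example (illustrative): with archivePath "/user/foo/bar.har",
  // getPathInHar maps "/user/foo/bar.har/a/b" to "/a/b", and
  // makeRelative("/user/foo/bar.har", new Path("/a/b")) rebuilds the
  // fully qualified har path har://<auth>/user/foo/bar.har/a/b.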
  private Path makeRelative(String initial, Path p) {
    String scheme = this.uri.getScheme();
    String authority = this.uri.getAuthority();
    Path root = new Path(Path.SEPARATOR);
    if (root.compareTo(p) == 0)
      return new Path(scheme, authority, initial);
    Path retPath = new Path(p.getName());
    Path parent = p.getParent();
    for (int i = 0; i < p.depth() - 1; i++) {
      retPath = new Path(parent.getName(), retPath);
      parent = parent.getParent();
    }
    return new Path(new Path(scheme, authority, initial),
        retPath.toString());
  }

  /* this makes a path qualified in the har filesystem
   * (non-Javadoc)
   * @see org.apache.hadoop.fs.FilterFileSystem#makeQualified(
   * org.apache.hadoop.fs.Path)
   */
  @Override
  public Path makeQualified(Path path) {
    // make sure that we just get the
    // path component
    Path fsPath = path;
    if (!path.isAbsolute()) {
      fsPath = new Path(archivePath, path);
    }

    URI tmpURI = fsPath.toUri();
    // change this to a Har URI
    return new Path(uri.getScheme(), harAuth, tmpURI.getPath());
  }

  /**
   * Fix offset and length of block locations.
   * Note that this method modifies the original array.
   * @param locations block locations of har part file
   * @param start the start of the desired range in the contained file
   * @param len the length of the desired range
   * @param fileOffsetInHar the offset of the desired file in the har part file
   * @return block locations with fixed offset and length
   */
  static BlockLocation[] fixBlockLocations(BlockLocation[] locations,
                                           long start,
                                           long len,
                                           long fileOffsetInHar) {
    // offset 1 past last byte of desired range
    long end = start + len;

    for (BlockLocation location : locations) {
      // offset of part block relative to beginning of desired file
      // (may be negative if file starts in this part block)
      long harBlockStart = location.getOffset() - fileOffsetInHar;
      // offset 1 past last byte of har block relative to beginning of
      // desired file
      long harBlockEnd = harBlockStart + location.getLength();

      if (start > harBlockStart) {
        // desired range starts after beginning of this har block
        // fix offset to beginning of relevant range (relative to desired file)
        location.setOffset(start);
        // fix length to relevant portion of har block
        location.setLength(location.getLength() - (start - harBlockStart));
      } else {
        // desired range includes beginning of this har block
        location.setOffset(harBlockStart);
      }

      if (harBlockEnd > end) {
        // range ends before end of this har block
        // fix length to remove irrelevant portion at the end
        location.setLength(location.getLength() - (harBlockEnd - end));
      }
    }

    return locations;
  }

  /**
   * Get block locations from the underlying fs and fix their
   * offsets and lengths.
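   * For example (illustrative numbers): if the contained file starts at
   * offset 1000 in the part file and the caller asks for the range
   * [0, 500), the part-file blocks overlapping [1000, 1500) are fetched
   * from the underlying filesystem and then shifted back by 1000 in
   * fixBlockLocations, so the returned offsets are relative to the
   * contained file.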
   * @param file the input filestatus to get block locations
   * @param start the start of the desired range in the contained file
   * @param len the length of the desired range
   * @return block locations for this segment of file
   * @throws IOException
   */
  @Override
  public BlockLocation[] getFileBlockLocations(FileStatus file, long start,
      long len) throws IOException {
    HarStatus hstatus = getFileHarStatus(file.getPath());
    Path partPath = new Path(archivePath, hstatus.getPartName());
    FileStatus partStatus = metadata.getPartFileStatus(partPath);

    // get all part blocks that overlap with the desired file blocks
    BlockLocation[] locations =
        fs.getFileBlockLocations(partStatus,
            hstatus.getStartIndex() + start, len);

    return fixBlockLocations(locations, start, len, hstatus.getStartIndex());
  }

  /**
   * The hash of the path p inside the filesystem.
   * @param p the path in the har filesystem
   * @return the hash code of the path.
   */
  public static int getHarHash(Path p) {
    return (p.toString().hashCode() & 0x7fffffff);
  }

  static class Store {
    public Store() {
      begin = end = startHash = endHash = 0;
    }
    public Store(long begin, long end, int startHash, int endHash) {
      this.begin = begin;
      this.end = end;
      this.startHash = startHash;
      this.endHash = endHash;
    }
    public long begin;
    public long end;
    public int startHash;
    public int endHash;
  }

  /**
   * Get filestatuses of all the children of a given directory. This just
   * reads through the index file line by line to get all statuses for the
   * children of a directory. It is a brute-force way of getting all such
   * filestatuses.
   *
   * @param parent
   *          the parent path directory
   * @param statuses
   *          the list to add the children filestatuses to
   * @param children
   *          the string list of children for this parent
   */
  private void fileStatusesInIndex(HarStatus parent, List<FileStatus> statuses,
      List<String> children) throws IOException {
    String parentString = parent.getName();
    if (!parentString.endsWith(Path.SEPARATOR)) {
      parentString += Path.SEPARATOR;
    }
    Path harPath = new Path(parentString);
    int harlen = harPath.depth();
    final Map<String, FileStatus> cache = new TreeMap<String, FileStatus>();

    for (HarStatus hstatus : metadata.archive.values()) {
      String child = hstatus.getName();
      if ((child.startsWith(parentString))) {
        Path thisPath = new Path(child);
        if (thisPath.depth() == harlen + 1) {
          statuses.add(toFileStatus(hstatus, cache));
        }
      }
    }
  }

  /**
   * Combine the status stored in the index and the underlying status.
   * @param h status stored in the index
   * @param cache caching the underlying file statuses
   * @return the combined file status
   * @throws IOException
   */
  private FileStatus toFileStatus(HarStatus h,
      Map<String, FileStatus> cache) throws IOException {
    FileStatus underlying = null;
    if (cache != null) {
      underlying = cache.get(h.partName);
    }
    if (underlying == null) {
      final Path p = h.isDir ? archivePath : new Path(archivePath, h.partName);
      underlying = fs.getFileStatus(p);
      if (cache != null) {
        cache.put(h.partName, underlying);
      }
    }

    long modTime = 0;
    int version = metadata.getVersion();
    if (version < 3) {
      modTime = underlying.getModificationTime();
    } else if (version == 3) {
      modTime = h.getModificationTime();
    }

    return new FileStatus(
        h.isDir() ? 0L : h.getLength(),
        h.isDir(),
        underlying.getReplication(),
        underlying.getBlockSize(),
        modTime,
        underlying.getAccessTime(),
        underlying.getPermission(),
        underlying.getOwner(),
        underlying.getGroup(),
        makeRelative(this.uri.getPath(), new Path(h.name)));
  }

  // a single line parser for hadoop archive statuses
  // stored one per line in the index files
  // the format is
  // filename "dir"/"file" partFileName startIndex length
  // <space-separated children>
  private class HarStatus {
    boolean isDir;
    String name;
    List<String> children;
    String partName;
    long startIndex;
    long length;
    long modificationTime = 0;

    public HarStatus(String harString) throws UnsupportedEncodingException {
      String[] splits = harString.split(" ");
      this.name = decodeFileName(splits[0]);
      this.isDir = "dir".equals(splits[1]);
      // this is equal to "none" if it's a directory
      this.partName = splits[2];
      this.startIndex = Long.parseLong(splits[3]);
      this.length = Long.parseLong(splits[4]);

      int version = metadata.getVersion();
      String[] propSplits = null;
      // propSplits is used to retrieve the metainformation that Har versions
      // 1 & 2 missed (modification time, permission, owner group).
      // These fields are stored in an encoded string placed in different
      // locations depending on whether it's a file or directory entry.
      // If it's a directory, the string will be placed at the partName
      // location (directories have no partName because they don't have data
      // to be stored). This is done because the number of fields in a
      // directory entry is unbounded (all children are listed at the end).
      // If it's a file, the string will be the last field.
      if (isDir) {
        if (version == 3) {
          propSplits = decodeString(this.partName).split(" ");
        }
        children = new ArrayList<String>();
        for (int i = 5; i < splits.length; i++) {
          children.add(decodeFileName(splits[i]));
        }
      } else if (version == 3) {
        propSplits = decodeString(splits[5]).split(" ");
      }

      if (propSplits != null && propSplits.length >= 4) {
        modificationTime = Long.parseLong(propSplits[0]);
        // the fields below are stored in the file but are currently not used
        // by HarFileSystem
        // permission = new FsPermission(Short.parseShort(propSplits[1]));
        // owner = decodeString(propSplits[2]);
        // group = decodeString(propSplits[3]);
      }
    }
    public boolean isDir() {
      return isDir;
    }

    public String getName() {
      return name;
    }

    public List<String> getChildren() {
      return children;
    }
    public String getFileName() {
      return name;
    }
    public String getPartName() {
      return partName;
    }
    public long getStartIndex() {
      return startIndex;
    }
    public long getLength() {
      return length;
    }
    public long getModificationTime() {
      return modificationTime;
    }
  }

  /**
   * Return the filestatus of files in a har archive.
   * The permissions returned are those of the underlying
   * archive files; the original permissions are not persisted
   * while creating a hadoop archive.
   * @param f the path in har filesystem
   * @return filestatus.
   * @throws IOException
   */
  @Override
  public FileStatus getFileStatus(Path f) throws IOException {
    HarStatus hstatus = getFileHarStatus(f);
    return toFileStatus(hstatus, null);
  }

  private HarStatus getFileHarStatus(Path f) throws IOException {
    // get the fs DataInputStream for the underlying file
    // look up the index.
    Path p = makeQualified(f);
    Path harPath = getPathInHar(p);
    if (harPath == null) {
      throw new IOException("Invalid file name: " + f + " in " + uri);
    }
    HarStatus hstatus = metadata.archive.get(harPath);
    if (hstatus == null) {
      throw new FileNotFoundException("File: " + f + " does not exist in " + uri);
    }
    return hstatus;
  }

  /**
   * @return null since no checksum algorithm is implemented.
   */
  public FileChecksum getFileChecksum(Path f) {
    return null;
  }

  /**
   * Returns a har input stream which fakes end of
   * file. It reads the index files to get the part
   * file name and the size and start of the file.
   */
  @Override
  public FSDataInputStream open(Path f, int bufferSize) throws IOException {
    // get the fs DataInputStream for the underlying file
    HarStatus hstatus = getFileHarStatus(f);
    // we got it.. woo hooo!!!
    if (hstatus.isDir()) {
      throw new FileNotFoundException(f + " : not a file in " +
          archivePath);
    }
    return new HarFSDataInputStream(fs, new Path(archivePath,
        hstatus.getPartName()),
        hstatus.getStartIndex(), hstatus.getLength(), bufferSize);
  }

  /*
   * create throws an exception in the Har filesystem.
   * The archive once created cannot be changed.
   */
  public FSDataOutputStream create(Path f, int bufferSize)
      throws IOException {
    throw new IOException("Har: Create not allowed");
  }

  public FSDataOutputStream create(Path f,
      FsPermission permission,
      boolean overwrite,
      int bufferSize,
      short replication,
      long blockSize,
      Progressable progress) throws IOException {
    throw new IOException("Har: create not allowed.");
  }

  @Override
  public void close() throws IOException {
    if (fs != null) {
      try {
        fs.close();
      } catch(IOException ie) {
        // this might already be closed
        // ignore
      }
    }
  }

  /**
   * Not implemented.
   */
  @Override
  public boolean setReplication(Path src, short replication) throws IOException {
    throw new IOException("Har: setreplication not allowed");
  }

  /**
   * Not implemented.
   */
  @Override
  public boolean delete(Path f, boolean recursive) throws IOException {
    throw new IOException("Har: delete not allowed");
  }

  /**
   * listStatus returns the children of a directory
   * after looking up the index files.
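   * Only the immediate children of the directory are returned; deeper
   * descendants are filtered out by comparing path depths in
   * fileStatusesInIndex.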
   */
  @Override
  public FileStatus[] listStatus(Path f) throws IOException {
    // need to look the file up in the index
    // get the filestatus of the archive directory
    // we will create fake filestatuses to return
    // to the client
    List<FileStatus> statuses = new ArrayList<FileStatus>();
    Path tmpPath = makeQualified(f);
    Path harPath = getPathInHar(tmpPath);
    HarStatus hstatus = metadata.archive.get(harPath);
    if (hstatus == null) {
      throw new FileNotFoundException("File " + f + " not found in " + archivePath);
    }
    if (hstatus.isDir()) {
      fileStatusesInIndex(hstatus, statuses, hstatus.children);
    } else {
      statuses.add(toFileStatus(hstatus, null));
    }

    return statuses.toArray(new FileStatus[statuses.size()]);
  }

  /**
   * Return the top level archive path.
   */
  public Path getHomeDirectory() {
    return new Path(uri.toString());
  }

  public void setWorkingDirectory(Path newDir) {
    // does nothing.
  }

  /**
   * Not implemented.
   */
  public boolean mkdirs(Path f, FsPermission permission) throws IOException {
    throw new IOException("Har: mkdirs not allowed");
  }

  /**
   * Not implemented.
   */
  public void copyFromLocalFile(boolean delSrc, Path src, Path dst) throws
      IOException {
    throw new IOException("Har: copyfromlocalfile not allowed");
  }

  /**
   * Copies the file in the har filesystem to a local file.
   */
  public void copyToLocalFile(boolean delSrc, Path src, Path dst)
      throws IOException {
    FileUtil.copy(this, src, getLocal(getConf()), dst, false, getConf());
  }

  /**
   * Not implemented.
   */
  public Path startLocalOutput(Path fsOutputFile, Path tmpLocalFile)
      throws IOException {
    throw new IOException("Har: startLocalOutput not allowed");
  }

  /**
   * Not implemented.
   */
  public void completeLocalOutput(Path fsOutputFile, Path tmpLocalFile)
      throws IOException {
    throw new IOException("Har: completeLocalOutput not allowed");
  }

  /**
   * Not implemented.
   */
  public void setOwner(Path p, String username, String groupname)
      throws IOException {
    throw new IOException("Har: setowner not allowed");
  }

  /**
   * Not implemented.
   */
  public void setPermission(Path p, FsPermission permission)
      throws IOException {
    throw new IOException("Har: setPermission not allowed");
  }

  /**
   * Hadoop archives input stream. This input stream fakes EOF
   * since archive files are part of bigger part files.
   */
  private static class HarFSDataInputStream extends FSDataInputStream {
    /**
     * Create an input stream that fakes all the reads/positions/seeking.
     */
    private static class HarFsInputStream extends FSInputStream {
      private long position, start, end;
      // The underlying data input stream that the
      // underlying filesystem will return.
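      // All reads are confined to the window [start, end) of that
      // stream; getPos() and seek() are relative to 'start', so this
      // class behaves like a standalone file of length (end - start).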
      private FSDataInputStream underLyingStream;
      // one byte buffer
      private byte[] oneBytebuff = new byte[1];
      HarFsInputStream(FileSystem fs, Path path, long start,
          long length, int bufferSize) throws IOException {
        underLyingStream = fs.open(path, bufferSize);
        underLyingStream.seek(start);
        // the start of this file in the part file
        this.start = start;
        // the position pointer in the part file
        this.position = start;
        // the end pointer in the part file
        this.end = start + length;
      }

      public synchronized int available() throws IOException {
        long remaining = end - underLyingStream.getPos();
        if (remaining > (long)Integer.MAX_VALUE) {
          return Integer.MAX_VALUE;
        }
        return (int) remaining;
      }

      public synchronized void close() throws IOException {
        underLyingStream.close();
        super.close();
      }

      // not implemented
      @Override
      public void mark(int readLimit) {
        // do nothing
      }

      /**
       * reset is not implemented
       */
      public void reset() throws IOException {
        throw new IOException("reset not implemented.");
      }

      public synchronized int read() throws IOException {
        int ret = read(oneBytebuff, 0, 1);
        return (ret <= 0) ? -1 : (oneBytebuff[0] & 0xff);
      }

      public synchronized int read(byte[] b) throws IOException {
        // delegate to the three-argument read, which already advances
        // the position pointer; incrementing it here as well would
        // double-count the bytes read
        return read(b, 0, b.length);
      }

      /**
       * Reads up to len bytes, but never past the end of the
       * archived file's region in the part file.
       */
      public synchronized int read(byte[] b, int offset, int len)
          throws IOException {
        int newlen = len;
        int ret = -1;
        if (position + len > end) {
          newlen = (int) (end - position);
        }
        // end case
        if (newlen == 0)
          return ret;
        ret = underLyingStream.read(b, offset, newlen);
        position += ret;
        return ret;
      }

      public synchronized long skip(long n) throws IOException {
        long tmpN = n;
        if (tmpN > 0) {
          if (position + tmpN > end) {
            tmpN = end - position;
          }
          underLyingStream.seek(tmpN + position);
          position += tmpN;
          return tmpN;
        }
        return (tmpN < 0) ? -1 : 0;
      }

      public synchronized long getPos() throws IOException {
        return (position - start);
      }

      public synchronized void seek(long pos) throws IOException {
        if (pos < 0 || (start + pos > end)) {
          throw new IOException("Failed to seek: EOF");
        }
        position = start + pos;
        underLyingStream.seek(position);
      }

      public boolean seekToNewSource(long targetPos) throws IOException {
        // do not need to implement this
        // hdfs itself does seekToNewSource
        // while reading.
        return false;
      }

      /**
       * Implements positioned read.
       */
      public int read(long pos, byte[] b, int offset, int length)
          throws IOException {
        int nlength = length;
        if (start + nlength + pos > end) {
          nlength = (int) (end - (start + pos));
        }
        return underLyingStream.read(pos + start, b, offset, nlength);
      }

      /**
       * Positioned read that fails if the full length is not available.
       */
      public void readFully(long pos, byte[] b, int offset, int length)
          throws IOException {
        if (start + length + pos > end) {
          throw new IOException("Not enough bytes to read.");
        }
        underLyingStream.readFully(pos + start, b, offset, length);
      }

      public void readFully(long pos, byte[] b) throws IOException {
        readFully(pos, b, 0, b.length);
      }

    }

    /**
     * Constructor for har input stream.
     * @param fs the underlying filesystem
     * @param p the path in the underlying filesystem
     * @param start the start position in the part file
     * @param length the length of valid data in the part file
     * @param bufsize the buffer size
     * @throws IOException
     */
    public HarFSDataInputStream(FileSystem fs, Path p, long start,
        long length, int bufsize) throws IOException {
      super(new HarFsInputStream(fs, p, start, length, bufsize));
    }

    /**
     * Constructor for har input stream.
     * @param fs the underlying filesystem
     * @param p the path in the underlying file system
     * @param start the start position in the part file
     * @param length the length of valid data in the part file.
     * @throws IOException
     */
    public HarFSDataInputStream(FileSystem fs, Path p, long start, long length)
        throws IOException {
      super(new HarFsInputStream(fs, p, start, length, 0));
    }
  }

  private class HarMetaData {
    private FileSystem fs;
    private int version;
    // the masterIndex of the archive
    private Path masterIndexPath;
    // the index file
    private Path archiveIndexPath;

    private long masterIndexTimestamp;
    private long archiveIndexTimestamp;

    List<Store> stores = new ArrayList<Store>();
    Map<Path, HarStatus> archive = new HashMap<Path, HarStatus>();
    private Map<Path, FileStatus> partFileStatuses = new HashMap<Path, FileStatus>();

    public HarMetaData(FileSystem fs, Path masterIndexPath, Path archiveIndexPath) {
      this.fs = fs;
      this.masterIndexPath = masterIndexPath;
      this.archiveIndexPath = archiveIndexPath;
    }

    public FileStatus getPartFileStatus(Path partPath) throws IOException {
      FileStatus status;
      status = partFileStatuses.get(partPath);
      if (status == null) {
        status = fs.getFileStatus(partPath);
        partFileStatuses.put(partPath, status);
      }
      return status;
    }

    public long getMasterIndexTimestamp() {
      return masterIndexTimestamp;
    }

    public long getArchiveIndexTimestamp() {
      return archiveIndexTimestamp;
    }

    private int getVersion() {
      return version;
    }

    private void parseMetaData() throws IOException {
      FSDataInputStream in = fs.open(masterIndexPath);
      FileStatus masterStat = fs.getFileStatus(masterIndexPath);
      masterIndexTimestamp = masterStat.getModificationTime();
      LineReader lin = new LineReader(in, getConf());
      Text line = new Text();
      long read = lin.readLine(line);

      // the first line contains the version of the index file
      String versionLine = line.toString();
      String[] arr = versionLine.split(" ");
      version = Integer.parseInt(arr[0]);
      // make it always backwards-compatible
      if (this.version > HarFileSystem.VERSION) {
        throw new IOException("Invalid version " +
            this.version + " expected " + HarFileSystem.VERSION);
      }

      // each remaining line contains a hash code range and the byte range
      // (begin and end offsets) in the archive index file for that range
      String[] readStr = null;
      while (read < masterStat.getLen()) {
        int b = lin.readLine(line);
        read += b;
        readStr = line.toString().split(" ");
        int startHash = Integer.parseInt(readStr[0]);
        int endHash = Integer.parseInt(readStr[1]);
        stores.add(new Store(Long.parseLong(readStr[2]),
            Long.parseLong(readStr[3]), startHash,
            endHash));
        line.clear();
      }
      try {
        // close the master index
        lin.close();
      } catch(IOException io) {
        // do nothing, just a read.
      }

      FSDataInputStream aIn = fs.open(archiveIndexPath);
      FileStatus archiveStat = fs.getFileStatus(archiveIndexPath);
      archiveIndexTimestamp = archiveStat.getModificationTime();
      LineReader aLin;

      // now start reading the real index file
      for (Store s: stores) {
        read = 0;
        aIn.seek(s.begin);
        aLin = new LineReader(aIn, getConf());
        while (read + s.begin < s.end) {
          int tmp = aLin.readLine(line);
          read += tmp;
          String lineFeed = line.toString();
          String[] parsed = lineFeed.split(" ");
          parsed[0] = decodeFileName(parsed[0]);
          archive.put(new Path(parsed[0]), new HarStatus(lineFeed));
          line.clear();
        }
      }
      try {
        // close the archive index
        aIn.close();
      } catch(IOException io) {
        // do nothing, just a read.
      }
    }
  }
}
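// Usage sketch (illustrative only, not part of this class): how a client
// typically reads a file stored in a Hadoop archive through this
// filesystem. The archive and file names below are hypothetical.
//
//   Configuration conf = new Configuration();
//   Path harFile = new Path("har:///user/foo/bar.har/dir/file");
//   FileSystem harFs = harFile.getFileSystem(conf); // resolves to a HarFileSystem
//   FSDataInputStream in = harFs.open(harFile, 4096);
//   try {
//     FileStatus status = harFs.getFileStatus(harFile);
//     byte[] data = new byte[(int) status.getLen()];
//     in.readFully(0, data);
//   } finally {
//     in.close();
//   }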