001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.fs;
019
020import java.io.FileNotFoundException;
021import java.io.IOException;
022import java.io.UnsupportedEncodingException;
023import java.net.URI;
024import java.net.URISyntaxException;
025import java.net.URLDecoder;
026import java.util.ArrayList;
027import java.util.List;
028import java.util.Map;
029import java.util.TreeMap;
030import java.util.HashMap;
031import java.util.concurrent.ConcurrentHashMap;
032
033import org.apache.commons.logging.Log;
034import org.apache.commons.logging.LogFactory;
035import org.apache.hadoop.conf.Configuration;
036import org.apache.hadoop.fs.permission.FsPermission;
037import org.apache.hadoop.io.IOUtils;
038import org.apache.hadoop.io.Text;
039import org.apache.hadoop.util.LineReader;
040import org.apache.hadoop.util.Progressable;
041
042/**
 * This is an implementation of the Hadoop Archive
 * Filesystem. This archive Filesystem has index files
 * of the form _index* and contents of the form
 * part-*. The index files store the metadata of the
 * real files. The index files are named _masterindex
 * and _index. The master index is a level of indirection
 * into the index file that makes lookups faster: the index
 * file is sorted by the hash code of the paths it contains,
 * and the master index holds pointers to the positions in
 * the index for ranges of hash codes.
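 *
 * A minimal, illustrative client usage (the archive path and host below are
 * made-up examples, not part of this class):
 * <pre>
 *   Configuration conf = new Configuration();
 *   Path p = new Path("har://hdfs-namenode:8020/user/foo/data.har/dir/file");
 *   FileSystem harFs = p.getFileSystem(conf); // resolves to a HarFileSystem
 *   FSDataInputStream in = harFs.open(p);     // reads the archived file
 * </pre>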
053 */
054
055public class HarFileSystem extends FilterFileSystem {
056
057  private static final Log LOG = LogFactory.getLog(HarFileSystem.class);
058
059  public static final int VERSION = 3;
060
061  private static final Map<URI, HarMetaData> harMetaCache =
062      new ConcurrentHashMap<URI, HarMetaData>();
063
064  // uri representation of this Har filesystem
065  private URI uri;
066  // the top level path of the archive
067  // in the underlying file system
068  private Path archivePath;
069  // the har auth
070  private String harAuth;
071
072  // pointer into the static metadata cache
073  private HarMetaData metadata;
074
  /**
   * Public no-arg constructor, used when the filesystem is
   * instantiated through reflection (e.g. by FileSystem.get()).
   */
079  public HarFileSystem() {
080  }
081
082  /**
083   * Return the protocol scheme for the FileSystem.
084   * <p/>
085   *
086   * @return <code>har</code>
087   */
088  @Override
089  public String getScheme() {
090    return "har";
091  }
092
  /**
   * Constructor to create a HarFileSystem with an
   * underlying filesystem.
   * @param fs the underlying filesystem
   */
098  public HarFileSystem(FileSystem fs) {
099    super(fs);
100  }
101  
  /**
   * Initialize a Har filesystem per har archive. The
   * archive home directory is the top level directory
   * in the filesystem that contains the HAR archive.
   * Be careful with this method: you do not want to keep
   * creating new FileSystem instances on every call to
   * path.getFileSystem().
   * The URI of a Har filesystem is either
   * har://underlyingfsscheme-host:port/archivepath
   * or
   * har:///archivepath, in which case the default underlying
   * filesystem from the configuration is used.
   */
115  @Override
116  public void initialize(URI name, Configuration conf) throws IOException {
117    // decode the name
118    URI underLyingURI = decodeHarURI(name, conf);
    // we got the right har path; now check whether this is
    // truly a har filesystem
121    Path harPath = archivePath(
122      new Path(name.getScheme(), name.getAuthority(), name.getPath()));
123    if (harPath == null) { 
124      throw new IOException("Invalid path for the Har Filesystem. " + 
125                           name.toString());
126    }
127    if (fs == null) {
128      fs = FileSystem.get(underLyingURI, conf);
129    }
130    uri = harPath.toUri();
131    archivePath = new Path(uri.getPath());
132    harAuth = getHarAuth(underLyingURI);
133    //check for the underlying fs containing
134    // the index file
135    Path masterIndexPath = new Path(archivePath, "_masterindex");
136    Path archiveIndexPath = new Path(archivePath, "_index");
137    if (!fs.exists(masterIndexPath) || !fs.exists(archiveIndexPath)) {
138      throw new IOException("Invalid path for the Har Filesystem. " +
139          "No index file in " + harPath);
140    }
141
142    metadata = harMetaCache.get(uri);
143    if (metadata != null) {
144      FileStatus mStat = fs.getFileStatus(masterIndexPath);
145      FileStatus aStat = fs.getFileStatus(archiveIndexPath);
146      if (mStat.getModificationTime() != metadata.getMasterIndexTimestamp() ||
147          aStat.getModificationTime() != metadata.getArchiveIndexTimestamp()) {
148        // the archive has been overwritten since we last read it
149        // remove the entry from the meta data cache
150        metadata = null;
151        harMetaCache.remove(uri);
152      }
153    }
154    if (metadata == null) {
155      metadata = new HarMetaData(fs, masterIndexPath, archiveIndexPath);
156      metadata.parseMetaData();
157      harMetaCache.put(uri, metadata);
158    }
159  }
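
  // An illustrative example of the fields set up by initialize() above
  // (host name and paths are made up):
  //   name          = har://hdfs-namenode:8020/user/foo/data.har/dir/file
  //   underLyingURI = hdfs://namenode:8020/user/foo/data.har/dir/file
  //   archivePath   = /user/foo/data.har
  //   harAuth       = "hdfs-namenode:8020"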
160
  // get the version of the archive layout from the masterindex file;
  // the version is checked against HarFileSystem.VERSION when the
  // metadata is parsed
164  public int getHarVersion() throws IOException {
165    if (metadata != null) {
166      return metadata.getVersion();
167    }
168    else {
169      throw new IOException("Invalid meta data for the Har Filesystem");
170    }
171  }
172
  /*
   * find the ancestor of p (possibly p itself) that is the
   * archive path: the deepest prefix whose last segment ends
   * with .har. Returns null if no such ancestor exists.
   */
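  // For example (illustrative paths):
  //   archivePath(/user/foo/data.har/dir/file) -> /user/foo/data.har
  //   archivePath(/user/foo/plain/file)        -> null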
179  private Path archivePath(Path p) {
180    Path retPath = null;
181    Path tmp = p;
182    for (int i=0; i< p.depth(); i++) {
183      if (tmp.toString().endsWith(".har")) {
184        retPath = tmp;
185        break;
186      }
187      tmp = tmp.getParent();
188    }
189    return retPath;
190  }
191
  /**
   * Decode the raw URI to get the URI of the underlying filesystem.
   * @param rawURI raw Har URI
   * @param conf the configuration, used to look up the default filesystem
   *             when the har URI carries no authority
   * @return URI of the underlying filesystem
   */
197  private URI decodeHarURI(URI rawURI, Configuration conf) throws IOException {
    String tmpAuth = rawURI.getAuthority();
    // no authority (har:///path): fall back to the default
    // filesystem configured in the Configuration
    if (tmpAuth == null) {
      return FileSystem.getDefaultUri(conf);
    }
207    String host = rawURI.getHost();
208    if (host == null) {
209      throw new IOException("URI: " + rawURI
210          + " is an invalid Har URI since host==null."
211          + "  Expecting har://<scheme>-<host>/<path>.");
212    }
213    int i = host.indexOf('-');
214    if (i < 0) {
215      throw new IOException("URI: " + rawURI
216          + " is an invalid Har URI since '-' not found."
217          + "  Expecting har://<scheme>-<host>/<path>.");
218    }
219    final String underLyingScheme = host.substring(0, i);
220    i++;
221    final String underLyingHost = i == host.length()? null: host.substring(i);
222    int underLyingPort = rawURI.getPort();
223    String auth = (underLyingHost == null && underLyingPort == -1)?
224                  null:(underLyingHost+
225                      (underLyingPort == -1 ? "" : ":"+underLyingPort));
226    URI tmp = null;
227    if (rawURI.getQuery() != null) {
228      // query component not allowed
229      throw new IOException("query component in Path not supported  " + rawURI);
230    }
231    try {
232      tmp = new URI(underLyingScheme, auth, rawURI.getPath(), 
233            rawURI.getQuery(), rawURI.getFragment());
    } catch (URISyntaxException e) {
      // should not happen; the components come from an already parsed URI
    }
237    return tmp;
238  }
239
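  // File names stored in version 2 and 3 indexes are URL-encoded, so they
  // are run through URLDecoder below; e.g. (illustrative)
  // decodeString("%2Fuser%2Ffoo+bar") returns "/user/foo bar".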
240  private static String decodeString(String str)
241    throws UnsupportedEncodingException {
242    return URLDecoder.decode(str, "UTF-8");
243  }
244
245  private String decodeFileName(String fname) 
246    throws UnsupportedEncodingException {
247    int version = metadata.getVersion();
248    if (version == 2 || version == 3){
249      return decodeString(fname);
250    }
251    return fname;
252  }
253
  /**
   * Return the top level archive path as the working directory.
   */
257  @Override
258  public Path getWorkingDirectory() {
259    return new Path(uri.toString());
260  }
261  
  /**
   * Create a har-specific authority of the form
   * underlyingscheme-host:port, e.g. "hdfs-namenode:8020".
   * @param underLyingUri the URI of the underlying
   * filesystem
   * @return har-specific authority
   */
269  private String getHarAuth(URI underLyingUri) {
270    String auth = underLyingUri.getScheme() + "-";
271    if (underLyingUri.getHost() != null) {
272      auth += underLyingUri.getHost() + ":";
273      if (underLyingUri.getPort() != -1) {
274        auth +=  underLyingUri.getPort();
275      }
276    }
277    else {
278      auth += ":";
279    }
280    return auth;
281  }
282  
  /**
   * Returns the URI of this filesystem.
   * The URI is of the form
   * har://underlyingfsscheme-host:port/pathintheunderlyingfs
   */
288  @Override
289  public URI getUri() {
290    return this.uri;
291  }
292  
  /**
   * Returns the path inside the har filesystem, rooted at the archive.
   * For example, with archivePath /user/foo/data.har, the path
   * /user/foo/data.har/dir/file maps to /dir/file and the archive
   * root itself maps to /.
   * @param path the fully qualified path in the har filesystem.
   * @return the path relative to the archive root, or null if the path
   *         is not inside this archive.
   */
301  private Path getPathInHar(Path path) {
302    Path harPath = new Path(path.toUri().getPath());
303    if (archivePath.compareTo(harPath) == 0)
304      return new Path(Path.SEPARATOR);
305    Path tmp = new Path(harPath.getName());
306    Path parent = harPath.getParent();
307    while (!(parent.compareTo(archivePath) == 0)) {
308      if (parent.toString().equals(Path.SEPARATOR)) {
309        tmp = null;
310        break;
311      }
312      tmp = new Path(parent.getName(), tmp);
313      parent = parent.getParent();
314    }
315    if (tmp != null) 
316      tmp = new Path(Path.SEPARATOR, tmp);
317    return tmp;
318  }
319  
  // rebuild p (which is rooted at /) on top of the given initial path,
  // i.e. strip the leading / and qualify the result with this
  // filesystem's scheme and authority. Parsing and doing string
  // manipulation is not good - so just use the Path API to do it.
324  private Path makeRelative(String initial, Path p) {
325    String scheme = this.uri.getScheme();
326    String authority = this.uri.getAuthority();
327    Path root = new Path(Path.SEPARATOR);
328    if (root.compareTo(p) == 0)
329      return new Path(scheme, authority, initial);
330    Path retPath = new Path(p.getName());
331    Path parent = p.getParent();
332    for (int i=0; i < p.depth()-1; i++) {
333      retPath = new Path(parent.getName(), retPath);
334      parent = parent.getParent();
335    }
336    return new Path(new Path(scheme, authority, initial),
337      retPath.toString());
338  }
339  
340  /* this makes a path qualified in the har filesystem
341   * (non-Javadoc)
342   * @see org.apache.hadoop.fs.FilterFileSystem#makeQualified(
343   * org.apache.hadoop.fs.Path)
344   */
345  @Override
346  public Path makeQualified(Path path) {
347    // make sure that we just get the 
348    // path component 
349    Path fsPath = path;
350    if (!path.isAbsolute()) {
351      fsPath = new Path(archivePath, path);
352    }
353
354    URI tmpURI = fsPath.toUri();
355    //change this to Har uri 
356    return new Path(uri.getScheme(), harAuth, tmpURI.getPath());
357  }
358
359  /**
360   * Fix offset and length of block locations.
361   * Note that this method modifies the original array.
362   * @param locations block locations of har part file
363   * @param start the start of the desired range in the contained file
364   * @param len the length of the desired range
365   * @param fileOffsetInHar the offset of the desired file in the har part file
366   * @return block locations with fixed offset and length
367   */  
368  static BlockLocation[] fixBlockLocations(BlockLocation[] locations,
369                                          long start,
370                                          long len,
371                                          long fileOffsetInHar) {
372    // offset 1 past last byte of desired range
373    long end = start + len;
374
375    for (BlockLocation location : locations) {
376      // offset of part block relative to beginning of desired file
377      // (may be negative if file starts in this part block)
378      long harBlockStart = location.getOffset() - fileOffsetInHar;
379      // offset 1 past last byte of har block relative to beginning of
380      // desired file
381      long harBlockEnd = harBlockStart + location.getLength();
382      
383      if (start > harBlockStart) {
384        // desired range starts after beginning of this har block
385        // fix offset to beginning of relevant range (relative to desired file)
386        location.setOffset(start);
387        // fix length to relevant portion of har block
388        location.setLength(location.getLength() - (start - harBlockStart));
389      } else {
390        // desired range includes beginning of this har block
391        location.setOffset(harBlockStart);
392      }
393      
394      if (harBlockEnd > end) {
395        // range ends before end of this har block
396        // fix length to remove irrelevant portion at the end
397        location.setLength(location.getLength() - (harBlockEnd - end));
398      }
399    }
400    
401    return locations;
402  }
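
  // A worked example of the arithmetic above (all numbers are made up):
  // suppose the archived file begins at byte 1024 of the part file
  // (fileOffsetInHar = 1024) and the caller asks for start = 0, len = 2048.
  // A part-file block at offset 1536 with length 1024 then has
  // harBlockStart = 512 and harBlockEnd = 1536 relative to the archived
  // file; since start (0) <= 512 and harBlockEnd (1536) <= end (2048),
  // the block is reported at offset 512 with its full length of 1024.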
403  
404  /**
405   * Get block locations from the underlying fs and fix their
406   * offsets and lengths.
407   * @param file the input filestatus to get block locations
408   * @param start the start of the desired range in the contained file
409   * @param len the length of the desired range
410   * @return block locations for this segment of file
411   * @throws IOException
412   */
413  @Override
414  public BlockLocation[] getFileBlockLocations(FileStatus file, long start,
415                                               long len) throws IOException {
416    HarStatus hstatus = getFileHarStatus(file.getPath());
417    Path partPath = new Path(archivePath, hstatus.getPartName());
418    FileStatus partStatus = metadata.getPartFileStatus(partPath);
419
420    // get all part blocks that overlap with the desired file blocks
421    BlockLocation[] locations = 
422      fs.getFileBlockLocations(partStatus,
423                               hstatus.getStartIndex() + start, len);
424
425    return fixBlockLocations(locations, start, len, hstatus.getStartIndex());
426  }
427  
  /**
   * The hash of the path p inside the har filesystem,
   * masked to be non-negative.
   * @param p the path in the har filesystem
   * @return the hash code of the path.
   */
434  public static int getHarHash(Path p) {
435    return (p.toString().hashCode() & 0x7fffffff);
436  }
437  
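  /**
   * One entry of the master index (assumed layout, mirroring the parsing in
   * HarMetaData.parseMetaData below): the byte range [begin, end) of the
   * _index file holding the entries whose path hashes fall in
   * [startHash, endHash].
   */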
438  static class Store {
439    public Store() {
440      begin = end = startHash = endHash = 0;
441    }
442    public Store(long begin, long end, int startHash, int endHash) {
443      this.begin = begin;
444      this.end = end;
445      this.startHash = startHash;
446      this.endHash = endHash;
447    }
448    public long begin;
449    public long end;
450    public int startHash;
451    public int endHash;
452  }
453  
  /**
   * Get the filestatuses of all the children of a given directory. This
   * scans every entry of the parsed archive index held in the metadata
   * cache and keeps those that are immediate children of the directory.
   * It is a brute-force way of getting all such filestatuses.
   *
   * @param parent
   *          the parent directory's status from the index
   * @param statuses
   *          the list to add the children filestatuses to
   * @param children
   *          the string list of children for this parent (currently unused)
   */
468  private void fileStatusesInIndex(HarStatus parent, List<FileStatus> statuses,
469      List<String> children) throws IOException {
470    String parentString = parent.getName();
471    if (!parentString.endsWith(Path.SEPARATOR)){
472        parentString += Path.SEPARATOR;
473    }
474    Path harPath = new Path(parentString);
475    int harlen = harPath.depth();
476    final Map<String, FileStatus> cache = new TreeMap<String, FileStatus>();
477
478    for (HarStatus hstatus : metadata.archive.values()) {
479      String child = hstatus.getName();
480      if ((child.startsWith(parentString))) {
481        Path thisPath = new Path(child);
482        if (thisPath.depth() == harlen + 1) {
483          statuses.add(toFileStatus(hstatus, cache));
484        }
485      }
486    }
487  }
488
489  /**
490   * Combine the status stored in the index and the underlying status. 
491   * @param h status stored in the index
492   * @param cache caching the underlying file statuses
493   * @return the combined file status
494   * @throws IOException
495   */
496  private FileStatus toFileStatus(HarStatus h,
497      Map<String, FileStatus> cache) throws IOException {
498    FileStatus underlying = null;
499    if (cache != null) {
500      underlying = cache.get(h.partName);
501    }
502    if (underlying == null) {
503      final Path p = h.isDir? archivePath: new Path(archivePath, h.partName);
504      underlying = fs.getFileStatus(p);
505      if (cache != null) {
506        cache.put(h.partName, underlying);
507      }
508    }
509
510    long modTime = 0;
511    int version = metadata.getVersion();
512    if (version < 3) {
513      modTime = underlying.getModificationTime();
514    } else if (version == 3) {
515      modTime = h.getModificationTime();
516    }
517
518    return new FileStatus(
519        h.isDir()? 0L: h.getLength(),
520        h.isDir(),
521        underlying.getReplication(),
522        underlying.getBlockSize(),
523        modTime,
524        underlying.getAccessTime(),
525        underlying.getPermission(),
526        underlying.getOwner(),
527        underlying.getGroup(),
528        makeRelative(this.uri.getPath(), new Path(h.name)));
529  }
530
  // a single-line parser for a hadoop archive status
  // stored as one line in the index files
  // the format is of the form
  // filename "dir"/"file" partFileName startIndex length
  // <space separated children>
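  //
  // Illustrative version-3 entries (all values are made up; names and the
  // trailing properties string are URL-encoded, '+' standing for a space,
  // and the properties are "modtime permission owner group"):
  //   %2Fdir1 dir 1380270822000+493+hadoop+hadoop 0 0 file1 file2
  //   %2Fdir1%2Ffile1 file part-0 0 1024 1380270822000+420+hadoop+hadoop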
536  private class HarStatus {
537    boolean isDir;
538    String name;
539    List<String> children;
540    String partName;
541    long startIndex;
542    long length;
543    long modificationTime = 0;
544
545    public HarStatus(String harString) throws UnsupportedEncodingException {
546      String[] splits = harString.split(" ");
547      this.name = decodeFileName(splits[0]);
      this.isDir = "dir".equals(splits[1]);
      // for directories this slot is "none" in older versions; in version 3
      // it carries the encoded properties string (see below)
550      this.partName = splits[2];
551      this.startIndex = Long.parseLong(splits[3]);
552      this.length = Long.parseLong(splits[4]);
553
554      int version = metadata.getVersion();
555      String[] propSplits = null;
556      // propSplits is used to retrieve the metainformation that Har versions
557      // 1 & 2 missed (modification time, permission, owner group).
558      // These fields are stored in an encoded string placed in different
559      // locations depending on whether it's a file or directory entry.
560      // If it's a directory, the string will be placed at the partName
561      // location (directories have no partName because they don't have data
562      // to be stored). This is done because the number of fields in a
563      // directory entry is unbounded (all children are listed at the end)
564      // If it's a file, the string will be the last field.
565      if (isDir) {
566        if (version == 3){
567          propSplits = decodeString(this.partName).split(" ");
568        }
569        children = new ArrayList<String>();
570        for (int i = 5; i < splits.length; i++) {
571          children.add(decodeFileName(splits[i]));
572        }
573      } else if (version == 3) {
574        propSplits = decodeString(splits[5]).split(" ");
575      }
576
577      if (propSplits != null && propSplits.length >= 4) {
578        modificationTime = Long.parseLong(propSplits[0]);
579        // the fields below are stored in the file but are currently not used
580        // by HarFileSystem
581        // permission = new FsPermission(Short.parseShort(propSplits[1]));
582        // owner = decodeString(propSplits[2]);
583        // group = decodeString(propSplits[3]);
584      }
585    }
586    public boolean isDir() {
587      return isDir;
588    }
589    
590    public String getName() {
591      return name;
592    }
593    public String getPartName() {
594      return partName;
595    }
596    public long getStartIndex() {
597      return startIndex;
598    }
599    public long getLength() {
600      return length;
601    }
602    public long getModificationTime() {
603      return modificationTime;
604    }
605  }
606  
  /**
   * Return the filestatus of files in the har archive.
   * The permissions returned are those of the underlying
   * archive files; per-file permissions are not persisted
   * while creating a hadoop archive.
   * @param f the path in the har filesystem
   * @return filestatus.
   * @throws IOException
   */
616  @Override
617  public FileStatus getFileStatus(Path f) throws IOException {
618    HarStatus hstatus = getFileHarStatus(f);
619    return toFileStatus(hstatus, null);
620  }
621
622  private HarStatus getFileHarStatus(Path f) throws IOException {
623    // get the fs DataInputStream for the underlying file
624    // look up the index.
625    Path p = makeQualified(f);
626    Path harPath = getPathInHar(p);
627    if (harPath == null) {
628      throw new IOException("Invalid file name: " + f + " in " + uri);
629    }
630    HarStatus hstatus = metadata.archive.get(harPath);
631    if (hstatus == null) {
632      throw new FileNotFoundException("File: " +  f + " does not exist in " + uri);
633    }
634    return hstatus;
635  }
636
637  /**
638   * @return null since no checksum algorithm is implemented.
639   */
640  @Override
641  public FileChecksum getFileChecksum(Path f) {
642    return null;
643  }
644
  /**
   * Returns a har input stream which fakes end of
   * file. It looks up the parsed index to get the part
   * file name and the size and start of the file.
   */
650  @Override
651  public FSDataInputStream open(Path f, int bufferSize) throws IOException {
652    // get the fs DataInputStream for the underlying file
653    HarStatus hstatus = getFileHarStatus(f);
654    // we got it.. woo hooo!!! 
655    if (hstatus.isDir()) {
656      throw new FileNotFoundException(f + " : not a file in " +
657                archivePath);
658    }
659    return new HarFSDataInputStream(fs, new Path(archivePath, 
660        hstatus.getPartName()),
661        hstatus.getStartIndex(), hstatus.getLength(), bufferSize);
662  }
663 
664  @Override
665  public FSDataOutputStream create(Path f,
666      FsPermission permission,
667      boolean overwrite,
668      int bufferSize,
669      short replication,
670      long blockSize,
671      Progressable progress) throws IOException {
672    throw new IOException("Har: create not allowed.");
673  }
674  
675  @Override
676  public void close() throws IOException {
677    if (fs != null) {
678      try {
679        fs.close();
680      } catch(IOException ie) {
681        //this might already be closed
682        // ignore
683      }
684    }
685  }
686  
687  /**
688   * Not implemented.
689   */
690  @Override
691  public boolean setReplication(Path src, short replication) throws IOException{
692    throw new IOException("Har: setreplication not allowed");
693  }
694  
695  /**
696   * Not implemented.
697   */
698  @Override
699  public boolean delete(Path f, boolean recursive) throws IOException { 
700    throw new IOException("Har: delete not allowed");
701  }
702  
703  /**
   * listStatus returns the children of a directory
   * by looking them up in the parsed index.
706   */
707  @Override
708  public FileStatus[] listStatus(Path f) throws IOException {
    // look the path up in the parsed index and build
    // synthetic filestatuses to return to the client
713    List<FileStatus> statuses = new ArrayList<FileStatus>();
714    Path tmpPath = makeQualified(f);
715    Path harPath = getPathInHar(tmpPath);
716    HarStatus hstatus = metadata.archive.get(harPath);
717    if (hstatus == null) {
718      throw new FileNotFoundException("File " + f + " not found in " + archivePath);
719    }
720    if (hstatus.isDir()) {
721      fileStatusesInIndex(hstatus, statuses, hstatus.children);
722    } else {
723      statuses.add(toFileStatus(hstatus, null));
724    }
725    
726    return statuses.toArray(new FileStatus[statuses.size()]);
727  }
728  
729  /**
730   * return the top level archive path.
731   */
732  @Override
733  public Path getHomeDirectory() {
734    return new Path(uri.toString());
735  }
736  
737  @Override
738  public void setWorkingDirectory(Path newDir) {
739    //does nothing.
740  }
741  
742  /**
743   * not implemented.
744   */
745  @Override
746  public boolean mkdirs(Path f, FsPermission permission) throws IOException {
747    throw new IOException("Har: mkdirs not allowed");
748  }
749  
750  /**
751   * not implemented.
752   */
753  @Override
754  public void copyFromLocalFile(boolean delSrc, Path src, Path dst) throws 
755        IOException {
756    throw new IOException("Har: copyfromlocalfile not allowed");
757  }
758  
759  /**
760   * copies the file in the har filesystem to a local file.
761   */
762  @Override
763  public void copyToLocalFile(boolean delSrc, Path src, Path dst) 
764    throws IOException {
765    FileUtil.copy(this, src, getLocal(getConf()), dst, false, getConf());
766  }
767  
768  /**
769   * not implemented.
770   */
771  @Override
772  public Path startLocalOutput(Path fsOutputFile, Path tmpLocalFile) 
773    throws IOException {
774    throw new IOException("Har: startLocalOutput not allowed");
775  }
776  
777  /**
778   * not implemented.
779   */
780  @Override
781  public void completeLocalOutput(Path fsOutputFile, Path tmpLocalFile) 
782    throws IOException {
783    throw new IOException("Har: completeLocalOutput not allowed");
784  }
785  
786  /**
787   * not implemented.
788   */
789  @Override
790  public void setOwner(Path p, String username, String groupname)
791    throws IOException {
792    throw new IOException("Har: setowner not allowed");
793  }
794
795  /**
796   * Not implemented.
797   */
798  @Override
799  public void setPermission(Path p, FsPermission permisssion) 
800    throws IOException {
801    throw new IOException("Har: setPermission not allowed");
802  }
803  
  /**
   * Hadoop archives input stream. This input stream fakes EOF
   * since archived files are only segments inside bigger part files.
   */
808  private static class HarFSDataInputStream extends FSDataInputStream {
809    /**
810     * Create an input stream that fakes all the reads/positions/seeking.
811     */
812    private static class HarFsInputStream extends FSInputStream {
813      private long position, start, end;
814      //The underlying data input stream that the
815      // underlying filesystem will return.
816      private FSDataInputStream underLyingStream;
817      //one byte buffer
818      private byte[] oneBytebuff = new byte[1];
819      HarFsInputStream(FileSystem fs, Path path, long start,
820          long length, int bufferSize) throws IOException {
821        underLyingStream = fs.open(path, bufferSize);
822        underLyingStream.seek(start);
823        // the start of this file in the part file
824        this.start = start;
825        // the position pointer in the part file
826        this.position = start;
827        // the end pointer in the part file
828        this.end = start + length;
829      }
830      
831      @Override
832      public synchronized int available() throws IOException {
833        long remaining = end - underLyingStream.getPos();
834        if (remaining > (long)Integer.MAX_VALUE) {
835          return Integer.MAX_VALUE;
836        }
837        return (int) remaining;
838      }
839      
840      @Override
841      public synchronized  void close() throws IOException {
842        underLyingStream.close();
843        super.close();
844      }
845      
846      //not implemented
847      @Override
848      public void mark(int readLimit) {
849        // do nothing 
850      }
851      
852      /**
853       * reset is not implemented
854       */
855      @Override
856      public void reset() throws IOException {
857        throw new IOException("reset not implemented.");
858      }
859      
860      @Override
861      public synchronized int read() throws IOException {
862        int ret = read(oneBytebuff, 0, 1);
863        return (ret <= 0) ? -1: (oneBytebuff[0] & 0xff);
864      }
865      
      @Override
      public synchronized int read(byte[] b) throws IOException {
        // delegate to read(byte[], int, int), which already advances
        // position; adding ret again here would double-count it
        return read(b, 0, b.length);
      }
874      
      /**
       * Reads up to len bytes, truncating the request so the caller
       * never reads past the fake EOF of this archived file.
       */
      @Override
      public synchronized int read(byte[] b, int offset, int len)
        throws IOException {
        int newlen = len;
        int ret = -1;
        if (position + len > end) {
          newlen = (int) (end - position);
        }
        // end case
        if (newlen == 0)
          return ret;
        ret = underLyingStream.read(b, offset, newlen);
        // only advance position on a successful read; ret may be -1 at EOF
        if (ret > 0) {
          position += ret;
        }
        return ret;
      }
893      
894      @Override
895      public synchronized long skip(long n) throws IOException {
896        long tmpN = n;
897        if (tmpN > 0) {
898          if (position + tmpN > end) {
899            tmpN = end - position;
900          }
901          underLyingStream.seek(tmpN + position);
902          position += tmpN;
903          return tmpN;
904        }
905        return (tmpN < 0)? -1 : 0;
906      }
907      
908      @Override
909      public synchronized long getPos() throws IOException {
910        return (position - start);
911      }
912      
913      @Override
914      public synchronized void seek(long pos) throws IOException {
915        if (pos < 0 || (start + pos > end)) {
916          throw new IOException("Failed to seek: EOF");
917        }
918        position = start + pos;
919        underLyingStream.seek(position);
920      }
921
922      @Override
923      public boolean seekToNewSource(long targetPos) throws IOException {
        // no need to implement this: the underlying stream
        // (e.g. HDFS) does seekToNewSource itself while reading
927        return false;
928      }
929      
      /**
       * Implements positioned read; the request is truncated so it
       * never reads past the fake EOF of this archived file.
       */
933      @Override
934      public int read(long pos, byte[] b, int offset, int length) 
935      throws IOException {
936        int nlength = length;
937        if (start + nlength + pos > end) {
938          nlength = (int) (end - (start + pos));
939        }
940        return underLyingStream.read(pos + start , b, offset, nlength);
941      }
942      
      /**
       * Positioned readFully; fails if the request would extend past
       * the fake EOF of this archived file.
       */
946      @Override
947      public void readFully(long pos, byte[] b, int offset, int length) 
948      throws IOException {
949        if (start + length + pos > end) {
950          throw new IOException("Not enough bytes to read.");
951        }
952        underLyingStream.readFully(pos + start, b, offset, length);
953      }
954      
955      @Override
956      public void readFully(long pos, byte[] b) throws IOException {
957          readFully(pos, b, 0, b.length);
958      }
959      
960    }
961  
962    /**
     * Constructor for a har input stream.
964     * @param fs the underlying filesystem
965     * @param p The path in the underlying filesystem
966     * @param start the start position in the part file
967     * @param length the length of valid data in the part file
968     * @param bufsize the buffer size
969     * @throws IOException
970     */
971    public HarFSDataInputStream(FileSystem fs, Path  p, long start, 
972        long length, int bufsize) throws IOException {
973        super(new HarFsInputStream(fs, p, start, length, bufsize));
974    }
975
976    /**
977     * constructor for har input stream.
978     * @param fs the underlying filesystem
979     * @param p the path in the underlying file system
980     * @param start the start position in the part file
981     * @param length the length of valid data in the part file.
982     * @throws IOException
983     */
984    public HarFSDataInputStream(FileSystem fs, Path  p, long start, long length)
985      throws IOException {
986        super(new HarFsInputStream(fs, p, start, length, 0));
987    }
988  }
989
990  private class HarMetaData {
991    private FileSystem fs;
992    private int version;
993    // the masterIndex of the archive
994    private Path masterIndexPath;
995    // the index file 
996    private Path archiveIndexPath;
997
998    private long masterIndexTimestamp;
999    private long archiveIndexTimestamp;
1000
1001    List<Store> stores = new ArrayList<Store>();
1002    Map<Path, HarStatus> archive = new HashMap<Path, HarStatus>();
1003    private Map<Path, FileStatus> partFileStatuses = new HashMap<Path, FileStatus>();
1004
1005    public HarMetaData(FileSystem fs, Path masterIndexPath, Path archiveIndexPath) {
1006      this.fs = fs;
1007      this.masterIndexPath = masterIndexPath;
1008      this.archiveIndexPath = archiveIndexPath;
1009    }
1010
1011    public FileStatus getPartFileStatus(Path partPath) throws IOException {
1012      FileStatus status;
1013      status = partFileStatuses.get(partPath);
1014      if (status == null) {
1015        status = fs.getFileStatus(partPath);
1016        partFileStatuses.put(partPath, status);
1017      }
1018      return status;
1019    }
1020
1021    public long getMasterIndexTimestamp() {
1022      return masterIndexTimestamp;
1023    }
1024
1025    public long getArchiveIndexTimestamp() {
1026      return archiveIndexTimestamp;
1027    }
1028
1029    private int getVersion() {
1030      return version;
1031    }
1032
1033    private void parseMetaData() throws IOException {
1034      Text line;
1035      long read;
1036      FSDataInputStream in = null;
1037      LineReader lin = null;
1038
1039      try {
1040        in = fs.open(masterIndexPath);
1041        FileStatus masterStat = fs.getFileStatus(masterIndexPath);
1042        masterIndexTimestamp = masterStat.getModificationTime();
1043        lin = new LineReader(in, getConf());
1044        line = new Text();
1045        read = lin.readLine(line);
1046
1047        // the first line contains the version of the index file
1048        String versionLine = line.toString();
1049        String[] arr = versionLine.split(" ");
1050        version = Integer.parseInt(arr[0]);
        // stay backwards-compatible: older archives remain readable,
        // but archives written with a newer layout version are rejected
1052        if (this.version > HarFileSystem.VERSION) {
1053          throw new IOException("Invalid version " + 
1054              this.version + " expected " + HarFileSystem.VERSION);
1055        }
1056
        // each remaining line contains a hashcode range and the byte
        // range of the corresponding section in the archive index
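        // e.g. a master index line might read (made-up values):
        //   0 2147483647 68 1029
        // meaning hashes 0..2147483647 live in bytes [68, 1029) of _index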
1058        String[] readStr = null;
1059        while(read < masterStat.getLen()) {
1060          int b = lin.readLine(line);
1061          read += b;
1062          readStr = line.toString().split(" ");
1063          int startHash = Integer.parseInt(readStr[0]);
1064          int endHash  = Integer.parseInt(readStr[1]);
1065          stores.add(new Store(Long.parseLong(readStr[2]), 
1066              Long.parseLong(readStr[3]), startHash,
1067              endHash));
1068          line.clear();
1069        }
1070      } finally {
1071        IOUtils.cleanup(LOG, lin, in);
1072      }
1073
1074      FSDataInputStream aIn = fs.open(archiveIndexPath);
1075      try {
1076        FileStatus archiveStat = fs.getFileStatus(archiveIndexPath);
1077        archiveIndexTimestamp = archiveStat.getModificationTime();
1078        LineReader aLin;
1079
1080        // now start reading the real index file
1081        for (Store s: stores) {
1082          read = 0;
1083          aIn.seek(s.begin);
1084          aLin = new LineReader(aIn, getConf());
1085          while (read + s.begin < s.end) {
1086            int tmp = aLin.readLine(line);
1087            read += tmp;
1088            String lineFeed = line.toString();
1089            String[] parsed = lineFeed.split(" ");
1090            parsed[0] = decodeFileName(parsed[0]);
1091            archive.put(new Path(parsed[0]), new HarStatus(lineFeed));
1092            line.clear();
1093          }
1094        }
1095      } finally {
1096        IOUtils.cleanup(LOG, aIn);
1097      }
1098    }
1099  }
1100  
1101  /*
1102   * testing purposes only:
1103   */
1104  HarMetaData getMetadata() {
1105    return metadata;
1106  }
1107}