/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.fs;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.HashMap;
import java.util.concurrent.ConcurrentHashMap;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.LineReader;
import org.apache.hadoop.util.Progressable;

/**
 * This is an implementation of the Hadoop Archive (HAR)
 * filesystem. A HAR archive stores its metadata in index files
 * (_masterindex and _index) and its contents in part files
 * (part-*). The _index file holds one entry per archived file,
 * sorted by the hash code of the file's path, and the
 * _masterindex adds a level of indirection to make lookups
 * faster: it maps ranges of hash codes to byte ranges within
 * the _index file.
 */

public class HarFileSystem extends FilterFileSystem {
  public static final int VERSION = 3;

  private static final Map<URI, HarMetaData> harMetaCache =
      new ConcurrentHashMap<URI, HarMetaData>();

  // uri representation of this Har filesystem
  private URI uri;
  // the top level path of the archive
  // in the underlying file system
  private Path archivePath;
  // the har-specific authority (underlying scheme-host:port)
  private String harAuth;

  // pointer into the static metadata cache
  private HarMetaData metadata;

  /**
   * Public construction of a HarFileSystem.
   */
  public HarFileSystem() {
  }

  /**
   * Return the protocol scheme for the FileSystem.
   *
   * @return <code>har</code>
   */
  @Override
  public String getScheme() {
    return "har";
  }

  /**
   * Constructor to create a HarFileSystem with an
   * underlying filesystem.
   * @param fs the underlying filesystem
   */
  public HarFileSystem(FileSystem fs) {
    super(fs);
  }

  /**
   * Initialize a Har filesystem per har archive. The
   * archive home directory is the top level directory
   * in the filesystem that contains the HAR archive.
   * Be careful with this method: you do not want to keep
   * creating new FileSystem instances on every call to
   * path.getFileSystem().
   * The URI of a Har filesystem is either
   * har://underlyingfsscheme-host:port/archivepath
   * or
   * har:///archivepath, in which case the default filesystem
   * from the configuration is used as the underlying filesystem.
   */
  @Override
  public void initialize(URI name, Configuration conf) throws IOException {
    // decode the name
    URI underLyingURI = decodeHarURI(name, conf);
    // we got the right har path - now check if this is
    // truly a har filesystem
    Path harPath = archivePath(
      new Path(name.getScheme(), name.getAuthority(), name.getPath()));
    if (harPath == null) { 
      throw new IOException("Invalid path for the Har Filesystem. " + 
                           name.toString());
    }
    if (fs == null) {
      fs = FileSystem.get(underLyingURI, conf);
    }
    uri = harPath.toUri();
    archivePath = new Path(uri.getPath());
    harAuth = getHarAuth(underLyingURI);
    // check that the underlying fs contains
    // the index files
    Path masterIndexPath = new Path(archivePath, "_masterindex");
    Path archiveIndexPath = new Path(archivePath, "_index");
    if (!fs.exists(masterIndexPath) || !fs.exists(archiveIndexPath)) {
      throw new IOException("Invalid path for the Har Filesystem. " +
          "No index file in " + harPath);
    }

    metadata = harMetaCache.get(uri);
    if (metadata != null) {
      FileStatus mStat = fs.getFileStatus(masterIndexPath);
      FileStatus aStat = fs.getFileStatus(archiveIndexPath);
      if (mStat.getModificationTime() != metadata.getMasterIndexTimestamp() ||
          aStat.getModificationTime() != metadata.getArchiveIndexTimestamp()) {
        // the archive has been overwritten since we last read it
        // remove the entry from the meta data cache
        metadata = null;
        harMetaCache.remove(uri);
      }
    }
    if (metadata == null) {
      metadata = new HarMetaData(fs, masterIndexPath, archiveIndexPath);
      metadata.parseMetaData();
      harMetaCache.put(uri, metadata);
    }
  }
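
  /*
   * Illustrative usage sketch (not part of this class's API): reading a file
   * stored in a HAR archive through this filesystem. The archive path and
   * hostname below are hypothetical.
   *
   *   Configuration conf = new Configuration();
   *   // fully qualified form: har://<underlyingscheme>-<host>:<port>/<path to .har>/<file in archive>
   *   Path p = new Path("har://hdfs-namenode:8020/user/alice/logs.har/2012/part-00000");
   *   FileSystem harFs = p.getFileSystem(conf);   // resolves to this class when the
   *                                               // har scheme is mapped to HarFileSystem
   *   FileStatus st = harFs.getFileStatus(p);     // status is served from the _index metadata
   *   FSDataInputStream in = harFs.open(p);       // reads a slice of the underlying part file
   *   // ... read as with any other FileSystem, then in.close();
   */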

  // get the version of the archive, as recorded in the masterindex file
  public int getHarVersion() throws IOException {
    if (metadata != null) {
      return metadata.getVersion();
    }
    else {
      throw new IOException("Invalid meta data for the Har Filesystem");
    }
  }

  /*
   * Find the ancestor of p that is the archive path, i.e. the
   * deepest path prefix whose last segment ends with .har.
   * Returns null if no such prefix exists.
   */
  private Path archivePath(Path p) {
    Path retPath = null;
    Path tmp = p;
    for (int i=0; i< p.depth(); i++) {
      if (tmp.toString().endsWith(".har")) {
        retPath = tmp;
        break;
      }
      tmp = tmp.getParent();
    }
    return retPath;
  }

  /**
   * Decode the raw URI to get the URI of the underlying filesystem.
   * @param rawURI raw Har URI
   * @param conf the configuration, used to find the default filesystem
   *        when the raw URI has no authority
   * @return filtered URI of the underlying fileSystem
   */
  private URI decodeHarURI(URI rawURI, Configuration conf) throws IOException {
    String tmpAuth = rawURI.getAuthority();
    // no authority: fall back to the default filesystem
    // configured in the config as the underlying filesystem
    if (tmpAuth == null) {
      return FileSystem.getDefaultUri(conf);
    }
    String host = rawURI.getHost();
    if (host == null) {
      throw new IOException("URI: " + rawURI
          + " is an invalid Har URI since host==null."
          + "  Expecting har://<scheme>-<host>/<path>.");
    }
    int i = host.indexOf('-');
    if (i < 0) {
      throw new IOException("URI: " + rawURI
          + " is an invalid Har URI since '-' not found."
          + "  Expecting har://<scheme>-<host>/<path>.");
    }
    final String underLyingScheme = host.substring(0, i);
    i++;
    final String underLyingHost = i == host.length()? null: host.substring(i);
    int underLyingPort = rawURI.getPort();
    String auth = (underLyingHost == null && underLyingPort == -1)?
                  null:(underLyingHost+
                      (underLyingPort == -1 ? "" : ":"+underLyingPort));
    URI tmp = null;
    if (rawURI.getQuery() != null) {
      // query component not allowed
      throw new IOException("query component in Path not supported  " + rawURI);
    }
    try {
      tmp = new URI(underLyingScheme, auth, rawURI.getPath(), 
            rawURI.getQuery(), rawURI.getFragment());
    } catch (URISyntaxException e) {
      // should not happen; the components were taken from a valid URI
    }
    return tmp;
  }
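
  /*
   * For illustration (hypothetical host names): a raw HAR URI of
   *   har://hdfs-namenode.example.com:8020/user/alice/logs.har/2012/file.txt
   * decodes to the underlying URI
   *   hdfs://namenode.example.com:8020/user/alice/logs.har/2012/file.txt
   * while a har:/// URI with no authority decodes to the default
   * filesystem URI taken from the configuration.
   */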

  private static String decodeString(String str)
    throws UnsupportedEncodingException {
    return URLDecoder.decode(str, "UTF-8");
  }

  private String decodeFileName(String fname) 
    throws UnsupportedEncodingException {
    int version = metadata.getVersion();
    if (version == 2 || version == 3){
      return decodeString(fname);
    }
    return fname;
  }

  /**
   * Return the top level archive path.
   */
  @Override
  public Path getWorkingDirectory() {
    return new Path(uri.toString());
  }

  /**
   * Create a har-specific authority of the form
   * underlyingscheme-host:port.
   * @param underLyingUri the URI of the underlying
   * filesystem
   * @return har specific auth
   */
  private String getHarAuth(URI underLyingUri) {
    String auth = underLyingUri.getScheme() + "-";
    if (underLyingUri.getHost() != null) {
      auth += underLyingUri.getHost() + ":";
      if (underLyingUri.getPort() != -1) {
        auth +=  underLyingUri.getPort();
      }
    }
    else {
      auth += ":";
    }
    return auth;
  }
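
  /*
   * For illustration (hypothetical host): an underlying URI of
   * hdfs://namenode.example.com:8020 yields the har authority
   * "hdfs-namenode.example.com:8020", which is what appears as the
   * authority of har:// paths served by this filesystem.
   */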

  /**
   * Returns the uri of this filesystem.
   * The uri is of the form
   * har://underlyingfsscheme-host:port/pathintheunderlyingfs
   */
  @Override
  public URI getUri() {
    return this.uri;
  }

  /**
   * Returns the path inside the har filesystem,
   * i.e. the path relative to the root of the archive.
   * @param path the fully qualified path in the har filesystem.
   * @return the path relative to the archive root, or null if
   * the path is not inside this archive.
   */
  private Path getPathInHar(Path path) {
    Path harPath = new Path(path.toUri().getPath());
    if (archivePath.compareTo(harPath) == 0)
      return new Path(Path.SEPARATOR);
    Path tmp = new Path(harPath.getName());
    Path parent = harPath.getParent();
    while (!(parent.compareTo(archivePath) == 0)) {
      if (parent.toString().equals(Path.SEPARATOR)) {
        tmp = null;
        break;
      }
      tmp = new Path(parent.getName(), tmp);
      parent = parent.getParent();
    }
    if (tmp != null) 
      tmp = new Path(Path.SEPARATOR, tmp);
    return tmp;
  }
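
  /*
   * For illustration (hypothetical paths): with archivePath set to
   * /user/alice/logs.har, getPathInHar maps
   *   /user/alice/logs.har            -> /
   *   /user/alice/logs.har/2012/a.txt -> /2012/a.txt
   * and returns null for a path outside the archive.
   */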

  // Resolve the path p (which is relative to the archive root)
  // against the initial path, i.e. strip the leading "/" and
  // append p to initial under the har scheme and authority.
  // Parsing and string manipulation are error prone, so just
  // use the Path API to do it.
  private Path makeRelative(String initial, Path p) {
    String scheme = this.uri.getScheme();
    String authority = this.uri.getAuthority();
    Path root = new Path(Path.SEPARATOR);
    if (root.compareTo(p) == 0)
      return new Path(scheme, authority, initial);
    Path retPath = new Path(p.getName());
    Path parent = p.getParent();
    for (int i=0; i < p.depth()-1; i++) {
      retPath = new Path(parent.getName(), retPath);
      parent = parent.getParent();
    }
    return new Path(new Path(scheme, authority, initial),
      retPath.toString());
  }
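
  /*
   * For illustration (hypothetical values): with a filesystem URI of
   * har://hdfs-namenode:8020/user/alice/logs.har, calling
   * makeRelative("/user/alice/logs.har", new Path("/2012/a.txt"))
   * yields har://hdfs-namenode:8020/user/alice/logs.har/2012/a.txt.
   */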

  /* this makes a path qualified in the har filesystem
   * (non-Javadoc)
   * @see org.apache.hadoop.fs.FilterFileSystem#makeQualified(
   * org.apache.hadoop.fs.Path)
   */
  @Override
  public Path makeQualified(Path path) {
    // make sure that we just get the
    // path component
    Path fsPath = path;
    if (!path.isAbsolute()) {
      fsPath = new Path(archivePath, path);
    }

    URI tmpURI = fsPath.toUri();
    // change this to a Har uri
    return new Path(uri.getScheme(), harAuth, tmpURI.getPath());
  }

  /**
   * Fix offset and length of block locations.
   * Note that this method modifies the original array.
   * @param locations block locations of har part file
   * @param start the start of the desired range in the contained file
   * @param len the length of the desired range
   * @param fileOffsetInHar the offset of the desired file in the har part file
   * @return block locations with fixed offset and length
   */
  static BlockLocation[] fixBlockLocations(BlockLocation[] locations,
                                          long start,
                                          long len,
                                          long fileOffsetInHar) {
    // offset 1 past last byte of desired range
    long end = start + len;

    for (BlockLocation location : locations) {
      // offset of part block relative to beginning of desired file
      // (may be negative if file starts in this part block)
      long harBlockStart = location.getOffset() - fileOffsetInHar;
      // offset 1 past last byte of har block relative to beginning of
      // desired file
      long harBlockEnd = harBlockStart + location.getLength();
      
      if (start > harBlockStart) {
        // desired range starts after beginning of this har block
        // fix offset to beginning of relevant range (relative to desired file)
        location.setOffset(start);
        // fix length to relevant portion of har block
        location.setLength(location.getLength() - (start - harBlockStart));
      } else {
        // desired range includes beginning of this har block
        location.setOffset(harBlockStart);
      }
      
      if (harBlockEnd > end) {
        // range ends before end of this har block
        // fix length to remove irrelevant portion at the end
        location.setLength(location.getLength() - (harBlockEnd - end));
      }
    }
    
    return locations;
  }
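
  /*
   * Worked example (illustrative numbers): suppose the archived file starts
   * at fileOffsetInHar = 1100 in the part file, one part-file block covers
   * bytes [1000, 1512) of the part file, and the caller asks for
   * start = 0, len = 300 of the archived file (so end = 300).
   *   harBlockStart = 1000 - 1100 = -100, harBlockEnd = -100 + 512 = 412
   *   start (0) > harBlockStart (-100), so offset becomes 0 and length
   *     becomes 512 - (0 - (-100)) = 412
   *   harBlockEnd (412) > end (300), so length becomes 412 - (412 - 300) = 300
   * leaving a block location with offset 0 and length 300, both relative
   * to the archived file.
   */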

  /**
   * Get block locations from the underlying fs and fix their
   * offsets and lengths.
   * @param file the input filestatus to get block locations
   * @param start the start of the desired range in the contained file
   * @param len the length of the desired range
   * @return block locations for this segment of file
   * @throws IOException
   */
  @Override
  public BlockLocation[] getFileBlockLocations(FileStatus file, long start,
                                               long len) throws IOException {
    HarStatus hstatus = getFileHarStatus(file.getPath());
    Path partPath = new Path(archivePath, hstatus.getPartName());
    FileStatus partStatus = metadata.getPartFileStatus(partPath);

    // get all part blocks that overlap with the desired file blocks
    BlockLocation[] locations = 
      fs.getFileBlockLocations(partStatus,
                               hstatus.getStartIndex() + start, len);

    return fixBlockLocations(locations, start, len, hstatus.getStartIndex());
  }

  /**
   * The hash of the path p inside the filesystem.
   * @param p the path in the harfilesystem
   * @return the hash code of the path.
   */
  public static int getHarHash(Path p) {
    return (p.toString().hashCode() & 0x7fffffff);
  }

  static class Store {
    public Store() {
      begin = end = startHash = endHash = 0;
    }
    public Store(long begin, long end, int startHash, int endHash) {
      this.begin = begin;
      this.end = end;
      this.startHash = startHash;
      this.endHash = endHash;
    }
    public long begin;
    public long end;
    public int startHash;
    public int endHash;
  }

  /**
   * Get filestatuses of all the children of a given directory. This just reads
   * through the index file, line by line, to get all statuses for the children
   * of a directory. It is a brute force way of getting all such filestatuses.
   * 
   * @param parent
   *          the parent path directory
   * @param statuses
   *          the list to add the children filestatuses to
   * @param children
   *          the string list of children for this parent
   */
  private void fileStatusesInIndex(HarStatus parent, List<FileStatus> statuses,
      List<String> children) throws IOException {
    String parentString = parent.getName();
    if (!parentString.endsWith(Path.SEPARATOR)){
        parentString += Path.SEPARATOR;
    }
    Path harPath = new Path(parentString);
    int harlen = harPath.depth();
    final Map<String, FileStatus> cache = new TreeMap<String, FileStatus>();

    for (HarStatus hstatus : metadata.archive.values()) {
      String child = hstatus.getName();
      if ((child.startsWith(parentString))) {
        Path thisPath = new Path(child);
        if (thisPath.depth() == harlen + 1) {
          statuses.add(toFileStatus(hstatus, cache));
        }
      }
    }
  }

  /**
   * Combine the status stored in the index and the underlying status.
   * @param h status stored in the index
   * @param cache cache of the underlying file statuses
   * @return the combined file status
   * @throws IOException
   */
  private FileStatus toFileStatus(HarStatus h,
      Map<String, FileStatus> cache) throws IOException {
    FileStatus underlying = null;
    if (cache != null) {
      underlying = cache.get(h.partName);
    }
    if (underlying == null) {
      final Path p = h.isDir? archivePath: new Path(archivePath, h.partName);
      underlying = fs.getFileStatus(p);
      if (cache != null) {
        cache.put(h.partName, underlying);
      }
    }

    long modTime = 0;
    int version = metadata.getVersion();
    if (version < 3) {
      modTime = underlying.getModificationTime();
    } else if (version == 3) {
      modTime = h.getModificationTime();
    }

    return new FileStatus(
        h.isDir()? 0L: h.getLength(),
        h.isDir(),
        underlying.getReplication(),
        underlying.getBlockSize(),
        modTime,
        underlying.getAccessTime(),
        underlying.getPermission(),
        underlying.getOwner(),
        underlying.getGroup(),
        makeRelative(this.uri.getPath(), new Path(h.name)));
  }

  // A single-line parser for a hadoop archive status entry, as
  // stored on one line of the index files. The format is
  // filename "dir"/"file" partFileName startIndex length
  // <space separated children>
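  //
  // Illustrative entries (simplified; in versions 2 and 3 the names are
  // URL-encoded, and version 3 stores an encoded properties string in
  // place of "none" for directories and as an extra trailing field for
  // files):
  //   /2012/a.txt file part-0 0 1024
  //   /2012 dir none 0 0 a.txt b.txt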
  private class HarStatus {
    boolean isDir;
    String name;
    List<String> children;
    String partName;
    long startIndex;
    long length;
    long modificationTime = 0;

    public HarStatus(String harString) throws UnsupportedEncodingException {
      String[] splits = harString.split(" ");
      this.name = decodeFileName(splits[0]);
      this.isDir = "dir".equals(splits[1]);
      // this is equal to "none" if it's a directory
      this.partName = splits[2];
      this.startIndex = Long.parseLong(splits[3]);
      this.length = Long.parseLong(splits[4]);

      int version = metadata.getVersion();
      String[] propSplits = null;
      // propSplits is used to retrieve the metainformation that Har versions
      // 1 & 2 missed (modification time, permission, owner group).
      // These fields are stored in an encoded string placed in different
      // locations depending on whether it's a file or directory entry.
      // If it's a directory, the string will be placed at the partName
      // location (directories have no partName because they don't have data
      // to be stored). This is done because the number of fields in a
      // directory entry is unbounded (all children are listed at the end).
      // If it's a file, the string will be the last field.
      if (isDir) {
        if (version == 3){
          propSplits = decodeString(this.partName).split(" ");
        }
        children = new ArrayList<String>();
        for (int i = 5; i < splits.length; i++) {
          children.add(decodeFileName(splits[i]));
        }
      } else if (version == 3) {
        propSplits = decodeString(splits[5]).split(" ");
      }

      if (propSplits != null && propSplits.length >= 4) {
        modificationTime = Long.parseLong(propSplits[0]);
        // the fields below are stored in the file but are currently not used
        // by HarFileSystem
        // permission = new FsPermission(Short.parseShort(propSplits[1]));
        // owner = decodeString(propSplits[2]);
        // group = decodeString(propSplits[3]);
      }
    }
    public boolean isDir() {
      return isDir;
    }
    
    public String getName() {
      return name;
    }
    
    public List<String> getChildren() {
      return children;
    }
    public String getFileName() {
      return name;
    }
    public String getPartName() {
      return partName;
    }
    public long getStartIndex() {
      return startIndex;
    }
    public long getLength() {
      return length;
    }
    public long getModificationTime() {
      return modificationTime;
    }
  }

  /**
   * Return the filestatus of files in the har archive.
   * The permissions returned are those of the archive
   * index files; permissions are not persisted
   * while creating a hadoop archive.
   * @param f the path in har filesystem
   * @return filestatus.
   * @throws IOException
   */
  @Override
  public FileStatus getFileStatus(Path f) throws IOException {
    HarStatus hstatus = getFileHarStatus(f);
    return toFileStatus(hstatus, null);
  }

  private HarStatus getFileHarStatus(Path f) throws IOException {
    // look up the file's status in the archive index
    Path p = makeQualified(f);
    Path harPath = getPathInHar(p);
    if (harPath == null) {
      throw new IOException("Invalid file name: " + f + " in " + uri);
    }
    HarStatus hstatus = metadata.archive.get(harPath);
    if (hstatus == null) {
      throw new FileNotFoundException("File: " +  f + " does not exist in " + uri);
    }
    return hstatus;
  }

  /**
   * @return null since no checksum algorithm is implemented.
   */
  @Override
  public FileChecksum getFileChecksum(Path f) {
    return null;
  }

  /**
   * Returns a har input stream which fakes end of
   * file. It reads the index files to get the part
   * file name and the size and start of the file.
   */
  @Override
  public FSDataInputStream open(Path f, int bufferSize) throws IOException {
    // get the fs DataInputStream for the underlying file
    HarStatus hstatus = getFileHarStatus(f);
    // we got it.. woo hooo!!!
    if (hstatus.isDir()) {
      throw new FileNotFoundException(f + " : not a file in " +
                archivePath);
    }
    return new HarFSDataInputStream(fs, new Path(archivePath, 
        hstatus.getPartName()),
        hstatus.getStartIndex(), hstatus.getLength(), bufferSize);
  }

  /*
   * create throws an exception in the Har filesystem.
   * The archive, once created, cannot be changed.
   */
  public FSDataOutputStream create(Path f, int bufferSize) 
                                    throws IOException {
    throw new IOException("Har: Create not allowed");
  }

  @Override
  public FSDataOutputStream create(Path f,
      FsPermission permission,
      boolean overwrite,
      int bufferSize,
      short replication,
      long blockSize,
      Progressable progress) throws IOException {
    throw new IOException("Har: create not allowed.");
  }

  @Override
  public void close() throws IOException {
    if (fs != null) {
      try {
        fs.close();
      } catch(IOException ie) {
        //this might already be closed
        // ignore
      }
    }
  }

  /**
   * Not implemented.
   */
  @Override
  public boolean setReplication(Path src, short replication) throws IOException{
    throw new IOException("Har: setreplication not allowed");
  }

  /**
   * Not implemented.
   */
  @Override
  public boolean delete(Path f, boolean recursive) throws IOException { 
    throw new IOException("Har: delete not allowed");
  }

  /**
   * listStatus returns the children of a directory
   * by looking them up in the index files.
   */
  @Override
  public FileStatus[] listStatus(Path f) throws IOException {
    // look the path up in the archive index and build
    // synthetic filestatuses to return to the client
    List<FileStatus> statuses = new ArrayList<FileStatus>();
    Path tmpPath = makeQualified(f);
    Path harPath = getPathInHar(tmpPath);
    HarStatus hstatus = metadata.archive.get(harPath);
    if (hstatus == null) {
      throw new FileNotFoundException("File " + f + " not found in " + archivePath);
    }
    if (hstatus.isDir()) {
      fileStatusesInIndex(hstatus, statuses, hstatus.children);
    } else {
      statuses.add(toFileStatus(hstatus, null));
    }
    
    return statuses.toArray(new FileStatus[statuses.size()]);
  }

  /**
   * Return the top level archive path.
   */
  @Override
  public Path getHomeDirectory() {
    return new Path(uri.toString());
  }

  @Override
  public void setWorkingDirectory(Path newDir) {
    //does nothing.
  }

  /**
   * Not implemented.
   */
  @Override
  public boolean mkdirs(Path f, FsPermission permission) throws IOException {
    throw new IOException("Har: mkdirs not allowed");
  }

  /**
   * Not implemented.
   */
  @Override
  public void copyFromLocalFile(boolean delSrc, Path src, Path dst) throws 
        IOException {
    throw new IOException("Har: copyfromlocalfile not allowed");
  }

  /**
   * Copies the file in the har filesystem to a local file.
   */
  @Override
  public void copyToLocalFile(boolean delSrc, Path src, Path dst) 
    throws IOException {
    FileUtil.copy(this, src, getLocal(getConf()), dst, false, getConf());
  }

  /**
   * Not implemented.
   */
  @Override
  public Path startLocalOutput(Path fsOutputFile, Path tmpLocalFile) 
    throws IOException {
    throw new IOException("Har: startLocalOutput not allowed");
  }

  /**
   * Not implemented.
   */
  @Override
  public void completeLocalOutput(Path fsOutputFile, Path tmpLocalFile) 
    throws IOException {
    throw new IOException("Har: completeLocalOutput not allowed");
  }

  /**
   * Not implemented.
   */
  @Override
  public void setOwner(Path p, String username, String groupname)
    throws IOException {
    throw new IOException("Har: setowner not allowed");
  }

  /**
   * Not implemented.
   */
  @Override
  public void setPermission(Path p, FsPermission permission) 
    throws IOException {
    throw new IOException("Har: setPermission not allowed");
  }

  /**
   * Hadoop archives input stream. This input stream fakes EOF
   * since archive files are part of bigger part files.
   */
  private static class HarFSDataInputStream extends FSDataInputStream {
    /**
     * Create an input stream that fakes all the reads/positions/seeking.
     */
    private static class HarFsInputStream extends FSInputStream {
      private long position, start, end;
      // The underlying data input stream that the
      // underlying filesystem will return.
      private FSDataInputStream underLyingStream;
      // one byte buffer
      private byte[] oneBytebuff = new byte[1];
      HarFsInputStream(FileSystem fs, Path path, long start,
          long length, int bufferSize) throws IOException {
        underLyingStream = fs.open(path, bufferSize);
        underLyingStream.seek(start);
        // the start of this file in the part file
        this.start = start;
        // the position pointer in the part file
        this.position = start;
        // the end pointer in the part file
        this.end = start + length;
      }
      
      @Override
      public synchronized int available() throws IOException {
        long remaining = end - underLyingStream.getPos();
        if (remaining > (long)Integer.MAX_VALUE) {
          return Integer.MAX_VALUE;
        }
        return (int) remaining;
      }
      
      @Override
      public synchronized void close() throws IOException {
        underLyingStream.close();
        super.close();
      }
      
      // not implemented
      @Override
      public void mark(int readLimit) {
        // do nothing
      }
      
      /**
       * reset is not implemented
       */
      @Override
      public void reset() throws IOException {
        throw new IOException("reset not implemented.");
      }
      
      @Override
      public synchronized int read() throws IOException {
        int ret = read(oneBytebuff, 0, 1);
        return (ret <= 0) ? -1: (oneBytebuff[0] & 0xff);
      }
      
      @Override
      public synchronized int read(byte[] b) throws IOException {
        // delegate to read(byte[], int, int), which already advances position
        return read(b, 0, b.length);
      }
      
      /**
       * Reads up to len bytes, but never past the faked end of file.
       */
      @Override
      public synchronized int read(byte[] b, int offset, int len) 
        throws IOException {
        int newlen = len;
        int ret = -1;
        if (position + len > end) {
          newlen = (int) (end - position);
        }
        // end case
        if (newlen == 0) 
          return ret;
        ret = underLyingStream.read(b, offset, newlen);
        if (ret > 0) {
          position += ret;
        }
        return ret;
      }
      
      @Override
      public synchronized long skip(long n) throws IOException {
        long tmpN = n;
        if (tmpN > 0) {
          if (position + tmpN > end) {
            tmpN = end - position;
          }
          underLyingStream.seek(tmpN + position);
          position += tmpN;
          return tmpN;
        }
        return (tmpN < 0)? -1 : 0;
      }
      
      @Override
      public synchronized long getPos() throws IOException {
        return (position - start);
      }
      
      @Override
      public synchronized void seek(long pos) throws IOException {
        if (pos < 0 || (start + pos > end)) {
          throw new IOException("Failed to seek: EOF");
        }
        position = start + pos;
        underLyingStream.seek(position);
      }

      @Override
      public boolean seekToNewSource(long targetPos) throws IOException {
        // no need to implement this: HDFS itself does
        // seekToNewSource while reading.
        return false;
      }
      
      /**
       * Implement positioned read.
       */
      @Override
      public int read(long pos, byte[] b, int offset, int length) 
      throws IOException {
        int nlength = length;
        if (start + nlength + pos > end) {
          nlength = (int) (end - (start + pos));
        }
        return underLyingStream.read(pos + start , b, offset, nlength);
      }
      
      /**
       * Implement positioned readFully.
       */
      @Override
      public void readFully(long pos, byte[] b, int offset, int length) 
      throws IOException {
        if (start + length + pos > end) {
          throw new IOException("Not enough bytes to read.");
        }
        underLyingStream.readFully(pos + start, b, offset, length);
      }
      
      @Override
      public void readFully(long pos, byte[] b) throws IOException {
          readFully(pos, b, 0, b.length);
      }
      
    }
  
    /**
     * Constructor for har input stream.
     * @param fs the underlying filesystem
     * @param p the path in the underlying filesystem
     * @param start the start position in the part file
     * @param length the length of valid data in the part file
     * @param bufsize the buffer size
     * @throws IOException
     */
    public HarFSDataInputStream(FileSystem fs, Path  p, long start, 
        long length, int bufsize) throws IOException {
        super(new HarFsInputStream(fs, p, start, length, bufsize));
    }

    /**
     * Constructor for har input stream.
     * @param fs the underlying filesystem
     * @param p the path in the underlying file system
     * @param start the start position in the part file
     * @param length the length of valid data in the part file.
     * @throws IOException
     */
    public HarFSDataInputStream(FileSystem fs, Path  p, long start, long length)
      throws IOException {
        super(new HarFsInputStream(fs, p, start, length, 0));
    }
  }

  private class HarMetaData {
    private FileSystem fs;
    private int version;
    // the masterIndex of the archive
    private Path masterIndexPath;
    // the index file
    private Path archiveIndexPath;

    private long masterIndexTimestamp;
    private long archiveIndexTimestamp;

    List<Store> stores = new ArrayList<Store>();
    Map<Path, HarStatus> archive = new HashMap<Path, HarStatus>();
    private Map<Path, FileStatus> partFileStatuses = new HashMap<Path, FileStatus>();

    public HarMetaData(FileSystem fs, Path masterIndexPath, Path archiveIndexPath) {
      this.fs = fs;
      this.masterIndexPath = masterIndexPath;
      this.archiveIndexPath = archiveIndexPath;
    }

    public FileStatus getPartFileStatus(Path partPath) throws IOException {
      FileStatus status;
      status = partFileStatuses.get(partPath);
      if (status == null) {
        status = fs.getFileStatus(partPath);
        partFileStatuses.put(partPath, status);
      }
      return status;
    }

    public long getMasterIndexTimestamp() {
      return masterIndexTimestamp;
    }

    public long getArchiveIndexTimestamp() {
      return archiveIndexTimestamp;
    }

    private int getVersion() {
      return version;
    }

    private void parseMetaData() throws IOException {
      FSDataInputStream in = fs.open(masterIndexPath);
      FileStatus masterStat = fs.getFileStatus(masterIndexPath);
      masterIndexTimestamp = masterStat.getModificationTime();
      LineReader lin = new LineReader(in, getConf());
      Text line = new Text();
      long read = lin.readLine(line);

      // the first line contains the version of the index file
      String versionLine = line.toString();
      String[] arr = versionLine.split(" ");
      version = Integer.parseInt(arr[0]);
      // make it always backwards-compatible
      if (this.version > HarFileSystem.VERSION) {
        throw new IOException("Invalid version " + 
            this.version + " expected " + HarFileSystem.VERSION);
      }

      // each subsequent line contains a hashcode range and the byte range
      // of the corresponding section of the archive index
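      // (for illustration, a line might look like "0 536870911 0 21432",
      // meaning entries whose path hash falls in [0, 536870911] live in
      // bytes [0, 21432) of the _index file)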
      String[] readStr = null;
      while(read < masterStat.getLen()) {
        int b = lin.readLine(line);
        read += b;
        readStr = line.toString().split(" ");
        int startHash = Integer.parseInt(readStr[0]);
        int endHash  = Integer.parseInt(readStr[1]);
        stores.add(new Store(Long.parseLong(readStr[2]), 
            Long.parseLong(readStr[3]), startHash,
            endHash));
        line.clear();
      }
      try {
        // close the master index
        lin.close();
      } catch(IOException io){
        // do nothing just a read.
      }

      FSDataInputStream aIn = fs.open(archiveIndexPath);
      FileStatus archiveStat = fs.getFileStatus(archiveIndexPath);
      archiveIndexTimestamp = archiveStat.getModificationTime();
      LineReader aLin;

      // now start reading the real index file
      for (Store s: stores) {
        read = 0;
        aIn.seek(s.begin);
        aLin = new LineReader(aIn, getConf());
        while (read + s.begin < s.end) {
          int tmp = aLin.readLine(line);
          read += tmp;
          String lineFeed = line.toString();
          String[] parsed = lineFeed.split(" ");
          parsed[0] = decodeFileName(parsed[0]);
          archive.put(new Path(parsed[0]), new HarStatus(lineFeed));
          line.clear();
        }
      }
      try {
        // close the archive index
        aIn.close();
      } catch(IOException io) {
        // do nothing just a read.
      }
    }
  }
}