/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.fs;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.LineReader;
import org.apache.hadoop.util.Progressable;

/**
 * This is an implementation of the Hadoop Archive
 * filesystem. This archive filesystem has index files
 * of the form _index* and has contents of the form
 * part-*. The index files store the indexes of the
 * real files. The index files are of the form _masterindex
 * and _index. The master index is a level of indirection
 * into the index file that makes lookups faster: the index
 * file is sorted by the hash codes of the paths it contains,
 * and the master index contains pointers to the positions in
 * the index for ranges of hash codes.
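 *
 * <p>Illustrative usage (paths and host names here are hypothetical):
 * <pre>{@code
 * // Open a file stored inside foo.har on HDFS through the har scheme.
 * Path p = new Path("har://hdfs-namenode:8020/user/alice/foo.har/dir/file.txt");
 * FileSystem harFs = p.getFileSystem(conf);
 * FSDataInputStream in = harFs.open(p);
 * }</pre>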
 */

public class HarFileSystem extends FilterFileSystem {
  public static final int VERSION = 3;

  private static final Map<URI, HarMetaData> harMetaCache = new HashMap<URI, HarMetaData>();

  // URI representation of this Har filesystem
  private URI uri;
  // the top level path of the archive
  // in the underlying file system
  private Path archivePath;
  // the har auth
  private String harAuth;

  // pointer into the static metadata cache
  private HarMetaData metadata;

  /**
   * Public construction of HarFileSystem.
   */
  public HarFileSystem() {
  }

  /**
   * Return the protocol scheme for the FileSystem.
   * <p>
   *
   * @return <code>har</code>
   */
  @Override
  public String getScheme() {
    return "har";
  }

  /**
   * Constructor to create a HarFileSystem with an
   * underlying filesystem.
   * @param fs the underlying filesystem
   */
  public HarFileSystem(FileSystem fs) {
    super(fs);
  }

  /**
   * Initialize a Har filesystem per har archive. The
   * archive home directory is the top level directory
   * in the filesystem that contains the HAR archive.
   * Be careful with this method: you do not want to go
   * on creating new filesystem instances per call to
   * path.getFileSystem().
   * The har URI is of the form
   * har://underlyingfsscheme-host:port/archivepath
   * or
   * har:///archivepath. The latter form uses the default
   * filesystem from the configuration as the underlying filesystem.
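   *
   * <p>For example (host name hypothetical), the archive at
   * hdfs://namenode:8020/user/alice/foo.har is addressed as
   * <pre>{@code
   * har://hdfs-namenode:8020/user/alice/foo.har
   * }</pre>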
   */
  public void initialize(URI name, Configuration conf) throws IOException {
    // decode the name
    URI underLyingURI = decodeHarURI(name, conf);
    // we got the right har path - now check if this is
    // truly a har filesystem
    Path harPath = archivePath(
      new Path(name.getScheme(), name.getAuthority(), name.getPath()));
    if (harPath == null) {
      throw new IOException("Invalid path for the Har Filesystem. " +
                           name.toString());
    }
    if (fs == null) {
      fs = FileSystem.get(underLyingURI, conf);
    }
    uri = harPath.toUri();
    archivePath = new Path(uri.getPath());
    harAuth = getHarAuth(underLyingURI);
    // check that the underlying fs contains
    // the index files
    Path masterIndexPath = new Path(archivePath, "_masterindex");
    Path archiveIndexPath = new Path(archivePath, "_index");
    if (!fs.exists(masterIndexPath) || !fs.exists(archiveIndexPath)) {
      throw new IOException("Invalid path for the Har Filesystem. " +
          "No index file in " + harPath);
    }

    metadata = harMetaCache.get(uri);
    if (metadata != null) {
      FileStatus mStat = fs.getFileStatus(masterIndexPath);
      FileStatus aStat = fs.getFileStatus(archiveIndexPath);
      if (mStat.getModificationTime() != metadata.getMasterIndexTimestamp() ||
          aStat.getModificationTime() != metadata.getArchiveIndexTimestamp()) {
        // the archive has been overwritten since we last read it;
        // remove the entry from the metadata cache
        metadata = null;
        harMetaCache.remove(uri);
      }
    }
    if (metadata == null) {
      metadata = new HarMetaData(fs, masterIndexPath, archiveIndexPath);
      metadata.parseMetaData();
      harMetaCache.put(uri, metadata);
    }
  }

  // get the version of the filesystem from the masterindex file
  // the version is currently not useful since it's the first version
  // of archives
  public int getHarVersion() throws IOException {
    if (metadata != null) {
      return metadata.getVersion();
    }
    else {
      throw new IOException("Invalid meta data for the Har Filesystem");
    }
  }

  /*
   * Find the parent path that is the
   * archive path within p: the deepest
   * path component that ends with .har
   * is the path that will be returned.
   */
  private Path archivePath(Path p) {
    Path retPath = null;
    Path tmp = p;
    for (int i=0; i< p.depth(); i++) {
      if (tmp.toString().endsWith(".har")) {
        retPath = tmp;
        break;
      }
      tmp = tmp.getParent();
    }
    return retPath;
  }
  /**
   * Decode the raw URI to get the underlying URI.
   * @param rawURI raw Har URI
   * @param conf the configuration, used to resolve the default filesystem
   * @return filtered URI of the underlying fileSystem
   * @throws IOException if the raw URI is not a valid Har URI
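   *
   * <p>For example (host names hypothetical):
   * <pre>{@code
   * har://hdfs-namenode:8020/user/foo.har  ->  hdfs://namenode:8020/user/foo.har
   * har:///user/foo.har                    ->  the default filesystem URI
   * }</pre>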
   */
  private URI decodeHarURI(URI rawURI, Configuration conf) throws IOException {
    String tmpAuth = rawURI.getAuthority();
    // we are using the default file
    // system in the config
    // so create an underlying uri and
    // return it
    if (tmpAuth == null) {
      //create a path
      return FileSystem.getDefaultUri(conf);
    }
    String host = rawURI.getHost();
    if (host == null) {
      throw new IOException("URI: " + rawURI
          + " is an invalid Har URI since host==null."
          + "  Expecting har://<scheme>-<host>/<path>.");
    }
    int i = host.indexOf('-');
    if (i < 0) {
      throw new IOException("URI: " + rawURI
          + " is an invalid Har URI since '-' not found."
          + "  Expecting har://<scheme>-<host>/<path>.");
    }
    final String underLyingScheme = host.substring(0, i);
    i++;
    final String underLyingHost = i == host.length()? null: host.substring(i);
    int underLyingPort = rawURI.getPort();
    String auth = (underLyingHost == null && underLyingPort == -1)?
                  null:(underLyingHost+
                      (underLyingPort == -1 ? "" : ":"+underLyingPort));
    URI tmp = null;
    if (rawURI.getQuery() != null) {
      // query component not allowed
      throw new IOException("query component in Path not supported " + rawURI);
    }
    try {
      tmp = new URI(underLyingScheme, auth, rawURI.getPath(),
            rawURI.getQuery(), rawURI.getFragment());
    } catch (URISyntaxException e) {
        // should not happen; the components come from a valid URI
    }
    return tmp;
  }

  private static String decodeString(String str)
    throws UnsupportedEncodingException {
    return URLDecoder.decode(str, "UTF-8");
  }

  private String decodeFileName(String fname)
    throws UnsupportedEncodingException {
    int version = metadata.getVersion();
    if (version == 2 || version == 3){
      return decodeString(fname);
    }
    return fname;
  }

  /**
   * Return the top level archive path as the working directory.
   */
  public Path getWorkingDirectory() {
    return new Path(uri.toString());
  }

  /**
   * Create a har specific auth of the form
   * underlyingscheme-host:port.
   * @param underLyingUri the URI of the underlying
   * filesystem
   * @return har specific auth
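   *
   * <p>For example (host name hypothetical):
   * <pre>{@code
   * hdfs://namenode:8020  ->  hdfs-namenode:8020
   * file:///              ->  file-:
   * }</pre>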
   */
  private String getHarAuth(URI underLyingUri) {
    String auth = underLyingUri.getScheme() + "-";
    if (underLyingUri.getHost() != null) {
      auth += underLyingUri.getHost() + ":";
      if (underLyingUri.getPort() != -1) {
        auth +=  underLyingUri.getPort();
      }
    }
    else {
      auth += ":";
    }
    return auth;
  }

  /**
   * Returns the uri of this filesystem.
   * The uri is of the form
   * har://underlyingfsscheme-host:port/pathintheunderlyingfs
   */
  @Override
  public URI getUri() {
    return this.uri;
  }

  /**
   * This method returns the path
   * inside the har filesystem, i.e.
   * the path relative to the
   * archive root.
   * @param path the fully qualified path in the har filesystem.
   * @return relative path in the filesystem.
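   *
   * <p>For example, with the archive path /user/foo.har (paths hypothetical):
   * <pre>{@code
   * /user/foo.har/dir/file  ->  /dir/file
   * /user/foo.har           ->  /
   * }</pre>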
   */
  private Path getPathInHar(Path path) {
    Path harPath = new Path(path.toUri().getPath());
    if (archivePath.compareTo(harPath) == 0)
      return new Path(Path.SEPARATOR);
    Path tmp = new Path(harPath.getName());
    Path parent = harPath.getParent();
    while (!(parent.compareTo(archivePath) == 0)) {
      if (parent.toString().equals(Path.SEPARATOR)) {
        tmp = null;
        break;
      }
      tmp = new Path(parent.getName(), tmp);
      parent = parent.getParent();
    }
    if (tmp != null)
      tmp = new Path(Path.SEPARATOR, tmp);
    return tmp;
  }

  // resolve the in-har path p against the archive path `initial`,
  // qualifying it with this filesystem's scheme and authority;
  // basically getting rid of the leading /. Parsing and doing
  // string manipulation is not good - so
  // just use the path api to do it.
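  // For example (paths hypothetical), with initial = "/user/foo.har":
  //   makeRelative("/user/foo.har", new Path("/dir/file"))
  //     -> har://<harAuth>/user/foo.har/dir/file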
  private Path makeRelative(String initial, Path p) {
    String scheme = this.uri.getScheme();
    String authority = this.uri.getAuthority();
    Path root = new Path(Path.SEPARATOR);
    if (root.compareTo(p) == 0)
      return new Path(scheme, authority, initial);
    Path retPath = new Path(p.getName());
    Path parent = p.getParent();
    for (int i=0; i < p.depth()-1; i++) {
      retPath = new Path(parent.getName(), retPath);
      parent = parent.getParent();
    }
    return new Path(new Path(scheme, authority, initial),
      retPath.toString());
  }

  /* this makes a path qualified in the har filesystem
   * (non-Javadoc)
   * @see org.apache.hadoop.fs.FilterFileSystem#makeQualified(
   * org.apache.hadoop.fs.Path)
   */
  @Override
  public Path makeQualified(Path path) {
    // make sure that we just get the
    // path component
    Path fsPath = path;
    if (!path.isAbsolute()) {
      fsPath = new Path(archivePath, path);
    }

    URI tmpURI = fsPath.toUri();
    //change this to Har uri
    return new Path(uri.getScheme(), harAuth, tmpURI.getPath());
  }

  /**
   * Fix offset and length of block locations.
   * Note that this method modifies the original array.
   * @param locations block locations of har part file
   * @param start the start of the desired range in the contained file
   * @param len the length of the desired range
   * @param fileOffsetInHar the offset of the desired file in the har part file
   * @return block locations with fixed offset and length
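   *
   * <p>Worked example (numbers hypothetical): suppose the contained file
   * starts at offset 1000 in the part file (fileOffsetInHar = 1000) and the
   * desired range is start = 500, len = 300. A part block at offset 1200
   * with length 400 covers bytes [200, 600) of the contained file, so it is
   * rewritten to offset 500 and length 400 - (500 - 200) = 100.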
   */
  static BlockLocation[] fixBlockLocations(BlockLocation[] locations,
                                          long start,
                                          long len,
                                          long fileOffsetInHar) {
    // offset 1 past last byte of desired range
    long end = start + len;

    for (BlockLocation location : locations) {
      // offset of part block relative to beginning of desired file
      // (may be negative if file starts in this part block)
      long harBlockStart = location.getOffset() - fileOffsetInHar;
      // offset 1 past last byte of har block relative to beginning of
      // desired file
      long harBlockEnd = harBlockStart + location.getLength();

      if (start > harBlockStart) {
        // desired range starts after beginning of this har block
        // fix offset to beginning of relevant range (relative to desired file)
        location.setOffset(start);
        // fix length to relevant portion of har block
        location.setLength(location.getLength() - (start - harBlockStart));
      } else {
        // desired range includes beginning of this har block
        location.setOffset(harBlockStart);
      }

      if (harBlockEnd > end) {
        // range ends before end of this har block
        // fix length to remove irrelevant portion at the end
        location.setLength(location.getLength() - (harBlockEnd - end));
      }
    }

    return locations;
  }

  /**
   * Get block locations from the underlying fs and fix their
   * offsets and lengths.
   * @param file the input filestatus to get block locations
   * @param start the start of the desired range in the contained file
   * @param len the length of the desired range
   * @return block locations for this segment of file
   * @throws IOException
   */
  @Override
  public BlockLocation[] getFileBlockLocations(FileStatus file, long start,
                                               long len) throws IOException {
    HarStatus hstatus = getFileHarStatus(file.getPath());
    Path partPath = new Path(archivePath, hstatus.getPartName());
    FileStatus partStatus = metadata.getPartFileStatus(partPath);

    // get all part blocks that overlap with the desired file blocks
    BlockLocation[] locations =
      fs.getFileBlockLocations(partStatus,
                               hstatus.getStartIndex() + start, len);

    return fixBlockLocations(locations, start, len, hstatus.getStartIndex());
  }

  /**
   * The hash of the path p inside
   * the har filesystem.
   * @param p the path in the harfilesystem
   * @return the hash code of the path.
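   *
   * <p>For example, {@code getHarHash(new Path("/dir/file"))} equals
   * {@code "/dir/file".hashCode() & 0x7fffffff}, i.e. the path string's
   * hash code masked to be non-negative.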
   */
  public static int getHarHash(Path p) {
    return (p.toString().hashCode() & 0x7fffffff);
  }

  static class Store {
    public Store() {
      begin = end = startHash = endHash = 0;
    }
    public Store(long begin, long end, int startHash, int endHash) {
      this.begin = begin;
      this.end = end;
      this.startHash = startHash;
      this.endHash = endHash;
    }
    public long begin;
    public long end;
    public int startHash;
    public int endHash;
  }

  /**
   * Get filestatuses of all the children of a given directory. This just reads
   * through the index file, line by line, to get all statuses for children
   * of a directory. It's a brute-force way of getting all such filestatuses.
   *
   * @param parent
   *          the parent path directory
   * @param statuses
   *          the list to add the children filestatuses to
   * @param children
   *          the string list of children for this parent
   */
  private void fileStatusesInIndex(HarStatus parent, List<FileStatus> statuses,
      List<String> children) throws IOException {
    String parentString = parent.getName();
    if (!parentString.endsWith(Path.SEPARATOR)){
        parentString += Path.SEPARATOR;
    }
    Path harPath = new Path(parentString);
    int harlen = harPath.depth();
    final Map<String, FileStatus> cache = new TreeMap<String, FileStatus>();

    for (HarStatus hstatus : metadata.archive.values()) {
      String child = hstatus.getName();
      if ((child.startsWith(parentString))) {
        Path thisPath = new Path(child);
        if (thisPath.depth() == harlen + 1) {
          statuses.add(toFileStatus(hstatus, cache));
        }
      }
    }
  }

  /**
   * Combine the status stored in the index and the underlying status.
   * @param h status stored in the index
   * @param cache caching the underlying file statuses
   * @return the combined file status
   * @throws IOException
   */
  private FileStatus toFileStatus(HarStatus h,
      Map<String, FileStatus> cache) throws IOException {
    FileStatus underlying = null;
    if (cache != null) {
      underlying = cache.get(h.partName);
    }
    if (underlying == null) {
      final Path p = h.isDir? archivePath: new Path(archivePath, h.partName);
      underlying = fs.getFileStatus(p);
      if (cache != null) {
        cache.put(h.partName, underlying);
      }
    }

    long modTime = 0;
    int version = metadata.getVersion();
    if (version < 3) {
      modTime = underlying.getModificationTime();
    } else if (version == 3) {
      modTime = h.getModificationTime();
    }

    return new FileStatus(
        h.isDir()? 0L: h.getLength(),
        h.isDir(),
        underlying.getReplication(),
        underlying.getBlockSize(),
        modTime,
        underlying.getAccessTime(),
        underlying.getPermission(),
        underlying.getOwner(),
        underlying.getGroup(),
        makeRelative(this.uri.getPath(), new Path(h.name)));
  }

  // a single line parser for hadoop archives status
  // stored in a single line in the index files
  // the format is of the form
  // filename "dir"/"file" partFileName startIndex length
  // <space separated children>
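  // e.g. (entries hypothetical; version 1/2 layout shown):
  //   /dir dir none 0 0 subdirName fileName
  //   /dir/fileName file part-0 0 1024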
  private class HarStatus {
    boolean isDir;
    String name;
    List<String> children;
    String partName;
    long startIndex;
    long length;
    long modificationTime = 0;

    public HarStatus(String harString) throws UnsupportedEncodingException {
      String[] splits = harString.split(" ");
      this.name = decodeFileName(splits[0]);
      this.isDir = "dir".equals(splits[1]);
      // this is equal to "none" if it's a directory
      this.partName = splits[2];
      this.startIndex = Long.parseLong(splits[3]);
      this.length = Long.parseLong(splits[4]);

      int version = metadata.getVersion();
      String[] propSplits = null;
      // propSplits is used to retrieve the metainformation that Har versions
      // 1 & 2 missed (modification time, permission, owner group).
      // These fields are stored in an encoded string placed in different
      // locations depending on whether it's a file or directory entry.
      // If it's a directory, the string will be placed at the partName
      // location (directories have no partName because they don't have data
      // to be stored). This is done because the number of fields in a
      // directory entry is unbounded (all children are listed at the end).
      // If it's a file, the string will be the last field.
      if (isDir) {
        if (version == 3){
          propSplits = decodeString(this.partName).split(" ");
        }
        children = new ArrayList<String>();
        for (int i = 5; i < splits.length; i++) {
          children.add(decodeFileName(splits[i]));
        }
      } else if (version == 3) {
        propSplits = decodeString(splits[5]).split(" ");
      }

      if (propSplits != null && propSplits.length >= 4) {
        modificationTime = Long.parseLong(propSplits[0]);
        // the fields below are stored in the file but are currently not used
        // by HarFileSystem
        // permission = new FsPermission(Short.parseShort(propSplits[1]));
        // owner = decodeString(propSplits[2]);
        // group = decodeString(propSplits[3]);
      }
    }
    public boolean isDir() {
      return isDir;
    }

    public String getName() {
      return name;
    }

    public List<String> getChildren() {
      return children;
    }
    public String getFileName() {
      return name;
    }
    public String getPartName() {
      return partName;
    }
    public long getStartIndex() {
      return startIndex;
    }
    public long getLength() {
      return length;
    }
    public long getModificationTime() {
      return modificationTime;
    }
  }
  /**
   * Return the filestatus of files in the har archive.
   * The permissions returned are those of the archive
   * index files. The permissions are not persisted
   * while creating a hadoop archive.
   * @param f the path in har filesystem
   * @return filestatus.
   * @throws IOException
   */
  @Override
  public FileStatus getFileStatus(Path f) throws IOException {
    HarStatus hstatus = getFileHarStatus(f);
    return toFileStatus(hstatus, null);
  }

  private HarStatus getFileHarStatus(Path f) throws IOException {
    // get the fs DataInputStream for the underlying file;
    // look up the index.
    Path p = makeQualified(f);
    Path harPath = getPathInHar(p);
    if (harPath == null) {
      throw new IOException("Invalid file name: " + f + " in " + uri);
    }
    HarStatus hstatus = metadata.archive.get(harPath);
    if (hstatus == null) {
      throw new FileNotFoundException("File: " +  f + " does not exist in " + uri);
    }
    return hstatus;
  }

  /**
   * @return null since no checksum algorithm is implemented.
   */
  public FileChecksum getFileChecksum(Path f) {
    return null;
  }

  /**
   * Returns a har input stream which fakes end of
   * file. It reads the index files to get the part
   * file name and the size and start of the file.
   */
  @Override
  public FSDataInputStream open(Path f, int bufferSize) throws IOException {
    // get the fs DataInputStream for the underlying file
    HarStatus hstatus = getFileHarStatus(f);
    // we got it.. woo hooo!!!
    if (hstatus.isDir()) {
      throw new FileNotFoundException(f + " : not a file in " +
                archivePath);
    }
    return new HarFSDataInputStream(fs, new Path(archivePath,
        hstatus.getPartName()),
        hstatus.getStartIndex(), hstatus.getLength(), bufferSize);
  }

  /*
   * create throws an exception in the Har filesystem.
   * The archive, once created, cannot be changed.
   */
  public FSDataOutputStream create(Path f, int bufferSize)
                                    throws IOException {
    throw new IOException("Har: Create not allowed");
  }

  public FSDataOutputStream create(Path f,
      FsPermission permission,
      boolean overwrite,
      int bufferSize,
      short replication,
      long blockSize,
      Progressable progress) throws IOException {
    throw new IOException("Har: create not allowed.");
  }

  @Override
  public void close() throws IOException {
    if (fs != null) {
      try {
        fs.close();
      } catch(IOException ie) {
        //this might already be closed
        // ignore
      }
    }
  }

  /**
   * Not implemented.
   */
  @Override
  public boolean setReplication(Path src, short replication) throws IOException{
    throw new IOException("Har: setreplication not allowed");
  }

  /**
   * Not implemented.
   */
  @Override
  public boolean delete(Path f, boolean recursive) throws IOException {
    throw new IOException("Har: delete not allowed");
  }

  /**
   * listStatus returns the children of a directory
   * after looking up the index files.
   */
  @Override
  public FileStatus[] listStatus(Path f) throws IOException {
    // look the path up in the index;
    // we will create fake filestatuses to return
    // to the client
    List<FileStatus> statuses = new ArrayList<FileStatus>();
    Path tmpPath = makeQualified(f);
    Path harPath = getPathInHar(tmpPath);
    HarStatus hstatus = metadata.archive.get(harPath);
    if (hstatus == null) {
      throw new FileNotFoundException("File " + f + " not found in " + archivePath);
    }
    if (hstatus.isDir()) {
      fileStatusesInIndex(hstatus, statuses, hstatus.children);
    } else {
      statuses.add(toFileStatus(hstatus, null));
    }

    return statuses.toArray(new FileStatus[statuses.size()]);
  }
  /**
   * Return the top level archive path.
   */
  public Path getHomeDirectory() {
    return new Path(uri.toString());
  }

  public void setWorkingDirectory(Path newDir) {
    //does nothing.
  }

  /**
   * Not implemented.
   */
  public boolean mkdirs(Path f, FsPermission permission) throws IOException {
    throw new IOException("Har: mkdirs not allowed");
  }

  /**
   * Not implemented.
   */
  public void copyFromLocalFile(boolean delSrc, Path src, Path dst) throws
        IOException {
    throw new IOException("Har: copyfromlocalfile not allowed");
  }

  /**
   * Copies the file in the har filesystem to a local file.
   */
  public void copyToLocalFile(boolean delSrc, Path src, Path dst)
    throws IOException {
    FileUtil.copy(this, src, getLocal(getConf()), dst, false, getConf());
  }

  /**
   * Not implemented.
   */
  public Path startLocalOutput(Path fsOutputFile, Path tmpLocalFile)
    throws IOException {
    throw new IOException("Har: startLocalOutput not allowed");
  }

  /**
   * Not implemented.
   */
  public void completeLocalOutput(Path fsOutputFile, Path tmpLocalFile)
    throws IOException {
    throw new IOException("Har: completeLocalOutput not allowed");
  }

  /**
   * Not implemented.
   */
  public void setOwner(Path p, String username, String groupname)
    throws IOException {
    throw new IOException("Har: setowner not allowed");
  }

  /**
   * Not implemented.
   */
  public void setPermission(Path p, FsPermission permission)
    throws IOException {
    throw new IOException("Har: setPermission not allowed");
  }

  /**
   * Hadoop archives input stream. This input stream fakes EOF
   * since archive files are part of bigger part files.
   */
  private static class HarFSDataInputStream extends FSDataInputStream {
    /**
     * Create an input stream that fakes all the reads/positions/seeking.
     */
    private static class HarFsInputStream extends FSInputStream {
      private long position, start, end;
      // the underlying data input stream that the
      // underlying filesystem will return
      private FSDataInputStream underLyingStream;
      // one byte buffer
      private byte[] oneBytebuff = new byte[1];
      HarFsInputStream(FileSystem fs, Path path, long start,
          long length, int bufferSize) throws IOException {
        underLyingStream = fs.open(path, bufferSize);
        underLyingStream.seek(start);
        // the start of this file in the part file
        this.start = start;
        // the position pointer in the part file
        this.position = start;
        // the end pointer in the part file
        this.end = start + length;
      }

      public synchronized int available() throws IOException {
        long remaining = end - underLyingStream.getPos();
        if (remaining > (long)Integer.MAX_VALUE) {
          return Integer.MAX_VALUE;
        }
        return (int) remaining;
      }

      public synchronized void close() throws IOException {
        underLyingStream.close();
        super.close();
      }

      //not implemented
      @Override
      public void mark(int readLimit) {
        // do nothing
      }

      /**
       * reset is not implemented
       */
      public void reset() throws IOException {
        throw new IOException("reset not implemented.");
      }

      public synchronized int read() throws IOException {
        int ret = read(oneBytebuff, 0, 1);
        return (ret <= 0) ? -1: (oneBytebuff[0] & 0xff);
      }

      public synchronized int read(byte[] b) throws IOException {
        // the three-argument read below already advances position,
        // so it must not be advanced again here
        return read(b, 0, b.length);
      }

      /**
       * Read up to len bytes, faking EOF at the end of the archived file.
       */
      public synchronized int read(byte[] b, int offset, int len)
        throws IOException {
        int newlen = len;
        int ret = -1;
        if (position + len > end) {
          newlen = (int) (end - position);
        }
        // end case
        if (newlen == 0)
          return ret;
        ret = underLyingStream.read(b, offset, newlen);
        // do not move position backwards when the underlying
        // stream reports EOF (ret == -1)
        if (ret > 0) {
          position += ret;
        }
        return ret;
      }

      public synchronized long skip(long n) throws IOException {
        long tmpN = n;
        if (tmpN > 0) {
          if (position + tmpN > end) {
            tmpN = end - position;
          }
          underLyingStream.seek(tmpN + position);
          position += tmpN;
          return tmpN;
        }
        return (tmpN < 0)? -1 : 0;
      }

      public synchronized long getPos() throws IOException {
        return (position - start);
      }

      public synchronized void seek(long pos) throws IOException {
        if (pos < 0 || (start + pos > end)) {
          throw new IOException("Failed to seek: EOF");
        }
        position = start + pos;
        underLyingStream.seek(position);
      }

      public boolean seekToNewSource(long targetPos) throws IOException {
        // do not need to implement this:
        // hdfs itself does seekToNewSource
        // while reading.
        return false;
      }

      /**
       * implementing position readable.
       */
      public int read(long pos, byte[] b, int offset, int length)
      throws IOException {
        int nlength = length;
        if (start + nlength + pos > end) {
          nlength = (int) (end - (start + pos));
        }
        return underLyingStream.read(pos + start, b, offset, nlength);
      }

      /**
       * position readable again.
       */
      public void readFully(long pos, byte[] b, int offset, int length)
      throws IOException {
        if (start + length + pos > end) {
          throw new IOException("Not enough bytes to read.");
        }
        underLyingStream.readFully(pos + start, b, offset, length);
      }

      public void readFully(long pos, byte[] b) throws IOException {
          readFully(pos, b, 0, b.length);
      }

    }

    /**
     * Constructor for har input stream.
     * @param fs the underlying filesystem
     * @param p the path in the underlying filesystem
     * @param start the start position in the part file
     * @param length the length of valid data in the part file
     * @param bufsize the buffer size
     * @throws IOException
     */
    public HarFSDataInputStream(FileSystem fs, Path  p, long start,
        long length, int bufsize) throws IOException {
        super(new HarFsInputStream(fs, p, start, length, bufsize));
    }

    /**
     * Constructor for har input stream.
     * @param fs the underlying filesystem
     * @param p the path in the underlying file system
     * @param start the start position in the part file
     * @param length the length of valid data in the part file.
     * @throws IOException
     */
    public HarFSDataInputStream(FileSystem fs, Path  p, long start, long length)
      throws IOException {
        super(new HarFsInputStream(fs, p, start, length, 0));
    }
  }

  private class HarMetaData {
    private FileSystem fs;
    private int version;
    // the masterIndex of the archive
    private Path masterIndexPath;
    // the index file
    private Path archiveIndexPath;

    private long masterIndexTimestamp;
    private long archiveIndexTimestamp;

    List<Store> stores = new ArrayList<Store>();
    Map<Path, HarStatus> archive = new HashMap<Path, HarStatus>();
    private Map<Path, FileStatus> partFileStatuses = new HashMap<Path, FileStatus>();

    public HarMetaData(FileSystem fs, Path masterIndexPath, Path archiveIndexPath) {
      this.fs = fs;
      this.masterIndexPath = masterIndexPath;
      this.archiveIndexPath = archiveIndexPath;
    }

    public FileStatus getPartFileStatus(Path partPath) throws IOException {
      FileStatus status;
      status = partFileStatuses.get(partPath);
      if (status == null) {
        status = fs.getFileStatus(partPath);
        partFileStatuses.put(partPath, status);
      }
      return status;
    }

    public long getMasterIndexTimestamp() {
      return masterIndexTimestamp;
    }

    public long getArchiveIndexTimestamp() {
      return archiveIndexTimestamp;
    }

    private int getVersion() {
      return version;
    }

    private void parseMetaData() throws IOException {
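      // The master index layout parsed below (values hypothetical): the
      // first line carries the version, and each following line is
      // "<startHash> <endHash> <beginOffset> <endOffset>", pointing at a
      // byte range of the _index file, e.g.
      //   3
      //   0 1073741823 8 2048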
      FSDataInputStream in = fs.open(masterIndexPath);
      FileStatus masterStat = fs.getFileStatus(masterIndexPath);
      masterIndexTimestamp = masterStat.getModificationTime();
      LineReader lin = new LineReader(in, getConf());
      Text line = new Text();
      long read = lin.readLine(line);

      // the first line contains the version of the index file
      String versionLine = line.toString();
      String[] arr = versionLine.split(" ");
      version = Integer.parseInt(arr[0]);
      // make it always backwards-compatible
      if (this.version > HarFileSystem.VERSION) {
        throw new IOException("Invalid version " +
            this.version + " expected " + HarFileSystem.VERSION);
      }

      // each line contains a hashcode range and the index file name
      String[] readStr = null;
      while(read < masterStat.getLen()) {
        int b = lin.readLine(line);
        read += b;
        readStr = line.toString().split(" ");
        int startHash = Integer.parseInt(readStr[0]);
        int endHash  = Integer.parseInt(readStr[1]);
        stores.add(new Store(Long.parseLong(readStr[2]),
            Long.parseLong(readStr[3]), startHash,
            endHash));
        line.clear();
      }
      try {
        // close the master index
        lin.close();
      } catch(IOException io){
        // do nothing; just a read.
      }

      FSDataInputStream aIn = fs.open(archiveIndexPath);
      FileStatus archiveStat = fs.getFileStatus(archiveIndexPath);
      archiveIndexTimestamp = archiveStat.getModificationTime();
      LineReader aLin;
      // now start reading the real index file
      for (Store s: stores) {
        read = 0;
        aIn.seek(s.begin);
        aLin = new LineReader(aIn, getConf());
        while (read + s.begin < s.end) {
          int tmp = aLin.readLine(line);
          read += tmp;
          String lineFeed = line.toString();
          String[] parsed = lineFeed.split(" ");
          parsed[0] = decodeFileName(parsed[0]);
          archive.put(new Path(parsed[0]), new HarStatus(lineFeed));
          line.clear();
        }
      }
      try {
        // close the archive index
        aIn.close();
      } catch(IOException io) {
        // do nothing; just a read.
      }
    }
  }
}