001    /**
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     *     http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing, software
013     * distributed under the License is distributed on an "AS IS" BASIS,
014     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015     * See the License for the specific language governing permissions and
016     * limitations under the License.
017     */
018    package org.apache.hadoop.fs;
019    
020    import java.io.FileNotFoundException;
021    import java.io.IOException;
022    import java.io.UnsupportedEncodingException;
023    import java.net.URI;
024    import java.net.URISyntaxException;
025    import java.net.URLDecoder;
026    import java.util.ArrayList;
027    import java.util.List;
028    import java.util.Map;
029    import java.util.TreeMap;
030    import java.util.HashMap;
031    import java.util.concurrent.ConcurrentHashMap;
032    
033    import org.apache.commons.logging.Log;
034    import org.apache.commons.logging.LogFactory;
035    import org.apache.hadoop.conf.Configuration;
036    import org.apache.hadoop.fs.permission.FsPermission;
037    import org.apache.hadoop.io.IOUtils;
038    import org.apache.hadoop.io.Text;
039    import org.apache.hadoop.util.LineReader;
040    import org.apache.hadoop.util.Progressable;
041    
/**
 * This is an implementation of the Hadoop Archive (HAR)
 * filesystem. A Hadoop Archive has index files of the form
 * _index* and data files of the form part-*. The index files
 * store the metadata of the archived files and come in two
 * forms, _masterindex and _index. The master index is a level
 * of indirection into the index file that makes lookups faster:
 * the index file is sorted by the hash code of the paths it
 * contains, and the master index maps ranges of hash codes to
 * positions in the index file.
 */
055    public class HarFileSystem extends FilterFileSystem {
056    
057      private static final Log LOG = LogFactory.getLog(HarFileSystem.class);
058    
059      public static final int VERSION = 3;
060    
061      private static final Map<URI, HarMetaData> harMetaCache =
062          new ConcurrentHashMap<URI, HarMetaData>();
063    
064      // uri representation of this Har filesystem
065      private URI uri;
066      // the top level path of the archive
067      // in the underlying file system
068      private Path archivePath;
069      // the har auth
070      private String harAuth;
071    
072      // pointer into the static metadata cache
073      private HarMetaData metadata;
074    
  /**
   * Public default constructor for the HAR filesystem.
   */
079      public HarFileSystem() {
080      }
081    
  /**
   * Return the protocol scheme for the FileSystem.
   *
   * @return <code>har</code>
   */
088      @Override
089      public String getScheme() {
090        return "har";
091      }
092    
  /**
   * Constructor to create a HarFileSystem with an
   * underlying filesystem.
   * @param fs the underlying filesystem
   */
098      public HarFileSystem(FileSystem fs) {
099        super(fs);
100      }
101      
  /**
   * Initialize a Har filesystem per har archive. The
   * archive home directory is the top level directory
   * in the underlying filesystem that contains the HAR archive.
   * Be careful with this method: you do not want to keep
   * creating new FileSystem instances on every call to
   * path.getFileSystem().
   * The URI of a Har filesystem is either
   * har://underlyingfsscheme-host:port/archivepath
   * or har:///archivepath; the latter form uses the default
   * filesystem from the configuration as the underlying
   * filesystem.
   */
115      @Override
116      public void initialize(URI name, Configuration conf) throws IOException {
117        // decode the name
118        URI underLyingURI = decodeHarURI(name, conf);
    // we decoded the underlying URI - now check that the
    // given path really points inside a har archive
121        Path harPath = archivePath(
122          new Path(name.getScheme(), name.getAuthority(), name.getPath()));
123        if (harPath == null) { 
124          throw new IOException("Invalid path for the Har Filesystem. " + 
125                               name.toString());
126        }
127        if (fs == null) {
128          fs = FileSystem.get(underLyingURI, conf);
129        }
130        uri = harPath.toUri();
131        archivePath = new Path(uri.getPath());
132        harAuth = getHarAuth(underLyingURI);
    // check that the underlying fs contains
    // the index files
135        Path masterIndexPath = new Path(archivePath, "_masterindex");
136        Path archiveIndexPath = new Path(archivePath, "_index");
137        if (!fs.exists(masterIndexPath) || !fs.exists(archiveIndexPath)) {
138          throw new IOException("Invalid path for the Har Filesystem. " +
139              "No index file in " + harPath);
140        }
141    
142        metadata = harMetaCache.get(uri);
143        if (metadata != null) {
144          FileStatus mStat = fs.getFileStatus(masterIndexPath);
145          FileStatus aStat = fs.getFileStatus(archiveIndexPath);
146          if (mStat.getModificationTime() != metadata.getMasterIndexTimestamp() ||
147              aStat.getModificationTime() != metadata.getArchiveIndexTimestamp()) {
148            // the archive has been overwritten since we last read it
149            // remove the entry from the meta data cache
150            metadata = null;
151            harMetaCache.remove(uri);
152          }
153        }
154        if (metadata == null) {
155          metadata = new HarMetaData(fs, masterIndexPath, archiveIndexPath);
156          metadata.parseMetaData();
157          harMetaCache.put(uri, metadata);
158        }
159      }
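
  /*
   * A minimal usage sketch (illustration only, not part of the class): how a
   * client would typically read a file stored in a Hadoop Archive through
   * this filesystem. The namenode address and archive path are hypothetical.
   *
   *   Configuration conf = new Configuration();
   *   Path file = new Path(
   *       "har://hdfs-namenode.example.com:8020/user/alice/logs.har/2009/part-1.txt");
   *   FileSystem harFs = file.getFileSystem(conf);  // an initialized HarFileSystem
   *   FSDataInputStream in = harFs.open(file);
   *   try {
   *     IOUtils.copyBytes(in, System.out, conf, false);  // read the archived bytes
   *   } finally {
   *     in.close();
   *   }
   */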
160    
  // get the version of the filesystem from the masterindex file
  // the version is currently not useful since it's the first version
  // of archives
164      public int getHarVersion() throws IOException {
165        if (metadata != null) {
166          return metadata.getVersion();
167        }
168        else {
169          throw new IOException("Invalid meta data for the Har Filesystem");
170        }
171      }
172    
  /*
   * Find the archive path within the given path: the deepest
   * path (the path itself or one of its ancestors) whose last
   * segment ends with .har. Returns null if there is no such path.
   */
179      private Path archivePath(Path p) {
180        Path retPath = null;
181        Path tmp = p;
182        for (int i=0; i< p.depth(); i++) {
183          if (tmp.toString().endsWith(".har")) {
184            retPath = tmp;
185            break;
186          }
187          tmp = tmp.getParent();
188        }
189        return retPath;
190      }
191    
  /**
   * Decode the raw URI to get the underlying URI.
   * @param rawURI raw Har URI
   * @param conf the configuration used to look up the default filesystem
   * @return filtered URI of the underlying fileSystem
   */
197      private URI decodeHarURI(URI rawURI, Configuration conf) throws IOException {
198        String tmpAuth = rawURI.getAuthority();
    // no authority was specified, so the default filesystem
    // configured in conf is the underlying filesystem; return its URI
    if (tmpAuth == null) {
205          return FileSystem.getDefaultUri(conf);
206        }
207        String authority = rawURI.getAuthority();
208        if (authority == null) {
209          throw new IOException("URI: " + rawURI
210              + " is an invalid Har URI since authority==null."
211              + "  Expecting har://<scheme>-<host>/<path>.");
212        }
213     
214        int i = authority.indexOf('-');
215        if (i < 0) {
216          throw new IOException("URI: " + rawURI
217              + " is an invalid Har URI since '-' not found."
218              + "  Expecting har://<scheme>-<host>/<path>.");
219        }
220     
221        if (rawURI.getQuery() != null) {
222          // query component not allowed
223          throw new IOException("query component in Path not supported  " + rawURI);
224        }
225     
226        URI tmp = null;
227     
228        try {
229          // convert <scheme>-<host> to <scheme>://<host>
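      // As an illustration (hypothetical host and port): an authority of
      // "hdfs-namenode.example.com:8020" becomes the base URI
      // "hdfs://namenode.example.com:8020"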
230          URI baseUri = new URI(authority.replaceFirst("-", "://"));
231     
232          tmp = new URI(baseUri.getScheme(), baseUri.getAuthority(),
233                rawURI.getPath(), rawURI.getQuery(), rawURI.getFragment());
234        } catch (URISyntaxException e) {
235          throw new IOException("URI: " + rawURI
236              + " is an invalid Har URI. Expecting har://<scheme>-<host>/<path>.");
237        }
238        return tmp;
239      }
240    
241      private static String decodeString(String str)
242        throws UnsupportedEncodingException {
243        return URLDecoder.decode(str, "UTF-8");
244      }
245    
246      private String decodeFileName(String fname) 
247        throws UnsupportedEncodingException {
248        int version = metadata.getVersion();
249        if (version == 2 || version == 3){
250          return decodeString(fname);
251        }
252        return fname;
253      }
254    
  /**
   * Return the top level archive path as the working directory.
   */
258      @Override
259      public Path getWorkingDirectory() {
260        return new Path(uri.toString());
261      }
262      
  /**
   * Create a har specific auth of the form
   * underlyingscheme-host:port.
   * @param underLyingUri the URI of the underlying filesystem
   * @return har specific auth
   */
270      private String getHarAuth(URI underLyingUri) {
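    // As an illustration (hypothetical host and port): an underlying URI of
    // hdfs://namenode.example.com:8020 yields the auth string
    // "hdfs-namenode.example.com:8020"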
271        String auth = underLyingUri.getScheme() + "-";
272        if (underLyingUri.getHost() != null) {
273          auth += underLyingUri.getHost() + ":";
274          if (underLyingUri.getPort() != -1) {
275            auth +=  underLyingUri.getPort();
276          }
277        }
278        else {
279          auth += ":";
280        }
281        return auth;
282      }
283      
  /**
   * Returns the uri of this filesystem.
   * The uri is of the form
   * har://underlyingfsscheme-host:port/pathintheunderlyingfs
   */
289      @Override
290      public URI getUri() {
291        return this.uri;
292      }
293      
  /**
   * Returns the path relative to the root of the har
   * filesystem, i.e. the portion of the given path below
   * the archive path.
   * @param path the fully qualified path in the har filesystem.
   * @return the relative path inside the har filesystem, or null
   *         if the path does not lie under the archive path.
   */
302      private Path getPathInHar(Path path) {
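    // As an illustration (hypothetical paths): with an archivePath of
    // /user/alice/logs.har, the path
    // har://hdfs-host:port/user/alice/logs.har/2009/part-1.txt
    // maps to /2009/part-1.txt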
303        Path harPath = new Path(path.toUri().getPath());
304        if (archivePath.compareTo(harPath) == 0)
305          return new Path(Path.SEPARATOR);
306        Path tmp = new Path(harPath.getName());
307        Path parent = harPath.getParent();
308        while (!(parent.compareTo(archivePath) == 0)) {
309          if (parent.toString().equals(Path.SEPARATOR)) {
310            tmp = null;
311            break;
312          }
313          tmp = new Path(parent.getName(), tmp);
314          parent = parent.getParent();
315        }
316        if (tmp != null) 
317          tmp = new Path(Path.SEPARATOR, tmp);
318        return tmp;
319      }
320      
  // Resolve the relative version of p (essentially dropping its leading
  // separator) against the initial path to build a fully qualified har
  // path. Parsing and string manipulation would be error prone, so the
  // Path API is used instead; an illustrative example follows.
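  // As an illustration (hypothetical values):
  // makeRelative("/user/alice/logs.har", new Path("/2009/part-1.txt"))
  // returns har://<harAuth>/user/alice/logs.har/2009/part-1.txt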
325      private Path makeRelative(String initial, Path p) {
326        String scheme = this.uri.getScheme();
327        String authority = this.uri.getAuthority();
328        Path root = new Path(Path.SEPARATOR);
329        if (root.compareTo(p) == 0)
330          return new Path(scheme, authority, initial);
331        Path retPath = new Path(p.getName());
332        Path parent = p.getParent();
333        for (int i=0; i < p.depth()-1; i++) {
334          retPath = new Path(parent.getName(), retPath);
335          parent = parent.getParent();
336        }
337        return new Path(new Path(scheme, authority, initial),
338          retPath.toString());
339      }
340      
341      /* this makes a path qualified in the har filesystem
342       * (non-Javadoc)
343       * @see org.apache.hadoop.fs.FilterFileSystem#makeQualified(
344       * org.apache.hadoop.fs.Path)
345       */
346      @Override
347      public Path makeQualified(Path path) {
348        // make sure that we just get the 
349        // path component 
350        Path fsPath = path;
351        if (!path.isAbsolute()) {
352          fsPath = new Path(archivePath, path);
353        }
354    
355        URI tmpURI = fsPath.toUri();
356        //change this to Har uri 
357        return new Path(uri.getScheme(), harAuth, tmpURI.getPath());
358      }
359    
360      /**
361       * Fix offset and length of block locations.
362       * Note that this method modifies the original array.
363       * @param locations block locations of har part file
364       * @param start the start of the desired range in the contained file
365       * @param len the length of the desired range
366       * @param fileOffsetInHar the offset of the desired file in the har part file
367       * @return block locations with fixed offset and length
368       */  
369      static BlockLocation[] fixBlockLocations(BlockLocation[] locations,
370                                              long start,
371                                              long len,
372                                              long fileOffsetInHar) {
373        // offset 1 past last byte of desired range
374        long end = start + len;
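
    // A worked example with made-up numbers: suppose the archived file starts
    // at byte 800 of the part file (fileOffsetInHar = 800) and the caller asks
    // for start = 0, len = 400 (so end = 400). A part block covering bytes
    // [0, 1000) gets harBlockStart = -800 and is trimmed to offset 0,
    // length 200; the next block covering [1000, 2000) gets
    // harBlockStart = 200 and is trimmed to offset 200, length 200.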
375    
376        for (BlockLocation location : locations) {
377          // offset of part block relative to beginning of desired file
378          // (may be negative if file starts in this part block)
379          long harBlockStart = location.getOffset() - fileOffsetInHar;
380          // offset 1 past last byte of har block relative to beginning of
381          // desired file
382          long harBlockEnd = harBlockStart + location.getLength();
383          
384          if (start > harBlockStart) {
385            // desired range starts after beginning of this har block
386            // fix offset to beginning of relevant range (relative to desired file)
387            location.setOffset(start);
388            // fix length to relevant portion of har block
389            location.setLength(location.getLength() - (start - harBlockStart));
390          } else {
391            // desired range includes beginning of this har block
392            location.setOffset(harBlockStart);
393          }
394          
395          if (harBlockEnd > end) {
396            // range ends before end of this har block
397            // fix length to remove irrelevant portion at the end
398            location.setLength(location.getLength() - (harBlockEnd - end));
399          }
400        }
401        
402        return locations;
403      }
404      
405      /**
406       * Get block locations from the underlying fs and fix their
407       * offsets and lengths.
408       * @param file the input filestatus to get block locations
409       * @param start the start of the desired range in the contained file
410       * @param len the length of the desired range
411       * @return block locations for this segment of file
412       * @throws IOException
413       */
414      @Override
415      public BlockLocation[] getFileBlockLocations(FileStatus file, long start,
416                                                   long len) throws IOException {
417        HarStatus hstatus = getFileHarStatus(file.getPath());
418        Path partPath = new Path(archivePath, hstatus.getPartName());
419        FileStatus partStatus = metadata.getPartFileStatus(partPath);
420    
421        // get all part blocks that overlap with the desired file blocks
422        BlockLocation[] locations = 
423          fs.getFileBlockLocations(partStatus,
424                                   hstatus.getStartIndex() + start, len);
425    
426        return fixBlockLocations(locations, start, len, hstatus.getStartIndex());
427      }
428      
  /**
   * The hash of the path p inside the har filesystem.
   * @param p the path in the har filesystem
   * @return the hash code of the path.
   */
435      public static int getHarHash(Path p) {
436        return (p.toString().hashCode() & 0x7fffffff);
437      }
438      
439      static class Store {
440        public Store() {
441          begin = end = startHash = endHash = 0;
442        }
443        public Store(long begin, long end, int startHash, int endHash) {
444          this.begin = begin;
445          this.end = end;
446          this.startHash = startHash;
447          this.endHash = endHash;
448        }
449        public long begin;
450        public long end;
451        public int startHash;
452        public int endHash;
453      }
454      
  /**
   * Get filestatuses of all the children of a given directory. This scans
   * all the entries in the parsed archive index and collects the statuses
   * of the immediate children of the directory. It is a brute-force way
   * of getting all such filestatuses.
   * 
   * @param parent
   *          the parent path directory
   * @param statuses
   *          the list to add the children filestatuses to
   * @param children
   *          the string list of children for this parent
   */
469      private void fileStatusesInIndex(HarStatus parent, List<FileStatus> statuses,
470          List<String> children) throws IOException {
471        String parentString = parent.getName();
472        if (!parentString.endsWith(Path.SEPARATOR)){
473            parentString += Path.SEPARATOR;
474        }
475        Path harPath = new Path(parentString);
476        int harlen = harPath.depth();
477        final Map<String, FileStatus> cache = new TreeMap<String, FileStatus>();
478    
479        for (HarStatus hstatus : metadata.archive.values()) {
480          String child = hstatus.getName();
481          if ((child.startsWith(parentString))) {
482            Path thisPath = new Path(child);
483            if (thisPath.depth() == harlen + 1) {
484              statuses.add(toFileStatus(hstatus, cache));
485            }
486          }
487        }
488      }
489    
490      /**
491       * Combine the status stored in the index and the underlying status. 
492       * @param h status stored in the index
493       * @param cache caching the underlying file statuses
494       * @return the combined file status
495       * @throws IOException
496       */
497      private FileStatus toFileStatus(HarStatus h,
498          Map<String, FileStatus> cache) throws IOException {
499        FileStatus underlying = null;
500        if (cache != null) {
501          underlying = cache.get(h.partName);
502        }
503        if (underlying == null) {
504          final Path p = h.isDir? archivePath: new Path(archivePath, h.partName);
505          underlying = fs.getFileStatus(p);
506          if (cache != null) {
507            cache.put(h.partName, underlying);
508          }
509        }
510    
511        long modTime = 0;
512        int version = metadata.getVersion();
513        if (version < 3) {
514          modTime = underlying.getModificationTime();
515        } else if (version == 3) {
516          modTime = h.getModificationTime();
517        }
518    
519        return new FileStatus(
520            h.isDir()? 0L: h.getLength(),
521            h.isDir(),
522            underlying.getReplication(),
523            underlying.getBlockSize(),
524            modTime,
525            underlying.getAccessTime(),
526            underlying.getPermission(),
527            underlying.getOwner(),
528            underlying.getGroup(),
529            makeRelative(this.uri.getPath(), new Path(h.name)));
530      }
531    
  // A parser for a hadoop archive status entry, which is stored
  // as a single line in the index files.
  // The format is of the form:
  // filename "dir"/"file" partFileName startIndex length
  // <space separated children>
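  //
  // An illustrative (hypothetical) file entry and directory entry in the
  // pre-version-3 layout, where a directory's partName is "none":
  //   /dir1/file1 file part-0 0 1024
  //   /dir1 dir none 0 0 file1 file2
  // In versions 2 and 3 the names are URL-encoded, and version 3 additionally
  // stores an encoded "modtime permission owner group" field (in the partName
  // slot for directories, or as a trailing field for files).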
537      private class HarStatus {
538        boolean isDir;
539        String name;
540        List<String> children;
541        String partName;
542        long startIndex;
543        long length;
544        long modificationTime = 0;
545    
546        public HarStatus(String harString) throws UnsupportedEncodingException {
547          String[] splits = harString.split(" ");
548          this.name = decodeFileName(splits[0]);
      this.isDir = "dir".equals(splits[1]);
      // this is equal to "none" if it's a directory
      this.partName = splits[2];
552          this.startIndex = Long.parseLong(splits[3]);
553          this.length = Long.parseLong(splits[4]);
554    
555          int version = metadata.getVersion();
556          String[] propSplits = null;
557          // propSplits is used to retrieve the metainformation that Har versions
558          // 1 & 2 missed (modification time, permission, owner group).
559          // These fields are stored in an encoded string placed in different
560          // locations depending on whether it's a file or directory entry.
561          // If it's a directory, the string will be placed at the partName
562          // location (directories have no partName because they don't have data
563          // to be stored). This is done because the number of fields in a
564          // directory entry is unbounded (all children are listed at the end)
565          // If it's a file, the string will be the last field.
566          if (isDir) {
567            if (version == 3){
568              propSplits = decodeString(this.partName).split(" ");
569            }
570            children = new ArrayList<String>();
571            for (int i = 5; i < splits.length; i++) {
572              children.add(decodeFileName(splits[i]));
573            }
574          } else if (version == 3) {
575            propSplits = decodeString(splits[5]).split(" ");
576          }
577    
578          if (propSplits != null && propSplits.length >= 4) {
579            modificationTime = Long.parseLong(propSplits[0]);
580            // the fields below are stored in the file but are currently not used
581            // by HarFileSystem
582            // permission = new FsPermission(Short.parseShort(propSplits[1]));
583            // owner = decodeString(propSplits[2]);
584            // group = decodeString(propSplits[3]);
585          }
586        }
587        public boolean isDir() {
588          return isDir;
589        }
590        
591        public String getName() {
592          return name;
593        }
594        public String getPartName() {
595          return partName;
596        }
597        public long getStartIndex() {
598          return startIndex;
599        }
600        public long getLength() {
601          return length;
602        }
603        public long getModificationTime() {
604          return modificationTime;
605        }
606      }
607      
  /**
   * Return the filestatus of files in the har archive.
   * The permissions returned are those of the archive
   * index files, since file permissions are not persisted
   * when a hadoop archive is created.
   * @param f the path in the har filesystem
   * @return filestatus.
   * @throws IOException
   */
617      @Override
618      public FileStatus getFileStatus(Path f) throws IOException {
619        HarStatus hstatus = getFileHarStatus(f);
620        return toFileStatus(hstatus, null);
621      }
622    
623      private HarStatus getFileHarStatus(Path f) throws IOException {
624        // get the fs DataInputStream for the underlying file
625        // look up the index.
626        Path p = makeQualified(f);
627        Path harPath = getPathInHar(p);
628        if (harPath == null) {
629          throw new IOException("Invalid file name: " + f + " in " + uri);
630        }
631        HarStatus hstatus = metadata.archive.get(harPath);
632        if (hstatus == null) {
633          throw new FileNotFoundException("File: " +  f + " does not exist in " + uri);
634        }
635        return hstatus;
636      }
637    
638      /**
639       * @return null since no checksum algorithm is implemented.
640       */
641      @Override
642      public FileChecksum getFileChecksum(Path f) {
643        return null;
644      }
645    
646      /**
647       * Returns a har input stream which fakes end of 
648       * file. It reads the index files to get the part 
649       * file name and the size and start of the file.
650       */
651      @Override
652      public FSDataInputStream open(Path f, int bufferSize) throws IOException {
653        // get the fs DataInputStream for the underlying file
654        HarStatus hstatus = getFileHarStatus(f);
655        // we got it.. woo hooo!!! 
656        if (hstatus.isDir()) {
657          throw new FileNotFoundException(f + " : not a file in " +
658                    archivePath);
659        }
660        return new HarFSDataInputStream(fs, new Path(archivePath, 
661            hstatus.getPartName()),
662            hstatus.getStartIndex(), hstatus.getLength(), bufferSize);
663      }
664     
665      @Override
666      public FSDataOutputStream create(Path f,
667          FsPermission permission,
668          boolean overwrite,
669          int bufferSize,
670          short replication,
671          long blockSize,
672          Progressable progress) throws IOException {
673        throw new IOException("Har: create not allowed.");
674      }
675      
676      @Override
677      public void close() throws IOException {
678        if (fs != null) {
679          try {
680            fs.close();
681          } catch(IOException ie) {
682            //this might already be closed
683            // ignore
684          }
685        }
686      }
687      
688      /**
689       * Not implemented.
690       */
691      @Override
692      public boolean setReplication(Path src, short replication) throws IOException{
693        throw new IOException("Har: setreplication not allowed");
694      }
695      
696      /**
697       * Not implemented.
698       */
699      @Override
700      public boolean delete(Path f, boolean recursive) throws IOException { 
701        throw new IOException("Har: delete not allowed");
702      }
703      
704      /**
705       * liststatus returns the children of a directory 
706       * after looking up the index files.
707       */
708      @Override
709      public FileStatus[] listStatus(Path f) throws IOException {
    // look the path up in the archive index;
    // we will create fake filestatuses to return
    // to the client
714        List<FileStatus> statuses = new ArrayList<FileStatus>();
715        Path tmpPath = makeQualified(f);
716        Path harPath = getPathInHar(tmpPath);
717        HarStatus hstatus = metadata.archive.get(harPath);
718        if (hstatus == null) {
719          throw new FileNotFoundException("File " + f + " not found in " + archivePath);
720        }
721        if (hstatus.isDir()) {
722          fileStatusesInIndex(hstatus, statuses, hstatus.children);
723        } else {
724          statuses.add(toFileStatus(hstatus, null));
725        }
726        
727        return statuses.toArray(new FileStatus[statuses.size()]);
728      }
729      
730      /**
731       * return the top level archive path.
732       */
733      @Override
734      public Path getHomeDirectory() {
735        return new Path(uri.toString());
736      }
737      
738      @Override
739      public void setWorkingDirectory(Path newDir) {
740        //does nothing.
741      }
742      
743      /**
744       * not implemented.
745       */
746      @Override
747      public boolean mkdirs(Path f, FsPermission permission) throws IOException {
748        throw new IOException("Har: mkdirs not allowed");
749      }
750      
751      /**
752       * not implemented.
753       */
754      @Override
755      public void copyFromLocalFile(boolean delSrc, Path src, Path dst) throws 
756            IOException {
757        throw new IOException("Har: copyfromlocalfile not allowed");
758      }
759      
760      /**
761       * copies the file in the har filesystem to a local file.
762       */
763      @Override
764      public void copyToLocalFile(boolean delSrc, Path src, Path dst) 
765        throws IOException {
766        FileUtil.copy(this, src, getLocal(getConf()), dst, false, getConf());
767      }
768      
769      /**
770       * not implemented.
771       */
772      @Override
773      public Path startLocalOutput(Path fsOutputFile, Path tmpLocalFile) 
774        throws IOException {
775        throw new IOException("Har: startLocalOutput not allowed");
776      }
777      
778      /**
779       * not implemented.
780       */
781      @Override
782      public void completeLocalOutput(Path fsOutputFile, Path tmpLocalFile) 
783        throws IOException {
784        throw new IOException("Har: completeLocalOutput not allowed");
785      }
786      
787      /**
788       * not implemented.
789       */
790      @Override
791      public void setOwner(Path p, String username, String groupname)
792        throws IOException {
793        throw new IOException("Har: setowner not allowed");
794      }
795    
796      /**
797       * Not implemented.
798       */
799      @Override
800      public void setPermission(Path p, FsPermission permisssion) 
801        throws IOException {
802        throw new IOException("Har: setPermission not allowed");
803      }
804      
805      /**
806       * Hadoop archives input stream. This input stream fakes EOF 
807       * since archive files are part of bigger part files.
808       */
809      private static class HarFSDataInputStream extends FSDataInputStream {
810        /**
811         * Create an input stream that fakes all the reads/positions/seeking.
812         */
813        private static class HarFsInputStream extends FSInputStream {
814          private long position, start, end;
815          //The underlying data input stream that the
816          // underlying filesystem will return.
817          private FSDataInputStream underLyingStream;
818          //one byte buffer
819          private byte[] oneBytebuff = new byte[1];
820          HarFsInputStream(FileSystem fs, Path path, long start,
821              long length, int bufferSize) throws IOException {
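        // Illustrative numbers (hypothetical): for a file archived at byte
        // 4096 of the part file with length 100, start = position = 4096 and
        // end = 4196; getPos() and seek() below are interpreted relative to
        // start, so seek(10) moves the underlying stream to part-file byte 4106.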
822            underLyingStream = fs.open(path, bufferSize);
823            underLyingStream.seek(start);
824            // the start of this file in the part file
825            this.start = start;
826            // the position pointer in the part file
827            this.position = start;
828            // the end pointer in the part file
829            this.end = start + length;
830          }
831          
832          @Override
833          public synchronized int available() throws IOException {
834            long remaining = end - underLyingStream.getPos();
835            if (remaining > (long)Integer.MAX_VALUE) {
836              return Integer.MAX_VALUE;
837            }
838            return (int) remaining;
839          }
840          
841          @Override
842          public synchronized  void close() throws IOException {
843            underLyingStream.close();
844            super.close();
845          }
846          
847          //not implemented
848          @Override
849          public void mark(int readLimit) {
850            // do nothing 
851          }
852          
853          /**
854           * reset is not implemented
855           */
856          @Override
857          public void reset() throws IOException {
858            throw new IOException("reset not implemented.");
859          }
860          
861          @Override
862          public synchronized int read() throws IOException {
863            int ret = read(oneBytebuff, 0, 1);
864            return (ret <= 0) ? -1: (oneBytebuff[0] & 0xff);
865          }
866          
      @Override
      public synchronized int read(byte[] b) throws IOException {
        // the three-argument read() below already advances the position,
        // so do not advance it again here
        return read(b, 0, b.length);
      }
875          
      /**
       * Read up to len bytes, never reading past the logical
       * end of the archived file.
       */
879          @Override
880          public synchronized int read(byte[] b, int offset, int len) 
881            throws IOException {
882            int newlen = len;
883            int ret = -1;
884            if (position + len > end) {
885              newlen = (int) (end - position);
886            }
887            // end case
888            if (newlen == 0) 
889              return ret;
        ret = underLyingStream.read(b, offset, newlen);
        // do not advance the position when the read hits EOF (returns -1)
        if (ret > 0) {
          position += ret;
        }
892            return ret;
893          }
894          
895          @Override
896          public synchronized long skip(long n) throws IOException {
897            long tmpN = n;
898            if (tmpN > 0) {
899              if (position + tmpN > end) {
900                tmpN = end - position;
901              }
902              underLyingStream.seek(tmpN + position);
903              position += tmpN;
904              return tmpN;
905            }
906            return (tmpN < 0)? -1 : 0;
907          }
908          
909          @Override
910          public synchronized long getPos() throws IOException {
911            return (position - start);
912          }
913          
914          @Override
915          public synchronized void seek(long pos) throws IOException {
916            if (pos < 0 || (start + pos > end)) {
917              throw new IOException("Failed to seek: EOF");
918            }
919            position = start + pos;
920            underLyingStream.seek(position);
921          }
922    
923          @Override
924          public boolean seekToNewSource(long targetPos) throws IOException {
925            //do not need to implement this
926            // hdfs in itself does seektonewsource 
927            // while reading.
928            return false;
929          }
930          
931          /**
932           * implementing position readable. 
933           */
934          @Override
935          public int read(long pos, byte[] b, int offset, int length) 
936          throws IOException {
937            int nlength = length;
938            if (start + nlength + pos > end) {
939              nlength = (int) (end - (start + pos));
940            }
941            return underLyingStream.read(pos + start , b, offset, nlength);
942          }
943          
944          /**
945           * position readable again.
946           */
947          @Override
948          public void readFully(long pos, byte[] b, int offset, int length) 
949          throws IOException {
950            if (start + length + pos > end) {
951              throw new IOException("Not enough bytes to read.");
952            }
953            underLyingStream.readFully(pos + start, b, offset, length);
954          }
955          
956          @Override
957          public void readFully(long pos, byte[] b) throws IOException {
958              readFully(pos, b, 0, b.length);
959          }
960          
961        }
962      
963        /**
964         * constructors for har input stream.
965         * @param fs the underlying filesystem
966         * @param p The path in the underlying filesystem
967         * @param start the start position in the part file
968         * @param length the length of valid data in the part file
969         * @param bufsize the buffer size
970         * @throws IOException
971         */
972        public HarFSDataInputStream(FileSystem fs, Path  p, long start, 
973            long length, int bufsize) throws IOException {
974            super(new HarFsInputStream(fs, p, start, length, bufsize));
975        }
976    
977        /**
978         * constructor for har input stream.
979         * @param fs the underlying filesystem
980         * @param p the path in the underlying file system
981         * @param start the start position in the part file
982         * @param length the length of valid data in the part file.
983         * @throws IOException
984         */
985        public HarFSDataInputStream(FileSystem fs, Path  p, long start, long length)
986          throws IOException {
987            super(new HarFsInputStream(fs, p, start, length, 0));
988        }
989      }
990    
991      private class HarMetaData {
992        private FileSystem fs;
993        private int version;
994        // the masterIndex of the archive
995        private Path masterIndexPath;
996        // the index file 
997        private Path archiveIndexPath;
998    
999        private long masterIndexTimestamp;
1000        private long archiveIndexTimestamp;
1001    
1002        List<Store> stores = new ArrayList<Store>();
1003        Map<Path, HarStatus> archive = new HashMap<Path, HarStatus>();
1004        private Map<Path, FileStatus> partFileStatuses = new HashMap<Path, FileStatus>();
1005    
1006        public HarMetaData(FileSystem fs, Path masterIndexPath, Path archiveIndexPath) {
1007          this.fs = fs;
1008          this.masterIndexPath = masterIndexPath;
1009          this.archiveIndexPath = archiveIndexPath;
1010        }
1011    
1012        public FileStatus getPartFileStatus(Path partPath) throws IOException {
1013          FileStatus status;
1014          status = partFileStatuses.get(partPath);
1015          if (status == null) {
1016            status = fs.getFileStatus(partPath);
1017            partFileStatuses.put(partPath, status);
1018          }
1019          return status;
1020        }
1021    
1022        public long getMasterIndexTimestamp() {
1023          return masterIndexTimestamp;
1024        }
1025    
1026        public long getArchiveIndexTimestamp() {
1027          return archiveIndexTimestamp;
1028        }
1029    
1030        private int getVersion() {
1031          return version;
1032        }
1033    
1034        private void parseMetaData() throws IOException {
1035          Text line;
1036          long read;
1037          FSDataInputStream in = null;
1038          LineReader lin = null;
1039    
1040          try {
1041            in = fs.open(masterIndexPath);
1042            FileStatus masterStat = fs.getFileStatus(masterIndexPath);
1043            masterIndexTimestamp = masterStat.getModificationTime();
1044            lin = new LineReader(in, getConf());
1045            line = new Text();
1046            read = lin.readLine(line);
1047    
1048            // the first line contains the version of the index file
1049            String versionLine = line.toString();
1050            String[] arr = versionLine.split(" ");
1051            version = Integer.parseInt(arr[0]);
        // reject index files written by a newer, incompatible version
1053            if (this.version > HarFileSystem.VERSION) {
1054              throw new IOException("Invalid version " + 
1055                  this.version + " expected " + HarFileSystem.VERSION);
1056            }
1057    
        // each remaining line contains a hashcode range and the byte
        // range of the corresponding block of entries in the _index file
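        // e.g. a (hypothetical) line "0 1000000 0 2353" would mean that the
        // entries for paths whose hash falls between 0 and 1000000 occupy
        // bytes 0 (inclusive) to 2353 (exclusive) of the _index file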
1059            String[] readStr = null;
1060            while(read < masterStat.getLen()) {
1061              int b = lin.readLine(line);
1062              read += b;
1063              readStr = line.toString().split(" ");
1064              int startHash = Integer.parseInt(readStr[0]);
1065              int endHash  = Integer.parseInt(readStr[1]);
1066              stores.add(new Store(Long.parseLong(readStr[2]), 
1067                  Long.parseLong(readStr[3]), startHash,
1068                  endHash));
1069              line.clear();
1070            }
1071          } finally {
1072            IOUtils.cleanup(LOG, lin, in);
1073          }
1074    
1075          FSDataInputStream aIn = fs.open(archiveIndexPath);
1076          try {
1077            FileStatus archiveStat = fs.getFileStatus(archiveIndexPath);
1078            archiveIndexTimestamp = archiveStat.getModificationTime();
1079            LineReader aLin;
1080    
1081            // now start reading the real index file
1082            for (Store s: stores) {
1083              read = 0;
1084              aIn.seek(s.begin);
1085              aLin = new LineReader(aIn, getConf());
1086              while (read + s.begin < s.end) {
1087                int tmp = aLin.readLine(line);
1088                read += tmp;
1089                String lineFeed = line.toString();
1090                String[] parsed = lineFeed.split(" ");
1091                parsed[0] = decodeFileName(parsed[0]);
1092                archive.put(new Path(parsed[0]), new HarStatus(lineFeed));
1093                line.clear();
1094              }
1095            }
1096          } finally {
1097            IOUtils.cleanup(LOG, aIn);
1098          }
1099        }
1100      }
1101      
  /*
   * for testing purposes only
   */
1105      HarMetaData getMetadata() {
1106        return metadata;
1107      }
1108    }