001    /**
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     *     http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing, software
013     * distributed under the License is distributed on an "AS IS" BASIS,
014     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015     * See the License for the specific language governing permissions and
016     * limitations under the License.
017     */
018    package org.apache.hadoop.fs;
019    
020    import org.apache.commons.logging.Log;
021    import org.apache.commons.logging.LogFactory;
022    import org.apache.hadoop.conf.Configuration;
023    import org.apache.hadoop.fs.permission.FsPermission;
024    import org.apache.hadoop.io.IOUtils;
025    import org.apache.hadoop.io.Text;
026    import org.apache.hadoop.util.LineReader;
027    import org.apache.hadoop.util.Progressable;
028    
029    import java.io.FileNotFoundException;
030    import java.io.IOException;
031    import java.io.UnsupportedEncodingException;
032    import java.net.URI;
033    import java.net.URISyntaxException;
034    import java.net.URLDecoder;
035    import java.util.*;
036    
/**
 * This is an implementation of the Hadoop Archive
 * Filesystem. A Hadoop Archive (HAR) has index files
 * of the form _index* and data files of the form
 * part-*. The index files (_masterindex and _index)
 * store the metadata of the archived files. The
 * master index is a level of indirection into the
 * index file that makes lookups faster: the index
 * file is sorted by the hash code of the paths it
 * contains, and the master index holds pointers to
 * the positions in the index file for ranges of hash codes.
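 *
 * <p>A minimal usage sketch (the host name and paths below are hypothetical):
 * files inside an archive are addressed with a har:// URI and read through
 * the ordinary FileSystem API.
 * <pre>{@code
 * Configuration conf = new Configuration();
 * Path file = new Path(
 *     "har://hdfs-namenode.example.com:8020/user/alice/logs.har/2013/app.log");
 * FileSystem harFs = file.getFileSystem(conf);
 * FSDataInputStream in = harFs.open(file);
 * try {
 *   IOUtils.copyBytes(in, System.out, conf, false); // archives are read-only
 * } finally {
 *   in.close();
 * }
 * }</pre>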
048     */
049    
050    public class HarFileSystem extends FileSystem {
051    
052      private static final Log LOG = LogFactory.getLog(HarFileSystem.class);
053    
054      public static final String METADATA_CACHE_ENTRIES_KEY = "fs.har.metadatacache.entries";
055      public static final int METADATA_CACHE_ENTRIES_DEFAULT = 10;
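  // The size of the shared metadata cache can be tuned through the
  // configuration, e.g. (sketch): conf.setInt(METADATA_CACHE_ENTRIES_KEY, 100);
  // the value is only read when the cache is first created.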
056    
057      public static final int VERSION = 3;
058    
059      private static Map<URI, HarMetaData> harMetaCache;
060    
061      // uri representation of this Har filesystem
062      private URI uri;
063      // the top level path of the archive
064      // in the underlying file system
065      private Path archivePath;
066      // the har auth
067      private String harAuth;
068    
069      // pointer into the static metadata cache
070      private HarMetaData metadata;
071    
072      private FileSystem fs;
073    
  /**
   * Public no-arg constructor. {@link #initialize(URI, Configuration)} must
   * be called to set the underlying filesystem.
   */
077      public HarFileSystem() {
078        // Must call #initialize() method to set the underlying file system
079      }
080    
081      /**
082       * Return the protocol scheme for the FileSystem.
083       * <p/>
084       *
085       * @return <code>har</code>
086       */
087      @Override
088      public String getScheme() {
089        return "har";
090      }
091    
092      /**
093       * Constructor to create a HarFileSystem with an
094       * underlying filesystem.
095       * @param fs underlying file system
096       */
097      public HarFileSystem(FileSystem fs) {
098        this.fs = fs;
099        this.statistics = fs.statistics;
100      }
101     
102      private synchronized void initializeMetadataCache(Configuration conf) {
103        if (harMetaCache == null) {
104          int cacheSize = conf.getInt(METADATA_CACHE_ENTRIES_KEY, METADATA_CACHE_ENTRIES_DEFAULT);
105          harMetaCache = Collections.synchronizedMap(new LruCache<URI, HarMetaData>(cacheSize));
106        }
107      }
108     
  /**
   * Initialize a Har filesystem per har archive. The
   * archive home directory is the top level directory
   * in the filesystem that contains the HAR archive.
   * Be careful with this method: you do not want to keep
   * creating new FileSystem instances per call to
   * path.getFileSystem().
   * The URI of a Har filesystem is either
   * har://underlyingfsscheme-host:port/archivepath
   * or
   * har:///archivepath, in which case the default
   * filesystem from the configuration is used as the
   * underlying filesystem.
   */
122      @Override
123      public void initialize(URI name, Configuration conf) throws IOException {
124        // initialize the metadata cache, if needed
125        initializeMetadataCache(conf);
126    
127        // decode the name
128        URI underLyingURI = decodeHarURI(name, conf);
129        // we got the right har Path- now check if this is 
130        // truly a har filesystem
131        Path harPath = archivePath(
132          new Path(name.getScheme(), name.getAuthority(), name.getPath()));
133        if (harPath == null) { 
134          throw new IOException("Invalid path for the Har Filesystem. " + 
135                               name.toString());
136        }
137        if (fs == null) {
138          fs = FileSystem.get(underLyingURI, conf);
139        }
140        uri = harPath.toUri();
141        archivePath = new Path(uri.getPath());
142        harAuth = getHarAuth(underLyingURI);
143        //check for the underlying fs containing
144        // the index file
145        Path masterIndexPath = new Path(archivePath, "_masterindex");
146        Path archiveIndexPath = new Path(archivePath, "_index");
147        if (!fs.exists(masterIndexPath) || !fs.exists(archiveIndexPath)) {
148          throw new IOException("Invalid path for the Har Filesystem. " +
149              "No index file in " + harPath);
150        }
151    
152        metadata = harMetaCache.get(uri);
153        if (metadata != null) {
154          FileStatus mStat = fs.getFileStatus(masterIndexPath);
155          FileStatus aStat = fs.getFileStatus(archiveIndexPath);
156          if (mStat.getModificationTime() != metadata.getMasterIndexTimestamp() ||
157              aStat.getModificationTime() != metadata.getArchiveIndexTimestamp()) {
158            // the archive has been overwritten since we last read it
159            // remove the entry from the meta data cache
160            metadata = null;
161            harMetaCache.remove(uri);
162          }
163        }
164        if (metadata == null) {
165          metadata = new HarMetaData(fs, masterIndexPath, archiveIndexPath);
166          metadata.parseMetaData();
167          harMetaCache.put(uri, metadata);
168        }
169      }
170    
171      @Override
172      public Configuration getConf() {
173        return fs.getConf();
174      }
175    
  // get the version of the filesystem from the masterindex file
  // the version is currently not useful since it's the first version
  // of archives
179      public int getHarVersion() throws IOException {
180        if (metadata != null) {
181          return metadata.getVersion();
182        }
183        else {
184          throw new IOException("Invalid meta data for the Har Filesystem");
185        }
186      }
187    
  /*
   * Find the ancestor of p (possibly p itself) that is the
   * archive path, i.e. the deepest path whose last segment
   * ends with ".har". Returns null if there is no such path.
   */
194      private Path archivePath(Path p) {
195        Path retPath = null;
196        Path tmp = p;
197        for (int i=0; i< p.depth(); i++) {
198          if (tmp.toString().endsWith(".har")) {
199            retPath = tmp;
200            break;
201          }
202          tmp = tmp.getParent();
203        }
204        return retPath;
205      }
206    
  /**
   * Decode the raw URI to get the URI of the underlying filesystem.
   * @param rawURI raw Har URI
   * @param conf configuration used to resolve the default filesystem
   * @return URI of the underlying filesystem
   * @throws IOException if the raw URI is not a valid Har URI
   */
212      private URI decodeHarURI(URI rawURI, Configuration conf) throws IOException {
    String authority = rawURI.getAuthority();
    // no authority means we are using the default
    // filesystem from the config, so just return its URI
    if (authority == null) {
      return FileSystem.getDefaultUri(conf);
    }
228     
229        int i = authority.indexOf('-');
230        if (i < 0) {
231          throw new IOException("URI: " + rawURI
232              + " is an invalid Har URI since '-' not found."
233              + "  Expecting har://<scheme>-<host>/<path>.");
234        }
235     
236        if (rawURI.getQuery() != null) {
237          // query component not allowed
238          throw new IOException("query component in Path not supported  " + rawURI);
239        }
240     
241        URI tmp;
242        try {
243          // convert <scheme>-<host> to <scheme>://<host>
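      // e.g. (illustrative) the authority "hdfs-namenode.example.com:8020"
      // becomes the base URI "hdfs://namenode.example.com:8020"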
244          URI baseUri = new URI(authority.replaceFirst("-", "://"));
245     
246          tmp = new URI(baseUri.getScheme(), baseUri.getAuthority(),
247                rawURI.getPath(), rawURI.getQuery(), rawURI.getFragment());
248        } catch (URISyntaxException e) {
249          throw new IOException("URI: " + rawURI
250              + " is an invalid Har URI. Expecting har://<scheme>-<host>/<path>.");
251        }
252        return tmp;
253      }
254    
255      private static String decodeString(String str)
256        throws UnsupportedEncodingException {
257        return URLDecoder.decode(str, "UTF-8");
258      }
259    
260      private String decodeFileName(String fname)
261        throws UnsupportedEncodingException {
262        int version = metadata.getVersion();
263        if (version == 2 || version == 3){
264          return decodeString(fname);
265        }
266        return fname;
267      }
268    
269      /**
270       * return the top level archive.
271       */
272      @Override
273      public Path getWorkingDirectory() {
274        return new Path(uri.toString());
275      }
276      
  /**
   * Create a har specific auth string of the form
   * underlyingscheme-host:port, e.g. hdfs-namenode:8020.
   * @param underLyingUri the URI of the underlying
   * filesystem
   * @return har specific auth
   */
284      private String getHarAuth(URI underLyingUri) {
285        String auth = underLyingUri.getScheme() + "-";
286        if (underLyingUri.getHost() != null) {
287          auth += underLyingUri.getHost();
288          if (underLyingUri.getPort() != -1) {
289            auth += ":";
290            auth +=  underLyingUri.getPort();
291          }
292        }
293        else {
294          auth += ":";
295        }
296        return auth;
297      }
298    
299      @Override
300      protected URI getCanonicalUri() {
301        return fs.canonicalizeUri(getUri());
302      }
303    
  /**
   * Returns the URI of this filesystem.
   * The URI is of the form
   * har://underlyingfsscheme-host:port/pathintheunderlyingfs
   */
309      @Override
310      public URI getUri() {
311        return this.uri;
312      }
313      
  /**
   * Returns the path inside the har filesystem,
   * i.e. the path relative to the root of the archive.
   * Returns null if the path does not lie under the archive.
   * @param path the fully qualified path in the har filesystem.
   * @return relative path in the har filesystem, or null.
   */
322      private Path getPathInHar(Path path) {
323        Path harPath = new Path(path.toUri().getPath());
324        if (archivePath.compareTo(harPath) == 0)
325          return new Path(Path.SEPARATOR);
326        Path tmp = new Path(harPath.getName());
327        Path parent = harPath.getParent();
328        while (!(parent.compareTo(archivePath) == 0)) {
329          if (parent.toString().equals(Path.SEPARATOR)) {
330            tmp = null;
331            break;
332          }
333          tmp = new Path(parent.getName(), tmp);
334          parent = parent.getParent();
335        }
336        if (tmp != null) 
337          tmp = new Path(Path.SEPARATOR, tmp);
338        return tmp;
339      }
340      
  // Rebuild the archive-relative path p (which starts with "/") on top of
  // the given initial path. Parsing and string manipulation are error prone,
  // so just use the Path API to do it.
345      private Path makeRelative(String initial, Path p) {
346        String scheme = this.uri.getScheme();
347        String authority = this.uri.getAuthority();
348        Path root = new Path(Path.SEPARATOR);
349        if (root.compareTo(p) == 0)
350          return new Path(scheme, authority, initial);
351        Path retPath = new Path(p.getName());
352        Path parent = p.getParent();
353        for (int i=0; i < p.depth()-1; i++) {
354          retPath = new Path(parent.getName(), retPath);
355          parent = parent.getParent();
356        }
357        return new Path(new Path(scheme, authority, initial),
358          retPath.toString());
359      }
360      
361      /* this makes a path qualified in the har filesystem
362       * (non-Javadoc)
363       * @see org.apache.hadoop.fs.FilterFileSystem#makeQualified(
364       * org.apache.hadoop.fs.Path)
365       */
366      @Override
367      public Path makeQualified(Path path) {
368        // make sure that we just get the 
369        // path component 
370        Path fsPath = path;
371        if (!path.isAbsolute()) {
372          fsPath = new Path(archivePath, path);
373        }
374    
375        URI tmpURI = fsPath.toUri();
376        //change this to Har uri 
377        return new Path(uri.getScheme(), harAuth, tmpURI.getPath());
378      }
379    
380      /**
381       * Fix offset and length of block locations.
382       * Note that this method modifies the original array.
383       * @param locations block locations of har part file
384       * @param start the start of the desired range in the contained file
385       * @param len the length of the desired range
386       * @param fileOffsetInHar the offset of the desired file in the har part file
387       * @return block locations with fixed offset and length
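   *
   * <p>Worked example (illustrative numbers): if the archived file starts at
   * offset 1000 in the part file (fileOffsetInHar = 1000), the desired range
   * is start = 0, len = 500, and a part-file block has offset 512 and length
   * 1024, then harBlockStart = -488 and harBlockEnd = 536, so the block is
   * rewritten to offset 0 and length 500.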
388       */  
389      static BlockLocation[] fixBlockLocations(BlockLocation[] locations,
390                                              long start,
391                                              long len,
392                                              long fileOffsetInHar) {
393        // offset 1 past last byte of desired range
394        long end = start + len;
395    
396        for (BlockLocation location : locations) {
397          // offset of part block relative to beginning of desired file
398          // (may be negative if file starts in this part block)
399          long harBlockStart = location.getOffset() - fileOffsetInHar;
400          // offset 1 past last byte of har block relative to beginning of
401          // desired file
402          long harBlockEnd = harBlockStart + location.getLength();
403          
404          if (start > harBlockStart) {
405            // desired range starts after beginning of this har block
406            // fix offset to beginning of relevant range (relative to desired file)
407            location.setOffset(start);
408            // fix length to relevant portion of har block
409            location.setLength(location.getLength() - (start - harBlockStart));
410          } else {
411            // desired range includes beginning of this har block
412            location.setOffset(harBlockStart);
413          }
414          
415          if (harBlockEnd > end) {
416            // range ends before end of this har block
417            // fix length to remove irrelevant portion at the end
418            location.setLength(location.getLength() - (harBlockEnd - end));
419          }
420        }
421        
422        return locations;
423      }
424      
425      /**
426       * Get block locations from the underlying fs and fix their
427       * offsets and lengths.
428       * @param file the input file status to get block locations
429       * @param start the start of the desired range in the contained file
430       * @param len the length of the desired range
431       * @return block locations for this segment of file
432       * @throws IOException
433       */
434      @Override
435      public BlockLocation[] getFileBlockLocations(FileStatus file, long start,
436                                                   long len) throws IOException {
437        HarStatus hstatus = getFileHarStatus(file.getPath());
438        Path partPath = new Path(archivePath, hstatus.getPartName());
439        FileStatus partStatus = metadata.getPartFileStatus(partPath);
440    
441        // get all part blocks that overlap with the desired file blocks
442        BlockLocation[] locations = 
443          fs.getFileBlockLocations(partStatus,
444                                   hstatus.getStartIndex() + start, len);
445    
446        return fixBlockLocations(locations, start, len, hstatus.getStartIndex());
447      }
448      
  /**
   * The hash of the path p inside the har filesystem.
   * @param p the path in the har filesystem
   * @return the hash code of the path.
   */
454      public static int getHarHash(Path p) {
455        return (p.toString().hashCode() & 0x7fffffff);
456      }
457      
458      static class Store {
459        public Store() {
460          begin = end = startHash = endHash = 0;
461        }
462        public Store(long begin, long end, int startHash, int endHash) {
463          this.begin = begin;
464          this.end = end;
465          this.startHash = startHash;
466          this.endHash = endHash;
467        }
468        public long begin;
469        public long end;
470        public int startHash;
471        public int endHash;
472      }
473      
  /**
   * Get the FileStatus of all the children of a given directory. This simply
   * scans the archive index entries held in memory and picks the ones whose
   * parent is the given directory. It is a brute force way of getting all
   * such FileStatus objects.
   * 
   * @param parent
   *          the parent path directory
   * @param statuses
   *          the list to add the children filestatuses to
   */
484      private void fileStatusesInIndex(HarStatus parent, List<FileStatus> statuses)
485              throws IOException {
486        String parentString = parent.getName();
487        if (!parentString.endsWith(Path.SEPARATOR)){
488            parentString += Path.SEPARATOR;
489        }
490        Path harPath = new Path(parentString);
491        int harlen = harPath.depth();
492        final Map<String, FileStatus> cache = new TreeMap<String, FileStatus>();
493    
494        for (HarStatus hstatus : metadata.archive.values()) {
495          String child = hstatus.getName();
496          if ((child.startsWith(parentString))) {
497            Path thisPath = new Path(child);
498            if (thisPath.depth() == harlen + 1) {
499              statuses.add(toFileStatus(hstatus, cache));
500            }
501          }
502        }
503      }
504    
505      /**
506       * Combine the status stored in the index and the underlying status. 
507       * @param h status stored in the index
508       * @param cache caching the underlying file statuses
509       * @return the combined file status
510       * @throws IOException
511       */
512      private FileStatus toFileStatus(HarStatus h,
513          Map<String, FileStatus> cache) throws IOException {
514        FileStatus underlying = null;
515        if (cache != null) {
516          underlying = cache.get(h.partName);
517        }
518        if (underlying == null) {
519          final Path p = h.isDir? archivePath: new Path(archivePath, h.partName);
520          underlying = fs.getFileStatus(p);
521          if (cache != null) {
522            cache.put(h.partName, underlying);
523          }
524        }
525    
526        long modTime = 0;
527        int version = metadata.getVersion();
528        if (version < 3) {
529          modTime = underlying.getModificationTime();
530        } else if (version == 3) {
531          modTime = h.getModificationTime();
532        }
533    
534        return new FileStatus(
535            h.isDir()? 0L: h.getLength(),
536            h.isDir(),
537            underlying.getReplication(),
538            underlying.getBlockSize(),
539            modTime,
540            underlying.getAccessTime(),
541            underlying.getPermission(),
542            underlying.getOwner(),
543            underlying.getGroup(),
544            makeRelative(this.uri.getPath(), new Path(h.name)));
545      }
546    
  // A parser for the status of a hadoop archive entry, which is stored
  // as a single line in the index files.
  // The format of a line is:
  // filename "dir"/"file" partFileName startIndex length
  // <space separated children>
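  //
  // Illustrative (hypothetical) version 1/2 style entries:
  //   /user/alice dir none 0 0 file1 file2
  //   /user/alice/file1 file part-0 0 1024
  // In version 3 an extra URL-encoded properties field (modification time,
  // permission, owner, group) is appended for files and stored in place of
  // the partName for directories, as described in the constructor below.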
552      private class HarStatus {
553        boolean isDir;
554        String name;
555        List<String> children;
556        String partName;
557        long startIndex;
558        long length;
559        long modificationTime = 0;
560    
561        public HarStatus(String harString) throws UnsupportedEncodingException {
562          String[] splits = harString.split(" ");
563          this.name = decodeFileName(splits[0]);
      this.isDir = "dir".equals(splits[1]);
      // this is equal to "none" if it is a directory (in versions 1 and 2)
      this.partName = splits[2];
567          this.startIndex = Long.parseLong(splits[3]);
568          this.length = Long.parseLong(splits[4]);
569    
570          int version = metadata.getVersion();
571          String[] propSplits = null;
572          // propSplits is used to retrieve the metainformation that Har versions
573          // 1 & 2 missed (modification time, permission, owner group).
574          // These fields are stored in an encoded string placed in different
575          // locations depending on whether it's a file or directory entry.
576          // If it's a directory, the string will be placed at the partName
577          // location (directories have no partName because they don't have data
578          // to be stored). This is done because the number of fields in a
579          // directory entry is unbounded (all children are listed at the end)
580          // If it's a file, the string will be the last field.
581          if (isDir) {
582            if (version == 3){
583              propSplits = decodeString(this.partName).split(" ");
584            }
585            children = new ArrayList<String>();
586            for (int i = 5; i < splits.length; i++) {
587              children.add(decodeFileName(splits[i]));
588            }
589          } else if (version == 3) {
590            propSplits = decodeString(splits[5]).split(" ");
591          }
592    
593          if (propSplits != null && propSplits.length >= 4) {
594            modificationTime = Long.parseLong(propSplits[0]);
595            // the fields below are stored in the file but are currently not used
596            // by HarFileSystem
597            // permission = new FsPermission(Short.parseShort(propSplits[1]));
598            // owner = decodeString(propSplits[2]);
599            // group = decodeString(propSplits[3]);
600          }
601        }
602        public boolean isDir() {
603          return isDir;
604        }
605        
606        public String getName() {
607          return name;
608        }
609        public String getPartName() {
610          return partName;
611        }
612        public long getStartIndex() {
613          return startIndex;
614        }
615        public long getLength() {
616          return length;
617        }
618        public long getModificationTime() {
619          return modificationTime;
620        }
621      }
622      
  /**
   * Return the FileStatus of a file in the har archive.
   * The permissions returned are those of the underlying
   * archive files; the original permissions are not persisted
   * while creating a hadoop archive.
   * @param f the path in har filesystem
   * @return filestatus.
   * @throws IOException
   */
632      @Override
633      public FileStatus getFileStatus(Path f) throws IOException {
634        HarStatus hstatus = getFileHarStatus(f);
635        return toFileStatus(hstatus, null);
636      }
637    
638      private HarStatus getFileHarStatus(Path f) throws IOException {
639        // get the fs DataInputStream for the underlying file
640        // look up the index.
641        Path p = makeQualified(f);
642        Path harPath = getPathInHar(p);
643        if (harPath == null) {
644          throw new IOException("Invalid file name: " + f + " in " + uri);
645        }
646        HarStatus hstatus = metadata.archive.get(harPath);
647        if (hstatus == null) {
648          throw new FileNotFoundException("File: " +  f + " does not exist in " + uri);
649        }
650        return hstatus;
651      }
652    
653      /**
654       * @return null since no checksum algorithm is implemented.
655       */
656      @Override
657      public FileChecksum getFileChecksum(Path f) {
658        return null;
659      }
660    
661      /**
662       * Returns a har input stream which fakes end of 
663       * file. It reads the index files to get the part 
664       * file name and the size and start of the file.
665       */
666      @Override
667      public FSDataInputStream open(Path f, int bufferSize) throws IOException {
668        // get the fs DataInputStream for the underlying file
669        HarStatus hstatus = getFileHarStatus(f);
670        if (hstatus.isDir()) {
671          throw new FileNotFoundException(f + " : not a file in " +
672                    archivePath);
673        }
674        return new HarFSDataInputStream(fs, new Path(archivePath, 
675            hstatus.getPartName()),
676            hstatus.getStartIndex(), hstatus.getLength(), bufferSize);
677      }
678     
679      @Override
680      public FSDataOutputStream create(Path f,
681          FsPermission permission,
682          boolean overwrite,
683          int bufferSize,
684          short replication,
685          long blockSize,
686          Progressable progress) throws IOException {
687        throw new IOException("Har: create not allowed.");
688      }
689    
690      @Override
691      public FSDataOutputStream append(Path f, int bufferSize, Progressable progress) throws IOException {
692        throw new IOException("Har: append not allowed.");
693      }
694    
695      @Override
696      public void close() throws IOException {
697        if (fs != null) {
698          try {
699            fs.close();
700          } catch(IOException ie) {
701            //this might already be closed
702            // ignore
703          }
704        }
705      }
706      
707      /**
708       * Not implemented.
709       */
710      @Override
711      public boolean setReplication(Path src, short replication) throws IOException{
712        throw new IOException("Har: setReplication not allowed");
713      }
714    
715      @Override
716      public boolean rename(Path src, Path dst) throws IOException {
717        throw new IOException("Har: rename not allowed");
718      }
719    
720      @Override
721      public FSDataOutputStream append(Path f) throws IOException {
722        throw new IOException("Har: append not allowed");
723      }
724    
725      /**
726       * Not implemented.
727       */
728      @Override
729      public boolean delete(Path f, boolean recursive) throws IOException { 
730        throw new IOException("Har: delete not allowed");
731      }
732    
733      /**
734       * liststatus returns the children of a directory 
735       * after looking up the index files.
736       */
737      @Override
738      public FileStatus[] listStatus(Path f) throws IOException {
    // need to check whether the path is an entry in the index;
    // get the har status of the requested path and
    // create FileStatus objects for its children to return
    // to the client
743        List<FileStatus> statuses = new ArrayList<FileStatus>();
744        Path tmpPath = makeQualified(f);
745        Path harPath = getPathInHar(tmpPath);
746        HarStatus hstatus = metadata.archive.get(harPath);
747        if (hstatus == null) {
748          throw new FileNotFoundException("File " + f + " not found in " + archivePath);
749        }
750        if (hstatus.isDir()) {
751          fileStatusesInIndex(hstatus, statuses);
752        } else {
753          statuses.add(toFileStatus(hstatus, null));
754        }
755        
756        return statuses.toArray(new FileStatus[statuses.size()]);
757      }
758      
759      /**
760       * return the top level archive path.
761       */
762      @Override
763      public Path getHomeDirectory() {
764        return new Path(uri.toString());
765      }
766    
767      @Override
768      public void setWorkingDirectory(Path newDir) {
769        //does nothing.
770      }
771      
772      /**
773       * not implemented.
774       */
775      @Override
776      public boolean mkdirs(Path f, FsPermission permission) throws IOException {
777        throw new IOException("Har: mkdirs not allowed");
778      }
779      
780      /**
781       * not implemented.
782       */
783      @Override
784      public void copyFromLocalFile(boolean delSrc, Path src, Path dst) throws 
785            IOException {
786        throw new IOException("Har: copyfromlocalfile not allowed");
787      }
788      
789      /**
790       * copies the file in the har filesystem to a local file.
791       */
792      @Override
793      public void copyToLocalFile(boolean delSrc, Path src, Path dst) 
794        throws IOException {
795        FileUtil.copy(this, src, getLocal(getConf()), dst, false, getConf());
796      }
797      
798      /**
799       * not implemented.
800       */
801      @Override
802      public Path startLocalOutput(Path fsOutputFile, Path tmpLocalFile) 
803        throws IOException {
804        throw new IOException("Har: startLocalOutput not allowed");
805      }
806      
807      /**
808       * not implemented.
809       */
810      @Override
811      public void completeLocalOutput(Path fsOutputFile, Path tmpLocalFile) 
812        throws IOException {
813        throw new IOException("Har: completeLocalOutput not allowed");
814      }
815      
816      /**
817       * not implemented.
818       */
819      @Override
820      public void setOwner(Path p, String username, String groupname)
821        throws IOException {
822        throw new IOException("Har: setowner not allowed");
823      }
824    
825      /**
826       * Not implemented.
827       */
828      @Override
829      public void setPermission(Path p, FsPermission permission)
830        throws IOException {
831        throw new IOException("Har: setPermission not allowed");
832      }
833      
834      /**
835       * Hadoop archives input stream. This input stream fakes EOF 
836       * since archive files are part of bigger part files.
837       */
838      private static class HarFSDataInputStream extends FSDataInputStream {
839        /**
840         * Create an input stream that fakes all the reads/positions/seeking.
841         */
842        private static class HarFsInputStream extends FSInputStream
843            implements CanSetDropBehind, CanSetReadahead {
844          private long position, start, end;
845          //The underlying data input stream that the
846          // underlying filesystem will return.
847          private FSDataInputStream underLyingStream;
848          //one byte buffer
849          private byte[] oneBytebuff = new byte[1];
850          HarFsInputStream(FileSystem fs, Path path, long start,
851              long length, int bufferSize) throws IOException {
852            underLyingStream = fs.open(path, bufferSize);
853            underLyingStream.seek(start);
854            // the start of this file in the part file
855            this.start = start;
856            // the position pointer in the part file
857            this.position = start;
858            // the end pointer in the part file
859            this.end = start + length;
860          }
861          
862          @Override
863          public synchronized int available() throws IOException {
864            long remaining = end - underLyingStream.getPos();
865            if (remaining > (long)Integer.MAX_VALUE) {
866              return Integer.MAX_VALUE;
867            }
868            return (int) remaining;
869          }
870          
871          @Override
872          public synchronized  void close() throws IOException {
873            underLyingStream.close();
874            super.close();
875          }
876          
877          //not implemented
878          @Override
879          public void mark(int readLimit) {
880            // do nothing 
881          }
882          
883          /**
884           * reset is not implemented
885           */
886          @Override
887          public void reset() throws IOException {
888            throw new IOException("reset not implemented.");
889          }
890          
891          @Override
892          public synchronized int read() throws IOException {
893            int ret = read(oneBytebuff, 0, 1);
894            return (ret <= 0) ? -1: (oneBytebuff[0] & 0xff);
895          }
896          
      @Override
      public synchronized int read(byte[] b) throws IOException {
        // the three-argument read below already advances position;
        // incrementing it again here would double count the bytes read
        return read(b, 0, b.length);
      }
905          
      /**
       * Read up to len bytes, truncating the request so that it does
       * not read past the end of this file within the part file.
       */
      @Override
      public synchronized int read(byte[] b, int offset, int len) 
        throws IOException {
        int newlen = len;
        int ret = -1;
        if (position + len > end) {
          newlen = (int) (end - position);
        }
        // end case
        if (newlen == 0)
          return ret;
        ret = underLyingStream.read(b, offset, newlen);
        // only advance position when bytes were actually read
        if (ret > 0) {
          position += ret;
        }
        return ret;
      }
924          
925          @Override
926          public synchronized long skip(long n) throws IOException {
927            long tmpN = n;
928            if (tmpN > 0) {
929              if (position + tmpN > end) {
930                tmpN = end - position;
931              }
932              underLyingStream.seek(tmpN + position);
933              position += tmpN;
934              return tmpN;
935            }
936            return (tmpN < 0)? -1 : 0;
937          }
938          
939          @Override
940          public synchronized long getPos() throws IOException {
941            return (position - start);
942          }
943          
944          @Override
945          public synchronized void seek(long pos) throws IOException {
946            if (pos < 0 || (start + pos > end)) {
947              throw new IOException("Failed to seek: EOF");
948            }
949            position = start + pos;
950            underLyingStream.seek(position);
951          }
952    
953          @Override
954          public boolean seekToNewSource(long targetPos) throws IOException {
955            // do not need to implement this
956            // hdfs in itself does seektonewsource
957            // while reading.
958            return false;
959          }
960          
      /**
       * Positioned read, part of the PositionedReadable contract.
       */
964          @Override
965          public int read(long pos, byte[] b, int offset, int length) 
966          throws IOException {
967            int nlength = length;
968            if (start + nlength + pos > end) {
969              nlength = (int) (end - (start + pos));
970            }
971            return underLyingStream.read(pos + start , b, offset, nlength);
972          }
973          
      /**
       * Positioned readFully; fails if the requested range extends past
       * the end of the archived file.
       */
977          @Override
978          public void readFully(long pos, byte[] b, int offset, int length) 
979          throws IOException {
980            if (start + length + pos > end) {
981              throw new IOException("Not enough bytes to read.");
982            }
983            underLyingStream.readFully(pos + start, b, offset, length);
984          }
985          
986          @Override
987          public void readFully(long pos, byte[] b) throws IOException {
988              readFully(pos, b, 0, b.length);
989          }
990    
991          @Override
992          public void setReadahead(Long readahead) throws IOException {
993            underLyingStream.setReadahead(readahead);
994          }
995    
996          @Override
997          public void setDropBehind(Boolean dropBehind) throws IOException {
998            underLyingStream.setDropBehind(dropBehind);
999          }
1000        }
1001      
1002        /**
1003         * constructors for har input stream.
1004         * @param fs the underlying filesystem
1005         * @param p The path in the underlying filesystem
1006         * @param start the start position in the part file
1007         * @param length the length of valid data in the part file
1008         * @param bufsize the buffer size
1009         * @throws IOException
1010         */
1011        public HarFSDataInputStream(FileSystem fs, Path  p, long start, 
1012            long length, int bufsize) throws IOException {
1013            super(new HarFsInputStream(fs, p, start, length, bufsize));
1014        }
1015      }
1016    
1017      private class HarMetaData {
1018        private FileSystem fs;
1019        private int version;
1020        // the masterIndex of the archive
1021        private Path masterIndexPath;
1022        // the index file 
1023        private Path archiveIndexPath;
1024    
1025        private long masterIndexTimestamp;
1026        private long archiveIndexTimestamp;
1027    
1028        List<Store> stores = new ArrayList<Store>();
1029        Map<Path, HarStatus> archive = new HashMap<Path, HarStatus>();
1030        private Map<Path, FileStatus> partFileStatuses = new HashMap<Path, FileStatus>();
1031    
1032        public HarMetaData(FileSystem fs, Path masterIndexPath, Path archiveIndexPath) {
1033          this.fs = fs;
1034          this.masterIndexPath = masterIndexPath;
1035          this.archiveIndexPath = archiveIndexPath;
1036        }
1037    
1038        public FileStatus getPartFileStatus(Path partPath) throws IOException {
1039          FileStatus status;
1040          status = partFileStatuses.get(partPath);
1041          if (status == null) {
1042            status = fs.getFileStatus(partPath);
1043            partFileStatuses.put(partPath, status);
1044          }
1045          return status;
1046        }
1047    
1048        public long getMasterIndexTimestamp() {
1049          return masterIndexTimestamp;
1050        }
1051    
1052        public long getArchiveIndexTimestamp() {
1053          return archiveIndexTimestamp;
1054        }
1055    
1056        private int getVersion() {
1057          return version;
1058        }
1059    
1060        private void parseMetaData() throws IOException {
1061          Text line = new Text();
1062          long read;
1063          FSDataInputStream in = null;
1064          LineReader lin = null;
1065    
1066          try {
1067            in = fs.open(masterIndexPath);
1068            FileStatus masterStat = fs.getFileStatus(masterIndexPath);
1069            masterIndexTimestamp = masterStat.getModificationTime();
1070            lin = new LineReader(in, getConf());
1071            read = lin.readLine(line);
1072    
1073            // the first line contains the version of the index file
1074            String versionLine = line.toString();
1075            String[] arr = versionLine.split(" ");
1076            version = Integer.parseInt(arr[0]);
        // stay backwards-compatible: only reject archives written with a
        // newer version than this reader supports
1078            if (this.version > HarFileSystem.VERSION) {
1079              throw new IOException("Invalid version " + 
1080                  this.version + " expected " + HarFileSystem.VERSION);
1081            }
1082    
        // each line contains a hashcode range and the corresponding
        // begin/end byte offsets in the _index file
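        // e.g. (illustrative) the line "0 2147483647 0 8192" means that
        // entries whose path hashes fall in [0, 2147483647] are stored
        // between byte offsets 0 and 8192 of the _index file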
1084            String[] readStr;
1085            while(read < masterStat.getLen()) {
1086              int b = lin.readLine(line);
1087              read += b;
1088              readStr = line.toString().split(" ");
1089              int startHash = Integer.parseInt(readStr[0]);
1090              int endHash  = Integer.parseInt(readStr[1]);
1091              stores.add(new Store(Long.parseLong(readStr[2]), 
1092                  Long.parseLong(readStr[3]), startHash,
1093                  endHash));
1094              line.clear();
1095            }
1096          } catch (IOException ioe) {
1097            LOG.warn("Encountered exception ", ioe);
1098            throw ioe;
1099          } finally {
1100            IOUtils.cleanup(LOG, lin, in);
1101          }
1102    
1103          FSDataInputStream aIn = fs.open(archiveIndexPath);
1104          try {
1105            FileStatus archiveStat = fs.getFileStatus(archiveIndexPath);
1106            archiveIndexTimestamp = archiveStat.getModificationTime();
1107            LineReader aLin;
1108    
1109            // now start reading the real index file
1110            for (Store s: stores) {
1111              read = 0;
1112              aIn.seek(s.begin);
1113              aLin = new LineReader(aIn, getConf());
1114              while (read + s.begin < s.end) {
1115                int tmp = aLin.readLine(line);
1116                read += tmp;
1117                String lineFeed = line.toString();
1118                String[] parsed = lineFeed.split(" ");
1119                parsed[0] = decodeFileName(parsed[0]);
1120                archive.put(new Path(parsed[0]), new HarStatus(lineFeed));
1121                line.clear();
1122              }
1123            }
1124          } finally {
1125            IOUtils.cleanup(LOG, aIn);
1126          }
1127        }
1128      }
1129      
1130      /*
1131       * testing purposes only:
1132       */
1133      HarMetaData getMetadata() {
1134        return metadata;
1135      }
1136    
1137      private static class LruCache<K, V> extends LinkedHashMap<K, V> {
1138        private final int MAX_ENTRIES;
1139    
1140        public LruCache(int maxEntries) {
1141            super(maxEntries + 1, 1.0f, true);
1142            MAX_ENTRIES = maxEntries;
1143        }
1144    
1145        @Override
1146        protected boolean removeEldestEntry(Map.Entry<K, V> eldest) {
1147            return size() > MAX_ENTRIES;
1148        }
1149      }
1150    }