001    /**
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     *     http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing, software
013     * distributed under the License is distributed on an "AS IS" BASIS,
014     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015     * See the License for the specific language governing permissions and
016     * limitations under the License.
017     */
018    package org.apache.hadoop.fs;
019    
020    import java.io.FileNotFoundException;
021    import java.io.IOException;
022    import java.io.UnsupportedEncodingException;
023    import java.net.URI;
024    import java.net.URISyntaxException;
025    import java.net.URLDecoder;
026    import java.util.ArrayList;
027    import java.util.Collections;
028    import java.util.List;
029    import java.util.LinkedHashMap;
030    import java.util.Map;
031    import java.util.TreeMap;
032    import java.util.HashMap;
033    
034    import org.apache.commons.logging.Log;
035    import org.apache.commons.logging.LogFactory;
036    import org.apache.hadoop.conf.Configuration;
037    import org.apache.hadoop.fs.permission.FsPermission;
038    import org.apache.hadoop.io.IOUtils;
039    import org.apache.hadoop.io.Text;
040    import org.apache.hadoop.util.LineReader;
041    import org.apache.hadoop.util.Progressable;
042    
043    /**
044     * This is an implementation of the Hadoop Archive (HAR)
045     * filesystem. A HAR archive consists of index files
046     * (_index and _masterindex) and data files (part-*).
047     * The index files record where each archived file lives
048     * inside the part files. The master index is a level of
049     * indirection into the index file that makes lookups
050     * faster: the index file is sorted by the hash code of
051     * the paths it contains, and the master index holds
052     * pointers to the positions in the index for ranges of
053     * hash codes.
054     */
055    
056    public class HarFileSystem extends FilterFileSystem {
057    
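         /*
          * Illustrative usage sketch (the host name and paths below are
          * hypothetical):
          *
          *   Configuration conf = new Configuration();
          *   Path p = new Path("har://hdfs-namenode:8020/user/alice/data.har/dir/file.txt");
          *   FileSystem harFs = p.getFileSystem(conf); // resolves to a HarFileSystem
          *   FSDataInputStream in = harFs.open(p);     // reads out of a part-* file
          *
          * The archive is read-only: create, delete, mkdirs and the other
          * mutating operations below simply throw IOException.
          */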
058      private static final Log LOG = LogFactory.getLog(HarFileSystem.class);
059    
060      public static final String METADATA_CACHE_ENTRIES_KEY = "fs.har.metadatacache.entries";
061      public static final int METADATA_CACHE_ENTRIES_DEFAULT = 10;
062    
063      public static final int VERSION = 3;
064    
065      private static Map<URI, HarMetaData> harMetaCache;
066    
067      // uri representation of this Har filesystem
068      private URI uri;
069      // the top level path of the archive
070      // in the underlying file system
071      private Path archivePath;
072      // the har auth
073      private String harAuth;
074    
075      // pointer into the static metadata cache
076      private HarMetaData metadata;
077    
078      /**
079       * Public no-argument constructor for
080       * HarFileSystem.
081       */
082      public HarFileSystem() {
083      }
084    
085      /**
086       * Return the protocol scheme for the FileSystem.
087       * <p/>
088       *
089       * @return <code>har</code>
090       */
091      @Override
092      public String getScheme() {
093        return "har";
094      }
095    
096      /**
097       * Constructor to create a HarFileSystem with an
098       * underlying filesystem.
099       * @param fs the underlying filesystem
100       */
101      public HarFileSystem(FileSystem fs) {
102        super(fs);
103      }
104     
105      private synchronized void initializeMetadataCache(Configuration conf) {
106        if (harMetaCache == null) {
107          int cacheSize = conf.getInt(METADATA_CACHE_ENTRIES_KEY, METADATA_CACHE_ENTRIES_DEFAULT);
108          harMetaCache = Collections.synchronizedMap(new LruCache<URI, HarMetaData>(cacheSize));
109        }
110      }
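         // For example (illustrative value), a client can enlarge the cache
         // before the filesystem is initialized:
         //   conf.setInt(HarFileSystem.METADATA_CACHE_ENTRIES_KEY, 64);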
111     
112      /**
113       * Initialize a Har filesystem per har archive. The
114       * archive home directory is the top level directory
115       * in the filesystem that contains the HAR archive.
116       * Be careful with this method: you do not want to keep
117       * creating new FileSystem instances on every call to
118       * path.getFileSystem().
119       * The URI of a Har filesystem is either
120       * har://underlyingfsscheme-host:port/archivepath
121       * or
122       * har:///archivepath, in which case the default
123       * filesystem from the configuration is used.
124       */
125      @Override
126      public void initialize(URI name, Configuration conf) throws IOException {
127        // initialize the metadata cache, if needed
128        initializeMetadataCache(conf);
129    
130        // decode the name
131        URI underLyingURI = decodeHarURI(name, conf);
132        // we got the right har Path- now check if this is 
133        // truly a har filesystem
134        Path harPath = archivePath(
135          new Path(name.getScheme(), name.getAuthority(), name.getPath()));
136        if (harPath == null) { 
137          throw new IOException("Invalid path for the Har Filesystem. " + 
138                               name.toString());
139        }
140        if (fs == null) {
141          fs = FileSystem.get(underLyingURI, conf);
142        }
143        uri = harPath.toUri();
144        archivePath = new Path(uri.getPath());
145        harAuth = getHarAuth(underLyingURI);
146        //check for the underlying fs containing
147        // the index file
148        Path masterIndexPath = new Path(archivePath, "_masterindex");
149        Path archiveIndexPath = new Path(archivePath, "_index");
150        if (!fs.exists(masterIndexPath) || !fs.exists(archiveIndexPath)) {
151          throw new IOException("Invalid path for the Har Filesystem. " +
152              "No index file in " + harPath);
153        }
154    
155        metadata = harMetaCache.get(uri);
156        if (metadata != null) {
157          FileStatus mStat = fs.getFileStatus(masterIndexPath);
158          FileStatus aStat = fs.getFileStatus(archiveIndexPath);
159          if (mStat.getModificationTime() != metadata.getMasterIndexTimestamp() ||
160              aStat.getModificationTime() != metadata.getArchiveIndexTimestamp()) {
161            // the archive has been overwritten since we last read it
162            // remove the entry from the meta data cache
163            metadata = null;
164            harMetaCache.remove(uri);
165          }
166        }
167        if (metadata == null) {
168          metadata = new HarMetaData(fs, masterIndexPath, archiveIndexPath);
169          metadata.parseMetaData();
170          harMetaCache.put(uri, metadata);
171        }
172      }
173    
174      // get the version of the filesystem from the masterindex file
175      // the version is currently not useful since it's the first version
176      // of archives
177      public int getHarVersion() throws IOException {
178        if (metadata != null) {
179          return metadata.getVersion();
180        }
181        else {
182          throw new IOException("Invalid meta data for the Har Filesystem");
183        }
184      }
185    
186      /*
187       * Find the ancestor path that is the
188       * archive path, i.e. the deepest prefix of
189       * the given path whose last component ends
190       * with .har. Returns null if there is none.
191       */
192      private Path archivePath(Path p) {
193        Path retPath = null;
194        Path tmp = p;
195        for (int i=0; i< p.depth(); i++) {
196          if (tmp.toString().endsWith(".har")) {
197            retPath = tmp;
198            break;
199          }
200          tmp = tmp.getParent();
201        }
202        return retPath;
203      }
204    
205      /**
206       * Decode the raw URI to get the underlying URI.
207       * @param rawURI raw Har URI
          * @param conf configuration used to resolve the default filesystem
208       * @return filtered URI of the underlying fileSystem
209       */
210      private URI decodeHarURI(URI rawURI, Configuration conf) throws IOException {
211        String tmpAuth = rawURI.getAuthority();
212        //we are using the default file
213        //system in the config 
214        //so create an underlying uri and
215        //return it
216        if (tmpAuth == null) {
217          //return the default filesystem uri
218          return FileSystem.getDefaultUri(conf);
219        }
220        String authority = rawURI.getAuthority();
221        if (authority == null) {
222          throw new IOException("URI: " + rawURI
223              + " is an invalid Har URI since authority==null."
224              + "  Expecting har://<scheme>-<host>/<path>.");
225        }
226     
227        int i = authority.indexOf('-');
228        if (i < 0) {
229          throw new IOException("URI: " + rawURI
230              + " is an invalid Har URI since '-' not found."
231              + "  Expecting har://<scheme>-<host>/<path>.");
232        }
233     
234        if (rawURI.getQuery() != null) {
235          // query component not allowed
236          throw new IOException("query component in Path not supported  " + rawURI);
237        }
238     
239        URI tmp = null;
240     
241        try {
242          // convert <scheme>-<host> to <scheme>://<host>
243          URI baseUri = new URI(authority.replaceFirst("-", "://"));
244     
245          tmp = new URI(baseUri.getScheme(), baseUri.getAuthority(),
246                rawURI.getPath(), rawURI.getQuery(), rawURI.getFragment());
247        } catch (URISyntaxException e) {
248          throw new IOException("URI: " + rawURI
249              + " is an invalid Har URI. Expecting har://<scheme>-<host>/<path>.");
250        }
251        return tmp;
252      }
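         /*
          * Worked example (hypothetical host and path): the har URI
          *   har://hdfs-namenode.example.com:8020/user/alice/data.har/file.txt
          * has the authority "hdfs-namenode.example.com:8020", which is rewritten
          * to the underlying URI
          *   hdfs://namenode.example.com:8020/user/alice/data.har/file.txt
          * A har:///user/alice/data.har/file.txt URI (no authority) falls back to
          * the default filesystem from the configuration.
          */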
253    
254      private static String decodeString(String str)
255        throws UnsupportedEncodingException {
256        return URLDecoder.decode(str, "UTF-8");
257      }
258    
259      private String decodeFileName(String fname) 
260        throws UnsupportedEncodingException {
261        int version = metadata.getVersion();
262        if (version == 2 || version == 3){
263          return decodeString(fname);
264        }
265        return fname;
266      }
267    
268      /**
269       * return the top level archive.
270       */
271      @Override
272      public Path getWorkingDirectory() {
273        return new Path(uri.toString());
274      }
275      
276      /**
277       * Create a har specific auth of the form
278       * underlyingfsscheme-host:port.
279       * @param underLyingUri the uri of the underlying
280       * filesystem
281       * @return har specific auth
282       */
283      private String getHarAuth(URI underLyingUri) {
284        String auth = underLyingUri.getScheme() + "-";
285        if (underLyingUri.getHost() != null) {
286          auth += underLyingUri.getHost() + ":";
287          if (underLyingUri.getPort() != -1) {
288            auth +=  underLyingUri.getPort();
289          }
290        }
291        else {
292          auth += ":";
293        }
294        return auth;
295      }
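         // For example (hypothetical host), hdfs://namenode.example.com:8020 yields
         // the auth "hdfs-namenode.example.com:8020"; a host-less URI such as
         // file:/// yields "file-:".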
296      
297      /**
298       * Returns the uri of this filesystem.
299       * The uri is of the form 
300       * har://underlyingfsscheme-host:port/pathintheunderlyingfs
301       */
302      @Override
303      public URI getUri() {
304        return this.uri;
305      }
306      
307      /**
308       * Returns the path inside the har filesystem,
309       * i.e. the path relative to the root of
310       * the archive. Returns null if the given path
311       * is not under the archive.
312       * @param path the fully qualified path in the har filesystem.
313       * @return relative path in the har filesystem.
314       */
315      private Path getPathInHar(Path path) {
316        Path harPath = new Path(path.toUri().getPath());
317        if (archivePath.compareTo(harPath) == 0)
318          return new Path(Path.SEPARATOR);
319        Path tmp = new Path(harPath.getName());
320        Path parent = harPath.getParent();
321        while (!(parent.compareTo(archivePath) == 0)) {
322          if (parent.toString().equals(Path.SEPARATOR)) {
323            tmp = null;
324            break;
325          }
326          tmp = new Path(parent.getName(), tmp);
327          parent = parent.getParent();
328        }
329        if (tmp != null) 
330          tmp = new Path(Path.SEPARATOR, tmp);
331        return tmp;
332      }
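         // Worked example (hypothetical paths): with archivePath /user/alice/data.har,
         // the input /user/alice/data.har/logs/app.log maps to /logs/app.log, the
         // archive path itself maps to /, and a path outside the archive maps to null.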
333      
334      // Rebuild the path p, minus its leading "/", under the
335      // given initial (archive) path. Parsing and doing string
336      // manipulation by hand is error prone, so just use the
337      // Path API to do it.
338      private Path makeRelative(String initial, Path p) {
339        String scheme = this.uri.getScheme();
340        String authority = this.uri.getAuthority();
341        Path root = new Path(Path.SEPARATOR);
342        if (root.compareTo(p) == 0)
343          return new Path(scheme, authority, initial);
344        Path retPath = new Path(p.getName());
345        Path parent = p.getParent();
346        for (int i=0; i < p.depth()-1; i++) {
347          retPath = new Path(parent.getName(), retPath);
348          parent = parent.getParent();
349        }
350        return new Path(new Path(scheme, authority, initial),
351          retPath.toString());
352      }
353      
354      /* this makes a path qualified in the har filesystem
355       * (non-Javadoc)
356       * @see org.apache.hadoop.fs.FilterFileSystem#makeQualified(
357       * org.apache.hadoop.fs.Path)
358       */
359      @Override
360      public Path makeQualified(Path path) {
361        // make sure that we just get the 
362        // path component 
363        Path fsPath = path;
364        if (!path.isAbsolute()) {
365          fsPath = new Path(archivePath, path);
366        }
367    
368        URI tmpURI = fsPath.toUri();
369        //change this to Har uri 
370        return new Path(uri.getScheme(), harAuth, tmpURI.getPath());
371      }
372    
373      /**
374       * Fix offset and length of block locations.
375       * Note that this method modifies the original array.
376       * @param locations block locations of har part file
377       * @param start the start of the desired range in the contained file
378       * @param len the length of the desired range
379       * @param fileOffsetInHar the offset of the desired file in the har part file
380       * @return block locations with fixed offset and length
381       */  
382      static BlockLocation[] fixBlockLocations(BlockLocation[] locations,
383                                              long start,
384                                              long len,
385                                              long fileOffsetInHar) {
386        // offset 1 past last byte of desired range
387        long end = start + len;
388    
389        for (BlockLocation location : locations) {
390          // offset of part block relative to beginning of desired file
391          // (may be negative if file starts in this part block)
392          long harBlockStart = location.getOffset() - fileOffsetInHar;
393          // offset 1 past last byte of har block relative to beginning of
394          // desired file
395          long harBlockEnd = harBlockStart + location.getLength();
396          
397          if (start > harBlockStart) {
398            // desired range starts after beginning of this har block
399            // fix offset to beginning of relevant range (relative to desired file)
400            location.setOffset(start);
401            // fix length to relevant portion of har block
402            location.setLength(location.getLength() - (start - harBlockStart));
403          } else {
404            // desired range includes beginning of this har block
405            location.setOffset(harBlockStart);
406          }
407          
408          if (harBlockEnd > end) {
409            // range ends before end of this har block
410            // fix length to remove irrelevant portion at the end
411            location.setLength(location.getLength() - (harBlockEnd - end));
412          }
413        }
414        
415        return locations;
416      }
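         /*
          * Worked example (made-up numbers): the archived file starts at byte 1000
          * of the part file (fileOffsetInHar = 1000) and the caller asks for
          * start = 0, len = 500, i.e. bytes [0, 500) of the file. For a part block
          * at offset 0 with length 2048:
          *   harBlockStart = 0 - 1000     = -1000
          *   harBlockEnd   = -1000 + 2048 =  1048
          * Since start (0) > harBlockStart (-1000), the offset becomes 0 and the
          * length becomes 2048 - 1000 = 1048; since harBlockEnd (1048) > end (500),
          * the length is trimmed to 1048 - 548 = 500. The block now reports
          * offset 0 and length 500, both relative to the archived file.
          */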
417      
418      /**
419       * Get block locations from the underlying fs and fix their
420       * offsets and lengths.
421       * @param file the input filestatus to get block locations
422       * @param start the start of the desired range in the contained file
423       * @param len the length of the desired range
424       * @return block locations for this segment of file
425       * @throws IOException
426       */
427      @Override
428      public BlockLocation[] getFileBlockLocations(FileStatus file, long start,
429                                                   long len) throws IOException {
430        HarStatus hstatus = getFileHarStatus(file.getPath());
431        Path partPath = new Path(archivePath, hstatus.getPartName());
432        FileStatus partStatus = metadata.getPartFileStatus(partPath);
433    
434        // get all part blocks that overlap with the desired file blocks
435        BlockLocation[] locations = 
436          fs.getFileBlockLocations(partStatus,
437                                   hstatus.getStartIndex() + start, len);
438    
439        return fixBlockLocations(locations, start, len, hstatus.getStartIndex());
440      }
441      
442      /**
443       * The hash of the path p inside
444       * the har filesystem.
445       * @param p the path in the har filesystem
446       * @return the hash code of the path.
447       */
448      public static int getHarHash(Path p) {
449        return (p.toString().hashCode() & 0x7fffffff);
450      }
451      
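         // A Store records one master index entry: the byte range [begin, end)
         // within the _index file that holds the entries whose path hashes fall
         // in the range startHash..endHash.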
452      static class Store {
453        public Store() {
454          begin = end = startHash = endHash = 0;
455        }
456        public Store(long begin, long end, int startHash, int endHash) {
457          this.begin = begin;
458          this.end = end;
459          this.startHash = startHash;
460          this.endHash = endHash;
461        }
462        public long begin;
463        public long end;
464        public int startHash;
465        public int endHash;
466      }
467      
468      /**
469       * Get filestatuses of all the children of a given directory. This just
470       * walks through the archive entries parsed from the index file and picks
471       * out the statuses of the immediate children of the given directory. It
472       * is a brute force way of getting all such filestatuses.
473       *
474       * @param parent
475       *          the parent path directory
476       * @param statuses
477       *          the list to add the children filestatuses to
478       * @param children
479       *          the string list of children for this parent
480       */
482      private void fileStatusesInIndex(HarStatus parent, List<FileStatus> statuses,
483          List<String> children) throws IOException {
484        String parentString = parent.getName();
485        if (!parentString.endsWith(Path.SEPARATOR)){
486            parentString += Path.SEPARATOR;
487        }
488        Path harPath = new Path(parentString);
489        int harlen = harPath.depth();
490        final Map<String, FileStatus> cache = new TreeMap<String, FileStatus>();
491    
492        for (HarStatus hstatus : metadata.archive.values()) {
493          String child = hstatus.getName();
494          if ((child.startsWith(parentString))) {
495            Path thisPath = new Path(child);
496            if (thisPath.depth() == harlen + 1) {
497              statuses.add(toFileStatus(hstatus, cache));
498            }
499          }
500        }
501      }
502    
503      /**
504       * Combine the status stored in the index and the underlying status. 
505       * @param h status stored in the index
506       * @param cache caching the underlying file statuses
507       * @return the combined file status
508       * @throws IOException
509       */
510      private FileStatus toFileStatus(HarStatus h,
511          Map<String, FileStatus> cache) throws IOException {
512        FileStatus underlying = null;
513        if (cache != null) {
514          underlying = cache.get(h.partName);
515        }
516        if (underlying == null) {
517          final Path p = h.isDir? archivePath: new Path(archivePath, h.partName);
518          underlying = fs.getFileStatus(p);
519          if (cache != null) {
520            cache.put(h.partName, underlying);
521          }
522        }
523    
524        long modTime = 0;
525        int version = metadata.getVersion();
526        if (version < 3) {
527          modTime = underlying.getModificationTime();
528        } else if (version == 3) {
529          modTime = h.getModificationTime();
530        }
531    
532        return new FileStatus(
533            h.isDir()? 0L: h.getLength(),
534            h.isDir(),
535            underlying.getReplication(),
536            underlying.getBlockSize(),
537            modTime,
538            underlying.getAccessTime(),
539            underlying.getPermission(),
540            underlying.getOwner(),
541            underlying.getGroup(),
542            makeRelative(this.uri.getPath(), new Path(h.name)));
543      }
544    
545      // A parser for a hadoop archive entry, which is
546      // stored as a single line in the index files.
547      // The format of a line is
548      // filename "dir"/"file" partFileName startIndex length
549      // <space separated children>
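         //
         // Illustrative version-3 entries (names and sizes are made up); file
         // names are URL-encoded, and <props> is the URL-encoded string
         // "modificationTime permission owner group":
         //   %2Fdocs dir <props> 0 0 a.txt b.txt
         //   %2Fdocs%2Fa.txt file part-0 0 1024 <props>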
550      private class HarStatus {
551        boolean isDir;
552        String name;
553        List<String> children;
554        String partName;
555        long startIndex;
556        long length;
557        long modificationTime = 0;
558    
559        public HarStatus(String harString) throws UnsupportedEncodingException {
560          String[] splits = harString.split(" ");
561          this.name = decodeFileName(splits[0]);
562          this.isDir = "dir".equals(splits[1]);
563          // for a directory this is "none" (or, in version 3, the encoded props below)
564          this.partName = splits[2];
565          this.startIndex = Long.parseLong(splits[3]);
566          this.length = Long.parseLong(splits[4]);
567    
568          int version = metadata.getVersion();
569          String[] propSplits = null;
570          // propSplits is used to retrieve the metainformation that Har versions
571          // 1 & 2 missed (modification time, permission, owner group).
572          // These fields are stored in an encoded string placed in different
573          // locations depending on whether it's a file or directory entry.
574          // If it's a directory, the string will be placed at the partName
575          // location (directories have no partName because they don't have data
576          // to be stored). This is done because the number of fields in a
577          // directory entry is unbounded (all children are listed at the end)
578          // If it's a file, the string will be the last field.
579          if (isDir) {
580            if (version == 3){
581              propSplits = decodeString(this.partName).split(" ");
582            }
583            children = new ArrayList<String>();
584            for (int i = 5; i < splits.length; i++) {
585              children.add(decodeFileName(splits[i]));
586            }
587          } else if (version == 3) {
588            propSplits = decodeString(splits[5]).split(" ");
589          }
590    
591          if (propSplits != null && propSplits.length >= 4) {
592            modificationTime = Long.parseLong(propSplits[0]);
593            // the fields below are stored in the file but are currently not used
594            // by HarFileSystem
595            // permission = new FsPermission(Short.parseShort(propSplits[1]));
596            // owner = decodeString(propSplits[2]);
597            // group = decodeString(propSplits[3]);
598          }
599        }
600        public boolean isDir() {
601          return isDir;
602        }
603        
604        public String getName() {
605          return name;
606        }
607        public String getPartName() {
608          return partName;
609        }
610        public long getStartIndex() {
611          return startIndex;
612        }
613        public long getLength() {
614          return length;
615        }
616        public long getModificationTime() {
617          return modificationTime;
618        }
619      }
620      
621      /**
622       * Return the filestatus of files in the har archive.
623       * The permissions returned are those of the underlying
624       * archive files, since per-file permissions are not
625       * persisted while creating a hadoop archive.
626       * @param f the path in the har filesystem
627       * @return filestatus.
628       * @throws IOException
629       */
630      @Override
631      public FileStatus getFileStatus(Path f) throws IOException {
632        HarStatus hstatus = getFileHarStatus(f);
633        return toFileStatus(hstatus, null);
634      }
635    
636      private HarStatus getFileHarStatus(Path f) throws IOException {
637        // get the fs DataInputStream for the underlying file
638        // look up the index.
639        Path p = makeQualified(f);
640        Path harPath = getPathInHar(p);
641        if (harPath == null) {
642          throw new IOException("Invalid file name: " + f + " in " + uri);
643        }
644        HarStatus hstatus = metadata.archive.get(harPath);
645        if (hstatus == null) {
646          throw new FileNotFoundException("File: " +  f + " does not exist in " + uri);
647        }
648        return hstatus;
649      }
650    
651      /**
652       * @return null since no checksum algorithm is implemented.
653       */
654      @Override
655      public FileChecksum getFileChecksum(Path f) {
656        return null;
657      }
658    
659      /**
660       * Returns a har input stream which fakes end of file at the
661       * boundary of the archived file. It uses the index metadata to
662       * get the part file name and the file's start offset and length.
663       */
664      @Override
665      public FSDataInputStream open(Path f, int bufferSize) throws IOException {
666        // get the fs DataInputStream for the underlying file
667        HarStatus hstatus = getFileHarStatus(f);
668        // we got it.. woo hooo!!! 
669        if (hstatus.isDir()) {
670          throw new FileNotFoundException(f + " : not a file in " +
671                    archivePath);
672        }
673        return new HarFSDataInputStream(fs, new Path(archivePath, 
674            hstatus.getPartName()),
675            hstatus.getStartIndex(), hstatus.getLength(), bufferSize);
676      }
677     
678      @Override
679      public FSDataOutputStream create(Path f,
680          FsPermission permission,
681          boolean overwrite,
682          int bufferSize,
683          short replication,
684          long blockSize,
685          Progressable progress) throws IOException {
686        throw new IOException("Har: create not allowed.");
687      }
688      
689      @Override
690      public void close() throws IOException {
691        if (fs != null) {
692          try {
693            fs.close();
694          } catch(IOException ie) {
695            //this might already be closed
696            // ignore
697          }
698        }
699      }
700      
701      /**
702       * Not implemented.
703       */
704      @Override
705      public boolean setReplication(Path src, short replication) throws IOException{
706        throw new IOException("Har: setreplication not allowed");
707      }
708      
709      /**
710       * Not implemented.
711       */
712      @Override
713      public boolean delete(Path f, boolean recursive) throws IOException { 
714        throw new IOException("Har: delete not allowed");
715      }
716      
717      /**
718       * listStatus returns the children of a directory
719       * after looking up the index files.
720       */
721      @Override
722      public FileStatus[] listStatus(Path f) throws IOException {
723        // look the requested file up in the archive index
724        // and, for a directory, build fake filestatuses
725        // for its children to return
726        // to the client
727        List<FileStatus> statuses = new ArrayList<FileStatus>();
728        Path tmpPath = makeQualified(f);
729        Path harPath = getPathInHar(tmpPath);
730        HarStatus hstatus = metadata.archive.get(harPath);
731        if (hstatus == null) {
732          throw new FileNotFoundException("File " + f + " not found in " + archivePath);
733        }
734        if (hstatus.isDir()) {
735          fileStatusesInIndex(hstatus, statuses, hstatus.children);
736        } else {
737          statuses.add(toFileStatus(hstatus, null));
738        }
739        
740        return statuses.toArray(new FileStatus[statuses.size()]);
741      }
742      
743      /**
744       * return the top level archive path.
745       */
746      @Override
747      public Path getHomeDirectory() {
748        return new Path(uri.toString());
749      }
750      
751      @Override
752      public void setWorkingDirectory(Path newDir) {
753        //does nothing.
754      }
755      
756      /**
757       * Not implemented.
758       */
759      @Override
760      public boolean mkdirs(Path f, FsPermission permission) throws IOException {
761        throw new IOException("Har: mkdirs not allowed");
762      }
763      
764      /**
765       * Not implemented.
766       */
767      @Override
768      public void copyFromLocalFile(boolean delSrc, Path src, Path dst) throws 
769            IOException {
770        throw new IOException("Har: copyfromlocalfile not allowed");
771      }
772      
773      /**
774       * copies the file in the har filesystem to a local file.
775       */
776      @Override
777      public void copyToLocalFile(boolean delSrc, Path src, Path dst) 
778        throws IOException {
779        FileUtil.copy(this, src, getLocal(getConf()), dst, false, getConf());
780      }
781      
782      /**
783       * Not implemented.
784       */
785      @Override
786      public Path startLocalOutput(Path fsOutputFile, Path tmpLocalFile) 
787        throws IOException {
788        throw new IOException("Har: startLocalOutput not allowed");
789      }
790      
791      /**
792       * Not implemented.
793       */
794      @Override
795      public void completeLocalOutput(Path fsOutputFile, Path tmpLocalFile) 
796        throws IOException {
797        throw new IOException("Har: completeLocalOutput not allowed");
798      }
799      
800      /**
801       * Not implemented.
802       */
803      @Override
804      public void setOwner(Path p, String username, String groupname)
805        throws IOException {
806        throw new IOException("Har: setowner not allowed");
807      }
808    
809      /**
810       * Not implemented.
811       */
812      @Override
813      public void setPermission(Path p, FsPermission permisssion) 
814        throws IOException {
815        throw new IOException("Har: setPermission not allowed");
816      }
817      
818      /**
819       * Hadoop archives input stream. This input stream fakes EOF 
820       * since archive files are part of bigger part files.
821       */
822      private static class HarFSDataInputStream extends FSDataInputStream {
823        /**
824         * Create an input stream that fakes all the reads/positions/seeking.
825         */
826        private static class HarFsInputStream extends FSInputStream {
827          private long position, start, end;
828          //The underlying data input stream that the
829          // underlying filesystem will return.
830          private FSDataInputStream underLyingStream;
831          //one byte buffer
832          private byte[] oneBytebuff = new byte[1];
833          HarFsInputStream(FileSystem fs, Path path, long start,
834              long length, int bufferSize) throws IOException {
835            underLyingStream = fs.open(path, bufferSize);
836            underLyingStream.seek(start);
837            // the start of this file in the part file
838            this.start = start;
839            // the position pointer in the part file
840            this.position = start;
841            // the end pointer in the part file
842            this.end = start + length;
843          }
844          
845          @Override
846          public synchronized int available() throws IOException {
847            long remaining = end - underLyingStream.getPos();
848            if (remaining > (long)Integer.MAX_VALUE) {
849              return Integer.MAX_VALUE;
850            }
851            return (int) remaining;
852          }
853          
854          @Override
855          public synchronized  void close() throws IOException {
856            underLyingStream.close();
857            super.close();
858          }
859          
860          //not implemented
861          @Override
862          public void mark(int readLimit) {
863            // do nothing 
864          }
865          
866          /**
867           * reset is not implemented
868           */
869          @Override
870          public void reset() throws IOException {
871            throw new IOException("reset not implemented.");
872          }
873          
874          @Override
875          public synchronized int read() throws IOException {
876            int ret = read(oneBytebuff, 0, 1);
877            return (ret <= 0) ? -1: (oneBytebuff[0] & 0xff);
878          }
879          
880          @Override
881          public synchronized int read(byte[] b) throws IOException {
882            // read(byte[], int, int) below already advances position,
883            // so it must not be advanced again here
884            return read(b, 0, b.length);
885          }
888          
889          /**
890           * Reads up to len bytes but never past the end of the archived file.
891           */
892          @Override
893          public synchronized int read(byte[] b, int offset, int len) 
894            throws IOException {
895            int newlen = len;
896            int ret = -1;
897            if (position + len > end) {
898              newlen = (int) (end - position);
899            }
900            // end case
901            if (newlen == 0) 
902              return ret;
903            ret = underLyingStream.read(b, offset, newlen);
904            if (ret > 0) position += ret; // do not move backwards if the underlying read hit EOF (-1)
905            return ret;
906          }
907          
908          @Override
909          public synchronized long skip(long n) throws IOException {
910            long tmpN = n;
911            if (tmpN > 0) {
912              if (position + tmpN > end) {
913                tmpN = end - position;
914              }
915              underLyingStream.seek(tmpN + position);
916              position += tmpN;
917              return tmpN;
918            }
919            return (tmpN < 0)? -1 : 0;
920          }
921          
922          @Override
923          public synchronized long getPos() throws IOException {
924            return (position - start);
925          }
926          
927          @Override
928          public synchronized void seek(long pos) throws IOException {
929            if (pos < 0 || (start + pos > end)) {
930              throw new IOException("Failed to seek: EOF");
931            }
932            position = start + pos;
933            underLyingStream.seek(position);
934          }
935    
936          @Override
937          public boolean seekToNewSource(long targetPos) throws IOException {
938            //do not need to implement this
939            // hdfs in itself does seektonewsource 
940            // while reading.
941            return false;
942          }
943          
944          /**
945           * Implements positioned read.
946           */
947          @Override
948          public int read(long pos, byte[] b, int offset, int length) 
949          throws IOException {
950            int nlength = length;
951            if (start + nlength + pos > end) {
952              nlength = (int) (end - (start + pos));
953            }
954            return underLyingStream.read(pos + start , b, offset, nlength);
955          }
956          
957          /**
958           * Positioned readFully; fails if the requested range extends past EOF.
959           */
960          @Override
961          public void readFully(long pos, byte[] b, int offset, int length) 
962          throws IOException {
963            if (start + length + pos > end) {
964              throw new IOException("Not enough bytes to read.");
965            }
966            underLyingStream.readFully(pos + start, b, offset, length);
967          }
968          
969          @Override
970          public void readFully(long pos, byte[] b) throws IOException {
971              readFully(pos, b, 0, b.length);
972          }
973          
974        }
975      
976        /**
977         * Constructor for har input stream.
978         * @param fs the underlying filesystem
979         * @param p The path in the underlying filesystem
980         * @param start the start position in the part file
981         * @param length the length of valid data in the part file
982         * @param bufsize the buffer size
983         * @throws IOException
984         */
985        public HarFSDataInputStream(FileSystem fs, Path  p, long start, 
986            long length, int bufsize) throws IOException {
987            super(new HarFsInputStream(fs, p, start, length, bufsize));
988        }
989    
990        /**
991         * Constructor for har input stream.
992         * @param fs the underlying filesystem
993         * @param p the path in the underlying file system
994         * @param start the start position in the part file
995         * @param length the length of valid data in the part file.
996         * @throws IOException
997         */
998        public HarFSDataInputStream(FileSystem fs, Path  p, long start, long length)
999          throws IOException {
1000            super(new HarFsInputStream(fs, p, start, length, 0));
1001        }
1002      }
1003    
1004      private class HarMetaData {
1005        private FileSystem fs;
1006        private int version;
1007        // the masterIndex of the archive
1008        private Path masterIndexPath;
1009        // the index file 
1010        private Path archiveIndexPath;
1011    
1012        private long masterIndexTimestamp;
1013        private long archiveIndexTimestamp;
1014    
1015        List<Store> stores = new ArrayList<Store>();
1016        Map<Path, HarStatus> archive = new HashMap<Path, HarStatus>();
1017        private Map<Path, FileStatus> partFileStatuses = new HashMap<Path, FileStatus>();
1018    
1019        public HarMetaData(FileSystem fs, Path masterIndexPath, Path archiveIndexPath) {
1020          this.fs = fs;
1021          this.masterIndexPath = masterIndexPath;
1022          this.archiveIndexPath = archiveIndexPath;
1023        }
1024    
1025        public FileStatus getPartFileStatus(Path partPath) throws IOException {
1026          FileStatus status;
1027          status = partFileStatuses.get(partPath);
1028          if (status == null) {
1029            status = fs.getFileStatus(partPath);
1030            partFileStatuses.put(partPath, status);
1031          }
1032          return status;
1033        }
1034    
1035        public long getMasterIndexTimestamp() {
1036          return masterIndexTimestamp;
1037        }
1038    
1039        public long getArchiveIndexTimestamp() {
1040          return archiveIndexTimestamp;
1041        }
1042    
1043        private int getVersion() {
1044          return version;
1045        }
1046    
1047        private void parseMetaData() throws IOException {
1048          Text line;
1049          long read;
1050          FSDataInputStream in = null;
1051          LineReader lin = null;
1052    
1053          try {
1054            in = fs.open(masterIndexPath);
1055            FileStatus masterStat = fs.getFileStatus(masterIndexPath);
1056            masterIndexTimestamp = masterStat.getModificationTime();
1057            lin = new LineReader(in, getConf());
1058            line = new Text();
1059            read = lin.readLine(line);
1060    
1061            // the first line contains the version of the index file
1062            String versionLine = line.toString();
1063            String[] arr = versionLine.split(" ");
1064            version = Integer.parseInt(arr[0]);
1065            // only reject versions newer than we understand, so older archives stay readable
1066            if (this.version > HarFileSystem.VERSION) {
1067              throw new IOException("Invalid version " + 
1068                  this.version + " expected " + HarFileSystem.VERSION);
1069            }
1070    
1071            // each line contains a hashcode range and its byte range in the _index file
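                // e.g. a line like "0 1756908839 0 2353" (made-up numbers) maps the
                // hash range 0..1756908839 to bytes [0, 2353) of the _index file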
1072            String[] readStr = null;
1073            while(read < masterStat.getLen()) {
1074              int b = lin.readLine(line);
1075              read += b;
1076              readStr = line.toString().split(" ");
1077              int startHash = Integer.parseInt(readStr[0]);
1078              int endHash  = Integer.parseInt(readStr[1]);
1079              stores.add(new Store(Long.parseLong(readStr[2]), 
1080                  Long.parseLong(readStr[3]), startHash,
1081                  endHash));
1082              line.clear();
1083            }
1084          } finally {
1085            IOUtils.cleanup(LOG, lin, in);
1086          }
1087    
1088          FSDataInputStream aIn = fs.open(archiveIndexPath);
1089          try {
1090            FileStatus archiveStat = fs.getFileStatus(archiveIndexPath);
1091            archiveIndexTimestamp = archiveStat.getModificationTime();
1092            LineReader aLin;
1093    
1094            // now start reading the real index file
1095            for (Store s: stores) {
1096              read = 0;
1097              aIn.seek(s.begin);
1098              aLin = new LineReader(aIn, getConf());
1099              while (read + s.begin < s.end) {
1100                int tmp = aLin.readLine(line);
1101                read += tmp;
1102                String lineFeed = line.toString();
1103                String[] parsed = lineFeed.split(" ");
1104                parsed[0] = decodeFileName(parsed[0]);
1105                archive.put(new Path(parsed[0]), new HarStatus(lineFeed));
1106                line.clear();
1107              }
1108            }
1109          } finally {
1110            IOUtils.cleanup(LOG, aIn);
1111          }
1112        }
1113      }
1114      
1115      /*
1116       * testing purposes only:
1117       */
1118      HarMetaData getMetadata() {
1119        return metadata;
1120      }
1121    
1122      private static class LruCache<K, V> extends LinkedHashMap<K, V> {
1123        private final int MAX_ENTRIES;
1124    
1125        public LruCache(int maxEntries) {
1126            super(maxEntries + 1, 1.0f, true);
1127            MAX_ENTRIES = maxEntries;
1128        }
1129    
1130        @Override
1131        protected boolean removeEldestEntry(Map.Entry<K, V> eldest) {
1132            return size() > MAX_ENTRIES;
1133        }
1134      }
1135    }