001    /**
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     *     http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing, software
013     * distributed under the License is distributed on an "AS IS" BASIS,
014     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015     * See the License for the specific language governing permissions and
016     * limitations under the License.
017     */
018    package org.apache.hadoop.fs;
019    
020    import java.io.FileNotFoundException;
021    import java.io.IOException;
022    import java.io.UnsupportedEncodingException;
023    import java.net.URI;
024    import java.net.URISyntaxException;
025    import java.net.URLDecoder;
026    import java.util.ArrayList;
027    import java.util.List;
028    import java.util.Map;
029    import java.util.TreeMap;
030    import java.util.HashMap;
031    import java.util.concurrent.ConcurrentHashMap;
032    
033    import org.apache.commons.logging.Log;
034    import org.apache.commons.logging.LogFactory;
035    import org.apache.hadoop.conf.Configuration;
036    import org.apache.hadoop.fs.permission.FsPermission;
037    import org.apache.hadoop.io.IOUtils;
038    import org.apache.hadoop.io.Text;
039    import org.apache.hadoop.util.LineReader;
040    import org.apache.hadoop.util.Progressable;
041    
042    /**
 * This is an implementation of the Hadoop Archive
 * Filesystem. An archive filesystem has index files named
 * _masterindex and _index, and stores the archived file
 * contents in part-* files. The index files record where each
 * archived file lives inside the part files. The master index
 * is a level of indirection into the index file that makes
 * lookups faster: the index file is sorted by the hash code of
 * the paths it contains, and the master index holds pointers
 * to positions in the index file for ranges of hash codes.
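 *
 * <p>For illustration only (the exact part file names and count depend on
 * how the archive was created), an archive at /user/hadoop/foo.har on HDFS
 * is laid out roughly as:
 * <pre>
 *   /user/hadoop/foo.har/_masterindex   hash ranges and offsets into _index
 *   /user/hadoop/foo.har/_index         one status line per archived file or dir
 *   /user/hadoop/foo.har/part-0         concatenated file contents
 * </pre>
 * A file inside that archive is then addressed as
 * {@code har://hdfs-namenode:8020/user/hadoop/foo.har/dir/file.txt}, or as
 * {@code har:///user/hadoop/foo.har/dir/file.txt} to fall back to the
 * configured default filesystem.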
053     */
054    
055    public class HarFileSystem extends FilterFileSystem {
056    
057      private static final Log LOG = LogFactory.getLog(HarFileSystem.class);
058    
059      public static final int VERSION = 3;
060    
061      private static final Map<URI, HarMetaData> harMetaCache =
062          new ConcurrentHashMap<URI, HarMetaData>();
063    
064      // uri representation of this Har filesystem
065      private URI uri;
066      // the top level path of the archive
067      // in the underlying file system
068      private Path archivePath;
069      // the har auth
070      private String harAuth;
071    
072      // pointer into the static metadata cache
073      private HarMetaData metadata;
074    
  /**
   * Public no-argument constructor; the filesystem is set up later
   * via {@link #initialize(URI, Configuration)}.
   */
079      public HarFileSystem() {
080      }
081    
082      /**
083       * Return the protocol scheme for the FileSystem.
084       * <p/>
085       *
086       * @return <code>har</code>
087       */
088      @Override
089      public String getScheme() {
090        return "har";
091      }
092    
093      /**
094       * Constructor to create a HarFileSystem with an
095       * underlying filesystem.
   * @param fs the underlying filesystem
097       */
098      public HarFileSystem(FileSystem fs) {
099        super(fs);
100      }
101      
102      /**
   * Initialize a Har filesystem per har archive. The
   * archive home directory is the top level directory
   * in the filesystem that contains the HAR archive.
   * Be careful with this method: you do not want to keep
   * creating new filesystem instances on every call to
   * path.getFileSystem().
   * The URI of a Har filesystem is either
   * har://underlyingfsscheme-host:port/archivepath
   * or
   * har:///archivepath, which falls back to the default
   * underlying filesystem when none is specified.
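   *
   * <p>A minimal usage sketch (host, port and archive path are illustrative):
   * <pre>
   *   Configuration conf = new Configuration();
   *   Path archive = new Path("har://hdfs-namenode:8020/user/hadoop/foo.har");
   *   FileSystem harFs = archive.getFileSystem(conf);
   *   FileStatus[] children = harFs.listStatus(archive);
   * </pre>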
114       */
115      @Override
116      public void initialize(URI name, Configuration conf) throws IOException {
117        // decode the name
118        URI underLyingURI = decodeHarURI(name, conf);
119        // we got the right har Path- now check if this is 
120        // truly a har filesystem
121        Path harPath = archivePath(
122          new Path(name.getScheme(), name.getAuthority(), name.getPath()));
123        if (harPath == null) { 
124          throw new IOException("Invalid path for the Har Filesystem. " + 
125                               name.toString());
126        }
127        if (fs == null) {
128          fs = FileSystem.get(underLyingURI, conf);
129        }
130        uri = harPath.toUri();
131        archivePath = new Path(uri.getPath());
132        harAuth = getHarAuth(underLyingURI);
133        //check for the underlying fs containing
134        // the index file
135        Path masterIndexPath = new Path(archivePath, "_masterindex");
136        Path archiveIndexPath = new Path(archivePath, "_index");
137        if (!fs.exists(masterIndexPath) || !fs.exists(archiveIndexPath)) {
138          throw new IOException("Invalid path for the Har Filesystem. " +
139              "No index file in " + harPath);
140        }
141    
142        metadata = harMetaCache.get(uri);
143        if (metadata != null) {
144          FileStatus mStat = fs.getFileStatus(masterIndexPath);
145          FileStatus aStat = fs.getFileStatus(archiveIndexPath);
146          if (mStat.getModificationTime() != metadata.getMasterIndexTimestamp() ||
147              aStat.getModificationTime() != metadata.getArchiveIndexTimestamp()) {
148            // the archive has been overwritten since we last read it
149            // remove the entry from the meta data cache
150            metadata = null;
151            harMetaCache.remove(uri);
152          }
153        }
154        if (metadata == null) {
155          metadata = new HarMetaData(fs, masterIndexPath, archiveIndexPath);
156          metadata.parseMetaData();
157          harMetaCache.put(uri, metadata);
158        }
159      }
160    
  // get the version of the filesystem from the masterindex file
  // (the version is parsed when the archive metadata is read)
164      public int getHarVersion() throws IOException {
165        if (metadata != null) {
166          return metadata.getVersion();
167        }
168        else {
169          throw new IOException("Invalid meta data for the Har Filesystem");
170        }
171      }
172    
173      /*
   * Find the ancestor of p (possibly p itself) whose last
   * path segment ends with .har; that ancestor is the archive
   * path and is returned. Returns null if no such ancestor
   * exists.
178       */
179      private Path archivePath(Path p) {
180        Path retPath = null;
181        Path tmp = p;
182        for (int i=0; i< p.depth(); i++) {
183          if (tmp.toString().endsWith(".har")) {
184            retPath = tmp;
185            break;
186          }
187          tmp = tmp.getParent();
188        }
189        return retPath;
190      }
191    
192      /**
   * Decode the raw HAR URI to get the URI of the underlying filesystem.
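   * <p>For example (host and port are illustrative), a raw URI of
   * {@code har://hdfs-namenode:8020/user/hadoop/foo.har/file.txt}
   * decodes to the underlying URI
   * {@code hdfs://namenode:8020/user/hadoop/foo.har/file.txt}, while
   * {@code har:///user/hadoop/foo.har/file.txt} decodes to the
   * configured default filesystem URI.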
   * @param rawURI raw Har URI
   * @param conf the configuration, used to resolve the default filesystem
   *        when the raw URI has no authority
   * @return URI of the underlying filesystem
   * @throws IOException if the raw URI is not a valid Har URI
196       */
197      private URI decodeHarURI(URI rawURI, Configuration conf) throws IOException {
198        String tmpAuth = rawURI.getAuthority();
    // if there is no authority, we are using the default
    // filesystem from the config, so return its URI
    if (tmpAuth == null) {
      return FileSystem.getDefaultUri(conf);
206        }
207        String host = rawURI.getHost();
208        if (host == null) {
209          throw new IOException("URI: " + rawURI
210              + " is an invalid Har URI since host==null."
211              + "  Expecting har://<scheme>-<host>/<path>.");
212        }
213        int i = host.indexOf('-');
214        if (i < 0) {
215          throw new IOException("URI: " + rawURI
216              + " is an invalid Har URI since '-' not found."
217              + "  Expecting har://<scheme>-<host>/<path>.");
218        }
219        final String underLyingScheme = host.substring(0, i);
220        i++;
221        final String underLyingHost = i == host.length()? null: host.substring(i);
222        int underLyingPort = rawURI.getPort();
223        String auth = (underLyingHost == null && underLyingPort == -1)?
224                      null:(underLyingHost+
225                          (underLyingPort == -1 ? "" : ":"+underLyingPort));
226        URI tmp = null;
227        if (rawURI.getQuery() != null) {
228          // query component not allowed
      throw new IOException("query component in Path not supported " + rawURI);
230        }
231        try {
232          tmp = new URI(underLyingScheme, auth, rawURI.getPath(), 
233                rawURI.getQuery(), rawURI.getFragment());
234        } catch (URISyntaxException e) {
      // should not happen; the components came from a valid URI
236        }
237        return tmp;
238      }
239    
240      private static String decodeString(String str)
241        throws UnsupportedEncodingException {
242        return URLDecoder.decode(str, "UTF-8");
243      }
244    
245      private String decodeFileName(String fname) 
246        throws UnsupportedEncodingException {
247        int version = metadata.getVersion();
248        if (version == 2 || version == 3){
249          return decodeString(fname);
250        }
251        return fname;
252      }
253    
254      /**
   * Return the top level archive path as the working directory.
256       */
257      @Override
258      public Path getWorkingDirectory() {
259        return new Path(uri.toString());
260      }
261      
262      /**
   * Create a har specific auth string of the form
   * underlyingscheme-host:port.
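   * For example (illustrative values), an underlying URI of
   * {@code hdfs://namenode:8020} produces the auth string
   * {@code hdfs-namenode:8020}.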
   * @param underLyingUri the URI of the underlying
   * filesystem
267       * @return har specific auth
268       */
269      private String getHarAuth(URI underLyingUri) {
270        String auth = underLyingUri.getScheme() + "-";
271        if (underLyingUri.getHost() != null) {
272          auth += underLyingUri.getHost() + ":";
273          if (underLyingUri.getPort() != -1) {
274            auth +=  underLyingUri.getPort();
275          }
276        }
277        else {
278          auth += ":";
279        }
280        return auth;
281      }
282      
283      /**
284       * Returns the uri of this filesystem.
   * The URI is of the form
   * har://underlyingfsscheme-host:port/pathintheunderlyingfs
287       */
288      @Override
289      public URI getUri() {
290        return this.uri;
291      }
292      
293      /**
   * Returns the path inside the har filesystem, i.e. the part of
   * the path below the archive root. For example, with an archive
   * at /user/foo.har, the path /user/foo.har/a/b maps to /a/b.
   * @param path the fully qualified path in the har filesystem.
   * @return the path relative to the archive root, or null if the
   *         path is not inside the archive.
300       */
301      private Path getPathInHar(Path path) {
302        Path harPath = new Path(path.toUri().getPath());
303        if (archivePath.compareTo(harPath) == 0)
304          return new Path(Path.SEPARATOR);
305        Path tmp = new Path(harPath.getName());
306        Path parent = harPath.getParent();
307        while (!(parent.compareTo(archivePath) == 0)) {
308          if (parent.toString().equals(Path.SEPARATOR)) {
309            tmp = null;
310            break;
311          }
312          tmp = new Path(parent.getName(), tmp);
313          parent = parent.getParent();
314        }
315        if (tmp != null) 
316          tmp = new Path(Path.SEPARATOR, tmp);
317        return tmp;
318      }
319      
  // Make p relative by dropping its leading "/" and resolving it
  // against the initial path. Parsing and string manipulation are
  // error prone, so use the Path API to do it.
324      private Path makeRelative(String initial, Path p) {
325        String scheme = this.uri.getScheme();
326        String authority = this.uri.getAuthority();
327        Path root = new Path(Path.SEPARATOR);
328        if (root.compareTo(p) == 0)
329          return new Path(scheme, authority, initial);
330        Path retPath = new Path(p.getName());
331        Path parent = p.getParent();
332        for (int i=0; i < p.depth()-1; i++) {
333          retPath = new Path(parent.getName(), retPath);
334          parent = parent.getParent();
335        }
336        return new Path(new Path(scheme, authority, initial),
337          retPath.toString());
338      }
339      
340      /* this makes a path qualified in the har filesystem
341       * (non-Javadoc)
342       * @see org.apache.hadoop.fs.FilterFileSystem#makeQualified(
343       * org.apache.hadoop.fs.Path)
344       */
345      @Override
346      public Path makeQualified(Path path) {
347        // make sure that we just get the 
348        // path component 
349        Path fsPath = path;
350        if (!path.isAbsolute()) {
351          fsPath = new Path(archivePath, path);
352        }
353    
354        URI tmpURI = fsPath.toUri();
355        //change this to Har uri 
356        return new Path(uri.getScheme(), harAuth, tmpURI.getPath());
357      }
358    
359      /**
360       * Fix offset and length of block locations.
361       * Note that this method modifies the original array.
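   * <p>For example (illustrative numbers): a part file block covering
   * bytes [0, 512) of the part file, with the contained file starting
   * at fileOffsetInHar = 300, is first shifted to [-300, 212) relative
   * to the contained file; for a request with start = 0 and len = 100,
   * the block is then clipped to offset 0 and length 100.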
362       * @param locations block locations of har part file
363       * @param start the start of the desired range in the contained file
364       * @param len the length of the desired range
365       * @param fileOffsetInHar the offset of the desired file in the har part file
366       * @return block locations with fixed offset and length
367       */  
368      static BlockLocation[] fixBlockLocations(BlockLocation[] locations,
369                                              long start,
370                                              long len,
371                                              long fileOffsetInHar) {
372        // offset 1 past last byte of desired range
373        long end = start + len;
374    
375        for (BlockLocation location : locations) {
376          // offset of part block relative to beginning of desired file
377          // (may be negative if file starts in this part block)
378          long harBlockStart = location.getOffset() - fileOffsetInHar;
379          // offset 1 past last byte of har block relative to beginning of
380          // desired file
381          long harBlockEnd = harBlockStart + location.getLength();
382          
383          if (start > harBlockStart) {
384            // desired range starts after beginning of this har block
385            // fix offset to beginning of relevant range (relative to desired file)
386            location.setOffset(start);
387            // fix length to relevant portion of har block
388            location.setLength(location.getLength() - (start - harBlockStart));
389          } else {
390            // desired range includes beginning of this har block
391            location.setOffset(harBlockStart);
392          }
393          
394          if (harBlockEnd > end) {
395            // range ends before end of this har block
396            // fix length to remove irrelevant portion at the end
397            location.setLength(location.getLength() - (harBlockEnd - end));
398          }
399        }
400        
401        return locations;
402      }
403      
404      /**
405       * Get block locations from the underlying fs and fix their
406       * offsets and lengths.
407       * @param file the input filestatus to get block locations
408       * @param start the start of the desired range in the contained file
409       * @param len the length of the desired range
410       * @return block locations for this segment of file
411       * @throws IOException
412       */
413      @Override
414      public BlockLocation[] getFileBlockLocations(FileStatus file, long start,
415                                                   long len) throws IOException {
416        HarStatus hstatus = getFileHarStatus(file.getPath());
417        Path partPath = new Path(archivePath, hstatus.getPartName());
418        FileStatus partStatus = metadata.getPartFileStatus(partPath);
419    
420        // get all part blocks that overlap with the desired file blocks
421        BlockLocation[] locations = 
422          fs.getFileBlockLocations(partStatus,
423                                   hstatus.getStartIndex() + start, len);
424    
425        return fixBlockLocations(locations, start, len, hstatus.getStartIndex());
426      }
427      
428      /**
   * The hash of the path p inside the har filesystem.
   * @param p the path in the har filesystem
432       * @return the hash code of the path.
433       */
434      public static int getHarHash(Path p) {
435        return (p.toString().hashCode() & 0x7fffffff);
436      }
437      
438      static class Store {
439        public Store() {
440          begin = end = startHash = endHash = 0;
441        }
442        public Store(long begin, long end, int startHash, int endHash) {
443          this.begin = begin;
444          this.end = end;
445          this.startHash = startHash;
446          this.endHash = endHash;
447        }
448        public long begin;
449        public long end;
450        public int startHash;
451        public int endHash;
452      }
453      
454      /**
   * Get filestatuses of all the children of a given directory. This just
   * reads through the index file, line by line, to collect the statuses of
   * the children of a directory. It's a brute force way of getting all such
   * filestatuses.
   *
   * @param parent
   *          the parent directory's HarStatus
   * @param statuses
   *          the list to add the children filestatuses to
   * @param children
   *          the string list of children for this parent
467       */
468      private void fileStatusesInIndex(HarStatus parent, List<FileStatus> statuses,
469          List<String> children) throws IOException {
470        String parentString = parent.getName();
471        if (!parentString.endsWith(Path.SEPARATOR)){
472            parentString += Path.SEPARATOR;
473        }
474        Path harPath = new Path(parentString);
475        int harlen = harPath.depth();
476        final Map<String, FileStatus> cache = new TreeMap<String, FileStatus>();
477    
478        for (HarStatus hstatus : metadata.archive.values()) {
479          String child = hstatus.getName();
480          if ((child.startsWith(parentString))) {
481            Path thisPath = new Path(child);
482            if (thisPath.depth() == harlen + 1) {
483              statuses.add(toFileStatus(hstatus, cache));
484            }
485          }
486        }
487      }
488    
489      /**
490       * Combine the status stored in the index and the underlying status. 
491       * @param h status stored in the index
492       * @param cache caching the underlying file statuses
493       * @return the combined file status
494       * @throws IOException
495       */
496      private FileStatus toFileStatus(HarStatus h,
497          Map<String, FileStatus> cache) throws IOException {
498        FileStatus underlying = null;
499        if (cache != null) {
500          underlying = cache.get(h.partName);
501        }
502        if (underlying == null) {
503          final Path p = h.isDir? archivePath: new Path(archivePath, h.partName);
504          underlying = fs.getFileStatus(p);
505          if (cache != null) {
506            cache.put(h.partName, underlying);
507          }
508        }
509    
510        long modTime = 0;
511        int version = metadata.getVersion();
512        if (version < 3) {
513          modTime = underlying.getModificationTime();
514        } else if (version == 3) {
515          modTime = h.getModificationTime();
516        }
517    
518        return new FileStatus(
519            h.isDir()? 0L: h.getLength(),
520            h.isDir(),
521            underlying.getReplication(),
522            underlying.getBlockSize(),
523            modTime,
524            underlying.getAccessTime(),
525            underlying.getPermission(),
526            underlying.getOwner(),
527            underlying.getGroup(),
528            makeRelative(this.uri.getPath(), new Path(h.name)));
529      }
530    
  // A parser for a hadoop archive status entry, which is
  // stored as a single line in the index files.
  // The format of a line is:
  // filename "dir"/"file" partFileName startIndex length
  // <space separated children>
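  //
  // For illustration only (paths in the index are URL-encoded and the
  // trailing field in version 3 encodes modtime, permission, owner and
  // group), a file entry might look like:
  //   /dir/file.txt file part-0 0 1024 <encoded properties>
  // and a directory entry, whose properties take the partName slot and
  // whose children are listed at the end, like:
  //   /dir dir <encoded properties> 0 0 file.txt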
536      private class HarStatus {
537        boolean isDir;
538        String name;
539        List<String> children;
540        String partName;
541        long startIndex;
542        long length;
543        long modificationTime = 0;
544    
545        public HarStatus(String harString) throws UnsupportedEncodingException {
546          String[] splits = harString.split(" ");
547          this.name = decodeFileName(splits[0]);
      this.isDir = "dir".equals(splits[1]);
      // for a directory this is "none", or in version 3 the encoded properties
550          this.partName = splits[2];
551          this.startIndex = Long.parseLong(splits[3]);
552          this.length = Long.parseLong(splits[4]);
553    
554          int version = metadata.getVersion();
555          String[] propSplits = null;
556          // propSplits is used to retrieve the metainformation that Har versions
557          // 1 & 2 missed (modification time, permission, owner group).
558          // These fields are stored in an encoded string placed in different
559          // locations depending on whether it's a file or directory entry.
560          // If it's a directory, the string will be placed at the partName
561          // location (directories have no partName because they don't have data
562          // to be stored). This is done because the number of fields in a
563          // directory entry is unbounded (all children are listed at the end)
564          // If it's a file, the string will be the last field.
565          if (isDir) {
566            if (version == 3){
567              propSplits = decodeString(this.partName).split(" ");
568            }
569            children = new ArrayList<String>();
570            for (int i = 5; i < splits.length; i++) {
571              children.add(decodeFileName(splits[i]));
572            }
573          } else if (version == 3) {
574            propSplits = decodeString(splits[5]).split(" ");
575          }
576    
577          if (propSplits != null && propSplits.length >= 4) {
578            modificationTime = Long.parseLong(propSplits[0]);
579            // the fields below are stored in the file but are currently not used
580            // by HarFileSystem
581            // permission = new FsPermission(Short.parseShort(propSplits[1]));
582            // owner = decodeString(propSplits[2]);
583            // group = decodeString(propSplits[3]);
584          }
585        }
586        public boolean isDir() {
587          return isDir;
588        }
589        
590        public String getName() {
591          return name;
592        }
593        public String getPartName() {
594          return partName;
595        }
596        public long getStartIndex() {
597          return startIndex;
598        }
599        public long getLength() {
600          return length;
601        }
602        public long getModificationTime() {
603          return modificationTime;
604        }
605      }
606      
607      /**
   * Return the filestatus of files in the har archive.
   * The permissions returned are those of the underlying
   * archive files (the part files or the archive directory);
   * per-file permissions are not read back from the index.
612       * @param f the path in har filesystem
613       * @return filestatus.
614       * @throws IOException
615       */
616      @Override
617      public FileStatus getFileStatus(Path f) throws IOException {
618        HarStatus hstatus = getFileHarStatus(f);
619        return toFileStatus(hstatus, null);
620      }
621    
622      private HarStatus getFileHarStatus(Path f) throws IOException {
    // resolve the path within the archive and
    // look it up in the index
625        Path p = makeQualified(f);
626        Path harPath = getPathInHar(p);
627        if (harPath == null) {
628          throw new IOException("Invalid file name: " + f + " in " + uri);
629        }
630        HarStatus hstatus = metadata.archive.get(harPath);
631        if (hstatus == null) {
632          throw new FileNotFoundException("File: " +  f + " does not exist in " + uri);
633        }
634        return hstatus;
635      }
636    
637      /**
638       * @return null since no checksum algorithm is implemented.
639       */
640      @Override
641      public FileChecksum getFileChecksum(Path f) {
642        return null;
643      }
644    
645      /**
646       * Returns a har input stream which fakes end of 
647       * file. It reads the index files to get the part 
648       * file name and the size and start of the file.
649       */
650      @Override
651      public FSDataInputStream open(Path f, int bufferSize) throws IOException {
652        // get the fs DataInputStream for the underlying file
653        HarStatus hstatus = getFileHarStatus(f);
654        // we got it.. woo hooo!!! 
655        if (hstatus.isDir()) {
656          throw new FileNotFoundException(f + " : not a file in " +
657                    archivePath);
658        }
659        return new HarFSDataInputStream(fs, new Path(archivePath, 
660            hstatus.getPartName()),
661            hstatus.getStartIndex(), hstatus.getLength(), bufferSize);
662      }
663     
664      @Override
665      public FSDataOutputStream create(Path f,
666          FsPermission permission,
667          boolean overwrite,
668          int bufferSize,
669          short replication,
670          long blockSize,
671          Progressable progress) throws IOException {
672        throw new IOException("Har: create not allowed.");
673      }
674      
675      @Override
676      public void close() throws IOException {
677        if (fs != null) {
678          try {
679            fs.close();
680          } catch(IOException ie) {
681            //this might already be closed
682            // ignore
683          }
684        }
685      }
686      
687      /**
688       * Not implemented.
689       */
690      @Override
691      public boolean setReplication(Path src, short replication) throws IOException{
692        throw new IOException("Har: setreplication not allowed");
693      }
694      
695      /**
696       * Not implemented.
697       */
698      @Override
699      public boolean delete(Path f, boolean recursive) throws IOException { 
700        throw new IOException("Har: delete not allowed");
701      }
702      
703      /**
704       * liststatus returns the children of a directory 
705       * after looking up the index files.
706       */
707      @Override
708      public FileStatus[] listStatus(Path f) throws IOException {
    // look the path up in the archive index and
    // build synthetic filestatuses to return
    // to the client
713        List<FileStatus> statuses = new ArrayList<FileStatus>();
714        Path tmpPath = makeQualified(f);
715        Path harPath = getPathInHar(tmpPath);
716        HarStatus hstatus = metadata.archive.get(harPath);
717        if (hstatus == null) {
718          throw new FileNotFoundException("File " + f + " not found in " + archivePath);
719        }
720        if (hstatus.isDir()) {
721          fileStatusesInIndex(hstatus, statuses, hstatus.children);
722        } else {
723          statuses.add(toFileStatus(hstatus, null));
724        }
725        
726        return statuses.toArray(new FileStatus[statuses.size()]);
727      }
728      
729      /**
730       * return the top level archive path.
731       */
732      @Override
733      public Path getHomeDirectory() {
734        return new Path(uri.toString());
735      }
736      
737      @Override
738      public void setWorkingDirectory(Path newDir) {
739        //does nothing.
740      }
741      
742      /**
743       * not implemented.
744       */
745      @Override
746      public boolean mkdirs(Path f, FsPermission permission) throws IOException {
747        throw new IOException("Har: mkdirs not allowed");
748      }
749      
750      /**
751       * not implemented.
752       */
753      @Override
754      public void copyFromLocalFile(boolean delSrc, Path src, Path dst) throws 
755            IOException {
756        throw new IOException("Har: copyfromlocalfile not allowed");
757      }
758      
759      /**
760       * copies the file in the har filesystem to a local file.
761       */
762      @Override
763      public void copyToLocalFile(boolean delSrc, Path src, Path dst) 
764        throws IOException {
765        FileUtil.copy(this, src, getLocal(getConf()), dst, false, getConf());
766      }
767      
768      /**
769       * not implemented.
770       */
771      @Override
772      public Path startLocalOutput(Path fsOutputFile, Path tmpLocalFile) 
773        throws IOException {
774        throw new IOException("Har: startLocalOutput not allowed");
775      }
776      
777      /**
778       * not implemented.
779       */
780      @Override
781      public void completeLocalOutput(Path fsOutputFile, Path tmpLocalFile) 
782        throws IOException {
783        throw new IOException("Har: completeLocalOutput not allowed");
784      }
785      
786      /**
787       * not implemented.
788       */
789      @Override
790      public void setOwner(Path p, String username, String groupname)
791        throws IOException {
792        throw new IOException("Har: setowner not allowed");
793      }
794    
795      /**
796       * Not implemented.
797       */
798      @Override
799      public void setPermission(Path p, FsPermission permisssion) 
800        throws IOException {
801        throw new IOException("Har: setPermission not allowed");
802      }
803      
804      /**
805       * Hadoop archives input stream. This input stream fakes EOF 
806       * since archive files are part of bigger part files.
807       */
808      private static class HarFSDataInputStream extends FSDataInputStream {
809        /**
810         * Create an input stream that fakes all the reads/positions/seeking.
811         */
812        private static class HarFsInputStream extends FSInputStream {
813          private long position, start, end;
814          //The underlying data input stream that the
815          // underlying filesystem will return.
816          private FSDataInputStream underLyingStream;
817          //one byte buffer
818          private byte[] oneBytebuff = new byte[1];
819          HarFsInputStream(FileSystem fs, Path path, long start,
820              long length, int bufferSize) throws IOException {
821            underLyingStream = fs.open(path, bufferSize);
822            underLyingStream.seek(start);
823            // the start of this file in the part file
824            this.start = start;
825            // the position pointer in the part file
826            this.position = start;
827            // the end pointer in the part file
828            this.end = start + length;
829          }
830          
831          @Override
832          public synchronized int available() throws IOException {
833            long remaining = end - underLyingStream.getPos();
834            if (remaining > (long)Integer.MAX_VALUE) {
835              return Integer.MAX_VALUE;
836            }
837            return (int) remaining;
838          }
839          
840          @Override
841          public synchronized  void close() throws IOException {
842            underLyingStream.close();
843            super.close();
844          }
845          
846          //not implemented
847          @Override
848          public void mark(int readLimit) {
849            // do nothing 
850          }
851          
852          /**
853           * reset is not implemented
854           */
855          @Override
856          public void reset() throws IOException {
857            throw new IOException("reset not implemented.");
858          }
859          
860          @Override
861          public synchronized int read() throws IOException {
862            int ret = read(oneBytebuff, 0, 1);
863            return (ret <= 0) ? -1: (oneBytebuff[0] & 0xff);
864          }
865          
866          @Override
867          public synchronized int read(byte[] b) throws IOException {
        // delegate to the ranged read, which already advances position
        return read(b, 0, b.length);
873          }
874          
875          /**
       * Read up to len bytes, but never past the end of this file's
       * region within the part file.
877           */
878          @Override
879          public synchronized int read(byte[] b, int offset, int len) 
880            throws IOException {
881            int newlen = len;
882            int ret = -1;
883            if (position + len > end) {
884              newlen = (int) (end - position);
885            }
886            // end case
887            if (newlen == 0) 
888              return ret;
889            ret = underLyingStream.read(b, offset, newlen);
        if (ret > 0) {
          position += ret;
        }
891            return ret;
892          }
893          
894          @Override
895          public synchronized long skip(long n) throws IOException {
896            long tmpN = n;
897            if (tmpN > 0) {
898              if (position + tmpN > end) {
899                tmpN = end - position;
900              }
901              underLyingStream.seek(tmpN + position);
902              position += tmpN;
903              return tmpN;
904            }
905            return (tmpN < 0)? -1 : 0;
906          }
907          
908          @Override
909          public synchronized long getPos() throws IOException {
910            return (position - start);
911          }
912          
913          @Override
914          public synchronized void seek(long pos) throws IOException {
915            if (pos < 0 || (start + pos > end)) {
916              throw new IOException("Failed to seek: EOF");
917            }
918            position = start + pos;
919            underLyingStream.seek(position);
920          }
921    
922          @Override
923          public boolean seekToNewSource(long targetPos) throws IOException {
924            //do not need to implement this
925            // hdfs in itself does seektonewsource 
926            // while reading.
927            return false;
928          }
929          
930          /**
931           * implementing position readable. 
932           */
933          @Override
934          public int read(long pos, byte[] b, int offset, int length) 
935          throws IOException {
936            int nlength = length;
937            if (start + nlength + pos > end) {
938              nlength = (int) (end - (start + pos));
939            }
940            return underLyingStream.read(pos + start , b, offset, nlength);
941          }
942          
943          /**
944           * position readable again.
945           */
946          @Override
947          public void readFully(long pos, byte[] b, int offset, int length) 
948          throws IOException {
949            if (start + length + pos > end) {
950              throw new IOException("Not enough bytes to read.");
951            }
952            underLyingStream.readFully(pos + start, b, offset, length);
953          }
954          
955          @Override
956          public void readFully(long pos, byte[] b) throws IOException {
957              readFully(pos, b, 0, b.length);
958          }
959          
960        }
961      
962        /**
     * Constructor for har input stream.
964         * @param fs the underlying filesystem
965         * @param p The path in the underlying filesystem
966         * @param start the start position in the part file
967         * @param length the length of valid data in the part file
968         * @param bufsize the buffer size
969         * @throws IOException
970         */
971        public HarFSDataInputStream(FileSystem fs, Path  p, long start, 
972            long length, int bufsize) throws IOException {
973            super(new HarFsInputStream(fs, p, start, length, bufsize));
974        }
975    
976        /**
977         * constructor for har input stream.
978         * @param fs the underlying filesystem
979         * @param p the path in the underlying file system
980         * @param start the start position in the part file
981         * @param length the length of valid data in the part file.
982         * @throws IOException
983         */
984        public HarFSDataInputStream(FileSystem fs, Path  p, long start, long length)
985          throws IOException {
986            super(new HarFsInputStream(fs, p, start, length, 0));
987        }
988      }
989    
990      private class HarMetaData {
991        private FileSystem fs;
992        private int version;
993        // the masterIndex of the archive
994        private Path masterIndexPath;
995        // the index file 
996        private Path archiveIndexPath;
997    
998        private long masterIndexTimestamp;
999        private long archiveIndexTimestamp;
1000    
1001        List<Store> stores = new ArrayList<Store>();
1002        Map<Path, HarStatus> archive = new HashMap<Path, HarStatus>();
1003        private Map<Path, FileStatus> partFileStatuses = new HashMap<Path, FileStatus>();
1004    
1005        public HarMetaData(FileSystem fs, Path masterIndexPath, Path archiveIndexPath) {
1006          this.fs = fs;
1007          this.masterIndexPath = masterIndexPath;
1008          this.archiveIndexPath = archiveIndexPath;
1009        }
1010    
1011        public FileStatus getPartFileStatus(Path partPath) throws IOException {
1012          FileStatus status;
1013          status = partFileStatuses.get(partPath);
1014          if (status == null) {
1015            status = fs.getFileStatus(partPath);
1016            partFileStatuses.put(partPath, status);
1017          }
1018          return status;
1019        }
1020    
1021        public long getMasterIndexTimestamp() {
1022          return masterIndexTimestamp;
1023        }
1024    
1025        public long getArchiveIndexTimestamp() {
1026          return archiveIndexTimestamp;
1027        }
1028    
1029        private int getVersion() {
1030          return version;
1031        }
1032    
1033        private void parseMetaData() throws IOException {
1034          Text line;
1035          long read;
1036          FSDataInputStream in = null;
1037          LineReader lin = null;
1038    
1039          try {
1040            in = fs.open(masterIndexPath);
1041            FileStatus masterStat = fs.getFileStatus(masterIndexPath);
1042            masterIndexTimestamp = masterStat.getModificationTime();
1043            lin = new LineReader(in, getConf());
1044            line = new Text();
1045            read = lin.readLine(line);
1046    
1047            // the first line contains the version of the index file
1048            String versionLine = line.toString();
1049            String[] arr = versionLine.split(" ");
1050            version = Integer.parseInt(arr[0]);
        // reject index files written by a newer, unknown version
1052            if (this.version > HarFileSystem.VERSION) {
1053              throw new IOException("Invalid version " + 
1054                  this.version + " expected " + HarFileSystem.VERSION);
1055            }
1056    
        // each remaining line contains a hashcode range and the byte
        // offsets of that range's entries within the _index file
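        // e.g. (illustrative): "0 1073741823 68 1522" means entries for
        // hashes in [0, 1073741823] lie between bytes 68 and 1522 of _index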
1058            String[] readStr = null;
1059            while(read < masterStat.getLen()) {
1060              int b = lin.readLine(line);
1061              read += b;
1062              readStr = line.toString().split(" ");
1063              int startHash = Integer.parseInt(readStr[0]);
1064              int endHash  = Integer.parseInt(readStr[1]);
1065              stores.add(new Store(Long.parseLong(readStr[2]), 
1066                  Long.parseLong(readStr[3]), startHash,
1067                  endHash));
1068              line.clear();
1069            }
1070          } finally {
1071            IOUtils.cleanup(LOG, lin, in);
1072          }
1073    
1074          FSDataInputStream aIn = fs.open(archiveIndexPath);
1075          try {
1076            FileStatus archiveStat = fs.getFileStatus(archiveIndexPath);
1077            archiveIndexTimestamp = archiveStat.getModificationTime();
1078            LineReader aLin;
1079    
1080            // now start reading the real index file
1081            for (Store s: stores) {
1082              read = 0;
1083              aIn.seek(s.begin);
1084              aLin = new LineReader(aIn, getConf());
1085              while (read + s.begin < s.end) {
1086                int tmp = aLin.readLine(line);
1087                read += tmp;
1088                String lineFeed = line.toString();
1089                String[] parsed = lineFeed.split(" ");
1090                parsed[0] = decodeFileName(parsed[0]);
1091                archive.put(new Path(parsed[0]), new HarStatus(lineFeed));
1092                line.clear();
1093              }
1094            }
1095          } finally {
1096            IOUtils.cleanup(LOG, aIn);
1097          }
1098        }
1099      }
1100      
1101      /*
1102       * testing purposes only:
1103       */
1104      HarMetaData getMetadata() {
1105        return metadata;
1106      }
1107    }