001    /**
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     *     http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing, software
013     * distributed under the License is distributed on an "AS IS" BASIS,
014     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015     * See the License for the specific language governing permissions and
016     * limitations under the License.
017     */
018    package org.apache.hadoop.fs;
019    
020    import java.io.FileNotFoundException;
021    import java.io.IOException;
022    import java.io.UnsupportedEncodingException;
023    import java.net.URI;
024    import java.net.URISyntaxException;
025    import java.net.URLDecoder;
026    import java.util.ArrayList;
027    import java.util.EnumSet;
028    import java.util.List;
029    import java.util.Map;
030    import java.util.TreeMap;
031    import java.util.HashMap;
032    
033    import org.apache.hadoop.conf.Configuration;
034    import org.apache.hadoop.fs.permission.FsPermission;
035    import org.apache.hadoop.io.Text;
036    import org.apache.hadoop.util.LineReader;
037    import org.apache.hadoop.util.Progressable;
038    
039    /**
040     * This is an implementation of the Hadoop Archive 
041     * Filesystem. This archive Filesystem has index files
042     * of the form _index* and has contents of the form
043     * part-*. The index files store the indexes of the 
044     * real files. The index files are of the form _masterindex
045     * and _index. The master index is a level of indirection 
046     * in to the index file to make the look ups faster. the index
047     * file is sorted with hash code of the paths that it contains 
048     * and the master index contains pointers to the positions in 
049     * index for ranges of hashcodes.
050     */
051    
052    public class HarFileSystem extends FilterFileSystem {
053      public static final int VERSION = 3;
054    
055      private static final Map<URI, HarMetaData> harMetaCache = new HashMap<URI, HarMetaData>();
056    
057      // uri representation of this Har filesystem
058      private URI uri;
059      // the top level path of the archive
060      // in the underlying file system
061      private Path archivePath;
062      // the har auth
063      private String harAuth;
064    
065      // pointer into the static metadata cache
066      private HarMetaData metadata;
067    
068      /**
069       * public construction of harfilesystem
070       *
071       */
072      public HarFileSystem() {
073      }
074      
075      /**
076       * Constructor to create a HarFileSystem with an
077       * underlying filesystem.
078       * @param fs
079       */
080      public HarFileSystem(FileSystem fs) {
081        super(fs);
082      }
083      
084      /**
085       * Initialize a Har filesystem per har archive. The 
086       * archive home directory is the top level directory
087       * in the filesystem that contains the HAR archive.
088       * Be careful with this method, you do not want to go 
089       * on creating new Filesystem instances per call to 
090       * path.getFileSystem().
091       * the uri of Har is 
092       * har://underlyingfsscheme-host:port/archivepath.
093       * or 
094       * har:///archivepath. This assumes the underlying filesystem
095       * to be used in case not specified.
096       */
097      public void initialize(URI name, Configuration conf) throws IOException {
098        // decode the name
099        URI underLyingURI = decodeHarURI(name, conf);
100        // we got the right har Path- now check if this is 
101        // truly a har filesystem
102        Path harPath = archivePath(
103          new Path(name.getScheme(), name.getAuthority(), name.getPath()));
104        if (harPath == null) { 
105          throw new IOException("Invalid path for the Har Filesystem. " + 
106                               name.toString());
107        }
108        if (fs == null) {
109          fs = FileSystem.get(underLyingURI, conf);
110        }
111        uri = harPath.toUri();
112        archivePath = new Path(uri.getPath());
113        harAuth = getHarAuth(underLyingURI);
114        //check for the underlying fs containing
115        // the index file
116        Path masterIndexPath = new Path(archivePath, "_masterindex");
117        Path archiveIndexPath = new Path(archivePath, "_index");
118        if (!fs.exists(masterIndexPath) || !fs.exists(archiveIndexPath)) {
119          throw new IOException("Invalid path for the Har Filesystem. " +
120              "No index file in " + harPath);
121        }
122    
123        metadata = harMetaCache.get(uri);
124        if (metadata != null) {
125          FileStatus mStat = fs.getFileStatus(masterIndexPath);
126          FileStatus aStat = fs.getFileStatus(archiveIndexPath);
127          if (mStat.getModificationTime() != metadata.getMasterIndexTimestamp() ||
128              aStat.getModificationTime() != metadata.getArchiveIndexTimestamp()) {
129            // the archive has been overwritten since we last read it
130            // remove the entry from the meta data cache
131            metadata = null;
132            harMetaCache.remove(uri);
133          }
134        }
135        if (metadata == null) {
136          metadata = new HarMetaData(fs, masterIndexPath, archiveIndexPath);
137          metadata.parseMetaData();
138          harMetaCache.put(uri, metadata);
139        }
140      }
141    
142      // get the version of the filesystem from the masterindex file
143      // the version is currently not useful since its the first version
144      // of archives
145      public int getHarVersion() throws IOException {
146        if (metadata != null) {
147          return metadata.getVersion();
148        }
149        else {
150          throw new IOException("Invalid meta data for the Har Filesystem");
151        }
152      }
153    
154      /*
155       * find the parent path that is the 
156       * archive path in the path. The last
157       * path segment that ends with .har is 
158       * the path that will be returned.
159       */
160      private Path archivePath(Path p) {
161        Path retPath = null;
162        Path tmp = p;
163        for (int i=0; i< p.depth(); i++) {
164          if (tmp.toString().endsWith(".har")) {
165            retPath = tmp;
166            break;
167          }
168          tmp = tmp.getParent();
169        }
170        return retPath;
171      }
172    
173      /**
174       * decode the raw URI to get the underlying URI
175       * @param rawURI raw Har URI
176       * @return filtered URI of the underlying fileSystem
177       */
178      private URI decodeHarURI(URI rawURI, Configuration conf) throws IOException {
179        String tmpAuth = rawURI.getAuthority();
180        //we are using the default file
181        //system in the config 
182        //so create a underlying uri and 
183        //return it
184        if (tmpAuth == null) {
185          //create a path 
186          return FileSystem.getDefaultUri(conf);
187        }
188        String host = rawURI.getHost();
189        if (host == null) {
190          throw new IOException("URI: " + rawURI
191              + " is an invalid Har URI since host==null."
192              + "  Expecting har://<scheme>-<host>/<path>.");
193        }
194        int i = host.indexOf('-');
195        if (i < 0) {
196          throw new IOException("URI: " + rawURI
197              + " is an invalid Har URI since '-' not found."
198              + "  Expecting har://<scheme>-<host>/<path>.");
199        }
200        final String underLyingScheme = host.substring(0, i);
201        i++;
202        final String underLyingHost = i == host.length()? null: host.substring(i);
203        int underLyingPort = rawURI.getPort();
204        String auth = (underLyingHost == null && underLyingPort == -1)?
205                      null:(underLyingHost+":"+underLyingPort);
206        URI tmp = null;
207        if (rawURI.getQuery() != null) {
208          // query component not allowed
209          throw new IOException("query component in Path not supported  " + rawURI);
210        }
211        try {
212          tmp = new URI(underLyingScheme, auth, rawURI.getPath(), 
213                rawURI.getQuery(), rawURI.getFragment());
214        } catch (URISyntaxException e) {
215            // do nothing should not happen
216        }
217        return tmp;
218      }
219    
220      private static String decodeString(String str)
221        throws UnsupportedEncodingException {
222        return URLDecoder.decode(str, "UTF-8");
223      }
224    
225      private String decodeFileName(String fname) 
226        throws UnsupportedEncodingException {
227        int version = metadata.getVersion();
228        if (version == 2 || version == 3){
229          return decodeString(fname);
230        }
231        return fname;
232      }
233    
234      /**
235       * return the top level archive.
236       */
237      public Path getWorkingDirectory() {
238        return new Path(uri.toString());
239      }
240      
241      /**
242       * Create a har specific auth 
243       * har-underlyingfs:port
244       * @param underLyingURI the uri of underlying
245       * filesystem
246       * @return har specific auth
247       */
248      private String getHarAuth(URI underLyingUri) {
249        String auth = underLyingUri.getScheme() + "-";
250        if (underLyingUri.getHost() != null) {
251          auth += underLyingUri.getHost() + ":";
252          if (underLyingUri.getPort() != -1) {
253            auth +=  underLyingUri.getPort();
254          }
255        }
256        else {
257          auth += ":";
258        }
259        return auth;
260      }
261      
262      /**
263       * Returns the uri of this filesystem.
264       * The uri is of the form 
265       * har://underlyingfsschema-host:port/pathintheunderlyingfs
266       */
267      @Override
268      public URI getUri() {
269        return this.uri;
270      }
271      
272      /**
273       * this method returns the path 
274       * inside the har filesystem.
275       * this is relative path inside 
276       * the har filesystem.
277       * @param path the fully qualified path in the har filesystem.
278       * @return relative path in the filesystem.
279       */
280      private Path getPathInHar(Path path) {
281        Path harPath = new Path(path.toUri().getPath());
282        if (archivePath.compareTo(harPath) == 0)
283          return new Path(Path.SEPARATOR);
284        Path tmp = new Path(harPath.getName());
285        Path parent = harPath.getParent();
286        while (!(parent.compareTo(archivePath) == 0)) {
287          if (parent.toString().equals(Path.SEPARATOR)) {
288            tmp = null;
289            break;
290          }
291          tmp = new Path(parent.getName(), tmp);
292          parent = parent.getParent();
293        }
294        if (tmp != null) 
295          tmp = new Path(Path.SEPARATOR, tmp);
296        return tmp;
297      }
298      
299      //the relative path of p. basically 
300      // getting rid of /. Parsing and doing 
301      // string manipulation is not good - so
302      // just use the path api to do it.
303      private Path makeRelative(String initial, Path p) {
304        String scheme = this.uri.getScheme();
305        String authority = this.uri.getAuthority();
306        Path root = new Path(Path.SEPARATOR);
307        if (root.compareTo(p) == 0)
308          return new Path(scheme, authority, initial);
309        Path retPath = new Path(p.getName());
310        Path parent = p.getParent();
311        for (int i=0; i < p.depth()-1; i++) {
312          retPath = new Path(parent.getName(), retPath);
313          parent = parent.getParent();
314        }
315        return new Path(new Path(scheme, authority, initial),
316          retPath.toString());
317      }
318      
319      /* this makes a path qualified in the har filesystem
320       * (non-Javadoc)
321       * @see org.apache.hadoop.fs.FilterFileSystem#makeQualified(
322       * org.apache.hadoop.fs.Path)
323       */
324      @Override
325      public Path makeQualified(Path path) {
326        // make sure that we just get the 
327        // path component 
328        Path fsPath = path;
329        if (!path.isAbsolute()) {
330          fsPath = new Path(archivePath, path);
331        }
332    
333        URI tmpURI = fsPath.toUri();
334        //change this to Har uri 
335        return new Path(uri.getScheme(), harAuth, tmpURI.getPath());
336      }
337    
338      /**
339       * Fix offset and length of block locations.
340       * Note that this method modifies the original array.
341       * @param locations block locations of har part file
342       * @param start the start of the desired range in the contained file
343       * @param len the length of the desired range
344       * @param fileOffsetInHar the offset of the desired file in the har part file
345       * @return block locations with fixed offset and length
346       */  
347      static BlockLocation[] fixBlockLocations(BlockLocation[] locations,
348                                              long start,
349                                              long len,
350                                              long fileOffsetInHar) {
351        // offset 1 past last byte of desired range
352        long end = start + len;
353    
354        for (BlockLocation location : locations) {
355          // offset of part block relative to beginning of desired file
356          // (may be negative if file starts in this part block)
357          long harBlockStart = location.getOffset() - fileOffsetInHar;
358          // offset 1 past last byte of har block relative to beginning of
359          // desired file
360          long harBlockEnd = harBlockStart + location.getLength();
361          
362          if (start > harBlockStart) {
363            // desired range starts after beginning of this har block
364            // fix offset to beginning of relevant range (relative to desired file)
365            location.setOffset(start);
366            // fix length to relevant portion of har block
367            location.setLength(location.getLength() - (start - harBlockStart));
368          } else {
369            // desired range includes beginning of this har block
370            location.setOffset(harBlockStart);
371          }
372          
373          if (harBlockEnd > end) {
374            // range ends before end of this har block
375            // fix length to remove irrelevant portion at the end
376            location.setLength(location.getLength() - (harBlockEnd - end));
377          }
378        }
379        
380        return locations;
381      }
382      
383      /**
384       * Get block locations from the underlying fs and fix their
385       * offsets and lengths.
386       * @param file the input filestatus to get block locations
387       * @param start the start of the desired range in the contained file
388       * @param len the length of the desired range
389       * @return block locations for this segment of file
390       * @throws IOException
391       */
392      @Override
393      public BlockLocation[] getFileBlockLocations(FileStatus file, long start,
394                                                   long len) throws IOException {
395        HarStatus hstatus = getFileHarStatus(file.getPath());
396        Path partPath = new Path(archivePath, hstatus.getPartName());
397        FileStatus partStatus = metadata.getPartFileStatus(partPath);
398    
399        // get all part blocks that overlap with the desired file blocks
400        BlockLocation[] locations = 
401          fs.getFileBlockLocations(partStatus,
402                                   hstatus.getStartIndex() + start, len);
403    
404        return fixBlockLocations(locations, start, len, hstatus.getStartIndex());
405      }
406      
407      /**
408       * the hash of the path p inside iniside
409       * the filesystem
410       * @param p the path in the harfilesystem
411       * @return the hash code of the path.
412       */
413      public static int getHarHash(Path p) {
414        return (p.toString().hashCode() & 0x7fffffff);
415      }
416      
417      static class Store {
418        public Store() {
419          begin = end = startHash = endHash = 0;
420        }
421        public Store(long begin, long end, int startHash, int endHash) {
422          this.begin = begin;
423          this.end = end;
424          this.startHash = startHash;
425          this.endHash = endHash;
426        }
427        public long begin;
428        public long end;
429        public int startHash;
430        public int endHash;
431      }
432      
433      /**
434       * Get filestatuses of all the children of a given directory. This just reads
435       * through index file and reads line by line to get all statuses for children
436       * of a directory. Its a brute force way of getting all such filestatuses
437       * 
438       * @param parent
439       *          the parent path directory
440       * @param statuses
441       *          the list to add the children filestatuses to
442       * @param children
443       *          the string list of children for this parent
444       * @param archiveIndexStat
445       *          the archive index filestatus
446       */
447      private void fileStatusesInIndex(HarStatus parent, List<FileStatus> statuses,
448          List<String> children) throws IOException {
449        String parentString = parent.getName();
450        if (!parentString.endsWith(Path.SEPARATOR)){
451            parentString += Path.SEPARATOR;
452        }
453        Path harPath = new Path(parentString);
454        int harlen = harPath.depth();
455        final Map<String, FileStatus> cache = new TreeMap<String, FileStatus>();
456    
457        for (HarStatus hstatus : metadata.archive.values()) {
458          String child = hstatus.getName();
459          if ((child.startsWith(parentString))) {
460            Path thisPath = new Path(child);
461            if (thisPath.depth() == harlen + 1) {
462              statuses.add(toFileStatus(hstatus, cache));
463            }
464          }
465        }
466      }
467    
468      /**
469       * Combine the status stored in the index and the underlying status. 
470       * @param h status stored in the index
471       * @param cache caching the underlying file statuses
472       * @return the combined file status
473       * @throws IOException
474       */
475      private FileStatus toFileStatus(HarStatus h,
476          Map<String, FileStatus> cache) throws IOException {
477        FileStatus underlying = null;
478        if (cache != null) {
479          underlying = cache.get(h.partName);
480        }
481        if (underlying == null) {
482          final Path p = h.isDir? archivePath: new Path(archivePath, h.partName);
483          underlying = fs.getFileStatus(p);
484          if (cache != null) {
485            cache.put(h.partName, underlying);
486          }
487        }
488    
489        long modTime = 0;
490        int version = metadata.getVersion();
491        if (version < 3) {
492          modTime = underlying.getModificationTime();
493        } else if (version == 3) {
494          modTime = h.getModificationTime();
495        }
496    
497        return new FileStatus(
498            h.isDir()? 0L: h.getLength(),
499            h.isDir(),
500            underlying.getReplication(),
501            underlying.getBlockSize(),
502            modTime,
503            underlying.getAccessTime(),
504            underlying.getPermission(),
505            underlying.getOwner(),
506            underlying.getGroup(),
507            makeRelative(this.uri.getPath(), new Path(h.name)));
508      }
509    
510      // a single line parser for hadoop archives status 
511      // stored in a single line in the index files 
512      // the format is of the form 
513      // filename "dir"/"file" partFileName startIndex length 
514      // <space seperated children>
515      private class HarStatus {
516        boolean isDir;
517        String name;
518        List<String> children;
519        String partName;
520        long startIndex;
521        long length;
522        long modificationTime = 0;
523    
524        public HarStatus(String harString) throws UnsupportedEncodingException {
525          String[] splits = harString.split(" ");
526          this.name = decodeFileName(splits[0]);
527          this.isDir = "dir".equals(splits[1]) ? true: false;
528          // this is equal to "none" if its a directory
529          this.partName = splits[2];
530          this.startIndex = Long.parseLong(splits[3]);
531          this.length = Long.parseLong(splits[4]);
532    
533          int version = metadata.getVersion();
534          String[] propSplits = null;
535          // propSplits is used to retrieve the metainformation that Har versions
536          // 1 & 2 missed (modification time, permission, owner group).
537          // These fields are stored in an encoded string placed in different
538          // locations depending on whether it's a file or directory entry.
539          // If it's a directory, the string will be placed at the partName
540          // location (directories have no partName because they don't have data
541          // to be stored). This is done because the number of fields in a
542          // directory entry is unbounded (all children are listed at the end)
543          // If it's a file, the string will be the last field.
544          if (isDir) {
545            if (version == 3){
546              propSplits = decodeString(this.partName).split(" ");
547            }
548            children = new ArrayList<String>();
549            for (int i = 5; i < splits.length; i++) {
550              children.add(decodeFileName(splits[i]));
551            }
552          } else if (version == 3) {
553            propSplits = decodeString(splits[5]).split(" ");
554          }
555    
556          if (propSplits != null && propSplits.length >= 4) {
557            modificationTime = Long.parseLong(propSplits[0]);
558            // the fields below are stored in the file but are currently not used
559            // by HarFileSystem
560            // permission = new FsPermission(Short.parseShort(propSplits[1]));
561            // owner = decodeString(propSplits[2]);
562            // group = decodeString(propSplits[3]);
563          }
564        }
565        public boolean isDir() {
566          return isDir;
567        }
568        
569        public String getName() {
570          return name;
571        }
572        
573        public List<String> getChildren() {
574          return children;
575        }
576        public String getFileName() {
577          return name;
578        }
579        public String getPartName() {
580          return partName;
581        }
582        public long getStartIndex() {
583          return startIndex;
584        }
585        public long getLength() {
586          return length;
587        }
588        public long getModificationTime() {
589          return modificationTime;
590        }
591      }
592      
593      /**
594       * return the filestatus of files in har archive.
595       * The permission returned are that of the archive
596       * index files. The permissions are not persisted 
597       * while creating a hadoop archive.
598       * @param f the path in har filesystem
599       * @return filestatus.
600       * @throws IOException
601       */
602      @Override
603      public FileStatus getFileStatus(Path f) throws IOException {
604        HarStatus hstatus = getFileHarStatus(f);
605        return toFileStatus(hstatus, null);
606      }
607    
608      private HarStatus getFileHarStatus(Path f) throws IOException {
609        // get the fs DataInputStream for the underlying file
610        // look up the index.
611        Path p = makeQualified(f);
612        Path harPath = getPathInHar(p);
613        if (harPath == null) {
614          throw new IOException("Invalid file name: " + f + " in " + uri);
615        }
616        HarStatus hstatus = metadata.archive.get(harPath);
617        if (hstatus == null) {
618          throw new FileNotFoundException("File: " +  f + " does not exist in " + uri);
619        }
620        return hstatus;
621      }
622    
623      /**
624       * @return null since no checksum algorithm is implemented.
625       */
626      public FileChecksum getFileChecksum(Path f) {
627        return null;
628      }
629    
630      /**
631       * Returns a har input stream which fakes end of 
632       * file. It reads the index files to get the part 
633       * file name and the size and start of the file.
634       */
635      @Override
636      public FSDataInputStream open(Path f, int bufferSize) throws IOException {
637        // get the fs DataInputStream for the underlying file
638        HarStatus hstatus = getFileHarStatus(f);
639        // we got it.. woo hooo!!! 
640        if (hstatus.isDir()) {
641          throw new FileNotFoundException(f + " : not a file in " +
642                    archivePath);
643        }
644        return new HarFSDataInputStream(fs, new Path(archivePath, 
645            hstatus.getPartName()),
646            hstatus.getStartIndex(), hstatus.getLength(), bufferSize);
647      }
648     
649      /*
650       * create throws an exception in Har filesystem.
651       * The archive once created cannot be changed.
652       */
653      public FSDataOutputStream create(Path f, int bufferSize) 
654                                        throws IOException {
655        throw new IOException("Har: Create not allowed");
656      }
657      
658      public FSDataOutputStream create(Path f,
659          FsPermission permission,
660          boolean overwrite,
661          int bufferSize,
662          short replication,
663          long blockSize,
664          Progressable progress) throws IOException {
665        throw new IOException("Har: create not allowed.");
666      }
667      
668      @Override
669      public void close() throws IOException {
670        if (fs != null) {
671          try {
672            fs.close();
673          } catch(IOException ie) {
674            //this might already be closed
675            // ignore
676          }
677        }
678      }
679      
680      /**
681       * Not implemented.
682       */
683      @Override
684      public boolean setReplication(Path src, short replication) throws IOException{
685        throw new IOException("Har: setreplication not allowed");
686      }
687      
688      /**
689       * Not implemented.
690       */
691      @Override
692      public boolean delete(Path f, boolean recursive) throws IOException { 
693        throw new IOException("Har: delete not allowed");
694      }
695      
696      /**
697       * liststatus returns the children of a directory 
698       * after looking up the index files.
699       */
700      @Override
701      public FileStatus[] listStatus(Path f) throws IOException {
702        //need to see if the file is an index in file
703        //get the filestatus of the archive directory
704        // we will create fake filestatuses to return
705        // to the client
706        List<FileStatus> statuses = new ArrayList<FileStatus>();
707        Path tmpPath = makeQualified(f);
708        Path harPath = getPathInHar(tmpPath);
709        HarStatus hstatus = metadata.archive.get(harPath);
710        if (hstatus == null) {
711          throw new FileNotFoundException("File " + f + " not found in " + archivePath);
712        }
713        if (hstatus.isDir()) {
714          fileStatusesInIndex(hstatus, statuses, hstatus.children);
715        } else {
716          statuses.add(toFileStatus(hstatus, null));
717        }
718        
719        return statuses.toArray(new FileStatus[statuses.size()]);
720      }
721      
722      /**
723       * return the top level archive path.
724       */
725      public Path getHomeDirectory() {
726        return new Path(uri.toString());
727      }
728      
729      public void setWorkingDirectory(Path newDir) {
730        //does nothing.
731      }
732      
733      /**
734       * not implemented.
735       */
736      public boolean mkdirs(Path f, FsPermission permission) throws IOException {
737        throw new IOException("Har: mkdirs not allowed");
738      }
739      
740      /**
741       * not implemented.
742       */
743      public void copyFromLocalFile(boolean delSrc, Path src, Path dst) throws 
744            IOException {
745        throw new IOException("Har: copyfromlocalfile not allowed");
746      }
747      
748      /**
749       * copies the file in the har filesystem to a local file.
750       */
751      public void copyToLocalFile(boolean delSrc, Path src, Path dst) 
752        throws IOException {
753        FileUtil.copy(this, src, getLocal(getConf()), dst, false, getConf());
754      }
755      
756      /**
757       * not implemented.
758       */
759      public Path startLocalOutput(Path fsOutputFile, Path tmpLocalFile) 
760        throws IOException {
761        throw new IOException("Har: startLocalOutput not allowed");
762      }
763      
764      /**
765       * not implemented.
766       */
767      public void completeLocalOutput(Path fsOutputFile, Path tmpLocalFile) 
768        throws IOException {
769        throw new IOException("Har: completeLocalOutput not allowed");
770      }
771      
772      /**
773       * not implemented.
774       */
775      public void setOwner(Path p, String username, String groupname)
776        throws IOException {
777        throw new IOException("Har: setowner not allowed");
778      }
779    
780      /**
781       * Not implemented.
782       */
783      public void setPermission(Path p, FsPermission permisssion) 
784        throws IOException {
785        throw new IOException("Har: setPermission not allowed");
786      }
787      
788      /**
789       * Hadoop archives input stream. This input stream fakes EOF 
790       * since archive files are part of bigger part files.
791       */
792      private static class HarFSDataInputStream extends FSDataInputStream {
793        /**
794         * Create an input stream that fakes all the reads/positions/seeking.
795         */
796        private static class HarFsInputStream extends FSInputStream {
797          private long position, start, end;
798          //The underlying data input stream that the
799          // underlying filesystem will return.
800          private FSDataInputStream underLyingStream;
801          //one byte buffer
802          private byte[] oneBytebuff = new byte[1];
803          HarFsInputStream(FileSystem fs, Path path, long start,
804              long length, int bufferSize) throws IOException {
805            underLyingStream = fs.open(path, bufferSize);
806            underLyingStream.seek(start);
807            // the start of this file in the part file
808            this.start = start;
809            // the position pointer in the part file
810            this.position = start;
811            // the end pointer in the part file
812            this.end = start + length;
813          }
814          
815          public synchronized int available() throws IOException {
816            long remaining = end - underLyingStream.getPos();
817            if (remaining > (long)Integer.MAX_VALUE) {
818              return Integer.MAX_VALUE;
819            }
820            return (int) remaining;
821          }
822          
823          public synchronized  void close() throws IOException {
824            underLyingStream.close();
825            super.close();
826          }
827          
828          //not implemented
829          @Override
830          public void mark(int readLimit) {
831            // do nothing 
832          }
833          
834          /**
835           * reset is not implemented
836           */
837          public void reset() throws IOException {
838            throw new IOException("reset not implemented.");
839          }
840          
841          public synchronized int read() throws IOException {
842            int ret = read(oneBytebuff, 0, 1);
843            return (ret <= 0) ? -1: (oneBytebuff[0] & 0xff);
844          }
845          
846          public synchronized int read(byte[] b) throws IOException {
847            int ret = read(b, 0, b.length);
848            if (ret != -1) {
849              position += ret;
850            }
851            return ret;
852          }
853          
854          /**
855           * 
856           */
857          public synchronized int read(byte[] b, int offset, int len) 
858            throws IOException {
859            int newlen = len;
860            int ret = -1;
861            if (position + len > end) {
862              newlen = (int) (end - position);
863            }
864            // end case
865            if (newlen == 0) 
866              return ret;
867            ret = underLyingStream.read(b, offset, newlen);
868            position += ret;
869            return ret;
870          }
871          
872          public synchronized long skip(long n) throws IOException {
873            long tmpN = n;
874            if (tmpN > 0) {
875              if (position + tmpN > end) {
876                tmpN = end - position;
877              }
878              underLyingStream.seek(tmpN + position);
879              position += tmpN;
880              return tmpN;
881            }
882            return (tmpN < 0)? -1 : 0;
883          }
884          
885          public synchronized long getPos() throws IOException {
886            return (position - start);
887          }
888          
889          public synchronized void seek(long pos) throws IOException {
890            if (pos < 0 || (start + pos > end)) {
891              throw new IOException("Failed to seek: EOF");
892            }
893            position = start + pos;
894            underLyingStream.seek(position);
895          }
896    
897          public boolean seekToNewSource(long targetPos) throws IOException {
898            //do not need to implement this
899            // hdfs in itself does seektonewsource 
900            // while reading.
901            return false;
902          }
903          
904          /**
905           * implementing position readable. 
906           */
907          public int read(long pos, byte[] b, int offset, int length) 
908          throws IOException {
909            int nlength = length;
910            if (start + nlength + pos > end) {
911              nlength = (int) (end - (start + pos));
912            }
913            return underLyingStream.read(pos + start , b, offset, nlength);
914          }
915          
916          /**
917           * position readable again.
918           */
919          public void readFully(long pos, byte[] b, int offset, int length) 
920          throws IOException {
921            if (start + length + pos > end) {
922              throw new IOException("Not enough bytes to read.");
923            }
924            underLyingStream.readFully(pos + start, b, offset, length);
925          }
926          
927          public void readFully(long pos, byte[] b) throws IOException {
928              readFully(pos, b, 0, b.length);
929          }
930          
931        }
932      
933        /**
934         * constructors for har input stream.
935         * @param fs the underlying filesystem
936         * @param p The path in the underlying filesystem
937         * @param start the start position in the part file
938         * @param length the length of valid data in the part file
939         * @param bufsize the buffer size
940         * @throws IOException
941         */
942        public HarFSDataInputStream(FileSystem fs, Path  p, long start, 
943            long length, int bufsize) throws IOException {
944            super(new HarFsInputStream(fs, p, start, length, bufsize));
945        }
946    
947        /**
948         * constructor for har input stream.
949         * @param fs the underlying filesystem
950         * @param p the path in the underlying file system
951         * @param start the start position in the part file
952         * @param length the length of valid data in the part file.
953         * @throws IOException
954         */
955        public HarFSDataInputStream(FileSystem fs, Path  p, long start, long length)
956          throws IOException {
957            super(new HarFsInputStream(fs, p, start, length, 0));
958        }
959      }
960    
961      private class HarMetaData {
962        private FileSystem fs;
963        private int version;
964        // the masterIndex of the archive
965        private Path masterIndexPath;
966        // the index file 
967        private Path archiveIndexPath;
968    
969        private long masterIndexTimestamp;
970        private long archiveIndexTimestamp;
971    
972        List<Store> stores = new ArrayList<Store>();
973        Map<Path, HarStatus> archive = new HashMap<Path, HarStatus>();
974        private Map<Path, FileStatus> partFileStatuses = new HashMap<Path, FileStatus>();
975    
976        public HarMetaData(FileSystem fs, Path masterIndexPath, Path archiveIndexPath) {
977          this.fs = fs;
978          this.masterIndexPath = masterIndexPath;
979          this.archiveIndexPath = archiveIndexPath;
980        }
981    
982        public FileStatus getPartFileStatus(Path partPath) throws IOException {
983          FileStatus status;
984          status = partFileStatuses.get(partPath);
985          if (status == null) {
986            status = fs.getFileStatus(partPath);
987            partFileStatuses.put(partPath, status);
988          }
989          return status;
990        }
991    
992        public long getMasterIndexTimestamp() {
993          return masterIndexTimestamp;
994        }
995    
996        public long getArchiveIndexTimestamp() {
997          return archiveIndexTimestamp;
998        }
999    
1000        private int getVersion() {
1001          return version;
1002        }
1003    
1004        private void parseMetaData() throws IOException {
1005          FSDataInputStream in = fs.open(masterIndexPath);
1006          FileStatus masterStat = fs.getFileStatus(masterIndexPath);
1007          masterIndexTimestamp = masterStat.getModificationTime();
1008          LineReader lin = new LineReader(in, getConf());
1009          Text line = new Text();
1010          long read = lin.readLine(line);
1011    
1012         // the first line contains the version of the index file
1013          String versionLine = line.toString();
1014          String[] arr = versionLine.split(" ");
1015          version = Integer.parseInt(arr[0]);
1016          // make it always backwards-compatible
1017          if (this.version > HarFileSystem.VERSION) {
1018            throw new IOException("Invalid version " + 
1019                this.version + " expected " + HarFileSystem.VERSION);
1020          }
1021    
1022          // each line contains a hashcode range and the index file name
1023          String[] readStr = null;
1024          while(read < masterStat.getLen()) {
1025            int b = lin.readLine(line);
1026            read += b;
1027            readStr = line.toString().split(" ");
1028            int startHash = Integer.parseInt(readStr[0]);
1029            int endHash  = Integer.parseInt(readStr[1]);
1030            stores.add(new Store(Long.parseLong(readStr[2]), 
1031                Long.parseLong(readStr[3]), startHash,
1032                endHash));
1033            line.clear();
1034          }
1035          try {
1036            // close the master index
1037            lin.close();
1038          } catch(IOException io){
1039            // do nothing just a read.
1040          }
1041    
1042          FSDataInputStream aIn = fs.open(archiveIndexPath);
1043          FileStatus archiveStat = fs.getFileStatus(archiveIndexPath);
1044          archiveIndexTimestamp = archiveStat.getModificationTime();
1045          LineReader aLin;
1046          String retStr = null;
1047          // now start reading the real index file
1048          for (Store s: stores) {
1049            read = 0;
1050            aIn.seek(s.begin);
1051            aLin = new LineReader(aIn, getConf());
1052            while (read + s.begin < s.end) {
1053              int tmp = aLin.readLine(line);
1054              read += tmp;
1055              String lineFeed = line.toString();
1056              String[] parsed = lineFeed.split(" ");
1057              parsed[0] = decodeFileName(parsed[0]);
1058              archive.put(new Path(parsed[0]), new HarStatus(lineFeed));
1059              line.clear();
1060            }
1061          }
1062          try {
1063            // close the archive index
1064            aIn.close();
1065          } catch(IOException io) {
1066            // do nothing just a read.
1067          }
1068        }
1069      }
1070    }