001    /**
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     *     http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing, software
013     * distributed under the License is distributed on an "AS IS" BASIS,
014     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015     * See the License for the specific language governing permissions and
016     * limitations under the License.
017     */
018    package org.apache.hadoop.fs;
019    
020    import java.io.FileNotFoundException;
021    import java.io.IOException;
022    import java.io.UnsupportedEncodingException;
023    import java.net.URI;
024    import java.net.URISyntaxException;
025    import java.net.URLDecoder;
026    import java.util.ArrayList;
027    import java.util.List;
028    import java.util.Map;
029    import java.util.TreeMap;
030    import java.util.HashMap;
031    import java.util.concurrent.ConcurrentHashMap;
032    
033    import org.apache.hadoop.conf.Configuration;
034    import org.apache.hadoop.fs.permission.FsPermission;
035    import org.apache.hadoop.io.Text;
036    import org.apache.hadoop.util.LineReader;
037    import org.apache.hadoop.util.Progressable;
038    
/**
 * This is an implementation of the Hadoop Archive
 * Filesystem. An archive has index files of the form
 * _index* and data files of the form part-*. The index
 * files store the offsets and lengths of the archived
 * files and come in two forms: _masterindex and _index.
 * The master index is a level of indirection into the
 * index file that makes lookups faster. The index file
 * is sorted by the hash code of the paths it contains,
 * and the master index holds pointers to the positions
 * in the index file for ranges of hash codes.
 */
051    
052    public class HarFileSystem extends FilterFileSystem {
053      public static final int VERSION = 3;
054    
055      private static final Map<URI, HarMetaData> harMetaCache =
056          new ConcurrentHashMap<URI, HarMetaData>();
057    
058      // uri representation of this Har filesystem
059      private URI uri;
060      // the top level path of the archive
061      // in the underlying file system
062      private Path archivePath;
063      // the har auth
064      private String harAuth;
065    
066      // pointer into the static metadata cache
067      private HarMetaData metadata;
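
  /*
   * For orientation, a har archive created at the (hypothetical) location
   * /user/alice/foo.har would typically look like this in the underlying
   * filesystem:
   *
   *   /user/alice/foo.har/_masterindex   <- hash ranges and offsets into _index
   *   /user/alice/foo.har/_index         <- one status line per archived path
   *   /user/alice/foo.har/part-0         <- concatenated file data
   *
   * archivePath points at /user/alice/foo.har, and metadata caches the
   * parsed contents of the two index files.
   */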
068    
  /**
   * Public no-arg constructor; the filesystem is not usable
   * until {@link #initialize(URI, Configuration)} is called.
   */
073      public HarFileSystem() {
074      }
075      
  /**
   * Constructor to create a HarFileSystem with an
   * underlying filesystem.
   * @param fs the underlying filesystem
   */
081      public HarFileSystem(FileSystem fs) {
082        super(fs);
083      }
084      
  /**
   * Initialize a Har filesystem per har archive. The
   * archive home directory is the top level directory
   * in the filesystem that contains the HAR archive.
   * Be careful with this method: you do not want to go
   * on creating new filesystem instances per call to
   * path.getFileSystem().
   * The har URI is of the form
   * har://underlyingfsscheme-host:port/archivepath
   * or
   * har:///archivepath. The latter form uses the default
   * filesystem from the configuration as the underlying
   * filesystem. (A usage sketch follows this method.)
   */
098      public void initialize(URI name, Configuration conf) throws IOException {
099        // decode the name
100        URI underLyingURI = decodeHarURI(name, conf);
    // we got the right har Path - now check if this is
102        // truly a har filesystem
103        Path harPath = archivePath(
104          new Path(name.getScheme(), name.getAuthority(), name.getPath()));
105        if (harPath == null) { 
106          throw new IOException("Invalid path for the Har Filesystem. " + 
107                               name.toString());
108        }
109        if (fs == null) {
110          fs = FileSystem.get(underLyingURI, conf);
111        }
112        uri = harPath.toUri();
113        archivePath = new Path(uri.getPath());
114        harAuth = getHarAuth(underLyingURI);
115        //check for the underlying fs containing
116        // the index file
117        Path masterIndexPath = new Path(archivePath, "_masterindex");
118        Path archiveIndexPath = new Path(archivePath, "_index");
119        if (!fs.exists(masterIndexPath) || !fs.exists(archiveIndexPath)) {
120          throw new IOException("Invalid path for the Har Filesystem. " +
121              "No index file in " + harPath);
122        }
123    
124        metadata = harMetaCache.get(uri);
125        if (metadata != null) {
126          FileStatus mStat = fs.getFileStatus(masterIndexPath);
127          FileStatus aStat = fs.getFileStatus(archiveIndexPath);
128          if (mStat.getModificationTime() != metadata.getMasterIndexTimestamp() ||
129              aStat.getModificationTime() != metadata.getArchiveIndexTimestamp()) {
130            // the archive has been overwritten since we last read it
131            // remove the entry from the meta data cache
132            metadata = null;
133            harMetaCache.remove(uri);
134          }
135        }
136        if (metadata == null) {
137          metadata = new HarMetaData(fs, masterIndexPath, archiveIndexPath);
138          metadata.parseMetaData();
139          harMetaCache.put(uri, metadata);
140        }
141      }
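
  /*
   * A minimal usage sketch (the archive location and file name below are
   * hypothetical): clients normally obtain a HarFileSystem through the
   * har:// URI of a path inside the archive and then read it like any
   * other FileSystem. Writes are not supported.
   *
   *   Configuration conf = new Configuration();
   *   // or fully qualified: har://hdfs-namenode:8020/user/alice/foo.har/docs/a.txt
   *   Path p = new Path("har:///user/alice/foo.har/docs/a.txt");
   *   FileSystem harFs = p.getFileSystem(conf);   // normally a HarFileSystem
   *   FSDataInputStream in = harFs.open(p);
   *   try {
   *     // ... read the archived file ...
   *   } finally {
   *     in.close();
   *   }
   */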
142    
  // Get the version of the filesystem from the masterindex file;
  // the version is used for backwards-compatible parsing of the
  // index files.
146      public int getHarVersion() throws IOException {
147        if (metadata != null) {
148          return metadata.getVersion();
149        }
150        else {
151          throw new IOException("Invalid meta data for the Har Filesystem");
152        }
153      }
154    
  /*
   * Find the archive path within p, i.e. the deepest
   * ancestor of p (including p itself) whose last path
   * segment ends with ".har". Returns null if p is not
   * inside an archive.
   */
161      private Path archivePath(Path p) {
162        Path retPath = null;
163        Path tmp = p;
164        for (int i=0; i< p.depth(); i++) {
165          if (tmp.toString().endsWith(".har")) {
166            retPath = tmp;
167            break;
168          }
169          tmp = tmp.getParent();
170        }
171        return retPath;
172      }
173    
  /**
   * Decode the raw URI to get the URI of the underlying filesystem.
   * @param rawURI raw Har URI
   * @param conf configuration used to resolve the default filesystem
   * @return filtered URI of the underlying fileSystem
   */
179      private URI decodeHarURI(URI rawURI, Configuration conf) throws IOException {
    String tmpAuth = rawURI.getAuthority();
    // no authority given, so we are using the default
    // filesystem in the config; return its uri as the
    // underlying uri
    if (tmpAuth == null) {
      return FileSystem.getDefaultUri(conf);
    }
189        String host = rawURI.getHost();
190        if (host == null) {
191          throw new IOException("URI: " + rawURI
192              + " is an invalid Har URI since host==null."
193              + "  Expecting har://<scheme>-<host>/<path>.");
194        }
195        int i = host.indexOf('-');
196        if (i < 0) {
197          throw new IOException("URI: " + rawURI
198              + " is an invalid Har URI since '-' not found."
199              + "  Expecting har://<scheme>-<host>/<path>.");
200        }
201        final String underLyingScheme = host.substring(0, i);
202        i++;
203        final String underLyingHost = i == host.length()? null: host.substring(i);
204        int underLyingPort = rawURI.getPort();
205        String auth = (underLyingHost == null && underLyingPort == -1)?
206                      null:(underLyingHost+":"+underLyingPort);
207        URI tmp = null;
208        if (rawURI.getQuery() != null) {
209          // query component not allowed
210          throw new IOException("query component in Path not supported  " + rawURI);
211        }
212        try {
213          tmp = new URI(underLyingScheme, auth, rawURI.getPath(), 
214                rawURI.getQuery(), rawURI.getFragment());
    } catch (URISyntaxException e) {
      // should not normally happen; surface it rather than returning null
      throw new IOException("URI: " + rawURI + " is an invalid Har URI.");
    }
218        return tmp;
219      }
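
  /*
   * For illustration (hypothetical host and path), decodeHarURI maps
   *
   *   har://hdfs-namenode:8020/user/alice/foo.har -> hdfs://namenode:8020/user/alice/foo.har
   *   har:///user/alice/foo.har                   -> the default filesystem URI from the config
   *
   * i.e. the underlying scheme is the part of the authority before the
   * first '-', and the rest of the authority becomes the underlying
   * host:port.
   */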
220    
221      private static String decodeString(String str)
222        throws UnsupportedEncodingException {
223        return URLDecoder.decode(str, "UTF-8");
224      }
225    
226      private String decodeFileName(String fname) 
227        throws UnsupportedEncodingException {
228        int version = metadata.getVersion();
229        if (version == 2 || version == 3){
230          return decodeString(fname);
231        }
232        return fname;
233      }
234    
  /**
   * Return the top level archive path as the working directory.
   */
238      public Path getWorkingDirectory() {
239        return new Path(uri.toString());
240      }
241      
  /**
   * Create a har specific auth of the form
   * underlyingfsscheme-host:port.
   * @param underLyingUri the uri of the underlying
   * filesystem
   * @return har specific auth
   */
249      private String getHarAuth(URI underLyingUri) {
250        String auth = underLyingUri.getScheme() + "-";
251        if (underLyingUri.getHost() != null) {
252          auth += underLyingUri.getHost() + ":";
253          if (underLyingUri.getPort() != -1) {
254            auth +=  underLyingUri.getPort();
255          }
256        }
257        else {
258          auth += ":";
259        }
260        return auth;
261      }
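
  /*
   * For example (hypothetical namenode address): an underlying uri of
   * hdfs://namenode:8020 yields the har auth "hdfs-namenode:8020", and a
   * host-less uri such as file:/// yields "file-:". This auth becomes the
   * authority component of the har:// paths returned by this filesystem.
   */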
262      
  /**
   * Returns the uri of this filesystem.
   * The uri is of the form
   * har://underlyingfsscheme-host:port/pathintheunderlyingfs
   */
268      @Override
269      public URI getUri() {
270        return this.uri;
271      }
272      
  /**
   * Return the path inside the har filesystem, i.e. the path
   * relative to the archive root.
   * @param path the fully qualified path in the har filesystem.
   * @return relative path in the filesystem, or null if the path
   *         is not under this archive.
   */
281      private Path getPathInHar(Path path) {
282        Path harPath = new Path(path.toUri().getPath());
283        if (archivePath.compareTo(harPath) == 0)
284          return new Path(Path.SEPARATOR);
285        Path tmp = new Path(harPath.getName());
286        Path parent = harPath.getParent();
287        while (!(parent.compareTo(archivePath) == 0)) {
288          if (parent.toString().equals(Path.SEPARATOR)) {
289            tmp = null;
290            break;
291          }
292          tmp = new Path(parent.getName(), tmp);
293          parent = parent.getParent();
294        }
295        if (tmp != null) 
296          tmp = new Path(Path.SEPARATOR, tmp);
297        return tmp;
298      }
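
  /*
   * For example, with archivePath = /user/alice/foo.har (hypothetical):
   *
   *   getPathInHar(har://hdfs-namenode:8020/user/alice/foo.har/docs/a.txt) -> /docs/a.txt
   *   getPathInHar(har://hdfs-namenode:8020/user/alice/foo.har)            -> /
   *   getPathInHar(har://hdfs-namenode:8020/tmp/other.txt)                 -> null
   */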
299      
  // Rebuild the relative path p (rooted at "/") on top of the
  // given initial path. Parsing and doing string manipulation
  // is error prone, so just use the Path API to do it
  // component by component.
304      private Path makeRelative(String initial, Path p) {
305        String scheme = this.uri.getScheme();
306        String authority = this.uri.getAuthority();
307        Path root = new Path(Path.SEPARATOR);
308        if (root.compareTo(p) == 0)
309          return new Path(scheme, authority, initial);
310        Path retPath = new Path(p.getName());
311        Path parent = p.getParent();
312        for (int i=0; i < p.depth()-1; i++) {
313          retPath = new Path(parent.getName(), retPath);
314          parent = parent.getParent();
315        }
316        return new Path(new Path(scheme, authority, initial),
317          retPath.toString());
318      }
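
  /*
   * makeRelative goes the other way from getPathInHar: it grafts an
   * in-archive path back onto the har uri. With this.uri =
   * har://hdfs-namenode:8020/user/alice/foo.har (hypothetical),
   *
   *   makeRelative("/user/alice/foo.har", new Path("/docs/a.txt"))
   *     -> har://hdfs-namenode:8020/user/alice/foo.har/docs/a.txt
   */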
319      
320      /* this makes a path qualified in the har filesystem
321       * (non-Javadoc)
322       * @see org.apache.hadoop.fs.FilterFileSystem#makeQualified(
323       * org.apache.hadoop.fs.Path)
324       */
325      @Override
326      public Path makeQualified(Path path) {
327        // make sure that we just get the 
328        // path component 
329        Path fsPath = path;
330        if (!path.isAbsolute()) {
331          fsPath = new Path(archivePath, path);
332        }
333    
334        URI tmpURI = fsPath.toUri();
335        //change this to Har uri 
336        return new Path(uri.getScheme(), harAuth, tmpURI.getPath());
337      }
338    
339      /**
340       * Fix offset and length of block locations.
341       * Note that this method modifies the original array.
342       * @param locations block locations of har part file
343       * @param start the start of the desired range in the contained file
344       * @param len the length of the desired range
345       * @param fileOffsetInHar the offset of the desired file in the har part file
346       * @return block locations with fixed offset and length
347       */  
348      static BlockLocation[] fixBlockLocations(BlockLocation[] locations,
349                                              long start,
350                                              long len,
351                                              long fileOffsetInHar) {
352        // offset 1 past last byte of desired range
353        long end = start + len;
354    
355        for (BlockLocation location : locations) {
356          // offset of part block relative to beginning of desired file
357          // (may be negative if file starts in this part block)
358          long harBlockStart = location.getOffset() - fileOffsetInHar;
359          // offset 1 past last byte of har block relative to beginning of
360          // desired file
361          long harBlockEnd = harBlockStart + location.getLength();
362          
363          if (start > harBlockStart) {
364            // desired range starts after beginning of this har block
365            // fix offset to beginning of relevant range (relative to desired file)
366            location.setOffset(start);
367            // fix length to relevant portion of har block
368            location.setLength(location.getLength() - (start - harBlockStart));
369          } else {
370            // desired range includes beginning of this har block
371            location.setOffset(harBlockStart);
372          }
373          
374          if (harBlockEnd > end) {
375            // range ends before end of this har block
376            // fix length to remove irrelevant portion at the end
377            location.setLength(location.getLength() - (harBlockEnd - end));
378          }
379        }
380        
381        return locations;
382      }
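
  /*
   * A worked example with made-up numbers: suppose the archived file starts
   * at fileOffsetInHar = 300 inside the part file, the caller asks for
   * start = 0, len = 500 (so end = 500), and the part file reports one block
   * at offset 0 with length 1024.
   *
   *   harBlockStart = 0 - 300    = -300   (the block begins before the file does)
   *   harBlockEnd   = -300 + 1024 =  724
   *   start(0) > harBlockStart(-300)  -> offset := 0, length := 1024 - 300 = 724
   *   harBlockEnd(724) > end(500)     -> length := 724 - 224 = 500
   *
   * leaving a block location of offset 0, length 500 in the coordinates of
   * the archived file.
   */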
383      
384      /**
385       * Get block locations from the underlying fs and fix their
386       * offsets and lengths.
387       * @param file the input filestatus to get block locations
388       * @param start the start of the desired range in the contained file
389       * @param len the length of the desired range
390       * @return block locations for this segment of file
391       * @throws IOException
392       */
393      @Override
394      public BlockLocation[] getFileBlockLocations(FileStatus file, long start,
395                                                   long len) throws IOException {
396        HarStatus hstatus = getFileHarStatus(file.getPath());
397        Path partPath = new Path(archivePath, hstatus.getPartName());
398        FileStatus partStatus = metadata.getPartFileStatus(partPath);
399    
400        // get all part blocks that overlap with the desired file blocks
401        BlockLocation[] locations = 
402          fs.getFileBlockLocations(partStatus,
403                                   hstatus.getStartIndex() + start, len);
404    
405        return fixBlockLocations(locations, start, len, hstatus.getStartIndex());
406      }
407      
  /**
   * The hash of the path p inside
   * the har filesystem.
   * @param p the path in the har filesystem
   * @return the hash code of the path.
   */
414      public static int getHarHash(Path p) {
415        return (p.toString().hashCode() & 0x7fffffff);
416      }
417      
418      static class Store {
419        public Store() {
420          begin = end = startHash = endHash = 0;
421        }
422        public Store(long begin, long end, int startHash, int endHash) {
423          this.begin = begin;
424          this.end = end;
425          this.startHash = startHash;
426          this.endHash = endHash;
427        }
428        public long begin;
429        public long end;
430        public int startHash;
431        public int endHash;
432      }
433      
  /**
   * Get filestatuses of all the children of a given directory. This just reads
   * through the index file and reads line by line to get all statuses for the
   * children of a directory. It is a brute force way of getting all such
   * filestatuses.
   *
   * @param parent
   *          the parent path directory
   * @param statuses
   *          the list to add the children filestatuses to
   * @param children
   *          the string list of children for this parent
   */
448      private void fileStatusesInIndex(HarStatus parent, List<FileStatus> statuses,
449          List<String> children) throws IOException {
450        String parentString = parent.getName();
451        if (!parentString.endsWith(Path.SEPARATOR)){
452            parentString += Path.SEPARATOR;
453        }
454        Path harPath = new Path(parentString);
455        int harlen = harPath.depth();
456        final Map<String, FileStatus> cache = new TreeMap<String, FileStatus>();
457    
458        for (HarStatus hstatus : metadata.archive.values()) {
459          String child = hstatus.getName();
460          if ((child.startsWith(parentString))) {
461            Path thisPath = new Path(child);
462            if (thisPath.depth() == harlen + 1) {
463              statuses.add(toFileStatus(hstatus, cache));
464            }
465          }
466        }
467      }
468    
469      /**
470       * Combine the status stored in the index and the underlying status. 
471       * @param h status stored in the index
472       * @param cache caching the underlying file statuses
473       * @return the combined file status
474       * @throws IOException
475       */
476      private FileStatus toFileStatus(HarStatus h,
477          Map<String, FileStatus> cache) throws IOException {
478        FileStatus underlying = null;
479        if (cache != null) {
480          underlying = cache.get(h.partName);
481        }
482        if (underlying == null) {
483          final Path p = h.isDir? archivePath: new Path(archivePath, h.partName);
484          underlying = fs.getFileStatus(p);
485          if (cache != null) {
486            cache.put(h.partName, underlying);
487          }
488        }
489    
490        long modTime = 0;
491        int version = metadata.getVersion();
492        if (version < 3) {
493          modTime = underlying.getModificationTime();
494        } else if (version == 3) {
495          modTime = h.getModificationTime();
496        }
497    
498        return new FileStatus(
499            h.isDir()? 0L: h.getLength(),
500            h.isDir(),
501            underlying.getReplication(),
502            underlying.getBlockSize(),
503            modTime,
504            underlying.getAccessTime(),
505            underlying.getPermission(),
506            underlying.getOwner(),
507            underlying.getGroup(),
508            makeRelative(this.uri.getPath(), new Path(h.name)));
509      }
510    
  // A single-line parser for hadoop archive statuses,
  // stored one per line in the index files.
  // The format is of the form
  // filename "dir"/"file" partFileName startIndex length
  // <space separated children>
516      private class HarStatus {
517        boolean isDir;
518        String name;
519        List<String> children;
520        String partName;
521        long startIndex;
522        long length;
523        long modificationTime = 0;
524    
525        public HarStatus(String harString) throws UnsupportedEncodingException {
526          String[] splits = harString.split(" ");
527          this.name = decodeFileName(splits[0]);
      this.isDir = "dir".equals(splits[1]);
      // this is equal to "none" if it's a directory
530          this.partName = splits[2];
531          this.startIndex = Long.parseLong(splits[3]);
532          this.length = Long.parseLong(splits[4]);
533    
534          int version = metadata.getVersion();
535          String[] propSplits = null;
536          // propSplits is used to retrieve the metainformation that Har versions
537          // 1 & 2 missed (modification time, permission, owner group).
538          // These fields are stored in an encoded string placed in different
539          // locations depending on whether it's a file or directory entry.
540          // If it's a directory, the string will be placed at the partName
541          // location (directories have no partName because they don't have data
542          // to be stored). This is done because the number of fields in a
543          // directory entry is unbounded (all children are listed at the end)
544          // If it's a file, the string will be the last field.
545          if (isDir) {
546            if (version == 3){
547              propSplits = decodeString(this.partName).split(" ");
548            }
549            children = new ArrayList<String>();
550            for (int i = 5; i < splits.length; i++) {
551              children.add(decodeFileName(splits[i]));
552            }
553          } else if (version == 3) {
554            propSplits = decodeString(splits[5]).split(" ");
555          }
556    
557          if (propSplits != null && propSplits.length >= 4) {
558            modificationTime = Long.parseLong(propSplits[0]);
559            // the fields below are stored in the file but are currently not used
560            // by HarFileSystem
561            // permission = new FsPermission(Short.parseShort(propSplits[1]));
562            // owner = decodeString(propSplits[2]);
563            // group = decodeString(propSplits[3]);
564          }
565        }
566        public boolean isDir() {
567          return isDir;
568        }
569        
570        public String getName() {
571          return name;
572        }
573        public String getPartName() {
574          return partName;
575        }
576        public long getStartIndex() {
577          return startIndex;
578        }
579        public long getLength() {
580          return length;
581        }
582        public long getModificationTime() {
583          return modificationTime;
584        }
585      }
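
  /*
   * For illustration, a version 3 _index entry for a file might look roughly
   * like this (values are hypothetical; names are URL-encoded on disk):
   *
   *   /docs/a.txt file part-0 0 1024 <encoded "modtime permission owner group">
   *
   * while a directory entry carries the encoded properties in the partName
   * slot and lists its children at the end:
   *
   *   /docs dir <encoded props> 0 0 a.txt b.txt
   */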
586      
  /**
   * Return the filestatus of files in the har archive.
   * The permissions returned are those of the underlying
   * part files or the archive directory; per-file permissions
   * are not preserved by this filesystem.
   * @param f the path in har filesystem
   * @return filestatus.
   * @throws IOException
   */
596      @Override
597      public FileStatus getFileStatus(Path f) throws IOException {
598        HarStatus hstatus = getFileHarStatus(f);
599        return toFileStatus(hstatus, null);
600      }
601    
  private HarStatus getFileHarStatus(Path f) throws IOException {
    // translate the path into the archive and look it up in the index.
605        Path p = makeQualified(f);
606        Path harPath = getPathInHar(p);
607        if (harPath == null) {
608          throw new IOException("Invalid file name: " + f + " in " + uri);
609        }
610        HarStatus hstatus = metadata.archive.get(harPath);
611        if (hstatus == null) {
612          throw new FileNotFoundException("File: " +  f + " does not exist in " + uri);
613        }
614        return hstatus;
615      }
616    
617      /**
618       * @return null since no checksum algorithm is implemented.
619       */
620      public FileChecksum getFileChecksum(Path f) {
621        return null;
622      }
623    
624      /**
625       * Returns a har input stream which fakes end of 
626       * file. It reads the index files to get the part 
627       * file name and the size and start of the file.
628       */
629      @Override
630      public FSDataInputStream open(Path f, int bufferSize) throws IOException {
631        // get the fs DataInputStream for the underlying file
632        HarStatus hstatus = getFileHarStatus(f);
633        // we got it.. woo hooo!!! 
634        if (hstatus.isDir()) {
635          throw new FileNotFoundException(f + " : not a file in " +
636                    archivePath);
637        }
638        return new HarFSDataInputStream(fs, new Path(archivePath, 
639            hstatus.getPartName()),
640            hstatus.getStartIndex(), hstatus.getLength(), bufferSize);
641      }
642     
  /**
   * Not implemented.
   */
  @Override
  public FSDataOutputStream create(Path f,
644          FsPermission permission,
645          boolean overwrite,
646          int bufferSize,
647          short replication,
648          long blockSize,
649          Progressable progress) throws IOException {
650        throw new IOException("Har: create not allowed.");
651      }
652      
653      @Override
654      public void close() throws IOException {
655        if (fs != null) {
656          try {
657            fs.close();
658          } catch(IOException ie) {
659            //this might already be closed
660            // ignore
661          }
662        }
663      }
664      
665      /**
666       * Not implemented.
667       */
668      @Override
669      public boolean setReplication(Path src, short replication) throws IOException{
670        throw new IOException("Har: setreplication not allowed");
671      }
672      
673      /**
674       * Not implemented.
675       */
676      @Override
677      public boolean delete(Path f, boolean recursive) throws IOException { 
678        throw new IOException("Har: delete not allowed");
679      }
680      
681      /**
682       * liststatus returns the children of a directory 
683       * after looking up the index files.
684       */
685      @Override
686      public FileStatus[] listStatus(Path f) throws IOException {
    // look the path up in the archive index and
    // create fake filestatuses to return
    // to the client
691        List<FileStatus> statuses = new ArrayList<FileStatus>();
692        Path tmpPath = makeQualified(f);
693        Path harPath = getPathInHar(tmpPath);
694        HarStatus hstatus = metadata.archive.get(harPath);
695        if (hstatus == null) {
696          throw new FileNotFoundException("File " + f + " not found in " + archivePath);
697        }
698        if (hstatus.isDir()) {
699          fileStatusesInIndex(hstatus, statuses, hstatus.children);
700        } else {
701          statuses.add(toFileStatus(hstatus, null));
702        }
703        
704        return statuses.toArray(new FileStatus[statuses.size()]);
705      }
706      
707      /**
708       * return the top level archive path.
709       */
710      public Path getHomeDirectory() {
711        return new Path(uri.toString());
712      }
713      
714      public void setWorkingDirectory(Path newDir) {
715        //does nothing.
716      }
717      
718      /**
719       * not implemented.
720       */
721      public boolean mkdirs(Path f, FsPermission permission) throws IOException {
722        throw new IOException("Har: mkdirs not allowed");
723      }
724      
725      /**
726       * not implemented.
727       */
728      public void copyFromLocalFile(boolean delSrc, Path src, Path dst) throws 
729            IOException {
730        throw new IOException("Har: copyfromlocalfile not allowed");
731      }
732      
733      /**
734       * copies the file in the har filesystem to a local file.
735       */
736      public void copyToLocalFile(boolean delSrc, Path src, Path dst) 
737        throws IOException {
738        FileUtil.copy(this, src, getLocal(getConf()), dst, false, getConf());
739      }
740      
741      /**
742       * not implemented.
743       */
744      public Path startLocalOutput(Path fsOutputFile, Path tmpLocalFile) 
745        throws IOException {
746        throw new IOException("Har: startLocalOutput not allowed");
747      }
748      
749      /**
750       * not implemented.
751       */
752      public void completeLocalOutput(Path fsOutputFile, Path tmpLocalFile) 
753        throws IOException {
754        throw new IOException("Har: completeLocalOutput not allowed");
755      }
756      
757      /**
758       * not implemented.
759       */
760      public void setOwner(Path p, String username, String groupname)
761        throws IOException {
762        throw new IOException("Har: setowner not allowed");
763      }
764    
765      /**
766       * Not implemented.
767       */
768      public void setPermission(Path p, FsPermission permisssion) 
769        throws IOException {
770        throw new IOException("Har: setPermission not allowed");
771      }
772      
773      /**
774       * Hadoop archives input stream. This input stream fakes EOF 
775       * since archive files are part of bigger part files.
776       */
777      private static class HarFSDataInputStream extends FSDataInputStream {
778        /**
779         * Create an input stream that fakes all the reads/positions/seeking.
780         */
781        private static class HarFsInputStream extends FSInputStream {
782          private long position, start, end;
783          //The underlying data input stream that the
784          // underlying filesystem will return.
785          private FSDataInputStream underLyingStream;
786          //one byte buffer
787          private byte[] oneBytebuff = new byte[1];
788          HarFsInputStream(FileSystem fs, Path path, long start,
789              long length, int bufferSize) throws IOException {
790            underLyingStream = fs.open(path, bufferSize);
791            underLyingStream.seek(start);
792            // the start of this file in the part file
793            this.start = start;
794            // the position pointer in the part file
795            this.position = start;
796            // the end pointer in the part file
797            this.end = start + length;
798          }
799          
800          public synchronized int available() throws IOException {
801            long remaining = end - underLyingStream.getPos();
802            if (remaining > (long)Integer.MAX_VALUE) {
803              return Integer.MAX_VALUE;
804            }
805            return (int) remaining;
806          }
807          
808          public synchronized  void close() throws IOException {
809            underLyingStream.close();
810            super.close();
811          }
812          
813          //not implemented
814          @Override
815          public void mark(int readLimit) {
816            // do nothing 
817          }
818          
819          /**
820           * reset is not implemented
821           */
822          public void reset() throws IOException {
823            throw new IOException("reset not implemented.");
824          }
825          
826          public synchronized int read() throws IOException {
827            int ret = read(oneBytebuff, 0, 1);
828            return (ret <= 0) ? -1: (oneBytebuff[0] & 0xff);
829          }
830          
      public synchronized int read(byte[] b) throws IOException {
        // the three-argument read below already advances position
        return read(b, 0, b.length);
      }
838          
      /**
       * Read up to len bytes, but never beyond the end of the
       * archived file's region within the part file.
       */
      public synchronized int read(byte[] b, int offset, int len) 
        throws IOException {
        int newlen = len;
        int ret = -1;
        if (position + len > end) {
          newlen = (int) (end - position);
        }
        // end case
        if (newlen == 0) 
          return ret;
        ret = underLyingStream.read(b, offset, newlen);
        if (ret > 0) {
          // advance only by the number of bytes actually read
          position += ret;
        }
        return ret;
      }
856          
857          public synchronized long skip(long n) throws IOException {
858            long tmpN = n;
859            if (tmpN > 0) {
860              if (position + tmpN > end) {
861                tmpN = end - position;
862              }
863              underLyingStream.seek(tmpN + position);
864              position += tmpN;
865              return tmpN;
866            }
867            return (tmpN < 0)? -1 : 0;
868          }
869          
870          public synchronized long getPos() throws IOException {
871            return (position - start);
872          }
873          
874          public synchronized void seek(long pos) throws IOException {
875            if (pos < 0 || (start + pos > end)) {
876              throw new IOException("Failed to seek: EOF");
877            }
878            position = start + pos;
879            underLyingStream.seek(position);
880          }
881    
882          public boolean seekToNewSource(long targetPos) throws IOException {
883            //do not need to implement this
884            // hdfs in itself does seektonewsource 
885            // while reading.
886            return false;
887          }
888          
889          /**
890           * implementing position readable. 
891           */
892          public int read(long pos, byte[] b, int offset, int length) 
893          throws IOException {
894            int nlength = length;
895            if (start + nlength + pos > end) {
896              nlength = (int) (end - (start + pos));
897            }
898            return underLyingStream.read(pos + start , b, offset, nlength);
899          }
900          
901          /**
902           * position readable again.
903           */
904          public void readFully(long pos, byte[] b, int offset, int length) 
905          throws IOException {
906            if (start + length + pos > end) {
907              throw new IOException("Not enough bytes to read.");
908            }
909            underLyingStream.readFully(pos + start, b, offset, length);
910          }
911          
912          public void readFully(long pos, byte[] b) throws IOException {
913              readFully(pos, b, 0, b.length);
914          }
915          
916        }
917      
918        /**
     * Constructor for har input stream with an explicit buffer size.
920         * @param fs the underlying filesystem
921         * @param p The path in the underlying filesystem
922         * @param start the start position in the part file
923         * @param length the length of valid data in the part file
924         * @param bufsize the buffer size
925         * @throws IOException
926         */
927        public HarFSDataInputStream(FileSystem fs, Path  p, long start, 
928            long length, int bufsize) throws IOException {
929            super(new HarFsInputStream(fs, p, start, length, bufsize));
930        }
931    
932        /**
933         * constructor for har input stream.
934         * @param fs the underlying filesystem
935         * @param p the path in the underlying file system
936         * @param start the start position in the part file
937         * @param length the length of valid data in the part file.
938         * @throws IOException
939         */
940        public HarFSDataInputStream(FileSystem fs, Path  p, long start, long length)
941          throws IOException {
942            super(new HarFsInputStream(fs, p, start, length, 0));
943        }
944      }
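
  /*
   * A small sketch of the coordinate translation done by HarFsInputStream
   * (numbers are hypothetical): for an archived file stored at
   * start = 4096 with length = 100 in part-0,
   *
   *   open(...)  seeks the underlying stream to 4096
   *   getPos()   reports 0 .. 100 (positions relative to the archived file)
   *   seek(10)   moves the underlying stream to 4096 + 10
   *   read(...)  never returns bytes at or beyond 4096 + 100, faking EOF
   */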
945    
946      private class HarMetaData {
947        private FileSystem fs;
948        private int version;
949        // the masterIndex of the archive
950        private Path masterIndexPath;
951        // the index file 
952        private Path archiveIndexPath;
953    
954        private long masterIndexTimestamp;
955        private long archiveIndexTimestamp;
956    
957        List<Store> stores = new ArrayList<Store>();
958        Map<Path, HarStatus> archive = new HashMap<Path, HarStatus>();
959        private Map<Path, FileStatus> partFileStatuses = new HashMap<Path, FileStatus>();
960    
961        public HarMetaData(FileSystem fs, Path masterIndexPath, Path archiveIndexPath) {
962          this.fs = fs;
963          this.masterIndexPath = masterIndexPath;
964          this.archiveIndexPath = archiveIndexPath;
965        }
966    
967        public FileStatus getPartFileStatus(Path partPath) throws IOException {
968          FileStatus status;
969          status = partFileStatuses.get(partPath);
970          if (status == null) {
971            status = fs.getFileStatus(partPath);
972            partFileStatuses.put(partPath, status);
973          }
974          return status;
975        }
976    
977        public long getMasterIndexTimestamp() {
978          return masterIndexTimestamp;
979        }
980    
981        public long getArchiveIndexTimestamp() {
982          return archiveIndexTimestamp;
983        }
984    
985        private int getVersion() {
986          return version;
987        }
988    
989        private void parseMetaData() throws IOException {
990          FSDataInputStream in = fs.open(masterIndexPath);
991          FileStatus masterStat = fs.getFileStatus(masterIndexPath);
992          masterIndexTimestamp = masterStat.getModificationTime();
993          LineReader lin = new LineReader(in, getConf());
994          Text line = new Text();
995          long read = lin.readLine(line);
996    
      // the first line contains the version of the index file
998          String versionLine = line.toString();
999          String[] arr = versionLine.split(" ");
1000          version = Integer.parseInt(arr[0]);
      // reject index files written by a newer archive version
1002          if (this.version > HarFileSystem.VERSION) {
1003            throw new IOException("Invalid version " + 
1004                this.version + " expected " + HarFileSystem.VERSION);
1005          }
1006    
1007          // each line contains a hashcode range and the index file name
1008          String[] readStr = null;
1009          while(read < masterStat.getLen()) {
1010            int b = lin.readLine(line);
1011            read += b;
1012            readStr = line.toString().split(" ");
1013            int startHash = Integer.parseInt(readStr[0]);
1014            int endHash  = Integer.parseInt(readStr[1]);
1015            stores.add(new Store(Long.parseLong(readStr[2]), 
1016                Long.parseLong(readStr[3]), startHash,
1017                endHash));
1018            line.clear();
1019          }
1020          try {
1021            // close the master index
1022            lin.close();
1023          } catch(IOException io){
1024            // do nothing just a read.
1025          }
1026    
1027          FSDataInputStream aIn = fs.open(archiveIndexPath);
1028          FileStatus archiveStat = fs.getFileStatus(archiveIndexPath);
1029          archiveIndexTimestamp = archiveStat.getModificationTime();
1030          LineReader aLin;
1031    
1032          // now start reading the real index file
1033          for (Store s: stores) {
1034            read = 0;
1035            aIn.seek(s.begin);
1036            aLin = new LineReader(aIn, getConf());
1037            while (read + s.begin < s.end) {
1038              int tmp = aLin.readLine(line);
1039              read += tmp;
1040              String lineFeed = line.toString();
1041              String[] parsed = lineFeed.split(" ");
1042              parsed[0] = decodeFileName(parsed[0]);
1043              archive.put(new Path(parsed[0]), new HarStatus(lineFeed));
1044              line.clear();
1045            }
1046          }
1047          try {
1048            // close the archive index
1049            aIn.close();
1050          } catch(IOException io) {
1051            // do nothing just a read.
1052          }
1053        }
1054      }
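
  /*
   * For reference, the _masterindex parsed above is a small text file: the
   * first line holds the archive version, and every following line holds
   * "startHash endHash startOffset endOffset", where the offsets delimit the
   * byte range of _index containing entries whose path hashes fall in
   * [startHash, endHash]. Hypothetical contents:
   *
   *   3
   *   0 536870911 0 8192
   *   536870912 1073741823 8192 16103
   */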
1055      
1056      /*
1057       * testing purposes only:
1058       */
1059      HarMetaData getMetadata() {
1060        return metadata;
1061      }
1062    }