/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.fs;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.HashMap;
import java.util.concurrent.ConcurrentHashMap;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.LineReader;
import org.apache.hadoop.util.Progressable;

/**
 * This is an implementation of the Hadoop Archive (HAR)
 * filesystem. An archive stores its data in part-* files
 * and its metadata in two index files, _masterindex and
 * _index, which record where each archived file lives inside
 * the part files. The _index file is sorted by the hash code
 * of the paths it contains, and the _masterindex adds a level
 * of indirection: it maps ranges of hash codes to positions
 * in the _index file, which makes lookups faster.
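 * <p>
 * For illustration only (all paths and addresses here are hypothetical):
 * an archive created at /user/alice/foo.har would contain files such as
 * /user/alice/foo.har/_masterindex, /user/alice/foo.har/_index and
 * /user/alice/foo.har/part-0, and a client could read an archived file
 * through the normal FileSystem API:
 * <pre>{@code
 * Configuration conf = new Configuration();
 * FileSystem harFs = FileSystem.get(
 *     URI.create("har://hdfs-namenode:8020/user/alice/foo.har"), conf);
 * FSDataInputStream in = harFs.open(
 *     new Path("/user/alice/foo.har/dir/file.txt"));
 * }</pre>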
 */

public class HarFileSystem extends FilterFileSystem {

  private static final Log LOG = LogFactory.getLog(HarFileSystem.class);

  public static final int VERSION = 3;

  private static final Map<URI, HarMetaData> harMetaCache =
      new ConcurrentHashMap<URI, HarMetaData>();

  // uri representation of this Har filesystem
  private URI uri;
  // the top level path of the archive
  // in the underlying file system
  private Path archivePath;
  // the har auth
  private String harAuth;

  // pointer into the static metadata cache
  private HarMetaData metadata;

  /**
   * Public no-argument constructor. The underlying filesystem
   * is set up later, in {@link #initialize(URI, Configuration)}.
   */
  public HarFileSystem() {
  }

  /**
   * Constructor to create a HarFileSystem with an
   * underlying filesystem.
   * @param fs the underlying filesystem
   */
  public HarFileSystem(FileSystem fs) {
    super(fs);
  }

  /**
   * Initialize a Har filesystem per har archive. The
   * archive home directory is the top level directory
   * in the filesystem that contains the HAR archive.
   * Be careful with this method: you do not want to keep
   * creating new filesystem instances per call to
   * path.getFileSystem().
   * The URI of a Har filesystem is
   * har://underlyingfsscheme-host:port/archivepath
   * or
   * har:///archivepath, in which case the configured default
   * filesystem is used as the underlying filesystem.
   */
  public void initialize(URI name, Configuration conf) throws IOException {
    // decode the name
    URI underLyingURI = decodeHarURI(name, conf);
    // we got the right har path - now check if this is
    // truly a har filesystem
    Path harPath = archivePath(
      new Path(name.getScheme(), name.getAuthority(), name.getPath()));
    if (harPath == null) {
      throw new IOException("Invalid path for the Har Filesystem. " +
                           name.toString());
    }
    if (fs == null) {
      fs = FileSystem.get(underLyingURI, conf);
    }
    uri = harPath.toUri();
    archivePath = new Path(uri.getPath());
    harAuth = getHarAuth(underLyingURI);
    //check for the underlying fs containing
    // the index file
    Path masterIndexPath = new Path(archivePath, "_masterindex");
    Path archiveIndexPath = new Path(archivePath, "_index");
    if (!fs.exists(masterIndexPath) || !fs.exists(archiveIndexPath)) {
      throw new IOException("Invalid path for the Har Filesystem. " +
          "No index file in " + harPath);
    }

    metadata = harMetaCache.get(uri);
    if (metadata != null) {
      FileStatus mStat = fs.getFileStatus(masterIndexPath);
      FileStatus aStat = fs.getFileStatus(archiveIndexPath);
      if (mStat.getModificationTime() != metadata.getMasterIndexTimestamp() ||
          aStat.getModificationTime() != metadata.getArchiveIndexTimestamp()) {
        // the archive has been overwritten since we last read it
        // remove the entry from the meta data cache
        metadata = null;
        harMetaCache.remove(uri);
      }
    }
    if (metadata == null) {
      metadata = new HarMetaData(fs, masterIndexPath, archiveIndexPath);
      metadata.parseMetaData();
      harMetaCache.put(uri, metadata);
    }
  }

  // get the version of the filesystem from the masterindex file
  // the version is currently not useful since it's the first version
  // of archives
  public int getHarVersion() throws IOException {
    if (metadata != null) {
      return metadata.getVersion();
    } else {
      throw new IOException("Invalid meta data for the Har Filesystem");
    }
  }

  /*
   * Find the ancestor of p that is the archive path. The last
   * (deepest) path component that ends with .har is the path
   * that will be returned.
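   * For example (hypothetical path): for
   * /user/alice/foo.har/dir/file this returns /user/alice/foo.har;
   * for a path with no ".har" component it returns null.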
   */
  private Path archivePath(Path p) {
    Path retPath = null;
    Path tmp = p;
    for (int i=0; i< p.depth(); i++) {
      if (tmp.toString().endsWith(".har")) {
        retPath = tmp;
        break;
      }
      tmp = tmp.getParent();
    }
    return retPath;
  }

  /**
   * Decode the raw URI to get the URI of the underlying filesystem.
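   * For example (hypothetical authority):
   * har://hdfs-namenode:8020/user/alice/foo.har decodes to
   * hdfs://namenode:8020/user/alice/foo.har.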
   * @param rawURI raw Har URI
   * @param conf the configuration, used to look up the default filesystem
   * @return filtered URI of the underlying filesystem
   */
  private URI decodeHarURI(URI rawURI, Configuration conf) throws IOException {
    String tmpAuth = rawURI.getAuthority();
    //we are using the default file
    //system in the config
    //so create an underlying uri and
    //return it
    if (tmpAuth == null) {
      //create a path
      return FileSystem.getDefaultUri(conf);
    }
    String host = rawURI.getHost();
    if (host == null) {
      throw new IOException("URI: " + rawURI
          + " is an invalid Har URI since host==null."
          + "  Expecting har://<scheme>-<host>/<path>.");
    }
    int i = host.indexOf('-');
    if (i < 0) {
      throw new IOException("URI: " + rawURI
          + " is an invalid Har URI since '-' not found."
          + "  Expecting har://<scheme>-<host>/<path>.");
    }
    final String underLyingScheme = host.substring(0, i);
    i++;
    final String underLyingHost = i == host.length()? null: host.substring(i);
    int underLyingPort = rawURI.getPort();
    String auth = (underLyingHost == null && underLyingPort == -1)?
                  null:(underLyingHost+":"+underLyingPort);
    URI tmp = null;
    if (rawURI.getQuery() != null) {
      // query component not allowed
      throw new IOException("query component in Path not supported  " + rawURI);
    }
    try {
      tmp = new URI(underLyingScheme, auth, rawURI.getPath(),
            rawURI.getQuery(), rawURI.getFragment());
    } catch (URISyntaxException e) {
      // should not happen; the components come from a URI that already
      // parsed successfully
    }
    return tmp;
  }

  private static String decodeString(String str)
    throws UnsupportedEncodingException {
    return URLDecoder.decode(str, "UTF-8");
  }

  private String decodeFileName(String fname)
    throws UnsupportedEncodingException {
    int version = metadata.getVersion();
    if (version == 2 || version == 3){
      return decodeString(fname);
    }
    return fname;
  }

  /**
   * Return the top level archive path.
   */
  public Path getWorkingDirectory() {
    return new Path(uri.toString());
  }

  /**
   * Create a har specific auth of the form
   * underlyingscheme-host:port.
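   * For example (hypothetical hosts): the underlying URI
   * hdfs://namenode:8020 yields the auth "hdfs-namenode:8020",
   * and file:/// yields "file-:".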
   * @param underLyingUri the URI of the underlying
   * filesystem
   * @return har specific auth
   */
  private String getHarAuth(URI underLyingUri) {
    String auth = underLyingUri.getScheme() + "-";
    if (underLyingUri.getHost() != null) {
      auth += underLyingUri.getHost() + ":";
      if (underLyingUri.getPort() != -1) {
        auth +=  underLyingUri.getPort();
      }
    } else {
      auth += ":";
    }
    return auth;
  }

  /**
   * Returns the uri of this filesystem.
   * The uri is of the form
   * har://underlyingfsscheme-host:port/pathintheunderlyingfs
   */
  @Override
  public URI getUri() {
    return this.uri;
  }

  /**
   * This method returns the path
   * inside the har filesystem,
   * i.e. the path relative to the root
   * of the har filesystem.
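   * For example (hypothetical paths): with archivePath
   * /user/alice/foo.har, the path
   * har://hdfs-namenode:8020/user/alice/foo.har/dir/file maps to
   * /dir/file, and the archive path itself maps to /.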
   * @param path the fully qualified path in the har filesystem.
   * @return relative path in the filesystem.
   */
  private Path getPathInHar(Path path) {
    Path harPath = new Path(path.toUri().getPath());
    if (archivePath.compareTo(harPath) == 0)
      return new Path(Path.SEPARATOR);
    Path tmp = new Path(harPath.getName());
    Path parent = harPath.getParent();
    while (!(parent.compareTo(archivePath) == 0)) {
      if (parent.toString().equals(Path.SEPARATOR)) {
        tmp = null;
        break;
      }
      tmp = new Path(parent.getName(), tmp);
      parent = parent.getParent();
    }
    if (tmp != null)
      tmp = new Path(Path.SEPARATOR, tmp);
    return tmp;
  }

  // Make p relative (strip the leading "/") and resolve it against the
  // given initial path, qualified with this filesystem's scheme and
  // authority. Parsing and doing string manipulation is not good, so
  // just use the Path API to do it.
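  // For example (hypothetical values): with this.uri equal to
  // har://hdfs-namenode:8020/user/alice/foo.har,
  // makeRelative("/user/alice/foo.har", new Path("/dir/file")) returns
  // har://hdfs-namenode:8020/user/alice/foo.har/dir/file.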
  private Path makeRelative(String initial, Path p) {
    String scheme = this.uri.getScheme();
    String authority = this.uri.getAuthority();
    Path root = new Path(Path.SEPARATOR);
    if (root.compareTo(p) == 0)
      return new Path(scheme, authority, initial);
    Path retPath = new Path(p.getName());
    Path parent = p.getParent();
    for (int i=0; i < p.depth()-1; i++) {
      retPath = new Path(parent.getName(), retPath);
      parent = parent.getParent();
    }
    return new Path(new Path(scheme, authority, initial),
      retPath.toString());
  }

  /* this makes a path qualified in the har filesystem
   * (non-Javadoc)
   * @see org.apache.hadoop.fs.FilterFileSystem#makeQualified(
   * org.apache.hadoop.fs.Path)
   */
  @Override
  public Path makeQualified(Path path) {
    // make sure that we just get the
    // path component
    Path fsPath = path;
    if (!path.isAbsolute()) {
      fsPath = new Path(archivePath, path);
    }

    URI tmpURI = fsPath.toUri();
    //change this to Har uri
    return new Path(uri.getScheme(), harAuth, tmpURI.getPath());
  }

  /**
   * Fix offset and length of block locations.
   * Note that this method modifies the original array.
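   * <p>
   * A worked example with hypothetical numbers: for a part-file block at
   * offset 1000 with length 500, and fileOffsetInHar 900, start 0, len 300,
   * the block covers bytes [100, 600) of the archived file, so its location
   * is rewritten to offset 100 and length 200 (the overlap with [0, 300)).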
   * @param locations block locations of har part file
   * @param start the start of the desired range in the contained file
   * @param len the length of the desired range
   * @param fileOffsetInHar the offset of the desired file in the har part file
   * @return block locations with fixed offset and length
   */
  static BlockLocation[] fixBlockLocations(BlockLocation[] locations,
                                          long start,
                                          long len,
                                          long fileOffsetInHar) {
    // offset 1 past last byte of desired range
    long end = start + len;

    for (BlockLocation location : locations) {
      // offset of part block relative to beginning of desired file
      // (may be negative if file starts in this part block)
      long harBlockStart = location.getOffset() - fileOffsetInHar;
      // offset 1 past last byte of har block relative to beginning of
      // desired file
      long harBlockEnd = harBlockStart + location.getLength();

      if (start > harBlockStart) {
        // desired range starts after beginning of this har block
        // fix offset to beginning of relevant range (relative to desired file)
        location.setOffset(start);
        // fix length to relevant portion of har block
        location.setLength(location.getLength() - (start - harBlockStart));
      } else {
        // desired range includes beginning of this har block
        location.setOffset(harBlockStart);
      }

      if (harBlockEnd > end) {
        // range ends before end of this har block
        // fix length to remove irrelevant portion at the end
        location.setLength(location.getLength() - (harBlockEnd - end));
      }
    }

    return locations;
  }

  /**
   * Get block locations from the underlying fs and fix their
   * offsets and lengths.
   * @param file the input filestatus to get block locations
   * @param start the start of the desired range in the contained file
   * @param len the length of the desired range
   * @return block locations for this segment of file
   * @throws IOException
   */
  @Override
  public BlockLocation[] getFileBlockLocations(FileStatus file, long start,
                                               long len) throws IOException {
    HarStatus hstatus = getFileHarStatus(file.getPath());
    Path partPath = new Path(archivePath, hstatus.getPartName());
    FileStatus partStatus = metadata.getPartFileStatus(partPath);

    // get all part blocks that overlap with the desired file blocks
    BlockLocation[] locations =
      fs.getFileBlockLocations(partStatus,
                               hstatus.getStartIndex() + start, len);

    return fixBlockLocations(locations, start, len, hstatus.getStartIndex());
  }

  /**
   * The hash of the path p inside
   * the har filesystem.
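   * For example, getHarHash(new Path("/dir/file")) is simply
   * "/dir/file".hashCode() with the sign bit masked off, so the
   * result is always non-negative.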
   * @param p the path in the harfilesystem
   * @return the hash code of the path.
   */
  public static int getHarHash(Path p) {
    return (p.toString().hashCode() & 0x7fffffff);
  }

  static class Store {
    public Store() {
      begin = end = startHash = endHash = 0;
    }
    public Store(long begin, long end, int startHash, int endHash) {
      this.begin = begin;
      this.end = end;
      this.startHash = startHash;
      this.endHash = endHash;
    }
    public long begin;
    public long end;
    public int startHash;
    public int endHash;
  }

  /**
   * Get filestatuses of all the children of a given directory. This just reads
   * through the index file, line by line, to get all statuses for the children
   * of a directory. It's a brute force way of getting all such filestatuses.
   *
   * @param parent
   *          the parent path directory
   * @param statuses
   *          the list to add the children filestatuses to
   * @param children
   *          the string list of children for this parent
   */
  private void fileStatusesInIndex(HarStatus parent, List<FileStatus> statuses,
      List<String> children) throws IOException {
    String parentString = parent.getName();
    if (!parentString.endsWith(Path.SEPARATOR)){
        parentString += Path.SEPARATOR;
    }
    Path harPath = new Path(parentString);
    int harlen = harPath.depth();
    final Map<String, FileStatus> cache = new TreeMap<String, FileStatus>();

    for (HarStatus hstatus : metadata.archive.values()) {
      String child = hstatus.getName();
      if ((child.startsWith(parentString))) {
        Path thisPath = new Path(child);
        if (thisPath.depth() == harlen + 1) {
          statuses.add(toFileStatus(hstatus, cache));
        }
      }
    }
  }

  /**
   * Combine the status stored in the index and the underlying status.
   * @param h status stored in the index
   * @param cache caching the underlying file statuses
   * @return the combined file status
   * @throws IOException
   */
  private FileStatus toFileStatus(HarStatus h,
      Map<String, FileStatus> cache) throws IOException {
    FileStatus underlying = null;
    if (cache != null) {
      underlying = cache.get(h.partName);
    }
    if (underlying == null) {
      final Path p = h.isDir? archivePath: new Path(archivePath, h.partName);
      underlying = fs.getFileStatus(p);
      if (cache != null) {
        cache.put(h.partName, underlying);
      }
    }

    long modTime = 0;
    int version = metadata.getVersion();
    if (version < 3) {
      modTime = underlying.getModificationTime();
    } else if (version == 3) {
      modTime = h.getModificationTime();
    }

    return new FileStatus(
        h.isDir()? 0L: h.getLength(),
        h.isDir(),
        underlying.getReplication(),
        underlying.getBlockSize(),
        modTime,
        underlying.getAccessTime(),
        underlying.getPermission(),
        underlying.getOwner(),
        underlying.getGroup(),
        makeRelative(this.uri.getPath(), new Path(h.name)));
  }

  // a single line parser for hadoop archives status
  // stored in a single line in the index files
  // the format is of the form
  // filename "dir"/"file" partFileName startIndex length
  // <space separated children>
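  // For example (hypothetical entries), a version 1/2 index line could look
  // like "/dir dir none 0 0 childA childB" for a directory and
  // "/dir/file file part-0 0 1024" for a file; version 3 appends a
  // URL-encoded "modtime permission owner group" field (for directories it
  // is stored in the partName slot, since directories carry no data).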
  private class HarStatus {
    boolean isDir;
    String name;
    List<String> children;
    String partName;
    long startIndex;
    long length;
    long modificationTime = 0;

    public HarStatus(String harString) throws UnsupportedEncodingException {
      String[] splits = harString.split(" ");
      this.name = decodeFileName(splits[0]);
      this.isDir = "dir".equals(splits[1]);
      // this is equal to "none" if it's a directory
      this.partName = splits[2];
      this.startIndex = Long.parseLong(splits[3]);
      this.length = Long.parseLong(splits[4]);

      int version = metadata.getVersion();
      String[] propSplits = null;
      // propSplits is used to retrieve the metainformation that Har versions
      // 1 & 2 missed (modification time, permission, owner, group).
      // These fields are stored in an encoded string placed in different
      // locations depending on whether it's a file or directory entry.
      // If it's a directory, the string will be placed at the partName
      // location (directories have no partName because they don't have data
      // to be stored). This is done because the number of fields in a
      // directory entry is unbounded (all children are listed at the end).
      // If it's a file, the string will be the last field.
      if (isDir) {
        if (version == 3){
          propSplits = decodeString(this.partName).split(" ");
        }
        children = new ArrayList<String>();
        for (int i = 5; i < splits.length; i++) {
          children.add(decodeFileName(splits[i]));
        }
      } else if (version == 3) {
        propSplits = decodeString(splits[5]).split(" ");
      }

      if (propSplits != null && propSplits.length >= 4) {
        modificationTime = Long.parseLong(propSplits[0]);
        // the fields below are stored in the file but are currently not used
        // by HarFileSystem
        // permission = new FsPermission(Short.parseShort(propSplits[1]));
        // owner = decodeString(propSplits[2]);
        // group = decodeString(propSplits[3]);
      }
    }
    public boolean isDir() {
      return isDir;
    }

    public String getName() {
      return name;
    }
    public String getPartName() {
      return partName;
    }
    public long getStartIndex() {
      return startIndex;
    }
    public long getLength() {
      return length;
    }
    public long getModificationTime() {
      return modificationTime;
    }
  }

  /**
   * Return the filestatus of files in the har archive.
   * The permissions returned are those of the archive
   * index files; the original permissions are not persisted
   * while creating a hadoop archive.
   * @param f the path in har filesystem
   * @return filestatus.
   * @throws IOException
   */
  @Override
  public FileStatus getFileStatus(Path f) throws IOException {
    HarStatus hstatus = getFileHarStatus(f);
    return toFileStatus(hstatus, null);
  }

  private HarStatus getFileHarStatus(Path f) throws IOException {
    // get the fs DataInputStream for the underlying file
    // look up the index.
    Path p = makeQualified(f);
    Path harPath = getPathInHar(p);
    if (harPath == null) {
      throw new IOException("Invalid file name: " + f + " in " + uri);
    }
    HarStatus hstatus = metadata.archive.get(harPath);
    if (hstatus == null) {
      throw new FileNotFoundException("File: " +  f + " does not exist in " + uri);
    }
    return hstatus;
  }

  /**
   * @return null since no checksum algorithm is implemented.
   */
  public FileChecksum getFileChecksum(Path f) {
    return null;
  }

  /**
   * Returns a har input stream which fakes end of
   * file. It reads the index files to get the part
   * file name and the size and start of the file.
   */
  @Override
  public FSDataInputStream open(Path f, int bufferSize) throws IOException {
    // get the fs DataInputStream for the underlying file
    HarStatus hstatus = getFileHarStatus(f);
    // we got it.. woo hooo!!!
    if (hstatus.isDir()) {
      throw new FileNotFoundException(f + " : not a file in " +
                archivePath);
    }
    return new HarFSDataInputStream(fs, new Path(archivePath,
        hstatus.getPartName()),
        hstatus.getStartIndex(), hstatus.getLength(), bufferSize);
  }

  public FSDataOutputStream create(Path f,
      FsPermission permission,
      boolean overwrite,
      int bufferSize,
      short replication,
      long blockSize,
      Progressable progress) throws IOException {
    throw new IOException("Har: create not allowed.");
  }

  @Override
  public void close() throws IOException {
    if (fs != null) {
      try {
        fs.close();
      } catch(IOException ie) {
        //this might already be closed
        // ignore
      }
    }
  }

  /**
   * Not implemented.
   */
  @Override
  public boolean setReplication(Path src, short replication) throws IOException{
    throw new IOException("Har: setreplication not allowed");
  }

  /**
   * Not implemented.
   */
  @Override
  public boolean delete(Path f, boolean recursive) throws IOException {
    throw new IOException("Har: delete not allowed");
  }

  /**
   * listStatus returns the children of a directory
   * after looking up the index files.
   */
  @Override
  public FileStatus[] listStatus(Path f) throws IOException {
    // need to see if the file is an index file
    // get the filestatus of the archive directory
    // we will create fake filestatuses to return
    // to the client
    List<FileStatus> statuses = new ArrayList<FileStatus>();
    Path tmpPath = makeQualified(f);
    Path harPath = getPathInHar(tmpPath);
    HarStatus hstatus = metadata.archive.get(harPath);
    if (hstatus == null) {
      throw new FileNotFoundException("File " + f + " not found in " + archivePath);
    }
    if (hstatus.isDir()) {
      fileStatusesInIndex(hstatus, statuses, hstatus.children);
    } else {
      statuses.add(toFileStatus(hstatus, null));
    }

    return statuses.toArray(new FileStatus[statuses.size()]);
  }

  /**
   * Return the top level archive path.
   */
  public Path getHomeDirectory() {
    return new Path(uri.toString());
  }

  public void setWorkingDirectory(Path newDir) {
    //does nothing.
  }

  /**
   * Not implemented.
   */
  public boolean mkdirs(Path f, FsPermission permission) throws IOException {
    throw new IOException("Har: mkdirs not allowed");
  }

  /**
   * Not implemented.
   */
  public void copyFromLocalFile(boolean delSrc, Path src, Path dst) throws
        IOException {
    throw new IOException("Har: copyfromlocalfile not allowed");
  }

  /**
   * Copies the file in the har filesystem to a local file.
   */
  public void copyToLocalFile(boolean delSrc, Path src, Path dst)
    throws IOException {
    FileUtil.copy(this, src, getLocal(getConf()), dst, false, getConf());
  }

  /**
   * Not implemented.
   */
  public Path startLocalOutput(Path fsOutputFile, Path tmpLocalFile)
    throws IOException {
    throw new IOException("Har: startLocalOutput not allowed");
  }

  /**
   * Not implemented.
   */
  public void completeLocalOutput(Path fsOutputFile, Path tmpLocalFile)
    throws IOException {
    throw new IOException("Har: completeLocalOutput not allowed");
  }

  /**
   * Not implemented.
   */
  public void setOwner(Path p, String username, String groupname)
    throws IOException {
    throw new IOException("Har: setowner not allowed");
  }

  /**
   * Not implemented.
   */
  public void setPermission(Path p, FsPermission permission)
    throws IOException {
    throw new IOException("Har: setPermission not allowed");
  }

  /**
   * Hadoop archives input stream. This input stream fakes EOF
   * since archive files are part of bigger part files.
   */
  private static class HarFSDataInputStream extends FSDataInputStream {
    /**
     * Create an input stream that fakes all the reads/positions/seeking.
     */
    private static class HarFsInputStream extends FSInputStream {
      private long position, start, end;
      //The underlying data input stream that the
      // underlying filesystem will return.
      private FSDataInputStream underLyingStream;
      //one byte buffer
      private byte[] oneBytebuff = new byte[1];
      HarFsInputStream(FileSystem fs, Path path, long start,
          long length, int bufferSize) throws IOException {
        underLyingStream = fs.open(path, bufferSize);
        underLyingStream.seek(start);
        // the start of this file in the part file
        this.start = start;
        // the position pointer in the part file
        this.position = start;
        // the end pointer in the part file
        this.end = start + length;
      }

      public synchronized int available() throws IOException {
        long remaining = end - underLyingStream.getPos();
        if (remaining > (long)Integer.MAX_VALUE) {
          return Integer.MAX_VALUE;
        }
        return (int) remaining;
      }

      public synchronized void close() throws IOException {
        underLyingStream.close();
        super.close();
      }

      //not implemented
      @Override
      public void mark(int readLimit) {
        // do nothing
      }

      /**
       * reset is not implemented
       */
      public void reset() throws IOException {
        throw new IOException("reset not implemented.");
      }

      public synchronized int read() throws IOException {
        int ret = read(oneBytebuff, 0, 1);
        return (ret <= 0) ? -1: (oneBytebuff[0] & 0xff);
      }

      public synchronized int read(byte[] b) throws IOException {
        // delegate to read(byte[], int, int), which already updates
        // position; incrementing it again here would double count
        return read(b, 0, b.length);
      }

      /**
       * Read up to len bytes, but never past the end of the archived
       * file within the part file.
       */
      public synchronized int read(byte[] b, int offset, int len)
        throws IOException {
        int newlen = len;
        int ret = -1;
        if (position + len > end) {
          newlen = (int) (end - position);
        }
        // end case
        if (newlen == 0)
          return ret;
        ret = underLyingStream.read(b, offset, newlen);
        position += ret;
        return ret;
      }

      public synchronized long skip(long n) throws IOException {
        long tmpN = n;
        if (tmpN > 0) {
          if (position + tmpN > end) {
            tmpN = end - position;
          }
          underLyingStream.seek(tmpN + position);
          position += tmpN;
          return tmpN;
        }
        return (tmpN < 0)? -1 : 0;
      }

      public synchronized long getPos() throws IOException {
        return (position - start);
      }

      public synchronized void seek(long pos) throws IOException {
        if (pos < 0 || (start + pos > end)) {
          throw new IOException("Failed to seek: EOF");
        }
        position = start + pos;
        underLyingStream.seek(position);
      }

      public boolean seekToNewSource(long targetPos) throws IOException {
        //do not need to implement this
        // hdfs in itself does seektonewsource
        // while reading.
        return false;
      }

      /**
       * implementing position readable.
       */
      public int read(long pos, byte[] b, int offset, int length)
      throws IOException {
        int nlength = length;
        if (start + nlength + pos > end) {
          nlength = (int) (end - (start + pos));
        }
        return underLyingStream.read(pos + start, b, offset, nlength);
      }

      /**
       * position readable again.
       */
      public void readFully(long pos, byte[] b, int offset, int length)
      throws IOException {
        if (start + length + pos > end) {
          throw new IOException("Not enough bytes to read.");
        }
        underLyingStream.readFully(pos + start, b, offset, length);
      }

      public void readFully(long pos, byte[] b) throws IOException {
        readFully(pos, b, 0, b.length);
      }

    }

    /**
     * Constructor for har input stream.
     * @param fs the underlying filesystem
     * @param p the path in the underlying filesystem
     * @param start the start position in the part file
     * @param length the length of valid data in the part file
     * @param bufsize the buffer size
     * @throws IOException
     */
    public HarFSDataInputStream(FileSystem fs, Path p, long start,
        long length, int bufsize) throws IOException {
        super(new HarFsInputStream(fs, p, start, length, bufsize));
    }

    /**
     * Constructor for har input stream.
     * @param fs the underlying filesystem
     * @param p the path in the underlying file system
     * @param start the start position in the part file
     * @param length the length of valid data in the part file.
     * @throws IOException
     */
    public HarFSDataInputStream(FileSystem fs, Path p, long start, long length)
      throws IOException {
        super(new HarFsInputStream(fs, p, start, length, 0));
    }
  }

  private class HarMetaData {
    private FileSystem fs;
    private int version;
    // the masterIndex of the archive
    private Path masterIndexPath;
    // the index file
    private Path archiveIndexPath;

    private long masterIndexTimestamp;
    private long archiveIndexTimestamp;

    List<Store> stores = new ArrayList<Store>();
    Map<Path, HarStatus> archive = new HashMap<Path, HarStatus>();
    private Map<Path, FileStatus> partFileStatuses = new HashMap<Path, FileStatus>();

    public HarMetaData(FileSystem fs, Path masterIndexPath, Path archiveIndexPath) {
      this.fs = fs;
      this.masterIndexPath = masterIndexPath;
      this.archiveIndexPath = archiveIndexPath;
    }

    public FileStatus getPartFileStatus(Path partPath) throws IOException {
      FileStatus status;
      status = partFileStatuses.get(partPath);
      if (status == null) {
        status = fs.getFileStatus(partPath);
        partFileStatuses.put(partPath, status);
      }
      return status;
    }

    public long getMasterIndexTimestamp() {
      return masterIndexTimestamp;
    }

    public long getArchiveIndexTimestamp() {
      return archiveIndexTimestamp;
    }

    private int getVersion() {
      return version;
    }

    private void parseMetaData() throws IOException {
      Text line;
      long read;
      FSDataInputStream in = null;
      LineReader lin = null;

      try {
        in = fs.open(masterIndexPath);
        FileStatus masterStat = fs.getFileStatus(masterIndexPath);
        masterIndexTimestamp = masterStat.getModificationTime();
        lin = new LineReader(in, getConf());
        line = new Text();
        read = lin.readLine(line);

        // the first line contains the version of the index file
        String versionLine = line.toString();
        String[] arr = versionLine.split(" ");
        version = Integer.parseInt(arr[0]);
        // only versions up to HarFileSystem.VERSION are understood;
        // newer archives are rejected
        if (this.version > HarFileSystem.VERSION) {
          throw new IOException("Invalid version " +
              this.version + " expected " + HarFileSystem.VERSION);
        }

        // each remaining line contains a hashcode range and the byte
        // range in the _index file where entries for those hashes live
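        // for example (hypothetical values), a master index line of the form
        // "0 1000000 0 2334" would mean that entries whose path hashes lie
        // between 0 and 1000000 are found between byte offsets 0 and 2334
        // of the _index file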
        String[] readStr = null;
        while(read < masterStat.getLen()) {
          int b = lin.readLine(line);
          read += b;
          readStr = line.toString().split(" ");
          int startHash = Integer.parseInt(readStr[0]);
          int endHash  = Integer.parseInt(readStr[1]);
          stores.add(new Store(Long.parseLong(readStr[2]),
              Long.parseLong(readStr[3]), startHash,
              endHash));
          line.clear();
        }
      } finally {
        IOUtils.cleanup(LOG, lin, in);
      }

      FSDataInputStream aIn = fs.open(archiveIndexPath);
      try {
        FileStatus archiveStat = fs.getFileStatus(archiveIndexPath);
        archiveIndexTimestamp = archiveStat.getModificationTime();
        LineReader aLin;

        // now start reading the real index file
        for (Store s: stores) {
          read = 0;
          aIn.seek(s.begin);
          aLin = new LineReader(aIn, getConf());
          while (read + s.begin < s.end) {
            int tmp = aLin.readLine(line);
            read += tmp;
            String lineFeed = line.toString();
            String[] parsed = lineFeed.split(" ");
            parsed[0] = decodeFileName(parsed[0]);
            archive.put(new Path(parsed[0]), new HarStatus(lineFeed));
            line.clear();
          }
        }
      } finally {
        IOUtils.cleanup(LOG, aIn);
      }
    }
  }

  /*
   * testing purposes only:
   */
  HarMetaData getMetadata() {
    return metadata;
  }
}