/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.fs;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.TreeMap;
import java.util.HashMap;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.LineReader;
import org.apache.hadoop.util.Progressable;

/**
 * This is an implementation of the Hadoop Archive (HAR)
 * FileSystem. An archive consists of index files of the form
 * _index* and data files of the form part-*. The index files
 * record where each archived file lives inside the part files.
 * There are two index files, _masterindex and _index. The master
 * index is a level of indirection into the index file that makes
 * lookups faster: the index file is sorted by the hash code of
 * the paths it contains, and the master index maps ranges of
 * hash codes to positions in the index file.
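 *
 * <p>For illustration (the archive location below is hypothetical), an
 * archive created at /user/foo/data.har typically looks like this in the
 * underlying filesystem:
 * <pre>
 *   /user/foo/data.har/_masterindex   hash ranges and their offsets in _index
 *   /user/foo/data.har/_index         one line per archived file or directory
 *   /user/foo/data.har/part-0         concatenated contents of the archived files
 * </pre>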
 */

public class HarFileSystem extends FilterFileSystem {

  private static final Log LOG = LogFactory.getLog(HarFileSystem.class);

  public static final String METADATA_CACHE_ENTRIES_KEY = "fs.har.metadatacache.entries";
  public static final int METADATA_CACHE_ENTRIES_DEFAULT = 10;

  public static final int VERSION = 3;

  private static Map<URI, HarMetaData> harMetaCache;

  // uri representation of this Har filesystem
  private URI uri;
  // the top level path of the archive
  // in the underlying file system
  private Path archivePath;
  // the har auth
  private String harAuth;

  // pointer into the static metadata cache
  private HarMetaData metadata;

  /**
   * Public no-arg constructor; the underlying filesystem is
   * set up later in {@link #initialize(URI, Configuration)}.
   */
  public HarFileSystem() {
  }

  /**
   * Constructor to create a HarFileSystem with an
   * underlying filesystem.
   * @param fs the underlying filesystem
   */
  public HarFileSystem(FileSystem fs) {
    super(fs);
  }

  private synchronized void initializeMetadataCache(Configuration conf) {
    if (harMetaCache == null) {
      int cacheSize = conf.getInt(METADATA_CACHE_ENTRIES_KEY, METADATA_CACHE_ENTRIES_DEFAULT);
      harMetaCache = Collections.synchronizedMap(new LruCache<URI, HarMetaData>(cacheSize));
    }
  }

  /**
   * Initialize a Har filesystem per har archive. The
   * archive home directory is the top level directory
   * in the underlying filesystem that contains the HAR archive.
   * Be careful with this method: you do not want to go
   * on creating new FileSystem instances per call to
   * path.getFileSystem().
   * The URI of a Har filesystem is
   * har://underlyingfsscheme-host:port/archivepath
   * or
   * har:///archivepath, in which case the configured default
   * filesystem is used as the underlying filesystem.
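   *
   * <p>A minimal usage sketch (the archive location and file names below
   * are hypothetical):
   * <pre>{@code
   *   Configuration conf = new Configuration();
   *   Path file = new Path(
   *       "har://hdfs-namenode:8020/user/foo/data.har/dir/archived.txt");
   *   FileSystem harFs = file.getFileSystem(conf); // a HarFileSystem
   *   FSDataInputStream in = harFs.open(file);     // reads from a part file
   * }</pre>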
   */
  public void initialize(URI name, Configuration conf) throws IOException {
    // initialize the metadata cache, if needed
    initializeMetadataCache(conf);

    // decode the name
    URI underLyingURI = decodeHarURI(name, conf);
    // we got the right har Path- now check if this is
    // truly a har filesystem
    Path harPath = archivePath(
      new Path(name.getScheme(), name.getAuthority(), name.getPath()));
    if (harPath == null) {
      throw new IOException("Invalid path for the Har Filesystem. " +
                           name.toString());
    }
    if (fs == null) {
      fs = FileSystem.get(underLyingURI, conf);
    }
    uri = harPath.toUri();
    archivePath = new Path(uri.getPath());
    harAuth = getHarAuth(underLyingURI);
    //check for the underlying fs containing
    // the index file
    Path masterIndexPath = new Path(archivePath, "_masterindex");
    Path archiveIndexPath = new Path(archivePath, "_index");
    if (!fs.exists(masterIndexPath) || !fs.exists(archiveIndexPath)) {
      throw new IOException("Invalid path for the Har Filesystem. " +
          "No index file in " + harPath);
    }

    metadata = harMetaCache.get(uri);
    if (metadata != null) {
      FileStatus mStat = fs.getFileStatus(masterIndexPath);
      FileStatus aStat = fs.getFileStatus(archiveIndexPath);
      if (mStat.getModificationTime() != metadata.getMasterIndexTimestamp() ||
          aStat.getModificationTime() != metadata.getArchiveIndexTimestamp()) {
        // the archive has been overwritten since we last read it
        // remove the entry from the meta data cache
        metadata = null;
        harMetaCache.remove(uri);
      }
    }
    if (metadata == null) {
      metadata = new HarMetaData(fs, masterIndexPath, archiveIndexPath);
      metadata.parseMetaData();
      harMetaCache.put(uri, metadata);
    }
  }

  // get the version of the filesystem from the masterindex file
  // the version is currently not useful since it is the first version
  // of archives
  public int getHarVersion() throws IOException {
    if (metadata != null) {
      return metadata.getVersion();
    }
    else {
      throw new IOException("Invalid meta data for the Har Filesystem");
    }
  }

  /*
   * Find the closest ancestor of p (including p itself) whose
   * final path segment ends with .har; that ancestor is the
   * archive path. Returns null if there is no such ancestor.
   */
  private Path archivePath(Path p) {
    Path retPath = null;
    Path tmp = p;
    for (int i=0; i< p.depth(); i++) {
      if (tmp.toString().endsWith(".har")) {
        retPath = tmp;
        break;
      }
      tmp = tmp.getParent();
    }
    return retPath;
  }

  /**
   * Decode the raw Har URI to get the URI of the underlying filesystem.
   * @param rawURI raw Har URI
   * @param conf the configuration, used to look up the default filesystem
   *        when the Har URI has no authority
   * @return URI of the underlying filesystem
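   *
   * <p>For example (hostnames are illustrative):
   * <pre>
   *   har://hdfs-namenode:8020/user/foo/data.har
   *       decodes to hdfs://namenode:8020/user/foo/data.har
   *   har:///user/foo/data.har
   *       decodes to the configured default filesystem URI
   * </pre>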
   */
  private URI decodeHarURI(URI rawURI, Configuration conf) throws IOException {
    String tmpAuth = rawURI.getAuthority();
    // no authority: we are using the default filesystem
    // from the config, so return its URI as the
    // underlying URI
    if (tmpAuth == null) {
      return FileSystem.getDefaultUri(conf);
    }
    String host = rawURI.getHost();
    if (host == null) {
      throw new IOException("URI: " + rawURI
          + " is an invalid Har URI since host==null."
          + "  Expecting har://<scheme>-<host>/<path>.");
    }
    int i = host.indexOf('-');
    if (i < 0) {
      throw new IOException("URI: " + rawURI
          + " is an invalid Har URI since '-' not found."
          + "  Expecting har://<scheme>-<host>/<path>.");
    }
    final String underLyingScheme = host.substring(0, i);
    i++;
    final String underLyingHost = i == host.length()? null: host.substring(i);
    int underLyingPort = rawURI.getPort();
    String auth = (underLyingHost == null && underLyingPort == -1)?
                  null:(underLyingHost+":"+underLyingPort);
    URI tmp = null;
    if (rawURI.getQuery() != null) {
      // query component not allowed
      throw new IOException("query component in Path not supported  " + rawURI);
    }
    try {
      tmp = new URI(underLyingScheme, auth, rawURI.getPath(),
            rawURI.getQuery(), rawURI.getFragment());
    } catch (URISyntaxException e) {
      // should not happen; the components come from an already-valid URI
    }
    return tmp;
  }

  private static String decodeString(String str)
    throws UnsupportedEncodingException {
    return URLDecoder.decode(str, "UTF-8");
  }

  private String decodeFileName(String fname)
    throws UnsupportedEncodingException {
    int version = metadata.getVersion();
    if (version == 2 || version == 3){
      return decodeString(fname);
    }
    return fname;
  }

  /**
   * Return the top level archive path as the working directory.
   */
  public Path getWorkingDirectory() {
    return new Path(uri.toString());
  }

  /**
   * Create a har specific auth of the form
   * underlyingscheme-host:port from the URI of the underlying
   * filesystem; for example, hdfs://namenode:8020 yields
   * "hdfs-namenode:8020".
   * @param underLyingUri the URI of the underlying filesystem
   * @return har specific auth
   */
  private String getHarAuth(URI underLyingUri) {
    String auth = underLyingUri.getScheme() + "-";
    if (underLyingUri.getHost() != null) {
      auth += underLyingUri.getHost() + ":";
      if (underLyingUri.getPort() != -1) {
        auth +=  underLyingUri.getPort();
      }
    }
    else {
      auth += ":";
    }
    return auth;
  }

  /**
   * Returns the URI of this filesystem.
   * The URI is of the form
   * har://underlyingfsscheme-host:port/pathintheunderlyingfs
   */
  @Override
  public URI getUri() {
    return this.uri;
  }

  /**
   * Returns the path inside the har filesystem, i.e. the path
   * relative to the archive root, or null if the given path is
   * not under the archive. For example, with an archive at
   * /user/foo/data.har, the path /user/foo/data.har/dir/file
   * maps to /dir/file.
   * @param path the fully qualified path in the har filesystem.
   * @return relative path in the har filesystem.
   */
  private Path getPathInHar(Path path) {
    Path harPath = new Path(path.toUri().getPath());
    if (archivePath.compareTo(harPath) == 0)
      return new Path(Path.SEPARATOR);
    Path tmp = new Path(harPath.getName());
    Path parent = harPath.getParent();
    while (!(parent.compareTo(archivePath) == 0)) {
      if (parent.toString().equals(Path.SEPARATOR)) {
        tmp = null;
        break;
      }
      tmp = new Path(parent.getName(), tmp);
      parent = parent.getParent();
    }
    if (tmp != null)
      tmp = new Path(Path.SEPARATOR, tmp);
    return tmp;
  }

  // Return the path p re-rooted under "initial" in this har
  // filesystem. Rather than parsing and manipulating strings,
  // rebuild the path segment by segment using the Path API.
  private Path makeRelative(String initial, Path p) {
    String scheme = this.uri.getScheme();
    String authority = this.uri.getAuthority();
    Path root = new Path(Path.SEPARATOR);
    if (root.compareTo(p) == 0)
      return new Path(scheme, authority, initial);
    Path retPath = new Path(p.getName());
    Path parent = p.getParent();
    for (int i=0; i < p.depth()-1; i++) {
      retPath = new Path(parent.getName(), retPath);
      parent = parent.getParent();
    }
    return new Path(new Path(scheme, authority, initial),
      retPath.toString());
  }

  /* this makes a path qualified in the har filesystem
   * (non-Javadoc)
   * @see org.apache.hadoop.fs.FilterFileSystem#makeQualified(
   * org.apache.hadoop.fs.Path)
   */
  @Override
  public Path makeQualified(Path path) {
    // make sure that we just get the
    // path component
    Path fsPath = path;
    if (!path.isAbsolute()) {
      fsPath = new Path(archivePath, path);
    }

    URI tmpURI = fsPath.toUri();
    //change this to Har uri
    return new Path(uri.getScheme(), harAuth, tmpURI.getPath());
  }

  /**
   * Fix offset and length of block locations.
   * Note that this method modifies the original array.
   * @param locations block locations of har part file
   * @param start the start of the desired range in the contained file
   * @param len the length of the desired range
   * @param fileOffsetInHar the offset of the desired file in the har part file
   * @return block locations with fixed offset and length
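   *
   * <p>A worked example with made-up numbers: suppose the archived file
   * starts at byte 1000 of the part file (fileOffsetInHar = 1000), the
   * caller asks for start = 0 and len = 500, and the part file has a
   * single block at offset 512 with length 1024. Relative to the archived
   * file that block spans bytes -488 to 536, so its location is rewritten
   * to offset 0 and length 500, exactly the requested range.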
   */
  static BlockLocation[] fixBlockLocations(BlockLocation[] locations,
                                          long start,
                                          long len,
                                          long fileOffsetInHar) {
    // offset 1 past last byte of desired range
    long end = start + len;

    for (BlockLocation location : locations) {
      // offset of part block relative to beginning of desired file
      // (may be negative if file starts in this part block)
      long harBlockStart = location.getOffset() - fileOffsetInHar;
      // offset 1 past last byte of har block relative to beginning of
      // desired file
      long harBlockEnd = harBlockStart + location.getLength();

      if (start > harBlockStart) {
        // desired range starts after beginning of this har block
        // fix offset to beginning of relevant range (relative to desired file)
        location.setOffset(start);
        // fix length to relevant portion of har block
        location.setLength(location.getLength() - (start - harBlockStart));
      } else {
        // desired range includes beginning of this har block
        location.setOffset(harBlockStart);
      }

      if (harBlockEnd > end) {
        // range ends before end of this har block
        // fix length to remove irrelevant portion at the end
        location.setLength(location.getLength() - (harBlockEnd - end));
      }
    }

    return locations;
  }

  /**
   * Get block locations from the underlying fs and fix their
   * offsets and lengths.
   * @param file the input filestatus to get block locations
   * @param start the start of the desired range in the contained file
   * @param len the length of the desired range
   * @return block locations for this segment of file
   * @throws IOException
   */
  @Override
  public BlockLocation[] getFileBlockLocations(FileStatus file, long start,
                                               long len) throws IOException {
    HarStatus hstatus = getFileHarStatus(file.getPath());
    Path partPath = new Path(archivePath, hstatus.getPartName());
    FileStatus partStatus = metadata.getPartFileStatus(partPath);

    // get all part blocks that overlap with the desired file blocks
    BlockLocation[] locations =
      fs.getFileBlockLocations(partStatus,
                               hstatus.getStartIndex() + start, len);

    return fixBlockLocations(locations, start, len, hstatus.getStartIndex());
  }

  /**
   * The hash of the path p inside the har filesystem, as used
   * to index entries in the _index file.
   * @param p the path in the har filesystem
   * @return the hash code of the path.
   */
  public static int getHarHash(Path p) {
    return (p.toString().hashCode() & 0x7fffffff);
  }

  static class Store {
    public Store() {
      begin = end = startHash = endHash = 0;
    }
    public Store(long begin, long end, int startHash, int endHash) {
      this.begin = begin;
      this.end = end;
      this.startHash = startHash;
      this.endHash = endHash;
    }
    public long begin;
    public long end;
    public int startHash;
    public int endHash;
  }

  /**
   * Get filestatuses of all the children of a given directory. This just
   * scans through all entries in the archive index and picks out those
   * whose parent is the given directory. It's a brute-force way of getting
   * all such filestatuses.
   *
   * @param parent
   *          the parent path directory
   * @param statuses
   *          the list to add the children filestatuses to
   * @param children
   *          the string list of children for this parent
   */
  private void fileStatusesInIndex(HarStatus parent, List<FileStatus> statuses,
      List<String> children) throws IOException {
    String parentString = parent.getName();
    if (!parentString.endsWith(Path.SEPARATOR)){
        parentString += Path.SEPARATOR;
    }
    Path harPath = new Path(parentString);
    int harlen = harPath.depth();
    final Map<String, FileStatus> cache = new TreeMap<String, FileStatus>();

    for (HarStatus hstatus : metadata.archive.values()) {
      String child = hstatus.getName();
      if ((child.startsWith(parentString))) {
        Path thisPath = new Path(child);
        if (thisPath.depth() == harlen + 1) {
          statuses.add(toFileStatus(hstatus, cache));
        }
      }
    }
  }

  /**
   * Combine the status stored in the index and the underlying status.
   * @param h status stored in the index
   * @param cache caching the underlying file statuses
   * @return the combined file status
   * @throws IOException
   */
  private FileStatus toFileStatus(HarStatus h,
      Map<String, FileStatus> cache) throws IOException {
    FileStatus underlying = null;
    if (cache != null) {
      underlying = cache.get(h.partName);
    }
    if (underlying == null) {
      final Path p = h.isDir? archivePath: new Path(archivePath, h.partName);
      underlying = fs.getFileStatus(p);
      if (cache != null) {
        cache.put(h.partName, underlying);
      }
    }

    long modTime = 0;
    int version = metadata.getVersion();
    if (version < 3) {
      modTime = underlying.getModificationTime();
    } else if (version == 3) {
      modTime = h.getModificationTime();
    }

    return new FileStatus(
        h.isDir()? 0L: h.getLength(),
        h.isDir(),
        underlying.getReplication(),
        underlying.getBlockSize(),
        modTime,
        underlying.getAccessTime(),
        underlying.getPermission(),
        underlying.getOwner(),
        underlying.getGroup(),
        makeRelative(this.uri.getPath(), new Path(h.name)));
  }

  // A single-line parser for a hadoop archive file/directory status,
  // stored as one line in the index files. The format of a line is:
  //   filename "dir"/"file" partFileName startIndex length
  //   <space separated children>
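  // Illustrative (hypothetical) entries, ignoring the URL-encoding of names:
  //   version 1/2 directory entry:  "/dir1 dir none 0 0 fileA fileB"
  //   version 1/2 file entry:       "/dir1/fileA file part-0 0 1024"
  // Version 3 appends an extra URL-encoded properties field
  // "modtime permission owner group"; it is the last field for files and
  // sits in the partFileName slot for directories.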
  private class HarStatus {
    boolean isDir;
    String name;
    List<String> children;
    String partName;
    long startIndex;
    long length;
    long modificationTime = 0;

    public HarStatus(String harString) throws UnsupportedEncodingException {
      String[] splits = harString.split(" ");
      this.name = decodeFileName(splits[0]);
      this.isDir = "dir".equals(splits[1]);
      // partName is equal to "none" if this entry is a directory
      this.partName = splits[2];
      this.startIndex = Long.parseLong(splits[3]);
      this.length = Long.parseLong(splits[4]);

      int version = metadata.getVersion();
      String[] propSplits = null;
      // propSplits is used to retrieve the metainformation that Har versions
      // 1 & 2 missed (modification time, permission, owner group).
      // These fields are stored in an encoded string placed in different
      // locations depending on whether it's a file or directory entry.
      // If it's a directory, the string will be placed at the partName
      // location (directories have no partName because they don't have data
      // to be stored). This is done because the number of fields in a
      // directory entry is unbounded (all children are listed at the end)
      // If it's a file, the string will be the last field.
      if (isDir) {
        if (version == 3){
          propSplits = decodeString(this.partName).split(" ");
        }
        children = new ArrayList<String>();
        for (int i = 5; i < splits.length; i++) {
          children.add(decodeFileName(splits[i]));
        }
      } else if (version == 3) {
        propSplits = decodeString(splits[5]).split(" ");
      }

      if (propSplits != null && propSplits.length >= 4) {
        modificationTime = Long.parseLong(propSplits[0]);
        // the fields below are stored in the file but are currently not used
        // by HarFileSystem
        // permission = new FsPermission(Short.parseShort(propSplits[1]));
        // owner = decodeString(propSplits[2]);
        // group = decodeString(propSplits[3]);
      }
    }
    public boolean isDir() {
      return isDir;
    }

    public String getName() {
      return name;
    }
    public String getPartName() {
      return partName;
    }
    public long getStartIndex() {
      return startIndex;
    }
    public long getLength() {
      return length;
    }
    public long getModificationTime() {
      return modificationTime;
    }
  }

  /**
   * Return the filestatus of a file in the har archive.
   * The permissions returned are those of the archive
   * index files, since permissions are not persisted
   * when creating a hadoop archive.
   * @param f the path in the har filesystem
   * @return filestatus.
   * @throws IOException
   */
  @Override
  public FileStatus getFileStatus(Path f) throws IOException {
    HarStatus hstatus = getFileHarStatus(f);
    return toFileStatus(hstatus, null);
  }

  private HarStatus getFileHarStatus(Path f) throws IOException {
    // look up the file in the archive index
    Path p = makeQualified(f);
    Path harPath = getPathInHar(p);
    if (harPath == null) {
      throw new IOException("Invalid file name: " + f + " in " + uri);
    }
    HarStatus hstatus = metadata.archive.get(harPath);
    if (hstatus == null) {
      throw new FileNotFoundException("File: " + f + " does not exist in " + uri);
    }
    return hstatus;
  }

  /**
   * @return null since no checksum algorithm is implemented.
   */
  public FileChecksum getFileChecksum(Path f) {
    return null;
  }


  /**
   * Returns a har input stream which fakes end of
   * file. It reads the index files to get the part
   * file name and the size and start of the file.
   */
  @Override
  public FSDataInputStream open(Path f, int bufferSize) throws IOException {
    // get the fs DataInputStream for the underlying file
    HarStatus hstatus = getFileHarStatus(f);
    // we got it.. woo hooo!!!
    if (hstatus.isDir()) {
      throw new FileNotFoundException(f + " : not a file in " +
                archivePath);
    }
    return new HarFSDataInputStream(fs, new Path(archivePath,
        hstatus.getPartName()),
        hstatus.getStartIndex(), hstatus.getLength(), bufferSize);
  }

  public FSDataOutputStream create(Path f,
      FsPermission permission,
      boolean overwrite,
      int bufferSize,
      short replication,
      long blockSize,
      Progressable progress) throws IOException {
    throw new IOException("Har: create not allowed.");
  }

  @Override
  public void close() throws IOException {
    if (fs != null) {
      try {
        fs.close();
      } catch(IOException ie) {
        //this might already be closed
        // ignore
      }
    }
  }

  /**
   * Not implemented.
   */
  @Override
  public boolean setReplication(Path src, short replication) throws IOException{
    throw new IOException("Har: setreplication not allowed");
  }

  /**
   * Not implemented.
   */
  @Override
  public boolean delete(Path f, boolean recursive) throws IOException {
    throw new IOException("Har: delete not allowed");
  }

  @Override
  public RemoteIterator<LocatedFileStatus> listLocatedStatus(Path f)
  throws IOException {
    // Use FileSystem's implementation
    return listLocatedStatus(f, DEFAULT_FILTER);
  }

  /**
   * liststatus returns the children of a directory
   * after looking up the index files.
   */
  @Override
  public FileStatus[] listStatus(Path f) throws IOException {
    // f may be a file or a directory inside the archive;
    // we create fake filestatuses from the index files
    // to return to the client
    List<FileStatus> statuses = new ArrayList<FileStatus>();
    Path tmpPath = makeQualified(f);
    Path harPath = getPathInHar(tmpPath);
    HarStatus hstatus = metadata.archive.get(harPath);
    if (hstatus == null) {
      throw new FileNotFoundException("File " + f + " not found in " + archivePath);
    }
    if (hstatus.isDir()) {
      fileStatusesInIndex(hstatus, statuses, hstatus.children);
    } else {
      statuses.add(toFileStatus(hstatus, null));
    }

    return statuses.toArray(new FileStatus[statuses.size()]);
  }

  /**
   * return the top level archive path.
   */
  public Path getHomeDirectory() {
    return new Path(uri.toString());
  }

  public void setWorkingDirectory(Path newDir) {
    //does nothing.
  }

  /**
   * not implemented.
   */
  public boolean mkdirs(Path f, FsPermission permission) throws IOException {
    throw new IOException("Har: mkdirs not allowed");
  }

  /**
   * not implemented.
   */
  public void copyFromLocalFile(boolean delSrc, Path src, Path dst) throws
        IOException {
    throw new IOException("Har: copyfromlocalfile not allowed");
  }

  /**
   * copies the file in the har filesystem to a local file.
   */
  public void copyToLocalFile(boolean delSrc, Path src, Path dst)
    throws IOException {
    FileUtil.copy(this, src, getLocal(getConf()), dst, false, getConf());
  }

  /**
   * not implemented.
   */
  public Path startLocalOutput(Path fsOutputFile, Path tmpLocalFile)
    throws IOException {
    throw new IOException("Har: startLocalOutput not allowed");
  }

  /**
   * not implemented.
   */
  public void completeLocalOutput(Path fsOutputFile, Path tmpLocalFile)
    throws IOException {
    throw new IOException("Har: completeLocalOutput not allowed");
  }

  /**
   * not implemented.
   */
  public void setOwner(Path p, String username, String groupname)
    throws IOException {
    throw new IOException("Har: setowner not allowed");
  }

  /**
   * Not implemented.
   */
  public void setPermission(Path p, FsPermission permission)
    throws IOException {
    throw new IOException("Har: setPermission not allowed");
  }

  /**
   * Hadoop archives input stream. This input stream fakes EOF
   * since archive files are part of bigger part files.
   */
  private static class HarFSDataInputStream extends FSDataInputStream {
    /**
     * Create an input stream that fakes all the reads/positions/seeking.
     */
    private static class HarFsInputStream extends FSInputStream {
      private long position, start, end;
      //The underlying data input stream that the
      // underlying filesystem will return.
      private FSDataInputStream underLyingStream;
      //one byte buffer
      private byte[] oneBytebuff = new byte[1];
      HarFsInputStream(FileSystem fs, Path path, long start,
          long length, int bufferSize) throws IOException {
        underLyingStream = fs.open(path, bufferSize);
        underLyingStream.seek(start);
        // the start of this file in the part file
        this.start = start;
        // the position pointer in the part file
        this.position = start;
        // the end pointer in the part file
        this.end = start + length;
      }

      public synchronized int available() throws IOException {
        long remaining = end - underLyingStream.getPos();
        if (remaining > (long)Integer.MAX_VALUE) {
          return Integer.MAX_VALUE;
        }
        return (int) remaining;
      }

      public synchronized void close() throws IOException {
        underLyingStream.close();
        super.close();
      }

      //not implemented
      @Override
      public void mark(int readLimit) {
        // do nothing
      }

      /**
       * reset is not implemented
       */
      public void reset() throws IOException {
        throw new IOException("reset not implemented.");
      }

      public synchronized int read() throws IOException {
        int ret = read(oneBytebuff, 0, 1);
        return (ret <= 0) ? -1: (oneBytebuff[0] & 0xff);
      }

      public synchronized int read(byte[] b) throws IOException {
        // position is already advanced by read(byte[], int, int);
        // adding to it again here would double-count the bytes read
        return read(b, 0, b.length);
      }

      /**
       * Read up to len bytes, truncating the request at the faked
       * end of file so we never read past this file's data in the
       * part file.
       */
      public synchronized int read(byte[] b, int offset, int len)
        throws IOException {
        int newlen = len;
        int ret = -1;
        if (position + len > end) {
          newlen = (int) (end - position);
        }
        // end case
        if (newlen == 0)
          return ret;
        ret = underLyingStream.read(b, offset, newlen);
        // only advance the position for bytes actually read
        if (ret > 0) {
          position += ret;
        }
        return ret;
      }

      public synchronized long skip(long n) throws IOException {
        long tmpN = n;
        if (tmpN > 0) {
          if (position + tmpN > end) {
            tmpN = end - position;
          }
          underLyingStream.seek(tmpN + position);
          position += tmpN;
          return tmpN;
        }
        return (tmpN < 0)? -1 : 0;
      }

      public synchronized long getPos() throws IOException {
        return (position - start);
      }

      public synchronized void seek(long pos) throws IOException {
        if (pos < 0 || (start + pos > end)) {
          throw new IOException("Failed to seek: EOF");
        }
        position = start + pos;
        underLyingStream.seek(position);
      }

      public boolean seekToNewSource(long targetPos) throws IOException {
        // no need to implement this here;
        // HDFS itself handles seekToNewSource
        // while reading.
        return false;
      }

      /**
       * Implements positioned read.
       */
      public int read(long pos, byte[] b, int offset, int length)
      throws IOException {
        int nlength = length;
        if (start + nlength + pos > end) {
          nlength = (int) (end - (start + pos));
        }
        return underLyingStream.read(pos + start, b, offset, nlength);
      }

      /**
       * Positioned readFully.
       */
      public void readFully(long pos, byte[] b, int offset, int length)
      throws IOException {
        if (start + length + pos > end) {
          throw new IOException("Not enough bytes to read.");
        }
        underLyingStream.readFully(pos + start, b, offset, length);
      }

      public void readFully(long pos, byte[] b) throws IOException {
        readFully(pos, b, 0, b.length);
      }

    }

    /**
     * Constructor for har input stream.
     * @param fs the underlying filesystem
     * @param p the path in the underlying filesystem
     * @param start the start position in the part file
     * @param length the length of valid data in the part file
     * @param bufsize the buffer size
     * @throws IOException
     */
    public HarFSDataInputStream(FileSystem fs, Path p, long start,
        long length, int bufsize) throws IOException {
        super(new HarFsInputStream(fs, p, start, length, bufsize));
    }

    /**
     * Constructor for har input stream.
     * @param fs the underlying filesystem
     * @param p the path in the underlying file system
     * @param start the start position in the part file
     * @param length the length of valid data in the part file.
     * @throws IOException
     */
    public HarFSDataInputStream(FileSystem fs, Path p, long start, long length)
      throws IOException {
        super(new HarFsInputStream(fs, p, start, length, 0));
    }
  }

  private class HarMetaData {
    private FileSystem fs;
    private int version;
    // the masterIndex of the archive
    private Path masterIndexPath;
    // the index file
    private Path archiveIndexPath;

    private long masterIndexTimestamp;
    private long archiveIndexTimestamp;

    List<Store> stores = new ArrayList<Store>();
    Map<Path, HarStatus> archive = new HashMap<Path, HarStatus>();
    private Map<Path, FileStatus> partFileStatuses = new HashMap<Path, FileStatus>();

    public HarMetaData(FileSystem fs, Path masterIndexPath, Path archiveIndexPath) {
      this.fs = fs;
      this.masterIndexPath = masterIndexPath;
      this.archiveIndexPath = archiveIndexPath;
    }

    public FileStatus getPartFileStatus(Path partPath) throws IOException {
      FileStatus status;
      status = partFileStatuses.get(partPath);
      if (status == null) {
        status = fs.getFileStatus(partPath);
        partFileStatuses.put(partPath, status);
      }
      return status;
    }

    public long getMasterIndexTimestamp() {
      return masterIndexTimestamp;
    }

    public long getArchiveIndexTimestamp() {
      return archiveIndexTimestamp;
    }

    private int getVersion() {
      return version;
    }

    private void parseMetaData() throws IOException {
      Text line;
      long read;
      FSDataInputStream in = null;
      LineReader lin = null;

      try {
        in = fs.open(masterIndexPath);
        FileStatus masterStat = fs.getFileStatus(masterIndexPath);
        masterIndexTimestamp = masterStat.getModificationTime();
        lin = new LineReader(in, getConf());
        line = new Text();
        read = lin.readLine(line);

        // the first line contains the version of the index file
        String versionLine = line.toString();
        String[] arr = versionLine.split(" ");
        version = Integer.parseInt(arr[0]);
        // make it always backwards-compatible
        if (this.version > HarFileSystem.VERSION) {
          throw new IOException("Invalid version " +
              this.version + " expected " + HarFileSystem.VERSION);
        }

        // each subsequent line contains a hashcode range and the
        // begin/end byte offsets of that range in the index file
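        // e.g. a (hypothetical) line: "0 1000000000 0 2353", meaning paths
        // whose hash falls in [0, 1000000000] are listed between bytes
        // 0 and 2353 of the _index file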
        String[] readStr = null;
        while(read < masterStat.getLen()) {
          int b = lin.readLine(line);
          read += b;
          readStr = line.toString().split(" ");
          int startHash = Integer.parseInt(readStr[0]);
          int endHash  = Integer.parseInt(readStr[1]);
          stores.add(new Store(Long.parseLong(readStr[2]),
              Long.parseLong(readStr[3]), startHash,
              endHash));
          line.clear();
        }
      } finally {
        IOUtils.cleanup(LOG, lin, in);
      }

      FSDataInputStream aIn = fs.open(archiveIndexPath);
      try {
        FileStatus archiveStat = fs.getFileStatus(archiveIndexPath);
        archiveIndexTimestamp = archiveStat.getModificationTime();
        LineReader aLin;

        // now start reading the real index file
        for (Store s: stores) {
          read = 0;
          aIn.seek(s.begin);
          aLin = new LineReader(aIn, getConf());
          while (read + s.begin < s.end) {
            int tmp = aLin.readLine(line);
            read += tmp;
            String lineFeed = line.toString();
            String[] parsed = lineFeed.split(" ");
            parsed[0] = decodeFileName(parsed[0]);
            archive.put(new Path(parsed[0]), new HarStatus(lineFeed));
            line.clear();
          }
        }
      } finally {
        IOUtils.cleanup(LOG, aIn);
      }
    }
  }

  /*
   * testing purposes only:
   */
  HarMetaData getMetadata() {
    return metadata;
  }

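  /**
   * A small access-ordered LRU cache built on LinkedHashMap: once the map
   * grows past maxEntries, the least recently accessed entry is evicted.
   * Used for the static HAR metadata cache above.
   */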
  private static class LruCache<K, V> extends LinkedHashMap<K, V> {
    private final int MAX_ENTRIES;

    public LruCache(int maxEntries) {
      super(maxEntries + 1, 1.0f, true);
      MAX_ENTRIES = maxEntries;
    }

    @Override
    protected boolean removeEldestEntry(Map.Entry<K, V> eldest) {
      return size() > MAX_ENTRIES;
    }
  }
}