001 /**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements. See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership. The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License. You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018 package org.apache.hadoop.fs;
019
020 import org.apache.commons.logging.Log;
021 import org.apache.commons.logging.LogFactory;
022 import org.apache.hadoop.conf.Configuration;
023 import org.apache.hadoop.fs.permission.FsPermission;
024 import org.apache.hadoop.io.IOUtils;
025 import org.apache.hadoop.io.Text;
026 import org.apache.hadoop.util.LineReader;
027 import org.apache.hadoop.util.Progressable;
028
029 import java.io.FileNotFoundException;
030 import java.io.IOException;
031 import java.io.UnsupportedEncodingException;
032 import java.net.URI;
033 import java.net.URISyntaxException;
034 import java.net.URLDecoder;
035 import java.util.*;
036
/**
 * This is an implementation of the Hadoop Archive
 * Filesystem. This archive Filesystem has index files
 * of the form _index* and has contents of the form
 * part-*. The index files store the indexes of the
 * real files. The index files are of the form _masterindex
 * and _index. The master index is a level of indirection
 * into the index file that makes the lookups faster. The index
 * file is sorted by the hash code of the paths that it contains,
 * and the master index contains pointers to positions in the
 * index file for ranges of hash codes.
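 *
 * <p>A minimal usage sketch (the host name and archive path below are
 * hypothetical, for illustration only):
 * <pre>
 *   Configuration conf = new Configuration();
 *   Path p = new Path("har://hdfs-namenode:8020/user/alice/data.har/dir/file.txt");
 *   FileSystem harFs = p.getFileSystem(conf);
 *   FSDataInputStream in = harFs.open(p);
 *   // read from the stream as from any other FileSystem, then close it
 *   in.close();
 * </pre>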
048 */
049
050 public class HarFileSystem extends FileSystem {
051
052 private static final Log LOG = LogFactory.getLog(HarFileSystem.class);
053
054 public static final String METADATA_CACHE_ENTRIES_KEY = "fs.har.metadatacache.entries";
055 public static final int METADATA_CACHE_ENTRIES_DEFAULT = 10;
056
057 public static final int VERSION = 3;
058
059 private static Map<URI, HarMetaData> harMetaCache;
060
061 // uri representation of this Har filesystem
062 private URI uri;
063 // the top level path of the archive
064 // in the underlying file system
065 private Path archivePath;
066 // the har auth
067 private String harAuth;
068
069 // pointer into the static metadata cache
070 private HarMetaData metadata;
071
072 private FileSystem fs;
073
  /**
   * Public no-argument constructor for HarFileSystem.
   */
077 public HarFileSystem() {
078 // Must call #initialize() method to set the underlying file system
079 }
080
081 /**
082 * Return the protocol scheme for the FileSystem.
083 * <p/>
084 *
085 * @return <code>har</code>
086 */
087 @Override
088 public String getScheme() {
089 return "har";
090 }
091
092 /**
093 * Constructor to create a HarFileSystem with an
094 * underlying filesystem.
095 * @param fs underlying file system
096 */
097 public HarFileSystem(FileSystem fs) {
098 this.fs = fs;
099 this.statistics = fs.statistics;
100 }
101
102 private synchronized void initializeMetadataCache(Configuration conf) {
103 if (harMetaCache == null) {
104 int cacheSize = conf.getInt(METADATA_CACHE_ENTRIES_KEY, METADATA_CACHE_ENTRIES_DEFAULT);
105 harMetaCache = Collections.synchronizedMap(new LruCache<URI, HarMetaData>(cacheSize));
106 }
107 }
108
  /**
   * Initialize a Har filesystem per har archive. The
   * archive home directory is the top level directory
   * in the filesystem that contains the HAR archive.
   * Be careful with this method: you do not want to keep
   * creating new FileSystem instances per call to
   * path.getFileSystem().
   * The URI of a Har filesystem is either
   * har://underlyingfsscheme-host:port/archivepath
   * or
   * har:///archivepath, in which case the configured default
   * filesystem is used as the underlying filesystem.
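   * <p>For illustration (the host and path are hypothetical):
   * har://hdfs-namenode:8020/user/alice/data.har addresses an archive stored
   * on hdfs://namenode:8020, while har:///user/alice/data.har addresses the
   * same archive through the configured default filesystem.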
121 */
122 @Override
123 public void initialize(URI name, Configuration conf) throws IOException {
124 // initialize the metadata cache, if needed
125 initializeMetadataCache(conf);
126
127 // decode the name
128 URI underLyingURI = decodeHarURI(name, conf);
129 // we got the right har Path- now check if this is
130 // truly a har filesystem
131 Path harPath = archivePath(
132 new Path(name.getScheme(), name.getAuthority(), name.getPath()));
133 if (harPath == null) {
134 throw new IOException("Invalid path for the Har Filesystem. " +
135 name.toString());
136 }
137 if (fs == null) {
138 fs = FileSystem.get(underLyingURI, conf);
139 }
140 uri = harPath.toUri();
141 archivePath = new Path(uri.getPath());
142 harAuth = getHarAuth(underLyingURI);
143 //check for the underlying fs containing
144 // the index file
145 Path masterIndexPath = new Path(archivePath, "_masterindex");
146 Path archiveIndexPath = new Path(archivePath, "_index");
147 if (!fs.exists(masterIndexPath) || !fs.exists(archiveIndexPath)) {
148 throw new IOException("Invalid path for the Har Filesystem. " +
149 "No index file in " + harPath);
150 }
151
152 metadata = harMetaCache.get(uri);
153 if (metadata != null) {
154 FileStatus mStat = fs.getFileStatus(masterIndexPath);
155 FileStatus aStat = fs.getFileStatus(archiveIndexPath);
156 if (mStat.getModificationTime() != metadata.getMasterIndexTimestamp() ||
157 aStat.getModificationTime() != metadata.getArchiveIndexTimestamp()) {
158 // the archive has been overwritten since we last read it
159 // remove the entry from the meta data cache
160 metadata = null;
161 harMetaCache.remove(uri);
162 }
163 }
164 if (metadata == null) {
165 metadata = new HarMetaData(fs, masterIndexPath, archiveIndexPath);
166 metadata.parseMetaData();
167 harMetaCache.put(uri, metadata);
168 }
169 }
170
171 @Override
172 public Configuration getConf() {
173 return fs.getConf();
174 }
175
  // get the version of the filesystem from the masterindex file
  // the version is currently not useful since it is the first version
  // of archives
179 public int getHarVersion() throws IOException {
180 if (metadata != null) {
181 return metadata.getVersion();
182 }
183 else {
184 throw new IOException("Invalid meta data for the Har Filesystem");
185 }
186 }
187
  /*
   * Find the ancestor of p that is the archive path,
   * i.e. the deepest path component whose name ends
   * with ".har". Returns null if there is no such
   * ancestor.
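   * For example (hypothetical path), for
   * "/user/alice/data.har/dir/file" this returns
   * "/user/alice/data.har".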
193 */
194 private Path archivePath(Path p) {
195 Path retPath = null;
196 Path tmp = p;
197 for (int i=0; i< p.depth(); i++) {
198 if (tmp.toString().endsWith(".har")) {
199 retPath = tmp;
200 break;
201 }
202 tmp = tmp.getParent();
203 }
204 return retPath;
205 }
206
  /**
   * Decode the raw URI to get the underlying URI.
   * @param rawURI raw Har URI
   * @param conf the configuration used to resolve the default filesystem
   * @return filtered URI of the underlying fileSystem
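   * <p>For illustration (the host name is hypothetical): the raw URI
   * har://hdfs-namenode:8020/user/alice/data.har decodes to the underlying
   * URI hdfs://namenode:8020/user/alice/data.har.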
211 */
212 private URI decodeHarURI(URI rawURI, Configuration conf) throws IOException {
213 String tmpAuth = rawURI.getAuthority();
    // we are using the default filesystem
    // in the config, so create an underlying
    // uri and return it
218 if (tmpAuth == null) {
219 //create a path
220 return FileSystem.getDefaultUri(conf);
221 }
    // the authority is known to be non-null here (checked above)
    String authority = tmpAuth;
228
229 int i = authority.indexOf('-');
230 if (i < 0) {
231 throw new IOException("URI: " + rawURI
232 + " is an invalid Har URI since '-' not found."
233 + " Expecting har://<scheme>-<host>/<path>.");
234 }
235
236 if (rawURI.getQuery() != null) {
237 // query component not allowed
238 throw new IOException("query component in Path not supported " + rawURI);
239 }
240
241 URI tmp;
242 try {
243 // convert <scheme>-<host> to <scheme>://<host>
244 URI baseUri = new URI(authority.replaceFirst("-", "://"));
245
246 tmp = new URI(baseUri.getScheme(), baseUri.getAuthority(),
247 rawURI.getPath(), rawURI.getQuery(), rawURI.getFragment());
248 } catch (URISyntaxException e) {
249 throw new IOException("URI: " + rawURI
250 + " is an invalid Har URI. Expecting har://<scheme>-<host>/<path>.");
251 }
252 return tmp;
253 }
254
255 private static String decodeString(String str)
256 throws UnsupportedEncodingException {
257 return URLDecoder.decode(str, "UTF-8");
258 }
259
260 private String decodeFileName(String fname)
261 throws UnsupportedEncodingException {
262 int version = metadata.getVersion();
263 if (version == 2 || version == 3){
264 return decodeString(fname);
265 }
266 return fname;
267 }
268
269 /**
270 * return the top level archive.
271 */
272 @Override
273 public Path getWorkingDirectory() {
274 return new Path(uri.toString());
275 }
276
277 @Override
278 public Path getInitialWorkingDirectory() {
279 return getWorkingDirectory();
280 }
281
282 @Override
283 public FsStatus getStatus(Path p) throws IOException {
284 return fs.getStatus(p);
285 }
286
  /**
   * Create a har specific auth of the form
   * underlyingfsscheme-host:port.
   * @param underLyingUri the URI of the underlying
   * filesystem
   * @return har specific auth
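   * <p>For example (hypothetical host), an underlying URI of
   * hdfs://namenode:8020 yields the auth "hdfs-namenode:8020"; if the
   * underlying URI has no host, the auth ends with ":".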
293 */
294 private String getHarAuth(URI underLyingUri) {
295 String auth = underLyingUri.getScheme() + "-";
296 if (underLyingUri.getHost() != null) {
297 if (underLyingUri.getUserInfo() != null) {
298 auth += underLyingUri.getUserInfo();
299 auth += "@";
300 }
301 auth += underLyingUri.getHost();
302 if (underLyingUri.getPort() != -1) {
303 auth += ":";
304 auth += underLyingUri.getPort();
305 }
306 }
307 else {
308 auth += ":";
309 }
310 return auth;
311 }
312
313 /**
314 * Used for delegation token related functionality. Must delegate to
315 * underlying file system.
316 */
317 @Override
318 protected URI getCanonicalUri() {
319 return fs.getCanonicalUri();
320 }
321
322 @Override
323 protected URI canonicalizeUri(URI uri) {
324 return fs.canonicalizeUri(uri);
325 }
326
327 /**
328 * Returns the uri of this filesystem.
329 * The uri is of the form
   * har://underlyingfsscheme-host:port/pathintheunderlyingfs
331 */
332 @Override
333 public URI getUri() {
334 return this.uri;
335 }
336
337 @Override
338 protected void checkPath(Path path) {
339 fs.checkPath(path);
340 }
341
342 @Override
343 public Path resolvePath(Path p) throws IOException {
344 return fs.resolvePath(p);
345 }
346
  /**
   * Returns the path inside the har filesystem, i.e. the
   * path relative to the archive root.
   * @param path the fully qualified path in the har filesystem.
   * @return relative path in the har filesystem, or null if the
   *         path is not under the archive.
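   * <p>For example (hypothetical archive at /user/alice/data.har), the
   * har path /user/alice/data.har/dir/file maps to /dir/file, and the
   * archive root itself maps to /.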
354 */
355 private Path getPathInHar(Path path) {
356 Path harPath = new Path(path.toUri().getPath());
357 if (archivePath.compareTo(harPath) == 0)
358 return new Path(Path.SEPARATOR);
359 Path tmp = new Path(harPath.getName());
360 Path parent = harPath.getParent();
361 while (!(parent.compareTo(archivePath) == 0)) {
362 if (parent.toString().equals(Path.SEPARATOR)) {
363 tmp = null;
364 break;
365 }
366 tmp = new Path(parent.getName(), tmp);
367 parent = parent.getParent();
368 }
369 if (tmp != null)
370 tmp = new Path(Path.SEPARATOR, tmp);
371 return tmp;
372 }
373
  // Qualify the relative path of p against the initial path, basically
  // getting rid of the leading /. Parsing and doing string manipulation
  // is not good, so just use the Path API to do it.
378 private Path makeRelative(String initial, Path p) {
379 String scheme = this.uri.getScheme();
380 String authority = this.uri.getAuthority();
381 Path root = new Path(Path.SEPARATOR);
382 if (root.compareTo(p) == 0)
383 return new Path(scheme, authority, initial);
384 Path retPath = new Path(p.getName());
385 Path parent = p.getParent();
386 for (int i=0; i < p.depth()-1; i++) {
387 retPath = new Path(parent.getName(), retPath);
388 parent = parent.getParent();
389 }
390 return new Path(new Path(scheme, authority, initial),
391 retPath.toString());
392 }
393
394 /* this makes a path qualified in the har filesystem
395 * (non-Javadoc)
396 * @see org.apache.hadoop.fs.FilterFileSystem#makeQualified(
397 * org.apache.hadoop.fs.Path)
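   * For example (hypothetical archive), with harAuth "hdfs-namenode:8020"
   * and archivePath /user/alice/data.har, the relative path "dir/file"
   * qualifies to har://hdfs-namenode:8020/user/alice/data.har/dir/file.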
398 */
399 @Override
400 public Path makeQualified(Path path) {
401 // make sure that we just get the
402 // path component
403 Path fsPath = path;
404 if (!path.isAbsolute()) {
405 fsPath = new Path(archivePath, path);
406 }
407
408 URI tmpURI = fsPath.toUri();
409 //change this to Har uri
410 return new Path(uri.getScheme(), harAuth, tmpURI.getPath());
411 }
412
413 /**
414 * Fix offset and length of block locations.
415 * Note that this method modifies the original array.
416 * @param locations block locations of har part file
417 * @param start the start of the desired range in the contained file
418 * @param len the length of the desired range
419 * @param fileOffsetInHar the offset of the desired file in the har part file
420 * @return block locations with fixed offset and length
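   * <p>A worked example (illustrative numbers): suppose the contained file
   * starts at offset 100 in the part file (fileOffsetInHar = 100), the part
   * file has a single block at offset 0 with length 512, and the caller asks
   * for start = 0 and len = 300. The block start relative to the contained
   * file is -100, so the offset is fixed to 0 and the length trimmed to 412;
   * since that still extends past end = 300, the length is trimmed again
   * to 300.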
421 */
422 static BlockLocation[] fixBlockLocations(BlockLocation[] locations,
423 long start,
424 long len,
425 long fileOffsetInHar) {
426 // offset 1 past last byte of desired range
427 long end = start + len;
428
429 for (BlockLocation location : locations) {
430 // offset of part block relative to beginning of desired file
431 // (may be negative if file starts in this part block)
432 long harBlockStart = location.getOffset() - fileOffsetInHar;
433 // offset 1 past last byte of har block relative to beginning of
434 // desired file
435 long harBlockEnd = harBlockStart + location.getLength();
436
437 if (start > harBlockStart) {
438 // desired range starts after beginning of this har block
439 // fix offset to beginning of relevant range (relative to desired file)
440 location.setOffset(start);
441 // fix length to relevant portion of har block
442 location.setLength(location.getLength() - (start - harBlockStart));
443 } else {
444 // desired range includes beginning of this har block
445 location.setOffset(harBlockStart);
446 }
447
448 if (harBlockEnd > end) {
449 // range ends before end of this har block
450 // fix length to remove irrelevant portion at the end
451 location.setLength(location.getLength() - (harBlockEnd - end));
452 }
453 }
454
455 return locations;
456 }
457
458 /**
459 * Get block locations from the underlying fs and fix their
460 * offsets and lengths.
461 * @param file the input file status to get block locations
462 * @param start the start of the desired range in the contained file
463 * @param len the length of the desired range
464 * @return block locations for this segment of file
465 * @throws IOException
466 */
467 @Override
468 public BlockLocation[] getFileBlockLocations(FileStatus file, long start,
469 long len) throws IOException {
470 HarStatus hstatus = getFileHarStatus(file.getPath());
471 Path partPath = new Path(archivePath, hstatus.getPartName());
472 FileStatus partStatus = metadata.getPartFileStatus(partPath);
473
474 // get all part blocks that overlap with the desired file blocks
475 BlockLocation[] locations =
476 fs.getFileBlockLocations(partStatus,
477 hstatus.getStartIndex() + start, len);
478
479 return fixBlockLocations(locations, start, len, hstatus.getStartIndex());
480 }
481
482 /**
483 * the hash of the path p inside the filesystem
484 * @param p the path in the harfilesystem
485 * @return the hash code of the path.
486 */
487 public static int getHarHash(Path p) {
488 return (p.toString().hashCode() & 0x7fffffff);
489 }
490
491 static class Store {
492 public Store() {
493 begin = end = startHash = endHash = 0;
494 }
495 public Store(long begin, long end, int startHash, int endHash) {
496 this.begin = begin;
497 this.end = end;
498 this.startHash = startHash;
499 this.endHash = endHash;
500 }
501 public long begin;
502 public long end;
503 public int startHash;
504 public int endHash;
505 }
506
507 /**
   * Get filestatuses of all the children of a given directory. This scans
   * every entry of the parsed archive index and picks out the immediate
   * children of the directory. It is a brute force way of getting all such
   * filestatuses.
511 *
512 * @param parent
513 * the parent path directory
514 * @param statuses
515 * the list to add the children filestatuses to
516 */
517 private void fileStatusesInIndex(HarStatus parent, List<FileStatus> statuses)
518 throws IOException {
519 String parentString = parent.getName();
520 if (!parentString.endsWith(Path.SEPARATOR)){
521 parentString += Path.SEPARATOR;
522 }
523 Path harPath = new Path(parentString);
524 int harlen = harPath.depth();
525 final Map<String, FileStatus> cache = new TreeMap<String, FileStatus>();
526
527 for (HarStatus hstatus : metadata.archive.values()) {
528 String child = hstatus.getName();
529 if ((child.startsWith(parentString))) {
530 Path thisPath = new Path(child);
531 if (thisPath.depth() == harlen + 1) {
532 statuses.add(toFileStatus(hstatus, cache));
533 }
534 }
535 }
536 }
537
538 /**
539 * Combine the status stored in the index and the underlying status.
540 * @param h status stored in the index
541 * @param cache caching the underlying file statuses
542 * @return the combined file status
543 * @throws IOException
544 */
545 private FileStatus toFileStatus(HarStatus h,
546 Map<String, FileStatus> cache) throws IOException {
547 FileStatus underlying = null;
548 if (cache != null) {
549 underlying = cache.get(h.partName);
550 }
551 if (underlying == null) {
552 final Path p = h.isDir? archivePath: new Path(archivePath, h.partName);
553 underlying = fs.getFileStatus(p);
554 if (cache != null) {
555 cache.put(h.partName, underlying);
556 }
557 }
558
559 long modTime = 0;
560 int version = metadata.getVersion();
561 if (version < 3) {
562 modTime = underlying.getModificationTime();
563 } else if (version == 3) {
564 modTime = h.getModificationTime();
565 }
566
567 return new FileStatus(
568 h.isDir()? 0L: h.getLength(),
569 h.isDir(),
570 underlying.getReplication(),
571 underlying.getBlockSize(),
572 modTime,
573 underlying.getAccessTime(),
574 underlying.getPermission(),
575 underlying.getOwner(),
576 underlying.getGroup(),
577 makeRelative(this.uri.getPath(), new Path(h.name)));
578 }
579
  // A single-line parser for a hadoop archive status
  // stored as one line in the index files.
  // The format is
  // filename "dir"/"file" partFileName startIndex length
  // <space separated children>
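  // For illustration only (hypothetical, version 1/2 style entries without
  // the version 3 properties field):
  //   /dir dir none 0 0 file1 file2
  //   /dir/file1 file part-0 0 1024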
585 private class HarStatus {
586 boolean isDir;
587 String name;
588 List<String> children;
589 String partName;
590 long startIndex;
591 long length;
592 long modificationTime = 0;
593
594 public HarStatus(String harString) throws UnsupportedEncodingException {
595 String[] splits = harString.split(" ");
596 this.name = decodeFileName(splits[0]);
      this.isDir = "dir".equals(splits[1]);
      // this is equal to "none" if it's a directory
599 this.partName = splits[2];
600 this.startIndex = Long.parseLong(splits[3]);
601 this.length = Long.parseLong(splits[4]);
602
603 int version = metadata.getVersion();
604 String[] propSplits = null;
605 // propSplits is used to retrieve the metainformation that Har versions
606 // 1 & 2 missed (modification time, permission, owner group).
607 // These fields are stored in an encoded string placed in different
608 // locations depending on whether it's a file or directory entry.
609 // If it's a directory, the string will be placed at the partName
610 // location (directories have no partName because they don't have data
611 // to be stored). This is done because the number of fields in a
612 // directory entry is unbounded (all children are listed at the end)
613 // If it's a file, the string will be the last field.
614 if (isDir) {
615 if (version == 3){
616 propSplits = decodeString(this.partName).split(" ");
617 }
618 children = new ArrayList<String>();
619 for (int i = 5; i < splits.length; i++) {
620 children.add(decodeFileName(splits[i]));
621 }
622 } else if (version == 3) {
623 propSplits = decodeString(splits[5]).split(" ");
624 }
625
626 if (propSplits != null && propSplits.length >= 4) {
627 modificationTime = Long.parseLong(propSplits[0]);
628 // the fields below are stored in the file but are currently not used
629 // by HarFileSystem
630 // permission = new FsPermission(Short.parseShort(propSplits[1]));
631 // owner = decodeString(propSplits[2]);
632 // group = decodeString(propSplits[3]);
633 }
634 }
635 public boolean isDir() {
636 return isDir;
637 }
638
639 public String getName() {
640 return name;
641 }
642 public String getPartName() {
643 return partName;
644 }
645 public long getStartIndex() {
646 return startIndex;
647 }
648 public long getLength() {
649 return length;
650 }
651 public long getModificationTime() {
652 return modificationTime;
653 }
654 }
655
656 /**
   * Return the filestatus of files in the har archive.
   * The permissions returned are those of the archive
   * index files. The permissions are not persisted
660 * while creating a hadoop archive.
661 * @param f the path in har filesystem
662 * @return filestatus.
663 * @throws IOException
664 */
665 @Override
666 public FileStatus getFileStatus(Path f) throws IOException {
667 HarStatus hstatus = getFileHarStatus(f);
668 return toFileStatus(hstatus, null);
669 }
670
671 private HarStatus getFileHarStatus(Path f) throws IOException {
672 // get the fs DataInputStream for the underlying file
673 // look up the index.
674 Path p = makeQualified(f);
675 Path harPath = getPathInHar(p);
676 if (harPath == null) {
677 throw new IOException("Invalid file name: " + f + " in " + uri);
678 }
679 HarStatus hstatus = metadata.archive.get(harPath);
680 if (hstatus == null) {
681 throw new FileNotFoundException("File: " + f + " does not exist in " + uri);
682 }
683 return hstatus;
684 }
685
686 /**
687 * @return null since no checksum algorithm is implemented.
688 */
689 @Override
690 public FileChecksum getFileChecksum(Path f, long length) {
691 return null;
692 }
693
694 /**
695 * Returns a har input stream which fakes end of
696 * file. It reads the index files to get the part
697 * file name and the size and start of the file.
698 */
699 @Override
700 public FSDataInputStream open(Path f, int bufferSize) throws IOException {
701 // get the fs DataInputStream for the underlying file
702 HarStatus hstatus = getFileHarStatus(f);
703 if (hstatus.isDir()) {
704 throw new FileNotFoundException(f + " : not a file in " +
705 archivePath);
706 }
707 return new HarFSDataInputStream(fs, new Path(archivePath,
708 hstatus.getPartName()),
709 hstatus.getStartIndex(), hstatus.getLength(), bufferSize);
710 }
711
712 /**
713 * Used for delegation token related functionality. Must delegate to
714 * underlying file system.
715 */
716 @Override
717 public FileSystem[] getChildFileSystems() {
718 return new FileSystem[]{fs};
719 }
720
721 @Override
722 public FSDataOutputStream create(Path f, FsPermission permission,
723 boolean overwrite, int bufferSize, short replication, long blockSize,
724 Progressable progress) throws IOException {
725 throw new IOException("Har: create not allowed.");
726 }
727
728 @SuppressWarnings("deprecation")
729 @Override
730 public FSDataOutputStream createNonRecursive(Path f, boolean overwrite,
731 int bufferSize, short replication, long blockSize, Progressable progress)
732 throws IOException {
733 throw new IOException("Har: create not allowed.");
734 }
735
736 @Override
737 public FSDataOutputStream append(Path f, int bufferSize, Progressable progress) throws IOException {
738 throw new IOException("Har: append not allowed.");
739 }
740
741 @Override
742 public void close() throws IOException {
743 super.close();
744 if (fs != null) {
745 try {
746 fs.close();
747 } catch(IOException ie) {
748 //this might already be closed
749 // ignore
750 }
751 }
752 }
753
754 /**
755 * Not implemented.
756 */
757 @Override
758 public boolean setReplication(Path src, short replication) throws IOException{
759 throw new IOException("Har: setReplication not allowed");
760 }
761
762 @Override
763 public boolean rename(Path src, Path dst) throws IOException {
764 throw new IOException("Har: rename not allowed");
765 }
766
767 @Override
768 public FSDataOutputStream append(Path f) throws IOException {
769 throw new IOException("Har: append not allowed");
770 }
771
772 /**
773 * Not implemented.
774 */
775 @Override
776 public boolean delete(Path f, boolean recursive) throws IOException {
777 throw new IOException("Har: delete not allowed");
778 }
779
780 /**
781 * liststatus returns the children of a directory
782 * after looking up the index files.
783 */
784 @Override
785 public FileStatus[] listStatus(Path f) throws IOException {
    // need to see if the file is in the index file
787 //get the filestatus of the archive directory
788 // we will create fake filestatuses to return
789 // to the client
790 List<FileStatus> statuses = new ArrayList<FileStatus>();
791 Path tmpPath = makeQualified(f);
792 Path harPath = getPathInHar(tmpPath);
793 HarStatus hstatus = metadata.archive.get(harPath);
794 if (hstatus == null) {
795 throw new FileNotFoundException("File " + f + " not found in " + archivePath);
796 }
797 if (hstatus.isDir()) {
798 fileStatusesInIndex(hstatus, statuses);
799 } else {
800 statuses.add(toFileStatus(hstatus, null));
801 }
802
803 return statuses.toArray(new FileStatus[statuses.size()]);
804 }
805
806 /**
807 * return the top level archive path.
808 */
809 @Override
810 public Path getHomeDirectory() {
811 return new Path(uri.toString());
812 }
813
814 @Override
815 public void setWorkingDirectory(Path newDir) {
816 //does nothing.
817 }
818
819 /**
820 * not implemented.
821 */
822 @Override
823 public boolean mkdirs(Path f, FsPermission permission) throws IOException {
824 throw new IOException("Har: mkdirs not allowed");
825 }
826
827 /**
828 * not implemented.
829 */
830 @Override
831 public void copyFromLocalFile(boolean delSrc, boolean overwrite,
832 Path src, Path dst) throws IOException {
833 throw new IOException("Har: copyfromlocalfile not allowed");
834 }
835
836 @Override
837 public void copyFromLocalFile(boolean delSrc, boolean overwrite,
838 Path[] srcs, Path dst) throws IOException {
839 throw new IOException("Har: copyfromlocalfile not allowed");
840 }
841
842 /**
843 * copies the file in the har filesystem to a local file.
844 */
845 @Override
846 public void copyToLocalFile(boolean delSrc, Path src, Path dst)
847 throws IOException {
848 FileUtil.copy(this, src, getLocal(getConf()), dst, false, getConf());
849 }
850
851 /**
852 * not implemented.
853 */
854 @Override
855 public Path startLocalOutput(Path fsOutputFile, Path tmpLocalFile)
856 throws IOException {
857 throw new IOException("Har: startLocalOutput not allowed");
858 }
859
860 /**
861 * not implemented.
862 */
863 @Override
864 public void completeLocalOutput(Path fsOutputFile, Path tmpLocalFile)
865 throws IOException {
866 throw new IOException("Har: completeLocalOutput not allowed");
867 }
868
869 /**
870 * not implemented.
871 */
872 @Override
873 public void setOwner(Path p, String username, String groupname)
874 throws IOException {
875 throw new IOException("Har: setowner not allowed");
876 }
877
878 @Override
879 public void setTimes(Path p, long mtime, long atime) throws IOException {
880 throw new IOException("Har: setTimes not allowed");
881 }
882
883 /**
884 * Not implemented.
885 */
886 @Override
887 public void setPermission(Path p, FsPermission permission)
888 throws IOException {
889 throw new IOException("Har: setPermission not allowed");
890 }
891
892 /**
893 * Hadoop archives input stream. This input stream fakes EOF
894 * since archive files are part of bigger part files.
895 */
896 private static class HarFSDataInputStream extends FSDataInputStream {
897 /**
898 * Create an input stream that fakes all the reads/positions/seeking.
899 */
900 private static class HarFsInputStream extends FSInputStream
901 implements CanSetDropBehind, CanSetReadahead {
902 private long position, start, end;
903 //The underlying data input stream that the
904 // underlying filesystem will return.
905 private final FSDataInputStream underLyingStream;
906 //one byte buffer
907 private final byte[] oneBytebuff = new byte[1];
908
909 HarFsInputStream(FileSystem fs, Path path, long start,
910 long length, int bufferSize) throws IOException {
911 if (length < 0) {
912 throw new IllegalArgumentException("Negative length ["+length+"]");
913 }
914 underLyingStream = fs.open(path, bufferSize);
915 underLyingStream.seek(start);
916 // the start of this file in the part file
917 this.start = start;
918 // the position pointer in the part file
919 this.position = start;
920 // the end pointer in the part file
921 this.end = start + length;
922 }
923
924 @Override
925 public synchronized int available() throws IOException {
926 long remaining = end - underLyingStream.getPos();
927 if (remaining > Integer.MAX_VALUE) {
928 return Integer.MAX_VALUE;
929 }
930 return (int) remaining;
931 }
932
933 @Override
934 public synchronized void close() throws IOException {
935 underLyingStream.close();
936 super.close();
937 }
938
939 //not implemented
940 @Override
941 public void mark(int readLimit) {
942 // do nothing
943 }
944
945 /**
946 * reset is not implemented
947 */
948 @Override
949 public void reset() throws IOException {
950 throw new IOException("reset not implemented.");
951 }
952
953 @Override
954 public synchronized int read() throws IOException {
955 int ret = read(oneBytebuff, 0, 1);
956 return (ret <= 0) ? -1: (oneBytebuff[0] & 0xff);
957 }
958
      // NB: currently this method is actually never executed because
      // java.io.DataInputStream.read(byte[]) directly delegates to
      // the method java.io.InputStream.read(byte[], int, int).
      // However, it can potentially be invoked, so leave it intact for now.
963 @Override
964 public synchronized int read(byte[] b) throws IOException {
965 final int ret = read(b, 0, b.length);
966 return ret;
967 }
968
969 /**
970 *
971 */
972 @Override
973 public synchronized int read(byte[] b, int offset, int len)
974 throws IOException {
975 int newlen = len;
976 int ret = -1;
977 if (position + len > end) {
978 newlen = (int) (end - position);
979 }
980 // end case
981 if (newlen == 0)
982 return ret;
        ret = underLyingStream.read(b, offset, newlen);
        // guard against a -1 (EOF) return corrupting the position pointer
        if (ret > 0) {
          position += ret;
        }
        return ret;
986 }
987
988 @Override
989 public synchronized long skip(long n) throws IOException {
990 long tmpN = n;
991 if (tmpN > 0) {
992 final long actualRemaining = end - position;
993 if (tmpN > actualRemaining) {
994 tmpN = actualRemaining;
995 }
996 underLyingStream.seek(tmpN + position);
997 position += tmpN;
998 return tmpN;
999 }
1000 // NB: the contract is described in java.io.InputStream.skip(long):
1001 // this method returns the number of bytes actually skipped, so,
1002 // the return value should never be negative.
1003 return 0;
1004 }
1005
1006 @Override
1007 public synchronized long getPos() throws IOException {
1008 return (position - start);
1009 }
1010
1011 @Override
1012 public synchronized void seek(final long pos) throws IOException {
1013 validatePosition(pos);
1014 position = start + pos;
1015 underLyingStream.seek(position);
1016 }
1017
1018 private void validatePosition(final long pos) throws IOException {
1019 if (pos < 0) {
1020 throw new IOException("Negative position: "+pos);
1021 }
1022 final long length = end - start;
1023 if (pos > length) {
1024 throw new IOException("Position behind the end " +
1025 "of the stream (length = "+length+"): " + pos);
1026 }
1027 }
1028
1029 @Override
1030 public boolean seekToNewSource(long targetPos) throws IOException {
1031 // do not need to implement this
1032 // hdfs in itself does seektonewsource
1033 // while reading.
1034 return false;
1035 }
1036
1037 /**
1038 * implementing position readable.
1039 */
1040 @Override
1041 public int read(long pos, byte[] b, int offset, int length)
1042 throws IOException {
1043 int nlength = length;
1044 if (start + nlength + pos > end) {
1045 // length corrected to the real remaining length:
1046 nlength = (int) (end - start - pos);
1047 }
1048 if (nlength <= 0) {
1049 // EOS:
1050 return -1;
1051 }
1052 return underLyingStream.read(pos + start , b, offset, nlength);
1053 }
1054
1055 /**
1056 * position readable again.
1057 */
1058 @Override
1059 public void readFully(long pos, byte[] b, int offset, int length)
1060 throws IOException {
1061 if (start + length + pos > end) {
1062 throw new IOException("Not enough bytes to read.");
1063 }
1064 underLyingStream.readFully(pos + start, b, offset, length);
1065 }
1066
1067 @Override
1068 public void readFully(long pos, byte[] b) throws IOException {
1069 readFully(pos, b, 0, b.length);
1070 }
1071
1072 @Override
1073 public void setReadahead(Long readahead) throws IOException {
1074 underLyingStream.setReadahead(readahead);
1075 }
1076
1077 @Override
1078 public void setDropBehind(Boolean dropBehind) throws IOException {
1079 underLyingStream.setDropBehind(dropBehind);
1080 }
1081 }
1082
1083 /**
     * Constructor for har input stream.
1085 * @param fs the underlying filesystem
1086 * @param p The path in the underlying filesystem
1087 * @param start the start position in the part file
1088 * @param length the length of valid data in the part file
1089 * @param bufsize the buffer size
1090 * @throws IOException
1091 */
1092 public HarFSDataInputStream(FileSystem fs, Path p, long start,
1093 long length, int bufsize) throws IOException {
1094 super(new HarFsInputStream(fs, p, start, length, bufsize));
1095 }
1096 }
1097
1098 private class HarMetaData {
1099 private FileSystem fs;
1100 private int version;
1101 // the masterIndex of the archive
1102 private Path masterIndexPath;
1103 // the index file
1104 private Path archiveIndexPath;
1105
1106 private long masterIndexTimestamp;
1107 private long archiveIndexTimestamp;
1108
1109 List<Store> stores = new ArrayList<Store>();
1110 Map<Path, HarStatus> archive = new HashMap<Path, HarStatus>();
1111 private Map<Path, FileStatus> partFileStatuses = new HashMap<Path, FileStatus>();
1112
1113 public HarMetaData(FileSystem fs, Path masterIndexPath, Path archiveIndexPath) {
1114 this.fs = fs;
1115 this.masterIndexPath = masterIndexPath;
1116 this.archiveIndexPath = archiveIndexPath;
1117 }
1118
1119 public FileStatus getPartFileStatus(Path partPath) throws IOException {
1120 FileStatus status;
1121 status = partFileStatuses.get(partPath);
1122 if (status == null) {
1123 status = fs.getFileStatus(partPath);
1124 partFileStatuses.put(partPath, status);
1125 }
1126 return status;
1127 }
1128
1129 public long getMasterIndexTimestamp() {
1130 return masterIndexTimestamp;
1131 }
1132
1133 public long getArchiveIndexTimestamp() {
1134 return archiveIndexTimestamp;
1135 }
1136
1137 private int getVersion() {
1138 return version;
1139 }
1140
1141 private void parseMetaData() throws IOException {
1142 Text line = new Text();
1143 long read;
1144 FSDataInputStream in = null;
1145 LineReader lin = null;
1146
1147 try {
1148 in = fs.open(masterIndexPath);
1149 FileStatus masterStat = fs.getFileStatus(masterIndexPath);
1150 masterIndexTimestamp = masterStat.getModificationTime();
1151 lin = new LineReader(in, getConf());
1152 read = lin.readLine(line);
1153
1154 // the first line contains the version of the index file
1155 String versionLine = line.toString();
1156 String[] arr = versionLine.split(" ");
1157 version = Integer.parseInt(arr[0]);
1158 // make it always backwards-compatible
1159 if (this.version > HarFileSystem.VERSION) {
1160 throw new IOException("Invalid version " +
1161 this.version + " expected " + HarFileSystem.VERSION);
1162 }
1163
        // each subsequent line contains a hash code range and the
        // begin/end offsets of that range within the _index file
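        // e.g. (illustrative values) a line "0 2147483647 0 232" would map
        // hashes 0..2147483647 to bytes [0, 232) of the _index file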
1165 String[] readStr;
1166 while(read < masterStat.getLen()) {
1167 int b = lin.readLine(line);
1168 read += b;
1169 readStr = line.toString().split(" ");
1170 int startHash = Integer.parseInt(readStr[0]);
1171 int endHash = Integer.parseInt(readStr[1]);
1172 stores.add(new Store(Long.parseLong(readStr[2]),
1173 Long.parseLong(readStr[3]), startHash,
1174 endHash));
1175 line.clear();
1176 }
1177 } catch (IOException ioe) {
1178 LOG.warn("Encountered exception ", ioe);
1179 throw ioe;
1180 } finally {
1181 IOUtils.cleanup(LOG, lin, in);
1182 }
1183
1184 FSDataInputStream aIn = fs.open(archiveIndexPath);
1185 try {
1186 FileStatus archiveStat = fs.getFileStatus(archiveIndexPath);
1187 archiveIndexTimestamp = archiveStat.getModificationTime();
1188 LineReader aLin;
1189
1190 // now start reading the real index file
1191 for (Store s: stores) {
1192 read = 0;
1193 aIn.seek(s.begin);
1194 aLin = new LineReader(aIn, getConf());
1195 while (read + s.begin < s.end) {
1196 int tmp = aLin.readLine(line);
1197 read += tmp;
1198 String lineFeed = line.toString();
1199 String[] parsed = lineFeed.split(" ");
1200 parsed[0] = decodeFileName(parsed[0]);
1201 archive.put(new Path(parsed[0]), new HarStatus(lineFeed));
1202 line.clear();
1203 }
1204 }
1205 } finally {
1206 IOUtils.cleanup(LOG, aIn);
1207 }
1208 }
1209 }
1210
1211 /*
1212 * testing purposes only:
1213 */
1214 HarMetaData getMetadata() {
1215 return metadata;
1216 }
1217
1218 private static class LruCache<K, V> extends LinkedHashMap<K, V> {
1219 private final int MAX_ENTRIES;
1220
1221 public LruCache(int maxEntries) {
1222 super(maxEntries + 1, 1.0f, true);
1223 MAX_ENTRIES = maxEntries;
1224 }
1225
1226 @Override
1227 protected boolean removeEldestEntry(Map.Entry<K, V> eldest) {
1228 return size() > MAX_ENTRIES;
1229 }
1230 }
1231
1232 @SuppressWarnings("deprecation")
1233 @Override
1234 public FsServerDefaults getServerDefaults() throws IOException {
1235 return fs.getServerDefaults();
1236 }
1237
1238 @Override
1239 public FsServerDefaults getServerDefaults(Path f) throws IOException {
1240 return fs.getServerDefaults(f);
1241 }
1242
1243 @Override
1244 public long getUsed() throws IOException{
1245 return fs.getUsed();
1246 }
1247
1248 @SuppressWarnings("deprecation")
1249 @Override
1250 public long getDefaultBlockSize() {
1251 return fs.getDefaultBlockSize();
1252 }
1253
1254 @SuppressWarnings("deprecation")
1255 @Override
1256 public long getDefaultBlockSize(Path f) {
1257 return fs.getDefaultBlockSize(f);
1258 }
1259
1260 @SuppressWarnings("deprecation")
1261 @Override
1262 public short getDefaultReplication() {
1263 return fs.getDefaultReplication();
1264 }
1265
1266 @Override
1267 public short getDefaultReplication(Path f) {
1268 return fs.getDefaultReplication(f);
1269 }
1270 }