001 /**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements. See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership. The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License. You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018 package org.apache.hadoop.fs;
019
020 import org.apache.commons.logging.Log;
021 import org.apache.commons.logging.LogFactory;
022 import org.apache.hadoop.conf.Configuration;
023 import org.apache.hadoop.fs.permission.FsPermission;
024 import org.apache.hadoop.io.IOUtils;
025 import org.apache.hadoop.io.Text;
026 import org.apache.hadoop.util.LineReader;
027 import org.apache.hadoop.util.Progressable;
028
029 import java.io.FileNotFoundException;
030 import java.io.IOException;
031 import java.io.UnsupportedEncodingException;
032 import java.net.URI;
033 import java.net.URISyntaxException;
034 import java.net.URLDecoder;
035 import java.util.*;
036
/**
 * An implementation of the Hadoop Archive (HAR) FileSystem. An archive
 * has index files of the form _index* and data files of the form
 * part-*. The index files store the locations of the archived files:
 * the _index file is sorted by the hash code of the paths it contains,
 * and the _masterindex is a level of indirection into the index file,
 * mapping ranges of hash codes to positions in _index so that lookups
 * are faster.
 */
049
050 public class HarFileSystem extends FileSystem {
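  /*
   * For illustration only (host and paths are made up): an archive created
   * at /user/alice/foo.har in the underlying filesystem contains
   *
   *   /user/alice/foo.har/_masterindex   version line + hash-range index
   *   /user/alice/foo.har/_index         one line per archived file/dir
   *   /user/alice/foo.har/part-0         concatenated file contents
   *
   * and is typically read through a har:// URI, e.g.
   *
   *   Configuration conf = new Configuration();
   *   Path p = new Path("har://hdfs-namenode:8020/user/alice/foo.har/f");
   *   FileSystem harFs = p.getFileSystem(conf);  // a HarFileSystem
   *   FSDataInputStream in = harFs.open(p);      // served from a part file
   *   // ... read ...
   *   in.close();
   */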
051
052 private static final Log LOG = LogFactory.getLog(HarFileSystem.class);
053
054 public static final String METADATA_CACHE_ENTRIES_KEY = "fs.har.metadatacache.entries";
055 public static final int METADATA_CACHE_ENTRIES_DEFAULT = 10;
056
057 public static final int VERSION = 3;
058
059 private static Map<URI, HarMetaData> harMetaCache;
060
061 // uri representation of this Har filesystem
062 private URI uri;
063 // the top level path of the archive
064 // in the underlying file system
065 private Path archivePath;
066 // the har auth
067 private String harAuth;
068
069 // pointer into the static metadata cache
070 private HarMetaData metadata;
071
072 private FileSystem fs;
073
  /**
   * Public no-argument constructor.
   */
  public HarFileSystem() {
    // initialize() must be called to set the underlying file system
  }
080
081 /**
082 * Return the protocol scheme for the FileSystem.
083 * <p/>
084 *
085 * @return <code>har</code>
086 */
087 @Override
088 public String getScheme() {
089 return "har";
090 }
091
092 /**
093 * Constructor to create a HarFileSystem with an
094 * underlying filesystem.
095 * @param fs underlying file system
096 */
097 public HarFileSystem(FileSystem fs) {
098 this.fs = fs;
099 this.statistics = fs.statistics;
100 }
101
  // Synchronized on the class, since this guards the static metadata cache.
  private static synchronized void initializeMetadataCache(Configuration conf) {
    if (harMetaCache == null) {
      int cacheSize = conf.getInt(METADATA_CACHE_ENTRIES_KEY,
          METADATA_CACHE_ENTRIES_DEFAULT);
      harMetaCache = Collections.synchronizedMap(
          new LruCache<URI, HarMetaData>(cacheSize));
    }
  }
108
  /**
   * Initialize a Har filesystem per har archive. The
   * archive home directory is the top level directory
   * in the filesystem that contains the HAR archive.
   * Be careful with this method: do not keep creating new
   * FileSystem instances on every call to path.getFileSystem().
   * The URI of a Har filesystem is either
   * har://underlyingfsscheme-host:port/archivepath
   * or
   * har:///archivepath, in which case the default underlying
   * filesystem from the configuration is used.
   */
122 @Override
123 public void initialize(URI name, Configuration conf) throws IOException {
124 // initialize the metadata cache, if needed
125 initializeMetadataCache(conf);
126
127 // decode the name
128 URI underLyingURI = decodeHarURI(name, conf);
129 // we got the right har Path- now check if this is
130 // truly a har filesystem
131 Path harPath = archivePath(
132 new Path(name.getScheme(), name.getAuthority(), name.getPath()));
133 if (harPath == null) {
134 throw new IOException("Invalid path for the Har Filesystem. " +
135 name.toString());
136 }
137 if (fs == null) {
138 fs = FileSystem.get(underLyingURI, conf);
139 }
140 uri = harPath.toUri();
141 archivePath = new Path(uri.getPath());
142 harAuth = getHarAuth(underLyingURI);
143 //check for the underlying fs containing
144 // the index file
145 Path masterIndexPath = new Path(archivePath, "_masterindex");
146 Path archiveIndexPath = new Path(archivePath, "_index");
147 if (!fs.exists(masterIndexPath) || !fs.exists(archiveIndexPath)) {
148 throw new IOException("Invalid path for the Har Filesystem. " +
149 "No index file in " + harPath);
150 }
151
152 metadata = harMetaCache.get(uri);
153 if (metadata != null) {
154 FileStatus mStat = fs.getFileStatus(masterIndexPath);
155 FileStatus aStat = fs.getFileStatus(archiveIndexPath);
156 if (mStat.getModificationTime() != metadata.getMasterIndexTimestamp() ||
157 aStat.getModificationTime() != metadata.getArchiveIndexTimestamp()) {
158 // the archive has been overwritten since we last read it
159 // remove the entry from the meta data cache
160 metadata = null;
161 harMetaCache.remove(uri);
162 }
163 }
164 if (metadata == null) {
165 metadata = new HarMetaData(fs, masterIndexPath, archiveIndexPath);
166 metadata.parseMetaData();
167 harMetaCache.put(uri, metadata);
168 }
169 }
170
171 @Override
172 public Configuration getConf() {
173 return fs.getConf();
174 }
175
  // Get the version of the archive from the master index file; the version
  // determines how names and metadata are decoded (see decodeFileName and
  // HarStatus).
179 public int getHarVersion() throws IOException {
180 if (metadata != null) {
181 return metadata.getVersion();
182 }
183 else {
184 throw new IOException("Invalid meta data for the Har Filesystem");
185 }
186 }
187
  /*
   * Find the ancestor of p (possibly p itself) whose last path
   * segment ends with ".har"; that is the archive path. Returns
   * null if no such ancestor exists.
   */
194 private Path archivePath(Path p) {
195 Path retPath = null;
196 Path tmp = p;
197 for (int i=0; i< p.depth(); i++) {
198 if (tmp.toString().endsWith(".har")) {
199 retPath = tmp;
200 break;
201 }
202 tmp = tmp.getParent();
203 }
204 return retPath;
205 }
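  // Example (illustrative): for /user/alice/foo.har/dir/file the archive
  // path is /user/alice/foo.har; for a path containing no ".har" segment
  // the method returns null.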
206
  /**
   * Decode the raw URI to get the URI of the underlying filesystem.
   * @param rawURI raw Har URI
   * @param conf configuration used to resolve the default filesystem
   * @return URI of the underlying fileSystem
   */
  private URI decodeHarURI(URI rawURI, Configuration conf) throws IOException {
    String authority = rawURI.getAuthority();
    if (authority == null) {
      // no authority given (har:///archivepath), so the underlying
      // filesystem is the default filesystem from the configuration
      return FileSystem.getDefaultUri(conf);
    }
228
229 int i = authority.indexOf('-');
230 if (i < 0) {
231 throw new IOException("URI: " + rawURI
232 + " is an invalid Har URI since '-' not found."
233 + " Expecting har://<scheme>-<host>/<path>.");
234 }
235
236 if (rawURI.getQuery() != null) {
237 // query component not allowed
238 throw new IOException("query component in Path not supported " + rawURI);
239 }
240
241 URI tmp;
242 try {
243 // convert <scheme>-<host> to <scheme>://<host>
244 URI baseUri = new URI(authority.replaceFirst("-", "://"));
245
246 tmp = new URI(baseUri.getScheme(), baseUri.getAuthority(),
247 rawURI.getPath(), rawURI.getQuery(), rawURI.getFragment());
    } catch (URISyntaxException e) {
      throw new IOException("URI: " + rawURI
          + " is an invalid Har URI. Expecting har://<scheme>-<host>/<path>.", e);
    }
252 return tmp;
253 }
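  // Example (illustrative): har://hdfs-namenode:8020/user/alice/foo.har/f
  // decodes to the underlying URI hdfs://namenode:8020/user/alice/foo.har/f,
  // while har:///user/alice/foo.har/f resolves against the configured
  // default filesystem (FileSystem.getDefaultUri).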
254
255 private static String decodeString(String str)
256 throws UnsupportedEncodingException {
257 return URLDecoder.decode(str, "UTF-8");
258 }
259
260 private String decodeFileName(String fname)
261 throws UnsupportedEncodingException {
262 int version = metadata.getVersion();
263 if (version == 2 || version == 3){
264 return decodeString(fname);
265 }
266 return fname;
267 }
268
  /**
   * Return the top level archive path as the working directory.
   */
272 @Override
273 public Path getWorkingDirectory() {
274 return new Path(uri.toString());
275 }
276
  /**
   * Create the har-specific authority of the form
   * underlyingscheme-host:port
   * @param underLyingUri the URI of the underlying
   * filesystem
   * @return har specific auth
   */
284 private String getHarAuth(URI underLyingUri) {
285 String auth = underLyingUri.getScheme() + "-";
286 if (underLyingUri.getHost() != null) {
287 auth += underLyingUri.getHost();
288 if (underLyingUri.getPort() != -1) {
289 auth += ":";
290 auth += underLyingUri.getPort();
291 }
292 }
293 else {
294 auth += ":";
295 }
296 return auth;
297 }
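  // Example (illustrative): an underlying URI hdfs://namenode:8020 yields
  // the har authority "hdfs-namenode:8020"; a scheme without a host
  // (e.g. file:///) yields "file-:".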
298
299 @Override
300 protected URI getCanonicalUri() {
301 return fs.canonicalizeUri(getUri());
302 }
303
  /**
   * Returns the uri of this filesystem.
   * The uri is of the form
   * har://underlyingfsscheme-host:port/pathintheunderlyingfs
   */
309 @Override
310 public URI getUri() {
311 return this.uri;
312 }
313
  /**
   * Return the path inside the har filesystem, rooted at the
   * archive ("/" is the archive top level).
   * @param path the fully qualified path in the har filesystem.
   * @return the path relative to the archive root, or null if the
   *         path is not inside this archive.
   */
322 private Path getPathInHar(Path path) {
323 Path harPath = new Path(path.toUri().getPath());
324 if (archivePath.compareTo(harPath) == 0)
325 return new Path(Path.SEPARATOR);
326 Path tmp = new Path(harPath.getName());
327 Path parent = harPath.getParent();
328 while (!(parent.compareTo(archivePath) == 0)) {
329 if (parent.toString().equals(Path.SEPARATOR)) {
330 tmp = null;
331 break;
332 }
333 tmp = new Path(parent.getName(), tmp);
334 parent = parent.getParent();
335 }
336 if (tmp != null)
337 tmp = new Path(Path.SEPARATOR, tmp);
338 return tmp;
339 }
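  // Example (illustrative): with archivePath /user/alice/foo.har,
  // getPathInHar(/user/alice/foo.har/dir/file) returns /dir/file and
  // getPathInHar(/user/alice/foo.har) returns /.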
340
  // Rebuild the path p, which is relative to the archive root, as a
  // fully qualified har path under "initial" (the archive path).
  // The Path API is used instead of string manipulation.
345 private Path makeRelative(String initial, Path p) {
346 String scheme = this.uri.getScheme();
347 String authority = this.uri.getAuthority();
348 Path root = new Path(Path.SEPARATOR);
349 if (root.compareTo(p) == 0)
350 return new Path(scheme, authority, initial);
351 Path retPath = new Path(p.getName());
352 Path parent = p.getParent();
353 for (int i=0; i < p.depth()-1; i++) {
354 retPath = new Path(parent.getName(), retPath);
355 parent = parent.getParent();
356 }
357 return new Path(new Path(scheme, authority, initial),
358 retPath.toString());
359 }
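  // Example (illustrative): makeRelative("/user/alice/foo.har",
  // new Path("/dir/file")) yields har://<harAuth>/user/alice/foo.har/dir/file.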
360
  /* Make a path fully qualified in the har filesystem.
   * (non-Javadoc)
   * @see org.apache.hadoop.fs.FileSystem#makeQualified(
   * org.apache.hadoop.fs.Path)
   */
366 @Override
367 public Path makeQualified(Path path) {
368 // make sure that we just get the
369 // path component
370 Path fsPath = path;
371 if (!path.isAbsolute()) {
372 fsPath = new Path(archivePath, path);
373 }
374
375 URI tmpURI = fsPath.toUri();
376 //change this to Har uri
377 return new Path(uri.getScheme(), harAuth, tmpURI.getPath());
378 }
379
380 /**
381 * Fix offset and length of block locations.
382 * Note that this method modifies the original array.
383 * @param locations block locations of har part file
384 * @param start the start of the desired range in the contained file
385 * @param len the length of the desired range
386 * @param fileOffsetInHar the offset of the desired file in the har part file
387 * @return block locations with fixed offset and length
388 */
389 static BlockLocation[] fixBlockLocations(BlockLocation[] locations,
390 long start,
391 long len,
392 long fileOffsetInHar) {
393 // offset 1 past last byte of desired range
394 long end = start + len;
395
396 for (BlockLocation location : locations) {
397 // offset of part block relative to beginning of desired file
398 // (may be negative if file starts in this part block)
399 long harBlockStart = location.getOffset() - fileOffsetInHar;
400 // offset 1 past last byte of har block relative to beginning of
401 // desired file
402 long harBlockEnd = harBlockStart + location.getLength();
403
404 if (start > harBlockStart) {
405 // desired range starts after beginning of this har block
406 // fix offset to beginning of relevant range (relative to desired file)
407 location.setOffset(start);
408 // fix length to relevant portion of har block
409 location.setLength(location.getLength() - (start - harBlockStart));
410 } else {
411 // desired range includes beginning of this har block
412 location.setOffset(harBlockStart);
413 }
414
415 if (harBlockEnd > end) {
416 // range ends before end of this har block
417 // fix length to remove irrelevant portion at the end
418 location.setLength(location.getLength() - (harBlockEnd - end));
419 }
420 }
421
422 return locations;
423 }
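  // Worked example (illustrative numbers): a part block at offset 0 with
  // length 100, a file starting at fileOffsetInHar=30, and a requested
  // range start=10, len=40 (so end=50). Then harBlockStart=-30 and
  // harBlockEnd=70, so the block is trimmed to offset 10 and length
  // 100-40=60, and then to 60-(70-50)=40 -- exactly the requested range,
  // since it lies entirely inside this block.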
424
425 /**
426 * Get block locations from the underlying fs and fix their
427 * offsets and lengths.
428 * @param file the input file status to get block locations
429 * @param start the start of the desired range in the contained file
430 * @param len the length of the desired range
431 * @return block locations for this segment of file
432 * @throws IOException
433 */
434 @Override
435 public BlockLocation[] getFileBlockLocations(FileStatus file, long start,
436 long len) throws IOException {
437 HarStatus hstatus = getFileHarStatus(file.getPath());
438 Path partPath = new Path(archivePath, hstatus.getPartName());
439 FileStatus partStatus = metadata.getPartFileStatus(partPath);
440
441 // get all part blocks that overlap with the desired file blocks
442 BlockLocation[] locations =
443 fs.getFileBlockLocations(partStatus,
444 hstatus.getStartIndex() + start, len);
445
446 return fixBlockLocations(locations, start, len, hstatus.getStartIndex());
447 }
448
449 /**
450 * the hash of the path p inside the filesystem
451 * @param p the path in the harfilesystem
452 * @return the hash code of the path.
453 */
454 public static int getHarHash(Path p) {
455 return (p.toString().hashCode() & 0x7fffffff);
456 }
457
458 static class Store {
459 public Store() {
460 begin = end = startHash = endHash = 0;
461 }
462 public Store(long begin, long end, int startHash, int endHash) {
463 this.begin = begin;
464 this.end = end;
465 this.startHash = startHash;
466 this.endHash = endHash;
467 }
468 public long begin;
469 public long end;
470 public int startHash;
471 public int endHash;
472 }
473
  /**
   * Get the filestatuses of all the children of a given directory by
   * scanning the archive entries cached in the metadata and selecting
   * those whose parent is the given directory. This is a brute force
   * way of getting all such filestatuses.
   *
   * @param parent
   *          the parent path directory
   * @param statuses
   *          the list to add the children filestatuses to
   */
484 private void fileStatusesInIndex(HarStatus parent, List<FileStatus> statuses)
485 throws IOException {
486 String parentString = parent.getName();
487 if (!parentString.endsWith(Path.SEPARATOR)){
488 parentString += Path.SEPARATOR;
489 }
490 Path harPath = new Path(parentString);
491 int harlen = harPath.depth();
492 final Map<String, FileStatus> cache = new TreeMap<String, FileStatus>();
493
494 for (HarStatus hstatus : metadata.archive.values()) {
495 String child = hstatus.getName();
496 if ((child.startsWith(parentString))) {
497 Path thisPath = new Path(child);
498 if (thisPath.depth() == harlen + 1) {
499 statuses.add(toFileStatus(hstatus, cache));
500 }
501 }
502 }
503 }
504
505 /**
506 * Combine the status stored in the index and the underlying status.
507 * @param h status stored in the index
508 * @param cache caching the underlying file statuses
509 * @return the combined file status
510 * @throws IOException
511 */
512 private FileStatus toFileStatus(HarStatus h,
513 Map<String, FileStatus> cache) throws IOException {
514 FileStatus underlying = null;
515 if (cache != null) {
516 underlying = cache.get(h.partName);
517 }
518 if (underlying == null) {
519 final Path p = h.isDir? archivePath: new Path(archivePath, h.partName);
520 underlying = fs.getFileStatus(p);
521 if (cache != null) {
522 cache.put(h.partName, underlying);
523 }
524 }
525
526 long modTime = 0;
527 int version = metadata.getVersion();
528 if (version < 3) {
529 modTime = underlying.getModificationTime();
530 } else if (version == 3) {
531 modTime = h.getModificationTime();
532 }
533
534 return new FileStatus(
535 h.isDir()? 0L: h.getLength(),
536 h.isDir(),
537 underlying.getReplication(),
538 underlying.getBlockSize(),
539 modTime,
540 underlying.getAccessTime(),
541 underlying.getPermission(),
542 underlying.getOwner(),
543 underlying.getGroup(),
544 makeRelative(this.uri.getPath(), new Path(h.name)));
545 }
546
  // Parser for a hadoop archive entry, stored as a single line in the
  // index files. The format is
  //   filename "dir"/"file" partFileName startIndex length
  //   <space separated children>
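  // Example (illustrative, with URL-encoding of names omitted):
  //   /dir1 dir none 0 0 file1 file2
  //   /dir1/file1 file part-0 0 1024
  // In version 3 an additional URL-encoded field carries
  // "<modification time> <permission> <owner> <group>" (see below).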
552 private class HarStatus {
553 boolean isDir;
554 String name;
555 List<String> children;
556 String partName;
557 long startIndex;
558 long length;
559 long modificationTime = 0;
560
561 public HarStatus(String harString) throws UnsupportedEncodingException {
562 String[] splits = harString.split(" ");
563 this.name = decodeFileName(splits[0]);
      this.isDir = "dir".equals(splits[1]);
      // this is equal to "none" if it's a directory
      this.partName = splits[2];
567 this.startIndex = Long.parseLong(splits[3]);
568 this.length = Long.parseLong(splits[4]);
569
570 int version = metadata.getVersion();
571 String[] propSplits = null;
572 // propSplits is used to retrieve the metainformation that Har versions
573 // 1 & 2 missed (modification time, permission, owner group).
574 // These fields are stored in an encoded string placed in different
575 // locations depending on whether it's a file or directory entry.
576 // If it's a directory, the string will be placed at the partName
577 // location (directories have no partName because they don't have data
578 // to be stored). This is done because the number of fields in a
579 // directory entry is unbounded (all children are listed at the end)
580 // If it's a file, the string will be the last field.
581 if (isDir) {
582 if (version == 3){
583 propSplits = decodeString(this.partName).split(" ");
584 }
585 children = new ArrayList<String>();
586 for (int i = 5; i < splits.length; i++) {
587 children.add(decodeFileName(splits[i]));
588 }
589 } else if (version == 3) {
590 propSplits = decodeString(splits[5]).split(" ");
591 }
592
593 if (propSplits != null && propSplits.length >= 4) {
594 modificationTime = Long.parseLong(propSplits[0]);
595 // the fields below are stored in the file but are currently not used
596 // by HarFileSystem
597 // permission = new FsPermission(Short.parseShort(propSplits[1]));
598 // owner = decodeString(propSplits[2]);
599 // group = decodeString(propSplits[3]);
600 }
601 }
602 public boolean isDir() {
603 return isDir;
604 }
605
606 public String getName() {
607 return name;
608 }
609 public String getPartName() {
610 return partName;
611 }
612 public long getStartIndex() {
613 return startIndex;
614 }
615 public long getLength() {
616 return length;
617 }
618 public long getModificationTime() {
619 return modificationTime;
620 }
621 }
622
  /**
   * Return the filestatus of files in the har archive.
   * The permissions returned are those of the archive
   * index files; permissions are not persisted
   * while creating a hadoop archive.
   * @param f the path in har filesystem
   * @return filestatus.
   * @throws IOException
   */
632 @Override
633 public FileStatus getFileStatus(Path f) throws IOException {
634 HarStatus hstatus = getFileHarStatus(f);
635 return toFileStatus(hstatus, null);
636 }
637
638 private HarStatus getFileHarStatus(Path f) throws IOException {
639 // get the fs DataInputStream for the underlying file
640 // look up the index.
641 Path p = makeQualified(f);
642 Path harPath = getPathInHar(p);
643 if (harPath == null) {
644 throw new IOException("Invalid file name: " + f + " in " + uri);
645 }
646 HarStatus hstatus = metadata.archive.get(harPath);
647 if (hstatus == null) {
648 throw new FileNotFoundException("File: " + f + " does not exist in " + uri);
649 }
650 return hstatus;
651 }
652
653 /**
654 * @return null since no checksum algorithm is implemented.
655 */
656 @Override
657 public FileChecksum getFileChecksum(Path f) {
658 return null;
659 }
660
661 /**
662 * Returns a har input stream which fakes end of
663 * file. It reads the index files to get the part
664 * file name and the size and start of the file.
665 */
666 @Override
667 public FSDataInputStream open(Path f, int bufferSize) throws IOException {
668 // get the fs DataInputStream for the underlying file
669 HarStatus hstatus = getFileHarStatus(f);
670 if (hstatus.isDir()) {
671 throw new FileNotFoundException(f + " : not a file in " +
672 archivePath);
673 }
674 return new HarFSDataInputStream(fs, new Path(archivePath,
675 hstatus.getPartName()),
676 hstatus.getStartIndex(), hstatus.getLength(), bufferSize);
677 }
678
679 @Override
680 public FSDataOutputStream create(Path f,
681 FsPermission permission,
682 boolean overwrite,
683 int bufferSize,
684 short replication,
685 long blockSize,
686 Progressable progress) throws IOException {
687 throw new IOException("Har: create not allowed.");
688 }
689
690 @Override
691 public FSDataOutputStream append(Path f, int bufferSize, Progressable progress) throws IOException {
692 throw new IOException("Har: append not allowed.");
693 }
694
695 @Override
696 public void close() throws IOException {
697 if (fs != null) {
698 try {
699 fs.close();
700 } catch(IOException ie) {
701 //this might already be closed
702 // ignore
703 }
704 }
705 }
706
707 /**
708 * Not implemented.
709 */
710 @Override
711 public boolean setReplication(Path src, short replication) throws IOException{
712 throw new IOException("Har: setReplication not allowed");
713 }
714
715 @Override
716 public boolean rename(Path src, Path dst) throws IOException {
717 throw new IOException("Har: rename not allowed");
718 }
719
720 @Override
721 public FSDataOutputStream append(Path f) throws IOException {
722 throw new IOException("Har: append not allowed");
723 }
724
725 /**
726 * Not implemented.
727 */
728 @Override
729 public boolean delete(Path f, boolean recursive) throws IOException {
730 throw new IOException("Har: delete not allowed");
731 }
732
733 /**
734 * liststatus returns the children of a directory
735 * after looking up the index files.
736 */
737 @Override
738 public FileStatus[] listStatus(Path f) throws IOException {
    // look the path up in the archive index and build
    // "fake" filestatuses for the matching entries to
    // return to the client
743 List<FileStatus> statuses = new ArrayList<FileStatus>();
744 Path tmpPath = makeQualified(f);
745 Path harPath = getPathInHar(tmpPath);
746 HarStatus hstatus = metadata.archive.get(harPath);
747 if (hstatus == null) {
748 throw new FileNotFoundException("File " + f + " not found in " + archivePath);
749 }
750 if (hstatus.isDir()) {
751 fileStatusesInIndex(hstatus, statuses);
752 } else {
753 statuses.add(toFileStatus(hstatus, null));
754 }
755
756 return statuses.toArray(new FileStatus[statuses.size()]);
757 }
758
759 /**
760 * return the top level archive path.
761 */
762 @Override
763 public Path getHomeDirectory() {
764 return new Path(uri.toString());
765 }
766
767 @Override
768 public void setWorkingDirectory(Path newDir) {
769 //does nothing.
770 }
771
772 /**
773 * not implemented.
774 */
775 @Override
776 public boolean mkdirs(Path f, FsPermission permission) throws IOException {
777 throw new IOException("Har: mkdirs not allowed");
778 }
779
780 /**
781 * not implemented.
782 */
783 @Override
784 public void copyFromLocalFile(boolean delSrc, Path src, Path dst) throws
785 IOException {
786 throw new IOException("Har: copyfromlocalfile not allowed");
787 }
788
789 /**
790 * copies the file in the har filesystem to a local file.
791 */
792 @Override
793 public void copyToLocalFile(boolean delSrc, Path src, Path dst)
794 throws IOException {
795 FileUtil.copy(this, src, getLocal(getConf()), dst, false, getConf());
796 }
797
798 /**
799 * not implemented.
800 */
801 @Override
802 public Path startLocalOutput(Path fsOutputFile, Path tmpLocalFile)
803 throws IOException {
804 throw new IOException("Har: startLocalOutput not allowed");
805 }
806
807 /**
808 * not implemented.
809 */
810 @Override
811 public void completeLocalOutput(Path fsOutputFile, Path tmpLocalFile)
812 throws IOException {
813 throw new IOException("Har: completeLocalOutput not allowed");
814 }
815
816 /**
817 * not implemented.
818 */
819 @Override
820 public void setOwner(Path p, String username, String groupname)
821 throws IOException {
822 throw new IOException("Har: setowner not allowed");
823 }
824
825 /**
826 * Not implemented.
827 */
828 @Override
829 public void setPermission(Path p, FsPermission permission)
830 throws IOException {
831 throw new IOException("Har: setPermission not allowed");
832 }
833
834 /**
835 * Hadoop archives input stream. This input stream fakes EOF
836 * since archive files are part of bigger part files.
837 */
838 private static class HarFSDataInputStream extends FSDataInputStream {
839 /**
840 * Create an input stream that fakes all the reads/positions/seeking.
841 */
842 private static class HarFsInputStream extends FSInputStream
843 implements CanSetDropBehind, CanSetReadahead {
844 private long position, start, end;
845 //The underlying data input stream that the
846 // underlying filesystem will return.
847 private FSDataInputStream underLyingStream;
848 //one byte buffer
849 private byte[] oneBytebuff = new byte[1];
850 HarFsInputStream(FileSystem fs, Path path, long start,
851 long length, int bufferSize) throws IOException {
852 underLyingStream = fs.open(path, bufferSize);
853 underLyingStream.seek(start);
854 // the start of this file in the part file
855 this.start = start;
856 // the position pointer in the part file
857 this.position = start;
858 // the end pointer in the part file
859 this.end = start + length;
860 }
861
862 @Override
863 public synchronized int available() throws IOException {
864 long remaining = end - underLyingStream.getPos();
865 if (remaining > (long)Integer.MAX_VALUE) {
866 return Integer.MAX_VALUE;
867 }
868 return (int) remaining;
869 }
870
871 @Override
872 public synchronized void close() throws IOException {
873 underLyingStream.close();
874 super.close();
875 }
876
877 //not implemented
878 @Override
879 public void mark(int readLimit) {
880 // do nothing
881 }
882
883 /**
884 * reset is not implemented
885 */
886 @Override
887 public void reset() throws IOException {
888 throw new IOException("reset not implemented.");
889 }
890
891 @Override
892 public synchronized int read() throws IOException {
893 int ret = read(oneBytebuff, 0, 1);
894 return (ret <= 0) ? -1: (oneBytebuff[0] & 0xff);
895 }
896
      @Override
      public synchronized int read(byte[] b) throws IOException {
        // read(byte[], int, int) already advances position, so do not
        // advance it again here
        return read(b, 0, b.length);
      }
905
      @Override
      public synchronized int read(byte[] b, int offset, int len)
          throws IOException {
        int newlen = len;
        if (position + len > end) {
          // clamp the read so it never runs past the faked EOF
          newlen = (int) (end - position);
        }
        // end case
        if (newlen == 0) {
          return -1;
        }
        int ret = underLyingStream.read(b, offset, newlen);
        if (ret > 0) {
          // only advance for bytes actually read
          position += ret;
        }
        return ret;
      }
924
      @Override
      public synchronized long skip(long n) throws IOException {
        long tmpN = n;
        if (tmpN > 0) {
          if (position + tmpN > end) {
            tmpN = end - position;
          }
          underLyingStream.seek(tmpN + position);
          position += tmpN;
          return tmpN;
        }
        // InputStream.skip never skips backwards and never returns a
        // negative value
        return 0;
      }
938
939 @Override
940 public synchronized long getPos() throws IOException {
941 return (position - start);
942 }
943
944 @Override
945 public synchronized void seek(long pos) throws IOException {
946 if (pos < 0 || (start + pos > end)) {
947 throw new IOException("Failed to seek: EOF");
948 }
949 position = start + pos;
950 underLyingStream.seek(position);
951 }
952
953 @Override
954 public boolean seekToNewSource(long targetPos) throws IOException {
955 // do not need to implement this
956 // hdfs in itself does seektonewsource
957 // while reading.
958 return false;
959 }
960
961 /**
962 * implementing position readable.
963 */
964 @Override
965 public int read(long pos, byte[] b, int offset, int length)
966 throws IOException {
967 int nlength = length;
968 if (start + nlength + pos > end) {
969 nlength = (int) (end - (start + pos));
970 }
971 return underLyingStream.read(pos + start , b, offset, nlength);
972 }
973
974 /**
975 * position readable again.
976 */
977 @Override
978 public void readFully(long pos, byte[] b, int offset, int length)
979 throws IOException {
980 if (start + length + pos > end) {
981 throw new IOException("Not enough bytes to read.");
982 }
983 underLyingStream.readFully(pos + start, b, offset, length);
984 }
985
986 @Override
987 public void readFully(long pos, byte[] b) throws IOException {
988 readFully(pos, b, 0, b.length);
989 }
990
991 @Override
992 public void setReadahead(Long readahead) throws IOException {
993 underLyingStream.setReadahead(readahead);
994 }
995
996 @Override
997 public void setDropBehind(Boolean dropBehind) throws IOException {
998 underLyingStream.setDropBehind(dropBehind);
999 }
1000 }
1001
    /**
     * Constructor for the har input stream.
     * @param fs the underlying filesystem
     * @param p The path in the underlying filesystem
     * @param start the start position in the part file
     * @param length the length of valid data in the part file
     * @param bufsize the buffer size
     * @throws IOException
     */
1011 public HarFSDataInputStream(FileSystem fs, Path p, long start,
1012 long length, int bufsize) throws IOException {
1013 super(new HarFsInputStream(fs, p, start, length, bufsize));
1014 }
1015 }
1016
1017 private class HarMetaData {
1018 private FileSystem fs;
1019 private int version;
1020 // the masterIndex of the archive
1021 private Path masterIndexPath;
1022 // the index file
1023 private Path archiveIndexPath;
1024
1025 private long masterIndexTimestamp;
1026 private long archiveIndexTimestamp;
1027
1028 List<Store> stores = new ArrayList<Store>();
1029 Map<Path, HarStatus> archive = new HashMap<Path, HarStatus>();
1030 private Map<Path, FileStatus> partFileStatuses = new HashMap<Path, FileStatus>();
1031
1032 public HarMetaData(FileSystem fs, Path masterIndexPath, Path archiveIndexPath) {
1033 this.fs = fs;
1034 this.masterIndexPath = masterIndexPath;
1035 this.archiveIndexPath = archiveIndexPath;
1036 }
1037
1038 public FileStatus getPartFileStatus(Path partPath) throws IOException {
1039 FileStatus status;
1040 status = partFileStatuses.get(partPath);
1041 if (status == null) {
1042 status = fs.getFileStatus(partPath);
1043 partFileStatuses.put(partPath, status);
1044 }
1045 return status;
1046 }
1047
1048 public long getMasterIndexTimestamp() {
1049 return masterIndexTimestamp;
1050 }
1051
1052 public long getArchiveIndexTimestamp() {
1053 return archiveIndexTimestamp;
1054 }
1055
1056 private int getVersion() {
1057 return version;
1058 }
1059
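    // The master index layout (as parsed in parseMetaData below): the first
    // line holds the format version; each subsequent line is
    //   <startHash> <endHash> <startOffset> <endOffset>
    // i.e. a hash-code range and the byte range in the _index file that
    // contains the entries whose path hashes fall in that range.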
1060 private void parseMetaData() throws IOException {
1061 Text line = new Text();
1062 long read;
1063 FSDataInputStream in = null;
1064 LineReader lin = null;
1065
1066 try {
1067 in = fs.open(masterIndexPath);
1068 FileStatus masterStat = fs.getFileStatus(masterIndexPath);
1069 masterIndexTimestamp = masterStat.getModificationTime();
1070 lin = new LineReader(in, getConf());
1071 read = lin.readLine(line);
1072
1073 // the first line contains the version of the index file
1074 String versionLine = line.toString();
1075 String[] arr = versionLine.split(" ");
1076 version = Integer.parseInt(arr[0]);
1077 // make it always backwards-compatible
1078 if (this.version > HarFileSystem.VERSION) {
1079 throw new IOException("Invalid version " +
1080 this.version + " expected " + HarFileSystem.VERSION);
1081 }
1082
        // each remaining line contains a hashcode range and the
        // byte range (start and end offsets) in the _index file
1084 String[] readStr;
1085 while(read < masterStat.getLen()) {
1086 int b = lin.readLine(line);
1087 read += b;
1088 readStr = line.toString().split(" ");
1089 int startHash = Integer.parseInt(readStr[0]);
1090 int endHash = Integer.parseInt(readStr[1]);
1091 stores.add(new Store(Long.parseLong(readStr[2]),
1092 Long.parseLong(readStr[3]), startHash,
1093 endHash));
1094 line.clear();
1095 }
1096 } catch (IOException ioe) {
1097 LOG.warn("Encountered exception ", ioe);
1098 throw ioe;
1099 } finally {
1100 IOUtils.cleanup(LOG, lin, in);
1101 }
1102
1103 FSDataInputStream aIn = fs.open(archiveIndexPath);
1104 try {
1105 FileStatus archiveStat = fs.getFileStatus(archiveIndexPath);
1106 archiveIndexTimestamp = archiveStat.getModificationTime();
1107 LineReader aLin;
1108
1109 // now start reading the real index file
1110 for (Store s: stores) {
1111 read = 0;
1112 aIn.seek(s.begin);
1113 aLin = new LineReader(aIn, getConf());
1114 while (read + s.begin < s.end) {
1115 int tmp = aLin.readLine(line);
1116 read += tmp;
1117 String lineFeed = line.toString();
1118 String[] parsed = lineFeed.split(" ");
1119 parsed[0] = decodeFileName(parsed[0]);
1120 archive.put(new Path(parsed[0]), new HarStatus(lineFeed));
1121 line.clear();
1122 }
1123 }
1124 } finally {
1125 IOUtils.cleanup(LOG, aIn);
1126 }
1127 }
1128 }
1129
1130 /*
1131 * testing purposes only:
1132 */
1133 HarMetaData getMetadata() {
1134 return metadata;
1135 }
1136
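  // A small LRU cache built on LinkedHashMap: the constructor's
  // accessOrder=true keeps entries in most-recently-used order, and
  // removeEldestEntry evicts the oldest entry once the size exceeds
  // maxEntries.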
1137 private static class LruCache<K, V> extends LinkedHashMap<K, V> {
1138 private final int MAX_ENTRIES;
1139
1140 public LruCache(int maxEntries) {
1141 super(maxEntries + 1, 1.0f, true);
1142 MAX_ENTRIES = maxEntries;
1143 }
1144
1145 @Override
1146 protected boolean removeEldestEntry(Map.Entry<K, V> eldest) {
1147 return size() > MAX_ENTRIES;
1148 }
1149 }
1150 }