001 /**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements. See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership. The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License. You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018 package org.apache.hadoop.fs;
019
020 import java.io.FileNotFoundException;
021 import java.io.IOException;
022 import java.io.UnsupportedEncodingException;
023 import java.net.URI;
024 import java.net.URISyntaxException;
025 import java.net.URLDecoder;
026 import java.util.ArrayList;
027 import java.util.Collections;
028 import java.util.List;
029 import java.util.LinkedHashMap;
030 import java.util.Map;
031 import java.util.TreeMap;
032 import java.util.HashMap;
033
034 import org.apache.commons.logging.Log;
035 import org.apache.commons.logging.LogFactory;
036 import org.apache.hadoop.conf.Configuration;
037 import org.apache.hadoop.fs.permission.FsPermission;
038 import org.apache.hadoop.io.IOUtils;
039 import org.apache.hadoop.io.Text;
040 import org.apache.hadoop.util.LineReader;
041 import org.apache.hadoop.util.Progressable;
042
043 /**
044 * This is an implementation of the Hadoop Archive
045 * Filesystem. This archive Filesystem has index files
046 * of the form _index* and has contents of the form
047 * part-*. The index files store the indexes of the
048 * real files. The index files are of the form _masterindex
049 * and _index. The master index is a level of indirection
* into the index file to make lookups faster. The index
* file is sorted by the hash codes of the paths that it contains,
* and the master index contains pointers to the positions in
* the index file for ranges of hash codes.
054 */
055
056 public class HarFileSystem extends FilterFileSystem {
057
058 private static final Log LOG = LogFactory.getLog(HarFileSystem.class);
059
060 public static final String METADATA_CACHE_ENTRIES_KEY = "fs.har.metadatacache.entries";
061 public static final int METADATA_CACHE_ENTRIES_DEFAULT = 10;
062
063 public static final int VERSION = 3;
064
065 private static Map<URI, HarMetaData> harMetaCache;
066
067 // uri representation of this Har filesystem
068 private URI uri;
069 // the top level path of the archive
070 // in the underlying file system
071 private Path archivePath;
072 // the har auth
073 private String harAuth;
074
075 // pointer into the static metadata cache
076 private HarMetaData metadata;
077
078 /**
* Public no-argument constructor for the har filesystem.
080 *
081 */
082 public HarFileSystem() {
083 }
084
085 /**
* Return the protocol scheme for the FileSystem.
*
089 * @return <code>har</code>
090 */
091 @Override
092 public String getScheme() {
093 return "har";
094 }
095
096 /**
097 * Constructor to create a HarFileSystem with an
098 * underlying filesystem.
* @param fs the underlying filesystem
100 */
101 public HarFileSystem(FileSystem fs) {
102 super(fs);
103 }
104
105 private synchronized void initializeMetadataCache(Configuration conf) {
106 if (harMetaCache == null) {
107 int cacheSize = conf.getInt(METADATA_CACHE_ENTRIES_KEY, METADATA_CACHE_ENTRIES_DEFAULT);
108 harMetaCache = Collections.synchronizedMap(new LruCache<URI, HarMetaData>(cacheSize));
109 }
110 }
111
112 /**
113 * Initialize a Har filesystem per har archive. The
114 * archive home directory is the top level directory
115 * in the filesystem that contains the HAR archive.
116 * Be careful with this method, you do not want to go
117 * on creating new Filesystem instances per call to
118 * path.getFileSystem().
* The har URI is of the form
* har://underlyingfsscheme-host:port/archivepath
* or
* har:///archivepath, in which case the default filesystem
* from the configuration is used as the underlying filesystem.
124 */
125 @Override
126 public void initialize(URI name, Configuration conf) throws IOException {
127 // initialize the metadata cache, if needed
128 initializeMetadataCache(conf);
129
130 // decode the name
131 URI underLyingURI = decodeHarURI(name, conf);
132 // we got the right har Path- now check if this is
133 // truly a har filesystem
134 Path harPath = archivePath(
135 new Path(name.getScheme(), name.getAuthority(), name.getPath()));
136 if (harPath == null) {
137 throw new IOException("Invalid path for the Har Filesystem. " +
138 name.toString());
139 }
140 if (fs == null) {
141 fs = FileSystem.get(underLyingURI, conf);
142 }
143 uri = harPath.toUri();
144 archivePath = new Path(uri.getPath());
145 harAuth = getHarAuth(underLyingURI);
146 //check for the underlying fs containing
147 // the index file
148 Path masterIndexPath = new Path(archivePath, "_masterindex");
149 Path archiveIndexPath = new Path(archivePath, "_index");
150 if (!fs.exists(masterIndexPath) || !fs.exists(archiveIndexPath)) {
151 throw new IOException("Invalid path for the Har Filesystem. " +
152 "No index file in " + harPath);
153 }
154
155 metadata = harMetaCache.get(uri);
156 if (metadata != null) {
157 FileStatus mStat = fs.getFileStatus(masterIndexPath);
158 FileStatus aStat = fs.getFileStatus(archiveIndexPath);
159 if (mStat.getModificationTime() != metadata.getMasterIndexTimestamp() ||
160 aStat.getModificationTime() != metadata.getArchiveIndexTimestamp()) {
161 // the archive has been overwritten since we last read it
162 // remove the entry from the meta data cache
163 metadata = null;
164 harMetaCache.remove(uri);
165 }
166 }
167 if (metadata == null) {
168 metadata = new HarMetaData(fs, masterIndexPath, archiveIndexPath);
169 metadata.parseMetaData();
170 harMetaCache.put(uri, metadata);
171 }
172 }
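
// An illustrative usage sketch (not part of this class); the namenode address and
// paths below are hypothetical, only the har:// URI shape follows the format
// described in the javadoc above:
//
//   Configuration conf = new Configuration();
//   Path p = new Path("har://hdfs-nn.example.com:8020/user/alice/data.har/dir/file.txt");
//   FileSystem harFs = p.getFileSystem(conf);   // resolves to a HarFileSystem
//   FSDataInputStream in = harFs.open(p);       // served out of the part-* files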
173
174 // get the version of the filesystem from the masterindex file
// the version is currently not useful since it's the first version
176 // of archives
177 public int getHarVersion() throws IOException {
178 if (metadata != null) {
179 return metadata.getVersion();
180 }
181 else {
182 throw new IOException("Invalid meta data for the Har Filesystem");
183 }
184 }
185
186 /*
187 * find the parent path that is the
188 * archive path in the path. The last
189 * path segment that ends with .har is
190 * the path that will be returned.
191 */
192 private Path archivePath(Path p) {
193 Path retPath = null;
194 Path tmp = p;
195 for (int i=0; i< p.depth(); i++) {
196 if (tmp.toString().endsWith(".har")) {
197 retPath = tmp;
198 break;
199 }
200 tmp = tmp.getParent();
201 }
202 return retPath;
203 }
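
// For example (hypothetical path): archivePath of /user/alice/data.har/dir/file
// returns /user/alice/data.har, the deepest prefix whose last component ends in
// ".har"; null is returned when the path contains no such component.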
204
205 /**
206 * decode the raw URI to get the underlying URI
* @param rawURI raw Har URI
* @param conf the configuration, used to look up the default filesystem
* @return filtered URI of the underlying fileSystem
209 */
210 private URI decodeHarURI(URI rawURI, Configuration conf) throws IOException {
211 String tmpAuth = rawURI.getAuthority();
212 //we are using the default file
213 //system in the config
//so create an underlying uri and
215 //return it
216 if (tmpAuth == null) {
217 //create a path
218 return FileSystem.getDefaultUri(conf);
219 }
220 String authority = rawURI.getAuthority();
221 if (authority == null) {
222 throw new IOException("URI: " + rawURI
223 + " is an invalid Har URI since authority==null."
224 + " Expecting har://<scheme>-<host>/<path>.");
225 }
226
227 int i = authority.indexOf('-');
228 if (i < 0) {
229 throw new IOException("URI: " + rawURI
230 + " is an invalid Har URI since '-' not found."
231 + " Expecting har://<scheme>-<host>/<path>.");
232 }
233
234 if (rawURI.getQuery() != null) {
235 // query component not allowed
236 throw new IOException("query component in Path not supported " + rawURI);
237 }
238
239 URI tmp = null;
240
241 try {
242 // convert <scheme>-<host> to <scheme>://<host>
243 URI baseUri = new URI(authority.replaceFirst("-", "://"));
244
245 tmp = new URI(baseUri.getScheme(), baseUri.getAuthority(),
246 rawURI.getPath(), rawURI.getQuery(), rawURI.getFragment());
247 } catch (URISyntaxException e) {
248 throw new IOException("URI: " + rawURI
249 + " is an invalid Har URI. Expecting har://<scheme>-<host>/<path>.");
250 }
251 return tmp;
252 }
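
// A sketch of the decoding above, with hypothetical host names:
//   har://hdfs-nn.example.com:8020/user/alice/data.har
//     -> authority "hdfs-nn.example.com:8020" becomes the underlying URI
//        hdfs://nn.example.com:8020/user/alice/data.har
//   har:///user/alice/data.har
//     -> no authority, so the default filesystem URI from the configuration is returned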
253
254 private static String decodeString(String str)
255 throws UnsupportedEncodingException {
256 return URLDecoder.decode(str, "UTF-8");
257 }
258
259 private String decodeFileName(String fname)
260 throws UnsupportedEncodingException {
261 int version = metadata.getVersion();
262 if (version == 2 || version == 3){
263 return decodeString(fname);
264 }
265 return fname;
266 }
267
268 /**
269 * return the top level archive.
270 */
271 @Override
272 public Path getWorkingDirectory() {
273 return new Path(uri.toString());
274 }
275
276 /**
* Create a har specific auth of the form
* underlyingfsscheme-host:port
* @param underLyingUri the uri of the underlying
* filesystem
281 * @return har specific auth
282 */
283 private String getHarAuth(URI underLyingUri) {
284 String auth = underLyingUri.getScheme() + "-";
285 if (underLyingUri.getHost() != null) {
286 auth += underLyingUri.getHost() + ":";
287 if (underLyingUri.getPort() != -1) {
288 auth += underLyingUri.getPort();
289 }
290 }
291 else {
292 auth += ":";
293 }
294 return auth;
295 }
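
// For instance (hypothetical host), an underlying URI of hdfs://nn.example.com:8020
// produces the auth "hdfs-nn.example.com:8020", while a host-less URI such as
// file:/// produces "file-:".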
296
297 /**
298 * Returns the uri of this filesystem.
299 * The uri is of the form
* har://underlyingfsscheme-host:port/pathintheunderlyingfs
301 */
302 @Override
303 public URI getUri() {
304 return this.uri;
305 }
306
307 /**
* Returns the path inside the har filesystem.
* This is the path relative to the root
* of the har filesystem.
312 * @param path the fully qualified path in the har filesystem.
313 * @return relative path in the filesystem.
314 */
315 private Path getPathInHar(Path path) {
316 Path harPath = new Path(path.toUri().getPath());
317 if (archivePath.compareTo(harPath) == 0)
318 return new Path(Path.SEPARATOR);
319 Path tmp = new Path(harPath.getName());
320 Path parent = harPath.getParent();
321 while (!(parent.compareTo(archivePath) == 0)) {
322 if (parent.toString().equals(Path.SEPARATOR)) {
323 tmp = null;
324 break;
325 }
326 tmp = new Path(parent.getName(), tmp);
327 parent = parent.getParent();
328 }
329 if (tmp != null)
330 tmp = new Path(Path.SEPARATOR, tmp);
331 return tmp;
332 }
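
// For example, with archivePath = /user/alice/data.har (hypothetical), the path
// /user/alice/data.har/dir/file maps to /dir/file, the archive root itself maps
// to /, and a path outside the archive yields null.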
333
334 //the relative path of p. basically
335 // getting rid of /. Parsing and doing
336 // string manipulation is not good - so
337 // just use the path api to do it.
338 private Path makeRelative(String initial, Path p) {
339 String scheme = this.uri.getScheme();
340 String authority = this.uri.getAuthority();
341 Path root = new Path(Path.SEPARATOR);
342 if (root.compareTo(p) == 0)
343 return new Path(scheme, authority, initial);
344 Path retPath = new Path(p.getName());
345 Path parent = p.getParent();
346 for (int i=0; i < p.depth()-1; i++) {
347 retPath = new Path(parent.getName(), retPath);
348 parent = parent.getParent();
349 }
350 return new Path(new Path(scheme, authority, initial),
351 retPath.toString());
352 }
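
// A small worked example with hypothetical values: makeRelative("/user/alice/data.har",
// new Path("/dir/file")) rebuilds the name component by component and returns the
// har-qualified path har://<authority>/user/alice/data.har/dir/file.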
353
354 /* this makes a path qualified in the har filesystem
355 * (non-Javadoc)
356 * @see org.apache.hadoop.fs.FilterFileSystem#makeQualified(
357 * org.apache.hadoop.fs.Path)
358 */
359 @Override
360 public Path makeQualified(Path path) {
361 // make sure that we just get the
362 // path component
363 Path fsPath = path;
364 if (!path.isAbsolute()) {
365 fsPath = new Path(archivePath, path);
366 }
367
368 URI tmpURI = fsPath.toUri();
369 //change this to Har uri
370 return new Path(uri.getScheme(), harAuth, tmpURI.getPath());
371 }
372
373 /**
374 * Fix offset and length of block locations.
375 * Note that this method modifies the original array.
376 * @param locations block locations of har part file
377 * @param start the start of the desired range in the contained file
378 * @param len the length of the desired range
379 * @param fileOffsetInHar the offset of the desired file in the har part file
380 * @return block locations with fixed offset and length
381 */
382 static BlockLocation[] fixBlockLocations(BlockLocation[] locations,
383 long start,
384 long len,
385 long fileOffsetInHar) {
386 // offset 1 past last byte of desired range
387 long end = start + len;
388
389 for (BlockLocation location : locations) {
390 // offset of part block relative to beginning of desired file
391 // (may be negative if file starts in this part block)
392 long harBlockStart = location.getOffset() - fileOffsetInHar;
393 // offset 1 past last byte of har block relative to beginning of
394 // desired file
395 long harBlockEnd = harBlockStart + location.getLength();
396
397 if (start > harBlockStart) {
398 // desired range starts after beginning of this har block
399 // fix offset to beginning of relevant range (relative to desired file)
400 location.setOffset(start);
401 // fix length to relevant portion of har block
402 location.setLength(location.getLength() - (start - harBlockStart));
403 } else {
404 // desired range includes beginning of this har block
405 location.setOffset(harBlockStart);
406 }
407
408 if (harBlockEnd > end) {
409 // range ends before end of this har block
410 // fix length to remove irrelevant portion at the end
411 location.setLength(location.getLength() - (harBlockEnd - end));
412 }
413 }
414
415 return locations;
416 }
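
// A worked example of the adjustment above (illustrative numbers): take a part-file
// block at offset 1000 with length 512, a file stored at fileOffsetInHar = 900, and
// a request with start = 0, len = 300. Then harBlockStart = 1000 - 900 = 100 and
// harBlockEnd = 612; start <= harBlockStart so the offset becomes 100, and since
// harBlockEnd > end (300) the length becomes 512 - (612 - 300) = 200, i.e. the
// returned block covers bytes [100, 300) of the requested file.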
417
418 /**
419 * Get block locations from the underlying fs and fix their
420 * offsets and lengths.
421 * @param file the input filestatus to get block locations
422 * @param start the start of the desired range in the contained file
423 * @param len the length of the desired range
424 * @return block locations for this segment of file
425 * @throws IOException
426 */
427 @Override
428 public BlockLocation[] getFileBlockLocations(FileStatus file, long start,
429 long len) throws IOException {
430 HarStatus hstatus = getFileHarStatus(file.getPath());
431 Path partPath = new Path(archivePath, hstatus.getPartName());
432 FileStatus partStatus = metadata.getPartFileStatus(partPath);
433
434 // get all part blocks that overlap with the desired file blocks
435 BlockLocation[] locations =
436 fs.getFileBlockLocations(partStatus,
437 hstatus.getStartIndex() + start, len);
438
439 return fixBlockLocations(locations, start, len, hstatus.getStartIndex());
440 }
441
442 /**
* the hash of the path p inside
* the har filesystem
* @param p the path in the har filesystem
446 * @return the hash code of the path.
447 */
448 public static int getHarHash(Path p) {
449 return (p.toString().hashCode() & 0x7fffffff);
450 }
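
// Per the layout described in the class comment, the _index file is sorted by this
// hash and each _masterindex entry (see Store below) records a [startHash, endHash]
// range along with the byte range of _index that holds those entries.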
451
452 static class Store {
453 public Store() {
454 begin = end = startHash = endHash = 0;
455 }
456 public Store(long begin, long end, int startHash, int endHash) {
457 this.begin = begin;
458 this.end = end;
459 this.startHash = startHash;
460 this.endHash = endHash;
461 }
462 public long begin;
463 public long end;
464 public int startHash;
465 public int endHash;
466 }
467
468 /**
* Get filestatuses of all the children of a given directory. This just reads
* through the index file line by line to get all statuses for the children
* of a directory. It's a brute force way of getting all such filestatuses.
*
* @param parent
* the parent path directory
* @param statuses
* the list to add the children filestatuses to
* @param children
* the string list of children for this parent (currently unused)
481 */
482 private void fileStatusesInIndex(HarStatus parent, List<FileStatus> statuses,
483 List<String> children) throws IOException {
484 String parentString = parent.getName();
485 if (!parentString.endsWith(Path.SEPARATOR)){
486 parentString += Path.SEPARATOR;
487 }
488 Path harPath = new Path(parentString);
489 int harlen = harPath.depth();
490 final Map<String, FileStatus> cache = new TreeMap<String, FileStatus>();
491
492 for (HarStatus hstatus : metadata.archive.values()) {
493 String child = hstatus.getName();
494 if ((child.startsWith(parentString))) {
495 Path thisPath = new Path(child);
496 if (thisPath.depth() == harlen + 1) {
497 statuses.add(toFileStatus(hstatus, cache));
498 }
499 }
500 }
501 }
502
503 /**
504 * Combine the status stored in the index and the underlying status.
505 * @param h status stored in the index
506 * @param cache caching the underlying file statuses
507 * @return the combined file status
508 * @throws IOException
509 */
510 private FileStatus toFileStatus(HarStatus h,
511 Map<String, FileStatus> cache) throws IOException {
512 FileStatus underlying = null;
513 if (cache != null) {
514 underlying = cache.get(h.partName);
515 }
516 if (underlying == null) {
517 final Path p = h.isDir? archivePath: new Path(archivePath, h.partName);
518 underlying = fs.getFileStatus(p);
519 if (cache != null) {
520 cache.put(h.partName, underlying);
521 }
522 }
523
524 long modTime = 0;
525 int version = metadata.getVersion();
526 if (version < 3) {
527 modTime = underlying.getModificationTime();
528 } else if (version == 3) {
529 modTime = h.getModificationTime();
530 }
531
532 return new FileStatus(
533 h.isDir()? 0L: h.getLength(),
534 h.isDir(),
535 underlying.getReplication(),
536 underlying.getBlockSize(),
537 modTime,
538 underlying.getAccessTime(),
539 underlying.getPermission(),
540 underlying.getOwner(),
541 underlying.getGroup(),
542 makeRelative(this.uri.getPath(), new Path(h.name)));
543 }
544
545 // a single line parser for hadoop archives status
546 // stored in a single line in the index files
547 // the format is of the form
548 // filename "dir"/"file" partFileName startIndex length
// <space separated children>
550 private class HarStatus {
551 boolean isDir;
552 String name;
553 List<String> children;
554 String partName;
555 long startIndex;
556 long length;
557 long modificationTime = 0;
558
559 public HarStatus(String harString) throws UnsupportedEncodingException {
560 String[] splits = harString.split(" ");
561 this.name = decodeFileName(splits[0]);
this.isDir = "dir".equals(splits[1]);
// this is equal to "none" if it's a directory
564 this.partName = splits[2];
565 this.startIndex = Long.parseLong(splits[3]);
566 this.length = Long.parseLong(splits[4]);
567
568 int version = metadata.getVersion();
569 String[] propSplits = null;
570 // propSplits is used to retrieve the metainformation that Har versions
571 // 1 & 2 missed (modification time, permission, owner group).
572 // These fields are stored in an encoded string placed in different
573 // locations depending on whether it's a file or directory entry.
574 // If it's a directory, the string will be placed at the partName
575 // location (directories have no partName because they don't have data
576 // to be stored). This is done because the number of fields in a
577 // directory entry is unbounded (all children are listed at the end)
578 // If it's a file, the string will be the last field.
579 if (isDir) {
580 if (version == 3){
581 propSplits = decodeString(this.partName).split(" ");
582 }
583 children = new ArrayList<String>();
584 for (int i = 5; i < splits.length; i++) {
585 children.add(decodeFileName(splits[i]));
586 }
587 } else if (version == 3) {
588 propSplits = decodeString(splits[5]).split(" ");
589 }
590
591 if (propSplits != null && propSplits.length >= 4) {
592 modificationTime = Long.parseLong(propSplits[0]);
593 // the fields below are stored in the file but are currently not used
594 // by HarFileSystem
595 // permission = new FsPermission(Short.parseShort(propSplits[1]));
596 // owner = decodeString(propSplits[2]);
597 // group = decodeString(propSplits[3]);
598 }
599 }
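
// Illustrative version-3 index lines as parsed above (names are URL-encoded in the
// real files, and all values here are made up):
//   /dir/file.txt file part-0 0 1024 <encoded "modtime perm owner group">
//   /dir dir <encoded "modtime perm owner group"> 0 0 file.txt other.txt
// For the directory line the encoded properties sit in the partName field and
// everything after the length field is the list of children.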
600 public boolean isDir() {
601 return isDir;
602 }
603
604 public String getName() {
605 return name;
606 }
607 public String getPartName() {
608 return partName;
609 }
610 public long getStartIndex() {
611 return startIndex;
612 }
613 public long getLength() {
614 return length;
615 }
616 public long getModificationTime() {
617 return modificationTime;
618 }
619 }
620
621 /**
* return the filestatus of files in the har archive.
* The permissions returned are those of the archive
* index files. The permissions are not persisted
* while creating a hadoop archive.
626 * @param f the path in har filesystem
627 * @return filestatus.
628 * @throws IOException
629 */
630 @Override
631 public FileStatus getFileStatus(Path f) throws IOException {
632 HarStatus hstatus = getFileHarStatus(f);
633 return toFileStatus(hstatus, null);
634 }
635
636 private HarStatus getFileHarStatus(Path f) throws IOException {
// look up the file in the archive index.
639 Path p = makeQualified(f);
640 Path harPath = getPathInHar(p);
641 if (harPath == null) {
642 throw new IOException("Invalid file name: " + f + " in " + uri);
643 }
644 HarStatus hstatus = metadata.archive.get(harPath);
645 if (hstatus == null) {
646 throw new FileNotFoundException("File: " + f + " does not exist in " + uri);
647 }
648 return hstatus;
649 }
650
651 /**
652 * @return null since no checksum algorithm is implemented.
653 */
654 @Override
655 public FileChecksum getFileChecksum(Path f) {
656 return null;
657 }
658
659 /**
660 * Returns a har input stream which fakes end of
661 * file. It reads the index files to get the part
662 * file name and the size and start of the file.
663 */
664 @Override
665 public FSDataInputStream open(Path f, int bufferSize) throws IOException {
666 // get the fs DataInputStream for the underlying file
667 HarStatus hstatus = getFileHarStatus(f);
668 // we got it.. woo hooo!!!
669 if (hstatus.isDir()) {
670 throw new FileNotFoundException(f + " : not a file in " +
671 archivePath);
672 }
673 return new HarFSDataInputStream(fs, new Path(archivePath,
674 hstatus.getPartName()),
675 hstatus.getStartIndex(), hstatus.getLength(), bufferSize);
676 }
677
678 @Override
679 public FSDataOutputStream create(Path f,
680 FsPermission permission,
681 boolean overwrite,
682 int bufferSize,
683 short replication,
684 long blockSize,
685 Progressable progress) throws IOException {
686 throw new IOException("Har: create not allowed.");
687 }
688
689 @Override
690 public void close() throws IOException {
691 if (fs != null) {
692 try {
693 fs.close();
694 } catch(IOException ie) {
695 //this might already be closed
696 // ignore
697 }
698 }
699 }
700
701 /**
702 * Not implemented.
703 */
704 @Override
705 public boolean setReplication(Path src, short replication) throws IOException{
706 throw new IOException("Har: setreplication not allowed");
707 }
708
709 /**
710 * Not implemented.
711 */
712 @Override
713 public boolean delete(Path f, boolean recursive) throws IOException {
714 throw new IOException("Har: delete not allowed");
715 }
716
717 /**
718 * liststatus returns the children of a directory
719 * after looking up the index files.
720 */
721 @Override
722 public FileStatus[] listStatus(Path f) throws IOException {
// look up the path in the archive index;
// we will create fake filestatuses to return
// to the client
727 List<FileStatus> statuses = new ArrayList<FileStatus>();
728 Path tmpPath = makeQualified(f);
729 Path harPath = getPathInHar(tmpPath);
730 HarStatus hstatus = metadata.archive.get(harPath);
731 if (hstatus == null) {
732 throw new FileNotFoundException("File " + f + " not found in " + archivePath);
733 }
734 if (hstatus.isDir()) {
735 fileStatusesInIndex(hstatus, statuses, hstatus.children);
736 } else {
737 statuses.add(toFileStatus(hstatus, null));
738 }
739
740 return statuses.toArray(new FileStatus[statuses.size()]);
741 }
742
743 /**
744 * return the top level archive path.
745 */
746 @Override
747 public Path getHomeDirectory() {
748 return new Path(uri.toString());
749 }
750
751 @Override
752 public void setWorkingDirectory(Path newDir) {
753 //does nothing.
754 }
755
756 /**
757 * not implemented.
758 */
759 @Override
760 public boolean mkdirs(Path f, FsPermission permission) throws IOException {
761 throw new IOException("Har: mkdirs not allowed");
762 }
763
764 /**
765 * not implemented.
766 */
767 @Override
768 public void copyFromLocalFile(boolean delSrc, Path src, Path dst) throws
769 IOException {
770 throw new IOException("Har: copyfromlocalfile not allowed");
771 }
772
773 /**
774 * copies the file in the har filesystem to a local file.
775 */
776 @Override
777 public void copyToLocalFile(boolean delSrc, Path src, Path dst)
778 throws IOException {
779 FileUtil.copy(this, src, getLocal(getConf()), dst, false, getConf());
780 }
781
782 /**
783 * not implemented.
784 */
785 @Override
786 public Path startLocalOutput(Path fsOutputFile, Path tmpLocalFile)
787 throws IOException {
788 throw new IOException("Har: startLocalOutput not allowed");
789 }
790
791 /**
792 * not implemented.
793 */
794 @Override
795 public void completeLocalOutput(Path fsOutputFile, Path tmpLocalFile)
796 throws IOException {
797 throw new IOException("Har: completeLocalOutput not allowed");
798 }
799
800 /**
801 * not implemented.
802 */
803 @Override
804 public void setOwner(Path p, String username, String groupname)
805 throws IOException {
806 throw new IOException("Har: setowner not allowed");
807 }
808
809 /**
810 * Not implemented.
811 */
812 @Override
813 public void setPermission(Path p, FsPermission permisssion)
814 throws IOException {
815 throw new IOException("Har: setPermission not allowed");
816 }
817
818 /**
819 * Hadoop archives input stream. This input stream fakes EOF
820 * since archive files are part of bigger part files.
821 */
822 private static class HarFSDataInputStream extends FSDataInputStream {
823 /**
824 * Create an input stream that fakes all the reads/positions/seeking.
825 */
826 private static class HarFsInputStream extends FSInputStream {
827 private long position, start, end;
828 //The underlying data input stream that the
829 // underlying filesystem will return.
830 private FSDataInputStream underLyingStream;
831 //one byte buffer
832 private byte[] oneBytebuff = new byte[1];
833 HarFsInputStream(FileSystem fs, Path path, long start,
834 long length, int bufferSize) throws IOException {
835 underLyingStream = fs.open(path, bufferSize);
836 underLyingStream.seek(start);
837 // the start of this file in the part file
838 this.start = start;
839 // the position pointer in the part file
840 this.position = start;
841 // the end pointer in the part file
842 this.end = start + length;
843 }
844
845 @Override
846 public synchronized int available() throws IOException {
847 long remaining = end - underLyingStream.getPos();
848 if (remaining > (long)Integer.MAX_VALUE) {
849 return Integer.MAX_VALUE;
850 }
851 return (int) remaining;
852 }
853
854 @Override
855 public synchronized void close() throws IOException {
856 underLyingStream.close();
857 super.close();
858 }
859
860 //not implemented
861 @Override
862 public void mark(int readLimit) {
863 // do nothing
864 }
865
866 /**
867 * reset is not implemented
868 */
869 @Override
870 public void reset() throws IOException {
871 throw new IOException("reset not implemented.");
872 }
873
874 @Override
875 public synchronized int read() throws IOException {
876 int ret = read(oneBytebuff, 0, 1);
877 return (ret <= 0) ? -1: (oneBytebuff[0] & 0xff);
878 }
879
@Override
public synchronized int read(byte[] b) throws IOException {
// the three-argument read below already advances position; adding
// ret again here would double-count the bytes that were read
return read(b, 0, b.length);
}
888
/**
* Read up to <code>len</code> bytes, faking EOF at the end of the
* archived file's region of the part file.
*/
@Override
public synchronized int read(byte[] b, int offset, int len)
throws IOException {
int newlen = len;
int ret = -1;
if (position + len > end) {
newlen = (int) (end - position);
}
// end case
if (newlen == 0)
return ret;
ret = underLyingStream.read(b, offset, newlen);
if (ret > 0) {
// only advance the position for bytes that were actually read
position += ret;
}
return ret;
}
907
908 @Override
909 public synchronized long skip(long n) throws IOException {
910 long tmpN = n;
911 if (tmpN > 0) {
912 if (position + tmpN > end) {
913 tmpN = end - position;
914 }
915 underLyingStream.seek(tmpN + position);
916 position += tmpN;
917 return tmpN;
918 }
919 return (tmpN < 0)? -1 : 0;
920 }
921
922 @Override
923 public synchronized long getPos() throws IOException {
924 return (position - start);
925 }
926
927 @Override
928 public synchronized void seek(long pos) throws IOException {
929 if (pos < 0 || (start + pos > end)) {
930 throw new IOException("Failed to seek: EOF");
931 }
932 position = start + pos;
933 underLyingStream.seek(position);
934 }
935
936 @Override
937 public boolean seekToNewSource(long targetPos) throws IOException {
938 //do not need to implement this
939 // hdfs in itself does seektonewsource
940 // while reading.
941 return false;
942 }
943
944 /**
945 * implementing position readable.
946 */
947 @Override
948 public int read(long pos, byte[] b, int offset, int length)
949 throws IOException {
950 int nlength = length;
951 if (start + nlength + pos > end) {
952 nlength = (int) (end - (start + pos));
953 }
954 return underLyingStream.read(pos + start , b, offset, nlength);
955 }
956
957 /**
958 * position readable again.
959 */
960 @Override
961 public void readFully(long pos, byte[] b, int offset, int length)
962 throws IOException {
963 if (start + length + pos > end) {
964 throw new IOException("Not enough bytes to read.");
965 }
966 underLyingStream.readFully(pos + start, b, offset, length);
967 }
968
969 @Override
970 public void readFully(long pos, byte[] b) throws IOException {
971 readFully(pos, b, 0, b.length);
972 }
973
974 }
975
976 /**
* Constructor for the har input stream.
978 * @param fs the underlying filesystem
979 * @param p The path in the underlying filesystem
980 * @param start the start position in the part file
981 * @param length the length of valid data in the part file
982 * @param bufsize the buffer size
983 * @throws IOException
984 */
985 public HarFSDataInputStream(FileSystem fs, Path p, long start,
986 long length, int bufsize) throws IOException {
987 super(new HarFsInputStream(fs, p, start, length, bufsize));
988 }
989
990 /**
991 * constructor for har input stream.
992 * @param fs the underlying filesystem
993 * @param p the path in the underlying file system
994 * @param start the start position in the part file
995 * @param length the length of valid data in the part file.
996 * @throws IOException
997 */
998 public HarFSDataInputStream(FileSystem fs, Path p, long start, long length)
999 throws IOException {
1000 super(new HarFsInputStream(fs, p, start, length, 0));
1001 }
1002 }
1003
1004 private class HarMetaData {
1005 private FileSystem fs;
1006 private int version;
1007 // the masterIndex of the archive
1008 private Path masterIndexPath;
1009 // the index file
1010 private Path archiveIndexPath;
1011
1012 private long masterIndexTimestamp;
1013 private long archiveIndexTimestamp;
1014
1015 List<Store> stores = new ArrayList<Store>();
1016 Map<Path, HarStatus> archive = new HashMap<Path, HarStatus>();
1017 private Map<Path, FileStatus> partFileStatuses = new HashMap<Path, FileStatus>();
1018
1019 public HarMetaData(FileSystem fs, Path masterIndexPath, Path archiveIndexPath) {
1020 this.fs = fs;
1021 this.masterIndexPath = masterIndexPath;
1022 this.archiveIndexPath = archiveIndexPath;
1023 }
1024
1025 public FileStatus getPartFileStatus(Path partPath) throws IOException {
1026 FileStatus status;
1027 status = partFileStatuses.get(partPath);
1028 if (status == null) {
1029 status = fs.getFileStatus(partPath);
1030 partFileStatuses.put(partPath, status);
1031 }
1032 return status;
1033 }
1034
1035 public long getMasterIndexTimestamp() {
1036 return masterIndexTimestamp;
1037 }
1038
1039 public long getArchiveIndexTimestamp() {
1040 return archiveIndexTimestamp;
1041 }
1042
1043 private int getVersion() {
1044 return version;
1045 }
1046
1047 private void parseMetaData() throws IOException {
1048 Text line;
1049 long read;
1050 FSDataInputStream in = null;
1051 LineReader lin = null;
1052
1053 try {
1054 in = fs.open(masterIndexPath);
1055 FileStatus masterStat = fs.getFileStatus(masterIndexPath);
1056 masterIndexTimestamp = masterStat.getModificationTime();
1057 lin = new LineReader(in, getConf());
1058 line = new Text();
1059 read = lin.readLine(line);
1060
1061 // the first line contains the version of the index file
1062 String versionLine = line.toString();
1063 String[] arr = versionLine.split(" ");
1064 version = Integer.parseInt(arr[0]);
// we can only read archive versions up to the one this code understands
1066 if (this.version > HarFileSystem.VERSION) {
1067 throw new IOException("Invalid version " +
1068 this.version + " expected " + HarFileSystem.VERSION);
1069 }
1070
// each line contains a hashcode range and the corresponding byte range in the index file
1072 String[] readStr = null;
1073 while(read < masterStat.getLen()) {
1074 int b = lin.readLine(line);
1075 read += b;
1076 readStr = line.toString().split(" ");
1077 int startHash = Integer.parseInt(readStr[0]);
1078 int endHash = Integer.parseInt(readStr[1]);
1079 stores.add(new Store(Long.parseLong(readStr[2]),
1080 Long.parseLong(readStr[3]), startHash,
1081 endHash));
1082 line.clear();
1083 }
1084 } finally {
1085 IOUtils.cleanup(LOG, lin, in);
1086 }
1087
1088 FSDataInputStream aIn = fs.open(archiveIndexPath);
1089 try {
1090 FileStatus archiveStat = fs.getFileStatus(archiveIndexPath);
1091 archiveIndexTimestamp = archiveStat.getModificationTime();
1092 LineReader aLin;
1093
1094 // now start reading the real index file
1095 for (Store s: stores) {
1096 read = 0;
1097 aIn.seek(s.begin);
1098 aLin = new LineReader(aIn, getConf());
1099 while (read + s.begin < s.end) {
1100 int tmp = aLin.readLine(line);
1101 read += tmp;
1102 String lineFeed = line.toString();
1103 String[] parsed = lineFeed.split(" ");
1104 parsed[0] = decodeFileName(parsed[0]);
1105 archive.put(new Path(parsed[0]), new HarStatus(lineFeed));
1106 line.clear();
1107 }
1108 }
1109 } finally {
1110 IOUtils.cleanup(LOG, aIn);
1111 }
1112 }
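
// The _masterindex layout parsed above, with illustrative content: the first line
// holds the version (e.g. "3"), and every subsequent line is
//   <startHash> <endHash> <beginOffset> <endOffset>
// where the two offsets delimit the byte range of _index that contains the entries
// whose path hashes fall in [startHash, endHash].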
1113 }
1114
1115 /*
1116 * testing purposes only:
1117 */
1118 HarMetaData getMetadata() {
1119 return metadata;
1120 }
1121
1122 private static class LruCache<K, V> extends LinkedHashMap<K, V> {
1123 private final int MAX_ENTRIES;
1124
1125 public LruCache(int maxEntries) {
1126 super(maxEntries + 1, 1.0f, true);
1127 MAX_ENTRIES = maxEntries;
1128 }
1129
1130 @Override
1131 protected boolean removeEldestEntry(Map.Entry<K, V> eldest) {
1132 return size() > MAX_ENTRIES;
1133 }
1134 }
1135 }