001 /**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements. See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership. The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License. You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018 package org.apache.hadoop.fs;
019
020 import java.io.FileNotFoundException;
021 import java.io.IOException;
022 import java.io.UnsupportedEncodingException;
023 import java.net.URI;
024 import java.net.URISyntaxException;
025 import java.net.URLDecoder;
026 import java.util.ArrayList;
027 import java.util.List;
028 import java.util.Map;
029 import java.util.TreeMap;
030 import java.util.HashMap;
031 import java.util.concurrent.ConcurrentHashMap;
032
033 import org.apache.hadoop.conf.Configuration;
034 import org.apache.hadoop.fs.permission.FsPermission;
035 import org.apache.hadoop.io.Text;
036 import org.apache.hadoop.util.LineReader;
037 import org.apache.hadoop.util.Progressable;
038
039 /**
* This is an implementation of the Hadoop Archive FileSystem.
* A Hadoop archive stores its metadata in index files named
* _masterindex and _index, and its data in files named part-*.
* The _index file holds one entry per archived file and is sorted
* by the hash code of the paths it contains. The _masterindex file
* is a level of indirection into _index: it maps ranges of hash
* codes to byte offsets in _index, which makes lookups faster.
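*
* A minimal usage sketch (the namenode address and archive path below are
* hypothetical, and the har scheme is assumed to be mapped to this class
* in the configuration):
* <pre>
*   Configuration conf = new Configuration();
*   Path p = new Path("har://hdfs-namenode:8020/user/foo/data.har/dir/file.txt");
*   FileSystem harFs = p.getFileSystem(conf);
*   FSDataInputStream in = harFs.open(p);
* </pre>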
050 */
051
052 public class HarFileSystem extends FilterFileSystem {
053 public static final int VERSION = 3;
054
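// Cache of parsed archive metadata, keyed by the archive URI and shared by
// all HarFileSystem instances in this JVM; entries are refreshed in
// initialize() when the index files' modification times change.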
055 private static final Map<URI, HarMetaData> harMetaCache =
056 new ConcurrentHashMap<URI, HarMetaData>();
057
058 // uri representation of this Har filesystem
059 private URI uri;
060 // the top level path of the archive
061 // in the underlying file system
062 private Path archivePath;
063 // the har auth
064 private String harAuth;
065
066 // pointer into the static metadata cache
067 private HarMetaData metadata;
068
069 /**
* Public no-argument constructor. The instance must be initialized with
* {@link #initialize(URI, Configuration)} before it can be used.
072 */
073 public HarFileSystem() {
074 }
075
076 /**
077 * Constructor to create a HarFileSystem with an
078 * underlying filesystem.
* @param fs the underlying filesystem
080 */
081 public HarFileSystem(FileSystem fs) {
082 super(fs);
083 }
084
085 /**
* Initialize a HarFileSystem for a single har archive. The archive home
* directory is the top-level directory in the underlying filesystem that
* contains the HAR archive. Be careful with this method: you do not want
* to create a new FileSystem instance on every call to path.getFileSystem().
* The URI of a har path is either
* har://underlyingfsscheme-host:port/archivepath
* or
* har:///archivepath, in which case the default filesystem from the
* configuration is used as the underlying filesystem.
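*
* For example, har://hdfs-namenode:8020/user/foo/data.har (hypothetical
* cluster and path) names the archive /user/foo/data.har stored on
* hdfs://namenode:8020.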
097 */
098 public void initialize(URI name, Configuration conf) throws IOException {
099 // decode the name
100 URI underLyingURI = decodeHarURI(name, conf);
101 // we got the right har Path- now check if this is
102 // truly a har filesystem
103 Path harPath = archivePath(
104 new Path(name.getScheme(), name.getAuthority(), name.getPath()));
105 if (harPath == null) {
106 throw new IOException("Invalid path for the Har Filesystem. " +
107 name.toString());
108 }
109 if (fs == null) {
110 fs = FileSystem.get(underLyingURI, conf);
111 }
112 uri = harPath.toUri();
113 archivePath = new Path(uri.getPath());
114 harAuth = getHarAuth(underLyingURI);
115 //check for the underlying fs containing
116 // the index file
117 Path masterIndexPath = new Path(archivePath, "_masterindex");
118 Path archiveIndexPath = new Path(archivePath, "_index");
119 if (!fs.exists(masterIndexPath) || !fs.exists(archiveIndexPath)) {
120 throw new IOException("Invalid path for the Har Filesystem. " +
121 "No index file in " + harPath);
122 }
123
124 metadata = harMetaCache.get(uri);
125 if (metadata != null) {
126 FileStatus mStat = fs.getFileStatus(masterIndexPath);
127 FileStatus aStat = fs.getFileStatus(archiveIndexPath);
128 if (mStat.getModificationTime() != metadata.getMasterIndexTimestamp() ||
129 aStat.getModificationTime() != metadata.getArchiveIndexTimestamp()) {
130 // the archive has been overwritten since we last read it
131 // remove the entry from the meta data cache
132 metadata = null;
133 harMetaCache.remove(uri);
134 }
135 }
136 if (metadata == null) {
137 metadata = new HarMetaData(fs, masterIndexPath, archiveIndexPath);
138 metadata.parseMetaData();
139 harMetaCache.put(uri, metadata);
140 }
141 }
142
// Returns the version of the archive, as recorded in the _masterindex
// file and validated against HarFileSystem.VERSION when the metadata
// was parsed.
146 public int getHarVersion() throws IOException {
147 if (metadata != null) {
148 return metadata.getVersion();
149 }
150 else {
151 throw new IOException("Invalid meta data for the Har Filesystem");
152 }
153 }
154
/*
* Find the ancestor of p (possibly p itself) that is the archive root:
* the deepest path prefix whose last component ends with ".har".
* For example, /user/foo/bar.har/dir/file yields /user/foo/bar.har.
* Returns null if no such prefix exists.
*/
161 private Path archivePath(Path p) {
162 Path retPath = null;
163 Path tmp = p;
164 for (int i=0; i< p.depth(); i++) {
165 if (tmp.toString().endsWith(".har")) {
166 retPath = tmp;
167 break;
168 }
169 tmp = tmp.getParent();
170 }
171 return retPath;
172 }
173
174 /**
* Decode the raw har:// URI to get the URI of the underlying filesystem.
* @param rawURI raw HAR URI
* @param conf configuration used to resolve the default filesystem when
* the har URI carries no authority
* @return URI of the underlying filesystem
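*
* For example (hostnames are hypothetical),
* har://hdfs-namenode:8020/foo.har decodes to hdfs://namenode:8020/foo.har,
* while har:///foo.har decodes to the default filesystem URI from the
* configuration.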
178 */
179 private URI decodeHarURI(URI rawURI, Configuration conf) throws IOException {
180 String tmpAuth = rawURI.getAuthority();
// no authority given: fall back to the default filesystem configured
// in conf and return its URI
if (tmpAuth == null) {
187 return FileSystem.getDefaultUri(conf);
188 }
189 String host = rawURI.getHost();
190 if (host == null) {
191 throw new IOException("URI: " + rawURI
192 + " is an invalid Har URI since host==null."
193 + " Expecting har://<scheme>-<host>/<path>.");
194 }
195 int i = host.indexOf('-');
196 if (i < 0) {
197 throw new IOException("URI: " + rawURI
198 + " is an invalid Har URI since '-' not found."
199 + " Expecting har://<scheme>-<host>/<path>.");
200 }
201 final String underLyingScheme = host.substring(0, i);
202 i++;
203 final String underLyingHost = i == host.length()? null: host.substring(i);
204 int underLyingPort = rawURI.getPort();
205 String auth = (underLyingHost == null && underLyingPort == -1)?
206 null:(underLyingHost+":"+underLyingPort);
207 URI tmp = null;
208 if (rawURI.getQuery() != null) {
209 // query component not allowed
210 throw new IOException("query component in Path not supported " + rawURI);
211 }
212 try {
213 tmp = new URI(underLyingScheme, auth, rawURI.getPath(),
214 rawURI.getQuery(), rawURI.getFragment());
215 } catch (URISyntaxException e) {
// should not happen: the components come from a URI that already parsed
217 }
218 return tmp;
219 }
220
221 private static String decodeString(String str)
222 throws UnsupportedEncodingException {
223 return URLDecoder.decode(str, "UTF-8");
224 }
225
226 private String decodeFileName(String fname)
227 throws UnsupportedEncodingException {
228 int version = metadata.getVersion();
229 if (version == 2 || version == 3){
230 return decodeString(fname);
231 }
232 return fname;
233 }
234
235 /**
* Return the top-level archive path, which also serves as the working directory.
237 */
238 public Path getWorkingDirectory() {
239 return new Path(uri.toString());
240 }
241
242 /**
* Create a har-specific authority of the form
* underlyingscheme-host:port.
* @param underLyingUri the URI of the underlying filesystem
* @return har-specific authority
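*
* For example, hdfs://namenode:8020 (hypothetical) yields the authority
* "hdfs-namenode:8020", and file:/// yields "file-:".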
248 */
249 private String getHarAuth(URI underLyingUri) {
250 String auth = underLyingUri.getScheme() + "-";
251 if (underLyingUri.getHost() != null) {
252 auth += underLyingUri.getHost() + ":";
253 if (underLyingUri.getPort() != -1) {
254 auth += underLyingUri.getPort();
255 }
256 }
257 else {
258 auth += ":";
259 }
260 return auth;
261 }
262
263 /**
264 * Returns the uri of this filesystem.
265 * The uri is of the form
* har://underlyingfsscheme-host:port/pathintheunderlyingfs
267 */
268 @Override
269 public URI getUri() {
270 return this.uri;
271 }
272
273 /**
* Returns the path inside the har filesystem, i.e. the path relative
* to the archive root.
* @param path the fully qualified path in the har filesystem
* @return the path relative to the archive root, or null if the path
* is outside the archive
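*
* For example, if the archive root is /user/foo/data.har (hypothetical),
* /user/foo/data.har/dir/file maps to /dir/file and /user/foo/data.har
* itself maps to /.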
280 */
281 private Path getPathInHar(Path path) {
282 Path harPath = new Path(path.toUri().getPath());
283 if (archivePath.compareTo(harPath) == 0)
284 return new Path(Path.SEPARATOR);
285 Path tmp = new Path(harPath.getName());
286 Path parent = harPath.getParent();
287 while (!(parent.compareTo(archivePath) == 0)) {
288 if (parent.toString().equals(Path.SEPARATOR)) {
289 tmp = null;
290 break;
291 }
292 tmp = new Path(parent.getName(), tmp);
293 parent = parent.getParent();
294 }
295 if (tmp != null)
296 tmp = new Path(Path.SEPARATOR, tmp);
297 return tmp;
298 }
299
// Resolve the in-archive path p (which is rooted at "/") against the
// har filesystem path given by initial, using the Path API rather than
// string manipulation.
304 private Path makeRelative(String initial, Path p) {
305 String scheme = this.uri.getScheme();
306 String authority = this.uri.getAuthority();
307 Path root = new Path(Path.SEPARATOR);
308 if (root.compareTo(p) == 0)
309 return new Path(scheme, authority, initial);
310 Path retPath = new Path(p.getName());
311 Path parent = p.getParent();
312 for (int i=0; i < p.depth()-1; i++) {
313 retPath = new Path(parent.getName(), retPath);
314 parent = parent.getParent();
315 }
316 return new Path(new Path(scheme, authority, initial),
317 retPath.toString());
318 }
319
320 /* this makes a path qualified in the har filesystem
321 * (non-Javadoc)
322 * @see org.apache.hadoop.fs.FilterFileSystem#makeQualified(
323 * org.apache.hadoop.fs.Path)
324 */
325 @Override
326 public Path makeQualified(Path path) {
327 // make sure that we just get the
328 // path component
329 Path fsPath = path;
330 if (!path.isAbsolute()) {
331 fsPath = new Path(archivePath, path);
332 }
333
334 URI tmpURI = fsPath.toUri();
335 //change this to Har uri
336 return new Path(uri.getScheme(), harAuth, tmpURI.getPath());
337 }
338
339 /**
340 * Fix offset and length of block locations.
341 * Note that this method modifies the original array.
342 * @param locations block locations of har part file
343 * @param start the start of the desired range in the contained file
344 * @param len the length of the desired range
345 * @param fileOffsetInHar the offset of the desired file in the har part file
346 * @return block locations with fixed offset and length
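*
* For example (numbers chosen purely for illustration): with
* fileOffsetInHar = 1000, a part-file block at offset 512 with length
* 1024 covers bytes [-488, 536) of the archived file; for a request
* with start = 0 and len = 500 it is rewritten to offset 0 and
* length 500.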
347 */
348 static BlockLocation[] fixBlockLocations(BlockLocation[] locations,
349 long start,
350 long len,
351 long fileOffsetInHar) {
352 // offset 1 past last byte of desired range
353 long end = start + len;
354
355 for (BlockLocation location : locations) {
356 // offset of part block relative to beginning of desired file
357 // (may be negative if file starts in this part block)
358 long harBlockStart = location.getOffset() - fileOffsetInHar;
359 // offset 1 past last byte of har block relative to beginning of
360 // desired file
361 long harBlockEnd = harBlockStart + location.getLength();
362
363 if (start > harBlockStart) {
364 // desired range starts after beginning of this har block
365 // fix offset to beginning of relevant range (relative to desired file)
366 location.setOffset(start);
367 // fix length to relevant portion of har block
368 location.setLength(location.getLength() - (start - harBlockStart));
369 } else {
370 // desired range includes beginning of this har block
371 location.setOffset(harBlockStart);
372 }
373
374 if (harBlockEnd > end) {
375 // range ends before end of this har block
376 // fix length to remove irrelevant portion at the end
377 location.setLength(location.getLength() - (harBlockEnd - end));
378 }
379 }
380
381 return locations;
382 }
383
384 /**
385 * Get block locations from the underlying fs and fix their
386 * offsets and lengths.
387 * @param file the input filestatus to get block locations
388 * @param start the start of the desired range in the contained file
389 * @param len the length of the desired range
390 * @return block locations for this segment of file
391 * @throws IOException
392 */
393 @Override
394 public BlockLocation[] getFileBlockLocations(FileStatus file, long start,
395 long len) throws IOException {
396 HarStatus hstatus = getFileHarStatus(file.getPath());
397 Path partPath = new Path(archivePath, hstatus.getPartName());
398 FileStatus partStatus = metadata.getPartFileStatus(partPath);
399
400 // get all part blocks that overlap with the desired file blocks
401 BlockLocation[] locations =
402 fs.getFileBlockLocations(partStatus,
403 hstatus.getStartIndex() + start, len);
404
405 return fixBlockLocations(locations, start, len, hstatus.getStartIndex());
406 }
407
408 /**
* The hash code of the path p inside the har filesystem.
411 * @param p the path in the harfilesystem
412 * @return the hash code of the path.
413 */
414 public static int getHarHash(Path p) {
415 return (p.toString().hashCode() & 0x7fffffff);
416 }
417
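/**
* One entry of the _masterindex file: the byte range [begin, end) of the
* _index file that holds the entries whose path hash codes fall between
* startHash and endHash.
*/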
418 static class Store {
419 public Store() {
420 begin = end = startHash = endHash = 0;
421 }
422 public Store(long begin, long end, int startHash, int endHash) {
423 this.begin = begin;
424 this.end = end;
425 this.startHash = startHash;
426 this.endHash = endHash;
427 }
428 public long begin;
429 public long end;
430 public int startHash;
431 public int endHash;
432 }
433
434 /**
* Get the FileStatuses of all the children of a given directory by
* scanning every entry in the parsed archive index. This is a brute-force
* way of getting all such FileStatuses.
*
* @param parent
* the parent directory's HarStatus
* @param statuses
* the list to add the children's FileStatuses to
* @param children
* the string list of children for this parent (currently unused)
447 */
448 private void fileStatusesInIndex(HarStatus parent, List<FileStatus> statuses,
449 List<String> children) throws IOException {
450 String parentString = parent.getName();
451 if (!parentString.endsWith(Path.SEPARATOR)){
452 parentString += Path.SEPARATOR;
453 }
454 Path harPath = new Path(parentString);
455 int harlen = harPath.depth();
456 final Map<String, FileStatus> cache = new TreeMap<String, FileStatus>();
457
458 for (HarStatus hstatus : metadata.archive.values()) {
459 String child = hstatus.getName();
460 if ((child.startsWith(parentString))) {
461 Path thisPath = new Path(child);
462 if (thisPath.depth() == harlen + 1) {
463 statuses.add(toFileStatus(hstatus, cache));
464 }
465 }
466 }
467 }
468
469 /**
470 * Combine the status stored in the index and the underlying status.
471 * @param h status stored in the index
472 * @param cache caching the underlying file statuses
473 * @return the combined file status
474 * @throws IOException
475 */
476 private FileStatus toFileStatus(HarStatus h,
477 Map<String, FileStatus> cache) throws IOException {
478 FileStatus underlying = null;
479 if (cache != null) {
480 underlying = cache.get(h.partName);
481 }
482 if (underlying == null) {
483 final Path p = h.isDir? archivePath: new Path(archivePath, h.partName);
484 underlying = fs.getFileStatus(p);
485 if (cache != null) {
486 cache.put(h.partName, underlying);
487 }
488 }
489
490 long modTime = 0;
491 int version = metadata.getVersion();
492 if (version < 3) {
493 modTime = underlying.getModificationTime();
494 } else if (version == 3) {
495 modTime = h.getModificationTime();
496 }
497
498 return new FileStatus(
499 h.isDir()? 0L: h.getLength(),
500 h.isDir(),
501 underlying.getReplication(),
502 underlying.getBlockSize(),
503 modTime,
504 underlying.getAccessTime(),
505 underlying.getPermission(),
506 underlying.getOwner(),
507 underlying.getGroup(),
508 makeRelative(this.uri.getPath(), new Path(h.name)));
509 }
510
// Parser for a single line of the _index file. Each line has the form
// filename "dir"/"file" partFileName startIndex length
// <space separated children>
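// An illustrative (made-up) directory entry and file entry:
// /dir dir <encoded-properties> 0 0 child1 child2
// /dir/file file part-0 0 1024 <encoded-properties>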
516 private class HarStatus {
517 boolean isDir;
518 String name;
519 List<String> children;
520 String partName;
521 long startIndex;
522 long length;
523 long modificationTime = 0;
524
525 public HarStatus(String harString) throws UnsupportedEncodingException {
526 String[] splits = harString.split(" ");
527 this.name = decodeFileName(splits[0]);
this.isDir = "dir".equals(splits[1]);
// this is equal to "none" if it is a directory
530 this.partName = splits[2];
531 this.startIndex = Long.parseLong(splits[3]);
532 this.length = Long.parseLong(splits[4]);
533
534 int version = metadata.getVersion();
535 String[] propSplits = null;
// propSplits is used to retrieve the metainformation that Har versions
// 1 & 2 did not store (modification time, permission, owner and group).
538 // These fields are stored in an encoded string placed in different
539 // locations depending on whether it's a file or directory entry.
540 // If it's a directory, the string will be placed at the partName
541 // location (directories have no partName because they don't have data
542 // to be stored). This is done because the number of fields in a
543 // directory entry is unbounded (all children are listed at the end)
544 // If it's a file, the string will be the last field.
545 if (isDir) {
546 if (version == 3){
547 propSplits = decodeString(this.partName).split(" ");
548 }
549 children = new ArrayList<String>();
550 for (int i = 5; i < splits.length; i++) {
551 children.add(decodeFileName(splits[i]));
552 }
553 } else if (version == 3) {
554 propSplits = decodeString(splits[5]).split(" ");
555 }
556
557 if (propSplits != null && propSplits.length >= 4) {
558 modificationTime = Long.parseLong(propSplits[0]);
559 // the fields below are stored in the file but are currently not used
560 // by HarFileSystem
561 // permission = new FsPermission(Short.parseShort(propSplits[1]));
562 // owner = decodeString(propSplits[2]);
563 // group = decodeString(propSplits[3]);
564 }
565 }
566 public boolean isDir() {
567 return isDir;
568 }
569
570 public String getName() {
571 return name;
572 }
573
574 public List<String> getChildren() {
575 return children;
576 }
577 public String getFileName() {
578 return name;
579 }
580 public String getPartName() {
581 return partName;
582 }
583 public long getStartIndex() {
584 return startIndex;
585 }
586 public long getLength() {
587 return length;
588 }
589 public long getModificationTime() {
590 return modificationTime;
591 }
592 }
593
594 /**
* Return the FileStatus of a file in the har archive.
* The permissions returned are those of the underlying archive files
* (per-file permissions are not persisted when a hadoop archive is
* created).
599 * @param f the path in har filesystem
600 * @return filestatus.
601 * @throws IOException
602 */
603 @Override
604 public FileStatus getFileStatus(Path f) throws IOException {
605 HarStatus hstatus = getFileHarStatus(f);
606 return toFileStatus(hstatus, null);
607 }
608
609 private HarStatus getFileHarStatus(Path f) throws IOException {
610 // get the fs DataInputStream for the underlying file
611 // look up the index.
612 Path p = makeQualified(f);
613 Path harPath = getPathInHar(p);
614 if (harPath == null) {
615 throw new IOException("Invalid file name: " + f + " in " + uri);
616 }
617 HarStatus hstatus = metadata.archive.get(harPath);
618 if (hstatus == null) {
619 throw new FileNotFoundException("File: " + f + " does not exist in " + uri);
620 }
621 return hstatus;
622 }
623
624 /**
625 * @return null since no checksum algorithm is implemented.
626 */
627 public FileChecksum getFileChecksum(Path f) {
628 return null;
629 }
630
631 /**
* Returns a har input stream that fakes end-of-file at the end of the
* archived file's byte range. The index is consulted for the part file
* name and the start offset and length of the file within it.
635 */
636 @Override
637 public FSDataInputStream open(Path f, int bufferSize) throws IOException {
638 // get the fs DataInputStream for the underlying file
639 HarStatus hstatus = getFileHarStatus(f);
// found the entry in the index
641 if (hstatus.isDir()) {
642 throw new FileNotFoundException(f + " : not a file in " +
643 archivePath);
644 }
645 return new HarFSDataInputStream(fs, new Path(archivePath,
646 hstatus.getPartName()),
647 hstatus.getStartIndex(), hstatus.getLength(), bufferSize);
648 }
649
650 /*
* create throws an exception in the Har filesystem.
* An archive, once created, cannot be changed.
653 */
654 public FSDataOutputStream create(Path f, int bufferSize)
655 throws IOException {
656 throw new IOException("Har: Create not allowed");
657 }
658
659 public FSDataOutputStream create(Path f,
660 FsPermission permission,
661 boolean overwrite,
662 int bufferSize,
663 short replication,
664 long blockSize,
665 Progressable progress) throws IOException {
666 throw new IOException("Har: create not allowed.");
667 }
668
669 @Override
670 public void close() throws IOException {
671 if (fs != null) {
672 try {
673 fs.close();
674 } catch(IOException ie) {
675 //this might already be closed
676 // ignore
677 }
678 }
679 }
680
681 /**
682 * Not implemented.
683 */
684 @Override
685 public boolean setReplication(Path src, short replication) throws IOException{
686 throw new IOException("Har: setreplication not allowed");
687 }
688
689 /**
690 * Not implemented.
691 */
692 @Override
693 public boolean delete(Path f, boolean recursive) throws IOException {
694 throw new IOException("Har: delete not allowed");
695 }
696
697 /**
* listStatus returns the children of a directory
* after looking them up in the archive index.
700 */
701 @Override
702 public FileStatus[] listStatus(Path f) throws IOException {
// look the path up in the parsed index and synthesize FileStatus
// objects for its children to return to the client
707 List<FileStatus> statuses = new ArrayList<FileStatus>();
708 Path tmpPath = makeQualified(f);
709 Path harPath = getPathInHar(tmpPath);
710 HarStatus hstatus = metadata.archive.get(harPath);
711 if (hstatus == null) {
712 throw new FileNotFoundException("File " + f + " not found in " + archivePath);
713 }
714 if (hstatus.isDir()) {
715 fileStatusesInIndex(hstatus, statuses, hstatus.children);
716 } else {
717 statuses.add(toFileStatus(hstatus, null));
718 }
719
720 return statuses.toArray(new FileStatus[statuses.size()]);
721 }
722
723 /**
724 * return the top level archive path.
725 */
726 public Path getHomeDirectory() {
727 return new Path(uri.toString());
728 }
729
730 public void setWorkingDirectory(Path newDir) {
731 //does nothing.
732 }
733
734 /**
735 * not implemented.
736 */
737 public boolean mkdirs(Path f, FsPermission permission) throws IOException {
738 throw new IOException("Har: mkdirs not allowed");
739 }
740
741 /**
742 * not implemented.
743 */
744 public void copyFromLocalFile(boolean delSrc, Path src, Path dst) throws
745 IOException {
746 throw new IOException("Har: copyfromlocalfile not allowed");
747 }
748
749 /**
750 * copies the file in the har filesystem to a local file.
751 */
752 public void copyToLocalFile(boolean delSrc, Path src, Path dst)
753 throws IOException {
754 FileUtil.copy(this, src, getLocal(getConf()), dst, false, getConf());
755 }
756
757 /**
758 * not implemented.
759 */
760 public Path startLocalOutput(Path fsOutputFile, Path tmpLocalFile)
761 throws IOException {
762 throw new IOException("Har: startLocalOutput not allowed");
763 }
764
765 /**
766 * not implemented.
767 */
768 public void completeLocalOutput(Path fsOutputFile, Path tmpLocalFile)
769 throws IOException {
770 throw new IOException("Har: completeLocalOutput not allowed");
771 }
772
773 /**
774 * not implemented.
775 */
776 public void setOwner(Path p, String username, String groupname)
777 throws IOException {
778 throw new IOException("Har: setowner not allowed");
779 }
780
781 /**
782 * Not implemented.
783 */
784 public void setPermission(Path p, FsPermission permisssion)
785 throws IOException {
786 throw new IOException("Har: setPermission not allowed");
787 }
788
789 /**
* Hadoop archives input stream. This input stream fakes EOF
* since an archived file is only a slice of a larger part file.
792 */
793 private static class HarFSDataInputStream extends FSDataInputStream {
794 /**
795 * Create an input stream that fakes all the reads/positions/seeking.
796 */
797 private static class HarFsInputStream extends FSInputStream {
798 private long position, start, end;
799 //The underlying data input stream that the
800 // underlying filesystem will return.
801 private FSDataInputStream underLyingStream;
802 //one byte buffer
803 private byte[] oneBytebuff = new byte[1];
804 HarFsInputStream(FileSystem fs, Path path, long start,
805 long length, int bufferSize) throws IOException {
806 underLyingStream = fs.open(path, bufferSize);
807 underLyingStream.seek(start);
808 // the start of this file in the part file
809 this.start = start;
810 // the position pointer in the part file
811 this.position = start;
812 // the end pointer in the part file
813 this.end = start + length;
814 }
815
816 public synchronized int available() throws IOException {
817 long remaining = end - underLyingStream.getPos();
818 if (remaining > (long)Integer.MAX_VALUE) {
819 return Integer.MAX_VALUE;
820 }
821 return (int) remaining;
822 }
823
824 public synchronized void close() throws IOException {
825 underLyingStream.close();
826 super.close();
827 }
828
829 //not implemented
830 @Override
831 public void mark(int readLimit) {
832 // do nothing
833 }
834
835 /**
836 * reset is not implemented
837 */
838 public void reset() throws IOException {
839 throw new IOException("reset not implemented.");
840 }
841
842 public synchronized int read() throws IOException {
843 int ret = read(oneBytebuff, 0, 1);
844 return (ret <= 0) ? -1: (oneBytebuff[0] & 0xff);
845 }
846
847 public synchronized int read(byte[] b) throws IOException {
// the three-argument read already advances position, so it must not be
// added again here
return read(b, 0, b.length);
853 }
854
/**
* Read up to len bytes into b, never reading past the logical end of
* the archived file.
*/
858 public synchronized int read(byte[] b, int offset, int len)
859 throws IOException {
860 int newlen = len;
861 int ret = -1;
862 if (position + len > end) {
863 newlen = (int) (end - position);
864 }
865 // end case
866 if (newlen == 0)
867 return ret;
ret = underLyingStream.read(b, offset, newlen);
// guard against a -1 return from the underlying stream so that
// position is never moved backwards
if (ret > 0) {
position += ret;
}
870 return ret;
871 }
872
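/**
* Skip forward at most n bytes, never past the logical end of the
* archived file; returns the number of bytes actually skipped
* (-1 if n is negative).
*/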
873 public synchronized long skip(long n) throws IOException {
874 long tmpN = n;
875 if (tmpN > 0) {
876 if (position + tmpN > end) {
877 tmpN = end - position;
878 }
879 underLyingStream.seek(tmpN + position);
880 position += tmpN;
881 return tmpN;
882 }
883 return (tmpN < 0)? -1 : 0;
884 }
885
886 public synchronized long getPos() throws IOException {
887 return (position - start);
888 }
889
890 public synchronized void seek(long pos) throws IOException {
891 if (pos < 0 || (start + pos > end)) {
892 throw new IOException("Failed to seek: EOF");
893 }
894 position = start + pos;
895 underLyingStream.seek(position);
896 }
897
898 public boolean seekToNewSource(long targetPos) throws IOException {
899 //do not need to implement this
900 // hdfs in itself does seektonewsource
901 // while reading.
902 return false;
903 }
904
905 /**
* Positioned read: read up to length bytes starting at offset pos of the
* archived file, never reading past its end.
907 */
908 public int read(long pos, byte[] b, int offset, int length)
909 throws IOException {
910 int nlength = length;
911 if (start + nlength + pos > end) {
912 nlength = (int) (end - (start + pos));
913 }
914 return underLyingStream.read(pos + start , b, offset, nlength);
915 }
916
917 /**
* Positioned readFully: fails if the requested range extends past the
* end of the archived file.
919 */
920 public void readFully(long pos, byte[] b, int offset, int length)
921 throws IOException {
922 if (start + length + pos > end) {
923 throw new IOException("Not enough bytes to read.");
924 }
925 underLyingStream.readFully(pos + start, b, offset, length);
926 }
927
928 public void readFully(long pos, byte[] b) throws IOException {
929 readFully(pos, b, 0, b.length);
930 }
931
932 }
933
934 /**
* Constructor for a har input stream.
936 * @param fs the underlying filesystem
937 * @param p The path in the underlying filesystem
938 * @param start the start position in the part file
939 * @param length the length of valid data in the part file
940 * @param bufsize the buffer size
941 * @throws IOException
942 */
943 public HarFSDataInputStream(FileSystem fs, Path p, long start,
944 long length, int bufsize) throws IOException {
945 super(new HarFsInputStream(fs, p, start, length, bufsize));
946 }
947
948 /**
949 * constructor for har input stream.
950 * @param fs the underlying filesystem
951 * @param p the path in the underlying file system
952 * @param start the start position in the part file
953 * @param length the length of valid data in the part file.
954 * @throws IOException
955 */
956 public HarFSDataInputStream(FileSystem fs, Path p, long start, long length)
957 throws IOException {
958 super(new HarFsInputStream(fs, p, start, length, 0));
959 }
960 }
961
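/**
* Parsed metadata of a single har archive: the index version, the
* hash-range stores read from _masterindex, the per-path HarStatus
* entries read from _index, and cached FileStatus objects for the part
* files.
*/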
962 private class HarMetaData {
963 private FileSystem fs;
964 private int version;
965 // the masterIndex of the archive
966 private Path masterIndexPath;
967 // the index file
968 private Path archiveIndexPath;
969
970 private long masterIndexTimestamp;
971 private long archiveIndexTimestamp;
972
973 List<Store> stores = new ArrayList<Store>();
974 Map<Path, HarStatus> archive = new HashMap<Path, HarStatus>();
975 private Map<Path, FileStatus> partFileStatuses = new HashMap<Path, FileStatus>();
976
977 public HarMetaData(FileSystem fs, Path masterIndexPath, Path archiveIndexPath) {
978 this.fs = fs;
979 this.masterIndexPath = masterIndexPath;
980 this.archiveIndexPath = archiveIndexPath;
981 }
982
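/**
* Return the FileStatus of a part file, consulting the underlying
* filesystem only on the first request and caching the result.
*/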
983 public FileStatus getPartFileStatus(Path partPath) throws IOException {
984 FileStatus status;
985 status = partFileStatuses.get(partPath);
986 if (status == null) {
987 status = fs.getFileStatus(partPath);
988 partFileStatuses.put(partPath, status);
989 }
990 return status;
991 }
992
993 public long getMasterIndexTimestamp() {
994 return masterIndexTimestamp;
995 }
996
997 public long getArchiveIndexTimestamp() {
998 return archiveIndexTimestamp;
999 }
1000
1001 private int getVersion() {
1002 return version;
1003 }
1004
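/**
* Parse the _masterindex and _index files. The first line of _masterindex
* carries the archive version; each following line carries
* "startHash endHash beginOffset endOffset", where the offsets delimit
* the section of _index that covers that hash range. Each line of _index
* is one HarStatus entry describing an archived file or directory.
*/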
1005 private void parseMetaData() throws IOException {
1006 FSDataInputStream in = fs.open(masterIndexPath);
1007 FileStatus masterStat = fs.getFileStatus(masterIndexPath);
1008 masterIndexTimestamp = masterStat.getModificationTime();
1009 LineReader lin = new LineReader(in, getConf());
1010 Text line = new Text();
1011 long read = lin.readLine(line);
1012
1013 // the first line contains the version of the index file
1014 String versionLine = line.toString();
1015 String[] arr = versionLine.split(" ");
1016 version = Integer.parseInt(arr[0]);
// stay backwards-compatible: older index versions are still readable,
// newer ones are rejected
1018 if (this.version > HarFileSystem.VERSION) {
1019 throw new IOException("Invalid version " +
1020 this.version + " expected " + HarFileSystem.VERSION);
1021 }
1022
// each remaining line contains a hashcode range and the begin/end
// offsets of the corresponding section of the _index file
1024 String[] readStr = null;
1025 while(read < masterStat.getLen()) {
1026 int b = lin.readLine(line);
1027 read += b;
1028 readStr = line.toString().split(" ");
1029 int startHash = Integer.parseInt(readStr[0]);
1030 int endHash = Integer.parseInt(readStr[1]);
1031 stores.add(new Store(Long.parseLong(readStr[2]),
1032 Long.parseLong(readStr[3]), startHash,
1033 endHash));
1034 line.clear();
1035 }
1036 try {
1037 // close the master index
1038 lin.close();
1039 } catch(IOException io){
1040 // do nothing just a read.
1041 }
1042
1043 FSDataInputStream aIn = fs.open(archiveIndexPath);
1044 FileStatus archiveStat = fs.getFileStatus(archiveIndexPath);
1045 archiveIndexTimestamp = archiveStat.getModificationTime();
1046 LineReader aLin;
1047
1048 // now start reading the real index file
1049 for (Store s: stores) {
1050 read = 0;
1051 aIn.seek(s.begin);
1052 aLin = new LineReader(aIn, getConf());
1053 while (read + s.begin < s.end) {
1054 int tmp = aLin.readLine(line);
1055 read += tmp;
1056 String lineFeed = line.toString();
1057 String[] parsed = lineFeed.split(" ");
1058 parsed[0] = decodeFileName(parsed[0]);
1059 archive.put(new Path(parsed[0]), new HarStatus(lineFeed));
1060 line.clear();
1061 }
1062 }
1063 try {
1064 // close the archive index
1065 aIn.close();
1066 } catch(IOException io) {
1067 // do nothing just a read.
1068 }
1069 }
1070 }
1071 }