001 /**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements. See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership. The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License. You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018 package org.apache.hadoop.fs;
019
020 import java.io.FileNotFoundException;
021 import java.io.IOException;
022 import java.io.UnsupportedEncodingException;
023 import java.net.URI;
024 import java.net.URISyntaxException;
025 import java.net.URLDecoder;
026 import java.util.ArrayList;
027 import java.util.EnumSet;
028 import java.util.List;
029 import java.util.Map;
030 import java.util.TreeMap;
031 import java.util.HashMap;
032
033 import org.apache.hadoop.conf.Configuration;
034 import org.apache.hadoop.fs.permission.FsPermission;
035 import org.apache.hadoop.io.Text;
036 import org.apache.hadoop.util.LineReader;
037 import org.apache.hadoop.util.Progressable;
038
/**
 * This is an implementation of the Hadoop Archive
 * Filesystem. An archive exposes index files of the form
 * _index* and data files of the form part-*. The index
 * files store the metadata of the archived files and come
 * in two forms: _masterindex and _index. The master index
 * is a level of indirection into the index file that makes
 * lookups faster: the index file is sorted by the hash codes
 * of the paths it contains, and the master index holds
 * pointers to the positions in the index for ranges of
 * hash codes.
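 *
 * <p>A minimal usage sketch (the cluster name and archive path below
 * are hypothetical); a HarFileSystem is obtained through the ordinary
 * FileSystem lookup and is read-only:
 * <pre>
 *   Configuration conf = new Configuration();
 *   Path root = new Path("har://hdfs-namenode:8020/user/foo/data.har");
 *   FileSystem harFs = root.getFileSystem(conf);
 *   for (FileStatus stat : harFs.listStatus(root)) {
 *     System.out.println(stat.getPath());
 *   }
 * </pre>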
050 */
051
052 public class HarFileSystem extends FilterFileSystem {
053 public static final int VERSION = 3;
054
055 private static final Map<URI, HarMetaData> harMetaCache = new HashMap<URI, HarMetaData>();
056
057 // uri representation of this Har filesystem
058 private URI uri;
059 // the top level path of the archive
060 // in the underlying file system
061 private Path archivePath;
062 // the har auth
063 private String harAuth;
064
065 // pointer into the static metadata cache
066 private HarMetaData metadata;
067
  /**
   * Public no-argument constructor; the filesystem is set up later
   * via {@link #initialize(URI, Configuration)}.
   */
072 public HarFileSystem() {
073 }
074
075 /**
076 * Constructor to create a HarFileSystem with an
077 * underlying filesystem.
   * @param fs the underlying filesystem
079 */
080 public HarFileSystem(FileSystem fs) {
081 super(fs);
082 }
083
084 /**
   * Initialize a Har filesystem per har archive. The
   * archive home directory is the top level directory
   * in the underlying filesystem that contains the HAR archive.
   * Be careful with this method: you do not want to end up
   * creating a new filesystem instance for every call to
   * path.getFileSystem().
   * The URI of a Har filesystem is either
   * har://underlyingfsscheme-host:port/archivepath
   * or
   * har:///archivepath, in which case the configured default
   * filesystem is used as the underlying filesystem.
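   *
   * <p>For illustration (host and path are hypothetical),
   * <pre>
   *   har://hdfs-namenode:8020/user/foo/data.har
   * </pre>
   * refers to the archive /user/foo/data.har stored in
   * hdfs://namenode:8020, while har:///user/foo/data.har resolves
   * the same path against the configured default filesystem.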
096 */
097 public void initialize(URI name, Configuration conf) throws IOException {
098 // decode the name
099 URI underLyingURI = decodeHarURI(name, conf);
100 // we got the right har Path- now check if this is
101 // truly a har filesystem
102 Path harPath = archivePath(
103 new Path(name.getScheme(), name.getAuthority(), name.getPath()));
104 if (harPath == null) {
105 throw new IOException("Invalid path for the Har Filesystem. " +
106 name.toString());
107 }
108 if (fs == null) {
109 fs = FileSystem.get(underLyingURI, conf);
110 }
111 uri = harPath.toUri();
112 archivePath = new Path(uri.getPath());
113 harAuth = getHarAuth(underLyingURI);
    // check that the underlying fs contains
    // the index files
116 Path masterIndexPath = new Path(archivePath, "_masterindex");
117 Path archiveIndexPath = new Path(archivePath, "_index");
118 if (!fs.exists(masterIndexPath) || !fs.exists(archiveIndexPath)) {
119 throw new IOException("Invalid path for the Har Filesystem. " +
120 "No index file in " + harPath);
121 }
122
123 metadata = harMetaCache.get(uri);
124 if (metadata != null) {
125 FileStatus mStat = fs.getFileStatus(masterIndexPath);
126 FileStatus aStat = fs.getFileStatus(archiveIndexPath);
127 if (mStat.getModificationTime() != metadata.getMasterIndexTimestamp() ||
128 aStat.getModificationTime() != metadata.getArchiveIndexTimestamp()) {
129 // the archive has been overwritten since we last read it
130 // remove the entry from the meta data cache
131 metadata = null;
132 harMetaCache.remove(uri);
133 }
134 }
135 if (metadata == null) {
136 metadata = new HarMetaData(fs, masterIndexPath, archiveIndexPath);
137 metadata.parseMetaData();
138 harMetaCache.put(uri, metadata);
139 }
140 }
141
142 // get the version of the filesystem from the masterindex file
  // the version is currently not useful since it's the first version
144 // of archives
145 public int getHarVersion() throws IOException {
146 if (metadata != null) {
147 return metadata.getVersion();
148 }
149 else {
150 throw new IOException("Invalid meta data for the Har Filesystem");
151 }
152 }
153
154 /*
   * Find the ancestor of the given path that is the archive
   * path, i.e. the deepest prefix whose last path segment
   * ends with .har; returns null if there is no such ancestor.
159 */
160 private Path archivePath(Path p) {
161 Path retPath = null;
162 Path tmp = p;
163 for (int i=0; i< p.depth(); i++) {
164 if (tmp.toString().endsWith(".har")) {
165 retPath = tmp;
166 break;
167 }
168 tmp = tmp.getParent();
169 }
170 return retPath;
171 }
172
173 /**
   * Decode the raw har URI to get the URI of the underlying filesystem.
   * @param rawURI raw har URI
   * @param conf the configuration used to look up the default filesystem
   *             when the har URI has no authority
   * @return URI of the underlying filesystem
177 */
178 private URI decodeHarURI(URI rawURI, Configuration conf) throws IOException {
179 String tmpAuth = rawURI.getAuthority();
    // no authority given: we are using the default filesystem in the
    // config, so return its URI as the underlying URI
    if (tmpAuth == null) {
      return FileSystem.getDefaultUri(conf);
187 }
188 String host = rawURI.getHost();
189 if (host == null) {
190 throw new IOException("URI: " + rawURI
191 + " is an invalid Har URI since host==null."
192 + " Expecting har://<scheme>-<host>/<path>.");
193 }
194 int i = host.indexOf('-');
195 if (i < 0) {
196 throw new IOException("URI: " + rawURI
197 + " is an invalid Har URI since '-' not found."
198 + " Expecting har://<scheme>-<host>/<path>.");
199 }
200 final String underLyingScheme = host.substring(0, i);
201 i++;
202 final String underLyingHost = i == host.length()? null: host.substring(i);
203 int underLyingPort = rawURI.getPort();
204 String auth = (underLyingHost == null && underLyingPort == -1)?
205 null:(underLyingHost+":"+underLyingPort);
206 URI tmp = null;
207 if (rawURI.getQuery() != null) {
208 // query component not allowed
209 throw new IOException("query component in Path not supported " + rawURI);
210 }
211 try {
212 tmp = new URI(underLyingScheme, auth, rawURI.getPath(),
213 rawURI.getQuery(), rawURI.getFragment());
214 } catch (URISyntaxException e) {
      // do nothing; this should not happen
216 }
217 return tmp;
218 }
219
220 private static String decodeString(String str)
221 throws UnsupportedEncodingException {
222 return URLDecoder.decode(str, "UTF-8");
223 }
224
225 private String decodeFileName(String fname)
226 throws UnsupportedEncodingException {
227 int version = metadata.getVersion();
228 if (version == 2 || version == 3){
229 return decodeString(fname);
230 }
231 return fname;
232 }
233
234 /**
   * Return the top level archive path as the working directory.
236 */
237 public Path getWorkingDirectory() {
238 return new Path(uri.toString());
239 }
240
241 /**
   * Create a har specific auth of the form
   * underlyingscheme-host:port
   * @param underLyingUri the URI of the underlying
   * filesystem
   * @return har specific auth
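   *
   * <p>For example (hypothetical), hdfs://namenode:8020 yields
   * the auth "hdfs-namenode:8020".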
247 */
248 private String getHarAuth(URI underLyingUri) {
249 String auth = underLyingUri.getScheme() + "-";
250 if (underLyingUri.getHost() != null) {
251 auth += underLyingUri.getHost() + ":";
252 if (underLyingUri.getPort() != -1) {
253 auth += underLyingUri.getPort();
254 }
255 }
256 else {
257 auth += ":";
258 }
259 return auth;
260 }
261
262 /**
263 * Returns the uri of this filesystem.
264 * The uri is of the form
   * har://underlyingfsscheme-host:port/pathintheunderlyingfs
266 */
267 @Override
268 public URI getUri() {
269 return this.uri;
270 }
271
272 /**
   * Returns the path inside the har filesystem, i.e. the
   * path relative to the archive root, or null if the given
   * path is not under the archive.
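   *
   * <p>For example (paths hypothetical): with an archive at
   * /user/foo/data.har, the qualified path
   * har://hdfs-namenode:8020/user/foo/data.har/dir/file maps to the
   * in-archive path /dir/file.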
277 * @param path the fully qualified path in the har filesystem.
   * @return the path relative to the archive root, or null.
279 */
280 private Path getPathInHar(Path path) {
281 Path harPath = new Path(path.toUri().getPath());
282 if (archivePath.compareTo(harPath) == 0)
283 return new Path(Path.SEPARATOR);
284 Path tmp = new Path(harPath.getName());
285 Path parent = harPath.getParent();
286 while (!(parent.compareTo(archivePath) == 0)) {
287 if (parent.toString().equals(Path.SEPARATOR)) {
288 tmp = null;
289 break;
290 }
291 tmp = new Path(parent.getName(), tmp);
292 parent = parent.getParent();
293 }
294 if (tmp != null)
295 tmp = new Path(Path.SEPARATOR, tmp);
296 return tmp;
297 }
298
  // Rebuild p relative to the given initial path, effectively
  // stripping the leading "/". Parsing and string manipulation
  // are error prone, so use the Path API to do it.
303 private Path makeRelative(String initial, Path p) {
304 String scheme = this.uri.getScheme();
305 String authority = this.uri.getAuthority();
306 Path root = new Path(Path.SEPARATOR);
307 if (root.compareTo(p) == 0)
308 return new Path(scheme, authority, initial);
309 Path retPath = new Path(p.getName());
310 Path parent = p.getParent();
311 for (int i=0; i < p.depth()-1; i++) {
312 retPath = new Path(parent.getName(), retPath);
313 parent = parent.getParent();
314 }
315 return new Path(new Path(scheme, authority, initial),
316 retPath.toString());
317 }
318
319 /* this makes a path qualified in the har filesystem
320 * (non-Javadoc)
321 * @see org.apache.hadoop.fs.FilterFileSystem#makeQualified(
322 * org.apache.hadoop.fs.Path)
323 */
324 @Override
325 public Path makeQualified(Path path) {
326 // make sure that we just get the
327 // path component
328 Path fsPath = path;
329 if (!path.isAbsolute()) {
330 fsPath = new Path(archivePath, path);
331 }
332
333 URI tmpURI = fsPath.toUri();
334 //change this to Har uri
335 return new Path(uri.getScheme(), harAuth, tmpURI.getPath());
336 }
337
338 /**
339 * Fix offset and length of block locations.
340 * Note that this method modifies the original array.
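   *
   * <p>Worked example with hypothetical numbers: for a file stored at
   * fileOffsetInHar = 1000 and a request with start = 0 and len = 500,
   * a part block covering part-file bytes [768, 1280) has
   * harBlockStart = -232 and harBlockEnd = 280, so its offset is reset
   * to 0 and its length trimmed to the 280 bytes that fall inside the
   * requested range.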
341 * @param locations block locations of har part file
342 * @param start the start of the desired range in the contained file
343 * @param len the length of the desired range
344 * @param fileOffsetInHar the offset of the desired file in the har part file
345 * @return block locations with fixed offset and length
346 */
347 static BlockLocation[] fixBlockLocations(BlockLocation[] locations,
348 long start,
349 long len,
350 long fileOffsetInHar) {
351 // offset 1 past last byte of desired range
352 long end = start + len;
353
354 for (BlockLocation location : locations) {
355 // offset of part block relative to beginning of desired file
356 // (may be negative if file starts in this part block)
357 long harBlockStart = location.getOffset() - fileOffsetInHar;
358 // offset 1 past last byte of har block relative to beginning of
359 // desired file
360 long harBlockEnd = harBlockStart + location.getLength();
361
362 if (start > harBlockStart) {
363 // desired range starts after beginning of this har block
364 // fix offset to beginning of relevant range (relative to desired file)
365 location.setOffset(start);
366 // fix length to relevant portion of har block
367 location.setLength(location.getLength() - (start - harBlockStart));
368 } else {
369 // desired range includes beginning of this har block
370 location.setOffset(harBlockStart);
371 }
372
373 if (harBlockEnd > end) {
374 // range ends before end of this har block
375 // fix length to remove irrelevant portion at the end
376 location.setLength(location.getLength() - (harBlockEnd - end));
377 }
378 }
379
380 return locations;
381 }
382
383 /**
384 * Get block locations from the underlying fs and fix their
385 * offsets and lengths.
386 * @param file the input filestatus to get block locations
387 * @param start the start of the desired range in the contained file
388 * @param len the length of the desired range
389 * @return block locations for this segment of file
390 * @throws IOException
391 */
392 @Override
393 public BlockLocation[] getFileBlockLocations(FileStatus file, long start,
394 long len) throws IOException {
395 HarStatus hstatus = getFileHarStatus(file.getPath());
396 Path partPath = new Path(archivePath, hstatus.getPartName());
397 FileStatus partStatus = metadata.getPartFileStatus(partPath);
398
399 // get all part blocks that overlap with the desired file blocks
400 BlockLocation[] locations =
401 fs.getFileBlockLocations(partStatus,
402 hstatus.getStartIndex() + start, len);
403
404 return fixBlockLocations(locations, start, len, hstatus.getStartIndex());
405 }
406
407 /**
   * the hash of the path p inside
   * the har filesystem
410 * @param p the path in the harfilesystem
411 * @return the hash code of the path.
412 */
413 public static int getHarHash(Path p) {
414 return (p.toString().hashCode() & 0x7fffffff);
415 }
416
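  // A master index entry: the byte range [begin, end) of the archive
  // index file that holds the entries whose path hashes fall in
  // [startHash, endHash].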
417 static class Store {
418 public Store() {
419 begin = end = startHash = endHash = 0;
420 }
421 public Store(long begin, long end, int startHash, int endHash) {
422 this.begin = begin;
423 this.end = end;
424 this.startHash = startHash;
425 this.endHash = endHash;
426 }
427 public long begin;
428 public long end;
429 public int startHash;
430 public int endHash;
431 }
432
433 /**
   * Get the filestatuses of all the children of a given directory. This
   * simply walks every entry of the parsed index and picks the ones whose
   * path is an immediate child of the parent. It's a brute force way of
   * getting all such filestatuses.
   *
   * @param parent
   *          the parent path directory
   * @param statuses
   *          the list to add the children filestatuses to
   * @param children
   *          the string list of children for this parent
446 */
447 private void fileStatusesInIndex(HarStatus parent, List<FileStatus> statuses,
448 List<String> children) throws IOException {
449 String parentString = parent.getName();
450 if (!parentString.endsWith(Path.SEPARATOR)){
451 parentString += Path.SEPARATOR;
452 }
453 Path harPath = new Path(parentString);
454 int harlen = harPath.depth();
455 final Map<String, FileStatus> cache = new TreeMap<String, FileStatus>();
456
457 for (HarStatus hstatus : metadata.archive.values()) {
458 String child = hstatus.getName();
459 if ((child.startsWith(parentString))) {
460 Path thisPath = new Path(child);
461 if (thisPath.depth() == harlen + 1) {
462 statuses.add(toFileStatus(hstatus, cache));
463 }
464 }
465 }
466 }
467
468 /**
469 * Combine the status stored in the index and the underlying status.
470 * @param h status stored in the index
471 * @param cache caching the underlying file statuses
472 * @return the combined file status
473 * @throws IOException
474 */
475 private FileStatus toFileStatus(HarStatus h,
476 Map<String, FileStatus> cache) throws IOException {
477 FileStatus underlying = null;
478 if (cache != null) {
479 underlying = cache.get(h.partName);
480 }
481 if (underlying == null) {
482 final Path p = h.isDir? archivePath: new Path(archivePath, h.partName);
483 underlying = fs.getFileStatus(p);
484 if (cache != null) {
485 cache.put(h.partName, underlying);
486 }
487 }
488
489 long modTime = 0;
490 int version = metadata.getVersion();
491 if (version < 3) {
492 modTime = underlying.getModificationTime();
493 } else if (version == 3) {
494 modTime = h.getModificationTime();
495 }
496
497 return new FileStatus(
498 h.isDir()? 0L: h.getLength(),
499 h.isDir(),
500 underlying.getReplication(),
501 underlying.getBlockSize(),
502 modTime,
503 underlying.getAccessTime(),
504 underlying.getPermission(),
505 underlying.getOwner(),
506 underlying.getGroup(),
507 makeRelative(this.uri.getPath(), new Path(h.name)));
508 }
509
  // A parser for a hadoop archive status entry, which is stored
  // as a single line in the index files. The line format is
  //   filename "dir"/"file" partFileName startIndex length
  //   <space separated children>
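  // For illustration only, two hypothetical (unencoded) entries;
  // version 3 additionally URL-encodes the names and stores an
  // encoded properties field as described in the constructor below:
  //   /dir1/file1 file part-0 0 1024
  //   /dir1 dir none 0 0 file1 file2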
515 private class HarStatus {
516 boolean isDir;
517 String name;
518 List<String> children;
519 String partName;
520 long startIndex;
521 long length;
522 long modificationTime = 0;
523
524 public HarStatus(String harString) throws UnsupportedEncodingException {
525 String[] splits = harString.split(" ");
526 this.name = decodeFileName(splits[0]);
      this.isDir = "dir".equals(splits[1]);
      // this is equal to "none" if it's a directory
529 this.partName = splits[2];
530 this.startIndex = Long.parseLong(splits[3]);
531 this.length = Long.parseLong(splits[4]);
532
533 int version = metadata.getVersion();
534 String[] propSplits = null;
535 // propSplits is used to retrieve the metainformation that Har versions
      // 1 & 2 missed (modification time, permission, owner, group).
537 // These fields are stored in an encoded string placed in different
538 // locations depending on whether it's a file or directory entry.
539 // If it's a directory, the string will be placed at the partName
540 // location (directories have no partName because they don't have data
541 // to be stored). This is done because the number of fields in a
542 // directory entry is unbounded (all children are listed at the end)
543 // If it's a file, the string will be the last field.
544 if (isDir) {
545 if (version == 3){
546 propSplits = decodeString(this.partName).split(" ");
547 }
548 children = new ArrayList<String>();
549 for (int i = 5; i < splits.length; i++) {
550 children.add(decodeFileName(splits[i]));
551 }
552 } else if (version == 3) {
553 propSplits = decodeString(splits[5]).split(" ");
554 }
555
556 if (propSplits != null && propSplits.length >= 4) {
557 modificationTime = Long.parseLong(propSplits[0]);
558 // the fields below are stored in the file but are currently not used
559 // by HarFileSystem
560 // permission = new FsPermission(Short.parseShort(propSplits[1]));
561 // owner = decodeString(propSplits[2]);
562 // group = decodeString(propSplits[3]);
563 }
564 }
565 public boolean isDir() {
566 return isDir;
567 }
568
569 public String getName() {
570 return name;
571 }
572
573 public List<String> getChildren() {
574 return children;
575 }
576 public String getFileName() {
577 return name;
578 }
579 public String getPartName() {
580 return partName;
581 }
582 public long getStartIndex() {
583 return startIndex;
584 }
585 public long getLength() {
586 return length;
587 }
588 public long getModificationTime() {
589 return modificationTime;
590 }
591 }
592
593 /**
   * Return the filestatus of files in the har archive.
   * The permissions returned are those of the underlying
   * part files (or of the archive directory, for directories),
   * since the original file permissions are not applied by
   * HarFileSystem.
598 * @param f the path in har filesystem
599 * @return filestatus.
600 * @throws IOException
601 */
602 @Override
603 public FileStatus getFileStatus(Path f) throws IOException {
604 HarStatus hstatus = getFileHarStatus(f);
605 return toFileStatus(hstatus, null);
606 }
607
608 private HarStatus getFileHarStatus(Path f) throws IOException {
    // qualify the path and look it up in the parsed index
611 Path p = makeQualified(f);
612 Path harPath = getPathInHar(p);
613 if (harPath == null) {
614 throw new IOException("Invalid file name: " + f + " in " + uri);
615 }
616 HarStatus hstatus = metadata.archive.get(harPath);
617 if (hstatus == null) {
618 throw new FileNotFoundException("File: " + f + " does not exist in " + uri);
619 }
620 return hstatus;
621 }
622
623 /**
624 * @return null since no checksum algorithm is implemented.
625 */
626 public FileChecksum getFileChecksum(Path f) {
627 return null;
628 }
629
630 /**
631 * Returns a har input stream which fakes end of
632 * file. It reads the index files to get the part
633 * file name and the size and start of the file.
634 */
635 @Override
636 public FSDataInputStream open(Path f, int bufferSize) throws IOException {
637 // get the fs DataInputStream for the underlying file
638 HarStatus hstatus = getFileHarStatus(f);
639 // we got it.. woo hooo!!!
640 if (hstatus.isDir()) {
641 throw new FileNotFoundException(f + " : not a file in " +
642 archivePath);
643 }
644 return new HarFSDataInputStream(fs, new Path(archivePath,
645 hstatus.getPartName()),
646 hstatus.getStartIndex(), hstatus.getLength(), bufferSize);
647 }
648
649 /*
650 * create throws an exception in Har filesystem.
651 * The archive once created cannot be changed.
652 */
653 public FSDataOutputStream create(Path f, int bufferSize)
654 throws IOException {
655 throw new IOException("Har: Create not allowed");
656 }
657
658 public FSDataOutputStream create(Path f,
659 FsPermission permission,
660 boolean overwrite,
661 int bufferSize,
662 short replication,
663 long blockSize,
664 Progressable progress) throws IOException {
665 throw new IOException("Har: create not allowed.");
666 }
667
668 @Override
669 public void close() throws IOException {
670 if (fs != null) {
671 try {
672 fs.close();
673 } catch(IOException ie) {
674 //this might already be closed
675 // ignore
676 }
677 }
678 }
679
680 /**
681 * Not implemented.
682 */
683 @Override
684 public boolean setReplication(Path src, short replication) throws IOException{
685 throw new IOException("Har: setreplication not allowed");
686 }
687
688 /**
689 * Not implemented.
690 */
691 @Override
692 public boolean delete(Path f, boolean recursive) throws IOException {
693 throw new IOException("Har: delete not allowed");
694 }
695
696 /**
697 * liststatus returns the children of a directory
698 * after looking up the index files.
699 */
700 @Override
701 public FileStatus[] listStatus(Path f) throws IOException {
    // look the path up in the parsed index and build
    // synthetic filestatuses to return to the client
706 List<FileStatus> statuses = new ArrayList<FileStatus>();
707 Path tmpPath = makeQualified(f);
708 Path harPath = getPathInHar(tmpPath);
709 HarStatus hstatus = metadata.archive.get(harPath);
710 if (hstatus == null) {
711 throw new FileNotFoundException("File " + f + " not found in " + archivePath);
712 }
713 if (hstatus.isDir()) {
714 fileStatusesInIndex(hstatus, statuses, hstatus.children);
715 } else {
716 statuses.add(toFileStatus(hstatus, null));
717 }
718
719 return statuses.toArray(new FileStatus[statuses.size()]);
720 }
721
722 /**
723 * return the top level archive path.
724 */
725 public Path getHomeDirectory() {
726 return new Path(uri.toString());
727 }
728
729 public void setWorkingDirectory(Path newDir) {
730 //does nothing.
731 }
732
733 /**
734 * not implemented.
735 */
736 public boolean mkdirs(Path f, FsPermission permission) throws IOException {
737 throw new IOException("Har: mkdirs not allowed");
738 }
739
740 /**
741 * not implemented.
742 */
743 public void copyFromLocalFile(boolean delSrc, Path src, Path dst) throws
744 IOException {
745 throw new IOException("Har: copyfromlocalfile not allowed");
746 }
747
748 /**
749 * copies the file in the har filesystem to a local file.
750 */
751 public void copyToLocalFile(boolean delSrc, Path src, Path dst)
752 throws IOException {
753 FileUtil.copy(this, src, getLocal(getConf()), dst, false, getConf());
754 }
755
756 /**
757 * not implemented.
758 */
759 public Path startLocalOutput(Path fsOutputFile, Path tmpLocalFile)
760 throws IOException {
761 throw new IOException("Har: startLocalOutput not allowed");
762 }
763
764 /**
765 * not implemented.
766 */
767 public void completeLocalOutput(Path fsOutputFile, Path tmpLocalFile)
768 throws IOException {
769 throw new IOException("Har: completeLocalOutput not allowed");
770 }
771
772 /**
773 * not implemented.
774 */
775 public void setOwner(Path p, String username, String groupname)
776 throws IOException {
777 throw new IOException("Har: setowner not allowed");
778 }
779
780 /**
781 * Not implemented.
782 */
783 public void setPermission(Path p, FsPermission permisssion)
784 throws IOException {
785 throw new IOException("Har: setPermission not allowed");
786 }
787
788 /**
789 * Hadoop archives input stream. This input stream fakes EOF
790 * since archive files are part of bigger part files.
791 */
792 private static class HarFSDataInputStream extends FSDataInputStream {
793 /**
794 * Create an input stream that fakes all the reads/positions/seeking.
795 */
796 private static class HarFsInputStream extends FSInputStream {
797 private long position, start, end;
798 //The underlying data input stream that the
799 // underlying filesystem will return.
800 private FSDataInputStream underLyingStream;
801 //one byte buffer
802 private byte[] oneBytebuff = new byte[1];
803 HarFsInputStream(FileSystem fs, Path path, long start,
804 long length, int bufferSize) throws IOException {
805 underLyingStream = fs.open(path, bufferSize);
806 underLyingStream.seek(start);
807 // the start of this file in the part file
808 this.start = start;
809 // the position pointer in the part file
810 this.position = start;
811 // the end pointer in the part file
812 this.end = start + length;
813 }
814
815 public synchronized int available() throws IOException {
816 long remaining = end - underLyingStream.getPos();
817 if (remaining > (long)Integer.MAX_VALUE) {
818 return Integer.MAX_VALUE;
819 }
820 return (int) remaining;
821 }
822
823 public synchronized void close() throws IOException {
824 underLyingStream.close();
825 super.close();
826 }
827
828 //not implemented
829 @Override
830 public void mark(int readLimit) {
831 // do nothing
832 }
833
834 /**
835 * reset is not implemented
836 */
837 public void reset() throws IOException {
838 throw new IOException("reset not implemented.");
839 }
840
841 public synchronized int read() throws IOException {
842 int ret = read(oneBytebuff, 0, 1);
843 return (ret <= 0) ? -1: (oneBytebuff[0] & 0xff);
844 }
845
      public synchronized int read(byte[] b) throws IOException {
        // delegate to the ranged read, which already advances position;
        // adding the return value to position again here would double
        // count the bytes read
        return read(b, 0, b.length);
      }
853
      /**
       * Read up to len bytes, clamping the length so the read never
       * goes past the end of this file's region in the part file.
       */
857 public synchronized int read(byte[] b, int offset, int len)
858 throws IOException {
859 int newlen = len;
860 int ret = -1;
861 if (position + len > end) {
862 newlen = (int) (end - position);
863 }
864 // end case
865 if (newlen == 0)
866 return ret;
867 ret = underLyingStream.read(b, offset, newlen);
868 position += ret;
869 return ret;
870 }
871
872 public synchronized long skip(long n) throws IOException {
873 long tmpN = n;
874 if (tmpN > 0) {
875 if (position + tmpN > end) {
876 tmpN = end - position;
877 }
878 underLyingStream.seek(tmpN + position);
879 position += tmpN;
880 return tmpN;
881 }
882 return (tmpN < 0)? -1 : 0;
883 }
884
885 public synchronized long getPos() throws IOException {
886 return (position - start);
887 }
888
889 public synchronized void seek(long pos) throws IOException {
890 if (pos < 0 || (start + pos > end)) {
891 throw new IOException("Failed to seek: EOF");
892 }
893 position = start + pos;
894 underLyingStream.seek(position);
895 }
896
897 public boolean seekToNewSource(long targetPos) throws IOException {
898 //do not need to implement this
899 // hdfs in itself does seektonewsource
900 // while reading.
901 return false;
902 }
903
904 /**
905 * implementing position readable.
906 */
907 public int read(long pos, byte[] b, int offset, int length)
908 throws IOException {
909 int nlength = length;
910 if (start + nlength + pos > end) {
911 nlength = (int) (end - (start + pos));
912 }
913 return underLyingStream.read(pos + start , b, offset, nlength);
914 }
915
916 /**
917 * position readable again.
918 */
919 public void readFully(long pos, byte[] b, int offset, int length)
920 throws IOException {
921 if (start + length + pos > end) {
922 throw new IOException("Not enough bytes to read.");
923 }
924 underLyingStream.readFully(pos + start, b, offset, length);
925 }
926
927 public void readFully(long pos, byte[] b) throws IOException {
928 readFully(pos, b, 0, b.length);
929 }
930
931 }
932
933 /**
     * Constructor for har input stream.
935 * @param fs the underlying filesystem
936 * @param p The path in the underlying filesystem
937 * @param start the start position in the part file
938 * @param length the length of valid data in the part file
939 * @param bufsize the buffer size
940 * @throws IOException
941 */
942 public HarFSDataInputStream(FileSystem fs, Path p, long start,
943 long length, int bufsize) throws IOException {
944 super(new HarFsInputStream(fs, p, start, length, bufsize));
945 }
946
947 /**
948 * constructor for har input stream.
949 * @param fs the underlying filesystem
950 * @param p the path in the underlying file system
951 * @param start the start position in the part file
952 * @param length the length of valid data in the part file.
953 * @throws IOException
954 */
955 public HarFSDataInputStream(FileSystem fs, Path p, long start, long length)
956 throws IOException {
957 super(new HarFsInputStream(fs, p, start, length, 0));
958 }
959 }
960
961 private class HarMetaData {
962 private FileSystem fs;
963 private int version;
964 // the masterIndex of the archive
965 private Path masterIndexPath;
966 // the index file
967 private Path archiveIndexPath;
968
969 private long masterIndexTimestamp;
970 private long archiveIndexTimestamp;
971
972 List<Store> stores = new ArrayList<Store>();
973 Map<Path, HarStatus> archive = new HashMap<Path, HarStatus>();
974 private Map<Path, FileStatus> partFileStatuses = new HashMap<Path, FileStatus>();
975
976 public HarMetaData(FileSystem fs, Path masterIndexPath, Path archiveIndexPath) {
977 this.fs = fs;
978 this.masterIndexPath = masterIndexPath;
979 this.archiveIndexPath = archiveIndexPath;
980 }
981
982 public FileStatus getPartFileStatus(Path partPath) throws IOException {
983 FileStatus status;
984 status = partFileStatuses.get(partPath);
985 if (status == null) {
986 status = fs.getFileStatus(partPath);
987 partFileStatuses.put(partPath, status);
988 }
989 return status;
990 }
991
992 public long getMasterIndexTimestamp() {
993 return masterIndexTimestamp;
994 }
995
996 public long getArchiveIndexTimestamp() {
997 return archiveIndexTimestamp;
998 }
999
1000 private int getVersion() {
1001 return version;
1002 }
1003
1004 private void parseMetaData() throws IOException {
1005 FSDataInputStream in = fs.open(masterIndexPath);
1006 FileStatus masterStat = fs.getFileStatus(masterIndexPath);
1007 masterIndexTimestamp = masterStat.getModificationTime();
1008 LineReader lin = new LineReader(in, getConf());
1009 Text line = new Text();
1010 long read = lin.readLine(line);
1011
1012 // the first line contains the version of the index file
1013 String versionLine = line.toString();
1014 String[] arr = versionLine.split(" ");
1015 version = Integer.parseInt(arr[0]);
1016 // make it always backwards-compatible
1017 if (this.version > HarFileSystem.VERSION) {
1018 throw new IOException("Invalid version " +
1019 this.version + " expected " + HarFileSystem.VERSION);
1020 }
1021
      // each remaining line contains a hashcode range and the byte range
      // of the corresponding section in the archive index file
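      // For illustration only, a hypothetical _masterindex:
      //   3
      //   0 2147483647 0 1024
      // meaning: version 3, and entries whose hashes fall in
      // [0, 2147483647] live in bytes [0, 1024) of the index file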
1023 String[] readStr = null;
1024 while(read < masterStat.getLen()) {
1025 int b = lin.readLine(line);
1026 read += b;
1027 readStr = line.toString().split(" ");
1028 int startHash = Integer.parseInt(readStr[0]);
1029 int endHash = Integer.parseInt(readStr[1]);
1030 stores.add(new Store(Long.parseLong(readStr[2]),
1031 Long.parseLong(readStr[3]), startHash,
1032 endHash));
1033 line.clear();
1034 }
1035 try {
1036 // close the master index
1037 lin.close();
1038 } catch(IOException io){
1039 // do nothing just a read.
1040 }
1041
1042 FSDataInputStream aIn = fs.open(archiveIndexPath);
1043 FileStatus archiveStat = fs.getFileStatus(archiveIndexPath);
1044 archiveIndexTimestamp = archiveStat.getModificationTime();
1045 LineReader aLin;
1046 String retStr = null;
1047 // now start reading the real index file
1048 for (Store s: stores) {
1049 read = 0;
1050 aIn.seek(s.begin);
1051 aLin = new LineReader(aIn, getConf());
1052 while (read + s.begin < s.end) {
1053 int tmp = aLin.readLine(line);
1054 read += tmp;
1055 String lineFeed = line.toString();
1056 String[] parsed = lineFeed.split(" ");
1057 parsed[0] = decodeFileName(parsed[0]);
1058 archive.put(new Path(parsed[0]), new HarStatus(lineFeed));
1059 line.clear();
1060 }
1061 }
1062 try {
1063 // close the archive index
1064 aIn.close();
1065 } catch(IOException io) {
1066 // do nothing just a read.
1067 }
1068 }
1069 }
1070 }