001 /**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements. See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership. The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License. You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018 package org.apache.hadoop.fs;
019
020 import java.io.FileNotFoundException;
021 import java.io.IOException;
022 import java.io.UnsupportedEncodingException;
023 import java.net.URI;
024 import java.net.URISyntaxException;
025 import java.net.URLDecoder;
026 import java.util.ArrayList;
027 import java.util.List;
028 import java.util.Map;
029 import java.util.TreeMap;
030 import java.util.HashMap;
031 import java.util.concurrent.ConcurrentHashMap;
032
033 import org.apache.hadoop.conf.Configuration;
034 import org.apache.hadoop.fs.permission.FsPermission;
035 import org.apache.hadoop.io.Text;
036 import org.apache.hadoop.util.LineReader;
037 import org.apache.hadoop.util.Progressable;
038
/**
 * This is an implementation of the Hadoop Archive (HAR)
 * FileSystem. An archive has index files of the form
 * _masterindex and _index, and data files of the form
 * part-*. The index files record where each archived
 * file is stored within the part files. The _index file
 * is sorted by the hash code of the paths it contains,
 * and the _masterindex adds a level of indirection: it
 * maps ranges of hash codes to positions in the _index
 * file, so that lookups do not have to scan the whole
 * index.
 */
051
052 public class HarFileSystem extends FilterFileSystem {
053 public static final int VERSION = 3;
054
055 private static final Map<URI, HarMetaData> harMetaCache =
056 new ConcurrentHashMap<URI, HarMetaData>();
057
058 // uri representation of this Har filesystem
059 private URI uri;
060 // the top level path of the archive
061 // in the underlying file system
062 private Path archivePath;
063 // the har auth
064 private String harAuth;
065
066 // pointer into the static metadata cache
067 private HarMetaData metadata;
068
  /**
   * Public default constructor for HarFileSystem.
   */
073 public HarFileSystem() {
074 }
075
  /**
   * Return the protocol scheme for this FileSystem.
   *
   * @return <code>har</code>
   */
082 @Override
083 public String getScheme() {
084 return "har";
085 }
086
  /**
   * Constructor to create a HarFileSystem with an
   * underlying filesystem.
   * @param fs the underlying filesystem
   */
092 public HarFileSystem(FileSystem fs) {
093 super(fs);
094 }
095
  /**
   * Initialize a Har filesystem per har archive. The
   * archive home directory is the top level directory
   * in the filesystem that contains the HAR archive.
   * Be careful with this method: avoid creating new
   * FileSystem instances on every call to
   * path.getFileSystem().
   * The URI of a Har filesystem has the form
   * har://underlyingfsscheme-host:port/archivepath
   * or har:///archivepath, in which case the default
   * filesystem from the configuration is used as the
   * underlying filesystem.
   */
109 @Override
110 public void initialize(URI name, Configuration conf) throws IOException {
111 // decode the name
112 URI underLyingURI = decodeHarURI(name, conf);
113 // we got the right har Path- now check if this is
114 // truly a har filesystem
115 Path harPath = archivePath(
116 new Path(name.getScheme(), name.getAuthority(), name.getPath()));
117 if (harPath == null) {
118 throw new IOException("Invalid path for the Har Filesystem. " +
119 name.toString());
120 }
121 if (fs == null) {
122 fs = FileSystem.get(underLyingURI, conf);
123 }
124 uri = harPath.toUri();
125 archivePath = new Path(uri.getPath());
126 harAuth = getHarAuth(underLyingURI);
127 //check for the underlying fs containing
128 // the index file
129 Path masterIndexPath = new Path(archivePath, "_masterindex");
130 Path archiveIndexPath = new Path(archivePath, "_index");
131 if (!fs.exists(masterIndexPath) || !fs.exists(archiveIndexPath)) {
132 throw new IOException("Invalid path for the Har Filesystem. " +
133 "No index file in " + harPath);
134 }
135
136 metadata = harMetaCache.get(uri);
137 if (metadata != null) {
138 FileStatus mStat = fs.getFileStatus(masterIndexPath);
139 FileStatus aStat = fs.getFileStatus(archiveIndexPath);
140 if (mStat.getModificationTime() != metadata.getMasterIndexTimestamp() ||
141 aStat.getModificationTime() != metadata.getArchiveIndexTimestamp()) {
142 // the archive has been overwritten since we last read it
143 // remove the entry from the meta data cache
144 metadata = null;
145 harMetaCache.remove(uri);
146 }
147 }
148 if (metadata == null) {
149 metadata = new HarMetaData(fs, masterIndexPath, archiveIndexPath);
150 metadata.parseMetaData();
151 harMetaCache.put(uri, metadata);
152 }
153 }
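
  // Illustrative usage sketch (assumes a hypothetical archive at
  // hdfs://namenode:8020/user/foo.har and that fs.har.impl resolves to this
  // class): a member file is read through the har:// scheme like any other
  // FileSystem path.
  //
  //   Configuration conf = new Configuration();
  //   Path p = new Path("har://hdfs-namenode:8020/user/foo.har/dir/file.txt");
  //   FileSystem harFs = p.getFileSystem(conf);  // initialize() runs here
  //   FSDataInputStream in = harFs.open(p);
  //   // ... read from in, then close it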
154
  // get the version of the filesystem from the masterindex file;
  // the version is stored on the first line of the masterindex
158 public int getHarVersion() throws IOException {
159 if (metadata != null) {
160 return metadata.getVersion();
161 }
162 else {
163 throw new IOException("Invalid meta data for the Har Filesystem");
164 }
165 }
166
  /*
   * Find the ancestor of the given path that is the
   * archive path. The deepest path prefix whose last
   * segment ends with .har is returned, or null if
   * there is no such prefix.
   */
173 private Path archivePath(Path p) {
174 Path retPath = null;
175 Path tmp = p;
176 for (int i=0; i< p.depth(); i++) {
177 if (tmp.toString().endsWith(".har")) {
178 retPath = tmp;
179 break;
180 }
181 tmp = tmp.getParent();
182 }
183 return retPath;
184 }
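
  // For example, archivePath(/user/foo.har/dir/file) returns /user/foo.har;
  // a path with no ".har" segment yields null.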
185
  /**
   * Decode the raw Har URI to get the URI of the underlying filesystem.
   * @param rawURI raw Har URI
   * @param conf configuration used to resolve the default filesystem
   * @return filtered URI of the underlying filesystem
   */
191 private URI decodeHarURI(URI rawURI, Configuration conf) throws IOException {
192 String tmpAuth = rawURI.getAuthority();
    //we are using the default file
    //system in the config
    //so create an underlying uri and
    //return it
197 if (tmpAuth == null) {
198 //create a path
199 return FileSystem.getDefaultUri(conf);
200 }
201 String host = rawURI.getHost();
202 if (host == null) {
203 throw new IOException("URI: " + rawURI
204 + " is an invalid Har URI since host==null."
205 + " Expecting har://<scheme>-<host>/<path>.");
206 }
207 int i = host.indexOf('-');
208 if (i < 0) {
209 throw new IOException("URI: " + rawURI
210 + " is an invalid Har URI since '-' not found."
211 + " Expecting har://<scheme>-<host>/<path>.");
212 }
213 final String underLyingScheme = host.substring(0, i);
214 i++;
215 final String underLyingHost = i == host.length()? null: host.substring(i);
216 int underLyingPort = rawURI.getPort();
217 String auth = (underLyingHost == null && underLyingPort == -1)?
218 null:(underLyingHost+
219 (underLyingPort == -1 ? "" : ":"+underLyingPort));
220 URI tmp = null;
221 if (rawURI.getQuery() != null) {
222 // query component not allowed
223 throw new IOException("query component in Path not supported " + rawURI);
224 }
225 try {
226 tmp = new URI(underLyingScheme, auth, rawURI.getPath(),
227 rawURI.getQuery(), rawURI.getFragment());
228 } catch (URISyntaxException e) {
      // do nothing; this should not happen since the components come from a valid URI
230 }
231 return tmp;
232 }
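
  // Example of the decoding (with a hypothetical namenode host): the raw URI
  // har://hdfs-namenode:8020/user/foo.har/dir/file has authority
  // "hdfs-namenode:8020", which is split at the first '-' into the underlying
  // scheme "hdfs" and host "namenode", giving the underlying URI
  // hdfs://namenode:8020/user/foo.har/dir/file.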
233
234 private static String decodeString(String str)
235 throws UnsupportedEncodingException {
236 return URLDecoder.decode(str, "UTF-8");
237 }
238
239 private String decodeFileName(String fname)
240 throws UnsupportedEncodingException {
241 int version = metadata.getVersion();
242 if (version == 2 || version == 3){
243 return decodeString(fname);
244 }
245 return fname;
246 }
247
  /**
   * Return the top level archive path.
   */
251 @Override
252 public Path getWorkingDirectory() {
253 return new Path(uri.toString());
254 }
255
  /**
   * Create a har specific auth of the form
   * underlyingfsscheme-host:port.
   * @param underLyingUri the URI of the underlying filesystem
   * @return har specific auth
   */
263 private String getHarAuth(URI underLyingUri) {
264 String auth = underLyingUri.getScheme() + "-";
265 if (underLyingUri.getHost() != null) {
266 auth += underLyingUri.getHost() + ":";
267 if (underLyingUri.getPort() != -1) {
268 auth += underLyingUri.getPort();
269 }
270 }
271 else {
272 auth += ":";
273 }
274 return auth;
275 }
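
  // For example, an underlying URI of hdfs://namenode:8020 (hypothetical host)
  // produces the har auth "hdfs-namenode:8020"; with no host the auth is
  // simply "<scheme>-:".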
276
277 /**
278 * Returns the uri of this filesystem.
279 * The uri is of the form
   * har://underlyingfsscheme-host:port/pathintheunderlyingfs
281 */
282 @Override
283 public URI getUri() {
284 return this.uri;
285 }
286
  /**
   * Returns the path inside the har filesystem, i.e. the
   * path relative to the root of the archive.
   * @param path the fully qualified path in the har filesystem.
   * @return the path relative to the archive root, or null if the
   *         path is not inside the archive.
   */
295 private Path getPathInHar(Path path) {
296 Path harPath = new Path(path.toUri().getPath());
297 if (archivePath.compareTo(harPath) == 0)
298 return new Path(Path.SEPARATOR);
299 Path tmp = new Path(harPath.getName());
300 Path parent = harPath.getParent();
301 while (!(parent.compareTo(archivePath) == 0)) {
302 if (parent.toString().equals(Path.SEPARATOR)) {
303 tmp = null;
304 break;
305 }
306 tmp = new Path(parent.getName(), tmp);
307 parent = parent.getParent();
308 }
309 if (tmp != null)
310 tmp = new Path(Path.SEPARATOR, tmp);
311 return tmp;
312 }
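
  // For example, with archivePath /user/foo.har, the fully qualified path
  // har://hdfs-namenode:8020/user/foo.har/dir/file (hypothetical authority)
  // maps to /dir/file, and the archive root itself maps to "/".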
313
  // Make p relative to the given initial path, essentially
  // stripping the leading "/". Parsing and string manipulation
  // are error prone, so just use the Path API to do it.
318 private Path makeRelative(String initial, Path p) {
319 String scheme = this.uri.getScheme();
320 String authority = this.uri.getAuthority();
321 Path root = new Path(Path.SEPARATOR);
322 if (root.compareTo(p) == 0)
323 return new Path(scheme, authority, initial);
324 Path retPath = new Path(p.getName());
325 Path parent = p.getParent();
326 for (int i=0; i < p.depth()-1; i++) {
327 retPath = new Path(parent.getName(), retPath);
328 parent = parent.getParent();
329 }
330 return new Path(new Path(scheme, authority, initial),
331 retPath.toString());
332 }
333
334 /* this makes a path qualified in the har filesystem
335 * (non-Javadoc)
336 * @see org.apache.hadoop.fs.FilterFileSystem#makeQualified(
337 * org.apache.hadoop.fs.Path)
338 */
339 @Override
340 public Path makeQualified(Path path) {
341 // make sure that we just get the
342 // path component
343 Path fsPath = path;
344 if (!path.isAbsolute()) {
345 fsPath = new Path(archivePath, path);
346 }
347
348 URI tmpURI = fsPath.toUri();
349 //change this to Har uri
350 return new Path(uri.getScheme(), harAuth, tmpURI.getPath());
351 }
352
353 /**
354 * Fix offset and length of block locations.
355 * Note that this method modifies the original array.
356 * @param locations block locations of har part file
357 * @param start the start of the desired range in the contained file
358 * @param len the length of the desired range
359 * @param fileOffsetInHar the offset of the desired file in the har part file
360 * @return block locations with fixed offset and length
361 */
362 static BlockLocation[] fixBlockLocations(BlockLocation[] locations,
363 long start,
364 long len,
365 long fileOffsetInHar) {
366 // offset 1 past last byte of desired range
367 long end = start + len;
368
369 for (BlockLocation location : locations) {
370 // offset of part block relative to beginning of desired file
371 // (may be negative if file starts in this part block)
372 long harBlockStart = location.getOffset() - fileOffsetInHar;
373 // offset 1 past last byte of har block relative to beginning of
374 // desired file
375 long harBlockEnd = harBlockStart + location.getLength();
376
377 if (start > harBlockStart) {
378 // desired range starts after beginning of this har block
379 // fix offset to beginning of relevant range (relative to desired file)
380 location.setOffset(start);
381 // fix length to relevant portion of har block
382 location.setLength(location.getLength() - (start - harBlockStart));
383 } else {
384 // desired range includes beginning of this har block
385 location.setOffset(harBlockStart);
386 }
387
388 if (harBlockEnd > end) {
389 // range ends before end of this har block
390 // fix length to remove irrelevant portion at the end
391 location.setLength(location.getLength() - (harBlockEnd - end));
392 }
393 }
394
395 return locations;
396 }
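
  // Worked example (illustrative numbers): suppose the archived file starts at
  // fileOffsetInHar = 1000 in the part file and the caller asks for
  // start = 0, len = 300. A part-file block at offset 896 with length 512 has
  // harBlockStart = -104 and harBlockEnd = 408 relative to the file, so its
  // offset is clamped to 0, its length first becomes 408, and because
  // harBlockEnd (408) > end (300) it is finally trimmed to 300.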
397
398 /**
399 * Get block locations from the underlying fs and fix their
400 * offsets and lengths.
401 * @param file the input filestatus to get block locations
402 * @param start the start of the desired range in the contained file
403 * @param len the length of the desired range
404 * @return block locations for this segment of file
405 * @throws IOException
406 */
407 @Override
408 public BlockLocation[] getFileBlockLocations(FileStatus file, long start,
409 long len) throws IOException {
410 HarStatus hstatus = getFileHarStatus(file.getPath());
411 Path partPath = new Path(archivePath, hstatus.getPartName());
412 FileStatus partStatus = metadata.getPartFileStatus(partPath);
413
414 // get all part blocks that overlap with the desired file blocks
415 BlockLocation[] locations =
416 fs.getFileBlockLocations(partStatus,
417 hstatus.getStartIndex() + start, len);
418
419 return fixBlockLocations(locations, start, len, hstatus.getStartIndex());
420 }
421
  /**
   * The hash of the path p inside the har filesystem.
   * @param p the path in the har filesystem
   * @return the hash code of the path.
   */
428 public static int getHarHash(Path p) {
429 return (p.toString().hashCode() & 0x7fffffff);
430 }
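
  // For example, getHarHash(new Path("/dir/file")) yields a non-negative int;
  // the archive tool sorts _index entries by this hash and _masterindex records
  // [startHash, endHash] ranges (see the Store class and parseMetaData below),
  // so a lookup only needs to scan one slice of the index.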
431
432 static class Store {
433 public Store() {
434 begin = end = startHash = endHash = 0;
435 }
436 public Store(long begin, long end, int startHash, int endHash) {
437 this.begin = begin;
438 this.end = end;
439 this.startHash = startHash;
440 this.endHash = endHash;
441 }
442 public long begin;
443 public long end;
444 public int startHash;
445 public int endHash;
446 }
447
  /**
   * Get filestatuses of all the children of a given directory. This just
   * scans the cached archive index entries to find the statuses of all
   * children of a directory. It is a brute force way of getting all such
   * filestatuses.
   *
   * @param parent
   *          the parent path directory
   * @param statuses
   *          the list to add the children filestatuses to
   * @param children
   *          the string list of children for this parent
   */
462 private void fileStatusesInIndex(HarStatus parent, List<FileStatus> statuses,
463 List<String> children) throws IOException {
464 String parentString = parent.getName();
465 if (!parentString.endsWith(Path.SEPARATOR)){
466 parentString += Path.SEPARATOR;
467 }
468 Path harPath = new Path(parentString);
469 int harlen = harPath.depth();
470 final Map<String, FileStatus> cache = new TreeMap<String, FileStatus>();
471
472 for (HarStatus hstatus : metadata.archive.values()) {
473 String child = hstatus.getName();
474 if ((child.startsWith(parentString))) {
475 Path thisPath = new Path(child);
476 if (thisPath.depth() == harlen + 1) {
477 statuses.add(toFileStatus(hstatus, cache));
478 }
479 }
480 }
481 }
482
483 /**
484 * Combine the status stored in the index and the underlying status.
485 * @param h status stored in the index
486 * @param cache caching the underlying file statuses
487 * @return the combined file status
488 * @throws IOException
489 */
490 private FileStatus toFileStatus(HarStatus h,
491 Map<String, FileStatus> cache) throws IOException {
492 FileStatus underlying = null;
493 if (cache != null) {
494 underlying = cache.get(h.partName);
495 }
496 if (underlying == null) {
497 final Path p = h.isDir? archivePath: new Path(archivePath, h.partName);
498 underlying = fs.getFileStatus(p);
499 if (cache != null) {
500 cache.put(h.partName, underlying);
501 }
502 }
503
504 long modTime = 0;
505 int version = metadata.getVersion();
506 if (version < 3) {
507 modTime = underlying.getModificationTime();
508 } else if (version == 3) {
509 modTime = h.getModificationTime();
510 }
511
512 return new FileStatus(
513 h.isDir()? 0L: h.getLength(),
514 h.isDir(),
515 underlying.getReplication(),
516 underlying.getBlockSize(),
517 modTime,
518 underlying.getAccessTime(),
519 underlying.getPermission(),
520 underlying.getOwner(),
521 underlying.getGroup(),
522 makeRelative(this.uri.getPath(), new Path(h.name)));
523 }
524
  // A parser for a hadoop archive status stored as a single
  // line in the index files. The format of a line is:
  // filename "dir"/"file" partFileName startIndex length
  // <space separated children>
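  //
  // An illustrative (hypothetical) version-3 file entry could look like:
  //   %2Fdir%2Ffile.txt file part-0 0 1024 <encoded modtime+perm+owner+group>
  // and a directory entry stores its encoded properties in the partName
  // column and lists its children at the end:
  //   %2Fdir dir <encoded props> 0 0 file.txt other.txt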
530 private class HarStatus {
531 boolean isDir;
532 String name;
533 List<String> children;
534 String partName;
535 long startIndex;
536 long length;
537 long modificationTime = 0;
538
539 public HarStatus(String harString) throws UnsupportedEncodingException {
540 String[] splits = harString.split(" ");
541 this.name = decodeFileName(splits[0]);
      this.isDir = "dir".equals(splits[1]);
      // this is equal to "none" if it's a directory
544 this.partName = splits[2];
545 this.startIndex = Long.parseLong(splits[3]);
546 this.length = Long.parseLong(splits[4]);
547
548 int version = metadata.getVersion();
549 String[] propSplits = null;
550 // propSplits is used to retrieve the metainformation that Har versions
551 // 1 & 2 missed (modification time, permission, owner group).
552 // These fields are stored in an encoded string placed in different
553 // locations depending on whether it's a file or directory entry.
554 // If it's a directory, the string will be placed at the partName
555 // location (directories have no partName because they don't have data
556 // to be stored). This is done because the number of fields in a
557 // directory entry is unbounded (all children are listed at the end)
558 // If it's a file, the string will be the last field.
559 if (isDir) {
560 if (version == 3){
561 propSplits = decodeString(this.partName).split(" ");
562 }
563 children = new ArrayList<String>();
564 for (int i = 5; i < splits.length; i++) {
565 children.add(decodeFileName(splits[i]));
566 }
567 } else if (version == 3) {
568 propSplits = decodeString(splits[5]).split(" ");
569 }
570
571 if (propSplits != null && propSplits.length >= 4) {
572 modificationTime = Long.parseLong(propSplits[0]);
573 // the fields below are stored in the file but are currently not used
574 // by HarFileSystem
575 // permission = new FsPermission(Short.parseShort(propSplits[1]));
576 // owner = decodeString(propSplits[2]);
577 // group = decodeString(propSplits[3]);
578 }
579 }
580 public boolean isDir() {
581 return isDir;
582 }
583
584 public String getName() {
585 return name;
586 }
587
588 public List<String> getChildren() {
589 return children;
590 }
591 public String getFileName() {
592 return name;
593 }
594 public String getPartName() {
595 return partName;
596 }
597 public long getStartIndex() {
598 return startIndex;
599 }
600 public long getLength() {
601 return length;
602 }
603 public long getModificationTime() {
604 return modificationTime;
605 }
606 }
607
  /**
   * Return the filestatus of a file in the har archive.
   * The permissions returned are those of the archive
   * index files, since permissions are not persisted
   * when creating a hadoop archive.
   * @param f the path in the har filesystem
   * @return the filestatus of the file
   * @throws IOException
   */
617 @Override
618 public FileStatus getFileStatus(Path f) throws IOException {
619 HarStatus hstatus = getFileHarStatus(f);
620 return toFileStatus(hstatus, null);
621 }
622
623 private HarStatus getFileHarStatus(Path f) throws IOException {
    // look up the entry for this path in the archive index
626 Path p = makeQualified(f);
627 Path harPath = getPathInHar(p);
628 if (harPath == null) {
629 throw new IOException("Invalid file name: " + f + " in " + uri);
630 }
631 HarStatus hstatus = metadata.archive.get(harPath);
632 if (hstatus == null) {
633 throw new FileNotFoundException("File: " + f + " does not exist in " + uri);
634 }
635 return hstatus;
636 }
637
638 /**
639 * @return null since no checksum algorithm is implemented.
640 */
641 @Override
642 public FileChecksum getFileChecksum(Path f) {
643 return null;
644 }
645
646 /**
647 * Returns a har input stream which fakes end of
648 * file. It reads the index files to get the part
649 * file name and the size and start of the file.
650 */
651 @Override
652 public FSDataInputStream open(Path f, int bufferSize) throws IOException {
653 // get the fs DataInputStream for the underlying file
654 HarStatus hstatus = getFileHarStatus(f);
    // found the entry; make sure it is a file, not a directory
656 if (hstatus.isDir()) {
657 throw new FileNotFoundException(f + " : not a file in " +
658 archivePath);
659 }
660 return new HarFSDataInputStream(fs, new Path(archivePath,
661 hstatus.getPartName()),
662 hstatus.getStartIndex(), hstatus.getLength(), bufferSize);
663 }
664
  /*
   * create throws an exception in the Har filesystem.
   * Once created, an archive cannot be changed.
   */
669 public FSDataOutputStream create(Path f, int bufferSize)
670 throws IOException {
671 throw new IOException("Har: Create not allowed");
672 }
673
674 @Override
675 public FSDataOutputStream create(Path f,
676 FsPermission permission,
677 boolean overwrite,
678 int bufferSize,
679 short replication,
680 long blockSize,
681 Progressable progress) throws IOException {
682 throw new IOException("Har: create not allowed.");
683 }
684
685 @Override
686 public void close() throws IOException {
687 if (fs != null) {
688 try {
689 fs.close();
690 } catch(IOException ie) {
691 //this might already be closed
692 // ignore
693 }
694 }
695 }
696
697 /**
698 * Not implemented.
699 */
700 @Override
701 public boolean setReplication(Path src, short replication) throws IOException{
702 throw new IOException("Har: setreplication not allowed");
703 }
704
705 /**
706 * Not implemented.
707 */
708 @Override
709 public boolean delete(Path f, boolean recursive) throws IOException {
710 throw new IOException("Har: delete not allowed");
711 }
712
713 /**
714 * liststatus returns the children of a directory
715 * after looking up the index files.
716 */
717 @Override
718 public FileStatus[] listStatus(Path f) throws IOException {
    // need to see if the path is a directory or a file in the archive;
    // look it up in the index and create synthetic filestatuses to
    // return to the client
723 List<FileStatus> statuses = new ArrayList<FileStatus>();
724 Path tmpPath = makeQualified(f);
725 Path harPath = getPathInHar(tmpPath);
726 HarStatus hstatus = metadata.archive.get(harPath);
727 if (hstatus == null) {
728 throw new FileNotFoundException("File " + f + " not found in " + archivePath);
729 }
730 if (hstatus.isDir()) {
731 fileStatusesInIndex(hstatus, statuses, hstatus.children);
732 } else {
733 statuses.add(toFileStatus(hstatus, null));
734 }
735
736 return statuses.toArray(new FileStatus[statuses.size()]);
737 }
738
739 /**
740 * return the top level archive path.
741 */
742 @Override
743 public Path getHomeDirectory() {
744 return new Path(uri.toString());
745 }
746
747 @Override
748 public void setWorkingDirectory(Path newDir) {
749 //does nothing.
750 }
751
752 /**
753 * not implemented.
754 */
755 @Override
756 public boolean mkdirs(Path f, FsPermission permission) throws IOException {
757 throw new IOException("Har: mkdirs not allowed");
758 }
759
760 /**
761 * not implemented.
762 */
763 @Override
764 public void copyFromLocalFile(boolean delSrc, Path src, Path dst) throws
765 IOException {
766 throw new IOException("Har: copyfromlocalfile not allowed");
767 }
768
769 /**
770 * copies the file in the har filesystem to a local file.
771 */
772 @Override
773 public void copyToLocalFile(boolean delSrc, Path src, Path dst)
774 throws IOException {
775 FileUtil.copy(this, src, getLocal(getConf()), dst, false, getConf());
776 }
777
778 /**
779 * not implemented.
780 */
781 @Override
782 public Path startLocalOutput(Path fsOutputFile, Path tmpLocalFile)
783 throws IOException {
784 throw new IOException("Har: startLocalOutput not allowed");
785 }
786
787 /**
788 * not implemented.
789 */
790 @Override
791 public void completeLocalOutput(Path fsOutputFile, Path tmpLocalFile)
792 throws IOException {
793 throw new IOException("Har: completeLocalOutput not allowed");
794 }
795
796 /**
797 * not implemented.
798 */
799 @Override
800 public void setOwner(Path p, String username, String groupname)
801 throws IOException {
802 throw new IOException("Har: setowner not allowed");
803 }
804
805 /**
806 * Not implemented.
807 */
808 @Override
809 public void setPermission(Path p, FsPermission permisssion)
810 throws IOException {
811 throw new IOException("Har: setPermission not allowed");
812 }
813
814 /**
815 * Hadoop archives input stream. This input stream fakes EOF
816 * since archive files are part of bigger part files.
817 */
818 private static class HarFSDataInputStream extends FSDataInputStream {
819 /**
820 * Create an input stream that fakes all the reads/positions/seeking.
821 */
822 private static class HarFsInputStream extends FSInputStream {
823 private long position, start, end;
824 //The underlying data input stream that the
825 // underlying filesystem will return.
826 private FSDataInputStream underLyingStream;
827 //one byte buffer
828 private byte[] oneBytebuff = new byte[1];
829 HarFsInputStream(FileSystem fs, Path path, long start,
830 long length, int bufferSize) throws IOException {
831 underLyingStream = fs.open(path, bufferSize);
832 underLyingStream.seek(start);
833 // the start of this file in the part file
834 this.start = start;
835 // the position pointer in the part file
836 this.position = start;
837 // the end pointer in the part file
838 this.end = start + length;
839 }
840
841 @Override
842 public synchronized int available() throws IOException {
843 long remaining = end - underLyingStream.getPos();
844 if (remaining > (long)Integer.MAX_VALUE) {
845 return Integer.MAX_VALUE;
846 }
847 return (int) remaining;
848 }
849
850 @Override
851 public synchronized void close() throws IOException {
852 underLyingStream.close();
853 super.close();
854 }
855
856 //not implemented
857 @Override
858 public void mark(int readLimit) {
859 // do nothing
860 }
861
862 /**
863 * reset is not implemented
864 */
865 @Override
866 public void reset() throws IOException {
867 throw new IOException("reset not implemented.");
868 }
869
870 @Override
871 public synchronized int read() throws IOException {
872 int ret = read(oneBytebuff, 0, 1);
873 return (ret <= 0) ? -1: (oneBytebuff[0] & 0xff);
874 }
875
      @Override
      public synchronized int read(byte[] b) throws IOException {
        // delegate to the ranged read; it already advances the position
        return read(b, 0, b.length);
      }
884
      /**
       * Read up to len bytes, never reading past the end of this
       * file's region within the part file.
       */
      @Override
      public synchronized int read(byte[] b, int offset, int len)
        throws IOException {
        int newlen = len;
        int ret = -1;
        if (position + len > end) {
          newlen = (int) (end - position);
        }
        // end case
        if (newlen == 0)
          return ret;
        ret = underLyingStream.read(b, offset, newlen);
        if (ret > 0) {
          position += ret;
        }
        return ret;
      }
903
904 @Override
905 public synchronized long skip(long n) throws IOException {
906 long tmpN = n;
907 if (tmpN > 0) {
908 if (position + tmpN > end) {
909 tmpN = end - position;
910 }
911 underLyingStream.seek(tmpN + position);
912 position += tmpN;
913 return tmpN;
914 }
        // nothing is skipped for n <= 0
        return 0;
916 }
917
918 @Override
919 public synchronized long getPos() throws IOException {
920 return (position - start);
921 }
922
923 @Override
924 public synchronized void seek(long pos) throws IOException {
925 if (pos < 0 || (start + pos > end)) {
926 throw new IOException("Failed to seek: EOF");
927 }
928 position = start + pos;
929 underLyingStream.seek(position);
930 }
931
932 @Override
933 public boolean seekToNewSource(long targetPos) throws IOException {
934 //do not need to implement this
935 // hdfs in itself does seektonewsource
936 // while reading.
937 return false;
938 }
939
      /**
       * Implements positioned read.
       */
943 @Override
944 public int read(long pos, byte[] b, int offset, int length)
945 throws IOException {
946 int nlength = length;
947 if (start + nlength + pos > end) {
948 nlength = (int) (end - (start + pos));
949 }
950 return underLyingStream.read(pos + start , b, offset, nlength);
951 }
952
      /**
       * Implements positioned readFully.
       */
956 @Override
957 public void readFully(long pos, byte[] b, int offset, int length)
958 throws IOException {
959 if (start + length + pos > end) {
960 throw new IOException("Not enough bytes to read.");
961 }
962 underLyingStream.readFully(pos + start, b, offset, length);
963 }
964
965 @Override
966 public void readFully(long pos, byte[] b) throws IOException {
967 readFully(pos, b, 0, b.length);
968 }
969
970 }
971
972 /**
     * Constructor for har input stream.
974 * @param fs the underlying filesystem
975 * @param p The path in the underlying filesystem
976 * @param start the start position in the part file
977 * @param length the length of valid data in the part file
978 * @param bufsize the buffer size
979 * @throws IOException
980 */
981 public HarFSDataInputStream(FileSystem fs, Path p, long start,
982 long length, int bufsize) throws IOException {
983 super(new HarFsInputStream(fs, p, start, length, bufsize));
984 }
985
986 /**
987 * constructor for har input stream.
988 * @param fs the underlying filesystem
989 * @param p the path in the underlying file system
990 * @param start the start position in the part file
991 * @param length the length of valid data in the part file.
992 * @throws IOException
993 */
994 public HarFSDataInputStream(FileSystem fs, Path p, long start, long length)
995 throws IOException {
996 super(new HarFsInputStream(fs, p, start, length, 0));
997 }
998 }
999
1000 private class HarMetaData {
1001 private FileSystem fs;
1002 private int version;
1003 // the masterIndex of the archive
1004 private Path masterIndexPath;
1005 // the index file
1006 private Path archiveIndexPath;
1007
1008 private long masterIndexTimestamp;
1009 private long archiveIndexTimestamp;
1010
1011 List<Store> stores = new ArrayList<Store>();
1012 Map<Path, HarStatus> archive = new HashMap<Path, HarStatus>();
1013 private Map<Path, FileStatus> partFileStatuses = new HashMap<Path, FileStatus>();
1014
1015 public HarMetaData(FileSystem fs, Path masterIndexPath, Path archiveIndexPath) {
1016 this.fs = fs;
1017 this.masterIndexPath = masterIndexPath;
1018 this.archiveIndexPath = archiveIndexPath;
1019 }
1020
1021 public FileStatus getPartFileStatus(Path partPath) throws IOException {
1022 FileStatus status;
1023 status = partFileStatuses.get(partPath);
1024 if (status == null) {
1025 status = fs.getFileStatus(partPath);
1026 partFileStatuses.put(partPath, status);
1027 }
1028 return status;
1029 }
1030
1031 public long getMasterIndexTimestamp() {
1032 return masterIndexTimestamp;
1033 }
1034
1035 public long getArchiveIndexTimestamp() {
1036 return archiveIndexTimestamp;
1037 }
1038
1039 private int getVersion() {
1040 return version;
1041 }
1042
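    // The layout being parsed (illustrative): the first line of _masterindex
    // holds the version number, and every following line is
    // "startHash endHash begin end", where [begin, end) is a byte range in
    // the _index file. For example:
    //   3
    //   0 1000000 0 2453
    //   1000001 2147483647 2453 4971
    // Each referenced _index range is then read line by line into HarStatus
    // entries.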
1043 private void parseMetaData() throws IOException {
1044 FSDataInputStream in = fs.open(masterIndexPath);
1045 FileStatus masterStat = fs.getFileStatus(masterIndexPath);
1046 masterIndexTimestamp = masterStat.getModificationTime();
1047 LineReader lin = new LineReader(in, getConf());
1048 Text line = new Text();
1049 long read = lin.readLine(line);
1050
1051 // the first line contains the version of the index file
1052 String versionLine = line.toString();
1053 String[] arr = versionLine.split(" ");
1054 version = Integer.parseInt(arr[0]);
      // only versions up to HarFileSystem.VERSION are supported
      // (older archive versions remain readable)
1056 if (this.version > HarFileSystem.VERSION) {
1057 throw new IOException("Invalid version " +
1058 this.version + " expected " + HarFileSystem.VERSION);
1059 }
1060
      // each remaining line contains a hashcode range and the byte range
      // (begin, end) of the corresponding entries in the _index file
1062 String[] readStr = null;
1063 while(read < masterStat.getLen()) {
1064 int b = lin.readLine(line);
1065 read += b;
1066 readStr = line.toString().split(" ");
1067 int startHash = Integer.parseInt(readStr[0]);
1068 int endHash = Integer.parseInt(readStr[1]);
1069 stores.add(new Store(Long.parseLong(readStr[2]),
1070 Long.parseLong(readStr[3]), startHash,
1071 endHash));
1072 line.clear();
1073 }
1074 try {
1075 // close the master index
1076 lin.close();
1077 } catch(IOException io){
1078 // do nothing just a read.
1079 }
1080
1081 FSDataInputStream aIn = fs.open(archiveIndexPath);
1082 FileStatus archiveStat = fs.getFileStatus(archiveIndexPath);
1083 archiveIndexTimestamp = archiveStat.getModificationTime();
1084 LineReader aLin;
1085
1086 // now start reading the real index file
1087 for (Store s: stores) {
1088 read = 0;
1089 aIn.seek(s.begin);
1090 aLin = new LineReader(aIn, getConf());
1091 while (read + s.begin < s.end) {
1092 int tmp = aLin.readLine(line);
1093 read += tmp;
1094 String lineFeed = line.toString();
1095 String[] parsed = lineFeed.split(" ");
1096 parsed[0] = decodeFileName(parsed[0]);
1097 archive.put(new Path(parsed[0]), new HarStatus(lineFeed));
1098 line.clear();
1099 }
1100 }
1101 try {
1102 // close the archive index
1103 aIn.close();
1104 } catch(IOException io) {
1105 // do nothing just a read.
1106 }
1107 }
1108 }
1109 }