001 /**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements. See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership. The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License. You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018 package org.apache.hadoop.fs;
019
020 import java.io.FileNotFoundException;
021 import java.io.IOException;
022 import java.io.UnsupportedEncodingException;
023 import java.net.URI;
024 import java.net.URISyntaxException;
025 import java.net.URLDecoder;
026 import java.util.ArrayList;
027 import java.util.List;
028 import java.util.Map;
029 import java.util.TreeMap;
030 import java.util.HashMap;
031 import java.util.concurrent.ConcurrentHashMap;
032
033 import org.apache.commons.logging.Log;
034 import org.apache.commons.logging.LogFactory;
035 import org.apache.hadoop.conf.Configuration;
036 import org.apache.hadoop.fs.permission.FsPermission;
037 import org.apache.hadoop.io.IOUtils;
038 import org.apache.hadoop.io.Text;
039 import org.apache.hadoop.util.LineReader;
040 import org.apache.hadoop.util.Progressable;
041
/**
 * This is an implementation of the Hadoop Archive (HAR)
 * filesystem. A HAR archive contains index files of the
 * form _index* and data files of the form part-*. The index
 * files (_masterindex and _index) record where each archived
 * file lives inside the part files. The _index file is sorted
 * by the hash code of the paths it contains, and the
 * _masterindex adds a level of indirection: it maps ranges of
 * hash codes to positions in _index, which makes lookups
 * faster.
 */
054
055 public class HarFileSystem extends FilterFileSystem {
056
057 private static final Log LOG = LogFactory.getLog(HarFileSystem.class);
058
059 public static final int VERSION = 3;
060
061 private static final Map<URI, HarMetaData> harMetaCache =
062 new ConcurrentHashMap<URI, HarMetaData>();
063
064 // uri representation of this Har filesystem
065 private URI uri;
066 // the top level path of the archive
067 // in the underlying file system
068 private Path archivePath;
069 // the har auth
070 private String harAuth;
071
072 // pointer into the static metadata cache
073 private HarMetaData metadata;
074
  /**
   * Public no-argument constructor; FileSystem implementations are
   * instantiated reflectively, so a default constructor is required.
   */
079 public HarFileSystem() {
080 }
081
082 /**
083 * Return the protocol scheme for the FileSystem.
   *
086 * @return <code>har</code>
087 */
088 @Override
089 public String getScheme() {
090 return "har";
091 }
092
093 /**
094 * Constructor to create a HarFileSystem with an
095 * underlying filesystem.
   * @param fs the underlying filesystem
097 */
098 public HarFileSystem(FileSystem fs) {
099 super(fs);
100 }
101
  /**
   * Initialize a HarFileSystem for a single har archive. The
   * archive home directory is the top level directory in the
   * underlying filesystem that contains the HAR archive.
   * Be careful with this method: you do not want to end up
   * creating a new FileSystem instance for every call to
   * path.getFileSystem().
   * The URI of a har is either
   * har://underlyingfsscheme-host:port/archivepath
   * or
   * har:///archivepath, in which case the default filesystem
   * from the configuration is used as the underlying filesystem.
   */
115 @Override
116 public void initialize(URI name, Configuration conf) throws IOException {
117 // decode the name
118 URI underLyingURI = decodeHarURI(name, conf);
    // extract the archive path from the name and check that this
    // really is a path into a har filesystem
121 Path harPath = archivePath(
122 new Path(name.getScheme(), name.getAuthority(), name.getPath()));
123 if (harPath == null) {
124 throw new IOException("Invalid path for the Har Filesystem. " +
125 name.toString());
126 }
127 if (fs == null) {
128 fs = FileSystem.get(underLyingURI, conf);
129 }
130 uri = harPath.toUri();
131 archivePath = new Path(uri.getPath());
132 harAuth = getHarAuth(underLyingURI);
    // check that the underlying fs contains
    // the index files
135 Path masterIndexPath = new Path(archivePath, "_masterindex");
136 Path archiveIndexPath = new Path(archivePath, "_index");
137 if (!fs.exists(masterIndexPath) || !fs.exists(archiveIndexPath)) {
138 throw new IOException("Invalid path for the Har Filesystem. " +
139 "No index file in " + harPath);
140 }
141
142 metadata = harMetaCache.get(uri);
143 if (metadata != null) {
144 FileStatus mStat = fs.getFileStatus(masterIndexPath);
145 FileStatus aStat = fs.getFileStatus(archiveIndexPath);
146 if (mStat.getModificationTime() != metadata.getMasterIndexTimestamp() ||
147 aStat.getModificationTime() != metadata.getArchiveIndexTimestamp()) {
148 // the archive has been overwritten since we last read it
149 // remove the entry from the meta data cache
150 metadata = null;
151 harMetaCache.remove(uri);
152 }
153 }
154 if (metadata == null) {
155 metadata = new HarMetaData(fs, masterIndexPath, archiveIndexPath);
156 metadata.parseMetaData();
157 harMetaCache.put(uri, metadata);
158 }
159 }
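  // Illustrative usage (a minimal sketch; the archive path and file name
  // below are hypothetical and only show how a client typically reads
  // from a har filesystem through the ordinary FileSystem API):
  //
  //   Configuration conf = new Configuration();
  //   Path file = new Path("har:///user/alice/logs.har/2013/app.log");
  //   FileSystem harFs = file.getFileSystem(conf);  // a HarFileSystem
  //   FSDataInputStream in = harFs.open(file);
  //   try {
  //     IOUtils.copyBytes(in, System.out, conf, false);
  //   } finally {
  //     in.close();
  //   }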
160
  // get the version of the archive; it is read from the first line
  // of the _masterindex file when the metadata is parsed
164 public int getHarVersion() throws IOException {
165 if (metadata != null) {
166 return metadata.getVersion();
167 }
168 else {
169 throw new IOException("Invalid meta data for the Har Filesystem");
170 }
171 }
172
  /*
   * Find the archive path within the given path: walk up from p and
   * return the deepest ancestor (including p itself) whose last
   * segment ends with ".har", or null if there is none.
   */
179 private Path archivePath(Path p) {
180 Path retPath = null;
181 Path tmp = p;
182 for (int i=0; i< p.depth(); i++) {
183 if (tmp.toString().endsWith(".har")) {
184 retPath = tmp;
185 break;
186 }
187 tmp = tmp.getParent();
188 }
189 return retPath;
190 }
191
  /**
   * Decode the raw har URI to get the URI of the underlying filesystem.
   * @param rawURI raw har URI
   * @param conf configuration, used to look up the default filesystem
   *             when the har URI carries no authority
   * @return URI of the underlying filesystem
   */
197 private URI decodeHarURI(URI rawURI, Configuration conf) throws IOException {
198 String tmpAuth = rawURI.getAuthority();
    // no authority given: use the default filesystem from the
    // configuration, so return its URI as the underlying URI
203 if (tmpAuth == null) {
204 //create a path
205 return FileSystem.getDefaultUri(conf);
206 }
207 String authority = rawURI.getAuthority();
208 if (authority == null) {
209 throw new IOException("URI: " + rawURI
210 + " is an invalid Har URI since authority==null."
211 + " Expecting har://<scheme>-<host>/<path>.");
212 }
213
214 int i = authority.indexOf('-');
215 if (i < 0) {
216 throw new IOException("URI: " + rawURI
217 + " is an invalid Har URI since '-' not found."
218 + " Expecting har://<scheme>-<host>/<path>.");
219 }
220
221 if (rawURI.getQuery() != null) {
222 // query component not allowed
223 throw new IOException("query component in Path not supported " + rawURI);
224 }
225
226 URI tmp = null;
227
228 try {
229 // convert <scheme>-<host> to <scheme>://<host>
230 URI baseUri = new URI(authority.replaceFirst("-", "://"));
231
232 tmp = new URI(baseUri.getScheme(), baseUri.getAuthority(),
233 rawURI.getPath(), rawURI.getQuery(), rawURI.getFragment());
234 } catch (URISyntaxException e) {
235 throw new IOException("URI: " + rawURI
236 + " is an invalid Har URI. Expecting har://<scheme>-<host>/<path>.");
237 }
238 return tmp;
239 }
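  // Illustrative examples of the decoding performed above (the hosts and
  // paths are hypothetical and only document the mapping):
  //
  //   har://hdfs-namenode:8020/user/alice/logs.har
  //       -> underlying URI hdfs://namenode:8020/user/alice/logs.har
  //   har:///user/alice/logs.har
  //       -> the default filesystem URI from the configuration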
240
241 private static String decodeString(String str)
242 throws UnsupportedEncodingException {
243 return URLDecoder.decode(str, "UTF-8");
244 }
245
246 private String decodeFileName(String fname)
247 throws UnsupportedEncodingException {
248 int version = metadata.getVersion();
249 if (version == 2 || version == 3){
250 return decodeString(fname);
251 }
252 return fname;
253 }
254
  /**
   * Return the top level archive path as the working directory.
   */
258 @Override
259 public Path getWorkingDirectory() {
260 return new Path(uri.toString());
261 }
262
  /**
   * Create a har-specific authority of the form
   * underlyingfsscheme-host:port.
   * @param underLyingUri the URI of the underlying filesystem
   * @return har-specific authority
   */
270 private String getHarAuth(URI underLyingUri) {
271 String auth = underLyingUri.getScheme() + "-";
272 if (underLyingUri.getHost() != null) {
273 auth += underLyingUri.getHost() + ":";
274 if (underLyingUri.getPort() != -1) {
275 auth += underLyingUri.getPort();
276 }
277 }
278 else {
279 auth += ":";
280 }
281 return auth;
282 }
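  // For example (illustrative values only): an underlying URI of
  // hdfs://namenode:8020 yields the har authority "hdfs-namenode:8020",
  // while an underlying URI with no host, such as file:///, yields "file-:".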
283
  /**
   * Returns the URI of this filesystem.
   * The URI is of the form
   * har://underlyingfsscheme-host:port/pathintheunderlyingfs
   */
289 @Override
290 public URI getUri() {
291 return this.uri;
292 }
293
  /**
   * Returns the path inside the har filesystem, i.e. the path
   * relative to the root of the archive, or null if the given
   * path does not lie under the archive.
   * @param path the fully qualified path in the har filesystem.
   * @return the path relative to the archive root.
   */
302 private Path getPathInHar(Path path) {
303 Path harPath = new Path(path.toUri().getPath());
304 if (archivePath.compareTo(harPath) == 0)
305 return new Path(Path.SEPARATOR);
306 Path tmp = new Path(harPath.getName());
307 Path parent = harPath.getParent();
308 while (!(parent.compareTo(archivePath) == 0)) {
309 if (parent.toString().equals(Path.SEPARATOR)) {
310 tmp = null;
311 break;
312 }
313 tmp = new Path(parent.getName(), tmp);
314 parent = parent.getParent();
315 }
316 if (tmp != null)
317 tmp = new Path(Path.SEPARATOR, tmp);
318 return tmp;
319 }
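  // Illustrative example (hypothetical paths): with an archivePath of
  // /user/alice/logs.har, the qualified path
  // har://hdfs-namenode:8020/user/alice/logs.har/2013/app.log maps to the
  // in-archive path /2013/app.log, and the archive root itself maps to "/".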
320
  // Turn the in-archive path p (which starts with "/") into a path
  // qualified with this filesystem's scheme, authority and the given
  // initial path. The Path API is used instead of raw string
  // manipulation.
325 private Path makeRelative(String initial, Path p) {
326 String scheme = this.uri.getScheme();
327 String authority = this.uri.getAuthority();
328 Path root = new Path(Path.SEPARATOR);
329 if (root.compareTo(p) == 0)
330 return new Path(scheme, authority, initial);
331 Path retPath = new Path(p.getName());
332 Path parent = p.getParent();
333 for (int i=0; i < p.depth()-1; i++) {
334 retPath = new Path(parent.getName(), retPath);
335 parent = parent.getParent();
336 }
337 return new Path(new Path(scheme, authority, initial),
338 retPath.toString());
339 }
340
341 /* this makes a path qualified in the har filesystem
342 * (non-Javadoc)
343 * @see org.apache.hadoop.fs.FilterFileSystem#makeQualified(
344 * org.apache.hadoop.fs.Path)
345 */
346 @Override
347 public Path makeQualified(Path path) {
348 // make sure that we just get the
349 // path component
350 Path fsPath = path;
351 if (!path.isAbsolute()) {
352 fsPath = new Path(archivePath, path);
353 }
354
355 URI tmpURI = fsPath.toUri();
356 //change this to Har uri
357 return new Path(uri.getScheme(), harAuth, tmpURI.getPath());
358 }
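  // Illustrative example (hypothetical values): with an archivePath of
  // /user/alice/logs.har and a harAuth of "hdfs-namenode:8020", the
  // relative path "2013/app.log" is qualified as
  // har://hdfs-namenode:8020/user/alice/logs.har/2013/app.log.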
359
360 /**
361 * Fix offset and length of block locations.
362 * Note that this method modifies the original array.
363 * @param locations block locations of har part file
364 * @param start the start of the desired range in the contained file
365 * @param len the length of the desired range
366 * @param fileOffsetInHar the offset of the desired file in the har part file
367 * @return block locations with fixed offset and length
368 */
369 static BlockLocation[] fixBlockLocations(BlockLocation[] locations,
370 long start,
371 long len,
372 long fileOffsetInHar) {
373 // offset 1 past last byte of desired range
374 long end = start + len;
375
376 for (BlockLocation location : locations) {
377 // offset of part block relative to beginning of desired file
378 // (may be negative if file starts in this part block)
379 long harBlockStart = location.getOffset() - fileOffsetInHar;
380 // offset 1 past last byte of har block relative to beginning of
381 // desired file
382 long harBlockEnd = harBlockStart + location.getLength();
383
384 if (start > harBlockStart) {
385 // desired range starts after beginning of this har block
386 // fix offset to beginning of relevant range (relative to desired file)
387 location.setOffset(start);
388 // fix length to relevant portion of har block
389 location.setLength(location.getLength() - (start - harBlockStart));
390 } else {
391 // desired range includes beginning of this har block
392 location.setOffset(harBlockStart);
393 }
394
395 if (harBlockEnd > end) {
396 // range ends before end of this har block
397 // fix length to remove irrelevant portion at the end
398 location.setLength(location.getLength() - (harBlockEnd - end));
399 }
400 }
401
402 return locations;
403 }
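  // A worked example with illustrative numbers: suppose the archived file
  // starts at byte 1000 of the part file (fileOffsetInHar = 1000) and the
  // caller asks for start = 0, len = 300, so end = 300. For a part-file
  // block at offset 512 with length 1024:
  //   harBlockStart = 512 - 1000  = -488
  //   harBlockEnd   = -488 + 1024 =  536
  // Since start (0) > harBlockStart (-488), the offset becomes 0 and the
  // length shrinks to 1024 - 488 = 536; since harBlockEnd (536) > end (300),
  // the length is trimmed again to 300. The caller therefore sees a block
  // covering exactly bytes [0, 300) of the archived file.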
404
405 /**
406 * Get block locations from the underlying fs and fix their
407 * offsets and lengths.
408 * @param file the input filestatus to get block locations
409 * @param start the start of the desired range in the contained file
410 * @param len the length of the desired range
411 * @return block locations for this segment of file
412 * @throws IOException
413 */
414 @Override
415 public BlockLocation[] getFileBlockLocations(FileStatus file, long start,
416 long len) throws IOException {
417 HarStatus hstatus = getFileHarStatus(file.getPath());
418 Path partPath = new Path(archivePath, hstatus.getPartName());
419 FileStatus partStatus = metadata.getPartFileStatus(partPath);
420
421 // get all part blocks that overlap with the desired file blocks
422 BlockLocation[] locations =
423 fs.getFileBlockLocations(partStatus,
424 hstatus.getStartIndex() + start, len);
425
426 return fixBlockLocations(locations, start, len, hstatus.getStartIndex());
427 }
428
  /**
   * The hash of the path p inside the har filesystem.
   * @param p the path in the har filesystem
   * @return the hash code of the path.
   */
435 public static int getHarHash(Path p) {
436 return (p.toString().hashCode() & 0x7fffffff);
437 }
438
439 static class Store {
440 public Store() {
441 begin = end = startHash = endHash = 0;
442 }
443 public Store(long begin, long end, int startHash, int endHash) {
444 this.begin = begin;
445 this.end = end;
446 this.startHash = startHash;
447 this.endHash = endHash;
448 }
449 public long begin;
450 public long end;
451 public int startHash;
452 public int endHash;
453 }
454
  /**
   * Get filestatuses of all the children of a given directory. This scans
   * every entry in the parsed index and keeps those whose path is an
   * immediate child of the directory; it is a brute-force way of getting
   * all such filestatuses.
   *
   * @param parent
   *          the index status of the parent directory
   * @param statuses
   *          the list to add the children filestatuses to
   * @param children
   *          the string list of children for this parent
   */
469 private void fileStatusesInIndex(HarStatus parent, List<FileStatus> statuses,
470 List<String> children) throws IOException {
471 String parentString = parent.getName();
472 if (!parentString.endsWith(Path.SEPARATOR)){
473 parentString += Path.SEPARATOR;
474 }
475 Path harPath = new Path(parentString);
476 int harlen = harPath.depth();
477 final Map<String, FileStatus> cache = new TreeMap<String, FileStatus>();
478
479 for (HarStatus hstatus : metadata.archive.values()) {
480 String child = hstatus.getName();
481 if ((child.startsWith(parentString))) {
482 Path thisPath = new Path(child);
483 if (thisPath.depth() == harlen + 1) {
484 statuses.add(toFileStatus(hstatus, cache));
485 }
486 }
487 }
488 }
489
490 /**
491 * Combine the status stored in the index and the underlying status.
492 * @param h status stored in the index
493 * @param cache caching the underlying file statuses
494 * @return the combined file status
495 * @throws IOException
496 */
497 private FileStatus toFileStatus(HarStatus h,
498 Map<String, FileStatus> cache) throws IOException {
499 FileStatus underlying = null;
500 if (cache != null) {
501 underlying = cache.get(h.partName);
502 }
503 if (underlying == null) {
504 final Path p = h.isDir? archivePath: new Path(archivePath, h.partName);
505 underlying = fs.getFileStatus(p);
506 if (cache != null) {
507 cache.put(h.partName, underlying);
508 }
509 }
510
511 long modTime = 0;
512 int version = metadata.getVersion();
513 if (version < 3) {
514 modTime = underlying.getModificationTime();
515 } else if (version == 3) {
516 modTime = h.getModificationTime();
517 }
518
519 return new FileStatus(
520 h.isDir()? 0L: h.getLength(),
521 h.isDir(),
522 underlying.getReplication(),
523 underlying.getBlockSize(),
524 modTime,
525 underlying.getAccessTime(),
526 underlying.getPermission(),
527 underlying.getOwner(),
528 underlying.getGroup(),
529 makeRelative(this.uri.getPath(), new Path(h.name)));
530 }
531
  // A parser for the status of an archived file or directory, stored as a
  // single line in the index files. The format is
  // filename "dir"/"file" partFileName startIndex length
  // <space separated children>
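  // Illustrative (made-up) version 3 index lines; the trailing encoded
  // field holds "modificationTime permission owner group":
  //
  //   %2Flogs%2Fapp.log file part-0 0 1048576 1382000000000+420+alice+hadoop
  //   %2Flogs dir 1382000000000+493+alice+hadoop 0 0 app.log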
537 private class HarStatus {
538 boolean isDir;
539 String name;
540 List<String> children;
541 String partName;
542 long startIndex;
543 long length;
544 long modificationTime = 0;
545
546 public HarStatus(String harString) throws UnsupportedEncodingException {
547 String[] splits = harString.split(" ");
548 this.name = decodeFileName(splits[0]);
      this.isDir = "dir".equals(splits[1]);
      // for a directory this is "none" (or, in version 3, the encoded
      // properties string described below)
551 this.partName = splits[2];
552 this.startIndex = Long.parseLong(splits[3]);
553 this.length = Long.parseLong(splits[4]);
554
555 int version = metadata.getVersion();
556 String[] propSplits = null;
557 // propSplits is used to retrieve the metainformation that Har versions
558 // 1 & 2 missed (modification time, permission, owner group).
559 // These fields are stored in an encoded string placed in different
560 // locations depending on whether it's a file or directory entry.
561 // If it's a directory, the string will be placed at the partName
562 // location (directories have no partName because they don't have data
563 // to be stored). This is done because the number of fields in a
564 // directory entry is unbounded (all children are listed at the end)
565 // If it's a file, the string will be the last field.
566 if (isDir) {
567 if (version == 3){
568 propSplits = decodeString(this.partName).split(" ");
569 }
570 children = new ArrayList<String>();
571 for (int i = 5; i < splits.length; i++) {
572 children.add(decodeFileName(splits[i]));
573 }
574 } else if (version == 3) {
575 propSplits = decodeString(splits[5]).split(" ");
576 }
577
578 if (propSplits != null && propSplits.length >= 4) {
579 modificationTime = Long.parseLong(propSplits[0]);
580 // the fields below are stored in the file but are currently not used
581 // by HarFileSystem
582 // permission = new FsPermission(Short.parseShort(propSplits[1]));
583 // owner = decodeString(propSplits[2]);
584 // group = decodeString(propSplits[3]);
585 }
586 }
587 public boolean isDir() {
588 return isDir;
589 }
590
591 public String getName() {
592 return name;
593 }
594 public String getPartName() {
595 return partName;
596 }
597 public long getStartIndex() {
598 return startIndex;
599 }
600 public long getLength() {
601 return length;
602 }
603 public long getModificationTime() {
604 return modificationTime;
605 }
606 }
607
  /**
   * Return the filestatus of a file in the har archive.
   * The permissions returned are those of the underlying archive
   * files; the original permissions are recorded in version 3
   * archives but are not currently surfaced by this filesystem.
   * @param f the path in the har filesystem
   * @return filestatus.
   * @throws IOException
   */
617 @Override
618 public FileStatus getFileStatus(Path f) throws IOException {
619 HarStatus hstatus = getFileHarStatus(f);
620 return toFileStatus(hstatus, null);
621 }
622
623 private HarStatus getFileHarStatus(Path f) throws IOException {
    // look the path up in the parsed index
626 Path p = makeQualified(f);
627 Path harPath = getPathInHar(p);
628 if (harPath == null) {
629 throw new IOException("Invalid file name: " + f + " in " + uri);
630 }
631 HarStatus hstatus = metadata.archive.get(harPath);
632 if (hstatus == null) {
633 throw new FileNotFoundException("File: " + f + " does not exist in " + uri);
634 }
635 return hstatus;
636 }
637
638 /**
639 * @return null since no checksum algorithm is implemented.
640 */
641 @Override
642 public FileChecksum getFileChecksum(Path f) {
643 return null;
644 }
645
  /**
   * Returns a har input stream that fakes end of file: it looks up the
   * index to get the part file name and the offset and length of the
   * archived file within it, and only exposes that byte range.
   */
651 @Override
652 public FSDataInputStream open(Path f, int bufferSize) throws IOException {
653 // get the fs DataInputStream for the underlying file
654 HarStatus hstatus = getFileHarStatus(f);
655 // we got it.. woo hooo!!!
656 if (hstatus.isDir()) {
657 throw new FileNotFoundException(f + " : not a file in " +
658 archivePath);
659 }
660 return new HarFSDataInputStream(fs, new Path(archivePath,
661 hstatus.getPartName()),
662 hstatus.getStartIndex(), hstatus.getLength(), bufferSize);
663 }
664
665 @Override
666 public FSDataOutputStream create(Path f,
667 FsPermission permission,
668 boolean overwrite,
669 int bufferSize,
670 short replication,
671 long blockSize,
672 Progressable progress) throws IOException {
673 throw new IOException("Har: create not allowed.");
674 }
675
676 @Override
677 public void close() throws IOException {
678 if (fs != null) {
679 try {
680 fs.close();
681 } catch(IOException ie) {
682 //this might already be closed
683 // ignore
684 }
685 }
686 }
687
688 /**
689 * Not implemented.
690 */
691 @Override
692 public boolean setReplication(Path src, short replication) throws IOException{
693 throw new IOException("Har: setreplication not allowed");
694 }
695
696 /**
697 * Not implemented.
698 */
699 @Override
700 public boolean delete(Path f, boolean recursive) throws IOException {
701 throw new IOException("Har: delete not allowed");
702 }
703
704 /**
   * listStatus returns the children of a directory
706 * after looking up the index files.
707 */
708 @Override
709 public FileStatus[] listStatus(Path f) throws IOException {
    // look the path up in the parsed index and build synthetic
    // filestatuses for the directory's children to return to the client
714 List<FileStatus> statuses = new ArrayList<FileStatus>();
715 Path tmpPath = makeQualified(f);
716 Path harPath = getPathInHar(tmpPath);
717 HarStatus hstatus = metadata.archive.get(harPath);
718 if (hstatus == null) {
719 throw new FileNotFoundException("File " + f + " not found in " + archivePath);
720 }
721 if (hstatus.isDir()) {
722 fileStatusesInIndex(hstatus, statuses, hstatus.children);
723 } else {
724 statuses.add(toFileStatus(hstatus, null));
725 }
726
727 return statuses.toArray(new FileStatus[statuses.size()]);
728 }
729
730 /**
731 * return the top level archive path.
732 */
733 @Override
734 public Path getHomeDirectory() {
735 return new Path(uri.toString());
736 }
737
738 @Override
739 public void setWorkingDirectory(Path newDir) {
740 //does nothing.
741 }
742
743 /**
744 * not implemented.
745 */
746 @Override
747 public boolean mkdirs(Path f, FsPermission permission) throws IOException {
748 throw new IOException("Har: mkdirs not allowed");
749 }
750
751 /**
752 * not implemented.
753 */
754 @Override
755 public void copyFromLocalFile(boolean delSrc, Path src, Path dst) throws
756 IOException {
757 throw new IOException("Har: copyfromlocalfile not allowed");
758 }
759
760 /**
761 * copies the file in the har filesystem to a local file.
762 */
763 @Override
764 public void copyToLocalFile(boolean delSrc, Path src, Path dst)
765 throws IOException {
766 FileUtil.copy(this, src, getLocal(getConf()), dst, false, getConf());
767 }
768
769 /**
770 * not implemented.
771 */
772 @Override
773 public Path startLocalOutput(Path fsOutputFile, Path tmpLocalFile)
774 throws IOException {
775 throw new IOException("Har: startLocalOutput not allowed");
776 }
777
778 /**
779 * not implemented.
780 */
781 @Override
782 public void completeLocalOutput(Path fsOutputFile, Path tmpLocalFile)
783 throws IOException {
784 throw new IOException("Har: completeLocalOutput not allowed");
785 }
786
787 /**
788 * not implemented.
789 */
790 @Override
791 public void setOwner(Path p, String username, String groupname)
792 throws IOException {
793 throw new IOException("Har: setowner not allowed");
794 }
795
796 /**
797 * Not implemented.
798 */
799 @Override
800 public void setPermission(Path p, FsPermission permisssion)
801 throws IOException {
802 throw new IOException("Har: setPermission not allowed");
803 }
804
805 /**
806 * Hadoop archives input stream. This input stream fakes EOF
807 * since archive files are part of bigger part files.
808 */
809 private static class HarFSDataInputStream extends FSDataInputStream {
810 /**
811 * Create an input stream that fakes all the reads/positions/seeking.
812 */
813 private static class HarFsInputStream extends FSInputStream {
814 private long position, start, end;
815 //The underlying data input stream that the
816 // underlying filesystem will return.
817 private FSDataInputStream underLyingStream;
818 //one byte buffer
819 private byte[] oneBytebuff = new byte[1];
820 HarFsInputStream(FileSystem fs, Path path, long start,
821 long length, int bufferSize) throws IOException {
822 underLyingStream = fs.open(path, bufferSize);
823 underLyingStream.seek(start);
824 // the start of this file in the part file
825 this.start = start;
826 // the position pointer in the part file
827 this.position = start;
828 // the end pointer in the part file
829 this.end = start + length;
830 }
831
832 @Override
833 public synchronized int available() throws IOException {
834 long remaining = end - underLyingStream.getPos();
835 if (remaining > (long)Integer.MAX_VALUE) {
836 return Integer.MAX_VALUE;
837 }
838 return (int) remaining;
839 }
840
841 @Override
842 public synchronized void close() throws IOException {
843 underLyingStream.close();
844 super.close();
845 }
846
847 //not implemented
848 @Override
849 public void mark(int readLimit) {
850 // do nothing
851 }
852
853 /**
854 * reset is not implemented
855 */
856 @Override
857 public void reset() throws IOException {
858 throw new IOException("reset not implemented.");
859 }
860
861 @Override
862 public synchronized int read() throws IOException {
863 int ret = read(oneBytebuff, 0, 1);
864 return (ret <= 0) ? -1: (oneBytebuff[0] & 0xff);
865 }
866
      @Override
      public synchronized int read(byte[] b) throws IOException {
        // delegate to the ranged read, which already advances position;
        // incrementing position again here would double-count the bytes
        return read(b, 0, b.length);
      }
875
      /**
       * Read up to len bytes, never reading past the logical end of the
       * archived file within the part file.
       */
      @Override
      public synchronized int read(byte[] b, int offset, int len)
          throws IOException {
        int newlen = len;
        int ret = -1;
        if (position + len > end) {
          // clamp the read so it does not run into the next archived file
          newlen = (int) (end - position);
        }
        // end case: nothing left of this archived file
        if (newlen == 0)
          return ret;
        ret = underLyingStream.read(b, offset, newlen);
        // only advance when bytes were actually read; the underlying read
        // returns -1 at end of stream
        if (ret > 0) {
          position += ret;
        }
        return ret;
      }
894
895 @Override
896 public synchronized long skip(long n) throws IOException {
897 long tmpN = n;
898 if (tmpN > 0) {
899 if (position + tmpN > end) {
900 tmpN = end - position;
901 }
902 underLyingStream.seek(tmpN + position);
903 position += tmpN;
904 return tmpN;
905 }
906 return (tmpN < 0)? -1 : 0;
907 }
908
909 @Override
910 public synchronized long getPos() throws IOException {
911 return (position - start);
912 }
913
914 @Override
915 public synchronized void seek(long pos) throws IOException {
916 if (pos < 0 || (start + pos > end)) {
917 throw new IOException("Failed to seek: EOF");
918 }
919 position = start + pos;
920 underLyingStream.seek(position);
921 }
922
923 @Override
924 public boolean seekToNewSource(long targetPos) throws IOException {
        // no need to implement this: the underlying filesystem (e.g. HDFS)
        // already switches to a new source internally while reading
928 return false;
929 }
930
      /**
       * Positioned read: read from an absolute offset within the archived file.
       */
934 @Override
935 public int read(long pos, byte[] b, int offset, int length)
936 throws IOException {
937 int nlength = length;
938 if (start + nlength + pos > end) {
939 nlength = (int) (end - (start + pos));
940 }
941 return underLyingStream.read(pos + start , b, offset, nlength);
942 }
943
      /**
       * Positioned read that fills the requested range or fails.
       */
947 @Override
948 public void readFully(long pos, byte[] b, int offset, int length)
949 throws IOException {
950 if (start + length + pos > end) {
951 throw new IOException("Not enough bytes to read.");
952 }
953 underLyingStream.readFully(pos + start, b, offset, length);
954 }
955
956 @Override
957 public void readFully(long pos, byte[] b) throws IOException {
958 readFully(pos, b, 0, b.length);
959 }
960
961 }
962
963 /**
     * Constructor for har input stream.
965 * @param fs the underlying filesystem
966 * @param p The path in the underlying filesystem
967 * @param start the start position in the part file
968 * @param length the length of valid data in the part file
969 * @param bufsize the buffer size
970 * @throws IOException
971 */
972 public HarFSDataInputStream(FileSystem fs, Path p, long start,
973 long length, int bufsize) throws IOException {
974 super(new HarFsInputStream(fs, p, start, length, bufsize));
975 }
976
977 /**
978 * constructor for har input stream.
979 * @param fs the underlying filesystem
980 * @param p the path in the underlying file system
981 * @param start the start position in the part file
982 * @param length the length of valid data in the part file.
983 * @throws IOException
984 */
985 public HarFSDataInputStream(FileSystem fs, Path p, long start, long length)
986 throws IOException {
987 super(new HarFsInputStream(fs, p, start, length, 0));
988 }
989 }
990
991 private class HarMetaData {
992 private FileSystem fs;
993 private int version;
994 // the masterIndex of the archive
995 private Path masterIndexPath;
996 // the index file
997 private Path archiveIndexPath;
998
999 private long masterIndexTimestamp;
1000 private long archiveIndexTimestamp;
1001
1002 List<Store> stores = new ArrayList<Store>();
1003 Map<Path, HarStatus> archive = new HashMap<Path, HarStatus>();
1004 private Map<Path, FileStatus> partFileStatuses = new HashMap<Path, FileStatus>();
1005
1006 public HarMetaData(FileSystem fs, Path masterIndexPath, Path archiveIndexPath) {
1007 this.fs = fs;
1008 this.masterIndexPath = masterIndexPath;
1009 this.archiveIndexPath = archiveIndexPath;
1010 }
1011
1012 public FileStatus getPartFileStatus(Path partPath) throws IOException {
1013 FileStatus status;
1014 status = partFileStatuses.get(partPath);
1015 if (status == null) {
1016 status = fs.getFileStatus(partPath);
1017 partFileStatuses.put(partPath, status);
1018 }
1019 return status;
1020 }
1021
1022 public long getMasterIndexTimestamp() {
1023 return masterIndexTimestamp;
1024 }
1025
1026 public long getArchiveIndexTimestamp() {
1027 return archiveIndexTimestamp;
1028 }
1029
1030 private int getVersion() {
1031 return version;
1032 }
1033
1034 private void parseMetaData() throws IOException {
1035 Text line;
1036 long read;
1037 FSDataInputStream in = null;
1038 LineReader lin = null;
1039
1040 try {
1041 in = fs.open(masterIndexPath);
1042 FileStatus masterStat = fs.getFileStatus(masterIndexPath);
1043 masterIndexTimestamp = masterStat.getModificationTime();
1044 lin = new LineReader(in, getConf());
1045 line = new Text();
1046 read = lin.readLine(line);
1047
1048 // the first line contains the version of the index file
1049 String versionLine = line.toString();
1050 String[] arr = versionLine.split(" ");
1051 version = Integer.parseInt(arr[0]);
        // stay backwards-compatible: allow older archive versions but
        // reject archives newer than this filesystem understands
1053 if (this.version > HarFileSystem.VERSION) {
1054 throw new IOException("Invalid version " +
1055 this.version + " expected " + HarFileSystem.VERSION);
1056 }
1057
        // each remaining line contains a hashcode range followed by the
        // begin and end byte offsets of that range within the _index file
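        // Illustrative (made-up) _masterindex contents: the first line is
        // the version, each following line is "startHash endHash begin end":
        //
        //   3
        //   0 536870911 0 8192
        //   536870912 1073741823 8192 16384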
1059 String[] readStr = null;
1060 while(read < masterStat.getLen()) {
1061 int b = lin.readLine(line);
1062 read += b;
1063 readStr = line.toString().split(" ");
1064 int startHash = Integer.parseInt(readStr[0]);
1065 int endHash = Integer.parseInt(readStr[1]);
1066 stores.add(new Store(Long.parseLong(readStr[2]),
1067 Long.parseLong(readStr[3]), startHash,
1068 endHash));
1069 line.clear();
1070 }
1071 } finally {
1072 IOUtils.cleanup(LOG, lin, in);
1073 }
1074
1075 FSDataInputStream aIn = fs.open(archiveIndexPath);
1076 try {
1077 FileStatus archiveStat = fs.getFileStatus(archiveIndexPath);
1078 archiveIndexTimestamp = archiveStat.getModificationTime();
1079 LineReader aLin;
1080
1081 // now start reading the real index file
1082 for (Store s: stores) {
1083 read = 0;
1084 aIn.seek(s.begin);
1085 aLin = new LineReader(aIn, getConf());
1086 while (read + s.begin < s.end) {
1087 int tmp = aLin.readLine(line);
1088 read += tmp;
1089 String lineFeed = line.toString();
1090 String[] parsed = lineFeed.split(" ");
1091 parsed[0] = decodeFileName(parsed[0]);
1092 archive.put(new Path(parsed[0]), new HarStatus(lineFeed));
1093 line.clear();
1094 }
1095 }
1096 } finally {
1097 IOUtils.cleanup(LOG, aIn);
1098 }
1099 }
1100 }
1101
1102 /*
1103 * testing purposes only:
1104 */
1105 HarMetaData getMetadata() {
1106 return metadata;
1107 }
1108 }