001 /**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements. See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership. The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License. You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018 package org.apache.hadoop.fs;
019
020 import java.io.FileNotFoundException;
021 import java.io.IOException;
022 import java.io.UnsupportedEncodingException;
023 import java.net.URI;
024 import java.net.URISyntaxException;
025 import java.net.URLDecoder;
026 import java.util.ArrayList;
027 import java.util.List;
028 import java.util.Map;
029 import java.util.TreeMap;
030 import java.util.HashMap;
031 import java.util.concurrent.ConcurrentHashMap;
032
033 import org.apache.commons.logging.Log;
034 import org.apache.commons.logging.LogFactory;
035 import org.apache.hadoop.conf.Configuration;
036 import org.apache.hadoop.fs.permission.FsPermission;
037 import org.apache.hadoop.io.IOUtils;
038 import org.apache.hadoop.io.Text;
039 import org.apache.hadoop.util.LineReader;
040 import org.apache.hadoop.util.Progressable;
041
/**
 * This is an implementation of the Hadoop Archive
 * Filesystem. This archive Filesystem keeps index files
 * of the form _index* and stores its contents in files of
 * the form part-*. The index files record where the
 * archived files live inside the part files. There are two
 * index files, _masterindex and _index. The master index is
 * a level of indirection into the index file that makes
 * lookups faster: the index file is sorted by the hash code
 * of the paths it contains, and the master index maps
 * ranges of hash codes to positions in the index file.
 */
054
055 public class HarFileSystem extends FilterFileSystem {
056
057 private static final Log LOG = LogFactory.getLog(HarFileSystem.class);
058
059 public static final int VERSION = 3;
060
061 private static final Map<URI, HarMetaData> harMetaCache =
062 new ConcurrentHashMap<URI, HarMetaData>();
063
064 // uri representation of this Har filesystem
065 private URI uri;
066 // the top level path of the archive
067 // in the underlying file system
068 private Path archivePath;
069 // the har auth
070 private String harAuth;
071
072 // pointer into the static metadata cache
073 private HarMetaData metadata;
074
/**
 * Public default constructor of HarFileSystem.
 */
079 public HarFileSystem() {
080 }
081
/**
 * Return the protocol scheme for the FileSystem.
 *
 * @return <code>har</code>
 */
088 @Override
089 public String getScheme() {
090 return "har";
091 }
092
/**
 * Constructor to create a HarFileSystem on top of an
 * underlying filesystem.
 * @param fs the underlying filesystem
 */
098 public HarFileSystem(FileSystem fs) {
099 super(fs);
100 }
101
/**
 * Initialize a Har filesystem per har archive. The
 * archive home directory is the top level directory
 * in the filesystem that contains the HAR archive.
 * Be careful with this method: you do not want to keep
 * creating new FileSystem instances on every call to
 * path.getFileSystem().
 * The har URI is either
 * har://underlyingfsscheme-host:port/archivepath
 * or
 * har:///archivepath, in which case the default filesystem
 * from the configuration is used as the underlying filesystem.
 */
115 @Override
116 public void initialize(URI name, Configuration conf) throws IOException {
117 // decode the name
118 URI underLyingURI = decodeHarURI(name, conf);
119 // we got the right har Path- now check if this is
120 // truly a har filesystem
121 Path harPath = archivePath(
122 new Path(name.getScheme(), name.getAuthority(), name.getPath()));
123 if (harPath == null) {
124 throw new IOException("Invalid path for the Har Filesystem. " +
125 name.toString());
126 }
127 if (fs == null) {
128 fs = FileSystem.get(underLyingURI, conf);
129 }
130 uri = harPath.toUri();
131 archivePath = new Path(uri.getPath());
132 harAuth = getHarAuth(underLyingURI);
133 //check for the underlying fs containing
134 // the index file
135 Path masterIndexPath = new Path(archivePath, "_masterindex");
136 Path archiveIndexPath = new Path(archivePath, "_index");
137 if (!fs.exists(masterIndexPath) || !fs.exists(archiveIndexPath)) {
138 throw new IOException("Invalid path for the Har Filesystem. " +
139 "No index file in " + harPath);
140 }
141
142 metadata = harMetaCache.get(uri);
143 if (metadata != null) {
144 FileStatus mStat = fs.getFileStatus(masterIndexPath);
145 FileStatus aStat = fs.getFileStatus(archiveIndexPath);
146 if (mStat.getModificationTime() != metadata.getMasterIndexTimestamp() ||
147 aStat.getModificationTime() != metadata.getArchiveIndexTimestamp()) {
148 // the archive has been overwritten since we last read it
149 // remove the entry from the meta data cache
150 metadata = null;
151 harMetaCache.remove(uri);
152 }
153 }
154 if (metadata == null) {
155 metadata = new HarMetaData(fs, masterIndexPath, archiveIndexPath);
156 metadata.parseMetaData();
157 harMetaCache.put(uri, metadata);
158 }
159 }
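// Illustrative usage sketch (not part of the implementation): clients
// normally obtain a HarFileSystem through the FileSystem factory methods
// rather than by calling initialize() directly. The host and archive path
// below are hypothetical.
//
//   Configuration conf = new Configuration();
//   Path p = new Path("har://hdfs-namenode:8020/user/foo.har/dir/file");
//   FileSystem harFs = p.getFileSystem(conf);  // resolves to a HarFileSystem
//   FSDataInputStream in = harFs.open(p);      // reads the bytes from the part file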
160
// get the version of the filesystem from the masterindex file
// the version is currently not useful since it's the first version
// of archives
164 public int getHarVersion() throws IOException {
165 if (metadata != null) {
166 return metadata.getVersion();
167 }
168 else {
169 throw new IOException("Invalid meta data for the Har Filesystem");
170 }
171 }
172
/*
 * Find the ancestor of the given path that is the
 * archive path: walking up from p, the first prefix
 * whose last segment ends with .har is returned,
 * or null if there is none.
 */
179 private Path archivePath(Path p) {
180 Path retPath = null;
181 Path tmp = p;
182 for (int i=0; i< p.depth(); i++) {
183 if (tmp.toString().endsWith(".har")) {
184 retPath = tmp;
185 break;
186 }
187 tmp = tmp.getParent();
188 }
189 return retPath;
190 }
191
/**
 * Decode the raw URI to get the URI of the underlying filesystem.
 * @param rawURI raw Har URI
 * @param conf the configuration, used to find the default filesystem
 * @return filtered URI of the underlying fileSystem
 */
197 private URI decodeHarURI(URI rawURI, Configuration conf) throws IOException {
198 String tmpAuth = rawURI.getAuthority();
//no authority given, so use the default
//filesystem from the config as the
//underlying uri and return it
if (tmpAuth == null) {
return FileSystem.getDefaultUri(conf);
}
207 String host = rawURI.getHost();
208 if (host == null) {
209 throw new IOException("URI: " + rawURI
210 + " is an invalid Har URI since host==null."
211 + " Expecting har://<scheme>-<host>/<path>.");
212 }
213 int i = host.indexOf('-');
214 if (i < 0) {
215 throw new IOException("URI: " + rawURI
216 + " is an invalid Har URI since '-' not found."
217 + " Expecting har://<scheme>-<host>/<path>.");
218 }
219 final String underLyingScheme = host.substring(0, i);
220 i++;
221 final String underLyingHost = i == host.length()? null: host.substring(i);
222 int underLyingPort = rawURI.getPort();
223 String auth = (underLyingHost == null && underLyingPort == -1)?
224 null:(underLyingHost+
225 (underLyingPort == -1 ? "" : ":"+underLyingPort));
226 URI tmp = null;
227 if (rawURI.getQuery() != null) {
228 // query component not allowed
229 throw new IOException("query component in Path not supported " + rawURI);
230 }
try {
tmp = new URI(underLyingScheme, auth, rawURI.getPath(),
rawURI.getQuery(), rawURI.getFragment());
} catch (URISyntaxException e) {
// rethrow rather than silently returning null, which would only
// surface later as a NullPointerException in the caller
throw new IOException("URI: " + rawURI + " is an invalid Har URI.");
}
237 return tmp;
238 }
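// Worked example (hypothetical host and path) of the mapping performed by
// decodeHarURI above:
//   har://hdfs-namenode:8020/user/foo.har  ->  hdfs://namenode:8020/user/foo.har
//   har:///user/foo.har                    ->  the default filesystem URI from the config
// The underlying scheme and host are joined by '-' in the har authority,
// and the port is carried over unchanged.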
239
240 private static String decodeString(String str)
241 throws UnsupportedEncodingException {
242 return URLDecoder.decode(str, "UTF-8");
243 }
244
245 private String decodeFileName(String fname)
246 throws UnsupportedEncodingException {
247 int version = metadata.getVersion();
248 if (version == 2 || version == 3){
249 return decodeString(fname);
250 }
251 return fname;
252 }
253
/**
 * Return the top level archive path as the working directory.
 */
257 @Override
258 public Path getWorkingDirectory() {
259 return new Path(uri.toString());
260 }
261
/**
 * Create a har specific authority of the form
 * underlyingscheme-host:port (for example hdfs-namenode:8020).
 * @param underLyingUri the uri of the underlying
 * filesystem
 * @return har specific auth
 */
269 private String getHarAuth(URI underLyingUri) {
270 String auth = underLyingUri.getScheme() + "-";
271 if (underLyingUri.getHost() != null) {
272 auth += underLyingUri.getHost() + ":";
273 if (underLyingUri.getPort() != -1) {
274 auth += underLyingUri.getPort();
275 }
276 }
277 else {
278 auth += ":";
279 }
280 return auth;
281 }
282
/**
 * Returns the uri of this filesystem.
 * The uri is of the form
 * har://underlyingfsscheme-host:port/pathintheunderlyingfs
 */
288 @Override
289 public URI getUri() {
290 return this.uri;
291 }
292
/**
 * Returns the path inside the har filesystem, relative
 * to the root of the archive, or null if the given path
 * is not inside this archive.
 * @param path the fully qualified path in the har filesystem.
 * @return relative path in the har filesystem.
 */
301 private Path getPathInHar(Path path) {
302 Path harPath = new Path(path.toUri().getPath());
303 if (archivePath.compareTo(harPath) == 0)
304 return new Path(Path.SEPARATOR);
305 Path tmp = new Path(harPath.getName());
306 Path parent = harPath.getParent();
307 while (!(parent.compareTo(archivePath) == 0)) {
308 if (parent.toString().equals(Path.SEPARATOR)) {
309 tmp = null;
310 break;
311 }
312 tmp = new Path(parent.getName(), tmp);
313 parent = parent.getParent();
314 }
315 if (tmp != null)
316 tmp = new Path(Path.SEPARATOR, tmp);
317 return tmp;
318 }
319
// Resolve p (a path rooted at "/") against the given initial
// path in this har filesystem. Parsing and doing string
// manipulation is error prone, so just use the Path API.
324 private Path makeRelative(String initial, Path p) {
325 String scheme = this.uri.getScheme();
326 String authority = this.uri.getAuthority();
327 Path root = new Path(Path.SEPARATOR);
328 if (root.compareTo(p) == 0)
329 return new Path(scheme, authority, initial);
330 Path retPath = new Path(p.getName());
331 Path parent = p.getParent();
332 for (int i=0; i < p.depth()-1; i++) {
333 retPath = new Path(parent.getName(), retPath);
334 parent = parent.getParent();
335 }
336 return new Path(new Path(scheme, authority, initial),
337 retPath.toString());
338 }
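// Worked example (hypothetical archive at /user/foo.har on the underlying
// filesystem) of how the two path helpers relate, for the archived entry
// /dir/file:
//   getPathInHar(har://hdfs-namenode:8020/user/foo.har/dir/file) -> /dir/file
//   makeRelative("/user/foo.har", new Path("/dir/file"))
//       -> har://hdfs-namenode:8020/user/foo.har/dir/file
// i.e. they convert between a path inside the archive and the fully
// qualified har path.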
339
340 /* this makes a path qualified in the har filesystem
341 * (non-Javadoc)
342 * @see org.apache.hadoop.fs.FilterFileSystem#makeQualified(
343 * org.apache.hadoop.fs.Path)
344 */
345 @Override
346 public Path makeQualified(Path path) {
347 // make sure that we just get the
348 // path component
349 Path fsPath = path;
350 if (!path.isAbsolute()) {
351 fsPath = new Path(archivePath, path);
352 }
353
354 URI tmpURI = fsPath.toUri();
355 //change this to Har uri
356 return new Path(uri.getScheme(), harAuth, tmpURI.getPath());
357 }
358
359 /**
360 * Fix offset and length of block locations.
361 * Note that this method modifies the original array.
362 * @param locations block locations of har part file
363 * @param start the start of the desired range in the contained file
364 * @param len the length of the desired range
365 * @param fileOffsetInHar the offset of the desired file in the har part file
366 * @return block locations with fixed offset and length
367 */
368 static BlockLocation[] fixBlockLocations(BlockLocation[] locations,
369 long start,
370 long len,
371 long fileOffsetInHar) {
372 // offset 1 past last byte of desired range
373 long end = start + len;
374
375 for (BlockLocation location : locations) {
376 // offset of part block relative to beginning of desired file
377 // (may be negative if file starts in this part block)
378 long harBlockStart = location.getOffset() - fileOffsetInHar;
379 // offset 1 past last byte of har block relative to beginning of
380 // desired file
381 long harBlockEnd = harBlockStart + location.getLength();
382
383 if (start > harBlockStart) {
384 // desired range starts after beginning of this har block
385 // fix offset to beginning of relevant range (relative to desired file)
386 location.setOffset(start);
387 // fix length to relevant portion of har block
388 location.setLength(location.getLength() - (start - harBlockStart));
389 } else {
390 // desired range includes beginning of this har block
391 location.setOffset(harBlockStart);
392 }
393
394 if (harBlockEnd > end) {
395 // range ends before end of this har block
396 // fix length to remove irrelevant portion at the end
397 location.setLength(location.getLength() - (harBlockEnd - end));
398 }
399 }
400
401 return locations;
402 }
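// Worked numeric example (hypothetical values): assume the archived file
// begins at offset 1000 inside the part file (fileOffsetInHar = 1000) and
// the caller asks for start = 0, len = 600 of that file (so end = 600).
//   part block at offset  900, length 500: harBlockStart = -100, harBlockEnd = 400
//     -> offset fixed to 0, length trimmed to 400
//   part block at offset 1400, length 500: harBlockStart = 400, harBlockEnd = 900
//     -> offset fixed to 400, length trimmed to 200
// Both locations are then expressed relative to the archived file rather
// than to the part file.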
403
404 /**
405 * Get block locations from the underlying fs and fix their
406 * offsets and lengths.
407 * @param file the input filestatus to get block locations
408 * @param start the start of the desired range in the contained file
409 * @param len the length of the desired range
410 * @return block locations for this segment of file
411 * @throws IOException
412 */
413 @Override
414 public BlockLocation[] getFileBlockLocations(FileStatus file, long start,
415 long len) throws IOException {
416 HarStatus hstatus = getFileHarStatus(file.getPath());
417 Path partPath = new Path(archivePath, hstatus.getPartName());
418 FileStatus partStatus = metadata.getPartFileStatus(partPath);
419
420 // get all part blocks that overlap with the desired file blocks
421 BlockLocation[] locations =
422 fs.getFileBlockLocations(partStatus,
423 hstatus.getStartIndex() + start, len);
424
425 return fixBlockLocations(locations, start, len, hstatus.getStartIndex());
426 }
427
/**
 * Get the hash of the path p inside
 * the har filesystem.
 * @param p the path in the har filesystem
 * @return the hash code of the path.
 */
434 public static int getHarHash(Path p) {
435 return (p.toString().hashCode() & 0x7fffffff);
436 }
437
438 static class Store {
439 public Store() {
440 begin = end = startHash = endHash = 0;
441 }
442 public Store(long begin, long end, int startHash, int endHash) {
443 this.begin = begin;
444 this.end = end;
445 this.startHash = startHash;
446 this.endHash = endHash;
447 }
448 public long begin;
449 public long end;
450 public int startHash;
451 public int endHash;
452 }
453
/**
 * Get filestatuses of all the children of a given directory. This just
 * reads through the archive index and, line by line, collects the statuses
 * of the direct children of the directory. It's a brute force way of
 * getting all such filestatuses.
 *
 * @param parent
 *          the parent path directory
 * @param statuses
 *          the list to add the children filestatuses to
 * @param children
 *          the string list of children for this parent
 */
468 private void fileStatusesInIndex(HarStatus parent, List<FileStatus> statuses,
469 List<String> children) throws IOException {
470 String parentString = parent.getName();
471 if (!parentString.endsWith(Path.SEPARATOR)){
472 parentString += Path.SEPARATOR;
473 }
474 Path harPath = new Path(parentString);
475 int harlen = harPath.depth();
476 final Map<String, FileStatus> cache = new TreeMap<String, FileStatus>();
477
478 for (HarStatus hstatus : metadata.archive.values()) {
479 String child = hstatus.getName();
480 if ((child.startsWith(parentString))) {
481 Path thisPath = new Path(child);
482 if (thisPath.depth() == harlen + 1) {
483 statuses.add(toFileStatus(hstatus, cache));
484 }
485 }
486 }
487 }
488
489 /**
490 * Combine the status stored in the index and the underlying status.
491 * @param h status stored in the index
492 * @param cache caching the underlying file statuses
493 * @return the combined file status
494 * @throws IOException
495 */
496 private FileStatus toFileStatus(HarStatus h,
497 Map<String, FileStatus> cache) throws IOException {
498 FileStatus underlying = null;
499 if (cache != null) {
500 underlying = cache.get(h.partName);
501 }
502 if (underlying == null) {
503 final Path p = h.isDir? archivePath: new Path(archivePath, h.partName);
504 underlying = fs.getFileStatus(p);
505 if (cache != null) {
506 cache.put(h.partName, underlying);
507 }
508 }
509
510 long modTime = 0;
511 int version = metadata.getVersion();
512 if (version < 3) {
513 modTime = underlying.getModificationTime();
514 } else if (version == 3) {
515 modTime = h.getModificationTime();
516 }
517
518 return new FileStatus(
519 h.isDir()? 0L: h.getLength(),
520 h.isDir(),
521 underlying.getReplication(),
522 underlying.getBlockSize(),
523 modTime,
524 underlying.getAccessTime(),
525 underlying.getPermission(),
526 underlying.getOwner(),
527 underlying.getGroup(),
528 makeRelative(this.uri.getPath(), new Path(h.name)));
529 }
530
// a single line parser for hadoop archive statuses
// stored as one line per entry in the index files
// the format is of the form
// filename "dir"/"file" partFileName startIndex length
// <space separated children>
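// Illustrative (hypothetical) index entries using the version 1/2 layout:
//   /dir1/file1 file part-0 0 1024
//   /dir1 dir none 0 0 file1 file2
// In version 3 a file entry carries an extra, URL-encoded last field (and a
// directory entry reuses the partName slot) holding modification time,
// permission, owner and group, as parsed below.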
536 private class HarStatus {
537 boolean isDir;
538 String name;
539 List<String> children;
540 String partName;
541 long startIndex;
542 long length;
543 long modificationTime = 0;
544
545 public HarStatus(String harString) throws UnsupportedEncodingException {
546 String[] splits = harString.split(" ");
547 this.name = decodeFileName(splits[0]);
this.isDir = "dir".equals(splits[1]);
// this is equal to "none" if it's a directory
550 this.partName = splits[2];
551 this.startIndex = Long.parseLong(splits[3]);
552 this.length = Long.parseLong(splits[4]);
553
554 int version = metadata.getVersion();
555 String[] propSplits = null;
556 // propSplits is used to retrieve the metainformation that Har versions
557 // 1 & 2 missed (modification time, permission, owner group).
558 // These fields are stored in an encoded string placed in different
559 // locations depending on whether it's a file or directory entry.
560 // If it's a directory, the string will be placed at the partName
561 // location (directories have no partName because they don't have data
562 // to be stored). This is done because the number of fields in a
563 // directory entry is unbounded (all children are listed at the end)
564 // If it's a file, the string will be the last field.
565 if (isDir) {
566 if (version == 3){
567 propSplits = decodeString(this.partName).split(" ");
568 }
569 children = new ArrayList<String>();
570 for (int i = 5; i < splits.length; i++) {
571 children.add(decodeFileName(splits[i]));
572 }
573 } else if (version == 3) {
574 propSplits = decodeString(splits[5]).split(" ");
575 }
576
577 if (propSplits != null && propSplits.length >= 4) {
578 modificationTime = Long.parseLong(propSplits[0]);
579 // the fields below are stored in the file but are currently not used
580 // by HarFileSystem
581 // permission = new FsPermission(Short.parseShort(propSplits[1]));
582 // owner = decodeString(propSplits[2]);
583 // group = decodeString(propSplits[3]);
584 }
585 }
586 public boolean isDir() {
587 return isDir;
588 }
589
590 public String getName() {
591 return name;
592 }
593 public String getPartName() {
594 return partName;
595 }
596 public long getStartIndex() {
597 return startIndex;
598 }
599 public long getLength() {
600 return length;
601 }
602 public long getModificationTime() {
603 return modificationTime;
604 }
605 }
606
/**
 * Return the filestatus of a file in the har archive.
 * The permissions returned are those of the archive
 * index files; the original permissions are not persisted
 * while creating a hadoop archive.
 * @param f the path in har filesystem
 * @return filestatus.
 * @throws IOException
 */
616 @Override
617 public FileStatus getFileStatus(Path f) throws IOException {
618 HarStatus hstatus = getFileHarStatus(f);
619 return toFileStatus(hstatus, null);
620 }
621
622 private HarStatus getFileHarStatus(Path f) throws IOException {
623 // get the fs DataInputStream for the underlying file
624 // look up the index.
625 Path p = makeQualified(f);
626 Path harPath = getPathInHar(p);
627 if (harPath == null) {
628 throw new IOException("Invalid file name: " + f + " in " + uri);
629 }
630 HarStatus hstatus = metadata.archive.get(harPath);
631 if (hstatus == null) {
632 throw new FileNotFoundException("File: " + f + " does not exist in " + uri);
633 }
634 return hstatus;
635 }
636
637 /**
638 * @return null since no checksum algorithm is implemented.
639 */
640 @Override
641 public FileChecksum getFileChecksum(Path f) {
642 return null;
643 }
644
645 /**
646 * Returns a har input stream which fakes end of
647 * file. It reads the index files to get the part
648 * file name and the size and start of the file.
649 */
650 @Override
651 public FSDataInputStream open(Path f, int bufferSize) throws IOException {
652 // get the fs DataInputStream for the underlying file
653 HarStatus hstatus = getFileHarStatus(f);
654 // we got it.. woo hooo!!!
655 if (hstatus.isDir()) {
656 throw new FileNotFoundException(f + " : not a file in " +
657 archivePath);
658 }
659 return new HarFSDataInputStream(fs, new Path(archivePath,
660 hstatus.getPartName()),
661 hstatus.getStartIndex(), hstatus.getLength(), bufferSize);
662 }
663
664 @Override
665 public FSDataOutputStream create(Path f,
666 FsPermission permission,
667 boolean overwrite,
668 int bufferSize,
669 short replication,
670 long blockSize,
671 Progressable progress) throws IOException {
672 throw new IOException("Har: create not allowed.");
673 }
674
675 @Override
676 public void close() throws IOException {
677 if (fs != null) {
678 try {
679 fs.close();
680 } catch(IOException ie) {
681 //this might already be closed
682 // ignore
683 }
684 }
685 }
686
687 /**
688 * Not implemented.
689 */
690 @Override
691 public boolean setReplication(Path src, short replication) throws IOException{
692 throw new IOException("Har: setreplication not allowed");
693 }
694
695 /**
696 * Not implemented.
697 */
698 @Override
699 public boolean delete(Path f, boolean recursive) throws IOException {
700 throw new IOException("Har: delete not allowed");
701 }
702
703 /**
704 * liststatus returns the children of a directory
705 * after looking up the index files.
706 */
707 @Override
708 public FileStatus[] listStatus(Path f) throws IOException {
//look the path up in the archive index and,
//for a directory, create fake filestatuses
//for its children to return to the client
713 List<FileStatus> statuses = new ArrayList<FileStatus>();
714 Path tmpPath = makeQualified(f);
715 Path harPath = getPathInHar(tmpPath);
716 HarStatus hstatus = metadata.archive.get(harPath);
717 if (hstatus == null) {
718 throw new FileNotFoundException("File " + f + " not found in " + archivePath);
719 }
720 if (hstatus.isDir()) {
721 fileStatusesInIndex(hstatus, statuses, hstatus.children);
722 } else {
723 statuses.add(toFileStatus(hstatus, null));
724 }
725
726 return statuses.toArray(new FileStatus[statuses.size()]);
727 }
728
729 /**
730 * return the top level archive path.
731 */
732 @Override
733 public Path getHomeDirectory() {
734 return new Path(uri.toString());
735 }
736
737 @Override
738 public void setWorkingDirectory(Path newDir) {
739 //does nothing.
740 }
741
742 /**
743 * not implemented.
744 */
745 @Override
746 public boolean mkdirs(Path f, FsPermission permission) throws IOException {
747 throw new IOException("Har: mkdirs not allowed");
748 }
749
750 /**
751 * not implemented.
752 */
753 @Override
754 public void copyFromLocalFile(boolean delSrc, Path src, Path dst) throws
755 IOException {
756 throw new IOException("Har: copyfromlocalfile not allowed");
757 }
758
759 /**
760 * copies the file in the har filesystem to a local file.
761 */
762 @Override
763 public void copyToLocalFile(boolean delSrc, Path src, Path dst)
764 throws IOException {
765 FileUtil.copy(this, src, getLocal(getConf()), dst, false, getConf());
766 }
767
768 /**
769 * not implemented.
770 */
771 @Override
772 public Path startLocalOutput(Path fsOutputFile, Path tmpLocalFile)
773 throws IOException {
774 throw new IOException("Har: startLocalOutput not allowed");
775 }
776
777 /**
778 * not implemented.
779 */
780 @Override
781 public void completeLocalOutput(Path fsOutputFile, Path tmpLocalFile)
782 throws IOException {
783 throw new IOException("Har: completeLocalOutput not allowed");
784 }
785
786 /**
787 * not implemented.
788 */
789 @Override
790 public void setOwner(Path p, String username, String groupname)
791 throws IOException {
792 throw new IOException("Har: setowner not allowed");
793 }
794
795 /**
796 * Not implemented.
797 */
798 @Override
799 public void setPermission(Path p, FsPermission permisssion)
800 throws IOException {
801 throw new IOException("Har: setPermission not allowed");
802 }
803
804 /**
805 * Hadoop archives input stream. This input stream fakes EOF
806 * since archive files are part of bigger part files.
807 */
808 private static class HarFSDataInputStream extends FSDataInputStream {
809 /**
810 * Create an input stream that fakes all the reads/positions/seeking.
811 */
812 private static class HarFsInputStream extends FSInputStream {
813 private long position, start, end;
814 //The underlying data input stream that the
815 // underlying filesystem will return.
816 private FSDataInputStream underLyingStream;
817 //one byte buffer
818 private byte[] oneBytebuff = new byte[1];
819 HarFsInputStream(FileSystem fs, Path path, long start,
820 long length, int bufferSize) throws IOException {
821 underLyingStream = fs.open(path, bufferSize);
822 underLyingStream.seek(start);
823 // the start of this file in the part file
824 this.start = start;
825 // the position pointer in the part file
826 this.position = start;
827 // the end pointer in the part file
828 this.end = start + length;
829 }
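// Illustrative example (hypothetical values): for an archived file stored
// at offset 1000 with length 500 in the part file, the constructor sets
// start = 1000, position = 1000 and end = 1500. getPos() reports
// position - start, so to the caller the stream looks like a standalone
// 500 byte file even though the bytes live inside a larger part file.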
830
831 @Override
832 public synchronized int available() throws IOException {
833 long remaining = end - underLyingStream.getPos();
834 if (remaining > (long)Integer.MAX_VALUE) {
835 return Integer.MAX_VALUE;
836 }
837 return (int) remaining;
838 }
839
840 @Override
841 public synchronized void close() throws IOException {
842 underLyingStream.close();
843 super.close();
844 }
845
846 //not implemented
847 @Override
848 public void mark(int readLimit) {
849 // do nothing
850 }
851
852 /**
853 * reset is not implemented
854 */
855 @Override
856 public void reset() throws IOException {
857 throw new IOException("reset not implemented.");
858 }
859
860 @Override
861 public synchronized int read() throws IOException {
862 int ret = read(oneBytebuff, 0, 1);
863 return (ret <= 0) ? -1: (oneBytebuff[0] & 0xff);
864 }
865
@Override
public synchronized int read(byte[] b) throws IOException {
// read(byte[], int, int) already advances position, so it must not
// be advanced a second time here
return read(b, 0, b.length);
}
874
/**
 * Read up to len bytes, never reading past the end of the
 * archived file inside the part file.
 */
878 @Override
879 public synchronized int read(byte[] b, int offset, int len)
880 throws IOException {
881 int newlen = len;
882 int ret = -1;
883 if (position + len > end) {
884 newlen = (int) (end - position);
885 }
886 // end case
887 if (newlen == 0)
888 return ret;
ret = underLyingStream.read(b, offset, newlen);
if (ret > 0) {
position += ret;
}
return ret;
892 }
893
894 @Override
895 public synchronized long skip(long n) throws IOException {
896 long tmpN = n;
897 if (tmpN > 0) {
898 if (position + tmpN > end) {
899 tmpN = end - position;
900 }
901 underLyingStream.seek(tmpN + position);
902 position += tmpN;
903 return tmpN;
904 }
905 return (tmpN < 0)? -1 : 0;
906 }
907
908 @Override
909 public synchronized long getPos() throws IOException {
910 return (position - start);
911 }
912
913 @Override
914 public synchronized void seek(long pos) throws IOException {
915 if (pos < 0 || (start + pos > end)) {
916 throw new IOException("Failed to seek: EOF");
917 }
918 position = start + pos;
919 underLyingStream.seek(position);
920 }
921
922 @Override
923 public boolean seekToNewSource(long targetPos) throws IOException {
924 //do not need to implement this
925 // hdfs in itself does seektonewsource
926 // while reading.
927 return false;
928 }
929
930 /**
931 * implementing position readable.
932 */
933 @Override
934 public int read(long pos, byte[] b, int offset, int length)
935 throws IOException {
936 int nlength = length;
937 if (start + nlength + pos > end) {
938 nlength = (int) (end - (start + pos));
939 }
940 return underLyingStream.read(pos + start , b, offset, nlength);
941 }
942
943 /**
944 * position readable again.
945 */
946 @Override
947 public void readFully(long pos, byte[] b, int offset, int length)
948 throws IOException {
949 if (start + length + pos > end) {
950 throw new IOException("Not enough bytes to read.");
951 }
952 underLyingStream.readFully(pos + start, b, offset, length);
953 }
954
955 @Override
956 public void readFully(long pos, byte[] b) throws IOException {
957 readFully(pos, b, 0, b.length);
958 }
959
960 }
961
/**
 * constructor for har input stream.
 * @param fs the underlying filesystem
 * @param p The path in the underlying filesystem
 * @param start the start position in the part file
 * @param length the length of valid data in the part file
 * @param bufsize the buffer size
 * @throws IOException
 */
971 public HarFSDataInputStream(FileSystem fs, Path p, long start,
972 long length, int bufsize) throws IOException {
973 super(new HarFsInputStream(fs, p, start, length, bufsize));
974 }
975
976 /**
977 * constructor for har input stream.
978 * @param fs the underlying filesystem
979 * @param p the path in the underlying file system
980 * @param start the start position in the part file
981 * @param length the length of valid data in the part file.
982 * @throws IOException
983 */
984 public HarFSDataInputStream(FileSystem fs, Path p, long start, long length)
985 throws IOException {
986 super(new HarFsInputStream(fs, p, start, length, 0));
987 }
988 }
989
990 private class HarMetaData {
991 private FileSystem fs;
992 private int version;
993 // the masterIndex of the archive
994 private Path masterIndexPath;
995 // the index file
996 private Path archiveIndexPath;
997
998 private long masterIndexTimestamp;
999 private long archiveIndexTimestamp;
1000
1001 List<Store> stores = new ArrayList<Store>();
1002 Map<Path, HarStatus> archive = new HashMap<Path, HarStatus>();
1003 private Map<Path, FileStatus> partFileStatuses = new HashMap<Path, FileStatus>();
1004
1005 public HarMetaData(FileSystem fs, Path masterIndexPath, Path archiveIndexPath) {
1006 this.fs = fs;
1007 this.masterIndexPath = masterIndexPath;
1008 this.archiveIndexPath = archiveIndexPath;
1009 }
1010
1011 public FileStatus getPartFileStatus(Path partPath) throws IOException {
1012 FileStatus status;
1013 status = partFileStatuses.get(partPath);
1014 if (status == null) {
1015 status = fs.getFileStatus(partPath);
1016 partFileStatuses.put(partPath, status);
1017 }
1018 return status;
1019 }
1020
1021 public long getMasterIndexTimestamp() {
1022 return masterIndexTimestamp;
1023 }
1024
1025 public long getArchiveIndexTimestamp() {
1026 return archiveIndexTimestamp;
1027 }
1028
1029 private int getVersion() {
1030 return version;
1031 }
1032
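// Illustrative sketch (hypothetical contents) of the two index files read
// by parseMetaData() below. The first line of _masterindex holds the
// version; each following line maps a hash code range to a byte range in
// _index ("startHash endHash startOffset endOffset"):
//   3
//   0 2147483647 0 523
// Each line of _index within such a byte range is a HarStatus entry as
// described above, e.g. "/dir1/file1 file part-0 0 1024".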
1033 private void parseMetaData() throws IOException {
1034 Text line;
1035 long read;
1036 FSDataInputStream in = null;
1037 LineReader lin = null;
1038
1039 try {
1040 in = fs.open(masterIndexPath);
1041 FileStatus masterStat = fs.getFileStatus(masterIndexPath);
1042 masterIndexTimestamp = masterStat.getModificationTime();
1043 lin = new LineReader(in, getConf());
1044 line = new Text();
1045 read = lin.readLine(line);
1046
1047 // the first line contains the version of the index file
1048 String versionLine = line.toString();
1049 String[] arr = versionLine.split(" ");
1050 version = Integer.parseInt(arr[0]);
// the format is backwards compatible, so only reject newer versions
1052 if (this.version > HarFileSystem.VERSION) {
1053 throw new IOException("Invalid version " +
1054 this.version + " expected " + HarFileSystem.VERSION);
1055 }
1056
1057 // each line contains a hashcode range and the index file name
1058 String[] readStr = null;
1059 while(read < masterStat.getLen()) {
1060 int b = lin.readLine(line);
1061 read += b;
1062 readStr = line.toString().split(" ");
1063 int startHash = Integer.parseInt(readStr[0]);
1064 int endHash = Integer.parseInt(readStr[1]);
1065 stores.add(new Store(Long.parseLong(readStr[2]),
1066 Long.parseLong(readStr[3]), startHash,
1067 endHash));
1068 line.clear();
1069 }
1070 } finally {
1071 IOUtils.cleanup(LOG, lin, in);
1072 }
1073
1074 FSDataInputStream aIn = fs.open(archiveIndexPath);
1075 try {
1076 FileStatus archiveStat = fs.getFileStatus(archiveIndexPath);
1077 archiveIndexTimestamp = archiveStat.getModificationTime();
1078 LineReader aLin;
1079
1080 // now start reading the real index file
1081 for (Store s: stores) {
1082 read = 0;
1083 aIn.seek(s.begin);
1084 aLin = new LineReader(aIn, getConf());
1085 while (read + s.begin < s.end) {
1086 int tmp = aLin.readLine(line);
1087 read += tmp;
1088 String lineFeed = line.toString();
1089 String[] parsed = lineFeed.split(" ");
1090 parsed[0] = decodeFileName(parsed[0]);
1091 archive.put(new Path(parsed[0]), new HarStatus(lineFeed));
1092 line.clear();
1093 }
1094 }
1095 } finally {
1096 IOUtils.cleanup(LOG, aIn);
1097 }
1098 }
1099 }
1100
1101 /*
1102 * testing purposes only:
1103 */
1104 HarMetaData getMetadata() {
1105 return metadata;
1106 }
1107 }