/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.commons.compress.compressors.snappy;

import java.io.IOException;
import java.io.InputStream;

import org.apache.commons.compress.compressors.lz77support.AbstractLZ77CompressorInputStream;
import org.apache.commons.compress.utils.ByteUtils;

/**
 * CompressorInputStream for the raw Snappy format.
 *
 * <p>This implementation uses an internal buffer in order to handle
 * the back-references that are at the heart of the LZ77 algorithm.
 * The size of the buffer must be at least as big as the biggest
 * offset used in the compressed stream.  The current version of the
 * Snappy algorithm as defined by Google works on 32k blocks and
 * doesn't contain offsets bigger than 32k which is the default block
 * size used by this class.</p>
 *
 * @see <a href="https://github.com/google/snappy/blob/master/format_description.txt">Snappy compressed format description</a>
 * @since 1.7
 */
public class SnappyCompressorInputStream extends AbstractLZ77CompressorInputStream {

    /** Mask used to determine the type of "tag" being processed */
    private static final int TAG_MASK = 0x03;

    /** Default block size */
    public static final int DEFAULT_BLOCK_SIZE = 32768;

    /**
     * Maximum number of bytes the varint encoded size header may
     * occupy: five groups of seven bits are enough for 32 bits.
     */
    private static final int MAX_SIZE_BYTES = 5;

    /** Largest uncompressed size the format allows (2^32 - 1). */
    private static final long MAX_UNCOMPRESSED_SIZE = 0xFFFFFFFFL;

    /** The size of the uncompressed data (low 32 bits of the size header) */
    private final int size;

    /** Number of uncompressed bytes still to be read. */
    private int uncompressedBytesRemaining;

    /** Current state of the stream */
    private State state = State.NO_BLOCK;

    /** Set by {@link #fill} once all announced bytes have been accounted for. */
    private boolean endReached = false;

    /**
     * Constructor using the default buffer size of 32k.
     *
     * @param is
     *            An InputStream to read compressed data from
     *
     * @throws IOException if reading fails
     */
    public SnappyCompressorInputStream(final InputStream is) throws IOException {
        this(is, DEFAULT_BLOCK_SIZE);
    }

    /**
     * Constructor using a configurable buffer size.
     *
     * <p>The size header is consumed from the stream right away.</p>
     *
     * @param is
     *            An InputStream to read compressed data from
     * @param blockSize
     *            The block size used in compression
     *
     * @throws IOException if reading fails or the size header is malformed
     * @throws IllegalArgumentException if blockSize is not bigger than 0
     */
    public SnappyCompressorInputStream(final InputStream is, final int blockSize)
        throws IOException {
        super(is, blockSize);
        // The narrowing cast keeps the low 32 bits of the header value.
        // The counter is only ever compared against zero and int
        // arithmetic wraps modulo 2^32, so it still reaches zero after
        // exactly the announced number of bytes.
        uncompressedBytesRemaining = size = (int) readSize();
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public int read(final byte[] b, final int off, final int len) throws IOException {
        if (len == 0) {
            return 0;
        }
        // Iterate instead of calling read() recursively: an element that
        // yields no bytes (readLiteralLength can produce a length of 0
        // when a four byte length wraps around) simply moves on to the
        // next element, so the stack depth no longer depends on the
        // contents of the stream.
        while (!endReached) {
            switch (state) {
            case NO_BLOCK:
                // either starts the next element or sets endReached
                fill();
                break;
            case IN_LITERAL: {
                final int literalBytes = readLiteral(b, off, len);
                if (!hasMoreDataInBlock()) {
                    state = State.NO_BLOCK;
                }
                if (literalBytes > 0) {
                    return literalBytes;
                }
                break;
            }
            case IN_BACK_REFERENCE: {
                final int copiedBytes = readBackReference(b, off, len);
                if (!hasMoreDataInBlock()) {
                    state = State.NO_BLOCK;
                }
                if (copiedBytes > 0) {
                    return copiedBytes;
                }
                break;
            }
            default:
                throw new IOException("Unknown stream state " + state);
            }
        }
        return -1;
    }

    /**
     * Try to fill the buffer with the next block of data.
     *
     * <p>Reads the tag byte of the next element together with its
     * length/offset bytes, subtracts the element's length from the
     * number of bytes still expected and switches {@link #state}
     * accordingly.  Marks the end of the stream instead when no more
     * uncompressed bytes are expected.</p>
     *
     * @throws IOException if the stream ends prematurely or an element
     *         is malformed
     */
    private void fill() throws IOException {
        if (uncompressedBytesRemaining == 0) {
            endReached = true;
            return;
        }

        final int tag = readOneByte();
        if (tag == -1) {
            throw new IOException("Premature end of stream reading block start");
        }

        switch (tag & TAG_MASK) {

        case 0x00: {
            // literal
            final int length = readLiteralLength(tag);
            if (length < 0) {
                throw new IOException("Illegal block with a negative literal size found");
            }
            uncompressedBytesRemaining -= length;
            startLiteral(length);
            state = State.IN_LITERAL;
            break;
        }

        case 0x01: {
            /*
             * These elements can encode lengths between [4..11] bytes and
             * offsets between [0..2047] bytes. (len-4) occupies three bits
             * and is stored in bits [2..4] of the tag byte. The offset
             * occupies 11 bits, of which the upper three are stored in the
             * upper three bits ([5..7]) of the tag byte, and the lower
             * eight are stored in a byte following the tag byte.
             */
            // always within [4..11], no sign check required
            final int length = 4 + ((tag >> 2) & 0x07);
            uncompressedBytesRemaining -= length;

            final int offsetLowBits = readOneByte();
            if (offsetLowBits == -1) {
                throw new IOException("Premature end of stream reading back-reference length");
            }
            beginBackReference(((tag & 0xE0) << 3) | offsetLowBits, length);
            break;
        }

        case 0x02: {
            /*
             * These elements can encode lengths between [1..64] and offsets
             * from [0..65535]. (len-1) occupies six bits and is stored in
             * the upper six bits ([2..7]) of the tag byte. The offset is
             * stored as a little-endian 16-bit integer in the two bytes
             * following the tag byte.
             */
            final int length = (tag >> 2) + 1;
            if (length < 0) {
                throw new IOException("Illegal block with a negative match length found");
            }
            uncompressedBytesRemaining -= length;

            beginBackReference((int) ByteUtils.fromLittleEndian(supplier, 2), length);
            break;
        }

        case 0x03: {
            /*
             * These are like the copies with 2-byte offsets (see previous
             * subsection), except that the offset is stored as a 32-bit
             * integer instead of a 16-bit integer (and thus will occupy
             * four bytes).
             */
            final int length = (tag >> 2) + 1;
            if (length < 0) {
                throw new IOException("Illegal block with a negative match length found");
            }
            uncompressedBytesRemaining -= length;

            // the mask clears the sign bit so the offset is never negative
            beginBackReference((int) ByteUtils.fromLittleEndian(supplier, 4) & 0x7fffffff, length);
            break;
        }

        default:
            // impossible as TAG_MASK is two bits and all four possible cases have been covered
            break;
        }
    }

    /**
     * Starts a back-reference and switches the stream state, turning
     * the IllegalArgumentException thrown for an unusable offset or
     * length into an IOException.
     *
     * @param offset distance to look back
     * @param length number of bytes to copy
     * @throws IOException if the back-reference is rejected
     */
    private void beginBackReference(final int offset, final int length) throws IOException {
        try {
            startBackReference(offset, length);
        } catch (final IllegalArgumentException ex) {
            throw new IOException("Illegal block with bad offset found", ex);
        }
        state = State.IN_BACK_REFERENCE;
    }

    /*
     * For literals up to and including 60 bytes in length, the
     * upper six bits of the tag byte contain (len-1). The literal
     * follows immediately thereafter in the bytestream.
     *
     * For longer literals, the (len-1) value is stored after the tag
     * byte, little-endian. The upper six bits of the tag byte
     * describe how many bytes are used for the length; 60, 61, 62
     * or 63 for 1-4 bytes, respectively. The literal itself follows
     * after the length.
     *
     * The result is computed with int arithmetic, so four byte
     * lengths that do not fit come back negative or zero.
     */
    private int readLiteralLength(final int b) throws IOException {
        final int lengthCode = b >> 2;
        int lengthMinusOne;
        switch (lengthCode) {
        case 60:
            lengthMinusOne = readOneByte();
            if (lengthMinusOne == -1) {
                throw new IOException("Premature end of stream reading literal length");
            }
            break;
        case 61:
            lengthMinusOne = (int) ByteUtils.fromLittleEndian(supplier, 2);
            break;
        case 62:
            lengthMinusOne = (int) ByteUtils.fromLittleEndian(supplier, 3);
            break;
        case 63:
            lengthMinusOne = (int) ByteUtils.fromLittleEndian(supplier, 4);
            break;
        default:
            // short literal, the tag itself carries (len-1)
            lengthMinusOne = lengthCode;
            break;
        }

        return lengthMinusOne + 1;
    }

    /**
     * The stream starts with the uncompressed length (up to a maximum of 2^32 -
     * 1), stored as a little-endian varint. Varints consist of a series of
     * bytes, where the lower 7 bits are data and the upper bit is set iff there
     * are more bytes to be read. In other words, an uncompressed length of 64
     * would be stored as 0x40, and an uncompressed length of 2097150 (0x1FFFFE)
     * would be stored as 0xFE 0xFF 0x7F.
     *
     * @return The size of the uncompressed data, between 0 and 2^32 - 1
     *
     * @throws IOException
     *             Could not read a byte, or the header is longer than five
     *             bytes or encodes a value bigger than 2^32 - 1
     */
    private long readSize() throws IOException {
        long sz = 0;
        int bytesSeen = 0;
        int b;

        do {
            if (bytesSeen == MAX_SIZE_BYTES) {
                throw new IOException("Illegal size: varint is longer than "
                    + MAX_SIZE_BYTES + " bytes");
            }
            b = readOneByte();
            if (b == -1) {
                throw new IOException("Premature end of stream reading size");
            }
            // widen before shifting: an int shift would drop the bits
            // above 31 and sign-extend bit 31 into the accumulator
            sz |= (long) (b & 0x7f) << (bytesSeen * 7);
            bytesSeen++;
        } while (0 != (b & 0x80));

        if (sz > MAX_UNCOMPRESSED_SIZE) {
            throw new IOException("Illegal size: value exceeds 2^32 - 1");
        }
        return sz;
    }

    /**
     * Get the uncompressed size of the stream
     *
     * <p>The value is the size header narrowed to an {@code int}, so
     * sizes bigger than {@code Integer.MAX_VALUE} are reported as
     * negative numbers.</p>
     *
     * @return the uncompressed size
     */
    @Override
    public int getSize() {
        return size;
    }

    /** What kind of element, if any, is currently being decoded. */
    private enum State {
        NO_BLOCK, IN_LITERAL, IN_BACK_REFERENCE
    }
}