001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018
019package org.apache.hadoop.util;
020
021import java.io.DataInputStream;
022import java.io.DataOutputStream;
023import java.io.IOException;
024import java.nio.ByteBuffer;
025import java.util.zip.CRC32;
026import java.util.zip.Checksum;
027
028import org.apache.hadoop.classification.InterfaceAudience;
029import org.apache.hadoop.classification.InterfaceStability;
030import org.apache.hadoop.fs.ChecksumException;
031
032/**
033 * This class provides interface and utilities for processing checksums for
034 * DFS data transfers.
035 */
036@InterfaceAudience.LimitedPrivate({"HDFS", "MapReduce"})
037@InterfaceStability.Evolving
038public class DataChecksum implements Checksum {
039  
040  // checksum types
041  public static final int CHECKSUM_NULL    = 0;
042  public static final int CHECKSUM_CRC32   = 1;
043  public static final int CHECKSUM_CRC32C  = 2;
044  public static final int CHECKSUM_DEFAULT = 3; 
045  public static final int CHECKSUM_MIXED   = 4;
046 
047  /** The checksum types */
048  public static enum Type {
049    NULL  (CHECKSUM_NULL, 0),
050    CRC32 (CHECKSUM_CRC32, 4),
051    CRC32C(CHECKSUM_CRC32C, 4),
052    DEFAULT(CHECKSUM_DEFAULT, 0), // This cannot be used to create DataChecksum
053    MIXED (CHECKSUM_MIXED, 0); // This cannot be used to create DataChecksum
054
055    public final int id;
056    public final int size;
057    
058    private Type(int id, int size) {
059      this.id = id;
060      this.size = size;
061    }
062
063    /** @return the type corresponding to the id. */
064    public static Type valueOf(int id) {
065      if (id < 0 || id >= values().length) {
066        throw new IllegalArgumentException("id=" + id
067            + " out of range [0, " + values().length + ")");
068      }
069      return values()[id];
070    }
071  }
072
073  /**
074   * Create a Crc32 Checksum object. The implementation of the Crc32 algorithm
075   * is chosen depending on the platform.
076   */
077  public static Checksum newCrc32() {
078    return new CRC32();
079  }
080
081  public static DataChecksum newDataChecksum(Type type, int bytesPerChecksum ) {
082    if ( bytesPerChecksum <= 0 ) {
083      return null;
084    }
085    
086    switch ( type ) {
087    case NULL :
088      return new DataChecksum(type, new ChecksumNull(), bytesPerChecksum );
089    case CRC32 :
090      return new DataChecksum(type, newCrc32(), bytesPerChecksum );
091    case CRC32C:
092      return new DataChecksum(type, new PureJavaCrc32C(), bytesPerChecksum);
093    default:
094      return null;  
095    }
096  }
097  
098  /**
099   * Creates a DataChecksum from HEADER_LEN bytes from arr[offset].
100   * @return DataChecksum of the type in the array or null in case of an error.
101   */
102  public static DataChecksum newDataChecksum( byte bytes[], int offset ) {
103    if (offset < 0 || bytes.length < offset + getChecksumHeaderSize()) {
104      return null;
105    }
106    
107    // like readInt():
108    int bytesPerChecksum = ( (bytes[offset+1] & 0xff) << 24 ) | 
109                           ( (bytes[offset+2] & 0xff) << 16 ) |
110                           ( (bytes[offset+3] & 0xff) << 8 )  |
111                           ( (bytes[offset+4] & 0xff) );
112    return newDataChecksum( Type.valueOf(bytes[offset]), bytesPerChecksum );
113  }
114  
115  /**
116   * This constructs a DataChecksum by reading HEADER_LEN bytes from input
117   * stream <i>in</i>
118   */
119  public static DataChecksum newDataChecksum( DataInputStream in )
120                                 throws IOException {
121    int type = in.readByte();
122    int bpc = in.readInt();
123    DataChecksum summer = newDataChecksum(Type.valueOf(type), bpc );
124    if ( summer == null ) {
125      throw new InvalidChecksumSizeException("Could not create DataChecksum "
126          + "of type " + type + " with bytesPerChecksum " + bpc);
127    }
128    return summer;
129  }
130  
131  /**
132   * Writes the checksum header to the output stream <i>out</i>.
133   */
134  public void writeHeader( DataOutputStream out ) 
135                           throws IOException { 
136    out.writeByte( type.id );
137    out.writeInt( bytesPerChecksum );
138  }
139
140  public byte[] getHeader() {
141    byte[] header = new byte[getChecksumHeaderSize()];
142    header[0] = (byte) (type.id & 0xff);
143    // Writing in buffer just like DataOutput.WriteInt()
144    header[1+0] = (byte) ((bytesPerChecksum >>> 24) & 0xff);
145    header[1+1] = (byte) ((bytesPerChecksum >>> 16) & 0xff);
146    header[1+2] = (byte) ((bytesPerChecksum >>> 8) & 0xff);
147    header[1+3] = (byte) (bytesPerChecksum & 0xff);
148    return header;
149  }
150  
151  /**
152   * Writes the current checksum to the stream.
153   * If <i>reset</i> is true, then resets the checksum.
154   * @return number of bytes written. Will be equal to getChecksumSize();
155   */
156   public int writeValue( DataOutputStream out, boolean reset )
157                          throws IOException {
158     if ( type.size <= 0 ) {
159       return 0;
160     }
161
162     if ( type.size == 4 ) {
163       out.writeInt( (int) summer.getValue() );
164     } else {
165       throw new IOException( "Unknown Checksum " + type );
166     }
167     
168     if ( reset ) {
169       reset();
170     }
171     
172     return type.size;
173   }
174   
175   /**
176    * Writes the current checksum to a buffer.
177    * If <i>reset</i> is true, then resets the checksum.
178    * @return number of bytes written. Will be equal to getChecksumSize();
179    */
180    public int writeValue( byte[] buf, int offset, boolean reset )
181                           throws IOException {
182      if ( type.size <= 0 ) {
183        return 0;
184      }
185
186      if ( type.size == 4 ) {
187        int checksum = (int) summer.getValue();
188        buf[offset+0] = (byte) ((checksum >>> 24) & 0xff);
189        buf[offset+1] = (byte) ((checksum >>> 16) & 0xff);
190        buf[offset+2] = (byte) ((checksum >>> 8) & 0xff);
191        buf[offset+3] = (byte) (checksum & 0xff);
192      } else {
193        throw new IOException( "Unknown Checksum " + type );
194      }
195      
196      if ( reset ) {
197        reset();
198      }
199      
200      return type.size;
201    }
202   
203   /**
204    * Compares the checksum located at buf[offset] with the current checksum.
205    * @return true if the checksum matches and false otherwise.
206    */
207   public boolean compare( byte buf[], int offset ) {
208     if ( type.size == 4 ) {
209       int checksum = ( (buf[offset+0] & 0xff) << 24 ) | 
210                      ( (buf[offset+1] & 0xff) << 16 ) |
211                      ( (buf[offset+2] & 0xff) << 8 )  |
212                      ( (buf[offset+3] & 0xff) );
213       return checksum == (int) summer.getValue();
214     }
215     return type.size == 0;
216   }
217   
218  private final Type type;
219  private final Checksum summer;
220  private final int bytesPerChecksum;
221  private int inSum = 0;
222  
223  private DataChecksum( Type type, Checksum checksum, int chunkSize ) {
224    this.type = type;
225    summer = checksum;
226    bytesPerChecksum = chunkSize;
227  }
228  
229  /** @return the checksum algorithm type. */
230  public Type getChecksumType() {
231    return type;
232  }
233  /** @return the size for a checksum. */
234  public int getChecksumSize() {
235    return type.size;
236  }
237  /** @return the required checksum size given the data length. */
238  public int getChecksumSize(int dataSize) {
239    return ((dataSize - 1)/getBytesPerChecksum() + 1) * getChecksumSize(); 
240  }
241  public int getBytesPerChecksum() {
242    return bytesPerChecksum;
243  }
244  public int getNumBytesInSum() {
245    return inSum;
246  }
247  
248  public static final int SIZE_OF_INTEGER = Integer.SIZE / Byte.SIZE;
249  static public int getChecksumHeaderSize() {
250    return 1 + SIZE_OF_INTEGER; // type byte, bytesPerChecksum int
251  }
252  //Checksum Interface. Just a wrapper around member summer.
253  @Override
254  public long getValue() {
255    return summer.getValue();
256  }
257  @Override
258  public void reset() {
259    summer.reset();
260    inSum = 0;
261  }
262  @Override
263  public void update( byte[] b, int off, int len ) {
264    if ( len > 0 ) {
265      summer.update( b, off, len );
266      inSum += len;
267    }
268  }
269  @Override
270  public void update( int b ) {
271    summer.update( b );
272    inSum += 1;
273  }
274  
275  /**
276   * Verify that the given checksums match the given data.
277   * 
278   * The 'mark' of the ByteBuffer parameters may be modified by this function,.
279   * but the position is maintained.
280   *  
281   * @param data the DirectByteBuffer pointing to the data to verify.
282   * @param checksums the DirectByteBuffer pointing to a series of stored
283   *                  checksums
284   * @param fileName the name of the file being read, for error-reporting
285   * @param basePos the file position to which the start of 'data' corresponds
286   * @throws ChecksumException if the checksums do not match
287   */
288  public void verifyChunkedSums(ByteBuffer data, ByteBuffer checksums,
289      String fileName, long basePos) throws ChecksumException {
290    if (type.size == 0) return;
291    
292    if (data.hasArray() && checksums.hasArray()) {
293      final int dataOffset = data.arrayOffset() + data.position();
294      final int crcsOffset = checksums.arrayOffset() + checksums.position();
295      verifyChunked(type, summer, data.array(), dataOffset, data.remaining(),
296          bytesPerChecksum, checksums.array(), crcsOffset, fileName, basePos);
297      return;
298    }
299    if (NativeCrc32.isAvailable()) {
300      NativeCrc32.verifyChunkedSums(bytesPerChecksum, type.id, checksums, data,
301          fileName, basePos);
302    } else {
303      verifyChunked(type, summer, data, bytesPerChecksum, checksums, fileName,
304          basePos);
305    }
306  }
307
308  static void verifyChunked(final Type type, final Checksum algorithm,
309      final ByteBuffer data, final int bytesPerCrc, final ByteBuffer crcs,
310      final String filename, final long basePos) throws ChecksumException {
311    final byte[] bytes = new byte[bytesPerCrc];
312    final int dataOffset = data.position();
313    final int dataLength = data.remaining();
314    data.mark();
315    crcs.mark();
316
317    try {
318      int i = 0;
319      for(final int n = dataLength - bytesPerCrc + 1; i < n; i += bytesPerCrc) {
320        data.get(bytes);
321        algorithm.reset();
322        algorithm.update(bytes, 0, bytesPerCrc);
323        final int computed = (int)algorithm.getValue();
324        final int expected = crcs.getInt();
325
326        if (computed != expected) {
327          long errPos = basePos + data.position() - dataOffset - bytesPerCrc;
328          throwChecksumException(type, algorithm, filename, errPos, expected,
329              computed);
330        }
331      }
332
333      final int remainder = dataLength - i;
334      if (remainder > 0) {
335        data.get(bytes, 0, remainder);
336        algorithm.reset();
337        algorithm.update(bytes, 0, remainder);
338        final int computed = (int)algorithm.getValue();
339        final int expected = crcs.getInt();
340
341        if (computed != expected) {
342          long errPos = basePos + data.position() - dataOffset - remainder;
343          throwChecksumException(type, algorithm, filename, errPos, expected,
344              computed);
345        }
346      }
347    } finally {
348      data.reset();
349      crcs.reset();
350    }
351  }
352
353  /**
354   * Implementation of chunked verification specifically on byte arrays. This
355   * is to avoid the copy when dealing with ByteBuffers that have array backing.
356   */
357  static void verifyChunked(final Type type, final Checksum algorithm,
358      final byte[] data, final int dataOffset, final int dataLength,
359      final int bytesPerCrc, final byte[] crcs, final int crcsOffset,
360      final String filename, final long basePos) throws ChecksumException {
361    final int dataEnd = dataOffset + dataLength;
362    int i = dataOffset;
363    int j = crcsOffset;
364    for(final int n = dataEnd-bytesPerCrc+1; i < n; i += bytesPerCrc, j += 4) {
365      algorithm.reset();
366      algorithm.update(data, i, bytesPerCrc);
367      final int computed = (int)algorithm.getValue();
368      final int expected = ((crcs[j] << 24) + ((crcs[j + 1] << 24) >>> 8))
369          + (((crcs[j + 2] << 24) >>> 16) + ((crcs[j + 3] << 24) >>> 24));
370
371      if (computed != expected) {
372        final long errPos = basePos + i - dataOffset;
373        throwChecksumException(type, algorithm, filename, errPos, expected,
374            computed);
375      }
376    }
377    final int remainder = dataEnd - i;
378    if (remainder > 0) {
379      algorithm.reset();
380      algorithm.update(data, i, remainder);
381      final int computed = (int)algorithm.getValue();
382      final int expected = ((crcs[j] << 24) + ((crcs[j + 1] << 24) >>> 8))
383          + (((crcs[j + 2] << 24) >>> 16) + ((crcs[j + 3] << 24) >>> 24));
384
385      if (computed != expected) {
386        final long errPos = basePos + i - dataOffset;
387        throwChecksumException(type, algorithm, filename, errPos, expected,
388            computed);
389      }
390    }
391  }
392
393  private static void throwChecksumException(Type type, Checksum algorithm,
394      String filename, long errPos, int expected, int computed)
395          throws ChecksumException {
396    throw new ChecksumException("Checksum " + type
397        + " not matched for file " + filename + " at position "+ errPos
398        + String.format(": expected=%X but computed=%X", expected, computed)
399        + ", algorithm=" + algorithm.getClass().getSimpleName(), errPos);
400  }
401
402  /**
403   * Calculate checksums for the given data.
404   * 
405   * The 'mark' of the ByteBuffer parameters may be modified by this function,
406   * but the position is maintained.
407   * 
408   * @param data the DirectByteBuffer pointing to the data to checksum.
409   * @param checksums the DirectByteBuffer into which checksums will be
410   *                  stored. Enough space must be available in this
411   *                  buffer to put the checksums.
412   */
413  public void calculateChunkedSums(ByteBuffer data, ByteBuffer checksums) {
414    if (type.size == 0) return;
415    
416    if (data.hasArray() && checksums.hasArray()) {
417      calculateChunkedSums(data.array(), data.arrayOffset() + data.position(), data.remaining(),
418          checksums.array(), checksums.arrayOffset() + checksums.position());
419      return;
420    }
421
422    if (NativeCrc32.isAvailable()) {
423      NativeCrc32.calculateChunkedSums(bytesPerChecksum, type.id,
424          checksums, data);
425      return;
426    }
427    
428    data.mark();
429    checksums.mark();
430    try {
431      byte[] buf = new byte[bytesPerChecksum];
432      while (data.remaining() > 0) {
433        int n = Math.min(data.remaining(), bytesPerChecksum);
434        data.get(buf, 0, n);
435        summer.reset();
436        summer.update(buf, 0, n);
437        checksums.putInt((int)summer.getValue());
438      }
439    } finally {
440      data.reset();
441      checksums.reset();
442    }
443  }
444
445  /**
446   * Implementation of chunked calculation specifically on byte arrays. This
447   * is to avoid the copy when dealing with ByteBuffers that have array backing.
448   */
449  public void calculateChunkedSums(
450      byte[] data, int dataOffset, int dataLength,
451      byte[] sums, int sumsOffset) {
452    if (type.size == 0) return;
453
454    if (NativeCrc32.isAvailable()) {
455      NativeCrc32.calculateChunkedSumsByteArray(bytesPerChecksum, type.id,
456          sums, sumsOffset, data, dataOffset, dataLength);
457      return;
458    }
459
460    int remaining = dataLength;
461    while (remaining > 0) {
462      int n = Math.min(remaining, bytesPerChecksum);
463      summer.reset();
464      summer.update(data, dataOffset, n);
465      dataOffset += n;
466      remaining -= n;
467      long calculated = summer.getValue();
468      sums[sumsOffset++] = (byte) (calculated >> 24);
469      sums[sumsOffset++] = (byte) (calculated >> 16);
470      sums[sumsOffset++] = (byte) (calculated >> 8);
471      sums[sumsOffset++] = (byte) (calculated);
472    }
473  }
474
475  @Override
476  public boolean equals(Object other) {
477    if (!(other instanceof DataChecksum)) {
478      return false;
479    }
480    DataChecksum o = (DataChecksum)other;
481    return o.bytesPerChecksum == this.bytesPerChecksum &&
482      o.type == this.type;
483  }
484  
485  @Override
486  public int hashCode() {
487    return (this.type.id + 31) * this.bytesPerChecksum;
488  }
489  
490  @Override
491  public String toString() {
492    return "DataChecksum(type=" + type +
493      ", chunkSize=" + bytesPerChecksum + ")";
494  }
495  
496  /**
497   * This just provides a dummy implimentation for Checksum class
498   * This is used when there is no checksum available or required for 
499   * data
500   */
501  static class ChecksumNull implements Checksum {
502    
503    public ChecksumNull() {}
504    
505    //Dummy interface
506    @Override
507    public long getValue() { return 0; }
508    @Override
509    public void reset() {}
510    @Override
511    public void update(byte[] b, int off, int len) {}
512    @Override
513    public void update(int b) {}
514  };
515}