Skip to content

Commit

Permalink
Verify deletion vector checksum in Delta Lake
Browse files Browse the repository at this point in the history
  • Loading branch information
ebyhr committed May 27, 2024
1 parent 4c56f04 commit 483e0b2
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.UUID;
import java.util.zip.CRC32;
import java.util.zip.Checksum;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkState;
Expand Down Expand Up @@ -87,10 +89,23 @@ public static byte[] readDeletionVector(TrinoInputFile inputFile, int offset, in
throw new TrinoException(DELTA_LAKE_INVALID_SCHEMA, "The size of deletion vector %s expects %s but got %s".formatted(inputFile.location(), expectedSize, actualSize));
}
inputStream.readFully(bytes);
int checksum = inputStream.readInt();
if (calculateChecksum(bytes) != checksum) {
throw new TrinoException(DELTA_LAKE_INVALID_SCHEMA, "Checksum mismatch for deletion vector: " + inputFile.location());
}
}
return bytes;
}

private static int calculateChecksum(byte[] data)
{
// Delta Lake allows integer overflow intentionally because it's fine from checksum perspective
// https://github.com/delta-io/delta/blob/039a29abb4abc72ac5912651679233dc983398d6/spark/src/main/scala/org/apache/spark/sql/delta/storage/dv/DeletionVectorStore.scala#L115
Checksum crc = new CRC32();
crc.update(data);
return (int) crc.getValue();
}

private static Roaring64NavigableMap deserializeDeletionVectors(byte[] bytes)
throws IOException
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,7 @@ public void testCacheDeletionVectorsFileOperations()
.add(new CacheOperation("Alluxio.writeCache", "data", 0, 796))
.add(new CacheOperation("Alluxio.readCached", "deletion_vector", 5, 34))
.add(new CacheOperation("Alluxio.readCached", "deletion_vector", 1, 4))
.add(new CacheOperation("Alluxio.readCached", "deletion_vector", 39, 4))
.add(new CacheOperation("Alluxio.readExternal", "deletion_vector", 38, 1))
.add(new CacheOperation("Alluxio.readExternal", "deletion_vector", 1, 4))
.addCopies(new CacheOperation("Alluxio.writeCache", "deletion_vector", 0, 43), 2)
Expand All @@ -217,6 +218,7 @@ public void testCacheDeletionVectorsFileOperations()
.add(new CacheOperation("Alluxio.readCached", "data", 0, 796))
.add(new CacheOperation("Alluxio.readCached", "deletion_vector", 5, 34))
.add(new CacheOperation("Alluxio.readCached", "deletion_vector", 1, 4))
.add(new CacheOperation("Alluxio.readCached", "deletion_vector", 39, 4))
.build());
}

Expand Down

0 comments on commit 483e0b2

Please sign in to comment.