From ae57dcb637d4ec6c94c6c1a2ad3cc85afcd793b0 Mon Sep 17 00:00:00 2001 From: Vincent Zhong Date: Sun, 22 Dec 2024 22:38:21 -0500 Subject: [PATCH] test: Add ParquetDenseVectorCollection tests using parquet-floor We created a test class for ParquetDenseVectorCollection that extends DocumentCollectionTest. Instead of creating new test files, we utilized an existing Parquet test file containing BGE embeddings. Replaces Hadoop dependencies with parquet-floor for: reduced dependency footprint, simplified Parquet file handling, removal of complex Hadoop configuration. Tests verify: basic Parquet file reading functionality, document iteration and content validation, integration with existing BGE embedding test data. --- pom.xml | 176 +----------------- .../ParquetDenseVectorCollectionTest.java | 68 +++++++ 2 files changed, 77 insertions(+), 167 deletions(-) create mode 100644 src/test/java/io/anserini/collection/ParquetDenseVectorCollectionTest.java diff --git a/pom.xml b/pom.xml index 54f1cb44d3..4802d9d694 100644 --- a/pom.xml +++ b/pom.xml @@ -41,6 +41,10 @@ + + jitpack.io + https://jitpack.io + public https://repo1.maven.org/maven2 @@ -50,8 +54,8 @@ https://raw.githubusercontent.com/lintool/AnseriniMaven/master/mvn-repo/ - jitpack.io - https://jitpack.io + strategicblue + https://raw.githubusercontent.com/strategicblue/parquet-floor/mvn-repo/ @@ -547,172 +551,10 @@ - - org.apache.parquet - parquet-hadoop - 1.12.3 - - - org.apache.hadoop - hadoop-mapreduce-client-core - - - org.apache.hadoop - hadoop-yarn-common - - - org.apache.hadoop - hadoop-hdfs - - - - - org.apache.parquet - parquet-common - 1.12.3 - - - org.apache.parquet - parquet-column - 1.12.3 - - - org.apache.parquet - parquet-format - 2.9.0 - - - org.apache.parquet - parquet-encoding - 1.12.3 - - - - - org.apache.hadoop - hadoop-common - 3.4.0 - - - org.apache.hadoop - hadoop-yarn-common - - - org.apache.hadoop - hadoop-yarn-server-common - - - org.apache.hadoop - hadoop-mapreduce-client-core - - - 
org.apache.hadoop - hadoop-annotations - - - org.apache.httpcomponents - httpclient - - - org.apache.httpcomponents - httpcore - - - org.eclipse.jetty - jetty-server - - - org.eclipse.jetty - jetty-servlet - - - org.eclipse.jetty - jetty-webapp - - - org.eclipse.jetty - jetty-util - - - javax.servlet - javax.servlet-api - - - commons-beanutils - commons-beanutils - - - log4j - log4j - - - commons-logging - commons-logging - - - com.google.protobuf - protobuf-java - - - org.apache.avro - avro - - - org.apache.hadoop - hadoop-auth - - - org.apache.curator - curator-client - - - org.apache.curator - curator-recipes - - - org.apache.kerby - kerb-core - - - org.apache.kerby - kerb-simplekdc - - - org.apache.zookeeper - zookeeper - - - org.apache.zookeeper - zookeeper-jute - - - org.apache.kerby - kerby-pkix - - - org.apache.kerby - kerb-admin - - - org.apache.hadoop - hadoop-client-api - - - - org.apache.hadoop - hadoop-client-api - 3.3.1 - - - org.apache.hadoop - hadoop-hdfs - - - org.apache.hadoop - hadoop-common - - + blue.strategic.parquet + parquet-floor + 1.36 diff --git a/src/test/java/io/anserini/collection/ParquetDenseVectorCollectionTest.java b/src/test/java/io/anserini/collection/ParquetDenseVectorCollectionTest.java new file mode 100644 index 0000000000..d200063248 --- /dev/null +++ b/src/test/java/io/anserini/collection/ParquetDenseVectorCollectionTest.java @@ -0,0 +1,68 @@ +/* + * Tests for the ParquetDenseVectorCollection class which handles dense vector embeddings stored in Parquet format. + * This test suite verifies the collection's ability to read and process Parquet files containing vector embeddings + * using the parquet-floor library instead of Hadoop dependencies. 
+ */
+package io.anserini.collection;
+
+import org.junit.Before;
+import org.junit.Test;
+
+import java.io.IOException;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+public class ParquetDenseVectorCollectionTest extends DocumentCollectionTest {
+  @Before
+  public void setUp() throws Exception {
+    super.setUp();
+  }
+
+  /*
+   * Verifies that a document's properties match the values in the expected map.
+   * This implementation makes three checks:
+   * 1. The document reports itself as indexable
+   * 2. The document ID equals the "id" entry of the expected map
+   * 3. The document contents contain the "vector" entry of the expected map as a substring
+   */
+  @Override
+  void checkDocument(SourceDocument doc, Map<String, String> expected) {
+    assertTrue(doc.indexable());
+    assertEquals(expected.get("id"), doc.id());
+    assertTrue(doc.contents().contains(expected.get("vector")));
+  }
+
+  /*
+   * Tests the collection's ability to read and iterate over documents in a Parquet file.
+   * Uses a pre-existing test file containing BGE embeddings from MS MARCO passages.
+   * Verifies that:
+   * 1. Documents can be read from the Parquet file
+   * 2. Each document ID is non-null and non-empty
+   * 3. The collection yields at least one document (no exact document count is asserted)
+   */
+  @Test
+  public void testSegment() throws IOException {
+    Path path = Paths.get("src/test/resources/sample_docs/parquet/msmarco-passage-bge-base-en-v1.5.parquet");
+    ParquetDenseVectorCollection collection = new ParquetDenseVectorCollection(path);
+
+    AtomicInteger cnt = new AtomicInteger();
+    Map<String, Integer> docIds = new HashMap<>();
+
+    for (FileSegment<ParquetDenseVectorCollection.Document> segment : collection) {
+      for (ParquetDenseVectorCollection.Document doc : segment) {
+        docIds.put(doc.id(), cnt.incrementAndGet());
+      }
+    }
+
+    assertTrue("Collection should contain documents", !docIds.isEmpty());
+    for (String docId : docIds.keySet()) {
+      assertTrue("Document ID should not be empty", docId != null && !docId.isEmpty());
+    }
+  }
+}
\ No newline at end of file