test: Add ParquetDenseVectorCollection tests using parquet-floor

We created a test class for ParquetDenseVectorCollection that extends DocumentCollectionTest. Instead of creating new test files, we utilized an existing Parquet test file containing BGE embeddings. Replaces Hadoop dependencies with parquet-floor for: reduced dependency footprint, simplified Parquet file handling, removal of complex Hadoop configuration. Tests verify: basic Parquet file reading functionality, document iteration and content validation, integration with existing BGE embedding test data.
castorini · Dec 23, 2024 · ae57dcb · ae57dcb
1 parent 6a9cacf
commit ae57dcb
Show file tree

Hide file tree

Showing 2 changed files with 77 additions and 167 deletions.
diff --git a/pom.xml b/pom.xml
@@ -41,6 +41,10 @@
   </scm>
 
   <repositories>
+    <repository>
+      <id>jitpack.io</id>
+      <url>https://jitpack.io</url>
+    </repository>
     <repository>
       <id>public</id>
       <url>https://repo1.maven.org/maven2</url>
@@ -50,8 +54,8 @@
       <url>https://raw.githubusercontent.com/lintool/AnseriniMaven/master/mvn-repo/</url>
     </repository>
     <repository>
-      <id>jitpack.io</id>
-      <url>https://jitpack.io</url>
+      <id>strategicblue</id>
+      <url>https://raw.githubusercontent.com/strategicblue/parquet-floor/mvn-repo/</url>
     </repository>
   </repositories>
 
@@ -547,172 +551,10 @@
       </exclusions>
       </dependency>
     <!-- Apache Parquet Dependencies -->
-   <dependency>
-      <groupId>org.apache.parquet</groupId>
-      <artifactId>parquet-hadoop</artifactId>
-      <version>1.12.3</version>
-      <exclusions>
-        <exclusion>
-          <groupId>org.apache.hadoop</groupId>
-          <artifactId>hadoop-mapreduce-client-core</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>org.apache.hadoop</groupId>
-          <artifactId>hadoop-yarn-common</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>org.apache.hadoop</groupId>
-          <artifactId>hadoop-hdfs</artifactId>
-        </exclusion>
-      </exclusions>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.parquet</groupId>
-      <artifactId>parquet-common</artifactId>
-      <version>1.12.3</version>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.parquet</groupId>
-      <artifactId>parquet-column</artifactId>
-      <version>1.12.3</version>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.parquet</groupId>
-      <artifactId>parquet-format</artifactId>
-      <version>2.9.0</version>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.parquet</groupId>
-      <artifactId>parquet-encoding</artifactId>
-      <version>1.12.3</version>
-    </dependency>
-
-    <!-- Necessary Hadoop Modules for Parquet -->
-    <dependency>
-      <groupId>org.apache.hadoop</groupId>
-      <artifactId>hadoop-common</artifactId>
-      <version>3.4.0</version>
-      <exclusions>
-        <exclusion>
-          <groupId>org.apache.hadoop</groupId>
-          <artifactId>hadoop-yarn-common</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>org.apache.hadoop</groupId>
-          <artifactId>hadoop-yarn-server-common</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>org.apache.hadoop</groupId>
-          <artifactId>hadoop-mapreduce-client-core</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>org.apache.hadoop</groupId>
-          <artifactId>hadoop-annotations</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>org.apache.httpcomponents</groupId>
-          <artifactId>httpclient</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>org.apache.httpcomponents</groupId>
-          <artifactId>httpcore</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>org.eclipse.jetty</groupId>
-          <artifactId>jetty-server</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>org.eclipse.jetty</groupId>
-          <artifactId>jetty-servlet</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>org.eclipse.jetty</groupId>
-          <artifactId>jetty-webapp</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>org.eclipse.jetty</groupId>
-          <artifactId>jetty-util</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>javax.servlet</groupId>
-          <artifactId>javax.servlet-api</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>commons-beanutils</groupId>
-          <artifactId>commons-beanutils</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>log4j</groupId>
-          <artifactId>log4j</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>commons-logging</groupId>
-          <artifactId>commons-logging</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>com.google.protobuf</groupId>
-          <artifactId>protobuf-java</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>org.apache.avro</groupId>
-          <artifactId>avro</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>org.apache.hadoop</groupId>
-          <artifactId>hadoop-auth</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>org.apache.curator</groupId>
-          <artifactId>curator-client</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>org.apache.curator</groupId>
-          <artifactId>curator-recipes</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>org.apache.kerby</groupId>
-          <artifactId>kerb-core</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>org.apache.kerby</groupId>
-          <artifactId>kerb-simplekdc</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>org.apache.zookeeper</groupId>
-          <artifactId>zookeeper</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>org.apache.zookeeper</groupId>
-          <artifactId>zookeeper-jute</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>org.apache.kerby</groupId>
-          <artifactId>kerby-pkix</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>org.apache.kerby</groupId>
-          <artifactId>kerb-admin</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>org.apache.hadoop</groupId>
-          <artifactId>hadoop-client-api</artifactId>
-        </exclusion>
-      </exclusions>
-    </dependency>
     <dependency>
-      <groupId>org.apache.hadoop</groupId>
-      <artifactId>hadoop-client-api</artifactId>
-      <version>3.3.1</version>
-      <exclusions>
-        <exclusion>
-          <groupId>org.apache.hadoop</groupId>
-          <artifactId>hadoop-hdfs</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>org.apache.hadoop</groupId>
-          <artifactId>hadoop-common</artifactId>
-        </exclusion>
-      </exclusions>
+      <groupId>blue.strategic.parquet</groupId>
+      <artifactId>parquet-floor</artifactId>
+      <version>1.36</version>
     </dependency>
   </dependencies>
 </project>
diff --git a/src/test/java/io/anserini/collection/ParquetDenseVectorCollectionTest.java b/src/test/java/io/anserini/collection/ParquetDenseVectorCollectionTest.java
@@ -0,0 +1,68 @@
+/*
+ * Tests for the ParquetDenseVectorCollection class which handles dense vector embeddings stored in Parquet format.
+ * This test suite verifies the collection's ability to read and process Parquet files containing vector embeddings
+ * using the parquet-floor library instead of Hadoop dependencies.
+ */
+package io.anserini.collection;
+
+import org.junit.Before;
+import org.junit.Test;
+
+import java.io.IOException;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+public class ParquetDenseVectorCollectionTest extends DocumentCollectionTest<ParquetDenseVectorCollection.Document> {
+  @Before
+  public void setUp() throws Exception {
+    super.setUp();
+  }
+
+  /*
+   * Verifies that a document's properties match expected values.
+   * This implementation focuses on three key aspects:
+   * 1. Document can be indexed
+   * 2. Document ID matches expected value
+   * 3. Document's vector content is present
+   */
+  @Override
+  void checkDocument(SourceDocument doc, Map<String, String> expected) {
+    assertTrue(doc.indexable());
+    assertEquals(expected.get("id"), doc.id());
+    assertTrue(doc.contents().contains(expected.get("vector")));
+  }
+
+  /*
+   * Tests the collection's ability to read and iterate over documents in a Parquet file.
+   * Uses a pre-existing test file containing BGE embeddings from MS MARCO passages.
+   * Verifies that:
+   * 1. Documents can be read from the Parquet file
+   * 2. Each document has a valid ID
+   * 3. The collection contains the expected number of documents
+   */
+  @Test
+  public void testSegment() throws IOException {
+    Path path = Paths.get("src/test/resources/sample_docs/parquet/msmarco-passage-bge-base-en-v1.5.parquet");
+    ParquetDenseVectorCollection collection = new ParquetDenseVectorCollection(path);
+
+    AtomicInteger cnt = new AtomicInteger();
+    Map<String, Integer> docIds = new HashMap<>();
+
+    for (FileSegment<ParquetDenseVectorCollection.Document> segment : collection) {
+      for (ParquetDenseVectorCollection.Document doc : segment) {
+        docIds.put(doc.id(), cnt.incrementAndGet());
+      }
+    }
+
+    assertTrue("Collection should contain documents", docIds.size() > 0);
+    for (String docId : docIds.keySet()) {
+      assertTrue("Document ID should not be empty", docId != null && !docId.isEmpty());
+    }
+  }
+}