Skip to content

Commit

Permalink
test: Add ParquetDenseVectorCollection tests using parquet-floor
Browse files Browse the repository at this point in the history
We created a test class for ParquetDenseVectorCollection that extends DocumentCollectionTest. Instead of creating new test files, we utilized an existing Parquet test file containing BGE embeddings.

Replaces Hadoop dependencies with parquet-floor for: reduced dependency footprint, simplified Parquet file handling, removal of complex Hadoop configuration.

Tests verify: basic Parquet file reading functionality, document iteration and content validation, integration with existing BGE embedding test data.
  • Loading branch information
Vincent Zhong committed Dec 23, 2024
1 parent 6a9cacf commit ae57dcb
Show file tree
Hide file tree
Showing 2 changed files with 77 additions and 167 deletions.
176 changes: 9 additions & 167 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,10 @@
</scm>

<repositories>
<repository>
<id>jitpack.io</id>
<url>https://jitpack.io</url>
</repository>
<repository>
<id>public</id>
<url>https://repo1.maven.org/maven2</url>
Expand All @@ -50,8 +54,8 @@
<url>https://raw.githubusercontent.com/lintool/AnseriniMaven/master/mvn-repo/</url>
</repository>
<repository>
<id>jitpack.io</id>
<url>https://jitpack.io</url>
<id>strategicblue</id>
<url>https://raw.githubusercontent.com/strategicblue/parquet-floor/mvn-repo/</url>
</repository>
</repositories>

Expand Down Expand Up @@ -547,172 +551,10 @@
</exclusions>
</dependency>
<!-- Apache Parquet Dependencies -->
<dependency>
<groupId>org.apache.parquet</groupId>
<artifactId>parquet-hadoop</artifactId>
<version>1.12.3</version>
<exclusions>
<exclusion>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-core</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-yarn-common</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.parquet</groupId>
<artifactId>parquet-common</artifactId>
<version>1.12.3</version>
</dependency>
<dependency>
<groupId>org.apache.parquet</groupId>
<artifactId>parquet-column</artifactId>
<version>1.12.3</version>
</dependency>
<dependency>
<groupId>org.apache.parquet</groupId>
<artifactId>parquet-format</artifactId>
<version>2.9.0</version>
</dependency>
<dependency>
<groupId>org.apache.parquet</groupId>
<artifactId>parquet-encoding</artifactId>
<version>1.12.3</version>
</dependency>

<!-- Necessary Hadoop Modules for Parquet -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>3.4.0</version>
<exclusions>
<exclusion>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-yarn-common</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-yarn-server-common</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-core</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-annotations</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpcore</artifactId>
</exclusion>
<exclusion>
<groupId>org.eclipse.jetty</groupId>
<artifactId>jetty-server</artifactId>
</exclusion>
<exclusion>
<groupId>org.eclipse.jetty</groupId>
<artifactId>jetty-servlet</artifactId>
</exclusion>
<exclusion>
<groupId>org.eclipse.jetty</groupId>
<artifactId>jetty-webapp</artifactId>
</exclusion>
<exclusion>
<groupId>org.eclipse.jetty</groupId>
<artifactId>jetty-util</artifactId>
</exclusion>
<exclusion>
<groupId>javax.servlet</groupId>
<artifactId>javax.servlet-api</artifactId>
</exclusion>
<exclusion>
<groupId>commons-beanutils</groupId>
<artifactId>commons-beanutils</artifactId>
</exclusion>
<exclusion>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
</exclusion>
<exclusion>
<groupId>commons-logging</groupId>
<artifactId>commons-logging</artifactId>
</exclusion>
<exclusion>
<groupId>com.google.protobuf</groupId>
<artifactId>protobuf-java</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.avro</groupId>
<artifactId>avro</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-auth</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.curator</groupId>
<artifactId>curator-client</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.curator</groupId>
<artifactId>curator-recipes</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.kerby</groupId>
<artifactId>kerb-core</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.kerby</groupId>
<artifactId>kerb-simplekdc</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.zookeeper</groupId>
<artifactId>zookeeper</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.zookeeper</groupId>
<artifactId>zookeeper-jute</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.kerby</groupId>
<artifactId>kerby-pkix</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.kerby</groupId>
<artifactId>kerb-admin</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client-api</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client-api</artifactId>
<version>3.3.1</version>
<exclusions>
<exclusion>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
</exclusion>
</exclusions>
<groupId>blue.strategic.parquet</groupId>
<artifactId>parquet-floor</artifactId>
<version>1.36</version>
</dependency>
</dependencies>
</project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
/*
* Tests for the ParquetDenseVectorCollection class which handles dense vector embeddings stored in Parquet format.
* This test suite verifies the collection's ability to read and process Parquet files containing vector embeddings
* using the parquet-floor library instead of Hadoop dependencies.
*/
package io.anserini.collection;

import org.junit.Before;
import org.junit.Test;

import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

public class ParquetDenseVectorCollectionTest extends DocumentCollectionTest<ParquetDenseVectorCollection.Document> {
@Before
public void setUp() throws Exception {
super.setUp();
}

/*
* Verifies that a document's properties match expected values.
* This implementation focuses on three key aspects:
* 1. Document can be indexed
* 2. Document ID matches expected value
* 3. Document's vector content is present
*/
@Override
void checkDocument(SourceDocument doc, Map<String, String> expected) {
assertTrue(doc.indexable());
assertEquals(expected.get("id"), doc.id());
assertTrue(doc.contents().contains(expected.get("vector")));
}

/*
* Tests the collection's ability to read and iterate over documents in a Parquet file.
* Uses a pre-existing test file containing BGE embeddings from MS MARCO passages.
* Verifies that:
* 1. Documents can be read from the Parquet file
* 2. Each document has a valid ID
* 3. The collection contains the expected number of documents
*/
@Test
public void testSegment() throws IOException {
Path path = Paths.get("src/test/resources/sample_docs/parquet/msmarco-passage-bge-base-en-v1.5.parquet");
ParquetDenseVectorCollection collection = new ParquetDenseVectorCollection(path);

AtomicInteger cnt = new AtomicInteger();
Map<String, Integer> docIds = new HashMap<>();

for (FileSegment<ParquetDenseVectorCollection.Document> segment : collection) {
for (ParquetDenseVectorCollection.Document doc : segment) {
docIds.put(doc.id(), cnt.incrementAndGet());
}
}

assertTrue("Collection should contain documents", docIds.size() > 0);
for (String docId : docIds.keySet()) {
assertTrue("Document ID should not be empty", docId != null && !docId.isEmpty());
}
}
}

0 comments on commit ae57dcb

Please sign in to comment.