Java expose parquet pass_read_limit #14564

Merged
merged 2 commits on Dec 5, 2023
55 changes: 50 additions & 5 deletions java/src/main/java/ai/rapids/cudf/ParquetChunkedReader.java
@@ -48,7 +48,26 @@ public ParquetChunkedReader(long chunkSizeByteLimit, File filePath) {
* @param filePath Full path of the input Parquet file to read.
*/
public ParquetChunkedReader(long chunkSizeByteLimit, ParquetOptions opts, File filePath) {
handle = create(chunkSizeByteLimit, opts.getIncludeColumnNames(), opts.getReadBinaryAsString(),
handle = create(chunkSizeByteLimit, 0, opts.getIncludeColumnNames(), opts.getReadBinaryAsString(),
filePath.getAbsolutePath(), 0, 0, opts.timeUnit().typeId.getNativeId());

if (handle == 0) {
throw new IllegalStateException("Cannot create native chunked Parquet reader object.");
}
}

/**
* Construct the reader instance from a read limit, a ParquetOptions object, and a file path.
*
* @param chunkSizeByteLimit Limit on total number of bytes to be returned per read,
* or 0 if there is no limit.
* @param passReadLimit Limit on the amount of memory used for reading and decompressing data,
* or 0 if there is no limit.
* @param opts The options for Parquet reading.
* @param filePath Full path of the input Parquet file to read.
*/
public ParquetChunkedReader(long chunkSizeByteLimit, long passReadLimit, ParquetOptions opts, File filePath) {
handle = create(chunkSizeByteLimit, passReadLimit, opts.getIncludeColumnNames(), opts.getReadBinaryAsString(),
filePath.getAbsolutePath(), 0, 0, opts.timeUnit().typeId.getNativeId());

if (handle == 0) {
@@ -67,8 +86,31 @@ public ParquetChunkedReader(long chunkSizeByteLimit, ParquetOptions opts, File f
* @param len The number of bytes to parse the given buffer.
*/
public ParquetChunkedReader(long chunkSizeByteLimit, ParquetOptions opts, HostMemoryBuffer buffer,
long offset, long len) {
handle = create(chunkSizeByteLimit, opts.getIncludeColumnNames(), opts.getReadBinaryAsString(), null,
long offset, long len) {
handle = create(chunkSizeByteLimit, 0, opts.getIncludeColumnNames(), opts.getReadBinaryAsString(), null,
buffer.getAddress() + offset, len, opts.timeUnit().typeId.getNativeId());

if (handle == 0) {
throw new IllegalStateException("Cannot create native chunked Parquet reader object.");
}
}

/**
* Construct the reader instance from a read limit and a file already read in a memory buffer.
*
* @param chunkSizeByteLimit Limit on total number of bytes to be returned per read,
* or 0 if there is no limit.
* @param passReadLimit Limit on the amount of memory used for reading and decompressing data,
* or 0 if there is no limit.
* @param opts The options for Parquet reading.
* @param buffer Raw Parquet file content.
* @param offset The starting offset into buffer.
* @param len The number of bytes to parse the given buffer.
*/
public ParquetChunkedReader(long chunkSizeByteLimit, long passReadLimit,
ParquetOptions opts, HostMemoryBuffer buffer,
long offset, long len) {
handle = create(chunkSizeByteLimit, passReadLimit, opts.getIncludeColumnNames(), opts.getReadBinaryAsString(), null,
buffer.getAddress() + offset, len, opts.timeUnit().typeId.getNativeId());

if (handle == 0) {
@@ -169,15 +211,18 @@ public void close() {
*
* @param chunkSizeByteLimit Limit on total number of bytes to be returned per read,
* or 0 if there is no limit.
* @param passReadLimit Limit on the amount of memory used for reading and decompressing
* data, or 0 if there is no limit.
* @param filterColumnNames Name of the columns to read, or an empty array if we want to read all.
* @param binaryToString Whether to convert the corresponding column to String if it is binary.
* @param filePath Full path of the file to read, or given as null if reading from a buffer.
* @param bufferAddrs The address of a buffer to read from, or 0 if we are not using that buffer.
* @param length The length of the buffer to read from.
* @param timeUnit Return type of time unit for timestamps.
*/
private static native long create(long chunkSizeByteLimit, String[] filterColumnNames,
boolean[] binaryToString, String filePath, long bufferAddrs, long length, int timeUnit);
private static native long create(long chunkSizeByteLimit, long passReadLimit,
String[] filterColumnNames, boolean[] binaryToString,
String filePath, long bufferAddrs, long length, int timeUnit);

private static native long createWithDataSource(long chunkedSizeByteLimit,
String[] filterColumnNames, boolean[] binaryToString, int timeUnit, long dataSourceHandle);
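
For reference, a minimal usage sketch of the new file-based constructor (not part of this diff). It assumes the reader's existing hasNext()/readChunk() iteration API and AutoCloseable behavior; the limit values and file path are illustrative only, and 0 disables either limit.

import java.io.File;
import ai.rapids.cudf.ParquetChunkedReader;
import ai.rapids.cudf.ParquetOptions;
import ai.rapids.cudf.Table;

public class ChunkedReadExample {
  public static void main(String[] args) {
    // Illustrative limits: ~1 GiB per returned chunk, ~4 GiB of read/decompress memory per pass.
    try (ParquetChunkedReader reader = new ParquetChunkedReader(
        1L << 30,                  // chunkSizeByteLimit
        4L << 30,                  // passReadLimit (exposed by this PR)
        ParquetOptions.DEFAULT,
        new File(args[0]))) {
      while (reader.hasNext()) {
        try (Table chunk = reader.readChunk()) {
          System.out.println("Read a chunk with " + chunk.getRowCount() + " rows");
        }
      }
    }
  }
}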
9 changes: 6 additions & 3 deletions java/src/main/native/src/ChunkedReaderJni.cpp
@@ -36,8 +36,9 @@ extern "C" {
// This function should take all the parameters that `Table.readParquet` takes,
// plus one more parameter `long chunkSizeByteLimit`.
JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ParquetChunkedReader_create(
JNIEnv *env, jclass, jlong chunk_read_limit, jobjectArray filter_col_names,
jbooleanArray j_col_binary_read, jstring inp_file_path, jlong buffer, jlong buffer_length,
JNIEnv *env, jclass, jlong chunk_read_limit, jlong pass_read_limit,
jobjectArray filter_col_names, jbooleanArray j_col_binary_read, jstring inp_file_path,
jlong buffer, jlong buffer_length,
jint unit) {
JNI_NULL_CHECK(env, j_col_binary_read, "Null col_binary_read", 0);
bool read_buffer = true;
@@ -80,7 +81,9 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ParquetChunkedReader_create(
.build();

return reinterpret_cast<jlong>(new cudf::io::chunked_parquet_reader(
static_cast<std::size_t>(chunk_read_limit), read_opts));
static_cast<std::size_t>(chunk_read_limit),
static_cast<std::size_t>(pass_read_limit),
read_opts));
}
CATCH_STD(env, 0);
}
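
The same updated native create also backs the new buffer-based overload. A hedged sketch of that path from the Java side (the HostMemoryBuffer staging and the limit values are assumptions for illustration, not part of this change):

import java.nio.file.Files;
import java.nio.file.Paths;
import ai.rapids.cudf.HostMemoryBuffer;
import ai.rapids.cudf.ParquetChunkedReader;
import ai.rapids.cudf.ParquetOptions;
import ai.rapids.cudf.Table;

public class BufferChunkedReadExample {
  public static void main(String[] args) throws Exception {
    byte[] bytes = Files.readAllBytes(Paths.get(args[0]));
    try (HostMemoryBuffer buf = HostMemoryBuffer.allocate(bytes.length)) {
      // Stage the raw Parquet bytes in host memory, then read them back in bounded chunks.
      buf.setBytes(0, bytes, 0, bytes.length);
      try (ParquetChunkedReader reader = new ParquetChunkedReader(
          512L << 20,              // chunkSizeByteLimit: ~512 MiB per chunk
          2L << 30,                // passReadLimit: ~2 GiB per pass, 0 for no limit
          ParquetOptions.DEFAULT, buf, 0, bytes.length)) {
        while (reader.hasNext()) {
          try (Table chunk = reader.readChunk()) {
            // consume the chunk
          }
        }
      }
    }
  }
}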