Implement ORC chunked reader #15094
Changes from 118 commits
```diff
@@ -41,10 +41,15 @@ namespace orc::detail {
  * @brief Class to read ORC dataset data into columns.
  */
 class reader {
- private:
+ protected:
   class impl;
   std::unique_ptr<impl> _impl;
 
+  /**
+   * @brief Default constructor, needed for subclassing.
+   */
+  reader();
+
  public:
   /**
    * @brief Constructor from an array of datasources
```
```diff
@@ -62,7 +67,7 @@ class reader {
   /**
    * @brief Destructor explicitly declared to avoid inlining in header
    */
-  ~reader();
+  virtual ~reader();
 
   /**
    * @brief Reads the entire dataset.
```
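The diff above follows the standard recipe for making a pimpl class subclassable: the `impl` slot becomes `protected`, a protected default constructor lets a derived class install its own implementation, and the destructor becomes `virtual` so deletion through a base pointer is well-defined. A minimal self-contained sketch of that pattern — the names `base_reader` and `derived_reader` are illustrative, not cudf's:

```cpp
#include <memory>
#include <string>

// Base class using the pimpl idiom. The impl members are `protected` so a
// subclass can reuse the same _impl slot; the destructor is virtual so
// deleting a derived object through a base pointer is well-defined.
class base_reader {
 protected:
  struct impl {                    // in real pimpl code this lives in a .cpp file
    std::string state{"base"};
  };
  std::unique_ptr<impl> _impl;

  // Default constructor, needed for subclassing: it lets a derived class
  // install its own impl instead of the base one.
  base_reader() = default;

 public:
  explicit base_reader(std::string s) : _impl{std::make_unique<impl>()} {
    _impl->state = std::move(s);
  }
  virtual ~base_reader() = default;

  std::string state() const { return _impl ? _impl->state : std::string{"<empty>"}; }
};

// Subclass that supplies its own impl via the protected default constructor.
class derived_reader : private base_reader {
 public:
  derived_reader() {
    _impl = std::make_unique<impl>();
    _impl->state = "derived";
  }
  using base_reader::state;  // re-expose only the API we want
};
```

Here `impl` is defined inline so the sketch compiles on its own; in the real header only a forward declaration appears, which is exactly why the destructor must be defined out of line.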
````diff
@@ -73,6 +78,67 @@ class reader {
   table_with_metadata read(orc_reader_options const& options);
 };
 
+/**
+ * @brief The reader class that supports iterative reading of a given file.
+ *
+ * This class intentionally subclasses the `reader` class with private inheritance to hide the
+ * `reader::read()` API. As such, only chunked reading APIs are supported.
+ */
+class chunked_reader : private reader {
+ public:
+  /**
+   * @brief Constructor from size limits and an array of data sources with reader options.
+   *
+   * The typical usage should be similar to this:
+   * ```
+   * do {
+   *   auto const chunk = reader.read_chunk();
+   *   // Process chunk
+   * } while (reader.has_next());
+   * ```
+   *
+   * If `output_size_limit == 0` (i.e., no reading limit), a call to `read_chunk()` will read the
+   * whole file and return a table containing all rows.
+   *
+   * TODO: data read limit
+   *
+   * @param output_size_limit Limit on total number of bytes to be returned per read,
+   * or `0` if there is no limit
+   * @param data_read_limit Limit on memory usage for the purposes of decompression and processing
+   * of input, or `0` if there is no limit
+   * @param sources Input `datasource` objects to read the dataset from
+   * @param options Settings for controlling reading behavior
+   * @param stream CUDA stream used for device memory operations and kernel launches
+   * @param mr Device memory resource to use for device memory allocation
+   */
+  explicit chunked_reader(std::size_t output_size_limit,
+                          std::size_t data_read_limit,
+                          std::vector<std::unique_ptr<cudf::io::datasource>>&& sources,
+                          orc_reader_options const& options,
+                          rmm::cuda_stream_view stream,
+                          rmm::mr::device_memory_resource* mr);
````

> **Review thread on `data_read_limit`:** "If this is in bytes, …" — **Author:** "The doxygen should clarify its name, thus I'm inclined to keep the name shorter. I even prefer …"

> **Review thread on the `explicit` constructor:** "Silly question: why is this ctor `explicit`? … I take my question back. From scripture, this might be to prevent construction like `chunked_reader const my_reader = { 10, 20, std::vector{...}, orc_reader_options{...} };` — do I have this right?" — **Author:** "Exactly, the `explicit` …"
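The review exchange above hinges on copy-list-initialization: an `explicit` constructor can still be invoked with braces directly, but the `= { ... }` form is rejected at compile time. A toy sketch (the `widget` class is a stand-in, not cudf code):

```cpp
#include <cstddef>

// Toy class standing in for chunked_reader; only the ctor semantics matter.
struct widget {
  explicit widget(std::size_t limit) : limit_{limit} {}
  std::size_t limit_;
};

// Direct-list-initialization works with an explicit constructor:
//   widget w{10};
// Copy-list-initialization does not compile:
//   widget w = {10};   // error: chosen constructor is explicit
```

This matches the reviewer's reading: marking the constructor `explicit` forbids the `chunked_reader const my_reader = { ... };` spelling while leaving `chunked_reader my_reader{ ... };` available.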
```diff
+
+  /**
+   * @brief Destructor explicitly declared to avoid inlining in header.
+   *
+   * Since the declaration of the internal `_impl` object does not exist in this header, this
+   * destructor needs to be defined in a separate source file which can access that object's
+   * declaration.
+   */
+  ~chunked_reader();
+
+  /**
+   * @copydoc cudf::io::chunked_orc_reader::has_next
+   */
+  [[nodiscard]] bool has_next() const;
+
+  /**
+   * @copydoc cudf::io::chunked_orc_reader::read_chunk
+   */
+  [[nodiscard]] table_with_metadata read_chunk() const;
+};
+
 /**
  * @brief Class to write ORC dataset data into columns.
  */
```

> **Review thread on the `@copydoc` for `has_next`:** "Since …" — **Author:** "Thanks for reminding me of that. I'll move the docs to …"
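The `do { read_chunk(); } while (has_next());` pattern from the doxygen comment can be exercised against a mock. The `toy_chunked_reader` below is an illustrative stand-in for the cudf class, yielding fixed-size chunks over a vector and treating a limit of `0` as "return everything in one chunk", mirroring the documented `output_size_limit == 0` behavior:

```cpp
#include <algorithm>
#include <cstddef>
#include <vector>

// Stand-in for orc::detail::chunked_reader: yields the input in fixed-size
// "chunks" and reports whether more rows remain.
class toy_chunked_reader {
  std::vector<int> data_;
  std::size_t pos_{0};
  std::size_t chunk_rows_;

 public:
  toy_chunked_reader(std::vector<int> data, std::size_t chunk_rows)
    : data_{std::move(data)},
      // chunk_rows == 0 means "no limit": one chunk holding all rows.
      chunk_rows_{chunk_rows == 0 ? data_.size() : chunk_rows} {}

  bool has_next() const { return pos_ < data_.size(); }

  std::vector<int> read_chunk() {
    auto const end = std::min(pos_ + chunk_rows_, data_.size());
    std::vector<int> chunk(data_.begin() + pos_, data_.begin() + end);
    pos_ = end;
    return chunk;
  }
};

// The consumption loop exactly as documented in the header's doxygen:
inline std::size_t count_chunks(toy_chunked_reader& reader) {
  std::size_t n = 0;
  do {
    auto const chunk = reader.read_chunk();
    ++n;  // "process" the chunk
  } while (reader.has_next());
  return n;
}
```

Note the loop is a `do`/`while`, so `read_chunk()` is always called at least once; with an empty source it simply returns an empty chunk, which is consistent with the documented usage.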
```diff
@@ -133,5 +199,6 @@ class writer {
   */
  void skip_close();
 };
+
 } // namespace orc::detail
 } // namespace cudf::io
```
> **Reviewer:** Do we intend to remove this, or un-comment it?
> **Author:** Sorry, I still need to clean up this and much more 😄