From 81f8cdfdfb326afaee8177e4f40a607393b21b99 Mon Sep 17 00:00:00 2001 From: Nghia Truong <7416935+ttnghia@users.noreply.github.com> Date: Thu, 2 May 2024 16:23:18 -0700 Subject: [PATCH] Implement ORC chunked reader (#15094) This implements ORC chunked reader, to support reading ORC such that: * The output is multiple tables instead of once, each of them is issue when calling to `read_chunk()`, and has limited size which stays within a given `output_limit` parameter. * The temporary device memory usage can be limited by a soft limit `data_read_limit` parameter, allowing to read very large ORC files without OOM. * ORC files containing many billions of rows can be properly read chunk-by-chunk without seeing the size overflow issue when the number of rows exceeds cudf size limit (`2^31` rows). Depends on: * https://github.com/rapidsai/cudf/pull/14911 * https://github.com/rapidsai/cudf/pull/15008 * https://github.com/rapidsai/cudf/pull/15169 * https://github.com/rapidsai/cudf/pull/15252 Partially contribute to https://github.com/rapidsai/cudf/issues/12228. --- ## Benchmarks Due to some small optimizations in ORC reader, reading ORC files all-at-once (reading the entire file into just one output table) can be a little bit faster. For example, with the benchmark `orc_read_io_compression`: ``` ## [0] Quadro RTX 6000 | io | compression | cardinality | run_length | Ref Time | Ref Noise | Cmp Time | Cmp Noise | Diff | %Diff | Status | |---------------|---------------|---------------|--------------|------------|-------------|------------|-------------|---------------|---------|----------| | FILEPATH | SNAPPY | 0 | 1 | 183.027 ms | 7.45% | 157.293 ms | 4.72% | -25733.837 us | -14.06% | FAIL | | FILEPATH | SNAPPY | 1000 | 1 | 198.228 ms | 6.43% | 164.395 ms | 4.14% | -33833.020 us | -17.07% | FAIL | | FILEPATH | SNAPPY | 0 | 32 | 96.676 ms | 6.19% | 82.522 ms | 1.36% | -14153.945 us | -14.64% | FAIL | | FILEPATH | SNAPPY | 1000 | 32 | 94.508 ms | 4.80% | 81.078 ms | 0.48% | -13429.672 us | -14.21% | FAIL | | FILEPATH | NONE | 0 | 1 | 161.868 ms | 5.40% | 139.849 ms | 2.44% | -22018.910 us | -13.60% | FAIL | | FILEPATH | NONE | 1000 | 1 | 164.902 ms | 5.80% | 142.041 ms | 3.43% | -22861.258 us | -13.86% | FAIL | | FILEPATH | NONE | 0 | 32 | 88.298 ms | 5.15% | 74.924 ms | 1.97% | -13374.607 us | -15.15% | FAIL | | FILEPATH | NONE | 1000 | 32 | 87.147 ms | 5.61% | 72.502 ms | 0.50% | -14645.122 us | -16.81% | FAIL | | HOST_BUFFER | SNAPPY | 0 | 1 | 124.990 ms | 0.39% | 111.670 ms | 2.13% | -13320.483 us | -10.66% | FAIL | | HOST_BUFFER | SNAPPY | 1000 | 1 | 149.858 ms | 4.10% | 126.266 ms | 0.48% | -23591.543 us | -15.74% | FAIL | | HOST_BUFFER | SNAPPY | 0 | 32 | 92.499 ms | 4.46% | 77.653 ms | 1.58% | -14846.471 us | -16.05% | FAIL | | HOST_BUFFER | SNAPPY | 1000 | 32 | 93.373 ms | 4.14% | 80.033 ms | 3.19% | -13340.002 us | -14.29% | FAIL | | HOST_BUFFER | NONE | 0 | 1 | 111.792 ms | 0.50% | 97.083 ms | 0.50% | -14709.530 us | -13.16% | FAIL | | HOST_BUFFER | NONE | 1000 | 1 | 117.646 ms | 5.60% | 97.634 ms | 0.44% | -20012.301 us | -17.01% | FAIL | | HOST_BUFFER | NONE | 0 | 32 | 84.983 ms | 4.96% | 66.975 ms | 0.50% | -18007.403 us | -21.19% | FAIL | | HOST_BUFFER | NONE | 1000 | 32 | 82.648 ms | 4.42% | 65.510 ms | 0.91% | -17137.910 us | -20.74% | FAIL | | DEVICE_BUFFER | SNAPPY | 0 | 1 | 65.538 ms | 4.02% | 59.399 ms | 2.54% | -6138.560 us | -9.37% | FAIL | | DEVICE_BUFFER | SNAPPY | 1000 | 1 | 101.427 ms | 4.10% | 92.276 ms | 3.30% | -9150.278 us | -9.02% | FAIL | | DEVICE_BUFFER | SNAPPY | 0 | 32 | 80.133 ms | 4.64% | 73.959 ms | 3.50% | -6173.818 us | -7.70% | FAIL | | DEVICE_BUFFER | SNAPPY | 1000 | 32 | 86.232 ms | 4.71% | 77.446 ms | 3.32% | -8786.606 us | -10.19% | FAIL | | DEVICE_BUFFER | NONE | 0 | 1 | 52.189 ms | 6.62% | 45.018 ms | 4.11% | -7171.043 us | -13.74% | FAIL | | DEVICE_BUFFER | NONE | 1000 | 1 | 54.664 ms | 6.76% | 46.855 ms | 3.35% | -7809.803 us | -14.29% | FAIL | | DEVICE_BUFFER | NONE | 0 | 32 | 67.975 ms | 5.12% | 60.553 ms | 4.22% | -7422.279 us | -10.92% | FAIL | | DEVICE_BUFFER | NONE | 1000 | 32 | 68.485 ms | 4.86% | 62.253 ms | 6.23% | -6232.340 us | -9.10% | FAIL | ``` When memory is limited, chunked read can help avoiding OOM but with some sort of performance trade-off. For example, for reading a table of size 500MB from file using 64MB output limits and 640 MB data read limit: ``` | io | compression | cardinality | run_length | Ref Time | Ref Noise | Cmp Time | Cmp Noise | Diff | %Diff | Status | |---------------|---------------|---------------|--------------|------------|-------------|------------|-------------|------------|---------|----------| | FILEPATH | SNAPPY | 0 | 1 | 183.027 ms | 7.45% | 350.824 ms | 2.74% | 167.796 ms | 91.68% | FAIL | | FILEPATH | SNAPPY | 1000 | 1 | 198.228 ms | 6.43% | 322.414 ms | 3.46% | 124.186 ms | 62.65% | FAIL | | FILEPATH | SNAPPY | 0 | 32 | 96.676 ms | 6.19% | 133.363 ms | 4.78% | 36.686 ms | 37.95% | FAIL | | FILEPATH | SNAPPY | 1000 | 32 | 94.508 ms | 4.80% | 128.897 ms | 0.37% | 34.389 ms | 36.39% | FAIL | | FILEPATH | NONE | 0 | 1 | 161.868 ms | 5.40% | 316.637 ms | 4.21% | 154.769 ms | 95.61% | FAIL | | FILEPATH | NONE | 1000 | 1 | 164.902 ms | 5.80% | 326.043 ms | 3.06% | 161.141 ms | 97.72% | FAIL | | FILEPATH | NONE | 0 | 32 | 88.298 ms | 5.15% | 124.819 ms | 5.17% | 36.520 ms | 41.36% | FAIL | | FILEPATH | NONE | 1000 | 32 | 87.147 ms | 5.61% | 123.047 ms | 5.82% | 35.900 ms | 41.19% | FAIL | | HOST_BUFFER | SNAPPY | 0 | 1 | 124.990 ms | 0.39% | 285.718 ms | 0.78% | 160.728 ms | 128.59% | FAIL | | HOST_BUFFER | SNAPPY | 1000 | 1 | 149.858 ms | 4.10% | 263.491 ms | 2.89% | 113.633 ms | 75.83% | FAIL | | HOST_BUFFER | SNAPPY | 0 | 32 | 92.499 ms | 4.46% | 127.881 ms | 0.86% | 35.382 ms | 38.25% | FAIL | | HOST_BUFFER | SNAPPY | 1000 | 32 | 93.373 ms | 4.14% | 128.022 ms | 0.98% | 34.650 ms | 37.11% | FAIL | | HOST_BUFFER | NONE | 0 | 1 | 111.792 ms | 0.50% | 241.064 ms | 1.89% | 129.271 ms | 115.64% | FAIL | | HOST_BUFFER | NONE | 1000 | 1 | 117.646 ms | 5.60% | 248.134 ms | 3.08% | 130.488 ms | 110.92% | FAIL | | HOST_BUFFER | NONE | 0 | 32 | 84.983 ms | 4.96% | 118.049 ms | 5.99% | 33.066 ms | 38.91% | FAIL | | HOST_BUFFER | NONE | 1000 | 32 | 82.648 ms | 4.42% | 114.577 ms | 2.34% | 31.929 ms | 38.63% | FAIL | | DEVICE_BUFFER | SNAPPY | 0 | 1 | 65.538 ms | 4.02% | 232.466 ms | 3.28% | 166.928 ms | 254.71% | FAIL | | DEVICE_BUFFER | SNAPPY | 1000 | 1 | 101.427 ms | 4.10% | 221.578 ms | 1.43% | 120.152 ms | 118.46% | FAIL | | DEVICE_BUFFER | SNAPPY | 0 | 32 | 80.133 ms | 4.64% | 120.604 ms | 0.35% | 40.471 ms | 50.50% | FAIL | | DEVICE_BUFFER | SNAPPY | 1000 | 32 | 86.232 ms | 4.71% | 125.521 ms | 3.93% | 39.289 ms | 45.56% | FAIL | | DEVICE_BUFFER | NONE | 0 | 1 | 52.189 ms | 6.62% | 182.943 ms | 0.29% | 130.754 ms | 250.54% | FAIL | | DEVICE_BUFFER | NONE | 1000 | 1 | 54.664 ms | 6.76% | 190.501 ms | 0.49% | 135.836 ms | 248.49% | FAIL | | DEVICE_BUFFER | NONE | 0 | 32 | 67.975 ms | 5.12% | 107.172 ms | 3.56% | 39.197 ms | 57.66% | FAIL | | DEVICE_BUFFER | NONE | 1000 | 32 | 68.485 ms | 4.86% | 108.097 ms | 2.92% | 39.611 ms | 57.84% | FAIL | ``` And if memory is too limited, chunked read with 8MB output limit/80MB data read limit: ``` | io | compression | cardinality | run_length | Ref Time | Ref Noise | Cmp Time | Cmp Noise | Diff | %Diff | Status | | io | compression | cardinality | run_length | Ref Time | Ref Noise | Cmp Time | Cmp Noise | Diff | %Diff | Status | |---------------|---------------|---------------|--------------|------------|-------------|------------|-------------|------------|---------|----------| | FILEPATH | SNAPPY | 0 | 1 | 183.027 ms | 7.45% | 732.926 ms | 1.98% | 549.899 ms | 300.45% | FAIL | | FILEPATH | SNAPPY | 1000 | 1 | 198.228 ms | 6.43% | 834.309 ms | 4.21% | 636.081 ms | 320.88% | FAIL | | FILEPATH | SNAPPY | 0 | 32 | 96.676 ms | 6.19% | 363.033 ms | 1.66% | 266.356 ms | 275.51% | FAIL | | FILEPATH | SNAPPY | 1000 | 32 | 94.508 ms | 4.80% | 313.813 ms | 1.28% | 219.305 ms | 232.05% | FAIL | | FILEPATH | NONE | 0 | 1 | 161.868 ms | 5.40% | 607.700 ms | 2.90% | 445.832 ms | 275.43% | FAIL | | FILEPATH | NONE | 1000 | 1 | 164.902 ms | 5.80% | 616.101 ms | 3.46% | 451.199 ms | 273.62% | FAIL | | FILEPATH | NONE | 0 | 32 | 88.298 ms | 5.15% | 267.703 ms | 0.46% | 179.405 ms | 203.18% | FAIL | | FILEPATH | NONE | 1000 | 32 | 87.147 ms | 5.61% | 250.528 ms | 0.43% | 163.381 ms | 187.48% | FAIL | | HOST_BUFFER | SNAPPY | 0 | 1 | 124.990 ms | 0.39% | 636.270 ms | 0.44% | 511.280 ms | 409.06% | FAIL | | HOST_BUFFER | SNAPPY | 1000 | 1 | 149.858 ms | 4.10% | 747.264 ms | 0.50% | 597.406 ms | 398.65% | FAIL | | HOST_BUFFER | SNAPPY | 0 | 32 | 92.499 ms | 4.46% | 359.660 ms | 0.19% | 267.161 ms | 288.82% | FAIL | | HOST_BUFFER | SNAPPY | 1000 | 32 | 93.373 ms | 4.14% | 311.608 ms | 0.43% | 218.235 ms | 233.73% | FAIL | | HOST_BUFFER | NONE | 0 | 1 | 111.792 ms | 0.50% | 493.797 ms | 0.13% | 382.005 ms | 341.71% | FAIL | | HOST_BUFFER | NONE | 1000 | 1 | 117.646 ms | 5.60% | 516.706 ms | 0.12% | 399.060 ms | 339.20% | FAIL | | HOST_BUFFER | NONE | 0 | 32 | 84.983 ms | 4.96% | 258.477 ms | 0.46% | 173.495 ms | 204.15% | FAIL | | HOST_BUFFER | NONE | 1000 | 32 | 82.648 ms | 4.42% | 248.028 ms | 5.30% | 165.380 ms | 200.10% | FAIL | | DEVICE_BUFFER | SNAPPY | 0 | 1 | 65.538 ms | 4.02% | 606.010 ms | 3.76% | 540.472 ms | 824.68% | FAIL | | DEVICE_BUFFER | SNAPPY | 1000 | 1 | 101.427 ms | 4.10% | 742.774 ms | 4.64% | 641.347 ms | 632.33% | FAIL | | DEVICE_BUFFER | SNAPPY | 0 | 32 | 80.133 ms | 4.64% | 364.701 ms | 2.70% | 284.568 ms | 355.12% | FAIL | | DEVICE_BUFFER | SNAPPY | 1000 | 32 | 86.232 ms | 4.71% | 320.387 ms | 2.80% | 234.155 ms | 271.54% | FAIL | | DEVICE_BUFFER | NONE | 0 | 1 | 52.189 ms | 6.62% | 458.100 ms | 2.15% | 405.912 ms | 777.78% | FAIL | | DEVICE_BUFFER | NONE | 1000 | 1 | 54.664 ms | 6.76% | 478.527 ms | 1.41% | 423.862 ms | 775.39% | FAIL | | DEVICE_BUFFER | NONE | 0 | 32 | 67.975 ms | 5.12% | 260.009 ms | 3.71% | 192.034 ms | 282.51% | FAIL | | DEVICE_BUFFER | NONE | 1000 | 32 | 68.485 ms | 4.86% | 243.705 ms | 2.09% | 175.220 ms | 255.85% | FAIL | ``` Authors: - Nghia Truong (https://github.com/ttnghia) Approvers: - Bradley Dice (https://github.com/bdice) - https://github.com/nvdbaranec - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/15094 --- cpp/CMakeLists.txt | 3 +- cpp/benchmarks/io/orc/orc_reader_input.cpp | 106 +- cpp/include/cudf/io/detail/orc.hpp | 64 +- cpp/include/cudf/io/orc.hpp | 160 +- cpp/src/io/functions.cpp | 60 +- cpp/src/io/orc/aggregate_orc_metadata.cpp | 71 +- cpp/src/io/orc/aggregate_orc_metadata.hpp | 22 +- cpp/src/io/orc/orc.hpp | 8 +- cpp/src/io/orc/reader_impl.cu | 255 ++- cpp/src/io/orc/reader_impl.hpp | 164 +- cpp/src/io/orc/reader_impl_chunking.cu | 723 ++++++++ cpp/src/io/orc/reader_impl_chunking.hpp | 290 +++- ...pl_preprocess.cu => reader_impl_decode.cu} | 851 +++++----- cpp/src/io/orc/reader_impl_helpers.hpp | 4 +- cpp/src/io/parquet/reader_impl_helpers.cpp | 5 +- cpp/src/io/utilities/row_selection.cpp | 21 +- cpp/src/io/utilities/row_selection.hpp | 5 +- cpp/tests/CMakeLists.txt | 2 +- cpp/tests/io/orc_chunked_reader_test.cu | 1477 +++++++++++++++++ cpp/tests/io/row_selection_test.cpp | 13 - python/cudf/cudf/_lib/cpp/io/orc.pxd | 16 +- python/cudf/cudf/_lib/orc.pyx | 6 +- 22 files changed, 3685 insertions(+), 641 deletions(-) create mode 100644 cpp/src/io/orc/reader_impl_chunking.cu rename cpp/src/io/orc/{reader_impl_preprocess.cu => reader_impl_decode.cu} (56%) create mode 100644 cpp/tests/io/orc_chunked_reader_test.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 53da710f0ea..232a4f40d8e 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -395,8 +395,9 @@ add_library( src/io/orc/dict_enc.cu src/io/orc/orc.cpp src/io/orc/reader_impl.cu + src/io/orc/reader_impl_chunking.cu + src/io/orc/reader_impl_decode.cu src/io/orc/reader_impl_helpers.cpp - src/io/orc/reader_impl_preprocess.cu src/io/orc/stats_enc.cu src/io/orc/stripe_data.cu src/io/orc/stripe_enc.cu diff --git a/cpp/benchmarks/io/orc/orc_reader_input.cpp b/cpp/benchmarks/io/orc/orc_reader_input.cpp index fdb7dbe59b8..b7c214a8374 100644 --- a/cpp/benchmarks/io/orc/orc_reader_input.cpp +++ b/cpp/benchmarks/io/orc/orc_reader_input.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,31 +24,59 @@ #include +namespace { + // Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to // run on most GPUs, but large enough to allow highest throughput -constexpr int64_t data_size = 512 << 20; constexpr cudf::size_type num_cols = 64; +constexpr std::size_t data_size = 512 << 20; +constexpr std::size_t Mbytes = 1024 * 1024; +template void orc_read_common(cudf::size_type num_rows_to_read, cuio_source_sink_pair& source_sink, nvbench::state& state) { - cudf::io::orc_reader_options read_opts = - cudf::io::orc_reader_options::builder(source_sink.make_source_info()); + auto const read_opts = + cudf::io::orc_reader_options::builder(source_sink.make_source_info()).build(); auto mem_stats_logger = cudf::memory_stats_logger(); // init stats logger state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); - state.exec( - nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { - try_drop_l3_cache(); - - timer.start(); - auto const result = cudf::io::read_orc(read_opts); - timer.stop(); - CUDF_EXPECTS(result.tbl->num_columns() == num_cols, "Unexpected number of columns"); - CUDF_EXPECTS(result.tbl->num_rows() == num_rows_to_read, "Unexpected number of rows"); - }); + if constexpr (is_chunked_read) { + state.exec( + nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch&, auto& timer) { + try_drop_l3_cache(); + auto const output_limit_MB = + static_cast(state.get_int64("chunk_read_limit_MB")); + auto const read_limit_MB = static_cast(state.get_int64("pass_read_limit_MB")); + + auto reader = + cudf::io::chunked_orc_reader(output_limit_MB * Mbytes, read_limit_MB * Mbytes, read_opts); + cudf::size_type num_rows{0}; + + timer.start(); + do { + auto chunk = reader.read_chunk(); + num_rows += chunk.tbl->num_rows(); + } while (reader.has_next()); + timer.stop(); + + CUDF_EXPECTS(num_rows == num_rows_to_read, "Unexpected number of rows"); + }); + } else { // not is_chunked_read + state.exec( + nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch&, auto& timer) { + try_drop_l3_cache(); + + timer.start(); + auto const result = cudf::io::read_orc(read_opts); + timer.stop(); + + CUDF_EXPECTS(result.tbl->num_columns() == num_cols, "Unexpected number of columns"); + CUDF_EXPECTS(result.tbl->num_rows() == num_rows_to_read, "Unexpected number of rows"); + }); + } auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); state.add_element_count(static_cast(data_size) / time, "bytes_per_second"); @@ -57,6 +85,8 @@ void orc_read_common(cudf::size_type num_rows_to_read, state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size"); } +} // namespace + template void BM_orc_read_data(nvbench::state& state, nvbench::type_list, nvbench::enum_type>) @@ -79,13 +109,11 @@ void BM_orc_read_data(nvbench::state& state, return view.num_rows(); }(); - orc_read_common(num_rows_written, source_sink, state); + orc_read_common(num_rows_written, source_sink, state); } -template -void BM_orc_read_io_compression( - nvbench::state& state, - nvbench::type_list, nvbench::enum_type>) +template +void orc_read_io_compression(nvbench::state& state) { auto const d_type = get_type_or_group({static_cast(data_type::INTEGRAL_SIGNED), static_cast(data_type::FLOAT), @@ -95,15 +123,21 @@ void BM_orc_read_io_compression( static_cast(data_type::LIST), static_cast(data_type::STRUCT)}); - cudf::size_type const cardinality = state.get_int64("cardinality"); - cudf::size_type const run_length = state.get_int64("run_length"); + auto const [cardinality, run_length] = [&]() -> std::pair { + if constexpr (chunked_read) { + return {0, 4}; + } else { + return {static_cast(state.get_int64("cardinality")), + static_cast(state.get_int64("run_length"))}; + } + }(); cuio_source_sink_pair source_sink(IOType); auto const num_rows_written = [&]() { auto const tbl = create_random_table( cycle_dtypes(d_type, num_cols), table_size_bytes{data_size}, - data_profile_builder().cardinality(cardinality).avg_run_length(run_length)); + data_profile_builder{}.cardinality(cardinality).avg_run_length(run_length)); auto const view = tbl->view(); cudf::io::orc_writer_options opts = @@ -113,7 +147,23 @@ void BM_orc_read_io_compression( return view.num_rows(); }(); - orc_read_common(num_rows_written, source_sink, state); + orc_read_common(num_rows_written, source_sink, state); +} + +template +void BM_orc_read_io_compression( + nvbench::state& state, + nvbench::type_list, nvbench::enum_type>) +{ + return orc_read_io_compression(state); +} + +template +void BM_orc_chunked_read_io_compression(nvbench::state& state, + nvbench::type_list>) +{ + // Only run benchmark using HOST_BUFFER IO. + return orc_read_io_compression(state); } using d_type_list = nvbench::enum_type_list _impl; + std::unique_ptr _impl; public: /** @@ -68,10 +70,63 @@ class reader { /** * @brief Reads the entire dataset. * - * @param options Settings for controlling reading behavior * @return The set of columns along with table metadata */ - table_with_metadata read(orc_reader_options const& options); + table_with_metadata read(); +}; + +/** + * @brief The reader class that supports iterative reading from an array of data sources. + */ +class chunked_reader { + private: + std::unique_ptr _impl; + + public: + /** + * @copydoc cudf::io::chunked_orc_reader::chunked_orc_reader(std::size_t, std::size_t, size_type, + * orc_reader_options const&, rmm::cuda_stream_view, rmm::device_async_resource_ref) + * + * @param sources Input `datasource` objects to read the dataset from + */ + explicit chunked_reader(std::size_t chunk_read_limit, + std::size_t pass_read_limit, + size_type output_row_granularity, + std::vector>&& sources, + orc_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + /** + * @copydoc cudf::io::chunked_orc_reader::chunked_orc_reader(std::size_t, std::size_t, + * orc_reader_options const&, rmm::cuda_stream_view, rmm::device_async_resource_ref) + * + * @param sources Input `datasource` objects to read the dataset from + */ + explicit chunked_reader(std::size_t chunk_read_limit, + std::size_t pass_read_limit, + std::vector>&& sources, + orc_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + + /** + * @brief Destructor explicitly-declared to avoid inlined in header. + * + * Since the declaration of the internal `_impl` object does not exist in this header, this + * destructor needs to be defined in a separate source file which can access to that object's + * declaration. + */ + ~chunked_reader(); + + /** + * @copydoc cudf::io::chunked_orc_reader::has_next + */ + [[nodiscard]] bool has_next() const; + + /** + * @copydoc cudf::io::chunked_orc_reader::read_chunk + */ + [[nodiscard]] table_with_metadata read_chunk() const; }; /** @@ -126,5 +181,6 @@ class writer { */ void close(); }; + } // namespace orc::detail } // namespace cudf::io diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp index bceb258cb38..8140f8897b7 100644 --- a/cpp/include/cudf/io/orc.hpp +++ b/cpp/include/cudf/io/orc.hpp @@ -58,10 +58,10 @@ class orc_reader_options { // List of individual stripes to read (ignored if empty) std::vector> _stripes; - // Rows to skip from the start; ORC stores the number of rows as uint64_t - uint64_t _skip_rows = 0; + // Rows to skip from the start + int64_t _skip_rows = 0; // Rows to read; `nullopt` is all - std::optional _num_rows; + std::optional _num_rows; // Whether to use row index to speed-up reading bool _use_index = true; @@ -125,7 +125,7 @@ class orc_reader_options { * * @return Number of rows to skip from the start */ - uint64_t get_skip_rows() const { return _skip_rows; } + int64_t get_skip_rows() const { return _skip_rows; } /** * @brief Returns number of row to read. @@ -133,7 +133,7 @@ class orc_reader_options { * @return Number of rows to read; `nullopt` if the option hasn't been set (in which case the file * is read until the end) */ - std::optional const& get_num_rows() const { return _num_rows; } + std::optional const& get_num_rows() const { return _num_rows; } /** * @brief Whether to use row index to speed-up reading. @@ -198,10 +198,10 @@ class orc_reader_options { * @throw cudf::logic_error if a negative value is passed * @throw cudf::logic_error if stripes have been previously set */ - void set_skip_rows(uint64_t rows) + void set_skip_rows(int64_t rows) { + CUDF_EXPECTS(rows >= 0, "skip_rows cannot be negative"); CUDF_EXPECTS(rows == 0 or _stripes.empty(), "Can't set both skip_rows along with stripes"); - CUDF_EXPECTS(rows <= std::numeric_limits::max(), "skip_rows is too large"); _skip_rows = rows; } @@ -213,7 +213,7 @@ class orc_reader_options { * @throw cudf::logic_error if a negative value is passed * @throw cudf::logic_error if stripes have been previously set */ - void set_num_rows(size_type nrows) + void set_num_rows(int64_t nrows) { CUDF_EXPECTS(nrows >= 0, "num_rows cannot be negative"); CUDF_EXPECTS(_stripes.empty(), "Can't set both num_rows and stripes"); @@ -271,7 +271,7 @@ class orc_reader_options_builder { * * @param src The source information used to read orc file */ - explicit orc_reader_options_builder(source_info src) : options{std::move(src)} {}; + explicit orc_reader_options_builder(source_info src) : options{std::move(src)} {} /** * @brief Sets names of the column to read. @@ -303,7 +303,7 @@ class orc_reader_options_builder { * @param rows Number of rows * @return this for chaining */ - orc_reader_options_builder& skip_rows(uint64_t rows) + orc_reader_options_builder& skip_rows(int64_t rows) { options.set_skip_rows(rows); return *this; @@ -315,7 +315,7 @@ class orc_reader_options_builder { * @param nrows Number of rows * @return this for chaining */ - orc_reader_options_builder& num_rows(size_type nrows) + orc_reader_options_builder& num_rows(int64_t nrows) { options.set_num_rows(nrows); return *this; @@ -406,6 +406,144 @@ table_with_metadata read_orc( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); +/** + * @brief The chunked orc reader class to read an ORC file iteratively into a series of + * tables, chunk by chunk. + * + * This class is designed to address the reading issue when reading very large ORC files such + * that sizes of their columns exceed the limit that can be stored in cudf columns. By reading the + * file content by chunks using this class, each chunk is guaranteed to have its size stay within + * the given limit. + */ +class chunked_orc_reader { + public: + /** + * @brief Default constructor, this should never be used. + * + * This is added just to satisfy cython. + */ + chunked_orc_reader() = default; + + /** + * @brief Construct the reader from input/output size limits, output row granularity, along with + * other ORC reader options. + * + * The typical usage should be similar to this: + * ``` + * do { + * auto const chunk = reader.read_chunk(); + * // Process chunk + * } while (reader.has_next()); + * + * ``` + * + * If `chunk_read_limit == 0` (i.e., no output limit) and `pass_read_limit == 0` (no temporary + * memory size limit), a call to `read_chunk()` will read the whole data source and return a table + * containing all rows. + * + * The `chunk_read_limit` parameter controls the size of the output table to be returned per + * `read_chunk()` call. If the user specifies a 100 MB limit, the reader will attempt to return + * tables that have a total bytes size (over all columns) of 100 MB or less. + * This is a soft limit and the code will not fail if it cannot satisfy the limit. + * + * The `pass_read_limit` parameter controls how much temporary memory is used in the entire + * process of loading, decompressing and decoding of data. Again, this is also a soft limit and + * the reader will try to make the best effort. + * + * Finally, the parameter `output_row_granularity` controls the changes in row number of the + * output chunk. For each call to `read_chunk()`, with respect to the given `pass_read_limit`, a + * subset of stripes may be loaded, decompressed and decoded into an intermediate table. The + * reader will then subdivide that table into smaller tables for final output using + * `output_row_granularity` as the subdivision step. + * + * @param chunk_read_limit Limit on total number of bytes to be returned per `read_chunk()` call, + * or `0` if there is no limit + * @param pass_read_limit Limit on temporary memory usage for reading the data sources, + * or `0` if there is no limit + * @param output_row_granularity The granularity parameter used for subdividing the decoded + * table for final output + * @param options Settings for controlling reading behaviors + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource to use for device memory allocation + * + * @throw cudf::logic_error if `output_row_granularity` is non-positive + */ + explicit chunked_orc_reader( + std::size_t chunk_read_limit, + std::size_t pass_read_limit, + size_type output_row_granularity, + orc_reader_options const& options, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); + + /** + * @brief Construct the reader from input/output size limits along with other ORC reader options. + * + * This constructor implicitly call the other constructor with `output_row_granularity` set to + * `DEFAULT_OUTPUT_ROW_GRANULARITY` rows. + * + * @param chunk_read_limit Limit on total number of bytes to be returned per `read_chunk()` call, + * or `0` if there is no limit + * @param pass_read_limit Limit on temporary memory usage for reading the data sources, + * or `0` if there is no limit + * @param options Settings for controlling reading behaviors + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource to use for device memory allocation + */ + explicit chunked_orc_reader( + std::size_t chunk_read_limit, + std::size_t pass_read_limit, + orc_reader_options const& options, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); + + /** + * @brief Construct the reader from output size limits along with other ORC reader options. + * + * This constructor implicitly call the other constructor with `pass_read_limit` set to `0` and + * `output_row_granularity` set to `DEFAULT_OUTPUT_ROW_GRANULARITY` rows. + * + * @param chunk_read_limit Limit on total number of bytes to be returned per `read_chunk()` call, + * or `0` if there is no limit + * @param options Settings for controlling reading behaviors + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource to use for device memory allocation + */ + explicit chunked_orc_reader( + std::size_t chunk_read_limit, + orc_reader_options const& options, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); + + /** + * @brief Destructor, destroying the internal reader instance. + */ + ~chunked_orc_reader(); + + /** + * @brief Check if there is any data in the given data sources has not yet read. + * + * @return A boolean value indicating if there is any data left to read + */ + [[nodiscard]] bool has_next() const; + + /** + * @brief Read a chunk of rows in the given data sources. + * + * The sequence of returned tables, if concatenated by their order, guarantees to form a complete + * dataset as reading the entire given data sources at once. + * + * An empty table will be returned if the given sources are empty, or all the data has + * been read and returned by the previous calls. + * + * @return An output `cudf::table` along with its metadata + */ + [[nodiscard]] table_with_metadata read_chunk() const; + + private: + std::unique_ptr reader; +}; + /** @} */ // end of group /** * @addtogroup io_writers diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index 12059dffa4e..98b010109ec 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -420,7 +420,7 @@ table_with_metadata read_orc(orc_reader_options const& options, auto datasources = make_datasources(options.get_source()); auto reader = std::make_unique(std::move(datasources), options, stream, mr); - return reader->read(options); + return reader->read(); } /** @@ -440,6 +440,64 @@ void write_orc(orc_writer_options const& options, rmm::cuda_stream_view stream) writer->write(options.get_table()); } +chunked_orc_reader::chunked_orc_reader(std::size_t chunk_read_limit, + std::size_t pass_read_limit, + size_type output_row_granularity, + orc_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) + : reader{std::make_unique(chunk_read_limit, + pass_read_limit, + output_row_granularity, + make_datasources(options.get_source()), + options, + stream, + mr)} +{ +} + +chunked_orc_reader::chunked_orc_reader(std::size_t chunk_read_limit, + std::size_t pass_read_limit, + orc_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) + : reader{std::make_unique(chunk_read_limit, + pass_read_limit, + make_datasources(options.get_source()), + options, + stream, + mr)} +{ +} + +chunked_orc_reader::chunked_orc_reader(std::size_t chunk_read_limit, + orc_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) + : chunked_orc_reader(chunk_read_limit, 0UL, options, stream, mr) +{ +} + +// This destructor destroys the internal reader instance. +// Since the declaration of the internal `reader` object does not exist in the header, this +// destructor needs to be defined in a separate source file which can access to that object's +// declaration. +chunked_orc_reader::~chunked_orc_reader() = default; + +bool chunked_orc_reader::has_next() const +{ + CUDF_FUNC_RANGE(); + CUDF_EXPECTS(reader != nullptr, "Reader has not been constructed properly."); + return reader->has_next(); +} + +table_with_metadata chunked_orc_reader::read_chunk() const +{ + CUDF_FUNC_RANGE(); + CUDF_EXPECTS(reader != nullptr, "Reader has not been constructed properly."); + return reader->read_chunk(); +} + /** * @copydoc cudf::io::orc_chunked_writer::orc_chunked_writer */ diff --git a/cpp/src/io/orc/aggregate_orc_metadata.cpp b/cpp/src/io/orc/aggregate_orc_metadata.cpp index d54524f0f0d..94a4d146b35 100644 --- a/cpp/src/io/orc/aggregate_orc_metadata.cpp +++ b/cpp/src/io/orc/aggregate_orc_metadata.cpp @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "aggregate_orc_metadata.hpp" +#include "io/orc/aggregate_orc_metadata.hpp" #include "io/utilities/row_selection.hpp" @@ -152,22 +152,28 @@ aggregate_orc_metadata::aggregate_orc_metadata( } } -std::tuple> +std::tuple> aggregate_orc_metadata::select_stripes( std::vector> const& user_specified_stripes, int64_t skip_rows, - std::optional const& num_rows, + std::optional const& num_read_rows, rmm::cuda_stream_view stream) { - CUDF_EXPECTS((skip_rows == 0 and not num_rows.has_value()) or user_specified_stripes.empty(), + CUDF_EXPECTS((skip_rows == 0 and not num_read_rows.has_value()) or user_specified_stripes.empty(), "Can't use both the row selection and the stripe selection"); auto [rows_to_skip, rows_to_read] = [&]() { - if (not user_specified_stripes.empty()) { return std::pair{0, 0}; } - return cudf::io::detail::skip_rows_num_rows_from_options(skip_rows, num_rows, get_num_rows()); + if (not user_specified_stripes.empty()) { return std::pair{0, 0}; } + return cudf::io::detail::skip_rows_num_rows_from_options( + skip_rows, num_read_rows, get_num_rows()); }(); - std::vector selected_stripes_mapping; + struct stripe_source_mapping { + int source_idx; + std::vector stripe_info; + }; + + std::vector selected_stripes_mapping; if (!user_specified_stripes.empty()) { CUDF_EXPECTS(user_specified_stripes.size() == per_file_metadata.size(), @@ -176,7 +182,8 @@ aggregate_orc_metadata::select_stripes( // Each vector entry represents a source file; each nested vector represents the // user_defined_stripes to get from that source file for (size_t src_file_idx = 0; src_file_idx < user_specified_stripes.size(); ++src_file_idx) { - std::vector stripe_infos; + std::vector stripe_infos; + stripe_infos.reserve(user_specified_stripes[src_file_idx].size()); // Coalesce stripe info at the source file later since that makes downstream processing much // easier in impl::read @@ -185,11 +192,19 @@ aggregate_orc_metadata::select_stripes( stripe_idx >= 0 and stripe_idx < static_cast( per_file_metadata[src_file_idx].ff.stripes.size()), "Invalid stripe index"); - stripe_infos.push_back( - std::pair(&per_file_metadata[src_file_idx].ff.stripes[stripe_idx], nullptr)); - rows_to_read += per_file_metadata[src_file_idx].ff.stripes[stripe_idx].numberOfRows; + stripe_infos.push_back({&per_file_metadata[src_file_idx].ff.stripes[stripe_idx], + nullptr, + static_cast(src_file_idx)}); + + auto const stripe_rows = + per_file_metadata[src_file_idx].ff.stripes[stripe_idx].numberOfRows; + CUDF_EXPECTS(stripe_rows < static_cast(std::numeric_limits::max()), + "The number of rows in one stripe exceeds the column size limit.", + std::overflow_error); + rows_to_read += static_cast(stripe_rows); } - selected_stripes_mapping.push_back({static_cast(src_file_idx), stripe_infos}); + selected_stripes_mapping.emplace_back( + stripe_source_mapping{static_cast(src_file_idx), std::move(stripe_infos)}); } } else { int64_t count = 0; @@ -198,33 +213,44 @@ aggregate_orc_metadata::select_stripes( for (size_t src_file_idx = 0; src_file_idx < per_file_metadata.size() && count < rows_to_skip + rows_to_read; ++src_file_idx) { - std::vector stripe_infos; + std::vector stripe_infos; + stripe_infos.reserve(per_file_metadata[src_file_idx].ff.stripes.size()); for (size_t stripe_idx = 0; stripe_idx < per_file_metadata[src_file_idx].ff.stripes.size() && count < rows_to_skip + rows_to_read; ++stripe_idx) { - count += per_file_metadata[src_file_idx].ff.stripes[stripe_idx].numberOfRows; + auto const stripe_rows = + per_file_metadata[src_file_idx].ff.stripes[stripe_idx].numberOfRows; + CUDF_EXPECTS(stripe_rows < static_cast(std::numeric_limits::max()), + "The number of rows in one stripe exceeds the column size limit.", + std::overflow_error); + count += static_cast(stripe_rows); + if (count > rows_to_skip || count == 0) { - stripe_infos.push_back( - std::pair(&per_file_metadata[src_file_idx].ff.stripes[stripe_idx], nullptr)); + stripe_infos.push_back({&per_file_metadata[src_file_idx].ff.stripes[stripe_idx], + nullptr, + static_cast(src_file_idx)}); } else { stripe_skip_rows = count; } } - selected_stripes_mapping.push_back({static_cast(src_file_idx), stripe_infos}); + selected_stripes_mapping.emplace_back( + stripe_source_mapping{static_cast(src_file_idx), std::move(stripe_infos)}); } // Need to remove skipped rows from the stripes which are not selected. rows_to_skip -= stripe_skip_rows; } + std::vector output; + // Read each stripe's stripefooter metadata for (auto& mapping : selected_stripes_mapping) { // Resize to all stripe_info for the source level per_file_metadata[mapping.source_idx].stripefooters.resize(mapping.stripe_info.size()); for (size_t i = 0; i < mapping.stripe_info.size(); i++) { - auto const stripe = mapping.stripe_info[i].first; + auto const stripe = mapping.stripe_info[i].stripe_info; auto const sf_comp_offset = stripe->offset + stripe->indexLength + stripe->dataLength; auto const sf_comp_length = stripe->footerLength; CUDF_EXPECTS( @@ -236,12 +262,17 @@ aggregate_orc_metadata::select_stripes( {buffer->data(), buffer->size()}, stream); ProtobufReader(sf_data.data(), sf_data.size()) .read(per_file_metadata[mapping.source_idx].stripefooters[i]); - mapping.stripe_info[i].second = &per_file_metadata[mapping.source_idx].stripefooters[i]; + mapping.stripe_info[i].stripe_footer = + &per_file_metadata[mapping.source_idx].stripefooters[i]; if (stripe->indexLength == 0) { row_grp_idx_present = false; } } + + output.insert(output.end(), + std::make_move_iterator(mapping.stripe_info.begin()), + std::make_move_iterator(mapping.stripe_info.end())); } - return {rows_to_skip, rows_to_read, selected_stripes_mapping}; + return {rows_to_skip, rows_to_read, std::move(output)}; } column_hierarchy aggregate_orc_metadata::select_columns( diff --git a/cpp/src/io/orc/aggregate_orc_metadata.hpp b/cpp/src/io/orc/aggregate_orc_metadata.hpp index d1e053be481..5da5af58b9b 100644 --- a/cpp/src/io/orc/aggregate_orc_metadata.hpp +++ b/cpp/src/io/orc/aggregate_orc_metadata.hpp @@ -45,8 +45,6 @@ struct column_hierarchy { * to aggregate that metadata from all the files. */ class aggregate_orc_metadata { - using OrcStripeInfo = std::pair; - /** * @brief Sums up the number of rows of each source */ @@ -114,12 +112,22 @@ class aggregate_orc_metadata { * @brief Selects the stripes to read, based on the row/stripe selection parameters. * * Stripes are potentially selected from multiple files. + * + * Upon parsing stripes' information, the number of skip rows and reading rows are also updated + * to be matched with the actual numbers for reading stripes from data sources. + * + * @param user_specified_stripes The specified stripe indices to read + * @param skip_rows Number of rows to skip from reading + * @param num_read_rows Number of rows to read + * @param stream CUDA stream used for device memory operations and kernel launches + * @return A tuple of the corrected skip_rows and num_rows values along with a vector of + * stripes' metadata such as footer, data information, and source index */ - [[nodiscard]] std::tuple> - select_stripes(std::vector> const& user_specified_stripes, - int64_t skip_rows, - std::optional const& num_rows, - rmm::cuda_stream_view stream); + [[nodiscard]] std::tuple> select_stripes( + std::vector> const& user_specified_stripes, + int64_t skip_rows, + std::optional const& num_read_rows, + rmm::cuda_stream_view stream); /** * @brief Filters ORC file to a selection of columns, based on their paths in the file. diff --git a/cpp/src/io/orc/orc.hpp b/cpp/src/io/orc/orc.hpp index 88bd260a598..fd55cbb6846 100644 --- a/cpp/src/io/orc/orc.hpp +++ b/cpp/src/io/orc/orc.hpp @@ -602,13 +602,13 @@ struct column_validity_info { * convenience methods for initializing and accessing metadata. */ class metadata { - using OrcStripeInfo = std::pair; - public: - struct stripe_source_mapping { + struct orc_stripe_info { + StripeInformation const* stripe_info; + StripeFooter const* stripe_footer; int source_idx; - std::vector stripe_info; }; + std::vector stripe_info; public: explicit metadata(datasource* const src, rmm::cuda_stream_view stream); diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 77151f5b7b8..621d4c67691 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -14,42 +14,100 @@ * limitations under the License. */ -#include "reader_impl.hpp" -#include "reader_impl_chunking.hpp" -#include "reader_impl_helpers.hpp" +#include "io/orc/reader_impl.hpp" +#include "io/orc/reader_impl_chunking.hpp" +#include "io/orc/reader_impl_helpers.hpp" -#include +#include + +#include namespace cudf::io::orc::detail { -reader::impl::impl(std::vector>&& sources, - orc_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) - : _stream(stream), - _mr(mr), - _timestamp_type{options.get_timestamp_type()}, - _use_index{options.is_enabled_use_index()}, - _use_np_dtypes{options.is_enabled_use_np_dtypes()}, - _decimal128_columns{options.get_decimal128_columns()}, - _col_meta{std::make_unique()}, - _sources(std::move(sources)), - _metadata{_sources, stream}, - _selected_columns{_metadata.select_columns(options.get_columns())} +// This is just the proxy to call all other data preprocessing functions. +void reader_impl::prepare_data(read_mode mode) { + // There are no columns in the table. + if (_selected_columns.num_levels() == 0) { return; } + + // This will be no-op if it was called before. + preprocess_file(mode); + + if (!_chunk_read_data.more_table_chunks_to_output()) { + if (!_chunk_read_data.more_stripes_to_decode() && _chunk_read_data.more_stripes_to_load()) { + // Only load stripe data if: + // - There is more stripe to load, and + // - All loaded stripes were decoded, and + // - All the decoded results were output. + load_next_stripe_data(mode); + } + if (_chunk_read_data.more_stripes_to_decode()) { + // Only decompress/decode the loaded stripes if: + // - There are loaded stripes that were not decoded yet, and + // - All the decoded results were output. + decompress_and_decode_stripes(mode); + } + } } -table_with_metadata reader::impl::read(int64_t skip_rows, - std::optional const& num_rows_opt, - std::vector> const& stripes) +table_with_metadata reader_impl::make_output_chunk() { - prepare_data(skip_rows, num_rows_opt, stripes); - return read_chunk_internal(); + // There are no columns in the table. + if (_selected_columns.num_levels() == 0) { return {std::make_unique(), table_metadata{}}; } + + // If no rows or stripes to read, return empty columns. + if (!_chunk_read_data.more_table_chunks_to_output()) { + std::vector> out_columns; + auto out_metadata = get_meta_with_user_data(); + std::transform(_selected_columns.levels[0].begin(), + _selected_columns.levels[0].end(), + std::back_inserter(out_columns), + [&](auto const& col_meta) { + out_metadata.schema_info.emplace_back(""); + return create_empty_column(col_meta.id, + _metadata, + _options.decimal128_columns, + _options.use_np_dtypes, + _options.timestamp_type, + out_metadata.schema_info.back(), + _stream); + }); + return {std::make_unique
(std::move(out_columns)), std::move(out_metadata)}; + } + + auto const make_output_table = [&] { + if (_chunk_read_data.output_table_ranges.size() == 1) { + // Must change the index of the current output range such that calling `has_next()` after + // this will return the correct answer (`false`, since there is only one range). + _chunk_read_data.curr_output_table_range++; + + // Just hand over the decoded table without slicing. + return std::move(_chunk_read_data.decoded_table); + } + + // The range of rows in the decoded table to output. + auto const out_range = + _chunk_read_data.output_table_ranges[_chunk_read_data.curr_output_table_range++]; + auto const out_tview = cudf::detail::slice( + _chunk_read_data.decoded_table->view(), + {static_cast(out_range.begin), static_cast(out_range.end)}, + _stream)[0]; + auto output = std::make_unique
(out_tview, _stream, _mr); + + // If this is the last slice, we also delete the decoded table to free up memory. + if (!_chunk_read_data.more_table_chunks_to_output()) { + _chunk_read_data.decoded_table.reset(nullptr); + } + + return output; + }; + + return {make_output_table(), table_metadata{_out_metadata} /*copy cached metadata*/}; } -table_metadata reader::impl::make_output_metadata() +table_metadata reader_impl::get_meta_with_user_data() { - if (_output_metadata) { return table_metadata{*_output_metadata}; } + if (_meta_with_user_data) { return table_metadata{*_meta_with_user_data}; } // Copy user data to the output metadata. table_metadata out_metadata; @@ -70,69 +128,126 @@ table_metadata reader::impl::make_output_metadata() out_metadata.user_data = {out_metadata.per_file_user_data[0].begin(), out_metadata.per_file_user_data[0].end()}; - // Save the output table metadata into `_output_metadata` for reuse next time. - _output_metadata = std::make_unique(out_metadata); + // Save the output table metadata into `_meta_with_user_data` for reuse next time. + _meta_with_user_data = std::make_unique(out_metadata); return out_metadata; } -table_with_metadata reader::impl::read_chunk_internal() +reader_impl::reader_impl(std::vector>&& sources, + orc_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) + : reader_impl::reader_impl(0UL, 0UL, std::move(sources), options, stream, mr) { - // There is no columns in the table. - if (_selected_columns.num_levels() == 0) { return {std::make_unique
(), table_metadata{}}; } +} - std::vector> out_columns; - auto out_metadata = make_output_metadata(); +reader_impl::reader_impl(std::size_t chunk_read_limit, + std::size_t pass_read_limit, + std::vector>&& sources, + orc_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) + : reader_impl::reader_impl(chunk_read_limit, + pass_read_limit, + DEFAULT_OUTPUT_ROW_GRANULARITY, + std::move(sources), + options, + stream, + mr) +{ +} - // If no rows or stripes to read, return empty columns - if (_file_itm_data->rows_to_read == 0 || _file_itm_data->selected_stripes.empty()) { - std::transform(_selected_columns.levels[0].begin(), - _selected_columns.levels[0].end(), - std::back_inserter(out_columns), - [&](auto const col_meta) { - out_metadata.schema_info.emplace_back(""); - return create_empty_column(col_meta.id, - _metadata, - _decimal128_columns, - _use_np_dtypes, - _timestamp_type, - out_metadata.schema_info.back(), - _stream); - }); - return {std::make_unique
(std::move(out_columns)), std::move(out_metadata)}; - } +reader_impl::reader_impl(std::size_t chunk_read_limit, + std::size_t pass_read_limit, + size_type output_row_granularity, + std::vector>&& sources, + orc_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) + : _stream(stream), + _mr(mr), + _options{options.get_timestamp_type(), + options.is_enabled_use_index(), + options.is_enabled_use_np_dtypes(), + options.get_decimal128_columns(), + options.get_skip_rows(), + options.get_num_rows(), + options.get_stripes()}, + _col_meta{std::make_unique()}, + _sources(std::move(sources)), + _metadata{_sources, stream}, + _selected_columns{_metadata.select_columns(options.get_columns())}, + _chunk_read_data{chunk_read_limit, pass_read_limit, output_row_granularity} +{ + // Selected columns at different levels of nesting are stored in different elements + // of `selected_columns`; thus, size == 1 means no nested columns. + CUDF_EXPECTS(_options.skip_rows == 0 or _selected_columns.num_levels() == 1, + "skip_rows is not supported by nested column"); +} + +table_with_metadata reader_impl::read() +{ + prepare_data(read_mode::READ_ALL); + return make_output_chunk(); +} - // Create columns from buffer with respective schema information. - std::transform( - _selected_columns.levels[0].begin(), - _selected_columns.levels[0].end(), - std::back_inserter(out_columns), - [&](auto const& orc_col_meta) { - out_metadata.schema_info.emplace_back(""); - auto col_buffer = assemble_buffer( - orc_col_meta.id, 0, *_col_meta, _metadata, _selected_columns, _out_buffers, _stream, _mr); - return make_column(col_buffer, &out_metadata.schema_info.back(), std::nullopt, _stream); - }); - - return {std::make_unique
(std::move(out_columns)), std::move(out_metadata)}; +bool reader_impl::has_next() +{ + prepare_data(read_mode::CHUNKED_READ); + return _chunk_read_data.has_next(); +} + +table_with_metadata reader_impl::read_chunk() +{ + prepare_data(read_mode::CHUNKED_READ); + return make_output_chunk(); } -// Forward to implementation +chunked_reader::chunked_reader(std::size_t chunk_read_limit, + std::size_t pass_read_limit, + std::vector>&& sources, + orc_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) + : _impl{std::make_unique( + chunk_read_limit, pass_read_limit, std::move(sources), options, stream, mr)} +{ +} + +chunked_reader::chunked_reader(std::size_t chunk_read_limit, + std::size_t pass_read_limit, + size_type output_row_granularity, + std::vector>&& sources, + orc_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) + : _impl{std::make_unique(chunk_read_limit, + pass_read_limit, + output_row_granularity, + std::move(sources), + options, + stream, + mr)} +{ +} + +chunked_reader::~chunked_reader() = default; + +bool chunked_reader::has_next() const { return _impl->has_next(); } + +table_with_metadata chunked_reader::read_chunk() const { return _impl->read_chunk(); } + reader::reader(std::vector>&& sources, orc_reader_options const& options, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) - : _impl{std::make_unique(std::move(sources), options, stream, mr)} + : _impl{std::make_unique(std::move(sources), options, stream, mr)} { } -// Destructor within this translation unit reader::~reader() = default; -// Forward to implementation -table_with_metadata reader::read(orc_reader_options const& options) -{ - return _impl->read(options.get_skip_rows(), options.get_num_rows(), options.get_stripes()); -} +table_with_metadata reader::read() { return _impl->read(); } } // namespace cudf::io::orc::detail diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp index 8b859da07e9..94b294087b8 100644 --- a/cpp/src/io/orc/reader_impl.hpp +++ b/cpp/src/io/orc/reader_impl.hpp @@ -16,8 +16,8 @@ #pragma once -#include "aggregate_orc_metadata.hpp" -#include "io/utilities/column_buffer.hpp" +#include "io/orc/aggregate_orc_metadata.hpp" +#include "io/orc/reader_impl_chunking.hpp" #include #include @@ -26,6 +26,8 @@ #include #include +#include + #include #include #include @@ -33,83 +35,169 @@ namespace cudf::io::orc::detail { struct reader_column_meta; -struct file_intermediate_data; /** * @brief Implementation for ORC reader. */ -class reader::impl { +class reader_impl { public: /** * @brief Constructor from a dataset source with reader options. * + * This constructor will call the other constructor with `chunk_read_limit` and `pass_read_limit` + * set to `0` and `output_row_granularity` set to `DEFAULT_OUTPUT_ROW_GRANULARITY`. + * * @param sources Dataset sources * @param options Settings for controlling reading behavior * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation */ - explicit impl(std::vector>&& sources, - orc_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); + explicit reader_impl(std::vector>&& sources, + orc_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); /** - * @brief Read an entire set or a subset of data and returns a set of columns - * - * @param skip_rows Number of rows to skip from the start - * @param num_rows_opt Optional number of rows to read, or `std::nullopt` to read all rows - * @param stripes Indices of individual stripes to load if non-empty - * @return The set of columns along with metadata + * @copydoc cudf::io::orc::detail::chunked_reader::chunked_reader(std::size_t, std::size_t, + * orc_reader_options const&, rmm::cuda_stream_view, rmm::device_async_resource_ref) + */ + explicit reader_impl(std::size_t chunk_read_limit, + std::size_t pass_read_limit, + std::vector>&& sources, + orc_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + + /** + * @copydoc cudf::io::orc::detail::chunked_reader::chunked_reader(std::size_t, std::size_t, + * size_type, orc_reader_options const&, rmm::cuda_stream_view, rmm::device_async_resource_ref) */ - table_with_metadata read(int64_t skip_rows, - std::optional const& num_rows_opt, - std::vector> const& stripes); + explicit reader_impl(std::size_t chunk_read_limit, + std::size_t pass_read_limit, + size_type output_row_granularity, + std::vector>&& sources, + orc_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + + /** + * @copydoc cudf::io::orc::detail::reader::read + */ + table_with_metadata read(); + + /** + * @copydoc cudf::io::chunked_orc_reader::has_next + */ + bool has_next(); + + /** + * @copydoc cudf::io::chunked_orc_reader::read_chunk + */ + table_with_metadata read_chunk(); private: + /** + * @brief The enum indicating whether the data sources are read all at once or chunk by chunk. + */ + enum class read_mode { READ_ALL, CHUNKED_READ }; + /** * @brief Perform all the necessary data preprocessing before creating an output table. * - * @param skip_rows Number of rows to skip from the start - * @param num_rows_opt Optional number of rows to read, or `std::nullopt` to read all rows - * @param stripes Indices of individual stripes to load if non-empty + * This is the proxy to call all other data preprocessing functions, which are prerequisite + * for generating the output. + * + * @param mode Value indicating if the data sources are read all at once or chunk by chunk */ - void prepare_data(int64_t skip_rows, - std::optional const& num_rows_opt, - std::vector> const& stripes); + void prepare_data(read_mode mode); /** - * @brief Create the output table metadata from file metadata. + * @brief Perform a preprocessing step on the input data sources that executes exactly once + * for the entire duration of the reader. * - * @return Columns' metadata to output with the table read from file + * In this step, the metadata of all stripes in the data sources is parsed, and information about + * data streams of the selected columns in all stripes are generated. If the reader has a data + * read limit, sizes of these streams are used to split the list of all stripes into multiple + * subsets, each of which will be loaded into memory in the `load_next_stripe_data()` step. These + * subsets are computed such that memory usage will be kept to be around a fixed size limit. + * + * @param mode Value indicating if the data sources are read all at once or chunk by chunk + */ + void preprocess_file(read_mode mode); + + /** + * @brief Load stripes from the input data sources into memory. + * + * If there is a data read limit, only a subset of stripes are read at a time such that + * their total data size does not exceed a fixed size limit. Then, the data is probed to + * estimate its uncompressed sizes, which are in turn used to split that stripe subset into + * smaller subsets, each of which to be decompressed and decoded in the next step + * `decompress_and_decode_stripes()`. This is to ensure that loading data from data sources + * together with decompression and decoding will be capped around the given data read limit. + * + * @param mode Value indicating if the data sources are read all at once or chunk by chunk */ - table_metadata make_output_metadata(); + void load_next_stripe_data(read_mode mode); /** - * @brief Read a chunk of data from the input source and return an output table with metadata. + * @brief Decompress and decode stripe data in the internal buffers, and store the result into + * an intermediate table. + * + * This function expects that the other preprocessing steps (`global preprocess()` and + * `load_next_stripe_data()`) have already been done. * - * This function is called internally and expects all preprocessing steps have already been done. + * @param mode Value indicating if the data sources are read all at once or chunk by chunk + */ + void decompress_and_decode_stripes(read_mode mode); + + /** + * @brief Create the output table from the intermediate table and return it along with metadata. * * @return The output table along with columns' metadata */ - table_with_metadata read_chunk_internal(); + table_with_metadata make_output_chunk(); + + /** + * @brief Create the output table metadata storing user data in source metadata. + * + * @return Columns' user data to output with the table read from file + */ + table_metadata get_meta_with_user_data(); rmm::cuda_stream_view const _stream; rmm::device_async_resource_ref const _mr; - // Reader configs - data_type const _timestamp_type; // Override output timestamp resolution - bool const _use_index; // Enable or disable attempt to use row index for parsing - bool const _use_np_dtypes; // Enable or disable the conversion to numpy-compatible dtypes - std::vector const _decimal128_columns; // Control decimals conversion - std::unique_ptr const _col_meta; // Track of orc mapping and child details + // Reader configs. + struct { + data_type timestamp_type; // override output timestamp resolution + bool use_index; // enable or disable attempt to use row index for parsing + bool use_np_dtypes; // enable or disable the conversion to numpy-compatible dtypes + std::vector decimal128_columns; // control decimals conversion - // Intermediate data for internal processing. + // User specified reading rows/stripes selection. + int64_t const skip_rows; + std::optional num_read_rows; + std::vector> const selected_stripes; + } const _options; + + // Intermediate data for reading. + std::unique_ptr const _col_meta; // Track of orc mapping and child details std::vector> const _sources; // Unused but owns data for `_metadata` aggregate_orc_metadata _metadata; column_hierarchy const _selected_columns; // Construct from `_metadata` thus declare after it - std::unique_ptr _file_itm_data; - std::unique_ptr _output_metadata; + file_intermediate_data _file_itm_data; + chunk_read_data _chunk_read_data; + + // Intermediate data for output. + std::unique_ptr _meta_with_user_data; + table_metadata _out_metadata; std::vector> _out_buffers; + + // The default value used for subdividing the decoded table for final output. + // Larger values will reduce the computation time but will make the output table less granular. + // Smaller values (minimum is `1`) will increase the computation time but the output table will + // have size closer to the given `chunk_read_limit`. + static inline constexpr size_type DEFAULT_OUTPUT_ROW_GRANULARITY = 10'000; }; } // namespace cudf::io::orc::detail diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu new file mode 100644 index 00000000000..5034aa14a95 --- /dev/null +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -0,0 +1,723 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "io/comp/gpuinflate.hpp" +#include "io/orc/reader_impl.hpp" +#include "io/orc/reader_impl_chunking.hpp" +#include "io/orc/reader_impl_helpers.hpp" +#include "io/utilities/hostdevice_span.hpp" + +#include +#include +#include + +#include +#include + +#include +#include +#include + +#include +#include + +namespace cudf::io::orc::detail { + +std::size_t gather_stream_info_and_column_desc( + std::size_t stripe_id, + std::size_t level, + orc::StripeInformation const* stripeinfo, + orc::StripeFooter const* stripefooter, + host_span orc2gdf, + host_span types, + bool use_index, + bool apply_struct_map, + int64_t* num_dictionary_entries, + std::size_t* local_stream_order, + std::vector* stream_info, + cudf::detail::hostdevice_2dvector* chunks) +{ + CUDF_EXPECTS((stream_info == nullptr) ^ (chunks == nullptr), + "Either stream_info or chunks must be provided, but not both."); + + std::size_t src_offset = 0; + std::size_t dst_offset = 0; + + auto const get_stream_index_type = [](orc::StreamKind kind) { + switch (kind) { + case orc::DATA: return gpu::CI_DATA; + case orc::LENGTH: + case orc::SECONDARY: return gpu::CI_DATA2; + case orc::DICTIONARY_DATA: return gpu::CI_DICTIONARY; + case orc::PRESENT: return gpu::CI_PRESENT; + case orc::ROW_INDEX: return gpu::CI_INDEX; + default: + // Skip this stream as it's not strictly required + return gpu::CI_NUM_STREAMS; + } + }; + + for (auto const& stream : stripefooter->streams) { + if (!stream.column_id || *stream.column_id >= orc2gdf.size()) { + // Ignore reading this stream from source. + CUDF_LOG_WARN("Unexpected stream in the input ORC source. The stream will be ignored."); + src_offset += stream.length; + continue; + } + + auto const column_id = *stream.column_id; + auto col = orc2gdf[column_id]; + + if (col == -1 and apply_struct_map) { + // A struct-type column has no data itself, but rather child columns + // for each of its fields. There is only a PRESENT stream, which + // needs to be included for the reader. + auto const schema_type = types[column_id]; + if (!schema_type.subtypes.empty() && schema_type.kind == orc::STRUCT && + stream.kind == orc::PRESENT) { + for (auto const& idx : schema_type.subtypes) { + auto const child_idx = (idx < orc2gdf.size()) ? orc2gdf[idx] : -1; + if (child_idx >= 0) { + col = child_idx; + if (chunks) { + auto& chunk = (*chunks)[stripe_id][col]; + chunk.strm_id[gpu::CI_PRESENT] = *local_stream_order; + chunk.strm_len[gpu::CI_PRESENT] = stream.length; + } + } + } + } + } else if (col != -1) { + if (chunks) { + if (src_offset >= stripeinfo->indexLength || use_index) { + auto const index_type = get_stream_index_type(stream.kind); + if (index_type < gpu::CI_NUM_STREAMS) { + auto& chunk = (*chunks)[stripe_id][col]; + chunk.strm_id[index_type] = *local_stream_order; + chunk.strm_len[index_type] = stream.length; + // NOTE: skip_count field is temporarily used to track the presence of index streams + chunk.skip_count |= 1 << index_type; + + if (index_type == gpu::CI_DICTIONARY) { + chunk.dictionary_start = *num_dictionary_entries; + chunk.dict_len = stripefooter->columns[column_id].dictionarySize; + *num_dictionary_entries += + static_cast(stripefooter->columns[column_id].dictionarySize); + } + } + } + + (*local_stream_order)++; + } else { // chunks == nullptr + stream_info->emplace_back( + orc_stream_info{stripeinfo->offset + src_offset, + dst_offset, + stream.length, + stream_source_info{stripe_id, level, column_id, stream.kind}}); + } + + dst_offset += stream.length; + } + src_offset += stream.length; + } + + return dst_offset; +} + +template +std::vector find_splits(host_span cumulative_sizes, + std::size_t total_count, + std::size_t size_limit) +{ + CUDF_EXPECTS(size_limit > 0, "Invalid size limit", std::invalid_argument); + + std::vector splits; + std::size_t cur_count{0}; + int64_t cur_pos{0}; + std::size_t cur_cumulative_size{0}; + + [[maybe_unused]] std::size_t cur_cumulative_rows{0}; + + auto const start = thrust::make_transform_iterator( + cumulative_sizes.begin(), + [&](auto const& size) { return size.size_bytes - cur_cumulative_size; }); + auto const end = start + cumulative_sizes.size(); + + while (cur_count < total_count) { + int64_t split_pos = static_cast( + thrust::distance(start, thrust::lower_bound(thrust::seq, start + cur_pos, end, size_limit))); + + // If we're past the end, or if the returned range has size exceeds the given size limit, + // move back one position. + if (split_pos >= static_cast(cumulative_sizes.size()) || + (cumulative_sizes[split_pos].size_bytes > cur_cumulative_size + size_limit)) { + split_pos--; + } + + if constexpr (std::is_same_v) { + // Similarly, while the returned range has total number of rows exceeds column size limit, + // move back one position. + while (split_pos > 0 && cumulative_sizes[split_pos].num_rows > + cur_cumulative_rows + + static_cast(std::numeric_limits::max())) { + split_pos--; + } + } + + // In case we have moved back too much in the steps above, far beyond the last split point, that + // means we could not find any range that has size fits within the given size limit. + // In such situations, we need to move forward until we move pass the last output range. + while (split_pos < (static_cast(cumulative_sizes.size()) - 1) && + (split_pos < 0 || cumulative_sizes[split_pos].count <= cur_count)) { + split_pos++; + } + + auto const start_count = cur_count; + cur_count = cumulative_sizes[split_pos].count; + splits.emplace_back(range{start_count, cur_count}); + cur_pos = split_pos; + cur_cumulative_size = cumulative_sizes[split_pos].size_bytes; + + if constexpr (std::is_same_v) { + cur_cumulative_rows = cumulative_sizes[split_pos].num_rows; + } + } + + // If the last range has size smaller than `merge_threshold` the size of the second last one, + // merge it with the second last one. + // This is to prevent having the last range too small. + if (splits.size() > 1) { + double constexpr merge_threshold = 0.15; + if (auto const last = splits.back(), second_last = splits[splits.size() - 2]; + last.size() <= static_cast(merge_threshold * second_last.size())) { + splits.pop_back(); + splits.back().end = last.end; + } + } + + return splits; +} + +// Since `find_splits` is a template function, we need to explicitly instantiate it so it can be +// used outside of this TU. +template std::vector find_splits(host_span sizes, + std::size_t total_count, + std::size_t size_limit); +template std::vector find_splits( + host_span sizes, std::size_t total_count, std::size_t size_limit); + +// In this step, the metadata of all stripes in the data sources is parsed, and information about +// data streams of the selected columns in all stripes are generated. If the reader has a data +// read limit, sizes of these streams are used to split the list of all stripes into multiple +// subsets, each of which will be loaded into memory in the `load_next_stripe_data()` step. These +// subsets are computed such that memory usage will be kept to be around a fixed size limit. +void reader_impl::preprocess_file(read_mode mode) +{ + if (_file_itm_data.global_preprocessed) { return; } + _file_itm_data.global_preprocessed = true; + + // + // Load stripes' metadata: + // + std::tie( + _file_itm_data.rows_to_skip, _file_itm_data.rows_to_read, _file_itm_data.selected_stripes) = + _metadata.select_stripes( + _options.selected_stripes, _options.skip_rows, _options.num_read_rows, _stream); + if (!_file_itm_data.has_data()) { return; } + + CUDF_EXPECTS( + mode == read_mode::CHUNKED_READ || + _file_itm_data.rows_to_read <= static_cast(std::numeric_limits::max()), + "READ_ALL mode does not support reading number of rows more than cudf's column size limit. " + "For reading large number of rows, please use chunked_reader.", + std::overflow_error); + + auto const& selected_stripes = _file_itm_data.selected_stripes; + auto const num_total_stripes = selected_stripes.size(); + auto const num_levels = _selected_columns.num_levels(); + + // Set up table for converting timestamp columns from local to UTC time + _file_itm_data.tz_table = [&] { + auto const has_timestamp_column = std::any_of( + _selected_columns.levels.cbegin(), _selected_columns.levels.cend(), [&](auto const& col_lvl) { + return std::any_of(col_lvl.cbegin(), col_lvl.cend(), [&](auto const& col_meta) { + return _metadata.get_col_type(col_meta.id).kind == TypeKind::TIMESTAMP; + }); + }); + + return has_timestamp_column ? cudf::detail::make_timezone_transition_table( + {}, selected_stripes[0].stripe_footer->writerTimezone, _stream) + : std::make_unique(); + }(); + + // + // Pre allocate necessary memory for data processed in the other reading steps: + // + auto& stripe_data_read_ranges = _file_itm_data.stripe_data_read_ranges; + stripe_data_read_ranges.resize(num_total_stripes); + + auto& lvl_stripe_data = _file_itm_data.lvl_stripe_data; + auto& lvl_stripe_sizes = _file_itm_data.lvl_stripe_sizes; + auto& lvl_stream_info = _file_itm_data.lvl_stream_info; + auto& lvl_stripe_stream_ranges = _file_itm_data.lvl_stripe_stream_ranges; + auto& lvl_column_types = _file_itm_data.lvl_column_types; + auto& lvl_nested_cols = _file_itm_data.lvl_nested_cols; + + lvl_stripe_data.resize(num_levels); + lvl_stripe_sizes.resize(num_levels); + lvl_stream_info.resize(num_levels); + lvl_stripe_stream_ranges.resize(num_levels); + lvl_column_types.resize(num_levels); + lvl_nested_cols.resize(num_levels); + _out_buffers.resize(num_levels); + + auto& read_info = _file_itm_data.data_read_info; + auto& col_meta = *_col_meta; + + // + // Collect columns' types: + // + for (std::size_t level = 0; level < num_levels; ++level) { + lvl_stripe_sizes[level].resize(num_total_stripes); + lvl_stripe_stream_ranges[level].resize(num_total_stripes); + + // Association between each ORC column and its cudf::column + col_meta.orc_col_map.emplace_back(_metadata.get_num_cols(), -1); + + auto const& columns_level = _selected_columns.levels[level]; + size_type col_id{0}; + + for (auto const& col : columns_level) { + // Map each ORC column to its column + col_meta.orc_col_map[level][col.id] = col_id++; + + auto const col_type = + to_cudf_type(_metadata.get_col_type(col.id).kind, + _options.use_np_dtypes, + _options.timestamp_type.id(), + to_cudf_decimal_type(_options.decimal128_columns, _metadata, col.id)); + CUDF_EXPECTS(col_type != type_id::EMPTY, "Unknown type"); + + auto& column_types = lvl_column_types[level]; + auto& nested_cols = lvl_nested_cols[level]; + + if (col_type == type_id::DECIMAL32 or col_type == type_id::DECIMAL64 or + col_type == type_id::DECIMAL128) { + // sign of the scale is changed since cuDF follows c++ libraries like CNL + // which uses negative scaling, but liborc and other libraries + // follow positive scaling. + auto const scale = + -static_cast(_metadata.get_col_type(col.id).scale.value_or(0)); + column_types.emplace_back(col_type, scale); + } else { + column_types.emplace_back(col_type); + } + + // Map each ORC column to its column. + if (col_type == type_id::LIST or col_type == type_id::STRUCT) { + nested_cols.emplace_back(col); + } + } + + // Try to reserve some memory, but the final size is unknown, + // since each column may have more than one stream. + auto const num_columns = columns_level.size(); + lvl_stream_info[level].reserve(num_total_stripes * num_columns); + if (read_info.capacity() < num_total_stripes * num_columns) { + read_info.reserve(num_total_stripes * num_columns); + } + } + + // + // Collect all data streams' information: + // + + // Load all stripes if we are in READ_ALL mode or there is no read limit. + auto const load_all_stripes = + mode == read_mode::READ_ALL || _chunk_read_data.pass_read_limit == 0; + + // Accumulate data size for data streams in each stripe, used for chunking. + // This will be used only for CHUNKED_READ mode when there is a read limit. + // Otherwise, we do not need this since we just load all stripes. + cudf::detail::hostdevice_vector total_stripe_sizes( + load_all_stripes ? std::size_t{0} : num_total_stripes, _stream); + + for (std::size_t stripe_global_idx = 0; stripe_global_idx < num_total_stripes; + ++stripe_global_idx) { + auto const& stripe = selected_stripes[stripe_global_idx]; + auto const stripe_info = stripe.stripe_info; + auto const stripe_footer = stripe.stripe_footer; + + std::size_t this_stripe_size{0}; + auto const last_read_size = read_info.size(); + for (std::size_t level = 0; level < num_levels; ++level) { + auto& stream_info = _file_itm_data.lvl_stream_info[level]; + + auto stream_level_count = stream_info.size(); + auto const stripe_level_size = + gather_stream_info_and_column_desc(stripe_global_idx, + level, + stripe_info, + stripe_footer, + col_meta.orc_col_map[level], + _metadata.get_types(), + false, // use_index, + level == 0, + nullptr, // num_dictionary_entries + nullptr, // local_stream_order + &stream_info, + nullptr // chunks + ); + + auto const is_stripe_data_empty = stripe_level_size == 0; + CUDF_EXPECTS(not is_stripe_data_empty or stripe_info->indexLength == 0, + "Invalid index rowgroup stream data"); + + lvl_stripe_sizes[level][stripe_global_idx] = stripe_level_size; + this_stripe_size += stripe_level_size; + + // Range of the streams in `stream_info` corresponding to this stripe at the current level. + lvl_stripe_stream_ranges[level][stripe_global_idx] = + range{stream_level_count, stream_info.size()}; + + // Coalesce consecutive streams into one read. + while (not is_stripe_data_empty and stream_level_count < stream_info.size()) { + auto const d_dst = stream_info[stream_level_count].dst_pos; + auto const offset = stream_info[stream_level_count].offset; + auto len = stream_info[stream_level_count].length; + stream_level_count++; + + while (stream_level_count < stream_info.size() && + stream_info[stream_level_count].offset == offset + len) { + len += stream_info[stream_level_count].length; + stream_level_count++; + } + read_info.emplace_back(stream_data_read_info{offset, + d_dst, + len, + static_cast(stripe.source_idx), + stripe_global_idx, + level}); + } + } // end loop level + + if (!load_all_stripes) { total_stripe_sizes[stripe_global_idx] = {1, this_stripe_size}; } + + // Range of all stream reads in `read_info` corresponding to this stripe, in all levels. + stripe_data_read_ranges[stripe_global_idx] = range{last_read_size, read_info.size()}; + } + + // + // Split range of all stripes into subranges that can be loaded separately while maintaining + // the memory usage under the given pass limit: + // + + // Load range is reset to start from the first position in `load_stripe_ranges`. + _chunk_read_data.curr_load_stripe_range = 0; + + if (load_all_stripes) { + _chunk_read_data.load_stripe_ranges = {range{0UL, num_total_stripes}}; + return; + } + + // Compute the prefix sum of stripes' data sizes. + total_stripe_sizes.host_to_device_async(_stream); + thrust::inclusive_scan(rmm::exec_policy_nosync(_stream), + total_stripe_sizes.d_begin(), + total_stripe_sizes.d_end(), + total_stripe_sizes.d_begin(), + cumulative_size_plus{}); + total_stripe_sizes.device_to_host_sync(_stream); + + auto const load_limit = [&] { + auto const tmp = static_cast(_chunk_read_data.pass_read_limit * + chunk_read_data::load_limit_ratio); + // Make sure not to pass 0 byte limit (due to round-off) to `find_splits`. + return std::max(tmp, 1UL); + }(); + + _chunk_read_data.load_stripe_ranges = + find_splits(total_stripe_sizes, num_total_stripes, load_limit); +} + +// If there is a data read limit, only a subset of stripes are read at a time such that +// their total data size does not exceed a fixed size limit. Then, the data is probed to +// estimate its uncompressed sizes, which are in turn used to split that stripe subset into +// smaller subsets, each of which to be decompressed and decoded in the next step +// `decompress_and_decode_stripes()`. This is to ensure that loading data from data sources +// together with decompression and decoding will be capped around the given data read limit. +void reader_impl::load_next_stripe_data(read_mode mode) +{ + if (!_file_itm_data.has_data()) { return; } + + auto const load_stripe_range = + _chunk_read_data.load_stripe_ranges[_chunk_read_data.curr_load_stripe_range++]; + auto const stripe_start = load_stripe_range.begin; + auto const stripe_count = load_stripe_range.size(); + + auto& lvl_stripe_data = _file_itm_data.lvl_stripe_data; + auto const num_levels = _selected_columns.num_levels(); + + // Prepare the buffer to read raw data onto. + for (std::size_t level = 0; level < num_levels; ++level) { + auto& stripe_data = lvl_stripe_data[level]; + stripe_data.resize(stripe_count); + + for (std::size_t idx = 0; idx < stripe_count; ++idx) { + auto const stripe_size = _file_itm_data.lvl_stripe_sizes[level][idx + stripe_start]; + stripe_data[idx] = rmm::device_buffer( + cudf::util::round_up_safe(stripe_size, BUFFER_PADDING_MULTIPLE), _stream); + } + } + + // + // Load stripe data into memory: + // + + // If we load data from sources into host buffers, we need to transfer (async) data to device + // memory. Such host buffers need to be kept alive until we sync the transfers. + std::vector> host_read_buffers; + + // If we load data directly from sources into device memory, the loads are also async. + // Thus, we need to make sure to sync all them at the end. + std::vector, std::size_t>> device_read_tasks; + + // Range of the read info (offset, length) to read for the current being loaded stripes. + auto const [read_begin, read_end] = + merge_selected_ranges(_file_itm_data.stripe_data_read_ranges, load_stripe_range); + + for (auto read_idx = read_begin; read_idx < read_end; ++read_idx) { + auto const& read_info = _file_itm_data.data_read_info[read_idx]; + auto const source_ptr = _metadata.per_file_metadata[read_info.source_idx].source; + auto const dst_base = static_cast( + lvl_stripe_data[read_info.level][read_info.stripe_idx - stripe_start].data()); + + if (source_ptr->is_device_read_preferred(read_info.length)) { + device_read_tasks.push_back( + std::pair(source_ptr->device_read_async( + read_info.offset, read_info.length, dst_base + read_info.dst_pos, _stream), + read_info.length)); + + } else { + auto buffer = source_ptr->host_read(read_info.offset, read_info.length); + CUDF_EXPECTS(buffer->size() == read_info.length, "Unexpected discrepancy in bytes read."); + CUDF_CUDA_TRY(cudaMemcpyAsync(dst_base + read_info.dst_pos, + buffer->data(), + read_info.length, + cudaMemcpyDefault, + _stream.value())); + host_read_buffers.emplace_back(std::move(buffer)); + } + } + + if (host_read_buffers.size() > 0) { // if there was host read + _stream.synchronize(); + host_read_buffers.clear(); // its data was copied to device memory after stream sync + } + for (auto& task : device_read_tasks) { // if there was device read + CUDF_EXPECTS(task.first.get() == task.second, "Unexpected discrepancy in bytes read."); + } + + // Compute number of rows in the loading stripes. + auto const num_loading_rows = std::accumulate( + _file_itm_data.selected_stripes.begin() + stripe_start, + _file_itm_data.selected_stripes.begin() + stripe_start + stripe_count, + std::size_t{0}, + [](std::size_t count, const auto& stripe) { return count + stripe.stripe_info->numberOfRows; }); + + // Decoding range needs to be reset to start from the first position in `decode_stripe_ranges`. + _chunk_read_data.curr_decode_stripe_range = 0; + + // The cudf's column size limit. + auto constexpr column_size_limit = + static_cast(std::numeric_limits::max()); + + // Decode all loaded stripes if there is no read limit, or if we are in READ_ALL mode, + // and the number of loading rows is less than the column size limit. + // In theory, we should just decode 'enough' stripes for output one table chunk, instead of + // decoding all stripes like this, for better load-balancing and reduce memory usage. + // However, we do not have any good way to know how many stripes are 'enough'. + if ((mode == read_mode::READ_ALL || _chunk_read_data.pass_read_limit == 0) && + // In addition to read limit, we also need to check if the total number of + // rows in the loaded stripes exceeds the column size limit. + // If that is the case, we cannot decode all stripes at once into a cudf table. + num_loading_rows <= column_size_limit) { + _chunk_read_data.decode_stripe_ranges = {load_stripe_range}; + return; + } + + // From here, we have reading mode that is either: + // - CHUNKED_READ without read limit but the number of reading rows exceeds column size limit, or + // - CHUNKED_READ with a pass read limit. + // READ_ALL mode with number of rows more than cudf's column size limit should be handled early in + // `preprocess_file`. We just check again to make sure such situations never happen here. + CUDF_EXPECTS( + mode != read_mode::READ_ALL, + "READ_ALL mode does not support reading number of rows more than cudf's column size limit."); + + // This is the post-processing step after we've done with splitting `load_stripe_range` into + // `decode_stripe_ranges`. + auto const add_range_offset = [stripe_start](std::vector& new_ranges) { + // The split ranges always start from zero. + // We need to change these ranges to start from `stripe_start` which are the correct subranges + // of the current loaded stripe range. + for (auto& range : new_ranges) { + range.begin += stripe_start; + range.end += stripe_start; + } + }; + + // Optimized code path when we do not have any read limit but the number of rows in the + // loaded stripes exceeds column size limit. + // Note that the values `max_uncompressed_size` for each stripe are not computed here. + // Instead, they will be computed on the fly during decoding to avoid the overhead of + // storing and retrieving from memory. + if (_chunk_read_data.pass_read_limit == 0 && num_loading_rows > column_size_limit) { + std::vector cumulative_stripe_rows(stripe_count); + std::size_t rows{0}; + + for (std::size_t idx = 0; idx < stripe_count; ++idx) { + auto const& stripe = _file_itm_data.selected_stripes[idx + stripe_start]; + auto const stripe_info = stripe.stripe_info; + rows += stripe_info->numberOfRows; + + // We will split stripe ranges based only on stripes' number of rows, not data size. + // Thus, we override the cumulative `size_bytes` using the prefix sum of rows in stripes and + // will use the column size limit as the split size limit. + cumulative_stripe_rows[idx] = + cumulative_size_and_row{idx + 1UL /*count*/, rows /*size_bytes*/, rows}; + } + + _chunk_read_data.decode_stripe_ranges = + find_splits(cumulative_stripe_rows, stripe_count, column_size_limit); + add_range_offset(_chunk_read_data.decode_stripe_ranges); + return; + } + + // + // Split range of loaded stripes into subranges that can be decoded separately such that the + // memory usage is maintained around the given limit: + // + + // This is for estimating the decompressed sizes of the loaded stripes. + cudf::detail::hostdevice_vector stripe_decomp_sizes(stripe_count, + _stream); + + // Fill up the `cumulative_size_and_row` array with initial values. + // Note: `hostdevice_vector::begin()` mirrors `std::vector::data()` using incorrect API name. + for (std::size_t idx = 0; idx < stripe_count; ++idx) { + auto const& stripe = _file_itm_data.selected_stripes[idx + stripe_start]; + auto const stripe_info = stripe.stripe_info; + stripe_decomp_sizes[idx] = + cumulative_size_and_row{1UL /*count*/, 0UL /*size_bytes*/, stripe_info->numberOfRows}; + } + + auto& compinfo_map = _file_itm_data.compinfo_map; + compinfo_map.clear(); // clear cache of the last load + + // For parsing decompression data. + // We create an array that is large enough to use for all levels, thus only need to allocate + // memory once. + auto hd_compinfo = [&] { + std::size_t max_num_streams{0}; + if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { + // Find the maximum number of streams in all levels of the loaded stripes. + for (std::size_t level = 0; level < num_levels; ++level) { + auto const stream_range = + merge_selected_ranges(_file_itm_data.lvl_stripe_stream_ranges[level], load_stripe_range); + max_num_streams = std::max(max_num_streams, stream_range.size()); + } + } + return cudf::detail::hostdevice_vector(max_num_streams, _stream); + }(); + + for (std::size_t level = 0; level < num_levels; ++level) { + auto const& stream_info = _file_itm_data.lvl_stream_info[level]; + auto const num_columns = _selected_columns.levels[level].size(); + + auto& stripe_data = lvl_stripe_data[level]; + if (stripe_data.empty()) { continue; } + + // Range of all streams in the loaded stripes. + auto const stream_range = + merge_selected_ranges(_file_itm_data.lvl_stripe_stream_ranges[level], load_stripe_range); + + if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { + auto const& decompressor = *_metadata.per_file_metadata[0].decompressor; + + auto compinfo = cudf::detail::hostdevice_span( + hd_compinfo.begin(), hd_compinfo.d_begin(), stream_range.size()); + for (auto stream_idx = stream_range.begin; stream_idx < stream_range.end; ++stream_idx) { + auto const& info = stream_info[stream_idx]; + auto const dst_base = + static_cast(stripe_data[info.source.stripe_idx - stripe_start].data()); + compinfo[stream_idx - stream_range.begin] = + gpu::CompressedStreamInfo(dst_base + info.dst_pos, info.length); + } + + // Estimate the uncompressed data. + compinfo.host_to_device_async(_stream); + gpu::ParseCompressedStripeData(compinfo.device_ptr(), + compinfo.size(), + decompressor.GetBlockSize(), + decompressor.GetLog2MaxCompressionRatio(), + _stream); + compinfo.device_to_host_sync(_stream); + + for (auto stream_idx = stream_range.begin; stream_idx < stream_range.end; ++stream_idx) { + auto const& info = stream_info[stream_idx]; + auto const stream_compinfo = compinfo[stream_idx - stream_range.begin]; + + // Cache these parsed numbers so they can be reused in the decompression/decoding step. + compinfo_map[info.source] = {stream_compinfo.num_compressed_blocks, + stream_compinfo.num_uncompressed_blocks, + stream_compinfo.max_uncompressed_size}; + stripe_decomp_sizes[info.source.stripe_idx - stripe_start].size_bytes += + stream_compinfo.max_uncompressed_size; + } + + } else { // no decompression + // Set decompression sizes equal to the input sizes. + for (auto stream_idx = stream_range.begin; stream_idx < stream_range.end; ++stream_idx) { + auto const& info = stream_info[stream_idx]; + stripe_decomp_sizes[info.source.stripe_idx - stripe_start].size_bytes += info.length; + } + } + } // end loop level + + // Compute the prefix sum of stripe data sizes and rows. + stripe_decomp_sizes.host_to_device_async(_stream); + thrust::inclusive_scan(rmm::exec_policy_nosync(_stream), + stripe_decomp_sizes.d_begin(), + stripe_decomp_sizes.d_end(), + stripe_decomp_sizes.d_begin(), + cumulative_size_plus{}); + stripe_decomp_sizes.device_to_host_sync(_stream); + + auto const decode_limit = [&] { + auto const tmp = static_cast(_chunk_read_data.pass_read_limit * + chunk_read_data::decompress_and_decode_limit_ratio); + // Make sure not to pass 0 byte limit to `find_splits`. + return std::max(tmp, 1UL); + }(); + + _chunk_read_data.decode_stripe_ranges = + find_splits(stripe_decomp_sizes, stripe_count, decode_limit); + + add_range_offset(_chunk_read_data.decode_stripe_ranges); +} + +} // namespace cudf::io::orc::detail diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index 0ad0f9af589..4ef68ee8d86 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -24,18 +24,298 @@ #include #include +#include + namespace cudf::io::orc::detail { /** - * @brief Struct to store file-level data that remains constant for all chunks being read. + * @brief Struct representing a range of of data offsets. + */ +struct range { + std::size_t begin{0}; + std::size_t end{0}; + + [[nodiscard]] auto size() const { return end - begin; } +}; + +/** + * @brief Expand a range of ranges into a simple range of data. + * + * @param input_ranges The list of all data ranges + * @param selected_ranges A range of ranges from `input_ranges` + * @return The range of data span by the selected range of ranges + */ +inline range merge_selected_ranges(host_span input_ranges, + range const& selected_ranges) +{ + // The first and last range. + auto const& first_range = input_ranges[selected_ranges.begin]; + auto const& last_range = input_ranges[selected_ranges.end - 1]; + + // The range of data covered from the first to the last range. + return {first_range.begin, last_range.end}; +} + +// Store information to identify where to read a chunk of data from source. +// Each read corresponds to one or more consecutive streams combined. +struct stream_data_read_info { + uint64_t offset; // offset in data source + std::size_t dst_pos; // offset to store data in memory relative to start of raw stripe data + std::size_t length; // data length to read + std::size_t source_idx; // the data source id + std::size_t stripe_idx; // global stripe index + std::size_t level; // nested level +}; + +/** + * @brief Compression information for a stripe at a specific nested level. + */ +struct stripe_level_comp_info { + std::size_t num_compressed_blocks{0}; + std::size_t num_uncompressed_blocks{0}; + std::size_t total_decomp_size{0}; +}; + +/** + * @brief Struct that stores source information of an ORC streams. + */ +struct stream_source_info { + std::size_t stripe_idx; // global stripe id throughout all data sources + std::size_t level; // level of the nested column + uint32_t orc_col_idx; // orc column id + StreamKind kind; // stream kind + + struct hash { + std::size_t operator()(stream_source_info const& id) const + { + auto const col_kind = + static_cast(id.orc_col_idx) | (static_cast(id.kind) << 32); + auto const hasher = std::hash{}; + return hasher(id.stripe_idx) ^ hasher(id.level) ^ hasher(col_kind); + } + }; + struct equal_to { + bool operator()(stream_source_info const& lhs, stream_source_info const& rhs) const + { + return lhs.stripe_idx == rhs.stripe_idx && lhs.level == rhs.level && + lhs.orc_col_idx == rhs.orc_col_idx && lhs.kind == rhs.kind; + } + }; +}; + +/** + * @brief Map to lookup a value from stream source. + */ +template +using stream_source_map = + std::unordered_map; + +/** + * @brief Struct that stores information of an ORC stream. + */ +struct orc_stream_info { + // Data info: + uint64_t offset; // offset in data source + std::size_t dst_pos; // offset to store data in memory relative to start of raw stripe data + std::size_t length; // stream length to read + + // Store source of the stream in the stripe, so we can look up where this stream comes from. + stream_source_info source; +}; + +/** + * @brief Struct storing intermediate processing data loaded from data sources. */ struct file_intermediate_data { + int64_t rows_to_skip; + int64_t rows_to_read; + std::vector selected_stripes; + + // Check if there is data to read. + bool has_data() const { return rows_to_read > 0 && !selected_stripes.empty(); } + + // For each stripe, we perform a number of reads for its streams. + // Those reads are identified by a chunk of consecutive read info stored in `data_read_info`. + std::vector stripe_data_read_ranges; + + // Identify what data to read from source. + std::vector data_read_info; + + // Store the compression information for each data stream. + stream_source_map compinfo_map; + + // Store info for each ORC stream at each nested level. + std::vector> lvl_stream_info; + + // At each nested level, the streams for each stripe are stored consecutively in lvl_stream_info. + // This is used to identify the range of streams for each stripe from that vector. + std::vector> lvl_stripe_stream_ranges; + + // The buffers to store raw data read from disk, initialized for each reading stripe chunks. + // After decoding, such buffers can be released. + // This can only be implemented after chunked output is ready. std::vector> lvl_stripe_data; - std::vector>> null_count_prefix_sums; - int64_t rows_to_skip; - size_type rows_to_read; - std::vector selected_stripes; + // Store the size of each stripe at each nested level. + // This is used to initialize the stripe_data buffers. + std::vector> lvl_stripe_sizes; + + // List of column data types at each nested level. + std::vector> lvl_column_types; + + // List of nested type columns at each nested level. + std::vector> lvl_nested_cols; + + // Table for converting timestamp columns from local to UTC time. + std::unique_ptr tz_table; + + bool global_preprocessed{false}; +}; + +/** + * @brief Struct collecting data necessary for chunked reading. + */ +struct chunk_read_data { + explicit chunk_read_data(std::size_t output_size_limit_, + std::size_t data_read_limit_, + size_type output_row_granularity_) + : chunk_read_limit{output_size_limit_}, + pass_read_limit{data_read_limit_}, + output_row_granularity{output_row_granularity_} + { + CUDF_EXPECTS(output_row_granularity > 0, + "The value of `output_row_granularity` must be positive."); + } + + std::size_t const + chunk_read_limit; // maximum size (in bytes) of an output chunk, or 0 for no limit + std::size_t const pass_read_limit; // approximate maximum size (in bytes) used for store + // intermediate data, or 0 for no limit + size_type const output_row_granularity; + + // Memory limits for loading data and decoding are computed as + // `*_limit_ratio * pass_read_limit`. + // This is to maintain the total memory usage to be **around** the given `pass_read_limit`. + // Note that sum of these limits may not be `1.0`, and their values are set empirically. + static double constexpr load_limit_ratio{0.25}; + static double constexpr decompress_and_decode_limit_ratio{0.6}; + + // Chunks of stripes that can be loaded into memory such that their data size is within the user + // specified limit. + std::vector load_stripe_ranges; + std::size_t curr_load_stripe_range{0}; + bool more_stripes_to_load() const { return curr_load_stripe_range < load_stripe_ranges.size(); } + + // Chunks of stripes such that their decompression size is within the user specified size limit. + std::vector decode_stripe_ranges; + std::size_t curr_decode_stripe_range{0}; + bool more_stripes_to_decode() const + { + return curr_decode_stripe_range < decode_stripe_ranges.size(); + } + + // Chunk of rows in the internal decoded table to output for each `read_chunk()`. + std::vector output_table_ranges; + std::size_t curr_output_table_range{0}; + std::unique_ptr decoded_table; + bool more_table_chunks_to_output() const + { + return curr_output_table_range < output_table_ranges.size(); + } + + bool has_next() const + { + // Only has more chunk to output if: + return more_stripes_to_load() || more_stripes_to_decode() || more_table_chunks_to_output(); + } +}; + +/** + * @brief Struct to accumulate counts and sizes of some types such as stripes or rows. + */ +struct cumulative_size { + std::size_t count{0}; + std::size_t size_bytes{0}; }; +/** + * @brief Struct to accumulate counts, sizes, and number of rows of some types such as stripes or + * rows in tables. + */ +struct cumulative_size_and_row : public cumulative_size { + std::size_t num_rows{0}; +}; + +/** + * @brief Functor to sum up cumulative data. + */ +struct cumulative_size_plus { + __device__ cumulative_size operator()(cumulative_size const& a, cumulative_size const& b) const + { + return cumulative_size{a.count + b.count, a.size_bytes + b.size_bytes}; + } + + __device__ cumulative_size_and_row operator()(cumulative_size_and_row const& a, + cumulative_size_and_row const& b) const + { + return cumulative_size_and_row{ + a.count + b.count, a.size_bytes + b.size_bytes, a.num_rows + b.num_rows}; + } +}; + +/** + * @brief Find the splits of the input data such that each split range has cumulative size less than + * a given `size_limit`. + * + * Note that the given limit is just a soft limit. The function will always output ranges that + * have at least one count, even such ranges have sizes exceed the value of `size_limit`. + * + * @param cumulative_sizes The input cumulative sizes to compute split ranges + * @param total_count The total count in the entire input + * @param size_limit The given soft limit to compute splits; must be positive + * @return A vector of ranges as splits of the input + */ +template +std::vector find_splits(host_span cumulative_sizes, + std::size_t total_count, + std::size_t size_limit); + +/** + * @brief Function that populates descriptors for either individual streams or chunks of column + * data, but not both. + * + * This function is firstly used in the global step, to gather information for streams of all + * stripes in the data sources (when `stream_info` is present). Later on, it is used again to + * populate column descriptors (`chunks` is present) during decompression and decoding. The two + * steps share most of the execution path thus this function takes mutually exclusive parameters + * `stream_info` or `chunks` depending on each use case. + * + * @param stripe_id The index of the current stripe, can be global index or local decoding index + * @param level The current processing nested level + * @param stripeinfo The pointer to current stripe's information + * @param stripefooter The pointer to current stripe's footer + * @param orc2gdf The mapping from ORC column ids to gdf column ids + * @param types The schema type + * @param use_index Whether to use the row index for parsing + * @param apply_struct_map Indicating if this is the root level + * @param num_dictionary_entries The number of dictionary entries + * @param local_stream_order For retrieving 0-based orders of streams in the decoding step + * @param stream_info The vector of streams' information + * @param chunks The vector of column descriptors + * @return The number of bytes in the gathered streams + */ +std::size_t gather_stream_info_and_column_desc( + std::size_t stripe_id, + std::size_t level, + orc::StripeInformation const* stripeinfo, + orc::StripeFooter const* stripefooter, + host_span orc2gdf, + host_span types, + bool use_index, + bool apply_struct_map, + int64_t* num_dictionary_entries, + std::size_t* local_stream_order, + std::vector* stream_info, + cudf::detail::hostdevice_2dvector* chunks); + } // namespace cudf::io::orc::detail diff --git a/cpp/src/io/orc/reader_impl_preprocess.cu b/cpp/src/io/orc/reader_impl_decode.cu similarity index 56% rename from cpp/src/io/orc/reader_impl_preprocess.cu rename to cpp/src/io/orc/reader_impl_decode.cu index 04cb223c696..ec936b85761 100644 --- a/cpp/src/io/orc/reader_impl_preprocess.cu +++ b/cpp/src/io/orc/reader_impl_decode.cu @@ -16,17 +16,17 @@ #include "io/comp/gpuinflate.hpp" #include "io/comp/nvcomp_adapter.hpp" +#include "io/orc/reader_impl.hpp" +#include "io/orc/reader_impl_chunking.hpp" +#include "io/orc/reader_impl_helpers.hpp" #include "io/utilities/config_utils.hpp" -#include "reader_impl.hpp" -#include "reader_impl_chunking.hpp" -#include "reader_impl_helpers.hpp" +#include "io/utilities/hostdevice_span.hpp" -#include +#include +#include #include -#include #include #include -#include #include #include @@ -45,175 +45,104 @@ #include #include -#include +#include namespace cudf::io::orc::detail { namespace { /** - * @brief Struct that maps ORC streams to columns - */ -struct orc_stream_info { - explicit orc_stream_info(uint64_t offset_, - std::size_t dst_pos_, - uint32_t length_, - uint32_t stripe_idx_) - : offset(offset_), dst_pos(dst_pos_), length(length_), stripe_idx(stripe_idx_) - { - } - uint64_t offset; // offset in file - std::size_t dst_pos; // offset in memory relative to start of compressed stripe data - std::size_t length; // length in file - uint32_t stripe_idx; // stripe index -}; - -/** - * @brief Function that populates column descriptors stream/chunk - */ -std::size_t gather_stream_info(std::size_t stripe_index, - orc::StripeInformation const* stripeinfo, - orc::StripeFooter const* stripefooter, - host_span orc2gdf, - host_span types, - bool use_index, - bool apply_struct_map, - int64_t* num_dictionary_entries, - std::vector& stream_info, - cudf::detail::hostdevice_2dvector& chunks) -{ - uint64_t src_offset = 0; - uint64_t dst_offset = 0; - - auto const get_stream_index_type = [](orc::StreamKind kind) { - switch (kind) { - case orc::DATA: return gpu::CI_DATA; - case orc::LENGTH: - case orc::SECONDARY: return gpu::CI_DATA2; - case orc::DICTIONARY_DATA: return gpu::CI_DICTIONARY; - case orc::PRESENT: return gpu::CI_PRESENT; - case orc::ROW_INDEX: return gpu::CI_INDEX; - default: - // Skip this stream as it's not strictly required - return gpu::CI_NUM_STREAMS; - } - }; - - for (auto const& stream : stripefooter->streams) { - if (!stream.column_id || *stream.column_id >= orc2gdf.size()) { - // Ignore reading this stream from source. - cudf::logger().warn("Unexpected stream in the input ORC source. The stream will be ignored."); - src_offset += stream.length; - continue; - } - - auto const column_id = *stream.column_id; - auto col = orc2gdf[column_id]; - - if (col == -1 and apply_struct_map) { - // A struct-type column has no data itself, but rather child columns - // for each of its fields. There is only a PRESENT stream, which - // needs to be included for the reader. - auto const schema_type = types[column_id]; - if (not schema_type.subtypes.empty()) { - if (schema_type.kind == orc::STRUCT && stream.kind == orc::PRESENT) { - for (auto const& idx : schema_type.subtypes) { - auto child_idx = (idx < orc2gdf.size()) ? orc2gdf[idx] : -1; - if (child_idx >= 0) { - col = child_idx; - auto& chunk = chunks[stripe_index][col]; - chunk.strm_id[gpu::CI_PRESENT] = stream_info.size(); - chunk.strm_len[gpu::CI_PRESENT] = stream.length; - } - } - } - } - } else if (col != -1) { - if (src_offset >= stripeinfo->indexLength || use_index) { - auto& chunk = chunks[stripe_index][col]; - auto const index_type = get_stream_index_type(stream.kind); - if (index_type < gpu::CI_NUM_STREAMS) { - chunk.strm_id[index_type] = stream_info.size(); - chunk.strm_len[index_type] = stream.length; - // NOTE: skip_count field is temporarily used to track the presence of index streams - chunk.skip_count |= 1 << index_type; - - if (index_type == gpu::CI_DICTIONARY) { - chunk.dictionary_start = *num_dictionary_entries; - chunk.dict_len = stripefooter->columns[column_id].dictionarySize; - *num_dictionary_entries += stripefooter->columns[column_id].dictionarySize; - } - } - } - stream_info.emplace_back( - stripeinfo->offset + src_offset, dst_offset, stream.length, stripe_index); - dst_offset += stream.length; - } - src_offset += stream.length; - } - - return dst_offset; -} - -/** - * @brief Decompresses the stripe data, at stream granularity. + * @brief Decompresses the stripe data, at stream granularity. + * + * Only the streams in the provided `stream_range` are decoded. That range is determined in + * the previous steps, after splitting stripes into ranges to maintain memory usage to be + * under data read limit. * + * @param loaded_stripe_range Range of stripes that are already loaded in memory + * @param stream_range Range of streams to be decoded + * @param num_decode_stripes Number of stripes that the decoding streams belong to + * @param compinfo_map A map to lookup compression info of streams * @param decompressor Block decompressor * @param stripe_data List of source stripe column data * @param stream_info List of stream to column mappings * @param chunks Vector of list of column chunk descriptors * @param row_groups Vector of list of row index descriptors - * @param num_stripes Number of stripes making up column chunks * @param row_index_stride Distance between each row index * @param use_base_stride Whether to use base stride obtained from meta or use the computed value * @param stream CUDA stream used for device memory operations and kernel launches - * @return Device buffer to decompressed page data + * @return Device buffer to decompressed data */ rmm::device_buffer decompress_stripe_data( + range const& loaded_stripe_range, + range const& stream_range, + std::size_t num_decode_stripes, + cudf::detail::hostdevice_span compinfo, + stream_source_map const& compinfo_map, OrcDecompressor const& decompressor, host_span stripe_data, - host_span stream_info, + host_span stream_info, cudf::detail::hostdevice_2dvector& chunks, cudf::detail::hostdevice_2dvector& row_groups, - size_type num_stripes, size_type row_index_stride, bool use_base_stride, rmm::cuda_stream_view stream) { - // Parse the columns' compressed info - cudf::detail::hostdevice_vector compinfo( - 0, stream_info.size(), stream); - for (auto const& info : stream_info) { - compinfo.push_back(gpu::CompressedStreamInfo( - static_cast(stripe_data[info.stripe_idx].data()) + info.dst_pos, - info.length)); - } - compinfo.host_to_device_async(stream); - - gpu::ParseCompressedStripeData(compinfo.device_ptr(), - compinfo.size(), - decompressor.GetBlockSize(), - decompressor.GetLog2MaxCompressionRatio(), - stream); - compinfo.device_to_host_sync(stream); + // Whether we have the comppression info precomputed. + auto const compinfo_ready = not compinfo_map.empty(); // Count the exact number of compressed blocks std::size_t num_compressed_blocks = 0; std::size_t num_uncompressed_blocks = 0; std::size_t total_decomp_size = 0; - for (std::size_t i = 0; i < compinfo.size(); ++i) { - num_compressed_blocks += compinfo[i].num_compressed_blocks; - num_uncompressed_blocks += compinfo[i].num_uncompressed_blocks; - total_decomp_size += compinfo[i].max_uncompressed_size; + + for (auto stream_idx = stream_range.begin; stream_idx < stream_range.end; ++stream_idx) { + auto const& info = stream_info[stream_idx]; + + auto& stream_comp_info = compinfo[stream_idx - stream_range.begin]; + stream_comp_info = gpu::CompressedStreamInfo( + static_cast( + stripe_data[info.source.stripe_idx - loaded_stripe_range.begin].data()) + + info.dst_pos, + info.length); + + if (compinfo_ready) { + auto const& cached_comp_info = compinfo_map.at(info.source); + stream_comp_info.num_compressed_blocks = cached_comp_info.num_compressed_blocks; + stream_comp_info.num_uncompressed_blocks = cached_comp_info.num_uncompressed_blocks; + stream_comp_info.max_uncompressed_size = cached_comp_info.total_decomp_size; + + num_compressed_blocks += cached_comp_info.num_compressed_blocks; + num_uncompressed_blocks += cached_comp_info.num_uncompressed_blocks; + total_decomp_size += cached_comp_info.total_decomp_size; + } } + + if (!compinfo_ready) { + compinfo.host_to_device_async(stream); + gpu::ParseCompressedStripeData(compinfo.device_ptr(), + compinfo.size(), + decompressor.GetBlockSize(), + decompressor.GetLog2MaxCompressionRatio(), + stream); + compinfo.device_to_host_sync(stream); + + for (std::size_t i = 0; i < compinfo.size(); ++i) { + num_compressed_blocks += compinfo[i].num_compressed_blocks; + num_uncompressed_blocks += compinfo[i].num_uncompressed_blocks; + total_decomp_size += compinfo[i].max_uncompressed_size; + } + } + CUDF_EXPECTS( not((num_uncompressed_blocks + num_compressed_blocks > 0) and (total_decomp_size == 0)), "Inconsistent info on compression blocks"); - // Buffer needs to be padded. - // Required by `gpuDecodeOrcColumnData`. + // Buffer needs to be padded.This is required by `gpuDecodeOrcColumnData`. rmm::device_buffer decomp_data( cudf::util::round_up_safe(total_decomp_size, BUFFER_PADDING_MULTIPLE), stream); + + // If total_decomp_size is zero, the input data may be just empty. + // This is still a valid input, thus do not be panick. if (decomp_data.is_empty()) { return decomp_data; } rmm::device_uvector> inflate_in( @@ -221,7 +150,7 @@ rmm::device_buffer decompress_stripe_data( rmm::device_uvector> inflate_out( num_compressed_blocks + num_uncompressed_blocks, stream); rmm::device_uvector inflate_res(num_compressed_blocks, stream); - thrust::fill(rmm::exec_policy(stream), + thrust::fill(rmm::exec_policy_nosync(stream), inflate_res.begin(), inflate_res.end(), compression_result{0, compression_status::FAILURE}); @@ -240,13 +169,13 @@ rmm::device_buffer decompress_stripe_data( compinfo[i].copy_in_ctl = inflate_in.data() + start_pos_uncomp; compinfo[i].copy_out_ctl = inflate_out.data() + start_pos_uncomp; - stream_info[i].dst_pos = decomp_offset; decomp_offset += compinfo[i].max_uncompressed_size; start_pos += compinfo[i].num_compressed_blocks; start_pos_uncomp += compinfo[i].num_uncompressed_blocks; max_uncomp_block_size = std::max(max_uncomp_block_size, compinfo[i].max_uncompressed_block_size); } + compinfo.host_to_device_async(stream); gpu::ParseCompressedStripeData(compinfo.device_ptr(), compinfo.size(), @@ -325,7 +254,7 @@ rmm::device_buffer decompress_stripe_data( // Check if any block has been failed to decompress. // Not using `thrust::any` or `thrust::count_if` to defer stream sync. thrust::for_each( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(std::size_t{0}), thrust::make_counting_iterator(inflate_res.size()), [results = inflate_res.begin(), @@ -351,15 +280,15 @@ rmm::device_buffer decompress_stripe_data( // We can check on host after stream synchronize CUDF_EXPECTS(not any_block_failure[0], "Error during decompression"); - size_type const num_columns = chunks.size().second; + auto const num_columns = chunks.size().second; // Update the stream information with the updated uncompressed info // TBD: We could update the value from the information we already // have in stream_info[], but using the gpu results also updates // max_uncompressed_size to the actual uncompressed size, or zero if // decompression failed. - for (size_type i = 0; i < num_stripes; ++i) { - for (size_type j = 0; j < num_columns; ++j) { + for (std::size_t i = 0; i < num_decode_stripes; ++i) { + for (std::size_t j = 0; j < num_columns; ++j) { auto& chunk = chunks[i][j]; for (int k = 0; k < gpu::CI_NUM_STREAMS; ++k) { if (chunk.strm_len[k] > 0 && chunk.strm_id[k] < compinfo.size()) { @@ -377,7 +306,7 @@ rmm::device_buffer decompress_stripe_data( compinfo.device_ptr(), chunks.base_device_ptr(), num_columns, - num_stripes, + num_decode_stripes, row_index_stride, use_base_stride, stream); @@ -424,7 +353,7 @@ void update_null_mask(cudf::detail::hostdevice_2dvector& chunks if (child_valid_map_base != nullptr) { rmm::device_uvector dst_idx(child_mask_len, stream); // Copy indexes at which the parent has valid value. - thrust::copy_if(rmm::exec_policy(stream), + thrust::copy_if(rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(0) + parent_mask_len, dst_idx.begin(), @@ -438,7 +367,7 @@ void update_null_mask(cudf::detail::hostdevice_2dvector& chunks uint32_t* dst_idx_ptr = dst_idx.data(); // Copy child valid bits from child column to valid indexes, this will merge both child // and parent null masks - thrust::for_each(rmm::exec_policy(stream), + thrust::for_each(rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(0) + dst_idx.size(), [child_valid_map_base, dst_idx_ptr, merged_mask] __device__(auto idx) { @@ -484,11 +413,11 @@ void update_null_mask(cudf::detail::hostdevice_2dvector& chunks * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation */ -void decode_stream_data(std::size_t num_dicts, +void decode_stream_data(int64_t num_dicts, int64_t skip_rows, size_type row_index_stride, std::size_t level, - table_view const& tz_table, + table_device_view const& d_tz_table, cudf::detail::hostdevice_2dvector& chunks, cudf::detail::device_2dspan row_groups, std::vector& out_buffers, @@ -497,6 +426,7 @@ void decode_stream_data(std::size_t num_dicts, { auto const num_stripes = chunks.size().first; auto const num_columns = chunks.size().second; + thrust::counting_iterator col_idx_it(0); thrust::counting_iterator stripe_idx_it(0); @@ -512,7 +442,7 @@ void decode_stream_data(std::size_t num_dicts, // Allocate global dictionary for deserializing rmm::device_uvector global_dict(num_dicts, stream); - chunks.host_to_device_sync(stream); + chunks.host_to_device_async(stream); gpu::DecodeNullsAndStringDictionaries( chunks.base_device_ptr(), global_dict.data(), num_columns, num_stripes, skip_rows, stream); @@ -521,16 +451,14 @@ void decode_stream_data(std::size_t num_dicts, update_null_mask(chunks, out_buffers, stream, mr); } - auto const tz_table_dptr = table_device_view::create(tz_table, stream); rmm::device_scalar error_count(0, stream); - // Update the null map for child columns gpu::DecodeOrcColumnData(chunks.base_device_ptr(), global_dict.data(), row_groups, num_columns, num_stripes, skip_rows, - *tz_table_dptr, + d_tz_table, row_groups.size().first, row_index_stride, level, @@ -557,40 +485,38 @@ void decode_stream_data(std::size_t num_dicts, * layer. */ void scan_null_counts(cudf::detail::hostdevice_2dvector const& chunks, - cudf::host_span> prefix_sums, + uint32_t* d_prefix_sums, rmm::cuda_stream_view stream) { auto const num_stripes = chunks.size().first; if (num_stripes == 0) return; auto const num_columns = chunks.size().second; - std::vector>> prefix_sums_to_update; + std::vector> prefix_sums_to_update; for (auto col_idx = 0ul; col_idx < num_columns; ++col_idx) { // Null counts sums are only needed for children of struct columns if (chunks[0][col_idx].type_kind == STRUCT) { - prefix_sums_to_update.emplace_back(col_idx, prefix_sums[col_idx]); + prefix_sums_to_update.emplace_back(col_idx, d_prefix_sums + num_stripes * col_idx); } } auto const d_prefix_sums_to_update = cudf::detail::make_device_uvector_async( prefix_sums_to_update, stream, rmm::mr::get_current_device_resource()); - thrust::for_each(rmm::exec_policy(stream), - d_prefix_sums_to_update.begin(), - d_prefix_sums_to_update.end(), - [chunks = cudf::detail::device_2dspan{chunks}] __device__( - auto const& idx_psums) { - auto const col_idx = idx_psums.first; - auto const psums = idx_psums.second; - - thrust::transform( - thrust::seq, - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(0) + psums.size(), - psums.begin(), - [&](auto stripe_idx) { return chunks[stripe_idx][col_idx].null_count; }); - - thrust::inclusive_scan(thrust::seq, psums.begin(), psums.end(), psums.begin()); - }); + thrust::for_each( + rmm::exec_policy_nosync(stream), + d_prefix_sums_to_update.begin(), + d_prefix_sums_to_update.end(), + [num_stripes, chunks = cudf::detail::device_2dspan{chunks}] __device__( + auto const& idx_psums) { + auto const col_idx = idx_psums.first; + auto const psums = idx_psums.second; + thrust::transform(thrust::seq, + thrust::make_counting_iterator(0ul), + thrust::make_counting_iterator(num_stripes), + psums, + [&](auto stripe_idx) { return chunks[stripe_idx][col_idx].null_count; }); + thrust::inclusive_scan(thrust::seq, psums, psums + num_stripes, psums); + }); // `prefix_sums_to_update` goes out of scope, copy has to be done before we return stream.synchronize(); } @@ -634,6 +560,7 @@ void aggregate_child_meta(std::size_t level, // For each parent column, update its child column meta for each stripe. std::for_each(nested_cols.begin(), nested_cols.end(), [&](auto const p_col) { auto const parent_col_idx = col_meta.orc_col_map[level][p_col.id]; + int64_t start_row = 0; auto processed_row_groups = 0; @@ -657,10 +584,19 @@ void aggregate_child_meta(std::size_t level, // Aggregate start row, number of rows per chunk and total number of rows in a column auto const child_rows = chunks[stripe_id][parent_col_idx].num_child_rows; + for (size_type id = 0; id < p_col.num_children; id++) { auto const child_col_idx = index + id; num_child_rows[child_col_idx] += child_rows; + + // The number of rows in child column should not be very large otherwise we will have + // size overflow. + // If that is the case, we need to set a read limit to reduce number of decoding stripes. + CUDF_EXPECTS(num_child_rows[child_col_idx] <= + static_cast(std::numeric_limits::max()), + "Number of rows in the child column exceeds column size limit."); + num_child_rows_per_stripe[stripe_id][child_col_idx] = child_rows; // start row could be different for each column when there is nesting at each stripe level child_start_row[stripe_id][child_col_idx] = (stripe_id == 0) ? 0 : start_row; @@ -709,264 +645,291 @@ void generate_offsets_for_list(host_span buff_data, rmm::cuda_ } } +/** + * @brief Find the splits of the input table such that each split range of rows has data size less + * than a given `size_limit`. + * + * The parameter `segment_length` is to control the granularity of splits. The output ranges will + * always have numbers of rows that are multiple of this value, except the last range that contains + * the remaining rows. + * + * Similar to `find_splits`, the given limit is just a soft limit. This function will never output + * empty ranges, even they have sizes exceed the value of `size_limit`. + * + * @param input The input table to find splits + * @param segment_length Value to control granularity of the output ranges + * @param size_limit A limit on the output size of each split range + * @param stream CUDA stream used for device memory operations and kernel launches + * @return A vector of ranges as splits of the input + */ +std::vector find_table_splits(table_view const& input, + size_type segment_length, + std::size_t size_limit, + rmm::cuda_stream_view stream) +{ + if (size_limit == 0) { + return std::vector{range{0, static_cast(input.num_rows())}}; + } + + CUDF_EXPECTS(segment_length > 0, "Invalid segment_length", std::invalid_argument); + + // `segmented_row_bit_count` requires that `segment_length` is not larger than number of rows. + segment_length = std::min(segment_length, input.num_rows()); + + auto const d_segmented_sizes = cudf::detail::segmented_row_bit_count( + input, segment_length, stream, rmm::mr::get_current_device_resource()); + + auto segmented_sizes = + cudf::detail::hostdevice_vector(d_segmented_sizes->size(), stream); + + thrust::transform( + rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(d_segmented_sizes->size()), + segmented_sizes.d_begin(), + [segment_length, + num_rows = input.num_rows(), + d_sizes = d_segmented_sizes->view().begin()] __device__(auto const segment_idx) { + // Since the number of rows may not divisible by segment_length, + // the last segment may be shorter than the others. + auto const current_length = + cuda::std::min(segment_length, num_rows - segment_length * segment_idx); + auto const size = d_sizes[segment_idx] / CHAR_BIT; // divide by CHAR_BIT to get size in bytes + return cumulative_size{static_cast(current_length), + static_cast(size)}; + }); + + thrust::inclusive_scan(rmm::exec_policy_nosync(stream), + segmented_sizes.d_begin(), + segmented_sizes.d_end(), + segmented_sizes.d_begin(), + cumulative_size_plus{}); + segmented_sizes.device_to_host_sync(stream); + + return find_splits(segmented_sizes, input.num_rows(), size_limit); +} + } // namespace -void reader::impl::prepare_data(int64_t skip_rows, - std::optional const& num_rows_opt, - std::vector> const& stripes) +void reader_impl::decompress_and_decode_stripes(read_mode mode) { - // Selected columns at different levels of nesting are stored in different elements - // of `selected_columns`; thus, size == 1 means no nested columns - CUDF_EXPECTS(skip_rows == 0 or _selected_columns.num_levels() == 1, - "skip_rows is not supported by nested columns"); - - // There are no columns in the table - if (_selected_columns.num_levels() == 0) { return; } - - _file_itm_data = std::make_unique(); - - // Select only stripes required (aka row groups) - std::tie( - _file_itm_data->rows_to_skip, _file_itm_data->rows_to_read, _file_itm_data->selected_stripes) = - _metadata.select_stripes(stripes, skip_rows, num_rows_opt, _stream); - auto const rows_to_skip = _file_itm_data->rows_to_skip; - auto const rows_to_read = _file_itm_data->rows_to_read; - auto const& selected_stripes = _file_itm_data->selected_stripes; - - // If no rows or stripes to read, return empty columns - if (rows_to_read == 0 || selected_stripes.empty()) { return; } - - // Set up table for converting timestamp columns from local to UTC time - auto const tz_table = [&, &selected_stripes = selected_stripes] { - auto const has_timestamp_column = std::any_of( - _selected_columns.levels.cbegin(), _selected_columns.levels.cend(), [&](auto const& col_lvl) { - return std::any_of(col_lvl.cbegin(), col_lvl.cend(), [&](auto const& col_meta) { - return _metadata.get_col_type(col_meta.id).kind == TypeKind::TIMESTAMP; - }); - }); + if (!_file_itm_data.has_data()) { return; } + + CUDF_EXPECTS(_chunk_read_data.curr_load_stripe_range > 0, "There is not any stripe loaded."); + + auto const stripe_range = + _chunk_read_data.decode_stripe_ranges[_chunk_read_data.curr_decode_stripe_range++]; + auto const stripe_start = stripe_range.begin; + auto const stripe_end = stripe_range.end; + auto const stripe_count = stripe_range.size(); + + // The start index of loaded stripes. They are different from decoding stripes. + auto const load_stripe_range = + _chunk_read_data.load_stripe_ranges[_chunk_read_data.curr_load_stripe_range - 1]; + auto const load_stripe_start = load_stripe_range.begin; + + auto const rows_to_skip = _file_itm_data.rows_to_skip; + auto const& selected_stripes = _file_itm_data.selected_stripes; + + // Number of rows to decode in this decompressing/decoding step. + int64_t rows_to_decode = 0; + for (auto stripe_idx = stripe_start; stripe_idx < stripe_end; ++stripe_idx) { + auto const& stripe = selected_stripes[stripe_idx]; + auto const stripe_rows = static_cast(stripe.stripe_info->numberOfRows); + rows_to_decode += stripe_rows; + } - return has_timestamp_column - ? cudf::detail::make_timezone_transition_table( - {}, selected_stripes[0].stripe_info[0].second->writerTimezone, _stream) - : std::make_unique(); + CUDF_EXPECTS(rows_to_decode > rows_to_skip, "Invalid rows_to_decode computation."); + rows_to_decode = std::min(rows_to_decode - rows_to_skip, _file_itm_data.rows_to_read); + + // After this step, we no longer have any rows to skip. + // The number of rows remains to read in the future also reduced. + _file_itm_data.rows_to_skip = 0; + _file_itm_data.rows_to_read -= rows_to_decode; + + // Technically, overflow here should never happen because the `load_next_stripe_data()` step + // already handled it by splitting the loaded stripe range into multiple decode ranges. + CUDF_EXPECTS(rows_to_decode <= static_cast(std::numeric_limits::max()), + "Number or rows to decode exceeds the column size limit.", + std::overflow_error); + + auto const tz_table_dptr = table_device_view::create(_file_itm_data.tz_table->view(), _stream); + auto const num_levels = _selected_columns.num_levels(); + _out_buffers.resize(num_levels); + + // Column descriptors ('chunks'). + // Each 'chunk' of data here corresponds to an orc column, in a stripe, at a nested level. + // Unfortunately we cannot create one hostdevice_vector to use for all levels because + // currently we do not have a hostdevice_2dspan class. + std::vector> lvl_chunks(num_levels); + + // For computing null count. + auto null_count_prefix_sums = [&] { + auto const num_total_cols = std::accumulate( + _selected_columns.levels.begin(), + _selected_columns.levels.end(), + std::size_t{0}, + [](auto const& sum, auto const& cols_level) { return sum + cols_level.size(); }); + + return cudf::detail::make_zeroed_device_uvector_async( + num_total_cols * stripe_count, _stream, rmm::mr::get_current_device_resource()); + }(); + std::size_t num_processed_lvl_columns = 0; + std::size_t num_processed_prev_lvl_columns = 0; + + // For parsing decompression data. + // We create one hostdevice_vector that is large enough to use for all levels, + // thus only need to allocate memory once. + auto hd_compinfo = [&] { + std::size_t max_num_streams{0}; + if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { + // Find the maximum number of streams in all levels of the decoding stripes. + for (std::size_t level = 0; level < num_levels; ++level) { + auto const stream_range = + merge_selected_ranges(_file_itm_data.lvl_stripe_stream_ranges[level], stripe_range); + max_num_streams = std::max(max_num_streams, stream_range.size()); + } + } + return cudf::detail::hostdevice_vector{max_num_streams, _stream}; }(); - auto& lvl_stripe_data = _file_itm_data->lvl_stripe_data; - auto& null_count_prefix_sums = _file_itm_data->null_count_prefix_sums; - lvl_stripe_data.resize(_selected_columns.num_levels()); - - _out_buffers.resize(_selected_columns.num_levels()); - - // Iterates through levels of nested columns, child column will be one level down - // compared to parent column. auto& col_meta = *_col_meta; for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { - auto& columns_level = _selected_columns.levels[level]; - // Association between each ORC column and its cudf::column - col_meta.orc_col_map.emplace_back(_metadata.get_num_cols(), -1); - std::vector nested_cols; - - // Get a list of column data types - std::vector column_types; - for (auto& col : columns_level) { - auto col_type = to_cudf_type(_metadata.get_col_type(col.id).kind, - _use_np_dtypes, - _timestamp_type.id(), - to_cudf_decimal_type(_decimal128_columns, _metadata, col.id)); - CUDF_EXPECTS(col_type != type_id::EMPTY, "Unknown type"); - if (col_type == type_id::DECIMAL32 or col_type == type_id::DECIMAL64 or - col_type == type_id::DECIMAL128) { - // sign of the scale is changed since cuDF follows c++ libraries like CNL - // which uses negative scaling, but liborc and other libraries - // follow positive scaling. - auto const scale = - -static_cast(_metadata.get_col_type(col.id).scale.value_or(0)); - column_types.emplace_back(col_type, scale); - } else { - column_types.emplace_back(col_type); - } + auto const& stripe_stream_ranges = _file_itm_data.lvl_stripe_stream_ranges[level]; + auto const stream_range = merge_selected_ranges(stripe_stream_ranges, stripe_range); - // Map each ORC column to its column - col_meta.orc_col_map[level][col.id] = column_types.size() - 1; - if (col_type == type_id::LIST or col_type == type_id::STRUCT) { - nested_cols.emplace_back(col); - } - } + auto const& columns_level = _selected_columns.levels[level]; + auto const& stream_info = _file_itm_data.lvl_stream_info[level]; + auto const& column_types = _file_itm_data.lvl_column_types[level]; + auto const& nested_cols = _file_itm_data.lvl_nested_cols[level]; - // Get the total number of stripes across all input files. - std::size_t total_num_stripes = - std::accumulate(selected_stripes.begin(), - selected_stripes.end(), - 0, - [](std::size_t sum, auto& stripe_source_mapping) { - return sum + stripe_source_mapping.stripe_info.size(); - }); - auto const num_columns = columns_level.size(); - cudf::detail::hostdevice_2dvector chunks( - total_num_stripes, num_columns, _stream); + auto& stripe_data = _file_itm_data.lvl_stripe_data[level]; + auto& chunks = lvl_chunks[level]; + + auto const num_lvl_columns = columns_level.size(); + chunks = + cudf::detail::hostdevice_2dvector(stripe_count, num_lvl_columns, _stream); memset(chunks.base_host_ptr(), 0, chunks.size_bytes()); const bool use_index = - _use_index && + _options.use_index && // Do stripes have row group index _metadata.is_row_grp_idx_present() && // Only use if we don't have much work with complete columns & stripes // TODO: Consider nrows, gpu, and tune the threshold - (rows_to_read > _metadata.get_row_index_stride() && !(_metadata.get_row_index_stride() & 7) && - _metadata.get_row_index_stride() != 0 && num_columns * total_num_stripes < 8 * 128) && + (rows_to_decode > _metadata.get_row_index_stride() && + !(_metadata.get_row_index_stride() & 7) && _metadata.get_row_index_stride() != 0 && + num_lvl_columns * stripe_count < 8 * 128) && // Only use if first row is aligned to a stripe boundary // TODO: Fix logic to handle unaligned rows (rows_to_skip == 0); - // Logically view streams as columns - std::vector stream_info; - - null_count_prefix_sums.emplace_back(); - null_count_prefix_sums.back().reserve(_selected_columns.levels[level].size()); - std::generate_n(std::back_inserter(null_count_prefix_sums.back()), - _selected_columns.levels[level].size(), - [&]() { - return cudf::detail::make_zeroed_device_uvector_async( - total_num_stripes, _stream, rmm::mr::get_current_device_resource()); - }); - - // Tracker for eventually deallocating compressed and uncompressed data - auto& stripe_data = lvl_stripe_data[level]; - - int64_t stripe_start_row = 0; - int64_t num_dict_entries = 0; - int64_t num_rowgroups = 0; - size_type stripe_idx = 0; - - std::vector, std::size_t>> read_tasks; - for (auto const& stripe_source_mapping : selected_stripes) { - // Iterate through the source files selected stripes - for (auto const& stripe : stripe_source_mapping.stripe_info) { - auto const stripe_info = stripe.first; - auto const stripe_footer = stripe.second; - - auto stream_count = stream_info.size(); - auto const total_data_size = gather_stream_info(stripe_idx, - stripe_info, - stripe_footer, - col_meta.orc_col_map[level], - _metadata.get_types(), - use_index, - level == 0, - &num_dict_entries, - stream_info, - chunks); - - auto const is_stripe_data_empty = total_data_size == 0; - CUDF_EXPECTS(not is_stripe_data_empty or stripe_info->indexLength == 0, - "Invalid index rowgroup stream data"); - - // Buffer needs to be padded. - // Required by `copy_uncompressed_kernel`. - stripe_data.emplace_back( - cudf::util::round_up_safe(total_data_size, BUFFER_PADDING_MULTIPLE), _stream); - auto dst_base = static_cast(stripe_data.back().data()); - - // Coalesce consecutive streams into one read - while (not is_stripe_data_empty and stream_count < stream_info.size()) { - auto const d_dst = dst_base + stream_info[stream_count].dst_pos; - auto const offset = stream_info[stream_count].offset; - auto len = stream_info[stream_count].length; - stream_count++; - - while (stream_count < stream_info.size() && - stream_info[stream_count].offset == offset + len) { - len += stream_info[stream_count].length; - stream_count++; - } - if (_metadata.per_file_metadata[stripe_source_mapping.source_idx] - .source->is_device_read_preferred(len)) { - read_tasks.push_back( - std::pair(_metadata.per_file_metadata[stripe_source_mapping.source_idx] - .source->device_read_async(offset, len, d_dst, _stream), - len)); - - } else { - auto const buffer = - _metadata.per_file_metadata[stripe_source_mapping.source_idx].source->host_read( - offset, len); - CUDF_EXPECTS(buffer->size() == len, "Unexpected discrepancy in bytes read."); - CUDF_CUDA_TRY( - cudaMemcpyAsync(d_dst, buffer->data(), len, cudaMemcpyDefault, _stream.value())); - _stream.synchronize(); - } - } - - auto const num_rows_per_stripe = stripe_info->numberOfRows; - auto const rowgroup_id = num_rowgroups; - auto stripe_num_rowgroups = 0; - if (use_index) { - stripe_num_rowgroups = (num_rows_per_stripe + _metadata.get_row_index_stride() - 1) / - _metadata.get_row_index_stride(); + // 0-based counters, used across all decoding stripes in this step. + int64_t stripe_start_row{0}; + int64_t num_dict_entries{0}; + uint32_t num_rowgroups{0}; + std::size_t local_stream_order{0}; + + for (auto stripe_idx = stripe_start; stripe_idx < stripe_end; ++stripe_idx) { + auto const& stripe = selected_stripes[stripe_idx]; + auto const stripe_info = stripe.stripe_info; + auto const stripe_footer = stripe.stripe_footer; + + // Normalize stripe_idx to 0-based. + auto const stripe_local_idx = stripe_idx - stripe_start; + + // The first parameter (`stripe_order`) must be normalized to 0-based. + auto const total_data_size = gather_stream_info_and_column_desc(stripe_local_idx, + level, + stripe_info, + stripe_footer, + col_meta.orc_col_map[level], + _metadata.get_types(), + use_index, + level == 0, + &num_dict_entries, + &local_stream_order, + nullptr, // stream_info + &chunks); + + auto const is_stripe_data_empty = total_data_size == 0; + CUDF_EXPECTS(not is_stripe_data_empty or stripe_info->indexLength == 0, + "Invalid index rowgroup stream data"); + + auto const dst_base = + static_cast(stripe_data[stripe_idx - load_stripe_start].data()); + auto const num_rows_in_stripe = static_cast(stripe_info->numberOfRows); + + uint32_t const rowgroup_id = num_rowgroups; + uint32_t const stripe_num_rowgroups = + use_index ? (num_rows_in_stripe + _metadata.get_row_index_stride() - 1) / + _metadata.get_row_index_stride() + : 0; + + // Update chunks to reference streams pointers. + for (std::size_t col_idx = 0; col_idx < num_lvl_columns; col_idx++) { + auto& chunk = chunks[stripe_local_idx][col_idx]; + // start row, number of rows in a each stripe and total number of rows + // may change in lower levels of nesting + chunk.start_row = + (level == 0) ? stripe_start_row + : col_meta.child_start_row[stripe_local_idx * num_lvl_columns + col_idx]; + chunk.num_rows = + (level == 0) + ? num_rows_in_stripe + : col_meta.num_child_rows_per_stripe[stripe_local_idx * num_lvl_columns + col_idx]; + chunk.column_num_rows = (level == 0) ? rows_to_decode : col_meta.num_child_rows[col_idx]; + chunk.parent_validity_info = + (level == 0) ? column_validity_info{} : col_meta.parent_column_data[col_idx]; + chunk.parent_null_count_prefix_sums = + (level == 0) ? nullptr + : null_count_prefix_sums.data() + (num_processed_prev_lvl_columns + + col_meta.parent_column_index[col_idx]) * + stripe_count; + chunk.encoding_kind = stripe_footer->columns[columns_level[col_idx].id].kind; + chunk.type_kind = + _metadata.per_file_metadata[stripe.source_idx].ff.types[columns_level[col_idx].id].kind; + + // num_child_rows for a struct column will be same, for other nested types it will be + // calculated. + chunk.num_child_rows = (chunk.type_kind != orc::STRUCT) ? 0 : chunk.num_rows; + chunk.dtype_id = column_types[col_idx].id(); + chunk.decimal_scale = _metadata.per_file_metadata[stripe.source_idx] + .ff.types[columns_level[col_idx].id] + .scale.value_or(0); + + chunk.rowgroup_id = rowgroup_id; + chunk.dtype_len = (column_types[col_idx].id() == type_id::STRING) + ? sizeof(string_index_pair) + : ((column_types[col_idx].id() == type_id::LIST) or + (column_types[col_idx].id() == type_id::STRUCT)) + ? sizeof(size_type) + : cudf::size_of(column_types[col_idx]); + chunk.num_rowgroups = stripe_num_rowgroups; + + if (chunk.type_kind == orc::TIMESTAMP) { + chunk.timestamp_type_id = _options.timestamp_type.id(); } - // Update chunks to reference streams pointers - for (std::size_t col_idx = 0; col_idx < num_columns; col_idx++) { - auto& chunk = chunks[stripe_idx][col_idx]; - // start row, number of rows in a each stripe and total number of rows - // may change in lower levels of nesting - chunk.start_row = (level == 0) - ? stripe_start_row - : col_meta.child_start_row[stripe_idx * num_columns + col_idx]; - chunk.num_rows = - (level == 0) ? stripe_info->numberOfRows - : col_meta.num_child_rows_per_stripe[stripe_idx * num_columns + col_idx]; - chunk.column_num_rows = (level == 0) ? rows_to_read : col_meta.num_child_rows[col_idx]; - chunk.parent_validity_info = - (level == 0) ? column_validity_info{} : col_meta.parent_column_data[col_idx]; - chunk.parent_null_count_prefix_sums = - (level == 0) - ? nullptr - : null_count_prefix_sums[level - 1][col_meta.parent_column_index[col_idx]].data(); - chunk.encoding_kind = stripe_footer->columns[columns_level[col_idx].id].kind; - chunk.type_kind = _metadata.per_file_metadata[stripe_source_mapping.source_idx] - .ff.types[columns_level[col_idx].id] - .kind; - // num_child_rows for a struct column will be same, for other nested types it will be - // calculated. - chunk.num_child_rows = (chunk.type_kind != orc::STRUCT) ? 0 : chunk.num_rows; - chunk.dtype_id = column_types[col_idx].id(); - chunk.decimal_scale = _metadata.per_file_metadata[stripe_source_mapping.source_idx] - .ff.types[columns_level[col_idx].id] - .scale.value_or(0); - - chunk.rowgroup_id = rowgroup_id; - chunk.dtype_len = (column_types[col_idx].id() == type_id::STRING) - ? sizeof(string_index_pair) - : ((column_types[col_idx].id() == type_id::LIST) or - (column_types[col_idx].id() == type_id::STRUCT)) - ? sizeof(size_type) - : cudf::size_of(column_types[col_idx]); - chunk.num_rowgroups = stripe_num_rowgroups; - if (chunk.type_kind == orc::TIMESTAMP) { chunk.timestamp_type_id = _timestamp_type.id(); } - if (not is_stripe_data_empty) { - for (int k = 0; k < gpu::CI_NUM_STREAMS; k++) { - chunk.streams[k] = dst_base + stream_info[chunk.strm_id[k]].dst_pos; - } + if (not is_stripe_data_empty) { + for (int k = 0; k < gpu::CI_NUM_STREAMS; k++) { + chunk.streams[k] = + dst_base + stream_info[chunk.strm_id[k] + stream_range.begin].dst_pos; } } - stripe_start_row += num_rows_per_stripe; - num_rowgroups += stripe_num_rowgroups; - - stripe_idx++; } - } - for (auto& task : read_tasks) { - CUDF_EXPECTS(task.first.get() == task.second, "Unexpected discrepancy in bytes read."); + + stripe_start_row += num_rows_in_stripe; + num_rowgroups += stripe_num_rowgroups; } if (stripe_data.empty()) { continue; } - // Process dataset chunk pages into output columns + // Process dataset chunks into output columns. auto row_groups = - cudf::detail::hostdevice_2dvector(num_rowgroups, num_columns, _stream); + cudf::detail::hostdevice_2dvector(num_rowgroups, num_lvl_columns, _stream); if (level > 0 and row_groups.size().first) { cudf::host_span row_groups_span(row_groups.base_host_ptr(), - num_rowgroups * num_columns); + num_rowgroups * num_lvl_columns); auto& rw_grp_meta = col_meta.rwgrp_meta; // Update start row and num rows per row group @@ -980,19 +943,31 @@ void reader::impl::prepare_data(int64_t skip_rows, return meta; }); } - // Setup row group descriptors if using indexes + + // Setup row group descriptors if using indexes. if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { - auto decomp_data = decompress_stripe_data(*_metadata.per_file_metadata[0].decompressor, + auto compinfo = cudf::detail::hostdevice_span( + hd_compinfo.begin(), hd_compinfo.d_begin(), stream_range.size()); + auto decomp_data = decompress_stripe_data(load_stripe_range, + stream_range, + stripe_count, + compinfo, + _file_itm_data.compinfo_map, + *_metadata.per_file_metadata[0].decompressor, stripe_data, stream_info, chunks, row_groups, - total_num_stripes, _metadata.get_row_index_stride(), level == 0, _stream); - stripe_data.clear(); - stripe_data.push_back(std::move(decomp_data)); + + // Just save the decompressed data and clear out the raw data to free up memory. + stripe_data[stripe_start - load_stripe_start] = std::move(decomp_data); + for (std::size_t i = 1; i < stripe_count; ++i) { + stripe_data[i + stripe_start - load_stripe_start] = {}; + } + } else { if (row_groups.size().first) { chunks.host_to_device_async(_stream); @@ -1001,34 +976,38 @@ void reader::impl::prepare_data(int64_t skip_rows, gpu::ParseRowGroupIndex(row_groups.base_device_ptr(), nullptr, chunks.base_device_ptr(), - num_columns, - total_num_stripes, + num_lvl_columns, + stripe_count, _metadata.get_row_index_stride(), level == 0, _stream); } } + _out_buffers[level].resize(0); + for (std::size_t i = 0; i < column_types.size(); ++i) { bool is_nullable = false; - for (std::size_t j = 0; j < total_num_stripes; ++j) { + for (std::size_t j = 0; j < stripe_count; ++j) { if (chunks[j][i].strm_len[gpu::CI_PRESENT] != 0) { is_nullable = true; break; } } - auto is_list_type = (column_types[i].id() == type_id::LIST); - auto n_rows = (level == 0) ? rows_to_read : col_meta.num_child_rows[i]; - // For list column, offset column will be always size + 1 - if (is_list_type) n_rows++; - _out_buffers[level].emplace_back(column_types[i], n_rows, is_nullable, _stream, _mr); + + auto const is_list_type = (column_types[i].id() == type_id::LIST); + auto const n_rows = (level == 0) ? rows_to_decode : col_meta.num_child_rows[i]; + + // For list column, offset column will be always size + 1. + _out_buffers[level].emplace_back( + column_types[i], is_list_type ? n_rows + 1 : n_rows, is_nullable, _stream, _mr); } decode_stream_data(num_dict_entries, rows_to_skip, _metadata.get_row_index_stride(), level, - tz_table->view(), + *tz_table_dptr, chunks, row_groups, _out_buffers[level], @@ -1036,8 +1015,9 @@ void reader::impl::prepare_data(int64_t skip_rows, _mr); if (nested_cols.size()) { - // Extract information to process nested child columns - scan_null_counts(chunks, null_count_prefix_sums[level], _stream); + // Extract information to process nested child columns. + scan_null_counts( + chunks, null_count_prefix_sums.data() + num_processed_lvl_columns * stripe_count, _stream); row_groups.device_to_host_sync(_stream); aggregate_child_meta( @@ -1055,7 +1035,48 @@ void reader::impl::prepare_data(int64_t skip_rows, if (not buff_data.empty()) { generate_offsets_for_list(buff_data, _stream); } } + num_processed_prev_lvl_columns = num_processed_lvl_columns; + num_processed_lvl_columns += num_lvl_columns; } // end loop level + + // Now generate a table from the decoded result. + std::vector> out_columns; + _out_metadata = get_meta_with_user_data(); + std::transform( + _selected_columns.levels[0].begin(), + _selected_columns.levels[0].end(), + std::back_inserter(out_columns), + [&](auto const& orc_col_meta) { + _out_metadata.schema_info.emplace_back(""); + auto col_buffer = assemble_buffer( + orc_col_meta.id, 0, *_col_meta, _metadata, _selected_columns, _out_buffers, _stream, _mr); + return make_column(col_buffer, &_out_metadata.schema_info.back(), std::nullopt, _stream); + }); + _chunk_read_data.decoded_table = std::make_unique
(std::move(out_columns)); + + // Free up temp memory used for decoding. + for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { + _out_buffers[level].resize(0); + + auto& stripe_data = _file_itm_data.lvl_stripe_data[level]; + if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { + stripe_data[stripe_start - load_stripe_start] = {}; + } else { + for (std::size_t i = 0; i < stripe_count; ++i) { + stripe_data[i + stripe_start - load_stripe_start] = {}; + } + } + } + + // Output table range is reset to start from the first position. + _chunk_read_data.curr_output_table_range = 0; + + // Split the decoded table into ranges that be output into chunks having size within the given + // output size limit. + _chunk_read_data.output_table_ranges = find_table_splits(_chunk_read_data.decoded_table->view(), + _chunk_read_data.output_row_granularity, + _chunk_read_data.chunk_read_limit, + _stream); } } // namespace cudf::io::orc::detail diff --git a/cpp/src/io/orc/reader_impl_helpers.hpp b/cpp/src/io/orc/reader_impl_helpers.hpp index 6645eecbd29..a563fb19e15 100644 --- a/cpp/src/io/orc/reader_impl_helpers.hpp +++ b/cpp/src/io/orc/reader_impl_helpers.hpp @@ -16,9 +16,9 @@ #pragma once -#include "aggregate_orc_metadata.hpp" +#include "io/orc/aggregate_orc_metadata.hpp" +#include "io/orc/orc.hpp" #include "io/utilities/column_buffer.hpp" -#include "orc.hpp" #include diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index c7659be1adb..c47beb8d7ed 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -650,7 +650,10 @@ aggregate_reader_metadata::select_row_groups( if (not row_group_indices.empty()) { return std::pair{}; } auto const from_opts = cudf::io::detail::skip_rows_num_rows_from_options( skip_rows_opt, num_rows_opt, get_num_rows()); - return std::pair{static_cast(from_opts.first), from_opts.second}; + CUDF_EXPECTS(from_opts.second <= static_cast(std::numeric_limits::max()), + "Number of reading rows exceeds cudf's column size limit."); + return std::pair{static_cast(from_opts.first), + static_cast(from_opts.second)}; }(); if (!row_group_indices.empty()) { diff --git a/cpp/src/io/utilities/row_selection.cpp b/cpp/src/io/utilities/row_selection.cpp index f136cd11ff7..c0bbca39167 100644 --- a/cpp/src/io/utilities/row_selection.cpp +++ b/cpp/src/io/utilities/row_selection.cpp @@ -23,20 +23,17 @@ namespace cudf::io::detail { -std::pair skip_rows_num_rows_from_options( - int64_t skip_rows, std::optional const& num_rows, int64_t num_source_rows) +std::pair skip_rows_num_rows_from_options(int64_t skip_rows, + std::optional const& num_rows, + int64_t num_source_rows) { - auto const rows_to_skip = std::min(skip_rows, num_source_rows); - if (not num_rows.has_value()) { - CUDF_EXPECTS(num_source_rows - rows_to_skip <= std::numeric_limits::max(), - "The requested number of rows exceeds the column size limit", - std::overflow_error); - return {rows_to_skip, num_source_rows - rows_to_skip}; - } + auto const rows_to_skip = std::min(skip_rows, num_source_rows); + auto const num_rows_can_read = num_source_rows - rows_to_skip; + + if (not num_rows.has_value()) { return {rows_to_skip, num_rows_can_read}; } + // Limit the number of rows to the end of the input - return { - rows_to_skip, - static_cast(std::min(num_rows.value(), num_source_rows - rows_to_skip))}; + return {rows_to_skip, std::min(num_rows.value(), num_rows_can_read)}; } } // namespace cudf::io::detail diff --git a/cpp/src/io/utilities/row_selection.hpp b/cpp/src/io/utilities/row_selection.hpp index 0b5d3aef8bd..7fdcc65d77b 100644 --- a/cpp/src/io/utilities/row_selection.hpp +++ b/cpp/src/io/utilities/row_selection.hpp @@ -34,7 +34,8 @@ namespace cudf::io::detail { * * @throw std::overflow_exception The requested number of rows exceeds the column size limit */ -std::pair skip_rows_num_rows_from_options( - int64_t skip_rows, std::optional const& num_rows, int64_t num_source_rows); +std::pair skip_rows_num_rows_from_options(int64_t skip_rows, + std::optional const& num_rows, + int64_t num_source_rows); } // namespace cudf::io::detail diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 6c56d82007a..fa633dfa67b 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -294,7 +294,7 @@ ConfigureTest( PERCENT 30 ) ConfigureTest( - ORC_TEST io/orc_test.cpp + ORC_TEST io/orc_chunked_reader_test.cu io/orc_test.cpp GPUS 1 PERCENT 30 ) diff --git a/cpp/tests/io/orc_chunked_reader_test.cu b/cpp/tests/io/orc_chunked_reader_test.cu new file mode 100644 index 00000000000..1c1b53ea17f --- /dev/null +++ b/cpp/tests/io/orc_chunked_reader_test.cu @@ -0,0 +1,1477 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +namespace { +enum class output_limit : std::size_t {}; +enum class input_limit : std::size_t {}; +enum class output_row_granularity : cudf::size_type {}; + +// Global environment for temporary files +auto const temp_env = reinterpret_cast( + ::testing::AddGlobalTestEnvironment(new cudf::test::TempDirTestEnvironment)); + +using int32s_col = cudf::test::fixed_width_column_wrapper; +using int64s_col = cudf::test::fixed_width_column_wrapper; +using doubles_col = cudf::test::fixed_width_column_wrapper; +using strings_col = cudf::test::strings_column_wrapper; +using structs_col = cudf::test::structs_column_wrapper; +using int32s_lists_col = cudf::test::lists_column_wrapper; + +auto write_file(std::vector>& input_columns, + std::string const& filename, + bool nullable = false, + std::size_t stripe_size_bytes = cudf::io::default_stripe_size_bytes, + cudf::size_type stripe_size_rows = cudf::io::default_stripe_size_rows) +{ + if (nullable) { + // Generate deterministic bitmask instead of random bitmask for easy computation of data size. + auto const valid_iter = cudf::detail::make_counting_transform_iterator( + 0, [](cudf::size_type i) { return i % 4 != 3; }); + cudf::size_type offset{0}; + for (auto& col : input_columns) { + auto const [null_mask, null_count] = + cudf::test::detail::make_null_mask(valid_iter + offset, valid_iter + col->size() + offset); + col = cudf::structs::detail::superimpose_nulls( + static_cast(null_mask.data()), + null_count, + std::move(col), + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); + + // Shift nulls of the next column by one position, to avoid having all nulls + // in the same table rows. + ++offset; + } + } + + auto input_table = std::make_unique(std::move(input_columns)); + auto filepath = + temp_env->get_temp_filepath(nullable ? filename + "_nullable.orc" : filename + ".orc"); + + auto const write_opts = + cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, *input_table) + .stripe_size_bytes(stripe_size_bytes) + .stripe_size_rows(stripe_size_rows) + .build(); + cudf::io::write_orc(write_opts); + + return std::pair{std::move(input_table), std::move(filepath)}; +} + +// NOTE: By default, output_row_granularity=10'000 rows. +// This means if the input file has more than 10k rows then the output chunk will never +// have less than 10k rows. +auto chunked_read(std::string const& filepath, + output_limit output_limit_bytes, + input_limit input_limit_bytes = input_limit{0}, + output_row_granularity output_granularity = output_row_granularity{10'000}) +{ + auto const read_opts = + cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}).build(); + auto reader = cudf::io::chunked_orc_reader(static_cast(output_limit_bytes), + static_cast(input_limit_bytes), + static_cast(output_granularity), + read_opts); + + auto num_chunks = 0; + auto out_tables = std::vector>{}; + + // TODO: remove this scope, when we get rid of mem stat in the reader. + // This is to avoid use-after-free of memory resource created by the mem stat object. + auto mr = rmm::mr::get_current_device_resource(); + + do { + auto chunk = reader.read_chunk(); + // If the input file is empty, the first call to `read_chunk` will return an empty table. + // Thus, we only check for non-empty output table from the second call. + if (num_chunks > 0) { + CUDF_EXPECTS(chunk.tbl->num_rows() != 0, "Number of rows in the new chunk is zero."); + } + ++num_chunks; + out_tables.emplace_back(std::move(chunk.tbl)); + } while (reader.has_next()); + + if (num_chunks > 1) { + CUDF_EXPECTS(out_tables.front()->num_rows() != 0, "Number of rows in the new chunk is zero."); + } + + auto out_tviews = std::vector{}; + for (auto const& tbl : out_tables) { + out_tviews.emplace_back(tbl->view()); + } + + // return std::pair(cudf::concatenate(out_tviews), num_chunks); + + // TODO: remove this + return std::pair(cudf::concatenate(out_tviews, cudf::get_default_stream(), mr), num_chunks); +} + +auto chunked_read(std::string const& filepath, + output_limit output_limit_bytes, + output_row_granularity output_granularity) +{ + return chunked_read(filepath, output_limit_bytes, input_limit{0UL}, output_granularity); +} + +} // namespace + +struct OrcChunkedReaderTest : public cudf::test::BaseFixture {}; + +TEST_F(OrcChunkedReaderTest, TestChunkedReadNoData) +{ + std::vector> input_columns; + input_columns.emplace_back(int32s_col{}.release()); + input_columns.emplace_back(int64s_col{}.release()); + + auto const [expected, filepath] = write_file(input_columns, "chunked_read_empty"); + auto const [result, num_chunks] = chunked_read(filepath, output_limit{1'000}); + EXPECT_EQ(num_chunks, 1); + EXPECT_EQ(result->num_rows(), 0); + EXPECT_EQ(result->num_columns(), 2); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); +} + +TEST_F(OrcChunkedReaderTest, TestChunkedReadInvalidParameter) +{ + std::vector> input_columns; + input_columns.emplace_back(int32s_col{}.release()); + input_columns.emplace_back(int64s_col{}.release()); + + auto const [expected, filepath] = write_file(input_columns, "chunked_read_invalid"); + EXPECT_THROW( + chunked_read(filepath, output_limit{1'000}, output_row_granularity{-1} /*invalid value*/), + cudf::logic_error); +} + +TEST_F(OrcChunkedReaderTest, TestChunkedReadSimpleData) +{ + auto constexpr num_rows = 40'000; + + auto const generate_input = [num_rows](bool nullable, std::size_t stripe_rows) { + std::vector> input_columns; + auto const value_iter = thrust::make_counting_iterator(0); + input_columns.emplace_back(int32s_col(value_iter, value_iter + num_rows).release()); + input_columns.emplace_back(int64s_col(value_iter, value_iter + num_rows).release()); + + return write_file(input_columns, + "chunked_read_simple", + nullable, + cudf::io::default_stripe_size_bytes, + stripe_rows); + }; + + { + auto const [expected, filepath] = generate_input(false, 1'000); + auto const [result, num_chunks] = chunked_read(filepath, output_limit{245'000}); + EXPECT_EQ(num_chunks, 2); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } + { + auto const [expected, filepath] = generate_input(false, cudf::io::default_stripe_size_rows); + auto const [result, num_chunks] = chunked_read(filepath, output_limit{245'000}); + EXPECT_EQ(num_chunks, 2); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } + + { + auto const [expected, filepath] = generate_input(true, 1'000); + auto const [result, num_chunks] = chunked_read(filepath, output_limit{245'000}); + EXPECT_EQ(num_chunks, 2); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } + { + auto const [expected, filepath] = generate_input(true, cudf::io::default_stripe_size_rows); + auto const [result, num_chunks] = chunked_read(filepath, output_limit{245'000}); + EXPECT_EQ(num_chunks, 2); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } +} + +TEST_F(OrcChunkedReaderTest, TestChunkedReadBoundaryCases) +{ + // Tests some specific boundary conditions in the split calculations. + + auto constexpr num_rows = 40'000; + + auto const [expected, filepath] = [num_rows]() { + std::vector> input_columns; + auto const value_iter = thrust::make_counting_iterator(0); + input_columns.emplace_back(int32s_col(value_iter, value_iter + num_rows).release()); + return write_file(input_columns, "chunked_read_simple_boundary"); + }(); + + // Test with zero limit: everything will be read in one chunk. + { + auto const [result, num_chunks] = chunked_read(filepath, output_limit{0UL}); + EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } + + // Test with a very small limit: 1 byte. + { + auto const [result, num_chunks] = chunked_read(filepath, output_limit{1UL}); + // Number of chunks is 4 because of using default `output_row_granularity = 10k`. + EXPECT_EQ(num_chunks, 4); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } + + // Test with a very small limit: 1 byte, and small value of `output_row_granularity`. + { + auto const [result, num_chunks] = + chunked_read(filepath, output_limit{1UL}, output_row_granularity{1'000}); + EXPECT_EQ(num_chunks, 40); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } + + // Test with a very small limit: 1 byte, and large value of `output_row_granularity`. + { + auto const [result, num_chunks] = + chunked_read(filepath, output_limit{1UL}, output_row_granularity{30'000}); + EXPECT_EQ(num_chunks, 2); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } + // Test with a very large limit + { + auto const [result, num_chunks] = chunked_read(filepath, output_limit{2L << 40}); + EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } + // Test with a limit slightly less than one granularity segment of data + // (output_row_granularity = 10k rows = 40'000 bytes). + { + auto const [result, num_chunks] = chunked_read(filepath, output_limit{39'000UL}); + EXPECT_EQ(num_chunks, 4); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } + + // Test with a limit exactly the size one granularity segment of data + // (output_row_granularity = 10k rows = 40'000 bytes). + { + auto const [result, num_chunks] = chunked_read(filepath, output_limit{40'000UL}); + EXPECT_EQ(num_chunks, 4); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } + + // Test with a limit slightly more than one granularity segment of data + // (output_row_granularity = 10k rows = 40'000 bytes). + { + auto const [result, num_chunks] = chunked_read(filepath, output_limit{41'000UL}); + EXPECT_EQ(num_chunks, 4); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } + + // Test with a limit slightly less than two granularity segments of data + { + auto const [result, num_chunks] = chunked_read(filepath, output_limit{79'000UL}); + EXPECT_EQ(num_chunks, 4); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } + + // Test with a limit exactly the size of two granularity segments of data minus 1 byte. + { + auto const [result, num_chunks] = chunked_read(filepath, output_limit{79'999UL}); + EXPECT_EQ(num_chunks, 4); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } + + // Test with a limit exactly the size of two granularity segments of data. + { + auto const [result, num_chunks] = chunked_read(filepath, output_limit{80'000UL}); + EXPECT_EQ(num_chunks, 2); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } + + // Test with a limit slightly more the size two granularity segments of data. + { + auto const [result, num_chunks] = chunked_read(filepath, output_limit{81'000}); + EXPECT_EQ(num_chunks, 2); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } + + // Test with a limit exactly the size of the input minus 1 byte. + { + auto const [result, num_chunks] = chunked_read(filepath, output_limit{159'999UL}); + EXPECT_EQ(num_chunks, 2); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } + + // Test with a limit exactly the size of the input. + { + auto const [result, num_chunks] = chunked_read(filepath, output_limit{160'000UL}); + EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } +} + +TEST_F(OrcChunkedReaderTest, TestChunkedReadWithString) +{ + auto constexpr num_rows = 60'000; + auto constexpr output_granularity = output_row_granularity{20'000}; + + auto const generate_input = [num_rows](bool nullable) { + std::vector> input_columns; + auto const value_iter = thrust::make_counting_iterator(0); + + // ints Granularity Segment total bytes cumulative bytes + // 20000 rows of 4 bytes each = A0 80000 80000 + // 20000 rows of 4 bytes each = A1 80000 160000 + // 20000 rows of 4 bytes each = A2 80000 240000 + input_columns.emplace_back(int32s_col(value_iter, value_iter + num_rows).release()); + + // strings Granularity Segment total bytes cumulative bytes + // 20000 rows of 1 char each (20000 + 80004) = B0 100004 100004 + // 20000 rows of 4 chars each (80000 + 80004) = B1 160004 260008 + // 20000 rows of 16 chars each (320000 + 80004) = B2 400004 660012 + auto const strings = std::vector{"a", "bbbb", "cccccccccccccccc"}; + auto const str_iter = cudf::detail::make_counting_transform_iterator(0, [&](int32_t i) { + if (i < 20000) { return strings[0]; } + if (i < 40000) { return strings[1]; } + return strings[2]; + }); + input_columns.emplace_back(strings_col(str_iter, str_iter + num_rows).release()); + + // Cumulative sizes: + // A0 + B0 : 180004 + // A1 + B1 : 420008 + // A2 + B2 : 900012 + // skip_rows / num_rows + // byte_limit==500000 should give 2 chunks: {0, 40000}, {40000, 20000} + // byte_limit==1000000 should give 1 chunks: {0, 60000}, + return write_file(input_columns, "chunked_read_with_strings", nullable); + }; + + auto const [expected_no_null, filepath_no_null] = generate_input(false); + auto const [expected_with_nulls, filepath_with_nulls] = generate_input(true); + + // Test with zero limit: everything will be read in one chunk. + { + auto const [result, num_chunks] = chunked_read(filepath_no_null, output_limit{0UL}); + EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); + } + { + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, output_limit{0UL}); + EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); + } + + // Test with a very small limit: 1 byte. + { + auto const [result, num_chunks] = + chunked_read(filepath_no_null, output_limit{1UL}, output_granularity); + EXPECT_EQ(num_chunks, 3); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); + } + { + auto const [result, num_chunks] = + chunked_read(filepath_with_nulls, output_limit{1UL}, output_granularity); + EXPECT_EQ(num_chunks, 3); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); + } + + // Test with a very large limit. + { + auto const [result, num_chunks] = chunked_read(filepath_no_null, output_limit{2L << 40}); + EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); + } + { + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, output_limit{2L << 40}); + EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); + } + + // Other tests: + + { + auto const [result, num_chunks] = + chunked_read(filepath_no_null, output_limit{500'000UL}, output_granularity); + EXPECT_EQ(num_chunks, 2); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); + } + { + auto const [result, num_chunks] = + chunked_read(filepath_with_nulls, output_limit{500'000UL}, output_granularity); + EXPECT_EQ(num_chunks, 2); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); + } + + { + auto const [result, num_chunks] = chunked_read(filepath_no_null, output_limit{1'000'000UL}); + EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); + } + { + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, output_limit{1'000'000UL}); + EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); + } +} + +TEST_F(OrcChunkedReaderTest, TestChunkedReadWithStructs) +{ + auto constexpr num_rows = 100'000; + auto constexpr output_granularity = output_row_granularity{20'000}; + + auto const generate_input = [num_rows](bool nullable) { + std::vector> input_columns; + auto const int_iter = thrust::make_counting_iterator(0); + input_columns.emplace_back(int32s_col(int_iter, int_iter + num_rows).release()); + input_columns.emplace_back([=] { + auto child1 = int32s_col(int_iter, int_iter + num_rows); + auto child2 = int32s_col(int_iter + num_rows, int_iter + num_rows * 2); + + auto const str_iter = cudf::detail::make_counting_transform_iterator( + 0, [&](int32_t i) { return std::to_string(i); }); + auto child3 = strings_col{str_iter, str_iter + num_rows}; + + return structs_col{{child1, child2, child3}}.release(); + }()); + + return write_file(input_columns, "chunked_read_with_structs", nullable); + }; + + auto const [expected_no_null, filepath_no_null] = generate_input(false); + auto const [expected_with_nulls, filepath_with_nulls] = generate_input(true); + + // Test with zero limit: everything will be read in one chunk. + { + auto const [result, num_chunks] = chunked_read(filepath_no_null, output_limit{0UL}); + EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); + } + { + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, output_limit{0UL}); + EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); + } + + // Test with a very small limit: 1 byte. + { + auto const [result, num_chunks] = + chunked_read(filepath_no_null, output_limit{1UL}, output_granularity); + EXPECT_EQ(num_chunks, 5); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); + } + { + auto const [result, num_chunks] = + chunked_read(filepath_with_nulls, output_limit{1UL}, output_granularity); + EXPECT_EQ(num_chunks, 5); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); + } + + // Test with a very large limit. + { + auto const [result, num_chunks] = + chunked_read(filepath_no_null, output_limit{2L << 40}, output_granularity); + EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); + } + { + auto const [result, num_chunks] = + chunked_read(filepath_with_nulls, output_limit{2L << 40}, output_granularity); + EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); + } + + // Other tests: + + { + auto const [result, num_chunks] = + chunked_read(filepath_no_null, output_limit{500'000UL}, output_granularity); + EXPECT_EQ(num_chunks, 5); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); + } + { + auto const [result, num_chunks] = + chunked_read(filepath_with_nulls, output_limit{500'000UL}, output_granularity); + EXPECT_EQ(num_chunks, 5); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); + } +} + +TEST_F(OrcChunkedReaderTest, TestChunkedReadWithListsNoNulls) +{ + auto constexpr num_rows = 100'000; + auto constexpr output_granularity = output_row_granularity{20'000}; + + auto const [expected, filepath] = [num_rows]() { + std::vector> input_columns; + // 20000 rows in 1 segment consist of: + // + // 20001 offsets : 80004 bytes + // 30000 ints : 120000 bytes + // total : 200004 bytes + // + // However, `segmented_row_bit_count` used in chunked reader returns 200000, + // thus we consider as having only 200000 bytes in total. + auto const template_lists = int32s_lists_col{ + int32s_lists_col{}, int32s_lists_col{0}, int32s_lists_col{1, 2}, int32s_lists_col{3, 4, 5}}; + + auto const gather_iter = + cudf::detail::make_counting_transform_iterator(0, [&](int32_t i) { return i % 4; }); + auto const gather_map = int32s_col(gather_iter, gather_iter + num_rows); + input_columns.emplace_back( + std::move(cudf::gather(cudf::table_view{{template_lists}}, gather_map)->release().front())); + + return write_file(input_columns, "chunked_read_with_lists_no_null"); + }(); + + // Test with zero limit: everything will be read in one chunk. + { + auto const [result, num_chunks] = chunked_read(filepath, output_limit{0UL}); + EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } + + // Test with a very small limit: 1 byte. + { + auto const [result, num_chunks] = chunked_read(filepath, output_limit{1UL}, output_granularity); + EXPECT_EQ(num_chunks, 5); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } + + // Test with a very large limit. + { + auto const [result, num_chunks] = + chunked_read(filepath, output_limit{2L << 40UL}, output_granularity); + EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } + + // Chunk size slightly less than 1 row segment (forcing it to be at least 1 segment per read). + { + auto const [result, num_chunks] = + chunked_read(filepath, output_limit{199'999UL}, output_granularity); + EXPECT_EQ(num_chunks, 5); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } + + // Chunk size exactly 1 row segment. + { + auto const [result, num_chunks] = + chunked_read(filepath, output_limit{200'000UL}, output_granularity); + EXPECT_EQ(num_chunks, 5); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } + + // Chunk size == size of 2 segments. Totally have 3 chunks. + { + auto const [result, num_chunks] = + chunked_read(filepath, output_limit{400'000UL}, output_granularity); + EXPECT_EQ(num_chunks, 3); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } + + // Chunk size == size of 2 segment minus one byte: each chunk will be just one segment. + { + auto const [result, num_chunks] = + chunked_read(filepath, output_limit{399'999UL}, output_granularity); + EXPECT_EQ(num_chunks, 5); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } +} + +TEST_F(OrcChunkedReaderTest, TestChunkedReadWithListsHavingNulls) +{ + auto constexpr num_rows = 100'000; + auto constexpr output_granularity = output_row_granularity{20'000}; + + auto const [expected, filepath] = [num_rows]() { + std::vector> input_columns; + // 20000 rows in 1 page consist of: + // + // 625 validity words : 2500 bytes (a null every 4 rows: null at indices [3, 7, 11, ...]) + // 20001 offsets : 80004 bytes + // 15000 ints : 60000 bytes + // total : 142504 bytes + // + // However, `segmented_row_bit_count` used in chunked reader returns 142500, + // thus we consider as having only 142500 bytes in total. + auto const template_lists = + int32s_lists_col{// these will all be null + int32s_lists_col{}, + int32s_lists_col{0}, + int32s_lists_col{1, 2}, + int32s_lists_col{3, 4, 5, 6, 7, 8, 9} /* this list will be nullified out */}; + auto const gather_iter = + cudf::detail::make_counting_transform_iterator(0, [&](int32_t i) { return i % 4; }); + auto const gather_map = int32s_col(gather_iter, gather_iter + num_rows); + input_columns.emplace_back( + std::move(cudf::gather(cudf::table_view{{template_lists}}, gather_map)->release().front())); + + return write_file(input_columns, "chunked_read_with_lists_nulls", true /*nullable*/); + }(); + + // Test with zero limit: everything will be read in one chunk. + { + auto const [result, num_chunks] = chunked_read(filepath, output_limit{0UL}); + EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } + + // Test with a very small limit: 1 byte. + { + auto const [result, num_chunks] = chunked_read(filepath, output_limit{1UL}, output_granularity); + EXPECT_EQ(num_chunks, 5); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } + + // Test with a very large limit. + { + auto const [result, num_chunks] = + chunked_read(filepath, output_limit{2L << 40}, output_granularity); + EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } + + // Chunk size slightly less than 1 row segment (forcing it to be at least 1 segment per read). + { + auto const [result, num_chunks] = + chunked_read(filepath, output_limit{142'499UL}, output_granularity); + EXPECT_EQ(num_chunks, 5); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } + + // Chunk size exactly 1 row segment. + { + auto const [result, num_chunks] = + chunked_read(filepath, output_limit{142'500UL}, output_granularity); + EXPECT_EQ(num_chunks, 5); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } + + // Chunk size == size of 2 segments. Totally have 3 chunks. + { + auto const [result, num_chunks] = + chunked_read(filepath, output_limit{285'000UL}, output_granularity); + EXPECT_EQ(num_chunks, 3); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } + + // Chunk size == size of 2 segment minus one byte: each chunk will be just one segment. + { + auto const [result, num_chunks] = + chunked_read(filepath, output_limit{284'999UL}, output_granularity); + EXPECT_EQ(num_chunks, 5); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } +} + +TEST_F(OrcChunkedReaderTest, TestChunkedReadWithStructsOfLists) +{ + auto constexpr num_rows = 100'000; + + // Size of each segment (10k row by default) is from 537k to 560k bytes (no nulls) + // and from 456k to 473k (with nulls). + auto const generate_input = [num_rows](bool nullable) { + std::vector> input_columns; + auto const int_iter = thrust::make_counting_iterator(0); + input_columns.emplace_back(int32s_col(int_iter, int_iter + num_rows).release()); + input_columns.emplace_back([=] { + std::vector> child_columns; + child_columns.emplace_back(int32s_col(int_iter, int_iter + num_rows).release()); + child_columns.emplace_back( + int32s_col(int_iter + num_rows, int_iter + num_rows * 2).release()); + + auto const str_iter = cudf::detail::make_counting_transform_iterator(0, [&](int32_t i) { + return std::to_string(i) + "++++++++++++++++++++" + std::to_string(i); + }); + child_columns.emplace_back(strings_col{str_iter, str_iter + num_rows}.release()); + + auto const template_lists = int32s_lists_col{ + int32s_lists_col{}, int32s_lists_col{0}, int32s_lists_col{0, 1}, int32s_lists_col{0, 1, 2}}; + auto const gather_iter = + cudf::detail::make_counting_transform_iterator(0, [&](int32_t i) { return i % 4; }); + auto const gather_map = int32s_col(gather_iter, gather_iter + num_rows); + child_columns.emplace_back( + std::move(cudf::gather(cudf::table_view{{template_lists}}, gather_map)->release().front())); + + return structs_col(std::move(child_columns)).release(); + }()); + + return write_file(input_columns, "chunked_read_with_structs_of_lists", nullable); + }; + + auto const [expected_no_null, filepath_no_null] = generate_input(false); + auto const [expected_with_nulls, filepath_with_nulls] = generate_input(true); + + // Test with zero limit: everything will be read in one chunk. + { + auto const [result, num_chunks] = chunked_read(filepath_no_null, output_limit{0UL}); + EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); + } + { + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, output_limit{0UL}); + EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); + } + + // Test with a very small limit: 1 byte. + { + auto const [result, num_chunks] = chunked_read(filepath_no_null, output_limit{1UL}); + EXPECT_EQ(num_chunks, 10); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); + } + { + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, output_limit{1UL}); + EXPECT_EQ(num_chunks, 10); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); + } + + // Test with a very large limit. + { + auto const [result, num_chunks] = chunked_read(filepath_no_null, output_limit{2L << 40}); + EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); + } + { + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, output_limit{2L << 40}); + EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); + } + + // Other tests: + + { + auto const [result, num_chunks] = chunked_read(filepath_no_null, output_limit{1'000'000UL}); + EXPECT_EQ(num_chunks, 10); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); + } + + { + auto const [result, num_chunks] = chunked_read(filepath_no_null, output_limit{1'500'000UL}); + EXPECT_EQ(num_chunks, 5); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); + } + + { + auto const [result, num_chunks] = chunked_read(filepath_no_null, output_limit{2'000'000UL}); + EXPECT_EQ(num_chunks, 4); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); + } + + { + auto const [result, num_chunks] = chunked_read(filepath_no_null, output_limit{5'000'000UL}); + EXPECT_EQ(num_chunks, 2); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); + } + + { + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, output_limit{1'000'000UL}); + EXPECT_EQ(num_chunks, 5); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); + } + + { + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, output_limit{1'500'000UL}); + EXPECT_EQ(num_chunks, 4); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); + } + + { + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, output_limit{2'000'000UL}); + EXPECT_EQ(num_chunks, 3); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); + } + + { + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, output_limit{5'000'000UL}); + EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); + } +} + +TEST_F(OrcChunkedReaderTest, TestChunkedReadWithListsOfStructs) +{ + auto constexpr num_rows = 100'000; + + // Size of each segment (10k row by default) is from 450k to 530k bytes (no nulls) + // and from 330k to 380k (with nulls). + auto const generate_input = [num_rows](bool nullable) { + std::vector> input_columns; + auto const int_iter = thrust::make_counting_iterator(0); + input_columns.emplace_back(int32s_col(int_iter, int_iter + num_rows).release()); + + auto offsets = std::vector{}; + offsets.reserve(num_rows * 2); + cudf::size_type num_structs = 0; + for (int i = 0; i < num_rows; ++i) { + offsets.push_back(num_structs); + auto const new_list_size = i % 4; + num_structs += new_list_size; + } + offsets.push_back(num_structs); + + auto const make_structs_col = [=] { + auto child1 = int32s_col(int_iter, int_iter + num_structs); + auto child2 = int32s_col(int_iter + num_structs, int_iter + num_structs * 2); + + auto const str_iter = cudf::detail::make_counting_transform_iterator( + 0, [&](int32_t i) { return std::to_string(i) + std::to_string(i) + std::to_string(i); }); + auto child3 = strings_col{str_iter, str_iter + num_structs}; + + return structs_col{{child1, child2, child3}}.release(); + }; + + input_columns.emplace_back( + cudf::make_lists_column(static_cast(offsets.size() - 1), + int32s_col(offsets.begin(), offsets.end()).release(), + make_structs_col(), + 0, + rmm::device_buffer{})); + + return write_file(input_columns, "chunked_read_with_lists_of_structs", nullable); + }; + + auto const [expected_no_null, filepath_no_null] = generate_input(false); + auto const [expected_with_nulls, filepath_with_nulls] = generate_input(true); + + // Test with zero limit: everything will be read in one chunk. + { + auto const [result, num_chunks] = chunked_read(filepath_no_null, output_limit{0UL}); + EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); + } + { + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, output_limit{0UL}); + EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); + } + + // Test with a very small limit: 1 byte. + { + auto const [result, num_chunks] = chunked_read(filepath_no_null, output_limit{1UL}); + EXPECT_EQ(num_chunks, 10); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); + } + { + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, output_limit{1UL}); + EXPECT_EQ(num_chunks, 10); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); + } + + // Test with a very large limit. + { + auto const [result, num_chunks] = chunked_read(filepath_no_null, output_limit{2L << 40}); + EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); + } + { + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, output_limit{2L << 40}); + EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); + } + + // Other tests. + + { + auto const [result, num_chunks] = chunked_read(filepath_no_null, output_limit{1'000'000UL}); + EXPECT_EQ(num_chunks, 7); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); + } + + { + auto const [result, num_chunks] = chunked_read(filepath_no_null, output_limit{1'500'000UL}); + EXPECT_EQ(num_chunks, 4); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); + } + + { + auto const [result, num_chunks] = chunked_read(filepath_no_null, output_limit{2'000'000UL}); + EXPECT_EQ(num_chunks, 3); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); + } + + { + auto const [result, num_chunks] = chunked_read(filepath_no_null, output_limit{5'000'000UL}); + EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); + } + + { + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, output_limit{1'000'000UL}); + EXPECT_EQ(num_chunks, 5); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); + } + + { + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, output_limit{1'500'000UL}); + EXPECT_EQ(num_chunks, 3); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); + } + + { + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, output_limit{2'000'000UL}); + EXPECT_EQ(num_chunks, 2); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); + } + + { + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, output_limit{5'000'000UL}); + EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); + } +} + +TEST_F(OrcChunkedReaderTest, TestChunkedReadNullCount) +{ + auto constexpr num_rows = 100'000; + + auto const sequence = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return 1; }); + auto const validity = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 4 != 3; }); + std::vector> cols; + cols.push_back(int32s_col{sequence, sequence + num_rows, validity}.release()); + auto const expected = std::make_unique(std::move(cols)); + + auto const filepath = temp_env->get_temp_filepath("chunked_reader_null_count.orc"); + auto const stripe_limit_rows = num_rows / 5; + auto const write_opts = + cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, *expected) + .stripe_size_rows(stripe_limit_rows) + .build(); + cudf::io::write_orc(write_opts); + + auto const byte_limit = stripe_limit_rows * sizeof(int); + auto const read_opts = + cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}).build(); + auto reader = + cudf::io::chunked_orc_reader(byte_limit, 0UL /*read_limit*/, stripe_limit_rows, read_opts); + + do { + // Every fourth row is null. + EXPECT_EQ(reader.read_chunk().tbl->get_column(0).null_count(), stripe_limit_rows / 4UL); + } while (reader.has_next()); +} + +namespace { + +std::size_t constexpr input_limit_expected_file_count = 3; + +std::vector input_limit_get_test_names(std::string const& base_filename) +{ + return {base_filename + "_a.orc", base_filename + "_b.orc", base_filename + "_c.orc"}; +} + +void input_limit_test_write_one(std::string const& filepath, + cudf::table_view const& input, + cudf::size_type stripe_size_rows, + cudf::io::compression_type compression) +{ + auto const out_opts = cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, input) + .compression(compression) + .stripe_size_rows(stripe_size_rows) + .build(); + cudf::io::write_orc(out_opts); +} + +void input_limit_test_write( + std::vector const& test_files, + cudf::table_view const& input, + cudf::size_type stripe_size_rows = 20'000 /*write relatively small stripes by default*/) +{ + CUDF_EXPECTS(test_files.size() == input_limit_expected_file_count, + "Unexpected count of test filenames."); + + // ZSTD yields a very small decompression size, can be much smaller than SNAPPY. + // However, ORC reader typically over-estimates the decompression size of data + // compressed by ZSTD to be very large, can be much larger than that of SNAPPY. + // That is because ZSTD may use a lot of scratch space at decode time + // (2.5x the total decompressed buffer size). + // As such, we may see smaller output chunks for the input data compressed by ZSTD. + input_limit_test_write_one( + test_files[0], input, stripe_size_rows, cudf::io::compression_type::NONE); + input_limit_test_write_one( + test_files[1], input, stripe_size_rows, cudf::io::compression_type::ZSTD); + input_limit_test_write_one( + test_files[2], input, stripe_size_rows, cudf::io::compression_type::SNAPPY); +} + +void input_limit_test_read(int test_location, + std::vector const& test_files, + cudf::table_view const& input, + output_limit output_limit_bytes, + input_limit input_limit_bytes, + int const* expected_chunk_counts) +{ + CUDF_EXPECTS(test_files.size() == input_limit_expected_file_count, + "Unexpected count of test filenames."); + + for (size_t idx = 0; idx < test_files.size(); ++idx) { + SCOPED_TRACE("Original line of failure: " + std::to_string(test_location) + + ", file idx: " + std::to_string(idx)); + auto const [result, num_chunks] = + chunked_read(test_files[idx], output_limit_bytes, input_limit_bytes); + EXPECT_EQ(expected_chunk_counts[idx], num_chunks); + // TODO: equal + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result, input); + } +} + +} // namespace + +struct OrcChunkedReaderInputLimitTest : public cudf::test::BaseFixture {}; + +TEST_F(OrcChunkedReaderInputLimitTest, SingleFixedWidthColumn) +{ + auto constexpr num_rows = 1'000'000; + auto const iter1 = thrust::make_constant_iterator(15); + auto const col1 = doubles_col(iter1, iter1 + num_rows); + + auto const filename = std::string{"single_col_fixed_width"}; + auto const test_files = input_limit_get_test_names(temp_env->get_temp_filepath(filename)); + auto const input = cudf::table_view{{col1}}; + input_limit_test_write(test_files, input); + + { + int constexpr expected[] = {50, 50, 50}; + input_limit_test_read( + __LINE__, test_files, input, output_limit{0UL}, input_limit{1UL}, expected); + } + + { + int constexpr expected[] = {17, 13, 10}; + input_limit_test_read( + __LINE__, test_files, input, output_limit{0UL}, input_limit{2 * 1024 * 1024UL}, expected); + } +} + +TEST_F(OrcChunkedReaderInputLimitTest, MixedColumns) +{ + auto constexpr num_rows = 1'000'000; + + auto const iter1 = thrust::make_counting_iterator(0); + auto const col1 = int32s_col(iter1, iter1 + num_rows); + + auto const iter2 = thrust::make_counting_iterator(0); + auto const col2 = doubles_col(iter2, iter2 + num_rows); + + auto const strings = std::vector{"abc", "de", "fghi"}; + auto const str_iter = cudf::detail::make_counting_transform_iterator(0, [&](int32_t i) { + if (i < 250000) { return strings[0]; } + if (i < 750000) { return strings[1]; } + return strings[2]; + }); + auto const col3 = strings_col(str_iter, str_iter + num_rows); + + auto const filename = std::string{"mixed_columns"}; + auto const test_files = input_limit_get_test_names(temp_env->get_temp_filepath(filename)); + auto const input = cudf::table_view{{col1, col2, col3}}; + input_limit_test_write(test_files, input); + + { + int constexpr expected[] = {50, 50, 50}; + input_limit_test_read( + __LINE__, test_files, input, output_limit{0UL}, input_limit{1UL}, expected); + } + + { + int constexpr expected[] = {17, 50, 17}; + input_limit_test_read( + __LINE__, test_files, input, output_limit{0UL}, input_limit{2 * 1024 * 1024UL}, expected); + } +} + +namespace { + +struct offset_gen { + int const group_size; + __device__ int operator()(int i) const { return i * group_size; } +}; + +template +struct value_gen { + __device__ T operator()(int i) const { return i % 1024; } +}; + +struct char_values { + __device__ int8_t operator()(int i) const + { + int const index = (i / 2) % 3; + // Generate repeating 3-runs of 2 values each: "aabbccaabbcc...". + return index == 0 ? 'a' : (index == 1 ? 'b' : 'c'); + } +}; + +} // namespace + +TEST_F(OrcChunkedReaderInputLimitTest, ListType) +{ + int constexpr num_rows = 50'000'000; + int constexpr list_size = 4; + + auto const stream = cudf::get_default_stream(); + auto const iter = thrust::make_counting_iterator(0); + + auto offset_col = cudf::make_fixed_width_column( + cudf::data_type{cudf::type_id::INT32}, num_rows + 1, cudf::mask_state::UNALLOCATED); + thrust::transform(rmm::exec_policy(stream), + iter, + iter + num_rows + 1, + offset_col->mutable_view().begin(), + offset_gen{list_size}); + + int constexpr num_ints = num_rows * list_size; + auto value_col = cudf::make_fixed_width_column( + cudf::data_type{cudf::type_id::INT32}, num_ints, cudf::mask_state::UNALLOCATED); + thrust::transform(rmm::exec_policy(stream), + iter, + iter + num_ints, + value_col->mutable_view().begin(), + value_gen{}); + + auto const lists_col = + cudf::make_lists_column(num_rows, std::move(offset_col), std::move(value_col), 0, {}, stream); + + auto const filename = std::string{"list_type"}; + auto const test_files = input_limit_get_test_names(temp_env->get_temp_filepath(filename)); + auto const input = cudf::table_view{{*lists_col}}; + + // Although we set `stripe_size_rows` to be very large, the writer only write + // 250k rows (top level) per stripe due to having nested type. + // Thus, we have 200 stripes in total. + input_limit_test_write(test_files, input, cudf::io::default_stripe_size_rows); + + { + int constexpr expected[] = {3, 40, 3}; + input_limit_test_read( + __LINE__, test_files, input, output_limit{0UL}, input_limit{5 * 1024 * 1024UL}, expected); + } + + { + int constexpr expected[] = {8, 40, 9}; + input_limit_test_read(__LINE__, + test_files, + input, + output_limit{128 * 1024 * 1024UL}, + input_limit{5 * 1024 * 1024UL}, + expected); + } +} + +TEST_F(OrcChunkedReaderInputLimitTest, MixedColumnsHavingList) +{ + int constexpr num_rows = 50'000'000; + int constexpr list_size = 4; + int constexpr str_size = 3; + + auto const stream = cudf::get_default_stream(); + auto const iter = thrust::make_counting_iterator(0); + + // list + auto offset_col = cudf::make_fixed_width_column( + cudf::data_type{cudf::type_id::INT32}, num_rows + 1, cudf::mask_state::UNALLOCATED); + thrust::transform(rmm::exec_policy(stream), + iter, + iter + num_rows + 1, + offset_col->mutable_view().begin(), + offset_gen{list_size}); + + int constexpr num_ints = num_rows * list_size; + auto value_col = cudf::make_fixed_width_column( + cudf::data_type{cudf::type_id::INT32}, num_ints, cudf::mask_state::UNALLOCATED); + thrust::transform(rmm::exec_policy(stream), + iter, + iter + num_ints, + value_col->mutable_view().begin(), + value_gen{}); + + auto const lists_col = + cudf::make_lists_column(num_rows, std::move(offset_col), std::move(value_col), 0, {}, stream); + + // strings + int constexpr num_chars = num_rows * str_size; + auto str_offset_col = cudf::make_fixed_width_column( + cudf::data_type{cudf::type_id::INT32}, num_rows + 1, cudf::mask_state::UNALLOCATED); + thrust::transform(rmm::exec_policy(stream), + iter, + iter + num_rows + 1, + str_offset_col->mutable_view().begin(), + offset_gen{str_size}); + rmm::device_buffer str_chars(num_chars, stream); + thrust::transform(rmm::exec_policy(stream), + iter, + iter + num_chars, + static_cast(str_chars.data()), + char_values{}); + auto const str_col = + cudf::make_strings_column(num_rows, std::move(str_offset_col), std::move(str_chars), 0, {}); + + // doubles + auto const double_col = cudf::make_fixed_width_column( + cudf::data_type{cudf::type_id::FLOAT64}, num_rows, cudf::mask_state::UNALLOCATED); + thrust::transform(rmm::exec_policy(stream), + iter, + iter + num_rows, + double_col->mutable_view().begin(), + value_gen{}); + + auto const filename = std::string{"mixed_cols_having_list"}; + auto const test_files = input_limit_get_test_names(temp_env->get_temp_filepath(filename)); + auto const input = cudf::table_view{{*lists_col, *str_col, *double_col}}; + + // Although we set `stripe_size_rows` to be very large, the writer only write + // 250k rows (top level) per stripe due to having nested type. + // Thus, we have 200 stripes in total. + input_limit_test_write(test_files, input, cudf::io::default_stripe_size_rows); + + { + int constexpr expected[] = {13, 8, 6}; + input_limit_test_read( + __LINE__, test_files, input, output_limit{0UL}, input_limit{128 * 1024 * 1024UL}, expected); + } + + { + int constexpr expected[] = {13, 15, 17}; + input_limit_test_read(__LINE__, + test_files, + input, + output_limit{128 * 1024 * 1024UL}, + input_limit{128 * 1024 * 1024UL}, + expected); + } +} + +TEST_F(OrcChunkedReaderInputLimitTest, ReadWithRowSelection) +{ + // `num_rows` should not be divisible by `stripe_size_rows`, to test the correctness of row + // selections. + int64_t constexpr num_rows = 100'517'687l; + int constexpr rows_per_stripe = 100'000; + static_assert(num_rows % rows_per_stripe != 0, + "`num_rows` should not be divisible by `stripe_size_rows`."); + + auto const it = thrust::make_counting_iterator(0); + auto const col = int32s_col(it, it + num_rows); + auto const input = cudf::table_view{{col}}; + + auto const filepath = temp_env->get_temp_filepath("chunk_read_with_row_selection.orc"); + auto const write_opts = + cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, input) + .stripe_size_rows(rows_per_stripe) + .build(); + cudf::io::write_orc(write_opts); + + // Verify metadata. + auto const metadata = cudf::io::read_orc_metadata(cudf::io::source_info{filepath}); + EXPECT_EQ(metadata.num_rows(), num_rows); + EXPECT_EQ(metadata.num_stripes(), num_rows / rows_per_stripe + 1); + + int constexpr random_val = 123456; + + // Read some random number or rows that is not stripe size. + int constexpr num_rows_to_read = rows_per_stripe * 5 + random_val; + + // Just shift the read data region back by a random offset. + const auto num_rows_to_skip = num_rows - num_rows_to_read - random_val; + + const auto sequence_start = num_rows_to_skip % num_rows; + auto const skipped_col = int32s_col(it + sequence_start, it + sequence_start + num_rows_to_read); + auto const expected = cudf::table_view{{skipped_col}}; + + auto const read_opts = cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}) + .use_index(false) + .skip_rows(num_rows_to_skip) + .num_rows(num_rows_to_read) + .build(); + + auto reader = cudf::io::chunked_orc_reader( + 60'000UL * sizeof(int) /*output limit, equal to 60k rows, less than rows in 1 stripe*/, + rows_per_stripe * sizeof(int) /*input limit, around size of 1 stripe's decoded data*/, + 50'000 /*output granularity, or minimum number of rows for the output chunk*/, + read_opts); + + auto num_chunks = 0; + auto read_tables = std::vector>{}; + auto tviews = std::vector{}; + + do { + auto chunk = reader.read_chunk(); + // Each output chunk should have either exactly 50k rows, or num_rows_to_read % 50k. + EXPECT_TRUE(chunk.tbl->num_rows() == 50000 || + chunk.tbl->num_rows() == num_rows_to_read % 50000); + + tviews.emplace_back(chunk.tbl->view()); + read_tables.emplace_back(std::move(chunk.tbl)); + ++num_chunks; + } while (reader.has_next()); + + auto const read_result = cudf::concatenate(tviews); + EXPECT_EQ(num_chunks, 13); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, read_result->view()); +} + +TEST_F(OrcChunkedReaderInputLimitTest, SizeTypeRowsOverflow) +{ + using data_type = int16_t; + using data_col = cudf::test::fixed_width_column_wrapper; + + int64_t constexpr num_rows = 500'000'000l; + int constexpr rows_per_stripe = 1'000'000; + int constexpr num_reps = 10; + int64_t constexpr total_rows = num_rows * num_reps; + static_assert(total_rows > std::numeric_limits::max()); + + auto const it = cudf::detail::make_counting_transform_iterator(0l, [num_rows](int64_t i) { + return (i % num_rows) % static_cast(std::numeric_limits::max() / 2); + }); + auto const col = data_col(it, it + num_rows); + auto const chunk_table = cudf::table_view{{col}}; + + std::vector data_buffer; + { + auto const write_opts = + cudf::io::chunked_orc_writer_options::builder(cudf::io::sink_info{&data_buffer}) + .stripe_size_rows(rows_per_stripe) + .build(); + + auto writer = cudf::io::orc_chunked_writer(write_opts); + for (int i = 0; i < num_reps; ++i) { + writer.write(chunk_table); + } + } + + // Verify metadata. + auto const metadata = + cudf::io::read_orc_metadata(cudf::io::source_info{data_buffer.data(), data_buffer.size()}); + EXPECT_EQ(metadata.num_rows(), total_rows); + EXPECT_EQ(metadata.num_stripes(), total_rows / rows_per_stripe); + + // Read with row selections and memory limit. + { + int64_t constexpr num_rows_to_read = 5'000'000l; + int64_t const num_rows_to_skip = + static_cast(metadata.num_rows()) - num_rows_to_read - + 123456l /*just shift the read data region back by a random offset*/; + + // Check validity of the last 5 million rows. + auto const sequence_start = num_rows_to_skip % num_rows; + auto const skipped_col = data_col(it + sequence_start, it + sequence_start + num_rows_to_read); + auto const expected = cudf::table_view{{skipped_col}}; + + auto const read_opts = cudf::io::orc_reader_options::builder( + cudf::io::source_info{data_buffer.data(), data_buffer.size()}) + .use_index(false) + .skip_rows(num_rows_to_skip) + .num_rows(num_rows_to_read) + .build(); + auto reader = cudf::io::chunked_orc_reader( + 600'000UL * sizeof(data_type) /* output limit, equal to 600k rows */, + rows_per_stripe * sizeof(data_type) /* input limit, around size of 1 stripe's decoded data */, + rows_per_stripe / 2 /* output granularity, or minimum number of rows for the output chunk */, + read_opts); + + auto num_chunks = 0; + auto read_tables = std::vector>{}; + auto tviews = std::vector{}; + + do { + auto chunk = reader.read_chunk(); + ++num_chunks; + tviews.emplace_back(chunk.tbl->view()); + read_tables.emplace_back(std::move(chunk.tbl)); + } while (reader.has_next()); + + auto const read_result = cudf::concatenate(tviews); + EXPECT_EQ(num_chunks, 11); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, read_result->view()); + } + + // The test below requires a huge amount of memory, thus it is disabled by default. +#ifdef LOCAL_TEST + // Read with only output limit -- there is no limit on the memory usage. + // However, the reader should be able to detect and load only enough stripes each time + // to avoid decoding a table having number of rows that exceeds the column size limit. + { + auto const read_opts = cudf::io::orc_reader_options::builder( + cudf::io::source_info{data_buffer.data(), data_buffer.size()}) + .use_index(false) + .build(); + auto reader = cudf::io::chunked_orc_reader( + static_cast(rows_per_stripe * 5.7) * + sizeof(data_type) /* output limit, equal to 5.7M rows */, + 0UL /* no input limit */, + rows_per_stripe / 2 /* output granularity, or minimum number of rows for the output chunk */, + read_opts); + + int num_chunks = 0; + int64_t num_read_rows = 0; + int64_t test_rows_start = 0; + auto test_chunk = std::unique_ptr{}; + + do { + auto chunk = reader.read_chunk(); + auto const chunk_rows = chunk.tbl->num_rows(); + + // Just randomly select one output chunk to verify. + if (num_chunks == 123) { + test_rows_start = num_read_rows; + test_chunk = std::move(chunk.tbl); + } + + ++num_chunks; + num_read_rows += chunk_rows; + } while (reader.has_next()); + + EXPECT_EQ(num_read_rows, total_rows); + + // Typically, we got a chunk having 5M rows. + // However, since the reader internally splits file stripes that are not multiple of 5 stripes, + // we may have some extra chunks that have less than 5M rows. + EXPECT_EQ(num_chunks, 1002); + + // Verify the selected chunk. + using namespace cudf::test::iterators; + auto const skipped_col = + data_col(it + test_rows_start, it + test_rows_start + test_chunk->num_rows(), no_nulls()); + auto const expected = cudf::table_view{{skipped_col}}; + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, test_chunk->view()); + } + +#endif // LOCAL_TEST +} diff --git a/cpp/tests/io/row_selection_test.cpp b/cpp/tests/io/row_selection_test.cpp index 0c259c81a23..ebadd870091 100644 --- a/cpp/tests/io/row_selection_test.cpp +++ b/cpp/tests/io/row_selection_test.cpp @@ -122,17 +122,4 @@ TEST_F(FromOptsTest, LimitOptionsToFileRows) } } -TEST_F(FromOptsTest, OverFlowDetection) -{ - auto const too_large_for_32bit = std::numeric_limits::max(); - - // Too many rows to read until the end of the file - EXPECT_THROW(skip_rows_num_rows_from_options(0, std::nullopt, too_large_for_32bit), - std::overflow_error); - - // Should work fine with num_rows - EXPECT_NO_THROW( - skip_rows_num_rows_from_options(1000, too_large_for_32bit - 100, too_large_for_32bit)); -} - CUDF_TEST_PROGRAM_MAIN() diff --git a/python/cudf/cudf/_lib/cpp/io/orc.pxd b/python/cudf/cudf/_lib/cpp/io/orc.pxd index d5ac8574fe4..d5bb1726a43 100644 --- a/python/cudf/cudf/_lib/cpp/io/orc.pxd +++ b/python/cudf/cudf/_lib/cpp/io/orc.pxd @@ -1,6 +1,6 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. -from libc.stdint cimport uint8_t +from libc.stdint cimport int64_t, uint8_t from libcpp cimport bool from libcpp.map cimport map from libcpp.memory cimport shared_ptr, unique_ptr @@ -21,8 +21,8 @@ cdef extern from "cudf/io/orc.hpp" \ cudf_io_types.source_info get_source() except + vector[vector[size_type]] get_stripes() except + - size_type get_skip_rows() except + - size_type get_num_rows() except + + int64_t get_skip_rows() except + + optional[int64_t] get_num_rows() except + bool is_enabled_use_index() except + bool is_enabled_use_np_dtypes() except + data_type get_timestamp_type() except + @@ -31,8 +31,8 @@ cdef extern from "cudf/io/orc.hpp" \ void set_columns(vector[string] col_names) except + void set_stripes(vector[vector[size_type]] strps) except + - void set_skip_rows(size_type rows) except + - void set_num_rows(size_type nrows) except + + void set_skip_rows(int64_t rows) except + + void set_num_rows(int64_t nrows) except + void enable_use_index(bool val) except + void enable_use_np_dtypes(bool val) except + void set_timestamp_type(data_type type) except + @@ -49,8 +49,8 @@ cdef extern from "cudf/io/orc.hpp" \ orc_reader_options_builder& columns(vector[string] col_names) except + orc_reader_options_builder& \ stripes(vector[vector[size_type]] strps) except + - orc_reader_options_builder& skip_rows(size_type rows) except + - orc_reader_options_builder& num_rows(size_type nrows) except + + orc_reader_options_builder& skip_rows(int64_t rows) except + + orc_reader_options_builder& num_rows(int64_t nrows) except + orc_reader_options_builder& use_index(bool val) except + orc_reader_options_builder& use_np_dtypes(bool val) except + orc_reader_options_builder& timestamp_type(data_type type) except + diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx index 836880a6f2c..918880648bf 100644 --- a/python/cudf/cudf/_lib/orc.pyx +++ b/python/cudf/cudf/_lib/orc.pyx @@ -472,11 +472,11 @@ cdef int64_t get_skiprows_arg(object arg) except*: raise TypeError("skiprows must be an int >= 0") return arg -cdef size_type get_num_rows_arg(object arg) except*: +cdef int64_t get_num_rows_arg(object arg) except*: arg = -1 if arg is None else arg if not isinstance(arg, int) or arg < -1: raise TypeError("num_rows must be an int >= -1") - return arg + return arg cdef orc_reader_options make_orc_reader_options( @@ -484,7 +484,7 @@ cdef orc_reader_options make_orc_reader_options( object column_names, object stripes, int64_t skip_rows, - size_type num_rows, + int64_t num_rows, type_id timestamp_type, bool use_index ) except*: