Write string data directly to column_buffer in Parquet reader (#13302)
The current Parquet reader decodes string data into a list of {ptr, length} tuples, which are then used in a gather step by `make_strings_column`. This gather step can be time-consuming, especially when there are a large number of string columns. This PR changes the decode step to write character and offset data directly to the `column_buffer`, which can then be used as-is, bypassing the gather step.
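The idea can be sketched on the host: once each page's total string size is known, an exclusive scan over those sizes yields the offset at which each page may start writing into the shared character buffer, so no separate gather pass is needed. This is a minimal illustrative sketch, not the actual cuDF kernels; `page_output_offsets` and `decode_directly` are hypothetical names standing in for the size-computation and string-decode kernels.

```cpp
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <numeric>
#include <string>
#include <vector>

// Stand-in for the size-computation kernel: given the total string bytes held
// by each page, an exclusive scan yields the offset at which each page's
// decode may start writing into the shared chars buffer.
std::vector<std::size_t> page_output_offsets(std::vector<std::size_t> const& page_string_sizes)
{
  std::vector<std::size_t> offsets(page_string_sizes.size());
  std::exclusive_scan(
    page_string_sizes.begin(), page_string_sizes.end(), offsets.begin(), std::size_t{0});
  return offsets;
}

// Stand-in for the decode step: each "page" copies its characters directly to
// its pre-computed slot in the output buffer -- no gather required.
std::string decode_directly(std::vector<std::string> const& pages)
{
  std::vector<std::size_t> sizes;
  for (auto const& p : pages) sizes.push_back(p.size());
  auto const offsets = page_output_offsets(sizes);

  std::string chars(sizes.empty() ? 0 : offsets.back() + sizes.back(), '\0');
  for (std::size_t i = 0; i < pages.size(); ++i)
    std::copy(pages[i].begin(), pages[i].end(), chars.begin() + offsets[i]);
  return chars;
}
```

On the GPU the scan runs across pages and the copies are done by decode threads, but the offset arithmetic is the same.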

The image below compares the new approach to the old. The green arc at the top (82ms) is `gpuDecodePageData`, and the red arc (252ms) is the time spent in `make_strings_column`.  The green arc below (25ms) is `gpuDecodePageData`, the amber arc (22ms) is a new kernel that computes string sizes for each page, and the magenta arc (106ms) is the kernel that decodes string columns.
![flat_edited](https://user-images.githubusercontent.com/25541553/236529570-f2d0d8d4-b2b5-4078-93ae-5123fa489c3c.png)

NVBench shows a good speedup for strings as well.  There is a jump in time for the INTEGRAL benchmark, but little to no change for other data types.  The INTEGRAL time seems to be affected by extra time spent in `malloc` allocating host memory for a `hostdevice_vector`. This `malloc` always occurs, but for some reason it takes much longer to return in this branch.

The comparison baseline is @nvdbaranec's branch for #13203.
```
|  data_type  |      io       |  cardinality  |  run_length  |   Ref Time |   Cmp Time |        Diff |   %Diff |  
|-------------|---------------|---------------|--------------|------------|------------|-------------|---------| 
|  INTEGRAL   | DEVICE_BUFFER |       0       |      1       |  14.288 ms |  14.729 ms |  440.423 us |   3.08% |   
|  INTEGRAL   | DEVICE_BUFFER |     1000      |      1       |  13.397 ms |  13.997 ms |  600.596 us |   4.48% |   
|  INTEGRAL   | DEVICE_BUFFER |       0       |      32      |  11.831 ms |  12.354 ms |  522.485 us |   4.42% |   
|  INTEGRAL   | DEVICE_BUFFER |     1000      |      32      |  11.335 ms |  11.854 ms |  518.791 us |   4.58% |   
|    FLOAT    | DEVICE_BUFFER |       0       |      1       |   8.681 ms |   8.715 ms |   34.846 us |   0.40% |   
|    FLOAT    | DEVICE_BUFFER |     1000      |      1       |   8.473 ms |   8.472 ms |   -0.680 us |  -0.01% |   
|    FLOAT    | DEVICE_BUFFER |       0       |      32      |   7.217 ms |   7.192 ms |  -25.311 us |  -0.35% |   
|    FLOAT    | DEVICE_BUFFER |     1000      |      32      |   7.425 ms |   7.422 ms |   -3.162 us |  -0.04% |   
|   STRING    | DEVICE_BUFFER |       0       |      1       |  50.079 ms |  42.566 ms |-7513.004 us | -15.00% |   
|   STRING    | DEVICE_BUFFER |     1000      |      1       |  16.813 ms |  14.989 ms |-1823.660 us | -10.85% |   
|   STRING    | DEVICE_BUFFER |       0       |      32      |  49.875 ms |  42.443 ms |-7432.718 us | -14.90% |   
|   STRING    | DEVICE_BUFFER |     1000      |      32      |  15.312 ms |  13.953 ms |-1358.910 us |  -8.87% |   
|    LIST     | DEVICE_BUFFER |       0       |      1       |  80.303 ms |  80.688 ms |  385.916 us |   0.48% |   
|    LIST     | DEVICE_BUFFER |     1000      |      1       |  71.921 ms |  72.356 ms |  435.153 us |   0.61% |   
|    LIST     | DEVICE_BUFFER |       0       |      32      |  61.658 ms |  62.129 ms |  471.022 us |   0.76% |   
|    LIST     | DEVICE_BUFFER |     1000      |      32      |  63.086 ms |  63.371 ms |  285.608 us |   0.45% |   
|   STRUCT    | DEVICE_BUFFER |       0       |      1       |  66.272 ms |  61.142 ms |-5130.639 us |  -7.74% |   
|   STRUCT    | DEVICE_BUFFER |     1000      |      1       |  40.217 ms |  39.328 ms | -888.781 us |  -2.21% |   
|   STRUCT    | DEVICE_BUFFER |       0       |      32      |  63.660 ms |  58.837 ms |-4822.647 us |  -7.58% |   
|   STRUCT    | DEVICE_BUFFER |     1000      |      32      |  38.080 ms |  37.104 ms | -976.133 us |  -2.56% | 
```

May address #13024 

~Depends on #13203~

Authors:
  - Ed Seidl (https://github.com/etseidl)
  - https://github.com/nvdbaranec
  - Vukasin Milovanovic (https://github.com/vuule)
  - Nghia Truong (https://github.com/ttnghia)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Mike Wilson (https://github.com/hyperbolic2346)
  - https://github.com/nvdbaranec
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #13302
etseidl authored Jun 23, 2023
1 parent 0b4e354 commit 0fc31a7
Showing 16 changed files with 2,803 additions and 1,403 deletions.
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
@@ -392,6 +392,7 @@ add_library(
src/io/parquet/chunk_dict.cu
src/io/parquet/page_enc.cu
src/io/parquet/page_hdr.cu
src/io/parquet/page_string_decode.cu
src/io/parquet/reader.cpp
src/io/parquet/reader_impl.cpp
src/io/parquet/reader_impl_helpers.cpp
19 changes: 19 additions & 0 deletions cpp/benchmarks/common/generate_input.cu
@@ -785,6 +785,25 @@ std::vector<cudf::type_id> cycle_dtypes(std::vector<cudf::type_id> const& dtype_
return out_dtypes;
}

/**
* @brief Repeat the given two data types with a given ratio of a:b.
*
* The first dtype will have 'first_num' columns and the second will have 'num_cols - first_num'
* columns.
*/
std::vector<cudf::type_id> mix_dtypes(std::pair<cudf::type_id, cudf::type_id> const& dtype_ids,
cudf::size_type num_cols,
int first_num)
{
std::vector<cudf::type_id> out_dtypes;
out_dtypes.reserve(num_cols);
for (cudf::size_type col = 0; col < first_num; ++col)
out_dtypes.push_back(dtype_ids.first);
for (cudf::size_type col = first_num; col < num_cols; ++col)
out_dtypes.push_back(dtype_ids.second);
return out_dtypes;
}

std::unique_ptr<cudf::table> create_random_table(std::vector<cudf::type_id> const& dtype_ids,
table_size_bytes table_bytes,
data_profile const& profile,
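The new `mix_dtypes` helper above is straightforward to exercise. The sketch below reimplements its logic with a stand-in `type_id` enum so it compiles without the cudf headers; the real benchmark passes `cudf::type_id` values instead.

```cpp
#include <cassert>
#include <utility>
#include <vector>

// Stand-in for cudf::type_id so this sketch is self-contained.
enum class type_id { INT32, STRING };

// Mirrors the mix_dtypes helper: the first first_num columns get the first
// type, the remaining num_cols - first_num columns get the second.
std::vector<type_id> mix_dtypes(std::pair<type_id, type_id> const& dtype_ids,
                                int num_cols,
                                int first_num)
{
  std::vector<type_id> out;
  out.reserve(num_cols);
  for (int col = 0; col < first_num; ++col) out.push_back(dtype_ids.first);
  for (int col = first_num; col < num_cols; ++col) out.push_back(dtype_ids.second);
  return out;
}
```

For example, `mix_dtypes({STRING, INT32}, 4, 1)` yields one string column followed by three int32 columns -- the shape the small-mixed benchmark below sweeps via its `num_string_cols` axis.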
15 changes: 15 additions & 0 deletions cpp/benchmarks/common/generate_input.hpp
@@ -666,6 +666,21 @@ std::unique_ptr<cudf::table> create_sequence_table(
*/
std::vector<cudf::type_id> cycle_dtypes(std::vector<cudf::type_id> const& dtype_ids,
cudf::size_type num_cols);

/**
* @brief Repeat the given two data types with a given ratio of a:b.
*
* The first dtype will have 'first_num' columns and the second will have 'num_cols - first_num'
* columns.
*
* @param dtype_ids Pair of requested column types
* @param num_cols Total number of columns in the output vector
* @param first_num Total number of columns of type `dtype_ids.first`
* @return A vector of type_ids
*/
std::vector<cudf::type_id> mix_dtypes(std::pair<cudf::type_id, cudf::type_id> const& dtype_ids,
cudf::size_type num_cols,
int first_num);
/**
* @brief Create a random null mask object
*
41 changes: 41 additions & 0 deletions cpp/benchmarks/io/parquet/parquet_reader_input.cpp
@@ -114,6 +114,38 @@ void BM_parquet_read_io_compression(
parquet_read_common(write_opts, source_sink, state);
}

template <cudf::io::io_type IOType>
void BM_parquet_read_io_small_mixed(nvbench::state& state,
nvbench::type_list<nvbench::enum_type<IOType>>)
{
auto const d_type =
std::pair<cudf::type_id, cudf::type_id>{cudf::type_id::STRING, cudf::type_id::INT32};

cudf::size_type const cardinality = state.get_int64("cardinality");
cudf::size_type const run_length = state.get_int64("run_length");
cudf::size_type const num_strings = state.get_int64("num_string_cols");
auto const source_type = IOType;

// want 80 pages total, across 4 columns, so 20 pages per column
cudf::size_type constexpr n_col = 4;
cudf::size_type constexpr page_size_rows = 10'000;
cudf::size_type constexpr num_rows = page_size_rows * (80 / n_col);

auto const tbl =
create_random_table(mix_dtypes(d_type, n_col, num_strings),
row_count{num_rows},
data_profile_builder().cardinality(cardinality).avg_run_length(run_length));
auto const view = tbl->view();

cuio_source_sink_pair source_sink(source_type);
cudf::io::parquet_writer_options write_opts =
cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), view)
.max_page_size_rows(10'000)
.compression(cudf::io::compression_type::NONE);

parquet_read_common(write_opts, source_sink, state);
}

template <data_type DataType, cudf::io::io_type IOType>
void BM_parquet_read_chunks(
nvbench::state& state,
@@ -203,3 +235,12 @@ NVBENCH_BENCH_TYPES(BM_parquet_read_chunks,
.add_int64_axis("cardinality", {0, 1000})
.add_int64_axis("run_length", {1, 32})
.add_int64_axis("byte_limit", {0, 500'000});

NVBENCH_BENCH_TYPES(BM_parquet_read_io_small_mixed,
NVBENCH_TYPE_AXES(nvbench::enum_type_list<cudf::io::io_type::FILEPATH>))
.set_name("parquet_read_io_small_mixed")
.set_type_axes_names({"io"})
.set_min_samples(4)
.add_int64_axis("cardinality", {0, 1000})
.add_int64_axis("run_length", {1, 32})
.add_int64_axis("num_string_cols", {1, 2, 3});
6 changes: 3 additions & 3 deletions cpp/src/io/orc/reader_impl.cu
@@ -525,13 +525,13 @@ void update_null_mask(cudf::detail::hostdevice_2dvector<gpu::ColumnDesc>& chunks
};
});

out_buffers[col_idx]._null_mask = std::move(merged_null_mask);
out_buffers[col_idx].set_null_mask(std::move(merged_null_mask));

} else {
// Since child column doesn't have a mask, copy parent null mask
auto mask_size = bitmask_allocation_size_bytes(parent_mask_len);
out_buffers[col_idx]._null_mask =
rmm::device_buffer(static_cast<void*>(parent_valid_map_base), mask_size, stream, mr);
out_buffers[col_idx].set_null_mask(
rmm::device_buffer(static_cast<void*>(parent_valid_map_base), mask_size, stream, mr));
}
}
}
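The ORC hunk above swaps direct assignment to `_null_mask` for a `set_null_mask` setter. A plausible motivation for routing updates through a setter is that it lets the buffer invalidate any state derived from the mask, which a raw member assignment would silently leave stale. This is an illustrative sketch only, not the actual cuDF `column_buffer`:

```cpp
#include <cassert>
#include <cstddef>
#include <optional>
#include <utility>
#include <vector>

// Hypothetical buffer illustrating the setter pattern: replacing the mask
// through set_null_mask also clears the cached null count, so it cannot go
// stale the way it could after a direct write to the member.
class column_buffer_sketch {
 public:
  void set_null_mask(std::vector<std::byte> mask)
  {
    _null_mask = std::move(mask);
    _null_count.reset();  // derived value must be recomputed from the new mask
  }
  std::size_t null_mask_size() const { return _null_mask.size(); }
  bool has_cached_null_count() const { return _null_count.has_value(); }

 private:
  std::vector<std::byte> _null_mask;
  std::optional<std::size_t> _null_count{42};  // pretend a count was cached
};
```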