Skip to content

Commit

Permalink
Merge branch 'branch-24.04' into bug_hd_vector
Browse files Browse the repository at this point in the history
  • Loading branch information
ttnghia authored Mar 11, 2024
2 parents 4ed86b9 + c4f1a26 commit 259d082
Show file tree
Hide file tree
Showing 28 changed files with 711 additions and 2,083 deletions.
5 changes: 4 additions & 1 deletion cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,8 @@ endif()
rapids_cpm_init()
# find jitify
include(cmake/thirdparty/get_jitify.cmake)
# find NVTX
include(cmake/thirdparty/get_nvtx.cmake)
# find nvCOMP
include(cmake/thirdparty/get_nvcomp.cmake)
# find CCCL before rmm so that we get cudf's patched version of CCCL
Expand Down Expand Up @@ -382,6 +384,7 @@ add_library(
src/io/json/read_json.cu
src/io/json/legacy/json_gpu.cu
src/io/json/legacy/reader_impl.cu
src/io/json/parser_features.cpp
src/io/json/write_json.cu
src/io/orc/aggregate_orc_metadata.cpp
src/io/orc/dict_enc.cu
Expand Down Expand Up @@ -776,7 +779,7 @@ add_dependencies(cudf jitify_preprocess_run)
target_link_libraries(
cudf
PUBLIC ${ARROW_LIBRARIES} CCCL::CCCL rmm::rmm
PRIVATE cuco::cuco ZLIB::ZLIB nvcomp::nvcomp kvikio::kvikio
PRIVATE nvtx3-cpp cuco::cuco ZLIB::ZLIB nvcomp::nvcomp kvikio::kvikio
$<TARGET_NAME_IF_EXISTS:cuFile_interface>
)

Expand Down
2 changes: 1 addition & 1 deletion cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ target_compile_options(
target_link_libraries(
cudf_datagen
PUBLIC GTest::gmock GTest::gtest benchmark::benchmark nvbench::nvbench Threads::Threads cudf
cudftestutil
cudftestutil nvtx3-cpp
PRIVATE $<TARGET_NAME_IF_EXISTS:conda_env>
)

Expand Down
53 changes: 42 additions & 11 deletions cpp/benchmarks/groupby/group_max.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,25 +22,30 @@
#include <nvbench/nvbench.cuh>

template <typename Type>
void bench_groupby_max(nvbench::state& state, nvbench::type_list<Type>)
void groupby_max_helper(nvbench::state& state,
cudf::size_type num_rows,
cudf::size_type cardinality,
double null_probability)
{
auto const size = static_cast<cudf::size_type>(state.get_int64("num_rows"));

auto const keys = [&] {
data_profile const profile = data_profile_builder().cardinality(0).no_validity().distribution(
cudf::type_to_id<int32_t>(), distribution_id::UNIFORM, 0, 100);
return create_random_column(cudf::type_to_id<int32_t>(), row_count{size}, profile);
data_profile const profile =
data_profile_builder()
.cardinality(cardinality)
.no_validity()
.distribution(cudf::type_to_id<int32_t>(), distribution_id::UNIFORM, 0, num_rows);
return create_random_column(cudf::type_to_id<int32_t>(), row_count{num_rows}, profile);
}();

auto const vals = [&] {
auto builder = data_profile_builder().cardinality(0).distribution(
cudf::type_to_id<Type>(), distribution_id::UNIFORM, 0, 1000);
if (const auto null_freq = state.get_float64("null_probability"); null_freq > 0) {
builder.null_probability(null_freq);
cudf::type_to_id<Type>(), distribution_id::UNIFORM, 0, num_rows);
if (null_probability > 0) {
builder.null_probability(null_probability);
} else {
builder.no_validity();
}
return create_random_column(cudf::type_to_id<Type>(), row_count{size}, data_profile{builder});
return create_random_column(
cudf::type_to_id<Type>(), row_count{num_rows}, data_profile{builder});
}();

auto keys_view = keys->view();
Expand All @@ -55,13 +60,39 @@ void bench_groupby_max(nvbench::state& state, nvbench::type_list<Type>)
state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
state.exec(nvbench::exec_tag::sync,
[&](nvbench::launch& launch) { auto const result = gb_obj.aggregate(requests); });

auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
state.add_element_count(static_cast<double>(num_rows) / elapsed_time / 1'000'000., "Mrows/s");
state.add_buffer_size(
mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
}

// Entry point for the typed groupby-max benchmark: reads every axis value
// from the nvbench state and forwards them to the shared helper, which
// builds the random keys/values columns and runs the aggregation.
template <typename Type>
void bench_groupby_max(nvbench::state& state, nvbench::type_list<Type>)
{
  auto const n_rows    = static_cast<cudf::size_type>(state.get_int64("num_rows"));
  auto const n_groups  = static_cast<cudf::size_type>(state.get_int64("cardinality"));
  auto const null_prob = state.get_float64("null_probability");

  groupby_max_helper<Type>(state, n_rows, n_groups, null_prob);
}

// Cardinality-sweep variant of the groupby-max benchmark: the row count is
// pinned at 20M and nulls are disabled so that only the "cardinality" axis
// (read from the nvbench state) varies across runs.
template <typename Type>
void bench_groupby_max_cardinality(nvbench::state& state, nvbench::type_list<Type>)
{
  constexpr auto kNumRows  = 20'000'000;  // fixed input size for every data point
  constexpr auto kNullProb = 0.;          // no nulls in this sweep

  auto const n_groups = static_cast<cudf::size_type>(state.get_int64("cardinality"));

  groupby_max_helper<Type>(state, kNumRows, n_groups, kNullProb);
}

// Register the typed groupby-max benchmark: sweeps row count (2^12..2^24)
// and null probability for several value types at a fixed cardinality of 0.
NVBENCH_BENCH_TYPES(bench_groupby_max,
NVBENCH_TYPE_AXES(nvbench::type_list<int32_t, int64_t, float, double>))
.set_name("groupby_max")
.add_int64_axis("cardinality", {0})
.add_int64_power_of_two_axis("num_rows", {12, 18, 24})
.add_float64_axis("null_probability", {0, 0.1, 0.9});

// Register the cardinality-only sweep (int32 values; row count and null
// probability are fixed inside bench_groupby_max_cardinality).
NVBENCH_BENCH_TYPES(bench_groupby_max_cardinality, NVBENCH_TYPE_AXES(nvbench::type_list<int32_t>))
.set_name("groupby_max_cardinality")
.add_int64_axis("cardinality", {10, 20, 50, 100, 1'000, 10'000, 100'000, 1'000'000, 10'000'000});
19 changes: 13 additions & 6 deletions cpp/benchmarks/groupby/group_nunique.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION.
* Copyright (c) 2022-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -39,17 +39,23 @@ auto make_aggregation_request_vector(cudf::column_view const& values, Args&&...
template <typename Type>
void bench_groupby_nunique(nvbench::state& state, nvbench::type_list<Type>)
{
auto const size = static_cast<cudf::size_type>(state.get_int64("num_rows"));
auto const size = static_cast<cudf::size_type>(state.get_int64("num_rows"));
auto const cardinality = static_cast<cudf::size_type>(state.get_int64("cardinality"));

auto const keys = [&] {
data_profile profile = data_profile_builder().cardinality(0).no_validity().distribution(
cudf::type_to_id<int32_t>(), distribution_id::UNIFORM, 0, 100);
data_profile profile =
data_profile_builder()
.cardinality(cardinality)
.no_validity()
.distribution(cudf::type_to_id<int32_t>(), distribution_id::UNIFORM, 0, size);
return create_random_column(cudf::type_to_id<int32_t>(), row_count{size}, profile);
}();

auto const vals = [&] {
data_profile profile = data_profile_builder().cardinality(0).distribution(
cudf::type_to_id<Type>(), distribution_id::UNIFORM, 0, 1000);
data_profile profile =
data_profile_builder()
.cardinality(cardinality)
.distribution(cudf::type_to_id<Type>(), distribution_id::UNIFORM, 0, size);
if (const auto null_freq = state.get_float64("null_probability"); null_freq > 0) {
profile.set_null_probability(null_freq);
} else {
Expand All @@ -71,4 +77,5 @@ void bench_groupby_nunique(nvbench::state& state, nvbench::type_list<Type>)
NVBENCH_BENCH_TYPES(bench_groupby_nunique, NVBENCH_TYPE_AXES(nvbench::type_list<int32_t, int64_t>))
.set_name("groupby_nunique")
.add_int64_power_of_two_axis("num_rows", {12, 16, 20, 24})
.add_int64_axis("cardinality", {0})
.add_float64_axis("null_probability", {0, 0.5});
12 changes: 7 additions & 5 deletions cpp/benchmarks/groupby/group_rank.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION.
* Copyright (c) 2022-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -31,10 +31,12 @@ static void nvbench_groupby_rank(nvbench::state& state,

bool const is_sorted = state.get_int64("is_sorted");
cudf::size_type const column_size = state.get_int64("data_size");
constexpr int num_groups = 100;
auto const cardinality = static_cast<cudf::size_type>(state.get_int64("cardinality"));

data_profile const profile = data_profile_builder().cardinality(0).no_validity().distribution(
dtype, distribution_id::UNIFORM, 0, num_groups);
data_profile const profile = data_profile_builder()
.cardinality(cardinality)
.no_validity()
.distribution(dtype, distribution_id::UNIFORM, 0, column_size);

auto source_table = create_random_table({dtype, dtype}, row_count{column_size}, profile);

Expand Down Expand Up @@ -100,5 +102,5 @@ NVBENCH_BENCH_TYPES(nvbench_groupby_rank, NVBENCH_TYPE_AXES(methods))
10000000, // 10M
100000000, // 100M
})

.add_int64_axis("cardinality", {0})
.add_int64_axis("is_sorted", {0, 1});
27 changes: 27 additions & 0 deletions cpp/cmake/thirdparty/get_nvtx.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# =============================================================================
# Copyright (c) 2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied. See the License for the specific language governing permissions and limitations under
# the License.
# =============================================================================

# This function finds NVTX and sets any additional necessary environment variables.
# The nvtx3-c and nvtx3-cpp targets are promoted to global scope so any target in
# the build tree can link against them.
function(find_and_configure_nvtx)
# Fetch NVTX v3.1.0 through CPM if it is not already available.
# SOURCE_SUBDIR c points the build at the CMake project inside the NVTX repo.
rapids_cpm_find(
NVTX3 3.1.0
GLOBAL_TARGETS nvtx3-c nvtx3-cpp
CPM_ARGS
GIT_REPOSITORY https://github.com/NVIDIA/NVTX.git
GIT_TAG v3.1.0
GIT_SHALLOW TRUE SOURCE_SUBDIR c
)
endfunction()

find_and_configure_nvtx()
4 changes: 2 additions & 2 deletions cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -664,11 +664,11 @@ defaults.
## NVTX Ranges

In order to aid in performance optimization and debugging, all compute intensive libcudf functions
should have a corresponding NVTX range. Choose between `CUDF_FUNC_RANGE` or `cudf::thread_range`
should have a corresponding NVTX range. Choose between `CUDF_FUNC_RANGE` or `cudf::scoped_range`
for declaring NVTX ranges in the current scope:
- Use the `CUDF_FUNC_RANGE()` macro if you want to use the name of the function as the name of the
NVTX range
- Use `cudf::thread_range rng{"custom_name"};` to provide a custom name for the current scope's
- Use `cudf::scoped_range rng{"custom_name"};` to provide a custom name for the current scope's
NVTX range

For more information about NVTX, see [here](https://github.com/NVIDIA/NVTX/tree/dev/c).
Expand Down
Loading

0 comments on commit 259d082

Please sign in to comment.