Skip to content

Commit

Permalink
Support for Pascal GPUs (which lack memcpy_async)
Browse files Browse the repository at this point in the history
* first pass
* pascal tomfoolery
* Mithun fixed it
* Figured out conditional compile.
** Must happen in __device__ context.
* Experiments with __CUDA_ARCH__:
** Got it working with __global__, __device__, and thrust.
* Initial stab at ASYNC_MEMCPY_SUPPORTED:
** 1. Found out that __host__ code does not have __CUDA_ARCH__ set.
      Everywhere else, this can be used reliably.
** 2. Replaced all the __CUDA_ARCH__ checks with ASYNC_MEMCPY_SUPPORTED.
      This is correct for all sites, EXCEPT convert_to/from_rows(),
      because those are __host__.
** 3. Running out of memory on Ampere box, for some reason.
* Completed changes for __host__ code:
** 1. Changed convert_to_rows(), convert_from_rows() to use ifndef __CUDA_ARCH__.
** 2. Added comments for barrier initialization.
* Reduced scope of ASYNC_MEMCPY_SUPPORTED in some if statements.
* Formatting.
* Updated JNI/row_conversion.cu.
  • Loading branch information
hyperbolic2346 authored and mythrocks committed Feb 18, 2022
1 parent f263820 commit 8a8afaa
Show file tree
Hide file tree
Showing 9 changed files with 5,929 additions and 61 deletions.
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -388,6 +388,7 @@ add_library(
src/rolling/rolling.cu
src/rolling/rolling_collect_list.cu
src/round/round.cu
src/row_conversion/row_conversion.cu
src/scalar/scalar.cpp
src/scalar/scalar_factories.cpp
src/search/search.cu
Expand Down
4 changes: 4 additions & 0 deletions cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,10 @@ ConfigureBench(JSON_BENCH string/json.cpp)
# * io benchmark ---------------------------------------------------------------------
ConfigureBench(MULTIBYTE_SPLIT_BENCHMARK io/text/multibyte_split.cpp)

###################################################################################################
# - row conversion benchmark ---------------------------------------------------------
ConfigureBench(ROW_CONVERSION_BENCH row_conversion/row_conversion.cpp)

add_custom_target(
run_benchmarks
DEPENDS CUDF_BENCHMARKS
Expand Down
181 changes: 181 additions & 0 deletions cpp/benchmarks/row_conversion/row_conversion.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmark/benchmark.h>
#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

#include <cudf/lists/lists_column_view.hpp>
#include <cudf/row_conversion.hpp>
#include <cudf_test/column_utilities.hpp>

// Benchmark fixture for the row/column conversion benchmarks; used by the
// BENCHMARK_DEFINE_F registrations below. Derives from cudf::benchmark —
// presumably for cudf-specific benchmark setup (memory resources etc.);
// behavior of the base class is not visible here.
class RowConversion : public cudf::benchmark {
};

// Benchmarks the original (fixed-width-optimized) column-to-row conversion.
static void BM_old_to_row(benchmark::State& state)
{
  auto const num_rows = static_cast<cudf::size_type>(state.range(0));
  auto const table    = create_random_table({cudf::type_id::INT8,
                                          cudf::type_id::INT32,
                                          cudf::type_id::INT16,
                                          cudf::type_id::INT64,
                                          cudf::type_id::INT32,
                                          cudf::type_id::BOOL8,
                                          cudf::type_id::UINT16,
                                          cudf::type_id::UINT8,
                                          cudf::type_id::UINT64},
                                         212,
                                         row_count{num_rows});

  // Sum of the fixed-width element sizes of all columns, i.e. the data bytes
  // contributed by one row (validity not included).
  cudf::size_type bytes_per_row = 0;
  for (int col = 0; col < table->num_columns(); ++col) {
    bytes_per_row += cudf::size_of(table->get_column(col).type());
  }

  for (auto _ : state) {
    // Manual timing via CUDA events; flush-l2-cache flag set to true.
    cuda_event_timer timer(state, true, rmm::cuda_stream_default);

    auto rows = cudf::convert_to_rows_fixed_width_optimized(table->view());
  }

  // Each byte is counted twice: once read from columns, once written to rows.
  state.SetBytesProcessed(state.iterations() * bytes_per_row * 2 * table->num_rows());
}

// Benchmarks the new general column-to-row conversion path.
static void BM_new_to_row(benchmark::State& state)
{
  auto const num_rows = static_cast<cudf::size_type>(state.range(0));
  auto const table    = create_random_table({cudf::type_id::INT8,
                                          cudf::type_id::INT32,
                                          cudf::type_id::INT16,
                                          cudf::type_id::INT64,
                                          cudf::type_id::INT32,
                                          cudf::type_id::BOOL8,
                                          cudf::type_id::UINT16,
                                          cudf::type_id::UINT8,
                                          cudf::type_id::UINT64},
                                         212,
                                         row_count{num_rows});

  // Data bytes contributed by one row: sum of each column's element size.
  cudf::size_type bytes_per_row = 0;
  for (int col = 0; col < table->num_columns(); ++col) {
    bytes_per_row += cudf::size_of(table->get_column(col).type());
  }

  for (auto _ : state) {
    // Manual timing via CUDA events; flush-l2-cache flag set to true.
    cuda_event_timer timer(state, true, rmm::cuda_stream_default);

    auto new_rows = cudf::convert_to_rows(table->view());
  }

  // Each byte is counted twice: once read from columns, once written to rows.
  state.SetBytesProcessed(state.iterations() * bytes_per_row * 2 * table->num_rows());
}

// Benchmarks the original (fixed-width-optimized) row-to-column conversion.
static void BM_old_from_row(benchmark::State& state)
{
  auto const num_rows = static_cast<cudf::size_type>(state.range(0));
  auto const table    = create_random_table({cudf::type_id::INT8,
                                          cudf::type_id::INT32,
                                          cudf::type_id::INT16,
                                          cudf::type_id::INT64,
                                          cudf::type_id::INT32,
                                          cudf::type_id::BOOL8,
                                          cudf::type_id::UINT16,
                                          cudf::type_id::UINT8,
                                          cudf::type_id::UINT64},
                                         256,
                                         row_count{num_rows});

  // Build the schema for the reverse conversion while summing per-row bytes.
  std::vector<cudf::data_type> schema;
  cudf::size_type bytes_per_row = 0;
  for (int col = 0; col < table->num_columns(); ++col) {
    auto const col_type = table->get_column(col).type();
    schema.push_back(col_type);
    bytes_per_row += cudf::size_of(col_type);
  }

  // NOTE(review): only the first returned column is converted back, while
  // SetBytesProcessed counts the whole table — confirm the conversion always
  // yields a single column at this size.
  auto rows = cudf::convert_to_rows_fixed_width_optimized(table->view());
  cudf::lists_column_view const first_list(rows.front()->view());

  for (auto _ : state) {
    // Manual timing via CUDA events; flush-l2-cache flag set to true.
    cuda_event_timer timer(state, true, rmm::cuda_stream_default);

    auto out = cudf::convert_from_rows_fixed_width_optimized(first_list, schema);
  }

  // Each byte is counted twice: once read from rows, once written to columns.
  state.SetBytesProcessed(state.iterations() * bytes_per_row * 2 * table->num_rows());
}

// Benchmarks the new general row-to-column conversion path.
static void BM_new_from_row(benchmark::State& state)
{
  auto const num_rows = static_cast<cudf::size_type>(state.range(0));
  auto const table    = create_random_table({cudf::type_id::INT8,
                                          cudf::type_id::INT32,
                                          cudf::type_id::INT16,
                                          cudf::type_id::INT64,
                                          cudf::type_id::INT32,
                                          cudf::type_id::BOOL8,
                                          cudf::type_id::UINT16,
                                          cudf::type_id::UINT8,
                                          cudf::type_id::UINT64},
                                         256,
                                         row_count{num_rows});

  // Build the schema for the reverse conversion while summing per-row bytes.
  std::vector<cudf::data_type> schema;
  cudf::size_type bytes_per_row = 0;
  for (int col = 0; col < table->num_columns(); ++col) {
    auto const col_type = table->get_column(col).type();
    schema.push_back(col_type);
    bytes_per_row += cudf::size_of(col_type);
  }

  // NOTE(review): only the first returned column is converted back, while
  // SetBytesProcessed counts the whole table — confirm the conversion always
  // yields a single column at this size.
  auto rows = cudf::convert_to_rows_fixed_width_optimized(table->view());
  cudf::lists_column_view const first_list(rows.front()->view());

  for (auto _ : state) {
    // Manual timing via CUDA events; flush-l2-cache flag set to true.
    cuda_event_timer timer(state, true, rmm::cuda_stream_default);

    auto out = cudf::convert_from_rows(first_list, schema);
  }

  // Each byte is counted twice: once read from rows, once written to columns.
  state.SetBytesProcessed(state.iterations() * bytes_per_row * 2 * table->num_rows());
}

// Defines a RowConversion fixture benchmark named `name` that runs benchmark
// function `f`, and registers it for a single input size (1 << 20 rows) with
// manual timing (the cuda_event_timer inside `f` reports the measured time)
// and results in milliseconds.
#define TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \
BENCHMARK_DEFINE_F(RowConversion, name) \
(::benchmark::State & st) { f(st); } \
BENCHMARK_REGISTER_F(RowConversion, name) \
->RangeMultiplier(8) \
->Ranges({{1 << 20, 1 << 20}}) \
->UseManualTime() \
->Unit(benchmark::kMillisecond);

TO_ROW_CONVERSION_BENCHMARK_DEFINE(old_to_row_conversion, BM_old_to_row)
TO_ROW_CONVERSION_BENCHMARK_DEFINE(new_to_row_conversion, BM_new_to_row)

// The from-row benchmarks use exactly the same registration settings as the
// to-row benchmarks, so delegate to TO_ROW_CONVERSION_BENCHMARK_DEFINE rather
// than maintaining a duplicate macro that must be kept in sync by hand.
// (No token pasting/stringizing happens at this layer, so pass-through of the
// `name` and `f` arguments is safe.)
#define FROM_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \
  TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f)

FROM_ROW_CONVERSION_BENCHMARK_DEFINE(old_from_row_conversion, BM_old_from_row)
FROM_ROW_CONVERSION_BENCHMARK_DEFINE(new_from_row_conversion, BM_new_from_row)
51 changes: 51 additions & 0 deletions cpp/include/cudf/row_conversion.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <memory>

#include <cudf/lists/lists_column_view.hpp>
#include <cudf/table/table_view.hpp>
#include <rmm/cuda_stream_view.hpp>

namespace cudf {

/**
 * @brief Converts a table into row-major data using the original
 * fixed-width-optimized code path.
 *
 * @param tbl the table of fixed-width columns to convert
 * @param stream CUDA stream on which to run the conversion
 * @param mr device memory resource used for the returned allocations
 * @return columns of row data; each is read back via lists_column_view by the
 * matching convert_from_rows* call
 */
std::vector<std::unique_ptr<cudf::column>> convert_to_rows_fixed_width_optimized(
  cudf::table_view const& tbl,
  // TODO need something for validity
  rmm::cuda_stream_view stream = rmm::cuda_stream_default,
  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
 * @brief Converts a table into row-major data using the newer general
 * code path.
 *
 * @param tbl the table to convert
 * @param stream CUDA stream on which to run the conversion
 * @param mr device memory resource used for the returned allocations
 * @return columns of row data; each is read back via lists_column_view by the
 * matching convert_from_rows* call
 */
std::vector<std::unique_ptr<cudf::column>> convert_to_rows(
  cudf::table_view const& tbl,
  // TODO need something for validity
  rmm::cuda_stream_view stream = rmm::cuda_stream_default,
  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
 * @brief Converts row data back into a table using the original
 * fixed-width-optimized code path (inverse of
 * convert_to_rows_fixed_width_optimized).
 *
 * @param input list column of row data to convert
 * @param schema the data types of the table's columns, in column order
 * @param stream CUDA stream on which to run the conversion
 * @param mr device memory resource used for the returned allocations
 * @return the reconstructed table
 */
std::unique_ptr<cudf::table> convert_from_rows_fixed_width_optimized(
  cudf::lists_column_view const& input,
  std::vector<cudf::data_type> const& schema,
  rmm::cuda_stream_view stream = rmm::cuda_stream_default,
  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
 * @brief Converts row data back into a table using the newer general
 * code path (inverse of convert_to_rows).
 *
 * @param input list column of row data to convert
 * @param schema the data types of the table's columns, in column order
 * @param stream CUDA stream on which to run the conversion
 * @param mr device memory resource used for the returned allocations
 * @return the reconstructed table
 */
std::unique_ptr<cudf::table> convert_from_rows(
  cudf::lists_column_view const& input,
  std::vector<cudf::data_type> const& schema,
  rmm::cuda_stream_view stream = rmm::cuda_stream_default,
  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

} // namespace cudf
Loading

0 comments on commit 8a8afaa

Please sign in to comment.