Skip to content

Commit

Permalink
Support for Pascal GPUs (which lack memcpy_async)
Browse files Browse the repository at this point in the history
* first pass
* pascal tomfoolery
* Mithun fixed it
* Figured out conditional compile.
** Must happen in __device__ context.
* Experiments with __CUDA_ARCH__:
** Got it working with __global__, __device__, and thrust.
* Initial stab at ASYNC_MEMCPY_SUPPORTED:
** 1. Found out that __host__ code does not have __CUDA_ARCH__ set.
      Everywhere else, this can be used reliably.
** 2. Replaced all the __CUDA_ARCH__ checks with ASYNC_MEMCPY_SUPPORTED.
      This is correct for all sites, EXCEPT convert_to/from_rows(),
      because those are __host__.
** 3. Running out of memory on Ampere box, for some reason.
* Completed changes for __host__ code:
** 1. Changed convert_to_rows(), convert_from_rows() to use ifndef __CUDA_ARCH__.
** 2. Added comments for barrier initialization.
* Reduced scope of ASYNC_MEMCPY_SUPPORTED in some if statements.
* Formatting.
* Updated JNI/row_conversion.cu.
  • Loading branch information
hyperbolic2346 authored and mythrocks committed Feb 18, 2022
1 parent f263820 commit 8a8afaa
Show file tree
Hide file tree
Showing 9 changed files with 5,929 additions and 61 deletions.
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -388,6 +388,7 @@ add_library(
src/rolling/rolling.cu
src/rolling/rolling_collect_list.cu
src/round/round.cu
src/row_conversion/row_conversion.cu
src/scalar/scalar.cpp
src/scalar/scalar_factories.cpp
src/search/search.cu
Expand Down
4 changes: 4 additions & 0 deletions cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,10 @@ ConfigureBench(JSON_BENCH string/json.cpp)
# * io benchmark ---------------------------------------------------------------------
ConfigureBench(MULTIBYTE_SPLIT_BENCHMARK io/text/multibyte_split.cpp)

###################################################################################################
# - row conversion benchmark ---------------------------------------------------------
ConfigureBench(ROW_CONVERSION_BENCH row_conversion/row_conversion.cpp)

add_custom_target(
run_benchmarks
DEPENDS CUDF_BENCHMARKS
Expand Down
181 changes: 181 additions & 0 deletions cpp/benchmarks/row_conversion/row_conversion.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmark/benchmark.h>
#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

#include <cudf/lists/lists_column_view.hpp>
#include <cudf/row_conversion.hpp>
#include <cudf_test/column_utilities.hpp>

// Benchmark fixture for the row/column conversion benchmarks; used by the
// BENCHMARK_DEFINE_F registrations below. Derives from cudf::benchmark —
// presumably for cudf-specific benchmark setup (memory resources etc.);
// behavior of the base class is not visible here.
class RowConversion : public cudf::benchmark {
};

// Benchmarks the original (fixed-width-optimized) column-to-row conversion.
static void BM_old_to_row(benchmark::State& state)
{
  auto const num_rows = static_cast<cudf::size_type>(state.range(0));
  auto const table    = create_random_table({cudf::type_id::INT8,
                                          cudf::type_id::INT32,
                                          cudf::type_id::INT16,
                                          cudf::type_id::INT64,
                                          cudf::type_id::INT32,
                                          cudf::type_id::BOOL8,
                                          cudf::type_id::UINT16,
                                          cudf::type_id::UINT8,
                                          cudf::type_id::UINT64},
                                         212,
                                         row_count{num_rows});

  // Sum of the fixed-width element sizes of all columns, i.e. the data bytes
  // contributed by one row (validity not included).
  cudf::size_type bytes_per_row = 0;
  for (int col = 0; col < table->num_columns(); ++col) {
    bytes_per_row += cudf::size_of(table->get_column(col).type());
  }

  for (auto _ : state) {
    // Manual timing via CUDA events; flush-l2-cache flag set to true.
    cuda_event_timer timer(state, true, rmm::cuda_stream_default);

    auto rows = cudf::convert_to_rows_fixed_width_optimized(table->view());
  }

  // Each byte is counted twice: once read from columns, once written to rows.
  state.SetBytesProcessed(state.iterations() * bytes_per_row * 2 * table->num_rows());
}

// Benchmarks the new general column-to-row conversion path.
static void BM_new_to_row(benchmark::State& state)
{
  auto const num_rows = static_cast<cudf::size_type>(state.range(0));
  auto const table    = create_random_table({cudf::type_id::INT8,
                                          cudf::type_id::INT32,
                                          cudf::type_id::INT16,
                                          cudf::type_id::INT64,
                                          cudf::type_id::INT32,
                                          cudf::type_id::BOOL8,
                                          cudf::type_id::UINT16,
                                          cudf::type_id::UINT8,
                                          cudf::type_id::UINT64},
                                         212,
                                         row_count{num_rows});

  // Data bytes contributed by one row: sum of each column's element size.
  cudf::size_type bytes_per_row = 0;
  for (int col = 0; col < table->num_columns(); ++col) {
    bytes_per_row += cudf::size_of(table->get_column(col).type());
  }

  for (auto _ : state) {
    // Manual timing via CUDA events; flush-l2-cache flag set to true.
    cuda_event_timer timer(state, true, rmm::cuda_stream_default);

    auto new_rows = cudf::convert_to_rows(table->view());
  }

  // Each byte is counted twice: once read from columns, once written to rows.
  state.SetBytesProcessed(state.iterations() * bytes_per_row * 2 * table->num_rows());
}

// Benchmarks the original (fixed-width-optimized) row-to-column conversion.
static void BM_old_from_row(benchmark::State& state)
{
  auto const num_rows = static_cast<cudf::size_type>(state.range(0));
  auto const table    = create_random_table({cudf::type_id::INT8,
                                          cudf::type_id::INT32,
                                          cudf::type_id::INT16,
                                          cudf::type_id::INT64,
                                          cudf::type_id::INT32,
                                          cudf::type_id::BOOL8,
                                          cudf::type_id::UINT16,
                                          cudf::type_id::UINT8,
                                          cudf::type_id::UINT64},
                                         256,
                                         row_count{num_rows});

  // Build the schema for the reverse conversion while summing per-row bytes.
  std::vector<cudf::data_type> schema;
  cudf::size_type bytes_per_row = 0;
  for (int col = 0; col < table->num_columns(); ++col) {
    auto const col_type = table->get_column(col).type();
    schema.push_back(col_type);
    bytes_per_row += cudf::size_of(col_type);
  }

  // NOTE(review): only the first returned column is converted back, while
  // SetBytesProcessed counts the whole table — confirm the conversion always
  // yields a single column at this size.
  auto rows = cudf::convert_to_rows_fixed_width_optimized(table->view());
  cudf::lists_column_view const first_list(rows.front()->view());

  for (auto _ : state) {
    // Manual timing via CUDA events; flush-l2-cache flag set to true.
    cuda_event_timer timer(state, true, rmm::cuda_stream_default);

    auto out = cudf::convert_from_rows_fixed_width_optimized(first_list, schema);
  }

  // Each byte is counted twice: once read from rows, once written to columns.
  state.SetBytesProcessed(state.iterations() * bytes_per_row * 2 * table->num_rows());
}

// Benchmarks the new general row-to-column conversion path.
static void BM_new_from_row(benchmark::State& state)
{
  auto const num_rows = static_cast<cudf::size_type>(state.range(0));
  auto const table    = create_random_table({cudf::type_id::INT8,
                                          cudf::type_id::INT32,
                                          cudf::type_id::INT16,
                                          cudf::type_id::INT64,
                                          cudf::type_id::INT32,
                                          cudf::type_id::BOOL8,
                                          cudf::type_id::UINT16,
                                          cudf::type_id::UINT8,
                                          cudf::type_id::UINT64},
                                         256,
                                         row_count{num_rows});

  // Build the schema for the reverse conversion while summing per-row bytes.
  std::vector<cudf::data_type> schema;
  cudf::size_type bytes_per_row = 0;
  for (int col = 0; col < table->num_columns(); ++col) {
    auto const col_type = table->get_column(col).type();
    schema.push_back(col_type);
    bytes_per_row += cudf::size_of(col_type);
  }

  // NOTE(review): only the first returned column is converted back, while
  // SetBytesProcessed counts the whole table — confirm the conversion always
  // yields a single column at this size.
  auto rows = cudf::convert_to_rows_fixed_width_optimized(table->view());
  cudf::lists_column_view const first_list(rows.front()->view());

  for (auto _ : state) {
    // Manual timing via CUDA events; flush-l2-cache flag set to true.
    cuda_event_timer timer(state, true, rmm::cuda_stream_default);

    auto out = cudf::convert_from_rows(first_list, schema);
  }

  // Each byte is counted twice: once read from rows, once written to columns.
  state.SetBytesProcessed(state.iterations() * bytes_per_row * 2 * table->num_rows());
}

// Defines a RowConversion fixture benchmark named `name` that runs benchmark
// function `f`, and registers it for a single input size (1 << 20 rows) with
// manual timing (the cuda_event_timer inside `f` reports the measured time)
// and results in milliseconds.
#define TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \
BENCHMARK_DEFINE_F(RowConversion, name) \
(::benchmark::State & st) { f(st); } \
BENCHMARK_REGISTER_F(RowConversion, name) \
->RangeMultiplier(8) \
->Ranges({{1 << 20, 1 << 20}}) \
->UseManualTime() \
->Unit(benchmark::kMillisecond);

TO_ROW_CONVERSION_BENCHMARK_DEFINE(old_to_row_conversion, BM_old_to_row)
TO_ROW_CONVERSION_BENCHMARK_DEFINE(new_to_row_conversion, BM_new_to_row)

// The from-row benchmarks use exactly the same registration settings as the
// to-row benchmarks, so delegate to TO_ROW_CONVERSION_BENCHMARK_DEFINE rather
// than maintaining a duplicate macro that must be kept in sync by hand.
// (No token pasting/stringizing happens at this layer, so pass-through of the
// `name` and `f` arguments is safe.)
#define FROM_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \
  TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f)

FROM_ROW_CONVERSION_BENCHMARK_DEFINE(old_from_row_conversion, BM_old_from_row)
FROM_ROW_CONVERSION_BENCHMARK_DEFINE(new_from_row_conversion, BM_new_from_row)
51 changes: 51 additions & 0 deletions cpp/include/cudf/row_conversion.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <memory>

#include <cudf/lists/lists_column_view.hpp>
#include <cudf/table/table_view.hpp>
#include <rmm/cuda_stream_view.hpp>

namespace cudf {

/**
 * @brief Converts a table into row-major data using the original
 * fixed-width-optimized code path.
 *
 * @param tbl the table of fixed-width columns to convert
 * @param stream CUDA stream on which to run the conversion
 * @param mr device memory resource used for the returned allocations
 * @return columns of row data; each is read back via lists_column_view by the
 * matching convert_from_rows* call
 */
std::vector<std::unique_ptr<cudf::column>> convert_to_rows_fixed_width_optimized(
  cudf::table_view const& tbl,
  // TODO need something for validity
  rmm::cuda_stream_view stream = rmm::cuda_stream_default,
  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
 * @brief Converts a table into row-major data using the newer general
 * code path.
 *
 * @param tbl the table to convert
 * @param stream CUDA stream on which to run the conversion
 * @param mr device memory resource used for the returned allocations
 * @return columns of row data; each is read back via lists_column_view by the
 * matching convert_from_rows* call
 */
std::vector<std::unique_ptr<cudf::column>> convert_to_rows(
  cudf::table_view const& tbl,
  // TODO need something for validity
  rmm::cuda_stream_view stream = rmm::cuda_stream_default,
  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
 * @brief Converts row data back into a table using the original
 * fixed-width-optimized code path (inverse of
 * convert_to_rows_fixed_width_optimized).
 *
 * @param input list column of row data to convert
 * @param schema the data types of the table's columns, in column order
 * @param stream CUDA stream on which to run the conversion
 * @param mr device memory resource used for the returned allocations
 * @return the reconstructed table
 */
std::unique_ptr<cudf::table> convert_from_rows_fixed_width_optimized(
  cudf::lists_column_view const& input,
  std::vector<cudf::data_type> const& schema,
  rmm::cuda_stream_view stream = rmm::cuda_stream_default,
  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
 * @brief Converts row data back into a table using the newer general
 * code path (inverse of convert_to_rows).
 *
 * @param input list column of row data to convert
 * @param schema the data types of the table's columns, in column order
 * @param stream CUDA stream on which to run the conversion
 * @param mr device memory resource used for the returned allocations
 * @return the reconstructed table
 */
std::unique_ptr<cudf::table> convert_from_rows(
  cudf::lists_column_view const& input,
  std::vector<cudf::data_type> const& schema,
  rmm::cuda_stream_view stream = rmm::cuda_stream_default,
  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

} // namespace cudf
Loading

0 comments on commit 8a8afaa

Please sign in to comment.