forked from rapidsai/cudf
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Support for Pascal GPUs (which lack memcpy_async)
* first pass * pascal tomfoolery * Mithun fixed it * Figured out conditional compile. ** Must happen in __device__ context. * Experiments with __CUDA_ARCH__: ** Got it working with __global__, __device__, and thrust. * Initial stab at ASYNC_MEMCPY_SUPPORTED: ** 1. Found out that __host__ code does not have __CUDA_ARCH__ set. Everywhere else, this can be used reliably. ** 2. Replaced all the __CUDA_ARCH__ checks with ASYNC_MEMCPY_SUPPORTED. This is correct for all sites, EXCEPT convert_to/from_rows(), because those are __host__. ** 3. Running out of memory on Ampere box, for some reason. * Completed changes for __host__ code: ** 1. Changed convert_to_rows(), convert_from_rows() to use ifndef __CUDA_ARCH__. ** 2. Added comments for barrier initialization. * Reduced scope of ASYNC_MEMCPY_SUPPORTED in some if statements. * Formatting. * Updated JNI/row_conversion.cu.
- Loading branch information
1 parent
f263820
commit 8a8afaa
Showing
9 changed files
with
5,929 additions
and
61 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,181 @@ | ||
/* | ||
* Copyright (c) 2021, NVIDIA CORPORATION. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
#include <benchmark/benchmark.h> | ||
#include <benchmarks/common/generate_input.hpp> | ||
#include <benchmarks/fixture/benchmark_fixture.hpp> | ||
#include <benchmarks/synchronization/synchronization.hpp> | ||
|
||
#include <cudf/lists/lists_column_view.hpp> | ||
#include <cudf/row_conversion.hpp> | ||
#include <cudf_test/column_utilities.hpp> | ||
|
||
class RowConversion : public cudf::benchmark { | ||
}; | ||
|
||
static void BM_old_to_row(benchmark::State& state) | ||
{ | ||
cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; | ||
auto const table = create_random_table({cudf::type_id::INT8, | ||
cudf::type_id::INT32, | ||
cudf::type_id::INT16, | ||
cudf::type_id::INT64, | ||
cudf::type_id::INT32, | ||
cudf::type_id::BOOL8, | ||
cudf::type_id::UINT16, | ||
cudf::type_id::UINT8, | ||
cudf::type_id::UINT64}, | ||
212, | ||
row_count{n_rows}); | ||
|
||
cudf::size_type total_bytes = 0; | ||
for (int i = 0; i < table->num_columns(); ++i) { | ||
auto t = table->get_column(i).type(); | ||
total_bytes += cudf::size_of(t); | ||
} | ||
|
||
for (auto _ : state) { | ||
cuda_event_timer raii(state, true, rmm::cuda_stream_default); | ||
|
||
auto rows = cudf::convert_to_rows_fixed_width_optimized(table->view()); | ||
} | ||
|
||
state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); | ||
} | ||
|
||
static void BM_new_to_row(benchmark::State& state) | ||
{ | ||
cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; | ||
auto const table = create_random_table({cudf::type_id::INT8, | ||
cudf::type_id::INT32, | ||
cudf::type_id::INT16, | ||
cudf::type_id::INT64, | ||
cudf::type_id::INT32, | ||
cudf::type_id::BOOL8, | ||
cudf::type_id::UINT16, | ||
cudf::type_id::UINT8, | ||
cudf::type_id::UINT64}, | ||
212, | ||
row_count{n_rows}); | ||
|
||
cudf::size_type total_bytes = 0; | ||
for (int i = 0; i < table->num_columns(); ++i) { | ||
auto t = table->get_column(i).type(); | ||
total_bytes += cudf::size_of(t); | ||
} | ||
|
||
for (auto _ : state) { | ||
cuda_event_timer raii(state, true, rmm::cuda_stream_default); | ||
|
||
auto new_rows = cudf::convert_to_rows(table->view()); | ||
} | ||
|
||
state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); | ||
} | ||
|
||
static void BM_old_from_row(benchmark::State& state) | ||
{ | ||
cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; | ||
auto const table = create_random_table({cudf::type_id::INT8, | ||
cudf::type_id::INT32, | ||
cudf::type_id::INT16, | ||
cudf::type_id::INT64, | ||
cudf::type_id::INT32, | ||
cudf::type_id::BOOL8, | ||
cudf::type_id::UINT16, | ||
cudf::type_id::UINT8, | ||
cudf::type_id::UINT64}, | ||
256, | ||
row_count{n_rows}); | ||
|
||
std::vector<cudf::data_type> schema; | ||
cudf::size_type total_bytes = 0; | ||
for (int i = 0; i < table->num_columns(); ++i) { | ||
auto t = table->get_column(i).type(); | ||
schema.push_back(t); | ||
total_bytes += cudf::size_of(t); | ||
} | ||
|
||
auto rows = cudf::convert_to_rows_fixed_width_optimized(table->view()); | ||
cudf::lists_column_view const first_list(rows.front()->view()); | ||
|
||
for (auto _ : state) { | ||
cuda_event_timer raii(state, true, rmm::cuda_stream_default); | ||
|
||
auto out = cudf::convert_from_rows_fixed_width_optimized(first_list, schema); | ||
} | ||
|
||
state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); | ||
} | ||
|
||
static void BM_new_from_row(benchmark::State& state) | ||
{ | ||
cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; | ||
auto const table = create_random_table({cudf::type_id::INT8, | ||
cudf::type_id::INT32, | ||
cudf::type_id::INT16, | ||
cudf::type_id::INT64, | ||
cudf::type_id::INT32, | ||
cudf::type_id::BOOL8, | ||
cudf::type_id::UINT16, | ||
cudf::type_id::UINT8, | ||
cudf::type_id::UINT64}, | ||
256, | ||
row_count{n_rows}); | ||
|
||
std::vector<cudf::data_type> schema; | ||
cudf::size_type total_bytes = 0; | ||
for (int i = 0; i < table->num_columns(); ++i) { | ||
auto t = table->get_column(i).type(); | ||
schema.push_back(t); | ||
total_bytes += cudf::size_of(t); | ||
} | ||
|
||
auto rows = cudf::convert_to_rows_fixed_width_optimized(table->view()); | ||
cudf::lists_column_view const first_list(rows.front()->view()); | ||
|
||
for (auto _ : state) { | ||
cuda_event_timer raii(state, true, rmm::cuda_stream_default); | ||
|
||
auto out = cudf::convert_from_rows(first_list, schema); | ||
} | ||
|
||
state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); | ||
} | ||
|
||
#define TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ | ||
BENCHMARK_DEFINE_F(RowConversion, name) \ | ||
(::benchmark::State & st) { f(st); } \ | ||
BENCHMARK_REGISTER_F(RowConversion, name) \ | ||
->RangeMultiplier(8) \ | ||
->Ranges({{1 << 20, 1 << 20}}) \ | ||
->UseManualTime() \ | ||
->Unit(benchmark::kMillisecond); | ||
|
||
TO_ROW_CONVERSION_BENCHMARK_DEFINE(old_to_row_conversion, BM_old_to_row) | ||
TO_ROW_CONVERSION_BENCHMARK_DEFINE(new_to_row_conversion, BM_new_to_row) | ||
|
||
#define FROM_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ | ||
BENCHMARK_DEFINE_F(RowConversion, name) \ | ||
(::benchmark::State & st) { f(st); } \ | ||
BENCHMARK_REGISTER_F(RowConversion, name) \ | ||
->RangeMultiplier(8) \ | ||
->Ranges({{1 << 20, 1 << 20}}) \ | ||
->UseManualTime() \ | ||
->Unit(benchmark::kMillisecond); | ||
|
||
FROM_ROW_CONVERSION_BENCHMARK_DEFINE(old_from_row_conversion, BM_old_from_row) | ||
FROM_ROW_CONVERSION_BENCHMARK_DEFINE(new_from_row_conversion, BM_new_from_row) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
/* | ||
* Copyright (c) 2020, NVIDIA CORPORATION. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
#pragma once | ||
|
||
#include <memory> | ||
|
||
#include <cudf/lists/lists_column_view.hpp> | ||
#include <cudf/table/table_view.hpp> | ||
#include <rmm/cuda_stream_view.hpp> | ||
|
||
namespace cudf { | ||
|
||
std::vector<std::unique_ptr<cudf::column>> convert_to_rows_fixed_width_optimized( | ||
cudf::table_view const& tbl, | ||
// TODO need something for validity | ||
rmm::cuda_stream_view stream = rmm::cuda_stream_default, | ||
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); | ||
|
||
std::vector<std::unique_ptr<cudf::column>> convert_to_rows( | ||
cudf::table_view const& tbl, | ||
// TODO need something for validity | ||
rmm::cuda_stream_view stream = rmm::cuda_stream_default, | ||
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); | ||
|
||
std::unique_ptr<cudf::table> convert_from_rows_fixed_width_optimized( | ||
cudf::lists_column_view const& input, | ||
std::vector<cudf::data_type> const& schema, | ||
rmm::cuda_stream_view stream = rmm::cuda_stream_default, | ||
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); | ||
|
||
std::unique_ptr<cudf::table> convert_from_rows( | ||
cudf::lists_column_view const& input, | ||
std::vector<cudf::data_type> const& schema, | ||
rmm::cuda_stream_view stream = rmm::cuda_stream_default, | ||
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); | ||
|
||
} // namespace cudf |
Oops, something went wrong.