Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replace raw streams with rmm::cuda_stream_view (part 1) #6646

Merged
merged 21 commits into from
Nov 13, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@
- PR #6608 Improve subword tokenizer docs
- PR #6610 Add ability to set scalar values in `cudf.DataFrame`
- PR #6612 Update JNI to new RMM cuda_stream_view API
- PR #6646 Replace `cudaStream_t` with `rmm::cuda_stream_view` (part 1)
- PR #6579 Update scatter APIs to use reference wrapper / const scalar
- PR #6614 Add support for conversion to Pandas nullable dtypes and fix related issue in `cudf.to_json`
- PR #6622 Update `to_pandas` api docs
Expand Down
10 changes: 6 additions & 4 deletions cpp/benchmarks/common/generate_benchmark_input.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include <cudf_test/column_utilities.hpp>
#include <cudf_test/column_wrapper.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>

#include <future>
Expand Down Expand Up @@ -296,9 +297,9 @@ std::unique_ptr<cudf::column> create_random_column(data_profile const& profile,
return std::make_unique<cudf::column>(
cudf::data_type{cudf::type_to_id<T>()},
num_rows,
rmm::device_buffer(data.data(), num_rows * sizeof(stored_Type), cudaStream_t(0)),
rmm::device_buffer(data.data(), num_rows * sizeof(stored_Type), rmm::cuda_stream_default),
rmm::device_buffer(
null_mask.data(), null_mask.size() * sizeof(cudf::bitmask_type), cudaStream_t(0)));
null_mask.data(), null_mask.size() * sizeof(cudf::bitmask_type), rmm::cuda_stream_default));
}

/**
Expand Down Expand Up @@ -483,15 +484,16 @@ std::unique_ptr<cudf::column> create_random_column<cudf::list_view>(data_profile
auto offsets_column = std::make_unique<cudf::column>(
cudf::data_type{cudf::type_id::INT32},
offsets.size(),
rmm::device_buffer(offsets.data(), offsets.size() * sizeof(int32_t), cudaStream_t(0)));
rmm::device_buffer(
offsets.data(), offsets.size() * sizeof(int32_t), rmm::cuda_stream_default));

list_column = cudf::make_lists_column(
num_rows,
std::move(offsets_column),
std::move(current_child_column),
cudf::UNKNOWN_NULL_COUNT,
rmm::device_buffer(
null_mask.data(), null_mask.size() * sizeof(cudf::bitmask_type), cudaStream_t(0)));
null_mask.data(), null_mask.size() * sizeof(cudf::bitmask_type), rmm::cuda_stream_default));
}
return list_column; // return the top-level column
}
Expand Down
44 changes: 27 additions & 17 deletions cpp/benchmarks/copying/shift_benchmark.cu
Original file line number Diff line number Diff line change
@@ -1,30 +1,40 @@
#include <benchmark/benchmark.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/functional.h>
#include <thrust/sequence.h>
#include <thrust/transform.h>
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

#include <cudf/copying.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/error.hpp>
#include <cudf_test/column_wrapper.hpp>
#include <memory>

template <typename T, typename ScalarType = cudf::scalar_type_t<T>>
std::unique_ptr<cudf::scalar> make_scalar(
cudaStream_t stream = 0,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
{
auto s = new ScalarType(0, false, stream, mr);
return std::unique_ptr<cudf::scalar>(s);
}
#include <benchmark/benchmark.h>

#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/functional.h>
#include <thrust/sequence.h>
#include <thrust/transform.h>

#include <memory>

template <typename T, typename ScalarType = cudf::scalar_type_t<T>>
std::unique_ptr<cudf::scalar> make_scalar(
harrism marked this conversation as resolved.
Show resolved Hide resolved
T value,
cudaStream_t stream = 0,
T value = 0,
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
{
auto s = new ScalarType(value, true, stream, mr);
Expand Down
11 changes: 6 additions & 5 deletions cpp/benchmarks/null_mask/set_null_mask_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,13 @@
* limitations under the License.
*/

#include <benchmark/benchmark.h>
#include "../fixture/benchmark_fixture.hpp"
#include "../synchronization/synchronization.hpp"
#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

#include <cudf/null_mask.hpp>

#include <benchmark/benchmark.h>

class SetNullmask : public cudf::benchmark {
};

Expand All @@ -31,7 +32,7 @@ void BM_setnullmask(benchmark::State& state)

for (auto _ : state) {
cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0
cudf::set_null_mask(static_cast<cudf::bitmask_type*>(mask.data()), begin, end, true, 0);
cudf::set_null_mask(static_cast<cudf::bitmask_type*>(mask.data()), begin, end, true);
}

state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * size / 8);
Expand All @@ -44,4 +45,4 @@ void BM_setnullmask(benchmark::State& state)
->Range(1 << 10, 1 << 30) \
->UseManualTime();

NBM_BENCHMARK_DEFINE(SetNullMaskKernel);
NBM_BENCHMARK_DEFINE(SetNullMaskKernel);
6 changes: 3 additions & 3 deletions cpp/benchmarks/type_dispatcher/type_dispatcher_benchmark.cu
Original file line number Diff line number Diff line change
Expand Up @@ -90,13 +90,13 @@ struct ColumnHandle {
template <typename ColumnType>
void operator()(mutable_column_device_view source_column,
int work_per_thread,
cudaStream_t stream = 0)
rmm::cuda_stream_view stream = rmm::cuda_stream_default)
{
cudf::detail::grid_1d grid_config{source_column.size(), block_size};
int grid_size = grid_config.num_blocks;
// Launch the kernel.
host_dispatching_kernel<functor_type, ColumnType>
<<<grid_size, block_size, 0, stream>>>(source_column);
<<<grid_size, block_size, 0, stream.value()>>>(source_column);
}
};

Expand Down Expand Up @@ -160,7 +160,7 @@ void launch_kernel(mutable_table_view input, T** d_ptr, int work_per_thread)
}

template <class TypeParam, FunctorType functor_type, DispatchingType dispatching_type>
void type_dispatcher_benchmark(benchmark::State& state)
void type_dispatcher_benchmark(::benchmark::State& state)
{
const cudf::size_type source_size = static_cast<cudf::size_type>(state.range(1));

Expand Down
4 changes: 2 additions & 2 deletions cpp/docs/DOCUMENTATION.md
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,7 @@ You can use the `@copydoc` tag to avoid duplicating the comment block for a func
*/
```

Also, `@copydoc` is useful when documenting a `detail` function that differs only by the `cudaStream_t` parameter.
Also, `@copydoc` is useful when documenting a `detail` function that differs only by the `stream` parameter.

```c++
/**
Expand All @@ -235,7 +235,7 @@ Also, `@copydoc` is useful when documenting a `detail` function that differs onl
*/
std::vector<size_type> segmented_count_set_bits(bitmask_type const* bitmask,
std::vector<size_type> const& indices,
cudaStream_t stream = 0);
rmm::cuda_stream_view stream = rmm::cuda_stream_default);
```
Note, you must specify the whole signature of the function, including optional parameters, so that doxygen will be able to locate it.
Expand Down
10 changes: 5 additions & 5 deletions cpp/docs/TRANSITIONGUIDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ A *mutable*, non-owning view of a table.
We do not yet expose CUDA streams in external libcudf APIs.
However, in order to ease the transition to future use of streams, all libcudf APIs that allocate device memory or execute a kernel should be implemented using asynchronous APIs on the default stream (e.g., stream 0).

The recommended pattern for doing this is to make the definition of the external API invoke an internal API in the `detail` namespace. The internal `detail` API will have all the same parameters, plus a `cudaStream_t` parameter at the end defaulted to `0`.
The recommended pattern for doing this is to make the definition of the external API invoke an internal API in the `detail` namespace. The internal `detail` API will have all the same parameters, plus a `rmm::cuda_stream_view` parameter at the end defaulted to `rmm::cuda_stream_default`.
The implementation should be wholly contained in the `detail` API definition and use only asynchronous versions of CUDA APIs with the defaulted stream parameter.

In order to make the `detail` API callable from other libcudf functions, it should be exposed in a header placed in the `cudf/cpp/include/detail/` directory.
Expand All @@ -144,19 +144,19 @@ void external_function(...);

// cpp/include/cudf/detail/header.hpp
namespace detail{
void external_function(..., cudaStream_t stream = 0)
void external_function(..., rmm::cuda_stream_view stream = rmm::cuda_stream_default)
} // namespace detail

// cudf/src/implementation.cpp
namespace detail{
// defaulted stream parameter
void external_function(..., cudaStream_t stream){
void external_function(..., rmm::cuda_stream_view stream){
// implementation uses stream w/ async APIs
RMM_ALLOC(...,stream);
CUDA_TRY(cudaMemcpyAsync(...,stream));
CUDA_TRY(cudaMemcpyAsync(...,stream.value()));
kernel<<<..., stream>>>(...);
thrust::algorithm(rmm::exec_policy(stream)->on(stream), ...);
CUDA_TRY(cudaStreamSynchronize(stream));
stream.synchronize();
RMM_FREE(...,stream);
}
} // namespace detail
Expand Down
4 changes: 3 additions & 1 deletion cpp/include/cudf/ast/detail/transform.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@
#include <cudf/table/table_view.hpp>
#include <cudf/types.hpp>

#include <rmm/cuda_stream_view.hpp>

#include <cstring>
#include <numeric>

Expand Down Expand Up @@ -369,7 +371,7 @@ struct ast_plan {
std::unique_ptr<column> compute_column(
table_view const table,
expression const& expr,
cudaStream_t stream = 0,
rmm::cuda_stream_view stream,
harrism marked this conversation as resolved.
Show resolved Hide resolved
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
} // namespace detail

Expand Down
13 changes: 6 additions & 7 deletions cpp/include/cudf/column/column.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION.
* Copyright (c) 2019-2020, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -15,10 +15,12 @@
*/
#pragma once

#include <cudf/column/column_view.hpp>

#include <cudf/null_mask.hpp>
#include <cudf/types.hpp>
#include "column_view.hpp"

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>

#include <memory>
Expand Down Expand Up @@ -50,9 +52,6 @@ class column {
/**
* @brief Construct a new column by deep copying the contents of `other`.
*
* All device memory allocation and copying is done using the
* `device_memory_resource` and `stream` from `other`.
*
* @param other The column to copy
**/
column(column const& other);
Expand All @@ -69,7 +68,7 @@ class column {
* @param mr Device memory resource to use for all device memory allocations
*/
column(column const& other,
cudaStream_t stream,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
Expand Down Expand Up @@ -124,7 +123,7 @@ class column {
* @param mr Device memory resource to use for all device memory allocations
*/
explicit column(column_view view,
cudaStream_t stream = 0,
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
Expand Down
11 changes: 7 additions & 4 deletions cpp/include/cudf/column/column_device_view.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,6 @@
*/
#pragma once

#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <cudf/column/column_view.hpp>
#include <cudf/fixed_point/fixed_point.hpp>
#include <cudf/lists/list_view.cuh>
Expand All @@ -28,6 +26,11 @@
#include <cudf/utilities/traits.hpp>
#include <cudf/utilities/type_dispatcher.hpp>

#include <rmm/cuda_stream_view.hpp>

#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>

/**
* @file column_device_view.cuh
* @brief Column device view class definitons
Expand Down Expand Up @@ -386,7 +389,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base {
*`source_view` available in device memory.
*/
static std::unique_ptr<column_device_view, std::function<void(column_device_view*)>> create(
column_view source_view, cudaStream_t stream = 0);
column_view source_view, rmm::cuda_stream_view stream = rmm::cuda_stream_default);

/**
* @brief Destroy the `column_device_view` object.
Expand Down Expand Up @@ -480,7 +483,7 @@ class alignas(16) mutable_column_device_view : public detail::column_device_view
*/
static std::unique_ptr<mutable_column_device_view,
std::function<void(mutable_column_device_view*)>>
create(mutable_column_view source_view, cudaStream_t stream = 0);
create(mutable_column_view source_view, rmm::cuda_stream_view stream = rmm::cuda_stream_default);

/**
* @brief Returns pointer to the base device memory allocation casted to
Expand Down
Loading