Skip to content

Commit

Permalink
Replace raw streams with rmm::cuda_stream_view (part 1) (#6646)
Browse files Browse the repository at this point in the history
Converting libcudf to use `rmm::cuda_stream_view` will require a LOT of changes, so I'm splitting it into multiple PRs to ease reviewing. This is the first PR in the series. This series of PRs will

 - Replace usage of `cudaStream_t` with `rmm::cuda_stream_view`
 - Replace usage of `0` or `nullptr` as a stream identifier with `rmm::cuda_stream_default`
 - Ensure all APIs always order the stream parameter *before* the memory resource parameter. #5119

This first PR converts:
 - column.hpp (and source)
 - column_device_view.cuh
 - copying.hpp (and source) : moves functions that had streams in public APIs to `namespace detail` and adds streamless public versions.
 - null_mask.hpp (and source) : moves functions that had streams in public APIs to `namespace detail` and adds streamless public versions.
 - AST (transform)
 - Usages of the above APIs in other source files
 - Some benchmarks

Contributes to #6645 and #5119

~Depends on #6732.~
  • Loading branch information
harrism authored Nov 13, 2020
1 parent 3fc8142 commit dea44d9
Show file tree
Hide file tree
Showing 94 changed files with 1,129 additions and 716 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@
- PR #6608 Improve subword tokenizer docs
- PR #6610 Add ability to set scalar values in `cudf.DataFrame`
- PR #6612 Update JNI to new RMM cuda_stream_view API
- PR #6646 Replace `cudaStream_t` with `rmm::cuda_stream_view` (part 1)
- PR #6579 Update scatter APIs to use reference wrapper / const scalar
- PR #6614 Add support for conversion to Pandas nullable dtypes and fix related issue in `cudf.to_json`
- PR #6622 Update `to_pandas` api docs
Expand Down
10 changes: 6 additions & 4 deletions cpp/benchmarks/common/generate_benchmark_input.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include <cudf_test/column_utilities.hpp>
#include <cudf_test/column_wrapper.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>

#include <future>
Expand Down Expand Up @@ -296,9 +297,9 @@ std::unique_ptr<cudf::column> create_random_column(data_profile const& profile,
return std::make_unique<cudf::column>(
cudf::data_type{cudf::type_to_id<T>()},
num_rows,
rmm::device_buffer(data.data(), num_rows * sizeof(stored_Type), cudaStream_t(0)),
rmm::device_buffer(data.data(), num_rows * sizeof(stored_Type), rmm::cuda_stream_default),
rmm::device_buffer(
null_mask.data(), null_mask.size() * sizeof(cudf::bitmask_type), cudaStream_t(0)));
null_mask.data(), null_mask.size() * sizeof(cudf::bitmask_type), rmm::cuda_stream_default));
}

/**
Expand Down Expand Up @@ -483,15 +484,16 @@ std::unique_ptr<cudf::column> create_random_column<cudf::list_view>(data_profile
auto offsets_column = std::make_unique<cudf::column>(
cudf::data_type{cudf::type_id::INT32},
offsets.size(),
rmm::device_buffer(offsets.data(), offsets.size() * sizeof(int32_t), cudaStream_t(0)));
rmm::device_buffer(
offsets.data(), offsets.size() * sizeof(int32_t), rmm::cuda_stream_default));

list_column = cudf::make_lists_column(
num_rows,
std::move(offsets_column),
std::move(current_child_column),
cudf::UNKNOWN_NULL_COUNT,
rmm::device_buffer(
null_mask.data(), null_mask.size() * sizeof(cudf::bitmask_type), cudaStream_t(0)));
null_mask.data(), null_mask.size() * sizeof(cudf::bitmask_type), rmm::cuda_stream_default));
}
return list_column; // return the top-level column
}
Expand Down
44 changes: 27 additions & 17 deletions cpp/benchmarks/copying/shift_benchmark.cu
Original file line number Diff line number Diff line change
@@ -1,30 +1,40 @@
#include <benchmark/benchmark.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/functional.h>
#include <thrust/sequence.h>
#include <thrust/transform.h>
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

#include <cudf/copying.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/error.hpp>
#include <cudf_test/column_wrapper.hpp>
#include <memory>

template <typename T, typename ScalarType = cudf::scalar_type_t<T>>
std::unique_ptr<cudf::scalar> make_scalar(
cudaStream_t stream = 0,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
{
auto s = new ScalarType(0, false, stream, mr);
return std::unique_ptr<cudf::scalar>(s);
}
#include <benchmark/benchmark.h>

#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/functional.h>
#include <thrust/sequence.h>
#include <thrust/transform.h>

#include <memory>

template <typename T, typename ScalarType = cudf::scalar_type_t<T>>
std::unique_ptr<cudf::scalar> make_scalar(
T value,
cudaStream_t stream = 0,
T value = 0,
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
{
auto s = new ScalarType(value, true, stream, mr);
Expand Down
11 changes: 6 additions & 5 deletions cpp/benchmarks/null_mask/set_null_mask_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,13 @@
* limitations under the License.
*/

#include <benchmark/benchmark.h>
#include "../fixture/benchmark_fixture.hpp"
#include "../synchronization/synchronization.hpp"
#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

#include <cudf/null_mask.hpp>

#include <benchmark/benchmark.h>

class SetNullmask : public cudf::benchmark {
};

Expand All @@ -31,7 +32,7 @@ void BM_setnullmask(benchmark::State& state)

for (auto _ : state) {
cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0
cudf::set_null_mask(static_cast<cudf::bitmask_type*>(mask.data()), begin, end, true, 0);
cudf::set_null_mask(static_cast<cudf::bitmask_type*>(mask.data()), begin, end, true);
}

state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * size / 8);
Expand All @@ -44,4 +45,4 @@ void BM_setnullmask(benchmark::State& state)
->Range(1 << 10, 1 << 30) \
->UseManualTime();

NBM_BENCHMARK_DEFINE(SetNullMaskKernel);
NBM_BENCHMARK_DEFINE(SetNullMaskKernel);
6 changes: 3 additions & 3 deletions cpp/benchmarks/type_dispatcher/type_dispatcher_benchmark.cu
Original file line number Diff line number Diff line change
Expand Up @@ -90,13 +90,13 @@ struct ColumnHandle {
template <typename ColumnType>
void operator()(mutable_column_device_view source_column,
int work_per_thread,
cudaStream_t stream = 0)
rmm::cuda_stream_view stream = rmm::cuda_stream_default)
{
cudf::detail::grid_1d grid_config{source_column.size(), block_size};
int grid_size = grid_config.num_blocks;
// Launch the kernel.
host_dispatching_kernel<functor_type, ColumnType>
<<<grid_size, block_size, 0, stream>>>(source_column);
<<<grid_size, block_size, 0, stream.value()>>>(source_column);
}
};

Expand Down Expand Up @@ -160,7 +160,7 @@ void launch_kernel(mutable_table_view input, T** d_ptr, int work_per_thread)
}

template <class TypeParam, FunctorType functor_type, DispatchingType dispatching_type>
void type_dispatcher_benchmark(benchmark::State& state)
void type_dispatcher_benchmark(::benchmark::State& state)
{
const cudf::size_type source_size = static_cast<cudf::size_type>(state.range(1));

Expand Down
4 changes: 2 additions & 2 deletions cpp/docs/DOCUMENTATION.md
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,7 @@ You can use the `@copydoc` tag to avoid duplicating the comment block for a func
*/
```

Also, `@copydoc` is useful when documenting a `detail` function that differs only by the `cudaStream_t` parameter.
Also, `@copydoc` is useful when documenting a `detail` function that differs only by the `stream` parameter.

```c++
/**
Expand All @@ -235,7 +235,7 @@ Also, `@copydoc` is useful when documenting a `detail` function that differs onl
*/
std::vector<size_type> segmented_count_set_bits(bitmask_type const* bitmask,
std::vector<size_type> const& indices,
cudaStream_t stream = 0);
rmm::cuda_stream_view stream = rmm::cuda_stream_default);
```
Note, you must specify the whole signature of the function, including optional parameters, so that doxygen will be able to locate it.
Expand Down
10 changes: 5 additions & 5 deletions cpp/docs/TRANSITIONGUIDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ A *mutable*, non-owning view of a table.
We do not yet expose CUDA streams in external libcudf APIs.
However, in order to ease the transition to future use of streams, all libcudf APIs that allocate device memory or execute a kernel should be implemented using asynchronous APIs on the default stream (e.g., stream 0).
The recommended pattern for doing this is to make the definition of the external API invoke an internal API in the `detail` namespace. The internal `detail` API will have all the same parameters, plus a `cudaStream_t` parameter at the end defaulted to `0`.
The recommended pattern for doing this is to make the definition of the external API invoke an internal API in the `detail` namespace. The internal `detail` API will have all the same parameters, plus a `rmm::cuda_stream_view` parameter at the end defaulted to `rmm::cuda_stream_default`.
The implementation should be wholly contained in the `detail` API definition and use only asynchronous versions of CUDA APIs with the defaulted stream parameter.
In order to make the `detail` API callable from other libcudf functions, it should be exposed in a header placed in the `cudf/cpp/include/detail/` directory.
Expand All @@ -144,19 +144,19 @@ void external_function(...);
// cpp/include/cudf/detail/header.hpp
namespace detail{
void external_function(..., cudaStream_t stream = 0)
void external_function(..., rmm::cuda_stream_view stream = rmm::cuda_stream_default)
} // namespace detail
// cudf/src/implementation.cpp
namespace detail{
// defaulted stream parameter
void external_function(..., cudaStream_t stream){
void external_function(..., rmm::cuda_stream_view stream){
// implementation uses stream w/ async APIs
RMM_ALLOC(...,stream);
CUDA_TRY(cudaMemcpyAsync(...,stream));
CUDA_TRY(cudaMemcpyAsync(...,stream.value()));
kernel<<<..., stream>>>(...);
thrust::algorithm(rmm::exec_policy(stream)->on(stream), ...);
CUDA_TRY(cudaStreamSynchronize(stream));
stream.synchronize();
RMM_FREE(...,stream);
}
} // namespace detail
Expand Down
4 changes: 3 additions & 1 deletion cpp/include/cudf/ast/detail/transform.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@
#include <cudf/table/table_view.hpp>
#include <cudf/types.hpp>

#include <rmm/cuda_stream_view.hpp>

#include <cstring>
#include <numeric>

Expand Down Expand Up @@ -369,7 +371,7 @@ struct ast_plan {
std::unique_ptr<column> compute_column(
table_view const table,
expression const& expr,
cudaStream_t stream = 0,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
} // namespace detail

Expand Down
13 changes: 6 additions & 7 deletions cpp/include/cudf/column/column.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION.
* Copyright (c) 2019-2020, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -15,10 +15,12 @@
*/
#pragma once

#include <cudf/column/column_view.hpp>

#include <cudf/null_mask.hpp>
#include <cudf/types.hpp>
#include "column_view.hpp"

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>

#include <memory>
Expand Down Expand Up @@ -50,9 +52,6 @@ class column {
/**
* @brief Construct a new column by deep copying the contents of `other`.
*
* All device memory allocation and copying is done using the
* `device_memory_resource` and `stream` from `other`.
*
* @param other The column to copy
**/
column(column const& other);
Expand All @@ -69,7 +68,7 @@ class column {
* @param mr Device memory resource to use for all device memory allocations
*/
column(column const& other,
cudaStream_t stream,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
Expand Down Expand Up @@ -124,7 +123,7 @@ class column {
* @param mr Device memory resource to use for all device memory allocations
*/
explicit column(column_view view,
cudaStream_t stream = 0,
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
Expand Down
11 changes: 7 additions & 4 deletions cpp/include/cudf/column/column_device_view.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,6 @@
*/
#pragma once

#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <cudf/column/column_view.hpp>
#include <cudf/fixed_point/fixed_point.hpp>
#include <cudf/lists/list_view.cuh>
Expand All @@ -28,6 +26,11 @@
#include <cudf/utilities/traits.hpp>
#include <cudf/utilities/type_dispatcher.hpp>

#include <rmm/cuda_stream_view.hpp>

#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>

/**
* @file column_device_view.cuh
* @brief Column device view class definitons
Expand Down Expand Up @@ -386,7 +389,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base {
*`source_view` available in device memory.
*/
static std::unique_ptr<column_device_view, std::function<void(column_device_view*)>> create(
column_view source_view, cudaStream_t stream = 0);
column_view source_view, rmm::cuda_stream_view stream = rmm::cuda_stream_default);

/**
* @brief Destroy the `column_device_view` object.
Expand Down Expand Up @@ -480,7 +483,7 @@ class alignas(16) mutable_column_device_view : public detail::column_device_view
*/
static std::unique_ptr<mutable_column_device_view,
std::function<void(mutable_column_device_view*)>>
create(mutable_column_view source_view, cudaStream_t stream = 0);
create(mutable_column_view source_view, rmm::cuda_stream_view stream = rmm::cuda_stream_default);

/**
* @brief Returns pointer to the base device memory allocation casted to
Expand Down
Loading

0 comments on commit dea44d9

Please sign in to comment.