Skip to content

Commit

Permalink
Convert cudaStream_t to rmm::cuda_stream_view (#325)
Browse files Browse the repository at this point in the history
This PR converts all usage of cudaStream_t in cuSpatial to rmm::cuda_stream_view, following on from rapidsai/cudf#6646 and rapidsai/cudf#6648

Also reorders stream parameters to occur before MR parameters in all functions.
  • Loading branch information
harrism authored Nov 24, 2020
1 parent ec8941b commit d4ed2d8
Show file tree
Hide file tree
Showing 26 changed files with 407 additions and 354 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,10 @@

## Improvements
- PR #310 Pin cmake policies to cmake 3.17 version
- PR #325 Convert `cudaStream_t` to `rmm::cuda_stream_view`

## Bug Fixes
- PR #320 Fix a quadtree construction bug - needs zero out device_uvector before scatter
- PR #320 Fix quadtree construction bug: zero out `device_uvector` before `scatter`

# cuSpatial 0.16.0 (Date TBD)

Expand Down
34 changes: 16 additions & 18 deletions cpp/benchmarks/synchronization/synchronization.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,47 +16,45 @@

#include "synchronization.hpp"

#include <rmm/device_buffer.hpp>
#include <cudf/utilities/error.hpp>

#define RMM_CUDA_ASSERT_OK(expr) \
do { \
cudaError_t const status = (expr); \
assert(cudaSuccess == status); \
} while (0);
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>

cuda_event_timer::cuda_event_timer(benchmark::State& state,
bool flush_l2_cache,
cudaStream_t stream)
rmm::cuda_stream_view stream)
: p_state(&state), stream(stream)
{
// flush all of L2$
if (flush_l2_cache) {
int current_device = 0;
RMM_CUDA_TRY(cudaGetDevice(&current_device));
CUDA_TRY(cudaGetDevice(&current_device));

int l2_cache_bytes = 0;
RMM_CUDA_TRY(cudaDeviceGetAttribute(&l2_cache_bytes, cudaDevAttrL2CacheSize, current_device));
CUDA_TRY(cudaDeviceGetAttribute(&l2_cache_bytes, cudaDevAttrL2CacheSize, current_device));

if (l2_cache_bytes > 0) {
const int memset_value = 0;
rmm::device_buffer l2_cache_buffer(l2_cache_bytes, stream);
RMM_CUDA_TRY(cudaMemsetAsync(l2_cache_buffer.data(), memset_value, l2_cache_bytes, stream));
CUDA_TRY(
cudaMemsetAsync(l2_cache_buffer.data(), memset_value, l2_cache_bytes, stream.value()));
}
}

RMM_CUDA_TRY(cudaEventCreate(&start));
RMM_CUDA_TRY(cudaEventCreate(&stop));
RMM_CUDA_TRY(cudaEventRecord(start, stream));
CUDA_TRY(cudaEventCreate(&start));
CUDA_TRY(cudaEventCreate(&stop));
CUDA_TRY(cudaEventRecord(start, stream.value()));
}

cuda_event_timer::~cuda_event_timer()
{
RMM_CUDA_ASSERT_OK(cudaEventRecord(stop, stream));
RMM_CUDA_ASSERT_OK(cudaEventSynchronize(stop));
CUDA_TRY(cudaEventRecord(stop, stream.value()));
CUDA_TRY(cudaEventSynchronize(stop));

float milliseconds = 0.0f;
RMM_CUDA_ASSERT_OK(cudaEventElapsedTime(&milliseconds, start, stop));
CUDA_TRY(cudaEventElapsedTime(&milliseconds, start, stop));
p_state->SetIterationTime(milliseconds / (1000.0f));
RMM_CUDA_ASSERT_OK(cudaEventDestroy(start));
RMM_CUDA_ASSERT_OK(cudaEventDestroy(stop));
CUDA_TRY(cudaEventDestroy(start));
CUDA_TRY(cudaEventDestroy(stop));
}
34 changes: 22 additions & 12 deletions cpp/benchmarks/synchronization/synchronization.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
/**
* @file synchronization.hpp
* @brief This is the header file for `cuda_event_timer`.
*/
**/

/**
* @brief This class serves as a wrapper for using `cudaEvent_t` as the user
Expand All @@ -33,7 +33,7 @@
for (auto _ : state){
cudaStream_t stream = 0;
rmm::cuda_stream_view stream{}; // default stream, could be another stream
// Create (Construct) an object of this class. You HAVE to pass in the
// benchmark::State object you are using. It measures the time from its
Expand All @@ -44,23 +44,29 @@
cuda_event_timer raii(state, true, stream); // flush_l2_cache = true
// Now perform the operations that is to be benchmarked
sample_kernel<<<1, 256, 0, stream>>>(); // Possibly launching a CUDA kernel
sample_kernel<<<1, 256, 0, stream.value()>>>(); // Possibly launching a CUDA kernel
}
}
// Register the function as a benchmark. You will need to set the `UseManualTime()`
// flag in order to use the timer embeded in this class.
// flag in order to use the timer embedded in this class.
BENCHMARK(sample_cuda_benchmark)->UseManualTime();
*/
**/

#pragma once
#ifndef CUDF_BENCH_SYNCHRONIZATION_H
#define CUDF_BENCH_SYNCHRONIZATION_H

// Google Benchmark library
#include <benchmark/benchmark.h>
#include <cuda_runtime_api.h>

#include <cudf/types.hpp>

#include <rmm/cuda_stream_view.hpp>

#include <driver_types.h>

class cuda_event_timer {
public:
Expand All @@ -73,21 +79,25 @@ class cuda_event_timer {
* @param[in] flush_l2_cache_ whether or not to flush the L2 cache before
* every iteration.
* @param[in] stream_ The CUDA stream we are measuring time on.
*/
cuda_event_timer(benchmark::State& state, bool flush_l2_cache, cudaStream_t stream_ = 0);
**/
cuda_event_timer(benchmark::State& state,
bool flush_l2_cache,
rmm::cuda_stream_view stream = rmm::cuda_stream_default);

// The user will HAVE to provide a benchmark::State object to set
// The user must provide a benchmark::State object to set
// the timer so we disable the default c'tor.
cuda_event_timer() = delete;

// The d'tor stops the timer and performs a synchroniazation.
// The d'tor stops the timer and performs a synchronization.
// Time of the benchmark::State object provided to the c'tor
// will be set to the value given by `cudaEventElapsedTime`.
~cuda_event_timer();

private:
cudaEvent_t start;
cudaEvent_t stop;
cudaStream_t stream;
rmm::cuda_stream_view stream;
benchmark::State* p_state;
};

#endif
26 changes: 17 additions & 9 deletions cpp/include/cuspatial/cubic_spline.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@

#include <cudf/column/column.hpp>
#include <cudf/table/table.hpp>
#include <cudf/types.hpp>

#include <memory>

namespace cuspatial {
Expand All @@ -43,16 +45,19 @@ namespace cuspatial {
* @param[in] offsets the exclusive scan of the spline sizes, prefixed by
* 0. For example, for 3 splines of 5 vertices each, the offsets input array
* is {0, 5, 10, 15}.
* @param[in] mr The memory resource to use for allocating output
*
* @return cudf::table_view of coefficients for spline interpolation. The size
 * of the table is ((M-n), 4) where M is `t.size()` and n is
* `ids.size()-1`.
**/
std::unique_ptr<cudf::column> cubicspline_interpolate(cudf::column_view const& query_points,
cudf::column_view const& spline_ids,
cudf::column_view const& offsets,
cudf::column_view const& source_points,
cudf::table_view const& coefficients);
std::unique_ptr<cudf::column> cubicspline_interpolate(
cudf::column_view const& query_points,
cudf::column_view const& spline_ids,
cudf::column_view const& offsets,
cudf::column_view const& source_points,
cudf::table_view const& coefficients,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Compute cubic interpolations of a set of points based on their
Expand All @@ -69,11 +74,14 @@ std::unique_ptr<cudf::column> cubicspline_interpolate(cudf::column_view const& q
* identify which specific spline a given query_point is interpolated with.
* @param[in] coefficients table of spline coefficients produced by
* cubicspline_coefficients.
* @param[in] mr The memory resource to use for allocating output
*
* @return cudf::column `y` coordinates interpolated from `x` and `coefs`.
**/
std::unique_ptr<cudf::table> cubicspline_coefficients(cudf::column_view const& t,
cudf::column_view const& y,
cudf::column_view const& ids,
cudf::column_view const& offsets);
std::unique_ptr<cudf::table> cubicspline_coefficients(
cudf::column_view const& t,
cudf::column_view const& y,
cudf::column_view const& ids,
cudf::column_view const& offsets,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
} // namespace cuspatial
39 changes: 21 additions & 18 deletions cpp/src/indexing/construction/detail/phase_1.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include <cudf/utilities/type_dispatcher.hpp>

#include <rmm/thrust_rmm_allocator.h>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>

#include <thrust/functional.h>
Expand Down Expand Up @@ -57,11 +58,11 @@ compute_point_keys_and_sorted_indices(cudf::column_view const &x,
T y_max,
T scale,
int8_t max_depth,
rmm::mr::device_memory_resource *mr,
cudaStream_t stream)
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource *mr)
{
rmm::device_uvector<uint32_t> keys(x.size(), stream);
thrust::transform(rmm::exec_policy(stream)->on(stream),
thrust::transform(rmm::exec_policy(stream)->on(stream.value()),
make_zip_iterator(x.begin<T>(), y.begin<T>()),
make_zip_iterator(x.begin<T>(), y.begin<T>()) + x.size(),
keys.begin(),
Expand All @@ -77,12 +78,12 @@ compute_point_keys_and_sorted_indices(cudf::column_view const &x,

auto indices = make_fixed_width_column<uint32_t>(keys.size(), stream, mr);

thrust::sequence(rmm::exec_policy(stream)->on(stream),
thrust::sequence(rmm::exec_policy(stream)->on(stream.value()),
indices->mutable_view().begin<uint32_t>(),
indices->mutable_view().end<uint32_t>());

// Sort the codes and point indices
thrust::stable_sort_by_key(rmm::exec_policy(stream)->on(stream),
thrust::stable_sort_by_key(rmm::exec_policy(stream)->on(stream.value()),
keys.begin(),
keys.end(),
indices->mutable_view().begin<int32_t>());
Expand All @@ -106,9 +107,9 @@ inline cudf::size_type build_tree_level(InputIterator1 keys_begin,
OutputIterator1 keys_out,
OutputIterator2 vals_out,
BinaryOp binary_op,
cudaStream_t stream)
rmm::cuda_stream_view stream)
{
auto result = thrust::reduce_by_key(rmm::exec_policy(stream)->on(stream),
auto result = thrust::reduce_by_key(rmm::exec_policy(stream)->on(stream.value()),
keys_begin,
keys_end,
vals_in,
Expand All @@ -133,7 +134,7 @@ build_tree_levels(int8_t max_depth,
KeysIterator keys_begin,
ValsIterator quad_point_count_begin,
ValsIterator quad_child_count_begin,
cudaStream_t stream)
rmm::cuda_stream_view stream)
{
// begin/end offsets
cudf::size_type begin{0};
Expand Down Expand Up @@ -193,7 +194,7 @@ reverse_tree_levels(rmm::device_uvector<uint32_t> const &quad_keys_in,
std::vector<cudf::size_type> const &begin_pos,
std::vector<cudf::size_type> const &end_pos,
int8_t max_depth,
cudaStream_t stream)
rmm::cuda_stream_view stream)
{
rmm::device_uvector<uint32_t> quad_keys(quad_keys_in.size(), stream);
rmm::device_uvector<uint8_t> quad_levels(quad_keys_in.size(), stream);
Expand All @@ -205,19 +206,19 @@ reverse_tree_levels(rmm::device_uvector<uint32_t> const &quad_keys_in,
cudf::size_type level_end = end_pos[level];
cudf::size_type level_begin = begin_pos[level];
cudf::size_type num_quads = level_end - level_begin;
thrust::fill(rmm::exec_policy(stream)->on(stream),
thrust::fill(rmm::exec_policy(stream)->on(stream.value()),
quad_levels.begin() + offset,
quad_levels.begin() + offset + num_quads,
level);
thrust::copy(rmm::exec_policy(stream)->on(stream),
thrust::copy(rmm::exec_policy(stream)->on(stream.value()),
quad_keys_in.begin() + level_begin,
quad_keys_in.begin() + level_end,
quad_keys.begin() + offset);
thrust::copy(rmm::exec_policy(stream)->on(stream),
thrust::copy(rmm::exec_policy(stream)->on(stream.value()),
quad_point_count_in.begin() + level_begin,
quad_point_count_in.begin() + level_end,
quad_point_count.begin() + offset);
thrust::copy(rmm::exec_policy(stream)->on(stream),
thrust::copy(rmm::exec_policy(stream)->on(stream.value()),
quad_child_count_in.begin() + level_begin,
quad_child_count_in.begin() + level_end,
quad_child_count.begin() + offset);
Expand Down Expand Up @@ -255,15 +256,15 @@ inline auto make_full_levels(cudf::column_view const &x,
T scale,
int8_t max_depth,
cudf::size_type min_size,
rmm::mr::device_memory_resource *mr,
cudaStream_t stream)
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource *mr)
{
// Compute point keys and sort into bottom-level quadrants
// (i.e. quads at level `max_depth - 1`)

// Compute Morton code (z-order) keys for each point
auto keys_and_indices = compute_point_keys_and_sorted_indices<T>(
x, y, x_min, x_max, y_min, y_max, scale, max_depth, mr, stream);
x, y, x_min, x_max, y_min, y_max, scale, max_depth, stream, mr);

auto &point_keys = keys_and_indices.first;
auto &point_indices = keys_and_indices.second;
Expand Down Expand Up @@ -291,8 +292,10 @@ inline auto make_full_levels(cudf::column_view const &x,
quad_child_count.resize(num_bottom_quads * (max_depth + 1), stream);

// Zero out the quad_child_count vector because we're reusing the point_keys vector
thrust::fill(
rmm::exec_policy(stream)->on(stream), quad_child_count.begin(), quad_child_count.end(), 0);
thrust::fill(rmm::exec_policy(stream)->on(stream.value()),
quad_child_count.begin(),
quad_child_count.end(),
0);

//
// Compute "full" quads for the tree at each level. Starting from the quadrant
Expand Down
Loading

0 comments on commit d4ed2d8

Please sign in to comment.