Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replace direct cudaMemcpyAsync calls with utility functions (within /include) #17557

Merged
merged 9 commits into from
Dec 11, 2024
12 changes: 3 additions & 9 deletions cpp/include/cudf/detail/copy_if.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -373,15 +373,9 @@ std::unique_ptr<table> copy_if(table_view const& input,

// As it is InclusiveSum, last value in block_offsets will be output_size
// unless num_blocks == 1, in which case output_size is just block_counts[0]
cudf::size_type output_size{0};
CUDF_CUDA_TRY(cudaMemcpyAsync(
&output_size,
grid.num_blocks > 1 ? block_offsets.begin() + grid.num_blocks : block_counts.begin(),
sizeof(cudf::size_type),
cudaMemcpyDefault,
stream.value()));

stream.synchronize();
auto const src_span = device_span<cudf::size_type const>{
grid.num_blocks > 1 ? block_offsets.begin() + grid.num_blocks : block_counts.begin(), 1};
auto const output_size = cudf::detail::make_host_vector_sync(src_span, stream)[0];

if (output_size == input.num_rows()) {
return std::make_unique<table>(input, stream, mr);
Expand Down
10 changes: 4 additions & 6 deletions cpp/include/cudf/detail/get_value.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION.
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -17,6 +17,7 @@
#pragma once

#include <cudf/column/column_view.hpp>
#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/utilities/error.hpp>
#include <cudf/utilities/traits.hpp>
#include <cudf/utilities/type_dispatcher.hpp>
Expand Down Expand Up @@ -48,11 +49,8 @@ T get_value(column_view const& col_view, size_type element_index, rmm::cuda_stre
CUDF_EXPECTS(data_type(type_to_id<T>()) == col_view.type(), "get_value data type mismatch");
CUDF_EXPECTS(element_index >= 0 && element_index < col_view.size(),
"invalid element_index value");
T result;
CUDF_CUDA_TRY(cudaMemcpyAsync(
&result, col_view.data<T>() + element_index, sizeof(T), cudaMemcpyDefault, stream.value()));
stream.synchronize();
vuule marked this conversation as resolved.
Show resolved Hide resolved
return result;
return cudf::detail::make_host_vector_sync(
device_span<T const>{col_view.data<T>() + element_index, 1}, stream)[0];
vuule marked this conversation as resolved.
Show resolved Hide resolved
}

} // namespace detail
Expand Down
10 changes: 7 additions & 3 deletions cpp/include/cudf/table/table_device_view.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
#pragma once

#include <cudf/column/column_device_view.cuh>
#include <cudf/detail/utilities/cuda_memcpy.hpp>
#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/default_stream.hpp>
Expand Down Expand Up @@ -251,7 +253,7 @@ auto contiguous_copy_column_device_views(HostTableView source_view, rmm::cuda_st
// A buffer of CPU memory is allocated to hold the ColumnDeviceView
// objects. Once filled, the CPU memory is then copied to device memory
// and the pointer is set in the d_columns member.
std::vector<int8_t> h_buffer(padded_views_size_bytes);
auto h_buffer = cudf::detail::make_host_vector<int8_t>(padded_views_size_bytes, stream);
// Each ColumnDeviceView instance may have child objects which may
// require setting some internal device pointers before being copied
// from CPU to device.
Expand All @@ -266,8 +268,10 @@ auto contiguous_copy_column_device_views(HostTableView source_view, rmm::cuda_st
auto d_columns = detail::child_columns_to_device_array<ColumnDeviceView>(
source_view.begin(), source_view.end(), h_ptr, d_ptr);

CUDF_CUDA_TRY(cudaMemcpyAsync(d_ptr, h_ptr, views_size_bytes, cudaMemcpyDefault, stream.value()));
stream.synchronize();
auto const h_span = host_span<int8_t const>{h_buffer}.subspan(
static_cast<int8_t const*>(h_ptr) - h_buffer.data(), views_size_bytes);
auto const d_span = device_span<int8_t>{static_cast<int8_t*>(d_ptr), views_size_bytes};
cudf::detail::cuda_memcpy(d_span, h_span, stream);
return std::make_tuple(std::move(descendant_storage), d_columns);
}

Expand Down
Loading