
Commit

Merge branch 'branch-24.08' of github.com:rapidsai/cudf into pylibcudf-io-writers

lithomas1 committed Jun 29, 2024
2 parents e940e30 + 3c3edfe commit e57a677
Showing 87 changed files with 2,899 additions and 1,573 deletions.
13 changes: 6 additions & 7 deletions CONTRIBUTING.md
@@ -71,15 +71,14 @@ for a minimal build of libcudf without using conda are also listed below.

Compilers:

* `gcc` version 9.3+
* `nvcc` version 11.5+
* `cmake` version 3.26.4+
* `gcc` version 11.4+
* `nvcc` version 11.8+
* `cmake` version 3.29.6+

CUDA/GPU:
CUDA/GPU Runtime:

* CUDA 11.5+
* NVIDIA driver 450.80.02+
* Volta architecture or better (Compute Capability >=7.0)
* CUDA 11.4+
* Volta architecture or better ([Compute Capability](https://docs.nvidia.com/deploy/cuda-compatibility/) >=7.0)

You can obtain CUDA from
[https://developer.nvidia.com/cuda-downloads](https://developer.nvidia.com/cuda-downloads).
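
A minimal standalone check (a sketch, not a file in this repository) can confirm that a machine meets these runtime requirements by querying the CUDA runtime, the driver, and the device's compute capability:

```c++
// check_requirements.cu: illustrative sketch only, not part of cudf.
// Queries CUDA runtime/driver versions and the GPU's compute capability, then
// compares them against the minimums listed above (CUDA 11.4+, compute capability >= 7.0).
#include <cuda_runtime.h>

#include <cstdio>

int main()
{
  int runtime_version = 0;
  int driver_version  = 0;
  cudaRuntimeGetVersion(&runtime_version);  // e.g. 11040 encodes CUDA 11.4
  cudaDriverGetVersion(&driver_version);

  cudaDeviceProp prop{};
  cudaGetDeviceProperties(&prop, 0);

  std::printf("runtime %d, driver %d, compute capability %d.%d\n",
              runtime_version, driver_version, prop.major, prop.minor);

  bool const meets_requirements = runtime_version >= 11040 && prop.major >= 7;
  return meets_requirements ? 0 : 1;
}
```

Compile with `nvcc check_requirements.cu -o check_requirements` and run it on the target machine.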
50 changes: 42 additions & 8 deletions cpp/benchmarks/io/parquet/parquet_reader_input.cpp
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION.
* Copyright (c) 2022-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -59,20 +59,18 @@ void parquet_read_common(cudf::size_type num_rows_to_read,
}

template <data_type DataType>
void BM_parquet_read_data(nvbench::state& state, nvbench::type_list<nvbench::enum_type<DataType>>)
void BM_parquet_read_data_common(nvbench::state& state,
data_profile const& profile,
nvbench::type_list<nvbench::enum_type<DataType>>)
{
auto const d_type = get_type_or_group(static_cast<int32_t>(DataType));
auto const cardinality = static_cast<cudf::size_type>(state.get_int64("cardinality"));
auto const run_length = static_cast<cudf::size_type>(state.get_int64("run_length"));
auto const source_type = retrieve_io_type_enum(state.get_string("io_type"));
auto const compression = cudf::io::compression_type::SNAPPY;
cuio_source_sink_pair source_sink(source_type);

auto const num_rows_written = [&]() {
auto const tbl = create_random_table(
cycle_dtypes(d_type, num_cols),
table_size_bytes{data_size},
data_profile_builder().cardinality(cardinality).avg_run_length(run_length));
auto const tbl =
create_random_table(cycle_dtypes(d_type, num_cols), table_size_bytes{data_size}, profile);
auto const view = tbl->view();

cudf::io::parquet_writer_options write_opts =
@@ -85,6 +83,32 @@ void BM_parquet_read_data(nvbench::state& state, nvbench::type_list<nvbench::enu
parquet_read_common(num_rows_written, num_cols, source_sink, state);
}

template <data_type DataType>
void BM_parquet_read_data(nvbench::state& state,
nvbench::type_list<nvbench::enum_type<DataType>> type_list)
{
auto const cardinality = static_cast<cudf::size_type>(state.get_int64("cardinality"));
auto const run_length = static_cast<cudf::size_type>(state.get_int64("run_length"));
BM_parquet_read_data_common<DataType>(
state, data_profile_builder().cardinality(cardinality).avg_run_length(run_length), type_list);
}

template <data_type DataType>
void BM_parquet_read_fixed_width_struct(nvbench::state& state,
nvbench::type_list<nvbench::enum_type<DataType>> type_list)
{
auto const cardinality = static_cast<cudf::size_type>(state.get_int64("cardinality"));
auto const run_length = static_cast<cudf::size_type>(state.get_int64("run_length"));
std::vector<cudf::type_id> s_types{
cudf::type_id::INT32, cudf::type_id::FLOAT32, cudf::type_id::INT64};
BM_parquet_read_data_common<DataType>(state,
data_profile_builder()
.cardinality(cardinality)
.avg_run_length(run_length)
.struct_types(s_types),
type_list);
}

void BM_parquet_read_io_compression(nvbench::state& state)
{
auto const d_type = get_type_or_group({static_cast<int32_t>(data_type::INTEGRAL),
@@ -247,3 +271,13 @@ NVBENCH_BENCH(BM_parquet_read_io_small_mixed)
.add_int64_axis("cardinality", {0, 1000})
.add_int64_axis("run_length", {1, 32})
.add_int64_axis("num_string_cols", {1, 2, 3});

// a benchmark for structs that only contain fixed-width types
using d_type_list_struct_only = nvbench::enum_type_list<data_type::STRUCT>;
NVBENCH_BENCH_TYPES(BM_parquet_read_fixed_width_struct, NVBENCH_TYPE_AXES(d_type_list_struct_only))
.set_name("parquet_read_fixed_width_struct")
.set_type_axes_names({"data_type"})
.add_string_axis("io_type", {"DEVICE_BUFFER"})
.set_min_samples(4)
.add_int64_axis("cardinality", {0, 1000})
.add_int64_axis("run_length", {1, 32});
60 changes: 34 additions & 26 deletions cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md
@@ -1,4 +1,4 @@
# libcudf C++ Developer Guide
# libcudf C++ Developer Guide {#DEVELOPER_GUIDE}

This document serves as a guide for contributors to libcudf C++ code. Developers should also refer
to these additional files for further documentation of libcudf best practices.
@@ -469,7 +469,7 @@ libcudf throws under different circumstances, see the [section on error handling

# libcudf API and Implementation

## Streams
## Streams {#streams}

libcudf is in the process of adding support for asynchronous execution using
CUDA streams. In order to facilitate the usage of streams, all new libcudf APIs
@@ -486,33 +486,37 @@ use only asynchronous versions of CUDA APIs with the stream parameter.

In order to make the `detail` API callable from other libcudf functions, it should be exposed in a
header placed in the `cudf/cpp/include/detail/` directory.
The declaration is not necessary if no other libcudf functions call the `detail` function.

For example:

```c++
// cpp/include/cudf/header.hpp
void external_function(...);
void external_function(...,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());

// cpp/include/cudf/detail/header.hpp
namespace detail{
void external_function(..., rmm::cuda_stream_view stream)
void external_function(..., rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr)
} // namespace detail

// cudf/src/implementation.cpp
namespace detail{
// Use the stream parameter in the detail implementation.
void external_function(..., rmm::cuda_stream_view stream){
// Implementation uses the stream with async APIs.
rmm::device_buffer buff(...,stream);
CUDF_CUDA_TRY(cudaMemcpyAsync(...,stream.value()));
kernel<<<..., stream>>>(...);
thrust::algorithm(rmm::exec_policy(stream), ...);
}
// Use the stream parameter in the detail implementation.
void external_function(..., rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr){
// Implementation uses the stream with async APIs.
rmm::device_buffer buff(..., stream, mr);
CUDF_CUDA_TRY(cudaMemcpyAsync(...,stream.value()));
kernel<<<..., stream>>>(...);
thrust::algorithm(rmm::exec_policy(stream), ...);
}
} // namespace detail

void external_function(...){
CUDF_FUNC_RANGE(); // Generates an NVTX range for the lifetime of this function.
detail::external_function(..., cudf::get_default_stream());
void external_function(..., rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE(); // Generates an NVTX range for the lifetime of this function.
detail::external_function(..., stream, mr);
}
```
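
With this pattern in place, a caller that wants the work ordered on its own stream passes it through the public API. A minimal caller sketch (`external_function` is the placeholder name from the example above):

```c++
// Illustrative caller of the pattern above (sketch only).
rmm::cuda_stream stream;  // owns a new, non-default CUDA stream
auto result = cudf::external_function(/*...,*/ stream.view(),
                                      rmm::mr::get_current_device_resource());
// The work was enqueued on `stream`; synchronize before using the result on the host.
stream.synchronize();
```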
@@ -703,28 +707,28 @@ The preferred style for how inputs are passed in and outputs are returned is the
- `column_view const&`
- Tables:
- `table_view const&`
- Scalar:
- `scalar const&`
- Everything else:
- Trivial or inexpensively copied types
- Pass by value
- Non-trivial or expensive to copy types
- Pass by `const&`
- Scalar:
- `scalar const&`
- Everything else:
- Trivial or inexpensively copied types
- Pass by value
- Non-trivial or expensive to copy types
- Pass by `const&`
- In/Outs
- Columns:
- `mutable_column_view&`
- Tables:
- `mutable_table_view&`
- Everything else:
- Pass via raw pointer
- Everything else:
- Pass via raw pointer
- Outputs
- Outputs should be *returned*, i.e., no output parameters
- Columns:
- `std::unique_ptr<column>`
- Tables:
- `std::unique_ptr<table>`
- Scalars:
- `std::unique_ptr<scalar>`
- Scalars:
- `std::unique_ptr<scalar>`
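
Put together, a public API that follows these conventions has a signature along the lines of the sketch below (modeled loosely on `cudf::replace_nulls`; treat it as illustrative rather than the exact declaration):

```c++
// Illustrative signature only, not copied verbatim from the library.
std::unique_ptr<cudf::column> replace_nulls(
  cudf::column_view const& input,      // column input: passed as a view by const&
  cudf::scalar const& replacement,     // scalar input: passed by const&
  rmm::cuda_stream_view stream,        // stream on which to order the work
  rmm::device_async_resource_ref mr);  // memory resource for the returned column
// The output is *returned* as std::unique_ptr<column>; there are no output parameters.
```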


### Multiple Return Values
@@ -908,6 +912,10 @@ functions that are specific to columns of Strings. These functions reside in the
namespace. Similarly, functionality used exclusively for unit testing is in the `cudf::test::`
namespace.

The public function is expected to contain a call to `CUDF_FUNC_RANGE()` followed by a call to
a `detail` function with the same name and parameters as the public function.
See the [Streams](#streams) section for an example of this pattern.

### Internal

Many functions are not meant for public use, so place them in either the `detail` or an *anonymous*
50 changes: 50 additions & 0 deletions cpp/include/cudf/binaryop.hpp
@@ -91,6 +91,56 @@ enum class binary_operator : int32_t {
///< (null, false) is null, and (valid, valid) == LOGICAL_OR(valid, valid)
INVALID_BINARY ///< invalid operation
};

/// Binary operation common type default
template <typename L, typename R, typename = void>
struct binary_op_common_type {};

/// Binary operation common type specialization
template <typename L, typename R>
struct binary_op_common_type<L, R, std::enable_if_t<has_common_type_v<L, R>>> {
/// The common type of the template parameters
using type = std::common_type_t<L, R>;
};

/// Binary operation common type specialization
template <typename L, typename R>
struct binary_op_common_type<
L,
R,
std::enable_if_t<is_fixed_point<L>() && cuda::std::is_floating_point_v<R>>> {
/// The common type of the template parameters
using type = L;
};

/// Binary operation common type specialization
template <typename L, typename R>
struct binary_op_common_type<
L,
R,
std::enable_if_t<is_fixed_point<R>() && cuda::std::is_floating_point_v<L>>> {
/// The common type of the template parameters
using type = R;
};

/// Binary operation common type helper
template <typename L, typename R>
using binary_op_common_type_t = typename binary_op_common_type<L, R>::type;

namespace detail {
template <typename AlwaysVoid, typename L, typename R>
struct binary_op_has_common_type_impl : std::false_type {};

template <typename L, typename R>
struct binary_op_has_common_type_impl<std::void_t<binary_op_common_type_t<L, R>>, L, R>
: std::true_type {};
} // namespace detail

/// Checks if binary operation types have a common type
template <typename L, typename R>
constexpr inline bool binary_op_has_common_type_v =
detail::binary_op_has_common_type_impl<void, L, R>::value;
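
These traits are compile-time helpers; a short sketch of how they resolve, assuming they sit in the `cudf` namespace next to `binary_operator` as this hunk suggests:

```c++
// Compile-time illustration of the traits above (sketch; namespaces assumed from context).
#include <cudf/binaryop.hpp>
#include <cudf/fixed_point/fixed_point.hpp>

#include <type_traits>

// Two arithmetic types fall through to std::common_type.
static_assert(std::is_same_v<cudf::binary_op_common_type_t<int32_t, double>, double>);

// fixed_point paired with a floating-point type resolves to the fixed_point side.
static_assert(std::is_same_v<cudf::binary_op_common_type_t<numeric::decimal64, float>,
                             numeric::decimal64>);

// The variable template reports whether any common type exists for the pair.
static_assert(cudf::binary_op_has_common_type_v<int32_t, float>);
```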

/**
* @brief Performs a binary operation between a scalar and a column.
*
6 changes: 3 additions & 3 deletions cpp/include/cudf/detail/copy_if.cuh
@@ -18,7 +18,7 @@

#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/copy.hpp>
#include <cudf/copying.hpp>
#include <cudf/detail/gather.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/utilities/cuda.cuh>
@@ -242,8 +242,8 @@ struct scatter_gather_functor {
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
auto output_column = cudf::detail::allocate_like(
input, output_size, cudf::mask_allocation_policy::RETAIN, stream, mr);
auto output_column =
cudf::allocate_like(input, output_size, cudf::mask_allocation_policy::RETAIN, stream, mr);
auto output = output_column->mutable_view();

bool has_valid = input.nullable();
22 changes: 21 additions & 1 deletion cpp/include/cudf/detail/distinct_hash_join.cuh
@@ -42,6 +42,9 @@ template <typename Equal>
struct comparator_adapter {
comparator_adapter(Equal const& d_equal) : _d_equal{d_equal} {}

// suppress "function was declared but never referenced" warning
#pragma nv_diagnostic push
#pragma nv_diag_suppress 177
__device__ constexpr auto operator()(
cuco::pair<hash_value_type, lhs_index_type> const&,
cuco::pair<hash_value_type, lhs_index_type> const&) const noexcept
@@ -50,6 +53,14 @@ struct comparator_adapter {
return false;
}

__device__ constexpr auto operator()(
cuco::pair<hash_value_type, rhs_index_type> const&,
cuco::pair<hash_value_type, rhs_index_type> const&) const noexcept
{
// All build table keys are distinct thus `false` no matter what
return false;
}

__device__ constexpr auto operator()(
cuco::pair<hash_value_type, lhs_index_type> const& lhs,
cuco::pair<hash_value_type, rhs_index_type> const& rhs) const noexcept
@@ -58,6 +69,15 @@
return _d_equal(lhs.second, rhs.second);
}

__device__ constexpr auto operator()(
cuco::pair<hash_value_type, rhs_index_type> const& lhs,
cuco::pair<hash_value_type, lhs_index_type> const& rhs) const noexcept
{
if (lhs.first != rhs.first) { return false; }
return _d_equal(lhs.second, rhs.second);
}
#pragma nv_diagnostic pop

private:
Equal _d_equal;
};
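
The adapter's overload set encodes two rules: pairs from the same side can never match because all build-table keys are distinct, and cross-side pairs are rejected on the cheap hash comparison before the full row comparator runs. A host-side analog of that dispatch, as a sketch only (the real adapter is a device functor over `cuco::pair` keys; `probe_index` and `build_index` below are stand-ins, not cudf types):

```c++
// Host-side analog of the comparator_adapter dispatch above (illustrative sketch).
#include <cstdint>
#include <utility>

using hash_value = std::uint32_t;
struct probe_index { std::int32_t value; };  // stands in for lhs_index_type
struct build_index { std::int32_t value; };  // stands in for rhs_index_type

template <typename RowEqual>
struct comparator_sketch {
  RowEqual row_equal;

  // Same-side comparison: build keys are distinct, so this can never be a match.
  bool operator()(std::pair<hash_value, build_index> const&,
                  std::pair<hash_value, build_index> const&) const
  {
    return false;
  }

  // Cross-side comparison: cheap hash check first, then the full row comparison.
  bool operator()(std::pair<hash_value, probe_index> const& probe,
                  std::pair<hash_value, build_index> const& build) const
  {
    return probe.first == build.first && row_equal(probe.second.value, build.second.value);
  }
};
```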
@@ -94,7 +114,7 @@ struct distinct_hash_join {
using cuco_storage_type = cuco::storage<1>;

/// Hash table type
using hash_table_type = cuco::static_set<cuco::pair<hash_value_type, lhs_index_type>,
using hash_table_type = cuco::static_set<cuco::pair<hash_value_type, rhs_index_type>,
cuco::extent<size_type>,
cuda::thread_scope_device,
comparator_adapter<d_equal_type>,
13 changes: 6 additions & 7 deletions cpp/include/cudf/detail/gather.cuh
@@ -15,7 +15,7 @@
*/
#pragma once

#include <cudf/detail/copy.hpp>
#include <cudf/copying.hpp>
#include <cudf/detail/indexalator.cuh>
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/utilities/assert.cuh>
@@ -217,10 +217,9 @@ struct column_gatherer_impl<Element, std::enable_if_t<is_rep_layout_compatible<E
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
auto const num_rows = cudf::distance(gather_map_begin, gather_map_end);
auto const policy = cudf::mask_allocation_policy::NEVER;
auto destination_column =
cudf::detail::allocate_like(source_column, num_rows, policy, stream, mr);
auto const num_rows = cudf::distance(gather_map_begin, gather_map_end);
auto const policy = cudf::mask_allocation_policy::NEVER;
auto destination_column = cudf::allocate_like(source_column, num_rows, policy, stream, mr);

gather_helper(source_column.data<Element>(),
source_column.size(),
@@ -413,8 +412,8 @@ struct column_gatherer_impl<dictionary32> {
auto keys_copy = std::make_unique<column>(dictionary.keys(), stream, mr);
// Perform gather on just the indices
column_view indices = dictionary.get_indices_annotated();
auto new_indices = cudf::detail::allocate_like(
indices, output_count, cudf::mask_allocation_policy::NEVER, stream, mr);
auto new_indices =
cudf::allocate_like(indices, output_count, cudf::mask_allocation_policy::NEVER, stream, mr);
gather_helper(
cudf::detail::indexalator_factory::make_input_iterator(indices),
indices.size(),
2 changes: 0 additions & 2 deletions cpp/include/cudf/detail/stream_compaction.hpp
@@ -88,8 +88,6 @@ std::unique_ptr<table> distinct(table_view const& input,

/**
* @copydoc cudf::stable_distinct
*
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
std::unique_ptr<table> stable_distinct(table_view const& input,
std::vector<size_type> const& keys,