Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/branch-22.02' into refactor/mi…
Browse files Browse the repository at this point in the history
…sc_cleanup
  • Loading branch information
bdice committed Dec 23, 2021
2 parents 63e3896 + 04f4219 commit 3637574
Show file tree
Hide file tree
Showing 40 changed files with 2,228 additions and 906 deletions.
229 changes: 226 additions & 3 deletions CHANGELOG.md

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion ci/gpu/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,7 @@ else
KAFKA_CONDA_FILE=${KAFKA_CONDA_FILE//-/=} #convert to conda install

gpuci_logger "Installing $CUDF_CONDA_FILE & $KAFKA_CONDA_FILE"
conda install -c ${CONDA_ARTIFACT_PATH} "$CUDF_CONDA_FILE" "$KAFKA_CONDA_FILE"
gpuci_mamba_retry install -c ${CONDA_ARTIFACT_PATH} "$CUDF_CONDA_FILE" "$KAFKA_CONDA_FILE"

install_dask

Expand Down
25 changes: 23 additions & 2 deletions cpp/benchmarks/common/generate_benchmark_input.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -161,8 +161,29 @@ struct random_value_fn<T, typename std::enable_if_t<cudf::is_chrono<T>()>> {
*/
template <typename T>
struct random_value_fn<T, typename std::enable_if_t<cudf::is_fixed_point<T>()>> {
random_value_fn(distribution_params<T> const&) {}
T operator()(std::mt19937& engine) { CUDF_FAIL("Not implemented"); }
using rep = typename T::rep;
rep const lower_bound;
rep const upper_bound;
distribution_fn<rep> dist;
std::optional<numeric::scale_type> scale;

random_value_fn(distribution_params<rep> const& desc)
: lower_bound{desc.lower_bound},
upper_bound{desc.upper_bound},
dist{make_distribution<rep>(desc.id, desc.lower_bound, desc.upper_bound)}
{
}

T operator()(std::mt19937& engine)
{
if (not scale.has_value()) {
int const max_scale = std::numeric_limits<rep>::digits10;
auto scale_dist = make_distribution<int>(distribution_id::NORMAL, -max_scale, max_scale);
scale = numeric::scale_type{std::max(std::min(scale_dist(engine), max_scale), -max_scale)};
}
// Clamp the generated random value to the specified range
return T{std::max(std::min(dist(engine), upper_bound), lower_bound), *scale};
}
};

/**
Expand Down
13 changes: 11 additions & 2 deletions cpp/benchmarks/common/generate_benchmark_input.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,7 @@ class data_profile {
distribution_params<cudf::string_view> string_dist_desc{{distribution_id::NORMAL, 0, 32}};
distribution_params<cudf::list_view> list_dist_desc{
cudf::type_id::INT32, {distribution_id::GEOMETRIC, 0, 100}, 2};
std::map<cudf::type_id, distribution_params<__uint128_t>> decimal_params;

double bool_probability = 0.5;
double null_frequency = 0.01;
Expand Down Expand Up @@ -284,9 +285,17 @@ class data_profile {
}

template <typename T, typename std::enable_if_t<cudf::is_fixed_point<T>()>* = nullptr>
distribution_params<T> get_distribution_params() const
distribution_params<typename T::rep> get_distribution_params() const
{
CUDF_FAIL("Not implemented");
using rep = typename T::rep;
auto it = decimal_params.find(cudf::type_to_id<T>());
if (it == decimal_params.end()) {
auto const range = default_range<rep>();
return distribution_params<rep>{default_distribution_id<rep>(), range.first, range.second};
} else {
auto& desc = it->second;
return {desc.id, static_cast<rep>(desc.lower_bound), static_cast<rep>(desc.upper_bound)};
}
}

auto get_bool_probability() const { return bool_probability; }
Expand Down
27 changes: 16 additions & 11 deletions cpp/benchmarks/common/random_distribution_factory.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,19 +21,24 @@
#include <memory>
#include <random>

/**
* @brief Generates a normal(binomial) distribution between zero and upper_bound.
*/
template <typename T, typename std::enable_if_t<std::is_integral<T>::value, T>* = nullptr>
auto make_normal_dist(T range_start, T range_end)
auto make_normal_dist(T upper_bound)
{
using uT = typename std::make_unsigned<T>::type;
uT const range_size = range_end - range_start;
return std::binomial_distribution<uT>(range_size, 0.5);
using uT = typename std::make_unsigned<T>::type;
return std::binomial_distribution<uT>(upper_bound, 0.5);
}

/**
* @brief Generates a normal distribution between zero and upper_bound.
*/
template <typename T, std::enable_if_t<cudf::is_floating_point<T>()>* = nullptr>
auto make_normal_dist(T range_start, T range_end)
auto make_normal_dist(T upper_bound)
{
T const mean = range_start / 2 + range_end / 2;
T const stddev = range_end / 6 - range_start / 6;
T const mean = upper_bound / 2;
T const stddev = upper_bound / 6;
return std::normal_distribution<T>(mean, stddev);
}

Expand Down Expand Up @@ -82,8 +87,8 @@ distribution_fn<T> make_distribution(distribution_id did, T lower_bound, T upper
{
switch (did) {
case distribution_id::NORMAL:
return [lower_bound, dist = make_normal_dist(lower_bound, upper_bound)](
std::mt19937& engine) mutable -> T { return dist(engine) - lower_bound; };
return [lower_bound, dist = make_normal_dist(upper_bound - lower_bound)](
std::mt19937& engine) mutable -> T { return dist(engine) + lower_bound; };
case distribution_id::UNIFORM:
return [dist = make_uniform_dist(lower_bound, upper_bound)](
std::mt19937& engine) mutable -> T { return dist(engine); };
Expand All @@ -104,8 +109,8 @@ distribution_fn<T> make_distribution(distribution_id dist_id, T lower_bound, T u
{
switch (dist_id) {
case distribution_id::NORMAL:
return [dist = make_normal_dist(lower_bound, upper_bound)](
std::mt19937& engine) mutable -> T { return dist(engine); };
return [lower_bound, dist = make_normal_dist(upper_bound - lower_bound)](
std::mt19937& engine) mutable -> T { return dist(engine) + lower_bound; };
case distribution_id::UNIFORM:
return [dist = make_uniform_dist(lower_bound, upper_bound)](
std::mt19937& engine) mutable -> T { return dist(engine); };
Expand Down
2 changes: 2 additions & 0 deletions cpp/benchmarks/io/csv/csv_reader_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ void BM_csv_read_varying_options(benchmark::State& state)
auto const data_types =
dtypes_for_column_selection(get_type_or_group({int32_t(type_group_id::INTEGRAL),
int32_t(type_group_id::FLOATING_POINT),
int32_t(type_group_id::FIXED_POINT),
int32_t(type_group_id::TIMESTAMP),
int32_t(cudf::type_id::STRING)}),
col_sel);
Expand Down Expand Up @@ -143,6 +144,7 @@ void BM_csv_read_varying_options(benchmark::State& state)

RD_BENCHMARK_DEFINE_ALL_SOURCES(CSV_RD_BM_INPUTS_DEFINE, integral, type_group_id::INTEGRAL);
RD_BENCHMARK_DEFINE_ALL_SOURCES(CSV_RD_BM_INPUTS_DEFINE, floats, type_group_id::FLOATING_POINT);
RD_BENCHMARK_DEFINE_ALL_SOURCES(CSV_RD_BM_INPUTS_DEFINE, decimal, type_group_id::FIXED_POINT);
RD_BENCHMARK_DEFINE_ALL_SOURCES(CSV_RD_BM_INPUTS_DEFINE, timestamps, type_group_id::TIMESTAMP);
RD_BENCHMARK_DEFINE_ALL_SOURCES(CSV_RD_BM_INPUTS_DEFINE, string, cudf::type_id::STRING);

Expand Down
2 changes: 2 additions & 0 deletions cpp/benchmarks/io/csv/csv_writer_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ void BM_csv_write_varying_options(benchmark::State& state)

auto const data_types = get_type_or_group({int32_t(type_group_id::INTEGRAL),
int32_t(type_group_id::FLOATING_POINT),
int32_t(type_group_id::FIXED_POINT),
int32_t(type_group_id::TIMESTAMP),
int32_t(cudf::type_id::STRING)});

Expand Down Expand Up @@ -96,6 +97,7 @@ void BM_csv_write_varying_options(benchmark::State& state)

WR_BENCHMARK_DEFINE_ALL_SINKS(CSV_WR_BM_INOUTS_DEFINE, integral, type_group_id::INTEGRAL);
WR_BENCHMARK_DEFINE_ALL_SINKS(CSV_WR_BM_INOUTS_DEFINE, floats, type_group_id::FLOATING_POINT);
WR_BENCHMARK_DEFINE_ALL_SINKS(CSV_WR_BM_INOUTS_DEFINE, decimal, type_group_id::FIXED_POINT);
WR_BENCHMARK_DEFINE_ALL_SINKS(CSV_WR_BM_INOUTS_DEFINE, timestamps, type_group_id::TIMESTAMP);
WR_BENCHMARK_DEFINE_ALL_SINKS(CSV_WR_BM_INOUTS_DEFINE, string, cudf::type_id::STRING);

Expand Down
5 changes: 4 additions & 1 deletion cpp/benchmarks/io/orc/orc_reader_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -91,8 +91,10 @@ void BM_orc_read_varying_options(benchmark::State& state)
auto const data_types =
dtypes_for_column_selection(get_type_or_group({int32_t(type_group_id::INTEGRAL_SIGNED),
int32_t(type_group_id::FLOATING_POINT),
int32_t(type_group_id::FIXED_POINT),
int32_t(type_group_id::TIMESTAMP),
int32_t(cudf::type_id::STRING)}),
int32_t(cudf::type_id::STRING),
int32_t(cudf::type_id::LIST)}),
col_sel);
auto const tbl = create_random_table(data_types, data_types.size(), table_size_bytes{data_size});
auto const view = tbl->view();
Expand Down Expand Up @@ -158,6 +160,7 @@ void BM_orc_read_varying_options(benchmark::State& state)

RD_BENCHMARK_DEFINE_ALL_SOURCES(ORC_RD_BM_INPUTS_DEFINE, integral, type_group_id::INTEGRAL_SIGNED);
RD_BENCHMARK_DEFINE_ALL_SOURCES(ORC_RD_BM_INPUTS_DEFINE, floats, type_group_id::FLOATING_POINT);
RD_BENCHMARK_DEFINE_ALL_SOURCES(ORC_RD_BM_INPUTS_DEFINE, decimal, type_group_id::FIXED_POINT);
RD_BENCHMARK_DEFINE_ALL_SOURCES(ORC_RD_BM_INPUTS_DEFINE, timestamps, type_group_id::TIMESTAMP);
RD_BENCHMARK_DEFINE_ALL_SOURCES(ORC_RD_BM_INPUTS_DEFINE, string, cudf::type_id::STRING);
RD_BENCHMARK_DEFINE_ALL_SOURCES(ORC_RD_BM_INPUTS_DEFINE, list, cudf::type_id::LIST);
Expand Down
5 changes: 4 additions & 1 deletion cpp/benchmarks/io/orc/orc_writer_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,8 +70,10 @@ void BM_orc_write_varying_options(benchmark::State& state)

auto const data_types = get_type_or_group({int32_t(type_group_id::INTEGRAL_SIGNED),
int32_t(type_group_id::FLOATING_POINT),
int32_t(type_group_id::FIXED_POINT),
int32_t(type_group_id::TIMESTAMP),
int32_t(cudf::type_id::STRING)});
int32_t(cudf::type_id::STRING),
int32_t(cudf::type_id::LIST)});

auto const tbl = create_random_table(data_types, data_types.size(), table_size_bytes{data_size});
auto const view = tbl->view();
Expand Down Expand Up @@ -101,6 +103,7 @@ void BM_orc_write_varying_options(benchmark::State& state)

WR_BENCHMARK_DEFINE_ALL_SINKS(ORC_WR_BM_INOUTS_DEFINE, integral, type_group_id::INTEGRAL_SIGNED);
WR_BENCHMARK_DEFINE_ALL_SINKS(ORC_WR_BM_INOUTS_DEFINE, floats, type_group_id::FLOATING_POINT);
WR_BENCHMARK_DEFINE_ALL_SINKS(ORC_WR_BM_INOUTS_DEFINE, decimal, type_group_id::FIXED_POINT);
WR_BENCHMARK_DEFINE_ALL_SINKS(ORC_WR_BM_INOUTS_DEFINE, timestamps, type_group_id::TIMESTAMP);
WR_BENCHMARK_DEFINE_ALL_SINKS(ORC_WR_BM_INOUTS_DEFINE, string, cudf::type_id::STRING);
WR_BENCHMARK_DEFINE_ALL_SINKS(ORC_WR_BM_INOUTS_DEFINE, list, cudf::type_id::LIST);
Expand Down
5 changes: 4 additions & 1 deletion cpp/benchmarks/io/parquet/parquet_reader_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -92,8 +92,10 @@ void BM_parq_read_varying_options(benchmark::State& state)
auto const data_types =
dtypes_for_column_selection(get_type_or_group({int32_t(type_group_id::INTEGRAL),
int32_t(type_group_id::FLOATING_POINT),
int32_t(type_group_id::FIXED_POINT),
int32_t(type_group_id::TIMESTAMP),
int32_t(cudf::type_id::STRING)}),
int32_t(cudf::type_id::STRING),
int32_t(cudf::type_id::LIST)}),
col_sel);
auto const tbl = create_random_table(data_types, data_types.size(), table_size_bytes{data_size});
auto const view = tbl->view();
Expand Down Expand Up @@ -160,6 +162,7 @@ void BM_parq_read_varying_options(benchmark::State& state)

RD_BENCHMARK_DEFINE_ALL_SOURCES(PARQ_RD_BM_INPUTS_DEFINE, integral, type_group_id::INTEGRAL);
RD_BENCHMARK_DEFINE_ALL_SOURCES(PARQ_RD_BM_INPUTS_DEFINE, floats, type_group_id::FLOATING_POINT);
RD_BENCHMARK_DEFINE_ALL_SOURCES(PARQ_RD_BM_INPUTS_DEFINE, decimal, type_group_id::FIXED_POINT);
RD_BENCHMARK_DEFINE_ALL_SOURCES(PARQ_RD_BM_INPUTS_DEFINE, timestamps, type_group_id::TIMESTAMP);
RD_BENCHMARK_DEFINE_ALL_SOURCES(PARQ_RD_BM_INPUTS_DEFINE, string, cudf::type_id::STRING);
RD_BENCHMARK_DEFINE_ALL_SOURCES(PARQ_RD_BM_INPUTS_DEFINE, list, cudf::type_id::LIST);
Expand Down
5 changes: 4 additions & 1 deletion cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,10 @@ void BM_parq_write_varying_options(benchmark::State& state)

auto const data_types = get_type_or_group({int32_t(type_group_id::INTEGRAL_SIGNED),
int32_t(type_group_id::FLOATING_POINT),
int32_t(type_group_id::FIXED_POINT),
int32_t(type_group_id::TIMESTAMP),
int32_t(cudf::type_id::STRING)});
int32_t(cudf::type_id::STRING),
int32_t(cudf::type_id::LIST)});

auto const tbl = create_random_table(data_types, data_types.size(), table_size_bytes{data_size});
auto const view = tbl->view();
Expand Down Expand Up @@ -103,6 +105,7 @@ void BM_parq_write_varying_options(benchmark::State& state)

WR_BENCHMARK_DEFINE_ALL_SINKS(PARQ_WR_BM_INOUTS_DEFINE, integral, type_group_id::INTEGRAL);
WR_BENCHMARK_DEFINE_ALL_SINKS(PARQ_WR_BM_INOUTS_DEFINE, floats, type_group_id::FLOATING_POINT);
WR_BENCHMARK_DEFINE_ALL_SINKS(PARQ_WR_BM_INOUTS_DEFINE, decimal, type_group_id::FIXED_POINT);
WR_BENCHMARK_DEFINE_ALL_SINKS(PARQ_WR_BM_INOUTS_DEFINE, timestamps, type_group_id::TIMESTAMP);
WR_BENCHMARK_DEFINE_ALL_SINKS(PARQ_WR_BM_INOUTS_DEFINE, string, cudf::type_id::STRING);
WR_BENCHMARK_DEFINE_ALL_SINKS(PARQ_WR_BM_INOUTS_DEFINE, list, cudf::type_id::LIST);
Expand Down
102 changes: 100 additions & 2 deletions cpp/include/cudf/lists/contains.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ namespace lists {
*/

/**
* @brief Create a column of bool values indicating whether the specified scalar
* @brief Create a column of `bool` values indicating whether the specified scalar
* is an element of each row of a list column.
*
* The output column has as many elements as the input `lists` column.
Expand All @@ -51,7 +51,7 @@ std::unique_ptr<column> contains(
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Create a column of bool values indicating whether the list rows of the first
* @brief Create a column of `bool` values indicating whether the list rows of the first
* column contain the corresponding values in the second column
*
* The output column has as many elements as the input `lists` column.
Expand All @@ -74,6 +74,104 @@ std::unique_ptr<column> contains(
cudf::column_view const& search_keys,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Create a column of `bool` values indicating whether each row in the `lists` column
* contains at least one null element.
*
* The output column has as many elements as the input `lists` column.
* Output `column[i]` is set to null if the list row `lists[i]` is null.
* Otherwise, `column[i]` is set to a non-null boolean value, depending on whether that list
* contains a null element.
* (Empty list rows are considered *NOT* to contain a null element.)
*
* @param lists Lists column whose `n` rows are to be searched
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return std::unique_ptr<column> BOOL8 column of `n` rows with the result of the lookup
*/
std::unique_ptr<column> contains_nulls(
cudf::lists_column_view const& lists,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Option to choose whether `index_of()` returns the position of the first or the
* last match of a search key in a list row
*
* The underlying type is `int32_t`. `FIND_LAST` has no explicit initializer, so it takes
* the value following `FIND_FIRST` (i.e. 1).
*/
enum class duplicate_find_option : int32_t {
FIND_FIRST = 0, ///< Finds first instance of a search key in a list row.
FIND_LAST ///< Finds last instance of a search key in a list row.
};

/**
* @brief Create a column of `size_type` values indicating the position of a search key
* within each list row in the `lists` column
*
* The output column has as many elements as there are rows in the input `lists` column.
* Output `column[i]` contains a 0-based index indicating the position of the search key
* in each list, counting from the beginning of the list.
* Note:
* 1. If the `search_key` is null, all output rows are set to null.
* 2. If the row `lists[i]` is null, `output[i]` is also null.
* 3. If the row `lists[i]` does not contain the `search_key`, `output[i]` is set to `-1`.
* 4. In all other cases, `output[i]` is set to a non-negative `size_type` index.
*
* If `find_option == FIND_FIRST` (the default), the position of the first match for
* `search_key` in the list row is returned.
* If `find_option == FIND_LAST`, the position of the last match in the list row is
* returned.
*
* @param lists Lists column whose `n` rows are to be searched
* @param search_key The scalar key to be looked up in each list row
* @param find_option Whether to return the position of the first match (`FIND_FIRST`) or
* last (`FIND_LAST`)
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return std::unique_ptr<column> INT32 column of `n` rows with the position of `search_key`
* in each list row (null or `-1` as described in the notes above)
*
* @throw cudf::logic_error If `search_key` type does not match the element type in `lists`
* @throw cudf::logic_error If `search_key` is of a nested type, or `lists` contains nested
* elements (LIST, STRUCT)
*/
std::unique_ptr<column> index_of(
cudf::lists_column_view const& lists,
cudf::scalar const& search_key,
duplicate_find_option find_option = duplicate_find_option::FIND_FIRST,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Create a column of `size_type` values indicating the position of a search key
* row within the corresponding list row in the `lists` column
*
* The output column has as many elements as there are rows in the input `lists` column.
* Output `column[i]` contains a 0-based index indicating the position of each search key
* row in its corresponding list row, counting from the beginning of the list.
* Note:
* 1. If `search_keys[i]` is null, `output[i]` is also null.
* 2. If the row `lists[i]` is null, `output[i]` is also null.
* 3. If the row `lists[i]` does not contain `search_keys[i]`, `output[i]` is set to `-1`.
* 4. In all other cases, `output[i]` is set to a non-negative `size_type` index.
*
* If `find_option == FIND_FIRST` (the default), the position of the first match for
* `search_keys[i]` in the list row is returned.
* If `find_option == FIND_LAST`, the position of the last match in the list row is
* returned.
*
* @param lists Lists column whose `n` rows are to be searched
* @param search_keys A column of search keys to be looked up in each corresponding row of
* `lists`
* @param find_option Whether to return the position of the first match (`FIND_FIRST`) or
* last (`FIND_LAST`)
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return std::unique_ptr<column> INT32 column of `n` rows with the position of
* `search_keys[i]` in `lists[i]` (null or `-1` as described in the notes above)
*
* @throw cudf::logic_error If `search_keys` does not match `lists` in its number of rows
* @throw cudf::logic_error If `search_keys` type does not match the element type in `lists`
* @throw cudf::logic_error If `lists` or `search_keys` contains nested elements (LIST, STRUCT)
*/
std::unique_ptr<column> index_of(
cudf::lists_column_view const& lists,
cudf::column_view const& search_keys,
duplicate_find_option find_option = duplicate_find_option::FIND_FIRST,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */ // end of group
} // namespace lists
} // namespace cudf
Loading

0 comments on commit 3637574

Please sign in to comment.