Skip to content

Commit

Permalink
Merge branch 'branch-22.02' of https://github.com/rapidsai/cudf into …
Browse files Browse the repository at this point in the history
…bug-racecheck-orc
  • Loading branch information
vuule committed Jan 4, 2022
2 parents 6116a12 + d69ea61 commit 188f7b2
Show file tree
Hide file tree
Showing 44 changed files with 2,139 additions and 1,596 deletions.
229 changes: 226 additions & 3 deletions CHANGELOG.md

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ git submodule update --init --remote --recursive
```bash
# create the conda environment (assuming in base `cudf` directory)
# note: RAPIDS currently doesn't support `channel_priority: strict`; use `channel_priority: flexible` instead
conda env create --name cudf_dev --file conda/environments/cudf_dev_cuda11.0.yml
conda env create --name cudf_dev --file conda/environments/cudf_dev_cuda11.5.yml
# activate the environment
conda activate cudf_dev
```
Expand Down
2 changes: 1 addition & 1 deletion ci/gpu/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,7 @@ else
KAFKA_CONDA_FILE=${KAFKA_CONDA_FILE//-/=} #convert to conda install

gpuci_logger "Installing $CUDF_CONDA_FILE & $KAFKA_CONDA_FILE"
conda install -c ${CONDA_ARTIFACT_PATH} "$CUDF_CONDA_FILE" "$KAFKA_CONDA_FILE"
gpuci_mamba_retry install -c ${CONDA_ARTIFACT_PATH} "$CUDF_CONDA_FILE" "$KAFKA_CONDA_FILE"

install_dask

Expand Down
25 changes: 23 additions & 2 deletions cpp/benchmarks/common/generate_benchmark_input.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -161,8 +161,29 @@ struct random_value_fn<T, typename std::enable_if_t<cudf::is_chrono<T>()>> {
*/
template <typename T>
struct random_value_fn<T, typename std::enable_if_t<cudf::is_fixed_point<T>()>> {
random_value_fn(distribution_params<T> const&) {}
T operator()(std::mt19937& engine) { CUDF_FAIL("Not implemented"); }
using rep = typename T::rep;
rep const lower_bound;
rep const upper_bound;
distribution_fn<rep> dist;
std::optional<numeric::scale_type> scale;

random_value_fn(distribution_params<rep> const& desc)
: lower_bound{desc.lower_bound},
upper_bound{desc.upper_bound},
dist{make_distribution<rep>(desc.id, desc.lower_bound, desc.upper_bound)}
{
}

T operator()(std::mt19937& engine)
{
if (not scale.has_value()) {
int const max_scale = std::numeric_limits<rep>::digits10;
auto scale_dist = make_distribution<int>(distribution_id::NORMAL, -max_scale, max_scale);
scale = numeric::scale_type{std::max(std::min(scale_dist(engine), max_scale), -max_scale)};
}
// Clamp the generated random value to the specified range
return T{std::max(std::min(dist(engine), upper_bound), lower_bound), *scale};
}
};

/**
Expand Down
13 changes: 11 additions & 2 deletions cpp/benchmarks/common/generate_benchmark_input.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,7 @@ class data_profile {
distribution_params<cudf::string_view> string_dist_desc{{distribution_id::NORMAL, 0, 32}};
distribution_params<cudf::list_view> list_dist_desc{
cudf::type_id::INT32, {distribution_id::GEOMETRIC, 0, 100}, 2};
std::map<cudf::type_id, distribution_params<__uint128_t>> decimal_params;

double bool_probability = 0.5;
double null_frequency = 0.01;
Expand Down Expand Up @@ -284,9 +285,17 @@ class data_profile {
}

template <typename T, typename std::enable_if_t<cudf::is_fixed_point<T>()>* = nullptr>
distribution_params<T> get_distribution_params() const
distribution_params<typename T::rep> get_distribution_params() const
{
CUDF_FAIL("Not implemented");
using rep = typename T::rep;
auto it = decimal_params.find(cudf::type_to_id<T>());
if (it == decimal_params.end()) {
auto const range = default_range<rep>();
return distribution_params<rep>{default_distribution_id<rep>(), range.first, range.second};
} else {
auto& desc = it->second;
return {desc.id, static_cast<rep>(desc.lower_bound), static_cast<rep>(desc.upper_bound)};
}
}

auto get_bool_probability() const { return bool_probability; }
Expand Down
27 changes: 16 additions & 11 deletions cpp/benchmarks/common/random_distribution_factory.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,19 +21,24 @@
#include <memory>
#include <random>

/**
* @brief Generates a binomial distribution (a discrete stand-in for a normal distribution) over the integers in [0, upper_bound].
*/
template <typename T, typename std::enable_if_t<std::is_integral<T>::value, T>* = nullptr>
auto make_normal_dist(T range_start, T range_end)
auto make_normal_dist(T upper_bound)
{
using uT = typename std::make_unsigned<T>::type;
uT const range_size = range_end - range_start;
return std::binomial_distribution<uT>(range_size, 0.5);
using uT = typename std::make_unsigned<T>::type;
return std::binomial_distribution<uT>(upper_bound, 0.5);
}

/**
* @brief Generates a normal distribution with mean upper_bound / 2 and standard deviation upper_bound / 6; samples are not clamped to [0, upper_bound].
*/
template <typename T, std::enable_if_t<cudf::is_floating_point<T>()>* = nullptr>
auto make_normal_dist(T range_start, T range_end)
auto make_normal_dist(T upper_bound)
{
T const mean = range_start / 2 + range_end / 2;
T const stddev = range_end / 6 - range_start / 6;
T const mean = upper_bound / 2;
T const stddev = upper_bound / 6;
return std::normal_distribution<T>(mean, stddev);
}

Expand Down Expand Up @@ -82,8 +87,8 @@ distribution_fn<T> make_distribution(distribution_id did, T lower_bound, T upper
{
switch (did) {
case distribution_id::NORMAL:
return [lower_bound, dist = make_normal_dist(lower_bound, upper_bound)](
std::mt19937& engine) mutable -> T { return dist(engine) - lower_bound; };
return [lower_bound, dist = make_normal_dist(upper_bound - lower_bound)](
std::mt19937& engine) mutable -> T { return dist(engine) + lower_bound; };
case distribution_id::UNIFORM:
return [dist = make_uniform_dist(lower_bound, upper_bound)](
std::mt19937& engine) mutable -> T { return dist(engine); };
Expand All @@ -104,8 +109,8 @@ distribution_fn<T> make_distribution(distribution_id dist_id, T lower_bound, T u
{
switch (dist_id) {
case distribution_id::NORMAL:
return [dist = make_normal_dist(lower_bound, upper_bound)](
std::mt19937& engine) mutable -> T { return dist(engine); };
return [lower_bound, dist = make_normal_dist(upper_bound - lower_bound)](
std::mt19937& engine) mutable -> T { return dist(engine) + lower_bound; };
case distribution_id::UNIFORM:
return [dist = make_uniform_dist(lower_bound, upper_bound)](
std::mt19937& engine) mutable -> T { return dist(engine); };
Expand Down
2 changes: 2 additions & 0 deletions cpp/benchmarks/io/csv/csv_reader_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ void BM_csv_read_varying_options(benchmark::State& state)
auto const data_types =
dtypes_for_column_selection(get_type_or_group({int32_t(type_group_id::INTEGRAL),
int32_t(type_group_id::FLOATING_POINT),
int32_t(type_group_id::FIXED_POINT),
int32_t(type_group_id::TIMESTAMP),
int32_t(cudf::type_id::STRING)}),
col_sel);
Expand Down Expand Up @@ -143,6 +144,7 @@ void BM_csv_read_varying_options(benchmark::State& state)

RD_BENCHMARK_DEFINE_ALL_SOURCES(CSV_RD_BM_INPUTS_DEFINE, integral, type_group_id::INTEGRAL);
RD_BENCHMARK_DEFINE_ALL_SOURCES(CSV_RD_BM_INPUTS_DEFINE, floats, type_group_id::FLOATING_POINT);
RD_BENCHMARK_DEFINE_ALL_SOURCES(CSV_RD_BM_INPUTS_DEFINE, decimal, type_group_id::FIXED_POINT);
RD_BENCHMARK_DEFINE_ALL_SOURCES(CSV_RD_BM_INPUTS_DEFINE, timestamps, type_group_id::TIMESTAMP);
RD_BENCHMARK_DEFINE_ALL_SOURCES(CSV_RD_BM_INPUTS_DEFINE, string, cudf::type_id::STRING);

Expand Down
2 changes: 2 additions & 0 deletions cpp/benchmarks/io/csv/csv_writer_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ void BM_csv_write_varying_options(benchmark::State& state)

auto const data_types = get_type_or_group({int32_t(type_group_id::INTEGRAL),
int32_t(type_group_id::FLOATING_POINT),
int32_t(type_group_id::FIXED_POINT),
int32_t(type_group_id::TIMESTAMP),
int32_t(cudf::type_id::STRING)});

Expand Down Expand Up @@ -96,6 +97,7 @@ void BM_csv_write_varying_options(benchmark::State& state)

WR_BENCHMARK_DEFINE_ALL_SINKS(CSV_WR_BM_INOUTS_DEFINE, integral, type_group_id::INTEGRAL);
WR_BENCHMARK_DEFINE_ALL_SINKS(CSV_WR_BM_INOUTS_DEFINE, floats, type_group_id::FLOATING_POINT);
WR_BENCHMARK_DEFINE_ALL_SINKS(CSV_WR_BM_INOUTS_DEFINE, decimal, type_group_id::FIXED_POINT);
WR_BENCHMARK_DEFINE_ALL_SINKS(CSV_WR_BM_INOUTS_DEFINE, timestamps, type_group_id::TIMESTAMP);
WR_BENCHMARK_DEFINE_ALL_SINKS(CSV_WR_BM_INOUTS_DEFINE, string, cudf::type_id::STRING);

Expand Down
5 changes: 4 additions & 1 deletion cpp/benchmarks/io/orc/orc_reader_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -91,8 +91,10 @@ void BM_orc_read_varying_options(benchmark::State& state)
auto const data_types =
dtypes_for_column_selection(get_type_or_group({int32_t(type_group_id::INTEGRAL_SIGNED),
int32_t(type_group_id::FLOATING_POINT),
int32_t(type_group_id::FIXED_POINT),
int32_t(type_group_id::TIMESTAMP),
int32_t(cudf::type_id::STRING)}),
int32_t(cudf::type_id::STRING),
int32_t(cudf::type_id::LIST)}),
col_sel);
auto const tbl = create_random_table(data_types, data_types.size(), table_size_bytes{data_size});
auto const view = tbl->view();
Expand Down Expand Up @@ -158,6 +160,7 @@ void BM_orc_read_varying_options(benchmark::State& state)

RD_BENCHMARK_DEFINE_ALL_SOURCES(ORC_RD_BM_INPUTS_DEFINE, integral, type_group_id::INTEGRAL_SIGNED);
RD_BENCHMARK_DEFINE_ALL_SOURCES(ORC_RD_BM_INPUTS_DEFINE, floats, type_group_id::FLOATING_POINT);
RD_BENCHMARK_DEFINE_ALL_SOURCES(ORC_RD_BM_INPUTS_DEFINE, decimal, type_group_id::FIXED_POINT);
RD_BENCHMARK_DEFINE_ALL_SOURCES(ORC_RD_BM_INPUTS_DEFINE, timestamps, type_group_id::TIMESTAMP);
RD_BENCHMARK_DEFINE_ALL_SOURCES(ORC_RD_BM_INPUTS_DEFINE, string, cudf::type_id::STRING);
RD_BENCHMARK_DEFINE_ALL_SOURCES(ORC_RD_BM_INPUTS_DEFINE, list, cudf::type_id::LIST);
Expand Down
5 changes: 4 additions & 1 deletion cpp/benchmarks/io/orc/orc_writer_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,8 +70,10 @@ void BM_orc_write_varying_options(benchmark::State& state)

auto const data_types = get_type_or_group({int32_t(type_group_id::INTEGRAL_SIGNED),
int32_t(type_group_id::FLOATING_POINT),
int32_t(type_group_id::FIXED_POINT),
int32_t(type_group_id::TIMESTAMP),
int32_t(cudf::type_id::STRING)});
int32_t(cudf::type_id::STRING),
int32_t(cudf::type_id::LIST)});

auto const tbl = create_random_table(data_types, data_types.size(), table_size_bytes{data_size});
auto const view = tbl->view();
Expand Down Expand Up @@ -101,6 +103,7 @@ void BM_orc_write_varying_options(benchmark::State& state)

WR_BENCHMARK_DEFINE_ALL_SINKS(ORC_WR_BM_INOUTS_DEFINE, integral, type_group_id::INTEGRAL_SIGNED);
WR_BENCHMARK_DEFINE_ALL_SINKS(ORC_WR_BM_INOUTS_DEFINE, floats, type_group_id::FLOATING_POINT);
WR_BENCHMARK_DEFINE_ALL_SINKS(ORC_WR_BM_INOUTS_DEFINE, decimal, type_group_id::FIXED_POINT);
WR_BENCHMARK_DEFINE_ALL_SINKS(ORC_WR_BM_INOUTS_DEFINE, timestamps, type_group_id::TIMESTAMP);
WR_BENCHMARK_DEFINE_ALL_SINKS(ORC_WR_BM_INOUTS_DEFINE, string, cudf::type_id::STRING);
WR_BENCHMARK_DEFINE_ALL_SINKS(ORC_WR_BM_INOUTS_DEFINE, list, cudf::type_id::LIST);
Expand Down
5 changes: 4 additions & 1 deletion cpp/benchmarks/io/parquet/parquet_reader_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -92,8 +92,10 @@ void BM_parq_read_varying_options(benchmark::State& state)
auto const data_types =
dtypes_for_column_selection(get_type_or_group({int32_t(type_group_id::INTEGRAL),
int32_t(type_group_id::FLOATING_POINT),
int32_t(type_group_id::FIXED_POINT),
int32_t(type_group_id::TIMESTAMP),
int32_t(cudf::type_id::STRING)}),
int32_t(cudf::type_id::STRING),
int32_t(cudf::type_id::LIST)}),
col_sel);
auto const tbl = create_random_table(data_types, data_types.size(), table_size_bytes{data_size});
auto const view = tbl->view();
Expand Down Expand Up @@ -160,6 +162,7 @@ void BM_parq_read_varying_options(benchmark::State& state)

RD_BENCHMARK_DEFINE_ALL_SOURCES(PARQ_RD_BM_INPUTS_DEFINE, integral, type_group_id::INTEGRAL);
RD_BENCHMARK_DEFINE_ALL_SOURCES(PARQ_RD_BM_INPUTS_DEFINE, floats, type_group_id::FLOATING_POINT);
RD_BENCHMARK_DEFINE_ALL_SOURCES(PARQ_RD_BM_INPUTS_DEFINE, decimal, type_group_id::FIXED_POINT);
RD_BENCHMARK_DEFINE_ALL_SOURCES(PARQ_RD_BM_INPUTS_DEFINE, timestamps, type_group_id::TIMESTAMP);
RD_BENCHMARK_DEFINE_ALL_SOURCES(PARQ_RD_BM_INPUTS_DEFINE, string, cudf::type_id::STRING);
RD_BENCHMARK_DEFINE_ALL_SOURCES(PARQ_RD_BM_INPUTS_DEFINE, list, cudf::type_id::LIST);
Expand Down
5 changes: 4 additions & 1 deletion cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,10 @@ void BM_parq_write_varying_options(benchmark::State& state)

auto const data_types = get_type_or_group({int32_t(type_group_id::INTEGRAL_SIGNED),
int32_t(type_group_id::FLOATING_POINT),
int32_t(type_group_id::FIXED_POINT),
int32_t(type_group_id::TIMESTAMP),
int32_t(cudf::type_id::STRING)});
int32_t(cudf::type_id::STRING),
int32_t(cudf::type_id::LIST)});

auto const tbl = create_random_table(data_types, data_types.size(), table_size_bytes{data_size});
auto const view = tbl->view();
Expand Down Expand Up @@ -103,6 +105,7 @@ void BM_parq_write_varying_options(benchmark::State& state)

WR_BENCHMARK_DEFINE_ALL_SINKS(PARQ_WR_BM_INOUTS_DEFINE, integral, type_group_id::INTEGRAL);
WR_BENCHMARK_DEFINE_ALL_SINKS(PARQ_WR_BM_INOUTS_DEFINE, floats, type_group_id::FLOATING_POINT);
WR_BENCHMARK_DEFINE_ALL_SINKS(PARQ_WR_BM_INOUTS_DEFINE, decimal, type_group_id::FIXED_POINT);
WR_BENCHMARK_DEFINE_ALL_SINKS(PARQ_WR_BM_INOUTS_DEFINE, timestamps, type_group_id::TIMESTAMP);
WR_BENCHMARK_DEFINE_ALL_SINKS(PARQ_WR_BM_INOUTS_DEFINE, string, cudf::type_id::STRING);
WR_BENCHMARK_DEFINE_ALL_SINKS(PARQ_WR_BM_INOUTS_DEFINE, list, cudf::type_id::LIST);
Expand Down
Loading

0 comments on commit 188f7b2

Please sign in to comment.