Skip to content

Commit

Permalink
Merge branch 'branch-22.10' into fix-floor-division-by-zero
Browse files Browse the repository at this point in the history
  • Loading branch information
brandon-b-miller committed Aug 23, 2022
2 parents cb5ddd8 + e431440 commit b2b22c8
Show file tree
Hide file tree
Showing 258 changed files with 9,487 additions and 4,034 deletions.
6 changes: 3 additions & 3 deletions ci/benchmark/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ export GBENCH_BENCHMARKS_DIR="$WORKSPACE/cpp/build/gbenchmarks/"
export LIBCUDF_KERNEL_CACHE_PATH="$HOME/.jitify-cache"

# Dask & Distributed: 1 installs the main (nightly) packages; any other value installs the `conda-forge` packages.
export INSTALL_DASK_MAIN=0
export INSTALL_DASK_MAIN=1

function remove_libcudf_kernel_cache_dir {
EXITCODE=$?
Expand Down Expand Up @@ -82,8 +82,8 @@ if [[ "${INSTALL_DASK_MAIN}" == 1 ]]; then
gpuci_logger "gpuci_mamba_retry update dask"
gpuci_mamba_retry update dask
else
gpuci_logger "gpuci_mamba_retry install conda-forge::dask==2022.7.1 conda-forge::distributed==2022.7.1 conda-forge::dask-core==2022.7.1 --force-reinstall"
gpuci_mamba_retry install conda-forge::dask==2022.7.1 conda-forge::distributed==2022.7.1 conda-forge::dask-core==2022.7.1 --force-reinstall
# Each package spec must be quoted: an unquoted `>` is parsed by the shell as an
# output redirection, which would strip the version constraint from the spec and
# send the command's stdout to a file named `=2022.7.1`.
gpuci_logger "gpuci_mamba_retry install conda-forge::dask>=2022.7.1 conda-forge::distributed>=2022.7.1 conda-forge::dask-core>=2022.7.1 --force-reinstall"
gpuci_mamba_retry install "conda-forge::dask>=2022.7.1" "conda-forge::distributed>=2022.7.1" "conda-forge::dask-core>=2022.7.1" --force-reinstall
fi

# Install the master version of streamz
Expand Down
6 changes: 3 additions & 3 deletions ci/gpu/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ export MINOR_VERSION=`echo $GIT_DESCRIBE_TAG | grep -o -E '([0-9]+\.[0-9]+)'`
unset GIT_DESCRIBE_TAG

# Dask & Distributed: 1 installs the main (nightly) packages; any other value installs the `conda-forge` packages.
export INSTALL_DASK_MAIN=0
export INSTALL_DASK_MAIN=1

# ucx-py version
export UCX_PY_VERSION='0.28.*'
Expand Down Expand Up @@ -92,8 +92,8 @@ function install_dask {
gpuci_mamba_retry update dask
conda list
else
gpuci_logger "gpuci_mamba_retry install conda-forge::dask==2022.7.1 conda-forge::distributed==2022.7.1 conda-forge::dask-core==2022.7.1 --force-reinstall"
gpuci_mamba_retry install conda-forge::dask==2022.7.1 conda-forge::distributed==2022.7.1 conda-forge::dask-core==2022.7.1 --force-reinstall
# Each package spec must be quoted: an unquoted `>` is parsed by the shell as an
# output redirection, which would strip the version constraint from the spec and
# send the command's stdout to a file named `=2022.7.1`.
gpuci_logger "gpuci_mamba_retry install conda-forge::dask>=2022.7.1 conda-forge::distributed>=2022.7.1 conda-forge::dask-core>=2022.7.1 --force-reinstall"
gpuci_mamba_retry install "conda-forge::dask>=2022.7.1" "conda-forge::distributed>=2022.7.1" "conda-forge::dask-core>=2022.7.1" --force-reinstall
fi
# Install the main version of streamz
gpuci_logger "Install the main version of streamz"
Expand Down
8 changes: 4 additions & 4 deletions conda/environments/cudf_dev_cuda11.5.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ dependencies:
- numba>=0.54
- numpy
- pandas>=1.0,<1.5.0dev0
- pyarrow=8
- pyarrow=9
- fastavro>=0.22.9
- python-snappy>=0.6.0
- notebook>=0.5.0
Expand All @@ -48,10 +48,10 @@ dependencies:
- pydocstyle=6.1.1
- typing_extensions
- pre-commit
- dask==2022.7.1
- distributed==2022.7.1
- dask>=2022.7.1
- distributed>=2022.7.1
- streamz
- arrow-cpp=8
- arrow-cpp=9
- dlpack>=0.5,<0.6.0a0
- double-conversion
- rapidjson
Expand Down
2 changes: 1 addition & 1 deletion conda/recipes/cudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ requirements:
- setuptools
- numba >=0.54
- dlpack>=0.5,<0.6.0a0
- pyarrow =8
- pyarrow =9
- libcudf ={{ version }}
- rmm ={{ minor_version }}
- cudatoolkit ={{ cuda_version }}
Expand Down
4 changes: 2 additions & 2 deletions conda/recipes/custreamz/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@ requirements:
- python
- streamz
- cudf ={{ version }}
- dask==2022.7.1
- distributed==2022.7.1
- dask>=2022.7.1
- distributed>=2022.7.1
- python-confluent-kafka >=1.7.0,<1.8.0a0
- cudf_kafka ={{ version }}

Expand Down
8 changes: 4 additions & 4 deletions conda/recipes/dask-cudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,14 @@ requirements:
host:
- python
- cudf ={{ version }}
- dask==2022.7.1
- distributed==2022.7.1
- dask>=2022.7.1
- distributed>=2022.7.1
- cudatoolkit ={{ cuda_version }}
run:
- python
- cudf ={{ version }}
- dask==2022.7.1
- distributed==2022.7.1
- dask>=2022.7.1
- distributed>=2022.7.1
- {{ pin_compatible('cudatoolkit', max_pin='x', min_pin='x') }}

test: # [linux64]
Expand Down
2 changes: 1 addition & 1 deletion conda/recipes/libcudf/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
# Copyright (c) 2018-2022, NVIDIA CORPORATION.

export cudf_ROOT="$(realpath ./cpp/build)"
./build.sh -n -v libcudf libcudf_kafka benchmarks tests --build_metrics --incl_cache_stats --cmake-args=\"-DCMAKE_INSTALL_LIBDIR=lib\"
./build.sh -n -v libcudf libcudf_kafka benchmarks tests --build_metrics --incl_cache_stats --cmake-args=\"-DCMAKE_INSTALL_LIBDIR=lib -DCUDF_ENABLE_ARROW_S3=ON\"
2 changes: 1 addition & 1 deletion conda/recipes/libcudf/conda_build_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ gtest_version:
- "=1.10.0"

arrow_cpp_version:
- "=8"
- "=9"

dlpack_version:
- ">=0.5,<0.6.0a0"
Expand Down
1 change: 1 addition & 0 deletions conda/recipes/libcudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,7 @@ outputs:
- test -f $PREFIX/include/cudf/strings/repeat_strings.hpp
- test -f $PREFIX/include/cudf/strings/replace.hpp
- test -f $PREFIX/include/cudf/strings/replace_re.hpp
- test -f $PREFIX/include/cudf/strings/side_type.hpp
- test -f $PREFIX/include/cudf/strings/split/partition.hpp
- test -f $PREFIX/include/cudf/strings/split/split.hpp
- test -f $PREFIX/include/cudf/strings/split/split_re.hpp
Expand Down
5 changes: 3 additions & 2 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ option(CUDF_USE_ARROW_STATIC "Build and statically link Arrow libraries" OFF)
option(CUDF_ENABLE_ARROW_ORC "Build the Arrow ORC adapter" OFF)
option(CUDF_ENABLE_ARROW_PYTHON "Find (or build) Arrow with Python support" OFF)
option(CUDF_ENABLE_ARROW_PARQUET "Find (or build) Arrow with Parquet support" OFF)
option(CUDF_ENABLE_ARROW_S3 "Build/Enable AWS S3 Arrow filesystem support" ON)
option(CUDF_ENABLE_ARROW_S3 "Build/Enable AWS S3 Arrow filesystem support" OFF)
option(
CUDF_USE_PER_THREAD_DEFAULT_STREAM
"Build cuDF with per-thread default stream, including passing the per-thread default
Expand Down Expand Up @@ -328,6 +328,7 @@ add_library(
src/io/csv/writer_impl.cu
src/io/functions.cpp
src/io/json/json_gpu.cu
src/io/json/nested_json_gpu.cu
src/io/json/reader_impl.cu
src/io/json/experimental/read_json.cpp
src/io/orc/aggregate_orc_metadata.cpp
Expand Down Expand Up @@ -448,8 +449,8 @@ add_library(
src/scalar/scalar.cpp
src/scalar/scalar_factories.cpp
src/search/contains_column.cu
src/search/contains_scalar.cu
src/search/contains_table.cu
src/search/contains_nested.cu
src/search/search_ordered.cu
src/sort/is_sorted.cu
src/sort/rank.cu
Expand Down
11 changes: 7 additions & 4 deletions cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,9 @@ ConfigureBench(
REDUCTION_BENCH reduction/anyall.cpp reduction/dictionary.cpp reduction/minmax.cpp
reduction/reduce.cpp reduction/scan.cpp
)
ConfigureNVBench(REDUCTION_NVBENCH reduction/segment_reduce.cu reduction/rank.cpp)
ConfigureNVBench(
REDUCTION_NVBENCH reduction/distinct_count.cpp reduction/rank.cpp reduction/segment_reduce.cu
)

# ##################################################################################################
# * reduction benchmark ---------------------------------------------------------------------------
Expand All @@ -200,7 +202,8 @@ ConfigureBench(
)

ConfigureNVBench(
GROUPBY_NVBENCH groupby/group_max.cpp groupby/group_rank.cpp groupby/group_struct_keys.cpp
GROUPBY_NVBENCH groupby/group_max.cpp groupby/group_nunique.cpp groupby/group_rank.cpp
groupby/group_struct_keys.cpp
)

# ##################################################################################################
Expand All @@ -225,7 +228,7 @@ ConfigureBench(PARQUET_READER_BENCH io/parquet/parquet_reader.cpp)

# ##################################################################################################
# * orc reader benchmark --------------------------------------------------------------------------
ConfigureBench(ORC_READER_BENCH io/orc/orc_reader.cpp)
ConfigureNVBench(ORC_READER_NVBENCH io/orc/orc_reader_input.cpp io/orc/orc_reader_options.cpp)

# ##################################################################################################
# * csv reader benchmark --------------------------------------------------------------------------
Expand Down Expand Up @@ -294,7 +297,7 @@ ConfigureNVBench(FST_NVBENCH io/fst.cu)

# ##################################################################################################
# * io benchmark ---------------------------------------------------------------------
ConfigureBench(MULTIBYTE_SPLIT_BENCHMARK io/text/multibyte_split.cpp)
ConfigureNVBench(MULTIBYTE_SPLIT_BENCHMARK io/text/multibyte_split.cpp)

add_custom_target(
run_benchmarks
Expand Down
40 changes: 24 additions & 16 deletions cpp/benchmarks/common/generate_input.cu
Original file line number Diff line number Diff line change
Expand Up @@ -394,8 +394,8 @@ std::unique_ptr<cudf::column> create_random_column(data_profile const& profile,
cudf::size_type num_rows)
{
// Bernoulli distribution
auto valid_dist =
random_value_fn<bool>(distribution_params<bool>{1. - profile.get_null_frequency().value_or(0)});
auto valid_dist = random_value_fn<bool>(
distribution_params<bool>{1. - profile.get_null_probability().value_or(0)});
auto value_dist = random_value_fn<T>{profile.get_distribution_params<T>()};

// Distribution for picking elements from the array of samples
Expand Down Expand Up @@ -434,7 +434,7 @@ std::unique_ptr<cudf::column> create_random_column(data_profile const& profile,
cudf::data_type{cudf::type_to_id<T>()},
num_rows,
data.release(),
profile.get_null_frequency().has_value() ? std::move(result_bitmask) : rmm::device_buffer{});
profile.get_null_probability().has_value() ? std::move(result_bitmask) : rmm::device_buffer{});
}

struct valid_or_zero {
Expand Down Expand Up @@ -481,8 +481,8 @@ std::unique_ptr<cudf::column> create_random_utf8_string_column(data_profile cons
{
auto len_dist =
random_value_fn<uint32_t>{profile.get_distribution_params<cudf::string_view>().length_params};
auto valid_dist =
random_value_fn<bool>(distribution_params<bool>{1. - profile.get_null_frequency().value_or(0)});
auto valid_dist = random_value_fn<bool>(
distribution_params<bool>{1. - profile.get_null_probability().value_or(0)});
auto lengths = len_dist(engine, num_rows + 1);
auto null_mask = valid_dist(engine, num_rows + 1);
thrust::transform_if(
Expand Down Expand Up @@ -512,7 +512,7 @@ std::unique_ptr<cudf::column> create_random_utf8_string_column(data_profile cons
num_rows,
std::move(offsets),
std::move(chars),
profile.get_null_frequency().has_value() ? std::move(result_bitmask) : rmm::device_buffer{});
profile.get_null_probability().has_value() ? std::move(result_bitmask) : rmm::device_buffer{});
}

/**
Expand Down Expand Up @@ -609,8 +609,8 @@ std::unique_ptr<cudf::column> create_random_column<cudf::struct_view>(data_profi
cudf::data_type(type_id), create_rand_col_fn{}, profile, engine, num_rows);
});

auto valid_dist =
random_value_fn<bool>(distribution_params<bool>{1. - profile.get_null_frequency().value_or(0)});
auto valid_dist = random_value_fn<bool>(
distribution_params<bool>{1. - profile.get_null_probability().value_or(0)});

// Generate the column bottom-up
for (int lvl = dist_params.max_depth; lvl > 0; --lvl) {
Expand All @@ -621,7 +621,7 @@ std::unique_ptr<cudf::column> create_random_column<cudf::struct_view>(data_profi
auto current_child = children.begin();
for (auto current_parent = parents.begin(); current_parent != parents.end(); ++current_parent) {
auto [null_mask, null_count] = [&]() {
if (profile.get_null_frequency().has_value()) {
if (profile.get_null_probability().has_value()) {
auto valids = valid_dist(engine, num_rows);
return cudf::detail::valid_if(valids.begin(), valids.end(), thrust::identity<bool>{});
}
Expand Down Expand Up @@ -683,8 +683,8 @@ std::unique_ptr<cudf::column> create_random_column<cudf::list_view>(data_profile
cudf::data_type(dist_params.element_type), create_rand_col_fn{}, profile, engine, num_elements);
auto len_dist =
random_value_fn<uint32_t>{profile.get_distribution_params<cudf::list_view>().length_params};
auto valid_dist =
random_value_fn<bool>(distribution_params<bool>{1. - profile.get_null_frequency().value_or(0)});
auto valid_dist = random_value_fn<bool>(
distribution_params<bool>{1. - profile.get_null_probability().value_or(0)});

// Generate the list column bottom-up
auto list_column = std::move(leaf_column);
Expand Down Expand Up @@ -712,8 +712,8 @@ std::unique_ptr<cudf::column> create_random_column<cudf::list_view>(data_profile
num_rows,
std::move(offsets_column),
std::move(current_child_column),
profile.get_null_frequency().has_value() ? null_count : 0, // cudf::UNKNOWN_NULL_COUNT,
profile.get_null_frequency().has_value() ? std::move(null_mask) : rmm::device_buffer{});
profile.get_null_probability().has_value() ? null_count : 0, // cudf::UNKNOWN_NULL_COUNT,
profile.get_null_probability().has_value() ? std::move(null_mask) : rmm::device_buffer{});
}
return list_column; // return the top-level column
}
Expand Down Expand Up @@ -785,13 +785,21 @@ std::unique_ptr<cudf::table> create_random_table(std::vector<cudf::type_id> cons
columns_vector output_columns;
std::transform(
dtype_ids.begin(), dtype_ids.end(), std::back_inserter(output_columns), [&](auto tid) mutable {
auto engine = deterministic_engine(seed_dist(seed_engine));
return cudf::type_dispatcher(
cudf::data_type(tid), create_rand_col_fn{}, profile, engine, num_rows.count);
return create_random_column(tid, num_rows, profile, seed_dist(seed_engine));
});
return std::make_unique<cudf::table>(std::move(output_columns));
}

/**
 * @brief Creates a single random column of the requested type.
 *
 * Builds a random engine from `seed` and dispatches on `dtype_id` to the
 * type-specific column generator.
 *
 * @param dtype_id Type id of the column to generate
 * @param num_rows Number of rows in the generated column
 * @param profile Data profile forwarded to the type-specific generator
 * @param seed Seed used to construct the random engine
 * @return The generated column
 */
std::unique_ptr<cudf::column> create_random_column(cudf::type_id dtype_id,
                                                   row_count num_rows,
                                                   data_profile const& profile,
                                                   unsigned seed)
{
  auto const column_type = cudf::data_type(dtype_id);
  // The engine is a named local (not a temporary) so it is passed to the generator as an lvalue.
  auto rng = deterministic_engine(seed);
  return cudf::type_dispatcher(column_type, create_rand_col_fn{}, profile, rng, num_rows.count);
}

std::unique_ptr<cudf::table> create_sequence_table(std::vector<cudf::type_id> const& dtype_ids,
row_count num_rows,
std::optional<double> null_probability,
Expand Down
Loading

0 comments on commit b2b22c8

Please sign in to comment.