Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/branch-21.12' into nativefile-…
Browse files Browse the repository at this point in the history
…parquet
  • Loading branch information
rjzamora committed Sep 24, 2021
2 parents a821ba7 + 2718443 commit 587ee5b
Show file tree
Hide file tree
Showing 104 changed files with 6,101 additions and 1,684 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
# cuDF 21.12.00 (Date TBD)

Please see https://github.com/rapidsai/cudf/releases/tag/v21.12.00a for the latest changes to this development branch.

# cuDF 21.10.00 (Date TBD)

Please see https://github.com/rapidsai/cudf/releases/tag/v21.10.00a for the latest changes to this development branch.
Expand Down
11 changes: 7 additions & 4 deletions ci/benchmark/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@ export GBENCH_BENCHMARKS_DIR="$WORKSPACE/cpp/build/gbenchmarks/"
# like `/tmp` is.
export LIBCUDF_KERNEL_CACHE_PATH="$HOME/.jitify-cache"

# Dask & Distributed git tag
export DASK_DISTRIBUTED_GIT_TAG='2021.09.1'

function remove_libcudf_kernel_cache_dir {
EXITCODE=$?
logger "removing kernel cache dir: $LIBCUDF_KERNEL_CACHE_PATH"
Expand Down Expand Up @@ -75,10 +78,10 @@ conda install "rmm=$MINOR_VERSION.*" "cudatoolkit=$CUDA_REL" \
# conda install "your-pkg=1.0.0"

# Install the master version of dask, distributed, and streamz
logger "pip install git+https://github.com/dask/distributed.git@main --upgrade --no-deps"
pip install "git+https://github.com/dask/distributed.git@main" --upgrade --no-deps
logger "pip install git+https://github.com/dask/dask.git@main --upgrade --no-deps"
pip install "git+https://github.com/dask/dask.git@main" --upgrade --no-deps
logger "pip install git+https://github.com/dask/distributed.git@$DASK_DISTRIBUTED_GIT_TAG --upgrade --no-deps"
pip install "git+https://github.com/dask/distributed.git@$DASK_DISTRIBUTED_GIT_TAG" --upgrade --no-deps
logger "pip install git+https://github.com/dask/dask.git@$DASK_DISTRIBUTED_GIT_TAG --upgrade --no-deps"
pip install "git+https://github.com/dask/dask.git@$DASK_DISTRIBUTED_GIT_TAG" --upgrade --no-deps
logger "pip install git+https://github.com/python-streamz/streamz.git@master --upgrade --no-deps"
pip install "git+https://github.com/python-streamz/streamz.git@master" --upgrade --no-deps

Expand Down
7 changes: 5 additions & 2 deletions ci/gpu/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,9 @@ export CONDA_ARTIFACT_PATH="$WORKSPACE/ci/artifacts/cudf/cpu/.conda-bld/"
export GIT_DESCRIBE_TAG=`git describe --tags`
export MINOR_VERSION=`echo $GIT_DESCRIBE_TAG | grep -o -E '([0-9]+\.[0-9]+)'`

# Dask & Distributed git tag
export DASK_DISTRIBUTED_GIT_TAG='2021.09.1'

################################################################################
# TRAP - Setup trap for removing jitify cache
################################################################################
Expand Down Expand Up @@ -101,8 +104,8 @@ function install_dask {
# Install the main version of dask, distributed, and streamz
gpuci_logger "Install the main version of dask, distributed, and streamz"
set -x
pip install "git+https://github.com/dask/distributed.git@main" --upgrade --no-deps
pip install "git+https://github.com/dask/dask.git@main" --upgrade --no-deps
pip install "git+https://github.com/dask/distributed.git@$DASK_DISTRIBUTED_GIT_TAG" --upgrade --no-deps
pip install "git+https://github.com/dask/dask.git@$DASK_DISTRIBUTED_GIT_TAG" --upgrade --no-deps
# Need to uninstall streamz that is already in the env.
pip uninstall -y streamz
pip install "git+https://github.com/python-streamz/streamz.git@master" --upgrade --no-deps
Expand Down
10 changes: 5 additions & 5 deletions conda/environments/cudf_dev_cuda11.0.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ dependencies:
- clang=11.0.0
- clang-tools=11.0.0
- cupy>7.1.0,<10.0.0a0
- rmm=21.10.*
- rmm=21.12.*
- cmake>=3.20.1
- cmake_setuptools>=0.1.3
- python>=3.7,<3.9
Expand Down Expand Up @@ -39,8 +39,8 @@ dependencies:
- mypy=0.782
- typing_extensions
- pre_commit
- dask>=2021.6.0
- distributed>=2021.6.0
- dask=2021.09.1
- distributed=2021.09.1
- streamz
- arrow-cpp=5.0.0
- dlpack>=0.5,<0.6.0a0
Expand All @@ -58,7 +58,7 @@ dependencies:
- transformers
- pydata-sphinx-theme
- pip:
- git+https://github.com/dask/dask.git@main
- git+https://github.com/dask/distributed.git@main
- git+https://github.com/dask/dask.git@2021.09.1
- git+https://github.com/dask/distributed.git@2021.09.1
- git+https://github.com/python-streamz/streamz.git@master
- pyorc
10 changes: 5 additions & 5 deletions conda/environments/cudf_dev_cuda11.2.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ dependencies:
- clang=11.0.0
- clang-tools=11.0.0
- cupy>7.1.0,<10.0.0a0
- rmm=21.10.*
- rmm=21.12.*
- cmake>=3.20.1
- cmake_setuptools>=0.1.3
- python>=3.7,<3.9
Expand Down Expand Up @@ -39,8 +39,8 @@ dependencies:
- mypy=0.782
- typing_extensions
- pre_commit
- dask>=2021.6.0
- distributed>=2021.6.0
- dask=2021.09.1
- distributed=2021.09.1
- streamz
- arrow-cpp=5.0.0
- dlpack>=0.5,<0.6.0a0
Expand All @@ -58,7 +58,7 @@ dependencies:
- transformers
- pydata-sphinx-theme
- pip:
- git+https://github.com/dask/dask.git@main
- git+https://github.com/dask/distributed.git@main
- git+https://github.com/dask/dask.git@2021.09.1
- git+https://github.com/dask/distributed.git@2021.09.1
- git+https://github.com/python-streamz/streamz.git@master
- pyorc
4 changes: 2 additions & 2 deletions conda/recipes/custreamz/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@ requirements:
- python
- streamz
- cudf {{ version }}
- dask>=2021.6.0
- distributed>=2021.6.0
- dask=2021.09.1
- distributed=2021.09.1
- python-confluent-kafka
- cudf_kafka {{ version }}

Expand Down
8 changes: 4 additions & 4 deletions conda/recipes/dask-cudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,13 @@ requirements:
host:
- python
- cudf {{ version }}
- dask>=2021.6.0
- distributed>=2021.6.0
- dask=2021.09.1
- distributed=2021.09.1
run:
- python
- cudf {{ version }}
- dask>=2021.6.0
- distributed>=2021.6.0
- dask=2021.09.1
- distributed=2021.09.1

test: # [linux64]
requires: # [linux64]
Expand Down
9 changes: 9 additions & 0 deletions conda/recipes/dask-cudf/run_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,15 @@ function logger() {
echo -e "\n>>>> $@\n"
}

# Importing cudf on arm64 CPU only nodes is currently not working due to a
# difference in reported gpu devices between arm64 and amd64
ARCH=$(arch)

if [ "${ARCH}" = "aarch64" ]; then
logger "Skipping tests on arm64"
exit 0
fi

# Install the latest version of dask and distributed
logger "pip install git+https://github.com/dask/distributed.git@main --upgrade --no-deps"
pip install "git+https://github.com/dask/distributed.git@main" --upgrade --no-deps
Expand Down
2 changes: 2 additions & 0 deletions conda/recipes/libcudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ test:
- test -f $PREFIX/include/cudf/detail/sequence.hpp
- test -f $PREFIX/include/cudf/detail/sorting.hpp
- test -f $PREFIX/include/cudf/detail/stream_compaction.hpp
- test -f $PREFIX/include/cudf/detail/tdigest/tdigest.hpp
- test -f $PREFIX/include/cudf/detail/transform.hpp
- test -f $PREFIX/include/cudf/detail/transpose.hpp
- test -f $PREFIX/include/cudf/detail/unary.hpp
Expand Down Expand Up @@ -238,6 +239,7 @@ test:
- test -f $PREFIX/include/cudf_test/cudf_gtest.hpp
- test -f $PREFIX/include/cudf_test/cxxopts.hpp
- test -f $PREFIX/include/cudf_test/file_utilities.hpp
- test -f $PREFIX/include/cudf_test/io_metadata_utilities.hpp
- test -f $PREFIX/include/cudf_test/iterator_utilities.hpp
- test -f $PREFIX/include/cudf_test/table_utilities.hpp
- test -f $PREFIX/include/cudf_test/timestamp_utilities.cuh
Expand Down
7 changes: 5 additions & 2 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ include(rapids-find)

rapids_cuda_init_architectures(CUDF)

project(CUDF VERSION 21.10.00 LANGUAGES C CXX CUDA)
project(CUDF VERSION 21.12.00 LANGUAGES C CXX CUDA)

# Needed because GoogleBenchmark changes the state of FindThreads.cmake,
# causing subsequent runs to have different values for the `Threads::Threads` target.
Expand Down Expand Up @@ -236,8 +236,9 @@ add_library(cudf
src/groupby/sort/group_max_scan.cu
src/groupby/sort/group_min_scan.cu
src/groupby/sort/group_rank_scan.cu
src/groupby/sort/group_sum_scan.cu
src/groupby/sort/group_replace_nulls.cu
src/groupby/sort/group_sum_scan.cu
src/groupby/sort/group_tdigest.cu
src/groupby/sort/sort_helper.cu
src/hash/hashing.cu
src/hash/md5_hash.cu
Expand Down Expand Up @@ -318,6 +319,7 @@ add_library(cudf
src/merge/merge.cu
src/partitioning/partitioning.cu
src/partitioning/round_robin.cu
src/quantiles/tdigest/tdigest.cu
src/quantiles/quantile.cu
src/quantiles/quantiles.cu
src/reductions/all.cu
Expand Down Expand Up @@ -565,6 +567,7 @@ add_library(cudftestutil STATIC
tests/utilities/base_fixture.cpp
tests/utilities/column_utilities.cu
tests/utilities/table_utilities.cu
tests/io/metadata_utilities.cpp
tests/strings/utilities.cu)

set_target_properties(cudftestutil
Expand Down
2 changes: 1 addition & 1 deletion cpp/cmake/thirdparty/get_nvcomp.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ function(find_and_configure_nvcomp VERSION)
GLOBAL_TARGETS nvcomp::nvcomp
CPM_ARGS
GITHUB_REPOSITORY NVIDIA/nvcomp
GIT_TAG 4f4e5713e69473be6e0c8ae483a932f666ae3c2f
GIT_TAG aa003db89e052e4ce408910ff17e1054b7c43b7d
OPTIONS "BUILD_STATIC ON"
"BUILD_TESTS OFF"
"BUILD_BENCHMARKS OFF"
Expand Down
4 changes: 2 additions & 2 deletions cpp/doxygen/Doxyfile
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ PROJECT_NAME = "libcudf"
# could be handy for archiving the generated documentation or if some version
# control system is used.

PROJECT_NUMBER = 21.10.00
PROJECT_NUMBER = 21.12.00

# Using the PROJECT_BRIEF tag one can provide an optional one line description
# for a project that appears at the top of each page and should give viewer a
Expand Down Expand Up @@ -2167,7 +2167,7 @@ SKIP_FUNCTION_MACROS = YES
# the path). If a tag file is not located in the directory in which doxygen is
# run, you must also specify the path to the tagfile here.

TAGFILES = rmm.tag=https://docs.rapids.ai/api/librmm/21.10
TAGFILES = rmm.tag=https://docs.rapids.ai/api/librmm/21.12

# When a file name is specified after GENERATE_TAGFILE, doxygen will create a
# tag file that is based on the input files it reads. See section "Linking to
Expand Down
2 changes: 1 addition & 1 deletion cpp/examples/basic/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ set(CPM_DOWNLOAD_VERSION v0.32.2)
file(DOWNLOAD https://github.com/cpm-cmake/CPM.cmake/releases/download/${CPM_DOWNLOAD_VERSION}/get_cpm.cmake ${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake)
include(${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake)

set(CUDF_TAG branch-21.10)
set(CUDF_TAG branch-21.12)
CPMFindPackage(NAME cudf
GIT_REPOSITORY https://github.com/rapidsai/cudf
GIT_TAG ${CUDF_TAG}
Expand Down
79 changes: 78 additions & 1 deletion cpp/include/cudf/aggregation.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,9 @@ class aggregation {
CUDA, ///< CUDA UDF based reduction
MERGE_LISTS, ///< merge multiple lists values into one list
MERGE_SETS, ///< merge multiple lists values into one list then drop duplicate entries
MERGE_M2 ///< merge partial values of M2 aggregation
MERGE_M2, ///< merge partial values of M2 aggregation
TDIGEST, ///< create a tdigest from a set of input values
MERGE_TDIGEST ///< create a tdigest by merging multiple tdigests together
};

aggregation() = delete;
Expand Down Expand Up @@ -493,5 +495,80 @@ std::unique_ptr<Base> make_merge_sets_aggregation(null_equality nulls_equal = nu
template <typename Base = aggregation>
std::unique_ptr<Base> make_merge_m2_aggregation();

/**
* @brief Factory to create a TDIGEST aggregation
*
* Produces a tdigest (https://arxiv.org/pdf/1902.04023.pdf) column from input values.
* The input aggregation values are expected to be fixed-width numeric types.
*
* The tdigest column produced is of the following structure:
*
* struct {
*   // centroids for the digest
*   list {
*     struct {
*       double    // mean
*       double    // weight
*     },
*     ...
*   }
*   // these are from the input stream, not the centroids. they are used
*   // during the percentile_approx computation near the beginning or
*   // end of the quantiles
*   double       // min
*   double       // max
* }
*
* Each output row is a single tdigest. The length of the row is the "size" of the
* tdigest, each element of which represents a weighted centroid (mean, weight).
*
* @tparam Base Aggregation type that the returned `std::unique_ptr` points to.
*
* @param max_centroids Parameter controlling compression level and accuracy on subsequent
* queries on the output tdigest data. `max_centroids` places an upper bound on the size of
* the computed tdigests: A value of 1000 (the default) will result in a tdigest containing no
* more than 1000 centroids (32 bytes each). Higher values result in more accurate tdigest
* information.
*
* @returns A TDIGEST aggregation object.
*/
template <typename Base>
std::unique_ptr<Base> make_tdigest_aggregation(int max_centroids = 1000);

/**
* @brief Factory to create a MERGE_TDIGEST aggregation
*
* Merges the results from a previous aggregation resulting from a `make_tdigest_aggregation`
* or `make_merge_tdigest_aggregation` to produce a new tdigest
* (https://arxiv.org/pdf/1902.04023.pdf) column.
*
* The tdigest column produced is of the following structure:
*
* struct {
*   // centroids for the digest
*   list {
*     struct {
*       double    // mean
*       double    // weight
*     },
*     ...
*   }
*   // these are from the input stream, not the centroids. they are used
*   // during the percentile_approx computation near the beginning or
*   // end of the quantiles
*   double       // min
*   double       // max
* }
*
* Each output row is a single tdigest. The length of the row is the "size" of the
* tdigest, each element of which represents a weighted centroid (mean, weight).
*
* @tparam Base Aggregation type that the returned `std::unique_ptr` points to.
*
* @param max_centroids Parameter controlling compression level and accuracy on subsequent
* queries on the output tdigest data. `max_centroids` places an upper bound on the size of
* the computed tdigests: A value of 1000 (the default) will result in a tdigest containing no
* more than 1000 centroids (32 bytes each). Higher values result in more accurate tdigest
* information.
*
* @returns A MERGE_TDIGEST aggregation object.
*/
template <typename Base>
std::unique_ptr<Base> make_merge_tdigest_aggregation(int max_centroids = 1000);

/** @} */ // end of group
} // namespace cudf
Loading

0 comments on commit 587ee5b

Please sign in to comment.