diff --git a/CHANGELOG.md b/CHANGELOG.md
index de00213a6f6..b46ac22d767 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,7 @@
+# cuDF 21.12.00 (Date TBD)
+
+Please see https://github.com/rapidsai/cudf/releases/tag/v21.12.00a for the latest changes to this development branch.
+
# cuDF 21.10.00 (Date TBD)
Please see https://github.com/rapidsai/cudf/releases/tag/v21.10.00a for the latest changes to this development branch.
diff --git a/ci/benchmark/build.sh b/ci/benchmark/build.sh
index e73153ce0c3..c2544ff7ffe 100755
--- a/ci/benchmark/build.sh
+++ b/ci/benchmark/build.sh
@@ -36,6 +36,9 @@ export GBENCH_BENCHMARKS_DIR="$WORKSPACE/cpp/build/gbenchmarks/"
# like `/tmp` is.
export LIBCUDF_KERNEL_CACHE_PATH="$HOME/.jitify-cache"
+# Dask & Distributed git tag
+export DASK_DISTRIBUTED_GIT_TAG='2021.09.1'
+
function remove_libcudf_kernel_cache_dir {
EXITCODE=$?
logger "removing kernel cache dir: $LIBCUDF_KERNEL_CACHE_PATH"
@@ -75,10 +78,10 @@ conda install "rmm=$MINOR_VERSION.*" "cudatoolkit=$CUDA_REL" \
# conda install "your-pkg=1.0.0"
# Install the master version of dask, distributed, and streamz
-logger "pip install git+https://github.com/dask/distributed.git@main --upgrade --no-deps"
-pip install "git+https://github.com/dask/distributed.git@main" --upgrade --no-deps
-logger "pip install git+https://github.com/dask/dask.git@main --upgrade --no-deps"
-pip install "git+https://github.com/dask/dask.git@main" --upgrade --no-deps
+logger "pip install git+https://github.com/dask/distributed.git@$DASK_DISTRIBUTED_GIT_TAG --upgrade --no-deps"
+pip install "git+https://github.com/dask/distributed.git@$DASK_DISTRIBUTED_GIT_TAG" --upgrade --no-deps
+logger "pip install git+https://github.com/dask/dask.git@$DASK_DISTRIBUTED_GIT_TAG --upgrade --no-deps"
+pip install "git+https://github.com/dask/dask.git@$DASK_DISTRIBUTED_GIT_TAG" --upgrade --no-deps
logger "pip install git+https://github.com/python-streamz/streamz.git@master --upgrade --no-deps"
pip install "git+https://github.com/python-streamz/streamz.git@master" --upgrade --no-deps
diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh
index 8e5b4d80115..7c5b9d836dd 100755
--- a/ci/gpu/build.sh
+++ b/ci/gpu/build.sh
@@ -30,6 +30,9 @@ export CONDA_ARTIFACT_PATH="$WORKSPACE/ci/artifacts/cudf/cpu/.conda-bld/"
export GIT_DESCRIBE_TAG=`git describe --tags`
export MINOR_VERSION=`echo $GIT_DESCRIBE_TAG | grep -o -E '([0-9]+\.[0-9]+)'`
+# Dask & Distributed git tag
+export DASK_DISTRIBUTED_GIT_TAG='2021.09.1'
+
################################################################################
# TRAP - Setup trap for removing jitify cache
################################################################################
@@ -101,8 +104,8 @@ function install_dask {
# Install the main version of dask, distributed, and streamz
gpuci_logger "Install the main version of dask, distributed, and streamz"
set -x
- pip install "git+https://github.com/dask/distributed.git@main" --upgrade --no-deps
- pip install "git+https://github.com/dask/dask.git@main" --upgrade --no-deps
+ pip install "git+https://github.com/dask/distributed.git@$DASK_DISTRIBUTED_GIT_TAG" --upgrade --no-deps
+ pip install "git+https://github.com/dask/dask.git@$DASK_DISTRIBUTED_GIT_TAG" --upgrade --no-deps
# Need to uninstall streamz that is already in the env.
pip uninstall -y streamz
pip install "git+https://github.com/python-streamz/streamz.git@master" --upgrade --no-deps
diff --git a/conda/environments/cudf_dev_cuda11.0.yml b/conda/environments/cudf_dev_cuda11.0.yml
index bbe1ae70499..d5251b18582 100644
--- a/conda/environments/cudf_dev_cuda11.0.yml
+++ b/conda/environments/cudf_dev_cuda11.0.yml
@@ -10,7 +10,7 @@ dependencies:
- clang=11.0.0
- clang-tools=11.0.0
- cupy>7.1.0,<10.0.0a0
- - rmm=21.10.*
+ - rmm=21.12.*
- cmake>=3.20.1
- cmake_setuptools>=0.1.3
- python>=3.7,<3.9
@@ -39,8 +39,8 @@ dependencies:
- mypy=0.782
- typing_extensions
- pre_commit
- - dask>=2021.6.0
- - distributed>=2021.6.0
+ - dask=2021.09.1
+ - distributed=2021.09.1
- streamz
- arrow-cpp=5.0.0
- dlpack>=0.5,<0.6.0a0
@@ -58,7 +58,7 @@ dependencies:
- transformers
- pydata-sphinx-theme
- pip:
- - git+https://github.com/dask/dask.git@main
- - git+https://github.com/dask/distributed.git@main
+ - git+https://github.com/dask/dask.git@2021.09.1
+ - git+https://github.com/dask/distributed.git@2021.09.1
- git+https://github.com/python-streamz/streamz.git@master
- pyorc
diff --git a/conda/environments/cudf_dev_cuda11.2.yml b/conda/environments/cudf_dev_cuda11.2.yml
index ed4c3ee2efc..7ab2cd60ce3 100644
--- a/conda/environments/cudf_dev_cuda11.2.yml
+++ b/conda/environments/cudf_dev_cuda11.2.yml
@@ -10,7 +10,7 @@ dependencies:
- clang=11.0.0
- clang-tools=11.0.0
- cupy>7.1.0,<10.0.0a0
- - rmm=21.10.*
+ - rmm=21.12.*
- cmake>=3.20.1
- cmake_setuptools>=0.1.3
- python>=3.7,<3.9
@@ -39,8 +39,8 @@ dependencies:
- mypy=0.782
- typing_extensions
- pre_commit
- - dask>=2021.6.0
- - distributed>=2021.6.0
+ - dask=2021.09.1
+ - distributed=2021.09.1
- streamz
- arrow-cpp=5.0.0
- dlpack>=0.5,<0.6.0a0
@@ -58,7 +58,7 @@ dependencies:
- transformers
- pydata-sphinx-theme
- pip:
- - git+https://github.com/dask/dask.git@main
- - git+https://github.com/dask/distributed.git@main
+ - git+https://github.com/dask/dask.git@2021.09.1
+ - git+https://github.com/dask/distributed.git@2021.09.1
- git+https://github.com/python-streamz/streamz.git@master
- pyorc
diff --git a/conda/recipes/custreamz/meta.yaml b/conda/recipes/custreamz/meta.yaml
index d0965e97567..db8aa8e6c85 100644
--- a/conda/recipes/custreamz/meta.yaml
+++ b/conda/recipes/custreamz/meta.yaml
@@ -31,8 +31,8 @@ requirements:
- python
- streamz
- cudf {{ version }}
- - dask>=2021.6.0
- - distributed>=2021.6.0
+ - dask=2021.09.1
+ - distributed=2021.09.1
- python-confluent-kafka
- cudf_kafka {{ version }}
diff --git a/conda/recipes/dask-cudf/meta.yaml b/conda/recipes/dask-cudf/meta.yaml
index 1b2c4efd610..45d96a2de85 100644
--- a/conda/recipes/dask-cudf/meta.yaml
+++ b/conda/recipes/dask-cudf/meta.yaml
@@ -26,13 +26,13 @@ requirements:
host:
- python
- cudf {{ version }}
- - dask>=2021.6.0
- - distributed>=2021.6.0
+ - dask=2021.09.1
+ - distributed=2021.09.1
run:
- python
- cudf {{ version }}
- - dask>=2021.6.0
- - distributed>=2021.6.0
+ - dask=2021.09.1
+ - distributed=2021.09.1
test: # [linux64]
requires: # [linux64]
diff --git a/conda/recipes/dask-cudf/run_test.sh b/conda/recipes/dask-cudf/run_test.sh
index 3fc1182b33b..f56610bea86 100644
--- a/conda/recipes/dask-cudf/run_test.sh
+++ b/conda/recipes/dask-cudf/run_test.sh
@@ -8,6 +8,15 @@ function logger() {
echo -e "\n>>>> $@\n"
}
+# Importing cudf on arm64 CPU-only nodes currently fails due to a
+# difference in reported GPU devices between arm64 and amd64.
+ARCH=$(arch)
+
+if [ "${ARCH}" = "aarch64" ]; then
+ logger "Skipping tests on arm64"
+ exit 0
+fi
+
# Install the latest version of dask and distributed
logger "pip install git+https://github.com/dask/distributed.git@main --upgrade --no-deps"
pip install "git+https://github.com/dask/distributed.git@main" --upgrade --no-deps
diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml
index 0f05dcb4bb3..fd687de6698 100644
--- a/conda/recipes/libcudf/meta.yaml
+++ b/conda/recipes/libcudf/meta.yaml
@@ -93,6 +93,7 @@ test:
- test -f $PREFIX/include/cudf/detail/sequence.hpp
- test -f $PREFIX/include/cudf/detail/sorting.hpp
- test -f $PREFIX/include/cudf/detail/stream_compaction.hpp
+ - test -f $PREFIX/include/cudf/detail/tdigest/tdigest.hpp
- test -f $PREFIX/include/cudf/detail/transform.hpp
- test -f $PREFIX/include/cudf/detail/transpose.hpp
- test -f $PREFIX/include/cudf/detail/unary.hpp
@@ -238,6 +239,7 @@ test:
- test -f $PREFIX/include/cudf_test/cudf_gtest.hpp
- test -f $PREFIX/include/cudf_test/cxxopts.hpp
- test -f $PREFIX/include/cudf_test/file_utilities.hpp
+ - test -f $PREFIX/include/cudf_test/io_metadata_utilities.hpp
- test -f $PREFIX/include/cudf_test/iterator_utilities.hpp
- test -f $PREFIX/include/cudf_test/table_utilities.hpp
- test -f $PREFIX/include/cudf_test/timestamp_utilities.cuh
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index c72c258fd18..982fee640d9 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -28,7 +28,7 @@ include(rapids-find)
rapids_cuda_init_architectures(CUDF)
-project(CUDF VERSION 21.10.00 LANGUAGES C CXX CUDA)
+project(CUDF VERSION 21.12.00 LANGUAGES C CXX CUDA)
# Needed because GoogleBenchmark changes the state of FindThreads.cmake,
# causing subsequent runs to have different values for the `Threads::Threads` target.
@@ -236,8 +236,9 @@ add_library(cudf
src/groupby/sort/group_max_scan.cu
src/groupby/sort/group_min_scan.cu
src/groupby/sort/group_rank_scan.cu
- src/groupby/sort/group_sum_scan.cu
src/groupby/sort/group_replace_nulls.cu
+ src/groupby/sort/group_sum_scan.cu
+ src/groupby/sort/group_tdigest.cu
src/groupby/sort/sort_helper.cu
src/hash/hashing.cu
src/hash/md5_hash.cu
@@ -318,6 +319,7 @@ add_library(cudf
src/merge/merge.cu
src/partitioning/partitioning.cu
src/partitioning/round_robin.cu
+ src/quantiles/tdigest/tdigest.cu
src/quantiles/quantile.cu
src/quantiles/quantiles.cu
src/reductions/all.cu
@@ -565,6 +567,7 @@ add_library(cudftestutil STATIC
tests/utilities/base_fixture.cpp
tests/utilities/column_utilities.cu
tests/utilities/table_utilities.cu
+ tests/io/metadata_utilities.cpp
tests/strings/utilities.cu)
set_target_properties(cudftestutil
diff --git a/cpp/cmake/thirdparty/get_nvcomp.cmake b/cpp/cmake/thirdparty/get_nvcomp.cmake
index cade101cbfd..16d50fd3388 100644
--- a/cpp/cmake/thirdparty/get_nvcomp.cmake
+++ b/cpp/cmake/thirdparty/get_nvcomp.cmake
@@ -21,7 +21,7 @@ function(find_and_configure_nvcomp VERSION)
GLOBAL_TARGETS nvcomp::nvcomp
CPM_ARGS
GITHUB_REPOSITORY NVIDIA/nvcomp
- GIT_TAG 4f4e5713e69473be6e0c8ae483a932f666ae3c2f
+ GIT_TAG aa003db89e052e4ce408910ff17e1054b7c43b7d
OPTIONS "BUILD_STATIC ON"
"BUILD_TESTS OFF"
"BUILD_BENCHMARKS OFF"
diff --git a/cpp/doxygen/Doxyfile b/cpp/doxygen/Doxyfile
index 72524996a69..1141f20e3b1 100644
--- a/cpp/doxygen/Doxyfile
+++ b/cpp/doxygen/Doxyfile
@@ -38,7 +38,7 @@ PROJECT_NAME = "libcudf"
# could be handy for archiving the generated documentation or if some version
# control system is used.
-PROJECT_NUMBER = 21.10.00
+PROJECT_NUMBER = 21.12.00
# Using the PROJECT_BRIEF tag one can provide an optional one line description
# for a project that appears at the top of each page and should give viewer a
@@ -2167,7 +2167,7 @@ SKIP_FUNCTION_MACROS = YES
# the path). If a tag file is not located in the directory in which doxygen is
# run, you must also specify the path to the tagfile here.
-TAGFILES = rmm.tag=https://docs.rapids.ai/api/librmm/21.10
+TAGFILES = rmm.tag=https://docs.rapids.ai/api/librmm/21.12
# When a file name is specified after GENERATE_TAGFILE, doxygen will create a
# tag file that is based on the input files it reads. See section "Linking to
diff --git a/cpp/examples/basic/CMakeLists.txt b/cpp/examples/basic/CMakeLists.txt
index aef477c91e1..4175b34ff40 100644
--- a/cpp/examples/basic/CMakeLists.txt
+++ b/cpp/examples/basic/CMakeLists.txt
@@ -6,7 +6,7 @@ set(CPM_DOWNLOAD_VERSION v0.32.2)
file(DOWNLOAD https://github.com/cpm-cmake/CPM.cmake/releases/download/${CPM_DOWNLOAD_VERSION}/get_cpm.cmake ${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake)
include(${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake)
-set(CUDF_TAG branch-21.10)
+set(CUDF_TAG branch-21.12)
CPMFindPackage(NAME cudf
GIT_REPOSITORY https://github.com/rapidsai/cudf
GIT_TAG ${CUDF_TAG}
diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp
index c302895880d..fb6401a3cc1 100644
--- a/cpp/include/cudf/aggregation.hpp
+++ b/cpp/include/cudf/aggregation.hpp
@@ -87,7 +87,9 @@ class aggregation {
CUDA, ///< CUDA UDF based reduction
MERGE_LISTS, ///< merge multiple lists values into one list
MERGE_SETS, ///< merge multiple lists values into one list then drop duplicate entries
- MERGE_M2 ///< merge partial values of M2 aggregation
+ MERGE_M2, ///< merge partial values of M2 aggregation
+ TDIGEST, ///< create a tdigest from a set of input values
+ MERGE_TDIGEST ///< create a tdigest by merging multiple tdigests together
};
aggregation() = delete;
@@ -493,5 +495,80 @@ std::unique_ptr<Base> make_merge_sets_aggregation(null_equality nulls_equal = nu
template <typename Base = aggregation>
std::unique_ptr<Base> make_merge_m2_aggregation();
+/**
+ * @brief Factory to create a TDIGEST aggregation
+ *
+ * Produces a tdigest (https://arxiv.org/pdf/1902.04023.pdf) column from input values.
+ * The input aggregation values are expected to be fixed-width numeric types.
+ *
+ * The tdigest column produced is of the following structure:
+ *
+ * struct {
+ * // centroids for the digest
+ * list {
+ * struct {
+ * double // mean
+ * double // weight
+ * },
+ * ...
+ * }
+ * // these are from the input stream, not the centroids. they are used
+ * // during the percentile_approx computation near the beginning or
+ * // end of the quantiles
+ * double // min
+ * double // max
+ * }
+ *
+ * Each output row is a single tdigest. The length of the row is the "size" of the
+ * tdigest, each element of which represents a weighted centroid (mean, weight).
+ *
+ * @param max_centroids Parameter controlling compression level and accuracy on subsequent
+ * queries on the output tdigest data. `max_centroids` places an upper bound on the size of
+ * the computed tdigests: A value of 1000 will result in a tdigest containing no
+ * more than 1000 centroids (32 bytes each). Higher values result in more accurate tdigest information.
+ *
+ * @returns A TDIGEST aggregation object.
+ */
+template <typename Base = aggregation>
+std::unique_ptr<Base> make_tdigest_aggregation(int max_centroids = 1000);
+
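For readers of this diff, here is a minimal usage sketch (not part of the change itself) showing how the new TDIGEST aggregation is expected to flow through the existing groupby API. The helper name `group_tdigests`, the single key column, and the 1000-centroid cap are illustrative assumptions only.

```cpp
#include <cudf/aggregation.hpp>
#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/groupby.hpp>
#include <cudf/table/table_view.hpp>

#include <memory>
#include <utility>
#include <vector>

// Hypothetical helper: one tdigest per group of `values`, keyed by `keys`.
std::unique_ptr<cudf::column> group_tdigests(cudf::column_view const& keys,
                                             cudf::column_view const& values)
{
  // Group by a single key column; multi-key grouping works the same way.
  cudf::groupby::groupby gb(cudf::table_view({keys}));

  std::vector<cudf::groupby::aggregation_request> requests;
  requests.emplace_back();
  requests.back().values = values;
  // Cap each per-group tdigest at 1000 centroids.
  requests.back().aggregations.push_back(
    cudf::make_tdigest_aggregation<cudf::groupby_aggregation>(1000));

  auto result = gb.aggregate(requests);
  // result.second[0].results[0] is a struct column with the layout documented
  // above: list<struct<mean, weight>>, min, max -- one row per group.
  return std::move(result.second[0].results[0]);
}
```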
+/**
+ * @brief Factory to create a MERGE_TDIGEST aggregation
+ *
+ * Merges the results from a previous aggregation resulting from a `make_tdigest_aggregation`
+ * or `make_merge_tdigest_aggregation` to produce a new tdigest
+ * (https://arxiv.org/pdf/1902.04023.pdf) column.
+ *
+ * The tdigest column produced is of the following structure:
+ *
+ * struct {
+ * // centroids for the digest
+ * list {
+ * struct {
+ * double // mean
+ * double // weight
+ * },
+ * ...
+ * }
+ * // these are from the input stream, not the centroids. they are used
+ * // during the percentile_approx computation near the beginning or
+ * // end of the quantiles
+ * double // min
+ * double // max
+ * }
+ *
+ * Each output row is a single tdigest. The length of the row is the "size" of the
+ * tdigest, each element of which represents a weighted centroid (mean, weight).
+ *
+ * @param max_centroids Parameter controlling compression level and accuracy on subsequent
+ * queries on the output tdigest data. `max_centroids` places an upper bound on the size of
+ * the computed tdigests: A value of 1000 will result in a tdigest containing no
+ * more than 1000 centroids (32 bytes each). Higher values result in more accurate tdigest information.
+ *
+ * @returns A MERGE_TDIGEST aggregation object.
+ */
+template <typename Base = aggregation>
+std::unique_ptr<Base> make_merge_tdigest_aggregation(int max_centroids = 1000);
+
/** @} */ // end of group
} // namespace cudf
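A companion sketch for the MERGE_TDIGEST path, again hedged and not part of this diff: it reduces previously computed tdigests, for example one per partition, into a single tdigest per key. The helper name `merge_group_tdigests` is hypothetical, and the values column is assumed to already carry the tdigest struct layout documented above.

```cpp
#include <cudf/aggregation.hpp>
#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/groupby.hpp>
#include <cudf/table/table_view.hpp>

#include <memory>
#include <utility>
#include <vector>

// Hypothetical helper: merge partial tdigests into one tdigest per key.
std::unique_ptr<cudf::column> merge_group_tdigests(cudf::column_view const& keys,
                                                   cudf::column_view const& partial_tdigests)
{
  cudf::groupby::groupby gb(cudf::table_view({keys}));

  std::vector<cudf::groupby::aggregation_request> requests;
  requests.emplace_back();
  // `partial_tdigests` must be a tdigest struct column, e.g. the output of a
  // prior TDIGEST or MERGE_TDIGEST aggregation.
  requests.back().values = partial_tdigests;
  requests.back().aggregations.push_back(
    cudf::make_merge_tdigest_aggregation<cudf::groupby_aggregation>(1000));

  auto result = gb.aggregate(requests);
  return std::move(result.second[0].results[0]);
}
```

The merged column keeps the same struct layout, so it can be fed into further MERGE_TDIGEST passes or into the tdigest quantile support this release adds under src/quantiles/tdigest/ (see the CMakeLists change above).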
diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp
index 4cf902ef562..05d1bf3e595 100644
--- a/cpp/include/cudf/detail/aggregation/aggregation.hpp
+++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp
@@ -91,6 +91,10 @@ class simple_aggregations_collector { // Declares the interface for the simple
class merge_sets_aggregation const& agg);
virtual std::vector<std::unique_ptr<aggregation>> visit(data_type col_type,
class merge_m2_aggregation const& agg);
+ virtual std::vector<std::unique_ptr<aggregation>> visit(data_type col_type,
+ class tdigest_aggregation const& agg);
+ virtual std::vector<std::unique_ptr<aggregation>> visit(
+ data_type col_type, class merge_tdigest_aggregation const& agg);
};
class aggregation_finalizer { // Declares the interface for the finalizer
@@ -125,6 +129,8 @@ class aggregation_finalizer { // Declares the interface for the finalizer
virtual void visit(class merge_lists_aggregation const& agg);
virtual void visit(class merge_sets_aggregation const& agg);
virtual void visit(class merge_m2_aggregation const& agg);
+ virtual void visit(class tdigest_aggregation const& agg);
+ virtual void visit(class merge_tdigest_aggregation const& agg);
};
/**
@@ -884,6 +890,54 @@ class merge_m2_aggregation final : public groupby_aggregation {
void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); }
};
+/**
+ * @brief Derived aggregation class for specifying TDIGEST aggregation
+ */
+class tdigest_aggregation final : public groupby_aggregation {
+ public:
+ explicit tdigest_aggregation(int max_centroids_)
+ : aggregation{TDIGEST}, max_centroids{max_centroids_}
+ {
+ }
+
+ int const max_centroids;
+
+ std::unique_ptr<aggregation> clone() const override
+ {
+ return std::make_unique<tdigest_aggregation>(*this);
+ }
+ std::vector<std::unique_ptr<aggregation>> get_simple_aggregations(
+ data_type col_type, simple_aggregations_collector& collector) const override
+ {
+ return collector.visit(col_type, *this);
+ }
+ void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); }
+};
+
+/**
+ * @brief Derived aggregation class for specifying MERGE_TDIGEST aggregation
+ */
+class merge_tdigest_aggregation final : public groupby_aggregation {
+ public:
+ explicit merge_tdigest_aggregation(int max_centroids_)
+ : aggregation{MERGE_TDIGEST}, max_centroids{max_centroids_}
+ {
+ }
+
+ int const max_centroids;
+
+ std::unique_ptr<aggregation> clone() const override
+ {
+ return std::make_unique<merge_tdigest_aggregation>(*this);
+ }
+ std::vector<std::unique_ptr<aggregation>> get_simple_aggregations(
+ data_type col_type, simple_aggregations_collector& collector) const override
+ {
+ return collector.visit(col_type, *this);
+ }
+ void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); }
+};
+
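Both new detail classes plug into libcudf's existing double-dispatch machinery: `finalize` hands the concrete aggregation to an `aggregation_finalizer`, and `get_simple_aggregations` does the same with a `simple_aggregations_collector`. The following sketch, which assumes only the detail header shown in this hunk, illustrates the pattern; the `tdigest_kind_detector` class is hypothetical and not part of libcudf.

```cpp
#include <cudf/detail/aggregation/aggregation.hpp>

// Hypothetical finalizer that records whether a tdigest-related aggregation
// was seen. Only the overloads of interest are overridden; every other
// aggregation kind falls through to the base-class default.
class tdigest_kind_detector : public cudf::detail::aggregation_finalizer {
 public:
  void visit(cudf::detail::tdigest_aggregation const&) override { tdigest_seen = true; }
  void visit(cudf::detail::merge_tdigest_aggregation const&) override { merge_seen = true; }

  bool tdigest_seen = false;
  bool merge_seen   = false;
};

// Usage: agg.finalize(detector) invokes whichever override matches agg's
// dynamic type, which is exactly what the finalize() one-liners above do.
```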
/**
* @brief Sentinel value used for `ARGMAX` aggregation.
*
@@ -954,14 +1008,16 @@ template <typename Source, aggregation::Kind k>
struct target_type_impl<
Source,
k,
- std::enable_if_t() && !is_chrono