From a2804c35b0ef97498311d4fcf12b0361cddf2d95 Mon Sep 17 00:00:00 2001 From: gpuCI <38199262+GPUtester@users.noreply.github.com> Date: Tue, 31 Mar 2020 08:28:45 -0700 Subject: [PATCH 01/34] REL v0.13.0 release --- docs/nvstrings/source/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/nvstrings/source/conf.py b/docs/nvstrings/source/conf.py index 868f52e93ad..a658ef69833 100644 --- a/docs/nvstrings/source/conf.py +++ b/docs/nvstrings/source/conf.py @@ -71,7 +71,7 @@ # The short X.Y version. version = '0.13' # The full version, including alpha/beta/rc tags. -release = "0.13.0" +release = '0.13.0' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. From b34b838625139bb14f4754db558112e8fd9293c8 Mon Sep 17 00:00:00 2001 From: gpuCI <38199262+GPUtester@users.noreply.github.com> Date: Wed, 3 Jun 2020 08:23:04 -0700 Subject: [PATCH 02/34] REL v0.14.0 release --- docs/cudf/source/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index 3b2e59f29c3..a2dd899fd91 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -74,7 +74,7 @@ # built documents. # # The short X.Y version. -version = "0.14" +version = '0.14' # The full version, including alpha/beta/rc tags. release = cudf.__version__ From 9ff9cdb30814bc9ff2a2f8e49f63e6347a60a3ea Mon Sep 17 00:00:00 2001 From: AJ Schmidt Date: Tue, 14 Jul 2020 16:53:01 -0400 Subject: [PATCH 03/34] update master references --- .github/PULL_REQUEST_TEMPLATE.md | 4 ++-- CONTRIBUTING.md | 2 +- README.md | 2 +- python/custreamz/README.md | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 6d16273bc26..ae895daf28a 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -39,9 +39,9 @@ Here are some guidelines to help the review process go smoothly. 
features or make changes out of the scope of those requested by the reviewer (doing this just add delays as already reviewed code ends up having to be re-reviewed/it is hard to tell what is new etc!). Further, please do not - rebase your branch on master/force push/rewrite history, doing any of these + rebase your branch on main/force push/rewrite history, doing any of these causes the context of any comments made by reviewers to be lost. If - conflicts occur against master they should be resolved by merging master + conflicts occur against main they should be resolved by merging main into the branch used for making the pull request. Many thanks in advance for your cooperation! diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index a60ad0f925f..9c44238f42c 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -336,7 +336,7 @@ flag. Below is a list of the available arguments and their purpose: | `LINUX_VERSION` | ubuntu16.04 | ubuntu18.04 | set Ubuntu version | | `CC` & `CXX` | 5 | 7 | set gcc/g++ version; **NOTE:** gcc7 requires Ubuntu 18.04 | | `CUDF_REPO` | This repo | Forks of cuDF | set git URL to use for `git clone` | -| `CUDF_BRANCH` | master | Any branch name | set git branch to checkout of `CUDF_REPO` | +| `CUDF_BRANCH` | main | Any branch name | set git branch to checkout of `CUDF_REPO` | | `NUMBA_VERSION` | newest | >=0.40.0 | set numba version | | `NUMPY_VERSION` | newest | >=1.14.3 | set numpy version | | `PANDAS_VERSION` | newest | >=0.23.4 | set pandas version | diff --git a/README.md b/README.md index 25e026a70fb..84eb8c8e63b 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ [![Build Status](https://gpuci.gpuopenanalytics.com/job/rapidsai/job/gpuci/job/cudf/job/branches/job/cudf-branch-pipeline/badge/icon)](https://gpuci.gpuopenanalytics.com/job/rapidsai/job/gpuci/job/cudf/job/branches/job/cudf-branch-pipeline/) -**NOTE:** For the latest stable [README.md](https://github.com/rapidsai/cudf/blob/master/README.md) ensure you are on the `master` 
branch. +**NOTE:** For the latest stable [README.md](https://github.com/rapidsai/cudf/blob/main/README.md) ensure you are on the `main` branch. Built based on the [Apache Arrow](http://arrow.apache.org/) columnar memory format, cuDF is a GPU DataFrame library for loading, joining, aggregating, filtering, and otherwise manipulating data. diff --git a/python/custreamz/README.md b/python/custreamz/README.md index ba81aa8e4bc..b3ba1ca374c 100644 --- a/python/custreamz/README.md +++ b/python/custreamz/README.md @@ -1,4 +1,4 @@ 1. cuStreamz is a GPU-accelerated Streaming Library, which uses cuDF with Streamz for stream data processing on GPUs. 2. cuStreamz has its own conda metapackage which makes it as simple as possible to install the set of dependencies necessary to process streaming workloads on GPUs. 3. A series of tests for use in a cuDF gpuCI instance have been included ensuring that changes continuously rolled out as part of cuDF don't break its integration with Streamz. -4. You can find [example](https://github.com/rapidsai/notebooks-contrib/blob/master/getting_started_notebooks/basics/hello_streamz.ipynb) [notebooks](https://github.com/rapidsai/notebooks-contrib/blob/master/getting_started_notebooks/basics/streamz_weblogs.ipynb) on how to write cuStreamz jobs in the RAPIDS [notebooks-contrib repository](https://github.com/rapidsai/notebooks-contrib). +4. You can find [example](https://github.com/rapidsai/notebooks-contrib/blob/main/getting_started_notebooks/basics/hello_streamz.ipynb) [notebooks](https://github.com/rapidsai/notebooks-contrib/blob/main/getting_started_notebooks/basics/streamz_weblogs.ipynb) on how to write cuStreamz jobs in the RAPIDS [notebooks-contrib repository](https://github.com/rapidsai/notebooks-contrib). 
From 3a0f214464f5726ac7688abe2e554c03a94f8fac Mon Sep 17 00:00:00 2001 From: gpuCI <38199262+GPUtester@users.noreply.github.com> Date: Wed, 26 Aug 2020 08:10:21 -0700 Subject: [PATCH 04/34] REL v0.15.0 release --- docs/cudf/source/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index a758852526c..7fa8cc707e8 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -73,7 +73,7 @@ # built documents. # # The short X.Y version. -version = "0.15" +version = '0.15' # The full version, including alpha/beta/rc tags. release = "0.15.0" From 71cb8c0e0ff0eefa8234ce80bbd971cc2e5b2bce Mon Sep 17 00:00:00 2001 From: gpuCI <38199262+GPUtester@users.noreply.github.com> Date: Wed, 26 Aug 2020 17:18:24 -0700 Subject: [PATCH 05/34] REL v0.15.0 release From 2b8298f5663dcb2f1433528172674ef3a5d54817 Mon Sep 17 00:00:00 2001 From: gpuCI <38199262+GPUtester@users.noreply.github.com> Date: Wed, 21 Oct 2020 09:57:38 -0700 Subject: [PATCH 06/34] REL v0.16.0 release From f56ef850e6c25655161ae4bf357ce63f5727d535 Mon Sep 17 00:00:00 2001 From: gpuCI <38199262+GPUtester@users.noreply.github.com> Date: Thu, 10 Dec 2020 08:28:23 -0900 Subject: [PATCH 07/34] REL v0.17.0 release From 20778e5ddb7470845605acd7f879620eb25ff4ff Mon Sep 17 00:00:00 2001 From: gpuCI <38199262+GPUtester@users.noreply.github.com> Date: Wed, 24 Feb 2021 10:08:46 -0800 Subject: [PATCH 08/34] REL v0.18.0 release --- cpp/doxygen/Doxyfile | 2 +- docs/cudf/source/conf.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/doxygen/Doxyfile b/cpp/doxygen/Doxyfile index 2615eda3463..0e7428b9d17 100644 --- a/cpp/doxygen/Doxyfile +++ b/cpp/doxygen/Doxyfile @@ -38,7 +38,7 @@ PROJECT_NAME = "libcudf" # could be handy for archiving the generated documentation or if some version # control system is used. 
-PROJECT_NUMBER = 0.18 +PROJECT_NUMBER = 0.18.0 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index 8daf3a0850e..2db7e46413d 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -74,9 +74,9 @@ # built documents. # # The short X.Y version. -version = "0.18" +version = '0.18' # The full version, including alpha/beta/rc tags. -release = "0.18.0" +release = '0.18.0' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. From 999be56c805bcdca93ce818c1646468aed82d2c4 Mon Sep 17 00:00:00 2001 From: Raymond Douglass Date: Mon, 15 Mar 2021 17:17:51 -0400 Subject: [PATCH 09/34] REL v0.18.1 release --- cpp/CMakeLists.txt | 2 +- cpp/doxygen/Doxyfile | 2 +- docs/cudf/source/conf.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index a2b0940ed68..ec530445b34 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -16,7 +16,7 @@ cmake_minimum_required(VERSION 3.14...3.17 FATAL_ERROR) -project(CUDA_DATAFRAME VERSION 0.18.0 LANGUAGES C CXX CUDA) +project(CUDA_DATAFRAME VERSION 0.18.1 LANGUAGES C CXX CUDA) if(NOT CMAKE_CUDA_COMPILER) message(SEND_ERROR "CMake cannot locate a CUDA compiler") diff --git a/cpp/doxygen/Doxyfile b/cpp/doxygen/Doxyfile index 0e7428b9d17..163f4d75b57 100644 --- a/cpp/doxygen/Doxyfile +++ b/cpp/doxygen/Doxyfile @@ -38,7 +38,7 @@ PROJECT_NAME = "libcudf" # could be handy for archiving the generated documentation or if some version # control system is used. 
-PROJECT_NUMBER = 0.18.0 +PROJECT_NUMBER = 0.18.1 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index 2db7e46413d..8f5715281e5 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -76,7 +76,7 @@ # The short X.Y version. version = '0.18' # The full version, including alpha/beta/rc tags. -release = '0.18.0' +release = '0.18.1' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. From 3341561949ef9cb02b83258aca301c47cac1afc2 Mon Sep 17 00:00:00 2001 From: Raymond Douglass Date: Thu, 15 Apr 2021 12:20:29 -0400 Subject: [PATCH 10/34] REL v0.18.2 release --- cpp/CMakeLists.txt | 2 +- cpp/doxygen/Doxyfile | 2 +- docs/cudf/source/conf.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index ec530445b34..e0c60ebd173 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -16,7 +16,7 @@ cmake_minimum_required(VERSION 3.14...3.17 FATAL_ERROR) -project(CUDA_DATAFRAME VERSION 0.18.1 LANGUAGES C CXX CUDA) +project(CUDA_DATAFRAME VERSION 0.18.2 LANGUAGES C CXX CUDA) if(NOT CMAKE_CUDA_COMPILER) message(SEND_ERROR "CMake cannot locate a CUDA compiler") diff --git a/cpp/doxygen/Doxyfile b/cpp/doxygen/Doxyfile index 163f4d75b57..dc02d11d2ee 100644 --- a/cpp/doxygen/Doxyfile +++ b/cpp/doxygen/Doxyfile @@ -38,7 +38,7 @@ PROJECT_NAME = "libcudf" # could be handy for archiving the generated documentation or if some version # control system is used. 
-PROJECT_NUMBER = 0.18.1 +PROJECT_NUMBER = 0.18.2 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index 8f5715281e5..1e6ec952a7a 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -76,7 +76,7 @@ # The short X.Y version. version = '0.18' # The full version, including alpha/beta/rc tags. -release = '0.18.1' +release = '0.18.2' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. From f07b25103e87229beacb25b6381e9b811e1e62f3 Mon Sep 17 00:00:00 2001 From: gpuCI <38199262+GPUtester@users.noreply.github.com> Date: Wed, 21 Apr 2021 09:05:10 -0700 Subject: [PATCH 11/34] REL v0.19.0 release --- docs/cudf/source/conf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index b68d7b5849f..bd01cdcd4d8 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -77,9 +77,9 @@ # built documents. # # The short X.Y version. -version = "0.19" +version = '0.19' # The full version, including alpha/beta/rc tags. -release = "0.19.0" +release = '0.19.0' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. 
From 61e5a207407e37c4958e17e664e20f8389330cf4 Mon Sep 17 00:00:00 2001 From: AJ Schmidt Date: Wed, 21 Apr 2021 13:18:36 -0400 Subject: [PATCH 12/34] REL Changelog update --- CHANGELOG.md | 317 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 315 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 21ab8ed3274..2bc2d5e417d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,319 @@ -# cuDF 0.19.0 (Date TBD) +# cuDF 0.19.0 (21 Apr 2021) + +## 🚨 Breaking Changes + +- Allow hash_partition to take a seed value ([#7771](https://github.com/rapidsai/cudf/pull/7771)) [@magnatelee](https://github.com/magnatelee) +- Allow merging index column with data column using keyword "on" ([#7736](https://github.com/rapidsai/cudf/pull/7736)) [@skirui-source](https://github.com/skirui-source) +- Change JNI API to avoid loading native dependencies when creating sort order classes. ([#7729](https://github.com/rapidsai/cudf/pull/7729)) [@revans2](https://github.com/revans2) +- Replace device_vector with device_uvector in null_mask ([#7715](https://github.com/rapidsai/cudf/pull/7715)) [@harrism](https://github.com/harrism) +- Don't identify decimals as strings. 
([#7710](https://github.com/rapidsai/cudf/pull/7710)) [@vyasr](https://github.com/vyasr) +- Fix Java Parquet write after writer API changes ([#7655](https://github.com/rapidsai/cudf/pull/7655)) [@revans2](https://github.com/revans2) +- Convert cudf::concatenate APIs to use spans and device_uvector ([#7621](https://github.com/rapidsai/cudf/pull/7621)) [@harrism](https://github.com/harrism) +- Update missing docstring examples in python public APIs ([#7546](https://github.com/rapidsai/cudf/pull/7546)) [@galipremsagar](https://github.com/galipremsagar) +- Remove unneeded step parameter from strings::detail::copy_slice ([#7525](https://github.com/rapidsai/cudf/pull/7525)) [@davidwendt](https://github.com/davidwendt) +- Rename ARROW_STATIC_LIB because it conflicts with one in FindArrow.cmake ([#7518](https://github.com/rapidsai/cudf/pull/7518)) [@trxcllnt](https://github.com/trxcllnt) +- Match Pandas logic for comparing two objects with nulls ([#7490](https://github.com/rapidsai/cudf/pull/7490)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Add struct support to parquet writer ([#7461](https://github.com/rapidsai/cudf/pull/7461)) [@devavret](https://github.com/devavret) +- Join APIs that return gathermaps ([#7454](https://github.com/rapidsai/cudf/pull/7454)) [@shwina](https://github.com/shwina) +- `fixed_point` + `cudf::binary_operation` API Changes ([#7435](https://github.com/rapidsai/cudf/pull/7435)) [@codereport](https://github.com/codereport) +- Fix BUG: Exception when PYTHONOPTIMIZE=2 ([#7434](https://github.com/rapidsai/cudf/pull/7434)) [@skirui-source](https://github.com/skirui-source) +- Change nvtext::load_vocabulary_file to return a unique ptr ([#7424](https://github.com/rapidsai/cudf/pull/7424)) [@davidwendt](https://github.com/davidwendt) +- Refactor strings column factories ([#7397](https://github.com/rapidsai/cudf/pull/7397)) [@harrism](https://github.com/harrism) +- Use CMAKE_CUDA_ARCHITECTURES 
([#7391](https://github.com/rapidsai/cudf/pull/7391)) [@robertmaynard](https://github.com/robertmaynard) +- Upgrade pandas to 1.2 ([#7375](https://github.com/rapidsai/cudf/pull/7375)) [@galipremsagar](https://github.com/galipremsagar) +- Rename `logical_cast` to `bit_cast` and allow additional conversions ([#7373](https://github.com/rapidsai/cudf/pull/7373)) [@ttnghia](https://github.com/ttnghia) +- Rework libcudf CMakeLists.txt to export targets for CPM ([#7107](https://github.com/rapidsai/cudf/pull/7107)) [@trxcllnt](https://github.com/trxcllnt) + +## 🐛 Bug Fixes + +- Fix a `NameError` in meta dispatch API ([#7996](https://github.com/rapidsai/cudf/pull/7996)) [@galipremsagar](https://github.com/galipremsagar) +- Reindex in `DataFrame.__setitem__` ([#7957](https://github.com/rapidsai/cudf/pull/7957)) [@galipremsagar](https://github.com/galipremsagar) +- jitify direct-to-cubin compilation and caching. ([#7919](https://github.com/rapidsai/cudf/pull/7919)) [@cwharris](https://github.com/cwharris) +- Use dynamic cudart for nvcomp in java build ([#7896](https://github.com/rapidsai/cudf/pull/7896)) [@abellina](https://github.com/abellina) +- fix "incompatible redefinition" warnings ([#7894](https://github.com/rapidsai/cudf/pull/7894)) [@cwharris](https://github.com/cwharris) +- cudf consistently specifies the cuda runtime ([#7887](https://github.com/rapidsai/cudf/pull/7887)) [@robertmaynard](https://github.com/robertmaynard) +- disable verbose output for jitify_preprocess ([#7886](https://github.com/rapidsai/cudf/pull/7886)) [@cwharris](https://github.com/cwharris) +- CMake jit_preprocess_files function only runs when needed ([#7872](https://github.com/rapidsai/cudf/pull/7872)) [@robertmaynard](https://github.com/robertmaynard) +- Push DeviceScalar construction into cython for list.contains ([#7864](https://github.com/rapidsai/cudf/pull/7864)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- cudf now sets an install rpath of $ORIGIN 
([#7863](https://github.com/rapidsai/cudf/pull/7863)) [@robertmaynard](https://github.com/robertmaynard) +- Don't install Thrust examples, tests, docs, and python files ([#7811](https://github.com/rapidsai/cudf/pull/7811)) [@robertmaynard](https://github.com/robertmaynard) +- Sort by index in groupby tests more consistently ([#7802](https://github.com/rapidsai/cudf/pull/7802)) [@shwina](https://github.com/shwina) +- Revert "Update conda recipes pinning of repo dependencies (#7743)" ([#7793](https://github.com/rapidsai/cudf/pull/7793)) [@raydouglass](https://github.com/raydouglass) +- Add decimal column handling in copy_type_metadata ([#7788](https://github.com/rapidsai/cudf/pull/7788)) [@shwina](https://github.com/shwina) +- Add column names validation in parquet writer ([#7786](https://github.com/rapidsai/cudf/pull/7786)) [@galipremsagar](https://github.com/galipremsagar) +- Fix Java explode outer unit tests ([#7782](https://github.com/rapidsai/cudf/pull/7782)) [@jlowe](https://github.com/jlowe) +- Fix compiler warning about non-POD types passed through ellipsis ([#7781](https://github.com/rapidsai/cudf/pull/7781)) [@jrhemstad](https://github.com/jrhemstad) +- User resource fix for replace_nulls ([#7769](https://github.com/rapidsai/cudf/pull/7769)) [@magnatelee](https://github.com/magnatelee) +- Fix type dispatch for columnar replace_nulls ([#7768](https://github.com/rapidsai/cudf/pull/7768)) [@jlowe](https://github.com/jlowe) +- Add `ignore_order` parameter to dask-cudf concat dispatch ([#7765](https://github.com/rapidsai/cudf/pull/7765)) [@galipremsagar](https://github.com/galipremsagar) +- Fix slicing and arrow representations of decimal columns ([#7755](https://github.com/rapidsai/cudf/pull/7755)) [@vyasr](https://github.com/vyasr) +- Fixing issue with explode_outer position not nulling position entries of null rows ([#7754](https://github.com/rapidsai/cudf/pull/7754)) [@hyperbolic2346](https://github.com/hyperbolic2346) +- Implement scatter for 
struct columns ([#7752](https://github.com/rapidsai/cudf/pull/7752)) [@ttnghia](https://github.com/ttnghia) +- Fix data corruption in string columns ([#7746](https://github.com/rapidsai/cudf/pull/7746)) [@galipremsagar](https://github.com/galipremsagar) +- Fix string length in stripe dictionary building ([#7744](https://github.com/rapidsai/cudf/pull/7744)) [@kaatish](https://github.com/kaatish) +- Update conda recipes pinning of repo dependencies ([#7743](https://github.com/rapidsai/cudf/pull/7743)) [@mike-wendt](https://github.com/mike-wendt) +- Enable dask dispatch to cuDF's `is_categorical_dtype` for cuDF objects ([#7740](https://github.com/rapidsai/cudf/pull/7740)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Fix dictionary size computation in ORC writer ([#7737](https://github.com/rapidsai/cudf/pull/7737)) [@vuule](https://github.com/vuule) +- Fix `cudf::cast` overflow for `decimal64` to `int32_t` or smaller in certain cases ([#7733](https://github.com/rapidsai/cudf/pull/7733)) [@codereport](https://github.com/codereport) +- Change JNI API to avoid loading native dependencies when creating sort order classes. ([#7729](https://github.com/rapidsai/cudf/pull/7729)) [@revans2](https://github.com/revans2) +- Disable column_view data accessors for unsupported types ([#7725](https://github.com/rapidsai/cudf/pull/7725)) [@jrhemstad](https://github.com/jrhemstad) +- Materialize `RangeIndex` when `index=True` in parquet writer ([#7711](https://github.com/rapidsai/cudf/pull/7711)) [@galipremsagar](https://github.com/galipremsagar) +- Don't identify decimals as strings. 
([#7710](https://github.com/rapidsai/cudf/pull/7710)) [@vyasr](https://github.com/vyasr) +- Fix return type of `DataFrame.argsort` ([#7706](https://github.com/rapidsai/cudf/pull/7706)) [@galipremsagar](https://github.com/galipremsagar) +- Fix/correct cudf installed package requirements ([#7688](https://github.com/rapidsai/cudf/pull/7688)) [@robertmaynard](https://github.com/robertmaynard) +- Fix SparkMurmurHash3_32 hash inconsistencies with Apache Spark ([#7672](https://github.com/rapidsai/cudf/pull/7672)) [@jlowe](https://github.com/jlowe) +- Fix ORC reader issue with reading empty string columns ([#7656](https://github.com/rapidsai/cudf/pull/7656)) [@rgsl888prabhu](https://github.com/rgsl888prabhu) +- Fix Java Parquet write after writer API changes ([#7655](https://github.com/rapidsai/cudf/pull/7655)) [@revans2](https://github.com/revans2) +- Fixing empty null lists throwing explode_outer for a loop. ([#7649](https://github.com/rapidsai/cudf/pull/7649)) [@hyperbolic2346](https://github.com/hyperbolic2346) +- Fix internal compiler error during JNI Docker build ([#7645](https://github.com/rapidsai/cudf/pull/7645)) [@jlowe](https://github.com/jlowe) +- Fix Debug build break with device_uvectors in grouped_rolling.cu ([#7633](https://github.com/rapidsai/cudf/pull/7633)) [@mythrocks](https://github.com/mythrocks) +- Parquet reader: Fix issue when using skip_rows on non-nested columns containing nulls ([#7627](https://github.com/rapidsai/cudf/pull/7627)) [@nvdbaranec](https://github.com/nvdbaranec) +- Fix ORC reader for empty DataFrame/Table ([#7624](https://github.com/rapidsai/cudf/pull/7624)) [@rgsl888prabhu](https://github.com/rgsl888prabhu) +- Fix specifying GPU architecture in JNI build ([#7612](https://github.com/rapidsai/cudf/pull/7612)) [@jlowe](https://github.com/jlowe) +- Fix ORC writer OOM issue ([#7605](https://github.com/rapidsai/cudf/pull/7605)) [@vuule](https://github.com/vuule) +- Fix 0.18 --> 0.19 automerge 
([#7589](https://github.com/rapidsai/cudf/pull/7589)) [@kkraus14](https://github.com/kkraus14) +- Fix ORC issue with incorrect timestamp nanosecond values ([#7581](https://github.com/rapidsai/cudf/pull/7581)) [@vuule](https://github.com/vuule) +- Fix missing Dask imports ([#7580](https://github.com/rapidsai/cudf/pull/7580)) [@kkraus14](https://github.com/kkraus14) +- CMAKE_CUDA_ARCHITECTURES doesn't change when build-system invokes cmake ([#7579](https://github.com/rapidsai/cudf/pull/7579)) [@robertmaynard](https://github.com/robertmaynard) +- Another fix for offsets_end() iterator in lists_column_view ([#7575](https://github.com/rapidsai/cudf/pull/7575)) [@ttnghia](https://github.com/ttnghia) +- Fix ORC writer output corruption with string columns ([#7565](https://github.com/rapidsai/cudf/pull/7565)) [@vuule](https://github.com/vuule) +- Fix cudf::lists::sort_lists failing for sliced column ([#7564](https://github.com/rapidsai/cudf/pull/7564)) [@ttnghia](https://github.com/ttnghia) +- FIX Fix Anaconda upload args ([#7558](https://github.com/rapidsai/cudf/pull/7558)) [@dillon-cullinan](https://github.com/dillon-cullinan) +- Fix index mismatch issue in equality related APIs ([#7555](https://github.com/rapidsai/cudf/pull/7555)) [@galipremsagar](https://github.com/galipremsagar) +- FIX Revert gpuci_conda_retry on conda file output locations ([#7552](https://github.com/rapidsai/cudf/pull/7552)) [@dillon-cullinan](https://github.com/dillon-cullinan) +- Fix offset_end iterator for lists_column_view, which was not correctl… ([#7551](https://github.com/rapidsai/cudf/pull/7551)) [@ttnghia](https://github.com/ttnghia) +- Fix no such file dlpack.h error when build libcudf ([#7549](https://github.com/rapidsai/cudf/pull/7549)) [@chenrui17](https://github.com/chenrui17) +- Update missing docstring examples in python public APIs ([#7546](https://github.com/rapidsai/cudf/pull/7546)) [@galipremsagar](https://github.com/galipremsagar) +- Decimal32 Build Fix 
([#7544](https://github.com/rapidsai/cudf/pull/7544)) [@razajafri](https://github.com/razajafri) +- FIX Retry conda output location ([#7540](https://github.com/rapidsai/cudf/pull/7540)) [@dillon-cullinan](https://github.com/dillon-cullinan) +- fix missing renames of dask git branches from master to main ([#7535](https://github.com/rapidsai/cudf/pull/7535)) [@kkraus14](https://github.com/kkraus14) +- Remove detail from device_span ([#7533](https://github.com/rapidsai/cudf/pull/7533)) [@rwlee](https://github.com/rwlee) +- Change dask and distributed branch to main ([#7532](https://github.com/rapidsai/cudf/pull/7532)) [@dantegd](https://github.com/dantegd) +- Update JNI build to use CUDF_USE_ARROW_STATIC ([#7526](https://github.com/rapidsai/cudf/pull/7526)) [@jlowe](https://github.com/jlowe) +- Make sure rmm::rmm CMake target is visibile to cudf users ([#7524](https://github.com/rapidsai/cudf/pull/7524)) [@robertmaynard](https://github.com/robertmaynard) +- Fix contiguous_split not properly handling output partitions > 2 GB. 
([#7515](https://github.com/rapidsai/cudf/pull/7515)) [@nvdbaranec](https://github.com/nvdbaranec) +- Change jit launch to safe_launch ([#7510](https://github.com/rapidsai/cudf/pull/7510)) [@devavret](https://github.com/devavret) +- Fix comparison between Datetime/Timedelta columns and NULL scalars ([#7504](https://github.com/rapidsai/cudf/pull/7504)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Fix off-by-one error in char-parallel string scalar replace ([#7502](https://github.com/rapidsai/cudf/pull/7502)) [@jlowe](https://github.com/jlowe) +- Fix JNI deprecation of all, put it on the wrong version before ([#7501](https://github.com/rapidsai/cudf/pull/7501)) [@revans2](https://github.com/revans2) +- Fix Series/Dataframe Mixed Arithmetic ([#7491](https://github.com/rapidsai/cudf/pull/7491)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Fix JNI build after removal of libcudf sub-libraries ([#7486](https://github.com/rapidsai/cudf/pull/7486)) [@jlowe](https://github.com/jlowe) +- Correctly compile benchmarks ([#7485](https://github.com/rapidsai/cudf/pull/7485)) [@robertmaynard](https://github.com/robertmaynard) +- Fix bool column corruption with ORC Reader ([#7483](https://github.com/rapidsai/cudf/pull/7483)) [@rgsl888prabhu](https://github.com/rgsl888prabhu) +- Fix `__repr__` for categorical dtype ([#7476](https://github.com/rapidsai/cudf/pull/7476)) [@galipremsagar](https://github.com/galipremsagar) +- Java cleaner synchronization ([#7474](https://github.com/rapidsai/cudf/pull/7474)) [@abellina](https://github.com/abellina) +- Fix java float/double parsing tests ([#7473](https://github.com/rapidsai/cudf/pull/7473)) [@revans2](https://github.com/revans2) +- Pass stream and user resource to make_default_constructed_scalar ([#7469](https://github.com/rapidsai/cudf/pull/7469)) [@magnatelee](https://github.com/magnatelee) +- Improve stability of dask_cudf.DataFrame.var and dask_cudf.DataFrame.std 
([#7453](https://github.com/rapidsai/cudf/pull/7453)) [@rjzamora](https://github.com/rjzamora) +- Missing `device_storage_dispatch` change affecting `cudf::gather` ([#7449](https://github.com/rapidsai/cudf/pull/7449)) [@codereport](https://github.com/codereport) +- fix cuFile JNI compile errors ([#7445](https://github.com/rapidsai/cudf/pull/7445)) [@rongou](https://github.com/rongou) +- Support `Series.__setitem__` with key to a new row ([#7443](https://github.com/rapidsai/cudf/pull/7443)) [@isVoid](https://github.com/isVoid) +- Fix BUG: Exception when PYTHONOPTIMIZE=2 ([#7434](https://github.com/rapidsai/cudf/pull/7434)) [@skirui-source](https://github.com/skirui-source) +- Make inclusive scan safe for cases with leading nulls ([#7432](https://github.com/rapidsai/cudf/pull/7432)) [@magnatelee](https://github.com/magnatelee) +- Fix typo in list_device_view::pair_rep_end() ([#7423](https://github.com/rapidsai/cudf/pull/7423)) [@mythrocks](https://github.com/mythrocks) +- Fix string to double conversion and row equivalent comparison ([#7410](https://github.com/rapidsai/cudf/pull/7410)) [@ttnghia](https://github.com/ttnghia) +- Fix thrust failure when transfering data from device_vector to host_vector with vectors of size 1 ([#7382](https://github.com/rapidsai/cudf/pull/7382)) [@ttnghia](https://github.com/ttnghia) +- Fix std::exeception catch-by-reference gcc9 compile error ([#7380](https://github.com/rapidsai/cudf/pull/7380)) [@davidwendt](https://github.com/davidwendt) +- Fix skiprows issue with ORC Reader ([#7359](https://github.com/rapidsai/cudf/pull/7359)) [@rgsl888prabhu](https://github.com/rgsl888prabhu) +- fix Arrow CMake file ([#7358](https://github.com/rapidsai/cudf/pull/7358)) [@rongou](https://github.com/rongou) +- Fix lists::contains() for NaN and Decimals ([#7349](https://github.com/rapidsai/cudf/pull/7349)) [@mythrocks](https://github.com/mythrocks) +- Handle cupy array in `Dataframe.__setitem__` ([#7340](https://github.com/rapidsai/cudf/pull/7340)) 
[@galipremsagar](https://github.com/galipremsagar) +- Fix invalid-device-fn error in cudf::strings::replace_re with multiple regex's ([#7336](https://github.com/rapidsai/cudf/pull/7336)) [@davidwendt](https://github.com/davidwendt) +- FIX Add codecov upload block to gpu script ([#6860](https://github.com/rapidsai/cudf/pull/6860)) [@dillon-cullinan](https://github.com/dillon-cullinan) + +## 📖 Documentation + +- Fix join API doxygen ([#7890](https://github.com/rapidsai/cudf/pull/7890)) [@shwina](https://github.com/shwina) +- Add Resources to README. ([#7697](https://github.com/rapidsai/cudf/pull/7697)) [@bdice](https://github.com/bdice) +- Add `isin` examples in Docstring ([#7479](https://github.com/rapidsai/cudf/pull/7479)) [@galipremsagar](https://github.com/galipremsagar) +- Resolving unlinked type shorthands in cudf doc ([#7416](https://github.com/rapidsai/cudf/pull/7416)) [@isVoid](https://github.com/isVoid) +- Fix typo in regex.md doc page ([#7363](https://github.com/rapidsai/cudf/pull/7363)) [@davidwendt](https://github.com/davidwendt) +- Fix incorrect strings_column_view::chars_size documentation ([#7360](https://github.com/rapidsai/cudf/pull/7360)) [@jlowe](https://github.com/jlowe) + +## 🚀 New Features + +- Enable basic reductions for decimal columns ([#7776](https://github.com/rapidsai/cudf/pull/7776)) [@ChrisJar](https://github.com/ChrisJar) +- Enable join on decimal columns ([#7764](https://github.com/rapidsai/cudf/pull/7764)) [@ChrisJar](https://github.com/ChrisJar) +- Allow merging index column with data column using keyword "on" ([#7736](https://github.com/rapidsai/cudf/pull/7736)) [@skirui-source](https://github.com/skirui-source) +- Implement DecimalColumn + Scalar and add cudf.Scalars of Decimal64Dtype ([#7732](https://github.com/rapidsai/cudf/pull/7732)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Add support for `unique` groupby aggregation ([#7726](https://github.com/rapidsai/cudf/pull/7726)) 
[@shwina](https://github.com/shwina) +- Expose libcudf's label_bins function to cudf ([#7724](https://github.com/rapidsai/cudf/pull/7724)) [@vyasr](https://github.com/vyasr) +- Adding support for equi-join on struct ([#7720](https://github.com/rapidsai/cudf/pull/7720)) [@hyperbolic2346](https://github.com/hyperbolic2346) +- Add decimal column comparison operations ([#7716](https://github.com/rapidsai/cudf/pull/7716)) [@isVoid](https://github.com/isVoid) +- Implement scan operations for decimal columns ([#7707](https://github.com/rapidsai/cudf/pull/7707)) [@ChrisJar](https://github.com/ChrisJar) +- Enable typecasting between decimal and int ([#7691](https://github.com/rapidsai/cudf/pull/7691)) [@ChrisJar](https://github.com/ChrisJar) +- Enable decimal support in parquet writer ([#7673](https://github.com/rapidsai/cudf/pull/7673)) [@devavret](https://github.com/devavret) +- Adds `list.unique` API ([#7664](https://github.com/rapidsai/cudf/pull/7664)) [@isVoid](https://github.com/isVoid) +- Fix NaN handling in drop_list_duplicates ([#7662](https://github.com/rapidsai/cudf/pull/7662)) [@ttnghia](https://github.com/ttnghia) +- Add `lists.sort_values` API ([#7657](https://github.com/rapidsai/cudf/pull/7657)) [@isVoid](https://github.com/isVoid) +- Add is_integer API that can check for the validity of a string-to-integer conversion ([#7642](https://github.com/rapidsai/cudf/pull/7642)) [@ttnghia](https://github.com/ttnghia) +- Adds `explode` API ([#7607](https://github.com/rapidsai/cudf/pull/7607)) [@isVoid](https://github.com/isVoid) +- Adds `list.take`, python binding for `cudf::lists::segmented_gather` ([#7591](https://github.com/rapidsai/cudf/pull/7591)) [@isVoid](https://github.com/isVoid) +- Implement cudf::label_bins() ([#7554](https://github.com/rapidsai/cudf/pull/7554)) [@vyasr](https://github.com/vyasr) +- Add Python bindings for `lists::contains` ([#7547](https://github.com/rapidsai/cudf/pull/7547)) [@skirui-source](https://github.com/skirui-source) +- 
cudf::row_bit_count() support. ([#7534](https://github.com/rapidsai/cudf/pull/7534)) [@nvdbaranec](https://github.com/nvdbaranec) +- Implement drop_list_duplicates ([#7528](https://github.com/rapidsai/cudf/pull/7528)) [@ttnghia](https://github.com/ttnghia) +- Add Python bindings for `lists::extract_lists_element` ([#7505](https://github.com/rapidsai/cudf/pull/7505)) [@skirui-source](https://github.com/skirui-source) +- Add explode_outer and explode_outer_position ([#7499](https://github.com/rapidsai/cudf/pull/7499)) [@hyperbolic2346](https://github.com/hyperbolic2346) +- Match Pandas logic for comparing two objects with nulls ([#7490](https://github.com/rapidsai/cudf/pull/7490)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Add struct support to parquet writer ([#7461](https://github.com/rapidsai/cudf/pull/7461)) [@devavret](https://github.com/devavret) +- Enable type conversion from float to decimal type ([#7450](https://github.com/rapidsai/cudf/pull/7450)) [@ChrisJar](https://github.com/ChrisJar) +- Add cython for converting strings/fixed-point functions ([#7429](https://github.com/rapidsai/cudf/pull/7429)) [@davidwendt](https://github.com/davidwendt) +- Add struct column support to cudf::sort and cudf::sorted_order ([#7422](https://github.com/rapidsai/cudf/pull/7422)) [@karthikeyann](https://github.com/karthikeyann) +- Implement groupby collect_set ([#7420](https://github.com/rapidsai/cudf/pull/7420)) [@ttnghia](https://github.com/ttnghia) +- Merge branch-0.18 into branch-0.19 ([#7411](https://github.com/rapidsai/cudf/pull/7411)) [@raydouglass](https://github.com/raydouglass) +- Refactor strings column factories ([#7397](https://github.com/rapidsai/cudf/pull/7397)) [@harrism](https://github.com/harrism) +- Add groupby scan operations (sort groupby) ([#7387](https://github.com/rapidsai/cudf/pull/7387)) [@karthikeyann](https://github.com/karthikeyann) +- Add cudf::explode_position ([#7376](https://github.com/rapidsai/cudf/pull/7376)) 
[@hyperbolic2346](https://github.com/hyperbolic2346) +- Add string conversion to/from decimal values libcudf APIs ([#7364](https://github.com/rapidsai/cudf/pull/7364)) [@davidwendt](https://github.com/davidwendt) +- Add groupby SUM_OF_SQUARES support ([#7362](https://github.com/rapidsai/cudf/pull/7362)) [@karthikeyann](https://github.com/karthikeyann) +- Add `Series.drop` api ([#7304](https://github.com/rapidsai/cudf/pull/7304)) [@isVoid](https://github.com/isVoid) +- get_json_object() implementation ([#7286](https://github.com/rapidsai/cudf/pull/7286)) [@nvdbaranec](https://github.com/nvdbaranec) +- Python API for `ListMethods.len()` ([#7283](https://github.com/rapidsai/cudf/pull/7283)) [@isVoid](https://github.com/isVoid) +- Support null_policy::EXCLUDE for COLLECT rolling aggregation ([#7264](https://github.com/rapidsai/cudf/pull/7264)) [@mythrocks](https://github.com/mythrocks) +- Add support for special tokens in nvtext::subword_tokenizer ([#7254](https://github.com/rapidsai/cudf/pull/7254)) [@davidwendt](https://github.com/davidwendt) +- Fix inplace update of data and add Series.update ([#7201](https://github.com/rapidsai/cudf/pull/7201)) [@galipremsagar](https://github.com/galipremsagar) +- Implement `cudf::group_by` (hash) for `decimal32` and `decimal64` ([#7190](https://github.com/rapidsai/cudf/pull/7190)) [@codereport](https://github.com/codereport) +- Adding support to specify "level" parameter for `Dataframe.rename` ([#7135](https://github.com/rapidsai/cudf/pull/7135)) [@skirui-source](https://github.com/skirui-source) + +## 🛠️ Improvements + +- fix GDS include path for version 0.95 ([#7877](https://github.com/rapidsai/cudf/pull/7877)) [@rongou](https://github.com/rongou) +- Update `dask` + `distributed` to `2021.4.0` ([#7858](https://github.com/rapidsai/cudf/pull/7858)) [@jakirkham](https://github.com/jakirkham) +- Add ability to extract include dirs from `CUDF_HOME` ([#7848](https://github.com/rapidsai/cudf/pull/7848)) 
[@galipremsagar](https://github.com/galipremsagar) +- Add USE_GDS as an option in build script ([#7833](https://github.com/rapidsai/cudf/pull/7833)) [@pxLi](https://github.com/pxLi) +- add an allocate method with stream in java DeviceMemoryBuffer ([#7826](https://github.com/rapidsai/cudf/pull/7826)) [@rongou](https://github.com/rongou) +- Constrain dask and distributed versions to 2021.3.1 ([#7825](https://github.com/rapidsai/cudf/pull/7825)) [@shwina](https://github.com/shwina) +- Revert dask versioning of concat dispatch ([#7823](https://github.com/rapidsai/cudf/pull/7823)) [@galipremsagar](https://github.com/galipremsagar) +- add copy methods in Java memory buffer ([#7791](https://github.com/rapidsai/cudf/pull/7791)) [@rongou](https://github.com/rongou) +- Update README and CONTRIBUTING for 0.19 ([#7778](https://github.com/rapidsai/cudf/pull/7778)) [@robertmaynard](https://github.com/robertmaynard) +- Allow hash_partition to take a seed value ([#7771](https://github.com/rapidsai/cudf/pull/7771)) [@magnatelee](https://github.com/magnatelee) +- Turn on NVTX by default in java build ([#7761](https://github.com/rapidsai/cudf/pull/7761)) [@tgravescs](https://github.com/tgravescs) +- Add Java bindings to join gather map APIs ([#7751](https://github.com/rapidsai/cudf/pull/7751)) [@jlowe](https://github.com/jlowe) +- Add replacements column support for Java replaceNulls ([#7750](https://github.com/rapidsai/cudf/pull/7750)) [@jlowe](https://github.com/jlowe) +- Add Java bindings for row_bit_count ([#7749](https://github.com/rapidsai/cudf/pull/7749)) [@jlowe](https://github.com/jlowe) +- Remove unused JVM array creation ([#7748](https://github.com/rapidsai/cudf/pull/7748)) [@jlowe](https://github.com/jlowe) +- Added JNI support for new is_integer ([#7739](https://github.com/rapidsai/cudf/pull/7739)) [@revans2](https://github.com/revans2) +- Create and promote library aliases in libcudf installations ([#7734](https://github.com/rapidsai/cudf/pull/7734)) 
[@trxcllnt](https://github.com/trxcllnt) +- Support groupby operations for decimal dtypes ([#7731](https://github.com/rapidsai/cudf/pull/7731)) [@vyasr](https://github.com/vyasr) +- Memory map the input file only when GDS compatibility mode is not used ([#7717](https://github.com/rapidsai/cudf/pull/7717)) [@vuule](https://github.com/vuule) +- Replace device_vector with device_uvector in null_mask ([#7715](https://github.com/rapidsai/cudf/pull/7715)) [@harrism](https://github.com/harrism) +- Struct hashing support for SerialMurmur3 and SparkMurmur3 ([#7714](https://github.com/rapidsai/cudf/pull/7714)) [@jlowe](https://github.com/jlowe) +- Add gbenchmark for nvtext replace-tokens function ([#7708](https://github.com/rapidsai/cudf/pull/7708)) [@davidwendt](https://github.com/davidwendt) +- Use stream in groupby calls ([#7705](https://github.com/rapidsai/cudf/pull/7705)) [@karthikeyann](https://github.com/karthikeyann) +- Update codeowners file ([#7701](https://github.com/rapidsai/cudf/pull/7701)) [@ajschmidt8](https://github.com/ajschmidt8) +- Cleanup groupby to use host_span, device_span, device_uvector ([#7698](https://github.com/rapidsai/cudf/pull/7698)) [@karthikeyann](https://github.com/karthikeyann) +- Add gbenchmark for nvtext ngrams functions ([#7693](https://github.com/rapidsai/cudf/pull/7693)) [@davidwendt](https://github.com/davidwendt) +- Misc Python/Cython optimizations ([#7686](https://github.com/rapidsai/cudf/pull/7686)) [@shwina](https://github.com/shwina) +- Add gbenchmark for nvtext tokenize functions ([#7684](https://github.com/rapidsai/cudf/pull/7684)) [@davidwendt](https://github.com/davidwendt) +- Add column_device_view to orc writer ([#7676](https://github.com/rapidsai/cudf/pull/7676)) [@kaatish](https://github.com/kaatish) +- cudf_kafka now uses cuDF CMake export targets (CPM) ([#7674](https://github.com/rapidsai/cudf/pull/7674)) [@robertmaynard](https://github.com/robertmaynard) +- Add gbenchmark for nvtext normalize functions 
([#7668](https://github.com/rapidsai/cudf/pull/7668)) [@davidwendt](https://github.com/davidwendt) +- Resolve unnecessary import of thrust/optional.hpp in types.hpp ([#7667](https://github.com/rapidsai/cudf/pull/7667)) [@vyasr](https://github.com/vyasr) +- Feature/optimize accessor copy ([#7660](https://github.com/rapidsai/cudf/pull/7660)) [@vyasr](https://github.com/vyasr) +- Fix `find_package(cudf)` ([#7658](https://github.com/rapidsai/cudf/pull/7658)) [@trxcllnt](https://github.com/trxcllnt) +- Work-around for gcc7 compile error on Centos7 ([#7652](https://github.com/rapidsai/cudf/pull/7652)) [@davidwendt](https://github.com/davidwendt) +- Add in JNI support for count_elements ([#7651](https://github.com/rapidsai/cudf/pull/7651)) [@revans2](https://github.com/revans2) +- Fix issues with building cudf in a non-conda environment ([#7647](https://github.com/rapidsai/cudf/pull/7647)) [@galipremsagar](https://github.com/galipremsagar) +- Refactor ConfigureCUDA to not conditionally insert compiler flags ([#7643](https://github.com/rapidsai/cudf/pull/7643)) [@robertmaynard](https://github.com/robertmaynard) +- Add gbenchmark for converting strings to/from timestamps ([#7641](https://github.com/rapidsai/cudf/pull/7641)) [@davidwendt](https://github.com/davidwendt) +- Handle constructing a `cudf.Scalar` from a `cudf.Scalar` ([#7639](https://github.com/rapidsai/cudf/pull/7639)) [@shwina](https://github.com/shwina) +- Add in JNI support for table partition ([#7637](https://github.com/rapidsai/cudf/pull/7637)) [@revans2](https://github.com/revans2) +- Add explicit fixed_point merge test ([#7635](https://github.com/rapidsai/cudf/pull/7635)) [@codereport](https://github.com/codereport) +- Add JNI support for IDENTITY hash partitioning ([#7626](https://github.com/rapidsai/cudf/pull/7626)) [@revans2](https://github.com/revans2) +- Java support on explode_outer ([#7625](https://github.com/rapidsai/cudf/pull/7625)) [@sperlingxx](https://github.com/sperlingxx) +- Java support of 
casting string from/to decimal ([#7623](https://github.com/rapidsai/cudf/pull/7623)) [@sperlingxx](https://github.com/sperlingxx) +- Convert cudf::concatenate APIs to use spans and device_uvector ([#7621](https://github.com/rapidsai/cudf/pull/7621)) [@harrism](https://github.com/harrism) +- Add gbenchmark for cudf::strings::translate function ([#7617](https://github.com/rapidsai/cudf/pull/7617)) [@davidwendt](https://github.com/davidwendt) +- Use file(COPY ) over file(INSTALL ) so cmake output is reduced ([#7616](https://github.com/rapidsai/cudf/pull/7616)) [@robertmaynard](https://github.com/robertmaynard) +- Use rmm::device_uvector in place of rmm::device_vector for ORC reader/writer and cudf::io::column_buffer ([#7614](https://github.com/rapidsai/cudf/pull/7614)) [@vuule](https://github.com/vuule) +- Refactor Java host-side buffer concatenation to expose separate steps ([#7610](https://github.com/rapidsai/cudf/pull/7610)) [@jlowe](https://github.com/jlowe) +- Add gbenchmarks for string substrings functions ([#7603](https://github.com/rapidsai/cudf/pull/7603)) [@davidwendt](https://github.com/davidwendt) +- Refactor string conversion check ([#7599](https://github.com/rapidsai/cudf/pull/7599)) [@ttnghia](https://github.com/ttnghia) +- JNI: Pass names of children struct columns to native Arrow IPC writer ([#7598](https://github.com/rapidsai/cudf/pull/7598)) [@firestarman](https://github.com/firestarman) +- Revert "ENH Fix stale GHA and prevent duplicates " ([#7595](https://github.com/rapidsai/cudf/pull/7595)) [@mike-wendt](https://github.com/mike-wendt) +- ENH Fix stale GHA and prevent duplicates ([#7594](https://github.com/rapidsai/cudf/pull/7594)) [@mike-wendt](https://github.com/mike-wendt) +- Fix auto-detecting GPU architectures ([#7593](https://github.com/rapidsai/cudf/pull/7593)) [@trxcllnt](https://github.com/trxcllnt) +- Reduce cudf library size ([#7583](https://github.com/rapidsai/cudf/pull/7583)) [@robertmaynard](https://github.com/robertmaynard) +- 
Optimize cudf::make_strings_column for long strings ([#7576](https://github.com/rapidsai/cudf/pull/7576)) [@davidwendt](https://github.com/davidwendt) +- Always build and export the cudf::cudftestutil target ([#7574](https://github.com/rapidsai/cudf/pull/7574)) [@trxcllnt](https://github.com/trxcllnt) +- Eliminate literal parameters to uvector::set_element_async and device_scalar::set_value ([#7563](https://github.com/rapidsai/cudf/pull/7563)) [@harrism](https://github.com/harrism) +- Add gbenchmark for strings::concatenate ([#7560](https://github.com/rapidsai/cudf/pull/7560)) [@davidwendt](https://github.com/davidwendt) +- Update Changelog Link ([#7550](https://github.com/rapidsai/cudf/pull/7550)) [@ajschmidt8](https://github.com/ajschmidt8) +- Add gbenchmarks for strings replace regex functions ([#7541](https://github.com/rapidsai/cudf/pull/7541)) [@davidwendt](https://github.com/davidwendt) +- Add `__repr__` for Column and ColumnAccessor ([#7531](https://github.com/rapidsai/cudf/pull/7531)) [@shwina](https://github.com/shwina) +- Support Decimal DIV changes in cudf ([#7527](https://github.com/rapidsai/cudf/pull/7527)) [@razajafri](https://github.com/razajafri) +- Remove unneeded step parameter from strings::detail::copy_slice ([#7525](https://github.com/rapidsai/cudf/pull/7525)) [@davidwendt](https://github.com/davidwendt) +- Use device_uvector, device_span in sort groupby ([#7523](https://github.com/rapidsai/cudf/pull/7523)) [@karthikeyann](https://github.com/karthikeyann) +- Add gbenchmarks for strings extract function ([#7522](https://github.com/rapidsai/cudf/pull/7522)) [@davidwendt](https://github.com/davidwendt) +- Rename ARROW_STATIC_LIB because it conflicts with one in FindArrow.cmake ([#7518](https://github.com/rapidsai/cudf/pull/7518)) [@trxcllnt](https://github.com/trxcllnt) +- Reduce compile time/size for scan.cu ([#7516](https://github.com/rapidsai/cudf/pull/7516)) [@davidwendt](https://github.com/davidwendt) +- Change device_vector to 
device_uvector in nvtext source files ([#7512](https://github.com/rapidsai/cudf/pull/7512)) [@davidwendt](https://github.com/davidwendt) +- Removed unneeded includes from traits.hpp ([#7509](https://github.com/rapidsai/cudf/pull/7509)) [@davidwendt](https://github.com/davidwendt) +- FIX Remove random build directory generation for ccache ([#7508](https://github.com/rapidsai/cudf/pull/7508)) [@dillon-cullinan](https://github.com/dillon-cullinan) +- xfail failing pytest in pandas 1.2.3 ([#7507](https://github.com/rapidsai/cudf/pull/7507)) [@galipremsagar](https://github.com/galipremsagar) +- JNI bit cast ([#7493](https://github.com/rapidsai/cudf/pull/7493)) [@revans2](https://github.com/revans2) +- Combine rolling window function tests ([#7480](https://github.com/rapidsai/cudf/pull/7480)) [@mythrocks](https://github.com/mythrocks) +- Prepare Changelog for Automation ([#7477](https://github.com/rapidsai/cudf/pull/7477)) [@ajschmidt8](https://github.com/ajschmidt8) +- Java support for explode position ([#7471](https://github.com/rapidsai/cudf/pull/7471)) [@sperlingxx](https://github.com/sperlingxx) +- Update 0.18 changelog entry ([#7463](https://github.com/rapidsai/cudf/pull/7463)) [@ajschmidt8](https://github.com/ajschmidt8) +- JNI: Support skipping nulls for collect aggregation ([#7457](https://github.com/rapidsai/cudf/pull/7457)) [@firestarman](https://github.com/firestarman) +- Join APIs that return gathermaps ([#7454](https://github.com/rapidsai/cudf/pull/7454)) [@shwina](https://github.com/shwina) +- Remove dependence on managed memory for multimap test ([#7451](https://github.com/rapidsai/cudf/pull/7451)) [@jrhemstad](https://github.com/jrhemstad) +- Use cuFile for Parquet IO when available ([#7444](https://github.com/rapidsai/cudf/pull/7444)) [@vuule](https://github.com/vuule) +- Statistics cleanup ([#7439](https://github.com/rapidsai/cudf/pull/7439)) [@kaatish](https://github.com/kaatish) +- Add gbenchmarks for strings filter functions 
([#7438](https://github.com/rapidsai/cudf/pull/7438)) [@davidwendt](https://github.com/davidwendt) +- `fixed_point` + `cudf::binary_operation` API Changes ([#7435](https://github.com/rapidsai/cudf/pull/7435)) [@codereport](https://github.com/codereport) +- Improve string gather performance ([#7433](https://github.com/rapidsai/cudf/pull/7433)) [@jlowe](https://github.com/jlowe) +- Don't use user resource for a temporary allocation in sort_by_key ([#7431](https://github.com/rapidsai/cudf/pull/7431)) [@magnatelee](https://github.com/magnatelee) +- Detail APIs for datetime functions ([#7430](https://github.com/rapidsai/cudf/pull/7430)) [@magnatelee](https://github.com/magnatelee) +- Replace thrust::max_element with thrust::reduce in strings findall_re ([#7428](https://github.com/rapidsai/cudf/pull/7428)) [@davidwendt](https://github.com/davidwendt) +- Add gbenchmark for strings split/split_record functions ([#7427](https://github.com/rapidsai/cudf/pull/7427)) [@davidwendt](https://github.com/davidwendt) +- Update JNI build to use CMAKE_CUDA_ARCHITECTURES ([#7425](https://github.com/rapidsai/cudf/pull/7425)) [@jlowe](https://github.com/jlowe) +- Change nvtext::load_vocabulary_file to return a unique ptr ([#7424](https://github.com/rapidsai/cudf/pull/7424)) [@davidwendt](https://github.com/davidwendt) +- Simplify type dispatch with `device_storage_dispatch` ([#7419](https://github.com/rapidsai/cudf/pull/7419)) [@codereport](https://github.com/codereport) +- Java support for casting of nested child columns ([#7417](https://github.com/rapidsai/cudf/pull/7417)) [@razajafri](https://github.com/razajafri) +- Improve scalar string replace performance for long strings ([#7415](https://github.com/rapidsai/cudf/pull/7415)) [@jlowe](https://github.com/jlowe) +- Remove unneeded temporary device vector for strings scatter specialization ([#7409](https://github.com/rapidsai/cudf/pull/7409)) [@davidwendt](https://github.com/davidwendt) +- bitmask_or implementation with bitmask 
refactor ([#7406](https://github.com/rapidsai/cudf/pull/7406)) [@rwlee](https://github.com/rwlee) +- Add other cudf::strings::replace functions to current strings replace gbenchmark ([#7403](https://github.com/rapidsai/cudf/pull/7403)) [@davidwendt](https://github.com/davidwendt) +- Clean up included headers in `device_operators.cuh` ([#7401](https://github.com/rapidsai/cudf/pull/7401)) [@codereport](https://github.com/codereport) +- Move nullable index iterator to indexalator factory ([#7399](https://github.com/rapidsai/cudf/pull/7399)) [@davidwendt](https://github.com/davidwendt) +- ENH Pass ccache variables to conda recipe & use Ninja in CI ([#7398](https://github.com/rapidsai/cudf/pull/7398)) [@Ethyling](https://github.com/Ethyling) +- upgrade maven-antrun-plugin to support maven parallel builds ([#7393](https://github.com/rapidsai/cudf/pull/7393)) [@rongou](https://github.com/rongou) +- Add gbenchmark for strings find/contains functions ([#7392](https://github.com/rapidsai/cudf/pull/7392)) [@davidwendt](https://github.com/davidwendt) +- Use CMAKE_CUDA_ARCHITECTURES ([#7391](https://github.com/rapidsai/cudf/pull/7391)) [@robertmaynard](https://github.com/robertmaynard) +- Refactor libcudf strings::replace to use make_strings_children utility ([#7384](https://github.com/rapidsai/cudf/pull/7384)) [@davidwendt](https://github.com/davidwendt) +- Added in JNI support for out of core sort algorithm ([#7381](https://github.com/rapidsai/cudf/pull/7381)) [@revans2](https://github.com/revans2) +- Upgrade pandas to 1.2 ([#7375](https://github.com/rapidsai/cudf/pull/7375)) [@galipremsagar](https://github.com/galipremsagar) +- Rename `logical_cast` to `bit_cast` and allow additional conversions ([#7373](https://github.com/rapidsai/cudf/pull/7373)) [@ttnghia](https://github.com/ttnghia) +- jitify 2 support ([#7372](https://github.com/rapidsai/cudf/pull/7372)) [@cwharris](https://github.com/cwharris) +- compile_udf: Cache PTX for similar functions 
([#7371](https://github.com/rapidsai/cudf/pull/7371)) [@gmarkall](https://github.com/gmarkall) +- Add string scalar replace benchmark ([#7369](https://github.com/rapidsai/cudf/pull/7369)) [@jlowe](https://github.com/jlowe) +- Add gbenchmark for strings contains_re/count_re functions ([#7366](https://github.com/rapidsai/cudf/pull/7366)) [@davidwendt](https://github.com/davidwendt) +- Update orc reader and writer fuzz tests ([#7357](https://github.com/rapidsai/cudf/pull/7357)) [@galipremsagar](https://github.com/galipremsagar) +- Improve url_decode performance for long strings ([#7353](https://github.com/rapidsai/cudf/pull/7353)) [@jlowe](https://github.com/jlowe) +- `cudf::ast` Small Refactorings ([#7352](https://github.com/rapidsai/cudf/pull/7352)) [@codereport](https://github.com/codereport) +- Remove std::cout and print in the scatter test function EmptyListsOfNullableStrings. ([#7342](https://github.com/rapidsai/cudf/pull/7342)) [@ttnghia](https://github.com/ttnghia) +- Use `cudf::detail::make_counting_transform_iterator` ([#7338](https://github.com/rapidsai/cudf/pull/7338)) [@codereport](https://github.com/codereport) +- Change block size parameter from a global to a template param. 
([#7333](https://github.com/rapidsai/cudf/pull/7333)) [@nvdbaranec](https://github.com/nvdbaranec) +- Partial clean up of ORC writer ([#7324](https://github.com/rapidsai/cudf/pull/7324)) [@vuule](https://github.com/vuule) +- Add gbenchmark for cudf::strings::to_lower ([#7316](https://github.com/rapidsai/cudf/pull/7316)) [@davidwendt](https://github.com/davidwendt) +- Update Java bindings version to 0.19-SNAPSHOT ([#7307](https://github.com/rapidsai/cudf/pull/7307)) [@pxLi](https://github.com/pxLi) +- Move `cudf::test::make_counting_transform_iterator` to `cudf/detail/iterator.cuh` ([#7306](https://github.com/rapidsai/cudf/pull/7306)) [@codereport](https://github.com/codereport) +- Use string literals in `fixed_point` `release_assert`s ([#7303](https://github.com/rapidsai/cudf/pull/7303)) [@codereport](https://github.com/codereport) +- Fix merge conflicts for #7295 ([#7297](https://github.com/rapidsai/cudf/pull/7297)) [@ajschmidt8](https://github.com/ajschmidt8) +- Add UTF-8 chars to create_random_column<string_view> benchmark utility ([#7292](https://github.com/rapidsai/cudf/pull/7292)) [@davidwendt](https://github.com/davidwendt) +- Abstracting block reduce and block scan from cuIO kernels with `cub` apis ([#7278](https://github.com/rapidsai/cudf/pull/7278)) [@rgsl888prabhu](https://github.com/rgsl888prabhu) +- Build.sh use cmake --build to drive build system invocation ([#7270](https://github.com/rapidsai/cudf/pull/7270)) [@robertmaynard](https://github.com/robertmaynard) +- Refactor dictionary support for reductions any/all ([#7242](https://github.com/rapidsai/cudf/pull/7242)) [@davidwendt](https://github.com/davidwendt) +- Replace stream.value() with stream for stream_view args ([#7236](https://github.com/rapidsai/cudf/pull/7236)) [@karthikeyann](https://github.com/karthikeyann) +- Interval index and interval_range ([#7182](https://github.com/rapidsai/cudf/pull/7182)) [@marlenezw](https://github.com/marlenezw) +- avro reader integration tests 
([#7156](https://github.com/rapidsai/cudf/pull/7156)) [@cwharris](https://github.com/cwharris) +- Rework libcudf CMakeLists.txt to export targets for CPM ([#7107](https://github.com/rapidsai/cudf/pull/7107)) [@trxcllnt](https://github.com/trxcllnt) +- Adding Interval Dtype ([#6984](https://github.com/rapidsai/cudf/pull/6984)) [@marlenezw](https://github.com/marlenezw) +- Cleaning up `for` loops with `make_(counting_)transform_iterator` ([#6546](https://github.com/rapidsai/cudf/pull/6546)) [@codereport](https://github.com/codereport) -Please see https://github.com/rapidsai/cudf/releases/tag/v0.19.0a for the latest changes to this development branch. # cuDF 0.18.0 (24 Feb 2021) From a9f345390e3b341c6bed8011c364587e13783732 Mon Sep 17 00:00:00 2001 From: gpuCI <38199262+GPUtester@users.noreply.github.com> Date: Thu, 22 Apr 2021 10:48:33 -0700 Subject: [PATCH 13/34] REL v0.19.1 release --- cpp/doxygen/Doxyfile | 2 +- cpp/libcudf_kafka/CMakeLists.txt | 2 +- docs/cudf/source/conf.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/doxygen/Doxyfile b/cpp/doxygen/Doxyfile index 8fde8098bd3..24d644081ff 100644 --- a/cpp/doxygen/Doxyfile +++ b/cpp/doxygen/Doxyfile @@ -38,7 +38,7 @@ PROJECT_NAME = "libcudf" # could be handy for archiving the generated documentation or if some version # control system is used. 
-PROJECT_NUMBER = 0.19.0 +PROJECT_NUMBER = 0.19.1 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a diff --git a/cpp/libcudf_kafka/CMakeLists.txt b/cpp/libcudf_kafka/CMakeLists.txt index e178f5a6280..86f6e153662 100644 --- a/cpp/libcudf_kafka/CMakeLists.txt +++ b/cpp/libcudf_kafka/CMakeLists.txt @@ -15,7 +15,7 @@ #============================================================================= cmake_minimum_required(VERSION 3.18 FATAL_ERROR) -project(CUDA_KAFKA VERSION 0.19.0 LANGUAGES CXX) +project(CUDA_KAFKA VERSION 0.19.1 LANGUAGES CXX) ################################################################################################### # - Build options diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index bd01cdcd4d8..13bff316bed 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -79,7 +79,7 @@ # The short X.Y version. version = '0.19' # The full version, including alpha/beta/rc tags. -release = '0.19.0' +release = '0.19.1' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. From ab3b3f653ac9d94579f469bcf0dbec06bca07f2a Mon Sep 17 00:00:00 2001 From: gpuCI <38199262+GPUtester@users.noreply.github.com> Date: Wed, 28 Apr 2021 11:30:28 -0700 Subject: [PATCH 14/34] REL v0.19.2 release --- cpp/doxygen/Doxyfile | 2 +- cpp/libcudf_kafka/CMakeLists.txt | 2 +- docs/cudf/source/conf.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/doxygen/Doxyfile b/cpp/doxygen/Doxyfile index 24d644081ff..067205d3b1e 100644 --- a/cpp/doxygen/Doxyfile +++ b/cpp/doxygen/Doxyfile @@ -38,7 +38,7 @@ PROJECT_NAME = "libcudf" # could be handy for archiving the generated documentation or if some version # control system is used. 
-PROJECT_NUMBER = 0.19.1 +PROJECT_NUMBER = 0.19.2 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a diff --git a/cpp/libcudf_kafka/CMakeLists.txt b/cpp/libcudf_kafka/CMakeLists.txt index 86f6e153662..6bdf828685f 100644 --- a/cpp/libcudf_kafka/CMakeLists.txt +++ b/cpp/libcudf_kafka/CMakeLists.txt @@ -15,7 +15,7 @@ #============================================================================= cmake_minimum_required(VERSION 3.18 FATAL_ERROR) -project(CUDA_KAFKA VERSION 0.19.1 LANGUAGES CXX) +project(CUDA_KAFKA VERSION 0.19.2 LANGUAGES CXX) ################################################################################################### # - Build options diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index 13bff316bed..5b91a411872 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -79,7 +79,7 @@ # The short X.Y version. version = '0.19' # The full version, including alpha/beta/rc tags. -release = '0.19.1' +release = '0.19.2' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. 
From ae440465c2d8c0b299261923083fa1fe16b2f038 Mon Sep 17 00:00:00 2001 From: gpuCI <38199262+GPUtester@users.noreply.github.com> Date: Wed, 9 Jun 2021 10:23:02 -0700 Subject: [PATCH 15/34] REL v21.06.00 release From cddc64f1694083fe0daf0d9bedc5e4f91c545d08 Mon Sep 17 00:00:00 2001 From: gpuCI <38199262+GPUtester@users.noreply.github.com> Date: Thu, 17 Jun 2021 08:12:19 -0700 Subject: [PATCH 16/34] REL v21.06.01 release --- cpp/CMakeLists.txt | 2 +- cpp/doxygen/Doxyfile | 2 +- cpp/libcudf_kafka/CMakeLists.txt | 2 +- docs/cudf/source/conf.py | 2 +- java/src/main/native/CMakeLists.txt | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index b961080d162..65e526ef75b 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -28,7 +28,7 @@ elseif(CMAKE_CUDA_ARCHITECTURES STREQUAL "") set(CUDF_BUILD_FOR_DETECTED_ARCHS TRUE) endif() -project(CUDF VERSION 21.06.00 LANGUAGES C CXX) +project(CUDF VERSION 21.06.01 LANGUAGES C CXX) # Needed because GoogleBenchmark changes the state of FindThreads.cmake, # causing subsequent runs to have different values for the `Threads::Threads` target. diff --git a/cpp/doxygen/Doxyfile b/cpp/doxygen/Doxyfile index d359fe59c1a..4898b25e746 100644 --- a/cpp/doxygen/Doxyfile +++ b/cpp/doxygen/Doxyfile @@ -38,7 +38,7 @@ PROJECT_NAME = "libcudf" # could be handy for archiving the generated documentation or if some version # control system is used. 
-PROJECT_NUMBER = 21.06.00 +PROJECT_NUMBER = 21.06.01 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a diff --git a/cpp/libcudf_kafka/CMakeLists.txt b/cpp/libcudf_kafka/CMakeLists.txt index d6b69e0bf73..52cbe774ddb 100644 --- a/cpp/libcudf_kafka/CMakeLists.txt +++ b/cpp/libcudf_kafka/CMakeLists.txt @@ -15,7 +15,7 @@ #============================================================================= cmake_minimum_required(VERSION 3.18 FATAL_ERROR) -project(CUDA_KAFKA VERSION 21.06.00 LANGUAGES CXX) +project(CUDA_KAFKA VERSION 21.06.01 LANGUAGES CXX) ################################################################################################### # - Build options diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index c9d4441efae..10ac027c2b8 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -79,7 +79,7 @@ # The short X.Y version. version = '21.06' # The full version, including alpha/beta/rc tags. -release = '21.06.00' +release = '21.06.01' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. 
diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt index b8c9241d756..6b6fef7784f 100755 --- a/java/src/main/native/CMakeLists.txt +++ b/java/src/main/native/CMakeLists.txt @@ -32,7 +32,7 @@ elseif(CMAKE_CUDA_ARCHITECTURES STREQUAL "") set(CUDF_JNI_BUILD_FOR_DETECTED_ARCHS TRUE) endif() -project(CUDF_JNI VERSION 21.06.00 LANGUAGES C CXX) +project(CUDF_JNI VERSION 21.06.01 LANGUAGES C CXX) ################################################################################################### # - build options --------------------------------------------------------------------------------- From 106039c9c21cc48c7506ffadc0289de8d8b870cb Mon Sep 17 00:00:00 2001 From: gpuCI <38199262+GPUtester@users.noreply.github.com> Date: Wed, 4 Aug 2021 08:26:49 -0700 Subject: [PATCH 17/34] REL v21.08.00 release --- docs/cudf/source/conf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index 474e302528e..b5d9f093c98 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -76,9 +76,9 @@ # built documents. # # The short X.Y version. -version = "21.08" +version = '21.08' # The full version, including alpha/beta/rc tags. -release = "21.08.00" +release = '21.08.00' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. 
From e0a81141a8f14f0121f0034de71b5819767a6374 Mon Sep 17 00:00:00 2001 From: gpuCI <38199262+GPUtester@users.noreply.github.com> Date: Fri, 6 Aug 2021 07:56:12 -0700 Subject: [PATCH 18/34] REL v21.08.01 release --- cpp/CMakeLists.txt | 2 +- cpp/doxygen/Doxyfile | 2 +- cpp/libcudf_kafka/CMakeLists.txt | 2 +- docs/cudf/source/conf.py | 2 +- java/src/main/native/CMakeLists.txt | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index de3db75d97f..4cff8511494 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -28,7 +28,7 @@ elseif(CMAKE_CUDA_ARCHITECTURES STREQUAL "") set(CUDF_BUILD_FOR_DETECTED_ARCHS TRUE) endif() -project(CUDF VERSION 21.08.00 LANGUAGES C CXX) +project(CUDF VERSION 21.08.01 LANGUAGES C CXX) # Needed because GoogleBenchmark changes the state of FindThreads.cmake, # causing subsequent runs to have different values for the `Threads::Threads` target. diff --git a/cpp/doxygen/Doxyfile b/cpp/doxygen/Doxyfile index 2b6b16bd947..0fe833d3e96 100644 --- a/cpp/doxygen/Doxyfile +++ b/cpp/doxygen/Doxyfile @@ -38,7 +38,7 @@ PROJECT_NAME = "libcudf" # could be handy for archiving the generated documentation or if some version # control system is used. 
-PROJECT_NUMBER = 21.08.00 +PROJECT_NUMBER = 21.08.01 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a diff --git a/cpp/libcudf_kafka/CMakeLists.txt b/cpp/libcudf_kafka/CMakeLists.txt index 3f9343de1c6..17d8dbeef6b 100644 --- a/cpp/libcudf_kafka/CMakeLists.txt +++ b/cpp/libcudf_kafka/CMakeLists.txt @@ -15,7 +15,7 @@ #============================================================================= cmake_minimum_required(VERSION 3.18 FATAL_ERROR) -project(CUDA_KAFKA VERSION 21.08.00 LANGUAGES CXX) +project(CUDA_KAFKA VERSION 21.08.01 LANGUAGES CXX) ################################################################################################### # - Build options diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index b5d9f093c98..13d7e089f77 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -78,7 +78,7 @@ # The short X.Y version. version = '21.08' # The full version, including alpha/beta/rc tags. -release = '21.08.00' +release = '21.08.01' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. 
diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt index e779a74290d..9bc6cff5ee6 100755 --- a/java/src/main/native/CMakeLists.txt +++ b/java/src/main/native/CMakeLists.txt @@ -32,7 +32,7 @@ elseif(CMAKE_CUDA_ARCHITECTURES STREQUAL "") set(CUDF_JNI_BUILD_FOR_DETECTED_ARCHS TRUE) endif() -project(CUDF_JNI VERSION 21.08.00 LANGUAGES C CXX) +project(CUDF_JNI VERSION 21.08.01 LANGUAGES C CXX) ################################################################################################### # - build options --------------------------------------------------------------------------------- From f6d31fa95d9b8d8658301438d0f9ba22a1c131aa Mon Sep 17 00:00:00 2001 From: gpuCI <38199262+GPUtester@users.noreply.github.com> Date: Fri, 6 Aug 2021 13:26:20 -0700 Subject: [PATCH 19/34] REL v21.08.02 release --- cpp/CMakeLists.txt | 2 +- cpp/doxygen/Doxyfile | 2 +- cpp/libcudf_kafka/CMakeLists.txt | 2 +- docs/cudf/source/conf.py | 2 +- java/src/main/native/CMakeLists.txt | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index cffccd98896..aaacc76efe1 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -28,7 +28,7 @@ elseif(CMAKE_CUDA_ARCHITECTURES STREQUAL "") set(CUDF_BUILD_FOR_DETECTED_ARCHS TRUE) endif() -project(CUDF VERSION 21.08.01 LANGUAGES C CXX) +project(CUDF VERSION 21.08.02 LANGUAGES C CXX) # Needed because GoogleBenchmark changes the state of FindThreads.cmake, # causing subsequent runs to have different values for the `Threads::Threads` target. diff --git a/cpp/doxygen/Doxyfile b/cpp/doxygen/Doxyfile index 0fe833d3e96..4ddff58c573 100644 --- a/cpp/doxygen/Doxyfile +++ b/cpp/doxygen/Doxyfile @@ -38,7 +38,7 @@ PROJECT_NAME = "libcudf" # could be handy for archiving the generated documentation or if some version # control system is used. 
-PROJECT_NUMBER = 21.08.01 +PROJECT_NUMBER = 21.08.02 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a diff --git a/cpp/libcudf_kafka/CMakeLists.txt b/cpp/libcudf_kafka/CMakeLists.txt index 17d8dbeef6b..d526995a056 100644 --- a/cpp/libcudf_kafka/CMakeLists.txt +++ b/cpp/libcudf_kafka/CMakeLists.txt @@ -15,7 +15,7 @@ #============================================================================= cmake_minimum_required(VERSION 3.18 FATAL_ERROR) -project(CUDA_KAFKA VERSION 21.08.01 LANGUAGES CXX) +project(CUDA_KAFKA VERSION 21.08.02 LANGUAGES CXX) ################################################################################################### # - Build options diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index 13d7e089f77..97352b54f04 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -78,7 +78,7 @@ # The short X.Y version. version = '21.08' # The full version, including alpha/beta/rc tags. -release = '21.08.01' +release = '21.08.02' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. 
diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt index 9bc6cff5ee6..3e0a3f20482 100755 --- a/java/src/main/native/CMakeLists.txt +++ b/java/src/main/native/CMakeLists.txt @@ -32,7 +32,7 @@ elseif(CMAKE_CUDA_ARCHITECTURES STREQUAL "") set(CUDF_JNI_BUILD_FOR_DETECTED_ARCHS TRUE) endif() -project(CUDF_JNI VERSION 21.08.01 LANGUAGES C CXX) +project(CUDF_JNI VERSION 21.08.02 LANGUAGES C CXX) ################################################################################################### # - build options --------------------------------------------------------------------------------- From e4313b6a1e63f00408a0d64f6685b253cc3894ba Mon Sep 17 00:00:00 2001 From: gpuCI <38199262+GPUtester@users.noreply.github.com> Date: Thu, 16 Sep 2021 12:23:22 -0700 Subject: [PATCH 20/34] REL v21.08.03 release --- cpp/CMakeLists.txt | 2 +- cpp/doxygen/Doxyfile | 2 +- cpp/libcudf_kafka/CMakeLists.txt | 2 +- docs/cudf/source/conf.py | 2 +- java/src/main/native/CMakeLists.txt | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index aaacc76efe1..f88b48e946f 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -28,7 +28,7 @@ elseif(CMAKE_CUDA_ARCHITECTURES STREQUAL "") set(CUDF_BUILD_FOR_DETECTED_ARCHS TRUE) endif() -project(CUDF VERSION 21.08.02 LANGUAGES C CXX) +project(CUDF VERSION 21.08.03 LANGUAGES C CXX) # Needed because GoogleBenchmark changes the state of FindThreads.cmake, # causing subsequent runs to have different values for the `Threads::Threads` target. diff --git a/cpp/doxygen/Doxyfile b/cpp/doxygen/Doxyfile index 4ddff58c573..01510c8ea1c 100644 --- a/cpp/doxygen/Doxyfile +++ b/cpp/doxygen/Doxyfile @@ -38,7 +38,7 @@ PROJECT_NAME = "libcudf" # could be handy for archiving the generated documentation or if some version # control system is used. 
-PROJECT_NUMBER = 21.08.02 +PROJECT_NUMBER = 21.08.03 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a diff --git a/cpp/libcudf_kafka/CMakeLists.txt b/cpp/libcudf_kafka/CMakeLists.txt index d526995a056..aa3bf71970c 100644 --- a/cpp/libcudf_kafka/CMakeLists.txt +++ b/cpp/libcudf_kafka/CMakeLists.txt @@ -15,7 +15,7 @@ #============================================================================= cmake_minimum_required(VERSION 3.18 FATAL_ERROR) -project(CUDA_KAFKA VERSION 21.08.02 LANGUAGES CXX) +project(CUDA_KAFKA VERSION 21.08.03 LANGUAGES CXX) ################################################################################################### # - Build options diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index 97352b54f04..44bf094e01e 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -78,7 +78,7 @@ # The short X.Y version. version = '21.08' # The full version, including alpha/beta/rc tags. -release = '21.08.02' +release = '21.08.03' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. 
diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt index 3e0a3f20482..0b5a44b65d2 100755 --- a/java/src/main/native/CMakeLists.txt +++ b/java/src/main/native/CMakeLists.txt @@ -32,7 +32,7 @@ elseif(CMAKE_CUDA_ARCHITECTURES STREQUAL "") set(CUDF_JNI_BUILD_FOR_DETECTED_ARCHS TRUE) endif() -project(CUDF_JNI VERSION 21.08.02 LANGUAGES C CXX) +project(CUDF_JNI VERSION 21.08.03 LANGUAGES C CXX) ################################################################################################### # - build options --------------------------------------------------------------------------------- From 072fd862cc37c2b9204de04045209bef79478319 Mon Sep 17 00:00:00 2001 From: gpuCI <38199262+GPUtester@users.noreply.github.com> Date: Wed, 6 Oct 2021 08:51:32 -0700 Subject: [PATCH 21/34] REL v21.10.00 release From a1d2d13a14f4cb398e383e19a1828f2e6d78e5e2 Mon Sep 17 00:00:00 2001 From: gpuCI <38199262+GPUtester@users.noreply.github.com> Date: Tue, 12 Oct 2021 13:16:40 -0700 Subject: [PATCH 22/34] REL v21.10.01 release --- cpp/CMakeLists.txt | 2 +- cpp/doxygen/Doxyfile | 2 +- cpp/libcudf_kafka/CMakeLists.txt | 2 +- docs/cudf/source/conf.py | 2 +- java/src/main/native/CMakeLists.txt | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 1dd3348a9c9..b21890bf138 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -28,7 +28,7 @@ include(rapids-find) rapids_cuda_init_architectures(CUDF) -project(CUDF VERSION 21.10.00 LANGUAGES C CXX CUDA) +project(CUDF VERSION 21.10.01 LANGUAGES C CXX CUDA) # Needed because GoogleBenchmark changes the state of FindThreads.cmake, # causing subsequent runs to have different values for the `Threads::Threads` target. 
diff --git a/cpp/doxygen/Doxyfile b/cpp/doxygen/Doxyfile index 72524996a69..ddde3fd8dff 100644 --- a/cpp/doxygen/Doxyfile +++ b/cpp/doxygen/Doxyfile @@ -38,7 +38,7 @@ PROJECT_NAME = "libcudf" # could be handy for archiving the generated documentation or if some version # control system is used. -PROJECT_NUMBER = 21.10.00 +PROJECT_NUMBER = 21.10.01 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a diff --git a/cpp/libcudf_kafka/CMakeLists.txt b/cpp/libcudf_kafka/CMakeLists.txt index 020f5c76c10..c1c64553948 100644 --- a/cpp/libcudf_kafka/CMakeLists.txt +++ b/cpp/libcudf_kafka/CMakeLists.txt @@ -25,7 +25,7 @@ include(rapids-cuda) include(rapids-export) include(rapids-find) -project(CUDA_KAFKA VERSION 21.10.00 LANGUAGES CXX) +project(CUDA_KAFKA VERSION 21.10.01 LANGUAGES CXX) # Set a default build type if none was specified rapids_cmake_build_type(Release) diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index c5f1233d022..03f858ceb6e 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -82,7 +82,7 @@ # The short X.Y version. version = '21.10' # The full version, including alpha/beta/rc tags. -release = '21.10.00' +release = '21.10.01' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. 
diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt index fc74ee2a3a9..4b87199aee6 100755 --- a/java/src/main/native/CMakeLists.txt +++ b/java/src/main/native/CMakeLists.txt @@ -29,7 +29,7 @@ if(DEFINED GPU_ARCHS) endif() rapids_cuda_init_architectures(CUDF_JNI) -project(CUDF_JNI VERSION 21.10.00 LANGUAGES C CXX CUDA) +project(CUDF_JNI VERSION 21.10.01 LANGUAGES C CXX CUDA) ################################################################################################### # - build options --------------------------------------------------------------------------------- From f1ef2d2daf1a6053c8629f09956d52a5cf28c8f8 Mon Sep 17 00:00:00 2001 From: gpuCI <38199262+GPUtester@users.noreply.github.com> Date: Fri, 3 Dec 2021 19:16:58 +0000 Subject: [PATCH 23/34] REL v21.12.00 release From a0a0a3a317dc312da9b87f1ed5aa4a05d8f4218a Mon Sep 17 00:00:00 2001 From: gpuCI <38199262+GPUtester@users.noreply.github.com> Date: Thu, 9 Dec 2021 18:32:36 +0000 Subject: [PATCH 24/34] REL v21.12.01 release --- cpp/doxygen/Doxyfile | 2 +- docs/cudf/source/conf.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/doxygen/Doxyfile b/cpp/doxygen/Doxyfile index 1141f20e3b1..4ab649b5ce9 100644 --- a/cpp/doxygen/Doxyfile +++ b/cpp/doxygen/Doxyfile @@ -38,7 +38,7 @@ PROJECT_NAME = "libcudf" # could be handy for archiving the generated documentation or if some version # control system is used. -PROJECT_NUMBER = 21.12.00 +PROJECT_NUMBER = 21.12.01 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index 4a7d115ae3b..01e38307977 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -82,7 +82,7 @@ # The short X.Y version. version = '21.12' # The full version, including alpha/beta/rc tags. 
-release = '21.12.00' +release = '21.12.01' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. From 06540b9b3795931dccf0ba73b592af62c690086c Mon Sep 17 00:00:00 2001 From: gpuCI <38199262+GPUtester@users.noreply.github.com> Date: Thu, 16 Dec 2021 18:48:16 +0000 Subject: [PATCH 25/34] REL v21.12.02 release --- cpp/doxygen/Doxyfile | 2 +- docs/cudf/source/conf.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/doxygen/Doxyfile b/cpp/doxygen/Doxyfile index 4ab649b5ce9..d6e6feae2a7 100644 --- a/cpp/doxygen/Doxyfile +++ b/cpp/doxygen/Doxyfile @@ -38,7 +38,7 @@ PROJECT_NAME = "libcudf" # could be handy for archiving the generated documentation or if some version # control system is used. -PROJECT_NUMBER = 21.12.01 +PROJECT_NUMBER = 21.12.02 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index 01e38307977..b2f2e28bb7c 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -82,7 +82,7 @@ # The short X.Y version. version = '21.12' # The full version, including alpha/beta/rc tags. -release = '21.12.01' +release = '21.12.02' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. 
From 774d859fef2cb242dd8314d50b9c1e038468e266 Mon Sep 17 00:00:00 2001 From: gpuCI <38199262+GPUtester@users.noreply.github.com> Date: Wed, 2 Feb 2022 16:43:00 +0000 Subject: [PATCH 26/34] REL v22.02.00 release From 8bf0520170bc4528bbf5896a950930e92f1dad7b Mon Sep 17 00:00:00 2001 From: gpuCI <38199262+GPUtester@users.noreply.github.com> Date: Wed, 6 Apr 2022 15:19:43 +0000 Subject: [PATCH 27/34] REL v22.04.00 release From f92b0bb68edaad1c329cb1f0d47df54c11a8c6ee Mon Sep 17 00:00:00 2001 From: rjzamora Date: Thu, 21 Apr 2022 12:27:11 -0700 Subject: [PATCH 28/34] remove row_groups_per_part and clean up divisions and split_row_groups assumptions in parquet tests --- python/dask_cudf/dask_cudf/io/parquet.py | 11 --- .../dask_cudf/io/tests/test_parquet.py | 93 +++++++++++-------- 2 files changed, 56 insertions(+), 48 deletions(-) diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index 042759f68cf..81af4619397 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -1,5 +1,4 @@ # Copyright (c) 2019-2022, NVIDIA CORPORATION. -import warnings from contextlib import ExitStack from functools import partial from io import BufferedWriter, BytesIO, IOBase @@ -353,7 +352,6 @@ def read_parquet( path, columns=None, split_row_groups=None, - row_groups_per_part=None, **kwargs, ): """Read parquet files into a Dask DataFrame @@ -376,15 +374,6 @@ def read_parquet( if isinstance(columns, str): columns = [columns] - if row_groups_per_part: - warnings.warn( - "row_groups_per_part is deprecated. 
" - "Pass an integer value to split_row_groups instead.", - FutureWarning, - ) - if split_row_groups is None: - split_row_groups = row_groups_per_part - return dd.read_parquet( path, columns=columns, diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py index d9b8ee4595a..2139f45f512 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py @@ -36,42 +36,51 @@ ddf = dd.from_pandas(df, npartitions=npartitions) -@pytest.mark.parametrize("stats", [True, False]) -def test_roundtrip_from_dask(tmpdir, stats): +# Helper function to handle upcomming +# `gather_statistics` deprecation +def _divisions(setting): + return {"gather_statistics": setting} + + +@pytest.mark.parametrize("write_metadata_file", [True, False]) +@pytest.mark.parametrize("divisions", [True, False]) +def test_roundtrip_from_dask(tmpdir, divisions, write_metadata_file): tmpdir = str(tmpdir) - ddf.to_parquet(tmpdir, engine="pyarrow") + ddf.to_parquet( + tmpdir, write_metadata_file=write_metadata_file, engine="pyarrow" + ) files = sorted( (os.path.join(tmpdir, f) for f in os.listdir(tmpdir)), key=natural_sort_key, ) # Read list of parquet files - ddf2 = dask_cudf.read_parquet(files, gather_statistics=stats) - dd.assert_eq(ddf, ddf2, check_divisions=stats) + ddf2 = dask_cudf.read_parquet(files, **_divisions(divisions)) + dd.assert_eq(ddf, ddf2, check_divisions=divisions) # Specify columns=['x'] ddf2 = dask_cudf.read_parquet( - files, columns=["x"], gather_statistics=stats + files, columns=["x"], **_divisions(divisions) ) - dd.assert_eq(ddf[["x"]], ddf2, check_divisions=stats) + dd.assert_eq(ddf[["x"]], ddf2, check_divisions=divisions) # Specify columns='y' - ddf2 = dask_cudf.read_parquet(files, columns="y", gather_statistics=stats) - dd.assert_eq(ddf[["y"]], ddf2, check_divisions=stats) + ddf2 = dask_cudf.read_parquet(files, columns="y", **_divisions(divisions)) + 
dd.assert_eq(ddf[["y"]], ddf2, check_divisions=divisions) # Now include metadata - ddf2 = dask_cudf.read_parquet(tmpdir, gather_statistics=stats) - dd.assert_eq(ddf, ddf2, check_divisions=stats) + ddf2 = dask_cudf.read_parquet(tmpdir, **_divisions(divisions)) + dd.assert_eq(ddf, ddf2, check_divisions=divisions) # Specify columns=['x'] (with metadata) ddf2 = dask_cudf.read_parquet( - tmpdir, columns=["x"], gather_statistics=stats + tmpdir, columns=["x"], **_divisions(divisions) ) - dd.assert_eq(ddf[["x"]], ddf2, check_divisions=stats) + dd.assert_eq(ddf[["x"]], ddf2, check_divisions=divisions) # Specify columns='y' (with metadata) - ddf2 = dask_cudf.read_parquet(tmpdir, columns="y", gather_statistics=stats) - dd.assert_eq(ddf[["y"]], ddf2, check_divisions=stats) + ddf2 = dask_cudf.read_parquet(tmpdir, columns="y", **_divisions(divisions)) + dd.assert_eq(ddf[["y"]], ddf2, check_divisions=divisions) def test_roundtrip_from_dask_index_false(tmpdir): @@ -99,8 +108,8 @@ def test_roundtrip_from_dask_cudf(tmpdir, write_meta): gddf = dask_cudf.from_dask_dataframe(ddf) gddf.to_parquet(tmpdir, write_metadata_file=write_meta) - gddf2 = dask_cudf.read_parquet(tmpdir) - dd.assert_eq(gddf, gddf2, check_divisions=write_meta) + gddf2 = dask_cudf.read_parquet(tmpdir, **_divisions(True)) + dd.assert_eq(gddf, gddf2) def test_roundtrip_none_rangeindex(tmpdir): @@ -161,21 +170,21 @@ def test_dask_timeseries_from_pandas(tmpdir): @pytest.mark.parametrize("index", [False, None]) -@pytest.mark.parametrize("stats", [False, True]) -def test_dask_timeseries_from_dask(tmpdir, index, stats): +@pytest.mark.parametrize("divisions", [False, True]) +def test_dask_timeseries_from_dask(tmpdir, index, divisions): fn = str(tmpdir) ddf2 = dask.datasets.timeseries(freq="D") ddf2.to_parquet(fn, engine="pyarrow", write_index=index) - read_df = dask_cudf.read_parquet(fn, index=index, gather_statistics=stats) + read_df = dask_cudf.read_parquet(fn, index=index, **_divisions(divisions)) dd.assert_eq( - ddf2, 
read_df, check_divisions=(stats and index), check_index=index + ddf2, read_df, check_divisions=(divisions and index), check_index=index ) @pytest.mark.parametrize("index", [False, None]) -@pytest.mark.parametrize("stats", [False, True]) -def test_dask_timeseries_from_daskcudf(tmpdir, index, stats): +@pytest.mark.parametrize("divisions", [False, True]) +def test_dask_timeseries_from_daskcudf(tmpdir, index, divisions): fn = str(tmpdir) ddf2 = dask_cudf.from_cudf( @@ -183,9 +192,9 @@ def test_dask_timeseries_from_daskcudf(tmpdir, index, stats): ) ddf2.name = ddf2.name.astype("object") ddf2.to_parquet(fn, write_index=index) - read_df = dask_cudf.read_parquet(fn, index=index, gather_statistics=stats) + read_df = dask_cudf.read_parquet(fn, index=index, **_divisions(divisions)) dd.assert_eq( - ddf2, read_df, check_divisions=(stats and index), check_index=index + ddf2, read_df, check_divisions=(divisions and index), check_index=index ) @@ -212,17 +221,23 @@ def test_filters(tmpdir): ddf.to_parquet(tmp_path, engine="pyarrow") - a = dask_cudf.read_parquet(tmp_path, filters=[("x", ">", 4)]) + a = dask_cudf.read_parquet( + tmp_path, filters=[("x", ">", 4)], split_row_groups=True + ) assert a.npartitions == 3 assert (a.x > 3).all().compute() - b = dask_cudf.read_parquet(tmp_path, filters=[("y", "==", "c")]) + b = dask_cudf.read_parquet( + tmp_path, filters=[("y", "==", "c")], split_row_groups=True + ) assert b.npartitions == 1 b = b.compute().to_pandas() assert (b.y == "c").all() c = dask_cudf.read_parquet( - tmp_path, filters=[("y", "==", "c"), ("x", ">", 6)] + tmp_path, + filters=[("y", "==", "c"), ("x", ">", 6)], + split_row_groups=True, ) assert c.npartitions <= 1 assert not len(c) @@ -237,13 +252,17 @@ def test_filters_at_row_group_level(tmpdir): ddf.to_parquet(tmp_path, engine="pyarrow", row_group_size=10 / 5) - a = dask_cudf.read_parquet(tmp_path, filters=[("x", "==", 1)]) + a = dask_cudf.read_parquet( + tmp_path, filters=[("x", "==", 1)], split_row_groups=True + ) 
assert a.npartitions == 1 assert (a.shape[0] == 2).compute() ddf.to_parquet(tmp_path, engine="pyarrow", row_group_size=1) - b = dask_cudf.read_parquet(tmp_path, filters=[("x", "==", 1)]) + b = dask_cudf.read_parquet( + tmp_path, filters=[("x", "==", 1)], split_row_groups=True + ) assert b.npartitions == 1 assert (b.shape[0] == 1).compute() @@ -341,7 +360,7 @@ def test_chunksize(tmpdir, chunksize, metadata): path, chunksize=chunksize, split_row_groups=True, - gather_statistics=True, + **_divisions(True), ) ddf2.compute(scheduler="synchronous") @@ -360,8 +379,8 @@ def test_chunksize(tmpdir, chunksize, metadata): path, chunksize=chunksize, split_row_groups=True, - gather_statistics=True, aggregate_files=True, + **_divisions(True), ) dd.assert_eq(ddf1, ddf3, check_divisions=False) @@ -382,7 +401,7 @@ def test_chunksize(tmpdir, chunksize, metadata): @pytest.mark.parametrize("row_groups", [1, 3, 10, 12]) @pytest.mark.parametrize("index", [False, True]) -def test_row_groups_per_part(tmpdir, row_groups, index): +def test_split_row_groups(tmpdir, row_groups, index): nparts = 2 df_size = 100 row_group_size = 5 @@ -410,7 +429,7 @@ def test_row_groups_per_part(tmpdir, row_groups, index): ddf2 = dask_cudf.read_parquet( str(tmpdir), - row_groups_per_part=row_groups, + split_row_groups=row_groups, ) dd.assert_eq(ddf1, ddf2, check_divisions=False) @@ -448,9 +467,9 @@ def test_create_metadata_file(tmpdir, partition_on): # with the _metadata file present ddf2 = dask_cudf.read_parquet( tmpdir, - gather_statistics=True, split_row_groups=False, index="myindex", + **_divisions(True), ) if partition_on: ddf1 = df1.sort_values("b") @@ -481,7 +500,7 @@ def test_create_metadata_file_inconsistent_schema(tmpdir): # New pyarrow-dataset base can handle an inconsistent # schema (even without a _metadata file), but computing # and dtype validation may fail - ddf1 = dask_cudf.read_parquet(str(tmpdir), gather_statistics=True) + ddf1 = dask_cudf.read_parquet(str(tmpdir), **_divisions(True)) # Add 
global metadata file. # Dask-CuDF can do this without requiring schema @@ -490,7 +509,7 @@ def test_create_metadata_file_inconsistent_schema(tmpdir): # Check that we can still read the ddf # with the _metadata file present - ddf2 = dask_cudf.read_parquet(str(tmpdir), gather_statistics=True) + ddf2 = dask_cudf.read_parquet(str(tmpdir), **_divisions(True)) # Check that the result is the same with and # without the _metadata file. Note that we must From 8b4e11c71b5fb08aa9d4de5df35556cbbde73ff7 Mon Sep 17 00:00:00 2001 From: rjzamora Date: Thu, 21 Apr 2022 12:40:21 -0700 Subject: [PATCH 29/34] clarify comment --- python/dask_cudf/dask_cudf/io/tests/test_parquet.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py index 2139f45f512..b8b134518a9 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py @@ -36,8 +36,12 @@ ddf = dd.from_pandas(df, npartitions=npartitions) -# Helper function to handle upcomming -# `gather_statistics` deprecation +# Helper function to make it easier to handle the +# upcoming deprecation of `gather_statistics`. 
+# See: https://github.com/dask/dask/issues/8937 +# TODO: This function should be used to switch to +# the "new" `calculate_divisions` kwarg (for newer +# Dask versions) once it is introduced def _divisions(setting): return {"gather_statistics": setting} From bd5b692c29d92b0ff1d72d602e33f52adaf873a6 Mon Sep 17 00:00:00 2001 From: rjzamora Date: Fri, 22 Apr 2022 12:07:49 -0700 Subject: [PATCH 30/34] add file-size and memory checks to better-inform the user of split_row_groups usage --- python/dask_cudf/dask_cudf/io/parquet.py | 175 +++++++++++------- .../dask_cudf/io/tests/test_parquet.py | 9 + 2 files changed, 117 insertions(+), 67 deletions(-) diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index 81af4619397..1dc40286e27 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -1,4 +1,5 @@ # Copyright (c) 2019-2022, NVIDIA CORPORATION. +import warnings from contextlib import ExitStack from functools import partial from io import BufferedWriter, BytesIO, IOBase @@ -176,65 +177,99 @@ def read_partition( strings_to_cats = kwargs.get("strings_to_categorical", False) read_kwargs = kwargs.get("read", {}) read_kwargs.update(open_file_options or {}) - - # Assume multi-piece read - paths = [] - rgs = [] - last_partition_keys = None - dfs = [] - - for i, piece in enumerate(pieces): - - (path, row_group, partition_keys) = piece - row_group = None if row_group == [None] else row_group - - if i > 0 and partition_keys != last_partition_keys: - dfs.append( - cls._read_paths( - paths, - fs, - columns=read_columns, - row_groups=rgs if rgs else None, - strings_to_categorical=strings_to_cats, - partitions=partitions, - partitioning=partitioning, - partition_keys=last_partition_keys, - **read_kwargs, + check_file_size = read_kwargs.pop("check_file_size", None) + + # Wrap reading logic in a `try` block so that we can + # inform the user that the `read_parquet` partition + # size is too 
large for the available memory + try: + + # Assume multi-piece read + paths = [] + rgs = [] + last_partition_keys = None + dfs = [] + + for i, piece in enumerate(pieces): + + (path, row_group, partition_keys) = piece + row_group = None if row_group == [None] else row_group + + # File-size check to help "protect" users from change + # to up-stream `split_row_groups` default. We only + # check the file size if this partition corresponds + # to a full file, and `check_file_size` is defined + if check_file_size and len(pieces) == 1 and row_group is None: + file_size = fs.size(path) + if file_size > check_file_size: + warnings.warn( + f"A large parquet file ({file_size}B) is being " + f"used to create a DataFrame partition in " + f"read_parquet. Did you mean to use the " + f"split_row_groups argument?" + ) + + if i > 0 and partition_keys != last_partition_keys: + dfs.append( + cls._read_paths( + paths, + fs, + columns=read_columns, + row_groups=rgs if rgs else None, + strings_to_categorical=strings_to_cats, + partitions=partitions, + partitioning=partitioning, + partition_keys=last_partition_keys, + **read_kwargs, + ) ) + paths = rgs = [] + last_partition_keys = None + paths.append(path) + rgs.append( + [row_group] + if not isinstance(row_group, list) + and row_group is not None + else row_group ) - paths = rgs = [] - last_partition_keys = None - paths.append(path) - rgs.append( - [row_group] - if not isinstance(row_group, list) and row_group is not None - else row_group - ) - last_partition_keys = partition_keys + last_partition_keys = partition_keys - dfs.append( - cls._read_paths( - paths, - fs, - columns=read_columns, - row_groups=rgs if rgs else None, - strings_to_categorical=strings_to_cats, - partitions=partitions, - partitioning=partitioning, - partition_keys=last_partition_keys, - **read_kwargs, + dfs.append( + cls._read_paths( + paths, + fs, + columns=read_columns, + row_groups=rgs if rgs else None, + strings_to_categorical=strings_to_cats, + 
partitions=partitions, + partitioning=partitioning, + partition_keys=last_partition_keys, + **read_kwargs, + ) ) - ) - df = cudf.concat(dfs) if len(dfs) > 1 else dfs[0] - - # Re-set "object" dtypes align with pa schema - set_object_dtypes_from_pa_schema(df, schema) + df = cudf.concat(dfs) if len(dfs) > 1 else dfs[0] - if index and (index[0] in df.columns): - df = df.set_index(index[0]) - elif index is False and df.index.names != (None,): - # If index=False, we shouldn't have a named index - df.reset_index(inplace=True) + # Re-set "object" dtypes align with pa schema + set_object_dtypes_from_pa_schema(df, schema) + + if index and (index[0] in df.columns): + df = df.set_index(index[0]) + elif index is False and df.index.names != (None,): + # If index=False, we shouldn't have a named index + df.reset_index(inplace=True) + + except MemoryError as err: + raise MemoryError( + "Parquet data was larger than the available GPU memory!\n\n" + "Please try `split_row_groups=True` or set this option " + "to a smaller integer (if applicable).\n\n" + "If you are using dask-cuda workers, this may indicate " + "that the current `device_memory_limit` is too high. 
" + "If you are not using dask-cuda workers, this may indicate " + "that your workflow requires dask-cuda spilling.\n\n" + "Original Error: " + str(err) + ) + raise err return df @@ -348,12 +383,7 @@ def set_object_dtypes_from_pa_schema(df, schema): df._data[col_name] = col.astype(typ) -def read_parquet( - path, - columns=None, - split_row_groups=None, - **kwargs, -): +def read_parquet(path, columns=None, **kwargs): """Read parquet files into a Dask DataFrame Calls ``dask.dataframe.read_parquet`` to cordinate the execution of @@ -374,13 +404,24 @@ def read_parquet( if isinstance(columns, str): columns = [columns] - return dd.read_parquet( - path, - columns=columns, - split_row_groups=split_row_groups, - engine=CudfEngine, - **kwargs, - ) + # Set "check_file_size" option to determine whether we + # should check the parquet-file size. This check is meant + # to "protect" users from `split_row_groups` default changes + check_file_size = kwargs.pop("check_file_size", 2_000_000_000) + if ( + check_file_size + and ("split_row_groups" not in kwargs) + and ("chunksize" not in kwargs) + ): + # User is not specifying `split_row_groups` or `chunksize`, + # so we should warn them if/when a file is ~>2GB on disk. 
+ # The will be able to set `split_row_groups` explicitly to + # silence/skip this check + if "read" not in kwargs: + kwargs["read"] = {} + kwargs["read"]["check_file_size"] = check_file_size + + return dd.read_parquet(path, columns=columns, engine=CudfEngine, **kwargs) to_parquet = partial(dd.to_parquet, engine=CudfEngine) diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py index b8b134518a9..ef5741b0539 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py @@ -561,3 +561,12 @@ def test_cudf_list_struct_write(tmpdir): ddf.to_parquet(temp_file) new_ddf = dask_cudf.read_parquet(temp_file) dd.assert_eq(df, new_ddf) + + +def test_check_file_size(tmpdir): + # Test simple file-size check to help warn users + # of upstream change to `split_row_groups` default + fn = str(tmpdir.join("test.parquet")) + cudf.DataFrame({"a": np.arange(1000)}).to_parquet(fn) + with pytest.warns(match="large parquet file"): + dask_cudf.read_parquet(fn, check_file_size=1).compute() From d0423aae3ba62189181c021ad02a2122c5847183 Mon Sep 17 00:00:00 2001 From: rjzamora Date: Tue, 26 Apr 2022 14:24:57 -0700 Subject: [PATCH 31/34] simplify error and warning messages --- CHANGELOG.md | 4 ---- python/dask_cudf/dask_cudf/io/parquet.py | 18 ++++++++---------- 2 files changed, 8 insertions(+), 14 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5e9ec64016c..ede06e6df70 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1408,7 +1408,6 @@ Please see https://github.com/rapidsai/cudf/releases/tag/v22.06.00a for the late - Fixing empty null lists throwing explode_outer for a loop. 
([#7649](https://github.com/rapidsai/cudf/pull/7649)) [@hyperbolic2346](https://github.com/hyperbolic2346) - Fix internal compiler error during JNI Docker build ([#7645](https://github.com/rapidsai/cudf/pull/7645)) [@jlowe](https://github.com/jlowe) - Fix Debug build break with device_uvectors in grouped_rolling.cu ([#7633](https://github.com/rapidsai/cudf/pull/7633)) [@mythrocks](https://github.com/mythrocks) -- Parquet reader: Fix issue when using skip_rows on non-nested columns containing nulls ([#7627](https://github.com/rapidsai/cudf/pull/7627)) [@nvdbaranec](https://github.com/nvdbaranec) - Parquet reader: Fix issue when using skip_rows on non-nested columns containing nulls ([#7627](https://github.com/rapidsai/cudf/pull/7627)) [@nvdbaranec](https://github.com/nvdbaranec) - Fix ORC reader for empty DataFrame/Table ([#7624](https://github.com/rapidsai/cudf/pull/7624)) [@rgsl888prabhu](https://github.com/rgsl888prabhu) - Fix specifying GPU architecture in JNI build ([#7612](https://github.com/rapidsai/cudf/pull/7612)) [@jlowe](https://github.com/jlowe) @@ -1508,7 +1507,6 @@ Please see https://github.com/rapidsai/cudf/releases/tag/v22.06.00a for the late - Add groupby scan operations (sort groupby) ([#7387](https://github.com/rapidsai/cudf/pull/7387)) [@karthikeyann](https://github.com/karthikeyann) - Add cudf::explode_position ([#7376](https://github.com/rapidsai/cudf/pull/7376)) [@hyperbolic2346](https://github.com/hyperbolic2346) - Add string conversion to/from decimal values libcudf APIs ([#7364](https://github.com/rapidsai/cudf/pull/7364)) [@davidwendt](https://github.com/davidwendt) -- Add groupby SUM_OF_SQUARES support ([#7362](https://github.com/rapidsai/cudf/pull/7362)) [@karthikeyann](https://github.com/karthikeyann) - Add groupby SUM_OF_SQUARES support ([#7362](https://github.com/rapidsai/cudf/pull/7362)) [@karthikeyann](https://github.com/karthikeyann) - Add `Series.drop` api ([#7304](https://github.com/rapidsai/cudf/pull/7304)) 
[@isVoid](https://github.com/isVoid) - get_json_object() implementation ([#7286](https://github.com/rapidsai/cudf/pull/7286)) [@nvdbaranec](https://github.com/nvdbaranec) @@ -1517,7 +1515,6 @@ Please see https://github.com/rapidsai/cudf/releases/tag/v22.06.00a for the late - Add support for special tokens in nvtext::subword_tokenizer ([#7254](https://github.com/rapidsai/cudf/pull/7254)) [@davidwendt](https://github.com/davidwendt) - Fix inplace update of data and add Series.update ([#7201](https://github.com/rapidsai/cudf/pull/7201)) [@galipremsagar](https://github.com/galipremsagar) - Implement `cudf::group_by` (hash) for `decimal32` and `decimal64` ([#7190](https://github.com/rapidsai/cudf/pull/7190)) [@codereport](https://github.com/codereport) -- Adding support to specify "level" parameter for `Dataframe.rename` ([#7135](https://github.com/rapidsai/cudf/pull/7135)) [@skirui-source](https://github.com/skirui-source) - Adding support to specify "level" parameter for `Dataframe.rename` ([#7135](https://github.com/rapidsai/cudf/pull/7135)) [@skirui-source](https://github.com/skirui-source) ## 🛠️ Improvements @@ -1658,7 +1655,6 @@ Please see https://github.com/rapidsai/cudf/releases/tag/v22.06.00a for the late - Adding Interval Dtype ([#6984](https://github.com/rapidsai/cudf/pull/6984)) [@marlenezw](https://github.com/marlenezw) - Cleaning up `for` loops with `make_(counting_)transform_iterator` ([#6546](https://github.com/rapidsai/cudf/pull/6546)) [@codereport](https://github.com/codereport) - # cuDF 0.18.0 (24 Feb 2021) ## Breaking Changes 🚨 diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index 1dc40286e27..3ada269a7a3 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -205,8 +205,10 @@ def read_partition( warnings.warn( f"A large parquet file ({file_size}B) is being " f"used to create a DataFrame partition in " - f"read_parquet. 
Did you mean to use the " - f"split_row_groups argument?" + f"read_parquet. This may cause out of memory " + f"exceptions in operations downstream. See the " + f"split_row_groups argument in the read_parquet " + f"documentation." ) if i > 0 and partition_keys != last_partition_keys: @@ -261,12 +263,8 @@ def read_partition( except MemoryError as err: raise MemoryError( "Parquet data was larger than the available GPU memory!\n\n" - "Please try `split_row_groups=True` or set this option " - "to a smaller integer (if applicable).\n\n" - "If you are using dask-cuda workers, this may indicate " - "that the current `device_memory_limit` is too high. " - "If you are not using dask-cuda workers, this may indicate " - "that your workflow requires dask-cuda spilling.\n\n" + "See the split_row_groups argument in the read_parquet " + "documentation.\n\n" "Original Error: " + str(err) ) raise err @@ -407,14 +405,14 @@ def read_parquet(path, columns=None, **kwargs): # Set "check_file_size" option to determine whether we # should check the parquet-file size. This check is meant # to "protect" users from `split_row_groups` default changes - check_file_size = kwargs.pop("check_file_size", 2_000_000_000) + check_file_size = kwargs.pop("check_file_size", 500_000_000) if ( check_file_size and ("split_row_groups" not in kwargs) and ("chunksize" not in kwargs) ): # User is not specifying `split_row_groups` or `chunksize`, - # so we should warn them if/when a file is ~>2GB on disk. + # so we should warn them if/when a file is ~>0.5GB on disk. 
# The will be able to set `split_row_groups` explicitly to # silence/skip this check if "read" not in kwargs: From 3798cf30559814d9d5d0eb11f09396ea5a79c35f Mon Sep 17 00:00:00 2001 From: rjzamora Date: Tue, 26 Apr 2022 14:54:22 -0700 Subject: [PATCH 32/34] improve docstring for read_parquet to discuss split_row_groups argument --- python/dask_cudf/dask_cudf/io/parquet.py | 29 ++++++++++++++++++------ 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index 3ada269a7a3..8657f55294c 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -384,16 +384,31 @@ def set_object_dtypes_from_pa_schema(df, schema): def read_parquet(path, columns=None, **kwargs): """Read parquet files into a Dask DataFrame - Calls ``dask.dataframe.read_parquet`` to cordinate the execution of - ``cudf.read_parquet``, and ultimately read multiple partitions into - a single Dask dataframe. The Dask version must supply an - ``ArrowDatasetEngine`` class to support full functionality. - See ``cudf.read_parquet`` and Dask documentation for further details. + Calls ``dask.dataframe.read_parquet`` with ``engine=CudfEngine`` + to cordinate the execution of ``cudf.read_parquet``, and to + ultimately create a ``dask_cudf.DataFrame`` collection. + + See the ``dask.dataframe.read_parquet`` documentation for + all available options. Examples -------- - >>> import dask_cudf - >>> df = dask_cudf.read_parquet("/path/to/dataset/") # doctest: +SKIP + >>> from dask_cudf import read_parquet + >>> df = read_parquet("/path/to/dataset/") # doctest: +SKIP + + When dealing with one or more large parquet files having an + in-memory footprint >15% device memory, the ``split_row_groups`` + argument should be used to map Parquet **row-groups** to DataFrame + partitions (instead of **files** to partitions). 
For example, the + following code will map each row-group to a distinct partition: + + >>> df = read_parquet(..., split_row_groups=True) # doctest: +SKIP + + To map **multiple** row-groups to each partition, an integer can be + passed to ``split_row_groups`` to specify the **maximum** number of + row-groups allowed in each output partition: + + >>> df = read_parquet(..., split_row_groups=10) # doctest: +SKIP See Also -------- From d6eb7a613fef2c86cfd5594798471a299e64df6e Mon Sep 17 00:00:00 2001 From: rjzamora Date: Tue, 26 Apr 2022 15:05:19 -0700 Subject: [PATCH 33/34] another tweak --- python/dask_cudf/dask_cudf/io/parquet.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index 8657f55294c..5f9c82096c1 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -207,7 +207,7 @@ def read_partition( f"used to create a DataFrame partition in " f"read_parquet. This may cause out of memory " f"exceptions in operations downstream. See the " - f"split_row_groups argument in the read_parquet " + f"notes on split_row_groups in the read_parquet " f"documentation." 
) @@ -263,7 +263,7 @@ def read_partition( except MemoryError as err: raise MemoryError( "Parquet data was larger than the available GPU memory!\n\n" - "See the split_row_groups argument in the read_parquet " + "See the notes on split_row_groups in the read_parquet " "documentation.\n\n" "Original Error: " + str(err) ) From 3791b19daa45d6b3da9c3df711f2076497a4a9eb Mon Sep 17 00:00:00 2001 From: rjzamora Date: Tue, 26 Apr 2022 15:13:16 -0700 Subject: [PATCH 34/34] inform the user that setting split_row_groups will silence the file-size check --- python/dask_cudf/dask_cudf/io/parquet.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index 5f9c82096c1..b201626becf 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -208,7 +208,8 @@ def read_partition( f"read_parquet. This may cause out of memory " f"exceptions in operations downstream. See the " f"notes on split_row_groups in the read_parquet " - f"documentation." + f"documentation. Setting split_row_groups " + f"explicitly will silence this warning." ) if i > 0 and partition_keys != last_partition_keys: @@ -428,8 +429,8 @@ def read_parquet(path, columns=None, **kwargs): ): # User is not specifying `split_row_groups` or `chunksize`, # so we should warn them if/when a file is ~>0.5GB on disk. - # The will be able to set `split_row_groups` explicitly to - # silence/skip this check + # They can set `split_row_groups` explicitly to silence/skip + # this check if "read" not in kwargs: kwargs["read"] = {} kwargs["read"]["check_file_size"] = check_file_size