diff --git a/CHANGELOG.md b/CHANGELOG.md index 316f1abde98..cc20728ca35 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,13 +2,325 @@ Please see https://github.com/rapidsai/cudf/releases/tag/v21.08.00a for the latest changes to this development branch. -# cuDF 0.20.0 (Date TBD) - -Please see https://github.com/rapidsai/cudf/releases/tag/v0.20.0a for the latest changes to this development branch. - -# cuDF 0.19.0 (Date TBD) - -Please see https://github.com/rapidsai/cudf/releases/tag/v0.19.0a for the latest changes to this development branch. +# cuDF 21.06.00 (Date TBD) + +Please see https://github.com/rapidsai/cudf/releases/tag/v21.06.00a for the latest changes to this development branch. + +# cuDF 0.19.0 (21 Apr 2021) + +## 🚨 Breaking Changes + +- Allow hash_partition to take a seed value ([#7771](https://github.com/rapidsai/cudf/pull/7771)) [@magnatelee](https://github.com/magnatelee) +- Allow merging index column with data column using keyword "on" ([#7736](https://github.com/rapidsai/cudf/pull/7736)) [@skirui-source](https://github.com/skirui-source) +- Change JNI API to avoid loading native dependencies when creating sort order classes. ([#7729](https://github.com/rapidsai/cudf/pull/7729)) [@revans2](https://github.com/revans2) +- Replace device_vector with device_uvector in null_mask ([#7715](https://github.com/rapidsai/cudf/pull/7715)) [@harrism](https://github.com/harrism) +- Don't identify decimals as strings. ([#7710](https://github.com/rapidsai/cudf/pull/7710)) [@vyasr](https://github.com/vyasr) +- Fix Java Parquet write after writer API changes ([#7655](https://github.com/rapidsai/cudf/pull/7655)) [@revans2](https://github.com/revans2) +- Convert cudf::concatenate APIs to use spans and device_uvector ([#7621](https://github.com/rapidsai/cudf/pull/7621)) [@harrism](https://github.com/harrism) +- Update missing docstring examples in python public APIs ([#7546](https://github.com/rapidsai/cudf/pull/7546)) [@galipremsagar](https://github.com/galipremsagar) +- Remove unneeded step parameter from strings::detail::copy_slice ([#7525](https://github.com/rapidsai/cudf/pull/7525)) [@davidwendt](https://github.com/davidwendt) +- Rename ARROW_STATIC_LIB because it conflicts with one in FindArrow.cmake ([#7518](https://github.com/rapidsai/cudf/pull/7518)) [@trxcllnt](https://github.com/trxcllnt) +- Match Pandas logic for comparing two objects with nulls ([#7490](https://github.com/rapidsai/cudf/pull/7490)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Add struct support to parquet writer ([#7461](https://github.com/rapidsai/cudf/pull/7461)) [@devavret](https://github.com/devavret) +- Join APIs that return gathermaps ([#7454](https://github.com/rapidsai/cudf/pull/7454)) [@shwina](https://github.com/shwina) +- `fixed_point` + `cudf::binary_operation` API Changes ([#7435](https://github.com/rapidsai/cudf/pull/7435)) [@codereport](https://github.com/codereport) +- Fix BUG: Exception when PYTHONOPTIMIZE=2 ([#7434](https://github.com/rapidsai/cudf/pull/7434)) [@skirui-source](https://github.com/skirui-source) +- Change nvtext::load_vocabulary_file to return a unique ptr ([#7424](https://github.com/rapidsai/cudf/pull/7424)) [@davidwendt](https://github.com/davidwendt) +- Refactor strings column
factories ([#7397](https://github.com/rapidsai/cudf/pull/7397)) [@harrism](https://github.com/harrism) +- Use CMAKE_CUDA_ARCHITECTURES ([#7391](https://github.com/rapidsai/cudf/pull/7391)) [@robertmaynard](https://github.com/robertmaynard) +- Upgrade pandas to 1.2 ([#7375](https://github.com/rapidsai/cudf/pull/7375)) [@galipremsagar](https://github.com/galipremsagar) +- Rename `logical_cast` to `bit_cast` and allow additional conversions ([#7373](https://github.com/rapidsai/cudf/pull/7373)) [@ttnghia](https://github.com/ttnghia) +- Rework libcudf CMakeLists.txt to export targets for CPM ([#7107](https://github.com/rapidsai/cudf/pull/7107)) [@trxcllnt](https://github.com/trxcllnt) + +## 🐛 Bug Fixes + +- Fix a `NameError` in meta dispatch API ([#7996](https://github.com/rapidsai/cudf/pull/7996)) [@galipremsagar](https://github.com/galipremsagar) +- Reindex in `DataFrame.__setitem__` ([#7957](https://github.com/rapidsai/cudf/pull/7957)) [@galipremsagar](https://github.com/galipremsagar) +- jitify direct-to-cubin compilation and caching. ([#7919](https://github.com/rapidsai/cudf/pull/7919)) [@cwharris](https://github.com/cwharris) +- Use dynamic cudart for nvcomp in java build ([#7896](https://github.com/rapidsai/cudf/pull/7896)) [@abellina](https://github.com/abellina) +- fix "incompatible redefinition" warnings ([#7894](https://github.com/rapidsai/cudf/pull/7894)) [@cwharris](https://github.com/cwharris) +- cudf consistently specifies the cuda runtime ([#7887](https://github.com/rapidsai/cudf/pull/7887)) [@robertmaynard](https://github.com/robertmaynard) +- disable verbose output for jitify_preprocess ([#7886](https://github.com/rapidsai/cudf/pull/7886)) [@cwharris](https://github.com/cwharris) +- CMake jit_preprocess_files function only runs when needed ([#7872](https://github.com/rapidsai/cudf/pull/7872)) [@robertmaynard](https://github.com/robertmaynard) +- Push DeviceScalar construction into cython for list.contains ([#7864](https://github.com/rapidsai/cudf/pull/7864)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- cudf now sets an install rpath of $ORIGIN ([#7863](https://github.com/rapidsai/cudf/pull/7863)) [@robertmaynard](https://github.com/robertmaynard) +- Don't install Thrust examples, tests, docs, and python files ([#7811](https://github.com/rapidsai/cudf/pull/7811)) [@robertmaynard](https://github.com/robertmaynard) +- Sort by index in groupby tests more consistently ([#7802](https://github.com/rapidsai/cudf/pull/7802)) [@shwina](https://github.com/shwina) +- Revert "Update conda recipes pinning of repo dependencies ([#7743](https://github.com/rapidsai/cudf/pull/7743))" ([#7793](https://github.com/rapidsai/cudf/pull/7793)) [@raydouglass](https://github.com/raydouglass) +- Add decimal column handling in copy_type_metadata ([#7788](https://github.com/rapidsai/cudf/pull/7788)) [@shwina](https://github.com/shwina) +- Add column names validation in parquet writer ([#7786](https://github.com/rapidsai/cudf/pull/7786)) [@galipremsagar](https://github.com/galipremsagar) +- Fix Java explode outer unit tests ([#7782](https://github.com/rapidsai/cudf/pull/7782)) [@jlowe](https://github.com/jlowe) +- Fix compiler warning about non-POD types passed through ellipsis
([#7781](https://github.com/rapidsai/cudf/pull/7781)) [@jrhemstad](https://github.com/jrhemstad) +- User resource fix for replace_nulls ([#7769](https://github.com/rapidsai/cudf/pull/7769)) [@magnatelee](https://github.com/magnatelee) +- Fix type dispatch for columnar replace_nulls ([#7768](https://github.com/rapidsai/cudf/pull/7768)) [@jlowe](https://github.com/jlowe) +- Add `ignore_order` parameter to dask-cudf concat dispatch ([#7765](https://github.com/rapidsai/cudf/pull/7765)) [@galipremsagar](https://github.com/galipremsagar) +- Fix slicing and arrow representations of decimal columns ([#7755](https://github.com/rapidsai/cudf/pull/7755)) [@vyasr](https://github.com/vyasr) +- Fixing issue with explode_outer position not nulling position entries of null rows ([#7754](https://github.com/rapidsai/cudf/pull/7754)) [@hyperbolic2346](https://github.com/hyperbolic2346) +- Implement scatter for struct columns ([#7752](https://github.com/rapidsai/cudf/pull/7752)) [@ttnghia](https://github.com/ttnghia) +- Fix data corruption in string columns ([#7746](https://github.com/rapidsai/cudf/pull/7746)) [@galipremsagar](https://github.com/galipremsagar) +- Fix string length in stripe dictionary building ([#7744](https://github.com/rapidsai/cudf/pull/7744)) [@kaatish](https://github.com/kaatish) +- Update conda recipes pinning of repo dependencies ([#7743](https://github.com/rapidsai/cudf/pull/7743)) [@mike-wendt](https://github.com/mike-wendt) +- Enable dask dispatch to cuDF's `is_categorical_dtype` for cuDF objects ([#7740](https://github.com/rapidsai/cudf/pull/7740)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Fix dictionary size computation in ORC writer ([#7737](https://github.com/rapidsai/cudf/pull/7737)) [@vuule](https://github.com/vuule) +- Fix `cudf::cast` overflow for `decimal64` to `int32_t` or smaller in certain cases ([#7733](https://github.com/rapidsai/cudf/pull/7733)) [@codereport](https://github.com/codereport) +- Change JNI API to avoid loading native dependencies when creating sort order classes. ([#7729](https://github.com/rapidsai/cudf/pull/7729)) [@revans2](https://github.com/revans2) +- Disable column_view data accessors for unsupported types ([#7725](https://github.com/rapidsai/cudf/pull/7725)) [@jrhemstad](https://github.com/jrhemstad) +- Materialize `RangeIndex` when `index=True` in parquet writer ([#7711](https://github.com/rapidsai/cudf/pull/7711)) [@galipremsagar](https://github.com/galipremsagar) +- Don't identify decimals as strings. 
([#7710](https://github.com/rapidsai/cudf/pull/7710)) [@vyasr](https://github.com/vyasr) +- Fix return type of `DataFrame.argsort` ([#7706](https://github.com/rapidsai/cudf/pull/7706)) [@galipremsagar](https://github.com/galipremsagar) +- Fix/correct cudf installed package requirements ([#7688](https://github.com/rapidsai/cudf/pull/7688)) [@robertmaynard](https://github.com/robertmaynard) +- Fix SparkMurmurHash3_32 hash inconsistencies with Apache Spark ([#7672](https://github.com/rapidsai/cudf/pull/7672)) [@jlowe](https://github.com/jlowe) +- Fix ORC reader issue with reading empty string columns ([#7656](https://github.com/rapidsai/cudf/pull/7656)) [@rgsl888prabhu](https://github.com/rgsl888prabhu) +- Fix Java Parquet write after writer API changes ([#7655](https://github.com/rapidsai/cudf/pull/7655)) [@revans2](https://github.com/revans2) +- Fixing empty null lists throwing explode_outer for a loop. ([#7649](https://github.com/rapidsai/cudf/pull/7649)) [@hyperbolic2346](https://github.com/hyperbolic2346) +- Fix internal compiler error during JNI Docker build ([#7645](https://github.com/rapidsai/cudf/pull/7645)) [@jlowe](https://github.com/jlowe) +- Fix Debug build break with device_uvectors in grouped_rolling.cu ([#7633](https://github.com/rapidsai/cudf/pull/7633)) [@mythrocks](https://github.com/mythrocks) +- Parquet reader: Fix issue when using skip_rows on non-nested columns containing nulls ([#7627](https://github.com/rapidsai/cudf/pull/7627)) [@nvdbaranec](https://github.com/nvdbaranec) +- Fix ORC reader for empty DataFrame/Table ([#7624](https://github.com/rapidsai/cudf/pull/7624)) [@rgsl888prabhu](https://github.com/rgsl888prabhu) +- Fix specifying GPU architecture in JNI build ([#7612](https://github.com/rapidsai/cudf/pull/7612)) [@jlowe](https://github.com/jlowe) +- Fix ORC writer OOM issue ([#7605](https://github.com/rapidsai/cudf/pull/7605)) [@vuule](https://github.com/vuule) +- Fix 0.18 --> 0.19 automerge ([#7589](https://github.com/rapidsai/cudf/pull/7589)) [@kkraus14](https://github.com/kkraus14) +- Fix ORC issue with incorrect timestamp nanosecond values ([#7581](https://github.com/rapidsai/cudf/pull/7581)) [@vuule](https://github.com/vuule) +- Fix missing Dask imports ([#7580](https://github.com/rapidsai/cudf/pull/7580)) [@kkraus14](https://github.com/kkraus14) +- CMAKE_CUDA_ARCHITECTURES doesn't change when build-system invokes cmake ([#7579](https://github.com/rapidsai/cudf/pull/7579)) [@robertmaynard](https://github.com/robertmaynard) +- Another fix for offsets_end() iterator in lists_column_view ([#7575](https://github.com/rapidsai/cudf/pull/7575)) [@ttnghia](https://github.com/ttnghia) +- Fix ORC writer output corruption with string columns ([#7565](https://github.com/rapidsai/cudf/pull/7565)) [@vuule](https://github.com/vuule) +- Fix cudf::lists::sort_lists failing for sliced column ([#7564](https://github.com/rapidsai/cudf/pull/7564)) [@ttnghia](https://github.com/ttnghia) +- FIX Fix Anaconda upload args ([#7558](https://github.com/rapidsai/cudf/pull/7558)) [@dillon-cullinan](https://github.com/dillon-cullinan) +- Fix index mismatch issue in equality related APIs ([#7555](https://github.com/rapidsai/cudf/pull/7555)) 
[@galipremsagar](https://github.com/galipremsagar) +- FIX Revert gpuci_conda_retry on conda file output locations ([#7552](https://github.com/rapidsai/cudf/pull/7552)) [@dillon-cullinan](https://github.com/dillon-cullinan) +- Fix offset_end iterator for lists_column_view, which was not correctl… ([#7551](https://github.com/rapidsai/cudf/pull/7551)) [@ttnghia](https://github.com/ttnghia) +- Fix no such file dlpack.h error when build libcudf ([#7549](https://github.com/rapidsai/cudf/pull/7549)) [@chenrui17](https://github.com/chenrui17) +- Update missing docstring examples in python public APIs ([#7546](https://github.com/rapidsai/cudf/pull/7546)) [@galipremsagar](https://github.com/galipremsagar) +- Decimal32 Build Fix ([#7544](https://github.com/rapidsai/cudf/pull/7544)) [@razajafri](https://github.com/razajafri) +- FIX Retry conda output location ([#7540](https://github.com/rapidsai/cudf/pull/7540)) [@dillon-cullinan](https://github.com/dillon-cullinan) +- fix missing renames of dask git branches from master to main ([#7535](https://github.com/rapidsai/cudf/pull/7535)) [@kkraus14](https://github.com/kkraus14) +- Remove detail from device_span ([#7533](https://github.com/rapidsai/cudf/pull/7533)) [@rwlee](https://github.com/rwlee) +- Change dask and distributed branch to main ([#7532](https://github.com/rapidsai/cudf/pull/7532)) [@dantegd](https://github.com/dantegd) +- Update JNI build to use CUDF_USE_ARROW_STATIC ([#7526](https://github.com/rapidsai/cudf/pull/7526)) [@jlowe](https://github.com/jlowe) +- Make sure rmm::rmm CMake target is visibile to cudf users ([#7524](https://github.com/rapidsai/cudf/pull/7524)) [@robertmaynard](https://github.com/robertmaynard) +- Fix contiguous_split not properly handling output partitions > 2 GB.
([#7515](https://github.com/rapidsai/cudf/pull/7515)) [@nvdbaranec](https://github.com/nvdbaranec) +- Change jit launch to safe_launch ([#7510](https://github.com/rapidsai/cudf/pull/7510)) [@devavret](https://github.com/devavret) +- Fix comparison between Datetime/Timedelta columns and NULL scalars ([#7504](https://github.com/rapidsai/cudf/pull/7504)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Fix off-by-one error in char-parallel string scalar replace ([#7502](https://github.com/rapidsai/cudf/pull/7502)) [@jlowe](https://github.com/jlowe) +- Fix JNI deprecation of all, put it on the wrong version before ([#7501](https://github.com/rapidsai/cudf/pull/7501)) [@revans2](https://github.com/revans2) +- Fix Series/Dataframe Mixed Arithmetic ([#7491](https://github.com/rapidsai/cudf/pull/7491)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Fix JNI build after removal of libcudf sub-libraries ([#7486](https://github.com/rapidsai/cudf/pull/7486)) [@jlowe](https://github.com/jlowe) +- Correctly compile benchmarks ([#7485](https://github.com/rapidsai/cudf/pull/7485)) [@robertmaynard](https://github.com/robertmaynard) +- Fix bool column corruption with ORC Reader ([#7483](https://github.com/rapidsai/cudf/pull/7483)) [@rgsl888prabhu](https://github.com/rgsl888prabhu) +- Fix `__repr__` for categorical dtype ([#7476](https://github.com/rapidsai/cudf/pull/7476)) [@galipremsagar](https://github.com/galipremsagar) +- Java cleaner synchronization ([#7474](https://github.com/rapidsai/cudf/pull/7474)) [@abellina](https://github.com/abellina) +- Fix java float/double parsing tests ([#7473](https://github.com/rapidsai/cudf/pull/7473)) [@revans2](https://github.com/revans2) +- Pass stream and user resource to make_default_constructed_scalar ([#7469](https://github.com/rapidsai/cudf/pull/7469)) [@magnatelee](https://github.com/magnatelee) +- Improve stability of dask_cudf.DataFrame.var and dask_cudf.DataFrame.std ([#7453](https://github.com/rapidsai/cudf/pull/7453)) [@rjzamora](https://github.com/rjzamora) +- Missing `device_storage_dispatch` change affecting `cudf::gather` ([#7449](https://github.com/rapidsai/cudf/pull/7449)) [@codereport](https://github.com/codereport) +- fix cuFile JNI compile errors ([#7445](https://github.com/rapidsai/cudf/pull/7445)) [@rongou](https://github.com/rongou) +- Support `Series.__setitem__` with key to a new row ([#7443](https://github.com/rapidsai/cudf/pull/7443)) [@isVoid](https://github.com/isVoid) +- Fix BUG: Exception when PYTHONOPTIMIZE=2 ([#7434](https://github.com/rapidsai/cudf/pull/7434)) [@skirui-source](https://github.com/skirui-source) +- Make inclusive scan safe for cases with leading nulls ([#7432](https://github.com/rapidsai/cudf/pull/7432)) [@magnatelee](https://github.com/magnatelee) +- Fix typo in list_device_view::pair_rep_end() ([#7423](https://github.com/rapidsai/cudf/pull/7423)) [@mythrocks](https://github.com/mythrocks) +- Fix string to double conversion and row equivalent comparison ([#7410](https://github.com/rapidsai/cudf/pull/7410)) [@ttnghia](https://github.com/ttnghia) +- Fix thrust failure when transfering data from device_vector to host_vector with vectors of size 1 
([#7382](https://github.com/rapidsai/cudf/pull/7382)) [@ttnghia](https://github.com/ttnghia) +- Fix std::exeception catch-by-reference gcc9 compile error ([#7380](https://github.com/rapidsai/cudf/pull/7380)) [@davidwendt](https://github.com/davidwendt) +- Fix skiprows issue with ORC Reader ([#7359](https://github.com/rapidsai/cudf/pull/7359)) [@rgsl888prabhu](https://github.com/rgsl888prabhu) +- fix Arrow CMake file ([#7358](https://github.com/rapidsai/cudf/pull/7358)) [@rongou](https://github.com/rongou) +- Fix lists::contains() for NaN and Decimals ([#7349](https://github.com/rapidsai/cudf/pull/7349)) [@mythrocks](https://github.com/mythrocks) +- Handle cupy array in `Dataframe.__setitem__` ([#7340](https://github.com/rapidsai/cudf/pull/7340)) [@galipremsagar](https://github.com/galipremsagar) +- Fix invalid-device-fn error in cudf::strings::replace_re with multiple regex's ([#7336](https://github.com/rapidsai/cudf/pull/7336)) [@davidwendt](https://github.com/davidwendt) +- FIX Add codecov upload block to gpu script ([#6860](https://github.com/rapidsai/cudf/pull/6860)) [@dillon-cullinan](https://github.com/dillon-cullinan) + +## 📖 Documentation + +- Fix join API doxygen ([#7890](https://github.com/rapidsai/cudf/pull/7890)) [@shwina](https://github.com/shwina) +- Add Resources to README. ([#7697](https://github.com/rapidsai/cudf/pull/7697)) [@bdice](https://github.com/bdice) +- Add `isin` examples in Docstring ([#7479](https://github.com/rapidsai/cudf/pull/7479)) [@galipremsagar](https://github.com/galipremsagar) +- Resolving unlinked type shorthands in cudf doc ([#7416](https://github.com/rapidsai/cudf/pull/7416)) [@isVoid](https://github.com/isVoid) +- Fix typo in regex.md doc page ([#7363](https://github.com/rapidsai/cudf/pull/7363)) [@davidwendt](https://github.com/davidwendt) +- Fix incorrect strings_column_view::chars_size documentation ([#7360](https://github.com/rapidsai/cudf/pull/7360)) [@jlowe](https://github.com/jlowe) + +## 🚀 New Features + +- Enable basic reductions for decimal columns ([#7776](https://github.com/rapidsai/cudf/pull/7776)) [@ChrisJar](https://github.com/ChrisJar) +- Enable join on decimal columns ([#7764](https://github.com/rapidsai/cudf/pull/7764)) [@ChrisJar](https://github.com/ChrisJar) +- Allow merging index column with data column using keyword "on" ([#7736](https://github.com/rapidsai/cudf/pull/7736)) [@skirui-source](https://github.com/skirui-source) +- Implement DecimalColumn + Scalar and add cudf.Scalars of Decimal64Dtype ([#7732](https://github.com/rapidsai/cudf/pull/7732)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Add support for `unique` groupby aggregation ([#7726](https://github.com/rapidsai/cudf/pull/7726)) [@shwina](https://github.com/shwina) +- Expose libcudf's label_bins function to cudf ([#7724](https://github.com/rapidsai/cudf/pull/7724)) [@vyasr](https://github.com/vyasr) +- Adding support for equi-join on struct ([#7720](https://github.com/rapidsai/cudf/pull/7720)) [@hyperbolic2346](https://github.com/hyperbolic2346) +- Add decimal column comparison operations ([#7716](https://github.com/rapidsai/cudf/pull/7716)) [@isVoid](https://github.com/isVoid) +- Implement scan
operations for decimal columns ([#7707](https://github.com/rapidsai/cudf/pull/7707)) [@ChrisJar](https://github.com/ChrisJar) +- Enable typecasting between decimal and int ([#7691](https://github.com/rapidsai/cudf/pull/7691)) [@ChrisJar](https://github.com/ChrisJar) +- Enable decimal support in parquet writer ([#7673](https://github.com/rapidsai/cudf/pull/7673)) [@devavret](https://github.com/devavret) +- Adds `list.unique` API ([#7664](https://github.com/rapidsai/cudf/pull/7664)) [@isVoid](https://github.com/isVoid) +- Fix NaN handling in drop_list_duplicates ([#7662](https://github.com/rapidsai/cudf/pull/7662)) [@ttnghia](https://github.com/ttnghia) +- Add `lists.sort_values` API ([#7657](https://github.com/rapidsai/cudf/pull/7657)) [@isVoid](https://github.com/isVoid) +- Add is_integer API that can check for the validity of a string-to-integer conversion ([#7642](https://github.com/rapidsai/cudf/pull/7642)) [@ttnghia](https://github.com/ttnghia) +- Adds `explode` API ([#7607](https://github.com/rapidsai/cudf/pull/7607)) [@isVoid](https://github.com/isVoid) +- Adds `list.take`, python binding for `cudf::lists::segmented_gather` ([#7591](https://github.com/rapidsai/cudf/pull/7591)) [@isVoid](https://github.com/isVoid) +- Implement cudf::label_bins() ([#7554](https://github.com/rapidsai/cudf/pull/7554)) [@vyasr](https://github.com/vyasr) +- Add Python bindings for `lists::contains` ([#7547](https://github.com/rapidsai/cudf/pull/7547)) [@skirui-source](https://github.com/skirui-source) +- cudf::row_bit_count() support. ([#7534](https://github.com/rapidsai/cudf/pull/7534)) [@nvdbaranec](https://github.com/nvdbaranec) +- Implement drop_list_duplicates ([#7528](https://github.com/rapidsai/cudf/pull/7528)) [@ttnghia](https://github.com/ttnghia) +- Add Python bindings for `lists::extract_lists_element` ([#7505](https://github.com/rapidsai/cudf/pull/7505)) [@skirui-source](https://github.com/skirui-source) +- Add explode_outer and explode_outer_position ([#7499](https://github.com/rapidsai/cudf/pull/7499)) [@hyperbolic2346](https://github.com/hyperbolic2346) +- Match Pandas logic for comparing two objects with nulls ([#7490](https://github.com/rapidsai/cudf/pull/7490)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Add struct support to parquet writer ([#7461](https://github.com/rapidsai/cudf/pull/7461)) [@devavret](https://github.com/devavret) +- Enable type conversion from float to decimal type ([#7450](https://github.com/rapidsai/cudf/pull/7450)) [@ChrisJar](https://github.com/ChrisJar) +- Add cython for converting strings/fixed-point functions ([#7429](https://github.com/rapidsai/cudf/pull/7429)) [@davidwendt](https://github.com/davidwendt) +- Add struct column support to cudf::sort and cudf::sorted_order ([#7422](https://github.com/rapidsai/cudf/pull/7422)) [@karthikeyann](https://github.com/karthikeyann) +- Implement groupby collect_set ([#7420](https://github.com/rapidsai/cudf/pull/7420)) [@ttnghia](https://github.com/ttnghia) +- Merge branch-0.18 into branch-0.19 ([#7411](https://github.com/rapidsai/cudf/pull/7411)) [@raydouglass](https://github.com/raydouglass) +- Refactor strings column factories 
([#7397](https://github.com/rapidsai/cudf/pull/7397)) [@harrism](https://github.com/harrism) +- Add groupby scan operations (sort groupby) ([#7387](https://github.com/rapidsai/cudf/pull/7387)) [@karthikeyann](https://github.com/karthikeyann) +- Add cudf::explode_position ([#7376](https://github.com/rapidsai/cudf/pull/7376)) [@hyperbolic2346](https://github.com/hyperbolic2346) +- Add string conversion to/from decimal values libcudf APIs ([#7364](https://github.com/rapidsai/cudf/pull/7364)) [@davidwendt](https://github.com/davidwendt) +- Add groupby SUM_OF_SQUARES support ([#7362](https://github.com/rapidsai/cudf/pull/7362)) [@karthikeyann](https://github.com/karthikeyann) +- Add `Series.drop` api ([#7304](https://github.com/rapidsai/cudf/pull/7304)) [@isVoid](https://github.com/isVoid) +- get_json_object() implementation ([#7286](https://github.com/rapidsai/cudf/pull/7286)) [@nvdbaranec](https://github.com/nvdbaranec) +- Python API for `LIstMethods.len()` ([#7283](https://github.com/rapidsai/cudf/pull/7283)) [@isVoid](https://github.com/isVoid) +- Support null_policy::EXCLUDE for COLLECT rolling aggregation ([#7264](https://github.com/rapidsai/cudf/pull/7264)) [@mythrocks](https://github.com/mythrocks) +- Add support for special tokens in nvtext::subword_tokenizer ([#7254](https://github.com/rapidsai/cudf/pull/7254)) [@davidwendt](https://github.com/davidwendt) +- Fix inplace update of data and add Series.update ([#7201](https://github.com/rapidsai/cudf/pull/7201)) [@galipremsagar](https://github.com/galipremsagar) +- Implement `cudf::group_by` (hash) for `decimal32` and `decimal64` ([#7190](https://github.com/rapidsai/cudf/pull/7190)) [@codereport](https://github.com/codereport) +- Adding support to specify "level" parameter for `Dataframe.rename` ([#7135](https://github.com/rapidsai/cudf/pull/7135)) [@skirui-source](https://github.com/skirui-source) + +## 🛠️ Improvements + +- fix GDS include path for version 0.95 ([#7877](https://github.com/rapidsai/cudf/pull/7877)) [@rongou](https://github.com/rongou) +- Update `dask` + `distributed` to `2021.4.0` ([#7858](https://github.com/rapidsai/cudf/pull/7858)) [@jakirkham](https://github.com/jakirkham) +- Add ability to extract include dirs from `CUDF_HOME` ([#7848](https://github.com/rapidsai/cudf/pull/7848)) [@galipremsagar](https://github.com/galipremsagar) +- Add USE_GDS as an option in build script ([#7833](https://github.com/rapidsai/cudf/pull/7833)) [@pxLi](https://github.com/pxLi) +- add an allocate method with stream in java DeviceMemoryBuffer ([#7826](https://github.com/rapidsai/cudf/pull/7826)) [@rongou](https://github.com/rongou) +- Constrain dask and distributed versions to 2021.3.1 ([#7825](https://github.com/rapidsai/cudf/pull/7825)) [@shwina](https://github.com/shwina) +- Revert dask versioning of concat dispatch ([#7823](https://github.com/rapidsai/cudf/pull/7823)) [@galipremsagar](https://github.com/galipremsagar) +- add copy methods in Java memory buffer ([#7791](https://github.com/rapidsai/cudf/pull/7791)) [@rongou](https://github.com/rongou) +- Update README and CONTRIBUTING for 0.19 ([#7778](https://github.com/rapidsai/cudf/pull/7778))
[@robertmaynard](https://github.com/robertmaynard) +- Allow hash_partition to take a seed value ([#7771](https://github.com/rapidsai/cudf/pull/7771)) [@magnatelee](https://github.com/magnatelee) +- Turn on NVTX by default in java build ([#7761](https://github.com/rapidsai/cudf/pull/7761)) [@tgravescs](https://github.com/tgravescs) +- Add Java bindings to join gather map APIs ([#7751](https://github.com/rapidsai/cudf/pull/7751)) [@jlowe](https://github.com/jlowe) +- Add replacements column support for Java replaceNulls ([#7750](https://github.com/rapidsai/cudf/pull/7750)) [@jlowe](https://github.com/jlowe) +- Add Java bindings for row_bit_count ([#7749](https://github.com/rapidsai/cudf/pull/7749)) [@jlowe](https://github.com/jlowe) +- Remove unused JVM array creation ([#7748](https://github.com/rapidsai/cudf/pull/7748)) [@jlowe](https://github.com/jlowe) +- Added JNI support for new is_integer ([#7739](https://github.com/rapidsai/cudf/pull/7739)) [@revans2](https://github.com/revans2) +- Create and promote library aliases in libcudf installations ([#7734](https://github.com/rapidsai/cudf/pull/7734)) [@trxcllnt](https://github.com/trxcllnt) +- Support groupby operations for decimal dtypes ([#7731](https://github.com/rapidsai/cudf/pull/7731)) [@vyasr](https://github.com/vyasr) +- Memory map the input file only when GDS compatiblity mode is not used ([#7717](https://github.com/rapidsai/cudf/pull/7717)) [@vuule](https://github.com/vuule) +- Replace device_vector with device_uvector in null_mask ([#7715](https://github.com/rapidsai/cudf/pull/7715)) [@harrism](https://github.com/harrism) +- Struct hashing support for SerialMurmur3 and SparkMurmur3 ([#7714](https://github.com/rapidsai/cudf/pull/7714)) [@jlowe](https://github.com/jlowe) +- Add gbenchmark for nvtext replace-tokens function ([#7708](https://github.com/rapidsai/cudf/pull/7708)) [@davidwendt](https://github.com/davidwendt) +- Use stream in groupby calls ([#7705](https://github.com/rapidsai/cudf/pull/7705)) [@karthikeyann](https://github.com/karthikeyann) +- Update codeowners file ([#7701](https://github.com/rapidsai/cudf/pull/7701)) [@ajschmidt8](https://github.com/ajschmidt8) +- Cleanup groupby to use host_span, device_span, device_uvector ([#7698](https://github.com/rapidsai/cudf/pull/7698)) [@karthikeyann](https://github.com/karthikeyann) +- Add gbenchmark for nvtext ngrams functions ([#7693](https://github.com/rapidsai/cudf/pull/7693)) [@davidwendt](https://github.com/davidwendt) +- Misc Python/Cython optimizations ([#7686](https://github.com/rapidsai/cudf/pull/7686)) [@shwina](https://github.com/shwina) +- Add gbenchmark for nvtext tokenize functions ([#7684](https://github.com/rapidsai/cudf/pull/7684)) [@davidwendt](https://github.com/davidwendt) +- Add column_device_view to orc writer ([#7676](https://github.com/rapidsai/cudf/pull/7676)) [@kaatish](https://github.com/kaatish) +- cudf_kafka now uses cuDF CMake export targets (CPM) ([#7674](https://github.com/rapidsai/cudf/pull/7674)) [@robertmaynard](https://github.com/robertmaynard) +- Add gbenchmark for nvtext normalize functions ([#7668](https://github.com/rapidsai/cudf/pull/7668)) 
[@davidwendt](https://github.com/davidwendt) +- Resolve unnecessary import of thrust/optional.hpp in types.hpp ([#7667](https://github.com/rapidsai/cudf/pull/7667)) [@vyasr](https://github.com/vyasr) +- Feature/optimize accessor copy ([#7660](https://github.com/rapidsai/cudf/pull/7660)) [@vyasr](https://github.com/vyasr) +- Fix `find_package(cudf)` ([#7658](https://github.com/rapidsai/cudf/pull/7658)) [@trxcllnt](https://github.com/trxcllnt) +- Work-around for gcc7 compile error on Centos7 ([#7652](https://github.com/rapidsai/cudf/pull/7652)) [@davidwendt](https://github.com/davidwendt) +- Add in JNI support for count_elements ([#7651](https://github.com/rapidsai/cudf/pull/7651)) [@revans2](https://github.com/revans2) +- Fix issues with building cudf in a non-conda environment ([#7647](https://github.com/rapidsai/cudf/pull/7647)) [@galipremsagar](https://github.com/galipremsagar) +- Refactor ConfigureCUDA to not conditionally insert compiler flags ([#7643](https://github.com/rapidsai/cudf/pull/7643)) [@robertmaynard](https://github.com/robertmaynard) +- Add gbenchmark for converting strings to/from timestamps ([#7641](https://github.com/rapidsai/cudf/pull/7641)) [@davidwendt](https://github.com/davidwendt) +- Handle constructing a `cudf.Scalar` from a `cudf.Scalar` ([#7639](https://github.com/rapidsai/cudf/pull/7639)) [@shwina](https://github.com/shwina) +- Add in JNI support for table partition ([#7637](https://github.com/rapidsai/cudf/pull/7637)) [@revans2](https://github.com/revans2) +- Add explicit fixed_point merge test ([#7635](https://github.com/rapidsai/cudf/pull/7635)) [@codereport](https://github.com/codereport) +- Add JNI support for IDENTITY hash partitioning ([#7626](https://github.com/rapidsai/cudf/pull/7626)) [@revans2](https://github.com/revans2) +- Java support on explode_outer ([#7625](https://github.com/rapidsai/cudf/pull/7625)) [@sperlingxx](https://github.com/sperlingxx) +- Java support of casting string from/to decimal ([#7623](https://github.com/rapidsai/cudf/pull/7623)) [@sperlingxx](https://github.com/sperlingxx) +- Convert cudf::concatenate APIs to use spans and device_uvector ([#7621](https://github.com/rapidsai/cudf/pull/7621)) [@harrism](https://github.com/harrism) +- Add gbenchmark for cudf::strings::translate function ([#7617](https://github.com/rapidsai/cudf/pull/7617)) [@davidwendt](https://github.com/davidwendt) +- Use file(COPY ) over file(INSTALL ) so cmake output is reduced ([#7616](https://github.com/rapidsai/cudf/pull/7616)) [@robertmaynard](https://github.com/robertmaynard) +- Use rmm::device_uvector in place of rmm::device_vector for ORC reader/writer and cudf::io::column_buffer ([#7614](https://github.com/rapidsai/cudf/pull/7614)) [@vuule](https://github.com/vuule) +- Refactor Java host-side buffer concatenation to expose separate steps ([#7610](https://github.com/rapidsai/cudf/pull/7610)) [@jlowe](https://github.com/jlowe) +- Add gbenchmarks for string substrings functions ([#7603](https://github.com/rapidsai/cudf/pull/7603)) [@davidwendt](https://github.com/davidwendt) +- Refactor string conversion check ([#7599](https://github.com/rapidsai/cudf/pull/7599)) [@ttnghia](https://github.com/ttnghia) +- JNI: Pass 
names of children struct columns to native Arrow IPC writer ([#7598](https://github.com/rapidsai/cudf/pull/7598)) [@firestarman](https://github.com/firestarman) +- Revert "ENH Fix stale GHA and prevent duplicates " ([#7595](https://github.com/rapidsai/cudf/pull/7595)) [@mike-wendt](https://github.com/mike-wendt) +- ENH Fix stale GHA and prevent duplicates ([#7594](https://github.com/rapidsai/cudf/pull/7594)) [@mike-wendt](https://github.com/mike-wendt) +- Fix auto-detecting GPU architectures ([#7593](https://github.com/rapidsai/cudf/pull/7593)) [@trxcllnt](https://github.com/trxcllnt) +- Reduce cudf library size ([#7583](https://github.com/rapidsai/cudf/pull/7583)) [@robertmaynard](https://github.com/robertmaynard) +- Optimize cudf::make_strings_column for long strings ([#7576](https://github.com/rapidsai/cudf/pull/7576)) [@davidwendt](https://github.com/davidwendt) +- Always build and export the cudf::cudftestutil target ([#7574](https://github.com/rapidsai/cudf/pull/7574)) [@trxcllnt](https://github.com/trxcllnt) +- Eliminate literal parameters to uvector::set_element_async and device_scalar::set_value ([#7563](https://github.com/rapidsai/cudf/pull/7563)) [@harrism](https://github.com/harrism) +- Add gbenchmark for strings::concatenate ([#7560](https://github.com/rapidsai/cudf/pull/7560)) [@davidwendt](https://github.com/davidwendt) +- Update Changelog Link ([#7550](https://github.com/rapidsai/cudf/pull/7550)) [@ajschmidt8](https://github.com/ajschmidt8) +- Add gbenchmarks for strings replace regex functions ([#7541](https://github.com/rapidsai/cudf/pull/7541)) [@davidwendt](https://github.com/davidwendt) +- Add `__repr__` for Column and ColumnAccessor ([#7531](https://github.com/rapidsai/cudf/pull/7531)) [@shwina](https://github.com/shwina) +- Support Decimal DIV changes in cudf ([#7527](https://github.com/rapidsai/cudf/pull/7527)) [@razajafri](https://github.com/razajafri) +- Remove unneeded step parameter from strings::detail::copy_slice ([#7525](https://github.com/rapidsai/cudf/pull/7525)) [@davidwendt](https://github.com/davidwendt) +- Use device_uvector, device_span in sort groupby ([#7523](https://github.com/rapidsai/cudf/pull/7523)) [@karthikeyann](https://github.com/karthikeyann) +- Add gbenchmarks for strings extract function ([#7522](https://github.com/rapidsai/cudf/pull/7522)) [@davidwendt](https://github.com/davidwendt) +- Rename ARROW_STATIC_LIB because it conflicts with one in FindArrow.cmake ([#7518](https://github.com/rapidsai/cudf/pull/7518)) [@trxcllnt](https://github.com/trxcllnt) +- Reduce compile time/size for scan.cu ([#7516](https://github.com/rapidsai/cudf/pull/7516)) [@davidwendt](https://github.com/davidwendt) +- Change device_vector to device_uvector in nvtext source files ([#7512](https://github.com/rapidsai/cudf/pull/7512)) [@davidwendt](https://github.com/davidwendt) +- Removed unneeded includes from traits.hpp ([#7509](https://github.com/rapidsai/cudf/pull/7509)) [@davidwendt](https://github.com/davidwendt) +- FIX Remove random build directory generation for ccache ([#7508](https://github.com/rapidsai/cudf/pull/7508)) [@dillon-cullinan](https://github.com/dillon-cullinan) +- xfail failing pytest in pandas 1.2.3 
([#7507](https://github.com/rapidsai/cudf/pull/7507)) [@galipremsagar](https://github.com/galipremsagar) +- JNI bit cast ([#7493](https://github.com/rapidsai/cudf/pull/7493)) [@revans2](https://github.com/revans2) +- Combine rolling window function tests ([#7480](https://github.com/rapidsai/cudf/pull/7480)) [@mythrocks](https://github.com/mythrocks) +- Prepare Changelog for Automation ([#7477](https://github.com/rapidsai/cudf/pull/7477)) [@ajschmidt8](https://github.com/ajschmidt8) +- Java support for explode position ([#7471](https://github.com/rapidsai/cudf/pull/7471)) [@sperlingxx](https://github.com/sperlingxx) +- Update 0.18 changelog entry ([#7463](https://github.com/rapidsai/cudf/pull/7463)) [@ajschmidt8](https://github.com/ajschmidt8) +- JNI: Support skipping nulls for collect aggregation ([#7457](https://github.com/rapidsai/cudf/pull/7457)) [@firestarman](https://github.com/firestarman) +- Join APIs that return gathermaps ([#7454](https://github.com/rapidsai/cudf/pull/7454)) [@shwina](https://github.com/shwina) +- Remove dependence on managed memory for multimap test ([#7451](https://github.com/rapidsai/cudf/pull/7451)) [@jrhemstad](https://github.com/jrhemstad) +- Use cuFile for Parquet IO when available ([#7444](https://github.com/rapidsai/cudf/pull/7444)) [@vuule](https://github.com/vuule) +- Statistics cleanup ([#7439](https://github.com/rapidsai/cudf/pull/7439)) [@kaatish](https://github.com/kaatish) +- Add gbenchmarks for strings filter functions ([#7438](https://github.com/rapidsai/cudf/pull/7438)) [@davidwendt](https://github.com/davidwendt) +- `fixed_point` + `cudf::binary_operation` API Changes ([#7435](https://github.com/rapidsai/cudf/pull/7435)) [@codereport](https://github.com/codereport) +- Improve string gather performance ([#7433](https://github.com/rapidsai/cudf/pull/7433)) [@jlowe](https://github.com/jlowe) +- Don't use user resource for a temporary allocation in sort_by_key ([#7431](https://github.com/rapidsai/cudf/pull/7431)) [@magnatelee](https://github.com/magnatelee) +- Detail APIs for datetime functions ([#7430](https://github.com/rapidsai/cudf/pull/7430)) [@magnatelee](https://github.com/magnatelee) +- Replace thrust::max_element with thrust::reduce in strings findall_re ([#7428](https://github.com/rapidsai/cudf/pull/7428)) [@davidwendt](https://github.com/davidwendt) +- Add gbenchmark for strings split/split_record functions ([#7427](https://github.com/rapidsai/cudf/pull/7427)) [@davidwendt](https://github.com/davidwendt) +- Update JNI build to use CMAKE_CUDA_ARCHITECTURES ([#7425](https://github.com/rapidsai/cudf/pull/7425)) [@jlowe](https://github.com/jlowe) +- Change nvtext::load_vocabulary_file to return a unique ptr ([#7424](https://github.com/rapidsai/cudf/pull/7424)) [@davidwendt](https://github.com/davidwendt) +- Simplify type dispatch with `device_storage_dispatch` ([#7419](https://github.com/rapidsai/cudf/pull/7419)) [@codereport](https://github.com/codereport) +- Java support for casting of nested child columns ([#7417](https://github.com/rapidsai/cudf/pull/7417)) [@razajafri](https://github.com/razajafri) +- Improve scalar string replace performance for long strings 
([#7415](https://github.com/rapidsai/cudf/pull/7415)) [@jlowe](https://github.com/jlowe) +- Remove unneeded temporary device vector for strings scatter specialization ([#7409](https://github.com/rapidsai/cudf/pull/7409)) [@davidwendt](https://github.com/davidwendt) +- bitmask_or implementation with bitmask refactor ([#7406](https://github.com/rapidsai/cudf/pull/7406)) [@rwlee](https://github.com/rwlee) +- Add other cudf::strings::replace functions to current strings replace gbenchmark ([#7403](https://github.com/rapidsai/cudf/pull/7403)) [@davidwendt](https://github.com/davidwendt) +- Clean up included headers in `device_operators.cuh` ([#7401](https://github.com/rapidsai/cudf/pull/7401)) [@codereport](https://github.com/codereport) +- Move nullable index iterator to indexalator factory ([#7399](https://github.com/rapidsai/cudf/pull/7399)) [@davidwendt](https://github.com/davidwendt) +- ENH Pass ccache variables to conda recipe & use Ninja in CI ([#7398](https://github.com/rapidsai/cudf/pull/7398)) [@Ethyling](https://github.com/Ethyling) +- upgrade maven-antrun-plugin to support maven parallel builds ([#7393](https://github.com/rapidsai/cudf/pull/7393)) [@rongou](https://github.com/rongou) +- Add gbenchmark for strings find/contains functions ([#7392](https://github.com/rapidsai/cudf/pull/7392)) [@davidwendt](https://github.com/davidwendt) +- Use CMAKE_CUDA_ARCHITECTURES ([#7391](https://github.com/rapidsai/cudf/pull/7391)) [@robertmaynard](https://github.com/robertmaynard) +- Refactor libcudf strings::replace to use make_strings_children utility ([#7384](https://github.com/rapidsai/cudf/pull/7384)) [@davidwendt](https://github.com/davidwendt) +- Added in JNI support for out of core sort algorithm ([#7381](https://github.com/rapidsai/cudf/pull/7381)) [@revans2](https://github.com/revans2) +- Upgrade pandas to 1.2 ([#7375](https://github.com/rapidsai/cudf/pull/7375)) [@galipremsagar](https://github.com/galipremsagar) +- Rename `logical_cast` to `bit_cast` and allow additional conversions ([#7373](https://github.com/rapidsai/cudf/pull/7373)) [@ttnghia](https://github.com/ttnghia) +- jitify 2 support ([#7372](https://github.com/rapidsai/cudf/pull/7372)) [@cwharris](https://github.com/cwharris) +- compile_udf: Cache PTX for similar functions ([#7371](https://github.com/rapidsai/cudf/pull/7371)) [@gmarkall](https://github.com/gmarkall) +- Add string scalar replace benchmark ([#7369](https://github.com/rapidsai/cudf/pull/7369)) [@jlowe](https://github.com/jlowe) +- Add gbenchmark for strings contains_re/count_re functions ([#7366](https://github.com/rapidsai/cudf/pull/7366)) [@davidwendt](https://github.com/davidwendt) +- Update orc reader and writer fuzz tests ([#7357](https://github.com/rapidsai/cudf/pull/7357)) [@galipremsagar](https://github.com/galipremsagar) +- Improve url_decode performance for long strings ([#7353](https://github.com/rapidsai/cudf/pull/7353)) [@jlowe](https://github.com/jlowe) +- `cudf::ast` Small Refactorings ([#7352](https://github.com/rapidsai/cudf/pull/7352)) [@codereport](https://github.com/codereport) +- Remove std::cout and print in the scatter test function EmptyListsOfNullableStrings. 
([#7342](https://github.com/rapidsai/cudf/pull/7342)) [@ttnghia](https://github.com/ttnghia) +- Use `cudf::detail::make_counting_transform_iterator` ([#7338](https://github.com/rapidsai/cudf/pull/7338)) [@codereport](https://github.com/codereport) +- Change block size parameter from a global to a template param. ([#7333](https://github.com/rapidsai/cudf/pull/7333)) [@nvdbaranec](https://github.com/nvdbaranec) +- Partial clean up of ORC writer ([#7324](https://github.com/rapidsai/cudf/pull/7324)) [@vuule](https://github.com/vuule) +- Add gbenchmark for cudf::strings::to_lower ([#7316](https://github.com/rapidsai/cudf/pull/7316)) [@davidwendt](https://github.com/davidwendt) +- Update Java bindings version to 0.19-SNAPSHOT ([#7307](https://github.com/rapidsai/cudf/pull/7307)) [@pxLi](https://github.com/pxLi) +- Move `cudf::test::make_counting_transform_iterator` to `cudf/detail/iterator.cuh` ([#7306](https://github.com/rapidsai/cudf/pull/7306)) [@codereport](https://github.com/codereport) +- Use string literals in `fixed_point` `release_assert`s ([#7303](https://github.com/rapidsai/cudf/pull/7303)) [@codereport](https://github.com/codereport) +- Fix merge conflicts for #7295 ([#7297](https://github.com/rapidsai/cudf/pull/7297)) [@ajschmidt8](https://github.com/ajschmidt8) +- Add UTF-8 chars to create_random_column<string_view> benchmark utility ([#7292](https://github.com/rapidsai/cudf/pull/7292)) [@davidwendt](https://github.com/davidwendt) +- Abstracting block reduce and block scan from cuIO kernels with `cub` apis ([#7278](https://github.com/rapidsai/cudf/pull/7278)) [@rgsl888prabhu](https://github.com/rgsl888prabhu) +- Build.sh use cmake --build to drive build system invocation ([#7270](https://github.com/rapidsai/cudf/pull/7270)) [@robertmaynard](https://github.com/robertmaynard) +- Refactor dictionary support for reductions any/all ([#7242](https://github.com/rapidsai/cudf/pull/7242)) [@davidwendt](https://github.com/davidwendt) +- Replace stream.value() with stream for stream_view args ([#7236](https://github.com/rapidsai/cudf/pull/7236)) [@karthikeyann](https://github.com/karthikeyann) +- Interval index and interval_range ([#7182](https://github.com/rapidsai/cudf/pull/7182)) [@marlenezw](https://github.com/marlenezw) +- avro reader integration tests ([#7156](https://github.com/rapidsai/cudf/pull/7156)) [@cwharris](https://github.com/cwharris) +- Rework libcudf CMakeLists.txt to export targets for CPM ([#7107](https://github.com/rapidsai/cudf/pull/7107)) [@trxcllnt](https://github.com/trxcllnt) +- Adding Interval Dtype ([#6984](https://github.com/rapidsai/cudf/pull/6984)) [@marlenezw](https://github.com/marlenezw) +- Cleaning up `for` loops with `make_(counting_)transform_iterator` ([#6546](https://github.com/rapidsai/cudf/pull/6546)) [@codereport](https://github.com/codereport) # cuDF 0.18.0 (24 Feb 2021) diff --git a/README.md b/README.md index ed4277e1fcb..c0c33c645e1 100644 --- a/README.md +++ b/README.md @@ -65,15 +65,16 @@ Please see the [Demo Docker Repository](https://hub.docker.com/r/rapidsai/rapids cuDF can be installed with conda ([miniconda](https://conda.io/miniconda.html), or the full [Anaconda distribution](https://www.anaconda.com/download)) from the `rapidsai` channel: 
-For `cudf version == 21.081.06` : +For `cudf version == 21.06` : ```bash -# for CUDA 10.1 conda install -c rapidsai -c nvidia -c numba -c conda-forge \ - cudf=21.081.06 python=3.7 cudatoolkit=10.1 + cudf=21.06 python=3.7 cudatoolkit=11.0 -# or, for CUDA 10.2 conda install -c rapidsai -c nvidia -c numba -c conda-forge \ - cudf=21.081.06 python=3.7 cudatoolkit=10.2 + cudf=21.06 python=3.7 cudatoolkit=11.2 ``` diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 39f2ba3188c..631ebf16aea 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -3,7 +3,7 @@ {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} {% set py_version=environ.get('CONDA_PY', 36) %} -{% set cuda_version='.'.join(environ.get('CUDA_VERSION', '10.1').split('.')[:2]) %} +{% set cuda_version='.'.join(environ.get('CUDA', '10.1').split('.')[:2]) %} package: name: cudf diff --git a/conda/recipes/cudf_kafka/meta.yaml b/conda/recipes/cudf_kafka/meta.yaml index 35dfb1791d8..b59a49b0db7 100644 --- a/conda/recipes/cudf_kafka/meta.yaml +++ b/conda/recipes/cudf_kafka/meta.yaml @@ -3,7 +3,7 @@ {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} {% set py_version=environ.get('CONDA_PY', 36) %} -{% set cuda_version='.'.join(environ.get('CUDA_VERSION', '10.1').split('.')[:2]) %} +{% set cuda_version='.'.join(environ.get('CUDA', '10.1').split('.')[:2]) %} package: name: cudf_kafka diff --git a/conda/recipes/custreamz/meta.yaml b/conda/recipes/custreamz/meta.yaml index 0ae0ce830ad..bb5186d7057 100644 --- a/conda/recipes/custreamz/meta.yaml +++ b/conda/recipes/custreamz/meta.yaml @@ -3,7 +3,7 @@ {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} {% set py_version=environ.get('CONDA_PY', 36) %} -{% set cuda_version='.'.join(environ.get('CUDA_VERSION', '10.1').split('.')[:2]) %} +{% set cuda_version='.'.join(environ.get('CUDA', '10.1').split('.')[:2]) %} package: name: custreamz diff --git a/conda/recipes/dask-cudf/meta.yaml b/conda/recipes/dask-cudf/meta.yaml index e66b4c930ec..14376f54ba1 100644 --- a/conda/recipes/dask-cudf/meta.yaml +++ b/conda/recipes/dask-cudf/meta.yaml @@ -3,7 +3,7 @@ {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} {% set py_version=environ.get('CONDA_PY', 36) %} -{% set cuda_version='.'.join(environ.get('CUDA_VERSION', '10.1').split('.')[:2]) %} +{% set cuda_version='.'.join(environ.get('CUDA', '10.1').split('.')[:2]) %} package: name: dask-cudf diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index ea2fda399fd..a8abe5b09f0 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -2,7 +2,7 @@ {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %} {% set minor_version = version.split('.')[0] + '.'
+ version.split('.')[1] %} -{% set cuda_version='.'.join(environ.get('CUDA_VERSION', '10.1').split('.')[:2]) %} +{% set cuda_version='.'.join(environ.get('CUDA', '10.1').split('.')[:2]) %} package: name: libcudf @@ -133,12 +133,14 @@ test: - test -f $PREFIX/include/cudf/io/types.hpp - test -f $PREFIX/include/cudf/ipc.hpp - test -f $PREFIX/include/cudf/join.hpp + - test -f $PREFIX/include/cudf/lists/detail/combine.hpp - test -f $PREFIX/include/cudf/lists/detail/concatenate.hpp - test -f $PREFIX/include/cudf/lists/detail/copying.hpp + - test -f $PREFIX/include/cudf/lists/lists_column_factories.hpp - test -f $PREFIX/include/cudf/lists/detail/drop_list_duplicates.hpp - test -f $PREFIX/include/cudf/lists/detail/interleave_columns.hpp - test -f $PREFIX/include/cudf/lists/detail/sorting.hpp - - test -f $PREFIX/include/cudf/lists/concatenate_rows.hpp + - test -f $PREFIX/include/cudf/lists/combine.hpp - test -f $PREFIX/include/cudf/lists/count_elements.hpp - test -f $PREFIX/include/cudf/lists/explode.hpp - test -f $PREFIX/include/cudf/lists/drop_list_duplicates.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 2220a1b1a2c..8620531ec22 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -266,7 +266,8 @@ add_library(cudf src/join/join.cu src/join/semi_join.cu src/lists/contains.cu - src/lists/concatenate_rows.cu + src/lists/combine/concatenate_list_elements.cu + src/lists/combine/concatenate_rows.cu src/lists/copying/concatenate.cu src/lists/copying/copying.cu src/lists/copying/gather.cu @@ -332,8 +333,8 @@ add_library(cudf src/strings/char_types/char_cases.cu src/strings/char_types/char_types.cu src/strings/combine/concatenate.cu - src/strings/combine/concatenate_list_elements.cu src/strings/combine/join.cu + src/strings/combine/join_list_elements.cu src/strings/contains.cu src/strings/convert/convert_booleans.cu src/strings/convert/convert_datetime.cu diff --git a/cpp/cmake/thrust.patch b/cpp/cmake/thrust.patch index c14b8cdafe5..2f9201d8ab4 100644 --- a/cpp/cmake/thrust.patch +++ b/cpp/cmake/thrust.patch @@ -81,25 +81,3 @@ index c0c6d59..937ee31 100644 { typedef AgentScanPolicy< 128, 15, ///< Threads per block, items per thread -diff --git a/thrust/system/cuda/detail/scan_by_key.h b/thrust/system/cuda/detail/scan_by_key.h -index fe4b321c..b3974c69 100644 ---- a/thrust/system/cuda/detail/scan_by_key.h -+++ b/thrust/system/cuda/detail/scan_by_key.h -@@ -513,7 +513,7 @@ namespace __scan_by_key { - scan_op(scan_op_) - { - int tile_idx = blockIdx.x; -- Size tile_base = ITEMS_PER_TILE * tile_idx; -+ Size tile_base = ITEMS_PER_TILE * static_cast(tile_idx); - Size num_remaining = num_items - tile_base; - - if (num_remaining > ITEMS_PER_TILE) -@@ -734,7 +734,7 @@ namespace __scan_by_key { - ScanOp scan_op, - AddInitToScan add_init_to_scan) - { -- int num_items = static_cast(thrust::distance(keys_first, keys_last)); -+ size_t num_items = static_cast(thrust::distance(keys_first, keys_last)); - size_t storage_size = 0; - cudaStream_t stream = cuda_cub::stream(policy); - bool debug_sync = THRUST_DEBUG_SYNC_FLAG; diff --git a/cpp/include/cudf/column/column_factories.hpp b/cpp/include/cudf/column/column_factories.hpp index 43c2407d629..e5424f0fc44 100644 --- a/cpp/include/cudf/column/column_factories.hpp +++ b/cpp/include/cudf/column/column_factories.hpp @@ -541,7 +541,8 @@ std::unique_ptr make_structs_column( * * The output column will have the same type as `s.type()` * The output column will contain all null rows if `s.invalid()==false` - * The output column will be empty if `size==0`. 
+ * The output column will be empty if `size==0`. For LIST scalars, the column hierarchy + * from @p s is preserved. * * @param[in] s The scalar to use for values in the column. * @param[in] size The number of rows for the output column. diff --git a/cpp/include/cudf/lists/concatenate_rows.hpp b/cpp/include/cudf/lists/combine.hpp similarity index 57% rename from cpp/include/cudf/lists/concatenate_rows.hpp rename to cpp/include/cudf/lists/combine.hpp index 1d93de418f8..a9407ed57ca 100644 --- a/cpp/include/cudf/lists/concatenate_rows.hpp +++ b/cpp/include/cudf/lists/combine.hpp @@ -21,7 +21,7 @@ namespace cudf { namespace lists { /** - * @addtogroup lists_concatenate_rows + * @addtogroup lists_combine * @{ * @file */ @@ -53,16 +53,47 @@ enum class concatenate_null_policy { IGNORE, NULLIFY_OUTPUT_ROW }; * * @param input Table of lists to be concatenated. * @param null_policy The parameter to specify whether a null list element will be ignored from - * concatenation, or any concatenation involving a null list element will result in a null list. + * concatenation, or any concatenation involving a null element will result in a null list. * @param mr Device memory resource used to allocate the returned column's device memory. * @return A new column in which each row is a list resulted from concatenating all list elements in - * the corresponding row of the input table. + * the corresponding row of the input table. */ std::unique_ptr concatenate_rows( table_view const& input, concatenate_null_policy null_policy = concatenate_null_policy::IGNORE, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Concatenates multiple lists on the same row of a lists column into a single list. + * + * Given a lists column where each row in the column is a list of lists of entries, an output lists + * column is generated by concatenating all the list elements at the same row together. If any row + * contains null list elements, the concatenation process will either ignore those null elements, or + * will simply set the entire resulting row to null. + * + * @code{.pseudo} + * l = [ [{1, 2}, {3, 4}, {5}], [{6}, {}, {7, 8, 9}] ] + * r = lists::concatenate_list_elements(l); + * r is [ {1, 2, 3, 4, 5}, {6, 7, 8, 9} ] + * @endcode + * + * @throws cudf::logic_error if the input column is not a lists column with at least two levels of + * nesting (i.e., each row must be a list of lists). + * @throws cudf::logic_error if the input lists column contains nested-type entries that are not + * lists. + * + * @param input The lists column containing lists of list elements to concatenate. + * @param null_policy The parameter to specify whether a null list element will be ignored from + * concatenation, or any concatenation involving a null element will result in a null list. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return A new column in which each row is a list resulting from concatenating all list elements in + * the corresponding row of the input lists column. 
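The doxygen above fully specifies the new `cudf::lists::concatenate_list_elements` API. A minimal usage sketch follows; the `cudf::test::lists_column_wrapper` utility, the helper function name, and the sample values are assumptions used only to build a lists-of-lists input and are not part of this change.

```cpp
// Sketch only: wrapper type, function name, and values are illustrative.
#include <cudf/lists/combine.hpp>
#include <cudf_test/column_wrapper.hpp>

void concatenate_list_elements_example()
{
  // Two rows, each a list of lists of INT32:
  //   row 0: [[1, 2], [3, 4], [5]]
  //   row 1: [[6], [7, 8, 9]]
  cudf::test::lists_column_wrapper<int32_t> input{{{1, 2}, {3, 4}, {5}},
                                                  {{6}, {7, 8, 9}}};

  // Null list elements are ignored by default (concatenate_null_policy::IGNORE).
  auto result = cudf::lists::concatenate_list_elements(input);
  // Expected rows: [1, 2, 3, 4, 5] and [6, 7, 8, 9].
}
```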
+ */ +std::unique_ptr concatenate_list_elements( + column_view const& input, + concatenate_null_policy null_policy = concatenate_null_policy::IGNORE, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** @} */ // end of group } // namespace lists } // namespace cudf diff --git a/cpp/include/cudf/lists/detail/combine.hpp b/cpp/include/cudf/lists/detail/combine.hpp new file mode 100644 index 00000000000..9f28074173a --- /dev/null +++ b/cpp/include/cudf/lists/detail/combine.hpp @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include + +namespace cudf { +namespace lists { +namespace detail { +/** + * @copydoc cudf::lists::concatenate_rows + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr concatenate_rows( + table_view const& input, + concatenate_null_policy null_policy, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @copydoc cudf::lists::concatenate_list_elements + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr concatenate_list_elements( + column_view const& input, + concatenate_null_policy null_policy, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +} // namespace detail +} // namespace lists +} // namespace cudf diff --git a/cpp/include/cudf/lists/detail/copying.hpp b/cpp/include/cudf/lists/detail/copying.hpp index 548fec7e7f6..3760294f079 100644 --- a/cpp/include/cudf/lists/detail/copying.hpp +++ b/cpp/include/cudf/lists/detail/copying.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -48,21 +48,6 @@ std::unique_ptr copy_slice(lists_column_view const& lists, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); -/** - * @brief Create a single-level empty lists column. - * - * An empty lists column contains empty children so the column's - * basic type is recorded. - * - * @param child_type The type used for the child column. - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New empty lists column. 
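The detail header above mirrors the public combine APIs with an explicit stream parameter. For the public `cudf::lists::concatenate_rows` entry point, a hedged usage sketch; the test wrappers, helper function name, and data are illustrative assumptions.

```cpp
// Sketch only: wrapper types, function name, and values are illustrative.
#include <cudf/lists/combine.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf_test/column_wrapper.hpp>

void concatenate_rows_example()
{
  // Two lists columns with the same number of rows.
  cudf::test::lists_column_wrapper<int32_t> col0{{0, 1}, {2, 3, 4}};
  cudf::test::lists_column_wrapper<int32_t> col1{{5}, {6}};

  // Row-wise concatenation: row i of the result is col0[i] followed by col1[i].
  auto result = cudf::lists::concatenate_rows(cudf::table_view{{col0, col1}});
  // Expected rows: [0, 1, 5] and [2, 3, 4, 6].
}
```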
- */ -std::unique_ptr make_empty_lists_column(data_type child_type, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); - } // namespace detail } // namespace lists } // namespace cudf diff --git a/cpp/include/cudf/lists/detail/scatter.cuh b/cpp/include/cudf/lists/detail/scatter.cuh index b179ccf228b..aec45d260bf 100644 --- a/cpp/include/cudf/lists/detail/scatter.cuh +++ b/cpp/include/cudf/lists/detail/scatter.cuh @@ -526,10 +526,7 @@ struct list_child_constructor { if (num_child_rows == 0) { // make an empty lists column using the input child type - return make_empty_lists_column( - source_lists_column_view.child().child(lists_column_view::child_column_index).type(), - stream, - mr); + return empty_like(source_lists_column_view.child()); } auto child_list_views = rmm::device_uvector(num_child_rows, stream, mr); diff --git a/cpp/include/cudf/lists/lists_column_factories.hpp b/cpp/include/cudf/lists/lists_column_factories.hpp new file mode 100644 index 00000000000..bdf06cfa9e7 --- /dev/null +++ b/cpp/include/cudf/lists/lists_column_factories.hpp @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +namespace cudf { +namespace lists { +namespace detail { + +/** + * @brief Internal API to construct a lists column from a `list_scalar`, for public + * use, use `cudf::make_column_from_scalar`. + * + * @param[in] value The `list_scalar` to construct from + * @param[in] size The number of rows for the output column. + * @param[in] stream CUDA stream used for device memory operations and kernel launches. + * @param[in] mr Device memory resource used to allocate the returned column's device memory. + */ +std::unique_ptr make_lists_column_from_scalar( + list_scalar const& value, + size_type size, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +} // namespace detail +} // namespace lists +} // namespace cudf diff --git a/cpp/include/cudf/strings/combine.hpp b/cpp/include/cudf/strings/combine.hpp index 6887ef0e670..360efe15303 100644 --- a/cpp/include/cudf/strings/combine.hpp +++ b/cpp/include/cudf/strings/combine.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -30,12 +30,21 @@ namespace strings { * @brief Strings APIs for concatenate and join */ +/** + * @brief Setting for specifying how separators are added with + * null strings elements. + */ +enum class separator_on_nulls { + YES, ///< Always add separators between elements + NO ///< Do not add separators if an element is null +}; + /** * @brief Concatenates all strings in the column into one new string delimited * by an optional separator string. * * This returns a column with one string. 
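The new `lists_column_factories.hpp` header above routes `list_scalar` fills through `make_lists_column_from_scalar`, with `cudf::make_column_from_scalar` as the public entry point. A minimal sketch, assuming `cudf::list_scalar` can be constructed from a column of elements as in this release's scalar headers; the wrapper, function name, and values are illustrative.

```cpp
// Sketch only: assumes cudf::list_scalar is constructible from a column of
// elements; wrapper and values are illustrative.
#include <cudf/column/column_factories.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf_test/column_wrapper.hpp>

void fill_column_from_list_scalar_example()
{
  // A list_scalar holding the single list [1, 2, 3].
  cudf::test::fixed_width_column_wrapper<int32_t> elements{1, 2, 3};
  cudf::list_scalar value{elements};

  // Public entry point: a lists column with 4 rows, each a copy of [1, 2, 3].
  auto col = cudf::make_column_from_scalar(value, 4);
}
```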
Any null entries are ignored unless - * the narep parameter specifies a replacement string. + * the @p narep parameter specifies a replacement string. * * @code{.pseudo} * Example: @@ -70,11 +79,9 @@ std::unique_ptr join_strings( * * - If row separator for a given row is null, output column for that row is null, unless * there is a valid @p separator_narep - * - If all column values for a given row is null, output column for that row is null, unless - * there is a valid @p col_narep - * - null column values for a given row are skipped, if the column replacement isn't valid - * - The separator is only applied between two valid column values - * - If valid @p separator_narep and @p col_narep are provided, the output column is always + * - The separator is applied between two output row values if the @p separate_nulls + * is `YES` or only between valid rows if @p separate_nulls is `NO`. + * - If @p separator_narep and @p col_narep are both valid, the output column is always * non nullable * * @code{.pseudo} @@ -83,16 +90,25 @@ std::unique_ptr join_strings( * c1 = [null, 'cc', 'dd', null, null, 'gg'] * c2 = ['bb', '', null, null, null, 'hh'] * sep = ['::', '%%', '^^', '!', '*', null] - * out0 = concatenate([c0, c1, c2], sep) - * out0 is ['aa::bb', 'cc%%', '^^dd', 'ee', null, null] + * out = concatenate({c0, c1, c2}, sep) + * // all rows have at least one null or sep[i]==null + * out is [null, null, null, null, null, null] * * sep_rep = '+' - * out1 = concatenate([c0, c1, c2], sep, sep_rep) - * out1 is ['aa::bb', 'cc%%', '^^dd', 'ee', null, 'ff+gg+hh'] - * - * col_rep = '-' - * out2 = concatenate([c0, c1, c2], sep, invalid_sep_rep, col_rep) - * out2 is ['aa::-::bb', '-%%cc%%', '^^dd^^-', 'ee!-!-', '-*-*-', null] + * out = concatenate({c0, c1, c2}, sep, sep_rep) + * // all rows with at least one null output as null + * out is [null, null, null, null, null, 'ff+gg+hh'] + * + * col_narep = '-' + * sep_na = non-valid scalar + * out = concatenate({c0, c1, c2}, sep, sep_na, col_narep) + * // only the null entry in the sep column produces a null row + * out is ['aa::-::bb', '-%%cc%%', '^^dd^^-', 'ee!-!-', '-*-*-', null] + * + * col_narep = '' + * out = concatenate({c0, c1, c2}, sep, sep_rep, col_narep, separator_on_nulls:NO) + * // parameter suppresses separator for null rows + * out is ['aa::bb', 'cc%%', '^^dd', 'ee', '', 'ff+gg+hh'] * @endcode * * @throw cudf::logic_error if no input columns are specified - table view is empty @@ -108,6 +124,8 @@ std::unique_ptr join_strings( * @param col_narep String that should be used in place of any null strings * found in any column. Default of invalid-scalar means no null column value replacements. * Default is an invalid string. + * @param separate_nulls If YES, then the separator is included for null rows + * if `col_narep` is valid. * @param mr Resource for allocating device memory. * @return New column with concatenated results. */ @@ -116,15 +134,9 @@ std::unique_ptr concatenate( strings_column_view const& separators, string_scalar const& separator_narep = string_scalar("", false), string_scalar const& col_narep = string_scalar("", false), + separator_on_nulls separate_nulls = separator_on_nulls::YES, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -/** - * @addtogroup strings_combine - * @{ - * @file strings/combine.hpp - * @brief Strings APIs for concatenate and join - */ - /** * @brief Row-wise concatenates the given list of strings columns and * returns a single strings column result. 
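The `separator_on_nulls` and `col_narep` behavior documented above for the separators-column overload of `cudf::strings::concatenate` can be exercised as in the following sketch; the test wrappers, helper function name, and data are assumptions for illustration.

```cpp
// Sketch only: test wrappers, function name, and data are illustrative.
#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/combine.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf_test/column_wrapper.hpp>

void concatenate_with_separator_column_example()
{
  cudf::test::strings_column_wrapper c0({"aa", "", "dd"});
  cudf::test::strings_column_wrapper c1({"bb", "cc", ""}, {true, true, false});  // last row null
  cudf::test::strings_column_wrapper sep({"::", "%%", "^^"});

  // Invalid separator replacement, "-" as the null-column replacement, and
  // separators kept next to replaced nulls (separator_on_nulls::YES).
  auto out = cudf::strings::concatenate(cudf::table_view{{c0, c1}},
                                        cudf::strings_column_view{sep},
                                        cudf::string_scalar("", false),
                                        cudf::string_scalar("-"),
                                        cudf::strings::separator_on_nulls::YES);
  // Expected: ["aa::bb", "%%cc", "dd^^-"].
}
```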
@@ -136,20 +148,30 @@ std::unique_ptr concatenate( * row to be null entry unless a narep string is specified to be used * in its place. * - * The number of strings in the columns provided must be the same. + * If @p separate_nulls is set to `NO` and @p narep is valid then + * separators are not added to the output between null elements. + * Otherwise, separators are always added if @p narep is valid. + * + * More than one column must be specified in the input @p strings_columns + * table. * * @code{.pseudo} * Example: - * s1 = ['aa', null, '', 'aa'] - * s2 = ['', 'bb', 'bb', null] - * r1 = concatenate([s1,s2]) - * r1 is ['aa', null, 'bb', null] - * r2 = concatenate([s1,s2],':','_') - * r2 is ['aa:', '_:bb', ':bb', 'aa:_'] + * s1 = ['aa', null, '', 'dd'] + * s2 = ['', 'bb', 'cc', null] + * out = concatenate({s1, s2}) + * out is ['aa', null, 'cc', null] + * + * out = concatenate({s1, s2}, ':', '_') + * out is ['aa:', '_:bb', ':cc', 'dd:_'] + * + * out = concatenate({s1, s2}, ':', '', separator_on_nulls::NO) + * out is ['aa:', 'bb', ':cc', 'dd'] * @endcode * * @throw cudf::logic_error if input columns are not all strings columns. * @throw cudf::logic_error if separator is not valid. + * @throw cudf::logic_error if only one column is specified * * @param strings_columns List of string columns to concatenate. * @param separator String that should inserted between each string from each row. @@ -157,6 +179,7 @@ std::unique_ptr concatenate( * @param narep String that should be used in place of any null strings * found in any column. Default of invalid-scalar means any null entry in any column will * produces a null result for that row. + * @param separate_nulls If YES, then the separator is included for null rows if `narep` is valid. * @param mr Device memory resource used to allocate the returned column's device memory. * @return New column with concatenated results. */ @@ -164,6 +187,7 @@ std::unique_ptr concatenate( table_view const& strings_columns, string_scalar const& separator = string_scalar(""), string_scalar const& narep = string_scalar("", false), + separator_on_nulls separate_nulls = separator_on_nulls::YES, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -171,24 +195,30 @@ std::unique_ptr concatenate( * within each row and returns a single strings column result. * * Each new string is created by concatenating the strings from the same row (same list element) - * delimited by the row separator provided in the `separators` strings column. + * delimited by the row separator provided in the @p separators strings column. * * A null list row will always result in a null string in the output row. Any non-null list row * having a null element will result in the corresponding output row to be null unless a valid - * `string_narep` scalar is provided to be used in its place. Any null row in the `separators` - * column will also result in a null output row unless a valid `separator_narep` scalar is provided + * @p string_narep scalar is provided to be used in its place. Any null row in the @p separators + * column will also result in a null output row unless a valid @p separator_narep scalar is provided * to be used in place of the null separators. * + * If @p separate_nulls is set to `NO` and @p narep is valid then separators are not added to the + * output between null elements. Otherwise, separators are always added if @p narep is valid. 
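For the scalar-separator overload of `cudf::strings::concatenate` documented earlier in this header, a sketch mirroring the `s1`/`s2` example; the wrappers and data are again illustrative assumptions.

```cpp
// Sketch only: mirrors the s1/s2 documentation example; wrappers and data are illustrative.
#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/combine.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf_test/column_wrapper.hpp>

void concatenate_with_scalar_separator_example()
{
  cudf::test::strings_column_wrapper s1({"aa", "", "", "dd"}, {true, false, true, true});
  cudf::test::strings_column_wrapper s2({"", "bb", "cc", ""}, {true, true, true, false});

  // Empty-string null replacement plus separator_on_nulls::NO suppresses the
  // separator next to replaced nulls.
  auto out = cudf::strings::concatenate(cudf::table_view{{s1, s2}},
                                        cudf::string_scalar(":"),
                                        cudf::string_scalar(""),
                                        cudf::strings::separator_on_nulls::NO);
  // Expected: ["aa:", "bb", ":cc", "dd"].
}
```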
+ * * @code{.pseudo} * Example: - * s = [ {'aa', 'bb', 'cc'}, null, {'', 'dd'}, {'ee', null}, {'ff', 'gg'} ] + * s = [ ['aa', 'bb', 'cc'], null, ['', 'dd'], ['ee', null], ['ff', 'gg'] ] * sep = ['::', '%%', '!', '*', null] * - * r1 = strings::concatenate_list_elements(s, sep) - * r1 is ['aa::bb::cc', null, '!dd', null, null] + * out = join_list_elements(s, sep) + * out is ['aa::bb::cc', null, '!dd', null, null] + * + * out = join_list_elements(s, sep, ':', '_') + * out is ['aa::bb::cc', null, '!dd', 'ee*_', 'ff:gg'] * - * r2 = strings::concatenate_list_elements(s, sep, ':', '_') - * r2 is ['aa::bb::cc', null, '!dd', 'ee*_', 'ff:gg'] + * out = join_list_elements(s, sep, ':', '', separator_on_nulls::NO) + * out is ['aa::bb::cc', null, '!dd', 'ee', 'ff:gg'] * @endcode * * @throw cudf::logic_error if input column is not lists of strings column. @@ -203,14 +233,16 @@ * @param string_narep String that should be used to replace null strings in any non-null list row, * default is an invalid-scalar denoting that list rows containing null strings will result * in null string in the corresponding output rows. + * @param separate_nulls If YES, then the separator is included for null rows if `narep` is valid. * @param mr Device memory resource used to allocate the returned column's device memory. * @return New strings column with concatenated results. */ -std::unique_ptr concatenate_list_elements( +std::unique_ptr join_list_elements( const lists_column_view& lists_strings_column, const strings_column_view& separators, string_scalar const& separator_narep = string_scalar("", false), string_scalar const& string_narep = string_scalar("", false), + separator_on_nulls separate_nulls = separator_on_nulls::YES, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -218,21 +250,27 @@ std::unique_ptr concatenate_list_elements( * within each row and returns a single strings column result. * * Each new string is created by concatenating the strings from the same row (same list element) - * delimited by the separator provided. + * delimited by the @p separator provided. * * A null list row will always result in a null string in the output row. Any non-null list row - * having a null elenent will result in the corresponding output row to be null unless a narep - * string is specified to be used in its place. + * having a null element will result in the corresponding output row being null unless a + * @p narep string is specified to be used in its place. + * + * If @p separate_nulls is set to `NO` and @p narep is valid then separators are not added to the + * output between null elements. Otherwise, separators are always added if @p narep is valid. * * @code{.pseudo} * Example: - * s = [ {'aa', 'bb', 'cc'}, null, {'', 'dd'}, {'ee', null}, {'ff'} ] + * s = [ ['aa', 'bb', 'cc'], null, ['', 'dd'], ['ee', null], ['ff'] ] + * + * out = join_list_elements(s) + * out is ['aabbcc', null, 'dd', null, 'ff'] * - * r1 = strings::concatenate_list_elements(s) - * r1 is ['aabbcc', null, 'dd', null, 'ff'] + * out = join_list_elements(s, ':', '_') + * out is ['aa:bb:cc', null, ':dd', 'ee:_', 'ff'] * - * r2 = strings::concatenate_list_elements(s, ':', '_') - * r2 is ['aa:bb:cc', null, ':dd', 'ee:_', 'ff'] + * out = join_list_elements(s, ':', '', separator_on_nulls::NO) + * out is ['aa:bb:cc', null, ':dd', 'ee', 'ff'] * @endcode * * @throw cudf::logic_error if input column is not lists of strings column. 
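A corresponding sketch for the renamed `cudf::strings::join_list_elements` (scalar-separator form); the wrapper, function name, and data are illustrative assumptions.

```cpp
// Sketch only: wrapper, function name, and data are illustrative.
#include <cudf/lists/lists_column_view.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/combine.hpp>
#include <cudf_test/column_wrapper.hpp>

void join_list_elements_example()
{
  // A lists-of-strings column: [["aa", "bb", "cc"], ["", "dd"], ["ff"]]
  cudf::test::lists_column_wrapper<cudf::string_view> input{
    {"aa", "bb", "cc"}, {"", "dd"}, {"ff"}};

  // Scalar separator ":" with "_" replacing null strings inside a list.
  auto out = cudf::strings::join_list_elements(
    cudf::lists_column_view{input}, cudf::string_scalar(":"), cudf::string_scalar("_"));
  // Expected: ["aa:bb:cc", ":dd", "ff"].
}
```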
@@ -244,13 +282,15 @@ std::unique_ptr concatenate_list_elements( * @param narep String that should be used to replace null strings in any non-null list row, default * is an invalid-scalar denoting that list rows containing null strings will result in null * string in the corresponding output rows. + * @param separate_nulls If YES, then the separator is included for null rows if `narep` is valid. * @param mr Device memory resource used to allocate the returned column's device memory. * @return New strings column with concatenated results. */ -std::unique_ptr concatenate_list_elements( +std::unique_ptr join_list_elements( const lists_column_view& lists_strings_column, string_scalar const& separator = string_scalar(""), string_scalar const& narep = string_scalar("", false), + separator_on_nulls separate_nulls = separator_on_nulls::YES, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/detail/combine.hpp b/cpp/include/cudf/strings/detail/combine.hpp index 6e25a4dfa38..d6bdf398886 100644 --- a/cpp/include/cudf/strings/detail/combine.hpp +++ b/cpp/include/cudf/strings/detail/combine.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ #include #include +#include #include #include @@ -36,6 +37,7 @@ std::unique_ptr concatenate( table_view const& strings_columns, string_scalar const& separator, string_scalar const& narep, + separator_on_nulls separate_nulls = separator_on_nulls::YES, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/include/doxygen_groups.h b/cpp/include/doxygen_groups.h index 11b907e7f16..dda8ce87432 100644 --- a/cpp/include/doxygen_groups.h +++ b/cpp/include/doxygen_groups.h @@ -143,7 +143,7 @@ * @} * @defgroup lists_apis Lists * @{ - * @defgroup lists_concatenate_rows Combining + * @defgroup lists_combine Combining * @defgroup lists_extract Extracting * @defgroup lists_contains Searching * @defgroup lists_gather Gathering diff --git a/cpp/src/column/column_factories.cu b/cpp/src/column/column_factories.cu index 60e642ea3d5..6ba8497b320 100644 --- a/cpp/src/column/column_factories.cu +++ b/cpp/src/column/column_factories.cu @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -32,6 +33,7 @@ struct column_from_scalar_dispatch { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const { + if (size == 0) return make_empty_column(value.type()); if (!value.is_valid()) return make_fixed_width_column(value.type(), size, mask_state::ALL_NULL, stream, mr); auto output_column = @@ -49,6 +51,7 @@ std::unique_ptr column_from_scalar_dispatch::operator() column_from_scalar_dispatch::operator()(&value); + return lists::detail::make_lists_column_from_scalar(*lv, size, stream, mr); } template <> @@ -94,6 +98,7 @@ std::unique_ptr column_from_scalar_dispatch::operator() const&>(value); auto iter = thrust::make_constant_iterator(0); @@ -117,7 +122,6 @@ std::unique_ptr make_column_from_scalar(scalar const& s, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - if (size == 0) return make_empty_column(s.type()); return type_dispatcher(s.type(), column_from_scalar_dispatch{}, s, size, stream, mr); } diff --git 
a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu index a5fd6d6f9bb..f132d6b1511 100644 --- a/cpp/src/groupby/groupby.cu +++ b/cpp/src/groupby/groupby.cu @@ -79,6 +79,44 @@ std::pair, std::vector> groupby::disp groupby::~groupby() = default; namespace { + +/** + * @brief Factory to construct empty result columns. + * + * Adds special handling for COLLECT_LIST/COLLECT_SET, because: + * 1. `make_empty_column()` does not support construction of nested columns. + * 2. Empty lists need empty child columns, to persist type information. + */ +struct empty_column_constructor { + column_view values; + + template + std::unique_ptr operator()() const + { + using namespace cudf; + using namespace cudf::detail; + + if constexpr (k == aggregation::Kind::COLLECT_LIST || k == aggregation::Kind::COLLECT_SET) { + return make_lists_column( + 0, make_empty_column(data_type{type_to_id()}), empty_like(values), 0, {}); + } + + // If `values` is LIST typed, and the aggregation results match the type, + // construct empty results based on `values`. + // Most generally, this applies if input type matches output type. + // + // Note: `target_type_t` is not recursive, and `ValuesType` does not consider children. + // It is important that `COLLECT_LIST` and `COLLECT_SET` are handled before this + // point, because `COLLECT_LIST(LIST)` produces `LIST`, but `target_type_t` + // wouldn't know the difference. + if constexpr (std::is_same_v, ValuesType>) { + return empty_like(values); + } + + return make_empty_column(target_type(values.type(), k)); + } +}; + /// Make an empty table with appropriate types for requested aggs auto empty_results(host_span requests) { @@ -93,7 +131,8 @@ auto empty_results(host_span requests) request.aggregations.end(), std::back_inserter(results), [&request](auto const& agg) { - return make_empty_column(cudf::detail::target_type(request.values.type(), agg->kind)); + return cudf::detail::dispatch_type_and_aggregation( + request.values.type(), agg->kind, empty_column_constructor{request.values}); }); return aggregation_result{std::move(results)}; diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu index d2b6be5eead..bc0e1243d4f 100644 --- a/cpp/src/io/csv/writer_impl.cu +++ b/cpp/src/io/csv/writer_impl.cu @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -119,7 +120,8 @@ struct column_to_strings_fn { return not((std::is_same::value) || (std::is_integral::value) || (std::is_floating_point::value) || - (cudf::is_timestamp()) || (cudf::is_duration())); + (cudf::is_fixed_point()) || (cudf::is_timestamp()) || + (cudf::is_duration())); } explicit column_to_strings_fn( @@ -189,6 +191,15 @@ struct column_to_strings_fn { return cudf::strings::detail::from_floats(column, stream_, mr_); } + // fixed point: + // + template + std::enable_if_t(), std::unique_ptr> operator()( + column_view const& column) const + { + return cudf::strings::detail::from_fixed_point(column, stream_, mr_); + } + // timestamps: // template @@ -404,11 +415,19 @@ void writer::impl::write(table_view const& table, auto str_table_view = str_table_ptr->view(); // concatenate columns in each row into one big string column - //(using null representation and delimiter): + // (using null representation and delimiter): // std::string delimiter_str{options_.get_inter_column_delimiter()}; - auto str_concat_col = cudf::strings::detail::concatenate( - str_table_view, delimiter_str, options_.get_na_rep(), stream); + auto str_concat_col = [&] { + if (str_table_view.num_columns() > 1) + return 
cudf::strings::detail::concatenate(str_table_view, + delimiter_str, + options_.get_na_rep(), + strings::separator_on_nulls::YES, + stream); + cudf::string_scalar narep{options_.get_na_rep()}; + return cudf::strings::detail::replace_nulls(str_table_view.column(0), narep, stream); + }(); write_chunked(str_concat_col->view(), metadata, stream); } diff --git a/cpp/src/lists/combine/concatenate_list_elements.cu b/cpp/src/lists/combine/concatenate_list_elements.cu new file mode 100644 index 00000000000..c5a28a8ec5f --- /dev/null +++ b/cpp/src/lists/combine/concatenate_list_elements.cu @@ -0,0 +1,292 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +namespace cudf { +namespace lists { +namespace detail { +namespace { +/** + * @brief Concatenate lists within the same row into one list, ignoring any null list during + * concatenation. + */ +std::unique_ptr concatenate_lists_ignore_null(column_view const& input, + bool build_null_mask, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto const num_rows = input.size(); + + static_assert(std::is_same_v && std::is_same_v); + auto out_offsets = make_numeric_column( + data_type{type_id::INT32}, num_rows + 1, mask_state::UNALLOCATED, stream, mr); + + // The array of int8_t stores validities for the output list elements. + auto validities = rmm::device_uvector(build_null_mask ? num_rows : 0, stream); + + auto const d_out_offsets = out_offsets->mutable_view().template begin(); + auto const d_row_offsets = lists_column_view(input).offsets_begin(); + auto const d_list_offsets = lists_column_view(lists_column_view(input).child()).offsets_begin(); + auto const lists_dv_ptr = column_device_view::create(lists_column_view(input).child()); + + // Concatenating the lists at the same row by converting the entry offsets from the child column + // into row offsets of the root column. Those entry offsets are subtracted by the first entry + // offset to output zero-based offsets. + auto const iter = thrust::make_counting_iterator(0); + thrust::transform(rmm::exec_policy(stream), + iter, + iter + num_rows + 1, + d_out_offsets, + [d_row_offsets, + d_list_offsets, + lists_dv = *lists_dv_ptr, + d_validities = validities.begin(), + build_null_mask, + iter] __device__(auto const idx) { + if (build_null_mask) { + // The output row will be null only if all lists on the input row are null. 
+ auto const is_valid = thrust::any_of(thrust::seq, + iter + d_row_offsets[idx], + iter + d_row_offsets[idx + 1], + [&] __device__(auto const list_idx) { + return lists_dv.is_valid(list_idx); + }); + d_validities[idx] = static_cast(is_valid); + } + auto const start_offset = d_list_offsets[d_row_offsets[0]]; + return d_list_offsets[d_row_offsets[idx]] - start_offset; + }); + + // The child column of the output lists column is just copied from the input column. + auto out_entries = std::make_unique( + lists_column_view(lists_column_view(input).get_sliced_child(stream)).get_sliced_child(stream)); + + auto [null_mask, null_count] = [&] { + return build_null_mask + ? cudf::detail::valid_if( + validities.begin(), validities.end(), thrust::identity{}, stream, mr) + : std::make_pair(cudf::detail::copy_bitmask(input, stream, mr), input.null_count()); + }(); + + return make_lists_column(num_rows, + std::move(out_offsets), + std::move(out_entries), + null_count, + null_count > 0 ? std::move(null_mask) : rmm::device_buffer{}, + stream, + mr); +} + +/** + * @brief Generate list offsets and list validities for the output lists column. + * + * This function is called only when (has_null_list == true and null_policy == NULLIFY_OUTPUT_ROW). + */ +std::pair, rmm::device_uvector> +generate_list_offsets_and_validities(column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto const num_rows = input.size(); + + static_assert(std::is_same_v && std::is_same_v); + auto out_offsets = make_numeric_column( + data_type{type_id::INT32}, num_rows + 1, mask_state::UNALLOCATED, stream, mr); + + auto const lists_of_lists_dv_ptr = column_device_view::create(input); + auto const lists_dv_ptr = column_device_view::create(lists_column_view(input).child()); + auto const d_out_offsets = out_offsets->mutable_view().template begin(); + auto const d_row_offsets = lists_column_view(input).offsets_begin(); + auto const d_list_offsets = lists_column_view(lists_column_view(input).child()).offsets_begin(); + + // The array of int8_t stores validities for the output list elements. + auto validities = rmm::device_uvector(num_rows, stream); + + // Compute output list sizes and validities. + auto const iter = thrust::make_counting_iterator(0); + thrust::transform( + rmm::exec_policy(stream), + iter, + iter + num_rows, + d_out_offsets, + [lists_of_lists_dv = *lists_of_lists_dv_ptr, + lists_dv = *lists_dv_ptr, + d_row_offsets, + d_list_offsets, + d_validities = validities.begin(), + iter] __device__(auto const idx) { + if (d_row_offsets[idx] == d_row_offsets[idx + 1]) { // This is a null/empty row. + d_validities[idx] = static_cast(lists_of_lists_dv.is_valid(idx)); + return size_type{0}; + } + // The output row will not be null only if all lists on the input row are not null. + auto const is_valid = + thrust::all_of(thrust::seq, + iter + d_row_offsets[idx], + iter + d_row_offsets[idx + 1], + [&] __device__(auto const list_idx) { return lists_dv.is_valid(list_idx); }); + d_validities[idx] = static_cast(is_valid); + if (!is_valid) { return size_type{0}; } + + // Compute size of the output list as sum of sizes of all lists in the current input row. + return d_list_offsets[d_row_offsets[idx + 1]] - d_list_offsets[d_row_offsets[idx]]; + }); + + // Compute offsets from sizes. 
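To make the offsets step concrete: the transform above fills `d_out_offsets` with per-row sizes, and the `thrust::exclusive_scan` just below converts them to offsets in place. A small host-side analogue in standard C++ (not part of the change, values illustrative):

```cpp
// Host-side analogue of the sizes-to-offsets step used by the device code.
#include <numeric>
#include <vector>

void offsets_from_sizes_example()
{
  // Sizes for three output rows (5, 0 and 4 entries) plus one trailing slot,
  // mirroring the num_rows + 1 sized offsets column used by the device code.
  std::vector<int> offsets{5, 0, 4, 0};
  std::exclusive_scan(offsets.begin(), offsets.end(), offsets.begin(), 0);
  // offsets is now {0, 5, 5, 9}; row i's entries span [offsets[i], offsets[i+1]).
}
```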
+ thrust::exclusive_scan( + rmm::exec_policy(stream), d_out_offsets, d_out_offsets + num_rows + 1, d_out_offsets); + + return {std::move(out_offsets), std::move(validities)}; +} + +/** + * @brief Gather entries from the input lists column, ignoring rows that have null list elements. + * + * This function is called only when (has_null_list == true and null_policy == NULLIFY_OUTPUT_ROW). + */ +std::unique_ptr gather_list_entries(column_view const& input, + column_view const& output_list_offsets, + size_type num_rows, + size_type num_output_entries, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto const child_col = lists_column_view(input).child(); + auto const entry_col = lists_column_view(child_col).child(); + auto const d_row_offsets = lists_column_view(input).offsets_begin(); + auto const d_list_offsets = lists_column_view(child_col).offsets_begin(); + auto gather_map = rmm::device_uvector(num_output_entries, stream); + + // Fill the gather map with indices of the lists from the child column of the input column. + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + num_rows, + [d_row_offsets, + d_list_offsets, + d_indices = gather_map.begin(), + d_out_list_offsets = + output_list_offsets.template begin()] __device__(size_type const idx) { + // The output row has been identified as a null/empty list during list size computation. + if (d_out_list_offsets[idx + 1] == d_out_list_offsets[idx]) { return; } + + // The indices of the list elements on the row `idx` of the input column. + thrust::sequence(thrust::seq, + d_indices + d_out_list_offsets[idx], + d_indices + d_out_list_offsets[idx + 1], + d_list_offsets[d_row_offsets[idx]]); + }); + + auto result = cudf::detail::gather(table_view{{entry_col}}, + gather_map.begin(), + gather_map.end(), + out_of_bounds_policy::DONT_CHECK, + stream, + mr); + return std::move(result->release()[0]); +} + +std::unique_ptr concatenate_lists_nullifying_rows(column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + // Generate offsets and validities of the output lists column. + auto [list_offsets, list_validities] = generate_list_offsets_and_validities(input, stream, mr); + auto const offsets_view = list_offsets->view(); + + auto const num_rows = input.size(); + auto const num_output_entries = + cudf::detail::get_value(offsets_view, num_rows, stream); + + auto list_entries = + gather_list_entries(input, offsets_view, num_rows, num_output_entries, stream, mr); + auto [null_mask, null_count] = cudf::detail::valid_if( + list_validities.begin(), list_validities.end(), thrust::identity{}, stream, mr); + + return make_lists_column(num_rows, + std::move(list_offsets), + std::move(list_entries), + null_count, + null_count ? std::move(null_mask) : rmm::device_buffer{}, + stream, + mr); +} + +} // namespace + +/** + * @copydoc cudf::lists::concatenate_list_elements + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr concatenate_list_elements(column_view const& input, + concatenate_null_policy null_policy, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto type = input.type(); // Column that is lists of lists. + CUDF_EXPECTS(type.id() == type_id::LIST, "Input column must be a lists column."); + + auto col = lists_column_view(input).child(); // Rows, which are lists. 
+ type = col.type(); + CUDF_EXPECTS(type.id() == type_id::LIST, "Rows of the input column must be lists."); + + col = lists_column_view(col).child(); // The last level entries what we need to check. + type = col.type(); + CUDF_EXPECTS(type.id() == type_id::LIST || !cudf::is_nested(type), + "Entry of the input lists column must be of list or non-nested types."); + + if (input.size() == 0) { return cudf::empty_like(input); } + + bool has_null_list = lists_column_view(input).child().has_nulls(); + + return (null_policy == concatenate_null_policy::IGNORE || !has_null_list) + ? concatenate_lists_ignore_null(input, has_null_list, stream, mr) + : concatenate_lists_nullifying_rows(input, stream, mr); +} + +} // namespace detail + +/** + * @copydoc cudf::lists::concatenate_list_elements + */ +std::unique_ptr concatenate_list_elements(column_view const& input, + concatenate_null_policy null_policy, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::concatenate_list_elements(input, null_policy, rmm::cuda_stream_default, mr); +} + +} // namespace lists +} // namespace cudf diff --git a/cpp/src/lists/combine/concatenate_rows.cu b/cpp/src/lists/combine/concatenate_rows.cu new file mode 100644 index 00000000000..fdd71aea7bf --- /dev/null +++ b/cpp/src/lists/combine/concatenate_rows.cu @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +namespace cudf { +namespace lists { +namespace detail { +/** + * @copydoc cudf::lists::concatenate_rows + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr concatenate_rows(table_view const& input, + concatenate_null_policy null_policy, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(input.num_columns() > 0, "The input table must have at least one column."); + + auto const entry_type = lists_column_view(*input.begin()).child().type(); + for (auto const& col : input) { + CUDF_EXPECTS(col.type().id() == type_id::LIST, + "All columns of the input table must be of lists column type."); + + auto const child_col = lists_column_view(col).child(); + CUDF_EXPECTS(not cudf::is_nested(child_col.type()), "Nested types are not supported."); + CUDF_EXPECTS(entry_type == child_col.type(), + "The types of entries in the input columns must be the same."); + } + + auto const num_rows = input.num_rows(); + auto const num_cols = input.num_columns(); + if (num_rows == 0) { return cudf::empty_like(input.column(0)); } + if (num_cols == 1) { return std::make_unique(*(input.begin()), stream, mr); } + + // Memory resource for temporary data. + auto const default_mr = rmm::mr::get_current_device_resource(); + + // Interleave the input table into one column. 
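Before the interleaving step below, a worked data walkthrough (values purely illustrative) of the strategy `concatenate_rows` now uses: interleave the input columns, regroup every `num_cols` lists into one row, then reuse `concatenate_list_elements`.

```cpp
// Illustrative walkthrough (values assumed) of the strategy implemented below:
//
//   col0 = [ [1, 2], [3]    ]
//   col1 = [ [4],    [5, 6] ]
//
// interleave_columns places the rows of each input column next to each other:
//
//   interleaved = [ [1, 2], [4], [3], [5, 6] ]
//
// Wrapping it with offsets {0, 2, 4} (computed as idx * num_cols) groups every
// num_cols consecutive lists into one row of a lists-of-lists column:
//
//   nested = [ [[1, 2], [4]], [[3], [5, 6]] ]
//
// concatenate_list_elements then flattens each row to give the final result:
//
//   result = [ [1, 2, 4], [3, 5, 6] ]
```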
+ auto const has_null_mask = std::any_of( + std::cbegin(input), std::cend(input), [](auto const& col) { return col.nullable(); }); + auto interleaved_columns = detail::interleave_columns(input, has_null_mask, stream, default_mr); + + // Generate a lists column which has child column is the interleaved_columns. + // The new nested lists column will have each row is a list of `num_cols` list elements. + static_assert(std::is_same_v and std::is_same_v); + auto list_offsets = make_numeric_column( + data_type{type_id::INT32}, num_rows + 1, mask_state::UNALLOCATED, stream, default_mr); + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_rows + 1), + list_offsets->mutable_view().template begin(), + [num_cols] __device__(auto const idx) { return idx * num_cols; }); + auto const nested_lists_col = make_lists_column(num_rows, + std::move(list_offsets), + std::move(interleaved_columns), + 0, + rmm::device_buffer{}, + stream, + default_mr); + + // Concatenate lists on each row of the nested lists column, producing the desired output. + return concatenate_list_elements(nested_lists_col->view(), null_policy, stream, mr); +} + +} // namespace detail + +/** + * @copydoc cudf::lists::concatenate_rows + */ +std::unique_ptr concatenate_rows(table_view const& lists_columns, + concatenate_null_policy null_policy, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::concatenate_rows(lists_columns, null_policy, rmm::cuda_stream_default, mr); +} + +} // namespace lists +} // namespace cudf diff --git a/cpp/src/lists/concatenate_rows.cu b/cpp/src/lists/concatenate_rows.cu deleted file mode 100644 index 8528a7680f7..00000000000 --- a/cpp/src/lists/concatenate_rows.cu +++ /dev/null @@ -1,441 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include -#include -#include - -namespace cudf { -namespace lists { -namespace detail { -namespace { -/** - * @brief Concatenate lists within the same row into one list, ignoring any null list during - * concatenation. - */ -std::unique_ptr concatenate_rows_ignore_null(table_view const& input, - bool has_null_mask, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - auto const num_output_lists = input.num_rows(); - auto const table_dv_ptr = table_device_view::create(input); - - // Interleave the list element from the input table, thus all the lists at the same row now stay - // next to each other. - auto interleaved_columns = detail::interleave_columns(input, has_null_mask, stream); - - // Modify the list offsets to combine lists of the same input row. 
- static_assert(sizeof(offset_type) == sizeof(int32_t)); - static_assert(sizeof(size_type) == sizeof(int32_t)); - auto list_offsets = make_numeric_column( - data_type{type_id::INT32}, num_output_lists + 1, mask_state::UNALLOCATED, stream, mr); - auto const d_offsets = list_offsets->mutable_view().template begin(); - - // The array of int8_t to store validities for list elements. - // Since we combine multiple lists, we may need to recompute list validities. - auto validities = rmm::device_uvector(has_null_mask ? num_output_lists : 0, stream); - - // For an input table of `n` columns, if after interleaving we have the list offsets are - // [ i_0, i_1, ..., i_n, i_n+1, ..., i_2n, ... ] then to concatenate them just modify the offsets - // to be [ i_0, i_n, i_2n, i_3n, ... ]. - auto const d_interleaved_offsets = lists_column_view(interleaved_columns->view()).offsets_begin(); - thrust::transform( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_output_lists + 1), - d_offsets, - [d_interleaved_offsets, - num_cols = input.num_columns(), - table_dv = *table_dv_ptr, - d_validities = validities.begin(), - has_null_mask] __device__(auto const idx) { - if (has_null_mask) { - auto const any_valid = thrust::any_of( - thrust::seq, table_dv.begin(), table_dv.end(), [idx](auto const& list_col) { - return list_col.is_valid(idx); - }); - d_validities[idx] = static_cast(any_valid); - } - return d_interleaved_offsets[idx * num_cols]; - }); - - auto [null_mask, null_count] = [&] { - return has_null_mask - ? cudf::detail::valid_if( - validities.begin(), validities.end(), thrust::identity{}, stream, mr) - : std::make_pair(rmm::device_buffer{}, size_type{0}); - }(); - - // The child column containing list entries is taken from the `interleaved_columns` column. - auto interleaved_columns_content = interleaved_columns->release(); - - return make_lists_column( - num_output_lists, - std::move(list_offsets), - std::move(interleaved_columns_content.children[lists_column_view::child_column_index]), - null_count, - null_count > 0 ? std::move(null_mask) : rmm::device_buffer{}, - stream, - mr); -} - -/** - * @brief Generate list offsets and list validities for the output lists column from the table_view - * of the input lists columns. - * - * This function is called only when (has_null_mask == true and null_policy == NULLIFY_OUTPUT_ROW). - */ -std::pair, rmm::device_uvector> -generate_list_offsets_and_validities(table_view const& input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - auto const num_output_lists = input.num_rows(); - auto const table_dv_ptr = table_device_view::create(input); - - // The output offsets column. - static_assert(sizeof(offset_type) == sizeof(int32_t)); - static_assert(sizeof(size_type) == sizeof(int32_t)); - auto list_offsets = make_numeric_column( - data_type{type_id::INT32}, num_output_lists + 1, mask_state::UNALLOCATED, stream, mr); - auto const d_offsets = list_offsets->mutable_view().template begin(); - - // The array of int8_t to store validities for list elements. - auto validities = rmm::device_uvector(num_output_lists, stream); - - // Compute list sizes and validities. 
- thrust::transform( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_output_lists), - d_offsets, - [table_dv = *table_dv_ptr, d_validities = validities.begin()] __device__(size_type const idx) { - auto const all_valid = - thrust::all_of(thrust::seq, table_dv.begin(), table_dv.end(), [idx](auto const& list_col) { - return list_col.is_valid(idx); - }); - d_validities[idx] = static_cast(all_valid); - if (not all_valid) return size_type{0}; - - // Compute size of the output list as sum of sizes of input lists - return thrust::transform_reduce( - thrust::seq, - table_dv.begin(), - table_dv.end(), - [idx] __device__(auto const& lists_col) { - auto const list_offsets = - lists_col.child(lists_column_view::offsets_column_index).template data() + - lists_col.offset(); - return list_offsets[idx + 1] - list_offsets[idx]; // list size - }, - size_type{0}, - thrust::plus{}); - }); - - // Compute offsets from sizes. - thrust::exclusive_scan( - rmm::exec_policy(stream), d_offsets, d_offsets + num_output_lists + 1, d_offsets); - - return {std::move(list_offsets), std::move(validities)}; -} - -/** - * @brief Compute string sizes, string validities, and concatenate string lists functor. - * - * This functor is called only when (has_null_mask == true and null_policy == NULLIFY_OUTPUT_ROW). - * It is executed twice. In the first pass, the sizes and validities of the output strings will be - * computed. In the second pass, this will concatenate the lists of strings on the same row from the - * given input table. - */ -struct compute_string_sizes_and_concatenate_lists_fn { - table_device_view const table_dv; - - // Store list offsets of the output lists column. - offset_type const* const dst_list_offsets; - - // Store offsets of the strings. - offset_type* d_offsets{nullptr}; - - // If d_chars == nullptr: only compute sizes and validities of the output strings. - // If d_chars != nullptr: only concatenate lists of strings. - char* d_chars{nullptr}; - - // We need to set `1` or `0` for the validities of the strings in the child column. - int8_t* d_validities{nullptr}; - - __device__ void operator()(size_type const idx) - { - // The current row contain null, which has been identified during offsets computation. - if (dst_list_offsets[idx + 1] == dst_list_offsets[idx]) { return; } - - // read_idx and write_idx are indices of string elements. - size_type write_idx = dst_list_offsets[idx]; - thrust::for_each( - thrust::seq, table_dv.begin(), table_dv.end(), [&] __device__(auto const& lists_col) { - auto const list_offsets = - lists_col.child(lists_column_view::offsets_column_index).template data() + - lists_col.offset(); - auto const& str_col = lists_col.child(lists_column_view::child_column_index); - auto const str_offsets = - str_col.child(strings_column_view::offsets_column_index).template data(); - - // The range of indices of the strings within the source list. - auto const start_str_idx = list_offsets[idx]; - auto const end_str_idx = list_offsets[idx + 1]; - - if (not d_chars) { // just compute sizes of strings within a list - for (auto read_idx = start_str_idx; read_idx < end_str_idx; ++read_idx, ++write_idx) { - d_validities[write_idx] = static_cast(str_col.is_valid(read_idx)); - d_offsets[write_idx] = str_offsets[read_idx + 1] - str_offsets[read_idx]; - } - } else { // just copy the entire memory region containing all strings in the list - // start_byte and end_byte are indices of character of the string elements. 
- auto const start_byte = str_offsets[start_str_idx]; - auto const end_byte = str_offsets[end_str_idx]; - if (start_byte < end_byte) { - auto const input_ptr = - str_col.child(strings_column_view::chars_column_index).template data() + - start_byte; - auto const output_ptr = d_chars + d_offsets[write_idx]; - thrust::copy(thrust::seq, input_ptr, input_ptr + end_byte - start_byte, output_ptr); - } - write_idx += end_str_idx - start_str_idx; - } - }); - } -}; - -/** - * @brief Struct used in type_dispatcher to interleave list entries of the input lists columns and - * output the results into a destination column. - * - * This functor is called only when (has_null_mask == true and null_policy == NULLIFY_OUTPUT_ROW). - */ -struct concatenate_lists_fn { - template - std::enable_if_t, std::unique_ptr> operator()( - table_view const& input, - column_view const& output_list_offsets, - size_type num_output_lists, - size_type num_output_entries, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const noexcept - { - auto const table_dv_ptr = table_device_view::create(input); - auto const comp_fn = compute_string_sizes_and_concatenate_lists_fn{ - *table_dv_ptr, output_list_offsets.template begin()}; - - // Generate a null mask because the input table has nullable column. - auto [offsets_column, chars_column, null_mask, null_count] = - cudf::strings::detail::make_strings_children_with_null_mask( - comp_fn, num_output_lists, num_output_entries, stream, mr); - - return make_strings_column(num_output_entries, - std::move(offsets_column), - std::move(chars_column), - null_count, - std::move(null_mask), - stream, - mr); - } - - template - std::enable_if_t(), std::unique_ptr> operator()( - table_view const& input, - column_view const& output_list_offsets, - size_type num_output_lists, - size_type num_output_entries, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const noexcept - { - auto const table_dv_ptr = table_device_view::create(input); - - // The output child column. - auto const child_col = lists_column_view(*input.begin()).child(); - auto output = - allocate_like(child_col, num_output_entries, mask_allocation_policy::NEVER, stream, mr); - auto output_dv_ptr = mutable_column_device_view::create(*output); - - // The array of int8_t to store entry validities. - auto validities = rmm::device_uvector(num_output_entries, stream); - - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - num_output_lists, - [num_cols = input.num_columns(), - table_dv = *table_dv_ptr, - d_validities = validities.begin(), - dst_list_offsets = output_list_offsets.template begin(), - d_output = output_dv_ptr->template begin()] __device__(size_type const idx) { - // The output row has been identified as a null list during list size computation. - if (dst_list_offsets[idx + 1] == dst_list_offsets[idx]) { return; } - - auto write_start = dst_list_offsets[idx]; - thrust::for_each( - thrust::seq, table_dv.begin(), table_dv.end(), [&] __device__(auto const& lists_col) { - auto const list_offsets = lists_col.child(lists_column_view::offsets_column_index) - .template data() + - lists_col.offset(); - auto const& data_col = lists_col.child(lists_column_view::child_column_index); - - // The range of indices of the entries within the source list. - auto const start_idx = list_offsets[idx]; - auto const end_idx = list_offsets[idx + 1]; - - // Fill the validities array. 
- for (auto read_idx = start_idx, write_idx = write_start; read_idx < end_idx; - ++read_idx, ++write_idx) { - d_validities[write_idx] = static_cast(data_col.is_valid(read_idx)); - } - // Do a copy for the entire list entries. - auto const input_ptr = - reinterpret_cast(data_col.template data() + start_idx); - auto const output_ptr = reinterpret_cast(&d_output[write_start]); - thrust::copy( - thrust::seq, input_ptr, input_ptr + sizeof(T) * (end_idx - start_idx), output_ptr); - write_start += end_idx - start_idx; - }); - }); - - auto [null_mask, null_count] = cudf::detail::valid_if( - validities.begin(), validities.end(), thrust::identity{}, stream, mr); - if (null_count > 0) { output->set_null_mask(null_mask, null_count); } - - return output; - } - - template - std::enable_if_t and not cudf::is_fixed_width(), - std::unique_ptr> - operator()(table_view const&, - column_view const&, - size_type, - size_type, - rmm::cuda_stream_view, - rmm::mr::device_memory_resource*) const - { - // Currently, only support string_view and fixed-width types - CUDF_FAIL("Called `concatenate_lists_fn()` on non-supported types."); - } -}; - -std::unique_ptr concatenate_with_nullifying_rows(table_view const& input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - // Generate offsets of the output lists column. - auto [list_offsets, list_validities] = generate_list_offsets_and_validities(input, stream, mr); - auto const offsets_view = list_offsets->view(); - - // Copy entries from the input lists columns to the output lists column - this needed to be - // specialized for different types. - auto const num_output_lists = input.num_rows(); - auto const num_output_entries = - cudf::detail::get_value(offsets_view, num_output_lists, stream); - auto list_entries = - type_dispatcher(lists_column_view(*input.begin()).child().type(), - concatenate_lists_fn{}, - input, - offsets_view, - num_output_lists, - num_output_entries, - stream, - mr); - - auto [null_mask, null_count] = cudf::detail::valid_if( - list_validities.begin(), list_validities.end(), thrust::identity{}, stream, mr); - return make_lists_column(num_output_lists, - std::move(list_offsets), - std::move(list_entries), - null_count, - null_count ? std::move(null_mask) : rmm::device_buffer{}, - stream, - mr); -} - -} // namespace - -/** - * @copydoc cudf::lists::concatenate_rows - * - * @param stream CUDA stream used for device memory operations and kernel launches. - */ -std::unique_ptr concatenate_rows(table_view const& input, - concatenate_null_policy null_policy, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - CUDF_EXPECTS(input.num_columns() > 0, "The input table must have at least one column."); - - auto const entry_type = lists_column_view(*input.begin()).child().type(); - for (auto const& col : input) { - CUDF_EXPECTS(col.type().id() == type_id::LIST, - "All columns of the input table must be of lists column type."); - - auto const child_col = lists_column_view(col).child(); - CUDF_EXPECTS(not cudf::is_nested(child_col.type()), "Nested types are not supported."); - CUDF_EXPECTS(entry_type == child_col.type(), - "The types of entries in the input columns must be the same."); - } - - if (input.num_rows() == 0) { return cudf::empty_like(input.column(0)); } - if (input.num_columns() == 1) { return std::make_unique(*(input.begin()), stream, mr); } - - // List concatenation can be implemented by simply interleaving the lists columns, then modify the - // list offsets. 
- auto const has_null_mask = std::any_of( - std::cbegin(input), std::cend(input), [](auto const& col) { return col.nullable(); }); - if (not has_null_mask or null_policy == concatenate_null_policy::IGNORE) { - return concatenate_rows_ignore_null(input, has_null_mask, stream, mr); - } - - // Both conditions satisfied: has_null_mask == true and - // null_policy == NULLIFY_OUTPUT_ROW. - return concatenate_with_nullifying_rows(input, stream, mr); -} - -} // namespace detail - -/** - * @copydoc cudf::lists::concatenate_rows - */ -std::unique_ptr concatenate_rows(table_view const& lists_columns, - concatenate_null_policy null_policy, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::concatenate_rows(lists_columns, null_policy, rmm::cuda_stream_default, mr); -} - -} // namespace lists -} // namespace cudf diff --git a/cpp/src/lists/copying/copying.cu b/cpp/src/lists/copying/copying.cu index 3275a496cfd..ff4649f4945 100644 --- a/cpp/src/lists/copying/copying.cu +++ b/cpp/src/lists/copying/copying.cu @@ -84,19 +84,6 @@ std::unique_ptr copy_slice(lists_column_view const& lists, std::move(null_mask)); } -std::unique_ptr make_empty_lists_column(data_type child_type, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - return cudf::make_lists_column(0, - make_empty_column(data_type{type_to_id()}), - make_empty_column(child_type), - 0, // Null count - rmm::device_buffer{0, stream, mr}, // Null mask - stream, - mr); -} - } // namespace detail } // namespace lists } // namespace cudf diff --git a/cpp/src/lists/lists_column_factories.cu b/cpp/src/lists/lists_column_factories.cu index ebf5e07f76a..3291aeb9f22 100644 --- a/cpp/src/lists/lists_column_factories.cu +++ b/cpp/src/lists/lists_column_factories.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,10 +16,75 @@ #include #include +#include +#include #include +#include + +#include +#include namespace cudf { +namespace lists { +namespace detail { + +std::unique_ptr make_lists_column_from_scalar(list_scalar const& value, + size_type size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + if (size == 0) { + return make_lists_column(0, + make_empty_column(data_type{type_to_id()}), + empty_like(value.view()), + 0, + cudf::detail::create_null_mask(0, mask_state::UNALLOCATED, stream, mr), + stream, + mr); + } + auto mr_final = size == 1 ? mr : rmm::mr::get_current_device_resource(); + + // Handcraft a 1-row column + auto offsets = make_numeric_column( + data_type{type_to_id()}, 2, mask_state::UNALLOCATED, stream, mr_final); + auto m_offsets = offsets->mutable_view(); + thrust::sequence(rmm::exec_policy(stream), + m_offsets.begin(), + m_offsets.end(), + 0, + value.view().size()); + size_type null_count = value.is_valid(stream) ? 0 : 1; + auto null_mask_state = null_count ? 
mask_state::ALL_NULL : mask_state::UNALLOCATED; + auto null_mask = cudf::detail::create_null_mask(1, null_mask_state, stream, mr_final); + + if (size == 1) { + auto child = std::make_unique(value.view(), stream, mr_final); + return make_lists_column( + 1, std::move(offsets), std::move(child), null_count, std::move(null_mask), stream, mr_final); + } + + auto children_views = std::vector{offsets->view(), value.view()}; + auto one_row_col_view = column_view(data_type{type_id::LIST}, + 1, + nullptr, + static_cast(null_mask.data()), + null_count, + 0, + children_views); + + auto begin = thrust::make_constant_iterator(0); + auto res = cudf::detail::gather(table_view({one_row_col_view}), + begin, + begin + size, + out_of_bounds_policy::DONT_CHECK, + stream, + mr_final); + return std::move(res->release()[0]); +} + +} // namespace detail +} // namespace lists /** * @copydoc cudf::make_lists_column diff --git a/cpp/src/strings/combine/concatenate.cu b/cpp/src/strings/combine/concatenate.cu index 5d7b9152ff3..1329ad3113f 100644 --- a/cpp/src/strings/combine/concatenate.cu +++ b/cpp/src/strings/combine/concatenate.cu @@ -41,67 +41,93 @@ namespace strings { namespace detail { namespace { -/** - * @brief Concatenate strings functor - * - * This will concatenate the strings from each row of the given table - * and apply the separator. The null-replacement string `d_narep` is - * used in place of any string in a row that contains a null entry. - */ -struct concat_strings_fn { +struct concat_strings_base { table_device_view const d_table; - string_view const d_separator; string_scalar_device_view const d_narep; + separator_on_nulls separate_nulls; offset_type* d_offsets{}; char* d_chars{}; - __device__ void operator()(size_type idx) + /** + * @brief Concatenate each table row to a single output string. + * + * This will concatenate the strings from each row of the given table + * and apply the separator. The null-replacement string `d_narep` is + * used in place of any string in a row that contains a null entry. + * + * @param idx The current row to process + * @param d_separator String to place in between each column's row + */ + __device__ void process_row(size_type idx, string_view const d_separator) { - bool const null_element = - thrust::any_of(thrust::seq, d_table.begin(), d_table.end(), [idx](auto const& col) { - return col.is_null(idx); - }); - // handle a null row - if (null_element && !d_narep.is_valid()) { + if (!d_narep.is_valid() && + thrust::any_of(thrust::seq, d_table.begin(), d_table.end(), [idx](auto const& col) { + return col.is_null(idx); + })) { if (!d_chars) d_offsets[idx] = 0; return; } - char* d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr; - size_type bytes = 0; + char* d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr; + offset_type bytes = 0; + bool write_separator = false; + for (auto itr = d_table.begin(); itr < d_table.end(); ++itr) { - auto const d_column = *itr; - auto const d_str = - d_column.is_null(idx) ? 
d_narep.value() : d_column.element(idx); - if (d_buffer) d_buffer = detail::copy_string(d_buffer, d_str); - bytes += d_str.size_bytes(); - // separator goes only in between elements - if (itr + 1 < d_table.end()) { + auto const d_column = *itr; + bool const null_element = d_column.is_null(idx); + + if (write_separator && (separate_nulls == separator_on_nulls::YES || !null_element)) { if (d_buffer) d_buffer = detail::copy_string(d_buffer, d_separator); bytes += d_separator.size_bytes(); + write_separator = false; } + + // write out column's row data (or narep if the row is null) + auto const d_str = null_element ? d_narep.value() : d_column.element(idx); + if (d_buffer) d_buffer = detail::copy_string(d_buffer, d_str); + bytes += d_str.size_bytes(); + + write_separator = + write_separator || (separate_nulls == separator_on_nulls::YES) || !null_element; } + if (!d_chars) d_offsets[idx] = bytes; } }; +/** + * @brief Single separator concatenate functor + */ +struct concat_strings_fn : concat_strings_base { + string_view const d_separator; + + concat_strings_fn(table_device_view const& d_table, + string_view const& d_separator, + string_scalar_device_view const& d_narep, + separator_on_nulls separate_nulls) + : concat_strings_base{d_table, d_narep, separate_nulls}, d_separator(d_separator) + { + } + + __device__ void operator()(size_type idx) { process_row(idx, d_separator); } +}; + } // namespace std::unique_ptr concatenate(table_view const& strings_columns, string_scalar const& separator, string_scalar const& narep, + separator_on_nulls separate_nulls, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { auto const num_columns = strings_columns.num_columns(); - CUDF_EXPECTS(num_columns > 0, "At least one column must be specified"); + CUDF_EXPECTS(num_columns > 1, "At least two columns must be specified"); // check all columns are of type string CUDF_EXPECTS(std::all_of(strings_columns.begin(), strings_columns.end(), [](auto c) { return c.type().id() == type_id::STRING; }), "All columns must be of type string"); - if (num_columns == 1) // single strings column returns a copy - return std::make_unique(*(strings_columns.begin()), stream, mr); auto const strings_count = strings_columns.num_rows(); if (strings_count == 0) // empty begets empty return detail::make_empty_strings_column(stream, mr); @@ -112,7 +138,7 @@ std::unique_ptr concatenate(table_view const& strings_columns, // Create device views from the strings columns. auto d_table = table_device_view::create(strings_columns, stream); - concat_strings_fn fn{*d_table, d_separator, d_narep}; + concat_strings_fn fn{*d_table, d_separator, d_narep, separate_nulls}; auto children = make_strings_children(fn, strings_count, stream, mr); // create resulting null mask @@ -120,9 +146,9 @@ std::unique_ptr concatenate(table_view const& strings_columns, thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), [d_table = *d_table, d_narep] __device__(size_type idx) { - bool null_element = thrust::any_of( + if (d_narep.is_valid()) return true; + return !thrust::any_of( thrust::seq, d_table.begin(), d_table.end(), [idx](auto col) { return col.is_null(idx); }); - return (!null_element || d_narep.is_valid()); }, stream, mr); @@ -145,68 +171,42 @@ namespace { * when a separator row is null `d_separator_narep`. The `d_narep` is * used in place of a null entry in the strings columns. 
*/ -struct multi_separator_concat_fn { - table_device_view const d_table; +struct multi_separator_concat_fn : concat_strings_base { column_device_view const d_separators; string_scalar_device_view const d_separator_narep; - string_scalar_device_view const d_narep; - offset_type* d_offsets{}; - char* d_chars{}; - __device__ void operator()(size_type idx) + multi_separator_concat_fn(table_device_view const& d_table, + column_device_view const& d_separators, + string_scalar_device_view const& d_separator_narep, + string_scalar_device_view const& d_narep, + separator_on_nulls separate_nulls) + : concat_strings_base{d_table, d_narep, separate_nulls}, + d_separators(d_separators), + d_separator_narep(d_separator_narep) { - bool const all_nulls = - thrust::all_of(thrust::seq, d_table.begin(), d_table.end(), [idx](auto const& col) { - return col.is_null(idx); - }); + } - if ((d_separators.is_null(idx) && !d_separator_narep.is_valid()) || - (all_nulls && !d_narep.is_valid())) { + __device__ void operator()(size_type idx) + { + if (d_separators.is_null(idx) && !d_separator_narep.is_valid()) { if (!d_chars) d_offsets[idx] = 0; return; } - // point to output location - char* d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr; - offset_type bytes = 0; - - // there is at least one non-null column value auto const d_separator = d_separators.is_valid(idx) ? d_separators.element(idx) : d_separator_narep.value(); - auto const d_null_rep = d_narep.is_valid() ? d_narep.value() : string_view{}; - - // write output entry for this row - bool colval_written = false; // state variable for writing separators - for (auto const d_column : d_table) { - // if the row is null and if there is no replacement, skip it - if (d_column.is_null(idx) && !d_narep.is_valid()) continue; - - // separator in this row is written only after the first output - if (colval_written) { - if (d_buffer) d_buffer = detail::copy_string(d_buffer, d_separator); - bytes += d_separator.size_bytes(); - } - - // write out column's row data (or narep if the row is null) - string_view const d_str = - d_column.is_null(idx) ? d_null_rep : d_column.element(idx); - if (d_buffer) d_buffer = detail::copy_string(d_buffer, d_str); - bytes += d_str.size_bytes(); - - // column's string or narep could by empty so we need this flag - // to know we got this far even if no actual bytes were copied - colval_written = true; // use the separator before the next column - } - - if (!d_chars) d_offsets[idx] = bytes; + // base class utility function handles the rest + process_row(idx, d_separator); } }; + } // namespace std::unique_ptr concatenate(table_view const& strings_columns, strings_column_view const& separators, string_scalar const& separator_narep, string_scalar const& col_narep, + separator_on_nulls separate_nulls, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -234,20 +234,19 @@ std::unique_ptr concatenate(table_view const& strings_columns, // Create device views from the strings columns. 
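
For reference, the user-facing effect of the `separator_on_nulls` parameter threaded through these functors can be sketched roughly as follows; the header path, wrappers, and argument defaults are assumptions, not part of this patch.

```cpp
// Illustrative only: separator handling around null elements.
#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/combine.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf_test/column_wrapper.hpp>

void concatenate_separator_on_nulls_sketch()
{
  cudf::test::strings_column_wrapper c1({"a", "b"});
  cudf::test::strings_column_wrapper c2({"c", ""}, {true, false});  // second row of c2 is null

  auto const sep   = cudf::string_scalar("-");
  auto const narep = cudf::string_scalar("");  // null elements are replaced with ""

  // separator_on_nulls::YES (default): rows -> "a-c", "b-"
  // separator_on_nulls::NO            : rows -> "a-c", "b"
  auto const result = cudf::strings::concatenate(
    cudf::table_view{{c1, c2}}, sep, narep, cudf::strings::separator_on_nulls::NO);
}
```

With `narep` left invalid, a row containing any null element still produces a null output row regardless of the separator policy.
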
auto d_table = table_device_view::create(strings_columns, stream); - multi_separator_concat_fn mscf{*d_table, separator_col_view, separator_rep, col_rep}; + multi_separator_concat_fn mscf{ + *d_table, separator_col_view, separator_rep, col_rep, separate_nulls}; auto children = make_strings_children(mscf, strings_count, stream, mr); // Create resulting null mask auto [null_mask, null_count] = cudf::detail::valid_if( thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), - [d_table = *d_table, separator_col_view, separator_rep, col_rep] __device__(size_type ridx) { - if (!separator_col_view.is_valid(ridx) && !separator_rep.is_valid()) return false; - bool all_nulls = - thrust::all_of(thrust::seq, d_table.begin(), d_table.end(), [ridx](auto const& col) { - return col.is_null(ridx); - }); - return all_nulls ? col_rep.is_valid() : true; + [d_table = *d_table, separator_col_view, separator_rep, col_rep] __device__(size_type idx) { + if (!separator_col_view.is_valid(idx) && !separator_rep.is_valid()) return false; + if (col_rep.is_valid()) return true; + return !thrust::any_of( + thrust::seq, d_table.begin(), d_table.end(), [idx](auto col) { return col.is_null(idx); }); }, stream, mr); @@ -268,21 +267,29 @@ std::unique_ptr concatenate(table_view const& strings_columns, std::unique_ptr concatenate(table_view const& strings_columns, string_scalar const& separator, string_scalar const& narep, + separator_on_nulls separate_nulls, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::concatenate(strings_columns, separator, narep, rmm::cuda_stream_default, mr); + return detail::concatenate( + strings_columns, separator, narep, separate_nulls, rmm::cuda_stream_default, mr); } std::unique_ptr concatenate(table_view const& strings_columns, strings_column_view const& separators, string_scalar const& separator_narep, string_scalar const& col_narep, + separator_on_nulls separate_nulls, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::concatenate( - strings_columns, separators, separator_narep, col_narep, rmm::cuda_stream_default, mr); + return detail::concatenate(strings_columns, + separators, + separator_narep, + col_narep, + separate_nulls, + rmm::cuda_stream_default, + mr); } } // namespace strings diff --git a/cpp/src/strings/combine/concatenate_list_elements.cu b/cpp/src/strings/combine/join_list_elements.cu similarity index 64% rename from cpp/src/strings/combine/concatenate_list_elements.cu rename to cpp/src/strings/combine/join_list_elements.cu index 1157b8f3fce..7a83097566c 100644 --- a/cpp/src/strings/combine/concatenate_list_elements.cu +++ b/cpp/src/strings/combine/join_list_elements.cu @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -51,6 +52,7 @@ struct compute_size_and_concatenate_fn { offset_type const* const list_offsets; column_device_view const strings_dv; string_scalar_device_view const string_narep_dv; + separator_on_nulls const separate_nulls; offset_type* d_offsets{nullptr}; @@ -72,33 +74,38 @@ struct compute_size_and_concatenate_fn { return; } - auto const separator = func.separator(idx); - auto const separator_size = separator.size_bytes(); - auto size_bytes = size_type{0}; - bool written = false; - char* output_ptr = d_chars ? d_chars + d_offsets[idx] : nullptr; + auto const separator = func.separator(idx); + auto size_bytes = size_type{0}; + char* output_ptr = d_chars ? 
d_chars + d_offsets[idx] : nullptr; + bool write_separator = false; for (size_type str_idx = list_offsets[idx], idx_end = list_offsets[idx + 1]; str_idx < idx_end; ++str_idx) { - if (not d_chars and (strings_dv.is_null(str_idx) and not string_narep_dv.is_valid())) { + bool null_element = strings_dv.is_null(str_idx); + + if (not d_chars and (null_element and not string_narep_dv.is_valid())) { d_offsets[idx] = 0; d_validities[idx] = false; return; // early termination: the entire list of strings will result in a null string } - auto const d_str = strings_dv.is_null(str_idx) ? string_narep_dv.value() - : strings_dv.element(str_idx); - size_bytes += separator_size + d_str.size_bytes(); - if (output_ptr) { - // Separator is inserted only in between strings - if (written) { output_ptr = detail::copy_string(output_ptr, separator); } - output_ptr = detail::copy_string(output_ptr, d_str); - written = true; + + if (write_separator && (separate_nulls == separator_on_nulls::YES || !null_element)) { + if (output_ptr) output_ptr = detail::copy_string(output_ptr, separator); + size_bytes += separator.size_bytes(); + write_separator = false; } + + auto const d_str = + null_element ? string_narep_dv.value() : strings_dv.element(str_idx); + if (output_ptr) output_ptr = detail::copy_string(output_ptr, d_str); + size_bytes += d_str.size_bytes(); + + write_separator = + write_separator || (separate_nulls == separator_on_nulls::YES) || !null_element; } - // Separator is inserted only in between strings if (not d_chars) { - d_offsets[idx] = static_cast(size_bytes - separator_size); + d_offsets[idx] = size_bytes; d_validities[idx] = true; } } @@ -123,11 +130,12 @@ struct scalar_separator_fn { } // namespace -std::unique_ptr concatenate_list_elements(lists_column_view const& lists_strings_column, - string_scalar const& separator, - string_scalar const& narep, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr join_list_elements(lists_column_view const& lists_strings_column, + string_scalar const& separator, + string_scalar const& narep, + separator_on_nulls separate_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(lists_strings_column.child().type().id() == type_id::STRING, "The input column must be a column of lists of strings"); @@ -146,14 +154,14 @@ std::unique_ptr concatenate_list_elements(lists_column_view const& lists auto const sep_dv = get_scalar_device_view(const_cast(separator)); auto const string_narep_dv = get_scalar_device_view(const_cast(narep)); - auto const func = scalar_separator_fn{sep_dv}; - auto const comp_fn = compute_size_and_concatenate_fn{ - func, - *lists_dv_ptr, - lists_strings_column.offsets_begin(), - *strings_dv_ptr, - string_narep_dv, - }; + auto const func = scalar_separator_fn{sep_dv}; + auto const comp_fn = + compute_size_and_concatenate_fn{func, + *lists_dv_ptr, + lists_strings_column.offsets_begin(), + *strings_dv_ptr, + string_narep_dv, + separate_nulls}; auto [offsets_column, chars_column, null_mask, null_count] = make_strings_children_with_null_mask(comp_fn, num_rows, num_rows, stream, mr); @@ -191,12 +199,13 @@ struct column_separators_fn { } // namespace -std::unique_ptr concatenate_list_elements(lists_column_view const& lists_strings_column, - strings_column_view const& separators, - string_scalar const& separator_narep, - string_scalar const& string_narep, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr join_list_elements(lists_column_view const& 
lists_strings_column, + strings_column_view const& separators, + string_scalar const& separator_narep, + string_scalar const& string_narep, + separator_on_nulls separate_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(lists_strings_column.child().type().id() == type_id::STRING, "The input column must be a column of lists of strings"); @@ -217,14 +226,14 @@ std::unique_ptr concatenate_list_elements(lists_column_view const& lists auto const sep_dv_ptr = column_device_view::create(separators.parent(), stream); auto const sep_narep_dv = get_scalar_device_view(const_cast(separator_narep)); - auto const func = column_separators_fn{*sep_dv_ptr, sep_narep_dv}; - auto const comp_fn = compute_size_and_concatenate_fn{ - func, - *lists_dv_ptr, - lists_strings_column.offsets_begin(), - *strings_dv_ptr, - string_narep_dv, - }; + auto const func = column_separators_fn{*sep_dv_ptr, sep_narep_dv}; + auto const comp_fn = + compute_size_and_concatenate_fn{func, + *lists_dv_ptr, + lists_strings_column.offsets_begin(), + *strings_dv_ptr, + string_narep_dv, + separate_nulls}; auto [offsets_column, chars_column, null_mask, null_count] = make_strings_children_with_null_mask(comp_fn, num_rows, num_rows, stream, mr); @@ -239,25 +248,32 @@ std::unique_ptr concatenate_list_elements(lists_column_view const& lists } // namespace detail -std::unique_ptr concatenate_list_elements(lists_column_view const& lists_strings_column, - string_scalar const& separator, - string_scalar const& narep, - rmm::mr::device_memory_resource* mr) +std::unique_ptr join_list_elements(lists_column_view const& lists_strings_column, + string_scalar const& separator, + string_scalar const& narep, + separator_on_nulls separate_nulls, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::concatenate_list_elements( - lists_strings_column, separator, narep, rmm::cuda_stream_default, mr); + return detail::join_list_elements( + lists_strings_column, separator, narep, separate_nulls, rmm::cuda_stream_default, mr); } -std::unique_ptr concatenate_list_elements(lists_column_view const& lists_strings_column, - strings_column_view const& separators, - string_scalar const& separator_narep, - string_scalar const& string_narep, - rmm::mr::device_memory_resource* mr) +std::unique_ptr join_list_elements(lists_column_view const& lists_strings_column, + strings_column_view const& separators, + string_scalar const& separator_narep, + string_scalar const& string_narep, + separator_on_nulls separate_nulls, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::concatenate_list_elements( - lists_strings_column, separators, separator_narep, string_narep, rmm::cuda_stream_default, mr); + return detail::join_list_elements(lists_strings_column, + separators, + separator_narep, + string_narep, + separate_nulls, + rmm::cuda_stream_default, + mr); } } // namespace strings diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index d87b4b81bdc..bbcfd69a52b 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -328,8 +328,8 @@ ConfigureTest(STRINGS_TEST strings/booleans_tests.cpp strings/case_tests.cpp strings/chars_types_tests.cpp - strings/combine/concatenate_list_elements_tests.cpp strings/combine/concatenate_tests.cpp + strings/combine/join_list_elements_tests.cpp strings/combine/join_strings_tests.cpp strings/concatenate_tests.cpp strings/contains_tests.cpp @@ -407,7 +407,8 @@ ConfigureTest(AST_TEST ast/transform_tests.cpp) 
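
With the rename above, joining each list of strings down to a single string goes through `cudf::strings::join_list_elements`. A brief sketch under the same assumptions (public `<cudf/strings/combine.hpp>` header, test wrappers):

```cpp
// Illustrative only: one output string per input list row.
#include <cudf/lists/lists_column_view.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/combine.hpp>
#include <cudf_test/column_wrapper.hpp>

void join_list_elements_sketch()
{
  using StrLists = cudf::test::lists_column_wrapper<cudf::string_view>;

  auto const input = StrLists{StrLists{"a", "b", "c"}, StrLists{"d", "e"}};

  // Produces the strings column ["a+b+c", "d+e"].
  auto const joined = cudf::strings::join_list_elements(
    cudf::lists_column_view{input}, cudf::string_scalar("+"));
}
```

A null string element is replaced with `narep` when one is provided; otherwise the whole output row becomes null, as the early-termination branch in the functor above shows.
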
################################################################################################### # - lists tests ---------------------------------------------------------------------------------- ConfigureTest(LISTS_TEST - lists/concatenate_rows_tests.cpp + lists/combine/concatenate_list_elements_tests.cpp + lists/combine/concatenate_rows_tests.cpp lists/contains_tests.cpp lists/count_elements_tests.cpp lists/drop_list_duplicates_tests.cpp diff --git a/cpp/tests/column/factories_test.cpp b/cpp/tests/column/factories_test.cpp index 71f65eedd91..f9e83311b1b 100644 --- a/cpp/tests/column/factories_test.cpp +++ b/cpp/tests/column/factories_test.cpp @@ -20,7 +20,9 @@ #include #include +#include #include +#include #include #include @@ -462,6 +464,300 @@ TEST_F(ColumnFactoryTest, DictionaryFromStringScalarError) EXPECT_THROW(cudf::make_dictionary_from_scalar(value, 1), cudf::logic_error); } +template +class ListsFixedWidthLeafTest : public ColumnFactoryTest { +}; + +TYPED_TEST_CASE(ListsFixedWidthLeafTest, cudf::test::FixedWidthTypes); + +TYPED_TEST(ListsFixedWidthLeafTest, FromNonNested) +{ + using FCW = cudf::test::fixed_width_column_wrapper; + using LCW = cudf::test::lists_column_wrapper; + using valid_t = std::vector; + + auto s = cudf::make_list_scalar(FCW({1, -1, 3}, {1, 0, 1})); + auto col = cudf::make_column_from_scalar(*s, 3); + + auto expected = LCW{LCW({1, 2, 3}, valid_t{1, 0, 1}.begin()), + LCW({1, 2, 3}, valid_t{1, 0, 1}.begin()), + LCW({1, 2, 3}, valid_t{1, 0, 1}.begin())}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*col, expected); +} + +TYPED_TEST(ListsFixedWidthLeafTest, FromNested) +{ + using LCW = cudf::test::lists_column_wrapper; + using valid_t = std::vector; + +#define row_data \ + LCW({LCW({-1, -1, 3}, valid_t{0, 0, 1}.begin()), LCW{}, LCW{}}, valid_t{1, 0, 1}.begin()) + + auto s = cudf::make_list_scalar(row_data); + auto col = cudf::make_column_from_scalar(*s, 5); + + auto expected = LCW{row_data, row_data, row_data, row_data, row_data}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*col, expected); + +#undef row_data +} + +template +class ListsDictionaryLeafTest : public ColumnFactoryTest { +}; + +TYPED_TEST_CASE(ListsDictionaryLeafTest, cudf::test::FixedWidthTypes); + +TYPED_TEST(ListsDictionaryLeafTest, FromNonNested) +{ + using DCW = cudf::test::dictionary_column_wrapper; + using offset_t = cudf::test::fixed_width_column_wrapper; + + auto s = cudf::make_list_scalar(DCW({1, 3, -1, 1, 3}, {1, 1, 0, 1, 1})); + auto col = cudf::make_column_from_scalar(*s, 2); + + DCW leaf({1, 3, -1, 1, 3, 1, 3, -1, 1, 3}, {1, 1, 0, 1, 1, 1, 1, 0, 1, 1}); + offset_t offsets{0, 5, 10}; + auto mask = cudf::create_null_mask(2, cudf::mask_state::UNALLOCATED); + + auto expected = cudf::make_lists_column(2, offsets.release(), leaf.release(), 0, std::move(mask)); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*col, *expected); +} + +TYPED_TEST(ListsDictionaryLeafTest, FromNested) +{ + using DCW = cudf::test::dictionary_column_wrapper; + using offset_t = cudf::test::fixed_width_column_wrapper; + + DCW leaf({1, 3, -1, 1, 3, 1, 3, -1, 1, 3}, {1, 1, 0, 1, 1, 1, 1, 0, 1, 1}); + offset_t offsets{0, 3, 3, 6, 6, 10}; + auto mask = cudf::create_null_mask(5, cudf::mask_state::ALL_VALID); + cudf::set_null_mask(static_cast(mask.data()), 1, 2, false); + auto data = cudf::make_lists_column(5, offsets.release(), leaf.release(), 0, std::move(mask)); + + auto s = cudf::make_list_scalar(*data); + auto col = cudf::make_column_from_scalar(*s, 3); + + DCW leaf2( + {1, 3, -1, 1, 3, 1, 3, -1, 1, 3, 1, 3, -1, 1, 3, + 1, 3, -1, 1, 3, 1, 3, -1, 1, 3, 1, 3, 
-1, 1, 3}, + {1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1}); + offset_t offsets2{0, 3, 3, 6, 6, 10, 13, 13, 16, 16, 20, 23, 23, 26, 26, 30}; + auto mask2 = cudf::create_null_mask(15, cudf::mask_state::ALL_VALID); + cudf::set_null_mask(static_cast(mask2.data()), 1, 2, false); + cudf::set_null_mask(static_cast(mask2.data()), 6, 7, false); + cudf::set_null_mask(static_cast(mask2.data()), 11, 12, false); + auto nested = + cudf::make_lists_column(15, offsets2.release(), leaf2.release(), 3, std::move(mask2)); + + offset_t offsets3{0, 5, 10, 15}; + auto mask3 = cudf::create_null_mask(3, cudf::mask_state::UNALLOCATED); + auto expected = + cudf::make_lists_column(3, offsets3.release(), std::move(nested), 0, std::move(mask3)); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*col, *expected); +} + +class ListsStringLeafTest : public ColumnFactoryTest { +}; + +TEST_F(ListsStringLeafTest, FromNonNested) +{ + using SCW = cudf::test::strings_column_wrapper; + using LCW = cudf::test::lists_column_wrapper; + using valid_t = std::vector; + + auto s = cudf::make_list_scalar(SCW({"xx", "", "z"}, {true, false, true})); + auto col = cudf::make_column_from_scalar(*s, 4); + + auto expected = LCW{LCW({"xx", "", "z"}, valid_t{1, 0, 1}.begin()), + LCW({"xx", "", "z"}, valid_t{1, 0, 1}.begin()), + LCW({"xx", "", "z"}, valid_t{1, 0, 1}.begin()), + LCW({"xx", "", "z"}, valid_t{1, 0, 1}.begin())}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*col, expected); +} + +TEST_F(ListsStringLeafTest, FromNested) +{ + using LCW = cudf::test::lists_column_wrapper; + using valid_t = std::vector; + +#define row_data \ + LCW({LCW{}, \ + LCW({"@@", "rapids", "", "ๅ››", "ใ‚‰"}, valid_t{1, 1, 0, 1, 1}.begin()), \ + LCW{}, \ + LCW({"hello", ""}, valid_t{1, 0}.begin())}, \ + valid_t{0, 1, 1, 1}.begin()) + + auto s = cudf::make_list_scalar(row_data); + + auto col = cudf::make_column_from_scalar(*s, 3); + + auto expected = LCW{row_data, row_data, row_data}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*col, expected); +#undef row_data +} + +template +class ListsStructsLeafTest : public ColumnFactoryTest { + protected: + using SCW = cudf::test::structs_column_wrapper; + /** + * @brief Create a structs column that contains 3 fields: int, string, List + */ + template + SCW make_test_structs_column(cudf::test::fixed_width_column_wrapper field1, + cudf::test::strings_column_wrapper field2, + cudf::test::lists_column_wrapper field3, + MaskIterator mask) + { + return SCW{{field1, field2, field3}, mask}; + } +}; + +TYPED_TEST_CASE(ListsStructsLeafTest, cudf::test::FixedWidthTypes); + +TYPED_TEST(ListsStructsLeafTest, FromNonNested) +{ + using LCWinner_t = cudf::test::lists_column_wrapper; + using StringCW = cudf::test::strings_column_wrapper; + using offset_t = cudf::test::fixed_width_column_wrapper; + using valid_t = std::vector; + + auto data = this->make_test_structs_column( + {{1, 3, 5, 2, 4}, {1, 0, 1, 0, 1}}, + StringCW({"fleur", "flower", "", "่Šฑ", "ใฏใช"}, {true, true, false, true, true}), + LCWinner_t({{1, 2}, {}, {4, 5}, {-1}, {}}, valid_t{1, 1, 1, 1, 0}.begin()), + valid_t{1, 1, 1, 0, 1}.begin()); + auto s = cudf::make_list_scalar(data); + auto col = cudf::make_column_from_scalar(*s, 2); + + auto leaf = this->make_test_structs_column( + {{1, 3, 5, 2, 4, 1, 3, 5, 2, 4}, {1, 0, 1, 0, 1, 1, 0, 1, 0, 1}}, + StringCW({"fleur", "flower", "", "่Šฑ", "ใฏใช", "fleur", "flower", "", "่Šฑ", "ใฏใช"}, + {true, true, false, true, true, true, true, false, true, true}), + LCWinner_t({{1, 2}, {}, {4, 5}, {-1}, {}, {1, 2}, {}, {4, 5}, {-1}, 
{}}, + valid_t{1, 1, 1, 1, 0, 1, 1, 1, 1, 0}.begin()), + valid_t{1, 1, 1, 0, 1, 1, 1, 1, 0, 1}.begin()); + auto expected = cudf::make_lists_column(2, + offset_t{0, 5, 10}.release(), + leaf.release(), + 0, + cudf::create_null_mask(2, cudf::mask_state::UNALLOCATED)); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*col, *expected); +} + +TYPED_TEST(ListsStructsLeafTest, FromNested) +{ + using LCWinner_t = cudf::test::lists_column_wrapper; + using StringCW = cudf::test::strings_column_wrapper; + using offset_t = cudf::test::fixed_width_column_wrapper; + using valid_t = std::vector; + auto leaf = this->make_test_structs_column( + {{1, 2}, {0, 1}}, + StringCW({"รฉtoile", "ๆ˜Ÿ"}, {true, true}), + LCWinner_t({LCWinner_t{}, LCWinner_t{42}}, valid_t{1, 1}.begin()), + valid_t{0, 1}.begin()); + auto mask = cudf::create_null_mask(3, cudf::mask_state::ALL_VALID); + cudf::set_null_mask(static_cast(mask.data()), 0, 1, false); + auto data = + cudf::make_lists_column(3, offset_t{0, 0, 1, 2}.release(), leaf.release(), 1, std::move(mask)); + auto s = cudf::make_list_scalar(*data); + + auto col = cudf::make_column_from_scalar(*s, 3); + + auto leaf2 = this->make_test_structs_column( + {{1, 2, 1, 2, 1, 2}, {0, 1, 0, 1, 0, 1}}, + StringCW({"รฉtoile", "ๆ˜Ÿ", "รฉtoile", "ๆ˜Ÿ", "รฉtoile", "ๆ˜Ÿ"}, + {true, true, true, true, true, true}), + LCWinner_t( + {LCWinner_t{}, LCWinner_t{42}, LCWinner_t{}, LCWinner_t{42}, LCWinner_t{}, LCWinner_t{42}}, + valid_t{1, 1, 1, 1, 1, 1}.begin()), + valid_t{0, 1, 0, 1, 0, 1}.begin()); + auto mask2 = cudf::create_null_mask(9, cudf::mask_state::ALL_VALID); + cudf::set_null_mask(static_cast(mask2.data()), 0, 1, false); + cudf::set_null_mask(static_cast(mask2.data()), 3, 4, false); + cudf::set_null_mask(static_cast(mask2.data()), 6, 7, false); + auto data2 = cudf::make_lists_column( + 9, offset_t{0, 0, 1, 2, 2, 3, 4, 4, 5, 6}.release(), leaf2.release(), 3, std::move(mask2)); + auto expected = cudf::make_lists_column(3, + offset_t{0, 3, 6, 9}.release(), + std::move(data2), + 0, + cudf::create_null_mask(3, cudf::mask_state::UNALLOCATED)); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*col, *expected); +} + +class ListsZeroLengthColumnTest : public ColumnFactoryTest { + protected: + using StructsCW = cudf::test::structs_column_wrapper; + StructsCW make_test_structs_column(cudf::test::fixed_width_column_wrapper field1, + cudf::test::strings_column_wrapper field2, + cudf::test::lists_column_wrapper field3) + { + return StructsCW{field1, field2, field3}; + } +}; + +TEST_F(ListsZeroLengthColumnTest, MixedTypes) +{ + using FCW = cudf::test::fixed_width_column_wrapper; + using StringCW = cudf::test::strings_column_wrapper; + using LCW = cudf::test::lists_column_wrapper; + using offset_t = cudf::test::fixed_width_column_wrapper; + { + auto s = cudf::make_list_scalar(FCW{1, 2, 3}); + auto got = cudf::make_column_from_scalar(*s, 0); + auto expected = + cudf::make_lists_column(0, + offset_t{}.release(), + FCW{}.release(), + 0, + cudf::create_null_mask(0, cudf::mask_state::UNALLOCATED)); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*got, *expected); + } + + { + auto s = cudf::make_list_scalar(LCW{LCW{1, 2, 3}, LCW{}, LCW{5, 6}}); + auto got = cudf::make_column_from_scalar(*s, 0); + auto nested = cudf::make_lists_column(0, + offset_t{}.release(), + FCW{}.release(), + 0, + cudf::create_null_mask(0, cudf::mask_state::UNALLOCATED)); + auto expected = + cudf::make_lists_column(0, + offset_t{}.release(), + std::move(nested), + 0, + cudf::create_null_mask(0, cudf::mask_state::UNALLOCATED)); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*got, *expected); + 
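
These tests exercise the list-scalar path of `make_column_from_scalar`; stripped to its essentials, the intended usage looks roughly like the sketch below (factory headers and names assumed from the surrounding tests).

```cpp
// Illustrative only: replicate a list scalar into an N-row LIST column.
#include <cudf/column/column_factories.hpp>
#include <cudf/scalar/scalar_factories.hpp>
#include <cudf_test/column_wrapper.hpp>

void list_scalar_fill_sketch()
{
  cudf::test::fixed_width_column_wrapper<int32_t> row{1, 2, 3};

  auto const s   = cudf::make_list_scalar(row);          // scalar holding [1, 2, 3]
  auto const col = cudf::make_column_from_scalar(*s, 4);
  // col is a LIST<INT32> column with 4 rows, each equal to [1, 2, 3];
  // a size of 0 yields an empty lists column with the same nested hierarchy.
}
```
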
} + + { + auto s = cudf::make_list_scalar( + this->make_test_structs_column({1, 2, 3}, StringCW({"x", "", "y"}), LCW{{5, 6}, {}, {7}})); + auto got = cudf::make_column_from_scalar(*s, 0); + + std::vector> children; + children.emplace_back(FCW{}.release()); + children.emplace_back(StringCW{}.release()); + children.emplace_back(LCW{}.release()); + auto nested = cudf::make_structs_column( + 0, std::move(children), 0, cudf::create_null_mask(0, cudf::mask_state::UNALLOCATED)); + + auto expected = + cudf::make_lists_column(0, + offset_t{}.release(), + std::move(nested), + 0, + cudf::create_null_mask(0, cudf::mask_state::UNALLOCATED)); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*got, *expected); + } +} + void struct_from_scalar(bool is_valid) { using LCW = cudf::test::lists_column_wrapper; diff --git a/cpp/tests/groupby/collect_list_tests.cpp b/cpp/tests/groupby/collect_list_tests.cpp index 7580c1c4e3b..9d2141c913c 100644 --- a/cpp/tests/groupby/collect_list_tests.cpp +++ b/cpp/tests/groupby/collect_list_tests.cpp @@ -86,6 +86,21 @@ TYPED_TEST(groupby_collect_list_test, CollectWithNullExclusion) test_single_agg(keys, values, expect_keys, expect_vals, std::move(agg)); } +TYPED_TEST(groupby_collect_list_test, CollectOnEmptyInput) +{ + using K = int32_t; + using V = TypeParam; + + fixed_width_column_wrapper keys{}; + fixed_width_column_wrapper values{}; + + fixed_width_column_wrapper expect_keys{}; + lists_column_wrapper expect_vals{}; + + auto agg = cudf::make_collect_list_aggregation(null_policy::EXCLUDE); + test_single_agg(keys, values, expect_keys, expect_vals, std::move(agg)); +} + TYPED_TEST(groupby_collect_list_test, CollectLists) { using K = int32_t; @@ -124,6 +139,61 @@ TYPED_TEST(groupby_collect_list_test, CollectListsWithNullExclusion) test_single_agg(keys, values, expect_keys, expect_vals, std::move(agg)); } +TYPED_TEST(groupby_collect_list_test, CollectOnEmptyInputLists) +{ + using K = int32_t; + using V = TypeParam; + + using LCW = cudf::test::lists_column_wrapper; + + auto offsets = data_type{type_to_id()}; + + fixed_width_column_wrapper keys{}; + auto values = cudf::make_lists_column(0, make_empty_column(offsets), LCW{}.release(), 0, {}); + + fixed_width_column_wrapper expect_keys{}; + + auto expect_child = + cudf::make_lists_column(0, make_empty_column(offsets), LCW{}.release(), 0, {}); + auto expect_values = + cudf::make_lists_column(0, make_empty_column(offsets), std::move(expect_child), 0, {}); + + auto agg = cudf::make_collect_list_aggregation(); + test_single_agg(keys, values->view(), expect_keys, expect_values->view(), std::move(agg)); +} + +TYPED_TEST(groupby_collect_list_test, CollectOnEmptyInputListsOfStructs) +{ + using K = int32_t; + using V = TypeParam; + + using LCW = cudf::test::lists_column_wrapper; + + fixed_width_column_wrapper keys{}; + auto struct_child = LCW{}; + auto struct_column = structs_column_wrapper{{struct_child}}; + + auto values = cudf::make_lists_column( + 0, make_empty_column(data_type{type_to_id()}), struct_column.release(), 0, {}); + + fixed_width_column_wrapper expect_keys{}; + + auto expect_struct_child = LCW{}; + auto expect_struct_column = structs_column_wrapper{{expect_struct_child}}; + + auto expect_child = + cudf::make_lists_column(0, + make_empty_column(data_type{type_to_id()}), + expect_struct_column.release(), + 0, + {}); + auto expect_values = cudf::make_lists_column( + 0, make_empty_column(data_type{type_to_id()}), std::move(expect_child), 0, {}); + + auto agg = cudf::make_collect_list_aggregation(); + test_single_agg(keys, values->view(), 
expect_keys, expect_values->view(), std::move(agg)); +} + TYPED_TEST(groupby_collect_list_test, dictionary) { using K = int32_t; diff --git a/cpp/tests/groupby/collect_set_tests.cpp b/cpp/tests/groupby/collect_set_tests.cpp index ce3a9a49372..d5a881a1993 100644 --- a/cpp/tests/groupby/collect_set_tests.cpp +++ b/cpp/tests/groupby/collect_set_tests.cpp @@ -58,8 +58,7 @@ TYPED_TEST_CASE(CollectSetTypedTest, FixedWidthTypesNotBool); TYPED_TEST(CollectSetTypedTest, TrivialInput) { // Empty input - // TODO: Enable this test after issue#7611 has been fixed - // test_single_agg(COL_K{}, COL_V{}, COL_K{}, COL_V{}, COLLECT_SET); + test_single_agg(COL_K{}, COL_V{}, COL_K{}, LCL_V{}, CollectSetTest::collect_set()); // Single key input { diff --git a/cpp/tests/groupby/nth_element_tests.cpp b/cpp/tests/groupby/nth_element_tests.cpp index ec0265a3023..5630cba09da 100644 --- a/cpp/tests/groupby/nth_element_tests.cpp +++ b/cpp/tests/groupby/nth_element_tests.cpp @@ -362,5 +362,45 @@ TEST_F(groupby_nth_element_string_test, dictionary) keys, vals, expect_keys, expect_vals->view(), cudf::make_nth_element_aggregation(2)); } +template +struct groupby_nth_element_lists_test : BaseFixture { +}; + +TYPED_TEST_CASE(groupby_nth_element_lists_test, FixedWidthTypesWithoutFixedPoint); + +TYPED_TEST(groupby_nth_element_lists_test, Basics) +{ + using K = int32_t; + using V = TypeParam; + + using lists = cudf::test::lists_column_wrapper; + + auto keys = fixed_width_column_wrapper{1, 1, 2, 2, 3, 3}; + auto values = lists{{1, 2}, {3, 4}, {5, 6, 7}, lists{}, {9, 10}, {11}}; + + auto expected_keys = fixed_width_column_wrapper{1, 2, 3}; + auto expected_values = lists{{1, 2}, {5, 6, 7}, {9, 10}}; + + test_single_agg( + keys, values, expected_keys, expected_values, cudf::make_nth_element_aggregation(0)); +} + +TYPED_TEST(groupby_nth_element_lists_test, EmptyInput) +{ + using K = int32_t; + using V = TypeParam; + + using lists = cudf::test::lists_column_wrapper; + + auto keys = fixed_width_column_wrapper{}; + auto values = lists{}; + + auto expected_keys = fixed_width_column_wrapper{}; + auto expected_values = lists{}; + + test_single_agg( + keys, values, expected_keys, expected_values, cudf::make_nth_element_aggregation(2)); +} + } // namespace test } // namespace cudf diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp index 6bc08cf24a6..e45b67505ba 100644 --- a/cpp/tests/io/csv_test.cpp +++ b/cpp/tests/io/csv_test.cpp @@ -22,9 +22,11 @@ #include #include +#include #include #include #include +#include #include #include #include @@ -61,6 +63,16 @@ using table_view = cudf::table_view; auto const temp_env = static_cast( ::testing::AddGlobalTestEnvironment(new cudf::test::TempDirTestEnvironment)); +// Base test fixture for tests +struct CsvWriterTest : public cudf::test::BaseFixture { +}; + +template +struct CsvFixedPointWriterTest : public CsvWriterTest { +}; + +TYPED_TEST_CASE(CsvFixedPointWriterTest, cudf::test::FixedPointTypes); + // Base test fixture for tests struct CsvReaderTest : public cudf::test::BaseFixture { }; @@ -307,6 +319,98 @@ TYPED_TEST(CsvReaderNumericTypeTest, SingleColumn) expect_column_data_equal(std::vector(sequence, sequence + num_rows), view.column(0)); } +TYPED_TEST(CsvFixedPointWriterTest, SingleColumnNegativeScale) +{ + std::vector reference_strings = { + "1.23", "-8.76", "5.43", "-0.12", "0.25", "-0.23", "-0.27", "0.00", "0.00"}; + + auto validity = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return (i % 2 == 0) ? 
true : false; }); + cudf::test::strings_column_wrapper strings( + reference_strings.begin(), reference_strings.end(), validity); + + std::vector valid_reference_strings; + thrust::copy_if(thrust::host, + reference_strings.begin(), + reference_strings.end(), + thrust::make_counting_iterator(0), + std::back_inserter(valid_reference_strings), + validity.functor()); + reference_strings = valid_reference_strings; + + using DecimalType = TypeParam; + auto input_column = cudf::strings::to_fixed_point( + cudf::strings_column_view(strings), + cudf::data_type{cudf::type_to_id(), numeric::scale_type{-2}}); + + auto input_table = cudf::table_view{std::vector{*input_column}}; + + auto filepath = temp_env->get_temp_dir() + "FixedPointSingleColumnNegativeScale.csv"; + + cudf_io::csv_writer_options writer_options = + cudf_io::csv_writer_options::builder(cudf_io::sink_info(filepath), input_table); + + cudf_io::write_csv(writer_options); + + std::vector result_strings; + result_strings.reserve(reference_strings.size()); + + std::ifstream read_result_file(filepath); + assert(read_result_file.is_open()); + + std::copy(std::istream_iterator(read_result_file), + std::istream_iterator(), + std::back_inserter(result_strings)); + + EXPECT_EQ(result_strings, reference_strings); +} + +TYPED_TEST(CsvFixedPointWriterTest, SingleColumnPositiveScale) +{ + std::vector reference_strings = { + "123000", "-876000", "543000", "-12000", "25000", "-23000", "-27000", "0000", "0000"}; + + auto validity = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return (i % 2 == 0) ? true : false; }); + cudf::test::strings_column_wrapper strings( + reference_strings.begin(), reference_strings.end(), validity); + + std::vector valid_reference_strings; + thrust::copy_if(thrust::host, + reference_strings.begin(), + reference_strings.end(), + thrust::make_counting_iterator(0), + std::back_inserter(valid_reference_strings), + validity.functor()); + reference_strings = valid_reference_strings; + + using DecimalType = TypeParam; + auto input_column = cudf::strings::to_fixed_point( + cudf::strings_column_view(strings), + cudf::data_type{cudf::type_to_id(), numeric::scale_type{3}}); + + auto input_table = cudf::table_view{std::vector{*input_column}}; + + auto filepath = temp_env->get_temp_dir() + "FixedPointSingleColumnPositiveScale.csv"; + + cudf_io::csv_writer_options writer_options = + cudf_io::csv_writer_options::builder(cudf_io::sink_info(filepath), input_table); + + cudf_io::write_csv(writer_options); + + std::vector result_strings; + result_strings.reserve(reference_strings.size()); + + std::ifstream read_result_file(filepath); + assert(read_result_file.is_open()); + + std::copy(std::istream_iterator(read_result_file), + std::istream_iterator(), + std::back_inserter(result_strings)); + + EXPECT_EQ(result_strings, reference_strings); +} + TEST_F(CsvReaderTest, MultiColumn) { constexpr auto num_rows = 10; diff --git a/cpp/tests/lists/combine/concatenate_list_elements_tests.cpp b/cpp/tests/lists/combine/concatenate_list_elements_tests.cpp new file mode 100644 index 00000000000..7d79cf4aebe --- /dev/null +++ b/cpp/tests/lists/combine/concatenate_list_elements_tests.cpp @@ -0,0 +1,501 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include + +namespace { +using StrListsCol = cudf::test::lists_column_wrapper; +using IntListsCol = cudf::test::lists_column_wrapper; +using IntCol = cudf::test::fixed_width_column_wrapper; + +constexpr bool print_all{false}; // For debugging +constexpr int32_t null{0}; + +template +auto build_lists_col(T& list, Ts&... lists) +{ + return T(std::initializer_list{std::move(list), std::move(lists)...}); +} + +auto all_nulls() { return cudf::test::iterator_all_nulls(); } + +auto null_at(cudf::size_type idx) { return cudf::test::iterator_with_null_at(idx); } + +auto null_at(std::vector const& indices) +{ + return cudf::test::iterator_with_null_at(cudf::host_span{indices}); +} + +} // namespace + +struct ConcatenateListElementsTest : public cudf::test::BaseFixture { +}; + +TEST_F(ConcatenateListElementsTest, InvalidInput) +{ + // Input lists is not a 2-level depth lists column. + { + auto const col = IntCol{}; + EXPECT_THROW(cudf::lists::concatenate_list_elements(col), cudf::logic_error); + } + + // Input lists is not at least 2-level depth lists column. + { + auto const col = IntListsCol{1, 2, 3}; + EXPECT_THROW(cudf::lists::concatenate_list_elements(col), cudf::logic_error); + } +} + +template +struct ConcatenateListElementsTypedTest : public cudf::test::BaseFixture { +}; + +using TypesForTest = cudf::test::Concat; +TYPED_TEST_CASE(ConcatenateListElementsTypedTest, TypesForTest); + +TYPED_TEST(ConcatenateListElementsTypedTest, SimpleInputNoNull) +{ + using ListsCol = cudf::test::lists_column_wrapper; + + auto row0 = ListsCol{{1, 2}, {3}, {4, 5, 6}}; + auto row1 = ListsCol{ListsCol{}}; + auto row2 = ListsCol{{7, 8}, {9, 10}}; + auto const col = build_lists_col(row0, row1, row2); + auto const results = cudf::lists::concatenate_list_elements(col); + auto const expected = ListsCol{{1, 2, 3, 4, 5, 6}, {}, {7, 8, 9, 10}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); +} + +TYPED_TEST(ConcatenateListElementsTypedTest, SimpleInputNestedManyLevelsNoNull) +{ + using ListsCol = cudf::test::lists_column_wrapper; + + auto row00 = ListsCol{{1, 2}, {3}, {4, 5, 6}}; + auto row01 = ListsCol{ListsCol{}}; + auto row02 = ListsCol{{7, 8}, {9, 10}}; + auto row0 = build_lists_col(row00, row01, row02); + + auto row10 = ListsCol{{1, 2}, {3}, {4, 5, 6}}; + auto row11 = ListsCol{ListsCol{}}; + auto row12 = ListsCol{{7, 8}, {9, 10}}; + auto row1 = build_lists_col(row10, row11, row12); + + auto row20 = ListsCol{{1, 2}, {3}, {4, 5, 6}}; + auto row21 = ListsCol{ListsCol{}}; + auto row22 = ListsCol{{7, 8}, {9, 10}}; + auto row2 = build_lists_col(row20, row21, row22); + + auto const col = build_lists_col(row0, row1, row2); + auto const results = cudf::lists::concatenate_list_elements(col); + auto const expected = ListsCol{ListsCol{{1, 2}, {3}, {4, 5, 6}, {}, {7, 8}, {9, 10}}, + ListsCol{{1, 2}, {3}, {4, 5, 6}, {}, {7, 8}, {9, 10}}, + ListsCol{{1, 2}, {3}, {4, 5, 6}, {}, {7, 8}, {9, 10}}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); +} + +TEST_F(ConcatenateListElementsTest, SimpleInputStringsColumnNoNull) +{ + auto row0 = 
StrListsCol{StrListsCol{"Tomato", "Apple"}, StrListsCol{"Orange"}}; + auto row1 = StrListsCol{StrListsCol{"Banana", "Kiwi", "Cherry"}, StrListsCol{"Lemon", "Peach"}}; + auto row2 = StrListsCol{StrListsCol{"Coconut"}, StrListsCol{}}; + auto const col = build_lists_col(row0, row1, row2); + auto const results = cudf::lists::concatenate_list_elements(col); + auto const expected = StrListsCol{StrListsCol{"Tomato", "Apple", "Orange"}, + StrListsCol{"Banana", "Kiwi", "Cherry", "Lemon", "Peach"}, + StrListsCol{"Coconut"}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); +} + +TYPED_TEST(ConcatenateListElementsTypedTest, SimpleInputWithNulls) +{ + using ListsCol = cudf::test::lists_column_wrapper; + auto row0 = ListsCol{{ListsCol{{1, null, 3, 4}, null_at(1)}, + ListsCol{{10, 11, 12, null}, null_at(3)}, + ListsCol{} /*NULL*/}, + null_at(2)}; + auto row1 = ListsCol{ListsCol{{null, 2, 3, 4}, null_at(0)}, + ListsCol{{13, 14, 15, 16, 17, null}, null_at(5)}, + ListsCol{{20, null}, null_at(1)}}; + auto row2 = ListsCol{{ListsCol{{null, 2, 3, 4}, null_at(0)}, + ListsCol{} /*NULL*/, + ListsCol{{null, 21, null, null}, null_at({0, 2, 3})}}, + null_at(1)}; + auto row3 = ListsCol{{ListsCol{} /*NULL*/, ListsCol{{null, 18}, null_at(0)}}, null_at(0)}; + auto row4 = ListsCol{ListsCol{{1, 2, null, 4}, null_at(2)}, + ListsCol{{19, 20, null}, null_at(2)}, + ListsCol{22, 23, 24, 25}}; + auto row5 = ListsCol{ListsCol{{1, 2, 3, null}, null_at(3)}, + ListsCol{{null}, null_at(0)}, + ListsCol{{null, null, null, null, null}, all_nulls()}}; + auto row6 = + ListsCol{{ListsCol{} /*NULL*/, ListsCol{} /*NULL*/, ListsCol{} /*NULL*/}, all_nulls()}; + auto const col = build_lists_col(row0, row1, row2, row3, row4, row5, row6); + + // Ignore null list elements. + { + auto const results = cudf::lists::concatenate_list_elements(col); + auto const expected = + ListsCol{{ListsCol{{1, null, 3, 4, 10, 11, 12, null}, null_at({1, 7})}, + ListsCol{{null, 2, 3, 4, 13, 14, 15, 16, 17, null, 20, null}, null_at({0, 9, 11})}, + ListsCol{{null, 2, 3, 4, null, 21, null, null}, null_at({0, 4, 6, 7})}, + ListsCol{{null, 18}, null_at(0)}, + ListsCol{{1, 2, null, 4, 19, 20, null, 22, 23, 24, 25}, null_at({2, 6})}, + ListsCol{{1, 2, 3, null, null, null, null, null, null, null}, + null_at({3, 4, 5, 6, 7, 8, 9})}, + ListsCol{} /*NULL*/}, + null_at(6)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } + + // Null lists result in null rows. 
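
In other words, under `NULLIFY_OUTPUT_ROW` a row whose child lists include a null is emitted as a null row rather than having the null skipped. A compact sketch of both policies, assuming the public `<cudf/lists/combine.hpp>` header these tests rely on:

```cpp
// Illustrative only: flatten one level of list nesting per row.
#include <cudf/lists/combine.hpp>
#include <cudf_test/column_wrapper.hpp>

void concatenate_list_elements_sketch()
{
  using LCW = cudf::test::lists_column_wrapper<int32_t>;

  // Row 0 holds three lists, row 1 holds two lists.
  auto const input = LCW{LCW{{1, 2}, {3}, {4, 5}}, LCW{{6}, {7, 8}}};

  // IGNORE (default): rows become [1, 2, 3, 4, 5] and [6, 7, 8];
  // a null child list would simply be skipped.
  auto const flattened = cudf::lists::concatenate_list_elements(input);

  // NULLIFY_OUTPUT_ROW: any row containing a null child list becomes a null row.
  auto const strict = cudf::lists::concatenate_list_elements(
    input, cudf::lists::concatenate_null_policy::NULLIFY_OUTPUT_ROW);
}
```
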
+ { + auto const results = cudf::lists::concatenate_list_elements( + col, cudf::lists::concatenate_null_policy::NULLIFY_OUTPUT_ROW); + auto const expected = + ListsCol{{ListsCol{} /*NULL*/, + ListsCol{{null, 2, 3, 4, 13, 14, 15, 16, 17, null, 20, null}, null_at({0, 9, 11})}, + ListsCol{} /*NULL*/, + ListsCol{} /*NULL*/, + ListsCol{{1, 2, null, 4, 19, 20, null, 22, 23, 24, 25}, null_at({2, 6})}, + ListsCol{{1, 2, 3, null, null, null, null, null, null, null}, + null_at({3, 4, 5, 6, 7, 8, 9})}, + ListsCol{} /*NULL*/}, + null_at({0, 2, 3, 6})}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } +} + +TYPED_TEST(ConcatenateListElementsTypedTest, SimpleInputNestedManyLevelsWithNulls) +{ + using ListsCol = cudf::test::lists_column_wrapper; + + auto row00 = ListsCol{{1, 2}, {3}, {4, 5, 6}}; + auto row01 = ListsCol{ListsCol{}}; /*NULL*/ + auto row02 = ListsCol{{7, 8}, {9, 10}}; + auto row0 = ListsCol{{std::move(row00), std::move(row01), std::move(row02)}, null_at(1)}; + + auto row10 = ListsCol{{{1, 2}, {3}, {4, 5, 6} /*NULL*/}, null_at(2)}; + auto row11 = ListsCol{ListsCol{}}; + auto row12 = ListsCol{{7, 8}, {9, 10}}; + auto row1 = build_lists_col(row10, row11, row12); + + auto row20 = ListsCol{{1, 2}, {3}, {4, 5, 6}}; + auto row21 = ListsCol{ListsCol{}}; + auto row22 = ListsCol{ListsCol{{null, 8}, null_at(0)}, {9, 10}}; + auto row2 = build_lists_col(row20, row21, row22); + + auto const col = build_lists_col(row0, row1, row2); + + // Ignore null list elements. + { + auto const results = cudf::lists::concatenate_list_elements(col); + auto const expected = + ListsCol{ListsCol{{1, 2}, {3}, {4, 5, 6}, {7, 8}, {9, 10}}, + ListsCol{{{1, 2}, {3}, {} /*NULL*/, {}, {7, 8}, {9, 10}}, null_at(2)}, + ListsCol{{1, 2}, {3}, {4, 5, 6}, {}, ListsCol{{null, 8}, null_at(0)}, {9, 10}}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } + + // Null lists result in null rows. + { + auto const results = cudf::lists::concatenate_list_elements( + col, cudf::lists::concatenate_null_policy::NULLIFY_OUTPUT_ROW); + auto const expected = + ListsCol{{ListsCol{ListsCol{}}, /*NULL*/ + ListsCol{{{1, 2}, {3}, {} /*NULL*/, {}, {7, 8}, {9, 10}}, null_at(2)}, + ListsCol{{1, 2}, {3}, {4, 5, 6}, {}, ListsCol{{null, 8}, null_at(0)}, {9, 10}}}, + null_at(0)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } +} + +TEST_F(ConcatenateListElementsTest, SimpleInputStringsColumnWithNulls) +{ + auto row0 = StrListsCol{ + StrListsCol{{"Tomato", "Bear" /*NULL*/, "Apple"}, null_at(1)}, + StrListsCol{{"Orange", "Dog" /*NULL*/, "Fox" /*NULL*/, "Duck" /*NULL*/}, null_at({1, 2, 3})}}; + auto row1 = StrListsCol{ + StrListsCol{{"Banana", "Pig" /*NULL*/, "Kiwi", "Cherry", "Whale" /*NULL*/}, null_at({1, 4})}, + StrListsCol{"Lemon", "Peach"}}; + auto row2 = StrListsCol{{StrListsCol{"Coconut"}, StrListsCol{} /*NULL*/}, null_at(1)}; + auto const col = build_lists_col(row0, row1, row2); + + // Ignore null list elements. + { + auto const results = cudf::lists::concatenate_list_elements(col); + auto const expected = StrListsCol{ + StrListsCol{{"Tomato", "" /*NULL*/, "Apple", "Orange", "" /*NULL*/, "" /*NULL*/, "" + /*NULL*/}, + null_at({1, 4, 5, 6})}, + StrListsCol{{"Banana", "" /*NULL*/, "Kiwi", "Cherry", "" /*NULL*/, "Lemon", "Peach"}, + null_at({1, 4})}, + StrListsCol{"Coconut"}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } + + // Null lists result in null rows. 
+ { + auto const results = cudf::lists::concatenate_list_elements( + col, cudf::lists::concatenate_null_policy::NULLIFY_OUTPUT_ROW); + auto const expected = StrListsCol{ + {StrListsCol{ + {"Tomato", "" /*NULL*/, "Apple", "Orange", "" /*NULL*/, "" /*NULL*/, "" /*NULL*/}, + null_at({1, 4, 5, 6})}, + StrListsCol{{"Banana", "" /*NULL*/, "Kiwi", "Cherry", "" /*NULL*/, "Lemon", "Peach"}, + null_at({1, 4})}, + StrListsCol{} /*NULL*/}, + null_at(2)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } +} +TEST_F(ConcatenateListElementsTest, SimpleInputStringsColumnWithEmptyStringsAndNulls) +{ + auto row0 = + StrListsCol{StrListsCol{"", "", ""}, + StrListsCol{{"Orange", "" /*NULL*/, "" /*NULL*/, "" /*NULL*/}, null_at({1, 2, 3})}}; + auto row1 = StrListsCol{ + StrListsCol{{"Banana", "" /*NULL*/, "Kiwi", "Cherry", "" /*NULL*/}, null_at({1, 4})}, + StrListsCol{""}}; + auto row2 = StrListsCol{{StrListsCol{"Coconut"}, StrListsCol{} /*NULL*/}, null_at(1)}; + auto const col = build_lists_col(row0, row1, row2); + + // Ignore null list elements. + { + auto const results = cudf::lists::concatenate_list_elements(col); + auto const expected = StrListsCol{ + StrListsCol{{"", "", "", "Orange", "" /*NULL*/, "" /*NULL*/, "" /*NULL*/}, + null_at({4, 5, 6})}, + StrListsCol{{"Banana", "" /*NULL*/, "Kiwi", "Cherry", "" /*NULL*/, ""}, null_at({1, 4})}, + StrListsCol{"Coconut"}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } + + // Null lists result in null rows. + { + auto const results = cudf::lists::concatenate_list_elements( + col, cudf::lists::concatenate_null_policy::NULLIFY_OUTPUT_ROW); + auto const expected = StrListsCol{ + {StrListsCol{{"", "", "", "Orange", "" /*NULL*/, "" /*NULL*/, "" /*NULL*/}, + null_at({4, 5, 6})}, + StrListsCol{{"Banana", "" /*NULL*/, "Kiwi", "Cherry", "" /*NULL*/, ""}, null_at({1, 4})}, + StrListsCol{} /*NULL*/}, + null_at(2)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } +} + +TYPED_TEST(ConcatenateListElementsTypedTest, SlicedColumnsInputNoNull) +{ + using ListsCol = cudf::test::lists_column_wrapper; + + auto const col_original = ListsCol{ListsCol{{1, 2, 3}, {2, 3}}, + ListsCol{{3, 4, 5, 6}, {5, 6}, {}, {7}}, + ListsCol{{7, 7, 7}, {7, 8, 1, 0}, {1}}, + ListsCol{{9, 10, 11}}, + ListsCol{}, + ListsCol{{12, 13, 14, 15}, {16}, {17}}}; + + { + auto const col = cudf::slice(col_original, {0, 3})[0]; + auto const results = cudf::lists::concatenate_list_elements(col); + auto const expected = + ListsCol{{1, 2, 3, 2, 3}, {3, 4, 5, 6, 5, 6, 7}, {7, 7, 7, 7, 8, 1, 0, 1}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } + { + auto const col = cudf::slice(col_original, {1, 4})[0]; + auto const results = cudf::lists::concatenate_list_elements(col); + auto const expected = ListsCol{{3, 4, 5, 6, 5, 6, 7}, {7, 7, 7, 7, 8, 1, 0, 1}, {9, 10, 11}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } + { + auto const col = cudf::slice(col_original, {2, 5})[0]; + auto const results = cudf::lists::concatenate_list_elements(col); + auto const expected = ListsCol{{7, 7, 7, 7, 8, 1, 0, 1}, {9, 10, 11}, {}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } + { + auto const col = cudf::slice(col_original, {3, 6})[0]; + auto const results = cudf::lists::concatenate_list_elements(col); + auto const expected = ListsCol{{9, 10, 11}, {}, {12, 13, 14, 15, 16, 17}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } +} + +TYPED_TEST(ConcatenateListElementsTypedTest, 
SlicedColumnsInputWithNulls) +{ + using ListsCol = cudf::test::lists_column_wrapper; + + auto row0 = ListsCol{ListsCol{{null, 2, 3}, null_at(0)}, ListsCol{2, 3}}; + auto row1 = ListsCol{ListsCol{{3, null, null, 6}, null_at({1, 2})}, + ListsCol{{5, 6, null}, null_at(2)}, + ListsCol{}, + ListsCol{{7, null}, null_at(1)}}; + auto row2 = ListsCol{ListsCol{7, 7, 7}, ListsCol{{7, 8, null, 0}, null_at(2)}, ListsCol{1}}; + auto row3 = ListsCol{ListsCol{9, 10, 11}}; + auto row4 = ListsCol{ListsCol{}}; + auto row5 = ListsCol{ListsCol{{12, null, 14, 15}, null_at(1)}, ListsCol{16}, ListsCol{17}}; + auto const col_original = build_lists_col(row0, row1, row2, row3, row4, row5); + + { + auto const col = cudf::slice(col_original, {0, 3})[0]; + auto const results = cudf::lists::concatenate_list_elements(col); + auto const expected = + ListsCol{ListsCol{{null, 2, 3, 2, 3}, null_at(0)}, + ListsCol{{3, null, null, 6, 5, 6, null, 7, null}, null_at({1, 2, 6, 8})}, + ListsCol{{7, 7, 7, 7, 8, null, 0, 1}, null_at(5)}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } + { + auto const col = cudf::slice(col_original, {1, 4})[0]; + auto const results = cudf::lists::concatenate_list_elements(col); + auto const expected = + ListsCol{ListsCol{{3, null, null, 6, 5, 6, null, 7, null}, null_at({1, 2, 6, 8})}, + ListsCol{{7, 7, 7, 7, 8, null, 0, 1}, null_at(5)}, + ListsCol{9, 10, 11}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } + { + auto const col = cudf::slice(col_original, {2, 5})[0]; + auto const results = cudf::lists::concatenate_list_elements(col); + auto const expected = + ListsCol{ListsCol{{7, 7, 7, 7, 8, null, 0, 1}, null_at(5)}, ListsCol{9, 10, 11}, ListsCol{}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } + { + auto const col = cudf::slice(col_original, {3, 6})[0]; + auto const results = cudf::lists::concatenate_list_elements(col); + auto const expected = + ListsCol{ListsCol{9, 10, 11}, ListsCol{}, ListsCol{{12, null, 14, 15, 16, 17}, null_at(1)}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } +} + +TEST_F(ConcatenateListElementsTest, SlicedStringsColumnsInputWithNulls) +{ + auto row0 = StrListsCol{ + StrListsCol{{"Tomato", "Bear" /*NULL*/, "Apple"}, null_at(1)}, + StrListsCol{{"Banana", "Pig" /*NULL*/, "Kiwi", "Cherry", "Whale" /*NULL*/}, null_at({1, 4})}, + StrListsCol{"Coconut"}}; + auto row1 = StrListsCol{ + StrListsCol{{"Banana", "Pig" /*NULL*/, "Kiwi", "Cherry", "Whale" /*NULL*/}, null_at({1, 4})}, + StrListsCol{"Coconut"}, + StrListsCol{{"Orange", "Dog" /*NULL*/, "Fox" /*NULL*/, "Duck" /*NULL*/}, null_at({1, 2, 3})}}; + auto row2 = StrListsCol{ + StrListsCol{"Coconut"}, + StrListsCol{{"Orange", "Dog" /*NULL*/, "Fox" /*NULL*/, "Duck" /*NULL*/}, null_at({1, 2, 3})}, + StrListsCol{"Lemon", "Peach"}}; + auto row3 = StrListsCol{ + {StrListsCol{{"Orange", "Dog" /*NULL*/, "Fox" /*NULL*/, "Duck" /*NULL*/}, null_at({1, 2, 3})}, + StrListsCol{"Lemon", "Peach"}, + StrListsCol{} /*NULL*/}, + null_at(2)}; + auto const col_original = build_lists_col(row0, row1, row2, row3); + + { + auto const col = cudf::slice(col_original, {0, 2})[0]; + auto const results = cudf::lists::concatenate_list_elements(col); + auto const expected = StrListsCol{StrListsCol{{"Tomato", + "" /*NULL*/, + "Apple", + "Banana", + "" /*NULL*/, + "Kiwi", + "Cherry", + "" /*NULL*/, + "Coconut"}, + null_at({1, 4, 7})}, + StrListsCol{{"Banana", + "" /*NULL*/, + "Kiwi", + "Cherry", + "" /*NULL*/, + "Coconut", + "Orange", + "" /*NULL*/, + "" /*NULL*/, + "" 
/*NULL*/}, + null_at({1, 4, 7, 8, 9})}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } + { + auto const col = cudf::slice(col_original, {1, 3})[0]; + auto const results = cudf::lists::concatenate_list_elements(col); + auto const expected = StrListsCol{StrListsCol{{"Banana", + "" /*NULL*/, + "Kiwi", + "Cherry", + "" /*NULL*/, + "Coconut", + "Orange", + "" /*NULL*/, + "" /*NULL*/, + "" /*NULL*/}, + null_at({1, 4, 7, 8, 9})}, + StrListsCol{{"Coconut", + "Orange", + "" /*NULL*/, + "" /*NULL*/, + "", /*NULL*/ + "Lemon", + "Peach"}, + null_at({2, 3, 4})}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } + { + auto const col = cudf::slice(col_original, {2, 4})[0]; + auto const results = cudf::lists::concatenate_list_elements(col); + auto const expected = StrListsCol{StrListsCol{{"Coconut", + "Orange", + "" /*NULL*/, + "" /*NULL*/, + "", /*NULL*/ + "Lemon", + "Peach"}, + null_at({2, 3, 4})}, + StrListsCol{{"Orange", + "" /*NULL*/, + "" /*NULL*/, + "", /*NULL*/ + "Lemon", + "Peach"}, + null_at({1, 2, 3})}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } + { + auto const col = cudf::slice(col_original, {2, 4})[0]; + auto const results = cudf::lists::concatenate_list_elements( + col, cudf::lists::concatenate_null_policy::NULLIFY_OUTPUT_ROW); + auto const expected = StrListsCol{{StrListsCol{{"Coconut", + "Orange", + "" /*NULL*/, + "" /*NULL*/, + "", /*NULL*/ + "Lemon", + "Peach"}, + null_at({2, 3, 4})}, + StrListsCol{} /*NULL*/}, + null_at(1)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } +} diff --git a/cpp/tests/lists/concatenate_rows_tests.cpp b/cpp/tests/lists/combine/concatenate_rows_tests.cpp similarity index 94% rename from cpp/tests/lists/concatenate_rows_tests.cpp rename to cpp/tests/lists/combine/concatenate_rows_tests.cpp index 5abaf99f739..af22f329634 100644 --- a/cpp/tests/lists/concatenate_rows_tests.cpp +++ b/cpp/tests/lists/combine/concatenate_rows_tests.cpp @@ -19,7 +19,7 @@ #include #include -#include +#include namespace { using StrListsCol = cudf::test::lists_column_wrapper; @@ -184,24 +184,27 @@ TYPED_TEST(ListConcatenateRowsTypedTest, SimpleInputWithNulls) ListsCol{{null, 2, 3, 4}, null_at(0)}, ListsCol{} /*NULL*/, ListsCol{{1, 2, null, 4}, null_at(2)}, - ListsCol{{1, 2, 3, null}, null_at(3)}}, - null_at(3)} + ListsCol{{1, 2, 3, null}, null_at(3)}, + ListsCol{} /*NULL*/}, + null_at({3, 6})} .release(); auto const col2 = ListsCol{{ListsCol{{10, 11, 12, null}, null_at(3)}, ListsCol{{13, 14, 15, 16, 17, null}, null_at(5)}, ListsCol{} /*NULL*/, ListsCol{{null, 18}, null_at(0)}, ListsCol{{19, 20, null}, null_at(2)}, - ListsCol{{null}, null_at(0)}}, - null_at(2)} + ListsCol{{null}, null_at(0)}, + ListsCol{} /*NULL*/}, + null_at({2, 6})} .release(); auto const col3 = ListsCol{{ListsCol{} /*NULL*/, ListsCol{{20, null}, null_at(1)}, ListsCol{{null, 21, null, null}, null_at({0, 2, 3})}, ListsCol{}, ListsCol{22, 23, 24, 25}, - ListsCol{{null, null, null, null, null}, all_nulls()}}, - null_at(0)} + ListsCol{{null, null, null, null, null}, all_nulls()}, + ListsCol{} /*NULL*/}, + null_at({0, 6})} .release(); // Ignore null list elements @@ -209,13 +212,15 @@ TYPED_TEST(ListConcatenateRowsTypedTest, SimpleInputWithNulls) auto const results = cudf::lists::concatenate_rows(TView{{col1->view(), col2->view(), col3->view()}}); auto const expected = - ListsCol{ListsCol{{1, null, 3, 4, 10, 11, 12, null}, null_at({1, 7})}, - ListsCol{{null, 2, 3, 4, 13, 14, 15, 16, 17, null, 20, null}, null_at({0, 9, 11})}, - 
ListsCol{{null, 2, 3, 4, null, 21, null, null}, null_at({0, 4, 6, 7})}, - ListsCol{{null, 18}, null_at(0)}, - ListsCol{{1, 2, null, 4, 19, 20, null, 22, 23, 24, 25}, null_at({2, 6})}, - ListsCol{{1, 2, 3, null, null, null, null, null, null, null}, - null_at({3, 4, 5, 6, 7, 8, 9})}} + ListsCol{{ListsCol{{1, null, 3, 4, 10, 11, 12, null}, null_at({1, 7})}, + ListsCol{{null, 2, 3, 4, 13, 14, 15, 16, 17, null, 20, null}, null_at({0, 9, 11})}, + ListsCol{{null, 2, 3, 4, null, 21, null, null}, null_at({0, 4, 6, 7})}, + ListsCol{{null, 18}, null_at(0)}, + ListsCol{{1, 2, null, 4, 19, 20, null, 22, 23, 24, 25}, null_at({2, 6})}, + ListsCol{{1, 2, 3, null, null, null, null, null, null, null}, + null_at({3, 4, 5, 6, 7, 8, 9})}, + ListsCol{} /*NULL*/}, + null_at(6)} .release(); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*expected, *results, print_all); } @@ -232,8 +237,9 @@ TYPED_TEST(ListConcatenateRowsTypedTest, SimpleInputWithNulls) ListsCol{} /*NULL*/, ListsCol{{1, 2, null, 4, 19, 20, null, 22, 23, 24, 25}, null_at({2, 6})}, ListsCol{{1, 2, 3, null, null, null, null, null, null, null}, - null_at({3, 4, 5, 6, 7, 8, 9})}}, - null_at({0, 2, 3})} + null_at({3, 4, 5, 6, 7, 8, 9})}, + ListsCol{} /*NULL*/}, + null_at({0, 2, 3, 6})} .release(); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*expected, *results, print_all); } diff --git a/cpp/tests/partitioning/partition_test.cpp b/cpp/tests/partitioning/partition_test.cpp index bdd5e7bc780..669d406d80a 100644 --- a/cpp/tests/partitioning/partition_test.cpp +++ b/cpp/tests/partitioning/partition_test.cpp @@ -310,3 +310,23 @@ TEST_F(PartitionTestNotTyped, ListOfListOfIntEmpty) CUDF_TEST_EXPECT_TABLES_EQUAL(table_to_partition, result.first->view()); EXPECT_EQ(3, result.second.size()); } + +TEST_F(PartitionTestNotTyped, ListOfListOfListOfIntEmpty) +{ + cudf::test::lists_column_wrapper level_3_list{}; + + fixed_width_column_wrapper level_2_offsets{}; + std::unique_ptr level_2_list = + cudf::make_lists_column(0, level_2_offsets.release(), level_3_list.release(), 0, {}); + + fixed_width_column_wrapper level_1_offsets{0, 0}; + std::unique_ptr level_1_list = + cudf::make_lists_column(1, level_1_offsets.release(), std::move(level_2_list), 0, {}); + + auto table_to_partition = cudf::table_view{{*level_1_list}}; + fixed_width_column_wrapper map{0}; + + auto result = cudf::partition(table_to_partition, map, 2); + CUDF_TEST_EXPECT_TABLES_EQUAL(table_to_partition, result.first->view()); + EXPECT_EQ(3, result.second.size()); +} diff --git a/cpp/tests/strings/combine/concatenate_tests.cpp b/cpp/tests/strings/combine/concatenate_tests.cpp index c1c390e8a82..d91f669e42d 100644 --- a/cpp/tests/strings/combine/concatenate_tests.cpp +++ b/cpp/tests/strings/combine/concatenate_tests.cpp @@ -95,6 +95,58 @@ TEST_F(StringsCombineTest, Concatenate) } } +TEST_F(StringsCombineTest, ConcatenateSkipNulls) +{ + cudf::test::strings_column_wrapper strings1({"eee", "", "", "", "aa", "bbb", "รฉรฉรฉ"}, + {1, 0, 0, 1, 1, 1, 1}); + cudf::test::strings_column_wrapper strings2({"xyz", "", "d", "รฉa", "", "", "f"}, + {1, 0, 1, 1, 1, 0, 1}); + cudf::test::strings_column_wrapper strings3({"q", "", "s", "t", "u", "", "w"}, + {1, 1, 1, 1, 1, 0, 1}); + + cudf::table_view table({strings1, strings2, strings3}); + + { + cudf::test::strings_column_wrapper expected( + {"eee+xyz+q", "++", "+d+s", "+รฉa+t", "aa++u", "bbb++", "รฉรฉรฉ+f+w"}); + auto results = cudf::strings::concatenate(table, + cudf::string_scalar("+"), + cudf::string_scalar(""), + cudf::strings::separator_on_nulls::YES); + 
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + } + { + cudf::test::strings_column_wrapper expected( + {"eee+xyz+q", "", "d+s", "+รฉa+t", "aa++u", "bbb", "รฉรฉรฉ+f+w"}); + auto results = cudf::strings::concatenate(table, + cudf::string_scalar("+"), + cudf::string_scalar(""), + cudf::strings::separator_on_nulls::NO); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + } + { + cudf::test::strings_column_wrapper expected( + {"eee+xyz+q", "", "", "+รฉa+t", "aa++u", "", "รฉรฉรฉ+f+w"}, {1, 0, 0, 1, 1, 0, 1}); + auto results = cudf::strings::concatenate(table, + cudf::string_scalar("+"), + cudf::string_scalar("", false), + cudf::strings::separator_on_nulls::NO); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + } + { + cudf::test::strings_column_wrapper sep_col({"+", "-", ".", "@", "*", "^^", "#"}); + auto results = cudf::strings::concatenate(table, + cudf::strings_column_view(sep_col), + cudf::string_scalar(""), + cudf::string_scalar(""), + cudf::strings::separator_on_nulls::NO); + + cudf::test::strings_column_wrapper expected( + {"eee+xyz+q", "", "d.s", "@รฉa@t", "aa**u", "bbb", "รฉรฉรฉ#f#w"}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + } +} + TEST_F(StringsCombineTest, ConcatZeroSizeStringsColumns) { cudf::column_view zero_size_strings_column( @@ -107,6 +159,12 @@ TEST_F(StringsCombineTest, ConcatZeroSizeStringsColumns) cudf::test::expect_strings_empty(results->view()); } +TEST_F(StringsCombineTest, SingleColumnErrorCheck) +{ + cudf::column_view col0(cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); + EXPECT_THROW(cudf::strings::concatenate(cudf::table_view{{col0}}), cudf::logic_error); +} + struct StringsConcatenateWithColSeparatorTest : public cudf::test::BaseFixture { }; @@ -157,7 +215,6 @@ TEST_F(StringsConcatenateWithColSeparatorTest, SingleColumnEmptyAndNullStringsNo auto exp_results = cudf::test::strings_column_wrapper({"", "", "", ""}, {false, true, false, false}); - auto results = cudf::strings::concatenate(cudf::table_view{{col0}}, cudf::strings_column_view(sep_col)); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, true); @@ -295,12 +352,20 @@ TEST_F(StringsConcatenateWithColSeparatorTest, MultiColumnEmptyAndNullStringsNoR auto sep_col = cudf::test::strings_column_wrapper( {"", "", "", "", "", "", "", ""}, {true, false, true, false, true, false, true, false}); - auto exp_results = cudf::test::strings_column_wrapper( - {"", "", "", "", "", "", "", ""}, {false, false, true, false, true, false, true, false}); - + auto exp_results1 = cudf::test::strings_column_wrapper( + {"", "", "", "", "", "", "", ""}, {false, false, true, false, false, false, false, false}); auto results = cudf::strings::concatenate(cudf::table_view{{col0, col1}}, cudf::strings_column_view(sep_col)); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results1, true); + + auto exp_results2 = cudf::test::strings_column_wrapper( + {"", "", "", "", "", "", "", ""}, {true, false, true, false, true, false, true, false}); + results = cudf::strings::concatenate(cudf::table_view{{col0, col1}}, + cudf::strings_column_view(sep_col), + cudf::string_scalar("", false), + cudf::string_scalar(""), + cudf::strings::separator_on_nulls::NO); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results2, true); } TEST_F(StringsConcatenateWithColSeparatorTest, MultiColumnStringMixNoReplacements) @@ -315,13 +380,23 @@ TEST_F(StringsConcatenateWithColSeparatorTest, MultiColumnStringMixNoReplacement {"", "~~~", "", "@", 
"", "", "", "^^^^", "", "--", "*****", "######"}, {true, true, false, true, false, true, false, true, true, true, true, true}); - auto exp_results = cudf::test::strings_column_wrapper( - {"eeexyzfoo", "~~~", "", "รฉรฉรฉf", "", "", "", "valid", "doo", "", "", ""}, - {true, true, false, true, false, true, false, true, true, false, false, false}); + auto exp_results1 = cudf::test::strings_column_wrapper( + {"eeexyzfoo", "~~~", "", "", "", "", "", "", "", "", "", ""}, + {true, true, false, false, false, false, false, false, false, false, false, false}); auto results = cudf::strings::concatenate(cudf::table_view{{col0, col1}}, cudf::strings_column_view(sep_col)); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results1, true); + + auto exp_results2 = cudf::test::strings_column_wrapper( + {"eeexyzfoo", "~~~", "", "รฉรฉรฉf", "", "", "", "valid", "doo", "", "", ""}, + {true, true, false, true, false, true, false, true, true, true, true, true}); + results = cudf::strings::concatenate(cudf::table_view{{col0, col1}}, + cudf::strings_column_view(sep_col), + cudf::string_scalar("", false), + cudf::string_scalar(""), + cudf::strings::separator_on_nulls::NO); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results2, true); } TEST_F(StringsConcatenateWithColSeparatorTest, MultiColumnStringMixSeparatorReplacement) @@ -335,26 +410,26 @@ TEST_F(StringsConcatenateWithColSeparatorTest, MultiColumnStringMixSeparatorRepl auto sep_col = cudf::test::strings_column_wrapper( {"", "~~~", "", "@", "", "", "", "^^^^", "", "--", "*****", "######"}, {true, true, false, true, false, true, false, true, true, true, true, true}); - auto sep_rep = cudf::string_scalar("!!!!!!!!!!"); + auto sep_rep = cudf::string_scalar("!!!!!!!"); - auto exp_results = cudf::test::strings_column_wrapper( - {"eeexyzfoo", - "~~~", - "!!!!!!!!!!รฉaff", - "รฉรฉรฉf", - "รฉa", - "", - "รฉaff", - "valid", - "doo", - "", - "", - ""}, - {true, true, true, true, true, true, true, true, true, false, false, false}); + auto exp_results1 = cudf::test::strings_column_wrapper( + {"eeexyzfoo", "~~~", "!!!!!!!รฉaff", "รฉรฉรฉf", "รฉa", "", "รฉaff", "valid", "doo", "", "", ""}, + {true, true, true, false, false, false, false, false, false, false, false, false}); auto results = cudf::strings::concatenate( cudf::table_view{{col0, col1}}, cudf::strings_column_view(sep_col), sep_rep); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results1, true); + + auto exp_results2 = cudf::test::strings_column_wrapper( + {"eeexyzfoo", "~~~", "!!!!!!!รฉaff", "รฉรฉรฉf", "รฉa", "", "รฉaff", "valid", "doo", "", "", ""}, + {true, true, true, true, true, true, true, true, true, true, true, true}); + + results = cudf::strings::concatenate(cudf::table_view{{col0, col1}}, + cudf::strings_column_view(sep_col), + sep_rep, + cudf::string_scalar(""), + cudf::strings::separator_on_nulls::NO); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results2, true); } TEST_F(StringsConcatenateWithColSeparatorTest, MultiColumnStringMixColumnReplacement) diff --git a/cpp/tests/strings/combine/concatenate_list_elements_tests.cpp b/cpp/tests/strings/combine/join_list_elements_tests.cpp similarity index 82% rename from cpp/tests/strings/combine/concatenate_list_elements_tests.cpp rename to cpp/tests/strings/combine/join_list_elements_tests.cpp index b6afd588dfb..e2f7c3e36a2 100644 --- a/cpp/tests/strings/combine/concatenate_list_elements_tests.cpp +++ 
b/cpp/tests/strings/combine/join_list_elements_tests.cpp @@ -58,7 +58,7 @@ TEST_F(StringsListsConcatenateTest, InvalidInput) { auto const string_lists = INT_LISTS{{1, 2, 3}, {4, 5, 6}}.release(); auto const string_lv = cudf::lists_column_view(string_lists->view()); - EXPECT_THROW(cudf::strings::concatenate_list_elements(string_lv), cudf::logic_error); + EXPECT_THROW(cudf::strings::join_list_elements(string_lv), cudf::logic_error); } // Invalid scalar separator @@ -66,9 +66,8 @@ TEST_F(StringsListsConcatenateTest, InvalidInput) auto const string_lists = STR_LISTS{STR_LISTS{""}, STR_LISTS{"", "", ""}, STR_LISTS{"", ""}}.release(); auto const string_lv = cudf::lists_column_view(string_lists->view()); - EXPECT_THROW( - cudf::strings::concatenate_list_elements(string_lv, cudf::string_scalar("", false)), - cudf::logic_error); + EXPECT_THROW(cudf::strings::join_list_elements(string_lv, cudf::string_scalar("", false)), + cudf::logic_error); } // Invalid column separators @@ -77,7 +76,7 @@ TEST_F(StringsListsConcatenateTest, InvalidInput) STR_LISTS{STR_LISTS{""}, STR_LISTS{"", "", ""}, STR_LISTS{"", ""}}.release(); auto const string_lv = cudf::lists_column_view(string_lists->view()); auto const separators = STR_COL{"+++"}.release(); // size doesn't match with lists column size - EXPECT_THROW(cudf::strings::concatenate_list_elements(string_lv, separators->view()), + EXPECT_THROW(cudf::strings::join_list_elements(string_lv, separators->view()), cudf::logic_error); } } @@ -87,26 +86,26 @@ TEST_F(StringsListsConcatenateTest, EmptyInput) auto const string_lists = STR_LISTS{}.release(); auto const string_lv = cudf::lists_column_view(string_lists->view()); auto const expected = STR_COL{}; - auto results = cudf::strings::concatenate_list_elements(string_lv); + auto results = cudf::strings::join_list_elements(string_lv); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); auto const separators = STR_COL{}.release(); - results = cudf::strings::concatenate_list_elements(string_lv, separators->view()); + results = cudf::strings::join_list_elements(string_lv, separators->view()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); } TEST_F(StringsListsConcatenateTest, ZeroSizeStringsInput) { auto const string_lists = - STR_LISTS{STR_LISTS{""}, STR_LISTS{"", "", ""}, STR_LISTS{"", ""}}.release(); + STR_LISTS{STR_LISTS{""}, STR_LISTS{"", "", ""}, STR_LISTS{"", ""}, STR_LISTS{}}.release(); auto const string_lv = cudf::lists_column_view(string_lists->view()); - auto const expected = STR_COL{"", "", ""}; + auto const expected = STR_COL{"", "", "", ""}; - auto results = cudf::strings::concatenate_list_elements(string_lv); + auto results = cudf::strings::join_list_elements(string_lv); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); - auto const separators = STR_COL{"", "", ""}.release(); - results = cudf::strings::concatenate_list_elements(string_lv, separators->view()); + auto const separators = STR_COL{"", "", "", ""}.release(); + results = cudf::strings::join_list_elements(string_lv, separators->view()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); } @@ -120,29 +119,35 @@ TEST_F(StringsListsConcatenateTest, AllNullsStringsInput) auto const string_lv = cudf::lists_column_view(string_lists->view()); auto const expected = STR_COL{{"", "", ""}, all_nulls()}; - auto results = cudf::strings::concatenate_list_elements(string_lv); + auto results = cudf::strings::join_list_elements(string_lv); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); auto const 
separators = STR_COL{{"", "", ""}, all_nulls()}.release(); - results = cudf::strings::concatenate_list_elements(string_lv, separators->view()); + results = cudf::strings::join_list_elements(string_lv, separators->view()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); } +auto null_at(std::initializer_list indices) +{ + return cudf::detail::make_counting_transform_iterator( + 0, [indices](auto i) { return std::find(indices.begin(), indices.end(), i) == indices.end(); }); +} + TEST_F(StringsListsConcatenateTest, ScalarSeparator) { auto const string_lists = STR_LISTS{{STR_LISTS{{"a", "bb" /*NULL*/, "ccc"}, null_at(1)}, STR_LISTS{}, /*NULL*/ STR_LISTS{{"ddd" /*NULL*/, "efgh", "ijk"}, null_at(0)}, - STR_LISTS{"zzz", "xxxxx"}}, + STR_LISTS{"zzz", "xxxxx"}, + STR_LISTS{{"v", "", "", "w"}, null_at({1, 2})}}, null_at(1)} .release(); auto const string_lv = cudf::lists_column_view(string_lists->view()); // No null replacement { - auto const results = - cudf::strings::concatenate_list_elements(string_lv, cudf::string_scalar("+++")); - std::vector h_expected{nullptr, nullptr, nullptr, "zzz+++xxxxx"}; + auto const results = cudf::strings::join_list_elements(string_lv, cudf::string_scalar("+++")); + std::vector h_expected{nullptr, nullptr, nullptr, "zzz+++xxxxx", nullptr}; auto const expected = STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); @@ -150,10 +155,22 @@ TEST_F(StringsListsConcatenateTest, ScalarSeparator) // With null replacement { - auto const results = cudf::strings::concatenate_list_elements( + auto const results = cudf::strings::join_list_elements( string_lv, cudf::string_scalar("+++"), cudf::string_scalar("___")); std::vector h_expected{ - "a+++___+++ccc", nullptr, "___+++efgh+++ijk", "zzz+++xxxxx"}; + "a+++___+++ccc", nullptr, "___+++efgh+++ijk", "zzz+++xxxxx", "v+++___+++___+++w"}; + auto const expected = + STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); + } + + // Turn off separator-on-nulls + { + auto const results = cudf::strings::join_list_elements(string_lv, + cudf::string_scalar("+++"), + cudf::string_scalar(""), + cudf::strings::separator_on_nulls::NO); + std::vector h_expected{"a+++ccc", nullptr, "efgh+++ijk", "zzz+++xxxxx", "v+++w"}; auto const expected = STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); @@ -181,8 +198,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithScalarSeparator) // Sliced the entire lists column, no null replacement { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {0, 11})[0]); - auto const results = - cudf::strings::concatenate_list_elements(string_lv, cudf::string_scalar("+++")); + auto const results = cudf::strings::join_list_elements(string_lv, cudf::string_scalar("+++")); std::vector h_expected{nullptr, nullptr, nullptr, @@ -202,7 +218,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithScalarSeparator) // Sliced the entire lists column, with null replacement { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {0, 11})[0]); - auto const results = cudf::strings::concatenate_list_elements( + auto const results = cudf::strings::join_list_elements( string_lv, cudf::string_scalar("+++"), cudf::string_scalar("___")); std::vector h_expected{"a+++___+++ccc", nullptr, @@ -223,8 +239,7 @@ 
TEST_F(StringsListsConcatenateTest, SlicedListsWithScalarSeparator) // Sliced the first half of the lists column, no null replacement { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {0, 4})[0]); - auto const results = - cudf::strings::concatenate_list_elements(string_lv, cudf::string_scalar("+++")); + auto const results = cudf::strings::join_list_elements(string_lv, cudf::string_scalar("+++")); std::vector h_expected{nullptr, nullptr, nullptr, "zzz+++xxxxx"}; auto const expected = STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; @@ -234,7 +249,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithScalarSeparator) // Sliced the first half of the lists column, with null replacement { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {0, 4})[0]); - auto const results = cudf::strings::concatenate_list_elements( + auto const results = cudf::strings::join_list_elements( string_lv, cudf::string_scalar("+++"), cudf::string_scalar("___")); std::vector h_expected{ "a+++___+++ccc", nullptr, "___+++efgh+++ijk", "zzz+++xxxxx"}; @@ -246,8 +261,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithScalarSeparator) // Sliced the second half of the lists column, no null replacement { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {5, 11})[0]); - auto const results = - cudf::strings::concatenate_list_elements(string_lv, cudf::string_scalar("+++")); + auto const results = cudf::strings::join_list_elements(string_lv, cudf::string_scalar("+++")); std::vector h_expected{ nullptr, nullptr, "0a0b0c+++5x5y5z", nullptr, "รฉรฉรฉ+++12345abcdef", "aaaรฉรฉรฉbbbรฉรฉรฉccc+++12345"}; auto const expected = @@ -258,7 +272,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithScalarSeparator) // Sliced the second half of the lists column, with null replacement { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {5, 11})[0]); - auto const results = cudf::strings::concatenate_list_elements( + auto const results = cudf::strings::join_list_elements( string_lv, cudf::string_scalar("+++"), cudf::string_scalar("___")); std::vector h_expected{"abcdef+++012345+++___+++xxx000", "___+++11111+++00000", @@ -274,8 +288,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithScalarSeparator) // Sliced the middle part of the lists column, no null replacement { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {3, 8})[0]); - auto const results = - cudf::strings::concatenate_list_elements(string_lv, cudf::string_scalar("+++")); + auto const results = cudf::strings::join_list_elements(string_lv, cudf::string_scalar("+++")); std::vector h_expected{ "zzz+++xxxxx", nullptr, nullptr, nullptr, "0a0b0c+++5x5y5z"}; auto const expected = @@ -286,7 +299,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithScalarSeparator) // Sliced the middle part of the lists column, with null replacement { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {3, 8})[0]); - auto const results = cudf::strings::concatenate_list_elements( + auto const results = cudf::strings::join_list_elements( string_lv, cudf::string_scalar("+++"), cudf::string_scalar("___")); std::vector h_expected{"zzz+++xxxxx", nullptr, @@ -318,7 +331,7 @@ TEST_F(StringsListsConcatenateTest, ColumnSeparators) // No null replacement { - auto const results = cudf::strings::concatenate_list_elements(string_lv, separators->view()); + auto const results = 
cudf::strings::join_list_elements(string_lv, separators->view()); std::vector h_expected{nullptr, nullptr, nullptr, nullptr, nullptr, "zzz^^^xxxxx"}; auto const expected = STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; @@ -327,8 +340,8 @@ TEST_F(StringsListsConcatenateTest, ColumnSeparators) // With null replacement for separators { - auto const results = cudf::strings::concatenate_list_elements( - string_lv, separators->view(), cudf::string_scalar("|||")); + auto const results = + cudf::strings::join_list_elements(string_lv, separators->view(), cudf::string_scalar("|||")); std::vector h_expected{ nullptr, nullptr, "0a0b0c|||xyzรฉรฉรฉ", nullptr, nullptr, "zzz^^^xxxxx"}; auto const expected = @@ -338,7 +351,7 @@ TEST_F(StringsListsConcatenateTest, ColumnSeparators) // With null replacement for strings { - auto const results = cudf::strings::concatenate_list_elements( + auto const results = cudf::strings::join_list_elements( string_lv, separators->view(), cudf::string_scalar("", false), cudf::string_scalar("XXXXX")); std::vector h_expected{ "a+++XXXXX+++ccc", nullptr, nullptr, nullptr, "XXXXX%%%รกรกรก%%%รญรญรญ", "zzz^^^xxxxx"}; @@ -349,7 +362,7 @@ TEST_F(StringsListsConcatenateTest, ColumnSeparators) // With null replacement for both separators and strings { - auto const results = cudf::strings::concatenate_list_elements( + auto const results = cudf::strings::join_list_elements( string_lv, separators->view(), cudf::string_scalar("|||"), cudf::string_scalar("XXXXX")); std::vector h_expected{"a+++XXXXX+++ccc", nullptr, @@ -361,6 +374,20 @@ TEST_F(StringsListsConcatenateTest, ColumnSeparators) STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); } + + // Turn off separator-on-nulls + { + auto const results = cudf::strings::join_list_elements(string_lv, + separators->view(), + cudf::string_scalar("+++"), + cudf::string_scalar(""), + cudf::strings::separator_on_nulls::NO); + std::vector h_expected{ + "a+++ccc", nullptr, "0a0b0c+++xyzรฉรฉรฉ", "efgh+++ijk", "รกรกรก%%%รญรญรญ", "zzz^^^xxxxx"}; + auto const expected = + STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); + } } TEST_F(StringsListsConcatenateTest, SlicedListsWithColumnSeparators) @@ -390,7 +417,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithColumnSeparators) { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {0, 11})[0]); auto const sep_col = cudf::strings_column_view(cudf::slice(separators->view(), {0, 11})[0]); - auto const results = cudf::strings::concatenate_list_elements(string_lv, sep_col); + auto const results = cudf::strings::join_list_elements(string_lv, sep_col); std::vector h_expected{nullptr, nullptr, nullptr, @@ -411,7 +438,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithColumnSeparators) { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {0, 11})[0]); auto const sep_col = cudf::strings_column_view(cudf::slice(separators->view(), {0, 11})[0]); - auto const results = cudf::strings::concatenate_list_elements( + auto const results = cudf::strings::join_list_elements( string_lv, sep_col, cudf::string_scalar("|||"), cudf::string_scalar("___")); std::vector h_expected{"a+++___+++ccc", nullptr, @@ -433,7 +460,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithColumnSeparators) { auto const string_lv = 
cudf::lists_column_view(cudf::slice(string_lists->view(), {0, 4})[0]); auto const sep_col = cudf::strings_column_view(cudf::slice(separators->view(), {0, 4})[0]); - auto const results = cudf::strings::concatenate_list_elements(string_lv, sep_col); + auto const results = cudf::strings::join_list_elements(string_lv, sep_col); std::vector h_expected{nullptr, nullptr, nullptr, nullptr}; auto const expected = STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; @@ -444,7 +471,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithColumnSeparators) { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {0, 4})[0]); auto const sep_col = cudf::strings_column_view(cudf::slice(separators->view(), {0, 4})[0]); - auto const results = cudf::strings::concatenate_list_elements( + auto const results = cudf::strings::join_list_elements( string_lv, sep_col, cudf::string_scalar("|||"), cudf::string_scalar("___")); std::vector h_expected{ "a+++___+++ccc", nullptr, "___|||efgh|||ijk", "zzz|||xxxxx"}; @@ -457,7 +484,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithColumnSeparators) { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {5, 11})[0]); auto const sep_col = cudf::strings_column_view(cudf::slice(separators->view(), {5, 11})[0]); - auto const results = cudf::strings::concatenate_list_elements(string_lv, sep_col); + auto const results = cudf::strings::join_list_elements(string_lv, sep_col); std::vector h_expected{ nullptr, nullptr, "0a0b0c###5x5y5z", nullptr, "รฉรฉรฉ-+-12345abcdef", "aaaรฉรฉรฉbbbรฉรฉรฉccc=+=12345"}; auto const expected = @@ -469,7 +496,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithColumnSeparators) { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {5, 11})[0]); auto const sep_col = cudf::strings_column_view(cudf::slice(separators->view(), {5, 11})[0]); - auto const results = cudf::strings::concatenate_list_elements( + auto const results = cudf::strings::join_list_elements( string_lv, sep_col, cudf::string_scalar("|||"), cudf::string_scalar("___")); std::vector h_expected{"abcdef^^^012345^^^___^^^xxx000", "___~!~11111~!~00000", @@ -486,7 +513,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithColumnSeparators) { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {3, 8})[0]); auto const sep_col = cudf::strings_column_view(cudf::slice(separators->view(), {3, 8})[0]); - auto const results = cudf::strings::concatenate_list_elements(string_lv, sep_col); + auto const results = cudf::strings::join_list_elements(string_lv, sep_col); std::vector h_expected{nullptr, nullptr, nullptr, nullptr, "0a0b0c###5x5y5z"}; auto const expected = STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; @@ -497,7 +524,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithColumnSeparators) { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {3, 8})[0]); auto const sep_col = cudf::strings_column_view(cudf::slice(separators->view(), {3, 8})[0]); - auto const results = cudf::strings::concatenate_list_elements( + auto const results = cudf::strings::join_list_elements( string_lv, sep_col, cudf::string_scalar("|||"), cudf::string_scalar("___")); std::vector h_expected{"zzz|||xxxxx", nullptr, diff --git a/docs/cudf/source/io-gds-integration.rst b/docs/cudf/source/io-gds-integration.rst new file mode 100644 index 00000000000..9ccf773b2e4 --- /dev/null +++ b/docs/cudf/source/io-gds-integration.rst @@ 
-0,0 +1,22 @@ +GPUDirect Storage Integration +============================= + +Many IO APIs can use the GPUDirect Storage (GDS) library to optimize IO operations. +GDS enables a direct data path for direct memory access (DMA) transfers between GPU memory and storage, which avoids a bounce buffer through the CPU. +GDS also has a compatibility mode that allows the library to fall back to copying through a CPU bounce buffer. +The SDK is available for download `here `_. + +Use of GPUDirect Storage in cuDF is disabled by default, and can be enabled through the environment variable ``LIBCUDF_CUFILE_POLICY``. +This variable also controls the GDS compatibility mode. There are two special values for the environment variable: + +- "GDS": Use of GDS is enabled; GDS compatibility mode is *off*. +- "ALWAYS": Use of GDS is enabled; GDS compatibility mode is *on*. + +Any other value (or no value set) will keep GDS disabled for use in cuDF, and IO will be done using cuDF's CPU bounce buffers. + +This environment variable also affects how cuDF treats GDS errors. +When ``LIBCUDF_CUFILE_POLICY`` is set to "GDS" and a GDS API call fails for any reason, cuDF falls back to the internal implementation with bounce buffers. +When ``LIBCUDF_CUFILE_POLICY`` is set to "ALWAYS" and a GDS API call fails for any reason (unlikely, given that the compatibility mode is on), +cuDF throws an exception to propagate the error to the user. + +NOTE: The current GDS integration is not fully optimized and enabling GDS will not lead to performance improvements in all cases. \ No newline at end of file diff --git a/docs/cudf/source/io.rst b/docs/cudf/source/io.rst index 5186473ae10..e88162d8f52 100644 --- a/docs/cudf/source/io.rst +++ b/docs/cudf/source/io.rst @@ -8,4 +8,5 @@ This page contains Input / Output related APIs in cuDF. :maxdepth: 2 :caption: Contents: - io-supported-types.rst \ No newline at end of file + io-supported-types.rst + io-gds-integration.rst \ No newline at end of file diff --git a/java/ci/README.md b/java/ci/README.md index 458a76bcd04..968ce279a2c 100644 --- a/java/ci/README.md +++ b/java/ci/README.md @@ -49,5 +49,5 @@ scl enable devtoolset-9 "java/ci/build-in-docker.sh" ### The output -You can find the cuDF jar in java/target/ like cudf-21.06-SNAPSHOT-cuda11.jar. +You can find the cuDF jar in java/target/ like cudf-21.06.0-SNAPSHOT-cuda11.jar. diff --git a/java/pom.xml b/java/pom.xml index cec20ec04af..fe2d9a453f7 100755 --- a/java/pom.xml +++ b/java/pom.xml @@ -21,7 +21,7 @@ ai.rapids cudf - 21.06-SNAPSHOT + 21.06.0-SNAPSHOT cudfjni diff --git a/java/src/main/java/ai/rapids/cudf/ColumnVector.java b/java/src/main/java/ai/rapids/cudf/ColumnVector.java index ea93a2daf36..a7e589ac890 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnVector.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnVector.java @@ -1276,6 +1276,16 @@ public static ColumnVector fromStrings(String... values) { } } + /** + * Create a new string vector from the given values. This API + * supports inline nulls. + */ + public static ColumnVector fromUTF8Strings(byte[]... values) { + try (HostColumnVector host = HostColumnVector.fromUTF8Strings(values)) { + return host.copyToDevice(); + } + } + /** + * Create a new vector from the given values. This API supports inline nulls, + * but is much slower than building from primitive array of unscaledValues.
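The new ``io-gds-integration.rst`` page above describes behaviour that is controlled entirely by the ``LIBCUDF_CUFILE_POLICY`` environment variable, with no libcudf API changes. The following is a minimal sketch, not part of this patch, of how an application might opt in; it assumes a Linux/POSIX environment, an installed cuFile SDK, and a placeholder file name:

#include <cudf/io/parquet.hpp>
#include <cudf/table/table.hpp>

#include <cstdlib>

int main()
{
  // "GDS"    -> GDS enabled, compatibility mode off.
  // "ALWAYS" -> GDS enabled, compatibility mode on.
  // Anything else (or unset) keeps cuDF on its CPU bounce-buffer path.
  setenv("LIBCUDF_CUFILE_POLICY", "GDS", 1);  // POSIX call; set before the first IO operation

  // A plain Parquet read; libcudf's IO layer decides internally whether to use cuFile/GDS.
  auto const options =
    cudf::io::parquet_reader_options::builder(cudf::io::source_info{"example.parquet"}).build();
  auto const result = cudf::io::read_parquet(options);

  return result.tbl->num_columns() > 0 ? 0 : 1;
}

Using "ALWAYS" instead keeps compatibility mode on, so IO still succeeds on systems where GDS cannot engage, at the cost of a CPU bounce buffer.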
diff --git a/java/src/main/java/ai/rapids/cudf/CuFile.java b/java/src/main/java/ai/rapids/cudf/CuFile.java index 00c9cdb9fd5..4baad834570 100644 --- a/java/src/main/java/ai/rapids/cudf/CuFile.java +++ b/java/src/main/java/ai/rapids/cudf/CuFile.java @@ -78,11 +78,25 @@ public static boolean libraryLoaded() { * @param path The file path to copy to. * @param file_offset The file offset from which to write the buffer. * @param buffer The device buffer to copy from. - * @return The file offset from which the buffer was appended. */ public static void writeDeviceBufferToFile(File path, long file_offset, BaseDeviceMemoryBuffer buffer) { - writeToFile(path.getAbsolutePath(), file_offset, buffer.getAddress(), buffer.getLength()); + writeDeviceMemoryToFile(path, file_offset, buffer.getAddress(), buffer.getLength()); + } + + /** + * Write device memory to a given file path synchronously. + *

+ * This method is NOT thread safe if the path points to the same file on disk. + * + * @param path The file path to copy to. + * @param file_offset The file offset from which to write the buffer. + * @param address The device memory address to copy from. + * @param length The length to copy. + */ + public static void writeDeviceMemoryToFile(File path, long file_offset, long address, + long length) { + writeToFile(path.getAbsolutePath(), file_offset, address, length); } /** @@ -95,7 +109,21 @@ public static void writeDeviceBufferToFile(File path, long file_offset, * @return The file offset from which the buffer was appended. */ public static long appendDeviceBufferToFile(File path, BaseDeviceMemoryBuffer buffer) { - return appendToFile(path.getAbsolutePath(), buffer.getAddress(), buffer.getLength()); + return appendDeviceMemoryToFile(path, buffer.getAddress(), buffer.getLength()); + } + + /** + * Append device memory to a given file path synchronously. + *

+ * This method is NOT thread safe if the path points to the same file on disk. + * + * @param path The file path to copy to. + * @param address The device memory address to copy from. + * @param length The length to copy. + * @return The file offset from which the buffer was appended. + */ + public static long appendDeviceMemoryToFile(File path, long address, long length) { + return appendToFile(path.getAbsolutePath(), address, length); } /** @@ -109,7 +137,21 @@ public static long appendDeviceBufferToFile(File path, BaseDeviceMemoryBuffer bu */ public static void readFileToDeviceBuffer(BaseDeviceMemoryBuffer buffer, File path, long fileOffset) { - readFromFile(buffer.getAddress(), buffer.getLength(), path.getAbsolutePath(), fileOffset); + readFileToDeviceMemory(buffer.getAddress(), buffer.getLength(), path, fileOffset); + } + + /** + * Read a file into device memory synchronously. + *

+ * This method is NOT thread safe if the path points to the same file on disk. + * + * @param address The device memory address to read into. + * @param length The length to read. + * @param path The file path to copy from. + * @param fileOffset The file offset from which to copy the content. + */ + public static void readFileToDeviceMemory(long address, long length, File path, long fileOffset) { + readFromFile(address, length, path.getAbsolutePath(), fileOffset); } private static native void writeToFile(String path, long file_offset, long address, long length); diff --git a/java/src/main/java/ai/rapids/cudf/HostColumnVector.java b/java/src/main/java/ai/rapids/cudf/HostColumnVector.java index 846bcb3b635..46255428c1c 100644 --- a/java/src/main/java/ai/rapids/cudf/HostColumnVector.java +++ b/java/src/main/java/ai/rapids/cudf/HostColumnVector.java @@ -29,6 +29,7 @@ import java.util.Objects; import java.util.Optional; import java.util.StringJoiner; +import java.util.function.BiConsumer; import java.util.function.Consumer; /** @@ -577,6 +578,40 @@ public static HostColumnVector fromStrings(String... values) { }); } + /** + * Create a new string vector from the given values. This API + * supports inline nulls. + */ + public static HostColumnVector fromUTF8Strings(byte[]... values) { + int rows = values.length; + long nullCount = 0; + long bufferSize = 0; + // How many bytes do we need to hold the data. + for (byte[] s: values) { + if (s == null) { + nullCount++; + } else { + bufferSize += s.length; + } + } + + BiConsumer appendUTF8 = nullCount == 0 ? + (b, s) -> b.appendUTF8String(s) : + (b, s) -> { + if (s == null) { + b.appendNull(); + } else { + b.appendUTF8String(s); + } + }; + + return build(rows, bufferSize, (b) -> { + for (byte[] s: values) { + appendUTF8.accept(b, s); + } + }); + } + /** * Create a new vector from the given values. This API supports inline nulls, * but is much slower than building from primitive array of unscaledValues. @@ -1085,9 +1120,11 @@ private void appendChildOrNull(ColumnBuilder childBuilder, Object listElement) { } else if (listElement instanceof BigDecimal) { childBuilder.append((BigDecimal) listElement); } else if (listElement instanceof List) { - childBuilder.append((List) listElement); + childBuilder.append((List) listElement); } else if (listElement instanceof StructData) { childBuilder.append((StructData) listElement); + } else if (listElement instanceof byte[]) { + childBuilder.appendUTF8String((byte[]) listElement); } else { throw new IllegalStateException("Unexpected element type: " + listElement.getClass()); } diff --git a/java/src/main/java/ai/rapids/cudf/Rmm.java b/java/src/main/java/ai/rapids/cudf/Rmm.java index 8d63d2aeefc..97813182deb 100755 --- a/java/src/main/java/ai/rapids/cudf/Rmm.java +++ b/java/src/main/java/ai/rapids/cudf/Rmm.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -173,6 +173,36 @@ public static synchronized void initialize(int allocationMode, LogConf logConf, */ public static synchronized void initialize(int allocationMode, LogConf logConf, long poolSize, long maxPoolSize) throws RmmException { + initialize(allocationMode, logConf, poolSize, maxPoolSize, 0, 0); + } + + /** + * Initialize memory manager state and storage. This will always initialize + * the CUDA context for the calling thread if it is not already set. 
The + * caller is responsible for setting the desired CUDA device prior to this + * call if a specific device is already set. + *

NOTE: All cudf methods will set the chosen CUDA device in the CUDA + * context of the calling thread after this returns. + * @param allocationMode Allocation strategy to use. Bit set using + * {@link RmmAllocationMode#CUDA_DEFAULT}, + * {@link RmmAllocationMode#POOL}, + * {@link RmmAllocationMode#ARENA} and + * {@link RmmAllocationMode#CUDA_MANAGED_MEMORY} + * @param logConf How to do logging or null if you don't want to + * @param poolSize The initial pool size in bytes + * @param maxPoolSize The maximum size the pool is allowed to grow. If the specified value + * is <= 0 then the pool size will not be artificially limited. + * @param allocationAlignment The size to which allocations are aligned. + * @param alignmentThreshold Only allocations with size larger than or equal to this threshold + * are aligned with `allocationAlignment`. + * @throws IllegalStateException if RMM has already been initialized + * @throws IllegalArgumentException if a max pool size is specified but the allocation mode + * is not {@link RmmAllocationMode#POOL} or + * {@link RmmAllocationMode#ARENA}, or the maximum pool size is + * below the initial size. + */ + public static synchronized void initialize(int allocationMode, LogConf logConf, long poolSize, + long maxPoolSize, long allocationAlignment, long alignmentThreshold) throws RmmException { if (initialized) { throw new IllegalStateException("RMM is already initialized"); } @@ -195,7 +225,8 @@ public static synchronized void initialize(int allocationMode, LogConf logConf, loc = logConf.loc; } - initializeInternal(allocationMode, loc.internalId, path, poolSize, maxPoolSize); + initializeInternal(allocationMode, loc.internalId, path, poolSize, maxPoolSize, + allocationAlignment, alignmentThreshold); MemoryCleaner.setDefaultGpu(Cuda.getDevice()); initialized = true; } @@ -241,7 +272,8 @@ private static long[] sortThresholds(long[] thresholds) { } private static native void initializeInternal(int allocationMode, int logTo, String path, - long poolSize, long maxPoolSize) throws RmmException; + long poolSize, long maxPoolSize, long allocationAlignment, long alignmentThreshold) + throws RmmException; /** * Shut down any initialized RMM instance. This should be used very rarely. It does not need to diff --git a/java/src/main/java/ai/rapids/cudf/Scalar.java b/java/src/main/java/ai/rapids/cudf/Scalar.java index 62dd9bda13b..7794b57c3f9 100644 --- a/java/src/main/java/ai/rapids/cudf/Scalar.java +++ b/java/src/main/java/ai/rapids/cudf/Scalar.java @@ -329,10 +329,19 @@ public static Scalar timestampFromLong(DType type, Long value) { } public static Scalar fromString(String value) { + return fromUTF8String(value == null ? null : value.getBytes(StandardCharsets.UTF_8)); + } + + /** + * Creates a String scalar from an array of UTF8 bytes. 
+ * @param value the array of UTF8 bytes + * @return a String scalar + */ + public static Scalar fromUTF8String(byte[] value) { if (value == null) { return fromNull(DType.STRING); } - return new Scalar(DType.STRING, makeStringScalar(value.getBytes(StandardCharsets.UTF_8), true)); + return new Scalar(DType.STRING, makeStringScalar(value, true)); } /** diff --git a/java/src/main/native/src/ColumnVectorJni.cpp b/java/src/main/native/src/ColumnVectorJni.cpp index 85bbdd41b4a..2953a6221e8 100644 --- a/java/src/main/native/src/ColumnVectorJni.cpp +++ b/java/src/main/native/src/ColumnVectorJni.cpp @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include #include @@ -220,49 +220,14 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_makeList(JNIEnv *env, j JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_fromScalar(JNIEnv *env, jclass, jlong j_scalar, jint row_count) { - using ScalarType = cudf::scalar_type_t; JNI_NULL_CHECK(env, j_scalar, "scalar is null", 0); try { cudf::jni::auto_set_device(env); auto scalar_val = reinterpret_cast(j_scalar); - auto dtype = scalar_val->type(); - cudf::mask_state mask_state = - scalar_val->is_valid() ? cudf::mask_state::UNALLOCATED : cudf::mask_state::ALL_NULL; std::unique_ptr col; - if (dtype.id() == cudf::type_id::LIST) { - // Neither 'cudf::make_empty_column' nor 'cudf::make_column_from_scalar' supports - // LIST type for now (https://github.com/rapidsai/cudf/issues/8088), so the list - // precedes the others and takes care of the empty column itself. - auto s_list = reinterpret_cast(scalar_val); - cudf::column_view s_val = s_list->view(); - - // Offsets: [0, list_size, list_size*2, ..., list_szie*row_count] - auto zero = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32)); - auto step = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32)); - zero->set_valid(true); - step->set_valid(true); - static_cast(zero.get())->set_value(0); - static_cast(step.get())->set_value(s_val.size()); - std::unique_ptr offsets = cudf::sequence(row_count + 1, *zero, *step); - // Data: - // Builds the data column by leveraging `cudf::concatenate` to repeat the 's_val' - // 'row_count' times, because 'cudf::make_column_from_scalar' does not support list - // type. - // (Assumes the `row_count` is not big, otherwise there would be a performance issue.) - // Checks the `row_count` because `cudf::concatenate` does not support no rows. - auto data_col = row_count > 0 - ? cudf::concatenate(std::vector(row_count, s_val)) - : cudf::empty_like(s_val); - col = cudf::make_lists_column(row_count, std::move(offsets), std::move(data_col), - cudf::state_null_count(mask_state, row_count), - cudf::create_null_mask(row_count, mask_state)); - } else if (row_count == 0) { - col = cudf::make_empty_column(dtype); - } else if (cudf::is_fixed_width(dtype)) { - col = cudf::make_fixed_width_column(dtype, row_count, mask_state); - auto mut_view = col->mutable_view(); - cudf::fill_in_place(mut_view, 0, row_count, *scalar_val); - } else if (dtype.id() == cudf::type_id::STRING) { + if (scalar_val->type().id() == cudf::type_id::STRING) { + // Tests fail when using the cudf implementation, complaining no child for string column. + // So here take care of the String type itself. 
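+    // (All other scalar types now fall through to the else branch below, which delegates to
+    // cudf::make_column_from_scalar.)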
// create a string column of all empty strings to fill (cheapest string column to create) auto offsets = cudf::make_numeric_column(cudf::data_type{cudf::type_id::INT32}, row_count + 1, cudf::mask_state::UNALLOCATED); @@ -273,7 +238,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_fromScalar(JNIEnv *env, col = cudf::fill(str_col->view(), 0, row_count, *scalar_val); } else { - JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Invalid data type", 0); + col = cudf::make_column_from_scalar(*scalar_val, row_count); } return reinterpret_cast(col.release()); } diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp index 7f11e19fce8..e604fc7dd46 100644 --- a/java/src/main/native/src/RmmJni.cpp +++ b/java/src/main/native/src/RmmJni.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,6 +20,7 @@ #include #include +#include #include #include #include @@ -332,7 +333,9 @@ extern "C" { JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_initializeInternal(JNIEnv *env, jclass clazz, jint allocation_mode, jint log_to, jstring jpath, jlong pool_size, - jlong max_pool_size) { + jlong max_pool_size, + jlong allocation_alignment, + jlong alignment_threshold) { try { // make sure the CUDA device is setup in the context cudaError_t cuda_status = cudaFree(0); @@ -351,13 +354,9 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_initializeInternal(JNIEnv *env, j if (use_managed_mem) { Initialized_resource = rmm::mr::make_owning_wrapper( std::make_shared(), pool_size, pool_limit); - auto wrapped = make_tracking_adaptor(Initialized_resource.get(), RMM_ALLOC_SIZE_ALIGNMENT); - Tracking_memory_resource.reset(wrapped); } else { Initialized_resource = rmm::mr::make_owning_wrapper( std::make_shared(), pool_size, pool_limit); - auto wrapped = make_tracking_adaptor(Initialized_resource.get(), RMM_ALLOC_SIZE_ALIGNMENT); - Tracking_memory_resource.reset(wrapped); } } else if (use_arena_alloc) { std::size_t pool_limit = (max_pool_size > 0) ? 
static_cast(max_pool_size) : @@ -365,23 +364,26 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_initializeInternal(JNIEnv *env, j if (use_managed_mem) { Initialized_resource = rmm::mr::make_owning_wrapper( std::make_shared(), pool_size, pool_limit); - auto wrapped = make_tracking_adaptor(Initialized_resource.get(), RMM_ALLOC_SIZE_ALIGNMENT); - Tracking_memory_resource.reset(wrapped); } else { Initialized_resource = rmm::mr::make_owning_wrapper( std::make_shared(), pool_size, pool_limit); - auto wrapped = make_tracking_adaptor(Initialized_resource.get(), RMM_ALLOC_SIZE_ALIGNMENT); - Tracking_memory_resource.reset(wrapped); } } else if (use_managed_mem) { Initialized_resource = std::make_shared(); - auto wrapped = make_tracking_adaptor(Initialized_resource.get(), RMM_ALLOC_SIZE_ALIGNMENT); - Tracking_memory_resource.reset(wrapped); } else { Initialized_resource = std::make_shared(); - auto wrapped = make_tracking_adaptor(Initialized_resource.get(), RMM_ALLOC_SIZE_ALIGNMENT); - Tracking_memory_resource.reset(wrapped); } + + if (allocation_alignment != 0) { + Initialized_resource = rmm::mr::make_owning_wrapper( + Initialized_resource, allocation_alignment, alignment_threshold); + } + + auto wrapped = make_tracking_adaptor( + Initialized_resource.get(), + std::max(RMM_ALLOC_SIZE_ALIGNMENT, static_cast(allocation_alignment))); + Tracking_memory_resource.reset(wrapped); + auto resource = Tracking_memory_resource.get(); rmm::mr::set_current_device_resource(resource); diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 09ddef633e3..8da70afc6f3 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -29,6 +29,7 @@ import java.math.BigDecimal; import java.math.RoundingMode; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; @@ -176,6 +177,19 @@ void testStringCreation() { } } + @Test + void testUTF8StringCreation() { + try (ColumnVector cv = ColumnVector.fromUTF8Strings( + "d".getBytes(StandardCharsets.UTF_8), + "sd".getBytes(StandardCharsets.UTF_8), + "sde".getBytes(StandardCharsets.UTF_8), + null, + "END".getBytes(StandardCharsets.UTF_8)); + ColumnVector expected = ColumnVector.fromStrings("d", "sd", "sde", null, "END")) { + TableTest.assertColumnsAreEqual(expected, cv); + } + } + @Test void testRefCountLeak() throws InterruptedException { assumeTrue(Boolean.getBoolean("ai.rapids.cudf.flaky-tests-enabled")); @@ -2085,15 +2099,16 @@ void testStringConcatWithNulls() { assertColumnsAreEqual(concat, e_concat); } - try (ColumnVector v = ColumnVector.fromStrings("a", "B", "cd", "\u0480\u0481", "E\tf", - "g\nH", "IJ\"\u0100\u0101\u0500\u0501", - "kl m", "Nop1", "\\qRs2", null, - "3tuV\'", "wX4Yz", "\ud720\ud721"); - Scalar emptyString = Scalar.fromString(""); - Scalar nullSubstitute = Scalar.fromString("NULL"); - ColumnVector concat = ColumnVector.stringConcatenate(emptyString, nullSubstitute, new ColumnView[]{v})) { - assertColumnsAreEqual(v, concat); - } + assertThrows(CudfException.class, () -> { + try (ColumnVector v = ColumnVector.fromStrings("a", "B", "cd", "\u0480\u0481", "E\tf", + "g\nH", "IJ\"\u0100\u0101\u0500\u0501", + "kl m", "Nop1", "\\qRs2", null, + "3tuV\'", "wX4Yz", "\ud720\ud721"); + Scalar emptyString = Scalar.fromString(""); + Scalar nullSubstitute = Scalar.fromString("NULL"); + ColumnVector concat = ColumnVector.stringConcatenate(emptyString, 
nullSubstitute, new ColumnView[]{v})) { + } + }); } @Test diff --git a/java/src/test/java/ai/rapids/cudf/ScalarTest.java b/java/src/test/java/ai/rapids/cudf/ScalarTest.java index b09850bc3d9..a1078f2546b 100644 --- a/java/src/test/java/ai/rapids/cudf/ScalarTest.java +++ b/java/src/test/java/ai/rapids/cudf/ScalarTest.java @@ -27,6 +27,7 @@ import org.junit.jupiter.api.Test; import java.math.BigDecimal; +import java.nio.charset.StandardCharsets; import java.util.Arrays; import static ai.rapids.cudf.TableTest.assertColumnsAreEqual; @@ -244,6 +245,22 @@ public void testString() { } } + @Test + public void testUTF8String() { + try (Scalar s = Scalar.fromUTF8String("TEST".getBytes(StandardCharsets.UTF_8))) { + assertEquals(DType.STRING, s.getType()); + assertTrue(s.isValid()); + assertEquals("TEST", s.getJavaString()); + assertArrayEquals(new byte[]{'T', 'E', 'S', 'T'}, s.getUTF8()); + } + try (Scalar s = Scalar.fromUTF8String("".getBytes(StandardCharsets.UTF_8))) { + assertEquals(DType.STRING, s.getType()); + assertTrue(s.isValid()); + assertEquals("", s.getJavaString()); + assertArrayEquals(new byte[]{}, s.getUTF8()); + } + } + @Test public void testList() { // list of int diff --git a/python/cudf/cudf/_lib/cpp/lists/concatenate_rows.pxd b/python/cudf/cudf/_lib/cpp/lists/combine.pxd similarity index 83% rename from python/cudf/cudf/_lib/cpp/lists/concatenate_rows.pxd rename to python/cudf/cudf/_lib/cpp/lists/combine.pxd index 8c4dabf5168..ea9ade178e2 100644 --- a/python/cudf/cudf/_lib/cpp/lists/concatenate_rows.pxd +++ b/python/cudf/cudf/_lib/cpp/lists/combine.pxd @@ -5,7 +5,7 @@ from libcpp.memory cimport unique_ptr from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.table.table_view cimport table_view -cdef extern from "cudf/lists/concatenate_rows.hpp" namespace \ +cdef extern from "cudf/lists/combine.hpp" namespace \ "cudf::lists" nogil: cdef unique_ptr[column] concatenate_rows( const table_view input_table diff --git a/python/cudf/cudf/_lib/cpp/scalar/scalar.pxd b/python/cudf/cudf/_lib/cpp/scalar/scalar.pxd index fec1c6382e6..de5cb05447c 100644 --- a/python/cudf/cudf/_lib/cpp/scalar/scalar.pxd +++ b/python/cudf/cudf/_lib/cpp/scalar/scalar.pxd @@ -9,6 +9,9 @@ from libcpp.string cimport string from cudf._lib.cpp.types cimport data_type from cudf._lib.cpp.wrappers.decimals cimport scale_type +from cudf._lib.cpp.column.column_view cimport column_view + + cdef extern from "cudf/scalar/scalar.hpp" namespace "cudf" nogil: cdef cppclass scalar: scalar() except + @@ -60,3 +63,6 @@ cdef extern from "cudf/scalar/scalar.hpp" namespace "cudf" nogil: bool is_valid) except + int64_t value() except + # TODO: Figure out how to add an int32 overload of value() + + cdef cppclass list_scalar(scalar): + column_view view() except + diff --git a/python/cudf/cudf/_lib/cpp/strings/combine.pxd b/python/cudf/cudf/_lib/cpp/strings/combine.pxd index 250c6441882..51c706b68d0 100644 --- a/python/cudf/cudf/_lib/cpp/strings/combine.pxd +++ b/python/cudf/cudf/_lib/cpp/strings/combine.pxd @@ -18,13 +18,13 @@ cdef extern from "cudf/strings/combine.hpp" namespace "cudf::strings" nogil: string_scalar separator, string_scalar narep) except + - cdef unique_ptr[column] concatenate_list_elements( + cdef unique_ptr[column] join_list_elements( column_view lists_strings_column, column_view separators, string_scalar separator_narep, string_scalar string_narep) except + - cdef unique_ptr[column] concatenate_list_elements( + cdef unique_ptr[column] join_list_elements( column_view lists_strings_column, string_scalar 
separator, string_scalar narep) except + diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index 46f034dc525..7d8909610dc 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -16,7 +16,7 @@ from cudf._lib.cpp.lists.drop_list_duplicates cimport ( from cudf._lib.cpp.lists.sorting cimport ( sort_lists as cpp_sort_lists ) -from cudf._lib.cpp.lists.concatenate_rows cimport ( +from cudf._lib.cpp.lists.combine cimport ( concatenate_rows as cpp_concatenate_rows ) from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 9f8a8ee6b1e..cb355a15f15 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -18,9 +18,18 @@ from libcpp.utility cimport move from libcpp cimport bool import cudf -from cudf._lib.types import cudf_to_np_types, duration_unit_map +from cudf.core.dtypes import ListDtype +from cudf._lib.types import ( + cudf_to_np_types, + duration_unit_map +) from cudf._lib.types import datetime_unit_map -from cudf._lib.types cimport underlying_type_t_type_id +from cudf._lib.types cimport underlying_type_t_type_id, dtype_from_column_view + +from cudf._lib.column cimport Column +from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.table cimport Table +from cudf._lib.interop import to_arrow from cudf._lib.cpp.wrappers.timestamps cimport ( timestamp_s, @@ -41,12 +50,12 @@ from cudf._lib.cpp.scalar.scalar cimport ( timestamp_scalar, duration_scalar, string_scalar, - fixed_point_scalar + fixed_point_scalar, + list_scalar, ) -from cudf.utils.dtypes import _decimal_to_int64 +from cudf.utils.dtypes import _decimal_to_int64, is_list_dtype cimport cudf._lib.cpp.types as libcudf_types - cdef class DeviceScalar: def __init__(self, value, dtype): @@ -97,6 +106,8 @@ cdef class DeviceScalar: def _to_host_scalar(self): if isinstance(self.dtype, cudf.Decimal64Dtype): result = _get_py_decimal_from_fixed_point(self.c_value) + elif is_list_dtype(self.dtype): + result = _get_py_list_from_list(self.c_value) elif pd.api.types.is_string_dtype(self.dtype): result = _get_py_string_from_string(self.c_value) elif pd.api.types.is_numeric_dtype(self.dtype): @@ -159,6 +170,22 @@ cdef class DeviceScalar: raise TypeError( "Must pass a dtype when constructing from a fixed-point scalar" ) + elif cdtype.id() == libcudf_types.LIST: + if ( + s.get_raw_ptr() + )[0].view().type().id() == libcudf_types.LIST: + s._dtype = dtype_from_column_view( + (s.get_raw_ptr())[0].view() + ) + else: + s._dtype = ListDtype( + cudf_to_np_types[ + ( + (s.get_raw_ptr())[0] + .view().type().id() + ) + ] + ) else: if dtype is not None: s._dtype = dtype @@ -268,6 +295,19 @@ cdef _set_decimal64_from_scalar(unique_ptr[scalar]& s, ) ) +cdef _get_py_list_from_list(unique_ptr[scalar]& s): + + if not s.get()[0].is_valid(): + return cudf.NA + + cdef column_view list_col_view = (s.get()).view() + cdef Column list_col = Column.from_column_view(list_col_view, None) + cdef Table to_arrow_table = Table({"col": list_col}) + + arrow_table = to_arrow(to_arrow_table, [["col", []]]) + result = arrow_table['col'].to_pylist() + return _nested_na_replace(result) + cdef _get_py_string_from_string(unique_ptr[scalar]& s): if not s.get()[0].is_valid(): return cudf.NA @@ -440,3 +480,16 @@ def _create_proxy_nat_scalar(dtype): return result else: raise TypeError('NAT only valid for datetime and timedelta') + + +def _nested_na_replace(input_list): + ''' + Replace `None` with 
`cudf.NA` in the result of + `__getitem__` calls to list type columns + ''' + for idx, value in enumerate(input_list): + if isinstance(value, list): + _nested_na_replace(value) + elif value is None: + input_list[idx] = cudf.NA + return input_list diff --git a/python/cudf/cudf/_lib/strings/combine.pyx b/python/cudf/cudf/_lib/strings/combine.pyx index 25619de3ed0..0d7dfb5c619 100644 --- a/python/cudf/cudf/_lib/strings/combine.pyx +++ b/python/cudf/cudf/_lib/strings/combine.pyx @@ -16,7 +16,7 @@ from cudf._lib.table cimport Table from cudf._lib.cpp.strings.combine cimport ( concatenate as cpp_concatenate, join_strings as cpp_join_strings, - concatenate_list_elements as cpp_concatenate_list_elements + join_list_elements as cpp_join_list_elements ) @@ -105,7 +105,7 @@ def join_lists_with_scalar( ) with nogil: - c_result = move(cpp_concatenate_list_elements( + c_result = move(cpp_join_list_elements( source_view, scalar_separator[0], scalar_narep[0] @@ -142,7 +142,7 @@ def join_lists_with_column( ) with nogil: - c_result = move(cpp_concatenate_list_elements( + c_result = move(cpp_join_list_elements( source_view, separator_view, scalar_separator_narep[0], diff --git a/python/cudf/cudf/core/abc.py b/python/cudf/cudf/core/abc.py index 0439f0d24b8..d3da544f8b5 100644 --- a/python/cudf/cudf/core/abc.py +++ b/python/cudf/cudf/core/abc.py @@ -1,9 +1,7 @@ # Copyright (c) 2020-2021, NVIDIA CORPORATION. """Common abstract base classes for cudf.""" -import abc import sys -from abc import abstractmethod import rmm @@ -18,7 +16,7 @@ import pickle # type: ignore -class Serializable(abc.ABC): +class Serializable: """A serializable object composed of device memory buffers. This base class defines a standard serialization protocol for objects @@ -32,7 +30,6 @@ class Serializable(abc.ABC): latter converts back from that representation into an equivalent object. """ - @abstractmethod def serialize(self): """Generate an equivalent serializable representation of an object. @@ -53,10 +50,11 @@ def serialize(self): :meta private: """ - pass + raise NotImplementedError( + "Subclasses of Serializable must implement serialize" + ) @classmethod - @abstractmethod def deserialize(cls, header, frames): """Generate an object from a serialized representation. @@ -80,7 +78,9 @@ class can be constructed from a serialized representation generalized :meta private: """ - pass + raise NotImplementedError( + "Subclasses of Serializable must implement deserialize" + ) def device_serialize(self): """Serialize data and metadata associated with device memory. 
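
The python/cudf/cudf/_lib/scalar.pyx hunk above adds list-scalar support to `DeviceScalar`: a `list_scalar` is turned into a host Python list via Arrow, and `_nested_na_replace` then swaps `None` for `cudf.NA` at every nesting level. Below is a minimal, self-contained sketch of just that normalization step; the `NA` sentinel and the function name are stand-ins for illustration, not cudf APIs.

```python
# Sketch of the None -> NA normalization applied when a list scalar is
# returned to the host (mirrors the `_nested_na_replace` helper in the diff).
# `NA` is a placeholder sentinel here; in cudf it would be `cudf.NA`.

NA = object()  # stand-in for cudf.NA in this self-contained example


def nested_na_replace(values):
    """Recursively replace None with NA, in place, at every nesting level."""
    for idx, value in enumerate(values):
        if isinstance(value, list):
            nested_na_replace(value)  # recurse into nested lists
        elif value is None:
            values[idx] = NA
    return values


result = nested_na_replace([[1, None, 3], [None, 5, 6]])
assert result == [[1, NA, 3], [NA, 5, 6]]
```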
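The python/cudf/cudf/core/abc.py hunk above drops `abc.ABC`/`abstractmethod` from `Serializable` and instead has the base `serialize`/`deserialize` raise `NotImplementedError`, so subclasses still must override both without the base being an ABC. A small sketch of that pattern follows; the class and attribute names are purely illustrative and assume nothing about cudf's actual buffer types.

```python
# Sketch of the pattern used by the Serializable change above: a plain base
# class whose methods raise NotImplementedError instead of using abc.abstractmethod.


class SerializableBase:
    def serialize(self):
        raise NotImplementedError(
            "Subclasses of SerializableBase must implement serialize"
        )

    @classmethod
    def deserialize(cls, header, frames):
        raise NotImplementedError(
            "Subclasses of SerializableBase must implement deserialize"
        )


class Payload(SerializableBase):
    """Illustrative subclass that round-trips a list of byte frames."""

    def __init__(self, frames):
        self.frames = frames

    def serialize(self):
        header = {"num_frames": len(self.frames)}
        return header, self.frames

    @classmethod
    def deserialize(cls, header, frames):
        assert header["num_frames"] == len(frames)
        return cls(frames)


header, frames = Payload([b"abc", b"def"]).serialize()
assert Payload.deserialize(header, frames).frames == [b"abc", b"def"]
```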
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 20f302f7e59..4bf4b2b87f2 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -40,7 +40,12 @@ from cudf._typing import BinaryOperand, ColumnLike, Dtype, ScalarLike from cudf.core.abc import Serializable from cudf.core.buffer import Buffer -from cudf.core.dtypes import CategoricalDtype, IntervalDtype +from cudf.core.dtypes import ( + CategoricalDtype, + IntervalDtype, + ListDtype, + StructDtype, +) from cudf.utils import ioutils, utils from cudf.utils.dtypes import ( check_cast_unsupported_dtype, @@ -291,8 +296,7 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase: "None" ] - if isinstance(result.dtype, cudf.Decimal64Dtype): - result.dtype.precision = array.type.precision + result = _copy_type_metadata_from_arrow(array, result) return result def _get_mask_as_column(self) -> ColumnBase: @@ -2230,6 +2234,60 @@ def full(size: int, fill_value: ScalarLike, dtype: Dtype = None) -> ColumnBase: return ColumnBase.from_scalar(cudf.Scalar(fill_value, dtype), size) +def _copy_type_metadata_from_arrow( + arrow_array: pa.array, cudf_column: ColumnBase +) -> ColumnBase: + """ + Similar to `Column._copy_type_metadata`, except copies type metadata + from arrow array into a cudf column. Recursive for every level. + * When `arrow_array` is struct type and `cudf_column` is StructDtype, copy + field names. + * When `arrow_array` is decimal type and `cudf_column` is + Decimal64Dtype, copy precisions. + """ + if pa.types.is_decimal(arrow_array.type) and isinstance( + cudf_column, cudf.core.column.DecimalColumn + ): + cudf_column.dtype.precision = arrow_array.type.precision + elif pa.types.is_struct(arrow_array.type) and isinstance( + cudf_column, cudf.core.column.StructColumn + ): + base_children = tuple( + _copy_type_metadata_from_arrow(arrow_array.field(i), col_child) + for i, col_child in enumerate(cudf_column.base_children) + ) + cudf_column.set_base_children(base_children) + return cudf.core.column.StructColumn( + data=None, + size=cudf_column.base_size, + dtype=StructDtype.from_arrow(arrow_array.type), + mask=cudf_column.base_mask, + offset=cudf_column.offset, + null_count=cudf_column.null_count, + children=base_children, + ) + elif pa.types.is_list(arrow_array.type) and isinstance( + cudf_column, cudf.core.column.ListColumn + ): + if arrow_array.values and cudf_column.base_children: + base_children = ( + cudf_column.base_children[0], + _copy_type_metadata_from_arrow( + arrow_array.values, cudf_column.base_children[1] + ), + ) + return cudf.core.column.ListColumn( + size=cudf_column.base_size, + dtype=ListDtype.from_arrow(arrow_array.type), + mask=cudf_column.base_mask, + offset=cudf_column.offset, + null_count=cudf_column.null_count, + children=base_children, + ) + + return cudf_column + + def _concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: """Concatenate a sequence of columns.""" if len(objs) == 0: diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 7db8ba15caa..f0b0dbba4a5 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -143,6 +143,8 @@ def __init__(self, element_type: Any) -> None: def element_type(self) -> Dtype: if isinstance(self._typ.value_type, pa.ListType): return ListDtype.from_arrow(self._typ.value_type) + elif isinstance(self._typ.value_type, pa.StructType): + return StructDtype.from_arrow(self._typ.value_type) else: return 
np.dtype(self._typ.value_type.to_pandas_dtype()).name @@ -176,10 +178,10 @@ def __eq__(self, other): return self._typ.equals(other._typ) def __repr__(self): - if isinstance(self.element_type, ListDtype): - return f"ListDtype({self.element_type.__repr__()})" + if isinstance(self.element_type, (ListDtype, StructDtype)): + return f"{type(self).__name__}({self.element_type.__repr__()})" else: - return f"ListDtype({self.element_type})" + return f"{type(self).__name__}({self.element_type})" def __hash__(self): return hash(self._typ) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index f59954aaf08..1c6c1ed85e6 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -32,6 +32,7 @@ is_numerical_dtype, is_scalar, min_scalar_type, + find_common_type, ) T = TypeVar("T", bound="Frame") @@ -156,6 +157,15 @@ def size(self): """ return self._num_columns * self._num_rows + @property + def _is_homogeneous(self): + # make sure that the dataframe has columns + if not self._data.columns: + return True + + first_type = self._data.columns[0].dtype.name + return all(x.dtype.name == first_type for x in self._data.columns) + @property def empty(self): """ @@ -4029,8 +4039,11 @@ def _find_common_dtypes_and_categories(non_null_columns, dtypes): # default to the first non-null dtype dtypes[idx] = cols[0].dtype # If all the non-null dtypes are int/float, find a common dtype - if all(is_numerical_dtype(col.dtype) for col in cols): - dtypes[idx] = np.find_common_type([col.dtype for col in cols], []) + if all( + is_numerical_dtype(col.dtype) or is_decimal_dtype(col.dtype) + for col in cols + ): + dtypes[idx] = find_common_type([col.dtype for col in cols]) # If all categorical dtypes, combine the categories elif all( isinstance(col, cudf.core.column.CategoricalColumn) for col in cols @@ -4045,17 +4058,6 @@ def _find_common_dtypes_and_categories(non_null_columns, dtypes): # Set the column dtype to the codes' dtype. The categories # will be re-assigned at the end dtypes[idx] = min_scalar_type(len(categories[idx])) - elif all( - isinstance(col, cudf.core.column.DecimalColumn) for col in cols - ): - # Find the largest scale and the largest difference between - # precision and scale of the columns to be concatenated - s = max([col.dtype.scale for col in cols]) - lhs = max([col.dtype.precision - col.dtype.scale for col in cols]) - # Combine to get the necessary precision and clip at the maximum - # precision - p = min(cudf.Decimal64Dtype.MAX_PRECISION, s + lhs) - dtypes[idx] = cudf.Decimal64Dtype(p, s) # Otherwise raise an error if columns have different dtypes elif not all(is_dtype_equal(c.dtype, dtypes[idx]) for c in cols): raise ValueError("All columns must be the same type") diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 61fe20636f0..c1060d5f505 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -110,6 +110,7 @@ def cumcount(self): ) .groupby(self.grouping, sort=self._sort) .agg("cumcount") + .reset_index(drop=True) ) @cached_property @@ -225,9 +226,10 @@ def nth(self, n): """ Return the nth row from each group. 
""" - result = self.agg(lambda x: x.nth(n)) - sizes = self.size() - return result[n < sizes] + result = self.agg(lambda x: x.nth(n)).sort_index() + sizes = self.size().sort_index() + + return result[sizes > n] def serialize(self): header = {} diff --git a/python/cudf/cudf/core/indexing.py b/python/cudf/cudf/core/indexing.py index 7de1aaf9726..21d075ae67d 100755 --- a/python/cudf/cudf/core/indexing.py +++ b/python/cudf/cudf/core/indexing.py @@ -85,7 +85,11 @@ def __getitem__(self, arg): arg = list(arg) data = self._sr._column[arg] - if is_scalar(data) or _is_null_host_scalar(data): + if ( + isinstance(data, list) + or is_scalar(data) + or _is_null_host_scalar(data) + ): return data index = self._sr.index.take(arg) return self._sr._copy_construct(data=data, index=index) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index d812214caf8..a894baf8235 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -45,7 +45,6 @@ from cudf.utils import cudautils, docutils, ioutils from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import ( - _decimal_normalize_types, can_convert_to_column, is_decimal_dtype, is_list_dtype, @@ -53,7 +52,7 @@ is_mixed_with_object_dtype, is_scalar, min_scalar_type, - numeric_normalize_types, + find_common_type, ) from cudf.utils.utils import ( get_appropriate_dispatched_func, @@ -2402,10 +2401,8 @@ def _concat(cls, objs, axis=0, index=True): ) if dtype_mismatch: - if isinstance(objs[0]._column, cudf.core.column.DecimalColumn): - objs = _decimal_normalize_types(*objs) - else: - objs = numeric_normalize_types(*objs) + common_dtype = find_common_type([obj.dtype for obj in objs]) + objs = [obj.astype(common_dtype) for obj in objs] col = _concat_columns([o._column for o in objs]) diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py index 31dc6012905..5c4c121db4d 100644 --- a/python/cudf/cudf/tests/test_concat.py +++ b/python/cudf/cudf/tests/test_concat.py @@ -5,6 +5,7 @@ import numpy as np import pandas as pd import pytest +from decimal import Decimal import cudf as gd from cudf.tests.utils import assert_eq, assert_exceptions_equal @@ -1262,3 +1263,267 @@ def test_concat_decimal_series(ltype, rtype): expected = pd.concat([ps1, ps2]) assert_eq(expected, got) + + +@pytest.mark.parametrize( + "df1, df2, df3, expected", + [ + ( + gd.DataFrame( + {"val": [Decimal("42.5"), Decimal("8.7")]}, + dtype=Decimal64Dtype(5, 2), + ), + gd.DataFrame( + {"val": [Decimal("9.23"), Decimal("-67.49")]}, + dtype=Decimal64Dtype(6, 4), + ), + gd.DataFrame({"val": [8, -5]}, dtype="int32"), + gd.DataFrame( + { + "val": [ + Decimal("42.5"), + Decimal("8.7"), + Decimal("9.23"), + Decimal("-67.49"), + Decimal("8"), + Decimal("-5"), + ] + }, + dtype=Decimal64Dtype(7, 4), + index=[0, 1, 0, 1, 0, 1], + ), + ), + ( + gd.DataFrame( + {"val": [Decimal("95.2"), Decimal("23.4")]}, + dtype=Decimal64Dtype(5, 2), + ), + gd.DataFrame({"val": [54, 509]}, dtype="uint16"), + gd.DataFrame({"val": [24, -48]}, dtype="int32"), + gd.DataFrame( + { + "val": [ + Decimal("95.2"), + Decimal("23.4"), + Decimal("54"), + Decimal("509"), + Decimal("24"), + Decimal("-48"), + ] + }, + dtype=Decimal64Dtype(5, 2), + index=[0, 1, 0, 1, 0, 1], + ), + ), + ( + gd.DataFrame( + {"val": [Decimal("36.56"), Decimal("-59.24")]}, + dtype=Decimal64Dtype(9, 4), + ), + gd.DataFrame({"val": [403.21, 45.13]}, dtype="float32"), + gd.DataFrame({"val": [52.262, -49.25]}, dtype="float64"), + gd.DataFrame( + { + "val": [ + Decimal("36.56"), + 
Decimal("-59.24"), + Decimal("403.21"), + Decimal("45.13"), + Decimal("52.262"), + Decimal("-49.25"), + ] + }, + dtype=Decimal64Dtype(9, 4), + index=[0, 1, 0, 1, 0, 1], + ), + ), + ( + gd.DataFrame( + {"val": [Decimal("9563.24"), Decimal("236.633")]}, + dtype=Decimal64Dtype(9, 4), + ), + gd.DataFrame({"val": [5393, -95832]}, dtype="int64"), + gd.DataFrame({"val": [-29.234, -31.945]}, dtype="float64"), + gd.DataFrame( + { + "val": [ + Decimal("9563.24"), + Decimal("236.633"), + Decimal("5393"), + Decimal("-95832"), + Decimal("-29.234"), + Decimal("-31.945"), + ] + }, + dtype=Decimal64Dtype(9, 4), + index=[0, 1, 0, 1, 0, 1], + ), + ), + ], +) +def test_concat_decimal_numeric_dataframe(df1, df2, df3, expected): + df = gd.concat([df1, df2, df3]) + assert_eq(df, expected) + assert_eq(df.val.dtype, expected.val.dtype) + + +@pytest.mark.parametrize( + "s1, s2, s3, expected", + [ + ( + gd.Series( + [Decimal("32.8"), Decimal("-87.7")], dtype=Decimal64Dtype(6, 2) + ), + gd.Series( + [Decimal("101.243"), Decimal("-92.449")], + dtype=Decimal64Dtype(9, 6), + ), + gd.Series([94, -22], dtype="int32"), + gd.Series( + [ + Decimal("32.8"), + Decimal("-87.7"), + Decimal("101.243"), + Decimal("-92.449"), + Decimal("94"), + Decimal("-22"), + ], + dtype=Decimal64Dtype(10, 6), + index=[0, 1, 0, 1, 0, 1], + ), + ), + ( + gd.Series( + [Decimal("7.2"), Decimal("122.1")], dtype=Decimal64Dtype(5, 2) + ), + gd.Series([33, 984], dtype="uint32"), + gd.Series([593, -702], dtype="int32"), + gd.Series( + [ + Decimal("7.2"), + Decimal("122.1"), + Decimal("33"), + Decimal("984"), + Decimal("593"), + Decimal("-702"), + ], + dtype=Decimal64Dtype(5, 2), + index=[0, 1, 0, 1, 0, 1], + ), + ), + ( + gd.Series( + [Decimal("982.94"), Decimal("-493.626")], + dtype=Decimal64Dtype(9, 4), + ), + gd.Series([847.98, 254.442], dtype="float32"), + gd.Series([5299.262, -2049.25], dtype="float64"), + gd.Series( + [ + Decimal("982.94"), + Decimal("-493.626"), + Decimal("847.98"), + Decimal("254.442"), + Decimal("5299.262"), + Decimal("-2049.25"), + ], + dtype=Decimal64Dtype(9, 4), + index=[0, 1, 0, 1, 0, 1], + ), + ), + ( + gd.Series( + [Decimal("492.204"), Decimal("-72824.455")], + dtype=Decimal64Dtype(9, 4), + ), + gd.Series([8438, -27462], dtype="int64"), + gd.Series([-40.292, 49202.953], dtype="float64"), + gd.Series( + [ + Decimal("492.204"), + Decimal("-72824.455"), + Decimal("8438"), + Decimal("-27462"), + Decimal("-40.292"), + Decimal("49202.953"), + ], + dtype=Decimal64Dtype(9, 4), + index=[0, 1, 0, 1, 0, 1], + ), + ), + ], +) +def test_concat_decimal_numeric_series(s1, s2, s3, expected): + s = gd.concat([s1, s2, s3]) + assert_eq(s, expected) + + +@pytest.mark.parametrize( + "s1, s2, expected", + [ + ( + gd.Series( + [Decimal("955.22"), Decimal("8.2")], dtype=Decimal64Dtype(5, 2) + ), + gd.Series(["2007-06-12", "2006-03-14"], dtype="datetime64"), + gd.Series( + [ + "955.22", + "8.20", + "2007-06-12 00:00:00", + "2006-03-14 00:00:00", + ], + index=[0, 1, 0, 1], + ), + ), + ( + gd.Series( + [Decimal("-52.44"), Decimal("365.22")], + dtype=Decimal64Dtype(5, 2), + ), + gd.Series( + np.arange( + "2005-02-01T12", "2005-02-01T15", dtype="datetime64[h]" + ), + dtype="datetime64[s]", + ), + gd.Series( + [ + "-52.44", + "365.22", + "2005-02-01 12:00:00", + "2005-02-01 13:00:00", + "2005-02-01 14:00:00", + ], + index=[0, 1, 0, 1, 2], + ), + ), + ( + gd.Series( + [Decimal("753.0"), Decimal("94.22")], + dtype=Decimal64Dtype(5, 2), + ), + gd.Series([np.timedelta64(111, "s"), np.timedelta64(509, "s")]), + gd.Series( + ["753.00", "94.22", "0 days 
00:01:51", "0 days 00:08:29"], + index=[0, 1, 0, 1], + ), + ), + ( + gd.Series( + [Decimal("753.0"), Decimal("94.22")], + dtype=Decimal64Dtype(5, 2), + ), + gd.Series( + [np.timedelta64(940252, "s"), np.timedelta64(758385, "s")] + ), + gd.Series( + ["753.00", "94.22", "10 days 21:10:52", "8 days 18:39:45"], + index=[0, 1, 0, 1], + ), + ), + ], +) +def test_concat_decimal_non_numeric(s1, s2, expected): + s = gd.concat([s1, s2]) + assert_eq(s, expected) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index e5e36ba7e21..0b73f32e94d 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -8579,3 +8579,100 @@ def test_dataframe_init_from_series(data, columns, index): actual, check_index_type=False if len(expected) == 0 else True, ) + + +@pytest.mark.parametrize( + "data, expected", + [ + ({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8], "c": [1.2, 1, 2, 3]}, False), + ({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, True), + ({"a": ["a", "b", "c"], "b": [4, 5, 6], "c": [7, 8, 9]}, False), + ({"a": [True, False, False], "b": [False, False, True]}, True), + ({"a": [True, False, False]}, True), + ({"a": [[1, 2], [3, 4]]}, True), + ({"a": [[1, 2], [3, 4]], "b": ["a", "b"]}, False), + ({"a": [{"c": 5}, {"e": 5}], "b": [{"c": 5}, {"g": 7}]}, True), + ({}, True), + ], +) +def test_is_homogeneous_dataframe(data, expected): + actual = cudf.DataFrame(data)._is_homogeneous + + assert actual == expected + + +@pytest.mark.parametrize( + "data, indexes, expected", + [ + ( + {"a": [1, 2, 3, 4], "b": [5, 6, 7, 8], "c": [1.2, 1, 2, 3]}, + ["a", "b"], + True, + ), + ( + { + "a": [1, 2, 3, 4], + "b": [5, 6, 7, 8], + "c": [1.2, 1, 2, 3], + "d": ["hello", "world", "cudf", "rapids"], + }, + ["a", "b"], + False, + ), + ( + { + "a": ["a", "b", "c"], + "b": [4, 5, 6], + "c": [7, 8, 9], + "d": [1, 2, 3], + }, + ["a", "b"], + True, + ), + ], +) +def test_is_homogeneous_multiIndex_dataframe(data, indexes, expected): + test_dataframe = cudf.DataFrame(data).set_index(indexes) + actual = cudf.DataFrame(test_dataframe)._is_homogeneous + + assert actual == expected + + +@pytest.mark.parametrize( + "data, expected", [([1, 2, 3, 4], True), ([True, False], True)] +) +def test_is_homogeneous_series(data, expected): + actual = cudf.Series(data)._is_homogeneous + + assert actual == expected + + +@pytest.mark.parametrize( + "levels, codes, expected", + [ + ( + [["lama", "cow", "falcon"], ["speed", "weight", "length"]], + [[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]], + True, + ), + ( + [[1, 2, 3], [True, False, True]], + [[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]], + False, + ), + ], +) +def test_is_homogeneous_multiIndex(levels, codes, expected): + actual = cudf.MultiIndex(levels=levels, codes=codes)._is_homogeneous + + assert actual == expected + + +@pytest.mark.parametrize( + "data, expected", + [([1, 2, 3], True), (["Hello", "World"], True), ([True, False], True)], +) +def test_is_homogeneous_index(data, expected): + actual = cudf.Index(data)._is_homogeneous + + assert actual == expected diff --git a/python/cudf/cudf/tests/test_dtypes.py b/python/cudf/cudf/tests/test_dtypes.py index b6e2aac0304..a5895caf49f 100644 --- a/python/cudf/cudf/tests/test_dtypes.py +++ b/python/cudf/cudf/tests/test_dtypes.py @@ -6,14 +6,16 @@ import pytest import cudf +from cudf.core.column import ColumnBase from cudf.core.dtypes import ( CategoricalDtype, Decimal64Dtype, + IntervalDtype, ListDtype, StructDtype, - IntervalDtype, ) from 
cudf.tests.utils import assert_eq +from cudf.utils.dtypes import np_to_pa_dtype def test_cdt_basic(): @@ -155,3 +157,103 @@ def test_interval_dtype_pyarrow_round_trip(fields, closed): expect = pa_array got = IntervalDtype.from_arrow(expect).to_arrow() assert expect.equals(got) + + +def assert_column_array_dtype_equal(column: ColumnBase, array: pa.array): + """ + In cudf, each column holds its dtype. And since column may have child + columns, child columns also holds their datatype. This method tests + that every level of `column` matches the type of the given `array` + recursively. + """ + + if isinstance(column.dtype, ListDtype): + return array.type.equals( + column.dtype.to_arrow() + ) and assert_column_array_dtype_equal( + column.base_children[1], array.values + ) + elif isinstance(column.dtype, StructDtype): + return array.type.equals(column.dtype.to_arrow()) and all( + [ + assert_column_array_dtype_equal(child, array.field(i)) + for i, child in enumerate(column.base_children) + ] + ) + elif isinstance(column.dtype, Decimal64Dtype): + return array.type.equals(column.dtype.to_arrow()) + elif isinstance(column.dtype, CategoricalDtype): + raise NotImplementedError() + else: + return array.type.equals(np_to_pa_dtype(column.dtype)) + + +@pytest.mark.parametrize( + "data", + [ + [[{"name": 123}]], + [ + [ + { + "IsLeapYear": False, + "data": {"Year": 1999, "Month": 7}, + "names": ["Mike", None], + }, + { + "IsLeapYear": True, + "data": {"Year": 2004, "Month": 12}, + "names": None, + }, + { + "IsLeapYear": False, + "data": {"Year": 1996, "Month": 2}, + "names": ["Rose", "Richard"], + }, + ] + ], + [ + [None, {"human?": True, "deets": {"weight": 2.4, "age": 27}}], + [ + {"human?": None, "deets": {"weight": 5.3, "age": 25}}, + {"human?": False, "deets": {"weight": 8.0, "age": 31}}, + {"human?": False, "deets": None}, + ], + [], + None, + [{"human?": None, "deets": {"weight": 6.9, "age": None}}], + ], + [ + { + "name": "var0", + "val": [ + {"name": "var1", "val": None, "type": "optional"} + ], + "type": "list", + }, + {}, + { + "name": "var2", + "val": [ + { + "name": "var3", + "val": {"field": 42}, + "type": "optional", + }, + { + "name": "var4", + "val": {"field": 3.14}, + "type": "optional", + }, + ], + "type": "list", + }, + None, + ], + ], +) +def test_lists_of_structs_dtype(data): + got = cudf.Series(data) + expected = pa.array(data) + + assert_column_array_dtype_equal(got._column, expected) + assert expected.equals(got._column.to_arrow()) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index d1458c72770..2430b0da5ef 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -30,14 +30,28 @@ _index_type_aggs = {"count", "idxmin", "idxmax", "cumcount"} -def assert_groupby_results_equal(expect, got, sort=True, **kwargs): +def assert_groupby_results_equal( + expect, got, sort=True, as_index=True, by=None, **kwargs +): # Because we don't sort by index by default in groupby, # sort expect and got by index before comparing if sort: - expect = expect.sort_index() - got = got.sort_index() - else: - assert_eq(expect.sort_index(), got.sort_index(), **kwargs) + if as_index: + expect = expect.sort_index() + got = got.sort_index() + else: + assert by is not None + if isinstance(expect, (pd.DataFrame, cudf.DataFrame)): + expect = expect.sort_values(by=by).reset_index(drop=True) + else: + expect = expect.sort_values().reset_index(drop=True) + + if isinstance(got, cudf.DataFrame): + got = 
got.sort_values(by=by).reset_index(drop=True) + else: + got = got.sort_values().reset_index(drop=True) + + assert_eq(expect, got, **kwargs) def make_frame( @@ -201,10 +215,16 @@ def test_groupby_getitem_getattr(as_index): pdf = pd.DataFrame({"x": [1, 3, 1], "y": [1, 2, 3], "z": [1, 4, 5]}) gdf = cudf.from_pandas(pdf) assert_groupby_results_equal( - pdf.groupby("x")["y"].sum(), gdf.groupby("x")["y"].sum(), + pdf.groupby("x")["y"].sum(), + gdf.groupby("x")["y"].sum(), + as_index=as_index, + by="x", ) assert_groupby_results_equal( - pdf.groupby("x").y.sum(), gdf.groupby("x").y.sum(), + pdf.groupby("x").y.sum(), + gdf.groupby("x").y.sum(), + as_index=as_index, + by="x", ) assert_groupby_results_equal( pdf.groupby("x")[["y"]].sum(), gdf.groupby("x")[["y"]].sum(), @@ -212,6 +232,8 @@ def test_groupby_getitem_getattr(as_index): assert_groupby_results_equal( pdf.groupby(["x", "y"], as_index=as_index).sum(), gdf.groupby(["x", "y"], as_index=as_index).sum(), + as_index=as_index, + by=["x", "y"], ) @@ -1088,7 +1110,13 @@ def test_groupby_datetime(nelem, as_index, agg): else: pdres = pdg.agg({"datetime": agg}) gdres = gdg.agg({"datetime": agg}) - assert_groupby_results_equal(pdres, gdres, check_dtype=check_dtype) + assert_groupby_results_equal( + pdres, + gdres, + check_dtype=check_dtype, + as_index=as_index, + by=["datetime"], + ) def test_groupby_dropna(): @@ -1349,6 +1377,8 @@ def test_reset_index_after_empty_groupby(): assert_groupby_results_equal( pdf.groupby("a").sum().reset_index(), gdf.groupby("a").sum().reset_index(), + as_index=False, + by="a", ) diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index 5dcecc6c9e1..7edcb08a7c8 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -7,6 +7,7 @@ import pytest import cudf +from cudf import NA from cudf.tests.utils import assert_eq @@ -332,3 +333,20 @@ def test_concatenate_list_with_nonlist(): gdf1 = cudf.DataFrame({"A": [["a", "c"], ["b", "d"], ["c", "d"]]}) gdf2 = cudf.DataFrame({"A": ["a", "b", "c"]}) gdf1["A"] + gdf2["A"] + + +@pytest.mark.parametrize( + "indata,expect", + [ + ([1], [1]), + ([1, 2, 3], [1, 2, 3]), + ([[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]), + ([None], [NA]), + ([1, None, 3], [1, NA, 3]), + ([[1, None, 3], [None, 5, 6]], [[1, NA, 3], [NA, 5, 6]]), + ], +) +def test_list_getitem(indata, expect): + list_sr = cudf.Series([indata]) + # __getitem__ shall fill None with cudf.NA + assert list_sr[0] == expect diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 16c35bab4b1..0b59116f8e6 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -290,13 +290,15 @@ def is_decimal_dtype(obj): ) -def _decimal_normalize_types(*args): - s = max([a.dtype.scale for a in args]) - lhs = max([a.dtype.precision - a.dtype.scale for a in args]) +def _find_common_type_decimal(dtypes): + # Find the largest scale and the largest difference between + # precision and scale of the columns to be concatenated + s = max([dtype.scale for dtype in dtypes]) + lhs = max([dtype.precision - dtype.scale for dtype in dtypes]) + # Combine to get the necessary precision and clip at the maximum + # precision p = min(cudf.Decimal64Dtype.MAX_PRECISION, s + lhs) - dtype = cudf.Decimal64Dtype(p, s) - - return [a.astype(dtype) for a in args] + return cudf.Decimal64Dtype(p, s) def cudf_dtype_from_pydata_dtype(dtype): @@ -690,9 +692,15 @@ def find_common_type(dtypes): dtypes = set(dtypes) if any(is_decimal_dtype(dtype) for dtype in 
dtypes):
-        raise NotImplementedError(
-            "DecimalDtype is not yet supported in find_common_type"
-        )
+        if all(
+            is_decimal_dtype(dtype) or is_numerical_dtype(dtype)
+            for dtype in dtypes
+        ):
+            return _find_common_type_decimal(
+                [dtype for dtype in dtypes if is_decimal_dtype(dtype)]
+            )
+        else:
+            return np.dtype("O")
 
     # Corner case 1:
     # Resort to np.result_type to handle "M" and "m" types separately
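
The python/cudf/cudf/utils/dtypes.py hunk above teaches `find_common_type` to handle decimals: take the largest scale and the largest integer-digit width (precision minus scale) across the decimal inputs, then clip the combined precision at `cudf.Decimal64Dtype.MAX_PRECISION`. The sketch below reproduces that rule in isolation; `DecimalDtype`, `find_common_decimal_dtype`, and the `MAX_PRECISION = 18` cap are stand-ins assumed for illustration, only the combination rule itself comes from the diff.

```python
# Sketch of the decimal common-type rule introduced above: combine the widest
# scale with the widest integer part, clipped at an assumed Decimal64 cap.
from collections import namedtuple

DecimalDtype = namedtuple("DecimalDtype", ["precision", "scale"])
MAX_PRECISION = 18  # assumed stand-in for cudf.Decimal64Dtype.MAX_PRECISION


def find_common_decimal_dtype(dtypes):
    scale = max(dtype.scale for dtype in dtypes)
    integer_digits = max(dtype.precision - dtype.scale for dtype in dtypes)
    precision = min(MAX_PRECISION, scale + integer_digits)
    return DecimalDtype(precision, scale)


# Concatenating Decimal64Dtype(5, 2) with Decimal64Dtype(6, 4) yields (7, 4),
# matching the expected dtype in the first test_concat_decimal_* case above.
assert find_common_decimal_dtype(
    [DecimalDtype(5, 2), DecimalDtype(6, 4)]
) == DecimalDtype(7, 4)
```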