From 84073e8c3c9477c8afa974f14058f1208f63aba2 Mon Sep 17 00:00:00 2001 From: AJ Schmidt Date: Fri, 17 Dec 2021 17:08:19 -0500 Subject: [PATCH 01/12] update changelog --- CHANGELOG.md | 227 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 225 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b55669f7f50..39bb868c7db 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,229 @@ -# cuDF 21.12.00 (Date TBD) +# cuDF 21.12.00 (9 Dec 2021) -Please see https://github.com/rapidsai/cudf/releases/tag/v21.12.00a for the latest changes to this development branch. +## 🚨 Breaking Changes + +- Update `bitmask_and` and `bitmask_or` to return a pair of resulting mask and count of unset bits ([#9616](https://github.com/rapidsai/cudf/pull/9616)) [@PointKernel](https://github.com/PointKernel) +- Remove sizeof and standardize on memory_usage ([#9544](https://github.com/rapidsai/cudf/pull/9544)) [@vyasr](https://github.com/vyasr) +- Add support for single-line regex anchors ^/$ in contains_re ([#9482](https://github.com/rapidsai/cudf/pull/9482)) [@davidwendt](https://github.com/davidwendt) +- Refactor sorting APIs ([#9464](https://github.com/rapidsai/cudf/pull/9464)) [@vyasr](https://github.com/vyasr) +- Update Java nvcomp JNI bindings to nvcomp 2.x API ([#9384](https://github.com/rapidsai/cudf/pull/9384)) [@jbrennan333](https://github.com/jbrennan333) +- Support Python UDFs written in terms of rows ([#9343](https://github.com/rapidsai/cudf/pull/9343)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- JNI: Support nested types in ORC writer ([#9334](https://github.com/rapidsai/cudf/pull/9334)) [@firestarman](https://github.com/firestarman) +- Optionally nullify out-of-bounds indices in segmented_gather(). 
([#9318](https://github.com/rapidsai/cudf/pull/9318)) [@mythrocks](https://github.com/mythrocks) +- Refactor cuIO timestamp processing with `cuda::std::chrono` ([#9278](https://github.com/rapidsai/cudf/pull/9278)) [@PointKernel](https://github.com/PointKernel) +- Various internal MultiIndex improvements ([#9243](https://github.com/rapidsai/cudf/pull/9243)) [@vyasr](https://github.com/vyasr) + +## 🐛 Bug Fixes + +- Fix read_parquet bug for bytes input ([#9669](https://github.com/rapidsai/cudf/pull/9669)) [@rjzamora](https://github.com/rjzamora) +- Use `_gather` internal for `sort_*` ([#9668](https://github.com/rapidsai/cudf/pull/9668)) [@isVoid](https://github.com/isVoid) +- Fix behavior of equals for non-DataFrame Frames and add tests. ([#9653](https://github.com/rapidsai/cudf/pull/9653)) [@vyasr](https://github.com/vyasr) +- Dont recompute output size if it is already available ([#9649](https://github.com/rapidsai/cudf/pull/9649)) [@abellina](https://github.com/abellina) +- Fix read_parquet bug for extended dtypes from remote storage ([#9638](https://github.com/rapidsai/cudf/pull/9638)) [@rjzamora](https://github.com/rjzamora) +- add const when getting data from a JNI data wrapper ([#9637](https://github.com/rapidsai/cudf/pull/9637)) [@wjxiz1992](https://github.com/wjxiz1992) +- Fix debrotli issue on CUDA 11.5 ([#9632](https://github.com/rapidsai/cudf/pull/9632)) [@vuule](https://github.com/vuule) +- Use std::size_t when computing join output size ([#9626](https://github.com/rapidsai/cudf/pull/9626)) [@jlowe](https://github.com/jlowe) +- Fix `usecols` parameter handling in `dask_cudf.read_csv` ([#9618](https://github.com/rapidsai/cudf/pull/9618)) [@galipremsagar](https://github.com/galipremsagar) +- Add support for string `'nan', 'inf' & '-inf'` values while type-casting to `float` ([#9613](https://github.com/rapidsai/cudf/pull/9613)) [@galipremsagar](https://github.com/galipremsagar) +- Avoid passing NativeFileDatasource to pyarrow in read_parquet 
([#9608](https://github.com/rapidsai/cudf/pull/9608)) [@rjzamora](https://github.com/rjzamora) +- Fix test failure with cuda 11.5 in row_bit_count tests. ([#9581](https://github.com/rapidsai/cudf/pull/9581)) [@nvdbaranec](https://github.com/nvdbaranec) +- Correct _LIBCUDACXX_CUDACC_VER value computation ([#9579](https://github.com/rapidsai/cudf/pull/9579)) [@robertmaynard](https://github.com/robertmaynard) +- Increase max RLE stream size estimate to avoid potential overflows ([#9568](https://github.com/rapidsai/cudf/pull/9568)) [@vuule](https://github.com/vuule) +- Fix edge case in tdigest scalar generation for groups containing all nulls. ([#9551](https://github.com/rapidsai/cudf/pull/9551)) [@nvdbaranec](https://github.com/nvdbaranec) +- Fix pytests failing in `cuda-11.5` environment ([#9547](https://github.com/rapidsai/cudf/pull/9547)) [@galipremsagar](https://github.com/galipremsagar) +- compile libnvcomp with PTDS if requested ([#9540](https://github.com/rapidsai/cudf/pull/9540)) [@jbrennan333](https://github.com/jbrennan333) +- Fix `segmented_gather()` for null LIST rows ([#9537](https://github.com/rapidsai/cudf/pull/9537)) [@mythrocks](https://github.com/mythrocks) +- Deprecate DataFrame.label_encoding, use private _label_encoding method internally. ([#9535](https://github.com/rapidsai/cudf/pull/9535)) [@bdice](https://github.com/bdice) +- Fix several test and benchmark issues related to bitmask allocations. ([#9521](https://github.com/rapidsai/cudf/pull/9521)) [@nvdbaranec](https://github.com/nvdbaranec) +- Fix for inserting duplicates in groupby result cache ([#9508](https://github.com/rapidsai/cudf/pull/9508)) [@karthikeyann](https://github.com/karthikeyann) +- Fix mismatched types error in clip() when using non int64 numeric types ([#9498](https://github.com/rapidsai/cudf/pull/9498)) [@davidwendt](https://github.com/davidwendt) +- Match conda pinnings for style checks (revert part of #9412, #9433). 
([#9490](https://github.com/rapidsai/cudf/pull/9490)) [@bdice](https://github.com/bdice) +- Make sure all dask-cudf supported aggs are handled in `_tree_node_agg` ([#9487](https://github.com/rapidsai/cudf/pull/9487)) [@charlesbluca](https://github.com/charlesbluca) +- Resolve `hash_columns` `FutureWarning` in `dask_cudf` ([#9481](https://github.com/rapidsai/cudf/pull/9481)) [@pentschev](https://github.com/pentschev) +- Add fixed point to AllTypes in libcudf unit tests ([#9472](https://github.com/rapidsai/cudf/pull/9472)) [@karthikeyann](https://github.com/karthikeyann) +- Fix regex handling of embedded null characters ([#9470](https://github.com/rapidsai/cudf/pull/9470)) [@davidwendt](https://github.com/davidwendt) +- Fix memcheck error in copy-if-else ([#9467](https://github.com/rapidsai/cudf/pull/9467)) [@davidwendt](https://github.com/davidwendt) +- Fix bug in dask_cudf.read_parquet for index=False ([#9453](https://github.com/rapidsai/cudf/pull/9453)) [@rjzamora](https://github.com/rjzamora) +- Preserve the decimal scale when creating a default scalar ([#9449](https://github.com/rapidsai/cudf/pull/9449)) [@revans2](https://github.com/revans2) +- Push down parent nulls when flattening nested columns. 
([#9443](https://github.com/rapidsai/cudf/pull/9443)) [@mythrocks](https://github.com/mythrocks) +- Fix memcheck error in gtest SegmentedGatherTest/GatherSliced ([#9442](https://github.com/rapidsai/cudf/pull/9442)) [@davidwendt](https://github.com/davidwendt) +- Revert "Fix quantile division / partition handling for dask-cudf sort… ([#9438](https://github.com/rapidsai/cudf/pull/9438)) [@charlesbluca](https://github.com/charlesbluca) +- Allow int-like objects for the `decimals` argument in `round` ([#9428](https://github.com/rapidsai/cudf/pull/9428)) [@shwina](https://github.com/shwina) +- Fix stream compaction's `drop_duplicates` API to use stable sort ([#9417](https://github.com/rapidsai/cudf/pull/9417)) [@ttnghia](https://github.com/ttnghia) +- Skip Comparing Uniform Window Results in Var/std Tests ([#9416](https://github.com/rapidsai/cudf/pull/9416)) [@isVoid](https://github.com/isVoid) +- Fix `StructColumn.to_pandas` type handling issues ([#9388](https://github.com/rapidsai/cudf/pull/9388)) [@galipremsagar](https://github.com/galipremsagar) +- Correct issues in the build dir cudf-config.cmake ([#9386](https://github.com/rapidsai/cudf/pull/9386)) [@robertmaynard](https://github.com/robertmaynard) +- Fix Java table partition test to account for non-deterministic ordering ([#9385](https://github.com/rapidsai/cudf/pull/9385)) [@jlowe](https://github.com/jlowe) +- Fix timestamp truncation/overflow bugs in orc/parquet ([#9382](https://github.com/rapidsai/cudf/pull/9382)) [@PointKernel](https://github.com/PointKernel) +- Fix the crash in stats code ([#9368](https://github.com/rapidsai/cudf/pull/9368)) [@devavret](https://github.com/devavret) +- Make Series.hash_encode results reproducible. 
([#9366](https://github.com/rapidsai/cudf/pull/9366)) [@bdice](https://github.com/bdice) +- Fix libcudf compile warnings on debug 11.4 build ([#9360](https://github.com/rapidsai/cudf/pull/9360)) [@davidwendt](https://github.com/davidwendt) +- Fail gracefully when compiling python UDFs that attempt to access columns with unsupported dtypes ([#9359](https://github.com/rapidsai/cudf/pull/9359)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Set pass_filenames: false in mypy pre-commit configuration. ([#9349](https://github.com/rapidsai/cudf/pull/9349)) [@bdice](https://github.com/bdice) +- Fix cudf_assert in cudf::io::orc::gpu::gpuDecodeOrcColumnData ([#9348](https://github.com/rapidsai/cudf/pull/9348)) [@davidwendt](https://github.com/davidwendt) +- Fix memcheck error in groupby-tdigest get_scalar_minmax ([#9339](https://github.com/rapidsai/cudf/pull/9339)) [@davidwendt](https://github.com/davidwendt) +- Optimizations for `cudf.concat` when `axis=1` ([#9333](https://github.com/rapidsai/cudf/pull/9333)) [@galipremsagar](https://github.com/galipremsagar) +- Use f-string in join helper warning message. 
([#9325](https://github.com/rapidsai/cudf/pull/9325)) [@bdice](https://github.com/bdice) +- Avoid casting to list or struct dtypes in dask_cudf.read_parquet ([#9314](https://github.com/rapidsai/cudf/pull/9314)) [@rjzamora](https://github.com/rjzamora) +- Fix null count in statistics for parquet ([#9303](https://github.com/rapidsai/cudf/pull/9303)) [@devavret](https://github.com/devavret) +- Potential overflow of `decimal32` when casting to `int64_t` ([#9287](https://github.com/rapidsai/cudf/pull/9287)) [@codereport](https://github.com/codereport) +- Fix quantile division / partition handling for dask-cudf sort on null dataframes ([#9259](https://github.com/rapidsai/cudf/pull/9259)) [@charlesbluca](https://github.com/charlesbluca) +- Updating cudf version also updates rapids cmake branch ([#9249](https://github.com/rapidsai/cudf/pull/9249)) [@robertmaynard](https://github.com/robertmaynard) +- Implement `one_hot_encoding` in libcudf and bind to python ([#9229](https://github.com/rapidsai/cudf/pull/9229)) [@isVoid](https://github.com/isVoid) +- BUG FIX: CSV Writer ignores the header parameter when no metadata is provided ([#8740](https://github.com/rapidsai/cudf/pull/8740)) [@skirui-source](https://github.com/skirui-source) + +## 📖 Documentation + +- Update Documentation to use `TYPED_TEST_SUITE` ([#9654](https://github.com/rapidsai/cudf/pull/9654)) [@codereport](https://github.com/codereport) +- Add dedicated page for `StringHandling` in python docs ([#9624](https://github.com/rapidsai/cudf/pull/9624)) [@galipremsagar](https://github.com/galipremsagar) +- Update docstring of `DataFrame.merge` ([#9572](https://github.com/rapidsai/cudf/pull/9572)) [@galipremsagar](https://github.com/galipremsagar) +- Use raw strings to avoid SyntaxErrors in parsed docstrings. 
([#9526](https://github.com/rapidsai/cudf/pull/9526)) [@bdice](https://github.com/bdice) +- Add example to docstrings in `rolling.apply` ([#9522](https://github.com/rapidsai/cudf/pull/9522)) [@isVoid](https://github.com/isVoid) +- Update help message to escape quotes in ./build.sh --cmake-args. ([#9494](https://github.com/rapidsai/cudf/pull/9494)) [@bdice](https://github.com/bdice) +- Improve Python docstring formatting. ([#9493](https://github.com/rapidsai/cudf/pull/9493)) [@bdice](https://github.com/bdice) +- Update table of I/O supported types ([#9476](https://github.com/rapidsai/cudf/pull/9476)) [@vuule](https://github.com/vuule) +- Document invalid regex patterns as undefined behavior ([#9473](https://github.com/rapidsai/cudf/pull/9473)) [@davidwendt](https://github.com/davidwendt) +- Miscellaneous documentation fixes to `cudf` ([#9471](https://github.com/rapidsai/cudf/pull/9471)) [@galipremsagar](https://github.com/galipremsagar) +- Fix many documentation errors in libcudf. ([#9355](https://github.com/rapidsai/cudf/pull/9355)) [@karthikeyann](https://github.com/karthikeyann) +- Fixing SubwordTokenizer docs issue ([#9354](https://github.com/rapidsai/cudf/pull/9354)) [@mayankanand007](https://github.com/mayankanand007) +- Improved deprecation warnings. ([#9347](https://github.com/rapidsai/cudf/pull/9347)) [@bdice](https://github.com/bdice) +- doc reorder mr, stream to stream, mr ([#9308](https://github.com/rapidsai/cudf/pull/9308)) [@karthikeyann](https://github.com/karthikeyann) +- Deprecate method parameters to DataFrame.join, DataFrame.merge. 
([#9291](https://github.com/rapidsai/cudf/pull/9291)) [@bdice](https://github.com/bdice) +- Added deprecation warning for `.label_encoding()` ([#9289](https://github.com/rapidsai/cudf/pull/9289)) [@mayankanand007](https://github.com/mayankanand007) + +## 🚀 New Features + +- Enable Series.divide and DataFrame.divide ([#9630](https://github.com/rapidsai/cudf/pull/9630)) [@vyasr](https://github.com/vyasr) +- Update `bitmask_and` and `bitmask_or` to return a pair of resulting mask and count of unset bits ([#9616](https://github.com/rapidsai/cudf/pull/9616)) [@PointKernel](https://github.com/PointKernel) +- Add handling of mixed numeric types in `to_dlpack` ([#9585](https://github.com/rapidsai/cudf/pull/9585)) [@galipremsagar](https://github.com/galipremsagar) +- Support re.Pattern object for pat arg in str.replace ([#9573](https://github.com/rapidsai/cudf/pull/9573)) [@davidwendt](https://github.com/davidwendt) +- Add JNI for `lists::drop_list_duplicates` with keys-values input column ([#9553](https://github.com/rapidsai/cudf/pull/9553)) [@ttnghia](https://github.com/ttnghia) +- Support structs column in `min`, `max`, `argmin` and `argmax` groupby aggregate() and scan() ([#9545](https://github.com/rapidsai/cudf/pull/9545)) [@ttnghia](https://github.com/ttnghia) +- Move libcudacxx to use `rapids_cpm` and use newer versions ([#9539](https://github.com/rapidsai/cudf/pull/9539)) [@robertmaynard](https://github.com/robertmaynard) +- Add scan min/max support for chrono types to libcudf reduction-scan (not groupby scan) ([#9518](https://github.com/rapidsai/cudf/pull/9518)) [@davidwendt](https://github.com/davidwendt) +- Support `args=` in `apply` ([#9514](https://github.com/rapidsai/cudf/pull/9514)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Add groupby scan min/max support for strings values ([#9502](https://github.com/rapidsai/cudf/pull/9502)) [@davidwendt](https://github.com/davidwendt) +- Add list output option to character_ngrams() function 
([#9499](https://github.com/rapidsai/cudf/pull/9499)) [@davidwendt](https://github.com/davidwendt) +- More granular column selection in ORC reader ([#9496](https://github.com/rapidsai/cudf/pull/9496)) [@vuule](https://github.com/vuule) +- add min_periods, ddof to groupby covariance, & correlation aggregation ([#9492](https://github.com/rapidsai/cudf/pull/9492)) [@karthikeyann](https://github.com/karthikeyann) +- Implement Series.datetime.floor ([#9488](https://github.com/rapidsai/cudf/pull/9488)) [@skirui-source](https://github.com/skirui-source) +- Enable linting of CMake files using pre-commit ([#9484](https://github.com/rapidsai/cudf/pull/9484)) [@vyasr](https://github.com/vyasr) +- Add support for single-line regex anchors ^/$ in contains_re ([#9482](https://github.com/rapidsai/cudf/pull/9482)) [@davidwendt](https://github.com/davidwendt) +- Augment `order_by` to Accept a List of `null_precedence` ([#9455](https://github.com/rapidsai/cudf/pull/9455)) [@isVoid](https://github.com/isVoid) +- Add format API for list column of strings ([#9454](https://github.com/rapidsai/cudf/pull/9454)) [@davidwendt](https://github.com/davidwendt) +- Enable Datetime/Timedelta dtypes in Masked UDFs ([#9451](https://github.com/rapidsai/cudf/pull/9451)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Add cudf python groupby.diff ([#9446](https://github.com/rapidsai/cudf/pull/9446)) [@karthikeyann](https://github.com/karthikeyann) +- Implement `lists::stable_sort_lists` for stable sorting of elements within each row of lists column ([#9425](https://github.com/rapidsai/cudf/pull/9425)) [@ttnghia](https://github.com/ttnghia) +- add ctest memcheck using cuda-sanitizer ([#9414](https://github.com/rapidsai/cudf/pull/9414)) [@karthikeyann](https://github.com/karthikeyann) +- Support Unary Operations in Masked UDF ([#9409](https://github.com/rapidsai/cudf/pull/9409)) [@isVoid](https://github.com/isVoid) +- Move Several Series Function to Frame 
([#9394](https://github.com/rapidsai/cudf/pull/9394)) [@isVoid](https://github.com/isVoid) +- MD5 Python hash API ([#9390](https://github.com/rapidsai/cudf/pull/9390)) [@bdice](https://github.com/bdice) +- Add cudf strings is_title API ([#9380](https://github.com/rapidsai/cudf/pull/9380)) [@davidwendt](https://github.com/davidwendt) +- Enable casting to int64, uint64, and double in AST code. ([#9379](https://github.com/rapidsai/cudf/pull/9379)) [@vyasr](https://github.com/vyasr) +- Add support for writing ORC with map columns ([#9369](https://github.com/rapidsai/cudf/pull/9369)) [@vuule](https://github.com/vuule) +- extract_list_elements() with column_view indices ([#9367](https://github.com/rapidsai/cudf/pull/9367)) [@mythrocks](https://github.com/mythrocks) +- Reimplement `lists::drop_list_duplicates` for keys-values lists columns ([#9345](https://github.com/rapidsai/cudf/pull/9345)) [@ttnghia](https://github.com/ttnghia) +- Support Python UDFs written in terms of rows ([#9343](https://github.com/rapidsai/cudf/pull/9343)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- JNI: Support nested types in ORC writer ([#9334](https://github.com/rapidsai/cudf/pull/9334)) [@firestarman](https://github.com/firestarman) +- Optionally nullify out-of-bounds indices in segmented_gather(). 
([#9318](https://github.com/rapidsai/cudf/pull/9318)) [@mythrocks](https://github.com/mythrocks) +- Add shallow hash function and shallow equality comparison for column_view ([#9312](https://github.com/rapidsai/cudf/pull/9312)) [@karthikeyann](https://github.com/karthikeyann) +- Add CudaMemoryBuffer for cudaMalloc memory using RMM cuda_memory_resource ([#9311](https://github.com/rapidsai/cudf/pull/9311)) [@rongou](https://github.com/rongou) +- Add parameters to control row index stride and stripe size in ORC writer ([#9310](https://github.com/rapidsai/cudf/pull/9310)) [@vuule](https://github.com/vuule) +- Add `na_position` param to dask-cudf `sort_values` ([#9264](https://github.com/rapidsai/cudf/pull/9264)) [@charlesbluca](https://github.com/charlesbluca) +- Add `ascending` parameter for dask-cudf `sort_values` ([#9250](https://github.com/rapidsai/cudf/pull/9250)) [@charlesbluca](https://github.com/charlesbluca) +- New array conversion methods ([#9236](https://github.com/rapidsai/cudf/pull/9236)) [@vyasr](https://github.com/vyasr) +- Series `apply` method backed by masked UDFs ([#9217](https://github.com/rapidsai/cudf/pull/9217)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Grouping by frequency and resampling ([#9178](https://github.com/rapidsai/cudf/pull/9178)) [@shwina](https://github.com/shwina) +- Pure-python masked UDFs ([#9174](https://github.com/rapidsai/cudf/pull/9174)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Add Covariance, Pearson correlation for sort groupby (libcudf) ([#9154](https://github.com/rapidsai/cudf/pull/9154)) [@karthikeyann](https://github.com/karthikeyann) +- Add `calendrical_month_sequence` in c++ and `date_range` in python ([#8886](https://github.com/rapidsai/cudf/pull/8886)) [@shwina](https://github.com/shwina) + +## 🛠️ Improvements + +- Followup to PR 9088 comments ([#9659](https://github.com/rapidsai/cudf/pull/9659)) [@cwharris](https://github.com/cwharris) +- Update cuCollections to version 
that supports installed libcudacxx ([#9633](https://github.com/rapidsai/cudf/pull/9633)) [@robertmaynard](https://github.com/robertmaynard) +- Add `11.5` dev.yml to `cudf` ([#9617](https://github.com/rapidsai/cudf/pull/9617)) [@galipremsagar](https://github.com/galipremsagar) +- Add `xfail` for parquet reader `11.5` issue ([#9612](https://github.com/rapidsai/cudf/pull/9612)) [@galipremsagar](https://github.com/galipremsagar) +- remove deprecated Rmm.initialize method ([#9607](https://github.com/rapidsai/cudf/pull/9607)) [@rongou](https://github.com/rongou) +- Use HostColumnVectorCore for child columns in JCudfSerialization.unpackHostColumnVectors ([#9596](https://github.com/rapidsai/cudf/pull/9596)) [@sperlingxx](https://github.com/sperlingxx) +- Set RMM pool to a fixed size in JNI ([#9583](https://github.com/rapidsai/cudf/pull/9583)) [@rongou](https://github.com/rongou) +- Use nvCOMP for Snappy compression/decompression ([#9582](https://github.com/rapidsai/cudf/pull/9582)) [@vuule](https://github.com/vuule) +- Build CUDA version agnostic packages for dask-cudf ([#9578](https://github.com/rapidsai/cudf/pull/9578)) [@Ethyling](https://github.com/Ethyling) +- Fixed tests warning: "TYPED_TEST_CASE is deprecated, please use TYPED_TEST_SUITE" ([#9574](https://github.com/rapidsai/cudf/pull/9574)) [@ttnghia](https://github.com/ttnghia) +- Enable CMake format in CI and fix style ([#9570](https://github.com/rapidsai/cudf/pull/9570)) [@vyasr](https://github.com/vyasr) +- Add NVTX Start/End Ranges to JNI ([#9563](https://github.com/rapidsai/cudf/pull/9563)) [@abellina](https://github.com/abellina) +- Add librdkafka and python-confluent-kafka to dev conda environments s… ([#9562](https://github.com/rapidsai/cudf/pull/9562)) [@jdye64](https://github.com/jdye64) +- Add offsets_begin/end() to strings_column_view ([#9559](https://github.com/rapidsai/cudf/pull/9559)) [@davidwendt](https://github.com/davidwendt) +- remove alignment options for RMM jni 
([#9550](https://github.com/rapidsai/cudf/pull/9550)) [@rongou](https://github.com/rongou) +- Add axis parameter passthrough to `DataFrame` and `Series` take for pandas API compatibility ([#9549](https://github.com/rapidsai/cudf/pull/9549)) [@dantegd](https://github.com/dantegd) +- Remove sizeof and standardize on memory_usage ([#9544](https://github.com/rapidsai/cudf/pull/9544)) [@vyasr](https://github.com/vyasr) +- Adds cudaProfilerStart/cudaProfilerStop in JNI api ([#9543](https://github.com/rapidsai/cudf/pull/9543)) [@abellina](https://github.com/abellina) +- Generalize comparison binary operations ([#9542](https://github.com/rapidsai/cudf/pull/9542)) [@vyasr](https://github.com/vyasr) +- Expose APIs to wrap CUDA or RMM allocations with a Java device buffer instance ([#9538](https://github.com/rapidsai/cudf/pull/9538)) [@jlowe](https://github.com/jlowe) +- Add scan sum support for duration types to libcudf ([#9536](https://github.com/rapidsai/cudf/pull/9536)) [@davidwendt](https://github.com/davidwendt) +- Force inlining to improve AST performance ([#9530](https://github.com/rapidsai/cudf/pull/9530)) [@vyasr](https://github.com/vyasr) +- Generalize some more indexed frame methods ([#9529](https://github.com/rapidsai/cudf/pull/9529)) [@vyasr](https://github.com/vyasr) +- Add Java bindings for rolling window stddev aggregation ([#9527](https://github.com/rapidsai/cudf/pull/9527)) [@razajafri](https://github.com/razajafri) +- catch rmm::out_of_memory exceptions in jni ([#9525](https://github.com/rapidsai/cudf/pull/9525)) [@rongou](https://github.com/rongou) +- Add an overload of `make_empty_column` with `type_id` parameter ([#9524](https://github.com/rapidsai/cudf/pull/9524)) [@ttnghia](https://github.com/ttnghia) +- Accelerate conditional inner joins with larger right tables ([#9523](https://github.com/rapidsai/cudf/pull/9523)) [@vyasr](https://github.com/vyasr) +- Initial pass of generalizing `decimal` support in `cudf` python layer 
([#9517](https://github.com/rapidsai/cudf/pull/9517)) [@galipremsagar](https://github.com/galipremsagar) +- Cleanup for flattening nested columns ([#9509](https://github.com/rapidsai/cudf/pull/9509)) [@rwlee](https://github.com/rwlee) +- Enable running tests using RMM arena and async memory resources ([#9506](https://github.com/rapidsai/cudf/pull/9506)) [@rongou](https://github.com/rongou) +- Remove dependency on six. ([#9495](https://github.com/rapidsai/cudf/pull/9495)) [@bdice](https://github.com/bdice) +- Cleanup some libcudf strings gtests ([#9489](https://github.com/rapidsai/cudf/pull/9489)) [@davidwendt](https://github.com/davidwendt) +- Rename strings/array_tests.cu to strings/array_tests.cpp ([#9480](https://github.com/rapidsai/cudf/pull/9480)) [@davidwendt](https://github.com/davidwendt) +- Refactor sorting APIs ([#9464](https://github.com/rapidsai/cudf/pull/9464)) [@vyasr](https://github.com/vyasr) +- Implement DataFrame.hash_values, deprecate DataFrame.hash_columns. ([#9458](https://github.com/rapidsai/cudf/pull/9458)) [@bdice](https://github.com/bdice) +- Deprecate Series.hash_encode. ([#9457](https://github.com/rapidsai/cudf/pull/9457)) [@bdice](https://github.com/bdice) +- Update `conda` recipes for Enhanced Compatibility effort ([#9456](https://github.com/rapidsai/cudf/pull/9456)) [@ajschmidt8](https://github.com/ajschmidt8) +- Small clean up to simplify column selection code in ORC reader ([#9444](https://github.com/rapidsai/cudf/pull/9444)) [@vuule](https://github.com/vuule) +- add missing stream to scalar.is_valid() wherever stream is available ([#9436](https://github.com/rapidsai/cudf/pull/9436)) [@karthikeyann](https://github.com/karthikeyann) +- Adds Deprecation Warnings to `one_hot_encoding` and Implement `get_dummies` with Cython API ([#9435](https://github.com/rapidsai/cudf/pull/9435)) [@isVoid](https://github.com/isVoid) +- Update pre-commit hook URLs. 
([#9433](https://github.com/rapidsai/cudf/pull/9433)) [@bdice](https://github.com/bdice) +- Remove pyarrow import in `dask_cudf.io.parquet` ([#9429](https://github.com/rapidsai/cudf/pull/9429)) [@charlesbluca](https://github.com/charlesbluca) +- Miscellaneous improvements for UDFs ([#9422](https://github.com/rapidsai/cudf/pull/9422)) [@isVoid](https://github.com/isVoid) +- Use pre-commit for CI ([#9412](https://github.com/rapidsai/cudf/pull/9412)) [@vyasr](https://github.com/vyasr) +- Update to UCX-Py 0.23 ([#9407](https://github.com/rapidsai/cudf/pull/9407)) [@pentschev](https://github.com/pentschev) +- Expose OutOfBoundsPolicy in JNI for Table.gather ([#9406](https://github.com/rapidsai/cudf/pull/9406)) [@abellina](https://github.com/abellina) +- Improvements to tdigest aggregation code. ([#9403](https://github.com/rapidsai/cudf/pull/9403)) [@nvdbaranec](https://github.com/nvdbaranec) +- Add Java API to deserialize a table to host columns ([#9402](https://github.com/rapidsai/cudf/pull/9402)) [@jlowe](https://github.com/jlowe) +- Frame copy to use __class__ instead of type() ([#9397](https://github.com/rapidsai/cudf/pull/9397)) [@madsbk](https://github.com/madsbk) +- Change all DeprecationWarnings to FutureWarning. 
([#9392](https://github.com/rapidsai/cudf/pull/9392)) [@bdice](https://github.com/bdice) +- Update Java nvcomp JNI bindings to nvcomp 2.x API ([#9384](https://github.com/rapidsai/cudf/pull/9384)) [@jbrennan333](https://github.com/jbrennan333) +- Add IndexedFrame class and move SingleColumnFrame to a separate module ([#9378](https://github.com/rapidsai/cudf/pull/9378)) [@vyasr](https://github.com/vyasr) +- Support Arrow NativeFile and PythonFile for remote ORC storage ([#9377](https://github.com/rapidsai/cudf/pull/9377)) [@rjzamora](https://github.com/rjzamora) +- Use Arrow PythonFile for remote CSV storage ([#9376](https://github.com/rapidsai/cudf/pull/9376)) [@rjzamora](https://github.com/rjzamora) +- Add multi-threaded writing to GDS writes ([#9372](https://github.com/rapidsai/cudf/pull/9372)) [@devavret](https://github.com/devavret) +- Miscellaneous column cleanup ([#9370](https://github.com/rapidsai/cudf/pull/9370)) [@vyasr](https://github.com/vyasr) +- Use single kernel to extract all groups in cudf::strings::extract ([#9358](https://github.com/rapidsai/cudf/pull/9358)) [@davidwendt](https://github.com/davidwendt) +- Consolidate binary ops into `Frame` ([#9357](https://github.com/rapidsai/cudf/pull/9357)) [@isVoid](https://github.com/isVoid) +- Move rank scan implementations from scan_inclusive.cu to rank_scan.cu ([#9351](https://github.com/rapidsai/cudf/pull/9351)) [@davidwendt](https://github.com/davidwendt) +- Remove usage of deprecated thrust::host_space_tag. ([#9350](https://github.com/rapidsai/cudf/pull/9350)) [@bdice](https://github.com/bdice) +- Use Default Memory Resource for Temporaries in `reduction.cpp` ([#9344](https://github.com/rapidsai/cudf/pull/9344)) [@isVoid](https://github.com/isVoid) +- Fix Cython compilation warnings. 
([#9327](https://github.com/rapidsai/cudf/pull/9327)) [@bdice](https://github.com/bdice) +- Fix some unused variable warnings in libcudf ([#9326](https://github.com/rapidsai/cudf/pull/9326)) [@davidwendt](https://github.com/davidwendt) +- Use optional-iterator for copy-if-else kernel ([#9324](https://github.com/rapidsai/cudf/pull/9324)) [@davidwendt](https://github.com/davidwendt) +- Remove Table class ([#9315](https://github.com/rapidsai/cudf/pull/9315)) [@vyasr](https://github.com/vyasr) +- Unpin `dask` and `distributed` in CI ([#9307](https://github.com/rapidsai/cudf/pull/9307)) [@galipremsagar](https://github.com/galipremsagar) +- Add optional-iterator support to indexalator ([#9306](https://github.com/rapidsai/cudf/pull/9306)) [@davidwendt](https://github.com/davidwendt) +- Consolidate more methods in Frame ([#9305](https://github.com/rapidsai/cudf/pull/9305)) [@vyasr](https://github.com/vyasr) +- Add Arrow-NativeFile and PythonFile support to read_parquet and read_csv in cudf ([#9304](https://github.com/rapidsai/cudf/pull/9304)) [@rjzamora](https://github.com/rjzamora) +- Pin mypy in .pre-commit-config.yaml to match conda environment pinning. 
([#9300](https://github.com/rapidsai/cudf/pull/9300)) [@bdice](https://github.com/bdice) +- Use gather.hpp when gather-map exists in device memory ([#9299](https://github.com/rapidsai/cudf/pull/9299)) [@davidwendt](https://github.com/davidwendt) +- Fix Automerger for `Branch-21.12` from `branch-21.10` ([#9285](https://github.com/rapidsai/cudf/pull/9285)) [@galipremsagar](https://github.com/galipremsagar) +- Refactor cuIO timestamp processing with `cuda::std::chrono` ([#9278](https://github.com/rapidsai/cudf/pull/9278)) [@PointKernel](https://github.com/PointKernel) +- Change strings copy_if_else to use optional-iterator instead of pair-iterator ([#9266](https://github.com/rapidsai/cudf/pull/9266)) [@davidwendt](https://github.com/davidwendt) +- Update cudf java bindings to 21.12.0-SNAPSHOT ([#9248](https://github.com/rapidsai/cudf/pull/9248)) [@pxLi](https://github.com/pxLi) +- Various internal MultiIndex improvements ([#9243](https://github.com/rapidsai/cudf/pull/9243)) [@vyasr](https://github.com/vyasr) +- Add detail interface for `split` and `slice(table_view)`, refactors both function with `host_span` ([#9226](https://github.com/rapidsai/cudf/pull/9226)) [@isVoid](https://github.com/isVoid) +- Refactor MD5 implementation. ([#9212](https://github.com/rapidsai/cudf/pull/9212)) [@bdice](https://github.com/bdice) +- Update groupby result_cache to allow sharing intermediate results based on column_view instead of requests. 
([#9195](https://github.com/rapidsai/cudf/pull/9195)) [@karthikeyann](https://github.com/karthikeyann) +- Use nvcomp's snappy decompressor in avro reader ([#9181](https://github.com/rapidsai/cudf/pull/9181)) [@devavret](https://github.com/devavret) +- Add `isocalendar` API support ([#9169](https://github.com/rapidsai/cudf/pull/9169)) [@marlenezw](https://github.com/marlenezw) +- Simplify read_json by removing unnecessary reader/impl classes ([#9088](https://github.com/rapidsai/cudf/pull/9088)) [@cwharris](https://github.com/cwharris) +- Simplify read_csv by removing unnecessary reader/impl classes ([#9041](https://github.com/rapidsai/cudf/pull/9041)) [@cwharris](https://github.com/cwharris) +- Refactor hash join with cuCollections multimap ([#8934](https://github.com/rapidsai/cudf/pull/8934)) [@PointKernel](https://github.com/PointKernel) # cuDF 21.10.00 (7 Oct 2021) From ce02856c099694ad463dbf7970dfc69276842557 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Fri, 17 Dec 2021 15:44:56 -0800 Subject: [PATCH 02/12] Add decimal types to cuIO benchmarks (#9776) Closes https://github.com/rapidsai/cudf/issues/9769 Depends on https://github.com/rapidsai/cudf/pull/9775 Benchmarks now include decimal32/64/128 columns for all supported formats. Also fixes an issue in distribution factory, which caused all normal distributions to generate `upper_bound` in many cases. 
Authors: - Vukasin Milovanovic (https://github.com/vuule) - Jason Lowe (https://github.com/jlowe) Approvers: - Devavret Makkar (https://github.com/devavret) - https://github.com/nvdbaranec URL: https://github.com/rapidsai/cudf/pull/9776 --- .../common/generate_benchmark_input.cpp | 25 +++++++++++++++-- .../common/generate_benchmark_input.hpp | 13 +++++++-- .../common/random_distribution_factory.hpp | 27 +++++++++++-------- .../io/csv/csv_reader_benchmark.cpp | 2 ++ .../io/csv/csv_writer_benchmark.cpp | 2 ++ .../io/orc/orc_reader_benchmark.cpp | 5 +++- .../io/orc/orc_writer_benchmark.cpp | 5 +++- .../io/parquet/parquet_reader_benchmark.cpp | 5 +++- .../io/parquet/parquet_writer_benchmark.cpp | 5 +++- 9 files changed, 70 insertions(+), 19 deletions(-) diff --git a/cpp/benchmarks/common/generate_benchmark_input.cpp b/cpp/benchmarks/common/generate_benchmark_input.cpp index 0ec2590bdb5..995cea13c27 100644 --- a/cpp/benchmarks/common/generate_benchmark_input.cpp +++ b/cpp/benchmarks/common/generate_benchmark_input.cpp @@ -161,8 +161,29 @@ struct random_value_fn()>> { */ template struct random_value_fn()>> { - random_value_fn(distribution_params const&) {} - T operator()(std::mt19937& engine) { CUDF_FAIL("Not implemented"); } + using rep = typename T::rep; + rep const lower_bound; + rep const upper_bound; + distribution_fn dist; + std::optional scale; + + random_value_fn(distribution_params const& desc) + : lower_bound{desc.lower_bound}, + upper_bound{desc.upper_bound}, + dist{make_distribution(desc.id, desc.lower_bound, desc.upper_bound)} + { + } + + T operator()(std::mt19937& engine) + { + if (not scale.has_value()) { + int const max_scale = std::numeric_limits::digits10; + auto scale_dist = make_distribution(distribution_id::NORMAL, -max_scale, max_scale); + scale = numeric::scale_type{std::max(std::min(scale_dist(engine), max_scale), -max_scale)}; + } + // Clamp the generated random value to the specified range + return T{std::max(std::min(dist(engine), upper_bound), 
lower_bound), *scale}; + } }; /** diff --git a/cpp/benchmarks/common/generate_benchmark_input.hpp b/cpp/benchmarks/common/generate_benchmark_input.hpp index 6ea57c0a7ad..3dbc6561839 100644 --- a/cpp/benchmarks/common/generate_benchmark_input.hpp +++ b/cpp/benchmarks/common/generate_benchmark_input.hpp @@ -216,6 +216,7 @@ class data_profile { distribution_params string_dist_desc{{distribution_id::NORMAL, 0, 32}}; distribution_params list_dist_desc{ cudf::type_id::INT32, {distribution_id::GEOMETRIC, 0, 100}, 2}; + std::map> decimal_params; double bool_probability = 0.5; double null_frequency = 0.01; @@ -284,9 +285,17 @@ class data_profile { } template ()>* = nullptr> - distribution_params get_distribution_params() const + distribution_params get_distribution_params() const { - CUDF_FAIL("Not implemented"); + using rep = typename T::rep; + auto it = decimal_params.find(cudf::type_to_id()); + if (it == decimal_params.end()) { + auto const range = default_range(); + return distribution_params{default_distribution_id(), range.first, range.second}; + } else { + auto& desc = it->second; + return {desc.id, static_cast(desc.lower_bound), static_cast(desc.upper_bound)}; + } } auto get_bool_probability() const { return bool_probability; } diff --git a/cpp/benchmarks/common/random_distribution_factory.hpp b/cpp/benchmarks/common/random_distribution_factory.hpp index c21fb645573..65dc8b4dd4d 100644 --- a/cpp/benchmarks/common/random_distribution_factory.hpp +++ b/cpp/benchmarks/common/random_distribution_factory.hpp @@ -21,19 +21,24 @@ #include #include +/** + * @brief Generates a normal(binomial) distribution between zero and upper_bound. 
+ */ template ::value, T>* = nullptr> -auto make_normal_dist(T range_start, T range_end) +auto make_normal_dist(T upper_bound) { - using uT = typename std::make_unsigned::type; - uT const range_size = range_end - range_start; - return std::binomial_distribution(range_size, 0.5); + using uT = typename std::make_unsigned::type; + return std::binomial_distribution(upper_bound, 0.5); } +/** + * @brief Generates a normal distribution between zero and upper_bound. + */ template ()>* = nullptr> -auto make_normal_dist(T range_start, T range_end) +auto make_normal_dist(T upper_bound) { - T const mean = range_start / 2 + range_end / 2; - T const stddev = range_end / 6 - range_start / 6; + T const mean = upper_bound / 2; + T const stddev = upper_bound / 6; return std::normal_distribution(mean, stddev); } @@ -82,8 +87,8 @@ distribution_fn make_distribution(distribution_id did, T lower_bound, T upper { switch (did) { case distribution_id::NORMAL: - return [lower_bound, dist = make_normal_dist(lower_bound, upper_bound)]( - std::mt19937& engine) mutable -> T { return dist(engine) - lower_bound; }; + return [lower_bound, dist = make_normal_dist(upper_bound - lower_bound)]( + std::mt19937& engine) mutable -> T { return dist(engine) + lower_bound; }; case distribution_id::UNIFORM: return [dist = make_uniform_dist(lower_bound, upper_bound)]( std::mt19937& engine) mutable -> T { return dist(engine); }; @@ -104,8 +109,8 @@ distribution_fn make_distribution(distribution_id dist_id, T lower_bound, T u { switch (dist_id) { case distribution_id::NORMAL: - return [dist = make_normal_dist(lower_bound, upper_bound)]( - std::mt19937& engine) mutable -> T { return dist(engine); }; + return [lower_bound, dist = make_normal_dist(upper_bound - lower_bound)]( + std::mt19937& engine) mutable -> T { return dist(engine) + lower_bound; }; case distribution_id::UNIFORM: return [dist = make_uniform_dist(lower_bound, upper_bound)]( std::mt19937& engine) mutable -> T { return dist(engine); }; diff --git 
a/cpp/benchmarks/io/csv/csv_reader_benchmark.cpp b/cpp/benchmarks/io/csv/csv_reader_benchmark.cpp index 3f5549a3148..77bf4b03a14 100644 --- a/cpp/benchmarks/io/csv/csv_reader_benchmark.cpp +++ b/cpp/benchmarks/io/csv/csv_reader_benchmark.cpp @@ -70,6 +70,7 @@ void BM_csv_read_varying_options(benchmark::State& state) auto const data_types = dtypes_for_column_selection(get_type_or_group({int32_t(type_group_id::INTEGRAL), int32_t(type_group_id::FLOATING_POINT), + int32_t(type_group_id::FIXED_POINT), int32_t(type_group_id::TIMESTAMP), int32_t(cudf::type_id::STRING)}), col_sel); @@ -143,6 +144,7 @@ void BM_csv_read_varying_options(benchmark::State& state) RD_BENCHMARK_DEFINE_ALL_SOURCES(CSV_RD_BM_INPUTS_DEFINE, integral, type_group_id::INTEGRAL); RD_BENCHMARK_DEFINE_ALL_SOURCES(CSV_RD_BM_INPUTS_DEFINE, floats, type_group_id::FLOATING_POINT); +RD_BENCHMARK_DEFINE_ALL_SOURCES(CSV_RD_BM_INPUTS_DEFINE, decimal, type_group_id::FIXED_POINT); RD_BENCHMARK_DEFINE_ALL_SOURCES(CSV_RD_BM_INPUTS_DEFINE, timestamps, type_group_id::TIMESTAMP); RD_BENCHMARK_DEFINE_ALL_SOURCES(CSV_RD_BM_INPUTS_DEFINE, string, cudf::type_id::STRING); diff --git a/cpp/benchmarks/io/csv/csv_writer_benchmark.cpp b/cpp/benchmarks/io/csv/csv_writer_benchmark.cpp index fdd7c63eece..9baab6b2571 100644 --- a/cpp/benchmarks/io/csv/csv_writer_benchmark.cpp +++ b/cpp/benchmarks/io/csv/csv_writer_benchmark.cpp @@ -63,6 +63,7 @@ void BM_csv_write_varying_options(benchmark::State& state) auto const data_types = get_type_or_group({int32_t(type_group_id::INTEGRAL), int32_t(type_group_id::FLOATING_POINT), + int32_t(type_group_id::FIXED_POINT), int32_t(type_group_id::TIMESTAMP), int32_t(cudf::type_id::STRING)}); @@ -96,6 +97,7 @@ void BM_csv_write_varying_options(benchmark::State& state) WR_BENCHMARK_DEFINE_ALL_SINKS(CSV_WR_BM_INOUTS_DEFINE, integral, type_group_id::INTEGRAL); WR_BENCHMARK_DEFINE_ALL_SINKS(CSV_WR_BM_INOUTS_DEFINE, floats, type_group_id::FLOATING_POINT); 
+WR_BENCHMARK_DEFINE_ALL_SINKS(CSV_WR_BM_INOUTS_DEFINE, decimal, type_group_id::FIXED_POINT); WR_BENCHMARK_DEFINE_ALL_SINKS(CSV_WR_BM_INOUTS_DEFINE, timestamps, type_group_id::TIMESTAMP); WR_BENCHMARK_DEFINE_ALL_SINKS(CSV_WR_BM_INOUTS_DEFINE, string, cudf::type_id::STRING); diff --git a/cpp/benchmarks/io/orc/orc_reader_benchmark.cpp b/cpp/benchmarks/io/orc/orc_reader_benchmark.cpp index f0624e40149..6ab8d8d09c0 100644 --- a/cpp/benchmarks/io/orc/orc_reader_benchmark.cpp +++ b/cpp/benchmarks/io/orc/orc_reader_benchmark.cpp @@ -91,8 +91,10 @@ void BM_orc_read_varying_options(benchmark::State& state) auto const data_types = dtypes_for_column_selection(get_type_or_group({int32_t(type_group_id::INTEGRAL_SIGNED), int32_t(type_group_id::FLOATING_POINT), + int32_t(type_group_id::FIXED_POINT), int32_t(type_group_id::TIMESTAMP), - int32_t(cudf::type_id::STRING)}), + int32_t(cudf::type_id::STRING), + int32_t(cudf::type_id::LIST)}), col_sel); auto const tbl = create_random_table(data_types, data_types.size(), table_size_bytes{data_size}); auto const view = tbl->view(); @@ -158,6 +160,7 @@ void BM_orc_read_varying_options(benchmark::State& state) RD_BENCHMARK_DEFINE_ALL_SOURCES(ORC_RD_BM_INPUTS_DEFINE, integral, type_group_id::INTEGRAL_SIGNED); RD_BENCHMARK_DEFINE_ALL_SOURCES(ORC_RD_BM_INPUTS_DEFINE, floats, type_group_id::FLOATING_POINT); +RD_BENCHMARK_DEFINE_ALL_SOURCES(ORC_RD_BM_INPUTS_DEFINE, decimal, type_group_id::FIXED_POINT); RD_BENCHMARK_DEFINE_ALL_SOURCES(ORC_RD_BM_INPUTS_DEFINE, timestamps, type_group_id::TIMESTAMP); RD_BENCHMARK_DEFINE_ALL_SOURCES(ORC_RD_BM_INPUTS_DEFINE, string, cudf::type_id::STRING); RD_BENCHMARK_DEFINE_ALL_SOURCES(ORC_RD_BM_INPUTS_DEFINE, list, cudf::type_id::LIST); diff --git a/cpp/benchmarks/io/orc/orc_writer_benchmark.cpp b/cpp/benchmarks/io/orc/orc_writer_benchmark.cpp index bfa7d4fc6d9..933b3d02e08 100644 --- a/cpp/benchmarks/io/orc/orc_writer_benchmark.cpp +++ b/cpp/benchmarks/io/orc/orc_writer_benchmark.cpp @@ -70,8 +70,10 @@ void 
BM_orc_write_varying_options(benchmark::State& state) auto const data_types = get_type_or_group({int32_t(type_group_id::INTEGRAL_SIGNED), int32_t(type_group_id::FLOATING_POINT), + int32_t(type_group_id::FIXED_POINT), int32_t(type_group_id::TIMESTAMP), - int32_t(cudf::type_id::STRING)}); + int32_t(cudf::type_id::STRING), + int32_t(cudf::type_id::LIST)}); auto const tbl = create_random_table(data_types, data_types.size(), table_size_bytes{data_size}); auto const view = tbl->view(); @@ -101,6 +103,7 @@ void BM_orc_write_varying_options(benchmark::State& state) WR_BENCHMARK_DEFINE_ALL_SINKS(ORC_WR_BM_INOUTS_DEFINE, integral, type_group_id::INTEGRAL_SIGNED); WR_BENCHMARK_DEFINE_ALL_SINKS(ORC_WR_BM_INOUTS_DEFINE, floats, type_group_id::FLOATING_POINT); +WR_BENCHMARK_DEFINE_ALL_SINKS(ORC_WR_BM_INOUTS_DEFINE, decimal, type_group_id::FIXED_POINT); WR_BENCHMARK_DEFINE_ALL_SINKS(ORC_WR_BM_INOUTS_DEFINE, timestamps, type_group_id::TIMESTAMP); WR_BENCHMARK_DEFINE_ALL_SINKS(ORC_WR_BM_INOUTS_DEFINE, string, cudf::type_id::STRING); WR_BENCHMARK_DEFINE_ALL_SINKS(ORC_WR_BM_INOUTS_DEFINE, list, cudf::type_id::LIST); diff --git a/cpp/benchmarks/io/parquet/parquet_reader_benchmark.cpp b/cpp/benchmarks/io/parquet/parquet_reader_benchmark.cpp index 045aa0e043b..a68ce2bd1a1 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_benchmark.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_benchmark.cpp @@ -92,8 +92,10 @@ void BM_parq_read_varying_options(benchmark::State& state) auto const data_types = dtypes_for_column_selection(get_type_or_group({int32_t(type_group_id::INTEGRAL), int32_t(type_group_id::FLOATING_POINT), + int32_t(type_group_id::FIXED_POINT), int32_t(type_group_id::TIMESTAMP), - int32_t(cudf::type_id::STRING)}), + int32_t(cudf::type_id::STRING), + int32_t(cudf::type_id::LIST)}), col_sel); auto const tbl = create_random_table(data_types, data_types.size(), table_size_bytes{data_size}); auto const view = tbl->view(); @@ -160,6 +162,7 @@ void 
BM_parq_read_varying_options(benchmark::State& state) RD_BENCHMARK_DEFINE_ALL_SOURCES(PARQ_RD_BM_INPUTS_DEFINE, integral, type_group_id::INTEGRAL); RD_BENCHMARK_DEFINE_ALL_SOURCES(PARQ_RD_BM_INPUTS_DEFINE, floats, type_group_id::FLOATING_POINT); +RD_BENCHMARK_DEFINE_ALL_SOURCES(PARQ_RD_BM_INPUTS_DEFINE, decimal, type_group_id::FIXED_POINT); RD_BENCHMARK_DEFINE_ALL_SOURCES(PARQ_RD_BM_INPUTS_DEFINE, timestamps, type_group_id::TIMESTAMP); RD_BENCHMARK_DEFINE_ALL_SOURCES(PARQ_RD_BM_INPUTS_DEFINE, string, cudf::type_id::STRING); RD_BENCHMARK_DEFINE_ALL_SOURCES(PARQ_RD_BM_INPUTS_DEFINE, list, cudf::type_id::LIST); diff --git a/cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp b/cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp index 5c3c53fee8e..1af7e206692 100644 --- a/cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp +++ b/cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp @@ -71,8 +71,10 @@ void BM_parq_write_varying_options(benchmark::State& state) auto const data_types = get_type_or_group({int32_t(type_group_id::INTEGRAL_SIGNED), int32_t(type_group_id::FLOATING_POINT), + int32_t(type_group_id::FIXED_POINT), int32_t(type_group_id::TIMESTAMP), - int32_t(cudf::type_id::STRING)}); + int32_t(cudf::type_id::STRING), + int32_t(cudf::type_id::LIST)}); auto const tbl = create_random_table(data_types, data_types.size(), table_size_bytes{data_size}); auto const view = tbl->view(); @@ -103,6 +105,7 @@ void BM_parq_write_varying_options(benchmark::State& state) WR_BENCHMARK_DEFINE_ALL_SINKS(PARQ_WR_BM_INOUTS_DEFINE, integral, type_group_id::INTEGRAL); WR_BENCHMARK_DEFINE_ALL_SINKS(PARQ_WR_BM_INOUTS_DEFINE, floats, type_group_id::FLOATING_POINT); +WR_BENCHMARK_DEFINE_ALL_SINKS(PARQ_WR_BM_INOUTS_DEFINE, decimal, type_group_id::FIXED_POINT); WR_BENCHMARK_DEFINE_ALL_SINKS(PARQ_WR_BM_INOUTS_DEFINE, timestamps, type_group_id::TIMESTAMP); WR_BENCHMARK_DEFINE_ALL_SINKS(PARQ_WR_BM_INOUTS_DEFINE, string, cudf::type_id::STRING); 
WR_BENCHMARK_DEFINE_ALL_SINKS(PARQ_WR_BM_INOUTS_DEFINE, list, cudf::type_id::LIST); From a4dc42d4c6b88b7f2da78d55934c01cb7479a6a1 Mon Sep 17 00:00:00 2001 From: MithunR Date: Mon, 20 Dec 2021 08:26:23 -0800 Subject: [PATCH 03/12] Implement `lists::index_of()` to find positions in list rows (#9510) Fixes #9164. ### Prelude `lists::contains()` (introduced in #7039) returns a `BOOL8` column, indicating whether the specified search_key(s) exist at all in each corresponding list row of an input LIST column. It does not return the actual position. ### `index_of()` This commit introduces `lists::index_of()`, to return the INT32 positions of the specified search_key(s) in a LIST column. The search keys may be searched for using either `FIND_FIRST` (which finds the position of the first occurrence), or `FIND_LAST` (which finds the last occurrence). Both column_view and scalar search keys are supported. As with `lists::contains()`, nested types are not supported as search keys in `lists::index_of()`. If the search_key cannot be found, that output row is set to `-1`. Additionally, the row `output[i]` is set to null if: 1. The `search_key`(scalar) or `search_keys[i]`(column_view) is null. 2. The list row `lists[i]` is null In all other cases, `output[i]` should contain a non-negative value. ### Semantic changes for `lists::contains()` This commit also modifies the semantics of `lists::contains()`: it will now return nulls only for the following cases: 1. The `search_key`(scalar) or `search_keys[i]`(column_view) is null. 2. The list row `lists[i]` is null In all other cases, a non-null bool is returned. Specifically `lists::contains()` no longer conforms to SQL semantics of returning `NULL` for list rows that don't contain the search key, while simultaneously containing nulls. In this case, `false` is returned. ### `lists::contains_null_elements()` A new function has been introduced to check if each list row contains null elements. 
The semantics are similar to `lists::contains()`, in that the column returned is BOOL8 typed: 1. If even 1 element in a list row is null, the returned row is `true`. 2. If no element is null, the returned row is `false`. 3. If the list row is null, the returned row is `null`. 4. If the list row is empty, the returned row is `false`. The current implementation is an inefficient placeholder, to be replaced once (#9588) is available. It is included here to reconstruct the SQL semantics dropped from `lists::contains()`. Authors: - MithunR (https://github.com/mythrocks) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Jason Lowe (https://github.com/jlowe) - Mark Harris (https://github.com/harrism) - Conor Hoekstra (https://github.com/codereport) URL: https://github.com/rapidsai/cudf/pull/9510 --- cpp/include/cudf/lists/contains.hpp | 102 +- cpp/src/lists/contains.cu | 353 +++++-- cpp/tests/lists/contains_tests.cpp | 995 ++++++++++++------ .../main/java/ai/rapids/cudf/ColumnView.java | 84 +- java/src/main/native/src/ColumnViewJni.cpp | 50 + .../java/ai/rapids/cudf/ColumnVectorTest.java | 165 ++- python/cudf/cudf/tests/test_list.py | 4 +- 7 files changed, 1283 insertions(+), 470 deletions(-) diff --git a/cpp/include/cudf/lists/contains.hpp b/cpp/include/cudf/lists/contains.hpp index 7cd40bb2f86..d529677d505 100644 --- a/cpp/include/cudf/lists/contains.hpp +++ b/cpp/include/cudf/lists/contains.hpp @@ -27,7 +27,7 @@ namespace lists { */ /** - * @brief Create a column of bool values indicating whether the specified scalar + * @brief Create a column of `bool` values indicating whether the specified scalar * is an element of each row of a list column. * * The output column has as many elements as the input `lists` column. 
@@ -51,7 +51,7 @@ std::unique_ptr contains( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Create a column of bool values indicating whether the list rows of the first + * @brief Create a column of `bool` values indicating whether the list rows of the first * column contain the corresponding values in the second column * * The output column has as many elements as the input `lists` column. @@ -74,6 +74,104 @@ std::unique_ptr contains( cudf::column_view const& search_keys, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Create a column of `bool` values indicating whether each row in the `lists` column + * contains at least one null element. + * + * The output column has as many elements as the input `lists` column. + * Output `column[i]` is set to null if the list row `lists[i]` is null. + * Otherwise, `column[i]` is set to a non-null boolean value, depending on whether that list + * contains a null element. + * (Empty list rows are considered *NOT* to contain a null element.) + * + * @param lists Lists column whose `n` rows are to be searched + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return std::unique_ptr BOOL8 column of `n` rows with the result of the lookup + */ +std::unique_ptr contains_nulls( + cudf::lists_column_view const& lists, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Option to choose whether `index_of()` returns the first or last match + * of a search key in a list row + */ +enum class duplicate_find_option : int32_t { + FIND_FIRST = 0, ///< Finds first instance of a search key in a list row. + FIND_LAST ///< Finds last instance of a search key in a list row. 
+}; + +/** + * @brief Create a column of `size_type` values indicating the position of a search key + * within each list row in the `lists` column + * + * The output column has as many elements as there are rows in the input `lists` column. + * Output `column[i]` contains a 0-based index indicating the position of the search key + * in each list, counting from the beginning of the list. + * Note: + * 1. If the `search_key` is null, all output rows are set to null. + * 2. If the row `lists[i]` is null, `output[i]` is also null. + * 3. If the row `lists[i]` does not contain the `search_key`, `output[i]` is set to `-1`. + * 4. In all other cases, `output[i]` is set to a non-negative `size_type` index. + * + * If the `find_option` is set to `FIND_FIRST`, the position of the first match for + * `search_key` is returned. + * If `find_option == FIND_LAST`, the position of the last match in the list row is + * returned. + * + * @param lists Lists column whose `n` rows are to be searched + * @param search_key The scalar key to be looked up in each list row + * @param find_option Whether to return the position of the first match (`FIND_FIRST`) or + * last (`FIND_LAST`) + * @param mr Device memory resource used to allocate the returned column's device memory. 
+ * @return std::unique_ptr INT32 column of `n` rows with the location of the `search_key` + * + * @throw cudf::logic_error If `search_key` type does not match the element type in `lists` + * @throw cudf::logic_error If `search_key` is of a nested type, or `lists` contains nested + * elements (LIST, STRUCT) + */ +std::unique_ptr index_of( + cudf::lists_column_view const& lists, + cudf::scalar const& search_key, + duplicate_find_option find_option = duplicate_find_option::FIND_FIRST, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Create a column of `size_type` values indicating the position of a search key + * row within the corresponding list row in the `lists` column + * + * The output column has as many elements as there are rows in the input `lists` column. + * Output `column[i]` contains a 0-based index indicating the position of each search key + * row in its corresponding list row, counting from the beginning of the list. + * Note: + * 1. If `search_keys[i]` is null, `output[i]` is also null. + * 2. If the row `lists[i]` is null, `output[i]` is also null. + * 3. If the row `lists[i]` does not contain `search_keys[i]`, `output[i]` is set to `-1`. + * 4. In all other cases, `output[i]` is set to a non-negative `size_type` index. + * + * If the `find_option` is set to `FIND_FIRST`, the position of the first match for + * `search_keys[i]` is returned. + * If `find_option == FIND_LAST`, the position of the last match in the list row is + * returned. + * + * @param lists Lists column whose `n` rows are to be searched + * @param search_keys A column of search keys to be looked up in each corresponding row of + * `lists` + * @param find_option Whether to return the position of the first match (`FIND_FIRST`) or + * last (`FIND_LAST`) + * @param mr Device memory resource used to allocate the returned column's device memory. 
+ * @return std::unique_ptr INT32 column of `n` rows with the location of the `search_key` + * + * @throw cudf::logic_error If `search_keys` does not match `lists` in its number of rows + * @throw cudf::logic_error If `search_keys` type does not match the element type in `lists` + * @throw cudf::logic_error If `lists` or `search_keys` contains nested elements (LIST, STRUCT) + */ +std::unique_ptr index_of( + cudf::lists_column_view const& lists, + cudf::column_view const& search_keys, + duplicate_find_option find_option = duplicate_find_option::FIND_FIRST, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** @} */ // end of group } // namespace lists } // namespace cudf diff --git a/cpp/src/lists/contains.cu b/cpp/src/lists/contains.cu index 3d135992dea..5d095fdd5a3 100644 --- a/cpp/src/lists/contains.cu +++ b/cpp/src/lists/contains.cu @@ -35,6 +35,8 @@ namespace lists { namespace { +auto constexpr absent_index = size_type{-1}; + auto get_search_keys_device_iterable_view(cudf::column_view const& search_keys, rmm::cuda_stream_view stream) { @@ -46,6 +48,59 @@ auto get_search_keys_device_iterable_view(cudf::scalar const& search_key, rmm::c return &search_key; } +template +auto __device__ find_begin(list_device_view const& list) +{ + if constexpr (find_option == duplicate_find_option::FIND_FIRST) { + return list.pair_rep_begin(); + } else { + return thrust::make_reverse_iterator(list.pair_rep_end()); + } +} + +template +auto __device__ find_end(list_device_view const& list) +{ + if constexpr (find_option == duplicate_find_option::FIND_FIRST) { + return list.pair_rep_end(); + } else { + return thrust::make_reverse_iterator(list.pair_rep_begin()); + } +} + +template +size_type __device__ distance([[maybe_unused]] Iterator begin, Iterator end, Iterator find_iter) +{ + if (find_iter == end) { + return absent_index; // Not found. 
+ } + + if constexpr (find_option == duplicate_find_option::FIND_FIRST) { + return find_iter - begin; // Distance of find_position from begin. + } else { + return end - find_iter - 1; // Distance of find_position from end. + } +} + +/** + * @brief __device__ functor to search for a key in a `list_device_view`. + */ +template +struct finder { + template + __device__ size_type operator()(list_device_view const& list, ElementType const& search_key) const + { + auto const list_begin = find_begin(list); + auto const list_end = find_end(list); + auto const find_iter = thrust::find_if( + thrust::seq, list_begin, list_end, [search_key] __device__(auto element_and_validity) { + auto [element, element_is_valid] = element_and_validity; + return element_is_valid && cudf::equality_compare(element, search_key); + }); + return distance(list_begin, list_end, find_iter); + }; +}; + /** * @brief Functor to search each list row for the specified search keys. */ @@ -63,13 +118,15 @@ struct lookup_functor { Args&&...) 
const { CUDF_FAIL( - "lists::contains() is only supported on numeric types, decimals, chrono types, and strings."); + "List search operations are only supported on numeric types, decimals, chrono types, and " + "strings."); } - std::pair construct_null_mask(lists_column_view const& input_lists, - column_view const& result_validity, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + std::pair construct_null_mask( + lists_column_view const& input_lists, + column_view const& result_validity, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { if (!search_keys_have_nulls && !input_lists.has_nulls() && !input_lists.child().has_nulls()) { return {rmm::device_buffer{0, stream, mr}, size_type{0}}; @@ -82,50 +139,31 @@ struct lookup_functor { template void search_each_list_row(cudf::detail::lists_column_device_view const& d_lists, SearchKeyPairIter search_key_pair_iter, - cudf::mutable_column_device_view mutable_ret_bools, - cudf::mutable_column_device_view mutable_ret_validity, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource*) + duplicate_find_option find_option, + cudf::mutable_column_device_view ret_positions, + cudf::mutable_column_device_view ret_validity, + rmm::cuda_stream_view stream) const { - thrust::for_each( + auto output_iterator = thrust::make_zip_iterator( + thrust::make_tuple(ret_positions.data(), ret_validity.data())); + + thrust::tabulate( rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(d_lists.size()), - [d_lists, - search_key_pair_iter, - d_bools = mutable_ret_bools.data(), - d_validity = mutable_ret_validity.data()] __device__(auto row_index) { - auto search_key_and_validity = search_key_pair_iter[row_index]; - auto const& search_key_is_valid = search_key_and_validity.second; - - if (search_keys_have_nulls && !search_key_is_valid) { - d_bools[row_index] = false; - d_validity[row_index] = false; - return; - } + output_iterator, + 
output_iterator + d_lists.size(), + [d_lists, search_key_pair_iter, absent_index = absent_index, find_option] __device__( + auto row_index) -> thrust::pair { + auto [search_key, search_key_is_valid] = search_key_pair_iter[row_index]; + + if (search_keys_have_nulls && !search_key_is_valid) { return {absent_index, false}; } auto list = cudf::list_device_view(d_lists, row_index); - if (list.is_null()) { - d_bools[row_index] = false; - d_validity[row_index] = false; - return; - } - - auto search_key = search_key_and_validity.first; - d_bools[row_index] = - thrust::find_if(thrust::seq, - list.pair_rep_begin(), - list.pair_rep_end(), - [search_key] __device__(auto element_and_validity) { - return element_and_validity.second && - cudf::equality_compare(element_and_validity.first, search_key); - }) != list.pair_rep_end(); - d_validity[row_index] = - d_bools[row_index] || - thrust::none_of(thrust::seq, - thrust::make_counting_iterator(size_type{0}), - thrust::make_counting_iterator(list.size()), - [&list] __device__(auto const& i) { return list.is_null(i); }); + if (list.is_null()) { return {absent_index, false}; } + + auto const position = find_option == duplicate_find_option::FIND_FIRST + ? 
finder{}(list, search_key) + : finder{}(list, search_key); + return {position, true}; }); } @@ -133,74 +171,171 @@ struct lookup_functor { std::enable_if_t::value, std::unique_ptr> operator()( cudf::lists_column_view const& lists, SearchKeyType const& search_key, + duplicate_find_option find_option, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const { using namespace cudf; using namespace cudf::detail; CUDF_EXPECTS(!cudf::is_nested(lists.child().type()), - "Nested types not supported in lists::contains()"); + "Nested types not supported in list search operations."); CUDF_EXPECTS(lists.child().type() == search_key.type(), "Type/Scale of search key does not match list column element type."); CUDF_EXPECTS(search_key.type().id() != type_id::EMPTY, "Type cannot be empty."); auto constexpr search_key_is_scalar = std::is_same_v; - if (search_keys_have_nulls && search_key_is_scalar) { - return make_fixed_width_column(data_type(type_id::BOOL8), - lists.size(), - cudf::create_null_mask(lists.size(), mask_state::ALL_NULL, mr), - lists.size(), - stream, - mr); + if constexpr (search_keys_have_nulls && search_key_is_scalar) { + return make_numeric_column(data_type(type_id::INT32), + lists.size(), + cudf::create_null_mask(lists.size(), mask_state::ALL_NULL, mr), + lists.size(), + stream, + mr); } auto const device_view = column_device_view::create(lists.parent(), stream); - auto const d_lists = lists_column_device_view(*device_view); + auto const d_lists = lists_column_device_view{*device_view}; auto const d_skeys = get_search_keys_device_iterable_view(search_key, stream); - auto result_validity = make_fixed_width_column( + auto result_positions = make_numeric_column( + data_type{type_id::INT32}, lists.size(), cudf::mask_state::UNALLOCATED, stream, mr); + auto result_validity = make_numeric_column( data_type{type_id::BOOL8}, lists.size(), cudf::mask_state::UNALLOCATED, stream, 
mr); - auto result_bools = make_fixed_width_column( - data_type{type_id::BOOL8}, lists.size(), cudf::mask_state::UNALLOCATED, stream, mr); - auto mutable_result_bools = - mutable_column_device_view::create(result_bools->mutable_view(), stream); + auto mutable_result_positions = + mutable_column_device_view::create(result_positions->mutable_view(), stream); auto mutable_result_validity = mutable_column_device_view::create(result_validity->mutable_view(), stream); auto search_key_iter = cudf::detail::make_pair_rep_iterator(*d_skeys); - search_each_list_row( - d_lists, search_key_iter, *mutable_result_bools, *mutable_result_validity, stream, mr); - - rmm::device_buffer null_mask; - size_type num_nulls; + search_each_list_row(d_lists, + search_key_iter, + find_option, + *mutable_result_positions, + *mutable_result_validity, + stream); - std::tie(null_mask, num_nulls) = - construct_null_mask(lists, result_validity->view(), stream, mr); - result_bools->set_null_mask(std::move(null_mask), num_nulls); - - return result_bools; + auto [null_mask, num_nulls] = construct_null_mask(lists, result_validity->view(), stream, mr); + result_positions->set_null_mask(std::move(null_mask), num_nulls); + return result_positions; } }; +/** + * @brief Converts key-positions vector (from index_of()) to a BOOL8 vector, indicating if + * the search key was found. + */ +std::unique_ptr to_contains(std::unique_ptr&& key_positions, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(key_positions->type().id() == type_id::INT32, + "Expected input column of type INT32."); + // If position == -1, the list did not contain the search key. 
+ auto const num_rows = key_positions->size(); + auto const positions_begin = key_positions->view().begin(); + auto result = + make_numeric_column(data_type{type_id::BOOL8}, num_rows, mask_state::UNALLOCATED, stream, mr); + thrust::transform(rmm::exec_policy(stream), + positions_begin, + positions_begin + num_rows, + result->mutable_view().begin(), + [] __device__(auto i) { return i != absent_index; }); + [[maybe_unused]] auto [_, null_mask, __] = key_positions->release(); + result->set_null_mask(std::move(*null_mask)); + return result; +} } // namespace namespace detail { +/** + * @copydoc cudf::lists::index_of(cudf::lists_column_view const&, + * cudf::scalar const&, + * duplicate_find_option, + * rmm::mr::device_memory_resource*) + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr index_of( + cudf::lists_column_view const& lists, + cudf::scalar const& search_key, + duplicate_find_option find_option, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +{ + return search_key.is_valid(stream) + ? cudf::type_dispatcher(search_key.type(), + lookup_functor{}, // No nulls in search key + lists, + search_key, + find_option, + stream, + mr) + : cudf::type_dispatcher(search_key.type(), + lookup_functor{}, // Nulls in search key + lists, + search_key, + find_option, + stream, + mr); +} + +/** + * @copydoc cudf::lists::index_of(cudf::lists_column_view const&, + * cudf::column_view const&, + * duplicate_find_option, + * rmm::mr::device_memory_resource*) + * @param stream CUDA stream used for device memory operations and kernel launches. 
+ */ +std::unique_ptr index_of( + cudf::lists_column_view const& lists, + cudf::column_view const& search_keys, + duplicate_find_option find_option, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +{ + CUDF_EXPECTS(search_keys.size() == lists.size(), + "Number of search keys must match list column size."); + + return search_keys.has_nulls() + ? cudf::type_dispatcher(search_keys.type(), + lookup_functor{}, // Nulls in search keys + lists, + search_keys, + find_option, + stream, + mr) + : cudf::type_dispatcher(search_keys.type(), + lookup_functor{}, // No nulls in search keys + lists, + search_keys, + find_option, + stream, + mr); +} +/** + * @copydoc cudf::lists::contains(cudf::lists_column_view const&, + * cudf::scalar const&, + * rmm::mr::device_memory_resource*) + * @param stream CUDA stream used for device memory operations and kernel launches. + */ std::unique_ptr contains(cudf::lists_column_view const& lists, cudf::scalar const& search_key, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return search_key.is_valid(stream) - ? cudf::type_dispatcher( - search_key.type(), lookup_functor{}, lists, search_key, stream, mr) - : cudf::type_dispatcher( - search_key.type(), lookup_functor{}, lists, search_key, stream, mr); + return to_contains( + index_of(lists, search_key, duplicate_find_option::FIND_FIRST, stream), stream, mr); } +/** + * @copydoc cudf::lists::contains(cudf::lists_column_view const&, + * cudf::column_view const&, + * rmm::mr::device_memory_resource*) + * @param stream CUDA stream used for device memory operations and kernel launches. 
+ */ std::unique_ptr contains(cudf::lists_column_view const& lists, cudf::column_view const& search_keys, rmm::cuda_stream_view stream, @@ -209,11 +344,44 @@ std::unique_ptr contains(cudf::lists_column_view const& lists, CUDF_EXPECTS(search_keys.size() == lists.size(), "Number of search keys must match list column size."); - return search_keys.has_nulls() - ? cudf::type_dispatcher( - search_keys.type(), lookup_functor{}, lists, search_keys, stream, mr) - : cudf::type_dispatcher( - search_keys.type(), lookup_functor{}, lists, search_keys, stream, mr); + return to_contains( + index_of(lists, search_keys, duplicate_find_option::FIND_FIRST, stream), stream, mr); +} + +/** + * @copydoc cudf::lists::contains_nulls(cudf::lists_column_view const&, + * rmm::mr::device_memory_resource*) + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr contains_nulls(cudf::lists_column_view const& input_lists, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto const num_rows = input_lists.size(); + auto const d_lists = column_device_view::create(input_lists.parent()); + auto has_nulls_output = make_numeric_column( + data_type{type_id::BOOL8}, input_lists.size(), mask_state::UNALLOCATED, stream, mr); + auto const output_begin = has_nulls_output->mutable_view().begin(); + thrust::tabulate( + rmm::exec_policy(stream), + output_begin, + output_begin + num_rows, + [lists = cudf::detail::lists_column_device_view{*d_lists}] __device__(auto list_idx) { + auto list = list_device_view{lists, list_idx}; + auto list_begin = thrust::make_counting_iterator(size_type{0}); + return list.is_null() || + thrust::any_of(thrust::seq, list_begin, list_begin + list.size(), [&list](auto i) { + return list.is_null(i); + }); + }); + auto const validity_begin = cudf::detail::make_counting_transform_iterator( + 0, [lists = cudf::detail::lists_column_device_view{*d_lists}] __device__(auto list_idx) { + return not 
list_device_view{lists, list_idx}.is_null(); + }); + auto [null_mask, num_nulls] = cudf::detail::valid_if( + validity_begin, validity_begin + num_rows, thrust::identity{}, stream, mr); + has_nulls_output->set_null_mask(std::move(null_mask), num_nulls); + return has_nulls_output; } } // namespace detail @@ -234,5 +402,30 @@ std::unique_ptr contains(cudf::lists_column_view const& lists, return detail::contains(lists, search_keys, rmm::cuda_stream_default, mr); } +std::unique_ptr contains_nulls(cudf::lists_column_view const& input_lists, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::contains_nulls(input_lists, rmm::cuda_stream_default, mr); +} + +std::unique_ptr index_of(cudf::lists_column_view const& lists, + cudf::scalar const& search_key, + duplicate_find_option find_option, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::index_of(lists, search_key, find_option, rmm::cuda_stream_default, mr); +} + +std::unique_ptr index_of(cudf::lists_column_view const& lists, + cudf::column_view const& search_keys, + duplicate_find_option find_option, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::index_of(lists, search_keys, find_option, rmm::cuda_stream_default, mr); +} + } // namespace lists } // namespace cudf diff --git a/cpp/tests/lists/contains_tests.cpp b/cpp/tests/lists/contains_tests.cpp index 5d7e218898c..066eb7eafc8 100644 --- a/cpp/tests/lists/contains_tests.cpp +++ b/cpp/tests/lists/contains_tests.cpp @@ -25,6 +25,7 @@ #include #include #include +#include #include namespace cudf { @@ -42,6 +43,12 @@ struct TypedContainsTest : public ContainsTest { TYPED_TEST_SUITE(TypedContainsTest, ContainsTestTypes); namespace { + +auto constexpr x = int32_t{-1}; // Placeholder for nulls. +auto constexpr absent = size_type{-1}; // Index when key is not found in a list. 
+auto constexpr FIND_FIRST = lists::duplicate_find_option::FIND_FIRST; +auto constexpr FIND_LAST = lists::duplicate_find_option::FIND_LAST; + template (), void>* = nullptr> auto create_scalar_search_key(T const& value) { @@ -101,238 +108,381 @@ auto create_null_search_key() } // namespace -TYPED_TEST(TypedContainsTest, ListContainsScalarWithNoNulls) +using iterators::all_nulls; +using iterators::null_at; +using iterators::nulls_at; +using bools = fixed_width_column_wrapper; +using indices = fixed_width_column_wrapper; + +TYPED_TEST(TypedContainsTest, ScalarKeyWithNoNulls) { using T = TypeParam; - auto search_space = lists_column_wrapper{ - {0, 1, 2}, - {3, 4, 5}, - {6, 7, 8}, - {9, 0, 1}, - {2, 3, 4}, - {5, 6, 7}, - {8, 9, 0}, - {}, - {1, 2, 3}, - {}}.release(); - auto search_key_one = create_scalar_search_key(1); - auto actual_result = lists::contains(search_space->view(), *search_key_one); - auto expected_result = fixed_width_column_wrapper{1, 0, 0, 1, 0, 0, 0, 0, 1, 0}; + auto search_space = lists_column_view{lists_column_wrapper{{0, 1, 2, 1}, + {3, 4, 5}, + {6, 7, 8}, + {9, 0, 1, 3, 1}, + {2, 3, 4}, + {5, 6, 7}, + {8, 9, 0}, + {}, + {1, 2, 1, 3}, + {}}}; + auto search_key_one = create_scalar_search_key(1); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result, *actual_result); + { + // CONTAINS + auto result = lists::contains(search_space, *search_key_one); + auto expected = bools{1, 0, 0, 1, 0, 0, 0, 0, 1, 0}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *result); + } + { + // CONTAINS NULLS + auto result = lists::contains_nulls(search_space); + auto expected = bools{0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *result); + } + { + // FIND_FIRST + auto result = lists::index_of(search_space, *search_key_one, FIND_FIRST); + auto expected = indices{1, absent, absent, 2, absent, absent, absent, absent, 0, absent}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *result); + } + { + // FIND_LAST + auto result = 
lists::index_of(search_space, *search_key_one, FIND_LAST); + auto expected = indices{3, absent, absent, 4, absent, absent, absent, absent, 2, absent}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *result); + } } -TYPED_TEST(TypedContainsTest, ListContainsScalarWithNullLists) +TYPED_TEST(TypedContainsTest, ScalarKeyWithNullLists) { // Test List columns that have NULL list rows. - using T = TypeParam; - auto search_space = lists_column_wrapper{ - {{0, 1, 2}, - {3, 4, 5}, - {6, 7, 8}, - {}, - {9, 0, 1}, - {2, 3, 4}, - {5, 6, 7}, - {8, 9, 0}, - {}, - {1, 2, 3}, - {}}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { - return (i != 3) && (i != 10); - })}.release(); - + auto search_space = lists_column_view{lists_column_wrapper{{{0, 1, 2, 1}, + {3, 4, 5}, + {6, 7, 8}, + {}, + {9, 0, 1, 3, 1}, + {2, 3, 4}, + {5, 6, 7}, + {8, 9, 0}, + {}, + {1, 2, 2, 3}, + {}}, + nulls_at({3, 10})}}; auto search_key_one = create_scalar_search_key(1); - auto actual_result = lists::contains(search_space->view(), *search_key_one); - auto expected_result = - fixed_width_column_wrapper{{1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0}, - cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return (i != 3) && (i != 10); })}; - - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result, *actual_result); + { + // CONTAINS + auto result = lists::contains(search_space, *search_key_one); + auto expected = bools{{1, 0, 0, x, 1, 0, 0, 0, 0, 1, x}, nulls_at({3, 10})}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *result); + } + { + // CONTAINS NULLS + auto result = lists::contains_nulls(search_space); + auto expected = bools{{0, 0, 0, x, 0, 0, 0, 0, 0, 0, x}, nulls_at({3, 10})}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *result); + } + { + // FIND_FIRST + auto result = lists::index_of(search_space, *search_key_one, FIND_FIRST); + auto expected = + indices{{1, absent, absent, x, 2, absent, absent, absent, absent, 0, x}, nulls_at({3, 10})}; + 
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *result); + } + { + // FIND_LAST + auto result = lists::index_of(search_space, *search_key_one, FIND_LAST); + auto expected = + indices{{3, absent, absent, x, 4, absent, absent, absent, absent, 0, x}, nulls_at({3, 10})}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *result); + } } TYPED_TEST(TypedContainsTest, SlicedLists) { // Test sliced List columns. - using namespace cudf; + using T = TypeParam; - using T = TypeParam; - using bools = fixed_width_column_wrapper; - - auto search_space = lists_column_wrapper{ - {{0, 1, 2}, - {3, 4, 5}, - {6, 7, 8}, - {}, - {9, 0, 1}, - {2, 3, 4}, - {5, 6, 7}, - {8, 9, 0}, - {}, - {1, 2, 3}, - {}}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { - return (i != 3) && (i != 10); - })}.release(); - - auto sliced_column_1 = cudf::detail::slice(search_space->view(), {1, 8}).front(); - - auto search_key_one = create_scalar_search_key(1); - auto result_1 = lists::contains(sliced_column_1, *search_key_one); - - auto expected_result_1 = bools{ - {0, 0, 0, 1, 0, 0, 0}, cudf::detail::make_counting_transform_iterator(0, [](auto i) { - return (i != 2); - })}.release(); - - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result_1->view(), result_1->view()); - - auto sliced_column_2 = cudf::detail::slice(search_space->view(), {3, 10}).front(); - - auto result_2 = lists::contains(sliced_column_2, *search_key_one); + auto search_space = lists_column_wrapper{{{0, 1, 2, 1}, + {3, 4, 5}, + {6, 7, 8}, + {}, + {9, 0, 1, 3, 1}, + {2, 3, 4}, + {5, 6, 7}, + {8, 9, 0}, + {}, + {1, 2, 1, 3}, + {}}, + nulls_at({3, 10})}; - auto expected_result_2 = bools{ - {0, 1, 0, 0, 0, 0, 1}, cudf::detail::make_counting_transform_iterator(0, [](auto i) { - return (i != 0); - })}.release(); + { + // First Slice. 
+ auto sliced_column_1 = cudf::detail::slice(search_space, {1, 8}).front(); + auto search_key_one = create_scalar_search_key(1); + { + // CONTAINS + auto result = lists::contains(sliced_column_1, *search_key_one); + auto expected_result = bools{{0, 0, x, 1, 0, 0, 0}, null_at(2)}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result, result->view()); + } + { + // CONTAINS NULLS + auto result = lists::contains_nulls(sliced_column_1); + auto expected_result = bools{{0, 0, x, 0, 0, 0, 0}, null_at(2)}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result, result->view()); + } + { + // FIND_FIRST + auto result = lists::index_of(sliced_column_1, *search_key_one, FIND_FIRST); + auto expected_result = indices{{absent, absent, 0, 2, absent, absent, absent}, null_at(2)}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result, result->view()); + } + { + // FIND_LAST + auto result = lists::index_of(sliced_column_1, *search_key_one, FIND_LAST); + auto expected_result = indices{{absent, absent, 0, 4, absent, absent, absent}, null_at(2)}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result, result->view()); + } + } - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result_2->view(), result_2->view()); + { + // Second Slice. 
+ auto sliced_column_2 = cudf::detail::slice(search_space, {3, 10}).front(); + auto search_key_one = create_scalar_search_key(1); + { + // CONTAINS + auto result = lists::contains(sliced_column_2, *search_key_one); + auto expected_result = bools{{x, 1, 0, 0, 0, 0, 1}, null_at(0)}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result, result->view()); + } + { + // CONTAINS NULLS + auto result = lists::contains_nulls(sliced_column_2); + auto expected_result = bools{{x, 0, 0, 0, 0, 0, 0}, null_at(0)}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result, result->view()); + } + { + // FIND_FIRST + auto result = lists::index_of(sliced_column_2, *search_key_one, FIND_FIRST); + auto expected_result = indices{{0, 2, absent, absent, absent, absent, 0}, null_at(0)}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result, result->view()); + } + { + // FIND_LAST + auto result = lists::index_of(sliced_column_2, *search_key_one, FIND_LAST); + auto expected_result = indices{{0, 4, absent, absent, absent, absent, 2}, null_at(0)}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result, result->view()); + } + } } -TYPED_TEST(TypedContainsTest, ListContainsScalarNonNullListsWithNullValues) +TYPED_TEST(TypedContainsTest, ScalarKeyNonNullListsWithNullValues) { // Test List columns that have no NULL list rows, but NULL elements in some list rows. 
using T = TypeParam; - auto numerals = fixed_width_column_wrapper{ - {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> bool { return i % 3; })}; - - auto search_space = - make_lists_column(8, - fixed_width_column_wrapper{0, 1, 3, 7, 7, 7, 10, 11, 15}.release(), - numerals.release(), - 0, - {}); - + auto numerals = fixed_width_column_wrapper{{x, 1, 2, x, 4, 5, x, 7, 8, x, x, 1, 2, x, 1}, + nulls_at({0, 3, 6, 9, 10, 13})}; + auto search_space = make_lists_column( + 8, indices{0, 1, 3, 7, 7, 7, 10, 11, 15}.release(), numerals.release(), 0, {}); + // Search space: [ [x], [1,2], [x,4,5,x], [], [], [7,8,x], [x], [1,2,x,1] ] auto search_key_one = create_scalar_search_key(1); - auto actual_result = lists::contains(search_space->view(), *search_key_one); - auto expected_result = - fixed_width_column_wrapper{{0, 1, 0, 0, 0, 0, 0, 1}, {0, 1, 0, 1, 1, 0, 1, 1}}; - - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result, *actual_result); + { + // CONTAINS + auto result = lists::contains(search_space->view(), *search_key_one); + auto expected = bools{0, 1, 0, 0, 0, 0, 0, 1}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *result); + } + { + // CONTAINS NULLS + auto result = lists::contains_nulls(search_space->view()); + auto expected = bools{1, 0, 1, 0, 0, 1, 1, 1}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *result); + } + { + // FIND_FIRST + auto result = lists::index_of(search_space->view(), *search_key_one, FIND_FIRST); + auto expected = indices{absent, 0, absent, absent, absent, absent, absent, 0}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *result); + } + { + // FIND_LAST + auto result = lists::index_of(search_space->view(), *search_key_one, FIND_LAST); + auto expected = indices{absent, 0, absent, absent, absent, absent, absent, 3}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *result); + } } -TYPED_TEST(TypedContainsTest, ListContainsScalarWithNullsInLists) +TYPED_TEST(TypedContainsTest, 
ScalarKeysWithNullsInLists) { using T = TypeParam; - auto numerals = fixed_width_column_wrapper{ - {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> bool { return i % 3; })}; - - auto input_null_mask_iter = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 4; }); + auto numerals = fixed_width_column_wrapper{{x, 1, 2, x, 4, 5, x, 7, 8, x, x, 1, 2, x, 1}, + nulls_at({0, 3, 6, 9, 10, 13})}; + auto input_null_mask_iter = null_at(4); auto search_space = make_lists_column( 8, - fixed_width_column_wrapper{0, 1, 3, 7, 7, 7, 10, 11, 15}.release(), + indices{0, 1, 3, 7, 7, 7, 10, 11, 15}.release(), numerals.release(), 1, cudf::test::detail::make_null_mask(input_null_mask_iter, input_null_mask_iter + 8)); + // Search space: [ [x], [1,2], [x,4,5,x], [], x, [7,8,x], [x], [1,2,x,1] ] auto search_key_one = create_scalar_search_key(1); - auto actual_result = lists::contains(search_space->view(), *search_key_one); - auto expected_result = - fixed_width_column_wrapper{{0, 1, 0, 0, 0, 0, 0, 1}, {0, 1, 0, 1, 0, 0, 1, 1}}; - - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result, *actual_result); + { + // CONTAINS. + auto result = lists::contains(search_space->view(), *search_key_one); + auto expected = bools{{0, 1, 0, 0, x, 0, 0, 1}, null_at(4)}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *result); + } + { + // CONTAINS NULLS. + auto result = lists::contains_nulls(search_space->view()); + auto expected = bools{{1, 0, 1, 0, x, 1, 1, 1}, null_at(4)}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *result); + } + { + // FIND_FIRST. + auto result = lists::index_of(search_space->view(), *search_key_one, FIND_FIRST); + auto expected = indices{{absent, 0, absent, absent, x, absent, absent, 0}, null_at(4)}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *result); + } + { + // FIND_LAST. 
+ auto result = lists::index_of(search_space->view(), *search_key_one, FIND_LAST); + auto expected = indices{{absent, 0, absent, absent, x, absent, absent, 3}, null_at(4)}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *result); + } } -TEST_F(ContainsTest, BoolListContainsScalarWithNullsInLists) +TEST_F(ContainsTest, BoolScalarWithNullsInLists) { using T = bool; - auto numerals = fixed_width_column_wrapper{ - {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> bool { return i % 3; })}; - - auto input_null_mask_iter = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 4; }); - - auto search_space = make_lists_column( + auto numerals = fixed_width_column_wrapper{{x, 1, 1, x, 1, 1, x, 1, 1, x, x, 1, 1, x, 1}, + nulls_at({0, 3, 6, 9, 10, 13})}; + auto input_null_mask_iter = null_at(4); + auto search_space = make_lists_column( 8, fixed_width_column_wrapper{0, 1, 3, 7, 7, 7, 10, 11, 15}.release(), numerals.release(), 1, cudf::test::detail::make_null_mask(input_null_mask_iter, input_null_mask_iter + 8)); + // Search space: [ [x], [1,1], [x,1,1,x], [], x, [1,1,x], [x], [1,1,x,1] ] auto search_key_one = create_scalar_search_key(1); - auto actual_result = lists::contains(search_space->view(), *search_key_one); - auto expected_result = - fixed_width_column_wrapper{{0, 1, 1, 0, 0, 1, 0, 1}, {0, 1, 1, 1, 0, 1, 1, 1}}; - - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result, *actual_result); + { + // CONTAINS + auto result = lists::contains(search_space->view(), *search_key_one); + auto expected = bools{{0, 1, 1, 0, x, 1, 0, 1}, null_at(4)}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *result); + } + { + // CONTAINS NULLS + auto result = lists::contains_nulls(search_space->view()); + auto expected = bools{{1, 0, 1, 0, x, 1, 1, 1}, null_at(4)}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *result); + } + { + // FIND_FIRST. 
+ auto result = lists::index_of(search_space->view(), *search_key_one, FIND_FIRST); + auto expected = indices{{absent, 0, 1, absent, x, 0, absent, 0}, null_at(4)}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *result); + } + { + // FIND_LAST. + auto result = lists::index_of(search_space->view(), *search_key_one, FIND_LAST); + auto expected = indices{{absent, 1, 2, absent, x, 1, absent, 3}, null_at(4)}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *result); + } } -TEST_F(ContainsTest, StringListContainsScalarWithNullsInLists) +TEST_F(ContainsTest, StringScalarWithNullsInLists) { using T = std::string; auto strings = strings_column_wrapper{ - {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "0", "1", "2", "3", "4"}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> bool { return i % 3; })}; - - auto input_null_mask_iter = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 4; }); - - auto search_space = make_lists_column( + {"X", "1", "2", "X", "4", "5", "X", "7", "8", "X", "X", "1", "2", "X", "1"}, + nulls_at({0, 3, 6, 9, 10, 13})}; + auto input_null_mask_iter = null_at(4); + auto search_space = make_lists_column( 8, - fixed_width_column_wrapper{0, 1, 3, 7, 7, 7, 10, 11, 15}.release(), + indices{0, 1, 3, 7, 7, 7, 10, 11, 15}.release(), strings.release(), 1, cudf::test::detail::make_null_mask(input_null_mask_iter, input_null_mask_iter + 8)); + // Search space: [ [x], [1,2], [x,4,5,x], [], x, [7,8,x], [x], [1,2,x,1] ] auto search_key_one = create_scalar_search_key("1"); - auto actual_result = lists::contains(search_space->view(), *search_key_one); - auto expected_result = - fixed_width_column_wrapper{{0, 1, 0, 0, 0, 0, 0, 1}, {0, 1, 0, 1, 0, 0, 1, 1}}; - - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result, *actual_result); + { + // CONTAINS + auto result = lists::contains(search_space->view(), *search_key_one); + auto expected = bools{{0, 1, 0, 0, x, 0, 0, 1}, null_at(4)}; + 
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *result); + } + { + // CONTAINS NULLS + auto result = lists::contains_nulls(search_space->view()); + auto expected = bools{{1, 0, 1, 0, x, 1, 1, 1}, null_at(4)}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *result); + } + { + // FIND_FIRST. + auto result = lists::index_of(search_space->view(), *search_key_one, FIND_FIRST); + auto expected = indices{{absent, 0, absent, absent, x, absent, absent, 0}, null_at(4)}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *result); + } + { + // FIND_LAST. + auto result = lists::index_of(search_space->view(), *search_key_one, FIND_LAST); + auto expected = indices{{absent, 0, absent, absent, x, absent, absent, 3}, null_at(4)}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *result); + } } -TYPED_TEST(TypedContainsTest, ContainsScalarNullSearchKey) +TYPED_TEST(TypedContainsTest, ScalarNullSearchKey) { using T = TypeParam; - auto search_space = lists_column_wrapper{ - {{0, 1, 2}, - {3, 4, 5}, - {6, 7, 8}, - {}, - {9, 0, 1}, - {2, 3, 4}, - {5, 6, 7}, - {8, 9, 0}, - {}, - {1, 2, 3}, - {}}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { - return (i != 3) && (i != 10); - })}.release(); - + auto search_space = lists_column_wrapper{{{0, 1, 2}, + {3, 4, 5}, + {6, 7, 8}, + {}, + {9, 0, 1}, + {2, 3, 4}, + {5, 6, 7}, + {8, 9, 0}, + {}, + {1, 2, 3}, + {}}, + nulls_at({3, 10})} + .release(); auto search_key_null = create_null_search_key(); - auto actual_result = lists::contains(search_space->view(), *search_key_null); - auto expected_result = fixed_width_column_wrapper{ - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return false; })}; - - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result, *actual_result); + { + // CONTAINS + auto result = lists::contains(search_space->view(), *search_key_null); + auto expected = bools{{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, all_nulls()}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, 
*result); + } + { + // FIND_FIRST + auto result = lists::index_of(search_space->view(), *search_key_null, FIND_FIRST); + auto expected = indices{{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, all_nulls()}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *result); + } + { + // FIND_LAST + auto result = lists::index_of(search_space->view(), *search_key_null, FIND_LAST); + auto expected = indices{{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, all_nulls()}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *result); + } } TEST_F(ContainsTest, ScalarTypeRelatedExceptions) @@ -346,9 +496,12 @@ TEST_F(ContainsTest, ScalarTypeRelatedExceptions) {4, 5, 6}}}.release(); auto skey = create_scalar_search_key(10); CUDF_EXPECT_THROW_MESSAGE(lists::contains(list_of_lists->view(), *skey), - "Nested types not supported in lists::contains()"); + "Nested types not supported in list search operations."); + CUDF_EXPECT_THROW_MESSAGE(lists::index_of(list_of_lists->view(), *skey, FIND_FIRST), + "Nested types not supported in list search operations."); + CUDF_EXPECT_THROW_MESSAGE(lists::index_of(list_of_lists->view(), *skey, FIND_LAST), + "Nested types not supported in list search operations."); } - { // Search key must match list elements in type. 
auto list_of_ints = @@ -360,6 +513,10 @@ TEST_F(ContainsTest, ScalarTypeRelatedExceptions) auto skey = create_scalar_search_key("Hello, World!"); CUDF_EXPECT_THROW_MESSAGE(lists::contains(list_of_ints->view(), *skey), "Type/Scale of search key does not match list column element type."); + CUDF_EXPECT_THROW_MESSAGE(lists::index_of(list_of_ints->view(), *skey, FIND_FIRST), + "Type/Scale of search key does not match list column element type."); + CUDF_EXPECT_THROW_MESSAGE(lists::index_of(list_of_ints->view(), *skey, FIND_LAST), + "Type/Scale of search key does not match list column element type."); } } @@ -367,199 +524,275 @@ template struct TypedVectorContainsTest : public ContainsTest { }; -using VectorContainsTestTypes = +using VectorTestTypes = cudf::test::Concat; -TYPED_TEST_SUITE(TypedVectorContainsTest, VectorContainsTestTypes); +TYPED_TEST_SUITE(TypedVectorContainsTest, VectorTestTypes); -TYPED_TEST(TypedVectorContainsTest, ListContainsVectorWithNoNulls) +TYPED_TEST(TypedVectorContainsTest, VectorKeysWithNoNulls) { using T = TypeParam; auto search_space = lists_column_wrapper{ - {0, 1, 2}, + {0, 1, 2, 1}, {3, 4, 5}, {6, 7, 8}, - {9, 0, 1}, + {9, 0, 1, 3, 1}, {2, 3, 4}, {5, 6, 7}, {8, 9, 0}, {}, - {1, 2, 3}, + {1, 2, 3, 3}, {}}.release(); - auto search_key = fixed_width_column_wrapper{1, 2, 3, 1, 2, 3, 1, 2, 3, 1}; - auto actual_result = lists::contains(search_space->view(), search_key); - auto expected_result = fixed_width_column_wrapper{1, 0, 0, 1, 1, 0, 0, 0, 1, 0}; - - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result, *actual_result); + auto search_key = fixed_width_column_wrapper{1, 2, 3, 1, 2, 3, 1, 2, 3, 1}; + { + // CONTAINS + auto result = lists::contains(search_space->view(), search_key); + auto expected = bools{1, 0, 0, 1, 1, 0, 0, 0, 1, 0}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *result); + } + { + // FIND_FIRST + auto result = lists::index_of(search_space->view(), search_key, FIND_FIRST); + auto expected = indices{1, absent, absent, 2, 
0, absent, absent, absent, 2, absent}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *result); + } + { + // FIND_LAST + auto result = lists::index_of(search_space->view(), search_key, FIND_LAST); + auto expected = indices{3, absent, absent, 4, 0, absent, absent, absent, 3, absent}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *result); + } } -TYPED_TEST(TypedVectorContainsTest, ListContainsVectorWithNullLists) +TYPED_TEST(TypedVectorContainsTest, VectorWithNullLists) { // Test List columns that have NULL list rows. using T = TypeParam; - auto search_space = lists_column_wrapper{ - {{0, 1, 2}, - {3, 4, 5}, - {6, 7, 8}, - {}, - {9, 0, 1}, - {2, 3, 4}, - {5, 6, 7}, - {8, 9, 0}, - {}, - {1, 2, 3}, - {}}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { - return (i != 3) && (i != 10); - })}.release(); - - auto search_keys = fixed_width_column_wrapper{1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2}; - auto actual_result = lists::contains(search_space->view(), search_keys); - auto expected_result = - fixed_width_column_wrapper{{1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0}, - cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return (i != 3) && (i != 10); })}; - - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result, *actual_result); + auto search_space = lists_column_wrapper{{{0, 1, 2, 1}, + {3, 4, 5}, + {6, 7, 8}, + {}, + {9, 0, 1, 3, 1}, + {2, 3, 4}, + {5, 6, 7}, + {8, 9, 0}, + {}, + {1, 2, 3, 3}, + {}}, + nulls_at({3, 10})} + .release(); + + auto search_keys = fixed_width_column_wrapper{1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2}; + + { + // CONTAINS + auto result = lists::contains(search_space->view(), search_keys); + auto expected = bools{{1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0}, nulls_at({3, 10})}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *result); + } + { + // FIND_FIRST + auto result = lists::index_of(search_space->view(), search_keys, FIND_FIRST); + auto expected = + indices{{1, absent, absent, x, absent, 1, absent, absent, absent, 0, x}, nulls_at({3, 10})}; + 
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *result); + } + { + // FIND_LAST + auto result = lists::index_of(search_space->view(), search_keys, FIND_LAST); + auto expected = + indices{{3, absent, absent, x, absent, 1, absent, absent, absent, 0, x}, nulls_at({3, 10})}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *result); + } } -TYPED_TEST(TypedVectorContainsTest, ListContainsVectorNonNullListsWithNullValues) +TYPED_TEST(TypedVectorContainsTest, VectorNonNullListsWithNullValues) { // Test List columns that have no NULL list rows, but NULL elements in some list rows. using T = TypeParam; - auto numerals = fixed_width_column_wrapper{ - {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> bool { return i % 3; })}; - - auto search_space = - make_lists_column(8, - fixed_width_column_wrapper{0, 1, 3, 7, 7, 7, 10, 12, 15}.release(), - numerals.release(), - 0, - {}); - - auto search_keys = fixed_width_column_wrapper{1, 2, 3, 1, 2, 3, 1, 3}; - auto actual_result = lists::contains(search_space->view(), search_keys); - auto expected_result = - fixed_width_column_wrapper{{0, 1, 0, 0, 0, 0, 1, 1}, {0, 1, 0, 1, 1, 0, 1, 1}}; + auto numerals = fixed_width_column_wrapper{{x, 1, 2, x, 4, 5, x, 7, 8, x, x, 1, 2, x, 1}, + nulls_at({0, 3, 6, 9, 10, 13})}; - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result, *actual_result); + auto search_space = make_lists_column( + 8, indices{0, 1, 3, 7, 7, 7, 10, 11, 15}.release(), numerals.release(), 0, {}); + // Search space: [ [x], [1,2], [x,4,5,x], [], [], [7,8,x], [x], [1,2,x,1] ] + auto search_keys = fixed_width_column_wrapper{1, 2, 3, 1, 2, 3, 1, 1}; + { + // CONTAINS + auto result = lists::contains(search_space->view(), search_keys); + auto expected = bools{0, 1, 0, 0, 0, 0, 0, 1}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *result); + } + { + // FIND_FIRST + auto result = lists::index_of(search_space->view(), search_keys, FIND_FIRST); + auto expected = indices{absent, 
1, absent, absent, absent, absent, absent, 0}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *result); + } + { + // FIND_LAST + auto result = lists::index_of(search_space->view(), search_keys, FIND_LAST); + auto expected = indices{absent, 1, absent, absent, absent, absent, absent, 3}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *result); + } } -TYPED_TEST(TypedVectorContainsTest, ListContainsVectorWithNullsInLists) +TYPED_TEST(TypedVectorContainsTest, VectorWithNullsInLists) { using T = TypeParam; - auto numerals = fixed_width_column_wrapper{ - {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> bool { return i % 3; })}; + auto numerals = fixed_width_column_wrapper{{x, 1, 2, x, 4, 5, x, 7, 8, x, x, 1, 2, x, 1}, + nulls_at({0, 3, 6, 9, 10, 13})}; - auto input_null_mask_iter = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 4; }); + auto input_null_mask_iter = null_at(4); auto search_space = make_lists_column( 8, - fixed_width_column_wrapper{0, 1, 3, 7, 7, 7, 10, 12, 15}.release(), + indices{0, 1, 3, 7, 7, 7, 10, 11, 15}.release(), numerals.release(), 1, cudf::test::detail::make_null_mask(input_null_mask_iter, input_null_mask_iter + 8)); + // Search space: [ [x], [1,2], [x,4,5,x], [], x, [7,8,x], [x], [1,2,x,1] ] - auto search_keys = fixed_width_column_wrapper{1, 2, 3, 1, 2, 3, 1, 3}; - auto actual_result = lists::contains(search_space->view(), search_keys); - auto expected_result = - fixed_width_column_wrapper{{0, 1, 0, 0, 0, 0, 1, 1}, {0, 1, 0, 1, 0, 0, 1, 1}}; - - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result, *actual_result); + auto search_keys = fixed_width_column_wrapper{1, 2, 3, 1, 2, 3, 1, 1}; + { + // CONTAINS + auto result = lists::contains(search_space->view(), search_keys); + auto expected = bools{{0, 1, 0, 0, x, 0, 0, 1}, null_at(4)}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *result); + } + { + // FIND_FIRST + auto result = 
lists::index_of(search_space->view(), search_keys, FIND_FIRST); + auto expected = indices{{absent, 1, absent, absent, x, absent, absent, 0}, null_at(4)}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *result); + } + { + // FIND_LAST + auto result = lists::index_of(search_space->view(), search_keys, FIND_LAST); + auto expected = indices{{absent, 1, absent, absent, x, absent, absent, 3}, null_at(4)}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *result); + } } TYPED_TEST(TypedVectorContainsTest, ListContainsVectorWithNullsInListsAndInSearchKeys) { using T = TypeParam; - auto numerals = fixed_width_column_wrapper{ - {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> bool { return i % 3; })}; + auto numerals = fixed_width_column_wrapper{{x, 1, 2, x, 4, 5, x, 7, 8, x, x, 1, 2, x, 1}, + nulls_at({0, 3, 6, 9, 10, 13})}; - auto input_null_mask_iter = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 4; }); + auto input_null_mask_iter = null_at(4); auto search_space = make_lists_column( 8, - fixed_width_column_wrapper{0, 1, 3, 7, 7, 7, 10, 12, 15}.release(), + indices{0, 1, 3, 7, 7, 7, 10, 11, 15}.release(), numerals.release(), 1, cudf::test::detail::make_null_mask(input_null_mask_iter, input_null_mask_iter + 8)); + // Search space: [ [x], [1,2], [x,4,5,x], [], x, [7,8,x], [x], [1,2,x,1] ] - auto search_keys = fixed_width_column_wrapper{ - {1, 2, 3, 1, 2, 3, 1, 3}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 6; })}; - - auto actual_result = lists::contains(search_space->view(), search_keys); - auto expected_result = - fixed_width_column_wrapper{{0, 1, 0, 0, 0, 0, 0, 1}, {0, 1, 0, 1, 0, 0, 0, 1}}; - - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result, *actual_result); + auto search_keys = fixed_width_column_wrapper{{1, 2, 3, x, 2, 3, 1, 1}, null_at(3)}; + { + // CONTAINS + auto result = lists::contains(search_space->view(), search_keys); + auto 
expected = bools{{0, 1, 0, x, x, 0, 0, 1}, nulls_at({3, 4})}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *result); + } + { + // FIND_FIRST + auto result = lists::index_of(search_space->view(), search_keys, FIND_FIRST); + auto expected = indices{{absent, 1, absent, x, x, absent, absent, 0}, nulls_at({3, 4})}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *result); + } + { + // FIND_LAST + auto result = lists::index_of(search_space->view(), search_keys, FIND_LAST); + auto expected = indices{{absent, 1, absent, x, x, absent, absent, 3}, nulls_at({3, 4})}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *result); + } } -TEST_F(ContainsTest, BoolListContainsVectorWithNullsInListsAndInSearchKeys) +TEST_F(ContainsTest, BoolKeyVectorWithNullsInListsAndInSearchKeys) { using T = bool; - auto numerals = fixed_width_column_wrapper{ - {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> bool { return i % 3; })}; + auto numerals = fixed_width_column_wrapper{{x, 0, 1, x, 1, 1, x, 1, 1, x, x, 0, 1, x, 1}, + nulls_at({0, 3, 6, 9, 10, 13})}; - auto input_null_mask_iter = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 4; }); + auto input_null_mask_iter = null_at(4); auto search_space = make_lists_column( 8, - fixed_width_column_wrapper{0, 1, 3, 7, 7, 7, 10, 12, 15}.release(), + indices{0, 1, 3, 7, 7, 7, 10, 11, 15}.release(), numerals.release(), 1, cudf::test::detail::make_null_mask(input_null_mask_iter, input_null_mask_iter + 8)); - auto search_keys = fixed_width_column_wrapper{ - {0, 1, 0, 1, 0, 0, 1, 1}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 6; })}; - - auto actual_result = lists::contains(search_space->view(), search_keys); - auto expected_result = - fixed_width_column_wrapper{{0, 1, 0, 0, 0, 0, 0, 1}, {0, 1, 0, 1, 0, 0, 0, 1}}; - - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result, *actual_result); + auto search_keys = 
fixed_width_column_wrapper{{0, 1, 0, x, 0, 0, 1, 1}, null_at(3)}; + // Search space: [ [x], [0,1], [x,1,1,x], [], x, [1,1,x], [x], [0,1,x,1] ] + // Search keys : [ 0, 1, 0, x, 0, 0, 1, 1 ] + { + // CONTAINS + auto result = lists::contains(search_space->view(), search_keys); + auto expected = bools{{0, 1, 0, x, x, 0, 0, 1}, nulls_at({3, 4})}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *result); + } + { + // FIND_FIRST + auto result = lists::index_of(search_space->view(), search_keys, FIND_FIRST); + auto expected = indices{{absent, 1, absent, x, x, absent, absent, 1}, nulls_at({3, 4})}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *result); + } + { + // FIND_LAST + auto result = lists::index_of(search_space->view(), search_keys, FIND_LAST); + auto expected = indices{{absent, 1, absent, x, x, absent, absent, 3}, nulls_at({3, 4})}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *result); + } } -TEST_F(ContainsTest, StringListContainsVectorWithNullsInListsAndInSearchKeys) +TEST_F(ContainsTest, StringKeyVectorWithNullsInListsAndInSearchKeys) { - auto numerals = strings_column_wrapper{ - {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "0", "1", "2", "3", "4"}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> bool { return i % 3; })}; - - auto input_null_mask_iter = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 4; }); - - auto search_space = make_lists_column( + auto strings = strings_column_wrapper{ + {"X", "1", "2", "X", "4", "5", "X", "7", "8", "X", "X", "1", "2", "X", "1"}, + nulls_at({0, 3, 6, 9, 10, 13})}; + auto input_null_mask_iter = null_at(4); + auto search_space = make_lists_column( 8, - fixed_width_column_wrapper{0, 1, 3, 7, 7, 7, 10, 12, 15}.release(), - numerals.release(), + fixed_width_column_wrapper{0, 1, 3, 7, 7, 7, 10, 11, 15}.release(), + strings.release(), 1, cudf::test::detail::make_null_mask(input_null_mask_iter, input_null_mask_iter + 8)); - auto search_keys = strings_column_wrapper{ - {"1", 
"2", "3", "1", "2", "3", "1", "3"}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 6; })}; + auto search_keys = strings_column_wrapper{{"1", "2", "3", "X", "2", "3", "1", "1"}, null_at(3)}; - auto actual_result = lists::contains(search_space->view(), search_keys); - auto expected_result = - fixed_width_column_wrapper{{0, 1, 0, 0, 0, 0, 0, 1}, {0, 1, 0, 1, 0, 0, 0, 1}}; + // Search space: [ [x], [1,2], [x,4,5,x], [], x, [7,8,x], [x], [1,2,x,1] ] + // Search keys: [ 1, 2, 3, X, 2, 3, 1, 1] - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result, *actual_result); + { + // CONTAINS + auto result = lists::contains(search_space->view(), search_keys); + auto expected = bools{{0, 1, 0, x, x, 0, 0, 1}, nulls_at({3, 4})}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *result); + } + { + // FIND_FIRST + auto result = lists::index_of(search_space->view(), search_keys, FIND_FIRST); + auto expected = indices{{absent, 1, absent, x, x, absent, absent, 0}, nulls_at({3, 4})}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *result); + } + { + // FIND_LAST + auto result = lists::index_of(search_space->view(), search_keys, FIND_LAST); + auto expected = indices{{absent, 1, absent, x, x, absent, absent, 3}, nulls_at({3, 4})}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *result); + } } TEST_F(ContainsTest, VectorTypeRelatedExceptions) @@ -573,9 +806,12 @@ TEST_F(ContainsTest, VectorTypeRelatedExceptions) {4, 5, 6}}}.release(); auto skey = fixed_width_column_wrapper{0, 1, 2}; CUDF_EXPECT_THROW_MESSAGE(lists::contains(list_of_lists->view(), skey), - "Nested types not supported in lists::contains()"); + "Nested types not supported in list search operations."); + CUDF_EXPECT_THROW_MESSAGE(lists::index_of(list_of_lists->view(), skey, FIND_FIRST), + "Nested types not supported in list search operations."); + CUDF_EXPECT_THROW_MESSAGE(lists::index_of(list_of_lists->view(), skey, FIND_LAST), + "Nested types not supported in list search operations."); } - { // 
Search key must match list elements in type. auto list_of_ints = @@ -587,15 +823,21 @@ TEST_F(ContainsTest, VectorTypeRelatedExceptions) auto skey = strings_column_wrapper{"Hello", "World"}; CUDF_EXPECT_THROW_MESSAGE(lists::contains(list_of_ints->view(), skey), "Type/Scale of search key does not match list column element type."); + CUDF_EXPECT_THROW_MESSAGE(lists::index_of(list_of_ints->view(), skey, FIND_FIRST), + "Type/Scale of search key does not match list column element type."); + CUDF_EXPECT_THROW_MESSAGE(lists::index_of(list_of_ints->view(), skey, FIND_LAST), + "Type/Scale of search key does not match list column element type."); } - { // Search key column size must match lists column size. auto list_of_ints = lists_column_wrapper{{0, 1, 2}, {3, 4, 5}, {6, 7, 8}}.release(); - - auto skey = fixed_width_column_wrapper{0, 1, 2, 3}; + auto skey = fixed_width_column_wrapper{0, 1, 2, 3}; CUDF_EXPECT_THROW_MESSAGE(lists::contains(list_of_ints->view(), skey), "Number of search keys must match list column size."); + CUDF_EXPECT_THROW_MESSAGE(lists::index_of(list_of_ints->view(), skey, FIND_FIRST), + "Number of search keys must match list column size."); + CUDF_EXPECT_THROW_MESSAGE(lists::index_of(list_of_ints->view(), skey, FIND_LAST), + "Number of search keys must match list column size."); } } @@ -605,6 +847,7 @@ struct TypedContainsNaNsTest : public ContainsTest { TYPED_TEST_SUITE(TypedContainsNaNsTest, FloatingPointTypes); +namespace { template T get_nan(const char* nan_contents) { @@ -616,8 +859,9 @@ float get_nan(const char* nan_contents) { return std::nanf(nan_contents); } +} // namespace -TYPED_TEST(TypedContainsNaNsTest, ListWithNaNsContainsScalar) +TYPED_TEST(TypedContainsNaNsTest, ListWithNaNsScalar) { using T = TypeParam; @@ -637,11 +881,25 @@ TYPED_TEST(TypedContainsNaNsTest, ListWithNaNsContainsScalar) {1, 2, 3}, {}}.release(); - auto search_key_nan = create_scalar_search_key(nan_3); - auto actual_result = lists::contains(search_space->view(), 
*search_key_nan); - auto expected_result = fixed_width_column_wrapper{0, 0, 0, 0, 1, 0, 1, 0, 0, 0}; - - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result, *actual_result); + auto search_key_nan = create_scalar_search_key(nan_3); + { + // CONTAINS + auto result = lists::contains(search_space->view(), *search_key_nan); + auto expected = bools{0, 0, 0, 0, 1, 0, 1, 0, 0, 0}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *result); + } + { + // FIND_FIRST + auto result = lists::index_of(search_space->view(), *search_key_nan, FIND_FIRST); + auto expected = indices{absent, absent, absent, absent, 0, absent, 1, absent, absent, absent}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *result); + } + { + // FIND_LAST + auto result = lists::index_of(search_space->view(), *search_key_nan, FIND_LAST); + auto expected = indices{absent, absent, absent, absent, 0, absent, 1, absent, absent, absent}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *result); + } } TYPED_TEST(TypedContainsNaNsTest, ListWithNaNsContainsVector) @@ -652,19 +910,18 @@ TYPED_TEST(TypedContainsNaNsTest, ListWithNaNsContainsVector) // presence of NaN values: // 1. If the search key is null, null is still returned. // 2. If the list contains a null, and the non-null search - // key is not found, null is returned. + // key is not found: + // a) contains() returns `false`. + // b) index_of() returns -1. using T = TypeParam; auto nan_1 = get_nan("1"); auto nan_2 = get_nan("2"); auto nan_3 = get_nan("3"); - auto null_at_index_2 = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 2; }); - auto search_space = lists_column_wrapper{ {0.0, 1.0, 2.0}, - {{3, 4, 5}, null_at_index_2}, // i.e. {3, 4, ∅}. + {{3, 4, 5}, null_at(2)}, // i.e. {3, 4, ∅}. {6, 7, 8}, {9, 0, 1}, {nan_1, 3.0, 4.0}, @@ -679,33 +936,52 @@ TYPED_TEST(TypedContainsNaNsTest, ListWithNaNsContainsVector) { // With nulls in the search key rows. (At index 2.) 
auto search_keys = - fixed_width_column_wrapper{ - search_key_values.begin(), search_key_values.end(), null_at_index_2} + fixed_width_column_wrapper{search_key_values.begin(), search_key_values.end(), null_at(2)} .release(); - auto actual_result = lists::contains(search_space->view(), search_keys->view()); - auto null_at_index_1_and_2 = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 1 && i != 2; }); - - auto expected_result = - fixed_width_column_wrapper{{1, 0, 0, 0, 1, 0, 1, 0, 1, 0}, null_at_index_1_and_2}; - - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result, *actual_result); + { + // CONTAINS + auto result = lists::contains(search_space->view(), search_keys->view()); + auto expected = bools{{1, 0, 0, 0, 1, 0, 1, 0, 1, 0}, null_at(2)}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *result); + } + { + // FIND_FIRST + auto result = lists::index_of(search_space->view(), search_keys->view(), FIND_FIRST); + auto expected = + indices{{1, absent, x, absent, 0, absent, 2, absent, 1, absent}, nulls_at({2})}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *result); + } + { + // FIND_LAST + auto result = lists::index_of(search_space->view(), search_keys->view(), FIND_LAST); + auto expected = + indices{{1, absent, x, absent, 0, absent, 2, absent, 1, absent}, nulls_at({2})}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *result); + } } - { // No nulls in the search key rows. 
auto search_keys = fixed_width_column_wrapper(search_key_values.begin(), search_key_values.end()).release(); - - auto actual_result = lists::contains(search_space->view(), search_keys->view()); - auto null_at_index_1 = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 1; }); - - auto expected_result = - fixed_width_column_wrapper{{1, 0, 0, 0, 1, 0, 1, 0, 1, 0}, null_at_index_1}; - - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result, *actual_result); + { + // CONTAINS + auto result = lists::contains(search_space->view(), search_keys->view()); + auto expected = bools{1, 0, 0, 0, 1, 0, 1, 0, 1, 0}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *result); + } + { + // FIND_FIRST + auto result = lists::index_of(search_space->view(), search_keys->view(), FIND_FIRST); + auto expected = indices{1, absent, absent, absent, 0, absent, 2, absent, 1, absent}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *result); + } + { + // FIND_LAST + auto result = lists::index_of(search_space->view(), search_keys->view(), FIND_LAST); + auto expected = indices{1, absent, absent, absent, 0, absent, 2, absent, 1, absent}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *result); + } } } @@ -715,50 +991,79 @@ struct TypedContainsDecimalsTest : public ContainsTest { TYPED_TEST_SUITE(TypedContainsDecimalsTest, FixedPointTypes); -TYPED_TEST(TypedContainsDecimalsTest, ListContainsScalar) +TYPED_TEST(TypedContainsDecimalsTest, ScalarKey) { using T = TypeParam; - auto const values = std::vector{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, - 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3}; - auto decimals = fixed_point_column_wrapper{ - values.begin(), values.end(), numeric::scale_type{0}}; - - auto list_offsets = fixed_width_column_wrapper{0, 3, 6, 9, 12, 15, 18, 21, 21, 24, 24}; - - auto const search_space = - make_lists_column(10, list_offsets.release(), decimals.release(), 0, {}); - - auto search_key_one = make_fixed_point_scalar(typename T::rep{1}, numeric::scale_type{0}); - auto 
actual_result = lists::contains(search_space->view(), *search_key_one); - auto expected_result = fixed_width_column_wrapper{1, 0, 0, 1, 0, 0, 0, 0, 1, 0}; - - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result, *actual_result); + auto const search_space = [] { + auto const values = std::vector{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, + 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3}; + auto decimals = fixed_point_column_wrapper{ + values.begin(), values.end(), numeric::scale_type{0}}; + auto list_offsets = indices{0, 3, 6, 9, 12, 15, 18, 21, 21, 24, 24}; + return make_lists_column(10, list_offsets.release(), decimals.release(), 0, {}); + }(); + auto search_key_one = make_fixed_point_scalar(typename T::rep{1}, numeric::scale_type{0}); + + // Search space: [[0,1,2], [3,4,5], [6,7,8], [9,0,1], [2,3,4], [5,6,7], [8,9,0], [], [1,2,3], []] + { + // CONTAINS + auto result = lists::contains(search_space->view(), *search_key_one); + auto expected = bools{1, 0, 0, 1, 0, 0, 0, 0, 1, 0}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *result); + } + { + // FIND_FIRST + auto result = lists::index_of(search_space->view(), *search_key_one, FIND_FIRST); + auto expected = indices{1, absent, absent, 2, absent, absent, absent, absent, 0, absent}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *result); + } + { + // FIND_LAST + auto result = lists::index_of(search_space->view(), *search_key_one, FIND_LAST); + auto expected = indices{1, absent, absent, 2, absent, absent, absent, absent, 0, absent}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *result); + } } -TYPED_TEST(TypedContainsDecimalsTest, ListContainsVector) +TYPED_TEST(TypedContainsDecimalsTest, VectorKey) { using T = TypeParam; - auto const values = std::vector{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, - 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3}; - auto decimals = fixed_point_column_wrapper{ - values.begin(), values.end(), numeric::scale_type{0}}; - - auto list_offsets = fixed_width_column_wrapper{0, 3, 6, 9, 12, 15, 18, 21, 21, 24, 24}; - - auto 
const search_space = - make_lists_column(10, list_offsets.release(), decimals.release(), 0, {}); + auto const search_space = [] { + auto const values = std::vector{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, + 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3}; + auto decimals = fixed_point_column_wrapper{ + values.begin(), values.end(), numeric::scale_type{0}}; + auto list_offsets = indices{0, 3, 6, 9, 12, 15, 18, 21, 21, 24, 24}; + return make_lists_column(10, list_offsets.release(), decimals.release(), 0, {}); + }(); auto search_key = fixed_point_column_wrapper{ {1, 2, 3, 1, 2, 3, 1, 2, 3, 1}, numeric::scale_type{ 0}}.release(); - auto actual_result = lists::contains(search_space->view(), search_key->view()); - auto expected_result = fixed_width_column_wrapper{1, 0, 0, 1, 1, 0, 0, 0, 1, 0}; - - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result, *actual_result); + // Search space: [[0,1,2], [3,4,5], [6,7,8], [9,0,1], [2,3,4], [5,6,7], [8,9,0], [], [1,2,3], []] + // Search keys : [ 1, 2, 3, 1, 2, 3, 1, 2, 3, 1 ] + { + // CONTAINS + auto result = lists::contains(search_space->view(), search_key->view()); + auto expected = bools{1, 0, 0, 1, 1, 0, 0, 0, 1, 0}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *result); + } + { + // FIND_FIRST + auto result = lists::index_of(search_space->view(), search_key->view(), FIND_FIRST); + auto expected = indices{1, absent, absent, 2, 0, absent, absent, absent, 2, absent}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *result); + } + { + // FIND_LAST + auto result = lists::index_of(search_space->view(), search_key->view(), FIND_LAST); + auto expected = indices{1, absent, absent, 2, 0, absent, absent, absent, 2, absent}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *result); + } } } // namespace test diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index 5153c5c1d2a..a2e080e02f6 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ 
b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -3170,8 +3170,6 @@ public static ColumnView fromDeviceBuffer(BaseDeviceMemoryBuffer buffer, * Output `column[i]` is set to null if one or more of the following are true: * 1. The key is null * 2. The column vector list value is null - * 3. The list row does not contain the key, and contains at least - * one null. * @param key the scalar to look up * @return a Boolean ColumnVector with the result of the lookup */ @@ -3183,10 +3181,9 @@ public final ColumnVector listContains(Scalar key) { /** * Create a column of bool values indicating whether the list rows of the first * column contain the corresponding values in the second column. + * Output `column[i]` is set to null if one or more of the following are true: * 1. The key value is null * 2. The column vector list value is null - * 3. The list row does not contain the key, and contains at least - * one null. * @param key the ColumnVector with look up values * @return a Boolean ColumnVector with the result of the lookup */ @@ -3195,6 +3192,58 @@ public final ColumnVector listContainsColumn(ColumnView key) { return new ColumnVector(listContainsColumn(getNativeView(), key.getNativeView())); } + /** + * Create a column of bool values indicating whether the list rows of the specified + * column contain null elements. + * Output `column[i]` is set to null iff the input list row is null. + * @return a Boolean ColumnVector with the result of the lookup + */ + public final ColumnVector listContainsNulls() { + assert type.equals(DType.LIST) : "column type must be a LIST"; + return new ColumnVector(listContainsNulls(getNativeView())); + } + + /** + * Enum to choose behaviour of listIndexOf functions: + * 1. FIND_FIRST finds the first occurrence of a search key. + * 2. FIND_LAST finds the last occurrence of a search key. 
+ */ + public enum FindOptions {FIND_FIRST, FIND_LAST}; + + /** + * Create a column of int32 indices, indicating the position of the scalar search key + * in each list row. + * All indices are 0-based. If a search key is not found, the index is set to -1. + * The index is set to null if one of the following is true: + * 1. The search key is null. + * 2. The list row is null. + * @param key The scalar search key + * @param findOption Whether to find the first index of the key, or the last. + * @return The resultant column of int32 indices + */ + public final ColumnVector listIndexOf(Scalar key, FindOptions findOption) { + assert type.equals(DType.LIST) : "column type must be a LIST"; + boolean isFindFirst = findOption == FindOptions.FIND_FIRST; + return new ColumnVector(listIndexOfScalar(getNativeView(), key.getScalarHandle(), isFindFirst)); + } + + /** + * Create a column of int32 indices, indicating the position of each row in the + * search key column in the corresponding row of the lists column. + * All indices are 0-based. If a search key is not found, the index is set to -1. + * The index is set to null if one of the following is true: + * 1. The search key row is null. + * 2. The list row is null. + * @param keys ColumnView of search keys. + * @param findOption Whether to find the first index of the key, or the last. + * @return The resultant column of int32 indices + */ + public final ColumnVector listIndexOf(ColumnView keys, FindOptions findOption) { + assert type.equals(DType.LIST) : "column type must be a LIST"; + boolean isFindFirst = findOption == FindOptions.FIND_FIRST; + return new ColumnVector(listIndexOfColumn(getNativeView(), keys.getNativeView(), isFindFirst)); + } + /** * Segmented sort of the elements within a list in each row of a list column. * NOTICE: list columns with nested child are NOT supported yet. 
@@ -3616,6 +3665,33 @@ private static native long stringReplaceWithBackrefs(long columnView, String pat */ private static native long listContainsColumn(long nativeView, long keyColumn); + /** + * Native method to search list rows for null elements. + * @param nativeView the column view handle of the list + * @return column handle of the resultant boolean column + */ + private static native long listContainsNulls(long nativeView); + + /** + * Native method to find the first (or last) index of a specified scalar key, + * in each row of a list column. + * @param nativeView the column view handle of the list + * @param scalarKeyHandle handle to the scalar search key + * @param isFindFirst Whether to find the first index of the key, or the last. + * @return column handle of the resultant column of int32 indices + */ + private static native long listIndexOfScalar(long nativeView, long scalarKeyHandle, boolean isFindFirst); + + /** + * Native method to find the first (or last) index of each search key in the specified column, + * in each row of a list column. + * @param nativeView the column view handle of the list + * @param keyColumnHandle handle to the search key column + * @param isFindFirst Whether to find the first index of the key, or the last. 
+ * @return column handle of the resultant column of int32 indices + */ + private static native long listIndexOfColumn(long nativeView, long keyColumnHandle, boolean isFindFirst); + private static native long listSortRows(long nativeView, boolean isDescending, boolean isNullSmallest); private static native long getElement(long nativeView, int index); diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index 4cd4b070aed..73ea49c18d9 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -511,6 +511,18 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_listContains(JNIEnv *env, CATCH_STD(env, 0); } +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_listContainsNulls(JNIEnv *env, jclass, + jlong column_view) { + JNI_NULL_CHECK(env, column_view, "column is null", 0); + try { + cudf::jni::auto_set_device(env); + auto cv = reinterpret_cast(column_view); + auto lcv = cudf::lists_column_view{*cv}; + return reinterpret_cast(cudf::lists::contains_nulls(lcv).release()); + } + CATCH_STD(env, 0); +} + JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_listContainsColumn(JNIEnv *env, jclass, jlong column_view, jlong lookup_key_cv) { @@ -528,6 +540,44 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_listContainsColumn(JNIEnv CATCH_STD(env, 0); } +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_listIndexOfScalar(JNIEnv *env, jclass, + jlong column_view, + jlong lookup_key, + jboolean is_find_first) { + JNI_NULL_CHECK(env, column_view, "column is null", 0); + JNI_NULL_CHECK(env, lookup_key, "lookup scalar is null", 0); + try { + cudf::jni::auto_set_device(env); + auto const cv = reinterpret_cast(column_view); + auto const lcv = cudf::lists_column_view{*cv}; + auto const lookup_key_scalar = reinterpret_cast(lookup_key); + auto const find_option = is_find_first ? 
cudf::lists::duplicate_find_option::FIND_FIRST : + cudf::lists::duplicate_find_option::FIND_LAST; + auto result = cudf::lists::index_of(lcv, *lookup_key_scalar, find_option); + return reinterpret_cast(result.release()); + } + CATCH_STD(env, 0); +} + +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_listIndexOfColumn(JNIEnv *env, jclass, + jlong column_view, + jlong lookup_keys, + jboolean is_find_first) { + JNI_NULL_CHECK(env, column_view, "column is null", 0); + JNI_NULL_CHECK(env, lookup_keys, "lookup key column is null", 0); + try { + cudf::jni::auto_set_device(env); + auto const cv = reinterpret_cast(column_view); + auto const lcv = cudf::lists_column_view{*cv}; + auto const lookup_key_column = reinterpret_cast(lookup_keys); + auto const find_option = is_find_first ? cudf::lists::duplicate_find_option::FIND_FIRST : + cudf::lists::duplicate_find_option::FIND_LAST; + auto result = cudf::lists::index_of(lcv, *lookup_key_column, find_option); + return reinterpret_cast(result.release()); + } + CATCH_STD(env, 0); +} + JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_listSortRows(JNIEnv *env, jclass, jlong column_view, jboolean is_descending, diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index b78183692a3..0771de9492d 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -18,6 +18,7 @@ package ai.rapids.cudf; +import ai.rapids.cudf.ColumnView.FindOptions; import ai.rapids.cudf.HostColumnVector.*; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; @@ -4364,70 +4365,160 @@ void testDropListDuplicatesWithKeysValues() { } } + @SafeVarargs + private static ColumnVector makeListsColumn(DType childDType, List... 
rows) { + HostColumnVector.DataType childType = new HostColumnVector.BasicType(true, childDType); + HostColumnVector.DataType listType = new HostColumnVector.ListType(true, childType); + return ColumnVector.fromLists(listType, rows); + } + @Test void testListContainsString() { - List list1 = Arrays.asList("HΓ©llo there", "thΓ©sΓ©"); - List list2 = Arrays.asList("", "ARΓ© some", "test strings"); - List list3 = Arrays.asList(null, "", "ARΓ© some", "test strings", "thΓ©sΓ©"); - List list4 = Arrays.asList(null, "", "ARΓ© some", "test strings"); - List list5 = null; - try (ColumnVector v = ColumnVector.fromLists(new HostColumnVector.ListType(true, - new HostColumnVector.BasicType(true, DType.STRING)), list1, list2, list3, list4, list5); - ColumnVector expected = ColumnVector.fromBoxedBooleans(true, false, true, null, null); - Scalar strScalar = Scalar.fromString("thΓ©sΓ©"); - ColumnVector result = v.listContains(strScalar)) { + List list0 = Arrays.asList("HΓ©llo there", "thΓ©sΓ©"); + List list1 = Arrays.asList("", "ARΓ© some", "test strings"); + List list2 = Arrays.asList(null, "", "ARΓ© some", "test strings", "thΓ©sΓ©"); + List list3 = Arrays.asList(null, "", "ARΓ© some", "test strings"); + List list4 = null; + try (ColumnVector input = makeListsColumn(DType.STRING, list0, list1, list2, list3, list4); + Scalar searchKey = Scalar.fromString("thΓ©sΓ©"); + ColumnVector expected = ColumnVector.fromBoxedBooleans(true, false, true, false, null); + ColumnVector result = input.listContains(searchKey)) { assertColumnsAreEqual(expected, result); } } @Test void testListContainsInt() { - List list1 = Arrays.asList(1, 2, 3); - List list2 = Arrays.asList(4, 5, 6); - List list3 = Arrays.asList(7, 8, 9); - List list4 = null; - try (ColumnVector v = ColumnVector.fromLists(new HostColumnVector.ListType(true, - new HostColumnVector.BasicType(true, DType.INT32)), list1, list2, list3, list4); + List list0 = Arrays.asList(1, 2, 3); + List list1 = Arrays.asList(4, 5, 6); + List list2 = 
Arrays.asList(7, 8, 9); + List list3 = null; + try (ColumnVector input = makeListsColumn(DType.INT32, list0, list1, list2, list3); + Scalar searchKey = Scalar.fromInt(7); ColumnVector expected = ColumnVector.fromBoxedBooleans(false, false, true, null); - Scalar intScalar = Scalar.fromInt(7); - ColumnVector result = v.listContains(intScalar)) { + ColumnVector result = input.listContains(searchKey)) { assertColumnsAreEqual(expected, result); } } @Test void testListContainsStringCol() { - List list1 = Arrays.asList("HΓ©llo there", "thΓ©sΓ©"); - List list2 = Arrays.asList("", "ARΓ© some", "test strings"); - List list3 = Arrays.asList("FOO", "", "ARΓ© some", "test"); + List list0 = Arrays.asList("HΓ©llo there", "thΓ©sΓ©"); + List list1 = Arrays.asList("", "ARΓ© some", "test strings"); + List list2 = Arrays.asList("FOO", "", "ARΓ© some", "test"); + List list3 = Arrays.asList(null, "FOO", "", "ARΓ© some", "test"); List list4 = Arrays.asList(null, "FOO", "", "ARΓ© some", "test"); - List list5 = Arrays.asList(null, "FOO", "", "ARΓ© some", "test"); - List list6 = null; - try (ColumnVector v = ColumnVector.fromLists(new HostColumnVector.ListType(true, - new HostColumnVector.BasicType(true, DType.STRING)), list1, list2, list3, list4, list5, list6); - ColumnVector expected = ColumnVector.fromBoxedBooleans(true, true, true, true, null, null); - ColumnVector strCol = ColumnVector.fromStrings("thΓ©sΓ©", "", "test", "test", "iotA", null); - ColumnVector result = v.listContainsColumn(strCol)) { + List list5 = null; + try (ColumnVector input = makeListsColumn(DType.STRING, list0, list1, list2, list3, list4, list5); + ColumnVector searchKeys = ColumnVector.fromStrings("thΓ©sΓ©", "", "test", "test", "iotA", null); + ColumnVector expected = ColumnVector.fromBoxedBooleans(true, true, true, true, false, null); + ColumnVector result = input.listContainsColumn(searchKeys)) { assertColumnsAreEqual(expected, result); } } @Test void testListContainsIntCol() { - List list1 = Arrays.asList(1, 2, 
3); - List list2 = Arrays.asList(4, 5, 6); + List list0 = Arrays.asList(1, 2, 3); + List list1 = Arrays.asList(4, 5, 6); + List list2 = Arrays.asList(null, 8, 9); List list3 = Arrays.asList(null, 8, 9); - List list4 = Arrays.asList(null, 8, 9); - List list5 = null; - try (ColumnVector v = ColumnVector.fromLists(new HostColumnVector.ListType(true, - new HostColumnVector.BasicType(true, DType.INT32)), list1, list2, list3, list4, list5); - ColumnVector expected = ColumnVector.fromBoxedBooleans(true, false, true, null, null); - ColumnVector intCol = ColumnVector.fromBoxedInts(3, 3, 8, 3, null); - ColumnVector result = v.listContainsColumn(intCol)) { + List list4 = null; + try (ColumnVector input = makeListsColumn(DType.INT32, list0, list1, list2, list3, list4); + ColumnVector searchKeys = ColumnVector.fromBoxedInts(3, 3, 8, 3, null); + ColumnVector expected = ColumnVector.fromBoxedBooleans(true, false, true, false, null); + ColumnVector result = input.listContainsColumn(searchKeys)) { + assertColumnsAreEqual(expected, result); + } + } + + @Test + void testListContainsNulls() { + List list0 = Arrays.asList("HΓ©llo there", "thΓ©sΓ©"); + List list1 = Arrays.asList("", "ARΓ© some", "test strings"); + List list2 = Arrays.asList("FOO", "", "ARΓ© some", "test"); + List list3 = Arrays.asList(null, "FOO", "", "ARΓ© some", "test"); + List list4 = Arrays.asList(null, "FOO", "", "ARΓ© some", "test"); + List list5 = null; + try (ColumnVector input = makeListsColumn(DType.STRING, list0, list1, list2, list3, list4, list5); + ColumnVector result = input.listContainsNulls(); + ColumnVector expected = ColumnVector.fromBoxedBooleans(false, false, false, true, true, null)) { assertColumnsAreEqual(expected, result); } } + @Test + void testListIndexOfString() { + List list0 = Arrays.asList("HΓ©llo there", "thΓ©sΓ©"); + List list1 = Arrays.asList("", "ARΓ© some", "test strings"); + List list2 = Arrays.asList(null, "", "ARΓ© some", "thΓ©sΓ©", "test strings", "thΓ©sΓ©"); + List list3 = 
Arrays.asList(null, "", "ARé some", "test strings"); + List list4 = null; + try (ColumnVector input = makeListsColumn(DType.STRING, list0, list1, list2, list3, list4); + Scalar searchKey = Scalar.fromString("thésé"); + ColumnVector expectedFirst = ColumnVector.fromBoxedInts(1, -1, 3, -1, null); + ColumnVector resultFirst = input.listIndexOf(searchKey, FindOptions.FIND_FIRST); + ColumnVector expectedLast = ColumnVector.fromBoxedInts(1, -1, 5, -1, null); + ColumnVector resultLast = input.listIndexOf(searchKey, FindOptions.FIND_LAST)) { + assertColumnsAreEqual(expectedFirst, resultFirst); + assertColumnsAreEqual(expectedLast, resultLast); + } + } + + @Test + void testListIndexOfInt() { + List list0 = Arrays.asList(1, 2, 3); + List list1 = Arrays.asList(4, 5, 6); + List list2 = Arrays.asList(7, 8, 9, 7); + List list3 = null; + try (ColumnVector input = makeListsColumn(DType.INT32, list0, list1, list2, list3); + Scalar searchKey = Scalar.fromInt(7); + ColumnVector expectedFirst = ColumnVector.fromBoxedInts(-1, -1, 0, null); + ColumnVector resultFirst = input.listIndexOf(searchKey, FindOptions.FIND_FIRST); + ColumnVector expectedLast = ColumnVector.fromBoxedInts(-1, -1, 3, null); + ColumnVector resultLast = input.listIndexOf(searchKey, FindOptions.FIND_LAST)) { + assertColumnsAreEqual(expectedFirst, resultFirst); + assertColumnsAreEqual(expectedLast, resultLast); + } + } + + @Test + void testListIndexOfStringCol() { + List list0 = Arrays.asList("Héllo there", "thésé"); + List list1 = Arrays.asList("", "ARé some", "test strings"); + List list2 = Arrays.asList("FOO", "", "ARé some", "test"); + List list3 = Arrays.asList(null, "FOO", "", "test", "ARé some", "test"); + List list4 = Arrays.asList(null, "FOO", "", "ARé some", "test"); + List list5 = null; + try (ColumnVector input = makeListsColumn(DType.STRING, list0, list1, list2, list3, list4, list5); + ColumnVector searchKeys = ColumnVector.fromStrings("thésé", "", "test", "test", "iotA", null); + ColumnVector
expectedFirst = ColumnVector.fromBoxedInts(1, 0, 3, 3, -1, null); + ColumnVector resultFirst = input.listIndexOf(searchKeys, FindOptions.FIND_FIRST); + ColumnVector expectedLast = ColumnVector.fromBoxedInts(1, 0, 3, 5, -1, null); + ColumnVector resultLast = input.listIndexOf(searchKeys, FindOptions.FIND_LAST)) { + assertColumnsAreEqual(expectedFirst, resultFirst); + assertColumnsAreEqual(expectedLast, resultLast); + } + } + + @Test + void testListIndexOfIntCol() { + List list0 = Arrays.asList(1, 2, 3); + List list1 = Arrays.asList(4, 5, 6); + List list2 = Arrays.asList(null, 8, 9, 8); + List list3 = Arrays.asList(null, 8, 9); + List list4 = null; + try (ColumnVector input = makeListsColumn(DType.INT32, list0, list1, list2, list3, list4); + ColumnVector searchKeys = ColumnVector.fromBoxedInts(3, 3, 8, 3, null); + ColumnVector expectedFirst = ColumnVector.fromBoxedInts(2, -1, 1, -1, null); + ColumnVector resultFirst = input.listIndexOf(searchKeys, FindOptions.FIND_FIRST); + ColumnVector expectedLast = ColumnVector.fromBoxedInts(2, -1, 3, -1, null); + ColumnVector resultLast = input.listIndexOf(searchKeys, FindOptions.FIND_LAST)) { + assertColumnsAreEqual(expectedFirst, resultFirst); + assertColumnsAreEqual(expectedLast, resultLast); + } + } + @Test void testListSortRowsWithIntChild() { List list1 = Arrays.asList(1, 3, 0, 2); diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index b898222d7d7..44749103b54 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -304,8 +304,8 @@ def test_get_nulls(): ([[1, 2, 3], [], [3, 4, 5]], 6, [False, False, False],), ([[1.0, 2.0, 3.0], None, []], 2.0, [True, None, False],), ([[None, "b", "c"], [], ["b", "e", "f"]], "b", [True, False, True],), - ([[None, 2, 3], None, []], 1, [None, None, False]), - ([[None, "b", "c"], [], ["b", "e", "f"]], "d", [None, False, False],), + ([[None, 2, 3], None, []], 1, [False, None, False]), + ([[None, "b", "c"], [], ["b", "e", 
"f"]], "d", [False, False, False],), ], ) def test_contains_scalar(data, scalar, expect): From 68384ea2e1071d2f35867514d8a6add500d50cc6 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 22 Dec 2021 09:14:27 -0600 Subject: [PATCH 04/12] Merge branch-21.12 into branch-22.02 --- CHANGELOG.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 36c550926ab..68ff9abc9ea 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,7 +2,6 @@ Please see https://github.com/rapidsai/cudf/releases/tag/v22.02.00a for the latest changes to this development branch. -# cuDF 21.12.00 (Date TBD) # cuDF 21.12.00 (9 Dec 2021) ## 🚨 Breaking Changes @@ -1705,7 +1704,7 @@ Please see https://github.com/rapidsai/cudf/releases/tag/v22.02.00a for the late - PR #6459 Add `map` method to series - PR #6379 Add list hashing functionality to MD5 - PR #6498 Add helper method to ColumnBuilder with some nits -- PR #6336 Add `join` functionality in cudf concat +- PR #6336 Add `join` functionality in cudf concat - PR #6653 Replaced SHFL_XOR calls with cub::WarpReduce - PR #6751 Rework ColumnViewAccess and its usage - PR #6698 Remove macros from ORC reader and writer From 04f4219428f734ddc284aad141a34f9d2bca37f5 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 23 Dec 2021 00:29:27 -0600 Subject: [PATCH 05/12] Use gpuci_mamba_retry to install local artifacts. (#9951) I see CI timeouts occurring at the step where local conda artifacts for libcudf and libcudf_kafka are installed. This PR uses `gpuci_mamba_retry` instead of `conda` to install those local artifacts (this change was also recently made in https://github.com/rapidsai/cugraph/pull/1928). 
Example timeouts: - https://gpuci.gpuopenanalytics.com/job/rapidsai/job/gpuci/job/cudf/job/prb/job/cudf-gpu-test/CUDA=11.5,GPU_LABEL=driver-495,LINUX_VER=ubuntu20.04,PYTHON=3.8/5764/console - https://gpuci.gpuopenanalytics.com/job/rapidsai/job/gpuci/job/cudf/job/prb/job/cudf-gpu-test/CUDA=11.5,GPU_LABEL=driver-495,LINUX_VER=ubuntu20.04,PYTHON=3.8/5773/console Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/cudf/pull/9951 --- ci/gpu/build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 5646c268301..a557a2ef066 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -219,7 +219,7 @@ else KAFKA_CONDA_FILE=${KAFKA_CONDA_FILE//-/=} #convert to conda install gpuci_logger "Installing $CUDF_CONDA_FILE & $KAFKA_CONDA_FILE" - conda install -c ${CONDA_ARTIFACT_PATH} "$CUDF_CONDA_FILE" "$KAFKA_CONDA_FILE" + gpuci_mamba_retry install -c ${CONDA_ARTIFACT_PATH} "$CUDF_CONDA_FILE" "$KAFKA_CONDA_FILE" install_dask From c99a37fb804282565204fd1544d145177620c0a3 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 23 Dec 2021 17:40:31 -0600 Subject: [PATCH 06/12] Remove deprecated method Series.hash_encode. (#9942) This PR removes the deprecated method `Series.hash_encode`. Resolves #9475. Follows up on #9457, #9381. This PR also removes libcudf code paths used solely for this Python method. Users may replace code like `series.hash_encode(stop, use_name=False)` with `series.hash_values(method="murmur3") % stop`. 
Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Ram (Ramakrishna Prabhu) (https://github.com/rgsl888prabhu) - Conor Hoekstra (https://github.com/codereport) URL: https://github.com/rapidsai/cudf/pull/9942 --- cpp/include/cudf/detail/hashing.hpp | 14 ++--- cpp/include/cudf/hashing.hpp | 9 +-- cpp/include/cudf/table/row_operators.cuh | 48 --------------- cpp/src/hash/hashing.cu | 6 +- cpp/src/hash/murmur_hash.cu | 25 ++------ cpp/tests/hashing/hash_test.cpp | 50 +++++++-------- docs/cudf/source/api_docs/series.rst | 2 - python/cudf/cudf/_lib/cpp/hash.pxd | 3 +- python/cudf/cudf/_lib/hash.pyx | 4 +- python/cudf/cudf/core/frame.py | 4 +- python/cudf/cudf/core/series.py | 77 ------------------------ python/cudf/cudf/tests/test_dataframe.py | 38 ------------ 12 files changed, 46 insertions(+), 234 deletions(-) diff --git a/cpp/include/cudf/detail/hashing.hpp b/cpp/include/cudf/detail/hashing.hpp index bd5c8a42a51..0fc807593fb 100644 --- a/cpp/include/cudf/detail/hashing.hpp +++ b/cpp/include/cudf/detail/hashing.hpp @@ -32,17 +32,15 @@ namespace detail { */ std::unique_ptr hash( table_view const& input, - hash_id hash_function = hash_id::HASH_MURMUR3, - cudf::host_span initial_hash = {}, - uint32_t seed = 0, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + hash_id hash_function = hash_id::HASH_MURMUR3, + uint32_t seed = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); std::unique_ptr murmur_hash3_32( table_view const& input, - cudf::host_span initial_hash = {}, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); std::unique_ptr md5_hash( table_view 
const& input, diff --git a/cpp/include/cudf/hashing.hpp b/cpp/include/cudf/hashing.hpp index 6b281c3f7f4..cce05042917 100644 --- a/cpp/include/cudf/hashing.hpp +++ b/cpp/include/cudf/hashing.hpp @@ -31,8 +31,6 @@ namespace cudf { * * @param input The table of columns to hash. * @param hash_function The hash function enum to use. - * @param initial_hash Optional host_span of initial hash values for each column. - * If this span is empty then each element will be hashed as-is. * @param seed Optional seed value to use for the hash function. * @param mr Device memory resource used to allocate the returned column's device memory. * @@ -40,10 +38,9 @@ namespace cudf { */ std::unique_ptr hash( table_view const& input, - hash_id hash_function = hash_id::HASH_MURMUR3, - cudf::host_span initial_hash = {}, - uint32_t seed = DEFAULT_HASH_SEED, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + hash_id hash_function = hash_id::HASH_MURMUR3, + uint32_t seed = DEFAULT_HASH_SEED, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/table/row_operators.cuh b/cpp/include/cudf/table/row_operators.cuh index 0f3ca073380..32ddd1ef49a 100644 --- a/cpp/include/cudf/table/row_operators.cuh +++ b/cpp/include/cudf/table/row_operators.cuh @@ -539,52 +539,4 @@ class row_hasher { uint32_t _seed{DEFAULT_HASH_SEED}; }; -/** - * @brief Computes the hash value of a row in the given table, combined with an - * initial hash value for each column. - * - * @tparam hash_function Hash functor to use for hashing elements. - * @tparam Nullate A cudf::nullate type describing how to check for nulls. - */ -template