diff --git a/.gitignore b/.gitignore index b398cfc4f88..aee3d072de2 100644 --- a/.gitignore +++ b/.gitignore @@ -159,3 +159,7 @@ dask-worker-space/ # protobuf **/*_pb2.py + +# Sphinx docs & build artifacts +docs/cudf/source/api_docs/generated/* +docs/cudf/source/api_docs/api/* \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index cc92cde15a8..de00213a6f6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,9 +2,260 @@ Please see https://github.com/rapidsai/cudf/releases/tag/v21.10.00a for the latest changes to this development branch. -# cuDF 21.08.00 (Date TBD) +# cuDF 21.08.00 (4 Aug 2021) -Please see https://github.com/rapidsai/cudf/releases/tag/v21.08.00a for the latest changes to this development branch. +## 🚨 Breaking Changes + +- Fix a crash in pack() when being handed tables with no columns. ([#8697](https://github.com/rapidsai/cudf/pull/8697)) [@nvdbaranec](https://github.com/nvdbaranec) +- Remove unused cudf::strings::create_offsets ([#8663](https://github.com/rapidsai/cudf/pull/8663)) [@davidwendt](https://github.com/davidwendt) +- Add delimiter parameter to cudf::strings::capitalize() ([#8620](https://github.com/rapidsai/cudf/pull/8620)) [@davidwendt](https://github.com/davidwendt) +- Change default datetime index resolution to ns to match pandas ([#8611](https://github.com/rapidsai/cudf/pull/8611)) [@vyasr](https://github.com/vyasr) +- Add sequence_type parameter to cudf::strings::title function ([#8602](https://github.com/rapidsai/cudf/pull/8602)) [@davidwendt](https://github.com/davidwendt) +- Add `strings::repeat_strings` API that can repeat each string a different number of times ([#8561](https://github.com/rapidsai/cudf/pull/8561)) [@ttnghia](https://github.com/ttnghia) +- String-to-boolean conversion is different from Pandas ([#8549](https://github.com/rapidsai/cudf/pull/8549)) [@skirui-source](https://github.com/skirui-source) +- Add accurate hash join size functions ([#8453](https://github.com/rapidsai/cudf/pull/8453)) [@PointKernel](https://github.com/PointKernel) +- Expose a Decimal32Dtype in cuDF Python ([#8438](https://github.com/rapidsai/cudf/pull/8438)) [@skirui-source](https://github.com/skirui-source) +- Update dask make_meta changes to be compatible with dask upstream ([#8426](https://github.com/rapidsai/cudf/pull/8426)) [@galipremsagar](https://github.com/galipremsagar) +- Adapt `cudf::scalar` classes to changes in `rmm::device_scalar` ([#8411](https://github.com/rapidsai/cudf/pull/8411)) [@harrism](https://github.com/harrism) +- Remove special Index class from the general index class hierarchy ([#8309](https://github.com/rapidsai/cudf/pull/8309)) [@vyasr](https://github.com/vyasr) +- Add first-class dtype utilities ([#8308](https://github.com/rapidsai/cudf/pull/8308)) [@vyasr](https://github.com/vyasr) +- ORC - Support reading multiple orc files/buffers in a single operation ([#8142](https://github.com/rapidsai/cudf/pull/8142)) [@jdye64](https://github.com/jdye64) +- Upgrade arrow to 4.0.1 ([#7495](https://github.com/rapidsai/cudf/pull/7495)) [@galipremsagar](https://github.com/galipremsagar) + +## 🐛 Bug Fixes + +- Fix `contains` check in string column ([#8834](https://github.com/rapidsai/cudf/pull/8834)) [@galipremsagar](https://github.com/galipremsagar) +- Remove unused 
variable from `row_bit_count_test`. ([#8829](https://github.com/rapidsai/cudf/pull/8829)) [@mythrocks](https://github.com/mythrocks) +- Fixes issue with null struct columns in ORC reader ([#8819](https://github.com/rapidsai/cudf/pull/8819)) [@rgsl888prabhu](https://github.com/rgsl888prabhu) +- Set CMake vars for python/parquet support in libarrow builds ([#8808](https://github.com/rapidsai/cudf/pull/8808)) [@vyasr](https://github.com/vyasr) +- Handle empty child columns in row_bit_count() ([#8791](https://github.com/rapidsai/cudf/pull/8791)) [@mythrocks](https://github.com/mythrocks) +- Revert "Remove cudf unneeded build time requirement of the cuda driver" ([#8784](https://github.com/rapidsai/cudf/pull/8784)) [@robertmaynard](https://github.com/robertmaynard) +- Fix isort error in utils.pyx ([#8771](https://github.com/rapidsai/cudf/pull/8771)) [@charlesbluca](https://github.com/charlesbluca) +- Handle sliced struct/list columns properly in concatenate() bounds checking. ([#8760](https://github.com/rapidsai/cudf/pull/8760)) [@nvdbaranec](https://github.com/nvdbaranec) +- Fix issues with `_CPackedColumns.serialize()` handling of host and device data ([#8759](https://github.com/rapidsai/cudf/pull/8759)) [@charlesbluca](https://github.com/charlesbluca) +- Fix issues with `MultiIndex` in `dropna`, `stack` & `reset_index` ([#8753](https://github.com/rapidsai/cudf/pull/8753)) [@galipremsagar](https://github.com/galipremsagar) +- Write pandas extension types to parquet file metadata ([#8749](https://github.com/rapidsai/cudf/pull/8749)) [@devavret](https://github.com/devavret) +- Fix `where` to handle `DataFrame` & `Series` input combination ([#8747](https://github.com/rapidsai/cudf/pull/8747)) [@galipremsagar](https://github.com/galipremsagar) +- Fix `replace` to handle null values correctly ([#8744](https://github.com/rapidsai/cudf/pull/8744)) [@galipremsagar](https://github.com/galipremsagar) +- Handle sliced structs properly in pack/contiguous_split. ([#8739](https://github.com/rapidsai/cudf/pull/8739)) [@nvdbaranec](https://github.com/nvdbaranec) +- Fix issue in slice() where columns with a positive offset were computing null counts incorrectly. 
([#8738](https://github.com/rapidsai/cudf/pull/8738)) [@nvdbaranec](https://github.com/nvdbaranec) +- Fix `cudf.Series` constructor to handle list of sequences ([#8735](https://github.com/rapidsai/cudf/pull/8735)) [@galipremsagar](https://github.com/galipremsagar) +- Fix min/max sorted groupby aggregation on string column with nulls (argmin, argmax sentinel value missing on nulls) ([#8731](https://github.com/rapidsai/cudf/pull/8731)) [@karthikeyann](https://github.com/karthikeyann) +- Fix orc reader assert on create data_type in debug ([#8706](https://github.com/rapidsai/cudf/pull/8706)) [@davidwendt](https://github.com/davidwendt) +- Fix min/max inclusive cudf::scan for strings column ([#8705](https://github.com/rapidsai/cudf/pull/8705)) [@davidwendt](https://github.com/davidwendt) +- JNI: Fix driver version assertion logic in testGetCudaRuntimeInfo ([#8701](https://github.com/rapidsai/cudf/pull/8701)) [@sperlingxx](https://github.com/sperlingxx) +- Adding fix for skip_rows and crash in orc reader ([#8700](https://github.com/rapidsai/cudf/pull/8700)) [@rgsl888prabhu](https://github.com/rgsl888prabhu) +- Bug fix: `replace_nulls_policy` functor not returning correct indices for gathermap ([#8699](https://github.com/rapidsai/cudf/pull/8699)) [@isVoid](https://github.com/isVoid) +- Fix a crash in pack() when being handed tables with no columns. ([#8697](https://github.com/rapidsai/cudf/pull/8697)) [@nvdbaranec](https://github.com/nvdbaranec) +- Add post-processing steps to `dask_cudf.groupby.CudfSeriesGroupby.aggregate` ([#8694](https://github.com/rapidsai/cudf/pull/8694)) [@charlesbluca](https://github.com/charlesbluca) +- JNI build no longer looks for Arrow in conda environment ([#8686](https://github.com/rapidsai/cudf/pull/8686)) [@jlowe](https://github.com/jlowe) +- Handle arbitrarily different data in null list column rows when checking for equivalency. ([#8666](https://github.com/rapidsai/cudf/pull/8666)) [@nvdbaranec](https://github.com/nvdbaranec) +- Add ConfigureNVBench to avoid concurrent main() entry points ([#8662](https://github.com/rapidsai/cudf/pull/8662)) [@PointKernel](https://github.com/PointKernel) +- Pin `*arrow` to use `*cuda` in `run` ([#8651](https://github.com/rapidsai/cudf/pull/8651)) [@jakirkham](https://github.com/jakirkham) +- Add proper support for tolerances in testing methods. 
([#8649](https://github.com/rapidsai/cudf/pull/8649)) [@vyasr](https://github.com/vyasr) +- Support multi-char case conversion in capitalize function ([#8647](https://github.com/rapidsai/cudf/pull/8647)) [@davidwendt](https://github.com/davidwendt) +- Fix repeated mangled names in read_csv with duplicate column names ([#8645](https://github.com/rapidsai/cudf/pull/8645)) [@karthikeyann](https://github.com/karthikeyann) +- Temporarily disable libcudf example build tests ([#8642](https://github.com/rapidsai/cudf/pull/8642)) [@isVoid](https://github.com/isVoid) +- Use conda-sourced cudf artifacts for libcudf example in CI ([#8638](https://github.com/rapidsai/cudf/pull/8638)) [@isVoid](https://github.com/isVoid) +- Ensure dev environment uses Arrow GPU packages ([#8637](https://github.com/rapidsai/cudf/pull/8637)) [@charlesbluca](https://github.com/charlesbluca) +- Fix bug that columns only initialized once when specified `columns` and `index` in dataframe ctor ([#8628](https://github.com/rapidsai/cudf/pull/8628)) [@isVoid](https://github.com/isVoid) +- Propagate **kwargs through to as_*_column methods ([#8618](https://github.com/rapidsai/cudf/pull/8618)) [@shwina](https://github.com/shwina) +- Fix orc_reader_benchmark.cpp compile error ([#8609](https://github.com/rapidsai/cudf/pull/8609)) [@davidwendt](https://github.com/davidwendt) +- Fix missed renumbering of Aggregation values ([#8600](https://github.com/rapidsai/cudf/pull/8600)) [@revans2](https://github.com/revans2) +- Update cmake to 3.20.5 in the Java Docker image ([#8593](https://github.com/rapidsai/cudf/pull/8593)) [@NvTimLiu](https://github.com/NvTimLiu) +- Fix bug in replace_with_backrefs when group has greedy quantifier ([#8575](https://github.com/rapidsai/cudf/pull/8575)) [@davidwendt](https://github.com/davidwendt) +- Apply metadata to keys before returning in `Frame._encode` ([#8560](https://github.com/rapidsai/cudf/pull/8560)) [@charlesbluca](https://github.com/charlesbluca) +- Fix for strings containing special JSON characters in get_json_object(). 
([#8556](https://github.com/rapidsai/cudf/pull/8556)) [@nvdbaranec](https://github.com/nvdbaranec) +- Fix debug compile error in gather_struct_tests.cpp ([#8554](https://github.com/rapidsai/cudf/pull/8554)) [@davidwendt](https://github.com/davidwendt) +- String-to-boolean conversion is different from Pandas ([#8549](https://github.com/rapidsai/cudf/pull/8549)) [@skirui-source](https://github.com/skirui-source) +- Fix `__repr__` output with `display.max_rows` is `None` ([#8547](https://github.com/rapidsai/cudf/pull/8547)) [@galipremsagar](https://github.com/galipremsagar) +- Fix size passed to column constructors in _with_type_metadata ([#8539](https://github.com/rapidsai/cudf/pull/8539)) [@shwina](https://github.com/shwina) +- Properly retrieve last column when `-1` is specified for column index ([#8529](https://github.com/rapidsai/cudf/pull/8529)) [@isVoid](https://github.com/isVoid) +- Fix importing `apply` from `dask` ([#8517](https://github.com/rapidsai/cudf/pull/8517)) [@galipremsagar](https://github.com/galipremsagar) +- Fix offset of the string dictionary length stream ([#8515](https://github.com/rapidsai/cudf/pull/8515)) [@vuule](https://github.com/vuule) +- Fix double counting of selected columns in CSV reader ([#8508](https://github.com/rapidsai/cudf/pull/8508)) [@ochan1](https://github.com/ochan1) +- Incorrect map size in scatter_to_gather corrupts struct columns ([#8507](https://github.com/rapidsai/cudf/pull/8507)) [@gerashegalov](https://github.com/gerashegalov) +- replace_nulls properly propagates memory resource to gather calls ([#8500](https://github.com/rapidsai/cudf/pull/8500)) [@robertmaynard](https://github.com/robertmaynard) +- Disallow groupby aggs for `StructColumns` ([#8499](https://github.com/rapidsai/cudf/pull/8499)) [@charlesbluca](https://github.com/charlesbluca) +- Fixes out-of-bounds access for small files in unzip ([#8498](https://github.com/rapidsai/cudf/pull/8498)) [@elstehle](https://github.com/elstehle) +- Adding support for writing empty dataframe ([#8490](https://github.com/rapidsai/cudf/pull/8490)) [@shaneding](https://github.com/shaneding) +- Fix exclusive scan when including nulls and improve testing ([#8478](https://github.com/rapidsai/cudf/pull/8478)) [@harrism](https://github.com/harrism) +- Add workaround for crash in libcudf debug build using output_indexalator in thrust::lower_bound ([#8432](https://github.com/rapidsai/cudf/pull/8432)) [@davidwendt](https://github.com/davidwendt) +- Install only the same Thrust files that Thrust itself installs ([#8420](https://github.com/rapidsai/cudf/pull/8420)) [@robertmaynard](https://github.com/robertmaynard) +- Add nightly version for ucx-py in ci script ([#8419](https://github.com/rapidsai/cudf/pull/8419)) [@galipremsagar](https://github.com/galipremsagar) +- Fix null_equality config of rolling_collect_set ([#8415](https://github.com/rapidsai/cudf/pull/8415)) [@sperlingxx](https://github.com/sperlingxx) +- CollectSetAggregation: implement RollingAggregation interface ([#8406](https://github.com/rapidsai/cudf/pull/8406)) [@sperlingxx](https://github.com/sperlingxx) +- Handle pre-sliced nested columns in contiguous_split. 
([#8391](https://github.com/rapidsai/cudf/pull/8391)) [@nvdbaranec](https://github.com/nvdbaranec) +- Fix bitmask_tests.cpp host accessing device memory ([#8370](https://github.com/rapidsai/cudf/pull/8370)) [@davidwendt](https://github.com/davidwendt) +- Fix concurrent_unordered_map to prevent accessing padding bits in pair_type ([#8348](https://github.com/rapidsai/cudf/pull/8348)) [@davidwendt](https://github.com/davidwendt) +- BUG FIX: Raise appropriate strings error when concatenating strings column ([#8290](https://github.com/rapidsai/cudf/pull/8290)) [@skirui-source](https://github.com/skirui-source) +- Make gpuCI and pre-commit style configurations consistent ([#8215](https://github.com/rapidsai/cudf/pull/8215)) [@charlesbluca](https://github.com/charlesbluca) +- Add collect list to dask-cudf groupby aggregations ([#8045](https://github.com/rapidsai/cudf/pull/8045)) [@charlesbluca](https://github.com/charlesbluca) + +## 📖 Documentation + +- Update Python UDFs notebook ([#8810](https://github.com/rapidsai/cudf/pull/8810)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Fix dask.dataframe API docs links after reorg ([#8772](https://github.com/rapidsai/cudf/pull/8772)) [@jsignell](https://github.com/jsignell) +- Fix instructions for running cuDF/dask-cuDF tests in CONTRIBUTING.md ([#8724](https://github.com/rapidsai/cudf/pull/8724)) [@shwina](https://github.com/shwina) +- Translate Markdown documentation to rST and remove recommonmark ([#8698](https://github.com/rapidsai/cudf/pull/8698)) [@vyasr](https://github.com/vyasr) +- Fixed spelling mistakes in libcudf documentation ([#8664](https://github.com/rapidsai/cudf/pull/8664)) [@karthikeyann](https://github.com/karthikeyann) +- Custom Sphinx Extension: `PandasCompat` ([#8643](https://github.com/rapidsai/cudf/pull/8643)) [@isVoid](https://github.com/isVoid) +- Fix README.md ([#8535](https://github.com/rapidsai/cudf/pull/8535)) [@ajschmidt8](https://github.com/ajschmidt8) +- Change namespace contains_nulls to struct ([#8523](https://github.com/rapidsai/cudf/pull/8523)) [@davidwendt](https://github.com/davidwendt) +- Add info about NVTX ranges to dev guide ([#8461](https://github.com/rapidsai/cudf/pull/8461)) [@jrhemstad](https://github.com/jrhemstad) +- Fixed documentation bug in groupby agg method ([#8325](https://github.com/rapidsai/cudf/pull/8325)) [@ahmet-uyar](https://github.com/ahmet-uyar) + +## 🚀 New Features + +- Fix concatenating structs ([#8811](https://github.com/rapidsai/cudf/pull/8811)) [@shaneding](https://github.com/shaneding) +- Implement JNI for groupby aggregations `M2` and `MERGE_M2` ([#8763](https://github.com/rapidsai/cudf/pull/8763)) [@ttnghia](https://github.com/ttnghia) +- Bump `isort` to `5.6.4` and remove `isort` overrides made for 5.0.7 ([#8755](https://github.com/rapidsai/cudf/pull/8755)) [@charlesbluca](https://github.com/charlesbluca) +- Implement `__setitem__` for `StructColumn` ([#8737](https://github.com/rapidsai/cudf/pull/8737)) [@shaneding](https://github.com/shaneding) +- Add `is_leap_year` to `DateTimeProperties` and `DatetimeIndex` ([#8736](https://github.com/rapidsai/cudf/pull/8736)) [@isVoid](https://github.com/isVoid) +- Add `struct.explode()` method 
([#8729](https://github.com/rapidsai/cudf/pull/8729)) [@shwina](https://github.com/shwina) +- Add `DataFrame.to_struct()` method to convert a DataFrame to a struct Series ([#8728](https://github.com/rapidsai/cudf/pull/8728)) [@shwina](https://github.com/shwina) +- Add support for list type in ORC writer ([#8723](https://github.com/rapidsai/cudf/pull/8723)) [@vuule](https://github.com/vuule) +- Fix slicing from struct columns and accessing struct columns ([#8719](https://github.com/rapidsai/cudf/pull/8719)) [@shaneding](https://github.com/shaneding) +- Add `datetime::is_leap_year` ([#8711](https://github.com/rapidsai/cudf/pull/8711)) [@isVoid](https://github.com/isVoid) +- Accessing struct columns from `dask_cudf` ([#8675](https://github.com/rapidsai/cudf/pull/8675)) [@shaneding](https://github.com/shaneding) +- Added pct_change to Series ([#8650](https://github.com/rapidsai/cudf/pull/8650)) [@TravisHester](https://github.com/TravisHester) +- Add strings support to cudf::shift function ([#8648](https://github.com/rapidsai/cudf/pull/8648)) [@davidwendt](https://github.com/davidwendt) +- Support Scatter `struct_scalar` ([#8630](https://github.com/rapidsai/cudf/pull/8630)) [@isVoid](https://github.com/isVoid) +- Struct scalar from host dictionary ([#8629](https://github.com/rapidsai/cudf/pull/8629)) [@shaneding](https://github.com/shaneding) +- Add dayofyear and day_of_year to Series, DatetimeColumn, and DatetimeIndex ([#8626](https://github.com/rapidsai/cudf/pull/8626)) [@beckernick](https://github.com/beckernick) +- JNI support for capitalize ([#8624](https://github.com/rapidsai/cudf/pull/8624)) [@firestarman](https://github.com/firestarman) +- Add delimiter parameter to cudf::strings::capitalize() ([#8620](https://github.com/rapidsai/cudf/pull/8620)) [@davidwendt](https://github.com/davidwendt) +- Add NVBench in CMake ([#8619](https://github.com/rapidsai/cudf/pull/8619)) [@PointKernel](https://github.com/PointKernel) +- Change default datetime index resolution to ns to match pandas ([#8611](https://github.com/rapidsai/cudf/pull/8611)) [@vyasr](https://github.com/vyasr) +- ListColumn `__setitem__` ([#8606](https://github.com/rapidsai/cudf/pull/8606)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Implement groupby aggregations `M2` and `MERGE_M2` ([#8605](https://github.com/rapidsai/cudf/pull/8605)) [@ttnghia](https://github.com/ttnghia) +- Add sequence_type parameter to cudf::strings::title function ([#8602](https://github.com/rapidsai/cudf/pull/8602)) [@davidwendt](https://github.com/davidwendt) +- Adding support for list and struct type in ORC Reader ([#8599](https://github.com/rapidsai/cudf/pull/8599)) [@rgsl888prabhu](https://github.com/rgsl888prabhu) +- Benchmark for `strings::repeat_strings` APIs ([#8589](https://github.com/rapidsai/cudf/pull/8589)) [@ttnghia](https://github.com/ttnghia) +- Nested scalar support for copy if else ([#8588](https://github.com/rapidsai/cudf/pull/8588)) [@gerashegalov](https://github.com/gerashegalov) +- User specified decimal columns to float64 ([#8587](https://github.com/rapidsai/cudf/pull/8587)) [@jdye64](https://github.com/jdye64) +- Add `get_element` for struct column 
([#8578](https://github.com/rapidsai/cudf/pull/8578)) [@isVoid](https://github.com/isVoid) +- Python changes for adding `__getitem__` for `struct` ([#8577](https://github.com/rapidsai/cudf/pull/8577)) [@shaneding](https://github.com/shaneding) +- Add `strings::repeat_strings` API that can repeat each string a different number of times ([#8561](https://github.com/rapidsai/cudf/pull/8561)) [@ttnghia](https://github.com/ttnghia) +- Refactor `tests/iterator_utilities.hpp` functions ([#8540](https://github.com/rapidsai/cudf/pull/8540)) [@ttnghia](https://github.com/ttnghia) +- Support MERGE_LISTS and MERGE_SETS in Java package ([#8516](https://github.com/rapidsai/cudf/pull/8516)) [@sperlingxx](https://github.com/sperlingxx) +- Decimal support csv reader ([#8511](https://github.com/rapidsai/cudf/pull/8511)) [@elstehle](https://github.com/elstehle) +- Add column type tests ([#8505](https://github.com/rapidsai/cudf/pull/8505)) [@isVoid](https://github.com/isVoid) +- Warn when downscaling decimal columns ([#8492](https://github.com/rapidsai/cudf/pull/8492)) [@ChrisJar](https://github.com/ChrisJar) +- Add JNI for `strings::repeat_strings` ([#8491](https://github.com/rapidsai/cudf/pull/8491)) [@ttnghia](https://github.com/ttnghia) +- Add `Index.get_loc` for Numerical, String Index support ([#8489](https://github.com/rapidsai/cudf/pull/8489)) [@isVoid](https://github.com/isVoid) +- Expose half_up rounding in cuDF ([#8477](https://github.com/rapidsai/cudf/pull/8477)) [@shwina](https://github.com/shwina) +- Java APIs to fetch CUDA runtime info ([#8465](https://github.com/rapidsai/cudf/pull/8465)) [@sperlingxx](https://github.com/sperlingxx) +- Add `str.edit_distance_matrix` ([#8463](https://github.com/rapidsai/cudf/pull/8463)) [@isVoid](https://github.com/isVoid) +- Support constructing `cudf.Scalar` objects from host side lists ([#8459](https://github.com/rapidsai/cudf/pull/8459)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Add accurate hash join size functions ([#8453](https://github.com/rapidsai/cudf/pull/8453)) [@PointKernel](https://github.com/PointKernel) +- Add cudf::strings::integer_to_hex convert API ([#8450](https://github.com/rapidsai/cudf/pull/8450)) [@davidwendt](https://github.com/davidwendt) +- Create objects from iterables that contain cudf.NA ([#8442](https://github.com/rapidsai/cudf/pull/8442)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- JNI bindings for sort_lists ([#8439](https://github.com/rapidsai/cudf/pull/8439)) [@sperlingxx](https://github.com/sperlingxx) +- Expose a Decimal32Dtype in cuDF Python ([#8438](https://github.com/rapidsai/cudf/pull/8438)) [@skirui-source](https://github.com/skirui-source) +- Replace `all_null()` and `all_valid()` by `iterator_all_nulls()` and `iterator_no_null()` in tests ([#8437](https://github.com/rapidsai/cudf/pull/8437)) [@ttnghia](https://github.com/ttnghia) +- Implement groupby `MERGE_LISTS` and `MERGE_SETS` aggregates ([#8436](https://github.com/rapidsai/cudf/pull/8436)) [@ttnghia](https://github.com/ttnghia) +- Add public libcudf match_dictionaries API ([#8429](https://github.com/rapidsai/cudf/pull/8429)) [@davidwendt](https://github.com/davidwendt) +- Add move 
constructors for `string_scalar` and `struct_scalar` ([#8428](https://github.com/rapidsai/cudf/pull/8428)) [@ttnghia](https://github.com/ttnghia) +- Implement `strings::repeat_strings` ([#8423](https://github.com/rapidsai/cudf/pull/8423)) [@ttnghia](https://github.com/ttnghia) +- STRUCT column support for cudf::merge. ([#8422](https://github.com/rapidsai/cudf/pull/8422)) [@nvdbaranec](https://github.com/nvdbaranec) +- Implement reverse in libcudf ([#8410](https://github.com/rapidsai/cudf/pull/8410)) [@shaneding](https://github.com/shaneding) +- Support multiple input files/buffers for read_json ([#8403](https://github.com/rapidsai/cudf/pull/8403)) [@jdye64](https://github.com/jdye64) +- Improve test coverage for struct search ([#8396](https://github.com/rapidsai/cudf/pull/8396)) [@ttnghia](https://github.com/ttnghia) +- Add `groupby.fillna` ([#8362](https://github.com/rapidsai/cudf/pull/8362)) [@isVoid](https://github.com/isVoid) +- Enable AST-based joining ([#8214](https://github.com/rapidsai/cudf/pull/8214)) [@vyasr](https://github.com/vyasr) +- Generalized null support in user defined functions ([#8213](https://github.com/rapidsai/cudf/pull/8213)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Add compiled binary operation ([#8192](https://github.com/rapidsai/cudf/pull/8192)) [@karthikeyann](https://github.com/karthikeyann) +- Implement `.describe() ` for `DataFrameGroupBy` ([#8179](https://github.com/rapidsai/cudf/pull/8179)) [@skirui-source](https://github.com/skirui-source) +- ORC - Support reading multiple orc files/buffers in a single operation ([#8142](https://github.com/rapidsai/cudf/pull/8142)) [@jdye64](https://github.com/jdye64) +- Add Python bindings for `lists::concatenate_list_elements` and expose them as `.list.concat()` ([#8006](https://github.com/rapidsai/cudf/pull/8006)) [@shwina](https://github.com/shwina) +- Use Arrow URI FileSystem backed instance to retrieve remote files ([#7709](https://github.com/rapidsai/cudf/pull/7709)) [@jdye64](https://github.com/jdye64) +- Example to build custom application and link to libcudf ([#7671](https://github.com/rapidsai/cudf/pull/7671)) [@isVoid](https://github.com/isVoid) +- Upgrade arrow to 4.0.1 ([#7495](https://github.com/rapidsai/cudf/pull/7495)) [@galipremsagar](https://github.com/galipremsagar) + +## 🛠️ Improvements + +- Provide a better error message when `CUDA::cuda_driver` not found ([#8794](https://github.com/rapidsai/cudf/pull/8794)) [@robertmaynard](https://github.com/robertmaynard) +- Remove anonymous namespace from null_mask.cuh ([#8786](https://github.com/rapidsai/cudf/pull/8786)) [@nvdbaranec](https://github.com/nvdbaranec) +- Allow cudf to be built without libcuda.so existing ([#8751](https://github.com/rapidsai/cudf/pull/8751)) [@robertmaynard](https://github.com/robertmaynard) +- Pin `mimesis` to `<4.1` ([#8745](https://github.com/rapidsai/cudf/pull/8745)) [@galipremsagar](https://github.com/galipremsagar) +- Update `conda` environment name for CI ([#8692](https://github.com/rapidsai/cudf/pull/8692)) [@ajschmidt8](https://github.com/ajschmidt8) +- Remove flatbuffers dependency ([#8671](https://github.com/rapidsai/cudf/pull/8671)) 
[@Ethyling](https://github.com/Ethyling) +- Add options to build Arrow with Python and Parquet support ([#8670](https://github.com/rapidsai/cudf/pull/8670)) [@trxcllnt](https://github.com/trxcllnt) +- Remove unused cudf::strings::create_offsets ([#8663](https://github.com/rapidsai/cudf/pull/8663)) [@davidwendt](https://github.com/davidwendt) +- Update GDS lib version to 1.0.0 ([#8654](https://github.com/rapidsai/cudf/pull/8654)) [@pxLi](https://github.com/pxLi) +- Support for groupby/scan rank and dense_rank aggregations ([#8652](https://github.com/rapidsai/cudf/pull/8652)) [@rwlee](https://github.com/rwlee) +- Fix usage of deprecated arrow ipc API ([#8632](https://github.com/rapidsai/cudf/pull/8632)) [@revans2](https://github.com/revans2) +- Use absolute imports in `cudf` ([#8631](https://github.com/rapidsai/cudf/pull/8631)) [@galipremsagar](https://github.com/galipremsagar) +- ENH Add Java CI build script ([#8627](https://github.com/rapidsai/cudf/pull/8627)) [@dillon-cullinan](https://github.com/dillon-cullinan) +- Add DeprecationWarning to `ser.str.subword_tokenize` ([#8603](https://github.com/rapidsai/cudf/pull/8603)) [@VibhuJawa](https://github.com/VibhuJawa) +- Rewrite binary operations for improved performance and additional type support ([#8598](https://github.com/rapidsai/cudf/pull/8598)) [@vyasr](https://github.com/vyasr) +- Fix `mypy` errors surfacing because of `numpy-1.21.0` ([#8595](https://github.com/rapidsai/cudf/pull/8595)) [@galipremsagar](https://github.com/galipremsagar) +- Remove unneeded includes from cudf::string_view headers ([#8594](https://github.com/rapidsai/cudf/pull/8594)) [@davidwendt](https://github.com/davidwendt) +- Use cmake 3.20.1 as it is now required by rmm ([#8586](https://github.com/rapidsai/cudf/pull/8586)) [@robertmaynard](https://github.com/robertmaynard) +- Remove device debug symbols from cmake CUDF_CUDA_FLAGS ([#8584](https://github.com/rapidsai/cudf/pull/8584)) [@davidwendt](https://github.com/davidwendt) +- Dask-CuDF: use default Dask Dataframe optimizer ([#8581](https://github.com/rapidsai/cudf/pull/8581)) [@madsbk](https://github.com/madsbk) +- Remove checking if an unsigned value is less than zero ([#8579](https://github.com/rapidsai/cudf/pull/8579)) [@robertmaynard](https://github.com/robertmaynard) +- Remove strings_count parameter from cudf::strings::detail::create_chars_child_column ([#8576](https://github.com/rapidsai/cudf/pull/8576)) [@davidwendt](https://github.com/davidwendt) +- Make `cudf.api.types` imports consistent ([#8571](https://github.com/rapidsai/cudf/pull/8571)) [@galipremsagar](https://github.com/galipremsagar) +- Modernize libcudf basic example CMakeFile; updates CI build tests ([#8568](https://github.com/rapidsai/cudf/pull/8568)) [@isVoid](https://github.com/isVoid) +- Rename concatenate_tests.cu to .cpp ([#8555](https://github.com/rapidsai/cudf/pull/8555)) [@davidwendt](https://github.com/davidwendt) +- enable window lead/lag test on struct ([#8548](https://github.com/rapidsai/cudf/pull/8548)) [@wbo4958](https://github.com/wbo4958) +- Add Java methods to split and write column views ([#8546](https://github.com/rapidsai/cudf/pull/8546)) [@razajafri](https://github.com/razajafri) +- 
Small cleanup ([#8534](https://github.com/rapidsai/cudf/pull/8534)) [@codereport](https://github.com/codereport) +- Unpin `dask` version in CI ([#8533](https://github.com/rapidsai/cudf/pull/8533)) [@galipremsagar](https://github.com/galipremsagar) +- Added optional flag for building Arrow with S3 filesystem support ([#8531](https://github.com/rapidsai/cudf/pull/8531)) [@jdye64](https://github.com/jdye64) +- Minor clean up of various internal column and frame utilities ([#8528](https://github.com/rapidsai/cudf/pull/8528)) [@vyasr](https://github.com/vyasr) +- Rename some copying_test source files .cu to .cpp ([#8527](https://github.com/rapidsai/cudf/pull/8527)) [@davidwendt](https://github.com/davidwendt) +- Correct the last warnings and issues when using newer cuda versions ([#8525](https://github.com/rapidsai/cudf/pull/8525)) [@robertmaynard](https://github.com/robertmaynard) +- Correct unused parameter warnings in transform and unary ops ([#8521](https://github.com/rapidsai/cudf/pull/8521)) [@robertmaynard](https://github.com/robertmaynard) +- Correct unused parameter warnings in string algorithms ([#8509](https://github.com/rapidsai/cudf/pull/8509)) [@robertmaynard](https://github.com/robertmaynard) +- Add in JNI APIs for scan, replace_nulls, group_by.scan, and group_by.replace_nulls ([#8503](https://github.com/rapidsai/cudf/pull/8503)) [@revans2](https://github.com/revans2) +- Fix `21.08` forward-merge conflicts ([#8502](https://github.com/rapidsai/cudf/pull/8502)) [@ajschmidt8](https://github.com/ajschmidt8) +- Fix Cython formatting command in Contributing.md. ([#8496](https://github.com/rapidsai/cudf/pull/8496)) [@marlenezw](https://github.com/marlenezw) +- Bug/correct unused parameters in reshape and text ([#8495](https://github.com/rapidsai/cudf/pull/8495)) [@robertmaynard](https://github.com/robertmaynard) +- Correct unused parameter warnings in partitioning and stream compact ([#8494](https://github.com/rapidsai/cudf/pull/8494)) [@robertmaynard](https://github.com/robertmaynard) +- Correct unused parameter warnings in labelling and list algorithms ([#8493](https://github.com/rapidsai/cudf/pull/8493)) [@robertmaynard](https://github.com/robertmaynard) +- Refactor index construction ([#8485](https://github.com/rapidsai/cudf/pull/8485)) [@vyasr](https://github.com/vyasr) +- Correct unused parameter warnings in replace algorithms ([#8483](https://github.com/rapidsai/cudf/pull/8483)) [@robertmaynard](https://github.com/robertmaynard) +- Correct unused parameter warnings in reduction algorithms ([#8481](https://github.com/rapidsai/cudf/pull/8481)) [@robertmaynard](https://github.com/robertmaynard) +- Correct unused parameter warnings in io algorithms ([#8480](https://github.com/rapidsai/cudf/pull/8480)) [@robertmaynard](https://github.com/robertmaynard) +- Correct unused parameter warnings in interop algorithms ([#8479](https://github.com/rapidsai/cudf/pull/8479)) [@robertmaynard](https://github.com/robertmaynard) +- Correct unused parameter warnings in filling algorithms ([#8468](https://github.com/rapidsai/cudf/pull/8468)) [@robertmaynard](https://github.com/robertmaynard) +- Correct unused parameter warnings in groupby 
([#8467](https://github.com/rapidsai/cudf/pull/8467)) [@robertmaynard](https://github.com/robertmaynard) +- use libcu++ time_point as timestamp ([#8466](https://github.com/rapidsai/cudf/pull/8466)) [@karthikeyann](https://github.com/karthikeyann) +- Modify reprog_device::extract to return groups in a single pass ([#8460](https://github.com/rapidsai/cudf/pull/8460)) [@davidwendt](https://github.com/davidwendt) +- Update minimum Dask requirement to 2021.6.0 ([#8458](https://github.com/rapidsai/cudf/pull/8458)) [@pentschev](https://github.com/pentschev) +- Fix failures when performing binary operations on DataFrames with empty columns ([#8452](https://github.com/rapidsai/cudf/pull/8452)) [@ChrisJar](https://github.com/ChrisJar) +- Fix conflicts in `8447` ([#8448](https://github.com/rapidsai/cudf/pull/8448)) [@ajschmidt8](https://github.com/ajschmidt8) +- Add serialization methods for `List` and `StructDtype` ([#8441](https://github.com/rapidsai/cudf/pull/8441)) [@charlesbluca](https://github.com/charlesbluca) +- Replace make_empty_strings_column with make_empty_column ([#8435](https://github.com/rapidsai/cudf/pull/8435)) [@davidwendt](https://github.com/davidwendt) +- JNI bindings for get_element ([#8433](https://github.com/rapidsai/cudf/pull/8433)) [@revans2](https://github.com/revans2) +- Update dask make_meta changes to be compatible with dask upstream ([#8426](https://github.com/rapidsai/cudf/pull/8426)) [@galipremsagar](https://github.com/galipremsagar) +- Unpin dask version on CI ([#8425](https://github.com/rapidsai/cudf/pull/8425)) [@galipremsagar](https://github.com/galipremsagar) +- Add benchmark for strings/fixed_point convert APIs ([#8417](https://github.com/rapidsai/cudf/pull/8417)) [@davidwendt](https://github.com/davidwendt) +- Adapt `cudf::scalar` classes to changes in `rmm::device_scalar` ([#8411](https://github.com/rapidsai/cudf/pull/8411)) [@harrism](https://github.com/harrism) +- Add benchmark for strings/integers convert APIs ([#8402](https://github.com/rapidsai/cudf/pull/8402)) [@davidwendt](https://github.com/davidwendt) +- Enable multi-file partitioning in dask_cudf.read_parquet ([#8393](https://github.com/rapidsai/cudf/pull/8393)) [@rjzamora](https://github.com/rjzamora) +- Correct unused parameter warnings in rolling algorithms ([#8390](https://github.com/rapidsai/cudf/pull/8390)) [@robertmaynard](https://github.com/robertmaynard) +- Correct unused parameters in column round and search ([#8389](https://github.com/rapidsai/cudf/pull/8389)) [@robertmaynard](https://github.com/robertmaynard) +- Add functionality to apply `Dtype` metadata to `ColumnBase` ([#8373](https://github.com/rapidsai/cudf/pull/8373)) [@charlesbluca](https://github.com/charlesbluca) +- Refactor setting stack size in regex code ([#8358](https://github.com/rapidsai/cudf/pull/8358)) [@davidwendt](https://github.com/davidwendt) +- Update Java bindings to 21.08-SNAPSHOT ([#8344](https://github.com/rapidsai/cudf/pull/8344)) [@pxLi](https://github.com/pxLi) +- Replace remaining uses of device_vector ([#8343](https://github.com/rapidsai/cudf/pull/8343)) [@harrism](https://github.com/harrism) +- Statically link libnvcomp into libcudfjni 
([#8334](https://github.com/rapidsai/cudf/pull/8334)) [@jlowe](https://github.com/jlowe) +- Resolve auto merge conflicts for Branch 21.08 from branch 21.06 ([#8329](https://github.com/rapidsai/cudf/pull/8329)) [@galipremsagar](https://github.com/galipremsagar) +- Minor code refactor for sorted_order ([#8326](https://github.com/rapidsai/cudf/pull/8326)) [@wbo4958](https://github.com/wbo4958) +- Remove special Index class from the general index class hierarchy ([#8309](https://github.com/rapidsai/cudf/pull/8309)) [@vyasr](https://github.com/vyasr) +- Add first-class dtype utilities ([#8308](https://github.com/rapidsai/cudf/pull/8308)) [@vyasr](https://github.com/vyasr) +- Add option to link Java bindings with Arrow dynamically ([#8307](https://github.com/rapidsai/cudf/pull/8307)) [@jlowe](https://github.com/jlowe) +- Refactor ColumnMethods and its subclasses to remove `column` argument and require `parent` argument ([#8306](https://github.com/rapidsai/cudf/pull/8306)) [@shwina](https://github.com/shwina) +- Refactor `scatter` for list columns ([#8255](https://github.com/rapidsai/cudf/pull/8255)) [@isVoid](https://github.com/isVoid) +- Expose pack/unpack API to Python ([#8153](https://github.com/rapidsai/cudf/pull/8153)) [@charlesbluca](https://github.com/charlesbluca) +- Adding cudf.cut method ([#8002](https://github.com/rapidsai/cudf/pull/8002)) [@marlenezw](https://github.com/marlenezw) +- Optimize string gather performance for large strings ([#7980](https://github.com/rapidsai/cudf/pull/7980)) [@gaohao95](https://github.com/gaohao95) +- Add peak memory usage tracking to cuIO benchmarks ([#7770](https://github.com/rapidsai/cudf/pull/7770)) [@devavret](https://github.com/devavret) +- Updating Clang Version to 11.0.0 ([#6695](https://github.com/rapidsai/cudf/pull/6695)) [@codereport](https://github.com/codereport) # cuDF 21.06.00 (9 Jun 2021) diff --git a/README.md b/README.md index 587f18d2603..525820eee01 100644 --- a/README.md +++ b/README.md @@ -65,15 +65,15 @@ Please see the [Demo Docker Repository](https://hub.docker.com/r/rapidsai/rapids cuDF can be installed with conda ([miniconda](https://conda.io/miniconda.html), or the full [Anaconda distribution](https://www.anaconda.com/download)) from the `rapidsai` channel: -For `cudf version == 21.06` : +For `cudf version == 21.08` : ```bash # for CUDA 11.0 conda install -c rapidsai -c nvidia -c numba -c conda-forge \ - cudf=21.06 python=3.7 cudatoolkit=11.0 + cudf=21.08 python=3.7 cudatoolkit=11.0 # or, for CUDA 11.2 conda install -c rapidsai -c nvidia -c numba -c conda-forge \ - cudf=21.06 python=3.7 cudatoolkit=11.2 + cudf=21.08 python=3.7 cudatoolkit=11.2 ``` diff --git a/build.sh b/build.sh index 70b93427d5c..11948c64412 100755 --- a/build.sh +++ b/build.sh @@ -18,26 +18,27 @@ ARGS=$* REPODIR=$(cd $(dirname $0); pwd) VALIDARGS="clean libcudf cudf dask_cudf benchmarks tests libcudf_kafka cudf_kafka custreamz -v -g -n -l --allgpuarch --disable_nvtx --show_depr_warn --ptds -h" -HELP="$0 [clean] [libcudf] [cudf] [dask_cudf] [benchmarks] [tests] [libcudf_kafka] [cudf_kafka] [custreamz] [-v] [-g] [-n] [-h] [-l] - clean - remove all existing build artifacts and configuration (start - over) - libcudf - build the cudf C++ code only - cudf - build the cudf Python package - dask_cudf - build the dask_cudf Python package - benchmarks - build 
benchmarks - tests - build tests - libcudf_kafka - build the libcudf_kafka C++ code only - cudf_kafka - build the cudf_kafka Python package - custreamz - build the custreamz Python package - -v - verbose build mode - -g - build for debug - -n - no install step - -l - build legacy tests - --allgpuarch - build for all supported GPU architectures - --disable_nvtx - disable inserting NVTX profiling ranges - --show_depr_warn - show cmake deprecation warnings - --ptds - enable per-thread default stream - -h | --h[elp] - print this text +HELP="$0 [clean] [libcudf] [cudf] [dask_cudf] [benchmarks] [tests] [libcudf_kafka] [cudf_kafka] [custreamz] [-v] [-g] [-n] [-h] [-l] [--cmake-args=\"\"] + clean - remove all existing build artifacts and configuration (start + over) + libcudf - build the cudf C++ code only + cudf - build the cudf Python package + dask_cudf - build the dask_cudf Python package + benchmarks - build benchmarks + tests - build tests + libcudf_kafka - build the libcudf_kafka C++ code only + cudf_kafka - build the cudf_kafka Python package + custreamz - build the custreamz Python package + -v - verbose build mode + -g - build for debug + -n - no install step + -l - build legacy tests + --allgpuarch - build for all supported GPU architectures + --disable_nvtx - disable inserting NVTX profiling ranges + --show_depr_warn - show cmake deprecation warnings + --ptds - enable per-thread default stream + --cmake-args=\\\"\\\" - pass arbitrary list of CMake configuration options (escape all quotes in argument) + -h | --h[elp] - print this text default action (no args) is to build and install 'libcudf' then 'cudf' then 'dask_cudf' targets @@ -71,6 +72,28 @@ function hasArg { (( ${NUMARGS} != 0 )) && (echo " ${ARGS} " | grep -q " $1 ") } +function cmakeArgs { + # Check for multiple cmake args options + if [[ $(echo $ARGS | { grep -Eo "\-\-cmake\-args" || true; } | wc -l ) -gt 1 ]]; then + echo "Multiple --cmake-args options were provided, please provide only one: ${ARGS}" + exit 1 + fi + + # Check for cmake args option + if [[ -n $(echo $ARGS | { grep -E "\-\-cmake\-args" || true; } ) ]]; then + # There are possible weird edge cases that may cause this regex filter to output nothing and fail silently + # the true pipe will catch any weird edge cases that may happen and will cause the program to fall back + # on the invalid option error + CMAKE_ARGS=$(echo $ARGS | { grep -Eo "\-\-cmake\-args=\".+\"" || true; }) + if [[ -n ${CMAKE_ARGS} ]]; then + # Remove the full CMAKE_ARGS argument from list of args so that it passes validArgs function + ARGS=${ARGS//$CMAKE_ARGS/} + # Filter the full argument down to just the extra string that will be added to cmake call + CMAKE_ARGS=$(echo $CMAKE_ARGS | grep -Eo "\".+\"" | sed -e 's/^"//' -e 's/"$//') + fi + fi +} + function buildAll { ((${NUMARGS} == 0 )) || !(echo " ${ARGS} " | grep -q " [^-]\+ ") } @@ -82,9 +105,11 @@ fi # Check for valid usage if (( ${NUMARGS} != 0 )); then + # Check for cmake args + cmakeArgs for a in ${ARGS}; do if ! (echo " ${VALIDARGS} " | grep -q " ${a} "); then - echo "Invalid option: ${a}" + echo "Invalid option or formatting, check --help: ${a}" exit 1 fi done @@ -139,7 +164,6 @@ fi # Configure, build, and install libcudf if buildAll || hasArg libcudf; then - if (( ${BUILD_ALL_GPU_ARCH} == 0 )); then CUDF_CMAKE_CUDA_ARCHITECTURES="-DCMAKE_CUDA_ARCHITECTURES=" echo "Building for the architecture of the GPU in the system..." 
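Editor's note on the `--cmake-args` pass-through added above — a minimal illustrative sketch, not part of the diff; the specific `-D` settings are arbitrary examples (any option understood by libcudf's CMakeLists.txt, such as the `CUDF_ENABLE_ARROW_S3` and `CUDF_USE_ARROW_STATIC` options shown later in this diff, can be forwarded the same way):

```bash
# Quotes must be escaped so the literal --cmake-args="..." string survives shell
# expansion and matches the pattern that the new cmakeArgs() function greps for.
./build.sh libcudf --cmake-args=\"-DCUDF_ENABLE_ARROW_S3=OFF -DCUDF_USE_ARROW_STATIC=ON\"
```

The extracted option string is appended to the `cmake` configure invocations for both libcudf and libcudf_kafka, after the flags build.sh already sets.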
@@ -156,7 +180,8 @@ if buildAll || hasArg libcudf; then -DBUILD_BENCHMARKS=${BUILD_BENCHMARKS} \ -DDISABLE_DEPRECATION_WARNING=${BUILD_DISABLE_DEPRECATION_WARNING} \ -DPER_THREAD_DEFAULT_STREAM=${BUILD_PER_THREAD_DEFAULT_STREAM} \ - -DCMAKE_BUILD_TYPE=${BUILD_TYPE} + -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + ${CMAKE_ARGS} cd ${LIB_BUILD_DIR} @@ -172,8 +197,7 @@ if buildAll || hasArg cudf; then cd ${REPODIR}/python/cudf if [[ ${INSTALL_TARGET} != "" ]]; then - PARALLEL_LEVEL=${PARALLEL_LEVEL} python setup.py build_ext --inplace -j${PARALLEL_LEVEL} - python setup.py install --single-version-externally-managed --record=record.txt + PARALLEL_LEVEL=${PARALLEL_LEVEL} python setup.py build_ext -j${PARALLEL_LEVEL} install --single-version-externally-managed --record=record.txt else PARALLEL_LEVEL=${PARALLEL_LEVEL} python setup.py build_ext --inplace -j${PARALLEL_LEVEL} --library-dir=${LIBCUDF_BUILD_DIR} fi @@ -196,7 +220,8 @@ if hasArg libcudf_kafka; then cmake -S $REPODIR/cpp/libcudf_kafka -B ${KAFKA_LIB_BUILD_DIR} \ -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ -DBUILD_TESTS=${BUILD_TESTS} \ - -DCMAKE_BUILD_TYPE=${BUILD_TYPE} + -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + ${CMAKE_ARGS} cd ${KAFKA_LIB_BUILD_DIR} diff --git a/conda/environments/cudf_dev_cuda11.0.yml b/conda/environments/cudf_dev_cuda11.0.yml index 70bbe88a00c..2c0984569db 100644 --- a/conda/environments/cudf_dev_cuda11.0.yml +++ b/conda/environments/cudf_dev_cuda11.0.yml @@ -17,7 +17,7 @@ dependencies: - numba>=0.53.1 - numpy - pandas>=1.0,<1.3.0dev0 - - pyarrow=4.0.1=*cuda + - pyarrow=5.0.0=*cuda - fastavro>=0.22.9 - notebook>=0.5.0 - cython>=0.29,<0.30 @@ -26,7 +26,6 @@ dependencies: - pytest-benchmark - pytest-xdist - sphinx - - sphinx_rtd_theme - sphinxcontrib-websupport - nbsphinx - numpydoc @@ -43,7 +42,7 @@ dependencies: - dask>=2021.6.0 - distributed>=2021.6.0 - streamz - - arrow-cpp=4.0.1 + - arrow-cpp=5.0.0 - dlpack>=0.5,<0.6.0a0 - arrow-cpp-proc * cuda - double-conversion @@ -57,6 +56,7 @@ dependencies: - nvtx>=0.2.1 - cachetools - transformers + - pydata-sphinx-theme - pip: - git+https://github.com/dask/dask.git@main - git+https://github.com/dask/distributed.git@main diff --git a/conda/environments/cudf_dev_cuda11.2.yml b/conda/environments/cudf_dev_cuda11.2.yml index 6d2abdda449..766d85e957b 100644 --- a/conda/environments/cudf_dev_cuda11.2.yml +++ b/conda/environments/cudf_dev_cuda11.2.yml @@ -17,7 +17,7 @@ dependencies: - numba>=0.53.1 - numpy - pandas>=1.0,<1.3.0dev0 - - pyarrow=4.0.1=*cuda + - pyarrow=5.0.0=*cuda - fastavro>=0.22.9 - notebook>=0.5.0 - cython>=0.29,<0.30 @@ -26,7 +26,6 @@ dependencies: - pytest-benchmark - pytest-xdist - sphinx - - sphinx_rtd_theme - sphinxcontrib-websupport - nbsphinx - numpydoc @@ -43,7 +42,7 @@ dependencies: - dask>=2021.6.0 - distributed>=2021.6.0 - streamz - - arrow-cpp=4.0.1 + - arrow-cpp=5.0.0 - dlpack>=0.5,<0.6.0a0 - arrow-cpp-proc * cuda - double-conversion @@ -57,6 +56,7 @@ dependencies: - nvtx>=0.2.1 - cachetools - transformers + - pydata-sphinx-theme - pip: - git+https://github.com/dask/dask.git@main - git+https://github.com/dask/distributed.git@main diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 9023e89c2f5..ca36acccfbb 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -30,7 +30,7 @@ requirements: - setuptools - numba >=0.53.1 - dlpack>=0.5,<0.6.0a0 - - pyarrow 4.0.1 *cuda + - pyarrow 5.0.0 *cuda - libcudf {{ version }} - rmm {{ minor_version }} - cudatoolkit {{ cuda_version }} diff --git 
a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 6c4175a2539..208c21c2dc0 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -37,7 +37,7 @@ requirements: host: - librmm {{ minor_version }}.* - cudatoolkit {{ cuda_version }}.* - - arrow-cpp 4.0.1 *cuda + - arrow-cpp 5.0.0 *cuda - arrow-cpp-proc * cuda - dlpack>=0.5,<0.6.0a0 run: @@ -51,11 +51,9 @@ test: - test -f $PREFIX/lib/libcudf.so - test -f $PREFIX/lib/libcudftestutil.a - test -f $PREFIX/include/cudf/aggregation.hpp - - test -f $PREFIX/include/cudf/ast/transform.hpp - - test -f $PREFIX/include/cudf/ast/detail/linearizer.hpp + - test -f $PREFIX/include/cudf/ast/detail/expression_parser.hpp - test -f $PREFIX/include/cudf/ast/detail/operators.hpp - - test -f $PREFIX/include/cudf/ast/nodes.hpp - - test -f $PREFIX/include/cudf/ast/operators.hpp + - test -f $PREFIX/include/cudf/ast/expressions.hpp - test -f $PREFIX/include/cudf/binaryop.hpp - test -f $PREFIX/include/cudf/labeling/label_bins.hpp - test -f $PREFIX/include/cudf/column/column_factories.hpp @@ -102,6 +100,7 @@ test: - test -f $PREFIX/include/cudf/detail/utilities/integer_utils.hpp - test -f $PREFIX/include/cudf/detail/utilities/int_fastdiv.h - test -f $PREFIX/include/cudf/detail/utilities/vector_factories.hpp + - test -f $PREFIX/include/cudf/detail/utilities/visitor_overload.hpp - test -f $PREFIX/include/cudf/dictionary/detail/concatenate.hpp - test -f $PREFIX/include/cudf/dictionary/detail/encode.hpp - test -f $PREFIX/include/cudf/dictionary/detail/merge.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 5c05a58b448..3eee1147414 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -28,6 +28,17 @@ elseif(CMAKE_CUDA_ARCHITECTURES STREQUAL "") set(CUDF_BUILD_FOR_DETECTED_ARCHS TRUE) endif() +file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-21.10/RAPIDS.cmake + ${CMAKE_BINARY_DIR}/RAPIDS.cmake) +include(${CMAKE_BINARY_DIR}/RAPIDS.cmake) + +include(rapids-cmake) +include(rapids-cpm) +include(rapids-cuda) +include(rapids-export) +include(rapids-find) + + project(CUDF VERSION 21.10.00 LANGUAGES C CXX) # Needed because GoogleBenchmark changes the state of FindThreads.cmake, @@ -44,6 +55,7 @@ option(BUILD_BENCHMARKS "Configure CMake to build (google & nvbench) benchmarks" option(BUILD_SHARED_LIBS "Build cuDF shared libraries" ON) option(JITIFY_USE_CACHE "Use a file cache for JIT compiled kernels" ON) option(CUDF_USE_ARROW_STATIC "Build and statically link Arrow libraries" OFF) +option(CUDF_ENABLE_ARROW_ORC "Build the Arrow ORC adapter" OFF) option(CUDF_ENABLE_ARROW_PYTHON "Find (or build) Arrow with Python support" OFF) option(CUDF_ENABLE_ARROW_PARQUET "Find (or build) Arrow with Parquet support" OFF) option(CUDF_ENABLE_ARROW_S3 "Build/Enable AWS S3 Arrow filesystem support" ON) @@ -137,6 +149,9 @@ include(cmake/thirdparty/CUDF_GetArrow.cmake) include(cmake/thirdparty/CUDF_GetDLPack.cmake) # find libcu++ include(cmake/thirdparty/CUDF_GetLibcudacxx.cmake) +# find cuCollections +# Should come after including thrust and libcudacxx +include(cmake/thirdparty/CUDF_GetcuCollections.cmake) # find or install GoogleTest include(cmake/thirdparty/CUDF_GetGTest.cmake) # preprocess jitify-able kernels @@ -151,8 +166,8 @@ add_library(cudf src/aggregation/aggregation.cpp src/aggregation/aggregation.cu src/aggregation/result_cache.cpp - src/ast/linearizer.cpp - src/ast/transform.cu + src/ast/expression_parser.cpp + src/ast/expressions.cpp src/binaryop/binaryop.cpp src/binaryop/compiled/binary_ops.cu 
src/binaryop/compiled/Add.cu @@ -255,6 +270,7 @@ add_library(cudf src/interop/dlpack.cpp src/interop/from_arrow.cu src/interop/to_arrow.cu + src/interop/detail/arrow_allocator.cpp src/io/avro/avro.cpp src/io/avro/avro_gpu.cu src/io/avro/reader_impl.cu @@ -283,7 +299,7 @@ add_library(cudf src/io/orc/writer_impl.cu src/io/parquet/compact_protocol_writer.cpp src/io/parquet/page_data.cu - src/io/parquet/page_dict.cu + src/io/parquet/chunk_dict.cu src/io/parquet/page_enc.cu src/io/parquet/page_hdr.cu src/io/parquet/parquet.cpp @@ -305,6 +321,7 @@ add_library(cudf src/join/cross_join.cu src/join/hash_join.cu src/join/join.cu + src/join/join_utils.cu src/join/semi_join.cu src/lists/contains.cu src/lists/combine/concatenate_list_elements.cu @@ -436,6 +453,7 @@ add_library(cudf src/text/subword/wordpiece_tokenizer.cu src/text/tokenize.cu src/transform/bools_to_mask.cu + src/transform/compute_column.cu src/transform/encode.cu src/transform/mask_to_bools.cu src/transform/nans_to_nulls.cu @@ -523,7 +541,8 @@ target_link_libraries(cudf PUBLIC ZLIB::ZLIB ${ARROW_LIBRARIES} cudf::Thrust - rmm::rmm) + rmm::rmm + PRIVATE cuco::cuco) if(CUDA_STATIC_RUNTIME) # Tell CMake what CUDA language runtime to use @@ -628,9 +647,11 @@ endif() ################################################################################################### # - install targets ------------------------------------------------------------------------------- +include(CPack) + include(GNUInstallDirs) -set(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/cudf) +set(INSTALL_CONFIGDIR lib/cmake/cudf) set(CMAKE_INSTALL_DEFAULT_COMPONENT_NAME cudf) # install target for cudf_base and the proxy libcudf.so @@ -679,22 +700,6 @@ configure_package_config_file(cmake/cudf-build-config.cmake.in ${CUDF_BINARY_DIR write_basic_package_version_file(${CUDF_BINARY_DIR}/cudf-config-version.cmake COMPATIBILITY SameMinorVersion) -if(TARGET arrow_shared) - get_target_property(arrow_is_imported arrow_shared IMPORTED) - if(NOT arrow_is_imported) - export(TARGETS arrow_shared arrow_cuda_shared - FILE ${CUDF_BINARY_DIR}/cudf-arrow-targets.cmake - NAMESPACE cudf::) - endif() -elseif(TARGET arrow_static) - get_target_property(arrow_is_imported arrow_static IMPORTED) - if(NOT arrow_is_imported) - export(TARGETS arrow_static arrow_cuda_static - FILE ${CUDF_BINARY_DIR}/cudf-arrow-targets.cmake - NAMESPACE cudf::) - endif() -endif() - if(TARGET gtest) get_target_property(gtest_is_imported gtest IMPORTED) if(NOT gtest_is_imported) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index d0a47984053..56f17dc7090 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -29,6 +29,7 @@ target_link_libraries(cudf_datagen GTest::gmock_main GTest::gtest_main benchmark::benchmark + nvbench::nvbench Threads::Threads cudf) @@ -102,6 +103,7 @@ ConfigureBench(STREAM_COMPACTION_BENCH stream_compaction/drop_duplicates_benchma ################################################################################################### # - join benchmark -------------------------------------------------------------------------------- ConfigureBench(JOIN_BENCH join/join_benchmark.cu join/conditional_join_benchmark.cu) +ConfigureNVBench(JOIN_NVBENCH join/join_nvbench.cu) ################################################################################################### # - iterator benchmark ---------------------------------------------------------------------------- diff --git a/cpp/benchmarks/ast/transform_benchmark.cpp 
b/cpp/benchmarks/ast/transform_benchmark.cpp index 6f131cf0d6a..fd0a0f7d2c8 100644 --- a/cpp/benchmarks/ast/transform_benchmark.cpp +++ b/cpp/benchmarks/ast/transform_benchmark.cpp @@ -14,10 +14,10 @@ * limitations under the License. */ -#include <cudf/ast/transform.hpp> #include #include #include +#include <cudf/transform.hpp> #include #include @@ -95,22 +95,22 @@ static void BM_ast_transform(benchmark::State& state) // Note that a std::list is required here because of its guarantees against reference invalidation // when items are added or removed. References to items in a std::vector are not safe if the // vector must re-allocate. - auto expressions = std::list<cudf::ast::expression>(); + auto expressions = std::list<cudf::ast::operation>(); // Construct tree that chains additions like (((a + b) + c) + d) auto const op = cudf::ast::ast_operator::ADD; if (reuse_columns) { - expressions.push_back(cudf::ast::expression(op, column_refs.at(0), column_refs.at(0))); + expressions.push_back(cudf::ast::operation(op, column_refs.at(0), column_refs.at(0))); for (cudf::size_type i = 0; i < tree_levels - 1; i++) { - expressions.push_back(cudf::ast::expression(op, expressions.back(), column_refs.at(0))); + expressions.push_back(cudf::ast::operation(op, expressions.back(), column_refs.at(0))); } } else { - expressions.push_back(cudf::ast::expression(op, column_refs.at(0), column_refs.at(1))); + expressions.push_back(cudf::ast::operation(op, column_refs.at(0), column_refs.at(1))); std::transform(std::next(column_refs.cbegin(), 2), column_refs.cend(), std::back_inserter(expressions), [&](auto const& column_ref) { - return cudf::ast::expression(op, expressions.back(), column_ref); + return cudf::ast::operation(op, expressions.back(), column_ref); }); } @@ -119,7 +119,7 @@ static void BM_ast_transform(benchmark::State& state) // Execute benchmark for (auto _ : state) { cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 - cudf::ast::compute_column(table, expression_tree_root); + cudf::compute_column(table, expression_tree_root); } // Use the number of bytes read from global memory diff --git a/cpp/benchmarks/fixture/rmm_pool_raii.hpp b/cpp/benchmarks/fixture/rmm_pool_raii.hpp new file mode 100644 index 00000000000..9038f523b29 --- /dev/null +++ b/cpp/benchmarks/fixture/rmm_pool_raii.hpp @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <rmm/mr/device/cuda_memory_resource.hpp> +#include <rmm/mr/device/owning_wrapper.hpp> +#include <rmm/mr/device/per_device_resource.hpp> +#include <rmm/mr/device/pool_memory_resource.hpp> + +namespace cudf { + +/** + * @brief An RAII class setting up RMM memory pool for `nvbench` benchmarks + * + * This is a temporary solution before templated fixtures tests are supported + * in `nvbench`. Similarly to `cudf::benchmark`, creating this RAII object in + * each benchmark will ensure that the RAPIDS Memory Manager pool mode is used + * in benchmarks, which eliminates memory allocation / deallocation performance + * overhead from the benchmark.
+ * + * Example: + * + * void my_benchmark(nvbench::state& state) { + * cudf::rmm_pool_raii pool_raii; + * state.exec([](nvbench::launch& launch) { + * // benchmark stuff + * }); + * } + * + * NVBENCH_BENCH(my_benchmark); + */ +class rmm_pool_raii { + private: + // memory resource factory helpers + inline auto make_cuda() { return std::make_shared(); } + + inline auto make_pool() + { + return rmm::mr::make_owning_wrapper(make_cuda()); + } + + public: + rmm_pool_raii() + { + mr = make_pool(); + rmm::mr::set_current_device_resource(mr.get()); // set default resource to pool + } + + ~rmm_pool_raii() + { + rmm::mr::set_current_device_resource(nullptr); + mr.reset(); + } + + private: + std::shared_ptr mr; +}; + +} // namespace cudf diff --git a/cpp/benchmarks/groupby/group_nth_benchmark.cu b/cpp/benchmarks/groupby/group_nth_benchmark.cu index 9765a4a265c..8d1de36db95 100644 --- a/cpp/benchmarks/groupby/group_nth_benchmark.cu +++ b/cpp/benchmarks/groupby/group_nth_benchmark.cu @@ -63,7 +63,8 @@ void BM_pre_sorted_nth(benchmark::State& state) std::vector requests; requests.emplace_back(cudf::groupby::aggregation_request()); requests[0].values = vals; - requests[0].aggregations.push_back(cudf::make_nth_element_aggregation(-1)); + requests[0].aggregations.push_back( + cudf::make_nth_element_aggregation(-1)); for (auto _ : state) { cuda_event_timer timer(state, true); diff --git a/cpp/benchmarks/groupby/group_sum_benchmark.cu b/cpp/benchmarks/groupby/group_sum_benchmark.cu index 1455f1cecdc..6351da66fdd 100644 --- a/cpp/benchmarks/groupby/group_sum_benchmark.cu +++ b/cpp/benchmarks/groupby/group_sum_benchmark.cu @@ -58,7 +58,7 @@ void BM_basic_sum(benchmark::State& state) std::vector requests; requests.emplace_back(cudf::groupby::aggregation_request()); requests[0].values = vals; - requests[0].aggregations.push_back(cudf::make_sum_aggregation()); + requests[0].aggregations.push_back(cudf::make_sum_aggregation()); for (auto _ : state) { cuda_event_timer timer(state, true); @@ -97,7 +97,7 @@ void BM_pre_sorted_sum(benchmark::State& state) std::vector requests; requests.emplace_back(cudf::groupby::aggregation_request()); requests[0].values = vals; - requests[0].aggregations.push_back(cudf::make_sum_aggregation()); + requests[0].aggregations.push_back(cudf::make_sum_aggregation()); for (auto _ : state) { cuda_event_timer timer(state, true); diff --git a/cpp/benchmarks/join/conditional_join_benchmark.cu b/cpp/benchmarks/join/conditional_join_benchmark.cu index 4a655e29f74..71b90685fb9 100644 --- a/cpp/benchmarks/join/conditional_join_benchmark.cu +++ b/cpp/benchmarks/join/conditional_join_benchmark.cu @@ -14,117 +14,24 @@ * limitations under the License. 
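In the groupby benchmark hunks above, the removed and added `push_back` lines read identically because their template arguments are lost in this rendering; the change appears to instantiate the aggregation factories for the groupby-specific interface introduced later in this diff. A hedged sketch of the resulting request construction (template argument assumed):

```c++
#include <cudf/aggregation.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/groupby.hpp>

#include <vector>

// Sketch only: aggregations pushed into a groupby request are created through the
// groupby_aggregation interface (the template argument is assumed from this release's API).
std::vector<cudf::groupby::aggregation_request> make_sum_request(cudf::column_view values)
{
  std::vector<cudf::groupby::aggregation_request> requests;
  requests.emplace_back(cudf::groupby::aggregation_request());
  requests[0].values = values;
  requests[0].aggregations.push_back(cudf::make_sum_aggregation<cudf::groupby_aggregation>());
  return requests;
}
```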
*/ -#include - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include - -#include "generate_input_tables.cuh" +#include template class ConditionalJoin : public cudf::benchmark { }; -template -static void BM_join(benchmark::State& state, Join JoinFunc) -{ - const cudf::size_type build_table_size{(cudf::size_type)state.range(0)}; - const cudf::size_type probe_table_size{(cudf::size_type)state.range(1)}; - const cudf::size_type rand_max_val{build_table_size * 2}; - const double selectivity = 0.3; - const bool is_build_table_key_unique = true; - - // Generate build and probe tables - cudf::test::UniformRandomGenerator rand_gen(0, build_table_size); - auto build_random_null_mask = [&rand_gen](int size) { - if (Nullable) { - // roughly 25% nulls - auto validity = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - [&rand_gen](auto i) { return (rand_gen.generate() & 3) == 0; }); - return cudf::test::detail::make_null_mask(validity, validity + size); - } else { - return cudf::create_null_mask(size, cudf::mask_state::UNINITIALIZED); - } - }; - - std::unique_ptr build_key_column = [&]() { - return Nullable ? cudf::make_numeric_column(cudf::data_type(cudf::type_to_id()), - build_table_size, - build_random_null_mask(build_table_size)) - : cudf::make_numeric_column(cudf::data_type(cudf::type_to_id()), - build_table_size); - }(); - std::unique_ptr probe_key_column = [&]() { - return Nullable ? cudf::make_numeric_column(cudf::data_type(cudf::type_to_id()), - probe_table_size, - build_random_null_mask(probe_table_size)) - : cudf::make_numeric_column(cudf::data_type(cudf::type_to_id()), - probe_table_size); - }(); - - generate_input_tables( - build_key_column->mutable_view().data(), - build_table_size, - probe_key_column->mutable_view().data(), - probe_table_size, - selectivity, - rand_max_val, - is_build_table_key_unique); - - auto payload_data_it = thrust::make_counting_iterator(0); - cudf::test::fixed_width_column_wrapper build_payload_column( - payload_data_it, payload_data_it + build_table_size); - - cudf::test::fixed_width_column_wrapper probe_payload_column( - payload_data_it, payload_data_it + probe_table_size); - - CHECK_CUDA(0); - - cudf::table_view build_table({build_key_column->view(), build_payload_column}); - cudf::table_view probe_table({probe_key_column->view(), probe_payload_column}); - - // Benchmark the inner join operation - - for (auto _ : state) { - cuda_event_timer raii(state, true, rmm::cuda_stream_default); - - // Common column references. 
- const auto col_ref_left_0 = cudf::ast::column_reference(0); - const auto col_ref_right_0 = cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT); - auto left_zero_eq_right_zero = - cudf::ast::expression(cudf::ast::ast_operator::EQUAL, col_ref_left_0, col_ref_right_0); - - auto result = - JoinFunc(probe_table, build_table, left_zero_eq_right_zero, cudf::null_equality::UNEQUAL); - } -} - #define CONDITIONAL_INNER_JOIN_BENCHMARK_DEFINE(name, key_type, payload_type, nullable) \ BENCHMARK_TEMPLATE_DEFINE_F(ConditionalJoin, name, key_type, payload_type) \ (::benchmark::State & st) \ { \ auto join = [](cudf::table_view const& left, \ cudf::table_view const& right, \ - cudf::ast::expression binary_pred, \ + cudf::ast::operation binary_pred, \ cudf::null_equality compare_nulls) { \ return cudf::conditional_inner_join(left, right, binary_pred, compare_nulls); \ }; \ - BM_join(st, join); \ + constexpr bool is_conditional = true; \ + BM_join(st, join); \ } CONDITIONAL_INNER_JOIN_BENCHMARK_DEFINE(conditional_inner_join_32bit, int32_t, int32_t, false); @@ -138,11 +45,12 @@ CONDITIONAL_INNER_JOIN_BENCHMARK_DEFINE(conditional_inner_join_64bit_nulls, int6 { \ auto join = [](cudf::table_view const& left, \ cudf::table_view const& right, \ - cudf::ast::expression binary_pred, \ + cudf::ast::operation binary_pred, \ cudf::null_equality compare_nulls) { \ return cudf::conditional_left_join(left, right, binary_pred, compare_nulls); \ }; \ - BM_join(st, join); \ + constexpr bool is_conditional = true; \ + BM_join(st, join); \ } CONDITIONAL_LEFT_JOIN_BENCHMARK_DEFINE(conditional_left_join_32bit, int32_t, int32_t, false); @@ -156,11 +64,12 @@ CONDITIONAL_LEFT_JOIN_BENCHMARK_DEFINE(conditional_left_join_64bit_nulls, int64_ { \ auto join = [](cudf::table_view const& left, \ cudf::table_view const& right, \ - cudf::ast::expression binary_pred, \ + cudf::ast::operation binary_pred, \ cudf::null_equality compare_nulls) { \ return cudf::conditional_inner_join(left, right, binary_pred, compare_nulls); \ }; \ - BM_join(st, join); \ + constexpr bool is_conditional = true; \ + BM_join(st, join); \ } CONDITIONAL_FULL_JOIN_BENCHMARK_DEFINE(conditional_full_join_32bit, int32_t, int32_t, false); @@ -174,11 +83,12 @@ CONDITIONAL_FULL_JOIN_BENCHMARK_DEFINE(conditional_full_join_64bit_nulls, int64_ { \ auto join = [](cudf::table_view const& left, \ cudf::table_view const& right, \ - cudf::ast::expression binary_pred, \ + cudf::ast::operation binary_pred, \ cudf::null_equality compare_nulls) { \ return cudf::conditional_left_anti_join(left, right, binary_pred, compare_nulls); \ }; \ - BM_join(st, join); \ + constexpr bool is_conditional = true; \ + BM_join(st, join); \ } CONDITIONAL_LEFT_ANTI_JOIN_BENCHMARK_DEFINE(conditional_left_anti_join_32bit, @@ -204,11 +114,12 @@ CONDITIONAL_LEFT_ANTI_JOIN_BENCHMARK_DEFINE(conditional_left_anti_join_64bit_nul { \ auto join = [](cudf::table_view const& left, \ cudf::table_view const& right, \ - cudf::ast::expression binary_pred, \ + cudf::ast::operation binary_pred, \ cudf::null_equality compare_nulls) { \ return cudf::conditional_left_semi_join(left, right, binary_pred, compare_nulls); \ }; \ - BM_join(st, join); \ + constexpr bool is_conditional = true; \ + BM_join(st, join); \ } CONDITIONAL_LEFT_SEMI_JOIN_BENCHMARK_DEFINE(conditional_left_semi_join_32bit, @@ -234,11 +145,6 @@ BENCHMARK_REGISTER_F(ConditionalJoin, conditional_inner_join_32bit) ->Args({100'000, 100'000}) ->Args({100'000, 400'000}) ->Args({100'000, 1'000'000}) - // TODO: The below benchmark is slow, but can be 
useful to validate that the - // code works for large data sets. This benchmark was used to compare to the - // otherwise equivalent nullable benchmark below, which has memory errors for - // sufficiently large data sets. - //->Args({1'000'000, 1'000'000}) ->UseManualTime(); BENCHMARK_REGISTER_F(ConditionalJoin, conditional_inner_join_64bit) diff --git a/cpp/benchmarks/join/join_benchmark.cu b/cpp/benchmarks/join/join_benchmark.cu index a7c109db9b4..72d9b541232 100644 --- a/cpp/benchmarks/join/join_benchmark.cu +++ b/cpp/benchmarks/join/join_benchmark.cu @@ -14,121 +14,12 @@ * limitations under the License. */ -#include - -#include - -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include - -#include "generate_input_tables.cuh" +#include template class Join : public cudf::benchmark { }; -template -static void BM_join(benchmark::State& state, Join JoinFunc) -{ - const cudf::size_type build_table_size{(cudf::size_type)state.range(0)}; - const cudf::size_type probe_table_size{(cudf::size_type)state.range(1)}; - const cudf::size_type rand_max_val{build_table_size * 2}; - const double selectivity = 0.3; - const bool is_build_table_key_unique = true; - - // Generate build and probe tables - cudf::test::UniformRandomGenerator rand_gen(0, build_table_size); - auto build_random_null_mask = [&rand_gen](int size) { - if (Nullable) { - // roughly 25% nulls - auto validity = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - [&rand_gen](auto i) { return (rand_gen.generate() & 3) == 0; }); - return cudf::test::detail::make_null_mask(validity, validity + size); - } else { - return cudf::create_null_mask(size, cudf::mask_state::UNINITIALIZED); - } - }; - - std::unique_ptr build_key_column = [&]() { - return Nullable ? cudf::make_numeric_column(cudf::data_type(cudf::type_to_id()), - build_table_size, - build_random_null_mask(build_table_size)) - : cudf::make_numeric_column(cudf::data_type(cudf::type_to_id()), - build_table_size); - }(); - std::unique_ptr probe_key_column = [&]() { - return Nullable ? 
cudf::make_numeric_column(cudf::data_type(cudf::type_to_id()), - probe_table_size, - build_random_null_mask(probe_table_size)) - : cudf::make_numeric_column(cudf::data_type(cudf::type_to_id()), - probe_table_size); - }(); - - generate_input_tables( - build_key_column->mutable_view().data(), - build_table_size, - probe_key_column->mutable_view().data(), - probe_table_size, - selectivity, - rand_max_val, - is_build_table_key_unique); - - auto payload_data_it = thrust::make_counting_iterator(0); - cudf::test::fixed_width_column_wrapper build_payload_column( - payload_data_it, payload_data_it + build_table_size); - - cudf::test::fixed_width_column_wrapper probe_payload_column( - payload_data_it, payload_data_it + probe_table_size); - - CHECK_CUDA(0); - - cudf::table_view build_table({build_key_column->view(), build_payload_column}); - cudf::table_view probe_table({probe_key_column->view(), probe_payload_column}); - - // Setup join parameters and result table - - std::vector columns_to_join = {0}; - - // Benchmark the inner join operation - - for (auto _ : state) { - cuda_event_timer raii(state, true, rmm::cuda_stream_default); - - auto result = JoinFunc( - probe_table, build_table, columns_to_join, columns_to_join, cudf::null_equality::UNEQUAL); - } -} - -#define JOIN_BENCHMARK_DEFINE(name, key_type, payload_type, nullable) \ - BENCHMARK_TEMPLATE_DEFINE_F(Join, name, key_type, payload_type) \ - (::benchmark::State & st) \ - { \ - auto join = [](cudf::table_view const& left, \ - cudf::table_view const& right, \ - std::vector const& left_on, \ - std::vector const& right_on, \ - cudf::null_equality compare_nulls) { \ - return cudf::inner_join(left, right, left_on, right_on, compare_nulls); \ - }; \ - BM_join(st, join); \ - } - -JOIN_BENCHMARK_DEFINE(join_32bit, int32_t, int32_t, false); -JOIN_BENCHMARK_DEFINE(join_64bit, int64_t, int64_t, false); -JOIN_BENCHMARK_DEFINE(join_32bit_nulls, int32_t, int32_t, true); -JOIN_BENCHMARK_DEFINE(join_64bit_nulls, int64_t, int64_t, true); - #define LEFT_ANTI_JOIN_BENCHMARK_DEFINE(name, key_type, payload_type, nullable) \ BENCHMARK_TEMPLATE_DEFINE_F(Join, name, key_type, payload_type) \ (::benchmark::State & st) \ @@ -167,43 +58,6 @@ LEFT_SEMI_JOIN_BENCHMARK_DEFINE(left_semi_join_64bit, int64_t, int64_t, false); LEFT_SEMI_JOIN_BENCHMARK_DEFINE(left_semi_join_32bit_nulls, int32_t, int32_t, true); LEFT_SEMI_JOIN_BENCHMARK_DEFINE(left_semi_join_64bit_nulls, int64_t, int64_t, true); -// join ----------------------------------------------------------------------- -BENCHMARK_REGISTER_F(Join, join_32bit) - ->Unit(benchmark::kMillisecond) - ->Args({100'000, 100'000}) - ->Args({100'000, 400'000}) - ->Args({100'000, 1'000'000}) - ->Args({10'000'000, 10'000'000}) - ->Args({10'000'000, 40'000'000}) - ->Args({10'000'000, 100'000'000}) - ->Args({100'000'000, 100'000'000}) - ->Args({80'000'000, 240'000'000}) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(Join, join_64bit) - ->Unit(benchmark::kMillisecond) - ->Args({50'000'000, 50'000'000}) - ->Args({40'000'000, 120'000'000}) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(Join, join_32bit_nulls) - ->Unit(benchmark::kMillisecond) - ->Args({100'000, 100'000}) - ->Args({100'000, 400'000}) - ->Args({100'000, 1'000'000}) - ->Args({10'000'000, 10'000'000}) - ->Args({10'000'000, 40'000'000}) - ->Args({10'000'000, 100'000'000}) - ->Args({100'000'000, 100'000'000}) - ->Args({80'000'000, 240'000'000}) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(Join, join_64bit_nulls) - ->Unit(benchmark::kMillisecond) - ->Args({50'000'000, 50'000'000}) - 
->Args({40'000'000, 120'000'000}) - ->UseManualTime(); - // left anti-join ------------------------------------------------------------- BENCHMARK_REGISTER_F(Join, left_anti_join_32bit) ->Unit(benchmark::kMillisecond) diff --git a/cpp/benchmarks/join/join_benchmark_common.hpp b/cpp/benchmarks/join/join_benchmark_common.hpp new file mode 100644 index 00000000000..add87bf7dfb --- /dev/null +++ b/cpp/benchmarks/join/join_benchmark_common.hpp @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include "generate_input_tables.cuh" + +template +static void BM_join(state_type& state, Join JoinFunc) +{ + auto const build_table_size = [&]() { + if constexpr (std::is_same_v) { + return static_cast(state.range(0)); + } + if constexpr (std::is_same_v) { + return static_cast(state.get_int64("Build Table Size")); + } + }(); + auto const probe_table_size = [&]() { + if constexpr (std::is_same_v) { + return static_cast(state.range(1)); + } + if constexpr (std::is_same_v) { + return static_cast(state.get_int64("Probe Table Size")); + } + }(); + + const cudf::size_type rand_max_val{build_table_size * 2}; + const double selectivity = 0.3; + const bool is_build_table_key_unique = true; + + // Generate build and probe tables + cudf::test::UniformRandomGenerator rand_gen(0, build_table_size); + auto build_random_null_mask = [&rand_gen](int size) { + // roughly 25% nulls + auto validity = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + [&rand_gen](auto i) { return (rand_gen.generate() & 3) == 0; }); + return cudf::test::detail::make_null_mask(validity, validity + size); + }; + + std::unique_ptr build_key_column = [&]() { + return Nullable ? cudf::make_numeric_column(cudf::data_type(cudf::type_to_id()), + build_table_size, + build_random_null_mask(build_table_size)) + : cudf::make_numeric_column(cudf::data_type(cudf::type_to_id()), + build_table_size); + }(); + std::unique_ptr probe_key_column = [&]() { + return Nullable ? 
cudf::make_numeric_column(cudf::data_type(cudf::type_to_id()), + probe_table_size, + build_random_null_mask(probe_table_size)) + : cudf::make_numeric_column(cudf::data_type(cudf::type_to_id()), + probe_table_size); + }(); + + generate_input_tables( + build_key_column->mutable_view().data(), + build_table_size, + probe_key_column->mutable_view().data(), + probe_table_size, + selectivity, + rand_max_val, + is_build_table_key_unique); + + auto payload_data_it = thrust::make_counting_iterator(0); + cudf::test::fixed_width_column_wrapper build_payload_column( + payload_data_it, payload_data_it + build_table_size); + + cudf::test::fixed_width_column_wrapper probe_payload_column( + payload_data_it, payload_data_it + probe_table_size); + + CHECK_CUDA(0); + + cudf::table_view build_table({build_key_column->view(), build_payload_column}); + cudf::table_view probe_table({probe_key_column->view(), probe_payload_column}); + + // Setup join parameters and result table + [[maybe_unused]] std::vector columns_to_join = {0}; + + // Benchmark the inner join operation + if constexpr (std::is_same_v and (not is_conditional)) { + for (auto _ : state) { + cuda_event_timer raii(state, true, rmm::cuda_stream_default); + + auto result = JoinFunc( + probe_table, build_table, columns_to_join, columns_to_join, cudf::null_equality::UNEQUAL); + } + } + if constexpr (std::is_same_v and (not is_conditional)) { + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + rmm::cuda_stream_view stream_view{launch.get_stream()}; + JoinFunc(probe_table, + build_table, + columns_to_join, + columns_to_join, + cudf::null_equality::UNEQUAL, + stream_view); + }); + } + + // Benchmark conditional join + if constexpr (std::is_same_v and is_conditional) { + // Common column references. + const auto col_ref_left_0 = cudf::ast::column_reference(0); + const auto col_ref_right_0 = cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT); + auto left_zero_eq_right_zero = + cudf::ast::operation(cudf::ast::ast_operator::EQUAL, col_ref_left_0, col_ref_right_0); + + for (auto _ : state) { + cuda_event_timer raii(state, true, rmm::cuda_stream_default); + + auto result = + JoinFunc(probe_table, build_table, left_zero_eq_right_zero, cudf::null_equality::UNEQUAL); + } + } +} diff --git a/cpp/benchmarks/join/join_nvbench.cu b/cpp/benchmarks/join/join_nvbench.cu new file mode 100644 index 00000000000..ffb21d8594d --- /dev/null +++ b/cpp/benchmarks/join/join_nvbench.cu @@ -0,0 +1,217 @@ +/* + * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
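The compile-time dispatch in `BM_join` above is the key mechanism: a single benchmark body serves both Google Benchmark and nvbench by branching on the state type with `if constexpr`. A stripped-down sketch of the pattern (function and body names are hypothetical):

```c++
#include <benchmark/benchmark.h>
#include <nvbench/nvbench.cuh>

#include <type_traits>

// Sketch only: one benchmark body, two harnesses, selected at compile time.
template <typename state_type>
void run_benchmark(state_type& state)
{
  if constexpr (std::is_same_v<state_type, benchmark::State>) {
    for (auto _ : state) {
      // timed body, Google Benchmark style
    }
  }
  if constexpr (std::is_same_v<state_type, nvbench::state>) {
    state.exec([](nvbench::launch& launch) {
      // timed body, nvbench style; launch.get_stream() supplies the stream to use
    });
  }
}
```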
+ */ + +#include +#include + +void skip_helper(nvbench::state& state) +{ + auto const build_table_size = state.get_int64("Build Table Size"); + auto const probe_table_size = state.get_int64("Probe Table Size"); + + if (build_table_size > probe_table_size) { + state.skip("Large build tables are skipped."); + return; + } + + if (build_table_size * 100 <= probe_table_size) { + state.skip("Large probe tables are skipped."); + return; + } +} + +template +void nvbench_inner_join(nvbench::state& state, + nvbench::type_list>) +{ + skip_helper(state); + + // TODO: to be replaced by nvbench fixture once it's ready + cudf::rmm_pool_raii pool_raii; + + auto join = [](cudf::table_view const& left_input, + cudf::table_view const& right_input, + std::vector const& left_on, + std::vector const& right_on, + cudf::null_equality compare_nulls, + rmm::cuda_stream_view stream) { + cudf::hash_join hj_obj(left_input.select(left_on), compare_nulls, stream); + return hj_obj.inner_join(right_input.select(right_on), compare_nulls, std::nullopt, stream); + }; + + BM_join(state, join); +} + +template +void nvbench_left_join(nvbench::state& state, + nvbench::type_list>) +{ + skip_helper(state); + + // TODO: to be replaced by nvbench fixture once it's ready + cudf::rmm_pool_raii pool_raii; + + auto join = [](cudf::table_view const& left_input, + cudf::table_view const& right_input, + std::vector const& left_on, + std::vector const& right_on, + cudf::null_equality compare_nulls, + rmm::cuda_stream_view stream) { + cudf::hash_join hj_obj(left_input.select(left_on), compare_nulls, stream); + return hj_obj.left_join(right_input.select(right_on), compare_nulls, std::nullopt, stream); + }; + + BM_join(state, join); +} + +template +void nvbench_full_join(nvbench::state& state, + nvbench::type_list>) +{ + skip_helper(state); + + // TODO: to be replaced by nvbench fixture once it's ready + cudf::rmm_pool_raii pool_raii; + + auto join = [](cudf::table_view const& left_input, + cudf::table_view const& right_input, + std::vector const& left_on, + std::vector const& right_on, + cudf::null_equality compare_nulls, + rmm::cuda_stream_view stream) { + cudf::hash_join hj_obj(left_input.select(left_on), compare_nulls, stream); + return hj_obj.full_join(right_input.select(right_on), compare_nulls, std::nullopt, stream); + }; + + BM_join(state, join); +} + +// inner join ----------------------------------------------------------------------- +NVBENCH_BENCH_TYPES(nvbench_inner_join, + NVBENCH_TYPE_AXES(nvbench::type_list, + nvbench::type_list, + nvbench::enum_type_list)) + .set_name("inner_join_32bit") + .set_type_axes_names({"Key Type", "Payload Type", "Nullable"}) + .add_int64_axis("Build Table Size", {100'000, 10'000'000, 80'000'000, 100'000'000}) + .add_int64_axis("Probe Table Size", + {100'000, 400'000, 10'000'000, 40'000'000, 100'000'000, 240'000'000}); + +NVBENCH_BENCH_TYPES(nvbench_inner_join, + NVBENCH_TYPE_AXES(nvbench::type_list, + nvbench::type_list, + nvbench::enum_type_list)) + .set_name("inner_join_64bit") + .set_type_axes_names({"Key Type", "Payload Type", "Nullable"}) + .add_int64_axis("Build Table Size", {40'000'000, 50'000'000}) + .add_int64_axis("Probe Table Size", {50'000'000, 120'000'000}); + +NVBENCH_BENCH_TYPES(nvbench_inner_join, + NVBENCH_TYPE_AXES(nvbench::type_list, + nvbench::type_list, + nvbench::enum_type_list)) + .set_name("inner_join_32bit_nulls") + .set_type_axes_names({"Key Type", "Payload Type", "Nullable"}) + .add_int64_axis("Build Table Size", {100'000, 10'000'000, 80'000'000, 100'000'000}) + 
.add_int64_axis("Probe Table Size", + {100'000, 400'000, 10'000'000, 40'000'000, 100'000'000, 240'000'000}); + +NVBENCH_BENCH_TYPES(nvbench_inner_join, + NVBENCH_TYPE_AXES(nvbench::type_list, + nvbench::type_list, + nvbench::enum_type_list)) + .set_name("inner_join_64bit_nulls") + .set_type_axes_names({"Key Type", "Payload Type", "Nullable"}) + .add_int64_axis("Build Table Size", {40'000'000, 50'000'000}) + .add_int64_axis("Probe Table Size", {50'000'000, 120'000'000}); + +// left join ------------------------------------------------------------------------ +NVBENCH_BENCH_TYPES(nvbench_left_join, + NVBENCH_TYPE_AXES(nvbench::type_list, + nvbench::type_list, + nvbench::enum_type_list)) + .set_name("left_join_32bit") + .set_type_axes_names({"Key Type", "Payload Type", "Nullable"}) + .add_int64_axis("Build Table Size", {100'000, 10'000'000, 80'000'000, 100'000'000}) + .add_int64_axis("Probe Table Size", + {100'000, 400'000, 10'000'000, 40'000'000, 100'000'000, 240'000'000}); + +NVBENCH_BENCH_TYPES(nvbench_left_join, + NVBENCH_TYPE_AXES(nvbench::type_list, + nvbench::type_list, + nvbench::enum_type_list)) + .set_name("left_join_64bit") + .set_type_axes_names({"Key Type", "Payload Type", "Nullable"}) + .add_int64_axis("Build Table Size", {40'000'000, 50'000'000}) + .add_int64_axis("Probe Table Size", {50'000'000, 120'000'000}); + +NVBENCH_BENCH_TYPES(nvbench_left_join, + NVBENCH_TYPE_AXES(nvbench::type_list, + nvbench::type_list, + nvbench::enum_type_list)) + .set_name("left_join_32bit_nulls") + .set_type_axes_names({"Key Type", "Payload Type", "Nullable"}) + .add_int64_axis("Build Table Size", {100'000, 10'000'000, 80'000'000, 100'000'000}) + .add_int64_axis("Probe Table Size", + {100'000, 400'000, 10'000'000, 40'000'000, 100'000'000, 240'000'000}); + +NVBENCH_BENCH_TYPES(nvbench_left_join, + NVBENCH_TYPE_AXES(nvbench::type_list, + nvbench::type_list, + nvbench::enum_type_list)) + .set_name("left_join_64bit_nulls") + .set_type_axes_names({"Key Type", "Payload Type", "Nullable"}) + .add_int64_axis("Build Table Size", {40'000'000, 50'000'000}) + .add_int64_axis("Probe Table Size", {50'000'000, 120'000'000}); + +// full join ------------------------------------------------------------------------ +NVBENCH_BENCH_TYPES(nvbench_full_join, + NVBENCH_TYPE_AXES(nvbench::type_list, + nvbench::type_list, + nvbench::enum_type_list)) + .set_name("full_join_32bit") + .set_type_axes_names({"Key Type", "Payload Type", "Nullable"}) + .add_int64_axis("Build Table Size", {100'000, 10'000'000, 80'000'000, 100'000'000}) + .add_int64_axis("Probe Table Size", + {100'000, 400'000, 10'000'000, 40'000'000, 100'000'000, 240'000'000}); + +NVBENCH_BENCH_TYPES(nvbench_full_join, + NVBENCH_TYPE_AXES(nvbench::type_list, + nvbench::type_list, + nvbench::enum_type_list)) + .set_name("full_join_64bit") + .set_type_axes_names({"Key Type", "Payload Type", "Nullable"}) + .add_int64_axis("Build Table Size", {40'000'000, 50'000'000}) + .add_int64_axis("Probe Table Size", {50'000'000, 120'000'000}); + +NVBENCH_BENCH_TYPES(nvbench_full_join, + NVBENCH_TYPE_AXES(nvbench::type_list, + nvbench::type_list, + nvbench::enum_type_list)) + .set_name("full_join_32bit_nulls") + .set_type_axes_names({"Key Type", "Payload Type", "Nullable"}) + .add_int64_axis("Build Table Size", {100'000, 10'000'000, 80'000'000, 100'000'000}) + .add_int64_axis("Probe Table Size", + {100'000, 400'000, 10'000'000, 40'000'000, 100'000'000, 240'000'000}); + +NVBENCH_BENCH_TYPES(nvbench_full_join, + NVBENCH_TYPE_AXES(nvbench::type_list, + nvbench::type_list, + 
nvbench::enum_type_list)) + .set_name("full_join_64bit_nulls") + .set_type_axes_names({"Key Type", "Payload Type", "Nullable"}) + .add_int64_axis("Build Table Size", {40'000'000, 50'000'000}) + .add_int64_axis("Probe Table Size", {50'000'000, 120'000'000}); diff --git a/cpp/cmake/cudf-build-config.cmake.in b/cpp/cmake/cudf-build-config.cmake.in index 9b6dd24069a..4b5ad8ebb8d 100644 --- a/cpp/cmake/cudf-build-config.cmake.in +++ b/cpp/cmake/cudf-build-config.cmake.in @@ -61,6 +61,9 @@ else() if (NOT DEFINED CUDF_ENABLE_ARROW_S3) set(CUDF_ENABLE_ARROW_S3 OFF) endif() + if (NOT DEFINED CUDF_ENABLE_ARROW_ORC) + set(CUDF_ENABLE_ARROW_ORC OFF) + endif() if (NOT DEFINED CUDF_ENABLE_ARROW_PYTHON) set(CUDF_ENABLE_ARROW_PYTHON OFF) endif() diff --git a/cpp/cmake/thirdparty/CUDF_GetArrow.cmake b/cpp/cmake/thirdparty/CUDF_GetArrow.cmake index 8cef3e8b9d0..38a5d8da44a 100644 --- a/cpp/cmake/thirdparty/CUDF_GetArrow.cmake +++ b/cpp/cmake/thirdparty/CUDF_GetArrow.cmake @@ -14,7 +14,25 @@ # limitations under the License. #============================================================================= -function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_PYTHON ENABLE_PARQUET) +function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENABLE_PYTHON ENABLE_PARQUET) + + if(BUILD_STATIC) + if(TARGET arrow_static AND TARGET arrow_cuda_static) + list(APPEND ARROW_LIBRARIES arrow_static) + list(APPEND ARROW_LIBRARIES arrow_cuda_static) + set(ARROW_FOUND TRUE PARENT_SCOPE) + set(ARROW_LIBRARIES ${ARROW_LIBRARIES} PARENT_SCOPE) + return() + endif() + else() + if(TARGET arrow_shared AND TARGET arrow_cuda_shared) + list(APPEND ARROW_LIBRARIES arrow_shared) + list(APPEND ARROW_LIBRARIES arrow_cuda_shared) + set(ARROW_FOUND TRUE PARENT_SCOPE) + set(ARROW_LIBRARIES ${ARROW_LIBRARIES} PARENT_SCOPE) + return() + endif() + endif() set(ARROW_BUILD_SHARED ON) set(ARROW_BUILD_STATIC OFF) @@ -40,12 +58,8 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_PYTHON E list(APPEND ARROW_PYTHON_OPTIONS "ARROW_PYTHON ON") # Arrow's logic to build Boost from source is busted, so we have to get it from the system. list(APPEND ARROW_PYTHON_OPTIONS "BOOST_SOURCE SYSTEM") - # Arrow's logic to find Thrift is busted, so we have to build it from - # source. Why can't we use `THRIFT_SOURCE BUNDLED` you might ask? - # Because that's _also_ busted. The only thing that seems to is to set - # _all_ dependencies to bundled, then optionall un-set BOOST_SOURCE to - # SYSTEM. - list(APPEND ARROW_PYTHON_OPTIONS "ARROW_DEPENDENCY_SOURCE BUNDLED") + list(APPEND ARROW_PYTHON_OPTIONS "Thrift_SOURCE BUNDLED") + list(APPEND ARROW_PYTHON_OPTIONS "ARROW_DEPENDENCY_SOURCE AUTO") endif() # Set this so Arrow correctly finds the CUDA toolkit when the build machine @@ -68,6 +82,7 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_PYTHON E "ARROW_CXXFLAGS -w" "ARROW_JEMALLOC OFF" "ARROW_S3 ${ENABLE_S3}" + "ARROW_ORC ${ENABLE_ORC}" # e.g. 
needed by blazingsql-io "ARROW_PARQUET ${ENABLE_PARQUET}" ${ARROW_PYTHON_OPTIONS} @@ -144,14 +159,31 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_PYTHON E set(ARROW_FOUND "${ARROW_FOUND}" PARENT_SCOPE) set(ARROW_LIBRARIES "${ARROW_LIBRARIES}" PARENT_SCOPE) + if(TARGET arrow_shared) + get_target_property(arrow_is_imported arrow_shared IMPORTED) + if(NOT arrow_is_imported) + export(TARGETS arrow_shared arrow_cuda_shared + FILE ${CUDF_BINARY_DIR}/cudf-arrow-targets.cmake + NAMESPACE cudf::) + endif() + elseif(TARGET arrow_static) + get_target_property(arrow_is_imported arrow_static IMPORTED) + if(NOT arrow_is_imported) + export(TARGETS arrow_static arrow_cuda_static + FILE ${CUDF_BINARY_DIR}/cudf-arrow-targets.cmake + NAMESPACE cudf::) + endif() + endif() + endfunction() -set(CUDF_VERSION_Arrow 4.0.1) +set(CUDF_VERSION_Arrow 5.0.0) find_and_configure_arrow( ${CUDF_VERSION_Arrow} ${CUDF_USE_ARROW_STATIC} ${CUDF_ENABLE_ARROW_S3} + ${CUDF_ENABLE_ARROW_ORC} ${CUDF_ENABLE_ARROW_PYTHON} ${CUDF_ENABLE_ARROW_PARQUET} ) diff --git a/cpp/cmake/thirdparty/CUDF_GetcuCollections.cmake b/cpp/cmake/thirdparty/CUDF_GetcuCollections.cmake new file mode 100644 index 00000000000..73717249585 --- /dev/null +++ b/cpp/cmake/thirdparty/CUDF_GetcuCollections.cmake @@ -0,0 +1,38 @@ +#============================================================================= +# Copyright (c) 2021, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#============================================================================= + +function(find_and_configure_cucollections) + + if(TARGET cuco::cuco) + return() + endif() + + # Find or install cuCollections + CPMFindPackage(NAME cuco + GITHUB_REPOSITORY NVIDIA/cuCollections + GIT_TAG 0d602ae21ea4f38d23ed816aa948453d97b2ee4e + OPTIONS "BUILD_TESTS OFF" + "BUILD_BENCHMARKS OFF" + "BUILD_EXAMPLES OFF" + ) + + set(CUCO_INCLUDE_DIR "${cuco_SOURCE_DIR}/include" PARENT_SCOPE) + + # Make sure consumers of cudf can also see cuco::cuco target + fix_cmake_global_defaults(cuco::cuco) +endfunction() + +find_and_configure_cucollections() diff --git a/cpp/docs/DEVELOPER_GUIDE.md b/cpp/docs/DEVELOPER_GUIDE.md index 9ec64060847..1da2d43cf6c 100644 --- a/cpp/docs/DEVELOPER_GUIDE.md +++ b/cpp/docs/DEVELOPER_GUIDE.md @@ -144,6 +144,16 @@ The following guidelines apply to organizing `#include` lines. * Always check that includes are only necessary for the file in which they are included. Try to avoid excessive including especially in header files. Double check this when you remove code. + * Use quotes `"` to include local headers from the same relative source directory. This should only + occur in source files and non-public header files. Otherwise use angle brackets `<>` around + included header filenames. + * Avoid relative paths with `..` when possible. Paths with `..` are necessary when including + (internal) headers from source paths not in the same directory as the including file, + because source paths are not passed with `-I`. 
+ * Avoid including library internal headers from non-internal files. For example, try not to include + headers from libcudf `src` directories in tests or in libcudf public headers. If you find + yourself doing this, start a discussion about moving (parts of) the included internal header + to a public header. # libcudf Data Structures @@ -246,7 +256,31 @@ An *immutable*, non-owning view of a table. ### `cudf::mutable_table_view` -A *mutable*, non-owning view of a table. +A *mutable*, non-owning view of a table. + +## Spans + +libcudf provides `span` classes that mimic C++20 `std::span`, which is a lightweight +view of a contiguous sequence of objects. libcudf provides two classes, `host_span` and +`device_span`, which can be constructed from multiple container types, or from a pointer +(host or device, respectively) and size, or from iterators. `span` types are useful for defining +generic (internal) interfaces which work with multiple input container types. `device_span` can be +constructed from `thrust::device_vector`, `rmm::device_vector`, or `rmm::device_uvector`. +`host_span` can be constructed from `thrust::host_vector`, `std::vector`, or `std::basic_string`. + +If you are defining internal (detail) functions that operate on vectors, use spans for the input +vector parameters rather than a specific vector type, to make your functions more widely applicable. + +When a `span` refers to immutable elements, use `span`, not `span const`. Since a span +is a lightweight view, it does not propagate `const`-ness. Therefore, `const` should be applied to +the template type parameter, not to the `span` itself. Also, `span` should be passed by value +because it is a lightweight view. APIs in libcudf that take spans as input will look like the +following function that copies device data to a host `std::vector`. + +```c++ +template +std::vector make_std_vector_async(device_span v, rmm::cuda_stream_view stream) +``` ## `cudf::scalar` diff --git a/cpp/examples/basic/src/process_csv.cpp b/cpp/examples/basic/src/process_csv.cpp index 2467c97393b..cd469af0036 100644 --- a/cpp/examples/basic/src/process_csv.cpp +++ b/cpp/examples/basic/src/process_csv.cpp @@ -25,7 +25,7 @@ void write_csv(cudf::table_view const& tbl_view, std::string const& file_path) } std::vector make_single_aggregation_request( - std::unique_ptr&& agg, cudf::column_view value) + std::unique_ptr&& agg, cudf::column_view value) { std::vector requests; requests.emplace_back(cudf::groupby::aggregation_request()); @@ -42,7 +42,7 @@ std::unique_ptr average_closing_price(cudf::table_view stock_info_t // Compute the average of each company's closing price with entire column cudf::groupby::groupby grpby_obj(keys); - auto requests = make_single_aggregation_request(cudf::make_mean_aggregation(), val); + auto requests = make_single_aggregation_request(cudf::make_mean_aggregation(), val); auto agg_results = grpby_obj.aggregate(requests); diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp index 7ac3638b21c..ff665e2706a 100644 --- a/cpp/include/cudf/aggregation.hpp +++ b/cpp/include/cudf/aggregation.hpp @@ -106,8 +106,7 @@ class aggregation { }; /** - * @brief Derived class intended for enforcing operation-specific restrictions - * when calling various cudf functions. + * @brief Derived class intended for rolling_window specific aggregation usage.
* * As an example, rolling_window will only accept rolling_aggregation inputs, * and the appropriate derived classes (sum_aggregation, mean_aggregation, etc) @@ -121,6 +120,28 @@ class rolling_aggregation : public virtual aggregation { rolling_aggregation() {} }; +/** + * @brief Derived class intended for groupby specific aggregation usage. + */ +class groupby_aggregation : public virtual aggregation { + public: + ~groupby_aggregation() = default; + + protected: + groupby_aggregation() {} +}; + +/** + * @brief Derived class intended for groupby specific scan usage. + */ +class groupby_scan_aggregation : public virtual aggregation { + public: + ~groupby_scan_aggregation() = default; + + protected: + groupby_scan_aggregation() {} +}; + enum class udf_type : bool { CUDA, PTX }; /// Factory to create a SUM aggregation diff --git a/cpp/include/cudf/ast/detail/transform.cuh b/cpp/include/cudf/ast/detail/expression_evaluator.cuh similarity index 68% rename from cpp/include/cudf/ast/detail/transform.cuh rename to cpp/include/cudf/ast/detail/expression_evaluator.cuh index 89fa7d31980..fb198761115 100644 --- a/cpp/include/cudf/ast/detail/transform.cuh +++ b/cpp/include/cudf/ast/detail/expression_evaluator.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,10 +15,9 @@ */ #pragma once -#include +#include #include -#include -#include +#include #include #include #include @@ -39,28 +38,6 @@ namespace ast { namespace detail { -// Type trait for wrapping nullable types in a thrust::optional. Non-nullable -// types are returned as is. -template -struct possibly_null_value; - -template -struct possibly_null_value { - using type = thrust::optional; -}; - -template -struct possibly_null_value { - using type = T; -}; - -template -using possibly_null_value_t = typename possibly_null_value::type; - -// Type used for intermediate storage in expression evaluation. -template -using IntermediateDataType = possibly_null_value_t; - /** * @brief A container for capturing the output of an evaluated expression. * @@ -140,14 +117,15 @@ struct value_expression_result /** * @brief Returns the underlying data. * - * @throws thrust::bad_optional_access if the underlying data is not valid. + * If the underlying data is not valid, behavior is undefined. Callers should + * use is_valid to check for validity before accessing the value. */ __device__ T value() const { // Using two separate constexprs silences compiler warnings, whereas an // if/else does not. An unconditional return is not ignored by the compiler // when has_nulls is true and therefore raises a compiler error. - if constexpr (has_nulls) { return _obj.value(); } + if constexpr (has_nulls) { return *_obj; } if constexpr (!has_nulls) { return _obj; } } @@ -214,147 +192,30 @@ struct mutable_column_expression_result }; /** - * @brief A container of all device data required to evaluate an expression on tables. - * - * This struct should never be instantiated directly. It is created by the - * `ast_plan` on construction, and the resulting member is publicly accessible - * for passing to kernels for constructing an `expression_evaluator`. + * @brief Dispatch to a binary operator based on a single data type. * + * This functor is a dispatcher for binary operations that assumes that both + * operands are of the same type. 
This assumption is encoded in the + * non-deducible template parameter LHS, the type of the left-hand operand, + * which is then used as the template parameter for both the left and right + * operands to the binary operator f. */ -struct device_ast_plan { - device_span data_references; - device_span literals; - device_span operators; - device_span operator_source_indices; - cudf::size_type num_intermediates; - int shmem_per_thread; -}; - -/** - * @brief Preprocessor for an expression acting on tables to generate data suitable for AST - * expression evaluation on the GPU. - * - * On construction, an AST plan creates a single "packed" host buffer of all - * data arrays that will be necessary to evaluate an expression on a pair of - * tables. This data is copied to a single contiguous device buffer, and - * pointers are generated to the individual components. Because the plan tends - * to be small, this is the most efficient approach for low latency. All the - * data required on the GPU can be accessed via the convenient `dev_plan` - * member struct, which can be used to construct an `expression_evaluator` on - * the device. - * - * Note that the resulting device data cannot be used once this class goes out of scope. - */ -struct ast_plan { - /** - * @brief Construct an AST plan for an expression operating on two tables. - * - * @param expr The expression for which to construct a plan. - * @param left The left table on which the expression acts. - * @param right The right table on which the expression acts. - * @param has_nulls Boolean indicator of whether or not the data contains nulls. - * @param stream Stream view on which to allocate resources and queue execution. - * @param mr Device memory resource used to allocate the returned column's device. - */ - ast_plan(detail::node const& expr, - cudf::table_view left, - cudf::table_view right, - bool has_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - : _linearizer(expr, left, right) - { - std::vector sizes; - std::vector data_pointers; - - extract_size_and_pointer(_linearizer.data_references(), sizes, data_pointers); - extract_size_and_pointer(_linearizer.literals(), sizes, data_pointers); - extract_size_and_pointer(_linearizer.operators(), sizes, data_pointers); - extract_size_and_pointer(_linearizer.operator_source_indices(), sizes, data_pointers); - - // Create device buffer - auto const buffer_size = std::accumulate(sizes.cbegin(), sizes.cend(), 0); - auto buffer_offsets = std::vector(sizes.size()); - thrust::exclusive_scan(sizes.cbegin(), sizes.cend(), buffer_offsets.begin(), 0); - - auto h_data_buffer = std::make_unique(buffer_size); - for (unsigned int i = 0; i < data_pointers.size(); ++i) { - std::memcpy(h_data_buffer.get() + buffer_offsets[i], data_pointers[i], sizes[i]); - } - - _device_data_buffer = rmm::device_buffer(h_data_buffer.get(), buffer_size, stream, mr); - - stream.synchronize(); - - // Create device pointers to components of plan - auto device_data_buffer_ptr = static_cast(_device_data_buffer.data()); - dev_plan.data_references = device_span( - reinterpret_cast(device_data_buffer_ptr + - buffer_offsets[0]), - _linearizer.data_references().size()); - dev_plan.literals = device_span( - reinterpret_cast( - device_data_buffer_ptr + buffer_offsets[1]), - _linearizer.literals().size()); - dev_plan.operators = device_span( - reinterpret_cast(device_data_buffer_ptr + buffer_offsets[2]), - _linearizer.operators().size()); - dev_plan.operator_source_indices = device_span( - 
reinterpret_cast(device_data_buffer_ptr + buffer_offsets[3]), - _linearizer.operator_source_indices().size()); - dev_plan.num_intermediates = _linearizer.intermediate_count(); - dev_plan.shmem_per_thread = static_cast( - (has_nulls ? sizeof(IntermediateDataType) : sizeof(IntermediateDataType)) * - dev_plan.num_intermediates); - } - +struct single_dispatch_binary_operator { /** - * @brief Construct an AST plan for an expression operating on one table. + * @brief Single-type dispatch to a binary operation. * - * @param expr The expression for which to construct a plan. - * @param table The table on which the expression acts. - * @param has_nulls Boolean indicator of whether or not the data contains nulls. - * @param stream Stream view on which to allocate resources and queue execution. - * @param mr Device memory resource used to allocate the returned column's device. - */ - ast_plan(detail::node const& expr, - cudf::table_view table, - bool has_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - : ast_plan(expr, table, table, has_nulls, stream, mr) - { - } - - cudf::data_type output_type() const { return _linearizer.root_data_type(); } - - device_ast_plan - dev_plan; ///< The collection of data required to evaluate the expression on the device. - - private: - /** - * @brief Helper function for adding components (operators, literals, etc) to AST plan + * @tparam LHS Left input type. + * @tparam F Type of forwarded binary operator functor. + * @tparam Ts Parameter pack of forwarded arguments. * - * @tparam T The underlying type of the input `std::vector` - * @param[in] v The `std::vector` containing components (operators, literals, etc). - * @param[in,out] sizes The `std::vector` containing the size of each data buffer. - * @param[in,out] data_pointers The `std::vector` containing pointers to each data buffer. + * @param f Binary operator functor. + * @param args Forwarded arguments to `operator()` of `f`. */ - template - void extract_size_and_pointer(std::vector const& v, - std::vector& sizes, - std::vector& data_pointers) + template + CUDA_DEVICE_CALLABLE auto operator()(F&& f, Ts&&... args) { - auto const data_size = sizeof(T) * v.size(); - sizes.push_back(data_size); - data_pointers.push_back(v.data()); + f.template operator()(std::forward(args)...); } - - rmm::device_buffer - _device_data_buffer; ///< The device-side data buffer containing the plan information, which is - ///< owned by this class and persists until it is destroyed. - linearizer const _linearizer; ///< The linearizer created from the provided expression that is - ///< used to construct device-side operators and references. }; /** @@ -379,7 +240,7 @@ struct expression_evaluator { */ __device__ expression_evaluator(table_device_view const& left, table_device_view const& right, - device_ast_plan const& plan, + expression_device_view const& plan, IntermediateDataType* thread_intermediate_storage, null_equality compare_nulls = null_equality::EQUAL) : left(left), @@ -400,7 +261,7 @@ struct expression_evaluator { * @param compare_nulls Whether the equality operator returns true or false for two nulls. 
*/ __device__ expression_evaluator(table_device_view const& table, - device_ast_plan const& plan, + expression_device_view const& plan, IntermediateDataType* thread_intermediate_storage, null_equality compare_nulls = null_equality::EQUAL) : left(table), @@ -426,17 +287,26 @@ struct expression_evaluator { */ template ())> __device__ possibly_null_value_t resolve_input( - detail::device_data_reference device_data_reference, cudf::size_type row_index) const + detail::device_data_reference device_data_reference, + cudf::size_type left_row_index, + thrust::optional right_row_index = {}) const { auto const data_index = device_data_reference.data_index; auto const ref_type = device_data_reference.reference_type; // TODO: Everywhere in the code assumes that the table reference is either // left or right. Should we error-check somewhere to prevent // table_reference::OUTPUT from being specified? - auto const& table = device_data_reference.table_source == table_reference::LEFT ? left : right; - using ReturnType = possibly_null_value_t; + using ReturnType = possibly_null_value_t; if (ref_type == detail::device_data_reference_type::COLUMN) { // If we have nullable data, return an empty nullable type with no value if the data is null. + auto const& table = + (device_data_reference.table_source == table_reference::LEFT) ? left : right; + // Note that the code below assumes that a right index has been passed in + // any case where device_data_reference.table_source == table_reference::RIGHT. + // Otherwise, behavior is undefined. + auto const row_index = (device_data_reference.table_source == table_reference::LEFT) + ? left_row_index + : *right_row_index; if constexpr (has_nulls) { return table.column(data_index).is_valid(row_index) ? ReturnType(table.column(data_index).element(row_index)) @@ -462,7 +332,9 @@ struct expression_evaluator { template ())> __device__ possibly_null_value_t resolve_input( - detail::device_data_reference device_data_reference, cudf::size_type row_index) const + detail::device_data_reference device_data_reference, + cudf::size_type left_row_index, + thrust::optional right_row_index = {}) const { cudf_assert(false && "Unsupported type in resolve_input."); // Unreachable return used to silence compiler warnings. 
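For orientation, the public entry point that ultimately drives this evaluator is the pairing already visible in the benchmark changes earlier in the diff: expressions are built from `cudf::ast::operation` and evaluated with `cudf::compute_column`. A hedged sketch (header locations are assumed):

```c++
#include <cudf/ast/expressions.hpp>
#include <cudf/column/column.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/transform.hpp>

#include <memory>

// Sketch only: evaluate (col0 + col1) over a table through the AST interface.
std::unique_ptr<cudf::column> add_first_two_columns(cudf::table_view table)
{
  auto const col0 = cudf::ast::column_reference(0);
  auto const col1 = cudf::ast::column_reference(1);
  auto const sum  = cudf::ast::operation(cudf::ast::ast_operator::ADD, col0, col1);
  return cudf::compute_column(table, sum);
}
```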
@@ -484,11 +356,11 @@ struct expression_evaluator { */ template __device__ void operator()(OutputType& output_object, - const cudf::size_type input_row_index, - const detail::device_data_reference input, - const detail::device_data_reference output, - const cudf::size_type output_row_index, - const ast_operator op) const + cudf::size_type const input_row_index, + detail::device_data_reference const input, + detail::device_data_reference const output, + cudf::size_type const output_row_index, + ast_operator const op) const { auto const typed_input = resolve_input(input, input_row_index); ast_operator_dispatcher(op, @@ -517,16 +389,16 @@ struct expression_evaluator { */ template __device__ void operator()(OutputType& output_object, - const cudf::size_type left_row_index, - const cudf::size_type right_row_index, - const detail::device_data_reference lhs, - const detail::device_data_reference rhs, - const detail::device_data_reference output, - const cudf::size_type output_row_index, - const ast_operator op) const + cudf::size_type const left_row_index, + cudf::size_type const right_row_index, + detail::device_data_reference const lhs, + detail::device_data_reference const rhs, + detail::device_data_reference const output, + cudf::size_type const output_row_index, + ast_operator const op) const { - auto const typed_lhs = resolve_input(lhs, left_row_index); - auto const typed_rhs = resolve_input(rhs, right_row_index); + auto const typed_lhs = resolve_input(lhs, left_row_index, right_row_index); + auto const typed_rhs = resolve_input(rhs, left_row_index, right_row_index); ast_operator_dispatcher(op, binary_expression_output_handler(*this), output_object, @@ -544,11 +416,11 @@ struct expression_evaluator { __device__ void operator()(OutputType& output_object, cudf::size_type left_row_index, cudf::size_type right_row_index, - const detail::device_data_reference lhs, - const detail::device_data_reference rhs, - const detail::device_data_reference output, + detail::device_data_reference const lhs, + detail::device_data_reference const rhs, + detail::device_data_reference const output, cudf::size_type output_row_index, - const ast_operator op) const + ast_operator const op) const { cudf_assert(false && "Invalid binary dispatch operator for the provided input."); } @@ -587,19 +459,18 @@ struct expression_evaluator { cudf::size_type const right_row_index, cudf::size_type const output_row_index) { - auto operator_source_index = static_cast(0); + cudf::size_type operator_source_index{0}; for (cudf::size_type operator_index = 0; operator_index < plan.operators.size(); - operator_index++) { + ++operator_index) { // Execute operator auto const op = plan.operators[operator_index]; auto const arity = ast_operator_arity(op); if (arity == 1) { // Unary operator auto const input = - plan.data_references[plan.operator_source_indices[operator_source_index]]; + plan.data_references[plan.operator_source_indices[operator_source_index++]]; auto const output = - plan.data_references[plan.operator_source_indices[operator_source_index + 1]]; - operator_source_index += arity + 1; + plan.data_references[plan.operator_source_indices[operator_source_index++]]; auto input_row_index = input.table_source == table_reference::LEFT ? 
left_row_index : right_row_index; type_dispatcher(input.data_type, @@ -612,12 +483,12 @@ struct expression_evaluator { op); } else if (arity == 2) { // Binary operator - auto const lhs = plan.data_references[plan.operator_source_indices[operator_source_index]]; + auto const lhs = + plan.data_references[plan.operator_source_indices[operator_source_index++]]; auto const rhs = - plan.data_references[plan.operator_source_indices[operator_source_index + 1]]; + plan.data_references[plan.operator_source_indices[operator_source_index++]]; auto const output = - plan.data_references[plan.operator_source_indices[operator_source_index + 2]]; - operator_source_index += arity + 1; + plan.data_references[plan.operator_source_indices[operator_source_index++]]; type_dispatcher(lhs.data_type, detail::single_dispatch_binary_operator{}, *this, @@ -670,9 +541,9 @@ struct expression_evaluator { typename OutputType, CUDF_ENABLE_IF(is_rep_layout_compatible())> __device__ void resolve_output(OutputType& output_object, - const detail::device_data_reference device_data_reference, - const cudf::size_type row_index, - const possibly_null_value_t result) const + detail::device_data_reference const device_data_reference, + cudf::size_type const row_index, + possibly_null_value_t const result) const { auto const ref_type = device_data_reference.reference_type; if (ref_type == detail::device_data_reference_type::COLUMN) { @@ -690,9 +561,9 @@ struct expression_evaluator { typename OutputType, CUDF_ENABLE_IF(not is_rep_layout_compatible())> __device__ void resolve_output(OutputType& output_object, - const detail::device_data_reference device_data_reference, - const cudf::size_type row_index, - const possibly_null_value_t result) const + detail::device_data_reference const device_data_reference, + cudf::size_type const row_index, + possibly_null_value_t const result) const { cudf_assert(false && "Invalid type in resolve_output."); } @@ -730,9 +601,9 @@ struct expression_evaluator { typename OutputType, std::enable_if_t, Input>>* = nullptr> __device__ void operator()(OutputType& output_object, - const cudf::size_type output_row_index, - const possibly_null_value_t input, - const detail::device_data_reference output) const + cudf::size_type const output_row_index, + possibly_null_value_t const input, + detail::device_data_reference const output) const { using OperatorFunctor = detail::operator_functor; using Out = cuda::std::invoke_result_t; @@ -752,9 +623,9 @@ struct expression_evaluator { typename OutputType, std::enable_if_t, Input>>* = nullptr> __device__ void operator()(OutputType& output_object, - const cudf::size_type output_row_index, - const possibly_null_value_t input, - const detail::device_data_reference output) const + cudf::size_type const output_row_index, + possibly_null_value_t const input, + detail::device_data_reference const output) const { cudf_assert(false && "Invalid unary dispatch operator for the provided input."); } @@ -790,10 +661,10 @@ struct expression_evaluator { std::enable_if_t< detail::is_valid_binary_op, LHS, RHS>>* = nullptr> __device__ void operator()(OutputType& output_object, - const cudf::size_type output_row_index, - const possibly_null_value_t lhs, - const possibly_null_value_t rhs, - const detail::device_data_reference output) const + cudf::size_type const output_row_index, + possibly_null_value_t const lhs, + possibly_null_value_t const rhs, + detail::device_data_reference const output) const { using OperatorFunctor = detail::operator_functor; using Out = cuda::std::invoke_result_t; 
@@ -832,10 +703,10 @@ struct expression_evaluator { std::enable_if_t< !detail::is_valid_binary_op, LHS, RHS>>* = nullptr> __device__ void operator()(OutputType& output_object, - const cudf::size_type output_row_index, - const possibly_null_value_t lhs, - const possibly_null_value_t rhs, - const detail::device_data_reference output) const + cudf::size_type const output_row_index, + possibly_null_value_t const lhs, + possibly_null_value_t const rhs, + detail::device_data_reference output) const { cudf_assert(false && "Invalid binary dispatch operator for the provided input."); } @@ -843,7 +714,7 @@ struct expression_evaluator { table_device_view const& left; ///< The left table to operate on. table_device_view const& right; ///< The right table to operate on. - device_ast_plan const& + expression_device_view const& plan; ///< The container of device data representing the expression to evaluate. IntermediateDataType* thread_intermediate_storage; ///< The shared memory store of intermediates produced during @@ -852,23 +723,6 @@ struct expression_evaluator { compare_nulls; ///< Whether the equality operator returns true or false for two nulls. }; -/** - * @brief Compute a new column by evaluating an expression tree on a table. - * - * This evaluates an expression over a table to produce a new column. Also called an n-ary - * transform. - * - * @param table The table used for expression evaluation. - * @param expr The root of the expression tree. - * @param stream Stream on which to perform the computation. - * @param mr Device memory resource. - * @return std::unique_ptr Output column. - */ -std::unique_ptr compute_column( - table_view const table, - expression const& expr, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail } // namespace ast diff --git a/cpp/include/cudf/ast/detail/expression_parser.hpp b/cpp/include/cudf/ast/detail/expression_parser.hpp new file mode 100644 index 00000000000..1f35b54ea61 --- /dev/null +++ b/cpp/include/cudf/ast/detail/expression_parser.hpp @@ -0,0 +1,344 @@ +/* + * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include + +#include + +#include +#include +#include + +namespace cudf { +namespace ast { +namespace detail { + +/** + * @brief Node data reference types. + * + * This enum is device-specific. For instance, intermediate data references are generated by the + * linearization process but cannot be explicitly created by the user. + */ +enum class device_data_reference_type { + COLUMN, // A value in a table column + LITERAL, // A literal value + INTERMEDIATE // An internal temporary value +}; + +/** + * @brief A device data reference describes a source of data used by a expression. + * + * This is a POD class used to create references describing data type and locations for consumption + * by the `row_evaluator`. 
+ */ +struct alignas(8) device_data_reference { + device_data_reference(device_data_reference_type reference_type, + cudf::data_type data_type, + cudf::size_type data_index, + table_reference table_source); + + device_data_reference(device_data_reference_type reference_type, + cudf::data_type data_type, + cudf::size_type data_index); + + device_data_reference_type const reference_type; // Source of data + cudf::data_type const data_type; // Type of data + cudf::size_type const data_index; // The column index of a table, index of a + // literal, or index of an intermediate + table_reference const table_source; + + bool operator==(device_data_reference const& rhs) const + { + return std::tie(data_index, reference_type, table_source) == + std::tie(rhs.data_index, rhs.reference_type, rhs.table_source); + } +}; + +// Type trait for wrapping nullable types in a thrust::optional. Non-nullable +// types are returned as is. +template +struct possibly_null_value; + +template +struct possibly_null_value { + using type = thrust::optional; +}; + +template +struct possibly_null_value { + using type = T; +}; + +template +using possibly_null_value_t = typename possibly_null_value::type; + +// Type used for intermediate storage in expression evaluation. +template +using IntermediateDataType = possibly_null_value_t; + +/** + * @brief A container of all device data required to evaluate an expression on tables. + * + * This struct should never be instantiated directly. It is created by the + * `expression_parser` on construction, and the resulting member is publicly accessible + * for passing to kernels for constructing an `expression_evaluator`. + * + */ +struct expression_device_view { + device_span data_references; + device_span literals; + device_span operators; + device_span operator_source_indices; + cudf::size_type num_intermediates; + int shmem_per_thread; +}; + +/** + * @brief The expression_parser traverses an expression and converts it into a form suitable for + * execution on the device. + * + * This class is part of a "visitor" pattern with the `expression` class. + * + * This class does pre-processing work on the host, validating operators and operand data types. It + * traverses downward from a root expression in a depth-first fashion, capturing information about + * the expressions and constructing vectors of information that are later used by the device for + * evaluating the abstract syntax tree as a "linear" list of operators whose input dependencies are + * resolved into intermediate data storage in shared memory. + */ +class expression_parser { + public: + /** + * @brief Construct a new expression_parser object + * + * @param expr The expression to create an evaluable expression_parser for. + * @param left The left table used for evaluating the abstract syntax tree. + * @param right The right table used for evaluating the abstract syntax tree. + */ + expression_parser(expression const& expr, + cudf::table_view const& left, + std::optional> right, + bool has_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + : _left{left}, + _right{right}, + _expression_count{0}, + _intermediate_counter{}, + _has_nulls(has_nulls) + { + expr.accept(*this); + move_to_device(stream, mr); + } + + /** + * @brief Construct a new expression_parser object + * + * @param expr The expression to create an evaluable expression_parser for. + * @param table The table used for evaluating the abstract syntax tree. 
+ */ + expression_parser(expression const& expr, + cudf::table_view const& table, + bool has_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + : expression_parser(expr, table, {}, has_nulls, stream, mr) + { + } + + /** + * @brief Get the root data type of the abstract syntax tree. + * + * @return cudf::data_type + */ + cudf::data_type output_type() const; + + /** + * @brief Visit a literal expression. + * + * @param expr Literal expression. + * @return cudf::size_type Index of device data reference for the expression. + */ + cudf::size_type visit(literal const& expr); + + /** + * @brief Visit a column reference expression. + * + * @param expr Column reference expression. + * @return cudf::size_type Index of device data reference for the expression. + */ + cudf::size_type visit(column_reference const& expr); + + /** + * @brief Visit an expression expression. + * + * @param expr Expression expression. + * @return cudf::size_type Index of device data reference for the expression. + */ + cudf::size_type visit(operation const& expr); + + /** + * @brief Internal class used to track the utilization of intermediate storage locations. + * + * As expressions are being evaluated, they may generate "intermediate" data that is immediately + * consumed. Rather than manifesting this data in global memory, we can store intermediates of any + * fixed width type (up to 8 bytes) by placing them in shared memory. This class helps to track + * the number and indices of intermediate data in shared memory using a give-take model. Locations + * in shared memory can be "taken" and used for storage, "given back," and then later re-used. + * This aims to minimize the maximum amount of shared memory needed at any point during the + * evaluation. + * + */ + class intermediate_counter { + public: + intermediate_counter() : used_values(), max_used(0) {} + cudf::size_type take(); + void give(cudf::size_type value); + cudf::size_type get_max_used() const { return max_used; } + + private: + /** + * @brief Find the first missing value in a contiguous sequence of integers. + * + * From a sorted container of integers, find the first "missing" value. + * For example, {0, 1, 2, 4, 5} is missing 3, and {1, 2, 3} is missing 0. + * If there are no missing values, return the size of the container. + * + * @return cudf::size_type Smallest value not already in the container. + */ + cudf::size_type find_first_missing() const; + + std::vector used_values; + cudf::size_type max_used; + }; + + expression_device_view device_expression_data; ///< The collection of data required to evaluate + ///< the expression on the device. + + private: + /** + * @brief Helper function for adding components (operators, literals, etc) to AST plan + * + * @tparam T The underlying type of the input `std::vector` + * @param[in] v The `std::vector` containing components (operators, literals, etc). + * @param[in,out] sizes The `std::vector` containing the size of each data buffer. + * @param[in,out] data_pointers The `std::vector` containing pointers to each data buffer. 
+ */ + template + void extract_size_and_pointer(std::vector const& v, + std::vector& sizes, + std::vector& data_pointers) + { + auto const data_size = sizeof(T) * v.size(); + sizes.push_back(data_size); + data_pointers.push_back(v.data()); + } + + void move_to_device(rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) + { + std::vector sizes; + std::vector data_pointers; + + extract_size_and_pointer(_data_references, sizes, data_pointers); + extract_size_and_pointer(_literals, sizes, data_pointers); + extract_size_and_pointer(_operators, sizes, data_pointers); + extract_size_and_pointer(_operator_source_indices, sizes, data_pointers); + + // Create device buffer + auto const buffer_size = std::accumulate(sizes.cbegin(), sizes.cend(), 0); + auto buffer_offsets = std::vector(sizes.size()); + thrust::exclusive_scan(sizes.cbegin(), sizes.cend(), buffer_offsets.begin(), 0); + + auto h_data_buffer = std::vector(buffer_size); + for (unsigned int i = 0; i < data_pointers.size(); ++i) { + std::memcpy(h_data_buffer.data() + buffer_offsets[i], data_pointers[i], sizes[i]); + } + + _device_data_buffer = rmm::device_buffer(h_data_buffer.data(), buffer_size, stream, mr); + + stream.synchronize(); + + // Create device pointers to components of plan + auto device_data_buffer_ptr = static_cast(_device_data_buffer.data()); + device_expression_data.data_references = device_span( + reinterpret_cast(device_data_buffer_ptr + + buffer_offsets[0]), + _data_references.size()); + device_expression_data.literals = + device_span( + reinterpret_cast( + device_data_buffer_ptr + buffer_offsets[1]), + _literals.size()); + device_expression_data.operators = device_span( + reinterpret_cast(device_data_buffer_ptr + buffer_offsets[2]), + _operators.size()); + device_expression_data.operator_source_indices = device_span( + reinterpret_cast(device_data_buffer_ptr + buffer_offsets[3]), + _operator_source_indices.size()); + device_expression_data.num_intermediates = _intermediate_counter.get_max_used(); + device_expression_data.shmem_per_thread = static_cast( + (_has_nulls ? sizeof(IntermediateDataType) : sizeof(IntermediateDataType)) * + device_expression_data.num_intermediates); + } + + /** + * @brief Helper function for recursive traversal of expressions. + * + * When parsing an expression composed of subexpressions, all subexpressions + * must be evaluated before an operator can be applied to them. This method + * performs that recursive traversal (in conjunction with the + * `expression_parser.visit` and `expression.accept` methods if necessary to + * descend deeper into an expression tree). + * + * @param operands The operands to visit. + * + * @return The indices of the operands stored in the data references. + */ + std::vector visit_operands( + std::vector> operands); + + /** + * @brief Add a data reference to the internal list. + * + * @param data_ref The data reference to add. + * + * @return The index of the added data reference in the internal data references list. + */ + cudf::size_type add_data_reference(detail::device_data_reference data_ref); + + rmm::device_buffer + _device_data_buffer; ///< The device-side data buffer containing the plan information, which is + ///< owned by this class and persists until it is destroyed. 
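Editor's note: move_to_device above packs every host-side vector of the parser into one device allocation by recording each buffer's byte size, exclusive-scanning those sizes into offsets, and memcpy-ing each buffer into a single staging area before the upload. A minimal host-only sketch of that packing scheme (standard library only, no RMM or thrust):

#include <cstddef>
#include <cstring>
#include <numeric>
#include <utility>
#include <vector>

// Illustration of the packing used by move_to_device: sizes -> exclusive-scan offsets ->
// one contiguous staging buffer holding all source buffers back to back.
std::vector<char> pack_buffers(std::vector<std::pair<void const*, std::size_t>> const& srcs)
{
  std::vector<std::size_t> sizes;
  for (auto const& s : srcs) { sizes.push_back(s.second); }

  std::vector<std::size_t> offsets(sizes.size());
  std::exclusive_scan(sizes.begin(), sizes.end(), offsets.begin(), std::size_t{0});

  auto const total = std::accumulate(sizes.begin(), sizes.end(), std::size_t{0});
  std::vector<char> staging(total);
  for (std::size_t i = 0; i < srcs.size(); ++i) {
    std::memcpy(staging.data() + offsets[i], srcs[i].first, sizes[i]);
  }
  return staging;
}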
+ + cudf::table_view const& _left; + std::optional> _right; + cudf::size_type _expression_count; + intermediate_counter _intermediate_counter; + bool _has_nulls; + std::vector _data_references; + std::vector _operators; + std::vector _operator_source_indices; + std::vector _literals; +}; + +} // namespace detail + +} // namespace ast + +} // namespace cudf diff --git a/cpp/include/cudf/ast/detail/linearizer.hpp b/cpp/include/cudf/ast/detail/linearizer.hpp deleted file mode 100644 index 59eda1df7b7..00000000000 --- a/cpp/include/cudf/ast/detail/linearizer.hpp +++ /dev/null @@ -1,246 +0,0 @@ -/* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include -#include -#include - -namespace cudf { -namespace ast { - -// Forward declaration -enum class table_reference; -class literal; -class column_reference; -class expression; - -namespace detail { - -/** - * @brief Enum defining data reference types used by a node. - * - * This enum is device-specific. For instance, intermediate data references are generated by the - * linearization process but cannot be explicitly created by the user. - */ -enum class device_data_reference_type { - COLUMN, // A value in a table column - LITERAL, // A literal value - INTERMEDIATE // An internal temporary value -}; - -/** - * @brief A device data reference describes a source of data used by a node. - * - * This is a POD class used to create references describing data type and locations for consumption - * by the `row_evaluator`. - */ -struct alignas(8) device_data_reference { - device_data_reference(device_data_reference_type reference_type, - cudf::data_type data_type, - cudf::size_type data_index, - table_reference table_source); - - device_data_reference(device_data_reference_type reference_type, - cudf::data_type data_type, - cudf::size_type data_index); - - const device_data_reference_type reference_type; // Source of data - const cudf::data_type data_type; // Type of data - const cudf::size_type data_index; // The column index of a table, index of a - // literal, or index of an intermediate - const table_reference table_source; - - inline bool operator==(const device_data_reference& rhs) const - { - return std::tie(data_index, reference_type, table_source) == - std::tie(rhs.data_index, rhs.reference_type, rhs.table_source); - } -}; - -// Forward declaration -class linearizer; - -/** - * @brief A generic node that can be evaluated to return a value. - * - * This class is a part of a "visitor" pattern with the `linearizer` class. - * Nodes inheriting from this class can accept visitors. - */ -struct node { - virtual cudf::size_type accept(detail::linearizer& visitor) const = 0; -}; - -/** - * @brief The linearizer traverses an abstract syntax tree to prepare for execution on the device. - * - * This class is part of a "visitor" pattern with the `node` class. - * - * This class does pre-processing work on the host, validating operators and operand data types. 
It - * traverses downward from a root node in a depth-first fashion, capturing information about - * the nodes and constructing vectors of information that are later used by the device for - * evaluating the abstract syntax tree as a "linear" list of operators whose input dependencies are - * resolved into intermediate data storage in shared memory. - */ -class linearizer { - public: - /** - * @brief Construct a new linearizer object - * - * @param expr The expression to create an evaluable linearizer for. - * @param left The left table used for evaluating the abstract syntax tree. - * @param right The right table used for evaluating the abstract syntax tree. - */ - linearizer(detail::node const& expr, cudf::table_view left, cudf::table_view right) - : _left{left}, _right{right}, _node_count{0}, _intermediate_counter{} - { - expr.accept(*this); - } - - /** - * @brief Construct a new linearizer object - * - * @param expr The expression to create an evaluable linearizer for. - * @param table The table used for evaluating the abstract syntax tree. - */ - linearizer(detail::node const& expr, cudf::table_view table) - : _left{table}, _right{table}, _node_count{0}, _intermediate_counter{} - { - expr.accept(*this); - } - - /** - * @brief Get the root data type of the abstract syntax tree. - * - * @return cudf::data_type - */ - cudf::data_type root_data_type() const; - - /** - * @brief Get the maximum number of intermediates stored by the abstract syntax tree. - * - * @return cudf::size_type - */ - cudf::size_type intermediate_count() const { return _intermediate_counter.get_max_used(); } - - /** - * @brief Get the device data references. - * - * @return std::vector - */ - std::vector const& data_references() const - { - return _data_references; - } - - /** - * @brief Get the operators. - * - * @return std::vector - */ - std::vector const& operators() const { return _operators; } - - /** - * @brief Get the operator source indices. - * - * @return std::vector - */ - std::vector const& operator_source_indices() const - { - return _operator_source_indices; - } - - /** - * @brief Get the literal device views. - * - * @return std::vector - */ - std::vector const& literals() const - { - return _literals; - } - - /** - * @brief Visit a literal node. - * - * @param expr Literal node. - * @return cudf::size_type Index of device data reference for the node. - */ - cudf::size_type visit(literal const& expr); - - /** - * @brief Visit a column reference node. - * - * @param expr Column reference node. - * @return cudf::size_type Index of device data reference for the node. - */ - cudf::size_type visit(column_reference const& expr); - - /** - * @brief Visit an expression node. - * - * @param expr Expression node. - * @return cudf::size_type Index of device data reference for the node. - */ - cudf::size_type visit(expression const& expr); - - /** - * @brief Internal class used to track the utilization of intermediate storage locations. - * - * As nodes are being evaluated, they may generate "intermediate" data that is immediately - * consumed. Rather than manifesting this data in global memory, we can store intermediates of any - * fixed width type (up to 8 bytes) by placing them in shared memory. This class helps to track - * the number and indices of intermediate data in shared memory using a give-take model. Locations - * in shared memory can be "taken" and used for storage, "given back," and then later re-used. 
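Editor's note: the give/take bookkeeping described above can be modeled entirely on the host. A simplified sketch (the real intermediate_counter keeps used_values sorted and searches for the first missing value):

#include <algorithm>
#include <vector>

// Simplified model of the give/take scheme: take() hands out the smallest unused slot and
// tracks the high-water mark; give() returns a slot to the pool for reuse.
class slot_counter {
 public:
  int take()
  {
    int slot = 0;
    while (std::find(used_.begin(), used_.end(), slot) != used_.end()) { ++slot; }
    used_.push_back(slot);
    max_used_ = std::max(max_used_, slot + 1);
    return slot;
  }

  void give(int slot)
  {
    used_.erase(std::remove(used_.begin(), used_.end(), slot), used_.end());
  }

  int max_used() const { return max_used_; }  // scratch slots needed per thread

 private:
  std::vector<int> used_;
  int max_used_ = 0;
};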
- * This aims to minimize the maximum amount of shared memory needed at any point during the - * evaluation. - * - */ - class intermediate_counter { - public: - intermediate_counter() : used_values(), max_used(0) {} - cudf::size_type take(); - void give(cudf::size_type value); - cudf::size_type get_max_used() const { return max_used; } - - private: - cudf::size_type find_first_missing() const; - std::vector used_values; - cudf::size_type max_used; - }; - - private: - std::vector visit_operands( - std::vector> operands); - cudf::size_type add_data_reference(detail::device_data_reference data_ref); - - // State information about the "linearized" GPU execution plan - cudf::table_view const& _left; - cudf::table_view const& _right; - cudf::size_type _node_count; - intermediate_counter _intermediate_counter; - std::vector _data_references; - std::vector _operators; - std::vector _operator_source_indices; - std::vector _literals; -}; - -} // namespace detail - -} // namespace ast - -} // namespace cudf diff --git a/cpp/include/cudf/ast/detail/operators.hpp b/cpp/include/cudf/ast/detail/operators.hpp index 01ec5b74b77..00723004a9f 100644 --- a/cpp/include/cudf/ast/detail/operators.hpp +++ b/cpp/include/cudf/ast/detail/operators.hpp @@ -15,7 +15,7 @@ */ #pragma once -#include +#include #include #include #include @@ -787,14 +787,6 @@ struct single_dispatch_binary_operator_types { } }; -struct single_dispatch_binary_operator { - template - CUDA_DEVICE_CALLABLE auto operator()(F&& f, Ts&&... args) - { - f.template operator()(std::forward(args)...); - } -}; - /** * @brief Functor performing a type dispatch for a binary operator. * diff --git a/cpp/include/cudf/ast/nodes.hpp b/cpp/include/cudf/ast/expressions.hpp similarity index 52% rename from cpp/include/cudf/ast/nodes.hpp rename to cpp/include/cudf/ast/expressions.hpp index 70dda58816e..d9ba197f8fe 100644 --- a/cpp/include/cudf/ast/nodes.hpp +++ b/cpp/include/cudf/ast/expressions.hpp @@ -15,9 +15,6 @@ */ #pragma once -#include -#include -#include #include #include #include @@ -27,6 +24,75 @@ namespace cudf { namespace ast { +// Forward declaration. +namespace detail { +class expression_parser; +} + +/** + * @brief A generic expression that can be evaluated to return a value. + * + * This class is a part of a "visitor" pattern with the `linearizer` class. + * Nodes inheriting from this class can accept visitors. + */ +struct expression { + virtual cudf::size_type accept(detail::expression_parser& visitor) const = 0; + + virtual ~expression() {} +}; + +/** + * @brief Enum of supported operators. 
+ */ +enum class ast_operator { + // Binary operators + ADD, ///< operator + + SUB, ///< operator - + MUL, ///< operator * + DIV, ///< operator / using common type of lhs and rhs + TRUE_DIV, ///< operator / after promoting type to floating point + FLOOR_DIV, ///< operator / after promoting to 64 bit floating point and then + ///< flooring the result + MOD, ///< operator % + PYMOD, ///< operator % but following python's sign rules for negatives + POW, ///< lhs ^ rhs + EQUAL, ///< operator == + NOT_EQUAL, ///< operator != + LESS, ///< operator < + GREATER, ///< operator > + LESS_EQUAL, ///< operator <= + GREATER_EQUAL, ///< operator >= + BITWISE_AND, ///< operator & + BITWISE_OR, ///< operator | + BITWISE_XOR, ///< operator ^ + LOGICAL_AND, ///< operator && + LOGICAL_OR, ///< operator || + // Unary operators + IDENTITY, ///< Identity function + SIN, ///< Trigonometric sine + COS, ///< Trigonometric cosine + TAN, ///< Trigonometric tangent + ARCSIN, ///< Trigonometric sine inverse + ARCCOS, ///< Trigonometric cosine inverse + ARCTAN, ///< Trigonometric tangent inverse + SINH, ///< Hyperbolic sine + COSH, ///< Hyperbolic cosine + TANH, ///< Hyperbolic tangent + ARCSINH, ///< Hyperbolic sine inverse + ARCCOSH, ///< Hyperbolic cosine inverse + ARCTANH, ///< Hyperbolic tangent inverse + EXP, ///< Exponential (base e, Euler number) + LOG, ///< Natural Logarithm (base e) + SQRT, ///< Square-root (x^0.5) + CBRT, ///< Cube-root (x^(1.0/3)) + CEIL, ///< Smallest integer value not less than arg + FLOOR, ///< largest integer value not greater than arg + ABS, ///< Absolute value + RINT, ///< Rounds the floating-point argument arg to an integer value + BIT_INVERT, ///< Bitwise Not (~) + NOT ///< Logical Not (!) +}; + /** * @brief Enum of table references. * @@ -41,7 +107,7 @@ enum class table_reference { /** * @brief A literal value used in an abstract syntax tree. */ -class literal : public detail::node { +class literal : public expression { public: /** * @brief Construct a new literal object. @@ -96,21 +162,21 @@ class literal : public detail::node { * @param visitor Visitor. * @return cudf::size_type Index of device data reference for this instance. */ - cudf::size_type accept(detail::linearizer& visitor) const override; + cudf::size_type accept(detail::expression_parser& visitor) const override; private: - const cudf::detail::fixed_width_scalar_device_view_base value; + cudf::detail::fixed_width_scalar_device_view_base const value; }; /** - * @brief A node referring to data from a column in a table. + * @brief A expression referring to data from a column in a table. */ -class column_reference : public detail::node { +class column_reference : public expression { public: /** * @brief Construct a new column reference object * - * @param column_index Index of this column in the table (provided when the node is + * @param column_index Index of this column in the table (provided when the expression is * evaluated). * @param table_source Which table to use in cases with two tables (e.g. joins). */ @@ -140,7 +206,7 @@ class column_reference : public detail::node { * @param table Table used to determine types. * @return cudf::data_type */ - cudf::data_type get_data_type(const table_view& table) const + cudf::data_type get_data_type(table_view const& table) const { return table.column(get_column_index()).type(); } @@ -152,9 +218,9 @@ class column_reference : public detail::node { * @param right_table Right table used to determine types. 
* @return cudf::data_type */ - cudf::data_type get_data_type(const table_view& left_table, const table_view& right_table) const + cudf::data_type get_data_type(table_view const& left_table, table_view const& right_table) const { - const auto table = [&] { + auto const table = [&] { if (get_table_source() == table_reference::LEFT) { return left_table; } else if (get_table_source() == table_reference::RIGHT) { @@ -172,7 +238,7 @@ class column_reference : public detail::node { * @param visitor Visitor. * @return cudf::size_type Index of device data reference for this instance. */ - cudf::size_type accept(detail::linearizer& visitor) const override; + cudf::size_type accept(detail::expression_parser& visitor) const override; private: cudf::size_type column_index; @@ -180,43 +246,33 @@ class column_reference : public detail::node { }; /** - * @brief An expression node holds an operator and zero or more operands. + * @brief An operation expression holds an operator and zero or more operands. */ -class expression : public detail::node { +class operation : public expression { public: /** - * @brief Construct a new unary expression object. + * @brief Construct a new unary operation object. * * @param op Operator - * @param input Input node (operand) + * @param input Input expression (operand) */ - expression(ast_operator op, node const& input) : op(op), operands({input}) - { - if (cudf::ast::detail::ast_operator_arity(op) != 1) { - CUDF_FAIL("The provided operator is not a unary operator."); - } - } + operation(ast_operator op, expression const& input); /** - * @brief Construct a new binary expression object. + * @brief Construct a new binary operation object. * * @param op Operator - * @param left Left input node (left operand) - * @param right Right input node (right operand) + * @param left Left input expression (left operand) + * @param right Right input expression (right operand) */ - expression(ast_operator op, node const& left, node const& right) : op(op), operands({left, right}) - { - if (cudf::ast::detail::ast_operator_arity(op) != 2) { - CUDF_FAIL("The provided operator is not a binary operator."); - } - } + operation(ast_operator op, expression const& left, expression const& right); - // expression only stores references to nodes, so it does not accept r-value - // references: the calling code must own the nodes. - expression(ast_operator op, node&& input) = delete; - expression(ast_operator op, node&& left, node&& right) = delete; - expression(ast_operator op, node&& left, node const& right) = delete; - expression(ast_operator op, node const& left, node&& right) = delete; + // operation only stores references to expressions, so it does not accept r-value + // references: the calling code must own the expressions. + operation(ast_operator op, expression&& input) = delete; + operation(ast_operator op, expression&& left, expression&& right) = delete; + operation(ast_operator op, expression&& left, expression const& right) = delete; + operation(ast_operator op, expression const& left, expression&& right) = delete; /** * @brief Get the operator. @@ -228,9 +284,9 @@ class expression : public detail::node { /** * @brief Get the operands. * - * @return std::vector> + * @return std::vector> */ - std::vector> get_operands() const { return operands; } + std::vector> get_operands() const { return operands; } /** * @brief Accepts a visitor class. @@ -238,11 +294,11 @@ class expression : public detail::node { * @param visitor Visitor. 
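Editor's note: putting the renamed pieces together, `operation` replaces the old `expression` node type and the AST classes now live in cudf/ast/expressions.hpp; per the `detail::compute_column` declaration elsewhere in this change, evaluation is assumed to go through a public `cudf::compute_column`. A hedged usage sketch (the column index, scalar value, and header for compute_column are assumptions):

#include <cudf/ast/expressions.hpp>
#include <cudf/column/column.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/transform.hpp>  // assumed home of the public compute_column

#include <memory>

// Build (col0 > 42) with the renamed API and evaluate it over a table. Operands are held
// by reference, so they must outlive the call to compute_column (they do here).
std::unique_ptr<cudf::column> greater_than_42(cudf::table_view const& table)
{
  auto value      = cudf::numeric_scalar<int32_t>(42);
  auto const lit  = cudf::ast::literal(value);
  auto const col  = cudf::ast::column_reference(0);
  auto const expr = cudf::ast::operation(cudf::ast::ast_operator::GREATER, col, lit);
  return cudf::compute_column(table, expr);
}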
* @return cudf::size_type Index of device data reference for this instance. */ - cudf::size_type accept(detail::linearizer& visitor) const override; + cudf::size_type accept(detail::expression_parser& visitor) const override; private: - const ast_operator op; - const std::vector> operands; + ast_operator const op; + std::vector> const operands; }; } // namespace ast diff --git a/cpp/include/cudf/ast/operators.hpp b/cpp/include/cudf/ast/operators.hpp deleted file mode 100644 index 78e56340246..00000000000 --- a/cpp/include/cudf/ast/operators.hpp +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -namespace cudf { - -namespace ast { - -/** - * @brief Enum of supported operators. - */ -enum class ast_operator { - // Binary operators - ADD, ///< operator + - SUB, ///< operator - - MUL, ///< operator * - DIV, ///< operator / using common type of lhs and rhs - TRUE_DIV, ///< operator / after promoting type to floating point - FLOOR_DIV, ///< operator / after promoting to 64 bit floating point and then - ///< flooring the result - MOD, ///< operator % - PYMOD, ///< operator % but following python's sign rules for negatives - POW, ///< lhs ^ rhs - EQUAL, ///< operator == - NOT_EQUAL, ///< operator != - LESS, ///< operator < - GREATER, ///< operator > - LESS_EQUAL, ///< operator <= - GREATER_EQUAL, ///< operator >= - BITWISE_AND, ///< operator & - BITWISE_OR, ///< operator | - BITWISE_XOR, ///< operator ^ - LOGICAL_AND, ///< operator && - LOGICAL_OR, ///< operator || - // Unary operators - IDENTITY, ///< Identity function - SIN, ///< Trigonometric sine - COS, ///< Trigonometric cosine - TAN, ///< Trigonometric tangent - ARCSIN, ///< Trigonometric sine inverse - ARCCOS, ///< Trigonometric cosine inverse - ARCTAN, ///< Trigonometric tangent inverse - SINH, ///< Hyperbolic sine - COSH, ///< Hyperbolic cosine - TANH, ///< Hyperbolic tangent - ARCSINH, ///< Hyperbolic sine inverse - ARCCOSH, ///< Hyperbolic cosine inverse - ARCTANH, ///< Hyperbolic tangent inverse - EXP, ///< Exponential (base e, Euler number) - LOG, ///< Natural Logarithm (base e) - SQRT, ///< Square-root (x^0.5) - CBRT, ///< Cube-root (x^(1.0/3)) - CEIL, ///< Smallest integer value not less than arg - FLOOR, ///< largest integer value not greater than arg - ABS, ///< Absolute value - RINT, ///< Rounds the floating-point argument arg to an integer value - BIT_INVERT, ///< Bitwise Not (~) - NOT ///< Logical Not (!) -}; - -} // namespace ast - -} // namespace cudf diff --git a/cpp/include/cudf/ast/transform.hpp b/cpp/include/cudf/ast/transform.hpp deleted file mode 100644 index 59697e5f75c..00000000000 --- a/cpp/include/cudf/ast/transform.hpp +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include - -namespace cudf { - -namespace ast { - -/** - * @brief Compute a new column by evaluating an expression tree on a table. - * - * This evaluates an expression over a table to produce a new column. Also called an n-ary - * transform. - * - * @param table The table used for expression evaluation. - * @param expr The root of the expression tree. - * @param mr Device memory resource. - * @return std::unique_ptr Output column. - */ -std::unique_ptr compute_column( - table_view const table, - expression const& expr, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -} // namespace ast - -} // namespace cudf diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh index 87fbc1ac651..5950edabbfc 100644 --- a/cpp/include/cudf/column/column_device_view.cuh +++ b/cpp/include/cudf/column/column_device_view.cuh @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -853,6 +854,14 @@ class alignas(16) column_device_view : public detail::column_device_view_base { return d_children[child_index]; } + /** + * @brief Returns a span containing the children of this column + */ + __device__ device_span children() const noexcept + { + return device_span(d_children, _num_children); + } + /** * @brief Returns the number of child columns * diff --git a/cpp/include/cudf/datetime.hpp b/cpp/include/cudf/datetime.hpp index 3d90ac063e1..2e4ac870969 100644 --- a/cpp/include/cudf/datetime.hpp +++ b/cpp/include/cudf/datetime.hpp @@ -206,6 +206,21 @@ std::unique_ptr is_leap_year( cudf::column_view const& column, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Extract the number of days in the month + * + * output[i] contains the number of days in the month of date `column[i]` + * output[i] is null if `column[i]` is null + * + * @throw cudf::logic_error if input column datatype is not a TIMESTAMP + * + * @param cudf::column_view of the input datetime values + * @return cudf::column of datatype INT16 of days in month of the corresponding date + */ +std::unique_ptr days_in_month( + cudf::column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Returns the quarter of the date * diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index 163ad3e480f..4e4c63ae517 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -130,7 +130,9 @@ class aggregation_finalizer { // Declares the interface for the finalizer /** * @brief Derived class for specifying a sum aggregation */ -class sum_aggregation final : public rolling_aggregation { +class sum_aggregation final : public rolling_aggregation, + public groupby_aggregation, + public groupby_scan_aggregation { public: sum_aggregation() : aggregation(SUM) {} @@ -149,7 +151,7 @@ class sum_aggregation final : public rolling_aggregation { /** * @brief Derived class for specifying a 
product aggregation */ -class product_aggregation final : public aggregation { +class product_aggregation final : public groupby_aggregation { public: product_aggregation() : aggregation(PRODUCT) {} @@ -168,7 +170,9 @@ class product_aggregation final : public aggregation { /** * @brief Derived class for specifying a min aggregation */ -class min_aggregation final : public rolling_aggregation { +class min_aggregation final : public rolling_aggregation, + public groupby_aggregation, + public groupby_scan_aggregation { public: min_aggregation() : aggregation(MIN) {} @@ -187,7 +191,9 @@ class min_aggregation final : public rolling_aggregation { /** * @brief Derived class for specifying a max aggregation */ -class max_aggregation final : public rolling_aggregation { +class max_aggregation final : public rolling_aggregation, + public groupby_aggregation, + public groupby_scan_aggregation { public: max_aggregation() : aggregation(MAX) {} @@ -206,7 +212,9 @@ class max_aggregation final : public rolling_aggregation { /** * @brief Derived class for specifying a count aggregation */ -class count_aggregation final : public rolling_aggregation { +class count_aggregation final : public rolling_aggregation, + public groupby_aggregation, + public groupby_scan_aggregation { public: count_aggregation(aggregation::Kind kind) : aggregation(kind) {} @@ -263,7 +271,7 @@ class all_aggregation final : public aggregation { /** * @brief Derived class for specifying a sum_of_squares aggregation */ -class sum_of_squares_aggregation final : public aggregation { +class sum_of_squares_aggregation final : public groupby_aggregation { public: sum_of_squares_aggregation() : aggregation(SUM_OF_SQUARES) {} @@ -282,7 +290,7 @@ class sum_of_squares_aggregation final : public aggregation { /** * @brief Derived class for specifying a mean aggregation */ -class mean_aggregation final : public rolling_aggregation { +class mean_aggregation final : public rolling_aggregation, public groupby_aggregation { public: mean_aggregation() : aggregation(MEAN) {} @@ -301,7 +309,7 @@ class mean_aggregation final : public rolling_aggregation { /** * @brief Derived class for specifying a m2 aggregation */ -class m2_aggregation : public aggregation { +class m2_aggregation : public groupby_aggregation { public: m2_aggregation() : aggregation{M2} {} @@ -320,7 +328,7 @@ class m2_aggregation : public aggregation { /** * @brief Derived class for specifying a standard deviation/variance aggregation */ -class std_var_aggregation : public aggregation { +class std_var_aggregation : public groupby_aggregation { public: size_type _ddof; ///< Delta degrees of freedom @@ -339,7 +347,6 @@ class std_var_aggregation : public aggregation { CUDF_EXPECTS(k == aggregation::STD or k == aggregation::VARIANCE, "std_var_aggregation can accept only STD, VARIANCE"); } - size_type hash_impl() const { return std::hash{}(_ddof); } }; @@ -348,7 +355,10 @@ class std_var_aggregation : public aggregation { */ class var_aggregation final : public std_var_aggregation { public: - var_aggregation(size_type ddof) : std_var_aggregation{aggregation::VARIANCE, ddof} {} + var_aggregation(size_type ddof) + : aggregation{aggregation::VARIANCE}, std_var_aggregation{aggregation::VARIANCE, ddof} + { + } std::unique_ptr clone() const override { @@ -367,7 +377,10 @@ class var_aggregation final : public std_var_aggregation { */ class std_aggregation final : public std_var_aggregation { public: - std_aggregation(size_type ddof) : std_var_aggregation{aggregation::STD, ddof} {} + 
std_aggregation(size_type ddof) + : aggregation{aggregation::STD}, std_var_aggregation{aggregation::STD, ddof} + { + } std::unique_ptr clone() const override { @@ -384,7 +397,7 @@ class std_aggregation final : public std_var_aggregation { /** * @brief Derived class for specifying a median aggregation */ -class median_aggregation final : public aggregation { +class median_aggregation final : public groupby_aggregation { public: median_aggregation() : aggregation(MEDIAN) {} @@ -403,7 +416,7 @@ class median_aggregation final : public aggregation { /** * @brief Derived class for specifying a quantile aggregation */ -class quantile_aggregation final : public aggregation { +class quantile_aggregation final : public groupby_aggregation { public: quantile_aggregation(std::vector const& q, interpolation i) : aggregation{QUANTILE}, _quantiles{q}, _interpolation{i} @@ -449,7 +462,7 @@ class quantile_aggregation final : public aggregation { /** * @brief Derived class for specifying an argmax aggregation */ -class argmax_aggregation final : public rolling_aggregation { +class argmax_aggregation final : public rolling_aggregation, public groupby_aggregation { public: argmax_aggregation() : aggregation(ARGMAX) {} @@ -468,7 +481,7 @@ class argmax_aggregation final : public rolling_aggregation { /** * @brief Derived class for specifying an argmin aggregation */ -class argmin_aggregation final : public rolling_aggregation { +class argmin_aggregation final : public rolling_aggregation, public groupby_aggregation { public: argmin_aggregation() : aggregation(ARGMIN) {} @@ -487,7 +500,7 @@ class argmin_aggregation final : public rolling_aggregation { /** * @brief Derived class for specifying a nunique aggregation */ -class nunique_aggregation final : public aggregation { +class nunique_aggregation final : public groupby_aggregation { public: nunique_aggregation(null_policy null_handling) : aggregation{NUNIQUE}, _null_handling{null_handling} @@ -523,7 +536,7 @@ class nunique_aggregation final : public aggregation { /** * @brief Derived class for specifying a nth element aggregation */ -class nth_element_aggregation final : public aggregation { +class nth_element_aggregation final : public groupby_aggregation { public: nth_element_aggregation(size_type n, null_policy null_handling) : aggregation{NTH_ELEMENT}, _n{n}, _null_handling{null_handling} @@ -582,7 +595,7 @@ class row_number_aggregation final : public rolling_aggregation { /** * @brief Derived class for specifying a rank aggregation */ -class rank_aggregation final : public rolling_aggregation { +class rank_aggregation final : public rolling_aggregation, public groupby_scan_aggregation { public: rank_aggregation() : aggregation{RANK} {} @@ -601,7 +614,7 @@ class rank_aggregation final : public rolling_aggregation { /** * @brief Derived class for specifying a dense rank aggregation */ -class dense_rank_aggregation final : public rolling_aggregation { +class dense_rank_aggregation final : public rolling_aggregation, public groupby_scan_aggregation { public: dense_rank_aggregation() : aggregation{DENSE_RANK} {} @@ -620,7 +633,7 @@ class dense_rank_aggregation final : public rolling_aggregation { /** * @brief Derived aggregation class for specifying COLLECT_LIST aggregation */ -class collect_list_aggregation final : public rolling_aggregation { +class collect_list_aggregation final : public rolling_aggregation, public groupby_aggregation { public: explicit collect_list_aggregation(null_policy null_handling = null_policy::INCLUDE) : aggregation{COLLECT_LIST}, 
_null_handling{null_handling} @@ -656,7 +669,7 @@ class collect_list_aggregation final : public rolling_aggregation { /** * @brief Derived aggregation class for specifying COLLECT_SET aggregation */ -class collect_set_aggregation final : public rolling_aggregation { +class collect_set_aggregation final : public rolling_aggregation, public groupby_aggregation { public: explicit collect_set_aggregation(null_policy null_handling = null_policy::INCLUDE, null_equality nulls_equal = null_equality::EQUAL, @@ -795,7 +808,7 @@ class udf_aggregation final : public rolling_aggregation { /** * @brief Derived aggregation class for specifying MERGE_LISTS aggregation */ -class merge_lists_aggregation final : public aggregation { +class merge_lists_aggregation final : public groupby_aggregation { public: explicit merge_lists_aggregation() : aggregation{MERGE_LISTS} {} @@ -814,7 +827,7 @@ class merge_lists_aggregation final : public aggregation { /** * @brief Derived aggregation class for specifying MERGE_SETS aggregation */ -class merge_sets_aggregation final : public aggregation { +class merge_sets_aggregation final : public groupby_aggregation { public: explicit merge_sets_aggregation(null_equality nulls_equal, nan_equality nans_equal) : aggregation{MERGE_SETS}, _nulls_equal(nulls_equal), _nans_equal(nans_equal) @@ -855,7 +868,7 @@ class merge_sets_aggregation final : public aggregation { /** * @brief Derived aggregation class for specifying MERGE_M2 aggregation */ -class merge_m2_aggregation final : public aggregation { +class merge_m2_aggregation final : public groupby_aggregation { public: explicit merge_m2_aggregation() : aggregation{MERGE_M2} {} diff --git a/cpp/include/cudf/detail/reshape.hpp b/cpp/include/cudf/detail/reshape.hpp index fb24b7669d7..0ae7ba0a6a6 100644 --- a/cpp/include/cudf/detail/reshape.hpp +++ b/cpp/include/cudf/detail/reshape.hpp @@ -34,5 +34,16 @@ std::unique_ptr tile( size_type count, rmm::cuda_stream_view = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @copydoc cudf::interleave_columns + * + * @param stream CUDA stream used for device memory operations and kernel launches + */ +std::unique_ptr interleave_columns( + table_view const& input, + rmm::cuda_stream_view = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/transform.hpp b/cpp/include/cudf/detail/transform.hpp index b94223cdabe..12948498455 100644 --- a/cpp/include/cudf/detail/transform.hpp +++ b/cpp/include/cudf/detail/transform.hpp @@ -16,6 +16,7 @@ #pragma once +#include #include #include @@ -35,6 +36,17 @@ std::unique_ptr transform( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @copydoc cudf::compute_column + * + * @param stream CUDA stream used for device memory operations and kernel launches. 
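Editor's note: the aggregation classes above now also derive from groupby_aggregation (and, where applicable, groupby_scan_aggregation), which lets groupby requests reject unsupported aggregations at compile time. A hedged sketch of assembling a request, assuming the existing templated factories (e.g. make_sum_aggregation) gained groupby_aggregation instantiations as part of this change:

#include <cudf/aggregation.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/groupby.hpp>

// Assumption: aggregation_request now holds unique_ptr<groupby_aggregation>, so only
// aggregations that derive from groupby_aggregation (SUM, MIN, MAX, ...) are accepted.
cudf::groupby::aggregation_request make_sum_request(cudf::column_view const& values)
{
  cudf::groupby::aggregation_request req;
  req.values = values;
  req.aggregations.push_back(cudf::make_sum_aggregation<cudf::groupby_aggregation>());
  return req;
}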
+ */ +std::unique_ptr compute_column( + table_view const table, + ast::operation const& expr, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @copydoc cudf::nans_to_nulls * diff --git a/java/src/main/java/ai/rapids/cudf/ast/Expression.java b/cpp/include/cudf/detail/utilities/visitor_overload.hpp similarity index 59% rename from java/src/main/java/ai/rapids/cudf/ast/Expression.java rename to cpp/include/cudf/detail/utilities/visitor_overload.hpp index 8d391298cef..a55ca323c50 100644 --- a/java/src/main/java/ai/rapids/cudf/ast/Expression.java +++ b/cpp/include/cudf/detail/utilities/visitor_overload.hpp @@ -13,19 +13,18 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#pragma once -package ai.rapids.cudf.ast; +namespace cudf::detail { -import java.nio.ByteBuffer; -import java.nio.ByteOrder; +/** + * @brief Helper class to support inline-overloading for all of a variant's alternative types + */ +template +struct visitor_overload : Ts... { + using Ts::operator()...; +}; +template +visitor_overload(Ts...) -> visitor_overload; -/** Base class of every AST expression. */ -public abstract class Expression extends AstNode { - public CompiledExpression compile() { - int size = getSerializedSize(); - ByteBuffer bb = ByteBuffer.allocate(size); - bb.order(ByteOrder.nativeOrder()); - serialize(bb); - return new CompiledExpression(bb.array()); - } -} +} // namespace cudf::detail diff --git a/cpp/include/cudf/groupby.hpp b/cpp/include/cudf/groupby.hpp index 5656b38a0ef..3b8354ebc9f 100644 --- a/cpp/include/cudf/groupby.hpp +++ b/cpp/include/cudf/groupby.hpp @@ -56,8 +56,23 @@ class sort_groupby_helper; * `values.size()` column must equal `keys.num_rows()`. */ struct aggregation_request { - column_view values; ///< The elements to aggregate - std::vector> aggregations; ///< Desired aggregations + column_view values; ///< The elements to aggregate + std::vector> aggregations; ///< Desired aggregations +}; + +/** + * @brief Request for groupby aggregation(s) for scanning a column. + * + * The group membership of each `value[i]` is determined by the corresponding + * row `i` in the original order of `keys` used to construct the + * `groupby`. I.e., for each `aggregation`, `values[i]` is aggregated with all + * other `values[j]` where rows `i` and `j` in `keys` are equivalent. + * + * `values.size()` column must equal `keys.num_rows()`. + */ +struct scan_request { + column_view values; ///< The elements to aggregate + std::vector> aggregations; ///< Desired aggregations }; /** @@ -222,7 +237,7 @@ class groupby { * specified in `requests`. */ std::pair, std::vector> scan( - host_span requests, + host_span requests, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -388,7 +403,7 @@ class groupby { rmm::mr::device_memory_resource* mr); std::pair, std::vector> sort_scan( - host_span requests, + host_span requests, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); }; diff --git a/cpp/include/cudf/io/avro.hpp b/cpp/include/cudf/io/avro.hpp index 34410209c72..774690c939f 100644 --- a/cpp/include/cudf/io/avro.hpp +++ b/cpp/include/cudf/io/avro.hpp @@ -195,12 +195,9 @@ class avro_reader_options_builder { * * The following code snippet demonstrates how to read a dataset from a file: * @code - * ... 
- * std::string filepath = "dataset.avro"; - * cudf::avro_reader_options options = - * cudf::avro_reader_options::builder(cudf::source_info(filepath)); - * ... - * auto result = cudf::read_avro(options); + * auto source = cudf::io::source_info("dataset.avro"); + * auto options = cudf::io::avro_reader_options::builder(source); + * auto result = cudf::io::read_avro(options); * @endcode * * @param options Settings for controlling reading behavior. diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp index 1dff99735ec..455ffce7ed8 100644 --- a/cpp/include/cudf/io/csv.hpp +++ b/cpp/include/cudf/io/csv.hpp @@ -104,14 +104,18 @@ class csv_reader_options { // Whether a quote inside a value is double-quoted bool _doublequote = true; // Names of columns to read as datetime - std::vector _infer_date_names; + std::vector _parse_dates_names; // Indexes of columns to read as datetime - std::vector _infer_date_indexes; + std::vector _parse_dates_indexes; + // Names of columns to parse as hexadecimal + std::vector _parse_hex_names; + // Indexes of columns to parse as hexadecimal + std::vector _parse_hex_indexes; // Conversion settings // Per-column types; disables type inference on those columns - std::variant, std::vector> _dtypes; + std::variant, std::map> _dtypes; // Additional values to recognize as boolean true values std::vector _true_values{"True", "TRUE", "true"}; // Additional values to recognize as boolean false values @@ -280,17 +284,27 @@ class csv_reader_options { /** * @brief Returns names of columns to read as datetime. */ - std::vector const& get_infer_date_names() const { return _infer_date_names; } + std::vector const& get_parse_dates_names() const { return _parse_dates_names; } /** * @brief Returns indexes of columns to read as datetime. */ - std::vector const& get_infer_date_indexes() const { return _infer_date_indexes; } + std::vector const& get_parse_dates_indexes() const { return _parse_dates_indexes; } + + /** + * @brief Returns names of columns to read as hexadecimal. + */ + std::vector const& get_parse_hex_names() const { return _parse_hex_names; } + + /** + * @brief Returns indexes of columns to read as hexadecimal. + */ + std::vector const& get_parse_hex_indexes() const { return _parse_hex_indexes; } /** * @brief Returns per-column types. */ - std::variant, std::vector> const& get_dtypes() const + std::variant, std::map> const& get_dtypes() const { return _dtypes; } @@ -547,9 +561,9 @@ class csv_reader_options { * * @param col_names Vector of column names to infer as datetime. */ - void set_infer_date_names(std::vector col_names) + void set_parse_dates(std::vector col_names) { - _infer_date_names = std::move(col_names); + _parse_dates_names = std::move(col_names); } /** @@ -557,30 +571,38 @@ class csv_reader_options { * * @param col_names Vector of column indices to infer as datetime. 
*/ - void set_infer_date_indexes(std::vector col_ind) + void set_parse_dates(std::vector col_ind) { _parse_dates_indexes = std::move(col_ind); } + + /** + * @brief Sets names of columns to parse as hexadecimal + * + * @param col_names Vector of column names to parse as hexadecimal + */ + void set_parse_hex(std::vector col_names) { - _infer_date_indexes = std::move(col_ind); + _parse_hex_names = std::move(col_names); } + /** + * @brief Sets indexes of columns to parse as hexadecimal + * + * @param col_names Vector of column indices to parse as hexadecimal + */ + void set_parse_hex(std::vector col_ind) { _parse_hex_indexes = std::move(col_ind); } + /** * @brief Sets per-column types * - * @param types Vector specifying the columns' target data types. + * @param types Column name -> data type map specifying the columns' target data types */ - void set_dtypes(std::vector types) { _dtypes = std::move(types); } + void set_dtypes(std::map types) { _dtypes = std::move(types); } /** - * @brief Sets per-column types, specified by the type's respective string representation. + * @brief Sets per-column types * - * @param types Vector of dtypes in which the column needs to be read. + * @param types Vector specifying the columns' target data types. */ - [[deprecated( - "The string-based interface will be deprecated." - "Use dtypes(std::vector) instead.")]] void - set_dtypes(std::vector types) - { - _dtypes = std::move(types); - } + void set_dtypes(std::vector types) { _dtypes = std::move(types); } /** * @brief Sets additional values to recognize as boolean true values. @@ -958,49 +980,70 @@ class csv_reader_options_builder { /** * @brief Sets names of columns to read as datetime. * - * @param col_names Vector of column names to infer as datetime. + * @param col_names Vector of column names to read as datetime. * @return this for chaining. */ - csv_reader_options_builder& infer_date_names(std::vector col_names) + csv_reader_options_builder& parse_dates(std::vector col_names) { - options._infer_date_names = std::move(col_names); + options._parse_dates_names = std::move(col_names); return *this; } /** * @brief Sets indexes of columns to read as datetime. * - * @param col_names Vector of column indices to infer as datetime. + * @param col_ind Vector of column indices to read as datetime + * @return this for chaining. + */ + csv_reader_options_builder& parse_dates(std::vector col_ind) + { + options._parse_dates_indexes = std::move(col_ind); + return *this; + } + + /** + * @brief Sets names of columns to parse as hexadecimal. + * + * @param col_names Vector of column names to parse as hexadecimal + * @return this for chaining. + */ + csv_reader_options_builder& parse_hex(std::vector col_names) + { + options._parse_hex_names = std::move(col_names); + return *this; + } + + /** + * @brief Sets indexes of columns to parse as hexadecimal. + * + * @param col_ind Vector of column indices to parse as hexadecimal * @return this for chaining. */ - csv_reader_options_builder& infer_date_indexes(std::vector col_ind) + csv_reader_options_builder& parse_hex(std::vector col_ind) { - options._infer_date_indexes = std::move(col_ind); + options._parse_hex_indexes = std::move(col_ind); return *this; } /** * @brief Sets per-column types. * - * @param types Vector of data types in which the column needs to be read. + * @param types Column name -> data type map specifying the columns' target data types * @return this for chaining. 
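Editor's note: with the renames above, the CSV options read more naturally from the builder. A hedged usage sketch (the file path, column names, and column index are hypothetical):

#include <cudf/io/csv.hpp>

#include <map>
#include <string>
#include <vector>

// parse_dates/parse_hex replace the old infer_date_* names, and dtypes can now be keyed
// by column name. All names below are made up for illustration.
cudf::io::table_with_metadata read_orders_csv()
{
  auto options =
    cudf::io::csv_reader_options::builder(cudf::io::source_info{"orders.csv"})
      .parse_dates(std::vector<std::string>{"ship_date"})  // read this column as datetime
      .parse_hex(std::vector<int>{2})                       // parse column 2 as hexadecimal
      .build();
  options.set_dtypes(
    std::map<std::string, cudf::data_type>{{"order_id", cudf::data_type{cudf::type_id::INT64}}});
  return cudf::io::read_csv(options);
}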
*/ - csv_reader_options_builder& dtypes(std::vector types) + csv_reader_options_builder& dtypes(std::map types) { options._dtypes = std::move(types); return *this; } /** - * @brief Sets per-column types, specified by the type's respective string representation. + * @brief Sets per-column types. * - * @param types Vector of dtypes in which the column needs to be read. + * @param types Vector of data types in which the column needs to be read. * @return this for chaining. */ - [[deprecated( - "The string-based interface will be deprecated." - "Use dtypes(std::vector) instead.")]] csv_reader_options_builder& - dtypes(std::vector types) + csv_reader_options_builder& dtypes(std::vector types) { options._dtypes = std::move(types); return *this; @@ -1108,11 +1151,9 @@ class csv_reader_options_builder { * * The following code snippet demonstrates how to read a dataset from a file: * @code - * std::string filepath = "dataset.csv"; - * cudf::io::csv_reader_options options = - * cudf::io::csv_reader_options::builder(cudf::source_info(filepath)); - * ... - * auto result = cudf::read_csv(options); + * auto source = cudf::io::source_info("dataset.csv"); + * auto options = cudf::io::csv_reader_options::builder(source); + * auto result = cudf::io::read_csv(options); * @endcode * * @param options Settings for controlling reading behavior. @@ -1437,12 +1478,12 @@ class csv_writer_options_builder { * * The following code snippet demonstrates how to write columns to a file: * @code - * std::string filepath = "dataset.csv"; - * cudf::io::sink_info sink_info(filepath); + * auto destination = cudf::io::sink_info("dataset.csv"); + * auto options = cudf::io::csv_writer_options(destination, table->view()) + * .na_rep(na) + * .include_header(include_header) + * .rows_per_chunk(rows_per_chunk); * - * cudf::io::csv_writer_options options = cudf::io::csv_writer_options(sink_info, - * table->view()).na_rep(na).include_header(include_header).rows_per_chunk(rows_per_chunk); - * ... * cudf::io::write_csv(options); * @endcode * diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 2f4d0936d8b..31201e30ac6 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -23,7 +23,9 @@ #include +#include #include +#include #include namespace cudf { @@ -66,7 +68,7 @@ class json_reader_options { source_info _source; // Data types of the column; empty to infer dtypes - std::vector _dtypes; + std::variant, std::map> _dtypes; // Specify the compression format of the source or infer from file extension compression_type _compression = compression_type::AUTO; @@ -114,7 +116,10 @@ class json_reader_options { /** * @brief Returns data types of the columns. */ - std::vector const& get_dtypes() const { return _dtypes; } + std::variant, std::map> const& get_dtypes() const + { + return _dtypes; + } /** * @brief Returns compression format of the source. @@ -141,19 +146,26 @@ class json_reader_options { */ bool is_enabled_dayfirst() const { return _dayfirst; } + /** + * @brief Set data types for columns to be read. + * + * @param types Vector of dtypes + */ + void set_dtypes(std::vector types) { _dtypes = std::move(types); } + /** * @brief Set data types for columns to be read. * * @param types Vector dtypes in string format. */ - void dtypes(std::vector types) { _dtypes = std::move(types); } + void set_dtypes(std::map types) { _dtypes = std::move(types); } /** * @brief Set the compression type. * * @param comp_type The compression type used. 
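Editor's note: the JSON reader gains the same name-keyed dtype specification. A hedged sketch (path, column names, and the builder overload for the map form are assumed from the surrounding changes):

#include <cudf/io/json.hpp>

#include <map>
#include <string>

// Column types for the JSON reader may now be supplied as a name -> data_type map rather
// than positional strings. Names below are made up for illustration.
cudf::io::table_with_metadata read_events_json()
{
  auto const options =
    cudf::io::json_reader_options::builder(cudf::io::source_info{"events.json"})
      .dtypes(std::map<std::string, cudf::data_type>{
        {"ts", cudf::data_type{cudf::type_id::TIMESTAMP_MILLISECONDS}},
        {"count", cudf::data_type{cudf::type_id::INT32}}})
      .lines(true)
      .build();
  return cudf::io::read_json(options);
}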
*/ - void compression(compression_type comp_type) { _compression = comp_type; } + void set_compression(compression_type comp_type) { _compression = comp_type; } /** * @brief Set number of bytes to skip from source start. @@ -205,10 +217,22 @@ class json_reader_options_builder { /** * @brief Set data types for columns to be read. * - * @param types Vector dtypes in string format. - * @return this for chaining. + * @param types Vector of dtypes + * @return this for chaining + */ + json_reader_options_builder& dtypes(std::vector types) + { + options._dtypes = std::move(types); + return *this; + } + + /** + * @brief Set data types for columns to be read. + * + * @param types Column name -> dtype map. + * @return this for chaining */ - json_reader_options_builder& dtypes(std::vector types) + json_reader_options_builder& dtypes(std::map types) { options._dtypes = std::move(types); return *this; @@ -292,11 +316,9 @@ class json_reader_options_builder { * * The following code snippet demonstrates how to read a dataset from a file: * @code - * ... - * std::string filepath = "dataset.json"; - * cudf::read_json_options options = cudf::read_json_options::builder(cudf::source_info(filepath)); - * ... - * auto result = cudf::read_json(options); + * auto source = cudf::io::source_info("dataset.json"); + * auto options = cudf::io::read_json_options::builder(source); + * auto result = cudf::io::read_json(options); * @endcode * * @param options Settings for controlling reading behavior. diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp index 997f35ed922..4ae09b516a4 100644 --- a/cpp/include/cudf/io/orc.hpp +++ b/cpp/include/cudf/io/orc.hpp @@ -346,14 +346,14 @@ class orc_reader_options_builder { * * The following code snippet demonstrates how to read a dataset from a file: * @code - * ... - * std::string filepath = "dataset.orc"; - * cudf::orc_reader_options options = - * cudf::orc_reader_options::builder(cudf::source_info(filepath)); - * ... - * auto result = cudf::read_orc(options); + * auto source = cudf::io::source_info("dataset.orc"); + * auto options = cudf::io::orc_reader_options::builder(source); + * auto result = cudf::io::read_orc(options); * @endcode * + * Note: Support for reading files with struct columns is currently experimental, the output may not + * be as reliable as reading for other datatypes. + * * @param options Settings for controlling reading behavior. * @param mr Device memory resource used to allocate device memory of the table in the returned * table_with_metadata. @@ -565,12 +565,9 @@ class orc_writer_options_builder { * * The following code snippet demonstrates how to write columns to a file: * @code - * ... - * std::string filepath = "dataset.orc"; - * cudf::orc_writer_options options = cudf::orc_writer_options::builder(cudf::sink_info(filepath), - * table->view()); - * ... - * cudf::write_orc(options); + * auto destination = cudf::io::sink_info("dataset.orc"); + * auto options = cudf::io::orc_writer_options::builder(destination, table->view()); + * cudf::io::write_orc(options); * @endcode * * @param options Settings for controlling reading behavior. 
diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index ecd9607a87e..25cbb6fd554 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -50,7 +50,7 @@ class parquet_reader_options_builder; class parquet_reader_options { source_info _source; - // Names of column to read; empty is all + // Path in schema of column to read; empty is all std::vector _columns; // List of individual row groups to read (ignored if empty) @@ -354,12 +354,9 @@ class parquet_reader_options_builder { * * The following code snippet demonstrates how to read a dataset from a file: * @code - * ... - * std::string filepath = "dataset.parquet"; - * cudf::io::parquet_reader_options options = - * cudf::io::parquet_reader_options::builder(cudf::source_info(filepath)); - * ... - * auto result = cudf::read_parquet(options); + * auto source = cudf::io::source_info("dataset.parquet"); + * auto options = cudf::io::parquet_reader_options::builder(source); + * auto result = cudf::io::read_parquet(options); * @endcode * * @param options Settings for controlling reading behavior @@ -784,12 +781,9 @@ class parquet_writer_options_builder { * * The following code snippet demonstrates how to write columns to a file: * @code - * ... - * std::string filepath = "dataset.parquet"; - * cudf::io::parquet_writer_options options = - * cudf::io::parquet_writer_options::builder(cudf::sink_info(filepath), table->view()); - * ... - * cudf::write_parquet(options); + * auto destination = cudf::io::sink_info("dataset.parquet"); + * auto options = cudf::io::parquet_writer_options::builder(destination, table->view()); + * cudf::io::write_parquet(options); * @endcode * * @param options Settings for controlling writing behavior. @@ -1019,15 +1013,12 @@ std::unique_ptr> merge_rowgroup_metadata( * one logical table by writing a series of individual cudf::tables. * * @code - * ... - * std::string filepath = "dataset.parquet"; - * cudf::io::chunked_parquet_writer_options options = - * cudf::io::chunked_parquet_writer_options::builder(cudf::sink_info(filepath), table->view()); - * ... - * cudf::io::parquet_chunked_writer writer(options) + * auto destination = cudf::io::sink_info("dataset.parquet"); + * auto options = cudf::io::chunked_parquet_writer_options::builder(destination, table->view()); + * auto writer = cudf::io::parquet_chunked_writer(options); + * * writer.write(table0) * writer.write(table1) - * ... * writer.close() * @endcode */ diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp index d0d2083b85b..483cd75c739 100644 --- a/cpp/include/cudf/join.hpp +++ b/cpp/include/cudf/join.hpp @@ -16,7 +16,7 @@ #pragma once -#include +#include #include #include @@ -673,8 +673,6 @@ class hash_join { * Result: {{1}, {0}} * @endcode * - * @throw cudf::logic_error if number of elements in `left_keys` or `right_keys` - * mismatch. * @throw cudf::logic_error if the binary predicate outputs a non-boolean result. 
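As an illustration of the const-reference predicate interface, a minimal conditional inner join might look like the following sketch (here `left` and `right` are assumed to be existing cudf::table_view objects):

@code
// Join rows where column 0 of the left table equals column 0 of the right table.
auto left_col  = cudf::ast::column_reference(0, cudf::ast::table_reference::LEFT);
auto right_col = cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT);
auto predicate = cudf::ast::operation(cudf::ast::ast_operator::EQUAL, left_col, right_col);

auto [left_indices, right_indices] = cudf::conditional_inner_join(left, right, predicate);
@endcode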
* * @param left The left table @@ -689,11 +687,12 @@ class hash_join { std::pair>, std::unique_ptr>> conditional_inner_join( - table_view left, - table_view right, - ast::expression binary_predicate, - null_equality compare_nulls = null_equality::EQUAL, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + table_view const& left, + table_view const& right, + ast::expression const& binary_predicate, + null_equality compare_nulls = null_equality::EQUAL, + std::optional output_size = {}, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Returns a pair of row index vectors corresponding to all pairs @@ -721,8 +720,6 @@ conditional_inner_join( * Result: {{0, 1, 2}, {None, 0, None}} * @endcode * - * @throw cudf::logic_error if number of elements in `left_keys` or `right_keys` - * mismatch. * @throw cudf::logic_error if the binary predicate outputs a non-boolean result. * * @param left The left table @@ -736,10 +733,11 @@ conditional_inner_join( */ std::pair>, std::unique_ptr>> -conditional_left_join(table_view left, - table_view right, - ast::expression binary_predicate, - null_equality compare_nulls = null_equality::EQUAL, +conditional_left_join(table_view const& left, + table_view const& right, + ast::expression const& binary_predicate, + null_equality compare_nulls = null_equality::EQUAL, + std::optional output_size = {}, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -767,8 +765,6 @@ conditional_left_join(table_view left, * Result: {{0, 1, 2, None, None}, {None, 0, None, 1, 2}} * @endcode * - * @throw cudf::logic_error if number of elements in `left_keys` or `right_keys` - * mismatch. * @throw cudf::logic_error if the binary predicate outputs a non-boolean result. * * @param left The left table @@ -782,9 +778,9 @@ conditional_left_join(table_view left, */ std::pair>, std::unique_ptr>> -conditional_full_join(table_view left, - table_view right, - ast::expression binary_predicate, +conditional_full_join(table_view const& left, + table_view const& right, + ast::expression const& binary_predicate, null_equality compare_nulls = null_equality::EQUAL, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); @@ -808,8 +804,6 @@ conditional_full_join(table_view left, * Result: {1} * @endcode * - * @throw cudf::logic_error if number of elements in `left_keys` or `right_keys` - * mismatch. * @throw cudf::logic_error if the binary predicate outputs a non-boolean result. * * @param left The left table @@ -823,11 +817,12 @@ conditional_full_join(table_view left, * `right` . */ std::unique_ptr> conditional_left_semi_join( - table_view left, - table_view right, - ast::expression binary_predicate, - null_equality compare_nulls = null_equality::EQUAL, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + table_view const& left, + table_view const& right, + ast::expression const& binary_predicate, + null_equality compare_nulls = null_equality::EQUAL, + std::optional output_size = {}, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Returns an index vector corresponding to all rows in the left table @@ -849,8 +844,6 @@ std::unique_ptr> conditional_left_semi_join( * Result: {0, 2} * @endcode * - * @throw cudf::logic_error if number of elements in `left_keys` or `right_keys` - * mismatch. * @throw cudf::logic_error if the binary predicate outputs a non-boolean result. 
* * @param left The left table @@ -864,11 +857,111 @@ std::unique_ptr> conditional_left_semi_join( * `right` . */ std::unique_ptr> conditional_left_anti_join( - table_view left, - table_view right, - ast::expression binary_predicate, + table_view const& left, + table_view const& right, + ast::expression const& binary_predicate, + null_equality compare_nulls = null_equality::EQUAL, + std::optional output_size = {}, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Returns the exact number of matches (rows) when performing a + * conditional inner join between the specified tables where the predicate + * evaluates to true. + * + * If the provided predicate returns NULL for a pair of rows + * (left, right), that pair is not included in the output. + * + * @throw cudf::logic_error if the binary predicate outputs a non-boolean result. + * + * @param left The left table + * @param right The right table + * @param binary_predicate The condition on which to join. + * @param compare_nulls Whether the equality operator returns true or false for two nulls. + * @param mr Device memory resource used to allocate the returned table and columns' device memory + * + * @return The size that would result from performing the requested join. + */ +std::size_t conditional_inner_join_size( + table_view const& left, + table_view const& right, + ast::expression const& binary_predicate, + null_equality compare_nulls = null_equality::EQUAL, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Returns the exact number of matches (rows) when performing a + * conditional left join between the specified tables where the predicate + * evaluates to true. + * + * If the provided predicate returns NULL for a pair of rows + * (left, right), that pair is not included in the output. + * + * @throw cudf::logic_error if the binary predicate outputs a non-boolean result. + * + * @param left The left table + * @param right The right table + * @param binary_predicate The condition on which to join. + * @param compare_nulls Whether the equality operator returns true or false for two nulls. + * @param mr Device memory resource used to allocate the returned table and columns' device memory + * + * @return The size that would result from performing the requested join. + */ +std::size_t conditional_left_join_size( + table_view const& left, + table_view const& right, + ast::expression const& binary_predicate, + null_equality compare_nulls = null_equality::EQUAL, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Returns the exact number of matches (rows) when performing a + * conditional left semi join between the specified tables where the predicate + * evaluates to true. + * + * If the provided predicate returns NULL for a pair of rows + * (left, right), that pair is not included in the output. + * + * @throw cudf::logic_error if the binary predicate outputs a non-boolean result. + * + * @param left The left table + * @param right The right table + * @param binary_predicate The condition on which to join. + * @param compare_nulls Whether the equality operator returns true or false for two nulls. + * @param mr Device memory resource used to allocate the returned table and columns' device memory + * + * @return The size that would result from performing the requested join. 
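The new size APIs pair with the optional output size parameter on the joins themselves; a sketch of that pattern follows (again with hypothetical `left` and `right` table_views, and assuming the optional carries a std::size_t):

@code
auto left_col  = cudf::ast::column_reference(0, cudf::ast::table_reference::LEFT);
auto right_col = cudf::ast::column_reference(1, cudf::ast::table_reference::RIGHT);
auto predicate = cudf::ast::operation(cudf::ast::ast_operator::GREATER, left_col, right_col);

// Compute the exact result size once ...
std::size_t join_size = cudf::conditional_left_join_size(left, right, predicate);

// ... then hand it back so the join can allocate its output without re-counting.
auto [left_indices, right_indices] = cudf::conditional_left_join(
  left, right, predicate, cudf::null_equality::EQUAL, join_size);
@endcode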
+ */ +std::size_t conditional_left_semi_join_size( + table_view const& left, + table_view const& right, + ast::expression const& binary_predicate, null_equality compare_nulls = null_equality::EQUAL, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Returns the exact number of matches (rows) when performing a + * conditional left anti join between the specified tables where the predicate + * evaluates to true. + * + * If the provided predicate returns NULL for a pair of rows + * (left, right), that pair is not included in the output. + * + * @throw cudf::logic_error if the binary predicate outputs a non-boolean result. + * + * @param left The left table + * @param right The right table + * @param binary_predicate The condition on which to join. + * @param compare_nulls Whether the equality operator returns true or false for two nulls. + * @param mr Device memory resource used to allocate the returned table and columns' device memory + * + * @return The size that would result from performing the requested join. + */ +std::size_t conditional_left_anti_join_size( + table_view const& left, + table_view const& right, + ast::expression const& binary_predicate, + null_equality compare_nulls = null_equality::EQUAL, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/scalar/scalar.hpp b/cpp/include/cudf/scalar/scalar.hpp index 13a3da14cce..39bd2984095 100644 --- a/cpp/include/cudf/scalar/scalar.hpp +++ b/cpp/include/cudf/scalar/scalar.hpp @@ -706,13 +706,16 @@ class list_scalar : public scalar { */ class struct_scalar : public scalar { public: - struct_scalar() = delete; - ~struct_scalar() = default; - struct_scalar(struct_scalar&& other) = default; - struct_scalar(struct_scalar const& other) = default; + struct_scalar() = delete; + ~struct_scalar() = default; + struct_scalar(struct_scalar&& other) = default; struct_scalar& operator=(struct_scalar const& other) = delete; struct_scalar& operator=(struct_scalar&& other) = delete; + struct_scalar(struct_scalar const& other, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Construct a new struct scalar object from table_view. * diff --git a/cpp/include/cudf/strings/detail/utilities.cuh b/cpp/include/cudf/strings/detail/utilities.cuh index 8758a28885f..6894c34a077 100644 --- a/cpp/include/cudf/strings/detail/utilities.cuh +++ b/cpp/include/cudf/strings/detail/utilities.cuh @@ -18,7 +18,6 @@ #include #include #include -#include #include #include @@ -205,81 +204,6 @@ auto make_strings_children( return make_strings_children(size_and_exec_fn, strings_count, strings_count, stream, mr); } -/** - * @brief Creates child offsets, chars columns and null mask, null count of a strings column by - * applying the template function that can be used for computing the output size of each string as - * well as create the output. - * - * @tparam SizeAndExecuteFunction Function must accept an index and return a size. - * It must have members `d_offsets`, `d_chars`, and `d_validities` which are set to memory - * containing the offsets column, chars column and string validities during write. - * - * @param size_and_exec_fn This is called twice. Once for the output size of each string, which is - * written into the `d_offsets` array. After that, `d_chars` is set and this - * is called again to fill in the chars memory. 
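The defaulted struct_scalar copy constructor is replaced by one that also accepts a stream and memory resource, so callers can control where the deep copy of the underlying table happens; a brief sketch (where `original`, `stream`, and `mr` are assumed to already exist):

@code
cudf::struct_scalar copied{original};                     // deep copy on the default stream
cudf::struct_scalar copied_async{original, stream, mr};   // explicit stream and memory resource
@endcode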
The `d_validities` array may - * be modified to set the value `0` for the corresponding rows that contain - * null string elements. - * @param exec_size Range for executing the function `size_and_exec_fn`. - * @param strings_count Number of strings. - * @param mr Device memory resource used to allocate the returned columns' device memory. - * @param stream CUDA stream used for device memory operations and kernel launches. - * @return offsets child column, chars child column, null_mask, and null_count for a strings column. - */ -template -std::tuple, std::unique_ptr, rmm::device_buffer, size_type> -make_strings_children_with_null_mask( - SizeAndExecuteFunction size_and_exec_fn, - size_type exec_size, - size_type strings_count, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) -{ - auto offsets_column = make_numeric_column( - data_type{type_id::INT32}, strings_count + 1, mask_state::UNALLOCATED, stream, mr); - auto offsets_view = offsets_column->mutable_view(); - auto d_offsets = offsets_view.template data(); - size_and_exec_fn.d_offsets = d_offsets; - - auto validities = rmm::device_uvector(strings_count, stream); - size_and_exec_fn.d_validities = validities.begin(); - - // This is called twice: once for offsets and validities, and once for chars - auto for_each_fn = [exec_size, stream](SizeAndExecuteFunction& size_and_exec_fn) { - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - exec_size, - size_and_exec_fn); - }; - - // Compute the string sizes (storing in `d_offsets`) and string validities - for_each_fn(size_and_exec_fn); - - // Compute the offsets from string sizes - thrust::exclusive_scan( - rmm::exec_policy(stream), d_offsets, d_offsets + strings_count + 1, d_offsets); - - // Now build the chars column - auto const bytes = cudf::detail::get_value(offsets_view, strings_count, stream); - auto chars_column = create_chars_child_column(bytes, stream, mr); - - // Execute the function fn again to fill the chars column. - // Note that if the output chars column has zero size, the function fn should not be called to - // avoid accidentally overwriting the offsets. - if (bytes > 0) { - size_and_exec_fn.d_chars = chars_column->mutable_view().template data(); - for_each_fn(size_and_exec_fn); - } - - // Finally compute null mask and null count from the validities array - auto [null_mask, null_count] = cudf::detail::valid_if( - validities.begin(), validities.end(), thrust::identity{}, stream, mr); - - return std::make_tuple(std::move(offsets_column), - std::move(chars_column), - null_count > 0 ? std::move(null_mask) : rmm::device_buffer{}, - null_count); -} - // This template is a thin wrapper around per-context singleton objects. // It maintains a single object for each CUDA context. template diff --git a/cpp/include/cudf/strings/replace_re.hpp b/cpp/include/cudf/strings/replace_re.hpp index 28ab19e53d9..087d1a94603 100644 --- a/cpp/include/cudf/strings/replace_re.hpp +++ b/cpp/include/cudf/strings/replace_re.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -72,22 +72,24 @@ std::unique_ptr replace_re( /** * @brief For each string, replaces any character sequence matching the given pattern - * using the repl template for back-references. 
+ * using the replacement template for back-references. * * Any null string entries return corresponding null output column entries. * * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. * + * @throw cudf::logic_error if capture index values in `replacement` are not in range 1-99 + * * @param strings Strings instance for this operation. * @param pattern The regular expression patterns to search within each string. - * @param repl The replacement template for creating the output string. + * @param replacement The replacement template for creating the output string. * @param mr Device memory resource used to allocate the returned column's device memory. * @return New strings column. */ std::unique_ptr replace_with_backrefs( strings_column_view const& strings, std::string const& pattern, - std::string const& repl, + std::string const& replacement, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace strings diff --git a/cpp/include/cudf/transform.hpp b/cpp/include/cudf/transform.hpp index f5880e9b37f..af2858d948e 100644 --- a/cpp/include/cudf/transform.hpp +++ b/cpp/include/cudf/transform.hpp @@ -16,6 +16,7 @@ #pragma once +#include #include #include @@ -74,6 +75,24 @@ std::pair, size_type> nans_to_nulls( column_view const& input, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Compute a new column by evaluating an expression tree on a table. + * + * This evaluates an expression over a table to produce a new column. Also called an n-ary + * transform. + * + * @throws cudf::logic_error if passed an expression operating on table_reference::RIGHT. + * + * @param table The table used for expression evaluation. + * @param expr The root of the expression tree. + * @param mr Device memory resource. + * @return std::unique_ptr Output column. + */ +std::unique_ptr compute_column( + table_view const& table, + ast::expression const& expr, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Creates a bitmask from a column of boolean elements. 
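Since compute_column is the new public entry point for evaluating an expression tree over a table, a small usage sketch may help (here `input` is assumed to be an existing cudf::table_view; column references default to the left table):

@code
// Compute col0 * col1 + col2 as a new column.
auto c0 = cudf::ast::column_reference(0);
auto c1 = cudf::ast::column_reference(1);
auto c2 = cudf::ast::column_reference(2);

auto product = cudf::ast::operation(cudf::ast::ast_operator::MUL, c0, c1);
auto expr    = cudf::ast::operation(cudf::ast::ast_operator::ADD, product, c2);

std::unique_ptr<cudf::column> result = cudf::compute_column(input, expr);
@endcode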
* diff --git a/cpp/src/aggregation/aggregation.cpp b/cpp/src/aggregation/aggregation.cpp index 016f2367139..f0c522257fb 100644 --- a/cpp/src/aggregation/aggregation.cpp +++ b/cpp/src/aggregation/aggregation.cpp @@ -362,6 +362,8 @@ std::unique_ptr make_sum_aggregation() } template std::unique_ptr make_sum_aggregation(); template std::unique_ptr make_sum_aggregation(); +template std::unique_ptr make_sum_aggregation(); +template std::unique_ptr make_sum_aggregation(); /// Factory to create a PRODUCT aggregation template @@ -370,6 +372,7 @@ std::unique_ptr make_product_aggregation() return std::make_unique(); } template std::unique_ptr make_product_aggregation(); +template std::unique_ptr make_product_aggregation(); /// Factory to create a MIN aggregation template @@ -379,6 +382,8 @@ std::unique_ptr make_min_aggregation() } template std::unique_ptr make_min_aggregation(); template std::unique_ptr make_min_aggregation(); +template std::unique_ptr make_min_aggregation(); +template std::unique_ptr make_min_aggregation(); /// Factory to create a MAX aggregation template @@ -388,6 +393,8 @@ std::unique_ptr make_max_aggregation() } template std::unique_ptr make_max_aggregation(); template std::unique_ptr make_max_aggregation(); +template std::unique_ptr make_max_aggregation(); +template std::unique_ptr make_max_aggregation(); /// Factory to create a COUNT aggregation template @@ -401,6 +408,10 @@ template std::unique_ptr make_count_aggregation( null_policy null_handling); template std::unique_ptr make_count_aggregation( null_policy null_handling); +template std::unique_ptr make_count_aggregation( + null_policy null_handling); +template std::unique_ptr make_count_aggregation( + null_policy null_handling); /// Factory to create a ANY aggregation template @@ -425,6 +436,8 @@ std::unique_ptr make_sum_of_squares_aggregation() return std::make_unique(); } template std::unique_ptr make_sum_of_squares_aggregation(); +template std::unique_ptr +make_sum_of_squares_aggregation(); /// Factory to create a MEAN aggregation template @@ -434,6 +447,7 @@ std::unique_ptr make_mean_aggregation() } template std::unique_ptr make_mean_aggregation(); template std::unique_ptr make_mean_aggregation(); +template std::unique_ptr make_mean_aggregation(); /// Factory to create a M2 aggregation template @@ -442,6 +456,7 @@ std::unique_ptr make_m2_aggregation() return std::make_unique(); } template std::unique_ptr make_m2_aggregation(); +template std::unique_ptr make_m2_aggregation(); /// Factory to create a VARIANCE aggregation template @@ -450,6 +465,8 @@ std::unique_ptr make_variance_aggregation(size_type ddof) return std::make_unique(ddof); } template std::unique_ptr make_variance_aggregation(size_type ddof); +template std::unique_ptr make_variance_aggregation( + size_type ddof); /// Factory to create a STD aggregation template @@ -458,6 +475,8 @@ std::unique_ptr make_std_aggregation(size_type ddof) return std::make_unique(ddof); } template std::unique_ptr make_std_aggregation(size_type ddof); +template std::unique_ptr make_std_aggregation( + size_type ddof); /// Factory to create a MEDIAN aggregation template @@ -466,6 +485,7 @@ std::unique_ptr make_median_aggregation() return std::make_unique(); } template std::unique_ptr make_median_aggregation(); +template std::unique_ptr make_median_aggregation(); /// Factory to create a QUANTILE aggregation template @@ -475,6 +495,8 @@ std::unique_ptr make_quantile_aggregation(std::vector const& q, in } template std::unique_ptr make_quantile_aggregation( std::vector const& q, 
interpolation i); +template std::unique_ptr make_quantile_aggregation( + std::vector const& q, interpolation i); /// Factory to create an ARGMAX aggregation template @@ -484,6 +506,7 @@ std::unique_ptr make_argmax_aggregation() } template std::unique_ptr make_argmax_aggregation(); template std::unique_ptr make_argmax_aggregation(); +template std::unique_ptr make_argmax_aggregation(); /// Factory to create an ARGMIN aggregation template @@ -493,6 +516,7 @@ std::unique_ptr make_argmin_aggregation() } template std::unique_ptr make_argmin_aggregation(); template std::unique_ptr make_argmin_aggregation(); +template std::unique_ptr make_argmin_aggregation(); /// Factory to create an NUNIQUE aggregation template @@ -502,6 +526,8 @@ std::unique_ptr make_nunique_aggregation(null_policy null_handling) } template std::unique_ptr make_nunique_aggregation( null_policy null_handling); +template std::unique_ptr make_nunique_aggregation( + null_policy null_handling); /// Factory to create an NTH_ELEMENT aggregation template @@ -511,6 +537,8 @@ std::unique_ptr make_nth_element_aggregation(size_type n, null_policy null } template std::unique_ptr make_nth_element_aggregation( size_type n, null_policy null_handling); +template std::unique_ptr make_nth_element_aggregation( + size_type n, null_policy null_handling); /// Factory to create a ROW_NUMBER aggregation template @@ -528,6 +556,8 @@ std::unique_ptr make_rank_aggregation() return std::make_unique(); } template std::unique_ptr make_rank_aggregation(); +template std::unique_ptr +make_rank_aggregation(); /// Factory to create a DENSE_RANK aggregation template @@ -536,6 +566,8 @@ std::unique_ptr make_dense_rank_aggregation() return std::make_unique(); } template std::unique_ptr make_dense_rank_aggregation(); +template std::unique_ptr +make_dense_rank_aggregation(); /// Factory to create a COLLECT_LIST aggregation template @@ -547,6 +579,8 @@ template std::unique_ptr make_collect_list_aggregation null_policy null_handling); template std::unique_ptr make_collect_list_aggregation( null_policy null_handling); +template std::unique_ptr make_collect_list_aggregation( + null_policy null_handling); /// Factory to create a COLLECT_SET aggregation template @@ -560,6 +594,8 @@ template std::unique_ptr make_collect_set_aggregation( null_policy null_handling, null_equality nulls_equal, nan_equality nans_equal); template std::unique_ptr make_collect_set_aggregation( null_policy null_handling, null_equality nulls_equal, nan_equality nans_equal); +template std::unique_ptr make_collect_set_aggregation( + null_policy null_handling, null_equality nulls_equal, nan_equality nans_equal); /// Factory to create a LAG aggregation template @@ -605,6 +641,7 @@ std::unique_ptr make_merge_lists_aggregation() return std::make_unique(); } template std::unique_ptr make_merge_lists_aggregation(); +template std::unique_ptr make_merge_lists_aggregation(); /// Factory to create a MERGE_SETS aggregation template @@ -615,6 +652,8 @@ std::unique_ptr make_merge_sets_aggregation(null_equality nulls_equal, } template std::unique_ptr make_merge_sets_aggregation(null_equality, nan_equality); +template std::unique_ptr make_merge_sets_aggregation( + null_equality, nan_equality); /// Factory to create a MERGE_M2 aggregation template @@ -623,6 +662,7 @@ std::unique_ptr make_merge_m2_aggregation() return std::make_unique(); } template std::unique_ptr make_merge_m2_aggregation(); +template std::unique_ptr make_merge_m2_aggregation(); namespace detail { namespace { diff --git a/cpp/src/ast/linearizer.cpp 
b/cpp/src/ast/expression_parser.cpp similarity index 61% rename from cpp/src/ast/linearizer.cpp rename to cpp/src/ast/expression_parser.cpp index 3e442305552..1072bff43dd 100644 --- a/cpp/src/ast/linearizer.cpp +++ b/cpp/src/ast/expression_parser.cpp @@ -13,9 +13,9 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include -#include -#include +#include +#include +#include #include #include #include @@ -56,7 +56,7 @@ device_data_reference::device_data_reference(device_data_reference_type referenc { } -cudf::size_type linearizer::intermediate_counter::take() +cudf::size_type expression_parser::intermediate_counter::take() { auto const first_missing = find_first_missing(); used_values.insert(used_values.cbegin() + first_missing, first_missing); @@ -64,7 +64,7 @@ cudf::size_type linearizer::intermediate_counter::take() return first_missing; } -void linearizer::intermediate_counter::give(cudf::size_type value) +void expression_parser::intermediate_counter::give(cudf::size_type value) { // TODO: add comment auto const lower_bound = std::lower_bound(used_values.cbegin(), used_values.cend(), value); @@ -72,18 +72,7 @@ void linearizer::intermediate_counter::give(cudf::size_type value) used_values.erase(lower_bound); } -/** - * @brief Find the first missing value in a contiguous sequence of integers. - * - * From a sorted container of integers, find the first "missing" value. - * For example, {0, 1, 2, 4, 5} is missing 3, and {1, 2, 3} is missing 0. - * If there are no missing values, return the size of the container. - * - * @param start Starting index. - * @param end Ending index. - * @return cudf::size_type Smallest value not already in the container. - */ -cudf::size_type linearizer::intermediate_counter::find_first_missing() const +cudf::size_type expression_parser::intermediate_counter::find_first_missing() const { if (used_values.empty() || (used_values.front() != 0)) { return 0; } // Search for the first non-contiguous pair of elements. @@ -94,42 +83,62 @@ cudf::size_type linearizer::intermediate_counter::find_first_missing() const : used_values.size(); // No missing elements. Return the next element in the sequence. } -cudf::size_type linearizer::visit(literal const& expr) +cudf::size_type expression_parser::visit(literal const& expr) { - _node_count++; // Increment the node index - auto const data_type = expr.get_data_type(); // Resolve node type - auto device_view = expr.get_value(); // Construct a scalar device view - auto const literal_index = cudf::size_type(_literals.size()); // Push literal - _literals.push_back(device_view); - auto const source = detail::device_data_reference( - detail::device_data_reference_type::LITERAL, data_type, literal_index); // Push data reference - return add_data_reference(source); + if (_expression_count == 0) { + // Handle the trivial case of a literal as the entire expression. 
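// Illustrative note: with this branch taken, parsing a bare literal (or, below, a bare
// column_reference such as cudf::ast::column_reference(0)) behaves as if the caller had
// written operation(ast_operator::IDENTITY, <expr>), so every parsed tree is rooted at an
// operation whose result is directed to the output column.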
+ return visit(operation(ast_operator::IDENTITY, expr)); + } else { + _expression_count++; // Increment the expression index + auto const data_type = expr.get_data_type(); // Resolve expression type + auto device_view = expr.get_value(); // Construct a scalar device view + auto const literal_index = cudf::size_type(_literals.size()); // Push literal + _literals.push_back(device_view); + auto const source = detail::device_data_reference(detail::device_data_reference_type::LITERAL, + data_type, + literal_index); // Push data reference + return add_data_reference(source); + } } -cudf::size_type linearizer::visit(column_reference const& expr) +cudf::size_type expression_parser::visit(column_reference const& expr) { - // Increment the node index - _node_count++; - // Resolve node type - auto const data_type = expr.get_table_source() == table_reference::LEFT - ? expr.get_data_type(_left) - : expr.get_data_type(_right); - // Push data reference - auto const source = detail::device_data_reference(detail::device_data_reference_type::COLUMN, - data_type, - expr.get_column_index(), - expr.get_table_source()); - return add_data_reference(source); + if (_expression_count == 0) { + // Handle the trivial case of a column reference as the entire expression. + return visit(operation(ast_operator::IDENTITY, expr)); + } else { + // Increment the expression index + _expression_count++; + // Resolve expression type + cudf::data_type data_type; + if (expr.get_table_source() == table_reference::LEFT) { + data_type = expr.get_data_type(_left); + } else { + if (_right.has_value()) { + data_type = expr.get_data_type(*_right); + } else { + CUDF_FAIL( + "Your expression contains a reference to the RIGHT table even though it will only be " + "evaluated on a single table (by convention, the LEFT table)."); + } + } + // Push data reference + auto const source = detail::device_data_reference(detail::device_data_reference_type::COLUMN, + data_type, + expr.get_column_index(), + expr.get_table_source()); + return add_data_reference(source); + } } -cudf::size_type linearizer::visit(expression const& expr) +cudf::size_type expression_parser::visit(operation const& expr) { - // Increment the node index - auto const node_index = _node_count++; - // Visit children (operands) of this node + // Increment the expression index + auto const expression_index = _expression_count++; + // Visit children (operands) of this expression auto const operand_data_ref_indices = visit_operands(expr.get_operands()); // Resolve operand types - auto data_ref = [this](auto const& index) { return data_references()[index].data_type; }; + auto data_ref = [this](auto const& index) { return _data_references[index].data_type; }; auto begin = thrust::make_transform_iterator(operand_data_ref_indices.cbegin(), data_ref); auto end = begin + operand_data_ref_indices.size(); auto const operand_types = std::vector(begin, end); @@ -145,29 +154,30 @@ cudf::size_type linearizer::visit(expression const& expr) operand_data_ref_indices.cbegin(), operand_data_ref_indices.cend(), [this](auto const& data_reference_index) { - auto const operand_source = data_references()[data_reference_index]; + auto const operand_source = _data_references[data_reference_index]; if (operand_source.reference_type == detail::device_data_reference_type::INTERMEDIATE) { auto const intermediate_index = operand_source.data_index; _intermediate_counter.give(intermediate_index); } }); - // Resolve node type + // Resolve expression type auto const op = expr.get_operator(); auto const data_type = 
cudf::ast::detail::ast_operator_return_type(op, operand_types); _operators.push_back(op); // Push data reference auto const output = [&]() { - if (node_index == 0) { - // This node is the root. Output should be directed to the output column. + if (expression_index == 0) { + // This expression is the root. Output should be directed to the output column. return detail::device_data_reference( detail::device_data_reference_type::COLUMN, data_type, 0, table_reference::OUTPUT); } else { - // This node is not the root. Output is an intermediate value. + // This expression is not the root. Output is an intermediate value. // Ensure that the output type is fixed width and fits in the intermediate storage. if (!cudf::is_fixed_width(data_type)) { CUDF_FAIL( "The output data type is not a fixed-width type but must be stored in an intermediate."); - } else if (cudf::size_of(data_type) > sizeof(std::int64_t)) { + } else if (cudf::size_of(data_type) > (_has_nulls ? sizeof(IntermediateDataType) + : sizeof(IntermediateDataType))) { CUDF_FAIL("The output data type is too large to be stored in an intermediate."); } return detail::device_data_reference( @@ -183,14 +193,14 @@ cudf::size_type linearizer::visit(expression const& expr) return index; } -cudf::data_type linearizer::root_data_type() const +cudf::data_type expression_parser::output_type() const { - return data_references().empty() ? cudf::data_type(cudf::type_id::EMPTY) - : data_references().back().data_type; + return _data_references.empty() ? cudf::data_type(cudf::type_id::EMPTY) + : _data_references.back().data_type; } -std::vector linearizer::visit_operands( - std::vector> operands) +std::vector expression_parser::visit_operands( + std::vector> operands) { auto operand_data_reference_indices = std::vector(); for (auto const& operand : operands) { @@ -200,7 +210,7 @@ std::vector linearizer::visit_operands( return operand_data_reference_indices; } -cudf::size_type linearizer::add_data_reference(detail::device_data_reference data_ref) +cudf::size_type expression_parser::add_data_reference(detail::device_data_reference data_ref) { // If an equivalent data reference already exists, return its index. Otherwise add this data // reference and return the new index. @@ -215,16 +225,6 @@ cudf::size_type linearizer::add_data_reference(detail::device_data_reference dat } // namespace detail -cudf::size_type literal::accept(detail::linearizer& visitor) const { return visitor.visit(*this); } -cudf::size_type column_reference::accept(detail::linearizer& visitor) const -{ - return visitor.visit(*this); -} -cudf::size_type expression::accept(detail::linearizer& visitor) const -{ - return visitor.visit(*this); -} - } // namespace ast } // namespace cudf diff --git a/cpp/src/ast/expressions.cpp b/cpp/src/ast/expressions.cpp new file mode 100644 index 00000000000..88cc6650d6c --- /dev/null +++ b/cpp/src/ast/expressions.cpp @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include + +namespace cudf { +namespace ast { + +operation::operation(ast_operator op, expression const& input) : op(op), operands({input}) +{ + if (cudf::ast::detail::ast_operator_arity(op) != 1) { + CUDF_FAIL("The provided operator is not a unary operator."); + } +} + +operation::operation(ast_operator op, expression const& left, expression const& right) + : op(op), operands({left, right}) +{ + if (cudf::ast::detail::ast_operator_arity(op) != 2) { + CUDF_FAIL("The provided operator is not a binary operator."); + } +} + +cudf::size_type literal::accept(detail::expression_parser& visitor) const +{ + return visitor.visit(*this); +} +cudf::size_type column_reference::accept(detail::expression_parser& visitor) const +{ + return visitor.visit(*this); +} +cudf::size_type operation::accept(detail::expression_parser& visitor) const +{ + return visitor.visit(*this); +} + +} // namespace ast + +} // namespace cudf diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu index f5f3937089f..f4b6a8bf5fd 100644 --- a/cpp/src/copying/concatenate.cu +++ b/cpp/src/copying/concatenate.cu @@ -374,11 +374,18 @@ void traverse_children::operator()(host_span size_t { strings_column_view scv(b); - return a + (b.is_empty() - ? 0 - : cudf::detail::get_value( - scv.offsets(), scv.offset() + b.size(), stream) - - cudf::detail::get_value(scv.offsets(), scv.offset(), stream)); + return a + (scv.is_empty() ? 0 + // if the column is unsliced, skip the offset retrieval. + : scv.offset() > 0 + ? cudf::detail::get_value( + scv.offsets(), scv.offset() + scv.size(), stream) - + cudf::detail::get_value(scv.offsets(), scv.offset(), stream) + // if the offset() is 0, it can still be sliced to a shorter length. in this case + // we only need to read a single offset. otherwise just return the full length + // (chars_size()) + : scv.size() + 1 == scv.offsets().size() + ? 
scv.chars_size() + : cudf::detail::get_value(scv.offsets(), scv.size(), stream)); }); // note: output text must include "exceeds size_type range" for python error handling CUDF_EXPECTS(total_char_count <= static_cast(std::numeric_limits::max()), diff --git a/cpp/src/datetime/datetime_ops.cu b/cpp/src/datetime/datetime_ops.cu index 4d8acb3bd3b..9879a6c5423 100644 --- a/cpp/src/datetime/datetime_ops.cu +++ b/cpp/src/datetime/datetime_ops.cu @@ -83,12 +83,6 @@ static __device__ int16_t const days_until_month[2][13] = { {0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366} // For leap years }; -CUDA_DEVICE_CALLABLE uint8_t days_in_month(cuda::std::chrono::month mon, bool is_leap_year) -{ - return days_until_month[is_leap_year][unsigned{mon}] - - days_until_month[is_leap_year][unsigned{mon} - 1]; -} - // Round up the date to the last day of the month and return the // date only (without the time component) struct extract_last_day_of_month { @@ -96,18 +90,23 @@ struct extract_last_day_of_month { CUDA_DEVICE_CALLABLE timestamp_D operator()(Timestamp const ts) const { using namespace cuda::std::chrono; - // IDEAL: does not work with CUDA10.0 due to nvcc compiler bug - // cannot invoke ym_last_day.day() - // const year_month_day orig_ymd(floor(ts)); - // const year_month_day_last ym_last_day(orig_ymd.year(), month_day_last(orig_ymd.month())); - // return timestamp_D(sys_days(ym_last_day)); - - // Only has the days - time component is chopped off, which is what we want - auto const days_since_epoch = floor(ts); - auto const date = year_month_day(days_since_epoch); - auto const last_day = days_in_month(date.month(), date.year().is_leap()); + const year_month_day ymd(floor(ts)); + auto const ymdl = year_month_day_last{ymd.year() / ymd.month() / last}; + return timestamp_D{sys_days{ymdl}}; + } +}; - return timestamp_D(days_since_epoch + days(last_day - static_cast(date.day()))); +// Extract the number of days of the month +// A similar operator to `extract_last_day_of_month`, except this returns +// an integer while the other returns a timestamp. +struct days_in_month_op { + template + CUDA_DEVICE_CALLABLE int16_t operator()(Timestamp const ts) const + { + using namespace cuda::std::chrono; + auto const date = year_month_day(floor(ts)); + auto const ymdl = year_month_day_last(date.year() / date.month() / last); + return static_cast(unsigned{ymdl.day()}); } }; @@ -144,6 +143,7 @@ struct extract_quarter_op { } }; +// Returns true if the year is a leap year struct is_leap_year_op { template CUDA_DEVICE_CALLABLE bool operator()(Timestamp const ts) const @@ -220,22 +220,6 @@ struct add_calendrical_months_functor { { } - // std chrono implementation is copied here due to nvcc bug 2909685 - // https://howardhinnant.github.io/date_algorithms.html#days_from_civil - static CUDA_DEVICE_CALLABLE timestamp_D - compute_sys_days(cuda::std::chrono::year_month_day const& ymd) - { - const int yr = static_cast(ymd.year()) - (ymd.month() <= cuda::std::chrono::month{2}); - const unsigned mth = static_cast(ymd.month()); - const unsigned dy = static_cast(ymd.day()); - - const int era = (yr >= 0 ? yr : yr - 399) / 400; - const unsigned yoe = static_cast(yr - era * 400); // [0, 399] - const unsigned doy = (153 * (mth + (mth > 2 ? 
-3 : 9)) + 2) / 5 + dy - 1; // [0, 365] - const unsigned doe = yoe * 365 + yoe / 4 - yoe / 100 + doy; // [0, 146096] - return timestamp_D{duration_D{era * 146097 + static_cast(doe) - 719468}}; - } - template typename std::enable_if_t::value, void> operator()( rmm::cuda_stream_view stream) const @@ -265,15 +249,10 @@ struct add_calendrical_months_functor { // If the new date isn't valid, scale it back to the last day of the // month. - // IDEAL: if (!ymd.ok()) ymd = ymd.year()/ymd.month()/last; - auto month_days = days_in_month(ymd.month(), ymd.year().is_leap()); - if (unsigned{ymd.day()} > month_days) - ymd = ymd.year() / ymd.month() / day{month_days}; + if (!ymd.ok()) ymd = ymd.year() / ymd.month() / last; // Put back the time component to the date - return - // IDEAL: sys_days{ymd} + ... - compute_sys_days(ymd) + (time_val - days_since_epoch); + return sys_days{ymd} + (time_val - days_since_epoch); }); } }; @@ -393,6 +372,13 @@ std::unique_ptr is_leap_year(column_view const& column, return apply_datetime_op(column, stream, mr); } +std::unique_ptr days_in_month(column_view const& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return apply_datetime_op(column, stream, mr); +} + std::unique_ptr extract_quarter(column_view const& column, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) @@ -476,6 +462,13 @@ std::unique_ptr is_leap_year(column_view const& column, rmm::mr::device_ return detail::is_leap_year(column, rmm::cuda_stream_default, mr); } +std::unique_ptr days_in_month(column_view const& column, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::days_in_month(column, rmm::cuda_stream_default, mr); +} + std::unique_ptr extract_quarter(column_view const& column, rmm::mr::device_memory_resource* mr) { diff --git a/cpp/src/dictionary/detail/merge.cu b/cpp/src/dictionary/detail/merge.cu index 2ff0a3e0a2a..e972403cad3 100644 --- a/cpp/src/dictionary/detail/merge.cu +++ b/cpp/src/dictionary/detail/merge.cu @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -62,8 +63,11 @@ std::unique_ptr merge(dictionary_column_view const& lcol, return make_dictionary_column( std::make_unique(lcol.keys(), stream, mr), std::move(indices_column), - rmm::device_buffer{ - lcol.has_nulls() || rcol.has_nulls() ? static_cast(merged_size) : 0, stream, mr}, + cudf::detail::create_null_mask( + lcol.has_nulls() || rcol.has_nulls() ? 
static_cast(merged_size) : 0, + mask_state::UNINITIALIZED, + stream, + mr), lcol.null_count() + rcol.null_count()); } diff --git a/cpp/src/groupby/common/utils.hpp b/cpp/src/groupby/common/utils.hpp index e8d5c60f81a..3da20fb9af3 100644 --- a/cpp/src/groupby/common/utils.hpp +++ b/cpp/src/groupby/common/utils.hpp @@ -24,8 +24,10 @@ namespace cudf { namespace groupby { namespace detail { -inline std::vector extract_results( - host_span requests, cudf::detail::result_cache& cache) + +template +inline std::vector extract_results(host_span requests, + cudf::detail::result_cache& cache) { std::vector results(requests.size()); diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu index 8c43c071a85..a26d69e3d46 100644 --- a/cpp/src/groupby/groupby.cu +++ b/cpp/src/groupby/groupby.cu @@ -120,7 +120,8 @@ struct empty_column_constructor { }; /// Make an empty table with appropriate types for requested aggs -auto empty_results(host_span requests) +template +auto empty_results(host_span requests) { std::vector empty_results; @@ -144,7 +145,8 @@ auto empty_results(host_span requests) } /// Verifies the agg requested on the request's values is valid -void verify_valid_requests(host_span requests) +template +void verify_valid_requests(host_span requests) { CUDF_EXPECTS( std::all_of( @@ -184,7 +186,7 @@ std::pair, std::vector> groupby::aggr // Compute scan requests std::pair, std::vector> groupby::scan( - host_span requests, rmm::mr::device_memory_resource* mr) + host_span requests, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); CUDF_EXPECTS( diff --git a/cpp/src/groupby/sort/scan.cpp b/cpp/src/groupby/sort/scan.cpp index 450a8313402..c43df77bb5e 100644 --- a/cpp/src/groupby/sort/scan.cpp +++ b/cpp/src/groupby/sort/scan.cpp @@ -152,7 +152,7 @@ void scan_result_functor::operator()(aggregation const& // Sort-based groupby std::pair, std::vector> groupby::sort_scan( - host_span requests, + host_span requests, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { diff --git a/cpp/src/interop/detail/arrow_allocator.cpp b/cpp/src/interop/detail/arrow_allocator.cpp new file mode 100644 index 00000000000..cb67c893573 --- /dev/null +++ b/cpp/src/interop/detail/arrow_allocator.cpp @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +namespace cudf { +namespace detail { + +std::unique_ptr allocate_arrow_buffer(const int64_t size, arrow::MemoryPool* ar_mr) +{ + /* + nvcc 11.0 generates Internal Compiler Error during codegen when arrow::AllocateBuffer + and `ValueOrDie` are used inside a CUDA compilation unit. 
+ + To work around this issue we compile an allocation shim in C++ and use + that from our cuda sources + */ + auto result = arrow::AllocateBuffer(size, ar_mr); + CUDF_EXPECTS(result.ok(), "Failed to allocate Arrow buffer"); + return std::move(result).ValueOrDie(); +} + +std::shared_ptr allocate_arrow_bitmap(const int64_t size, arrow::MemoryPool* ar_mr) +{ + /* + nvcc 11.0 generates Internal Compiler Error during codegen when arrow::AllocateBuffer + and `ValueOrDie` are used inside a CUDA compilation unit. + + To work around this issue we compile an allocation shim in C++ and use + that from our cuda sources + */ + auto result = arrow::AllocateBitmap(size, ar_mr); + CUDF_EXPECTS(result.ok(), "Failed to allocate Arrow bitmap"); + return std::move(result).ValueOrDie(); +} + +} // namespace detail +} // namespace cudf diff --git a/cpp/src/interop/detail/arrow_allocator.hpp b/cpp/src/interop/detail/arrow_allocator.hpp new file mode 100644 index 00000000000..20099f91afa --- /dev/null +++ b/cpp/src/interop/detail/arrow_allocator.hpp @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace cudf { +namespace detail { + +// unique_ptr because that is what AllocateBuffer returns +std::unique_ptr allocate_arrow_buffer(const int64_t size, arrow::MemoryPool* ar_mr); + +// shared_ptr because that is what AllocateBitmap returns +std::shared_ptr allocate_arrow_bitmap(const int64_t size, arrow::MemoryPool* ar_mr); + +} // namespace detail +} // namespace cudf diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index 3cd515e9981..3271804bf39 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -34,6 +34,8 @@ #include #include +#include "detail/arrow_allocator.hpp" + namespace cudf { namespace detail { namespace { @@ -48,10 +50,7 @@ std::shared_ptr fetch_data_buffer(column_view input_view, { const int64_t data_size_in_bytes = sizeof(T) * input_view.size(); - auto result = arrow::AllocateBuffer(data_size_in_bytes, ar_mr); - CUDF_EXPECTS(result.ok(), "Failed to allocate Arrow buffer for data"); - - std::shared_ptr data_buffer = std::move(result.ValueOrDie()); + auto data_buffer = allocate_arrow_buffer(data_size_in_bytes, ar_mr); CUDA_TRY(cudaMemcpyAsync(data_buffer->mutable_data(), input_view.data(), @@ -59,7 +58,7 @@ std::shared_ptr fetch_data_buffer(column_view input_view, cudaMemcpyDeviceToHost, stream.value())); - return data_buffer; + return std::move(data_buffer); } /** @@ -72,9 +71,7 @@ std::shared_ptr fetch_mask_buffer(column_view input_view, const int64_t mask_size_in_bytes = cudf::bitmask_allocation_size_bytes(input_view.size()); if (input_view.has_nulls()) { - auto result = arrow::AllocateBitmap(static_cast(input_view.size()), ar_mr); - CUDF_EXPECTS(result.ok(), "Failed to allocate Arrow buffer for mask"); - std::shared_ptr mask_buffer = std::move(result.ValueOrDie()); + auto mask_buffer = allocate_arrow_bitmap(static_cast(input_view.size()), 
ar_mr); CUDA_TRY(cudaMemcpyAsync( mask_buffer->mutable_data(), (input_view.offset() > 0) ? cudf::copy_bitmask(input_view).data() : input_view.null_mask(), @@ -163,10 +160,7 @@ std::shared_ptr dispatch_to_arrow::operator()( }); auto const buf_size_in_bytes = buf.size() * sizeof(DeviceType); - auto result = arrow::AllocateBuffer(buf_size_in_bytes, ar_mr); - CUDF_EXPECTS(result.ok(), "Failed to allocate Arrow buffer for data"); - - std::shared_ptr data_buffer = std::move(result.ValueOrDie()); + auto data_buffer = allocate_arrow_buffer(buf_size_in_bytes, ar_mr); CUDA_TRY(cudaMemcpyAsync(data_buffer->mutable_data(), buf.data(), @@ -176,7 +170,7 @@ std::shared_ptr dispatch_to_arrow::operator()( auto type = arrow::decimal(18, -input.type().scale()); auto mask = fetch_mask_buffer(input, ar_mr, stream); - auto buffers = std::vector>{mask, data_buffer}; + auto buffers = std::vector>{mask, std::move(data_buffer)}; auto data = std::make_shared(type, input.size(), buffers); return std::make_shared(data); @@ -191,10 +185,7 @@ std::shared_ptr dispatch_to_arrow::operator()(column_view in { auto bitmask = bools_to_mask(input, stream); - auto result = arrow::AllocateBuffer(static_cast(bitmask.first->size()), ar_mr); - CUDF_EXPECTS(result.ok(), "Failed to allocate Arrow buffer for data"); - - std::shared_ptr data_buffer = std::move(result.ValueOrDie()); + auto data_buffer = allocate_arrow_buffer(static_cast(bitmask.first->size()), ar_mr); CUDA_TRY(cudaMemcpyAsync(data_buffer->mutable_data(), bitmask.first->data(), @@ -203,7 +194,7 @@ std::shared_ptr dispatch_to_arrow::operator()(column_view in stream.value())); return to_arrow_array(id, static_cast(input.size()), - data_buffer, + std::move(data_buffer), fetch_mask_buffer(input, ar_mr, stream), static_cast(input.null_count())); } @@ -225,19 +216,13 @@ std::shared_ptr dispatch_to_arrow::operator()( column_view input_view = (tmp_column != nullptr) ? 
tmp_column->view() : input; auto child_arrays = fetch_child_array(input_view, {{}, {}}, ar_mr, stream); if (child_arrays.empty()) { - arrow::Result> result; - // Empty string will have only one value in offset of 4 bytes - result = arrow::AllocateBuffer(4, ar_mr); - CUDF_EXPECTS(result.ok(), "Failed to allocate buffer"); - std::shared_ptr tmp_offset_buffer = std::move(result.ValueOrDie()); - tmp_offset_buffer->mutable_data()[0] = 0; - - result = arrow::AllocateBuffer(0, ar_mr); - CUDF_EXPECTS(result.ok(), "Failed to allocate buffer"); - std::shared_ptr tmp_data_buffer = std::move(result.ValueOrDie()); + auto tmp_offset_buffer = allocate_arrow_buffer(4, ar_mr); + auto tmp_data_buffer = allocate_arrow_buffer(0, ar_mr); + tmp_offset_buffer->mutable_data()[0] = 0; - return std::make_shared(0, tmp_offset_buffer, tmp_data_buffer); + return std::make_shared( + 0, std::move(tmp_offset_buffer), std::move(tmp_data_buffer)); } auto offset_buffer = child_arrays[0]->data()->buffers[1]; auto data_buffer = child_arrays[1]->data()->buffers[1]; diff --git a/cpp/src/io/csv/csv_gpu.cu b/cpp/src/io/csv/csv_gpu.cu index 68ac67b900d..4d3736a41f0 100644 --- a/cpp/src/io/csv/csv_gpu.cu +++ b/cpp/src/io/csv/csv_gpu.cu @@ -193,7 +193,7 @@ __global__ void __launch_bounds__(csvparse_block_dim) int actual_col = 0; // Going through all the columns of a given record - while (col < column_flags.size() && field_start <= row_end) { + while (col < column_flags.size() && field_start < row_end) { auto next_delimiter = cudf::io::gpu::seek_field_end(field_start, row_end, opts); // Checking if this is a column that the user wants --- user can filter columns @@ -579,7 +579,7 @@ __global__ void __launch_bounds__(csvparse_block_dim) int col = 0; int actual_col = 0; - while (col < column_flags.size() && field_start <= row_end) { + while (col < column_flags.size() && field_start < row_end) { auto next_delimiter = cudf::io::gpu::seek_field_end(next_field, row_end, options); if (column_flags[col] & column_parse::enabled) { diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index 70ce0fce1cc..7f85589a8aa 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -49,18 +49,6 @@ using cudf::device_span; using cudf::host_span; using cudf::detail::make_device_uvector_async; -namespace { -/** - * @brief Helper class to support inline-overloading for all of a variant's alternative types - */ -template -struct VisitorOverload : Ts... { - using Ts::operator()...; -}; -template -VisitorOverload(Ts...) 
-> VisitorOverload; -} // namespace - namespace cudf { namespace io { namespace detail { @@ -280,6 +268,41 @@ reader::impl::select_data_and_row_offsets(rmm::cuda_stream_view stream) return {rmm::device_uvector{0, stream}, selected_rows_offsets{stream}}; } +std::vector reader::impl::select_data_types( + std::map const& col_type_map) +{ + std::vector selected_dtypes; + + for (int col = 0; col < num_actual_cols_; col++) { + if (column_flags_[col] & column_parse::enabled) { + auto const col_type_it = col_type_map.find(col_names_[col]); + CUDF_EXPECTS(col_type_it != col_type_map.end(), + "Must specify data types for all active columns"); + selected_dtypes.emplace_back(col_type_it->second); + } + } + return selected_dtypes; +} + +std::vector reader::impl::select_data_types(std::vector const& dtypes) +{ + std::vector selected_dtypes; + + if (dtypes.size() == 1) { + // If it's a single dtype, assign that dtype to all active columns + selected_dtypes.resize(num_active_cols_, dtypes.front()); + } else { + // If it's a list, assign dtypes to active columns in the given order + CUDF_EXPECTS(static_cast(dtypes.size()) >= num_actual_cols_, + "Must specify data types for all columns"); + + for (int col = 0; col < num_actual_cols_; col++) { + if (column_flags_[col] & column_parse::enabled) { selected_dtypes.emplace_back(dtypes[col]); } + } + } + return selected_dtypes; +} + table_with_metadata reader::impl::read(rmm::cuda_stream_view stream) { auto const data_row_offsets = select_data_and_row_offsets(stream); @@ -355,13 +378,13 @@ table_with_metadata reader::impl::read(rmm::cuda_stream_view stream) } } - // User can specify which columns should be inferred as datetime - if (!opts_.get_infer_date_indexes().empty() || !opts_.get_infer_date_names().empty()) { - for (const auto index : opts_.get_infer_date_indexes()) { + // User can specify which columns should be read as datetime + if (!opts_.get_parse_dates_indexes().empty() || !opts_.get_parse_dates_names().empty()) { + for (const auto index : opts_.get_parse_dates_indexes()) { column_flags_[index] |= column_parse::as_datetime; } - for (const auto& name : opts_.get_infer_date_names()) { + for (const auto& name : opts_.get_parse_dates_names()) { auto it = std::find(col_names_.begin(), col_names_.end(), name); if (it != col_names_.end()) { column_flags_[it - col_names_.begin()] |= column_parse::as_datetime; @@ -369,6 +392,20 @@ table_with_metadata reader::impl::read(rmm::cuda_stream_view stream) } } + // User can specify which columns should be parsed as hexadecimal + if (!opts_.get_parse_hex_indexes().empty() || !opts_.get_parse_hex_names().empty()) { + for (const auto index : opts_.get_parse_hex_indexes()) { + column_flags_[index] |= column_parse::as_hexadecimal; + } + + for (const auto& name : opts_.get_parse_hex_names()) { + auto it = std::find(col_names_.begin(), col_names_.end(), name); + if (it != col_names_.end()) { + column_flags_[it - col_names_.begin()] |= column_parse::as_hexadecimal; + } + } + } + // Return empty table rather than exception if nothing to load if (num_active_cols_ == 0) { return {std::make_unique
(), {}}; } @@ -382,11 +419,8 @@ table_with_metadata reader::impl::read(rmm::cuda_stream_view stream) if (has_to_infer_column_types) { column_types = infer_column_types(data, row_offsets, stream); } else { - column_types = - std::visit(VisitorOverload{ - [&](const std::vector& data_types) { return data_types; }, - [&](const std::vector& dtypes) { return parse_column_types(dtypes); }}, - opts_.get_dtypes()); + column_types = std::visit([&](auto const& data_types) { return select_data_types(data_types); }, + opts_.get_dtypes()); } out_columns.reserve(column_types.size()); @@ -666,81 +700,6 @@ std::vector reader::impl::infer_column_types(device_span return dtypes; } -std::vector reader::impl::parse_column_types( - const std::vector& types_as_strings) -{ - std::vector dtypes; - - const bool is_dict = std::all_of(types_as_strings.begin(), - types_as_strings.end(), - [](const auto& s) { return s.find(':') != std::string::npos; }); - - if (!is_dict) { - if (types_as_strings.size() == 1) { - // If it's a single dtype, assign that dtype to all active columns - data_type dtype_; - column_parse::flags col_flags_; - std::tie(dtype_, col_flags_) = get_dtype_info(types_as_strings[0]); - dtypes.resize(num_active_cols_, dtype_); - for (int col = 0; col < num_actual_cols_; col++) { - column_flags_[col] |= col_flags_; - } - CUDF_EXPECTS(dtypes.back().id() != cudf::type_id::EMPTY, "Unsupported data type"); - } else { - // If it's a list, assign dtypes to active columns in the given order - CUDF_EXPECTS(static_cast(types_as_strings.size()) >= num_actual_cols_, - "Must specify data types for all columns"); - - auto dtype_ = std::back_inserter(dtypes); - - for (int col = 0; col < num_actual_cols_; col++) { - if (column_flags_[col] & column_parse::enabled) { - column_parse::flags col_flags_; - std::tie(dtype_, col_flags_) = get_dtype_info(types_as_strings[col]); - column_flags_[col] |= col_flags_; - CUDF_EXPECTS(dtypes.back().id() != cudf::type_id::EMPTY, "Unsupported data type"); - } - } - } - } else { - // Translate vector of `name : dtype` strings to map - // NOTE: Incoming pairs can be out-of-order from column names in dataset - std::unordered_map col_type_map; - for (const auto& pair : types_as_strings) { - const auto pos = pair.find_last_of(':'); - const auto name = pair.substr(0, pos); - const auto dtype = pair.substr(pos + 1, pair.size()); - col_type_map[name] = dtype; - } - - auto dtype_ = std::back_inserter(dtypes); - - for (int col = 0; col < num_actual_cols_; col++) { - if (column_flags_[col] & column_parse::enabled) { - CUDF_EXPECTS(col_type_map.find(col_names_[col]) != col_type_map.end(), - "Must specify data types for all active columns"); - column_parse::flags col_flags_; - std::tie(dtype_, col_flags_) = get_dtype_info(col_type_map[col_names_[col]]); - column_flags_[col] |= col_flags_; - CUDF_EXPECTS(dtypes.back().id() != cudf::type_id::EMPTY, "Unsupported data type"); - } - } - } - - if (opts_.get_timestamp_type().id() != cudf::type_id::EMPTY) { - for (auto& type : dtypes) { - if (cudf::is_timestamp(type)) { type = opts_.get_timestamp_type(); } - } - } - - for (size_t i = 0; i < dtypes.size(); i++) { - // Replace EMPTY dtype with STRING - if (dtypes[i].id() == type_id::EMPTY) { dtypes[i] = data_type{type_id::STRING}; } - } - - return dtypes; -} - std::vector reader::impl::decode_data(device_span data, device_span row_offsets, host_span column_types, diff --git a/cpp/src/io/csv/reader_impl.hpp b/cpp/src/io/csv/reader_impl.hpp index 29c6b48bc8a..4416457be16 100644 --- 
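For reference, the idiom behind both the removed VisitorOverload helper and the generic-lambda std::visit call above (and behind cudf::detail::visitor_overload, used a few hunks below for the JSON reader) is an overload set over a variant's alternatives. A minimal, self-contained sketch, with illustrative names and string dtypes standing in for cudf::data_type:

#include <iostream>
#include <map>
#include <string>
#include <variant>
#include <vector>

// Overload set built from lambdas; same shape as the removed VisitorOverload,
// but purely illustrative, not the cudf type itself.
template <typename... Ts>
struct overload : Ts... {
  using Ts::operator()...;
};
template <typename... Ts>
overload(Ts...) -> overload<Ts...>;  // deduction guide, implicit in C++20

// Stand-in for the reader option: dtypes given per position or keyed by column name.
using dtype_spec = std::variant<std::vector<std::string>, std::map<std::string, std::string>>;

int main()
{
  dtype_spec spec = std::map<std::string, std::string>{{"a", "int32"}, {"b", "float64"}};
  std::visit(overload{[](std::vector<std::string> const& v) {
                        std::cout << "dtypes in column order: " << v.size() << '\n';
                      },
                      [](std::map<std::string, std::string> const& m) {
                        std::cout << "dtypes by column name: " << m.size() << '\n';
                      }},
             spec);
  return 0;
}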
a/cpp/src/io/csv/reader_impl.hpp +++ b/cpp/src/io/csv/reader_impl.hpp @@ -182,13 +182,20 @@ class reader::impl { rmm::cuda_stream_view stream); /** - * @brief Parses the columns' data types from the vector of dtypes that are provided as strings. + * @brief Selects the columns' data types from the map of dtypes. * - * @param types_as_strings The vector of strings from which to parse the columns' target data - * types - * @return List of columns' data types + * @param col_type_map Column name -> data type map specifying the columns' target data types + * @return Sorted list of selected columns' data types */ - std::vector parse_column_types(std::vector const& types_as_strings); + std::vector select_data_types(std::map const& col_type_map); + + /** + * @brief Selects the columns' data types from the list of dtypes. + * + * @param dtypes Vector of data types specifying the columns' target data types + * @return Sorted list of selected columns' data types + */ + std::vector select_data_types(std::vector const& dtypes); /** * @brief Converts the row-column data and outputs to column bufferrs. diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index b4395d6c965..f1080342312 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -27,6 +27,7 @@ #include #include +#include #include #include #include @@ -50,7 +51,6 @@ namespace json { using namespace cudf::io; namespace { - /** * @brief Estimates the maximum expected length or a row, based on the number * of columns @@ -87,12 +87,12 @@ std::unique_ptr
aggregate_keys_info(std::unique_ptr<table>
info) auto const info_view = info->view(); std::vector requests; requests.emplace_back(groupby::aggregation_request{info_view.column(0)}); - requests.back().aggregations.emplace_back(make_min_aggregation()); - requests.back().aggregations.emplace_back(make_nth_element_aggregation(0)); + requests.back().aggregations.emplace_back(make_min_aggregation()); + requests.back().aggregations.emplace_back(make_nth_element_aggregation(0)); requests.emplace_back(groupby::aggregation_request{info_view.column(1)}); - requests.back().aggregations.emplace_back(make_min_aggregation()); - requests.back().aggregations.emplace_back(make_nth_element_aggregation(0)); + requests.back().aggregations.emplace_back(make_min_aggregation()); + requests.back().aggregations.emplace_back(make_nth_element_aggregation(0)); // Aggregate by hash values groupby::groupby gb_obj( @@ -236,7 +236,9 @@ void reader::impl::ingest_raw_input(size_t range_offset, size_t range_size) { size_t map_range_size = 0; if (range_size != 0) { - map_range_size = range_size + calculate_max_row_size(options_.get_dtypes().size()); + auto const dtype_option_size = + std::visit([](const auto& dtypes) { return dtypes.size(); }, options_.get_dtypes()); + map_range_size = range_size + calculate_max_row_size(dtype_option_size); } // Support delayed opening of the file if using memory mapping datasource @@ -467,44 +469,29 @@ void reader::impl::set_column_names(device_span rec_starts, void reader::impl::set_data_types(device_span rec_starts, rmm::cuda_stream_view stream) { - auto const dtype = options_.get_dtypes(); - if (!dtype.empty()) { - CUDF_EXPECTS(dtype.size() == metadata_.column_names.size(), - "Need to specify the type of each column.\n"); - - // Assume that the dtype is in dictionary format only if all elements contain a colon - const bool is_dict = - std::all_of(std::cbegin(dtype), std::cend(dtype), [](const std::string& s) { - return std::find(std::cbegin(s), std::cend(s), ':') != std::cend(s); - }); - - auto split_on_colon = [](std::string_view s) { - auto const i = s.find(":"); - return std::pair{s.substr(0, i), s.substr(i + 1)}; - }; - - if (is_dict) { - std::map col_type_map; - std::transform( - std::cbegin(dtype), - std::cend(dtype), - std::inserter(col_type_map, col_type_map.end()), - [&](auto const& ts) { - auto const [col_name, type_str] = split_on_colon(ts); - return std::pair{std::string{col_name}, convert_string_to_dtype(std::string{type_str})}; - }); - - // Using the map here allows O(n log n) complexity - std::transform(std::cbegin(metadata_.column_names), - std::cend(metadata_.column_names), - std::back_inserter(dtypes_), - [&](auto const& column_name) { return col_type_map[column_name]; }); - } else { - std::transform(std::cbegin(dtype), - std::cend(dtype), - std::back_inserter(dtypes_), - [](auto const& col_dtype) { return convert_string_to_dtype(col_dtype); }); - } + bool has_to_infer_column_types = + std::visit([](const auto& dtypes) { return dtypes.empty(); }, options_.get_dtypes()); + if (!has_to_infer_column_types) { + dtypes_ = std::visit(cudf::detail::visitor_overload{ + [&](const std::vector& dtypes) { + CUDF_EXPECTS(dtypes.size() == metadata_.column_names.size(), + "Must specify types for all columns"); + return dtypes; + }, + [&](const std::map& dtypes) { + std::vector sorted_dtypes; + std::transform(std::cbegin(metadata_.column_names), + std::cend(metadata_.column_names), + std::back_inserter(sorted_dtypes), + [&](auto const& column_name) { + auto const it = dtypes.find(column_name); + CUDF_EXPECTS(it != dtypes.end(), 
+ "Must specify types for all columns"); + return it->second; + }); + return sorted_dtypes; + }}, + options_.get_dtypes()); } else { CUDF_EXPECTS(rec_starts.size() != 0, "No data available for data type inference.\n"); auto const num_columns = metadata_.column_names.size(); diff --git a/cpp/src/io/orc/orc_gpu.h b/cpp/src/io/orc/orc_gpu.h index efc7b78cdb2..004812615eb 100644 --- a/cpp/src/io/orc/orc_gpu.h +++ b/cpp/src/io/orc/orc_gpu.h @@ -112,6 +112,7 @@ struct ColumnDesc { int32_t decimal_scale; // number of fractional decimal digits for decimal type int32_t ts_clock_rate; // output timestamp clock frequency (0=default, 1000=ms, 1000000000=ns) column_validity_info parent_validity_info; // consists of parent column valid_map and null count + uint32_t* parent_null_count_prefix_sums; // per-stripe prefix sums of parent column's null count }; /** @@ -138,7 +139,7 @@ struct EncChunk { int32_t scale; // scale for decimals or timestamps uint32_t* dict_index; // dictionary index from row index - device_span decimal_offsets; + uint32_t* decimal_offsets; column_device_view const* leaf_column; }; diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 033a2d9aff5..f7bd5ae86b8 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -759,6 +759,49 @@ void update_null_mask(cudf::detail::hostdevice_2dvector& chunks } } +/** + * @brief Compute the per-stripe prefix sum of null count, for each struct column in the current + * layer. + */ +void scan_null_counts(cudf::detail::hostdevice_2dvector const& chunks, + cudf::host_span> prefix_sums, + rmm::cuda_stream_view stream) +{ + auto const num_stripes = chunks.size().first; + if (num_stripes == 0) return; + + auto const num_columns = chunks.size().second; + std::vector>> prefix_sums_to_update; + for (auto col_idx = 0ul; col_idx < num_columns; ++col_idx) { + // Null counts sums are only needed for children of struct columns + if (chunks[0][col_idx].type_kind == STRUCT) { + prefix_sums_to_update.emplace_back(col_idx, prefix_sums[col_idx]); + } + } + auto const d_prefix_sums_to_update = + cudf::detail::make_device_uvector_async(prefix_sums_to_update, stream); + + thrust::for_each(rmm::exec_policy(stream), + d_prefix_sums_to_update.begin(), + d_prefix_sums_to_update.end(), + [chunks = cudf::detail::device_2dspan{chunks}] __device__( + auto const& idx_psums) { + auto const col_idx = idx_psums.first; + auto const psums = idx_psums.second; + + thrust::transform( + thrust::seq, + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(0) + psums.size(), + psums.begin(), + [&](auto stripe_idx) { return chunks[stripe_idx][col_idx].null_count; }); + + thrust::inclusive_scan(thrust::seq, psums.begin(), psums.end(), psums.begin()); + }); + // `prefix_sums_to_update` goes out of scope, copy has to be done before we return + stream.synchronize(); +} + void reader::impl::decode_stream_data(cudf::detail::hostdevice_2dvector& chunks, size_t num_dicts, size_t skip_rows, @@ -817,8 +860,6 @@ void reader::impl::decode_stream_data(cudf::detail::hostdevice_2dvector> out_buffers(_selected_columns.size()); std::vector schema_info; std::vector> lvl_stripe_data(_selected_columns.size()); + std::vector>> null_count_prefix_sums; table_metadata out_metadata; // There are no columns in the table @@ -1124,6 +1168,14 @@ table_with_metadata reader::impl::read(size_type skip_rows, // Logically view streams as columns std::vector stream_info; + null_count_prefix_sums.emplace_back(); + 
null_count_prefix_sums.back().reserve(_selected_columns[level].size()); + std::generate_n( + std::back_inserter(null_count_prefix_sums.back()), _selected_columns[level].size(), [&]() { + return cudf::detail::make_zeroed_device_uvector_async(total_num_stripes, + stream); + }); + // Tracker for eventually deallocating compressed and uncompressed data auto& stripe_data = lvl_stripe_data[level]; @@ -1207,10 +1259,12 @@ table_with_metadata reader::impl::read(size_type skip_rows, ? stripe_info->numberOfRows : _col_meta.num_child_rows_per_stripe[stripe_idx * num_columns + col_idx]; chunk.column_num_rows = (level == 0) ? num_rows : _col_meta.num_child_rows[col_idx]; - chunk.parent_validity_info.valid_map_base = - (level == 0) ? nullptr : _col_meta.parent_column_data[col_idx].valid_map_base; - chunk.parent_validity_info.null_count = - (level == 0) ? 0 : _col_meta.parent_column_data[col_idx].null_count; + chunk.parent_validity_info = + (level == 0) ? column_validity_info{} : _col_meta.parent_column_data[col_idx]; + chunk.parent_null_count_prefix_sums = + (level == 0) + ? nullptr + : null_count_prefix_sums[level - 1][_col_meta.parent_column_index[col_idx]].data(); chunk.encoding_kind = stripe_footer->columns[selected_columns[col_idx].id].kind; chunk.type_kind = _metadata->per_file_metadata[stripe_source_mapping.source_idx] .ff.types[selected_columns[col_idx].id] @@ -1336,6 +1390,7 @@ table_with_metadata reader::impl::read(size_type skip_rows, // Extract information to process nested child columns if (nested_col.size()) { + scan_null_counts(chunks, null_count_prefix_sums[level], stream); row_groups.device_to_host(stream, true); aggregate_child_meta(chunks, row_groups, out_buffers[level], nested_col, level); } diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp index 49c0c983992..7171b13d422 100644 --- a/cpp/src/io/orc/reader_impl.hpp +++ b/cpp/src/io/orc/reader_impl.hpp @@ -58,6 +58,7 @@ struct reader_column_meta { std::vector parent_column_data; // consists of parent column valid_map and null count + std::vector parent_column_index; std::vector child_start_row; // start row of child columns [stripe][column] std::vector diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index 75ccd19d77b..41ee285ac25 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -1167,8 +1167,17 @@ __global__ void __launch_bounds__(block_size) // No present stream: all rows are valid s->vals.u32[t] = ~0; } - while (s->top.nulls_desc_row < s->chunk.num_rows) { - uint32_t nrows_max = min(s->chunk.num_rows - s->top.nulls_desc_row, blockDim.x * 32); + auto const prev_parent_null_count = + (s->chunk.parent_null_count_prefix_sums != nullptr && stripe > 0) + ? s->chunk.parent_null_count_prefix_sums[stripe - 1] + : 0; + auto const parent_null_count = + (s->chunk.parent_null_count_prefix_sums != nullptr) + ? 
s->chunk.parent_null_count_prefix_sums[stripe] - prev_parent_null_count + : 0; + auto const num_elems = s->chunk.num_rows - parent_null_count; + while (s->top.nulls_desc_row < num_elems) { + uint32_t nrows_max = min(num_elems - s->top.nulls_desc_row, blockDim.x * 32); uint32_t nrows; size_t row_in; @@ -1187,7 +1196,7 @@ __global__ void __launch_bounds__(block_size) } __syncthreads(); - row_in = s->chunk.start_row + s->top.nulls_desc_row; + row_in = s->chunk.start_row + s->top.nulls_desc_row - prev_parent_null_count; if (row_in + nrows > first_row && row_in < first_row + max_num_rows && s->chunk.valid_map_base != NULL) { int64_t dst_row = row_in - first_row; @@ -1251,7 +1260,7 @@ __global__ void __launch_bounds__(block_size) // Sum up the valid counts and infer null_count null_count = block_reduce(temp_storage.bk_storage).Sum(null_count); if (t == 0) { - chunks[chunk_id].null_count = null_count; + chunks[chunk_id].null_count = parent_null_count + null_count; chunks[chunk_id].skip_count = s->chunk.skip_count; } } else { diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index d93845530d7..e0018ed7166 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -678,9 +678,7 @@ encoded_data writer::impl::encode_columns(orc_table_view const& orc_table, ck.dtype_len = column.type_width(); } ck.scale = column.scale(); - if (ck.type_kind == TypeKind::DECIMAL) { - ck.decimal_offsets = device_span{column.decimal_offsets(), ck.num_rows}; - } + if (ck.type_kind == TypeKind::DECIMAL) { ck.decimal_offsets = column.decimal_offsets(); } } } } @@ -1140,26 +1138,28 @@ void writer::impl::init_state() out_sink_->host_write(MAGIC, std::strlen(MAGIC)); } -/** - * @brief pre-order append ORC device columns - */ -void __device__ append_orc_device_column(uint32_t& idx, - thrust::optional parent_idx, - device_span cols, - column_device_view col) -{ - auto const current_idx = idx; - cols[current_idx] = orc_column_device_view{col, parent_idx}; - idx++; - if (col.type().id() == type_id::LIST) { - append_orc_device_column( - idx, current_idx, cols, col.child(lists_column_view::child_column_index)); +template +struct device_stack { + __device__ device_stack(T* stack_storage, int capacity) + : stack(stack_storage), capacity(capacity), size(0) + { } - if (col.type().id() == type_id::STRUCT) { - for (auto child_idx = 0; child_idx < col.num_child_columns(); ++child_idx) { - append_orc_device_column(idx, current_idx, cols, col.child(child_idx)); - } + __device__ void push(T const& val) + { + cudf_assert(size < capacity and "Stack overflow"); + stack[size++] = val; } + __device__ T pop() + { + cudf_assert(size > 0 and "Stack underflow"); + return stack[--size]; + } + __device__ bool empty() { return size == 0; } + + private: + T* stack; + int capacity; + int size; }; orc_table_view make_orc_table_view(table_view const& table, @@ -1189,13 +1189,40 @@ orc_table_view make_orc_table_view(table_view const& table, } rmm::device_uvector d_orc_columns(orc_columns.size(), stream); + using stack_value_type = thrust::pair>; + rmm::device_uvector stack_storage(orc_columns.size(), stream); + // pre-order append ORC device columns cudf::detail::device_single_thread( - [d_orc_cols = device_span{d_orc_columns}, - d_table = d_table] __device__() mutable { + [d_orc_cols = device_span{d_orc_columns}, + d_table = d_table, + stack_storage = stack_storage.data(), + stack_storage_size = stack_storage.size()] __device__() { + device_stack stack(stack_storage, stack_storage_size); + + 
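The lambda being assembled here replaces the recursive append_orc_device_column with an explicit stack; children are pushed in reverse so the traversal stays pre-order. A host-side sketch of the same idea, with a made-up node type rather than column_device_view:

#include <iostream>
#include <stack>
#include <string>
#include <vector>

// Made-up stand-in for a column hierarchy (struct/list columns with children).
struct node {
  std::string name;
  std::vector<node> children;
};

int main()
{
  node root{"struct", {{"int_child", {}}, {"list_child", {{"leaf", {}}}}}};

  // Pre-order without recursion: pop a node, visit it, then push its children
  // in reverse so the leftmost child is processed next.
  std::stack<node const*> to_visit;
  to_visit.push(&root);
  while (!to_visit.empty()) {
    node const* cur = to_visit.top();
    to_visit.pop();
    std::cout << cur->name << '\n';
    for (auto it = cur->children.rbegin(); it != cur->children.rend(); ++it) {
      to_visit.push(&*it);
    }
  }
  return 0;
}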
thrust::for_each(thrust::seq, + thrust::make_reverse_iterator(d_table.end()), + thrust::make_reverse_iterator(d_table.begin()), + [&stack](column_device_view const& c) { + stack.push({&c, thrust::nullopt}); + }); + uint32_t idx = 0; - for (auto const& column : d_table) { - append_orc_device_column(idx, thrust::nullopt, d_orc_cols, column); + while (not stack.empty()) { + auto [col, parent] = stack.pop(); + d_orc_cols[idx] = orc_column_device_view{*col, parent}; + + if (col->type().id() == type_id::LIST) { + stack.push({&col->children()[lists_column_view::child_column_index], idx}); + } else if (col->type().id() == type_id::STRUCT) { + thrust::for_each(thrust::seq, + thrust::make_reverse_iterator(col->children().end()), + thrust::make_reverse_iterator(col->children().begin()), + [&stack, idx](column_device_view const& c) { + stack.push({&c, idx}); + }); + } + idx++; } }, stream); diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu new file mode 100644 index 00000000000..64b3dd69c0d --- /dev/null +++ b/cpp/src/io/parquet/chunk_dict.cu @@ -0,0 +1,371 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include + +#include + +namespace cudf { +namespace io { +namespace parquet { +namespace gpu { + +template +__global__ void __launch_bounds__(block_size, 1) + initialize_chunk_hash_maps_kernel(device_span chunks) +{ + auto chunk = chunks[blockIdx.x]; + auto t = threadIdx.x; + // fut: Now that per-chunk dict is same size as ck.num_values, try to not use one block per chunk + for (size_t i = 0; i < chunk.dict_map_size; i += block_size) { + if (t + i < chunk.dict_map_size) { + new (&chunk.dict_map_slots[t + i].first) map_type::atomic_key_type{KEY_SENTINEL}; + new (&chunk.dict_map_slots[t + i].second) map_type::atomic_mapped_type{VALUE_SENTINEL}; + } + } +} + +template +struct equality_functor { + column_device_view const& col; + __device__ bool operator()(size_type lhs_idx, size_type rhs_idx) + { + // We don't call this for nulls so this is fine + return equality_compare(col.element(lhs_idx), col.element(rhs_idx)); + } +}; + +template +struct hash_functor { + column_device_view const& col; + __device__ auto operator()(size_type idx) { return MurmurHash3_32{}(col.element(idx)); } +}; + +struct map_insert_fn { + map_type::device_mutable_view& map; + + template + __device__ bool operator()(column_device_view const& col, size_type i) + { + if constexpr (column_device_view::has_element_accessor()) { + auto hash_fn = hash_functor{col}; + auto equality_fn = equality_functor{col}; + return map.insert(std::make_pair(i, i), hash_fn, equality_fn); + } else { + cudf_assert(false && "Unsupported type to insert in map"); + } + return false; + } +}; + +struct map_find_fn { + map_type::device_view& map; + + template + __device__ auto operator()(column_device_view const& col, size_type i) + { + if constexpr (column_device_view::has_element_accessor()) { + auto hash_fn = hash_functor{col}; + 
auto equality_fn = equality_functor{col}; + return map.find(i, hash_fn, equality_fn); + } else { + cudf_assert(false && "Unsupported type to insert in map"); + } + return map.end(); + } +}; + +template +__global__ void __launch_bounds__(block_size, 1) + populate_chunk_hash_maps_kernel(cudf::detail::device_2dspan chunks, + size_type num_rows) +{ + auto col_idx = blockIdx.y; + auto block_x = blockIdx.x; + auto t = threadIdx.x; + + auto start_row = + block_x * + max_page_fragment_size; // This is fragment size. all chunks are multiple of these many rows. + size_type end_row = min(start_row + max_page_fragment_size, num_rows); + + __shared__ EncColumnChunk* s_chunk; + __shared__ parquet_column_device_view s_col; + __shared__ size_type s_start_value_idx; + __shared__ size_type s_num_values; + if (t == 0) { + // Find the chunk this block is a part of + size_type num_rowgroups = chunks.size().first; + size_type rg_idx = 0; + while (rg_idx < num_rowgroups) { + if (auto ck = chunks[rg_idx][col_idx]; + start_row >= ck.start_row and start_row < ck.start_row + ck.num_rows) { + break; + } + ++rg_idx; + } + s_chunk = &chunks[rg_idx][col_idx]; + s_col = *(s_chunk->col_desc); + } + __syncthreads(); + if (not s_chunk->use_dictionary) { return; } + + if (t == 0) { + // Find the bounds of values in leaf column to be inserted into the map for current chunk + auto col = *(s_col.parent_column); + auto start_value_idx = start_row; + auto end_value_idx = end_row; + while (col.type().id() == type_id::LIST or col.type().id() == type_id::STRUCT) { + if (col.type().id() == type_id::STRUCT) { + start_value_idx += col.offset(); + end_value_idx += col.offset(); + col = col.child(0); + } else { + auto offset_col = col.child(lists_column_view::offsets_column_index); + start_value_idx = offset_col.element(start_value_idx + col.offset()); + end_value_idx = offset_col.element(end_value_idx + col.offset()); + col = col.child(lists_column_view::child_column_index); + } + } + s_start_value_idx = start_value_idx; + s_num_values = end_value_idx - start_value_idx; + } + __syncthreads(); + + column_device_view const& data_col = *s_col.leaf_column; + using block_reduce = cub::BlockReduce; + __shared__ typename block_reduce::TempStorage reduce_storage; + + // Make a view of the hash map + auto hash_map_mutable = map_type::device_mutable_view( + s_chunk->dict_map_slots, s_chunk->dict_map_size, KEY_SENTINEL, VALUE_SENTINEL); + auto hash_map = map_type::device_view( + s_chunk->dict_map_slots, s_chunk->dict_map_size, KEY_SENTINEL, VALUE_SENTINEL); + + __shared__ int total_num_dict_entries; + for (size_type i = 0; i < s_num_values; i += block_size) { + // add the value to hash map + size_type val_idx = i + t + s_start_value_idx; + bool is_valid = + (i + t < s_num_values && val_idx < data_col.size()) and data_col.is_valid(val_idx); + + // insert element at val_idx to hash map and count successful insertions + size_type is_unique = 0; + size_type uniq_elem_size = 0; + if (is_valid) { + auto found_slot = type_dispatcher(data_col.type(), map_find_fn{hash_map}, data_col, val_idx); + if (found_slot == hash_map.end()) { + is_unique = + type_dispatcher(data_col.type(), map_insert_fn{hash_map_mutable}, data_col, val_idx); + uniq_elem_size = [&]() -> size_type { + if (not is_unique) { return 0; } + switch (s_col.physical_type) { + case Type::INT32: return 4; + case Type::INT64: return 8; + case Type::INT96: return 12; + case Type::FLOAT: return 4; + case Type::DOUBLE: return 8; + case Type::BYTE_ARRAY: + if (data_col.type().id() == type_id::STRING) 
{ + // Strings are stored as 4 byte length + string bytes + return 4 + data_col.element(val_idx).size_bytes(); + } + case Type::FIXED_LEN_BYTE_ARRAY: + default: cudf_assert(false && "Unsupported type for dictionary encoding"); return 0; + } + }(); + } + } + + __syncthreads(); + auto num_unique = block_reduce(reduce_storage).Sum(is_unique); + __syncthreads(); + auto uniq_data_size = block_reduce(reduce_storage).Sum(uniq_elem_size); + if (t == 0) { + total_num_dict_entries = atomicAdd(&s_chunk->num_dict_entries, num_unique); + total_num_dict_entries += num_unique; + atomicAdd(&s_chunk->uniq_data_size, uniq_data_size); + } + __syncthreads(); + + // Check if the num unique values in chunk has already exceeded max dict size and early exit + if (total_num_dict_entries > MAX_DICT_SIZE) { return; } + } +} + +template +__global__ void __launch_bounds__(block_size, 1) + collect_map_entries_kernel(device_span chunks) +{ + auto& chunk = chunks[blockIdx.x]; + if (not chunk.use_dictionary) { return; } + + auto t = threadIdx.x; + auto map = + map_type::device_view(chunk.dict_map_slots, chunk.dict_map_size, KEY_SENTINEL, VALUE_SENTINEL); + + __shared__ size_type counter; + if (t == 0) counter = 0; + __syncthreads(); + for (size_t i = 0; i < chunk.dict_map_size; i += block_size) { + if (t + i < chunk.dict_map_size) { + auto slot = map.begin_slot() + t + i; + auto key = static_cast(slot->first); + if (key != KEY_SENTINEL) { + auto loc = atomicAdd(&counter, 1); + cudf_assert(loc < MAX_DICT_SIZE && "Number of filled slots exceeds max dict size"); + chunk.dict_data[loc] = key; + // If sorting dict page ever becomes a hard requirement, enable the following statement and + // add a dict sorting step before storing into the slot's second field. + // chunk.dict_data_idx[loc] = t + i; + slot->second.store(loc); + // TODO: ^ This doesn't need to be atomic. Try casting to value_type ptr and just writing. 
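Taken together, the kernels in this new file build a dictionary in three passes: claim a map entry for the first occurrence of each value, compact the occupied slots into dict_data while assigning dense indices, then rewrite each row as an index into dict_data. A rough single-threaded analogue with an unordered_map and made-up values (the sentinel-based device map above assigns indices in whatever order slots are visited, so its ordering differs):

#include <cstdint>
#include <iostream>
#include <unordered_map>
#include <vector>

int main()
{
  // Made-up column values for one chunk.
  std::vector<int32_t> values{7, 3, 7, 9, 3, 7};

  // Populate + collect: the first occurrence of a value claims an entry and is
  // assigned the next dense dictionary index; dict_data keeps the unique values.
  std::unordered_map<int32_t, uint32_t> slot_of;
  std::vector<int32_t> dict_data;
  for (auto v : values) {
    if (slot_of.emplace(v, static_cast<uint32_t>(dict_data.size())).second) {
      dict_data.push_back(v);
    }
  }

  // Get-indices: each row is rewritten as an index into dict_data.
  std::vector<uint32_t> dict_index;
  for (auto v : values) { dict_index.push_back(slot_of.at(v)); }

  for (auto i : dict_index) { std::cout << i << ' '; }  // prints: 0 1 0 2 1 0
  std::cout << '\n';
  return 0;
}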
+ } + } + } +} + +template +__global__ void __launch_bounds__(block_size, 1) + get_dictionary_indices_kernel(cudf::detail::device_2dspan chunks, + size_type num_rows) +{ + auto col_idx = blockIdx.y; + auto block_x = blockIdx.x; + auto t = threadIdx.x; + + size_type start_row = block_x * max_page_fragment_size; + size_type end_row = min(start_row + max_page_fragment_size, num_rows); + + __shared__ EncColumnChunk s_chunk; + __shared__ parquet_column_device_view s_col; + __shared__ size_type s_start_value_idx; + __shared__ size_type s_ck_start_val_idx; + __shared__ size_type s_num_values; + + if (t == 0) { + // Find the chunk this block is a part of + size_type num_rowgroups = chunks.size().first; + size_type rg_idx = 0; + while (rg_idx < num_rowgroups) { + if (auto ck = chunks[rg_idx][col_idx]; + start_row >= ck.start_row and start_row < ck.start_row + ck.num_rows) { + break; + } + ++rg_idx; + } + s_chunk = chunks[rg_idx][col_idx]; + s_col = *(s_chunk.col_desc); + + // Find the bounds of values in leaf column to be inserted into the map for current chunk + + auto col = *(s_col.parent_column); + auto start_value_idx = start_row; + auto end_value_idx = end_row; + auto chunk_start_val_idx = s_chunk.start_row; + while (col.type().id() == type_id::LIST or col.type().id() == type_id::STRUCT) { + if (col.type().id() == type_id::STRUCT) { + start_value_idx += col.offset(); + chunk_start_val_idx += col.offset(); + end_value_idx += col.offset(); + col = col.child(0); + } else { + auto offset_col = col.child(lists_column_view::offsets_column_index); + start_value_idx = offset_col.element(start_value_idx + col.offset()); + chunk_start_val_idx = offset_col.element(chunk_start_val_idx + col.offset()); + end_value_idx = offset_col.element(end_value_idx + col.offset()); + col = col.child(lists_column_view::child_column_index); + } + } + s_start_value_idx = start_value_idx; + s_ck_start_val_idx = chunk_start_val_idx; + s_num_values = end_value_idx - start_value_idx; + } + __syncthreads(); + + if (not s_chunk.use_dictionary) { return; } + + column_device_view const& data_col = *s_col.leaf_column; + + auto map = map_type::device_view( + s_chunk.dict_map_slots, s_chunk.dict_map_size, KEY_SENTINEL, VALUE_SENTINEL); + + for (size_t i = 0; i < s_num_values; i += block_size) { + if (t + i < s_num_values) { + auto val_idx = s_start_value_idx + t + i; + bool is_valid = + (i + t < s_num_values && val_idx < data_col.size()) ? 
data_col.is_valid(val_idx) : false; + + if (is_valid) { + auto found_slot = type_dispatcher(data_col.type(), map_find_fn{map}, data_col, val_idx); + cudf_assert(found_slot != map.end() && + "Unable to find value in map in dictionary index construction"); + if (found_slot != map.end()) { + // No need for atomic as this is not going to be modified by any other thread + auto* val_ptr = reinterpret_cast(&found_slot->second); + s_chunk.dict_index[val_idx - s_ck_start_val_idx] = *val_ptr; + } + } + } + } +} + +void initialize_chunk_hash_maps(device_span chunks, rmm::cuda_stream_view stream) +{ + constexpr int block_size = 1024; + initialize_chunk_hash_maps_kernel + <<>>(chunks); +} + +void populate_chunk_hash_maps(cudf::detail::device_2dspan chunks, + size_type num_rows, + rmm::cuda_stream_view stream) +{ + constexpr int block_size = 256; + auto const grid_x = cudf::detail::grid_1d(num_rows, max_page_fragment_size); + auto const num_columns = chunks.size().second; + dim3 const dim_grid(grid_x.num_blocks, num_columns); + + populate_chunk_hash_maps_kernel + <<>>(chunks, num_rows); +} + +void collect_map_entries(device_span chunks, rmm::cuda_stream_view stream) +{ + constexpr int block_size = 1024; + collect_map_entries_kernel<<>>(chunks); +} + +void get_dictionary_indices(cudf::detail::device_2dspan chunks, + size_type num_rows, + rmm::cuda_stream_view stream) +{ + constexpr int block_size = 256; + auto const grid_x = cudf::detail::grid_1d(num_rows, max_page_fragment_size); + auto const num_columns = chunks.size().second; + dim3 const dim_grid(grid_x.num_blocks, num_columns); + + get_dictionary_indices_kernel + <<>>(chunks, num_rows); +} +} // namespace gpu +} // namespace parquet +} // namespace io +} // namespace cudf diff --git a/cpp/src/io/parquet/page_dict.cu b/cpp/src/io/parquet/page_dict.cu deleted file mode 100644 index 0c55828b120..00000000000 --- a/cpp/src/io/parquet/page_dict.cu +++ /dev/null @@ -1,335 +0,0 @@ -/* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include "parquet_gpu.hpp" - -#include - -#include - -#include - -namespace cudf { -namespace io { -namespace parquet { -namespace gpu { -struct dict_state_s { - uint32_t row_cnt; - PageFragment* cur_fragment; - uint32_t* hashmap; - uint32_t total_dict_entries; //!< Total number of entries in dictionary - uint32_t dictionary_size; //!< Total dictionary size in bytes - uint32_t num_dict_entries; //!< Dictionary entries in current fragment to add - uint32_t frag_dict_size; - EncColumnChunk ck; - parquet_column_device_view col; - PageFragment frag; - volatile uint32_t scratch_red[32]; - uint16_t frag_dict[max_page_fragment_size]; -}; - -/** - * @brief Computes a 16-bit dictionary hash - */ -inline __device__ uint32_t uint32_hash16(uint32_t v) { return (v + (v >> 16)) & 0xffff; } - -inline __device__ uint32_t uint64_hash16(uint64_t v) -{ - return uint32_hash16((uint32_t)(v + (v >> 32))); -} - -inline __device__ uint32_t hash_string(const string_view& val) -{ - const char* p = val.data(); - uint32_t len = val.size_bytes(); - uint32_t hash = len; - if (len > 0) { - uint32_t align_p = 3 & reinterpret_cast(p); - const uint32_t* p32 = reinterpret_cast(p - align_p); - uint32_t ofs = align_p * 8; - uint32_t v; - while (len > 4) { - v = *p32++; - if (ofs) { v = __funnelshift_r(v, *p32, ofs); } - hash = __funnelshift_l(hash, hash, 5) + v; - len -= 4; - } - v = *p32; - if (ofs) { v = __funnelshift_r(v, (align_p + len > 4) ? p32[1] : 0, ofs); } - v &= ((2 << (len * 8 - 1)) - 1); - hash = __funnelshift_l(hash, hash, 5) + v; - } - return uint32_hash16(hash); -} - -/** - * @brief Fetch a page fragment and its dictionary entries in row-ascending order - * - * @param[in,out] s dictionary state - * @param[in,out] dict_data fragment dictionary data for the current column (zeroed out after - *fetching) - * @param[in] frag_start_row row position of current fragment - * @param[in] t thread id - */ -__device__ void FetchDictionaryFragment(dict_state_s* s, - uint32_t* dict_data, - uint32_t frag_start_row, - uint32_t t) -{ - if (t == 0) s->frag = *s->cur_fragment; - __syncthreads(); - // Store the row values in shared mem and set the corresponding dict_data to zero (end-of-list) - // It's easiest to do this here since we're only dealing with values all within a 5K-row window - for (uint32_t i = t; i < s->frag.num_dict_vals; i += 1024) { - uint32_t r = dict_data[frag_start_row + i] - frag_start_row; - s->frag_dict[i] = r; - } - __syncthreads(); - for (uint32_t i = t; i < s->frag.num_dict_vals; i += 1024) { - uint32_t r = s->frag_dict[i]; - dict_data[frag_start_row + r] = 0; - } - __syncthreads(); -} - -/// Generate dictionary indices in ascending row order -template -__device__ void GenerateDictionaryIndices(dict_state_s* s, uint32_t t) -{ - using block_scan = cub::BlockScan; - __shared__ typename block_scan::TempStorage temp_storage; - uint32_t* dict_index = s->col.dict_index; - uint32_t* dict_data = s->col.dict_data + s->ck.start_row; - uint32_t num_dict_entries = 0; - - for (uint32_t i = 0; i < s->row_cnt; i += 1024) { - uint32_t row = s->ck.start_row + i + t; - uint32_t is_valid = - (i + t < s->row_cnt && row < s->col.num_rows) ? s->col.leaf_column->is_valid(row) : 0; - uint32_t dict_idx = (is_valid) ? 
dict_index[row] : 0; - uint32_t is_unique = - (is_valid && - dict_idx == - row); // Any value that doesn't have bit31 set should have dict_idx=row at this point - uint32_t block_num_dict_entries; - uint32_t pos; - block_scan(temp_storage).ExclusiveSum(is_unique, pos, block_num_dict_entries); - pos += num_dict_entries; - num_dict_entries += block_num_dict_entries; - if (is_valid && is_unique) { - dict_data[pos] = row; - dict_index[row] = pos; - } - __syncthreads(); - if (is_valid && !is_unique) { - // NOTE: Should have at most 3 iterations (once for early duplicate elimination, once for - // final dictionary duplicate elimination and once for re-ordering) (If something went wrong - // building the dictionary, it will likely hang or crash right here) - do { - dict_idx = dict_index[dict_idx & 0x7fffffff]; - } while (dict_idx > 0x7fffffff); - dict_index[row] = dict_idx; - } - } -} - -// blockDim(1024, 1, 1) -template -__global__ void __launch_bounds__(block_size, 1) - gpuBuildChunkDictionaries(device_span chunks, uint32_t* dev_scratch) -{ - __shared__ __align__(8) dict_state_s state_g; - using block_reduce = cub::BlockReduce; - __shared__ typename block_reduce::TempStorage temp_storage; - - dict_state_s* const s = &state_g; - uint32_t t = threadIdx.x; - uint32_t dtype, dtype_len, dtype_len_in; - - if (t == 0) s->ck = chunks[blockIdx.x]; - __syncthreads(); - - if (!s->ck.has_dictionary) { return; } - - if (t == 0) s->col = *s->ck.col_desc; - __syncthreads(); - - if (!t) { - s->hashmap = dev_scratch + s->ck.dictionary_id * (size_t)(1 << kDictHashBits); - s->row_cnt = 0; - s->cur_fragment = s->ck.fragments; - s->total_dict_entries = 0; - s->dictionary_size = 0; - s->ck.num_dict_fragments = 0; - } - dtype = s->col.physical_type; - dtype_len = (dtype == INT96) ? 12 : (dtype == INT64 || dtype == DOUBLE) ? 8 : 4; - if (dtype == INT32) { - dtype_len_in = GetDtypeLogicalLen(s->col.leaf_column); - } else if (dtype == INT96) { - dtype_len_in = 8; - } else { - dtype_len_in = dtype_len; - } - __syncthreads(); - while (s->row_cnt < s->ck.num_rows) { - uint32_t frag_start_row = s->ck.start_row + s->row_cnt, num_dict_entries, frag_dict_size; - FetchDictionaryFragment(s, s->col.dict_data, frag_start_row, t); - __syncthreads(); - num_dict_entries = s->frag.num_dict_vals; - if (!t) { - s->num_dict_entries = 0; - s->frag_dict_size = 0; - } - for (uint32_t i = 0; i < num_dict_entries; i += 1024) { - bool is_valid = (i + t < num_dict_entries); - uint32_t len = 0; - uint32_t is_dupe = 0; - uint32_t row, hash, next, *next_addr; - uint32_t new_dict_entries; - - if (is_valid) { - row = frag_start_row + s->frag_dict[i + t]; - len = dtype_len; - if (dtype == BYTE_ARRAY) { - auto str1 = s->col.leaf_column->element(row); - len += str1.size_bytes(); - hash = hash_string(str1); - // Walk the list of rows with the same hash - next_addr = &s->hashmap[hash]; - while ((next = atomicCAS(next_addr, 0, row + 1)) != 0) { - auto const current = next - 1; - auto str2 = s->col.leaf_column->element(current); - if (str1 == str2) { - is_dupe = 1; - break; - } - next_addr = &s->col.dict_data[next - 1]; - } - } else { - uint64_t val; - - if (dtype_len_in == 8) { - val = s->col.leaf_column->element(row); - hash = uint64_hash16(val); - } else { - val = (dtype_len_in == 4) ? s->col.leaf_column->element(row) - : (dtype_len_in == 2) ? 
s->col.leaf_column->element(row) - : s->col.leaf_column->element(row); - hash = uint32_hash16(val); - } - // Walk the list of rows with the same hash - next_addr = &s->hashmap[hash]; - while ((next = atomicCAS(next_addr, 0, row + 1)) != 0) { - auto const current = next - 1; - uint64_t val2 = (dtype_len_in == 8) ? s->col.leaf_column->element(current) - : (dtype_len_in == 4) ? s->col.leaf_column->element(current) - : (dtype_len_in == 2) ? s->col.leaf_column->element(current) - : s->col.leaf_column->element(current); - if (val2 == val) { - is_dupe = 1; - break; - } - next_addr = &s->col.dict_data[next - 1]; - } - } - } - // Count the non-duplicate entries - frag_dict_size = block_reduce(temp_storage).Sum((is_valid && !is_dupe) ? len : 0); - new_dict_entries = __syncthreads_count(is_valid && !is_dupe); - if (t == 0) { - s->frag_dict_size += frag_dict_size; - s->num_dict_entries += new_dict_entries; - } - if (is_valid) { - if (!is_dupe) { - s->col.dict_index[row] = row; - } else { - s->col.dict_index[row] = (next - 1) | (1u << 31); - } - } - __syncthreads(); - // At this point, the dictionary order is non-deterministic, and we want insertion order - // Make sure that the non-duplicate entry corresponds to the lower row number - // (The entry in dict_data (next-1) used for duplicate elimination does not need - // to be the lowest row number) - bool reorder_check = (is_valid && is_dupe && next - 1 > row); - if (reorder_check) { - next = s->col.dict_index[next - 1]; - while (next & (1u << 31)) { - next = s->col.dict_index[next & 0x7fffffff]; - } - } - if (__syncthreads_or(reorder_check)) { - if (reorder_check) { atomicMin(&s->col.dict_index[next], row); } - __syncthreads(); - if (reorder_check && s->col.dict_index[next] == row) { - s->col.dict_index[next] = row | (1u << 31); - s->col.dict_index[row] = row; - } - __syncthreads(); - } - } - __syncthreads(); - num_dict_entries = s->num_dict_entries; - frag_dict_size = s->frag_dict_size; - if (s->total_dict_entries + num_dict_entries > 65536 || - (s->dictionary_size != 0 && s->dictionary_size + frag_dict_size > 512 * 1024)) { - break; - } - __syncthreads(); - if (!t) { - if (num_dict_entries != s->frag.num_dict_vals) { - s->cur_fragment->num_dict_vals = num_dict_entries; - } - if (frag_dict_size != s->frag.dict_data_size) { s->frag.dict_data_size = frag_dict_size; } - s->total_dict_entries += num_dict_entries; - s->dictionary_size += frag_dict_size; - s->row_cnt += s->frag.num_rows; - s->cur_fragment++; - s->ck.num_dict_fragments++; - } - __syncthreads(); - } - __syncthreads(); - GenerateDictionaryIndices(s, t); - if (!t) { - chunks[blockIdx.x].num_dict_fragments = s->ck.num_dict_fragments; - chunks[blockIdx.x].dictionary_size = s->dictionary_size; - chunks[blockIdx.x].total_dict_entries = s->total_dict_entries; - } -} - -/** - * @brief Launches kernel for building chunk dictionaries - * - * @param[in,out] chunks Column chunks - * @param[in] dev_scratch Device scratch data (kDictScratchSize per dictionary) - * @param[in] stream CUDA stream to use, default 0 - */ -void BuildChunkDictionaries(device_span chunks, - uint32_t* dev_scratch, - rmm::cuda_stream_view stream) -{ - auto num_chunks = chunks.size(); - gpuBuildChunkDictionaries<1024><<>>(chunks, dev_scratch); -} - -} // namespace gpu -} // namespace parquet -} // namespace io -} // namespace cudf diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 3c62dcf7eea..70b2e27f75d 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -48,14 +48,7 
@@ constexpr uint32_t rle_buffer_size = (1 << 9); struct frag_init_state_s { parquet_column_device_view col; PageFragment frag; - uint32_t total_dupes; size_type start_value_idx; - volatile uint32_t scratch_red[32]; - uint32_t dict[max_page_fragment_size]; - union { - uint16_t u16[1 << (init_hash_bits)]; - uint32_t u32[1 << (init_hash_bits - 1)]; - } map; }; struct page_enc_state_s { @@ -68,6 +61,7 @@ struct page_enc_state_s { uint32_t rle_lit_count; uint32_t rle_rpt_count; uint32_t page_start_val; + uint32_t chunk_start_val; volatile uint32_t rpt_map[4]; volatile uint32_t scratch_red[32]; EncPage page; @@ -124,31 +118,22 @@ __global__ void __launch_bounds__(block_size) __shared__ __align__(16) frag_init_state_s state_g; using block_reduce = cub::BlockReduce; - using block_scan = cub::BlockScan; - __shared__ union { - typename block_reduce::TempStorage reduce_storage; - typename block_scan::TempStorage scan_storage; - } temp_storage; + __shared__ typename block_reduce::TempStorage reduce_storage; frag_init_state_s* const s = &state_g; uint32_t t = threadIdx.x; - uint32_t start_row, dtype_len, dtype_len_in, dtype; + uint32_t start_row, dtype_len, dtype; if (t == 0) s->col = col_desc[blockIdx.x]; - for (uint32_t i = 0; i < sizeof(s->map) / sizeof(uint32_t); i += block_size) { - if (i + t < sizeof(s->map) / sizeof(uint32_t)) s->map.u32[i + t] = 0; - } __syncthreads(); start_row = blockIdx.y * fragment_size; if (!t) { // frag.num_rows = fragment_size except for the last page fragment which can be smaller. // num_rows is fixed but fragment size could be larger if the data is strings or nested. s->frag.num_rows = min(fragment_size, max_num_rows - min(start_row, max_num_rows)); - s->frag.non_nulls = 0; s->frag.num_dict_vals = 0; s->frag.fragment_data_size = 0; s->frag.dict_data_size = 0; - s->total_dupes = 0; // To use num_vals instead of num_rows, we need to calculate num_vals on the fly. // For list>, values between i and i+50 can be calculated by @@ -195,16 +180,6 @@ __global__ void __launch_bounds__(block_size) : (dtype == INT64 || dtype == DOUBLE) ? 8 : (dtype == BOOLEAN) ? 1 : 4; - if (dtype == INT32) { - dtype_len_in = GetDtypeLogicalLen(s->col.leaf_column); - } else if (dtype == INT96) { - // cudf doesn't support INT96 internally and uses INT64, so treat INT96 as an INT64 for - // computing dictionary hash values and reading the data, but we do treat it as 12 bytes for - // dtype_len, which determines how much memory we need to allocate for the fragment. - dtype_len_in = 8; - } else { - dtype_len_in = dtype_len; - } __syncthreads(); size_type nvals = s->frag.num_leaf_values; @@ -215,167 +190,22 @@ __global__ void __launch_bounds__(block_size) uint32_t is_valid = (i + t < nvals && val_idx < s->col.leaf_column->size()) ? s->col.leaf_column->is_valid(val_idx) : 0; - uint32_t len, nz_pos, hash; + uint32_t len; if (is_valid) { len = dtype_len; if (dtype != BOOLEAN) { if (dtype == BYTE_ARRAY) { auto str = s->col.leaf_column->element(val_idx); len += str.size_bytes(); - hash = hash_string(str); - } else if (dtype_len_in == 8) { - hash = uint64_init_hash(s->col.leaf_column->element(val_idx)); - } else { - hash = - uint32_init_hash((dtype_len_in == 4) ? s->col.leaf_column->element(val_idx) - : (dtype_len_in == 2) ? 
s->col.leaf_column->element(val_idx) - : s->col.leaf_column->element(val_idx)); } } } else { len = 0; } - uint32_t non_nulls; - block_scan(temp_storage.scan_storage).ExclusiveSum(is_valid, nz_pos, non_nulls); - nz_pos += s->frag.non_nulls; - __syncthreads(); - len = block_reduce(temp_storage.reduce_storage).Sum(len); - if (!t) { - s->frag.non_nulls += non_nulls; - s->frag.fragment_data_size += len; - } - __syncthreads(); - if (is_valid && dtype != BOOLEAN) { - uint32_t* dict_index = s->col.dict_index; - if (dict_index) { - atomicAdd(&s->map.u32[hash >> 1], (hash & 1) ? 1 << 16 : 1); - dict_index[start_value_idx + nz_pos] = - ((i + t) << init_hash_bits) | - hash; // Store the hash along with the index, so we don't have to recompute it - } - } - __syncthreads(); - } - __syncthreads(); - // Reorder the 16-bit local indices according to the hash values - if (s->col.dict_index) { - static_assert((init_hash_bits == 12), "Hardcoded for init_hash_bits=12"); - // Cumulative sum of hash map counts - uint32_t count01 = s->map.u32[t * 4 + 0]; - uint32_t count23 = s->map.u32[t * 4 + 1]; - uint32_t count45 = s->map.u32[t * 4 + 2]; - uint32_t count67 = s->map.u32[t * 4 + 3]; - uint32_t sum01 = count01 + (count01 << 16); - uint32_t sum23 = count23 + (count23 << 16); - uint32_t sum45 = count45 + (count45 << 16); - uint32_t sum67 = count67 + (count67 << 16); - sum23 += (sum01 >> 16) * 0x10001; - sum45 += (sum23 >> 16) * 0x10001; - sum67 += (sum45 >> 16) * 0x10001; - uint32_t sum_w = sum67 >> 16; - block_scan(temp_storage.scan_storage).InclusiveSum(sum_w, sum_w); - sum_w = (sum_w - (sum67 >> 16)) * 0x10001; - s->map.u32[t * 4 + 0] = sum_w + sum01 - count01; - s->map.u32[t * 4 + 1] = sum_w + sum23 - count23; - s->map.u32[t * 4 + 2] = sum_w + sum45 - count45; - s->map.u32[t * 4 + 3] = sum_w + sum67 - count67; - } - __syncthreads(); - // Put the indices back in hash order - if (s->col.dict_index) { - uint32_t* dict_index = s->col.dict_index + start_row; - uint32_t nnz = s->frag.non_nulls; - for (uint32_t i = 0; i < nnz; i += block_size) { - uint32_t pos = 0, hash = 0, pos_old, pos_new, sh, colliding_row, val = 0; - bool collision; - if (i + t < nnz) { - val = dict_index[i + t]; - hash = val & ((1 << init_hash_bits) - 1); - sh = (hash & 1) ? 
16 : 0; - pos_old = s->map.u16[hash]; - } - // The isolation of the atomicAdd, along with pos_old/pos_new is to guarantee deterministic - // behavior for the first row in the hash map that will be used for early duplicate detection - __syncthreads(); - if (i + t < nnz) { - pos = (atomicAdd(&s->map.u32[hash >> 1], 1 << sh) >> sh) & 0xffff; - s->dict[pos] = val; - } - __syncthreads(); - collision = false; - if (i + t < nnz) { - pos_new = s->map.u16[hash]; - collision = (pos != pos_old && pos_new > pos_old + 1); - if (collision) { colliding_row = s->dict[pos_old]; } - } - __syncthreads(); - if (collision) { atomicMin(&s->dict[pos_old], val); } - __syncthreads(); - // Resolve collision - if (collision && val == s->dict[pos_old]) { s->dict[pos] = colliding_row; } - } + len = block_reduce(reduce_storage).Sum(len); + if (!t) { s->frag.fragment_data_size += len; } __syncthreads(); - // Now that the values are ordered by hash, compare every entry with the first entry in the hash - // map, the position of the first entry can be inferred from the hash map counts - uint32_t dupe_data_size = 0; - for (uint32_t i = 0; i < nnz; i += block_size) { - uint32_t ck_row = 0, ck_row_ref = 0, is_dupe = 0; - if (i + t < nnz) { - uint32_t dict_val = s->dict[i + t]; - uint32_t hash = dict_val & ((1 << init_hash_bits) - 1); - ck_row = start_row + (dict_val >> init_hash_bits); - ck_row_ref = start_row + (s->dict[(hash > 0) ? s->map.u16[hash - 1] : 0] >> init_hash_bits); - if (ck_row_ref != ck_row) { - if (dtype == BYTE_ARRAY) { - auto str1 = s->col.leaf_column->element(ck_row); - auto str2 = s->col.leaf_column->element(ck_row_ref); - is_dupe = (str1 == str2); - dupe_data_size += (is_dupe) ? 4 + str1.size_bytes() : 0; - } else { - if (dtype_len_in == 8) { - auto v1 = s->col.leaf_column->element(ck_row); - auto v2 = s->col.leaf_column->element(ck_row_ref); - is_dupe = (v1 == v2); - dupe_data_size += (is_dupe) ? 8 : 0; - } else { - uint32_t v1, v2; - if (dtype_len_in == 4) { - v1 = s->col.leaf_column->element(ck_row); - v2 = s->col.leaf_column->element(ck_row_ref); - } else if (dtype_len_in == 2) { - v1 = s->col.leaf_column->element(ck_row); - v2 = s->col.leaf_column->element(ck_row_ref); - } else { - v1 = s->col.leaf_column->element(ck_row); - v2 = s->col.leaf_column->element(ck_row_ref); - } - is_dupe = (v1 == v2); - dupe_data_size += (is_dupe) ? 
4 : 0; - } - } - } - } - uint32_t dupes_in_block; - uint32_t dupes_before; - block_scan(temp_storage.scan_storage).InclusiveSum(is_dupe, dupes_before, dupes_in_block); - dupes_before += s->total_dupes; - __syncthreads(); - if (t == 0) { s->total_dupes += dupes_in_block; } - if (i + t < nnz) { - if (!is_dupe) { - s->col.dict_data[start_row + i + t - dupes_before] = ck_row; - } else { - s->col.dict_index[ck_row] = ck_row_ref | (1u << 31); - } - } - } - __syncthreads(); - dupe_data_size = block_reduce(temp_storage.reduce_storage).Sum(dupe_data_size); - if (!t) { - s->frag.dict_data_size = s->frag.fragment_data_size - dupe_data_size; - s->frag.num_dict_vals = s->frag.non_nulls - s->total_dupes; - } } __syncthreads(); if (t == 0) frag[blockIdx.x][blockIdx.y] = s->frag; @@ -449,22 +279,21 @@ __global__ void __launch_bounds__(128) pagestats_g.start_chunk = ck_g.first_fragment; pagestats_g.num_chunks = 0; } - if (ck_g.has_dictionary) { + if (ck_g.use_dictionary) { if (!t) { page_g.page_data = ck_g.uncompressed_bfr + page_offset; page_g.compressed_data = ck_g.compressed_bfr + comp_page_offset; page_g.num_fragments = 0; page_g.page_type = PageType::DICTIONARY_PAGE; - page_g.dict_bits_plus1 = 0; page_g.chunk = &chunks[blockIdx.y][blockIdx.x]; page_g.chunk_id = blockIdx.y * num_columns + blockIdx.x; page_g.hdr_size = 0; page_g.max_hdr_size = 32; - page_g.max_data_size = ck_g.dictionary_size; + page_g.max_data_size = ck_g.uniq_data_size; page_g.start_row = cur_row; - page_g.num_rows = ck_g.total_dict_entries; - page_g.num_leaf_values = ck_g.total_dict_entries; - page_g.num_values = ck_g.total_dict_entries; + page_g.num_rows = ck_g.num_dict_entries; + page_g.num_leaf_values = ck_g.num_dict_entries; + page_g.num_values = ck_g.num_dict_entries; // TODO: shouldn't matter for dict page page_offset += page_g.max_hdr_size + page_g.max_data_size; comp_page_offset += page_g.max_hdr_size + GetMaxCompressedBfrSize(page_g.max_data_size); } @@ -483,7 +312,7 @@ __global__ void __launch_bounds__(128) // This doesn't actually deal with data. It's agnostic. It only cares about number of rows and // page size. do { - uint32_t fragment_data_size, max_page_size, minmax_len = 0; + uint32_t minmax_len = 0; __syncwarp(); if (num_rows < ck_g.num_rows) { if (t == 0) { frag_g = ck_g.fragments[fragments_in_chunk]; } @@ -496,50 +325,27 @@ __global__ void __launch_bounds__(128) frag_g.num_rows = 0; } __syncwarp(); - if (ck_g.has_dictionary && fragments_in_chunk < ck_g.num_dict_fragments) { - fragment_data_size = - frag_g.num_leaf_values * 2; // Assume worst-case of 2-bytes per dictionary index - } else { - fragment_data_size = frag_g.fragment_data_size; - } + uint32_t fragment_data_size = + (ck_g.use_dictionary) + ? frag_g.num_leaf_values * 2 // Assume worst-case of 2-bytes per dictionary index + : frag_g.fragment_data_size; // TODO (dm): this convoluted logic to limit page size needs refactoring - max_page_size = (values_in_page * 2 >= ck_g.num_values) ? 256 * 1024 - : (values_in_page * 3 >= ck_g.num_values) ? 384 * 1024 - : 512 * 1024; + uint32_t max_page_size = (values_in_page * 2 >= ck_g.num_values) ? 256 * 1024 + : (values_in_page * 3 >= ck_g.num_values) ? 
384 * 1024 + : 512 * 1024; if (num_rows >= ck_g.num_rows || - (values_in_page > 0 && - (page_size + fragment_data_size > max_page_size || - (ck_g.has_dictionary && fragments_in_chunk == ck_g.num_dict_fragments)))) { - uint32_t dict_bits_plus1; - - if (ck_g.has_dictionary && page_start < ck_g.num_dict_fragments) { - uint32_t dict_bits; - if (num_dict_entries <= 2) { - dict_bits = 1; - } else if (num_dict_entries <= 4) { - dict_bits = 2; - } else if (num_dict_entries <= 16) { - dict_bits = 4; - } else if (num_dict_entries <= 256) { - dict_bits = 8; - } else if (num_dict_entries <= 4096) { - dict_bits = 12; - } else { - dict_bits = 16; - } - page_size = 1 + 5 + ((values_in_page * dict_bits + 7) >> 3) + (values_in_page >> 8); - dict_bits_plus1 = dict_bits + 1; - } else { - dict_bits_plus1 = 0; + (values_in_page > 0 && (page_size + fragment_data_size > max_page_size))) { + if (ck_g.use_dictionary) { + page_size = + 1 + 5 + ((values_in_page * ck_g.dict_rle_bits + 7) >> 3) + (values_in_page >> 8); } if (!t) { - page_g.num_fragments = fragments_in_chunk - page_start; - page_g.chunk = &chunks[blockIdx.y][blockIdx.x]; - page_g.chunk_id = blockIdx.y * num_columns + blockIdx.x; - page_g.page_type = PageType::DATA_PAGE; - page_g.dict_bits_plus1 = dict_bits_plus1; - page_g.hdr_size = 0; - page_g.max_hdr_size = 32; // Max size excluding statistics + page_g.num_fragments = fragments_in_chunk - page_start; + page_g.chunk = &chunks[blockIdx.y][blockIdx.x]; + page_g.chunk_id = blockIdx.y * num_columns + blockIdx.x; + page_g.page_type = PageType::DATA_PAGE; + page_g.hdr_size = 0; + page_g.max_hdr_size = 32; // Max size excluding statistics if (ck_g.stats) { uint32_t stats_hdr_len = 16; if (col_g.stats_dtype == dtype_string) { @@ -611,8 +417,8 @@ __global__ void __launch_bounds__(128) ck_g.num_pages = num_pages; ck_g.bfr_size = page_offset; ck_g.compressed_size = comp_page_offset; - pagestats_g.start_chunk = ck_g.first_page + ck_g.has_dictionary; // Exclude dictionary - pagestats_g.num_chunks = num_pages - ck_g.has_dictionary; + pagestats_g.start_chunk = ck_g.first_page + ck_g.use_dictionary; // Exclude dictionary + pagestats_g.num_chunks = num_pages - ck_g.use_dictionary; } } __syncthreads(); @@ -1069,7 +875,10 @@ __global__ void __launch_bounds__(128, 8) } else { dtype_len_in = dtype_len_out; } - dict_bits = (dtype == BOOLEAN) ? 1 : (s->page.dict_bits_plus1 - 1); + dict_bits = (dtype == BOOLEAN) ? 1 + : (s->ck.use_dictionary and s->page.page_type != PageType::DICTIONARY_PAGE) + ? s->ck.dict_rle_bits + : -1; if (t == 0) { uint8_t* dst = s->cur; s->rle_run = 0; @@ -1080,37 +889,56 @@ __global__ void __launch_bounds__(128, 8) dst[0] = dict_bits; s->rle_out = dst + 1; } - s->page_start_val = s->page.start_row; - if (s->col.parent_column != nullptr) { + s->page_start_val = s->page.start_row; // Dictionary page's start row is chunk's start row + auto chunk_start_val = s->ck.start_row; + if (s->col.parent_column != nullptr) { // TODO: remove this check. parent is now never nullptr auto col = *(s->col.parent_column); auto current_page_start_val = s->page_start_val; + // TODO: We do this so much. 
Add a global function that converts row idx to val idx while (col.type().id() == type_id::LIST or col.type().id() == type_id::STRUCT) { if (col.type().id() == type_id::STRUCT) { current_page_start_val += col.offset(); + chunk_start_val += col.offset(); col = col.child(0); } else { - current_page_start_val = col.child(lists_column_view::offsets_column_index) - .element(current_page_start_val + col.offset()); - col = col.child(lists_column_view::child_column_index); + auto offset_col = col.child(lists_column_view::offsets_column_index); + current_page_start_val = + offset_col.element(current_page_start_val + col.offset()); + chunk_start_val = offset_col.element(chunk_start_val + col.offset()); + col = col.child(lists_column_view::child_column_index); } } - s->page_start_val = current_page_start_val; + s->page_start_val = current_page_start_val; + s->chunk_start_val = chunk_start_val; } } __syncthreads(); for (uint32_t cur_val_idx = 0; cur_val_idx < s->page.num_leaf_values;) { - uint32_t nvals = min(s->page.num_leaf_values - cur_val_idx, 128); - uint32_t val_idx = s->page_start_val + cur_val_idx + t; - uint32_t is_valid, len, pos; + uint32_t nvals = min(s->page.num_leaf_values - cur_val_idx, 128); + uint32_t len, pos; + + auto [is_valid, val_idx] = [&]() { + uint32_t val_idx; + uint32_t is_valid; + + size_type val_idx_in_block = cur_val_idx + t; + if (s->page.page_type == PageType::DICTIONARY_PAGE) { + val_idx = val_idx_in_block; + is_valid = (val_idx < s->page.num_leaf_values); + if (is_valid) { val_idx = s->ck.dict_data[val_idx]; } + } else { + size_type val_idx_in_leaf_col = s->page_start_val + val_idx_in_block; + + is_valid = (val_idx_in_leaf_col < s->col.leaf_column->size() && + val_idx_in_block < s->page.num_leaf_values) + ? s->col.leaf_column->is_valid(val_idx_in_leaf_col) + : 0; + val_idx = + (s->ck.use_dictionary) ? val_idx_in_leaf_col - s->chunk_start_val : val_idx_in_leaf_col; + } + return std::make_tuple(is_valid, val_idx); + }(); - if (s->page.page_type == PageType::DICTIONARY_PAGE) { - is_valid = (cur_val_idx + t < s->page.num_leaf_values); - val_idx = (is_valid) ? s->col.dict_data[val_idx] : val_idx; - } else { - is_valid = (val_idx < s->col.leaf_column->size() && cur_val_idx + t < s->page.num_leaf_values) - ? s->col.leaf_column->is_valid(val_idx) - : 0; - } cur_val_idx += nvals; if (dict_bits >= 0) { // Dictionary encoding @@ -1124,7 +952,7 @@ __global__ void __launch_bounds__(128, 8) if (dtype == BOOLEAN) { v = s->col.leaf_column->element(val_idx); } else { - v = s->col.dict_index[val_idx]; + v = s->ck.dict_index[val_idx]; } s->vals[(rle_numvals + pos) & (rle_buffer_size - 1)] = v; } @@ -1531,13 +1359,12 @@ __global__ void __launch_bounds__(128) // data pages (actual encoding is identical). Encoding encoding; if (enable_bool_rle) { - encoding = (col_g.physical_type != BOOLEAN) - ? (page_type == PageType::DICTIONARY_PAGE || page_g.dict_bits_plus1 != 0) - ? Encoding::PLAIN_DICTIONARY - : Encoding::PLAIN - : Encoding::RLE; + encoding = (col_g.physical_type == BOOLEAN) ? Encoding::RLE + : (page_type == PageType::DICTIONARY_PAGE || page_g.chunk->use_dictionary) + ? Encoding::PLAIN_DICTIONARY + : Encoding::PLAIN; } else { - encoding = (page_type == PageType::DICTIONARY_PAGE || page_g.dict_bits_plus1 != 0) + encoding = (page_type == PageType::DICTIONARY_PAGE || page_g.chunk->use_dictionary) ? 
Encoding::PLAIN_DICTIONARY : Encoding::PLAIN; } @@ -1562,7 +1389,7 @@ __global__ void __launch_bounds__(128) } else { // DictionaryPageHeader encoder.field_struct_begin(7); - encoder.field_int32(1, ck_g.total_dict_entries); // number of values in dictionary + encoder.field_int32(1, ck_g.num_dict_entries); // number of values in dictionary encoder.field_int32(2, encoding); encoder.field_struct_end(7); } @@ -1613,12 +1440,12 @@ __global__ void __launch_bounds__(1024) memcpy_block<1024, true>(dst, src, data_len, t); dst += data_len; __syncthreads(); - if (!t && page == 0 && ck_g.has_dictionary) { ck_g.dictionary_size = hdr_len + data_len; } + if (!t && page == 0 && ck_g.use_dictionary) { ck_g.dictionary_size = hdr_len + data_len; } } if (t == 0) { chunks[blockIdx.x].bfr_size = uncompressed_size; chunks[blockIdx.x].compressed_size = (dst - dst_base); - if (ck_g.has_dictionary) { chunks[blockIdx.x].dictionary_size = ck_g.dictionary_size; } + if (ck_g.use_dictionary) { chunks[blockIdx.x].dictionary_size = ck_g.dictionary_size; } } } @@ -1966,9 +1793,9 @@ dremel_data get_dremel_data(column_view h_col, // Scan to get distance by which each offset value is shifted due to the insertion of empties auto scan_it = cudf::detail::make_counting_transform_iterator( - column_offsets[level], [off = lcv.offsets().data()] __device__(auto i) -> int { - return off[i] == off[i + 1]; - }); + column_offsets[level], + [off = lcv.offsets().data(), size = lcv.offsets().size()] __device__( + auto i) -> int { return (i + 1 < size) && (off[i] == off[i + 1]); }); rmm::device_uvector scan_out(offset_size_at_level, stream); thrust::exclusive_scan( rmm::exec_policy(stream), scan_it, scan_it + offset_size_at_level, scan_out.begin()); @@ -2053,9 +1880,9 @@ dremel_data get_dremel_data(column_view h_col, // Scan to get distance by which each offset value is shifted due to the insertion of dremel // level value fof an empty list auto scan_it = cudf::detail::make_counting_transform_iterator( - column_offsets[level], [off = lcv.offsets().data()] __device__(auto i) -> int { - return off[i] == off[i + 1]; - }); + column_offsets[level], + [off = lcv.offsets().data(), size = lcv.offsets().size()] __device__( + auto i) -> int { return (i + 1 < size) && (off[i] == off[i + 1]); }); rmm::device_uvector scan_out(offset_size_at_level, stream); thrust::exclusive_scan( rmm::exec_policy(stream), scan_it, scan_it + offset_size_at_level, scan_out.begin()); diff --git a/cpp/src/io/parquet/parquet.cpp b/cpp/src/io/parquet/parquet.cpp index 6c658788fa1..c8c54e9933f 100644 --- a/cpp/src/io/parquet/parquet.cpp +++ b/cpp/src/io/parquet/parquet.cpp @@ -347,6 +347,7 @@ int CompactProtocolReader::WalkSchema( ++idx; if (e->num_children > 0) { for (int i = 0; i < e->num_children; i++) { + e->children_idx.push_back(idx); int idx_old = idx; idx = WalkSchema(md, idx, parent_idx, max_def_level, max_rep_level); if (idx <= idx_old) break; // Error diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp index 2232017409d..4390d1c788f 100644 --- a/cpp/src/io/parquet/parquet.hpp +++ b/cpp/src/io/parquet/parquet.hpp @@ -165,6 +165,7 @@ struct SchemaElement { int max_definition_level = 0; int max_repetition_level = 0; int parent_idx = 0; + std::vector children_idx; bool operator==(SchemaElement const& other) const { diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 975d2545cd1..cdd7c6b6674 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -28,6 +28,8 @@ #include 
#include +#include + #include #include #include @@ -42,6 +44,10 @@ namespace parquet { using cudf::io::detail::string_index_pair; +// Total number of unsigned 16 bit values +constexpr size_type MAX_DICT_SIZE = + std::numeric_limits::max() - std::numeric_limits::min() + 1; + /** * @brief Struct representing an input column in the file. */ @@ -56,6 +62,11 @@ struct input_column_info { namespace gpu { +auto constexpr KEY_SENTINEL = size_type{-1}; +auto constexpr VALUE_SENTINEL = size_type{-1}; +using map_type = cuco::static_map; +using slot_type = map_type::pair_atomic_type; + /** * @brief Enums for the flags in the page header */ @@ -222,8 +233,6 @@ struct ColumnChunkDesc { * @brief Struct describing an encoder column */ struct parquet_column_device_view : stats_column_desc { - uint32_t* dict_index; //!< Dictionary index [row] - uint32_t* dict_data; //!< Dictionary data (unique row indices) uint8_t physical_type; //!< physical data type uint8_t converted_type; //!< logical data type uint8_t level_bits; //!< bits to encode max definition (lower nibble) & repetition (upper nibble) @@ -236,9 +245,9 @@ struct parquet_column_device_view : stats_column_desc { size_type const* level_offsets; //!< Offset array for per-row pre-calculated rep/def level values uint8_t const* rep_values; //!< Pre-calculated repetition level values uint8_t const* def_values; //!< Pre-calculated definition level values - uint8_t* nullability; //!< Array of nullability of each nesting level. e.g. nullable[0] is - //!< nullability of parent_column. May be different from col.nullable() in - //!< case of chunked writing. + uint8_t const* nullability; //!< Array of nullability of each nesting level. e.g. nullable[0] is + //!< nullability of parent_column. May be different from + //!< col.nullable() in case of chunked writing. }; constexpr int max_page_fragment_size = 5000; //!< Max number of rows in a page fragment @@ -253,7 +262,6 @@ struct PageFragment { uint32_t start_value_idx; uint32_t num_leaf_values; //!< Number of leaf values in fragment. Does not include nulls at //!< non-leaf level - uint32_t non_nulls; //!< Number of non-null values uint16_t num_rows; //!< Number of rows in fragment uint16_t num_dict_vals; //!< Number of unique dictionary entries }; @@ -292,26 +300,33 @@ struct EncPage; */ struct EncColumnChunk { parquet_column_device_view const* col_desc; //!< Column description - PageFragment* fragments; //!< First fragment in chunk - uint8_t* uncompressed_bfr; //!< Uncompressed page data - uint8_t* compressed_bfr; //!< Compressed page data - statistics_chunk const* stats; //!< Fragment statistics - uint32_t bfr_size; //!< Uncompressed buffer size - uint32_t compressed_size; //!< Compressed buffer size - uint32_t start_row; //!< First row of chunk - uint32_t num_rows; //!< Number of rows in chunk - uint32_t num_values; //!< Number of values in chunk. Different from num_rows for nested types + size_type col_desc_id; + PageFragment* fragments; //!< First fragment in chunk + uint8_t* uncompressed_bfr; //!< Uncompressed page data + uint8_t* compressed_bfr; //!< Compressed page data + statistics_chunk const* stats; //!< Fragment statistics + uint32_t bfr_size; //!< Uncompressed buffer size + uint32_t compressed_size; //!< Compressed buffer size + uint32_t start_row; //!< First row of chunk + uint32_t num_rows; //!< Number of rows in chunk + size_type num_values; //!< Number of values in chunk. 
Different from num_rows for nested types uint32_t first_fragment; //!< First fragment of chunk EncPage* pages; //!< Ptr to pages that belong to this chunk uint32_t first_page; //!< First page of chunk uint32_t num_pages; //!< Number of pages in chunk - uint32_t dictionary_id; //!< Dictionary id for this chunk uint8_t is_compressed; //!< Nonzero if the chunk uses compression - uint8_t has_dictionary; //!< Nonzero if the chunk uses dictionary encoding - uint16_t num_dict_fragments; //!< Number of fragments using dictionary - uint32_t dictionary_size; //!< Size of dictionary - uint32_t total_dict_entries; //!< Total number of entries in dictionary - uint32_t ck_stat_size; //!< Size of chunk-level statistics (included in 1st page header) + uint32_t dictionary_size; //!< Size of dictionary page including header + uint32_t ck_stat_size; //!< Size of chunk-level statistics (included in 1st page header) + slot_type* dict_map_slots; //!< Hash map storage for calculating dict encoding for this chunk + size_type dict_map_size; //!< Size of dict_map_slots + size_type num_dict_entries; //!< Total number of entries in dictionary + size_type + uniq_data_size; //!< Size of dictionary page (set of all unique values) if dict enc is used + size_type plain_data_size; //!< Size of data in this chunk if plain encoding is used + size_type* dict_data; //!< Dictionary data (unique row indices) + uint16_t* dict_index; //!< Index of value in dictionary page. column[dict_data[dict_index[row]]] + uint8_t dict_rle_bits; //!< Bit size for encoding dictionary indices + bool use_dictionary; //!< True if the chunk uses dictionary encoding }; /** @@ -322,7 +337,6 @@ struct EncPage { uint8_t* compressed_data; //!< Ptr to compressed page uint16_t num_fragments; //!< Number of fragments in page PageType page_type; //!< Page type - uint8_t dict_bits_plus1; //!< 0=plain, nonzero:bits to encoding dictionary indices + 1 EncColumnChunk* chunk; //!< Chunk that this page belongs to uint32_t chunk_id; //!< Index in chunk array uint32_t hdr_size; //!< Size of page header @@ -449,7 +463,7 @@ dremel_data get_dremel_data(column_view h_col, * @param[in] num_columns Number of columns * @param[in] fragment_size Number of rows per fragment * @param[in] num_rows Number of rows per column - * @param[in] stream CUDA stream to use, default 0 + * @param[in] stream CUDA stream to use */ void InitPageFragments(cudf::detail::device_2dspan frag, device_span col_desc, @@ -463,13 +477,57 @@ void InitPageFragments(cudf::detail::device_2dspan frag, * @param[out] groups Statistics groups [num_columns x num_fragments] * @param[in] fragments Page fragments [num_columns x num_fragments] * @param[in] col_desc Column description [num_columns] - * @param[in] stream CUDA stream to use, default 0 + * @param[in] stream CUDA stream to use */ void InitFragmentStatistics(cudf::detail::device_2dspan groups, cudf::detail::device_2dspan fragments, device_span col_desc, rmm::cuda_stream_view stream); +/** + * @brief Initialize per-chunk hash maps used for dictionary with sentinel values + * + * @param chunks Flat span of chunks to intialize hash maps for + * @param stream CUDA stream to use + */ +void initialize_chunk_hash_maps(device_span chunks, rmm::cuda_stream_view stream); + +/** + * @brief Insert chunk values into their respective hash maps + * + * @param chunks Column chunks [rowgroup][column] + * @param num_rows Number of rows per column + * @param stream CUDA stream to use + */ +void populate_chunk_hash_maps(cudf::detail::device_2dspan chunks, + size_type 
num_rows, + rmm::cuda_stream_view stream); + +/** + * @brief Compact dictionary hash map entries into chunk.dict_data + * + * @param chunks Flat span of chunks to compact hash maps for + * @param stream CUDA stream to use + */ +void collect_map_entries(device_span chunks, rmm::cuda_stream_view stream); + +/** + * @brief Get the Dictionary Indices for each row + * + * For each row of a chunk, gets the indices into chunk.dict_data which contains the value otherwise + * stored in input column [row]. Stores these indices into chunk.dict_index. + * + * Since dict_data itself contains indices into the original cudf column, this means that + * col[row] == col[dict_data[dict_index[row - chunk.start_row]]] + * + * @param chunks Column chunks [rowgroup][column] + * @param num_rows Number of rows per column + * @param stream CUDA stream to use + */ +void get_dictionary_indices(cudf::detail::device_2dspan chunks, + size_type num_rows, + rmm::cuda_stream_view stream); + /** * @brief Launches kernel for initializing encoder data pages * @@ -538,17 +596,6 @@ void GatherPages(device_span chunks, device_span pages, rmm::cuda_stream_view stream); -/** - * @brief Launches kernel for building chunk dictionaries - * - * @param[in] chunks Column chunks - * @param[in] dev_scratch Device scratch data (kDictScratchSize bytes per dictionary) - * @param[in] stream CUDA stream to use, default 0 - */ -void BuildChunkDictionaries(device_span chunks, - uint32_t* dev_scratch, - rmm::cuda_stream_view stream); - } // namespace gpu } // namespace parquet } // namespace io diff --git a/cpp/src/io/parquet/reader_impl.cu b/cpp/src/io/parquet/reader_impl.cu index 9f9bdfd4755..caf11b66206 100644 --- a/cpp/src/io/parquet/reader_impl.cu +++ b/cpp/src/io/parquet/reader_impl.cu @@ -464,8 +464,9 @@ class aggregate_metadata { * * @param names List of column names to load, where index column name(s) will be added */ - void add_pandas_index_names(std::vector& names) const + std::vector get_pandas_index_names() const { + std::vector names; auto str = get_pandas_index(); if (str.length() != 0) { std::regex index_name_expr{R"(\"((?:\\.|[^\"])*)\")"}; @@ -480,6 +481,7 @@ class aggregate_metadata { str = sm.suffix(); } } + return names; } struct row_group_info { @@ -549,86 +551,14 @@ class aggregate_metadata { return selection; } - /** - * @brief Build input and output column structures based on schema input. Recursive. - * - * @param[in,out] schema_idx Schema index to build information for. This value gets - * incremented as the function recurses. - * @param[out] input_columns Input column information (source data in the file) - * @param[out] output_columns Output column structure (resulting cudf columns) - * @param[in,out] nesting A stack keeping track of child column indices so we can - * reproduce the linear list of output columns that correspond to an input column. - * @param[in] strings_to_categorical Type conversion parameter - * @param[in] timestamp_type_id Type conversion parameter - * @param[in] strict_decimal_types True if it is an error to load an unsupported decimal type - * - */ - void build_column_info(int& schema_idx, - std::vector& input_columns, - std::vector& output_columns, - std::deque& nesting, - bool strings_to_categorical, - type_id timestamp_type_id, - bool strict_decimal_types) const - { - int start_schema_idx = schema_idx; - auto const& schema = get_schema(schema_idx); - schema_idx++; - - // if I am a stub, continue on - if (schema.is_stub()) { - // is this legit? 
- CUDF_EXPECTS(schema.num_children == 1, "Unexpected number of children for stub"); - build_column_info(schema_idx, - input_columns, - output_columns, - nesting, - strings_to_categorical, - timestamp_type_id, - strict_decimal_types); - return; - } - - // if we're at the root, this is a new output column - nesting.push_back(static_cast(output_columns.size())); - auto const col_type = - to_type_id(schema, strings_to_categorical, timestamp_type_id, strict_decimal_types); - auto const dtype = col_type == type_id::DECIMAL32 || col_type == type_id::DECIMAL64 - ? data_type{col_type, numeric::scale_type{-schema.decimal_scale}} - : data_type{col_type}; - output_columns.emplace_back(dtype, schema.repetition_type == OPTIONAL ? true : false); - column_buffer& output_col = output_columns.back(); - output_col.name = schema.name; - - // build each child - for (int idx = 0; idx < schema.num_children; idx++) { - build_column_info(schema_idx, - input_columns, - output_col.children, - nesting, - strings_to_categorical, - timestamp_type_id, - strict_decimal_types); - } - - // if I have no children, we're at a leaf and I'm an input column (that is, one with actual - // data stored) so add me to the list. - if (schema.num_children == 0) { - input_columns.emplace_back(input_column_info{start_schema_idx, schema.name}); - input_column_info& input_col = input_columns.back(); - std::copy(nesting.begin(), nesting.end(), std::back_inserter(input_col.nesting)); - } - - nesting.pop_back(); - } - /** * @brief Filters and reduces down to a selection of columns * - * @param use_names List of column names to select + * @param use_names List of paths of column names to select * @param include_index Whether to always include the PANDAS index column(s) * @param strings_to_categorical Type conversion parameter * @param timestamp_type_id Type conversion parameter + * @param strict_decimal_types Type conversion parameter * * @return input column information, output column information, list of output column schema * indices @@ -639,9 +569,86 @@ class aggregate_metadata { type_id timestamp_type_id, bool strict_decimal_types) const { - auto const& pfm = per_file_metadata[0]; + auto find_schema_child = [&](SchemaElement const& schema_elem, std::string const& name) { + auto const& col_schema_idx = std::find_if( + schema_elem.children_idx.cbegin(), + schema_elem.children_idx.cend(), + [&](size_t col_schema_idx) { return get_schema(col_schema_idx).name == name; }); + + return (col_schema_idx != schema_elem.children_idx.end()) ? static_cast(*col_schema_idx) + : -1; + }; + + std::vector output_columns; + std::vector input_columns; + std::vector nesting; + + // Return true if column path is valid. e.g. if the path is {"struct1", "child1"}, then it is + // valid if "struct1.child1" exists in this file's schema. If "struct1" exists but "child1" is + // not a child of "struct1" then the function will return false for "struct1" + std::function&)> build_column = + [&](column_name_info const* col_name_info, + int schema_idx, + std::vector& out_col_array) { + if (schema_idx < 0) { return false; } + auto const& schema_elem = get_schema(schema_idx); + + // if schema_elem is a stub then it does not exist in the column_name_info and column_buffer + // hierarchy. So continue on + if (schema_elem.is_stub()) { + // is this legit? + CUDF_EXPECTS(schema_elem.num_children == 1, "Unexpected number of children for stub"); + auto child_col_name_info = (col_name_info) ? 
&col_name_info->children[0] : nullptr; + return build_column(child_col_name_info, schema_elem.children_idx[0], out_col_array); + } + + // if we're at the root, this is a new output column + auto const col_type = + to_type_id(schema_elem, strings_to_categorical, timestamp_type_id, strict_decimal_types); + auto const dtype = col_type == type_id::DECIMAL32 || col_type == type_id::DECIMAL64 + ? data_type{col_type, numeric::scale_type{-schema_elem.decimal_scale}} + : data_type{col_type}; + + column_buffer output_col(dtype, schema_elem.repetition_type == OPTIONAL); + // store the index of this element if inserted in out_col_array + nesting.push_back(static_cast(out_col_array.size())); + output_col.name = schema_elem.name; + + // build each child + bool path_is_valid = false; + if (col_name_info == nullptr or col_name_info->children.empty()) { + // add all children of schema_elem. + // At this point, we can no longer pass a col_name_info to build_column + for (int idx = 0; idx < schema_elem.num_children; idx++) { + path_is_valid |= + build_column(nullptr, schema_elem.children_idx[idx], output_col.children); + } + } else { + for (size_t idx = 0; idx < col_name_info->children.size(); idx++) { + path_is_valid |= + build_column(&col_name_info->children[idx], + find_schema_child(schema_elem, col_name_info->children[idx].name), + output_col.children); + } + } + + // if I have no children, we're at a leaf and I'm an input column (that is, one with actual + // data stored) so add me to the list. + if (schema_elem.num_children == 0) { + input_column_info& input_col = + input_columns.emplace_back(input_column_info{schema_idx, schema_elem.name}); + std::copy(nesting.cbegin(), nesting.cend(), std::back_inserter(input_col.nesting)); + path_is_valid = true; // If we're able to reach leaf then path is valid + } + + if (path_is_valid) { out_col_array.push_back(std::move(output_col)); } + + nesting.pop_back(); + return path_is_valid; + }; + + std::vector output_column_schemas; - // determine the list of output columns // // there is not necessarily a 1:1 mapping between input columns and output columns. // For example, parquet does not explicitly store a ColumnChunkDesc for struct columns. @@ -657,43 +664,120 @@ class aggregate_metadata { // "firstname", "middlename" and "lastname" represent the input columns in the file that we // process to produce the final cudf "name" column. // - std::vector output_column_schemas; + // A user can ask for a single field out of the struct e.g. firstname. + // In this case they'll pass a fully qualified name to the schema element like + // ["name", "firstname"] + // + auto const& root = get_schema(0); if (use_names.empty()) { - // walk the schema and choose all top level columns - for (size_t schema_idx = 1; schema_idx < pfm.schema.size(); schema_idx++) { - auto const& schema = pfm.schema[schema_idx]; - if (schema.parent_idx == 0) { output_column_schemas.push_back(schema_idx); } + for (auto const& schema_idx : root.children_idx) { + build_column(nullptr, schema_idx, output_columns); + output_column_schemas.push_back(schema_idx); } } else { - // Load subset of columns; include PANDAS index unless excluded - std::vector local_use_names = use_names; - if (include_index) { add_pandas_index_names(local_use_names); } - for (const auto& use_name : local_use_names) { - for (size_t schema_idx = 1; schema_idx < pfm.schema.size(); schema_idx++) { - auto const& schema = pfm.schema[schema_idx]; - // We select only top level columns by name. 
Selecting nested columns by name is not - // supported. Top level columns are identified by their parent being the root (idx == 0) - if (use_name == schema.name and schema.parent_idx == 0) { - output_column_schemas.push_back(schema_idx); - } + struct path_info { + std::string full_path; + int schema_idx; + }; + + // Convert schema into a vector of every possible path + std::vector all_paths; + std::function add_path = [&](std::string path_till_now, + int schema_idx) { + auto const& schema_elem = get_schema(schema_idx); + std::string curr_path = path_till_now + schema_elem.name; + all_paths.push_back({curr_path, schema_idx}); + for (auto const& child_idx : schema_elem.children_idx) { + add_path(curr_path + ".", child_idx); } + }; + for (auto const& child_idx : get_schema(0).children_idx) { + add_path("", child_idx); } - } - // construct input and output output column info - std::vector output_columns; - output_columns.reserve(output_column_schemas.size()); - std::vector input_columns; - std::deque nesting; - for (size_t idx = 0; idx < output_column_schemas.size(); idx++) { - int schema_index = output_column_schemas[idx]; - build_column_info(schema_index, - input_columns, - output_columns, - nesting, - strings_to_categorical, - timestamp_type_id, - strict_decimal_types); + // Find which of the selected paths are valid and get their schema index + std::vector valid_selected_paths; + for (auto const& selected_path : use_names) { + auto found_path = + std::find_if(all_paths.begin(), all_paths.end(), [&](path_info& valid_path) { + return valid_path.full_path == selected_path; + }); + if (found_path != all_paths.end()) { + valid_selected_paths.push_back({selected_path, found_path->schema_idx}); + } + } + + // Now construct paths as vector of strings for further consumption + std::vector> use_names3; + std::transform(valid_selected_paths.begin(), + valid_selected_paths.end(), + std::back_inserter(use_names3), + [&](path_info const& valid_path) { + auto schema_idx = valid_path.schema_idx; + std::vector result_path; + do { + SchemaElement const& elem = get_schema(schema_idx); + result_path.push_back(elem.name); + schema_idx = elem.parent_idx; + } while (schema_idx > 0); + return std::vector(result_path.rbegin(), result_path.rend()); + }); + + std::vector selected_columns; + if (include_index) { + std::vector index_names = get_pandas_index_names(); + std::transform(index_names.cbegin(), + index_names.cend(), + std::back_inserter(selected_columns), + [](std::string const& name) { return column_name_info(name); }); + } + // Merge the vector use_names into a set of hierarchical column_name_info objects + /* This is because if we have columns like this: + * col1 + * / \ + * s3 f4 + * / \ + * f5 f6 + * + * there may be common paths in use_names like: + * {"col1", "s3", "f5"}, {"col1", "f4"} + * which means we want the output to contain + * col1 + * / \ + * s3 f4 + * / + * f5 + * + * rather than + * col1 col1 + * | | + * s3 f4 + * | + * f5 + */ + for (auto const& path : use_names3) { + auto array_to_find_in = &selected_columns; + for (size_t depth = 0; depth < path.size(); ++depth) { + // Check if the path exists in our selected_columns and if not, add it. 
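The loop above folds each selected dotted path into the running selected_columns forest so that shared prefixes (e.g. {"col1", "s3", "f5"} and {"col1", "f4"}) end up under a single "col1" subtree. A minimal standalone sketch of the same merge, using a hypothetical name_node type in place of column_name_info:

#include <algorithm>
#include <string>
#include <vector>

// Hypothetical stand-in for column_name_info: a name plus child nodes.
struct name_node {
  std::string name;
  std::vector<name_node> children;
};

// Merge one column path into a forest, sharing common prefixes.
// Merging {"col1","s3","f5"} and then {"col1","f4"} produces one "col1" node
// with children "s3" (containing "f5") and "f4".
void merge_path(std::vector<name_node>& forest, std::vector<std::string> const& path)
{
  auto* level = &forest;
  for (auto const& name : path) {
    auto it = std::find_if(level->begin(), level->end(),
                           [&](name_node const& n) { return n.name == name; });
    if (it == level->end()) {
      level->push_back(name_node{name, {}});
      level = &level->back().children;
    } else {
      level = &it->children;  // prefix already present; descend into it
    }
  }
}

Calling merge_path once per requested path yields exactly the hierarchy that build_column later walks against the file schema.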
+ auto const& name_to_find = path[depth]; + auto found_col = std::find_if( + array_to_find_in->begin(), + array_to_find_in->end(), + [&name_to_find](column_name_info const& col) { return col.name == name_to_find; }); + if (found_col == array_to_find_in->end()) { + auto& col = array_to_find_in->emplace_back(name_to_find); + array_to_find_in = &col.children; + } else { + // Path exists. go down further. + array_to_find_in = &found_col->children; + } + } + } + for (auto& col : selected_columns) { + auto const& top_level_col_schema_idx = find_schema_child(root, col.name); + bool valid_column = build_column(&col, top_level_col_schema_idx, output_columns); + if (valid_column) output_column_schemas.push_back(top_level_col_schema_idx); + } } return std::make_tuple( @@ -1581,18 +1665,16 @@ table_with_metadata reader::impl::read(size_type skip_rows, // create the final output cudf columns for (size_t i = 0; i < _output_columns.size(); ++i) { - out_metadata.schema_info.push_back(column_name_info{""}); - out_columns.emplace_back( - make_column(_output_columns[i], &out_metadata.schema_info.back(), stream, _mr)); + column_name_info& col_name = out_metadata.schema_info.emplace_back(""); + out_columns.emplace_back(make_column(_output_columns[i], &col_name, stream, _mr)); } } } // Create empty columns as needed (this can happen if we've ended up with no actual data to read) for (size_t i = out_columns.size(); i < _output_columns.size(); ++i) { - out_metadata.schema_info.push_back(column_name_info{""}); - out_columns.emplace_back(cudf::io::detail::empty_like( - _output_columns[i], &out_metadata.schema_info.back(), stream, _mr)); + column_name_info& col_name = out_metadata.schema_info.emplace_back(""); + out_columns.emplace_back(io::detail::empty_like(_output_columns[i], &col_name, stream, _mr)); } // Return column names (must match order of returned columns) diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 7c0ce03886d..0d4ce40354f 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -364,6 +364,26 @@ struct leaf_schema_fn { } }; +inline bool is_col_nullable(LinkedColPtr const& col, + column_in_metadata const& col_meta, + bool single_write_mode) +{ + if (single_write_mode) { + return col->nullable(); + } else { + if (col_meta.is_nullability_defined()) { + CUDF_EXPECTS(col_meta.nullable() || !col->nullable(), + "Mismatch in metadata prescribed nullability and input column nullability. " + "Metadata for nullable input column cannot prescribe nullability = false"); + return col_meta.nullable(); + } else { + // For chunked write, when not provided nullability, we assume the worst case scenario + // that all columns are nullable. + return true; + } + } +} + /** * @brief Construct schema from input columns and per-column input options * @@ -371,7 +391,7 @@ struct leaf_schema_fn { * The resulting schema tree is stored in a vector in pre-order traversal order. 
*/ std::vector construct_schema_tree(LinkedColVector const& linked_columns, - table_input_metadata const& metadata, + table_input_metadata& metadata, bool single_write_mode, bool int96_timestamps) { @@ -384,27 +404,9 @@ std::vector construct_schema_tree(LinkedColVector const& linke root.parent_idx = -1; // root schema has no parent schema.push_back(std::move(root)); - std::function add_schema = - [&](LinkedColPtr const& col, column_in_metadata const& col_meta, size_t parent_idx) { - bool col_nullable = [&]() { - if (single_write_mode) { - return col->nullable(); - } else { - if (col_meta.is_nullability_defined()) { - if (col_meta.nullable() == false) { - CUDF_EXPECTS( - col->nullable() == false, - "Mismatch in metadata prescribed nullability and input column nullability. " - "Metadata for nullable input column cannot prescribe nullability = false"); - } - return col_meta.nullable(); - } else { - // For chunked write, when not provided nullability, we assume the worst case scenario - // that all columns are nullable. - return true; - } - } - }(); + std::function add_schema = + [&](LinkedColPtr const& col, column_in_metadata& col_meta, size_t parent_idx) { + bool col_nullable = is_col_nullable(col, col_meta, single_write_mode); if (col->type().id() == type_id::STRUCT) { // if struct, add current and recursively call for all children @@ -426,7 +428,7 @@ std::vector construct_schema_tree(LinkedColVector const& linke for (size_t i = 0; i < col->children.size(); ++i) { add_schema(col->children[i], col_meta.child(i), struct_node_index); } - } else if (col->type().id() == type_id::LIST) { + } else if (col->type().id() == type_id::LIST && !col_meta.is_map()) { // List schema is denoted by two levels for each nesting level and one final level for leaf. // The top level is the same name as the column name. // So e.g. List> is denoted in the schema by @@ -454,6 +456,58 @@ std::vector construct_schema_tree(LinkedColVector const& linke add_schema(col->children[lists_column_view::child_column_index], col_meta.child(lists_column_view::child_column_index), schema.size() - 1); + } else if (col->type().id() == type_id::LIST && col_meta.is_map()) { + // Map schema is denoted by a list of struct + // e.g. List> will be + // "col_name" : { "key_value" : { "key", "value" } } + + // verify the List child structure is a struct + auto const& struct_col = col->child(lists_column_view::child_column_index); + CUDF_EXPECTS(struct_col.type().id() == type_id::STRUCT, "Map should be a List of struct"); + CUDF_EXPECTS(struct_col.num_children() == 2, + "Map should be a List of struct with two children only but found " + + std::to_string(struct_col.num_children())); + + schema_tree_node map_schema{}; + map_schema.converted_type = ConvertedType::MAP; + map_schema.repetition_type = + col_nullable ? FieldRepetitionType::OPTIONAL : FieldRepetitionType::REQUIRED; + map_schema.name = col_meta.get_name(); + map_schema.num_children = 1; + map_schema.parent_idx = parent_idx; + schema.push_back(std::move(map_schema)); + + schema_tree_node repeat_group{}; + repeat_group.repetition_type = FieldRepetitionType::REPEATED; + repeat_group.name = "key_value"; + repeat_group.num_children = 2; + repeat_group.parent_idx = schema.size() - 1; // Parent is map_schema, last added. 
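As a usage-level illustration of the map branch above: a caller marks a list<struct<...>> column as a map through the column metadata, and the writer emits the MAP / key_value / key / value schema and fills in the child names itself. This is a hedged sketch only; it assumes a table tbl whose single column is a list<struct<string, int32>>, and that column_in_metadata::set_list_column_as_map() is the public setter behind the is_map() check (the writer-options calls and their defaults are assumptions, not verified against this exact revision).

#include <cudf/io/parquet.hpp>
#include <cudf/io/types.hpp>
#include <cudf/table/table_view.hpp>

void write_as_map(cudf::table_view const& tbl)
{
  // tbl is assumed to hold one list<struct<string, int32>> column.
  cudf::io::table_input_metadata meta(tbl);
  meta.column_metadata[0].set_name("my_map").set_list_column_as_map();

  auto opts = cudf::io::parquet_writer_options::builder(
                cudf::io::sink_info{"map_example.parquet"}, tbl)
                .metadata(&meta)
                .build();
  cudf::io::write_parquet(opts);
}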
+ schema.push_back(std::move(repeat_group)); + + CUDF_EXPECTS(col_meta.num_children() == 2, + "List column's metadata should have exactly two children"); + CUDF_EXPECTS(col_meta.child(lists_column_view::child_column_index).num_children() == 2, + "Map struct column should have exactly two children"); + // verify the col meta of children of the struct have name key and value + auto& left_child_meta = col_meta.child(lists_column_view::child_column_index).child(0); + left_child_meta.set_name("key"); + left_child_meta.set_nullability(false); + + auto& right_child_meta = col_meta.child(lists_column_view::child_column_index).child(1); + right_child_meta.set_name("value"); + // check the repetition type of key is required i.e. the col should be non-nullable + auto key_col = col->children[lists_column_view::child_column_index]->children[0]; + CUDF_EXPECTS(!is_col_nullable(key_col, left_child_meta, single_write_mode), + "key column cannot be nullable. For chunked writing, explicitly set the " + "nullability to false in metadata"); + // process key + size_type struct_col_index = schema.size() - 1; + add_schema(key_col, left_child_meta, struct_col_index); + // process value + add_schema(col->children[lists_column_view::child_column_index]->children[1], + right_child_meta, + struct_col_index); + } else { // if leaf, add current if (col->type().id() == type_id::STRING) { @@ -505,7 +559,7 @@ struct parquet_column_view { rmm::cuda_stream_view stream); column_view leaf_column_view() const; - gpu::parquet_column_device_view get_device_view(rmm::cuda_stream_view stream); + gpu::parquet_column_device_view get_device_view(rmm::cuda_stream_view stream) const; column_view cudf_column_view() const { return cudf_col; } parquet::Type physical_type() const { return schema_node.type; } @@ -517,26 +571,6 @@ struct parquet_column_view { uint8_t max_rep_level() const noexcept { return _max_rep_level; } bool is_list() const noexcept { return _is_list; } - // Dictionary related member functions - uint32_t* get_dict_data() { return (_dict_data.size()) ? _dict_data.data() : nullptr; } - uint32_t* get_dict_index() { return (_dict_index.size()) ? 
_dict_index.data() : nullptr; } - void use_dictionary(bool use_dict) { _dictionary_used = use_dict; } - void alloc_dictionary(size_t max_num_rows, rmm::cuda_stream_view stream) - { - _dict_data.resize(max_num_rows, stream); - _dict_index.resize(max_num_rows, stream); - } - bool check_dictionary_used(rmm::cuda_stream_view stream) - { - if (!_dictionary_used) { - _dict_data.resize(0, stream); - _dict_data.shrink_to_fit(stream); - _dict_index.resize(0, stream); - _dict_index.shrink_to_fit(stream); - } - return _dictionary_used; - } - private: // Schema related members schema_tree_node schema_node; @@ -556,11 +590,6 @@ struct parquet_column_view { rmm::device_uvector _def_level; std::vector _nullability; size_type _data_count = 0; - - // Dictionary related members - bool _dictionary_used = false; - rmm::device_uvector _dict_data; - rmm::device_uvector _dict_index; }; parquet_column_view::parquet_column_view(schema_tree_node const& schema_node, @@ -570,9 +599,7 @@ parquet_column_view::parquet_column_view(schema_tree_node const& schema_node, _d_nullability(0, stream), _dremel_offsets(0, stream), _rep_level(0, stream), - _def_level(0, stream), - _dict_data(0, stream), - _dict_index(0, stream) + _def_level(0, stream) { // Construct single inheritance column_view from linked_column_view auto curr_col = schema_node.leaf_column.get(); @@ -683,21 +710,14 @@ column_view parquet_column_view::leaf_column_view() const return col; } -gpu::parquet_column_device_view parquet_column_view::get_device_view(rmm::cuda_stream_view stream) +gpu::parquet_column_device_view parquet_column_view::get_device_view( + rmm::cuda_stream_view stream) const { column_view col = leaf_column_view(); auto desc = gpu::parquet_column_device_view{}; // Zero out all fields desc.stats_dtype = schema_node.stats_dtype; desc.ts_scale = schema_node.ts_scale; - // TODO (dm): Enable dictionary for list and struct after refactor - if (physical_type() != BOOLEAN && physical_type() != UNDEFINED_TYPE && - !is_nested(cudf_col.type())) { - alloc_dictionary(_data_count, stream); - desc.dict_index = get_dict_index(); - desc.dict_data = get_dict_data(); - } - if (is_list()) { desc.level_offsets = _dremel_offsets.data(); desc.rep_values = _rep_level.data(); @@ -705,15 +725,9 @@ gpu::parquet_column_device_view parquet_column_view::get_device_view(rmm::cuda_s } desc.num_rows = cudf_col.size(); desc.physical_type = static_cast(physical_type()); - auto count_bits = [](uint16_t number) { - int16_t nbits = 0; - while (number > 0) { - nbits++; - number >>= 1; - } - return nbits; - }; - desc.level_bits = count_bits(max_rep_level()) << 4 | count_bits(max_def_level()); + + desc.level_bits = CompactProtocolReader::NumRequiredBits(max_rep_level()) << 4 | + CompactProtocolReader::NumRequiredBits(max_def_level()); desc.nullability = _d_nullability.data(); return desc; } @@ -744,22 +758,99 @@ void writer::impl::gather_fragment_statistics( stream.synchronize(); } -void writer::impl::build_chunk_dictionaries( - hostdevice_2dvector& chunks, - device_span col_desc, - uint32_t num_columns, - uint32_t num_dictionaries) +void writer::impl::init_page_sizes(hostdevice_2dvector& chunks, + device_span col_desc, + uint32_t num_columns) { chunks.host_to_device(stream); - if (num_dictionaries > 0) { - size_t dict_scratch_size = (size_t)num_dictionaries * gpu::kDictScratchSize; - auto dict_scratch = cudf::detail::make_zeroed_device_uvector_async( - dict_scratch_size / sizeof(uint32_t), stream); + gpu::InitEncoderPages(chunks, {}, col_desc, num_columns, nullptr, nullptr, stream); 
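The hand-rolled count_bits lambda removed above is replaced by CompactProtocolReader::NumRequiredBits, and level_bits packs the repetition-level width into the upper nibble and the definition-level width into the lower nibble. A small self-contained sketch of that packing; num_required_bits here mirrors what NumRequiredBits is assumed to compute (the same loop the removed lambda used):

#include <cstdint>

// Number of bits needed to represent `value` (0 -> 0 bits).
constexpr int num_required_bits(uint32_t value)
{
  int nbits = 0;
  while (value > 0) {
    ++nbits;
    value >>= 1;
  }
  return nbits;
}

// Repetition-level bits go in the upper nibble, definition-level bits in the
// lower nibble, e.g. max_rep_level = 2 and max_def_level = 3 both need 2 bits,
// giving (2 << 4) | 2 = 0x22.
constexpr uint8_t pack_level_bits(uint16_t max_rep_level, uint16_t max_def_level)
{
  return static_cast<uint8_t>(num_required_bits(max_rep_level) << 4 |
                              num_required_bits(max_def_level));
}

static_assert(pack_level_bits(2, 3) == 0x22, "rep=2 and def=3 each need 2 bits");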
+ chunks.device_to_host(stream, true); +} + +auto build_chunk_dictionaries(hostdevice_2dvector& chunks, + host_span col_desc, + uint32_t num_rows, + rmm::cuda_stream_view stream) +{ + // At this point, we know all chunks and their sizes. We want to allocate dictionaries for each + // chunk that can have dictionary + + auto h_chunks = chunks.host_view().flat_view(); - gpu::BuildChunkDictionaries(chunks.device_view().flat_view(), dict_scratch.data(), stream); + std::vector> dict_data; + std::vector> dict_index; + + if (h_chunks.size() == 0) { return std::make_pair(std::move(dict_data), std::move(dict_index)); } + + // Allocate slots for each chunk + std::vector> hash_maps_storage; + hash_maps_storage.reserve(h_chunks.size()); + for (auto& chunk : h_chunks) { + if (col_desc[chunk.col_desc_id].physical_type == Type::BOOLEAN) { + chunk.use_dictionary = false; + } else { + chunk.use_dictionary = true; + auto& inserted_map = hash_maps_storage.emplace_back(chunk.num_values, stream); + chunk.dict_map_slots = inserted_map.data(); + chunk.dict_map_size = inserted_map.size(); + } } - gpu::InitEncoderPages(chunks, {}, col_desc, num_columns, nullptr, nullptr, stream); + + chunks.host_to_device(stream); + + gpu::initialize_chunk_hash_maps(chunks.device_view().flat_view(), stream); + gpu::populate_chunk_hash_maps(chunks, num_rows, stream); + chunks.device_to_host(stream, true); + + // Make decision about which chunks have dictionary + for (auto& ck : h_chunks) { + if (not ck.use_dictionary) { continue; } + std::tie(ck.use_dictionary, ck.dict_rle_bits) = [&]() { + // calculate size of chunk if dictionary is used + + // If we have N unique values then the idx for the last value is N - 1 and nbits is the number + // of bits required to encode indices into the dictionary + auto max_dict_index = (ck.num_dict_entries > 0) ? ck.num_dict_entries - 1 : 0; + auto nbits = CompactProtocolReader::NumRequiredBits(max_dict_index); + + // We don't use dictionary if the indices are > 16 bits because that's the maximum bitpacking + // bitsize we efficiently support + if (nbits > 16) { return std::make_pair(false, 0); } + + // Only these bit sizes are allowed for RLE encoding because it's compute optimized + constexpr auto allowed_bitsizes = std::array{1, 2, 4, 8, 12, 16}; + + // ceil to (1/2/4/8/12/16) + auto rle_bits = *std::lower_bound(allowed_bitsizes.begin(), allowed_bitsizes.end(), nbits); + auto rle_byte_size = util::div_rounding_up_safe(ck.num_values * rle_bits, 8); + + auto dict_enc_size = ck.uniq_data_size + rle_byte_size; + + bool use_dict = (ck.plain_data_size > dict_enc_size); + if (not use_dict) { rle_bits = 0; } + return std::make_pair(use_dict, rle_bits); + }(); + } + + // TODO: (enh) Deallocate hash map storage for chunks that don't use dict and clear pointers. 
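To summarize the decision just made: a chunk keeps dictionary encoding only when the dictionary page (the set of unique values) plus the RLE-bit-packed indices is smaller than plain encoding, with the index width rounded up to one of the RLE-friendly bit sizes. A host-side sketch of the same arithmetic, illustrative only, with byte sizes assumed to be precomputed:

#include <algorithm>
#include <array>
#include <cstddef>
#include <utility>

// Returns {use_dictionary, rle_bits} for a chunk, mirroring the decision above.
std::pair<bool, int> choose_dict_encoding(std::size_t num_dict_entries,
                                          std::size_t num_values,
                                          std::size_t uniq_data_size,
                                          std::size_t plain_data_size)
{
  // Bits needed to index the largest dictionary entry (index N-1 for N entries).
  std::size_t max_index = (num_dict_entries > 0) ? num_dict_entries - 1 : 0;
  int nbits = 0;
  while (max_index > 0) { ++nbits; max_index >>= 1; }

  // Indices wider than 16 bits are not bit-packed efficiently; fall back to plain.
  if (nbits > 16) return {false, 0};

  // Round up to one of the bit widths supported by the RLE encoder.
  constexpr std::array<int, 6> allowed_bitsizes{1, 2, 4, 8, 12, 16};
  int rle_bits = *std::lower_bound(allowed_bitsizes.begin(), allowed_bitsizes.end(), nbits);

  std::size_t rle_byte_size = (num_values * rle_bits + 7) / 8;  // ceil division
  std::size_t dict_enc_size = uniq_data_size + rle_byte_size;   // dict page + packed indices

  return (plain_data_size > dict_enc_size) ? std::make_pair(true, rle_bits)
                                           : std::make_pair(false, 0);
}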
+ + dict_data.reserve(h_chunks.size()); + dict_index.reserve(h_chunks.size()); + for (auto& chunk : h_chunks) { + if (not chunk.use_dictionary) { continue; } + + size_t dict_data_size = std::min(MAX_DICT_SIZE, chunk.dict_map_size); + auto& inserted_dict_data = dict_data.emplace_back(dict_data_size, stream); + auto& inserted_dict_index = dict_index.emplace_back(chunk.num_values, stream); + chunk.dict_data = inserted_dict_data.data(); + chunk.dict_index = inserted_dict_index.data(); + } + chunks.host_to_device(stream); + gpu::collect_map_entries(chunks.device_view().flat_view(), stream); + gpu::get_dictionary_indices(chunks.device_view(), num_rows, stream); + + return std::make_pair(std::move(dict_data), std::move(dict_index)); } void writer::impl::init_encoder_pages(hostdevice_2dvector& chunks, @@ -959,10 +1050,8 @@ void writer::impl::write(table_view const& table) // Initialize column description hostdevice_vector col_desc(parquet_columns.size(), stream); - // This should've been `auto const&` but isn't since dictionary space is allocated when calling - // get_device_view(). Fix during dictionary refactor. std::transform( - parquet_columns.begin(), parquet_columns.end(), col_desc.host_ptr(), [&](auto& pcol) { + parquet_columns.begin(), parquet_columns.end(), col_desc.host_ptr(), [&](auto const& pcol) { return pcol.get_device_view(stream); }); @@ -973,11 +1062,9 @@ void writer::impl::write(table_view const& table) // ideally want the page size to be below 1MB so as to have enough pages to get good // compression/decompression performance). using cudf::io::parquet::gpu::max_page_fragment_size; - constexpr uint32_t fragment_size = 5000; - static_assert(fragment_size <= max_page_fragment_size, - "fragment size cannot be greater than max_page_fragment_size"); - uint32_t num_fragments = (uint32_t)((num_rows + fragment_size - 1) / fragment_size); + uint32_t num_fragments = + (uint32_t)((num_rows + max_page_fragment_size - 1) / max_page_fragment_size); cudf::detail::hostdevice_2dvector fragments( num_columns, num_fragments, stream); @@ -987,7 +1074,7 @@ void writer::impl::write(table_view const& table) leaf_column_views = create_leaf_column_device_views( col_desc, *parent_column_table_device_view, stream); - init_page_fragments(fragments, col_desc, num_rows, fragment_size); + init_page_fragments(fragments, col_desc, num_rows, max_page_fragment_size); } size_t global_rowgroup_base = md.row_groups.size(); @@ -1002,11 +1089,12 @@ void writer::impl::write(table_view const& table) for (auto i = 0; i < num_columns; i++) { fragment_data_size += fragments[i][f].fragment_data_size; } - if (f > rowgroup_start && (rowgroup_size + fragment_data_size > max_rowgroup_size_ || - (f + 1 - rowgroup_start) * fragment_size > max_rowgroup_rows_)) { + if (f > rowgroup_start && + (rowgroup_size + fragment_data_size > max_rowgroup_size_ || + (f + 1 - rowgroup_start) * max_page_fragment_size > max_rowgroup_rows_)) { // update schema md.row_groups.resize(md.row_groups.size() + 1); - md.row_groups[global_r++].num_rows = (f - rowgroup_start) * fragment_size; + md.row_groups[global_r++].num_rows = (f - rowgroup_start) * max_page_fragment_size; num_rowgroups++; rowgroup_start = f; rowgroup_size = 0; @@ -1015,7 +1103,7 @@ void writer::impl::write(table_view const& table) if (f + 1 == num_fragments) { // update schema md.row_groups.resize(md.row_groups.size() + 1); - md.row_groups[global_r++].num_rows = num_rows - rowgroup_start * fragment_size; + md.row_groups[global_r++].num_rows = num_rows - rowgroup_start * 
max_page_fragment_size; num_rowgroups++; } } @@ -1033,20 +1121,19 @@ void writer::impl::write(table_view const& table) // Initialize row groups and column chunks uint32_t num_chunks = num_rowgroups * num_columns; hostdevice_2dvector chunks(num_rowgroups, num_columns, stream); - uint32_t num_dictionaries = 0; for (uint32_t r = 0, global_r = global_rowgroup_base, f = 0, start_row = 0; r < num_rowgroups; r++, global_r++) { - uint32_t fragments_in_chunk = - (uint32_t)((md.row_groups[global_r].num_rows + fragment_size - 1) / fragment_size); + uint32_t fragments_in_chunk = (uint32_t)( + (md.row_groups[global_r].num_rows + max_page_fragment_size - 1) / max_page_fragment_size); md.row_groups[global_r].total_byte_size = 0; md.row_groups[global_r].columns.resize(num_columns); for (int i = 0; i < num_columns; i++) { gpu::EncColumnChunk* ck = &chunks[r][i]; - bool dict_enable = false; - *ck = {}; - ck->col_desc = col_desc.device_ptr() + i; - ck->fragments = &fragments.device_view()[i][f]; + *ck = {}; + ck->col_desc = col_desc.device_ptr() + i; + ck->col_desc_id = i; + ck->fragments = &fragments.device_view()[i][f]; ck->stats = (frag_stats.size() != 0) ? frag_stats.data() + i * num_fragments + f : nullptr; ck->start_row = start_row; ck->num_rows = (uint32_t)md.row_groups[global_r].num_rows; @@ -1056,30 +1143,12 @@ void writer::impl::write(table_view const& table) std::accumulate(chunk_fragments.begin(), chunk_fragments.end(), 0, [](uint32_t l, auto r) { return l + r.num_values; }); - ck->dictionary_id = num_dictionaries; - if (col_desc[i].dict_data) { - size_t plain_size = 0; - size_t dict_size = 1; - uint32_t num_dict_vals = 0; - for (uint32_t j = 0; j < fragments_in_chunk && num_dict_vals < 65536; j++) { - plain_size += chunk_fragments[j].fragment_data_size; - dict_size += chunk_fragments[j].dict_data_size + - ((num_dict_vals > 256) ? 
2 : 1) * chunk_fragments[j].non_nulls; - num_dict_vals += chunk_fragments[j].num_dict_vals; - } - if (dict_size < plain_size) { - parquet_columns[i].use_dictionary(true); - dict_enable = true; - num_dictionaries++; - } - } - ck->has_dictionary = dict_enable; + ck->plain_data_size = std::accumulate( + chunk_fragments.begin(), chunk_fragments.end(), 0, [](int sum, gpu::PageFragment frag) { + return sum + frag.fragment_data_size; + }); md.row_groups[global_r].columns[i].meta_data.type = parquet_columns[i].physical_type(); md.row_groups[global_r].columns[i].meta_data.encodings = {Encoding::PLAIN, Encoding::RLE}; - if (dict_enable) { - md.row_groups[global_r].columns[i].meta_data.encodings.push_back( - Encoding::PLAIN_DICTIONARY); - } md.row_groups[global_r].columns[i].meta_data.path_in_schema = parquet_columns[i].get_path_in_schema(); md.row_groups[global_r].columns[i].meta_data.codec = UNCOMPRESSED; @@ -1089,15 +1158,18 @@ void writer::impl::write(table_view const& table) start_row += (uint32_t)md.row_groups[global_r].num_rows; } - // Free unused dictionaries - for (auto& col : parquet_columns) { - col.check_dictionary_used(stream); + auto dict_info_owner = build_chunk_dictionaries(chunks, col_desc, num_rows, stream); + for (uint32_t rg = 0, global_rg = global_rowgroup_base; rg < num_rowgroups; rg++, global_rg++) { + for (int col = 0; col < num_columns; col++) { + if (chunks.host_view()[rg][col].use_dictionary) { + md.row_groups[global_rg].columns[col].meta_data.encodings.push_back( + Encoding::PLAIN_DICTIONARY); + } + } } // Build chunk dictionaries and count pages - if (num_chunks != 0) { - build_chunk_dictionaries(chunks, col_desc, num_columns, num_dictionaries); - } + if (num_chunks != 0) { init_page_sizes(chunks, col_desc, num_columns); } // Initialize batches of rowgroups to encode (mainly to limit peak memory usage) std::vector batch_list; @@ -1247,9 +1319,9 @@ void writer::impl::write(table_view const& table) } md.row_groups[global_r].total_byte_size += ck->compressed_size; md.row_groups[global_r].columns[i].meta_data.data_page_offset = - current_chunk_offset + ((ck->has_dictionary) ? ck->dictionary_size : 0); + current_chunk_offset + ((ck->use_dictionary) ? ck->dictionary_size : 0); md.row_groups[global_r].columns[i].meta_data.dictionary_page_offset = - (ck->has_dictionary) ? current_chunk_offset : 0; + (ck->use_dictionary) ? 
current_chunk_offset : 0; md.row_groups[global_r].columns[i].meta_data.total_uncompressed_size = ck->bfr_size; md.row_groups[global_r].columns[i].meta_data.total_compressed_size = ck->compressed_size; current_chunk_offset += ck->compressed_size; diff --git a/cpp/src/io/parquet/writer_impl.hpp b/cpp/src/io/parquet/writer_impl.hpp index 8d9bdc8adbd..8fb1a8294fb 100644 --- a/cpp/src/io/parquet/writer_impl.hpp +++ b/cpp/src/io/parquet/writer_impl.hpp @@ -153,12 +153,11 @@ class writer::impl { * @param chunks column chunk array * @param col_desc column description array * @param num_columns Total number of columns - * @param num_dictionaries Total number of dictionaries */ - void build_chunk_dictionaries(hostdevice_2dvector& chunks, - device_span col_desc, - uint32_t num_columns, - uint32_t num_dictionaries); + void init_page_sizes(hostdevice_2dvector& chunks, + device_span col_desc, + uint32_t num_columns); + /** * @brief Initialize encoder pages * diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index a6b4978aeab..88297423b9b 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -252,7 +252,7 @@ __device__ __inline__ char const* seek_field_end(char const* begin, bool quotation = false; auto current = begin; bool escape_next = false; - while (true) { + while (current < end) { // Use simple logic to ignore control chars between any quote seq // Handles nominal cases including doublequotes within quotes, but // may not output exact failures as PANDAS for malformed fields. @@ -262,7 +262,7 @@ __device__ __inline__ char const* seek_field_end(char const* begin, quotation = !quotation; } else if (!quotation) { if (*current == opts.delimiter) { - while (opts.multi_delimiter && current < end && *(current + 1) == opts.delimiter) { + while (opts.multi_delimiter && (current + 1 < end) && *(current + 1) == opts.delimiter) { ++current; } break; @@ -283,8 +283,7 @@ __device__ __inline__ char const* seek_field_end(char const* begin, } } - if (current >= end) break; - current++; + if (current < end) { current++; } } return current; } diff --git a/cpp/src/join/conditional_join.cu b/cpp/src/join/conditional_join.cu index c7a1630311b..bfabe99aaf9 100644 --- a/cpp/src/join/conditional_join.cu +++ b/cpp/src/join/conditional_join.cu @@ -13,118 +13,400 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include -#include +#include +#include +#include #include #include +#include #include +#include +#include +#include +#include +#include #include +#include + namespace cudf { namespace detail { std::pair>, std::unique_ptr>> -conditional_join(table_view left, - table_view right, - ast::expression binary_predicate, +conditional_join(table_view const& left, + table_view const& right, + ast::expression const& binary_predicate, null_equality compare_nulls, - join_kind JoinKind, + join_kind join_type, + std::optional output_size, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - CUDF_FUNC_RANGE(); - return get_conditional_join_indices( - left, right, JoinKind, binary_predicate, compare_nulls, stream, mr); + // We can immediately filter out cases where the right table is empty. In + // some cases, we return all the rows of the left table with a corresponding + // null index for the right table; in others, we return an empty output. 
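The empty-right-table fast path described in the comment above returns every left row index paired with a "no match" entry. An illustrative thrust sketch of that trivial result; the -1 sentinel is an assumption for illustration only, since the actual sentinel used by get_trivial_left_join_indices is not shown in this diff:

#include <cudf/types.hpp>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>
#include <thrust/fill.h>
#include <thrust/sequence.h>
#include <utility>

// Every left row index, each paired with a sentinel meaning "no right match".
std::pair<rmm::device_uvector<cudf::size_type>, rmm::device_uvector<cudf::size_type>>
trivial_left_indices(cudf::size_type left_num_rows, rmm::cuda_stream_view stream)
{
  rmm::device_uvector<cudf::size_type> left_idx(left_num_rows, stream);
  rmm::device_uvector<cudf::size_type> right_idx(left_num_rows, stream);
  thrust::sequence(rmm::exec_policy(stream), left_idx.begin(), left_idx.end());
  thrust::fill(rmm::exec_policy(stream), right_idx.begin(), right_idx.end(),
               cudf::size_type{-1});
  return {std::move(left_idx), std::move(right_idx)};
}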
+ if (right.num_rows() == 0) { + switch (join_type) { + // Left, left anti, and full (which are effectively left because we are + // guaranteed that left has more rows than right) all return a all the + // row indices from left with a corresponding NULL from the right. + case join_kind::LEFT_JOIN: + case join_kind::LEFT_ANTI_JOIN: + case join_kind::FULL_JOIN: return get_trivial_left_join_indices(left, stream); + // Inner and left semi joins return empty output because no matches can exist. + case join_kind::INNER_JOIN: + case join_kind::LEFT_SEMI_JOIN: + return std::make_pair(std::make_unique>(0, stream, mr), + std::make_unique>(0, stream, mr)); + } + } + + // Prepare output column. Whether or not the output column is nullable is + // determined by whether any of the columns in the input table are nullable. + // If none of the input columns actually contain nulls, we can still use the + // non-nullable version of the expression evaluation code path for + // performance, so we capture that information as well. + auto const nullable = cudf::nullable(left) || cudf::nullable(right); + auto const has_nulls = nullable && (cudf::has_nulls(left) || cudf::has_nulls(right)); + + auto const parser = + ast::detail::expression_parser{binary_predicate, left, right, has_nulls, stream, mr}; + CUDF_EXPECTS(parser.output_type().id() == type_id::BOOL8, + "The expression must produce a boolean output."); + + auto left_table = table_device_view::create(left, stream); + auto right_table = table_device_view::create(right, stream); + + // Allocate storage for the counter used to get the size of the join output + detail::grid_1d config(left_table->num_rows(), DEFAULT_JOIN_BLOCK_SIZE); + auto const shmem_size_per_block = + parser.device_expression_data.shmem_per_thread * config.num_threads_per_block; + join_kind kernel_join_type = join_type == join_kind::FULL_JOIN ? join_kind::LEFT_JOIN : join_type; + + // If the join size was not provided as an input, compute it here. + std::size_t join_size; + if (output_size.has_value()) { + join_size = *output_size; + } else { + rmm::device_scalar size(0, stream, mr); + CHECK_CUDA(stream.value()); + if (has_nulls) { + compute_conditional_join_output_size + <<>>( + *left_table, + *right_table, + kernel_join_type, + compare_nulls, + parser.device_expression_data, + size.data()); + } else { + compute_conditional_join_output_size + <<>>( + *left_table, + *right_table, + kernel_join_type, + compare_nulls, + parser.device_expression_data, + size.data()); + } + CHECK_CUDA(stream.value()); + join_size = size.value(stream); + } + + // If the output size will be zero, we can return immediately. 
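When the caller does not supply output_size, the kernel launch above performs a counting-only pass so the output buffers can then be allocated exactly; a caller-provided size skips that pass entirely. A plain, serial C++ analogue of the two-pass count-then-fill pattern, for clarity only (on the GPU the fill pass advances a shared write_index with atomics):

#include <cstddef>
#include <utility>
#include <vector>

template <typename Pred>
std::pair<std::vector<int>, std::vector<int>>
two_pass_join(int left_rows, int right_rows, Pred matches)
{
  // Pass 1: size only (the role of compute_conditional_join_output_size).
  std::size_t join_size = 0;
  for (int l = 0; l < left_rows; ++l)
    for (int r = 0; r < right_rows; ++r)
      if (matches(l, r)) ++join_size;

  // Pass 2: allocate exactly join_size entries and fill them.
  std::vector<int> left_idx, right_idx;
  left_idx.reserve(join_size);
  right_idx.reserve(join_size);
  for (int l = 0; l < left_rows; ++l)
    for (int r = 0; r < right_rows; ++r)
      if (matches(l, r)) { left_idx.push_back(l); right_idx.push_back(r); }

  return {std::move(left_idx), std::move(right_idx)};
}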
+ if (join_size == 0) { + return std::make_pair(std::make_unique>(0, stream, mr), + std::make_unique>(0, stream, mr)); + } + + rmm::device_scalar write_index(0, stream); + + auto left_indices = std::make_unique>(join_size, stream, mr); + auto right_indices = std::make_unique>(join_size, stream, mr); + + auto const& join_output_l = left_indices->data(); + auto const& join_output_r = right_indices->data(); + if (has_nulls) { + conditional_join + <<>>( + *left_table, + *right_table, + kernel_join_type, + compare_nulls, + join_output_l, + join_output_r, + write_index.data(), + parser.device_expression_data, + join_size); + } else { + conditional_join + <<>>( + *left_table, + *right_table, + kernel_join_type, + compare_nulls, + join_output_l, + join_output_r, + write_index.data(), + parser.device_expression_data, + join_size); + } + + CHECK_CUDA(stream.value()); + + auto join_indices = std::make_pair(std::move(left_indices), std::move(right_indices)); + + // For full joins, get the indices in the right table that were not joined to + // by any row in the left table. + if (join_type == join_kind::FULL_JOIN) { + auto complement_indices = detail::get_left_join_indices_complement( + join_indices.second, left.num_rows(), right.num_rows(), stream, mr); + join_indices = detail::concatenate_vector_pairs(join_indices, complement_indices, stream); + } + return join_indices; +} + +std::size_t compute_conditional_join_output_size(table_view const& left, + table_view const& right, + ast::expression const& binary_predicate, + null_equality compare_nulls, + join_kind join_type, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + // We can immediately filter out cases where the right table is empty. In + // some cases, we return all the rows of the left table with a corresponding + // null index for the right table; in others, we return an empty output. + if (right.num_rows() == 0) { + switch (join_type) { + // Left, left anti, and full (which are effectively left because we are + // guaranteed that left has more rows than right) all return a all the + // row indices from left with a corresponding NULL from the right. + case join_kind::LEFT_JOIN: + case join_kind::LEFT_ANTI_JOIN: + case join_kind::FULL_JOIN: return left.num_rows(); + // Inner and left semi joins return empty output because no matches can exist. + case join_kind::INNER_JOIN: + case join_kind::LEFT_SEMI_JOIN: return 0; + } + } + + // Prepare output column. Whether or not the output column is nullable is + // determined by whether any of the columns in the input table are nullable. + // If none of the input columns actually contain nulls, we can still use the + // non-nullable version of the expression evaluation code path for + // performance, so we capture that information as well. 
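For callers, the new *_size entry points pair naturally with the optional output_size parameter: compute the size once, then hand it back so the join itself skips its sizing pass. A hedged usage sketch; pred is an already-constructed cudf::ast::expression comparing columns of left and right (its construction is out of scope here), and the defaulted memory-resource arguments are assumed:

#include <cstddef>
#include <cudf/join.hpp>
#include <cudf/table/table_view.hpp>

void run_conditional_inner_join(cudf::table_view const& left,
                                cudf::table_view const& right,
                                cudf::ast::expression const& pred)
{
  std::size_t const join_size =
    cudf::conditional_inner_join_size(left, right, pred, cudf::null_equality::EQUAL);

  auto [left_indices, right_indices] =
    cudf::conditional_inner_join(left, right, pred, cudf::null_equality::EQUAL, join_size);

  // left_indices / right_indices are device vectors of matching row indices.
  (void)left_indices;
  (void)right_indices;
}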
+ auto const nullable = cudf::nullable(left) || cudf::nullable(right); + auto const has_nulls = nullable && (cudf::has_nulls(left) || cudf::has_nulls(right)); + + auto const parser = + ast::detail::expression_parser{binary_predicate, left, right, has_nulls, stream, mr}; + CUDF_EXPECTS(parser.output_type().id() == type_id::BOOL8, + "The expression must produce a boolean output."); + + auto left_table = table_device_view::create(left, stream); + auto right_table = table_device_view::create(right, stream); + + // Allocate storage for the counter used to get the size of the join output + rmm::device_scalar size(0, stream, mr); + CHECK_CUDA(stream.value()); + detail::grid_1d config(left_table->num_rows(), DEFAULT_JOIN_BLOCK_SIZE); + auto const shmem_size_per_block = + parser.device_expression_data.shmem_per_thread * config.num_threads_per_block; + + // Determine number of output rows without actually building the output to simply + // find what the size of the output will be. + assert(join_type != join_kind::FULL_JOIN); + if (has_nulls) { + compute_conditional_join_output_size + <<>>( + *left_table, + *right_table, + join_type, + compare_nulls, + parser.device_expression_data, + size.data()); + } else { + compute_conditional_join_output_size + <<>>( + *left_table, + *right_table, + join_type, + compare_nulls, + parser.device_expression_data, + size.data()); + } + CHECK_CUDA(stream.value()); + + return size.value(stream); } } // namespace detail std::pair>, std::unique_ptr>> -conditional_inner_join(table_view left, - table_view right, - ast::expression binary_predicate, +conditional_inner_join(table_view const& left, + table_view const& right, + ast::expression const& binary_predicate, null_equality compare_nulls, + std::optional output_size, rmm::mr::device_memory_resource* mr) { + CUDF_FUNC_RANGE(); return detail::conditional_join(left, right, binary_predicate, compare_nulls, detail::join_kind::INNER_JOIN, + output_size, rmm::cuda_stream_default, mr); } std::pair>, std::unique_ptr>> -conditional_left_join(table_view left, - table_view right, - ast::expression binary_predicate, +conditional_left_join(table_view const& left, + table_view const& right, + ast::expression const& binary_predicate, null_equality compare_nulls, + std::optional output_size, rmm::mr::device_memory_resource* mr) { + CUDF_FUNC_RANGE(); return detail::conditional_join(left, right, binary_predicate, compare_nulls, detail::join_kind::LEFT_JOIN, + output_size, rmm::cuda_stream_default, mr); } std::pair>, std::unique_ptr>> -conditional_full_join(table_view left, - table_view right, - ast::expression binary_predicate, +conditional_full_join(table_view const& left, + table_view const& right, + ast::expression const& binary_predicate, null_equality compare_nulls, rmm::mr::device_memory_resource* mr) { + CUDF_FUNC_RANGE(); return detail::conditional_join(left, right, binary_predicate, compare_nulls, detail::join_kind::FULL_JOIN, + {}, rmm::cuda_stream_default, mr); } std::unique_ptr> conditional_left_semi_join( - table_view left, - table_view right, - ast::expression binary_predicate, + table_view const& left, + table_view const& right, + ast::expression const& binary_predicate, null_equality compare_nulls, + std::optional output_size, rmm::mr::device_memory_resource* mr) { + CUDF_FUNC_RANGE(); return std::move(detail::conditional_join(left, right, binary_predicate, compare_nulls, detail::join_kind::LEFT_SEMI_JOIN, + output_size, rmm::cuda_stream_default, mr) .first); } std::unique_ptr> conditional_left_anti_join( - table_view 
left, - table_view right, - ast::expression binary_predicate, + table_view const& left, + table_view const& right, + ast::expression const& binary_predicate, null_equality compare_nulls, + std::optional output_size, rmm::mr::device_memory_resource* mr) { + CUDF_FUNC_RANGE(); return std::move(detail::conditional_join(left, right, binary_predicate, compare_nulls, detail::join_kind::LEFT_ANTI_JOIN, + output_size, rmm::cuda_stream_default, mr) .first); } +std::size_t conditional_inner_join_size(table_view const& left, + table_view const& right, + ast::expression const& binary_predicate, + null_equality compare_nulls, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::compute_conditional_join_output_size(left, + right, + binary_predicate, + compare_nulls, + detail::join_kind::INNER_JOIN, + rmm::cuda_stream_default, + mr); +} + +std::size_t conditional_left_join_size(table_view const& left, + table_view const& right, + ast::expression const& binary_predicate, + null_equality compare_nulls, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::compute_conditional_join_output_size(left, + right, + binary_predicate, + compare_nulls, + detail::join_kind::LEFT_JOIN, + rmm::cuda_stream_default, + mr); +} + +std::size_t conditional_left_semi_join_size(table_view const& left, + table_view const& right, + ast::expression const& binary_predicate, + null_equality compare_nulls, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return std::move(detail::compute_conditional_join_output_size(left, + right, + binary_predicate, + compare_nulls, + detail::join_kind::LEFT_SEMI_JOIN, + rmm::cuda_stream_default, + mr)); +} + +std::size_t conditional_left_anti_join_size(table_view const& left, + table_view const& right, + ast::expression const& binary_predicate, + null_equality compare_nulls, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return std::move(detail::compute_conditional_join_output_size(left, + right, + binary_predicate, + compare_nulls, + detail::join_kind::LEFT_ANTI_JOIN, + rmm::cuda_stream_default, + mr)); +} + } // namespace cudf diff --git a/cpp/src/join/conditional_join.cuh b/cpp/src/join/conditional_join.cuh deleted file mode 100644 index 4602b7fefaa..00000000000 --- a/cpp/src/join/conditional_join.cuh +++ /dev/null @@ -1,171 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
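The conditional_*_join_size entry points introduced above let a caller pay for the sizing kernel once and reuse the result, since the join functions now take the size as an optional argument. A usage sketch under the signatures shown in this diff (the sized_inner_join helper is hypothetical; error handling and stream selection are omitted):

#include <cudf/join.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/types.hpp>

#include <rmm/device_uvector.hpp>

#include <memory>
#include <utility>

// Hypothetical helper: size the conditional inner join once, then reuse that size when
// materializing the gather maps so the sizing kernel is not launched a second time.
std::pair<std::unique_ptr<rmm::device_uvector<cudf::size_type>>,
          std::unique_ptr<rmm::device_uvector<cudf::size_type>>>
sized_inner_join(cudf::table_view const& left,
                 cudf::table_view const& right,
                 cudf::ast::expression const& predicate)  // AST type as declared via cudf's join header
{
  // First pass: run only the sizing kernel.
  auto const join_size =
    cudf::conditional_inner_join_size(left, right, predicate, cudf::null_equality::EQUAL);

  // Second pass: materialize the gather maps; passing join_size skips a second sizing pass.
  return cudf::conditional_inner_join(
    left, right, predicate, cudf::null_equality::EQUAL, join_size);
}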
- */ -#pragma once - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -namespace cudf { -namespace detail { - -/** - * @brief Computes the join operation between two tables and returns the - * output indices of left and right table as a combined table - * - * @param left Table of left columns to join - * @param right Table of right columns to join - * tables have been flipped, meaning the output indices should also be flipped - * @param JoinKind The type of join to be performed - * @param compare_nulls Controls whether null join-key values should match or not. - * @param stream CUDA stream used for device memory operations and kernel launches - * - * @return Join output indices vector pair - */ -std::pair>, - std::unique_ptr>> -get_conditional_join_indices(table_view const& left, - table_view const& right, - join_kind JoinKind, - ast::expression binary_pred, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - // We can immediately filter out cases where the right table is empty. In - // some cases, we return all the rows of the left table with a corresponding - // null index for the right table; in others, we return an empty output. - if (right.num_rows() == 0) { - switch (JoinKind) { - // Left, left anti, and full (which are effectively left because we are - // guaranteed that left has more rows than right) all return a all the - // row indices from left with a corresponding NULL from the right. - case join_kind::LEFT_JOIN: - case join_kind::LEFT_ANTI_JOIN: - case join_kind::FULL_JOIN: return get_trivial_left_join_indices(left, stream); - // Inner and left semi joins return empty output because no matches can exist. - case join_kind::INNER_JOIN: - case join_kind::LEFT_SEMI_JOIN: - return std::make_pair(std::make_unique>(0, stream, mr), - std::make_unique>(0, stream, mr)); - } - } - - // Prepare output column. Whether or not the output column is nullable is - // determined by whether any of the columns in the input table are nullable. - // If none of the input columns actually contain nulls, we can still use the - // non-nullable version of the expression evaluation code path for - // performance, so we capture that information as well. - auto const nullable = cudf::nullable(left) || cudf::nullable(right); - auto const has_nulls = nullable && (cudf::has_nulls(left) || cudf::has_nulls(right)); - - auto const plan = ast::detail::ast_plan{binary_pred, left, right, has_nulls, stream, mr}; - CUDF_EXPECTS(plan.output_type().id() == type_id::BOOL8, - "The expression must produce a boolean output."); - - auto left_table = table_device_view::create(left, stream); - auto right_table = table_device_view::create(right, stream); - - // Allocate storage for the counter used to get the size of the join output - rmm::device_scalar size(0, stream, mr); - CHECK_CUDA(stream.value()); - constexpr int block_size{DEFAULT_JOIN_BLOCK_SIZE}; - detail::grid_1d config(left_table->num_rows(), block_size); - auto const shmem_size_per_block = plan.dev_plan.shmem_per_thread * config.num_threads_per_block; - - // Determine number of output rows without actually building the output to simply - // find what the size of the output will be. - join_kind KernelJoinKind = JoinKind == join_kind::FULL_JOIN ? 
join_kind::LEFT_JOIN : JoinKind; - if (has_nulls) { - compute_conditional_join_output_size - <<>>( - *left_table, *right_table, KernelJoinKind, compare_nulls, plan.dev_plan, size.data()); - } else { - compute_conditional_join_output_size - <<>>( - *left_table, *right_table, KernelJoinKind, compare_nulls, plan.dev_plan, size.data()); - } - CHECK_CUDA(stream.value()); - - size_type const join_size = size.value(stream); - - // If the output size will be zero, we can return immediately. - if (join_size == 0) { - return std::make_pair(std::make_unique>(0, stream, mr), - std::make_unique>(0, stream, mr)); - } - - rmm::device_scalar write_index(0, stream); - - auto left_indices = std::make_unique>(join_size, stream, mr); - auto right_indices = std::make_unique>(join_size, stream, mr); - - const auto& join_output_l = left_indices->data(); - const auto& join_output_r = right_indices->data(); - if (has_nulls) { - conditional_join - <<>>( - *left_table, - *right_table, - KernelJoinKind, - compare_nulls, - join_output_l, - join_output_r, - write_index.data(), - plan.dev_plan, - join_size); - } else { - conditional_join - <<>>( - *left_table, - *right_table, - KernelJoinKind, - compare_nulls, - join_output_l, - join_output_r, - write_index.data(), - plan.dev_plan, - join_size); - } - - CHECK_CUDA(stream.value()); - - auto join_indices = std::make_pair(std::move(left_indices), std::move(right_indices)); - - // For full joins, get the indices in the right table that were not joined to - // by any row in the left table. - if (JoinKind == join_kind::FULL_JOIN) { - auto complement_indices = detail::get_left_join_indices_complement( - join_indices.second, left.num_rows(), right.num_rows(), stream, mr); - join_indices = detail::concatenate_vector_pairs(join_indices, complement_indices, stream); - } - return join_indices; -} - -} // namespace detail - -} // namespace cudf diff --git a/cpp/src/join/conditional_join.hpp b/cpp/src/join/conditional_join.hpp new file mode 100644 index 00000000000..5a3fe887838 --- /dev/null +++ b/cpp/src/join/conditional_join.hpp @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "join_common_utils.hpp" + +#include +#include + +#include +#include + +#include + +namespace cudf { +namespace detail { + +/** + * @brief Computes the join operation between two tables and returns the + * output indices of left and right table as a combined table + * + * @param left Table of left columns to join + * @param right Table of right columns to join + * tables have been flipped, meaning the output indices should also be flipped + * @param JoinKind The type of join to be performed + * @param compare_nulls Controls whether null join-key values should match or not. 
+ * @param stream CUDA stream used for device memory operations and kernel launches + * + * @return Join output indices vector pair + */ +std::pair>, + std::unique_ptr>> +conditional_join(table_view const& left, + table_view const& right, + ast::expression const& binary_predicate, + null_equality compare_nulls, + join_kind JoinKind, + std::optional output_size = {}, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Computes the size of a join operation between two tables without + * materializing the result and returns the total size value. + * + * @param left Table of left columns to join + * @param right Table of right columns to join + * tables have been flipped, meaning the output indices should also be flipped + * @param JoinKind The type of join to be performed + * @param compare_nulls Controls whether null join-key values should match or not. + * @param stream CUDA stream used for device memory operations and kernel launches + * + * @return Join output indices vector pair + */ +std::size_t compute_conditional_join_output_size( + table_view const& left, + table_view const& right, + ast::expression const& binary_predicate, + null_equality compare_nulls, + join_kind JoinKind, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +} // namespace detail +} // namespace cudf diff --git a/cpp/src/join/conditional_join_kernels.cuh b/cpp/src/join/conditional_join_kernels.cuh index 3d34a49c5af..9fcc7bf5cfb 100644 --- a/cpp/src/join/conditional_join_kernels.cuh +++ b/cpp/src/join/conditional_join_kernels.cuh @@ -19,13 +19,13 @@ #include #include -#include -#include -#include -#include +#include +#include #include #include +#include + namespace cudf { namespace detail { @@ -40,18 +40,20 @@ namespace detail { * * @param[in] left_table The left table * @param[in] right_table The right table - * @param[in] JoinKind The type of join to be performed + * @param[in] join_type The type of join to be performed * @param[in] compare_nulls Controls whether null join-key values should match or not. - * @param[in] plan Container of device data required to evaluate the desired expression. + * @param[in] device_expression_data Container of device data required to evaluate the desired + * expression. * @param[out] output_size The resulting output size */ template -__global__ void compute_conditional_join_output_size(table_device_view left_table, - table_device_view right_table, - join_kind JoinKind, - null_equality compare_nulls, - ast::detail::device_ast_plan plan, - cudf::size_type* output_size) +__global__ void compute_conditional_join_output_size( + table_device_view left_table, + table_device_view right_table, + join_kind join_type, + null_equality compare_nulls, + ast::detail::expression_device_view device_expression_data, + std::size_t* output_size) { // The (required) extern storage of the shared memory array leads to // conflicting declarations between different templates. 
The easiest @@ -60,16 +62,17 @@ __global__ void compute_conditional_join_output_size(table_device_view left_tabl extern __shared__ char raw_intermediate_storage[]; cudf::ast::detail::IntermediateDataType* intermediate_storage = reinterpret_cast*>(raw_intermediate_storage); - auto thread_intermediate_storage = &intermediate_storage[threadIdx.x * plan.num_intermediates]; + auto thread_intermediate_storage = + &intermediate_storage[threadIdx.x * device_expression_data.num_intermediates]; - cudf::size_type thread_counter(0); - const cudf::size_type left_start_idx = threadIdx.x + blockIdx.x * blockDim.x; - const cudf::size_type left_stride = blockDim.x * gridDim.x; - const cudf::size_type left_num_rows = left_table.num_rows(); - const cudf::size_type right_num_rows = right_table.num_rows(); + std::size_t thread_counter{0}; + cudf::size_type const left_start_idx = threadIdx.x + blockIdx.x * blockDim.x; + cudf::size_type const left_stride = blockDim.x * gridDim.x; + cudf::size_type const left_num_rows = left_table.num_rows(); + cudf::size_type const right_num_rows = right_table.num_rows(); auto evaluator = cudf::ast::detail::expression_evaluator( - left_table, right_table, plan, thread_intermediate_storage, compare_nulls); + left_table, right_table, device_expression_data, thread_intermediate_storage, compare_nulls); for (cudf::size_type left_row_index = left_start_idx; left_row_index < left_num_rows; left_row_index += left_stride) { @@ -78,15 +81,15 @@ __global__ void compute_conditional_join_output_size(table_device_view left_tabl auto output_dest = cudf::ast::detail::value_expression_result(); evaluator.evaluate(output_dest, left_row_index, right_row_index, 0); if (output_dest.is_valid() && output_dest.value()) { - if ((JoinKind != join_kind::LEFT_ANTI_JOIN) && - !(JoinKind == join_kind::LEFT_SEMI_JOIN && found_match)) { + if ((join_type != join_kind::LEFT_ANTI_JOIN) && + !(join_type == join_kind::LEFT_SEMI_JOIN && found_match)) { ++thread_counter; } found_match = true; } } - if ((JoinKind == join_kind::LEFT_JOIN || JoinKind == join_kind::LEFT_ANTI_JOIN || - JoinKind == join_kind::FULL_JOIN) && + if ((join_type == join_kind::LEFT_JOIN || join_type == join_kind::LEFT_ANTI_JOIN || + join_type == join_kind::FULL_JOIN) && (!found_match)) { ++thread_counter; } @@ -94,7 +97,7 @@ __global__ void compute_conditional_join_output_size(table_device_view left_tabl using BlockReduce = cub::BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; - cudf::size_type block_counter = BlockReduce(temp_storage).Sum(thread_counter); + std::size_t block_counter = BlockReduce(temp_storage).Sum(thread_counter); // Add block counter to global counter if (threadIdx.x == 0) atomicAdd(output_size, block_counter); @@ -112,25 +115,26 @@ __global__ void compute_conditional_join_output_size(table_device_view left_tabl * * @param[in] left_table The left table * @param[in] right_table The right table - * @param[in] JoinKind The type of join to be performed + * @param[in] join_type The type of join to be performed * @param compare_nulls Controls whether null join-key values should match or not. * @param[out] join_output_l The left result of the join operation * @param[out] join_output_r The right result of the join operation * @param[in,out] current_idx A global counter used by threads to coordinate * writes to the global output - * @param plan Container of device data required to evaluate the desired expression. 
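The per-thread tally in the size kernel above encodes the sizing rule for each join type: every accepted pair counts for inner and left joins, a left row with no matches still contributes one pair for left and anti joins, a semi join counts each left row at most once, and an anti join counts only unmatched left rows (full joins are sized as left joins and completed later with the right-side complement). A small host-side cross-check of those rules, with a hypothetical helper name:

#include <algorithm>
#include <cstddef>
#include <vector>

// match_counts[i] = number of right rows the predicate accepts for left row i.
// E.g. {2, 0, 1}: inner -> 3, left -> 4 (the unmatched row emits one NULL pair),
// left semi -> 2 (at most one hit per left row), left anti -> 1 (unmatched rows only).
struct join_sizes {
  std::size_t inner, left, semi, anti;
};

join_sizes expected_sizes(std::vector<std::size_t> const& match_counts)
{
  join_sizes s{0, 0, 0, 0};
  for (auto const m : match_counts) {
    s.inner += m;
    s.left += std::max<std::size_t>(m, 1);
    s.semi += (m > 0) ? 1 : 0;
    s.anti += (m == 0) ? 1 : 0;
  }
  return s;
}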
+ * @param device_expression_data Container of device data required to evaluate the desired + * expression. * @param[in] max_size The maximum size of the output */ template __global__ void conditional_join(table_device_view left_table, table_device_view right_table, - join_kind JoinKind, + join_kind join_type, null_equality compare_nulls, cudf::size_type* join_output_l, cudf::size_type* join_output_r, cudf::size_type* current_idx, - cudf::ast::detail::device_ast_plan plan, - const cudf::size_type max_size) + cudf::ast::detail::expression_device_view device_expression_data, + cudf::size_type const max_size) { constexpr int num_warps = block_size / detail::warp_size; __shared__ cudf::size_type current_idx_shared[num_warps]; @@ -144,12 +148,13 @@ __global__ void conditional_join(table_device_view left_table, extern __shared__ char raw_intermediate_storage[]; cudf::ast::detail::IntermediateDataType* intermediate_storage = reinterpret_cast*>(raw_intermediate_storage); - auto thread_intermediate_storage = &intermediate_storage[threadIdx.x * plan.num_intermediates]; + auto thread_intermediate_storage = + &intermediate_storage[threadIdx.x * device_expression_data.num_intermediates]; - const int warp_id = threadIdx.x / detail::warp_size; - const int lane_id = threadIdx.x % detail::warp_size; - const cudf::size_type left_num_rows = left_table.num_rows(); - const cudf::size_type right_num_rows = right_table.num_rows(); + int const warp_id = threadIdx.x / detail::warp_size; + int const lane_id = threadIdx.x % detail::warp_size; + cudf::size_type const left_num_rows = left_table.num_rows(); + cudf::size_type const right_num_rows = right_table.num_rows(); if (0 == lane_id) { current_idx_shared[warp_id] = 0; } @@ -157,10 +162,10 @@ __global__ void conditional_join(table_device_view left_table, cudf::size_type left_row_index = threadIdx.x + blockIdx.x * blockDim.x; - const unsigned int activemask = __ballot_sync(0xffffffff, left_row_index < left_num_rows); + unsigned int const activemask = __ballot_sync(0xffffffff, left_row_index < left_num_rows); auto evaluator = cudf::ast::detail::expression_evaluator( - left_table, right_table, plan, thread_intermediate_storage, compare_nulls); + left_table, right_table, device_expression_data, thread_intermediate_storage, compare_nulls); if (left_row_index < left_num_rows) { bool found_match = false; @@ -176,8 +181,8 @@ __global__ void conditional_join(table_device_view left_table, // that the current logic relies on the fact that we process all right // table rows for a single left table row on a single thread so that no // synchronization of found_match is required). - if ((JoinKind != join_kind::LEFT_ANTI_JOIN) && - !(JoinKind == join_kind::LEFT_SEMI_JOIN && found_match)) { + if ((join_type != join_kind::LEFT_ANTI_JOIN) && + !(join_type == join_kind::LEFT_SEMI_JOIN && found_match)) { add_pair_to_cache(left_row_index, right_row_index, current_idx_shared, @@ -209,8 +214,8 @@ __global__ void conditional_join(table_device_view left_table, // Left, left anti, and full joins all require saving left columns that // aren't present in the right. 
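Note how the conditional_join kernel above takes the warp's active mask with __ballot_sync before entering the divergent per-row work, so that later warp-synchronous steps (such as flushing the shared-memory output cache) involve only lanes that actually hold a row. The idiom in isolation (illustrative CUDA, not the cudf kernel):

__global__ void active_mask_example(int const* data, int num_rows, int* out)
{
  int const row = threadIdx.x + blockIdx.x * blockDim.x;

  // Vote *before* diverging: every lane in the warp participates, so the mask is well defined.
  unsigned int const activemask = __ballot_sync(0xffffffff, row < num_rows);

  if (row < num_rows) {
    out[row] = data[row] * 2;  // stand-in for the per-row join work
    __syncwarp(activemask);    // synchronize only the lanes that entered this branch
  }
}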
- if ((JoinKind == join_kind::LEFT_JOIN || JoinKind == join_kind::LEFT_ANTI_JOIN || - JoinKind == join_kind::FULL_JOIN) && + if ((join_type == join_kind::LEFT_JOIN || join_type == join_kind::LEFT_ANTI_JOIN || + join_type == join_kind::FULL_JOIN) && (!found_match)) { add_pair_to_cache(left_row_index, static_cast(JoinNoneValue), diff --git a/cpp/src/join/join.cu b/cpp/src/join/join.cu index 526edbf6903..740431b8563 100644 --- a/cpp/src/join/join.cu +++ b/cpp/src/join/join.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/join/join_common_utils.cuh b/cpp/src/join/join_common_utils.cuh index 2b1c870bea1..d5c23b1d612 100644 --- a/cpp/src/join/join_common_utils.cuh +++ b/cpp/src/join/join_common_utils.cuh @@ -21,9 +21,7 @@ #include #include -#include -#include #include namespace cudf { @@ -31,7 +29,9 @@ namespace detail { /** * @brief Computes the trivial left join operation for the case when the - * right table is empty. In this case all the valid indices of the left table + * right table is empty. + * + * In this case all the valid indices of the left table * are returned with their corresponding right indices being set to * JoinNoneValue, i.e. -1. * @@ -41,21 +41,12 @@ namespace detail { * * @return Join output indices vector pair */ -inline std::pair>, - std::unique_ptr>> +std::pair>, + std::unique_ptr>> get_trivial_left_join_indices( table_view const& left, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) -{ - auto left_indices = std::make_unique>(left.num_rows(), stream, mr); - thrust::sequence(rmm::exec_policy(stream), left_indices->begin(), left_indices->end(), 0); - auto right_indices = - std::make_unique>(left.num_rows(), stream, mr); - thrust::uninitialized_fill( - rmm::exec_policy(stream), right_indices->begin(), right_indices->end(), JoinNoneValue); - return std::make_pair(std::move(left_indices), std::move(right_indices)); -} + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); // Convenient alias for a pair of unique pointers to device uvectors. using VectorPair = std::pair>, @@ -83,47 +74,11 @@ using VectorPair = std::pair>, * * @return A pair of vectors containing the concatenated output. */ -inline VectorPair concatenate_vector_pairs(VectorPair& a, - VectorPair& b, - rmm::cuda_stream_view stream) -{ - CUDF_EXPECTS((a.first->size() == a.second->size()), - "Mismatch between sizes of vectors in vector pair"); - CUDF_EXPECTS((b.first->size() == b.second->size()), - "Mismatch between sizes of vectors in vector pair"); - if (a.first->is_empty()) { - return std::move(b); - } else if (b.first->is_empty()) { - return std::move(a); - } - auto original_size = a.first->size(); - a.first->resize(a.first->size() + b.first->size(), stream); - a.second->resize(a.second->size() + b.second->size(), stream); - thrust::copy( - rmm::exec_policy(stream), b.first->begin(), b.first->end(), a.first->begin() + original_size); - thrust::copy(rmm::exec_policy(stream), - b.second->begin(), - b.second->end(), - a.second->begin() + original_size); - return std::move(a); -} - -/** - * @brief Device functor to determine if an index is contained in a range. 
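get_trivial_left_join_indices, reduced above to a declaration (its definition moves to join_utils.cu later in this diff), builds the gather maps for a left-style join against an empty right table: the left map is simply 0..num_rows-1 and the right map is all JoinNoneValue. A plain Thrust sketch of that result, using thrust::device_vector instead of rmm::device_uvector:

#include <thrust/device_vector.h>
#include <thrust/fill.h>
#include <thrust/sequence.h>

void trivial_left_join_indices(int num_left_rows)
{
  thrust::device_vector<int> left_indices(num_left_rows);
  thrust::device_vector<int> right_indices(num_left_rows);

  thrust::sequence(left_indices.begin(), left_indices.end());    // 0, 1, 2, ..., n-1
  thrust::fill(right_indices.begin(), right_indices.end(), -1);  // JoinNoneValue for every row
}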
- */ -template -struct valid_range { - T start, stop; - __host__ __device__ valid_range(const T begin, const T end) : start(begin), stop(end) {} - - __host__ __device__ __forceinline__ bool operator()(const T index) - { - return ((index >= start) && (index < stop)); - } -}; +VectorPair concatenate_vector_pairs(VectorPair& a, VectorPair& b, rmm::cuda_stream_view stream); /** * @brief Creates a table containing the complement of left join indices. + * * This table has two columns. The first one is filled with JoinNoneValue(-1) * and the second one contains values from 0 to right_table_row_count - 1 * excluding those found in the right_indices column. @@ -136,72 +91,27 @@ struct valid_range { * * @return Pair of vectors containing the left join indices complement */ -inline std::pair>, - std::unique_ptr>> +std::pair>, + std::unique_ptr>> get_left_join_indices_complement(std::unique_ptr>& right_indices, size_type left_table_row_count, size_type right_table_row_count, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - // Get array of indices that do not appear in right_indices - - // Vector allocated for unmatched result - auto right_indices_complement = - std::make_unique>(right_table_row_count, stream); - - // If left table is empty in a full join call then all rows of the right table - // should be represented in the joined indices. This is an optimization since - // if left table is empty and full join is called all the elements in - // right_indices will be JoinNoneValue, i.e. -1. This if path should - // produce exactly the same result as the else path but will be faster. - if (left_table_row_count == 0) { - thrust::sequence(rmm::exec_policy(stream), - right_indices_complement->begin(), - right_indices_complement->end(), - 0); - } else { - // Assume all the indices in invalid_index_map are invalid - auto invalid_index_map = - std::make_unique>(right_table_row_count, stream); - thrust::uninitialized_fill( - rmm::exec_policy(stream), invalid_index_map->begin(), invalid_index_map->end(), int32_t{1}); - - // Functor to check for index validity since left joins can create invalid indices - valid_range valid(0, right_table_row_count); + rmm::mr::device_memory_resource* mr); - // invalid_index_map[index_ptr[i]] = 0 for i = 0 to right_table_row_count - // Thus specifying that those locations are valid - thrust::scatter_if(rmm::exec_policy(stream), - thrust::make_constant_iterator(0), - thrust::make_constant_iterator(0) + right_indices->size(), - right_indices->begin(), // Index locations - right_indices->begin(), // Stencil - Check if index location is valid - invalid_index_map->begin(), // Output indices - valid); // Stencil Predicate - size_type begin_counter = static_cast(0); - size_type end_counter = static_cast(right_table_row_count); +/** + * @brief Device functor to determine if an index is contained in a range. 
+ */ +template +struct valid_range { + T start, stop; + __host__ __device__ valid_range(const T begin, const T end) : start(begin), stop(end) {} - // Create list of indices that have been marked as invalid - size_type indices_count = thrust::copy_if(rmm::exec_policy(stream), - thrust::make_counting_iterator(begin_counter), - thrust::make_counting_iterator(end_counter), - invalid_index_map->begin(), - right_indices_complement->begin(), - thrust::identity()) - - right_indices_complement->begin(); - right_indices_complement->resize(indices_count, stream); + __host__ __device__ __forceinline__ bool operator()(const T index) + { + return ((index >= start) && (index < stop)); } - - auto left_invalid_indices = - std::make_unique>(right_indices_complement->size(), stream); - thrust::uninitialized_fill(rmm::exec_policy(stream), - left_invalid_indices->begin(), - left_invalid_indices->end(), - JoinNoneValue); - - return std::make_pair(std::move(left_invalid_indices), std::move(right_indices_complement)); -} +}; /** * @brief Adds a pair of indices to the shared memory cache diff --git a/cpp/src/join/join_common_utils.hpp b/cpp/src/join/join_common_utils.hpp index d2337e28ed4..d2541b006a7 100644 --- a/cpp/src/join/join_common_utils.hpp +++ b/cpp/src/join/join_common_utils.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,8 +19,6 @@ #include #include -#include - #include #include @@ -49,26 +47,7 @@ using row_equality = cudf::row_equality_comparator; enum class join_kind { INNER_JOIN, LEFT_JOIN, FULL_JOIN, LEFT_SEMI_JOIN, LEFT_ANTI_JOIN }; -inline bool is_trivial_join(table_view const& left, table_view const& right, join_kind join_type) -{ - // If there is nothing to join, then send empty table with all columns - if (left.is_empty() || right.is_empty()) { return true; } - - // If left join and the left table is empty, return immediately - if ((join_kind::LEFT_JOIN == join_type) && (0 == left.num_rows())) { return true; } - - // If Inner Join and either table is empty, return immediately - if ((join_kind::INNER_JOIN == join_type) && ((0 == left.num_rows()) || (0 == right.num_rows()))) { - return true; - } - - // If left semi join (contains) and right table is empty, - // return immediately - if ((join_kind::LEFT_SEMI_JOIN == join_type) && (0 == right.num_rows())) { return true; } - - return false; -} +bool is_trivial_join(table_view const& left, table_view const& right, join_kind join_type); } // namespace detail - } // namespace cudf diff --git a/cpp/src/join/join_utils.cu b/cpp/src/join/join_utils.cu new file mode 100644 index 00000000000..4aca4b4a9cf --- /dev/null +++ b/cpp/src/join/join_utils.cu @@ -0,0 +1,155 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include + +#include +#include +#include +#include + +namespace cudf { +namespace detail { + +bool is_trivial_join(table_view const& left, table_view const& right, join_kind join_type) +{ + // If there is nothing to join, then send empty table with all columns + if (left.is_empty() || right.is_empty()) { return true; } + + // If left join and the left table is empty, return immediately + if ((join_kind::LEFT_JOIN == join_type) && (0 == left.num_rows())) { return true; } + + // If Inner Join and either table is empty, return immediately + if ((join_kind::INNER_JOIN == join_type) && ((0 == left.num_rows()) || (0 == right.num_rows()))) { + return true; + } + + // If left semi join (contains) and right table is empty, + // return immediately + if ((join_kind::LEFT_SEMI_JOIN == join_type) && (0 == right.num_rows())) { return true; } + + return false; +} + +std::pair>, + std::unique_ptr>> +get_trivial_left_join_indices(table_view const& left, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto left_indices = std::make_unique>(left.num_rows(), stream, mr); + thrust::sequence(rmm::exec_policy(stream), left_indices->begin(), left_indices->end(), 0); + auto right_indices = + std::make_unique>(left.num_rows(), stream, mr); + thrust::uninitialized_fill( + rmm::exec_policy(stream), right_indices->begin(), right_indices->end(), JoinNoneValue); + return std::make_pair(std::move(left_indices), std::move(right_indices)); +} + +VectorPair concatenate_vector_pairs(VectorPair& a, VectorPair& b, rmm::cuda_stream_view stream) +{ + CUDF_EXPECTS((a.first->size() == a.second->size()), + "Mismatch between sizes of vectors in vector pair"); + CUDF_EXPECTS((b.first->size() == b.second->size()), + "Mismatch between sizes of vectors in vector pair"); + if (a.first->is_empty()) { + return std::move(b); + } else if (b.first->is_empty()) { + return std::move(a); + } + auto original_size = a.first->size(); + a.first->resize(a.first->size() + b.first->size(), stream); + a.second->resize(a.second->size() + b.second->size(), stream); + thrust::copy( + rmm::exec_policy(stream), b.first->begin(), b.first->end(), a.first->begin() + original_size); + thrust::copy(rmm::exec_policy(stream), + b.second->begin(), + b.second->end(), + a.second->begin() + original_size); + return std::move(a); +} + +std::pair>, + std::unique_ptr>> +get_left_join_indices_complement(std::unique_ptr>& right_indices, + size_type left_table_row_count, + size_type right_table_row_count, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + // Get array of indices that do not appear in right_indices + + // Vector allocated for unmatched result + auto right_indices_complement = + std::make_unique>(right_table_row_count, stream); + + // If left table is empty in a full join call then all rows of the right table + // should be represented in the joined indices. This is an optimization since + // if left table is empty and full join is called all the elements in + // right_indices will be JoinNoneValue, i.e. -1. This if path should + // produce exactly the same result as the else path but will be faster. 
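The branch that follows implements the complement on the device with scatter_if and copy_if; as a plain host-side illustration of what get_left_join_indices_complement ultimately produces (hypothetical helper, std containers only):

#include <unordered_set>
#include <utility>
#include <vector>

// For each right-table row index that never appears in right_indices, emit the pair
// (JoinNoneValue, row). E.g. right_indices = {1, 3, -1} with 5 right rows gives
// left = {-1, -1, -1} and right = {0, 2, 4}.
std::pair<std::vector<int>, std::vector<int>> left_join_indices_complement(
  std::vector<int> const& right_indices, int right_table_row_count)
{
  std::unordered_set<int> const matched(right_indices.begin(), right_indices.end());
  std::vector<int> left_out;
  std::vector<int> right_out;
  for (int row = 0; row < right_table_row_count; ++row) {
    if (matched.count(row) == 0) {
      left_out.push_back(-1);  // JoinNoneValue
      right_out.push_back(row);
    }
  }
  return {left_out, right_out};
}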
+ if (left_table_row_count == 0) { + thrust::sequence(rmm::exec_policy(stream), + right_indices_complement->begin(), + right_indices_complement->end(), + 0); + } else { + // Assume all the indices in invalid_index_map are invalid + auto invalid_index_map = + std::make_unique>(right_table_row_count, stream); + thrust::uninitialized_fill( + rmm::exec_policy(stream), invalid_index_map->begin(), invalid_index_map->end(), int32_t{1}); + + // Functor to check for index validity since left joins can create invalid indices + valid_range valid(0, right_table_row_count); + + // invalid_index_map[index_ptr[i]] = 0 for i = 0 to right_table_row_count + // Thus specifying that those locations are valid + thrust::scatter_if(rmm::exec_policy(stream), + thrust::make_constant_iterator(0), + thrust::make_constant_iterator(0) + right_indices->size(), + right_indices->begin(), // Index locations + right_indices->begin(), // Stencil - Check if index location is valid + invalid_index_map->begin(), // Output indices + valid); // Stencil Predicate + size_type begin_counter = static_cast(0); + size_type end_counter = static_cast(right_table_row_count); + + // Create list of indices that have been marked as invalid + size_type indices_count = thrust::copy_if(rmm::exec_policy(stream), + thrust::make_counting_iterator(begin_counter), + thrust::make_counting_iterator(end_counter), + invalid_index_map->begin(), + right_indices_complement->begin(), + thrust::identity()) - + right_indices_complement->begin(); + right_indices_complement->resize(indices_count, stream); + } + + auto left_invalid_indices = + std::make_unique>(right_indices_complement->size(), stream); + thrust::uninitialized_fill(rmm::exec_policy(stream), + left_invalid_indices->begin(), + left_invalid_indices->end(), + JoinNoneValue); + + return std::make_pair(std::move(left_invalid_indices), std::move(right_indices_complement)); +} + +} // namespace detail +} // namespace cudf diff --git a/cpp/src/join/semi_join.cu b/cpp/src/join/semi_join.cu index cc34aed33ea..69a7b8c722b 100644 --- a/cpp/src/join/semi_join.cu +++ b/cpp/src/join/semi_join.cu @@ -18,15 +18,12 @@ #include #include -#include - #include -#include +#include +#include #include -#include #include #include -#include #include #include @@ -34,11 +31,15 @@ #include #include +#include +#include +#include + namespace cudf { namespace detail { -template std::unique_ptr> left_semi_anti_join( + join_kind const kind, cudf::table_view const& left_keys, cudf::table_view const& right_keys, null_equality compare_nulls, @@ -48,13 +49,13 @@ std::unique_ptr> left_semi_anti_join( CUDF_EXPECTS(0 != left_keys.num_columns(), "Left table is empty"); CUDF_EXPECTS(0 != right_keys.num_columns(), "Right table is empty"); - if (is_trivial_join(left_keys, right_keys, JoinKind)) { + if (is_trivial_join(left_keys, right_keys, kind)) { return std::make_unique>(0, stream, mr); } - if ((join_kind::LEFT_ANTI_JOIN == JoinKind) && (0 == right_keys.num_rows())) { + if ((join_kind::LEFT_ANTI_JOIN == kind) && (0 == right_keys.num_rows())) { auto result = std::make_unique>(left_keys.num_rows(), stream, mr); - thrust::sequence(thrust::cuda::par.on(stream.value()), result->begin(), result->end()); + thrust::sequence(rmm::exec_policy(stream), result->begin(), result->end()); return result; } @@ -115,7 +116,7 @@ std::unique_ptr> left_semi_anti_join( // // For semi join we want contains to be true, for anti join we want contains to be false - bool join_type_boolean = (JoinKind == join_kind::LEFT_SEMI_JOIN); + bool const join_type_boolean = 
(kind == join_kind::LEFT_SEMI_JOIN); auto gather_map = std::make_unique>(left_num_rows, stream, mr); @@ -152,27 +153,26 @@ std::unique_ptr> left_semi_anti_join( * @throws cudf::logic_error if number of returned columns is 0 * @throws cudf::logic_error if number of elements in `right_on` and `left_on` are not equal * - * @param[in] left The left table - * @param[in] right The right table - * @param[in] left_on The column indices from `left` to join on. - * The column from `left` indicated by `left_on[i]` - * will be compared against the column from `right` - * indicated by `right_on[i]`. - * @param[in] right_on The column indices from `right` to join on. - * The column from `right` indicated by `right_on[i]` - * will be compared against the column from `left` - * indicated by `left_on[i]`. - * @param[in] compare_nulls Controls whether null join-key values should match or not. - * @param[in] mr Device memory resource to used to allocate the returned table's - * device memory - * @param[in] stream CUDA stream used for device memory operations and kernel launches. - * @tparam join_kind Indicates whether to do LEFT_SEMI_JOIN or LEFT_ANTI_JOIN + * @param kind Indicates whether to do LEFT_SEMI_JOIN or LEFT_ANTI_JOIN + * @param left The left table + * @param right The right table + * @param left_on The column indices from `left` to join on. + * The column from `left` indicated by `left_on[i]` + * will be compared against the column from `right` + * indicated by `right_on[i]`. + * @param right_on The column indices from `right` to join on. + * The column from `right` indicated by `right_on[i]` + * will be compared against the column from `left` + * indicated by `left_on[i]`. + * @param compare_nulls Controls whether null join-key values should match or not. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource to used to allocate the returned table * - * @returns Result of joining `left` and `right` tables on the columns - * specified by `left_on` and `right_on`. + * @returns Result of joining `left` and `right` tables on the columns + * specified by `left_on` and `right_on`. */ -template std::unique_ptr left_semi_anti_join( + join_kind const kind, cudf::table_view const& left, cudf::table_view const& right, std::vector const& left_on, @@ -183,11 +183,11 @@ std::unique_ptr left_semi_anti_join( { CUDF_EXPECTS(left_on.size() == right_on.size(), "Mismatch in number of columns to be joined on"); - if ((left_on.empty() || right_on.empty()) || is_trivial_join(left, right, JoinKind)) { + if ((left_on.empty() || right_on.empty()) || is_trivial_join(left, right, kind)) { return empty_like(left); } - if ((join_kind::LEFT_ANTI_JOIN == JoinKind) && (0 == right.num_rows())) { + if ((join_kind::LEFT_ANTI_JOIN == kind) && (0 == right.num_rows())) { // Everything matches, just copy the proper columns from the left table return std::make_unique
(left, stream, mr); } @@ -202,14 +202,23 @@ std::unique_ptr left_semi_anti_join( auto const left_selected = matched.second.front(); auto const right_selected = matched.second.back(); - auto gather_map = - left_semi_anti_join(left_selected, right_selected, compare_nulls, stream); + auto gather_vector = + left_semi_anti_join(kind, left_selected, right_selected, compare_nulls, stream); + + // wrapping the device vector with a column view allows calling the non-iterator + // version of detail::gather, improving compile time by 10% and reducing the + // object file size by 2.2x without affecting performance + auto gather_map = column_view(data_type{type_id::INT32}, + static_cast(gather_vector->size()), + gather_vector->data(), + nullptr, + 0); auto const left_updated = scatter_columns(left_selected, left_on, left); return cudf::detail::gather(left_updated, - gather_map->begin(), - gather_map->end(), + gather_map, out_of_bounds_policy::DONT_CHECK, + negative_index_policy::NOT_ALLOWED, stream, mr); } @@ -224,8 +233,14 @@ std::unique_ptr left_semi_join(cudf::table_view const& left, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::left_semi_anti_join( - left, right, left_on, right_on, compare_nulls, rmm::cuda_stream_default, mr); + return detail::left_semi_anti_join(detail::join_kind::LEFT_SEMI_JOIN, + left, + right, + left_on, + right_on, + compare_nulls, + rmm::cuda_stream_default, + mr); } std::unique_ptr> left_semi_join( @@ -235,8 +250,8 @@ std::unique_ptr> left_semi_join( rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::left_semi_anti_join( - left, right, compare_nulls, rmm::cuda_stream_default, mr); + return detail::left_semi_anti_join( + detail::join_kind::LEFT_SEMI_JOIN, left, right, compare_nulls, rmm::cuda_stream_default, mr); } std::unique_ptr left_anti_join(cudf::table_view const& left, @@ -247,8 +262,14 @@ std::unique_ptr left_anti_join(cudf::table_view const& left, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::left_semi_anti_join( - left, right, left_on, right_on, compare_nulls, rmm::cuda_stream_default, mr); + return detail::left_semi_anti_join(detail::join_kind::LEFT_ANTI_JOIN, + left, + right, + left_on, + right_on, + compare_nulls, + rmm::cuda_stream_default, + mr); } std::unique_ptr> left_anti_join( @@ -258,8 +279,8 @@ std::unique_ptr> left_anti_join( rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::left_semi_anti_join( - left, right, compare_nulls, rmm::cuda_stream_default, mr); + return detail::left_semi_anti_join( + detail::join_kind::LEFT_ANTI_JOIN, left, right, compare_nulls, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/lists/combine/concatenate_list_elements.cu b/cpp/src/lists/combine/concatenate_list_elements.cu index c5a28a8ec5f..fb6bff3f129 100644 --- a/cpp/src/lists/combine/concatenate_list_elements.cu +++ b/cpp/src/lists/combine/concatenate_list_elements.cu @@ -51,13 +51,9 @@ std::unique_ptr concatenate_lists_ignore_null(column_view const& input, auto out_offsets = make_numeric_column( data_type{type_id::INT32}, num_rows + 1, mask_state::UNALLOCATED, stream, mr); - // The array of int8_t stores validities for the output list elements. - auto validities = rmm::device_uvector(build_null_mask ? 
num_rows : 0, stream); - auto const d_out_offsets = out_offsets->mutable_view().template begin(); auto const d_row_offsets = lists_column_view(input).offsets_begin(); auto const d_list_offsets = lists_column_view(lists_column_view(input).child()).offsets_begin(); - auto const lists_dv_ptr = column_device_view::create(lists_column_view(input).child()); // Concatenating the lists at the same row by converting the entry offsets from the child column // into row offsets of the root column. Those entry offsets are subtracted by the first entry @@ -67,22 +63,7 @@ std::unique_ptr concatenate_lists_ignore_null(column_view const& input, iter, iter + num_rows + 1, d_out_offsets, - [d_row_offsets, - d_list_offsets, - lists_dv = *lists_dv_ptr, - d_validities = validities.begin(), - build_null_mask, - iter] __device__(auto const idx) { - if (build_null_mask) { - // The output row will be null only if all lists on the input row are null. - auto const is_valid = thrust::any_of(thrust::seq, - iter + d_row_offsets[idx], - iter + d_row_offsets[idx + 1], - [&] __device__(auto const list_idx) { - return lists_dv.is_valid(list_idx); - }); - d_validities[idx] = static_cast(is_valid); - } + [d_row_offsets, d_list_offsets] __device__(auto const idx) { auto const start_offset = d_list_offsets[d_row_offsets[0]]; return d_list_offsets[d_row_offsets[idx]] - start_offset; }); @@ -92,10 +73,23 @@ std::unique_ptr concatenate_lists_ignore_null(column_view const& input, lists_column_view(lists_column_view(input).get_sliced_child(stream)).get_sliced_child(stream)); auto [null_mask, null_count] = [&] { - return build_null_mask - ? cudf::detail::valid_if( - validities.begin(), validities.end(), thrust::identity{}, stream, mr) - : std::make_pair(cudf::detail::copy_bitmask(input, stream, mr), input.null_count()); + if (!build_null_mask) + return std::make_pair(cudf::detail::copy_bitmask(input, stream, mr), input.null_count()); + + // The output row will be null only if all lists on the input row are null. + auto const lists_dv_ptr = column_device_view::create(lists_column_view(input).child(), stream); + return cudf::detail::valid_if( + iter, + iter + num_rows, + [d_row_offsets, lists_dv = *lists_dv_ptr, iter] __device__(auto const idx) { + return thrust::any_of( + thrust::seq, + iter + d_row_offsets[idx], + iter + d_row_offsets[idx + 1], + [&] __device__(auto const list_idx) { return lists_dv.is_valid(list_idx); }); + }, + stream, + mr); }(); return make_lists_column(num_rows, diff --git a/cpp/src/lists/interleave_columns.cu b/cpp/src/lists/interleave_columns.cu index 5da8aef5853..4e69baef6ed 100644 --- a/cpp/src/lists/interleave_columns.cu +++ b/cpp/src/lists/interleave_columns.cu @@ -172,29 +172,24 @@ struct interleave_list_entries_fn { rmm::mr::device_memory_resource* mr) const noexcept { auto const table_dv_ptr = table_device_view::create(input); - auto const comp_fn = compute_string_sizes_and_interleave_lists_fn{ + auto comp_fn = compute_string_sizes_and_interleave_lists_fn{ *table_dv_ptr, output_list_offsets.template begin(), data_has_null_mask}; - if (data_has_null_mask) { - auto [offsets_column, chars_column, null_mask, null_count] = - cudf::strings::detail::make_strings_children_with_null_mask( - comp_fn, num_output_lists, num_output_entries, stream, mr); - return make_strings_column(num_output_entries, - std::move(offsets_column), - std::move(chars_column), - null_count, - std::move(null_mask), - stream, - mr); - } + auto validities = + rmm::device_uvector(data_has_null_mask ? 
num_output_entries : 0, stream); + comp_fn.d_validities = validities.data(); auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children( comp_fn, num_output_lists, num_output_entries, stream, mr); + + auto [null_mask, null_count] = cudf::detail::valid_if( + validities.begin(), validities.end(), thrust::identity{}, stream, mr); + return make_strings_column(num_output_entries, std::move(offsets_column), std::move(chars_column), - 0, - rmm::device_buffer{}, + null_count, + std::move(null_mask), stream, mr); } diff --git a/cpp/src/quantiles/quantiles_util.hpp b/cpp/src/quantiles/quantiles_util.hpp index 1df0a4ab41a..def4a400488 100644 --- a/cpp/src/quantiles/quantiles_util.hpp +++ b/cpp/src/quantiles/quantiles_util.hpp @@ -156,6 +156,8 @@ template CUDA_HOST_DEVICE_CALLABLE Result select_quantile_data(Iterator begin, size_type size, double q, interpolation interp) { + if (size == 0) return static_cast(*begin); + quantile_index idx(size, q); switch (interp) { diff --git a/cpp/src/reshape/interleave_columns.cu b/cpp/src/reshape/interleave_columns.cu index d902efd8b06..b15708c5cf8 100644 --- a/cpp/src/reshape/interleave_columns.cu +++ b/cpp/src/reshape/interleave_columns.cu @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -28,32 +29,111 @@ namespace cudf { namespace detail { namespace { -struct interleave_columns_functor { - template - std::enable_if_t() and not std::is_same_v and - not std::is_same_v, - std::unique_ptr> - operator()(Args&&...) +// Error case when no other overload or specialization is available +template +struct interleave_columns_impl { + template + std::unique_ptr operator()(Args&&...) { - CUDF_FAIL("Called `interleave_columns` on none-supported data type."); + CUDF_FAIL("Unsupported type in `interleave_columns`."); } +}; +struct interleave_columns_functor { template - std::enable_if_t, std::unique_ptr> operator()( - table_view const& lists_columns, - bool create_mask, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + std::unique_ptr operator()(table_view const& input, + bool create_mask, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + return interleave_columns_impl{}(input, create_mask, stream, mr); + } +}; + +template +struct interleave_columns_impl>> { + std::unique_ptr operator()(table_view const& lists_columns, + bool create_mask, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { return lists::detail::interleave_columns(lists_columns, create_mask, stream, mr); } +}; - template - std::enable_if_t, std::unique_ptr> operator()( - table_view const& strings_columns, - bool create_mask, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +template +struct interleave_columns_impl>> { + std::unique_ptr operator()(table_view const& structs_columns, + bool create_mask, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + // We can safely call `column(0)` as the number of columns is known to be non zero. 
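The interleave_columns refactor above replaces a single functor full of enable_if-constrained operator() overloads with a family of interleave_columns_impl partial specializations: the functor handed to the type dispatcher just forwards to the matching impl, and any type without a specialization falls through to one failing primary template. A condensed, cudf-free sketch of that shape (the integral category stands in for cudf's fixed-width/strings/lists/structs checks):

#include <iostream>
#include <stdexcept>
#include <type_traits>
#include <utility>

// Primary template: any type without a dedicated specialization is rejected here,
// mirroring the CUDF_FAIL fallback for unsupported types.
template <typename T, typename Enable = void>
struct impl {
  template <typename... Args>
  void operator()(Args&&...) const
  {
    throw std::logic_error("Unsupported type");
  }
};

// One specialization per supported category.
template <typename T>
struct impl<T, std::enable_if_t<std::is_integral_v<T>>> {
  void operator()() const { std::cout << "fixed-width path\n"; }
};

// The functor given to the type dispatcher only forwards to the matching impl.
struct dispatch_functor {
  template <typename T, typename... Args>
  void operator()(Args&&... args) const
  {
    impl<T>{}(std::forward<Args>(args)...);
  }
};

int main()
{
  dispatch_functor{}.operator()<int>();  // selects the integral specialization
  // dispatch_functor{}.operator()<float>();  // would throw: no specialization for float
}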
+ auto const num_children = structs_columns.column(0).num_children(); + CUDF_EXPECTS( + std::all_of(structs_columns.begin(), + structs_columns.end(), + [num_children](auto const& col) { return col.num_children() == num_children; }), + "Number of children of the input structs columns must be the same"); + + auto const num_columns = structs_columns.num_columns(); + auto const num_rows = structs_columns.num_rows(); + auto const output_size = num_columns * num_rows; + + // Interleave the children of the structs columns. + std::vector> output_struct_members; + for (size_type child_idx = 0; child_idx < num_children; ++child_idx) { + // Collect children columns from the input structs columns at index `child_idx`. + auto const child_iter = + thrust::make_transform_iterator(structs_columns.begin(), [child_idx](auto const& col) { + return structs_column_view(col).get_sliced_child(child_idx); + }); + auto children = std::vector(child_iter, child_iter + num_columns); + + auto const child_type = children.front().type(); + CUDF_EXPECTS( + std::all_of(children.cbegin(), + children.cend(), + [child_type](auto const& col) { return child_type == col.type(); }), + "Children of the input structs columns at the same child index must have the same type"); + + auto const children_nullable = std::any_of( + children.cbegin(), children.cend(), [](auto const& col) { return col.nullable(); }); + output_struct_members.emplace_back( + type_dispatcher(child_type, + interleave_columns_functor{}, + table_view{std::move(children)}, + children_nullable, + stream, + mr)); + } + + auto const create_mask_fn = [&] { + auto const input_dv_ptr = table_device_view::create(structs_columns); + auto const validity_fn = [input_dv = *input_dv_ptr, num_columns] __device__(auto const idx) { + return input_dv.column(idx % num_columns).is_valid(idx / num_columns); + }; + return cudf::detail::valid_if(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(output_size), + validity_fn, + stream, + mr); + }; + + // Only create null mask if at least one input structs column is nullable. + auto [null_mask, null_count] = + create_mask ? 
create_mask_fn() : std::pair{rmm::device_buffer{0, stream, mr}, size_type{0}}; + return make_structs_column( + output_size, std::move(output_struct_members), null_count, std::move(null_mask), stream, mr); + } +}; + +template +struct interleave_columns_impl>> { + std::unique_ptr operator()(table_view const& strings_columns, + bool create_mask, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto num_columns = strings_columns.num_columns(); if (num_columns == 1) // Single strings column returns a copy @@ -105,7 +185,7 @@ struct interleave_columns_functor { cudf::detail::get_value(offsets_column->view(), num_strings, stream); auto chars_column = strings::detail::create_chars_child_column(bytes, stream, mr); // Fill the chars column - auto d_results_chars = chars_column->mutable_view().data(); + auto d_results_chars = chars_column->mutable_view().template data(); thrust::for_each_n( rmm::exec_policy(stream), thrust::make_counting_iterator(0), @@ -131,13 +211,14 @@ struct interleave_columns_functor { stream, mr); } +}; - template - std::enable_if_t(), std::unique_ptr> operator()( - table_view const& input, - bool create_mask, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +template +struct interleave_columns_impl()>> { + std::unique_ptr operator()(table_view const& input, + bool create_mask, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto arch_column = input.column(0); auto output_size = input.num_columns() * input.num_rows(); @@ -184,30 +265,33 @@ struct interleave_columns_functor { }; } // anonymous namespace -} // namespace detail std::unique_ptr interleave_columns(table_view const& input, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - CUDF_FUNC_RANGE(); CUDF_EXPECTS(input.num_columns() > 0, "input must have at least one column to determine dtype."); auto const dtype = input.column(0).type(); - CUDF_EXPECTS(std::all_of(std::cbegin(input), std::cend(input), [dtype](auto const& col) { return dtype == col.type(); }), - "DTYPE mismatch"); + "Input columns must have the same type"); auto const output_needs_mask = std::any_of( std::cbegin(input), std::cend(input), [](auto const& col) { return col.nullable(); }); - return type_dispatcher(dtype, - detail::interleave_columns_functor{}, - input, - output_needs_mask, - rmm::cuda_stream_default, - mr); + return type_dispatcher( + dtype, detail::interleave_columns_functor{}, input, output_needs_mask, stream, mr); +} + +} // namespace detail + +std::unique_ptr interleave_columns(table_view const& input, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::interleave_columns(input, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/scalar/scalar.cpp b/cpp/src/scalar/scalar.cpp index 045bfbe0327..f982e7b99f2 100644 --- a/cpp/src/scalar/scalar.cpp +++ b/cpp/src/scalar/scalar.cpp @@ -17,7 +17,7 @@ #include #include -#include +#include #include #include #include @@ -520,6 +520,13 @@ list_scalar::list_scalar(list_scalar const& other, column_view list_scalar::view() const { return _data.view(); } +struct_scalar::struct_scalar(struct_scalar const& other, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + : scalar{other, stream, mr}, _data(other._data, stream, mr) +{ +} + struct_scalar::struct_scalar(table_view const& data, bool is_valid, rmm::cuda_stream_view stream, @@ -567,12 +574,13 @@ void struct_scalar::superimpose_nulls(rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* 
mr) { // push validity mask down - std::vector host_validity({0}); - auto validity = cudf::detail::make_device_uvector_sync(host_validity, stream, mr); + std::vector host_validity( + cudf::bitmask_allocation_size_bytes(1) / sizeof(bitmask_type), 0); + auto validity = cudf::detail::create_null_mask(1, mask_state::ALL_NULL, stream); auto iter = thrust::make_counting_iterator(0); std::for_each(iter, iter + _data.num_columns(), [&](size_type i) { cudf::structs::detail::superimpose_parent_nulls( - validity.data(), 1, _data.get_column(i), stream, mr); + static_cast(validity.data()), 1, _data.get_column(i), stream, mr); }); } diff --git a/cpp/src/strings/combine/join_list_elements.cu b/cpp/src/strings/combine/join_list_elements.cu index 2ef27759124..3e0bb8704b6 100644 --- a/cpp/src/strings/combine/join_list_elements.cu +++ b/cpp/src/strings/combine/join_list_elements.cu @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -60,9 +61,6 @@ struct compute_size_and_concatenate_fn { // If d_chars != nullptr: only concatenate strings. char* d_chars{nullptr}; - // We need to set `1` or `0` for the validities of the output strings. - int8_t* d_validities{nullptr}; - __device__ bool output_is_null(size_type const idx, size_type const start_idx, size_type const end_idx) const noexcept @@ -73,33 +71,31 @@ struct compute_size_and_concatenate_fn { __device__ void operator()(size_type const idx) const noexcept { - // If this is the second pass, and the row `idx` is known to be a null string - if (d_chars && !d_validities[idx]) { return; } + // If this is the second pass, and the row `idx` is known to be a null or empty string + if (d_chars && (d_offsets[idx] == d_offsets[idx + 1])) { return; } // Indices of the strings within the list row auto const start_idx = list_offsets[idx]; auto const end_idx = list_offsets[idx + 1]; if (!d_chars && output_is_null(idx, start_idx, end_idx)) { - d_offsets[idx] = 0; - d_validities[idx] = false; + d_offsets[idx] = 0; return; } auto const separator = func.separator(idx); - auto size_bytes = size_type{0}; char* output_ptr = d_chars ? d_chars + d_offsets[idx] : nullptr; - bool has_valid_element = false; bool write_separator = false; + auto size_bytes = size_type{0}; + bool has_valid_element = false; for (size_type str_idx = start_idx; str_idx < end_idx; ++str_idx) { bool null_element = strings_dv.is_null(str_idx); has_valid_element = has_valid_element || !null_element; if (!d_chars && (null_element && !string_narep_dv.is_valid())) { - d_offsets[idx] = 0; - d_validities[idx] = false; - return; // early termination: the entire list of strings will result in a null string + size_bytes = 0; + break; } if (write_separator && (separate_nulls == separator_on_nulls::YES || !null_element)) { @@ -119,11 +115,7 @@ struct compute_size_and_concatenate_fn { // If there are all null elements, the output should be the same as having an empty list input: // a null or an empty string - if (!d_chars) { - d_offsets[idx] = has_valid_element ? size_bytes : 0; - d_validities[idx] = - has_valid_element || empty_list_policy == output_if_empty_list::EMPTY_STRING; - } + if (!d_chars) { d_offsets[idx] = has_valid_element ? 
size_bytes : 0; } } }; @@ -144,6 +136,33 @@ struct scalar_separator_fn { __device__ string_view separator(size_type const) const noexcept { return d_separator.value(); } }; +template +struct validities_fn { + CompFn comp_fn; + + validities_fn(CompFn comp_fn) : comp_fn(comp_fn) {} + + __device__ bool operator()(size_type idx) + { + auto const start_idx = comp_fn.list_offsets[idx]; + auto const end_idx = comp_fn.list_offsets[idx + 1]; + bool valid_output = !comp_fn.output_is_null(idx, start_idx, end_idx); + if (valid_output) { + bool check_elements = false; + for (size_type str_idx = start_idx; str_idx < end_idx; ++str_idx) { + bool const valid_element = comp_fn.strings_dv.is_valid(str_idx); + check_elements = check_elements || valid_element; + // if an element is null and narep is invalid, the output row is null + if (!valid_element && !comp_fn.string_narep_dv.is_valid()) { return false; } + } + // handle empty-list-as-null output policy setting + valid_output = + check_elements || comp_fn.empty_list_policy == output_if_empty_list::EMPTY_STRING; + } + return valid_output; + } +}; + } // namespace std::unique_ptr join_list_elements(lists_column_view const& lists_strings_column, @@ -180,8 +199,14 @@ std::unique_ptr join_list_elements(lists_column_view const& lists_string string_narep_dv, separate_nulls, empty_list_policy}; - auto [offsets_column, chars_column, null_mask, null_count] = - make_strings_children_with_null_mask(comp_fn, num_rows, num_rows, stream, mr); + + auto [offsets_column, chars_column] = make_strings_children(comp_fn, num_rows, stream, mr); + auto [null_mask, null_count] = + cudf::detail::valid_if(thrust::counting_iterator(0), + thrust::counting_iterator(num_rows), + validities_fn{comp_fn}, + stream, + mr); return make_strings_column(num_rows, std::move(offsets_column), @@ -254,8 +279,14 @@ std::unique_ptr join_list_elements(lists_column_view const& lists_string string_narep_dv, separate_nulls, empty_list_policy}; - auto [offsets_column, chars_column, null_mask, null_count] = - make_strings_children_with_null_mask(comp_fn, num_rows, num_rows, stream, mr); + + auto [offsets_column, chars_column] = make_strings_children(comp_fn, num_rows, stream, mr); + auto [null_mask, null_count] = + cudf::detail::valid_if(thrust::counting_iterator(0), + thrust::counting_iterator(num_rows), + validities_fn{comp_fn}, + stream, + mr); return make_strings_column(num_rows, std::move(offsets_column), diff --git a/cpp/src/strings/convert/convert_durations.cu b/cpp/src/strings/convert/convert_durations.cu index 8dcb260a7ee..c8b4b859020 100644 --- a/cpp/src/strings/convert/convert_durations.cu +++ b/cpp/src/strings/convert/convert_durations.cu @@ -535,7 +535,7 @@ struct parse_duration { auto ptr = d_string.data(); auto length = d_string.size_bytes(); int8_t hour_shift{0}; - for (size_t idx = 0; idx < items_count; ++idx) { + for (size_type idx = 0; idx < items_count; ++idx) { auto item = d_format_items[idx]; if (length < item.length) return 1; if (item.item_type == format_char_type::literal) { // static character we'll just skip; @@ -567,7 +567,7 @@ struct parse_duration { break; case 'S': // [-]SS[.mmm][uuu][nnn] timeparts->second = parse_second(ptr, item_length); - if (*(ptr + item_length) == '.') { + if ((item_length < length) && *(ptr + item_length) == '.') { item_length++; int64_t nanoseconds = str2int_fixed( ptr + item_length, 9, length - item_length, item_length); // normalize to nanoseconds diff --git a/cpp/src/strings/copying/concatenate.cu b/cpp/src/strings/copying/concatenate.cu index 
866ff1adbc6..8d77c7da4cc 100644 --- a/cpp/src/strings/copying/concatenate.cu +++ b/cpp/src/strings/copying/concatenate.cu @@ -16,8 +16,7 @@ #include #include -#include -#include +#include #include #include #include @@ -27,12 +26,11 @@ #include #include +#include #include -#include "thrust/iterator/transform_iterator.h" #include -#include -#include +#include #include namespace cudf { @@ -287,12 +285,15 @@ std::unique_ptr concatenate(host_span columns, column_view offsets_child = column->child(strings_column_view::offsets_column_index); column_view chars_child = column->child(strings_column_view::chars_column_index); - auto d_offsets = offsets_child.data() + column_offset; - int32_t bytes_offset = thrust::device_pointer_cast(d_offsets)[0]; + auto bytes_offset = + cudf::detail::get_value(offsets_child, column_offset, stream); // copy the chars column data - auto d_chars = chars_child.data() + bytes_offset; - size_type bytes = thrust::device_pointer_cast(d_offsets)[column_size] - bytes_offset; + auto d_chars = chars_child.data() + bytes_offset; + auto const bytes = + cudf::detail::get_value(offsets_child, column_size + column_offset, stream) - + bytes_offset; + CUDA_TRY( cudaMemcpyAsync(d_new_chars, d_chars, bytes, cudaMemcpyDeviceToDevice, stream.value())); diff --git a/cpp/src/strings/replace/backref_re.cu b/cpp/src/strings/replace/backref_re.cu index 462efedffe5..5f7b195e8f9 100644 --- a/cpp/src/strings/replace/backref_re.cu +++ b/cpp/src/strings/replace/backref_re.cu @@ -37,39 +37,57 @@ namespace strings { namespace detail { namespace { +/** + * @brief Return the capturing group index pattern to use with the given replacement string. + * + * Only two patterns are supported at this time `\d` and `${d}` where `d` is an integer in + * the range 1-99. The `\d` pattern is returned by default unless no `\d` pattern is found in + * the `repl` string, + * + * Reference: https://www.regular-expressions.info/refreplacebackref.html + */ +std::string get_backref_pattern(std::string const& repl) +{ + std::string const backslash_pattern = "\\\\(\\d+)"; + std::string const bracket_pattern = "\\$\\{(\\d+)\\}"; + std::smatch m; + return std::regex_search(repl, m, std::regex(backslash_pattern)) ? backslash_pattern + : bracket_pattern; +} /** * @brief Parse the back-ref index and position values from a given replace format. * - * The backref numbers are expected to be 1-based. + * The back-ref numbers are expected to be 1-based. + * + * Returns a modified string without back-ref indicators and a vector of back-ref + * byte position pairs. These are used by the device code to build the output + * string by placing the captured group elements into the replace format. * - * Returns a modified string without back-ref indicators and a vector of backref - * byte position pairs. - * ``` - * Example: - * for input string: 'hello \2 and \1' - * the returned pairs: (2,6),(1,11) - * returned string is: 'hello and ' - * ``` + * For example, for input string 'hello \2 and \1' the returned `backref_type` vector + * contains `[(2,6),(1,11)]` and the returned string is 'hello and '. 
*/ std::pair> parse_backrefs(std::string const& repl) { std::vector backrefs; std::string str = repl; // make a modifiable copy std::smatch m; - std::regex ex("(\\\\\\d+)"); // this searches for backslash-number(s); example "\1" - std::string rtn; // result without refs + std::regex ex(get_backref_pattern(repl)); + std::string rtn; size_type byte_offset = 0; - while (std::regex_search(str, m, ex)) { - if (m.size() == 0) break; - std::string const backref = m[0]; - size_type const position = static_cast(m.position(0)); - size_type const length = static_cast(backref.length()); + while (std::regex_search(str, m, ex) && !m.empty()) { + // parse the back-ref index number + size_type const index = static_cast(std::atoi(std::string{m[1]}.c_str())); + CUDF_EXPECTS(index > 0 && index < 100, "Group index numbers must be in the range 1-99"); + + // store the new byte offset and index value + size_type const position = static_cast(m.position(0)); byte_offset += position; - size_type const index = std::atoi(backref.c_str() + 1); // back-ref index number - CUDF_EXPECTS(index > 0, "Back-reference numbers must be greater than 0"); - rtn += str.substr(0, position); - str = str.substr(position + length); backrefs.push_back({index, byte_offset}); + + // update the output string + rtn += str.substr(0, position); + // remove the back-ref pattern to continue parsing + str = str.substr(position + static_cast(m.length(0))); } if (!str.empty()) // add the remainder rtn += str; // of the string @@ -96,7 +114,7 @@ std::unique_ptr replace_with_backrefs( auto d_prog = reprog_device::create(pattern, get_character_flags_table(), strings.size(), stream); auto const regex_insts = d_prog->insts_counts(); - // parse the repl string for backref indicators + // parse the repl string for back-ref indicators auto const parse_result = parse_backrefs(repl); rmm::device_uvector backrefs(parse_result.second.size(), stream); CUDA_TRY(cudaMemcpyAsync(backrefs.data(), diff --git a/cpp/src/structs/copying/concatenate.cu b/cpp/src/structs/copying/concatenate.cu index 6f18c4bcbd4..fe5483b119d 100644 --- a/cpp/src/structs/copying/concatenate.cu +++ b/cpp/src/structs/copying/concatenate.cu @@ -28,6 +28,7 @@ #include #include +#include namespace cudf { namespace structs { @@ -53,7 +54,11 @@ std::unique_ptr concatenate(host_span columns, return cudf::detail::concatenate(cols, stream, mr); }); - size_type const total_length = children[0]->size(); + // get total length from concatenated children; if no child exists, we would compute it + auto const acc_size_fn = [](size_type s, column_view const& c) { return s + c.size(); }; + auto const total_length = + !children.empty() ? children[0]->size() + : std::accumulate(columns.begin(), columns.end(), size_type{0}, acc_size_fn); // if any of the input columns have nulls, construct the output mask bool const has_nulls = diff --git a/cpp/src/structs/utilities.cpp b/cpp/src/structs/utilities.cpp index 80bea2ab55e..bfeb6ef3533 100644 --- a/cpp/src/structs/utilities.cpp +++ b/cpp/src/structs/utilities.cpp @@ -16,8 +16,10 @@ #include +#include #include #include +#include #include #include #include @@ -61,6 +63,24 @@ std::vector> extract_ordered_struct_children( return result; } +namespace { +/** + * @brief Check whether the specified column is of type `STRUCT`. + */ +bool is_struct(cudf::column_view const& col) { return col.type().id() == type_id::STRUCT; } + +/** + * @brief Check whether the specified column is of type LIST, or any LISTs in its descendent + * columns. 
+ */ +bool is_or_has_nested_lists(cudf::column_view const& col) +{ + auto is_list = [](cudf::column_view const& col) { return col.type().id() == type_id::LIST; }; + + return is_list(col) || std::any_of(col.child_begin(), col.child_end(), is_or_has_nested_lists); +} +} // namespace + /** * @brief Flattens struct columns to constituent non-struct columns in the input table. * @@ -86,6 +106,13 @@ struct flattened_table { null_precedence(null_precedence), nullability(nullability) { + fail_if_unsupported_types(input); + } + + void fail_if_unsupported_types(table_view const& input) const + { + auto const has_lists = std::any_of(input.begin(), input.end(), is_or_has_nested_lists); + CUDF_EXPECTS(not has_lists, "Flattening LIST columns is not supported."); } // Convert null_mask to BOOL8 columns and flatten the struct children in order. @@ -156,9 +183,6 @@ struct flattened_table { } }; -/** - * @copydoc cudf::detail::flatten_nested_columns - */ std::tuple, std::vector, @@ -168,15 +192,107 @@ flatten_nested_columns(table_view const& input, std::vector const& null_precedence, column_nullability nullability) { - std::vector> validity_as_column; - auto const has_struct = std::any_of( - input.begin(), input.end(), [](auto const& col) { return col.type().id() == type_id::STRUCT; }); - if (not has_struct) - return std::make_tuple(input, column_order, null_precedence, std::move(validity_as_column)); + auto const has_struct = std::any_of(input.begin(), input.end(), is_struct); + if (not has_struct) { + return std::make_tuple( + input, column_order, null_precedence, std::vector>{}); + } return flattened_table{input, column_order, null_precedence, nullability}(); } +namespace { +using vector_of_columns = std::vector>; +using column_index_t = typename vector_of_columns::size_type; + +// Forward declaration, to enable recursion via `unflattener`. +std::unique_ptr unflatten_struct(vector_of_columns& flattened, + column_index_t& current_index, + cudf::column_view const& blueprint); + +/** + * @brief Helper functor to reconstruct STRUCT columns from its flattened member columns. + * + */ +class unflattener { + public: + unflattener(vector_of_columns& flattened_, column_index_t& current_index_) + : flattened{flattened_}, current_index{current_index_} + { + } + + auto operator()(column_view const& blueprint) + { + return is_struct(blueprint) ? unflatten_struct(flattened, current_index, blueprint) + : std::move(flattened[current_index++]); + } + + private: + vector_of_columns& flattened; + column_index_t& current_index; + +}; // class unflattener; + +std::unique_ptr unflatten_struct(vector_of_columns& flattened, + column_index_t& current_index, + cudf::column_view const& blueprint) +{ + // "Consume" columns from `flattened`, starting at `current_index`, + // based on the provided `blueprint` struct col. Recurse for struct children. + CUDF_EXPECTS(blueprint.type().id() == type_id::STRUCT, + "Expected blueprint column to be a STRUCT column."); + + CUDF_EXPECTS(current_index < flattened.size(), "STRUCT column can't have 0 children."); + + auto const num_rows = flattened[current_index]->size(); + + // cudf::flatten_nested_columns() executes depth first, and serializes the struct null vector + // before the child/member columns. + // E.g. STRUCT_1< STRUCT_2< A, B >, C > is flattened to: + // 1. Null Vector for STRUCT_1 + // 2. Null Vector for STRUCT_2 + // 3. Member STRUCT_2::A + // 4. Member STRUCT_2::B + // 5. Member STRUCT_1::C + // + // Extract null-vector *before* child columns are constructed. 
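The depth-first ordering described in the comment above is the invariant shared by flatten_nested_columns() and the unflattener. A minimal host-only sketch of that ordering convention follows; it is illustrative and not from this patch (the `node` and `flatten_names` names are hypothetical; the real code walks column_views and null masks rather than strings).

// --- begin illustrative sketch (not part of the patch) ---
#include <string>
#include <vector>

struct node {
  std::string name;
  bool is_struct;
  std::vector<node> children;
};

// A struct contributes its null vector first, then each child in order,
// recursing depth-first into nested structs.
void flatten_names(node const& n, std::vector<std::string>& out)
{
  if (n.is_struct) {
    out.push_back("null-vector(" + n.name + ")");
    for (auto const& child : n.children) { flatten_names(child, out); }
  } else {
    out.push_back(n.name);
  }
}

// For STRUCT_1< STRUCT_2< A, B >, C > the serialized order is:
//   null-vector(STRUCT_1), null-vector(STRUCT_2), A, B, C
// which is exactly the order the unflattener consumes: one null vector per
// struct entry, one column per non-struct entry.
// --- end illustrative sketch ---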
+ auto struct_null_column_contents = flattened[current_index++]->release(); + auto unflattening_iter = + thrust::make_transform_iterator(blueprint.child_begin(), unflattener{flattened, current_index}); + + return cudf::make_structs_column( + num_rows, + vector_of_columns{unflattening_iter, unflattening_iter + blueprint.num_children()}, + UNKNOWN_NULL_COUNT, // Do count? + std::move(*struct_null_column_contents.null_mask)); +} +} // namespace + +std::unique_ptr unflatten_nested_columns(std::unique_ptr&& flattened, + table_view const& blueprint) +{ + // Bail, if LISTs are present. + auto const has_lists = std::any_of(blueprint.begin(), blueprint.end(), is_or_has_nested_lists); + CUDF_EXPECTS(not has_lists, "Unflattening LIST columns is not supported."); + + // If there are no STRUCTs, unflattening is a NOOP. + auto const has_structs = std::any_of(blueprint.begin(), blueprint.end(), is_struct); + if (not has_structs) { + return std::move(flattened); // Unchanged. + } + + // There be struct columns. + // Note: Requires null vectors for all struct input columns. + auto flattened_columns = flattened->release(); + auto current_idx = column_index_t{0}; + + auto unflattening_iter = + thrust::make_transform_iterator(blueprint.begin(), unflattener{flattened_columns, current_idx}); + + return std::make_unique( + vector_of_columns{unflattening_iter, unflattening_iter + blueprint.num_columns()}); +} + // Helper function to superimpose validity of parent struct // over the specified member (child) column. void superimpose_parent_nulls(bitmask_type const* parent_null_mask, @@ -187,8 +303,7 @@ void superimpose_parent_nulls(bitmask_type const* parent_null_mask, { if (!child.nullable()) { // Child currently has no null mask. Copy parent's null mask. - child.set_null_mask(rmm::device_buffer{ - parent_null_mask, cudf::bitmask_allocation_size_bytes(child.size()), stream, mr}); + child.set_null_mask(cudf::detail::copy_bitmask(parent_null_mask, 0, child.size(), stream, mr)); child.set_null_count(parent_null_count); } else { // Child should have a null mask. diff --git a/cpp/src/structs/utilities.hpp b/cpp/src/structs/utilities.hpp index eee9ca63146..a68f09574ce 100644 --- a/cpp/src/structs/utilities.hpp +++ b/cpp/src/structs/utilities.hpp @@ -76,6 +76,35 @@ flatten_nested_columns(table_view const& input, std::vector const& null_precedence, column_nullability nullability = column_nullability::MATCH_INCOMING); +/** + * @brief Unflatten columns flattened as by `flatten_nested_columns()`, + * based on the provided `blueprint`. + * + * cudf::flatten_nested_columns() executes depth first, and serializes the struct null vector + * before the child/member columns. + * E.g. STRUCT_1< STRUCT_2< A, B >, C > is flattened to: + * 1. Null Vector for STRUCT_1 + * 2. Null Vector for STRUCT_2 + * 3. Member STRUCT_2::A + * 4. Member STRUCT_2::B + * 5. Member STRUCT_1::C + * + * `unflatten_nested_columns()` reconstructs nested columns from flattened input that follows + * the convention above. + * + * Note: This function requires a null-mask vector for each STRUCT column, including for nested + * STRUCT members. + * + * @param flattened "Flattened" `table` of input columns, following the conventions in + * `flatten_nested_columns()`. + * @param blueprint The exemplar `table_view` with nested columns intact, whose structure defines + * the nesting of the reconstructed output table. + * @return std::unique_ptr Unflattened table (with nested STRUCT columns) reconstructed + * based on `blueprint`. 
+ */ +std::unique_ptr unflatten_nested_columns(std::unique_ptr&& flattened, + table_view const& blueprint); + /** * @brief Pushdown nulls from a parent mask into a child column, using AND. * diff --git a/cpp/src/text/subword/load_hash_file.cu b/cpp/src/text/subword/load_hash_file.cu index 3800339a6a2..b2230f95842 100644 --- a/cpp/src/text/subword/load_hash_file.cu +++ b/cpp/src/text/subword/load_hash_file.cu @@ -26,6 +26,7 @@ #include #include +#include #include #include @@ -40,11 +41,11 @@ namespace { struct get_codepoint_metadata_init { rmm::cuda_stream_view stream; - codepoint_metadata_type* operator()() const + rmm::device_uvector* operator()() const { - codepoint_metadata_type* table = - static_cast(rmm::mr::get_current_device_resource()->allocate( - codepoint_metadata_size * sizeof(codepoint_metadata_type), stream)); + auto table_vector = + new rmm::device_uvector(codepoint_metadata_size, stream); + auto table = table_vector->data(); thrust::fill(rmm::exec_policy(stream), table + cp_section1_end, table + codepoint_metadata_size, @@ -60,18 +61,18 @@ struct get_codepoint_metadata_init { (cp_section2_end - cp_section2_begin + 1) * sizeof(codepoint_metadata[0]), // 2nd section cudaMemcpyHostToDevice, stream.value())); - return table; + return table_vector; }; }; struct get_aux_codepoint_data_init { rmm::cuda_stream_view stream; - aux_codepoint_data_type* operator()() const + rmm::device_uvector* operator()() const { - aux_codepoint_data_type* table = - static_cast(rmm::mr::get_current_device_resource()->allocate( - aux_codepoint_data_size * sizeof(aux_codepoint_data_type), stream)); + auto table_vector = + new rmm::device_uvector(aux_codepoint_data_size, stream); + auto table = table_vector->data(); thrust::fill(rmm::exec_policy(stream), table + aux_section1_end, table + aux_codepoint_data_size, @@ -99,7 +100,7 @@ struct get_aux_codepoint_data_init { (aux_section4_end - aux_section4_begin + 1) * sizeof(aux_codepoint_data[0]), // 4th section cudaMemcpyHostToDevice, stream.value())); - return table; + return table_vector; } }; } // namespace @@ -112,11 +113,11 @@ struct get_aux_codepoint_data_init { */ const codepoint_metadata_type* get_codepoint_metadata(rmm::cuda_stream_view stream) { - static cudf::strings::detail::thread_safe_per_context_cache + static cudf::strings::detail::thread_safe_per_context_cache< + rmm::device_uvector> g_codepoint_metadata; - get_codepoint_metadata_init function = {stream}; - return g_codepoint_metadata.find_or_initialize(function); + return g_codepoint_metadata.find_or_initialize(get_codepoint_metadata_init{stream})->data(); } /** @@ -127,10 +128,11 @@ const codepoint_metadata_type* get_codepoint_metadata(rmm::cuda_stream_view stre */ const aux_codepoint_data_type* get_aux_codepoint_data(rmm::cuda_stream_view stream) { - static cudf::strings::detail::thread_safe_per_context_cache + static cudf::strings::detail::thread_safe_per_context_cache< + rmm::device_uvector> g_aux_codepoint_data; - get_aux_codepoint_data_init function = {stream}; - return g_aux_codepoint_data.find_or_initialize(function); + + return g_aux_codepoint_data.find_or_initialize(get_aux_codepoint_data_init{stream})->data(); } namespace { diff --git a/cpp/src/text/subword/subword_tokenize.cu b/cpp/src/text/subword/subword_tokenize.cu index 8c14f89d4d0..6de1044b492 100644 --- a/cpp/src/text/subword/subword_tokenize.cu +++ b/cpp/src/text/subword/subword_tokenize.cu @@ -172,8 +172,10 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings, thrust::make_counting_iterator(0), 
thrust::make_counting_iterator(strings_count + 1), offsets_per_tensor.begin(), - [device_offsets, do_truncate, max_sequence_length, stride] __device__(cudf::size_type idx) { - uint32_t num_tokens = device_offsets[idx + 1] - device_offsets[idx]; + [device_offsets, do_truncate, max_sequence_length, stride, strings_count] __device__( + cudf::size_type idx) { + uint32_t const num_tokens = + idx < strings_count ? device_offsets[idx + 1] - device_offsets[idx] : 0; if (do_truncate || num_tokens <= max_sequence_length) return uint32_t{1}; return 1 + ((num_tokens - max_sequence_length + stride - 1) / stride); }, diff --git a/cpp/src/ast/transform.cu b/cpp/src/transform/compute_column.cu similarity index 68% rename from cpp/src/ast/transform.cu rename to cpp/src/transform/compute_column.cu index d6426f92002..1466ee9ad27 100644 --- a/cpp/src/ast/transform.cu +++ b/cpp/src/transform/compute_column.cu @@ -14,17 +14,18 @@ * limitations under the License. */ -#include -#include -#include -#include +#include +#include +#include #include #include #include +#include #include #include #include #include +#include #include #include #include @@ -33,7 +34,6 @@ #include namespace cudf { -namespace ast { namespace detail { /** @@ -47,13 +47,14 @@ namespace detail { * @tparam has_nulls whether or not the output column may contain nulls. * * @param table The table device view used for evaluation. - * @param plan Container of device data required to evaluate the desired expression. + * @param device_expression_data Container of device data required to evaluate the desired + * expression. * @param output_column The destination for the results of evaluating the expression. */ template __launch_bounds__(max_block_size) __global__ void compute_column_kernel(table_device_view const table, - device_ast_plan plan, + ast::detail::expression_device_view device_expression_data, mutable_column_device_view output_column) { // The (required) extern storage of the shared memory array leads to @@ -61,23 +62,24 @@ __launch_bounds__(max_block_size) __global__ // workaround is to declare an arbitrary (here char) array type then cast it // after the fact to the appropriate type. 
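The workaround referred to in this comment is a common CUDA idiom; here is a minimal standalone kernel showing it in isolation. This sketch is not from the patch, and the `Intermediate` alias merely stands in for the kernel's per-thread intermediate type.

// --- begin illustrative sketch (not part of the patch) ---
#include <cstddef>
#include <cstdint>

using Intermediate = std::uint64_t;  // stand-in for the real intermediate type

__global__ void shared_storage_example(std::size_t num_intermediates)
{
  // Only one unsized extern __shared__ array may be declared per kernel, and
  // it cannot vary with a template parameter, so it is declared as char...
  extern __shared__ char raw_storage[];
  // ...and reinterpreted to the element type the kernel actually needs.
  auto* storage = reinterpret_cast<Intermediate*>(raw_storage);

  // Each thread owns a contiguous slice of the dynamic shared buffer.
  Intermediate* thread_storage = storage + threadIdx.x * num_intermediates;
  (void)thread_storage;
}

// Launch side: the dynamic shared memory size (third launch parameter) must
// cover block_size * num_intermediates elements of Intermediate, mirroring the
// shmem_per_thread * num_threads_per_block computation in the kernel launch
// configuration below.
// --- end illustrative sketch ---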
extern __shared__ char raw_intermediate_storage[]; - IntermediateDataType* intermediate_storage = - reinterpret_cast*>(raw_intermediate_storage); + ast::detail::IntermediateDataType* intermediate_storage = + reinterpret_cast*>(raw_intermediate_storage); - auto thread_intermediate_storage = &intermediate_storage[threadIdx.x * plan.num_intermediates]; + auto thread_intermediate_storage = + &intermediate_storage[threadIdx.x * device_expression_data.num_intermediates]; auto const start_idx = static_cast(threadIdx.x + blockIdx.x * blockDim.x); auto const stride = static_cast(blockDim.x * gridDim.x); - auto evaluator = - cudf::ast::detail::expression_evaluator(table, plan, thread_intermediate_storage); + auto evaluator = cudf::ast::detail::expression_evaluator( + table, device_expression_data, thread_intermediate_storage); for (cudf::size_type row_index = start_idx; row_index < table.num_rows(); row_index += stride) { - auto output_dest = mutable_column_expression_result(output_column); + auto output_dest = ast::detail::mutable_column_expression_result(output_column); evaluator.evaluate(output_dest, row_index); } } -std::unique_ptr compute_column(table_view const table, - expression const& expr, +std::unique_ptr compute_column(table_view const& table, + ast::expression const& expr, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -89,19 +91,19 @@ std::unique_ptr compute_column(table_view const table, auto const nullable = cudf::nullable(table); auto const has_nulls = nullable && cudf::has_nulls(table); - auto const plan = ast_plan{expr, table, has_nulls, stream, mr}; + auto const parser = ast::detail::expression_parser{expr, table, has_nulls, stream, mr}; auto const output_column_mask_state = nullable ? (has_nulls ? mask_state::UNINITIALIZED : mask_state::ALL_VALID) : mask_state::UNALLOCATED; auto output_column = cudf::make_fixed_width_column( - plan.output_type(), table.num_rows(), output_column_mask_state, stream, mr); + parser.output_type(), table.num_rows(), output_column_mask_state, stream, mr); auto mutable_output_device = cudf::mutable_column_device_view::create(output_column->mutable_view(), stream); // Configure kernel parameters - auto const& dev_plan = plan.dev_plan; + auto const& device_expression_data = parser.device_expression_data; int device_id; CUDA_TRY(cudaGetDevice(&device_id)); int shmem_limit_per_block; @@ -109,22 +111,23 @@ std::unique_ptr compute_column(table_view const table, cudaDeviceGetAttribute(&shmem_limit_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); auto constexpr MAX_BLOCK_SIZE = 128; auto const block_size = - dev_plan.shmem_per_thread != 0 - ? std::min(MAX_BLOCK_SIZE, shmem_limit_per_block / dev_plan.shmem_per_thread) + device_expression_data.shmem_per_thread != 0 + ? 
std::min(MAX_BLOCK_SIZE, shmem_limit_per_block / device_expression_data.shmem_per_thread) : MAX_BLOCK_SIZE; - auto const config = cudf::detail::grid_1d{table.num_rows(), block_size}; - auto const shmem_per_block = dev_plan.shmem_per_thread * config.num_threads_per_block; + auto const config = cudf::detail::grid_1d{table.num_rows(), block_size}; + auto const shmem_per_block = + device_expression_data.shmem_per_thread * config.num_threads_per_block; // Execute the kernel auto table_device = table_device_view::create(table, stream); if (has_nulls) { - cudf::ast::detail::compute_column_kernel + cudf::detail::compute_column_kernel <<>>( - *table_device, dev_plan, *mutable_output_device); + *table_device, device_expression_data, *mutable_output_device); } else { - cudf::ast::detail::compute_column_kernel + cudf::detail::compute_column_kernel <<>>( - *table_device, dev_plan, *mutable_output_device); + *table_device, device_expression_data, *mutable_output_device); } CHECK_CUDA(stream.value()); return output_column; @@ -132,14 +135,12 @@ std::unique_ptr compute_column(table_view const table, } // namespace detail -std::unique_ptr compute_column(table_view const table, - expression const& expr, +std::unique_ptr compute_column(table_view const& table, + ast::expression const& expr, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); return detail::compute_column(table, expr, rmm::cuda_stream_default, mr); } -} // namespace ast - } // namespace cudf diff --git a/cpp/src/transpose/transpose.cu b/cpp/src/transpose/transpose.cu index 67a06e60dd3..5bc2cb21ac7 100644 --- a/cpp/src/transpose/transpose.cu +++ b/cpp/src/transpose/transpose.cu @@ -16,9 +16,9 @@ #include #include #include +#include #include #include -#include #include #include #include @@ -44,7 +44,7 @@ std::pair, table_view> transpose(table_view const& input input.begin(), input.end(), [dtype](auto const& col) { return dtype == col.type(); }), "Column type mismatch"); - auto output_column = cudf::interleave_columns(input, mr); + auto output_column = cudf::detail::interleave_columns(input, stream, mr); auto one_iter = thrust::make_counting_iterator(1); auto splits_iter = thrust::make_transform_iterator( one_iter, [width = input.num_columns()](size_type idx) { return idx * width; }); diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index c82826b8c60..19421e3115d 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -379,6 +379,7 @@ ConfigureTest(STRINGS_TEST # - structs test ---------------------------------------------------------------------------------- ConfigureTest(STRUCTS_TEST structs/structs_column_tests.cpp + structs/utilities_tests.cpp ) ################################################################################################### diff --git a/cpp/tests/ast/transform_tests.cpp b/cpp/tests/ast/transform_tests.cpp index 738c58c32b8..de6c9d486ec 100644 --- a/cpp/tests/ast/transform_tests.cpp +++ b/cpp/tests/ast/transform_tests.cpp @@ -14,8 +14,7 @@ * limitations under the License. 
*/ -#include -#include +#include #include #include #include @@ -24,6 +23,7 @@ #include #include #include +#include #include #include @@ -47,6 +47,35 @@ constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_leve struct TransformTest : public cudf::test::BaseFixture { }; +TEST_F(TransformTest, ColumnReference) +{ + auto c_0 = column_wrapper{3, 20, 1, 50}; + auto c_1 = column_wrapper{10, 7, 20, 0}; + auto table = cudf::table_view{{c_0, c_1}}; + + auto col_ref_0 = cudf::ast::column_reference(0); + + auto const& expected = c_0; + auto result = cudf::compute_column(table, col_ref_0); + + cudf::test::expect_columns_equal(expected, result->view(), verbosity); +} + +TEST_F(TransformTest, Literal) +{ + auto c_0 = column_wrapper{3, 20, 1, 50}; + auto c_1 = column_wrapper{10, 7, 20, 0}; + auto table = cudf::table_view{{c_0, c_1}}; + + auto literal_value = cudf::numeric_scalar(42); + auto literal = cudf::ast::literal(literal_value); + + auto expected = column_wrapper{42, 42, 42, 42}; + auto result = cudf::compute_column(table, literal); + + cudf::test::expect_columns_equal(expected, result->view(), verbosity); +} + TEST_F(TransformTest, BasicAddition) { auto c_0 = column_wrapper{3, 20, 1, 50}; @@ -55,10 +84,10 @@ TEST_F(TransformTest, BasicAddition) auto col_ref_0 = cudf::ast::column_reference(0); auto col_ref_1 = cudf::ast::column_reference(1); - auto expression = cudf::ast::expression(cudf::ast::ast_operator::ADD, col_ref_0, col_ref_1); + auto expression = cudf::ast::operation(cudf::ast::ast_operator::ADD, col_ref_0, col_ref_1); auto expected = column_wrapper{13, 27, 21, 50}; - auto result = cudf::ast::compute_column(table, expression); + auto result = cudf::compute_column(table, expression); cudf::test::expect_columns_equal(expected, result->view(), verbosity); } @@ -70,11 +99,11 @@ TEST_F(TransformTest, BasicAdditionLarge) auto table = cudf::table_view{{col, col}}; auto col_ref = cudf::ast::column_reference(0); - auto expression = cudf::ast::expression(cudf::ast::ast_operator::ADD, col_ref, col_ref); + auto expression = cudf::ast::operation(cudf::ast::ast_operator::ADD, col_ref, col_ref); auto b = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i * 2; }); auto expected = column_wrapper(b, b + 2000); - auto result = cudf::ast::compute_column(table, expression); + auto result = cudf::compute_column(table, expression); cudf::test::expect_columns_equal(expected, result->view(), verbosity); } @@ -87,10 +116,10 @@ TEST_F(TransformTest, LessComparator) auto col_ref_0 = cudf::ast::column_reference(0); auto col_ref_1 = cudf::ast::column_reference(1); - auto expression = cudf::ast::expression(cudf::ast::ast_operator::LESS, col_ref_0, col_ref_1); + auto expression = cudf::ast::operation(cudf::ast::ast_operator::LESS, col_ref_0, col_ref_1); auto expected = column_wrapper{true, false, true, false}; - auto result = cudf::ast::compute_column(table, expression); + auto result = cudf::compute_column(table, expression); cudf::test::expect_columns_equal(expected, result->view(), verbosity); } @@ -105,11 +134,11 @@ TEST_F(TransformTest, LessComparatorLarge) auto col_ref_0 = cudf::ast::column_reference(0); auto col_ref_1 = cudf::ast::column_reference(1); - auto expression = cudf::ast::expression(cudf::ast::ast_operator::LESS, col_ref_0, col_ref_1); + auto expression = cudf::ast::operation(cudf::ast::ast_operator::LESS, col_ref_0, col_ref_1); auto c = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i < 500; }); auto expected = column_wrapper(c, c + 2000); 
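These test updates all follow the same mechanical rename, so a compact sketch of the post-rename usage may help when reading them. It is assembled only from calls that already appear in this patch (cudf::ast::column_reference, cudf::ast::operation, cudf::compute_column); the header paths are assumptions and may differ on this branch.

// --- begin illustrative sketch (not part of the patch) ---
#include <cudf/ast/expressions.hpp>      // assumed header for ast::operation
#include <cudf/column/column.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/transform.hpp>            // assumed header for cudf::compute_column
#include <cudf_test/column_wrapper.hpp>

#include <memory>

std::unique_ptr<cudf::column> add_two_columns_example()
{
  auto c_0   = cudf::test::fixed_width_column_wrapper<int32_t>{3, 20, 1, 50};
  auto c_1   = cudf::test::fixed_width_column_wrapper<int32_t>{10, 7, 20, 0};
  auto table = cudf::table_view{{c_0, c_1}};

  // Expression nodes are now built with cudf::ast::operation (previously
  // cudf::ast::expression)...
  auto col_ref_0 = cudf::ast::column_reference(0);
  auto col_ref_1 = cudf::ast::column_reference(1);
  auto expr      = cudf::ast::operation(cudf::ast::ast_operator::ADD, col_ref_0, col_ref_1);

  // ...and evaluated with cudf::compute_column (previously
  // cudf::ast::compute_column); for this input the result is {13, 27, 21, 50}.
  return cudf::compute_column(table, expr);
}
// --- end illustrative sketch ---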
- auto result = cudf::ast::compute_column(table, expression); + auto result = cudf::compute_column(table, expression); cudf::test::expect_columns_equal(expected, result->view(), verbosity); } @@ -126,15 +155,15 @@ TEST_F(TransformTest, MultiLevelTreeArithmetic) auto col_ref_2 = cudf::ast::column_reference(2); auto expression_left_subtree = - cudf::ast::expression(cudf::ast::ast_operator::ADD, col_ref_0, col_ref_1); + cudf::ast::operation(cudf::ast::ast_operator::ADD, col_ref_0, col_ref_1); auto expression_right_subtree = - cudf::ast::expression(cudf::ast::ast_operator::SUB, col_ref_2, col_ref_0); + cudf::ast::operation(cudf::ast::ast_operator::SUB, col_ref_2, col_ref_0); - auto expression_tree = cudf::ast::expression( + auto expression_tree = cudf::ast::operation( cudf::ast::ast_operator::ADD, expression_left_subtree, expression_right_subtree); - auto result = cudf::ast::compute_column(table, expression_tree); + auto result = cudf::compute_column(table, expression_tree); auto expected = column_wrapper{7, 73, 22, -99}; cudf::test::expect_columns_equal(expected, result->view(), verbosity); @@ -142,8 +171,6 @@ TEST_F(TransformTest, MultiLevelTreeArithmetic) TEST_F(TransformTest, MultiLevelTreeArithmeticLarge) { - using namespace cudf::ast; - auto a = thrust::make_counting_iterator(0); auto b = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i + 1; }); auto c = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i * 2; }); @@ -152,15 +179,17 @@ TEST_F(TransformTest, MultiLevelTreeArithmeticLarge) auto c_2 = column_wrapper(c, c + 2000); auto table = cudf::table_view{{c_0, c_1, c_2}}; - auto col_ref_0 = column_reference(0); - auto col_ref_1 = column_reference(1); - auto col_ref_2 = column_reference(2); + auto col_ref_0 = cudf::ast::column_reference(0); + auto col_ref_1 = cudf::ast::column_reference(1); + auto col_ref_2 = cudf::ast::column_reference(2); - auto expr_left_subtree = expression(cudf::ast::ast_operator::MUL, col_ref_0, col_ref_1); - auto expr_right_subtree = expression(cudf::ast::ast_operator::ADD, col_ref_2, col_ref_0); - auto expr_tree = expression(ast_operator::SUB, expr_left_subtree, expr_right_subtree); + auto expr_left_subtree = cudf::ast::operation(cudf::ast::ast_operator::MUL, col_ref_0, col_ref_1); + auto expr_right_subtree = + cudf::ast::operation(cudf::ast::ast_operator::ADD, col_ref_2, col_ref_0); + auto expr_tree = + cudf::ast::operation(cudf::ast::ast_operator::SUB, expr_left_subtree, expr_right_subtree); - auto result = cudf::ast::compute_column(table, expr_tree); + auto result = cudf::compute_column(table, expr_tree); auto calc = [](auto i) { return (i * (i + 1)) - (i + (i * 2)); }; auto d = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { return calc(i); }); auto expected = column_wrapper(d, d + 2000); @@ -180,12 +209,12 @@ TEST_F(TransformTest, ImbalancedTreeArithmetic) auto col_ref_2 = cudf::ast::column_reference(2); auto expression_right_subtree = - cudf::ast::expression(cudf::ast::ast_operator::MUL, col_ref_0, col_ref_1); + cudf::ast::operation(cudf::ast::ast_operator::MUL, col_ref_0, col_ref_1); auto expression_tree = - cudf::ast::expression(cudf::ast::ast_operator::SUB, col_ref_2, expression_right_subtree); + cudf::ast::operation(cudf::ast::ast_operator::SUB, col_ref_2, expression_right_subtree); - auto result = cudf::ast::compute_column(table, expression_tree); + auto result = cudf::compute_column(table, expression_tree); auto expected = column_wrapper{0.6, std::numeric_limits::infinity(), -3.201, 
-2099.18}; @@ -204,15 +233,15 @@ TEST_F(TransformTest, MultiLevelTreeComparator) auto col_ref_2 = cudf::ast::column_reference(2); auto expression_left_subtree = - cudf::ast::expression(cudf::ast::ast_operator::GREATER_EQUAL, col_ref_0, col_ref_1); + cudf::ast::operation(cudf::ast::ast_operator::GREATER_EQUAL, col_ref_0, col_ref_1); auto expression_right_subtree = - cudf::ast::expression(cudf::ast::ast_operator::GREATER, col_ref_2, col_ref_0); + cudf::ast::operation(cudf::ast::ast_operator::GREATER, col_ref_2, col_ref_0); - auto expression_tree = cudf::ast::expression( + auto expression_tree = cudf::ast::operation( cudf::ast::ast_operator::LOGICAL_AND, expression_left_subtree, expression_right_subtree); - auto result = cudf::ast::compute_column(table, expression_tree); + auto result = cudf::compute_column(table, expression_tree); auto expected = column_wrapper{false, true, false, false}; cudf::test::expect_columns_equal(expected, result->view(), verbosity); @@ -228,13 +257,13 @@ TEST_F(TransformTest, MultiTypeOperationFailure) auto col_ref_1 = cudf::ast::column_reference(1); auto expression_0_plus_1 = - cudf::ast::expression(cudf::ast::ast_operator::ADD, col_ref_0, col_ref_1); + cudf::ast::operation(cudf::ast::ast_operator::ADD, col_ref_0, col_ref_1); auto expression_1_plus_0 = - cudf::ast::expression(cudf::ast::ast_operator::ADD, col_ref_1, col_ref_0); + cudf::ast::operation(cudf::ast::ast_operator::ADD, col_ref_1, col_ref_0); // Operations on different types are not allowed - EXPECT_THROW(cudf::ast::compute_column(table, expression_0_plus_1), cudf::logic_error); - EXPECT_THROW(cudf::ast::compute_column(table, expression_1_plus_0), cudf::logic_error); + EXPECT_THROW(cudf::compute_column(table, expression_0_plus_1), cudf::logic_error); + EXPECT_THROW(cudf::compute_column(table, expression_1_plus_0), cudf::logic_error); } TEST_F(TransformTest, LiteralComparison) @@ -246,9 +275,9 @@ TEST_F(TransformTest, LiteralComparison) auto literal_value = cudf::numeric_scalar(41); auto literal = cudf::ast::literal(literal_value); - auto expression = cudf::ast::expression(cudf::ast::ast_operator::GREATER, col_ref_0, literal); + auto expression = cudf::ast::operation(cudf::ast::ast_operator::GREATER, col_ref_0, literal); - auto result = cudf::ast::compute_column(table, expression); + auto result = cudf::compute_column(table, expression); auto expected = column_wrapper{false, false, false, true}; cudf::test::expect_columns_equal(expected, result->view(), verbosity); @@ -261,9 +290,9 @@ TEST_F(TransformTest, UnaryNot) auto col_ref_0 = cudf::ast::column_reference(0); - auto expression = cudf::ast::expression(cudf::ast::ast_operator::NOT, col_ref_0); + auto expression = cudf::ast::operation(cudf::ast::ast_operator::NOT, col_ref_0); - auto result = cudf::ast::compute_column(table, expression); + auto result = cudf::compute_column(table, expression); auto expected = column_wrapper{false, true, false, false}; cudf::test::expect_columns_equal(expected, result->view(), verbosity); @@ -277,26 +306,26 @@ TEST_F(TransformTest, UnaryTrigonometry) auto col_ref_0 = cudf::ast::column_reference(0); auto expected_sin = column_wrapper{0.0, std::sqrt(2) / 2, std::sqrt(3.0) / 2.0}; - auto expression_sin = cudf::ast::expression(cudf::ast::ast_operator::SIN, col_ref_0); - auto result_sin = cudf::ast::compute_column(table, expression_sin); + auto expression_sin = cudf::ast::operation(cudf::ast::ast_operator::SIN, col_ref_0); + auto result_sin = cudf::compute_column(table, expression_sin); 
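The literal-based tests in this file (LiteralComparison above, CopyLiteral and TrueDiv below) share one construction pattern; the brief sketch below restates it using only calls that appear in this patch. Header paths are assumptions, and the scalar is kept in scope alongside the literal node, just as the tests themselves do.

// --- begin illustrative sketch (not part of the patch) ---
#include <cudf/ast/expressions.hpp>      // assumed header locations
#include <cudf/column/column.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/transform.hpp>
#include <cudf_test/column_wrapper.hpp>

#include <memory>

std::unique_ptr<cudf::column> greater_than_literal_example()
{
  auto c_0   = cudf::test::fixed_width_column_wrapper<int32_t>{3, 0, 1, 50};
  auto table = cudf::table_view{{c_0}};

  // The scalar backs the literal node, so it stays alive for the evaluation.
  auto literal_value = cudf::numeric_scalar<int32_t>(41);
  auto literal       = cudf::ast::literal(literal_value);
  auto col_ref_0     = cudf::ast::column_reference(0);
  auto expr = cudf::ast::operation(cudf::ast::ast_operator::GREATER, col_ref_0, literal);

  // BOOL8 result: {false, false, false, true} for this input.
  return cudf::compute_column(table, expr);
}
// --- end illustrative sketch ---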
cudf::test::expect_columns_equivalent(expected_sin, result_sin->view(), verbosity); auto expected_cos = column_wrapper{1.0, std::sqrt(2) / 2, 0.5}; - auto expression_cos = cudf::ast::expression(cudf::ast::ast_operator::COS, col_ref_0); - auto result_cos = cudf::ast::compute_column(table, expression_cos); + auto expression_cos = cudf::ast::operation(cudf::ast::ast_operator::COS, col_ref_0); + auto result_cos = cudf::compute_column(table, expression_cos); cudf::test::expect_columns_equivalent(expected_cos, result_cos->view(), verbosity); auto expected_tan = column_wrapper{0.0, 1.0, std::sqrt(3.0)}; - auto expression_tan = cudf::ast::expression(cudf::ast::ast_operator::TAN, col_ref_0); - auto result_tan = cudf::ast::compute_column(table, expression_tan); + auto expression_tan = cudf::ast::operation(cudf::ast::ast_operator::TAN, col_ref_0); + auto result_tan = cudf::compute_column(table, expression_tan); cudf::test::expect_columns_equivalent(expected_tan, result_tan->view(), verbosity); } TEST_F(TransformTest, ArityCheckFailure) { auto col_ref_0 = cudf::ast::column_reference(0); - EXPECT_THROW(cudf::ast::expression(cudf::ast::ast_operator::ADD, col_ref_0), cudf::logic_error); - EXPECT_THROW(cudf::ast::expression(cudf::ast::ast_operator::ABS, col_ref_0, col_ref_0), + EXPECT_THROW(cudf::ast::operation(cudf::ast::ast_operator::ADD, col_ref_0), cudf::logic_error); + EXPECT_THROW(cudf::ast::operation(cudf::ast::ast_operator::ABS, col_ref_0, col_ref_0), cudf::logic_error); } @@ -308,10 +337,10 @@ TEST_F(TransformTest, StringComparison) auto col_ref_0 = cudf::ast::column_reference(0); auto col_ref_1 = cudf::ast::column_reference(1); - auto expression = cudf::ast::expression(cudf::ast::ast_operator::LESS, col_ref_0, col_ref_1); + auto expression = cudf::ast::operation(cudf::ast::ast_operator::LESS, col_ref_0, col_ref_1); auto expected = column_wrapper{true, false, true, false}; - auto result = cudf::ast::compute_column(table, expression); + auto result = cudf::compute_column(table, expression); cudf::test::expect_columns_equal(expected, result->view(), verbosity); } @@ -322,9 +351,9 @@ TEST_F(TransformTest, CopyColumn) auto table = cudf::table_view{{c_0}}; auto col_ref_0 = cudf::ast::column_reference(0); - auto expression = cudf::ast::expression(cudf::ast::ast_operator::IDENTITY, col_ref_0); + auto expression = cudf::ast::operation(cudf::ast::ast_operator::IDENTITY, col_ref_0); - auto result = cudf::ast::compute_column(table, expression); + auto result = cudf::compute_column(table, expression); auto expected = column_wrapper{3, 0, 1, 50}; cudf::test::expect_columns_equal(expected, result->view(), verbosity); @@ -338,9 +367,9 @@ TEST_F(TransformTest, CopyLiteral) auto literal_value = cudf::numeric_scalar(-123); auto literal = cudf::ast::literal(literal_value); - auto expression = cudf::ast::expression(cudf::ast::ast_operator::IDENTITY, literal); + auto expression = cudf::ast::operation(cudf::ast::ast_operator::IDENTITY, literal); - auto result = cudf::ast::compute_column(table, expression); + auto result = cudf::compute_column(table, expression); auto expected = column_wrapper{-123, -123, -123, -123}; cudf::test::expect_columns_equal(expected, result->view(), verbosity); @@ -355,9 +384,9 @@ TEST_F(TransformTest, TrueDiv) auto literal_value = cudf::numeric_scalar(2); auto literal = cudf::ast::literal(literal_value); - auto expression = cudf::ast::expression(cudf::ast::ast_operator::TRUE_DIV, col_ref_0, literal); + auto expression = cudf::ast::operation(cudf::ast::ast_operator::TRUE_DIV, col_ref_0, 
literal); - auto result = cudf::ast::compute_column(table, expression); + auto result = cudf::compute_column(table, expression); auto expected = column_wrapper{1.5, 0.0, 0.5, 25.0}; cudf::test::expect_columns_equal(expected, result->view(), verbosity); @@ -372,9 +401,9 @@ TEST_F(TransformTest, FloorDiv) auto literal_value = cudf::numeric_scalar(2.0); auto literal = cudf::ast::literal(literal_value); - auto expression = cudf::ast::expression(cudf::ast::ast_operator::FLOOR_DIV, col_ref_0, literal); + auto expression = cudf::ast::operation(cudf::ast::ast_operator::FLOOR_DIV, col_ref_0, literal); - auto result = cudf::ast::compute_column(table, expression); + auto result = cudf::compute_column(table, expression); auto expected = column_wrapper{1.0, 0.0, 0.0, 25.0}; cudf::test::expect_columns_equal(expected, result->view(), verbosity); @@ -389,9 +418,9 @@ TEST_F(TransformTest, Mod) auto literal_value = cudf::numeric_scalar(2.0); auto literal = cudf::ast::literal(literal_value); - auto expression = cudf::ast::expression(cudf::ast::ast_operator::MOD, col_ref_0, literal); + auto expression = cudf::ast::operation(cudf::ast::ast_operator::MOD, col_ref_0, literal); - auto result = cudf::ast::compute_column(table, expression); + auto result = cudf::compute_column(table, expression); auto expected = column_wrapper{1.0, 0.0, -1.0, 0.0}; cudf::test::expect_columns_equal(expected, result->view(), verbosity); @@ -406,9 +435,9 @@ TEST_F(TransformTest, PyMod) auto literal_value = cudf::numeric_scalar(2.0); auto literal = cudf::ast::literal(literal_value); - auto expression = cudf::ast::expression(cudf::ast::ast_operator::PYMOD, col_ref_0, literal); + auto expression = cudf::ast::operation(cudf::ast::ast_operator::PYMOD, col_ref_0, literal); - auto result = cudf::ast::compute_column(table, expression); + auto result = cudf::compute_column(table, expression); auto expected = column_wrapper{1.0, 0.0, 1.0, 0.0}; cudf::test::expect_columns_equal(expected, result->view(), verbosity); @@ -422,10 +451,10 @@ TEST_F(TransformTest, BasicAdditionNulls) auto col_ref_0 = cudf::ast::column_reference(0); auto col_ref_1 = cudf::ast::column_reference(1); - auto expression = cudf::ast::expression(cudf::ast::ast_operator::ADD, col_ref_0, col_ref_1); + auto expression = cudf::ast::operation(cudf::ast::ast_operator::ADD, col_ref_0, col_ref_1); auto expected = column_wrapper{{0, 0, 0, 50}, {0, 0, 0, 1}}; - auto result = cudf::ast::compute_column(table, expression); + auto result = cudf::compute_column(table, expression); cudf::test::expect_columns_equal(expected, result->view(), verbosity); } @@ -447,11 +476,11 @@ TEST_F(TransformTest, BasicAdditionLargeNulls) auto table = cudf::table_view{{col}}; auto col_ref = cudf::ast::column_reference(0); - auto expression = cudf::ast::expression(cudf::ast::ast_operator::ADD, col_ref, col_ref); + auto expression = cudf::ast::operation(cudf::ast::ast_operator::ADD, col_ref, col_ref); auto b = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i * 2; }); auto expected = column_wrapper(b, b + N, validities.begin()); - auto result = cudf::ast::compute_column(table, expression); + auto result = cudf::compute_column(table, expression); cudf::test::expect_columns_equal(expected, result->view(), verbosity); } diff --git a/cpp/tests/copying/concatenate_tests.cu b/cpp/tests/copying/concatenate_tests.cu index 7d3b7beb2cb..c48f7ad4dbc 100644 --- a/cpp/tests/copying/concatenate_tests.cu +++ b/cpp/tests/copying/concatenate_tests.cu @@ -48,8 +48,6 @@ using Table = cudf::table; template 
struct TypedColumnTest : public cudf::test::BaseFixture { - static std::size_t data_size() { return 1000; } - static std::size_t mask_size() { return 100; } cudf::data_type type() { return cudf::data_type{cudf::type_to_id()}; } TypedColumnTest(rmm::cuda_stream_view stream = rmm::cuda_stream_default) @@ -58,14 +56,14 @@ struct TypedColumnTest : public cudf::test::BaseFixture { { auto typed_data = static_cast(data.data()); auto typed_mask = static_cast(mask.data()); - std::vector h_data(data_size()); + std::vector h_data(data.size()); std::iota(h_data.begin(), h_data.end(), char{0}); - std::vector h_mask(mask_size()); + std::vector h_mask(mask.size()); std::iota(h_mask.begin(), h_mask.end(), char{0}); CUDA_TRY(cudaMemcpyAsync( - typed_data, h_data.data(), data_size(), cudaMemcpyHostToDevice, stream.value())); + typed_data, h_data.data(), data.size(), cudaMemcpyHostToDevice, stream.value())); CUDA_TRY(cudaMemcpyAsync( - typed_mask, h_mask.data(), mask_size(), cudaMemcpyHostToDevice, stream.value())); + typed_mask, h_mask.data(), mask.size(), cudaMemcpyHostToDevice, stream.value())); stream.synchronize(); } @@ -484,7 +482,7 @@ TEST_F(OverflowTest, Presliced) auto offset_gen = cudf::detail::make_counting_transform_iterator( 0, [string_size](size_type index) { return index * string_size; }); cudf::test::fixed_width_column_wrapper offsets(offset_gen, offset_gen + num_rows + 1); - auto many_chars = cudf::make_fixed_width_column(data_type{type_id::INT8}, num_rows); + auto many_chars = cudf::make_fixed_width_column(data_type{type_id::INT8}, total_chars_size); auto col = cudf::make_strings_column( num_rows, offsets.release(), std::move(many_chars), 0, rmm::device_buffer{}); @@ -515,7 +513,7 @@ TEST_F(OverflowTest, Presliced) offsets->view().begin(), offsets->view().end(), offsets->mutable_view().begin()); - auto many_chars = cudf::make_fixed_width_column(data_type{type_id::INT8}, num_rows); + auto many_chars = cudf::make_fixed_width_column(data_type{type_id::INT8}, total_chars_size); auto col = cudf::make_strings_column( num_rows, std::move(offsets), std::move(many_chars), 0, rmm::device_buffer{}); @@ -826,6 +824,22 @@ TEST_F(StructsColumnTest, ConcatenateStructs) cudf::test::expect_columns_equivalent(*result, *expected); } +TEST_F(StructsColumnTest, ConcatenateEmptyStructs) +{ + using namespace cudf::test; + + auto expected = cudf::make_structs_column(10, {}, 0, rmm::device_buffer()); + auto first = cudf::make_structs_column(5, {}, 0, rmm::device_buffer()); + auto second = cudf::make_structs_column(2, {}, 0, rmm::device_buffer()); + auto third = cudf::make_structs_column(0, {}, 0, rmm::device_buffer()); + auto fourth = cudf::make_structs_column(3, {}, 0, rmm::device_buffer()); + + // concatenate + auto result = cudf::concatenate(std::vector({*first, *second, *third, *fourth})); + CUDF_EXPECTS(result->size() == expected->size(), "column size changed after concat"); + cudf::test::expect_columns_equivalent(*result, *expected); +} + TEST_F(StructsColumnTest, ConcatenateSplitStructs) { using namespace cudf::test; diff --git a/cpp/tests/datetime/datetime_ops_test.cpp b/cpp/tests/datetime/datetime_ops_test.cpp index c05e95c164e..39ad5f556d4 100644 --- a/cpp/tests/datetime/datetime_ops_test.cpp +++ b/cpp/tests/datetime/datetime_ops_test.cpp @@ -570,6 +570,41 @@ TEST_F(BasicDatetimeOpsTest, TestIsLeapYear) {true, false, true, true, true, true, true, true, false, true, true, false}}); } +TEST_F(BasicDatetimeOpsTest, TestDaysInMonths) + +{ + using namespace cudf::test; + using namespace cudf::datetime; + using 
namespace cuda::std::chrono; + + auto timestamps_s = + cudf::test::fixed_width_column_wrapper{ + { + 0L, // NULL + -1887541682L, // 1910-03-10 10:51:58 + 0L, // NULL + -1251006943L, // 1930-05-11 18:04:17 + -932134638L, // 1940-06-18 09:42:42 + -614354877L, // 1950-07-14 09:52:03 + -296070394L, // 1960-08-14 06:13:26 + 22840404L, // 1970-09-22 08:33:24 + 339817190L, // 1980-10-08 01:39:50 + 657928062L, // 1990-11-06 21:47:42 + 976630837L, // 2000-12-12 14:20:37 + 1294699018L, // 2011-01-10 22:36:58 + 1613970182L, // 2021-02-22 05:03:02 - non leap year February + 1930963331L, // 2031-03-11 02:42:11 + 2249867102L, // 2041-04-18 03:05:02 + 951426858L, // 2000-02-24 21:14:18 - leap year February + }, + iterators::nulls_at({0, 2})}; + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*days_in_month(timestamps_s), + cudf::test::fixed_width_column_wrapper{ + {-1, 31, -1, 31, 30, 31, 31, 30, 31, 30, 31, 31, 28, 31, 30, 29}, + iterators::nulls_at({0, 2})}); +} + TEST_F(BasicDatetimeOpsTest, TestQuarter) { using namespace cudf::test; diff --git a/cpp/tests/encode/encode_tests.cpp b/cpp/tests/encode/encode_tests.cpp index 52244b38dfe..73c77a39a97 100644 --- a/cpp/tests/encode/encode_tests.cpp +++ b/cpp/tests/encode/encode_tests.cpp @@ -67,9 +67,6 @@ TYPED_TEST(EncodeNumericTests, SimpleWithNulls) cudf::test::fixed_width_column_wrapper expect_keys{{1, 2, 3, 0}, {1, 1, 1, 0}}; auto const result = cudf::encode(cudf::table_view({input})); - cudf::test::print(result.first->view().column(0)); - cudf::test::print(expect_keys); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.first->view().column(0), expect_keys); CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.second->view(), expect); } diff --git a/cpp/tests/groupby/argmax_tests.cpp b/cpp/tests/groupby/argmax_tests.cpp index 6bf627d7b78..7cf693f7b08 100644 --- a/cpp/tests/groupby/argmax_tests.cpp +++ b/cpp/tests/groupby/argmax_tests.cpp @@ -47,10 +47,10 @@ TYPED_TEST(groupby_argmax_test, basic) fixed_width_column_wrapper expect_keys{1, 2, 3}; fixed_width_column_wrapper expect_vals{0, 1, 2}; - auto agg = cudf::make_argmax_aggregation(); + auto agg = cudf::make_argmax_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg2 = cudf::make_argmax_aggregation(); + auto agg2 = cudf::make_argmax_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES); } @@ -67,10 +67,10 @@ TYPED_TEST(groupby_argmax_test, zero_valid_keys) fixed_width_column_wrapper expect_keys{}; fixed_width_column_wrapper expect_vals{}; - auto agg = cudf::make_argmax_aggregation(); + auto agg = cudf::make_argmax_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg2 = cudf::make_argmax_aggregation(); + auto agg2 = cudf::make_argmax_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES); } @@ -87,10 +87,10 @@ TYPED_TEST(groupby_argmax_test, zero_valid_values) fixed_width_column_wrapper expect_keys{1}; fixed_width_column_wrapper expect_vals({0}, all_nulls()); - auto agg = cudf::make_argmax_aggregation(); + auto agg = cudf::make_argmax_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg2 = cudf::make_argmax_aggregation(); + auto agg2 = cudf::make_argmax_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES); } @@ -111,10 +111,10 @@ TYPED_TEST(groupby_argmax_test, null_keys_and_values) // {6, 3, 5, 4, 0, 2, 1, -} fixed_width_column_wrapper 
expect_vals({3, 4, 7, 0}, {1, 1, 1, 0}); - auto agg = cudf::make_argmax_aggregation(); + auto agg = cudf::make_argmax_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg2 = cudf::make_argmax_aggregation(); + auto agg2 = cudf::make_argmax_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES); } @@ -132,10 +132,10 @@ TEST_F(groupby_argmax_string_test, basic) fixed_width_column_wrapper expect_keys{1, 2, 3}; fixed_width_column_wrapper expect_vals({0, 4, 2}); - auto agg = cudf::make_argmax_aggregation(); + auto agg = cudf::make_argmax_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg2 = cudf::make_argmax_aggregation(); + auto agg2 = cudf::make_argmax_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES); } @@ -150,10 +150,10 @@ TEST_F(groupby_argmax_string_test, zero_valid_values) fixed_width_column_wrapper expect_keys{1}; fixed_width_column_wrapper expect_vals({0}, all_nulls()); - auto agg = cudf::make_argmax_aggregation(); + auto agg = cudf::make_argmax_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg2 = cudf::make_argmax_aggregation(); + auto agg2 = cudf::make_argmax_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES); } @@ -172,12 +172,13 @@ TEST_F(groupby_dictionary_argmax_test, basic) fixed_width_column_wrapper expect_vals({ 0, 4, 2 }); // clang-format on - test_single_agg(keys, vals, expect_keys, expect_vals, cudf::make_argmax_aggregation()); + test_single_agg( + keys, vals, expect_keys, expect_vals, cudf::make_argmax_aggregation()); test_single_agg(keys, vals, expect_keys, expect_vals, - cudf::make_argmax_aggregation(), + cudf::make_argmax_aggregation(), force_use_sort_impl::YES); } diff --git a/cpp/tests/groupby/argmin_tests.cpp b/cpp/tests/groupby/argmin_tests.cpp index d192c1b21b1..915575546c9 100644 --- a/cpp/tests/groupby/argmin_tests.cpp +++ b/cpp/tests/groupby/argmin_tests.cpp @@ -47,10 +47,10 @@ TYPED_TEST(groupby_argmin_test, basic) fixed_width_column_wrapper expect_keys{1, 2, 3}; fixed_width_column_wrapper expect_vals{6, 9, 8}; - auto agg = cudf::make_argmin_aggregation(); + auto agg = cudf::make_argmin_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg2 = cudf::make_argmin_aggregation(); + auto agg2 = cudf::make_argmin_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES); } @@ -67,10 +67,10 @@ TYPED_TEST(groupby_argmin_test, zero_valid_keys) fixed_width_column_wrapper expect_keys{}; fixed_width_column_wrapper expect_vals{}; - auto agg = cudf::make_argmin_aggregation(); + auto agg = cudf::make_argmin_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg2 = cudf::make_argmin_aggregation(); + auto agg2 = cudf::make_argmin_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES); } @@ -87,10 +87,10 @@ TYPED_TEST(groupby_argmin_test, zero_valid_values) fixed_width_column_wrapper expect_keys{1}; fixed_width_column_wrapper expect_vals({0}, all_nulls()); - auto agg = cudf::make_argmin_aggregation(); + auto agg = cudf::make_argmin_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg2 = cudf::make_argmin_aggregation(); + auto agg2 = 
cudf::make_argmin_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES); } @@ -111,11 +111,11 @@ TYPED_TEST(groupby_argmin_test, null_keys_and_values) // { 9, 6, 8, 5, 0, 7, 1, -} fixed_width_column_wrapper expect_vals({3, 9, 8, 0}, {1, 1, 1, 0}); - auto agg = cudf::make_argmin_aggregation(); + auto agg = cudf::make_argmin_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); // TODO: explore making this a gtest parameter - auto agg2 = cudf::make_argmin_aggregation(); + auto agg2 = cudf::make_argmin_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES); } @@ -133,10 +133,10 @@ TEST_F(groupby_argmin_string_test, basic) fixed_width_column_wrapper expect_keys{1, 2, 3}; fixed_width_column_wrapper expect_vals({3, 5, 7}); - auto agg = cudf::make_argmin_aggregation(); + auto agg = cudf::make_argmin_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg2 = cudf::make_argmin_aggregation(); + auto agg2 = cudf::make_argmin_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES); } @@ -151,10 +151,10 @@ TEST_F(groupby_argmin_string_test, zero_valid_values) fixed_width_column_wrapper expect_keys{1}; fixed_width_column_wrapper expect_vals({0}, all_nulls()); - auto agg = cudf::make_argmin_aggregation(); + auto agg = cudf::make_argmin_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg2 = cudf::make_argmin_aggregation(); + auto agg2 = cudf::make_argmin_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES); } @@ -173,12 +173,13 @@ TEST_F(groupby_dictionary_argmin_test, basic) fixed_width_column_wrapper expect_vals({ 3, 5, 7 }); // clang-format on - test_single_agg(keys, vals, expect_keys, expect_vals, cudf::make_argmin_aggregation()); + test_single_agg( + keys, vals, expect_keys, expect_vals, cudf::make_argmin_aggregation()); test_single_agg(keys, vals, expect_keys, expect_vals, - cudf::make_argmin_aggregation(), + cudf::make_argmin_aggregation(), force_use_sort_impl::YES); } diff --git a/cpp/tests/groupby/collect_list_tests.cpp b/cpp/tests/groupby/collect_list_tests.cpp index 43c62743b9f..009917dabae 100644 --- a/cpp/tests/groupby/collect_list_tests.cpp +++ b/cpp/tests/groupby/collect_list_tests.cpp @@ -45,7 +45,7 @@ TYPED_TEST(groupby_collect_list_test, CollectWithoutNulls) fixed_width_column_wrapper expect_keys{1, 2}; lists_column_wrapper expect_vals{{1, 2, 3}, {4, 5, 6}}; - auto agg = cudf::make_collect_list_aggregation(); + auto agg = cudf::make_collect_list_aggregation(); test_single_agg(keys, values, expect_keys, expect_vals, std::move(agg)); } @@ -64,7 +64,7 @@ TYPED_TEST(groupby_collect_list_test, CollectWithNulls) lists_column_wrapper expect_vals{ {{1, 2}, validity.begin()}, {{3, 4}, validity.begin()}, {{5, 6}, validity.begin()}}; - auto agg = cudf::make_collect_list_aggregation(); + auto agg = cudf::make_collect_list_aggregation(); test_single_agg(keys, values, expect_keys, expect_vals, std::move(agg)); } @@ -82,7 +82,7 @@ TYPED_TEST(groupby_collect_list_test, CollectWithNullExclusion) lists_column_wrapper expect_vals{{2}, {4}, {}, {8, 9}}; - auto agg = cudf::make_collect_list_aggregation(null_policy::EXCLUDE); + auto agg = cudf::make_collect_list_aggregation(null_policy::EXCLUDE); test_single_agg(keys, values, expect_keys, expect_vals, std::move(agg)); 
} @@ -97,7 +97,7 @@ TYPED_TEST(groupby_collect_list_test, CollectOnEmptyInput) fixed_width_column_wrapper expect_keys{}; lists_column_wrapper expect_vals{}; - auto agg = cudf::make_collect_list_aggregation(null_policy::EXCLUDE); + auto agg = cudf::make_collect_list_aggregation(null_policy::EXCLUDE); test_single_agg(keys, values, expect_keys, expect_vals, std::move(agg)); } @@ -116,7 +116,7 @@ TYPED_TEST(groupby_collect_list_test, CollectLists) lists_column_wrapper expect_vals{ {{1, 2}, {3, 4}}, {{5, 6, 7}, LCW{}}, {{9, 10}, {11}}}; - auto agg = cudf::make_collect_list_aggregation(); + auto agg = cudf::make_collect_list_aggregation(); test_single_agg(keys, values, expect_keys, expect_vals, std::move(agg)); } @@ -135,7 +135,7 @@ TYPED_TEST(groupby_collect_list_test, CollectListsWithNullExclusion) LCW expect_vals{{{1, 2}}, {LCW{}}, {{9, 10}, {11}}, {}}; - auto agg = cudf::make_collect_list_aggregation(null_policy::EXCLUDE); + auto agg = cudf::make_collect_list_aggregation(null_policy::EXCLUDE); test_single_agg(keys, values, expect_keys, expect_vals, std::move(agg)); } @@ -158,7 +158,7 @@ TYPED_TEST(groupby_collect_list_test, CollectOnEmptyInputLists) auto expect_values = cudf::make_lists_column(0, make_empty_column(offsets), std::move(expect_child), 0, {}); - auto agg = cudf::make_collect_list_aggregation(); + auto agg = cudf::make_collect_list_aggregation(); test_single_agg(keys, values->view(), expect_keys, expect_values->view(), std::move(agg)); } @@ -190,7 +190,7 @@ TYPED_TEST(groupby_collect_list_test, CollectOnEmptyInputListsOfStructs) auto expect_values = cudf::make_lists_column( 0, make_empty_column(data_type{type_to_id()}), std::move(expect_child), 0, {}); - auto agg = cudf::make_collect_list_aggregation(); + auto agg = cudf::make_collect_list_aggregation(); test_single_agg(keys, values->view(), expect_keys, expect_values->view(), std::move(agg)); } @@ -212,8 +212,11 @@ TYPED_TEST(groupby_collect_list_test, dictionary) 0, rmm::device_buffer{}); - test_single_agg( - keys, vals, expect_keys, expect_vals->view(), cudf::make_collect_list_aggregation()); + test_single_agg(keys, + vals, + expect_keys, + expect_vals->view(), + cudf::make_collect_list_aggregation()); } } // namespace test diff --git a/cpp/tests/groupby/collect_set_tests.cpp b/cpp/tests/groupby/collect_set_tests.cpp index 2f89b04c745..198caabfca9 100644 --- a/cpp/tests/groupby/collect_set_tests.cpp +++ b/cpp/tests/groupby/collect_set_tests.cpp @@ -33,16 +33,20 @@ namespace test { #define VALIDITY std::initializer_list struct CollectSetTest : public cudf::test::BaseFixture { - static auto collect_set() { return cudf::make_collect_set_aggregation(); } + static auto collect_set() + { + return cudf::make_collect_set_aggregation(); + } static auto collect_set_null_unequal() { - return cudf::make_collect_set_aggregation(null_policy::INCLUDE, null_equality::UNEQUAL); + return cudf::make_collect_set_aggregation(null_policy::INCLUDE, + null_equality::UNEQUAL); } static auto collect_set_null_exclude() { - return cudf::make_collect_set_aggregation(null_policy::EXCLUDE); + return cudf::make_collect_set_aggregation(null_policy::EXCLUDE); } }; @@ -174,7 +178,7 @@ TEST_F(CollectSetTest, FloatsWithNaN) vals, keys_expected, vals_expected, - cudf::make_collect_set_aggregation( + cudf::make_collect_set_aggregation( null_policy::INCLUDE, null_equality::EQUAL, nan_equality::ALL_EQUAL)); // null unequal with nan equal vals_expected = { @@ -183,7 +187,7 @@ TEST_F(CollectSetTest, FloatsWithNaN) vals, keys_expected, vals_expected, - 
cudf::make_collect_set_aggregation( + cudf::make_collect_set_aggregation( null_policy::INCLUDE, null_equality::UNEQUAL, nan_equality::ALL_EQUAL)); } diff --git a/cpp/tests/groupby/count_scan_tests.cpp b/cpp/tests/groupby/count_scan_tests.cpp index 9740bfa1954..62e8b11241d 100644 --- a/cpp/tests/groupby/count_scan_tests.cpp +++ b/cpp/tests/groupby/count_scan_tests.cpp @@ -53,11 +53,11 @@ TYPED_TEST(groupby_count_scan_test, basic) result_wrapper expect_vals{0, 1, 2, 0, 1, 2, 3, 0, 1, 2}; // clang-format on - auto agg1 = cudf::make_count_aggregation(); + auto agg1 = cudf::make_count_aggregation(); CUDF_EXPECT_THROW_MESSAGE(test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg1)), "Unsupported groupby scan aggregation"); - auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE); + auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE); test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg2)); } @@ -74,10 +74,10 @@ TYPED_TEST(groupby_count_scan_test, empty_cols) result_wrapper expect_vals; // clang-format on - auto agg1 = cudf::make_count_aggregation(); + auto agg1 = cudf::make_count_aggregation(); EXPECT_NO_THROW(test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg1))); - auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE); + auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE); test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg2)); } @@ -94,7 +94,7 @@ TYPED_TEST(groupby_count_scan_test, zero_valid_keys) result_wrapper expect_vals{}; // clang-format on - auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE); + auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE); test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg2)); } @@ -111,7 +111,7 @@ TYPED_TEST(groupby_count_scan_test, zero_valid_values) result_wrapper expect_vals{0, 1, 2}; // clang-format on - auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE); + auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE); test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg2)); } @@ -130,7 +130,7 @@ TYPED_TEST(groupby_count_scan_test, null_keys_and_values) result_wrapper expect_vals{0, 1, 2, 0, 1, 2, 3, 0, 1, 0}; // clang-format on - auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE); + auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE); test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg2)); } @@ -151,7 +151,7 @@ TEST_F(groupby_count_scan_string_test, basic) result_wrapper expect_vals{0, 0, 0, 1, 0, 1}; // clang-format on - auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE); + auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE); test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg2)); } @@ -182,10 +182,14 @@ TYPED_TEST(FixedPointTestBothReps, GroupByCountScan) // clang-format on CUDF_EXPECT_THROW_MESSAGE( - test_single_scan(keys, vals, expect_keys, expect_vals, cudf::make_count_aggregation()), + test_single_scan(keys, + vals, + expect_keys, + expect_vals, + cudf::make_count_aggregation()), "Unsupported groupby scan aggregation"); - auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE); + auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE); test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg2)); } @@ -205,11 +209,14 @@ TEST_F(groupby_dictionary_count_scan_test, basic) result_wrapper expect_vals{0, 0, 0, 1, 0, 1}; // clang-format on - auto agg1 = 
cudf::make_count_aggregation(); + auto agg1 = cudf::make_count_aggregation(); CUDF_EXPECT_THROW_MESSAGE(test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg1)), "Unsupported groupby scan aggregation"); - test_single_scan( - keys, vals, expect_keys, expect_vals, cudf::make_count_aggregation(null_policy::INCLUDE)); + test_single_scan(keys, + vals, + expect_keys, + expect_vals, + cudf::make_count_aggregation(null_policy::INCLUDE)); } } // namespace test diff --git a/cpp/tests/groupby/count_tests.cpp b/cpp/tests/groupby/count_tests.cpp index 2d45de04607..cbb821767c9 100644 --- a/cpp/tests/groupby/count_tests.cpp +++ b/cpp/tests/groupby/count_tests.cpp @@ -45,13 +45,13 @@ TYPED_TEST(groupby_count_test, basic) fixed_width_column_wrapper expect_keys{1, 2, 3}; fixed_width_column_wrapper expect_vals{3, 4, 3}; - auto agg = cudf::make_count_aggregation(); + auto agg = cudf::make_count_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg1 = cudf::make_count_aggregation(); + auto agg1 = cudf::make_count_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg1), force_use_sort_impl::YES); - auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE); + auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2)); } @@ -66,10 +66,10 @@ TYPED_TEST(groupby_count_test, empty_cols) fixed_width_column_wrapper expect_keys{}; fixed_width_column_wrapper expect_vals; - auto agg = cudf::make_count_aggregation(); + auto agg = cudf::make_count_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg1 = cudf::make_count_aggregation(); + auto agg1 = cudf::make_count_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg1), force_use_sort_impl::YES); } @@ -84,13 +84,13 @@ TYPED_TEST(groupby_count_test, zero_valid_keys) fixed_width_column_wrapper expect_keys{}; fixed_width_column_wrapper expect_vals{}; - auto agg = cudf::make_count_aggregation(); + auto agg = cudf::make_count_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg1 = cudf::make_count_aggregation(); + auto agg1 = cudf::make_count_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg1), force_use_sort_impl::YES); - auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE); + auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2)); } @@ -105,14 +105,14 @@ TYPED_TEST(groupby_count_test, zero_valid_values) fixed_width_column_wrapper expect_keys{1}; fixed_width_column_wrapper expect_vals{0}; - auto agg = cudf::make_count_aggregation(); + auto agg = cudf::make_count_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg1 = cudf::make_count_aggregation(); + auto agg1 = cudf::make_count_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg1), force_use_sort_impl::YES); fixed_width_column_wrapper expect_vals2{3}; - auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE); + auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE); test_single_agg(keys, vals, expect_keys, expect_vals2, std::move(agg2)); } @@ -133,14 +133,14 @@ TYPED_TEST(groupby_count_test, null_keys_and_values) fixed_width_column_wrapper expect_vals({2, 3, 2, 0}); // clang-format on - auto agg = cudf::make_count_aggregation(); + auto 
agg = cudf::make_count_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg1 = cudf::make_count_aggregation(); + auto agg1 = cudf::make_count_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg1), force_use_sort_impl::YES); fixed_width_column_wrapper expect_vals2{3, 4, 2, 1}; - auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE); + auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE); test_single_agg(keys, vals, expect_keys, expect_vals2, std::move(agg2)); } @@ -160,10 +160,10 @@ TEST_F(groupby_count_string_test, basic) fixed_width_column_wrapper expect_keys{0, 1, 3, 5}; fixed_width_column_wrapper expect_vals{1, 1, 2, 2}; - auto agg = cudf::make_count_aggregation(); + auto agg = cudf::make_count_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg1 = cudf::make_count_aggregation(); + auto agg1 = cudf::make_count_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg1), force_use_sort_impl::YES); } // clang-format on @@ -191,13 +191,13 @@ TYPED_TEST(FixedPointTestBothReps, GroupByCount) auto const expect_keys = fixed_width_column_wrapper{1, 2, 3}; auto const expect_vals = fixed_width_column_wrapper{3, 4, 3}; - auto agg = cudf::make_count_aggregation(); + auto agg = cudf::make_count_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg1 = cudf::make_count_aggregation(); + auto agg1 = cudf::make_count_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg1), force_use_sort_impl::YES); - auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE); + auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2)); } @@ -216,9 +216,14 @@ TEST_F(groupby_dictionary_count_test, basic) fixed_width_column_wrapper expect_vals{1, 1, 2, 2}; // clang-format on - test_single_agg(keys, vals, expect_keys, expect_vals, cudf::make_count_aggregation()); test_single_agg( - keys, vals, expect_keys, expect_vals, cudf::make_count_aggregation(), force_use_sort_impl::YES); + keys, vals, expect_keys, expect_vals, cudf::make_count_aggregation()); + test_single_agg(keys, + vals, + expect_keys, + expect_vals, + cudf::make_count_aggregation(), + force_use_sort_impl::YES); } } // namespace test diff --git a/cpp/tests/groupby/groupby_test_util.hpp b/cpp/tests/groupby/groupby_test_util.hpp index 9a083ac8e74..542205b5b51 100644 --- a/cpp/tests/groupby/groupby_test_util.hpp +++ b/cpp/tests/groupby/groupby_test_util.hpp @@ -63,7 +63,7 @@ inline void test_single_agg(column_view const& keys, column_view const& values, column_view const& expect_keys, column_view const& expect_vals, - std::unique_ptr&& agg, + std::unique_ptr&& agg, force_use_sort_impl use_sort = force_use_sort_impl::NO, null_policy include_null_keys = null_policy::EXCLUDE, sorted keys_are_sorted = sorted::NO, @@ -78,7 +78,7 @@ inline void test_single_agg(column_view const& keys, if (use_sort == force_use_sort_impl::YES) { // WAR to force groupby to use sort implementation - requests[0].aggregations.push_back(make_nth_element_aggregation(0)); + requests[0].aggregations.push_back(make_nth_element_aggregation(0)); } groupby::groupby gb_obj( @@ -105,14 +105,14 @@ inline void test_single_scan(column_view const& keys, column_view const& values, column_view const& expect_keys, column_view const& expect_vals, - std::unique_ptr&& agg, + std::unique_ptr&& agg, 
                             null_policy include_null_keys = null_policy::EXCLUDE,
                             sorted keys_are_sorted        = sorted::NO,
                             std::vector const& column_order    = {},
                             std::vector const& null_precedence = {})
 {
-  std::vector requests;
-  requests.emplace_back(groupby::aggregation_request());
+  std::vector requests;
+  requests.emplace_back(groupby::scan_request());
   requests[0].values = values;
   requests[0].aggregations.push_back(std::move(agg));
diff --git a/cpp/tests/groupby/keys_tests.cpp b/cpp/tests/groupby/keys_tests.cpp
index 91db37a5ff6..683eeb7eb01 100644
--- a/cpp/tests/groupby/keys_tests.cpp
+++ b/cpp/tests/groupby/keys_tests.cpp
@@ -50,7 +50,7 @@ TYPED_TEST(groupby_keys_test, basic)
     fixed_width_column_wrapper expect_vals { 3, 4, 3 };
   // clang-format on

-  auto agg = cudf::make_count_aggregation();
+  auto agg = cudf::make_count_aggregation();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 }

@@ -68,7 +68,7 @@ TYPED_TEST(groupby_keys_test, zero_valid_keys)
     fixed_width_column_wrapper expect_vals { };
   // clang-format on

-  auto agg = cudf::make_count_aggregation();
+  auto agg = cudf::make_count_aggregation();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 }

@@ -89,7 +89,7 @@ TYPED_TEST(groupby_keys_test, some_null_keys)
     fixed_width_column_wrapper expect_vals { 3, 4, 2, 1};
   // clang-format on

-  auto agg = cudf::make_count_aggregation();
+  auto agg = cudf::make_count_aggregation();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 }

@@ -111,7 +111,7 @@ TYPED_TEST(groupby_keys_test, include_null_keys)
     fixed_width_column_wrapper expect_vals { 9, 19, 10, 4, 7};
   // clang-format on

-  auto agg = cudf::make_sum_aggregation();
+  auto agg = cudf::make_sum_aggregation();
   test_single_agg(keys,
                   vals,
                   expect_keys,
@@ -135,7 +135,7 @@ TYPED_TEST(groupby_keys_test, pre_sorted_keys)
     fixed_width_column_wrapper expect_vals { 3, 18, 24, 4};
   // clang-format on

-  auto agg = cudf::make_sum_aggregation();
+  auto agg = cudf::make_sum_aggregation();
   test_single_agg(keys,
                   vals,
                   expect_keys,
@@ -160,7 +160,7 @@ TYPED_TEST(groupby_keys_test, pre_sorted_keys_descending)
     fixed_width_column_wrapper expect_vals { 0, 6, 22, 21 };
   // clang-format on

-  auto agg = cudf::make_sum_aggregation();
+  auto agg = cudf::make_sum_aggregation();
   test_single_agg(keys,
                   vals,
                   expect_keys,
@@ -187,7 +187,7 @@ TYPED_TEST(groupby_keys_test, pre_sorted_keys_nullable)
     fixed_width_column_wrapper expect_vals { 3, 15, 17, 4};
   // clang-format on

-  auto agg = cudf::make_sum_aggregation();
+  auto agg = cudf::make_sum_aggregation();
   test_single_agg(keys,
                   vals,
                   expect_keys,
@@ -215,7 +215,7 @@ TYPED_TEST(groupby_keys_test, pre_sorted_keys_nulls_before_include_nulls)
     fixed_width_column_wrapper expect_vals { 3, 7, 11, 7, 17, 4};
   // clang-format on

-  auto agg = cudf::make_sum_aggregation();
+  auto agg = cudf::make_sum_aggregation();
   test_single_agg(keys,
                   vals,
                   expect_keys,
@@ -234,10 +234,11 @@ TYPED_TEST(groupby_keys_test, mismatch_num_rows)
   fixed_width_column_wrapper keys{1, 2, 3};
   fixed_width_column_wrapper vals{0, 1, 2, 3, 4};

-  auto agg = cudf::make_count_aggregation();
+  auto agg = cudf::make_count_aggregation();
   CUDF_EXPECT_THROW_MESSAGE(test_single_agg(keys, vals, keys, vals, std::move(agg)),
                             "Size mismatch between request values and groupby keys.");
-  CUDF_EXPECT_THROW_MESSAGE(test_single_scan(keys, vals, keys, vals, std::move(agg)),
+  auto agg2 = cudf::make_count_aggregation();
+  CUDF_EXPECT_THROW_MESSAGE(test_single_scan(keys, vals, keys, vals, std::move(agg2)),
                             "Size mismatch between request values and groupby
keys."); } @@ -257,7 +258,7 @@ TEST_F(groupby_string_keys_test, basic) fixed_width_column_wrapper expect_vals { 9, 19, 17 }; // clang-format on - auto agg = cudf::make_sum_aggregation(); + auto agg = cudf::make_sum_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } // clang-format on @@ -278,9 +279,14 @@ TEST_F(groupby_dictionary_keys_test, basic) fixed_width_column_wrapper expect_vals({ 9, 19, 17 }); // clang-format on - test_single_agg(keys, vals, expect_keys, expect_vals, cudf::make_sum_aggregation()); test_single_agg( - keys, vals, expect_keys, expect_vals, cudf::make_sum_aggregation(), force_use_sort_impl::YES); + keys, vals, expect_keys, expect_vals, cudf::make_sum_aggregation()); + test_single_agg(keys, + vals, + expect_keys, + expect_vals, + cudf::make_sum_aggregation(), + force_use_sort_impl::YES); } } // namespace test diff --git a/cpp/tests/groupby/m2_tests.cpp b/cpp/tests/groupby/m2_tests.cpp index 7b338a0d9b8..be7d6c1ce05 100644 --- a/cpp/tests/groupby/m2_tests.cpp +++ b/cpp/tests/groupby/m2_tests.cpp @@ -44,7 +44,7 @@ auto compute_M2(cudf::column_view const& keys, cudf::column_view const& values) std::vector requests; requests.emplace_back(cudf::groupby::aggregation_request()); requests[0].values = values; - requests[0].aggregations.emplace_back(cudf::make_m2_aggregation()); + requests[0].aggregations.emplace_back(cudf::make_m2_aggregation()); auto gb_obj = cudf::groupby::groupby(cudf::table_view({keys})); auto result = gb_obj.aggregate(requests); diff --git a/cpp/tests/groupby/max_scan_tests.cpp b/cpp/tests/groupby/max_scan_tests.cpp index 70a48da69e8..4d83dc9f7ba 100644 --- a/cpp/tests/groupby/max_scan_tests.cpp +++ b/cpp/tests/groupby/max_scan_tests.cpp @@ -55,7 +55,7 @@ TYPED_TEST(groupby_max_scan_test, basic) result_wrapper expect_vals({5, 8, 8, 6, 9, 9, 9, 7, 7, 7}); // clang-format on - auto agg = cudf::make_max_aggregation(); + auto agg = cudf::make_max_aggregation(); test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -70,7 +70,7 @@ TYPED_TEST(groupby_max_scan_test, empty_cols) key_wrapper expect_keys{}; result_wrapper expect_vals{}; - auto agg = cudf::make_max_aggregation(); + auto agg = cudf::make_max_aggregation(); test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -87,7 +87,7 @@ TYPED_TEST(groupby_max_scan_test, zero_valid_keys) result_wrapper expect_vals{}; // clang-format on - auto agg = cudf::make_max_aggregation(); + auto agg = cudf::make_max_aggregation(); test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -104,7 +104,7 @@ TYPED_TEST(groupby_max_scan_test, zero_valid_values) result_wrapper expect_vals({-1, -1, -1}, all_nulls()); // clang-format on - auto agg = cudf::make_max_aggregation(); + auto agg = cudf::make_max_aggregation(); test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -124,7 +124,7 @@ TYPED_TEST(groupby_max_scan_test, null_keys_and_values) { 0, 1, 1, 1, 1, 0, 1, 1, 1, 0}); // clang-format on - auto agg = cudf::make_max_aggregation(); + auto agg = cudf::make_max_aggregation(); test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -152,7 +152,7 @@ TYPED_TEST(FixedPointTestBothReps, GroupBySortMaxScanDecimalAsValue) auto const expect_vals_max = fp_wrapper{{5, 8, 8, 6, 9, 9, 9, 7, 7, 7}, scale}; // clang-format on - auto agg = cudf::make_max_aggregation(); + auto agg = cudf::make_max_aggregation(); test_single_scan(keys, vals, expect_keys, expect_vals_max, std::move(agg)); } } diff --git 
a/cpp/tests/groupby/max_tests.cpp b/cpp/tests/groupby/max_tests.cpp index b5710d3f4bc..a1e34b625e8 100644 --- a/cpp/tests/groupby/max_tests.cpp +++ b/cpp/tests/groupby/max_tests.cpp @@ -46,10 +46,10 @@ TYPED_TEST(groupby_max_test, basic) fixed_width_column_wrapper expect_keys{1, 2, 3}; fixed_width_column_wrapper expect_vals({6, 9, 8}); - auto agg = cudf::make_max_aggregation(); + auto agg = cudf::make_max_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg2 = cudf::make_max_aggregation(); + auto agg2 = cudf::make_max_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES); } @@ -64,10 +64,10 @@ TYPED_TEST(groupby_max_test, empty_cols) fixed_width_column_wrapper expect_keys{}; fixed_width_column_wrapper expect_vals{}; - auto agg = cudf::make_max_aggregation(); + auto agg = cudf::make_max_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg2 = cudf::make_max_aggregation(); + auto agg2 = cudf::make_max_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES); } @@ -82,10 +82,10 @@ TYPED_TEST(groupby_max_test, zero_valid_keys) fixed_width_column_wrapper expect_keys{}; fixed_width_column_wrapper expect_vals{}; - auto agg = cudf::make_max_aggregation(); + auto agg = cudf::make_max_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg2 = cudf::make_max_aggregation(); + auto agg2 = cudf::make_max_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES); } @@ -100,10 +100,10 @@ TYPED_TEST(groupby_max_test, zero_valid_values) fixed_width_column_wrapper expect_keys{1}; fixed_width_column_wrapper expect_vals({0}, all_nulls()); - auto agg = cudf::make_max_aggregation(); + auto agg = cudf::make_max_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg2 = cudf::make_max_aggregation(); + auto agg2 = cudf::make_max_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES); } @@ -122,10 +122,10 @@ TYPED_TEST(groupby_max_test, null_keys_and_values) // { 0, 3, 1, 4, 5, 2, 8, -} fixed_width_column_wrapper expect_vals({3, 5, 8, 0}, {1, 1, 1, 0}); - auto agg = cudf::make_max_aggregation(); + auto agg = cudf::make_max_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg2 = cudf::make_max_aggregation(); + auto agg2 = cudf::make_max_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES); } @@ -140,10 +140,10 @@ TEST_F(groupby_max_string_test, basic) fixed_width_column_wrapper expect_keys{1, 2, 3}; strings_column_wrapper expect_vals({"año", "zit", "₹1"}); - auto agg = cudf::make_max_aggregation(); + auto agg = cudf::make_max_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg2 = cudf::make_max_aggregation(); + auto agg2 = cudf::make_max_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES); } @@ -155,10 +155,10 @@ TEST_F(groupby_max_string_test, zero_valid_values) fixed_width_column_wrapper expect_keys{1}; strings_column_wrapper expect_vals({""}, all_nulls()); - auto agg = cudf::make_max_aggregation(); + auto agg = cudf::make_max_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg2 = 
cudf::make_max_aggregation(); + auto agg2 = cudf::make_max_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES); } @@ -187,7 +187,7 @@ TEST_F(groupby_max_string_test, max_sorted_strings) // fixed_width_column_wrapper expect_argmax( // {6, 10, 14, 18, 22, 26, 30, 34, 38, 42, -1}, // {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0}); - auto agg = cudf::make_max_aggregation(); + auto agg = cudf::make_max_aggregation(); test_single_agg(keys, vals, expect_keys, @@ -214,12 +214,16 @@ TEST_F(groupby_dictionary_max_test, basic) auto expect_vals = cudf::dictionary::set_keys(expect_vals_w, vals.keys()); - test_single_agg(keys, vals, expect_keys, expect_vals->view(), cudf::make_max_aggregation()); test_single_agg(keys, vals, expect_keys, expect_vals->view(), - cudf::make_max_aggregation(), + cudf::make_max_aggregation()); + test_single_agg(keys, + vals, + expect_keys, + expect_vals->view(), + cudf::make_max_aggregation(), force_use_sort_impl::YES); } @@ -247,7 +251,7 @@ TYPED_TEST(FixedPointTestBothReps, GroupBySortMaxDecimalAsValue) auto const expect_keys = fixed_width_column_wrapper{1, 2, 3}; auto const expect_vals_max = fp_wrapper{{6, 9, 8}, scale}; - auto agg3 = cudf::make_max_aggregation(); + auto agg3 = cudf::make_max_aggregation(); test_single_agg( keys, vals, expect_keys, expect_vals_max, std::move(agg3), force_use_sort_impl::YES); } @@ -271,7 +275,7 @@ TYPED_TEST(FixedPointTestBothReps, GroupByHashMaxDecimalAsValue) auto const expect_keys = fixed_width_column_wrapper{1, 2, 3}; auto const expect_vals_max = fp_wrapper{{6, 9, 8}, scale}; - auto agg7 = cudf::make_max_aggregation(); + auto agg7 = cudf::make_max_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals_max, std::move(agg7)); } } diff --git a/cpp/tests/groupby/mean_tests.cpp b/cpp/tests/groupby/mean_tests.cpp index bac95b11e81..613e1555b79 100644 --- a/cpp/tests/groupby/mean_tests.cpp +++ b/cpp/tests/groupby/mean_tests.cpp @@ -67,7 +67,7 @@ TYPED_TEST(groupby_mean_test, basic) fixed_width_column_wrapper expect_vals(expect_v.cbegin(), expect_v.cend()); // clang-format on - auto agg = cudf::make_mean_aggregation(); + auto agg = cudf::make_mean_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -82,7 +82,7 @@ TYPED_TEST(groupby_mean_test, empty_cols) fixed_width_column_wrapper expect_keys{}; fixed_width_column_wrapper expect_vals{}; - auto agg = cudf::make_mean_aggregation(); + auto agg = cudf::make_mean_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -97,7 +97,7 @@ TYPED_TEST(groupby_mean_test, zero_valid_keys) fixed_width_column_wrapper expect_keys{}; fixed_width_column_wrapper expect_vals{}; - auto agg = cudf::make_mean_aggregation(); + auto agg = cudf::make_mean_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -112,7 +112,7 @@ TYPED_TEST(groupby_mean_test, zero_valid_values) fixed_width_column_wrapper expect_keys{1}; fixed_width_column_wrapper expect_vals({0}, all_nulls()); - auto agg = cudf::make_mean_aggregation(); + auto agg = cudf::make_mean_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -135,7 +135,7 @@ TYPED_TEST(groupby_mean_test, null_keys_and_values) fixed_width_column_wrapper expect_vals(expect_v.cbegin(), expect_v.cend(), {1, 1, 1, 0}); // clang-format on - auto agg = cudf::make_mean_aggregation(); + auto agg = cudf::make_mean_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, 
std::move(agg)); } // clang-format on @@ -156,7 +156,8 @@ TEST_F(groupby_dictionary_mean_test, basic) fixed_width_column_wrapper expect_vals({9. / 3, 19. / 4, 17. / 3}); // clang-format on - test_single_agg(keys, vals, expect_keys, expect_vals, cudf::make_mean_aggregation()); + test_single_agg( + keys, vals, expect_keys, expect_vals, cudf::make_mean_aggregation()); } } // namespace test diff --git a/cpp/tests/groupby/median_tests.cpp b/cpp/tests/groupby/median_tests.cpp index 18979820911..86d89325401 100644 --- a/cpp/tests/groupby/median_tests.cpp +++ b/cpp/tests/groupby/median_tests.cpp @@ -51,7 +51,7 @@ TYPED_TEST(groupby_median_test, basic) fixed_width_column_wrapper expect_vals({3., 4.5, 7.}, no_nulls()); // clang-format on - auto agg = cudf::make_median_aggregation(); + auto agg = cudf::make_median_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -66,7 +66,7 @@ TYPED_TEST(groupby_median_test, empty_cols) fixed_width_column_wrapper expect_keys{}; fixed_width_column_wrapper expect_vals{}; - auto agg = cudf::make_median_aggregation(); + auto agg = cudf::make_median_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -81,7 +81,7 @@ TYPED_TEST(groupby_median_test, zero_valid_keys) fixed_width_column_wrapper expect_keys{}; fixed_width_column_wrapper expect_vals{}; - auto agg = cudf::make_median_aggregation(); + auto agg = cudf::make_median_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -96,7 +96,7 @@ TYPED_TEST(groupby_median_test, zero_valid_values) fixed_width_column_wrapper expect_keys{1}; fixed_width_column_wrapper expect_vals({0}, all_nulls()); - auto agg = cudf::make_median_aggregation(); + auto agg = cudf::make_median_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -115,7 +115,7 @@ TYPED_TEST(groupby_median_test, null_keys_and_values) // { 3, 6, 1, 4, 9, 2, 8, -} fixed_width_column_wrapper expect_vals({4.5, 4., 5., 0.}, {1, 1, 1, 0}); - auto agg = cudf::make_median_aggregation(); + auto agg = cudf::make_median_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -134,7 +134,8 @@ TYPED_TEST(groupby_median_test, dictionary) fixed_width_column_wrapper expect_vals({3., 4.5, 7. 
}, no_nulls()); // clang-format on - test_single_agg(keys, vals, expect_keys, expect_vals, cudf::make_median_aggregation()); + test_single_agg( + keys, vals, expect_keys, expect_vals, cudf::make_median_aggregation()); } } // namespace test diff --git a/cpp/tests/groupby/merge_lists_tests.cpp b/cpp/tests/groupby/merge_lists_tests.cpp index 29c6185e3a5..b6b1d1a1720 100644 --- a/cpp/tests/groupby/merge_lists_tests.cpp +++ b/cpp/tests/groupby/merge_lists_tests.cpp @@ -42,7 +42,8 @@ auto merge_lists(vcol_views const& keys_cols, vcol_views const& values_cols) std::vector requests; requests.emplace_back(cudf::groupby::aggregation_request()); requests[0].values = *values; - requests[0].aggregations.emplace_back(cudf::make_merge_lists_aggregation()); + requests[0].aggregations.emplace_back( + cudf::make_merge_lists_aggregation()); auto gb_obj = cudf::groupby::groupby(cudf::table_view({*keys})); auto result = gb_obj.aggregate(requests); diff --git a/cpp/tests/groupby/merge_m2_tests.cpp b/cpp/tests/groupby/merge_m2_tests.cpp index 3ec8bfec774..60067e78022 100644 --- a/cpp/tests/groupby/merge_m2_tests.cpp +++ b/cpp/tests/groupby/merge_m2_tests.cpp @@ -59,9 +59,9 @@ auto compute_partial_results(cudf::column_view const& keys, cudf::column_view co std::vector requests; requests.emplace_back(cudf::groupby::aggregation_request()); requests[0].values = values; - requests[0].aggregations.emplace_back(cudf::make_count_aggregation()); - requests[0].aggregations.emplace_back(cudf::make_mean_aggregation()); - requests[0].aggregations.emplace_back(cudf::make_m2_aggregation()); + requests[0].aggregations.emplace_back(cudf::make_count_aggregation()); + requests[0].aggregations.emplace_back(cudf::make_mean_aggregation()); + requests[0].aggregations.emplace_back(cudf::make_m2_aggregation()); auto gb_obj = cudf::groupby::groupby(cudf::table_view({keys})); auto [out_keys, out_results] = gb_obj.aggregate(requests); @@ -88,7 +88,8 @@ auto merge_M2(vcol_views const& keys_cols, vcol_views const& values_cols) std::vector requests; requests.emplace_back(cudf::groupby::aggregation_request()); requests[0].values = *values; - requests[0].aggregations.emplace_back(cudf::make_merge_m2_aggregation()); + requests[0].aggregations.emplace_back( + cudf::make_merge_m2_aggregation()); auto gb_obj = cudf::groupby::groupby(cudf::table_view({*keys})); auto result = gb_obj.aggregate(requests); diff --git a/cpp/tests/groupby/merge_sets_tests.cpp b/cpp/tests/groupby/merge_sets_tests.cpp index ee4f61bf44f..5a65774b430 100644 --- a/cpp/tests/groupby/merge_sets_tests.cpp +++ b/cpp/tests/groupby/merge_sets_tests.cpp @@ -42,7 +42,8 @@ auto merge_sets(vcol_views const& keys_cols, vcol_views const& values_cols) std::vector requests; requests.emplace_back(cudf::groupby::aggregation_request()); requests[0].values = *values; - requests[0].aggregations.emplace_back(cudf::make_merge_sets_aggregation()); + requests[0].aggregations.emplace_back( + cudf::make_merge_sets_aggregation()); auto gb_obj = cudf::groupby::groupby(cudf::table_view({*keys})); auto result = gb_obj.aggregate(requests); diff --git a/cpp/tests/groupby/min_scan_tests.cpp b/cpp/tests/groupby/min_scan_tests.cpp index ef548407761..452f70eaf16 100644 --- a/cpp/tests/groupby/min_scan_tests.cpp +++ b/cpp/tests/groupby/min_scan_tests.cpp @@ -53,7 +53,7 @@ TYPED_TEST(groupby_min_scan_test, basic) result_wrapper expect_vals({5, 5, 1, 6, 6, 0, 0, 7, 2, 2}); // clang-format on - auto agg = cudf::make_min_aggregation(); + auto agg = cudf::make_min_aggregation(); test_single_scan(keys, vals, 
expect_keys, expect_vals, std::move(agg)); } @@ -68,7 +68,7 @@ TYPED_TEST(groupby_min_scan_test, empty_cols) key_wrapper expect_keys{}; result_wrapper expect_vals{}; - auto agg = cudf::make_min_aggregation(); + auto agg = cudf::make_min_aggregation(); test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -85,7 +85,7 @@ TYPED_TEST(groupby_min_scan_test, zero_valid_keys) result_wrapper expect_vals{}; // clang-format on - auto agg = cudf::make_min_aggregation(); + auto agg = cudf::make_min_aggregation(); test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -102,7 +102,7 @@ TYPED_TEST(groupby_min_scan_test, zero_valid_values) result_wrapper expect_vals({-1, -1, -1}, all_nulls()); // clang-format on - auto agg = cudf::make_min_aggregation(); + auto agg = cudf::make_min_aggregation(); test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -122,7 +122,7 @@ TYPED_TEST(groupby_min_scan_test, null_keys_and_values) { 0, 1, 1, 1, 1, 0, 1, 1, 1, 0}); // clang-format on - auto agg = cudf::make_min_aggregation(); + auto agg = cudf::make_min_aggregation(); test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -137,7 +137,7 @@ TEST_F(groupby_min_scan_string_test, basic) key_wrapper expect_keys{1, 1, 1, 2, 2, 2, 2, 3, 3, 3}; strings_column_wrapper expect_vals; - auto agg = cudf::make_min_aggregation(); + auto agg = cudf::make_min_aggregation(); CUDF_EXPECT_THROW_MESSAGE(test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)), "Unsupported groupby scan type-agg combination"); } @@ -167,7 +167,7 @@ TYPED_TEST(FixedPointTestBothReps, GroupBySortMinScanDecimalAsValue) auto const expect_vals_min = fp_wrapper{{5, 5, 1, 6, 6, 0, 0, 7, 2, 2}, scale}; // clang-format on - auto agg = cudf::make_min_aggregation(); + auto agg = cudf::make_min_aggregation(); test_single_scan(keys, vals, expect_keys, expect_vals_min, std::move(agg)); } } diff --git a/cpp/tests/groupby/min_tests.cpp b/cpp/tests/groupby/min_tests.cpp index 1544e867595..59e9d540709 100644 --- a/cpp/tests/groupby/min_tests.cpp +++ b/cpp/tests/groupby/min_tests.cpp @@ -46,10 +46,10 @@ TYPED_TEST(groupby_min_test, basic) fixed_width_column_wrapper expect_keys{1, 2, 3}; fixed_width_column_wrapper expect_vals({0, 1, 2}); - auto agg = cudf::make_min_aggregation(); + auto agg = cudf::make_min_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg2 = cudf::make_min_aggregation(); + auto agg2 = cudf::make_min_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES); } @@ -64,10 +64,10 @@ TYPED_TEST(groupby_min_test, empty_cols) fixed_width_column_wrapper expect_keys{}; fixed_width_column_wrapper expect_vals{}; - auto agg = cudf::make_min_aggregation(); + auto agg = cudf::make_min_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg2 = cudf::make_min_aggregation(); + auto agg2 = cudf::make_min_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES); } @@ -82,10 +82,10 @@ TYPED_TEST(groupby_min_test, zero_valid_keys) fixed_width_column_wrapper expect_keys{}; fixed_width_column_wrapper expect_vals{}; - auto agg = cudf::make_min_aggregation(); + auto agg = cudf::make_min_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg2 = cudf::make_min_aggregation(); + auto agg2 = cudf::make_min_aggregation(); test_single_agg(keys, vals, 
expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES); } @@ -100,10 +100,10 @@ TYPED_TEST(groupby_min_test, zero_valid_values) fixed_width_column_wrapper expect_keys{1}; fixed_width_column_wrapper expect_vals({0}, all_nulls()); - auto agg = cudf::make_min_aggregation(); + auto agg = cudf::make_min_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg2 = cudf::make_min_aggregation(); + auto agg2 = cudf::make_min_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES); } @@ -122,10 +122,10 @@ TYPED_TEST(groupby_min_test, null_keys_and_values) // { 3, 6, 1, 4, 9, 2, 8, -} fixed_width_column_wrapper expect_vals({3, 1, 2, 0}, {1, 1, 1, 0}); - auto agg = cudf::make_min_aggregation(); + auto agg = cudf::make_min_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg2 = cudf::make_min_aggregation(); + auto agg2 = cudf::make_min_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES); } @@ -140,10 +140,10 @@ TEST_F(groupby_min_string_test, basic) fixed_width_column_wrapper expect_keys{1, 2, 3}; strings_column_wrapper expect_vals({"aaa", "bat", "$1"}); - auto agg = cudf::make_min_aggregation(); + auto agg = cudf::make_min_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg2 = cudf::make_min_aggregation(); + auto agg2 = cudf::make_min_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES); } @@ -155,10 +155,10 @@ TEST_F(groupby_min_string_test, zero_valid_values) fixed_width_column_wrapper expect_keys{1}; strings_column_wrapper expect_vals({""}, all_nulls()); - auto agg = cudf::make_min_aggregation(); + auto agg = cudf::make_min_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg2 = cudf::make_min_aggregation(); + auto agg2 = cudf::make_min_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES); } @@ -187,7 +187,7 @@ TEST_F(groupby_min_string_test, min_sorted_strings) // fixed_width_column_wrapper expect_argmin( // {6, 10, 14, 18, 22, 26, 30, 34, 38, 42, -1}, // {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0}); - auto agg = cudf::make_min_aggregation(); + auto agg = cudf::make_min_aggregation(); test_single_agg(keys, vals, expect_keys, @@ -214,12 +214,16 @@ TEST_F(groupby_dictionary_min_test, basic) auto expect_vals = cudf::dictionary::set_keys(expect_vals_w, vals.keys()); - test_single_agg(keys, vals, expect_keys, expect_vals->view(), cudf::make_min_aggregation()); test_single_agg(keys, vals, expect_keys, expect_vals->view(), - cudf::make_min_aggregation(), + cudf::make_min_aggregation()); + test_single_agg(keys, + vals, + expect_keys, + expect_vals->view(), + cudf::make_min_aggregation(), force_use_sort_impl::YES); } @@ -246,7 +250,7 @@ TYPED_TEST(FixedPointTestBothReps, GroupBySortMinDecimalAsValue) auto const expect_keys = fixed_width_column_wrapper{1, 2, 3}; auto const expect_vals_min = fp_wrapper{{0, 1, 2}, scale}; - auto agg2 = cudf::make_min_aggregation(); + auto agg2 = cudf::make_min_aggregation(); test_single_agg( keys, vals, expect_keys, expect_vals_min, std::move(agg2), force_use_sort_impl::YES); } @@ -270,7 +274,7 @@ TYPED_TEST(FixedPointTestBothReps, GroupByHashMinDecimalAsValue) auto const expect_keys = fixed_width_column_wrapper{1, 2, 3}; auto const expect_vals_min = fp_wrapper{{0, 1, 2}, 
scale}; - auto agg6 = cudf::make_min_aggregation(); + auto agg6 = cudf::make_min_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals_min, std::move(agg6)); } } diff --git a/cpp/tests/groupby/nth_element_tests.cpp b/cpp/tests/groupby/nth_element_tests.cpp index d5029147906..22f1e14815f 100644 --- a/cpp/tests/groupby/nth_element_tests.cpp +++ b/cpp/tests/groupby/nth_element_tests.cpp @@ -50,15 +50,15 @@ TYPED_TEST(groupby_nth_element_test, basic) fixed_width_column_wrapper expect_keys{1, 2, 3}; //groupby.first() - auto agg = cudf::make_nth_element_aggregation(0); + auto agg = cudf::make_nth_element_aggregation(0); fixed_width_column_wrapper expect_vals0({0, 1, 2}); test_single_agg(keys, vals, expect_keys, expect_vals0, std::move(agg)); - agg = cudf::make_nth_element_aggregation(1); + agg = cudf::make_nth_element_aggregation(1); fixed_width_column_wrapper expect_vals1({3, 4, 7}); test_single_agg(keys, vals, expect_keys, expect_vals1, std::move(agg)); - agg = cudf::make_nth_element_aggregation(2); + agg = cudf::make_nth_element_aggregation(2); fixed_width_column_wrapper expect_vals2({6, 5, 8}); test_single_agg(keys, vals, expect_keys, expect_vals2, std::move(agg)); } @@ -75,7 +75,7 @@ TYPED_TEST(groupby_nth_element_test, empty_cols) fixed_width_column_wrapper expect_keys{}; fixed_width_column_wrapper expect_vals{}; - auto agg = cudf::make_nth_element_aggregation(0); + auto agg = cudf::make_nth_element_aggregation(0); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -90,7 +90,7 @@ TYPED_TEST(groupby_nth_element_test, basic_out_of_bounds) fixed_width_column_wrapper expect_keys{1, 2, 3}; - auto agg = cudf::make_nth_element_aggregation(3); + auto agg = cudf::make_nth_element_aggregation(3); fixed_width_column_wrapper expect_vals({0, 9, 0}, {0, 1, 0}); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -109,15 +109,15 @@ TYPED_TEST(groupby_nth_element_test, negative) fixed_width_column_wrapper expect_keys{1, 2, 3}; //groupby.last() - auto agg = cudf::make_nth_element_aggregation(-1); + auto agg = cudf::make_nth_element_aggregation(-1); fixed_width_column_wrapper expect_vals0({6, 9, 8}); test_single_agg(keys, vals, expect_keys, expect_vals0, std::move(agg)); - agg = cudf::make_nth_element_aggregation(-2); + agg = cudf::make_nth_element_aggregation(-2); fixed_width_column_wrapper expect_vals1({3, 5, 7}); test_single_agg(keys, vals, expect_keys, expect_vals1, std::move(agg)); - agg = cudf::make_nth_element_aggregation(-3); + agg = cudf::make_nth_element_aggregation(-3); fixed_width_column_wrapper expect_vals2({0, 4, 2}); test_single_agg(keys, vals, expect_keys, expect_vals2, std::move(agg)); } @@ -133,7 +133,7 @@ TYPED_TEST(groupby_nth_element_test, negative_out_of_bounds) fixed_width_column_wrapper expect_keys{1, 2, 3}; - auto agg = cudf::make_nth_element_aggregation(-4); + auto agg = cudf::make_nth_element_aggregation(-4); fixed_width_column_wrapper expect_vals({0, 1, 0}, {0, 1, 0}); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -150,7 +150,7 @@ TYPED_TEST(groupby_nth_element_test, zero_valid_keys) fixed_width_column_wrapper expect_keys{}; fixed_width_column_wrapper expect_vals{}; - auto agg = cudf::make_nth_element_aggregation(0); + auto agg = cudf::make_nth_element_aggregation(0); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -166,7 +166,7 @@ TYPED_TEST(groupby_nth_element_test, zero_valid_values) fixed_width_column_wrapper expect_keys{1}; fixed_width_column_wrapper 
expect_vals({3}, all_nulls()); - auto agg = cudf::make_nth_element_aggregation(0); + auto agg = cudf::make_nth_element_aggregation(0); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -186,7 +186,7 @@ TYPED_TEST(groupby_nth_element_test, null_keys_and_values) //vals {-,3,6, 1,4,-,9, 2,8, -} fixed_width_column_wrapper expect_vals({-1, 1, 2, -1}, {0, 1, 1, 0}); - auto agg = cudf::make_nth_element_aggregation(0); + auto agg = cudf::make_nth_element_aggregation(0); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -206,7 +206,7 @@ TYPED_TEST(groupby_nth_element_test, null_keys_and_values_out_of_bounds) // value, null, out, out fixed_width_column_wrapper expect_vals({6, -1, -1, -1}, {1, 0, 0, 0}); - auto agg = cudf::make_nth_element_aggregation(2); + auto agg = cudf::make_nth_element_aggregation(2); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -237,18 +237,18 @@ TYPED_TEST(groupby_nth_element_test, exclude_nulls) fixed_width_column_wrapper expect_vals1({6, 4, 2, -1}, {1, 1, 1, 0}); fixed_width_column_wrapper expect_vals2({-1, 9, 8, -1}, {0, 1, 1, 0}); - auto agg = cudf::make_nth_element_aggregation(0, cudf::null_policy::INCLUDE); + auto agg = cudf::make_nth_element_aggregation(0, cudf::null_policy::INCLUDE); test_single_agg(keys, vals, expect_keys, expect_nuls0, std::move(agg)); - agg = cudf::make_nth_element_aggregation(1, cudf::null_policy::INCLUDE); + agg = cudf::make_nth_element_aggregation(1, cudf::null_policy::INCLUDE); test_single_agg(keys, vals, expect_keys, expect_nuls1, std::move(agg)); - agg = cudf::make_nth_element_aggregation(2, cudf::null_policy::INCLUDE); + agg = cudf::make_nth_element_aggregation(2, cudf::null_policy::INCLUDE); test_single_agg(keys, vals, expect_keys, expect_nuls2, std::move(agg)); - agg = cudf::make_nth_element_aggregation(0, cudf::null_policy::EXCLUDE); + agg = cudf::make_nth_element_aggregation(0, cudf::null_policy::EXCLUDE); test_single_agg(keys, vals, expect_keys, expect_vals0, std::move(agg)); - agg = cudf::make_nth_element_aggregation(1, cudf::null_policy::EXCLUDE); + agg = cudf::make_nth_element_aggregation(1, cudf::null_policy::EXCLUDE); test_single_agg(keys, vals, expect_keys, expect_vals1, std::move(agg)); - agg = cudf::make_nth_element_aggregation(2, cudf::null_policy::EXCLUDE); + agg = cudf::make_nth_element_aggregation(2, cudf::null_policy::EXCLUDE); test_single_agg(keys, vals, expect_keys, expect_vals2, std::move(agg)); } @@ -282,18 +282,18 @@ TYPED_TEST(groupby_nth_element_test, exclude_nulls_negative_index) fixed_width_column_wrapper expect_vals1({3, 4, 2, -1}, {1, 1, 1, 0}); fixed_width_column_wrapper expect_vals2({-1, 1, 2, -1}, {0, 1, 1, 0}); - auto agg = cudf::make_nth_element_aggregation(-1, cudf::null_policy::INCLUDE); + auto agg = cudf::make_nth_element_aggregation(-1, cudf::null_policy::INCLUDE); test_single_agg(keys, vals, expect_keys, expect_nuls0, std::move(agg)); - agg = cudf::make_nth_element_aggregation(-2, cudf::null_policy::INCLUDE); + agg = cudf::make_nth_element_aggregation(-2, cudf::null_policy::INCLUDE); test_single_agg(keys, vals, expect_keys, expect_nuls1, std::move(agg)); - agg = cudf::make_nth_element_aggregation(-3, cudf::null_policy::INCLUDE); + agg = cudf::make_nth_element_aggregation(-3, cudf::null_policy::INCLUDE); test_single_agg(keys, vals, expect_keys, expect_nuls2, std::move(agg)); - agg = cudf::make_nth_element_aggregation(-1, cudf::null_policy::EXCLUDE); + agg = cudf::make_nth_element_aggregation(-1, cudf::null_policy::EXCLUDE); 
test_single_agg(keys, vals, expect_keys, expect_vals0, std::move(agg)); - agg = cudf::make_nth_element_aggregation(-2, cudf::null_policy::EXCLUDE); + agg = cudf::make_nth_element_aggregation(-2, cudf::null_policy::EXCLUDE); test_single_agg(keys, vals, expect_keys, expect_vals1, std::move(agg)); - agg = cudf::make_nth_element_aggregation(-3, cudf::null_policy::EXCLUDE); + agg = cudf::make_nth_element_aggregation(-3, cudf::null_policy::EXCLUDE); test_single_agg(keys, vals, expect_keys, expect_vals2, std::move(agg)); } @@ -312,38 +312,38 @@ TEST_F(groupby_nth_element_string_test, basic_string) fixed_width_column_wrapper expect_keys{1, 2, 3}; //groupby.first() - auto agg = cudf::make_nth_element_aggregation(0); + auto agg = cudf::make_nth_element_aggregation(0); strings_column_wrapper expect_vals0{"ABCD", "1", "2"}; test_single_agg(keys, vals, expect_keys, expect_vals0, std::move(agg)); - agg = cudf::make_nth_element_aggregation(1); + agg = cudf::make_nth_element_aggregation(1); strings_column_wrapper expect_vals1{"3", "4", "7"}; test_single_agg(keys, vals, expect_keys, expect_vals1, std::move(agg)); - agg = cudf::make_nth_element_aggregation(2); + agg = cudf::make_nth_element_aggregation(2); strings_column_wrapper expect_vals2{"6", "5", "8"}; test_single_agg(keys, vals, expect_keys, expect_vals2, std::move(agg)); //+ve out of bounds - agg = cudf::make_nth_element_aggregation(3); + agg = cudf::make_nth_element_aggregation(3); strings_column_wrapper expect_vals3{{"", "9", ""}, {0, 1, 0}}; test_single_agg(keys, vals, expect_keys, expect_vals3, std::move(agg)); //groupby.last() - agg = cudf::make_nth_element_aggregation(-1); + agg = cudf::make_nth_element_aggregation(-1); strings_column_wrapper expect_vals4{"6", "9", "8"}; test_single_agg(keys, vals, expect_keys, expect_vals4, std::move(agg)); - agg = cudf::make_nth_element_aggregation(-2); + agg = cudf::make_nth_element_aggregation(-2); strings_column_wrapper expect_vals5{"3", "5", "7"}; test_single_agg(keys, vals, expect_keys, expect_vals5, std::move(agg)); - agg = cudf::make_nth_element_aggregation(-3); + agg = cudf::make_nth_element_aggregation(-3); strings_column_wrapper expect_vals6{"ABCD", "4", "2"}; test_single_agg(keys, vals, expect_keys, expect_vals6, std::move(agg)); //-ve out of bounds - agg = cudf::make_nth_element_aggregation(-4); + agg = cudf::make_nth_element_aggregation(-4); strings_column_wrapper expect_vals7{{"", "1", ""}, {0, 1, 0}}; test_single_agg(keys, vals, expect_keys, expect_vals7, std::move(agg)); } @@ -361,8 +361,11 @@ TEST_F(groupby_nth_element_string_test, dictionary) auto expect_vals = cudf::dictionary::set_keys(expect_vals_w, vals.keys()); - test_single_agg( - keys, vals, expect_keys, expect_vals->view(), cudf::make_nth_element_aggregation(2)); + test_single_agg(keys, + vals, + expect_keys, + expect_vals->view(), + cudf::make_nth_element_aggregation(2)); } template @@ -384,8 +387,11 @@ TYPED_TEST(groupby_nth_element_lists_test, Basics) auto expected_keys = fixed_width_column_wrapper{1, 2, 3}; auto expected_values = lists{{1, 2}, {5, 6, 7}, {9, 10}}; - test_single_agg( - keys, values, expected_keys, expected_values, cudf::make_nth_element_aggregation(0)); + test_single_agg(keys, + values, + expected_keys, + expected_values, + cudf::make_nth_element_aggregation(0)); } TYPED_TEST(groupby_nth_element_lists_test, EmptyInput) @@ -401,8 +407,11 @@ TYPED_TEST(groupby_nth_element_lists_test, EmptyInput) auto expected_keys = fixed_width_column_wrapper{}; auto expected_values = lists{}; - test_single_agg( - keys, values, 
expected_keys, expected_values, cudf::make_nth_element_aggregation(2)); + test_single_agg(keys, + values, + expected_keys, + expected_values, + cudf::make_nth_element_aggregation(2)); } } // namespace test diff --git a/cpp/tests/groupby/nunique_tests.cpp b/cpp/tests/groupby/nunique_tests.cpp index 089ca8805d4..88a6a1c903b 100644 --- a/cpp/tests/groupby/nunique_tests.cpp +++ b/cpp/tests/groupby/nunique_tests.cpp @@ -49,7 +49,7 @@ TYPED_TEST(groupby_nunique_test, basic) fixed_width_column_wrapper expect_bool_vals{2, 1, 1}; // clang-format on - auto agg = cudf::make_nunique_aggregation(); + auto agg = cudf::make_nunique_aggregation(); if (std::is_same()) test_single_agg(keys, vals, expect_keys, expect_bool_vals, std::move(agg)); else @@ -67,7 +67,7 @@ TYPED_TEST(groupby_nunique_test, empty_cols) fixed_width_column_wrapper expect_keys{}; fixed_width_column_wrapper expect_vals{}; - auto agg = cudf::make_nunique_aggregation(); + auto agg = cudf::make_nunique_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -83,7 +83,7 @@ TYPED_TEST(groupby_nunique_test, basic_duplicates) fixed_width_column_wrapper expect_vals{2, 4, 1}; fixed_width_column_wrapper expect_bool_vals{2, 1, 1}; - auto agg = cudf::make_nunique_aggregation(); + auto agg = cudf::make_nunique_aggregation(); if (std::is_same()) test_single_agg(keys, vals, expect_keys, expect_bool_vals, std::move(agg)); else @@ -101,7 +101,7 @@ TYPED_TEST(groupby_nunique_test, zero_valid_keys) fixed_width_column_wrapper expect_keys{}; fixed_width_column_wrapper expect_vals{}; - auto agg = cudf::make_nunique_aggregation(); + auto agg = cudf::make_nunique_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -116,7 +116,7 @@ TYPED_TEST(groupby_nunique_test, zero_valid_values) fixed_width_column_wrapper expect_keys{1}; fixed_width_column_wrapper expect_vals{0}; - auto agg = cudf::make_nunique_aggregation(); + auto agg = cudf::make_nunique_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -136,7 +136,7 @@ TYPED_TEST(groupby_nunique_test, null_keys_and_values) fixed_width_column_wrapper expect_vals{2, 3, 2, 0}; fixed_width_column_wrapper expect_bool_vals{1, 1, 1, 0}; - auto agg = cudf::make_nunique_aggregation(); + auto agg = cudf::make_nunique_aggregation(); if (std::is_same()) test_single_agg(keys, vals, expect_keys, expect_bool_vals, std::move(agg)); else @@ -160,7 +160,7 @@ TYPED_TEST(groupby_nunique_test, null_keys_and_values_with_duplicates) fixed_width_column_wrapper expect_vals{2, 3, 2, 0}; fixed_width_column_wrapper expect_bool_vals{1, 1, 1, 0}; - auto agg = cudf::make_nunique_aggregation(); + auto agg = cudf::make_nunique_aggregation(); if (std::is_same()) test_single_agg(keys, vals, expect_keys, expect_bool_vals, std::move(agg)); else @@ -184,7 +184,7 @@ TYPED_TEST(groupby_nunique_test, include_nulls) fixed_width_column_wrapper expect_vals{3, 4, 2, 1}; fixed_width_column_wrapper expect_bool_vals{2, 2, 1, 1}; - auto agg = cudf::make_nunique_aggregation(null_policy::INCLUDE); + auto agg = cudf::make_nunique_aggregation(null_policy::INCLUDE); if (std::is_same()) test_single_agg(keys, vals, expect_keys, expect_bool_vals, std::move(agg)); else @@ -213,8 +213,11 @@ TYPED_TEST(groupby_nunique_test, dictionary) cudf::column_view expect_vals = (std::is_same()) ? 
cudf::column_view{expect_bool_vals} : cudf::column_view{expect_fixed_vals}; - test_single_agg( - keys, vals, expect_keys, expect_vals, cudf::make_nunique_aggregation(null_policy::INCLUDE)); + test_single_agg(keys, + vals, + expect_keys, + expect_vals, + cudf::make_nunique_aggregation(null_policy::INCLUDE)); } } // namespace test diff --git a/cpp/tests/groupby/product_tests.cpp b/cpp/tests/groupby/product_tests.cpp index eaa2cc07ff8..047bf856493 100644 --- a/cpp/tests/groupby/product_tests.cpp +++ b/cpp/tests/groupby/product_tests.cpp @@ -51,7 +51,11 @@ TYPED_TEST(groupby_product_test, basic) fixed_width_column_wrapper expect_vals({ 0., 180., 112. }, no_nulls()); // clang-format on - test_single_agg(keys, vals, expect_keys, expect_vals, cudf::make_product_aggregation()); + test_single_agg(keys, + vals, + expect_keys, + expect_vals, + cudf::make_product_aggregation()); } TYPED_TEST(groupby_product_test, empty_cols) @@ -65,7 +69,11 @@ TYPED_TEST(groupby_product_test, empty_cols) fixed_width_column_wrapper expect_keys{}; fixed_width_column_wrapper expect_vals{}; - test_single_agg(keys, vals, expect_keys, expect_vals, cudf::make_product_aggregation()); + test_single_agg(keys, + vals, + expect_keys, + expect_vals, + cudf::make_product_aggregation()); } TYPED_TEST(groupby_product_test, zero_valid_keys) @@ -79,7 +87,11 @@ TYPED_TEST(groupby_product_test, zero_valid_keys) fixed_width_column_wrapper expect_keys{}; fixed_width_column_wrapper expect_vals{}; - test_single_agg(keys, vals, expect_keys, expect_vals, cudf::make_product_aggregation()); + test_single_agg(keys, + vals, + expect_keys, + expect_vals, + cudf::make_product_aggregation()); } TYPED_TEST(groupby_product_test, zero_valid_values) @@ -93,7 +105,11 @@ TYPED_TEST(groupby_product_test, zero_valid_values) fixed_width_column_wrapper expect_keys{1}; fixed_width_column_wrapper expect_vals({0}, all_nulls()); - test_single_agg(keys, vals, expect_keys, expect_vals, cudf::make_product_aggregation()); + test_single_agg(keys, + vals, + expect_keys, + expect_vals, + cudf::make_product_aggregation()); } TYPED_TEST(groupby_product_test, null_keys_and_values) @@ -114,7 +130,11 @@ TYPED_TEST(groupby_product_test, null_keys_and_values) { 1, 1, 1, 0}); // clang-format on - test_single_agg(keys, vals, expect_keys, expect_vals, cudf::make_product_aggregation()); + test_single_agg(keys, + vals, + expect_keys, + expect_vals, + cudf::make_product_aggregation()); } TYPED_TEST(groupby_product_test, dictionary) @@ -132,7 +152,11 @@ TYPED_TEST(groupby_product_test, dictionary) fixed_width_column_wrapper expect_vals({ 0., 180., 112. }, no_nulls()); // clang-format on - test_single_agg(keys, vals, expect_keys, expect_vals, cudf::make_product_aggregation()); + test_single_agg(keys, + vals, + expect_keys, + expect_vals, + cudf::make_product_aggregation()); } TYPED_TEST(groupby_product_test, dictionary_with_nulls) @@ -151,7 +175,11 @@ TYPED_TEST(groupby_product_test, dictionary_with_nulls) fixed_width_column_wrapper expect_vals({ 0., 180., 56. 
}, no_nulls()); // clang-format on - test_single_agg(keys, vals, expect_keys, expect_vals, cudf::make_product_aggregation()); + test_single_agg(keys, + vals, + expect_keys, + expect_vals, + cudf::make_product_aggregation()); } } // namespace test diff --git a/cpp/tests/groupby/quantile_tests.cpp b/cpp/tests/groupby/quantile_tests.cpp index a82dae9edcb..43b065ee4d3 100644 --- a/cpp/tests/groupby/quantile_tests.cpp +++ b/cpp/tests/groupby/quantile_tests.cpp @@ -51,7 +51,7 @@ TYPED_TEST(groupby_quantile_test, basic) fixed_width_column_wrapper expect_vals({3., 4.5, 7.}, no_nulls()); // clang-format on - auto agg = cudf::make_quantile_aggregation({0.5}, interpolation::LINEAR); + auto agg = cudf::make_quantile_aggregation({0.5}, interpolation::LINEAR); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -66,7 +66,7 @@ TYPED_TEST(groupby_quantile_test, empty_cols) fixed_width_column_wrapper expect_keys{}; fixed_width_column_wrapper expect_vals{}; - auto agg = cudf::make_quantile_aggregation({0.5}, interpolation::LINEAR); + auto agg = cudf::make_quantile_aggregation({0.5}, interpolation::LINEAR); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -81,7 +81,7 @@ TYPED_TEST(groupby_quantile_test, zero_valid_keys) fixed_width_column_wrapper expect_keys{}; fixed_width_column_wrapper expect_vals{}; - auto agg = cudf::make_quantile_aggregation({0.5}, interpolation::LINEAR); + auto agg = cudf::make_quantile_aggregation({0.5}, interpolation::LINEAR); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -96,7 +96,7 @@ TYPED_TEST(groupby_quantile_test, zero_valid_values) fixed_width_column_wrapper expect_keys{1}; fixed_width_column_wrapper expect_vals({0}, all_nulls()); - auto agg = cudf::make_quantile_aggregation({0.5}, interpolation::LINEAR); + auto agg = cudf::make_quantile_aggregation({0.5}, interpolation::LINEAR); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -115,7 +115,7 @@ TYPED_TEST(groupby_quantile_test, null_keys_and_values) // { 3, 6, 1, 4, 9, 2, 8, -} fixed_width_column_wrapper expect_vals({4.5, 4., 5., 0.}, {1, 1, 1, 0}); - auto agg = cudf::make_quantile_aggregation({0.5}, interpolation::LINEAR); + auto agg = cudf::make_quantile_aggregation({0.5}, interpolation::LINEAR); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -134,7 +134,8 @@ TYPED_TEST(groupby_quantile_test, multiple_quantile) fixed_width_column_wrapper expect_vals({1.5, 4.5, 3.25, 6., 4.5, 7.5}, no_nulls()); // clang-format on - auto agg = cudf::make_quantile_aggregation({0.25, 0.75}, interpolation::LINEAR); + auto agg = + cudf::make_quantile_aggregation({0.25, 0.75}, interpolation::LINEAR); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg), force_use_sort_impl::YES); } @@ -152,27 +153,27 @@ TYPED_TEST(groupby_quantile_test, interpolation_types) // {0, 3, 6, 1, 4, 5, 9, 2, 7} fixed_width_column_wrapper expect_vals1({2.4, 4.2, 4.}, no_nulls()); - auto agg1 = cudf::make_quantile_aggregation({0.4}, interpolation::LINEAR); + auto agg1 = cudf::make_quantile_aggregation({0.4}, interpolation::LINEAR); test_single_agg(keys, vals, expect_keys, expect_vals1, std::move(agg1)); // {0, 3, 6, 1, 4, 5, 9, 2, 7} fixed_width_column_wrapper expect_vals2({3, 4, 2}, no_nulls()); - auto agg2 = cudf::make_quantile_aggregation({0.4}, interpolation::NEAREST); + auto agg2 = cudf::make_quantile_aggregation({0.4}, interpolation::NEAREST); test_single_agg(keys, vals, expect_keys, expect_vals2, std::move(agg2)); // 
{0, 3, 6, 1, 4, 5, 9, 2, 7} fixed_width_column_wrapper expect_vals3({0, 4, 2}, no_nulls()); - auto agg3 = cudf::make_quantile_aggregation({0.4}, interpolation::LOWER); + auto agg3 = cudf::make_quantile_aggregation({0.4}, interpolation::LOWER); test_single_agg(keys, vals, expect_keys, expect_vals3, std::move(agg3)); // {0, 3, 6, 1, 4, 5, 9, 2, 7} fixed_width_column_wrapper expect_vals4({3, 5, 7}, no_nulls()); - auto agg4 = cudf::make_quantile_aggregation({0.4}, interpolation::HIGHER); + auto agg4 = cudf::make_quantile_aggregation({0.4}, interpolation::HIGHER); test_single_agg(keys, vals, expect_keys, expect_vals4, std::move(agg4)); // {0, 3, 6, 1, 4, 5, 9, 2, 7} fixed_width_column_wrapper expect_vals5({1.5, 4.5, 4.5}, no_nulls()); - auto agg5 = cudf::make_quantile_aggregation({0.4}, interpolation::MIDPOINT); + auto agg5 = cudf::make_quantile_aggregation({0.4}, interpolation::MIDPOINT); test_single_agg(keys, vals, expect_keys, expect_vals5, std::move(agg5)); // clang-format on } @@ -192,11 +193,12 @@ TYPED_TEST(groupby_quantile_test, dictionary) fixed_width_column_wrapper expect_vals({3., 4.5, 7.}, no_nulls()); // clang-format on - test_single_agg(keys, - vals, - expect_keys, - expect_vals, - cudf::make_quantile_aggregation({0.5}, interpolation::LINEAR)); + test_single_agg( + keys, + vals, + expect_keys, + expect_vals, + cudf::make_quantile_aggregation({0.5}, interpolation::LINEAR)); } } // namespace test diff --git a/cpp/tests/groupby/rank_scan_tests.cpp b/cpp/tests/groupby/rank_scan_tests.cpp index 51c4c1e63c2..37e75e2e906 100644 --- a/cpp/tests/groupby/rank_scan_tests.cpp +++ b/cpp/tests/groupby/rank_scan_tests.cpp @@ -39,11 +39,16 @@ inline void test_pair_rank_scans(column_view const& keys, order, keys, expected_dense, - make_dense_rank_aggregation(), + make_dense_rank_aggregation(), + null_policy::INCLUDE, + sorted::YES); + test_single_scan(keys, + order, + keys, + expected_rank, + make_rank_aggregation(), null_policy::INCLUDE, sorted::YES); - test_single_scan( - keys, order, keys, expected_rank, make_rank_aggregation(), null_policy::INCLUDE, sorted::YES); } struct groupby_rank_scan_test : public BaseFixture { @@ -201,11 +206,11 @@ TYPED_TEST(typed_groupby_rank_scan_test, mixedStructs) auto expected_rank_vals = fixed_width_column_wrapper{1, 1, 3, 3, 5, 6, 1, 1, 3, 1, 1, 3}; - std::vector requests; - requests.emplace_back(groupby::aggregation_request()); + std::vector requests; + requests.emplace_back(groupby::scan_request()); requests[0].values = *struct_col; - requests[0].aggregations.push_back(make_dense_rank_aggregation()); - requests[0].aggregations.push_back(make_rank_aggregation()); + requests[0].aggregations.push_back(make_dense_rank_aggregation()); + requests[0].aggregations.push_back(make_rank_aggregation()); groupby::groupby gb_obj(table_view({keys}), null_policy::INCLUDE, sorted::YES); auto result = gb_obj.scan(requests); @@ -377,34 +382,61 @@ TEST_F(groupby_rank_scan_test_failures, test_exception_triggers) fixed_width_column_wrapper col{3, 3, 1}; CUDF_EXPECT_THROW_MESSAGE( - test_single_scan( - keys, col, keys, col, make_dense_rank_aggregation(), null_policy::INCLUDE, sorted::NO), + test_single_scan(keys, + col, + keys, + col, + make_dense_rank_aggregation(), + null_policy::INCLUDE, + sorted::NO), "Dense rank aggregate in groupby scan requires the keys to be presorted"); - CUDF_EXPECT_THROW_MESSAGE( - test_single_scan( - keys, col, keys, col, make_rank_aggregation(), null_policy::INCLUDE, sorted::NO), - "Rank aggregate in groupby scan requires the keys to be presorted"); + 
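// Illustrative sketch: the change above routes rank/dense_rank through the
// new groupby::scan_request instead of aggregation_request, and the failure
// tests in this file assert that rank scans require pre-sorted keys. A
// minimal pre-sorted dense-rank scan might look as follows; the stripped
// template parameter on the factory is assumed to be
// cudf::groupby_scan_aggregation.
#include <cudf/aggregation.hpp>
#include <cudf/groupby.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf_test/column_wrapper.hpp>
#include <vector>

void dense_rank_scan_sketch()
{
  cudf::test::fixed_width_column_wrapper<int32_t> keys{0, 0, 0, 1, 1, 2};
  cudf::test::fixed_width_column_wrapper<int32_t> order{5, 5, 7, 3, 4, 9};

  std::vector<cudf::groupby::scan_request> requests(1);
  requests[0].values = order;
  requests[0].aggregations.push_back(
    cudf::make_dense_rank_aggregation<cudf::groupby_scan_aggregation>());

  // Keys must be pre-sorted, matching the exception messages tested here.
  cudf::groupby::groupby gb(
    cudf::table_view({keys}), cudf::null_policy::INCLUDE, cudf::sorted::YES);
  auto [grouped_keys, results] = gb.scan(requests);
  // Expected dense ranks per group: {1, 1, 2}, {1, 2}, {1}.
}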
CUDF_EXPECT_THROW_MESSAGE(test_single_scan(keys, + col, + keys, + col, + make_rank_aggregation(), + null_policy::INCLUDE, + sorted::NO), + "Rank aggregate in groupby scan requires the keys to be presorted"); CUDF_EXPECT_THROW_MESSAGE( - test_single_scan( - keys, col, keys, col, make_dense_rank_aggregation(), null_policy::EXCLUDE, sorted::YES), + test_single_scan(keys, + col, + keys, + col, + make_dense_rank_aggregation(), + null_policy::EXCLUDE, + sorted::YES), "Dense rank aggregate in groupby scan requires the keys to be presorted"); - CUDF_EXPECT_THROW_MESSAGE( - test_single_scan( - keys, col, keys, col, make_rank_aggregation(), null_policy::EXCLUDE, sorted::YES), - "Rank aggregate in groupby scan requires the keys to be presorted"); + CUDF_EXPECT_THROW_MESSAGE(test_single_scan(keys, + col, + keys, + col, + make_rank_aggregation(), + null_policy::EXCLUDE, + sorted::YES), + "Rank aggregate in groupby scan requires the keys to be presorted"); CUDF_EXPECT_THROW_MESSAGE( - test_single_scan( - keys, col, keys, col, make_dense_rank_aggregation(), null_policy::EXCLUDE, sorted::NO), + test_single_scan(keys, + col, + keys, + col, + make_dense_rank_aggregation(), + null_policy::EXCLUDE, + sorted::NO), "Dense rank aggregate in groupby scan requires the keys to be presorted"); - CUDF_EXPECT_THROW_MESSAGE( - test_single_scan( - keys, col, keys, col, make_rank_aggregation(), null_policy::EXCLUDE, sorted::NO), - "Rank aggregate in groupby scan requires the keys to be presorted"); + CUDF_EXPECT_THROW_MESSAGE(test_single_scan(keys, + col, + keys, + col, + make_rank_aggregation(), + null_policy::EXCLUDE, + sorted::NO), + "Rank aggregate in groupby scan requires the keys to be presorted"); } } // namespace test diff --git a/cpp/tests/groupby/std_tests.cpp b/cpp/tests/groupby/std_tests.cpp index c771971ad9a..e2edabf3e8f 100644 --- a/cpp/tests/groupby/std_tests.cpp +++ b/cpp/tests/groupby/std_tests.cpp @@ -53,7 +53,7 @@ TYPED_TEST(groupby_std_test, basic) fixed_width_column_wrapper expect_vals({3., sqrt(131./12), sqrt(31./3)}, no_nulls()); // clang-format on - auto agg = cudf::make_std_aggregation(); + auto agg = cudf::make_std_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -68,7 +68,7 @@ TYPED_TEST(groupby_std_test, empty_cols) fixed_width_column_wrapper expect_keys{}; fixed_width_column_wrapper expect_vals{}; - auto agg = cudf::make_std_aggregation(); + auto agg = cudf::make_std_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -83,7 +83,7 @@ TYPED_TEST(groupby_std_test, zero_valid_keys) fixed_width_column_wrapper expect_keys{}; fixed_width_column_wrapper expect_vals{}; - auto agg = cudf::make_std_aggregation(); + auto agg = cudf::make_std_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -98,7 +98,7 @@ TYPED_TEST(groupby_std_test, zero_valid_values) fixed_width_column_wrapper expect_keys{1}; fixed_width_column_wrapper expect_vals({0}, all_nulls()); - auto agg = cudf::make_std_aggregation(); + auto agg = cudf::make_std_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -118,7 +118,7 @@ TYPED_TEST(groupby_std_test, null_keys_and_values) fixed_width_column_wrapper expect_vals({3 / sqrt(2), 7 / sqrt(3), 3 * sqrt(2), 0.}, {1, 1, 1, 0}); - auto agg = cudf::make_std_aggregation(); + auto agg = cudf::make_std_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -137,7 +137,7 @@ TYPED_TEST(groupby_std_test, 
ddof_non_default) // { 3, 6, 1, 4, 9, 2, 8, 3} fixed_width_column_wrapper expect_vals({0., 7 * sqrt(2. / 3), 0., 0.}, {0, 1, 0, 0}); - auto agg = cudf::make_std_aggregation(2); + auto agg = cudf::make_std_aggregation(2); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -156,7 +156,8 @@ TYPED_TEST(groupby_std_test, dictionary) fixed_width_column_wrapper expect_vals({3., sqrt(131./12), sqrt(31./3)}, no_nulls()); // clang-format on - test_single_agg(keys, vals, expect_keys, expect_vals, cudf::make_std_aggregation()); + test_single_agg( + keys, vals, expect_keys, expect_vals, cudf::make_std_aggregation()); } } // namespace test diff --git a/cpp/tests/groupby/sum_of_squares_tests.cpp b/cpp/tests/groupby/sum_of_squares_tests.cpp index 12b044c7382..0dab2c6483e 100644 --- a/cpp/tests/groupby/sum_of_squares_tests.cpp +++ b/cpp/tests/groupby/sum_of_squares_tests.cpp @@ -49,7 +49,7 @@ TYPED_TEST(groupby_sum_of_squares_test, basic) // { 0, 3, 6, 1, 4, 5, 9, 2, 7, 8} fixed_width_column_wrapper expect_vals({45., 123., 117.}, no_nulls()); - auto agg = cudf::make_sum_of_squares_aggregation(); + auto agg = cudf::make_sum_of_squares_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -64,7 +64,7 @@ TYPED_TEST(groupby_sum_of_squares_test, empty_cols) fixed_width_column_wrapper expect_keys{}; fixed_width_column_wrapper expect_vals{}; - auto agg = cudf::make_sum_of_squares_aggregation(); + auto agg = cudf::make_sum_of_squares_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -79,7 +79,7 @@ TYPED_TEST(groupby_sum_of_squares_test, zero_valid_keys) fixed_width_column_wrapper expect_keys{}; fixed_width_column_wrapper expect_vals{}; - auto agg = cudf::make_sum_of_squares_aggregation(); + auto agg = cudf::make_sum_of_squares_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -94,7 +94,7 @@ TYPED_TEST(groupby_sum_of_squares_test, zero_valid_values) fixed_width_column_wrapper expect_keys{1}; fixed_width_column_wrapper expect_vals({0}, all_nulls()); - auto agg = cudf::make_sum_of_squares_aggregation(); + auto agg = cudf::make_sum_of_squares_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -113,7 +113,7 @@ TYPED_TEST(groupby_sum_of_squares_test, null_keys_and_values) // { 3, 6, 1, 4, 9, 2, 8, 3} fixed_width_column_wrapper expect_vals({45., 98., 68., 9.}, {1, 1, 1, 0}); - auto agg = cudf::make_sum_of_squares_aggregation(); + auto agg = cudf::make_sum_of_squares_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -132,7 +132,11 @@ TYPED_TEST(groupby_sum_of_squares_test, dictionary) fixed_width_column_wrapper expect_vals({45., 123., 117. 
}, no_nulls()); // clang-format on - test_single_agg(keys, vals, expect_keys, expect_vals, cudf::make_sum_of_squares_aggregation()); + test_single_agg(keys, + vals, + expect_keys, + expect_vals, + cudf::make_sum_of_squares_aggregation()); } } // namespace test diff --git a/cpp/tests/groupby/sum_scan_tests.cpp b/cpp/tests/groupby/sum_scan_tests.cpp index 2f1928747ae..86fc0238597 100644 --- a/cpp/tests/groupby/sum_scan_tests.cpp +++ b/cpp/tests/groupby/sum_scan_tests.cpp @@ -57,7 +57,7 @@ TYPED_TEST(groupby_sum_scan_test, basic) // {0, 3, 6, 1, 4, 5, 9, 2, 7, 8} result_wrapper expect_vals{0, 3, 9, 1, 5, 10, 19, 2, 9, 17}; // clang-format on - auto agg = cudf::make_sum_aggregation(); + auto agg = cudf::make_sum_aggregation(); test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -74,7 +74,7 @@ TYPED_TEST(groupby_sum_scan_test, empty_cols) result_wrapper expect_vals{}; // clang-format on - auto agg = cudf::make_sum_aggregation(); + auto agg = cudf::make_sum_aggregation(); test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -91,7 +91,7 @@ TYPED_TEST(groupby_sum_scan_test, zero_valid_keys) result_wrapper expect_vals{}; // clang-format on - auto agg = cudf::make_sum_aggregation(); + auto agg = cudf::make_sum_aggregation(); test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -108,7 +108,7 @@ TYPED_TEST(groupby_sum_scan_test, zero_valid_values) result_wrapper expect_vals({3, 4, 5}, all_nulls()); // clang-format on - auto agg = cudf::make_sum_aggregation(); + auto agg = cudf::make_sum_aggregation(); test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -128,7 +128,7 @@ TYPED_TEST(groupby_sum_scan_test, null_keys_and_values) { 0, 1, 1, 1, 1, 0, 1, 1, 1, 0}); // clang-format on - auto agg = cudf::make_sum_aggregation(); + auto agg = cudf::make_sum_aggregation(); test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -156,7 +156,7 @@ TYPED_TEST(FixedPointTestBothReps, GroupBySortSumScanDecimalAsValue) auto const expect_vals_sum = out_fp_wrapper{{0, 3, 9, 1, 5, 10, 19, 2, 9, 17}, scale}; // clang-format on - auto agg2 = cudf::make_sum_aggregation(); + auto agg2 = cudf::make_sum_aggregation(); test_single_scan(keys, vals, expect_keys, expect_vals_sum, std::move(agg2)); } } diff --git a/cpp/tests/groupby/sum_tests.cpp b/cpp/tests/groupby/sum_tests.cpp index 458937ff2e4..5c935ee5a9d 100644 --- a/cpp/tests/groupby/sum_tests.cpp +++ b/cpp/tests/groupby/sum_tests.cpp @@ -49,10 +49,10 @@ TYPED_TEST(groupby_sum_test, basic) fixed_width_column_wrapper expect_keys{1, 2, 3}; fixed_width_column_wrapper expect_vals{9, 19, 17}; - auto agg = cudf::make_sum_aggregation(); + auto agg = cudf::make_sum_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg2 = cudf::make_sum_aggregation(); + auto agg2 = cudf::make_sum_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES); } @@ -67,10 +67,10 @@ TYPED_TEST(groupby_sum_test, empty_cols) fixed_width_column_wrapper expect_keys{}; fixed_width_column_wrapper expect_vals{}; - auto agg = cudf::make_sum_aggregation(); + auto agg = cudf::make_sum_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg2 = cudf::make_sum_aggregation(); + auto agg2 = cudf::make_sum_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES); } @@ -85,10 +85,10 @@ TYPED_TEST(groupby_sum_test, 
zero_valid_keys) fixed_width_column_wrapper expect_keys{}; fixed_width_column_wrapper expect_vals{}; - auto agg = cudf::make_sum_aggregation(); + auto agg = cudf::make_sum_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg2 = cudf::make_sum_aggregation(); + auto agg2 = cudf::make_sum_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES); } @@ -103,10 +103,10 @@ TYPED_TEST(groupby_sum_test, zero_valid_values) fixed_width_column_wrapper expect_keys{1}; fixed_width_column_wrapper expect_vals({0}, all_nulls()); - auto agg = cudf::make_sum_aggregation(); + auto agg = cudf::make_sum_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg2 = cudf::make_sum_aggregation(); + auto agg2 = cudf::make_sum_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES); } @@ -125,10 +125,10 @@ TYPED_TEST(groupby_sum_test, null_keys_and_values) // { 3, 6, 1, 4, 9, 2, 8, -} fixed_width_column_wrapper expect_vals({9, 14, 10, 0}, {1, 1, 1, 0}); - auto agg = cudf::make_sum_aggregation(); + auto agg = cudf::make_sum_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg2 = cudf::make_sum_aggregation(); + auto agg2 = cudf::make_sum_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES); } // clang-format on @@ -146,9 +146,14 @@ TYPED_TEST(groupby_sum_test, dictionary) fixed_width_column_wrapper expect_vals{ 9, 19, 17}; // clang-format on - test_single_agg(keys, vals, expect_keys, expect_vals, cudf::make_sum_aggregation()); test_single_agg( - keys, vals, expect_keys, expect_vals, cudf::make_sum_aggregation(), force_use_sort_impl::YES); + keys, vals, expect_keys, expect_vals, cudf::make_sum_aggregation()); + test_single_agg(keys, + vals, + expect_keys, + expect_vals, + cudf::make_sum_aggregation(), + force_use_sort_impl::YES); } template @@ -176,11 +181,11 @@ TYPED_TEST(FixedPointTestBothReps, GroupBySortSumDecimalAsValue) auto const expect_keys = fixed_width_column_wrapper{1, 2, 3}; auto const expect_vals_sum = fp64_wrapper{{9, 19, 17}, scale}; - auto agg1 = cudf::make_sum_aggregation(); + auto agg1 = cudf::make_sum_aggregation(); test_single_agg( keys, vals, expect_keys, expect_vals_sum, std::move(agg1), force_use_sort_impl::YES); - auto agg4 = cudf::make_product_aggregation(); + auto agg4 = cudf::make_product_aggregation(); EXPECT_THROW( test_single_agg(keys, vals, expect_keys, {}, std::move(agg4), force_use_sort_impl::YES), cudf::logic_error); @@ -206,10 +211,10 @@ TYPED_TEST(FixedPointTestBothReps, GroupByHashSumDecimalAsValue) auto const expect_keys = fixed_width_column_wrapper{1, 2, 3}; auto const expect_vals_sum = fp64_wrapper{{9, 19, 17}, scale}; - auto agg5 = cudf::make_sum_aggregation(); + auto agg5 = cudf::make_sum_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals_sum, std::move(agg5)); - auto agg8 = cudf::make_product_aggregation(); + auto agg8 = cudf::make_product_aggregation(); EXPECT_THROW(test_single_agg(keys, vals, expect_keys, {}, std::move(agg8)), cudf::logic_error); } } diff --git a/cpp/tests/groupby/var_tests.cpp b/cpp/tests/groupby/var_tests.cpp index c3fc781801d..68ccf791960 100644 --- a/cpp/tests/groupby/var_tests.cpp +++ b/cpp/tests/groupby/var_tests.cpp @@ -53,7 +53,7 @@ TYPED_TEST(groupby_var_test, basic) fixed_width_column_wrapper expect_vals({9., 131. / 12, 31. 
/ 3}, no_nulls()); // clang-format on - auto agg = cudf::make_variance_aggregation(); + auto agg = cudf::make_variance_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -68,7 +68,7 @@ TYPED_TEST(groupby_var_test, empty_cols) fixed_width_column_wrapper expect_keys{}; fixed_width_column_wrapper expect_vals{}; - auto agg = cudf::make_variance_aggregation(); + auto agg = cudf::make_variance_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -83,7 +83,7 @@ TYPED_TEST(groupby_var_test, zero_valid_keys) fixed_width_column_wrapper expect_keys{}; fixed_width_column_wrapper expect_vals{}; - auto agg = cudf::make_variance_aggregation(); + auto agg = cudf::make_variance_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -98,7 +98,7 @@ TYPED_TEST(groupby_var_test, zero_valid_values) fixed_width_column_wrapper expect_keys{1}; fixed_width_column_wrapper expect_vals({0}, all_nulls()); - auto agg = cudf::make_variance_aggregation(); + auto agg = cudf::make_variance_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -119,7 +119,7 @@ TYPED_TEST(groupby_var_test, null_keys_and_values) fixed_width_column_wrapper expect_vals({4.5, 49. / 3, 18., 0.}, {1, 1, 1, 0}); // clang-format on - auto agg = cudf::make_variance_aggregation(); + auto agg = cudf::make_variance_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -141,7 +141,7 @@ TYPED_TEST(groupby_var_test, ddof_non_default) {0, 1, 0, 0}); // clang-format on - auto agg = cudf::make_variance_aggregation(2); + auto agg = cudf::make_variance_aggregation(2); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -160,7 +160,11 @@ TYPED_TEST(groupby_var_test, dictionary) fixed_width_column_wrapper expect_vals({9., 131./12, 31./3 }, no_nulls()); // clang-format on - test_single_agg(keys, vals, expect_keys, expect_vals, cudf::make_variance_aggregation()); + test_single_agg(keys, + vals, + expect_keys, + expect_vals, + cudf::make_variance_aggregation()); } } // namespace test diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp index 94f01fd62f3..5b6270a8be1 100644 --- a/cpp/tests/io/csv_test.cpp +++ b/cpp/tests/io/csv_test.cpp @@ -50,6 +50,16 @@ namespace cudf_io = cudf::io; +using cudf::data_type; +using cudf::type_id; +using cudf::type_to_id; + +template +auto dtype() +{ + return data_type{type_to_id()}; +} + template using column_wrapper = typename std::conditional, @@ -80,7 +90,6 @@ struct CsvReaderTest : public cudf::test::BaseFixture { // Typed test fixture for timestamp type tests template struct CsvReaderNumericTypeTest : public CsvReaderTest { - auto type() { return cudf::data_type{cudf::type_to_id()}; } }; // Declare typed test cases @@ -93,8 +102,8 @@ struct CsvFixedPointReaderTest : public CsvReaderTest { void run_tests(const std::vector& reference_strings, numeric::scale_type scale) { cudf::test::strings_column_wrapper strings(reference_strings.begin(), reference_strings.end()); - auto input_column = cudf::strings::to_fixed_point( - cudf::strings_column_view(strings), cudf::data_type{cudf::type_to_id(), scale}); + auto input_column = cudf::strings::to_fixed_point(cudf::strings_column_view(strings), + data_type{type_to_id(), scale}); std::string buffer = std::accumulate(reference_strings.begin(), reference_strings.end(), @@ -105,7 +114,7 @@ struct CsvFixedPointReaderTest : public CsvReaderTest { cudf_io::csv_reader_options 
in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{buffer.c_str(), buffer.size()}) - .dtypes({cudf::data_type{cudf::type_to_id(), scale}}) + .dtypes({data_type{type_to_id(), scale}}) .header(-1); const auto result = cudf_io::read_csv(in_opts); @@ -389,9 +398,9 @@ TYPED_TEST(CsvFixedPointWriterTest, SingleColumnNegativeScale) reference_strings = valid_reference_strings; using DecimalType = TypeParam; - auto input_column = cudf::strings::to_fixed_point( - cudf::strings_column_view(strings), - cudf::data_type{cudf::type_to_id(), numeric::scale_type{-2}}); + auto input_column = + cudf::strings::to_fixed_point(cudf::strings_column_view(strings), + data_type{type_to_id(), numeric::scale_type{-2}}); auto input_table = cudf::table_view{std::vector{*input_column}}; @@ -406,7 +415,7 @@ TYPED_TEST(CsvFixedPointWriterTest, SingleColumnNegativeScale) result_strings.reserve(reference_strings.size()); std::ifstream read_result_file(filepath); - assert(read_result_file.is_open()); + ASSERT_TRUE(read_result_file.is_open()); std::copy(std::istream_iterator(read_result_file), std::istream_iterator(), @@ -435,9 +444,9 @@ TYPED_TEST(CsvFixedPointWriterTest, SingleColumnPositiveScale) reference_strings = valid_reference_strings; using DecimalType = TypeParam; - auto input_column = cudf::strings::to_fixed_point( - cudf::strings_column_view(strings), - cudf::data_type{cudf::type_to_id(), numeric::scale_type{3}}); + auto input_column = + cudf::strings::to_fixed_point(cudf::strings_column_view(strings), + data_type{type_to_id(), numeric::scale_type{3}}); auto input_table = cudf::table_view{std::vector{*input_column}}; @@ -452,7 +461,7 @@ TYPED_TEST(CsvFixedPointWriterTest, SingleColumnPositiveScale) result_strings.reserve(reference_strings.size()); std::ifstream read_result_file(filepath); - assert(read_result_file.is_open()); + ASSERT_TRUE(read_result_file.is_open()); std::copy(std::istream_iterator(read_result_file), std::istream_iterator(), @@ -479,11 +488,10 @@ TEST_F(CsvReaderTest, MultiColumn) { std::ostringstream line; for (int i = 0; i < num_rows; ++i) { - line << std::to_string(int8_values[i]) << "," << int16_values[i] << "," << int16_values[i] - << "," << int32_values[i] << "," << int32_values[i] << "," << int64_values[i] << "," - << int64_values[i] << "," << std::to_string(uint8_values[i]) << "," << uint16_values[i] - << "," << uint32_values[i] << "," << uint64_values[i] << "," << float32_values[i] << "," - << float32_values[i] << "," << float64_values[i] << "," << float64_values[i] << "\n"; + line << std::to_string(int8_values[i]) << "," << int16_values[i] << "," << int32_values[i] + << "," << int64_values[i] << "," << std::to_string(uint8_values[i]) << "," + << uint16_values[i] << "," << uint32_values[i] << "," << uint64_values[i] << "," + << float32_values[i] << "," << float64_values[i] << "\n"; } std::ofstream outfile(filepath, std::ofstream::out); outfile << line.str(); @@ -492,39 +500,29 @@ TEST_F(CsvReaderTest, MultiColumn) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .header(-1) - .dtypes(std::vector{"int8", - "short", - "int16", - "int", - "int32", - "long", - "int64", - "uint8", - "uint16", - "uint32", - "uint64", - "float", - "float32", - "double", - "float64"}); + .dtypes({dtype(), + dtype(), + dtype(), + dtype(), + dtype(), + dtype(), + dtype(), + dtype(), + dtype(), + dtype()}); auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); expect_column_data_equal(int8_values, view.column(0)); 
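// Illustrative sketch: the new dtype() test helper introduced above (its
// template parameter is stripped by the diff rendering) presumably reads as
// below, and feeds the reworked dtypes() API that now takes cudf::data_type
// values instead of dtype name strings. The column names and element types
// in the reader call are hypothetical.
#include <cudf/io/csv.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/type_dispatcher.hpp>
#include <string>

template <typename T>
auto dtype()
{
  return cudf::data_type{cudf::type_to_id<T>()};
}

void read_csv_with_data_types(std::string const& filepath)
{
  cudf::io::csv_reader_options opts =
    cudf::io::csv_reader_options::builder(cudf::io::source_info{filepath})
      .names({"A", "B"})
      .dtypes({dtype<int32_t>(), dtype<float>()})  // previously: {"int32", "float32"}
      .header(-1);
  auto result = cudf::io::read_csv(opts);
  // result.tbl holds an INT32 column "A" and a FLOAT32 column "B".
}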
expect_column_data_equal(int16_values, view.column(1)); - expect_column_data_equal(int16_values, view.column(2)); - expect_column_data_equal(int32_values, view.column(3)); - expect_column_data_equal(int32_values, view.column(4)); - expect_column_data_equal(int64_values, view.column(5)); - expect_column_data_equal(int64_values, view.column(6)); - expect_column_data_equal(uint8_values, view.column(7)); - expect_column_data_equal(uint16_values, view.column(8)); - expect_column_data_equal(uint32_values, view.column(9)); - expect_column_data_equal(uint64_values, view.column(10)); - expect_column_data_equal(float32_values, view.column(11)); - expect_column_data_equal(float32_values, view.column(12)); - expect_column_data_equal(float64_values, view.column(13)); - expect_column_data_equal(float64_values, view.column(14)); + expect_column_data_equal(int32_values, view.column(2)); + expect_column_data_equal(int64_values, view.column(3)); + expect_column_data_equal(uint8_values, view.column(4)); + expect_column_data_equal(uint16_values, view.column(5)); + expect_column_data_equal(uint32_values, view.column(6)); + expect_column_data_equal(uint64_values, view.column(7)); + expect_column_data_equal(float32_values, view.column(8)); + expect_column_data_equal(float64_values, view.column(9)); } TEST_F(CsvReaderTest, RepeatColumn) @@ -549,7 +547,7 @@ TEST_F(CsvReaderTest, RepeatColumn) // repeats column in indexes and names, misses 1 column. cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) - .dtypes(std::vector{"int16", "int64", "uint64", "float"}) + .dtypes({dtype(), dtype(), dtype(), dtype()}) .names({"A", "B", "C", "D"}) .use_cols_indexes({1, 0, 0}) .use_cols_names({"D", "B", "B"}) @@ -575,7 +573,7 @@ TEST_F(CsvReaderTest, Booleans) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names({"A", "B", "C", "D"}) - .dtypes(std::vector{"int32", "int32", "short", "bool"}) + .dtypes({dtype(), dtype(), dtype(), dtype()}) .true_values({"yes", "Yes", "YES", "foo", "FOO"}) .false_values({"no", "No", "NO", "Bar", "bar"}) .header(-1); @@ -584,10 +582,10 @@ TEST_F(CsvReaderTest, Booleans) // Booleans are the same (integer) data type, but valued at 0 or 1 const auto view = result.tbl->view(); EXPECT_EQ(4, view.num_columns()); - ASSERT_EQ(cudf::type_id::INT32, view.column(0).type().id()); - ASSERT_EQ(cudf::type_id::INT32, view.column(1).type().id()); - ASSERT_EQ(cudf::type_id::INT16, view.column(2).type().id()); - ASSERT_EQ(cudf::type_id::BOOL8, view.column(3).type().id()); + ASSERT_EQ(type_id::INT32, view.column(0).type().id()); + ASSERT_EQ(type_id::INT32, view.column(1).type().id()); + ASSERT_EQ(type_id::INT16, view.column(2).type().id()); + ASSERT_EQ(type_id::BOOL8, view.column(3).type().id()); expect_column_data_equal(std::vector{1, 0, 0, 0, 1}, view.column(0)); expect_column_data_equal(std::vector{0, 1, 1, 0, 1}, view.column(2)); @@ -607,14 +605,14 @@ TEST_F(CsvReaderTest, Dates) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names({"A"}) - .dtypes(std::vector{"date"}) + .dtypes({data_type{type_id::TIMESTAMP_MILLISECONDS}}) .dayfirst(true) .header(-1); auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); EXPECT_EQ(1, view.num_columns()); - ASSERT_EQ(cudf::type_id::TIMESTAMP_MILLISECONDS, view.column(0).type().id()); + ASSERT_EQ(type_id::TIMESTAMP_MILLISECONDS, view.column(0).type().id()); using namespace 
cuda::std::chrono_literals; expect_column_data_equal(std::vector{cudf::timestamp_ms{983750400000ms}, @@ -643,15 +641,14 @@ TEST_F(CsvReaderTest, DatesCastToTimestampSeconds) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names({"A"}) - .dtypes(std::vector{"date"}) + .dtypes({data_type{type_id::TIMESTAMP_SECONDS}}) .dayfirst(true) - .header(-1) - .timestamp_type(cudf::data_type{cudf::type_id::TIMESTAMP_SECONDS}); + .header(-1); auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); EXPECT_EQ(1, view.num_columns()); - ASSERT_EQ(cudf::type_id::TIMESTAMP_SECONDS, view.column(0).type().id()); + ASSERT_EQ(type_id::TIMESTAMP_SECONDS, view.column(0).type().id()); using namespace cuda::std::chrono_literals; expect_column_data_equal(std::vector{cudf::timestamp_s{983750400s}, @@ -680,15 +677,14 @@ TEST_F(CsvReaderTest, DatesCastToTimestampMilliSeconds) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names({"A"}) - .dtypes(std::vector{"date"}) + .dtypes({data_type{type_id::TIMESTAMP_MILLISECONDS}}) .dayfirst(true) - .header(-1) - .timestamp_type(cudf::data_type{cudf::type_id::TIMESTAMP_MILLISECONDS}); + .header(-1); auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); EXPECT_EQ(1, view.num_columns()); - ASSERT_EQ(cudf::type_id::TIMESTAMP_MILLISECONDS, view.column(0).type().id()); + ASSERT_EQ(type_id::TIMESTAMP_MILLISECONDS, view.column(0).type().id()); using namespace cuda::std::chrono_literals; expect_column_data_equal(std::vector{cudf::timestamp_ms{983750400000ms}, @@ -717,15 +713,14 @@ TEST_F(CsvReaderTest, DatesCastToTimestampMicroSeconds) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names({"A"}) - .dtypes(std::vector{"date"}) + .dtypes({data_type{type_id::TIMESTAMP_MICROSECONDS}}) .dayfirst(true) - .header(-1) - .timestamp_type(cudf::data_type{cudf::type_id::TIMESTAMP_MICROSECONDS}); + .header(-1); auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); EXPECT_EQ(1, view.num_columns()); - ASSERT_EQ(cudf::type_id::TIMESTAMP_MICROSECONDS, view.column(0).type().id()); + ASSERT_EQ(type_id::TIMESTAMP_MICROSECONDS, view.column(0).type().id()); using namespace cuda::std::chrono_literals; expect_column_data_equal(std::vector{cudf::timestamp_us{983750400000000us}, @@ -754,15 +749,14 @@ TEST_F(CsvReaderTest, DatesCastToTimestampNanoSeconds) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names({"A"}) - .dtypes(std::vector{"date"}) + .dtypes({data_type{type_id::TIMESTAMP_NANOSECONDS}}) .dayfirst(true) - .header(-1) - .timestamp_type(cudf::data_type{cudf::type_id::TIMESTAMP_NANOSECONDS}); + .header(-1); auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); EXPECT_EQ(1, view.num_columns()); - ASSERT_EQ(cudf::type_id::TIMESTAMP_NANOSECONDS, view.column(0).type().id()); + ASSERT_EQ(type_id::TIMESTAMP_NANOSECONDS, view.column(0).type().id()); using namespace cuda::std::chrono_literals; expect_column_data_equal( @@ -795,14 +789,13 @@ TEST_F(CsvReaderTest, IntegersCastToTimestampSeconds) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names({"A"}) - .dtypes(std::vector{"datetime64[s]"}) - .header(-1) - .timestamp_type(cudf::data_type{cudf::type_id::TIMESTAMP_SECONDS}); + 
.dtypes({data_type{type_id::TIMESTAMP_SECONDS}}) + .header(-1); auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); EXPECT_EQ(1, view.num_columns()); - ASSERT_EQ(cudf::type_id::TIMESTAMP_SECONDS, view.column(0).type().id()); + ASSERT_EQ(type_id::TIMESTAMP_SECONDS, view.column(0).type().id()); using namespace cuda::std::chrono_literals; CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_column, view.column(0)); @@ -824,14 +817,13 @@ TEST_F(CsvReaderTest, IntegersCastToTimestampMilliSeconds) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names({"A"}) - .dtypes(std::vector{"datetime64[ms]"}) - .header(-1) - .timestamp_type(cudf::data_type{cudf::type_id::TIMESTAMP_MILLISECONDS}); + .dtypes({data_type{type_id::TIMESTAMP_MILLISECONDS}}) + .header(-1); auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); EXPECT_EQ(1, view.num_columns()); - ASSERT_EQ(cudf::type_id::TIMESTAMP_MILLISECONDS, view.column(0).type().id()); + ASSERT_EQ(type_id::TIMESTAMP_MILLISECONDS, view.column(0).type().id()); using namespace cuda::std::chrono_literals; CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_column, view.column(0)); @@ -853,14 +845,13 @@ TEST_F(CsvReaderTest, IntegersCastToTimestampMicroSeconds) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names({"A"}) - .dtypes(std::vector{"datetime64[us]"}) - .header(-1) - .timestamp_type(cudf::data_type{cudf::type_id::TIMESTAMP_MICROSECONDS}); + .dtypes({data_type{type_id::TIMESTAMP_MICROSECONDS}}) + .header(-1); auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); EXPECT_EQ(1, view.num_columns()); - ASSERT_EQ(cudf::type_id::TIMESTAMP_MICROSECONDS, view.column(0).type().id()); + ASSERT_EQ(type_id::TIMESTAMP_MICROSECONDS, view.column(0).type().id()); using namespace cuda::std::chrono_literals; CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_column, view.column(0)); @@ -882,14 +873,13 @@ TEST_F(CsvReaderTest, IntegersCastToTimestampNanoSeconds) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names({"A"}) - .dtypes(std::vector{"datetime64[ns]"}) - .header(-1) - .timestamp_type(cudf::data_type{cudf::type_id::TIMESTAMP_NANOSECONDS}); + .dtypes({data_type{type_id::TIMESTAMP_NANOSECONDS}}) + .header(-1); auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); EXPECT_EQ(1, view.num_columns()); - ASSERT_EQ(cudf::type_id::TIMESTAMP_NANOSECONDS, view.column(0).type().id()); + ASSERT_EQ(type_id::TIMESTAMP_NANOSECONDS, view.column(0).type().id()); using namespace cuda::std::chrono_literals; CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_column, view.column(0)); @@ -907,14 +897,14 @@ TEST_F(CsvReaderTest, FloatingPoint) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names({"A"}) - .dtypes(std::vector{"float32"}) + .dtypes({dtype()}) .lineterminator(';') .header(-1); auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); EXPECT_EQ(1, view.num_columns()); - ASSERT_EQ(cudf::type_id::FLOAT32, view.column(0).type().id()); + ASSERT_EQ(type_id::FLOAT32, view.column(0).type().id()); const auto ref_vals = std::vector{5.6, 56.79, 12000000000, 0.7, 3.000, 12.34, 0.31, -73.98007199999998}; @@ -940,14 +930,14 @@ TEST_F(CsvReaderTest, Strings) cudf_io::csv_reader_options in_opts = 
cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names(names) - .dtypes(std::vector{"int32", "str"}) + .dtypes(std::vector{dtype(), dtype()}) .quoting(cudf_io::quote_style::NONE); auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); EXPECT_EQ(2, view.num_columns()); - ASSERT_EQ(cudf::type_id::INT32, view.column(0).type().id()); - ASSERT_EQ(cudf::type_id::STRING, view.column(1).type().id()); + ASSERT_EQ(type_id::INT32, view.column(0).type().id()); + ASSERT_EQ(type_id::STRING, view.column(1).type().id()); expect_column_data_equal( std::vector{"abc def ghi", "\"jkl mno pqr\"", "stu \"\"vwx\"\" yz"}, @@ -970,14 +960,14 @@ TEST_F(CsvReaderTest, StringsQuotes) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names(names) - .dtypes(std::vector{"int32", "str"}) + .dtypes(std::vector{dtype(), dtype()}) .quotechar('`'); auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); EXPECT_EQ(2, view.num_columns()); - ASSERT_EQ(cudf::type_id::INT32, view.column(0).type().id()); - ASSERT_EQ(cudf::type_id::STRING, view.column(1).type().id()); + ASSERT_EQ(type_id::INT32, view.column(0).type().id()); + ASSERT_EQ(type_id::STRING, view.column(1).type().id()); expect_column_data_equal( std::vector{"abc,\ndef, ghi", "jkl, `mno`, pqr", "stu `vwx` yz"}, view.column(1)); @@ -999,15 +989,15 @@ TEST_F(CsvReaderTest, StringsQuotesIgnored) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names(names) - .dtypes(std::vector{"int32", "str"}) + .dtypes(std::vector{dtype(), dtype()}) .quoting(cudf_io::quote_style::NONE) .doublequote(false); auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); EXPECT_EQ(2, view.num_columns()); - ASSERT_EQ(cudf::type_id::INT32, view.column(0).type().id()); - ASSERT_EQ(cudf::type_id::STRING, view.column(1).type().id()); + ASSERT_EQ(type_id::INT32, view.column(0).type().id()); + ASSERT_EQ(type_id::STRING, view.column(1).type().id()); expect_column_data_equal( std::vector{"\"abcdef ghi\"", "\"jkl \"\"mno\"\" pqr\"", "stu \"vwx\" yz"}, @@ -1025,7 +1015,7 @@ TEST_F(CsvReaderTest, SkiprowsNrows) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names({"A"}) - .dtypes(std::vector{"int32"}) + .dtypes({dtype()}) .header(1) .skiprows(2) .nrows(2); @@ -1033,7 +1023,7 @@ TEST_F(CsvReaderTest, SkiprowsNrows) const auto view = result.tbl->view(); EXPECT_EQ(1, view.num_columns()); - ASSERT_EQ(cudf::type_id::INT32, view.column(0).type().id()); + ASSERT_EQ(type_id::INT32, view.column(0).type().id()); expect_column_data_equal(std::vector{5, 6}, view.column(0)); } @@ -1049,7 +1039,7 @@ TEST_F(CsvReaderTest, ByteRange) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names({"A"}) - .dtypes(std::vector{"int32"}) + .dtypes({dtype()}) .header(-1) .byte_range_offset(11) .byte_range_size(15); @@ -1057,7 +1047,7 @@ TEST_F(CsvReaderTest, ByteRange) const auto view = result.tbl->view(); EXPECT_EQ(1, view.num_columns()); - ASSERT_EQ(cudf::type_id::INT32, view.column(0).type().id()); + ASSERT_EQ(type_id::INT32, view.column(0).type().id()); expect_column_data_equal(std::vector{4000, 5000, 6000}, view.column(0)); } @@ -1068,14 +1058,14 @@ TEST_F(CsvReaderTest, ByteRangeStrings) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{input.c_str(), 
input.size()}) .names({"A"}) - .dtypes(std::vector{"str"}) + .dtypes({dtype()}) .header(-1) .byte_range_offset(4); auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); EXPECT_EQ(1, view.num_columns()); - ASSERT_EQ(cudf::type_id::STRING, view.column(0).type().id()); + ASSERT_EQ(type_id::STRING, view.column(0).type().id()); expect_column_data_equal(std::vector{"c"}, view.column(0)); } @@ -1091,14 +1081,14 @@ TEST_F(CsvReaderTest, BlanksAndComments) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names({"A"}) - .dtypes(std::vector{"int32"}) + .dtypes({dtype()}) .header(-1) .comment('#'); auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); EXPECT_EQ(1, view.num_columns()); - ASSERT_EQ(cudf::type_id::INT32, view.column(0).type().id()); + ASSERT_EQ(type_id::INT32, view.column(0).type().id()); expect_column_data_equal(std::vector{1, 3, 4, 5, 8, 9}, view.column(0)); } @@ -1166,12 +1156,12 @@ TEST_F(CsvReaderTest, ArrowFileSource) auto arrow_source = cudf_io::arrow_io_source{infile}; cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{&arrow_source}) - .dtypes(std::vector{"int8"}); + .dtypes({dtype()}); auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); EXPECT_EQ(1, view.num_columns()); - ASSERT_EQ(cudf::type_id::INT8, view.column(0).type().id()); + ASSERT_EQ(type_id::INT8, view.column(0).type().id()); expect_column_data_equal(std::vector{9, 8, 7, 6, 5, 4, 3, 2}, view.column(0)); } @@ -1187,13 +1177,13 @@ TEST_F(CsvReaderTest, InvalidFloatingPoint) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names({"A"}) - .dtypes(std::vector{"float32"}) + .dtypes({dtype()}) .header(-1); const auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); EXPECT_EQ(1, view.num_columns()); - ASSERT_EQ(cudf::type_id::FLOAT32, view.column(0).type().id()); + ASSERT_EQ(type_id::FLOAT32, view.column(0).type().id()); const auto col_data = cudf::test::to_host(view.column(0)); // col_data.first contains the column data @@ -1212,7 +1202,7 @@ TEST_F(CsvReaderTest, StringInference) const auto result = cudf_io::read_csv(in_opts); EXPECT_EQ(result.tbl->num_columns(), 1); - EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::STRING); + EXPECT_EQ(result.tbl->get_column(0).type().id(), type_id::STRING); } TEST_F(CsvReaderTest, TypeInferenceThousands) @@ -1226,9 +1216,9 @@ TEST_F(CsvReaderTest, TypeInferenceThousands) const auto result_view = result.tbl->view(); EXPECT_EQ(result_view.num_columns(), 3); - EXPECT_EQ(result_view.column(0).type().id(), cudf::type_id::INT64); - EXPECT_EQ(result_view.column(1).type().id(), cudf::type_id::INT64); - EXPECT_EQ(result_view.column(2).type().id(), cudf::type_id::FLOAT64); + EXPECT_EQ(result_view.column(0).type().id(), type_id::INT64); + EXPECT_EQ(result_view.column(1).type().id(), type_id::INT64); + EXPECT_EQ(result_view.column(2).type().id(), type_id::FLOAT64); auto tsnd_sep_col = std::vector{1400L, 123456L}; auto int_col = std::vector{123L, 123456L}; @@ -1254,9 +1244,9 @@ TEST_F(CsvReaderTest, TypeInferenceWithDecimal) const auto result_view = result.tbl->view(); EXPECT_EQ(result_view.num_columns(), 3); - EXPECT_EQ(result_view.column(0).type().id(), cudf::type_id::INT64); - EXPECT_EQ(result_view.column(1).type().id(), cudf::type_id::STRING); - EXPECT_EQ(result_view.column(2).type().id(), cudf::type_id::FLOAT64); + 
EXPECT_EQ(result_view.column(0).type().id(), type_id::INT64); + EXPECT_EQ(result_view.column(1).type().id(), type_id::STRING); + EXPECT_EQ(result_view.column(2).type().id(), type_id::FLOAT64); auto int_col = std::vector{1400L, 123456L}; auto str_col = std::vector{"1.23", "123.456"}; @@ -1296,7 +1286,7 @@ TEST_F(CsvReaderTest, nullHandling) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .na_filter(false) - .dtypes(std::vector{"str"}) + .dtypes({dtype()}) .header(-1) .skip_blank_lines(false); const auto result = cudf_io::read_csv(in_opts); @@ -1310,7 +1300,7 @@ TEST_F(CsvReaderTest, nullHandling) { cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) - .dtypes(std::vector{"str"}) + .dtypes({dtype()}) .header(-1) .skip_blank_lines(false); const auto result = cudf_io::read_csv(in_opts); @@ -1327,7 +1317,7 @@ TEST_F(CsvReaderTest, nullHandling) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .na_values({"Null"}) - .dtypes(std::vector{"str"}) + .dtypes({dtype()}) .header(-1) .skip_blank_lines(false); const auto result = cudf_io::read_csv(in_opts); @@ -1345,7 +1335,7 @@ TEST_F(CsvReaderTest, nullHandling) cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .keep_default_na(false) .na_values({"Null"}) - .dtypes(std::vector{"str"}) + .dtypes({dtype()}) .header(-1) .skip_blank_lines(false); const auto result = cudf_io::read_csv(in_opts); @@ -1477,16 +1467,35 @@ TEST_F(CsvReaderTest, HexTest) std::ofstream outfile(filepath, std::ofstream::out); outfile << "0x0\n-0x1000\n0xfedcba\n0xABCDEF\n0xaBcDeF\n9512c20b\n"; } + // specify hex columns by name + { + cudf_io::csv_reader_options in_opts = + cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) + .names({"A"}) + .dtypes({dtype()}) + .header(-1) + .parse_hex({"A"}); + auto result = cudf_io::read_csv(in_opts); - cudf_io::csv_reader_options in_opts = - cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) - .names({"A"}) - .dtypes(std::vector{"hex"}) - .header(-1); - auto result = cudf_io::read_csv(in_opts); + expect_column_data_equal( + std::vector{0, -4096, 16702650, 11259375, 11259375, 2501034507}, + result.tbl->view().column(0)); + } + + // specify hex columns by index + { + cudf_io::csv_reader_options in_opts = + cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) + .names({"A"}) + .dtypes({dtype()}) + .header(-1) + .parse_hex(std::vector{0}); + auto result = cudf_io::read_csv(in_opts); - expect_column_data_equal(std::vector{0, -4096, 16702650, 11259375, 11259375, 2501034507}, - result.tbl->view().column(0)); + expect_column_data_equal( + std::vector{0, -4096, 16702650, 11259375, 11259375, 2501034507}, + result.tbl->view().column(0)); + } } TYPED_TEST(CsvReaderNumericTypeTest, SingleColumnWithWriter) @@ -1555,18 +1564,13 @@ TEST_F(CsvReaderTest, MultiColumnWithWriter) std::vector input_columns{int8_column, int16_column, - int16_column, - int32_column, int32_column, int64_column, - int64_column, uint8_column, uint16_column, uint32_column, uint64_column, float32_column, - float32_column, - float64_column, float64_column}; cudf::table_view input_table{input_columns}; @@ -1577,26 +1581,21 @@ TEST_F(CsvReaderTest, MultiColumnWithWriter) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .header(-1) - .dtypes(std::vector{"int8", - "short", - "int16", - "int", - 
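// Illustrative sketch: HexTest above now enables hexadecimal parsing through
// parse_hex(), by column name or by column index, combined with an ordinary
// integer dtype, replacing the old "hex" dtype string. The element type
// below is an assumption (the diff rendering strips template parameters).
#include <cudf/io/csv.hpp>
#include <cudf/types.hpp>
#include <string>

void read_hex_column(std::string const& filepath)
{
  cudf::io::csv_reader_options opts =
    cudf::io::csv_reader_options::builder(cudf::io::source_info{filepath})
      .names({"A"})
      .dtypes({cudf::data_type{cudf::type_id::INT32}})
      .header(-1)
      .parse_hex({"A"});  // equivalently: .parse_hex(std::vector<int>{0})
  auto result = cudf::io::read_csv(opts);
  // Strings such as "0x0", "-0x1000" and "0xfedcba" are decoded as integers.
}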
"int32", - "long", - "int64", - "uint8", - "uint16", - "uint32", - "uint64", - "float", - "float32", - "double", - "float64"}); + .dtypes({dtype(), + dtype(), + dtype(), + dtype(), + dtype(), + dtype(), + dtype(), + dtype(), + dtype(), + dtype()}); auto result = cudf_io::read_csv(in_opts); const auto result_table = result.tbl->view(); - std::vector non_float64s{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}; + std::vector non_float64s{0, 1, 2, 3, 4, 5, 6, 7, 8}; const auto input_sliced_view = input_table.select(non_float64s); const auto result_sliced_view = result_table.select(non_float64s); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(input_sliced_view, result_sliced_view); @@ -1606,9 +1605,6 @@ TEST_F(CsvReaderTest, MultiColumnWithWriter) auto float64_col_idx = non_float64s.size(); check_float_column( input_table.column(float64_col_idx), result_table.column(float64_col_idx), tol, validity); - ++float64_col_idx; - check_float_column( - input_table.column(float64_col_idx), result_table.column(float64_col_idx), tol, validity); } TEST_F(CsvReaderTest, DatesWithWriter) @@ -1633,7 +1629,7 @@ TEST_F(CsvReaderTest, DatesWithWriter) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names({"A"}) - .dtypes(std::vector{"date"}) + .dtypes({data_type{type_id::TIMESTAMP_MILLISECONDS}}) .dayfirst(true) .header(-1); auto result = cudf_io::read_csv(in_opts); @@ -1764,7 +1760,7 @@ TEST_F(CsvReaderTest, FloatingPointWithWriter) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names({"A"}) - .dtypes(std::vector{"float64"}) + .dtypes({dtype()}) .header(-1); // in_opts.lineterminator = ';'; auto result = cudf_io::read_csv(in_opts); @@ -1790,7 +1786,7 @@ TEST_F(CsvReaderTest, StringsWithWriter) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names(names) - .dtypes(std::vector{"int32", "str"}) + .dtypes(std::vector{dtype(), dtype()}) .quoting(cudf_io::quote_style::NONE); auto result = cudf_io::read_csv(in_opts); @@ -1815,7 +1811,7 @@ TEST_F(CsvReaderTest, StringsWithWriterSimple) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names(names) - .dtypes(std::vector{"int32", "str"}) + .dtypes(std::vector{dtype(), dtype()}) .quoting(cudf_io::quote_style::NONE); auto result = cudf_io::read_csv(in_opts); @@ -1839,7 +1835,7 @@ TEST_F(CsvReaderTest, StringsEmbeddedDelimiter) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names(names) - .dtypes(std::vector{"int32", "str"}); + .dtypes(std::vector{dtype(), dtype()}); auto result = cudf_io::read_csv(in_opts); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(input_table, result.tbl->view()); @@ -1862,7 +1858,11 @@ TEST_F(CsvReaderTest, HeaderEmbeddedDelimiter) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names(names) - .dtypes(std::vector{"int32", "str", "int32", "int32", "int32"}); + .dtypes({dtype(), + dtype(), + dtype(), + dtype(), + dtype()}); auto result = cudf_io::read_csv(in_opts); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(input_table, result.tbl->view()); @@ -1917,7 +1917,7 @@ TEST_F(CsvReaderTest, UserImplementedSource) TestSource source{csv_data.str()}; cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{&source}) - .dtypes(std::vector{"int8", "int16", "int32"}) + .dtypes({dtype(), dtype(), dtype()}) 
.header(-1); auto result = cudf_io::read_csv(in_opts); @@ -1962,8 +1962,11 @@ TEST_F(CsvReaderTest, DurationsWithWriter) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names(names) - .dtypes(std::vector{ - "timedelta[D]", "timedelta64[s]", "timedelta64[ms]", "timedelta64[us]", "timedelta64[ns]"}); + .dtypes({data_type{type_id::DURATION_DAYS}, + data_type{type_id::DURATION_SECONDS}, + data_type{type_id::DURATION_MILLISECONDS}, + data_type{type_id::DURATION_MICROSECONDS}, + data_type{type_id::DURATION_NANOSECONDS}}); auto result = cudf_io::read_csv(in_opts); const auto result_table = result.tbl->view(); @@ -2164,4 +2167,35 @@ TEST_F(CsvReaderTest, DefaultWriteChunkSize) } } +TEST_F(CsvReaderTest, DtypesMap) +{ + std::string csv_in{"12,9\n34,8\n56,7"}; + + cudf_io::csv_reader_options in_opts = + cudf_io::csv_reader_options::builder(cudf_io::source_info{csv_in.c_str(), csv_in.size()}) + .names({"A", "B"}) + .dtypes({{"B", dtype()}, {"A", dtype()}}) + .header(-1); + auto result = cudf_io::read_csv(in_opts); + + const auto result_table = result.tbl->view(); + ASSERT_EQ(result_table.num_columns(), 2); + ASSERT_EQ(result_table.column(0).type(), data_type{type_id::INT32}); + ASSERT_EQ(result_table.column(1).type(), data_type{type_id::INT16}); + expect_column_data_equal(std::vector{12, 34, 56}, result_table.column(0)); + expect_column_data_equal(std::vector{9, 8, 7}, result_table.column(1)); +} + +TEST_F(CsvReaderTest, DtypesMapInvalid) +{ + std::string csv_in{""}; + + cudf_io::csv_reader_options in_opts = + cudf_io::csv_reader_options::builder(cudf_io::source_info{csv_in.c_str(), csv_in.size()}) + .names({"A", "B"}) + .dtypes({{"A", dtype()}}); + + EXPECT_THROW(cudf_io::read_csv(in_opts), cudf::logic_error); +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 308821489c5..e83592a028a 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -42,6 +42,16 @@ using int64_wrapper = wrapper; using timestamp_ms_wrapper = wrapper; using bool_wrapper = wrapper; +using cudf::data_type; +using cudf::type_id; +using cudf::type_to_id; + +template +auto dtype() +{ + return data_type{type_to_id()}; +} + template using column_wrapper = typename std::conditional, @@ -151,7 +161,7 @@ TEST_F(JsonReaderTest, BasicJsonLines) cudf_io::json_reader_options in_options = cudf_io::json_reader_options::builder(cudf_io::source_info{data.data(), data.size()}) - .dtypes({"int", "float64"}) + .dtypes(std::vector{dtype(), dtype()}) .lines(true); cudf_io::table_with_metadata result = cudf_io::read_json(in_options); @@ -182,7 +192,7 @@ TEST_F(JsonReaderTest, FloatingPoint) cudf_io::json_reader_options in_options = cudf_io::json_reader_options::builder(cudf_io::source_info{filepath}) - .dtypes({"float32"}) + .dtypes({dtype()}) .lines(true); cudf_io::table_with_metadata result = cudf_io::read_json(in_options); @@ -206,7 +216,7 @@ TEST_F(JsonReaderTest, JsonLinesStrings) cudf_io::json_reader_options in_options = cudf_io::json_reader_options::builder(cudf_io::source_info{data.data(), data.size()}) - .dtypes({"2:str", "0:int", "1:float64"}) + .dtypes({{"2", dtype()}, {"0", dtype()}, {"1", dtype()}}) .lines(true); cudf_io::table_with_metadata result = cudf_io::read_json(in_options); @@ -245,9 +255,8 @@ TEST_F(JsonReaderTest, MultiColumn) std::ostringstream line; for (int i = 0; i < num_rows; ++i) { line << "[" << std::to_string(int8_values[i]) << "," << int16_values[i] << "," - << int16_values[i] << "," << 
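// Illustrative sketch: DtypesMap above exercises the other new dtypes()
// overload, a map from column name to cudf::data_type, so per-column types
// no longer depend on column order. The types below follow the INT32/INT16
// assertions in that test; the rest of the call is hypothetical.
#include <cudf/io/csv.hpp>
#include <cudf/types.hpp>
#include <string>

void read_csv_with_dtype_map(std::string const& csv_in)
{
  cudf::io::csv_reader_options opts =
    cudf::io::csv_reader_options::builder(
      cudf::io::source_info{csv_in.c_str(), csv_in.size()})
      .names({"A", "B"})
      .dtypes({{"B", cudf::data_type{cudf::type_id::INT16}},
               {"A", cudf::data_type{cudf::type_id::INT32}}})
      .header(-1);
  auto result = cudf::io::read_csv(opts);
  // Column "A" is read as INT32 and "B" as INT16, matching the map keys
  // rather than the order in which the entries were listed.
}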
int32_values[i] << "," << int32_values[i] << "," - << int64_values[i] << "," << int64_values[i] << "," << float32_values[i] << "," - << float32_values[i] << "," << float64_values[i] << "," << float64_values[i] << "]\n"; + << int32_values[i] << "," << int64_values[i] << "," << float32_values[i] << "," + << float64_values[i] << "]\n"; } std::ofstream outfile(filepath, std::ofstream::out); outfile << line.str(); @@ -255,17 +264,12 @@ TEST_F(JsonReaderTest, MultiColumn) cudf_io::json_reader_options in_options = cudf_io::json_reader_options::builder(cudf_io::source_info{filepath}) - .dtypes({"int8", - "short", - "int16", - "int", - "int32", - "long", - "int64", - "float", - "float32", - "double", - "float64"}) + .dtypes({dtype(), + dtype(), + dtype(), + dtype(), + dtype(), + dtype()}) .lines(true); cudf_io::table_with_metadata result = cudf_io::read_json(in_options); @@ -275,34 +279,21 @@ TEST_F(JsonReaderTest, MultiColumn) EXPECT_EQ(view.column(0).type().id(), cudf::type_id::INT8); EXPECT_EQ(view.column(1).type().id(), cudf::type_id::INT16); - EXPECT_EQ(view.column(2).type().id(), cudf::type_id::INT16); - EXPECT_EQ(view.column(3).type().id(), cudf::type_id::INT32); - EXPECT_EQ(view.column(4).type().id(), cudf::type_id::INT32); - EXPECT_EQ(view.column(5).type().id(), cudf::type_id::INT64); - EXPECT_EQ(view.column(6).type().id(), cudf::type_id::INT64); - EXPECT_EQ(view.column(7).type().id(), cudf::type_id::FLOAT32); - EXPECT_EQ(view.column(8).type().id(), cudf::type_id::FLOAT32); - EXPECT_EQ(view.column(9).type().id(), cudf::type_id::FLOAT64); - EXPECT_EQ(view.column(10).type().id(), cudf::type_id::FLOAT64); + EXPECT_EQ(view.column(2).type().id(), cudf::type_id::INT32); + EXPECT_EQ(view.column(3).type().id(), cudf::type_id::INT64); + EXPECT_EQ(view.column(4).type().id(), cudf::type_id::FLOAT32); + EXPECT_EQ(view.column(5).type().id(), cudf::type_id::FLOAT64); CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.column(0), int8_wrapper{int8_values.begin(), int8_values.end(), validity}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.column(1), int16_wrapper{int16_values.begin(), int16_values.end(), validity}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.column(2), - int16_wrapper{int16_values.begin(), int16_values.end(), validity}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.column(3), - int_wrapper{int32_values.begin(), int32_values.end(), validity}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.column(4), int_wrapper{int32_values.begin(), int32_values.end(), validity}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.column(5), - int64_wrapper{int64_values.begin(), int64_values.end(), validity}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.column(6), + CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.column(3), int64_wrapper{int64_values.begin(), int64_values.end(), validity}); - check_float_column(view.column(7), float32_values, validity); - check_float_column(view.column(8), float32_values, validity); - check_float_column(view.column(9), float64_values, validity); - check_float_column(view.column(10), float64_values, validity); + check_float_column(view.column(4), float32_values, validity); + check_float_column(view.column(5), float64_values, validity); } TEST_F(JsonReaderTest, Booleans) @@ -315,7 +306,7 @@ TEST_F(JsonReaderTest, Booleans) cudf_io::json_reader_options in_options = cudf_io::json_reader_options::builder(cudf_io::source_info{filepath}) - .dtypes({"bool"}) + .dtypes({dtype()}) .lines(true); cudf_io::table_with_metadata result = cudf_io::read_json(in_options); @@ -342,7 +333,7 @@ TEST_F(JsonReaderTest, Dates) cudf_io::json_reader_options in_options = 
cudf_io::json_reader_options::builder(cudf_io::source_info{filepath}) - .dtypes({"date"}) + .dtypes({data_type{type_id::TIMESTAMP_MILLISECONDS}}) .lines(true) .dayfirst(true); cudf_io::table_with_metadata result = cudf_io::read_json(in_options); @@ -379,7 +370,7 @@ TEST_F(JsonReaderTest, Durations) cudf_io::json_reader_options in_options = cudf_io::json_reader_options::builder(cudf_io::source_info{filepath}) - .dtypes({"timedelta64[ns]"}) + .dtypes({data_type{type_id::DURATION_NANOSECONDS}}) .lines(true); cudf_io::table_with_metadata result = cudf_io::read_json(in_options); @@ -665,13 +656,12 @@ TEST_F(JsonReaderTest, ArrowFileSource) auto arrow_source = cudf_io::arrow_io_source{infile}; cudf_io::json_reader_options in_options = cudf_io::json_reader_options::builder(cudf_io::source_info{&arrow_source}) - .dtypes({"int8"}) + .dtypes({dtype()}) .lines(true); ; cudf_io::table_with_metadata result = cudf_io::read_json(in_options); - EXPECT_EQ(result.tbl->num_columns(), - static_cast(in_options.get_dtypes().size())); + EXPECT_EQ(result.tbl->num_columns(), 1); EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::INT8); auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); @@ -690,7 +680,7 @@ TEST_F(JsonReaderTest, InvalidFloatingPoint) cudf_io::json_reader_options in_options = cudf_io::json_reader_options::builder(cudf_io::source_info{filepath}) - .dtypes({"float32"}) + .dtypes({dtype()}) .lines(true); cudf_io::table_with_metadata result = cudf_io::read_json(in_options); @@ -898,4 +888,27 @@ TEST_F(JsonReaderTest, JsonLinesMultipleFileInputs) float64_wrapper{{1.1, 2.2, 3.3, 4.4}, validity}); } +TEST_F(JsonReaderTest, BadDtypeParams) +{ + std::string buffer = "[1,2,3,4]"; + + cudf_io::json_reader_options options_vec = + cudf_io::json_reader_options::builder(cudf_io::source_info{buffer.c_str(), buffer.size()}) + .lines(true) + .dtypes({dtype()}); + + // should throw because there are four columns and only one dtype + EXPECT_THROW(cudf_io::read_json(options_vec), cudf::logic_error); + + cudf_io::json_reader_options options_map = + cudf_io::json_reader_options::builder(cudf_io::source_info{buffer.c_str(), buffer.size()}) + .lines(true) + .dtypes(std::map{{"0", dtype()}, + {"1", dtype()}, + {"2", dtype()}, + {"wrong_name", dtype()}}); + // should throw because one of the columns is not in the dtype map + EXPECT_THROW(cudf_io::read_json(options_map), cudf::logic_error); +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index 8fdfc6f9165..7260aa9e686 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -275,10 +275,10 @@ inline auto random_values(size_t size) TYPED_TEST(ParquetWriterNumericTypeTest, SingleColumn) { auto sequence = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return TypeParam(i); }); + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return TypeParam(i % 400); }); auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); - constexpr auto num_rows = 100; + constexpr auto num_rows = 800; column_wrapper col(sequence, sequence + num_rows, validity); std::vector> cols; @@ -816,7 +816,7 @@ TEST_F(ParquetWriterTest, MultiIndex) expected_metadata.column_metadata[3].set_name("floats"); expected_metadata.column_metadata[4].set_name("doubles"); expected_metadata.user_data.insert( - {"pandas", "\"index_columns\": [\"floats\", \"doubles\"], \"column1\": [\"int8s\"]"}); + {"pandas", 
"\"index_columns\": [\"int8s\", \"int16s\"], \"column1\": [\"int32s\"]"}); auto filepath = temp_env->get_temp_filepath("MultiIndex.parquet"); cudf_io::parquet_writer_options out_opts = @@ -827,7 +827,7 @@ TEST_F(ParquetWriterTest, MultiIndex) cudf_io::parquet_reader_options in_opts = cudf_io::parquet_reader_options::builder(cudf_io::source_info{filepath}) .use_pandas_metadata(true) - .columns({"int8s", "int16s", "int32s"}); + .columns({"int32s", "floats", "doubles"}); auto result = cudf_io::read_parquet(in_opts); CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); @@ -967,8 +967,6 @@ TEST_F(ParquetWriterTest, StructOfList) auto struct_2 = cudf::test::structs_column_wrapper{{is_human_col, struct_1}, {0, 1, 1, 1, 1, 1}}.release(); - // cudf::test::print(struct_2->child(1).child(2)); - auto expected = table_view({*struct_2}); cudf_io::table_input_metadata expected_metadata(expected); @@ -2497,6 +2495,131 @@ TEST_F(ParquetReaderTest, ReorderedColumns) } } +TEST_F(ParquetReaderTest, SelectNestedColumn) +{ + // Struct>, + // flats:List> + // > + // > + + auto weights_col = cudf::test::fixed_width_column_wrapper{1.1, 2.4, 5.3, 8.0, 9.6, 6.9}; + + auto ages_col = + cudf::test::fixed_width_column_wrapper{{48, 27, 25, 31, 351, 351}, {1, 1, 1, 1, 1, 0}}; + + auto struct_1 = cudf::test::structs_column_wrapper{{weights_col, ages_col}, {1, 1, 1, 1, 0, 1}}; + + auto is_human_col = cudf::test::fixed_width_column_wrapper{ + {true, true, false, false, false, false}, {1, 1, 0, 1, 1, 0}}; + + auto struct_2 = + cudf::test::structs_column_wrapper{{is_human_col, struct_1}, {0, 1, 1, 1, 1, 1}}.release(); + + auto input = table_view({*struct_2}); + + cudf_io::table_input_metadata input_metadata(input); + input_metadata.column_metadata[0].set_name("being"); + input_metadata.column_metadata[0].child(0).set_name("human?"); + input_metadata.column_metadata[0].child(1).set_name("particulars"); + input_metadata.column_metadata[0].child(1).child(0).set_name("weight"); + input_metadata.column_metadata[0].child(1).child(1).set_name("age"); + + auto filepath = temp_env->get_temp_filepath("SelectNestedColumn.parquet"); + cudf_io::parquet_writer_options args = + cudf_io::parquet_writer_options::builder(cudf_io::sink_info{filepath}, input) + .metadata(&input_metadata); + cudf_io::write_parquet(args); + + { // Test selecting a single leaf from the table + cudf_io::parquet_reader_options read_args = + cudf_io::parquet_reader_options::builder(cudf_io::source_info(filepath)) + .columns({"being.particulars.age"}); + const auto result = cudf_io::read_parquet(read_args); + + auto expect_ages_col = cudf::test::fixed_width_column_wrapper{ + {48, 27, 25, 31, 351, 351}, {1, 1, 1, 1, 1, 0}}; + auto expect_s_1 = cudf::test::structs_column_wrapper{{expect_ages_col}, {1, 1, 1, 1, 0, 1}}; + auto expect_s_2 = + cudf::test::structs_column_wrapper{{expect_s_1}, {0, 1, 1, 1, 1, 1}}.release(); + auto expected = table_view({*expect_s_2}); + + cudf_io::table_input_metadata expected_metadata(expected); + expected_metadata.column_metadata[0].set_name("being"); + expected_metadata.column_metadata[0].child(0).set_name("particulars"); + expected_metadata.column_metadata[0].child(0).child(0).set_name("age"); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); + compare_metadata_equality(expected_metadata, result.metadata); + } + + { // Test selecting a non-leaf and expecting all hierarchy from that node onwards + cudf_io::parquet_reader_options read_args = + 
cudf_io::parquet_reader_options::builder(cudf_io::source_info(filepath)) + .columns({"being.particulars"}); + const auto result = cudf_io::read_parquet(read_args); + + auto expected_weights_col = + cudf::test::fixed_width_column_wrapper{1.1, 2.4, 5.3, 8.0, 9.6, 6.9}; + + auto expected_ages_col = cudf::test::fixed_width_column_wrapper{ + {48, 27, 25, 31, 351, 351}, {1, 1, 1, 1, 1, 0}}; + + auto expected_s_1 = cudf::test::structs_column_wrapper{ + {expected_weights_col, expected_ages_col}, {1, 1, 1, 1, 0, 1}}; + + auto expect_s_2 = + cudf::test::structs_column_wrapper{{expected_s_1}, {0, 1, 1, 1, 1, 1}}.release(); + auto expected = table_view({*expect_s_2}); + + cudf_io::table_input_metadata expected_metadata(expected); + expected_metadata.column_metadata[0].set_name("being"); + expected_metadata.column_metadata[0].child(0).set_name("particulars"); + expected_metadata.column_metadata[0].child(0).child(0).set_name("weight"); + expected_metadata.column_metadata[0].child(0).child(1).set_name("age"); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); + compare_metadata_equality(expected_metadata, result.metadata); + } + + { // Test selecting struct children out of order + cudf_io::parquet_reader_options read_args = + cudf_io::parquet_reader_options::builder(cudf_io::source_info(filepath)) + .columns({"being.particulars.age", "being.particulars.weight", "being.human?"}); + const auto result = cudf_io::read_parquet(read_args); + + auto expected_weights_col = + cudf::test::fixed_width_column_wrapper{1.1, 2.4, 5.3, 8.0, 9.6, 6.9}; + + auto expected_ages_col = cudf::test::fixed_width_column_wrapper{ + {48, 27, 25, 31, 351, 351}, {1, 1, 1, 1, 1, 0}}; + + auto expected_is_human_col = cudf::test::fixed_width_column_wrapper{ + {true, true, false, false, false, false}, {1, 1, 0, 1, 1, 0}}; + + auto expect_s_1 = cudf::test::structs_column_wrapper{{expected_ages_col, expected_weights_col}, + {1, 1, 1, 1, 0, 1}}; + + auto expect_s_2 = + cudf::test::structs_column_wrapper{{expect_s_1, expected_is_human_col}, {0, 1, 1, 1, 1, 1}} + .release(); + + auto expected = table_view({*expect_s_2}); + + cudf_io::table_input_metadata expected_metadata(expected); + expected_metadata.column_metadata[0].set_name("being"); + expected_metadata.column_metadata[0].child(0).set_name("particulars"); + expected_metadata.column_metadata[0].child(0).child(0).set_name("age"); + expected_metadata.column_metadata[0].child(0).child(1).set_name("weight"); + expected_metadata.column_metadata[0].child(1).set_name("human?"); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); + compare_metadata_equality(expected_metadata, result.metadata); + } +} + TEST_F(ParquetReaderTest, DecimalRead) { { diff --git a/cpp/tests/join/conditional_join_tests.cu b/cpp/tests/join/conditional_join_tests.cu index 57abdf17aa6..8018d613e05 100644 --- a/cpp/tests/join/conditional_join_tests.cu +++ b/cpp/tests/join/conditional_join_tests.cu @@ -14,8 +14,7 @@ * limitations under the License. */ -#include -#include +#include #include #include #include @@ -50,7 +49,7 @@ const auto col_ref_right_1 = cudf::ast::column_reference(1, cudf::ast::table_ref // Common expressions. 
auto left_zero_eq_right_zero = - cudf::ast::expression(cudf::ast::ast_operator::EQUAL, col_ref_left_0, col_ref_right_0); + cudf::ast::operation(cudf::ast::ast_operator::EQUAL, col_ref_left_0, col_ref_right_0); } // namespace /** @@ -147,15 +146,17 @@ struct ConditionalJoinPairReturnTest : public ConditionalJoinTest { */ void test(std::vector> left_data, std::vector> right_data, - cudf::ast::expression predicate, + cudf::ast::operation predicate, std::vector> expected_outputs) { // Note that we need to maintain the column wrappers otherwise the // resulting column views will be referencing potentially invalid memory. auto [left_wrappers, right_wrappers, left_columns, right_columns, left, right] = this->parse_input(left_data, right_data); - auto result = this->join(left, right, predicate); + auto result_size = this->join_size(left, right, predicate); + EXPECT_TRUE(result_size == expected_outputs.size()); + auto result = this->join(left, right, predicate); std::vector> result_pairs; for (size_t i = 0; i < result.first->size(); ++i) { // Note: Not trying to be terribly efficient here since these tests are @@ -167,20 +168,22 @@ struct ConditionalJoinPairReturnTest : public ConditionalJoinTest { std::sort(result_pairs.begin(), result_pairs.end()); std::sort(expected_outputs.begin(), expected_outputs.end()); - EXPECT_TRUE(std::equal(result_pairs.begin(), result_pairs.end(), expected_outputs.begin())); + EXPECT_TRUE(std::equal(expected_outputs.begin(), expected_outputs.end(), result_pairs.begin())); } void test_nulls(std::vector, std::vector>> left_data, std::vector, std::vector>> right_data, - cudf::ast::expression predicate, + cudf::ast::operation predicate, std::vector> expected_outputs) { // Note that we need to maintain the column wrappers otherwise the // resulting column views will be referencing potentially invalid memory. auto [left_wrappers, right_wrappers, left_columns, right_columns, left, right] = this->parse_input(left_data, right_data); - auto result = this->join(left, right, predicate); + auto result_size = this->join_size(left, right, predicate); + EXPECT_TRUE(result_size == expected_outputs.size()); + auto result = this->join(left, right, predicate); std::vector> result_pairs; for (size_t i = 0; i < result.first->size(); ++i) { // Note: Not trying to be terribly efficient here since these tests are @@ -192,7 +195,7 @@ struct ConditionalJoinPairReturnTest : public ConditionalJoinTest { std::sort(result_pairs.begin(), result_pairs.end()); std::sort(expected_outputs.begin(), expected_outputs.end()); - EXPECT_TRUE(std::equal(result_pairs.begin(), result_pairs.end(), expected_outputs.begin())); + EXPECT_TRUE(std::equal(expected_outputs.begin(), expected_outputs.end(), result_pairs.begin())); } /* @@ -238,7 +241,7 @@ struct ConditionalJoinPairReturnTest : public ConditionalJoinTest { thrust::sort(thrust::device, reference_pairs.begin(), reference_pairs.end()); EXPECT_TRUE(thrust::equal( - thrust::device, result_pairs.begin(), result_pairs.end(), reference_pairs.begin())); + thrust::device, reference_pairs.begin(), reference_pairs.end(), result_pairs.begin())); } /** @@ -248,7 +251,16 @@ struct ConditionalJoinPairReturnTest : public ConditionalJoinTest { */ virtual std::pair>, std::unique_ptr>> - join(cudf::table_view left, cudf::table_view right, cudf::ast::expression predicate) = 0; + join(cudf::table_view left, cudf::table_view right, cudf::ast::operation predicate) = 0; + + /** + * This method must be implemented by subclasses for specific types of joins. 
+ * It should be a simply forwarding of arguments to the appropriate cudf + * conditional join size computation API. + */ + virtual std::size_t join_size(cudf::table_view left, + cudf::table_view right, + cudf::ast::operation predicate) = 0; /** * This method must be implemented by subclasses for specific types of joins. @@ -267,11 +279,18 @@ template struct ConditionalInnerJoinTest : public ConditionalJoinPairReturnTest { std::pair>, std::unique_ptr>> - join(cudf::table_view left, cudf::table_view right, cudf::ast::expression predicate) override + join(cudf::table_view left, cudf::table_view right, cudf::ast::operation predicate) override { return cudf::conditional_inner_join(left, right, predicate); } + std::size_t join_size(cudf::table_view left, + cudf::table_view right, + cudf::ast::operation predicate) override + { + return cudf::conditional_inner_join_size(left, right, predicate); + } + std::pair>, std::unique_ptr>> reference_join(cudf::table_view left, cudf::table_view right) override @@ -316,7 +335,7 @@ TYPED_TEST(ConditionalInnerJoinTest, TestTwoColumnThreeRowSomeEqual) TYPED_TEST(ConditionalInnerJoinTest, TestNotComparison) { auto col_ref_0 = cudf::ast::column_reference(0); - auto expression = cudf::ast::expression(cudf::ast::ast_operator::NOT, col_ref_0); + auto expression = cudf::ast::operation(cudf::ast::ast_operator::NOT, col_ref_0); this->test({{0, 1, 2}}, {{3, 4, 5}}, expression, {{0, 0}, {0, 1}, {0, 2}}); }; @@ -325,7 +344,7 @@ TYPED_TEST(ConditionalInnerJoinTest, TestGreaterComparison) { auto col_ref_0 = cudf::ast::column_reference(0); auto col_ref_1 = cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT); - auto expression = cudf::ast::expression(cudf::ast::ast_operator::GREATER, col_ref_0, col_ref_1); + auto expression = cudf::ast::operation(cudf::ast::ast_operator::GREATER, col_ref_0, col_ref_1); this->test({{0, 1, 2}}, {{1, 0, 0}}, expression, {{1, 1}, {1, 2}, {2, 0}, {2, 1}, {2, 2}}); }; @@ -334,7 +353,7 @@ TYPED_TEST(ConditionalInnerJoinTest, TestGreaterTwoColumnComparison) { auto col_ref_0 = cudf::ast::column_reference(0); auto col_ref_1 = cudf::ast::column_reference(1, cudf::ast::table_reference::RIGHT); - auto expression = cudf::ast::expression(cudf::ast::ast_operator::GREATER, col_ref_0, col_ref_1); + auto expression = cudf::ast::operation(cudf::ast::ast_operator::GREATER, col_ref_0, col_ref_1); this->test({{0, 1, 2}, {0, 0, 0}}, {{0, 0, 0}, {1, 0, 0}}, @@ -346,7 +365,7 @@ TYPED_TEST(ConditionalInnerJoinTest, TestGreaterDifferentNumberColumnComparison) { auto col_ref_0 = cudf::ast::column_reference(0); auto col_ref_1 = cudf::ast::column_reference(1, cudf::ast::table_reference::RIGHT); - auto expression = cudf::ast::expression(cudf::ast::ast_operator::GREATER, col_ref_0, col_ref_1); + auto expression = cudf::ast::operation(cudf::ast::ast_operator::GREATER, col_ref_0, col_ref_1); this->test( {{0, 1, 2}}, {{0, 0, 0}, {1, 0, 0}}, expression, {{1, 1}, {1, 2}, {2, 0}, {2, 1}, {2, 2}}); @@ -356,7 +375,7 @@ TYPED_TEST(ConditionalInnerJoinTest, TestGreaterDifferentNumberColumnDifferentSi { auto col_ref_0 = cudf::ast::column_reference(0); auto col_ref_1 = cudf::ast::column_reference(1, cudf::ast::table_reference::RIGHT); - auto expression = cudf::ast::expression(cudf::ast::ast_operator::GREATER, col_ref_0, col_ref_1); + auto expression = cudf::ast::operation(cudf::ast::ast_operator::GREATER, col_ref_0, col_ref_1); this->test({{0, 1}}, {{0, 0, 0}, {1, 0, 0}}, expression, {{1, 1}, {1, 2}}); }; @@ -367,14 +386,14 @@ TYPED_TEST(ConditionalInnerJoinTest, 
TestComplexConditionMultipleColumns) auto col_ref_0 = cudf::ast::column_reference(0, cudf::ast::table_reference::LEFT); auto scalar_1 = cudf::numeric_scalar(1); auto literal_1 = cudf::ast::literal(scalar_1); - auto left_0_equal_1 = cudf::ast::expression(cudf::ast::ast_operator::EQUAL, col_ref_0, literal_1); + auto left_0_equal_1 = cudf::ast::operation(cudf::ast::ast_operator::EQUAL, col_ref_0, literal_1); auto col_ref_1 = cudf::ast::column_reference(1, cudf::ast::table_reference::RIGHT); auto comparison_filter = - cudf::ast::expression(cudf::ast::ast_operator::LESS, col_ref_1, col_ref_0); + cudf::ast::operation(cudf::ast::ast_operator::LESS, col_ref_1, col_ref_0); auto expression = - cudf::ast::expression(cudf::ast::ast_operator::LOGICAL_AND, left_0_equal_1, comparison_filter); + cudf::ast::operation(cudf::ast::ast_operator::LOGICAL_AND, left_0_equal_1, comparison_filter); this->test({{0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}}, {{0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2}, @@ -384,6 +403,20 @@ TYPED_TEST(ConditionalInnerJoinTest, TestComplexConditionMultipleColumns) {{4, 0}, {5, 0}, {6, 0}, {7, 0}}); }; +TYPED_TEST(ConditionalInnerJoinTest, TestSymmetry) +{ + auto col_ref_0 = cudf::ast::column_reference(0); + auto col_ref_1 = cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT); + auto expression = cudf::ast::operation(cudf::ast::ast_operator::GREATER, col_ref_1, col_ref_0); + auto expression_reverse = + cudf::ast::operation(cudf::ast::ast_operator::LESS, col_ref_0, col_ref_1); + + this->test( + {{0, 1, 2}}, {{1, 2, 3}}, expression, {{0, 0}, {0, 1}, {0, 2}, {1, 1}, {1, 2}, {2, 2}}); + this->test( + {{0, 1, 2}}, {{1, 2, 3}}, expression_reverse, {{0, 0}, {0, 1}, {0, 2}, {1, 1}, {1, 2}, {2, 2}}); +}; + TYPED_TEST(ConditionalInnerJoinTest, TestCompareRandomToHash) { // Generate columns of 10 repeats of the integer range [0, 10), then merge @@ -418,7 +451,7 @@ TYPED_TEST(ConditionalInnerJoinTest, TestOneColumnTwoNullsRowAllEqual) TYPED_TEST(ConditionalInnerJoinTest, TestOneColumnTwoNullsNoOutputRowAllEqual) { - this->test_nulls({{{0, 1}, {0, 1}}}, {{{0, 0}, {1, 1}}}, left_zero_eq_right_zero, {{}, {}}); + this->test_nulls({{{0, 1}, {0, 1}}}, {{{0, 0}, {1, 1}}}, left_zero_eq_right_zero, {}); }; /** @@ -428,11 +461,18 @@ template struct ConditionalLeftJoinTest : public ConditionalJoinPairReturnTest { std::pair>, std::unique_ptr>> - join(cudf::table_view left, cudf::table_view right, cudf::ast::expression predicate) override + join(cudf::table_view left, cudf::table_view right, cudf::ast::operation predicate) override { return cudf::conditional_left_join(left, right, predicate); } + std::size_t join_size(cudf::table_view left, + cudf::table_view right, + cudf::ast::operation predicate) override + { + return cudf::conditional_left_join_size(left, right, predicate); + } + std::pair>, std::unique_ptr>> reference_join(cudf::table_view left, cudf::table_view right) override @@ -484,11 +524,21 @@ template struct ConditionalFullJoinTest : public ConditionalJoinPairReturnTest { std::pair>, std::unique_ptr>> - join(cudf::table_view left, cudf::table_view right, cudf::ast::expression predicate) override + join(cudf::table_view left, cudf::table_view right, cudf::ast::operation predicate) override { return cudf::conditional_full_join(left, right, predicate); } + std::size_t join_size(cudf::table_view left, + cudf::table_view right, + cudf::ast::operation predicate) override + { + // Full joins don't actually support size calculations, but to support a + // uniform 
testing framework we just calculate it from the result of doing + // the join. + return cudf::conditional_full_join(left, right, predicate).first->size(); + } + std::pair>, std::unique_ptr>> reference_join(cudf::table_view left, cudf::table_view right) override @@ -499,6 +549,19 @@ struct ConditionalFullJoinTest : public ConditionalJoinPairReturnTest { TYPED_TEST_CASE(ConditionalFullJoinTest, cudf::test::IntegralTypesNotBool); +TYPED_TEST(ConditionalFullJoinTest, TestOneColumnNoneEqual) +{ + this->test({{0, 1, 2}}, + {{3, 4, 5}}, + left_zero_eq_right_zero, + {{0, JoinNoneValue}, + {1, JoinNoneValue}, + {2, JoinNoneValue}, + {JoinNoneValue, 0}, + {JoinNoneValue, 1}, + {JoinNoneValue, 2}}); +}; + TYPED_TEST(ConditionalFullJoinTest, TestTwoColumnThreeRowSomeEqual) { this->test({{0, 1, 2}, {10, 20, 30}}, @@ -546,13 +609,15 @@ struct ConditionalJoinSingleReturnTest : public ConditionalJoinTest { */ void test(std::vector> left_data, std::vector> right_data, - cudf::ast::expression predicate, + cudf::ast::operation predicate, std::vector expected_outputs) { auto [left_wrappers, right_wrappers, left_columns, right_columns, left, right] = this->parse_input(left_data, right_data); - auto result = this->join(left, right, predicate); + auto result_size = this->join_size(left, right, predicate); + EXPECT_TRUE(result_size == expected_outputs.size()); + auto result = this->join(left, right, predicate); std::vector resulting_indices; for (size_t i = 0; i < result->size(); ++i) { // Note: Not trying to be terribly efficient here since these tests are @@ -595,7 +660,16 @@ struct ConditionalJoinSingleReturnTest : public ConditionalJoinTest { * conditional join API. */ virtual std::unique_ptr> join( - cudf::table_view left, cudf::table_view right, cudf::ast::expression predicate) = 0; + cudf::table_view left, cudf::table_view right, cudf::ast::operation predicate) = 0; + + /** + * This method must be implemented by subclasses for specific types of joins. + * It should be a simply forwarding of arguments to the appropriate cudf + * conditional join size computation API. + */ + virtual std::size_t join_size(cudf::table_view left, + cudf::table_view right, + cudf::ast::operation predicate) = 0; /** * This method must be implemented by subclasses for specific types of joins. 
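For reference, a minimal sketch (not taken from this patch; the cudf::table_view inputs `left` and `right` and the exact include paths are assumptions) of the pattern the updated conditional-join tests exercise: build the predicate with cudf::ast::operation, ask the matching *_size API how many result pairs to expect, then run the join itself.

#include <cudf/ast/expressions.hpp>
#include <cudf/join.hpp>
#include <cudf/table/table_view.hpp>

#include <cassert>

void inner_join_with_size_check(cudf::table_view const& left, cudf::table_view const& right)
{
  // Predicate: left column 0 == right column 0, expressed as an AST operation.
  auto const left_col  = cudf::ast::column_reference(0, cudf::ast::table_reference::LEFT);
  auto const right_col = cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT);
  auto const predicate =
    cudf::ast::operation(cudf::ast::ast_operator::EQUAL, left_col, right_col);

  // The size API reports how many matching index pairs the join will produce.
  auto const expected_size = cudf::conditional_inner_join_size(left, right, predicate);

  // The join itself returns a pair of device vectors of matching row indices.
  auto const [left_indices, right_indices] = cudf::conditional_inner_join(left, right, predicate);
  assert(left_indices->size() == expected_size);
  assert(right_indices->size() == expected_size);
}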
@@ -612,11 +686,18 @@ struct ConditionalJoinSingleReturnTest : public ConditionalJoinTest { template struct ConditionalLeftSemiJoinTest : public ConditionalJoinSingleReturnTest { std::unique_ptr> join( - cudf::table_view left, cudf::table_view right, cudf::ast::expression predicate) override + cudf::table_view left, cudf::table_view right, cudf::ast::operation predicate) override { return cudf::conditional_left_semi_join(left, right, predicate); } + std::size_t join_size(cudf::table_view left, + cudf::table_view right, + cudf::ast::operation predicate) override + { + return cudf::conditional_left_semi_join_size(left, right, predicate); + } + std::unique_ptr> reference_join( cudf::table_view left, cudf::table_view right) override { @@ -663,11 +744,18 @@ TYPED_TEST(ConditionalLeftSemiJoinTest, TestCompareRandomToHash) template struct ConditionalLeftAntiJoinTest : public ConditionalJoinSingleReturnTest { std::unique_ptr> join( - cudf::table_view left, cudf::table_view right, cudf::ast::expression predicate) override + cudf::table_view left, cudf::table_view right, cudf::ast::operation predicate) override { return cudf::conditional_left_anti_join(left, right, predicate); } + std::size_t join_size(cudf::table_view left, + cudf::table_view right, + cudf::ast::operation predicate) override + { + return cudf::conditional_left_anti_join_size(left, right, predicate); + } + std::unique_ptr> reference_join( cudf::table_view left, cudf::table_view right) override { diff --git a/cpp/tests/reshape/interleave_columns_tests.cpp b/cpp/tests/reshape/interleave_columns_tests.cpp index 386fd9d08ee..e51f0740787 100644 --- a/cpp/tests/reshape/interleave_columns_tests.cpp +++ b/cpp/tests/reshape/interleave_columns_tests.cpp @@ -24,7 +24,7 @@ using namespace cudf::test::iterators; -constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::ALL_ERRORS}; +constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::FIRST_ERROR}; template struct InterleaveColumnsTest : public cudf::test::BaseFixture { @@ -378,7 +378,7 @@ using IntListsCol = cudf::test::lists_column_wrapper; using IntCol = cudf::test::fixed_width_column_wrapper; using TView = cudf::table_view; -constexpr int32_t null{0}; +constexpr int32_t null{0}; // mark for null elements } // namespace struct ListsColumnsInterleaveTest : public cudf::test::BaseFixture { @@ -731,4 +731,341 @@ TEST_F(ListsColumnsInterleaveTest, SlicedStringsColumnsInputWithNulls) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*expected, *results, verbosity); } +namespace { +using StructsCol = cudf::test::structs_column_wrapper; +using StringsCol = cudf::test::strings_column_wrapper; +} // namespace + +struct StructsColumnsInterleaveTest : public cudf::test::BaseFixture { +}; + +TEST_F(StructsColumnsInterleaveTest, InvalidInput) +{ + // Input table contains non-structs column + { + auto const col1 = IntCol{}; + auto const col2 = StructsCol{}; + EXPECT_THROW(cudf::interleave_columns(TView{{col1, col2}}), cudf::logic_error); + } + + // Types mismatch + { + auto const structs1 = [] { + auto child1 = IntCol{1, 2, 3}; + auto child2 = IntCol{4, 5, 6}; + return StructsCol{{child1, child2}}; + }(); + + auto const structs2 = [] { + auto child1 = IntCol{7, 8, 9}; + auto child2 = StringsCol{"", "abc", "123"}; + return StructsCol{{child1, child2}}; + }(); + + EXPECT_THROW(cudf::interleave_columns(TView{{structs1, structs2}}), cudf::logic_error); + } + + // Numbers of children mismatch + { + auto const structs1 = [] { + auto child1 = IntCol{1, 2, 3}; + auto 
child2 = IntCol{4, 5, 6}; + return StructsCol{{child1, child2}}; + }(); + + auto const structs2 = [] { + auto child1 = IntCol{7, 8, 9}; + auto child2 = IntCol{10, 11, 12}; + auto child3 = IntCol{13, 14, 15}; + return StructsCol{{child1, child2, child3}}; + }(); + + EXPECT_THROW(cudf::interleave_columns(TView{{structs1, structs2}}), cudf::logic_error); + } +} + +TEST_F(StructsColumnsInterleaveTest, InterleaveEmptyColumns) +{ + auto const structs = StructsCol{}; + auto const results = cudf::interleave_columns(TView{{structs, structs}}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(structs, *results, verbosity); +} + +template +struct StructsColumnsInterleaveTypedTest : public cudf::test::BaseFixture { +}; + +using TypesForTest = cudf::test::Concat; +TYPED_TEST_SUITE(StructsColumnsInterleaveTypedTest, TypesForTest); + +TYPED_TEST(StructsColumnsInterleaveTypedTest, InterleaveOneColumnNotNull) +{ + using ColWrapper = cudf::test::fixed_width_column_wrapper; + + auto const structs = [] { + auto child1 = ColWrapper{1, 2, 3}; + auto child2 = ColWrapper{4, 5, 6}; + auto child3 = StringsCol{"Banana", "Mango", "Apple"}; + return StructsCol{{child1, child2, child3}}; + }(); + auto const results = cudf::interleave_columns(TView{{structs}}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(structs, *results, verbosity); +} + +TYPED_TEST(StructsColumnsInterleaveTypedTest, InterleaveOneColumnWithNulls) +{ + using ColWrapper = cudf::test::fixed_width_column_wrapper; + + auto const structs = [] { + auto child1 = ColWrapper{{1, 2, null, 3}, null_at(2)}; + auto child2 = ColWrapper{{4, null, 5, 6}, null_at(1)}; + auto child3 = StringsCol{{"" /*NULL*/, "Banana", "Mango", "Apple"}, null_at(0)}; + return StructsCol{{child1, child2, child3}, null_at(3)}; + }(); + auto const results = cudf::interleave_columns(TView{{structs}}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(structs, *results, verbosity); +} + +TYPED_TEST(StructsColumnsInterleaveTypedTest, SimpleInputNoNull) +{ + using ColWrapper = cudf::test::fixed_width_column_wrapper; + + auto const structs1 = [] { + auto child1 = ColWrapper{1, 2, 3}; + auto child2 = ColWrapper{4, 5, 6}; + auto child3 = StringsCol{"Banana", "Mango", "Apple"}; + return StructsCol{{child1, child2, child3}}; + }(); + + auto const structs2 = [] { + auto child1 = ColWrapper{7, 8, 9}; + auto child2 = ColWrapper{10, 11, 12}; + auto child3 = StringsCol{"Bear", "Duck", "Cat"}; + return StructsCol{{child1, child2, child3}}; + }(); + + auto const expected = [] { + auto child1 = ColWrapper{1, 7, 2, 8, 3, 9}; + auto child2 = ColWrapper{4, 10, 5, 11, 6, 12}; + auto child3 = StringsCol{"Banana", "Bear", "Mango", "Duck", "Apple", "Cat"}; + return StructsCol{{child1, child2, child3}}; + }(); + + auto const results = cudf::interleave_columns(TView{{structs1, structs2}}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *results, verbosity); +} + +TYPED_TEST(StructsColumnsInterleaveTypedTest, SimpleInputWithNulls) +{ + using ColWrapper = cudf::test::fixed_width_column_wrapper; + + auto const structs1 = [] { + auto child1 = ColWrapper{{1, 2, null, 3, 4}, null_at(2)}; + auto child2 = ColWrapper{{4, null, 5, 6, 7}, null_at(1)}; + auto child3 = StringsCol{{"" /*NULL*/, "Banana", "Mango", "Apple", "Cherry"}, null_at(0)}; + return StructsCol{{child1, child2, child3}, null_at(0)}; + }(); + + auto const structs2 = [] { + auto child1 = ColWrapper{{7, null, null, 8, 9}, nulls_at({1, 2})}; + auto child2 = ColWrapper{{10, 11, 12, null, 14}, null_at(3)}; + auto child3 = StringsCol{"Bear", "Duck", "Cat", "Dog", "Panda"}; + return 
StructsCol{{child1, child2, child3}, null_at(4)}; + }(); + + auto const structs3 = [] { + auto child1 = ColWrapper{{-1, -2, -3, 0, null}, null_at(4)}; + auto child2 = ColWrapper{{-5, 0, null, -1, -10}, null_at(2)}; + auto child3 = StringsCol{"111", "Bànànà", "abcxyz", "é á í", "zzz"}; + return StructsCol{{child1, child2, child3}, null_at(1)}; + }(); + + auto const expected = [] { + auto child1 = ColWrapper{{1, 7, -1, 2, null, -2, null, null, -3, 3, 8, 0, 4, 9, null}, + nulls_at({4, 6, 7, 14})}; + auto child2 = ColWrapper{{4, 10, -5, null, 11, 0, 5, 12, null, 6, null, -1, 7, 14, -10}, + nulls_at({3, 8, 10})}; + auto child3 = StringsCol{{"" /*NULL*/, + "Bear", + "111", + "Banana", + "Duck", + "Bànànà", + "Mango", + "Cat", + "abcxyz", + "Apple", + "Dog", + "é á í", + "Cherry", + "Panda", + "zzz"}, + null_at(0)}; + return StructsCol{{child1, child2, child3}, nulls_at({0, 5, 13})}; + }(); + + auto const results = cudf::interleave_columns(TView{{structs1, structs2, structs3}}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *results, verbosity); +} + +TYPED_TEST(StructsColumnsInterleaveTypedTest, NestedInputStructsColumns) +{ + using ColWrapper = cudf::test::fixed_width_column_wrapper; + + auto const structs1 = [] { + auto child_structs1 = [] { + auto child1 = ColWrapper{{null, 2, 3, 4, 5}, null_at(0)}; + auto child2 = ColWrapper{{6, 7, 8, null, 10}, null_at(3)}; + return StructsCol{{child1, child2}, null_at(0)}; + }(); + + auto child_structs2 = [] { + auto child1 = ColWrapper{{11, null, 13, 14, 15}, null_at(1)}; + auto child2 = ColWrapper{{null, 17, 18, 19, 20}, null_at(0)}; + return StructsCol{{child1, child2}, nulls_at({0, 1})}; + }(); + + auto child_strings = [] { return StringsCol{"Banana", "Mango", "Apple", "Cherry", "Kiwi"}; }(); + + return StructsCol{{child_structs1, child_structs2, child_strings}, null_at(0)}; + }(); + + auto const structs2 = [] { + auto child_structs1 = [] { + auto child1 = ColWrapper{{-1, null, -3, -4, -5}, null_at(1)}; + auto child2 = ColWrapper{{-6, -7, -8, null, -10}, null_at(3)}; + return StructsCol{{child1, child2}}; + }(); + + auto child_structs2 = [] { + auto child1 = ColWrapper{{-11, -12, null, -14, -15}, null_at(2)}; + auto child2 = ColWrapper{{-16, -17, -18, -19, null}, null_at(4)}; + return StructsCol{{child1, child2}, null_at(2)}; + }(); + + auto child_strings = [] { return StringsCol{"Bear", "Duck", "Cat", "Dog", "Rabbit"}; }(); + + return StructsCol{{child_structs1, child_structs2, child_strings}, null_at(2)}; + }(); + + auto const expected = [] { + auto child_structs1 = [] { + auto child1 = ColWrapper{{null, -1, 2, null, 3, -3, 4, -4, 5, -5}, nulls_at({0, 3})}; + auto child2 = ColWrapper{{6, -6, 7, -7, 8, -8, null, null, 10, -10}, nulls_at({6, 7})}; + return StructsCol{{child1, child2}, null_at(0)}; + }(); + + auto child_structs2 = [] { + auto child1 = ColWrapper{{11, -11, null, -12, 13, null, 14, -14, 15, -15}, nulls_at({2, 5})}; + auto child2 = ColWrapper{{null, -16, 17, -17, 18, -18, 19, -19, 20, null}, nulls_at({0, 9})}; + return StructsCol{{child1, child2}, nulls_at({0, 2, 5})}; + }(); + + auto child_strings = [] { + return StringsCol{ + "Banana", "Bear", "Mango", "Duck", "Apple", "Cat", "Cherry", "Dog", "Kiwi", "Rabbit"}; + }(); + + return StructsCol{{child_structs1, child_structs2, child_strings}, nulls_at({0, 5})}; + }(); + + auto const results = cudf::interleave_columns(TView{{structs1, structs2}}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *results, verbosity); +} + +TYPED_TEST(StructsColumnsInterleaveTypedTest, 
SlicedColumnsInputNoNull) +{ + using ColWrapper = cudf::test::fixed_width_column_wrapper; + constexpr int32_t NOT_USE{-1}; // mark for elements that we don't care + + auto const structs1_original = [] { + auto child1 = ColWrapper{NOT_USE, NOT_USE, 1, 2, 3, NOT_USE}; + auto child2 = ColWrapper{NOT_USE, NOT_USE, 4, 5, 6, NOT_USE}; + auto child3 = StringsCol{"NOT_USE", "NOT_USE", "Banana", "Mango", "Apple", "NOT_USE"}; + return StructsCol{{child1, child2, child3}}; + }(); + + // structs2 has more rows than structs1 + auto const structs2_original = [] { + auto child1 = ColWrapper{NOT_USE, 7, 8, 9, NOT_USE, NOT_USE, NOT_USE}; + auto child2 = ColWrapper{NOT_USE, 10, 11, 12, NOT_USE, NOT_USE, NOT_USE}; + auto child3 = StringsCol{"NOT_USE", "Bear", "Duck", "Cat", "NOT_USE", "NOT_USE", "NOT_USE"}; + return StructsCol{{child1, child2, child3}}; + }(); + + auto const expected = [] { + auto child1 = ColWrapper{1, 7, 2, 8, 3, 9}; + auto child2 = ColWrapper{4, 10, 5, 11, 6, 12}; + auto child3 = StringsCol{"Banana", "Bear", "Mango", "Duck", "Apple", "Cat"}; + return StructsCol{{child1, child2, child3}}; + }(); + + auto const structs1 = cudf::slice(structs1_original, {2, 5})[0]; + auto const structs2 = cudf::slice(structs2_original, {1, 4})[0]; + auto const results = cudf::interleave_columns(TView{{structs1, structs2}}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *results, verbosity); +} + +TYPED_TEST(StructsColumnsInterleaveTypedTest, SlicedColumnsInputWithNulls) +{ + using ColWrapper = cudf::test::fixed_width_column_wrapper; + constexpr int32_t NOT_USE{-1}; // mark for elements that we don't care + + auto const structs1_original = [] { + auto child1 = ColWrapper{{NOT_USE, NOT_USE, 1, 2, null, 3, 4, NOT_USE}, null_at(4)}; + auto child2 = ColWrapper{{NOT_USE, NOT_USE, 4, null, 5, 6, 7, NOT_USE}, null_at(3)}; + auto child3 = StringsCol{ + {"NOT_USE", "NOT_USE", "" /*NULL*/, "Banana", "Mango", "Apple", "Cherry", "NOT_USE"}, + null_at(2)}; + return StructsCol{{child1, child2, child3}, null_at(2)}; + }(); + + auto const structs2_original = [] { + auto child1 = ColWrapper{{7, null, null, 8, 9, NOT_USE, NOT_USE}, nulls_at({1, 2})}; + auto child2 = ColWrapper{{10, 11, 12, null, 14, NOT_USE, NOT_USE}, null_at(3)}; + auto child3 = StringsCol{"Bear", "Duck", "Cat", "Dog", "Panda", "NOT_USE", "NOT_USE"}; + return StructsCol{{child1, child2, child3}, null_at(4)}; + }(); + + auto const structs3_original = [] { + auto child1 = ColWrapper{{NOT_USE, NOT_USE, NOT_USE, -1, -2, -3, 0, null}, null_at(7)}; + auto child2 = ColWrapper{{NOT_USE, NOT_USE, NOT_USE, -5, 0, null, -1, -10}, null_at(5)}; + auto child3 = + StringsCol{"NOT_USE", "NOT_USE", "NOT_USE", "111", "Bànànà", "abcxyz", "é á í", "zzz"}; + return StructsCol{{child1, child2, child3}, null_at(4)}; + }(); + + auto const expected = [] { + auto child1 = ColWrapper{{1, 7, -1, 2, null, -2, null, null, -3, 3, 8, 0, 4, 9, null}, + nulls_at({4, 6, 7, 14})}; + auto child2 = ColWrapper{{4, 10, -5, null, 11, 0, 5, 12, null, 6, null, -1, 7, 14, -10}, + nulls_at({3, 8, 10})}; + auto child3 = StringsCol{{"" /*NULL*/, + "Bear", + "111", + "Banana", + "Duck", + "Bànànà", + "Mango", + "Cat", + "abcxyz", + "Apple", + "Dog", + "é á í", + "Cherry", + "Panda", + "zzz"}, + null_at(0)}; + return StructsCol{{child1, child2, child3}, nulls_at({0, 5, 13})}; + }(); + + auto const structs1 = cudf::slice(structs1_original, {2, 7})[0]; + auto const structs2 = cudf::slice(structs2_original, {0, 5})[0]; + auto const structs3 = cudf::slice(structs3_original, {3, 8})[0]; + auto const results = 
cudf::interleave_columns(TView{{structs1, structs2, structs3}}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *results, verbosity); +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/rolling/grouped_rolling_test.cpp b/cpp/tests/rolling/grouped_rolling_test.cpp index cb123114fd8..72b30c19fd5 100644 --- a/cpp/tests/rolling/grouped_rolling_test.cpp +++ b/cpp/tests/rolling/grouped_rolling_test.cpp @@ -139,19 +139,6 @@ class GroupedRollingTest : public cudf::test::BaseFixture { auto reference = create_reference_output( op, input, expected_grouping, preceding_window, following_window, min_periods); -#ifndef NDEBUG - std::cout << "input:\n"; - cudf::test::print(input, std::cout, ", "); - std::cout << "\n"; - std::cout << "output:\n"; - cudf::test::print(*output, std::cout, ", "); - std::cout << "\n"; - std::cout << "reference:\n"; - cudf::test::print(*reference, std::cout, ", "); - std::cout << "\n"; - std::cout << "\n"; -#endif - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*output, *reference); } @@ -709,19 +696,6 @@ class GroupedTimeRangeRollingTest : public cudf::test::BaseFixture { following_window_in_days, min_periods); -#ifndef NDEBUG - std::cout << "input:\n"; - cudf::test::print(input, std::cout, ", "); - std::cout << "\n"; - std::cout << "output:\n"; - cudf::test::print(*output, std::cout, ", "); - std::cout << "\n"; - std::cout << "reference:\n"; - cudf::test::print(*reference, std::cout, ", "); - std::cout << "\n"; - std::cout << "\n"; -#endif - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*output, *reference); } diff --git a/cpp/tests/rolling/rolling_test.cpp b/cpp/tests/rolling/rolling_test.cpp index a67e670acb7..ec88500fde1 100644 --- a/cpp/tests/rolling/rolling_test.cpp +++ b/cpp/tests/rolling/rolling_test.cpp @@ -190,19 +190,6 @@ class RollingTest : public cudf::test::BaseFixture { auto reference = create_reference_output(op, input, preceding_window, following_window, min_periods); -#if 0 - std::cout << "input:\n"; - cudf::test::print(input, std::cout, ", "); - std::cout << "\n"; - std::cout << "output:\n"; - cudf::test::print(*output, std::cout, ", "); - std::cout << "\n"; - std::cout << "reference:\n"; - cudf::test::print(reference, std::cout, ", "); - std::cout << "\n"; - std::cout << "\n"; -#endif - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*output, *reference); } diff --git a/cpp/tests/scalar/scalar_test.cpp b/cpp/tests/scalar/scalar_test.cpp index 2047d815867..b54594fd1c4 100644 --- a/cpp/tests/scalar/scalar_test.cpp +++ b/cpp/tests/scalar/scalar_test.cpp @@ -14,19 +14,12 @@ * limitations under the License. 
*/ -#include -#include -#include #include #include -#include #include -#include #include -#include -#include -#include +#include template struct TypedScalarTest : public cudf::test::BaseFixture { diff --git a/cpp/tests/strings/combine/join_list_elements_tests.cpp b/cpp/tests/strings/combine/join_list_elements_tests.cpp index d4aafbf5f23..bf739e83241 100644 --- a/cpp/tests/strings/combine/join_list_elements_tests.cpp +++ b/cpp/tests/strings/combine/join_list_elements_tests.cpp @@ -90,11 +90,11 @@ TEST_F(StringsListsConcatenateTest, ZeroSizeStringsInput) auto const expected = STR_COL{"", "", "", ""}; auto results = cudf::strings::join_list_elements(string_lv); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected, verbosity); auto const separators = STR_COL{"", "", "", ""}.release(); results = cudf::strings::join_list_elements(string_lv, separators->view()); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected, verbosity); } // Empty list results in null diff --git a/cpp/tests/strings/replace_regex_tests.cpp b/cpp/tests/strings/replace_regex_tests.cpp index a2486d60051..1f01f0f1429 100644 --- a/cpp/tests/strings/replace_regex_tests.cpp +++ b/cpp/tests/strings/replace_regex_tests.cpp @@ -167,6 +167,20 @@ TEST_F(StringsReplaceTests, ReplaceBackrefsRegexTest) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } +TEST_F(StringsReplaceTests, ReplaceBackrefsRegexAltIndexPatternTest) +{ + cudf::test::strings_column_wrapper strings({"12-3 34-5 67-89", "0-99: 777-888:: 5673-0"}); + auto strings_view = cudf::strings_column_view(strings); + + std::string pattern = "(\\d+)-(\\d+)"; + std::string repl_template = "${2} X ${1}0"; + auto results = cudf::strings::replace_with_backrefs(strings_view, pattern, repl_template); + + cudf::test::strings_column_wrapper expected( + {"3 X 120 5 X 340 89 X 670", "99 X 00: 888 X 7770:: 0 X 56730"}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + TEST_F(StringsReplaceTests, ReplaceBackrefsRegexReversedTest) { cudf::test::strings_column_wrapper strings( @@ -203,6 +217,17 @@ TEST_F(StringsReplaceTests, BackrefWithGreedyQuantifier) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } +TEST_F(StringsReplaceTests, ReplaceBackrefsRegexErrorTest) +{ + cudf::test::strings_column_wrapper strings({"this string left intentionally blank"}); + auto view = cudf::strings_column_view(strings); + + EXPECT_THROW(cudf::strings::replace_with_backrefs(view, "(\\w)", "\\0"), cudf::logic_error); + EXPECT_THROW(cudf::strings::replace_with_backrefs(view, "(\\w)", "\\123"), cudf::logic_error); + EXPECT_THROW(cudf::strings::replace_with_backrefs(view, "", "\\1"), cudf::logic_error); + EXPECT_THROW(cudf::strings::replace_with_backrefs(view, "(\\w)", ""), cudf::logic_error); +} + TEST_F(StringsReplaceTests, MediumReplaceRegex) { // This results in 95 regex instructions and falls in the 'medium' range. 
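As a point of reference for the new backref tests above, a small usage sketch of cudf::strings::replace_with_backrefs with the same numbered-group replacement template the test uses (illustrative only; `input` is an assumed pre-existing strings column, not part of this patch):

#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/strings/replace_re.hpp>
#include <cudf/strings/strings_column_view.hpp>

#include <memory>

std::unique_ptr<cudf::column> swap_number_pairs(cudf::column_view const& input)
{
  auto const sv = cudf::strings_column_view(input);
  // "${1}", "${2}", ... in the replacement template refer to capture groups in the
  // pattern; e.g. "12-3" becomes "3 X 120" (group 2, " X ", group 1, then a literal "0").
  return cudf::strings::replace_with_backrefs(sv, "(\\d+)-(\\d+)", "${2} X ${1}0");
}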
diff --git a/cpp/tests/structs/structs_column_tests.cpp b/cpp/tests/structs/structs_column_tests.cpp index 548284d6c87..a94a35e8896 100644 --- a/cpp/tests/structs/structs_column_tests.cpp +++ b/cpp/tests/structs/structs_column_tests.cpp @@ -433,11 +433,6 @@ TYPED_TEST(TypedStructColumnWrapperTest, TestListsOfStructs) cudf::test::expect_columns_equivalent(expected_unchanged_struct_col, cudf::lists_column_view(*list_col).child()); - -#ifndef NDEBUG - std::cout << "Printing list col: \n"; - cudf::test::print(*list_col); -#endif } TYPED_TEST(TypedStructColumnWrapperTest, ListOfStructOfList) diff --git a/cpp/tests/structs/utilities_tests.cpp b/cpp/tests/structs/utilities_tests.cpp new file mode 100644 index 00000000000..d4ded02adce --- /dev/null +++ b/cpp/tests/structs/utilities_tests.cpp @@ -0,0 +1,220 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace cudf::test { + +/** + * @brief Round-trip input table through flatten/unflatten, + * verify that the table remains equivalent. + */ +void flatten_unflatten_compare(table_view const& input_table) +{ + using namespace cudf::structs::detail; + + auto [flattened, _, __, ___] = + flatten_nested_columns(input_table, {}, {}, column_nullability::FORCE); + auto unflattened = + unflatten_nested_columns(std::make_unique(flattened), input_table); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(input_table, unflattened->view()); +} + +using namespace cudf; +using iterators::null_at; +using strings = strings_column_wrapper; +using structs = structs_column_wrapper; + +struct StructUtilitiesTest : BaseFixture { +}; + +template +struct TypedStructUtilitiesTest : StructUtilitiesTest { +}; + +TYPED_TEST_CASE(TypedStructUtilitiesTest, FixedWidthTypes); + +TYPED_TEST(TypedStructUtilitiesTest, ListsAtTopLevelUnsupported) +{ + using T = TypeParam; + using lists = lists_column_wrapper; + using nums = fixed_width_column_wrapper; + + auto lists_col = lists{{0, 1}, {22, 33}, {44, 55, 66}}; + auto nums_col = nums{{0, 1, 2}, null_at(6)}; + + EXPECT_THROW(flatten_unflatten_compare(cudf::table_view{{lists_col, nums_col}}), + cudf::logic_error); +} + +TYPED_TEST(TypedStructUtilitiesTest, NestedListsUnsupported) +{ + using T = TypeParam; + using lists = lists_column_wrapper; + using nums = fixed_width_column_wrapper; + + auto lists_member = lists{{0, 1}, {22, 33}, {44, 55, 66}}; + auto nums_member = nums{{0, 1, 2}, null_at(6)}; + auto structs_col = structs{{nums_member, lists_member}}; + + auto nums_col = nums{{0, 1, 2}, null_at(6)}; + + EXPECT_THROW(flatten_unflatten_compare(cudf::table_view{{nums_col, structs_col}}), + cudf::logic_error); +} + +TYPED_TEST(TypedStructUtilitiesTest, NoStructs) +{ + using T = TypeParam; + using nums = fixed_width_column_wrapper; + + auto nums_col = nums{{0, 1, 22, 33, 44, 55, 66}, null_at(0)}; + auto strings_col = strings{{"", "1", "22", "333", "4444", "55555", "666666"}, null_at(1)}; + 
auto nuther_nums_col = nums{{0, 1, 2, 3, 4, 5, 6}, null_at(6)}; + + flatten_unflatten_compare(cudf::table_view{{nums_col, strings_col, nuther_nums_col}}); +} + +TYPED_TEST(TypedStructUtilitiesTest, SingleLevelStruct) +{ + using T = TypeParam; + using nums = fixed_width_column_wrapper; + + auto nums_member = nums{{0, 1, 22, 333, 44, 55, 66}, null_at(0)}; + auto strings_member = strings{{"", "1", "22", "333", "4444", "55555", "666666"}, null_at(1)}; + auto structs_col = structs{{nums_member, strings_member}}; + auto nums_col = nums{{0, 1, 2, 3, 4, 5, 6}, null_at(6)}; + + flatten_unflatten_compare(cudf::table_view{{nums_col, structs_col}}); +} + +TYPED_TEST(TypedStructUtilitiesTest, SingleLevelStructWithNulls) +{ + using T = TypeParam; + using nums = fixed_width_column_wrapper; + + auto nums_member = nums{{0, 1, 22, 333, 44, 55, 66}, null_at(0)}; + auto strings_member = strings{{"", "1", "22", "333", "4444", "55555", "666666"}, null_at(1)}; + auto structs_col = structs{{nums_member, strings_member}, null_at(2)}; + auto nums_col = nums{{0, 1, 2, 3, 4, 5, 6}, null_at(6)}; + + flatten_unflatten_compare(cudf::table_view{{nums_col, structs_col}}); +} + +TYPED_TEST(TypedStructUtilitiesTest, StructOfStruct) +{ + using T = TypeParam; + using nums = fixed_width_column_wrapper; + + auto nums_col = nums{{0, 1, 2, 3, 4, 5, 6}, null_at(6)}; + + auto struct_0_nums_member = nums{{0, 1, 22, 33, 44, 55, 66}, null_at(0)}; + auto struct_0_strings_member = + strings{{"", "1", "22", "333", "4444", "55555", "666666"}, null_at(1)}; + auto structs_1_structs_member = structs{{struct_0_nums_member, struct_0_strings_member}}; + + auto struct_1_nums_member = nums{{0, 1, 22, 33, 44, 55, 66}, null_at(3)}; + auto struct_of_structs_col = structs{{struct_1_nums_member, structs_1_structs_member}}; + + flatten_unflatten_compare(cudf::table_view{{nums_col, struct_of_structs_col}}); +} + +TYPED_TEST(TypedStructUtilitiesTest, StructOfStructWithNullsAtLeafLevel) +{ + using T = TypeParam; + using nums = fixed_width_column_wrapper; + + auto nums_col = nums{{0, 1, 2, 3, 4, 5, 6}, null_at(6)}; + + auto struct_0_nums_member = nums{{0, 1, 22, 33, 44, 55, 66}, null_at(0)}; + auto struct_0_strings_member = + strings{{"", "1", "22", "333", "4444", "55555", "666666"}, null_at(1)}; + auto structs_1_structs_member = + structs{{struct_0_nums_member, struct_0_strings_member}, null_at(2)}; + + auto struct_1_nums_member = nums{{0, 1, 22, 33, 44, 55, 66}, null_at(3)}; + auto struct_of_structs_col = structs{{struct_1_nums_member, structs_1_structs_member}}; + + flatten_unflatten_compare(cudf::table_view{{nums_col, struct_of_structs_col}}); +} + +TYPED_TEST(TypedStructUtilitiesTest, StructOfStructWithNullsAtTopLevel) +{ + using T = TypeParam; + using nums = fixed_width_column_wrapper; + + auto nums_col = nums{{0, 1, 2, 3, 4, 5, 6}, null_at(6)}; + + auto struct_0_nums_member = nums{{0, 1, 22, 33, 44, 55, 66}, null_at(0)}; + auto struct_0_strings_member = + strings{{"", "1", "22", "333", "4444", "55555", "666666"}, null_at(1)}; + auto structs_1_structs_member = structs{{struct_0_nums_member, struct_0_strings_member}}; + + auto struct_1_nums_member = nums{{0, 1, 22, 33, 44, 55, 66}, null_at(3)}; + auto struct_of_structs_col = + structs{{struct_1_nums_member, structs_1_structs_member}, null_at(4)}; + + flatten_unflatten_compare(cudf::table_view{{nums_col, struct_of_structs_col}}); +} + +TYPED_TEST(TypedStructUtilitiesTest, StructOfStructWithNullsAtAllLevels) +{ + using T = TypeParam; + using nums = fixed_width_column_wrapper; + + auto nums_col = 
nums{{0, 1, 2, 3, 4, 5, 6}, null_at(6)}; + + auto struct_0_nums_member = nums{{0, 1, 22, 33, 44, 55, 66}, null_at(0)}; + auto struct_0_strings_member = + strings{{"", "1", "22", "333", "4444", "55555", "666666"}, null_at(1)}; + auto structs_1_structs_member = + structs{{struct_0_nums_member, struct_0_strings_member}, null_at(2)}; + + auto struct_1_nums_member = nums{{0, 1, 22, 33, 44, 55, 66}, null_at(3)}; + auto struct_of_structs_col = + structs{{struct_1_nums_member, structs_1_structs_member}, null_at(4)}; + + flatten_unflatten_compare(cudf::table_view{{nums_col, struct_of_structs_col}}); +} + +TYPED_TEST(TypedStructUtilitiesTest, ListsAreUnsupported) +{ + using T = TypeParam; + using ints = fixed_width_column_wrapper; + using lcw = lists_column_wrapper; + + // clang-format off + auto lists_member = lcw{ {0,1,2}, {3,4,5}, {6,7,8,9} }; + auto ints_member = ints{ 0, 1, 2 }; + // clang-format on + + auto structs_with_lists_col = structs{lists_member, ints_member}; + + EXPECT_THROW(flatten_unflatten_compare(cudf::table_view{{structs_with_lists_col}}), + cudf::logic_error); +} + +} // namespace cudf::test diff --git a/cpp/tests/utilities/column_utilities.cu b/cpp/tests/utilities/column_utilities.cu index 88e9e3d1384..f3002bc4b1a 100644 --- a/cpp/tests/utilities/column_utilities.cu +++ b/cpp/tests/utilities/column_utilities.cu @@ -114,14 +114,6 @@ std::unique_ptr generate_child_row_indices(lists_column_view const& c, // // result = [6, 1, 11, 1, 1] // - auto validity_iter = cudf::detail::make_counting_transform_iterator( - 0, - [row_indices = row_indices.begin(), - validity = c.null_mask(), - offset = c.offset()] __device__(int index) { - auto const true_index = row_indices[index] + offset; - return !validity || cudf::bit_is_set(validity, true_index) ? 1 : 0; - }); auto output_row_iter = cudf::detail::make_counting_transform_iterator( 0, [row_indices = row_indices.begin(), @@ -136,8 +128,9 @@ std::unique_ptr generate_child_row_indices(lists_column_view const& c, output_row_iter, output_row_iter + row_indices.size(), output_row_start->view().begin(), - validity_iter, - result->mutable_view().begin()); + row_size_iter, + result->mutable_view().begin(), + [] __device__(auto row_size) { return row_size != 0; }); // generate keys for each output row // @@ -150,11 +143,12 @@ std::unique_ptr generate_child_row_indices(lists_column_view const& c, keys->mutable_view().end(), [] __device__() { return 0; }); thrust::scatter_if(rmm::exec_policy(), - validity_iter, - validity_iter + row_indices.size(), + row_size_iter, + row_size_iter + row_indices.size(), output_row_start->view().begin(), - validity_iter, - keys->mutable_view().begin()); + row_size_iter, + keys->mutable_view().begin(), + [] __device__(auto row_size) { return row_size != 0; }); thrust::inclusive_scan(rmm::exec_policy(), keys->view().begin(), keys->view().end(), diff --git a/docs/cudf/source/_static/RAPIDS-logo-purple.png b/docs/cudf/source/_static/RAPIDS-logo-purple.png new file mode 100644 index 00000000000..d884e01374d Binary files /dev/null and b/docs/cudf/source/_static/RAPIDS-logo-purple.png differ diff --git a/docs/cudf/source/_static/copybutton_pydocs.js b/docs/cudf/source/_static/copybutton_pydocs.js deleted file mode 100644 index cec05777e6b..00000000000 --- a/docs/cudf/source/_static/copybutton_pydocs.js +++ /dev/null @@ -1,65 +0,0 @@ -$(document).ready(function() { - /* Add a [>>>] button on the top-right corner of code samples to hide - * the >>> and ... prompts and the output and thus make the code - * copyable. 
*/ - var div = $('.highlight-python .highlight,' + - '.highlight-python3 .highlight,' + - '.highlight-pycon .highlight,' + - '.highlight-default .highlight'); - var pre = div.find('pre'); - - // get the styles from the current theme - pre.parent().parent().css('position', 'relative'); - var hide_text = 'Hide the prompts and output'; - var show_text = 'Show the prompts and output'; - var border_width = pre.css('border-top-width'); - var border_style = pre.css('border-top-style'); - var border_color = pre.css('border-top-color'); - var button_styles = { - 'cursor':'pointer', 'position': 'absolute', 'top': '0', 'right': '0', - 'border-color': border_color, 'border-style': border_style, - 'border-width': border_width, 'text-size': '75%', - 'font-family': 'monospace', 'padding-left': '0.2em', 'padding-right': '1.5em', - 'border-radius': '0 3px 0 0', - 'transition': "0.5s" - } - - // create and add the button to all the code blocks that contain >>> - div.each(function(index) { - var jthis = $(this); - if (jthis.find('.gp').length > 0) { - var button = $('>>>'); - button.css(button_styles) - button.attr('title', hide_text); - button.data('hidden', 'false'); - jthis.prepend(button); - } - // tracebacks (.gt) contain bare text elements that need to be - // wrapped in a span to work with .nextUntil() (see later) - jthis.find('pre:has(.gt)').contents().filter(function() { - return ((this.nodeType == 3) && (this.data.trim().length > 0)); - }).wrap(''); - }); - - // define the behavior of the button when it's clicked - $('.copybutton').click(function(e){ - e.preventDefault(); - var button = $(this); - if (button.data('hidden') === 'false') { - // hide the code output - button.parent().find('.go, .gp, .gt').hide(); - button.next('pre').find('.gt').nextUntil('.gp, .go').css('visibility', 'hidden'); - button.css('text-decoration', 'line-through'); - button.attr('title', show_text); - button.data('hidden', 'true'); - } else { - // show the code output - button.parent().find('.go, .gp, .gt').show(); - button.next('pre').find('.gt').nextUntil('.gp, .go').css('visibility', 'visible'); - button.css('text-decoration', 'none'); - button.attr('title', hide_text); - button.data('hidden', 'false'); - } - }); -}); - diff --git a/docs/cudf/source/_static/params.css b/docs/cudf/source/_static/params.css index 475b9dfb4ec..2bdd6f5a299 100644 --- a/docs/cudf/source/_static/params.css +++ b/docs/cudf/source/_static/params.css @@ -8,14 +8,6 @@ content: ":"; } -.highlight:hover span#strike_button { - color:#767676; -} - -span#strike_button { - color :#d0ced7; -} - /* Fix for text wrap in sphinx tables: * https://rackerlabs.github.io/docs-rackspace/tools/rtd-tables.html */ @@ -40,3 +32,24 @@ table.io-supported-types-table { table.io-supported-types-table thead{ text-align: center !important; } + +:root { + + --pst-color-active-navigation: 114, 83, 237; + --pst-color-navbar-link: 77, 77, 77; + --pst-color-navbar-link-hover: var(--pst-color-active-navigation); + --pst-color-navbar-link-active: var(--pst-color-active-navigation); + --pst-color-sidebar-link: 77, 77, 77; + --pst-color-sidebar-link-hover: var(--pst-color-active-navigation); + --pst-color-sidebar-link-active: var(--pst-color-active-navigation); + --pst-color-sidebar-expander-background-hover: 244, 244, 244; + --pst-color-sidebar-caption: 77, 77, 77; + --pst-color-toc-link: 119, 117, 122; + --pst-color-toc-link-hover: var(--pst-color-active-navigation); + --pst-color-toc-link-active: var(--pst-color-active-navigation); + +} + +.special-table td, .special-table th { 
+ border: 1px solid #dee2e6; +} \ No newline at end of file diff --git a/docs/cudf/source/_templates/autosummary/class_with_autosummary.rst b/docs/cudf/source/_templates/autosummary/class_with_autosummary.rst new file mode 100644 index 00000000000..f86822bc567 --- /dev/null +++ b/docs/cudf/source/_templates/autosummary/class_with_autosummary.rst @@ -0,0 +1,33 @@ +{% extends "!autosummary/class.rst" %} + +{% block methods %} +{% if methods %} + +.. + HACK -- the point here is that we don't want this to appear in the output, but the autosummary should still generate the pages. + .. autosummary:: + :toctree: + {% for item in all_methods %} + {%- if not item.startswith('_') or item in ['__call__'] %} + {{ name }}.{{ item }} + {%- endif -%} + {%- endfor %} + +{% endif %} +{% endblock %} + +{% block attributes %} +{% if attributes %} + +.. + HACK -- the point here is that we don't want this to appear in the output, but the autosummary should still generate the pages. + .. autosummary:: + :toctree: + {% for item in all_attributes %} + {%- if not item.startswith('_') %} + {{ name }}.{{ item }} + {%- endif -%} + {%- endfor %} + +{% endif %} +{% endblock %} \ No newline at end of file diff --git a/docs/cudf/source/_templates/autosummary/class_without_autosummary.rst b/docs/cudf/source/_templates/autosummary/class_without_autosummary.rst new file mode 100644 index 00000000000..b57a7ceebb0 --- /dev/null +++ b/docs/cudf/source/_templates/autosummary/class_without_autosummary.rst @@ -0,0 +1,6 @@ +{{ fullname }} +{{ underline }} + +.. currentmodule:: {{ module }} + +.. autoclass:: {{ objname }} \ No newline at end of file diff --git a/docs/cudf/source/api.rst b/docs/cudf/source/api.rst deleted file mode 100644 index d3042be2129..00000000000 --- a/docs/cudf/source/api.rst +++ /dev/null @@ -1,270 +0,0 @@ -~~~~~~~~~~~~~~~~~~~ -cuDF API Reference -~~~~~~~~~~~~~~~~~~~ - -.. currentmodule:: cudf.core.dataframe - -DataFrame ---------- -.. autoclass:: DataFrame - :members: - :inherited-members: - :exclude-members: serialize, deserialize, device_deserialize, device_serialize, host_deserialize, host_serialize, to_dict, itertuples, iterrows - -Series ------- -.. currentmodule:: cudf.core.series - -.. autoclass:: Series - :members: - :inherited-members: - :exclude-members: serialize, deserialize, logical_not, logical_or, logical_and, remainder, sum_of_squares, fill, merge, iteritems, items, device_deserialize, device_serialize, host_deserialize, host_serialize, to_dict, tolist, to_list - -Lists ------ -.. currentmodule:: cudf.core.column.lists - -.. autoclass:: ListMethods - :members: - -Strings -------- -.. currentmodule:: cudf.core.column.string - -.. autoclass:: StringMethods - :members: - -General Functions ------------------ -.. automodule:: cudf.core.reshape - :members: -.. autofunction:: cudf.to_datetime -.. autofunction:: cudf.to_numeric - -Index ------ -.. currentmodule:: cudf.core.index -.. autoclass:: Index - :members: - :inherited-members: - :exclude-members: serialize, deserialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list - -RangeIndex ----------- -.. currentmodule:: cudf.core.index -.. autoclass:: RangeIndex - :members: - :inherited-members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list - -GenericIndex ------------- -.. currentmodule:: cudf.core.index -.. 
autoclass:: GenericIndex - :members: - :inherited-members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list - -MultiIndex ----------- -.. currentmodule:: cudf.core.multiindex -.. autoclass:: MultiIndex - :members: - :inherited-members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list - -Int8Index ---------- -.. currentmodule:: cudf.core.index -.. autoclass:: Int8Index - :members: - :inherited-members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list - -Int16Index ----------- -.. currentmodule:: cudf.core.index -.. autoclass:: Int16Index - :members: - :inherited-members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list - -Int32Index ----------- -.. currentmodule:: cudf.core.index -.. autoclass:: Int32Index - :members: - :inherited-members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list - -Int64Index ----------- -.. currentmodule:: cudf.core.index -.. autoclass:: Int64Index - :members: - :inherited-members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list - -UInt8Index ----------- -.. currentmodule:: cudf.core.index -.. autoclass:: UInt8Index - :inherited-members: - :members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list - -UInt16Index ------------ -.. currentmodule:: cudf.core.index -.. autoclass:: UInt16Index - :members: - :inherited-members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list - -UInt32Index ------------ -.. currentmodule:: cudf.core.index -.. autoclass:: UInt32Index - :members: - :inherited-members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list - -UInt64Index ------------ -.. currentmodule:: cudf.core.index -.. autoclass:: UInt64Index - :members: - :inherited-members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list - -Float32Index ------------- -.. currentmodule:: cudf.core.index -.. autoclass:: Float32Index - :members: - :inherited-members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list - -Float64Index ------------- -.. currentmodule:: cudf.core.index -.. autoclass:: Float64Index - :members: - :inherited-members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list - -CategoricalIndex ----------------- -.. currentmodule:: cudf.core.index -.. autoclass:: CategoricalIndex - :members: - :inherited-members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list - -StringIndex ------------ -.. currentmodule:: cudf.core.index -.. 
autoclass:: StringIndex - :members: - :inherited-members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list - -DatetimeIndex -------------- -.. currentmodule:: cudf.core.index -.. autoclass:: DatetimeIndex - :members: - :inherited-members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list - -TimedeltaIndex --------------- -.. currentmodule:: cudf.core.index -.. autoclass:: TimedeltaIndex - :members: - :inherited-members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list - -Categories ----------- -.. currentmodule:: cudf.core.column.categorical - -.. autoclass:: CategoricalAccessor - :members: - -GroupBy -------- -.. currentmodule:: cudf.core.groupby.groupby - -.. autoclass:: GroupBy - :members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize - -Window ------- -.. currentmodule:: cudf.core.window -.. autoclass:: Rolling - :members: - -SubwordTokenizer ----------------- -.. currentmodule:: cudf.core.subword_tokenizer - -.. autoclass:: SubwordTokenizer - :members: - :special-members: __call__ - -General utility functions -------------------------- -.. currentmodule:: cudf.testing - -.. automodule:: cudf.testing.testing - :members: - - -Timedelta Properties --------------------- -.. currentmodule:: cudf.core.series -.. autoclass:: TimedeltaProperties - :members: - -Datetime Properties -------------------- -.. currentmodule:: cudf.core.series -.. autoclass:: DatetimeProperties - :members: - -IO --- -.. currentmodule:: cudf.io - -.. automodule:: cudf.io.csv - :members: -.. automodule:: cudf.io.parquet - :members: -.. automodule:: cudf.io.orc - :members: -.. automodule:: cudf.io.json - :members: -.. automodule:: cudf.io.avro - :members: -.. automodule:: cudf.io.dlpack - :members: -.. automodule:: cudf.io.feather - :members: -.. automodule:: cudf.io.hdf - :members: - -Extending cuDF ----------------- -.. currentmodule:: cudf.api.extensions - -.. automodule:: cudf.api.extensions.accessor - :members: - -GpuArrowReader --------------- -.. currentmodule:: cudf.comm.gpuarrow -.. autoclass:: GpuArrowReader - :members: - :exclude-members: count, index diff --git a/docs/cudf/source/api_docs/dataframe.rst b/docs/cudf/source/api_docs/dataframe.rst new file mode 100644 index 00000000000..12ff1f13bc4 --- /dev/null +++ b/docs/cudf/source/api_docs/dataframe.rst @@ -0,0 +1,254 @@ +========= +DataFrame +========= +.. currentmodule:: cudf + +Constructor +~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + :template: autosummary/class_with_autosummary.rst + + DataFrame + +Attributes and underlying data +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +**Axes** + +.. autosummary:: + :toctree: api/ + + DataFrame.index + DataFrame.columns + +.. autosummary:: + :toctree: api/ + + DataFrame.dtypes + DataFrame.info + DataFrame.select_dtypes + DataFrame.values + DataFrame.ndim + DataFrame.size + DataFrame.shape + DataFrame.memory_usage + DataFrame.empty + +Conversion +~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + DataFrame.astype + DataFrame.copy + +Indexing, iteration +~~~~~~~~~~~~~~~~~~~ +.. 
autosummary:: + :toctree: api/ + + DataFrame.head + DataFrame.at + DataFrame.iat + DataFrame.loc + DataFrame.iloc + DataFrame.insert + DataFrame.__iter__ + DataFrame.iteritems + DataFrame.keys + DataFrame.iterrows + DataFrame.itertuples + DataFrame.pop + DataFrame.tail + DataFrame.isin + DataFrame.where + DataFrame.mask + DataFrame.query + +For more information on ``.at``, ``.iat``, ``.loc``, and +``.iloc``, see the :ref:`indexing documentation `. + +Binary operator functions +~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + DataFrame.add + DataFrame.sub + DataFrame.mul + DataFrame.div + DataFrame.truediv + DataFrame.floordiv + DataFrame.mod + DataFrame.pow + DataFrame.radd + DataFrame.rsub + DataFrame.rmul + DataFrame.rdiv + DataFrame.rtruediv + DataFrame.rfloordiv + DataFrame.rmod + DataFrame.rpow + +Function application, GroupBy & window +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + DataFrame.apply + DataFrame.apply_chunks + DataFrame.apply_rows + DataFrame.pipe + DataFrame.agg + DataFrame.groupby + DataFrame.rolling + +.. _api.dataframe.stats: + +Computations / descriptive stats +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + DataFrame.all + DataFrame.any + DataFrame.clip + DataFrame.corr + DataFrame.count + DataFrame.cov + DataFrame.cummax + DataFrame.cummin + DataFrame.cumprod + DataFrame.cumsum + DataFrame.describe + DataFrame.kurt + DataFrame.kurtosis + DataFrame.max + DataFrame.mean + DataFrame.min + DataFrame.mode + DataFrame.prod + DataFrame.product + DataFrame.quantile + DataFrame.quantiles + DataFrame.rank + DataFrame.round + DataFrame.skew + DataFrame.sum + DataFrame.std + DataFrame.var + +Reindexing / selection / label manipulation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + DataFrame.drop + DataFrame.drop_duplicates + DataFrame.equals + DataFrame.head + DataFrame.reindex + DataFrame.rename + DataFrame.reset_index + DataFrame.sample + DataFrame.searchsorted + DataFrame.set_index + DataFrame.repeat + DataFrame.tail + DataFrame.take + DataFrame.tile + +.. _api.dataframe.missing: + +Missing data handling +~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + DataFrame.dropna + DataFrame.fillna + DataFrame.isna + DataFrame.isnull + DataFrame.nans_to_nulls + DataFrame.notna + DataFrame.notnull + DataFrame.replace + +Reshaping, sorting, transposing +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + DataFrame.argsort + DataFrame.interleave_columns + DataFrame.partition_by_hash + DataFrame.pivot + DataFrame.scatter_by_map + DataFrame.sort_values + DataFrame.sort_index + DataFrame.nlargest + DataFrame.nsmallest + DataFrame.stack + DataFrame.unstack + DataFrame.melt + DataFrame.explode + DataFrame.T + DataFrame.transpose + +Combining / comparing / joining / merging / encoding +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + DataFrame.append + DataFrame.assign + DataFrame.join + DataFrame.merge + DataFrame.update + DataFrame.label_encoding + DataFrame.one_hot_encoding + +Numerical operations +~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + DataFrame.acos + DataFrame.asin + DataFrame.atan + DataFrame.cos + DataFrame.exp + DataFrame.log + DataFrame.sin + DataFrame.sqrt + DataFrame.tan + +Time Series-related +~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + DataFrame.shift + +Serialization / IO / conversion +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. 
autosummary:: + :toctree: api/ + + DataFrame.as_gpu_matrix + DataFrame.as_matrix + DataFrame.from_arrow + DataFrame.from_pandas + DataFrame.from_records + DataFrame.hash_columns + DataFrame.to_arrow + DataFrame.to_dlpack + DataFrame.to_parquet + DataFrame.to_csv + DataFrame.to_hdf + DataFrame.to_dict + DataFrame.to_json + DataFrame.to_pandas + DataFrame.to_feather + DataFrame.to_records + DataFrame.to_string diff --git a/docs/cudf/source/api_docs/general_functions.rst b/docs/cudf/source/api_docs/general_functions.rst new file mode 100644 index 00000000000..226ae8acd32 --- /dev/null +++ b/docs/cudf/source/api_docs/general_functions.rst @@ -0,0 +1,32 @@ +================= +General Functions +================= +.. currentmodule:: cudf + +Data manipulations +------------------ + +.. autosummary:: + :toctree: api/ + + cudf.concat + cudf.melt + cudf.get_dummies + cudf.merge_sorted + cudf.pivot + cudf.unstack + +Top-level conversions +--------------------- +.. autosummary:: + :toctree: api/ + + cudf.to_numeric + +Top-level dealing with datetimelike +----------------------------------- + +.. autosummary:: + :toctree: api/ + + cudf.to_datetime diff --git a/docs/cudf/source/api_docs/general_utilities.rst b/docs/cudf/source/api_docs/general_utilities.rst new file mode 100644 index 00000000000..d9c53c3fbbd --- /dev/null +++ b/docs/cudf/source/api_docs/general_utilities.rst @@ -0,0 +1,13 @@ +================= +General Utilities +================= + +Testing functions +----------------- +.. autosummary:: + :toctree: api/ + + cudf.testing.testing.assert_column_equal + cudf.testing.testing.assert_frame_equal + cudf.testing.testing.assert_index_equal + cudf.testing.testing.assert_series_equal diff --git a/docs/cudf/source/api_docs/groupby.rst b/docs/cudf/source/api_docs/groupby.rst new file mode 100644 index 00000000000..27a314fa425 --- /dev/null +++ b/docs/cudf/source/api_docs/groupby.rst @@ -0,0 +1,96 @@ +.. _api.groupby: + +======= +GroupBy +======= +.. currentmodule:: cudf.core.groupby + +GroupBy objects are returned by groupby calls: :func:`cudf.DataFrame.groupby`, :func:`cudf.Series.groupby`, etc. + +Indexing, iteration +------------------- +.. autosummary:: + :toctree: api/ + + GroupBy.__iter__ + GroupBy.groups + +.. currentmodule:: cudf + +.. autosummary:: + :toctree: api/ + + Grouper + +.. currentmodule:: cudf.core.groupby.groupby + +Function application +-------------------- +.. autosummary:: + :toctree: api/ + + GroupBy.apply + GroupBy.agg + SeriesGroupBy.aggregate + DataFrameGroupBy.aggregate + GroupBy.pipe + +Computations / descriptive stats +-------------------------------- +.. autosummary:: + :toctree: api/ + + GroupBy.bfill + GroupBy.backfill + GroupBy.count + GroupBy.cumcount + GroupBy.cummax + GroupBy.cummin + GroupBy.cumsum + GroupBy.ffill + GroupBy.max + GroupBy.mean + GroupBy.median + GroupBy.min + GroupBy.nth + GroupBy.pad + GroupBy.prod + GroupBy.size + GroupBy.std + GroupBy.sum + GroupBy.var + +The following methods are available in both ``SeriesGroupBy`` and +``DataFrameGroupBy`` objects, but may differ slightly, usually in that +the ``DataFrameGroupBy`` version usually permits the specification of an +axis argument, and often an argument indicating whether to restrict +application to columns of a specific data type. + +.. 
autosummary:: + :toctree: api/ + + DataFrameGroupBy.backfill + DataFrameGroupBy.bfill + DataFrameGroupBy.count + DataFrameGroupBy.cumcount + DataFrameGroupBy.cummax + DataFrameGroupBy.cummin + DataFrameGroupBy.cumsum + DataFrameGroupBy.describe + DataFrameGroupBy.ffill + DataFrameGroupBy.fillna + DataFrameGroupBy.idxmax + DataFrameGroupBy.idxmin + DataFrameGroupBy.nunique + DataFrameGroupBy.pad + DataFrameGroupBy.quantile + DataFrameGroupBy.shift + DataFrameGroupBy.size + +The following methods are available only for ``SeriesGroupBy`` objects. + +.. autosummary:: + :toctree: api/ + + SeriesGroupBy.nunique + SeriesGroupBy.unique diff --git a/docs/cudf/source/api_docs/index.rst b/docs/cudf/source/api_docs/index.rst new file mode 100644 index 00000000000..70b9563fc1d --- /dev/null +++ b/docs/cudf/source/api_docs/index.rst @@ -0,0 +1,19 @@ +============= +API reference +============= + +This page provides a list of all publicly accessible modules, methods and classes through +``cudf.*`` namespace. + +.. toctree:: + :maxdepth: 2 + :caption: API Documentation + + series + dataframe + index_objects + groupby + general_functions + general_utilities + window + diff --git a/docs/cudf/source/api_docs/index_objects.rst b/docs/cudf/source/api_docs/index_objects.rst new file mode 100644 index 00000000000..c23c9a3f6c1 --- /dev/null +++ b/docs/cudf/source/api_docs/index_objects.rst @@ -0,0 +1,296 @@ +============= +Index objects +============= + +Index +----- +.. currentmodule:: cudf + +**Many of these methods or variants thereof are available on the objects +that contain an index (Series/DataFrame) and those should most likely be +used before calling these methods directly.** + +.. autosummary:: + :toctree: api/ + :template: autosummary/class_with_autosummary.rst + + Index + +Properties +~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + Index.empty + Index.gpu_values + Index.is_monotonic + Index.is_monotonic_increasing + Index.is_monotonic_decreasing + Index.is_unique + Index.name + Index.names + Index.ndim + Index.nlevels + Index.shape + Index.size + Index.values + + +Modifying and computations +~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + Index.any + Index.copy + Index.drop_duplicates + Index.equals + Index.factorize + Index.min + Index.max + Index.rename + Index.repeat + Index.where + Index.take + Index.unique + +Compatibility with MultiIndex +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + Index.set_names + +Missing values +~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + Index.fillna + Index.dropna + Index.isna + Index.notna + +Memory usage +~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + Index.memory_usage + +Conversion +~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + Index.astype + Index.to_list + Index.to_series + Index.to_frame + +Sorting +~~~~~~~ +.. autosummary:: + :toctree: api/ + + Index.argsort + Index.searchsorted + Index.sort_values + +Time-specific operations +~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + Index.shift + +Combining / joining / set operations +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + Index.append + Index.join + Index.difference + +Selecting +~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + Index.get_level_values + Index.get_loc + Index.get_slice_bound + Index.isin + +.. _api.numericindex: + +Numeric Index +------------- +.. 
autosummary:: + :toctree: api/ + :template: autosummary/class_without_autosummary.rst + + RangeIndex + Int64Index + UInt64Index + Float64Index + + +.. _api.categoricalindex: + +CategoricalIndex +---------------- +.. autosummary:: + :toctree: api/ + :template: autosummary/class_without_autosummary.rst + + CategoricalIndex + +Categorical components +~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + CategoricalIndex.codes + CategoricalIndex.categories + +Modifying and computations +~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + CategoricalIndex.equals + +.. _api.intervalindex: + +IntervalIndex +------------- +.. autosummary:: + :toctree: api/ + + IntervalIndex + +IntervalIndex components +~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + IntervalIndex.from_breaks + IntervalIndex.values + IntervalIndex.get_loc + +.. _api.multiindex: + +MultiIndex +---------- +.. autosummary:: + :toctree: api/ + :template: autosummary/class_without_autosummary.rst + + MultiIndex + + +MultiIndex constructors +~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + MultiIndex.from_tuples + MultiIndex.from_product + MultiIndex.from_frame + MultiIndex.from_arrow + +MultiIndex properties +~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + MultiIndex.names + MultiIndex.levels + MultiIndex.codes + MultiIndex.nlevels + +MultiIndex components +~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + MultiIndex.to_frame + MultiIndex.droplevel + +MultiIndex selecting +~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + MultiIndex.get_loc + MultiIndex.get_level_values + +.. _api.datetimeindex: + +DatetimeIndex +------------- +.. autosummary:: + :toctree: api/ + :template: autosummary/class_without_autosummary.rst + + DatetimeIndex + +Time/date components +~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + DatetimeIndex.year + DatetimeIndex.month + DatetimeIndex.day + DatetimeIndex.hour + DatetimeIndex.minute + DatetimeIndex.second + DatetimeIndex.dayofweek + DatetimeIndex.weekday + +Time-specific operations +~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + DatetimeIndex.round + +Conversion +~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + DatetimeIndex.to_series + DatetimeIndex.to_frame + +TimedeltaIndex +-------------- +.. autosummary:: + :toctree: api/ + :template: autosummary/class_without_autosummary.rst + + TimedeltaIndex + +Components +~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + TimedeltaIndex.days + TimedeltaIndex.seconds + TimedeltaIndex.microseconds + TimedeltaIndex.nanoseconds + TimedeltaIndex.components + TimedeltaIndex.inferred_freq + +Conversion +~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + TimedeltaIndex.to_series + TimedeltaIndex.round + TimedeltaIndex.to_frame diff --git a/docs/cudf/source/api_docs/series.rst b/docs/cudf/source/api_docs/series.rst new file mode 100644 index 00000000000..ffa809268f3 --- /dev/null +++ b/docs/cudf/source/api_docs/series.rst @@ -0,0 +1,478 @@ +====== +Series +====== +.. currentmodule:: cudf + +Constructor +----------- +.. autosummary:: + :toctree: api/ + :template: autosummary/class_with_autosummary.rst + + Series + +Attributes +---------- +**Axes** + +.. 
autosummary:: + :toctree: api/ + + Series.index + Series.values + Series.data + Series.dtype + Series.shape + Series.ndim + Series.nullable + Series.nullmask + Series.null_count + Series.size + Series.memory_usage + Series.has_nulls + Series.empty + Series.name + Series.valid_count + Series.values_host + +Conversion +---------- +.. autosummary:: + :toctree: api/ + + Series.astype + Series.copy + Series.to_list + Series.__array__ + Series.as_index + Series.as_mask + Series.scale + + +Indexing, iteration +------------------- +.. autosummary:: + :toctree: api/ + + Series.loc + Series.iloc + Series.__iter__ + Series.items + Series.iteritems + Series.keys + +For more information on ``.at``, ``.iat``, ``.loc``, and +``.iloc``, see the :ref:`indexing documentation `. + +Binary operator functions +------------------------- +.. autosummary:: + :toctree: api/ + + Series.add + Series.sub + Series.subtract + Series.mul + Series.multiply + Series.truediv + Series.floordiv + Series.mod + Series.pow + Series.radd + Series.rsub + Series.rmul + Series.rtruediv + Series.rfloordiv + Series.rmod + Series.rpow + Series.round + Series.lt + Series.gt + Series.le + Series.ge + Series.ne + Series.eq + Series.product + +Function application, GroupBy & window +-------------------------------------- +.. autosummary:: + :toctree: api/ + + Series.applymap + Series.map + Series.groupby + Series.rolling + Series.pipe + +.. _api.series.stats: + +Computations / descriptive stats +-------------------------------- +.. autosummary:: + :toctree: api/ + + Series.abs + Series.all + Series.any + Series.ceil + Series.clip + Series.corr + Series.count + Series.cov + Series.cummax + Series.cummin + Series.cumprod + Series.cumsum + Series.describe + Series.diff + Series.digitize + Series.factorize + Series.floor + Series.kurt + Series.max + Series.mean + Series.median + Series.min + Series.mode + Series.nlargest + Series.nsmallest + Series.prod + Series.quantile + Series.rank + Series.skew + Series.std + Series.sum + Series.var + Series.kurtosis + Series.unique + Series.nunique + Series.is_unique + Series.is_monotonic + Series.is_monotonic_increasing + Series.is_monotonic_decreasing + Series.value_counts + +Reindexing / selection / label manipulation +------------------------------------------- +.. autosummary:: + :toctree: api/ + + Series.drop + Series.drop_duplicates + Series.equals + Series.head + Series.isin + Series.reindex + Series.rename + Series.reset_index + Series.reverse + Series.sample + Series.set_index + Series.set_mask + Series.take + Series.tail + Series.tile + Series.where + Series.mask + +Missing data handling +--------------------- +.. autosummary:: + :toctree: api/ + + Series.dropna + Series.fillna + Series.isna + Series.isnull + Series.nans_to_nulls + Series.notna + Series.notnull + Series.replace + +Reshaping, sorting +------------------ +.. autosummary:: + :toctree: api/ + + Series.argsort + Series.interleave_columns + Series.sort_values + Series.sort_index + Series.explode + Series.scatter_by_map + Series.searchsorted + Series.repeat + +Combining / comparing / joining / merging / encoding +---------------------------------------------------- +.. autosummary:: + :toctree: api/ + + Series.append + Series.update + Series.label_encoding + Series.one_hot_encoding + +Numerical operations +~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + Series.acos + Series.asin + Series.atan + Series.cos + Series.exp + Series.log + Series.sin + Series.sqrt + Series.tan + +Time Series-related +------------------- +.. 
autosummary:: + :toctree: api/ + + Series.shift + +Accessors +--------- + +pandas provides dtype-specific methods under various accessors. +These are separate namespaces within :class:`Series` that only apply +to specific data types. + +=========================== ================================= +Data Type Accessor +=========================== ================================= +Datetime, Timedelta :ref:`dt ` +String :ref:`str ` +Categorical :ref:`cat ` +List :ref:`list ` +=========================== ================================= + +.. _api.series.dt: + +Datetimelike properties +~~~~~~~~~~~~~~~~~~~~~~~ + +``Series.dt`` can be used to access the values of the series as +datetimelike and return several properties. +These can be accessed like ``Series.dt.``. + +Datetime properties +^^^^^^^^^^^^^^^^^^^ +.. currentmodule:: cudf.core.series.DatetimeProperties + +.. autosummary:: + :toctree: api/ + + day + dayofweek + hour + minute + month + second + weekday + year + +Datetime methods +^^^^^^^^^^^^^^^^ + +.. autosummary:: + :toctree: api/ + + strftime + + +Timedelta properties +^^^^^^^^^^^^^^^^^^^^ + +.. currentmodule:: cudf.core.series.TimedeltaProperties +.. autosummary:: + :toctree: api/ + + components + days + microseconds + nanoseconds + seconds + + +.. _api.series.str: + +String handling +~~~~~~~~~~~~~~~ + +``Series.str`` can be used to access the values of the series as +strings and apply several methods to it. These can be accessed like +``Series.str.``. + +.. currentmodule:: cudf.core.column.string.StringMethods +.. autosummary:: + :toctree: api/ + + byte_count + capitalize + cat + center + character_ngrams + character_tokenize + code_points + contains + count + detokenize + edit_distance + endswith + extract + filter_alphanum + filter_characters + filter_tokens + find + findall + get + get_json_object + htoi + index + insert + ip2int + is_consonant + is_vowel + isalnum + isalpha + isdecimal + isdigit + isempty + isfloat + ishex + isinteger + isipv4 + isspace + islower + isnumeric + isupper + istimestamp + join + len + ljust + lower + lstrip + match + ngrams + ngrams_tokenize + normalize_characters + pad + partition + porter_stemmer_measure + replace + replace_tokens + replace_with_backrefs + rfind + rindex + rjust + rpartition + rstrip + slice + slice_from + slice_replace + split + rsplit + startswith + strip + subword_tokenize + swapcase + title + token_count + tokenize + translate + upper + url_decode + url_encode + wrap + zfill + + + +.. + The following is needed to ensure the generated pages are created with the + correct template (otherwise they would be created in the Series/Index class page) + +.. + .. currentmodule:: cudf + .. autosummary:: + :toctree: api/ + :template: autosummary/accessor.rst + + Series.str + Series.cat + Series.dt + Index.str + +.. _api.series.cat: + +Categorical accessor +~~~~~~~~~~~~~~~~~~~~ + +Categorical-dtype specific methods and attributes are available under +the ``Series.cat`` accessor. + +.. currentmodule:: cudf.core.column.categorical.CategoricalAccessor +.. autosummary:: + :toctree: api/ + + categories + ordered + codes + reorder_categories + add_categories + remove_categories + set_categories + as_ordered + as_unordered + + +.. _api.series.list: + +List handling +~~~~~~~~~~~~~ + +``Series.list`` can be used to access the values of the series as +lists and apply list methods to it. These can be accessed like +``Series.list.``. + +.. currentmodule:: cudf.core.column.lists.ListMethods +.. 
autosummary:: + :toctree: api/ + + concat + contains + get + len + sort_values + take + unique + + +Serialization / IO / conversion +------------------------------- +.. currentmodule:: cudf +.. autosummary:: + :toctree: api/ + + Series.to_array + Series.to_arrow + Series.to_dlpack + Series.to_frame + Series.to_gpu_array + Series.to_hdf + Series.to_json + Series.to_pandas + Series.to_string + Series.from_arrow + Series.from_categorical + Series.from_masked_array + Series.from_pandas + Series.hash_encode + Series.hash_values + \ No newline at end of file diff --git a/docs/cudf/source/api_docs/window.rst b/docs/cudf/source/api_docs/window.rst new file mode 100644 index 00000000000..9f94f620949 --- /dev/null +++ b/docs/cudf/source/api_docs/window.rst @@ -0,0 +1,24 @@ +.. _api.window: + +====== +Window +====== + +Rolling objects are returned by ``.rolling`` calls: :func:`cudf.DataFrame.rolling`, :func:`cudf.Series.rolling`, etc. + +.. _api.functions_rolling: + +Rolling window functions +------------------------ +.. currentmodule:: cudf.core.window.rolling + +.. autosummary:: + :toctree: api/ + + Rolling.count + Rolling.sum + Rolling.mean + Rolling.min + Rolling.max + Rolling.apply + diff --git a/docs/cudf/source/basics.rst b/docs/cudf/source/basics.rst deleted file mode 100644 index 15b4b43662b..00000000000 --- a/docs/cudf/source/basics.rst +++ /dev/null @@ -1,54 +0,0 @@ -Basics -====== - - -Supported Dtypes ----------------- - -cuDF uses dtypes for Series or individual columns of a DataFrame. cuDF uses NumPy dtypes, NumPy provides support for ``float``, ``int``, ``bool``, -``'timedelta64[s]'``, ``'timedelta64[ms]'``, ``'timedelta64[us]'``, ``'timedelta64[ns]'``, ``'datetime64[s]'``, ``'datetime64[ms]'``, -``'datetime64[us]'``, ``'datetime64[ns]'`` (note that NumPy does not support timezone-aware datetimes). - - -The following table lists all of cudf types. For methods requiring dtype arguments, strings can be specified as indicated. See the respective documentation sections for more on each type. 
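
A minimal usage sketch of how these string aliases are typically passed as ``dtype`` arguments (illustrative only, not part of the patched files; it relies solely on ``cudf.Series``, ``Series.astype``, and ``cudf.to_datetime``, all of which appear in the API reference added earlier in this diff):

    import cudf

    # Integer and float aliases are accepted at construction time and by astype().
    ints = cudf.Series([1, 2, 3], dtype="int32")
    floats = ints.astype("float64")

    # Categorical and boolean aliases work the same way.
    cats = cudf.Series(["a", "b", "a"], dtype="category")
    flags = cudf.Series([True, False, True], dtype="bool")

    # Datetime columns can be produced with cudf.to_datetime (datetime64[ns]).
    stamps = cudf.to_datetime(cudf.Series(["2001-01-01", "2001-01-02"]))
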
- - -+------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ -| Kind of Data | Data Type | Scalar | String Aliases | -+========================+==================+=====================================================================================+=============================================+ -| Integer | | np.int8_, np.int16_, np.int32_, np.int64_, np.uint8_, np.uint16_, | ``'int8'``, ``'int16'``, ``'int32'``, | -| | | np.uint32_, np.uint64_ | ``'int64'``, ``'uint8'``, ``'uint16'``, | -| | | | ``'uint32'``, ``'uint64'`` | -+------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ -| Float | | np.float32_, np.float64_ | ``'float32'``, ``'float64'`` | -+------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ -| Strings | | `str `_ | ``'string'``, ``'object'`` | -+------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ -| Datetime | | np.datetime64_ | ``'datetime64[s]'``, ``'datetime64[ms]'``, | -| | | | ``'datetime64[us]'``, ``'datetime64[ns]'`` | -+------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ -| Timedelta | | np.timedelta64_ | ``'timedelta64[s]'``, ``'timedelta64[ms]'``,| -| (duration type) | | | ``'timedelta64[us]'``, ``'timedelta64[ns]'``| -+------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ -| Categorical | CategoricalDtype | (none) | ``'category'`` | -+------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ -| Boolean | | np.bool_ | ``'bool'`` | -+------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ -| Decimal | Decimal64Dtype | (none) | (none) | -+------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ - -**Note: All dtypes above are Nullable** - -.. _np.int8: -.. _np.int16: -.. _np.int32: -.. _np.int64: -.. _np.uint8: -.. _np.uint16: -.. _np.uint32: -.. _np.uint64: -.. _np.float32: -.. _np.float64: -.. _np.bool: https://numpy.org/doc/stable/user/basics.types.html -.. _np.datetime64: https://numpy.org/doc/stable/reference/arrays.datetime.html#basic-datetimes -.. 
_np.timedelta64: https://numpy.org/doc/stable/reference/arrays.datetime.html#datetime-and-timedelta-arithmetic diff --git a/docs/cudf/source/PandasCompat.rst b/docs/cudf/source/basics/PandasCompat.rst similarity index 100% rename from docs/cudf/source/PandasCompat.rst rename to docs/cudf/source/basics/PandasCompat.rst diff --git a/docs/cudf/source/basics/basics.rst b/docs/cudf/source/basics/basics.rst new file mode 100644 index 00000000000..ee63f67daa2 --- /dev/null +++ b/docs/cudf/source/basics/basics.rst @@ -0,0 +1,56 @@ +Basics +====== + + +Supported Dtypes +---------------- + +cuDF uses dtypes for Series or individual columns of a DataFrame. cuDF uses NumPy dtypes, NumPy provides support for ``float``, ``int``, ``bool``, +``'timedelta64[s]'``, ``'timedelta64[ms]'``, ``'timedelta64[us]'``, ``'timedelta64[ns]'``, ``'datetime64[s]'``, ``'datetime64[ms]'``, +``'datetime64[us]'``, ``'datetime64[ns]'`` (note that NumPy does not support timezone-aware datetimes). + + +The following table lists all of cudf types. For methods requiring dtype arguments, strings can be specified as indicated. See the respective documentation sections for more on each type. + +.. rst-class:: special-table +.. table:: + + +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ + | Kind of Data | Data Type | Scalar | String Aliases | + +========================+==================+=====================================================================================+=============================================+ + | Integer | | np.int8_, np.int16_, np.int32_, np.int64_, np.uint8_, np.uint16_, | ``'int8'``, ``'int16'``, ``'int32'``, | + | | | np.uint32_, np.uint64_ | ``'int64'``, ``'uint8'``, ``'uint16'``, | + | | | | ``'uint32'``, ``'uint64'`` | + +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ + | Float | | np.float32_, np.float64_ | ``'float32'``, ``'float64'`` | + +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ + | Strings | | `str `_ | ``'string'``, ``'object'`` | + +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ + | Datetime | | np.datetime64_ | ``'datetime64[s]'``, ``'datetime64[ms]'``, | + | | | | ``'datetime64[us]'``, ``'datetime64[ns]'`` | + +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ + | Timedelta | | np.timedelta64_ | ``'timedelta64[s]'``, ``'timedelta64[ms]'``,| + | (duration type) | | | ``'timedelta64[us]'``, ``'timedelta64[ns]'``| + +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ + | Categorical | CategoricalDtype | (none) | ``'category'`` | + +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ + | Boolean | | np.bool_ | ``'bool'`` | + 
+------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ + | Decimal | Decimal64Dtype | (none) | (none) | + +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ + +**Note: All dtypes above are Nullable** + +.. _np.int8: +.. _np.int16: +.. _np.int32: +.. _np.int64: +.. _np.uint8: +.. _np.uint16: +.. _np.uint32: +.. _np.uint64: +.. _np.float32: +.. _np.float64: +.. _np.bool: https://numpy.org/doc/stable/user/basics.types.html +.. _np.datetime64: https://numpy.org/doc/stable/reference/arrays.datetime.html#basic-datetimes +.. _np.timedelta64: https://numpy.org/doc/stable/reference/arrays.datetime.html#datetime-and-timedelta-arithmetic diff --git a/docs/cudf/source/dask-cudf.rst b/docs/cudf/source/basics/dask-cudf.rst similarity index 100% rename from docs/cudf/source/dask-cudf.rst rename to docs/cudf/source/basics/dask-cudf.rst diff --git a/docs/cudf/source/groupby.rst b/docs/cudf/source/basics/groupby.rst similarity index 51% rename from docs/cudf/source/groupby.rst rename to docs/cudf/source/basics/groupby.rst index a6ce9db6817..04c4d42fa2a 100644 --- a/docs/cudf/source/groupby.rst +++ b/docs/cudf/source/basics/groupby.rst @@ -131,41 +131,44 @@ Aggregations on groups is supported via the ``agg`` method: The following table summarizes the available aggregations and the types that support them: -+------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ -| Aggregations / dtypes | Numeric | Datetime | String | Categorical | List | Struct | Interval | Decimal | -+====================================+===========+============+==========+===============+========+==========+============+===========+ -| count | ✅ | ✅ | ✅ | ✅ | | | | ✅ | -+------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ -| size | ✅ | ✅ | ✅ | ✅ | | | | ✅ | -+------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ -| sum | ✅ | ✅ | | | | | | ✅ | -+------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ -| idxmin | ✅ | ✅ | | | | | | ✅ | -+------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ -| idxmax | ✅ | ✅ | | | | | | ✅ | -+------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ -| min | ✅ | ✅ | ✅ | | | | | ✅ | -+------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ -| max | ✅ | ✅ | ✅ | | | | | ✅ | -+------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ -| mean | ✅ | ✅ | | | | | | | -+------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ -| var | ✅ | ✅ | | | | | | | -+------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ -| std | ✅ | ✅ | | | | | | | 
-+------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ -| quantile | ✅ | ✅ | | | | | | | -+------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ -| median | ✅ | ✅ | | | | | | | -+------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ -| nunique | ✅ | ✅ | ✅ | ✅ | | | | ✅ | -+------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ -| nth | ✅ | ✅ | ✅ | | | | | ✅ | -+------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ -| collect | ✅ | ✅ | ✅ | | ✅ | | | ✅ | -+------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ -| unique | ✅ | ✅ | ✅ | ✅ | | | | | -+------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ +.. rst-class:: special-table +.. table:: + + +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + | Aggregations / dtypes | Numeric | Datetime | String | Categorical | List | Struct | Interval | Decimal | + +====================================+===========+============+==========+===============+========+==========+============+===========+ + | count | ✅ | ✅ | ✅ | ✅ | | | | ✅ | + +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + | size | ✅ | ✅ | ✅ | ✅ | | | | ✅ | + +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + | sum | ✅ | ✅ | | | | | | ✅ | + +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + | idxmin | ✅ | ✅ | | | | | | ✅ | + +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + | idxmax | ✅ | ✅ | | | | | | ✅ | + +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + | min | ✅ | ✅ | ✅ | | | | | ✅ | + +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + | max | ✅ | ✅ | ✅ | | | | | ✅ | + +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + | mean | ✅ | ✅ | | | | | | | + +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + | var | ✅ | ✅ | | | | | | | + +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + | std | ✅ | ✅ | | | | | | | + +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + | quantile | ✅ | ✅ | | | | | | | + +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + | median | ✅ | ✅ | | | | | | | + 
+------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + | nunique | ✅ | ✅ | ✅ | ✅ | | | | ✅ | + +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + | nth | ✅ | ✅ | ✅ | | | | | ✅ | + +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + | collect | ✅ | ✅ | ✅ | | ✅ | | | ✅ | + +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + | unique | ✅ | ✅ | ✅ | ✅ | | | | | + +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ GroupBy apply ------------- diff --git a/docs/cudf/source/basics/index.rst b/docs/cudf/source/basics/index.rst new file mode 100644 index 00000000000..a29866d7e32 --- /dev/null +++ b/docs/cudf/source/basics/index.rst @@ -0,0 +1,15 @@ +====== +Basics +====== + + +.. toctree:: + :maxdepth: 2 + + basics + io.rst + groupby.rst + PandasCompat.rst + dask-cudf.rst + internals.rst + \ No newline at end of file diff --git a/docs/cudf/source/internals.rst b/docs/cudf/source/basics/internals.rst similarity index 100% rename from docs/cudf/source/internals.rst rename to docs/cudf/source/basics/internals.rst diff --git a/docs/cudf/source/io-gds-integration.rst b/docs/cudf/source/basics/io-gds-integration.rst similarity index 100% rename from docs/cudf/source/io-gds-integration.rst rename to docs/cudf/source/basics/io-gds-integration.rst diff --git a/docs/cudf/source/io-supported-types.rst b/docs/cudf/source/basics/io-supported-types.rst similarity index 99% rename from docs/cudf/source/io-supported-types.rst rename to docs/cudf/source/basics/io-supported-types.rst index 739c1634ca7..78c1bfb6554 100644 --- a/docs/cudf/source/io-supported-types.rst +++ b/docs/cudf/source/basics/io-supported-types.rst @@ -3,7 +3,7 @@ I/O Supported dtypes The following table lists are compatible cudf types for each supported IO format. -.. rst-class:: io-supported-types-table +.. rst-class:: io-supported-types-table special-table .. table:: :widths: 15 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 diff --git a/docs/cudf/source/io.rst b/docs/cudf/source/basics/io.rst similarity index 100% rename from docs/cudf/source/io.rst rename to docs/cudf/source/basics/io.rst diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index c764b64da60..c5f1233d022 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -24,7 +24,10 @@ from docutils.nodes import Text from sphinx.addnodes import pending_xref +import cudf +sys.path.insert(0, os.path.abspath(cudf.__path__[0])) +sys.path.insert(0, os.path.abspath(".")) sys.path.insert(0, os.path.abspath("../..")) sys.path.append(os.path.abspath("./_ext")) @@ -43,7 +46,6 @@ "sphinx.ext.autosummary", "sphinx_copybutton", "numpydoc", - "sphinx_markdown_tables", "IPython.sphinxext.ipython_console_highlighting", "IPython.sphinxext.ipython_directive", "nbsphinx", @@ -51,9 +53,11 @@ ] copybutton_prompt_text = ">>> " - +autosummary_generate = True ipython_mplbackend = "str" +html_use_modindex = True + # Add any paths that contain templates here, relative to this directory. 
templates_path = ["_templates"] @@ -61,7 +65,7 @@ # You can specify multiple suffix as a list of string: # # source_suffix = ['.rst', '.md'] -source_suffix = {".rst": "restructuredtext", ".md": "markdown"} +source_suffix = {".rst": "restructuredtext"} # The master toctree document. master_doc = "index" @@ -90,21 +94,30 @@ # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This patterns also effect to html_static_path and html_extra_path -exclude_patterns = [] +exclude_patterns = ['venv', "**/includes/**",] # The name of the Pygments (syntax highlighting) style to use. pygments_style = "sphinx" +# If true, `todo` and `todoList` produce output, else they produce nothing. +todo_include_todos = False + +html_theme_options = { + "external_links": [], + "github_url": "https://github.com/rapidsai/cudf", + "twitter_url": "https://twitter.com/rapidsai", + "show_toc_level": 1, + "navbar_align": "right", +} include_pandas_compat = True -# -- Options for HTML output ---------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -html_theme = "sphinx_rtd_theme" - +html_theme = "pydata_sphinx_theme" +html_logo = "_static/RAPIDS-logo-purple.png" # on_rtd is whether we are on readthedocs.org on_rtd = os.environ.get("READTHEDOCS", None) == "True" @@ -112,10 +125,10 @@ # only import and set the theme if we're building docs locally # otherwise, readthedocs.org uses their theme by default, # so no need to specify it - import sphinx_rtd_theme + import pydata_sphinx_theme - html_theme = "sphinx_rtd_theme" - html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] + html_theme = "pydata_sphinx_theme" + html_theme_path = pydata_sphinx_theme.get_html_theme_path() # Theme options are theme-specific and customize the look and feel of a theme @@ -201,8 +214,9 @@ # Config numpydoc numpydoc_show_inherited_class_members = True numpydoc_class_members_toctree = False +numpydoc_attributes_as_param_list = False -autoclass_content = "init" +autoclass_content = "class" # Replace API shorthands with fullname _reftarget_aliases = { @@ -234,10 +248,27 @@ def ignore_internal_references(app, env, node, contnode): node["reftarget"] = "" return contnode +def process_class_docstrings(app, what, name, obj, options, lines): + """ + For those classes for which we use :: + :template: autosummary/class_without_autosummary.rst + the documented attributes/methods have to be listed in the class + docstring. However, if one of those lists is empty, we use 'None', + which then generates warnings in sphinx / ugly html output. + This "autodoc-process-docstring" event connector removes that part + from the processed docstring. + """ + if what == "class": + if name in {"cudf.RangeIndex", "cudf.Int64Index", "cudf.UInt64Index", "cudf.Float64Index", "cudf.CategoricalIndex", "cudf.IntervalIndex", "cudf.MultiIndex", "cudf.DatetimeIndex", "cudf.TimedeltaIndex", "cudf.TimedeltaIndex"}: + + cut_index = lines.index('.. 
rubric:: Attributes') + lines[:] = lines[:cut_index] + + + def setup(app): - app.add_js_file("copybutton_pydocs.js") app.add_css_file("params.css") - app.add_css_file("https://docs.rapids.ai/assets/css/custom.css") app.connect("doctree-read", resolve_aliases) app.connect("missing-reference", ignore_internal_references) + app.connect("autodoc-process-docstring", process_class_docstrings) diff --git a/docs/cudf/source/index.rst b/docs/cudf/source/index.rst index 5a6d9a2617d..90b287bd1b6 100644 --- a/docs/cudf/source/index.rst +++ b/docs/cudf/source/index.rst @@ -1,25 +1,25 @@ Welcome to cuDF's documentation! ================================= +cuDF is a Python GPU DataFrame library (built on the `Apache Arrow +`_ columnar memory format) for loading, joining, +aggregating, filtering, and otherwise manipulating data. cuDF also provides a +pandas-like API that will be familiar to data engineers & data scientists, so +they can use it to easily accelerate their workflows without going into +the details of CUDA programming. + + .. toctree:: :maxdepth: 2 :caption: Contents: - api.rst - 10min.ipynb - basics.rst - io.rst - groupby.rst - dask-cudf.rst - 10min-cudf-cupy.ipynb - guide-to-udfs.ipynb - internals.rst - Working-with-missing-data.ipynb - PandasCompat.rst + user_guide/index + basics/index + api_docs/index + Indices and tables ================== * :ref:`genindex` -* :ref:`modindex` * :ref:`search` diff --git a/docs/cudf/source/10min-cudf-cupy.ipynb b/docs/cudf/source/user_guide/10min-cudf-cupy.ipynb similarity index 100% rename from docs/cudf/source/10min-cudf-cupy.ipynb rename to docs/cudf/source/user_guide/10min-cudf-cupy.ipynb diff --git a/docs/cudf/source/10min.ipynb b/docs/cudf/source/user_guide/10min.ipynb similarity index 100% rename from docs/cudf/source/10min.ipynb rename to docs/cudf/source/user_guide/10min.ipynb diff --git a/docs/cudf/source/Working-with-missing-data.ipynb b/docs/cudf/source/user_guide/Working-with-missing-data.ipynb similarity index 100% rename from docs/cudf/source/Working-with-missing-data.ipynb rename to docs/cudf/source/user_guide/Working-with-missing-data.ipynb diff --git a/docs/cudf/source/guide-to-udfs.ipynb b/docs/cudf/source/user_guide/guide-to-udfs.ipynb similarity index 100% rename from docs/cudf/source/guide-to-udfs.ipynb rename to docs/cudf/source/user_guide/guide-to-udfs.ipynb diff --git a/docs/cudf/source/user_guide/index.rst b/docs/cudf/source/user_guide/index.rst new file mode 100644 index 00000000000..1061008eb3c --- /dev/null +++ b/docs/cudf/source/user_guide/index.rst @@ -0,0 +1,12 @@ +========== +User Guide +========== + + +.. toctree:: + :maxdepth: 2 + + 10min.ipynb + 10min-cudf-cupy.ipynb + guide-to-udfs.ipynb + Working-with-missing-data.ipynb diff --git a/java/pom.xml b/java/pom.xml index b9bf5e9d8b7..1b4a31116d4 100755 --- a/java/pom.xml +++ b/java/pom.xml @@ -138,6 +138,18 @@ ${arrow.version} test + + org.apache.parquet + parquet-avro + 1.10.0 + test + + + org.apache.hadoop + hadoop-common + 3.1.0 + test + diff --git a/java/src/main/java/ai/rapids/cudf/Aggregation.java b/java/src/main/java/ai/rapids/cudf/Aggregation.java index 49c6d2b6ffc..734d9cb5694 100644 --- a/java/src/main/java/ai/rapids/cudf/Aggregation.java +++ b/java/src/main/java/ai/rapids/cudf/Aggregation.java @@ -24,7 +24,7 @@ * Represents an aggregation operation. Please note that not all aggregations work, or even make * sense in all types of aggregation operations. 
*/ -public abstract class Aggregation { +abstract class Aggregation { static { NativeDepsLoader.loadNativeDeps(); } @@ -65,7 +65,7 @@ enum Kind { M2(26), MERGE_M2(27), RANK(28), - DENSE_RANK(29);; + DENSE_RANK(29); final int nativeId; @@ -102,7 +102,7 @@ public boolean equals(Object other) { } } - public static final class NthAggregation extends Aggregation { + static final class NthAggregation extends Aggregation { private final int offset; private final NullPolicy nullPolicy; @@ -194,7 +194,7 @@ public boolean equals(Object other) { } } - private static class QuantileAggregation extends Aggregation { + private static final class QuantileAggregation extends Aggregation { private final QuantileMethod method; private final double[] quantiles; @@ -275,8 +275,7 @@ long getDefaultOutput() { } } - public static final class CollectListAggregation extends Aggregation - implements RollingAggregation { + static final class CollectListAggregation extends Aggregation { private final NullPolicy nullPolicy; private CollectListAggregation(NullPolicy nullPolicy) { @@ -306,8 +305,7 @@ public boolean equals(Object other) { } } - public static final class CollectSetAggregation extends Aggregation - implements RollingAggregation { + static final class CollectSetAggregation extends Aggregation { private final NullPolicy nullPolicy; private final NullEquality nullEquality; private final NaNEquality nanEquality; @@ -348,7 +346,7 @@ public boolean equals(Object other) { } } - public static final class MergeSetsAggregation extends Aggregation { + static final class MergeSetsAggregation extends Aggregation { private final NullEquality nullEquality; private final NaNEquality nanEquality; @@ -388,14 +386,6 @@ protected Aggregation(Kind kind) { this.kind = kind; } - /** - * Add a column to the Aggregation so it can be used on a specific column of data. - * @param columnIndex the index of the column to operate on. - */ - public AggregationOnColumn onColumn(int columnIndex) { - return new AggregationOnColumn((T)this, columnIndex); - } - /** * Get the native view of a ColumnVector that provides default values to be used for some window * aggregations when there is not enough data to do the computation. This really only happens @@ -433,8 +423,7 @@ static void close(long[] ptrs) { static native void close(long ptr); - public static class SumAggregation extends NoParamAggregation - implements RollingAggregation { + static final class SumAggregation extends NoParamAggregation { private SumAggregation() { super(Kind.SUM); } @@ -443,11 +432,11 @@ private SumAggregation() { /** * Sum reduction. */ - public static SumAggregation sum() { + static SumAggregation sum() { return new SumAggregation(); } - public static class ProductAggregation extends NoParamAggregation { + static final class ProductAggregation extends NoParamAggregation { private ProductAggregation() { super(Kind.PRODUCT); } @@ -456,12 +445,11 @@ private ProductAggregation() { /** * Product reduction. */ - public static ProductAggregation product() { + static ProductAggregation product() { return new ProductAggregation(); } - public static class MinAggregation extends NoParamAggregation - implements RollingAggregation { + static final class MinAggregation extends NoParamAggregation { private MinAggregation() { super(Kind.MIN); } @@ -470,12 +458,11 @@ private MinAggregation() { /** * Min reduction. 
*/ - public static MinAggregation min() { + static MinAggregation min() { return new MinAggregation(); } - public static class MaxAggregation extends NoParamAggregation - implements RollingAggregation { + static final class MaxAggregation extends NoParamAggregation { private MaxAggregation() { super(Kind.MAX); } @@ -484,12 +471,11 @@ private MaxAggregation() { /** * Max reduction. */ - public static MaxAggregation max() { + static MaxAggregation max() { return new MaxAggregation(); } - public static class CountAggregation extends CountLikeAggregation - implements RollingAggregation { + static final class CountAggregation extends CountLikeAggregation { private CountAggregation(NullPolicy nullPolicy) { super(Kind.COUNT, nullPolicy); } @@ -498,7 +484,7 @@ private CountAggregation(NullPolicy nullPolicy) { /** * Count number of valid, a.k.a. non-null, elements. */ - public static CountAggregation count() { + static CountAggregation count() { return count(NullPolicy.EXCLUDE); } @@ -507,11 +493,11 @@ public static CountAggregation count() { * @param nullPolicy INCLUDE if nulls should be counted. EXCLUDE if only non-null values * should be counted. */ - public static CountAggregation count(NullPolicy nullPolicy) { + static CountAggregation count(NullPolicy nullPolicy) { return new CountAggregation(nullPolicy); } - public static class AnyAggregation extends NoParamAggregation { + static final class AnyAggregation extends NoParamAggregation { private AnyAggregation() { super(Kind.ANY); } @@ -522,11 +508,11 @@ private AnyAggregation() { * if any of the elements in the range are true or non-zero, otherwise produces a false or 0. * Null values are skipped. */ - public static AnyAggregation any() { + static AnyAggregation any() { return new AnyAggregation(); } - public static class AllAggregation extends NoParamAggregation { + static final class AllAggregation extends NoParamAggregation { private AllAggregation() { super(Kind.ALL); } @@ -537,12 +523,11 @@ private AllAggregation() { * the range are true or non-zero, otherwise produces a false or 0. * Null values are skipped. */ - public static AllAggregation all() { + static AllAggregation all() { return new AllAggregation(); } - - public static class SumOfSquaresAggregation extends NoParamAggregation { + static final class SumOfSquaresAggregation extends NoParamAggregation { private SumOfSquaresAggregation() { super(Kind.SUM_OF_SQUARES); } @@ -551,12 +536,11 @@ private SumOfSquaresAggregation() { /** * Sum of squares reduction. */ - public static SumOfSquaresAggregation sumOfSquares() { + static SumOfSquaresAggregation sumOfSquares() { return new SumOfSquaresAggregation(); } - public static class MeanAggregation extends NoParamAggregation - implements RollingAggregation{ + static final class MeanAggregation extends NoParamAggregation { private MeanAggregation() { super(Kind.MEAN); } @@ -565,11 +549,11 @@ private MeanAggregation() { /** * Arithmetic mean reduction. */ - public static MeanAggregation mean() { + static MeanAggregation mean() { return new MeanAggregation(); } - public static class M2Aggregation extends NoParamAggregation { + static final class M2Aggregation extends NoParamAggregation { private M2Aggregation() { super(Kind.M2); } @@ -578,11 +562,11 @@ private M2Aggregation() { /** * Sum of square of differences from mean. 
*/ - public static M2Aggregation M2() { + static M2Aggregation M2() { return new M2Aggregation(); } - public static class VarianceAggregation extends DdofAggregation { + static final class VarianceAggregation extends DdofAggregation { private VarianceAggregation(int ddof) { super(Kind.VARIANCE, ddof); } @@ -591,7 +575,7 @@ private VarianceAggregation(int ddof) { /** * Variance aggregation with 1 as the delta degrees of freedom. */ - public static VarianceAggregation variance() { + static VarianceAggregation variance() { return variance(1); } @@ -600,12 +584,12 @@ public static VarianceAggregation variance() { * @param ddof delta degrees of freedom. The divisor used in calculation of variance is * N - ddof, where N is the population size. */ - public static VarianceAggregation variance(int ddof) { + static VarianceAggregation variance(int ddof) { return new VarianceAggregation(ddof); } - public static class StandardDeviationAggregation extends DdofAggregation { + static final class StandardDeviationAggregation extends DdofAggregation { private StandardDeviationAggregation(int ddof) { super(Kind.STD, ddof); } @@ -614,7 +598,7 @@ private StandardDeviationAggregation(int ddof) { /** * Standard deviation aggregation with 1 as the delta degrees of freedom. */ - public static StandardDeviationAggregation standardDeviation() { + static StandardDeviationAggregation standardDeviation() { return standardDeviation(1); } @@ -623,11 +607,11 @@ public static StandardDeviationAggregation standardDeviation() { * @param ddof delta degrees of freedom. The divisor used in calculation of std is * N - ddof, where N is the population size. */ - public static StandardDeviationAggregation standardDeviation(int ddof) { + static StandardDeviationAggregation standardDeviation(int ddof) { return new StandardDeviationAggregation(ddof); } - public static class MedianAggregation extends NoParamAggregation { + static final class MedianAggregation extends NoParamAggregation { private MedianAggregation() { super(Kind.MEDIAN); } @@ -636,26 +620,25 @@ private MedianAggregation() { /** * Median reduction. */ - public static MedianAggregation median() { + static MedianAggregation median() { return new MedianAggregation(); } /** * Aggregate to compute the specified quantiles. Uses linear interpolation by default. */ - public static QuantileAggregation quantile(double ... quantiles) { + static QuantileAggregation quantile(double ... quantiles) { return quantile(QuantileMethod.LINEAR, quantiles); } /** * Aggregate to compute various quantiles. */ - public static QuantileAggregation quantile(QuantileMethod method, double ... quantiles) { + static QuantileAggregation quantile(QuantileMethod method, double ... quantiles) { return new QuantileAggregation(method, quantiles); } - public static class ArgMaxAggregation extends NoParamAggregation - implements RollingAggregation{ + static final class ArgMaxAggregation extends NoParamAggregation { private ArgMaxAggregation() { super(Kind.ARGMAX); } @@ -667,12 +650,11 @@ private ArgMaxAggregation() { * prior to doing the aggregation. This would result in an index into the sorted data being * returned. 
*/ - public static ArgMaxAggregation argMax() { + static ArgMaxAggregation argMax() { return new ArgMaxAggregation(); } - public static class ArgMinAggregation extends NoParamAggregation - implements RollingAggregation{ + static final class ArgMinAggregation extends NoParamAggregation { private ArgMinAggregation() { super(Kind.ARGMIN); } @@ -684,11 +666,11 @@ private ArgMinAggregation() { * prior to doing the aggregation. This would result in an index into the sorted data being * returned. */ - public static ArgMinAggregation argMin() { + static ArgMinAggregation argMin() { return new ArgMinAggregation(); } - public static class NuniqueAggregation extends CountLikeAggregation { + static final class NuniqueAggregation extends CountLikeAggregation { private NuniqueAggregation(NullPolicy nullPolicy) { super(Kind.NUNIQUE, nullPolicy); } @@ -697,7 +679,7 @@ private NuniqueAggregation(NullPolicy nullPolicy) { /** * Number of unique, non-null, elements. */ - public static NuniqueAggregation nunique() { + static NuniqueAggregation nunique() { return nunique(NullPolicy.EXCLUDE); } @@ -707,7 +689,7 @@ public static NuniqueAggregation nunique() { * compare as equal so multiple null values in a range would all only * increase the count by 1. */ - public static NuniqueAggregation nunique(NullPolicy nullPolicy) { + static NuniqueAggregation nunique(NullPolicy nullPolicy) { return new NuniqueAggregation(nullPolicy); } @@ -716,7 +698,7 @@ public static NuniqueAggregation nunique(NullPolicy nullPolicy) { * @param offset the offset to look at. Negative numbers go from the end of the group. Any * value outside of the group range results in a null. */ - public static NthAggregation nth(int offset) { + static NthAggregation nth(int offset) { return nth(offset, NullPolicy.INCLUDE); } @@ -727,12 +709,11 @@ public static NthAggregation nth(int offset) { * @param nullPolicy INCLUDE if nulls should be included in the aggregation or EXCLUDE if they * should be skipped. */ - public static NthAggregation nth(int offset, NullPolicy nullPolicy) { + static NthAggregation nth(int offset, NullPolicy nullPolicy) { return new NthAggregation(offset, nullPolicy); } - public static class RowNumberAggregation extends NoParamAggregation - implements RollingAggregation{ + static final class RowNumberAggregation extends NoParamAggregation { private RowNumberAggregation() { super(Kind.ROW_NUMBER); } @@ -741,12 +722,11 @@ private RowNumberAggregation() { /** * Get the row number, only makes sense for a window operations. */ - public static RowNumberAggregation rowNumber() { + static RowNumberAggregation rowNumber() { return new RowNumberAggregation(); } - public static class RankAggregation extends NoParamAggregation - implements RollingAggregation{ + static final class RankAggregation extends NoParamAggregation { private RankAggregation() { super(Kind.RANK); } @@ -755,12 +735,11 @@ private RankAggregation() { /** * Get the row's ranking. */ - public static RankAggregation rank() { + static RankAggregation rank() { return new RankAggregation(); } - public static class DenseRankAggregation extends NoParamAggregation - implements RollingAggregation{ + static final class DenseRankAggregation extends NoParamAggregation { private DenseRankAggregation() { super(Kind.DENSE_RANK); } @@ -769,14 +748,14 @@ private DenseRankAggregation() { /** * Get the row's dense ranking. */ - public static DenseRankAggregation denseRank() { + static DenseRankAggregation denseRank() { return new DenseRankAggregation(); } /** * Collect the values into a list. 
Nulls will be skipped. */ - public static CollectListAggregation collectList() { + static CollectListAggregation collectList() { return collectList(NullPolicy.EXCLUDE); } @@ -785,7 +764,7 @@ public static CollectListAggregation collectList() { * * @param nullPolicy Indicates whether to include/exclude nulls during collection. */ - public static CollectListAggregation collectList(NullPolicy nullPolicy) { + static CollectListAggregation collectList(NullPolicy nullPolicy) { return new CollectListAggregation(nullPolicy); } @@ -793,7 +772,7 @@ public static CollectListAggregation collectList(NullPolicy nullPolicy) { * Collect the values into a set. All null values will be excluded, and all nan values are regarded as * unique instances. */ - public static CollectSetAggregation collectSet() { + static CollectSetAggregation collectSet() { return collectSet(NullPolicy.EXCLUDE, NullEquality.UNEQUAL, NaNEquality.UNEQUAL); } @@ -804,11 +783,11 @@ public static CollectSetAggregation collectSet() { * @param nullEquality Flag to specify whether null entries within each list should be considered equal. * @param nanEquality Flag to specify whether NaN values in floating point column should be considered equal. */ - public static CollectSetAggregation collectSet(NullPolicy nullPolicy, NullEquality nullEquality, NaNEquality nanEquality) { + static CollectSetAggregation collectSet(NullPolicy nullPolicy, NullEquality nullEquality, NaNEquality nanEquality) { return new CollectSetAggregation(nullPolicy, nullEquality, nanEquality); } - public static final class MergeListsAggregation extends NoParamAggregation { + static final class MergeListsAggregation extends NoParamAggregation { private MergeListsAggregation() { super(Kind.MERGE_LISTS); } @@ -818,7 +797,7 @@ private MergeListsAggregation() { * Merge the partial lists produced by multiple CollectListAggregations. * NOTICE: The partial lists to be merged should NOT include any null list element (but can include null list entries). */ - public static MergeListsAggregation mergeLists() { + static MergeListsAggregation mergeLists() { return new MergeListsAggregation(); } @@ -826,7 +805,7 @@ public static MergeListsAggregation mergeLists() { * Merge the partial sets produced by multiple CollectSetAggregations. Each null/nan value will be regarded as * a unique instance. */ - public static MergeSetsAggregation mergeSets() { + static MergeSetsAggregation mergeSets() { return mergeSets(NullEquality.UNEQUAL, NaNEquality.UNEQUAL); } @@ -836,58 +815,39 @@ public static MergeSetsAggregation mergeSets() { * @param nullEquality Flag to specify whether null entries within each list should be considered equal. * @param nanEquality Flag to specify whether NaN values in floating point column should be considered equal. */ - public static MergeSetsAggregation mergeSets(NullEquality nullEquality, NaNEquality nanEquality) { + static MergeSetsAggregation mergeSets(NullEquality nullEquality, NaNEquality nanEquality) { return new MergeSetsAggregation(nullEquality, nanEquality); } - public static class LeadAggregation extends LeadLagAggregation - implements RollingAggregation { + static final class LeadAggregation extends LeadLagAggregation { private LeadAggregation(int offset, ColumnVector defaultOutput) { super(Kind.LEAD, offset, defaultOutput); } } - /** - * In a rolling window return the value offset entries ahead or null if it is outside of the - * window. 
- */ - public static LeadAggregation lead(int offset) { - return lead(offset, null); - } - /** * In a rolling window return the value offset entries ahead or the corresponding value from * defaultOutput if it is outside of the window. Note that this does not take any ownership of * defaultOutput and the caller mush ensure that defaultOutput remains valid during the life * time of this aggregation operation. */ - public static LeadAggregation lead(int offset, ColumnVector defaultOutput) { + static LeadAggregation lead(int offset, ColumnVector defaultOutput) { return new LeadAggregation(offset, defaultOutput); } - public static class LagAggregation extends LeadLagAggregation - implements RollingAggregation{ + static final class LagAggregation extends LeadLagAggregation { private LagAggregation(int offset, ColumnVector defaultOutput) { super(Kind.LAG, offset, defaultOutput); } } - - /** - * In a rolling window return the value offset entries behind or null if it is outside of the - * window. - */ - public static LagAggregation lag(int offset) { - return lag(offset, null); - } - /** * In a rolling window return the value offset entries behind or the corresponding value from * defaultOutput if it is outside of the window. Note that this does not take any ownership of * defaultOutput and the caller mush ensure that defaultOutput remains valid during the life * time of this aggregation operation. */ - public static LagAggregation lag(int offset, ColumnVector defaultOutput) { + static LagAggregation lag(int offset, ColumnVector defaultOutput) { return new LagAggregation(offset, defaultOutput); } @@ -900,7 +860,7 @@ private MergeM2Aggregation() { /** * Merge the partial M2 values produced by multiple instances of M2Aggregation. */ - public static MergeM2Aggregation mergeM2() { + static MergeM2Aggregation mergeM2() { return new MergeM2Aggregation(); } diff --git a/java/src/main/java/ai/rapids/cudf/AggregationOverWindow.java b/java/src/main/java/ai/rapids/cudf/AggregationOverWindow.java index abce287c9b0..d5544e01e7e 100644 --- a/java/src/main/java/ai/rapids/cudf/AggregationOverWindow.java +++ b/java/src/main/java/ai/rapids/cudf/AggregationOverWindow.java @@ -22,12 +22,12 @@ * An Aggregation instance that also holds a column number and window metadata so the aggregation * can be done over a specific window. 
*/ -public class AggregationOverWindow> - extends AggregationOnColumn { +public final class AggregationOverWindow { + private final RollingAggregationOnColumn wrapped; protected final WindowOptions windowOptions; - AggregationOverWindow(T wrapped, int columnIndex, WindowOptions windowOptions) { - super(wrapped, columnIndex); + AggregationOverWindow(RollingAggregationOnColumn wrapped, WindowOptions windowOptions) { + this.wrapped = wrapped; this.windowOptions = windowOptions; if (windowOptions == null) { @@ -43,23 +43,6 @@ public WindowOptions getWindowOptions() { return windowOptions; } - @Override - public AggregationOnColumn onColumn(int columnIndex) { - if (columnIndex == getColumnIndex()) { - return this; // NOOP - } else { - return new AggregationOverWindow(this.wrapped, columnIndex, windowOptions); - } - } - - @Override - public AggregationOverWindow overWindow(WindowOptions windowOptions) { - if (this.windowOptions.equals(windowOptions)) { - return this; - } - return new AggregationOverWindow(wrapped, columnIndex, windowOptions); - } - @Override public int hashCode() { return 31 * super.hashCode() + windowOptions.hashCode(); @@ -69,10 +52,22 @@ public int hashCode() { public boolean equals(Object other) { if (other == this) { return true; - } else if (other instanceof AggregationOnColumn) { - AggregationOnColumn o = (AggregationOnColumn) other; - return wrapped.equals(o.wrapped) && columnIndex == o.columnIndex; + } else if (other instanceof AggregationOverWindow) { + AggregationOverWindow o = (AggregationOverWindow) other; + return wrapped.equals(o.wrapped) && windowOptions.equals(o.windowOptions); } return false; } + + int getColumnIndex() { + return wrapped.getColumnIndex(); + } + + long createNativeInstance() { + return wrapped.createNativeInstance(); + } + + long getDefaultOutput() { + return wrapped.getDefaultOutput(); + } } diff --git a/java/src/main/java/ai/rapids/cudf/ColumnVector.java b/java/src/main/java/ai/rapids/cudf/ColumnVector.java index e543d0c7b21..6902e2b322b 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnVector.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnVector.java @@ -152,6 +152,16 @@ private ColumnVector(long viewAddress, DeviceMemoryBuffer contiguousBuffer) { incRefCountInternal(true); } + + /** + * For a ColumnVector this is really just incrementing the reference count. + * @return this + */ + @Override + public ColumnVector copyToColumnVector() { + return incRefCount(); + } + /** * Retrieves the column_view for a cudf::column and if it fails to do so, the column is deleted * and the exception is thrown to the caller. @@ -803,7 +813,7 @@ private static native long stringConcatenation(long[] columnViews, long separato /** * Native method to concatenate columns of strings together using a separator specified for each row * and returns the result as a string column. - * @param columns array of longs holding the native handles of the column_views to combine. + * @param columnViews array of longs holding the native handles of the column_views to combine. * @param sep_column long holding the native handle of the strings_column_view used as separators. * @param separator_narep string scalar indicating null behavior when a separator is null. 
* If set to null and the separator is null the resulting string will diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index 4a1ed3a178e..4d9991d0dd9 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -101,11 +101,39 @@ public ColumnView(DType type, long rows, Optional nullCount, || !nullCount.isPresent(); } + /** + * Create a new column view based off of data already on the device. Ref count on the buffers + * is not incremented and none of the underlying buffers are owned by this view. The returned + * ColumnView is only valid as long as the underlying buffers remain valid. If the buffers are + * closed before this ColumnView is closed, it will result in undefined behavior. + * + * If ownership is needed, call {@link ColumnView#copyToColumnVector} + * + * @param type the type of the vector + * @param rows the number of rows in this vector. + * @param nullCount the number of nulls in the dataset. + * @param dataBuffer a host buffer required for nested types including strings and string + * categories. The ownership doesn't change on this buffer + * @param validityBuffer an optional validity buffer. Must be provided if nullCount != 0. + * The ownership doesn't change on this buffer + * @param offsetBuffer The offsetbuffer for columns that need an offset buffer + */ + public ColumnView(DType type, long rows, Optional nullCount, + BaseDeviceMemoryBuffer dataBuffer, + BaseDeviceMemoryBuffer validityBuffer, BaseDeviceMemoryBuffer offsetBuffer) { + this(type, (int) rows, nullCount.orElse(UNKNOWN_NULL_COUNT).intValue(), + dataBuffer, validityBuffer, offsetBuffer, null); + assert (!type.isNestedType()); + assert (nullCount.isPresent() && nullCount.get() <= Integer.MAX_VALUE) + || !nullCount.isPresent(); + } + private ColumnView(DType type, long rows, int nullCount, BaseDeviceMemoryBuffer dataBuffer, BaseDeviceMemoryBuffer validityBuffer, BaseDeviceMemoryBuffer offsetBuffer, ColumnView[] children) { this(ColumnVector.initViewHandle(type, (int) rows, nullCount, dataBuffer, validityBuffer, - offsetBuffer, Arrays.stream(children).mapToLong(c -> c.getNativeView()).toArray())); + offsetBuffer, children == null ? new long[]{} : + Arrays.stream(children).mapToLong(c -> c.getNativeView()).toArray())); } /** Creates a ColumnVector from a column view handle @@ -140,6 +168,32 @@ public final DType getType() { return type; } + /** + * Returns the child column views for this view + * Please note that it is the responsibility of the caller to close these views. + * @return an array of child column views + */ + public final ColumnView[] getChildColumnViews() { + int numChildren = getNumChildren(); + if (!getType().isNestedType()) { + return null; + } + ColumnView[] views = new ColumnView[numChildren]; + try { + for (int i = 0; i < numChildren; i++) { + views[i] = getChildColumnView(i); + } + return views; + } catch(Throwable t) { + for (ColumnView v: views) { + if (v != null) { + v.close(); + } + } + throw t; + } + } + /** * Returns the child column view at a given index. * Please note that it is the responsibility of the caller to close this view. @@ -1135,7 +1189,7 @@ public Scalar sum() { * of the specified type. */ public Scalar sum(DType outType) { - return reduce(Aggregation.sum(), outType); + return reduce(ReductionAggregation.sum(), outType); } /** @@ -1143,7 +1197,7 @@ public Scalar sum(DType outType) { * of the same type as this column. 
*/ public Scalar min() { - return reduce(Aggregation.min(), type); + return reduce(ReductionAggregation.min(), type); } /** @@ -1160,7 +1214,7 @@ public Scalar min(DType outType) { return tmp.min(outType); } } - return reduce(Aggregation.min(), outType); + return reduce(ReductionAggregation.min(), outType); } /** @@ -1168,7 +1222,7 @@ public Scalar min(DType outType) { * of the same type as this column. */ public Scalar max() { - return reduce(Aggregation.max(), type); + return reduce(ReductionAggregation.max(), type); } /** @@ -1185,7 +1239,7 @@ public Scalar max(DType outType) { return tmp.max(outType); } } - return reduce(Aggregation.max(), outType); + return reduce(ReductionAggregation.max(), outType); } /** @@ -1201,7 +1255,7 @@ public Scalar product() { * of the specified type. */ public Scalar product(DType outType) { - return reduce(Aggregation.product(), outType); + return reduce(ReductionAggregation.product(), outType); } /** @@ -1217,7 +1271,7 @@ public Scalar sumOfSquares() { * scalar of the specified type. */ public Scalar sumOfSquares(DType outType) { - return reduce(Aggregation.sumOfSquares(), outType); + return reduce(ReductionAggregation.sumOfSquares(), outType); } /** @@ -1241,7 +1295,7 @@ public Scalar mean() { * types are currently supported. */ public Scalar mean(DType outType) { - return reduce(Aggregation.mean(), outType); + return reduce(ReductionAggregation.mean(), outType); } /** @@ -1265,7 +1319,7 @@ public Scalar variance() { * types are currently supported. */ public Scalar variance(DType outType) { - return reduce(Aggregation.variance(), outType); + return reduce(ReductionAggregation.variance(), outType); } /** @@ -1290,7 +1344,7 @@ public Scalar standardDeviation() { * types are currently supported. */ public Scalar standardDeviation(DType outType) { - return reduce(Aggregation.standardDeviation(), outType); + return reduce(ReductionAggregation.standardDeviation(), outType); } /** @@ -1309,7 +1363,7 @@ public Scalar any() { * Null values are skipped. */ public Scalar any(DType outType) { - return reduce(Aggregation.any(), outType); + return reduce(ReductionAggregation.any(), outType); } /** @@ -1330,7 +1384,7 @@ public Scalar all() { */ @Deprecated public Scalar all(DType outType) { - return reduce(Aggregation.all(), outType); + return reduce(ReductionAggregation.all(), outType); } /** @@ -1343,7 +1397,7 @@ public Scalar all(DType outType) { * empty or the reduction operation fails then the * {@link Scalar#isValid()} method of the result will return false. */ - public Scalar reduce(Aggregation aggregation) { + public Scalar reduce(ReductionAggregation aggregation) { return reduce(aggregation, type); } @@ -1360,7 +1414,7 @@ public Scalar reduce(Aggregation aggregation) { * empty or the reduction operation fails then the * {@link Scalar#isValid()} method of the result will return false. */ - public Scalar reduce(Aggregation aggregation, DType outType) { + public Scalar reduce(ReductionAggregation aggregation, DType outType) { long nativeId = aggregation.createNativeInstance(); try { return new Scalar(outType, reduce(getNativeView(), nativeId, outType.typeId.getNativeId(), outType.getScale())); @@ -1390,20 +1444,19 @@ public final ColumnVector quantile(QuantileMethod method, double[] quantiles) { * @throws IllegalArgumentException if unsupported window specification * (i.e. other than {@link WindowOptions.FrameType#ROWS} is used. 
*/ public final ColumnVector rollingWindow(RollingAggregation op, WindowOptions options) { - Aggregation agg = op.getBaseAggregation(); // Check that only row-based windows are used. if (!options.getFrameType().equals(WindowOptions.FrameType.ROWS)) { throw new IllegalArgumentException("Expected ROWS-based window specification. Unexpected window type: " + options.getFrameType()); } - long nativePtr = agg.createNativeInstance(); + long nativePtr = op.createNativeInstance(); try { Scalar p = options.getPrecedingScalar(); Scalar f = options.getFollowingScalar(); return new ColumnVector( rollingWindow(this.getNativeView(), - agg.getDefaultOutput(), + op.getDefaultOutput(), options.getMinPeriods(), nativePtr, p == null || !p.isValid() ? 0 : p.getInt(), @@ -1420,7 +1473,7 @@ public final ColumnVector rollingWindow(RollingAggregation op, WindowOptions opt * This is just a convenience method for an inclusive scan with a SUM aggregation. */ public final ColumnVector prefixSum() { - return scan(Aggregation.sum()); + return scan(ScanAggregation.sum()); } /** @@ -1431,7 +1484,7 @@ public final ColumnVector prefixSum() { * null policy too. Currently none of those aggregations are supported so * it is undefined how they would interact with each other. */ - public final ColumnVector scan(Aggregation aggregation, ScanType scanType, NullPolicy nullPolicy) { + public final ColumnVector scan(ScanAggregation aggregation, ScanType scanType, NullPolicy nullPolicy) { long nativeId = aggregation.createNativeInstance(); try { return new ColumnVector(scan(getNativeView(), nativeId, @@ -1446,7 +1499,7 @@ public final ColumnVector scan(Aggregation aggregation, ScanType scanType, NullP * @param aggregation the aggregation to perform * @param scanType should the scan be inclusive, include the current row, or exclusive. */ - public final ColumnVector scan(Aggregation aggregation, ScanType scanType) { + public final ColumnVector scan(ScanAggregation aggregation, ScanType scanType) { return scan(aggregation, scanType, NullPolicy.EXCLUDE); } @@ -1454,7 +1507,7 @@ public final ColumnVector scan(Aggregation aggregation, ScanType scanType) { * Computes an inclusive scan for a column that excludes nulls. * @param aggregation the aggregation to perform */ - public final ColumnVector scan(Aggregation aggregation) { + public final ColumnVector scan(ScanAggregation aggregation) { return scan(aggregation, ScanType.INCLUSIVE, NullPolicy.EXCLUDE); } diff --git a/java/src/main/java/ai/rapids/cudf/DType.java b/java/src/main/java/ai/rapids/cudf/DType.java index 87237f1e4b2..2d851aa2ae3 100644 --- a/java/src/main/java/ai/rapids/cudf/DType.java +++ b/java/src/main/java/ai/rapids/cudf/DType.java @@ -30,65 +30,61 @@ public final class DType { 2. Update SINGLETON_DTYPE_LOOKUP to reflect new type. The order should be maintained between DTypeEnum and SINGLETON_DTYPE_LOOKUP */ public enum DTypeEnum { - EMPTY(0, 0, "NOT SUPPORTED"), - INT8(1, 1, "byte"), - INT16(2, 2, "short"), - INT32(4, 3, "int"), - INT64(8, 4, "long"), - UINT8(1, 5, "uint8"), - UINT16(2, 6, "uint16"), - UINT32(4, 7, "uint32"), - UINT64(8, 8, "uint64"), - FLOAT32(4, 9, "float"), - FLOAT64(8, 10, "double"), + EMPTY(0, 0), + INT8(1, 1), + INT16(2, 2), + INT32(4, 3), + INT64(8, 4), + UINT8(1, 5), + UINT16(2, 6), + UINT32(4, 7), + UINT64(8, 8), + FLOAT32(4, 9), + FLOAT64(8, 10), /** * Byte wise true non-0/false 0. In general true will be 1. 
*/ - BOOL8(1, 11, "bool"), + BOOL8(1, 11), /** * Days since the UNIX epoch */ - TIMESTAMP_DAYS(4, 12, "date32"), + TIMESTAMP_DAYS(4, 12), /** * s since the UNIX epoch */ - TIMESTAMP_SECONDS(8, 13, "timestamp[s]"), + TIMESTAMP_SECONDS(8, 13), /** * ms since the UNIX epoch */ - TIMESTAMP_MILLISECONDS(8, 14, "timestamp[ms]"), + TIMESTAMP_MILLISECONDS(8, 14), /** * microseconds since the UNIX epoch */ - TIMESTAMP_MICROSECONDS(8, 15, "timestamp[us]"), + TIMESTAMP_MICROSECONDS(8, 15), /** * ns since the UNIX epoch */ - TIMESTAMP_NANOSECONDS(8, 16, "timestamp[ns]"), - - //We currently don't have mappings for duration type to I/O files, and these - //simpleNames might change in future when we do - DURATION_DAYS(4, 17, "int32"), - DURATION_SECONDS(8, 18, "int64"), - DURATION_MILLISECONDS(8, 19, "int64"), - DURATION_MICROSECONDS(8, 20, "int64"), - DURATION_NANOSECONDS(8, 21, "int64"), - //DICTIONARY32(4, 22, "NO IDEA"), - - STRING(0, 23, "str"), - LIST(0, 24, "list"), - DECIMAL32(4, 25, "decimal32"), - DECIMAL64(8, 26, "decimal64"), - STRUCT(0, 27, "struct"); + TIMESTAMP_NANOSECONDS(8, 16), + + DURATION_DAYS(4, 17), + DURATION_SECONDS(8, 18), + DURATION_MILLISECONDS(8, 19), + DURATION_MICROSECONDS(8, 20), + DURATION_NANOSECONDS(8, 21), + //DICTIONARY32(4, 22), + + STRING(0, 23), + LIST(0, 24), + DECIMAL32(4, 25), + DECIMAL64(8, 26), + STRUCT(0, 27); final int sizeInBytes; final int nativeId; - final String simpleName; - DTypeEnum(int sizeInBytes, int nativeId, String simpleName) { + DTypeEnum(int sizeInBytes, int nativeId) { this.sizeInBytes = sizeInBytes; this.nativeId = nativeId; - this.simpleName = simpleName; } public int getNativeId() { return nativeId; } @@ -191,12 +187,6 @@ private DType(DTypeEnum id, int decimalScale) { */ public int getScale() { return scale; } - /** - * Returns string name mapped to type. - * @return name corresponding to type - */ - public String getSimpleName() { return typeId.simpleName; } - /** * Return enum for this DType * @return DTypeEnum diff --git a/java/src/main/java/ai/rapids/cudf/GroupByAggregation.java b/java/src/main/java/ai/rapids/cudf/GroupByAggregation.java new file mode 100644 index 00000000000..dd2adf8bee8 --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/GroupByAggregation.java @@ -0,0 +1,296 @@ +/* + * + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package ai.rapids.cudf; + +/** + * An aggregation that can be used for a reduce. + */ +public final class GroupByAggregation { + private final Aggregation wrapped; + + private GroupByAggregation(Aggregation wrapped) { + this.wrapped = wrapped; + } + + Aggregation getWrapped() { + return wrapped; + } + + + /** + * Add a column to the Aggregation so it can be used on a specific column of data. + * @param columnIndex the index of the column to operate on. 
+ */ + public GroupByAggregationOnColumn onColumn(int columnIndex) { + return new GroupByAggregationOnColumn(this, columnIndex); + } + + @Override + public int hashCode() { + return wrapped.hashCode(); + } + + @Override + public boolean equals(Object other) { + if (other == this) { + return true; + } else if (other instanceof GroupByAggregation) { + GroupByAggregation o = (GroupByAggregation) other; + return wrapped.equals(o.wrapped); + } + return false; + } + + /** + * Count number of valid, a.k.a. non-null, elements. + */ + public static GroupByAggregation count() { + return new GroupByAggregation(Aggregation.count()); + } + + /** + * Count number of elements. + * @param nullPolicy INCLUDE if nulls should be counted. EXCLUDE if only non-null values + * should be counted. + */ + public static GroupByAggregation count(NullPolicy nullPolicy) { + return new GroupByAggregation(Aggregation.count(nullPolicy)); + } + + /** + * Sum Aggregation + */ + public static GroupByAggregation sum() { + return new GroupByAggregation(Aggregation.sum()); + } + + /** + * Product Aggregation. + */ + public static GroupByAggregation product() { + return new GroupByAggregation(Aggregation.product()); + } + + + /** + * Index of max element. Please note that when using this aggregation if the + * data is not already sorted by the grouping keys it may be automatically sorted + * prior to doing the aggregation. This would result in an index into the sorted data being + * returned. + */ + public static GroupByAggregation argMax() { + return new GroupByAggregation(Aggregation.argMax()); + } + + /** + * Index of min element. Please note that when using this aggregation if the + * data is not already sorted by the grouping keys it may be automatically sorted + * prior to doing the aggregation. This would result in an index into the sorted data being + * returned. + */ + public static GroupByAggregation argMin() { + return new GroupByAggregation(Aggregation.argMin()); + } + + /** + * Min Aggregation + */ + public static GroupByAggregation min() { + return new GroupByAggregation(Aggregation.min()); + } + + /** + * Max Aggregation + */ + public static GroupByAggregation max() { + return new GroupByAggregation(Aggregation.max()); + } + + /** + * Arithmetic mean reduction. + */ + public static GroupByAggregation mean() { + return new GroupByAggregation(Aggregation.mean()); + } + + /** + * Sum of square of differences from mean. + */ + public static GroupByAggregation M2() { + return new GroupByAggregation(Aggregation.M2()); + } + + /** + * Variance aggregation with 1 as the delta degrees of freedom. + */ + public static GroupByAggregation variance() { + return new GroupByAggregation(Aggregation.variance()); + } + + /** + * Variance aggregation. + * @param ddof delta degrees of freedom. The divisor used in calculation of variance is + * N - ddof, where N is the population size. + */ + public static GroupByAggregation variance(int ddof) { + return new GroupByAggregation(Aggregation.variance(ddof)); + } + + /** + * Standard deviation aggregation with 1 as the delta degrees of freedom. + */ + public static GroupByAggregation standardDeviation() { + return new GroupByAggregation(Aggregation.standardDeviation()); + } + + /** + * Standard deviation aggregation. + * @param ddof delta degrees of freedom. The divisor used in calculation of std is + * N - ddof, where N is the population size. 
+ */ + public static GroupByAggregation standardDeviation(int ddof) { + return new GroupByAggregation(Aggregation.standardDeviation(ddof)); + } + + /** + * Aggregate to compute the specified quantiles. Uses linear interpolation by default. + */ + public static GroupByAggregation quantile(double ... quantiles) { + return new GroupByAggregation(Aggregation.quantile(quantiles)); + } + + /** + * Aggregate to compute various quantiles. + */ + public static GroupByAggregation quantile(QuantileMethod method, double ... quantiles) { + return new GroupByAggregation(Aggregation.quantile(method, quantiles)); + } + + /** + * Median reduction. + */ + public static GroupByAggregation median() { + return new GroupByAggregation(Aggregation.median()); + } + + /** + * Number of unique, non-null, elements. + */ + public static GroupByAggregation nunique() { + return new GroupByAggregation(Aggregation.nunique()); + } + + /** + * Number of unique elements. + * @param nullPolicy INCLUDE if nulls should be counted else EXCLUDE. If nulls are counted they + * compare as equal so multiple null values in a range would all only + * increase the count by 1. + */ + public static GroupByAggregation nunique(NullPolicy nullPolicy) { + return new GroupByAggregation(Aggregation.nunique(nullPolicy)); + } + + /** + * Get the nth, non-null, element in a group. + * @param offset the offset to look at. Negative numbers go from the end of the group. Any + * value outside of the group range results in a null. + */ + public static GroupByAggregation nth(int offset) { + return new GroupByAggregation(Aggregation.nth(offset)); + } + + /** + * Get the nth element in a group. + * @param offset the offset to look at. Negative numbers go from the end of the group. Any + * value outside of the group range results in a null. + * @param nullPolicy INCLUDE if nulls should be included in the aggregation or EXCLUDE if they + * should be skipped. + */ + public static GroupByAggregation nth(int offset, NullPolicy nullPolicy) { + return new GroupByAggregation(Aggregation.nth(offset, nullPolicy)); + } + + /** + * Collect the values into a list. Nulls will be skipped. + */ + public static GroupByAggregation collectList() { + return new GroupByAggregation(Aggregation.collectList()); + } + + /** + * Collect the values into a list. + * + * @param nullPolicy Indicates whether to include/exclude nulls during collection. + */ + public static GroupByAggregation collectList(NullPolicy nullPolicy) { + return new GroupByAggregation(Aggregation.collectList(nullPolicy)); + } + + /** + * Collect the values into a set. All null values will be excluded, and all nan values are regarded as + * unique instances. + */ + public static GroupByAggregation collectSet() { + return new GroupByAggregation(Aggregation.collectSet()); + } + + /** + * Collect the values into a set. + * + * @param nullPolicy Indicates whether to include/exclude nulls during collection. + * @param nullEquality Flag to specify whether null entries within each list should be considered equal. + * @param nanEquality Flag to specify whether NaN values in floating point column should be considered equal. + */ + public static GroupByAggregation collectSet(NullPolicy nullPolicy, NullEquality nullEquality, NaNEquality nanEquality) { + return new GroupByAggregation(Aggregation.collectSet(nullPolicy, nullEquality, nanEquality)); + } + + /** + * Merge the partial lists produced by multiple CollectListAggregations. 
+ * NOTICE: The partial lists to be merged should NOT include any null list element (but can include null list entries). + */ + public static GroupByAggregation mergeLists() { + return new GroupByAggregation(Aggregation.mergeLists()); + } + + /** + * Merge the partial sets produced by multiple CollectSetAggregations. Each null/nan value will be regarded as + * a unique instance. + */ + public static GroupByAggregation mergeSets() { + return new GroupByAggregation(Aggregation.mergeSets()); + } + + /** + * Merge the partial sets produced by multiple CollectSetAggregations. + * + * @param nullEquality Flag to specify whether null entries within each list should be considered equal. + * @param nanEquality Flag to specify whether NaN values in floating point column should be considered equal. + */ + public static GroupByAggregation mergeSets(NullEquality nullEquality, NaNEquality nanEquality) { + return new GroupByAggregation(Aggregation.mergeSets(nullEquality, nanEquality)); + } + + /** + * Merge the partial M2 values produced by multiple instances of M2Aggregation. + */ + public static GroupByAggregation mergeM2() { + return new GroupByAggregation(Aggregation.mergeM2()); + } +} diff --git a/java/src/main/java/ai/rapids/cudf/GroupByAggregationOnColumn.java b/java/src/main/java/ai/rapids/cudf/GroupByAggregationOnColumn.java new file mode 100644 index 00000000000..c50cf3728f0 --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/GroupByAggregationOnColumn.java @@ -0,0 +1,56 @@ +/* + * + * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package ai.rapids.cudf; + +/** + * A GroupByAggregation for a specific column in a table. + */ +public final class GroupByAggregationOnColumn { + protected final GroupByAggregation wrapped; + protected final int columnIndex; + + GroupByAggregationOnColumn(GroupByAggregation wrapped, int columnIndex) { + this.wrapped = wrapped; + this.columnIndex = columnIndex; + } + + public int getColumnIndex() { + return columnIndex; + } + + GroupByAggregation getWrapped() { + return wrapped; + } + + @Override + public int hashCode() { + return 31 * wrapped.hashCode() + columnIndex; + } + + @Override + public boolean equals(Object other) { + if (other == this) { + return true; + } else if (other instanceof GroupByAggregationOnColumn) { + GroupByAggregationOnColumn o = (GroupByAggregationOnColumn) other; + return wrapped.equals(o.wrapped) && columnIndex == o.columnIndex; + } + return false; + } +} diff --git a/java/src/main/java/ai/rapids/cudf/GroupByScanAggregation.java b/java/src/main/java/ai/rapids/cudf/GroupByScanAggregation.java new file mode 100644 index 00000000000..219b6dde05d --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/GroupByScanAggregation.java @@ -0,0 +1,118 @@ +/* + * + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package ai.rapids.cudf; + +/** + * An aggregation that can be used for a grouped scan. + */ +public final class GroupByScanAggregation { + private final Aggregation wrapped; + + private GroupByScanAggregation(Aggregation wrapped) { + this.wrapped = wrapped; + } + + long createNativeInstance() { + return wrapped.createNativeInstance(); + } + + long getDefaultOutput() { + return wrapped.getDefaultOutput(); + } + + Aggregation getWrapped() { + return wrapped; + } + + /** + * Add a column to the Aggregation so it can be used on a specific column of data. + * @param columnIndex the index of the column to operate on. + */ + public GroupByScanAggregationOnColumn onColumn(int columnIndex) { + return new GroupByScanAggregationOnColumn(this, columnIndex); + } + + @Override + public int hashCode() { + return wrapped.hashCode(); + } + + @Override + public boolean equals(Object other) { + if (other == this) { + return true; + } else if (other instanceof GroupByScanAggregation) { + GroupByScanAggregation o = (GroupByScanAggregation) other; + return wrapped.equals(o.wrapped); + } + return false; + } + + /** + * Sum Aggregation + */ + public static GroupByScanAggregation sum() { + return new GroupByScanAggregation(Aggregation.sum()); + } + + + /** + * Product Aggregation. + */ + public static GroupByScanAggregation product() { + return new GroupByScanAggregation(Aggregation.product()); + } + + /** + * Min Aggregation + */ + public static GroupByScanAggregation min() { + return new GroupByScanAggregation(Aggregation.min()); + } + + /** + * Max Aggregation + */ + public static GroupByScanAggregation max() { + return new GroupByScanAggregation(Aggregation.max()); + } + + /** + * Count number of elements. + * @param nullPolicy INCLUDE if nulls should be counted. EXCLUDE if only non-null values + * should be counted. + */ + public static GroupByScanAggregation count(NullPolicy nullPolicy) { + return new GroupByScanAggregation(Aggregation.count(nullPolicy)); + } + + /** + * Get the row's ranking. + */ + public static GroupByScanAggregation rank() { + return new GroupByScanAggregation(Aggregation.rank()); + } + + /** + * Get the row's dense ranking. + */ + public static GroupByScanAggregation denseRank() { + return new GroupByScanAggregation(Aggregation.denseRank()); + } +} diff --git a/java/src/main/java/ai/rapids/cudf/GroupByScanAggregationOnColumn.java b/java/src/main/java/ai/rapids/cudf/GroupByScanAggregationOnColumn.java new file mode 100644 index 00000000000..75e4936e5b9 --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/GroupByScanAggregationOnColumn.java @@ -0,0 +1,64 @@ +/* + * + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package ai.rapids.cudf; + +/** + * A GroupByScanAggregation for a specific column in a table. + */ +public final class GroupByScanAggregationOnColumn { + protected final GroupByScanAggregation wrapped; + protected final int columnIndex; + + GroupByScanAggregationOnColumn(GroupByScanAggregation wrapped, int columnIndex) { + this.wrapped = wrapped; + this.columnIndex = columnIndex; + } + + public int getColumnIndex() { + return columnIndex; + } + + @Override + public int hashCode() { + return 31 * wrapped.hashCode() + columnIndex; + } + + @Override + public boolean equals(Object other) { + if (other == this) { + return true; + } else if (other instanceof GroupByScanAggregationOnColumn) { + GroupByScanAggregationOnColumn o = (GroupByScanAggregationOnColumn) other; + return wrapped.equals(o.wrapped) && columnIndex == o.columnIndex; + } + return false; + } + + long createNativeInstance() { + return wrapped.createNativeInstance(); + } + + long getDefaultOutput() { + return wrapped.getDefaultOutput(); + } + + GroupByScanAggregation getWrapped() { + return wrapped; + } +} diff --git a/java/src/main/java/ai/rapids/cudf/HashJoin.java b/java/src/main/java/ai/rapids/cudf/HashJoin.java new file mode 100644 index 00000000000..620a7ce6a6c --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/HashJoin.java @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package ai.rapids.cudf; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * This class represents a hash table built from the join keys of the right-side table for a + * join operation. This hash table can then be reused across a series of left probe tables + * to compute gather maps for joins more efficiently when the right-side table is not changing. + * It can also be used to query the output row count of a join and then pass that result to the + * operation that generates the join gather maps to avoid redundant computation when the output + * row count must be checked before manifesting the join gather maps. 
+ */ +public class HashJoin implements AutoCloseable { + static { + NativeDepsLoader.loadNativeDeps(); + } + + private static final Logger log = LoggerFactory.getLogger(HashJoin.class); + + private static class HashJoinCleaner extends MemoryCleaner.Cleaner { + private Table buildKeys; + private long nativeHandle; + + HashJoinCleaner(Table buildKeys, long nativeHandle) { + this.buildKeys = buildKeys; + this.nativeHandle = nativeHandle; + addRef(); + } + + @Override + protected synchronized boolean cleanImpl(boolean logErrorIfNotClean) { + long origAddress = nativeHandle; + boolean neededCleanup = nativeHandle != 0; + if (neededCleanup) { + try { + destroy(nativeHandle); + buildKeys.close(); + buildKeys = null; + } finally { + nativeHandle = 0; + } + if (logErrorIfNotClean) { + log.error("A HASH TABLE WAS LEAKED (ID: " + id + " " + Long.toHexString(origAddress)); + } + } + return neededCleanup; + } + + @Override + public boolean isClean() { + return nativeHandle == 0; + } + } + + private final HashJoinCleaner cleaner; + private final boolean compareNulls; + private boolean isClosed = false; + + /** + * Construct a hash table for a join from a table representing the join key columns from the + * right-side table in the join. The resulting instance must be closed to release the + * GPU resources associated with the instance. + * @param buildKeys table view containing the join keys for the right-side join table + * @param compareNulls true if null key values should match otherwise false + */ + public HashJoin(Table buildKeys, boolean compareNulls) { + this.compareNulls = compareNulls; + Table buildTable = new Table(buildKeys.getColumns()); + try { + long handle = create(buildTable.getNativeView(), compareNulls); + this.cleaner = new HashJoinCleaner(buildTable, handle); + MemoryCleaner.register(this, cleaner); + } catch (Throwable t) { + try { + buildTable.close(); + } catch (Throwable t2) { + t.addSuppressed(t2); + } + throw t; + } + } + + @Override + public synchronized void close() { + cleaner.delRef(); + if (isClosed) { + cleaner.logRefCountDebug("double free " + this); + throw new IllegalStateException("Close called too many times " + this); + } + cleaner.clean(false); + isClosed = true; + } + + long getNativeView() { + return cleaner.nativeHandle; + } + + /** Get the number of join key columns for the table that was used to generate the has table. */ + public long getNumberOfColumns() { + return cleaner.buildKeys.getNumberOfColumns(); + } + + /** Returns true if the hash table was built to match on nulls otherwise false. */ + public boolean getCompareNulls() { + return compareNulls; + } + + private static native long create(long tableView, boolean nullEqual); + private static native void destroy(long handle); +} diff --git a/java/src/main/java/ai/rapids/cudf/MemoryCleaner.java b/java/src/main/java/ai/rapids/cudf/MemoryCleaner.java index 4bf38543a2d..a936d4830ee 100644 --- a/java/src/main/java/ai/rapids/cudf/MemoryCleaner.java +++ b/java/src/main/java/ai/rapids/cudf/MemoryCleaner.java @@ -277,6 +277,10 @@ public static void register(CompiledExpression expr, Cleaner cleaner) { all.add(new CleanerWeakReference(expr, cleaner, collected, false)); } + static void register(HashJoin hashJoin, Cleaner cleaner) { + all.add(new CleanerWeakReference(hashJoin, cleaner, collected, true)); + } + /** * This is not 100% perfect and we can still run into situations where RMM buffers were not * collected and this returns false because of thread race conditions. This is just a best effort. 
diff --git a/java/src/main/java/ai/rapids/cudf/ParquetColumnWriterOptions.java b/java/src/main/java/ai/rapids/cudf/ParquetColumnWriterOptions.java index f5b0a0f74b3..229cb0262d3 100644 --- a/java/src/main/java/ai/rapids/cudf/ParquetColumnWriterOptions.java +++ b/java/src/main/java/ai/rapids/cudf/ParquetColumnWriterOptions.java @@ -28,6 +28,7 @@ public class ParquetColumnWriterOptions { private boolean isTimestampTypeInt96; private int precision; private boolean isNullable; + private boolean isMap = false; private String columName; private ParquetColumnWriterOptions(AbstractStructBuilder builder) { this.columName = builder.name; @@ -122,6 +123,15 @@ public T withListColumn(ParquetListColumnWriterOptions child) { return (T) this; } + /** + * Set the map column meta. + * @return this for chaining. + */ + public T withMapColumn(ParquetColumnWriterOptions child) { + children.add(child); + return (T) this; + } + /** * Set a child struct meta data * @return this for chaining. @@ -220,22 +230,22 @@ public T withNullableTimestampColumn(String name, boolean isInt96) { public abstract V build(); } - ParquetColumnWriterOptions(String columnName, boolean isTimestampTypeInt96, - int precision, boolean isNullable) { + public ParquetColumnWriterOptions(String columnName, boolean isTimestampTypeInt96, + int precision, boolean isNullable) { this.isTimestampTypeInt96 = isTimestampTypeInt96; this.precision = precision; this.isNullable = isNullable; this.columName = columnName; } - ParquetColumnWriterOptions(String columnName, boolean isNullable) { + public ParquetColumnWriterOptions(String columnName, boolean isNullable) { this.isTimestampTypeInt96 = false; this.precision = 0; this.isNullable = isNullable; this.columName = columnName; } - ParquetColumnWriterOptions(String columnName) { + public ParquetColumnWriterOptions(String columnName) { this(columnName, true); } @@ -295,6 +305,15 @@ boolean[] getFlatIsNullable() { } } + boolean[] getFlatIsMap() { + boolean[] ret = {isMap}; + if (childColumnOptions.length > 0) { + return getFlatBooleans(ret, (opt) -> opt.getFlatIsMap()); + } else { + return ret; + } + } + int[] getFlatNumChildren() { int[] ret = {childColumnOptions.length}; if (childColumnOptions.length > 0) { @@ -351,6 +370,27 @@ protected String[] getFlatColumnNames(String[] ret) { return result; } + /** + * Add a Map Column to the schema. + *

+ * Maps are List columns with a Struct named 'key_value' with a child named 'key' and a child + * named 'value'. The caller of this method doesn't need to worry about this as this method will + * take care of this without the knowledge of the caller. + */ + public static ParquetColumnWriterOptions mapColumn(String name, ParquetColumnWriterOptions key, + ParquetColumnWriterOptions value) { + ParquetStructColumnWriterOptions struct = structBuilder("key_value").build(); + if (key.isNullable) { + throw new IllegalArgumentException("key column can not be nullable"); + } + struct.childColumnOptions = new ParquetColumnWriterOptions[]{key, value}; + ParquetColumnWriterOptions opt = listBuilder(name) + .withStructColumn(struct) + .build(); + opt.isMap = true; + return opt; + } + /** * Creates a ListBuilder for column called 'name' */ diff --git a/java/src/main/java/ai/rapids/cudf/ParquetWriterOptions.java b/java/src/main/java/ai/rapids/cudf/ParquetWriterOptions.java index 9992ae9eaf1..38f8d8e59a4 100644 --- a/java/src/main/java/ai/rapids/cudf/ParquetWriterOptions.java +++ b/java/src/main/java/ai/rapids/cudf/ParquetWriterOptions.java @@ -57,6 +57,11 @@ boolean[] getFlatIsNullable() { return super.getFlatBooleans(new boolean[]{}, (opt) -> opt.getFlatIsNullable()); } + @Override + boolean[] getFlatIsMap() { + return super.getFlatBooleans(new boolean[]{}, (opt) -> opt.getFlatIsMap()); + } + @Override String[] getFlatColumnNames() { return super.getFlatColumnNames(new String[]{}); diff --git a/java/src/main/java/ai/rapids/cudf/ReductionAggregation.java b/java/src/main/java/ai/rapids/cudf/ReductionAggregation.java new file mode 100644 index 00000000000..7eff85dcd0d --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/ReductionAggregation.java @@ -0,0 +1,212 @@ +/* + * + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package ai.rapids.cudf; + +/** + * An aggregation that can be used for a reduce. + */ +public final class ReductionAggregation { + private final Aggregation wrapped; + + private ReductionAggregation(Aggregation wrapped) { + this.wrapped = wrapped; + } + + long createNativeInstance() { + return wrapped.createNativeInstance(); + } + + long getDefaultOutput() { + return wrapped.getDefaultOutput(); + } + + Aggregation getWrapped() { + return wrapped; + } + + @Override + public int hashCode() { + return wrapped.hashCode(); + } + + @Override + public boolean equals(Object other) { + if (other == this) { + return true; + } else if (other instanceof ReductionAggregation) { + ReductionAggregation o = (ReductionAggregation) other; + return wrapped.equals(o.wrapped); + } + return false; + } + + /** + * Sum Aggregation + */ + public static ReductionAggregation sum() { + return new ReductionAggregation(Aggregation.sum()); + } + + /** + * Product Aggregation. 
+ */ + public static ReductionAggregation product() { + return new ReductionAggregation(Aggregation.product()); + } + + /** + * Min Aggregation + */ + public static ReductionAggregation min() { + return new ReductionAggregation(Aggregation.min()); + } + + /** + * Max Aggregation + */ + public static ReductionAggregation max() { + return new ReductionAggregation(Aggregation.max()); + } + + /** + * Any reduction. Produces a true or 1, depending on the output type, + * if any of the elements in the range are true or non-zero, otherwise produces a false or 0. + * Null values are skipped. + */ + public static ReductionAggregation any() { + return new ReductionAggregation(Aggregation.any()); + } + + /** + * All reduction. Produces true or 1, depending on the output type, if all of the elements in + * the range are true or non-zero, otherwise produces a false or 0. + * Null values are skipped. + */ + public static ReductionAggregation all() { + return new ReductionAggregation(Aggregation.all()); + } + + + /** + * Sum of squares reduction. + */ + public static ReductionAggregation sumOfSquares() { + return new ReductionAggregation(Aggregation.sumOfSquares()); + } + + /** + * Arithmetic mean reduction. + */ + public static ReductionAggregation mean() { + return new ReductionAggregation(Aggregation.mean()); + } + + + /** + * Variance aggregation with 1 as the delta degrees of freedom. + */ + public static ReductionAggregation variance() { + return new ReductionAggregation(Aggregation.variance()); + } + + /** + * Variance aggregation. + * @param ddof delta degrees of freedom. The divisor used in calculation of variance is + * N - ddof, where N is the population size. + */ + public static ReductionAggregation variance(int ddof) { + return new ReductionAggregation(Aggregation.variance(ddof)); + } + + /** + * Standard deviation aggregation with 1 as the delta degrees of freedom. + */ + public static ReductionAggregation standardDeviation() { + return new ReductionAggregation(Aggregation.standardDeviation()); + } + + /** + * Standard deviation aggregation. + * @param ddof delta degrees of freedom. The divisor used in calculation of std is + * N - ddof, where N is the population size. + */ + public static ReductionAggregation standardDeviation(int ddof) { + return new ReductionAggregation(Aggregation.standardDeviation(ddof)); + } + + + /** + * Median reduction. + */ + public static ReductionAggregation median() { + return new ReductionAggregation(Aggregation.median()); + } + + /** + * Aggregate to compute the specified quantiles. Uses linear interpolation by default. + */ + public static ReductionAggregation quantile(double ... quantiles) { + return new ReductionAggregation(Aggregation.quantile(quantiles)); + } + + /** + * Aggregate to compute various quantiles. + */ + public static ReductionAggregation quantile(QuantileMethod method, double ... quantiles) { + return new ReductionAggregation(Aggregation.quantile(method, quantiles)); + } + + + /** + * Number of unique, non-null, elements. + */ + public static ReductionAggregation nunique() { + return new ReductionAggregation(Aggregation.nunique()); + } + + /** + * Number of unique elements. + * @param nullPolicy INCLUDE if nulls should be counted else EXCLUDE. If nulls are counted they + * compare as equal so multiple null values in a range would all only + * increase the count by 1. 
+ */ + public static ReductionAggregation nunique(NullPolicy nullPolicy) { + return new ReductionAggregation(Aggregation.nunique(nullPolicy)); + } + + /** + * Get the nth, non-null, element in a group. + * @param offset the offset to look at. Negative numbers go from the end of the group. Any + * value outside of the group range results in a null. + */ + public static ReductionAggregation nth(int offset) { + return new ReductionAggregation(Aggregation.nth(offset)); + } + + /** + * Get the nth element in a group. + * @param offset the offset to look at. Negative numbers go from the end of the group. Any + * value outside of the group range results in a null. + * @param nullPolicy INCLUDE if nulls should be included in the aggregation or EXCLUDE if they + * should be skipped. + */ + public static ReductionAggregation nth(int offset, NullPolicy nullPolicy) { + return new ReductionAggregation(Aggregation.nth(offset, nullPolicy)); + } +} diff --git a/java/src/main/java/ai/rapids/cudf/RollingAggregation.java b/java/src/main/java/ai/rapids/cudf/RollingAggregation.java index 9b80924463a..07983f77aad 100644 --- a/java/src/main/java/ai/rapids/cudf/RollingAggregation.java +++ b/java/src/main/java/ai/rapids/cudf/RollingAggregation.java @@ -19,11 +19,189 @@ package ai.rapids.cudf; /** - * Used to tag an aggregation as something that is compatible with rolling window operations. - * Do not try to implement this yourself + * An aggregation that can be used on rolling windows. */ -public interface RollingAggregation { - default T getBaseAggregation() { - return (T)this; +public final class RollingAggregation { + private final Aggregation wrapped; + + private RollingAggregation(Aggregation wrapped) { + this.wrapped = wrapped; + } + + long createNativeInstance() { + return wrapped.createNativeInstance(); + } + + long getDefaultOutput() { + return wrapped.getDefaultOutput(); + } + + /** + * Add a column to the Aggregation so it can be used on a specific column of data. + * @param columnIndex the index of the column to operate on. + */ + public RollingAggregationOnColumn onColumn(int columnIndex) { + return new RollingAggregationOnColumn(this, columnIndex); + } + + @Override + public int hashCode() { + return wrapped.hashCode(); + } + + @Override + public boolean equals(Object other) { + if (other == this) { + return true; + } else if (other instanceof RollingAggregation) { + RollingAggregation o = (RollingAggregation) other; + return wrapped.equals(o.wrapped); + } + return false; + } + + /** + * Rolling Window Sum + */ + public static RollingAggregation sum() { + return new RollingAggregation(Aggregation.sum()); + } + + + /** + * Rolling Window Min + */ + public static RollingAggregation min() { + return new RollingAggregation(Aggregation.min()); + } + + /** + * Rolling Window Max + */ + public static RollingAggregation max() { + return new RollingAggregation(Aggregation.max()); + } + + + /** + * Count number of valid, a.k.a. non-null, elements. + */ + public static RollingAggregation count() { + return new RollingAggregation(Aggregation.count()); + } + + /** + * Count number of elements. + * @param nullPolicy INCLUDE if nulls should be counted. EXCLUDE if only non-null values + * should be counted. + */ + public static RollingAggregation count(NullPolicy nullPolicy) { + return new RollingAggregation(Aggregation.count(nullPolicy)); + } + + /** + * Arithmetic Mean + */ + public static RollingAggregation mean() { + return new RollingAggregation(Aggregation.mean()); + } + + + /** + * Index of max element. 
+ */ + public static RollingAggregation argMax() { + return new RollingAggregation(Aggregation.argMax()); + } + + /** + * Index of min element. + */ + public static RollingAggregation argMin() { + return new RollingAggregation(Aggregation.argMin()); + } + + + /** + * Get the row number. + */ + public static RollingAggregation rowNumber() { + return new RollingAggregation(Aggregation.rowNumber()); + } + + + /** + * In a rolling window return the value offset entries ahead or null if it is outside of the + * window. + */ + public static RollingAggregation lead(int offset) { + return lead(offset, null); + } + + /** + * In a rolling window return the value offset entries ahead or the corresponding value from + * defaultOutput if it is outside of the window. Note that this does not take any ownership of + * defaultOutput and the caller must ensure that defaultOutput remains valid during the life + * time of this aggregation operation. + */ + public static RollingAggregation lead(int offset, ColumnVector defaultOutput) { + return new RollingAggregation(Aggregation.lead(offset, defaultOutput)); + } + + + + /** + * In a rolling window return the value offset entries behind or null if it is outside of the + * window. + */ + public static RollingAggregation lag(int offset) { + return lag(offset, null); + } + + /** + * In a rolling window return the value offset entries behind or the corresponding value from + * defaultOutput if it is outside of the window. Note that this does not take any ownership of + * defaultOutput and the caller must ensure that defaultOutput remains valid during the life + * time of this aggregation operation. + */ + public static RollingAggregation lag(int offset, ColumnVector defaultOutput) { + return new RollingAggregation(Aggregation.lag(offset, defaultOutput)); + } + + + /** + * Collect the values into a list. Nulls will be skipped. + */ + public static RollingAggregation collectList() { + return new RollingAggregation(Aggregation.collectList()); + } + + /** + * Collect the values into a list. + * + * @param nullPolicy Indicates whether to include/exclude nulls during collection. + */ + public static RollingAggregation collectList(NullPolicy nullPolicy) { + return new RollingAggregation(Aggregation.collectList(nullPolicy)); + } + + + /** + * Collect the values into a set. All null values will be excluded, and all NaN values are regarded as + * unique instances. + */ + public static RollingAggregation collectSet() { + return new RollingAggregation(Aggregation.collectSet()); + } + + /** + * Collect the values into a set. + * + * @param nullPolicy Indicates whether to include/exclude nulls during collection. + * @param nullEquality Flag to specify whether null entries within each list should be considered equal. + * @param nanEquality Flag to specify whether NaN values in floating point column should be considered equal.
+ */ + public static RollingAggregation collectSet(NullPolicy nullPolicy, NullEquality nullEquality, NaNEquality nanEquality) { + return new RollingAggregation(Aggregation.collectSet(nullPolicy, nullEquality, nanEquality)); } } diff --git a/java/src/main/java/ai/rapids/cudf/AggregationOnColumn.java b/java/src/main/java/ai/rapids/cudf/RollingAggregationOnColumn.java similarity index 55% rename from java/src/main/java/ai/rapids/cudf/AggregationOnColumn.java rename to java/src/main/java/ai/rapids/cudf/RollingAggregationOnColumn.java index bb1404e5a07..a6b1484aa71 100644 --- a/java/src/main/java/ai/rapids/cudf/AggregationOnColumn.java +++ b/java/src/main/java/ai/rapids/cudf/RollingAggregationOnColumn.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,47 +19,24 @@ package ai.rapids.cudf; /** - * An Aggregation instance that also holds a column number so the aggregation can be done on - * a specific column of data in a table. + * A RollingAggregation for a specific column in a table. */ -public class AggregationOnColumn extends Aggregation { - protected final T wrapped; +public final class RollingAggregationOnColumn { + protected final RollingAggregation wrapped; protected final int columnIndex; - AggregationOnColumn(T wrapped, int columnIndex) { - super(wrapped.kind); + RollingAggregationOnColumn(RollingAggregation wrapped, int columnIndex) { this.wrapped = wrapped; this.columnIndex = columnIndex; } - @Override - public AggregationOnColumn onColumn(int columnIndex) { - if (columnIndex == getColumnIndex()) { - return this; // NOOP - } else { - return new AggregationOnColumn(this.wrapped, columnIndex); - } - } - - /** - * Do the aggregation over a given Window. - */ - public > AggregationOverWindow overWindow(WindowOptions windowOptions) { - return new AggregationOverWindow(wrapped, columnIndex, windowOptions); - } - public int getColumnIndex() { return columnIndex; } - @Override - long createNativeInstance() { - return wrapped.createNativeInstance(); - } - @Override - long getDefaultOutput() { - return wrapped.getDefaultOutput(); + public AggregationOverWindow overWindow(WindowOptions windowOptions) { + return new AggregationOverWindow(this, windowOptions); } @Override @@ -71,10 +48,18 @@ public int hashCode() { public boolean equals(Object other) { if (other == this) { return true; - } else if (other instanceof AggregationOnColumn) { - AggregationOnColumn o = (AggregationOnColumn) other; + } else if (other instanceof RollingAggregationOnColumn) { + RollingAggregationOnColumn o = (RollingAggregationOnColumn) other; return wrapped.equals(o.wrapped) && columnIndex == o.columnIndex; } return false; } + + long createNativeInstance() { + return wrapped.createNativeInstance(); + } + + long getDefaultOutput() { + return wrapped.getDefaultOutput(); + } } diff --git a/java/src/main/java/ai/rapids/cudf/ScanAggregation.java b/java/src/main/java/ai/rapids/cudf/ScanAggregation.java new file mode 100644 index 00000000000..08489562adc --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/ScanAggregation.java @@ -0,0 +1,100 @@ +/* + * + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package ai.rapids.cudf; + +/** + * An aggregation that can be used for a scan. + */ +public final class ScanAggregation { + private final Aggregation wrapped; + + private ScanAggregation(Aggregation wrapped) { + this.wrapped = wrapped; + } + + long createNativeInstance() { + return wrapped.createNativeInstance(); + } + + long getDefaultOutput() { + return wrapped.getDefaultOutput(); + } + + Aggregation getWrapped() { + return wrapped; + } + + @Override + public int hashCode() { + return wrapped.hashCode(); + } + + @Override + public boolean equals(Object other) { + if (other == this) { + return true; + } else if (other instanceof ScanAggregation) { + ScanAggregation o = (ScanAggregation) other; + return wrapped.equals(o.wrapped); + } + return false; + } + + /** + * Sum Aggregation + */ + public static ScanAggregation sum() { + return new ScanAggregation(Aggregation.sum()); + } + + /** + * Product Aggregation. + */ + public static ScanAggregation product() { + return new ScanAggregation(Aggregation.product()); + } + + /** + * Min Aggregation + */ + public static ScanAggregation min() { + return new ScanAggregation(Aggregation.min()); + } + + /** + * Max Aggregation + */ + public static ScanAggregation max() { + return new ScanAggregation(Aggregation.max()); + } + + /** + * Get the row's ranking. + */ + public static ScanAggregation rank() { + return new ScanAggregation(Aggregation.rank()); + } + + /** + * Get the row's dense ranking. 
+ */ + public static ScanAggregation denseRank() { + return new ScanAggregation(Aggregation.denseRank()); + } +} diff --git a/java/src/main/java/ai/rapids/cudf/Schema.java b/java/src/main/java/ai/rapids/cudf/Schema.java index f0bc3d930d9..c90d27efa97 100644 --- a/java/src/main/java/ai/rapids/cudf/Schema.java +++ b/java/src/main/java/ai/rapids/cudf/Schema.java @@ -27,11 +27,11 @@ public class Schema { public static final Schema INFERRED = new Schema(); private final List<String> names; - private final List<String> typeNames; + private final List<DType> types; - private Schema(List<String> names, List<String> typeNames) { + private Schema(List<String> names, List<DType> types) { this.names = new ArrayList<>(names); - this.typeNames = new ArrayList<>(typeNames); + this.types = new ArrayList<>(types); } /** @@ -39,7 +39,7 @@ private Schema(List<String> names, List<String> typeNames) { */ private Schema() { names = null; - typeNames = null; + types = null; } public static Builder builder() { @@ -53,25 +53,40 @@ public String[] getColumnNames() { return names.toArray(new String[names.size()]); } - String[] getTypesAsStrings() { - if (typeNames == null) { + int[] getTypeIds() { + if (types == null) { return null; } - return typeNames.toArray(new String[typeNames.size()]); + int[] ret = new int[types.size()]; + for (int i = 0; i < types.size(); i++) { + ret[i] = types.get(i).getTypeId().nativeId; + } + return ret; + } + + int[] getTypeScales() { + if (types == null) { + return null; + } + int[] ret = new int[types.size()]; + for (int i = 0; i < types.size(); i++) { + ret[i] = types.get(i).getScale(); + } + return ret; } public static class Builder { private final List<String> names = new ArrayList<>(); - private final List<String> typeNames = new ArrayList<>(); + private final List<DType> types = new ArrayList<>(); public Builder column(DType type, String name) { - typeNames.add(type.getSimpleName()); + types.add(type); names.add(name); return this; } public Schema build() { - return new Schema(names, typeNames); + return new Schema(names, types); } } } diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index 96a9b608f06..eeb2d308f1a 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -170,10 +170,19 @@ public long getDeviceMemorySize() { return total; } + /** + * This method is internal and exposed purely for testing purposes + */ + static Table removeNullMasksIfNeeded(Table table) { + return new Table(removeNullMasksIfNeeded(table.nativeHandle)); + } + ///////////////////////////////////////////////////////////////////////////// // NATIVE APIs ///////////////////////////////////////////////////////////////////////////// - + + private static native long[] removeNullMasksIfNeeded(long tableView) throws CudfException; + private static native ContiguousTable[] contiguousSplit(long inputTable, int[] indices); private static native long[] partition(long inputTable, long partitionView, @@ -200,7 +209,8 @@ private static native long bound(long inputTable, long valueTable, * into a java * object to try and pull out all of the options. If this becomes unwieldy we can change it. * @param columnNames names of all of the columns, even the ones filtered out - * @param dTypes types of all of the columns as strings. Why strings? who knows. + * @param dTypeIds native type IDs of all of the columns. + * @param dTypeScales scale of the type for all of the columns.
* @param filterColumnNames name of the columns to read, or an empty array if we want to read * all of them * @param filePath the path of the file to read, or null if no path should be read. @@ -214,7 +224,8 @@ private static native long bound(long inputTable, long valueTable, * @param trueValues values that should be treated as boolean true * @param falseValues values that should be treated as boolean false */ - private static native long[] readCSV(String[] columnNames, String[] dTypes, + private static native long[] readCSV(String[] columnNames, + int[] dTypeIds, int[] dTypeScales, String[] filterColumnNames, String filePath, long address, long length, int headerRow, byte delim, byte quote, @@ -248,6 +259,7 @@ private static native long[] readParquet(String[] filterColumnNames, String file * @param isInt96 true if timestamp type is int96 * @param precisions precision list containing all the precisions of the decimal types in * the columns + * @param isMapValues true if a column is a map * @param filename local output path * @return a handle that is used in later calls to writeParquetChunk and writeParquetEnd. */ @@ -261,7 +273,7 @@ private static native long writeParquetFileBegin(String[] columnNames, int statsFreq, boolean[] isInt96, int[] precisions, - String filename) throws CudfException; + boolean[] isMapValues, String filename) throws CudfException; /** * Setup everything to write parquet formatted data to a buffer. @@ -276,6 +288,7 @@ private static native long writeParquetFileBegin(String[] columnNames, * @param isInt96 true if timestamp type is int96 * @param precisions precision list containing all the precisions of the decimal types in * the columns + * @param isMapValues true if a column is a map * @param consumer consumer of host buffers produced. * @return a handle that is used in later calls to writeParquetChunk and writeParquetEnd. 
*/ @@ -289,6 +302,7 @@ private static native long writeParquetBufferBegin(String[] columnNames, int statsFreq, boolean[] isInt96, int[] precisions, + boolean[] isMapValues, HostBufferConsumer consumer) throws CudfException; /** @@ -500,18 +514,48 @@ private static native long[] leftJoin(long leftTable, int[] leftJoinCols, long r private static native long[] leftJoinGatherMaps(long leftKeys, long rightKeys, boolean compareNullsEqual) throws CudfException; + private static native long leftJoinRowCount(long leftTable, long rightHashJoin, + boolean nullsEqual) throws CudfException; + + private static native long[] leftHashJoinGatherMaps(long leftTable, long rightHashJoin, + boolean nullsEqual) throws CudfException; + + private static native long[] leftHashJoinGatherMapsWithCount(long leftTable, long rightHashJoin, + boolean nullsEqual, + long outputRowCount) throws CudfException; + private static native long[] innerJoin(long leftTable, int[] leftJoinCols, long rightTable, int[] rightJoinCols, boolean compareNullsEqual) throws CudfException; private static native long[] innerJoinGatherMaps(long leftKeys, long rightKeys, boolean compareNullsEqual) throws CudfException; + private static native long innerJoinRowCount(long table, long hashJoin, + boolean nullsEqual) throws CudfException; + + private static native long[] innerHashJoinGatherMaps(long table, long hashJoin, + boolean nullsEqual) throws CudfException; + + private static native long[] innerHashJoinGatherMapsWithCount(long table, long hashJoin, + boolean nullsEqual, + long outputRowCount) throws CudfException; + private static native long[] fullJoin(long leftTable, int[] leftJoinCols, long rightTable, int[] rightJoinCols, boolean compareNullsEqual) throws CudfException; private static native long[] fullJoinGatherMaps(long leftKeys, long rightKeys, boolean compareNullsEqual) throws CudfException; + private static native long fullJoinRowCount(long leftTable, long rightHashJoin, + boolean nullsEqual) throws CudfException; + + private static native long[] fullHashJoinGatherMaps(long leftTable, long rightHashJoin, + boolean nullsEqual) throws CudfException; + + private static native long[] fullHashJoinGatherMapsWithCount(long leftTable, long rightHashJoin, + boolean nullsEqual, + long outputRowCount) throws CudfException; + private static native long[] leftSemiJoin(long leftTable, int[] leftJoinCols, long rightTable, int[] rightJoinCols, boolean compareNullsEqual) throws CudfException; @@ -524,26 +568,67 @@ private static native long[] leftAntiJoin(long leftTable, int[] leftJoinCols, lo private static native long[] leftAntiJoinGatherMap(long leftKeys, long rightKeys, boolean compareNullsEqual) throws CudfException; + private static native long conditionalLeftJoinRowCount(long leftTable, long rightTable, + long condition, + boolean compareNullsEqual) throws CudfException; + private static native long[] conditionalLeftJoinGatherMaps(long leftTable, long rightTable, long condition, boolean compareNullsEqual) throws CudfException; + private static native long[] conditionalLeftJoinGatherMapsWithCount(long leftTable, long rightTable, + long condition, + boolean compareNullsEqual, + long rowCount) throws CudfException; + + private static native long conditionalInnerJoinRowCount(long leftTable, long rightTable, + long condition, + boolean compareNullsEqual) throws CudfException; + private static native long[] conditionalInnerJoinGatherMaps(long leftTable, long rightTable, long condition, boolean compareNullsEqual) throws CudfException; + private 
static native long[] conditionalInnerJoinGatherMapsWithCount(long leftTable, long rightTable, + long condition, + boolean compareNullsEqual, + long rowCount) throws CudfException; + private static native long[] conditionalFullJoinGatherMaps(long leftTable, long rightTable, long condition, boolean compareNullsEqual) throws CudfException; + private static native long[] conditionalFullJoinGatherMapsWithCount(long leftTable, long rightTable, + long condition, + boolean compareNullsEqual, + long rowCount) throws CudfException; + + private static native long conditionalLeftSemiJoinRowCount(long leftTable, long rightTable, + long condition, + boolean compareNullsEqual) throws CudfException; + private static native long[] conditionalLeftSemiJoinGatherMap(long leftTable, long rightTable, long condition, boolean compareNullsEqual) throws CudfException; + private static native long[] conditionalLeftSemiJoinGatherMapWithCount(long leftTable, long rightTable, + long condition, + boolean compareNullsEqual, + long rowCount) throws CudfException; + + private static native long conditionalLeftAntiJoinRowCount(long leftTable, long rightTable, + long condition, + boolean compareNullsEqual) throws CudfException; + private static native long[] conditionalLeftAntiJoinGatherMap(long leftTable, long rightTable, long condition, boolean compareNullsEqual) throws CudfException; + private static native long[] conditionalLeftAntiJoinGatherMapWithCount(long leftTable, long rightTable, + long condition, + boolean compareNullsEqual, + long rowCount) throws CudfException; + private static native long[] crossJoin(long leftTable, long rightTable) throws CudfException; private static native long[] concatenate(long[] cudfTablePointers) throws CudfException; @@ -608,7 +693,7 @@ public static Table readCSV(Schema schema, File path) { */ public static Table readCSV(Schema schema, CSVOptions opts, File path) { return new Table( - readCSV(schema.getColumnNames(), schema.getTypesAsStrings(), + readCSV(schema.getColumnNames(), schema.getTypeIds(), schema.getTypeScales(), opts.getIncludeColumnNames(), path.getAbsolutePath(), 0, 0, opts.getHeaderRow(), @@ -681,7 +766,7 @@ public static Table readCSV(Schema schema, CSVOptions opts, HostMemoryBuffer buf assert len > 0; assert len <= buffer.getLength() - offset; assert offset >= 0 && offset < buffer.length; - return new Table(readCSV(schema.getColumnNames(), schema.getTypesAsStrings(), + return new Table(readCSV(schema.getColumnNames(), schema.getTypeIds(), schema.getTypeScales(), opts.getIncludeColumnNames(), null, buffer.getAddress() + offset, len, opts.getHeaderRow(), @@ -864,6 +949,7 @@ private ParquetTableWriter(ParquetWriterOptions options, File outputFile) { String[] columnNames = options.getFlatColumnNames(); boolean[] columnNullabilities = options.getFlatIsNullable(); boolean[] timeInt96Values = options.getFlatIsTimeTypeInt96(); + boolean[] isMapValues = options.getFlatIsMap(); int[] precisions = options.getFlatPrecision(); int[] flatNumChildren = options.getFlatNumChildren(); @@ -878,6 +964,7 @@ private ParquetTableWriter(ParquetWriterOptions options, File outputFile) { options.getStatisticsFrequency().nativeId, timeInt96Values, precisions, + isMapValues, outputFile.getAbsolutePath()); } @@ -885,6 +972,7 @@ private ParquetTableWriter(ParquetWriterOptions options, HostBufferConsumer cons String[] columnNames = options.getFlatColumnNames(); boolean[] columnNullabilities = options.getFlatIsNullable(); boolean[] timeInt96Values = options.getFlatIsTimeTypeInt96(); + boolean[] 
isMapValues = options.getFlatIsMap(); int[] precisions = options.getFlatPrecision(); int[] flatNumChildren = options.getFlatNumChildren(); @@ -899,6 +987,7 @@ private ParquetTableWriter(ParquetWriterOptions options, HostBufferConsumer cons options.getStatisticsFrequency().nativeId, timeInt96Values, precisions, + isMapValues, consumer); } @@ -1990,6 +2079,84 @@ public GatherMap[] leftJoinGatherMaps(Table rightKeys, boolean compareNullsEqual return buildJoinGatherMaps(gatherMapData); } + /** + * Computes the number of rows resulting from a left equi-join between two tables. + * It is assumed this table instance holds the key columns from the left table, and the + * {@link HashJoin} argument has been constructed from the key columns from the right table. + * @param rightHash hash table built from join key columns from the right table + * @return row count of the join result + */ + public long leftJoinRowCount(HashJoin rightHash) { + if (getNumberOfColumns() != rightHash.getNumberOfColumns()) { + throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + + "rightKeys: " + rightHash.getNumberOfColumns()); + } + return leftJoinRowCount(getNativeView(), rightHash.getNativeView(), + rightHash.getCompareNulls()); + } + + /** + * Computes the gather maps that can be used to manifest the result of a left equi-join between + * two tables. It is assumed this table instance holds the key columns from the left table, and + * the {@link HashJoin} argument has been constructed from the key columns from the right table. + * Two {@link GatherMap} instances will be returned that can be used to gather the left and right + * tables, respectively, to produce the result of the left join. + * It is the responsibility of the caller to close the resulting gather map instances. + * @param rightHash hash table built from join key columns from the right table + * @return left and right table gather maps + */ + public GatherMap[] leftJoinGatherMaps(HashJoin rightHash) { + if (getNumberOfColumns() != rightHash.getNumberOfColumns()) { + throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + + "rightKeys: " + rightHash.getNumberOfColumns()); + } + long[] gatherMapData = + leftHashJoinGatherMaps(getNativeView(), rightHash.getNativeView(), + rightHash.getCompareNulls()); + return buildJoinGatherMaps(gatherMapData); + } + + /** + * Computes the gather maps that can be used to manifest the result of a left equi-join between + * two tables. It is assumed this table instance holds the key columns from the left table, and + * the {@link HashJoin} argument has been constructed from the key columns from the right table. + * Two {@link GatherMap} instances will be returned that can be used to gather the left and right + * tables, respectively, to produce the result of the left join. + * It is the responsibility of the caller to close the resulting gather map instances. + * This interface allows passing an output row count that was previously computed from + * {@link #leftJoinRowCount(HashJoin)}. + * WARNING: Passing a row count that is smaller than the actual row count will result + * in undefined behavior. 
+ * @param rightHash hash table built from join key columns from the right table + * @param outputRowCount number of output rows in the join result + * @return left and right table gather maps + */ + public GatherMap[] leftJoinGatherMaps(HashJoin rightHash, long outputRowCount) { + if (getNumberOfColumns() != rightHash.getNumberOfColumns()) { + throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + + "rightKeys: " + rightHash.getNumberOfColumns()); + } + long[] gatherMapData = + leftHashJoinGatherMapsWithCount(getNativeView(), rightHash.getNativeView(), + rightHash.getCompareNulls(), outputRowCount); + return buildJoinGatherMaps(gatherMapData); + } + + /** + * Computes the number of rows from the result of a left join between two tables when a + * conditional expression is true. It is assumed this table instance holds the columns from + * the left table, and the table argument represents the columns from the right table. + * @param rightTable the right side table of the join in the join + * @param condition conditional expression to evaluate during the join + * @param compareNullsEqual true if null key values should match otherwise false + * @return row count for the join result + */ + public long conditionalLeftJoinRowCount(Table rightTable, CompiledExpression condition, + boolean compareNullsEqual) { + return conditionalLeftJoinRowCount(getNativeView(), rightTable.getNativeView(), + condition.getNativeHandle(), compareNullsEqual); + } + /** * Computes the gather maps that can be used to manifest the result of a left join between * two tables when a conditional expression is true. It is assumed this table instance holds @@ -2002,18 +2169,42 @@ public GatherMap[] leftJoinGatherMaps(Table rightKeys, boolean compareNullsEqual * @param compareNullsEqual true if null key values should match otherwise false * @return left and right table gather maps */ - public GatherMap[] leftJoinGatherMaps(Table rightTable, CompiledExpression condition, - boolean compareNullsEqual) { - if (getNumberOfColumns() != rightTable.getNumberOfColumns()) { - throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + - "rightKeys: " + rightTable.getNumberOfColumns()); - } + public GatherMap[] conditionalLeftJoinGatherMaps(Table rightTable, + CompiledExpression condition, + boolean compareNullsEqual) { long[] gatherMapData = conditionalLeftJoinGatherMaps(getNativeView(), rightTable.getNativeView(), condition.getNativeHandle(), compareNullsEqual); return buildJoinGatherMaps(gatherMapData); } + /** + * Computes the gather maps that can be used to manifest the result of a left join between + * two tables when a conditional expression is true. It is assumed this table instance holds + * the columns from the left table, and the table argument represents the columns from the + * right table. Two {@link GatherMap} instances will be returned that can be used to gather + * the left and right tables, respectively, to produce the result of the left join. + * It is the responsibility of the caller to close the resulting gather map instances. + * This interface allows passing an output row count that was previously computed from + * {@link #conditionalLeftJoinRowCount(Table, CompiledExpression, boolean)}. + * WARNING: Passing a row count that is smaller than the actual row count will result + * in undefined behavior. 
+ * @param rightTable the right side table of the join in the join + * @param condition conditional expression to evaluate during the join + * @param compareNullsEqual true if null key values should match otherwise false + * @param outputRowCount number of output rows in the join result + * @return left and right table gather maps + */ + public GatherMap[] conditionalLeftJoinGatherMaps(Table rightTable, + CompiledExpression condition, + boolean compareNullsEqual, + long outputRowCount) { + long[] gatherMapData = + conditionalLeftJoinGatherMapsWithCount(getNativeView(), rightTable.getNativeView(), + condition.getNativeHandle(), compareNullsEqual, outputRowCount); + return buildJoinGatherMaps(gatherMapData); + } + /** * Computes the gather maps that can be used to manifest the result of an inner equi-join between * two tables. It is assumed this table instance holds the key columns from the left table, and @@ -2035,6 +2226,83 @@ public GatherMap[] innerJoinGatherMaps(Table rightKeys, boolean compareNullsEqua return buildJoinGatherMaps(gatherMapData); } + /** + * Computes the number of rows resulting from an inner equi-join between two tables. + * @param otherHash hash table built from join key columns from the other table + * @return row count of the join result + */ + public long innerJoinRowCount(HashJoin otherHash) { + if (getNumberOfColumns() != otherHash.getNumberOfColumns()) { + throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + + "otherKeys: " + otherHash.getNumberOfColumns()); + } + return innerJoinRowCount(getNativeView(), otherHash.getNativeView(), + otherHash.getCompareNulls()); + } + + /** + * Computes the gather maps that can be used to manifest the result of an inner equi-join between + * two tables. It is assumed this table instance holds the key columns from the left table, and + * the {@link HashJoin} argument has been constructed from the key columns from the right table. + * Two {@link GatherMap} instances will be returned that can be used to gather the left and right + * tables, respectively, to produce the result of the inner join. + * It is the responsibility of the caller to close the resulting gather map instances. + * @param rightHash hash table built from join key columns from the right table + * @return left and right table gather maps + */ + public GatherMap[] innerJoinGatherMaps(HashJoin rightHash) { + if (getNumberOfColumns() != rightHash.getNumberOfColumns()) { + throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + + "rightKeys: " + rightHash.getNumberOfColumns()); + } + long[] gatherMapData = + innerHashJoinGatherMaps(getNativeView(), rightHash.getNativeView(), + rightHash.getCompareNulls()); + return buildJoinGatherMaps(gatherMapData); + } + + /** + * Computes the gather maps that can be used to manifest the result of an inner equi-join between + * two tables. It is assumed this table instance holds the key columns from the left table, and + * the {@link HashJoin} argument has been constructed from the key columns from the right table. + * Two {@link GatherMap} instances will be returned that can be used to gather the left and right + * tables, respectively, to produce the result of the inner join. + * It is the responsibility of the caller to close the resulting gather map instances. + * This interface allows passing an output row count that was previously computed from + * {@link #innerJoinRowCount(HashJoin)}. 
+ * WARNING: Passing a row count that is smaller than the actual row count will result + * in undefined behavior. + * @param rightHash hash table built from join key columns from the right table + * @param outputRowCount number of output rows in the join result + * @return left and right table gather maps + */ + public GatherMap[] innerJoinGatherMaps(HashJoin rightHash, long outputRowCount) { + if (getNumberOfColumns() != rightHash.getNumberOfColumns()) { + throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + + "rightKeys: " + rightHash.getNumberOfColumns()); + } + long[] gatherMapData = + innerHashJoinGatherMapsWithCount(getNativeView(), rightHash.getNativeView(), + rightHash.getCompareNulls(), outputRowCount); + return buildJoinGatherMaps(gatherMapData); + } + + /** + * Computes the number of rows from the result of an inner join between two tables when a + * conditional expression is true. It is assumed this table instance holds the columns from + * the left table, and the table argument represents the columns from the right table. + * @param rightTable the right side table of the join in the join + * @param condition conditional expression to evaluate during the join + * @param compareNullsEqual true if null key values should match otherwise false + * @return row count for the join result + */ + public long conditionalInnerJoinRowCount(Table rightTable, + CompiledExpression condition, + boolean compareNullsEqual) { + return conditionalInnerJoinRowCount(getNativeView(), rightTable.getNativeView(), + condition.getNativeHandle(), compareNullsEqual); + } + /** * Computes the gather maps that can be used to manifest the result of an inner join between * two tables when a conditional expression is true. It is assumed this table instance holds @@ -2047,18 +2315,42 @@ public GatherMap[] innerJoinGatherMaps(Table rightKeys, boolean compareNullsEqua * @param compareNullsEqual true if null key values should match otherwise false * @return left and right table gather maps */ - public GatherMap[] innerJoinGatherMaps(Table rightTable, CompiledExpression condition, - boolean compareNullsEqual) { - if (getNumberOfColumns() != rightTable.getNumberOfColumns()) { - throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + - "rightKeys: " + rightTable.getNumberOfColumns()); - } + public GatherMap[] conditionalInnerJoinGatherMaps(Table rightTable, + CompiledExpression condition, + boolean compareNullsEqual) { long[] gatherMapData = conditionalInnerJoinGatherMaps(getNativeView(), rightTable.getNativeView(), condition.getNativeHandle(), compareNullsEqual); return buildJoinGatherMaps(gatherMapData); } + /** + * Computes the gather maps that can be used to manifest the result of an inner join between + * two tables when a conditional expression is true. It is assumed this table instance holds + * the columns from the left table, and the table argument represents the columns from the + * right table. Two {@link GatherMap} instances will be returned that can be used to gather + * the left and right tables, respectively, to produce the result of the inner join. + * It is the responsibility of the caller to close the resulting gather map instances. + * This interface allows passing an output row count that was previously computed from + * {@link #conditionalInnerJoinRowCount(Table, CompiledExpression, boolean)}. + * WARNING: Passing a row count that is smaller than the actual row count will result + * in undefined behavior. 
+ * @param rightTable the right side table of the join in the join + * @param condition conditional expression to evaluate during the join + * @param compareNullsEqual true if null key values should match otherwise false + * @param outputRowCount number of output rows in the join result + * @return left and right table gather maps + */ + public GatherMap[] conditionalInnerJoinGatherMaps(Table rightTable, + CompiledExpression condition, + boolean compareNullsEqual, + long outputRowCount) { + long[] gatherMapData = + conditionalInnerJoinGatherMapsWithCount(getNativeView(), rightTable.getNativeView(), + condition.getNativeHandle(), compareNullsEqual, outputRowCount); + return buildJoinGatherMaps(gatherMapData); + } + /** * Computes the gather maps that can be used to manifest the result of an full equi-join between * two tables. It is assumed this table instance holds the key columns from the left table, and @@ -2080,6 +2372,72 @@ public GatherMap[] fullJoinGatherMaps(Table rightKeys, boolean compareNullsEqual return buildJoinGatherMaps(gatherMapData); } + /** + * Computes the number of rows resulting from a full equi-join between two tables. + * It is assumed this table instance holds the key columns from the left table, and the + * {@link HashJoin} argument has been constructed from the key columns from the right table. + * Note that unlike {@link #leftJoinRowCount(HashJoin)} and {@link #innerJoinRowCount(HashJoin), + * this will perform some redundant calculations compared to + * {@link #fullJoinGatherMaps(HashJoin, long)}. + * @param rightHash hash table built from join key columns from the right table + * @return row count of the join result + */ + public long fullJoinRowCount(HashJoin rightHash) { + if (getNumberOfColumns() != rightHash.getNumberOfColumns()) { + throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + + "rightKeys: " + rightHash.getNumberOfColumns()); + } + return fullJoinRowCount(getNativeView(), rightHash.getNativeView(), + rightHash.getCompareNulls()); + } + + /** + * Computes the gather maps that can be used to manifest the result of a full equi-join between + * two tables. It is assumed this table instance holds the key columns from the left table, and + * the {@link HashJoin} argument has been constructed from the key columns from the right table. + * Two {@link GatherMap} instances will be returned that can be used to gather the left and right + * tables, respectively, to produce the result of the full join. + * It is the responsibility of the caller to close the resulting gather map instances. + * @param rightHash hash table built from join key columns from the right table + * @return left and right table gather maps + */ + public GatherMap[] fullJoinGatherMaps(HashJoin rightHash) { + if (getNumberOfColumns() != rightHash.getNumberOfColumns()) { + throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + + "rightKeys: " + rightHash.getNumberOfColumns()); + } + long[] gatherMapData = + fullHashJoinGatherMaps(getNativeView(), rightHash.getNativeView(), + rightHash.getCompareNulls()); + return buildJoinGatherMaps(gatherMapData); + } + + /** + * Computes the gather maps that can be used to manifest the result of a full equi-join between + * two tables. It is assumed this table instance holds the key columns from the left table, and + * the {@link HashJoin} argument has been constructed from the key columns from the right table. 
+ * Two {@link GatherMap} instances will be returned that can be used to gather the left and right + * tables, respectively, to produce the result of the full join. + * It is the responsibility of the caller to close the resulting gather map instances. + * This interface allows passing an output row count that was previously computed from + * {@link #fullJoinRowCount(HashJoin)}. + * WARNING: Passing a row count that is smaller than the actual row count will result + * in undefined behavior. + * @param rightHash hash table built from join key columns from the right table + * @param outputRowCount number of output rows in the join result + * @return left and right table gather maps + */ + public GatherMap[] fullJoinGatherMaps(HashJoin rightHash, long outputRowCount) { + if (getNumberOfColumns() != rightHash.getNumberOfColumns()) { + throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + + "rightKeys: " + rightHash.getNumberOfColumns()); + } + long[] gatherMapData = + fullHashJoinGatherMapsWithCount(getNativeView(), rightHash.getNativeView(), + rightHash.getCompareNulls(), outputRowCount); + return buildJoinGatherMaps(gatherMapData); + } + /** * Computes the gather maps that can be used to manifest the result of a full join between * two tables when a conditional expression is true. It is assumed this table instance holds @@ -2092,12 +2450,9 @@ public GatherMap[] fullJoinGatherMaps(Table rightKeys, boolean compareNullsEqual * @param compareNullsEqual true if null key values should match otherwise false * @return left and right table gather maps */ - public GatherMap[] fullJoinGatherMaps(Table rightTable, CompiledExpression condition, - boolean compareNullsEqual) { - if (getNumberOfColumns() != rightTable.getNumberOfColumns()) { - throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + - "rightKeys: " + rightTable.getNumberOfColumns()); - } + public GatherMap[] conditionalFullJoinGatherMaps(Table rightTable, + CompiledExpression condition, + boolean compareNullsEqual) { long[] gatherMapData = conditionalFullJoinGatherMaps(getNativeView(), rightTable.getNativeView(), condition.getNativeHandle(), compareNullsEqual); @@ -2132,6 +2487,22 @@ public GatherMap leftSemiJoinGatherMap(Table rightKeys, boolean compareNullsEqua return buildSemiJoinGatherMap(gatherMapData); } + /** + * Computes the number of rows from the result of a left semi join between two tables when a + * conditional expression is true. It is assumed this table instance holds the columns from + * the left table, and the table argument represents the columns from the right table. + * @param rightTable the right side table of the join in the join + * @param condition conditional expression to evaluate during the join + * @param compareNullsEqual true if null key values should match otherwise false + * @return row count for the join result + */ + public long conditionalLeftSemiJoinRowCount(Table rightTable, + CompiledExpression condition, + boolean compareNullsEqual) { + return conditionalLeftSemiJoinRowCount(getNativeView(), rightTable.getNativeView(), + condition.getNativeHandle(), compareNullsEqual); + } + /** * Computes the gather map that can be used to manifest the result of a left semi join between * two tables when a conditional expression is true. 
It is assumed this table instance holds @@ -2144,18 +2515,42 @@ public GatherMap leftSemiJoinGatherMap(Table rightKeys, boolean compareNullsEqua * @param compareNullsEqual true if null key values should match otherwise false * @return left table gather map */ - public GatherMap leftSemiJoinGatherMap(Table rightTable, CompiledExpression condition, - boolean compareNullsEqual) { - if (getNumberOfColumns() != rightTable.getNumberOfColumns()) { - throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + - "rightKeys: " + rightTable.getNumberOfColumns()); - } + public GatherMap conditionalLeftSemiJoinGatherMap(Table rightTable, + CompiledExpression condition, + boolean compareNullsEqual) { long[] gatherMapData = conditionalLeftSemiJoinGatherMap(getNativeView(), rightTable.getNativeView(), condition.getNativeHandle(), compareNullsEqual); return buildSemiJoinGatherMap(gatherMapData); } + /** + * Computes the gather map that can be used to manifest the result of a left semi join between + * two tables when a conditional expression is true. It is assumed this table instance holds + * the columns from the left table, and the table argument represents the columns from the + * right table. The {@link GatherMap} instance returned can be used to gather the left table + * to produce the result of the left semi join. + * It is the responsibility of the caller to close the resulting gather map instance. + * This interface allows passing an output row count that was previously computed from + * {@link #conditionalLeftSemiJoinRowCount(Table, CompiledExpression, boolean)}. + * WARNING: Passing a row count that is smaller than the actual row count will result + * in undefined behavior. + * @param rightTable the right side table of the join + * @param condition conditional expression to evaluate during the join + * @param compareNullsEqual true if null key values should match otherwise false + * @param outputRowCount number of output rows in the join result + * @return left table gather map + */ + public GatherMap conditionalLeftSemiJoinGatherMap(Table rightTable, + CompiledExpression condition, + boolean compareNullsEqual, + long outputRowCount) { + long[] gatherMapData = + conditionalLeftSemiJoinGatherMapWithCount(getNativeView(), rightTable.getNativeView(), + condition.getNativeHandle(), compareNullsEqual, outputRowCount); + return buildSemiJoinGatherMap(gatherMapData); + } + /** * Computes the gather map that can be used to manifest the result of a left anti-join between * two tables. It is assumed this table instance holds the key columns from the left table, and @@ -2177,6 +2572,22 @@ public GatherMap leftAntiJoinGatherMap(Table rightKeys, boolean compareNullsEqua return buildSemiJoinGatherMap(gatherMapData); } + /** + * Computes the number of rows from the result of a left anti join between two tables when a + * conditional expression is true. It is assumed this table instance holds the columns from + * the left table, and the table argument represents the columns from the right table. 
+ * @param rightTable the right side table of the join in the join + * @param condition conditional expression to evaluate during the join + * @param compareNullsEqual true if null key values should match otherwise false + * @return row count for the join result + */ + public long conditionalLeftAntiJoinRowCount(Table rightTable, + CompiledExpression condition, + boolean compareNullsEqual) { + return conditionalLeftAntiJoinRowCount(getNativeView(), rightTable.getNativeView(), + condition.getNativeHandle(), compareNullsEqual); + } + /** * Computes the gather map that can be used to manifest the result of a left anti join between * two tables when a conditional expression is true. It is assumed this table instance holds @@ -2189,18 +2600,42 @@ public GatherMap leftAntiJoinGatherMap(Table rightKeys, boolean compareNullsEqua * @param compareNullsEqual true if null key values should match otherwise false * @return left table gather map */ - public GatherMap leftAntiJoinGatherMap(Table rightTable, CompiledExpression condition, - boolean compareNullsEqual) { - if (getNumberOfColumns() != rightTable.getNumberOfColumns()) { - throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + - "rightKeys: " + rightTable.getNumberOfColumns()); - } + public GatherMap conditionalLeftAntiJoinGatherMap(Table rightTable, + CompiledExpression condition, + boolean compareNullsEqual) { long[] gatherMapData = conditionalLeftAntiJoinGatherMap(getNativeView(), rightTable.getNativeView(), condition.getNativeHandle(), compareNullsEqual); return buildSemiJoinGatherMap(gatherMapData); } + /** + * Computes the gather map that can be used to manifest the result of a left anti join between + * two tables when a conditional expression is true. It is assumed this table instance holds + * the columns from the left table, and the table argument represents the columns from the + * right table. The {@link GatherMap} instance returned can be used to gather the left table + * to produce the result of the left anti join. + * It is the responsibility of the caller to close the resulting gather map instance. + * This interface allows passing an output row count that was previously computed from + * {@link #conditionalLeftAntiJoinRowCount(Table, CompiledExpression, boolean)}. + * WARNING: Passing a row count that is smaller than the actual row count will result + * in undefined behavior. + * @param rightTable the right side table of the join + * @param condition conditional expression to evaluate during the join + * @param compareNullsEqual true if null key values should match otherwise false + * @param outputRowCount number of output rows in the join result + * @return left table gather map + */ + public GatherMap conditionalLeftAntiJoinGatherMap(Table rightTable, + CompiledExpression condition, + boolean compareNullsEqual, + long outputRowCount) { + long[] gatherMapData = + conditionalLeftAntiJoinGatherMapWithCount(getNativeView(), rightTable.getNativeView(), + condition.getNativeHandle(), compareNullsEqual, outputRowCount); + return buildSemiJoinGatherMap(gatherMapData); + } + /** * Convert this table of columns into a row major format that is useful for interacting with other * systems that do row major processing of the data. Currently only fixed-width column types are @@ -2456,7 +2891,7 @@ public static final class GroupByOperation { * 1, 2 * 2, 1 ==> aggregated count */ - public Table aggregate(AggregationOnColumn... aggregates) { + public Table aggregate(GroupByAggregationOnColumn... 
aggregates) { assert aggregates != null; // To improve performance and memory we want to remove duplicate operations @@ -2469,9 +2904,9 @@ public Table aggregate(AggregationOnColumn... aggregates) { int keysLength = operation.indices.length; int totalOps = 0; for (int outputIndex = 0; outputIndex < aggregates.length; outputIndex++) { - AggregationOnColumn agg = aggregates[outputIndex]; + GroupByAggregationOnColumn agg = aggregates[outputIndex]; ColumnOps ops = groupedOps.computeIfAbsent(agg.getColumnIndex(), (idx) -> new ColumnOps()); - totalOps += ops.add(agg, outputIndex + keysLength); + totalOps += ops.add(agg.getWrapped().getWrapped(), outputIndex + keysLength); } int[] aggColumnIndexes = new int[totalOps]; long[] aggOperationInstances = new long[totalOps]; @@ -2808,7 +3243,7 @@ public Table aggregateWindowsOverRanges(AggregationOverWindow... windowAggregate } } - public Table scan(AggregationOnColumn... aggregates) { + public Table scan(GroupByScanAggregationOnColumn... aggregates) { assert aggregates != null; // To improve performance and memory we want to remove duplicate operations @@ -2821,9 +3256,9 @@ public Table scan(AggregationOnColumn... aggregates) { int keysLength = operation.indices.length; int totalOps = 0; for (int outputIndex = 0; outputIndex < aggregates.length; outputIndex++) { - AggregationOnColumn agg = aggregates[outputIndex]; + GroupByScanAggregationOnColumn agg = aggregates[outputIndex]; ColumnOps ops = groupedOps.computeIfAbsent(agg.getColumnIndex(), (idx) -> new ColumnOps()); - totalOps += ops.add(agg, outputIndex + keysLength); + totalOps += ops.add(agg.getWrapped().getWrapped(), outputIndex + keysLength); } int[] aggColumnIndexes = new int[totalOps]; long[] aggOperationInstances = new long[totalOps]; diff --git a/java/src/main/java/ai/rapids/cudf/ast/AstNode.java b/java/src/main/java/ai/rapids/cudf/ast/AstExpression.java similarity index 82% rename from java/src/main/java/ai/rapids/cudf/ast/AstNode.java rename to java/src/main/java/ai/rapids/cudf/ast/AstExpression.java index 78cf39b05d2..5ac15f714f0 100644 --- a/java/src/main/java/ai/rapids/cudf/ast/AstNode.java +++ b/java/src/main/java/ai/rapids/cudf/ast/AstExpression.java @@ -17,14 +17,15 @@ package ai.rapids.cudf.ast; import java.nio.ByteBuffer; +import java.nio.ByteOrder; /** Base class of every node in an AST */ -abstract class AstNode { +public abstract class AstExpression { /** * Enumeration for the types of AST nodes that can appear in a serialized AST. * NOTE: This must be kept in sync with the `jni_serialized_node_type` in CompiledExpression.cpp! 
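The `aggregate` and `scan` entry points above now take the dedicated `GroupByAggregationOnColumn` and `GroupByScanAggregationOnColumn` wrappers instead of the generic `AggregationOnColumn`. A hedged sketch of an aggregate call site, assuming a `GroupByAggregation` factory class that produces those wrappers (it is not shown in this hunk):

```java
// Hypothetical call site: group rows by column 0 and sum column 1 using the
// groupby-specific aggregation type required by the new aggregate() signature.
try (Table result = input.groupBy(0)
         .aggregate(GroupByAggregation.sum().onColumn(1))) {
  // result contains the group keys followed by the per-group sums
}
```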
*/ - protected enum NodeType { + protected enum ExpressionType { VALID_LITERAL(0), NULL_LITERAL(1), COLUMN_REFERENCE(2), @@ -33,7 +34,7 @@ protected enum NodeType { private final byte nativeId; - NodeType(int nativeId) { + ExpressionType(int nativeId) { this.nativeId = (byte) nativeId; assert this.nativeId == nativeId; } @@ -49,6 +50,14 @@ void serialize(ByteBuffer bb) { } } + public CompiledExpression compile() { + int size = getSerializedSize(); + ByteBuffer bb = ByteBuffer.allocate(size); + bb.order(ByteOrder.nativeOrder()); + serialize(bb); + return new CompiledExpression(bb.array()); + } + /** Get the size in bytes of the serialized form of this node and all child nodes */ abstract int getSerializedSize(); diff --git a/java/src/main/java/ai/rapids/cudf/ast/BinaryExpression.java b/java/src/main/java/ai/rapids/cudf/ast/BinaryOperation.java similarity index 72% rename from java/src/main/java/ai/rapids/cudf/ast/BinaryExpression.java rename to java/src/main/java/ai/rapids/cudf/ast/BinaryOperation.java index ed4f95b01e1..c39c1c3a1c5 100644 --- a/java/src/main/java/ai/rapids/cudf/ast/BinaryExpression.java +++ b/java/src/main/java/ai/rapids/cudf/ast/BinaryOperation.java @@ -18,13 +18,13 @@ import java.nio.ByteBuffer; -/** A binary expression consisting of an operator and two operands. */ -public class BinaryExpression extends Expression { +/** A binary operation consisting of an operator and two operands. */ +public class BinaryOperation extends AstExpression { private final BinaryOperator op; - private final AstNode leftInput; - private final AstNode rightInput; + private final AstExpression leftInput; + private final AstExpression rightInput; - public BinaryExpression(BinaryOperator op, AstNode leftInput, AstNode rightInput) { + public BinaryOperation(BinaryOperator op, AstExpression leftInput, AstExpression rightInput) { this.op = op; this.leftInput = leftInput; this.rightInput = rightInput; @@ -32,7 +32,7 @@ public BinaryExpression(BinaryOperator op, AstNode leftInput, AstNode rightInput @Override int getSerializedSize() { - return NodeType.BINARY_EXPRESSION.getSerializedSize() + + return ExpressionType.BINARY_EXPRESSION.getSerializedSize() + op.getSerializedSize() + leftInput.getSerializedSize() + rightInput.getSerializedSize(); @@ -40,7 +40,7 @@ int getSerializedSize() { @Override void serialize(ByteBuffer bb) { - NodeType.BINARY_EXPRESSION.serialize(bb); + ExpressionType.BINARY_EXPRESSION.serialize(bb); op.serialize(bb); leftInput.serialize(bb); rightInput.serialize(bb); diff --git a/java/src/main/java/ai/rapids/cudf/ast/BinaryOperator.java b/java/src/main/java/ai/rapids/cudf/ast/BinaryOperator.java index 12e4d985658..595badb14b6 100644 --- a/java/src/main/java/ai/rapids/cudf/ast/BinaryOperator.java +++ b/java/src/main/java/ai/rapids/cudf/ast/BinaryOperator.java @@ -19,7 +19,7 @@ import java.nio.ByteBuffer; /** - * Enumeration of AST operations that can appear in a binary expression. + * Enumeration of AST operators that can appear in a binary operation. * NOTE: This must be kept in sync with `jni_to_binary_operator` in CompiledExpression.cpp! */ public enum BinaryOperator { diff --git a/java/src/main/java/ai/rapids/cudf/ast/ColumnReference.java b/java/src/main/java/ai/rapids/cudf/ast/ColumnReference.java index 34e4064e23b..4860a088a83 100644 --- a/java/src/main/java/ai/rapids/cudf/ast/ColumnReference.java +++ b/java/src/main/java/ai/rapids/cudf/ast/ColumnReference.java @@ -19,7 +19,7 @@ import java.nio.ByteBuffer; /** A reference to a column in an input table. 
*/ -public final class ColumnReference extends AstNode { +public final class ColumnReference extends AstExpression { private final int columnIndex; private final TableReference tableSource; @@ -37,14 +37,14 @@ public ColumnReference(int columnIndex, TableReference tableSource) { @Override int getSerializedSize() { // node type + table ref + column index - return NodeType.COLUMN_REFERENCE.getSerializedSize() + + return ExpressionType.COLUMN_REFERENCE.getSerializedSize() + tableSource.getSerializedSize() + Integer.BYTES; } @Override void serialize(ByteBuffer bb) { - NodeType.COLUMN_REFERENCE.serialize(bb); + ExpressionType.COLUMN_REFERENCE.serialize(bb); tableSource.serialize(bb); bb.putInt(columnIndex); } diff --git a/java/src/main/java/ai/rapids/cudf/ast/CompiledExpression.java b/java/src/main/java/ai/rapids/cudf/ast/CompiledExpression.java index 0949b09cbb0..ea5dc003844 100644 --- a/java/src/main/java/ai/rapids/cudf/ast/CompiledExpression.java +++ b/java/src/main/java/ai/rapids/cudf/ast/CompiledExpression.java @@ -18,12 +18,17 @@ import ai.rapids.cudf.ColumnVector; import ai.rapids.cudf.MemoryCleaner; +import ai.rapids.cudf.NativeDepsLoader; import ai.rapids.cudf.Table; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** This class wraps a native compiled AST and must be closed to avoid native memory leaks. */ public class CompiledExpression implements AutoCloseable { + static { + NativeDepsLoader.loadNativeDeps(); + } + private static final Logger log = LoggerFactory.getLogger(CompiledExpression.class); private static class CompiledExpressionCleaner extends MemoryCleaner.Cleaner { diff --git a/java/src/main/java/ai/rapids/cudf/ast/Literal.java b/java/src/main/java/ai/rapids/cudf/ast/Literal.java index be306cd99c4..b93efce8c94 100644 --- a/java/src/main/java/ai/rapids/cudf/ast/Literal.java +++ b/java/src/main/java/ai/rapids/cudf/ast/Literal.java @@ -22,7 +22,7 @@ import java.nio.ByteOrder; /** A literal value in an AST expression. */ -public final class Literal extends AstNode { +public final class Literal extends AstExpression { private final DType type; private final byte[] serializedValue; @@ -207,8 +207,8 @@ public static Literal ofDurationFromLong(DType type, Long value) { @Override int getSerializedSize() { - NodeType nodeType = serializedValue != null - ? NodeType.VALID_LITERAL : NodeType.NULL_LITERAL; + ExpressionType nodeType = serializedValue != null + ? ExpressionType.VALID_LITERAL : ExpressionType.NULL_LITERAL; int size = nodeType.getSerializedSize() + getDataTypeSerializedSize(); if (serializedValue != null) { size += serializedValue.length; @@ -218,8 +218,8 @@ int getSerializedSize() { @Override void serialize(ByteBuffer bb) { - NodeType nodeType = serializedValue != null - ? NodeType.VALID_LITERAL : NodeType.NULL_LITERAL; + ExpressionType nodeType = serializedValue != null + ? 
ExpressionType.VALID_LITERAL : ExpressionType.NULL_LITERAL; nodeType.serialize(bb); serializeDataType(bb); if (serializedValue != null) { diff --git a/java/src/main/java/ai/rapids/cudf/ast/UnaryExpression.java b/java/src/main/java/ai/rapids/cudf/ast/UnaryOperation.java similarity index 73% rename from java/src/main/java/ai/rapids/cudf/ast/UnaryExpression.java rename to java/src/main/java/ai/rapids/cudf/ast/UnaryOperation.java index fa8e70266ac..03c4c45afd4 100644 --- a/java/src/main/java/ai/rapids/cudf/ast/UnaryExpression.java +++ b/java/src/main/java/ai/rapids/cudf/ast/UnaryOperation.java @@ -18,26 +18,26 @@ import java.nio.ByteBuffer; -/** A unary expression consisting of an operator and an operand. */ -public final class UnaryExpression extends Expression { +/** A unary operation consisting of an operator and an operand. */ +public final class UnaryOperation extends AstExpression { private final UnaryOperator op; - private final AstNode input; + private final AstExpression input; - public UnaryExpression(UnaryOperator op, AstNode input) { + public UnaryOperation(UnaryOperator op, AstExpression input) { this.op = op; this.input = input; } @Override int getSerializedSize() { - return NodeType.UNARY_EXPRESSION.getSerializedSize() + + return ExpressionType.UNARY_EXPRESSION.getSerializedSize() + op.getSerializedSize() + input.getSerializedSize(); } @Override void serialize(ByteBuffer bb) { - NodeType.UNARY_EXPRESSION.serialize(bb); + ExpressionType.UNARY_EXPRESSION.serialize(bb); op.serialize(bb); input.serialize(bb); } diff --git a/java/src/main/java/ai/rapids/cudf/ast/UnaryOperator.java b/java/src/main/java/ai/rapids/cudf/ast/UnaryOperator.java index c3f193d06b4..9ef18dbd75d 100644 --- a/java/src/main/java/ai/rapids/cudf/ast/UnaryOperator.java +++ b/java/src/main/java/ai/rapids/cudf/ast/UnaryOperator.java @@ -19,7 +19,7 @@ import java.nio.ByteBuffer; /** - * Enumeration of AST operations that can appear in a unary expression. + * Enumeration of AST operators that can appear in a unary operation. * NOTE: This must be kept in sync with `jni_to_unary_operator` in CompiledExpression.cpp! */ public enum UnaryOperator { diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt index a938a2af456..bc59e3aee64 100755 --- a/java/src/main/native/CMakeLists.txt +++ b/java/src/main/native/CMakeLists.txt @@ -186,7 +186,8 @@ endif(CUDF_JNI_ARROW_STATIC) find_library(ARROW_LIBRARY ${CUDF_JNI_ARROW_LIBNAME} REQUIRED HINTS "$ENV{ARROW_ROOT}/lib" - "${CUDF_CPP_BUILD_DIR}/_deps/arrow-build/release") + "${CUDF_CPP_BUILD_DIR}/_deps/arrow-build/release" + "${CUDF_CPP_BUILD_DIR}/_deps/arrow-build/debug") if(NOT ARROW_LIBRARY) if(CUDF_JNI_ARROW_STATIC) @@ -263,6 +264,7 @@ set(SOURCE_FILES "src/ColumnViewJni.cpp" "src/CompiledExpression.cpp" "src/ContiguousTableJni.cpp" + "src/HashJoinJni.cpp" "src/HostMemoryBufferNativeUtilsJni.cpp" "src/NvcompJni.cpp" "src/NvtxRangeJni.cpp" diff --git a/java/src/main/native/src/CompiledExpression.cpp b/java/src/main/native/src/CompiledExpression.cpp index 31f3184f107..470464f35c8 100644 --- a/java/src/main/native/src/CompiledExpression.cpp +++ b/java/src/main/native/src/CompiledExpression.cpp @@ -18,11 +18,10 @@ #include #include -#include -#include -#include +#include #include #include +#include #include #include "cudf_jni_apis.hpp" @@ -104,15 +103,15 @@ class jni_serialized_ast { }; /** - * Enumeration of the AST node types that can appear in the serialized data. + * Enumeration of the AST expression types that can appear in the serialized data. 
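On the Java side, the renamed classes above compose into an expression tree, and `AstExpression.compile()` serializes that tree into exactly the byte stream this native decoder consumes. A minimal sketch, assuming `TableReference.LEFT`/`RIGHT` and `BinaryOperator.LESS` from the same `ast` package (neither appears in this hunk):

```java
// Build (left.col0 < right.col0), compile it, and use it as a join condition.
AstExpression expr = new BinaryOperation(BinaryOperator.LESS,
    new ColumnReference(0, TableReference.LEFT),
    new ColumnReference(0, TableReference.RIGHT));
try (CompiledExpression condition = expr.compile()) {
  // pass `condition` to the conditional join APIs on Table
}
```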
* NOTE: This must be kept in sync with the NodeType enumeration in AstNode.java! */ -enum class jni_serialized_node_type : int8_t { +enum class jni_serialized_expression_type : int8_t { VALID_LITERAL = 0, NULL_LITERAL = 1, COLUMN_REFERENCE = 2, - UNARY_EXPRESSION = 3, - BINARY_EXPRESSION = 4 + UNARY_OPERATION = 3, + BINARY_OPERATION = 4 }; /** @@ -276,41 +275,42 @@ cudf::ast::column_reference &compile_column_reference(cudf::jni::ast::compiled_e } // forward declaration -cudf::ast::detail::node &compile_node(cudf::jni::ast::compiled_expr &compiled_expr, - jni_serialized_ast &jni_ast); +cudf::ast::expression &compile_expression(cudf::jni::ast::compiled_expr &compiled_expr, + jni_serialized_ast &jni_ast); /** Decode a serialized AST unary expression */ -cudf::ast::expression &compile_unary_expression(cudf::jni::ast::compiled_expr &compiled_expr, - jni_serialized_ast &jni_ast) { +cudf::ast::operation &compile_unary_expression(cudf::jni::ast::compiled_expr &compiled_expr, + jni_serialized_ast &jni_ast) { auto const ast_op = jni_to_unary_operator(jni_ast.read_byte()); - cudf::ast::detail::node &child_node = compile_node(compiled_expr, jni_ast); - return compiled_expr.add_expression(std::make_unique(ast_op, child_node)); + cudf::ast::expression &child_expression = compile_expression(compiled_expr, jni_ast); + return compiled_expr.add_operation( + std::make_unique(ast_op, child_expression)); } /** Decode a serialized AST binary expression */ -cudf::ast::expression &compile_binary_expression(cudf::jni::ast::compiled_expr &compiled_expr, - jni_serialized_ast &jni_ast) { +cudf::ast::operation &compile_binary_expression(cudf::jni::ast::compiled_expr &compiled_expr, + jni_serialized_ast &jni_ast) { auto const ast_op = jni_to_binary_operator(jni_ast.read_byte()); - cudf::ast::detail::node &left_child = compile_node(compiled_expr, jni_ast); - cudf::ast::detail::node &right_child = compile_node(compiled_expr, jni_ast); - return compiled_expr.add_expression( - std::make_unique(ast_op, left_child, right_child)); + cudf::ast::expression &left_child = compile_expression(compiled_expr, jni_ast); + cudf::ast::expression &right_child = compile_expression(compiled_expr, jni_ast); + return compiled_expr.add_operation( + std::make_unique(ast_op, left_child, right_child)); } -/** Decode a serialized AST node by reading the node type and dispatching */ -cudf::ast::detail::node &compile_node(cudf::jni::ast::compiled_expr &compiled_expr, - jni_serialized_ast &jni_ast) { - auto const node_type = static_cast(jni_ast.read_byte()); - switch (node_type) { - case jni_serialized_node_type::VALID_LITERAL: +/** Decode a serialized AST expression by reading the expression type and dispatching */ +cudf::ast::expression &compile_expression(cudf::jni::ast::compiled_expr &compiled_expr, + jni_serialized_ast &jni_ast) { + auto const expression_type = static_cast(jni_ast.read_byte()); + switch (expression_type) { + case jni_serialized_expression_type::VALID_LITERAL: return compile_literal(true, compiled_expr, jni_ast); - case jni_serialized_node_type::NULL_LITERAL: + case jni_serialized_expression_type::NULL_LITERAL: return compile_literal(false, compiled_expr, jni_ast); - case jni_serialized_node_type::COLUMN_REFERENCE: + case jni_serialized_expression_type::COLUMN_REFERENCE: return compile_column_reference(compiled_expr, jni_ast); - case jni_serialized_node_type::UNARY_EXPRESSION: + case jni_serialized_expression_type::UNARY_OPERATION: return compile_unary_expression(compiled_expr, jni_ast); - case 
jni_serialized_node_type::BINARY_EXPRESSION: + case jni_serialized_expression_type::BINARY_OPERATION: return compile_binary_expression(compiled_expr, jni_ast); default: throw std::invalid_argument("data is not a serialized AST expression"); } @@ -319,16 +319,7 @@ cudf::ast::detail::node &compile_node(cudf::jni::ast::compiled_expr &compiled_ex /** Decode a serialized AST into a native libcudf AST and associated resources */ std::unique_ptr compile_serialized_ast(jni_serialized_ast &jni_ast) { auto jni_expr_ptr = std::make_unique(); - auto const node_type = static_cast(jni_ast.read_byte()); - switch (node_type) { - case jni_serialized_node_type::UNARY_EXPRESSION: - (void)compile_unary_expression(*jni_expr_ptr, jni_ast); - break; - case jni_serialized_node_type::BINARY_EXPRESSION: - (void)compile_binary_expression(*jni_expr_ptr, jni_ast); - break; - default: throw std::invalid_argument("data is not a serialized AST expression"); - } + (void)compile_expression(*jni_expr_ptr, jni_ast); if (!jni_ast.at_eof()) { throw std::invalid_argument("Extra bytes at end of serialized AST"); @@ -366,7 +357,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ast_CompiledExpression_computeColumn auto compiled_expr_ptr = reinterpret_cast(j_ast); auto tview_ptr = reinterpret_cast(j_table); std::unique_ptr result = - cudf::ast::compute_column(*tview_ptr, compiled_expr_ptr->get_top_expression()); + cudf::compute_column(*tview_ptr, compiled_expr_ptr->get_top_expression()); return reinterpret_cast(result.release()); } CATCH_STD(env, 0); diff --git a/java/src/main/native/src/HashJoinJni.cpp b/java/src/main/native/src/HashJoinJni.cpp new file mode 100644 index 00000000000..0f78aef64bc --- /dev/null +++ b/java/src/main/native/src/HashJoinJni.cpp @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include "cudf_jni_apis.hpp" + +extern "C" { + +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_HashJoin_create(JNIEnv *env, jclass, jlong j_table, + jboolean j_nulls_equal) { + JNI_NULL_CHECK(env, j_table, "table handle is null", 0); + try { + cudf::jni::auto_set_device(env); + auto tview = reinterpret_cast(j_table); + auto nulleq = j_nulls_equal ? 
cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL; + auto hash_join_ptr = new cudf::hash_join(*tview, nulleq); + return reinterpret_cast(hash_join_ptr); + } + CATCH_STD(env, 0); +} + +JNIEXPORT void JNICALL Java_ai_rapids_cudf_HashJoin_destroy(JNIEnv *env, jclass, jlong j_handle) { + try { + cudf::jni::auto_set_device(env); + auto hash_join_ptr = reinterpret_cast(j_handle); + delete hash_join_ptr; + } + CATCH_STD(env, ); +} + +} // extern "C" diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index c092450da1c..2bb56565f7a 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -668,21 +668,26 @@ namespace { int set_column_metadata(cudf::io::column_in_metadata &column_metadata, std::vector &col_names, cudf::jni::native_jbooleanArray &nullability, - cudf::jni::native_jbooleanArray &isInt96, + cudf::jni::native_jbooleanArray &is_int96, cudf::jni::native_jintArray &precisions, + cudf::jni::native_jbooleanArray &is_map, cudf::jni::native_jintArray &children, int num_children, int read_index) { int write_index = 0; for (int i = 0; i < num_children; i++, write_index++) { cudf::io::column_in_metadata child; child.set_name(col_names[read_index]) .set_decimal_precision(precisions[read_index]) - .set_int96_timestamps(isInt96[read_index]) + .set_int96_timestamps(is_int96[read_index]) .set_nullability(nullability[read_index]); + if (is_map[read_index]) { + child.set_list_column_as_map(); + } column_metadata.add_child(child); int childs_children = children[read_index++]; if (childs_children > 0) { - read_index = set_column_metadata(column_metadata.child(write_index), col_names, nullability, - isInt96, precisions, children, childs_children, read_index); + read_index = + set_column_metadata(column_metadata.child(write_index), col_names, nullability, is_int96, + precisions, is_map, children, childs_children, read_index); } } return read_index; @@ -692,7 +697,8 @@ void createTableMetaData(JNIEnv *env, jint num_children, jobjectArray &j_col_nam jintArray &j_children, jbooleanArray &j_col_nullability, jobjectArray &j_metadata_keys, jobjectArray &j_metadata_values, jint j_compression, jint j_stats_freq, jbooleanArray &j_isInt96, - jintArray &j_precisions, cudf::io::table_input_metadata &metadata) { + jintArray &j_precisions, jbooleanArray &j_is_map, + cudf::io::table_input_metadata &metadata) { cudf::jni::auto_set_device(env); cudf::jni::native_jstringArray col_names(env, j_col_names); cudf::jni::native_jbooleanArray col_nullability(env, j_col_nullability); @@ -701,6 +707,7 @@ void createTableMetaData(JNIEnv *env, jint num_children, jobjectArray &j_col_nam cudf::jni::native_jstringArray meta_values(env, j_metadata_values); cudf::jni::native_jintArray precisions(env, j_precisions); cudf::jni::native_jintArray children(env, j_children); + cudf::jni::native_jbooleanArray is_map(env, j_is_map); auto cpp_names = col_names.as_cpp_vector(); @@ -714,11 +721,14 @@ void createTableMetaData(JNIEnv *env, jint num_children, jobjectArray &j_col_nam .set_nullability(col_nullability[read_index]) .set_int96_timestamps(isInt96[read_index]) .set_decimal_precision(precisions[read_index]); + if (is_map[read_index]) { + metadata.column_metadata[write_index].set_list_column_as_map(); + } int childs_children = children[read_index++]; if (childs_children > 0) { read_index = set_column_metadata(metadata.column_metadata[write_index], cpp_names, col_nullability, - isInt96, precisions, children, childs_children, read_index); + isInt96, precisions, 
is_map, children, childs_children, read_index); } } for (auto i = 0; i < meta_keys.size(); ++i) { @@ -745,13 +755,46 @@ bool valid_window_parameters(native_jintArray const &values, values.size() == preceding.size() && values.size() == following.size(); } -// Generate gather maps needed to manifest the result of an equi-join between two tables. +// Convert a cudf gather map pair into the form that Java expects // The resulting Java long array contains the following at each index: // 0: Size of each gather map in bytes // 1: Device address of the gather map for the left table // 2: Host address of the rmm::device_buffer instance that owns the left gather map data // 3: Device address of the gather map for the right table // 4: Host address of the rmm::device_buffer instance that owns the right gather map data +jlongArray gather_maps_to_java(JNIEnv *env, + std::pair>, + std::unique_ptr>> + maps) { + // release the underlying device buffer to Java + auto left_map_buffer = std::make_unique(maps.first->release()); + auto right_map_buffer = std::make_unique(maps.second->release()); + cudf::jni::native_jlongArray result(env, 5); + result[0] = static_cast(left_map_buffer->size()); + result[1] = reinterpret_cast(left_map_buffer->data()); + result[2] = reinterpret_cast(left_map_buffer.release()); + result[3] = reinterpret_cast(right_map_buffer->data()); + result[4] = reinterpret_cast(right_map_buffer.release()); + return result.get_jArray(); +} + +// Convert a cudf gather map into the form that Java expects +// The resulting Java long array contains the following at each index: +// 0: Size of the gather map in bytes +// 1: Device address of the gather map +// 2: Host address of the rmm::device_buffer instance that owns the gather map data +jlongArray gather_map_to_java(JNIEnv *env, + std::unique_ptr> map) { + // release the underlying device buffer to Java + auto gather_map_buffer = std::make_unique(map->release()); + cudf::jni::native_jlongArray result(env, 3); + result[0] = static_cast(gather_map_buffer->size()); + result[1] = reinterpret_cast(gather_map_buffer->data()); + result[2] = reinterpret_cast(gather_map_buffer.release()); + return result.get_jArray(); +} + +// Generate gather maps needed to manifest the result of an equi-join between two tables. template jlongArray join_gather_maps(JNIEnv *env, jlong j_left_keys, jlong j_right_keys, jboolean compare_nulls_equal, T join_func) { @@ -762,31 +805,29 @@ jlongArray join_gather_maps(JNIEnv *env, jlong j_left_keys, jlong j_right_keys, auto left_keys = reinterpret_cast(j_left_keys); auto right_keys = reinterpret_cast(j_right_keys); auto nulleq = compare_nulls_equal ? 
cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL; - std::pair>, - std::unique_ptr>> - join_maps = join_func(*left_keys, *right_keys, nulleq); - - // release the underlying device buffer to Java - auto left_map_buffer = std::make_unique(join_maps.first->release()); - auto right_map_buffer = std::make_unique(join_maps.second->release()); - cudf::jni::native_jlongArray result(env, 5); - result[0] = static_cast(left_map_buffer->size()); - result[1] = reinterpret_cast(left_map_buffer->data()); - result[2] = reinterpret_cast(left_map_buffer.release()); - result[3] = reinterpret_cast(right_map_buffer->data()); - result[4] = reinterpret_cast(right_map_buffer.release()); - return result.get_jArray(); + return gather_maps_to_java(env, join_func(*left_keys, *right_keys, nulleq)); + } + CATCH_STD(env, NULL); +} + +// Generate gather maps needed to manifest the result of an equi-join between a left table and +// a hash table built from the join's right table. +template +jlongArray hash_join_gather_maps(JNIEnv *env, jlong j_left_keys, jlong j_right_hash_join, + jboolean compare_nulls_equal, T join_func) { + JNI_NULL_CHECK(env, j_left_keys, "left table is null", NULL); + JNI_NULL_CHECK(env, j_right_hash_join, "hash join is null", NULL); + try { + cudf::jni::auto_set_device(env); + auto left_keys = reinterpret_cast(j_left_keys); + auto hash_join = reinterpret_cast(j_right_hash_join); + auto nulleq = compare_nulls_equal ? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL; + return gather_maps_to_java(env, join_func(*left_keys, *hash_join, nulleq)); } CATCH_STD(env, NULL); } // Generate gather maps needed to manifest the result of a conditional join between two tables. -// The resulting Java long array contains the following at each index: -// 0: Size of each gather map in bytes -// 1: Device address of the gather map for the left table -// 2: Host address of the rmm::device_buffer instance that owns the left gather map data -// 3: Device address of the gather map for the right table -// 4: Host address of the rmm::device_buffer instance that owns the right gather map data template jlongArray cond_join_gather_maps(JNIEnv *env, jlong j_left_table, jlong j_right_table, jlong j_condition, jboolean compare_nulls_equal, T join_func) { @@ -799,29 +840,13 @@ jlongArray cond_join_gather_maps(JNIEnv *env, jlong j_left_table, jlong j_right_ auto right_table = reinterpret_cast(j_right_table); auto condition = reinterpret_cast(j_condition); auto nulleq = compare_nulls_equal ? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL; - std::pair>, - std::unique_ptr>> - join_maps = join_func(*left_table, *right_table, condition->get_top_expression(), nulleq); - - // release the underlying device buffer to Java - auto left_map_buffer = std::make_unique(join_maps.first->release()); - auto right_map_buffer = std::make_unique(join_maps.second->release()); - cudf::jni::native_jlongArray result(env, 5); - result[0] = static_cast(left_map_buffer->size()); - result[1] = reinterpret_cast(left_map_buffer->data()); - result[2] = reinterpret_cast(left_map_buffer.release()); - result[3] = reinterpret_cast(right_map_buffer->data()); - result[4] = reinterpret_cast(right_map_buffer.release()); - return result.get_jArray(); + return gather_maps_to_java( + env, join_func(*left_table, *right_table, condition->get_top_expression(), nulleq)); } CATCH_STD(env, NULL); } // Generate a gather map needed to manifest the result of a semi/anti join between two tables. 
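The new `hash_join_gather_maps` helper above backs a family of `Table` natives (`leftHashJoinGatherMaps`, `innerHashJoinGatherMaps`, and friends, added later in this diff) that join against a prebuilt `cudf::hash_join`. A hedged sketch of how a caller might drive them from Java, assuming a `HashJoin` wrapper over the right table's key columns; its exact public signatures are not shown in this hunk:

```java
// Hypothetical usage, not the verified public API: build the hash table once,
// size the join, then request gather maps with the known output row count.
try (HashJoin rightHash = new HashJoin(rightKeys, true)) {
  long rows = leftKeys.leftJoinRowCount(rightHash);
  GatherMap[] maps = leftKeys.leftJoinGatherMaps(rightHash, rows);
  try {
    // maps[0] gathers the left table, maps[1] gathers the right table
  } finally {
    for (GatherMap m : maps) {
      m.close();
    }
  }
}
```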
-// The resulting Java long array contains the following at each index: -// 0: Size of the gather map in bytes -// 1: Device address of the gather map -// 2: Host address of the rmm::device_buffer instance that owns the gather map data template jlongArray join_gather_single_map(JNIEnv *env, jlong j_left_keys, jlong j_right_keys, jboolean compare_nulls_equal, T join_func) { @@ -832,26 +857,13 @@ jlongArray join_gather_single_map(JNIEnv *env, jlong j_left_keys, jlong j_right_ auto left_keys = reinterpret_cast(j_left_keys); auto right_keys = reinterpret_cast(j_right_keys); auto nulleq = compare_nulls_equal ? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL; - std::unique_ptr> join_map = - join_func(*left_keys, *right_keys, nulleq); - - // release the underlying device buffer to Java - auto gather_map_buffer = std::make_unique(join_map->release()); - cudf::jni::native_jlongArray result(env, 3); - result[0] = static_cast(gather_map_buffer->size()); - result[1] = reinterpret_cast(gather_map_buffer->data()); - result[2] = reinterpret_cast(gather_map_buffer.release()); - return result.get_jArray(); + return gather_map_to_java(env, join_func(*left_keys, *right_keys, nulleq)); } CATCH_STD(env, NULL); } // Generate a gather map needed to manifest the result of a conditional semi/anti join // between two tables. -// The resulting Java long array contains the following at each index: -// 0: Size of the gather map in bytes -// 1: Device address of the gather map -// 2: Host address of the rmm::device_buffer instance that owns the gather map data template jlongArray cond_join_gather_single_map(JNIEnv *env, jlong j_left_table, jlong j_right_table, jlong j_condition, jboolean compare_nulls_equal, @@ -865,16 +877,8 @@ jlongArray cond_join_gather_single_map(JNIEnv *env, jlong j_left_table, jlong j_ auto right_table = reinterpret_cast(j_right_table); auto condition = reinterpret_cast(j_condition); auto nulleq = compare_nulls_equal ? 
cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL; - std::unique_ptr> join_map = - join_func(*left_table, *right_table, condition->get_top_expression(), nulleq); - - // release the underlying device buffer to Java - auto gather_map_buffer = std::make_unique(join_map->release()); - cudf::jni::native_jlongArray result(env, 3); - result[0] = static_cast(gather_map_buffer->size()); - result[1] = reinterpret_cast(gather_map_buffer->data()); - result[2] = reinterpret_cast(gather_map_buffer.release()); - return result.get_jArray(); + return gather_map_to_java( + env, join_func(*left_table, *right_table, condition->get_top_expression(), nulleq)); } CATCH_STD(env, NULL); } @@ -925,6 +929,45 @@ jlongArray combine_join_results(JNIEnv *env, cudf::table &left_results, return combine_join_results(env, std::move(left_cols), std::move(right_cols)); } +cudf::column_view remove_validity_from_col(cudf::column_view column_view) { + if (!cudf::is_compound(column_view.type())) { + if (column_view.nullable() && column_view.null_count() == 0) { + // null_mask is allocated but no nulls present therefore we create a new column_view without + // the null_mask to avoid things blowing up in reading the parquet file + return cudf::column_view(column_view.type(), column_view.size(), column_view.head(), nullptr, + 0, column_view.offset()); + } else { + return cudf::column_view(column_view); + } + } else { + std::unique_ptr ret; + std::vector children; + children.reserve(column_view.num_children()); + for (auto it = column_view.child_begin(); it != column_view.child_end(); it++) { + children.push_back(remove_validity_from_col(*it)); + } + if (!column_view.nullable() || column_view.null_count() != 0) { + ret.reset(new cudf::column_view(column_view.type(), column_view.size(), nullptr, + column_view.null_mask(), column_view.null_count(), + column_view.offset(), children)); + } else { + ret.reset(new cudf::column_view(column_view.type(), column_view.size(), nullptr, nullptr, 0, + column_view.offset(), children)); + } + return *ret.release(); + } +} + +cudf::table_view remove_validity_if_needed(cudf::table_view *input_table_view) { + std::vector views; + views.reserve(input_table_view->num_columns()); + for (auto it = input_table_view->begin(); it != input_table_view->end(); it++) { + views.push_back(remove_validity_from_col(*it)); + } + + return cudf::table_view(views); +} + } // namespace } // namespace jni @@ -932,6 +975,25 @@ jlongArray combine_join_results(JNIEnv *env, cudf::table &left_results, extern "C" { +// This is a method purely added for testing remove_validity_if_needed method +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_removeNullMasksIfNeeded(JNIEnv *env, jclass, + jlong j_table_view) { + JNI_NULL_CHECK(env, j_table_view, "table view handle is null", 0); + try { + cudf::table_view *tview = reinterpret_cast(j_table_view); + cudf::table_view result = cudf::jni::remove_validity_if_needed(tview); + cudf::table m_tbl(result); + std::vector> cols = m_tbl.release(); + auto results = cudf::jni::native_jlongArray(env, cols.size()); + int i = 0; + for (auto it = cols.begin(); it != cols.end(); it++) { + results[i++] = reinterpret_cast(it->release()); + } + return results.get_jArray(); + } + CATCH_STD(env, 0); +} + JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_createCudfTableView(JNIEnv *env, jclass, jlongArray j_cudf_columns) { JNI_NULL_CHECK(env, j_cudf_columns, "columns are null", 0); @@ -1152,7 +1214,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_merge(JNIEnv *env, jclass } 
JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readCSV( - JNIEnv *env, jclass, jobjectArray col_names, jobjectArray data_types, + JNIEnv *env, jclass, jobjectArray col_names, jintArray j_types, jintArray j_scales, jobjectArray filter_col_names, jstring inputfilepath, jlong buffer, jlong buffer_length, jint header_row, jbyte delim, jbyte quote, jbyte comment, jobjectArray null_values, jobjectArray true_values, jobjectArray false_values) { @@ -1173,7 +1235,23 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readCSV( try { cudf::jni::auto_set_device(env); cudf::jni::native_jstringArray n_col_names(env, col_names); - cudf::jni::native_jstringArray n_data_types(env, data_types); + cudf::jni::native_jintArray n_types(env, j_types); + cudf::jni::native_jintArray n_scales(env, j_scales); + if (n_types.is_null() != n_scales.is_null()) { + JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "types and scales must match null", + NULL); + } + std::vector data_types; + if (!n_types.is_null()) { + if (n_types.size() != n_scales.size()) { + JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "types and scales must match size", + NULL); + } + data_types.reserve(n_types.size()); + for (int index = 0; index < n_types.size(); index++) { + data_types.emplace_back(cudf::jni::make_data_type(n_types[index], n_scales[index])); + } + } cudf::jni::native_jstring filename(env, inputfilepath); if (!read_buffer && filename.is_empty()) { @@ -1197,7 +1275,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readCSV( .delimiter(delim) .header(header_row) .names(n_col_names.as_cpp_vector()) - .dtypes(n_data_types.as_cpp_vector()) + .dtypes(data_types) .use_cols_names(n_filter_col_names.as_cpp_vector()) .true_values(n_true_values.as_cpp_vector()) .false_values(n_false_values.as_cpp_vector()) @@ -1207,6 +1285,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readCSV( .quotechar(quote) .comment(comment) .build(); + cudf::io::table_with_metadata result = cudf::io::read_csv(opts); return cudf::jni::convert_table_for_return(env, result.tbl); } @@ -1262,7 +1341,7 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeParquetBufferBegin( JNIEnv *env, jclass, jobjectArray j_col_names, jint j_num_children, jintArray j_children, jbooleanArray j_col_nullability, jobjectArray j_metadata_keys, jobjectArray j_metadata_values, jint j_compression, jint j_stats_freq, jbooleanArray j_isInt96, jintArray j_precisions, - jobject consumer) { + jbooleanArray j_is_map, jobject consumer) { JNI_NULL_CHECK(env, j_col_names, "null columns", 0); JNI_NULL_CHECK(env, j_col_nullability, "null nullability", 0); JNI_NULL_CHECK(env, j_metadata_keys, "null metadata keys", 0); @@ -1278,7 +1357,7 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeParquetBufferBegin( table_input_metadata metadata; createTableMetaData(env, j_num_children, j_col_names, j_children, j_col_nullability, j_metadata_keys, j_metadata_values, j_compression, j_stats_freq, j_isInt96, - j_precisions, metadata); + j_precisions, j_is_map, metadata); chunked_parquet_writer_options opts = chunked_parquet_writer_options::builder(sink) @@ -1298,7 +1377,7 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeParquetFileBegin( JNIEnv *env, jclass, jobjectArray j_col_names, jint j_num_children, jintArray j_children, jbooleanArray j_col_nullability, jobjectArray j_metadata_keys, jobjectArray j_metadata_values, jint j_compression, jint j_stats_freq, jbooleanArray j_isInt96, jintArray j_precisions, - jstring j_output_path) { + jbooleanArray 
j_is_map, jstring j_output_path) { JNI_NULL_CHECK(env, j_col_names, "null columns", 0); JNI_NULL_CHECK(env, j_col_nullability, "null nullability", 0); JNI_NULL_CHECK(env, j_metadata_keys, "null metadata keys", 0); @@ -1312,7 +1391,7 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeParquetFileBegin( table_input_metadata metadata; createTableMetaData(env, j_num_children, j_col_names, j_children, j_col_nullability, j_metadata_keys, j_metadata_values, j_compression, j_stats_freq, j_isInt96, - j_precisions, metadata); + j_precisions, j_is_map, metadata); sink_info sink{output_path.get()}; chunked_parquet_writer_options opts = chunked_parquet_writer_options::builder(sink) @@ -1336,7 +1415,8 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeParquetChunk(JNIEnv *env, JNI_NULL_CHECK(env, j_state, "null state", ); using namespace cudf::io; - cudf::table_view *tview = reinterpret_cast(j_table); + cudf::table_view *tview_with_empty_nullmask = reinterpret_cast(j_table); + cudf::table_view tview = cudf::jni::remove_validity_if_needed(tview_with_empty_nullmask); cudf::jni::native_parquet_writer_handle *state = reinterpret_cast(j_state); @@ -1346,7 +1426,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeParquetChunk(JNIEnv *env, } try { cudf::jni::auto_set_device(env); - state->writer->write(*tview); + state->writer->write(tview); } CATCH_STD(env, ) } @@ -1924,6 +2004,64 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftJoinGatherMaps( }); } +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_leftJoinRowCount(JNIEnv *env, jclass, + jlong j_left_table, + jlong j_right_hash_join, + jboolean compare_nulls_equal) { + JNI_NULL_CHECK(env, j_left_table, "left table is null", 0); + JNI_NULL_CHECK(env, j_right_hash_join, "right hash join is null", 0); + try { + cudf::jni::auto_set_device(env); + auto left_table = reinterpret_cast(j_left_table); + auto hash_join = reinterpret_cast(j_right_hash_join); + auto nulleq = compare_nulls_equal ? 
cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL; + auto row_count = hash_join->left_join_size(*left_table, nulleq); + return static_cast(row_count); + } + CATCH_STD(env, 0); +} + +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftHashJoinGatherMaps( + JNIEnv *env, jclass, jlong j_left_table, jlong j_right_hash_join, + jboolean compare_nulls_equal) { + return cudf::jni::hash_join_gather_maps( + env, j_left_table, j_right_hash_join, compare_nulls_equal, + [](cudf::table_view const &left, cudf::hash_join const &hash, cudf::null_equality nulleq) { + return hash.left_join(left, nulleq); + }); +} + +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftHashJoinGatherMapsWithCount( + JNIEnv *env, jclass, jlong j_left_table, jlong j_right_hash_join, jboolean compare_nulls_equal, + jlong j_output_row_count) { + auto output_row_count = static_cast(j_output_row_count); + return cudf::jni::hash_join_gather_maps(env, j_left_table, j_right_hash_join, compare_nulls_equal, + [output_row_count](cudf::table_view const &left, + cudf::hash_join const &hash, + cudf::null_equality nulleq) { + return hash.left_join(left, nulleq, output_row_count); + }); +} + +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_conditionalLeftJoinRowCount( + JNIEnv *env, jclass, jlong j_left_table, jlong j_right_table, jlong j_condition, + jboolean compare_nulls_equal) { + JNI_NULL_CHECK(env, j_left_table, "left_table is null", 0); + JNI_NULL_CHECK(env, j_right_table, "right_table is null", 0); + JNI_NULL_CHECK(env, j_condition, "condition is null", 0); + try { + cudf::jni::auto_set_device(env); + auto left_table = reinterpret_cast(j_left_table); + auto right_table = reinterpret_cast(j_right_table); + auto condition = reinterpret_cast(j_condition); + auto nulleq = compare_nulls_equal ? 
cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL; + auto row_count = cudf::conditional_left_join_size(*left_table, *right_table, + condition->get_top_expression(), nulleq); + return static_cast(row_count); + } + CATCH_STD(env, 0); +} + JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_conditionalLeftJoinGatherMaps( JNIEnv *env, jclass, jlong j_left_table, jlong j_right_table, jlong j_condition, jboolean compare_nulls_equal) { @@ -1935,6 +2073,18 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_conditionalLeftJoinGather }); } +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_conditionalLeftJoinGatherMapsWithCount( + JNIEnv *env, jclass, jlong j_left_table, jlong j_right_table, jlong j_condition, + jboolean compare_nulls_equal, jlong j_row_count) { + auto row_count = static_cast(j_row_count); + return cudf::jni::cond_join_gather_maps( + env, j_left_table, j_right_table, j_condition, compare_nulls_equal, + [row_count](cudf::table_view const &left, cudf::table_view const &right, + cudf::ast::expression const &cond_expr, cudf::null_equality nulleq) { + return cudf::conditional_left_join(left, right, cond_expr, nulleq, row_count); + }); +} + JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_innerJoinGatherMaps( JNIEnv *env, jclass, jlong j_left_keys, jlong j_right_keys, jboolean compare_nulls_equal) { return cudf::jni::join_gather_maps( @@ -1944,6 +2094,64 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_innerJoinGatherMaps( }); } +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_innerJoinRowCount(JNIEnv *env, jclass, + jlong j_left_table, + jlong j_right_hash_join, + jboolean compare_nulls_equal) { + JNI_NULL_CHECK(env, j_left_table, "left table is null", 0); + JNI_NULL_CHECK(env, j_right_hash_join, "right hash join is null", 0); + try { + cudf::jni::auto_set_device(env); + auto left_table = reinterpret_cast(j_left_table); + auto hash_join = reinterpret_cast(j_right_hash_join); + auto nulleq = compare_nulls_equal ? 
cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL; + auto row_count = hash_join->inner_join_size(*left_table, nulleq); + return static_cast(row_count); + } + CATCH_STD(env, 0); +} + +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_innerHashJoinGatherMaps( + JNIEnv *env, jclass, jlong j_left_table, jlong j_right_hash_join, + jboolean compare_nulls_equal) { + return cudf::jni::hash_join_gather_maps( + env, j_left_table, j_right_hash_join, compare_nulls_equal, + [](cudf::table_view const &left, cudf::hash_join const &hash, cudf::null_equality nulleq) { + return hash.inner_join(left, nulleq); + }); +} + +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_innerHashJoinGatherMapsWithCount( + JNIEnv *env, jclass, jlong j_left_table, jlong j_right_hash_join, jboolean compare_nulls_equal, + jlong j_output_row_count) { + auto output_row_count = static_cast(j_output_row_count); + return cudf::jni::hash_join_gather_maps(env, j_left_table, j_right_hash_join, compare_nulls_equal, + [output_row_count](cudf::table_view const &left, + cudf::hash_join const &hash, + cudf::null_equality nulleq) { + return hash.inner_join(left, nulleq, output_row_count); + }); +} + +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_conditionalInnerJoinRowCount( + JNIEnv *env, jclass, jlong j_left_table, jlong j_right_table, jlong j_condition, + jboolean compare_nulls_equal) { + JNI_NULL_CHECK(env, j_left_table, "left_table is null", 0); + JNI_NULL_CHECK(env, j_right_table, "right_table is null", 0); + JNI_NULL_CHECK(env, j_condition, "condition is null", 0); + try { + cudf::jni::auto_set_device(env); + auto left_table = reinterpret_cast(j_left_table); + auto right_table = reinterpret_cast(j_right_table); + auto condition = reinterpret_cast(j_condition); + auto nulleq = compare_nulls_equal ? 
cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL; + auto row_count = cudf::conditional_inner_join_size(*left_table, *right_table, + condition->get_top_expression(), nulleq); + return static_cast(row_count); + } + CATCH_STD(env, 0); +} + JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_conditionalInnerJoinGatherMaps( JNIEnv *env, jclass, jlong j_left_table, jlong j_right_table, jlong j_condition, jboolean compare_nulls_equal) { @@ -1955,6 +2163,18 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_conditionalInnerJoinGathe }); } +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_conditionalInnerJoinGatherMapsWithCount( + JNIEnv *env, jclass, jlong j_left_table, jlong j_right_table, jlong j_condition, + jboolean compare_nulls_equal, jlong j_row_count) { + auto row_count = static_cast(j_row_count); + return cudf::jni::cond_join_gather_maps( + env, j_left_table, j_right_table, j_condition, compare_nulls_equal, + [row_count](cudf::table_view const &left, cudf::table_view const &right, + cudf::ast::expression const &cond_expr, cudf::null_equality nulleq) { + return cudf::conditional_inner_join(left, right, cond_expr, nulleq, row_count); + }); +} + JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_fullJoinGatherMaps( JNIEnv *env, jclass, jlong j_left_keys, jlong j_right_keys, jboolean compare_nulls_equal) { return cudf::jni::join_gather_maps( @@ -1964,6 +2184,45 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_fullJoinGatherMaps( }); } +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_fullJoinRowCount(JNIEnv *env, jclass, + jlong j_left_table, + jlong j_right_hash_join, + jboolean compare_nulls_equal) { + JNI_NULL_CHECK(env, j_left_table, "left table is null", 0); + JNI_NULL_CHECK(env, j_right_hash_join, "right hash join is null", 0); + try { + cudf::jni::auto_set_device(env); + auto left_table = reinterpret_cast(j_left_table); + auto hash_join = reinterpret_cast(j_right_hash_join); + auto nulleq = compare_nulls_equal ? 
cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL; + auto row_count = hash_join->full_join_size(*left_table, nulleq); + return static_cast(row_count); + } + CATCH_STD(env, 0); +} + +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_fullHashJoinGatherMaps( + JNIEnv *env, jclass, jlong j_left_table, jlong j_right_hash_join, + jboolean compare_nulls_equal) { + return cudf::jni::hash_join_gather_maps( + env, j_left_table, j_right_hash_join, compare_nulls_equal, + [](cudf::table_view const &left, cudf::hash_join const &hash, cudf::null_equality nulleq) { + return hash.full_join(left, nulleq); + }); +} + +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_fullHashJoinGatherMapsWithCount( + JNIEnv *env, jclass, jlong j_left_table, jlong j_right_hash_join, jboolean compare_nulls_equal, + jlong j_output_row_count) { + auto output_row_count = static_cast(j_output_row_count); + return cudf::jni::hash_join_gather_maps(env, j_left_table, j_right_hash_join, compare_nulls_equal, + [output_row_count](cudf::table_view const &left, + cudf::hash_join const &hash, + cudf::null_equality nulleq) { + return hash.full_join(left, nulleq, output_row_count); + }); +} + JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_conditionalFullJoinGatherMaps( JNIEnv *env, jclass, jlong j_left_table, jlong j_right_table, jlong j_condition, jboolean compare_nulls_equal) { @@ -1984,6 +2243,25 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftSemiJoinGatherMap( }); } +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_conditionalLeftSemiJoinRowCount( + JNIEnv *env, jclass, jlong j_left_table, jlong j_right_table, jlong j_condition, + jboolean compare_nulls_equal) { + JNI_NULL_CHECK(env, j_left_table, "left_table is null", 0); + JNI_NULL_CHECK(env, j_right_table, "right_table is null", 0); + JNI_NULL_CHECK(env, j_condition, "condition is null", 0); + try { + cudf::jni::auto_set_device(env); + auto left_table = reinterpret_cast(j_left_table); + auto right_table = reinterpret_cast(j_right_table); + auto condition = reinterpret_cast(j_condition); + auto nulleq = compare_nulls_equal ? 
cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL; + auto row_count = cudf::conditional_left_semi_join_size(*left_table, *right_table, + condition->get_top_expression(), nulleq); + return static_cast(row_count); + } + CATCH_STD(env, 0); +} + JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_conditionalLeftSemiJoinGatherMap( JNIEnv *env, jclass, jlong j_left_table, jlong j_right_table, jlong j_condition, jboolean compare_nulls_equal) { @@ -1995,6 +2273,18 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_conditionalLeftSemiJoinGa }); } +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_conditionalLeftSemiJoinGatherMapWithCount( + JNIEnv *env, jclass, jlong j_left_table, jlong j_right_table, jlong j_condition, + jboolean compare_nulls_equal, jlong j_row_count) { + auto row_count = static_cast(j_row_count); + return cudf::jni::cond_join_gather_single_map( + env, j_left_table, j_right_table, j_condition, compare_nulls_equal, + [row_count](cudf::table_view const &left, cudf::table_view const &right, + cudf::ast::expression const &cond_expr, cudf::null_equality nulleq) { + return cudf::conditional_left_semi_join(left, right, cond_expr, nulleq, row_count); + }); +} + JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftAntiJoinGatherMap( JNIEnv *env, jclass, jlong j_left_keys, jlong j_right_keys, jboolean compare_nulls_equal) { return cudf::jni::join_gather_single_map( @@ -2004,6 +2294,25 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftAntiJoinGatherMap( }); } +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_conditionalLeftAntiJoinRowCount( + JNIEnv *env, jclass, jlong j_left_table, jlong j_right_table, jlong j_condition, + jboolean compare_nulls_equal) { + JNI_NULL_CHECK(env, j_left_table, "left_table is null", 0); + JNI_NULL_CHECK(env, j_right_table, "right_table is null", 0); + JNI_NULL_CHECK(env, j_condition, "condition is null", 0); + try { + cudf::jni::auto_set_device(env); + auto left_table = reinterpret_cast(j_left_table); + auto right_table = reinterpret_cast(j_right_table); + auto condition = reinterpret_cast(j_condition); + auto nulleq = compare_nulls_equal ? 
cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL; + auto row_count = cudf::conditional_left_anti_join_size(*left_table, *right_table, + condition->get_top_expression(), nulleq); + return static_cast(row_count); + } + CATCH_STD(env, 0); +} + JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_conditionalLeftAntiJoinGatherMap( JNIEnv *env, jclass, jlong j_left_table, jlong j_right_table, jlong j_condition, jboolean compare_nulls_equal) { @@ -2015,6 +2324,18 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_conditionalLeftAntiJoinGa }); } +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_conditionalLeftAntiJoinGatherMapWithCount( + JNIEnv *env, jclass, jlong j_left_table, jlong j_right_table, jlong j_condition, + jboolean compare_nulls_equal, jlong j_row_count) { + auto row_count = static_cast(j_row_count); + return cudf::jni::cond_join_gather_single_map( + env, j_left_table, j_right_table, j_condition, compare_nulls_equal, + [row_count](cudf::table_view const &left, cudf::table_view const &right, + cudf::ast::expression const &cond_expr, cudf::null_equality nulleq) { + return cudf::conditional_left_anti_join(left, right, cond_expr, nulleq, row_count); + }); +} + JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_crossJoin(JNIEnv *env, jclass, jlong left_table, jlong right_table) { @@ -2194,11 +2515,19 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_groupByAggregate( for (int i = 0; i < n_values.size(); i++) { cudf::groupby::aggregation_request req; int col_index = n_values[i]; + + cudf::groupby_aggregation *agg = + dynamic_cast(n_agg_instances[i]); + JNI_ARG_CHECK(env, agg != nullptr, "aggregation is not an instance of groupby_aggregation", + nullptr); + std::unique_ptr cloned( + dynamic_cast(agg->clone().release())); + if (col_index == previous_index) { - requests.back().aggregations.push_back(n_agg_instances[i]->clone()); + requests.back().aggregations.push_back(std::move(cloned)); } else { req.values = n_input_table->column(col_index); - req.aggregations.push_back(n_agg_instances[i]->clone()); + req.aggregations.push_back(std::move(cloned)); requests.push_back(std::move(req)); } previous_index = col_index; @@ -2250,17 +2579,25 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_groupByScan( // Aggregates are passed in already grouped by column, so we just need to fill it in // as we go. 
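The scan path mirrors the aggregate path above: every native aggregation handle must now be a `groupby_scan_aggregation`, which the Java layer guarantees by accepting only `GroupByScanAggregationOnColumn`. A hedged sketch of the corresponding Java call, assuming a `GroupByScanAggregation` factory that produces those wrappers (not shown in this hunk):

```java
// Hypothetical call site: running (cumulative) sum of column 1 within each
// group keyed by column 0, using the scan-specific aggregation type.
try (Table scanned = input.groupBy(0)
         .scan(GroupByScanAggregation.sum().onColumn(1))) {
  // scanned holds the keys followed by the per-group running sums
}
```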
- std::vector requests; + std::vector requests; int previous_index = -1; for (int i = 0; i < n_values.size(); i++) { - cudf::groupby::aggregation_request req; + cudf::groupby::scan_request req; int col_index = n_values[i]; + + cudf::groupby_scan_aggregation *agg = + dynamic_cast(n_agg_instances[i]); + JNI_ARG_CHECK(env, agg != nullptr, + "aggregation is not an instance of groupby_scan_aggregation", nullptr); + std::unique_ptr cloned( + dynamic_cast(agg->clone().release())); + if (col_index == previous_index) { - requests.back().aggregations.push_back(n_agg_instances[i]->clone()); + requests.back().aggregations.push_back(std::move(cloned)); } else { req.values = n_input_table->column(col_index); - req.aggregations.push_back(n_agg_instances[i]->clone()); + req.aggregations.push_back(std::move(cloned)); requests.push_back(std::move(req)); } previous_index = col_index; diff --git a/java/src/main/native/src/jni_compiled_expr.hpp b/java/src/main/native/src/jni_compiled_expr.hpp index e42e5a37fba..74010f71011 100644 --- a/java/src/main/native/src/jni_compiled_expr.hpp +++ b/java/src/main/native/src/jni_compiled_expr.hpp @@ -32,12 +32,6 @@ namespace ast { * base AST node type. Then we do not have to track every AST node type separately. */ class compiled_expr { - /** All literal nodes within the expression tree */ - std::vector> literals; - - /** All column reference nodes within the expression tree */ - std::vector> column_refs; - /** All expression nodes within the expression tree */ std::vector> expressions; @@ -47,20 +41,20 @@ class compiled_expr { public: cudf::ast::literal &add_literal(std::unique_ptr literal_ptr, std::unique_ptr scalar_ptr) { - literals.push_back(std::move(literal_ptr)); + expressions.push_back(std::move(literal_ptr)); scalars.push_back(std::move(scalar_ptr)); - return *literals.back(); + return static_cast(*expressions.back()); } cudf::ast::column_reference & add_column_ref(std::unique_ptr ref_ptr) { - column_refs.push_back(std::move(ref_ptr)); - return *column_refs.back(); + expressions.push_back(std::move(ref_ptr)); + return static_cast(*expressions.back()); } - cudf::ast::expression &add_expression(std::unique_ptr expr_ptr) { + cudf::ast::operation &add_operation(std::unique_ptr expr_ptr) { expressions.push_back(std::move(expr_ptr)); - return *expressions.back(); + return static_cast(*expressions.back()); } /** Return the expression node at the top of the tree */ diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index d3fdb0e19bb..4856071e296 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -2899,24 +2899,22 @@ void testPrefixSum() { @Test void testScanSum() { try (ColumnVector v1 = ColumnVector.fromBoxedInts(1, 2, null, 3, 5, 8, 10)) { - // Due to https://github.com/rapidsai/cudf/issues/8462 NullPolicy.INCLUDE - // tests have been disabled -// try (ColumnVector result = v1.scan(Aggregation.sum(), ScanType.INCLUSIVE, NullPolicy.INCLUDE); -// ColumnVector expected = ColumnVector.fromBoxedInts(1, 3, null, null, null, null, null)) { -// assertColumnsAreEqual(expected, result); -// } - - try (ColumnVector result = v1.scan(Aggregation.sum(), ScanType.INCLUSIVE, NullPolicy.EXCLUDE); + try (ColumnVector result = v1.scan(ScanAggregation.sum(), ScanType.INCLUSIVE, NullPolicy.INCLUDE); + ColumnVector expected = ColumnVector.fromBoxedInts(1, 3, null, null, null, null, null)) { + assertColumnsAreEqual(expected, 
result); + } + + try (ColumnVector result = v1.scan(ScanAggregation.sum(), ScanType.INCLUSIVE, NullPolicy.EXCLUDE); ColumnVector expected = ColumnVector.fromBoxedInts(1, 3, null, 6, 11, 19, 29)) { assertColumnsAreEqual(expected, result); } -// try (ColumnVector result = v1.scan(Aggregation.sum(), ScanType.EXCLUSIVE, NullPolicy.INCLUDE); -// ColumnVector expected = ColumnVector.fromBoxedInts(0, 1, 3, 3, 6, 11, 19)) { -// assertColumnsAreEqual(expected, result); -// } + try (ColumnVector result = v1.scan(ScanAggregation.sum(), ScanType.EXCLUSIVE, NullPolicy.INCLUDE); + ColumnVector expected = ColumnVector.fromBoxedInts(0, 1, 3, null, null, null, null)) { + assertColumnsAreEqual(expected, result); + } - try (ColumnVector result = v1.scan(Aggregation.sum(), ScanType.EXCLUSIVE, NullPolicy.EXCLUDE); + try (ColumnVector result = v1.scan(ScanAggregation.sum(), ScanType.EXCLUSIVE, NullPolicy.EXCLUDE); ColumnVector expected = ColumnVector.fromBoxedInts(0, 1, null, 3, 6, 11, 19)) { assertColumnsAreEqual(expected, result); } @@ -2925,25 +2923,23 @@ void testScanSum() { @Test void testScanMax() { - // Due to https://github.com/rapidsai/cudf/issues/8462 NullPolicy.INCLUDE - // tests have been disabled try (ColumnVector v1 = ColumnVector.fromBoxedInts(1, 2, null, 3, 5, 8, 10)) { -// try (ColumnVector result = v1.scan(Aggregation.max(), ScanType.INCLUSIVE, NullPolicy.INCLUDE); -// ColumnVector expected = ColumnVector.fromBoxedInts(1, 2, null, null, null, null, null)) { -// assertColumnsAreEqual(expected, result); -// } + try (ColumnVector result = v1.scan(ScanAggregation.max(), ScanType.INCLUSIVE, NullPolicy.INCLUDE); + ColumnVector expected = ColumnVector.fromBoxedInts(1, 2, null, null, null, null, null)) { + assertColumnsAreEqual(expected, result); + } - try (ColumnVector result = v1.scan(Aggregation.max(), ScanType.INCLUSIVE, NullPolicy.EXCLUDE); + try (ColumnVector result = v1.scan(ScanAggregation.max(), ScanType.INCLUSIVE, NullPolicy.EXCLUDE); ColumnVector expected = ColumnVector.fromBoxedInts(1, 2, null, 3, 5, 8, 10)) { assertColumnsAreEqual(expected, result); } -// try (ColumnVector result = v1.scan(Aggregation.max(), ScanType.EXCLUSIVE, NullPolicy.INCLUDE); -// ColumnVector expected = ColumnVector.fromBoxedInts(Integer.MIN_VALUE, 1, 2, 2, 3, 5, 8)) { -// assertColumnsAreEqual(expected, result); -// } + try (ColumnVector result = v1.scan(ScanAggregation.max(), ScanType.EXCLUSIVE, NullPolicy.INCLUDE); + ColumnVector expected = ColumnVector.fromBoxedInts(Integer.MIN_VALUE, 1, 2, null, null, null, null)) { + assertColumnsAreEqual(expected, result); + } - try (ColumnVector result = v1.scan(Aggregation.max(), ScanType.EXCLUSIVE, NullPolicy.EXCLUDE); + try (ColumnVector result = v1.scan(ScanAggregation.max(), ScanType.EXCLUSIVE, NullPolicy.EXCLUDE); ColumnVector expected = ColumnVector.fromBoxedInts(Integer.MIN_VALUE, 1, null, 2, 3, 5, 8)) { assertColumnsAreEqual(expected, result); } @@ -2952,25 +2948,23 @@ void testScanMax() { @Test void testScanMin() { - // Due to https://github.com/rapidsai/cudf/issues/8462 NullPolicy.INCLUDE - // tests have been disabled try (ColumnVector v1 = ColumnVector.fromBoxedInts(1, 2, null, 3, 5, 8, 10)) { -// try (ColumnVector result = v1.scan(Aggregation.min(), ScanType.INCLUSIVE, NullPolicy.INCLUDE); -// ColumnVector expected = ColumnVector.fromBoxedInts(1, 1, null, null, null, null, null)) { -// assertColumnsAreEqual(expected, result); -// } + try (ColumnVector result = v1.scan(ScanAggregation.min(), ScanType.INCLUSIVE, NullPolicy.INCLUDE); + ColumnVector 
expected = ColumnVector.fromBoxedInts(1, 1, null, null, null, null, null)) { + assertColumnsAreEqual(expected, result); + } - try (ColumnVector result = v1.scan(Aggregation.min(), ScanType.INCLUSIVE, NullPolicy.EXCLUDE); + try (ColumnVector result = v1.scan(ScanAggregation.min(), ScanType.INCLUSIVE, NullPolicy.EXCLUDE); ColumnVector expected = ColumnVector.fromBoxedInts(1, 1, null, 1, 1, 1, 1)) { assertColumnsAreEqual(expected, result); } -// try (ColumnVector result = v1.scan(Aggregation.min(), ScanType.EXCLUSIVE, NullPolicy.INCLUDE); -// ColumnVector expected = ColumnVector.fromBoxedInts(Integer.MAX_VALUE, 1, 1, 1, 1, 1, 1)) { -// assertColumnsAreEqual(expected, result); -// } + try (ColumnVector result = v1.scan(ScanAggregation.min(), ScanType.EXCLUSIVE, NullPolicy.INCLUDE); + ColumnVector expected = ColumnVector.fromBoxedInts(Integer.MAX_VALUE, 1, 1, null, null, null, null)) { + assertColumnsAreEqual(expected, result); + } - try (ColumnVector result = v1.scan(Aggregation.min(), ScanType.EXCLUSIVE, NullPolicy.EXCLUDE); + try (ColumnVector result = v1.scan(ScanAggregation.min(), ScanType.EXCLUSIVE, NullPolicy.EXCLUDE); ColumnVector expected = ColumnVector.fromBoxedInts(Integer.MAX_VALUE, 1, null, 1, 1, 1, 1)) { assertColumnsAreEqual(expected, result); } @@ -2979,25 +2973,23 @@ void testScanMin() { @Test void testScanProduct() { - // Due to https://github.com/rapidsai/cudf/issues/8462 NullPolicy.INCLUDE - // tests have been disabled try (ColumnVector v1 = ColumnVector.fromBoxedInts(1, 2, null, 3, 5, 8, 10)) { -// try (ColumnVector result = v1.scan(Aggregation.product(), ScanType.INCLUSIVE, NullPolicy.INCLUDE); -// ColumnVector expected = ColumnVector.fromBoxedInts(1, 2, null, null, null, null, null)) { -// assertColumnsAreEqual(expected, result); -// } + try (ColumnVector result = v1.scan(ScanAggregation.product(), ScanType.INCLUSIVE, NullPolicy.INCLUDE); + ColumnVector expected = ColumnVector.fromBoxedInts(1, 2, null, null, null, null, null)) { + assertColumnsAreEqual(expected, result); + } - try (ColumnVector result = v1.scan(Aggregation.product(), ScanType.INCLUSIVE, NullPolicy.EXCLUDE); + try (ColumnVector result = v1.scan(ScanAggregation.product(), ScanType.INCLUSIVE, NullPolicy.EXCLUDE); ColumnVector expected = ColumnVector.fromBoxedInts(1, 2, null, 6, 30, 240, 2400)) { assertColumnsAreEqual(expected, result); } -// try (ColumnVector result = v1.scan(Aggregation.product(), ScanType.EXCLUSIVE, NullPolicy.INCLUDE); -// ColumnVector expected = ColumnVector.fromBoxedInts(1, 1, 2, 2, 6, 30, 240)) { -// assertColumnsAreEqual(expected, result); -// } + try (ColumnVector result = v1.scan(ScanAggregation.product(), ScanType.EXCLUSIVE, NullPolicy.INCLUDE); + ColumnVector expected = ColumnVector.fromBoxedInts(1, 1, 2, null, null, null, null)) { + assertColumnsAreEqual(expected, result); + } - try (ColumnVector result = v1.scan(Aggregation.product(), ScanType.EXCLUSIVE, NullPolicy.EXCLUDE); + try (ColumnVector result = v1.scan(ScanAggregation.product(), ScanType.EXCLUSIVE, NullPolicy.EXCLUDE); ColumnVector expected = ColumnVector.fromBoxedInts(1, 1, null, 2, 6, 30, 240)) { assertColumnsAreEqual(expected, result); } @@ -3011,13 +3003,13 @@ void testScanRank() { ColumnVector struct_order = ColumnVector.makeStruct(col1, col2); ColumnVector expected = ColumnVector.fromBoxedInts( 1, 1, 3, 4, 5, 6, 7, 7, 9, 9, 11, 12)) { - try (ColumnVector result = struct_order.scan(Aggregation.rank(), + try (ColumnVector result = struct_order.scan(ScanAggregation.rank(), ScanType.INCLUSIVE, 
NullPolicy.INCLUDE)) { assertColumnsAreEqual(expected, result); } // Exclude should have identical results - try (ColumnVector result = struct_order.scan(Aggregation.rank(), + try (ColumnVector result = struct_order.scan(ScanAggregation.rank(), ScanType.INCLUSIVE, NullPolicy.EXCLUDE) ) { assertColumnsAreEqual(expected, result); @@ -3034,13 +3026,13 @@ void testScanDenseRank() { ColumnVector struct_order = ColumnVector.makeStruct(col1, col2); ColumnVector expected = ColumnVector.fromBoxedInts( 1, 1, 2, 3, 4, 5, 6, 6, 7, 7, 8, 9)) { - try (ColumnVector result = struct_order.scan(Aggregation.denseRank(), + try (ColumnVector result = struct_order.scan(ScanAggregation.denseRank(), ScanType.INCLUSIVE, NullPolicy.INCLUDE)) { assertColumnsAreEqual(expected, result); } // Exclude should have identical results - try (ColumnVector result = struct_order.scan(Aggregation.denseRank(), + try (ColumnVector result = struct_order.scan(ScanAggregation.denseRank(), ScanType.INCLUSIVE, NullPolicy.EXCLUDE)) { assertColumnsAreEqual(expected, result); } @@ -3058,39 +3050,39 @@ void testWindowStatic() { .minPeriods(2).build()) { try (ColumnVector v1 = ColumnVector.fromInts(5, 4, 7, 6, 8)) { try (ColumnVector expected = ColumnVector.fromLongs(9, 16, 17, 21, 14); - ColumnVector result = v1.rollingWindow(Aggregation.sum(), options)) { + ColumnVector result = v1.rollingWindow(RollingAggregation.sum(), options)) { assertColumnsAreEqual(expected, result); } try (ColumnVector expected = ColumnVector.fromInts(4, 4, 4, 6, 6); - ColumnVector result = v1.rollingWindow(Aggregation.min(), options)) { + ColumnVector result = v1.rollingWindow(RollingAggregation.min(), options)) { assertColumnsAreEqual(expected, result); } try (ColumnVector expected = ColumnVector.fromInts(5, 7, 7, 8, 8); - ColumnVector result = v1.rollingWindow(Aggregation.max(), options)) { + ColumnVector result = v1.rollingWindow(RollingAggregation.max(), options)) { assertColumnsAreEqual(expected, result); } // The rolling window produces the same result type as the input try (ColumnVector expected = ColumnVector.fromDoubles(4.5, 16.0 / 3, 17.0 / 3, 7, 7); - ColumnVector result = v1.rollingWindow(Aggregation.mean(), options)) { + ColumnVector result = v1.rollingWindow(RollingAggregation.mean(), options)) { assertColumnsAreEqual(expected, result); } try (ColumnVector expected = ColumnVector.fromBoxedInts(4, 7, 6, 8, null); - ColumnVector result = v1.rollingWindow(Aggregation.lead(1), options)) { + ColumnVector result = v1.rollingWindow(RollingAggregation.lead(1), options)) { assertColumnsAreEqual(expected, result); } try (ColumnVector expected = ColumnVector.fromBoxedInts(null, 5, 4, 7, 6); - ColumnVector result = v1.rollingWindow(Aggregation.lag(1), options)) { + ColumnVector result = v1.rollingWindow(RollingAggregation.lag(1), options)) { assertColumnsAreEqual(expected, result); } try (ColumnVector defaultOutput = ColumnVector.fromInts(-1, -2, -3, -4, -5); ColumnVector expected = ColumnVector.fromBoxedInts(-1, 5, 4, 7, 6); - ColumnVector result = v1.rollingWindow(Aggregation.lag(1, defaultOutput), options)) { + ColumnVector result = v1.rollingWindow(RollingAggregation.lag(1, defaultOutput), options)) { assertColumnsAreEqual(expected, result); } } @@ -3106,11 +3098,11 @@ void testWindowStaticCounts() { .minPeriods(2).build()) { try (ColumnVector v1 = ColumnVector.fromBoxedInts(5, 4, null, 6, 8)) { try (ColumnVector expected = ColumnVector.fromInts(2, 2, 2, 2, 2); - ColumnVector result = v1.rollingWindow(Aggregation.count(NullPolicy.EXCLUDE), options)) { + 
ColumnVector result = v1.rollingWindow(RollingAggregation.count(NullPolicy.EXCLUDE), options)) { assertColumnsAreEqual(expected, result); } try (ColumnVector expected = ColumnVector.fromInts(2, 3, 3, 3, 2); - ColumnVector result = v1.rollingWindow(Aggregation.count(NullPolicy.INCLUDE), options)) { + ColumnVector result = v1.rollingWindow(RollingAggregation.count(NullPolicy.INCLUDE), options)) { assertColumnsAreEqual(expected, result); } } @@ -3125,7 +3117,7 @@ void testWindowDynamicNegative() { .minPeriods(2).window(precedingCol, followingCol).build()) { try (ColumnVector v1 = ColumnVector.fromInts(5, 4, 7, 6, 8); ColumnVector expected = ColumnVector.fromBoxedLongs(null, null, 9L, 16L, 25L); - ColumnVector result = v1.rollingWindow(Aggregation.sum(), window)) { + ColumnVector result = v1.rollingWindow(RollingAggregation.sum(), window)) { assertColumnsAreEqual(expected, result); } } @@ -3141,7 +3133,7 @@ void testWindowLag() { .window(two, negOne).build()) { try (ColumnVector v1 = ColumnVector.fromInts(5, 4, 7, 6, 8); ColumnVector expected = ColumnVector.fromBoxedInts(null, 5, 4, 7, 6); - ColumnVector result = v1.rollingWindow(Aggregation.max(), window)) { + ColumnVector result = v1.rollingWindow(RollingAggregation.max(), window)) { assertColumnsAreEqual(expected, result); } } @@ -3155,7 +3147,7 @@ void testWindowDynamic() { .window(precedingCol, followingCol).build()) { try (ColumnVector v1 = ColumnVector.fromInts(5, 4, 7, 6, 8); ColumnVector expected = ColumnVector.fromLongs(16, 22, 30, 14, 14); - ColumnVector result = v1.rollingWindow(Aggregation.sum(), window)) { + ColumnVector result = v1.rollingWindow(RollingAggregation.sum(), window)) { assertColumnsAreEqual(expected, result); } } @@ -3181,7 +3173,7 @@ void testWindowThrowsException() { .minPeriods(1) .orderByColumnIndex(0) .build()) { - arraywindowCol.rollingWindow(Aggregation.sum(), options); + arraywindowCol.rollingWindow(RollingAggregation.sum(), options); } }); } diff --git a/java/src/test/java/ai/rapids/cudf/HashJoinTest.java b/java/src/test/java/ai/rapids/cudf/HashJoinTest.java new file mode 100644 index 00000000000..be6125340ec --- /dev/null +++ b/java/src/test/java/ai/rapids/cudf/HashJoinTest.java @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package ai.rapids.cudf; + +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class HashJoinTest { + @Test + void testGetNumberOfColumns() { + try (Table t = new Table.TestBuilder().column(1, 2).column(3, 4).column(5, 6).build(); + HashJoin hashJoin = new HashJoin(t, false)) { + assertEquals(3, hashJoin.getNumberOfColumns()); + } + } + + @Test + void testGetCompareNulls() { + try (Table t = new Table.TestBuilder().column(1, 2, 3, 4).column(5, 6, 7, 8).build()) { + try (HashJoin hashJoin = new HashJoin(t, false)) { + assertFalse(hashJoin.getCompareNulls()); + } + try (HashJoin hashJoin = new HashJoin(t, true)) { + assertTrue(hashJoin.getCompareNulls()); + } + } + } +} diff --git a/java/src/test/java/ai/rapids/cudf/ReductionTest.java b/java/src/test/java/ai/rapids/cudf/ReductionTest.java index 17b9ec3556f..2b26597c8f7 100644 --- a/java/src/test/java/ai/rapids/cudf/ReductionTest.java +++ b/java/src/test/java/ai/rapids/cudf/ReductionTest.java @@ -43,17 +43,17 @@ class ReductionTest extends CudfTestBase { Aggregation.Kind.ANY, Aggregation.Kind.ALL); - private static Scalar buildExpectedScalar(Aggregation op, DType baseType, Object expectedObject) { + private static Scalar buildExpectedScalar(ReductionAggregation op, DType baseType, Object expectedObject) { if (expectedObject == null) { return Scalar.fromNull(baseType); } - if (FLOAT_REDUCTIONS.contains(op.kind)) { + if (FLOAT_REDUCTIONS.contains(op.getWrapped().kind)) { if (baseType.equals(DType.FLOAT32)) { return Scalar.fromFloat((Float) expectedObject); } return Scalar.fromDouble((Double) expectedObject); } - if (BOOL_REDUCTIONS.contains(op.kind)) { + if (BOOL_REDUCTIONS.contains(op.getWrapped().kind)) { return Scalar.fromBool((Boolean) expectedObject); } switch (baseType.typeId) { @@ -88,165 +88,165 @@ private static Scalar buildExpectedScalar(Aggregation op, DType baseType, Object private static Stream createBooleanParams() { Boolean[] vals = new Boolean[]{true, true, null, false, true, false, null}; return Stream.of( - Arguments.of(Aggregation.sum(), new Boolean[0], null, 0.), - Arguments.of(Aggregation.sum(), new Boolean[]{null, null, null}, null, 0.), - Arguments.of(Aggregation.sum(), vals, true, 0.), - Arguments.of(Aggregation.min(), vals, false, 0.), - Arguments.of(Aggregation.max(), vals, true, 0.), - Arguments.of(Aggregation.product(), vals, false, 0.), - Arguments.of(Aggregation.sumOfSquares(), vals, true, 0.), - Arguments.of(Aggregation.mean(), vals, 0.6, DELTAD), - Arguments.of(Aggregation.standardDeviation(), vals, 0.5477225575051662, DELTAD), - Arguments.of(Aggregation.variance(), vals, 0.3, DELTAD), - Arguments.of(Aggregation.any(), vals, true, 0.), - Arguments.of(Aggregation.all(), vals, false, 0.) 
+ Arguments.of(ReductionAggregation.sum(), new Boolean[0], null, 0.), + Arguments.of(ReductionAggregation.sum(), new Boolean[]{null, null, null}, null, 0.), + Arguments.of(ReductionAggregation.sum(), vals, true, 0.), + Arguments.of(ReductionAggregation.min(), vals, false, 0.), + Arguments.of(ReductionAggregation.max(), vals, true, 0.), + Arguments.of(ReductionAggregation.product(), vals, false, 0.), + Arguments.of(ReductionAggregation.sumOfSquares(), vals, true, 0.), + Arguments.of(ReductionAggregation.mean(), vals, 0.6, DELTAD), + Arguments.of(ReductionAggregation.standardDeviation(), vals, 0.5477225575051662, DELTAD), + Arguments.of(ReductionAggregation.variance(), vals, 0.3, DELTAD), + Arguments.of(ReductionAggregation.any(), vals, true, 0.), + Arguments.of(ReductionAggregation.all(), vals, false, 0.) ); } private static Stream createByteParams() { Byte[] vals = new Byte[]{-1, 7, 123, null, 50, 60, 100}; return Stream.of( - Arguments.of(Aggregation.sum(), new Byte[0], null, 0.), - Arguments.of(Aggregation.sum(), new Byte[]{null, null, null}, null, 0.), - Arguments.of(Aggregation.sum(), vals, (byte) 83, 0.), - Arguments.of(Aggregation.min(), vals, (byte) -1, 0.), - Arguments.of(Aggregation.max(), vals, (byte) 123, 0.), - Arguments.of(Aggregation.product(), vals, (byte) 160, 0.), - Arguments.of(Aggregation.sumOfSquares(), vals, (byte) 47, 0.), - Arguments.of(Aggregation.mean(), vals, 56.5, DELTAD), - Arguments.of(Aggregation.standardDeviation(), vals, 49.24530434467839, DELTAD), - Arguments.of(Aggregation.variance(), vals, 2425.1, DELTAD), - Arguments.of(Aggregation.any(), vals, true, 0.), - Arguments.of(Aggregation.all(), vals, true, 0.) + Arguments.of(ReductionAggregation.sum(), new Byte[0], null, 0.), + Arguments.of(ReductionAggregation.sum(), new Byte[]{null, null, null}, null, 0.), + Arguments.of(ReductionAggregation.sum(), vals, (byte) 83, 0.), + Arguments.of(ReductionAggregation.min(), vals, (byte) -1, 0.), + Arguments.of(ReductionAggregation.max(), vals, (byte) 123, 0.), + Arguments.of(ReductionAggregation.product(), vals, (byte) 160, 0.), + Arguments.of(ReductionAggregation.sumOfSquares(), vals, (byte) 47, 0.), + Arguments.of(ReductionAggregation.mean(), vals, 56.5, DELTAD), + Arguments.of(ReductionAggregation.standardDeviation(), vals, 49.24530434467839, DELTAD), + Arguments.of(ReductionAggregation.variance(), vals, 2425.1, DELTAD), + Arguments.of(ReductionAggregation.any(), vals, true, 0.), + Arguments.of(ReductionAggregation.all(), vals, true, 0.) ); } private static Stream createShortParams() { Short[] vals = new Short[]{-1, 7, 123, null, 50, 60, 100}; return Stream.of( - Arguments.of(Aggregation.sum(), new Short[0], null, 0.), - Arguments.of(Aggregation.sum(), new Short[]{null, null, null}, null, 0.), - Arguments.of(Aggregation.sum(), vals, (short) 339, 0.), - Arguments.of(Aggregation.min(), vals, (short) -1, 0.), - Arguments.of(Aggregation.max(), vals, (short) 123, 0.), - Arguments.of(Aggregation.product(), vals, (short) -22624, 0.), - Arguments.of(Aggregation.sumOfSquares(), vals, (short) 31279, 0.), - Arguments.of(Aggregation.mean(), vals, 56.5, DELTAD), - Arguments.of(Aggregation.standardDeviation(), vals, 49.24530434467839, DELTAD), - Arguments.of(Aggregation.variance(), vals, 2425.1, DELTAD), - Arguments.of(Aggregation.any(), vals, true, 0.), - Arguments.of(Aggregation.all(), vals, true, 0.) 
+ Arguments.of(ReductionAggregation.sum(), new Short[0], null, 0.), + Arguments.of(ReductionAggregation.sum(), new Short[]{null, null, null}, null, 0.), + Arguments.of(ReductionAggregation.sum(), vals, (short) 339, 0.), + Arguments.of(ReductionAggregation.min(), vals, (short) -1, 0.), + Arguments.of(ReductionAggregation.max(), vals, (short) 123, 0.), + Arguments.of(ReductionAggregation.product(), vals, (short) -22624, 0.), + Arguments.of(ReductionAggregation.sumOfSquares(), vals, (short) 31279, 0.), + Arguments.of(ReductionAggregation.mean(), vals, 56.5, DELTAD), + Arguments.of(ReductionAggregation.standardDeviation(), vals, 49.24530434467839, DELTAD), + Arguments.of(ReductionAggregation.variance(), vals, 2425.1, DELTAD), + Arguments.of(ReductionAggregation.any(), vals, true, 0.), + Arguments.of(ReductionAggregation.all(), vals, true, 0.) ); } private static Stream createIntParams() { Integer[] vals = new Integer[]{-1, 7, 123, null, 50, 60, 100}; return Stream.of( - Arguments.of(Aggregation.sum(), new Integer[0], null, 0.), - Arguments.of(Aggregation.sum(), new Integer[]{null, null, null}, null, 0.), - Arguments.of(Aggregation.sum(), vals, 339, 0.), - Arguments.of(Aggregation.min(), vals, -1, 0.), - Arguments.of(Aggregation.max(), vals, 123, 0.), - Arguments.of(Aggregation.product(), vals, -258300000, 0.), - Arguments.of(Aggregation.sumOfSquares(), vals, 31279, 0.), - Arguments.of(Aggregation.mean(), vals, 56.5, DELTAD), - Arguments.of(Aggregation.standardDeviation(), vals, 49.24530434467839, DELTAD), - Arguments.of(Aggregation.variance(), vals, 2425.1, DELTAD), - Arguments.of(Aggregation.any(), vals, true, 0.), - Arguments.of(Aggregation.all(), vals, true, 0.) + Arguments.of(ReductionAggregation.sum(), new Integer[0], null, 0.), + Arguments.of(ReductionAggregation.sum(), new Integer[]{null, null, null}, null, 0.), + Arguments.of(ReductionAggregation.sum(), vals, 339, 0.), + Arguments.of(ReductionAggregation.min(), vals, -1, 0.), + Arguments.of(ReductionAggregation.max(), vals, 123, 0.), + Arguments.of(ReductionAggregation.product(), vals, -258300000, 0.), + Arguments.of(ReductionAggregation.sumOfSquares(), vals, 31279, 0.), + Arguments.of(ReductionAggregation.mean(), vals, 56.5, DELTAD), + Arguments.of(ReductionAggregation.standardDeviation(), vals, 49.24530434467839, DELTAD), + Arguments.of(ReductionAggregation.variance(), vals, 2425.1, DELTAD), + Arguments.of(ReductionAggregation.any(), vals, true, 0.), + Arguments.of(ReductionAggregation.all(), vals, true, 0.) 
); } private static Stream createLongParams() { Long[] vals = new Long[]{-1L, 7L, 123L, null, 50L, 60L, 100L}; return Stream.of( - Arguments.of(Aggregation.sum(), new Long[0], null, 0.), - Arguments.of(Aggregation.sum(), new Long[]{null, null, null}, null, 0.), - Arguments.of(Aggregation.sum(), vals, 339L, 0.), - Arguments.of(Aggregation.min(), vals, -1L, 0.), - Arguments.of(Aggregation.max(), vals, 123L, 0.), - Arguments.of(Aggregation.product(), vals, -258300000L, 0.), - Arguments.of(Aggregation.sumOfSquares(), vals, 31279L, 0.), - Arguments.of(Aggregation.mean(), vals, 56.5, DELTAD), - Arguments.of(Aggregation.standardDeviation(), vals, 49.24530434467839, DELTAD), - Arguments.of(Aggregation.variance(), vals, 2425.1, DELTAD), - Arguments.of(Aggregation.any(), vals, true, 0.), - Arguments.of(Aggregation.all(), vals, true, 0.), - Arguments.of(Aggregation.quantile(0.5), vals, 55.0, DELTAD), - Arguments.of(Aggregation.quantile(0.9), vals, 111.5, DELTAD) + Arguments.of(ReductionAggregation.sum(), new Long[0], null, 0.), + Arguments.of(ReductionAggregation.sum(), new Long[]{null, null, null}, null, 0.), + Arguments.of(ReductionAggregation.sum(), vals, 339L, 0.), + Arguments.of(ReductionAggregation.min(), vals, -1L, 0.), + Arguments.of(ReductionAggregation.max(), vals, 123L, 0.), + Arguments.of(ReductionAggregation.product(), vals, -258300000L, 0.), + Arguments.of(ReductionAggregation.sumOfSquares(), vals, 31279L, 0.), + Arguments.of(ReductionAggregation.mean(), vals, 56.5, DELTAD), + Arguments.of(ReductionAggregation.standardDeviation(), vals, 49.24530434467839, DELTAD), + Arguments.of(ReductionAggregation.variance(), vals, 2425.1, DELTAD), + Arguments.of(ReductionAggregation.any(), vals, true, 0.), + Arguments.of(ReductionAggregation.all(), vals, true, 0.), + Arguments.of(ReductionAggregation.quantile(0.5), vals, 55.0, DELTAD), + Arguments.of(ReductionAggregation.quantile(0.9), vals, 111.5, DELTAD) ); } private static Stream createFloatParams() { Float[] vals = new Float[]{-1f, 7f, 123f, null, 50f, 60f, 100f}; return Stream.of( - Arguments.of(Aggregation.sum(), new Float[0], null, 0f), - Arguments.of(Aggregation.sum(), new Float[]{null, null, null}, null, 0f), - Arguments.of(Aggregation.sum(), vals, 339f, 0f), - Arguments.of(Aggregation.min(), vals, -1f, 0f), - Arguments.of(Aggregation.max(), vals, 123f, 0f), - Arguments.of(Aggregation.product(), vals, -258300000f, 0f), - Arguments.of(Aggregation.sumOfSquares(), vals, 31279f, 0f), - Arguments.of(Aggregation.mean(), vals, 56.5f, DELTAF), - Arguments.of(Aggregation.standardDeviation(), vals, 49.24530434467839f, DELTAF), - Arguments.of(Aggregation.variance(), vals, 2425.1f, DELTAF), - Arguments.of(Aggregation.any(), vals, true, 0f), - Arguments.of(Aggregation.all(), vals, true, 0f) + Arguments.of(ReductionAggregation.sum(), new Float[0], null, 0f), + Arguments.of(ReductionAggregation.sum(), new Float[]{null, null, null}, null, 0f), + Arguments.of(ReductionAggregation.sum(), vals, 339f, 0f), + Arguments.of(ReductionAggregation.min(), vals, -1f, 0f), + Arguments.of(ReductionAggregation.max(), vals, 123f, 0f), + Arguments.of(ReductionAggregation.product(), vals, -258300000f, 0f), + Arguments.of(ReductionAggregation.sumOfSquares(), vals, 31279f, 0f), + Arguments.of(ReductionAggregation.mean(), vals, 56.5f, DELTAF), + Arguments.of(ReductionAggregation.standardDeviation(), vals, 49.24530434467839f, DELTAF), + Arguments.of(ReductionAggregation.variance(), vals, 2425.1f, DELTAF), + Arguments.of(ReductionAggregation.any(), vals, true, 0f), + 
Arguments.of(ReductionAggregation.all(), vals, true, 0f) ); } private static Stream createDoubleParams() { Double[] vals = new Double[]{-1., 7., 123., null, 50., 60., 100.}; return Stream.of( - Arguments.of(Aggregation.sum(), new Double[0], null, 0.), - Arguments.of(Aggregation.sum(), new Double[]{null, null, null}, null, 0.), - Arguments.of(Aggregation.sum(), vals, 339., 0.), - Arguments.of(Aggregation.min(), vals, -1., 0.), - Arguments.of(Aggregation.max(), vals, 123., 0.), - Arguments.of(Aggregation.product(), vals, -258300000., 0.), - Arguments.of(Aggregation.sumOfSquares(), vals, 31279., 0.), - Arguments.of(Aggregation.mean(), vals, 56.5, DELTAD), - Arguments.of(Aggregation.standardDeviation(), vals, 49.24530434467839, DELTAD), - Arguments.of(Aggregation.variance(), vals, 2425.1, DELTAD), - Arguments.of(Aggregation.any(), vals, true, 0.), - Arguments.of(Aggregation.all(), vals, true, 0.), - Arguments.of(Aggregation.quantile(0.5), vals, 55.0, DELTAD), - Arguments.of(Aggregation.quantile(0.9), vals, 111.5, DELTAD) + Arguments.of(ReductionAggregation.sum(), new Double[0], null, 0.), + Arguments.of(ReductionAggregation.sum(), new Double[]{null, null, null}, null, 0.), + Arguments.of(ReductionAggregation.sum(), vals, 339., 0.), + Arguments.of(ReductionAggregation.min(), vals, -1., 0.), + Arguments.of(ReductionAggregation.max(), vals, 123., 0.), + Arguments.of(ReductionAggregation.product(), vals, -258300000., 0.), + Arguments.of(ReductionAggregation.sumOfSquares(), vals, 31279., 0.), + Arguments.of(ReductionAggregation.mean(), vals, 56.5, DELTAD), + Arguments.of(ReductionAggregation.standardDeviation(), vals, 49.24530434467839, DELTAD), + Arguments.of(ReductionAggregation.variance(), vals, 2425.1, DELTAD), + Arguments.of(ReductionAggregation.any(), vals, true, 0.), + Arguments.of(ReductionAggregation.all(), vals, true, 0.), + Arguments.of(ReductionAggregation.quantile(0.5), vals, 55.0, DELTAD), + Arguments.of(ReductionAggregation.quantile(0.9), vals, 111.5, DELTAD) ); } private static Stream createTimestampDaysParams() { Integer[] vals = new Integer[]{-1, 7, 123, null, 50, 60, 100}; return Stream.of( - Arguments.of(Aggregation.max(), new Integer[0], null), - Arguments.of(Aggregation.max(), new Integer[]{null, null, null}, null), - Arguments.of(Aggregation.max(), vals, 123), - Arguments.of(Aggregation.min(), vals, -1) + Arguments.of(ReductionAggregation.max(), new Integer[0], null), + Arguments.of(ReductionAggregation.max(), new Integer[]{null, null, null}, null), + Arguments.of(ReductionAggregation.max(), vals, 123), + Arguments.of(ReductionAggregation.min(), vals, -1) ); } private static Stream createTimestampResolutionParams() { Long[] vals = new Long[]{-1L, 7L, 123L, null, 50L, 60L, 100L}; return Stream.of( - Arguments.of(Aggregation.max(), new Long[0], null), - Arguments.of(Aggregation.max(), new Long[]{null, null, null}, null), - Arguments.of(Aggregation.min(), vals, -1L), - Arguments.of(Aggregation.max(), vals, 123L) + Arguments.of(ReductionAggregation.max(), new Long[0], null), + Arguments.of(ReductionAggregation.max(), new Long[]{null, null, null}, null), + Arguments.of(ReductionAggregation.min(), vals, -1L), + Arguments.of(ReductionAggregation.max(), vals, 123L) ); } - private static void assertEqualsDelta(Aggregation op, Scalar expected, Scalar result, + private static void assertEqualsDelta(ReductionAggregation op, Scalar expected, Scalar result, Double percentage) { - if (FLOAT_REDUCTIONS.contains(op.kind)) { + if (FLOAT_REDUCTIONS.contains(op.getWrapped().kind)) { 
assertEqualsWithinPercentage(expected.getDouble(), result.getDouble(), percentage); } else { assertEquals(expected, result); } } - private static void assertEqualsDelta(Aggregation op, Scalar expected, Scalar result, + private static void assertEqualsDelta(ReductionAggregation op, Scalar expected, Scalar result, Float percentage) { - if (FLOAT_REDUCTIONS.contains(op.kind)) { + if (FLOAT_REDUCTIONS.contains(op.getWrapped().kind)) { assertEqualsWithinPercentage(expected.getFloat(), result.getFloat(), percentage); } else { assertEquals(expected, result); @@ -255,7 +255,7 @@ private static void assertEqualsDelta(Aggregation op, Scalar expected, Scalar re @ParameterizedTest @MethodSource("createBooleanParams") - void testBoolean(Aggregation op, Boolean[] values, Object expectedObject, Double delta) { + void testBoolean(ReductionAggregation op, Boolean[] values, Object expectedObject, Double delta) { try (Scalar expected = buildExpectedScalar(op, DType.BOOL8, expectedObject); ColumnVector v = ColumnVector.fromBoxedBooleans(values); Scalar result = v.reduce(op, expected.getType())) { @@ -265,7 +265,7 @@ void testBoolean(Aggregation op, Boolean[] values, Object expectedObject, Double @ParameterizedTest @MethodSource("createByteParams") - void testByte(Aggregation op, Byte[] values, Object expectedObject, Double delta) { + void testByte(ReductionAggregation op, Byte[] values, Object expectedObject, Double delta) { try (Scalar expected = buildExpectedScalar(op, DType.INT8, expectedObject); ColumnVector v = ColumnVector.fromBoxedBytes(values); Scalar result = v.reduce(op, expected.getType())) { @@ -275,7 +275,7 @@ void testByte(Aggregation op, Byte[] values, Object expectedObject, Double delta @ParameterizedTest @MethodSource("createShortParams") - void testShort(Aggregation op, Short[] values, Object expectedObject, Double delta) { + void testShort(ReductionAggregation op, Short[] values, Object expectedObject, Double delta) { try (Scalar expected = buildExpectedScalar(op, DType.INT16, expectedObject); ColumnVector v = ColumnVector.fromBoxedShorts(values); Scalar result = v.reduce(op, expected.getType())) { @@ -285,7 +285,7 @@ void testShort(Aggregation op, Short[] values, Object expectedObject, Double del @ParameterizedTest @MethodSource("createIntParams") - void testInt(Aggregation op, Integer[] values, Object expectedObject, Double delta) { + void testInt(ReductionAggregation op, Integer[] values, Object expectedObject, Double delta) { try (Scalar expected = buildExpectedScalar(op, DType.INT32, expectedObject); ColumnVector v = ColumnVector.fromBoxedInts(values); Scalar result = v.reduce(op, expected.getType())) { @@ -295,7 +295,7 @@ void testInt(Aggregation op, Integer[] values, Object expectedObject, Double del @ParameterizedTest @MethodSource("createLongParams") - void testLong(Aggregation op, Long[] values, Object expectedObject, Double delta) { + void testLong(ReductionAggregation op, Long[] values, Object expectedObject, Double delta) { try (Scalar expected = buildExpectedScalar(op, DType.INT64, expectedObject); ColumnVector v = ColumnVector.fromBoxedLongs(values); Scalar result = v.reduce(op, expected.getType())) { @@ -305,7 +305,7 @@ void testLong(Aggregation op, Long[] values, Object expectedObject, Double delta @ParameterizedTest @MethodSource("createFloatParams") - void testFloat(Aggregation op, Float[] values, Object expectedObject, Float delta) { + void testFloat(ReductionAggregation op, Float[] values, Object expectedObject, Float delta) { try (Scalar expected = 
buildExpectedScalar(op, DType.FLOAT32, expectedObject); ColumnVector v = ColumnVector.fromBoxedFloats(values); Scalar result = v.reduce(op, expected.getType())) { @@ -315,7 +315,7 @@ void testFloat(Aggregation op, Float[] values, Object expectedObject, Float delt @ParameterizedTest @MethodSource("createDoubleParams") - void testDouble(Aggregation op, Double[] values, Object expectedObject, Double delta) { + void testDouble(ReductionAggregation op, Double[] values, Object expectedObject, Double delta) { try (Scalar expected = buildExpectedScalar(op, DType.FLOAT64, expectedObject); ColumnVector v = ColumnVector.fromBoxedDoubles(values); Scalar result = v.reduce(op, expected.getType())) { @@ -325,7 +325,7 @@ void testDouble(Aggregation op, Double[] values, Object expectedObject, Double d @ParameterizedTest @MethodSource("createTimestampDaysParams") - void testTimestampDays(Aggregation op, Integer[] values, Object expectedObject) { + void testTimestampDays(ReductionAggregation op, Integer[] values, Object expectedObject) { try (Scalar expected = buildExpectedScalar(op, DType.TIMESTAMP_DAYS, expectedObject); ColumnVector v = ColumnVector.timestampDaysFromBoxedInts(values); Scalar result = v.reduce(op, expected.getType())) { @@ -335,7 +335,7 @@ void testTimestampDays(Aggregation op, Integer[] values, Object expectedObject) @ParameterizedTest @MethodSource("createTimestampResolutionParams") - void testTimestampSeconds(Aggregation op, Long[] values, Object expectedObject) { + void testTimestampSeconds(ReductionAggregation op, Long[] values, Object expectedObject) { try (Scalar expected = buildExpectedScalar(op, DType.TIMESTAMP_SECONDS, expectedObject); ColumnVector v = ColumnVector.timestampSecondsFromBoxedLongs(values); Scalar result = v.reduce(op, expected.getType())) { @@ -345,7 +345,7 @@ void testTimestampSeconds(Aggregation op, Long[] values, Object expectedObject) @ParameterizedTest @MethodSource("createTimestampResolutionParams") - void testTimestampMilliseconds(Aggregation op, Long[] values, Object expectedObject) { + void testTimestampMilliseconds(ReductionAggregation op, Long[] values, Object expectedObject) { try (Scalar expected = buildExpectedScalar(op, DType.TIMESTAMP_MILLISECONDS, expectedObject); ColumnVector v = ColumnVector.timestampMilliSecondsFromBoxedLongs(values); Scalar result = v.reduce(op, expected.getType())) { @@ -355,7 +355,7 @@ void testTimestampMilliseconds(Aggregation op, Long[] values, Object expectedObj @ParameterizedTest @MethodSource("createTimestampResolutionParams") - void testTimestampMicroseconds(Aggregation op, Long[] values, Object expectedObject) { + void testTimestampMicroseconds(ReductionAggregation op, Long[] values, Object expectedObject) { try (Scalar expected = buildExpectedScalar(op, DType.TIMESTAMP_MICROSECONDS, expectedObject); ColumnVector v = ColumnVector.timestampMicroSecondsFromBoxedLongs(values); Scalar result = v.reduce(op, expected.getType())) { @@ -365,7 +365,7 @@ void testTimestampMicroseconds(Aggregation op, Long[] values, Object expectedObj @ParameterizedTest @MethodSource("createTimestampResolutionParams") - void testTimestampNanoseconds(Aggregation op, Long[] values, Object expectedObject) { + void testTimestampNanoseconds(ReductionAggregation op, Long[] values, Object expectedObject) { try (Scalar expected = buildExpectedScalar(op, DType.TIMESTAMP_NANOSECONDS, expectedObject); ColumnVector v = ColumnVector.timestampNanoSecondsFromBoxedLongs(values); Scalar result = v.reduce(op, expected.getType())) { diff --git 
a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 360f3c04f5b..cc030c392cb 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -25,11 +25,17 @@ import ai.rapids.cudf.HostColumnVector.StructData; import ai.rapids.cudf.HostColumnVector.StructType; -import ai.rapids.cudf.ast.BinaryExpression; +import ai.rapids.cudf.ast.BinaryOperation; import ai.rapids.cudf.ast.BinaryOperator; import ai.rapids.cudf.ast.ColumnReference; import ai.rapids.cudf.ast.CompiledExpression; import ai.rapids.cudf.ast.TableReference; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.util.HadoopInputFile; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.OriginalType; import org.junit.jupiter.api.Test; import java.io.ByteArrayInputStream; @@ -43,18 +49,14 @@ import java.nio.ByteBuffer; import java.nio.charset.StandardCharsets; import java.nio.file.Files; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Comparator; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Map; +import java.util.*; import java.util.stream.Collectors; +import static ai.rapids.cudf.ParquetColumnWriterOptions.mapColumn; import static ai.rapids.cudf.ParquetWriterOptions.listBuilder; import static ai.rapids.cudf.ParquetWriterOptions.structBuilder; import static ai.rapids.cudf.Table.TestBuilder; +import static ai.rapids.cudf.Table.removeNullMasksIfNeeded; import static org.junit.jupiter.api.Assertions.assertArrayEquals; import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; import static org.junit.jupiter.api.Assertions.assertEquals; @@ -104,7 +106,7 @@ public static void assertColumnsAreEqual(ColumnView expect, ColumnView cv) { * @param colName The name of the column */ public static void assertColumnsAreEqual(ColumnView expected, ColumnView cv, String colName) { - assertPartialColumnsAreEqual(expected, 0, expected.getRowCount(), cv, colName, true); + assertPartialColumnsAreEqual(expected, 0, expected.getRowCount(), cv, colName, true, false); } /** @@ -114,7 +116,7 @@ public static void assertColumnsAreEqual(ColumnView expected, ColumnView cv, Str * @param colName The name of the host column */ public static void assertColumnsAreEqual(HostColumnVector expected, HostColumnVector cv, String colName) { - assertPartialColumnsAreEqual(expected, 0, expected.getRowCount(), cv, colName, true); + assertPartialColumnsAreEqual(expected, 0, expected.getRowCount(), cv, colName, true, false); } /** @@ -123,7 +125,7 @@ public static void assertColumnsAreEqual(HostColumnVector expected, HostColumnVe * @param cv The input Struct column */ public static void assertStructColumnsAreEqual(ColumnView expected, ColumnView cv) { - assertPartialStructColumnsAreEqual(expected, 0, expected.getRowCount(), cv, "unnamed", true); + assertPartialStructColumnsAreEqual(expected, 0, expected.getRowCount(), cv, "unnamed", true, false); } /** @@ -133,13 +135,14 @@ public static void assertStructColumnsAreEqual(ColumnView expected, ColumnView c * @param length The number of rows to consider * @param cv The input Struct column * @param colName The name of the column - * @param enableNullCheck Whether to check for nulls in the Struct column + * @param enableNullCountCheck Whether to check for nulls in the Struct column + * @param 
enableNullabilityCheck Whether the table have a validity mask */ public static void assertPartialStructColumnsAreEqual(ColumnView expected, long rowOffset, long length, - ColumnView cv, String colName, boolean enableNullCheck) { + ColumnView cv, String colName, boolean enableNullCountCheck, boolean enableNullabilityCheck) { try (HostColumnVector hostExpected = expected.copyToHost(); HostColumnVector hostcv = cv.copyToHost()) { - assertPartialColumnsAreEqual(hostExpected, rowOffset, length, hostcv, colName, enableNullCheck); + assertPartialColumnsAreEqual(hostExpected, rowOffset, length, hostcv, colName, enableNullCountCheck, enableNullabilityCheck); } } @@ -149,12 +152,13 @@ public static void assertPartialStructColumnsAreEqual(ColumnView expected, long * @param cv The input column * @param colName The name of the column * @param enableNullCheck Whether to check for nulls in the column + * @param enableNullabilityCheck Whether the table have a validity mask */ public static void assertPartialColumnsAreEqual(ColumnView expected, long rowOffset, long length, - ColumnView cv, String colName, boolean enableNullCheck) { + ColumnView cv, String colName, boolean enableNullCheck, boolean enableNullabilityCheck) { try (HostColumnVector hostExpected = expected.copyToHost(); HostColumnVector hostcv = cv.copyToHost()) { - assertPartialColumnsAreEqual(hostExpected, rowOffset, length, hostcv, colName, enableNullCheck); + assertPartialColumnsAreEqual(hostExpected, rowOffset, length, hostcv, colName, enableNullCheck, enableNullabilityCheck); } } @@ -165,18 +169,21 @@ public static void assertPartialColumnsAreEqual(ColumnView expected, long rowOff * @param length number of rows from starting offset * @param cv The input host column * @param colName The name of the host column - * @param enableNullCheck Whether to check for nulls in the host column + * @param enableNullCountCheck Whether to check for nulls in the host column */ public static void assertPartialColumnsAreEqual(HostColumnVectorCore expected, long rowOffset, long length, - HostColumnVectorCore cv, String colName, boolean enableNullCheck) { + HostColumnVectorCore cv, String colName, boolean enableNullCountCheck, boolean enableNullabilityCheck) { assertEquals(expected.getType(), cv.getType(), "Type For Column " + colName); assertEquals(length, cv.getRowCount(), "Row Count For Column " + colName); assertEquals(expected.getNumChildren(), cv.getNumChildren(), "Child Count for Column " + colName); - if (enableNullCheck) { + if (enableNullCountCheck) { assertEquals(expected.getNullCount(), cv.getNullCount(), "Null Count For Column " + colName); } else { // TODO add in a proper check when null counts are supported by serializing a partitioned column } + if (enableNullabilityCheck) { + assertEquals(expected.hasValidityVector(), cv.hasValidityVector(), "Column nullability is different than expected"); + } DType type = expected.getType(); for (long expectedRow = rowOffset; expectedRow < (rowOffset + length); expectedRow++) { long tableRow = expectedRow - rowOffset; @@ -262,7 +269,7 @@ public static void assertPartialColumnsAreEqual(HostColumnVectorCore expected, l } assertPartialColumnsAreEqual(expected.getNestedChildren().get(0), expectedChildRowOffset, numChildRows, cv.getNestedChildren().get(0), colName + " list child", - enableNullCheck); + enableNullCountCheck, enableNullabilityCheck); break; case STRUCT: List expectedChildren = expected.getNestedChildren(); @@ -273,7 +280,7 @@ public static void assertPartialColumnsAreEqual(HostColumnVectorCore 
expected, l String childName = colName + " child " + i; assertEquals(length, cvChild.getRowCount(), "Row Count for Column " + colName); assertPartialColumnsAreEqual(expectedChild, rowOffset, length, cvChild, - colName, enableNullCheck); + colName, enableNullCountCheck, enableNullabilityCheck); } break; default: @@ -289,9 +296,10 @@ public static void assertPartialColumnsAreEqual(HostColumnVectorCore expected, l * @param length the number of rows to check * @param table the input table to compare against expected * @param enableNullCheck whether to check for nulls or not + * @param enableNullabilityCheck whether the table have a validity mask */ public static void assertPartialTablesAreEqual(Table expected, long rowOffset, long length, Table table, - boolean enableNullCheck) { + boolean enableNullCheck, boolean enableNullabilityCheck) { assertEquals(expected.getNumberOfColumns(), table.getNumberOfColumns()); assertEquals(length, table.getRowCount(), "ROW COUNT"); for (int col = 0; col < expected.getNumberOfColumns(); col++) { @@ -301,7 +309,7 @@ public static void assertPartialTablesAreEqual(Table expected, long rowOffset, l if (rowOffset != 0 || length != expected.getRowCount()) { name = name + " PART " + rowOffset + "-" + (rowOffset + length - 1); } - assertPartialColumnsAreEqual(expect, rowOffset, length, cv, name, enableNullCheck); + assertPartialColumnsAreEqual(expect, rowOffset, length, cv, name, enableNullCheck, enableNullabilityCheck); } } @@ -311,7 +319,7 @@ public static void assertPartialTablesAreEqual(Table expected, long rowOffset, l * @param table the input table to compare against expected */ public static void assertTablesAreEqual(Table expected, Table table) { - assertPartialTablesAreEqual(expected, 0, expected.getRowCount(), table, true); + assertPartialTablesAreEqual(expected, 0, expected.getRowCount(), table, true, false); } void assertTablesHaveSameValues(HashMap[] expectedTable, Table table) { @@ -1489,20 +1497,118 @@ void testLeftJoinGatherMapsNulls() { } } + @Test + void testLeftHashJoinGatherMaps() { + final int inv = Integer.MIN_VALUE; + try (Table leftKeys = new Table.TestBuilder().column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8).build(); + Table rightKeys = new Table.TestBuilder().column(6, 5, 9, 8, 10, 32).build(); + HashJoin rightHash = new HashJoin(rightKeys, false); + Table expected = new Table.TestBuilder() + .column( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9) + .column(inv, inv, 2, inv, inv, inv, inv, 0, 1, 3) + .build()) { + GatherMap[] maps = leftKeys.leftJoinGatherMaps(rightHash); + try { + verifyJoinGatherMaps(maps, expected); + } finally { + for (GatherMap map : maps) { + map.close(); + } + } + } + } + + @Test + void testLeftHashJoinGatherMapsWithCount() { + final int inv = Integer.MIN_VALUE; + try (Table leftKeys = new Table.TestBuilder().column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8).build(); + Table rightKeys = new Table.TestBuilder().column(6, 5, 9, 8, 10, 32).build(); + HashJoin rightHash = new HashJoin(rightKeys, false); + Table expected = new Table.TestBuilder() + .column( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9) + .column(inv, inv, 2, inv, inv, inv, inv, 0, 1, 3) + .build()) { + long rowCount = leftKeys.leftJoinRowCount(rightHash); + assertEquals(expected.getRowCount(), rowCount); + GatherMap[] maps = leftKeys.leftJoinGatherMaps(rightHash, rowCount); + try { + verifyJoinGatherMaps(maps, expected); + } finally { + for (GatherMap map : maps) { + map.close(); + } + } + } + } + + @Test + void testLeftHashJoinGatherMapsNulls() { + final int inv = Integer.MIN_VALUE; + try (Table leftKeys = new 
Table.TestBuilder() + .column(2, 3, 9, 0, 1, 7, 4, null, null, 8) + .build(); + Table rightKeys = new Table.TestBuilder() + .column(null, null, 9, 8, 10, 32) + .build(); + HashJoin rightHash = new HashJoin(rightKeys, true); + Table expected = new Table.TestBuilder() + .column( 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, 8, 9) // left + .column(inv, inv, 2, inv, inv, inv, inv, 0, 1, 0, 1, 3) // right + .build()) { + GatherMap[] maps = leftKeys.leftJoinGatherMaps(rightHash); + try { + verifyJoinGatherMaps(maps, expected); + } finally { + for (GatherMap map : maps) { + map.close(); + } + } + } + } + + @Test + void testLeftHashJoinGatherMapsNullsWithCount() { + final int inv = Integer.MIN_VALUE; + try (Table leftKeys = new Table.TestBuilder() + .column(2, 3, 9, 0, 1, 7, 4, null, null, 8) + .build(); + Table rightKeys = new Table.TestBuilder() + .column(null, null, 9, 8, 10, 32) + .build(); + HashJoin rightHash = new HashJoin(rightKeys,true); + Table expected = new Table.TestBuilder() + .column( 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, 8, 9) // left + .column(inv, inv, 2, inv, inv, inv, inv, 0, 1, 0, 1, 3) // right + .build()) { + long rowCount = leftKeys.leftJoinRowCount(rightHash); + assertEquals(expected.getRowCount(), rowCount); + GatherMap[] maps = leftKeys.leftJoinGatherMaps(rightHash, rowCount); + try { + verifyJoinGatherMaps(maps, expected); + } finally { + for (GatherMap map : maps) { + map.close(); + } + } + } + } + @Test void testConditionalLeftJoinGatherMaps() { final int inv = Integer.MIN_VALUE; - BinaryExpression expr = new BinaryExpression(BinaryOperator.GREATER, + BinaryOperation expr = new BinaryOperation(BinaryOperator.GREATER, new ColumnReference(0, TableReference.LEFT), new ColumnReference(0, TableReference.RIGHT)); try (Table left = new Table.TestBuilder().column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8).build(); - Table right = new Table.TestBuilder().column(6, 5, 9, 8, 10, 32).build(); + Table right = new Table.TestBuilder() + .column(6, 5, 9, 8, 10, 32) + .column(0, 1, 2, 3, 4, 5).build(); Table expected = new Table.TestBuilder() .column( 0, 1, 2, 2, 2, 3, 4, 5, 5, 6, 7, 8, 9, 9) .column(inv, inv, 0, 1, 3, inv, inv, 0, 1, inv, 1, inv, 0, 1) .build(); CompiledExpression condition = expr.compile()) { - GatherMap[] maps = left.leftJoinGatherMaps(right, condition, false); + GatherMap[] maps = left.conditionalLeftJoinGatherMaps(right, condition, false); try { verifyJoinGatherMaps(maps, expected); } finally { @@ -1516,7 +1622,7 @@ void testConditionalLeftJoinGatherMaps() { @Test void testConditionalLeftJoinGatherMapsNulls() { final int inv = Integer.MIN_VALUE; - BinaryExpression expr = new BinaryExpression(BinaryOperator.EQUAL, + BinaryOperation expr = new BinaryOperation(BinaryOperator.EQUAL, new ColumnReference(0, TableReference.LEFT), new ColumnReference(0, TableReference.RIGHT)); try (Table left = new Table.TestBuilder() @@ -1530,7 +1636,65 @@ void testConditionalLeftJoinGatherMapsNulls() { .column(inv, inv, 2, inv, inv, inv, inv, 0, 1, 0, 1, 3) // right .build(); CompiledExpression condition = expr.compile()) { - GatherMap[] maps = left.leftJoinGatherMaps(right, condition, true); + GatherMap[] maps = left.conditionalLeftJoinGatherMaps(right, condition, true); + try { + verifyJoinGatherMaps(maps, expected); + } finally { + for (GatherMap map : maps) { + map.close(); + } + } + } + } + + @Test + void testConditionalLeftJoinGatherMapsWithCount() { + final int inv = Integer.MIN_VALUE; + BinaryOperation expr = new BinaryOperation(BinaryOperator.GREATER, + new ColumnReference(0, TableReference.LEFT), + new 
ColumnReference(0, TableReference.RIGHT)); + try (Table left = new Table.TestBuilder().column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8).build(); + Table right = new Table.TestBuilder() + .column(6, 5, 9, 8, 10, 32) + .column(0, 1, 2, 3, 4, 5).build(); + Table expected = new Table.TestBuilder() + .column( 0, 1, 2, 2, 2, 3, 4, 5, 5, 6, 7, 8, 9, 9) + .column(inv, inv, 0, 1, 3, inv, inv, 0, 1, inv, 1, inv, 0, 1) + .build(); + CompiledExpression condition = expr.compile()) { + long rowCount = left.conditionalLeftJoinRowCount(right, condition, false); + assertEquals(expected.getRowCount(), rowCount); + GatherMap[] maps = left.conditionalLeftJoinGatherMaps(right, condition, false, rowCount); + try { + verifyJoinGatherMaps(maps, expected); + } finally { + for (GatherMap map : maps) { + map.close(); + } + } + } + } + + @Test + void testConditionalLeftJoinGatherMapsNullsWithCount() { + final int inv = Integer.MIN_VALUE; + BinaryOperation expr = new BinaryOperation(BinaryOperator.EQUAL, + new ColumnReference(0, TableReference.LEFT), + new ColumnReference(0, TableReference.RIGHT)); + try (Table left = new Table.TestBuilder() + .column(2, 3, 9, 0, 1, 7, 4, null, null, 8) + .build(); + Table right = new Table.TestBuilder() + .column(null, null, 9, 8, 10, 32) + .build(); + Table expected = new Table.TestBuilder() + .column( 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, 8, 9) // left + .column(inv, inv, 2, inv, inv, inv, inv, 0, 1, 0, 1, 3) // right + .build(); + CompiledExpression condition = expr.compile()) { + long rowCount = left.conditionalLeftJoinRowCount(right, condition, true); + assertEquals(expected.getRowCount(), rowCount); + GatherMap[] maps = left.conditionalLeftJoinGatherMaps(right, condition, true, rowCount); try { verifyJoinGatherMaps(maps, expected); } finally { @@ -1583,19 +1747,113 @@ void testInnerJoinGatherMapsNulls() { } } + @Test + void testInnerHashJoinGatherMaps() { + try (Table leftKeys = new Table.TestBuilder().column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8).build(); + Table rightKeys = new Table.TestBuilder().column(6, 5, 9, 8, 10, 32).build(); + HashJoin rightHash = new HashJoin(rightKeys, false); + Table expected = new Table.TestBuilder() + .column(2, 7, 8, 9) // left + .column(2, 0, 1, 3) // right + .build()) { + GatherMap[] maps = leftKeys.innerJoinGatherMaps(rightHash); + try { + verifyJoinGatherMaps(maps, expected); + } finally { + for (GatherMap map : maps) { + map.close(); + } + } + } + } + + @Test + void testInnerHashJoinGatherMapsWithCount() { + try (Table leftKeys = new Table.TestBuilder().column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8).build(); + Table rightKeys = new Table.TestBuilder().column(6, 5, 9, 8, 10, 32).build(); + HashJoin rightHash = new HashJoin(rightKeys, false); + Table expected = new Table.TestBuilder() + .column(2, 7, 8, 9) // left + .column(2, 0, 1, 3) // right + .build()) { + long rowCount = leftKeys.innerJoinRowCount(rightHash); + assertEquals(expected.getRowCount(), rowCount); + GatherMap[] maps = leftKeys.innerJoinGatherMaps(rightHash, rowCount); + try { + verifyJoinGatherMaps(maps, expected); + } finally { + for (GatherMap map : maps) { + map.close(); + } + } + } + } + + @Test + void testInnerHashJoinGatherMapsNulls() { + try (Table leftKeys = new Table.TestBuilder() + .column(2, 3, 9, 0, 1, 7, 4, null, null, 8) + .build(); + Table rightKeys = new Table.TestBuilder() + .column(null, null, 9, 8, 10, 32) + .build(); + HashJoin rightHash = new HashJoin(rightKeys, true); + Table expected = new Table.TestBuilder() + .column(2, 7, 7, 8, 8, 9) // left + .column(2, 0, 1, 0, 1, 3) // right + .build()) 
{ + GatherMap[] maps = leftKeys.innerJoinGatherMaps(rightHash); + try { + verifyJoinGatherMaps(maps, expected); + } finally { + for (GatherMap map : maps) { + map.close(); + } + } + } + } + + @Test + void testInnerHashJoinGatherMapsNullsWithCount() { + try (Table leftKeys = new Table.TestBuilder() + .column(2, 3, 9, 0, 1, 7, 4, null, null, 8) + .build(); + Table rightKeys = new Table.TestBuilder() + .column(null, null, 9, 8, 10, 32) + .build(); + HashJoin rightHash = new HashJoin(rightKeys, true); + Table expected = new Table.TestBuilder() + .column(2, 7, 7, 8, 8, 9) // left + .column(2, 0, 1, 0, 1, 3) // right + .build()) { + long rowCount = leftKeys.innerJoinRowCount(rightHash); + assertEquals(expected.getRowCount(), rowCount); + GatherMap[] maps = leftKeys.innerJoinGatherMaps(rightHash, rowCount); + try { + verifyJoinGatherMaps(maps, expected); + } finally { + for (GatherMap map : maps) { + map.close(); + } + } + } + } + @Test void testConditionalInnerJoinGatherMaps() { - BinaryExpression expr = new BinaryExpression(BinaryOperator.GREATER, + BinaryOperation expr = new BinaryOperation(BinaryOperator.GREATER, new ColumnReference(0, TableReference.LEFT), new ColumnReference(0, TableReference.RIGHT)); try (Table left = new Table.TestBuilder().column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8).build(); - Table right = new Table.TestBuilder().column(6, 5, 9, 8, 10, 32).build(); + Table right = new Table.TestBuilder() + .column(6, 5, 9, 8, 10, 32) + .column(0, 1, 2, 3, 4, 5).build(); Table expected = new Table.TestBuilder() .column(2, 2, 2, 5, 5, 7, 9, 9) .column(0, 1, 3, 0, 1, 1, 0, 1) .build(); CompiledExpression condition = expr.compile()) { - GatherMap[] maps = left.innerJoinGatherMaps(right, condition, false); + GatherMap[] maps = left.conditionalInnerJoinGatherMaps(right, condition, false); try { verifyJoinGatherMaps(maps, expected); } finally { @@ -1608,7 +1866,61 @@ void testConditionalInnerJoinGatherMaps() { @Test void testConditionalInnerJoinGatherMapsNulls() { - BinaryExpression expr = new BinaryExpression(BinaryOperator.EQUAL, + BinaryOperation expr = new BinaryOperation(BinaryOperator.EQUAL, + new ColumnReference(0, TableReference.LEFT), + new ColumnReference(0, TableReference.RIGHT)); + try (Table left = new Table.TestBuilder() + .column(2, 3, 9, 0, 1, 7, 4, null, null, 8) + .build(); + Table right = new Table.TestBuilder() + .column(null, null, 9, 8, 10, 32) + .build(); + Table expected = new Table.TestBuilder() + .column(2, 7, 7, 8, 8, 9) // left + .column(2, 0, 1, 0, 1, 3) // right + .build(); + CompiledExpression condition = expr.compile()) { + GatherMap[] maps = left.conditionalInnerJoinGatherMaps(right, condition, true); + try { + verifyJoinGatherMaps(maps, expected); + } finally { + for (GatherMap map : maps) { + map.close(); + } + } + } + } + + @Test + void testConditionalInnerJoinGatherMapsWithCount() { + BinaryOperation expr = new BinaryOperation(BinaryOperator.GREATER, + new ColumnReference(0, TableReference.LEFT), + new ColumnReference(0, TableReference.RIGHT)); + try (Table left = new Table.TestBuilder().column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8).build(); + Table right = new Table.TestBuilder() + .column(6, 5, 9, 8, 10, 32) + .column(0, 1, 2, 3, 4, 5).build(); + Table expected = new Table.TestBuilder() + .column(2, 2, 2, 5, 5, 7, 9, 9) + .column(0, 1, 3, 0, 1, 1, 0, 1) + .build(); + CompiledExpression condition = expr.compile()) { + long rowCount = left.conditionalInnerJoinRowCount(right, condition, false); + assertEquals(expected.getRowCount(), rowCount); + GatherMap[] maps = 
left.conditionalInnerJoinGatherMaps(right, condition, false, rowCount); + try { + verifyJoinGatherMaps(maps, expected); + } finally { + for (GatherMap map : maps) { + map.close(); + } + } + } + } + + @Test + void testConditionalInnerJoinGatherMapsNullsWithCount() { + BinaryOperation expr = new BinaryOperation(BinaryOperator.EQUAL, new ColumnReference(0, TableReference.LEFT), new ColumnReference(0, TableReference.RIGHT)); try (Table left = new Table.TestBuilder() @@ -1622,7 +1934,9 @@ void testConditionalInnerJoinGatherMapsNulls() { .column(2, 0, 1, 0, 1, 3) // right .build(); CompiledExpression condition = expr.compile()) { - GatherMap[] maps = left.innerJoinGatherMaps(right, condition, true); + long rowCount = left.conditionalInnerJoinRowCount(right, condition, true); + assertEquals(expected.getRowCount(), rowCount); + GatherMap[] maps = left.conditionalInnerJoinGatherMaps(right, condition, true, rowCount); try { verifyJoinGatherMaps(maps, expected); } finally { @@ -1677,20 +1991,118 @@ void testFullJoinGatherMapsNulls() { } } + @Test + void testFullHashJoinGatherMaps() { + final int inv = Integer.MIN_VALUE; + try (Table leftKeys = new Table.TestBuilder().column(2, 3, 9, null, 1, 7, 4, 6, 5, 8).build(); + Table rightKeys = new Table.TestBuilder().column(6, 5, 9, 8, 10, null).build(); + HashJoin rightHash = new HashJoin(rightKeys, false); + Table expected = new Table.TestBuilder() + .column(inv, inv, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9) // left + .column( 4, 5, inv, inv, 2, inv, inv, inv, inv, 0, 1, 3) // right + .build()) { + GatherMap[] maps = leftKeys.fullJoinGatherMaps(rightHash); + try { + verifyJoinGatherMaps(maps, expected); + } finally { + for (GatherMap map : maps) { + map.close(); + } + } + } + } + + @Test + void testFullHashJoinGatherMapsWithCount() { + final int inv = Integer.MIN_VALUE; + try (Table leftKeys = new Table.TestBuilder().column(2, 3, 9, null, 1, 7, 4, 6, 5, 8).build(); + Table rightKeys = new Table.TestBuilder().column(6, 5, 9, 8, 10, null).build(); + HashJoin rightHash = new HashJoin(rightKeys, false); + Table expected = new Table.TestBuilder() + .column(inv, inv, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9) // left + .column( 4, 5, inv, inv, 2, inv, inv, inv, inv, 0, 1, 3) // right + .build()) { + long rowCount = leftKeys.fullJoinRowCount(rightHash); + assertEquals(expected.getRowCount(), rowCount); + GatherMap[] maps = leftKeys.fullJoinGatherMaps(rightHash, rowCount); + try { + verifyJoinGatherMaps(maps, expected); + } finally { + for (GatherMap map : maps) { + map.close(); + } + } + } + } + + @Test + void testFullHashJoinGatherMapsNulls() { + final int inv = Integer.MIN_VALUE; + try (Table leftKeys = new Table.TestBuilder() + .column(2, 3, 9, 0, 1, 7, 4, null, null, 8) + .build(); + Table rightKeys = new Table.TestBuilder() + .column(null, null, 9, 8, 10, 32) + .build(); + HashJoin rightHash = new HashJoin(rightKeys, true); + Table expected = new Table.TestBuilder() + .column(inv, inv, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, 8, 9) // left + .column( 4, 5, inv, inv, 2, inv, inv, inv, inv, 0, 1, 0, 1, 3) // right + .build()) { + GatherMap[] maps = leftKeys.fullJoinGatherMaps(rightHash); + try { + verifyJoinGatherMaps(maps, expected); + } finally { + for (GatherMap map : maps) { + map.close(); + } + } + } + } + + @Test + void testFullHashJoinGatherMapsNullsWithCount() { + final int inv = Integer.MIN_VALUE; + try (Table leftKeys = new Table.TestBuilder() + .column(2, 3, 9, 0, 1, 7, 4, null, null, 8) + .build(); + Table rightKeys = new Table.TestBuilder() + .column(null, null, 9, 8, 10, 32) + 
.build(); + HashJoin rightHash = new HashJoin(rightKeys, true); + Table expected = new Table.TestBuilder() + .column(inv, inv, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, 8, 9) // left + .column( 4, 5, inv, inv, 2, inv, inv, inv, inv, 0, 1, 0, 1, 3) // right + .build()) { + long rowCount = leftKeys.fullJoinRowCount(rightHash); + assertEquals(expected.getRowCount(), rowCount); + GatherMap[] maps = leftKeys.fullJoinGatherMaps(rightHash, rowCount); + try { + verifyJoinGatherMaps(maps, expected); + } finally { + for (GatherMap map : maps) { + map.close(); + } + } + } + } + @Test void testConditionalFullJoinGatherMaps() { final int inv = Integer.MIN_VALUE; - BinaryExpression expr = new BinaryExpression(BinaryOperator.GREATER, + BinaryOperation expr = new BinaryOperation(BinaryOperator.GREATER, new ColumnReference(0, TableReference.LEFT), new ColumnReference(0, TableReference.RIGHT)); try (Table left = new Table.TestBuilder().column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8).build(); - Table right = new Table.TestBuilder().column(6, 5, 9, 8, 10, 32).build(); + Table right = new Table.TestBuilder() + .column(6, 5, 9, 8, 10, 32) + .column(0, 1, 2, 3, 4, 5).build(); Table expected = new Table.TestBuilder() .column(inv, inv, inv, 0, 1, 2, 2, 2, 3, 4, 5, 5, 6, 7, 8, 9, 9) .column( 2, 4, 5, inv, inv, 0, 1, 3, inv, inv, 0, 1, inv, 1, inv, 0, 1) .build(); CompiledExpression condition = expr.compile()) { - GatherMap[] maps = left.fullJoinGatherMaps(right, condition, false); + GatherMap[] maps = left.conditionalFullJoinGatherMaps(right, condition, false); try { verifyJoinGatherMaps(maps, expected); } finally { @@ -1704,7 +2116,7 @@ void testConditionalFullJoinGatherMaps() { @Test void testConditionalFullJoinGatherMapsNulls() { final int inv = Integer.MIN_VALUE; - BinaryExpression expr = new BinaryExpression(BinaryOperator.EQUAL, + BinaryOperation expr = new BinaryOperation(BinaryOperator.EQUAL, new ColumnReference(0, TableReference.LEFT), new ColumnReference(0, TableReference.RIGHT)); try (Table left = new Table.TestBuilder() @@ -1718,7 +2130,7 @@ void testConditionalFullJoinGatherMapsNulls() { .column( 4, 5, inv, inv, 2, inv, inv, inv, inv, 0, 1, 0, 1, 3) // right .build(); CompiledExpression condition = expr.compile()) { - GatherMap[] maps = left.fullJoinGatherMaps(right, condition, true); + GatherMap[] maps = left.conditionalFullJoinGatherMaps(right, condition, true); try { verifyJoinGatherMaps(maps, expected); } finally { @@ -1759,23 +2171,25 @@ void testLeftSemiJoinGatherMapNulls() { @Test void testConditionalLeftSemiJoinGatherMap() { - BinaryExpression expr = new BinaryExpression(BinaryOperator.GREATER, + BinaryOperation expr = new BinaryOperation(BinaryOperator.GREATER, new ColumnReference(0, TableReference.LEFT), new ColumnReference(0, TableReference.RIGHT)); try (Table left = new Table.TestBuilder().column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8).build(); - Table right = new Table.TestBuilder().column(6, 5, 9, 8, 10, 32).build(); + Table right = new Table.TestBuilder() + .column(6, 5, 9, 8, 10, 32) + .column(0, 1, 2, 3, 4, 5).build(); Table expected = new Table.TestBuilder() .column(2, 5, 7, 9) // left .build(); CompiledExpression condition = expr.compile(); - GatherMap map = left.leftSemiJoinGatherMap(right, condition, false)) { + GatherMap map = left.conditionalLeftSemiJoinGatherMap(right, condition, false)) { verifySemiJoinGatherMap(map, expected); } } @Test void testConditionalLeftSemiJoinGatherMapNulls() { - BinaryExpression expr = new BinaryExpression(BinaryOperator.EQUAL, + BinaryOperation expr = new 
BinaryOperation(BinaryOperator.EQUAL, new ColumnReference(0, TableReference.LEFT), new ColumnReference(0, TableReference.RIGHT)); try (Table left = new Table.TestBuilder() @@ -1788,11 +2202,57 @@ void testConditionalLeftSemiJoinGatherMapNulls() { .column(2, 7, 8, 9) // left .build(); CompiledExpression condition = expr.compile(); - GatherMap map = left.leftSemiJoinGatherMap(right, condition, true)) { + GatherMap map = left.conditionalLeftSemiJoinGatherMap(right, condition, true)) { verifySemiJoinGatherMap(map, expected); } } + @Test + void testConditionalLeftSemiJoinGatherMapWithCount() { + BinaryOperation expr = new BinaryOperation(BinaryOperator.GREATER, + new ColumnReference(0, TableReference.LEFT), + new ColumnReference(0, TableReference.RIGHT)); + try (Table left = new Table.TestBuilder().column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8).build(); + Table right = new Table.TestBuilder() + .column(6, 5, 9, 8, 10, 32) + .column(0, 1, 2, 3, 4, 5).build(); + Table expected = new Table.TestBuilder() + .column(2, 5, 7, 9) // left + .build(); + CompiledExpression condition = expr.compile()) { + long rowCount = left.conditionalLeftSemiJoinRowCount(right, condition, false); + assertEquals(expected.getRowCount(), rowCount); + try (GatherMap map = + left.conditionalLeftSemiJoinGatherMap(right, condition, false, rowCount)) { + verifySemiJoinGatherMap(map, expected); + } + } + } + + @Test + void testConditionalLeftSemiJoinGatherMapNullsWithCount() { + BinaryOperation expr = new BinaryOperation(BinaryOperator.EQUAL, + new ColumnReference(0, TableReference.LEFT), + new ColumnReference(0, TableReference.RIGHT)); + try (Table left = new Table.TestBuilder() + .column(2, 3, 9, 0, 1, 7, 4, null, null, 8) + .build(); + Table right = new Table.TestBuilder() + .column(null, null, 9, 8, 10, 32) + .build(); + Table expected = new Table.TestBuilder() + .column(2, 7, 8, 9) // left + .build(); + CompiledExpression condition = expr.compile()) { + long rowCount = left.conditionalLeftSemiJoinRowCount(right, condition, true); + assertEquals(expected.getRowCount(), rowCount); + try (GatherMap map = + left.conditionalLeftSemiJoinGatherMap(right, condition, true, rowCount)) { + verifySemiJoinGatherMap(map, expected); + } + } + } + @Test void testAntiSemiJoinGatherMap() { try (Table leftKeys = new Table.TestBuilder().column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8).build(); @@ -1823,23 +2283,25 @@ void testAntiSemiJoinGatherMapNulls() { @Test void testConditionalLeftAntiJoinGatherMap() { - BinaryExpression expr = new BinaryExpression(BinaryOperator.GREATER, + BinaryOperation expr = new BinaryOperation(BinaryOperator.GREATER, new ColumnReference(0, TableReference.LEFT), new ColumnReference(0, TableReference.RIGHT)); try (Table left = new Table.TestBuilder().column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8).build(); - Table right = new Table.TestBuilder().column(6, 5, 9, 8, 10, 32).build(); + Table right = new Table.TestBuilder() + .column(6, 5, 9, 8, 10, 32) + .column(0, 1, 2, 3, 4, 5).build(); Table expected = new Table.TestBuilder() .column(0, 1, 3, 4, 6, 8) // left .build(); CompiledExpression condition = expr.compile(); - GatherMap map = left.leftAntiJoinGatherMap(right, condition, false)) { + GatherMap map = left.conditionalLeftAntiJoinGatherMap(right, condition, false)) { verifySemiJoinGatherMap(map, expected); } } @Test void testConditionalAntiSemiJoinGatherMapNulls() { - BinaryExpression expr = new BinaryExpression(BinaryOperator.EQUAL, + BinaryOperation expr = new BinaryOperation(BinaryOperator.EQUAL, new ColumnReference(0, TableReference.LEFT), new 
ColumnReference(0, TableReference.RIGHT)); try (Table left = new Table.TestBuilder() @@ -1852,11 +2314,57 @@ void testConditionalAntiSemiJoinGatherMapNulls() { .column(0, 1, 3, 4, 5, 6) // left .build(); CompiledExpression condition = expr.compile(); - GatherMap map = left.leftAntiJoinGatherMap(right, condition, true)) { + GatherMap map = left.conditionalLeftAntiJoinGatherMap(right, condition, true)) { verifySemiJoinGatherMap(map, expected); } } + @Test + void testConditionalLeftAntiJoinGatherMapWithCount() { + BinaryOperation expr = new BinaryOperation(BinaryOperator.GREATER, + new ColumnReference(0, TableReference.LEFT), + new ColumnReference(0, TableReference.RIGHT)); + try (Table left = new Table.TestBuilder().column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8).build(); + Table right = new Table.TestBuilder() + .column(6, 5, 9, 8, 10, 32) + .column(0, 1, 2, 3, 4, 5).build(); + Table expected = new Table.TestBuilder() + .column(0, 1, 3, 4, 6, 8) // left + .build(); + CompiledExpression condition = expr.compile()) { + long rowCount = left.conditionalLeftAntiJoinRowCount(right, condition, false); + assertEquals(expected.getRowCount(), rowCount); + try (GatherMap map = + left.conditionalLeftAntiJoinGatherMap(right, condition, false, rowCount)) { + verifySemiJoinGatherMap(map, expected); + } + } + } + + @Test + void testConditionalAntiSemiJoinGatherMapNullsWithCount() { + BinaryOperation expr = new BinaryOperation(BinaryOperator.EQUAL, + new ColumnReference(0, TableReference.LEFT), + new ColumnReference(0, TableReference.RIGHT)); + try (Table left = new Table.TestBuilder() + .column(2, 3, 9, 0, 1, 7, 4, null, null, 8) + .build(); + Table right = new Table.TestBuilder() + .column(null, null, 9, 8, 10, 32) + .build(); + Table expected = new Table.TestBuilder() + .column(0, 1, 3, 4, 5, 6) // left + .build(); + CompiledExpression condition = expr.compile()) { + long rowCount = left.conditionalLeftAntiJoinRowCount(right, condition, true); + assertEquals(expected.getRowCount(), rowCount); + try (GatherMap map = + left.conditionalLeftAntiJoinGatherMap(right, condition, true, rowCount)) { + verifySemiJoinGatherMap(map, expected); + } + } + } + @Test void testBoundsNulls() { boolean[] descFlags = new boolean[1]; @@ -2728,7 +3236,7 @@ void testSerializationRoundTripConcatHostSide() throws IOException { try (Table found = JCudfSerialization.readAndConcat( headers.toArray(new JCudfSerialization.SerializedTableHeader[headers.size()]), buffers.toArray(new HostMemoryBuffer[buffers.size()]))) { - assertPartialTablesAreEqual(t, 0, t.getRowCount(), found, false); + assertPartialTablesAreEqual(t, 0, t.getRowCount(), found, false, false); } } finally { for (HostMemoryBuffer buff: buffers) { @@ -2781,7 +3289,7 @@ void testConcatHost() throws IOException { try (Table result = JCudfSerialization.readAndConcat( new JCudfSerialization.SerializedTableHeader[] {header, header}, new HostMemoryBuffer[] {buff, buff})) { - assertPartialTablesAreEqual(expected, 0, expected.getRowCount(), result, false); + assertPartialTablesAreEqual(expected, 0, expected.getRowCount(), result, false, false); } } } @@ -2822,7 +3330,7 @@ void testSerializationRoundTripSlicedHostSide() throws IOException { buffers.toArray(new HostMemoryBuffer[buffers.size()]), bout2); ByteArrayInputStream bin2 = new ByteArrayInputStream(bout2.toByteArray()); try (JCudfSerialization.TableAndRowCountPair found = JCudfSerialization.readTableFrom(bin2)) { - assertPartialTablesAreEqual(t, 0, t.getRowCount(), found.getTable(), false); + assertPartialTablesAreEqual(t, 0, 
t.getRowCount(), found.getTable(), false, false); assertEquals(found.getTable(), found.getContiguousTable().getTable()); assertNotNull(found.getContiguousTable().getBuffer()); } @@ -2848,7 +3356,7 @@ void testSerializationRoundTripSliced() throws IOException { JCudfSerialization.writeToStream(t, bout, i, len); ByteArrayInputStream bin = new ByteArrayInputStream(bout.toByteArray()); try (JCudfSerialization.TableAndRowCountPair found = JCudfSerialization.readTableFrom(bin)) { - assertPartialTablesAreEqual(t, i, len, found.getTable(), i == 0 && len == t.getRowCount()); + assertPartialTablesAreEqual(t, i, len, found.getTable(), i == 0 && len == t.getRowCount(), false); assertEquals(found.getTable(), found.getContiguousTable().getTable()); assertNotNull(found.getContiguousTable().getBuffer()); } @@ -2902,12 +3410,12 @@ void testGroupByScan() { .withKeysSorted(true) .withKeysDescending(false, false) .build(), 0, 1) - .scan(Aggregation.sum().onColumn(2), - Aggregation.count(NullPolicy.INCLUDE).onColumn(2), - Aggregation.min().onColumn(2), - Aggregation.max().onColumn(2), - Aggregation.rank().onColumn(3), - Aggregation.denseRank().onColumn(3)); + .scan(GroupByScanAggregation.sum().onColumn(2), + GroupByScanAggregation.count(NullPolicy.INCLUDE).onColumn(2), + GroupByScanAggregation.min().onColumn(2), + GroupByScanAggregation.max().onColumn(2), + GroupByScanAggregation.rank().onColumn(3), + GroupByScanAggregation.denseRank().onColumn(3)); Table expected = new Table.TestBuilder() .column( "1", "1", "1", "1", "1", "1", "1", "2", "2", "2", "2") .column( 0, 1, 3, 3, 5, 5, 5, 5, 5, 5, 5) @@ -2957,7 +3465,7 @@ void testGroupByUniqueCount() { .build()) { try (Table t3 = t1 .groupBy(0, 1) - .aggregate(Aggregation.nunique().onColumn(0)); + .aggregate(GroupByAggregation.nunique().onColumn(0)); Table sorted = t3.orderBy(OrderByArg.asc(0), OrderByArg.asc(1), OrderByArg.asc(2)); Table expected = new Table.TestBuilder() .column( "1", "1", "1", "1") @@ -2978,7 +3486,7 @@ void testGroupByUniqueCountNulls() { .build()) { try (Table t3 = t1 .groupBy(0, 1) - .aggregate(Aggregation.nunique(NullPolicy.INCLUDE).onColumn(0)); + .aggregate(GroupByAggregation.nunique(NullPolicy.INCLUDE).onColumn(0)); Table sorted = t3.orderBy(OrderByArg.asc(0), OrderByArg.asc(1), OrderByArg.asc(2)); Table expected = new Table.TestBuilder() .column( "1", "1", "1", "1") @@ -2997,7 +3505,7 @@ void testGroupByCount() { .column(12.0, 14.0, 13.0, 17.0, 17.0, 17.0) .build()) { try (Table t3 = t1.groupBy(0, 1) - .aggregate(Aggregation.count().onColumn(0)); + .aggregate(GroupByAggregation.count().onColumn(0)); HostColumnVector aggOut1 = t3.getColumn(2).copyToHost()) { // verify t3 assertEquals(4, t3.getRowCount()); @@ -3048,9 +3556,9 @@ void testWindowingCount() { .build()) { try (Table windowAggResults = sorted.groupBy(0, 1) - .aggregateWindows(Aggregation.count().onColumn(3).overWindow(window)); + .aggregateWindows(RollingAggregation.count().onColumn(3).overWindow(window)); Table decWindowAggResults = decSorted.groupBy(0, 4) - .aggregateWindows(Aggregation.count().onColumn(3).overWindow(window)); + .aggregateWindows(RollingAggregation.count().onColumn(3).overWindow(window)); ColumnVector expect = ColumnVector.fromBoxedInts(2, 3, 3, 2, 2, 3, 3, 2, 2, 3, 3, 2)) { assertColumnsAreEqual(expect, windowAggResults.getColumn(0)); assertColumnsAreEqual(expect, decWindowAggResults.getColumn(0)); @@ -3088,9 +3596,9 @@ void testWindowingMin() { .build()) { try (Table windowAggResults = sorted.groupBy(0, 1) - 
.aggregateWindows(Aggregation.min().onColumn(3).overWindow(window)); + .aggregateWindows(RollingAggregation.min().onColumn(3).overWindow(window)); Table decWindowAggResults = decSorted.groupBy(0, 4) - .aggregateWindows(Aggregation.min().onColumn(6).overWindow(window)); + .aggregateWindows(RollingAggregation.min().onColumn(6).overWindow(window)); ColumnVector expect = ColumnVector.fromBoxedInts(5, 1, 1, 1, 7, 7, 2, 2, 0, 0, 0, 6); ColumnVector decExpect = ColumnVector.decimalFromLongs(2, 5, 1, 1, 1, 7, 7, 2, 2, 0, 0, 0, 6)) { assertColumnsAreEqual(expect, windowAggResults.getColumn(0)); @@ -3129,9 +3637,9 @@ void testWindowingMax() { .build()) { try (Table windowAggResults = sorted.groupBy(0, 1) - .aggregateWindows(Aggregation.max().onColumn(3).overWindow(window)); + .aggregateWindows(RollingAggregation.max().onColumn(3).overWindow(window)); Table decWindowAggResults = decSorted.groupBy(0, 4) - .aggregateWindows(Aggregation.max().onColumn(6).overWindow(window)); + .aggregateWindows(RollingAggregation.max().onColumn(6).overWindow(window)); ColumnVector expect = ColumnVector.fromBoxedInts(7, 7, 9, 9, 9, 9, 9, 8, 8, 8, 6, 6); ColumnVector decExpect = ColumnVector.decimalFromLongs(2, 7, 7, 9, 9, 9, 9, 9, 8, 8, 8, 6, 6)) { assertColumnsAreEqual(expect, windowAggResults.getColumn(0)); @@ -3163,7 +3671,7 @@ void testWindowingSum() { .build()) { try (Table windowAggResults = sorted.groupBy(0, 1) - .aggregateWindows(Aggregation.sum().onColumn(3).overWindow(window)); + .aggregateWindows(RollingAggregation.sum().onColumn(3).overWindow(window)); ColumnVector expectAggResult = ColumnVector.fromBoxedLongs(12L, 13L, 15L, 10L, 16L, 24L, 19L, 10L, 8L, 14L, 12L, 12L)) { assertColumnsAreEqual(expectAggResult, windowAggResults.getColumn(0)); } @@ -3199,12 +3707,12 @@ void testWindowingRowNumber() { WindowOptions options = windowBuilder.window(two, one).build(); WindowOptions options1 = windowBuilder.window(two, one).build()) { try (Table windowAggResults = sorted.groupBy(0, 1) - .aggregateWindows(Aggregation + .aggregateWindows(RollingAggregation .rowNumber() .onColumn(3) .overWindow(options)); Table decWindowAggResults = decSorted.groupBy(0, 4) - .aggregateWindows(Aggregation + .aggregateWindows(RollingAggregation .rowNumber() .onColumn(6) .overWindow(options1)); @@ -3219,12 +3727,12 @@ void testWindowingRowNumber() { WindowOptions options = windowBuilder.window(three, two).build(); WindowOptions options1 = windowBuilder.window(three, two).build()) { try (Table windowAggResults = sorted.groupBy(0, 1) - .aggregateWindows(Aggregation + .aggregateWindows(RollingAggregation .rowNumber() .onColumn(3) .overWindow(options)); Table decWindowAggResults = decSorted.groupBy(0, 4) - .aggregateWindows(Aggregation + .aggregateWindows(RollingAggregation .rowNumber() .onColumn(6) .overWindow(options1)); @@ -3239,12 +3747,12 @@ void testWindowingRowNumber() { WindowOptions options = windowBuilder.window(four, three).build(); WindowOptions options1 = windowBuilder.window(four, three).build()) { try (Table windowAggResults = sorted.groupBy(0, 1) - .aggregateWindows(Aggregation + .aggregateWindows(RollingAggregation .rowNumber() .onColumn(3) .overWindow(options)); Table decWindowAggResults = decSorted.groupBy(0, 4) - .aggregateWindows(Aggregation + .aggregateWindows(RollingAggregation .rowNumber() .onColumn(6) .overWindow(options1)); @@ -3259,8 +3767,8 @@ void testWindowingRowNumber() { @Test void testWindowingCollectList() { - Aggregation aggCollectWithNulls = Aggregation.collectList(NullPolicy.INCLUDE); - Aggregation 
aggCollect = Aggregation.collectList(); + RollingAggregation aggCollectWithNulls = RollingAggregation.collectList(NullPolicy.INCLUDE); + RollingAggregation aggCollect = RollingAggregation.collectList(); try (Scalar two = Scalar.fromInt(2); Scalar one = Scalar.fromInt(1); WindowOptions winOpts = WindowOptions.builder() @@ -3335,12 +3843,12 @@ void testWindowingCollectList() { @Test void testWindowingCollectSet() { - Aggregation aggCollect = Aggregation.collectSet(); - Aggregation aggCollectWithEqNulls = Aggregation.collectSet(NullPolicy.INCLUDE, + RollingAggregation aggCollect = RollingAggregation.collectSet(); + RollingAggregation aggCollectWithEqNulls = RollingAggregation.collectSet(NullPolicy.INCLUDE, NullEquality.EQUAL, NaNEquality.UNEQUAL); - Aggregation aggCollectWithUnEqNulls = Aggregation.collectSet(NullPolicy.INCLUDE, + RollingAggregation aggCollectWithUnEqNulls = RollingAggregation.collectSet(NullPolicy.INCLUDE, NullEquality.UNEQUAL, NaNEquality.UNEQUAL); - Aggregation aggCollectWithEqNaNs = Aggregation.collectSet(NullPolicy.INCLUDE, + RollingAggregation aggCollectWithEqNaNs = RollingAggregation.collectSet(NullPolicy.INCLUDE, NullEquality.EQUAL, NaNEquality.ALL_EQUAL); try (Scalar two = Scalar.fromInt(2); @@ -3473,22 +3981,22 @@ void testWindowingLead() { Scalar one = Scalar.fromInt(1); WindowOptions options = windowBuilder.window(two, one).build(); Table windowAggResults = sorted.groupBy(0, 1) - .aggregateWindows(Aggregation + .aggregateWindows(RollingAggregation .lead(0) .onColumn(3) // Int Agg Column .overWindow(options)); Table decWindowAggResults = decSorted.groupBy(0, 4) - .aggregateWindows(Aggregation + .aggregateWindows(RollingAggregation .lead(0) .onColumn(6) // Decimal Agg Column .overWindow(options)); Table listWindowAggResults = sorted.groupBy(0, 1).aggregateWindows( - Aggregation + RollingAggregation .lead(0) .onColumn(7) // List Agg COLUMN .overWindow(options)); Table structWindowAggResults = sorted.groupBy(0, 1).aggregateWindows( - Aggregation + RollingAggregation .lead(0) .onColumn(8) //STRUCT Agg COLUMN .overWindow(options)); @@ -3517,22 +4025,22 @@ void testWindowingLead() { Scalar one = Scalar.fromInt(1); WindowOptions options = windowBuilder.window(zero, one).build(); Table windowAggResults = sorted.groupBy(0, 1) - .aggregateWindows(Aggregation + .aggregateWindows(RollingAggregation .lead(1) .onColumn(3) //Int Agg COLUMN .overWindow(options)); Table decWindowAggResults = sorted.groupBy(0, 4) - .aggregateWindows(Aggregation + .aggregateWindows(RollingAggregation .lead(1) .onColumn(6) //Decimal Agg COLUMN .overWindow(options)); Table listWindowAggResults = sorted.groupBy(0, 1).aggregateWindows( - Aggregation + RollingAggregation .lead(1) .onColumn(7) //LIST Agg COLUMN .overWindow(options)); Table structWindowAggResults = sorted.groupBy(0, 1).aggregateWindows( - Aggregation + RollingAggregation .lead(1) .onColumn(8) //STRUCT Agg COLUMN .overWindow(options)); @@ -3575,22 +4083,22 @@ null, new StructData(13, "s13"), new StructData(14, "s14"), null, new StructData(-111, "s111"), new StructData(null, "s112"), new StructData(-222, "s222"), new StructData(-333, "s333")); Table windowAggResults = sorted.groupBy(0, 1) - .aggregateWindows(Aggregation + .aggregateWindows(RollingAggregation .lead(1, defaultOutput) .onColumn(3) //Int Agg COLUMN .overWindow(options)); Table decWindowAggResults = sorted.groupBy(0, 4) - .aggregateWindows(Aggregation + .aggregateWindows(RollingAggregation .lead(1, decDefaultOutput) .onColumn(6) //Decimal Agg COLUMN .overWindow(options)); Table 
listWindowAggResults = sorted.groupBy(0, 1).aggregateWindows( - Aggregation + RollingAggregation .lead(1, listDefaultOutput) .onColumn(7) //LIST Agg COLUMN .overWindow(options)); Table structWindowAggResults = sorted.groupBy(0, 1).aggregateWindows( - Aggregation + RollingAggregation .lead(1, structDefaultOutput) .onColumn(8) //STRUCT Agg COLUMN .overWindow(options)); @@ -3619,22 +4127,22 @@ null, new StructData(13, "s13"), new StructData(14, "s14"), new StructData(-14, Scalar one = Scalar.fromInt(1); WindowOptions options = windowBuilder.window(zero, one).build(); Table windowAggResults = sorted.groupBy(0, 1) - .aggregateWindows(Aggregation + .aggregateWindows(RollingAggregation .lead(3) .onColumn(3) //Int Agg COLUMN .overWindow(options)); Table decWindowAggResults = sorted.groupBy(0, 4) - .aggregateWindows(Aggregation + .aggregateWindows(RollingAggregation .lead(3) .onColumn(6) //Decimal Agg COLUMN .overWindow(options)); Table listWindowAggResults = sorted.groupBy(0, 1).aggregateWindows( - Aggregation + RollingAggregation .lead(3) .onColumn(7) //LIST Agg COLUMN .overWindow(options)); Table structWindowAggResults = sorted.groupBy(0, 1).aggregateWindows( - Aggregation + RollingAggregation .lead(3) .onColumn(8) //STRUCT Agg COLUMN .overWindow(options)); @@ -3694,22 +4202,22 @@ void testWindowingLag() { Scalar one = Scalar.fromInt(1); WindowOptions options = windowBuilder.window(two, one).build(); Table windowAggResults = sorted.groupBy(0, 1) - .aggregateWindows(Aggregation + .aggregateWindows(RollingAggregation .lag(0) .onColumn(3) //Int Agg COLUMN .overWindow(options)); Table decWindowAggResults = sorted.groupBy(0, 4) - .aggregateWindows(Aggregation + .aggregateWindows(RollingAggregation .lag(0) .onColumn(6) //Decimal Agg COLUMN .overWindow(options)); Table listWindowAggResults = sorted.groupBy(0, 1).aggregateWindows( - Aggregation + RollingAggregation .lag(0) .onColumn(7) //LIST Agg COLUMN .overWindow(options)); Table structWindowAggResults = sorted.groupBy(0, 1).aggregateWindows( - Aggregation + RollingAggregation .lag(0) .onColumn(8) //STRUCT Agg COLUMN .overWindow(options)); @@ -3737,22 +4245,22 @@ void testWindowingLag() { Scalar two = Scalar.fromInt(2); WindowOptions options = windowBuilder.window(two, zero).build(); Table windowAggResults = sorted.groupBy(0, 1) - .aggregateWindows(Aggregation + .aggregateWindows(RollingAggregation .lag(1) .onColumn(3) //Int Agg COLUMN .overWindow(options)); Table decWindowAggResults = sorted.groupBy(0, 4) - .aggregateWindows(Aggregation + .aggregateWindows(RollingAggregation .lag(1) .onColumn(6) //Decimal Agg COLUMN .overWindow(options)); Table listWindowAggResults = sorted.groupBy(0, 1).aggregateWindows( - Aggregation + RollingAggregation .lag(1) .onColumn(7) //LIST Agg COLUMN .overWindow(options)); Table structWindowAggResults = sorted.groupBy(0, 1).aggregateWindows( - Aggregation + RollingAggregation .lag(1) .onColumn(8) //STRUCT Agg COLUMN .overWindow(options)); @@ -3794,22 +4302,22 @@ null, new StructData(111, "s111"), new StructData(null, "s112"), new StructData( new StructData(-11, "s11"), null, new StructData(-13, "s13"), new StructData(-14, "s14"), new StructData(-111, "s111"), new StructData(null, "s112"), new StructData(-222, "s222"), new StructData(-333, "s333")); Table windowAggResults = sorted.groupBy(0, 1) - .aggregateWindows(Aggregation + .aggregateWindows(RollingAggregation .lag(1, defaultOutput) .onColumn(3) //Int Agg COLUMN .overWindow(options)); Table decWindowAggResults = sorted.groupBy(0, 4) - .aggregateWindows(Aggregation + 
.aggregateWindows(RollingAggregation .lag(1, decDefaultOutput) .onColumn(6) //Decimal Agg COLUMN .overWindow(options)); Table listWindowAggResults = sorted.groupBy(0, 1).aggregateWindows( - Aggregation + RollingAggregation .lag(1, listDefaultOutput) .onColumn(7) //LIST Agg COLUMN .overWindow(options)); Table structWindowAggResults = sorted.groupBy(0, 1).aggregateWindows( - Aggregation + RollingAggregation .lag(1, structDefaultOutput) .onColumn(8) //STRUCT Agg COLUMN .overWindow(options)); @@ -3838,22 +4346,22 @@ null, new StructData(111, "s111"), new StructData(null, "s112"), new StructData( Scalar one = Scalar.fromInt(1); WindowOptions options = windowBuilder.window(one, zero).build(); Table windowAggResults = sorted.groupBy(0, 1) - .aggregateWindows(Aggregation + .aggregateWindows(RollingAggregation .lag(3) .onColumn(3) //Int Agg COLUMN .overWindow(options)); Table decWindowAggResults = sorted.groupBy(0, 4) - .aggregateWindows(Aggregation + .aggregateWindows(RollingAggregation .lag(3) .onColumn(6) //Decimal Agg COLUMN .overWindow(options)); Table listWindowAggResults = sorted.groupBy(0, 1).aggregateWindows( - Aggregation + RollingAggregation .lag(3) .onColumn(7) //LIST Agg COLUMN .overWindow(options)); Table structWindowAggResults = sorted.groupBy(0, 1).aggregateWindows( - Aggregation + RollingAggregation .lag(3) .onColumn(8) //STRUCT Agg COLUMN .overWindow(options)); @@ -3896,7 +4404,7 @@ void testWindowingMean() { .build()) { try (Table windowAggResults = sorted.groupBy(0, 1) - .aggregateWindows(Aggregation.mean().onColumn(3).overWindow(window)); + .aggregateWindows(RollingAggregation.mean().onColumn(3).overWindow(window)); ColumnVector expect = ColumnVector.fromBoxedDoubles(6.0d, 5.0d, 5.0d, 5.0d, 8.0d, 8.0d, 7.0d, 6.0d, 4.0d, 4.0d, 4.0d, 6.0d)) { assertColumnsAreEqual(expect, windowAggResults.getColumn(0)); } @@ -3941,10 +4449,10 @@ void testWindowingOnMultipleDifferentColumns() { try (Table windowAggResults = sorted.groupBy(0, 1) .aggregateWindows( - Aggregation.sum().onColumn(3).overWindow(window_1), - Aggregation.max().onColumn(3).overWindow(window_1), - Aggregation.sum().onColumn(3).overWindow(window_2), - Aggregation.min().onColumn(2).overWindow(window_3) + RollingAggregation.sum().onColumn(3).overWindow(window_1), + RollingAggregation.max().onColumn(3).overWindow(window_1), + RollingAggregation.sum().onColumn(3).overWindow(window_2), + RollingAggregation.min().onColumn(2).overWindow(window_3) ); ColumnVector expect_0 = ColumnVector.fromBoxedLongs(12L, 13L, 15L, 10L, 16L, 24L, 19L, 10L, 8L, 14L, 12L, 12L); ColumnVector expect_1 = ColumnVector.fromBoxedInts(7, 7, 9, 9, 9, 9, 9, 8, 8, 8, 6, 6); @@ -3979,8 +4487,8 @@ void testWindowingWithoutGroupByColumns() { .build()) { try (Table windowAggResults = sorted.groupBy().aggregateWindows( - Aggregation.sum().onColumn(1).overWindow(window)); - ColumnVector expectAggResult = ColumnVector.fromBoxedLongs(12L, 13L, 15L, 17L, 25L, 24L, 19L, 18L, 10L, 14L, 12L, 12L); + RollingAggregation.sum().onColumn(1).overWindow(window)); + ColumnVector expectAggResult = ColumnVector.fromBoxedLongs(12L, 13L, 15L, 17L, 25L, 24L, 19L, 18L, 10L, 14L, 12L, 12L) ) { assertColumnsAreEqual(expectAggResult, windowAggResults.getColumn(0)); } @@ -4054,7 +4562,7 @@ void testRangeWindowingCount() { .orderByColumnIndex(orderIndex) .build()) { try (Table windowAggResults = sorted.groupBy(0, 1).aggregateWindowsOverRanges( - Aggregation.count().onColumn(2).overWindow(window)); + RollingAggregation.count().onColumn(2).overWindow(window)); ColumnVector expect = 
ColumnVector.fromBoxedInts(3, 3, 4, 2, 4, 4, 4, 4, 4, 4, 5, 5, 3)) { assertColumnsAreEqual(expect, windowAggResults.getColumn(0)); } @@ -4098,7 +4606,7 @@ void testRangeWindowingLead() { .build()) { try (Table windowAggResults = sorted.groupBy(0, 1) - .aggregateWindowsOverRanges(Aggregation.lead(1) + .aggregateWindowsOverRanges(RollingAggregation.lead(1) .onColumn(2) .overWindow(window)); ColumnVector expect = ColumnVector.fromBoxedInts(5, 1, 9, null, 9, 8, 2, null, 0, 6, 6, 8, null)) { @@ -4144,7 +4652,7 @@ void testRangeWindowingMax() { .build()) { try (Table windowAggResults = sorted.groupBy(0, 1) - .aggregateWindowsOverRanges(Aggregation.max().onColumn(2).overWindow(window)); + .aggregateWindowsOverRanges(RollingAggregation.max().onColumn(2).overWindow(window)); ColumnVector expect = ColumnVector.fromBoxedInts(7, 7, 9, 9, 9, 9, 9, 9, 8, 8, 8, 8, 8)) { assertColumnsAreEqual(expect, windowAggResults.getColumn(0)); } @@ -4158,7 +4666,7 @@ void testRangeWindowingMax() { .build()) { try (Table windowAggResults = sorted.groupBy(0, 1) - .aggregateWindows(Aggregation.max().onColumn(2).overWindow(window)); + .aggregateWindows(RollingAggregation.max().onColumn(2).overWindow(window)); ColumnVector expect = ColumnVector.fromBoxedInts(7, 7, 9, 9, 9, 9, 9, 8, 8, 8, 6, 8, 8)) { assertColumnsAreEqual(expect, windowAggResults.getColumn(0)); } @@ -4202,7 +4710,7 @@ void testRangeWindowingRowNumber() { .build()) { try (Table windowAggResults = sorted.groupBy(0, 1) - .aggregateWindowsOverRanges(Aggregation.rowNumber().onColumn(2).overWindow(window)); + .aggregateWindowsOverRanges(RollingAggregation.rowNumber().onColumn(2).overWindow(window)); ColumnVector expect = ColumnVector.fromBoxedInts(1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 5)) { assertColumnsAreEqual(expect, windowAggResults.getColumn(0)); } @@ -4254,12 +4762,12 @@ void testRangeWindowingCountDescendingTimestamps() { .window(preceding_1, following_1) .orderByColumnIndex(orderIndex) .orderByDescending() - .build();) { + .build()) { try (Table windowAggResults = sorted.groupBy(0, 1) .aggregateWindowsOverRanges( - Aggregation.count().onColumn(2).overWindow(window_0), - Aggregation.sum().onColumn(2).overWindow(window_1)); + RollingAggregation.count().onColumn(2).overWindow(window_0), + RollingAggregation.sum().onColumn(2).overWindow(window_1)); ColumnVector expect_0 = ColumnVector.fromBoxedInts(3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 5, 5, 5); ColumnVector expect_1 = ColumnVector.fromBoxedLongs(7L, 13L, 13L, 22L, 7L, 24L, 24L, 26L, 8L, 8L, 14L, 28L, 28L)) { assertColumnsAreEqual(expect_0, windowAggResults.getColumn(0)); @@ -4303,7 +4811,7 @@ void testRangeWindowingWithoutGroupByColumns() { .build();) { try (Table windowAggResults = sorted.groupBy() - .aggregateWindowsOverRanges(Aggregation.count().onColumn(1).overWindow(window)); + .aggregateWindowsOverRanges(RollingAggregation.count().onColumn(1).overWindow(window)); ColumnVector expect = ColumnVector.fromBoxedInts(3, 3, 6, 6, 6, 6, 7, 7, 6, 6, 5, 5, 3)) { assertColumnsAreEqual(expect, windowAggResults.getColumn(0)); } @@ -4333,7 +4841,7 @@ void testRangeWindowingOrderByUnsupportedDataTypeExceptions() { assertThrows(IllegalArgumentException.class, () -> table .groupBy(0, 1) - .aggregateWindowsOverRanges(Aggregation.max().onColumn(2).overWindow(rangeBasedWindow))); + .aggregateWindowsOverRanges(RollingAggregation.max().onColumn(2).overWindow(rangeBasedWindow))); } } } @@ -4353,7 +4861,7 @@ void testInvalidWindowTypeExceptions() { .minPeriods(1) .window(one, one) .build()) { - 
assertThrows(IllegalArgumentException.class, () -> table.groupBy(0, 1).aggregateWindowsOverRanges(Aggregation.max().onColumn(3).overWindow(rowBasedWindow))); + assertThrows(IllegalArgumentException.class, () -> table.groupBy(0, 1).aggregateWindowsOverRanges(RollingAggregation.max().onColumn(3).overWindow(rowBasedWindow))); } try (WindowOptions rangeBasedWindow = WindowOptions.builder() @@ -4361,7 +4869,7 @@ void testInvalidWindowTypeExceptions() { .window(one, one) .orderByColumnIndex(2) .build()) { - assertThrows(IllegalArgumentException.class, () -> table.groupBy(0, 1).aggregateWindows(Aggregation.max().onColumn(3).overWindow(rangeBasedWindow))); + assertThrows(IllegalArgumentException.class, () -> table.groupBy(0, 1).aggregateWindows(RollingAggregation.max().onColumn(3).overWindow(rangeBasedWindow))); } } } @@ -4399,7 +4907,7 @@ void testRangeWindowingCountUnboundedPreceding() { .build();) { try (Table windowAggResults = sorted.groupBy(0, 1) - .aggregateWindowsOverRanges(Aggregation.count().onColumn(2).overWindow(window)); + .aggregateWindowsOverRanges(RollingAggregation.count().onColumn(2).overWindow(window)); ColumnVector expect = ColumnVector.fromBoxedInts(3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5)) { assertColumnsAreEqual(expect, windowAggResults.getColumn(0)); } @@ -4475,11 +4983,11 @@ void testRangeWindowingCountUnboundedASCWithNullsFirst() { try (Table windowAggResults = sorted.groupBy(0, 1) .aggregateWindowsOverRanges( - Aggregation.count().onColumn(2).overWindow(unboundedPrecedingOneFollowing), - Aggregation.count().onColumn(2).overWindow(onePrecedingUnboundedFollowing), - Aggregation.count().onColumn(2).overWindow(unboundedPrecedingAndFollowing), - Aggregation.count().onColumn(2).overWindow(unboundedPrecedingAndCurrentRow), - Aggregation.count().onColumn(2).overWindow(currentRowAndUnboundedFollowing)); + RollingAggregation.count().onColumn(2).overWindow(unboundedPrecedingOneFollowing), + RollingAggregation.count().onColumn(2).overWindow(onePrecedingUnboundedFollowing), + RollingAggregation.count().onColumn(2).overWindow(unboundedPrecedingAndFollowing), + RollingAggregation.count().onColumn(2).overWindow(unboundedPrecedingAndCurrentRow), + RollingAggregation.count().onColumn(2).overWindow(currentRowAndUnboundedFollowing)); ColumnVector expect_0 = ColumnVector.fromBoxedInts(3, 3, 3, 5, 5, 6, 2, 2, 4, 4, 6, 6, 7); ColumnVector expect_1 = ColumnVector.fromBoxedInts(6, 6, 6, 3, 3, 1, 7, 7, 5, 5, 3, 3, 1); ColumnVector expect_2 = ColumnVector.fromBoxedInts(6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7); @@ -4570,11 +5078,11 @@ void testRangeWindowingCountUnboundedDESCWithNullsFirst() { try (Table windowAggResults = sorted.groupBy(0, 1) .aggregateWindowsOverRanges( - Aggregation.count().onColumn(2).overWindow(unboundedPrecedingOneFollowing), - Aggregation.count().onColumn(2).overWindow(onePrecedingUnboundedFollowing), - Aggregation.count().onColumn(2).overWindow(unboundedPrecedingAndFollowing), - Aggregation.count().onColumn(2).overWindow(unboundedPrecedingAndCurrentRow), - Aggregation.count().onColumn(2).overWindow(currentRowAndUnboundedFollowing)); + RollingAggregation.count().onColumn(2).overWindow(unboundedPrecedingOneFollowing), + RollingAggregation.count().onColumn(2).overWindow(onePrecedingUnboundedFollowing), + RollingAggregation.count().onColumn(2).overWindow(unboundedPrecedingAndFollowing), + RollingAggregation.count().onColumn(2).overWindow(unboundedPrecedingAndCurrentRow), + RollingAggregation.count().onColumn(2).overWindow(currentRowAndUnboundedFollowing)); ColumnVector expect_0 = 
ColumnVector.fromBoxedInts(3, 3, 3, 4, 6, 6, 2, 2, 3, 5, 5, 7, 7); ColumnVector expect_1 = ColumnVector.fromBoxedInts(6, 6, 6, 3, 2, 2, 7, 7, 5, 4, 4, 2, 2); ColumnVector expect_2 = ColumnVector.fromBoxedInts(6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7); @@ -4658,11 +5166,11 @@ void testRangeWindowingCountUnboundedASCWithNullsLast() { try (Table windowAggResults = sorted.groupBy(0, 1) .aggregateWindowsOverRanges( - Aggregation.count().onColumn(2).overWindow(unboundedPrecedingOneFollowing), - Aggregation.count().onColumn(2).overWindow(onePrecedingUnboundedFollowing), - Aggregation.count().onColumn(2).overWindow(unboundedPrecedingAndFollowing), - Aggregation.count().onColumn(2).overWindow(unboundedPrecedingAndCurrentRow), - Aggregation.count().onColumn(2).overWindow(currentRowAndUnboundedFollowing)); + RollingAggregation.count().onColumn(2).overWindow(unboundedPrecedingOneFollowing), + RollingAggregation.count().onColumn(2).overWindow(onePrecedingUnboundedFollowing), + RollingAggregation.count().onColumn(2).overWindow(unboundedPrecedingAndFollowing), + RollingAggregation.count().onColumn(2).overWindow(unboundedPrecedingAndCurrentRow), + RollingAggregation.count().onColumn(2).overWindow(currentRowAndUnboundedFollowing)); ColumnVector expect_0 = ColumnVector.fromBoxedInts(2, 2, 3, 6, 6, 6, 2, 2, 4, 4, 5, 7, 7); ColumnVector expect_1 = ColumnVector.fromBoxedInts(6, 6, 4, 3, 3, 3, 7, 7, 5, 5, 3, 2, 2); ColumnVector expect_2 = ColumnVector.fromBoxedInts(6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7); @@ -4752,11 +5260,11 @@ void testRangeWindowingCountUnboundedDESCWithNullsLast() { try (Table windowAggResults = sorted.groupBy(0, 1) .aggregateWindowsOverRanges( - Aggregation.count().onColumn(2).overWindow(unboundedPrecedingOneFollowing), - Aggregation.count().onColumn(2).overWindow(onePrecedingUnboundedFollowing), - Aggregation.count().onColumn(2).overWindow(unboundedPrecedingAndFollowing), - Aggregation.count().onColumn(2).overWindow(unboundedPrecedingAndCurrentRow), - Aggregation.count().onColumn(2).overWindow(currentRowAndUnboundedFollowing)); + RollingAggregation.count().onColumn(2).overWindow(unboundedPrecedingOneFollowing), + RollingAggregation.count().onColumn(2).overWindow(onePrecedingUnboundedFollowing), + RollingAggregation.count().onColumn(2).overWindow(unboundedPrecedingAndFollowing), + RollingAggregation.count().onColumn(2).overWindow(unboundedPrecedingAndCurrentRow), + RollingAggregation.count().onColumn(2).overWindow(currentRowAndUnboundedFollowing)); ColumnVector expect_0 = ColumnVector.fromBoxedInts(1, 3, 3, 6, 6, 6, 1, 3, 3, 5, 5, 7, 7); ColumnVector expect_1 = ColumnVector.fromBoxedInts(6, 5, 5, 3, 3, 3, 7, 6, 6, 4, 4, 2, 2); ColumnVector expect_2 = ColumnVector.fromBoxedInts(6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7); @@ -4784,9 +5292,9 @@ void testGroupByCountWithNulls() { .column( 1, 1, 1, null, 1, 1) .build()) { try (Table tmp = t1.groupBy(0).aggregate( - Aggregation.count().onColumn(1), - Aggregation.count().onColumn(2), - Aggregation.count().onColumn(3)); + GroupByAggregation.count().onColumn(1), + GroupByAggregation.count().onColumn(2), + GroupByAggregation.count().onColumn(3)); Table t3 = tmp.orderBy(OrderByArg.asc(0, true)); HostColumnVector groupCol = t3.getColumn(0).copyToHost(); HostColumnVector countCol = t3.getColumn(1).copyToHost(); @@ -4824,10 +5332,10 @@ void testGroupByCountWithNullsIncluded() { .column( 1, 1, 1, null, 1, 1) .build()) { try (Table tmp = t1.groupBy(0).aggregate( - Aggregation.count(NullPolicy.INCLUDE).onColumn(1), - 
Aggregation.count(NullPolicy.INCLUDE).onColumn(2), - Aggregation.count(NullPolicy.INCLUDE).onColumn(3), - Aggregation.count().onColumn(3)); + GroupByAggregation.count(NullPolicy.INCLUDE).onColumn(1), + GroupByAggregation.count(NullPolicy.INCLUDE).onColumn(2), + GroupByAggregation.count(NullPolicy.INCLUDE).onColumn(3), + GroupByAggregation.count().onColumn(3)); Table t3 = tmp.orderBy(OrderByArg.asc(0, true)); HostColumnVector groupCol = t3.getColumn(0).copyToHost(); HostColumnVector countCol = t3.getColumn(1).copyToHost(); @@ -4875,9 +5383,9 @@ void testGroupByCountWithCollapsingNulls() { .build(); try (Table tmp = t1.groupBy(options, 0).aggregate( - Aggregation.count().onColumn(1), - Aggregation.count().onColumn(2), - Aggregation.count().onColumn(3)); + GroupByAggregation.count().onColumn(1), + GroupByAggregation.count().onColumn(2), + GroupByAggregation.count().onColumn(3)); Table t3 = tmp.orderBy(OrderByArg.asc(0, true)); HostColumnVector groupCol = t3.getColumn(0).copyToHost(); HostColumnVector countCol = t3.getColumn(1).copyToHost(); @@ -4908,7 +5416,7 @@ void testGroupByMax() { .column( 1, 3, 3, 5, 5, 0) .column(12.0, 14.0, 13.0, 17.0, 17.0, 17.0) .build()) { - try (Table t3 = t1.groupBy(0, 1).aggregate(Aggregation.max().onColumn(2)); + try (Table t3 = t1.groupBy(0, 1).aggregate(GroupByAggregation.max().onColumn(2)); HostColumnVector aggOut1 = t3.getColumn(2).copyToHost()) { // verify t3 assertEquals(4, t3.getRowCount()); @@ -4943,7 +5451,7 @@ void testGroupByArgMax() { .column(17.0, 14.0, 14.0, 17.0, 17.1, 17.0) .build()) { try (Table t3 = t1.groupBy(0, 1) - .aggregate(Aggregation.argMax().onColumn(2)); + .aggregate(GroupByAggregation.argMax().onColumn(2)); Table sorted = t3 .orderBy(OrderByArg.asc(0), OrderByArg.asc(1), OrderByArg.asc(2)); Table expected = new Table.TestBuilder() @@ -4965,7 +5473,7 @@ void testGroupByArgMin() { .column(17.0, 14.0, 14.0, 17.0, 17.1, 17.0) .build()) { try (Table t3 = t1.groupBy(0, 1) - .aggregate(Aggregation.argMin().onColumn(2)); + .aggregate(GroupByAggregation.argMin().onColumn(2)); Table sorted = t3 .orderBy(OrderByArg.asc(0), OrderByArg.asc(1), OrderByArg.asc(2)); Table expected = new Table.TestBuilder() @@ -4983,7 +5491,7 @@ void testGroupByMinBool() { try (Table t1 = new Table.TestBuilder() .column(true, null, false, true, null, null) .column( 1, 1, 2, 2, 3, 3).build(); - Table other = t1.groupBy(1).aggregate(Aggregation.min().onColumn(0)); + Table other = t1.groupBy(1).aggregate(GroupByAggregation.min().onColumn(0)); Table ordered = other.orderBy(OrderByArg.asc(0)); Table expected = new Table.TestBuilder() .column(1, 2, 3) @@ -4998,7 +5506,7 @@ void testGroupByMaxBool() { try (Table t1 = new Table.TestBuilder() .column(false, null, false, true, null, null) .column( 1, 1, 2, 2, 3, 3).build(); - Table other = t1.groupBy(1).aggregate(Aggregation.max().onColumn(0)); + Table other = t1.groupBy(1).aggregate(GroupByAggregation.max().onColumn(0)); Table ordered = other.orderBy(OrderByArg.asc(0)); Table expected = new Table.TestBuilder() .column(1, 2, 3) @@ -5025,12 +5533,12 @@ void testGroupByDuplicateAggregates() { .column( 1, 2, 2, 1).build()) { try (Table t3 = t1.groupBy(0, 1) .aggregate( - Aggregation.max().onColumn(2), - Aggregation.min().onColumn(2), - Aggregation.min().onColumn(2), - Aggregation.max().onColumn(2), - Aggregation.min().onColumn(2), - Aggregation.count().onColumn(1)); + GroupByAggregation.max().onColumn(2), + GroupByAggregation.min().onColumn(2), + GroupByAggregation.min().onColumn(2), + GroupByAggregation.max().onColumn(2), + 
GroupByAggregation.min().onColumn(2), + GroupByAggregation.count().onColumn(1)); Table t4 = t3.orderBy(OrderByArg.asc(2))) { // verify t4 assertEquals(4, t4.getRowCount()); @@ -5053,7 +5561,7 @@ void testGroupByMin() { .column( 1, 3, 3, 5, 5, 0) .column( 12, 14, 13, 17, 17, 17) .build()) { - try (Table t3 = t1.groupBy(0, 1).aggregate(Aggregation.min().onColumn(2)); + try (Table t3 = t1.groupBy(0, 1).aggregate(GroupByAggregation.min().onColumn(2)); HostColumnVector aggOut0 = t3.getColumn(2).copyToHost()) { // verify t3 assertEquals(4, t3.getRowCount()); @@ -5088,7 +5596,7 @@ void testGroupBySum() { .column( 1, 3, 3, 5, 5, 0) .column(12.0, 14.0, 13.0, 17.0, 17.0, 17.0) .build()) { - try (Table t3 = t1.groupBy(0, 1).aggregate(Aggregation.sum().onColumn(2)); + try (Table t3 = t1.groupBy(0, 1).aggregate(GroupByAggregation.sum().onColumn(2)); HostColumnVector aggOut1 = t3.getColumn(2).copyToHost()) { // verify t3 assertEquals(4, t3.getRowCount()); @@ -5121,7 +5629,7 @@ void testGroupByM2() { try (Table input = new Table.TestBuilder().column(1, 2, 3, 1, 2, 2, 1, 3, 3, 2) .column(0, 1, -2, 3, -4, -5, -6, 7, -8, 9) .build(); - Table results = input.groupBy(0).aggregate(Aggregation.M2() + Table results = input.groupBy(0).aggregate(GroupByAggregation.M2() .onColumn(1)); Table expected = new Table.TestBuilder().column(1, 2, 3) .column(42.0, 122.75, 114.0) @@ -5134,7 +5642,7 @@ void testGroupByM2() { try (Table input = new Table.TestBuilder().column(1, 2, 5, 3, 4, 5, 2, 3, 2, 5) .column(0, null, null, 2, 3, null, 5, 6, 7, null) .build(); - Table results = input.groupBy(0).aggregate(Aggregation.M2() + Table results = input.groupBy(0).aggregate(GroupByAggregation.M2() .onColumn(1)); Table expected = new Table.TestBuilder().column(1, 2, 3, 4, 5) .column(0.0, 2.0, 8.0, 0.0, null) @@ -5146,7 +5654,7 @@ void testGroupByM2() { try (Table input = new Table.TestBuilder().column(4, 3, 1, 2, 3, 1, 2, 2, 1, null, 3, 2, 4, 4) .column(null, null, 0.0, 1.0, 2.0, 3.0, 4.0, Double.NaN, 6.0, 7.0, 8.0, 9.0, 10.0, Double.NaN) .build(); - Table results = input.groupBy(0).aggregate(Aggregation.M2() + Table results = input.groupBy(0).aggregate(GroupByAggregation.M2() .onColumn(1)); Table expected = new Table.TestBuilder().column(1, 2, 3, 4, null) .column(18.0, Double.NaN, 18.0, Double.NaN, 0.0) @@ -5179,7 +5687,7 @@ void testGroupByM2() { Double.NEGATIVE_INFINITY, Double.POSITIVE_INFINITY) .build(); - Table results = input.groupBy(0).aggregate(Aggregation.M2() + Table results = input.groupBy(0).aggregate(GroupByAggregation.M2() .onColumn(1)); Table expected = new Table.TestBuilder().column(1, 2, 3, 4, 5) .column(Double.NaN, Double.NaN, Double.NaN, Double.NaN, 12.5) @@ -5237,7 +5745,7 @@ void testGroupByMergeM2() { partialResults3, partialResults4); Table finalResults = concatenatedResults.groupBy(0).aggregate( - Aggregation.mergeM2().onColumn(1)) + GroupByAggregation.mergeM2().onColumn(1)) ) { assertTablesAreEqual(expected, finalResults); } @@ -5255,7 +5763,7 @@ void testGroupByFirstExcludeNulls() { .column(13, 14) .build(); Table found = input.groupBy(0).aggregate( - Aggregation.nth(0, NullPolicy.EXCLUDE).onColumn(1))) { + GroupByAggregation.nth(0, NullPolicy.EXCLUDE).onColumn(1))) { assertTablesAreEqual(expected, found); } } @@ -5271,7 +5779,7 @@ void testGroupByLastExcludeNulls() { .column(12, 15) .build(); Table found = input.groupBy(0).aggregate( - Aggregation.nth(-1, NullPolicy.EXCLUDE).onColumn(1))) { + GroupByAggregation.nth(-1, NullPolicy.EXCLUDE).onColumn(1))) { assertTablesAreEqual(expected, found); } } @@ -5287,7 
+5795,7 @@ void testGroupByFirstIncludeNulls() { .column(null, 14) .build(); Table found = input.groupBy(0).aggregate( - Aggregation.nth(0, NullPolicy.INCLUDE).onColumn(1))) { + GroupByAggregation.nth(0, NullPolicy.INCLUDE).onColumn(1))) { assertTablesAreEqual(expected, found); } } @@ -5303,7 +5811,7 @@ void testGroupByLastIncludeNulls() { .column(12, null) .build(); Table found = input.groupBy(0).aggregate( - Aggregation.nth(-1, NullPolicy.INCLUDE).onColumn(1))) { + GroupByAggregation.nth(-1, NullPolicy.INCLUDE).onColumn(1))) { assertTablesAreEqual(expected, found); } } @@ -5314,7 +5822,7 @@ void testGroupByAvg() { .column( 1, 3, 3, 5, 5, 0) .column(12, 14, 13, 1, 17, 17) .build()) { - try (Table t3 = t1.groupBy(0, 1).aggregate(Aggregation.mean().onColumn(2)); + try (Table t3 = t1.groupBy(0, 1).aggregate(GroupByAggregation.mean().onColumn(2)); HostColumnVector aggOut1 = t3.getColumn(2).copyToHost()) { // verify t3 assertEquals(4, t3.getRowCount()); @@ -5349,11 +5857,11 @@ void testMultiAgg() { .column( 3, 1, 7, -1, 9, 0) .build()) { try (Table t2 = t1.groupBy(0, 1).aggregate( - Aggregation.count().onColumn(0), - Aggregation.max().onColumn(3), - Aggregation.min().onColumn(2), - Aggregation.mean().onColumn(2), - Aggregation.sum().onColumn(2)); + GroupByAggregation.count().onColumn(0), + GroupByAggregation.max().onColumn(3), + GroupByAggregation.min().onColumn(2), + GroupByAggregation.mean().onColumn(2), + GroupByAggregation.sum().onColumn(2)); HostColumnVector countOut = t2.getColumn(2).copyToHost(); HostColumnVector maxOut = t2.getColumn(3).copyToHost(); HostColumnVector minOut = t2.getColumn(4).copyToHost(); @@ -5419,7 +5927,7 @@ void testSumWithStrings() { .column(5289L, 5203L, 5303L, 5206L) .build(); Table result = t.groupBy(0).aggregate( - Aggregation.sum().onColumn(1)); + GroupByAggregation.sum().onColumn(1)); Table expected = new Table.TestBuilder() .column("1-URGENT", "3-MEDIUM") .column(5289L + 5303L, 5203L + 5206L) @@ -5517,7 +6025,7 @@ void testGroupByCollectListIncludeNulls() { Arrays.asList(0)) .build(); Table found = input.groupBy(0).aggregate( - Aggregation.collectList(NullPolicy.INCLUDE).onColumn(1))) { + GroupByAggregation.collectList(NullPolicy.INCLUDE).onColumn(1))) { assertTablesAreEqual(expected, found); } } @@ -5563,8 +6071,8 @@ void testGroupByMergeLists() { Arrays.asList(new StructData(333, "s333"), new StructData(222, "s222"), new StructData(111, "s111")), Arrays.asList(new StructData(222, "s222"), new StructData(444, "s444"))) .build(); - Table retListOfInts = input.groupBy(0).aggregate(Aggregation.mergeLists().onColumn(1)); - Table retListOfStructs = input.groupBy(0).aggregate(Aggregation.mergeLists().onColumn(2))) { + Table retListOfInts = input.groupBy(0).aggregate(GroupByAggregation.mergeLists().onColumn(1)); + Table retListOfStructs = input.groupBy(0).aggregate(GroupByAggregation.mergeLists().onColumn(2))) { assertTablesAreEqual(expectedListOfInts, retListOfInts); assertTablesAreEqual(expectedListOfStructs, retListOfStructs); } @@ -5573,7 +6081,7 @@ void testGroupByMergeLists() { @Test void testGroupByCollectSetIncludeNulls() { // test with null unequal and nan unequal - Aggregation collectSet = Aggregation.collectSet(NullPolicy.INCLUDE, + GroupByAggregation collectSet = GroupByAggregation.collectSet(NullPolicy.INCLUDE, NullEquality.UNEQUAL, NaNEquality.UNEQUAL); try (Table input = new Table.TestBuilder() .column(1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4) @@ -5589,7 +6097,7 @@ void testGroupByCollectSetIncludeNulls() { assertTablesAreEqual(expected, 
found); } // test with null equal and nan unequal - collectSet = Aggregation.collectSet(NullPolicy.INCLUDE, + collectSet = GroupByAggregation.collectSet(NullPolicy.INCLUDE, NullEquality.EQUAL, NaNEquality.UNEQUAL); try (Table input = new Table.TestBuilder() .column(1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4) @@ -5610,7 +6118,7 @@ void testGroupByCollectSetIncludeNulls() { assertTablesAreEqual(expected, found); } // test with null equal and nan equal - collectSet = Aggregation.collectSet(NullPolicy.INCLUDE, + collectSet = GroupByAggregation.collectSet(NullPolicy.INCLUDE, NullEquality.EQUAL, NaNEquality.ALL_EQUAL); try (Table input = new Table.TestBuilder() .column(1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4) @@ -5671,10 +6179,10 @@ void testGroupByMergeSets() { Arrays.asList(1e-3, 1e3, Double.NaN), Arrays.asList()) .build(); - Table retListOfInts = input.groupBy(0).aggregate(Aggregation.mergeSets().onColumn(1)); - Table retListOfDoubles = input.groupBy(0).aggregate(Aggregation.mergeSets().onColumn(2)); + Table retListOfInts = input.groupBy(0).aggregate(GroupByAggregation.mergeSets().onColumn(1)); + Table retListOfDoubles = input.groupBy(0).aggregate(GroupByAggregation.mergeSets().onColumn(2)); Table retListOfDoublesNaNEq = input.groupBy(0).aggregate( - Aggregation.mergeSets(NullEquality.UNEQUAL, NaNEquality.ALL_EQUAL).onColumn(2))) { + GroupByAggregation.mergeSets(NullEquality.UNEQUAL, NaNEquality.ALL_EQUAL).onColumn(2))) { assertTablesAreEqual(expectedListOfInts, retListOfInts); assertTablesAreEqual(expectedListOfDoubles, retListOfDoubles); assertTablesAreEqual(expectedListOfDoublesNaNEq, retListOfDoublesNaNEq); @@ -5853,6 +6361,121 @@ void testAllFilteredFromValidity() { } } + ColumnView replaceValidity(ColumnView cv, DeviceMemoryBuffer validity, long nullCount) { + assert (validity.length >= BitVectorHelper.getValidityAllocationSizeInBytes(cv.rows)); + if (cv.type.isNestedType()) { + ColumnView[] children = cv.getChildColumnViews(); + try { + return new ColumnView(cv.type, + cv.rows, + Optional.of(nullCount), + validity, + cv.getOffsets(), + children); + } finally { + for (ColumnView v : children) { + if (v != null) { + v.close(); + } + } + } + } else { + return new ColumnView(cv.type, cv.rows, Optional.of(nullCount), cv.getData(), validity, cv.getOffsets()); + } + } + + @Test + void testRemoveNullMasksIfNeeded() { + ListType nestedType = new ListType(true, new StructType(false, + new BasicType(true, DType.INT32), + new BasicType(true, DType.INT64))); + + List data1 = Arrays.asList(10, 20L); + List data2 = Arrays.asList(50, 60L); + HostColumnVector.StructData structData1 = new HostColumnVector.StructData(data1); + HostColumnVector.StructData structData2 = new HostColumnVector.StructData(data2); + + //First we create ColumnVectors + try (ColumnVector nonNullVector0 = ColumnVector.fromBoxedInts(1, 2, 3); + ColumnVector nonNullVector2 = ColumnVector.fromStrings("1", "2", "3"); + ColumnVector nonNullVector1 = ColumnVector.fromLists(nestedType, + Arrays.asList(structData1, structData2), + Arrays.asList(structData1, structData2), + Arrays.asList(structData1, structData2))) { + //Then we take the created ColumnVectors and add validity masks even though the nullCount = 0 + long allocSize = BitVectorHelper.getValidityAllocationSizeInBytes(nonNullVector0.rows); + try (DeviceMemoryBuffer dm0 = DeviceMemoryBuffer.allocate(allocSize); + DeviceMemoryBuffer dm1 = DeviceMemoryBuffer.allocate(allocSize); + DeviceMemoryBuffer dm2 = DeviceMemoryBuffer.allocate(allocSize); + DeviceMemoryBuffer 
dm3_child = + DeviceMemoryBuffer.allocate(BitVectorHelper.getValidityAllocationSizeInBytes(2))) { + Cuda.memset(dm0.address, (byte) 0xFF, allocSize); + Cuda.memset(dm1.address, (byte) 0xFF, allocSize); + Cuda.memset(dm2.address, (byte) 0xFF, allocSize); + Cuda.memset(dm3_child.address, (byte) 0xFF, + BitVectorHelper.getValidityAllocationSizeInBytes(2)); + + try (ColumnView cv0View = replaceValidity(nonNullVector0, dm0, 0); + ColumnVector cv0 = cv0View.copyToColumnVector(); + ColumnView struct = nonNullVector1.getChildColumnView(0); + ColumnView structChild0 = struct.getChildColumnView(0); + ColumnView newStructChild0 = replaceValidity(structChild0, dm3_child, 0); + ColumnView newStruct = struct.replaceChildrenWithViews(new int[]{0}, new ColumnView[]{newStructChild0}); + ColumnView list = nonNullVector1.replaceChildrenWithViews(new int[]{0}, new ColumnView[]{newStruct}); + ColumnView cv1View = replaceValidity(list, dm1, 0); + ColumnVector cv1 = cv1View.copyToColumnVector(); + ColumnView cv2View = replaceValidity(nonNullVector2, dm2, 0); + ColumnVector cv2 = cv2View.copyToColumnVector()) { + + try (Table t = new Table(new ColumnVector[]{cv0, cv1, cv2}); + Table tableWithoutNullMask = removeNullMasksIfNeeded(t); + ColumnView tableStructChild0 = t.getColumn(1).getChildColumnView(0).getChildColumnView(0); + ColumnVector tableStructChild0Cv = tableStructChild0.copyToColumnVector(); + Table expected = new Table(new ColumnVector[]{nonNullVector0, nonNullVector1, + nonNullVector2})) { + assertTrue(t.getColumn(0).hasValidityVector()); + assertTrue(t.getColumn(1).hasValidityVector()); + assertTrue(t.getColumn(2).hasValidityVector()); + assertTrue(tableStructChild0Cv.hasValidityVector()); + + assertPartialTablesAreEqual(expected, + 0, + expected.getRowCount(), + tableWithoutNullMask, + true, + true); + } + } + } + } + } + + @Test + void testRemoveNullMasksIfNeededWithNulls() { + ListType nestedType = new ListType(true, new StructType(true, + new BasicType(true, DType.INT32), + new BasicType(true, DType.INT64))); + + List data1 = Arrays.asList(0, 10L); + List data2 = Arrays.asList(50, null); + HostColumnVector.StructData structData1 = new HostColumnVector.StructData(data1); + HostColumnVector.StructData structData2 = new HostColumnVector.StructData(data2); + + //First we create ColumnVectors + try (ColumnVector nonNullVector0 = ColumnVector.fromBoxedInts(1, null, 2, 3); + ColumnVector nonNullVector1 = ColumnVector.fromStrings("1", "2", null, "3"); + ColumnVector nonNullVector2 = ColumnVector.fromLists(nestedType, + Arrays.asList(structData1, structData2), + null, + Arrays.asList(structData1, structData2), + Arrays.asList(structData1, structData2))) { + try (Table expected = new Table(new ColumnVector[]{nonNullVector0, nonNullVector1, nonNullVector2}); + Table unchangedTable = removeNullMasksIfNeeded(expected)) { + assertTablesAreEqual(expected, unchangedTable); + } + } + } + @Test void testMismatchedSizesForFilter() { Boolean[] maskVals = new Boolean[3]; @@ -6002,6 +6625,40 @@ void testParquetWriteToBufferChunkedInt96() { } } + @Test + void testParquetWriteMap() throws IOException { + ParquetWriterOptions options = ParquetWriterOptions.builder() + .withMapColumn(mapColumn("my_map", + new ParquetColumnWriterOptions("key0", false), + new ParquetColumnWriterOptions("value0"))).build(); + File f = File.createTempFile("test-map", ".parquet"); + List list1 = + Arrays.asList(new HostColumnVector.StructData(Arrays.asList("a", "b"))); + List list2 = + Arrays.asList(new 
HostColumnVector.StructData(Arrays.asList("a", "c"))); + List list3 = + Arrays.asList(new HostColumnVector.StructData(Arrays.asList("e", "d"))); + HostColumnVector.StructType structType = new HostColumnVector.StructType(true, + Arrays.asList(new HostColumnVector.BasicType(true, DType.STRING), + new HostColumnVector.BasicType(true, DType.STRING))); + try (Table t0 = new Table(ColumnVector.fromLists(new HostColumnVector.ListType(true, + structType), list1, list2, list3))) { + try (TableWriter writer = Table.writeParquetChunked(options, f)) { + writer.write(t0); + } + ParquetFileReader reader = + ParquetFileReader.open(HadoopInputFile.fromPath(new Path(f.getAbsolutePath()), + new Configuration())); + MessageType schema = reader.getFooter().getFileMetaData().getSchema(); + assertEquals(OriginalType.MAP, schema.getType("my_map").getOriginalType()); + } + try (ColumnVector cv = Table.readParquet(f).getColumn(0); + ColumnVector res = cv.getMapValue(Scalar.fromString("a")); + ColumnVector expected = ColumnVector.fromStrings("b", "c", null)) { + assertColumnsAreEqual(expected, res); + } + } + @Test void testParquetWriteToBufferChunkedWithNested() { ParquetWriterOptions options = ParquetWriterOptions.builder() diff --git a/java/src/test/java/ai/rapids/cudf/ast/CompiledExpressionTest.java b/java/src/test/java/ai/rapids/cudf/ast/CompiledExpressionTest.java index 5a64fd6ab09..13af9aff682 100644 --- a/java/src/test/java/ai/rapids/cudf/ast/CompiledExpressionTest.java +++ b/java/src/test/java/ai/rapids/cudf/ast/CompiledExpressionTest.java @@ -42,16 +42,14 @@ public class CompiledExpressionTest extends CudfTestBase { public void testColumnReferenceTransform() { try (Table t = new Table.TestBuilder().column(5, 4, 3, 2, 1).column(6, 7, 8, null, 10).build()) { // use an implicit table reference - UnaryExpression expr = new UnaryExpression(UnaryOperator.IDENTITY, - new ColumnReference(1)); + ColumnReference expr = new ColumnReference(1); try (CompiledExpression compiledExpr = expr.compile(); ColumnVector actual = compiledExpr.computeColumn(t)) { assertColumnsAreEqual(t.getColumn(1), actual); } // use an explicit table reference - expr = new UnaryExpression(UnaryOperator.IDENTITY, - new ColumnReference(1, TableReference.LEFT)); + expr = new ColumnReference(1, TableReference.LEFT); try (CompiledExpression compiledExpr = expr.compile(); ColumnVector actual = compiledExpr.computeColumn(t)) { assertColumnsAreEqual(t.getColumn(1), actual); @@ -61,22 +59,19 @@ public void testColumnReferenceTransform() { @Test public void testInvalidColumnReferenceTransform() { - // verify attempting to reference an invalid table remaps to the only valid table - UnaryExpression expr = new UnaryExpression(UnaryOperator.IDENTITY, - new ColumnReference(1, TableReference.RIGHT)); + // Verify that computeColumn throws when passed an expression operating on TableReference.RIGHT. 
+ ColumnReference expr = new ColumnReference(1, TableReference.RIGHT); try (Table t = new Table.TestBuilder().column(5, 4, 3, 2, 1).column(6, 7, 8, null, 10).build(); - CompiledExpression compiledExpr = expr.compile(); - ColumnVector actual = compiledExpr.computeColumn(t)) { - assertColumnsAreEqual(t.getColumn(1), actual); + CompiledExpression compiledExpr = expr.compile()) { + Assertions.assertThrows(CudfException.class, () -> compiledExpr.computeColumn(t).close()); } } @Test public void testBooleanLiteralTransform() { try (Table t = new Table.TestBuilder().column(true, false, null).build()) { - Literal trueLiteral = Literal.ofBoolean(true); - UnaryExpression trueExpr = new UnaryExpression(UnaryOperator.IDENTITY, trueLiteral); - try (CompiledExpression trueCompiledExpr = trueExpr.compile(); + Literal expr = Literal.ofBoolean(true); + try (CompiledExpression trueCompiledExpr = expr.compile(); ColumnVector trueExprActual = trueCompiledExpr.computeColumn(t); ColumnVector trueExprExpected = ColumnVector.fromBoxedBooleans(true, true, true)) { assertColumnsAreEqual(trueExprExpected, trueExprActual); @@ -84,7 +79,7 @@ public void testBooleanLiteralTransform() { // Uncomment the following after https://github.com/rapidsai/cudf/issues/8831 is fixed // Literal nullLiteral = Literal.ofBoolean(null); - // UnaryExpression nullExpr = new UnaryExpression(AstOperator.IDENTITY, nullLiteral); + // UnaryOperation nullExpr = new UnaryOperation(AstOperator.IDENTITY, nullLiteral); // try (CompiledExpression nullCompiledExpr = nullExpr.compile(); // ColumnVector nullExprActual = nullCompiledExpr.computeColumn(t); // ColumnVector nullExprExpected = ColumnVector.fromBoxedBooleans(null, null, null)) { @@ -98,8 +93,7 @@ public void testBooleanLiteralTransform() { // @NullSource @ValueSource(bytes = 0x12) public void testByteLiteralTransform(Byte value) { - Literal literal = Literal.ofByte(value); - UnaryExpression expr = new UnaryExpression(UnaryOperator.IDENTITY, literal); + Literal expr = Literal.ofByte(value); try (Table t = new Table.TestBuilder().column(5, 4, 3, 2, 1).column(6, 7, 8, null, 10).build(); CompiledExpression compiledExpr = expr.compile(); ColumnVector actual = compiledExpr.computeColumn(t); @@ -113,8 +107,7 @@ public void testByteLiteralTransform(Byte value) { // @NullSource @ValueSource(shorts = 0x1234) public void testShortLiteralTransform(Short value) { - Literal literal = Literal.ofShort(value); - UnaryExpression expr = new UnaryExpression(UnaryOperator.IDENTITY, literal); + Literal expr = Literal.ofShort(value); try (Table t = new Table.TestBuilder().column(5, 4, 3, 2, 1).column(6, 7, 8, null, 10).build(); CompiledExpression compiledExpr = expr.compile(); ColumnVector actual = compiledExpr.computeColumn(t); @@ -128,8 +121,7 @@ public void testShortLiteralTransform(Short value) { // @NullSource @ValueSource(ints = 0x12345678) public void testIntLiteralTransform(Integer value) { - Literal literal = Literal.ofInt(value); - UnaryExpression expr = new UnaryExpression(UnaryOperator.IDENTITY, literal); + Literal expr = Literal.ofInt(value); try (Table t = new Table.TestBuilder().column(5, 4, 3, 2, 1).column(6, 7, 8, null, 10).build(); CompiledExpression compiledExpr = expr.compile(); ColumnVector actual = compiledExpr.computeColumn(t); @@ -143,8 +135,7 @@ public void testIntLiteralTransform(Integer value) { // @NullSource @ValueSource(longs = 0x1234567890abcdefL) public void testLongLiteralTransform(Long value) { - Literal literal = Literal.ofLong(value); - UnaryExpression expr = new 
UnaryExpression(UnaryOperator.IDENTITY, literal); + Literal expr = Literal.ofLong(value); try (Table t = new Table.TestBuilder().column(5, 4, 3, 2, 1).column(6, 7, 8, null, 10).build(); CompiledExpression compiledExpr = expr.compile(); ColumnVector actual = compiledExpr.computeColumn(t); @@ -158,8 +149,7 @@ public void testLongLiteralTransform(Long value) { // @NullSource @ValueSource(floats = { 123456.789f, Float.NaN, Float.POSITIVE_INFINITY, Float.NEGATIVE_INFINITY} ) public void testFloatLiteralTransform(Float value) { - Literal literal = Literal.ofFloat(value); - UnaryExpression expr = new UnaryExpression(UnaryOperator.IDENTITY, literal); + Literal expr = Literal.ofFloat(value); try (Table t = new Table.TestBuilder().column(5, 4, 3, 2, 1).column(6, 7, 8, null, 10).build(); CompiledExpression compiledExpr = expr.compile(); ColumnVector actual = compiledExpr.computeColumn(t); @@ -173,8 +163,7 @@ public void testFloatLiteralTransform(Float value) { // @NullSource @ValueSource(doubles = { 123456.789f, Double.NaN, Double.POSITIVE_INFINITY, Double.NEGATIVE_INFINITY} ) public void testDoubleLiteralTransform(Double value) { - Literal literal = Literal.ofDouble(value); - UnaryExpression expr = new UnaryExpression(UnaryOperator.IDENTITY, literal); + Literal expr = Literal.ofDouble(value); try (Table t = new Table.TestBuilder().column(5, 4, 3, 2, 1).column(6, 7, 8, null, 10).build(); CompiledExpression compiledExpr = expr.compile(); ColumnVector actual = compiledExpr.computeColumn(t); @@ -188,8 +177,7 @@ public void testDoubleLiteralTransform(Double value) { // @NullSource @ValueSource(ints = 0x12345678) public void testTimestampDaysLiteralTransform(Integer value) { - Literal literal = Literal.ofTimestampDaysFromInt(value); - UnaryExpression expr = new UnaryExpression(UnaryOperator.IDENTITY, literal); + Literal expr = Literal.ofTimestampDaysFromInt(value); try (Table t = new Table.TestBuilder().column(5, 4, 3, 2, 1).column(6, 7, 8, null, 10).build(); CompiledExpression compiledExpr = expr.compile(); ColumnVector actual = compiledExpr.computeColumn(t); @@ -204,8 +192,7 @@ public void testTimestampDaysLiteralTransform(Integer value) { // @NullSource @ValueSource(longs = 0x1234567890abcdefL) public void testTimestampSecondsLiteralTransform(Long value) { - Literal literal = Literal.ofTimestampFromLong(DType.TIMESTAMP_SECONDS, value); - UnaryExpression expr = new UnaryExpression(UnaryOperator.IDENTITY, literal); + Literal expr = Literal.ofTimestampFromLong(DType.TIMESTAMP_SECONDS, value); try (Table t = new Table.TestBuilder().column(5, 4, 3, 2, 1).column(6, 7, 8, null, 10).build(); CompiledExpression compiledExpr = expr.compile(); ColumnVector actual = compiledExpr.computeColumn(t); @@ -220,8 +207,7 @@ public void testTimestampSecondsLiteralTransform(Long value) { // @NullSource @ValueSource(longs = 0x1234567890abcdefL) public void testTimestampMilliSecondsLiteralTransform(Long value) { - Literal literal = Literal.ofTimestampFromLong(DType.TIMESTAMP_MILLISECONDS, value); - UnaryExpression expr = new UnaryExpression(UnaryOperator.IDENTITY, literal); + Literal expr = Literal.ofTimestampFromLong(DType.TIMESTAMP_MILLISECONDS, value); try (Table t = new Table.TestBuilder().column(5, 4, 3, 2, 1).column(6, 7, 8, null, 10).build(); CompiledExpression compiledExpr = expr.compile(); ColumnVector actual = compiledExpr.computeColumn(t); @@ -236,8 +222,7 @@ public void testTimestampMilliSecondsLiteralTransform(Long value) { // @NullSource @ValueSource(longs = 0x1234567890abcdefL) public void 
testTimestampMicroSecondsLiteralTransform(Long value) { - Literal literal = Literal.ofTimestampFromLong(DType.TIMESTAMP_MICROSECONDS, value); - UnaryExpression expr = new UnaryExpression(UnaryOperator.IDENTITY, literal); + Literal expr = Literal.ofTimestampFromLong(DType.TIMESTAMP_MICROSECONDS, value); try (Table t = new Table.TestBuilder().column(5, 4, 3, 2, 1).column(6, 7, 8, null, 10).build(); CompiledExpression compiledExpr = expr.compile(); ColumnVector actual = compiledExpr.computeColumn(t); @@ -252,8 +237,7 @@ public void testTimestampMicroSecondsLiteralTransform(Long value) { // @NullSource @ValueSource(longs = 0x1234567890abcdefL) public void testTimestampNanoSecondsLiteralTransform(Long value) { - Literal literal = Literal.ofTimestampFromLong(DType.TIMESTAMP_NANOSECONDS, value); - UnaryExpression expr = new UnaryExpression(UnaryOperator.IDENTITY, literal); + Literal expr = Literal.ofTimestampFromLong(DType.TIMESTAMP_NANOSECONDS, value); try (Table t = new Table.TestBuilder().column(5, 4, 3, 2, 1).column(6, 7, 8, null, 10).build(); CompiledExpression compiledExpr = expr.compile(); ColumnVector actual = compiledExpr.computeColumn(t); @@ -268,8 +252,7 @@ public void testTimestampNanoSecondsLiteralTransform(Long value) { // @NullSource @ValueSource(ints = 0x12345678) public void testDurationDaysLiteralTransform(Integer value) { - Literal literal = Literal.ofDurationDaysFromInt(value); - UnaryExpression expr = new UnaryExpression(UnaryOperator.IDENTITY, literal); + Literal expr = Literal.ofDurationDaysFromInt(value); try (Table t = new Table.TestBuilder().column(5, 4, 3, 2, 1).column(6, 7, 8, null, 10).build(); CompiledExpression compiledExpr = expr.compile(); ColumnVector actual = compiledExpr.computeColumn(t); @@ -284,8 +267,7 @@ public void testDurationDaysLiteralTransform(Integer value) { // @NullSource @ValueSource(longs = 0x1234567890abcdefL) public void testDurationSecondsLiteralTransform(Long value) { - Literal literal = Literal.ofDurationFromLong(DType.DURATION_SECONDS, value); - UnaryExpression expr = new UnaryExpression(UnaryOperator.IDENTITY, literal); + Literal expr = Literal.ofDurationFromLong(DType.DURATION_SECONDS, value); try (Table t = new Table.TestBuilder().column(5, 4, 3, 2, 1).column(6, 7, 8, null, 10).build(); CompiledExpression compiledExpr = expr.compile(); ColumnVector actual = compiledExpr.computeColumn(t); @@ -300,8 +282,7 @@ public void testDurationSecondsLiteralTransform(Long value) { // @NullSource @ValueSource(longs = 0x1234567890abcdefL) public void testDurationMilliSecondsLiteralTransform(Long value) { - Literal literal = Literal.ofDurationFromLong(DType.DURATION_MILLISECONDS, value); - UnaryExpression expr = new UnaryExpression(UnaryOperator.IDENTITY, literal); + Literal expr = Literal.ofDurationFromLong(DType.DURATION_MILLISECONDS, value); try (Table t = new Table.TestBuilder().column(5, 4, 3, 2, 1).column(6, 7, 8, null, 10).build(); CompiledExpression compiledExpr = expr.compile(); ColumnVector actual = compiledExpr.computeColumn(t); @@ -316,8 +297,7 @@ public void testDurationMilliSecondsLiteralTransform(Long value) { // @NullSource @ValueSource(longs = 0x1234567890abcdefL) public void testDurationMicroSecondsLiteralTransform(Long value) { - Literal literal = Literal.ofDurationFromLong(DType.DURATION_MICROSECONDS, value); - UnaryExpression expr = new UnaryExpression(UnaryOperator.IDENTITY, literal); + Literal expr = Literal.ofDurationFromLong(DType.DURATION_MICROSECONDS, value); try (Table t = new Table.TestBuilder().column(5, 4, 3, 2, 
1).column(6, 7, 8, null, 10).build(); CompiledExpression compiledExpr = expr.compile(); ColumnVector actual = compiledExpr.computeColumn(t); @@ -332,8 +312,7 @@ public void testDurationMicroSecondsLiteralTransform(Long value) { // @NullSource @ValueSource(longs = 0x1234567890abcdefL) public void testDurationNanoSecondsLiteralTransform(Long value) { - Literal literal = Literal.ofDurationFromLong(DType.DURATION_NANOSECONDS, value); - UnaryExpression expr = new UnaryExpression(UnaryOperator.IDENTITY, literal); + Literal expr = Literal.ofDurationFromLong(DType.DURATION_NANOSECONDS, value); try (Table t = new Table.TestBuilder().column(5, 4, 3, 2, 1).column(6, 7, 8, null, 10).build(); CompiledExpression compiledExpr = expr.compile(); ColumnVector actual = compiledExpr.computeColumn(t); @@ -360,7 +339,7 @@ private static ArrayList mapArray(T[] in1, U[] in2, BiFunction createUnaryDoubleExpressionParams() { + private static Stream createUnaryDoubleOperationParams() { Double[] input = new Double[] { -5., 4.5, null, 2.7, 1.5 }; return Stream.of( Arguments.of(UnaryOperator.IDENTITY, input, Arrays.asList(input)), @@ -384,10 +363,10 @@ private static Stream createUnaryDoubleExpressionParams() { } @ParameterizedTest - @MethodSource("createUnaryDoubleExpressionParams") - void testUnaryDoubleExpressionTransform(UnaryOperator op, Double[] input, + @MethodSource("createUnaryDoubleOperationParams") + void testUnaryDoubleOperationTransform(UnaryOperator op, Double[] input, List expectedValues) { - UnaryExpression expr = new UnaryExpression(op, new ColumnReference(0)); + UnaryOperation expr = new UnaryOperation(op, new ColumnReference(0)); try (Table t = new Table.TestBuilder().column(input).build(); CompiledExpression compiledExpr = expr.compile(); ColumnVector actual = compiledExpr.computeColumn(t); @@ -398,17 +377,17 @@ void testUnaryDoubleExpressionTransform(UnaryOperator op, Double[] input, } @Test - void testUnaryShortExpressionTransform() { + void testUnaryShortOperationTransform() { Short[] input = new Short[] { -5, 4, null, 2, 1 }; try (Table t = new Table.TestBuilder().column(input).build()) { - UnaryExpression expr = new UnaryExpression(UnaryOperator.IDENTITY, new ColumnReference(0)); + ColumnReference expr = new ColumnReference(0); try (CompiledExpression compiledExpr = expr.compile(); ColumnVector actual = compiledExpr.computeColumn(t)) { assertColumnsAreEqual(t.getColumn(0), actual); } - expr = new UnaryExpression(UnaryOperator.BIT_INVERT, new ColumnReference(0)); - try (CompiledExpression compiledExpr = expr.compile(); + UnaryOperation expr2 = new UnaryOperation(UnaryOperator.BIT_INVERT, new ColumnReference(0)); + try (CompiledExpression compiledExpr = expr2.compile(); ColumnVector actual = compiledExpr.computeColumn(t); ColumnVector expected = ColumnVector.fromBoxedInts(4, -5, null, -3, -2)) { assertColumnsAreEqual(expected, actual); @@ -417,8 +396,8 @@ void testUnaryShortExpressionTransform() { } @Test - void testUnaryLogicalExpressionTransform() { - UnaryExpression expr = new UnaryExpression(UnaryOperator.NOT, new ColumnReference(0)); + void testUnaryLogicalOperationTransform() { + UnaryOperation expr = new UnaryOperation(UnaryOperator.NOT, new ColumnReference(0)); try (Table t = new Table.TestBuilder().column(-5L, 0L, null, 2L, 1L).build(); CompiledExpression compiledExpr = expr.compile(); ColumnVector actual = compiledExpr.computeColumn(t); @@ -427,7 +406,7 @@ void testUnaryLogicalExpressionTransform() { } } - private static Stream createBinaryFloatExpressionParams() { + private static 
Stream createBinaryFloatOperationParams() { Float[] in1 = new Float[] { -5f, 4.5f, null, 2.7f }; Float[] in2 = new Float[] { 123f, -456f, null, 0f }; return Stream.of( @@ -443,10 +422,10 @@ private static Stream createBinaryFloatExpressionParams() { } @ParameterizedTest - @MethodSource("createBinaryFloatExpressionParams") - void testBinaryFloatExpressionTransform(BinaryOperator op, Float[] in1, Float[] in2, + @MethodSource("createBinaryFloatOperationParams") + void testBinaryFloatOperationTransform(BinaryOperator op, Float[] in1, Float[] in2, List expectedValues) { - BinaryExpression expr = new BinaryExpression(op, + BinaryOperation expr = new BinaryOperation(op, new ColumnReference(0), new ColumnReference(1)); try (Table t = new Table.TestBuilder().column(in1).column(in2).build(); @@ -458,7 +437,7 @@ void testBinaryFloatExpressionTransform(BinaryOperator op, Float[] in1, Float[] } } - private static Stream createBinaryDoublePromotedExpressionParams() { + private static Stream createBinaryDoublePromotedOperationParams() { Float[] in1 = new Float[] { -5f, 4.5f, null, 2.7f }; Float[] in2 = new Float[] { 123f, -456f, null, 0f }; return Stream.of( @@ -469,10 +448,10 @@ private static Stream createBinaryDoublePromotedExpressionParams() { } @ParameterizedTest - @MethodSource("createBinaryDoublePromotedExpressionParams") - void testBinaryDoublePromotedExpressionTransform(BinaryOperator op, Float[] in1, Float[] in2, + @MethodSource("createBinaryDoublePromotedOperationParams") + void testBinaryDoublePromotedOperationTransform(BinaryOperator op, Float[] in1, Float[] in2, List expectedValues) { - BinaryExpression expr = new BinaryExpression(op, + BinaryOperation expr = new BinaryOperation(op, new ColumnReference(0), new ColumnReference(1)); try (Table t = new Table.TestBuilder().column(in1).column(in2).build(); @@ -484,7 +463,7 @@ void testBinaryDoublePromotedExpressionTransform(BinaryOperator op, Float[] in1, } } - private static Stream createBinaryComparisonExpressionParams() { + private static Stream createBinaryComparisonOperationParams() { Integer[] in1 = new Integer[] { -5, 4, null, 2, -3 }; Integer[] in2 = new Integer[] { 123, -456, null, 0, -3 }; return Stream.of( @@ -498,10 +477,10 @@ private static Stream createBinaryComparisonExpressionParams() { } @ParameterizedTest - @MethodSource("createBinaryComparisonExpressionParams") - void testBinaryComparisonExpressionTransform(BinaryOperator op, Integer[] in1, Integer[] in2, + @MethodSource("createBinaryComparisonOperationParams") + void testBinaryComparisonOperationTransform(BinaryOperator op, Integer[] in1, Integer[] in2, List expectedValues) { - BinaryExpression expr = new BinaryExpression(op, + BinaryOperation expr = new BinaryOperation(op, new ColumnReference(0), new ColumnReference(1)); try (Table t = new Table.TestBuilder().column(in1).column(in2).build(); @@ -513,7 +492,7 @@ void testBinaryComparisonExpressionTransform(BinaryOperator op, Integer[] in1, I } } - private static Stream createBinaryBitwiseExpressionParams() { + private static Stream createBinaryBitwiseOperationParams() { Integer[] in1 = new Integer[] { -5, 4, null, 2, -3 }; Integer[] in2 = new Integer[] { 123, -456, null, 0, -3 }; return Stream.of( @@ -523,10 +502,10 @@ private static Stream createBinaryBitwiseExpressionParams() { } @ParameterizedTest - @MethodSource("createBinaryBitwiseExpressionParams") - void testBinaryBitwiseExpressionTransform(BinaryOperator op, Integer[] in1, Integer[] in2, + @MethodSource("createBinaryBitwiseOperationParams") + void 
testBinaryBitwiseOperationTransform(BinaryOperator op, Integer[] in1, Integer[] in2, List expectedValues) { - BinaryExpression expr = new BinaryExpression(op, + BinaryOperation expr = new BinaryOperation(op, new ColumnReference(0), new ColumnReference(1)); try (Table t = new Table.TestBuilder().column(in1).column(in2).build(); @@ -538,7 +517,7 @@ void testBinaryBitwiseExpressionTransform(BinaryOperator op, Integer[] in1, Inte } } - private static Stream createBinaryBooleanExpressionParams() { + private static Stream createBinaryBooleanOperationParams() { Boolean[] in1 = new Boolean[] { false, true, null, true, false }; Boolean[] in2 = new Boolean[] { true, null, null, true, false }; return Stream.of( @@ -547,10 +526,10 @@ private static Stream createBinaryBooleanExpressionParams() { } @ParameterizedTest - @MethodSource("createBinaryBooleanExpressionParams") - void testBinaryBooleanExpressionTransform(BinaryOperator op, Boolean[] in1, Boolean[] in2, + @MethodSource("createBinaryBooleanOperationParams") + void testBinaryBooleanOperationTransform(BinaryOperator op, Boolean[] in1, Boolean[] in2, List expectedValues) { - BinaryExpression expr = new BinaryExpression(op, + BinaryOperation expr = new BinaryOperation(op, new ColumnReference(0), new ColumnReference(1)); try (Table t = new Table.TestBuilder().column(in1).column(in2).build(); @@ -563,9 +542,9 @@ void testBinaryBooleanExpressionTransform(BinaryOperator op, Boolean[] in1, Bool } @Test - void testMismatchedBinaryExpressionTypes() { + void testMismatchedBinaryOperationTypes() { // verify expression fails to transform if operands are not the same type - BinaryExpression expr = new BinaryExpression(BinaryOperator.ADD, + BinaryOperation expr = new BinaryOperation(BinaryOperator.ADD, new ColumnReference(0), new ColumnReference(1)); try (Table t = new Table.TestBuilder().column(1, 2, 3).column(1L, 2L, 3L).build(); diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index 2d52b517242..6b5e5b858f0 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -8,6 +8,7 @@ import rmm +from cudf.api.types import dtype from cudf import core, datasets, testing from cudf._version import get_versions from cudf.api.extensions import ( @@ -15,34 +16,36 @@ register_index_accessor, register_series_accessor, ) -from cudf.core import ( +from cudf.core.scalar import ( NA, + Scalar, +) +from cudf.core.index import ( BaseIndex, CategoricalIndex, - DataFrame, DatetimeIndex, Float32Index, Float64Index, Index, + GenericIndex, Int8Index, Int16Index, Int32Index, Int64Index, IntervalIndex, - MultiIndex, RangeIndex, - Scalar, - Series, + StringIndex, TimedeltaIndex, UInt8Index, UInt16Index, UInt32Index, UInt64Index, - cut, - from_pandas, interval_range, - merge, ) +from cudf.core.dataframe import DataFrame, from_pandas, merge +from cudf.core.series import Series +from cudf.core.multiindex import MultiIndex +from cudf.core.cut import cut from cudf.core.algorithms import factorize from cudf.core.dtypes import ( CategoricalDtype, @@ -73,7 +76,14 @@ tan, true_divide, ) -from cudf.core.reshape import concat, get_dummies, melt, merge_sorted +from cudf.core.reshape import ( + concat, + get_dummies, + melt, + merge_sorted, + pivot, + unstack, +) from cudf.core.series import isclose from cudf.core.tools.datetimes import DateOffset, to_datetime from cudf.core.tools.numeric import to_numeric diff --git a/python/cudf/cudf/_fuzz_testing/utils.py b/python/cudf/cudf/_fuzz_testing/utils.py index fe9ed4d4934..83ab02351f2 100644 --- 
a/python/cudf/cudf/_fuzz_testing/utils.py +++ b/python/cudf/cudf/_fuzz_testing/utils.py @@ -18,44 +18,44 @@ ALL_POSSIBLE_VALUES = "ALL_POSSIBLE_VALUES" _PANDAS_TO_AVRO_SCHEMA_MAP = { - np.dtype("int8"): "int", + cudf.dtype("int8"): "int", pd.Int8Dtype(): ["int", "null"], pd.Int16Dtype(): ["int", "null"], pd.Int32Dtype(): ["int", "null"], pd.Int64Dtype(): ["long", "null"], pd.BooleanDtype(): ["boolean", "null"], pd.StringDtype(): ["string", "null"], - np.dtype("bool_"): "boolean", - np.dtype("int16"): "int", - np.dtype("int32"): "int", - np.dtype("int64"): "long", - np.dtype("O"): "string", - np.dtype("str"): "string", - np.dtype("float32"): "float", - np.dtype("float64"): "double", - np.dtype("( + NullHandling.EXCLUDE + ) + ) + ) + return agg + + @classmethod + def last(cls): + cdef Aggregation agg = cls() + agg.c_obj = move( + libcudf_aggregation.make_nth_element_aggregation[aggregation]( + -1, + ( + NullHandling.EXCLUDE + ) + ) + ) + return agg + @classmethod def any(cls): cdef Aggregation agg = cls() @@ -251,7 +279,7 @@ cdef class Aggregation: nb_type = numpy_support.from_dtype(kwargs['dtype']) type_signature = (nb_type[:],) compiled_op = cudautils.compile_udf(op, type_signature) - output_np_dtype = np.dtype(compiled_op[1]) + output_np_dtype = cudf.dtype(compiled_op[1]) cpp_str = compiled_op[0].encode('UTF-8') if output_np_dtype not in np_to_cudf_types: raise TypeError( @@ -395,7 +423,7 @@ cdef class RollingAggregation: nb_type = numpy_support.from_dtype(kwargs['dtype']) type_signature = (nb_type[:],) compiled_op = cudautils.compile_udf(op, type_signature) - output_np_dtype = np.dtype(compiled_op[1]) + output_np_dtype = cudf.dtype(compiled_op[1]) cpp_str = compiled_op[0].encode('UTF-8') if output_np_dtype not in np_to_cudf_types: raise TypeError( @@ -433,6 +461,299 @@ cdef class RollingAggregation: )) return agg +cdef class GroupbyAggregation: + """A Cython wrapper for groupby aggregations. + + **This class should never be instantiated using a standard constructor, + only using one of its many factories.** These factories handle mapping + different cudf operations to their libcudf analogs, e.g. + `cudf.DataFrame.idxmin` -> `libcudf.argmin`. Additionally, they perform + any additional configuration needed to translate Python arguments into + their corresponding C++ types (for instance, C++ enumerations used for + flag arguments). The factory approach is necessary to support operations + like `df.agg(lambda x: x.sum())`; such functions are called with this + class as an argument to generation the desired aggregation. 
+ """ + @property + def kind(self): + return AggregationKind(self.c_obj.get()[0].kind).name + + @classmethod + def sum(cls): + cdef GroupbyAggregation agg = cls() + agg.c_obj = move( + libcudf_aggregation.make_sum_aggregation[groupby_aggregation]()) + return agg + + @classmethod + def min(cls): + cdef GroupbyAggregation agg = cls() + agg.c_obj = move( + libcudf_aggregation.make_min_aggregation[groupby_aggregation]()) + return agg + + @classmethod + def max(cls): + cdef GroupbyAggregation agg = cls() + agg.c_obj = move( + libcudf_aggregation.make_max_aggregation[groupby_aggregation]()) + return agg + + @classmethod + def idxmin(cls): + cdef GroupbyAggregation agg = cls() + agg.c_obj = move( + libcudf_aggregation.make_argmin_aggregation[ + groupby_aggregation]()) + return agg + + @classmethod + def idxmax(cls): + cdef GroupbyAggregation agg = cls() + agg.c_obj = move( + libcudf_aggregation.make_argmax_aggregation[ + groupby_aggregation]()) + return agg + + @classmethod + def mean(cls): + cdef GroupbyAggregation agg = cls() + agg.c_obj = move( + libcudf_aggregation.make_mean_aggregation[groupby_aggregation]()) + return agg + + @classmethod + def count(cls, dropna=True): + cdef libcudf_types.null_policy c_null_handling + if dropna: + c_null_handling = libcudf_types.null_policy.EXCLUDE + else: + c_null_handling = libcudf_types.null_policy.INCLUDE + + cdef GroupbyAggregation agg = cls() + agg.c_obj = move( + libcudf_aggregation.make_count_aggregation[groupby_aggregation]( + c_null_handling + )) + return agg + + @classmethod + def size(cls): + cdef GroupbyAggregation agg = cls() + agg.c_obj = move( + libcudf_aggregation.make_count_aggregation[groupby_aggregation]( + ( + NullHandling.INCLUDE) + )) + return agg + + @classmethod + def collect(cls): + cdef GroupbyAggregation agg = cls() + agg.c_obj = move( + libcudf_aggregation. + make_collect_list_aggregation[groupby_aggregation]()) + return agg + + @classmethod + def nunique(cls): + cdef GroupbyAggregation agg = cls() + agg.c_obj = move( + libcudf_aggregation. + make_nunique_aggregation[groupby_aggregation]()) + return agg + + @classmethod + def nth(cls, libcudf_types.size_type size): + cdef GroupbyAggregation agg = cls() + agg.c_obj = move( + libcudf_aggregation. + make_nth_element_aggregation[groupby_aggregation](size)) + return agg + + @classmethod + def product(cls): + cdef GroupbyAggregation agg = cls() + agg.c_obj = move( + libcudf_aggregation. + make_product_aggregation[groupby_aggregation]()) + return agg + prod = product + + @classmethod + def sum_of_squares(cls): + cdef GroupbyAggregation agg = cls() + agg.c_obj = move( + libcudf_aggregation. + make_sum_of_squares_aggregation[groupby_aggregation]() + ) + return agg + + @classmethod + def var(cls, ddof=1): + cdef GroupbyAggregation agg = cls() + agg.c_obj = move( + libcudf_aggregation. + make_variance_aggregation[groupby_aggregation](ddof)) + return agg + + @classmethod + def std(cls, ddof=1): + cdef GroupbyAggregation agg = cls() + agg.c_obj = move( + libcudf_aggregation. + make_std_aggregation[groupby_aggregation](ddof)) + return agg + + @classmethod + def median(cls): + cdef GroupbyAggregation agg = cls() + agg.c_obj = move( + libcudf_aggregation. 
+ make_median_aggregation[groupby_aggregation]()) + return agg + + @classmethod + def quantile(cls, q=0.5, interpolation="linear"): + cdef GroupbyAggregation agg = cls() + + if not pd.api.types.is_list_like(q): + q = [q] + + cdef vector[double] c_q = q + cdef libcudf_types.interpolation c_interp = ( + ( + ( + Interpolation[interpolation.upper()] + ) + ) + ) + agg.c_obj = move( + libcudf_aggregation.make_quantile_aggregation[groupby_aggregation]( + c_q, c_interp) + ) + return agg + + @classmethod + def unique(cls): + cdef GroupbyAggregation agg = cls() + agg.c_obj = move( + libcudf_aggregation. + make_collect_set_aggregation[groupby_aggregation]()) + return agg + + @classmethod + def first(cls): + cdef GroupbyAggregation agg = cls() + agg.c_obj = move( + libcudf_aggregation. + make_nth_element_aggregation[groupby_aggregation]( + 0, + ( + NullHandling.EXCLUDE + ) + ) + ) + return agg + + @classmethod + def last(cls): + cdef GroupbyAggregation agg = cls() + agg.c_obj = move( + libcudf_aggregation. + make_nth_element_aggregation[groupby_aggregation]( + -1, + ( + NullHandling.EXCLUDE + ) + ) + ) + return agg + +cdef class GroupbyScanAggregation: + """A Cython wrapper for groupby scan aggregations. + + **This class should never be instantiated using a standard constructor, + only using one of its many factories.** These factories handle mapping + different cudf operations to their libcudf analogs, e.g. + `cudf.DataFrame.idxmin` -> `libcudf.argmin`. Additionally, they perform + any additional configuration needed to translate Python arguments into + their corresponding C++ types (for instance, C++ enumerations used for + flag arguments). The factory approach is necessary to support operations + like `df.agg(lambda x: x.sum())`; such functions are called with this + class as an argument to generation the desired aggregation. + """ + @property + def kind(self): + return AggregationKind(self.c_obj.get()[0].kind).name + + @classmethod + def sum(cls): + cdef GroupbyScanAggregation agg = cls() + agg.c_obj = move( + libcudf_aggregation. + make_sum_aggregation[groupby_scan_aggregation]()) + return agg + + @classmethod + def min(cls): + cdef GroupbyScanAggregation agg = cls() + agg.c_obj = move( + libcudf_aggregation. + make_min_aggregation[groupby_scan_aggregation]()) + return agg + + @classmethod + def max(cls): + cdef GroupbyScanAggregation agg = cls() + agg.c_obj = move( + libcudf_aggregation. + make_max_aggregation[groupby_scan_aggregation]()) + return agg + + @classmethod + def count(cls, dropna=True): + cdef libcudf_types.null_policy c_null_handling + if dropna: + c_null_handling = libcudf_types.null_policy.EXCLUDE + else: + c_null_handling = libcudf_types.null_policy.INCLUDE + + cdef GroupbyScanAggregation agg = cls() + agg.c_obj = move( + libcudf_aggregation. + make_count_aggregation[groupby_scan_aggregation](c_null_handling)) + return agg + + @classmethod + def size(cls): + cdef GroupbyScanAggregation agg = cls() + agg.c_obj = move( + libcudf_aggregation. + make_count_aggregation[groupby_scan_aggregation]( + ( + NullHandling.INCLUDE) + )) + return agg + + @classmethod + def cumcount(cls): + cdef GroupbyScanAggregation agg = cls() + agg.c_obj = move( + libcudf_aggregation. 
+ make_count_aggregation[groupby_scan_aggregation]( + libcudf_types.null_policy.INCLUDE + )) + return agg + + # scan aggregations + # TODO: update this after adding per algorithm aggregation derived types + # https://github.com/rapidsai/cudf/issues/7106 + cumsum = sum + cummin = min + cummax = max + + cdef Aggregation make_aggregation(op, kwargs=None): r""" Parameters @@ -508,3 +829,79 @@ cdef RollingAggregation make_rolling_aggregation(op, kwargs=None): else: raise TypeError(f"Unknown aggregation {op}") return agg + +cdef GroupbyAggregation make_groupby_aggregation(op, kwargs=None): + r""" + Parameters + ---------- + op : str or callable + If callable, must meet one of the following requirements: + + * Is of the form lambda x: x.agg(*args, **kwargs), where + `agg` is the name of a supported aggregation. Used to + to specify aggregations that take arguments, e.g., + `lambda x: x.quantile(0.5)`. + * Is a user defined aggregation function that operates on + group values. In this case, the output dtype must be + specified in the `kwargs` dictionary. + \*\*kwargs : dict, optional + Any keyword arguments to be passed to the op. + + Returns + ------- + GroupbyAggregation + """ + if kwargs is None: + kwargs = {} + + cdef GroupbyAggregation agg + if isinstance(op, str): + agg = getattr(GroupbyAggregation, op)(**kwargs) + elif callable(op): + if op is list: + agg = GroupbyAggregation.collect() + elif "dtype" in kwargs: + agg = GroupbyAggregation.from_udf(op, **kwargs) + else: + agg = op(GroupbyAggregation) + else: + raise TypeError(f"Unknown aggregation {op}") + return agg + +cdef GroupbyScanAggregation make_groupby_scan_aggregation(op, kwargs=None): + r""" + Parameters + ---------- + op : str or callable + If callable, must meet one of the following requirements: + + * Is of the form lambda x: x.agg(*args, **kwargs), where + `agg` is the name of a supported aggregation. Used to + to specify aggregations that take arguments, e.g., + `lambda x: x.quantile(0.5)`. + * Is a user defined aggregation function that operates on + group values. In this case, the output dtype must be + specified in the `kwargs` dictionary. + \*\*kwargs : dict, optional + Any keyword arguments to be passed to the op. 
+ + Returns + ------- + GroupbyScanAggregation + """ + if kwargs is None: + kwargs = {} + + cdef GroupbyScanAggregation agg + if isinstance(op, str): + agg = getattr(GroupbyScanAggregation, op)(**kwargs) + elif callable(op): + if op is list: + agg = GroupbyScanAggregation.collect() + elif "dtype" in kwargs: + agg = GroupbyScanAggregation.from_udf(op, **kwargs) + else: + agg = op(GroupbyScanAggregation) + else: + raise TypeError(f"Unknown aggregation {op}") + return agg diff --git a/python/cudf/cudf/_lib/avro.pyx b/python/cudf/cudf/_lib/avro.pyx index 52ddbd8b8fb..5b644fda2f8 100644 --- a/python/cudf/cudf/_lib/avro.pyx +++ b/python/cudf/cudf/_lib/avro.pyx @@ -12,6 +12,7 @@ from cudf._lib.cpp.io.types cimport table_with_metadata from cudf._lib.cpp.types cimport size_type from cudf._lib.io.utils cimport make_source_info from cudf._lib.table cimport Table +from cudf._lib.utils cimport data_from_unique_ptr cpdef read_avro(datasource, columns=None, skip_rows=-1, num_rows=-1): @@ -52,4 +53,4 @@ cpdef read_avro(datasource, columns=None, skip_rows=-1, num_rows=-1): names = [name.decode() for name in c_result.metadata.column_names] - return Table.from_unique_ptr(move(c_result.tbl), column_names=names) + return data_from_unique_ptr(move(c_result.tbl), column_names=names) diff --git a/python/cudf/cudf/_lib/binaryop.pyx b/python/cudf/cudf/_lib/binaryop.pyx index e8305ecaf2d..7e0be09236f 100644 --- a/python/cudf/cudf/_lib/binaryop.pyx +++ b/python/cudf/cudf/_lib/binaryop.pyx @@ -28,6 +28,7 @@ from cudf.utils.dtypes import is_scalar, is_string_dtype cimport cudf._lib.cpp.binaryop as cpp_binaryop from cudf._lib.cpp.binaryop cimport binary_operator +import cudf class BinaryOperation(IntEnum): @@ -211,7 +212,7 @@ def binaryop_udf(Column lhs, Column rhs, udf_ptx, dtype): cdef type_id tid = ( ( ( - np_to_cudf_types[np.dtype(dtype)] + np_to_cudf_types[cudf.dtype(dtype)] ) ) ) diff --git a/python/cudf/cudf/_lib/concat.pyx b/python/cudf/cudf/_lib/concat.pyx index 86778e0a9e1..5266d0ac773 100644 --- a/python/cudf/cudf/_lib/concat.pyx +++ b/python/cudf/cudf/_lib/concat.pyx @@ -15,6 +15,7 @@ from cudf._lib.cpp.concatenate cimport ( from cudf._lib.cpp.table.table cimport table, table_view from cudf._lib.table cimport Table from cudf._lib.utils cimport ( + data_from_unique_ptr, make_column_views, make_table_data_views, make_table_views, @@ -52,7 +53,8 @@ cpdef concat_tables(object tables, bool ignore_index=False): c_views = make_table_data_views(tables) with nogil: c_result = move(libcudf_concatenate_tables(c_views)) - return Table.from_unique_ptr( + + return data_from_unique_ptr( move(c_result), column_names=tables[0]._column_names, index_names=None if ignore_index else tables[0]._index_names diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index d114a04eec4..ed31574b4a5 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -11,6 +11,7 @@ from libcpp.utility cimport move from libcpp.vector cimport vector from rmm._lib.device_buffer cimport DeviceBuffer + from cudf.core.buffer import Buffer from cudf._lib.column cimport Column @@ -35,6 +36,7 @@ from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport size_type +from cudf._lib.utils cimport data_from_table_view, data_from_unique_ptr # workaround for https://github.com/cython/cython/issues/3885 ctypedef const scalar constscalar @@ -178,7 +180,7 @@ def gather( ) ) - 
return Table.from_unique_ptr( + return data_from_unique_ptr( move(c_result), column_names=source_table._column_names, index_names=( @@ -210,19 +212,17 @@ def _scatter_table(Table source_table, Column scatter_map, ) ) - out_table = Table.from_unique_ptr( + data, _ = data_from_unique_ptr( move(c_result), column_names=target_table._column_names, index_names=None ) - out_table._index = ( + return data, ( None if target_table._index is None else target_table._index.copy( deep=False) ) - return out_table - def _scatter_scalar(scalars, Column scatter_map, Table target_table, bool bounds_check=True): @@ -250,19 +250,17 @@ def _scatter_scalar(scalars, Column scatter_map, ) ) - out_table = Table.from_unique_ptr( + data, _ = data_from_unique_ptr( move(c_result), column_names=target_table._column_names, index_names=None ) - out_table._index = ( + return data, ( None if target_table._index is None else target_table._index.copy( deep=False) ) - return out_table - def scatter(object input, object scatter_map, Table target, bool bounds_check=True): @@ -306,7 +304,7 @@ def _reverse_table(Table source_table): reverse_table_view )) - return Table.from_unique_ptr( + return data_from_unique_ptr( move(c_result), column_names=source_table._column_names, index_names=source_table._index_names @@ -371,7 +369,7 @@ def table_empty_like(Table input_table, bool keep_index=True): with nogil: c_result = move(cpp_copying.empty_like(input_table_view)) - return Table.from_unique_ptr( + return data_from_unique_ptr( move(c_result), column_names=input_table._column_names, index_names=( @@ -434,8 +432,8 @@ def table_slice(Table input_table, object indices, bool keep_index=True): ) num_of_result_cols = c_result.size() - result =[ - Table.from_table_view( + return [ + data_from_table_view( c_result[i], input_table, column_names=input_table._column_names, @@ -446,8 +444,6 @@ def table_slice(Table input_table, object indices, bool keep_index=True): ) ) for i in range(num_of_result_cols)] - return result - def column_split(Column input_column, object splits): @@ -505,8 +501,8 @@ def table_split(Table input_table, object splits, bool keep_index=True): ) num_of_result_cols = c_result.size() - result = [ - Table.from_table_view( + return [ + data_from_table_view( c_result[i], input_table, column_names=input_table._column_names, @@ -515,8 +511,6 @@ def table_split(Table input_table, object splits, bool keep_index=True): else None ) for i in range(num_of_result_cols)] - return result - def _copy_if_else_column_column(Column lhs, Column rhs, Column boolean_mask): @@ -642,7 +636,7 @@ def _boolean_mask_scatter_table(Table input_table, Table target_table, ) ) - return Table.from_unique_ptr( + return data_from_unique_ptr( move(c_result), column_names=target_table._column_names, index_names=target_table._index._column_names @@ -672,13 +666,15 @@ def _boolean_mask_scatter_scalar(list input_scalars, Table target_table, ) ) - return Table.from_unique_ptr( + return data_from_unique_ptr( move(c_result), column_names=target_table._column_names, index_names=target_table._index._column_names ) +# TODO: This function is currently unused but should be used in +# ColumnBase.__setitem__, see https://github.com/rapidsai/cudf/issues/8667. 
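The TODO above points at ColumnBase.__setitem__; a minimal, hedged Python sketch of the user-level masked assignment that such a boolean-mask scatter would back (the series contents are invented for illustration and are not part of this diff):

    # Hedged illustration only: masked assignment on a cudf Series, the public
    # operation a boolean-mask scatter would ultimately implement.
    import cudf

    s = cudf.Series([1, 2, 3, 4])
    mask = [True, False, True, False]
    s[mask] = 0              # rows where the mask is True are overwritten
    # s now holds [0, 2, 0, 4]
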
def boolean_mask_scatter(object input, Table target_table, Column boolean_mask): @@ -755,7 +751,7 @@ def sample(Table input, size_type n, cpp_copying.sample(tbl_view, n, replacement, seed) ) - return Table.from_unique_ptr( + return data_from_unique_ptr( move(c_output), column_names=input._column_names, index_names=( @@ -791,12 +787,12 @@ cdef class _CPackedColumns: """ Construct a ``PackedColumns`` object from a ``cudf.DataFrame``. """ - from cudf.core import RangeIndex, dtypes + import cudf.core.dtypes cdef _CPackedColumns p = _CPackedColumns.__new__(_CPackedColumns) if keep_index and ( - not isinstance(input_table.index, RangeIndex) + not isinstance(input_table.index, cudf.RangeIndex) or input_table.index.start != 0 or input_table.index.stop != len(input_table) or input_table.index.step != 1 @@ -809,7 +805,7 @@ cdef class _CPackedColumns: p.column_names = input_table._column_names p.column_dtypes = {} for name, col in input_table._data.items(): - if isinstance(col.dtype, dtypes._BaseDtype): + if isinstance(col.dtype, cudf.core.dtypes._BaseDtype): p.column_dtypes[name] = col.dtype p.c_obj = move(cpp_copying.pack(input_table_view)) @@ -887,12 +883,12 @@ cdef class _CPackedColumns: return p def unpack(self): - output_table = Table.from_table_view( + output_table = Table(*data_from_table_view( cpp_copying.unpack(self.c_obj), self, self.column_names, self.index_names - ) + )) for name, dtype in self.column_dtypes.items(): output_table._data[name] = ( diff --git a/python/cudf/cudf/_lib/cpp/aggregation.pxd b/python/cudf/cudf/_lib/cpp/aggregation.pxd index b13815c925d..13bfa49057c 100644 --- a/python/cudf/cudf/_lib/cpp/aggregation.pxd +++ b/python/cudf/cudf/_lib/cpp/aggregation.pxd @@ -43,6 +43,12 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil: cdef cppclass rolling_aggregation: aggregation.Kind kind + cdef cppclass groupby_aggregation: + aggregation.Kind kind + + cdef cppclass groupby_scan_aggregation: + aggregation.Kind kind + ctypedef enum udf_type: CUDA 'cudf::udf_type::CUDA' PTX 'cudf::udf_type::PTX' @@ -87,6 +93,11 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil: size_type n ) except + + cdef unique_ptr[T] make_nth_element_aggregation[T]( + size_type n, + null_policy null_handling + ) except + + cdef unique_ptr[T] make_collect_list_aggregation[T]() except + cdef unique_ptr[T] make_collect_set_aggregation[T]() except + diff --git a/python/cudf/cudf/_lib/cpp/copying.pxd b/python/cudf/cudf/_lib/cpp/copying.pxd index 29a6518fae8..a318dc68ac9 100644 --- a/python/cudf/cudf/_lib/cpp/copying.pxd +++ b/python/cudf/cudf/_lib/cpp/copying.pxd @@ -122,7 +122,7 @@ cdef extern from "cudf/copying.hpp" namespace "cudf" nogil: vector[size_type] splits ) except + - cdef struct packed_columns: + cdef cppclass packed_columns: unique_ptr[metadata] metadata_ unique_ptr[device_buffer] gpu_data diff --git a/python/cudf/cudf/_lib/cpp/datetime.pxd b/python/cudf/cudf/_lib/cpp/datetime.pxd index 56ebc3a77fc..ef97be3cf9e 100644 --- a/python/cudf/cudf/_lib/cpp/datetime.pxd +++ b/python/cudf/cudf/_lib/cpp/datetime.pxd @@ -18,3 +18,8 @@ cdef extern from "cudf/datetime.hpp" namespace "cudf::datetime" nogil: ) except + cdef unique_ptr[column] day_of_year(const column_view& column) except + cdef unique_ptr[column] is_leap_year(const column_view& column) except + + cdef unique_ptr[column] last_day_of_month( + const column_view& column + ) except + + cdef unique_ptr[column] extract_quarter(const column_view& column) except + + cdef unique_ptr[column] days_in_month(const column_view& column) 
except + diff --git a/python/cudf/cudf/_lib/cpp/groupby.pxd b/python/cudf/cudf/_lib/cpp/groupby.pxd index 2d8f251799d..2ecdf76842f 100644 --- a/python/cudf/cudf/_lib/cpp/groupby.pxd +++ b/python/cudf/cudf/_lib/cpp/groupby.pxd @@ -5,7 +5,10 @@ from libcpp.memory cimport unique_ptr from libcpp.pair cimport pair from libcpp.vector cimport vector -from cudf._lib.cpp.aggregation cimport aggregation +from cudf._lib.cpp.aggregation cimport ( + groupby_aggregation, + groupby_scan_aggregation, +) from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.libcpp.functional cimport reference_wrapper @@ -26,7 +29,12 @@ cdef extern from "cudf/groupby.hpp" \ cdef cppclass aggregation_request: aggregation_request() except + column_view values - vector[unique_ptr[aggregation]] aggregations + vector[unique_ptr[groupby_aggregation]] aggregations + + cdef cppclass scan_request: + scan_request() except + + column_view values + vector[unique_ptr[groupby_scan_aggregation]] aggregations cdef cppclass aggregation_result: vector[unique_ptr[column]] results @@ -76,7 +84,7 @@ cdef extern from "cudf/groupby.hpp" \ unique_ptr[table], vector[aggregation_result] ] scan( - const vector[aggregation_request]& requests, + const vector[scan_request]& requests, ) except + pair[ diff --git a/python/cudf/cudf/_lib/cpp/io/csv.pxd b/python/cudf/cudf/_lib/cpp/io/csv.pxd index c5e235b5697..4afd8732320 100644 --- a/python/cudf/cudf/_lib/cpp/io/csv.pxd +++ b/python/cudf/cudf/_lib/cpp/io/csv.pxd @@ -2,6 +2,7 @@ from libc.stdint cimport uint8_t from libcpp cimport bool +from libcpp.map cimport map from libcpp.memory cimport shared_ptr, unique_ptr from libcpp.string cimport string from libcpp.vector cimport vector @@ -49,8 +50,10 @@ cdef extern from "cudf/io/csv.hpp" \ cudf_io_types.quote_style get_quoting() except+ char get_quotechar() except+ bool is_enabled_doublequote() except+ - vector[string] get_infer_date_names() except+ - vector[int] get_infer_date_indexes() except+ + vector[string] get_parse_dates_names() except+ + vector[int] get_parse_dates_indexes() except+ + vector[string] get_parse_hex_names() except+ + vector[int] get_parse_hex_indexes() except+ # Conversion settings vector[string] get_dtype() except+ @@ -92,11 +95,14 @@ cdef extern from "cudf/io/csv.hpp" \ void set_quoting(cudf_io_types.quote_style style) except+ void set_quotechar(char val) except+ void set_doublequote(bool val) except+ - void set_infer_date_names(vector[string]) except+ - void set_infer_date_indexes(vector[int]) except+ + void set_parse_dates(vector[string]) except+ + void set_parse_dates(vector[int]) except+ + void set_parse_hex(vector[string]) except+ + void set_parse_hex(vector[int]) except+ # Conversion settings - void set_dtypes(vector[string] types) except+ + void set_dtypes(vector[data_type] types) except+ + void set_dtypes(map[string, data_type] types) except+ void set_true_values(vector[string] vals) except+ void set_false_values(vector[string] vals) except+ void set_na_values(vector[string] vals) except+ @@ -157,11 +163,15 @@ cdef extern from "cudf/io/csv.hpp" \ ) except+ csv_reader_options_builder& quotechar(char val) except+ csv_reader_options_builder& doublequote(bool val) except+ - csv_reader_options_builder& infer_date_names(vector[string]) except+ - csv_reader_options_builder& infer_date_indexes(vector[int]) except+ + csv_reader_options_builder& parse_dates(vector[string]) except+ + csv_reader_options_builder& parse_dates(vector[int]) except+ # Conversion settings 
csv_reader_options_builder& dtypes(vector[string] types) except+ + csv_reader_options_builder& dtypes(vector[data_type] types) except+ + csv_reader_options_builder& dtypes( + map[string, data_type] types + ) except+ csv_reader_options_builder& true_values(vector[string] vals) except+ csv_reader_options_builder& false_values(vector[string] vals) except+ csv_reader_options_builder& na_values(vector[string] vals) except+ diff --git a/python/cudf/cudf/_lib/cpp/io/json.pxd b/python/cudf/cudf/_lib/cpp/io/json.pxd index 6f20195e87f..2c65e329bb0 100644 --- a/python/cudf/cudf/_lib/cpp/io/json.pxd +++ b/python/cudf/cudf/_lib/cpp/io/json.pxd @@ -2,6 +2,7 @@ from libc.stdint cimport uint8_t from libcpp cimport bool +from libcpp.map cimport map from libcpp.memory cimport shared_ptr, unique_ptr from libcpp.string cimport string from libcpp.vector cimport vector @@ -25,7 +26,8 @@ cdef extern from "cudf/io/json.hpp" \ bool is_enabled_dayfirst() except+ # setter - void set_dtypes(vector[string] types) except+ + void set_dtypes(vector[data_type] types) except+ + void set_dtypes(map[string, data_type] types) except+ void set_compression( cudf_io_types.compression_type compression ) except+ @@ -47,6 +49,12 @@ cdef extern from "cudf/io/json.hpp" \ json_reader_options_builder& dtypes( vector[string] types ) except+ + json_reader_options_builder& dtypes( + vector[data_type] types + ) except+ + json_reader_options_builder& dtypes( + map[string, data_type] types + ) except+ json_reader_options_builder& compression( cudf_io_types.compression_type compression ) except+ diff --git a/python/cudf/cudf/_lib/cpp/strings/repeat.pxd b/python/cudf/cudf/_lib/cpp/strings/repeat.pxd new file mode 100644 index 00000000000..2a6754b9a11 --- /dev/null +++ b/python/cudf/cudf/_lib/cpp/strings/repeat.pxd @@ -0,0 +1,19 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr + +from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.cpp.types cimport size_type + + +cdef extern from "cudf/strings/repeat_strings.hpp" namespace "cudf::strings" \ + nogil: + + cdef unique_ptr[column] repeat_strings( + column_view strings, + size_type repeat) except + + + cdef unique_ptr[column] repeat_strings( + column_view strings, + column_view repeats) except + diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx index 773e81a0a7b..812d614e6d3 100644 --- a/python/cudf/cudf/_lib/csv.pyx +++ b/python/cudf/cudf/_lib/csv.pyx @@ -1,11 +1,16 @@ # Copyright (c) 2020, NVIDIA CORPORATION. 
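The csv.pyx changes that follow route dtype entries through libcudf data_types and the new parse_hex/parse_dates options; a minimal, hedged sketch of the reader behavior they expose (the column names and CSV payload are invented for illustration):

    # Hedged usage sketch, assuming a small in-memory CSV with a hex-encoded
    # integer column and a date column.
    import cudf
    from io import StringIO

    buf = StringIO("id,ts\n1a,2021-08-04\nff,2021-08-05\n")
    df = cudf.read_csv(
        buf,
        dtype={"id": "hex"},   # parsed as hexadecimal into an int64 column
        parse_dates=["ts"],    # parsed into a datetime column
    )
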
from libcpp cimport bool +from libcpp.map cimport map from libcpp.memory cimport make_unique, unique_ptr from libcpp.string cimport string from libcpp.utility cimport move from libcpp.vector cimport vector +cimport cudf._lib.cpp.types as libcudf_types +from cudf._lib.cpp.types cimport data_type, type_id +from cudf._lib.types cimport dtype_to_data_type + import numpy as np import pandas as pd @@ -40,6 +45,7 @@ from cudf._lib.cpp.io.types cimport ( from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.io.utils cimport make_sink_info, make_source_info from cudf._lib.table cimport Table, make_table_view +from cudf._lib.utils cimport data_from_unique_ptr ctypedef int32_t underlying_type_t_compression @@ -68,6 +74,12 @@ class Compression(IntEnum): ) +CSV_HEX_TYPE_MAP = { + "hex": np.dtype("int64"), + "hex64": np.dtype("int64"), + "hex32": np.dtype("int32") +} + cdef csv_reader_options make_csv_reader_options( object datasource, object lineterminator, @@ -116,9 +128,12 @@ cdef csv_reader_options make_csv_reader_options( cdef vector[string] c_use_cols_names cdef size_type c_nrows = nrows if nrows is not None else -1 cdef quote_style c_quoting - cdef vector[string] c_infer_date_names - cdef vector[int] c_infer_date_indexes - cdef vector[string] c_dtypes + cdef vector[string] c_parse_dates_names + cdef vector[int] c_parse_dates_indexes + cdef vector[string] c_hex_col_names + cdef vector[data_type] c_dtypes_list + cdef map[string, data_type] c_dtypes_map + cdef vector[int] c_hex_col_indexes cdef vector[string] c_true_values cdef vector[string] c_false_values cdef vector[string] c_na_values @@ -220,48 +235,61 @@ cdef csv_reader_options make_csv_reader_options( "`parse_dates`: non-lists are unsupported") for col in parse_dates: if isinstance(col, str): - c_infer_date_names.push_back(str(col).encode()) + c_parse_dates_names.push_back(str(col).encode()) elif isinstance(col, int): - c_infer_date_indexes.push_back(col) + c_parse_dates_indexes.push_back(col) else: raise NotImplementedError( "`parse_dates`: Nesting is unsupported") - csv_reader_options_c.set_infer_date_names(c_infer_date_names) - csv_reader_options_c.set_infer_date_indexes(c_infer_date_indexes) + csv_reader_options_c.set_parse_dates(c_parse_dates_names) + csv_reader_options_c.set_parse_dates(c_parse_dates_indexes) if dtype is not None: if isinstance(dtype, abc.Mapping): - c_dtypes.reserve(len(dtype)) for k, v in dtype.items(): - c_dtypes.push_back( - str( - str(k)+":"+ - _get_cudf_compatible_str_from_dtype(v) - ).encode() - ) + col_type = v + if v in CSV_HEX_TYPE_MAP: + col_type = CSV_HEX_TYPE_MAP[v] + c_hex_col_names.push_back(str(k).encode()) + + c_dtypes_map[str(k).encode()] = \ + _get_cudf_data_type_from_dtype( + cudf.dtype(col_type)) + csv_reader_options_c.set_dtypes(c_dtypes_map) + csv_reader_options_c.set_parse_hex(c_hex_col_names) elif ( cudf.utils.dtypes.is_scalar(dtype) or isinstance(dtype, ( np.dtype, pd.core.dtypes.dtypes.ExtensionDtype, type )) ): - c_dtypes.reserve(1) - c_dtypes.push_back( - _get_cudf_compatible_str_from_dtype(dtype).encode() + c_dtypes_list.reserve(1) + if dtype in CSV_HEX_TYPE_MAP: + dtype = CSV_HEX_TYPE_MAP[dtype] + c_hex_col_indexes.push_back(0) + + c_dtypes_list.push_back( + _get_cudf_data_type_from_dtype(dtype) ) + csv_reader_options_c.set_dtypes(c_dtypes_list) + csv_reader_options_c.set_parse_hex(c_hex_col_indexes) elif isinstance(dtype, abc.Iterable): - c_dtypes.reserve(len(dtype)) - for col_dtype in dtype: - c_dtypes.push_back( - _get_cudf_compatible_str_from_dtype(col_dtype).encode() + 
c_dtypes_list.reserve(len(dtype)) + for index, col_dtype in enumerate(dtype): + if col_dtype in CSV_HEX_TYPE_MAP: + col_dtype = CSV_HEX_TYPE_MAP[col_dtype] + c_hex_col_indexes.push_back(index) + + c_dtypes_list.push_back( + _get_cudf_data_type_from_dtype(col_dtype) ) + csv_reader_options_c.set_dtypes(c_dtypes_list) + csv_reader_options_c.set_parse_hex(c_hex_col_indexes) else: raise ValueError( "dtype should be a scalar/str/list-like/dict-like" ) - csv_reader_options_c.set_dtypes(c_dtypes) - if true_values is not None: c_true_values.reserve(len(true_values)) for tv in true_values: @@ -358,7 +386,7 @@ def read_csv( See Also -------- - cudf.io.csv.read_csv + cudf.read_csv """ if not isinstance(datasource, (BytesIO, StringIO, bytes, @@ -393,7 +421,7 @@ def read_csv( c_result = move(cpp_read_csv(read_csv_options_c)) meta_names = [name.decode() for name in c_result.metadata.column_names] - df = cudf.DataFrame._from_table(Table.from_unique_ptr( + df = cudf.DataFrame._from_data(*data_from_unique_ptr( move(c_result.tbl), column_names=meta_names )) @@ -428,7 +456,7 @@ cpdef write_csv( See Also -------- - cudf.io.csv.to_csv + cudf.to_csv """ cdef table_view input_table_view = \ table.view() if index is True else table.data_view() @@ -483,7 +511,7 @@ cpdef write_csv( cpp_write_csv(options) -def _get_cudf_compatible_str_from_dtype(dtype): +cdef data_type _get_cudf_data_type_from_dtype(object dtype) except +: # TODO: Remove this Error message once the # following issue is fixed: # https://github.com/rapidsai/cudf/issues/3960 @@ -493,29 +521,38 @@ def _get_cudf_compatible_str_from_dtype(dtype): "supported in CSV reader" ) - if ( - str(dtype) in cudf.utils.dtypes.ALL_TYPES or - str(dtype) in { - "hex", "hex32", "hex64", "date", "date32", "timestamp", - "timestamp[us]", "timestamp[s]", "timestamp[ms]", "timestamp[ns]", - "date64" - } - ): - return str(dtype) - pd_dtype = pd.core.dtypes.common.pandas_dtype(dtype) - - if pd_dtype in cudf.utils.dtypes.pandas_dtypes_to_cudf_dtypes: - return str(cudf.utils.dtypes.pandas_dtypes_to_cudf_dtypes[pd_dtype]) - elif isinstance(pd_dtype, np.dtype) and pd_dtype.kind in ("O", "U"): - return "str" - elif ( - pd_dtype in cudf.utils.dtypes.cudf_dtypes_to_pandas_dtypes or - str(pd_dtype) in cudf.utils.dtypes.ALL_TYPES or - cudf.utils.dtypes.is_categorical_dtype(pd_dtype) - ): - return str(pd_dtype) - else: - raise ValueError(f"dtype not understood: {dtype}") + if isinstance(dtype, str): + if str(dtype) == "date32": + return libcudf_types.data_type( + libcudf_types.type_id.TIMESTAMP_DAYS + ) + elif str(dtype) in ("date", "date64"): + return libcudf_types.data_type( + libcudf_types.type_id.TIMESTAMP_MILLISECONDS + ) + elif str(dtype) == "timestamp": + return libcudf_types.data_type( + libcudf_types.type_id.TIMESTAMP_MILLISECONDS + ) + elif str(dtype) == "timestamp[us]": + return libcudf_types.data_type( + libcudf_types.type_id.TIMESTAMP_MICROSECONDS + ) + elif str(dtype) == "timestamp[s]": + return libcudf_types.data_type( + libcudf_types.type_id.TIMESTAMP_SECONDS + ) + elif str(dtype) == "timestamp[ms]": + return libcudf_types.data_type( + libcudf_types.type_id.TIMESTAMP_MILLISECONDS + ) + elif str(dtype) == "timestamp[ns]": + return libcudf_types.data_type( + libcudf_types.type_id.TIMESTAMP_NANOSECONDS + ) + + dtype = cudf.dtype(dtype) + return dtype_to_data_type(dtype) def columns_apply_na_rep(column_names, na_rep): diff --git a/python/cudf/cudf/_lib/datetime.pyx b/python/cudf/cudf/_lib/datetime.pyx index 3b13cedcfd7..1b152f1a3b7 100644 --- 
a/python/cudf/cudf/_lib/datetime.pyx +++ b/python/cudf/cudf/_lib/datetime.pyx @@ -60,6 +60,8 @@ def extract_datetime_component(Column col, object field): def is_leap_year(Column col): + """Returns a boolean indicator whether the year of the date is a leap year + """ cdef unique_ptr[column] c_result cdef column_view col_view = col.view() @@ -67,3 +69,39 @@ def is_leap_year(Column col): c_result = move(libcudf_datetime.is_leap_year(col_view)) return Column.from_unique_ptr(move(c_result)) + + +def extract_quarter(Column col): + """ + Returns a column which contains the corresponding quarter of the year + for every timestamp inside the input column. + """ + cdef unique_ptr[column] c_result + cdef column_view col_view = col.view() + + with nogil: + c_result = move(libcudf_datetime.extract_quarter(col_view)) + + return Column.from_unique_ptr(move(c_result)) + + +def days_in_month(Column col): + """Extracts the number of days in the month of the date + """ + cdef unique_ptr[column] c_result + cdef column_view col_view = col.view() + + with nogil: + c_result = move(libcudf_datetime.days_in_month(col_view)) + + return Column.from_unique_ptr(move(c_result)) + + +def last_day_of_month(Column col): + cdef unique_ptr[column] c_result + cdef column_view col_view = col.view() + + with nogil: + c_result = move(libcudf_datetime.last_day_of_month(col_view)) + + return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/filling.pyx b/python/cudf/cudf/_lib/filling.pyx index d9fdf72415c..99a3957006b 100644 --- a/python/cudf/cudf/_lib/filling.pyx +++ b/python/cudf/cudf/_lib/filling.pyx @@ -16,6 +16,7 @@ from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport size_type from cudf._lib.scalar cimport DeviceScalar from cudf._lib.table cimport Table +from cudf._lib.utils cimport data_from_unique_ptr def fill_in_place(Column destination, int begin, int end, DeviceScalar value): @@ -70,7 +71,7 @@ def _repeat_via_column(Table inp, Column count, bool check_count): c_check_count )) - return Table.from_unique_ptr( + return data_from_unique_ptr( move(c_result), column_names=inp._column_names, index_names=inp._index_names @@ -87,7 +88,7 @@ def _repeat_via_size_type(Table inp, size_type count): count )) - return Table.from_unique_ptr( + return data_from_unique_ptr( move(c_result), column_names=inp._column_names, index_names=inp._index_names diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index 12e3f65a8a2..d7416625248 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -22,6 +22,8 @@ from libcpp.pair cimport pair from libcpp.utility cimport move from libcpp.vector cimport vector +import cudf + from cudf._lib.column cimport Column from cudf._lib.scalar cimport DeviceScalar from cudf._lib.table cimport Table @@ -30,7 +32,12 @@ from cudf._lib.scalar import as_device_scalar cimport cudf._lib.cpp.groupby as libcudf_groupby cimport cudf._lib.cpp.types as libcudf_types -from cudf._lib.aggregation cimport Aggregation, make_aggregation +from cudf._lib.aggregation cimport ( + GroupbyAggregation, + GroupbyScanAggregation, + make_groupby_aggregation, + make_groupby_scan_aggregation, +) from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.libcpp.functional cimport reference_wrapper @@ -39,6 +46,7 @@ from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.cpp.table.table cimport table, table_view from cudf._lib.cpp.types cimport 
size_type from cudf._lib.cpp.utilities.host_span cimport host_span +from cudf._lib.utils cimport data_from_unique_ptr # The sets below define the possible aggregations that can be performed on # different dtypes. These strings must be elements of the AggregationKind enum. @@ -91,41 +99,24 @@ cdef class GroupBy: c_grouped_values = move(c_groups.values) c_group_offsets = c_groups.offsets - grouped_keys = Table.from_unique_ptr( + grouped_keys = cudf.Index._from_data(*data_from_unique_ptr( move(c_grouped_keys), column_names=range(c_grouped_keys.get()[0].num_columns()) - ) - grouped_values = Table.from_unique_ptr( + )) + grouped_values = data_from_unique_ptr( move(c_grouped_values), index_names=values._index_names, column_names=values._column_names ) return grouped_keys, grouped_values, c_group_offsets - def aggregate(self, Table values, aggregations): - """ - Parameters - ---------- - values : Table - aggregations - A dict mapping column names in `Table` to a list of aggregations - to perform on that column - - Each aggregation may be specified as: - - a string (e.g., "max") - - a lambda/function - - Returns - ------- - Table of aggregated values - """ + def aggregate_internal(self, Table values, aggregations): from cudf.core.column_accessor import ColumnAccessor cdef vector[libcudf_groupby.aggregation_request] c_agg_requests cdef libcudf_groupby.aggregation_request c_agg_request cdef Column col - cdef Aggregation agg_obj + cdef GroupbyAggregation agg_obj - cdef bool scan = _is_all_scan_aggregate(aggregations) allow_empty = all(len(v) == 0 for v in aggregations.values()) included_aggregations = defaultdict(list) @@ -151,7 +142,7 @@ cdef class GroupBy: c_agg_request = move(libcudf_groupby.aggregation_request()) for agg in aggs: - agg_obj = make_aggregation(agg) + agg_obj = make_groupby_aggregation(agg) if (valid_aggregations == "ALL" or agg_obj.kind in valid_aggregations): included_aggregations[col_name].append(agg) @@ -172,32 +163,92 @@ cdef class GroupBy: vector[libcudf_groupby.aggregation_result] ] c_result - try: - with nogil: - if scan: - c_result = move( - self.c_obj.get()[0].scan( - c_agg_requests - ) - ) - else: - c_result = move( - self.c_obj.get()[0].aggregate( - c_agg_requests - ) + with nogil: + c_result = move( + self.c_obj.get()[0].aggregate( + c_agg_requests + ) + ) + + grouped_keys, _ = data_from_unique_ptr( + move(c_result.first), + column_names=self.keys._column_names + ) + + result_data = ColumnAccessor(multiindex=True) + # Note: This loop relies on the included_aggregations dict being + # insertion ordered to map results to requested aggregations by index. 
+ for i, col_name in enumerate(included_aggregations): + for j, agg_name in enumerate(included_aggregations[col_name]): + if callable(agg_name): + agg_name = agg_name.__name__ + result_data[(col_name, agg_name)] = ( + Column.from_unique_ptr(move(c_result.second[i].results[j])) + ) + + return result_data, cudf.Index._from_data(grouped_keys) + + def scan_internal(self, Table values, aggregations): + from cudf.core.column_accessor import ColumnAccessor + cdef vector[libcudf_groupby.scan_request] c_agg_requests + cdef libcudf_groupby.scan_request c_agg_request + cdef Column col + cdef GroupbyScanAggregation agg_obj + + allow_empty = all(len(v) == 0 for v in aggregations.values()) + + included_aggregations = defaultdict(list) + for i, (col_name, aggs) in enumerate(aggregations.items()): + col = values._data[col_name] + dtype = col.dtype + + valid_aggregations = ( + _LIST_AGGS if is_list_dtype(dtype) + else _STRING_AGGS if is_string_dtype(dtype) + else _CATEGORICAL_AGGS if is_categorical_dtype(dtype) + else _STRUCT_AGGS if is_struct_dtype(dtype) + else _INTERVAL_AGGS if is_interval_dtype(dtype) + else _DECIMAL_AGGS if is_decimal_dtype(dtype) + else "ALL" + ) + if (valid_aggregations is _DECIMAL_AGGS + and rmm._cuda.gpu.runtimeGetVersion() < 11000): + raise RuntimeError( + "Decimal aggregations are only supported on CUDA >= 11 " + "due to an nvcc compiler bug." + ) + + c_agg_request = move(libcudf_groupby.scan_request()) + for agg in aggs: + agg_obj = make_groupby_scan_aggregation(agg) + if (valid_aggregations == "ALL" + or agg_obj.kind in valid_aggregations): + included_aggregations[col_name].append(agg) + c_agg_request.aggregations.push_back( + move(agg_obj.c_obj) ) - except RuntimeError as e: - # TODO: remove this try..except after - # https://github.com/rapidsai/cudf/issues/7611 - # is resolved - if ("make_empty_column") in str(e): - raise NotImplementedError( - "Aggregation not supported for empty columns" - ) from e - else: - raise - - grouped_keys = Table.from_unique_ptr( + if not c_agg_request.aggregations.empty(): + c_agg_request.values = col.view() + c_agg_requests.push_back( + move(c_agg_request) + ) + + if c_agg_requests.empty() and not allow_empty: + raise DataError("All requested aggregations are unsupported.") + + cdef pair[ + unique_ptr[table], + vector[libcudf_groupby.aggregation_result] + ] c_result + + with nogil: + c_result = move( + self.c_obj.get()[0].scan( + c_agg_requests + ) + ) + + grouped_keys, _ = data_from_unique_ptr( move(c_result.first), column_names=self.keys._column_names ) @@ -213,7 +264,29 @@ cdef class GroupBy: Column.from_unique_ptr(move(c_result.second[i].results[j])) ) - return Table(data=result_data, index=grouped_keys) + return result_data, cudf.Index._from_data(grouped_keys) + + def aggregate(self, Table values, aggregations): + """ + Parameters + ---------- + values : Table + aggregations + A dict mapping column names in `Table` to a list of aggregations + to perform on that column + + Each aggregation may be specified as: + - a string (e.g., "max") + - a lambda/function + + Returns + ------- + Table of aggregated values + """ + if _is_all_scan_aggregate(aggregations): + return self.scan_internal(values, aggregations) + + return self.aggregate_internal(values, aggregations) def shift(self, Table values, int periods, list fill_values): cdef table_view view = values.view() @@ -238,16 +311,16 @@ cdef class GroupBy: self.c_obj.get()[0].shift(view, offsets, c_fill_values) ) - grouped_keys = Table.from_unique_ptr( + grouped_keys = 
cudf.Index._from_data(*data_from_unique_ptr( move(c_result.first), column_names=self.keys._column_names - ) + )) - shifted = Table.from_unique_ptr( + shifted, _ = data_from_unique_ptr( move(c_result.second), column_names=values._column_names ) - return Table(data=shifted._data, index=grouped_keys) + return shifted, grouped_keys def replace_nulls(self, Table values, object method): cdef table_view val_view = values.view() @@ -265,12 +338,10 @@ cdef class GroupBy: self.c_obj.get()[0].replace_nulls(val_view, policies) ) - grouped_result = Table.from_unique_ptr( + return data_from_unique_ptr( move(c_result.second), column_names=values._column_names - ) + )[0] - result = Table(data=grouped_result._data) - return result _GROUPBY_SCANS = {"cumcount", "cumsum", "cummin", "cummax"} diff --git a/python/cudf/cudf/_lib/hash.pyx b/python/cudf/cudf/_lib/hash.pyx index 198e7a748c9..137b19ef69c 100644 --- a/python/cudf/cudf/_lib/hash.pyx +++ b/python/cudf/cudf/_lib/hash.pyx @@ -15,6 +15,7 @@ from cudf._lib.cpp.partitioning cimport hash_partition as cpp_hash_partition from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.table cimport Table +from cudf._lib.utils cimport data_from_unique_ptr def hash_partition(Table source_table, object columns_to_hash, @@ -41,12 +42,14 @@ def hash_partition(Table source_table, object columns_to_hash, # the original table (`source_table`) is empty. We need to # return a list of zeros in this case. return ( - Table.from_unique_ptr( + *data_from_unique_ptr( move(c_result.first), column_names=source_table._column_names, - index_names=source_table._index_names if( - keep_index is True) - else None + index_names=( + source_table._index_names + if keep_index is True + else None + ) ), list(c_result.second) if c_result.second.size() diff --git a/python/cudf/cudf/_lib/interop.pyx b/python/cudf/cudf/_lib/interop.pyx index 08ea58e4587..234513733d1 100644 --- a/python/cudf/cudf/_lib/interop.pyx +++ b/python/cudf/cudf/_lib/interop.pyx @@ -21,6 +21,7 @@ from cudf._lib.cpp.interop cimport ( from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.table cimport Table +from cudf._lib.utils cimport data_from_unique_ptr def from_dlpack(dlpack_capsule): @@ -40,7 +41,7 @@ def from_dlpack(dlpack_capsule): cpp_from_dlpack(dlpack_tensor) ) - res = Table.from_unique_ptr( + res = data_from_unique_ptr( move(c_result), column_names=range(0, c_result.get()[0].num_columns()) ) @@ -164,10 +165,8 @@ def from_arrow( with nogil: c_result = move(cpp_from_arrow(cpp_arrow_table.get()[0])) - out_table = Table.from_unique_ptr( + return data_from_unique_ptr( move(c_result), column_names=column_names, index_names=index_names ) - - return out_table diff --git a/python/cudf/cudf/_lib/io/utils.pxd b/python/cudf/cudf/_lib/io/utils.pxd index 82ad9d67f78..66d93ffc531 100644 --- a/python/cudf/cudf/_lib/io/utils.pxd +++ b/python/cudf/cudf/_lib/io/utils.pxd @@ -3,6 +3,7 @@ from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector +from cudf._lib.column cimport Column from cudf._lib.cpp.io.types cimport ( column_name_info, data_sink, @@ -17,3 +18,7 @@ cdef sink_info make_sink_info(src, unique_ptr[data_sink] & data) except* cdef update_struct_field_names( Table table, vector[column_name_info]& schema_info) +cdef Column update_column_struct_field_names( + Column col, + column_name_info& info +) diff --git a/python/cudf/cudf/_lib/io/utils.pyx b/python/cudf/cudf/_lib/io/utils.pyx index 
72ab64f6249..d26cf19deaf 100644 --- a/python/cudf/cudf/_lib/io/utils.pyx +++ b/python/cudf/cudf/_lib/io/utils.pyx @@ -127,12 +127,12 @@ cdef update_struct_field_names( vector[column_name_info]& schema_info ): for i, (name, col) in enumerate(table._data.items()): - table._data[name] = _update_column_struct_field_names( + table._data[name] = update_column_struct_field_names( col, schema_info[i] ) -cdef Column _update_column_struct_field_names( +cdef Column update_column_struct_field_names( Column col, column_name_info& info ): @@ -149,7 +149,7 @@ cdef Column _update_column_struct_field_names( if col.children: children = list(col.children) for i, child in enumerate(children): - children[i] = _update_column_struct_field_names( + children[i] = update_column_struct_field_names( child, info.children[i] ) diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx index 4a15edf8a19..68d9da57e83 100644 --- a/python/cudf/cudf/_lib/json.pyx +++ b/python/cudf/cudf/_lib/json.pyx @@ -10,18 +10,22 @@ import os import cudf from libcpp cimport bool +from libcpp.map cimport map from libcpp.string cimport string from libcpp.utility cimport move from libcpp.vector cimport vector cimport cudf._lib.cpp.io.types as cudf_io_types +cimport cudf._lib.cpp.types as libcudf_types from cudf._lib.cpp.io.json cimport ( json_reader_options, read_json as libcudf_read_json, ) -from cudf._lib.cpp.types cimport size_type +from cudf._lib.cpp.types cimport data_type, size_type, type_id from cudf._lib.io.utils cimport make_source_info from cudf._lib.table cimport Table +from cudf._lib.types cimport dtype_to_data_type +from cudf._lib.utils cimport data_from_unique_ptr cpdef read_json(object filepaths_or_buffers, @@ -50,7 +54,8 @@ cpdef read_json(object filepaths_or_buffers, filepaths_or_buffers[idx] = filepaths_or_buffers[idx].encode() # Setup arguments - cdef vector[string] c_dtypes + cdef vector[data_type] c_dtypes_list + cdef map[string, data_type] c_dtypes_map cdef cudf_io_types.compression_type c_compression # Determine byte read offsets if applicable cdef size_type c_range_offset = ( @@ -70,40 +75,36 @@ cpdef read_json(object filepaths_or_buffers, c_compression = cudf_io_types.compression_type.AUTO else: c_compression = cudf_io_types.compression_type.NONE - + is_list_like_dtypes = False if dtype is False: raise ValueError("False value is unsupported for `dtype`") elif dtype is not True: if isinstance(dtype, abc.Mapping): - c_dtypes.reserve(len(dtype)) for k, v in dtype.items(): - if cudf.utils.dtypes.is_categorical_dtype(v): - raise NotImplementedError( - "CategoricalDtype as dtype is not yet " - "supported in JSON reader" - ) - c_dtypes.push_back(str(str(k) + ":" + str(v)).encode()) + c_dtypes_map[str(k).encode()] = \ + _get_cudf_data_type_from_dtype(v) elif not isinstance(dtype, abc.Iterable): raise TypeError("`dtype` must be 'list like' or 'dict'") else: - c_dtypes.reserve(len(dtype)) + is_list_like_dtypes = True + c_dtypes_list.reserve(len(dtype)) for col_dtype in dtype: - if cudf.utils.dtypes.is_categorical_dtype(col_dtype): - raise NotImplementedError( - "CategoricalDtype as dtype is not yet " - "supported in JSON reader" - ) - c_dtypes.push_back(str(col_dtype).encode()) + c_dtypes_list.push_back( + _get_cudf_data_type_from_dtype( + col_dtype)) cdef json_reader_options opts = move( json_reader_options.builder(make_source_info(filepaths_or_buffers)) - .dtypes(c_dtypes) .compression(c_compression) .lines(c_lines) .byte_range_offset(c_range_offset) .byte_range_size(c_range_size) .build() ) + if 
is_list_like_dtypes: + opts.set_dtypes(c_dtypes_list) + else: + opts.set_dtypes(c_dtypes_map) # Read JSON cdef cudf_io_types.table_with_metadata c_out_table @@ -112,5 +113,15 @@ cpdef read_json(object filepaths_or_buffers, c_out_table = move(libcudf_read_json(opts)) column_names = [x.decode() for x in c_out_table.metadata.column_names] - return Table.from_unique_ptr(move(c_out_table.tbl), - column_names=column_names) + return data_from_unique_ptr(move(c_out_table.tbl), + column_names=column_names) + +cdef data_type _get_cudf_data_type_from_dtype(object dtype) except +: + if cudf.utils.dtypes.is_categorical_dtype(dtype): + raise NotImplementedError( + "CategoricalDtype as dtype is not yet " + "supported in JSON reader" + ) + + dtype = cudf.dtype(dtype) + return dtype_to_data_type(dtype) diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index 8ada3376fdb..59c3a4b89dc 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -43,6 +43,7 @@ from cudf.core.dtypes import ListDtype from cudf._lib.cpp.lists.contains cimport contains from cudf._lib.cpp.lists.extract cimport extract_list_element +from cudf._lib.utils cimport data_from_unique_ptr def count_elements(Column col): @@ -72,7 +73,7 @@ def explode_outer(Table tbl, int explode_column_idx, bool ignore_index=False): with nogil: c_result = move(cpp_explode_outer(c_table_view, c_explode_column_idx)) - return Table.from_unique_ptr( + return data_from_unique_ptr( move(c_result), column_names=tbl._column_names, index_names=None if ignore_index else tbl._index_names diff --git a/python/cudf/cudf/_lib/merge.pyx b/python/cudf/cudf/_lib/merge.pyx index cc2d405c207..83f088f4419 100644 --- a/python/cudf/cudf/_lib/merge.pyx +++ b/python/cudf/cudf/_lib/merge.pyx @@ -11,6 +11,7 @@ from cudf._lib.cpp.merge cimport merge as cpp_merge from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.table cimport Table +from cudf._lib.utils cimport data_from_unique_ptr def merge_sorted( @@ -102,7 +103,7 @@ def merge_sorted( ) ) - return Table.from_unique_ptr( + return data_from_unique_ptr( move(c_result), column_names=source_table._column_names, index_names=index_names, diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx index 2470c15f541..995243c7ea7 100644 --- a/python/cudf/cudf/_lib/orc.pyx +++ b/python/cudf/cudf/_lib/orc.pyx @@ -40,6 +40,7 @@ from cudf._lib.cpp.types cimport data_type, size_type, type_id from cudf._lib.io.utils cimport ( make_sink_info, make_source_info, + update_column_struct_field_names, update_struct_field_names, ) from cudf._lib.table cimport Table @@ -50,7 +51,7 @@ from cudf._lib.types cimport underlying_type_t_type_id import numpy as np -from cudf._lib.utils cimport get_column_names +from cudf._lib.utils cimport data_from_unique_ptr, get_column_names from cudf._lib.utils import _index_level_name, generate_pandas_metadata @@ -83,7 +84,7 @@ cpdef read_orc(object filepaths_or_buffers, See Also -------- - cudf.io.orc.read_orc + cudf.read_orc """ cdef orc_reader_options c_orc_reader_options = make_orc_reader_options( filepaths_or_buffers, @@ -96,7 +97,7 @@ cpdef read_orc(object filepaths_or_buffers, if timestamp_type is None else ( ( - np_to_cudf_types[np.dtype(timestamp_type)] + np_to_cudf_types[cudf.dtype(timestamp_type)] ) ) ), @@ -111,11 +112,16 @@ cpdef read_orc(object filepaths_or_buffers, names = [name.decode() for name in c_result.metadata.column_names] - tbl = Table.from_unique_ptr(move(c_result.tbl), 
names) + data, index = data_from_unique_ptr(move(c_result.tbl), names) - update_struct_field_names(tbl, c_result.metadata.schema_info) + data = { + name: update_column_struct_field_names( + col, c_result.metadata.schema_info[i] + ) + for i, (name, col) in enumerate(data.items()) + } - return tbl + return data, index cdef compression_type _get_comp_type(object compression): @@ -136,7 +142,7 @@ cpdef write_orc(Table table, See Also -------- - cudf.io.orc.read_orc + cudf.read_orc """ cdef compression_type compression_ = _get_comp_type(compression) cdef table_metadata metadata_ = table_metadata() diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index 52f3aada00b..95ae2202f68 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -26,7 +26,7 @@ from cudf.utils.dtypes import ( np_to_pa_dtype, ) -from cudf._lib.utils cimport get_column_names +from cudf._lib.utils cimport data_from_unique_ptr, get_column_names from cudf._lib.utils import _index_level_name, generate_pandas_metadata @@ -178,12 +178,10 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, for c in meta['columns']: if c['field_name'] == idx_col: index_col_names[idx_col] = c['name'] - df = cudf.DataFrame._from_table( - Table.from_unique_ptr( - move(c_out_table.tbl), - column_names=column_names - ) - ) + df = cudf.DataFrame._from_data(*data_from_unique_ptr( + move(c_out_table.tbl), + column_names=column_names + )) update_struct_field_names(df, c_out_table.metadata.schema_info) @@ -201,7 +199,7 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, meta_dtype = cols_dtype_map.get(col, None) df._data[col] = cudf.core.column.column_empty( row_count=0, - dtype=np.dtype(meta_dtype) + dtype=cudf.dtype(meta_dtype) ) # Set the index column diff --git a/python/cudf/cudf/_lib/partitioning.pyx b/python/cudf/cudf/_lib/partitioning.pyx index 865138bec84..90aa6bb0344 100644 --- a/python/cudf/cudf/_lib/partitioning.pyx +++ b/python/cudf/cudf/_lib/partitioning.pyx @@ -16,6 +16,7 @@ from cudf._lib.table cimport Table from cudf._lib.stream_compaction import distinct_count as cpp_distinct_count cimport cudf._lib.cpp.types as libcudf_types +from cudf._lib.utils cimport data_from_unique_ptr def partition(Table source_table, Column partition_map, @@ -44,7 +45,7 @@ def partition(Table source_table, Column partition_map, ) return ( - Table.from_unique_ptr( + *data_from_unique_ptr( move(c_result.first), column_names=source_table._column_names, index_names=source_table._index_names if( diff --git a/python/cudf/cudf/_lib/quantiles.pyx b/python/cudf/cudf/_lib/quantiles.pyx index 45a4ff7c92c..76bf587237c 100644 --- a/python/cudf/cudf/_lib/quantiles.pyx +++ b/python/cudf/cudf/_lib/quantiles.pyx @@ -32,6 +32,7 @@ from cudf._lib.cpp.types cimport ( order_info, sorted, ) +from cudf._lib.utils cimport data_from_unique_ptr def quantile( @@ -118,7 +119,7 @@ def quantiles(Table source_table, ) ) - return Table.from_unique_ptr( + return data_from_unique_ptr( move(c_result), column_names=source_table._column_names ) diff --git a/python/cudf/cudf/_lib/reshape.pyx b/python/cudf/cudf/_lib/reshape.pyx index fbed410de86..acca2694d10 100644 --- a/python/cudf/cudf/_lib/reshape.pyx +++ b/python/cudf/cudf/_lib/reshape.pyx @@ -13,6 +13,7 @@ from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport size_type from cudf._lib.table cimport Table +from cudf._lib.utils cimport data_from_unique_ptr def 
interleave_columns(Table source_table): @@ -35,7 +36,7 @@ def tile(Table source_table, size_type count): with nogil: c_result = move(cpp_tile(c_view, c_count)) - return Table.from_unique_ptr( + return data_from_unique_ptr( move(c_result), column_names=source_table._column_names, index_names=source_table._index_names diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 8b0a34b134e..fe11d5e2627 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -30,11 +30,12 @@ from cudf.core.dtypes import ListDtype, StructDtype from cudf._lib.column cimport Column from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.table.table_view cimport table_view -from cudf._lib.table cimport Table +from cudf._lib.table cimport Table, make_table_view from cudf._lib.types cimport dtype_from_column_view, underlying_type_t_type_id from cudf._lib.interop import from_arrow, to_arrow +cimport cudf._lib.cpp.types as libcudf_types from cudf._lib.cpp.scalar.scalar cimport ( duration_scalar, fixed_point_scalar, @@ -58,10 +59,9 @@ from cudf._lib.cpp.wrappers.timestamps cimport ( timestamp_s, timestamp_us, ) +from cudf._lib.utils cimport data_from_table_view -from cudf.utils.dtypes import _decimal_to_int64, is_list_dtype, is_struct_dtype - -cimport cudf._lib.cpp.types as libcudf_types +import cudf cdef class DeviceScalar: @@ -80,7 +80,7 @@ cdef class DeviceScalar: dtype : dtype A NumPy dtype. """ - self._dtype = dtype if dtype.kind != 'U' else np.dtype('object') + self._dtype = dtype if dtype.kind != 'U' else cudf.dtype('object') self._set_value(value, self._dtype) def _set_value(self, value, dtype): @@ -119,9 +119,9 @@ cdef class DeviceScalar: def _to_host_scalar(self): if isinstance(self.dtype, cudf.Decimal64Dtype): result = _get_py_decimal_from_fixed_point(self.c_value) - elif is_struct_dtype(self.dtype): + elif cudf.api.types.is_struct_dtype(self.dtype): result = _get_py_dict_from_struct(self.c_value) - elif is_list_dtype(self.dtype): + elif cudf.api.types.is_list_dtype(self.dtype): result = _get_py_list_from_list(self.c_value) elif pd.api.types.is_string_dtype(self.dtype): result = _get_py_string_from_string(self.c_value) @@ -308,7 +308,7 @@ cdef _set_decimal64_from_scalar(unique_ptr[scalar]& s, object value, object dtype, bool valid=True): - value = _decimal_to_int64(value) if valid else 0 + value = cudf.utils.dtypes._decimal_to_int64(value) if valid else 0 s.reset( new fixed_point_scalar[decimal64]( np.int64(value), scale_type(-dtype.scale), valid @@ -338,8 +338,8 @@ cdef _set_struct_from_pydict(unique_ptr[scalar]& s, names=columns ) - cdef Table table = from_arrow(pyarrow_table, column_names=columns) - cdef table_view struct_view = table.view() + data, _ = from_arrow(pyarrow_table, column_names=columns) + cdef table_view struct_view = make_table_view(data.values()) s.reset( new struct_scalar(struct_view, valid) @@ -352,11 +352,14 @@ cdef _get_py_dict_from_struct(unique_ptr[scalar]& s): cdef table_view struct_table_view = (s.get()).view() columns = [str(i) for i in range(struct_table_view.num_columns())] - cdef Table to_arrow_table = Table.from_table_view( + data, _ = data_from_table_view( struct_table_view, None, column_names=columns ) + cdef Table to_arrow_table = Table( + cudf.core.column_accessor.ColumnAccessor(data) + ) python_dict = to_arrow(to_arrow_table, columns).to_pydict() @@ -556,7 +559,7 @@ def _is_null_host_scalar(slr): def _create_proxy_nat_scalar(dtype): cdef DeviceScalar result = DeviceScalar.__new__(DeviceScalar) 
- dtype = np.dtype(dtype) + dtype = cudf.dtype(dtype) if dtype.char in 'mM': nat = dtype.type('NaT').astype(dtype) if dtype.type == np.datetime64: diff --git a/python/cudf/cudf/_lib/sort.pyx b/python/cudf/cudf/_lib/sort.pyx index 1d15052e41a..a07017ef796 100644 --- a/python/cudf/cudf/_lib/sort.pyx +++ b/python/cudf/cudf/_lib/sort.pyx @@ -24,6 +24,7 @@ from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport null_order, null_policy, order from cudf._lib.sort cimport underlying_type_t_rank_method from cudf._lib.table cimport Table +from cudf._lib.utils cimport data_from_unique_ptr def is_sorted( @@ -276,9 +277,9 @@ def rank_columns(Table source_table, object method, str na_option, cdef unique_ptr[table] c_result c_result.reset(new table(move(c_results))) - out_table = Table.from_unique_ptr( + data, _ = data_from_unique_ptr( move(c_result), - column_names=source_table._column_names + column_names=source_table._column_names, + index_names=None ) - out_table._index = source_table._index - return out_table + return data, source_table._index diff --git a/python/cudf/cudf/_lib/stream_compaction.pyx b/python/cudf/cudf/_lib/stream_compaction.pyx index a7326efcc03..f1eca64bb87 100644 --- a/python/cudf/cudf/_lib/stream_compaction.pyx +++ b/python/cudf/cudf/_lib/stream_compaction.pyx @@ -25,6 +25,7 @@ from cudf._lib.cpp.types cimport ( size_type, ) from cudf._lib.table cimport Table +from cudf._lib.utils cimport data_from_unique_ptr def drop_nulls(Table source_table, how="any", keys=None, thresh=None): @@ -78,7 +79,7 @@ def drop_nulls(Table source_table, how="any", keys=None, thresh=None): ) ) - return Table.from_unique_ptr( + return data_from_unique_ptr( move(c_result), column_names=source_table._column_names, index_names=( @@ -115,7 +116,7 @@ def apply_boolean_mask(Table source_table, Column boolean_mask): ) ) - return Table.from_unique_ptr( + return data_from_unique_ptr( move(c_result), column_names=source_table._column_names, index_names=( @@ -192,7 +193,7 @@ def drop_duplicates(Table source_table, ) ) - return Table.from_unique_ptr( + return data_from_unique_ptr( move(c_result), column_names=source_table._column_names, index_names=( diff --git a/python/cudf/cudf/_lib/string_casting.pyx b/python/cudf/cudf/_lib/string_casting.pyx index 8f65cc9fee5..25e4149183e 100644 --- a/python/cudf/cudf/_lib/string_casting.pyx +++ b/python/cudf/cudf/_lib/string_casting.pyx @@ -10,10 +10,6 @@ from cudf._lib.scalar cimport DeviceScalar from cudf._lib.types import np_to_cudf_types -from cudf._lib.types cimport underlying_type_t_type_id - -from cudf.core.column.column import as_column - from libcpp.memory cimport unique_ptr from libcpp.string cimport string from libcpp.utility cimport move @@ -55,6 +51,9 @@ from cudf._lib.cpp.strings.convert.convert_urls cimport ( url_encode as cpp_url_encode, ) from cudf._lib.cpp.types cimport data_type, type_id +from cudf._lib.types cimport underlying_type_t_type_id + +import cudf def floating_to_string(Column input_col): @@ -115,7 +114,7 @@ def stod(Column input_col, **kwargs): A Column with strings cast to double """ - return string_to_floating(input_col, np.dtype("float64")) + return string_to_floating(input_col, cudf.dtype("float64")) def ftos(Column input_col): @@ -147,7 +146,7 @@ def stof(Column input_col, **kwargs): A Column with strings cast to float """ - return string_to_floating(input_col, np.dtype("float32")) + return string_to_floating(input_col, cudf.dtype("float32")) def integer_to_string(Column input_col): @@ -208,7 +207,7 @@ def 
stoi8(Column input_col, **kwargs): A Column with strings cast to int8 """ - return string_to_integer(input_col, np.dtype("int8")) + return string_to_integer(input_col, cudf.dtype("int8")) def i16tos(Column input_col): @@ -240,7 +239,7 @@ def stoi16(Column input_col): A Column with strings cast to int16 """ - return string_to_integer(input_col, np.dtype("int16")) + return string_to_integer(input_col, cudf.dtype("int16")) def itos(Column input_col): @@ -272,7 +271,7 @@ def stoi(Column input_col): A Column with strings cast to int32 """ - return string_to_integer(input_col, np.dtype("int32")) + return string_to_integer(input_col, cudf.dtype("int32")) def ltos(Column input_col): @@ -304,7 +303,7 @@ def stol(Column input_col, **kwargs): A Column with strings cast to int64 """ - return string_to_integer(input_col, np.dtype("int64")) + return string_to_integer(input_col, cudf.dtype("int64")) def ui8tos(Column input_col): @@ -336,7 +335,7 @@ def stoui8(Column input_col, **kwargs): A Column with strings cast to uint8 """ - return string_to_integer(input_col, np.dtype("uint8")) + return string_to_integer(input_col, cudf.dtype("uint8")) def ui16tos(Column input_col): @@ -368,7 +367,7 @@ def stoui16(Column input_col, **kwargs): A Column with strings cast to uint16 """ - return string_to_integer(input_col, np.dtype("uint16")) + return string_to_integer(input_col, cudf.dtype("uint16")) def uitos(Column input_col): @@ -400,7 +399,7 @@ def stoui(Column input_col, **kwargs): A Column with strings cast to uint32 """ - return string_to_integer(input_col, np.dtype("uint32")) + return string_to_integer(input_col, cudf.dtype("uint32")) def ultos(Column input_col): @@ -432,7 +431,7 @@ def stoul(Column input_col, **kwargs): A Column with strings cast to uint64 """ - return string_to_integer(input_col, np.dtype("uint64")) + return string_to_integer(input_col, cudf.dtype("uint64")) def _to_booleans(Column input_col, object string_true="True"): @@ -588,7 +587,7 @@ def istimestamp( """ if input_col.size == 0: - return as_column([], dtype=kwargs.get('dtype')) + return cudf.core.column.as_column([], dtype=kwargs.get('dtype')) cdef column_view input_column_view = input_col.view() cdef string c_timestamp_format = str(format).encode('UTF-8') cdef unique_ptr[column] c_result @@ -745,7 +744,7 @@ def htoi(Column input_col, **kwargs): cdef column_view input_column_view = input_col.view() cdef type_id tid = ( ( - np_to_cudf_types[kwargs.get('dtype', np.dtype("int64"))] + np_to_cudf_types[kwargs.get('dtype', cudf.dtype("int64"))] ) ) cdef data_type c_out_type = data_type(tid) diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py index 866c2861995..598ac804dd6 100644 --- a/python/cudf/cudf/_lib/strings/__init__.py +++ b/python/cudf/cudf/_lib/strings/__init__.py @@ -64,6 +64,7 @@ from cudf._lib.strings.findall import findall from cudf._lib.strings.json import get_json_object from cudf._lib.strings.padding import PadSide, center, ljust, pad, rjust, zfill +from cudf._lib.strings.repeat import repeat_scalar, repeat_sequence from cudf._lib.strings.replace import ( insert, replace, diff --git a/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx b/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx index 6eb8984b869..e35ab6489c6 100644 --- a/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx +++ b/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx @@ -6,11 +6,6 @@ from cudf._lib.column cimport Column from cudf._lib.types import np_to_cudf_types 
-from cudf._lib.cpp.types cimport DECIMAL64 -from cudf._lib.types cimport underlying_type_t_type_id - -from cudf.core.column.column import as_column - from libcpp.memory cimport unique_ptr from libcpp.string cimport string from libcpp.utility cimport move @@ -22,7 +17,8 @@ from cudf._lib.cpp.strings.convert.convert_fixed_point cimport ( is_fixed_point as cpp_is_fixed_point, to_fixed_point as cpp_to_fixed_point, ) -from cudf._lib.cpp.types cimport data_type, type_id +from cudf._lib.cpp.types cimport DECIMAL64, data_type, type_id +from cudf._lib.types cimport underlying_type_t_type_id def from_decimal(Column input_col): diff --git a/python/cudf/cudf/_lib/strings/extract.pyx b/python/cudf/cudf/_lib/strings/extract.pyx index 58558fade24..74d8e548ad1 100644 --- a/python/cudf/cudf/_lib/strings/extract.pyx +++ b/python/cudf/cudf/_lib/strings/extract.pyx @@ -11,6 +11,7 @@ from cudf._lib.cpp.strings.extract cimport extract as cpp_extract from cudf._lib.cpp.table.table cimport table from cudf._lib.scalar cimport DeviceScalar from cudf._lib.table cimport Table +from cudf._lib.utils cimport data_from_unique_ptr def extract(Column source_strings, object pattern): @@ -31,7 +32,7 @@ def extract(Column source_strings, object pattern): pattern_string )) - return Table.from_unique_ptr( + return data_from_unique_ptr( move(c_result), column_names=range(0, c_result.get()[0].num_columns()) ) diff --git a/python/cudf/cudf/_lib/strings/findall.pyx b/python/cudf/cudf/_lib/strings/findall.pyx index cc5730c467d..702b0fc8053 100644 --- a/python/cudf/cudf/_lib/strings/findall.pyx +++ b/python/cudf/cudf/_lib/strings/findall.pyx @@ -12,6 +12,7 @@ from cudf._lib.cpp.strings.findall cimport findall_re as cpp_findall_re from cudf._lib.cpp.table.table cimport table from cudf._lib.scalar cimport DeviceScalar from cudf._lib.table cimport Table +from cudf._lib.utils cimport data_from_unique_ptr def findall(Column source_strings, pattern): @@ -30,7 +31,7 @@ def findall(Column source_strings, pattern): pattern_string )) - return Table.from_unique_ptr( + return data_from_unique_ptr( move(c_result), column_names=range(0, c_result.get()[0].num_columns()) ) diff --git a/python/cudf/cudf/_lib/strings/repeat.pyx b/python/cudf/cudf/_lib/strings/repeat.pyx new file mode 100644 index 00000000000..49a46f418b1 --- /dev/null +++ b/python/cudf/cudf/_lib/strings/repeat.pyx @@ -0,0 +1,49 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move + +from cudf._lib.column cimport Column +from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.cpp.strings cimport repeat as cpp_repeat +from cudf._lib.cpp.types cimport size_type + + +def repeat_scalar(Column source_strings, + size_type repeats): + """ + Returns a Column after repeating + each string in `source_strings` + `repeats` number of times. + """ + cdef unique_ptr[column] c_result + cdef column_view source_view = source_strings.view() + + with nogil: + c_result = move(cpp_repeat.repeat_strings( + source_view, + repeats + )) + + return Column.from_unique_ptr(move(c_result)) + + +def repeat_sequence(Column source_strings, + Column repeats): + """ + Returns a Column after repeating + each string in `source_strings` + `repeats` number of times. 
+ """ + cdef unique_ptr[column] c_result + cdef column_view source_view = source_strings.view() + cdef column_view repeats_view = repeats.view() + + with nogil: + c_result = move(cpp_repeat.repeat_strings( + source_view, + repeats_view + )) + + return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/strings/split/partition.pyx b/python/cudf/cudf/_lib/strings/split/partition.pyx index 590de5bf526..0e62ab69298 100644 --- a/python/cudf/cudf/_lib/strings/split/partition.pyx +++ b/python/cudf/cudf/_lib/strings/split/partition.pyx @@ -17,6 +17,7 @@ from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport size_type from cudf._lib.scalar cimport DeviceScalar from cudf._lib.table cimport Table +from cudf._lib.utils cimport data_from_unique_ptr def partition(Column source_strings, @@ -40,7 +41,7 @@ def partition(Column source_strings, scalar_str[0] )) - return Table.from_unique_ptr( + return data_from_unique_ptr( move(c_result), column_names=range(0, c_result.get()[0].num_columns()) ) @@ -67,7 +68,7 @@ def rpartition(Column source_strings, scalar_str[0] )) - return Table.from_unique_ptr( + return data_from_unique_ptr( move(c_result), column_names=range(0, c_result.get()[0].num_columns()) ) diff --git a/python/cudf/cudf/_lib/strings/split/split.pyx b/python/cudf/cudf/_lib/strings/split/split.pyx index 599f7602b51..a2ce237ced6 100644 --- a/python/cudf/cudf/_lib/strings/split/split.pyx +++ b/python/cudf/cudf/_lib/strings/split/split.pyx @@ -19,6 +19,7 @@ from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport size_type from cudf._lib.scalar cimport DeviceScalar from cudf._lib.table cimport Table +from cudf._lib.utils cimport data_from_unique_ptr def split(Column source_strings, @@ -45,7 +46,7 @@ def split(Column source_strings, maxsplit )) - return Table.from_unique_ptr( + return data_from_unique_ptr( move(c_result), column_names=range(0, c_result.get()[0].num_columns()) ) @@ -104,7 +105,7 @@ def rsplit(Column source_strings, maxsplit )) - return Table.from_unique_ptr( + return data_from_unique_ptr( move(c_result), column_names=range(0, c_result.get()[0].num_columns()) ) diff --git a/python/cudf/cudf/_lib/table.pxd b/python/cudf/cudf/_lib/table.pxd index e1bffbc3864..0730199c8a9 100644 --- a/python/cudf/cudf/_lib/table.pxd +++ b/python/cudf/cudf/_lib/table.pxd @@ -16,21 +16,6 @@ cdef class Table: cdef table_view index_view(self) except * cdef mutable_table_view mutable_index_view(self) except * - @staticmethod - cdef Table from_unique_ptr( - unique_ptr[table] c_tbl, - column_names, - index_names=* - ) - - @staticmethod - cdef Table from_table_view( - table_view, - owner, - column_names, - index_names=* - ) - cdef table_view make_table_view(columns) except * cdef mutable_table_view make_mutable_table_view(columns) except * cdef columns_from_ptr(unique_ptr[table] c_tbl) diff --git a/python/cudf/cudf/_lib/table.pyi b/python/cudf/cudf/_lib/table.pyi index 2a5dfb2a4dd..ccf0eab99dc 100644 --- a/python/cudf/cudf/_lib/table.pyi +++ b/python/cudf/cudf/_lib/table.pyi @@ -6,7 +6,7 @@ import cudf class Table(object): _data: cudf.core.column_accessor.ColumnAccessor - _index: Optional[cudf.core.index.Index] + _index: Optional[cudf.core.index.BaseIndex] def __init__(self, data: object = None, index: object = None) -> None: ... 
diff --git a/python/cudf/cudf/_lib/table.pyx b/python/cudf/cudf/_lib/table.pyx index 07d7a0fcf02..2981a46a54a 100644 --- a/python/cudf/cudf/_lib/table.pyx +++ b/python/cudf/cudf/_lib/table.pyx @@ -4,8 +4,6 @@ import itertools import numpy as np -from cudf.core.column_accessor import ColumnAccessor - from cython.operator cimport dereference from libc.stdint cimport uintptr_t from libcpp.memory cimport unique_ptr @@ -19,6 +17,8 @@ from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport mutable_table_view, table_view from cudf._lib.cpp.types cimport size_type +import cudf + cdef class Table: def __init__(self, object data=None, object index=None): @@ -34,7 +34,7 @@ cdef class Table: """ if data is None: data = {} - self._data = ColumnAccessor(data) + self._data = cudf.core.column_accessor.ColumnAccessor(data) self._index = index @property @@ -71,106 +71,6 @@ cdef class Table: """ return self._data.columns - @staticmethod - cdef Table from_unique_ptr( - unique_ptr[table] c_tbl, - object column_names, - object index_names=None - ): - """ - Construct a Table from a unique_ptr to a cudf::table. - - Parameters - ---------- - c_tbl : unique_ptr[cudf::table] - index_names : iterable - column_names : iterable - """ - cdef vector[unique_ptr[column]] columns - columns = move(c_tbl.get()[0].release()) - - cdef vector[unique_ptr[column]].iterator it = columns.begin() - - # First construct the index, if any - cdef int i - - index = None - if index_names is not None: - index_data = ColumnAccessor._create_unsafe( - { - name: Column.from_unique_ptr( - move(dereference(it + i)) - ) - for i, name in enumerate(index_names) - } - ) - index = Table(data=index_data) - - # Construct the data dict - cdef int n_index_columns = len(index_names) if index_names else 0 - data = ColumnAccessor._create_unsafe( - { - name: Column.from_unique_ptr( - move(dereference(it + i + n_index_columns)) - ) - for i, name in enumerate(column_names) - } - ) - - return Table(data=data, index=index) - - @staticmethod - cdef Table from_table_view( - table_view tv, - object owner, - object column_names, - object index_names=None - ): - """ - Given a ``cudf::table_view``, constructs a ``cudf.Table`` from it, - along with referencing an ``owner`` Python object that owns the memory - lifetime. If ``owner`` is a ``cudf.Table``, we reach inside of it and - reach inside of each ``cudf.Column`` to make the owner of each newly - created ``Buffer`` underneath the ``cudf.Column`` objects of the - created ``cudf.Table`` the respective ``Buffer`` from the relevant - ``cudf.Column`` of the ``owner`` ``cudf.Table``. 
- """ - cdef size_type column_idx = 0 - table_owner = isinstance(owner, Table) - - # First construct the index, if any - index = None - if index_names is not None: - index_columns = [] - for _ in index_names: - column_owner = owner - if table_owner: - column_owner = owner._index._columns[column_idx] - index_columns.append( - Column.from_column_view( - tv.column(column_idx), - column_owner - ) - ) - column_idx += 1 - index = Table(dict(zip(index_names, index_columns))) - - # Construct the data dict - cdef size_type source_column_idx = 0 - data_columns = [] - for _ in column_names: - column_owner = owner - if table_owner: - column_owner = owner._columns[source_column_idx] - data_columns.append( - Column.from_column_view(tv.column(column_idx), column_owner) - ) - column_idx += 1 - source_column_idx += 1 - data = dict(zip(column_names, data_columns)) - - return Table(data=data, index=index) - cdef table_view view(self) except *: """ Return a cudf::table_view of all columns (including index columns) diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx index 9a0c06a6fa1..9fada59640e 100644 --- a/python/cudf/cudf/_lib/transform.pyx +++ b/python/cudf/cudf/_lib/transform.pyx @@ -27,6 +27,7 @@ from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.types cimport underlying_type_t_type_id +from cudf._lib.utils cimport data_from_unique_ptr from numba.np import numpy_support @@ -57,8 +58,9 @@ def mask_to_bools(object mask_buffer, size_type begin_bit, size_type end_bit): Given a mask buffer, returns a boolean column representng bit 0 -> False and 1 -> True within range of [begin_bit, end_bit), """ - if not isinstance(mask_buffer, cudf.core.Buffer): - raise TypeError("mask_buffer is not an instance of cudf.core.Buffer") + if not isinstance(mask_buffer, cudf.core.buffer.Buffer): + raise TypeError("mask_buffer is not an instance of " + "cudf.core.buffer.Buffer") cdef bitmask_type* bit_mask = (mask_buffer.ptr) cdef unique_ptr[column] result @@ -97,7 +99,7 @@ def transform(Column input, op): nb_signature = (nb_type,) compiled_op = cudautils.compile_udf(op, nb_signature) c_str = compiled_op[0].encode('UTF-8') - np_dtype = np.dtype(compiled_op[1]) + np_dtype = cudf.dtype(compiled_op[1]) try: c_tid = ( @@ -151,7 +153,7 @@ def table_encode(Table input): c_result = move(libcudf_transform.encode(c_input)) return ( - Table.from_unique_ptr( + *data_from_unique_ptr( move(c_result.first), column_names=input._column_names, ), diff --git a/python/cudf/cudf/_lib/transpose.pyx b/python/cudf/cudf/_lib/transpose.pyx index 7e4423419c9..0f8f0b6ea14 100644 --- a/python/cudf/cudf/_lib/transpose.pyx +++ b/python/cudf/cudf/_lib/transpose.pyx @@ -14,6 +14,7 @@ from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.transpose cimport transpose as cpp_transpose from cudf._lib.table cimport Table +from cudf._lib.utils cimport data_from_table_view def transpose(Table source): @@ -51,14 +52,14 @@ def transpose(Table source): c_result = move(cpp_transpose(c_input)) result_owner = Column.from_unique_ptr(move(c_result.first)) - result = Table.from_table_view( + data, _ = data_from_table_view( c_result.second, owner=result_owner, column_names=range(source._num_rows) ) if cats is not None: - result = Table(index=result._index, data=[ + data= [ (name, cudf.core.column.column.build_categorical_column( 
codes=cudf.core.column.column.as_column( col.base_data, dtype=col.dtype), @@ -67,7 +68,7 @@ def transpose(Table source): categories=cats, offset=col.offset, )) - for name, col in result._data.items() - ]) + for name, col in data.items() + ] - return result + return data diff --git a/python/cudf/cudf/_lib/types.pyx b/python/cudf/cudf/_lib/types.pyx index d93e1b75376..d3a4c45f213 100644 --- a/python/cudf/cudf/_lib/types.pyx +++ b/python/cudf/cudf/_lib/types.pyx @@ -30,6 +30,7 @@ from cudf.utils.dtypes import ( ) cimport cudf._lib.cpp.types as libcudf_types +import cudf class TypeId(IntEnum): @@ -188,11 +189,11 @@ cdef dtype_from_lists_column_view(column_view cv): cdef column_view child = lv.get()[0].child() if child.type().id() == libcudf_types.type_id.LIST: - return ListDtype(dtype_from_lists_column_view(child)) + return cudf.ListDtype(dtype_from_lists_column_view(child)) elif child.type().id() == libcudf_types.type_id.EMPTY: - return ListDtype(np.dtype("int8")) + return cudf.ListDtype("int8") else: - return ListDtype( + return cudf.ListDtype( dtype_from_column_view(child) ) @@ -201,7 +202,7 @@ cdef dtype_from_structs_column_view(column_view cv): str(i): dtype_from_column_view(cv.child(i)) for i in range(cv.num_children()) } - return StructDtype(fields) + return cudf.StructDtype(fields) cdef dtype_from_column_view(column_view cv): cdef libcudf_types.type_id tid = cv.type().id() @@ -210,26 +211,26 @@ cdef dtype_from_column_view(column_view cv): elif tid == libcudf_types.type_id.STRUCT: return dtype_from_structs_column_view(cv) elif tid == libcudf_types.type_id.DECIMAL64: - return Decimal64Dtype( - precision=Decimal64Dtype.MAX_PRECISION, + return cudf.Decimal64Dtype( + precision=cudf.Decimal64Dtype.MAX_PRECISION, scale=-cv.type().scale() ) elif tid == libcudf_types.type_id.DECIMAL32: - return Decimal32Dtype( - precision=Decimal32Dtype.MAX_PRECISION, + return cudf.Decimal32Dtype( + precision=cudf.Decimal32Dtype.MAX_PRECISION, scale=-cv.type().scale() ) else: return cudf_to_np_types[(tid)] cdef libcudf_types.data_type dtype_to_data_type(dtype) except *: - if is_list_dtype(dtype): + if cudf.api.types.is_list_dtype(dtype): tid = libcudf_types.type_id.LIST - elif is_struct_dtype(dtype): + elif cudf.api.types.is_struct_dtype(dtype): tid = libcudf_types.type_id.STRUCT - elif is_decimal64_dtype(dtype): + elif cudf.api.types.is_decimal64_dtype(dtype): tid = libcudf_types.type_id.DECIMAL64 - elif is_decimal32_dtype(dtype): + elif cudf.api.types.is_decimal32_dtype(dtype): tid = libcudf_types.type_id.DECIMAL32 else: tid = ( diff --git a/python/cudf/cudf/_lib/utils.pxd b/python/cudf/cudf/_lib/utils.pxd index e8ac858d8b2..f9b225a0b89 100644 --- a/python/cudf/cudf/_lib/utils.pxd +++ b/python/cudf/cudf/_lib/utils.pxd @@ -1,10 +1,11 @@ # Copyright (c) 2020-2021, NVIDIA CORPORATION. 
+from libcpp.memory cimport unique_ptr from libcpp.string cimport string from libcpp.vector cimport vector from cudf._lib.cpp.column.column cimport column_view -from cudf._lib.cpp.table.table cimport table_view +from cudf._lib.cpp.table.table cimport table, table_view from cudf._lib.table cimport Table @@ -12,3 +13,7 @@ cdef vector[column_view] make_column_views(object columns) except* cdef vector[table_view] make_table_views(object tables) except* cdef vector[table_view] make_table_data_views(object tables) except* cdef vector[string] get_column_names(Table table, object index) except* +cdef data_from_unique_ptr( + unique_ptr[table] c_tbl, column_names, index_names=*) +cdef data_from_table_view( + table_view tv, object owner, object column_names, object index_names=*) diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index d42e15df9f3..81b62159b59 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -4,13 +4,17 @@ import pyarrow as pa import cudf +from cython.operator cimport dereference from libc.stdint cimport uint8_t +from libcpp.memory cimport unique_ptr from libcpp.string cimport string +from libcpp.utility cimport move from libcpp.vector cimport vector from cudf._lib.column cimport Column -from cudf._lib.cpp.column.column cimport column_view +from cudf._lib.cpp.column.column cimport column, column_view from cudf._lib.cpp.table.table cimport table_view +from cudf._lib.cpp.types cimport size_type from cudf._lib.table cimport Table try: @@ -192,3 +196,124 @@ def _index_level_name(index_name, level, column_names): return index_name else: return f"__index_level_{level}__" + + +cdef data_from_unique_ptr( + unique_ptr[table] c_tbl, column_names, index_names=None +): + """Convert a libcudf table into a dict with an index. + + This method is intended to provide the bridge between the columns returned + from calls to libcudf APIs and the cuDF Python Table objects, which require + named columns and a separate index. + + Since cuDF Python has an independent representation of a table as a + collection of columns, this function simply returns a dict of columns + suitable for conversion into data to be passed to cuDF constructors. + This method returns the columns of the table in the order they are + stored in libcudf, but calling code is responsible for partitioning and + labeling them as needed. + + Parameters + ---------- + c_tbl : unique_ptr[cudf::table] + The libcudf table whose columns will be extracted + column_names : iterable + The keys associated with the columns in the output data. + index_names : iterable, optional + If provided, an iterable of strings that will be used to label the + corresponding first set of columns into a (Multi)Index. If this + argument is omitted, all columns are assumed to be part of the output + table and no index is constructed. + + + Returns + ------- + tuple(Dict[str, Column], Optional[Index]) + A dict of the columns in the output table. + """ + cdef vector[unique_ptr[column]] c_columns = move(c_tbl.get().release()) + cdef vector[unique_ptr[column]].iterator it = c_columns.begin() + + cdef int i + + columns = [Column.from_unique_ptr(move(dereference(it+i))) + for i in range(c_columns.size())] + + # First construct the index, if any + index = ( + # TODO: For performance, the _from_data methods of Frame types assume + # that the passed index object is already an Index because cudf.Index + # and cudf.as_index are expensive. 
As a result, this function is + # currently somewhat inconsistent in returning a dict of columns for + # the data while actually constructing the Index object here (instead + # of just returning a dict for that as well). As we clean up the + # Frame factories we may want to look for a less dissonant approach + # that does not impose performance penalties. The same applies to + # data_from_table_view below. + cudf.Index._from_data( + { + name: columns[i] + for i, name in enumerate(index_names) + } + ) + if index_names is not None + else None + ) + n_index_columns = len(index_names) if index_names is not None else 0 + data = { + name: columns[i + n_index_columns] + for i, name in enumerate(column_names) + } + return data, index + + +cdef data_from_table_view( + table_view tv, + object owner, + object column_names, + object index_names=None +): + """ + Given a ``cudf::table_view``, constructs a ``cudf.Table`` from it, + along with referencing an ``owner`` Python object that owns the memory + lifetime. If ``owner`` is a ``cudf.Table``, we reach inside of it and + reach inside of each ``cudf.Column`` to make the owner of each newly + created ``Buffer`` underneath the ``cudf.Column`` objects of the + created ``cudf.Table`` the respective ``Buffer`` from the relevant + ``cudf.Column`` of the ``owner`` ``cudf.Table``. + """ + cdef size_type column_idx = 0 + table_owner = isinstance(owner, Table) + + # First construct the index, if any + index = None + if index_names is not None: + index_columns = [] + for _ in index_names: + column_owner = owner + if table_owner: + column_owner = owner._index._columns[column_idx] + index_columns.append( + Column.from_column_view( + tv.column(column_idx), + column_owner + ) + ) + column_idx += 1 + index = cudf.Index._from_data(dict(zip(index_names, index_columns))) + + # Construct the data dict + cdef size_type source_column_idx = 0 + data_columns = [] + for _ in column_names: + column_owner = owner + if table_owner: + column_owner = owner._columns[source_column_idx] + data_columns.append( + Column.from_column_view(tv.column(column_idx), column_owner) + ) + column_idx += 1 + source_column_idx += 1 + + return dict(zip(column_names, data_columns)), index diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py index 01af22f70bf..bf296e11178 100644 --- a/python/cudf/cudf/api/types.py +++ b/python/cudf/cudf/api/types.py @@ -14,9 +14,9 @@ from pandas.api import types as pd_types import cudf -from cudf._lib.scalar import DeviceScalar from cudf.core.dtypes import ( # noqa: F401 _BaseDtype, + dtype, is_categorical_dtype, is_decimal32_dtype, is_decimal64_dtype, @@ -124,7 +124,7 @@ def is_scalar(val): Return True if given object is scalar. 
""" return ( - isinstance(val, DeviceScalar) + isinstance(val, cudf._lib.scalar.DeviceScalar) or isinstance(val, cudf.Scalar) or isinstance(val, cudf.core.tools.datetimes.DateOffset) or pd_types.is_scalar(val) diff --git a/python/cudf/cudf/comm/gpuarrow.py b/python/cudf/cudf/comm/gpuarrow.py index 451572224c6..85b4bf20e5c 100644 --- a/python/cudf/cudf/comm/gpuarrow.py +++ b/python/cudf/cudf/comm/gpuarrow.py @@ -6,10 +6,11 @@ import pandas as pd import pyarrow as pa +from cudf import Series from cudf._lib.gpuarrow import ( CudaRecordBatchStreamReader as _CudaRecordBatchStreamReader, ) -from cudf.core import Series, column +from cudf.core import column from cudf.utils.utils import mask_bitsize, mask_dtype diff --git a/python/cudf/cudf/core/__init__.py b/python/cudf/cudf/core/__init__.py index 5eaa5b52fd4..ec4878b332d 100644 --- a/python/cudf/cudf/core/__init__.py +++ b/python/cudf/cudf/core/__init__.py @@ -1,31 +1 @@ # Copyright (c) 2018-2021, NVIDIA CORPORATION. - -from cudf.core import _internals, buffer, column, column_accessor, common -from cudf.core.buffer import Buffer -from cudf.core.dataframe import DataFrame, from_pandas, merge -from cudf.core.index import ( - BaseIndex, - CategoricalIndex, - DatetimeIndex, - Float32Index, - Float64Index, - GenericIndex, - Index, - Int8Index, - Int16Index, - Int32Index, - Int64Index, - IntervalIndex, - RangeIndex, - TimedeltaIndex, - UInt8Index, - UInt16Index, - UInt32Index, - UInt64Index, - interval_range, -) -from cudf.core.multiindex import MultiIndex -from cudf.core.scalar import NA, Scalar -from cudf.core.series import Series -import cudf.core.udf -from cudf.core.cut import cut diff --git a/python/cudf/cudf/core/_internals/__init__.py b/python/cudf/cudf/core/_internals/__init__.py index 53d186def85..6faeeffdbec 100644 --- a/python/cudf/cudf/core/_internals/__init__.py +++ b/python/cudf/cudf/core/_internals/__init__.py @@ -1,3 +1 @@ # Copyright (c) 2021, NVIDIA CORPORATION. 
- -from cudf.core._internals.where import where diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index 87dc1d8e01f..0688283bc43 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -27,7 +27,9 @@ def _normalize_scalars(col: ColumnBase, other: ScalarLike) -> ScalarLike: f"{type(other).__name__} to {col.dtype.name}" ) - return cudf.Scalar(other, dtype=col.dtype if other is None else None) + return cudf.Scalar( + other, dtype=col.dtype if other in {None, cudf.NA} else None + ) def _check_and_cast_columns_with_other( @@ -234,9 +236,15 @@ def where( if isinstance(frame, DataFrame): if hasattr(cond, "__cuda_array_interface__"): - cond = DataFrame( - cond, columns=frame._column_names, index=frame.index - ) + if isinstance(cond, Series): + cond = DataFrame( + {name: cond for name in frame._column_names}, + index=frame.index, + ) + else: + cond = DataFrame( + cond, columns=frame._column_names, index=frame.index + ) elif ( hasattr(cond, "__array_interface__") and cond.__array_interface__["shape"] != frame.shape @@ -378,6 +386,6 @@ def where( if isinstance(frame, Index): result = Index(result, name=frame.name) else: - result = frame._copy_construct(data=result) + result = frame._from_data({frame.name: result}, frame._index) return frame._mimic_inplace(result, inplace=inplace) diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py index 9f26ac8ee78..50ad592b54f 100644 --- a/python/cudf/cudf/core/algorithms.py +++ b/python/cudf/cudf/core/algorithms.py @@ -2,7 +2,11 @@ from warnings import warn import cupy as cp +import numpy as np +from cudf.core.column import as_column +from cudf.core.frame import Frame +from cudf.core.index import RangeIndex from cudf.core.series import Index, Series @@ -35,7 +39,7 @@ def factorize(values, sort=False, na_sentinel=-1, size_hint=None): See Also -------- - cudf.core.series.Series.factorize : Encode the input values of Series. + cudf.Series.factorize : Encode the input values of Series. """ if sort: @@ -59,3 +63,55 @@ def factorize(values, sort=False, na_sentinel=-1, size_hint=None): values.name = name return labels, cats.values if return_cupy_array else Index(cats) + + +def _linear_interpolation(column, index=None): + """ + Interpolate over a float column. Implicitly assumes that values are + evenly spaced with respect to the x-axis, for example the data + [1.0, NaN, 3.0] will be interpolated assuming the NaN is half way + between the two valid values, yielding [1.0, 2.0, 3.0] + """ + + index = RangeIndex(start=0, stop=len(column), step=1) + return _index_or_values_interpolation(column, index=index) + + +def _index_or_values_interpolation(column, index=None): + """ + Interpolate over a float column. assumes a linear interpolation + strategy using the index of the data to denote spacing of the x + values. 
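The Series-shaped condition added to `where()` above is broadcast once per column; below is a NumPy stand-in for that broadcast (names are illustrative only, not the cuDF implementation).

import numpy as np

frame = {"a": np.array([1, 2, 3]), "b": np.array([10, 20, 30])}
cond = np.array([True, False, True])        # one value per row, Series-like

# Reuse the same row mask for every column, as the DataFrame branch does.
broadcast = {name: cond for name in frame}
result = {name: np.where(broadcast[name], frame[name], -1) for name in frame}
# result == {"a": array([ 1, -1,  3]), "b": array([10, -1, 30])}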
For example the data and index [1.0, NaN, 4.0], [1, 3, 4] + would result in [1.0, 3.0, 4.0] + """ + # figure out where the nans are + mask = cp.isnan(column) + + # trivial cases, all nan or no nans + num_nan = mask.sum() + if num_nan == 0 or num_nan == len(column): + return column + + to_interp = Frame(data={None: column}, index=index) + known_x_and_y = to_interp._apply_boolean_mask(as_column(~mask)) + + known_x = known_x_and_y._index._column.values + known_y = known_x_and_y._data.columns[0].values + + result = cp.interp(to_interp._index.values, known_x, known_y) + + # find the first nan + first_nan_idx = (mask == 0).argmax().item() + result[:first_nan_idx] = np.nan + return result + + +def get_column_interpolator(method): + interpolator = { + "linear": _linear_interpolation, + "index": _index_or_values_interpolation, + "values": _index_or_values_interpolation, + }.get(method, None) + if not interpolator: + raise ValueError(f"Interpolation method `{method}` not found") + return interpolator diff --git a/python/cudf/cudf/core/buffer.py b/python/cudf/cudf/core/buffer.py index c6875052685..0658927975f 100644 --- a/python/cudf/cudf/core/buffer.py +++ b/python/cudf/cudf/core/buffer.py @@ -11,10 +11,28 @@ import rmm from rmm import DeviceBuffer +import cudf from cudf.core.abc import Serializable class Buffer(Serializable): + """ + A Buffer represents a device memory allocation. + + Parameters + ---------- + data : Buffer, array_like, int + An array-like object or integer representing a + device or host pointer to pre-allocated memory. + size : int, optional + Size of memory allocation. Required if a pointer + is passed for `data`. + owner : object, optional + Python object to which the lifetime of the memory + allocation is tied. If provided, a reference to this + object is kept in this Buffer. + """ + ptr: int size: int _owner: Any @@ -22,22 +40,7 @@ class Buffer(Serializable): def __init__( self, data: Any = None, size: Optional[int] = None, owner: Any = None ): - """ - A Buffer represents a device memory allocation. - - Parameters - ---------- - data : Buffer, array_like, int - An array-like object or integer representing a - device or host pointer to pre-allocated memory. - size : int, optional - Size of memory allocation. Required if a pointer - is passed for `data`. - owner : object, optional - Python object to which the lifetime of the memory - allocation is tied. If provided, a reference to this - object is kept in this Buffer. 
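A NumPy sketch of the index-based interpolation helper defined above; it mirrors the mask/`interp`/leading-NaN steps but is not the cuDF implementation itself.

import numpy as np

values = np.array([1.0, np.nan, 4.0])
index = np.array([1.0, 3.0, 4.0])

mask = np.isnan(values)
known_x, known_y = index[~mask], values[~mask]

result = np.interp(index, known_x, known_y)
first_valid = int((~mask).argmax())
result[:first_valid] = np.nan       # leading NaNs are never back-filled
# result == array([1., 3., 4.])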
- """ + if isinstance(data, Buffer): self.ptr = data.ptr self.size = data.size @@ -157,7 +160,7 @@ def _buffer_data_from_array_interface(array_interface): ptr = array_interface["data"][0] if ptr is None: ptr = 0 - itemsize = np.dtype(array_interface["typestr"]).itemsize + itemsize = cudf.dtype(array_interface["typestr"]).itemsize shape = ( array_interface["shape"] if len(array_interface["shape"]) > 0 else (1,) ) @@ -168,7 +171,7 @@ def _buffer_data_from_array_interface(array_interface): def confirm_1d_contiguous(array_interface): strides = array_interface["strides"] shape = array_interface["shape"] - itemsize = np.dtype(array_interface["typestr"]).itemsize + itemsize = cudf.dtype(array_interface["typestr"]).itemsize typestr = array_interface["typestr"] if typestr not in ("|i1", "|u1"): raise TypeError("Buffer data must be of uint8 type") diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 48398e03b2d..7333ae119cd 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -22,7 +22,6 @@ import cudf from cudf import _lib as libcudf -from cudf._lib.scalar import as_device_scalar from cudf._lib.transform import bools_to_mask from cudf._typing import ColumnLike, Dtype, ScalarLike from cudf.core.buffer import Buffer @@ -49,62 +48,63 @@ class CategoricalAccessor(ColumnMethods): + """ + Accessor object for categorical properties of the Series values. + Be aware that assigning to `categories` is a inplace operation, + while all methods return new categorical data per default. + + Parameters + ---------- + column : Column + parent : Series or CategoricalIndex + + Examples + -------- + >>> s = cudf.Series([1,2,3], dtype='category') + >>> s + >>> s + 0 1 + 1 2 + 2 3 + dtype: category + Categories (3, int64): [1, 2, 3] + >>> s.cat.categories + Int64Index([1, 2, 3], dtype='int64') + >>> s.cat.reorder_categories([3,2,1]) + 0 1 + 1 2 + 2 3 + dtype: category + Categories (3, int64): [3, 2, 1] + >>> s.cat.remove_categories([1]) + 0 + 1 2 + 2 3 + dtype: category + Categories (2, int64): [2, 3] + >>> s.cat.set_categories(list('abcde')) + 0 + 1 + 2 + dtype: category + Categories (5, object): ['a', 'b', 'c', 'd', 'e'] + >>> s.cat.as_ordered() + 0 1 + 1 2 + 2 3 + dtype: category + Categories (3, int64): [1 < 2 < 3] + >>> s.cat.as_unordered() + 0 1 + 1 2 + 2 3 + dtype: category + Categories (3, int64): [1, 2, 3] + """ + _column: CategoricalColumn def __init__(self, parent: SeriesOrIndex): - """ - Accessor object for categorical properties of the Series values. - Be aware that assigning to `categories` is a inplace operation, - while all methods return new categorical data per default. 
- - Parameters - ---------- - column : Column - parent : Series or CategoricalIndex - - Examples - -------- - >>> s = cudf.Series([1,2,3], dtype='category') - >>> s - >>> s - 0 1 - 1 2 - 2 3 - dtype: category - Categories (3, int64): [1, 2, 3] - >>> s.cat.categories - Int64Index([1, 2, 3], dtype='int64') - >>> s.cat.reorder_categories([3,2,1]) - 0 1 - 1 2 - 2 3 - dtype: category - Categories (3, int64): [3, 2, 1] - >>> s.cat.remove_categories([1]) - 0 - 1 2 - 2 3 - dtype: category - Categories (2, int64): [2, 3] - >>> s.cat.set_categories(list('abcde')) - 0 - 1 - 2 - dtype: category - Categories (5, object): ['a', 'b', 'c', 'd', 'e'] - >>> s.cat.as_ordered() - 0 1 - 1 2 - 2 3 - dtype: category - Categories (3, int64): [1 < 2 < 3] - >>> s.cat.as_unordered() - 0 1 - 1 2 - 2 3 - dtype: category - Categories (3, int64): [1, 2, 3] - """ if not is_categorical_dtype(parent.dtype): raise AttributeError( "Can only use .cat accessor with a 'category' dtype" @@ -525,50 +525,12 @@ def set_categories( dtype: category Categories (2, int64): [1, 10] """ - ordered = ordered if ordered is not None else self.ordered - new_categories = column.as_column(new_categories) - - if isinstance(new_categories, CategoricalColumn): - new_categories = new_categories.categories - - # when called with rename=True, the pandas behavior is - # to replace the current category values with the new - # categories. - if rename: - # enforce same length - if len(new_categories) != len(self._column.categories): - raise ValueError( - "new_categories must have the same " - "number of items as old categories" - ) - - out_col = column.build_categorical_column( - categories=new_categories, - codes=self._column.base_children[0], - mask=self._column.base_mask, - size=self._column.size, - offset=self._column.offset, - ordered=ordered, - ) - else: - out_col = self._column - if not (type(out_col.categories) is type(new_categories)): - # If both categories are of different Column types, - # return a column full of Nulls. - out_col = _create_empty_categorical_column( - self._column, - CategoricalDtype( - categories=new_categories, ordered=ordered - ), - ) - elif ( - not out_col._categories_equal(new_categories, ordered=ordered) - or not self.ordered == ordered - ): - out_col = out_col._set_categories( - new_categories, ordered=ordered, - ) - return self._return_or_inplace(out_col, inplace=inplace) + return self._return_or_inplace( + self._column.set_categories( + new_categories=new_categories, ordered=ordered, rename=rename + ), + inplace=inplace, + ) def reorder_categories( self, @@ -648,7 +610,19 @@ def reorder_categories( class CategoricalColumn(column.ColumnBase): - """Implements operations for Columns of Categorical type + """ + Implements operations for Columns of Categorical type + + Parameters + ---------- + dtype : CategoricalDtype + mask : Buffer + The validity mask + offset : int + Data offset + children : Tuple[ColumnBase] + Two non-null columns containing the categories and codes + respectively """ dtype: cudf.core.dtypes.CategoricalDtype @@ -664,18 +638,7 @@ def __init__( null_count: int = None, children: Tuple["column.ColumnBase", ...] 
= (), ): - """ - Parameters - ---------- - dtype : CategoricalDtype - mask : Buffer - The validity mask - offset : int - Data offset - children : Tuple[ColumnBase] - Two non-null columns containing the categories and codes - respectively - """ + if size is None: for child in children: assert child.offset == 0 @@ -882,7 +845,9 @@ def _fill( return self if inplace else self.copy() fill_code = self._encode(fill_value) - fill_scalar = as_device_scalar(fill_code, self.codes.dtype) + fill_scalar = cudf._lib.scalar.as_device_scalar( + fill_code, self.codes.dtype + ) result = self if inplace else self.copy() diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index a5e49b026f3..d52f63a79f5 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -53,7 +53,6 @@ is_scalar, is_string_dtype, is_struct_dtype, - pandas_dtype, ) from cudf.core.abc import Serializable from cudf.core.buffer import Buffer @@ -65,11 +64,11 @@ ) from cudf.utils import ioutils, utils from cudf.utils.dtypes import ( - check_cast_unsupported_dtype, cudf_dtype_from_pa_type, get_time_unit, min_unsigned_type, np_to_pa_dtype, + pandas_dtypes_alias_to_cudf_alias, pandas_dtypes_to_cudf_dtypes, ) from cudf.utils.utils import mask_dtype @@ -82,7 +81,7 @@ def as_frame(self) -> "cudf.core.frame.Frame": """ Converts a Column to Frame """ - return cudf.core.frame.Frame({None: self.copy(deep=False)}) + return cudf.core.frame.SingleColumnFrame({None: self.copy(deep=False)}) @property def data_array_view(self) -> "cuda.devicearray.DeviceNDArray": @@ -171,11 +170,31 @@ def equals(self, other: ColumnBase, check_dtypes: bool = False) -> bool: def _null_equals(self, other: ColumnBase) -> ColumnBase: return self.binary_operator("NULL_EQUALS", other) - def all(self) -> bool: - return bool(libcudf.reduce.reduce("all", self, dtype=np.bool_)) + def all(self, skipna: bool = True) -> bool: + # If all entries are null the result is True, including when the column + # is empty. + result_col = self.nans_to_nulls() if skipna else self - def any(self) -> bool: - return bool(libcudf.reduce.reduce("any", self, dtype=np.bool_)) + if result_col.null_count == result_col.size: + return True + + if isinstance(result_col, ColumnBase): + return libcudf.reduce.reduce("all", result_col, dtype=np.bool_) + else: + return result_col + + def any(self, skipna: bool = True) -> bool: + # Early exit for fast cases. 
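The skipna-aware reductions added above give all-null columns a fixed answer; a small usage sketch, assuming cuDF is installed:

import cudf

s = cudf.Series([None, None], dtype="float64")
print(s.all())                         # True: every entry is null
print(s.any())                         # False: no non-null truthy entry
print(cudf.Series([1.0, None]).any())  # True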
+ result_col = self.nans_to_nulls() if skipna else self + if not skipna and result_col.has_nulls: + return True + elif skipna and result_col.null_count == result_col.size: + return False + + if isinstance(result_col, ColumnBase): + return libcudf.reduce.reduce("any", result_col, dtype=np.bool_) + else: + return result_col def __sizeof__(self) -> int: n = 0 @@ -241,7 +260,9 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase: """ if not isinstance(array, (pa.Array, pa.ChunkedArray)): raise TypeError("array should be PyArrow array or chunked array") + data = pa.table([array], [None]) + if isinstance(array.type, pa.DictionaryType): indices_table = pa.table( { @@ -262,10 +283,10 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase: codes = libcudf.interop.from_arrow( indices_table, indices_table.column_names - )._data["None"] + )[0]["None"] categories = libcudf.interop.from_arrow( dictionaries_table, dictionaries_table.column_names - )._data["None"] + )[0]["None"] return build_categorical_column( categories=categories, @@ -283,9 +304,7 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase: elif isinstance(array.type, pa.Decimal128Type): return cudf.core.column.Decimal64Column.from_arrow(array) - result = libcudf.interop.from_arrow(data, data.column_names)._data[ - "None" - ] + result = libcudf.interop.from_arrow(data, data.column_names)[0]["None"] result = result._with_type_metadata( cudf_dtype_from_pa_type(array.type) @@ -373,14 +392,6 @@ def _fill( return self - fill_code = self._encode(fill_value) - fill_scalar = as_device_scalar(fill_code, self.codes.dtype) - - result = self if inplace else self.copy() - - libcudf.filling.fill_in_place(result.codes, begin, end, fill_scalar) - return result - def shift(self, offset: int, fill_value: ScalarLike) -> ColumnBase: return libcudf.copying.shift(self, offset, fill_value) @@ -433,7 +444,7 @@ def view(self, dtype: Dtype) -> ColumnBase: """ - dtype = np.dtype(dtype) + dtype = cudf.dtype(dtype) if dtype.kind in ("o", "u", "s"): raise TypeError( @@ -502,7 +513,10 @@ def slice(self, start: int, stop: int, stride: int = None) -> ColumnBase: else: # Need to create a gather map for given slice with stride gather_map = arange( - start=start, stop=stop, step=stride, dtype=np.dtype(np.int32), + start=start, + stop=stop, + step=stride, + dtype=cudf.dtype(np.int32), ) return self.take(gather_map) @@ -545,7 +559,7 @@ def __setitem__(self, key: Any, value: Any): start=key_start, stop=key_stop, step=key_stride, - dtype=np.dtype(np.int32), + dtype=cudf.dtype(np.int32), ) nelem = len(key) else: @@ -881,12 +895,16 @@ def astype(self, dtype: Dtype, **kwargs) -> ColumnBase: if is_categorical_dtype(dtype): return self.as_categorical_column(dtype, **kwargs) - dtype = pandas_dtypes_to_cudf_dtypes.get(dtype, dtype) + dtype = ( + pandas_dtypes_alias_to_cudf_alias.get(dtype, dtype) + if isinstance(dtype, str) + else pandas_dtypes_to_cudf_dtypes.get(dtype, dtype) + ) if _is_non_decimal_numeric_dtype(dtype): return self.as_numerical_column(dtype, **kwargs) elif is_categorical_dtype(dtype): return self.as_categorical_column(dtype, **kwargs) - elif pandas_dtype(dtype).type in { + elif cudf.dtype(dtype).type in { np.str_, np.object_, str, @@ -908,9 +926,9 @@ def astype(self, dtype: Dtype, **kwargs) -> ColumnBase: return self.as_interval_column(dtype, **kwargs) elif is_decimal_dtype(dtype): return self.as_decimal_column(dtype, **kwargs) - elif np.issubdtype(dtype, np.datetime64): + elif np.issubdtype(cast(Any, dtype), np.datetime64): return self.as_datetime_column(dtype, 
**kwargs) - elif np.issubdtype(dtype, np.timedelta64): + elif np.issubdtype(cast(Any, dtype), np.timedelta64): return self.as_timedelta_column(dtype, **kwargs) else: return self.as_numerical_column(dtype, **kwargs) @@ -948,7 +966,7 @@ def as_categorical_column(self, dtype, **kwargs) -> ColumnBase: cats = cats._column.dropna(drop_nan=False) min_type = min_unsigned_type(len(cats), 8) labels = labels - 1 - if np.dtype(min_type).itemsize < labels.dtype.itemsize: + if cudf.dtype(min_type).itemsize < labels.dtype.itemsize: labels = labels.astype(min_type) return build_categorical_column( @@ -985,9 +1003,7 @@ def as_string_column( def as_decimal_column( self, dtype: Dtype, **kwargs - ) -> Union[ - "cudf.core.column.Decimal32Column", "cudf.core.column.Decimal64Column" - ]: + ) -> Union["cudf.core.column.decimal.DecimalBaseColumn"]: raise NotImplementedError def as_decimal64_column( @@ -1235,57 +1251,6 @@ def _process_for_reduction( ) return result_col - def scatter_to_table( - self, - row_indices: ColumnBase, - column_indices: ColumnBase, - names: List[Any], - nrows: int = None, - ncols: int = None, - ) -> "cudf.core.frame.Frame": - """ - Scatters values from the column into a table. - - Parameters - ---------- - row_indices - A column of the same size as `self` specifying the - row index to scatter each value to - column_indices - A column of the same size as `self` specifying the - column index to scatter each value to - names - The column names of the resulting table - - Returns - ------- - """ - if nrows is None: - nrows = 0 - if len(row_indices) > 0: - nrows = int(row_indices.max() + 1) - - if ncols is None: - ncols = 0 - if len(column_indices) > 0: - ncols = int(column_indices.max() + 1) - - if nrows * ncols == 0: - return cudf.core.frame.Frame({}) - - scatter_map = (column_indices * np.int32(nrows)) + row_indices - target = cudf.core.frame.Frame( - {None: column_empty_like(self, masked=True, newsize=nrows * ncols)} - ) - target._data[None][scatter_map] = self - result_frames = target._split(range(nrows, nrows * ncols, nrows)) - return cudf.core.frame.Frame( - { - name: next(iter(f._columns)) - for name, f in zip(names, result_frames) - } - ) - def _with_type_metadata(self: ColumnBase, dtype: Dtype) -> ColumnBase: """ Copies type metadata from self onto other, returning a new column. @@ -1347,7 +1312,7 @@ def column_empty( ) -> ColumnBase: """Allocate a new column like the given row_count and dtype. """ - dtype = pandas_dtype(dtype) + dtype = cudf.dtype(dtype) children = () # type: Tuple[ColumnBase, ...] 
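`cudf.dtype` replaces `np.dtype` throughout this diff; for NumPy-style names it resolves to the corresponding NumPy dtype. A minimal sketch, assuming cuDF is installed:

import cudf

print(cudf.dtype("int32"))            # int32
print(cudf.dtype("datetime64[ns]"))   # datetime64[ns]
print(cudf.dtype("int32").itemsize)   # 4, usable wherever np.dtype was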
if is_struct_dtype(dtype): @@ -1360,7 +1325,7 @@ def column_empty( data = None children = ( build_column( - data=Buffer.empty(row_count * np.dtype("int32").itemsize), + data=Buffer.empty(row_count * cudf.dtype("int32").itemsize), dtype="int32", ), ) @@ -1369,7 +1334,7 @@ def column_empty( children = ( full(row_count + 1, 0, dtype="int32"), build_column( - data=Buffer.empty(row_count * np.dtype("int8").itemsize), + data=Buffer.empty(row_count * cudf.dtype("int8").itemsize), dtype="int8", ), ) @@ -1412,7 +1377,7 @@ def build_column( offset : int, optional children : tuple, optional """ - dtype = pandas_dtype(dtype) + dtype = cudf.dtype(dtype) if _is_non_decimal_numeric_dtype(dtype): assert data is not None @@ -1768,9 +1733,9 @@ def as_column( elif hasattr(arbitrary, "__cuda_array_interface__"): desc = arbitrary.__cuda_array_interface__ - current_dtype = np.dtype(desc["typestr"]) + current_dtype = cudf.dtype(desc["typestr"]) - arb_dtype = check_cast_unsupported_dtype(current_dtype) + arb_dtype = cudf.dtype(current_dtype) if desc.get("mask", None) is not None: # Extract and remove the mask from arbitrary before @@ -1817,9 +1782,9 @@ def as_column( col = ColumnBase.from_arrow(arbitrary) if isinstance(arbitrary, pa.NullArray): if type(dtype) == str and dtype == "empty": - new_dtype = pandas_dtype(arbitrary.type.to_pandas_dtype()) + new_dtype = cudf.dtype(arbitrary.type.to_pandas_dtype()) else: - new_dtype = pandas_dtype(dtype) + new_dtype = cudf.dtype(dtype) col = col.astype(new_dtype) return col @@ -1836,7 +1801,7 @@ def as_column( elif arbitrary.dtype == np.bool_: data = as_column(cupy.asarray(arbitrary), dtype=arbitrary.dtype) elif arbitrary.dtype.kind in ("f"): - arb_dtype = check_cast_unsupported_dtype(arbitrary.dtype) + arb_dtype = cudf.dtype(arbitrary.dtype) data = as_column( cupy.asarray(arbitrary, dtype=arb_dtype), nan_as_null=nan_as_null, @@ -1874,7 +1839,7 @@ def as_column( ): arbitrary = None if dtype is None: - dtype = np.dtype("float64") + dtype = cudf.dtype("float64") data = as_column( utils.scalar_broadcast_to(arbitrary, length, dtype=dtype) @@ -1889,7 +1854,7 @@ def as_column( # CUDF assumes values are always contiguous desc = arbitrary.__array_interface__ shape = desc["shape"] - arb_dtype = np.dtype(desc["typestr"]) + arb_dtype = cudf.dtype(desc["typestr"]) # CUDF assumes values are always contiguous if len(shape) > 1: raise ValueError("Data must be 1-dimensional") @@ -1913,7 +1878,7 @@ def as_column( arbitrary = np.ascontiguousarray(arbitrary) if dtype is not None: - arbitrary = arbitrary.astype(dtype) + arbitrary = arbitrary.astype(np.dtype(dtype)) if arb_dtype.kind == "M": @@ -1921,7 +1886,7 @@ def as_column( cast_dtype = time_unit in ("D", "W", "M", "Y") if cast_dtype: - arbitrary = arbitrary.astype(np.dtype("datetime64[s]")) + arbitrary = arbitrary.astype(cudf.dtype("datetime64[s]")) buffer = Buffer(arbitrary.view("|u1")) mask = None @@ -1941,7 +1906,7 @@ def as_column( cast_dtype = time_unit in ("D", "W", "M", "Y") if cast_dtype: - arbitrary = arbitrary.astype(np.dtype("timedelta64[s]")) + arbitrary = arbitrary.astype(cudf.dtype("timedelta64[s]")) buffer = Buffer(arbitrary.view("|u1")) mask = None @@ -1980,9 +1945,7 @@ def as_column( if dtype is not None: data = data.astype(dtype) elif arb_dtype.kind in ("f"): - arb_dtype = check_cast_unsupported_dtype( - arb_dtype if dtype is None else dtype - ) + arb_dtype = cudf.dtype(arb_dtype if dtype is None else dtype) data = as_column( cupy.asarray(arbitrary, dtype=arb_dtype), nan_as_null=nan_as_null, @@ -1995,9 +1958,9 @@ def 
as_column( arb_dtype = arbitrary.dtype else: if arbitrary.dtype == pd.StringDtype(): - arb_dtype = np.dtype("O") + arb_dtype = cudf.dtype("O") else: - arb_dtype = check_cast_unsupported_dtype(arbitrary.dtype) + arb_dtype = cudf.dtype(arbitrary.dtype) if arb_dtype != arbitrary.dtype.numpy_dtype: arbitrary = arbitrary.astype(arb_dtype) if ( @@ -2044,6 +2007,29 @@ def as_column( memoryview(arbitrary), dtype=dtype, nan_as_null=nan_as_null ) except TypeError: + if dtype is not None: + # Arrow throws a type error if the input is of + # mixed-precision and cannot fit into the provided + # decimal type properly, see: + # https://github.com/apache/arrow/pull/9948 + # Hence we should let the exception propagate to + # the user. + if isinstance(dtype, cudf.core.dtypes.Decimal64Dtype): + data = pa.array( + arbitrary, + type=pa.decimal128( + precision=dtype.precision, scale=dtype.scale + ), + ) + return cudf.core.column.Decimal64Column.from_arrow(data) + if isinstance(dtype, cudf.core.dtypes.Decimal32Dtype): + data = pa.array( + arbitrary, + type=pa.decimal128( + precision=dtype.precision, scale=dtype.scale + ), + ) + return cudf.core.column.Decimal32Column.from_arrow(data) pa_type = None np_type = None try: @@ -2082,7 +2068,6 @@ def as_column( return cudf.core.column.Decimal32Column.from_arrow( data ) - dtype = pd.api.types.pandas_dtype(dtype) np_type = np.dtype(dtype).type if np_type == np.bool_: pa_type = pa.bool_() @@ -2136,7 +2121,7 @@ def _construct_array( Construct a CuPy or NumPy array from `arbitrary` """ try: - dtype = dtype if dtype is None else np.dtype(dtype) + dtype = dtype if dtype is None else cudf.dtype(dtype) arbitrary = cupy.asarray(arbitrary, dtype=dtype) except (TypeError, ValueError): native_dtype = dtype @@ -2150,7 +2135,7 @@ def _construct_array( arbitrary, dtype=native_dtype if native_dtype is None - else np.dtype(native_dtype), + else cudf.dtype(native_dtype), ) return arbitrary @@ -2159,7 +2144,7 @@ def _data_from_cuda_array_interface_desc(obj) -> Buffer: desc = obj.__cuda_array_interface__ ptr = desc["data"][0] nelem = desc["shape"][0] if len(desc["shape"]) > 0 else 1 - dtype = np.dtype(desc["typestr"]) + dtype = cudf.dtype(desc["typestr"]) data = Buffer(data=ptr, size=nelem * dtype.itemsize, owner=obj) return data @@ -2328,7 +2313,7 @@ def full(size: int, fill_value: ScalarLike, dtype: Dtype = None) -> ColumnBase: def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: """Concatenate a sequence of columns.""" if len(objs) == 0: - dtype = pandas_dtype(None) + dtype = cudf.dtype(None) return column_empty(0, dtype=dtype, masked=True) # If all columns are `NumericalColumn` with different dtypes, diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index f3d1880b290..46ff1990ac2 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -52,6 +52,19 @@ class DatetimeColumn(column.ColumnBase): + """ + A Column implementation for Date-time types. 
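The decimal branch added to `as_column` above lets list input be converted through pyarrow when an explicit decimal dtype is given; a hedged construction example:

import decimal
import cudf

dt = cudf.Decimal64Dtype(precision=7, scale=2)
s = cudf.Series([decimal.Decimal("1.23"), decimal.Decimal("4.50")], dtype=dt)
print(s.dtype)   # Decimal64Dtype(precision=7, scale=2)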
+ + Parameters + ---------- + data : Buffer + The datetime values + dtype : np.dtype + The data type + mask : Buffer; optional + The validity mask + """ + def __init__( self, data: Buffer, @@ -61,17 +74,8 @@ def __init__( offset: int = 0, null_count: int = None, ): - """ - Parameters - ---------- - data : Buffer - The datetime values - dtype : np.dtype - The data type - mask : Buffer; optional - The validity mask - """ - dtype = np.dtype(dtype) + dtype = cudf.dtype(dtype) + if data.size % dtype.itemsize: raise ValueError("Buffer size must be divisible by element size") if size is None: @@ -154,6 +158,15 @@ def to_pandas( index=index, ) + @property + def values(self): + """ + Return a CuPy representation of the DateTimeColumn. + """ + raise NotImplementedError( + "DateTime Arrays is not yet implemented in cudf" + ) + def get_dt_field(self, field: str) -> ColumnBase: return libcudf.datetime.extract_datetime_component(self, field) @@ -236,7 +249,7 @@ def __cuda_array_interface__(self) -> Mapping[builtins.str, Any]: return output def as_datetime_column(self, dtype: Dtype, **kwargs) -> DatetimeColumn: - dtype = np.dtype(dtype) + dtype = cudf.dtype(dtype) if dtype == self.dtype: return self return libcudf.unary.cast(self, dtype=dtype) @@ -264,7 +277,7 @@ def as_string_column( ) if len(self) > 0: return string._datetime_to_str_typecast_functions[ - np.dtype(self.dtype) + cudf.dtype(self.dtype) ](self, format) else: return cast( @@ -316,7 +329,7 @@ def binary_operator( return rhs._datetime_binop(self, op, reflect=reflect) lhs: Union[ScalarLike, ColumnBase] = self if op in ("eq", "ne", "lt", "gt", "le", "ge", "NULL_EQUALS"): - out_dtype = np.dtype(np.bool_) # type: Dtype + out_dtype = cudf.dtype(np.bool_) # type: Dtype elif op == "add" and pd.api.types.is_timedelta64_dtype(rhs.dtype): out_dtype = cudf.core.column.timedelta._timedelta_add_result_dtype( rhs, lhs @@ -389,13 +402,13 @@ def can_cast_safely(self, to_dtype: Dtype) -> bool: to_res, _ = np.datetime_data(to_dtype) self_res, _ = np.datetime_data(self.dtype) - max_int = np.iinfo(np.dtype("int64")).max + max_int = np.iinfo(cudf.dtype("int64")).max max_dist = np.timedelta64( - self.max().astype(np.dtype("int64"), copy=False), self_res + self.max().astype(cudf.dtype("int64"), copy=False), self_res ) min_dist = np.timedelta64( - self.min().astype(np.dtype("int64"), copy=False), self_res + self.min().astype(cudf.dtype("int64"), copy=False), self_res ) self_delta_dtype = np.timedelta64(0, self_res).dtype @@ -408,7 +421,7 @@ def can_cast_safely(self, to_dtype: Dtype) -> bool: return True else: return False - elif to_dtype == np.dtype("int64") or to_dtype == np.dtype("O"): + elif to_dtype == cudf.dtype("int64") or to_dtype == cudf.dtype("O"): # can safely cast to representation, or string return True else: diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index c667799c7c2..47f39eb570d 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -25,7 +25,30 @@ from .numerical_base import NumericalBaseColumn -class Decimal32Column(NumericalBaseColumn): +class DecimalBaseColumn(NumericalBaseColumn): + """Base column for decimal64 and decimal32 columns + """ + + dtype: Union[Decimal32Dtype, Decimal64Dtype] + + def as_decimal_column( + self, dtype: Dtype, **kwargs + ) -> Union["DecimalBaseColumn"]: + if ( + isinstance(dtype, (Decimal64Dtype, Decimal32Dtype)) + and dtype.scale < self.dtype.scale + ): + warn( + "cuDF truncates when downcasting decimals to a lower scale. 
" + "To round, use Series.round() or DataFrame.round()." + ) + + if dtype == self.dtype: + return self + return libcudf.unary.cast(self, dtype) + + +class Decimal32Column(DecimalBaseColumn): dtype: Decimal32Dtype @classmethod @@ -78,7 +101,7 @@ def to_arrow(self): ) -class Decimal64Column(NumericalBaseColumn): +class Decimal64Column(DecimalBaseColumn): dtype: Decimal64Dtype def __truediv__(self, other): @@ -202,24 +225,6 @@ def _decimal_quantile( return result._with_type_metadata(self.dtype) - def as_decimal_column( - self, dtype: Dtype, **kwargs - ) -> Union[ - "cudf.core.column.Decimal32Column", "cudf.core.column.Decimal64Column" - ]: - if ( - isinstance(dtype, Decimal64Dtype) - and dtype.scale < self.dtype.scale - ): - warn( - "cuDF truncates when downcasting decimals to a lower scale. " - "To round, use Series.round() or DataFrame.round()." - ) - - if dtype == self.dtype: - return self - return libcudf.unary.cast(self, dtype) - def as_numerical_column( self, dtype: Dtype, **kwargs ) -> "cudf.core.column.NumericalColumn": diff --git a/python/cudf/cudf/core/column/methods.py b/python/cudf/cudf/core/column/methods.py index 27dc4fe0c0d..a587c58a49d 100644 --- a/python/cudf/cudf/core/column/methods.py +++ b/python/cudf/cudf/core/column/methods.py @@ -63,8 +63,8 @@ def _return_or_inplace( """ if inplace: self._parent._mimic_inplace( - self._parent.__class__._from_table( - cudf._lib.table.Table({self._parent.name: new_col}) + self._parent.__class__._from_data( + {self._parent.name: new_col} ), inplace=True, ) @@ -78,8 +78,8 @@ def _return_or_inplace( table = new_col if isinstance(self._parent, cudf.BaseIndex): - idx = self._parent._constructor_expanddim._from_table( - table=table + idx = self._parent._constructor_expanddim._from_data( + table._data, table._index ) idx.names = None return idx diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 1ac3f1de6a2..bc12b42a3fa 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -36,6 +36,17 @@ class NumericalColumn(NumericalBaseColumn): + """ + A Column object for Numeric types. 
+ + Parameters + ---------- + data : Buffer + dtype : np.dtype + The dtype associated with the data Buffer + mask : Buffer, optional + """ + def __init__( self, data: Buffer, @@ -45,15 +56,8 @@ def __init__( offset: int = 0, null_count: int = None, ): - """ - Parameters - ---------- - data : Buffer - dtype : np.dtype - The dtype associated with the data Buffer - mask : Buffer, optional - """ - dtype = np.dtype(dtype) + dtype = cudf.dtype(dtype) + if data.size % dtype.itemsize: raise ValueError("Buffer size must be divisible by element size") if size is None: @@ -121,14 +125,14 @@ def binary_operator( self, binop: str, rhs: BinaryOperand, reflect: bool = False, ) -> ColumnBase: int_dtypes = [ - np.dtype("int8"), - np.dtype("int16"), - np.dtype("int32"), - np.dtype("int64"), - np.dtype("uint8"), - np.dtype("uint16"), - np.dtype("uint32"), - np.dtype("uint64"), + cudf.dtype("int8"), + cudf.dtype("int16"), + cudf.dtype("int32"), + cudf.dtype("int64"), + cudf.dtype("uint8"), + cudf.dtype("uint16"), + cudf.dtype("uint32"), + cudf.dtype("uint64"), ] if rhs is None: out_dtype = self.dtype @@ -158,7 +162,7 @@ def binary_operator( (np.isscalar(tmp) and (0 == tmp)) or ((isinstance(tmp, NumericalColumn)) and (0.0 in tmp)) ): - out_dtype = np.dtype("float64") + out_dtype = cudf.dtype("float64") if binop in { "l_and", @@ -193,13 +197,13 @@ def normalize_binop_value( if isinstance(other, cudf.Scalar): return other other_dtype = np.promote_types(self.dtype, other_dtype) - if other_dtype == np.dtype("float16"): - other_dtype = np.dtype("float32") + if other_dtype == cudf.dtype("float16"): + other_dtype = cudf.dtype("float32") other = other_dtype.type(other) if self.dtype.kind == "b": other_dtype = min_signed_type(other) if np.isscalar(other): - other = np.dtype(other_dtype).type(other) + other = cudf.dtype(other_dtype).type(other) return other else: ary = utils.scalar_broadcast_to( @@ -212,7 +216,7 @@ def normalize_binop_value( raise TypeError(f"cannot broadcast {type(other)}") def int2ip(self) -> "cudf.core.column.StringColumn": - if self.dtype != np.dtype("int64"): + if self.dtype != cudf.dtype("int64"): raise TypeError("Only int64 type can be converted to ip") return libcudf.string_casting.int2ip(self) @@ -222,7 +226,7 @@ def as_string_column( ) -> "cudf.core.column.StringColumn": if len(self) > 0: return string._numeric_to_str_typecast_functions[ - np.dtype(self.dtype) + cudf.dtype(self.dtype) ](self) else: return cast( @@ -263,7 +267,7 @@ def as_decimal_column( return libcudf.unary.cast(self, dtype) def as_numerical_column(self, dtype: Dtype, **kwargs) -> NumericalColumn: - dtype = np.dtype(dtype) + dtype = cudf.dtype(dtype) if dtype == self.dtype: return self return libcudf.unary.cast(self, dtype) @@ -618,7 +622,7 @@ def _safe_cast_to_int(col: ColumnBase, dtype: DtypeObj) -> ColumnBase: else: raise TypeError( f"Cannot safely cast non-equivalent " - f"{col.dtype.type.__name__} to {np.dtype(dtype).type.__name__}" + f"{col.dtype.type.__name__} to {cudf.dtype(dtype).type.__name__}" ) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 92c57477465..c4b07c41b06 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -53,62 +53,63 @@ def str_to_boolean(column: StringColumn): _str_to_numeric_typecast_functions = { - np.dtype("int8"): str_cast.stoi8, - np.dtype("int16"): str_cast.stoi16, - np.dtype("int32"): str_cast.stoi, - np.dtype("int64"): str_cast.stol, - np.dtype("uint8"): str_cast.stoui8, - np.dtype("uint16"): 
str_cast.stoui16, - np.dtype("uint32"): str_cast.stoui, - np.dtype("uint64"): str_cast.stoul, - np.dtype("float32"): str_cast.stof, - np.dtype("float64"): str_cast.stod, - np.dtype("bool"): str_to_boolean, + cudf.dtype("int8"): str_cast.stoi8, + cudf.dtype("int16"): str_cast.stoi16, + cudf.dtype("int32"): str_cast.stoi, + cudf.dtype("int64"): str_cast.stol, + cudf.dtype("uint8"): str_cast.stoui8, + cudf.dtype("uint16"): str_cast.stoui16, + cudf.dtype("uint32"): str_cast.stoui, + cudf.dtype("uint64"): str_cast.stoul, + cudf.dtype("float32"): str_cast.stof, + cudf.dtype("float64"): str_cast.stod, + cudf.dtype("bool"): str_to_boolean, } _numeric_to_str_typecast_functions = { - np.dtype("int8"): str_cast.i8tos, - np.dtype("int16"): str_cast.i16tos, - np.dtype("int32"): str_cast.itos, - np.dtype("int64"): str_cast.ltos, - np.dtype("uint8"): str_cast.ui8tos, - np.dtype("uint16"): str_cast.ui16tos, - np.dtype("uint32"): str_cast.uitos, - np.dtype("uint64"): str_cast.ultos, - np.dtype("float32"): str_cast.ftos, - np.dtype("float64"): str_cast.dtos, - np.dtype("bool"): str_cast.from_booleans, + cudf.dtype("int8"): str_cast.i8tos, + cudf.dtype("int16"): str_cast.i16tos, + cudf.dtype("int32"): str_cast.itos, + cudf.dtype("int64"): str_cast.ltos, + cudf.dtype("uint8"): str_cast.ui8tos, + cudf.dtype("uint16"): str_cast.ui16tos, + cudf.dtype("uint32"): str_cast.uitos, + cudf.dtype("uint64"): str_cast.ultos, + cudf.dtype("float32"): str_cast.ftos, + cudf.dtype("float64"): str_cast.dtos, + cudf.dtype("bool"): str_cast.from_booleans, } _datetime_to_str_typecast_functions = { # TODO: support Date32 UNIX days - # np.dtype("datetime64[D]"): str_cast.int2timestamp, - np.dtype("datetime64[s]"): str_cast.int2timestamp, - np.dtype("datetime64[ms]"): str_cast.int2timestamp, - np.dtype("datetime64[us]"): str_cast.int2timestamp, - np.dtype("datetime64[ns]"): str_cast.int2timestamp, + # cudf.dtype("datetime64[D]"): str_cast.int2timestamp, + cudf.dtype("datetime64[s]"): str_cast.int2timestamp, + cudf.dtype("datetime64[ms]"): str_cast.int2timestamp, + cudf.dtype("datetime64[us]"): str_cast.int2timestamp, + cudf.dtype("datetime64[ns]"): str_cast.int2timestamp, } _timedelta_to_str_typecast_functions = { - np.dtype("timedelta64[s]"): str_cast.int2timedelta, - np.dtype("timedelta64[ms]"): str_cast.int2timedelta, - np.dtype("timedelta64[us]"): str_cast.int2timedelta, - np.dtype("timedelta64[ns]"): str_cast.int2timedelta, + cudf.dtype("timedelta64[s]"): str_cast.int2timedelta, + cudf.dtype("timedelta64[ms]"): str_cast.int2timedelta, + cudf.dtype("timedelta64[us]"): str_cast.int2timedelta, + cudf.dtype("timedelta64[ns]"): str_cast.int2timedelta, } class StringMethods(ColumnMethods): + """ + Vectorized string functions for Series and Index. + + This mimics pandas ``df.str`` interface. nulls stay null + unless handled otherwise by a particular method. + Patterned after Python’s string methods, with some + inspiration from R’s stringr package. + """ + _column: StringColumn def __init__(self, parent): - """ - Vectorized string functions for Series and Index. - - This mimics pandas ``df.str`` interface. nulls stay null - unless handled otherwise by a particular method. - Patterned after Python’s string methods, with some - inspiration from R’s stringr package. 
- """ value_type = ( parent.dtype.leaf_type if is_list_dtype(parent.dtype) @@ -607,11 +608,12 @@ def extract( if flags != 0: raise NotImplementedError("`flags` parameter is not yet supported") - out = libstrings.extract(self._column, pat) - if out._num_columns == 1 and expand is False: - return self._return_or_inplace(out._columns[0], expand=expand) + data, index = libstrings.extract(self._column, pat) + if len(data) == 1 and expand is False: + data = next(iter(data.values())) else: - return self._return_or_inplace(out, expand=expand) + data = cudf.core.frame.Frame(data, index) + return self._return_or_inplace(data, expand=expand) def contains( self, @@ -749,6 +751,59 @@ def contains( ) return self._return_or_inplace(result_col) + def repeat(self, repeats: Union[int, Sequence],) -> SeriesOrIndex: + """ + Duplicate each string in the Series or Index. + Equivalent to `str.repeat() + `_. + + Parameters + ---------- + repeats : int or sequence of int + Same value for all (int) or different value per (sequence). + + Returns + ------- + Series or Index of object + Series or Index of repeated string objects specified by + input parameter repeats. + + Examples + -------- + >>> s = cudf.Series(['a', 'b', 'c']) + >>> s + 0 a + 1 b + 2 c + dtype: object + + Single int repeats string in Series + + >>> s.str.repeat(repeats=2) + 0 aa + 1 bb + 2 cc + dtype: object + + Sequence of int repeats corresponding string in Series + + >>> s.str.repeat(repeats=[1, 2, 3]) + 0 a + 1 bb + 2 ccc + dtype: object + """ + if can_convert_to_column(repeats): + return self._return_or_inplace( + libstrings.repeat_sequence( + self._column, column.as_column(repeats, dtype="int"), + ), + ) + + return self._return_or_inplace( + libstrings.repeat_scalar(self._column, repeats) + ) + def replace( self, pat: Union[str, Sequence], @@ -2274,12 +2329,13 @@ def split( if self._column.null_count == len(self._column): result_table = cudf.core.frame.Frame({0: self._column.copy()}) else: - result_table = libstrings.split( + data, index = libstrings.split( self._column, cudf.Scalar(pat, "str"), n ) - if len(result_table._data) == 1: - if result_table._data[0].null_count == len(self._column): - result_table = cudf.core.frame.Frame({}) + if len(data) == 1 and data[0].null_count == len(self._column): + result_table = cudf.core.frame.Frame({}) + else: + result_table = cudf.core.frame.Frame(data, index) else: result_table = libstrings.split_record( self._column, cudf.Scalar(pat, "str"), n @@ -2429,12 +2485,13 @@ def rsplit( if self._column.null_count == len(self._column): result_table = cudf.core.frame.Frame({0: self._column.copy()}) else: - result_table = libstrings.rsplit( - self._column, cudf.Scalar(pat), n + data, index = libstrings.rsplit( + self._column, cudf.Scalar(pat, "str"), n ) - if len(result_table._data) == 1: - if result_table._data[0].null_count == len(self._column): - result_table = cudf.core.frame.Frame({}) + if len(data) == 1 and data[0].null_count == len(self._column): + result_table = cudf.core.frame.Frame({}) + else: + result_table = cudf.core.frame.Frame(data, index) else: result_table = libstrings.rsplit_record( self._column, cudf.Scalar(pat), n @@ -2499,7 +2556,7 @@ def partition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: Also available on indices: - >>> idx = cudf.core.index.StringIndex(['X 123', 'Y 999']) + >>> idx = cudf.Index(['X 123', 'Y 999']) >>> idx StringIndex(['X 123' 'Y 999'], dtype='object') @@ -2519,7 +2576,9 @@ def partition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: sep = 
" " return self._return_or_inplace( - libstrings.partition(self._column, cudf.Scalar(sep)), + cudf.core.frame.Frame( + *libstrings.partition(self._column, cudf.Scalar(sep)) + ), expand=expand, ) @@ -2564,7 +2623,7 @@ def rpartition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: Also available on indices: - >>> idx = cudf.core.index.StringIndex(['X 123', 'Y 999']) + >>> idx = cudf.Index(['X 123', 'Y 999']) >>> idx StringIndex(['X 123' 'Y 999'], dtype='object') @@ -2584,7 +2643,9 @@ def rpartition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: sep = " " return self._return_or_inplace( - libstrings.rpartition(self._column, cudf.Scalar(sep)), + cudf.core.frame.Frame( + *libstrings.rpartition(self._column, cudf.Scalar(sep)) + ), expand=expand, ) @@ -3234,7 +3295,7 @@ def count(self, pat: str, flags: int = 0) -> SeriesOrIndex: This is also available on Index. - >>> index = cudf.core.index.StringIndex(['A', 'A', 'Aaba', 'cat']) + >>> index = cudf.Index(['A', 'A', 'Aaba', 'cat']) >>> index.str.count('a') Int64Index([0, 0, 2, 1], dtype='int64') """ # noqa W605 @@ -3309,8 +3370,9 @@ def findall( if flags != 0: raise NotImplementedError("`flags` parameter is not yet supported") + data, index = libstrings.findall(self._column, pat) return self._return_or_inplace( - libstrings.findall(self._column, pat), expand=expand + cudf.core.frame.Frame(data, index), expand=expand ) def isempty(self) -> SeriesOrIndex: @@ -4861,7 +4923,18 @@ def _expected_types_format(types): class StringColumn(column.ColumnBase): - """Implements operations for Columns of String type + """ + Implements operations for Columns of String type + + Parameters + ---------- + mask : Buffer + The validity mask + offset : int + Data offset + children : Tuple[Column] + Two non-null columns containing the string data and offsets + respectively """ _start_offset: Optional[int] @@ -4876,18 +4949,7 @@ def __init__( null_count: int = None, children: Tuple["column.ColumnBase", ...] 
= (), ): - """ - Parameters - ---------- - mask : Buffer - The validity mask - offset : int - Data offset - children : Tuple[Column] - Two non-null columns containing the string data and offsets - respectively - """ - dtype = np.dtype("object") + dtype = cudf.dtype("object") if size is None: for child in children: @@ -5054,7 +5116,7 @@ def __contains__(self, item: ScalarLike) -> bool: def as_numerical_column( self, dtype: Dtype, **kwargs ) -> "cudf.core.column.NumericalColumn": - out_dtype = np.dtype(dtype) + out_dtype = cudf.dtype(dtype) if out_dtype.kind in {"i", "u"}: if not libstrings.is_integer(self).all(): @@ -5096,7 +5158,7 @@ def _as_datetime_or_timedelta_column(self, dtype, format): def as_datetime_column( self, dtype: Dtype, **kwargs ) -> "cudf.core.column.DatetimeColumn": - out_dtype = np.dtype(dtype) + out_dtype = cudf.dtype(dtype) # infer on host from the first not na element # or return all null column if all values @@ -5120,7 +5182,7 @@ def as_datetime_column( def as_timedelta_column( self, dtype: Dtype, **kwargs ) -> "cudf.core.column.TimeDeltaColumn": - out_dtype = np.dtype(dtype) + out_dtype = cudf.dtype(dtype) format = "%D days %H:%M:%S" return self._as_datetime_or_timedelta_column(out_dtype, format) @@ -5232,7 +5294,7 @@ def deserialize(cls, header: dict, frames: list) -> StringColumn: return col def can_cast_safely(self, to_dtype: Dtype) -> bool: - to_dtype = np.dtype(to_dtype) + to_dtype = cudf.dtype(to_dtype) if self.dtype == to_dtype: return True @@ -5379,7 +5441,7 @@ def view(self, dtype) -> "cudf.core.column.ColumnBase": raise ValueError( "Can not produce a view of a string column with nulls" ) - dtype = np.dtype(dtype) + dtype = cudf.dtype(dtype) str_byte_offset = self.base_children[0].element_indexing(self.offset) str_end_byte_offset = self.base_children[0].element_indexing( self.offset + self.size diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index 6988128606e..fd63b4de144 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -191,7 +191,15 @@ def field(self, key): pos = fields.index(key) return self._return_or_inplace(self._column.children[pos]) else: - return self._return_or_inplace(self._column.children[key]) + if isinstance(key, int): + try: + return self._return_or_inplace(self._column.children[key]) + except IndexError: + raise IndexError(f"Index {key} out of range") + else: + raise KeyError( + f"Field '{key}' is not found in the set of existing keys." + ) def explode(self): """ diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index a27c20cc50c..7c1250231f3 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -34,6 +34,24 @@ class TimeDeltaColumn(column.ColumnBase): + """ + Parameters + ---------- + data : Buffer + The Timedelta values + dtype : np.dtype + The data type + size : int + Size of memory allocation. + mask : Buffer; optional + The validity mask + offset : int + Data offset + null_count : int, optional + The number of null values. + If None, it is calculated automatically. + """ + def __init__( self, data: Buffer, @@ -43,24 +61,8 @@ def __init__( offset: int = 0, null_count: int = None, ): - """ - Parameters - ---------- - data : Buffer - The Timedelta values - dtype : np.dtype - The data type - size : int - Size of memory allocation. 
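The stricter key handling added to the struct `field` accessor above distinguishes positional and named lookups; a short example, assuming cuDF is installed:

import cudf

s = cudf.Series([{"a": 1, "b": "x"}, {"a": 2, "b": "y"}])
print(s.struct.field("a"))   # child column selected by name
print(s.struct.field(0))     # same column selected by position
# s.struct.field("c")  raises KeyError
# s.struct.field(9)    raises IndexError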
- mask : Buffer; optional - The validity mask - offset : int - Data offset - null_count : int, optional - The number of null values. - If None, it is calculated automatically. - """ - dtype = np.dtype(dtype) + dtype = cudf.dtype(dtype) + if data.size % dtype.itemsize: raise ValueError("Buffer size must be divisible by element size") if size is None: @@ -90,6 +92,15 @@ def __contains__(self, item: DatetimeLikeScalar) -> bool: return False return item.view("int64") in self.as_numerical + @property + def values(self): + """ + Return a CuPy representation of the TimeDeltaColumn. + """ + raise NotImplementedError( + "TimeDelta Arrays is not yet implemented in cudf" + ) + def to_arrow(self) -> pa.Array: mask = None if self.nullable: @@ -137,7 +148,7 @@ def _binary_op_floordiv( rhs = cudf.Scalar(None, "float64") else: rhs = rhs.astype(common_dtype).astype("float64") - out_dtype = np.dtype("int64") + out_dtype = cudf.dtype("int64") elif rhs.dtype.kind in ("f", "i", "u"): out_dtype = self.dtype else: @@ -204,7 +215,7 @@ def _binary_op_truediv( else: rhs = rhs.astype(common_dtype).astype("float64") - out_dtype = np.dtype("float64") + out_dtype = cudf.dtype("float64") elif rhs.dtype.kind in ("f", "i", "u"): out_dtype = self.dtype else: @@ -344,7 +355,7 @@ def as_string_column( ) if len(self) > 0: return string._timedelta_to_str_typecast_functions[ - np.dtype(self.dtype) + cudf.dtype(self.dtype) ](self, format=format) else: return cast( @@ -353,7 +364,7 @@ def as_string_column( ) def as_timedelta_column(self, dtype: Dtype, **kwargs) -> TimeDeltaColumn: - dtype = np.dtype(dtype) + dtype = cudf.dtype(dtype) if dtype == self.dtype: return self return libcudf.unary.cast(self, dtype=dtype) @@ -575,9 +586,9 @@ def nanoseconds(self) -> "cudf.core.column.NumericalColumn": def determine_out_dtype(lhs_dtype: Dtype, rhs_dtype: Dtype) -> Dtype: - if np.can_cast(np.dtype(lhs_dtype), np.dtype(rhs_dtype)): + if np.can_cast(cudf.dtype(lhs_dtype), cudf.dtype(rhs_dtype)): return rhs_dtype - elif np.can_cast(np.dtype(rhs_dtype), np.dtype(lhs_dtype)): + elif np.can_cast(cudf.dtype(rhs_dtype), cudf.dtype(lhs_dtype)): return lhs_dtype else: raise TypeError(f"Cannot type-cast {lhs_dtype} and {rhs_dtype}") @@ -594,7 +605,7 @@ def _timedelta_add_result_dtype( lhs_unit = units.index(lhs_time_unit) rhs_time_unit = cudf.utils.dtypes.get_time_unit(rhs) rhs_unit = units.index(rhs_time_unit) - out_dtype = np.dtype(f"datetime64[{units[max(lhs_unit, rhs_unit)]}]") + out_dtype = cudf.dtype(f"datetime64[{units[max(lhs_unit, rhs_unit)]}]") else: raise TypeError( f"Addition of {lhs.dtype} with {rhs.dtype} " @@ -619,7 +630,7 @@ def _timedelta_sub_result_dtype( lhs_unit = units.index(lhs_time_unit) rhs_time_unit = cudf.utils.dtypes.get_time_unit(rhs) rhs_unit = units.index(rhs_time_unit) - out_dtype = np.dtype(f"datetime64[{units[max(lhs_unit, rhs_unit)]}]") + out_dtype = cudf.dtype(f"datetime64[{units[max(lhs_unit, rhs_unit)]}]") else: raise TypeError( f"Subtraction of {lhs.dtype} with {rhs.dtype} " diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 607b8ac307b..56882f89af8 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -80,6 +80,19 @@ def _to_flat_dict(d): class ColumnAccessor(MutableMapping): + """ + Parameters + ---------- + data : mapping + Mapping of keys to column values. + multiindex : bool, optional + Whether tuple keys represent a hierarchical + index with multiple "levels" (default=False). 
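The datetime/timedelta result dtypes above pick the finer of the two time units; a sketch of that selection (the `units` ordering here is an assumption):

import numpy as np

units = ["s", "ms", "us", "ns"]   # assumed coarse-to-fine ordering

def datetime_add_result_dtype(lhs_unit, rhs_unit):
    finer = units[max(units.index(lhs_unit), units.index(rhs_unit))]
    return np.dtype(f"datetime64[{finer}]")

print(datetime_add_result_dtype("ms", "ns"))   # datetime64[ns]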
+ level_names : tuple, optional + Tuple containing names for each of the levels. + For a non-hierarchical index, a tuple of size 1 + may be passe. + """ _data: "Dict[Any, ColumnBase]" multiindex: bool @@ -91,19 +104,6 @@ def __init__( multiindex: bool = False, level_names=None, ): - """ - Parameters - ---------- - data : mapping - Mapping of keys to column values. - multiindex : bool, optional - Whether tuple keys represent a hierarchical - index with multiple "levels" (default=False). - level_names : tuple, optional - Tuple containing names for each of the levels. - For a non-hierarchical index, a tuple of size 1 - may be passe. - """ if data is None: data = {} # TODO: we should validate the keys of `data` diff --git a/python/cudf/cudf/core/cut.py b/python/cudf/cudf/core/cut.py index 7811f477170..91f623a3cd3 100644 --- a/python/cudf/cudf/core/cut.py +++ b/python/cudf/cudf/core/cut.py @@ -5,7 +5,6 @@ import pandas as pd import cudf -from cudf._lib.labeling import label_bins from cudf.core.column import as_column, build_categorical_column from cudf.core.index import IntervalIndex, interval_range from cudf.utils.dtypes import is_list_like @@ -240,7 +239,7 @@ def cut( # the input arr must be changed to the same type as the edges input_arr = input_arr.astype(left_edges.dtype) # get the indexes for the appropriate number - index_labels = label_bins( + index_labels = cudf._lib.labeling.label_bins( input_arr, left_edges, left_inclusive, right_edges, right_inclusive ) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 7cd42d749ec..721ebf22de7 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -10,7 +10,7 @@ import warnings from collections import defaultdict from collections.abc import Iterable, Sequence -from typing import Any, Optional, TypeVar +from typing import Any, MutableMapping, Optional, TypeVar import cupy import numpy as np @@ -23,6 +23,7 @@ from pandas.io.formats.printing import pprint_thing import cudf +import cudf.core.common from cudf import _lib as libcudf from cudf.api.types import is_bool_dtype, is_dict_like from cudf.core import column, reshape @@ -31,7 +32,7 @@ from cudf.core.column_accessor import ColumnAccessor from cudf.core.frame import Frame, _drop_rows_by_labels from cudf.core.groupby.groupby import DataFrameGroupBy -from cudf.core.index import BaseIndex, Index, RangeIndex, as_index +from cudf.core.index import BaseIndex, RangeIndex, as_index from cudf.core.indexing import _DataFrameIlocIndexer, _DataFrameLocIndexer from cudf.core.series import Series from cudf.core.window import Rolling @@ -62,6 +63,7 @@ "max": "nanmax", "sum": "nansum", "prod": "nanprod", + "product": "nanprod", "mean": "nanmean", "std": "nanstd", "var": "nanvar", @@ -69,100 +71,101 @@ class DataFrame(Frame, Serializable, GetAttrGetItemMixin): + """ + A GPU Dataframe object. - _PROTECTED_KEYS = frozenset(("_data", "_index")) - - @annotate("DATAFRAME_INIT", color="blue", domain="cudf_python") - def __init__(self, data=None, index=None, columns=None, dtype=None): - """ - A GPU Dataframe object. - - Parameters - ---------- - data : array-like, Iterable, dict, or DataFrame. - Dict can contain Series, arrays, constants, or list-like objects. + Parameters + ---------- + data : array-like, Iterable, dict, or DataFrame. + Dict can contain Series, arrays, constants, or list-like objects. - index : Index or array-like - Index to use for resulting frame. 
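A minimal sketch of the `ColumnAccessor` parameters documented above, using the internal constructor purely for illustration:

from cudf.core.column import as_column
from cudf.core.column_accessor import ColumnAccessor

ca = ColumnAccessor(
    {("a", "x"): as_column([1, 2]), ("a", "y"): as_column([3, 4])},
    multiindex=True,
    level_names=("outer", "inner"),
)
print(list(ca.keys()))   # [('a', 'x'), ('a', 'y')]
print(ca.nrows)          # 2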
Will default to - RangeIndex if no indexing information part of input data and - no index provided. + index : Index or array-like + Index to use for resulting frame. Will default to + RangeIndex if no indexing information part of input data and + no index provided. - columns : Index or array-like - Column labels to use for resulting frame. - Will default to RangeIndex (0, 1, 2, …, n) if no column - labels are provided. + columns : Index or array-like + Column labels to use for resulting frame. + Will default to RangeIndex (0, 1, 2, …, n) if no column + labels are provided. - dtype : dtype, default None - Data type to force. Only a single dtype is allowed. - If None, infer. + dtype : dtype, default None + Data type to force. Only a single dtype is allowed. + If None, infer. - Examples - -------- + Examples + -------- - Build dataframe with ``__setitem__``: + Build dataframe with ``__setitem__``: - >>> import cudf - >>> df = cudf.DataFrame() - >>> df['key'] = [0, 1, 2, 3, 4] - >>> df['val'] = [float(i + 10) for i in range(5)] # insert column - >>> df - key val - 0 0 10.0 - 1 1 11.0 - 2 2 12.0 - 3 3 13.0 - 4 4 14.0 + >>> import cudf + >>> df = cudf.DataFrame() + >>> df['key'] = [0, 1, 2, 3, 4] + >>> df['val'] = [float(i + 10) for i in range(5)] # insert column + >>> df + key val + 0 0 10.0 + 1 1 11.0 + 2 2 12.0 + 3 3 13.0 + 4 4 14.0 + + Build DataFrame via dict of columns: + + >>> import numpy as np + >>> from datetime import datetime, timedelta + >>> t0 = datetime.strptime('2018-10-07 12:00:00', '%Y-%m-%d %H:%M:%S') + >>> n = 5 + >>> df = cudf.DataFrame({ + ... 'id': np.arange(n), + ... 'datetimes': np.array( + ... [(t0+ timedelta(seconds=x)) for x in range(n)]) + ... }) + >>> df + id datetimes + 0 0 2018-10-07T12:00:00.000 + 1 1 2018-10-07T12:00:01.000 + 2 2 2018-10-07T12:00:02.000 + 3 3 2018-10-07T12:00:03.000 + 4 4 2018-10-07T12:00:04.000 + + Build DataFrame via list of rows as tuples: + + >>> df = cudf.DataFrame([ + ... (5, "cats", "jump", np.nan), + ... (2, "dogs", "dig", 7.5), + ... (3, "cows", "moo", -2.1, "occasionally"), + ... ]) + >>> df + 0 1 2 3 4 + 0 5 cats jump + 1 2 dogs dig 7.5 + 2 3 cows moo -2.1 occasionally + + Convert from a Pandas DataFrame: - Build DataFrame via dict of columns: + >>> import pandas as pd + >>> pdf = pd.DataFrame({'a': [0, 1, 2, 3],'b': [0.1, 0.2, None, 0.3]}) + >>> pdf + a b + 0 0 0.1 + 1 1 0.2 + 2 2 NaN + 3 3 0.3 + >>> df = cudf.from_pandas(pdf) + >>> df + a b + 0 0 0.1 + 1 1 0.2 + 2 2 + 3 3 0.3 + """ - >>> import numpy as np - >>> from datetime import datetime, timedelta - >>> t0 = datetime.strptime('2018-10-07 12:00:00', '%Y-%m-%d %H:%M:%S') - >>> n = 5 - >>> df = cudf.DataFrame({ - ... 'id': np.arange(n), - ... 'datetimes': np.array( - ... [(t0+ timedelta(seconds=x)) for x in range(n)]) - ... }) - >>> df - id datetimes - 0 0 2018-10-07T12:00:00.000 - 1 1 2018-10-07T12:00:01.000 - 2 2 2018-10-07T12:00:02.000 - 3 3 2018-10-07T12:00:03.000 - 4 4 2018-10-07T12:00:04.000 - - Build DataFrame via list of rows as tuples: - - >>> df = cudf.DataFrame([ - ... (5, "cats", "jump", np.nan), - ... (2, "dogs", "dig", 7.5), - ... (3, "cows", "moo", -2.1, "occasionally"), - ... 
]) - >>> df - 0 1 2 3 4 - 0 5 cats jump - 1 2 dogs dig 7.5 - 2 3 cows moo -2.1 occasionally + _PROTECTED_KEYS = frozenset(("_data", "_index")) - Convert from a Pandas DataFrame: + @annotate("DATAFRAME_INIT", color="blue", domain="cudf_python") + def __init__(self, data=None, index=None, columns=None, dtype=None): - >>> import pandas as pd - >>> pdf = pd.DataFrame({'a': [0, 1, 2, 3],'b': [0.1, 0.2, None, 0.3]}) - >>> pdf - a b - 0 0 0.1 - 1 1 0.2 - 2 2 NaN - 3 3 0.3 - >>> df = cudf.from_pandas(pdf) - >>> df - a b - 0 0 0.1 - 1 1 0.2 - 2 2 - 3 3 0.3 - """ super().__init__() if isinstance(columns, (Series, cudf.BaseIndex)): @@ -455,30 +458,16 @@ def _init_from_dict_like(self, data, index=None, columns=None): if columns is not None: self.columns = columns - @classmethod - def _from_table(cls, table, index=None): - if index is None: - if table._index is not None: - index = Index._from_table(table._index) - else: - index = RangeIndex(table._num_rows) - out = cls.__new__(cls) - out._data = table._data - out._index = index - return out - @classmethod def _from_data( cls, - data: ColumnAccessor, - index: Optional[Index] = None, + data: MutableMapping, + index: Optional[BaseIndex] = None, columns: Any = None, ) -> DataFrame: - out = cls.__new__(cls) - out._data = data + out = super()._from_data(data, index) if index is None: - index = cudf.Index(range(data.nrows)) - out._index = index + out.index = RangeIndex(out._data.nrows) if columns is not None: out.columns = columns return out @@ -864,17 +853,20 @@ def _slice(self: T, arg: slice) -> T: ) ) else: - result = self._from_table( - libcudf.copying.table_slice( + result = self._from_data( + *libcudf.copying.table_slice( self, [start, stop], keep_index )[0] ) result._copy_type_metadata(self, include_index=keep_index) - # Adding index of type RangeIndex back to - # result - if keep_index is False and self.index is not None: - result.index = self.index[start:stop] + if self.index is not None: + if keep_index: + result._index.names = self.index.names + else: + # Adding index of type RangeIndex back to + # result + result.index = self.index[start:stop] result.columns = self.columns return result @@ -3476,7 +3468,7 @@ def rename( if index: if ( any(type(item) == str for item in index.values()) - and type(self.index) != cudf.core.index.StringIndex + and type(self.index) != cudf.StringIndex ): raise NotImplementedError( "Implicit conversion of index to " @@ -3547,12 +3539,12 @@ def as_gpu_matrix(self, columns=None, order="F"): if ncol < 1: # This is the case for empty dataframe - construct empty cupy array matrix = cupy.empty( - shape=(0, 0), dtype=np.dtype("float64"), order=order + shape=(0, 0), dtype=cudf.dtype("float64"), order=order ) return cuda.as_cuda_array(matrix) if any( - (is_categorical_dtype(c) or np.issubdtype(c, np.dtype("object"))) + (is_categorical_dtype(c) or np.issubdtype(c, cudf.dtype("object"))) for c in cols ): raise TypeError("non-numeric data not yet supported") @@ -3566,7 +3558,7 @@ def as_gpu_matrix(self, columns=None, order="F"): ) cupy_dtype = dtype if np.issubdtype(cupy_dtype, np.datetime64): - cupy_dtype = np.dtype("int64") + cupy_dtype = cudf.dtype("int64") if order not in ("F", "C"): raise ValueError( @@ -3893,9 +3885,9 @@ def sort_values( Examples -------- >>> import cudf - >>> a = ('a', [0, 1, 2]) - >>> b = ('b', [-3, 2, 0]) - >>> df = cudf.DataFrame([a, b]) + >>> df = cudf.DataFrame() + >>> df['a'] = [0, 1, 2] + >>> df['b'] = [-3, 2, 0] >>> df.sort_values('b') a b 0 0 -3 @@ -3904,8 +3896,17 @@ def sort_values( """ if inplace: 
raise NotImplementedError("`inplace` not currently implemented.") - if kind != "quicksort": - raise NotImplementedError("`kind` not currently implemented.") + if kind not in {"quicksort", "mergesort", "heapsort", "stable"}: + raise AttributeError( + f"{kind} is not a valid sorting algorithm for " + f"'DataFrame' object" + ) + elif kind != "quicksort": + msg = ( + f"GPU-accelerated {kind} is currently not supported, " + f"now defaulting to GPU-accelerated quicksort." + ) + warnings.warn(msg) if axis != 0: raise NotImplementedError("`axis` not currently implemented.") @@ -4214,10 +4215,12 @@ def transpose(self): index = self.columns.copy(deep=False) if self._num_columns == 0 or self._num_rows == 0: return DataFrame(index=index, columns=columns) - # Cython renames the columns to the range [0...ncols] - result = self.__class__._from_table(libcudf.transpose.transpose(self)) # Set the old column names as the new index - result._index = as_index(index) + result = self.__class__._from_data( + # Cython renames the columns to the range [0...ncols] + libcudf.transpose.transpose(self), + as_index(index), + ) # Set the old index as the new column names result.columns = columns return result @@ -4458,6 +4461,7 @@ def join( ) return df + @copy_docstring(DataFrameGroupBy) def groupby( self, by=None, @@ -4502,6 +4506,7 @@ def groupby( sort=sort, ) + @copy_docstring(Rolling) def rolling( self, window, min_periods=None, center=False, axis=0, win_type=None ): @@ -5851,7 +5856,7 @@ def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False): ) if data.ndim == 2: - num_cols = len(data[0]) + num_cols = data.shape[1] else: # Since we validate ndim to be either 1 or 2 above, # this case can be assumed to be ndim == 1. @@ -5896,6 +5901,36 @@ def _from_columns(cls, cols, index=None, columns=None): return cls(data=data, index=index,) + def interpolate( + self, + method="linear", + axis=0, + limit=None, + inplace=False, + limit_direction=None, + limit_area=None, + downcast=None, + **kwargs, + ): + + if all(dt == np.dtype("object") for dt in self.dtypes): + raise TypeError( + "Cannot interpolate with all object-dtype " + "columns in the DataFrame. Try setting at " + "least one column to a numeric dtype." + ) + + return super().interpolate( + method=method, + axis=axis, + limit=limit, + inplace=inplace, + limit_direction=limit_direction, + limit_area=limit_area, + downcast=downcast, + **kwargs, + ) + def quantile( self, q=0.5, @@ -6140,12 +6175,12 @@ def isin(self, values): isinstance( self[col]._column, cudf.core.column.CategoricalColumn ) - or np.issubdtype(self[col].dtype, np.dtype("object")) + or np.issubdtype(self[col].dtype, cudf.dtype("object")) ) or ( isinstance( values._column, cudf.core.column.CategoricalColumn ) - or np.issubdtype(values.dtype, np.dtype("object")) + or np.issubdtype(values.dtype, cudf.dtype("object")) ): result[col] = utils.scalar_broadcast_to(False, len(self)) else: @@ -6195,7 +6230,7 @@ def _prepare_for_rowwise_op(self, method, skipna): col.nullable for col in self._columns ): msg = ( - f"Row-wise operations to calculate '{method}' is not " + f"Row-wise operations to calculate '{method}' do not " f"currently support columns with null values. " f"Consider removing them with .dropna() " f"or using .fillna()." 
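The reworked `sort_values` validation above now accepts any of the pandas sorting algorithm names but, because only GPU-accelerated quicksort is currently available, it warns and falls back to quicksort for the other recognised kinds, while unrecognised names are rejected. A rough sketch of the resulting behaviour (illustrative only; the frame and its values are made up and this snippet is not part of the patch):

    import warnings

    import cudf

    df = cudf.DataFrame({"a": [0, 1, 2], "b": [-3, 2, 0]})

    # A recognised but unsupported algorithm warns, then quicksort is used.
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        df.sort_values("b", kind="mergesort")
    assert any("quicksort" in str(w.message) for w in caught)

    # An unrecognised algorithm name is rejected outright.
    try:
        df.sort_values("b", kind="not-a-sort")
    except AttributeError:
        pass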
@@ -6266,472 +6301,56 @@ def count(self, axis=0, level=None, numeric_only=False, **kwargs): Single 5 dtype: int64 """ - if axis not in (0, "index", None): + axis = self._get_axis_from_axis_arg(axis) + if axis != 0: raise NotImplementedError("Only axis=0 is currently supported.") - return self._apply_support_method( - "count", - axis=axis, - level=level, - numeric_only=numeric_only, - **kwargs, - ) - - def min( - self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs, - ): - """ - Return the minimum of the values in the DataFrame. - - Parameters - ---------- - axis: {index (0), columns(1)} - Axis for the function to be applied on. - skipna: bool, default True - Exclude NA/null values when computing the result. - level: int or level name, default None - If the axis is a MultiIndex (hierarchical), count along a - particular level, collapsing into a Series. - numeric_only: bool, default None - Include only float, int, boolean columns. If None, will attempt to - use everything, then use only numeric data. - - Returns - ------- - Series - - Notes - ----- - Parameters currently not supported are `level`, `numeric_only`. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.min() - a 1 - b 7 - dtype: int64 - """ - return self._apply_support_method( - "min", - axis=axis, - skipna=skipna, - level=level, - numeric_only=numeric_only, - **kwargs, - ) - - def max( - self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs, - ): - """ - Return the maximum of the values in the DataFrame. - - Parameters - ---------- - axis: {index (0), columns(1)} - Axis for the function to be applied on. - skipna: bool, default True - Exclude NA/null values when computing the result. - level: int or level name, default None - If the axis is a MultiIndex (hierarchical), count along a - particular level, collapsing into a Series. - numeric_only: bool, default None - Include only float, int, boolean columns. If None, will attempt to - use everything, then use only numeric data. - - Returns - ------- - Series - - Notes - ----- - Parameters currently not supported are `level`, `numeric_only`. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.max() - a 4 - b 10 - dtype: int64 - """ - return self._apply_support_method( - "max", - axis=axis, - skipna=skipna, - level=level, - numeric_only=numeric_only, - **kwargs, - ) - - def sum( - self, - axis=None, - skipna=None, - dtype=None, - level=None, - numeric_only=None, - min_count=0, - **kwargs, - ): - """ - Return sum of the values in the DataFrame. - - Parameters - ---------- - - axis: {index (0), columns(1)} - Axis for the function to be applied on. - skipna: bool, default True - Exclude NA/null values when computing the result. - dtype: data type - Data type to cast the result to. - min_count: int, default 0 - The required number of valid values to perform the operation. - If fewer than min_count non-NA values are present the result - will be NA. - - The default being 0. This means the sum of an all-NA or empty - Series is 0, and the product of an all-NA or empty Series is 1. - - Returns - ------- - Series - - Notes - ----- - Parameters currently not supported are `level`, `numeric_only`. 
- - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.sum() - a 10 - b 34 - dtype: int64 - """ - return self._apply_support_method( - "sum", - axis=axis, - skipna=skipna, - dtype=dtype, - level=level, - numeric_only=numeric_only, - min_count=min_count, - **kwargs, + return Series._from_data( + {None: [self._data[col].valid_count for col in self._data.names]}, + as_index(self._data.names), ) - def product( - self, - axis=None, - skipna=None, - dtype=None, - level=None, - numeric_only=None, - min_count=0, - **kwargs, - ): - """ - Return product of the values in the DataFrame. - - Parameters - ---------- - - axis: {index (0), columns(1)} - Axis for the function to be applied on. - skipna: bool, default True - Exclude NA/null values when computing the result. - dtype: data type - Data type to cast the result to. - min_count: int, default 0 - The required number of valid values to perform the operation. - If fewer than min_count non-NA values are present the result - will be NA. - - The default being 0. This means the sum of an all-NA or empty - Series is 0, and the product of an all-NA or empty Series is 1. - - Returns - ------- - Series - - Notes - ----- - Parameters currently not supported are level`, `numeric_only`. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.product() - a 24 - b 5040 - dtype: int64 - """ - return self._apply_support_method( - "prod", - axis=axis, - skipna=skipna, - dtype=dtype, - level=level, - numeric_only=numeric_only, - min_count=min_count, - **kwargs, - ) + _SUPPORT_AXIS_LOOKUP = { + 0: 0, + 1: 1, + None: 0, + "index": 0, + "columns": 1, + } - def prod( - self, - axis=None, - skipna=None, - dtype=None, - level=None, - numeric_only=None, - min_count=0, - **kwargs, + def _reduce( + self, op, axis=None, level=None, numeric_only=None, **kwargs, ): - """ - Return product of the values in the DataFrame. - - Parameters - ---------- - - axis: {index (0), columns(1)} - Axis for the function to be applied on. - skipna: bool, default True - Exclude NA/null values when computing the result. - dtype: data type - Data type to cast the result to. - min_count: int, default 0 - The required number of valid values to perform the operation. - If fewer than min_count non-NA values are present the result - will be NA. - - The default being 0. This means the sum of an all-NA or empty - Series is 0, and the product of an all-NA or empty Series is 1. - - Returns - ------- - scalar - - Notes - ----- - Parameters currently not supported are `level`, `numeric_only`. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.prod() - a 24 - b 5040 - dtype: int64 - """ - return self.product( - axis=axis, - skipna=skipna, - dtype=dtype, - level=level, - numeric_only=numeric_only, - min_count=min_count, - **kwargs, - ) - - def cummin(self, axis=None, skipna=True, *args, **kwargs): - """ - Return cumulative minimum of the DataFrame. - - Parameters - ---------- - - skipna: bool, default True - Exclude NA/null values. If an entire row/column is NA, - the result will be NA. 
- - Returns - ------- - DataFrame - - Notes - ----- - Parameters currently not supported is `axis` - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.cummin() - a b - 0 1 7 - 1 1 7 - 2 1 7 - 3 1 7 - """ - if axis not in (0, "index", None): - raise NotImplementedError("Only axis=0 is currently supported.") - - return self._apply_support_method( - "cummin", axis=axis, skipna=skipna, *args, **kwargs - ) - - def cummax(self, axis=None, skipna=True, *args, **kwargs): - """ - Return cumulative maximum of the DataFrame. - - Parameters - ---------- - - skipna: bool, default True - Exclude NA/null values. If an entire row/column is NA, - the result will be NA. - - Returns - ------- - DataFrame - - Notes - ----- - Parameters currently not supported is `axis` - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.cummax() - a b - 0 1 7 - 1 2 8 - 2 3 9 - 3 4 10 - """ - if axis not in (0, "index", None): - raise NotImplementedError("Only axis=0 is currently supported.") - - return self._apply_support_method( - "cummax", axis=axis, skipna=skipna, *args, **kwargs - ) - - def cumsum(self, axis=None, skipna=True, *args, **kwargs): - """ - Return cumulative sum of the DataFrame. - - Parameters - ---------- - - skipna: bool, default True - Exclude NA/null values. If an entire row/column is NA, - the result will be NA. - - - Returns - ------- - DataFrame - - Notes - ----- - Parameters currently not supported is `axis` - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> s.cumsum() - a b - 0 1 7 - 1 3 15 - 2 6 24 - 3 10 34 - """ - if axis not in (0, "index", None): - raise NotImplementedError("Only axis=0 is currently supported.") - - return self._apply_support_method( - "cumsum", axis=axis, skipna=skipna, *args, **kwargs - ) - - def cumprod(self, axis=None, skipna=True, *args, **kwargs): - """ - Return cumulative product of the DataFrame. - - Parameters - ---------- - - skipna: bool, default True - Exclude NA/null values. If an entire row/column is NA, - the result will be NA. - - Returns - ------- - DataFrame + if level is not None: + raise NotImplementedError("level parameter is not implemented yet") - Notes - ----- - Parameters currently not supported is `axis` + if numeric_only not in (None, True): + raise NotImplementedError( + "numeric_only parameter is not implemented yet" + ) + axis = self._get_axis_from_axis_arg(axis) - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> s.cumprod() - a b - 0 1 7 - 1 2 56 - 2 6 504 - 3 24 5040 - """ - if axis not in (0, "index", None): - raise NotImplementedError("Only axis=0 is currently supported.") + if axis == 0: + result = [ + getattr(self._data[col], op)(**kwargs) + for col in self._data.names + ] - return self._apply_support_method( - "cumprod", axis=axis, skipna=skipna, *args, **kwargs - ) + return Series._from_data( + {None: result}, as_index(self._data.names) + ) + elif axis == 1: + return self._apply_cupy_method_axis_1(op, **kwargs) - def mean( - self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs + def _scan( + self, op, axis=None, *args, **kwargs, ): - """ - Return the mean of the values for the requested axis. - - Parameters - ---------- - axis : {0 or 'index', 1 or 'columns'} - Axis for the function to be applied on. 
- skipna : bool, default True - Exclude NA/null values when computing the result. - level : int or level name, default None - If the axis is a MultiIndex (hierarchical), count along a - particular level, collapsing into a Series. - numeric_only : bool, default None - Include only float, int, boolean columns. If None, will attempt to - use everything, then use only numeric data. Not implemented for - Series. - **kwargs - Additional keyword arguments to be passed to the function. - - Returns - ------- - mean : Series or DataFrame (if level specified) + axis = self._get_axis_from_axis_arg(axis) - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.mean() - a 2.5 - b 8.5 - dtype: float64 - """ - return self._apply_support_method( - "mean", - axis=axis, - skipna=skipna, - level=level, - numeric_only=numeric_only, - **kwargs, - ) + if axis == 0: + return super()._scan(op, axis=axis, *args, **kwargs) + elif axis == 1: + return self._apply_cupy_method_axis_1(f"cum{op}", **kwargs) def mode(self, axis=0, numeric_only=False, dropna=True): """ @@ -6759,9 +6378,9 @@ def mode(self, axis=0, numeric_only=False, dropna=True): See Also -------- - cudf.core.series.Series.mode : Return the highest frequency value + cudf.Series.mode : Return the highest frequency value in a Series. - cudf.core.series.Series.value_counts : Return the counts of values + cudf.Series.value_counts : Return the counts of values in a Series. Notes @@ -6833,424 +6452,129 @@ def mode(self, axis=0, numeric_only=False, dropna=True): return df - def std( - self, - axis=None, - skipna=None, - level=None, - ddof=1, - numeric_only=None, - **kwargs, - ): - """ - Return sample standard deviation of the DataFrame. - - Normalized by N-1 by default. This can be changed using - the `ddof` argument - - Parameters - ---------- - - axis: {index (0), columns(1)} - Axis for the function to be applied on. - skipna: bool, default True - Exclude NA/null values. If an entire row/column is NA, the result - will be NA. - ddof: int, default 1 - Delta Degrees of Freedom. The divisor used in calculations - is N - ddof, where N represents the number of elements. - - Returns - ------- - Series - - Notes - ----- - Parameters currently not supported are `level` and - `numeric_only` - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.std() - a 1.290994 - b 1.290994 - dtype: float64 - """ - - return self._apply_support_method( - "std", - axis=axis, - skipna=skipna, - level=level, - ddof=ddof, - numeric_only=numeric_only, - **kwargs, - ) - - def var( - self, - axis=None, - skipna=None, - level=None, - ddof=1, - numeric_only=None, - **kwargs, - ): - """ - Return unbiased variance of the DataFrame. - - Normalized by N-1 by default. This can be changed using the - ddof argument - - Parameters - ---------- - - axis: {index (0), columns(1)} - Axis for the function to be applied on. - skipna: bool, default True - Exclude NA/null values. If an entire row/column is NA, the result - will be NA. - ddof: int, default 1 - Delta Degrees of Freedom. The divisor used in calculations is - N - ddof, where N represents the number of elements. 
- - Returns - ------- - scalar - - Notes - ----- - Parameters currently not supported are `level` and - `numeric_only` - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.var() - a 1.666667 - b 1.666667 - dtype: float64 - """ - return self._apply_support_method( - "var", - axis=axis, - skipna=skipna, - level=level, - ddof=ddof, - numeric_only=numeric_only, - **kwargs, - ) - def kurtosis( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs ): - """ - Return Fisher's unbiased kurtosis of a sample. - - Kurtosis obtained using Fisher’s definition of - kurtosis (kurtosis of normal == 0.0). Normalized by N-1. - - Parameters - ---------- - - skipna: bool, default True - Exclude NA/null values when computing the result. - - Returns - ------- - Series - - Notes - ----- - Parameters currently not supported are `axis`, `level` and - `numeric_only` - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.kurt() - a -1.2 - b -1.2 - dtype: float64 - """ - if axis not in (0, "index", None): - raise NotImplementedError("Only axis=0 is currently supported.") - - if numeric_only not in (None, True): - msg = "Kurtosis only supports int, float, and bool dtypes." - raise NotImplementedError(msg) - - filtered = self.select_dtypes(include=[np.number, np.bool_]) - return filtered._apply_support_method( - "kurtosis", - axis=axis, - skipna=skipna, - level=level, - numeric_only=numeric_only, - **kwargs, + obj = self.select_dtypes(include=[np.number, np.bool_]) + return super(DataFrame, obj).kurtosis( + axis, skipna, level, numeric_only, **kwargs ) - # Alias for kurtosis. - kurt = kurtosis - def skew( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs ): - """ - Return unbiased Fisher-Pearson skew of a sample. - - Parameters - ---------- - skipna: bool, default True - Exclude NA/null values when computing the result. - - Returns - ------- - Series - - Notes - ----- - Parameters currently not supported are `axis`, `level` and - `numeric_only` - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [3, 2, 3, 4], 'b': [7, 8, 10, 10]}) - >>> df.skew() - a 0.00000 - b -0.37037 - dtype: float64 - """ - if axis not in (0, "index", None): - raise NotImplementedError("Only axis=0 is currently supported.") - - if numeric_only not in (None, True): - msg = "Skew only supports int, float, and bool dtypes." - raise NotImplementedError(msg) - - filtered = self.select_dtypes(include=[np.number, np.bool_]) - return filtered._apply_support_method( - "skew", - axis=axis, - skipna=skipna, - level=level, - numeric_only=numeric_only, - **kwargs, + obj = self.select_dtypes(include=[np.number, np.bool_]) + return super(DataFrame, obj).skew( + axis, skipna, level, numeric_only, **kwargs ) def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): - """ - Return whether all elements are True in DataFrame. - - Parameters - ---------- - - skipna: bool, default True - Exclude NA/null values. If the entire row/column is NA and - skipna is True, then the result will be True, as for an - empty row/column. - If skipna is False, then NA are treated as True, because - these are not equal to zero. - - Returns - ------- - Series - - Notes - ----- - Parameters currently not supported are `axis`, `bool_only`, `level`. 
- - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [3, 2, 3, 4], 'b': [7, 0, 10, 10]}) - >>> df.all() - a True - b False - dtype: bool - """ - if bool_only: - return self.select_dtypes(include="bool")._apply_support_method( - "all", - axis=axis, - bool_only=bool_only, - skipna=skipna, - level=level, - **kwargs, - ) - return self._apply_support_method( - "all", - axis=axis, - bool_only=bool_only, - skipna=skipna, - level=level, - **kwargs, - ) + obj = self.select_dtypes(include="bool") if bool_only else self + return super(DataFrame, obj).all(axis, skipna, level, **kwargs) def any(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): - """ - Return whether any elements is True in DataFrame. - - Parameters - ---------- - - skipna: bool, default True - Exclude NA/null values. If the entire row/column is NA and - skipna is True, then the result will be False, as for an - empty row/column. - If skipna is False, then NA are treated as True, because - these are not equal to zero. - - Returns - ------- - Series - - Notes - ----- - Parameters currently not supported are `axis`, `bool_only`, `level`. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [3, 2, 3, 4], 'b': [7, 0, 10, 10]}) - >>> df.any() - a True - b True - dtype: bool - """ - if bool_only: - return self.select_dtypes(include="bool")._apply_support_method( - "any", - axis=axis, - bool_only=bool_only, - skipna=skipna, - level=level, - **kwargs, + obj = self.select_dtypes(include="bool") if bool_only else self + return super(DataFrame, obj).any(axis, skipna, level, **kwargs) + + def _apply_cupy_method_axis_1(self, method, *args, **kwargs): + # This method uses cupy to perform scans and reductions along rows of a + # DataFrame. Since cuDF is designed around columnar storage and + # operations, we convert DataFrames to 2D cupy arrays for these ops. + + # for dask metadata compatibility + skipna = kwargs.pop("skipna", None) + skipna = True if skipna is None else skipna + if method not in _cupy_nan_methods_map and skipna not in ( + None, + True, + 1, + ): + raise NotImplementedError( + f"Row-wise operations to calculate '{method}'" + f" currently do not support `skipna=False`." ) - return self._apply_support_method( - "any", - axis=axis, - bool_only=bool_only, - skipna=skipna, - level=level, - **kwargs, - ) - - def _apply_support_method(self, method, axis=0, *args, **kwargs): - assert axis in (None, 0, 1) - - if axis in (None, 0): - result = [ - getattr(self[col], method)(*args, **kwargs) - for col in self._data.names - ] - - if isinstance(result[0], Series): - support_result = result - result = DataFrame(index=support_result[0].index) - for idx, col in enumerate(self._data.names): - result[col] = support_result[idx] - else: - result = Series(result) - result = result.set_index(self._data.names) - return result - elif axis == 1: - # for dask metadata compatibility - skipna = kwargs.pop("skipna", None) - if method not in _cupy_nan_methods_map and skipna not in ( - None, - True, - 1, - ): - raise NotImplementedError( - f"Row-wise operation to calculate '{method}'" - f" currently do not support `skipna=False`." - ) + level = kwargs.pop("level", None) + if level not in (None,): + raise NotImplementedError( + "Row-wise operations currently do not support `level`." + ) - level = kwargs.pop("level", None) - if level not in (None,): - raise NotImplementedError( - "Row-wise operations currently do not support `level`." 
- ) + numeric_only = kwargs.pop("numeric_only", None) + if numeric_only not in (None, True): + raise NotImplementedError( + "Row-wise operations currently do not " + "support `numeric_only=False`." + ) - numeric_only = kwargs.pop("numeric_only", None) - if numeric_only not in (None, True): - raise NotImplementedError( - "Row-wise operations currently do not " - "support `numeric_only=False`." - ) + min_count = kwargs.pop("min_count", None) + if min_count not in (None, 0): + raise NotImplementedError( + "Row-wise operations currently do not support `min_count`." + ) - min_count = kwargs.pop("min_count", None) - if min_count not in (None, 0): - raise NotImplementedError( - "Row-wise operations currently do not " - "support `min_count`." - ) + bool_only = kwargs.pop("bool_only", None) + if bool_only not in (None, True): + raise NotImplementedError( + "Row-wise operations currently do not support `bool_only`." + ) - bool_only = kwargs.pop("bool_only", None) - if bool_only not in (None, True): - raise NotImplementedError( - "Row-wise operations currently do not " - "support `bool_only`." - ) + # This parameter is only necessary for axis 0 reductions that cuDF + # performs internally. cupy already upcasts smaller integer/bool types + # to int64 when accumulating. + kwargs.pop("cast_to_int", None) - prepared, mask, common_dtype = self._prepare_for_rowwise_op( - method, skipna - ) - for col in prepared._data.names: - if prepared._data[col].nullable: - prepared._data[col] = ( - prepared._data[col] - .astype( - cudf.utils.dtypes.get_min_float_dtype( - prepared._data[col] - ) - if not is_datetime_dtype(common_dtype) - else np.dtype("float64") + prepared, mask, common_dtype = self._prepare_for_rowwise_op( + method, skipna + ) + for col in prepared._data.names: + if prepared._data[col].nullable: + prepared._data[col] = ( + prepared._data[col] + .astype( + cudf.utils.dtypes.get_min_float_dtype( + prepared._data[col] ) - .fillna(np.nan) + if not is_datetime_dtype(common_dtype) + else cudf.dtype("float64") ) - arr = cupy.asarray(prepared.as_gpu_matrix()) - - if skipna is not False and method in _cupy_nan_methods_map: - method = _cupy_nan_methods_map[method] - - result = getattr(cupy, method)(arr, axis=1, **kwargs) - - if result.ndim == 1: - type_coerced_methods = { - "count", - "min", - "max", - "sum", - "prod", - "cummin", - "cummax", - "cumsum", - "cumprod", - } - result_dtype = ( - common_dtype - if method in type_coerced_methods - or is_datetime_dtype(common_dtype) - else None + .fillna(np.nan) ) - result = column.as_column(result, dtype=result_dtype) - if mask is not None: - result = result.set_mask( - cudf._lib.transform.bools_to_mask(mask._column) - ) - return Series(result, index=self.index, dtype=result_dtype,) - else: - result_df = DataFrame(result).set_index(self.index) - result_df.columns = prepared.columns - return result_df + arr = cupy.asarray(prepared.as_gpu_matrix()) + + if skipna is not False and method in _cupy_nan_methods_map: + method = _cupy_nan_methods_map[method] + + result = getattr(cupy, method)(arr, axis=1, **kwargs) + + if result.ndim == 1: + type_coerced_methods = { + "count", + "min", + "max", + "sum", + "prod", + "cummin", + "cummax", + "cumsum", + "cumprod", + } + result_dtype = ( + common_dtype + if method in type_coerced_methods + or is_datetime_dtype(common_dtype) + else None + ) + result = column.as_column(result, dtype=result_dtype) + if mask is not None: + result = result.set_mask( + cudf._lib.transform.bools_to_mask(mask._column) + ) + return Series(result, 
index=self.index, dtype=result_dtype,) + else: + result_df = DataFrame(result).set_index(self.index) + result_df.columns = prepared.columns + return result_df def _columns_view(self, columns): """ @@ -8084,7 +7408,7 @@ def _get_union_of_indices(indexes): if len(indexes) == 1: return indexes[0] else: - merged_index = cudf.core.Index._concat(indexes) + merged_index = cudf.Index._concat(indexes) merged_index = merged_index.drop_duplicates() _, inds = merged_index._values.sort_by_values() return merged_index.take(inds) diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 6dbe55d0bb8..ead0b6453c1 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -21,20 +21,71 @@ from cudf.core.buffer import Buffer +def dtype(arbitrary): + """ + Return the cuDF-supported dtype corresponding to `arbitrary`. + + Parameters + ---------- + arbitrary: dtype or scalar-like + + Returns + ------- + dtype: the cuDF-supported dtype that best matches `arbitrary` + """ + # first, try interpreting arbitrary as a NumPy dtype that we support: + try: + np_dtype = np.dtype(arbitrary) + if np_dtype.name == "float16": + return np.dtype("float32") + elif np_dtype.name == "float128": + raise NotImplementedError() + elif np_dtype.kind in ("OU"): + return np.dtype("object") + except TypeError: + pass + else: + if np_dtype.kind not in "biufUOMm": + raise TypeError(f"Unsupported type {np_dtype}") + return np_dtype + + # next, check if `arbitrary` is one of our extension types: + if isinstance(arbitrary, cudf.core.dtypes._BaseDtype): + return arbitrary + + # use `pandas_dtype` to try and interpret + # `arbitrary` as a Pandas extension type. + # Return the corresponding NumPy/cuDF type. + pd_dtype = pd.api.types.pandas_dtype(arbitrary) + try: + return dtype(pd_dtype.numpy_dtype) + except AttributeError: + if isinstance(pd_dtype, pd.CategoricalDtype): + return cudf.CategoricalDtype.from_pandas(pd_dtype) + elif isinstance(pd_dtype, pd.StringDtype): + return np.dtype("object") + elif isinstance(pd_dtype, pd.IntervalDtype): + return cudf.IntervalDtype.from_pandas(pd_dtype) + else: + raise TypeError( + f"Cannot interpret {arbitrary} as a valid cuDF dtype" + ) + + class _BaseDtype(ExtensionDtype, Serializable): # Base type for all cudf-specific dtypes pass class CategoricalDtype(_BaseDtype): + """ + dtype similar to pd.CategoricalDtype with the categories + stored on the GPU. + """ ordered: Optional[bool] def __init__(self, categories=None, ordered: bool = None) -> None: - """ - dtype similar to pd.CategoricalDtype with the categories - stored on the GPU. 
- """ self._categories = self._init_categories(categories) self.ordered = ordered @@ -157,7 +208,7 @@ def element_type(self) -> Dtype: elif isinstance(self._typ.value_type, pa.StructType): return StructDtype.from_arrow(self._typ.value_type) else: - return np.dtype(self._typ.value_type.to_pandas_dtype()).name + return cudf.dtype(self._typ.value_type.to_pandas_dtype()).name @property def leaf_type(self): @@ -223,14 +274,14 @@ def deserialize(cls, header: dict, frames: list): class StructDtype(_BaseDtype): + """ + fields : dict + A mapping of field names to dtypes + """ name = "struct" def __init__(self, fields): - """ - fields : dict - A mapping of field names to dtypes - """ pa_fields = { k: cudf.utils.dtypes.cudf_dtype_to_pa_type(v) for k, v in fields.items() @@ -309,34 +360,34 @@ def deserialize(cls, header: dict, frames: list): class Decimal32Dtype(_BaseDtype): + """ + Parameters + ---------- + precision : int + The total number of digits in each value of this dtype + scale : int, optional + The scale of the Decimal32Dtype. See Notes below. + + Notes + ----- + When the scale is positive: + - numbers with fractional parts (e.g., 0.0042) can be represented + - the scale is the total number of digits to the right of the + decimal point + When the scale is negative: + - only multiples of powers of 10 (including 10**0) can be + represented (e.g., 1729, 4200, 1000000) + - the scale represents the number of trailing zeros in the value. + For example, 42 is representable with precision=2 and scale=0. + 13.0051 is representable with precision=6 and scale=4, + and *not* representable with precision<6 or scale<4. + """ name = "decimal32" _metadata = ("precision", "scale") MAX_PRECISION = np.floor(np.log10(np.iinfo("int32").max)) def __init__(self, precision, scale=0): - """ - Parameters - ---------- - precision : int - The total number of digits in each value of this dtype - scale : int, optional - The scale of the Decimal32Dtype. See Notes below. - - Notes - ----- - When the scale is positive: - - numbers with fractional parts (e.g., 0.0042) can be represented - - the scale is the total number of digits to the right of the - decimal point - When the scale is negative: - - only multiples of powers of 10 (including 10**0) can be - represented (e.g., 1729, 4200, 1000000) - - the scale represents the number of trailing zeros in the value. - For example, 42 is representable with precision=2 and scale=0. - 13.0051 is representable with precision=6 and scale=4, - and *not* representable with precision<6 or scale<4. - """ self._validate(precision, scale) self._typ = pa.decimal128(precision, scale) @@ -417,34 +468,34 @@ def deserialize(cls, header: dict, frames: list): class Decimal64Dtype(_BaseDtype): + """ + Parameters + ---------- + precision : int + The total number of digits in each value of this dtype + scale : int, optional + The scale of the Decimal64Dtype. See Notes below. + + Notes + ----- + When the scale is positive: + - numbers with fractional parts (e.g., 0.0042) can be represented + - the scale is the total number of digits to the right of the + decimal point + When the scale is negative: + - only multiples of powers of 10 (including 10**0) can be + represented (e.g., 1729, 4200, 1000000) + - the scale represents the number of trailing zeros in the value. + For example, 42 is representable with precision=2 and scale=0. + 13.0051 is representable with precision=6 and scale=4, + and *not* representable with precision<6 or scale<4. 
+ """ name = "decimal64" _metadata = ("precision", "scale") MAX_PRECISION = np.floor(np.log10(np.iinfo("int64").max)) def __init__(self, precision, scale=0): - """ - Parameters - ---------- - precision : int - The total number of digits in each value of this dtype - scale : int, optional - The scale of the Decimal64Dtype. See Notes below. - - Notes - ----- - When the scale is positive: - - numbers with fractional parts (e.g., 0.0042) can be represented - - the scale is the total number of digits to the right of the - decimal point - When the scale is negative: - - only multiples of powers of 10 (including 10**0) can be - represented (e.g., 1729, 4200, 1000000) - - the scale represents the number of trailing zeros in the value. - For example, 42 is representable with precision=2 and scale=0. - 13.0051 is representable with precision=6 and scale=4, - and *not* representable with precision<6 or scale<4. - """ self._validate(precision, scale) self._typ = pa.decimal128(precision, scale) @@ -525,16 +576,17 @@ def deserialize(cls, header: dict, frames: list): class IntervalDtype(StructDtype): + """ + subtype: str, np.dtype + The dtype of the Interval bounds. + closed: {‘right’, ‘left’, ‘both’, ‘neither’}, default ‘right’ + Whether the interval is closed on the left-side, right-side, + both or neither. See the Notes for more detailed explanation. + """ + name = "interval" def __init__(self, subtype, closed="right"): - """ - subtype: str, np.dtype - The dtype of the Interval bounds. - closed: {‘right’, ‘left’, ‘both’, ‘neither’}, default ‘right’ - Whether the interval is closed on the left-side, right-side, - both or neither. See the Notes for more detailed explanation. - """ super().__init__(fields={"left": subtype, "right": subtype}) if closed in ["left", "right", "neither", "both"]: @@ -559,6 +611,12 @@ def to_arrow(self): pa.from_numpy_dtype(self.subtype), self.closed ) + @classmethod + def from_pandas(cls, pd_dtype: pd.IntervalDtype) -> "IntervalDtype": + return cls( + subtype=pd_dtype.subtype + ) # TODO: needs `closed` when we upgrade Pandas + def is_categorical_dtype(obj): """Check whether an array-like or dtype is of the Categorical dtype. 
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 6ecb0bcc139..9f743cd8c85 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -6,7 +6,7 @@ import functools import warnings from collections import abc -from typing import Any, Dict, Optional, Tuple, TypeVar, Union +from typing import Any, Dict, MutableMapping, Optional, Tuple, TypeVar, Union import cupy import numpy as np @@ -27,6 +27,7 @@ ) from cudf.core.column_accessor import ColumnAccessor from cudf.core.join import merge +from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import ( _is_non_decimal_numeric_dtype, _is_scalar_or_zero_d_array, @@ -64,8 +65,14 @@ def __init_subclass__(cls): cls._accessors = set() @classmethod - def _from_table(cls, table: Frame): - return cls(table._data, index=table._index) + def _from_data( + cls, + data: MutableMapping, + index: Optional[cudf.core.index.BaseIndex] = None, + ): + obj = cls.__new__(cls) + libcudf.table.Table.__init__(obj, data, index) + return obj def _mimic_inplace( self: T, result: Frame, inplace: bool = False @@ -476,8 +483,8 @@ def _concat( ) # Concatenate the Tables - out = cls._from_table( - libcudf.concat.concat_tables(tables, ignore_index=ignore_index) + out = cls._from_data( + *libcudf.concat.concat_tables(tables, ignore_index) ) # If ignore_index is True, all input frames are empty, and at @@ -612,10 +619,11 @@ def _explode(self, explode_column: Any, ignore_index: bool): if not ignore_index and self._index is not None: explode_column_num += self._index.nlevels - res_tbl = libcudf.lists.explode_outer( - self, explode_column_num, ignore_index + res = self.__class__._from_data( # type: ignore + *libcudf.lists.explode_outer( + self, explode_column_num, ignore_index + ) ) - res = self.__class__._from_table(res_tbl) res._data.multiindex = self._data.multiindex res._data._level_names = self._data._level_names @@ -644,14 +652,15 @@ def _get_columns_by_index(self, indices): def _gather(self, gather_map, keep_index=True, nullify=False): if not is_integer_dtype(gather_map.dtype): gather_map = gather_map.astype("int32") - result = self.__class__._from_table( - libcudf.copying.gather( + result = self.__class__._from_data( + *libcudf.copying.gather( self, as_column(gather_map), keep_index=keep_index, nullify=nullify, ) ) + result._copy_type_metadata(self, include_index=keep_index) if keep_index and self._index is not None: result._index.names = self._index.names @@ -663,10 +672,10 @@ def _hash(self, initial_hash_values=None): def _hash_partition( self, columns_to_hash, num_partitions, keep_index=True ): - output, offsets = libcudf.hash.hash_partition( + output_data, output_index, offsets = libcudf.hash.hash_partition( self, columns_to_hash, num_partitions, keep_index ) - output = self.__class__._from_table(output) + output = self.__class__._from_data(output_data, output_index) output._copy_type_metadata(self, include_index=keep_index) return output, offsets @@ -684,14 +693,16 @@ def _as_column(self): return self._data[None].copy(deep=False) def _scatter(self, key, value): - result = self._from_table(libcudf.copying.scatter(value, key, self)) + result = self.__class__._from_data( + *libcudf.copying.scatter(value, key, self) + ) result._copy_type_metadata(self) return result def _empty_like(self, keep_index=True): - result = self._from_table( - libcudf.copying.table_empty_like(self, keep_index) + result = self.__class__._from_data( + *libcudf.copying.table_empty_like(self, keep_index) ) 
result._copy_type_metadata(self, include_index=keep_index) @@ -876,8 +887,9 @@ def where(self, cond, other=None, inplace=False): 4 dtype: int64 """ + import cudf.core._internals.where - return cudf.core._internals.where( + return cudf.core._internals.where.where( frame=self, cond=cond, other=other, inplace=inplace ) @@ -944,10 +956,10 @@ def mask(self, cond, other=None, inplace=False): def _partition(self, scatter_map, npartitions, keep_index=True): - output_table, output_offsets = libcudf.partitioning.partition( + data, index, output_offsets = libcudf.partitioning.partition( self, scatter_map, npartitions, keep_index ) - partitioned = self.__class__._from_table(output_table) + partitioned = self.__class__._from_data(data, index) # due to the split limitation mentioned # here: https://github.com/rapidsai/cudf/issues/4607 @@ -1108,19 +1120,19 @@ def dropna( See also -------- - cudf.core.dataframe.DataFrame.isna + cudf.DataFrame.isna Indicate null values. - cudf.core.dataframe.DataFrame.notna + cudf.DataFrame.notna Indicate non-null values. - cudf.core.dataframe.DataFrame.fillna + cudf.DataFrame.fillna Replace null values. - cudf.core.series.Series.dropna + cudf.Series.dropna Drop null values. - cudf.core.index.Index.dropna + cudf.Index.dropna Drop null indices. Examples @@ -1332,7 +1344,7 @@ def fillna( ) or method is not None if should_fill: copy_data[name] = copy_data[name].fillna(value[name], method) - result = self._from_table(Frame(copy_data, self._index)) + result = self._from_data(copy_data, self._index) return self._mimic_inplace(result, inplace=inplace) @@ -1381,8 +1393,8 @@ def _drop_na_rows( else: frame._data[name] = col - result = frame.__class__._from_table( - libcudf.stream_compaction.drop_nulls( + result = self.__class__._from_data( + *libcudf.stream_compaction.drop_nulls( frame, how=how, keys=subset, thresh=thresh ) ) @@ -1427,14 +1439,83 @@ def _apply_boolean_mask(self, boolean_mask): """ boolean_mask = as_column(boolean_mask) - result = self.__class__._from_table( - libcudf.stream_compaction.apply_boolean_mask( + result = self.__class__._from_data( + *libcudf.stream_compaction.apply_boolean_mask( self, as_column(boolean_mask) ) ) result._copy_type_metadata(self) return result + def interpolate( + self, + method="linear", + axis=0, + limit=None, + inplace=False, + limit_direction=None, + limit_area=None, + downcast=None, + **kwargs, + ): + """ + Interpolate data values between some points. + + Parameters + ---------- + method : str, default 'linear' + Interpolation technique to use. Currently, + only 'linear` is supported. + * 'linear': Ignore the index and treat the values as + equally spaced. This is the only method supported on MultiIndexes. + * 'index', 'values': linearly interpolate using the index as + an x-axis. Unsorted indices can lead to erroneous results. + axis : int, default 0 + Axis to interpolate along. Currently, + only 'axis=0' is supported. + inplace : bool, default False + Update the data in place if possible. 
+ + Returns + ------- + Series or DataFrame + Returns the same object type as the caller, interpolated at + some or all ``NaN`` values + + """ + + if method in {"pad", "ffill"} and limit_direction != "forward": + raise ValueError( + f"`limit_direction` must be 'forward' for method `{method}`" + ) + if method in {"backfill", "bfill"} and limit_direction != "backward": + raise ValueError( + f"`limit_direction` must be 'backward' for method `{method}`" + ) + + data = self + + if not isinstance(data._index, cudf.RangeIndex): + perm_sort = data._index.argsort() + data = data._gather(perm_sort) + + interpolator = cudf.core.algorithms.get_column_interpolator(method) + columns = {} + for colname, col in data._data.items(): + if col.nullable: + col = col.astype("float64").fillna(np.nan) + + # Interpolation methods may or may not need the index + columns[colname] = interpolator(col, index=data._index) + + result = self._from_data(columns, index=data._index) + + return ( + result + if isinstance(data._index, cudf.RangeIndex) + else result._gather(perm_sort.argsort()) + ) + def _quantiles( self, q, @@ -1453,8 +1534,8 @@ def _quantiles( libcudf.types.NullOrder[key] for key in null_precedence ] - result = self.__class__._from_table( - libcudf.quantiles.quantiles( + result = self.__class__._from_data( + *libcudf.quantiles.quantiles( self, q, interpolation, @@ -1548,11 +1629,11 @@ def rank( if source.empty: return source.astype("float64") - out_rank_table = libcudf.sort.rank_columns( + data, index = libcudf.sort.rank_columns( source, method_enum, na_option, ascending, pct ) - return self._from_table(out_rank_table).astype(np.float64) + return self._from_data(data, index).astype(np.float64) def repeat(self, repeats, axis=None): """Repeats elements consecutively. @@ -1639,24 +1720,24 @@ def _repeat(self, count): if not is_scalar(count): count = as_column(count) - result = self.__class__._from_table( - libcudf.filling.repeat(self, count) + result = self.__class__._from_data( + *libcudf.filling.repeat(self, count) ) result._copy_type_metadata(self) return result def _reverse(self): - result = self.__class__._from_table(libcudf.copying.reverse(self)) - return result + return self.__class__._from_data(*libcudf.copying.reverse(self)) def _fill(self, fill_values, begin, end, inplace): col_and_fill = zip(self._columns, fill_values) if not inplace: data_columns = (c._fill(v, begin, end) for (c, v) in col_and_fill) - data = zip(self._column_names, data_columns) - return self.__class__._from_table(Frame(data, self._index)) + return self.__class__._from_data( + zip(self._column_names, data_columns), self._index + ) for (c, v) in col_and_fill: c.fill(v, begin, end, inplace=True) @@ -1671,8 +1752,9 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): def _shift(self, offset, fill_value=None): data_columns = (col.shift(offset, fill_value) for col in self._columns) - data = zip(self._column_names, data_columns) - return self.__class__._from_table(Frame(data, self._index)) + return self.__class__._from_data( + zip(self._column_names, data_columns), self._index + ) def __array__(self, dtype=None): raise TypeError( @@ -1792,13 +1874,11 @@ def round(self, decimals=0, how="half_even"): "decimals must be an integer, a dict-like or a Series" ) - return self.__class__._from_table( - Frame( - data=cudf.core.column_accessor.ColumnAccessor( - cols, - multiindex=self._data.multiindex, - level_names=self._data.level_names, - ) + return self.__class__._from_data( + data=cudf.core.column_accessor.ColumnAccessor( + cols, + 
multiindex=self._data.multiindex, + level_names=self._data.level_names, ), index=self._index, ) @@ -1923,8 +2003,8 @@ def sample( else: seed = np.int64(random_state) - result = self._from_table( - libcudf.copying.sample( + result = self.__class__._from_data( + *libcudf.copying.sample( self, n=n, replace=replace, @@ -2064,12 +2144,12 @@ def from_arrow(cls, data): ) # Handle dict arrays - cudf_category_frame = libcudf.table.Table() + cudf_category_frame = {} if len(dict_indices): dict_indices_table = pa.table(dict_indices) data = data.drop(dict_indices_table.column_names) - cudf_indices_frame = libcudf.interop.from_arrow( + cudf_indices_frame, _ = libcudf.interop.from_arrow( dict_indices_table, dict_indices_table.column_names ) # as dictionary size can vary, it can't be a single table @@ -2078,9 +2158,8 @@ def from_arrow(cls, data): for name in dict_dictionaries.keys() } - for name in cudf_indices_frame._data.names: - codes = cudf_indices_frame._data[name] - cudf_category_frame._data[name] = build_categorical_column( + for name, codes in cudf_indices_frame.items(): + cudf_category_frame[name] = build_categorical_column( cudf_dictionaries_columns[name], codes, mask=codes.base_mask, @@ -2090,30 +2169,20 @@ def from_arrow(cls, data): # Handle non-dict arrays cudf_non_category_frame = ( - libcudf.table.Table() + {} if data.num_columns == 0 - else libcudf.interop.from_arrow(data, data.column_names) + else libcudf.interop.from_arrow(data, data.column_names)[0] ) - if ( - cudf_non_category_frame._num_columns > 0 - and cudf_category_frame._num_columns > 0 - ): - result = cudf_non_category_frame - for name in cudf_category_frame._data.names: - result._data[name] = cudf_category_frame._data[name] - elif cudf_non_category_frame._num_columns > 0: - result = cudf_non_category_frame - else: - result = cudf_category_frame + result = {**cudf_non_category_frame, **cudf_category_frame} # There are some special cases that need to be handled # based on metadata. if pandas_dtypes: - for name in result._data.names: + for name in result: dtype = None if ( - len(result._data[name]) == 0 + len(result[name]) == 0 and pandas_dtypes[name] == "categorical" ): # When pandas_dtype is a categorical column and the size @@ -2139,18 +2208,14 @@ def from_arrow(cls, data): # struct fields, hence renaming the struct fields is # necessary by extracting the field names from arrow # struct types. 
- result._data[name] = result._data[name]._rename_fields( + result[name] = result[name]._rename_fields( [field.name for field in data[name].type] ) if dtype is not None: - result._data[name] = result._data[name].astype(dtype) - - result = libcudf.table.Table( - result._data.select_by_label(column_names) - ) + result[name] = result[name].astype(dtype) - return cls._from_table(result) + return cls._from_data({name: result[name] for name in column_names}) @annotate("TO_ARROW", color="orange", domain="cudf_python") def to_arrow(self): @@ -2209,8 +2274,8 @@ def drop_duplicates( if len(subset_cols) == 0: return self.copy(deep=True) - result = self._from_table( - libcudf.stream_compaction.drop_duplicates( + result = self.__class__._from_data( + *libcudf.stream_compaction.drop_duplicates( self, keys=subset, keep=keep, @@ -2256,7 +2321,7 @@ def replace(self, to_replace: Any, replacement: Any) -> Frame: else: copy_data = self._data.copy(deep=True) - result = self._from_table(Frame(copy_data, self._index)) + result = self._from_data(copy_data, self._index) return result @@ -2278,15 +2343,17 @@ def _copy_type_metadata( if include_index: if self._index is not None and other._index is not None: self._index._copy_type_metadata(other._index) - # When other._index is a CategoricalIndex, there is + # When other._index is a CategoricalIndex, the current index + # will be a NumericalIndex with an underlying CategoricalColumn + # (the above _copy_type_metadata call will have converted the + # column). Calling cudf.Index on that column generates the + # appropriate index. if isinstance( other._index, cudf.core.index.CategoricalIndex ) and not isinstance( self._index, cudf.core.index.CategoricalIndex ): - self._index = cudf.core.index.Index._from_table( - self._index - ) + self._index = cudf.Index(self._index._column) return self @@ -2376,8 +2443,9 @@ def isnull(self): GenericIndex([False, False, True, True, False, False], dtype='bool') """ data_columns = (col.isnull() for col in self._columns) - data = zip(self._column_names, data_columns) - return self.__class__._from_table(Frame(data, self._index)) + return self.__class__._from_data( + zip(self._column_names, data_columns), self._index + ) # Alias for isnull isna = isnull @@ -2456,8 +2524,9 @@ def notnull(self): GenericIndex([True, True, False, False, True, True], dtype='bool') """ data_columns = (col.notnull() for col in self._columns) - data = zip(self._column_names, data_columns) - return self.__class__._from_table(Frame(data, self._index)) + return self.__class__._from_data( + zip(self._column_names, data_columns), self._index + ) # Alias for notnull notna = notnull @@ -2526,7 +2595,7 @@ def tile(self, count): ------- The table containing the tiled "rows". 
""" - result = self.__class__._from_table(libcudf.reshape.tile(self, count)) + result = self.__class__._from_data(*libcudf.reshape.tile(self, count)) result._copy_type_metadata(self) return result @@ -3264,20 +3333,16 @@ def _is_sorted(self, ascending=None, null_position=None): ) def _split(self, splits, keep_index=True): - result = libcudf.copying.table_split( + results = libcudf.copying.table_split( self, splits, keep_index=keep_index ) - result = [self.__class__._from_table(tbl) for tbl in result] - return result + return [self.__class__._from_data(*result) for result in results] def _encode(self): - keys, indices = libcudf.transform.table_encode(self) - keys = self.__class__._from_table(keys) - for col in keys._data: - keys._data[col] = keys._data[col]._with_type_metadata( - self._data[col].dtype - ) - + data, index, indices = libcudf.transform.table_encode(self) + for name, col in data.items(): + data[name] = col._with_type_metadata(self._data[name].dtype) + keys = self.__class__._from_data(data, index) return keys, indices def _reindex( @@ -3312,7 +3377,7 @@ def _reindex( if index is not None: index = cudf.core.index.as_index(index) - if isinstance(index, cudf.core.MultiIndex): + if isinstance(index, cudf.MultiIndex): idx_dtype_match = ( df.index._source_data.dtypes == index._source_data.dtypes ).all() @@ -3344,13 +3409,11 @@ def _reindex( for name in names } - result = self.__class__._from_table( - Frame( - data=cudf.core.column_accessor.ColumnAccessor( - cols, - multiindex=self._data.multiindex, - level_names=self._data.level_names, - ) + result = self.__class__._from_data( + data=cudf.core.column_accessor.ColumnAccessor( + cols, + multiindex=self._data.multiindex, + level_names=self._data.level_names, ), index=index, ) @@ -3359,8 +3422,9 @@ def _reindex( def _unaryop(self, op): data_columns = (col.unary_operator(op) for col in self._columns) - data = zip(self._column_names, data_columns) - return self.__class__._from_table(Frame(data, self._index)) + return self.__class__._from_data( + zip(self._column_names, data_columns), self._index + ) def _binaryop( self, @@ -3613,6 +3677,860 @@ def __pos__(self): def __abs__(self): return self._unaryop("abs") + # Reductions + @classmethod + def _get_axis_from_axis_arg(cls, axis): + try: + return cls._SUPPORT_AXIS_LOOKUP[axis] + except KeyError: + raise ValueError(f"No axis named {axis} for object type {cls}") + + def _reduce(self, *args, **kwargs): + raise NotImplementedError( + f"Reductions are not supported for objects of type {type(self)}." + ) + + def min( + self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs, + ): + """ + Return the minimum of the values in the DataFrame. + + Parameters + ---------- + axis: {index (0), columns(1)} + Axis for the function to be applied on. + skipna: bool, default True + Exclude NA/null values when computing the result. + level: int or level name, default None + If the axis is a MultiIndex (hierarchical), count along a + particular level, collapsing into a Series. + numeric_only: bool, default None + Include only float, int, boolean columns. If None, will attempt to + use everything, then use only numeric data. + + Returns + ------- + Series + + Notes + ----- + Parameters currently not supported are `level`, `numeric_only`. 
+ + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) + >>> df.min() + a 1 + b 7 + dtype: int64 + """ + return self._reduce( + "min", + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + **kwargs, + ) + + def max( + self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs, + ): + """ + Return the maximum of the values in the DataFrame. + + Parameters + ---------- + axis: {index (0), columns(1)} + Axis for the function to be applied on. + skipna: bool, default True + Exclude NA/null values when computing the result. + level: int or level name, default None + If the axis is a MultiIndex (hierarchical), count along a + particular level, collapsing into a Series. + numeric_only: bool, default None + Include only float, int, boolean columns. If None, will attempt to + use everything, then use only numeric data. + + Returns + ------- + Series + + Notes + ----- + Parameters currently not supported are `level`, `numeric_only`. + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) + >>> df.max() + a 4 + b 10 + dtype: int64 + """ + return self._reduce( + "max", + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + **kwargs, + ) + + def sum( + self, + axis=None, + skipna=None, + dtype=None, + level=None, + numeric_only=None, + min_count=0, + **kwargs, + ): + """ + Return sum of the values in the DataFrame. + + Parameters + ---------- + + axis: {index (0), columns(1)} + Axis for the function to be applied on. + skipna: bool, default True + Exclude NA/null values when computing the result. + dtype: data type + Data type to cast the result to. + min_count: int, default 0 + The required number of valid values to perform the operation. + If fewer than min_count non-NA values are present the result + will be NA. + + The default being 0. This means the sum of an all-NA or empty + Series is 0, and the product of an all-NA or empty Series is 1. + + Returns + ------- + Series + + Notes + ----- + Parameters currently not supported are `level`, `numeric_only`. + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) + >>> df.sum() + a 10 + b 34 + dtype: int64 + """ + return self._reduce( + "sum", + axis=axis, + skipna=skipna, + dtype=dtype, + level=level, + numeric_only=numeric_only, + min_count=min_count, + **kwargs, + ) + + def product( + self, + axis=None, + skipna=None, + dtype=None, + level=None, + numeric_only=None, + min_count=0, + **kwargs, + ): + """ + Return product of the values in the DataFrame. + + Parameters + ---------- + + axis: {index (0), columns(1)} + Axis for the function to be applied on. + skipna: bool, default True + Exclude NA/null values when computing the result. + dtype: data type + Data type to cast the result to. + min_count: int, default 0 + The required number of valid values to perform the operation. + If fewer than min_count non-NA values are present the result + will be NA. + + The default being 0. This means the sum of an all-NA or empty + Series is 0, and the product of an all-NA or empty Series is 1. + + Returns + ------- + Series + + Notes + ----- + Parameters currently not supported are level`, `numeric_only`. 
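``sum`` above (and ``product`` just below) take the pandas-style ``min_count`` parameter described in their docstrings; a hedged illustration of what the default of 0 means (expected to match the documented behaviour, not verified output):

    >>> import cudf
    >>> s = cudf.Series([None, None], dtype="float64")
    >>> s.sum()               # min_count=0: an all-NA sum is documented to be 0
    >>> s.sum(min_count=1)    # fewer than one valid value present, so the result is expected to be NA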
+ + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) + >>> df.product() + a 24 + b 5040 + dtype: int64 + """ + axis = self._get_axis_from_axis_arg(axis) + return self._reduce( + # cuDF columns use "product" as the op name, but cupy uses "prod" + # and we need cupy if axis == 1. + "product" if axis == 0 else "prod", + axis=axis, + skipna=skipna, + dtype=dtype, + level=level, + numeric_only=numeric_only, + min_count=min_count, + **kwargs, + ) + + # Alias for pandas compatibility. + prod = product + + def mean( + self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs + ): + """ + Return the mean of the values for the requested axis. + + Parameters + ---------- + axis : {0 or 'index', 1 or 'columns'} + Axis for the function to be applied on. + skipna : bool, default True + Exclude NA/null values when computing the result. + level : int or level name, default None + If the axis is a MultiIndex (hierarchical), count along a + particular level, collapsing into a Series. + numeric_only : bool, default None + Include only float, int, boolean columns. If None, will attempt to + use everything, then use only numeric data. Not implemented for + Series. + **kwargs + Additional keyword arguments to be passed to the function. + + Returns + ------- + mean : Series or DataFrame (if level specified) + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) + >>> df.mean() + a 2.5 + b 8.5 + dtype: float64 + """ + return self._reduce( + "mean", + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + **kwargs, + ) + + def std( + self, + axis=None, + skipna=None, + level=None, + ddof=1, + numeric_only=None, + **kwargs, + ): + """ + Return sample standard deviation of the DataFrame. + + Normalized by N-1 by default. This can be changed using + the `ddof` argument + + Parameters + ---------- + + axis: {index (0), columns(1)} + Axis for the function to be applied on. + skipna: bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. + ddof: int, default 1 + Delta Degrees of Freedom. The divisor used in calculations + is N - ddof, where N represents the number of elements. + + Returns + ------- + Series + + Notes + ----- + Parameters currently not supported are `level` and + `numeric_only` + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) + >>> df.std() + a 1.290994 + b 1.290994 + dtype: float64 + """ + + return self._reduce( + "std", + axis=axis, + skipna=skipna, + level=level, + ddof=ddof, + numeric_only=numeric_only, + **kwargs, + ) + + def var( + self, + axis=None, + skipna=None, + level=None, + ddof=1, + numeric_only=None, + **kwargs, + ): + """ + Return unbiased variance of the DataFrame. + + Normalized by N-1 by default. This can be changed using the + ddof argument + + Parameters + ---------- + + axis: {index (0), columns(1)} + Axis for the function to be applied on. + skipna: bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. + ddof: int, default 1 + Delta Degrees of Freedom. The divisor used in calculations is + N - ddof, where N represents the number of elements. 
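``product`` (defined earlier in this hunk) resolves the axis before reducing because the column-wise op is registered as ``"product"`` while the row-wise path goes through cupy, which spells it ``"prod"``; ``prod`` itself is only an alias. A short usage sketch (row values worked out by hand, not verified output):

    >>> import cudf
    >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]})
    >>> df.prod()            # alias for df.product(); column-wise uses the "product" op name
    >>> df.product(axis=1)   # dispatched to cupy as "prod"; expected per-row values 7, 16, 27, 40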
+ + Returns + ------- + scalar + + Notes + ----- + Parameters currently not supported are `level` and + `numeric_only` + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) + >>> df.var() + a 1.666667 + b 1.666667 + dtype: float64 + """ + return self._reduce( + "var", + axis=axis, + skipna=skipna, + level=level, + ddof=ddof, + numeric_only=numeric_only, + **kwargs, + ) + + def kurtosis( + self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs + ): + """ + Return Fisher's unbiased kurtosis of a sample. + + Kurtosis obtained using Fisher’s definition of + kurtosis (kurtosis of normal == 0.0). Normalized by N-1. + + Parameters + ---------- + + axis: {index (0), columns(1)} + Axis for the function to be applied on. + skipna: bool, default True + Exclude NA/null values when computing the result. + + Returns + ------- + Series or scalar + + Notes + ----- + Parameters currently not supported are `level` and `numeric_only` + + Examples + -------- + **Series** + + >>> import cudf + >>> series = cudf.Series([1, 2, 3, 4]) + >>> series.kurtosis() + -1.1999999999999904 + + **DataFrame** + + >>> import cudf + >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) + >>> df.kurt() + a -1.2 + b -1.2 + dtype: float64 + """ + if axis not in (0, "index", None): + raise NotImplementedError("Only axis=0 is currently supported.") + + return self._reduce( + "kurtosis", + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + **kwargs, + ) + + # Alias for kurtosis. + @copy_docstring(kurtosis) + def kurt( + self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs + ): + return self.kurtosis( + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + **kwargs, + ) + + def skew( + self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs + ): + """ + Return unbiased Fisher-Pearson skew of a sample. + + Parameters + ---------- + skipna: bool, default True + Exclude NA/null values when computing the result. + + Returns + ------- + Series + + Notes + ----- + Parameters currently not supported are `axis`, `level` and + `numeric_only` + + Examples + -------- + **Series** + + >>> import cudf + >>> series = cudf.Series([1, 2, 3, 4, 5, 6, 6]) + >>> series + 0 1 + 1 2 + 2 3 + 3 4 + 4 5 + 5 6 + 6 6 + dtype: int64 + + **DataFrame** + + >>> import cudf + >>> df = cudf.DataFrame({'a': [3, 2, 3, 4], 'b': [7, 8, 10, 10]}) + >>> df.skew() + a 0.00000 + b -0.37037 + dtype: float64 + """ + if axis not in (0, "index", None): + raise NotImplementedError("Only axis=0 is currently supported.") + + return self._reduce( + "skew", + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + **kwargs, + ) + + def all(self, axis=0, skipna=True, level=None, **kwargs): + """ + Return whether all elements are True in DataFrame. + + Parameters + ---------- + + skipna: bool, default True + Exclude NA/null values. If the entire row/column is NA and + skipna is True, then the result will be True, as for an + empty row/column. + If skipna is False, then NA are treated as True, because + these are not equal to zero. + + Returns + ------- + Series + + Notes + ----- + Parameters currently not supported are `axis`, `bool_only`, `level`. 
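Both ``kurtosis``/``kurt`` and ``skew`` above guard against anything other than the index axis, so only ``0``, ``"index"`` or ``None`` are accepted:

    >>> import cudf
    >>> df = cudf.DataFrame({'a': [3, 2, 3, 4], 'b': [7, 8, 10, 10]})
    >>> df.kurtosis(axis=1)    # raises NotImplementedError("Only axis=0 is currently supported.")
    >>> df.skew(axis="index")  # accepted spelling, equivalent to axis=0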
+ + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a': [3, 2, 3, 4], 'b': [7, 0, 10, 10]}) + >>> df.all() + a True + b False + dtype: bool + """ + return self._reduce( + "all", axis=axis, skipna=skipna, level=level, **kwargs, + ) + + def any(self, axis=0, skipna=True, level=None, **kwargs): + """ + Return whether any elements is True in DataFrame. + + Parameters + ---------- + + skipna: bool, default True + Exclude NA/null values. If the entire row/column is NA and + skipna is True, then the result will be False, as for an + empty row/column. + If skipna is False, then NA are treated as True, because + these are not equal to zero. + + Returns + ------- + Series + + Notes + ----- + Parameters currently not supported are `axis`, `bool_only`, `level`. + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a': [3, 2, 3, 4], 'b': [7, 0, 10, 10]}) + >>> df.any() + a True + b True + dtype: bool + """ + return self._reduce( + "any", axis=axis, skipna=skipna, level=level, **kwargs, + ) + + def sum_of_squares(self, dtype=None): + """Return the sum of squares of values. + + Parameters + ---------- + dtype: data type + Data type to cast the result to. + + Returns + ------- + Series + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a': [3, 2, 3, 4], 'b': [7, 0, 10, 10]}) + >>> df.sum_of_squares() + a 38 + b 249 + dtype: int64 + """ + return self._reduce("sum_of_squares", dtype=dtype) + + def median( + self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs + ): + """ + Return the median of the values for the requested axis. + + Parameters + ---------- + + skipna : bool, default True + Exclude NA/null values when computing the result. + + Returns + ------- + scalar + + Notes + ----- + Parameters currently not supported are `level` and `numeric_only`. + + Examples + -------- + >>> import cudf + >>> ser = cudf.Series([10, 25, 3, 25, 24, 6]) + >>> ser + 0 10 + 1 25 + 2 3 + 3 25 + 4 24 + 5 6 + dtype: int64 + >>> ser.median() + 17.0 + """ + return self._reduce( + "median", + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + **kwargs, + ) + + # Scans + def _scan(self, op, axis=None, skipna=True, cast_to_int=False): + skipna = True if skipna is None else skipna + + results = {} + for name, col in self._data.items(): + if skipna: + result_col = self._data[name].nans_to_nulls() + else: + result_col = self._data[name].copy() + if result_col.has_nulls: + # Workaround as find_first_value doesn't seem to work + # incase of bools. + first_index = int( + result_col.isnull().astype("int8").find_first_value(1) + ) + result_col[first_index:] = None + + if ( + cast_to_int + and not is_decimal_dtype(result_col.dtype) + and ( + np.issubdtype(result_col.dtype, np.integer) + or np.issubdtype(result_col.dtype, np.bool_) + ) + ): + # For reductions that accumulate a value (e.g. sum, not max) + # pandas returns an int64 dtype for all int or bool dtypes. + result_col = result_col.astype(np.int64) + results[name] = result_col._apply_scan_op(op) + # TODO: This will work for Index because it's passing self._index + # (which is None), but eventually we may want to remove that parameter + # for Index._from_data and simplify. + return self._from_data(results, index=self._index) + + def cummin(self, axis=None, skipna=True, *args, **kwargs): + """ + Return cumulative minimum of the Series or DataFrame. + + Parameters + ---------- + + axis: {index (0), columns(1)} + Axis for the function to be applied on. 
+ skipna: bool, default True + Exclude NA/null values. If an entire row/column is NA, + the result will be NA. + + Returns + ------- + Series or DataFrame + + Examples + -------- + **Series** + + >>> import cudf + >>> ser = cudf.Series([1, 5, 2, 4, 3]) + >>> ser.cummin() + 0 1 + 1 1 + 2 1 + 3 1 + 4 1 + + **DataFrame** + + >>> import cudf + >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) + >>> df.cummin() + a b + 0 1 7 + 1 1 7 + 2 1 7 + 3 1 7 + """ + return self._scan("min", axis=axis, skipna=skipna, *args, **kwargs) + + def cummax(self, axis=None, skipna=True, *args, **kwargs): + """ + Return cumulative maximum of the Series or DataFrame. + + Parameters + ---------- + + axis: {index (0), columns(1)} + Axis for the function to be applied on. + skipna: bool, default True + Exclude NA/null values. If an entire row/column is NA, + the result will be NA. + + Returns + ------- + Series or DataFrame + + Examples + -------- + **Series** + + >>> import cudf + >>> ser = cudf.Series([1, 5, 2, 4, 3]) + >>> ser.cummax() + 0 1 + 1 5 + 2 5 + 3 5 + 4 5 + + **DataFrame** + + >>> import cudf + >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) + >>> df.cummax() + a b + 0 1 7 + 1 2 8 + 2 3 9 + 3 4 10 + """ + return self._scan("max", axis=axis, skipna=skipna, *args, **kwargs) + + def cumsum(self, axis=None, skipna=True, *args, **kwargs): + """ + Return cumulative sum of the Series or DataFrame. + + Parameters + ---------- + + axis: {index (0), columns(1)} + Axis for the function to be applied on. + skipna: bool, default True + Exclude NA/null values. If an entire row/column is NA, + the result will be NA. + + + Returns + ------- + Series or DataFrame + + Examples + -------- + **Series** + + >>> import cudf + >>> ser = cudf.Series([1, 5, 2, 4, 3]) + >>> ser.cumsum() + 0 1 + 1 6 + 2 8 + 3 12 + 4 15 + + **DataFrame** + + >>> import cudf + >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) + >>> s.cumsum() + a b + 0 1 7 + 1 3 15 + 2 6 24 + 3 10 34 + """ + return self._scan( + "sum", axis=axis, skipna=skipna, cast_to_int=True, *args, **kwargs + ) + + def cumprod(self, axis=None, skipna=True, *args, **kwargs): + """ + Return cumulative product of the Series or DataFrame. + + Parameters + ---------- + + axis: {index (0), columns(1)} + Axis for the function to be applied on. + skipna: bool, default True + Exclude NA/null values. If an entire row/column is NA, + the result will be NA. + + Returns + ------- + Series or DataFrame + + Examples + -------- + **Series** + + >>> import cudf + >>> ser = cudf.Series([1, 5, 2, 4, 3]) + >>> ser.cumprod() + 0 1 + 1 5 + 2 10 + 3 40 + 4 120 + + **DataFrame** + + >>> import cudf + >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) + >>> s.cumprod() + a b + 0 1 7 + 1 2 56 + 2 6 504 + 3 24 5040 + """ + return self._scan( + "prod", axis=axis, skipna=skipna, cast_to_int=True, *args, **kwargs + ) + class SingleColumnFrame(Frame): """A one-dimensional frame. @@ -3621,6 +4539,46 @@ class SingleColumnFrame(Frame): this class. 
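The ``_scan`` helper that backs the cumulative methods above does three things per column before calling ``_apply_scan_op``: with ``skipna`` it converts NaNs to nulls, without it it nulls out every row from the first null onward (the documented ``find_first_value`` workaround), and for accumulating ops on int/bool columns it casts to int64 to match pandas. A hedged illustration (results reasoned from that code, not verified output):

    >>> import cudf
    >>> s = cudf.Series([1, None, 3], dtype="int8")
    >>> s.cumsum()               # null skipped; expected int64 result [1, <NA>, 4]
    >>> s.cumsum(skipna=False)   # everything from the first null on becomes null: [1, <NA>, <NA>]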
""" + _SUPPORT_AXIS_LOOKUP = { + 0: 0, + None: 0, + "index": 0, + } + + def _reduce( + self, op, axis=None, level=None, numeric_only=None, **kwargs, + ): + if axis not in (None, 0): + raise NotImplementedError("axis parameter is not implemented yet") + + if level is not None: + raise NotImplementedError("level parameter is not implemented yet") + + if numeric_only not in (None, True): + raise NotImplementedError( + "numeric_only parameter is not implemented yet" + ) + return getattr(self._column, op)(**kwargs) + + def _scan(self, op, axis=None, *args, **kwargs): + if axis not in (None, 0): + raise NotImplementedError("axis parameter is not implemented yet") + + return super()._scan(op, axis=axis, *args, **kwargs) + + @classmethod + def _from_data( + cls, + data: MutableMapping, + index: Optional[cudf.core.index.BaseIndex] = None, + name: Any = None, + ): + + out = super()._from_data(data, index) + if name is not None: + out.name = name + return out + @property def name(self): """The name of this object.""" @@ -3642,6 +4600,12 @@ def shape(self): return (len(self),) def __iter__(self): + """ + Iterating over a GPU object is not effecient and hence not supported. + + Consider using ``.to_arrow()``, ``.to_pandas()`` or ``.values_host`` + if you wish to iterate over the values. + """ cudf.utils.utils.raise_iteration_error(obj=self) def __len__(self): @@ -3895,16 +4859,6 @@ def factorize(self, na_sentinel=-1): """ return cudf.core.algorithms.factorize(self, na_sentinel=na_sentinel) - @property - def _copy_construct_defaults(self): - """A default dictionary of kwargs to be used for copy construction.""" - raise NotImplementedError - - def _copy_construct(self, **kwargs): - """Shallow copy this object by replacing certain ctor args. - """ - return self.__class__(**{**self._copy_construct_defaults, **kwargs}) - def _binaryop( self, other: T, @@ -3963,8 +4917,9 @@ def _binaryop( result_name: (self._column, other, reflect, fill_value) } - return self._copy_construct( - data=type(self)._colwise_binop(operands, fn)[result_name], + return self._from_data( + data=type(self)._colwise_binop(operands, fn), + index=self._index, name=result_name, ) @@ -4012,7 +4967,7 @@ def _get_replacement_values_for_columns( col: [value] if _is_non_decimal_numeric_dtype(columns_dtype_map[col]) else cudf.utils.utils.scalar_broadcast_to( - value, (len(to_replace),), np.dtype(type(value)), + value, (len(to_replace),), cudf.dtype(type(value)), ) for col in columns_dtype_map } diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 29c29691389..fd425d9de76 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -195,9 +195,9 @@ def agg(self, func): # Note: When there are no key columns, the below produces # a Float64Index, while Pandas returns an Int64Index # (GH: 6945) - result = self._groupby.aggregate(self.obj, normalized_aggs) - - result = cudf.DataFrame._from_table(result) + result = cudf.DataFrame._from_data( + *self._groupby.aggregate(self.obj, normalized_aggs) + ) if self._sort: result = result.sort_index() @@ -220,13 +220,17 @@ def agg(self, func): else: raise + if libgroupby._is_all_scan_aggregate(normalized_aggs): + # Scan aggregations return rows in original index order + return self._mimic_pandas_order(result) + # set index names to be group key names if len(result): result.index.names = self.grouping.names # copy categorical information from keys to the result index: result.index._copy_type_metadata(self.grouping.keys) - 
result._index = cudf.core.index.Index._from_table(result._index) + result._index = cudf.Index(result._index) if not self._as_index: for col_name in reversed(self.grouping._named_columns): @@ -288,9 +292,7 @@ def deserialize(cls, header, frames): def _grouped(self): grouped_keys, grouped_values, offsets = self._groupby.groups(self.obj) - - grouped_keys = cudf.Index._from_table(grouped_keys) - grouped_values = self.obj.__class__._from_table(grouped_values) + grouped_values = self.obj.__class__._from_data(*grouped_values) grouped_values._copy_type_metadata(self.obj) group_names = grouped_keys.unique() return (group_names, offsets, grouped_keys, grouped_values) @@ -350,10 +352,10 @@ def pipe(self, func, *args, **kwargs): See also -------- - cudf.core.series.Series.pipe + cudf.Series.pipe Apply a function with arguments to a series. - cudf.core.dataframe.DataFrame.pipe + cudf.DataFrame.pipe Apply a function with arguments to a dataframe. apply @@ -447,7 +449,7 @@ def mult(df): """ if not callable(function): raise TypeError(f"type {type(function)} is not callable") - _, offsets, _, grouped_values = self._grouped() + group_names, offsets, _, grouped_values = self._grouped() ngroups = len(offsets) - 1 if ngroups > self._MAX_GROUPS_BEFORE_WARN: @@ -465,9 +467,7 @@ def mult(df): return self.obj.__class__() if cudf.utils.dtypes.is_scalar(chunk_results[0]): - result = cudf.Series( - chunk_results, index=self.grouping.keys[offsets[:-1]] - ) + result = cudf.Series(chunk_results, index=group_names) result.index.names = self.grouping.names elif isinstance(chunk_results[0], cudf.Series): result = cudf.concat(chunk_results, axis=1).T @@ -815,14 +815,21 @@ def cummax(self): """Get the column-wise cumulative maximum value in each group.""" return self.agg("cummax") + def first(self): + """Get the first non-null value in each group.""" + return self.agg("first") + + def last(self): + """Get the last non-null value in each group.""" + return self.agg("last") + def _scan_fill(self, method: str, limit: int) -> DataFrameOrSeries: """Internal implementation for `ffill` and `bfill` """ value_columns = self.grouping.values - result = self._groupby.replace_nulls( - Table(value_columns._data), method + result = self.obj.__class__._from_data( + self._groupby.replace_nulls(Table(value_columns._data), method) ) - result = self.obj.__class__._from_table(result) result = self._mimic_pandas_order(result) return result._copy_type_metadata(value_columns) @@ -936,9 +943,9 @@ def fillna( return getattr(self, method, limit)() value_columns = self.grouping.values - _, grouped_values, _ = self._groupby.groups(Table(value_columns._data)) + _, (data, index), _ = self._groupby.groups(Table(value_columns._data)) - grouped = self.obj.__class__._from_data(grouped_values._data) + grouped = self.obj.__class__._from_data(data, index) result = grouped.fillna( value=value, inplace=inplace, axis=axis, limit=limit ) @@ -984,21 +991,20 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): if not axis == 0: raise NotImplementedError("Only axis=0 is supported.") - value_column_names = [ - x for x in self.obj._column_names if x not in self.grouping.names - ] - num_columns_to_shift = len(value_column_names) + value_columns = self.grouping.values if is_list_like(fill_value): - if not len(fill_value) == num_columns_to_shift: + if not len(fill_value) == len(value_columns._data): raise ValueError( "Mismatched number of columns and values to fill." 
) else: - fill_value = [fill_value] * num_columns_to_shift + fill_value = [fill_value] * len(value_columns._data) - value_columns = self.obj._data.select_by_label(value_column_names) - result = self._groupby.shift(Table(value_columns), periods, fill_value) - return self.obj.__class__._from_table(result) + result = self.obj.__class__._from_data( + *self._groupby.shift(Table(value_columns), periods, fill_value) + ) + result = self._mimic_pandas_order(result) + return result._copy_type_metadata(value_columns) def _mimic_pandas_order( self, result: DataFrameOrSeries @@ -1007,104 +1013,103 @@ def _mimic_pandas_order( matching that of pandas. This also adds appropriate indices. """ sorted_order_column = arange(0, result._data.nrows) - _, order, _ = self._groupby.groups( + _, (order, _), _ = self._groupby.groups( Table({"sorted_order_column": sorted_order_column}) ) - order = order._data["sorted_order_column"] - gather_map = order.argsort() + gather_map = order["sorted_order_column"].argsort() result = result.take(gather_map) result.index = self.obj.index return result class DataFrameGroupBy(GroupBy, GetAttrGetItemMixin): + """ + Group DataFrame using a mapper or by a Series of columns. + + A groupby operation involves some combination of splitting the object, + applying a function, and combining the results. This can be used to + group large amounts of data and compute operations on these groups. + + Parameters + ---------- + by : mapping, function, label, or list of labels + Used to determine the groups for the groupby. If by is a + function, it’s called on each value of the object’s index. + If a dict or Series is passed, the Series or dict VALUES will + be used to determine the groups (the Series’ values are first + aligned; see .align() method). If a cupy array is passed, the + values are used as-is determine the groups. A label or list + of labels may be passed to group by the columns in self. + Notice that a tuple is interpreted as a (single) key. + level : int, level name, or sequence of such, default None + If the axis is a MultiIndex (hierarchical), group by a particular + level or levels. + as_index : bool, default True + For aggregated output, return object with group labels as + the index. Only relevant for DataFrame input. + as_index=False is effectively “SQL-style” grouped output. + sort : bool, default False + Sort result by group key. Differ from Pandas, cudf defaults to + ``False`` for better performance. Note this does not influence + the order of observations within each group. Groupby preserves + the order of rows within each group. + dropna : bool, optional + If True (default), do not include the "null" group. + + Returns + ------- + DataFrameGroupBy + Returns a groupby object that contains information + about the groups. + + Examples + -------- + >>> import cudf + >>> import pandas as pd + >>> df = cudf.DataFrame({'Animal': ['Falcon', 'Falcon', + ... 'Parrot', 'Parrot'], + ... 'Max Speed': [380., 370., 24., 26.]}) + >>> df + Animal Max Speed + 0 Falcon 380.0 + 1 Falcon 370.0 + 2 Parrot 24.0 + 3 Parrot 26.0 + >>> df.groupby(['Animal']).mean() + Max Speed + Animal + Falcon 375.0 + Parrot 25.0 + + >>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'], + ... 
['Captive', 'Wild', 'Captive', 'Wild']] + >>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type')) + >>> df = cudf.DataFrame({'Max Speed': [390., 350., 30., 20.]}, + index=index) + >>> df + Max Speed + Animal Type + Falcon Captive 390.0 + Wild 350.0 + Parrot Captive 30.0 + Wild 20.0 + >>> df.groupby(level=0).mean() + Max Speed + Animal + Falcon 370.0 + Parrot 25.0 + >>> df.groupby(level="Type").mean() + Max Speed + Type + Wild 185.0 + Captive 210.0 + """ + _PROTECTED_KEYS = frozenset(("obj",)) def __init__( self, obj, by=None, level=None, sort=False, as_index=True, dropna=True ): - """ - Group DataFrame using a mapper or by a Series of columns. - - A groupby operation involves some combination of splitting the object, - applying a function, and combining the results. This can be used to - group large amounts of data and compute operations on these groups. - - Parameters - ---------- - by : mapping, function, label, or list of labels - Used to determine the groups for the groupby. If by is a - function, it’s called on each value of the object’s index. - If a dict or Series is passed, the Series or dict VALUES will - be used to determine the groups (the Series’ values are first - aligned; see .align() method). If a cupy array is passed, the - values are used as-is determine the groups. A label or list - of labels may be passed to group by the columns in self. - Notice that a tuple is interpreted as a (single) key. - level : int, level name, or sequence of such, default None - If the axis is a MultiIndex (hierarchical), group by a particular - level or levels. - as_index : bool, default True - For aggregated output, return object with group labels as - the index. Only relevant for DataFrame input. - as_index=False is effectively “SQL-style” grouped output. - sort : bool, default False - Sort result by group key. Differ from Pandas, cudf defaults to - ``False`` for better performance. Note this does not influence - the order of observations within each group. Groupby preserves - the order of rows within each group. - dropna : bool, optional - If True (default), do not include the "null" group. - - Returns - ------- - DataFrameGroupBy - Returns a groupby object that contains information - about the groups. - - Examples - -------- - >>> import cudf - >>> import pandas as pd - >>> df = cudf.DataFrame({'Animal': ['Falcon', 'Falcon', - ... 'Parrot', 'Parrot'], - ... 'Max Speed': [380., 370., 24., 26.]}) - >>> df - Animal Max Speed - 0 Falcon 380.0 - 1 Falcon 370.0 - 2 Parrot 24.0 - 3 Parrot 26.0 - >>> df.groupby(['Animal']).mean() - Max Speed - Animal - Falcon 375.0 - Parrot 25.0 - - >>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'], - ... ['Captive', 'Wild', 'Captive', 'Wild']] - >>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type')) - >>> df = cudf.DataFrame({'Max Speed': [390., 350., 30., 20.]}, - index=index) - >>> df - Max Speed - Animal Type - Falcon Captive 390.0 - Wild 350.0 - Parrot Captive 30.0 - Wild 20.0 - >>> df.groupby(level=0).mean() - Max Speed - Animal - Falcon 370.0 - Parrot 25.0 - >>> df.groupby(level="Type").mean() - Max Speed - Type - Wild 185.0 - Captive 210.0 - - """ super().__init__( obj=obj, by=by, @@ -1127,68 +1132,68 @@ def nunique(self): class SeriesGroupBy(GroupBy): + """ + Group Series using a mapper or by a Series of columns. + + A groupby operation involves some combination of splitting the object, + applying a function, and combining the results. 
This can be used to + group large amounts of data and compute operations on these groups. + + Parameters + ---------- + by : mapping, function, label, or list of labels + Used to determine the groups for the groupby. If by is a + function, it’s called on each value of the object’s index. + If a dict or Series is passed, the Series or dict VALUES will + be used to determine the groups (the Series’ values are first + aligned; see .align() method). If an cupy array is passed, the + values are used as-is determine the groups. A label or list + of labels may be passed to group by the columns in self. + Notice that a tuple is interpreted as a (single) key. + level : int, level name, or sequence of such, default None + If the axis is a MultiIndex (hierarchical), group by a particular + level or levels. + as_index : bool, default True + For aggregated output, return object with group labels as + the index. Only relevant for DataFrame input. + as_index=False is effectively “SQL-style” grouped output. + sort : bool, default False + Sort result by group key. Differ from Pandas, cudf defaults to + ``False`` for better performance. Note this does not influence + the order of observations within each group. Groupby preserves + the order of rows within each group. + + Returns + ------- + SeriesGroupBy + Returns a groupby object that contains information + about the groups. + + Examples + -------- + >>> ser = cudf.Series([390., 350., 30., 20.], + ... index=['Falcon', 'Falcon', 'Parrot', 'Parrot'], + ... name="Max Speed") + >>> ser + Falcon 390.0 + Falcon 350.0 + Parrot 30.0 + Parrot 20.0 + Name: Max Speed, dtype: float64 + >>> ser.groupby(level=0).mean() + Falcon 370.0 + Parrot 25.0 + Name: Max Speed, dtype: float64 + >>> ser.groupby(ser > 100).mean() + Max Speed + False 25.0 + True 370.0 + Name: Max Speed, dtype: float64 + """ + def __init__( self, obj, by=None, level=None, sort=False, as_index=True, dropna=True ): - """ - Group Series using a mapper or by a Series of columns. - - A groupby operation involves some combination of splitting the object, - applying a function, and combining the results. This can be used to - group large amounts of data and compute operations on these groups. - - Parameters - ---------- - by : mapping, function, label, or list of labels - Used to determine the groups for the groupby. If by is a - function, it’s called on each value of the object’s index. - If a dict or Series is passed, the Series or dict VALUES will - be used to determine the groups (the Series’ values are first - aligned; see .align() method). If an cupy array is passed, the - values are used as-is determine the groups. A label or list - of labels may be passed to group by the columns in self. - Notice that a tuple is interpreted as a (single) key. - level : int, level name, or sequence of such, default None - If the axis is a MultiIndex (hierarchical), group by a particular - level or levels. - as_index : bool, default True - For aggregated output, return object with group labels as - the index. Only relevant for DataFrame input. - as_index=False is effectively “SQL-style” grouped output. - sort : bool, default False - Sort result by group key. Differ from Pandas, cudf defaults to - ``False`` for better performance. Note this does not influence - the order of observations within each group. Groupby preserves - the order of rows within each group. - - Returns - ------- - SeriesGroupBy - Returns a groupby object that contains information - about the groups. 
- - Examples - -------- - >>> ser = cudf.Series([390., 350., 30., 20.], - ... index=['Falcon', 'Falcon', 'Parrot', 'Parrot'], - ... name="Max Speed") - >>> ser - Falcon 390.0 - Falcon 350.0 - Parrot 30.0 - Parrot 20.0 - Name: Max Speed, dtype: float64 - >>> ser.groupby(level=0).mean() - Falcon 370.0 - Parrot 25.0 - Name: Max Speed, dtype: float64 - >>> ser.groupby(ser > 100).mean() - Max Speed - False 25.0 - True 370.0 - Name: Max Speed, dtype: float64 - - """ super().__init__( obj=obj, by=by, @@ -1215,6 +1220,14 @@ def agg(self, func): return result + def apply(self, func): + result = super().apply(func) + + # apply Series name to result + result.name = self.obj.name + + return result + class Grouper(object): def __init__(self, key=None, level=None): diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 691b6ab2e29..6be21ce74d2 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -4,7 +4,16 @@ import pickle from numbers import Number -from typing import Any, Dict, Optional, Tuple, Type, Union +from typing import ( + Any, + Dict, + List, + MutableMapping, + Optional, + Tuple, + Type, + Union, +) import cupy import numpy as np @@ -13,7 +22,7 @@ from pandas._config import get_option import cudf -from cudf._lib.datetime import is_leap_year +from cudf._lib.datetime import extract_quarter, is_leap_year from cudf._lib.filling import sequence from cudf._lib.search import search_sorted from cudf._lib.table import Table @@ -326,7 +335,7 @@ def set_names(self, names, level=None, inplace=False): See Also -------- - cudf.core.index.Index.rename : Able to set new names without level. + cudf.Index.rename : Able to set new names without level. Examples -------- @@ -518,83 +527,20 @@ def gpu_values(self): """ return self._values.data_array_view - def min(self): - """ - Return the minimum value of the Index. - - Returns - ------- - scalar - Minimum value. - - See Also - -------- - cudf.core.index.Index.max : Return the maximum value in an Index. - cudf.core.series.Series.min : Return the minimum value in a Series. - cudf.core.dataframe.DataFrame.min : Return the minimum values in - a DataFrame. - - Examples - -------- - >>> import cudf - >>> idx = cudf.Index([3, 2, 1]) - >>> idx.min() - 1 - """ - return self._values.min() - - def max(self): - """ - Return the maximum value of the Index. - - Returns - ------- - scalar - Maximum value. - - See Also - -------- - cudf.core.index.Index.min : Return the minimum value in an Index. - cudf.core.series.Series.max : Return the maximum value in a Series. - cudf.core.dataframe.DataFrame.max : Return the maximum values in - a DataFrame. - - Examples - -------- - >>> import cudf - >>> idx = cudf.Index([3, 2, 1]) - >>> idx.max() - 3 - """ - return self._values.max() - - def sum(self): - """ - Return the sum of all values of the Index. - - Returns - ------- - scalar - Sum of all values. 
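The per-method ``min``/``max``/``sum`` definitions being removed from the index base class here are now covered by the generic reductions added to ``Frame`` earlier in this diff, so the behaviour their old docstrings demonstrated should be unchanged:

    >>> import cudf
    >>> idx = cudf.Index([3, 2, 1])
    >>> idx.min(), idx.max(), idx.sum()   # still expected to give (1, 3, 6)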
- - Examples - -------- - >>> import cudf - >>> idx = cudf.Index([3, 2, 1]) - >>> idx.sum() - 6 - """ - return self._values.sum() - @classmethod def _concat(cls, objs): - data = concat_columns([o._values for o in objs]) + if all(isinstance(obj, RangeIndex) for obj in objs): + result = _concat_range_index(objs) + else: + data = concat_columns([o._values for o in objs]) + result = as_index(data) + names = {obj.name for obj in objs} if len(names) == 1: [name] = names else: name = None - result = as_index(data) + result.name = name return result @@ -646,12 +592,12 @@ def append(self, other): if is_mixed_with_object_dtype(this, other): got_dtype = ( other.dtype - if this.dtype == np.dtype("object") + if this.dtype == cudf.dtype("object") else this.dtype ) raise TypeError( f"cudf does not support appending an Index of " - f"dtype `{np.dtype('object')}` with an Index " + f"dtype `{cudf.dtype('object')}` with an Index " f"of dtype `{got_dtype}`, please type-cast " f"either one of them to same dtypes." ) @@ -724,39 +670,6 @@ def difference(self, other, sort=None): return difference - def _copy_construct(self, **kwargs): - # Need to override the parent behavior because pandas allows operations - # on unsigned types to return signed values, forcing us to choose the - # right index type here. - data = kwargs.get("data") - cls = self.__class__ - - if data is not None: - if self.dtype != data.dtype: - # TODO: This logic is largely copied from `as_index`. The two - # should be unified via a centralized type dispatching scheme. - if isinstance(data, NumericalColumn): - try: - cls = _dtype_to_index[data.dtype.type] - except KeyError: - cls = GenericIndex - elif isinstance(data, StringColumn): - cls = StringIndex - elif isinstance(data, DatetimeColumn): - cls = DatetimeIndex - elif isinstance(data, TimeDeltaColumn): - cls = TimedeltaIndex - elif isinstance(data, CategoricalColumn): - cls = CategoricalIndex - elif cls is RangeIndex: - # RangeIndex must convert to other numerical types for ops - try: - cls = _dtype_to_index[data.dtype.type] - except KeyError: - cls = GenericIndex - - return cls(**{**self._copy_construct_defaults, **kwargs}) - def sort_values(self, return_indexer=False, ascending=True, key=None): """ Return a sorted copy of the index, and optionally return the indices @@ -780,8 +693,8 @@ def sort_values(self, return_indexer=False, ascending=True, key=None): See Also -------- - cudf.core.series.Series.min : Sort values of a Series. - cudf.core.dataframe.DataFrame.sort_values : Sort values in a DataFrame. + cudf.Series.min : Sort values of a Series. + cudf.DataFrame.sort_values : Sort values in a DataFrame. 
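``_concat`` now keeps the result as a ``RangeIndex`` whenever every input is a ``RangeIndex`` (delegating to the ``_concat_range_index`` helper added at the end of this file) and only materialises columns otherwise; the name is preserved only when all inputs share it. Assuming ``append`` still routes through ``_concat`` (that part of the method is not shown in this hunk), the observable effect would be roughly:

    >>> import cudf
    >>> cudf.RangeIndex(0, 3).append(cudf.RangeIndex(3, 6))  # consecutive: expected to stay a RangeIndex
    >>> cudf.RangeIndex(0, 3).append(cudf.RangeIndex(5, 8))  # gap between ranges: expected to fall back to Int64Index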
Examples -------- @@ -1350,9 +1263,9 @@ def from_pandas(cls, index, nan_as_null=None): >>> import numpy as np >>> data = [10, 20, 30, np.nan] >>> pdi = pd.Index(data) - >>> cudf.core.index.Index.from_pandas(pdi) + >>> cudf.Index.from_pandas(pdi) Float64Index([10.0, 20.0, 30.0, ], dtype='float64') - >>> cudf.core.index.Index.from_pandas(pdi, nan_as_null=False) + >>> cudf.Index.from_pandas(pdi, nan_as_null=False) Float64Index([10.0, 20.0, 30.0, nan], dtype='float64') """ if not isinstance(index, pd.Index): @@ -1363,52 +1276,43 @@ def from_pandas(cls, index, nan_as_null=None): return ind @classmethod - def _from_table(cls, table): - if not isinstance(table, RangeIndex): - if table._num_columns == 0: - raise ValueError("Cannot construct Index from any empty Table") - if table._num_columns == 1: - values = next(iter(table._data.values())) - - if isinstance(values, NumericalColumn): - try: - index_class_type = _dtype_to_index[values.dtype.type] - except KeyError: - index_class_type = GenericIndex - out = super(BaseIndex, index_class_type).__new__( - index_class_type - ) - elif isinstance(values, DatetimeColumn): - out = super(BaseIndex, DatetimeIndex).__new__( - DatetimeIndex - ) - elif isinstance(values, TimeDeltaColumn): - out = super(BaseIndex, TimedeltaIndex).__new__( - TimedeltaIndex - ) - elif isinstance(values, StringColumn): - out = super(BaseIndex, StringIndex).__new__(StringIndex) - elif isinstance(values, CategoricalColumn): - out = super(BaseIndex, CategoricalIndex).__new__( - CategoricalIndex - ) - out._data = table._data - out._index = None - return out - else: - return cudf.MultiIndex._from_table( - table, names=table._data.names + def _from_data( + cls, + data: MutableMapping, + index: Optional[BaseIndex] = None, + name: Any = None, + ) -> BaseIndex: + assert index is None + if not isinstance(data, cudf.core.column_accessor.ColumnAccessor): + data = cudf.core.column_accessor.ColumnAccessor(data) + if len(data) == 0: + raise ValueError("Cannot construct Index from any empty Table") + if len(data) == 1: + values = next(iter(data.values())) + + if isinstance(values, NumericalColumn): + try: + index_class_type = _dtype_to_index[values.dtype.type] + except KeyError: + index_class_type = GenericIndex + out = super(BaseIndex, index_class_type).__new__( + index_class_type ) + elif isinstance(values, DatetimeColumn): + out = super(BaseIndex, DatetimeIndex).__new__(DatetimeIndex) + elif isinstance(values, TimeDeltaColumn): + out = super(BaseIndex, TimedeltaIndex).__new__(TimedeltaIndex) + elif isinstance(values, StringColumn): + out = super(BaseIndex, StringIndex).__new__(StringIndex) + elif isinstance(values, CategoricalColumn): + out = super(BaseIndex, CategoricalIndex).__new__( + CategoricalIndex + ) + out._data = data + out._index = None + return out else: - return as_index(table) - - @property - def _copy_construct_defaults(self): - return {"data": self._column, "name": self.name} - - @classmethod - def _from_data(cls, data, index=None): - return cls._from_table(SingleColumnFrame(data=data)) + return cudf.MultiIndex._from_data(data) @property def _constructor_expanddim(self): @@ -1640,7 +1544,7 @@ def dtype(self): """ `dtype` of the range of values in RangeIndex. 
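The rewritten ``Index._from_data`` above picks the concrete index class from the type of the single column it receives, and hands off to ``MultiIndex._from_data`` when more than one column is present. Condensed from the code, the dispatch is:

    values = next(iter(data.values()))
    # NumericalColumn    -> _dtype_to_index[values.dtype.type], falling back to GenericIndex
    # DatetimeColumn     -> DatetimeIndex
    # TimeDeltaColumn    -> TimedeltaIndex
    # StringColumn       -> StringIndex
    # CategoricalColumn  -> CategoricalIndex
    # len(data) > 1      -> cudf.MultiIndex._from_data(data)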
""" - return np.dtype(np.int64) + return cudf.dtype(np.int64) @property def is_contiguous(self): @@ -1783,25 +1687,25 @@ def __mul__(self, other): class GenericIndex(BaseIndex): - """An array of orderable values that represent the indices of another Column + """ + An array of orderable values that represent the indices of another Column Attributes ---------- _values: A Column object name: A string + + Parameters + ---------- + data : Column + The Column of data for this index + name : str optional + The name of the Index. If not provided, the Index adopts the value + Column's name. Otherwise if this name is different from the value + Column's, the data Column will be cloned to adopt this name. """ def __init__(self, data, **kwargs): - """ - Parameters - ---------- - data : Column - The Column of data for this index - name : str optional - The name of the Index. If not provided, the Index adopts the value - Column's name. Otherwise if this name is different from the value - Column's, the data Column will be cloned to adopt this name. - """ kwargs = _setdefault_name(data, **kwargs) # normalize the input @@ -2007,42 +1911,252 @@ def __init__(self, data=None, dtype=None, copy=False, name=None): class Int8Index(NumericIndex): + """ + Immutable, ordered and sliceable sequence of labels. + The basic object storing row labels for all cuDF objects. + Int8Index is a special case of Index with purely + integer(``int8``) labels. + + Parameters + ---------- + data : array-like (1-dimensional) + dtype : NumPy dtype, + but not used. + copy : bool + Make a copy of input data. + name : object + Name to be stored in the index. + + Returns + ------- + Int8Index + """ + _dtype = np.int8 class Int16Index(NumericIndex): + """ + Immutable, ordered and sliceable sequence of labels. + The basic object storing row labels for all cuDF objects. + Int16Index is a special case of Index with purely + integer(``int16``) labels. + + Parameters + ---------- + data : array-like (1-dimensional) + dtype : NumPy dtype, + but not used. + copy : bool + Make a copy of input data. + name : object + Name to be stored in the index. + + Returns + ------- + Int16Index + """ + _dtype = np.int16 class Int32Index(NumericIndex): + """ + Immutable, ordered and sliceable sequence of labels. + The basic object storing row labels for all cuDF objects. + Int32Index is a special case of Index with purely + integer(``int32``) labels. + + Parameters + ---------- + data : array-like (1-dimensional) + dtype : NumPy dtype, + but not used. + copy : bool + Make a copy of input data. + name : object + Name to be stored in the index. + + Returns + ------- + Int32Index + """ + _dtype = np.int32 class Int64Index(NumericIndex): + """ + Immutable, ordered and sliceable sequence of labels. + The basic object storing row labels for all cuDF objects. + Int64Index is a special case of Index with purely + integer(``int64``) labels. + + Parameters + ---------- + data : array-like (1-dimensional) + dtype : NumPy dtype, + but not used. + copy : bool + Make a copy of input data. + name : object + Name to be stored in the index. + + Returns + ------- + Int64Index + """ + _dtype = np.int64 class UInt8Index(NumericIndex): + """ + Immutable, ordered and sliceable sequence of labels. + The basic object storing row labels for all cuDF objects. + UInt8Index is a special case of Index with purely + integer(``uint64``) labels. + + Parameters + ---------- + data : array-like (1-dimensional) + dtype : NumPy dtype, + but not used. + copy : bool + Make a copy of input data. 
+ name : object + Name to be stored in the index. + + Returns + ------- + UInt8Index + """ + _dtype = np.uint8 class UInt16Index(NumericIndex): + """ + Immutable, ordered and sliceable sequence of labels. + The basic object storing row labels for all cuDF objects. + UInt16Index is a special case of Index with purely + integer(``uint16``) labels. + + Parameters + ---------- + data : array-like (1-dimensional) + dtype : NumPy dtype, + but not used. + copy : bool + Make a copy of input data. + name : object + Name to be stored in the index. + + Returns + ------- + UInt16Index + """ + _dtype = np.uint16 class UInt32Index(NumericIndex): + """ + Immutable, ordered and sliceable sequence of labels. + The basic object storing row labels for all cuDF objects. + UInt32Index is a special case of Index with purely + integer(``uint32``) labels. + + Parameters + ---------- + data : array-like (1-dimensional) + dtype : NumPy dtype, + but not used. + copy : bool + Make a copy of input data. + name : object + Name to be stored in the index. + + Returns + ------- + UInt32Index + """ + _dtype = np.uint32 class UInt64Index(NumericIndex): + """ + Immutable, ordered and sliceable sequence of labels. + The basic object storing row labels for all cuDF objects. + UInt64Index is a special case of Index with purely + integer(``uint64``) labels. + + Parameters + ---------- + data : array-like (1-dimensional) + dtype : NumPy dtype, + but not used. + copy : bool + Make a copy of input data. + name : object + Name to be stored in the index. + + Returns + ------- + UInt64Index + """ + _dtype = np.uint64 class Float32Index(NumericIndex): + """ + Immutable, ordered and sliceable sequence of labels. + The basic object storing row labels for all cuDF objects. + Float32Index is a special case of Index with purely + float(``float32``) labels. + + Parameters + ---------- + data : array-like (1-dimensional) + dtype : NumPy dtype, + but not used. + copy : bool + Make a copy of input data. + name : object + Name to be stored in the index. + + Returns + ------- + Float32Index + """ + _dtype = np.float32 class Float64Index(NumericIndex): + """ + Immutable, ordered and sliceable sequence of labels. + The basic object storing row labels for all cuDF objects. + Float64Index is a special case of Index with purely + float(``float64``) labels. + + Parameters + ---------- + data : array-like (1-dimensional) + dtype : NumPy dtype, + but not used. + copy : bool + Make a copy of input data. + name : object + Name to be stored in the index. + + Returns + ------- + Float64Index + """ + _dtype = np.float64 @@ -2357,6 +2471,31 @@ def is_leap_year(self): res = is_leap_year(self._values).fillna(False) return cupy.asarray(res) + @property + def quarter(self): + """ + Integer indicator for which quarter of the year the date belongs in. + + There are 4 quarters in a year. With the first quarter being from + January - March, second quarter being April - June, third quarter + being July - September and fourth quarter being October - December. + + Returns + ------- + Int8Index + Integer indicating which quarter the date belongs to. + + Examples + -------- + >>> import cudf + >>> gIndex = cudf.DatetimeIndex(["2020-05-31 08:00:00", + ... 
"1999-12-31 18:40:00"]) + >>> gIndex.quarter + Int8Index([2, 4], dtype='int8') + """ + res = extract_quarter(self._values) + return Int8Index(res, dtype="int8") + def to_pandas(self): nanos = self._values.astype("datetime64[ns]") return pd.DatetimeIndex(nanos.to_pandas(), name=self.name) @@ -2493,6 +2632,13 @@ def components(self): @property def inferred_freq(self): + """ + Infers frequency of TimedeltaIndex. + + Notes + ----- + This property is currently not supported. + """ raise NotImplementedError("inferred_freq is not yet supported") @@ -2798,7 +2944,7 @@ def from_breaks(breaks, closed="right", name=None, copy=False, dtype=None): Construct an IntervalIndex from an array of splits. Parameters - --------- + ---------- breaks : array-like (1-dimensional) Left and right bounds for each interval. closed : {"left", "right", "both", "neither"}, default "right" @@ -2878,7 +3024,7 @@ def __repr__(self): + ")" ) - @copy_docstring(StringMethods.__init__) # type: ignore + @copy_docstring(StringMethods) # type: ignore @property def str(self): return StringMethods(parent=self) @@ -3043,3 +3189,43 @@ def __new__( ) return as_index(data, copy=copy, dtype=dtype, name=name, **kwargs) + + +def _concat_range_index(indexes: List[RangeIndex]) -> BaseIndex: + """ + An internal Utility function to concat RangeIndex objects. + """ + start = step = next_ = None + + # Filter the empty indexes + non_empty_indexes = [obj for obj in indexes if len(obj)] + + if not non_empty_indexes: + # Here all "indexes" had 0 length, i.e. were empty. + # In this case return an empty range index. + return RangeIndex(0, 0) + + for obj in non_empty_indexes: + if start is None: + # This is set by the first non-empty index + start = obj.start + if step is None and len(obj) > 1: + step = obj.step + elif step is None: + # First non-empty index had only one element + if obj.start == start: + result = as_index(concat_columns([x._values for x in indexes])) + return result + step = obj.start - start + + non_consecutive = (step != obj.step and len(obj) > 1) or ( + next_ is not None and obj.start != next_ + ) + if non_consecutive: + result = as_index(concat_columns([x._values for x in indexes])) + return result + if step is not None: + next_ = obj[-1] + step + + stop = non_empty_indexes[-1].stop if next_ is None else next_ + return RangeIndex(start, stop, step) diff --git a/python/cudf/cudf/core/indexing.py b/python/cudf/cudf/core/indexing.py index a4a69a4e084..da999f13fa8 100755 --- a/python/cudf/cudf/core/indexing.py +++ b/python/cudf/cudf/core/indexing.py @@ -98,8 +98,9 @@ def __getitem__(self, arg): or _is_null_host_scalar(data) ): return data - index = self._sr.index.take(arg) - return self._sr._copy_construct(data=data, index=index) + return self._sr._from_data( + {self._sr.name: data}, index=cudf.Index(self._sr.index.take(arg)) + ) def __setitem__(self, key, value): from cudf.core.column import column @@ -431,7 +432,7 @@ def _setitem_tuple_arg(self, key, value): ) try: - columns = self._get_column_selection(key[1]) + columns_df = self._get_column_selection(key[1]) except KeyError: if not self._df.empty and isinstance(key[0], slice): pos_range = get_label_range_or_mask( @@ -456,8 +457,27 @@ def _setitem_tuple_arg(self, key, value): ) self._df._data.insert(key[1], new_col) else: - for col in columns: - self._df[col].loc[key[0]] = value + if isinstance(value, (cp.ndarray, np.ndarray)): + value_df = cudf.DataFrame(value) + if value_df.shape[1] != columns_df.shape[1]: + if value_df.shape[1] == 1: + value_cols = ( + value_df._data.columns * 
columns_df.shape[1] + ) + else: + raise ValueError( + f"shape mismatch: value array of shape " + f"{value_df.shape} could not be " + f"broadcast to indexing result of shape " + f"{columns_df.shape}" + ) + else: + value_cols = value_df._data.columns + for i, col in enumerate(columns_df._column_names): + self._df[col].loc[key[0]] = value_cols[i] + else: + for col in columns_df._column_names: + self._df[col].loc[key[0]] = value def _get_column_selection(self, arg): return self._df._get_columns_by_label(arg) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 51423d604c2..079a6d902b6 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -6,7 +6,7 @@ import pickle import warnings from collections.abc import Sequence -from typing import Any, List, Tuple, Union +from typing import Any, List, Mapping, Tuple, Union import cupy import numpy as np @@ -18,7 +18,6 @@ from cudf._typing import DataFrameOrSeries from cudf.core._compat import PANDAS_GE_120 from cudf.core.column import as_column, column -from cudf.core.column_accessor import ColumnAccessor from cudf.core.frame import SingleColumnFrame from cudf.core.index import BaseIndex, as_index from cudf.utils.utils import _maybe_indices_to_slice @@ -94,7 +93,6 @@ def __init__( self._name = None - column_names = [] if labels: warnings.warn( "the 'labels' keyword is deprecated, use 'codes' " "instead", @@ -124,17 +122,6 @@ def __init__( self._levels = levels return - # name setup - if isinstance(names, (Sequence, pd.core.indexes.frozen.FrozenList,),): - if sum(x is None for x in names) > 1: - column_names = list(range(len(codes))) - else: - column_names = names - elif names is None: - column_names = list(range(len(codes))) - else: - column_names = names - if len(levels) == 0: raise ValueError("Must pass non-zero number of levels/codes") @@ -147,10 +134,12 @@ def __init__( self._codes = codes elif len(levels) == len(codes): self._codes = cudf.DataFrame() - for i, codes in enumerate(codes): - name = column_names[i] or i - codes = column.as_column(codes) - self._codes[name] = codes.astype(np.int64) + self._codes = cudf.DataFrame._from_data( + { + i: column.as_column(code).astype(np.int64) + for i, code in enumerate(codes) + } + ) else: raise ValueError( "MultiIndex has unequal number of levels and " @@ -161,20 +150,20 @@ def __init__( self._validate_levels_and_codes(self._levels, self._codes) source_data = cudf.DataFrame() - for i, name in enumerate(self._codes.columns): - codes = as_index(self._codes[name]._column) - if -1 in self._codes[name].values: + for i, n in enumerate(self._codes.columns): + codes = as_index(self._codes[n]._column) + if -1 in self._codes[n].values: # Must account for null(s) in _source_data column level = cudf.DataFrame( - {name: [None] + list(self._levels[i])}, + {n: [None] + list(self._levels[i])}, index=range(-1, len(self._levels[i])), ) else: - level = cudf.DataFrame({name: self._levels[i]}) + level = cudf.DataFrame({n: self._levels[i]}) - source_data[name] = libcudf.copying.gather( + source_data[n] = libcudf.copying.gather( level, codes._data.columns[0] - )._data[name] + )[0][n] self._data = source_data._data self.names = names @@ -294,17 +283,15 @@ def set_names(self, names, level=None, inplace=False): return self._set_names(names=names, inplace=inplace) + # TODO: This type ignore is indicating a real problem, which is that + # MultiIndex should not be inheriting from SingleColumnFrame, but fixing + # that will have to wait until we reshuffle the 
Index hierarchy. @classmethod - def _from_data(cls, data: ColumnAccessor, index=None) -> MultiIndex: + def _from_data( # type: ignore + cls, data: Mapping, index=None + ) -> MultiIndex: return cls.from_frame(cudf.DataFrame._from_data(data)) - @classmethod - def _from_table(cls, table, names=None): - df = cudf.DataFrame(table._data) - if names is None: - names = df.columns - return MultiIndex.from_frame(df, names=names) - @property def shape(self): return (self._data.nrows, len(self._data.names)) @@ -612,6 +599,30 @@ def to_arrow(self): @property def codes(self): + """ + Returns the codes of the underlying MultiIndex. + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a':[1, 2, 3], 'b':[10, 11, 12]}) + >>> cudf.MultiIndex.from_frame(df) + MultiIndex([(1, 10), + (2, 11), + (3, 12)], + names=['a', 'b']) + >>> midx = cudf.MultiIndex.from_frame(df) + >>> midx + MultiIndex([(1, 10), + (2, 11), + (3, 12)], + names=['a', 'b']) + >>> midx.codes + a b + 0 0 0 + 1 1 1 + 2 2 2 + """ if self._codes is None: self._compute_levels_and_codes() return self._codes @@ -625,6 +636,37 @@ def nlevels(self): @property def levels(self): + """ + Returns list of levels in the MultiIndex + + Returns + ------- + List of Series objects + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a':[1, 2, 3], 'b':[10, 11, 12]}) + >>> cudf.MultiIndex.from_frame(df) + MultiIndex([(1, 10), + (2, 11), + (3, 12)], + names=['a', 'b']) + >>> midx = cudf.MultiIndex.from_frame(df) + >>> midx + MultiIndex([(1, 10), + (2, 11), + (3, 12)], + names=['a', 'b']) + >>> midx.levels + [0 1 + 1 2 + 2 3 + dtype: int64, 0 10 + 1 11 + 2 12 + dtype: int64] + """ if self._levels is None: self._compute_levels_and_codes() return self._levels @@ -778,8 +820,7 @@ def _compute_levels_and_codes(self): for name in self._source_data.columns: code, cats = self._source_data[name].factorize() codes[name] = code.astype(np.int64) - cats.name = None - cats = cudf.Series(cats)._copy_construct(name=None) + cats = cudf.Series(cats, name=None) levels.append(cats) self._levels = levels @@ -1055,10 +1096,12 @@ def __getitem__(self, index): match = self.take(index) if isinstance(index, slice): return match - result = [] - for level, item in enumerate(match.codes): - result.append(match.levels[level][match.codes[item].iloc[0]]) - return tuple(result) + if isinstance(index, int): + # we are indexing into a single row of the MultiIndex, + # return that row as a tuple: + return match.to_pandas()[0] + else: + return match def to_frame(self, index=True, name=None): df = self._source_data @@ -1126,6 +1169,37 @@ def _concat(cls, objs): @classmethod def from_tuples(cls, tuples, names=None): + """ + Convert list of tuples to MultiIndex. + + Parameters + ---------- + tuples : list / sequence of tuple-likes + Each tuple is the index of one row/column. + names : list / sequence of str, optional + Names for the levels in the index. + + Returns + ------- + MultiIndex + + See Also + -------- + MultiIndex.from_product : Make a MultiIndex from cartesian product + of iterables. + MultiIndex.from_frame : Make a MultiIndex from a DataFrame. + + Examples + -------- + >>> tuples = [(1, 'red'), (1, 'blue'), + ... 
(2, 'red'), (2, 'blue')] + >>> cudf.MultiIndex.from_tuples(tuples, names=('number', 'color')) + MultiIndex([(1, 'red'), + (1, 'blue'), + (2, 'red'), + (2, 'blue')], + names=['number', 'color']) + """ # Use Pandas for handling Python host objects pdi = pd.MultiIndex.from_tuples(tuples, names=names) result = cls.from_pandas(pdi) @@ -1190,11 +1264,97 @@ def values(self): return self._source_data.values @classmethod - def from_frame(cls, dataframe, names=None): - return cls(source_data=dataframe, names=names) + def from_frame(cls, df, names=None): + """ + Make a MultiIndex from a DataFrame. + + Parameters + ---------- + df : DataFrame + DataFrame to be converted to MultiIndex. + names : list-like, optional + If no names are provided, use the column names, or tuple of column + names if the columns is a MultiIndex. If a sequence, overwrite + names with the given sequence. + + Returns + ------- + MultiIndex + The MultiIndex representation of the given DataFrame. + + See Also + -------- + MultiIndex.from_tuples : Convert list of tuples to MultiIndex. + MultiIndex.from_product : Make a MultiIndex from cartesian product + of iterables. + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame([['HI', 'Temp'], ['HI', 'Precip'], + ... ['NJ', 'Temp'], ['NJ', 'Precip']], + ... columns=['a', 'b']) + >>> df + a b + 0 HI Temp + 1 HI Precip + 2 NJ Temp + 3 NJ Precip + >>> cudf.MultiIndex.from_frame(df) + MultiIndex([('HI', 'Temp'), + ('HI', 'Precip'), + ('NJ', 'Temp'), + ('NJ', 'Precip')], + names=['a', 'b']) + + Using explicit names, instead of the column names + + >>> cudf.MultiIndex.from_frame(df, names=['state', 'observation']) + MultiIndex([('HI', 'Temp'), + ('HI', 'Precip'), + ('NJ', 'Temp'), + ('NJ', 'Precip')], + names=['state', 'observation']) + """ + return cls(source_data=df, names=names) @classmethod def from_product(cls, arrays, names=None): + """ + Make a MultiIndex from the cartesian product of multiple iterables. + + Parameters + ---------- + iterables : list / sequence of iterables + Each iterable has unique labels for each level of the index. + names : list / sequence of str, optional + Names for the levels in the index. + If not explicitly provided, names will be inferred from the + elements of iterables if an element has a name attribute + + Returns + ------- + MultiIndex + + See Also + -------- + MultiIndex.from_tuples : Convert list of tuples to MultiIndex. + MultiIndex.from_frame : Make a MultiIndex from a DataFrame. + + Examples + -------- + >>> numbers = [0, 1, 2] + >>> colors = ['green', 'purple'] + >>> cudf.MultiIndex.from_product([numbers, colors], + ... 
names=['number', 'color']) + MultiIndex([(0, 'green'), + (0, 'purple'), + (1, 'green'), + (1, 'purple'), + (2, 'green'), + (2, 'purple')], + names=['number', 'color']) + """ # Use Pandas for handling Python host objects pdi = pd.MultiIndex.from_product(arrays, names=names) result = cls.from_pandas(pdi) @@ -1241,9 +1401,7 @@ def _poplevels(self, level): popped_data[n] = self._data.pop(n) # construct the popped result - popped = cudf.core.index.Index._from_table( - cudf.core.frame.Frame(popped_data) - ) + popped = cudf.Index._from_data(popped_data) popped.names = popped_names # update self diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 9d449d16401..1b8405af1a4 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -781,8 +781,8 @@ def merge_sorted( if by_index and ignore_index: raise ValueError("`by_index` and `ignore_index` cannot both be True") - result = objs[0].__class__._from_table( - cudf._lib.merge.merge_sorted( + result = objs[0].__class__._from_data( + *cudf._lib.merge.merge_sorted( objs, keys=keys, by_index=by_index, @@ -803,9 +803,9 @@ def _pivot(df, index, columns): Parameters ---------- df : DataFrame - index : cudf.core.index.Index + index : cudf.Index Index labels of the result - columns : cudf.core.index.Index + columns : cudf.Index Column labels of the result """ columns_labels, columns_idx = columns._encode() @@ -822,22 +822,31 @@ def as_tuple(x): for v in df: names = [as_tuple(v) + as_tuple(name) for name in column_labels] - col = df._data[v] - result.update( - cudf.DataFrame._from_table( - col.scatter_to_table( - index_idx, - columns_idx, - names, - nrows=len(index_labels), - ncols=len(names), - ) - )._data - ) - out = cudf.DataFrame._from_data( + nrows = len(index_labels) + ncols = len(names) + num_elements = nrows * ncols + if num_elements > 0: + col = df._data[v] + scatter_map = (columns_idx * np.int32(nrows)) + index_idx + target = cudf.core.frame.Frame( + { + None: cudf.core.column.column_empty_like( + col, masked=True, newsize=nrows * ncols + ) + } + ) + target._data[None][scatter_map] = col + result_frames = target._split(range(nrows, nrows * ncols, nrows)) + result.update( + { + name: next(iter(f._columns)) + for name, f in zip(names, result_frames) + } + ) + + return cudf.DataFrame._from_data( result, index=cudf.Index(index_labels, name=index.name) ) - return out def pivot(data, index=None, columns=None, values=None): diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py index c6663a25684..f425b650ee7 100644 --- a/python/cudf/cudf/core/scalar.py +++ b/python/cudf/cudf/core/scalar.py @@ -5,7 +5,7 @@ import pyarrow as pa from pandas._libs.missing import NAType as pd_NAType -from cudf._lib.scalar import DeviceScalar, _is_null_host_scalar +import cudf from cudf.core.column.column import ColumnBase from cudf.core.dtypes import Decimal64Dtype, ListDtype, StructDtype from cudf.core.index import BaseIndex @@ -17,45 +17,46 @@ class Scalar(object): + """ + A GPU-backed scalar object with NumPy scalar like properties + May be used in binary operations against other scalars, cuDF + Series, DataFrame, and Index objects. 
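The rewritten _pivot above replaces the per-column scatter_to_table call with a manually computed flat scatter map. As a minimal host-side sketch (NumPy only, with made-up toy values), the arithmetic places the value destined for output row i and output column j at flat position j * nrows + i, and splitting the flat buffer every nrows elements recovers one output column per label:

    import numpy as np

    nrows, ncols = 3, 2                          # toy pivot shape
    index_idx = np.array([0, 1, 2, 0, 1, 2])     # output row of each input value
    columns_idx = np.array([0, 0, 0, 1, 1, 1])   # output column of each input value
    values = np.array([10, 11, 12, 20, 21, 22])

    scatter_map = columns_idx * np.int32(nrows) + index_idx
    flat = np.full(nrows * ncols, -1)            # -1 stands in for a null slot
    flat[scatter_map] = values

    # one array of length nrows per output column label
    out_columns = np.split(flat, list(range(nrows, nrows * ncols, nrows)))
    print(out_columns)                           # [array([10, 11, 12]), array([20, 21, 22])]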
+ + Examples + -------- + >>> import cudf + >>> cudf.Scalar(42, dtype='int64') + Scalar(42, dtype=int64) + >>> cudf.Scalar(42, dtype='int32') + cudf.Scalar(42, dtype='float64') + Scalar(84.0, dtype=float64) + >>> cudf.Scalar(42, dtype='int64') + np.int8(21) + Scalar(63, dtype=int64) + >>> x = cudf.Scalar(42, dtype='datetime64[s]') + >>> y = cudf.Scalar(21, dtype='timedelta64[ns]) + >>> x - y + Scalar(1970-01-01T00:00:41.999999979, dtype=datetime64[ns]) + >>> cudf.Series([1,2,3]) + cudf.Scalar(1) + 0 2 + 1 3 + 2 4 + dtype: int64 + >>> df = cudf.DataFrame({'a':[1,2,3], 'b':[4.5, 5.5, 6.5]}) + >>> slr = cudf.Scalar(10, dtype='uint8') + >>> df - slr + a b + 0 -9 -5.5 + 1 -8 -4.5 + 2 -7 -3.5 + + Parameters + ---------- + value : Python Scalar, NumPy Scalar, or cuDF Scalar + The scalar value to be converted to a GPU backed scalar object + dtype : np.dtype or string specifier + The data type + """ + def __init__(self, value, dtype=None): - """ - A GPU-backed scalar object with NumPy scalar like properties - May be used in binary operations against other scalars, cuDF - Series, DataFrame, and Index objects. - - Examples - -------- - >>> import cudf - >>> cudf.Scalar(42, dtype='int64') - Scalar(42, dtype=int64) - >>> cudf.Scalar(42, dtype='int32') + cudf.Scalar(42, dtype='float64') - Scalar(84.0, dtype=float64) - >>> cudf.Scalar(42, dtype='int64') + np.int8(21) - Scalar(63, dtype=int64) - >>> x = cudf.Scalar(42, dtype='datetime64[s]') - >>> y = cudf.Scalar(21, dtype='timedelta64[ns]) - >>> x - y - Scalar(1970-01-01T00:00:41.999999979, dtype=datetime64[ns]) - >>> cudf.Series([1,2,3]) + cudf.Scalar(1) - 0 2 - 1 3 - 2 4 - dtype: int64 - >>> df = cudf.DataFrame({'a':[1,2,3], 'b':[4.5, 5.5, 6.5]}) - >>> slr = cudf.Scalar(10, dtype='uint8') - >>> df - slr - a b - 0 -9 -5.5 - 1 -8 -4.5 - 2 -7 -3.5 - - Parameters - ---------- - value : Python Scalar, NumPy Scalar, or cuDF Scalar - The scalar value to be converted to a GPU backed scalar object - dtype : np.dtype or string specifier - The data type - """ self._host_value = None self._host_dtype = None @@ -67,7 +68,7 @@ def __init__(self, value, dtype=None): self._host_dtype = value._host_dtype else: self._device_value = value._device_value - elif isinstance(value, DeviceScalar): + elif isinstance(value, cudf._lib.scalar.DeviceScalar): self._device_value = value else: self._host_value, self._host_dtype = self._preprocess_host_value( @@ -85,7 +86,7 @@ def _is_device_value_current(self): @property def device_value(self): if self._device_value is None: - self._device_value = DeviceScalar( + self._device_value = cudf._lib.scalar.DeviceScalar( self._host_value, self._host_dtype ) return self._device_value @@ -101,7 +102,7 @@ def value(self): def dtype(self): if self._is_host_value_current: if isinstance(self._host_value, str): - return np.dtype("object") + return cudf.dtype("object") else: return self._host_dtype else: @@ -110,13 +111,13 @@ def dtype(self): def is_valid(self): if not self._is_host_value_current: self._device_value_to_host() - return not _is_null_host_scalar(self._host_value) + return not cudf._lib.scalar._is_null_host_scalar(self._host_value) def _device_value_to_host(self): self._host_value = self._device_value._to_host_scalar() def _preprocess_host_value(self, value, dtype): - valid = not _is_null_host_scalar(value) + valid = not cudf._lib.scalar._is_null_host_scalar(value) if isinstance(value, list): if dtype is not None: @@ -171,7 +172,7 @@ def _preprocess_host_value(self, value, dtype): dtype = value.dtype if not isinstance(dtype, 
Decimal64Dtype): - dtype = np.dtype(dtype) + dtype = cudf.dtype(dtype) if not valid: value = NA @@ -186,7 +187,7 @@ def _sync(self): if self._is_host_value_current and self._is_device_value_current: return elif self._is_host_value_current and not self._is_device_value_current: - self._device_value = DeviceScalar( + self._device_value = cudf._lib.scalar.DeviceScalar( self._host_value, self._host_dtype ) elif self._is_device_value_current and not self._is_host_value_current: @@ -323,10 +324,10 @@ def _binop_result_dtype_or_error(self, other, op): and self.dtype.char == other.dtype.char == "M" ): res, _ = np.datetime_data(max(self.dtype, other.dtype)) - return np.dtype("m8" + f"[{res}]") + return cudf.dtype("m8" + f"[{res}]") return np.result_type(self.dtype, other.dtype) - return np.dtype(out_dtype) + return cudf.dtype(out_dtype) def _scalar_binop(self, other, op): if isinstance(other, (ColumnBase, Series, BaseIndex, np.ndarray)): @@ -357,9 +358,9 @@ def _unaop_result_type_or_error(self, op): if op in {"__ceil__", "__floor__"}: if self.dtype.char in "bBhHf?": - return np.dtype("float32") + return cudf.dtype("float32") else: - return np.dtype("float64") + return cudf.dtype("float64") return self.dtype def _scalar_unaop(self, op): diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index fb197fbc90d..ff3b9fc68ef 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -7,7 +7,7 @@ from collections import abc as abc from numbers import Number from shutil import get_terminal_size -from typing import Any, Optional +from typing import Any, MutableMapping, Optional from uuid import uuid4 import cupy @@ -64,6 +64,48 @@ class Series(SingleColumnFrame, Serializable): + """ + One-dimensional GPU array (including time series). + + Labels need not be unique but must be a hashable type. The object + supports both integer- and label-based indexing and provides a + host of methods for performing operations involving the index. + Statistical methods from ndarray have been overridden to + automatically exclude missing data (currently represented + as null/NaN). + + Operations between Series (`+`, `-`, `/`, `*`, `**`) align + values based on their associated index values-– they need + not be the same length. The result index will be the + sorted union of the two indexes. + + ``Series`` objects are used as columns of ``DataFrame``. + + Parameters + ---------- + data : array-like, Iterable, dict, or scalar value + Contains data stored in Series. + + index : array-like or Index (1d) + Values must be hashable and have the same length + as data. Non-unique index values are allowed. Will + default to RangeIndex (0, 1, 2, …, n) if not provided. + If both a dict and index sequence are used, the index will + override the keys found in the dict. + + dtype : str, numpy.dtype, or ExtensionDtype, optional + Data type for the output Series. If not specified, + this will be inferred from data. + + name : str, optional + The name to give to the Series. + + nan_as_null : bool, Default True + If ``None``/``True``, converts ``np.nan`` values to + ``null`` values. + If ``False``, leaves ``np.nan`` values as is. + """ + # The `constructor*` properties are used by `dask` (and `dask_cudf`) @property def _constructor(self): @@ -171,47 +213,6 @@ def from_masked_array(cls, data, mask, null_count=None): def __init__( self, data=None, index=None, dtype=None, name=None, nan_as_null=True, ): - """ - One-dimensional GPU array (including time series). 
- - Labels need not be unique but must be a hashable type. The object - supports both integer- and label-based indexing and provides a - host of methods for performing operations involving the index. - Statistical methods from ndarray have been overridden to - automatically exclude missing data (currently represented - as null/NaN). - - Operations between Series (`+`, `-`, `/`, `*`, `**`) align - values based on their associated index values-– they need - not be the same length. The result index will be the - sorted union of the two indexes. - - ``Series`` objects are used as columns of ``DataFrame``. - - Parameters - ---------- - data : array-like, Iterable, dict, or scalar value - Contains data stored in Series. - - index : array-like or Index (1d) - Values must be hashable and have the same length - as data. Non-unique index values are allowed. Will - default to RangeIndex (0, 1, 2, …, n) if not provided. - If both a dict and index sequence are used, the index will - override the keys found in the dict. - - dtype : str, numpy.dtype, or ExtensionDtype, optional - Data type for the output Series. If not specified, - this will be inferred from data. - - name : str, optional - The name to give to the Series. - - nan_as_null : bool, Default True - If ``None``/``True``, converts ``np.nan`` values to - ``null`` values. - If ``False``, leaves ``np.nan`` values as is. - """ if isinstance(data, pd.Series): if name is None: name = data.name @@ -266,29 +267,19 @@ def __init__( super().__init__({name: data}) self._index = RangeIndex(len(data)) if index is None else index - @classmethod - def _from_table(cls, table, index=None): - name, data = next(iter(table._data.items())) - if index is None: - if table._index is not None: - index = Index._from_table(table._index) - return cls(data=data, index=index, name=name) - @classmethod def _from_data( cls, - data: ColumnAccessor, - index: Optional[Index] = None, + data: MutableMapping, + index: Optional[BaseIndex] = None, name: Any = None, ) -> Series: """ Construct the Series from a ColumnAccessor """ - out = cls.__new__(cls) - out._data = data - out._index = index if index is not None else RangeIndex(data.nrows) - if name is not None: - out.name = name + out: Series = super()._from_data(data, index, name) + if index is None: + out._index = RangeIndex(out._data.nrows) return out def __contains__(self, item): @@ -392,10 +383,6 @@ def deserialize(cls, header, frames): return Series(column, index=index, name=name) - @property - def _copy_construct_defaults(self): - return {"data": self._column, "index": self._index, "name": self.name} - def _get_columns_by_label(self, labels, downcast=False): """Return the column specified by `labels` @@ -467,7 +454,7 @@ def drop( Return series without null values Series.drop_duplicates Return series with duplicate values removed - cudf.core.dataframe.DataFrame.drop + cudf.DataFrame.drop Drop specified labels from rows or columns in dataframe Examples @@ -708,7 +695,7 @@ def reset_index(self, drop=False, inplace=False): if inplace is True: self._index = RangeIndex(len(self)) else: - return self._copy_construct(index=RangeIndex(len(self))) + return self._from_data(self._data, index=RangeIndex(len(self))) def set_index(self, index): """Returns a new Series with a different index. 
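Several hunks in this file swap the removed _copy_construct helper for the _from_data classmethod whose signature appears above (a mapping of column name to column, an optional index, an optional name). A purely illustrative sketch of that construction path, using internal APIs that are not part of the public interface:

    import cudf
    from cudf.core.column import as_column

    # build a Series directly from {name: column}; with index omitted a
    # RangeIndex of matching length is attached, per the code above
    sr = cudf.Series._from_data({"a": as_column([1, 2, 3])})
    print(sr)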
@@ -743,7 +730,7 @@ def set_index(self, index): dtype: int64 """ index = index if isinstance(index, BaseIndex) else as_index(index) - return self._copy_construct(index=index) + return self._from_data(self._data, index, self.name) def as_index(self): """Returns a new Series with a RangeIndex. @@ -855,8 +842,14 @@ def set_mask(self, mask, null_count=None): 4 5 dtype: int64 """ - col = self._column.set_mask(mask) - return self._copy_construct(data=col) + warnings.warn( + "Series.set_mask is deprecated and will be removed " + "in the future.", + DeprecationWarning, + ) + return self._from_data( + {self.name: self._column.set_mask(mask)}, self._index + ) def __sizeof__(self): return self._column.__sizeof__() + self._index.__sizeof__() @@ -884,7 +877,7 @@ def memory_usage(self, index=True, deep=False): See Also -------- - cudf.core.dataframe.DataFrame.memory_usage : Bytes consumed by + cudf.DataFrame.memory_usage : Bytes consumed by a DataFrame. Examples @@ -1097,8 +1090,9 @@ def take(self, indices, keep_index=True): return self.iloc[indices] else: col_inds = as_column(indices) - data = self._column.take(col_inds, keep_index=False) - return self._copy_construct(data=data, index=None) + return self._from_data( + {self.name: self._column.take(col_inds, keep_index=False)} + ) def head(self, n=5): """ @@ -2349,22 +2343,22 @@ def __invert__(self): f"Operation `~` not supported on {self.dtype.type.__name__}" ) - @copy_docstring(CategoricalAccessor.__init__) # type: ignore + @copy_docstring(CategoricalAccessor) # type: ignore @property def cat(self): return CategoricalAccessor(parent=self) - @copy_docstring(StringMethods.__init__) # type: ignore + @copy_docstring(StringMethods) # type: ignore @property def str(self): return StringMethods(parent=self) - @copy_docstring(ListMethods.__init__) # type: ignore + @copy_docstring(ListMethods) # type: ignore @property def list(self): return ListMethods(parent=self) - @copy_docstring(StructMethods.__init__) # type: ignore + @copy_docstring(StructMethods) # type: ignore @property def struct(self): return StructMethods(parent=self) @@ -2508,10 +2502,10 @@ def dropna(self, axis=0, inplace=False, how=None): Series.fillna : Replace null values. - cudf.core.dataframe.DataFrame.dropna : Drop rows or columns which + cudf.DataFrame.dropna : Drop rows or columns which contain null values. - cudf.core.index.Index.dropna : Drop null indices. + cudf.Index.dropna : Drop null indices. Examples -------- @@ -2727,113 +2721,23 @@ def nans_to_nulls(self): 4 10.0 dtype: float64 """ - result_col = self._column.nans_to_nulls() - return self._copy_construct(data=result_col) + return self._from_data( + {self.name: self._column.nans_to_nulls()}, self._index + ) def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): - """ - Return whether all elements are True in Series. - - Parameters - ---------- - - skipna : bool, default True - Exclude NA/null values. If the entire row/column is NA and - skipna is True, then the result will be True, as for an - empty row/column. - If skipna is False, then NA are treated as True, because - these are not equal to zero. - - Returns - ------- - scalar - - Notes - ----- - Parameters currently not supported are `axis`, `bool_only`, `level`. 
- - Examples - -------- - >>> import cudf - >>> ser = cudf.Series([1, 5, 2, 4, 3]) - >>> ser.all() - True - """ - - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") - - if level is not None: - raise NotImplementedError("level parameter is not implemented yet") - if bool_only not in (None, True): raise NotImplementedError( - "bool_only parameter is not implemented yet" + "The bool_only parameter is not supported for Series." ) - - if skipna: - result_series = self.nans_to_nulls() - if len(result_series) == result_series.null_count: - return True - else: - result_series = self - return result_series._column.all() + return super().all(axis, skipna, level, **kwargs) def any(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): - """ - Return whether any elements is True in Series. - - Parameters - ---------- - - skipna : bool, default True - Exclude NA/null values. If the entire row/column is NA and - skipna is True, then the result will be False, as for an - empty row/column. - If skipna is False, then NA are treated as True, because - these are not equal to zero. - - Returns - ------- - scalar - - Notes - ----- - Parameters currently not supported are `axis`, `bool_only`, `level`. - - Examples - -------- - >>> import cudf - >>> ser = cudf.Series([1, 5, 2, 4, 3]) - >>> ser.any() - True - """ - - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") - - if level is not None: - raise NotImplementedError("level parameter is not implemented yet") - if bool_only not in (None, True): raise NotImplementedError( - "bool_only parameter is not implemented yet" + "The bool_only parameter is not supported for Series." ) - - skipna = False if skipna is None else skipna - - if skipna is False and self.has_nulls: - return True - - if skipna: - result_series = self.nans_to_nulls() - if len(result_series) == result_series.null_count: - return False - - else: - result_series = self - - return result_series._column.any() + return super().any(axis, skipna, level, **kwargs) def to_pandas(self, index=True, nullable=False, **kwargs): """ @@ -2941,7 +2845,7 @@ def loc(self): See also -------- - cudf.core.dataframe.DataFrame.loc + cudf.DataFrame.loc Examples -------- @@ -2964,7 +2868,7 @@ def iloc(self): See also -------- - cudf.core.dataframe.DataFrame.iloc + cudf.DataFrame.iloc Examples -------- @@ -3106,8 +3010,9 @@ def astype(self, dtype, copy=False, errors="raise"): try: data = self._column.astype(dtype) - return self._copy_construct( - data=data.copy(deep=True) if copy else data, index=self.index + return self._from_data( + {self.name: (data.copy(deep=True) if copy else data)}, + index=self._index, ) except Exception as e: @@ -3421,8 +3326,8 @@ def _sort(self, ascending=True, na_position="last"): col_keys, col_inds = self._column.sort_by_values( ascending=ascending, na_position=na_position ) - sr_keys = self._copy_construct(data=col_keys) - sr_inds = self._copy_construct(data=col_inds) + sr_keys = self._from_data({self.name: col_keys}, self._index) + sr_inds = self._from_data({self.name: col_inds}, self._index) return sr_keys, sr_inds def replace( @@ -3725,9 +3630,9 @@ def reverse(self): dtype: int64 """ rinds = column.arange((self._column.size - 1), -1, -1, dtype=np.int32) - col = self._column[rinds] - index = self.index._values[rinds] - return self._copy_construct(data=col, index=index) + return self._from_data( + {self.name: self._column[rinds]}, self.index._values[rinds] + ) def one_hot_encoding(self, cats, 
dtype="float64"): """Perform one-hot-encoding @@ -3774,7 +3679,7 @@ def one_hot_encoding(self, cats, dtype="float64"): cats = cats.to_pandas() else: cats = pd.Series(cats, dtype="object") - dtype = np.dtype(dtype) + dtype = cudf.dtype(dtype) def encode(cat): if cat is None: @@ -3881,7 +3786,9 @@ def _return_sentinel_series(): codes = codes.merge(value, on="value", how="left") codes = codes.sort_values("order")["code"].fillna(na_sentinel) - return codes._copy_construct(name=None, index=self.index) + codes.name = None + codes.index = self._index + return codes # UDF related @@ -3995,7 +3902,7 @@ def applymap(self, udf, out_dtype=None): """ if not callable(udf): raise ValueError("Input UDF must be a callable object.") - return self._copy_construct(data=self._unaryop(udf)) + return self._from_data({self.name: self._unaryop(udf)}, self._index) # # Stats @@ -4026,932 +3933,155 @@ def count(self, level=None, **kwargs): return self.valid_count - def min( - self, - axis=None, - skipna=None, - dtype=None, - level=None, - numeric_only=None, - **kwargs, - ): + def mode(self, dropna=True): """ - Return the minimum of the values in the Series. + Return the mode(s) of the dataset. + + Always returns Series even if only one value is returned. Parameters ---------- - - skipna : bool, default True - Exclude NA/null values when computing the result. - - dtype : data type - Data type to cast the result to. + dropna : bool, default True + Don't consider counts of NA/NaN/NaT. Returns ------- - scalar - - Notes - ----- - Parameters currently not supported are `axis`, `level`, `numeric_only`. + Series + Modes of the Series in sorted order. Examples -------- >>> import cudf - >>> ser = cudf.Series([1, 5, 2, 4, 3]) - >>> ser.min() - 1 - """ - - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") + >>> series = cudf.Series([7, 6, 5, 4, 3, 2, 1]) + >>> series + 0 7 + 1 6 + 2 5 + 3 4 + 4 3 + 5 2 + 6 1 + dtype: int64 + >>> series.mode() + 0 1 + 1 2 + 2 3 + 3 4 + 4 5 + 5 6 + 6 7 + dtype: int64 - if level is not None: - raise NotImplementedError("level parameter is not implemented yet") + We can include ```` values in mode by + passing ``dropna=False``. - if numeric_only not in (None, True): - raise NotImplementedError( - "numeric_only parameter is not implemented yet" - ) + >>> series = cudf.Series([7, 4, 3, 3, 7, None, None]) + >>> series + 0 7 + 1 4 + 2 3 + 3 3 + 4 7 + 5 + 6 + dtype: int64 + >>> series.mode() + 0 3 + 1 7 + dtype: int64 + >>> series.mode(dropna=False) + 0 3 + 1 7 + 2 + dtype: int64 + """ + val_counts = self.value_counts(ascending=False, dropna=dropna) + if len(val_counts) > 0: + val_counts = val_counts[val_counts == val_counts.iloc[0]] - return self._column.min(skipna=skipna, dtype=dtype) + return Series(val_counts.index.sort_values(), name=self.name) - def max( - self, - axis=None, - skipna=None, - dtype=None, - level=None, - numeric_only=None, - **kwargs, - ): + def round(self, decimals=0, how="half_even"): """ - Return the maximum of the values in the Series. + Round each value in a Series to the given number of decimals. Parameters ---------- - - skipna : bool, default True - Exclude NA/null values when computing the result. - - dtype : data type - Data type to cast the result to. + decimals : int, default 0 + Number of decimal places to round to. If decimals is negative, + it specifies the number of positions to the left of the decimal + point. + how : str, optional + Type of rounding. Can be either "half_even" (default) + of "half_up" rounding. 
Returns ------- - scalar - - Notes - ----- - Parameters currently not supported are `axis`, `level`, `numeric_only`. + Series + Rounded values of the Series. Examples -------- - >>> import cudf - >>> ser = cudf.Series([1, 5, 2, 4, 3]) - >>> ser.max() - 5 + >>> s = cudf.Series([0.1, 1.4, 2.9]) + >>> s.round() + 0 0.0 + 1 1.0 + 2 3.0 + dtype: float64 """ + return Series( + self._column.round(decimals=decimals, how=how), + name=self.name, + index=self.index, + dtype=self.dtype, + ) - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") - - if level is not None: - raise NotImplementedError("level parameter is not implemented yet") - - if numeric_only not in (None, True): - raise NotImplementedError( - "numeric_only parameter is not implemented yet" - ) - - return self._column.max(skipna=skipna, dtype=dtype) - - def sum( - self, - axis=None, - skipna=None, - dtype=None, - level=None, - numeric_only=None, - min_count=0, - **kwargs, - ): + def cov(self, other, min_periods=None): """ - Return sum of the values in the Series. + Compute covariance with Series, excluding missing values. Parameters ---------- - - skipna : bool, default True - Exclude NA/null values when computing the result. - - dtype : data type - Data type to cast the result to. - - min_count : int, default 0 - The required number of valid values to perform the operation. - If fewer than min_count non-NA values are present the result - will be NA. - - The default being 0. This means the sum of an all-NA or empty - Series is 0, and the product of an all-NA or empty Series is 1. + other : Series + Series with which to compute the covariance. Returns ------- - scalar + float + Covariance between Series and other normalized by N-1 + (unbiased estimator). Notes ----- - Parameters currently not supported are `axis`, `level`, `numeric_only`. + `min_periods` parameter is not yet supported. Examples -------- >>> import cudf - >>> ser = cudf.Series([1, 5, 2, 4, 3]) - >>> ser.sum() - 15 + >>> ser1 = cudf.Series([0.9, 0.13, 0.62]) + >>> ser2 = cudf.Series([0.12, 0.26, 0.51]) + >>> ser1.cov(ser2) + -0.015750000000000004 """ - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") - - if level is not None: - raise NotImplementedError("level parameter is not implemented yet") - - if numeric_only not in (None, True): + if min_periods is not None: raise NotImplementedError( - "numeric_only parameter is not implemented yet" + "min_periods parameter is not implemented yet" ) - return self._column.sum( - skipna=skipna, dtype=dtype, min_count=min_count - ) - - def product( - self, - axis=None, - skipna=None, - dtype=None, - level=None, - numeric_only=None, - min_count=0, - **kwargs, - ): - """ - Return product of the values in the Series. - - Parameters - ---------- - - skipna : bool, default True - Exclude NA/null values when computing the result. - - dtype : data type - Data type to cast the result to. + if self.empty or other.empty: + return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) - min_count : int, default 0 - The required number of valid values to perform the operation. - If fewer than min_count non-NA values are present the result - will be NA. + lhs = self.nans_to_nulls().dropna() + rhs = other.nans_to_nulls().dropna() - The default being 0. This means the sum of an all-NA or empty - Series is 0, and the product of an all-NA or empty Series is 1. 
+ lhs, rhs = _align_indices([lhs, rhs], how="inner") - Returns - ------- - scalar + return lhs._column.cov(rhs._column) - Notes - ----- - Parameters currently not supported are `axis`, `level`, `numeric_only`. - - Examples - -------- - >>> import cudf - >>> ser = cudf.Series([1, 5, 2, 4, 3]) - >>> ser.product() - 120 - """ - - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") - - if level is not None: - raise NotImplementedError("level parameter is not implemented yet") - - if numeric_only not in (None, True): - raise NotImplementedError( - "numeric_only parameter is not implemented yet" - ) - - return self._column.product( - skipna=skipna, dtype=dtype, min_count=min_count - ) - - prod = product - - def cummin(self, axis=None, skipna=True, *args, **kwargs): - """ - Return cumulative minimum of the Series. - - Parameters - ---------- - - skipna : bool, default True - Exclude NA/null values. If an entire row/column is NA, - the result will be NA. - - Returns - ------- - Series - - Notes - ----- - Parameters currently not supported is `axis` - - Examples - -------- - >>> import cudf - >>> ser = cudf.Series([1, 5, 2, 4, 3]) - >>> ser.cummin() - 0 1 - 1 1 - 2 1 - 3 1 - 4 1 - """ - - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") - - skipna = True if skipna is None else skipna - - if skipna: - result_col = self.nans_to_nulls()._column - else: - result_col = self._column.copy() - if result_col.has_nulls: - # Workaround as find_first_value doesn't seem to work - # incase of bools. - first_index = int( - result_col.isnull().astype("int8").find_first_value(1) - ) - result_col[first_index:] = None - - return Series( - result_col._apply_scan_op("min"), name=self.name, index=self.index, - ) - - def cummax(self, axis=0, skipna=True, *args, **kwargs): - """ - Return cumulative maximum of the Series. - - Parameters - ---------- - - skipna : bool, default True - Exclude NA/null values. If an entire row/column is NA, - the result will be NA. - - Returns - ------- - Series - - Notes - ----- - Parameters currently not supported is `axis` - - Examples - -------- - >>> import cudf - >>> ser = cudf.Series([1, 5, 2, 4, 3]) - >>> ser.cummax() - 0 1 - 1 5 - 2 5 - 3 5 - 4 5 - """ - assert axis in (None, 0) - - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") - - skipna = True if skipna is None else skipna - - if skipna: - result_col = self.nans_to_nulls()._column - else: - result_col = self._column.copy() - if result_col.has_nulls: - first_index = int( - result_col.isnull().astype("int8").find_first_value(1) - ) - result_col[first_index:] = None - - return Series( - result_col._apply_scan_op("max"), name=self.name, index=self.index, - ) - - def cumsum(self, axis=0, skipna=True, *args, **kwargs): - """ - Return cumulative sum of the Series. - - Parameters - ---------- - - skipna : bool, default True - Exclude NA/null values. If an entire row/column is NA, - the result will be NA. 
- - - Returns - ------- - Series - - Notes - ----- - Parameters currently not supported is `axis` - - Examples - -------- - >>> import cudf - >>> ser = cudf.Series([1, 5, 2, 4, 3]) - >>> ser.cumsum() - 0 1 - 1 6 - 2 8 - 3 12 - 4 15 - """ - - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") - - skipna = True if skipna is None else skipna - - if skipna: - result_col = self.nans_to_nulls()._column - else: - result_col = self._column.copy() - if result_col.has_nulls: - first_index = int( - result_col.isnull().astype("int8").find_first_value(1) - ) - result_col[first_index:] = None - - # pandas always returns int64 dtype if original dtype is int or `bool` - if not is_decimal_dtype(result_col.dtype) and ( - np.issubdtype(result_col.dtype, np.integer) - or np.issubdtype(result_col.dtype, np.bool_) - ): - return Series( - result_col.astype(np.int64)._apply_scan_op("sum"), - name=self.name, - index=self.index, - ) - else: - return Series( - result_col._apply_scan_op("sum"), - name=self.name, - index=self.index, - ) - - def cumprod(self, axis=0, skipna=True, *args, **kwargs): - """ - Return cumulative product of the Series. - - Parameters - ---------- - - skipna : bool, default True - Exclude NA/null values. If an entire row/column is NA, - the result will be NA. - - Returns - ------- - Series - - Notes - ----- - Parameters currently not supported is `axis` - - Examples - -------- - >>> import cudf - >>> ser = cudf.Series([1, 5, 2, 4, 3]) - >>> ser.cumprod() - 0 1 - 1 5 - 2 10 - 3 40 - 4 120 - """ - - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") - - if is_decimal_dtype(self.dtype): - raise NotImplementedError( - "cumprod does not currently support decimal types" - ) - - skipna = True if skipna is None else skipna - - if skipna: - result_col = self.nans_to_nulls()._column - else: - result_col = self._column.copy() - if result_col.has_nulls: - first_index = int( - result_col.isnull().astype("int8").find_first_value(1) - ) - result_col[first_index:] = None - - # pandas always returns int64 dtype if original dtype is int or `bool` - if np.issubdtype(result_col.dtype, np.integer) or np.issubdtype( - result_col.dtype, np.bool_ - ): - return Series( - result_col.astype(np.int64)._apply_scan_op("product"), - name=self.name, - index=self.index, - ) - else: - return Series( - result_col._apply_scan_op("product"), - name=self.name, - index=self.index, - ) - - def mean( - self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs - ): - """ - Return the mean of the values in the series. - - Parameters - ---------- - - skipna : bool, default True - Exclude NA/null values when computing the result. - - Returns - ------- - scalar - - Notes - ----- - Parameters currently not supported are `axis`, `level` and - `numeric_only` - - Examples - -------- - >>> import cudf - >>> ser = cudf.Series([10, 25, 3, 25, 24, 6]) - >>> ser.mean() - 15.5 - """ - - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") - - if level is not None: - raise NotImplementedError("level parameter is not implemented yet") - - if numeric_only not in (None, True): - raise NotImplementedError( - "numeric_only parameter is not implemented yet" - ) - - return self._column.mean(skipna=skipna) - - def std( - self, - axis=None, - skipna=None, - level=None, - ddof=1, - numeric_only=None, - **kwargs, - ): - """ - Return sample standard deviation of the Series. - - Normalized by N-1 by default. 
This can be changed using - the `ddof` argument - - Parameters - ---------- - - skipna : bool, default True - Exclude NA/null values. If an entire row/column is NA, the result - will be NA. - - ddof : int, default 1 - Delta Degrees of Freedom. The divisor used in calculations - is N - ddof, where N represents the number of elements. - - Returns - ------- - scalar - - Notes - ----- - Parameters currently not supported are `axis`, `level` and - `numeric_only` - - Examples - -------- - >>> import cudf - >>> series = cudf.Series([10, 10, 20, 30, 40]) - >>> series - 0 10 - 1 10 - 2 20 - 3 30 - 4 40 - dtype: int64 - >>> series.std() - 13.038404810405298 - >>> series.std(ddof=2) - 15.05545305418162 - """ - - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") - - if level is not None: - raise NotImplementedError("level parameter is not implemented yet") - - if numeric_only not in (None, True): - raise NotImplementedError( - "numeric_only parameter is not implemented yet" - ) - - return self._column.std(skipna=skipna, ddof=ddof) - - def var( - self, - axis=None, - skipna=None, - level=None, - ddof=1, - numeric_only=None, - **kwargs, - ): - """ - Return unbiased variance of the Series. - - Normalized by N-1 by default. This can be changed using the - ddof argument - - Parameters - ---------- - - skipna : bool, default True - Exclude NA/null values. If an entire row/column is NA, the result - will be NA. - - ddof : int, default 1 - Delta Degrees of Freedom. The divisor used in calculations is - N - ddof, where N represents the number of elements. - - Returns - ------- - scalar - - Notes - ----- - Parameters currently not supported are `axis`, `level` and - `numeric_only` - - Examples - -------- - >>> import cudf - >>> series = cudf.Series([10, 11, 12, 0, 1]) - >>> series - 0 10 - 1 11 - 2 12 - 3 0 - 4 1 - dtype: int64 - >>> series.var() - 33.7 - """ - - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") - - if level is not None: - raise NotImplementedError("level parameter is not implemented yet") - - if numeric_only not in (None, True): - raise NotImplementedError( - "numeric_only parameter is not implemented yet" - ) - - return self._column.var(skipna=skipna, ddof=ddof) - - def sum_of_squares(self, dtype=None): - return self._column.sum_of_squares(dtype=dtype) - - def median( - self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs - ): - """ - Return the median of the values for the requested axis. - - Parameters - ---------- - - skipna : bool, default True - Exclude NA/null values when computing the result. - - Returns - ------- - scalar - - Notes - ----- - Parameters currently not supported are `axis`, `level` and - `numeric_only` - - Examples - -------- - >>> import cudf - >>> ser = cudf.Series([10, 25, 3, 25, 24, 6]) - >>> ser - 0 10 - 1 25 - 2 3 - 3 25 - 4 24 - 5 6 - dtype: int64 - >>> ser.median() - 17.0 - """ - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") - - if level is not None: - raise NotImplementedError("level parameter is not implemented yet") - - if numeric_only not in (None, True): - raise NotImplementedError( - "numeric_only parameter is not implemented yet" - ) - - return self._column.median(skipna=skipna) - - def mode(self, dropna=True): - """ - Return the mode(s) of the dataset. - - Always returns Series even if only one value is returned. 
- - Parameters - ---------- - dropna : bool, default True - Don't consider counts of NA/NaN/NaT. - - Returns - ------- - Series - Modes of the Series in sorted order. - - Examples - -------- - >>> import cudf - >>> series = cudf.Series([7, 6, 5, 4, 3, 2, 1]) - >>> series - 0 7 - 1 6 - 2 5 - 3 4 - 4 3 - 5 2 - 6 1 - dtype: int64 - >>> series.mode() - 0 1 - 1 2 - 2 3 - 3 4 - 4 5 - 5 6 - 6 7 - dtype: int64 - - We can include ```` values in mode by - passing ``dropna=False``. - - >>> series = cudf.Series([7, 4, 3, 3, 7, None, None]) - >>> series - 0 7 - 1 4 - 2 3 - 3 3 - 4 7 - 5 - 6 - dtype: int64 - >>> series.mode() - 0 3 - 1 7 - dtype: int64 - >>> series.mode(dropna=False) - 0 3 - 1 7 - 2 - dtype: int64 - """ - val_counts = self.value_counts(ascending=False, dropna=dropna) - if len(val_counts) > 0: - val_counts = val_counts[val_counts == val_counts.iloc[0]] - - return Series(val_counts.index.sort_values(), name=self.name) - - def round(self, decimals=0, how="half_even"): - """ - Round each value in a Series to the given number of decimals. - - Parameters - ---------- - decimals : int, default 0 - Number of decimal places to round to. If decimals is negative, - it specifies the number of positions to the left of the decimal - point. - how : str, optional - Type of rounding. Can be either "half_even" (default) - of "half_up" rounding. - - Returns - ------- - Series - Rounded values of the Series. - - Examples - -------- - >>> s = cudf.Series([0.1, 1.4, 2.9]) - >>> s.round() - 0 0.0 - 1 1.0 - 2 3.0 - dtype: float64 - """ - return Series( - self._column.round(decimals=decimals, how=how), - name=self.name, - index=self.index, - dtype=self.dtype, - ) - - def kurtosis( - self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs - ): - """ - Return Fisher's unbiased kurtosis of a sample. - - Kurtosis obtained using Fisher’s definition of - kurtosis (kurtosis of normal == 0.0). Normalized by N-1. - - Parameters - ---------- - - skipna : bool, default True - Exclude NA/null values when computing the result. - - Returns - ------- - scalar - - Notes - ----- - Parameters currently not supported are `axis`, `level` and - `numeric_only` - - Examples - -------- - >>> import cudf - >>> series = cudf.Series([1, 2, 3, 4]) - >>> series.kurtosis() - -1.1999999999999904 - """ - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") - - if level is not None: - raise NotImplementedError("level parameter is not implemented yet") - - if numeric_only not in (None, True): - raise NotImplementedError( - "numeric_only parameter is not implemented yet" - ) - - return self._column.kurtosis(skipna=skipna) - - # Alias for kurtosis. - kurt = kurtosis - - def skew( - self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs - ): - """ - Return unbiased Fisher-Pearson skew of a sample. - - Parameters - ---------- - skipna : bool, default True - Exclude NA/null values when computing the result. 
- - Returns - ------- - scalar - - Notes - ----- - Parameters currently not supported are `axis`, `level` and - `numeric_only` - - Examples - -------- - >>> import cudf - >>> series = cudf.Series([1, 2, 3, 4, 5, 6, 6]) - >>> series - 0 1 - 1 2 - 2 3 - 3 4 - 4 5 - 5 6 - 6 6 - dtype: int64 - >>> series.skew() - -0.288195490292614 - """ - - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") - - if level is not None: - raise NotImplementedError("level parameter is not implemented yet") - - if numeric_only not in (None, True): - raise NotImplementedError( - "numeric_only parameter is not implemented yet" - ) - - return self._column.skew(skipna=skipna) - - def cov(self, other, min_periods=None): - """ - Compute covariance with Series, excluding missing values. - - Parameters - ---------- - other : Series - Series with which to compute the covariance. - - Returns - ------- - float - Covariance between Series and other normalized by N-1 - (unbiased estimator). - - Notes - ----- - `min_periods` parameter is not yet supported. - - Examples - -------- - >>> import cudf - >>> ser1 = cudf.Series([0.9, 0.13, 0.62]) - >>> ser2 = cudf.Series([0.12, 0.26, 0.51]) - >>> ser1.cov(ser2) - -0.015750000000000004 - """ - - if min_periods is not None: - raise NotImplementedError( - "min_periods parameter is not implemented yet" - ) - - if self.empty or other.empty: - return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) - - lhs = self.nans_to_nulls().dropna() - rhs = other.nans_to_nulls().dropna() - - lhs, rhs = _align_indices([lhs, rhs], how="inner") - - return lhs._column.cov(rhs._column) - - def corr(self, other, method="pearson", min_periods=None): - """Calculates the sample correlation between two Series, - excluding missing values. + def corr(self, other, method="pearson", min_periods=None): + """Calculates the sample correlation between two Series, + excluding missing values. Examples -------- @@ -4962,7 +4092,11 @@ def corr(self, other, method="pearson", min_periods=None): -0.20454263717316112 """ - assert method in ("pearson",) and min_periods in (None,) + if method not in ("pearson",): + raise ValueError(f"Unknown method {method}") + + if min_periods not in (None,): + raise NotImplementedError("Unsupported argument 'min_periods'") if self.empty or other.empty: return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) @@ -5149,7 +4283,7 @@ def value_counts( Series.count Number of non-NA elements in a Series. - cudf.core.dataframe.DataFrame.count + cudf.DataFrame.count Number of non-NA elements in a DataFrame. Examples @@ -5260,7 +4394,8 @@ def scale(self): vmin = self.min() vmax = self.max() scaled = (self - vmin) / (vmax - vmin) - return self._copy_construct(data=scaled) + scaled._index = self._index.copy(deep=False) + return scaled # Absolute def abs(self): @@ -5411,7 +4546,8 @@ def hash_encode(self, stop, use_name=False): 2 76 dtype: int32 """ - assert stop > 0 + if not stop > 0: + raise ValueError("stop must be a positive integer.") initial_hash = [hash(self.name) & 0xFFFFFFFF] if use_name else None hashed_values = Series(self._hash(initial_hash)) @@ -5755,7 +4891,7 @@ def diff(self, periods=1): return Series(output_col, name=self.name, index=self.index) - @copy_docstring(SeriesGroupBy.__init__) + @copy_docstring(SeriesGroupBy) def groupby( self, by=None, @@ -6438,6 +5574,42 @@ def is_leap_year(self): ------- Series Booleans indicating if dates belong to a leap year. + + Example + ------- + >>> import pandas as pd, cudf + >>> s = cudf.Series( + ... 
pd.date_range(start='2000-02-01', end='2013-02-01', freq='1Y')) + >>> s + 0 2000-12-31 + 1 2001-12-31 + 2 2002-12-31 + 3 2003-12-31 + 4 2004-12-31 + 5 2005-12-31 + 6 2006-12-31 + 7 2007-12-31 + 8 2008-12-31 + 9 2009-12-31 + 10 2010-12-31 + 11 2011-12-31 + 12 2012-12-31 + dtype: datetime64[ns] + >>> s.dt.is_leap_year + 0 True + 1 False + 2 False + 3 False + 4 True + 5 False + 6 False + 7 False + 8 True + 9 False + 10 False + 11 False + 12 True + dtype: bool """ res = libcudf.datetime.is_leap_year(self.series._column).fillna(False) return Series._from_data( @@ -6447,17 +5619,304 @@ def is_leap_year(self): ) @property - def is_month_start(self): + def quarter(self): """ - Boolean indicator if the date is the first day of the month. + Integer indicator for which quarter of the year the date belongs in. + + There are 4 quarters in a year. With the first quarter being from + January - March, second quarter being April - June, third quarter + being July - September and fourth quarter being October - December. Returns ------- Series + Integer indicating which quarter the date belongs to. + + Examples + ------- + >>> import cudf + >>> s = cudf.Series(["2020-05-31 08:00:00","1999-12-31 18:40:00"], + ... dtype="datetime64[ms]") + >>> s.dt.quarter + 0 2 + 1 4 + dtype: int8 + """ + res = libcudf.datetime.extract_quarter(self.series._column).astype( + np.int8 + ) + return Series._from_data( + {None: res}, index=self.series._index, name=self.series.name, + ) + + @property + def is_month_start(self): + """ Booleans indicating if dates are the first day of the month. """ return (self.day == 1).fillna(False) + @property + def days_in_month(self): + """ + Get the total number of days in the month that the date falls on. + + Returns + ------- + Series + Integers representing the number of days in month + + Example + ------- + >>> import pandas as pd, cudf + >>> s = cudf.Series( + ... pd.date_range(start='2000-08-01', end='2001-08-01', freq='1M')) + >>> s + 0 2000-08-31 + 1 2000-09-30 + 2 2000-10-31 + 3 2000-11-30 + 4 2000-12-31 + 5 2001-01-31 + 6 2001-02-28 + 7 2001-03-31 + 8 2001-04-30 + 9 2001-05-31 + 10 2001-06-30 + 11 2001-07-31 + dtype: datetime64[ns] + >>> s.dt.days_in_month + 0 31 + 1 30 + 2 31 + 3 30 + 4 31 + 5 31 + 6 28 + 7 31 + 8 30 + 9 31 + 10 30 + 11 31 + dtype: int16 + """ + res = libcudf.datetime.days_in_month(self.series._column) + return Series._from_data( + ColumnAccessor({None: res}), + index=self.series._index, + name=self.series.name, + ) + + @property + def is_month_end(self): + """ + Boolean indicator if the date is the last day of the month. + + Returns + ------- + Series + Booleans indicating if dates are the last day of the month. + + Example + ------- + >>> import pandas as pd, cudf + >>> s = cudf.Series( + ... pd.date_range(start='2000-08-26', end='2000-09-03', freq='1D')) + >>> s + 0 2000-08-26 + 1 2000-08-27 + 2 2000-08-28 + 3 2000-08-29 + 4 2000-08-30 + 5 2000-08-31 + 6 2000-09-01 + 7 2000-09-02 + 8 2000-09-03 + dtype: datetime64[ns] + >>> s.dt.is_month_end + 0 False + 1 False + 2 False + 3 False + 4 False + 5 True + 6 False + 7 False + 8 False + dtype: bool + """ # noqa: E501 + last_day = libcudf.datetime.last_day_of_month(self.series._column) + last_day = Series._from_data( + ColumnAccessor({None: last_day}), + index=self.series._index, + name=self.series.name, + ) + return (self.day == last_day.dt.day).fillna(False) + + @property + def is_quarter_start(self): + """ + Boolean indicator if the date is the first day of a quarter. 
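The quarter property above delegates to libcudf.datetime.extract_quarter on device; the mapping it implements is the one spelled out in its docstring (Jan-Mar -> 1, Apr-Jun -> 2, Jul-Sep -> 3, Oct-Dec -> 4). A host-side sketch of that mapping with toy month numbers:

    import numpy as np

    month = np.array([5, 12])        # May, December
    quarter = (month - 1) // 3 + 1
    print(quarter)                   # [2 4], matching the docstring example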
+ + Returns + ------- + Series + Booleans indicating if dates are the begining of a quarter + + Example + ------- + >>> import pandas as pd, cudf + >>> s = cudf.Series( + ... pd.date_range(start='2000-09-26', end='2000-10-03', freq='1D')) + >>> s + 0 2000-09-26 + 1 2000-09-27 + 2 2000-09-28 + 3 2000-09-29 + 4 2000-09-30 + 5 2000-10-01 + 6 2000-10-02 + 7 2000-10-03 + dtype: datetime64[ns] + >>> s.dt.is_quarter_start + 0 False + 1 False + 2 False + 3 False + 4 False + 5 True + 6 False + 7 False + dtype: bool + """ + day = self.series._column.get_dt_field("day") + first_month = self.series._column.get_dt_field("month").isin( + [1, 4, 7, 10] + ) + + result = ((day == cudf.Scalar(1)) & first_month).fillna(False) + return Series._from_data( + {None: result}, index=self.series._index, name=self.series.name, + ) + + @property + def is_quarter_end(self): + """ + Boolean indicator if the date is the last day of a quarter. + + Returns + ------- + Series + Booleans indicating if dates are the end of a quarter + + Example + ------- + >>> import pandas as pd, cudf + >>> s = cudf.Series( + ... pd.date_range(start='2000-09-26', end='2000-10-03', freq='1D')) + >>> s + 0 2000-09-26 + 1 2000-09-27 + 2 2000-09-28 + 3 2000-09-29 + 4 2000-09-30 + 5 2000-10-01 + 6 2000-10-02 + 7 2000-10-03 + dtype: datetime64[ns] + >>> s.dt.is_quarter_end + 0 False + 1 False + 2 False + 3 False + 4 True + 5 False + 6 False + 7 False + dtype: bool + """ + day = self.series._column.get_dt_field("day") + last_day = libcudf.datetime.last_day_of_month(self.series._column) + last_day = last_day.get_dt_field("day") + last_month = self.series._column.get_dt_field("month").isin( + [3, 6, 9, 12] + ) + + result = ((day == last_day) & last_month).fillna(False) + return Series._from_data( + {None: result}, index=self.series._index, name=self.series.name, + ) + + @property + def is_year_start(self): + """ + Boolean indicator if the date is the first day of the year. + + Returns + ------- + Series + Booleans indicating if dates are the first day of the year. + + Example + ------- + >>> import pandas as pd, cudf + >>> s = cudf.Series(pd.date_range("2017-12-30", periods=3)) + >>> dates + 0 2017-12-30 + 1 2017-12-31 + 2 2018-01-01 + dtype: datetime64[ns] + >>> dates.dt.is_year_start + 0 False + 1 False + 2 True + dtype: bool + """ + outcol = self.series._column.get_dt_field( + "day_of_year" + ) == cudf.Scalar(1) + return Series._from_data( + {None: outcol.fillna(False)}, + index=self.series._index, + name=self.series.name, + ) + + @property + def is_year_end(self): + """ + Boolean indicator if the date is the last day of the year. + + Returns + ------- + Series + Booleans indicating if dates are the last day of the year. 
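The is_quarter_start check above combines two element-wise tests on device: day-of-month equal to 1 and month in {1, 4, 7, 10}. A host-side sketch of the same test with toy values:

    import numpy as np

    day = np.array([30, 1, 2])
    month = np.array([9, 10, 10])
    is_quarter_start = (day == 1) & np.isin(month, [1, 4, 7, 10])
    print(is_quarter_start)          # [False  True False]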
+ + Example + ------- + >>> import pandas as pd, cudf + >>> dates = cudf.Series(pd.date_range("2017-12-30", periods=3)) + >>> dates + 0 2017-12-30 + 1 2017-12-31 + 2 2018-01-01 + dtype: datetime64[ns] + >>> dates.dt.is_year_end + 0 False + 1 True + 2 False + dtype: bool + """ + day_of_year = self.series._column.get_dt_field("day_of_year") + leap_dates = libcudf.datetime.is_leap_year(self.series._column) + + leap = day_of_year == cudf.Scalar(366) + non_leap = day_of_year == cudf.Scalar(365) + result = cudf._lib.copying.copy_if_else(leap, non_leap, leap_dates) + result = result.fillna(False) + return Series._from_data( + {None: result}, index=self.series._index, name=self.series.name, + ) + def _get_dt_field(self, field): out_column = self.series._column.get_dt_field(field) return Series( @@ -6828,7 +6287,7 @@ def _align_indices(series_list, how="outer", allow_non_unique=False): for sr in series_list[1:]: if not sr.index.names == head.names: all_names_equal = False - new_index_names = [None] + new_index_names = [None] * head.nlevels if all_names_equal: new_index_names = head.names diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 00f60cfc8b5..946cdcb1ebc 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -345,6 +345,66 @@ def get_units(value): class DateOffset: + """ + An object used for binary ops where calendrical arithmetic + is desired rather than absolute time arithmetic. Used to + add or subtract a whole number of periods, such as several + months or years, to a series or index of datetime dtype. + Works similarly to pd.DateOffset, but stores the offset + on the device (GPU). + + Parameters + ---------- + n : int, default 1 + The number of time periods the offset represents. + **kwds + Temporal parameter that add to or replace the offset value. + Parameters that **add** to the offset (like Timedelta): + - months + + See Also + -------- + pandas.DateOffset : The equivalent Pandas object that this + object replicates + + Examples + -------- + >>> from cudf import DateOffset + >>> ts = cudf.Series([ + "2000-01-01 00:00:00.012345678", + "2000-01-31 00:00:00.012345678", + "2000-02-29 00:00:00.012345678", + ], dtype='datetime64[ns]) + >>> ts + DateOffset(months=3) + 0 2000-04-01 00:00:00.012345678 + 1 2000-04-30 00:00:00.012345678 + 2 2000-05-29 00:00:00.012345678 + dtype: datetime64[ns] + >>> ts - DateOffset(months=12) + 0 1999-01-01 00:00:00.012345678 + 1 1999-01-31 00:00:00.012345678 + 2 1999-02-28 00:00:00.012345678 + dtype: datetime64[ns] + + Notes + ----- + Note that cuDF does not yet support DateOffset arguments + that 'replace' units in the datetime data being operated on + such as + - year + - month + - week + - day + - hour + - minute + - second + - microsecond + - millisecond + - nanosecond + + cuDF does not yet support rounding via a `normalize` + keyword argument. + """ _UNITS_TO_CODES = { "nanoseconds": "ns", @@ -362,66 +422,6 @@ class DateOffset: _CODES_TO_UNITS = {v: k for k, v in _UNITS_TO_CODES.items()} def __init__(self, n=1, normalize=False, **kwds): - """ - An object used for binary ops where calendrical arithmetic - is desired rather than absolute time arithmetic. Used to - add or subtract a whole number of periods, such as several - months or years, to a series or index of datetime dtype. - Works similarly to pd.DateOffset, but stores the offset - on the device (GPU). 
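A runnable form of the DateOffset example from the class docstring above, with the dtype string fully quoted; the offset performs calendar arithmetic in whole months on a datetime Series:

    import cudf
    from cudf import DateOffset

    ts = cudf.Series(
        ["2000-01-01 00:00:00.012345678",
         "2000-01-31 00:00:00.012345678",
         "2000-02-29 00:00:00.012345678"],
        dtype="datetime64[ns]",
    )
    print(ts + DateOffset(months=3))   # 2000-04-01, 2000-04-30, 2000-05-29 per the docstring
    print(ts - DateOffset(months=12))  # 1999-01-01, 1999-01-31, 1999-02-28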
- - Parameters - ---------- - n : int, default 1 - The number of time periods the offset represents. - **kwds - Temporal parameter that add to or replace the offset value. - Parameters that **add** to the offset (like Timedelta): - - months - - See Also - -------- - pandas.DateOffset : The equivalent Pandas object that this - object replicates - - Examples - -------- - >>> from cudf import DateOffset - >>> ts = cudf.Series([ - "2000-01-01 00:00:00.012345678", - "2000-01-31 00:00:00.012345678", - "2000-02-29 00:00:00.012345678", - ], dtype='datetime64[ns]) - >>> ts + DateOffset(months=3) - 0 2000-04-01 00:00:00.012345678 - 1 2000-04-30 00:00:00.012345678 - 2 2000-05-29 00:00:00.012345678 - dtype: datetime64[ns] - >>> ts - DateOffset(months=12) - 0 1999-01-01 00:00:00.012345678 - 1 1999-01-31 00:00:00.012345678 - 2 1999-02-28 00:00:00.012345678 - dtype: datetime64[ns] - - Notes - ----- - Note that cuDF does not yet support DateOffset arguments - that 'replace' units in the datetime data being operated on - such as - - year - - month - - week - - day - - hour - - minute - - second - - microsecond - - millisecond - - nanosecond - - cuDF does not yet support rounding via a `normalize` - keyword argument. - """ if normalize: raise NotImplementedError( "normalize not yet supported for DateOffset" @@ -495,7 +495,7 @@ def __init__(self, n=1, normalize=False, **kwds): dtype = "int16" else: unit = self._UNITS_TO_CODES[k] - dtype = np.dtype(f"timedelta64[{unit}]") + dtype = cudf.dtype(f"timedelta64[{unit}]") scalars[k] = cudf.Scalar(v, dtype=dtype) self._scalars = scalars diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py index 6d31c1ba74d..d5c4df12246 100644 --- a/python/cudf/cudf/core/tools/numeric.py +++ b/python/cudf/cudf/core/tools/numeric.py @@ -109,7 +109,7 @@ def to_numeric(arg, errors="raise", downcast=None): dtype = col.dtype if is_datetime_dtype(dtype) or is_timedelta_dtype(dtype): - col = col.as_numerical_column(np.dtype("int64")) + col = col.as_numerical_column(cudf.dtype("int64")) elif is_categorical_dtype(dtype): cat_dtype = col.dtype.type if _is_non_decimal_numeric_dtype(cat_dtype): @@ -140,7 +140,7 @@ def to_numeric(arg, errors="raise", downcast=None): raise ValueError("Unrecognized datatype") # str->float conversion may require lower precision - if col.dtype == np.dtype("f"): + if col.dtype == cudf.dtype("f"): col = col.as_numerical_column("d") if downcast: @@ -150,13 +150,13 @@ def to_numeric(arg, errors="raise", downcast=None): "unsigned": list(np.typecodes["UnsignedInteger"]), } float_types = list(np.typecodes["Float"]) - idx = float_types.index(np.dtype(np.float32).char) + idx = float_types.index(cudf.dtype(np.float32).char) downcast_type_map["float"] = float_types[idx:] type_set = downcast_type_map[downcast] for t in type_set: - downcast_dtype = np.dtype(t) + downcast_dtype = cudf.dtype(t) if downcast_dtype.itemsize <= col.dtype.itemsize: if col.can_cast_safely(downcast_dtype): col = libcudf.unary.cast(col, downcast_dtype) @@ -197,7 +197,7 @@ def _convert_str_col(col, errors, _downcast=None): is_integer = libstrings.is_integer(col) if is_integer.all(): - return col.as_numerical_column(dtype=np.dtype("i8")) + return col.as_numerical_column(dtype=cudf.dtype("i8")) col = _proc_inf_empty_strings(col) @@ -210,9 +210,9 @@ def _convert_str_col(col, errors, _downcast=None): "limited by float32 precision." 
) ) - return col.as_numerical_column(dtype=np.dtype("f")) + return col.as_numerical_column(dtype=cudf.dtype("f")) else: - return col.as_numerical_column(dtype=np.dtype("d")) + return col.as_numerical_column(dtype=cudf.dtype("d")) else: if errors == "coerce": col = libcudf.string_casting.stod(col) diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py index d9a2fd89165..e3ed15ba2a6 100644 --- a/python/cudf/cudf/core/window/rolling.py +++ b/python/cudf/cudf/core/window/rolling.py @@ -215,7 +215,7 @@ def _apply_agg_series(self, sr, agg_name): self.center, agg_name, ) - return sr._copy_construct(data=result_col) + return sr._from_data({sr.name: result_col}, sr._index) def _apply_agg_dataframe(self, df, agg_name): result_df = cudf.DataFrame({}) @@ -258,12 +258,12 @@ def apply(self, func, *args, **kwargs): See also -------- - cudf.core.series.Series.applymap : Apply an elementwise function to + cudf.Series.applymap : Apply an elementwise function to transform the values in the Column. Notes ----- - See notes of the :meth:`cudf.core.series.Series.applymap` + See notes of the :meth:`cudf.Series.applymap` """ has_nulls = False @@ -353,14 +353,15 @@ def __repr__(self): class RollingGroupby(Rolling): - def __init__(self, groupby, window, min_periods=None, center=False): - """ - Grouped rolling window calculation. + """ + Grouped rolling window calculation. - See also - -------- - cudf.core.window.Rolling - """ + See also + -------- + cudf.core.window.Rolling + """ + + def __init__(self, groupby, window, min_periods=None, center=False): sort_order = groupby.grouping.keys.argsort() # TODO: there may be overlap between the columns diff --git a/python/cudf/cudf/datasets.py b/python/cudf/cudf/datasets.py index 5e54af86bb5..b568c108191 100644 --- a/python/cudf/cudf/datasets.py +++ b/python/cudf/cudf/datasets.py @@ -2,6 +2,8 @@ import pandas as pd import cudf +from cudf._lib.transform import bools_to_mask +from cudf.core.column_accessor import ColumnAccessor __all__ = ["timeseries", "randomdata"] @@ -9,7 +11,12 @@ # TODO: # change default of name from category to str type when nvstring are merged def timeseries( - start="2000-01-01", end="2000-01-31", freq="1s", dtypes=None, seed=None, + start="2000-01-01", + end="2000-01-31", + freq="1s", + dtypes=None, + nulls_frequency=0, + seed=None, ): """ Create timeseries dataframe with random data @@ -26,6 +33,8 @@ def timeseries( ``{"name": "category", "id": int, "x": float, "y": float}`` freq : string String like '2s' or '1H' or '12W' for the time series frequency + nulls_frequency : float + Fill the series with the specified proportion of nulls. Default is 0. seed : int (optional) Randomstate seed @@ -54,7 +63,21 @@ def timeseries( df = pd.DataFrame(columns, index=index, columns=sorted(columns)) if df.index[-1] == end: df = df.iloc[:-1] - return cudf.from_pandas(df) + + gdf = cudf.from_pandas(df) + for col in gdf: + mask = state.choice( + [True, False], + size=len(index), + p=[1 - nulls_frequency, nulls_frequency], + ) + mask_buf = bools_to_mask(cudf.core.column.as_column(mask)) + masked_col = gdf[col]._column.set_mask(mask_buf) + gdf[col] = cudf.Series._from_data( + ColumnAccessor({None: masked_col}), index=gdf.index + ) + + return gdf def randomdata(nrows=10, dtypes=None, seed=None): diff --git a/python/cudf/cudf/io/avro.py b/python/cudf/cudf/io/avro.py index a6713e85e76..9e38b6e896d 100644 --- a/python/cudf/cudf/io/avro.py +++ b/python/cudf/cudf/io/avro.py @@ -1,4 +1,5 @@ # Copyright (c) 2019, NVIDIA CORPORATION. 
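A hedged usage sketch for the nulls_frequency parameter added to cudf.datasets.timeseries above; the realized null fraction is random but should land near the requested proportion.

    import cudf

    gdf = cudf.datasets.timeseries(
        start="2000-01-01",
        end="2000-01-02",
        freq="1H",
        dtypes={"x": int, "y": float},
        nulls_frequency=0.2,
        seed=1,
    )
    # Each column should contain roughly 20% nulls.
    print(gdf.isna().mean())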
+import cudf from cudf import _lib as libcudf from cudf.utils import ioutils @@ -14,8 +15,6 @@ def read_avro( ): """{docstring}""" - from cudf import DataFrame - is_single_filepath_or_buffer = ioutils.ensure_single_filepath_or_buffer( path_or_data=filepath_or_buffer, **kwargs, ) @@ -31,8 +30,8 @@ def read_avro( ValueError("URL content-encoding decompression is not supported") if engine == "cudf": - return DataFrame._from_table( - libcudf.avro.read_avro( + return cudf.DataFrame._from_data( + *libcudf.avro.read_avro( filepath_or_buffer, columns, skiprows, num_rows ) ) diff --git a/python/cudf/cudf/io/dlpack.py b/python/cudf/cudf/io/dlpack.py index b8a76890913..9d97bee0396 100644 --- a/python/cudf/cudf/io/dlpack.py +++ b/python/cudf/cudf/io/dlpack.py @@ -35,12 +35,12 @@ def from_dlpack(pycapsule_obj): tensor is row-major, transpose it before passing it to this function. """ - res = libdlpack.from_dlpack(pycapsule_obj) + data, _ = libdlpack.from_dlpack(pycapsule_obj) - if res._num_columns == 1: - return Series(res._data[0]) + if len(data) == 1: + return Series._from_data(data) else: - return DataFrame(data=res._data) + return DataFrame._from_data(data) @ioutils.doc_to_dlpack() diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py index b605bf90ff4..8a00d9c73a0 100644 --- a/python/cudf/cudf/io/json.py +++ b/python/cudf/cudf/io/json.py @@ -53,8 +53,8 @@ def read_json( else: filepaths_or_buffers.append(tmp_source) - return cudf.DataFrame._from_table( - libjson.read_json( + return cudf.DataFrame._from_data( + *libjson.read_json( filepaths_or_buffers, dtype, lines, compression, byte_range ) ) diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py index a99f82fde7a..8f6002bb577 100644 --- a/python/cudf/cudf/io/orc.py +++ b/python/cudf/cudf/io/orc.py @@ -290,8 +290,8 @@ def read_orc( stripes = selected_stripes if engine == "cudf": - df = DataFrame._from_table( - liborc.read_orc( + return DataFrame._from_data( + *liborc.read_orc( filepaths_or_buffers, columns, stripes, diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index a18486cff3c..fa748761695 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -210,6 +210,10 @@ def read_parquet( else: filepaths_or_buffers.append(tmp_source) + if columns is not None: + if not is_list_like(columns): + raise ValueError("Expected list like for columns") + if filters is not None: # Convert filters to ds.Expression filters = pq._filters_to_expression(filters) diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py index 672e83e6f64..b101835e626 100644 --- a/python/cudf/cudf/testing/_utils.py +++ b/python/cudf/cudf/testing/_utils.py @@ -245,7 +245,7 @@ def _get_args_kwars_for_assert_exceptions(func_args_and_kwargs): def gen_rand(dtype, size, **kwargs): - dtype = np.dtype(dtype) + dtype = cudf.dtype(dtype) if dtype.kind == "f": res = np.random.random(size=size).astype(dtype) if kwargs.get("positive_only", False): @@ -284,7 +284,7 @@ def gen_rand(dtype, size, **kwargs): return pd.to_datetime( np.random.randint(low=low, high=high, size=size), unit=time_unit ) - elif dtype.kind == "U": + elif dtype.kind in ("O", "U"): return pd.util.testing.rands_array(10, size) raise NotImplementedError(f"dtype.kind={dtype.kind}") diff --git a/python/cudf/cudf/testing/dataset_generator.py b/python/cudf/cudf/testing/dataset_generator.py index 5e03068f818..cdea22a05af 100644 --- a/python/cudf/cudf/testing/dataset_generator.py +++ 
b/python/cudf/cudf/testing/dataset_generator.py @@ -18,6 +18,7 @@ from pyarrow import parquet as pq import cudf +from cudf.utils.dtypes import np_to_pa_dtype class ColumnParameters: @@ -94,6 +95,7 @@ def _write(tbl, path, format): def _generate_column(column_params, num_rows): # If cardinality is specified, we create a set to sample from. # Otherwise, we simply use the given generator to generate each value. + if column_params.cardinality is not None: # Construct set of values to sample from where # set size = cardinality @@ -127,7 +129,7 @@ def _generate_column(column_params, num_rows): if hasattr(column_params.dtype, "to_arrow"): arrow_type = column_params.dtype.to_arrow() elif column_params.dtype is not None: - arrow_type = pa.from_numpy_dtype(column_params.dtype) + arrow_type = np_to_pa_dtype(cudf.dtype(column_params.dtype)) else: arrow_type = None @@ -227,15 +229,15 @@ def get_dataframe(parameters, use_threads): ): arrow_type = pa.dictionary( index_type=pa.int64(), - value_type=pa.from_numpy_dtype( - type(next(iter(column_params.generator))) + value_type=np_to_pa_dtype( + cudf.dtype(type(next(iter(column_params.generator)))) ), ) elif hasattr(column_params.dtype, "to_arrow"): arrow_type = column_params.dtype.to_arrow() else: - arrow_type = pa.from_numpy_dtype( - type(next(iter(column_params.generator))) + arrow_type = np_to_pa_dtype( + cudf.dtype(type(next(iter(column_params.generator)))) if column_params.dtype is None else column_params.dtype ) @@ -380,7 +382,7 @@ def rand_dataframe( ) ) else: - dtype = np.dtype(dtype) + dtype = cudf.dtype(dtype) if dtype.kind in ("i", "u"): column_params.append( ColumnParameters( @@ -428,7 +430,7 @@ def rand_dataframe( dtype=dtype, size=cardinality ), is_sorted=False, - dtype=np.dtype(dtype), + dtype=cudf.dtype(dtype), ) ) elif dtype.kind == "m": @@ -440,7 +442,7 @@ def rand_dataframe( dtype=dtype, size=cardinality ), is_sorted=False, - dtype=np.dtype(dtype), + dtype=cudf.dtype(dtype), ) ) elif dtype.kind == "b": @@ -450,7 +452,7 @@ def rand_dataframe( null_frequency=null_frequency, generator=boolean_generator(cardinality), is_sorted=False, - dtype=np.dtype(dtype), + dtype=cudf.dtype(dtype), ) ) else: @@ -538,7 +540,7 @@ def get_values_for_nested_data(dtype, lists_max_length): Returns list of values based on dtype. 
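For context on the dataset generator changes above, a small sketch of the dtype mapping they rely on; it assumes np_to_pa_dtype resolves datetime units the same way the generator uses it.

    import cudf
    from cudf.utils.dtypes import np_to_pa_dtype

    # cudf.dtype normalizes string aliases to a numpy dtype first, and
    # np_to_pa_dtype then maps that numpy dtype to the matching pyarrow type.
    print(np_to_pa_dtype(cudf.dtype("datetime64[ms]")))  # expected: timestamp[ms]
    print(np_to_pa_dtype(cudf.dtype("float32")))         # expected: float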
""" cardinality = np.random.randint(0, lists_max_length) - dtype = np.dtype(dtype) + dtype = cudf.dtype(dtype) if dtype.kind in ("i", "u"): values = int_generator(dtype=dtype, size=cardinality)() elif dtype.kind == "f": diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index 8277b8e7b32..abdac07d65d 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -13,7 +13,7 @@ import pytest import cudf -from cudf.core import Series +from cudf import Series from cudf.core.index import as_index from cudf.testing import _utils as utils from cudf.utils.dtypes import ( @@ -931,7 +931,7 @@ def test_ufunc_ops(lhs, rhs, ops): def dtype_scalar(val, dtype): if dtype == "str": return str(val) - dtype = np.dtype(dtype) + dtype = cudf.dtype(dtype) if dtype.type in {np.datetime64, np.timedelta64}: res, _ = np.datetime_data(dtype) return dtype.type(val, res) @@ -1695,13 +1695,15 @@ def test_binops_with_lhs_numpy_scalar(frame, dtype): ) if dtype == "datetime64[s]": - val = np.dtype(dtype).type(4, "s") + val = cudf.dtype(dtype).type(4, "s") elif dtype == "timedelta64[s]": - val = np.dtype(dtype).type(4, "s") + val = cudf.dtype(dtype).type(4, "s") elif dtype == "category": val = np.int64(4) + elif dtype == "str": + val = str(4) else: - val = np.dtype(dtype).type(4) + val = cudf.dtype(dtype).type(4) expected = val == data.to_pandas() got = val == data @@ -1758,16 +1760,16 @@ def test_binops_with_NA_consistent(dtype, op): ( operator.add, ["1.5", "2.0"], - cudf.Decimal64Dtype(scale=2, precision=2), + cudf.Decimal64Dtype(scale=2, precision=3), ["1.5", "2.0"], - cudf.Decimal64Dtype(scale=2, precision=2), - ["3.0", "4.0"], cudf.Decimal64Dtype(scale=2, precision=3), + ["3.0", "4.0"], + cudf.Decimal64Dtype(scale=2, precision=4), ), ( operator.add, ["1.5", "2.0"], - cudf.Decimal64Dtype(scale=2, precision=2), + cudf.Decimal64Dtype(scale=2, precision=3), ["2.25", "1.005"], cudf.Decimal64Dtype(scale=3, precision=4), ["3.75", "3.005"], @@ -1785,7 +1787,7 @@ def test_binops_with_NA_consistent(dtype, op): ( operator.sub, ["1.5", "2.0"], - cudf.Decimal64Dtype(scale=2, precision=2), + cudf.Decimal64Dtype(scale=1, precision=2), ["2.25", "1.005"], cudf.Decimal64Dtype(scale=3, precision=4), ["-0.75", "0.995"], @@ -1794,7 +1796,7 @@ def test_binops_with_NA_consistent(dtype, op): ( operator.sub, ["1.5", "2.0"], - cudf.Decimal64Dtype(scale=2, precision=2), + cudf.Decimal64Dtype(scale=1, precision=2), ["2.25", "1.005"], cudf.Decimal64Dtype(scale=3, precision=4), ["-0.75", "0.995"], @@ -1812,11 +1814,11 @@ def test_binops_with_NA_consistent(dtype, op): ( operator.mul, ["1.5", "2.0"], - cudf.Decimal64Dtype(scale=2, precision=2), + cudf.Decimal64Dtype(scale=2, precision=3), ["1.5", "3.0"], cudf.Decimal64Dtype(scale=3, precision=4), ["2.25", "6.0"], - cudf.Decimal64Dtype(scale=5, precision=7), + cudf.Decimal64Dtype(scale=5, precision=8), ), ( operator.mul, @@ -1866,16 +1868,16 @@ def test_binops_with_NA_consistent(dtype, op): ( operator.add, ["1.5", None, "2.0"], - cudf.Decimal64Dtype(scale=2, precision=2), + cudf.Decimal64Dtype(scale=1, precision=2), ["1.5", None, "2.0"], - cudf.Decimal64Dtype(scale=2, precision=2), + cudf.Decimal64Dtype(scale=1, precision=2), ["3.0", None, "4.0"], - cudf.Decimal64Dtype(scale=2, precision=3), + cudf.Decimal64Dtype(scale=1, precision=3), ), ( operator.add, ["1.5", None], - cudf.Decimal64Dtype(scale=2, precision=2), + cudf.Decimal64Dtype(scale=2, precision=3), ["2.25", "1.005"], cudf.Decimal64Dtype(scale=3, precision=4), 
["3.75", None], @@ -1884,7 +1886,7 @@ def test_binops_with_NA_consistent(dtype, op): ( operator.sub, ["1.5", None], - cudf.Decimal64Dtype(scale=2, precision=2), + cudf.Decimal64Dtype(scale=2, precision=3), ["2.25", None], cudf.Decimal64Dtype(scale=3, precision=4), ["-0.75", None], @@ -1893,7 +1895,7 @@ def test_binops_with_NA_consistent(dtype, op): ( operator.sub, ["1.5", "2.0"], - cudf.Decimal64Dtype(scale=2, precision=2), + cudf.Decimal64Dtype(scale=2, precision=3), ["2.25", None], cudf.Decimal64Dtype(scale=3, precision=4), ["-0.75", None], @@ -1902,11 +1904,11 @@ def test_binops_with_NA_consistent(dtype, op): ( operator.mul, ["1.5", None], - cudf.Decimal64Dtype(scale=2, precision=2), + cudf.Decimal64Dtype(scale=2, precision=3), ["1.5", None], cudf.Decimal64Dtype(scale=3, precision=4), ["2.25", None], - cudf.Decimal64Dtype(scale=5, precision=7), + cudf.Decimal64Dtype(scale=5, precision=8), ), ( operator.mul, @@ -2432,10 +2434,10 @@ def test_binops_decimal_comp_mixed_integer(args, integer_dtype, reflected): ( operator.truediv, ["100", "200"], - cudf.Decimal64Dtype(scale=2, precision=4), + cudf.Decimal64Dtype(scale=2, precision=5), decimal.Decimal(2), ["50", "100"], - cudf.Decimal64Dtype(scale=2, precision=6), + cudf.Decimal64Dtype(scale=2, precision=7), False, ), ( @@ -2459,10 +2461,10 @@ def test_binops_decimal_comp_mixed_integer(args, integer_dtype, reflected): ( operator.truediv, ["100", "200"], - cudf.Decimal64Dtype(scale=2, precision=3), + cudf.Decimal64Dtype(scale=2, precision=5), 1, ["0", "0"], - cudf.Decimal64Dtype(scale=-2, precision=5), + cudf.Decimal64Dtype(scale=-2, precision=7), True, ), ( @@ -2793,11 +2795,11 @@ def test_column_null_scalar_comparison(dtype, null_scalar, cmpop): # a new series where all the elements are . if isinstance(null_scalar, np.datetime64): - if np.dtype(dtype).kind not in "mM": + if cudf.dtype(dtype).kind not in "mM": pytest.skip() null_scalar = null_scalar.astype(dtype) - dtype = np.dtype(dtype) + dtype = cudf.dtype(dtype) data = [1, 2, 3, 4, 5] sr = cudf.Series(data, dtype=dtype) diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py index d8e10a62a12..51327038c39 100644 --- a/python/cudf/cudf/tests/test_categorical.py +++ b/python/cudf/cudf/tests/test_categorical.py @@ -799,7 +799,7 @@ def test_categorical_setitem_with_nan(): @pytest.mark.parametrize("dtype", list(NUMERIC_TYPES) + ["object"]) @pytest.mark.parametrize("input_obj", [[1, cudf.NA, 3]]) def test_series_construction_with_nulls(input_obj, dtype): - dtype = np.dtype(dtype) + dtype = cudf.dtype(dtype) input_obj = [ dtype.type(v) if v is not cudf.NA else cudf.NA for v in input_obj ] diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py index f3387b3d27d..cc4c98b611f 100644 --- a/python/cudf/cudf/tests/test_column.py +++ b/python/cudf/cudf/tests/test_column.py @@ -362,7 +362,7 @@ def test_column_view_string_slice(slc): ) def test_as_column_buffer(data, expected): actual_column = cudf.core.column.as_column( - cudf.core.Buffer(data), dtype=data.dtype + cudf.core.buffer.Buffer(data), dtype=data.dtype ) assert_eq(cudf.Series(actual_column), cudf.Series(expected)) @@ -481,3 +481,29 @@ def test_concatenate_large_column_strings(): match="total size of output is too large for a cudf column", ): cudf.concat([s_1, s_2]) + + +@pytest.mark.parametrize( + "alias,expect_dtype", + [ + ("UInt8", "uint8"), + ("UInt16", "uint16"), + ("UInt32", "uint32"), + ("UInt64", "uint64"), + ("Int8", "int8"), + ("Int16", "int16"), + 
("Int32", "int32"), + ("Int64", "int64"), + ("boolean", "bool"), + ("Float32", "float32"), + ("Float64", "float64"), + ], +) +@pytest.mark.parametrize( + "data", [[1, 2, 0]], +) +def test_astype_with_aliases(alias, expect_dtype, data): + pd_data = pd.Series(data) + gd_data = cudf.Series.from_pandas(pd_data) + + assert_eq(pd_data.astype(expect_dtype), gd_data.astype(alias)) diff --git a/python/cudf/cudf/tests/test_contains.py b/python/cudf/cudf/tests/test_contains.py index b6650600261..f06142f4cc9 100644 --- a/python/cudf/cudf/tests/test_contains.py +++ b/python/cudf/cudf/tests/test_contains.py @@ -4,6 +4,7 @@ import pandas as pd import pytest +import cudf from cudf import Series from cudf.core.index import RangeIndex, as_index from cudf.testing._utils import ( @@ -82,7 +83,7 @@ def test_rangeindex_contains(): @pytest.mark.parametrize("dtype", NUMERIC_TYPES) def test_lists_contains(dtype): - dtype = np.dtype(dtype) + dtype = cudf.dtype(dtype) inner_data = np.array([1, 2, 3], dtype=dtype) data = Series([inner_data]) @@ -96,7 +97,7 @@ def test_lists_contains(dtype): @pytest.mark.parametrize("dtype", DATETIME_TYPES + TIMEDELTA_TYPES) def test_lists_contains_datetime(dtype): - dtype = np.dtype(dtype) + dtype = cudf.dtype(dtype) inner_data = np.array([1, 2, 3]) unit, _ = np.datetime_data(dtype) diff --git a/python/cudf/cudf/tests/test_copying.py b/python/cudf/cudf/tests/test_copying.py index 0965b5298a4..21a6a9172db 100644 --- a/python/cudf/cudf/tests/test_copying.py +++ b/python/cudf/cudf/tests/test_copying.py @@ -5,7 +5,7 @@ import pytest import cudf -from cudf.core import Series +from cudf import Series from cudf.testing._utils import NUMERIC_TYPES, OTHER_TYPES, assert_eq diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index 5511a65d0a4..f04a5e6dca0 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -320,7 +320,6 @@ def test_csv_reader_dtype_dict(use_names): dtypes = df.dtypes.to_dict() gdf_names = list(gdf_dtypes.keys()) if use_names else None pdf_names = list(pdf_dtypes.keys()) if use_names else None - gdf = read_csv(StringIO(buffer), dtype=dtypes, names=gdf_names) pdf = pd.read_csv(StringIO(buffer), dtype=dtypes, names=pdf_names) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 9acf6783095..a337660b5b0 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -1828,42 +1828,79 @@ def gdf(pdf): @pytest.mark.parametrize( "data", [ - {"x": [np.nan, 2, 3, 4, 100, np.nan], "y": [4, 5, 6, 88, 99, np.nan]}, - {"x": [1, 2, 3], "y": [4, 5, 6]}, - {"x": [np.nan, np.nan, np.nan], "y": [np.nan, np.nan, np.nan]}, - {"x": [], "y": []}, + { + "x": [np.nan, 2, 3, 4, 100, np.nan], + "y": [4, 5, 6, 88, 99, np.nan], + "z": [7, 8, 9, 66, np.nan, 77], + }, + {"x": [1, 2, 3], "y": [4, 5, 6], "z": [7, 8, 9]}, + { + "x": [np.nan, np.nan, np.nan], + "y": [np.nan, np.nan, np.nan], + "z": [np.nan, np.nan, np.nan], + }, + {"x": [], "y": [], "z": []}, {"x": []}, ], ) +@pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize( "func", [ - lambda df, **kwargs: df.min(**kwargs), - lambda df, **kwargs: df.max(**kwargs), - lambda df, **kwargs: df.sum(**kwargs), - lambda df, **kwargs: df.product(**kwargs), - lambda df, **kwargs: df.cummin(**kwargs), - lambda df, **kwargs: df.cummax(**kwargs), - lambda df, **kwargs: df.cumsum(**kwargs), - lambda df, **kwargs: df.cumprod(**kwargs), - lambda df, **kwargs: df.mean(**kwargs), - lambda df, 
**kwargs: df.sum(**kwargs), - lambda df, **kwargs: df.max(**kwargs), - lambda df, **kwargs: df.std(ddof=1, **kwargs), - lambda df, **kwargs: df.var(ddof=1, **kwargs), - lambda df, **kwargs: df.std(ddof=2, **kwargs), - lambda df, **kwargs: df.var(ddof=2, **kwargs), - lambda df, **kwargs: df.kurt(**kwargs), - lambda df, **kwargs: df.skew(**kwargs), - lambda df, **kwargs: df.all(**kwargs), - lambda df, **kwargs: df.any(**kwargs), + "min", + "max", + "sum", + "prod", + "product", + "cummin", + "cummax", + "cumsum", + "cumprod", + "mean", + "median", + "sum", + "max", + "std", + "var", + "kurt", + "skew", + "all", + "any", ], ) @pytest.mark.parametrize("skipna", [True, False, None]) -def test_dataframe_reductions(data, func, skipna): +def test_dataframe_reductions(data, axis, func, skipna): pdf = pd.DataFrame(data=data) gdf = cudf.DataFrame.from_pandas(pdf) - assert_eq(func(pdf, skipna=skipna), func(gdf, skipna=skipna)) + + # Reductions can fail in numerous possible ways when attempting row-wise + # reductions, which are only partially supported. Catching the appropriate + # exception here allows us to detect API breakage in the form of changing + # exceptions. + expected_exception = None + if axis == 1: + if func in ("kurt", "skew"): + expected_exception = NotImplementedError + elif func not in cudf.core.dataframe._cupy_nan_methods_map: + if skipna is False: + expected_exception = NotImplementedError + elif any(col.nullable for name, col in gdf.iteritems()): + expected_exception = ValueError + elif func in ("cummin", "cummax"): + expected_exception = AttributeError + + # Test different degrees of freedom for var and std. + all_kwargs = [{"ddof": 1}, {"ddof": 2}] if func in ("var", "std") else [{}] + for kwargs in all_kwargs: + if expected_exception is not None: + with pytest.raises(expected_exception): + getattr(gdf, func)(axis=axis, skipna=skipna, **kwargs), + else: + assert_eq( + getattr(pdf, func)(axis=axis, skipna=skipna, **kwargs), + getattr(gdf, func)(axis=axis, skipna=skipna, **kwargs), + check_dtype=False, + ) @pytest.mark.parametrize( @@ -3423,8 +3460,6 @@ def test_all(data): expected = pdata.all(bool_only=True) assert_eq(got, expected) else: - with pytest.raises(NotImplementedError): - gdata.all(bool_only=False) with pytest.raises(NotImplementedError): gdata.all(level="a") @@ -3484,8 +3519,6 @@ def test_any(data, axis): expected = pdata.any(bool_only=True) assert_eq(got, expected) else: - with pytest.raises(NotImplementedError): - gdata.any(bool_only=False) with pytest.raises(NotImplementedError): gdata.any(level="a") @@ -3616,9 +3649,7 @@ def test_as_column_types(): assert_eq(pds, gds) pds = pd.Series(pd.Index(["1", "18", "9"]), dtype="int") - gds = cudf.Series( - cudf.core.index.StringIndex(["1", "18", "9"]), dtype="int" - ) + gds = cudf.Series(cudf.StringIndex(["1", "18", "9"]), dtype="int") assert_eq(pds, gds) @@ -5054,6 +5085,18 @@ def test_insert(data): assert_eq(pdf, gdf) +@pytest.mark.parametrize( + "data", [{"A": [1, 2, 3], "B": ["a", "b", "c"]}], +) +def test_insert_NA(data): + pdf = pd.DataFrame.from_dict(data) + gdf = cudf.DataFrame.from_pandas(pdf) + + pdf["C"] = pd.NA + gdf["C"] = cudf.NA + assert_eq(pdf, gdf) + + def test_cov(): gdf = cudf.datasets.randomdata(10) pdf = gdf.to_pandas() @@ -5372,14 +5415,6 @@ def test_change_column_dtype_in_empty(): assert_eq(pdf, gdf) -def test_dataframe_from_table_empty_index(): - df = cudf.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - odict = df._data - tbl = cudf._lib.table.Table(odict) - - result = cudf.DataFrame._from_table(tbl) # 
noqa: F841 - - @pytest.mark.parametrize("dtype", ["int64", "str"]) def test_dataframe_from_dictionary_series_same_name_index(dtype): pd_idx1 = pd.Index([1, 2, 0], name="test_index").astype(dtype) @@ -8108,17 +8143,7 @@ def custom_func(df, column): @pytest.mark.parametrize( - "op", - [ - "count", - "cummin", - "cummax", - "cummax", - "cumprod", - "kurt", - "kurtosis", - "skew", - ], + "op", ["count", "kurt", "kurtosis", "skew"], ) def test_dataframe_axis1_unsupported_ops(op): df = cudf.DataFrame({"a": [1, 2, 3], "b": [8, 9, 10]}) @@ -8732,3 +8757,60 @@ def test_frame_series_where(): expected = gdf.where(gdf.notna(), gdf.mean()) actual = pdf.where(pdf.notna(), pdf.mean(), axis=1) assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "array,is_error", + [ + (cupy.arange(20, 40).reshape(-1, 2), False), + (cupy.arange(20, 50).reshape(-1, 3), True), + (np.arange(20, 40).reshape(-1, 2), False), + (np.arange(20, 30).reshape(-1, 1), False), + (cupy.arange(20, 30).reshape(-1, 1), False), + ], +) +def test_dataframe_indexing_setitem_np_cp_array(array, is_error): + gdf = cudf.DataFrame({"a": range(10), "b": range(10)}) + pdf = gdf.to_pandas() + if not is_error: + gdf.loc[:, ["a", "b"]] = array + pdf.loc[:, ["a", "b"]] = cupy.asnumpy(array) + + assert_eq(gdf, pdf) + else: + assert_exceptions_equal( + lfunc=pdf.loc.__setitem__, + rfunc=gdf.loc.__setitem__, + lfunc_args_and_kwargs=( + [(slice(None, None, None), ["a", "b"]), cupy.asnumpy(array)], + {}, + ), + rfunc_args_and_kwargs=( + [(slice(None, None, None), ["a", "b"]), array], + {}, + ), + compare_error_message=False, + expected_error_message="shape mismatch: value array of shape " + "(10, 3) could not be broadcast to indexing " + "result of shape (10, 2)", + ) + + +@pytest.mark.parametrize( + "data", [{"a": [1, 2, 3], "b": [1, 1, 0]}], +) +def test_frame_series_where_other(data): + gdf = cudf.DataFrame(data) + pdf = gdf.to_pandas() + + expected = gdf.where(gdf["b"] == 1, cudf.NA) + actual = pdf.where(pdf["b"] == 1, pd.NA) + assert_eq( + actual.fillna(-1).values, + expected.fillna(-1).values, + check_dtype=False, + ) + + expected = gdf.where(gdf["b"] == 1, 0) + actual = pdf.where(pdf["b"] == 1, 0) + assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/test_datasets.py b/python/cudf/cudf/tests/test_datasets.py index b7bc89f008d..c9f07eab5dd 100644 --- a/python/cudf/cudf/tests/test_datasets.py +++ b/python/cudf/cudf/tests/test_datasets.py @@ -6,10 +6,10 @@ def test_dataset_timeseries(): gdf1 = gd.datasets.timeseries( - dtypes={"x": int, "y": float}, freq="120s", seed=1 + dtypes={"x": int, "y": float}, freq="120s", nulls_frequency=0.3, seed=1 ) gdf2 = gd.datasets.timeseries( - dtypes={"x": int, "y": float}, freq="120s", seed=1 + dtypes={"x": int, "y": float}, freq="120s", nulls_frequency=0.3, seed=1 ) assert_eq(gdf1, gdf2) @@ -23,6 +23,7 @@ def test_dataset_timeseries(): "2010", freq="2H", dtypes={"value": float, "name": "category", "id": int}, + nulls_frequency=0.7, seed=1, ) diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 5f5a0a78414..9f19bf8b960 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -12,7 +12,8 @@ import pytest import cudf -from cudf.core import DataFrame, Series +import cudf.testing.dataset_generator as dataset_generator +from cudf import DataFrame, Series from cudf.core.index import DatetimeIndex from cudf.testing._utils import ( DATETIME_TYPES, @@ -1299,6 +1300,58 @@ def test_is_leap_year(): assert_eq(expect2, 
got2) +def test_quarter(): + data = [ + "2020-05-31 08:00:00", + "1999-12-31 18:40:00", + "2000-12-31 04:00:00", + "1900-02-28 07:00:00", + "1800-03-14 07:30:00", + "2100-03-14 07:30:00", + "1970-01-01 00:00:00", + "1969-12-31 12:59:00", + ] + dtype = "datetime64[s]" + + # Series + ps = pd.Series(data, dtype=dtype) + gs = cudf.from_pandas(ps) + + expect = ps.dt.quarter + got = gs.dt.quarter + + assert_eq(expect, got, check_dtype=False) + + # DatetimeIndex + pIndex = pd.DatetimeIndex(data) + gIndex = cudf.from_pandas(pIndex) + + expect2 = pIndex.quarter + got2 = gIndex.quarter + + assert isinstance(got2, cudf.Int8Index) + assert_eq(expect2.values, got2.values, check_dtype=False) + + +@pytest.mark.parametrize("dtype", DATETIME_TYPES) +def test_days_in_months(dtype): + nrows = 1000 + + data = dataset_generator.rand_dataframe( + dtypes_meta=[ + {"dtype": dtype, "null_frequency": 0.4, "cardinality": nrows} + ], + rows=nrows, + use_threads=False, + seed=23, + ) + + ps = data.to_pandas()["0"] + gs = cudf.from_pandas(ps) + + assert_eq(ps.dt.days_in_month, gs.dt.days_in_month) + + @pytest.mark.parametrize( "data", [ @@ -1326,3 +1379,174 @@ def test_is_month_start(data, dtype): got = gs.dt.is_month_start assert_eq(expect, got) + + +@pytest.mark.parametrize( + "data", + [ + [ + "2020-05-31", + "2020-02-29", + None, + "1999-12-01", + "2000-12-21", + None, + "1900-02-28", + "1800-03-14", + "2100-03-10", + "1970-01-01", + "1969-12-11", + ] + ], +) +@pytest.mark.parametrize("dtype", ["datetime64[ns]"]) +def test_is_month_end(data, dtype): + # Series + ps = pd.Series(data, dtype=dtype) + gs = cudf.from_pandas(ps) + + expect = ps.dt.is_month_end + got = gs.dt.is_month_end + + assert_eq(expect, got) + + +@pytest.mark.parametrize( + "data", + [ + [ + "2020-05-31", + None, + "1999-12-01", + "2000-12-21", + None, + "1900-01-01", + "1800-03-14", + "2100-03-10", + "1970-01-01", + "1969-12-11", + "2017-12-30", + "2017-12-31", + "2018-01-01", + ] + ], +) +@pytest.mark.parametrize("dtype", ["datetime64[ns]"]) +def test_is_year_start(data, dtype): + ps = pd.Series(data, dtype=dtype) + gs = cudf.from_pandas(ps) + + expect = ps.dt.is_year_start + got = gs.dt.is_year_start + + assert_eq(expect, got) + + +@pytest.mark.parametrize( + "data", + [ + [ + "2020-05-31", + None, + "1999-12-01", + "2000-12-21", + None, + "1900-12-31", + "1800-03-14", + "2017-12-30", + "2017-12-31", + "2020-12-31 08:00:00", + None, + "1999-12-31 18:40:00", + "2000-12-31 04:00:00", + None, + "1800-12-14 07:30:00", + "2100-12-14 07:30:00", + "2020-05-31", + ] + ], +) +@pytest.mark.parametrize("dtype", ["datetime64[ns]"]) +def test_is_year_end(data, dtype): + ps = pd.Series(data, dtype=dtype) + gs = cudf.from_pandas(ps) + + expect = ps.dt.is_year_end + got = gs.dt.is_year_end + + assert_eq(expect, got) + + +@pytest.mark.parametrize( + "data", + [ + [ + "2020-05-01", + "2020-05-31", + "2020-02-29", + None, + "1999-12-01", + "2000-12-21", + None, + "1900-02-28", + "1800-03-14", + "2100-03-10", + "1970-04-1", + "1970-01-01", + "1969-12-11", + "2020-12-31", + ] + ], +) +@pytest.mark.parametrize("dtype", ["datetime64[ns]"]) +def test_is_quarter_start(data, dtype): + # Series + ps = pd.Series(data, dtype=dtype) + gs = cudf.from_pandas(ps) + + expect = ps.dt.is_quarter_start + got = gs.dt.is_quarter_start + + assert_eq(expect, got) + + +@pytest.mark.parametrize( + "data", + [ + [ + "2020-05-01", + "2020-05-31", + "2020-02-29", + None, + "1999-12-01", + "2000-12-21", + None, + "1900-02-28", + "1800-03-14", + "2100-03-10", + "1970-04-1", + "1970-01-01", + 
"1969-12-11", + "2020-12-31", + ] + ], +) +@pytest.mark.parametrize("dtype", ["datetime64[ns]"]) +def test_is_quarter_end(data, dtype): + # Series + ps = pd.Series(data, dtype=dtype) + gs = cudf.from_pandas(ps) + + expect = ps.dt.is_quarter_end + got = gs.dt.is_quarter_end + + assert_eq(expect, got) + + +def test_error_values(): + s = cudf.Series([1, 2, 3], dtype="datetime64[ns]") + with pytest.raises( + NotImplementedError, + match="DateTime Arrays is not yet implemented in cudf", + ): + s.values diff --git a/python/cudf/cudf/tests/test_decimal.py b/python/cudf/cudf/tests/test_decimal.py index d2de44b0c8f..51f05e1b876 100644 --- a/python/cudf/cudf/tests/test_decimal.py +++ b/python/cudf/cudf/tests/test_decimal.py @@ -9,7 +9,7 @@ import cudf from cudf.core.column import Decimal32Column, Decimal64Column, NumericalColumn -from cudf.core.dtypes import Decimal64Dtype +from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype from cudf.testing._utils import ( FLOAT_TYPES, INTEGER_TYPES, @@ -24,7 +24,7 @@ [1], [-1], [1, 2, 3, 4], - [42, 1729, 4104], + [42, 17, 41], [1, 2, None, 4], [None, None, None], [], @@ -164,21 +164,43 @@ def test_typecast_from_int_to_decimal(data, from_dtype, to_dtype): ) @pytest.mark.parametrize( "from_dtype", - [Decimal64Dtype(7, 2), Decimal64Dtype(11, 4), Decimal64Dtype(18, 10)], + [ + Decimal64Dtype(7, 2), + Decimal64Dtype(11, 4), + Decimal64Dtype(18, 10), + Decimal32Dtype(7, 2), + Decimal32Dtype(5, 3), + Decimal32Dtype(9, 5), + ], ) @pytest.mark.parametrize( "to_dtype", - [Decimal64Dtype(7, 2), Decimal64Dtype(18, 10), Decimal64Dtype(11, 4)], + [ + Decimal64Dtype(7, 2), + Decimal64Dtype(18, 10), + Decimal64Dtype(11, 4), + Decimal32Dtype(7, 2), + Decimal32Dtype(9, 5), + Decimal32Dtype(5, 3), + ], ) def test_typecast_to_from_decimal(data, from_dtype, to_dtype): - got = data.astype(from_dtype) + if from_dtype.scale > to_dtype.MAX_PRECISION: + pytest.skip( + "This is supposed to overflow because the representation value in " + "the source exceeds the max representable in destination dtype." 
+ ) + s = data.astype(from_dtype) - pa_arr = got.to_arrow().cast( + pa_arr = s.to_arrow().cast( pa.decimal128(to_dtype.precision, to_dtype.scale), safe=False ) - expected = cudf.Series(Decimal64Column.from_arrow(pa_arr)) + if isinstance(to_dtype, Decimal32Dtype): + expected = cudf.Series(Decimal32Column.from_arrow(pa_arr)) + elif isinstance(to_dtype, Decimal64Dtype): + expected = cudf.Series(Decimal64Column.from_arrow(pa_arr)) - got = got.astype(to_dtype) + got = s.astype(to_dtype) assert_eq(got, expected) @@ -347,3 +369,11 @@ def test_serialize_decimal_columns(data): df = cudf.DataFrame(data) recreated = df.__class__.deserialize(*df.serialize()) assert_eq(recreated, df) + + +def test_decimal_invalid_precision(): + with pytest.raises(pa.ArrowInvalid): + _ = cudf.Series([10, 20, 30], dtype=cudf.Decimal64Dtype(2, 2)) + + with pytest.raises(pa.ArrowInvalid): + _ = cudf.Series([Decimal("300")], dtype=cudf.Decimal64Dtype(2, 1)) diff --git a/python/cudf/cudf/tests/test_dtypes.py b/python/cudf/cudf/tests/test_dtypes.py index 41d7f5d215e..ee6cc7b6df6 100644 --- a/python/cudf/cudf/tests/test_dtypes.py +++ b/python/cudf/cudf/tests/test_dtypes.py @@ -257,3 +257,62 @@ def test_lists_of_structs_dtype(data): assert_column_array_dtype_equal(got._column, expected) assert expected.equals(got._column.to_arrow()) + + +@pytest.mark.parametrize( + "in_dtype,expect", + [ + (np.dtype("int8"), np.dtype("int8")), + (np.int8, np.dtype("int8")), + (np.float16, np.dtype("float32")), + (pd.Int8Dtype(), np.dtype("int8")), + (pd.StringDtype(), np.dtype("object")), + ("int8", np.dtype("int8")), + ("boolean", np.dtype("bool")), + ("bool_", np.dtype("bool")), + (np.bool_, np.dtype("bool")), + (int, np.dtype("int64")), + (float, np.dtype("float64")), + (cudf.ListDtype("int64"), cudf.ListDtype("int64")), + ("float16", np.dtype("float32")), + (np.dtype("U"), np.dtype("object")), + ("timedelta64", np.dtype("= min_size: + if (cudf.dtype(int_dtype).itemsize * 8) >= min_size: if np.iinfo(int_dtype).min <= x <= np.iinfo(int_dtype).max: return int_dtype # resort to using `int64` and let numpy raise appropriate exception: @@ -350,7 +364,7 @@ def min_unsigned_type(x, min_size=8): that can represent the integer ``x`` """ for int_dtype in np.sctypes["uint"]: - if (np.dtype(int_dtype).itemsize * 8) >= min_size: + if (cudf.dtype(int_dtype).itemsize * 8) >= min_size: if 0 <= x <= np.iinfo(int_dtype).max: return int_dtype # resort to using `uint64` and let numpy raise appropriate exception: @@ -374,47 +388,22 @@ def min_column_type(x, expected_type): max_bound_dtype = np.min_scalar_type(x.max()) min_bound_dtype = np.min_scalar_type(x.min()) result_type = np.promote_types(max_bound_dtype, min_bound_dtype) - if result_type == np.dtype("float16"): - # cuDF does not support float16 dtype - result_type = np.dtype("float32") - return result_type - if np.issubdtype(expected_type, np.integer): + elif np.issubdtype(expected_type, np.integer): max_bound_dtype = np.min_scalar_type(x.max()) min_bound_dtype = np.min_scalar_type(x.min()) - return np.promote_types(max_bound_dtype, min_bound_dtype) + result_type = np.promote_types(max_bound_dtype, min_bound_dtype) + else: + result_type = x.dtype - return x.dtype + return cudf.dtype(result_type) def get_min_float_dtype(col): max_bound_dtype = np.min_scalar_type(float(col.max())) min_bound_dtype = np.min_scalar_type(float(col.min())) result_type = np.promote_types(max_bound_dtype, min_bound_dtype) - if result_type == np.dtype("float16"): - # cuDF does not support float16 dtype - result_type = 
np.dtype("float32") - return result_type - - -def check_cast_unsupported_dtype(dtype): - if is_categorical_dtype(dtype): - return dtype - - if isinstance(dtype, pd.core.arrays.numpy_.PandasDtype): - dtype = dtype.numpy_dtype - else: - dtype = np.dtype(dtype) - - if dtype in cudf._lib.types.np_to_cudf_types: - return dtype - - if dtype == np.dtype("float16"): - return np.dtype("float32") - - raise NotImplementedError( - f"Cannot cast {dtype} dtype, as it is not supported by CuDF." - ) + return cudf.dtype(result_type) def is_mixed_with_object_dtype(lhs, rhs): @@ -438,7 +427,7 @@ def get_time_unit(obj): def _get_nan_for_dtype(dtype): - dtype = np.dtype(dtype) + dtype = cudf.dtype(dtype) if pd.api.types.is_datetime64_dtype( dtype ) or pd.api.types.is_timedelta64_dtype(dtype): @@ -536,7 +525,7 @@ def find_common_type(dtypes): [dtype for dtype in dtypes if is_decimal_dtype(dtype)] ) else: - return np.dtype("O") + return cudf.dtype("O") # Corner case 1: # Resort to np.result_type to handle "M" and "m" types separately @@ -553,11 +542,7 @@ def find_common_type(dtypes): dtypes.add(np.result_type(*td_dtypes)) common_dtype = np.find_common_type(list(dtypes), []) - if common_dtype == np.dtype("float16"): - # cuDF does not support float16 dtype - return np.dtype("float32") - else: - return common_dtype + return cudf.dtype(common_dtype) def _can_cast(from_dtype, to_dtype): @@ -567,10 +552,12 @@ def _can_cast(from_dtype, to_dtype): `np.can_cast` but with some special handling around cudf specific dtypes. """ + if from_dtype in {None, cudf.NA}: + return True if isinstance(from_dtype, type): - from_dtype = np.dtype(from_dtype) + from_dtype = cudf.dtype(from_dtype) if isinstance(to_dtype, type): - to_dtype = np.dtype(to_dtype) + to_dtype = cudf.dtype(to_dtype) # TODO : Add precision & scale checking for # decimal types in future diff --git a/python/cudf/cudf/utils/gpu_utils.py b/python/cudf/cudf/utils/gpu_utils.py index 4bd19720151..77963f8bcc1 100644 --- a/python/cudf/cudf/utils/gpu_utils.py +++ b/python/cudf/cudf/utils/gpu_utils.py @@ -139,6 +139,15 @@ def _try_get_old_or_new_symbols(): # CUDA Driver Version Check: # Driver Runtime version is >= Runtime version pass + elif ( + cuda_driver_supported_rt_version >= 11000 + and cuda_runtime_version >= 11000 + ): + # With cuda enhanced compatibitlity any code compiled + # with 11.x version of cuda can now run on any + # driver >= 450.80.02. 11000 is the minimum cuda + # version 450.80.02 supports. + pass else: from cudf.errors import UnSupportedCUDAError diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 2aaea8435e0..1927ef96e6f 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -67,8 +67,8 @@ See Also -------- -cudf.io.csv.read_csv -cudf.io.json.read_json +cudf.read_csv +cudf.read_json """.format( remote_data_sources=_docstring_remote_sources ) @@ -175,7 +175,7 @@ -------- cudf.io.parquet.read_parquet_metadata cudf.io.parquet.to_parquet -cudf.io.orc.read_orc +cudf.read_orc """.format( remote_data_sources=_docstring_remote_sources ) @@ -217,7 +217,7 @@ See Also -------- cudf.io.parquet.read_parquet -cudf.io.orc.read_orc +cudf.read_orc """ doc_to_parquet = docfmt_partial(docstring=_docstring_to_parquet) @@ -256,6 +256,12 @@ Number of stripes List of column names +Notes +----- +Support for reading files with struct columns is currently experimental, +the output may not be as reliable as reading for other datatypes. 
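A small sketch of the dtype canonicalization that the find_common_type and get_min_float_dtype simplifications above lean on; the float16-to-float32 mapping is assumed from the dtype alias tests earlier in this patch.

    import numpy as np
    import cudf
    from cudf.utils.dtypes import find_common_type

    # cudf.dtype maps the unsupported float16 onto float32 ...
    assert cudf.dtype("float16") == np.dtype("float32")
    # ... so the common-type helpers no longer need an explicit special case.
    assert find_common_type([np.dtype("float16"), np.dtype("int8")]) == np.dtype("float32")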
+{remote_data_sources} + Examples -------- >>> import cudf @@ -270,7 +276,7 @@ See Also -------- -cudf.io.orc.read_orc +cudf.read_orc """ doc_read_orc_metadata = docfmt_partial(docstring=_docstring_read_orc_metadata) @@ -296,7 +302,7 @@ See Also -------- -cudf.io.orc.read_orc +cudf.read_orc """ doc_read_orc_statistics = docfmt_partial( docstring=_docstring_read_orc_statistics @@ -385,7 +391,7 @@ See Also -------- -cudf.io.orc.read_orc +cudf.read_orc """ doc_to_orc = docfmt_partial(docstring=_docstring_to_orc) @@ -687,7 +693,7 @@ See Also -------- -cudf.io.hdf.read_hdf : Read from HDF file. +cudf.read_hdf : Read from HDF file. cudf.io.parquet.to_parquet : Write a DataFrame to the binary parquet format. cudf.io.feather.to_feather : Write out feather-format for DataFrames. """ @@ -898,7 +904,7 @@ See Also -------- -cudf.io.csv.to_csv +cudf.to_csv """.format( remote_data_sources=_docstring_remote_sources ) @@ -963,7 +969,7 @@ See Also -------- -cudf.io.csv.read_csv +cudf.read_csv """ doc_to_csv = docfmt_partial( docstring=_docstring_to_csv.format( diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index 209f61ad399..c9d38c8399e 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -17,7 +17,7 @@ from cudf.utils.dtypes import to_cudf_compatible_scalar # The size of the mask in bytes -mask_dtype = np.dtype(np.int32) +mask_dtype = cudf.dtype(np.int32) mask_bitsize = mask_dtype.itemsize * 8 @@ -42,10 +42,7 @@ def scalar_broadcast_to(scalar, size, dtype=None): if isinstance(size, (tuple, list)): size = size[0] - if scalar is None or ( - isinstance(scalar, (np.datetime64, np.timedelta64)) - and np.isnat(scalar) - ): + if cudf._lib.scalar._is_null_host_scalar(scalar): if dtype is None: dtype = "object" return column.column_empty(size, dtype=dtype, masked=True) @@ -70,7 +67,7 @@ def scalar_broadcast_to(scalar, size, dtype=None): scalar = to_cudf_compatible_scalar(scalar, dtype=dtype) dtype = scalar.dtype - if np.dtype(dtype).kind in ("O", "U"): + if cudf.dtype(dtype).kind in ("O", "U"): gather_map = column.full(size, 0, dtype="int32") scalar_str_col = column.as_column([scalar], dtype="str") return scalar_str_col[gather_map] diff --git a/python/cudf/requirements/cuda-11.0/dev_requirements.txt b/python/cudf/requirements/cuda-11.0/dev_requirements.txt index efb22ddd5a4..f69c246832b 100644 --- a/python/cudf/requirements/cuda-11.0/dev_requirements.txt +++ b/python/cudf/requirements/cuda-11.0/dev_requirements.txt @@ -23,6 +23,7 @@ packaging pandas>=1.0,<1.3.0dev0 pandoc==2.0a4 protobuf +pydata-sphinx-theme pyorc pytest pytest-benchmark @@ -33,7 +34,6 @@ setuptools sphinx sphinx-copybutton sphinx-markdown-tables -sphinx_rtd_theme sphinxcontrib-websupport transformers typing_extensions diff --git a/python/cudf/requirements/cuda-11.2/dev_requirements.txt b/python/cudf/requirements/cuda-11.2/dev_requirements.txt index cb88f74399f..e55dc2f921a 100644 --- a/python/cudf/requirements/cuda-11.2/dev_requirements.txt +++ b/python/cudf/requirements/cuda-11.2/dev_requirements.txt @@ -23,6 +23,7 @@ packaging pandas>=1.0,<1.3.0dev0 pandoc==2.0a4 protobuf +pydata-sphinx-theme pyorc pytest pytest-benchmark @@ -33,7 +34,6 @@ setuptools sphinx sphinx-copybutton sphinx-markdown-tables -sphinx_rtd_theme sphinxcontrib-websupport transformers typing_extensions diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index 53543b9e886..c0204190957 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ 
b/python/dask_cudf/dask_cudf/backends.py @@ -1,5 +1,7 @@ # Copyright (c) 2020-2021, NVIDIA CORPORATION. +from collections.abc import Iterator + import cupy as cp import numpy as np import pandas as pd @@ -51,8 +53,8 @@ def _nonempty_index(idx): data = np.array([start, "1970-01-02"], dtype=idx.dtype) values = cudf.core.column.as_column(data) return cudf.core.index.DatetimeIndex(values, name=idx.name) - elif isinstance(idx, cudf.core.index.StringIndex): - return cudf.core.index.StringIndex(["cat", "dog"], name=idx.name) + elif isinstance(idx, cudf.StringIndex): + return cudf.StringIndex(["cat", "dog"], name=idx.name) elif isinstance(idx, cudf.core.index.CategoricalIndex): key = tuple(idx._data.keys()) assert len(key) == 1 @@ -67,10 +69,10 @@ def _nonempty_index(idx): return cudf.core.index.GenericIndex( np.arange(2, dtype=idx.dtype), name=idx.name ) - elif isinstance(idx, cudf.core.MultiIndex): + elif isinstance(idx, cudf.core.multiindex.MultiIndex): levels = [meta_nonempty(lev) for lev in idx.levels] codes = [[0, 0] for i in idx.levels] - return cudf.core.MultiIndex( + return cudf.core.multiindex.MultiIndex( levels=levels, codes=codes, names=idx.names ) @@ -256,6 +258,52 @@ def is_categorical_dtype_cudf(obj): return cudf.utils.dtypes.is_categorical_dtype(obj) +try: + from dask.dataframe.dispatch import percentile_dispatch + + @percentile_dispatch.register((cudf.Series, cp.ndarray, cudf.Index)) + def percentile_cudf(a, q, interpolation="linear"): + # Cudf dispatch to the equivalent of `np.percentile`: + # https://numpy.org/doc/stable/reference/generated/numpy.percentile.html + a = cudf.Series(a) + # a is series. + n = len(a) + if not len(a): + return None, n + if isinstance(q, Iterator): + q = list(q) + + if cudf.utils.dtypes.is_categorical_dtype(a.dtype): + result = cp.percentile(a.cat.codes, q, interpolation=interpolation) + + return ( + pd.Categorical.from_codes( + result, a.dtype.categories, a.dtype.ordered + ), + n, + ) + if np.issubdtype(a.dtype, np.datetime64): + result = a.quantile( + [i / 100.0 for i in q], interpolation=interpolation + ) + + if q[0] == 0: + # https://github.com/dask/dask/issues/6864 + result[0] = min(result[0], a.min()) + return result.to_pandas(), n + if not np.issubdtype(a.dtype, np.number): + interpolation = "nearest" + return ( + a.quantile( + [i / 100.0 for i in q], interpolation=interpolation + ).to_pandas(), + n, + ) + + +except ImportError: + pass + try: from dask.dataframe.dispatch import union_categoricals_dispatch diff --git a/python/dask_cudf/dask_cudf/groupby.py b/python/dask_cudf/dask_cudf/groupby.py index 2ec457018d9..600d6cc7412 100644 --- a/python/dask_cudf/dask_cudf/groupby.py +++ b/python/dask_cudf/dask_cudf/groupby.py @@ -16,6 +16,8 @@ from dask.dataframe.groupby import DataFrameGroupBy, SeriesGroupBy from dask.highlevelgraph import HighLevelGraph +import cudf + class CudfDataFrameGroupBy(DataFrameGroupBy): def __init__(self, *args, **kwargs): @@ -71,15 +73,28 @@ def aggregate(self, arg, split_every=None, split_out=1): "min", "max", "collect", + "first", + "last", } if ( isinstance(self.obj, DaskDataFrame) - and isinstance(self.index, (str, list)) + and ( + isinstance(self.index, str) + or ( + isinstance(self.index, list) + and all(isinstance(x, str) for x in self.index) + ) + ) and _is_supported(arg, _supported) ): + if isinstance(self._meta.grouping.keys, cudf.MultiIndex): + keys = self._meta.grouping.keys.names + else: + keys = self._meta.grouping.keys.name + return groupby_agg( self.obj, - self.index, + keys, arg, 
split_every=split_every, split_out=split_out, @@ -127,7 +142,10 @@ def aggregate(self, arg, split_every=None, split_out=1): "min", "max", "collect", + "first", + "last", } + if ( isinstance(self.obj, DaskDataFrame) and isinstance(self.index, (str, list)) @@ -165,7 +183,16 @@ def groupby_agg( This aggregation algorithm only supports the following options: - {"count", "mean", "std", "var", "sum", "min", "max", "collect"} + - "count" + - "mean" + - "std" + - "var" + - "sum" + - "min" + - "max" + - "collect" + - "first" + - "last" This "optimized" approach is more performant than the algorithm in `dask.dataframe`, because it allows the cudf backend to @@ -208,6 +235,8 @@ def groupby_agg( "min", "max", "collect", + "first", + "last", } if not _is_supported(aggs, _supported): raise ValueError( diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index 510b5730169..0ac0af2842b 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -7,7 +7,7 @@ from pyarrow import parquet as pq from dask import dataframe as dd -from dask.dataframe.io.parquet.arrow import ArrowEngine +from dask.dataframe.io.parquet.arrow import ArrowDatasetEngine try: from dask.dataframe.io.parquet import ( @@ -19,12 +19,20 @@ import cudf from cudf.core.column import as_column, build_categorical_column from cudf.io import write_to_dataset +from cudf.utils.dtypes import cudf_dtype_from_pa_type -class CudfEngine(ArrowEngine): +class CudfEngine(ArrowDatasetEngine): @staticmethod def read_metadata(*args, **kwargs): - meta, stats, parts, index = ArrowEngine.read_metadata(*args, **kwargs) + meta, stats, parts, index = ArrowDatasetEngine.read_metadata( + *args, **kwargs + ) + if parts: + # Re-set "object" dtypes align with pa schema + set_object_dtypes_from_pa_schema( + meta, parts[0].get("common_kwargs", {}).get("schema", None), + ) # If `strings_to_categorical==True`, convert objects to int32 strings_to_cats = kwargs.get("strings_to_categorical", False) @@ -59,7 +67,6 @@ def read_partition( pieces = [pieces] strings_to_cats = kwargs.get("strings_to_categorical", False) - if len(pieces) > 1: paths = [] @@ -72,6 +79,9 @@ def read_partition( rgs.append(None) else: (path, row_group, partition_keys) = piece + + row_group = None if row_group == [None] else row_group + paths.append(path) rgs.append( [row_group] @@ -96,6 +106,7 @@ def read_partition( partition_keys = [] else: (path, row_group, partition_keys) = pieces[0] + row_group = None if row_group == [None] else row_group if cudf.utils.ioutils._is_local_filesystem(fs): df = cudf.read_parquet( @@ -117,6 +128,9 @@ def read_partition( **kwargs.get("read", {}), ) + # Re-set "object" dtypes align with pa schema + set_object_dtypes_from_pa_schema(df, kwargs.get("schema", None)) + if index and (index[0] in df.columns): df = df.set_index(index[0]) elif index is False and set(df.index.names).issubset(columns): @@ -127,17 +141,22 @@ def read_partition( if partition_keys: if partitions is None: raise ValueError("Must pass partition sets") + for i, (name, index2) in enumerate(partition_keys): - categories = [ - val.as_py() for val in partitions.levels[i].dictionary - ] - col = as_column(index2).as_frame().repeat(len(df))._data[None] + # Build the column from `codes` directly + # (since the category is often a larger dtype) + codes = ( + as_column(partitions[i].keys.index(index2)) + .as_frame() + .repeat(len(df)) + ._data[None] + ) df[name] = build_categorical_column( - categories=categories, - 
codes=as_column(col.base_data, dtype=col.dtype), - size=col.size, - offset=col.offset, + categories=partitions[i].keys, + codes=codes, + size=codes.size, + offset=codes.offset, ordered=False, ) @@ -233,6 +252,18 @@ def aggregate_metadata(cls, meta_list, fs, out_path): return meta +def set_object_dtypes_from_pa_schema(df, schema): + # Simple utility to modify cudf DataFrame + # "object" dtypes to agree with a specific + # pyarrow schema. + if schema: + for name in df.columns: + if name in schema.names and df[name].dtype == "O": + df[name] = df[name].astype( + cudf_dtype_from_pa_type(schema.field(name).type) + ) + + def read_parquet( path, columns=None, @@ -243,9 +274,9 @@ def read_parquet( """ Read parquet files into a Dask DataFrame Calls ``dask.dataframe.read_parquet`` to cordinate the execution of - ``cudf.read_parquet``, and ultimately read multiple partitions into a - single Dask dataframe. The Dask version must supply an ``ArrowEngine`` - class to support full functionality. + ``cudf.read_parquet``, and ultimately read multiple partitions into + a single Dask dataframe. The Dask version must supply an + ``ArrowDatasetEngine`` class to support full functionality. See ``cudf.read_parquet`` and Dask documentation for further details. Examples diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py index 740a2d48ce2..a5492bc5fc0 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py @@ -455,17 +455,24 @@ def test_create_metadata_file_inconsistent_schema(tmpdir): p1 = os.path.join(tmpdir, "part.1.parquet") df1.to_parquet(p1, engine="pyarrow") - with pytest.raises(RuntimeError): - # Pyarrow will fail to aggregate metadata - # if gather_statistics=True - dask_cudf.read_parquet(str(tmpdir), gather_statistics=True,).compute() + # New pyarrow-dataset base can handle an inconsistent + # schema (even without a _metadata file), but computing + # and dtype validation may fail + ddf1 = dask_cudf.read_parquet(str(tmpdir), gather_statistics=True) # Add global metadata file. # Dask-CuDF can do this without requiring schema - # consistency. Once the _metadata file is avaible, - # parsing metadata should no longer be a problem + # consistency. dask_cudf.io.parquet.create_metadata_file([p0, p1]) - # Check that we can now read the ddf + # Check that we can still read the ddf # with the _metadata file present - dask_cudf.read_parquet(str(tmpdir), gather_statistics=True,).compute() + ddf2 = dask_cudf.read_parquet(str(tmpdir), gather_statistics=True) + + # Check that the result is the same with and + # without the _metadata file. 
Note that we must + # call `compute` on `ddf1`, because the dtype of + # the inconsistent column ("a") may be "object" + # before computing, and "int" after + dd.assert_eq(ddf1.compute(), ddf2) + dd.assert_eq(ddf1.compute(), ddf2.compute()) diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index cf5203a22e5..ace9701b677 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ -59,7 +59,7 @@ def test_from_cudf_with_generic_idx(): ddf = dgd.from_cudf(cdf, npartitions=2) - assert isinstance(ddf.index.compute(), cudf.core.index.GenericIndex) + assert isinstance(ddf.index.compute(), cudf.RangeIndex) dd.assert_eq(ddf.loc[1:2, ["a"]], cdf.loc[1:2, ["a"]]) diff --git a/python/dask_cudf/dask_cudf/tests/test_groupby.py b/python/dask_cudf/dask_cudf/tests/test_groupby.py index 84de32952e5..61fa32b76ed 100644 --- a/python/dask_cudf/dask_cudf/tests/test_groupby.py +++ b/python/dask_cudf/dask_cudf/tests/test_groupby.py @@ -594,3 +594,54 @@ def test_groupby_unique_lists(): dd.assert_eq( gdf.groupby("a").b.unique(), gddf.groupby("a").b.unique().compute(), ) + + +@pytest.mark.parametrize( + "data", + [ + {"a": [], "b": []}, + {"a": [2, 1, 2, 1, 1, 3], "b": [None, 1, 2, None, 2, None]}, + {"a": [None], "b": [None]}, + {"a": [2, 1, 1], "b": [None, 1, 0], "c": [None, 0, 1]}, + ], +) +@pytest.mark.parametrize("agg", ["first", "last"]) +def test_groupby_first_last(data, agg): + pdf = pd.DataFrame(data) + gdf = cudf.DataFrame.from_pandas(pdf) + + ddf = dd.from_pandas(pdf, npartitions=2) + gddf = dask_cudf.from_cudf(gdf, npartitions=2) + + dd.assert_eq( + ddf.groupby("a").agg(agg).compute(), + gddf.groupby("a").agg(agg).compute(), + ) + + dd.assert_eq( + getattr(ddf.groupby("a"), agg)().compute(), + getattr(gddf.groupby("a"), agg)().compute(), + ) + + dd.assert_eq( + gdf.groupby("a").agg(agg), gddf.groupby("a").agg(agg).compute() + ) + + dd.assert_eq( + getattr(gdf.groupby("a"), agg)(), + getattr(gddf.groupby("a"), agg)().compute(), + ) + + +def test_groupby_with_list_of_series(): + df = cudf.DataFrame({"a": [1, 2, 3, 4, 5]}) + gdf = dask_cudf.from_cudf(df, npartitions=2) + gs = cudf.Series([1, 1, 1, 2, 2], name="id") + ggs = dask_cudf.from_cudf(gs, npartitions=2) + + ddf = dd.from_pandas(df.to_pandas(), npartitions=2) + pgs = dd.from_pandas(gs.to_pandas(), npartitions=2) + + dd.assert_eq( + gdf.groupby([ggs]).agg(["sum"]), ddf.groupby([pgs]).agg(["sum"]) + )