From b2423ac13e0a8687987e7c82bead694fc7fcb061 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Mon, 30 Aug 2021 14:20:09 -0700 Subject: [PATCH 1/8] Consolidate Several Series and Dataframe Methods (#9059) Partly addresses #9038 This PR consolidates several (trivial) functions from `Series` and `DataFrame` into `Frame`. `__invert__` was consolidated to a shared (more efficient) code path using factory methods. `deserialize` was not consolidated because we have to provide backward compatibility to older classes, but the `_from_data` factory method is now used there for faster class construction. Authors: - Michael Wang (https://github.com/isVoid) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Sheilah Kirui (https://github.com/skirui-source) URL: https://github.com/rapidsai/cudf/pull/9059 --- python/cudf/cudf/core/dataframe.py | 123 +------------- python/cudf/cudf/core/frame.py | 250 +++++++++++++++++++++++++++++ python/cudf/cudf/core/series.py | 208 +----------------------- 3 files changed, 262 insertions(+), 319 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 0d833a7d341..a739eba71f3 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -35,7 +35,6 @@ from cudf.core.index import BaseIndex, RangeIndex, as_index from cudf.core.indexing import _DataFrameIlocIndexer, _DataFrameLocIndexer from cudf.core.series import Series -from cudf.core.window import Rolling from cudf.utils import applyutils, docutils, ioutils, queryutils, utils from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import ( @@ -526,11 +525,12 @@ def serialize(self): # Use the column directly to avoid duplicating the index # need to pickle column names to handle numpy integer columns - header["column_names"] = pickle.dumps(tuple(self._data.names)) - column_header, column_frames = column.serialize_columns(self._columns) - header["columns"] = column_header + header["columns"], column_frames = 
column.serialize_columns( + self._columns + ) frames.extend(column_frames) + header["column_names"] = pickle.dumps(tuple(self._data.names)) return header, frames @classmethod @@ -547,7 +547,7 @@ def deserialize(cls, header, frames): column_names = pickle.loads(header["column_names"]) columns = column.deserialize_columns(header["columns"], column_frames) - return cls(dict(zip(column_names, columns)), index=index) + return cls._from_data(dict(zip(column_names, columns)), index=index,) @property def dtypes(self): @@ -1029,68 +1029,6 @@ def assign(self, **kwargs): new[k] = v return new - def head(self, n=5): - """ - Returns the first n rows as a new DataFrame - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame() - >>> df['key'] = [0, 1, 2, 3, 4] - >>> df['val'] = [float(i + 10) for i in range(5)] # insert column - >>> df.head(2) - key val - 0 0 10.0 - 1 1 11.0 - """ - return self.iloc[:n] - - def tail(self, n=5): - """ - Returns the last n rows as a new DataFrame - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame() - >>> df['key'] = [0, 1, 2, 3, 4] - >>> df['val'] = [float(i + 10) for i in range(5)] # insert column - >>> df.tail(2) - key val - 3 3 13.0 - 4 4 14.0 - """ - if n == 0: - return self.iloc[0:0] - - return self.iloc[-n:] - - def to_string(self): - """ - Convert to string - - cuDF uses Pandas internals for efficient string formatting. - Set formatting options using pandas string formatting options and - cuDF objects will print identically to Pandas objects. - - cuDF supports `null/None` as a value in any column type, which - is transparently supported during this output process. 
- - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame() - >>> df['key'] = [0, 1, 2] - >>> df['val'] = [float(i + 10) for i in range(3)] - >>> df.to_string() - ' key val\\n0 0 10.0\\n1 1 11.0\\n2 2 12.0' - """ - return self.__repr__() - - def __str__(self): - return self.to_string() - def astype(self, dtype, copy=False, errors="raise", **kwargs): """ Cast the DataFrame to the given dtype @@ -1644,14 +1582,6 @@ def update( self._mimic_inplace(source_df, inplace=True) - def __invert__(self): - # Defer logic to Series since pandas semantics dictate different - # behaviors for different types that requires too much special casing - # of the standard _unaryop. - return DataFrame( - data={col: ~self[col] for col in self}, index=self.index - ) - def radd(self, other, axis=1, level=None, fill_value=None): """ Get Addition of dataframe and other, element-wise (binary @@ -3505,15 +3435,6 @@ def rename( else: return out.copy(deep=copy) - def nans_to_nulls(self): - """ - Convert nans (if any) to nulls. - """ - df = self.copy() - for col in df.columns: - df[col] = df[col].nans_to_nulls() - return df - def as_gpu_matrix(self, columns=None, order="F"): """Convert to a matrix in device memory. @@ -4506,19 +4427,6 @@ def groupby( sort=sort, ) - @copy_docstring(Rolling) - def rolling( - self, window, min_periods=None, center=False, axis=0, win_type=None - ): - return Rolling( - self, - window, - min_periods=min_periods, - center=center, - axis=axis, - win_type=win_type, - ) - def query(self, expr, local_dict=None): """ Query with a boolean expression using Numba to compile a GPU kernel. 
@@ -6732,27 +6640,6 @@ def to_feather(self, path, *args, **kwargs): feather.to_feather(self, path, *args, **kwargs) - @ioutils.doc_to_json() - def to_json(self, path_or_buf=None, *args, **kwargs): - """{docstring}""" - from cudf.io import json as json - - return json.to_json(self, path_or_buf=path_or_buf, *args, **kwargs) - - @ioutils.doc_to_hdf() - def to_hdf(self, path_or_buf, key, *args, **kwargs): - """{docstring}""" - from cudf.io import hdf as hdf - - hdf.to_hdf(path_or_buf, key, self, *args, **kwargs) - - @ioutils.doc_to_dlpack() - def to_dlpack(self): - """{docstring}""" - from cudf.io import dlpack as dlpack - - return dlpack.to_dlpack(self) - @ioutils.doc_dataframe_to_csv() def to_csv( self, diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 4f46794aa3f..b6eb3108550 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -27,6 +27,8 @@ ) from cudf.core.column_accessor import ColumnAccessor from cudf.core.join import merge +from cudf.core.window import Rolling +from cudf.utils import ioutils from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import ( _is_non_decimal_numeric_dtype, @@ -4523,6 +4525,242 @@ def cumprod(self, axis=None, skipna=True, *args, **kwargs): "prod", axis=axis, skipna=skipna, cast_to_int=True, *args, **kwargs ) + @ioutils.doc_to_json() + def to_json(self, path_or_buf=None, *args, **kwargs): + """{docstring}""" + + return cudf.io.json.to_json( + self, path_or_buf=path_or_buf, *args, **kwargs + ) + + @ioutils.doc_to_hdf() + def to_hdf(self, path_or_buf, key, *args, **kwargs): + """{docstring}""" + + cudf.io.hdf.to_hdf(path_or_buf, key, self, *args, **kwargs) + + @ioutils.doc_to_dlpack() + def to_dlpack(self): + """{docstring}""" + + return cudf.io.dlpack.to_dlpack(self) + + def to_string(self): + """ + Convert to string + + cuDF uses Pandas internals for efficient string formatting. 
+ Set formatting options using pandas string formatting options and + cuDF objects will print identically to Pandas objects. + + cuDF supports `null/None` as a value in any column type, which + is transparently supported during this output process. + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame() + >>> df['key'] = [0, 1, 2] + >>> df['val'] = [float(i + 10) for i in range(3)] + >>> df.to_string() + ' key val\\n0 0 10.0\\n1 1 11.0\\n2 2 12.0' + """ + return self.__repr__() + + def __str__(self): + return self.to_string() + + def head(self, n=5): + """ + Return the first `n` rows. + This function returns the first `n` rows for the object based + on position. It is useful for quickly testing if your object + has the right type of data in it. + For negative values of `n`, this function returns all rows except + the last `n` rows, equivalent to ``df[:-n]``. + + Parameters + ---------- + n : int, default 5 + Number of rows to select. + + Returns + ------- + DataFrame or Series + The first `n` rows of the caller object. + + See Also + -------- + Frame.tail: Returns the last `n` rows. + + Examples + -------- + + **Series** + + >>> ser = cudf.Series(['alligator', 'bee', 'falcon', + ... 
'lion', 'monkey', 'parrot', 'shark', 'whale', 'zebra']) + >>> ser + 0 alligator + 1 bee + 2 falcon + 3 lion + 4 monkey + 5 parrot + 6 shark + 7 whale + 8 zebra + dtype: object + + Viewing the first 5 lines + + >>> ser.head() + 0 alligator + 1 bee + 2 falcon + 3 lion + 4 monkey + dtype: object + + Viewing the first `n` lines (three in this case) + + >>> ser.head(3) + 0 alligator + 1 bee + 2 falcon + dtype: object + + For negative values of `n` + + >>> ser.head(-3) + 0 alligator + 1 bee + 2 falcon + 3 lion + 4 monkey + 5 parrot + dtype: object + + **DataFrame** + + >>> df = cudf.DataFrame() + >>> df['key'] = [0, 1, 2, 3, 4] + >>> df['val'] = [float(i + 10) for i in range(5)] # insert column + >>> df.head(2) + key val + 0 0 10.0 + 1 1 11.0 + """ + return self.iloc[:n] + + def tail(self, n=5): + """ + Returns the last n rows as a new DataFrame or Series + + Examples + -------- + + **DataFrame** + + >>> import cudf + >>> df = cudf.DataFrame() + >>> df['key'] = [0, 1, 2, 3, 4] + >>> df['val'] = [float(i + 10) for i in range(5)] # insert column + >>> df.tail(2) + key val + 3 3 13.0 + 4 4 14.0 + + **Series** + + >>> import cudf + >>> ser = cudf.Series([4, 3, 2, 1, 0]) + >>> ser.tail(2) + 3 1 + 4 0 + """ + if n == 0: + return self.iloc[0:0] + + return self.iloc[-n:] + + @copy_docstring(Rolling) + def rolling( + self, window, min_periods=None, center=False, axis=0, win_type=None + ): + return Rolling( + self, + window, + min_periods=min_periods, + center=center, + axis=axis, + win_type=win_type, + ) + + def nans_to_nulls(self): + """ + Convert nans (if any) to nulls + + Returns + ------- + DataFrame or Series + + Examples + -------- + + **Series** + + >>> import cudf, numpy as np + >>> series = cudf.Series([1, 2, np.nan, None, 10], nan_as_null=False) + >>> series + 0 1.0 + 1 2.0 + 2 NaN + 3 + 4 10.0 + dtype: float64 + >>> series.nans_to_nulls() + 0 1.0 + 1 2.0 + 2 + 3 + 4 10.0 + dtype: float64 + + **DataFrame** + + >>> df = cudf.DataFrame() + >>> df['a'] = cudf.Series([1, 
None, np.nan], nan_as_null=False) + >>> df['b'] = cudf.Series([None, 3.14, np.nan], nan_as_null=False) + >>> df + a b + 0 1.0 + 1 3.14 + 2 NaN NaN + >>> df.nans_to_nulls() + a b + 0 1.0 + 1 3.14 + 2 + """ + return self._from_data( + { + name: col.copy().nans_to_nulls() + for name, col in self._data.items() + }, + self._index, + ) + + def __invert__(self): + """Bitwise invert (~) for integral dtypes, logical NOT for bools.""" + return self._from_data( + { + name: _apply_inverse_column(col) + for name, col in self._data.items() + }, + self._index, + ) + class SingleColumnFrame(Frame): """A one-dimensional frame. @@ -5251,3 +5489,15 @@ def _drop_rows_by_labels( return res else: return obj.join(key_df, how="leftanti") + + +def _apply_inverse_column(col: ColumnBase) -> ColumnBase: + """Bitwise invert (~) for integral dtypes, logical NOT for bools.""" + if np.issubdtype(col.dtype, np.integer): + return col.unary_operator("invert") + elif np.issubdtype(col.dtype, np.bool_): + return col.unary_operator("not") + else: + raise TypeError( + f"Operation `~` not supported on {col.dtype.type.__name__}" + ) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index ff3b9fc68ef..7943d033cf8 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -41,8 +41,7 @@ from cudf.core.groupby.groupby import SeriesGroupBy from cudf.core.index import BaseIndex, Index, RangeIndex, as_index from cudf.core.indexing import _SeriesIlocIndexer, _SeriesLocIndexer -from cudf.core.window import Rolling -from cudf.utils import cudautils, docutils, ioutils +from cudf.utils import cudautils, docutils from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import ( can_convert_to_column, @@ -329,15 +328,16 @@ def from_pandas(cls, s, nan_as_null=None): def serialize(self): header = {} frames = [] + header["type-serialized"] = pickle.dumps(type(self)) header["index"], index_frames = self._index.serialize() - header["name"] = 
pickle.dumps(self.name) - frames.extend(index_frames) header["index_frame_count"] = len(index_frames) + frames.extend(index_frames) + header["column"], column_frames = self._column.serialize() - header["type-serialized"] = pickle.dumps(type(self)) - frames.extend(column_frames) header["column_frame_count"] = len(column_frames) + frames.extend(column_frames) + header["name"] = pickle.dumps(self.name) return header, frames @property @@ -381,7 +381,7 @@ def deserialize(cls, header, frames): col_typ = pickle.loads(header["column"]["type-serialized"]) column = col_typ.deserialize(header["column"], frames[:column_nframes]) - return Series(column, index=index, name=name) + return cls._from_data({name: column}, index=index) def _get_columns_by_label(self, labels, downcast=False): """Return the column specified by `labels` @@ -1094,124 +1094,6 @@ def take(self, indices, keep_index=True): {self.name: self._column.take(col_inds, keep_index=False)} ) - def head(self, n=5): - """ - Return the first `n` rows. - This function returns the first `n` rows for the object based - on position. It is useful for quickly testing if your object - has the right type of data in it. - For negative values of `n`, this function returns all rows except - the last `n` rows, equivalent to ``df[:-n]``. - - Parameters - ---------- - n : int, default 5 - Number of rows to select. - - Returns - ------- - same type as caller - The first `n` rows of the caller object. - - See Also - -------- - Series.tail: Returns the last `n` rows. - - Examples - -------- - >>> ser = cudf.Series(['alligator', 'bee', 'falcon', - ... 
'lion', 'monkey', 'parrot', 'shark', 'whale', 'zebra']) - >>> ser - 0 alligator - 1 bee - 2 falcon - 3 lion - 4 monkey - 5 parrot - 6 shark - 7 whale - 8 zebra - dtype: object - - Viewing the first 5 lines - - >>> ser.head() - 0 alligator - 1 bee - 2 falcon - 3 lion - 4 monkey - dtype: object - - Viewing the first `n` lines (three in this case) - - >>> ser.head(3) - 0 alligator - 1 bee - 2 falcon - dtype: object - - For negative values of `n` - - >>> ser.head(-3) - 0 alligator - 1 bee - 2 falcon - 3 lion - 4 monkey - 5 parrot - dtype: object - """ - return self.iloc[:n] - - def tail(self, n=5): - """ - Returns the last n rows as a new Series - - Examples - -------- - >>> import cudf - >>> ser = cudf.Series([4, 3, 2, 1, 0]) - >>> ser.tail(2) - 3 1 - 4 0 - """ - if n == 0: - return self.iloc[0:0] - - return self.iloc[-n:] - - def to_string(self): - """Convert to string - - Uses Pandas formatting internals to produce output identical to Pandas. - Use the Pandas formatting settings directly in Pandas to control cuDF - output. 
- - Returns - ------- - str - String representation of Series - - Examples - -------- - >>> import cudf - >>> series = cudf.Series(['a', None, 'b', 'c', None]) - >>> series - 0 a - 1 - 2 b - 3 c - 4 - dtype: object - >>> series.to_string() - '0 a\\n1 \\n2 b\\n3 c\\n4 \\ndtype: object' - """ # noqa : E501 - return self.__repr__() - - def __str__(self): - return self.to_string() - def __repr__(self): _, height = get_terminal_size() max_rows = ( @@ -2332,17 +2214,6 @@ def ge(self, other, fill_value=None, axis=0): other=other, fn="ge", fill_value=fill_value, can_reindex=True ) - def __invert__(self): - """Bitwise invert (~) for integral dtypes, logical NOT for bools.""" - if np.issubdtype(self.dtype, np.integer): - return self._unaryop("invert") - elif np.issubdtype(self.dtype, np.bool_): - return self._unaryop("not") - else: - raise TypeError( - f"Operation `~` not supported on {self.dtype.type.__name__}" - ) - @copy_docstring(CategoricalAccessor) # type: ignore @property def cat(self): @@ -2693,38 +2564,6 @@ def to_array(self, fillna=None): """ return self._column.to_array(fillna=fillna) - def nans_to_nulls(self): - """ - Convert nans (if any) to nulls - - Returns - ------- - Series - - Examples - -------- - >>> import cudf - >>> import numpy as np - >>> series = cudf.Series([1, 2, np.nan, None, 10], nan_as_null=False) - >>> series - 0 1.0 - 1 2.0 - 2 NaN - 3 - 4 10.0 - dtype: float64 - >>> series.nans_to_nulls() - 0 1.0 - 1 2.0 - 2 - 3 - 4 10.0 - dtype: float64 - """ - return self._from_data( - {self.name: self._column.nans_to_nulls()}, self._index - ) - def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): if bool_only not in (None, True): raise NotImplementedError( @@ -4931,39 +4770,6 @@ def groupby( self, by=by, level=level, dropna=dropna, sort=sort ) - @copy_docstring(Rolling) - def rolling( - self, window, min_periods=None, center=False, axis=0, win_type=None - ): - return Rolling( - self, - window, - min_periods=min_periods, - 
center=center, - axis=axis, - win_type=win_type, - ) - - @ioutils.doc_to_json() - def to_json(self, path_or_buf=None, *args, **kwargs): - """{docstring}""" - - return cudf.io.json.to_json( - self, path_or_buf=path_or_buf, *args, **kwargs - ) - - @ioutils.doc_to_hdf() - def to_hdf(self, path_or_buf, key, *args, **kwargs): - """{docstring}""" - - cudf.io.hdf.to_hdf(path_or_buf, key, self, *args, **kwargs) - - @ioutils.doc_to_dlpack() - def to_dlpack(self): - """{docstring}""" - - return cudf.io.dlpack.to_dlpack(self) - def rename(self, index=None, copy=True): """ Alter Series name From cc0c8e0abdb777026389e88f7ce9b5f6285c20f8 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 30 Aug 2021 19:03:08 -0500 Subject: [PATCH 2/8] Add IO docs page in `cudf` documentation (#9145) Fixes: #9138 This PR fixes missing IO api docs by having a dedicated page for IO APIs. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - https://github.com/brandon-b-miller - Ashwin Srinath (https://github.com/shwina) URL: https://github.com/rapidsai/cudf/pull/9145 --- docs/cudf/source/api_docs/index.rst | 1 + docs/cudf/source/api_docs/io.rst | 72 +++++++++++++++++++++++++++++ python/cudf/cudf/utils/ioutils.py | 2 +- 3 files changed, 74 insertions(+), 1 deletion(-) create mode 100644 docs/cudf/source/api_docs/io.rst diff --git a/docs/cudf/source/api_docs/index.rst b/docs/cudf/source/api_docs/index.rst index 70b9563fc1d..960608d8f3c 100644 --- a/docs/cudf/source/api_docs/index.rst +++ b/docs/cudf/source/api_docs/index.rst @@ -16,4 +16,5 @@ This page provides a list of all publicly accessible modules, methods and classe general_functions general_utilities window + io diff --git a/docs/cudf/source/api_docs/io.rst b/docs/cudf/source/api_docs/io.rst new file mode 100644 index 00000000000..4e73531e174 --- /dev/null +++ b/docs/cudf/source/api_docs/io.rst @@ -0,0 +1,72 @@ +.. _api.io: + +============ +Input/output +============ +.. currentmodule:: cudf + +CSV +~~~ +.. 
autosummary:: + :toctree: api/ + + read_csv + DataFrame.to_csv + + +.. currentmodule:: cudf.io.json + +JSON +~~~~ +.. autosummary:: + :toctree: api/ + + read_json + to_json + +.. currentmodule:: cudf + +Parquet +~~~~~~~ +.. autosummary:: + :toctree: api/ + + read_parquet + DataFrame.to_parquet + +ORC +~~~ +.. autosummary:: + :toctree: api/ + + read_orc + DataFrame.to_orc + +.. currentmodule:: cudf + +HDFStore: PyTables (HDF5) +~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + read_hdf + DataFrame.to_hdf + +.. warning:: + + HDF readers and writers are not GPU accelerated. These currently use CPU via Pandas. + This may be GPU accelerated in the future. + +Feather +~~~~~~~ +.. autosummary:: + :toctree: api/ + + read_feather + DataFrame.to_feather + +.. warning:: + + Feather readers and writers are not GPU accelerated. These currently use CPU via Pandas. + This may be GPU accelerated in the future. + diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 1927ef96e6f..af91db6a9e6 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -904,7 +904,7 @@ See Also -------- -cudf.to_csv +cudf.io.csv.to_csv """.format( remote_data_sources=_docstring_remote_sources ) From 4ad09aa80a4ae69139a8ada71e791d9b81127414 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Mon, 30 Aug 2021 22:13:33 -0500 Subject: [PATCH 3/8] Simplify read_avro by removing unnecessary reader/impl classes (#9090) Depends on #9040 Authors: - Christopher Harris (https://github.com/cwharris) Approvers: - Nghia Truong (https://github.com/ttnghia) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/9090 --- cpp/include/cudf/io/detail/avro.hpp | 56 ++---- cpp/src/io/avro/avro_gpu.cu | 50 ++--- cpp/src/io/avro/avro_gpu.h | 8 +- cpp/src/io/avro/reader_impl.cu | 294 ++++++++++++++-------------- cpp/src/io/avro/reader_impl.hpp | 117 ----------- cpp/src/io/functions.cpp | 6 +- 6 files changed, 
188 insertions(+), 343 deletions(-) delete mode 100644 cpp/src/io/avro/reader_impl.hpp diff --git a/cpp/include/cudf/io/detail/avro.hpp b/cpp/include/cudf/io/detail/avro.hpp index 306c15dcb72..62d97081b75 100644 --- a/cpp/include/cudf/io/detail/avro.hpp +++ b/cpp/include/cudf/io/detail/avro.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,11 +14,6 @@ * limitations under the License. */ -/** - * @file avro.hpp - * @brief cuDF-IO reader classes API - */ - #pragma once #include @@ -29,44 +24,23 @@ namespace cudf { namespace io { namespace detail { namespace avro { + /** - * @brief Class to read Avro dataset data into columns. + * @brief Reads the entire dataset. + * + * @param source Input `datasource` object to read the dataset from + * @param options Settings for controlling reading behavior + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource to use for device memory allocation + * + * @return The set of columns along with table metadata */ -class reader { - private: - class impl; - std::unique_ptr _impl; - - public: - /** - * @brief Constructor from an array of datasources - * - * @param sources Input `datasource` objects to read the dataset from - * @param options Settings for controlling reading behavior - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource to use for device memory allocation - */ - explicit reader(std::vector>&& sources, - avro_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); - - /** - * @brief Destructor explicitly-declared to avoid inlined in header - */ - ~reader(); +table_with_metadata read_avro( + std::unique_ptr&& source, + avro_reader_options 
const& options, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - /** - * @brief Reads the entire dataset. - * - * @param options Settings for controlling reading behavior - * @param stream CUDA stream used for device memory operations and kernel launches. - * - * @return The set of columns along with table metadata - */ - table_with_metadata read(avro_reader_options const& options, - rmm::cuda_stream_view stream = rmm::cuda_stream_default); -}; } // namespace avro } // namespace detail } // namespace io diff --git a/cpp/src/io/avro/avro_gpu.cu b/cpp/src/io/avro/avro_gpu.cu index 6fabcf00b8f..cb1c32458a3 100644 --- a/cpp/src/io/avro/avro_gpu.cu +++ b/cpp/src/io/avro/avro_gpu.cu @@ -65,14 +65,15 @@ static inline int64_t __device__ avro_decode_zigzag_varint(const uint8_t*& cur, * * @return data pointer at the end of the row (start of next row) */ -static const uint8_t* __device__ avro_decode_row(const schemadesc_s* schema, - schemadesc_s* schema_g, - uint32_t schema_len, - size_t row, - size_t max_rows, - const uint8_t* cur, - const uint8_t* end, - device_span global_dictionary) +static uint8_t const* __device__ +avro_decode_row(schemadesc_s const* schema, + schemadesc_s* schema_g, + uint32_t schema_len, + size_t row, + size_t max_rows, + uint8_t const* cur, + uint8_t const* end, + device_span global_dictionary) { uint32_t array_start = 0, array_repeat_count = 0; int array_children = 0; @@ -220,7 +221,6 @@ static const uint8_t* __device__ avro_decode_row(const schemadesc_s* schema, * @param[in] schema Schema description * @param[in] global_Dictionary Global dictionary entries * @param[in] avro_data Raw block data - * @param[in] num_blocks Number of blocks * @param[in] schema_len Number of entries in schema * @param[in] min_row_size Minimum size in bytes of a row * @param[in] max_rows Maximum number of rows to load @@ -228,11 +228,10 @@ static const uint8_t* __device__ 
avro_decode_row(const schemadesc_s* schema, */ // blockDim {32,num_warps,1} extern "C" __global__ void __launch_bounds__(num_warps * 32, 2) - gpuDecodeAvroColumnData(block_desc_s* blocks, + gpuDecodeAvroColumnData(device_span blocks, schemadesc_s* schema_g, - device_span global_dictionary, - const uint8_t* avro_data, - uint32_t num_blocks, + device_span global_dictionary, + uint8_t const* avro_data, uint32_t schema_len, uint32_t min_row_size, size_t max_rows, @@ -258,9 +257,9 @@ extern "C" __global__ void __launch_bounds__(num_warps * 32, 2) } else { schema = schema_g; } - if (block_id < num_blocks and threadIdx.x == 0) { *blk = blocks[block_id]; } + if (block_id < blocks.size() and threadIdx.x == 0) { *blk = blocks[block_id]; } __syncthreads(); - if (block_id >= num_blocks) { return; } + if (block_id >= blocks.size()) { return; } cur_row = blk->first_row; rows_remaining = blk->num_rows; cur = avro_data + blk->offset; @@ -304,18 +303,16 @@ extern "C" __global__ void __launch_bounds__(num_warps * 32, 2) * @param[in] schema Schema description * @param[in] global_dictionary Global dictionary entries * @param[in] avro_data Raw block data - * @param[in] num_blocks Number of blocks * @param[in] schema_len Number of entries in schema * @param[in] max_rows Maximum number of rows to load * @param[in] first_row Crop all rows below first_row * @param[in] min_row_size Minimum size in bytes of a row * @param[in] stream CUDA stream to use, default 0 */ -void DecodeAvroColumnData(block_desc_s* blocks, +void DecodeAvroColumnData(device_span blocks, schemadesc_s* schema, - device_span global_dictionary, - const uint8_t* avro_data, - uint32_t num_blocks, + device_span global_dictionary, + uint8_t const* avro_data, uint32_t schema_len, size_t max_rows, size_t first_row, @@ -325,17 +322,10 @@ void DecodeAvroColumnData(block_desc_s* blocks, // num_warps warps per threadblock dim3 const dim_block(32, num_warps); // 1 warp per datablock, num_warps datablocks per threadblock - dim3 const 
dim_grid((num_blocks + num_warps - 1) / num_warps, 1); + dim3 const dim_grid((blocks.size() + num_warps - 1) / num_warps, 1); - gpuDecodeAvroColumnData<<>>(blocks, - schema, - global_dictionary, - avro_data, - num_blocks, - schema_len, - min_row_size, - max_rows, - first_row); + gpuDecodeAvroColumnData<<>>( + blocks, schema, global_dictionary, avro_data, schema_len, min_row_size, max_rows, first_row); } } // namespace gpu diff --git a/cpp/src/io/avro/avro_gpu.h b/cpp/src/io/avro/avro_gpu.h index a895d1bea02..c87ac8afb13 100644 --- a/cpp/src/io/avro/avro_gpu.h +++ b/cpp/src/io/avro/avro_gpu.h @@ -43,18 +43,16 @@ struct schemadesc_s { * @param[in] schema Schema description * @param[in] global_dictionary Global dictionary entries * @param[in] avro_data Raw block data - * @param[in] num_blocks Number of blocks * @param[in] schema_len Number of entries in schema * @param[in] max_rows Maximum number of rows to load * @param[in] first_row Crop all rows below first_row * @param[in] min_row_size Minimum size in bytes of a row * @param[in] stream CUDA stream to use, default 0 */ -void DecodeAvroColumnData(block_desc_s* blocks, +void DecodeAvroColumnData(cudf::device_span blocks, schemadesc_s* schema, - cudf::device_span global_dictionary, - const uint8_t* avro_data, - uint32_t num_blocks, + cudf::device_span global_dictionary, + uint8_t const* avro_data, uint32_t schema_len, size_t max_rows = ~0, size_t first_row = 0, diff --git a/cpp/src/io/avro/reader_impl.cu b/cpp/src/io/avro/reader_impl.cu index 08ea96139a1..aa3bab2d877 100644 --- a/cpp/src/io/avro/reader_impl.cu +++ b/cpp/src/io/avro/reader_impl.cu @@ -14,30 +14,38 @@ * limitations under the License. 
*/ -/** - * @file reader_impl.cu - * @brief cuDF-IO Avro reader class implementation - */ - -#include "reader_impl.hpp" +#include "avro.h" +#include "avro_gpu.h" #include +#include +#include #include +#include +#include +#include #include #include +#include #include #include #include #include +#include +#include +#include +#include + using cudf::device_span; namespace cudf { namespace io { namespace detail { namespace avro { + // Import functionality that's independent of legacy code using namespace cudf::io::avro; using namespace cudf::io; @@ -46,7 +54,7 @@ namespace { /** * @brief Function that translates Avro data kind to cuDF type enum */ -type_id to_type_id(const avro::schema_entry* col) +type_id to_type_id(avro::schema_entry const* col) { switch (col->kind) { case avro::type_boolean: return type_id::BOOL8; @@ -79,7 +87,7 @@ class metadata : public file_metadata { */ void init_and_select_rows(int& row_start, int& row_count) { - const auto buffer = source->host_read(0, source->size()); + auto const buffer = source->host_read(0, source->size()); avro::container pod(buffer->data(), buffer->size()); CUDF_EXPECTS(pod.parse(this, row_count, row_start), "Cannot parse metadata"); row_start = skip_rows; @@ -97,10 +105,10 @@ class metadata : public file_metadata { { std::vector> selection; - const auto num_avro_columns = static_cast(columns.size()); + auto const num_avro_columns = static_cast(columns.size()); if (!use_names.empty()) { int index = 0; - for (const auto& use_name : use_names) { + for (auto const& use_name : use_names) { for (int i = 0; i < num_avro_columns; ++i, ++index) { if (index >= num_avro_columns) { index = 0; } if (columns[index].name == use_name && @@ -138,25 +146,28 @@ class metadata : public file_metadata { datasource* const source; }; -rmm::device_buffer reader::impl::decompress_data(const rmm::device_buffer& comp_block_data, - rmm::cuda_stream_view stream) +rmm::device_buffer decompress_data(datasource& source, + metadata& meta, + 
rmm::device_buffer const& comp_block_data, + rmm::cuda_stream_view stream) { size_t uncompressed_data_size = 0; - hostdevice_vector inflate_in(_metadata->block_list.size()); - hostdevice_vector inflate_out(_metadata->block_list.size()); - if (_metadata->codec == "deflate") { + auto inflate_in = hostdevice_vector(meta.block_list.size()); + auto inflate_out = hostdevice_vector(meta.block_list.size()); + + if (meta.codec == "deflate") { // Guess an initial maximum uncompressed block size - uint32_t initial_blk_len = (_metadata->max_block_size * 2 + 0xfff) & ~0xfff; - uncompressed_data_size = initial_blk_len * _metadata->block_list.size(); + uint32_t initial_blk_len = (meta.max_block_size * 2 + 0xfff) & ~0xfff; + uncompressed_data_size = initial_blk_len * meta.block_list.size(); for (size_t i = 0; i < inflate_in.size(); ++i) { inflate_in[i].dstSize = initial_blk_len; } - } else if (_metadata->codec == "snappy") { + } else if (meta.codec == "snappy") { // Extract the uncompressed length from the snappy stream - for (size_t i = 0; i < _metadata->block_list.size(); i++) { - const auto buffer = _source->host_read(_metadata->block_list[i].offset, 4); - const uint8_t* blk = buffer->data(); + for (size_t i = 0; i < meta.block_list.size(); i++) { + auto const buffer = source.host_read(meta.block_list[i].offset, 4); + uint8_t const* blk = buffer->data(); uint32_t blk_len = blk[0]; if (blk_len > 0x7f) { blk_len = (blk_len & 0x7f) | (blk[1] << 7); @@ -174,28 +185,28 @@ rmm::device_buffer reader::impl::decompress_data(const rmm::device_buffer& comp_ rmm::device_buffer decomp_block_data(uncompressed_data_size, stream); - const auto base_offset = _metadata->block_list[0].offset; - for (size_t i = 0, dst_pos = 0; i < _metadata->block_list.size(); i++) { - const auto src_pos = _metadata->block_list[i].offset - base_offset; + auto const base_offset = meta.block_list[0].offset; + for (size_t i = 0, dst_pos = 0; i < meta.block_list.size(); i++) { + auto const src_pos = 
meta.block_list[i].offset - base_offset; - inflate_in[i].srcDevice = static_cast(comp_block_data.data()) + src_pos; - inflate_in[i].srcSize = _metadata->block_list[i].size; + inflate_in[i].srcDevice = static_cast(comp_block_data.data()) + src_pos; + inflate_in[i].srcSize = meta.block_list[i].size; inflate_in[i].dstDevice = static_cast(decomp_block_data.data()) + dst_pos; // Update blocks offsets & sizes to refer to uncompressed data - _metadata->block_list[i].offset = dst_pos; - _metadata->block_list[i].size = static_cast(inflate_in[i].dstSize); - dst_pos += _metadata->block_list[i].size; + meta.block_list[i].offset = dst_pos; + meta.block_list[i].size = static_cast(inflate_in[i].dstSize); + dst_pos += meta.block_list[i].size; } for (int loop_cnt = 0; loop_cnt < 2; loop_cnt++) { inflate_in.host_to_device(stream); CUDA_TRY( cudaMemsetAsync(inflate_out.device_ptr(), 0, inflate_out.memory_size(), stream.value())); - if (_metadata->codec == "deflate") { + if (meta.codec == "deflate") { CUDA_TRY(gpuinflate( inflate_in.device_ptr(), inflate_out.device_ptr(), inflate_in.size(), 0, stream)); - } else if (_metadata->codec == "snappy") { + } else if (meta.codec == "snappy") { CUDA_TRY( gpu_unsnap(inflate_in.device_ptr(), inflate_out.device_ptr(), inflate_in.size(), stream)); } else { @@ -204,9 +215,9 @@ rmm::device_buffer reader::impl::decompress_data(const rmm::device_buffer& comp_ inflate_out.device_to_host(stream, true); // Check if larger output is required, as it's not known ahead of time - if (_metadata->codec == "deflate" && !loop_cnt) { + if (meta.codec == "deflate" && !loop_cnt) { size_t actual_uncompressed_size = 0; - for (size_t i = 0; i < _metadata->block_list.size(); i++) { + for (size_t i = 0; i < meta.block_list.size(); i++) { // If error status is 1 (buffer too small), the `bytes_written` field // is actually contains the uncompressed data size if (inflate_out[i].status == 1 && inflate_out[i].bytes_written > inflate_in[i].dstSize) { @@ -216,13 +227,13 @@ 
rmm::device_buffer reader::impl::decompress_data(const rmm::device_buffer& comp_ } if (actual_uncompressed_size > uncompressed_data_size) { decomp_block_data.resize(actual_uncompressed_size, stream); - for (size_t i = 0, dst_pos = 0; i < _metadata->block_list.size(); i++) { + for (size_t i = 0, dst_pos = 0; i < meta.block_list.size(); i++) { auto dst_base = static_cast(decomp_block_data.data()); inflate_in[i].dstDevice = dst_base + dst_pos; - _metadata->block_list[i].offset = dst_pos; - _metadata->block_list[i].size = static_cast(inflate_in[i].dstSize); - dst_pos += _metadata->block_list[i].size; + meta.block_list[i].offset = dst_pos; + meta.block_list[i].size = static_cast(inflate_in[i].dstSize); + dst_pos += meta.block_list[i].size; } } else { break; @@ -235,28 +246,40 @@ rmm::device_buffer reader::impl::decompress_data(const rmm::device_buffer& comp_ return decomp_block_data; } -void reader::impl::decode_data(const rmm::device_buffer& block_data, - const std::vector>& dict, - device_span global_dictionary, - size_t num_rows, - std::vector> selection, - std::vector& out_buffers, - rmm::cuda_stream_view stream) +std::vector decode_data(metadata& meta, + rmm::device_buffer const& block_data, + std::vector> const& dict, + device_span global_dictionary, + size_t num_rows, + std::vector> const& selection, + std::vector const& column_types, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { + auto out_buffers = std::vector(); + + for (size_t i = 0; i < column_types.size(); ++i) { + auto col_idx = selection[i].first; + bool is_nullable = (meta.columns[col_idx].schema_null_idx >= 0); + out_buffers.emplace_back(column_types[i], num_rows, is_nullable, stream, mr); + } + // Build gpu schema - hostdevice_vector schema_desc(_metadata->schema.size()); + auto schema_desc = hostdevice_vector(meta.schema.size()); + uint32_t min_row_data_size = 0; int skip_field_cnt = 0; - for (size_t i = 0; i < _metadata->schema.size(); i++) { - type_kind_e kind = 
_metadata->schema[i].kind; + + for (size_t i = 0; i < meta.schema.size(); i++) { + type_kind_e kind = meta.schema[i].kind; if (skip_field_cnt != 0) { // Exclude union and array members from min_row_data_size - skip_field_cnt += _metadata->schema[i].num_children - 1; + skip_field_cnt += meta.schema[i].num_children - 1; } else { switch (kind) { case type_union: case type_array: - skip_field_cnt = _metadata->schema[i].num_children; + skip_field_cnt = meta.schema[i].num_children; // fall through case type_boolean: case type_int: @@ -269,21 +292,21 @@ void reader::impl::decode_data(const rmm::device_buffer& block_data, default: break; } } - if (kind == type_enum && !_metadata->schema[i].symbols.size()) { kind = type_int; } - schema_desc[i].kind = kind; - schema_desc[i].count = (kind == type_enum) ? 0 : (uint32_t)_metadata->schema[i].num_children; + if (kind == type_enum && !meta.schema[i].symbols.size()) { kind = type_int; } + schema_desc[i].kind = kind; + schema_desc[i].count = + (kind == type_enum) ? 
0 : static_cast(meta.schema[i].num_children); schema_desc[i].dataptr = nullptr; - CUDF_EXPECTS( - kind != type_union || _metadata->schema[i].num_children < 2 || - (_metadata->schema[i].num_children == 2 && (_metadata->schema[i + 1].kind == type_null || - _metadata->schema[i + 2].kind == type_null)), - "Union with non-null type not currently supported"); + CUDF_EXPECTS(kind != type_union || meta.schema[i].num_children < 2 || + (meta.schema[i].num_children == 2 && + (meta.schema[i + 1].kind == type_null || meta.schema[i + 2].kind == type_null)), + "Union with non-null type not currently supported"); } std::vector valid_alias(out_buffers.size(), nullptr); for (size_t i = 0; i < out_buffers.size(); i++) { - const auto col_idx = selection[i].first; - int schema_data_idx = _metadata->columns[col_idx].schema_data_idx; - int schema_null_idx = _metadata->columns[col_idx].schema_null_idx; + auto const col_idx = selection[i].first; + int schema_data_idx = meta.columns[col_idx].schema_data_idx; + int schema_null_idx = meta.columns[col_idx].schema_null_idx; schema_desc[schema_data_idx].dataptr = out_buffers[i].data(); if (schema_null_idx >= 0) { @@ -293,25 +316,25 @@ void reader::impl::decode_data(const rmm::device_buffer& block_data, valid_alias[i] = schema_desc[schema_null_idx].dataptr; } } - if (_metadata->schema[schema_data_idx].kind == type_enum) { + if (meta.schema[schema_data_idx].kind == type_enum) { schema_desc[schema_data_idx].count = dict[i].first; } if (out_buffers[i].null_mask_size()) { cudf::detail::set_null_mask(out_buffers[i].null_mask(), 0, num_rows, true, stream); } } - rmm::device_buffer block_list( - _metadata->block_list.data(), _metadata->block_list.size() * sizeof(block_desc_s), stream); + + auto block_list = cudf::detail::make_device_uvector_async(meta.block_list, stream); + schema_desc.host_to_device(stream); - gpu::DecodeAvroColumnData(static_cast(block_list.data()), + gpu::DecodeAvroColumnData(block_list, schema_desc.device_ptr(), global_dictionary, - 
static_cast(block_data.data()), - static_cast(_metadata->block_list.size()), + static_cast(block_data.data()), static_cast(schema_desc.size()), - _metadata->num_rows, - _metadata->skip_rows, + meta.num_rows, + meta.skip_rows, min_row_data_size, stream); @@ -328,23 +351,18 @@ void reader::impl::decode_data(const rmm::device_buffer& block_data, schema_desc.device_to_host(stream, true); for (size_t i = 0; i < out_buffers.size(); i++) { - const auto col_idx = selection[i].first; - const auto schema_null_idx = _metadata->columns[col_idx].schema_null_idx; + auto const col_idx = selection[i].first; + auto const schema_null_idx = meta.columns[col_idx].schema_null_idx; out_buffers[i].null_count() = (schema_null_idx >= 0) ? schema_desc[schema_null_idx].count : 0; } -} -reader::impl::impl(std::unique_ptr source, - avro_reader_options const& options, - rmm::mr::device_memory_resource* mr) - : _mr(mr), _source(std::move(source)), _columns(options.get_columns()) -{ - // Open the source Avro dataset metadata - _metadata = std::make_unique(_source.get()); + return out_buffers; } -table_with_metadata reader::impl::read(avro_reader_options const& options, - rmm::cuda_stream_view stream) +table_with_metadata read_avro(std::unique_ptr&& source, + avro_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto skip_rows = options.get_skip_rows(); auto num_rows = options.get_num_rows(); @@ -352,70 +370,76 @@ table_with_metadata reader::impl::read(avro_reader_options const& options, std::vector> out_columns; table_metadata metadata_out; + // Open the source Avro dataset metadata + auto meta = metadata(source.get()); + // Select and read partial metadata / schema within the subset of rows - _metadata->init_and_select_rows(skip_rows, num_rows); + meta.init_and_select_rows(skip_rows, num_rows); // Select only columns required by the options - auto selected_columns = _metadata->select_columns(_columns); + auto selected_columns = 
meta.select_columns(options.get_columns()); if (selected_columns.size() != 0) { // Get a list of column data types std::vector column_types; - for (const auto& col : selected_columns) { - auto& col_schema = _metadata->schema[_metadata->columns[col.first].schema_data_idx]; + for (auto const& col : selected_columns) { + auto& col_schema = meta.schema[meta.columns[col.first].schema_data_idx]; auto col_type = to_type_id(&col_schema); CUDF_EXPECTS(col_type != type_id::EMPTY, "Unknown type"); column_types.emplace_back(col_type); } - if (_metadata->total_data_size > 0) { + if (meta.total_data_size > 0) { rmm::device_buffer block_data; - if (_source->is_device_read_preferred(_metadata->total_data_size)) { - block_data = rmm::device_buffer{_metadata->total_data_size, stream}; - auto read_bytes = _source->device_read(_metadata->block_list[0].offset, - _metadata->total_data_size, - static_cast(block_data.data()), - stream); + if (source->is_device_read_preferred(meta.total_data_size)) { + block_data = rmm::device_buffer{meta.total_data_size, stream}; + auto read_bytes = source->device_read(meta.block_list[0].offset, + meta.total_data_size, + static_cast(block_data.data()), + stream); block_data.resize(read_bytes, stream); } else { - const auto buffer = - _source->host_read(_metadata->block_list[0].offset, _metadata->total_data_size); - block_data = rmm::device_buffer{buffer->data(), buffer->size(), stream}; + auto const buffer = source->host_read(meta.block_list[0].offset, meta.total_data_size); + block_data = rmm::device_buffer{buffer->data(), buffer->size(), stream}; } - if (_metadata->codec != "" && _metadata->codec != "null") { - auto decomp_block_data = decompress_data(block_data, stream); + if (meta.codec != "" && meta.codec != "null") { + auto decomp_block_data = decompress_data(*source, meta, block_data, stream); block_data = std::move(decomp_block_data); } else { - auto dst_ofs = _metadata->block_list[0].offset; - for (size_t i = 0; i < _metadata->block_list.size(); 
i++) { - _metadata->block_list[i].offset -= dst_ofs; + auto dst_ofs = meta.block_list[0].offset; + for (size_t i = 0; i < meta.block_list.size(); i++) { + meta.block_list[i].offset -= dst_ofs; } } size_t total_dictionary_entries = 0; size_t dictionary_data_size = 0; - std::vector> dict(column_types.size()); + + auto dict = std::vector>(column_types.size()); + for (size_t i = 0; i < column_types.size(); ++i) { auto col_idx = selected_columns[i].first; - auto& col_schema = _metadata->schema[_metadata->columns[col_idx].schema_data_idx]; + auto& col_schema = meta.schema[meta.columns[col_idx].schema_data_idx]; dict[i].first = static_cast(total_dictionary_entries); dict[i].second = static_cast(col_schema.symbols.size()); total_dictionary_entries += dict[i].second; - for (const auto& sym : col_schema.symbols) { + for (auto const& sym : col_schema.symbols) { dictionary_data_size += sym.length(); } } - rmm::device_uvector d_global_dict(total_dictionary_entries, stream); - rmm::device_uvector d_global_dict_data(dictionary_data_size, stream); + auto d_global_dict = rmm::device_uvector(0, stream); + auto d_global_dict_data = rmm::device_uvector(0, stream); + if (total_dictionary_entries > 0) { - std::vector h_global_dict(total_dictionary_entries); - std::vector h_global_dict_data(dictionary_data_size); - size_t dict_pos = 0; + auto h_global_dict = std::vector(total_dictionary_entries); + auto h_global_dict_data = std::vector(dictionary_data_size); + size_t dict_pos = 0; + for (size_t i = 0; i < column_types.size(); ++i) { - auto const col_idx = selected_columns[i].first; - auto const& col_schema = _metadata->schema[_metadata->columns[col_idx].schema_data_idx]; + auto const col_idx = selected_columns[i].first; + auto const& col_schema = meta.schema[meta.columns[col_idx].schema_data_idx]; auto const col_dict_entries = &(h_global_dict[dict[i].first]); for (size_t j = 0; j < dict[i].second; j++) { auto const& symbols = col_schema.symbols[j]; @@ -430,30 +454,24 @@ 
table_with_metadata reader::impl::read(avro_reader_options const& options, } } - CUDA_TRY(cudaMemcpyAsync(d_global_dict.data(), - h_global_dict.data(), - h_global_dict.size() * sizeof(string_index_pair), - cudaMemcpyDefault, - stream.value())); - CUDA_TRY(cudaMemcpyAsync(d_global_dict_data.data(), - h_global_dict_data.data(), - h_global_dict_data.size() * sizeof(char), - cudaMemcpyDefault, - stream.value())); - stream.synchronize(); - } + d_global_dict = cudf::detail::make_device_uvector_async(h_global_dict, stream); + d_global_dict_data = cudf::detail::make_device_uvector_async(h_global_dict_data, stream); - std::vector out_buffers; - for (size_t i = 0; i < column_types.size(); ++i) { - auto col_idx = selected_columns[i].first; - bool is_nullable = (_metadata->columns[col_idx].schema_null_idx >= 0); - out_buffers.emplace_back(column_types[i], num_rows, is_nullable, stream, _mr); + stream.synchronize(); } - decode_data(block_data, dict, d_global_dict, num_rows, selected_columns, out_buffers, stream); + auto out_buffers = decode_data(meta, + block_data, + dict, + d_global_dict, + num_rows, + selected_columns, + column_types, + stream, + mr); for (size_t i = 0; i < column_types.size(); ++i) { - out_columns.emplace_back(make_column(out_buffers[i], nullptr, stream, _mr)); + out_columns.emplace_back(make_column(out_buffers[i], nullptr, stream, mr)); } } else { // Create empty columns @@ -469,29 +487,11 @@ table_with_metadata reader::impl::read(avro_reader_options const& options, metadata_out.column_names[i] = selected_columns[i].second; } // Return user metadata - metadata_out.user_data = _metadata->user_data; + metadata_out.user_data = meta.user_data; return {std::make_unique(std::move(out_columns)), std::move(metadata_out)}; } -// Forward to implementation -reader::reader(std::vector>&& sources, - avro_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - CUDF_EXPECTS(sources.size() == 1, "Only a single source is 
currently supported."); - _impl = std::make_unique(std::move(sources[0]), options, mr); -} - -// Destructor within this translation unit -reader::~reader() = default; - -// Forward to implementation -table_with_metadata reader::read(avro_reader_options const& options, rmm::cuda_stream_view stream) -{ - return _impl->read(options, stream); -} } // namespace avro } // namespace detail } // namespace io diff --git a/cpp/src/io/avro/reader_impl.hpp b/cpp/src/io/avro/reader_impl.hpp deleted file mode 100644 index 9af32ed88a0..00000000000 --- a/cpp/src/io/avro/reader_impl.hpp +++ /dev/null @@ -1,117 +0,0 @@ -/* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * @file reader_impl.hpp - * @brief cuDF-IO Avro reader class implementation header - */ - -#pragma once - -#include "avro.h" -#include "avro_gpu.h" - -#include -#include -#include - -#include -#include - -#include - -#include -#include -#include -#include - -namespace cudf { -namespace io { -namespace detail { -namespace avro { -using namespace cudf::io::avro; -using namespace cudf::io; - -// Forward declarations -class metadata; - -/** - * @brief Implementation for Avro reader - */ -class reader::impl { - public: - /** - * @brief Constructor from a dataset source with reader options. 
- * - * @param source Dataset source - * @param options Settings for controlling reading behavior - * @param mr Device memory resource to use for device memory allocation - */ - explicit impl(std::unique_ptr source, - avro_reader_options const& options, - rmm::mr::device_memory_resource* mr); - - /** - * @brief Read an entire set or a subset of data and returns a set of columns - * - * @param options Settings for controlling reading behavior - * @param stream CUDA stream used for device memory operations and kernel launches. - * - * @return The set of columns along with metadata - */ - table_with_metadata read(avro_reader_options const& options, rmm::cuda_stream_view stream); - - private: - /** - * @brief Decompresses the block data. - * - * @param comp_block_data Compressed block data - * @param stream CUDA stream used for device memory operations and kernel launches. - * - * @return Device buffer to decompressed block data - */ - rmm::device_buffer decompress_data(const rmm::device_buffer& comp_block_data, - rmm::cuda_stream_view stream); - - /** - * @brief Convert the avro row-based block data and outputs to columns - * - * @param block_data Uncompressed block data - * @param dict Dictionary entries - * @param global_dictionary Dictionary allocation - * @param out_buffers Output columns' device buffers - * @param stream CUDA stream used for device memory operations and kernel launches. 
- */ - void decode_data(const rmm::device_buffer& block_data, - const std::vector>& dict, - cudf::device_span global_dictionary, - size_t num_rows, - std::vector> columns, - std::vector& out_buffers, - rmm::cuda_stream_view stream); - - private: - rmm::mr::device_memory_resource* _mr = nullptr; - std::unique_ptr _source; - std::unique_ptr _metadata; - - std::vector _columns; -}; - -} // namespace avro -} // namespace detail -} // namespace io -} // namespace cudf diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index 438cb1762c6..511a1a22ee7 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -146,10 +146,10 @@ table_with_metadata read_avro(avro_reader_options const& options, CUDF_FUNC_RANGE(); auto datasources = make_datasources(options.get_source()); - auto reader = - std::make_unique(std::move(datasources), options, rmm::cuda_stream_default, mr); - return reader->read(options); + CUDF_EXPECTS(datasources.size() == 1, "Only a single source is currently supported."); + + return avro::read_avro(std::move(datasources[0]), options, rmm::cuda_stream_default, mr); } compression_type infer_compression_type(compression_type compression, source_info const& info) From 8a3efd0864765300d1b00db979db3cae5c3c931c Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Tue, 31 Aug 2021 07:54:03 -0400 Subject: [PATCH 4/8] cudf now leverages rapids-cmake to reduce CMake boilerplate (#9030) rapids-cmake provides features such as dependency tracking, pre-configured CPM dependencies, and CUDA architecture detection. Using those features allows cudf to reduce the amount of CMake code it needs to maintain, and brings it in line with the rest of RAPIDS.
Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - Jason Lowe (https://github.com/jlowe) - Mark Harris (https://github.com/harrism) URL: https://github.com/rapidsai/cudf/pull/9030 --- build.sh | 4 +- cpp/CMakeLists.txt | 282 ++++++++++-------- cpp/cmake/Modules/ConfigureCUDA.cmake | 13 - cpp/cmake/Modules/EvalGPUArchs.cmake | 70 ----- cpp/cmake/Modules/SetGPUArchs.cmake | 55 ---- cpp/cmake/cudf-build-config.cmake.in | 105 ------- cpp/cmake/cudf-config.cmake.in | 113 ------- cpp/cmake/install/FindThrust.cmake | 63 ---- cpp/cmake/thirdparty/CUDF_GetCPM.cmake | 33 -- cpp/cmake/thirdparty/CUDF_GetGTest.cmake | 50 ---- cpp/cmake/thirdparty/CUDF_GetRMM.cmake | 50 ---- .../{CUDF_GetArrow.cmake => get_arrow.cmake} | 54 ++-- ...lections.cmake => get_cucollections.cmake} | 17 +- ...{CUDF_GetDLPack.cmake => get_dlpack.cmake} | 24 +- cpp/cmake/thirdparty/get_gtest.cmake | 37 +++ ...{CUDF_GetJitify.cmake => get_jitify.cmake} | 3 +- ...tLibcudacxx.cmake => get_libcudacxx.cmake} | 8 +- .../{CUDF_GetNVBench.cmake => get_rmm.cmake} | 22 +- ...{CUDF_GetThrust.cmake => get_thrust.cmake} | 24 +- cpp/libcudf_kafka/CMakeLists.txt | 79 +++-- ...UDF_KAFKA_GetCUDF.cmake => get_cudf.cmake} | 35 ++- ...FKA_GetRDKafka.cmake => get_rdkafka.cmake} | 32 +- cpp/libcudf_kafka/tests/CMakeLists.txt | 18 +- java/src/main/native/CMakeLists.txt | 174 +++++------ java/src/main/native/cmake/EvalGpuArchs.cmake | 62 ---- .../cmake/Modules/ConfigureNvcomp.cmake | 17 +- 26 files changed, 467 insertions(+), 977 deletions(-) delete mode 100644 cpp/cmake/Modules/EvalGPUArchs.cmake delete mode 100644 cpp/cmake/Modules/SetGPUArchs.cmake delete mode 100644 cpp/cmake/cudf-build-config.cmake.in delete mode 100644 cpp/cmake/cudf-config.cmake.in delete mode 100644 cpp/cmake/install/FindThrust.cmake delete mode 100644 cpp/cmake/thirdparty/CUDF_GetCPM.cmake delete mode 100644 cpp/cmake/thirdparty/CUDF_GetGTest.cmake delete mode 100644 cpp/cmake/thirdparty/CUDF_GetRMM.cmake rename 
cpp/cmake/thirdparty/{CUDF_GetArrow.cmake => get_arrow.cmake} (83%) rename cpp/cmake/thirdparty/{CUDF_GetcuCollections.cmake => get_cucollections.cmake} (69%) rename cpp/cmake/thirdparty/{CUDF_GetDLPack.cmake => get_dlpack.cmake} (69%) create mode 100644 cpp/cmake/thirdparty/get_gtest.cmake rename cpp/cmake/thirdparty/{CUDF_GetJitify.cmake => get_jitify.cmake} (94%) rename cpp/cmake/thirdparty/{CUDF_GetLibcudacxx.cmake => get_libcudacxx.cmake} (82%) rename cpp/cmake/thirdparty/{CUDF_GetNVBench.cmake => get_rmm.cmake} (60%) rename cpp/cmake/thirdparty/{CUDF_GetThrust.cmake => get_thrust.cmake} (77%) rename cpp/libcudf_kafka/cmake/thirdparty/{CUDF_KAFKA_GetCUDF.cmake => get_cudf.cmake} (52%) rename cpp/libcudf_kafka/cmake/thirdparty/{CUDF_KAFKA_GetRDKafka.cmake => get_rdkafka.cmake} (52%) delete mode 100644 java/src/main/native/cmake/EvalGpuArchs.cmake diff --git a/build.sh b/build.sh index 11948c64412..c9333a3e2af 100755 --- a/build.sh +++ b/build.sh @@ -165,10 +165,10 @@ fi if buildAll || hasArg libcudf; then if (( ${BUILD_ALL_GPU_ARCH} == 0 )); then - CUDF_CMAKE_CUDA_ARCHITECTURES="-DCMAKE_CUDA_ARCHITECTURES=" + CUDF_CMAKE_CUDA_ARCHITECTURES="-DCMAKE_CUDA_ARCHITECTURES=NATIVE" echo "Building for the architecture of the GPU in the system..." else - CUDF_CMAKE_CUDA_ARCHITECTURES="" + CUDF_CMAKE_CUDA_ARCHITECTURES="-DCMAKE_CUDA_ARCHITECTURES=ALL" echo "Building for *ALL* supported GPU architectures..." fi diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index d9a493f57a0..18af85c98e0 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -14,19 +14,7 @@ # limitations under the License. #============================================================================= -cmake_minimum_required(VERSION 3.18 FATAL_ERROR) - -# If `CMAKE_CUDA_ARCHITECTURES` is not defined, build for all supported architectures. If -# `CMAKE_CUDA_ARCHITECTURES` is set to an empty string (""), build for only the current -# architecture. 
If `CMAKE_CUDA_ARCHITECTURES` is specified by the user, use user setting. - -# This needs to be run before enabling the CUDA language due to the default initialization behavior -# of `CMAKE_CUDA_ARCHITECTURES`, https://gitlab.kitware.com/cmake/cmake/-/issues/21302 -if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES OR CMAKE_CUDA_ARCHITECTURES STREQUAL "ALL") - set(CUDF_BUILD_FOR_ALL_ARCHS TRUE) -elseif(CMAKE_CUDA_ARCHITECTURES STREQUAL "") - set(CUDF_BUILD_FOR_DETECTED_ARCHS TRUE) -endif() +cmake_minimum_required(VERSION 3.20.1 FATAL_ERROR) file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-21.10/RAPIDS.cmake ${CMAKE_BINARY_DIR}/RAPIDS.cmake) @@ -38,8 +26,9 @@ include(rapids-cuda) include(rapids-export) include(rapids-find) +rapids_cuda_init_architectures(CUDF) -project(CUDF VERSION 21.10.00 LANGUAGES C CXX) +project(CUDF VERSION 21.10.00 LANGUAGES C CXX CUDA) # Needed because GoogleBenchmark changes the state of FindThreads.cmake, # causing subsequent runs to have different values for the `Threads::Threads` target. @@ -79,7 +68,7 @@ message(VERBOSE "CUDF: Enable the -lineinfo option for nvcc (useful for cuda-mem message(VERBOSE "CUDF: Statically link the CUDA runtime: ${CUDA_STATIC_RUNTIME}") # Set a default build type if none was specified -set(DEFAULT_BUILD_TYPE "Release") +rapids_cmake_build_type("Release") set(CUDF_BUILD_TESTS ${BUILD_TESTS}) set(CUDF_BUILD_BENCHMARKS ${BUILD_BENCHMARKS}) @@ -88,13 +77,6 @@ set(CUDF_CUDA_FLAGS "") set(CUDF_CXX_DEFINITIONS "") set(CUDF_CUDA_DEFINITIONS "") -if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) - message(VERBOSE "CUDF: Setting build type to '${DEFAULT_BUILD_TYPE}' since none specified.") - set(CMAKE_BUILD_TYPE "${DEFAULT_BUILD_TYPE}" CACHE STRING "Choose the type of build." 
FORCE) - # Set the possible values of build type for cmake-gui - set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo") -endif(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) - # Set RMM logging level set(RMM_LOGGING_LEVEL "INFO" CACHE STRING "Choose the logging level.") set_property(CACHE RMM_LOGGING_LEVEL PROPERTY STRINGS "TRACE" "DEBUG" "INFO" "WARN" "ERROR" "CRITICAL" "OFF") @@ -106,54 +88,47 @@ endif() ################################################################################################### # - conda environment ----------------------------------------------------------------------------- - -if("$ENV{CONDA_BUILD}" STREQUAL "1") - set(CMAKE_PREFIX_PATH "$ENV{BUILD_PREFIX};$ENV{PREFIX};${CMAKE_PREFIX_PATH}") - set(CONDA_INCLUDE_DIRS "$ENV{BUILD_PREFIX}/include" "$ENV{PREFIX}/include") - set(CONDA_LINK_DIRS "$ENV{BUILD_PREFIX}/lib" "$ENV{PREFIX}/lib") - message(VERBOSE "CUDF: Conda build detected, CMAKE_PREFIX_PATH set to: ${CMAKE_PREFIX_PATH}") -elseif(DEFINED ENV{CONDA_PREFIX}) - set(CMAKE_PREFIX_PATH "$ENV{CONDA_PREFIX};${CMAKE_PREFIX_PATH}") - set(CONDA_INCLUDE_DIRS "$ENV{CONDA_PREFIX}/include") - set(CONDA_LINK_DIRS "$ENV{CONDA_PREFIX}/lib") - message(VERBOSE "CUDF: Conda environment detected, CMAKE_PREFIX_PATH set to: ${CMAKE_PREFIX_PATH}") -endif() +rapids_cmake_support_conda_env(conda_env MODIFY_PREFIX_PATH) ################################################################################################### # - compiler options ------------------------------------------------------------------------------ +rapids_find_package(CUDAToolkit REQUIRED + BUILD_EXPORT_SET cudf-exports + INSTALL_EXPORT_SET cudf-exports) +include(cmake/Modules/ConfigureCUDA.cmake) # set other CUDA compilation flags -# * find CUDAToolkit package -# * determine GPU architectures -# * enable the CMake CUDA language -# * set other CUDA compilation flags -include(cmake/Modules/ConfigureCUDA.cmake) 
################################################################################################### # - dependencies ---------------------------------------------------------------------------------- # find zlib -find_package(ZLIB REQUIRED) +rapids_find_package(ZLIB REQUIRED + BUILD_EXPORT_SET cudf-exports + INSTALL_EXPORT_SET cudf-exports) # find Threads (needed by cudftestutil) -find_package(Threads REQUIRED) +rapids_find_package(Threads REQUIRED + BUILD_EXPORT_SET cudf-exports + INSTALL_EXPORT_SET cudf-exports) + # add third party dependencies using CPM -include(cmake/thirdparty/CUDF_GetCPM.cmake) +rapids_cpm_init() # find jitify -include(cmake/thirdparty/CUDF_GetJitify.cmake) +include(cmake/thirdparty/get_jitify.cmake) # find thrust/cub -include(cmake/thirdparty/CUDF_GetThrust.cmake) +include(cmake/thirdparty/get_thrust.cmake) # find rmm -include(cmake/thirdparty/CUDF_GetRMM.cmake) +include(cmake/thirdparty/get_rmm.cmake) # find arrow -include(cmake/thirdparty/CUDF_GetArrow.cmake) +include(cmake/thirdparty/get_arrow.cmake) # find dlpack -include(cmake/thirdparty/CUDF_GetDLPack.cmake) +include(cmake/thirdparty/get_dlpack.cmake) # find libcu++ -include(cmake/thirdparty/CUDF_GetLibcudacxx.cmake) +include(cmake/thirdparty/get_libcudacxx.cmake) # find cuCollections # Should come after including thrust and libcudacxx -include(cmake/thirdparty/CUDF_GetcuCollections.cmake) +include(cmake/thirdparty/get_cucollections.cmake) # find or install GoogleTest -include(cmake/thirdparty/CUDF_GetGTest.cmake) +include(cmake/thirdparty/get_gtest.cmake) # preprocess jitify-able kernels include(cmake/Modules/JitifyPreprocessKernels.cmake) # find cuFile @@ -487,14 +462,6 @@ target_compile_options(cudf "$<$:${CUDF_CUDA_FLAGS}>" ) -target_compile_definitions(cudf - PUBLIC "$<$:${CUDF_CXX_DEFINITIONS}>" - "$:${CUDF_CUDA_DEFINITIONS}>>" -) - -# Disable Jitify log printing. 
See https://github.com/NVIDIA/jitify/issues/79 -target_compile_definitions(cudf PRIVATE "JITIFY_PRINT_LOG=0") - # Specify include paths for the current target and dependents target_include_directories(cudf PUBLIC "$" @@ -507,15 +474,13 @@ target_include_directories(cudf "$" "$") -# Add Conda library paths if specified -if(CONDA_LINK_DIRS) - target_link_directories(cudf PUBLIC "$") -endif() +target_compile_definitions(cudf + PUBLIC "$<$:${CUDF_CXX_DEFINITIONS}>" + "$:${CUDF_CUDA_DEFINITIONS}>>" +) -# Add Conda include paths if specified -if(CONDA_INCLUDE_DIRS) - target_include_directories(cudf PUBLIC "$") -endif() +# Disable Jitify log printing. See https://github.com/NVIDIA/jitify/issues/79 +target_compile_definitions(cudf PRIVATE "JITIFY_PRINT_LOG=0") # Instruct jitify to use the kernel JIT cache if(JITIFY_USE_CACHE) @@ -546,6 +511,16 @@ target_link_libraries(cudf rmm::rmm PRIVATE cuco::cuco) +# Add Conda library, and include paths if specified +if(TARGET conda_env) + target_link_libraries(cudf PRIVATE conda_env ) +endif() + +# Add cuFile interface if available +if(TARGET cuFile::cuFile_interface) + target_link_libraries(cudf PRIVATE cuFile::cuFile_interface) +endif() + if(CUDA_STATIC_RUNTIME) # Tell CMake what CUDA language runtime to use set_target_properties(cudf PROPERTIES CUDA_RUNTIME_LIBRARY Static) @@ -566,11 +541,6 @@ if(NOT TARGET CUDA::cuda_driver) endif() target_link_libraries(cudf PUBLIC CUDA::cuda_driver) -# Add cuFile interface if available -if(TARGET cuFile::cuFile_interface) - target_link_libraries(cudf PRIVATE cuFile::cuFile_interface) -endif() - file(WRITE "${CUDF_BINARY_DIR}/fatbin.ld" [=[ SECTIONS @@ -579,7 +549,7 @@ SECTIONS .nv_fatbin : { *(.nv_fatbin) } } ]=]) -target_link_options(cudf PRIVATE "${CUDF_BINARY_DIR}/fatbin.ld") +target_link_options(cudf PRIVATE "$") add_library(cudf::cudf ALIAS cudf) @@ -596,14 +566,24 @@ add_library(cudftestutil STATIC tests/utilities/table_utilities.cu tests/strings/utilities.cu) 
+set_target_properties(cudftestutil + PROPERTIES BUILD_RPATH "\$ORIGIN" + INSTALL_RPATH "\$ORIGIN" + # set target compile options + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON + POSITION_INDEPENDENT_CODE ON + INTERFACE_POSITION_INDEPENDENT_CODE ON +) + + target_compile_options(cudftestutil PUBLIC "$<$:${CUDF_CXX_FLAGS}>" "$:${CUDF_CUDA_FLAGS}>>" ) -target_compile_features(cudftestutil - PUBLIC cxx_std_17 $) - target_link_libraries(cudftestutil PUBLIC GTest::gmock GTest::gtest @@ -614,10 +594,6 @@ target_include_directories(cudftestutil PUBLIC "$" "$") -install(TARGETS cudftestutil - DESTINATION lib - EXPORT cudf-testing-targets) - add_library(cudf::cudftestutil ALIAS cudftestutil) ################################################################################################### @@ -634,32 +610,31 @@ endif() if(CUDF_BUILD_BENCHMARKS) # Find or install GoogleBench - CPMFindPackage(NAME benchmark - VERSION 1.5.2 + rapids_cpm_find(benchmark 1.5.2 GIT_REPOSITORY https://github.com/google/benchmark.git GIT_TAG v1.5.2 GIT_SHALLOW TRUE OPTIONS "BENCHMARK_ENABLE_TESTING OFF" "BENCHMARK_ENABLE_INSTALL OFF") + # Find or install NVBench - include(cmake/thirdparty/CUDF_GetNVBench.cmake) + include(${rapids-cmake}/cpm/nvbench.cmake) + rapids_cpm_nvbench() add_subdirectory(benchmarks) endif() ################################################################################################### # - install targets ------------------------------------------------------------------------------- - +rapids_cmake_install_lib_dir(lib_dir) include(CPack) - include(GNUInstallDirs) -set(INSTALL_CONFIGDIR lib/cmake/cudf) set(CMAKE_INSTALL_DEFAULT_COMPONENT_NAME cudf) # install target for cudf_base and the proxy libcudf.so install(TARGETS cudf - DESTINATION lib - EXPORT cudf-targets) + DESTINATION ${lib_dir} + EXPORT cudf-exports) install(DIRECTORY ${CUDF_SOURCE_DIR}/include/cudf @@ -671,54 +646,115 @@ install(DIRECTORY 
${CUDF_GENERATED_INCLUDE_DIR}/include/libcudacxx DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/libcudf) -include(CMakePackageConfigHelpers) +install(TARGETS cudftestutil + DESTINATION ${lib_dir} + EXPORT cudf-testing-exports) -configure_package_config_file(cmake/cudf-config.cmake.in "${CUDF_BINARY_DIR}/cmake/cudf-config.cmake" - INSTALL_DESTINATION "${INSTALL_CONFIGDIR}") +install(EXPORT cudf-testing-exports + FILE cudf-testing-targets.cmake + NAMESPACE cudf:: + DESTINATION "${lib_dir}/cmake/cudf") -write_basic_package_version_file("${CUDF_BINARY_DIR}/cmake/cudf-config-version.cmake" - COMPATIBILITY SameMinorVersion) -install(FILES "${CUDF_BINARY_DIR}/cmake/cudf-config.cmake" - "${CUDF_BINARY_DIR}/cmake/cudf-config-version.cmake" - "${CUDF_SOURCE_DIR}/cmake/install/FindThrust.cmake" - DESTINATION "${INSTALL_CONFIGDIR}") +include("${rapids-cmake-dir}/export/write_dependencies.cmake") +rapids_export_write_dependencies(INSTALL cudf-testing-exports + "${PROJECT_BINARY_DIR}/rapids-cmake/cudf/export/cudf-testing-dependencies.cmake") -install(EXPORT cudf-targets - FILE cudf-targets.cmake - NAMESPACE cudf:: - DESTINATION "${INSTALL_CONFIGDIR}") +set(doc_string + [=[ +#[=======================================================================[ -install(EXPORT cudf-testing-targets - FILE cudf-testing-targets.cmake - NAMESPACE cudf:: - DESTINATION "${INSTALL_CONFIGDIR}") +Provide targets for the cudf library. + +Built based on the Apache Arrow columnar memory format, cuDF is a GPU DataFrame +library for loading, joining, aggregating, filtering, and otherwise +manipulating data. + +cuDF provides a pandas-like API that will be familiar to data engineers & +data scientists, so they can use it to easily accelerate their workflows +without going into the details of CUDA programming. + + +Imported Targets +^^^^^^^^^^^^^^^^ + +If cudf is found, this module defines the following IMPORTED GLOBAL +targets: + + cudf::cudf - The main cudf library. 
+ +This module offers an optional testing component which defines the +following IMPORTED GLOBAL targets: + + cudf::cudftestutil - The main cudf testing library + +Result Variables +^^^^^^^^^^^^^^^^ + +This module will set the following variables in your project:: + + CUDF_FOUND + CUDF_VERSION + CUDF_VERSION_MAJOR + CUDF_VERSION_MINOR + CUDF_VERSION_PATCH + +#]=======================================================================] + ]=]) + + +set(install_code_string + [=[ +set(ArrowCUDA_DIR "${Arrow_DIR}") +find_dependency(ArrowCUDA) +if(testing IN_LIST cudf_FIND_COMPONENTS) + enable_language(CUDA) + if(EXISTS "${CMAKE_CURRENT_LIST_DIR}/cudf-testing-dependencies.cmake") + include("${CMAKE_CURRENT_LIST_DIR}/cudf-testing-dependencies.cmake") + endif() + if(EXISTS "${CMAKE_CURRENT_LIST_DIR}/cudf-testing-targets.cmake") + include("${CMAKE_CURRENT_LIST_DIR}/cudf-testing-targets.cmake") + endif() +endif() + +if(NOT TARGET cudf::Thrust) + thrust_create_target(cudf::Thrust FROM_OPTIONS) +endif() +]=]) + +rapids_export(INSTALL cudf + EXPORT_SET cudf-exports + GLOBAL_TARGETS cudf + NAMESPACE cudf:: + FINAL_CODE_BLOCK install_code_string) ################################################################################################ # - build export ------------------------------------------------------------------------------- -configure_package_config_file(cmake/cudf-build-config.cmake.in ${CUDF_BINARY_DIR}/cudf-config.cmake - INSTALL_DESTINATION ${CUDF_BINARY_DIR}) - -write_basic_package_version_file(${CUDF_BINARY_DIR}/cudf-config-version.cmake - COMPATIBILITY SameMinorVersion) - -if(TARGET gtest) - get_target_property(gtest_is_imported gtest IMPORTED) - if(NOT gtest_is_imported) - export(TARGETS gtest gmock gtest_main gmock_main - FILE ${CUDF_BINARY_DIR}/cudf-gtesting-targets.cmake - NAMESPACE GTest::) - endif() +set(build_code_string + [=[ +if(EXISTS "${CMAKE_CURRENT_LIST_DIR}/cudf-testing-dependencies.cmake") + 
include("${CMAKE_CURRENT_LIST_DIR}/cudf-testing-dependencies.cmake") +endif() +if(EXISTS "${CMAKE_CURRENT_LIST_DIR}/cudf-testing-targets.cmake") + include("${CMAKE_CURRENT_LIST_DIR}/cudf-testing-targets.cmake") endif() -export(EXPORT cudf-targets - FILE ${CUDF_BINARY_DIR}/cudf-targets.cmake - NAMESPACE cudf::) +if(NOT TARGET cudf::Thrust) + thrust_create_target(cudf::Thrust FROM_OPTIONS) +endif() +]=]) -export(EXPORT cudf-testing-targets - FILE ${CUDF_BINARY_DIR}/cudf-testing-targets.cmake - NAMESPACE cudf::) +rapids_export(BUILD cudf + EXPORT_SET cudf-exports + GLOBAL_TARGETS cudf + NAMESPACE cudf:: + FINAL_CODE_BLOCK code_string) +export(EXPORT cudf-testing-exports + FILE ${CUDF_BINARY_DIR}/cudf-testing.cmake + NAMESPACE cudf::) +rapids_export_write_dependencies(BUILD cudf-testing-exports + "${CUDF_BINARY_DIR}/cudf-testing-dependencies.cmake") ################################################################################################### # - make documentation ---------------------------------------------------------------------------- diff --git a/cpp/cmake/Modules/ConfigureCUDA.cmake b/cpp/cmake/Modules/ConfigureCUDA.cmake index 716163b5530..a537b5e3beb 100644 --- a/cpp/cmake/Modules/ConfigureCUDA.cmake +++ b/cpp/cmake/Modules/ConfigureCUDA.cmake @@ -14,19 +14,6 @@ # limitations under the License. #============================================================================= -# Find the CUDAToolkit -find_package(CUDAToolkit REQUIRED) - -# Auto-detect available GPU compute architectures -include(${CMAKE_CURRENT_LIST_DIR}/SetGPUArchs.cmake) -message(STATUS "CUDF: Building CUDF for GPU architectures: ${CMAKE_CUDA_ARCHITECTURES}") - -# Must come after find_package(CUDAToolkit) because we symlink -# ccache as a compiler front-end for nvcc in gpuCI CPU builds. 
-# Must also come after we detect and potentially rewrite -# CMAKE_CUDA_ARCHITECTURES -enable_language(CUDA) - if(CMAKE_COMPILER_IS_GNUCXX) list(APPEND CUDF_CXX_FLAGS -Wall -Werror -Wno-unknown-pragmas -Wno-error=deprecated-declarations) endif() diff --git a/cpp/cmake/Modules/EvalGPUArchs.cmake b/cpp/cmake/Modules/EvalGPUArchs.cmake deleted file mode 100644 index 09e42c6cc7a..00000000000 --- a/cpp/cmake/Modules/EvalGPUArchs.cmake +++ /dev/null @@ -1,70 +0,0 @@ -#============================================================================= -# Copyright (c) 2019-2021, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-#============================================================================= - -# Unset this first in case it's set to -set(CMAKE_CUDA_ARCHITECTURES OFF) - -# Enable CUDA so we can invoke nvcc -enable_language(CUDA) - -# Function uses the CUDA runtime API to query the compute capability of the device, so if a user -# doesn't pass any architecture options to CMake we only build the current architecture -function(evaluate_gpu_archs gpu_archs) - set(eval_file ${PROJECT_BINARY_DIR}/eval_gpu_archs.cu) - set(eval_exe ${PROJECT_BINARY_DIR}/eval_gpu_archs) - set(error_file ${PROJECT_BINARY_DIR}/eval_gpu_archs.stderr.log) - file( - WRITE ${eval_file} - " -#include -#include -#include -using namespace std; -int main(int argc, char** argv) { - set archs; - int nDevices; - if((cudaGetDeviceCount(&nDevices) == cudaSuccess) && (nDevices > 0)) { - for(int dev=0;dev= 1.9.10 included configs. -# -# Result Variables -# ^^^^^^^^^^^^^^^^ -# -# This module defines the following variables: -# ~~~ -# ``Thrust_FOUND`` system has Thrust -# ``Thrust_INCLUDE_DIRS`` the Thrust include directories -# ~~~ - -include(FindPackageHandleStandardArgs) - -# try to find Thrust via installed config first -find_package(Thrust QUIET CONFIG) -if(Thrust_FOUND) - find_package_handle_standard_args(Thrust CONFIG_MODE) - return() -endif() - -find_dependency(CUDAToolkit) - -find_path( - Thrust_INCLUDE_DIR - NAMES thrust/version.h - HINTS ${CUDAToolkit_INCLUDE_DIRS}) - -file(READ ${Thrust_INCLUDE_DIR}/thrust/version.h _version_header) -string(REGEX MATCH "#define THRUST_VERSION ([0-9]*)" _match "${_version_header}") -math(EXPR major "${CMAKE_MATCH_1} / 100000") -math(EXPR minor "(${CMAKE_MATCH_1} / 100) % 1000") -math(EXPR subminor "${CMAKE_MATCH_1} % 100") -set(Thrust_VERSION "${major}.${minor}.${subminor}") - -find_package_handle_standard_args( - Thrust - REQUIRED_VARS Thrust_INCLUDE_DIR - VERSION_VAR Thrust_VERSION) - -if(Thrust_FOUND) - set(Thrust_INCLUDE_DIRS "${Thrust_INCLUDE_DIR}") - # Create wrapper 
function to handle situation where we can't use a regular IMPORTED INTERFACE - # target since that'll use -isystem, leading to the wrong search order with nvcc - function(thrust_create_target tgt) - if(NOT TARGET ${tgt}) - add_library(thrust_internal INTERFACE) - set_target_properties(thrust_internal PROPERTIES INTERFACE_INCLUDE_DIRECTORIES - "${Thrust_INCLUDE_DIRS}") - add_library(${tgt} ALIAS thrust_internal) - endif() - endfunction() -endif() diff --git a/cpp/cmake/thirdparty/CUDF_GetCPM.cmake b/cpp/cmake/thirdparty/CUDF_GetCPM.cmake deleted file mode 100644 index ce2921f5954..00000000000 --- a/cpp/cmake/thirdparty/CUDF_GetCPM.cmake +++ /dev/null @@ -1,33 +0,0 @@ -set(CPM_DOWNLOAD_VERSION 7644c3a40fc7889f8dee53ce21e85dc390b883dc) # v0.32.1 - -if(CPM_SOURCE_CACHE) - # Expand relative path. This is important if the provided path contains a tilde (~) - get_filename_component(CPM_SOURCE_CACHE ${CPM_SOURCE_CACHE} ABSOLUTE) - set(CPM_DOWNLOAD_LOCATION "${CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake") -elseif(DEFINED ENV{CPM_SOURCE_CACHE}) - set(CPM_DOWNLOAD_LOCATION "$ENV{CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake") -else() - set(CPM_DOWNLOAD_LOCATION "${CMAKE_BINARY_DIR}/cmake/CPM_${CPM_DOWNLOAD_VERSION}.cmake") -endif() - -if(NOT (EXISTS ${CPM_DOWNLOAD_LOCATION})) - message(VERBOSE "CUDF: Downloading CPM.cmake to ${CPM_DOWNLOAD_LOCATION}") - file( - DOWNLOAD - https://raw.githubusercontent.com/cpm-cmake/CPM.cmake/${CPM_DOWNLOAD_VERSION}/cmake/CPM.cmake - ${CPM_DOWNLOAD_LOCATION}) -endif() - -include(${CPM_DOWNLOAD_LOCATION}) - -# If a target is installed, found by the `find_package` step of CPMFindPackage, -# and marked as IMPORTED, make it globally accessible to consumers of our libs. 
-function(fix_cmake_global_defaults target) - if(TARGET ${target}) - get_target_property(_is_imported ${target} IMPORTED) - get_target_property(_already_global ${target} IMPORTED_GLOBAL) - if(_is_imported AND NOT _already_global) - set_target_properties(${target} PROPERTIES IMPORTED_GLOBAL TRUE) - endif() - endif() -endfunction() diff --git a/cpp/cmake/thirdparty/CUDF_GetGTest.cmake b/cpp/cmake/thirdparty/CUDF_GetGTest.cmake deleted file mode 100644 index 9e4f3c137b1..00000000000 --- a/cpp/cmake/thirdparty/CUDF_GetGTest.cmake +++ /dev/null @@ -1,50 +0,0 @@ -#============================================================================= -# Copyright (c) 2021, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -#============================================================================= - -function(find_and_configure_gtest VERSION) - - if(TARGET GTest::gtest) - return() - endif() - - # Find or install GoogleTest - CPMFindPackage(NAME GTest - VERSION ${VERSION} - GIT_REPOSITORY https://github.com/google/googletest.git - GIT_TAG release-${VERSION} - GIT_SHALLOW TRUE - OPTIONS "INSTALL_GTEST ON" - # googletest >= 1.10.0 provides a cmake config file -- use it if it exists - FIND_PACKAGE_ARGUMENTS "CONFIG") - # Add GTest aliases if they don't already exist. - # Assumes if GTest::gtest doesn't exist, the others don't either. - # TODO: Is this always a valid assumption? 
- if(NOT TARGET GTest::gtest) - add_library(GTest::gtest ALIAS gtest) - add_library(GTest::gmock ALIAS gmock) - add_library(GTest::gtest_main ALIAS gtest_main) - add_library(GTest::gmock_main ALIAS gmock_main) - endif() - # Make sure consumers of cudf can also see GTest::* targets - fix_cmake_global_defaults(GTest::gtest) - fix_cmake_global_defaults(GTest::gmock) - fix_cmake_global_defaults(GTest::gtest_main) - fix_cmake_global_defaults(GTest::gmock_main) -endfunction() - -set(CUDF_MIN_VERSION_GTest 1.10.0) - -find_and_configure_gtest(${CUDF_MIN_VERSION_GTest}) diff --git a/cpp/cmake/thirdparty/CUDF_GetRMM.cmake b/cpp/cmake/thirdparty/CUDF_GetRMM.cmake deleted file mode 100644 index b2861ae48c4..00000000000 --- a/cpp/cmake/thirdparty/CUDF_GetRMM.cmake +++ /dev/null @@ -1,50 +0,0 @@ -#============================================================================= -# Copyright (c) 2020-2021, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -#============================================================================= - -function(find_and_configure_rmm VERSION) - - if(TARGET rmm::rmm) - return() - endif() - - if(${VERSION} MATCHES [=[([0-9]+)\.([0-9]+)\.([0-9]+)]=]) - set(MAJOR_AND_MINOR "${CMAKE_MATCH_1}.${CMAKE_MATCH_2}") - else() - set(MAJOR_AND_MINOR "${VERSION}") - endif() - - # Consumers have two options for local source builds: - # 1. Pass `-D CPM_rmm_SOURCE=/path/to/rmm` to build a local RMM source tree - # 2. 
Pass `-D CMAKE_PREFIX_PATH=/path/to/rmm/build` to use an existing local - # RMM build directory as the install location for find_package(rmm) - CPMFindPackage(NAME rmm - VERSION ${VERSION} - GIT_REPOSITORY https://github.com/rapidsai/rmm.git - GIT_TAG branch-${MAJOR_AND_MINOR} - GIT_SHALLOW TRUE - OPTIONS "BUILD_TESTS OFF" - "BUILD_BENCHMARKS OFF" - "CUDA_STATIC_RUNTIME ${CUDA_STATIC_RUNTIME}" - "DISABLE_DEPRECATION_WARNING ${DISABLE_DEPRECATION_WARNING}" - ) - - # Make sure consumers of cudf can also see rmm::rmm - fix_cmake_global_defaults(rmm::rmm) -endfunction() - -set(CUDF_MIN_VERSION_rmm "${CUDF_VERSION_MAJOR}.${CUDF_VERSION_MINOR}.00") - -find_and_configure_rmm(${CUDF_MIN_VERSION_rmm}) diff --git a/cpp/cmake/thirdparty/CUDF_GetArrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake similarity index 83% rename from cpp/cmake/thirdparty/CUDF_GetArrow.cmake rename to cpp/cmake/thirdparty/get_arrow.cmake index 38a5d8da44a..c67f316f7e9 100644 --- a/cpp/cmake/thirdparty/CUDF_GetArrow.cmake +++ b/cpp/cmake/thirdparty/get_arrow.cmake @@ -49,8 +49,9 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB if(BUILD_STATIC) set(ARROW_BUILD_STATIC ON) set(ARROW_BUILD_SHARED OFF) - # Use CPMAddPackage if static linking - set(CPMAddOrFindPackage CPMAddPackage) + # Turn off CPM using `find_package` so we always download + # and make sure we get proper static library + set(CPM_DOWNLOAD_ALL TRUE) endif() set(ARROW_PYTHON_OPTIONS "") @@ -66,9 +67,9 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB # does not have the CUDA driver installed. This must be an env var. 
set(ENV{CUDA_LIB_PATH} "${CUDAToolkit_LIBRARY_DIR}/stubs") - cmake_language(CALL ${CPMAddOrFindPackage} - NAME Arrow - VERSION ${VERSION} + rapids_cpm_find(Arrow ${VERSION} + GLOBAL_TARGETS arrow_shared arrow_cuda_shared + CPM_ARGS GIT_REPOSITORY https://github.com/apache/arrow.git GIT_TAG apache-arrow-${VERSION} GIT_SHALLOW TRUE @@ -100,6 +101,33 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB "ARROW_PROTOBUF_USE_SHARED ${ARROW_BUILD_SHARED}" "ARROW_ZSTD_USE_SHARED ${ARROW_BUILD_SHARED}") + if(Arrow_ADDED) + rapids_export(BUILD Arrow + VERSION ${VERSION} + EXPORT_SET arrow_targets + GLOBAL_TARGETS arrow_shared arrow_static + NAMESPACE cudf::) + + rapids_export(BUILD ArrowCUDA + VERSION ${VERSION} + EXPORT_SET arrow_cuda_targets + GLOBAL_TARGETS arrow_cuda_shared arrow_cuda_static + NAMESPACE cudf::) + endif() + # We generate the arrow-config and arrowcuda-config files + # when we built arrow locally, so always do `find_dependency` + rapids_export_package(BUILD Arrow cudf-exports) + rapids_export_package(INSTALL Arrow cudf-exports) + + # We have to generate the find_dependency(ArrowCUDA) ourselves + # since we need to specify ArrowCUDA_DIR to be where Arrow + # was found, since Arrow packages ArrowCUDA.config in a non-standard + # location + rapids_export_package(BUILD ArrowCUDA cudf-exports) + + include("${rapids-cmake-dir}/export/find_package_root.cmake") + rapids_export_find_package_root(BUILD Arrow [=[${CMAKE_CURRENT_LIST_DIR}]=] cudf-exports) + rapids_export_find_package_root(BUILD ArrowCUDA [=[${CMAKE_CURRENT_LIST_DIR}]=] cudf-exports) set(ARROW_FOUND TRUE) set(ARROW_LIBRARIES "") @@ -159,22 +187,6 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB set(ARROW_FOUND "${ARROW_FOUND}" PARENT_SCOPE) set(ARROW_LIBRARIES "${ARROW_LIBRARIES}" PARENT_SCOPE) - if(TARGET arrow_shared) - get_target_property(arrow_is_imported arrow_shared IMPORTED) - if(NOT arrow_is_imported) - export(TARGETS 
arrow_shared arrow_cuda_shared - FILE ${CUDF_BINARY_DIR}/cudf-arrow-targets.cmake - NAMESPACE cudf::) - endif() - elseif(TARGET arrow_static) - get_target_property(arrow_is_imported arrow_static IMPORTED) - if(NOT arrow_is_imported) - export(TARGETS arrow_static arrow_cuda_static - FILE ${CUDF_BINARY_DIR}/cudf-arrow-targets.cmake - NAMESPACE cudf::) - endif() - endif() - endfunction() set(CUDF_VERSION_Arrow 5.0.0) diff --git a/cpp/cmake/thirdparty/CUDF_GetcuCollections.cmake b/cpp/cmake/thirdparty/get_cucollections.cmake similarity index 69% rename from cpp/cmake/thirdparty/CUDF_GetcuCollections.cmake rename to cpp/cmake/thirdparty/get_cucollections.cmake index 73717249585..b2e70ebaddc 100644 --- a/cpp/cmake/thirdparty/CUDF_GetcuCollections.cmake +++ b/cpp/cmake/thirdparty/get_cucollections.cmake @@ -22,17 +22,14 @@ function(find_and_configure_cucollections) # Find or install cuCollections CPMFindPackage(NAME cuco - GITHUB_REPOSITORY NVIDIA/cuCollections - GIT_TAG 0d602ae21ea4f38d23ed816aa948453d97b2ee4e - OPTIONS "BUILD_TESTS OFF" - "BUILD_BENCHMARKS OFF" - "BUILD_EXAMPLES OFF" + GLOBAL_TARGETS cuco::cuco + CPM_ARGS + GITHUB_REPOSITORY NVIDIA/cuCollections + GIT_TAG 0d602ae21ea4f38d23ed816aa948453d97b2ee4e + OPTIONS "BUILD_TESTS OFF" + "BUILD_BENCHMARKS OFF" + "BUILD_EXAMPLES OFF" ) - - set(CUCO_INCLUDE_DIR "${cuco_SOURCE_DIR}/include" PARENT_SCOPE) - - # Make sure consumers of cudf can also see cuco::cuco target - fix_cmake_global_defaults(cuco::cuco) endfunction() find_and_configure_cucollections() diff --git a/cpp/cmake/thirdparty/CUDF_GetDLPack.cmake b/cpp/cmake/thirdparty/get_dlpack.cmake similarity index 69% rename from cpp/cmake/thirdparty/CUDF_GetDLPack.cmake rename to cpp/cmake/thirdparty/get_dlpack.cmake index 349f75d604f..1334ac91c6e 100644 --- a/cpp/cmake/thirdparty/CUDF_GetDLPack.cmake +++ b/cpp/cmake/thirdparty/get_dlpack.cmake @@ -15,25 +15,21 @@ #============================================================================= 
function(find_and_configure_dlpack VERSION) - if(DLPACK_INCLUDE) - set(DLPACK_INCLUDE_DIR "${DLPACK_INCLUDE}" PARENT_SCOPE) - return() - endif() - find_path(DLPACK_INCLUDE_DIR "dlpack" - HINTS "$ENV{DLPACK_ROOT}/include" - "$ENV{CONDA_PREFIX}/include") - if(DLPACK_INCLUDE_DIR) - set(DLPACK_INCLUDE_DIR ${DLPACK_INCLUDE_DIR} PARENT_SCOPE) - return() - endif() - CPMFindPackage(NAME dlpack - VERSION ${VERSION} + + rapids_find_generate_module( DLPACK + HEADER_NAMES dlpack.h) + + rapids_cpm_find(dlpack ${VERSION} GIT_REPOSITORY https://github.com/dmlc/dlpack.git GIT_TAG v${VERSION} GIT_SHALLOW TRUE DOWNLOAD_ONLY TRUE OPTIONS "BUILD_MOCK OFF") - set(DLPACK_INCLUDE_DIR "${dlpack_SOURCE_DIR}/include" PARENT_SCOPE) + + if(DEFINED dlpack_SOURCE_DIR) + #otherwise find_package(DLPACK) will set this variable + set(DLPACK_INCLUDE_DIR "${dlpack_SOURCE_DIR}/include" PARENT_SCOPE) + endif() endfunction() set(CUDF_MIN_VERSION_dlpack 0.5) diff --git a/cpp/cmake/thirdparty/get_gtest.cmake b/cpp/cmake/thirdparty/get_gtest.cmake new file mode 100644 index 00000000000..9de2b4a50a9 --- /dev/null +++ b/cpp/cmake/thirdparty/get_gtest.cmake @@ -0,0 +1,37 @@ +#============================================================================= +# Copyright (c) 2021, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#============================================================================= + +function(find_and_configure_gtest) + include(${rapids-cmake-dir}/cpm/gtest.cmake) + + # Find or install GoogleTest + rapids_cpm_gtest(BUILD_EXPORT_SET cudf-testing-exports + INSTALL_EXPORT_SET cudf-testing-exports) + + if(GTest_ADDED) + rapids_export(BUILD GTest + VERSION ${GTest_VERSION} + EXPORT_SET GTestTargets + GLOBAL_TARGETS gtest gmock gtest_main gmock_main + NAMESPACE GTest::) + + include("${rapids-cmake-dir}/export/find_package_root.cmake") + rapids_export_find_package_root(BUILD GTest [=[${CMAKE_CURRENT_LIST_DIR}]=] cudf-testing-exports) + endif() + +endfunction() + +find_and_configure_gtest() diff --git a/cpp/cmake/thirdparty/CUDF_GetJitify.cmake b/cpp/cmake/thirdparty/get_jitify.cmake similarity index 94% rename from cpp/cmake/thirdparty/CUDF_GetJitify.cmake rename to cpp/cmake/thirdparty/get_jitify.cmake index 6e853816ec5..b8a85889ef2 100644 --- a/cpp/cmake/thirdparty/CUDF_GetJitify.cmake +++ b/cpp/cmake/thirdparty/get_jitify.cmake @@ -17,8 +17,7 @@ # Jitify doesn't have a version :/ function(find_and_configure_jitify) - CPMFindPackage(NAME jitify - VERSION 2.0.0 + rapids_cpm_find(jitify 2.0.0 GIT_REPOSITORY https://github.com/rapidsai/jitify.git GIT_TAG cudf_0.19 GIT_SHALLOW TRUE diff --git a/cpp/cmake/thirdparty/CUDF_GetLibcudacxx.cmake b/cpp/cmake/thirdparty/get_libcudacxx.cmake similarity index 82% rename from cpp/cmake/thirdparty/CUDF_GetLibcudacxx.cmake rename to cpp/cmake/thirdparty/get_libcudacxx.cmake index 63d6d26802c..4921abe0581 100644 --- a/cpp/cmake/thirdparty/CUDF_GetLibcudacxx.cmake +++ b/cpp/cmake/thirdparty/get_libcudacxx.cmake @@ -1,5 +1,5 @@ #============================================================================= -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -15,16 +15,14 @@ #============================================================================= function(find_and_configure_libcudacxx VERSION) - CPMFindPackage(NAME libcudacxx - VERSION ${VERSION} + rapids_cpm_find(libcudacxx ${VERSION} GIT_REPOSITORY https://github.com/NVIDIA/libcudacxx.git GIT_TAG ${VERSION} GIT_SHALLOW TRUE DOWNLOAD_ONLY TRUE ) - set(LIBCUDACXX_DIR "${libcudacxx_SOURCE_DIR}" PARENT_SCOPE) + set(LIBCUDACXX_INCLUDE_DIR "${libcudacxx_SOURCE_DIR}/include" PARENT_SCOPE) - set(LIBCXX_DIR "${libcudacxx_SOURCE_DIR}/libcxx" PARENT_SCOPE) set(LIBCXX_INCLUDE_DIR "${libcudacxx_SOURCE_DIR}/libcxx/include" PARENT_SCOPE) endfunction() diff --git a/cpp/cmake/thirdparty/CUDF_GetNVBench.cmake b/cpp/cmake/thirdparty/get_rmm.cmake similarity index 60% rename from cpp/cmake/thirdparty/CUDF_GetNVBench.cmake rename to cpp/cmake/thirdparty/get_rmm.cmake index 09ceffb284f..ec40afa4d05 100644 --- a/cpp/cmake/thirdparty/CUDF_GetNVBench.cmake +++ b/cpp/cmake/thirdparty/get_rmm.cmake @@ -1,5 +1,5 @@ #============================================================================= -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,21 +14,13 @@ # limitations under the License. 
#============================================================================= -# NVBench doesn't have a public release yet +function(find_and_configure_rmm) + include(${rapids-cmake-dir}/cpm/rmm.cmake) -function(find_and_configure_nvbench) - - if(TARGET nvbench::main) - return() - endif() - - CPMFindPackage(NAME nvbench - GIT_REPOSITORY https://github.com/NVIDIA/nvbench.git - GIT_TAG main - GIT_SHALLOW TRUE - OPTIONS "NVBench_ENABLE_EXAMPLES OFF" - "NVBench_ENABLE_TESTING OFF") + # Find or install RMM + rapids_cpm_rmm(BUILD_EXPORT_SET cudf-exports + INSTALL_EXPORT_SET cudf-exports) endfunction() -find_and_configure_nvbench() +find_and_configure_rmm() diff --git a/cpp/cmake/thirdparty/CUDF_GetThrust.cmake b/cpp/cmake/thirdparty/get_thrust.cmake similarity index 77% rename from cpp/cmake/thirdparty/CUDF_GetThrust.cmake rename to cpp/cmake/thirdparty/get_thrust.cmake index 2792786f553..aecf0498b65 100644 --- a/cpp/cmake/thirdparty/CUDF_GetThrust.cmake +++ b/cpp/cmake/thirdparty/get_thrust.cmake @@ -24,17 +24,21 @@ function(find_and_configure_thrust VERSION) set(cpm_thrust_disconnect_update "") endif() - CPMAddPackage(NAME Thrust - VERSION ${VERSION} - GIT_REPOSITORY https://github.com/NVIDIA/thrust.git - GIT_TAG ${VERSION} - GIT_SHALLOW TRUE + rapids_cpm_find( + Thrust ${VERSION} + BUILD_EXPORT_SET cudf-exports + INSTALL_EXPORT_SET cudf-exports + CPM_ARGS + GIT_REPOSITORY https://github.com/NVIDIA/thrust.git + GIT_TAG ${VERSION} + GIT_SHALLOW TRUE ${cpm_thrust_disconnect_update} PATCH_COMMAND patch --reject-file=- -p1 -N < ${CUDF_SOURCE_DIR}/cmake/thrust.patch || true - ) + OPTIONS "THRUST_INSTALL TRUE") - thrust_create_target(cudf::Thrust FROM_OPTIONS) - set(THRUST_LIBRARY "cudf::Thrust" PARENT_SCOPE) + if(NOT TARGET cudf::Thrust) + thrust_create_target(cudf::Thrust FROM_OPTIONS) + endif() include(GNUInstallDirs) install(DIRECTORY "${Thrust_SOURCE_DIR}/thrust" @@ -52,6 +56,10 @@ function(find_and_configure_thrust VERSION) install(DIRECTORY 
"${Thrust_SOURCE_DIR}/dependencies/cub/cub/cmake" DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/libcudf/Thrust/dependencies/cub/") + + # Store where CMake can find our custom Thrust install + include("${rapids-cmake-dir}/export/find_package_root.cmake") + rapids_export_find_package_root(INSTALL Thrust [=[${CMAKE_CURRENT_LIST_DIR}/../../../include/libcudf/Thrust/]=] cudf-exports) endfunction() set(CUDF_MIN_VERSION_Thrust 1.12.0) diff --git a/cpp/libcudf_kafka/CMakeLists.txt b/cpp/libcudf_kafka/CMakeLists.txt index 7c112d838d8..020f5c76c10 100644 --- a/cpp/libcudf_kafka/CMakeLists.txt +++ b/cpp/libcudf_kafka/CMakeLists.txt @@ -1,5 +1,5 @@ #============================================================================= -# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,10 +13,23 @@ # See the License for the specific language governing permissions and # limitations under the License. 
#============================================================================= -cmake_minimum_required(VERSION 3.18 FATAL_ERROR) +cmake_minimum_required(VERSION 3.20.1 FATAL_ERROR) + +file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-21.10/RAPIDS.cmake + ${CMAKE_BINARY_DIR}/RAPIDS.cmake) +include(${CMAKE_BINARY_DIR}/RAPIDS.cmake) + +include(rapids-cmake) +include(rapids-cpm) +include(rapids-cuda) +include(rapids-export) +include(rapids-find) project(CUDA_KAFKA VERSION 21.10.00 LANGUAGES CXX) +# Set a default build type if none was specified +rapids_cmake_build_type(Release) + ################################################################################################### # - Build options option(BUILD_TESTS "Build tests for libcudf_kafka" ON) @@ -26,19 +39,15 @@ message(VERBOSE "CUDF_KAFKA: Build gtests: ${BUILD_TESTS}") ################################################################################################### # - Dependencies -# CPM -include(../cmake/thirdparty/CUDF_GetCPM.cmake) - -# libcudf -include(cmake/thirdparty/CUDF_KAFKA_GetCUDF.cmake) - -# librdkafka -include(cmake/thirdparty/CUDF_KAFKA_GetRDKafka.cmake) +# add third party dependencies using CPM +rapids_cpm_init() +include(cmake/thirdparty/get_cudf.cmake) +include(cmake/thirdparty/get_rdkafka.cmake) # # GTests if enabled if (BUILD_TESTS) # GoogleTest - include(../cmake/thirdparty/CUDF_GetGTest.cmake) + include(../cmake/thirdparty/get_gtest.cmake) # include CTest module -- automatically calls enable_testing() include(CTest) @@ -46,32 +55,46 @@ if (BUILD_TESTS) endif() ################################################################################################### -# - include paths --------------------------------------------------------------------------------- - -include_directories("${CMAKE_BINARY_DIR}/include" - "${CMAKE_SOURCE_DIR}/include" - "${CMAKE_SOURCE_DIR}/src") +# - library target 
-------------------------------------------------------------------------------- +add_library(cudf_kafka SHARED + src/kafka_consumer.cpp) ################################################################################################### -# - library paths --------------------------------------------------------------------------------- - -link_directories("${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}" # CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES is an undocumented/unsupported variable containing the link directories for nvcc - "${CMAKE_BINARY_DIR}/lib" - "${CMAKE_BINARY_DIR}") +# - include paths --------------------------------------------------------------------------------- +target_include_directories(cudf_kafka + PUBLIC + "$" + "$") ################################################################################################### -# - library target -------------------------------------------------------------------------------- +# - library paths --------------------------------------------------------------------------------- +target_link_libraries(cudf_kafka PUBLIC cudf::cudf RDKAFKA::RDKAFKA) -add_library(cudf_kafka SHARED - src/kafka_consumer.cpp -) +set_target_properties(cudf_kafka + PROPERTIES BUILD_RPATH "\$ORIGIN" + INSTALL_RPATH "\$ORIGIN" + # set target compile options + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON) ################################################################################################### # - cudf_kafka Install ---------------------------------------------------------------------------- -target_link_libraries(cudf_kafka cudf::cudf RDKAFKA::RDKAFKA) - +rapids_cmake_install_lib_dir(lib_dir) install(TARGETS cudf_kafka - DESTINATION lib) + DESTINATION ${lib_dir} + EXPORT cudf_kafka-exports) install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include DESTINATION include) + +rapids_export(INSTALL cudf_kafka + EXPORT_SET cudf_kafka-exports + GLOBAL_TARGETS cudf_kafka + NAMESPACE cudf_kafka:: + ) + +rapids_export(BUILD cudf_kafka + EXPORT_SET 
cudf_kafka-exports + GLOBAL_TARGETS cudf_kafka + NAMESPACE cudf_kafka:: + ) diff --git a/cpp/libcudf_kafka/cmake/thirdparty/CUDF_KAFKA_GetCUDF.cmake b/cpp/libcudf_kafka/cmake/thirdparty/get_cudf.cmake similarity index 52% rename from cpp/libcudf_kafka/cmake/thirdparty/CUDF_KAFKA_GetCUDF.cmake rename to cpp/libcudf_kafka/cmake/thirdparty/get_cudf.cmake index 29220860b6e..ea749726b97 100644 --- a/cpp/libcudf_kafka/cmake/thirdparty/CUDF_KAFKA_GetCUDF.cmake +++ b/cpp/libcudf_kafka/cmake/thirdparty/get_cudf.cmake @@ -15,24 +15,37 @@ #============================================================================= function(find_and_configure_cudf VERSION) - CPMFindPackage(NAME cudf - VERSION ${VERSION} - GIT_REPOSITORY https://github.com/rapidsai/cudf.git - GIT_TAG branch-${VERSION} - GIT_SHALLOW TRUE - SOURCE_SUBDIR cpp - OPTIONS "BUILD_TESTS OFF" - "BUILD_BENCHMARKS OFF") - if(cudf_ADDED) - set(cudf_ADDED TRUE PARENT_SCOPE) + rapids_cmake_parse_version(MAJOR_MINOR ${VERSION} major_minor) + rapids_cpm_find(cudf ${VERSION} + BUILD_EXPORT_SET cudf_kafka-exports + INSTALL_EXPORT_SET cudf_kafka-exports + CPM_ARGS + GIT_REPOSITORY https://github.com/rapidsai/cudf.git + GIT_TAG branch-${major_minor} + GIT_SHALLOW TRUE + SOURCE_SUBDIR cpp + OPTIONS "BUILD_TESTS OFF" + "BUILD_BENCHMARKS OFF") + # If after loading cudf we now have the CMAKE_CUDA_COMPILER + # variable we know that we need to re-enable the cuda language + if(CMAKE_CUDA_COMPILER) + set(cudf_REQUIRES_CUDA TRUE PARENT_SCOPE) endif() endfunction() set(CUDA_KAFKA_MIN_VERSION_cudf "${CUDA_KAFKA_VERSION_MAJOR}.${CUDA_KAFKA_VERSION_MINOR}.${CUDA_KAFKA_VERSION_PATCH}") find_and_configure_cudf(${CUDA_KAFKA_MIN_VERSION_cudf}) -if(cudf_ADDED) +if(cudf_REQUIRES_CUDA) + rapids_cuda_init_architectures(CUDA_KAFKA) + # Since we are building cudf as part of ourselves we need # to enable the CUDA language in the top-most scope enable_language(CUDA) + + # Since CUDA_KAFKA only enables CUDA optionally we need to manually include the file 
that + # rapids_cuda_init_architectures relies on `project` calling + if(DEFINED CMAKE_PROJECT_CUDA_KAFKA_INCLUDE) + include("${CMAKE_PROJECT_CUDA_KAFKA_INCLUDE}") + endif() endif() diff --git a/cpp/libcudf_kafka/cmake/thirdparty/CUDF_KAFKA_GetRDKafka.cmake b/cpp/libcudf_kafka/cmake/thirdparty/get_rdkafka.cmake similarity index 52% rename from cpp/libcudf_kafka/cmake/thirdparty/CUDF_KAFKA_GetRDKafka.cmake rename to cpp/libcudf_kafka/cmake/thirdparty/get_rdkafka.cmake index 5c07db66668..3a4fffd5000 100644 --- a/cpp/libcudf_kafka/cmake/thirdparty/CUDF_KAFKA_GetRDKafka.cmake +++ b/cpp/libcudf_kafka/cmake/thirdparty/get_rdkafka.cmake @@ -14,12 +14,28 @@ # limitations under the License. #============================================================================= -find_path(RDKAFKA_INCLUDE "librdkafka" HINTS "$ENV{RDKAFKA_ROOT}/include") -find_library(RDKAFKA++_LIBRARY "rdkafka++" HINTS "$ENV{RDKAFKA_ROOT}/lib" "$ENV{RDKAFKA_ROOT}/build") -if(RDKAFKA_INCLUDE AND RDKAFKA++_LIBRARY) - add_library(rdkafka INTERFACE) - target_link_libraries(rdkafka INTERFACE "${RDKAFKA++_LIBRARY}") - target_include_directories(rdkafka INTERFACE "${RDKAFKA_INCLUDE}") - add_library(RDKAFKA::RDKAFKA ALIAS rdkafka) -endif() \ No newline at end of file +function( get_RDKafka ) + rapids_find_generate_module(RDKAFKA + HEADER_NAMES rdkafkacpp.h + INCLUDE_SUFFIXES librdkafka + LIBRARY_NAMES rdkafka++ + BUILD_EXPORT_SET cudf_kafka-exports + INSTALL_EXPORT_SET cudf_kafka-exports + ) + + if(DEFINED ENV{RDKAFKA_ROOT}) + # Since this is inside a function the modification of + # CMAKE_PREFIX_PATH won't leak to other callers/users + list(APPEND CMAKE_PREFIX_PATH "$ENV{RDKAFKA_ROOT}") + list(APPEND CMAKE_PREFIX_PATH "$ENV{RDKAFKA_ROOT}/build") + endif() + + + rapids_find_package(RDKAFKA REQUIRED + BUILD_EXPORT_SET cudf_kafka-exports + INSTALL_EXPORT_SET cudf_kafka-exports) + +endfunction() + +get_RDKafka() diff --git a/cpp/libcudf_kafka/tests/CMakeLists.txt b/cpp/libcudf_kafka/tests/CMakeLists.txt index 
e813ed5439e..f0c2664cd96 100644 --- a/cpp/libcudf_kafka/tests/CMakeLists.txt +++ b/cpp/libcudf_kafka/tests/CMakeLists.txt @@ -17,22 +17,16 @@ ################################################################################################### # - compiler function ----------------------------------------------------------------------------- -function(ConfigureTest CMAKE_TEST_NAME ) - add_executable(${CMAKE_TEST_NAME} ${ARGN}) - set_target_properties(${CMAKE_TEST_NAME} +function(ConfigureTest test_name ) + add_executable(${test_name} ${ARGN}) + set_target_properties(${test_name} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$") - target_link_libraries(${CMAKE_TEST_NAME} PRIVATE GTest::gmock_main GTest::gtest_main cudf_kafka) - target_include_directories(${CMAKE_TEST_NAME} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../include) - add_test(NAME ${CMAKE_TEST_NAME} COMMAND ${CMAKE_TEST_NAME}) + target_link_libraries(${test_name} PRIVATE GTest::gmock_main GTest::gtest_main cudf_kafka) + + add_test(NAME ${test_name} COMMAND ${test_name}) endfunction() ################################################################################################### # - Kafka host tests ---------------------------------------------------------------------------------- ConfigureTest(KAFKA_HOST_TEST kafka_consumer_tests.cpp) - -################################################################################################### -### enable testing ################################################################################ -################################################################################################### - -enable_testing() diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt index bc59e3aee64..d4875b22b89 100755 --- a/java/src/main/native/CMakeLists.txt +++ b/java/src/main/native/CMakeLists.txt @@ -13,26 +13,23 @@ # See the License for the specific language governing permissions and # limitations under the License. 
#============================================================================= -cmake_minimum_required(VERSION 3.18 FATAL_ERROR) +cmake_minimum_required(VERSION 3.20.1 FATAL_ERROR) + +file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-21.10/RAPIDS.cmake + ${CMAKE_BINARY_DIR}/RAPIDS.cmake) +include(${CMAKE_BINARY_DIR}/RAPIDS.cmake) + +include(rapids-cmake) +include(rapids-cuda) +include(rapids-find) # Use GPU_ARCHS if it is defined if(DEFINED GPU_ARCHS) set(CMAKE_CUDA_ARCHITECTURES "${GPU_ARCHS}") endif() +rapids_cuda_init_architectures(CUDF_JNI) -# If `CMAKE_CUDA_ARCHITECTURES` is not defined, build for all supported architectures. If -# `CMAKE_CUDA_ARCHITECTURES` is set to an empty string (""), build for only the current -# architecture. If `CMAKE_CUDA_ARCHITECTURES` is specified by the user, use user setting. - -# This needs to be run before enabling the CUDA language due to the default initialization behavior -# of `CMAKE_CUDA_ARCHITECTURES`, https://gitlab.kitware.com/cmake/cmake/-/issues/21302 -if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES OR CMAKE_CUDA_ARCHITECTURES STREQUAL "ALL") - set(CUDF_JNI_BUILD_FOR_ALL_ARCHS TRUE) -elseif(CMAKE_CUDA_ARCHITECTURES STREQUAL "") - set(CUDF_JNI_BUILD_FOR_DETECTED_ARCHS TRUE) -endif() - -project(CUDF_JNI VERSION 21.10.00 LANGUAGES C CXX) +project(CUDF_JNI VERSION 21.10.00 LANGUAGES C CXX CUDA) ################################################################################################### # - build options --------------------------------------------------------------------------------- @@ -71,11 +68,8 @@ set(CUDF_CUDA_FLAGS "") set(CUDF_CXX_DEFINITIONS "") set(CUDF_CUDA_DEFINITIONS "") -# * find CUDAToolkit package -# * determine GPU architectures -# * enable the CMake CUDA language -# * set other CUDA compilation flags -include(ConfigureCUDA) +rapids_find_package(CUDAToolkit REQUIRED) +include(ConfigureCUDA) # set other CUDA compilation flags # Disable NVTX if necessary if(NOT USE_NVTX) @@ -85,33 
+79,12 @@ endif() if(PER_THREAD_DEFAULT_STREAM) message(STATUS "Using per-thread default stream") add_compile_definitions(CUDA_API_PER_THREAD_DEFAULT_STREAM) -endif(PER_THREAD_DEFAULT_STREAM) +endif() ################################################################################################### # - build type ------------------------------------------------------------------------------------ - # Set a default build type if none was specified -set(DEFAULT_BUILD_TYPE "Release") - -if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) - message(STATUS "Setting build type to '${DEFAULT_BUILD_TYPE}' since none specified.") - set(CMAKE_BUILD_TYPE "${DEFAULT_BUILD_TYPE}" CACHE - STRING "Choose the type of build." FORCE) - # Set the possible values of build type for cmake-gui - set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS - "Debug" "Release" "MinSizeRel" "RelWithDebInfo") -endif() - -################################################################################################### -# - cudart options -------------------------------------------------------------------------------- -# cudart can be statically linked or dynamically linked. The python ecosystem wants dynamic linking - -if(CUDA_STATIC_RUNTIME) - message(STATUS "Enabling static linking of cudart") - set(CUDART_LIBRARY "cudart_static") -else() - set(CUDART_LIBRARY "cudart") -endif(CUDA_STATIC_RUNTIME) +rapids_cmake_build_type("Release") ################################################################################################### # - Thrust/CUB/libcudacxx ------------------------------------------------------------------------------------ @@ -182,7 +155,7 @@ if(CUDF_JNI_ARROW_STATIC) set(CUDF_JNI_ARROW_LIBNAME "libarrow.a") else() set(CUDF_JNI_ARROW_LIBNAME "arrow") -endif(CUDF_JNI_ARROW_STATIC) +endif() find_library(ARROW_LIBRARY ${CUDF_JNI_ARROW_LIBNAME} REQUIRED HINTS "$ENV{ARROW_ROOT}/lib" @@ -194,10 +167,10 @@ if(NOT ARROW_LIBRARY) message(FATAL_ERROR "Arrow static library not found. 
Was libcudf built with CUDF_USE_ARROW_STATIC=ON?") else() message(FATAL_ERROR "Arrow dynamic library not found.") - endif(CUDF_JNI_ARROW_STATIC) + endif() else() message(STATUS "ARROW: ARROW_LIBRARY set to ${ARROW_LIBRARY}") -endif(NOT ARROW_LIBRARY) +endif() ################################################################################################### # - find JNI ------------------------------------------------------------------------------------- @@ -206,7 +179,7 @@ if(JNI_FOUND) message(STATUS "JDK with JNI in ${JNI_INCLUDE_DIRS}") else() message(FATAL_ERROR "JDK with JNI not found, please check your settings.") -endif(JNI_FOUND) +endif() ################################################################################################### # - nvcomp ---------------------------------------------------------------------------------------- @@ -216,9 +189,7 @@ if(NVCOMP_FOUND) message(STATUS "nvcomp compression library found in ${NVCOMP_ROOT}") else() message(FATAL_ERROR "nvcomp compression library not found.") -endif(NVCOMP_FOUND) - -add_library(nvcomp STATIC IMPORTED ${NVCOMP_LIB}) +endif() ################################################################################################### # - GDS/cufile ------------------------------------------------------------------------------------ @@ -226,57 +197,55 @@ add_library(nvcomp STATIC IMPORTED ${NVCOMP_LIB}) if(USE_GDS) message(STATUS "Building with GPUDirect Storage (GDS)/cuFile support") find_package(cuFile REQUIRED) -endif(USE_GDS) +endif() ################################################################################################### -# - include paths --------------------------------------------------------------------------------- +# - library targets ------------------------------------------------------------------------------- -include_directories("${THRUST_INCLUDE}" - "${CUB_INCLUDE}" - "${LIBCUDACXX_INCLUDE}" - "${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}" - "${NVCOMP_INCLUDE_DIR}" - 
"${CMAKE_BINARY_DIR}/include" - "${CMAKE_SOURCE_DIR}/include" - "${SPDLOG_INCLUDE}" - "${CMAKE_SOURCE_DIR}/src" - "${JNI_INCLUDE_DIRS}" - "${CUDF_INCLUDE}" - "${RMM_INCLUDE}" - "${ARROW_INCLUDE}") +add_library(cudfjni SHARED + src/row_conversion.cu + src/AggregationJni.cpp + src/CudfJni.cpp + src/CudaJni.cpp + src/ColumnVectorJni.cpp + src/ColumnViewJni.cpp + src/CompiledExpression.cpp + src/ContiguousTableJni.cpp + src/HashJoinJni.cpp + src/HostMemoryBufferNativeUtilsJni.cpp + src/NvcompJni.cpp + src/NvtxRangeJni.cpp + src/RmmJni.cpp + src/ScalarJni.cpp + src/TableJni.cpp + src/map_lookup.cu) ################################################################################################### -# - library paths --------------------------------------------------------------------------------- - -link_directories("${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}" # CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES is an undocumented/unsupported variable containing the link directories for nvcc - "${CMAKE_BINARY_DIR}/lib") +# - include paths --------------------------------------------------------------------------------- +target_include_directories(cudfjni + PUBLIC + "${THRUST_INCLUDE}" + "${CUB_INCLUDE}" + "${LIBCUDACXX_INCLUDE}" + "${CUDAToolkit_INCLUDE_DIRS}" + "${NVCOMP_INCLUDE_DIR}" + "${CMAKE_BINARY_DIR}/include" + "${CMAKE_SOURCE_DIR}/include" + "${SPDLOG_INCLUDE}" + "${CMAKE_SOURCE_DIR}/src" + "${JNI_INCLUDE_DIRS}" + "${CUDF_INCLUDE}" + "${RMM_INCLUDE}" + "${ARROW_INCLUDE}") ################################################################################################### -# - library targets ------------------------------------------------------------------------------- - -set(SOURCE_FILES - "src/row_conversion.cu" - "src/AggregationJni.cpp" - "src/CudfJni.cpp" - "src/CudaJni.cpp" - "src/ColumnVectorJni.cpp" - "src/ColumnViewJni.cpp" - "src/CompiledExpression.cpp" - "src/ContiguousTableJni.cpp" - "src/HashJoinJni.cpp" - "src/HostMemoryBufferNativeUtilsJni.cpp" - 
"src/NvcompJni.cpp" - "src/NvtxRangeJni.cpp" - "src/RmmJni.cpp" - "src/ScalarJni.cpp" - "src/TableJni.cpp" - "src/map_lookup.cu") -add_library(cudfjni SHARED ${SOURCE_FILES}) +# - compile options --------------------------------------------------------------------------------- #Override RPATH for cudfjni -SET_TARGET_PROPERTIES(cudfjni - PROPERTIES BUILD_RPATH "\$ORIGIN" +set_target_properties(cudfjni + PROPERTIES BUILD_RPATH "\$ORIGIN" + INSTALL_RPATH "\$ORIGIN" # set target compile options CXX_STANDARD 17 CXX_STANDARD_REQUIRED ON @@ -295,18 +264,17 @@ target_compile_definitions(cudfjni ) if(USE_GDS) - add_library(cufilejni SHARED "src/CuFileJni.cpp") - SET_TARGET_PROPERTIES(cufilejni - PROPERTIES BUILD_RPATH "\$ORIGIN" + add_library(cufilejni SHARED src/CuFileJni.cpp) + set_target_properties(cufilejni + PROPERTIES BUILD_RPATH "\$ORIGIN" + INSTALL_RPATH "\$ORIGIN" # set target compile options CXX_STANDARD 17 CXX_STANDARD_REQUIRED ON - CUDA_STANDARD 17 - CUDA_STANDARD_REQUIRED ON ) target_include_directories(cufilejni PRIVATE "${cuFile_INCLUDE_DIRS}") target_link_libraries(cufilejni PRIVATE cudfjni "${cuFile_LIBRARIES}") -endif(USE_GDS) +endif() ################################################################################################### # - rmm logging level ----------------------------------------------------------------------------- @@ -322,4 +290,16 @@ target_compile_definitions(cudfjni PUBLIC SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_${RMM ################################################################################################### # - link libraries -------------------------------------------------------------------------------- -target_link_libraries(cudfjni ${NVCOMP_LIB} ${CUDF_LIB} ${ARROW_LIBRARY} ${CUDART_LIBRARY} cuda) +target_link_libraries(cudfjni PRIVATE nvcomp ${CUDF_LIB} ${ARROW_LIBRARY}) + +################################################################################################### +# - cudart options 
-------------------------------------------------------------------------------- +# cudart can be statically linked or dynamically linked. The python ecosystem wants dynamic linking + +if(CUDA_STATIC_RUNTIME) + # Tell CMake what CUDA language runtime to use + set_target_properties(cudfjni PROPERTIES CUDA_RUNTIME_LIBRARY Static) +else() + # Tell CMake what CUDA language runtime to use + set_target_properties(cudfjni PROPERTIES CUDA_RUNTIME_LIBRARY Shared) +endif() diff --git a/java/src/main/native/cmake/EvalGpuArchs.cmake b/java/src/main/native/cmake/EvalGpuArchs.cmake deleted file mode 100644 index 740987e4785..00000000000 --- a/java/src/main/native/cmake/EvalGpuArchs.cmake +++ /dev/null @@ -1,62 +0,0 @@ -# Copyright (c) 2019, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -function(evaluate_gpu_archs gpu_archs) - set(eval_file ${PROJECT_BINARY_DIR}/eval_gpu_archs.cu) - set(eval_exe ${PROJECT_BINARY_DIR}/eval_gpu_archs) - set(error_file ${PROJECT_BINARY_DIR}/eval_gpu_archs.stderr.log) - file(WRITE ${eval_file} - " -#include -#include -#include -using namespace std; -int main(int argc, char** argv) { - set archs; - int nDevices; - if((cudaGetDeviceCount(&nDevices) == cudaSuccess) && (nDevices > 0)) { - for(int dev=0;dev Date: Tue, 31 Aug 2021 10:41:24 -0400 Subject: [PATCH 5/8] Support additional format specifiers in from_timestamps (#9047) Reference #5991 This PR adds support for the following format specifiers in `cudf::strings::from_timestamps` ``` %a and %A -- weekday names (passed into the API) %b and %B -- month names (passed into the API) %u - ISO weekday (1-7) %w - weekday (0-6) %U - week of the year (Sunday based) %W - week of the year (Monday based) %V - ISO week of the year %G - Year based on ISO weeks ``` This adds a new parameter to the API for the caller to pass the string names for the weekdays and months. These are only required if the `%a, %b, %A, %B` specifiers are contained in the format string. The change to `from_timestamps` is mainly a rewrite to include logic for these specifiers. Some common code required corresponding changes to `to_timestamps` and `is_timestamp` though these functions have not changed in this PR.
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Devavret Makkar (https://github.com/devavret) - Conor Hoekstra (https://github.com/codereport) URL: https://github.com/rapidsai/cudf/pull/9047 --- .../cudf/strings/convert/convert_datetime.hpp | 84 +- .../cudf/strings/detail/converters.hpp | 3 +- cpp/src/io/csv/writer_impl.cu | 7 +- cpp/src/strings/convert/convert_datetime.cu | 965 ++++++++++-------- cpp/tests/strings/datetime_tests.cpp | 140 +++ 5 files changed, 752 insertions(+), 447 deletions(-) diff --git a/cpp/include/cudf/strings/convert/convert_datetime.hpp b/cpp/include/cudf/strings/convert/convert_datetime.hpp index 39bd6c639aa..4abca96e32a 100644 --- a/cpp/include/cudf/strings/convert/convert_datetime.hpp +++ b/cpp/include/cudf/strings/convert/convert_datetime.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -18,6 +18,9 @@ #include #include +#include +#include + namespace cudf { namespace strings { /** @@ -135,7 +138,20 @@ std::unique_ptr is_timestamp( * | \%z | Always outputs "+0000" | * | \%Z | Always outputs "UTC" | * | \%j | Day of the year: 001-366 | - * | \%p | Only 'AM' or 'PM' | + * | \%u | ISO weekday where Monday is 1 and Sunday is 7 | + * | \%w | Weekday where Sunday is 0 and Saturday is 6 | + * | \%U | Week of the year with Sunday as the first day: 00-53 | + * | \%W | Week of the year with Monday as the first day: 00-53 | + * | \%V | Week of the year per ISO-8601 format: 01-53 | + * | \%G | Year based on the ISO-8601 weeks: 0000-9999 | + * | \%p | AM/PM from `timestamp_names::am_str/pm_str` | + * | \%a | Weekday abbreviation from the `names` parameter | + * | \%A | Weekday from the `names` parameter | + * | \%b | Month name abbreviation from the `names` parameter | + * | \%B | Month name from the `names` parameter | + * + * Additional descriptions can be found here: + * https://en.cppreference.com/w/cpp/chrono/system_clock/formatter * * No checking is done for invalid formats or invalid timestamp values. * All timestamps values are formatted to UTC. @@ -143,25 +159,75 @@ std::unique_ptr is_timestamp( * Any null input entry will result in a corresponding null entry in the output column. * * The time units of the input column do not influence the number of digits written by - * the "%f" specifier. - * The "%f" supports a precision value to write out numeric digits for the subsecond value. - * Specify the precision with a single integer value (1-9) between the "%" and the "f" as follows: - * use "%3f" for milliseconds, "%6f" for microseconds and "%9f" for nanoseconds. - * If the precision is higher than the units, then zeroes are padded to the right of - * the subsecond value. - * If the precision is lower than the units, the subsecond value may be truncated. + * the "%f" specifier. 
The "%f" supports a precision value to write out numeric digits + * for the subsecond value. Specify the precision with a single integer value (1-9) + * between the "%" and the "f" as follows: use "%3f" for milliseconds, use "%6f" for + * microseconds and use "%9f" for nanoseconds. If the precision is higher than the + * units, then zeroes are padded to the right of the subsecond value. If the precision + * is lower than the units, the subsecond value may be truncated. + * + * If the "%a", "%A", "%b", "%B" specifiers are included in the format, the caller + * should provide the format names in the `names` strings column using the following + * as a guide: + * + * @code{.pseudo} + * ["AM", "PM", // specify the AM/PM strings + * "Sunday", "Monday", ..., "Saturday", // Weekday full names + * "Sun", "Mon", ..., "Sat", // Weekday abbreviated names + * "January", "February", ..., "December", // Month full names + * "Jan", "Feb", ..., "Dec"] // Month abbreviated names + * @endcode + * + * The result is undefined if the format names are not provided for these specifiers. + * + * These format names can be retrieved for specific locales using the `nl_langinfo` + * functions from C++ `clocale` (std) library or the Python `locale` library. 
+ * + * The following code is an example of retrieving these strings from the locale + * using c++ std functions: + * + * @code{.cpp} + * #include + * #include + * + * // note: install language pack on Ubuntu using 'apt-get install language-pack-de' + * { + * // set to a German language locale for date settings + * std::setlocale(LC_TIME, "de_DE.UTF-8"); + * + * std::vector names({nl_langinfo(AM_STR), nl_langinfo(PM_STR), + * nl_langinfo(DAY_1), nl_langinfo(DAY_2), nl_langinfo(DAY_3), nl_langinfo(DAY_4), + * nl_langinfo(DAY_5), nl_langinfo(DAY_6), nl_langinfo(DAY_7), + * nl_langinfo(ABDAY_1), nl_langinfo(ABDAY_2), nl_langinfo(ABDAY_3), nl_langinfo(ABDAY_4), + * nl_langinfo(ABDAY_5), nl_langinfo(ABDAY_6), nl_langinfo(ABDAY_7), + * nl_langinfo(MON_1), nl_langinfo(MON_2), nl_langinfo(MON_3), nl_langinfo(MON_4), + * nl_langinfo(MON_5), nl_langinfo(MON_6), nl_langinfo(MON_7), nl_langinfo(MON_8), + * nl_langinfo(MON_9), nl_langinfo(MON_10), nl_langinfo(MON_11), nl_langinfo(MON_12), + * nl_langinfo(ABMON_1), nl_langinfo(ABMON_2), nl_langinfo(ABMON_3), nl_langinfo(ABMON_4), + * nl_langinfo(ABMON_5), nl_langinfo(ABMON_6), nl_langinfo(ABMON_7), nl_langinfo(ABMON_8), + * nl_langinfo(ABMON_9), nl_langinfo(ABMON_10), nl_langinfo(ABMON_11), nl_langinfo(ABMON_12)}); + * + * std::setlocale(LC_TIME,""); // reset to default locale + * } + * @endcode * * @throw cudf::logic_error if `timestamps` column parameter is not a timestamp type. + * @throw cudf::logic_error if the `format` string is empty + * @throw cudf::logic_error if `names.size()` is an invalid size. Must be 0 or 40 strings. * * @param timestamps Timestamp values to convert. * @param format The string specifying output format. * Default format is "%Y-%m-%dT%H:%M:%SZ". + * @param names The string names to use for weekdays ("%a", "%A") and months ("%b", "%B") + * Default is an empty `strings_column_view`. * @param mr Device memory resource used to allocate the returned column's device memory. 
* @return New strings column with formatted timestamps. */ std::unique_ptr from_timestamps( column_view const& timestamps, std::string const& format = "%Y-%m-%dT%H:%M:%SZ", + strings_column_view const& names = strings_column_view(column_view{ + data_type{type_id::STRING}, 0, nullptr}), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/detail/converters.hpp b/cpp/include/cudf/strings/detail/converters.hpp index d91979708e0..b44276fcc33 100644 --- a/cpp/include/cudf/strings/detail/converters.hpp +++ b/cpp/include/cudf/strings/detail/converters.hpp @@ -100,12 +100,13 @@ std::unique_ptr to_timestamps(strings_column_view const& strings, /** * @copydoc from_timestamps(strings_column_view const&,std::string - * const&,rmm::mr::device_memory_resource*) + * const&,strings_column_view const&,rmm::mr::device_memory_resource*) * * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ std::unique_ptr from_timestamps(column_view const& timestamps, std::string const& format, + strings_column_view const& names, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu index 335634b7fa8..f50aae72418 100644 --- a/cpp/src/io/csv/writer_impl.cu +++ b/cpp/src/io/csv/writer_impl.cu @@ -231,7 +231,12 @@ struct column_to_strings_fn { format = "\"" + format + "\""; } - return cudf::strings::detail::from_timestamps(column, format, stream_, mr_); + return cudf::strings::detail::from_timestamps( + column, + format, + strings_column_view(column_view{data_type{type_id::STRING}, 0, nullptr}), + stream_, + mr_); } template diff --git a/cpp/src/strings/convert/convert_datetime.cu b/cpp/src/strings/convert/convert_datetime.cu index d804ac66961..ce5eb015039 100644 --- a/cpp/src/strings/convert/convert_datetime.cu +++ b/cpp/src/strings/convert/convert_datetime.cu @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -26,17 +27,19 @@ #include #include #include -#include +#include #include #include #include #include +#include #include #include #include +#include #include namespace cudf { @@ -45,38 +48,23 @@ namespace detail { namespace { /** - * @brief Units for timestamp conversion. - * These are defined since there are more than what cudf supports. 
+ * @brief Structure of date/time components */ -enum class timestamp_units { - years, ///< precision is years - months, ///< precision is months - days, ///< precision is days - hours, ///< precision is hours - minutes, ///< precision is minutes - seconds, ///< precision is seconds - ms, ///< precision is milliseconds - us, ///< precision is microseconds - ns ///< precision is nanoseconds -}; - -// used to index values in a timeparts array -enum timestamp_parse_component { - TP_YEAR = 0, - TP_MONTH = 1, - TP_DAY = 2, - TP_DAY_OF_YEAR = 3, - TP_HOUR = 4, - TP_MINUTE = 5, - TP_SECOND = 6, - TP_SUBSECOND = 7, - TP_TZ_MINUTES = 8, - TP_ARRAYSIZE = 9 +struct timestamp_components { + int16_t year; + int8_t month; + int8_t day; + int16_t day_of_year; + int8_t hour; + int8_t minute; + int8_t second; + int32_t subsecond; + int32_t tz_minutes; }; enum class format_char_type : int8_t { - literal, // literal char type passed through - specifier // timestamp format specifier + literal, ///< literal char type passed through + specifier ///< timestamp format specifier }; /** @@ -93,49 +81,49 @@ struct alignas(4) format_item { { return format_item{format_char_type::specifier, format_char, length}; } - static format_item new_delimiter(char literal) + static format_item new_literal(char literal) { return format_item{format_char_type::literal, literal, 1}; } }; /** - * @brief The format_compiler parses a timestamp format string into a vector of - * format_items. + * @brief The format-compiler parses a timestamp format string into a vector of + * `format_items`. * - * The vector of format_items are used when parsing a string into timestamp + * The vector of `format_items` is used when parsing a string into timestamp * components and when formatting a string from timestamp components. 
*/ +using specifier_map = std::map; + struct format_compiler { - std::string format; - std::string template_string; + std::string const format; rmm::device_uvector d_items; - std::map specifier_lengths = {{'Y', 4}, - {'y', 2}, - {'m', 2}, - {'d', 2}, - {'H', 2}, - {'I', 2}, - {'M', 2}, - {'S', 2}, - {'f', 6}, - {'z', 5}, - {'Z', 3}, - {'p', 2}, - {'j', 3}}; - - format_compiler(const char* fmt, rmm::cuda_stream_view stream) : format(fmt), d_items(0, stream) + // clang-format off + // The specifiers are documented here (not all are supported): + // https://en.cppreference.com/w/cpp/chrono/system_clock/formatter + specifier_map specifiers = { + {'Y', 4}, {'y', 2}, {'m', 2}, {'d', 2}, {'H', 2}, {'I', 2}, {'M', 2}, + {'S', 2}, {'f', 6}, {'z', 5}, {'Z', 3}, {'p', 2}, {'j', 3}}; + // clang-format on + + format_compiler(std::string fmt, + rmm::cuda_stream_view stream, + specifier_map extra_specifiers = {}) + : format(fmt), d_items(0, stream) { + specifiers.insert(extra_specifiers.begin(), extra_specifiers.end()); std::vector items; const char* str = format.c_str(); auto length = format.length(); while (length > 0) { char ch = *str++; length--; + + // first check for a literal character if (ch != '%') { - items.push_back(format_item::new_delimiter(ch)); - template_string.append(1, ch); + items.push_back(format_item::new_literal(ch)); continue; } CUDF_EXPECTS(length > 0, "Unfinished specifier in timestamp format"); @@ -144,45 +132,42 @@ struct format_compiler { length--; if (ch == '%') // escaped % char { - items.push_back(format_item::new_delimiter(ch)); - template_string.append(1, ch); + items.push_back(format_item::new_literal(ch)); continue; } if (ch >= '0' && ch <= '9') { CUDF_EXPECTS(*str == 'f', "precision not supported for specifier: " + std::string(1, *str)); - specifier_lengths[*str] = static_cast(ch - '0'); - ch = *str++; + specifiers[*str] = static_cast(ch - '0'); + ch = *str++; length--; } - CUDF_EXPECTS(specifier_lengths.find(ch) != specifier_lengths.end(), + + 
// check if the specifier found is supported + CUDF_EXPECTS(specifiers.find(ch) != specifiers.end(), "invalid format specifier: " + std::string(1, ch)); - int8_t spec_length = specifier_lengths[ch]; - items.push_back(format_item::new_specifier(ch, spec_length)); - template_string.append((size_t)spec_length, ch); + // create the format item for this specifier + items.push_back(format_item::new_specifier(ch, specifiers[ch])); } - // create program in device memory - d_items.resize(items.size(), stream); - CUDA_TRY(cudaMemcpyAsync(d_items.data(), - items.data(), - items.size() * sizeof(items[0]), - cudaMemcpyHostToDevice, - stream.value())); + + // copy format_items to device memory + d_items = cudf::detail::make_device_uvector_async(items, stream); } - format_item const* format_items() { return d_items.data(); } - size_type template_bytes() const { return static_cast(template_string.size()); } - size_type items_count() const { return static_cast(d_items.size()); } - int8_t subsecond_precision() const { return specifier_lengths.at('f'); } + device_span format_items() { return device_span(d_items); } + + int8_t subsecond_precision() const { return specifiers.at('f'); } }; -// this parses date/time characters into a timestamp integer -template // timestamp type +/** + * @brief This parses date/time characters into a timestamp integer + * + * @tparam T cudf::timestamp type + */ +template struct parse_datetime { column_device_view const d_strings; - format_item const* d_format_items; - size_type items_count; - timestamp_units units; + device_span const d_format_items; int8_t subsecond_precision; /** @@ -210,16 +195,17 @@ struct parse_datetime { return value; } - // Walk the format_items to read the datetime string. - // Returns 0 if all ok. 
- __device__ int parse_into_parts(string_view const& d_string, int32_t* timeparts) + // Walk the format_items to parse the string into date/time components + __device__ timestamp_components parse_into_parts(string_view const& d_string) { + timestamp_components timeparts = {1970, 1, 1, 0}; // init to epoch time + auto ptr = d_string.data(); auto length = d_string.size_bytes(); - for (size_t idx = 0; idx < items_count; ++idx) { - auto item = d_format_items[idx]; + for (auto item : d_format_items) { if (item.value != 'f') item.length = static_cast(std::min(static_cast(item.length), length)); + if (item.item_type == format_char_type::literal) { // static character we'll just skip; // consume item.length bytes from string @@ -230,93 +216,77 @@ struct parse_datetime { // special logic for each specifier switch (item.value) { - case 'Y': timeparts[TP_YEAR] = str2int(ptr, item.length); break; + case 'Y': timeparts.year = static_cast(str2int(ptr, item.length)); break; case 'y': { - auto const year = str2int(ptr, item.length); - timeparts[TP_YEAR] = year + (year < 69 ? 2000 : 1900); + auto const year = str2int(ptr, item.length); + timeparts.year = static_cast(year + (year < 69 ? 
2000 : 1900)); break; } - case 'm': timeparts[TP_MONTH] = str2int(ptr, item.length); break; - case 'd': timeparts[TP_DAY] = str2int(ptr, item.length); break; - case 'j': timeparts[TP_DAY_OF_YEAR] = str2int(ptr, item.length); break; + case 'm': timeparts.month = static_cast(str2int(ptr, item.length)); break; + case 'd': timeparts.day = static_cast(str2int(ptr, item.length)); break; + case 'j': timeparts.day_of_year = static_cast(str2int(ptr, item.length)); break; case 'H': - case 'I': timeparts[TP_HOUR] = str2int(ptr, item.length); break; - case 'M': timeparts[TP_MINUTE] = str2int(ptr, item.length); break; - case 'S': timeparts[TP_SECOND] = str2int(ptr, item.length); break; + case 'I': timeparts.hour = static_cast(str2int(ptr, item.length)); break; + case 'M': timeparts.minute = static_cast(str2int(ptr, item.length)); break; + case 'S': timeparts.second = static_cast(str2int(ptr, item.length)); break; case 'f': { int32_t const read_size = std::min(static_cast(item.length), static_cast(length)); - int64_t const fraction = str2int(ptr, read_size) * power_of_ten(item.length - read_size); - timeparts[TP_SUBSECOND] = static_cast(fraction); + int64_t const fraction = str2int(ptr, read_size) * power_of_ten(item.length - read_size); + timeparts.subsecond = static_cast(fraction); break; } case 'p': { string_view am_pm(ptr, 2); - auto hour = timeparts[TP_HOUR]; + auto hour = timeparts.hour; if ((am_pm.compare("AM", 2) == 0) || (am_pm.compare("am", 2) == 0)) { if (hour == 12) hour = 0; } else if (hour < 12) hour += 12; - timeparts[TP_HOUR] = hour; + timeparts.hour = hour; break; } case 'z': { - int sign = *ptr == '-' ? 1 : -1; // revert timezone back to UTC - int hh = str2int(ptr + 1, 2); - int mm = str2int(ptr + 3, 2); + auto const sign = *ptr == '-' ? 
1 : -1; // revert timezone back to UTC + auto const hh = str2int(ptr + 1, 2); + auto const mm = str2int(ptr + 3, 2); // ignoring the rest for now // item.length has how many chars we should read - timeparts[TP_TZ_MINUTES] = sign * ((hh * 60) + mm); + timeparts.tz_minutes = sign * ((hh * 60) + mm); break; } case 'Z': break; // skip - default: return 3; + default: break; } ptr += item.length; length -= item.length; } - return 0; + return timeparts; } - __device__ int64_t timestamp_from_parts(int32_t const* timeparts, timestamp_units units) + __device__ int64_t timestamp_from_parts(timestamp_components const& timeparts) { - auto year = timeparts[TP_YEAR]; - if (units == timestamp_units::years) return year - 1970; - auto month = timeparts[TP_MONTH]; - if (units == timestamp_units::months) - return ((year - 1970) * 12) + (month - 1); // months are 1-12, need to 0-base it here - auto day = timeparts[TP_DAY]; - auto ymd = // convenient chrono class handles the leap year calculations for us - cuda::std::chrono::year_month_day(cuda::std::chrono::year{year}, - cuda::std::chrono::month{static_cast(month)}, - cuda::std::chrono::day{static_cast(day)}); - int32_t days = cuda::std::chrono::sys_days(ymd).time_since_epoch().count(); - if (units == timestamp_units::days) return days; - - auto tzadjust = timeparts[TP_TZ_MINUTES]; // in minutes - auto hour = timeparts[TP_HOUR]; - if (units == timestamp_units::hours) return (days * 24L) + hour + (tzadjust / 60); - - auto minute = timeparts[TP_MINUTE]; - if (units == timestamp_units::minutes) - return static_cast(days * 24L * 60L) + (hour * 60L) + minute + tzadjust; - - auto second = timeparts[TP_SECOND]; - int64_t timestamp = - (days * 24L * 3600L) + (hour * 3600L) + (minute * 60L) + second + (tzadjust * 60); - if (units == timestamp_units::seconds) return timestamp; - - int64_t subsecond = - timeparts[TP_SUBSECOND] * power_of_ten(9 - subsecond_precision); // normalize to nanoseconds - if (units == timestamp_units::ms) { - timestamp *= 
1000L; - subsecond = subsecond / 1000000L; - } else if (units == timestamp_units::us) { - timestamp *= 1000000L; - subsecond = subsecond / 1000L; - } else if (units == timestamp_units::ns) - timestamp *= 1000000000L; + auto const ymd = // convenient chrono class handles the leap year calculations for us + cuda::std::chrono::year_month_day( + cuda::std::chrono::year{timeparts.year}, + cuda::std::chrono::month{static_cast(timeparts.month)}, + cuda::std::chrono::day{static_cast(timeparts.day)}); + auto const days = cuda::std::chrono::sys_days(ymd).time_since_epoch().count(); + + if constexpr (std::is_same_v) { return days; } + + int64_t timestamp = (days * 24L * 3600L) + (timeparts.hour * 3600L) + (timeparts.minute * 60L) + + timeparts.second + (timeparts.tz_minutes * 60L); + + if constexpr (std::is_same_v) { return timestamp; } + + int64_t const subsecond = + (timeparts.subsecond * power_of_ten(9 - subsecond_precision)) / // normalize to nanoseconds + (1000000000L / T::period::type::den); // and rescale to T + + timestamp *= T::period::type::den; timestamp += subsecond; + return timestamp; } @@ -326,73 +296,34 @@ struct parse_datetime { if (d_strings.is_null(idx)) return epoch_time; string_view d_str = d_strings.element(idx); if (d_str.empty()) return epoch_time; - // - int32_t timeparts[TP_ARRAYSIZE] = {1970, 1, 1}; // month and day are 1-based - if (parse_into_parts(d_str, timeparts)) return epoch_time; // unexpected parse case - // - return T{T::duration(timestamp_from_parts(timeparts, units))}; - } -}; -// convert cudf type to timestamp units -struct dispatch_timestamp_to_units_fn { - template - timestamp_units operator()() - { - CUDF_FAIL("Invalid type for timestamp conversion."); + auto const timeparts = parse_into_parts(d_str); + + return T{T::duration(timestamp_from_parts(timeparts))}; } }; -template <> -timestamp_units dispatch_timestamp_to_units_fn::operator()() -{ - return timestamp_units::days; -} -template <> -timestamp_units 
dispatch_timestamp_to_units_fn::operator()() -{ - return timestamp_units::seconds; -} -template <> -timestamp_units dispatch_timestamp_to_units_fn::operator()() -{ - return timestamp_units::ms; -} -template <> -timestamp_units dispatch_timestamp_to_units_fn::operator()() -{ - return timestamp_units::us; -} -template <> -timestamp_units dispatch_timestamp_to_units_fn::operator()() -{ - return timestamp_units::ns; -} - -// dispatch operator to map timestamp to native fixed-width-type +/** + * @brief Type-dispatch operator to convert timestamp strings to native fixed-width-type + */ struct dispatch_to_timestamps_fn { template ()>* = nullptr> void operator()(column_device_view const& d_strings, std::string const& format, - timestamp_units units, mutable_column_view& results_view, rmm::cuda_stream_view stream) const { - format_compiler compiler(format.c_str(), stream); - auto d_items = compiler.format_items(); - auto d_results = results_view.data(); - parse_datetime pfn{ - d_strings, d_items, compiler.items_count(), units, compiler.subsecond_precision()}; + format_compiler compiler(format, stream); + parse_datetime pfn{d_strings, compiler.format_items(), compiler.subsecond_precision()}; thrust::transform(rmm::exec_policy(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(results_view.size()), - d_results, + results_view.data(), pfn); } template ()>* = nullptr> void operator()(column_device_view const&, std::string const&, - timestamp_units, mutable_column_view&, rmm::cuda_stream_view) const { @@ -403,31 +334,31 @@ struct dispatch_to_timestamps_fn { } // namespace // -std::unique_ptr to_timestamps(strings_column_view const& strings, +std::unique_ptr to_timestamps(strings_column_view const& input, data_type timestamp_type, std::string const& format, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - size_type strings_count = strings.size(); - if (strings_count == 0) return make_timestamp_column(timestamp_type, 0); + if 
(input.is_empty()) + return make_empty_column(timestamp_type); // make_timestamp_column(timestamp_type, 0); CUDF_EXPECTS(!format.empty(), "Format parameter must not be empty."); - timestamp_units units = cudf::type_dispatcher(timestamp_type, dispatch_timestamp_to_units_fn()); - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_column = *strings_column; + auto d_strings = column_device_view::create(input.parent(), stream); - auto results = make_timestamp_column(timestamp_type, - strings_count, - cudf::detail::copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), + auto results = make_timestamp_column(timestamp_type, + input.size(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + input.null_count(), stream, mr); + auto results_view = results->mutable_view(); cudf::type_dispatcher( - timestamp_type, dispatch_to_timestamps_fn(), d_column, format, units, results_view, stream); - results->set_null_count(strings.null_count()); + timestamp_type, dispatch_to_timestamps_fn(), *d_strings, format, results_view, stream); + + results->set_null_count(input.null_count()); return results; } @@ -438,8 +369,7 @@ std::unique_ptr to_timestamps(strings_column_view const& strings, */ struct check_datetime_format { column_device_view const d_strings; - format_item const* d_format_items; - size_type items_count; + device_span const d_format_items; /** * @brief Check the specified characters are between ['0','9']. @@ -508,16 +438,17 @@ struct check_datetime_format { * The checking here is a little more strict than the actual * parser used for conversion. 
*/ - __device__ bool check_string(string_view const& d_string, int32_t* dateparts) + __device__ thrust::optional check_string(string_view const& d_string) { + timestamp_components dateparts = {1970, 1, 1, 0}; // init to epoch time + auto ptr = d_string.data(); auto length = d_string.size_bytes(); - for (size_t idx = 0; idx < items_count; ++idx) { - auto item = d_format_items[idx]; + for (auto item : d_format_items) { // eliminate static character values first if (item.item_type == format_char_type::literal) { // check static character matches - if (*ptr != item.value) return false; + if (*ptr != item.value) return thrust::nullopt; ptr += item.length; length -= item.length; continue; @@ -532,30 +463,30 @@ struct check_datetime_format { switch (item.value) { case 'Y': { if (auto value = str2int(ptr, item.length)) { - result = true; - dateparts[TP_YEAR] = value.value(); + result = true; + dateparts.year = static_cast(value.value()); } break; } case 'y': { if (auto value = str2int(ptr, item.length)) { - result = true; - auto const year = value.value(); - dateparts[TP_YEAR] = year + (year < 69 ? 2000 : 1900); + result = true; + auto const year = value.value(); + dateparts.year = static_cast(year + (year < 69 ? 
2000 : 1900)); } break; } case 'm': { if (auto value = str2int(ptr, item.length)) { - result = true; - dateparts[TP_MONTH] = value.value(); + result = true; + dateparts.month = static_cast(value.value()); } break; } case 'd': { if (auto value = str2int(ptr, item.length)) { - result = true; - dateparts[TP_DAY] = value.value(); + result = true; + dateparts.day = static_cast(value.value()); } break; } @@ -587,23 +518,26 @@ struct check_datetime_format { case 'Z': result = true; // skip default: break; } - if (!result) return false; + if (!result) return thrust::nullopt; ptr += item.length; length -= item.length; } - return true; + return dateparts; } __device__ bool operator()(size_type idx) { if (d_strings.is_null(idx)) return false; + string_view d_str = d_strings.element(idx); if (d_str.empty()) return false; - int32_t dateparts[] = {1970, 1, 1}; // year, month, day - if (!check_string(d_str, dateparts)) return false; - auto year = dateparts[TP_YEAR]; - auto month = static_cast(dateparts[TP_MONTH]); - auto day = static_cast(dateparts[TP_DAY]); + + auto const dateparts = check_string(d_str); + if (!dateparts.has_value()) return false; + + auto const year = dateparts.value().year; + auto const month = static_cast(dateparts.value().month); + auto const day = static_cast(dateparts.value().day); return cuda::std::chrono::year_month_day(cuda::std::chrono::year{year}, cuda::std::chrono::month{month}, cuda::std::chrono::day{day}) @@ -611,36 +545,34 @@ struct check_datetime_format { } }; -std::unique_ptr is_timestamp(strings_column_view const& strings, +std::unique_ptr is_timestamp(strings_column_view const& input, std::string const& format, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - size_type strings_count = strings.size(); + size_type strings_count = input.size(); if (strings_count == 0) return make_empty_column(data_type{type_id::BOOL8}); CUDF_EXPECTS(!format.empty(), "Format parameter must not be empty."); - auto strings_column = 
column_device_view::create(strings.parent(), stream); - auto d_strings = *strings_column; + auto d_strings = column_device_view::create(input.parent(), stream); auto results = make_numeric_column(data_type{type_id::BOOL8}, strings_count, - cudf::detail::copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + input.null_count(), stream, mr); auto d_results = results->mutable_view().data(); - format_compiler compiler(format.c_str(), stream); - thrust::transform( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - d_results, - check_datetime_format{d_strings, compiler.format_items(), compiler.items_count()}); + format_compiler compiler(format, stream); + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(strings_count), + d_results, + check_datetime_format{*d_strings, compiler.format_items()}); - results->set_null_count(strings.null_count()); + results->set_null_count(input.null_count()); return results; } @@ -648,141 +580,205 @@ std::unique_ptr is_timestamp(strings_column_view const& strings, // external APIs -std::unique_ptr to_timestamps(strings_column_view const& strings, +std::unique_ptr to_timestamps(strings_column_view const& input, data_type timestamp_type, std::string const& format, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::to_timestamps(strings, timestamp_type, format, rmm::cuda_stream_default, mr); + return detail::to_timestamps(input, timestamp_type, format, rmm::cuda_stream_default, mr); } -std::unique_ptr is_timestamp(strings_column_view const& strings, +std::unique_ptr is_timestamp(strings_column_view const& input, std::string const& format, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::is_timestamp(strings, format, rmm::cuda_stream_default, mr); + return detail::is_timestamp(input, format, 
rmm::cuda_stream_default, mr); } namespace detail { namespace { -// converts a timestamp into date-time string + +constexpr size_type format_names_size = 40; // 2(am/pm) + 2x7(weekdays) + 2x12(months) +constexpr size_type offset_weekdays = 2; +constexpr size_type offset_months = 16; +constexpr size_type days_in_week = 7; +constexpr size_type months_in_year = 12; + +/** + * @brief Time components used by the date_time_formatter + */ +struct time_components { + int8_t hour; + int8_t minute; + int8_t second; + int32_t subsecond; +}; + +/** + * @brief Base class for the `from_timestamps_size_fn` and the `date_time_formatter` + * + * These contain some common utility functions used by both subclasses. + */ template -struct datetime_formatter { - const column_device_view d_timestamps; - const format_item* d_format_items; - size_type items_count; - timestamp_units units; - const int32_t* d_offsets; - char* d_chars; - - __device__ cudf::timestamp_D::duration convert_to_days(int64_t timestamp, timestamp_units units) +struct from_timestamp_base { + /** + * @brief Specialized modulo expression that handles negative values. 
+ * + * @code{.pseudo} + * Examples: + * modulo(1,60) -> 1 + * modulo(-1,60) -> 59 + * @endcode + */ + __device__ int32_t modulo_time(int64_t time, int64_t base) const { - using namespace cuda::std::chrono; - using minutes = duration; - using hours = duration; - switch (units) { - case timestamp_units::minutes: return floor(minutes(timestamp)); - case timestamp_units::seconds: return floor(cudf::timestamp_s::duration(timestamp)); - case timestamp_units::hours: return floor(hours(timestamp)); - case timestamp_units::ms: return floor(cudf::timestamp_ms::duration(timestamp)); - case timestamp_units::us: return floor(cudf::timestamp_us::duration(timestamp)); - case timestamp_units::ns: return floor(cudf::timestamp_ns::duration(timestamp)); - default: return cudf::timestamp_D::duration(timestamp); - } - } + return static_cast(((time % base) + base) % base); + }; - // divide timestamp integer into time components (year, month, day, etc) - // TODO call the cuda::std::chrono methods here instead when they are ready - __device__ void dissect_timestamp(int64_t timestamp, int32_t* timeparts) + /** + * @brief This function handles converting units by dividing and adjusting for negative values. 
+ * + * @code{.pseudo} + * Examples: + * scale(-61,60) -> -2 + * scale(-60,60) -> -1 + * scale(-59,60) -> -1 + * scale( 59,60) -> 0 + * scale( 60,60) -> 1 + * scale( 61,60) -> 1 + * @endcode + */ + __device__ int32_t scale_time(int64_t time, int64_t base) const + { + return static_cast((time - ((time < 0) * (base - 1L))) / base); + }; + + __device__ time_components get_time_components(int64_t tstamp) const { - if (units == timestamp_units::years) { - timeparts[TP_YEAR] = static_cast(timestamp) + 1970; - timeparts[TP_MONTH] = 1; - timeparts[TP_DAY] = 1; - return; + time_components result = {0}; + if constexpr (std::is_same_v) { return result; } + + // Note: Tried using: cuda::std::chrono::hh_mm_ss(T::duration(timestamp)); + // and retrieving the hour, minute, second, and subsecond values from it + // but it did not scale/modulo the components for negative timestamps + // correctly -- it simply did an abs(timestamp) as documented here: + // https://en.cppreference.com/w/cpp/chrono/hh_mm_ss/hh_mm_ss + + if constexpr (not std::is_same_v) { + int64_t constexpr base = T::period::type::den; // 1000=ms, 1000000=us, etc + auto const subsecond = modulo_time(tstamp, base); + tstamp = tstamp / base - ((tstamp < 0) and (subsecond != 0)); + result.subsecond = subsecond; } - // Specialized modulo expression that handles negative values. - // Examples: - // modulo(1,60) 1 - // modulo(-1,60) 59 - auto modulo_time = [](int64_t time, int64_t base) { - return static_cast(((time % base) + base) % base); - }; + result.hour = modulo_time(scale_time(tstamp, 3600), 24); + result.minute = modulo_time(scale_time(tstamp, 60), 60); + result.second = modulo_time(tstamp, 60); - // This function handles converting units by dividing and adjusting for negative values. 
- // Examples: - // scale(-61,60) -2 - // scale(-60,60) -1 - // scale(-59,60) -1 - // scale( 59,60) 0 - // scale( 60,60) 1 - // scale( 61,60) 1 - auto scale_time = [](int64_t time, int64_t base) { - return static_cast((time - ((time < 0) * (base - 1L))) / base); - }; + return result; + } +}; - if (units == timestamp_units::months) { - int32_t month = modulo_time(timestamp, 12); - int32_t year = scale_time(timestamp, 12) + 1970; - timeparts[TP_YEAR] = year; - timeparts[TP_MONTH] = month + 1; // months start at 1 and not 0 - timeparts[TP_DAY] = 1; - return; - } +template +struct from_timestamps_size_fn : public from_timestamp_base { + column_device_view const d_timestamps; + column_device_view const d_format_names; + device_span const d_format_items; + + from_timestamps_size_fn(column_device_view const& d_timestamps, + column_device_view const& d_format_names, + device_span const& d_format_items) + : d_timestamps(d_timestamps), d_format_names(d_format_names), d_format_items(d_format_items) + { + } - // first, convert to days so we can handle months, years, day of the year. - auto const days = convert_to_days(timestamp, units); - auto const ymd = cuda::std::chrono::year_month_day(cuda::std::chrono::sys_days(days)); - auto const year = static_cast(ymd.year()); - auto const month = static_cast(ymd.month()); - auto const day = static_cast(ymd.day()); + __device__ size_type operator()(size_type idx) const + { + if (d_timestamps.is_null(idx)) { return 0; } - int32_t const monthDayOffset[] = {0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334}; - timeparts[TP_DAY_OF_YEAR] = - day + monthDayOffset[month - 1] + (month > 2 and ymd.year().is_leap()); + // We only dissect the timestamp into components if needed + // by a specifier. And then we only do it once and reuse it. + // This can improve performance when not using uncommon specifiers. 
+ thrust::optional days; - timeparts[TP_YEAR] = year; - timeparts[TP_MONTH] = month; - timeparts[TP_DAY] = day; - if (units == timestamp_units::days) return; + auto days_from_timestamp = [&]() { + auto const tstamp = d_timestamps.element(idx).time_since_epoch().count(); + return cuda::std::chrono::sys_days(static_cast( + floor(T::duration(tstamp)))); + }; - // done with date, now work on time + size_type bytes = 0; // output size + for (auto item : d_format_items) { + if (item.item_type == format_char_type::literal) { + bytes += item.length; + continue; + } - if (units == timestamp_units::hours) { - timeparts[TP_HOUR] = modulo_time(timestamp, 24); - return; - } - if (units == timestamp_units::minutes) { - timeparts[TP_HOUR] = modulo_time(scale_time(timestamp, 60), 24); - timeparts[TP_MINUTE] = modulo_time(timestamp, 60); - return; - } - if (units == timestamp_units::seconds) { - timeparts[TP_HOUR] = modulo_time(scale_time(timestamp, 3600), 24); - timeparts[TP_MINUTE] = modulo_time(scale_time(timestamp, 60), 60); - timeparts[TP_SECOND] = modulo_time(timestamp, 60); - return; + // only specifiers resulting in strings require special logic + switch (item.value) { + case 'a': // weekday abbreviated + case 'A': { // weekday full name + if (!days.has_value()) { days = days_from_timestamp(); } + auto const day_of_week = + cuda::std::chrono::year_month_weekday(days.value()).weekday().c_encoding(); + auto const day_idx = + day_of_week + offset_weekdays + (item.value == 'a' ? days_in_week : 0); + if (day_idx < d_format_names.size()) + bytes += d_format_names.element(day_idx).size_bytes(); + break; + } + case 'b': // month abbreviated + case 'B': { // month full name + if (!days.has_value()) { days = days_from_timestamp(); } + auto const month = + static_cast(cuda::std::chrono::year_month_day(days.value()).month()); + auto const month_idx = + month - 1 + offset_months + (item.value == 'b' ? 
months_in_year : 0); + if (month_idx < d_format_names.size()) + bytes += d_format_names.element(month_idx).size_bytes(); + break; + } + case 'p': // AM/PM + { + auto times = get_time_components(d_timestamps.element(idx).time_since_epoch().count()); + bytes += d_format_names.size() > 1 + ? d_format_names.element(static_cast(times.hour >= 12)) + .size_bytes() + : 2; + break; + } + default: { + bytes += item.length; + break; + } + } } + return bytes; + } +}; - // common utility for setting time components from a subsecond unit value - auto subsecond_fn = [&](int64_t subsecond_base) { - auto subsecond = modulo_time(timestamp, subsecond_base); - timestamp = timestamp / subsecond_base - ((timestamp < 0) and (subsecond != 0)); - timeparts[TP_SUBSECOND] = subsecond; - timeparts[TP_HOUR] = modulo_time(scale_time(timestamp, 3600), 24); - timeparts[TP_MINUTE] = modulo_time(scale_time(timestamp, 60), 60); - timeparts[TP_SECOND] = modulo_time(timestamp, 60); - }; - - if (units == timestamp_units::ms) - subsecond_fn(1000); - else if (units == timestamp_units::us) - subsecond_fn(1000000); - else - subsecond_fn(1000000000); +// converts a timestamp into date-time formatted string +template +struct datetime_formatter : public from_timestamp_base { + column_device_view const d_timestamps; + column_device_view const d_format_names; + device_span const d_format_items; + int32_t const* d_offsets{}; + char* d_chars{}; + + datetime_formatter(column_device_view const& d_timestamps, + column_device_view const& d_format_names, + device_span const& d_format_items, + int32_t const* d_offsets, + char* d_chars) + : d_timestamps(d_timestamps), + d_format_names(d_format_names), + d_format_items(d_format_items), + d_offsets(d_offsets), + d_chars(d_chars) + { } // utility to create 0-padded integers (up to 9 chars) @@ -801,120 +797,234 @@ struct datetime_formatter { return str; } - __device__ char* format_from_parts(int32_t const* timeparts, char* ptr) + // from 
https://howardhinnant.github.io/date/date.html + __device__ thrust::pair get_iso_week_year( + cuda::std::chrono::year_month_day const& ymd) const + { + auto const days = cuda::std::chrono::sys_days(ymd); + auto year = ymd.year(); + + auto iso_week_start = [](cuda::std::chrono::year const y) { + // clang-format off + return cuda::std::chrono::sys_days{cuda::std::chrono::Thursday[1]/cuda::std::chrono::January/y} - + (cuda::std::chrono::Thursday - cuda::std::chrono::Monday); + // clang-format on + }; + + auto start = iso_week_start(year); + if (days < start) + start = iso_week_start(--year); + else { + auto const next_start = iso_week_start(year + cuda::std::chrono::years{1}); + if (days >= next_start) { + ++year; + start = next_start; + } + } + return thrust::make_pair( + (cuda::std::chrono::duration_cast(days - start) + + cuda::std::chrono::weeks{1}) // always [1-53] + .count(), + static_cast(year)); + } + + __device__ int8_t get_week_of_year(cuda::std::chrono::sys_days const days, + cuda::std::chrono::sys_days const start) const { - for (size_t idx = 0; idx < items_count; ++idx) { - auto item = d_format_items[idx]; + return days < start + ? 
0 + : (cuda::std::chrono::duration_cast(days - start) + + cuda::std::chrono::weeks{1}) + .count(); + } + + __device__ int32_t get_day_of_year(cuda::std::chrono::year_month_day const& ymd) + { + auto const month = static_cast(ymd.month()); + auto const day = static_cast(ymd.day()); + int32_t const monthDayOffset[] = {0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334}; + return static_cast(day + monthDayOffset[month - 1] + + (month > 2 and ymd.year().is_leap())); + } + + __device__ void operator()(size_type idx) + { + if (d_timestamps.is_null(idx)) return; + auto tstamp = d_timestamps.element(idx).time_since_epoch().count(); + + auto const days = cuda::std::chrono::sys_days(static_cast( + cuda::std::chrono::floor(T::duration(tstamp)))); + auto const ymd = cuda::std::chrono::year_month_day(days); + + auto timeparts = get_time_components(tstamp); + + // convert to characters using the format items + auto ptr = d_chars + d_offsets[idx]; + for (auto item : d_format_items) { if (item.item_type == format_char_type::literal) { *ptr++ = item.value; continue; } + + // Value to use for int2str call at the end of the switch-statement. + // This simplifies the case statements and prevents alot of extra inlining. 
+ int32_t copy_value = -1; // default set for non-int2str usage cases + // special logic for each specifier switch (item.value) { case 'Y': // 4-digit year - ptr = int2str(ptr, item.length, timeparts[TP_YEAR]); + copy_value = static_cast(ymd.year()); break; case 'y': // 2-digit year { - auto year = timeparts[TP_YEAR]; + auto year = static_cast(ymd.year()); // remove hundredths digits and above - ptr = int2str(ptr, item.length, year - ((year / 100) * 100)); + copy_value = year - ((year / 100) * 100); break; } case 'm': // month - ptr = int2str(ptr, item.length, timeparts[TP_MONTH]); + copy_value = static_cast(static_cast(ymd.month())); break; case 'd': // day of month - ptr = int2str(ptr, item.length, timeparts[TP_DAY]); + copy_value = static_cast(static_cast(ymd.day())); break; case 'j': // day of year - ptr = int2str(ptr, item.length, timeparts[TP_DAY_OF_YEAR]); + copy_value = get_day_of_year(ymd); break; case 'H': // 24-hour - ptr = int2str(ptr, item.length, timeparts[TP_HOUR]); + copy_value = timeparts.hour; break; case 'I': // 12-hour { // 0 = 12am; 12 = 12pm; 6 = 06am; 18 = 06pm - auto hour = timeparts[TP_HOUR]; - if (hour == 0) hour = 12; - if (hour > 12) hour -= 12; - ptr = int2str(ptr, item.length, hour); + copy_value = [h = timeparts.hour] { + if (h == 0) return 12; + return h > 12 ? 
h - 12 : h; + }(); break; } case 'M': // minute - ptr = int2str(ptr, item.length, timeparts[TP_MINUTE]); + copy_value = timeparts.minute; break; case 'S': // second - ptr = int2str(ptr, item.length, timeparts[TP_SECOND]); + copy_value = timeparts.second; break; case 'f': // sub-second { char subsecond_digits[] = "000000000"; // 9 max digits - const int digits = [units = units] { - if (units == timestamp_units::ms) return 3; - if (units == timestamp_units::us) return 6; - if (units == timestamp_units::ns) return 9; + const int digits = [] { + if constexpr (std::is_same_v) return 3; + if constexpr (std::is_same_v) return 6; + if constexpr (std::is_same_v) return 9; return 0; }(); - int2str(subsecond_digits, digits, timeparts[TP_SUBSECOND]); + int2str(subsecond_digits, digits, timeparts.subsecond); ptr = copy_and_increment(ptr, subsecond_digits, item.length); break; } case 'p': // am or pm + { // 0 = 12am, 12 = 12pm - if (timeparts[TP_HOUR] < 12) - memcpy(ptr, "AM", 2); - else - memcpy(ptr, "PM", 2); - ptr += 2; + auto const am_pm = [&] { + if (d_format_names.size() > 1) + return d_format_names.element( + static_cast(timeparts.hour >= 12)); + return string_view(timeparts.hour >= 12 ? "PM" : "AM", 2); + }(); + ptr = copy_string(ptr, am_pm); break; - case 'z': // timezone - memcpy(ptr, "+0000", 5); // always UTC - ptr += 5; + } + case 'z': // timezone -- always UTC + ptr = copy_and_increment(ptr, "+0000", 5); break; - case 'Z': - memcpy(ptr, "UTC", 3); - ptr += 3; + case 'Z': // timezone string -- always UTC + ptr = copy_and_increment(ptr, "UTC", 3); break; - default: // ignore everything else + case 'u': // day of week ISO + case 'w': { // day of week non-ISO + auto const day_of_week = static_cast( + cuda::std::chrono::year_month_weekday(days).weekday().c_encoding()); + copy_value = day_of_week == 0 && item.value == 'u' ? 
7 : day_of_week; break; + } + // clang-format off + case 'U': { // week of year: first week includes the first Sunday of the year + copy_value = get_week_of_year(days, cuda::std::chrono::sys_days{ + cuda::std::chrono::Sunday[1]/cuda::std::chrono::January/ymd.year()}); + break; + } + case 'W': { // week of year: first week includes the first Monday of the year + copy_value = get_week_of_year(days, cuda::std::chrono::sys_days{ + cuda::std::chrono::Monday[1]/cuda::std::chrono::January/ymd.year()}); + break; + } + // clang-format on + case 'V': // ISO week number + case 'G': { // ISO year number + auto const [week, year] = get_iso_week_year(ymd); + copy_value = item.value == 'G' ? year : week; + break; + } + case 'a': // abbreviated day of the week + case 'A': { // day of the week + auto const day_of_week = + cuda::std::chrono::year_month_weekday(days).weekday().c_encoding(); + auto const day_idx = + day_of_week + offset_weekdays + (item.value == 'a' ? days_in_week : 0); + if (d_format_names.size()) + ptr = copy_string(ptr, d_format_names.element(day_idx)); + break; + } + case 'b': // abbreviated month of the year + case 'B': { // month of the year + auto const month = static_cast(ymd.month()); + auto const month_idx = + month - 1 + offset_months + (item.value == 'b' ? 
months_in_year : 0); + if (d_format_names.size()) + ptr = copy_string(ptr, d_format_names.element(month_idx)); + break; + } + default: break; } + if (copy_value >= 0) ptr = int2str(ptr, item.length, copy_value); } - return ptr; - } - - __device__ void operator()(size_type idx) - { - if (d_timestamps.is_null(idx)) return; - auto timestamp = d_timestamps.element(idx); - int32_t timeparts[TP_ARRAYSIZE] = {0}; - dissect_timestamp(timestamp.time_since_epoch().count(), timeparts); - // convert to characters - format_from_parts(timeparts, d_chars + d_offsets[idx]); } }; // +using strings_children = std::pair, std::unique_ptr>; struct dispatch_from_timestamps_fn { template ()>* = nullptr> - void operator()(column_device_view const& d_timestamps, - format_item const* d_format_items, - size_type items_count, - timestamp_units units, - const int32_t* d_offsets, - char* d_chars, - rmm::cuda_stream_view stream) const + strings_children operator()(column_device_view const& d_timestamps, + column_device_view const& d_format_names, + device_span d_format_items, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { - datetime_formatter pfn{d_timestamps, d_format_items, items_count, units, d_offsets, d_chars}; + size_type const strings_count = d_timestamps.size(); + // build offsets column + auto offsets_transformer_itr = cudf::detail::make_counting_transform_iterator( + 0, from_timestamps_size_fn{d_timestamps, d_format_names, d_format_items}); + auto offsets_column = make_offsets_child_column( + offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); + auto d_offsets = offsets_column->mutable_view().template data(); + + // build chars column + auto const bytes = + cudf::detail::get_value(offsets_column->view(), strings_count, stream); + auto chars_column = create_chars_child_column(bytes, stream, mr); + auto d_chars = chars_column->mutable_view().template data(); + + datetime_formatter pfn{d_timestamps, d_format_names, d_format_items, 
d_offsets, d_chars}; thrust::for_each_n(rmm::exec_policy(stream), thrust::make_counting_iterator(0), d_timestamps.size(), pfn); + return std::make_pair(std::move(offsets_column), std::move(chars_column)); } template - std::enable_if_t(), void> operator()(Args&&...) const + std::enable_if_t(), strings_children> operator()(Args&&...) const { CUDF_FAIL("Only timestamps type are expected"); } @@ -925,59 +1035,41 @@ struct dispatch_from_timestamps_fn { // std::unique_ptr from_timestamps(column_view const& timestamps, std::string const& format, + strings_column_view const& names, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - size_type strings_count = timestamps.size(); - if (strings_count == 0) return make_empty_column(data_type{type_id::STRING}); + if (timestamps.is_empty()) return make_empty_column(data_type{type_id::STRING}); CUDF_EXPECTS(!format.empty(), "Format parameter must not be empty."); - timestamp_units units = - cudf::type_dispatcher(timestamps.type(), dispatch_timestamp_to_units_fn()); - - format_compiler compiler(format.c_str(), stream); - auto d_format_items = compiler.format_items(); - - auto column = column_device_view::create(timestamps, stream); - auto d_column = *column; - - // copy null mask - rmm::device_buffer null_mask = cudf::detail::copy_bitmask(timestamps, stream, mr); - // Each string will be the same number of bytes which can be determined - // directly from the format string. - auto d_str_bytes = compiler.template_bytes(); // size in bytes of each string - // build offsets column - auto offsets_transformer_itr = cudf::detail::make_counting_transform_iterator( - 0, [d_column, d_str_bytes] __device__(size_type idx) { - return d_column.is_null(idx) ? 
0 : d_str_bytes; - }); - auto offsets_column = make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); - auto offsets_view = offsets_column->view(); - auto d_new_offsets = offsets_view.template data(); - - // build chars column - auto const bytes = - cudf::detail::get_value(offsets_column->view(), strings_count, stream); - auto chars_column = create_chars_child_column(bytes, stream, mr); - auto d_chars = chars_column->mutable_view().template data(); - // fill in chars column with timestamps + CUDF_EXPECTS(names.is_empty() || names.size() == format_names_size, + "Invalid size for format names."); + + auto const d_names = column_device_view::create(names.parent(), stream); + + // This API supports a few more specifiers than to_timestamps. + // clang-format off + format_compiler compiler(format, stream, + specifier_map{{'w', 1}, {'W', 2}, {'u', 1}, {'U', 2}, {'V', 2}, {'G', 4}, + {'a', 3}, {'A', 3}, {'b', 3}, {'B', 3}}); + // clang-format on + auto const d_format_items = compiler.format_items(); + auto const d_timestamps = column_device_view::create(timestamps, stream); + // dispatcher is called to handle the different timestamp types - cudf::type_dispatcher(timestamps.type(), - dispatch_from_timestamps_fn(), - d_column, - d_format_items, - compiler.items_count(), - units, - d_new_offsets, - d_chars, - stream); - - return make_strings_column(strings_count, + auto [offsets_column, chars_column] = cudf::type_dispatcher(timestamps.type(), + dispatch_from_timestamps_fn(), + *d_timestamps, + *d_names, + d_format_items, + stream, + mr); + + return make_strings_column(timestamps.size(), std::move(offsets_column), std::move(chars_column), timestamps.null_count(), - std::move(null_mask), + cudf::detail::copy_bitmask(timestamps, stream, mr), stream, mr); } @@ -988,10 +1080,11 @@ std::unique_ptr from_timestamps(column_view const& timestamps, std::unique_ptr from_timestamps(column_view const& timestamps, std::string const& 
format, + strings_column_view const& names, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::from_timestamps(timestamps, format, rmm::cuda_stream_default, mr); + return detail::from_timestamps(timestamps, format, names, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/tests/strings/datetime_tests.cpp b/cpp/tests/strings/datetime_tests.cpp index a0f1eed9935..1a814ea707e 100644 --- a/cpp/tests/strings/datetime_tests.cpp +++ b/cpp/tests/strings/datetime_tests.cpp @@ -288,6 +288,145 @@ TEST_F(StringsDatetimeTest, FromTimestampDayOfYear) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } +// Format names used for some specifiers in from_timestamps +// clang-format off +cudf::test::strings_column_wrapper format_names({"AM", "PM", + "Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", + "Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", + "January", "February", "March", "April", "May", "June", "July", + "August", "September", "October", "November", "December", + "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"}); +// clang-format on + +TEST_F(StringsDatetimeTest, FromTimestampDayOfWeekOfYear) +{ + cudf::test::fixed_width_column_wrapper timestamps{ + 1645059720L, // 2022-02-17 + 1647167880L, // 2022-03-13 + 1649276040L, // 2022-04-06 + 1588734621L, // 2020-05-06 + 1560948892L, // 2019-06-19 + -265880250L, // 1961-07-29 + 1628194442L, // 2021-08-05 + 1632410760L, // 2021-09-23 + 1633464842L, // 2021-10-05 + 1636100042L, // 2021-11-05 + // These are a sequence of dates which are particular to the ISO week and + // year numbers which shift through Monday and Thursday and nicely includes + // a leap year (1980). 
https://en.wikipedia.org/wiki/ISO_week_date + 220924800L, // 1977-01-01 + 221011200L, // 1977-01-02 + 252374400L, // 1977-12-31 + 252460800L, // 1978-01-01 + 252547200L, // 1978-01-02 + 283910400L, // 1978-12-31 + 283996800L, // 1979-01-01 + 315360000L, // 1979-12-30 + 315446400L, // 1979-12-31 + 315532800L, // 1980-01-01 + 346809600L, // 1980-12-28 + 346896000L, // 1980-12-29 + 346982400L, // 1980-12-30 + 347068800L, // 1980-12-31 + 347155200L, // 1981-01-01 + 378604800L, // 1981-12-31 + 378691200L, // 1982-01-01 + 378777600L, // 1982-01-02 + 378864000L // 1982-01-03 + }; + + cudf::test::strings_column_wrapper expected( + {"[Thu 17, Feb 2022 4 07 4 07 2022 07]", "[Sun 13, Mar 2022 0 10 7 11 2022 10]", + "[Wed 06, Apr 2022 3 14 3 14 2022 14]", "[Wed 06, May 2020 3 18 3 18 2020 19]", + "[Wed 19, Jun 2019 3 24 3 24 2019 25]", "[Sat 29, Jul 1961 6 30 6 30 1961 30]", + "[Thu 05, Aug 2021 4 31 4 31 2021 31]", "[Thu 23, Sep 2021 4 38 4 38 2021 38]", + "[Tue 05, Oct 2021 2 40 2 40 2021 40]", "[Fri 05, Nov 2021 5 44 5 44 2021 44]", + "[Sat 01, Jan 1977 6 00 6 00 1976 53]", "[Sun 02, Jan 1977 0 00 7 01 1976 53]", + "[Sat 31, Dec 1977 6 52 6 52 1977 52]", "[Sun 01, Jan 1978 0 00 7 01 1977 52]", + "[Mon 02, Jan 1978 1 01 1 01 1978 01]", "[Sun 31, Dec 1978 0 52 7 53 1978 52]", + "[Mon 01, Jan 1979 1 01 1 00 1979 01]", "[Sun 30, Dec 1979 0 52 7 52 1979 52]", + "[Mon 31, Dec 1979 1 53 1 52 1980 01]", "[Tue 01, Jan 1980 2 00 2 00 1980 01]", + "[Sun 28, Dec 1980 0 51 7 52 1980 52]", "[Mon 29, Dec 1980 1 52 1 52 1981 01]", + "[Tue 30, Dec 1980 2 52 2 52 1981 01]", "[Wed 31, Dec 1980 3 52 3 52 1981 01]", + "[Thu 01, Jan 1981 4 00 4 00 1981 01]", "[Thu 31, Dec 1981 4 52 4 52 1981 53]", + "[Fri 01, Jan 1982 5 00 5 00 1981 53]", "[Sat 02, Jan 1982 6 00 6 00 1981 53]", + "[Sun 03, Jan 1982 0 00 7 01 1981 53]"}); + + auto results = cudf::strings::from_timestamps( + timestamps, "[%a %d, %b %Y %w %W %u %U %G %V]", cudf::strings_column_view(format_names)); + 
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + +TEST_F(StringsDatetimeTest, FromTimestampWeekdayMonthYear) +{ + cudf::test::fixed_width_column_wrapper timestamps{ + 1642951560L, // 2022-01-23 15:26:00 Sunday + 1645059720L, // 2022-02-17 01:02:00 Thursday + 1647167880L, // 2022-03-13 10:38:00 Sunday + 1649276040L, // 2022-04-06 20:14:00 Wednesday + 1588734621L, // 2020-05-06 03:10:21 Wednesday + 1560948892L, // 2019-06-19 12:54:52 Wednesday + -265880250L, // 1961-07-29 16:22:30 Saturday + 1628194442L, // 2021-08-05 20:14:02 Thursday + 1632410760L, // 2021-09-23 15:26:00 Thursday + 1633464842L, // 2021-10-05 20:14:02 Tuesday + 1636100042L, // 2021-11-05 08:14:02 Friday + 1638757202L // 2021-12-06 02:20:00 Monday + }; + + cudf::test::strings_column_wrapper expected({"[Sunday January 23, 2022: 03 PM]", + "[Thursday February 17, 2022: 01 AM]", + "[Sunday March 13, 2022: 10 AM]", + "[Wednesday April 06, 2022: 08 PM]", + "[Wednesday May 06, 2020: 03 AM]", + "[Wednesday June 19, 2019: 12 PM]", + "[Saturday July 29, 1961: 04 PM]", + "[Thursday August 05, 2021: 08 PM]", + "[Thursday September 23, 2021: 03 PM]", + "[Tuesday October 05, 2021: 08 PM]", + "[Friday November 05, 2021: 08 AM]", + "[Monday December 06, 2021: 02 AM]"}); + + auto results = cudf::strings::from_timestamps( + timestamps, "[%A %B %d, %Y: %I %p]", cudf::strings_column_view(format_names)); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + +TEST_F(StringsDatetimeTest, FromTimestampAllSpecifiers) +{ + cudf::test::fixed_width_column_wrapper input{ + 1645059720000000001L, + 1647167880000001000L, + 1649276040001000000L, + 1588734621123456789L, + 1560948892987654321L, + -265880250010203040L, + 1628194442090807060L, + 1632410760500400300L, + 1633464842000000000L, + 1636100042999999999L}; + + auto results = cudf::strings::from_timestamps( + input, + "[%d/%m/%y/%Y %H:%I:%M:%S.%f %z:%Z %j %u %U %W %V %G %p %a %A %b %B]", + cudf::strings_column_view(format_names)); + + // clang-format off + 
cudf::test::strings_column_wrapper expected({ + "[17/02/22/2022 01:01:02:00.000000 +0000:UTC 048 4 07 07 07 2022 AM Thu Thursday Feb February]", + "[13/03/22/2022 10:10:38:00.000001 +0000:UTC 072 7 11 10 10 2022 AM Sun Sunday Mar March]", + "[06/04/22/2022 20:08:14:00.001000 +0000:UTC 096 3 14 14 14 2022 PM Wed Wednesday Apr April]", + "[06/05/20/2020 03:03:10:21.123456 +0000:UTC 127 3 18 18 19 2020 AM Wed Wednesday May May]", + "[19/06/19/2019 12:12:54:52.987654 +0000:UTC 170 3 24 24 25 2019 PM Wed Wednesday Jun June]", + "[29/07/61/1961 16:04:22:29.989796 +0000:UTC 210 6 30 30 30 1961 PM Sat Saturday Jul July]", + "[05/08/21/2021 20:08:14:02.090807 +0000:UTC 217 4 31 31 31 2021 PM Thu Thursday Aug August]", + "[23/09/21/2021 15:03:26:00.500400 +0000:UTC 266 4 38 38 38 2021 PM Thu Thursday Sep September]", + "[05/10/21/2021 20:08:14:02.000000 +0000:UTC 278 2 40 40 40 2021 PM Tue Tuesday Oct October]", + "[05/11/21/2021 08:08:14:02.999999 +0000:UTC 309 5 44 44 44 2021 AM Fri Friday Nov November]"}); + // clang-format on + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + TEST_F(StringsDatetimeTest, ZeroSizeStringsColumn) { cudf::column_view zero_size_column( @@ -324,6 +463,7 @@ TEST_F(StringsDatetimeTest, Errors) cudf::test::fixed_width_column_wrapper timestamps{ 1530705600}; EXPECT_THROW(cudf::strings::from_timestamps(timestamps, ""), cudf::logic_error); + EXPECT_THROW(cudf::strings::from_timestamps(timestamps, "%A %B", view), cudf::logic_error); } TEST_F(StringsDatetimeTest, ToTimestampSingleSpecifier) From 48bc39e1bfac2cb7b63562d07f142998d802d313 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 31 Aug 2021 12:07:29 -0700 Subject: [PATCH 6/8] Refactor Index hierarchy (#9039) Until now the class hierarchy of index types has had numerous logical flaws for reasons of convenience: for instance, `RangeIndex` was always inheriting from `Frame` despite not actually being backed by data, and since #8115 `MultiIndex` has been a `SingleColumnFrame` even 
though it actually has multiple columns. This PR moves `BaseIndex` to the top of its own hierarchy, and uses multiple inheritance with `Frame` and `SingleColumnFrame` to create a more sensible hierarchy for its subclasses. `BaseIndex` is now effectively an ABC defining the interface that subclasses must define, but many of these methods are still inherited from `Frame` types (or in the case of `RangeIndex`, delegated to `Int64Index`). These changes remove lots of broken behavior that was previously present in `MultiIndex` and `RangeIndex`; for instance, binary operations would previously fail in strange ways for `MultiIndex`, and various hacks were necessary for `MultiIndex` methods to bypass `SingleColumnFrame`. `RangeIndex` methods that delegate to `Int64Index` are now made explicit (rather than the previous implicit conversion via `self._data`). The new hierarchy also allows much more sensible type-checking by mypy, which revealed numerous additional conceptual issues. The bulk of this PR is actually moving functions around to make the type checker happy, some of which also fixed actual functional issues: for example, `RangeIndex.get_loc` was previously broken. The refactor will make it much easier to handle future changes to all classes in the index hierarchy. 
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Marlene (https://github.com/marlenezw) - Michael Wang (https://github.com/isVoid) URL: https://github.com/rapidsai/cudf/pull/9039 --- python/cudf/cudf/_lib/groupby.pyx | 26 +- python/cudf/cudf/_lib/utils.pyx | 5 +- python/cudf/cudf/_typing.py | 3 + python/cudf/cudf/api/types.py | 2 +- python/cudf/cudf/core/_base_index.py | 964 +++++++++++ python/cudf/cudf/core/_internals/where.py | 12 +- python/cudf/cudf/core/algorithms.py | 4 +- python/cudf/cudf/core/column/categorical.py | 4 +- python/cudf/cudf/core/column/methods.py | 2 +- python/cudf/cudf/core/dataframe.py | 314 +++- python/cudf/cudf/core/frame.py | 377 +--- python/cudf/cudf/core/index.py | 1733 +++++-------------- python/cudf/cudf/core/join/_join_helpers.py | 4 +- python/cudf/cudf/core/multiindex.py | 122 +- python/cudf/cudf/core/reshape.py | 2 +- python/cudf/cudf/core/series.py | 21 +- python/cudf/cudf/tests/test_index.py | 33 +- 17 files changed, 1864 insertions(+), 1764 deletions(-) create mode 100644 python/cudf/cudf/core/_base_index.py diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index d7416625248..153b116cd33 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -99,10 +99,12 @@ cdef class GroupBy: c_grouped_values = move(c_groups.values) c_group_offsets = c_groups.offsets - grouped_keys = cudf.Index._from_data(*data_from_unique_ptr( - move(c_grouped_keys), - column_names=range(c_grouped_keys.get()[0].num_columns()) - )) + grouped_keys = cudf.core.index._index_from_data( + *data_from_unique_ptr( + move(c_grouped_keys), + column_names=range(c_grouped_keys.get()[0].num_columns()) + ) + ) grouped_values = data_from_unique_ptr( move(c_grouped_values), index_names=values._index_names, @@ -186,7 +188,8 @@ cdef class GroupBy: Column.from_unique_ptr(move(c_result.second[i].results[j])) ) - return result_data, cudf.Index._from_data(grouped_keys) + return result_data, 
cudf.core.index._index_from_data( + grouped_keys) def scan_internal(self, Table values, aggregations): from cudf.core.column_accessor import ColumnAccessor @@ -264,7 +267,8 @@ cdef class GroupBy: Column.from_unique_ptr(move(c_result.second[i].results[j])) ) - return result_data, cudf.Index._from_data(grouped_keys) + return result_data, cudf.core.index._index_from_data( + grouped_keys) def aggregate(self, Table values, aggregations): """ @@ -311,10 +315,12 @@ cdef class GroupBy: self.c_obj.get()[0].shift(view, offsets, c_fill_values) ) - grouped_keys = cudf.Index._from_data(*data_from_unique_ptr( - move(c_result.first), - column_names=self.keys._column_names - )) + grouped_keys = cudf.core.index._index_from_data( + *data_from_unique_ptr( + move(c_result.first), + column_names=self.keys._column_names + ) + ) shifted, _ = data_from_unique_ptr( move(c_result.second), column_names=values._column_names diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index cd258102228..2456aa334e9 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -251,7 +251,7 @@ cdef data_from_unique_ptr( # Frame factories we may want to look for a less dissonant approach # that does not impose performance penalties. The same applies to # data_from_table_view below. 
- cudf.Index._from_data( + cudf.core.index._index_from_data( { name: columns[i] for i, name in enumerate(index_names) @@ -301,7 +301,8 @@ cdef data_from_table_view( ) ) column_idx += 1 - index = cudf.Index._from_data(dict(zip(index_names, index_columns))) + index = cudf.core.index._index_from_data( + dict(zip(index_names, index_columns))) # Construct the data dict cdef size_type source_column_idx = 0 diff --git a/python/cudf/cudf/_typing.py b/python/cudf/cudf/_typing.py index 7eb0c7bdce4..793a5d1d9e8 100644 --- a/python/cudf/cudf/_typing.py +++ b/python/cudf/cudf/_typing.py @@ -29,3 +29,6 @@ DataFrameOrSeries = Union["cudf.Series", "cudf.DataFrame"] SeriesOrIndex = Union["cudf.Series", "cudf.core.index.BaseIndex"] +SeriesOrSingleColumnIndex = Union[ + "cudf.Series", "cudf.core.index.GenericIndex" +] diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py index bf296e11178..10bbb620715 100644 --- a/python/cudf/cudf/api/types.py +++ b/python/cudf/cudf/api/types.py @@ -194,7 +194,7 @@ def wrapped_func(obj): def _union_categoricals( - to_union: List[Union[cudf.Series, cudf.Index]], + to_union: List[Union[cudf.Series, cudf.CategoricalIndex]], sort_categories: bool = False, ignore_order: bool = False, ): diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py new file mode 100644 index 00000000000..5f12cbaf21f --- /dev/null +++ b/python/cudf/cudf/core/_base_index.py @@ -0,0 +1,964 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. 
+ +from __future__ import annotations, division, print_function + +import pickle +from typing import Any, Set + +import cupy +import pandas as pd + +import cudf +from cudf._typing import DtypeObj +from cudf.api.types import is_dtype_equal, is_integer +from cudf.core.abc import Serializable +from cudf.core.column import ColumnBase, column +from cudf.core.column_accessor import ColumnAccessor +from cudf.utils import ioutils +from cudf.utils.dtypes import ( + is_list_like, + is_mixed_with_object_dtype, + is_scalar, + numeric_normalize_types, +) +from cudf.utils.utils import cached_property + + +class BaseIndex(Serializable): + """Base class for all cudf Index types.""" + + dtype: DtypeObj + _accessors: Set[Any] = set() + _data: ColumnAccessor + + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): + + if method == "__call__" and hasattr(cudf, ufunc.__name__): + func = getattr(cudf, ufunc.__name__) + return func(*inputs) + else: + return NotImplemented + + @cached_property + def _values(self) -> ColumnBase: + raise NotImplementedError + + def copy(self, deep: bool = True) -> BaseIndex: + raise NotImplementedError + + @property + def values(self): + return self._values.values + + def get_loc(self, key, method=None, tolerance=None): + raise NotImplementedError + + def __getitem__(self, key): + raise NotImplementedError() + + def serialize(self): + header = {} + header["index_column"] = {} + # store metadata values of index separately + # Indexes: Numerical/DateTime/String are often GPU backed + header["index_column"], frames = self._values.serialize() + + header["name"] = pickle.dumps(self.name) + header["dtype"] = pickle.dumps(self.dtype) + header["type-serialized"] = pickle.dumps(type(self)) + header["frame_count"] = len(frames) + return header, frames + + def __contains__(self, item): + return item in self._values + + def get_level_values(self, level): + """ + Return an Index of values for requested level. 
+ + This is primarily useful to get an individual level of values from a + MultiIndex, but is provided on Index as well for compatibility. + + Parameters + ---------- + level : int or str + It is either the integer position or the name of the level. + + Returns + ------- + Index + Calling object, as there is only one level in the Index. + + See Also + -------- + cudf.core.multiindex.MultiIndex.get_level_values : Get values for + a level of a MultiIndex. + + Notes + ----- + For Index, level should be 0, since there are no multiple levels. + + Examples + -------- + >>> import cudf + >>> idx = cudf.Index(["a", "b", "c"]) + >>> idx.get_level_values(0) + StringIndex(['a' 'b' 'c'], dtype='object') + """ + + if level == self.name: + return self + elif is_integer(level): + if level != 0: + raise IndexError( + f"Cannot get level: {level} " f"for index with 1 level" + ) + return self + else: + raise KeyError(f"Requested level with name {level} " "not found") + + @classmethod + def deserialize(cls, header, frames): + h = header["index_column"] + idx_typ = pickle.loads(header["type-serialized"]) + name = pickle.loads(header["name"]) + + col_typ = pickle.loads(h["type-serialized"]) + index = col_typ.deserialize(h, frames[: header["frame_count"]]) + return idx_typ(index, name=name) + + @property + def names(self): + """ + Returns a tuple containing the name of the Index. + """ + return (self.name,) + + @names.setter + def names(self, values): + if not is_list_like(values): + raise ValueError("Names must be a list-like") + + num_values = len(values) + if num_values > 1: + raise ValueError( + "Length of new names must be 1, got %d" % num_values + ) + + self.name = values[0] + + def _clean_nulls_from_index(self): + """ + Convert all na values(if any) in Index object + to `<NA>` as a preprocessing step to `__repr__` methods.
+ + This will involve changing type of Index object + to StringIndex but it is the responsibility of the `__repr__` + methods using this method to replace or handle representation + of the actual types correctly. + """ + if self._values.has_nulls: + return cudf.Index( + self._values.astype("str").fillna(cudf._NA_REP), name=self.name + ) + else: + return self + + @property + def nlevels(self): + """ + Number of levels. + """ + return 1 + + def _set_names(self, names, inplace=False): + if inplace: + idx = self + else: + idx = self.copy(deep=False) + + idx.names = names + if not inplace: + return idx + + def set_names(self, names, level=None, inplace=False): + """ + Set Index or MultiIndex name. + Able to set new names partially and by level. + + Parameters + ---------- + names : label or list of label + Name(s) to set. + level : int, label or list of int or label, optional + If the index is a MultiIndex, level(s) to set (None for all + levels). Otherwise level must be None. + inplace : bool, default False + Modifies the object directly, instead of creating a new Index or + MultiIndex. + + Returns + ------- + Index + The same type as the caller or None if inplace is True. + + See Also + -------- + cudf.Index.rename : Able to set new names without level. + + Examples + -------- + >>> import cudf + >>> idx = cudf.Index([1, 2, 3, 4]) + >>> idx + Int64Index([1, 2, 3, 4], dtype='int64') + >>> idx.set_names('quarter') + Int64Index([1, 2, 3, 4], dtype='int64', name='quarter') + >>> idx = cudf.MultiIndex.from_product([['python', 'cobra'], + ... 
[2018, 2019]]) + >>> idx + MultiIndex([('python', 2018), + ('python', 2019), + ( 'cobra', 2018), + ( 'cobra', 2019)], + ) + >>> idx.names + FrozenList([None, None]) + >>> idx.set_names(['kind', 'year'], inplace=True) + >>> idx.names + FrozenList(['kind', 'year']) + >>> idx.set_names('species', level=0, inplace=True) + >>> idx.names + FrozenList(['species', 'year']) + """ + if level is not None: + raise ValueError("Level must be None for non-MultiIndex") + + if not is_list_like(names): + names = [names] + + return self._set_names(names=names, inplace=inplace) + + def fillna(self, value, downcast=None): + """ + Fill null values with the specified value. + + Parameters + ---------- + value : scalar + Scalar value to use to fill nulls. This value cannot be + list-like. + + downcast : dict, default is None + This Parameter is currently NON-FUNCTIONAL. + + Returns + ------- + filled : Index + + Examples + -------- + >>> import cudf + >>> index = cudf.Index([1, 2, None, 4]) + >>> index + Int64Index([1, 2, null, 4], dtype='int64') + >>> index.fillna(3) + Int64Index([1, 2, 3, 4], dtype='int64') + """ + if downcast is not None: + raise NotImplementedError( + "`downcast` parameter is not yet supported" + ) + + return super().fillna(value=value) + + def take(self, indices): + """Gather only the specific subset of indices + + Parameters + ---------- + indices: An array-like that maps to values contained in this Index. + """ + return self[indices] + + def argsort(self, ascending=True, **kwargs): + """ + Return the integer indices that would sort the index. + + Parameters + ---------- + ascending : bool, default True + If True, returns the indices for ascending order. + If False, returns the indices for descending order. + + Returns + ------- + array : A cupy array containing Integer indices that + would sort the index if used as an indexer.
+ + Examples + -------- + >>> import cudf + >>> index = cudf.Index([10, 100, 1, 1000]) + >>> index + Int64Index([10, 100, 1, 1000], dtype='int64') + >>> index.argsort() + array([2, 0, 1, 3], dtype=int32) + + The order of argsort can be reversed using + ``ascending`` parameter, by setting it to ``False``. + >>> index.argsort(ascending=False) + array([3, 1, 0, 2], dtype=int32) + + ``argsort`` on a MultiIndex: + + >>> index = cudf.MultiIndex( + ... levels=[[1, 3, 4, -10], [1, 11, 5]], + ... codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], + ... names=["x", "y"], + ... ) + >>> index + MultiIndex([( 1, 1), + ( 1, 5), + ( 3, 11), + ( 4, 11), + (-10, 1)], + names=['x', 'y']) + >>> index.argsort() + array([4, 0, 1, 2, 3], dtype=int32) + >>> index.argsort(ascending=False) + array([3, 2, 1, 0, 4], dtype=int32) + """ + indices = self._values.argsort(ascending=ascending, **kwargs) + return cupy.asarray(indices) + + def to_frame(self, index=True, name=None): + """Create a DataFrame with a column containing this Index + + Parameters + ---------- + index : boolean, default True + Set the index of the returned DataFrame as the original Index + name : str, default None + Name to be used for the column + + Returns + ------- + DataFrame + cudf DataFrame + """ + + if name is not None: + col_name = name + elif self.name is None: + col_name = 0 + else: + col_name = self.name + return cudf.DataFrame( + {col_name: self._values}, index=self if index else None + ) + + def any(self): + """ + Return whether any element is True in the Index. + """ + return self._values.any() + + def to_pandas(self): + """ + Convert to a Pandas Index.
+ + Examples + -------- + >>> import cudf + >>> idx = cudf.Index([-3, 10, 15, 20]) + >>> idx + Int64Index([-3, 10, 15, 20], dtype='int64') + >>> idx.to_pandas() + Int64Index([-3, 10, 15, 20], dtype='int64') + >>> type(idx.to_pandas()) + <class 'pandas.core.indexes.numeric.Int64Index'> + >>> type(idx) + <class 'cudf.core.index.Int64Index'> + """ + return pd.Index(self._values.to_pandas(), name=self.name) + + @ioutils.doc_to_dlpack() + def to_dlpack(self): + """{docstring}""" + + return cudf.io.dlpack.to_dlpack(self) + + @property + def gpu_values(self): + """ + View the data as a numba device array object + """ + return self._values.data_array_view + + def append(self, other): + """ + Append a collection of Index objects together. + + Parameters + ---------- + other : Index or list/tuple of indices + + Returns + ------- + appended : Index + + Examples + -------- + >>> import cudf + >>> idx = cudf.Index([1, 2, 10, 100]) + >>> idx + Int64Index([1, 2, 10, 100], dtype='int64') + >>> other = cudf.Index([200, 400, 50]) + >>> other + Int64Index([200, 400, 50], dtype='int64') + >>> idx.append(other) + Int64Index([1, 2, 10, 100, 200, 400, 50], dtype='int64') + + append accepts list of Index objects + + >>> idx.append([other, other]) + Int64Index([1, 2, 10, 100, 200, 400, 50, 200, 400, 50], dtype='int64') + """ + + if is_list_like(other): + to_concat = [self] + to_concat.extend(other) + else: + this = self + if len(other) == 0: + # short-circuit and return a copy + to_concat = [self] + + other = cudf.Index(other) + + if len(self) == 0: + to_concat = [other] + + if len(self) and len(other): + if is_mixed_with_object_dtype(this, other): + got_dtype = ( + other.dtype + if this.dtype == cudf.dtype("object") + else this.dtype + ) + raise TypeError( + f"cudf does not support appending an Index of " + f"dtype `{cudf.dtype('object')}` with an Index " + f"of dtype `{got_dtype}`, please type-cast " + f"either one of them to same dtypes."
+ ) + + if isinstance(self._values, cudf.core.column.NumericalColumn): + if self.dtype != other.dtype: + this, other = numeric_normalize_types(self, other) + to_concat = [this, other] + + for obj in to_concat: + if not isinstance(obj, BaseIndex): + raise TypeError("all inputs must be Index") + + return self._concat(to_concat) + + def difference(self, other, sort=None): + """ + Return a new Index with elements from the index that are not in + `other`. + + This is the set difference of two Index objects. + + Parameters + ---------- + other : Index or array-like + sort : False or None, default None + Whether to sort the resulting index. By default, the + values are attempted to be sorted, but any TypeError from + incomparable elements is caught by cudf. + + * None : Attempt to sort the result, but catch any TypeErrors + from comparing incomparable elements. + * False : Do not sort the result. + + Returns + ------- + difference : Index + + Examples + -------- + >>> import cudf + >>> idx1 = cudf.Index([2, 1, 3, 4]) + >>> idx1 + Int64Index([2, 1, 3, 4], dtype='int64') + >>> idx2 = cudf.Index([3, 4, 5, 6]) + >>> idx2 + Int64Index([3, 4, 5, 6], dtype='int64') + >>> idx1.difference(idx2) + Int64Index([1, 2], dtype='int64') + >>> idx1.difference(idx2, sort=False) + Int64Index([2, 1], dtype='int64') + """ + if sort not in {None, False}: + raise ValueError( + f"The 'sort' keyword only takes the values " + f"of None or False; {sort} was passed." + ) + + other = cudf.Index(other) + + if is_mixed_with_object_dtype(self, other): + difference = self.copy() + else: + difference = self.join(other, how="leftanti") + if self.dtype != other.dtype: + difference = difference.astype(self.dtype) + + if sort is None: + return difference.sort_values() + + return difference + + def sort_values(self, return_indexer=False, ascending=True, key=None): + """ + Return a sorted copy of the index, and optionally return the indices + that sorted the index itself. 
+ + Parameters + ---------- + return_indexer : bool, default False + Should the indices that would sort the index be returned. + ascending : bool, default True + Should the index values be sorted in an ascending order. + key : None, optional + This parameter is NON-FUNCTIONAL. + + Returns + ------- + sorted_index : Index + Sorted copy of the index. + indexer : cupy.ndarray, optional + The indices that the index itself was sorted by. + + See Also + -------- + cudf.Series.sort_values : Sort values of a Series. + cudf.DataFrame.sort_values : Sort values in a DataFrame. + + Examples + -------- + >>> import cudf + >>> idx = cudf.Index([10, 100, 1, 1000]) + >>> idx + Int64Index([10, 100, 1, 1000], dtype='int64') + + Sort values in ascending order (default behavior). + + >>> idx.sort_values() + Int64Index([1, 10, 100, 1000], dtype='int64') + + Sort values in descending order, and also get the indices `idx` was + sorted by. + + >>> idx.sort_values(ascending=False, return_indexer=True) + (Int64Index([1000, 100, 10, 1], dtype='int64'), array([3, 1, 0, 2], + dtype=int32)) + + Sorting values in a MultiIndex: + + >>> midx = cudf.MultiIndex( + ... levels=[[1, 3, 4, -10], [1, 11, 5]], + ... codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], + ... names=["x", "y"], + ... ) + >>> midx + MultiIndex([( 1, 1), + ( 1, 5), + ( 3, 11), + ( 4, 11), + (-10, 1)], + names=['x', 'y']) + >>> midx.sort_values() + MultiIndex([(-10, 1), + ( 1, 1), + ( 1, 5), + ( 3, 11), + ( 4, 11)], + names=['x', 'y']) + >>> midx.sort_values(ascending=False) + MultiIndex([( 4, 11), + ( 3, 11), + ( 1, 5), + ( 1, 1), + (-10, 1)], + names=['x', 'y']) + """ + if key is not None: + raise NotImplementedError("key parameter is not yet implemented.") + + indices = self._values.argsort(ascending=ascending) + index_sorted = cudf.Index(self.take(indices), name=self.name) + + if return_indexer: + return index_sorted, cupy.asarray(indices) + else: + return index_sorted + + def unique(self): + """ + Return unique values in the index.
+ + Returns + ------- + Index without duplicates + """ + return cudf.Index(self._values.unique(), name=self.name) + + def join( + self, other, how="left", level=None, return_indexers=False, sort=False + ): + """ + Compute join_index and indexers to conform data structures + to the new index. + + Parameters + ---------- + other : Index. + how : {'left', 'right', 'inner', 'outer'} + return_indexers : bool, default False + sort : bool, default False + Sort the join keys lexicographically in the result Index. If False, + the order of the join keys depends on the join type (how keyword). + + Returns: index + + Examples + -------- + >>> import cudf + >>> lhs = cudf.DataFrame( + ... {"a":[2, 3, 1], "b":[3, 4, 2]}).set_index(['a', 'b'] + ... ).index + >>> lhs + MultiIndex([(2, 3), + (3, 4), + (1, 2)], + names=['a', 'b']) + >>> rhs = cudf.DataFrame({"a":[1, 4, 3]}).set_index('a').index + >>> rhs + Int64Index([1, 4, 3], dtype='int64', name='a') + >>> lhs.join(rhs, how='inner') + MultiIndex([(3, 4), + (1, 2)], + names=['a', 'b']) + """ + + if isinstance(self, cudf.MultiIndex) and isinstance( + other, cudf.MultiIndex + ): + raise TypeError( + "Join on level between two MultiIndex objects is ambiguous" + ) + + if level is not None and not is_scalar(level): + raise ValueError("level should be an int or a label only") + + if isinstance(other, cudf.MultiIndex): + if how == "left": + how = "right" + elif how == "right": + how = "left" + rhs = self.copy(deep=False) + lhs = other.copy(deep=False) + else: + lhs = self.copy(deep=False) + rhs = other.copy(deep=False) + + on = level + # In case of MultiIndex, it will be None as + # we don't need to update name + left_names = lhs.names + right_names = rhs.names + # There should be no `None` values in Joined indices, + # so essentially it would be `left/right` or 'inner' + # in case of MultiIndex + if isinstance(lhs, cudf.MultiIndex): + if level is not None and isinstance(level, int): + on = lhs._data.select_by_index(level).names[0] + 
right_names = (on,) or right_names + on = right_names[0] + if how == "outer": + how = "left" + elif how == "right": + how = "inner" + else: + # Both are normal indices + right_names = left_names + on = right_names[0] + + lhs.names = left_names + rhs.names = right_names + + output = lhs._merge(rhs, how=how, on=on, sort=sort) + + return output + + def rename(self, name, inplace=False): + """ + Alter Index name. + + Defaults to returning new index. + + Parameters + ---------- + name : label + Name(s) to set. + + Returns + ------- + Index + + Examples + -------- + >>> import cudf + >>> index = cudf.Index([1, 2, 3], name='one') + >>> index + Int64Index([1, 2, 3], dtype='int64', name='one') + >>> index.name + 'one' + >>> renamed_index = index.rename('two') + >>> renamed_index + Int64Index([1, 2, 3], dtype='int64', name='two') + >>> renamed_index.name + 'two' + """ + if inplace is True: + self.name = name + return None + else: + out = self.copy(deep=False) + out.name = name + return out.copy(deep=True) + + def astype(self, dtype, copy=False): + """ + Create an Index with values cast to dtypes. The class of a new Index + is determined by dtype. When conversion is impossible, a ValueError + exception is raised. + + Parameters + ---------- + dtype : numpy dtype + Use a numpy.dtype to cast entire Index object to. + copy : bool, default False + By default, astype always returns a newly allocated object. + If copy is set to False and internal requirements on dtype are + satisfied, the original data is used to create a new Index + or the original Index is returned. + + Returns + ------- + Index + Index with values cast to specified dtype.
+ + Examples + -------- + >>> import cudf + >>> index = cudf.Index([1, 2, 3]) + >>> index + Int64Index([1, 2, 3], dtype='int64') + >>> index.astype('float64') + Float64Index([1.0, 2.0, 3.0], dtype='float64') + """ + if is_dtype_equal(dtype, self.dtype): + return self.copy(deep=copy) + + return cudf.Index( + self.copy(deep=copy)._values.astype(dtype), name=self.name + ) + + def to_array(self, fillna=None): + """Get a dense numpy array for the data. + + Parameters + ---------- + fillna : str or None + Defaults to None, which will skip null values. + If it equals "pandas", null values are filled with NaNs. + Non integral dtype is promoted to np.float64. + + Notes + ----- + + if ``fillna`` is ``None``, null values are skipped. Therefore, the + output size could be smaller. + """ + return self._values.to_array(fillna=fillna) + + def to_series(self, index=None, name=None): + """ + Create a Series with both index and values equal to the index keys. + Useful with map for returning an indexer based on an index. + + Parameters + ---------- + index : Index, optional + Index of resulting Series. If None, defaults to original index. + name : str, optional + Name of resulting Series. If None, defaults to name of original + index. + + Returns + ------- + Series + The dtype will be based on the type of the Index values. + """ + return cudf.Series( + self._values, + index=self.copy(deep=False) if index is None else index, + name=self.name if name is None else name, + ) + + def get_slice_bound(self, label, side, kind): + """ + Calculate slice bound that corresponds to given label. + Returns leftmost (one-past-the-rightmost if ``side=='right'``) position + of given label. + + Parameters + ---------- + label : object + side : {'left', 'right'} + kind : {'ix', 'loc', 'getitem'} + + Returns + ------- + int + Index of label.
+ """ + raise (NotImplementedError) + + def __array_function__(self, func, types, args, kwargs): + + # check if the function is implemented for the current type + cudf_index_module = type(self) + for submodule in func.__module__.split(".")[1:]: + # point cudf_index_module to the correct submodule + if hasattr(cudf_index_module, submodule): + cudf_index_module = getattr(cudf_index_module, submodule) + else: + return NotImplemented + + fname = func.__name__ + + handled_types = [BaseIndex, cudf.Series] + + # check if we don't handle any of the types (including sub-class) + for t in types: + if not any( + issubclass(t, handled_type) for handled_type in handled_types + ): + return NotImplemented + + if hasattr(cudf_index_module, fname): + cudf_func = getattr(cudf_index_module, fname) + # Handle case if cudf_func is same as numpy function + if cudf_func is func: + return NotImplemented + else: + return cudf_func(*args, **kwargs) + + else: + return NotImplemented + + def isin(self, values): + """Return a boolean array where the index values are in values. + + Compute boolean array of whether each index value is found in + the passed set of values. The length of the returned boolean + array matches the length of the index. + + Parameters + ---------- + values : set, list-like, Index + Sought values. + + Returns + ------- + is_contained : cupy array + CuPy array of boolean values. + + Examples + -------- + >>> idx = cudf.Index([1,2,3]) + >>> idx + Int64Index([1, 2, 3], dtype='int64') + + Check whether each index value in a list of values. + + >>> idx.isin([1, 4]) + array([ True, False, False]) + """ + + return self._values.isin(values).values + + def memory_usage(self, deep=False): + """ + Memory usage of the values. + + Parameters + ---------- + deep : bool + Introspect the data deeply, + interrogate `object` dtypes for system-level + memory consumption. 
+ + Returns + ------- + bytes used + """ + return self._values._memory_usage(deep=deep) + + @classmethod + def from_pandas(cls, index, nan_as_null=None): + """ + Convert from a Pandas Index. + + Parameters + ---------- + index : Pandas Index object + A Pandas Index object which has to be converted + to cuDF Index. + nan_as_null : bool, Default None + If ``None``/``True``, converts ``np.nan`` values + to ``null`` values. + If ``False``, leaves ``np.nan`` values as is. + + Raises + ------ + TypeError for invalid input type. + + Examples + -------- + >>> import cudf + >>> import pandas as pd + >>> import numpy as np + >>> data = [10, 20, 30, np.nan] + >>> pdi = pd.Index(data) + >>> cudf.Index.from_pandas(pdi) + Float64Index([10.0, 20.0, 30.0, <NA>], dtype='float64') + >>> cudf.Index.from_pandas(pdi, nan_as_null=False) + Float64Index([10.0, 20.0, 30.0, nan], dtype='float64') + """ + if not isinstance(index, pd.Index): + raise TypeError("not a pandas.Index") + + ind = cudf.Index(column.as_column(index, nan_as_null=nan_as_null)) + ind.name = index.name + return ind + + @property + def _constructor_expanddim(self): + return cudf.MultiIndex diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index 0688283bc43..ea3c7bfb91f 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -10,7 +10,7 @@ from cudf._typing import ColumnLike, ScalarLike from cudf.core.column import ColumnBase from cudf.core.dataframe import DataFrame -from cudf.core.frame import Frame +from cudf.core.frame import Frame, SingleColumnFrame from cudf.core.index import Index from cudf.core.series import Series @@ -94,9 +94,9 @@ def _check_and_cast_columns_with_other( def _normalize_columns_and_scalars_type( - frame: Union[Series, Index, DataFrame], other: Any, inplace: bool = False, + frame: Frame, other: Any, inplace: bool = False, ) -> Tuple[ - Union[Series, Index, DataFrame, ColumnLike], Any, + Union[Frame,
ColumnLike], Any, ]: """ Try to normalize the other's dtypes as per frame. @@ -177,10 +177,7 @@ def _normalize_columns_and_scalars_type( def where( - frame: Union[Series, Index, DataFrame], - cond: Any, - other: Any = None, - inplace: bool = False, + frame: Frame, cond: Any, other: Any = None, inplace: bool = False, ) -> Optional[Union[Frame]]: """ Replace values where the condition is False. @@ -332,6 +329,7 @@ def where( return frame._mimic_inplace(out_df, inplace=inplace) else: + frame = cast(SingleColumnFrame, frame) if isinstance(other, DataFrame): raise NotImplementedError( "cannot align with a higher dimensional Frame" diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py index 50ad592b54f..fa6c49284f0 100644 --- a/python/cudf/cudf/core/algorithms.py +++ b/python/cudf/cudf/core/algorithms.py @@ -6,8 +6,8 @@ from cudf.core.column import as_column from cudf.core.frame import Frame -from cudf.core.index import RangeIndex -from cudf.core.series import Index, Series +from cudf.core.index import Index, RangeIndex +from cudf.core.series import Series def factorize(values, sort=False, na_sentinel=-1, size_hint=None): diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 7333ae119cd..76dd0683a5a 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -37,7 +37,7 @@ ) if TYPE_CHECKING: - from cudf._typing import SeriesOrIndex + from cudf._typing import SeriesOrIndex, SeriesOrSingleColumnIndex from cudf.core.column import ( ColumnBase, DatetimeColumn, @@ -104,7 +104,7 @@ class CategoricalAccessor(ColumnMethods): _column: CategoricalColumn - def __init__(self, parent: SeriesOrIndex): + def __init__(self, parent: SeriesOrSingleColumnIndex): if not is_categorical_dtype(parent.dtype): raise AttributeError( "Can only use .cat accessor with a 'category' dtype" diff --git a/python/cudf/cudf/core/column/methods.py 
b/python/cudf/cudf/core/column/methods.py index a587c58a49d..9bea94cfecb 100644 --- a/python/cudf/cudf/core/column/methods.py +++ b/python/cudf/cudf/core/column/methods.py @@ -8,7 +8,7 @@ import cudf -ParentType = Union["cudf.Series", "cudf.BaseIndex"] +ParentType = Union["cudf.Series", "cudf.core.index.GenericIndex"] class ColumnMethods: diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index a739eba71f3..aac0b027c0b 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -2,6 +2,7 @@ from __future__ import annotations, division +import functools import inspect import itertools import numbers @@ -10,7 +11,7 @@ import warnings from collections import defaultdict from collections.abc import Iterable, Sequence -from typing import Any, MutableMapping, Optional, TypeVar +from typing import Any, MutableMapping, Optional, Set, TypeVar import cupy import numpy as np @@ -25,10 +26,15 @@ import cudf import cudf.core.common from cudf import _lib as libcudf -from cudf.api.types import is_bool_dtype, is_dict_like +from cudf.api.types import is_bool_dtype, is_dict_like, is_dtype_equal from cudf.core import column, reshape from cudf.core.abc import Serializable -from cudf.core.column import as_column, column_empty +from cudf.core.column import ( + as_column, + build_categorical_column, + column_empty, + concat_columns, +) from cudf.core.column_accessor import ColumnAccessor from cudf.core.frame import Frame, _drop_rows_by_labels from cudf.core.groupby.groupby import DataFrameGroupBy @@ -47,9 +53,11 @@ is_datetime_dtype, is_list_dtype, is_list_like, + is_numerical_dtype, is_scalar, is_string_dtype, is_struct_dtype, + min_scalar_type, numeric_normalize_types, ) from cudf.utils.utils import GetAttrGetItemMixin @@ -160,7 +168,8 @@ class DataFrame(Frame, Serializable, GetAttrGetItemMixin): 3 3 0.3 """ - _PROTECTED_KEYS = frozenset(("_data", "_index")) + _PROTECTED_KEYS = frozenset(("_column_accessor", "_data", 
"_index")) + _accessors: Set[Any] = set() @annotate("DATAFRAME_INIT", color="blue", domain="cudf_python") def __init__(self, data=None, index=None, columns=None, dtype=None): @@ -1029,6 +1038,209 @@ def assign(self, **kwargs): new[k] = v return new + @classmethod + @annotate("CONCAT", color="orange", domain="cudf_python") + def _concat( + cls, objs, axis=0, join="outer", ignore_index=False, sort=False + ): + # flag to indicate at least one empty input frame also has an index + empty_has_index = False + # length of output frame's RangeIndex if all input frames are empty, + # and at least one has an index + result_index_length = 0 + # the number of empty input frames + num_empty_input_frames = 0 + + for i, obj in enumerate(objs): + # shallow-copy the input DFs in case the same DF instance + # is concatenated with itself + objs[i] = obj.copy(deep=False) + + # If ignore_index is true, determine if + # all or some objs are empty(and have index). + # 1. If all objects are empty(and have index), we + # should set the index separately using RangeIndex. + # 2. If some objects are empty(and have index), we + # create empty columns later while populating `columns` + # variable. Detailed explanation of second case before + # allocation of `columns` variable below. + if ignore_index and obj.empty: + num_empty_input_frames += 1 + result_index_length += len(obj) + empty_has_index = empty_has_index or len(obj) > 0 + + if join == "inner": + sets_of_column_names = [set(obj._column_names) for obj in objs] + + intersecting_columns = functools.reduce( + set.intersection, sets_of_column_names + ) + union_of_columns = functools.reduce( + set.union, sets_of_column_names + ) + non_intersecting_columns = union_of_columns.symmetric_difference( + intersecting_columns + ) + + # Get an ordered list of the intersecting columns to preserve input + # order, which is promised by pandas for inner joins. 
+ ordered_intersecting_columns = [ + name + for obj in objs + for name in obj._column_names + if name in intersecting_columns + ] + + names = dict.fromkeys(ordered_intersecting_columns).keys() + + if axis == 0: + if ignore_index and ( + num_empty_input_frames > 0 + or len(intersecting_columns) == 0 + ): + # When ignore_index is True and if there is + # at least 1 empty dataframe and no + # intersecting columns are present, an empty dataframe + # needs to be returned just with an Index. + empty_has_index = True + num_empty_input_frames = len(objs) + result_index_length = sum(len(obj) for obj in objs) + + # remove columns not present in all objs + for obj in objs: + obj.drop( + columns=non_intersecting_columns, + inplace=True, + errors="ignore", + ) + elif join == "outer": + # Get a list of the unique table column names + names = [name for f in objs for name in f._column_names] + names = dict.fromkeys(names).keys() + + else: + raise ValueError( + "Only can inner (intersect) or outer (union) when joining" + "the other axis" + ) + + if sort: + try: + # Sorted always returns a list, but will fail to sort if names + # include different types that are not comparable. + names = sorted(names) + except TypeError: + names = list(names) + else: + names = list(names) + + # Combine the index and table columns for each Frame into a list of + # [...index_cols, ...table_cols]. + # + # If any of the input frames have a non-empty index, include these + # columns in the list of columns to concatenate, even if the input + # frames are empty and `ignore_index=True`. 
+ columns = [ + ( + [] + if (ignore_index and not empty_has_index) + else list(f._index._data.columns) + ) + + [f._data[name] if name in f._data else None for name in names] + for f in objs + ] + + # Get a list of the combined index and table column indices + indices = list(range(functools.reduce(max, map(len, columns)))) + # The position of the first table column in each + # combined index + table columns list + first_data_column_position = len(indices) - len(names) + + # Get the non-null columns and their dtypes + non_null_cols, dtypes = _get_non_null_cols_and_dtypes(indices, columns) + + # Infer common dtypes between numeric columns + # and combine CategoricalColumn categories + categories = _find_common_dtypes_and_categories(non_null_cols, dtypes) + + # Cast all columns to a common dtype, assign combined categories, + # and back-fill missing columns with all-null columns + _cast_cols_to_common_dtypes(indices, columns, dtypes, categories) + + # Construct input tables with the index and data columns in the same + # order. This strips the given index/column names and replaces the + # names with their integer positions in the `cols` list + tables = [] + for cols in columns: + table_index = None + if 1 == first_data_column_position: + table_index = cudf.core.index.as_index(cols[0]) + elif first_data_column_position > 1: + table_index = libcudf.table.Table( + data=dict( + zip( + indices[:first_data_column_position], + cols[:first_data_column_position], + ) + ) + ) + tables.append( + libcudf.table.Table( + data=dict( + zip( + indices[first_data_column_position:], + cols[first_data_column_position:], + ) + ), + index=table_index, + ) + ) + + # Concatenate the Tables + out = cls._from_data( + *libcudf.concat.concat_tables(tables, ignore_index) + ) + + # If ignore_index is True, all input frames are empty, and at + # least one input frame has an index, assign a new RangeIndex + # to the result frame.
+ if empty_has_index and num_empty_input_frames == len(objs): + out._index = cudf.RangeIndex(result_index_length) + # Reassign the categories for any categorical table cols + _reassign_categories( + categories, out._data, indices[first_data_column_position:] + ) + + # Reassign the categories for any categorical index cols + if not isinstance(out._index, cudf.RangeIndex): + _reassign_categories( + categories, + out._index._data, + indices[:first_data_column_position], + ) + if not isinstance( + out._index, cudf.MultiIndex + ) and is_categorical_dtype(out._index._values.dtype): + out = out.set_index( + cudf.core.index.as_index(out.index._values) + ) + + # Reassign precision for any decimal cols + for name, col in out._data.items(): + if isinstance(col, cudf.core.column.Decimal64Column): + col = col._with_type_metadata(tables[0]._data[name].dtype) + + # Reassign index and column names + if isinstance(objs[0].columns, pd.MultiIndex): + out.columns = objs[0].columns + else: + out.columns = names + if not ignore_index: + out._index.name = objs[0]._index.name + out._index.names = objs[0]._index.names + + return out + def astype(self, dtype, copy=False, errors="raise", **kwargs): """ Cast the DataFrame to the given dtype @@ -7295,7 +7507,7 @@ def _get_union_of_indices(indexes): if len(indexes) == 1: return indexes[0] else: - merged_index = cudf.Index._concat(indexes) + merged_index = cudf.core.index.GenericIndex._concat(indexes) merged_index = merged_index.drop_duplicates() _, inds = merged_index._values.sort_by_values() return merged_index.take(inds) @@ -7336,3 +7548,95 @@ def _drop_columns(df: DataFrame, columns: Iterable, errors: str): pass else: raise e + + +# Create a dictionary of the common, non-null columns +def _get_non_null_cols_and_dtypes(col_idxs, list_of_columns): + # A mapping of {idx: np.dtype} + dtypes = dict() + # A mapping of {idx: [...columns]}, where `[...columns]` + # is a list of columns with at least one valid value for each + # column name across 
all input frames + non_null_columns = dict() + for idx in col_idxs: + for cols in list_of_columns: + # Skip columns not in this frame + if idx >= len(cols) or cols[idx] is None: + continue + # Store the first dtype we find for a column, even if it's + # all-null. This ensures we always have at least one dtype + # for each name. This dtype will be overwritten later if a + # non-null Column with the same name is found. + if idx not in dtypes: + dtypes[idx] = cols[idx].dtype + if cols[idx].valid_count > 0: + if idx not in non_null_columns: + non_null_columns[idx] = [cols[idx]] + else: + non_null_columns[idx].append(cols[idx]) + return non_null_columns, dtypes + + +def _find_common_dtypes_and_categories(non_null_columns, dtypes): + # A mapping of {idx: categories}, where `categories` is a + # column of all the unique categorical values from each + # categorical column across all input frames + categories = dict() + for idx, cols in non_null_columns.items(): + # default to the first non-null dtype + dtypes[idx] = cols[0].dtype + # If all the non-null dtypes are int/float, find a common dtype + if all(is_numerical_dtype(col.dtype) for col in cols): + dtypes[idx] = find_common_type([col.dtype for col in cols]) + # If all categorical dtypes, combine the categories + elif all( + isinstance(col, cudf.core.column.CategoricalColumn) for col in cols + ): + # Combine and de-dupe the categories + categories[idx] = ( + cudf.Series(concat_columns([col.categories for col in cols])) + .drop_duplicates(ignore_index=True) + ._column + ) + # Set the column dtype to the codes' dtype. 
The categories + # will be re-assigned at the end + dtypes[idx] = min_scalar_type(len(categories[idx])) + # Otherwise raise an error if columns have different dtypes + elif not all(is_dtype_equal(c.dtype, dtypes[idx]) for c in cols): + raise ValueError("All columns must be the same type") + return categories + + +def _cast_cols_to_common_dtypes(col_idxs, list_of_columns, dtypes, categories): + # Cast all columns to a common dtype, assign combined categories, + # and back-fill missing columns with all-null columns + for idx in col_idxs: + dtype = dtypes[idx] + for cols in list_of_columns: + # If column not in this df, fill with an all-null column + if idx >= len(cols) or cols[idx] is None: + n = len(next(x for x in cols if x is not None)) + cols[idx] = column_empty(row_count=n, dtype=dtype, masked=True) + else: + # If column is categorical, rebase the codes with the + # combined categories, and cast the new codes to the + # min-scalar-sized dtype + if idx in categories: + cols[idx] = ( + cols[idx] + ._set_categories(categories[idx], is_unique=True,) + .codes + ) + cols[idx] = cols[idx].astype(dtype) + + +def _reassign_categories(categories, cols, col_idxs): + for name, idx in zip(cols, col_idxs): + if idx in categories: + cols[name] = build_categorical_column( + categories=categories[idx], + codes=as_column(cols[name].base_data, dtype=cols[name].dtype), + mask=cols[name].base_mask, + offset=cols[name].offset, + size=cols[name].size, + ) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index b6eb3108550..33be14462d4 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -3,10 +3,18 @@ from __future__ import annotations import copy -import functools import warnings from collections import abc -from typing import Any, Dict, MutableMapping, Optional, Tuple, TypeVar, Union +from typing import ( + Any, + Dict, + MutableMapping, + Optional, + Tuple, + TypeVar, + Union, + cast, +) import cupy import numpy as np @@ 
-17,13 +25,12 @@ import cudf from cudf import _lib as libcudf from cudf._typing import ColumnLike, DataFrameOrSeries -from cudf.api.types import is_dict_like, is_dtype_equal, issubdtype +from cudf.api.types import is_dict_like, issubdtype from cudf.core.column import ( ColumnBase, as_column, build_categorical_column, column_empty, - concat_columns, ) from cudf.core.column_accessor import ColumnAccessor from cudf.core.join import merge @@ -33,14 +40,10 @@ from cudf.utils.dtypes import ( _is_non_decimal_numeric_dtype, _is_scalar_or_zero_d_array, - find_common_type, - is_categorical_dtype, is_column_like, is_decimal_dtype, is_integer_dtype, - is_numerical_dtype, is_scalar, - min_scalar_type, ) T = TypeVar("T", bound="Frame") @@ -60,12 +63,6 @@ class Frame(libcudf.table.Table): _data: "ColumnAccessor" - @classmethod - def __init_subclass__(cls): - # All subclasses contain a set _accessors that is used to hold custom - # accessors defined by user APIs (see cudf/api/extensions/accessor.py). - cls._accessors = set() - @classmethod def _from_data( cls, @@ -326,209 +323,6 @@ def copy(self: T, deep: bool = True) -> T: return new_frame - @classmethod - @annotate("CONCAT", color="orange", domain="cudf_python") - def _concat( - cls, objs, axis=0, join="outer", ignore_index=False, sort=False - ): - # flag to indicate at least one empty input frame also has an index - empty_has_index = False - # length of output frame's RangeIndex if all input frames are empty, - # and at least one has an index - result_index_length = 0 - # the number of empty input frames - num_empty_input_frames = 0 - - for i, obj in enumerate(objs): - # shallow-copy the input DFs in case the same DF instance - # is concatenated with itself - objs[i] = obj.copy(deep=False) - - # If ignore_index is true, determine if - # all or some objs are empty(and have index). - # 1. If all objects are empty(and have index), we - # should set the index separately using RangeIndex. - # 2. 
If some objects are empty(and have index), we - # create empty columns later while populating `columns` - # variable. Detailed explanation of second case before - # allocation of `columns` variable below. - if ignore_index and obj.empty: - num_empty_input_frames += 1 - result_index_length += len(obj) - empty_has_index = empty_has_index or len(obj) > 0 - - if join == "inner": - sets_of_column_names = [set(obj._column_names) for obj in objs] - - intersecting_columns = functools.reduce( - set.intersection, sets_of_column_names - ) - union_of_columns = functools.reduce( - set.union, sets_of_column_names - ) - non_intersecting_columns = union_of_columns.symmetric_difference( - intersecting_columns - ) - - # Get an ordered list of the intersecting columns to preserve input - # order, which is promised by pandas for inner joins. - ordered_intersecting_columns = [ - name - for obj in objs - for name in obj._column_names - if name in intersecting_columns - ] - - names = dict.fromkeys(ordered_intersecting_columns).keys() - - if axis == 0: - if ignore_index and ( - num_empty_input_frames > 0 - or len(intersecting_columns) == 0 - ): - # When ignore_index is True and if there is - # at least 1 empty dataframe and no - # intersecting columns are present, an empty dataframe - # needs to be returned just with an Index. 
- empty_has_index = True - num_empty_input_frames = len(objs) - result_index_length = sum(len(obj) for obj in objs) - - # remove columns not present in all objs - for obj in objs: - obj.drop( - columns=non_intersecting_columns, - inplace=True, - errors="ignore", - ) - elif join == "outer": - # Get a list of the unique table column names - names = [name for f in objs for name in f._column_names] - names = dict.fromkeys(names).keys() - - else: - raise ValueError( - "Only can inner (intersect) or outer (union) when joining" - "the other axis" - ) - - if sort: - try: - # Sorted always returns a list, but will fail to sort if names - # include different types that are not comparable. - names = sorted(names) - except TypeError: - names = list(names) - else: - names = list(names) - - # Combine the index and table columns for each Frame into a list of - # [...index_cols, ...table_cols]. - # - # If any of the input frames have a non-empty index, include these - # columns in the list of columns to concatenate, even if the input - # frames are empty and `ignore_index=True`. 
- columns = [ - ( - [] - if (ignore_index and not empty_has_index) - else list(f._index._data.columns) - ) - + [f._data[name] if name in f._data else None for name in names] - for f in objs - ] - - # Get a list of the combined index and table column indices - indices = list(range(functools.reduce(max, map(len, columns)))) - # The position of the first table colum in each - # combined index + table columns list - first_data_column_position = len(indices) - len(names) - - # Get the non-null columns and their dtypes - non_null_cols, dtypes = _get_non_null_cols_and_dtypes(indices, columns) - - # Infer common dtypes between numeric columns - # and combine CategoricalColumn categories - categories = _find_common_dtypes_and_categories(non_null_cols, dtypes) - - # Cast all columns to a common dtype, assign combined categories, - # and back-fill missing columns with all-null columns - _cast_cols_to_common_dtypes(indices, columns, dtypes, categories) - - # Construct input tables with the index and data columns in the same - # order. This strips the given index/column names and replaces the - # names with their integer positions in the `cols` list - tables = [] - for cols in columns: - table_index = None - if 1 == first_data_column_position: - table_index = cudf.core.index.as_index(cols[0]) - elif first_data_column_position > 1: - table_index = libcudf.table.Table( - data=dict( - zip( - indices[:first_data_column_position], - cols[:first_data_column_position], - ) - ) - ) - tables.append( - libcudf.table.Table( - data=dict( - zip( - indices[first_data_column_position:], - cols[first_data_column_position:], - ) - ), - index=table_index, - ) - ) - - # Concatenate the Tables - out = cls._from_data( - *libcudf.concat.concat_tables(tables, ignore_index) - ) - - # If ignore_index is True, all input frames are empty, and at - # least one input frame has an index, assign a new RangeIndex - # to the result frame. 
- if empty_has_index and num_empty_input_frames == len(objs): - out._index = cudf.RangeIndex(result_index_length) - # Reassign the categories for any categorical table cols - _reassign_categories( - categories, out._data, indices[first_data_column_position:] - ) - - # Reassign the categories for any categorical index cols - if not isinstance(out._index, cudf.RangeIndex): - _reassign_categories( - categories, - out._index._data, - indices[:first_data_column_position], - ) - if not isinstance( - out._index, cudf.MultiIndex - ) and is_categorical_dtype(out._index._values.dtype): - out = out.set_index( - cudf.core.index.as_index(out.index._values) - ) - - # Reassign precision for any decimal cols - for name, col in out._data.items(): - if isinstance(col, cudf.core.column.Decimal64Column): - col = col._with_type_metadata(tables[0]._data[name].dtype) - - # Reassign index and column names - if isinstance(objs[0].columns, pd.MultiIndex): - out.columns = objs[0].columns - else: - out.columns = names - if not ignore_index: - out._index.name = objs[0]._index.name - out._index.names = objs[0]._index.names - - return out - def equals(self, other, **kwargs): """ Test whether two objects contain the same elements. 
@@ -2336,7 +2130,7 @@ def _copy_type_metadata( if include_index: if self._index is not None and other._index is not None: - self._index._copy_type_metadata(other._index) + self._index._copy_type_metadata(other._index) # type: ignore # When other._index is a CategoricalIndex, the current index # will be a NumericalIndex with an underlying CategoricalColumn # (the above _copy_type_metadata call will have converted the @@ -2347,7 +2141,9 @@ def _copy_type_metadata( ) and not isinstance( self._index, cudf.core.index.CategoricalIndex ): - self._index = cudf.Index(self._index._column) + self._index = cudf.Index( + cast(cudf.core.index.NumericIndex, self._index)._column + ) return self @@ -3429,6 +3225,26 @@ def _binaryop( *args, **kwargs, ) -> Frame: + """Perform a binary operation between two frames. + + Parameters + ---------- + other : Frame + The second operand. + fn : str + The operation to perform. + fill_value : Any, default None + The value to replace null values with. If ``None``, nulls are not + filled before the operation. + reflect : bool, default False + If ``True`` the operation is reflected (i.e whether to swap the + left and right operands). + + Returns + ------- + Frame + A new instance containing the result of the operation. + """ raise NotImplementedError @classmethod @@ -3455,8 +3271,8 @@ def _colwise_binop( Returns ------- - Frame - A subclass of Frame constructed from the result of performing the + Dict[ColumnBase] + A dict of columns constructed from the result of performing the requested operation on the operands. """ @@ -5089,39 +4905,32 @@ def factorize(self, na_sentinel=-1): """ return cudf.core.algorithms.factorize(self, na_sentinel=na_sentinel) - def _binaryop( + def _make_operands_for_binop( self, other: T, - fn: str, fill_value: Any = None, reflect: bool = False, *args, **kwargs, - ) -> SingleColumnFrame: - """Perform a binary operation between two single column frames. 
+ ) -> Dict[Optional[str], Tuple[ColumnBase, Any, bool, Any]]: + """Generate the dictionary of operands used for a binary operation. Parameters ---------- other : SingleColumnFrame The second operand. - fn : str - The operation fill_value : Any, default None The value to replace null values with. If ``None``, nulls are not filled before the operation. reflect : bool, default False If ``True`` the operation is reflected (i.e whether to swap the left and right operands). - lhs : SingleColumnFrame, default None - The left hand operand. If ``None``, self is used. This parameter - allows child classes to preprocess the inputs if necessary. Returns ------- - SingleColumnFrame - A new instance containing the result of the operation. + Dict[Optional[str], Tuple[ColumnBase, Any, bool, Any]] + The operands to be passed to _colwise_binop. """ - # Get the appropriate name for output operations involving two objects # that are Series-like objects. The output shares the lhs's name unless # the rhs is a _differently_ named Series-like object. 
@@ -5143,15 +4952,7 @@ def _binaryop( except Exception: return NotImplemented - operands: Dict[Optional[str], Tuple[ColumnBase, Any, bool, Any]] = { - result_name: (self._column, other, reflect, fill_value) - } - - return self._from_data( - data=type(self)._colwise_binop(operands, fn), - index=self._index, - name=result_name, - ) + return {result_name: (self._column, other, reflect, fill_value)} def _get_replacement_values_for_columns( @@ -5310,98 +5111,6 @@ def _get_replacement_values_for_columns( return all_na_columns, to_replace_columns, values_columns -# Create a dictionary of the common, non-null columns -def _get_non_null_cols_and_dtypes(col_idxs, list_of_columns): - # A mapping of {idx: np.dtype} - dtypes = dict() - # A mapping of {idx: [...columns]}, where `[...columns]` - # is a list of columns with at least one valid value for each - # column name across all input frames - non_null_columns = dict() - for idx in col_idxs: - for cols in list_of_columns: - # Skip columns not in this frame - if idx >= len(cols) or cols[idx] is None: - continue - # Store the first dtype we find for a column, even if it's - # all-null. This ensures we always have at least one dtype - # for each name. This dtype will be overwritten later if a - # non-null Column with the same name is found. 
- if idx not in dtypes: - dtypes[idx] = cols[idx].dtype - if cols[idx].valid_count > 0: - if idx not in non_null_columns: - non_null_columns[idx] = [cols[idx]] - else: - non_null_columns[idx].append(cols[idx]) - return non_null_columns, dtypes - - -def _find_common_dtypes_and_categories(non_null_columns, dtypes): - # A mapping of {idx: categories}, where `categories` is a - # column of all the unique categorical values from each - # categorical column across all input frames - categories = dict() - for idx, cols in non_null_columns.items(): - # default to the first non-null dtype - dtypes[idx] = cols[0].dtype - # If all the non-null dtypes are int/float, find a common dtype - if all(is_numerical_dtype(col.dtype) for col in cols): - dtypes[idx] = find_common_type([col.dtype for col in cols]) - # If all categorical dtypes, combine the categories - elif all( - isinstance(col, cudf.core.column.CategoricalColumn) for col in cols - ): - # Combine and de-dupe the categories - categories[idx] = ( - cudf.Series(concat_columns([col.categories for col in cols])) - .drop_duplicates(ignore_index=True) - ._column - ) - # Set the column dtype to the codes' dtype. 
The categories - # will be re-assigned at the end - dtypes[idx] = min_scalar_type(len(categories[idx])) - # Otherwise raise an error if columns have different dtypes - elif not all(is_dtype_equal(c.dtype, dtypes[idx]) for c in cols): - raise ValueError("All columns must be the same type") - return categories - - -def _cast_cols_to_common_dtypes(col_idxs, list_of_columns, dtypes, categories): - # Cast all columns to a common dtype, assign combined categories, - # and back-fill missing columns with all-null columns - for idx in col_idxs: - dtype = dtypes[idx] - for cols in list_of_columns: - # If column not in this df, fill with an all-null column - if idx >= len(cols) or cols[idx] is None: - n = len(next(x for x in cols if x is not None)) - cols[idx] = column_empty(row_count=n, dtype=dtype, masked=True) - else: - # If column is categorical, rebase the codes with the - # combined categories, and cast the new codes to the - # min-scalar-sized dtype - if idx in categories: - cols[idx] = ( - cols[idx] - ._set_categories(categories[idx], is_unique=True,) - .codes - ) - cols[idx] = cols[idx].astype(dtype) - - -def _reassign_categories(categories, cols, col_idxs): - for name, idx in zip(cols, col_idxs): - if idx in categories: - cols[name] = build_categorical_column( - categories=categories[idx], - codes=as_column(cols[name].base_data, dtype=cols[name].dtype), - mask=cols[name].base_mask, - offset=cols[name].offset, - size=cols[name].size, - ) - - def _is_series(obj): """ Checks if the `obj` is of type `cudf.Series` diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 6be21ce74d2..6b4b77fabc5 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -2,6 +2,7 @@ from __future__ import annotations, division, print_function +import math import pickle from numbers import Number from typing import ( @@ -12,6 +13,7 @@ Optional, Tuple, Type, + TypeVar, Union, ) @@ -26,14 +28,8 @@ from cudf._lib.filling import sequence from 
cudf._lib.search import search_sorted from cudf._lib.table import Table -from cudf._typing import DtypeObj -from cudf.api.types import ( - _is_scalar_or_zero_d_array, - is_dtype_equal, - is_integer, - is_string_dtype, -) -from cudf.core.abc import Serializable +from cudf.api.types import _is_scalar_or_zero_d_array, is_string_dtype +from cudf.core._base_index import BaseIndex from cudf.core.column import ( CategoricalColumn, ColumnBase, @@ -41,1282 +37,78 @@ IntervalColumn, NumericalColumn, StringColumn, - TimeDeltaColumn, - arange, - column, -) -from cudf.core.column.column import as_column, concat_columns -from cudf.core.column.string import StringMethods as StringMethods -from cudf.core.dtypes import IntervalDtype -from cudf.core.frame import SingleColumnFrame -from cudf.utils import ioutils -from cudf.utils.docutils import copy_docstring -from cudf.utils.dtypes import ( - _is_non_decimal_numeric_dtype, - find_common_type, - is_categorical_dtype, - is_interval_dtype, - is_list_like, - is_mixed_with_object_dtype, - is_scalar, - numeric_normalize_types, -) -from cudf.utils.utils import cached_property, search_range - - -class BaseIndex(SingleColumnFrame, Serializable): - """Base class for all cudf Index types.""" - - dtype: DtypeObj - - def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): - - if method == "__call__" and hasattr(cudf, ufunc.__name__): - func = getattr(cudf, ufunc.__name__) - return func(*inputs) - else: - return NotImplemented - - @cached_property - def _values(self) -> ColumnBase: - raise NotImplementedError - - def __getitem__(self, key): - raise NotImplementedError() - - def drop_duplicates(self, keep="first"): - """ - Return Index with duplicate values removed - - Parameters - ---------- - keep : {‘first’, ‘last’, False}, default ‘first’ - * ‘first’ : Drop duplicates except for the - first occurrence. - * ‘last’ : Drop duplicates except for the - last occurrence. - * False : Drop all duplicates. 
- - Returns - ------- - deduplicated : Index - - Examples - -------- - >>> import cudf - >>> idx = cudf.Index(['lama', 'cow', 'lama', 'beetle', 'lama', 'hippo']) - >>> idx - StringIndex(['lama' 'cow' 'lama' 'beetle' 'lama' 'hippo'], dtype='object') - >>> idx.drop_duplicates() - StringIndex(['beetle' 'cow' 'hippo' 'lama'], dtype='object') - """ # noqa: E501 - return super().drop_duplicates(keep=keep) - - def serialize(self): - header = {} - header["index_column"] = {} - # store metadata values of index separately - # Indexes: Numerical/DateTime/String are often GPU backed - header["index_column"], frames = self._values.serialize() - - header["name"] = pickle.dumps(self.name) - header["dtype"] = pickle.dumps(self.dtype) - header["type-serialized"] = pickle.dumps(type(self)) - header["frame_count"] = len(frames) - return header, frames - - def __contains__(self, item): - return item in self._values - - @annotate("INDEX_EQUALS", color="green", domain="cudf_python") - def equals(self, other, **kwargs): - """ - Determine if two Index objects contain the same elements. - - Returns - ------- - out: bool - True if “other” is an Index and it has the same elements - as calling index; False otherwise. - """ - if not isinstance(other, BaseIndex): - return False - - check_types = False - - self_is_categorical = isinstance(self, CategoricalIndex) - other_is_categorical = isinstance(other, CategoricalIndex) - if self_is_categorical and not other_is_categorical: - other = other.astype(self.dtype) - check_types = True - elif other_is_categorical and not self_is_categorical: - self = self.astype(other.dtype) - check_types = True - - try: - return super().equals(other, check_types=check_types) - except TypeError: - return False - - def get_level_values(self, level): - """ - Return an Index of values for requested level. - - This is primarily useful to get an individual level of values from a - MultiIndex, but is provided on Index as well for compatibility. 
- - Parameters - ---------- - level : int or str - It is either the integer position or the name of the level. - - Returns - ------- - Index - Calling object, as there is only one level in the Index. - - See Also - -------- - cudf.core.multiindex.MultiIndex.get_level_values : Get values for - a level of a MultiIndex. - - Notes - ----- - For Index, level should be 0, since there are no multiple levels. - - Examples - -------- - >>> import cudf - >>> idx = cudf.Index(["a", "b", "c"]) - >>> idx.get_level_values(0) - StringIndex(['a' 'b' 'c'], dtype='object') - """ - - if level == self.name: - return self - elif is_integer(level): - if level != 0: - raise IndexError( - f"Cannot get level: {level} " f"for index with 1 level" - ) - return self - else: - raise KeyError(f"Requested level with name {level} " "not found") - - @classmethod - def deserialize(cls, header, frames): - h = header["index_column"] - idx_typ = pickle.loads(header["type-serialized"]) - name = pickle.loads(header["name"]) - - col_typ = pickle.loads(h["type-serialized"]) - index = col_typ.deserialize(h, frames[: header["frame_count"]]) - return idx_typ(index, name=name) - - @property - def names(self): - """ - Returns a tuple containing the name of the Index. - """ - return (self.name,) - - @names.setter - def names(self, values): - if not is_list_like(values): - raise ValueError("Names must be a list-like") - - num_values = len(values) - if num_values > 1: - raise ValueError( - "Length of new names must be 1, got %d" % num_values - ) - - self.name = values[0] - - def dropna(self, how="any"): - """ - Return an Index with null values removed. - - Parameters - ---------- - how : {‘any’, ‘all’}, default ‘any’ - If the Index is a MultiIndex, drop the value when any or - all levels are NaN. 
- - Returns - ------- - valid : Index - - Examples - -------- - >>> import cudf - >>> index = cudf.Index(['a', None, 'b', 'c']) - >>> index - StringIndex(['a' None 'b' 'c'], dtype='object') - >>> index.dropna() - StringIndex(['a' 'b' 'c'], dtype='object') - - Using `dropna` on a `MultiIndex`: - - >>> midx = cudf.MultiIndex( - ... levels=[[1, None, 4, None], [1, 2, 5]], - ... codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], - ... names=["x", "y"], - ... ) - >>> midx - MultiIndex([( 1, 1), - ( 1, 5), - (, 2), - ( 4, 2), - (, 1)], - names=['x', 'y']) - >>> midx.dropna() - MultiIndex([(1, 1), - (1, 5), - (4, 2)], - names=['x', 'y']) - """ - return super().dropna(how=how) - - def _clean_nulls_from_index(self): - """ - Convert all na values(if any) in Index object - to `` as a preprocessing step to `__repr__` methods. - - This will involve changing type of Index object - to StringIndex but it is the responsibility of the `__repr__` - methods using this method to replace or handle representation - of the actual types correctly. - """ - if self._values.has_nulls: - return cudf.Index( - self._values.astype("str").fillna(cudf._NA_REP), name=self.name - ) - else: - return self - - @property - def nlevels(self): - """ - Number of levels. - """ - return 1 - - def _set_names(self, names, inplace=False): - if inplace: - idx = self - else: - idx = self.copy(deep=False) - - idx.names = names - if not inplace: - return idx - - def set_names(self, names, level=None, inplace=False): - """ - Set Index or MultiIndex name. - Able to set new names partially and by level. - - Parameters - ---------- - names : label or list of label - Name(s) to set. - level : int, label or list of int or label, optional - If the index is a MultiIndex, level(s) to set (None for all - levels). Otherwise level must be None. - inplace : bool, default False - Modifies the object directly, instead of creating a new Index or - MultiIndex. 
- - Returns - ------- - Index - The same type as the caller or None if inplace is True. - - See Also - -------- - cudf.Index.rename : Able to set new names without level. - - Examples - -------- - >>> import cudf - >>> idx = cudf.Index([1, 2, 3, 4]) - >>> idx - Int64Index([1, 2, 3, 4], dtype='int64') - >>> idx.set_names('quarter') - Int64Index([1, 2, 3, 4], dtype='int64', name='quarter') - >>> idx = cudf.MultiIndex.from_product([['python', 'cobra'], - ... [2018, 2019]]) - >>> idx - MultiIndex([('python', 2018), - ('python', 2019), - ( 'cobra', 2018), - ( 'cobra', 2019)], - ) - >>> idx.names - FrozenList([None, None]) - >>> idx.set_names(['kind', 'year'], inplace=True) - >>> idx.names - FrozenList(['kind', 'year']) - >>> idx.set_names('species', level=0, inplace=True) - >>> idx.names - FrozenList(['species', 'year']) - """ - if level is not None: - raise ValueError("Level must be None for non-MultiIndex") - - if not is_list_like(names): - names = [names] - - return self._set_names(names=names, inplace=inplace) - - def fillna(self, value, downcast=None): - """ - Fill null values with the specified value. - - Parameters - ---------- - value : scalar - Scalar value to use to fill nulls. This value cannot be a - list-likes. - - downcast : dict, default is None - This Parameter is currently NON-FUNCTIONAL. - - Returns - ------- - filled : Index - - Examples - -------- - >>> import cudf - >>> index = cudf.Index([1, 2, None, 4]) - >>> index - Int64Index([1, 2, null, 4], dtype='int64') - >>> index.fillna(3) - Int64Index([1, 2, 3, 4], dtype='int64') - """ - if downcast is not None: - raise NotImplementedError( - "`downcast` parameter is not yet supported" - ) - - return super().fillna(value=value) - - def take(self, indices): - """Gather only the specific subset of indices - - Parameters - ---------- - indices: An array-like that maps to values contained in this Index. 
- """ - return self[indices] - - def argsort(self, ascending=True, **kwargs): - """ - Return the integer indices that would sort the index. - - Parameters - ---------- - ascending : bool, default True - If True, returns the indices for ascending order. - If False, returns the indices for descending order. - - Returns - ------- - array : A cupy array containing Integer indices that - would sort the index if used as an indexer. - - Examples - -------- - >>> import cudf - >>> index = cudf.Index([10, 100, 1, 1000]) - >>> index - Int64Index([10, 100, 1, 1000], dtype='int64') - >>> index.argsort() - array([2, 0, 1, 3], dtype=int32) - - The order of argsort can be reversed using - ``ascending`` parameter, by setting it to ``False``. - >>> index.argsort(ascending=False) - array([3, 1, 0, 2], dtype=int32) - - ``argsort`` on a MultiIndex: - - >>> index = cudf.MultiIndex( - ... levels=[[1, 3, 4, -10], [1, 11, 5]], - ... codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], - ... names=["x", "y"], - ... ) - >>> index - MultiIndex([( 1, 1), - ( 1, 5), - ( 3, 11), - ( 4, 11), - (-10, 1)], - names=['x', 'y']) - >>> index.argsort() - array([4, 0, 1, 2, 3], dtype=int32) - >>> index.argsort(ascending=False) - array([3, 2, 1, 0, 4], dtype=int32) - """ - indices = self._values.argsort(ascending=ascending, **kwargs) - return cupy.asarray(indices) - - def to_frame(self, index=True, name=None): - """Create a DataFrame with a column containing this Index - - Parameters - ---------- - index : boolean, default True - Set the index of the returned DataFrame as the original Index - name : str, default None - Name to be used for the column - - Returns - ------- - DataFrame - cudf DataFrame - """ - - if name is not None: - col_name = name - elif self.name is None: - col_name = 0 - else: - col_name = self.name - return cudf.DataFrame( - {col_name: self._values}, index=self if index else None - ) - - def any(self): - """ - Return whether any elements is True in Index. 
- """ - return self._values.any() - - def to_pandas(self): - """ - Convert to a Pandas Index. - - Examples - -------- - >>> import cudf - >>> idx = cudf.Index([-3, 10, 15, 20]) - >>> idx - Int64Index([-3, 10, 15, 20], dtype='int64') - >>> idx.to_pandas() - Int64Index([-3, 10, 15, 20], dtype='int64') - >>> type(idx.to_pandas()) - - >>> type(idx) - - """ - return pd.Index(self._values.to_pandas(), name=self.name) - - @ioutils.doc_to_dlpack() - def to_dlpack(self): - """{docstring}""" - - return cudf.io.dlpack.to_dlpack(self) - - @property - def gpu_values(self): - """ - View the data as a numba device array object - """ - return self._values.data_array_view - - @classmethod - def _concat(cls, objs): - if all(isinstance(obj, RangeIndex) for obj in objs): - result = _concat_range_index(objs) - else: - data = concat_columns([o._values for o in objs]) - result = as_index(data) - - names = {obj.name for obj in objs} - if len(names) == 1: - [name] = names - else: - name = None - - result.name = name - return result - - def append(self, other): - """ - Append a collection of Index options together. 
- - Parameters - ---------- - other : Index or list/tuple of indices - - Returns - ------- - appended : Index - - Examples - -------- - >>> import cudf - >>> idx = cudf.Index([1, 2, 10, 100]) - >>> idx - Int64Index([1, 2, 10, 100], dtype='int64') - >>> other = cudf.Index([200, 400, 50]) - >>> other - Int64Index([200, 400, 50], dtype='int64') - >>> idx.append(other) - Int64Index([1, 2, 10, 100, 200, 400, 50], dtype='int64') - - append accepts list of Index objects - - >>> idx.append([other, other]) - Int64Index([1, 2, 10, 100, 200, 400, 50, 200, 400, 50], dtype='int64') - """ - - if is_list_like(other): - to_concat = [self] - to_concat.extend(other) - else: - this = self - if len(other) == 0: - # short-circuit and return a copy - to_concat = [self] - - other = as_index(other) - - if len(self) == 0: - to_concat = [other] - - if len(self) and len(other): - if is_mixed_with_object_dtype(this, other): - got_dtype = ( - other.dtype - if this.dtype == cudf.dtype("object") - else this.dtype - ) - raise TypeError( - f"cudf does not support appending an Index of " - f"dtype `{cudf.dtype('object')}` with an Index " - f"of dtype `{got_dtype}`, please type-cast " - f"either one of them to same dtypes." - ) - - if isinstance(self._values, cudf.core.column.NumericalColumn): - if self.dtype != other.dtype: - this, other = numeric_normalize_types(self, other) - to_concat = [this, other] - - for obj in to_concat: - if not isinstance(obj, BaseIndex): - raise TypeError("all inputs must be Index") - - return self._concat(to_concat) - - def difference(self, other, sort=None): - """ - Return a new Index with elements from the index that are not in - `other`. - - This is the set difference of two Index objects. - - Parameters - ---------- - other : Index or array-like - sort : False or None, default None - Whether to sort the resulting index. By default, the - values are attempted to be sorted, but any TypeError from - incomparable elements is caught by cudf. 
- - * None : Attempt to sort the result, but catch any TypeErrors - from comparing incomparable elements. - * False : Do not sort the result. - - Returns - ------- - difference : Index - - Examples - -------- - >>> import cudf - >>> idx1 = cudf.Index([2, 1, 3, 4]) - >>> idx1 - Int64Index([2, 1, 3, 4], dtype='int64') - >>> idx2 = cudf.Index([3, 4, 5, 6]) - >>> idx2 - Int64Index([3, 4, 5, 6], dtype='int64') - >>> idx1.difference(idx2) - Int64Index([1, 2], dtype='int64') - >>> idx1.difference(idx2, sort=False) - Int64Index([2, 1], dtype='int64') - """ - if sort not in {None, False}: - raise ValueError( - f"The 'sort' keyword only takes the values " - f"of None or False; {sort} was passed." - ) - - other = as_index(other) - - if is_mixed_with_object_dtype(self, other): - difference = self.copy() - else: - difference = self.join(other, how="leftanti") - if self.dtype != other.dtype: - difference = difference.astype(self.dtype) - - if sort is None: - return difference.sort_values() - - return difference - - def sort_values(self, return_indexer=False, ascending=True, key=None): - """ - Return a sorted copy of the index, and optionally return the indices - that sorted the index itself. - - Parameters - ---------- - return_indexer : bool, default False - Should the indices that would sort the index be returned. - ascending : bool, default True - Should the index values be sorted in an ascending order. - key : None, optional - This parameter is NON-FUNCTIONAL. - - Returns - ------- - sorted_index : Index - Sorted copy of the index. - indexer : cupy.ndarray, optional - The indices that the index itself was sorted by. - - See Also - -------- - cudf.Series.min : Sort values of a Series. - cudf.DataFrame.sort_values : Sort values in a DataFrame. - - Examples - -------- - >>> import cudf - >>> idx = cudf.Index([10, 100, 1, 1000]) - >>> idx - Int64Index([10, 100, 1, 1000], dtype='int64') - - Sort values in ascending order (default behavior). 
- - >>> idx.sort_values() - Int64Index([1, 10, 100, 1000], dtype='int64') - - Sort values in descending order, and also get the indices `idx` was - sorted by. - - >>> idx.sort_values(ascending=False, return_indexer=True) - (Int64Index([1000, 100, 10, 1], dtype='int64'), array([3, 1, 0, 2], - dtype=int32)) - - Sorting values in a MultiIndex: - - >>> midx = cudf.MultiIndex( - ... levels=[[1, 3, 4, -10], [1, 11, 5]], - ... codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], - ... names=["x", "y"], - ... ) - >>> midx - MultiIndex([( 1, 1), - ( 1, 5), - ( 3, 11), - ( 4, 11), - (-10, 1)], - names=['x', 'y']) - >>> midx.sort_values() - MultiIndex([(-10, 1), - ( 1, 1), - ( 1, 5), - ( 3, 11), - ( 4, 11)], - names=['x', 'y']) - >>> midx.sort_values(ascending=False) - MultiIndex([( 4, 11), - ( 3, 11), - ( 1, 5), - ( 1, 1), - (-10, 1)], - names=['x', 'y']) - """ - if key is not None: - raise NotImplementedError("key parameter is not yet implemented.") - - indices = self._values.argsort(ascending=ascending) - index_sorted = as_index(self.take(indices), name=self.name) - - if return_indexer: - return index_sorted, cupy.asarray(indices) - else: - return index_sorted - - def unique(self): - """ - Return unique values in the index. - - Returns - ------- - Index without duplicates - """ - return as_index(self._values.unique(), name=self.name) - - def join( - self, other, how="left", level=None, return_indexers=False, sort=False - ): - """ - Compute join_index and indexers to conform data structures - to the new index. - - Parameters - ---------- - other : Index. - how : {'left', 'right', 'inner', 'outer'} - return_indexers : bool, default False - sort : bool, default False - Sort the join keys lexicographically in the result Index. If False, - the order of the join keys depends on the join type (how keyword). - - Returns: index - - Examples - -------- - >>> import cudf - >>> lhs = cudf.DataFrame( - ... {"a":[2, 3, 1], "b":[3, 4, 2]}).set_index(['a', 'b'] - ... 
).index - >>> lhs - MultiIndex([(2, 3), - (3, 4), - (1, 2)], - names=['a', 'b']) - >>> rhs = cudf.DataFrame({"a":[1, 4, 3]}).set_index('a').index - >>> rhs - Int64Index([1, 4, 3], dtype='int64', name='a') - >>> lhs.join(rhs, how='inner') - MultiIndex([(3, 4), - (1, 2)], - names=['a', 'b']) - """ - - if isinstance(self, cudf.MultiIndex) and isinstance( - other, cudf.MultiIndex - ): - raise TypeError( - "Join on level between two MultiIndex objects is ambiguous" - ) - - if level is not None and not is_scalar(level): - raise ValueError("level should be an int or a label only") - - if isinstance(other, cudf.MultiIndex): - if how == "left": - how = "right" - elif how == "right": - how = "left" - rhs = self.copy(deep=False) - lhs = other.copy(deep=False) - else: - lhs = self.copy(deep=False) - rhs = other.copy(deep=False) - - on = level - # In case of MultiIndex, it will be None as - # we don't need to update name - left_names = lhs.names - right_names = rhs.names - # There should be no `None` values in Joined indices, - # so essentially it would be `left/right` or 'inner' - # in case of MultiIndex - if isinstance(lhs, cudf.MultiIndex): - if level is not None and isinstance(level, int): - on = lhs._data.select_by_index(level).names[0] - right_names = (on,) or right_names - on = right_names[0] - if how == "outer": - how = "left" - elif how == "right": - how = "inner" - else: - # Both are nomal indices - right_names = left_names - on = right_names[0] - - lhs.names = left_names - rhs.names = right_names - - output = lhs._merge(rhs, how=how, on=on, sort=sort) - - return output - - def rename(self, name, inplace=False): - """ - Alter Index name. - - Defaults to returning new index. - - Parameters - ---------- - name : label - Name(s) to set. 
- - Returns - ------- - Index - - Examples - -------- - >>> import cudf - >>> index = cudf.Index([1, 2, 3], name='one') - >>> index - Int64Index([1, 2, 3], dtype='int64', name='one') - >>> index.name - 'one' - >>> renamed_index = index.rename('two') - >>> renamed_index - Int64Index([1, 2, 3], dtype='int64', name='two') - >>> renamed_index.name - 'two' - """ - if inplace is True: - self.name = name - return None - else: - out = self.copy(deep=False) - out.name = name - return out.copy(deep=True) - - def astype(self, dtype, copy=False): - """ - Create an Index with values cast to dtypes. The class of a new Index - is determined by dtype. When conversion is impossible, a ValueError - exception is raised. - - Parameters - ---------- - dtype : numpy dtype - Use a numpy.dtype to cast entire Index object to. - copy : bool, default False - By default, astype always returns a newly allocated object. - If copy is set to False and internal requirements on dtype are - satisfied, the original data is used to create a new Index - or the original Index is returned. - - Returns - ------- - Index - Index with values cast to specified dtype. - - Examples - -------- - >>> import cudf - >>> index = cudf.Index([1, 2, 3]) - >>> index - Int64Index([1, 2, 3], dtype='int64') - >>> index.astype('float64') - Float64Index([1.0, 2.0, 3.0], dtype='float64') - """ - if is_dtype_equal(dtype, self.dtype): - return self.copy(deep=copy) - - return as_index( - self.copy(deep=copy)._values.astype(dtype), name=self.name - ) - - def to_array(self, fillna=None): - """Get a dense numpy array for the data. - - Parameters - ---------- - fillna : str or None - Defaults to None, which will skip null values. - If it equals "pandas", null values are filled with NaNs. - Non integral dtype is promoted to np.float64. - - Notes - ----- - - if ``fillna`` is ``None``, null values are skipped. Therefore, the - output size could be smaller. 
- """ - return self._values.to_array(fillna=fillna) - - def to_series(self, index=None, name=None): - """ - Create a Series with both index and values equal to the index keys. - Useful with map for returning an indexer based on an index. - - Parameters - ---------- - index : Index, optional - Index of resulting Series. If None, defaults to original index. - name : str, optional - Dame of resulting Series. If None, defaults to name of original - index. - - Returns - ------- - Series - The dtype will be based on the type of the Index values. - """ - return cudf.Series( - self._values, - index=self.copy(deep=False) if index is None else index, - name=self.name if name is None else name, - ) - - def get_slice_bound(self, label, side, kind): - """ - Calculate slice bound that corresponds to given label. - Returns leftmost (one-past-the-rightmost if ``side=='right'``) position - of given label. - - Parameters - ---------- - label : object - side : {'left', 'right'} - kind : {'ix', 'loc', 'getitem'} - - Returns - ------- - int - Index of label. 
- """ - raise (NotImplementedError) - - def __array_function__(self, func, types, args, kwargs): - - # check if the function is implemented for the current type - cudf_index_module = type(self) - for submodule in func.__module__.split(".")[1:]: - # point cudf_index_module to the correct submodule - if hasattr(cudf_index_module, submodule): - cudf_index_module = getattr(cudf_index_module, submodule) - else: - return NotImplemented - - fname = func.__name__ - - handled_types = [Index, cudf.Series] - - # check if we don't handle any of the types (including sub-class) - for t in types: - if not any( - issubclass(t, handled_type) for handled_type in handled_types - ): - return NotImplemented - - if hasattr(cudf_index_module, fname): - cudf_func = getattr(cudf_index_module, fname) - # Handle case if cudf_func is same as numpy function - if cudf_func is func: - return NotImplemented - else: - return cudf_func(*args, **kwargs) - - else: - return NotImplemented - - def isin(self, values): - """Return a boolean array where the index values are in values. - - Compute boolean array of whether each index value is found in - the passed set of values. The length of the returned boolean - array matches the length of the index. - - Parameters - ---------- - values : set, list-like, Index - Sought values. - - Returns - ------- - is_contained : cupy array - CuPy array of boolean values. - - Examples - -------- - >>> idx = cudf.Index([1,2,3]) - >>> idx - Int64Index([1, 2, 3], dtype='int64') - - Check whether each index value in a list of values. - - >>> idx.isin([1, 4]) - array([ True, False, False]) - """ - - return self._values.isin(values).values - - def where(self, cond, other=None): - """ - Replace values where the condition is False. - - Parameters - ---------- - cond : bool array-like with the same length as self - Where cond is True, keep the original value. - Where False, replace with corresponding value from other. - Callables are not supported. 
- other: scalar, or array-like - Entries where cond is False are replaced with - corresponding value from other. Callables are not - supported. Default is None. - - Returns - ------- - Same type as caller - - Examples - -------- - >>> import cudf - >>> index = cudf.Index([4, 3, 2, 1, 0]) - >>> index - Int64Index([4, 3, 2, 1, 0], dtype='int64') - >>> index.where(index > 2, 15) - Int64Index([4, 3, 15, 15, 15], dtype='int64') - """ - return super().where(cond=cond, other=other) - - def memory_usage(self, deep=False): - """ - Memory usage of the values. - - Parameters - ---------- - deep : bool - Introspect the data deeply, - interrogate `object` dtypes for system-level - memory consumption. - - Returns - ------- - bytes used - """ - return self._values._memory_usage(deep=deep) - - def get_loc(self, key, method=None, tolerance=None): - """Get integer location, slice or boolean mask for requested label. - - Parameters - ---------- - key : label - method : {None, 'pad'/'fill', 'backfill'/'bfill', 'nearest'}, optional - - default: exact matches only. - - pad / ffill: find the PREVIOUS index value if no exact match. - - backfill / bfill: use NEXT index value if no exact match. - - nearest: use the NEAREST index value if no exact match. Tied - distances are broken by preferring the larger index - value. - tolerance : int or float, optional - Maximum distance from index value for inexact matches. The value - of the index at the matching location must satisfy the equation - ``abs(index[loc] - key) <= tolerance``. 
- - Returns - ------- - int or slice or boolean mask - - If result is unique, return integer index - - If index is monotonic, loc is returned as a slice object - - Otherwise, a boolean mask is returned - - Examples - -------- - >>> unique_index = cudf.Index(list('abc')) - >>> unique_index.get_loc('b') - 1 - >>> monotonic_index = cudf.Index(list('abbc')) - >>> monotonic_index.get_loc('b') - slice(1, 3, None) - >>> non_monotonic_index = cudf.Index(list('abcb')) - >>> non_monotonic_index.get_loc('b') - array([False, True, False, True]) - >>> numeric_unique_index = cudf.Index([1, 2, 3]) - >>> numeric_unique_index.get_loc(3) - 2 - """ - if tolerance is not None: - raise NotImplementedError( - "Parameter tolerance is unsupported yet." - ) - if method not in { - None, - "ffill", - "bfill", - "pad", - "backfill", - "nearest", - }: - raise ValueError( - f"Invalid fill method. Expecting pad (ffill), backfill (bfill)" - f" or nearest. Got {method}" - ) - - is_sorted = ( - self.is_monotonic_increasing or self.is_monotonic_decreasing - ) - - if not is_sorted and method is not None: - raise ValueError( - "index must be monotonic increasing or decreasing if `method`" - "is specified." 
- ) - - key_as_table = Table({"None": as_column(key, length=1)}) - lower_bound, upper_bound, sort_inds = self._lexsorted_equal_range( - key_as_table, is_sorted - ) - - if lower_bound == upper_bound: - # Key not found, apply method - if method in ("pad", "ffill"): - if lower_bound == 0: - raise KeyError(key) - return lower_bound - 1 - elif method in ("backfill", "bfill"): - if lower_bound == self._data.nrows: - raise KeyError(key) - return lower_bound - elif method == "nearest": - if lower_bound == self._data.nrows: - return lower_bound - 1 - elif lower_bound == 0: - return 0 - lower_val = self._column.element_indexing(lower_bound - 1) - upper_val = self._column.element_indexing(lower_bound) - return ( - lower_bound - 1 - if abs(lower_val - key) < abs(upper_val - key) - else lower_bound - ) - else: - raise KeyError(key) - - if lower_bound + 1 == upper_bound: - # Search result is unique, return int. - return ( - lower_bound - if is_sorted - else sort_inds.element_indexing(lower_bound) - ) - - if is_sorted: - # In monotonic index, lex search result is continuous. A slice for - # the range is returned. - return slice(lower_bound, upper_bound) - - # Not sorted and not unique. Return a boolean mask - mask = cupy.full(self._data.nrows, False) - true_inds = sort_inds.slice(lower_bound, upper_bound).to_gpu_array() - mask[cupy.array(true_inds)] = True - return mask - - def _lexsorted_equal_range( - self, key_as_table: Table, is_sorted: bool - ) -> Tuple[int, int, Optional[ColumnBase]]: - """Get equal range for key in lexicographically sorted index. If index - is not sorted when called, a sort will take place and `sort_inds` is - returned. Otherwise `None` is returned in that position. 
- """ - if not is_sorted: - sort_inds = self._get_sorted_inds() - sort_vals = self._gather(sort_inds) - else: - sort_inds = None - sort_vals = self - lower_bound = search_sorted( - sort_vals, key_as_table, side="left" - ).element_indexing(0) - upper_bound = search_sorted( - sort_vals, key_as_table, side="right" - ).element_indexing(0) - - return lower_bound, upper_bound, sort_inds - - @classmethod - def from_pandas(cls, index, nan_as_null=None): - """ - Convert from a Pandas Index. - - Parameters - ---------- - index : Pandas Index object - A Pandas Index object which has to be converted - to cuDF Index. - nan_as_null : bool, Default None - If ``None``/``True``, converts ``np.nan`` values - to ``null`` values. - If ``False``, leaves ``np.nan`` values as is. - - Raises - ------ - TypeError for invalid input type. - - Examples - -------- - >>> import cudf - >>> import pandas as pd - >>> import numpy as np - >>> data = [10, 20, 30, np.nan] - >>> pdi = pd.Index(data) - >>> cudf.Index.from_pandas(pdi) - Float64Index([10.0, 20.0, 30.0, ], dtype='float64') - >>> cudf.Index.from_pandas(pdi, nan_as_null=False) - Float64Index([10.0, 20.0, 30.0, nan], dtype='float64') - """ - if not isinstance(index, pd.Index): - raise TypeError("not a pandas.Index") + TimeDeltaColumn, + arange, + column, +) +from cudf.core.column.column import as_column, concat_columns +from cudf.core.column.string import StringMethods as StringMethods +from cudf.core.dtypes import IntervalDtype +from cudf.core.frame import Frame, SingleColumnFrame +from cudf.utils.docutils import copy_docstring +from cudf.utils.dtypes import ( + _is_non_decimal_numeric_dtype, + find_common_type, + is_categorical_dtype, + is_interval_dtype, +) +from cudf.utils.utils import cached_property, search_range - ind = as_index(column.as_column(index, nan_as_null=nan_as_null)) - ind.name = index.name - return ind +T = TypeVar("T", bound="Frame") - @classmethod - def _from_data( - cls, - data: MutableMapping, - index: 
Optional[BaseIndex] = None, - name: Any = None, - ) -> BaseIndex: - assert index is None - if not isinstance(data, cudf.core.column_accessor.ColumnAccessor): - data = cudf.core.column_accessor.ColumnAccessor(data) - if len(data) == 0: - raise ValueError("Cannot construct Index from any empty Table") - if len(data) == 1: - values = next(iter(data.values())) - - if isinstance(values, NumericalColumn): - try: - index_class_type = _dtype_to_index[values.dtype.type] - except KeyError: - index_class_type = GenericIndex - out = super(BaseIndex, index_class_type).__new__( - index_class_type - ) - elif isinstance(values, DatetimeColumn): - out = super(BaseIndex, DatetimeIndex).__new__(DatetimeIndex) - elif isinstance(values, TimeDeltaColumn): - out = super(BaseIndex, TimedeltaIndex).__new__(TimedeltaIndex) - elif isinstance(values, StringColumn): - out = super(BaseIndex, StringIndex).__new__(StringIndex) - elif isinstance(values, CategoricalColumn): - out = super(BaseIndex, CategoricalIndex).__new__( - CategoricalIndex - ) - out._data = data - out._index = None - return out - else: - return cudf.MultiIndex._from_data(data) - @property - def _constructor_expanddim(self): - return cudf.MultiIndex +def _lexsorted_equal_range( + idx: Union[GenericIndex, cudf.MultiIndex], + key_as_table: Table, + is_sorted: bool, +) -> Tuple[int, int, Optional[ColumnBase]]: + """Get equal range for key in lexicographically sorted index. If index + is not sorted when called, a sort will take place and `sort_inds` is + returned. Otherwise `None` is returned in that position. 
+ """ + if not is_sorted: + sort_inds = idx._get_sorted_inds() + sort_vals = idx._gather(sort_inds) + else: + sort_inds = None + sort_vals = idx + lower_bound = search_sorted( + sort_vals, key_as_table, side="left" + ).element_indexing(0) + upper_bound = search_sorted( + sort_vals, key_as_table, side="right" + ).element_indexing(0) + + return lower_bound, upper_bound, sort_inds + + +def _index_from_data(data: MutableMapping, name: Any = None): + """Construct an index of the appropriate type from some data.""" + if len(data) == 0: + raise ValueError("Cannot construct Index from any empty Table") + if len(data) == 1: + values = next(iter(data.values())) + + if isinstance(values, NumericalColumn): + try: + index_class_type: Type[ + Union[GenericIndex, cudf.MultiIndex] + ] = _dtype_to_index[values.dtype.type] + except KeyError: + index_class_type = GenericIndex + elif isinstance(values, DatetimeColumn): + index_class_type = DatetimeIndex + elif isinstance(values, TimeDeltaColumn): + index_class_type = TimedeltaIndex + elif isinstance(values, StringColumn): + index_class_type = StringIndex + elif isinstance(values, CategoricalColumn): + index_class_type = CategoricalIndex + elif isinstance(values, IntervalColumn): + index_class_type = IntervalIndex + else: + index_class_type = cudf.MultiIndex + return index_class_type._from_data(data, None, name) class RangeIndex(BaseIndex): @@ -1371,6 +163,13 @@ def __init__( self._index = None self._name = name + def _copy_type_metadata( + self, other: Frame, include_index: bool = True + ) -> RangeIndex: + # There is no metadata to be copied for RangeIndex since it does not + # have an underlying column. 
+ return self + @property def name(self): """ @@ -1461,6 +260,9 @@ def copy(self, name=None, deep=False, dtype=None, names=None): start=self._start, stop=self._stop, step=self._step, name=name ) + def drop_duplicates(self, keep="first"): + return self + def __repr__(self): return ( f"{self.__class__.__name__}(start={self._start}, stop={self._stop}" @@ -1509,7 +311,7 @@ def equals(self, other): other._step, ): return True - return super().equals(other) + return cudf.Int64Index._from_data(self._data).equals(other) def serialize(self): header = {} @@ -1683,10 +485,102 @@ def __mul__(self, other): return RangeIndex( self.start * other, self.stop * other, self.step * other ) - return super().__mul__(other) + return self._as_int64().__mul__(other) + + def __rmul__(self, other): + # Multiplication is commutative. + return self.__mul__(other) + + def _as_int64(self): + # Convert self to an Int64Index. This method is used to perform ops + # that are not defined directly on RangeIndex. + return cudf.Int64Index._from_data(self._data) + + def __getattr__(self, key): + # For methods that are not defined for RangeIndex we attempt to operate + # on the corresponding integer index if possible. 
+ try: + return getattr(self._as_int64(), key) + except AttributeError: + raise AttributeError( + f"'{type(self)}' object has no attribute {key}" + ) + + def get_loc(self, key, method=None, tolerance=None): + # Given an actual integer, + idx = (key - self._start) / self._step + idx_int_upper_bound = (self._stop - self._start) // self._step + if method is None: + if tolerance is not None: + raise ValueError( + "tolerance argument only valid if using pad, " + "backfill or nearest lookups" + ) + + if idx > idx_int_upper_bound or idx < 0: + raise KeyError(key) + + idx_int = (key - self._start) // self._step + if idx_int != idx: + raise KeyError(key) + return idx_int + + if (method == "ffill" and idx < 0) or ( + method == "bfill" and idx > idx_int_upper_bound + ): + raise KeyError(key) + + round_method = { + "ffill": math.floor, + "bfill": math.ceil, + "nearest": round, + }[method] + if tolerance is not None and (abs(idx) * self._step > tolerance): + raise KeyError(key) + return np.clip(round_method(idx), 0, idx_int_upper_bound, dtype=int) + + +# Patch in all binops and unary ops, which bypass __getattr__ on the instance +# and prevent the above overload from working. 
+for binop in ( + "__add__", + "__radd__", + "__sub__", + "__rsub__", + "__mod__", + "__rmod__", + "__pow__", + "__rpow__", + "__floordiv__", + "__rfloordiv__", + "__truediv__", + "__rtruediv__", + "__and__", + "__or__", + "__xor__", + "__eq__", + "__ne__", + "__lt__", + "__le__", + "__gt__", + "__ge__", +): + setattr( + RangeIndex, + binop, + lambda self, other, op=binop: getattr(self._as_int64(), op)(other), + ) + + +for unaop in ("__neg__", "__pos__", "__abs__"): + setattr( + RangeIndex, + binop, + lambda self, op=unaop: getattr(self._as_int64(), op)(), + ) -class GenericIndex(BaseIndex): +class GenericIndex(SingleColumnFrame, BaseIndex): """ An array of orderable values that represent the indices of another Column @@ -1725,10 +619,118 @@ def __init__(self, data, **kwargs): name = kwargs.get("name") super().__init__({name: data}) + def drop_duplicates(self, keep="first"): + """ + Return Index with duplicate values removed + + Parameters + ---------- + keep : {‘first’, ‘last’, False}, default ‘first’ + * ‘first’ : Drop duplicates except for the + first occurrence. + * ‘last’ : Drop duplicates except for the + last occurrence. + * False : Drop all duplicates. + + Returns + ------- + Index + + Examples + -------- + >>> import cudf + >>> idx = cudf.Index(['lama', 'cow', 'lama', 'beetle', 'lama', 'hippo']) + >>> idx + StringIndex(['lama' 'cow' 'lama' 'beetle' 'lama' 'hippo'], dtype='object') + >>> idx.drop_duplicates() + StringIndex(['beetle' 'cow' 'hippo' 'lama'], dtype='object') + """ # noqa: E501 + return super().drop_duplicates(keep=keep) + + def _binaryop( + self, + other: T, + fn: str, + fill_value: Any = None, + reflect: bool = False, + *args, + **kwargs, + ) -> SingleColumnFrame: + # Specialize binops to generate the appropriate output index type. 
+ operands = self._make_operands_for_binop(other, fill_value, reflect) + return ( + _index_from_data(data=self._colwise_binop(operands, fn),) + if operands is not NotImplemented + else NotImplemented + ) + + def _copy_type_metadata( + self, other: Frame, include_index: bool = True + ) -> GenericIndex: + """ + Copy type metadata from each column of `other` to the corresponding + column of `self`. + See `ColumnBase._with_type_metadata` for more information. + """ + for name, col, other_col in zip( + self._data.keys(), self._data.values(), other._data.values() + ): + self._data.set_by_label( + name, col._with_type_metadata(other_col.dtype), validate=False + ) + return self + @property def _values(self): return self._column + @classmethod + def _concat(cls, objs): + if all(isinstance(obj, RangeIndex) for obj in objs): + result = _concat_range_index(objs) + else: + data = concat_columns([o._values for o in objs]) + result = as_index(data) + + names = {obj.name for obj in objs} + if len(names) == 1: + [name] = names + else: + name = None + + result.name = name + return result + + @annotate("INDEX_EQUALS", color="green", domain="cudf_python") + def equals(self, other, **kwargs): + """ + Determine if two Index objects contain the same elements. + + Returns + ------- + out: bool + True if “other” is an Index and it has the same elements + as calling index; False otherwise. 
+ """ + if not isinstance(other, BaseIndex): + return False + + check_types = False + + self_is_categorical = isinstance(self, CategoricalIndex) + other_is_categorical = isinstance(other, CategoricalIndex) + if self_is_categorical and not other_is_categorical: + other = other.astype(self.dtype) + check_types = True + elif other_is_categorical and not self_is_categorical: + self = self.astype(other.dtype) + check_types = True + + try: + return super().equals(other, check_types=check_types) + except TypeError: + return False + def copy(self, name=None, deep=False, dtype=None, names=None): """ Make a copy of this object. @@ -1753,7 +755,124 @@ def copy(self, name=None, deep=False, dtype=None, names=None): dtype = self.dtype if dtype is None else dtype name = self.name if name is None else name - return as_index(self._values.astype(dtype), name=name, copy=deep) + col = self._values.astype(dtype) + return _index_from_data({name: col.copy(True) if deep else col}) + + def get_loc(self, key, method=None, tolerance=None): + """Get integer location, slice or boolean mask for requested label. + + Parameters + ---------- + key : label + method : {None, 'pad'/'fill', 'backfill'/'bfill', 'nearest'}, optional + - default: exact matches only. + - pad / ffill: find the PREVIOUS index value if no exact match. + - backfill / bfill: use NEXT index value if no exact match. + - nearest: use the NEAREST index value if no exact match. Tied + distances are broken by preferring the larger index + value. + tolerance : int or float, optional + Maximum distance from index value for inexact matches. The value + of the index at the matching location must satisfy the equation + ``abs(index[loc] - key) <= tolerance``. 
+ + Returns + ------- + int or slice or boolean mask + - If result is unique, return integer index + - If index is monotonic, loc is returned as a slice object + - Otherwise, a boolean mask is returned + + Examples + -------- + >>> unique_index = cudf.Index(list('abc')) + >>> unique_index.get_loc('b') + 1 + >>> monotonic_index = cudf.Index(list('abbc')) + >>> monotonic_index.get_loc('b') + slice(1, 3, None) + >>> non_monotonic_index = cudf.Index(list('abcb')) + >>> non_monotonic_index.get_loc('b') + array([False, True, False, True]) + >>> numeric_unique_index = cudf.Index([1, 2, 3]) + >>> numeric_unique_index.get_loc(3) + 2 + """ + if tolerance is not None: + raise NotImplementedError( + "Parameter tolerance is unsupported yet." + ) + if method not in { + None, + "ffill", + "bfill", + "pad", + "backfill", + "nearest", + }: + raise ValueError( + f"Invalid fill method. Expecting pad (ffill), backfill (bfill)" + f" or nearest. Got {method}" + ) + + is_sorted = ( + self.is_monotonic_increasing or self.is_monotonic_decreasing + ) + + if not is_sorted and method is not None: + raise ValueError( + "index must be monotonic increasing or decreasing if `method`" + "is specified." 
+ ) + + key_as_table = Table({"None": as_column(key, length=1)}) + lower_bound, upper_bound, sort_inds = _lexsorted_equal_range( + self, key_as_table, is_sorted + ) + + if lower_bound == upper_bound: + # Key not found, apply method + if method in ("pad", "ffill"): + if lower_bound == 0: + raise KeyError(key) + return lower_bound - 1 + elif method in ("backfill", "bfill"): + if lower_bound == self._data.nrows: + raise KeyError(key) + return lower_bound + elif method == "nearest": + if lower_bound == self._data.nrows: + return lower_bound - 1 + elif lower_bound == 0: + return 0 + lower_val = self._column.element_indexing(lower_bound - 1) + upper_val = self._column.element_indexing(lower_bound) + return ( + lower_bound - 1 + if abs(lower_val - key) < abs(upper_val - key) + else lower_bound + ) + else: + raise KeyError(key) + + if lower_bound + 1 == upper_bound: + # Search result is unique, return int. + return ( + lower_bound + if is_sorted + else sort_inds.element_indexing(lower_bound) + ) + + if is_sorted: + # In monotonic index, lex search result is continuous. A slice for + # the range is returned. + return slice(lower_bound, upper_bound) + + # Not sorted and not unique. 
Return a boolean mask + mask = cupy.full(self._data.nrows, False) + true_inds = sort_inds.slice(lower_bound, upper_bound).to_gpu_array() + mask[cupy.array(true_inds)] = True + return mask def __sizeof__(self): return self._values.__sizeof__() @@ -3070,37 +2189,27 @@ def as_index(arbitrary, **kwargs) -> BaseIndex: idx = arbitrary.copy(deep=False) idx.rename(kwargs["name"], inplace=True) return idx - elif isinstance(arbitrary, NumericalColumn): - try: - return _dtype_to_index[arbitrary.dtype.type](arbitrary, **kwargs) - except KeyError: - return GenericIndex(arbitrary, **kwargs) - elif isinstance(arbitrary, StringColumn): - return StringIndex(arbitrary, **kwargs) - elif isinstance(arbitrary, DatetimeColumn): - return DatetimeIndex(arbitrary, **kwargs) - elif isinstance(arbitrary, TimeDeltaColumn): - return TimedeltaIndex(arbitrary, **kwargs) - elif isinstance(arbitrary, CategoricalColumn): - return CategoricalIndex(arbitrary, **kwargs) - elif isinstance(arbitrary, IntervalColumn): - return IntervalIndex(arbitrary, **kwargs) + elif isinstance(arbitrary, ColumnBase): + return _index_from_data({kwargs.get("name", None): arbitrary}) elif isinstance(arbitrary, cudf.Series): return as_index(arbitrary._column, **kwargs) - elif isinstance(arbitrary, pd.RangeIndex): - return RangeIndex(start=arbitrary.start, stop=arbitrary.stop, **kwargs) + elif isinstance(arbitrary, (pd.RangeIndex, range)): + return RangeIndex( + start=arbitrary.start, + stop=arbitrary.stop, + step=arbitrary.step, + **kwargs, + ) elif isinstance(arbitrary, pd.MultiIndex): return cudf.MultiIndex.from_pandas(arbitrary) elif isinstance(arbitrary, cudf.DataFrame): return cudf.MultiIndex(source_data=arbitrary) - elif isinstance(arbitrary, range): - return RangeIndex(arbitrary, **kwargs) return as_index( column.as_column(arbitrary, dtype=kwargs.get("dtype", None)), **kwargs ) -_dtype_to_index: Dict[Any, Type[BaseIndex]] = { +_dtype_to_index: Dict[Any, Type[NumericIndex]] = { np.int8: Int8Index, np.int16: 
Int16Index, np.int32: Int32Index, @@ -3180,9 +2289,9 @@ def __new__( tupleize_cols=True, **kwargs, ): - assert cls is Index, ( - "Index cannot be subclassed, extend BaseIndex " "instead." - ) + assert ( + cls is Index + ), "Index cannot be subclassed, extend BaseIndex instead." if tupleize_cols is not True: raise NotImplementedError( "tupleize_cols != True is not yet supported" @@ -3190,6 +2299,14 @@ def __new__( return as_index(data, copy=copy, dtype=dtype, name=name, **kwargs) + @classmethod + def from_arrow(cls, obj): + try: + return cls(ColumnBase.from_arrow(obj)) + except TypeError: + # Try interpreting object as a MultiIndex before failing. + return cudf.MultiIndex.from_arrow(obj) + def _concat_range_index(indexes: List[RangeIndex]) -> BaseIndex: """ diff --git a/python/cudf/cudf/core/join/_join_helpers.py b/python/cudf/cudf/core/join/_join_helpers.py index 78fc7a863d6..1d1f661779f 100644 --- a/python/cudf/cudf/core/join/_join_helpers.py +++ b/python/cudf/cudf/core/join/_join_helpers.py @@ -70,7 +70,9 @@ def _frame_select_by_indexers( else: data.set_by_label(idx.name, idx.get(frame), validate=False) - result_index = cudf.Index._from_data(index_data) if index_data else None + result_index = ( + cudf.core.index._index_from_data(index_data) if index_data else None + ) result = cudf.core.frame.Frame(data=data, index=result_index) return result diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 418d24f41df..3b364a3fa86 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -6,7 +6,7 @@ import pickle import warnings from collections.abc import Sequence -from typing import Any, List, Mapping, Tuple, Union +from typing import Any, List, MutableMapping, Optional, Tuple, Union import cupy import numpy as np @@ -18,12 +18,12 @@ from cudf._typing import DataFrameOrSeries from cudf.core._compat import PANDAS_GE_120 from cudf.core.column import as_column, column -from cudf.core.frame import 
SingleColumnFrame -from cudf.core.index import BaseIndex, as_index +from cudf.core.frame import Frame +from cudf.core.index import BaseIndex, _lexsorted_equal_range, as_index from cudf.utils.utils import _maybe_indices_to_slice -class MultiIndex(BaseIndex): +class MultiIndex(Frame, BaseIndex): """A multi-level or hierarchical index. Provides N-Dimensional indexing into Series and DataFrame objects. @@ -191,11 +191,6 @@ def names(self, value): ) self._names = pd.core.indexes.frozen.FrozenList(value) - @property - def _num_columns(self): - # MultiIndex is not a single-columned frame. - return super(SingleColumnFrame, self)._num_columns - def rename(self, names, inplace=False): """ Alter MultiIndex level names @@ -283,14 +278,18 @@ def set_names(self, names, level=None, inplace=False): return self._set_names(names=names, inplace=inplace) - # TODO: This type ignore is indicating a real problem, which is that - # MultiIndex should not be inheriting from SingleColumnFrame, but fixing - # that will have to wait until we reshuffle the Index hierarchy. @classmethod - def _from_data( # type: ignore - cls, data: Mapping, index=None + def _from_data( + cls, + data: MutableMapping, + index: Optional[cudf.core.index.BaseIndex] = None, + name: Any = None, ) -> MultiIndex: - return cls.from_frame(cudf.DataFrame._from_data(data)) + assert index is None + obj = cls.from_frame(cudf.DataFrame._from_data(data)) + if name is not None: + obj.name = name + return obj @property def shape(self): @@ -434,6 +433,15 @@ def deepcopy(self): def __copy__(self): return self.copy(deep=True) + def __iter__(self): + """ + Iterating over a GPU object is not effecient and hence not supported. + + Consider using ``.to_arrow()``, ``.to_pandas()`` or ``.values_host`` + if you wish to iterate over the values. + """ + cudf.utils.utils.raise_iteration_error(obj=self) + def _popn(self, n): """ Returns a copy of this index without the left-most n values. 
@@ -535,68 +543,6 @@ def __repr__(self): data_output = "\n".join(lines) return output_prefix + data_output - @classmethod - def from_arrow(cls, table): - """ - Convert PyArrow Table to MultiIndex - - Parameters - ---------- - table : PyArrow Table - PyArrow Object which has to be converted to MultiIndex - - Returns - ------- - cudf MultiIndex - - Examples - -------- - >>> import cudf - >>> import pyarrow as pa - >>> tbl = pa.table({"a":[1, 2, 3], "b":["a", "b", "c"]}) - >>> cudf.MultiIndex.from_arrow(tbl) - MultiIndex([(1, 'a'), - (2, 'b'), - (3, 'c')], - names=['a', 'b']) - """ - - return super(SingleColumnFrame, cls).from_arrow(table) - - def to_arrow(self): - """Convert MultiIndex to PyArrow Table - - Returns - ------- - PyArrow Table - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({"a":[1, 2, 3], "b":[2, 3, 4]}) - >>> mindex = cudf.Index(df) - >>> mindex - MultiIndex([(1, 2), - (2, 3), - (3, 4)], - names=['a', 'b']) - >>> mindex.to_arrow() - pyarrow.Table - a: int64 - b: int64 - >>> mindex.to_arrow()['a'] - - [ - [ - 1, - 2, - 3 - ] - ] - """ - - return super(SingleColumnFrame, self).to_arrow() - @property def codes(self): """ @@ -1401,7 +1347,7 @@ def _poplevels(self, level): popped_data[n] = self._data.pop(n) # construct the popped result - popped = cudf.Index._from_data(popped_data) + popped = cudf.core.index._index_from_data(popped_data) popped.names = popped_names # update self @@ -1548,6 +1494,18 @@ def is_unique(self): ) return self._is_unique + @property + def is_monotonic(self): + """Return boolean if values in the object are monotonic_increasing. + + This property is an alias for :attr:`is_monotonic_increasing`. 
+ + Returns + ------- + bool + """ + return self.is_monotonic_increasing + @property def is_monotonic_increasing(self): """ @@ -1853,11 +1811,9 @@ def get_loc(self, key, method=None, tolerance=None): partial_index = self.__class__._from_data( data=self._data.select_by_index(slice(key_as_table._num_columns)) ) - ( - lower_bound, - upper_bound, - sort_inds, - ) = partial_index._lexsorted_equal_range(key_as_table, is_sorted) + (lower_bound, upper_bound, sort_inds,) = _lexsorted_equal_range( + partial_index, key_as_table, is_sorted + ) if lower_bound == upper_bound: raise KeyError(key) diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 1b8405af1a4..392a251dfc4 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -386,7 +386,7 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): elif typ is cudf.MultiIndex: return cudf.MultiIndex._concat(objs) elif issubclass(typ, cudf.Index): - return cudf.Index._concat(objs) + return cudf.core.index.GenericIndex._concat(objs) else: raise TypeError(f"cannot concatenate object of type {typ}") diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 7943d033cf8..4fe5712f240 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -7,7 +7,7 @@ from collections import abc as abc from numbers import Number from shutil import get_terminal_size -from typing import Any, MutableMapping, Optional +from typing import Any, MutableMapping, Optional, Set from uuid import uuid4 import cupy @@ -39,7 +39,7 @@ from cudf.core.column_accessor import ColumnAccessor from cudf.core.frame import Frame, SingleColumnFrame, _drop_rows_by_labels from cudf.core.groupby.groupby import SeriesGroupBy -from cudf.core.index import BaseIndex, Index, RangeIndex, as_index +from cudf.core.index import BaseIndex, RangeIndex, as_index from cudf.core.indexing import _SeriesIlocIndexer, _SeriesLocIndexer from cudf.utils import 
cudautils, docutils from cudf.utils.docutils import copy_docstring @@ -105,6 +105,8 @@ class Series(SingleColumnFrame, Serializable): If ``False``, leaves ``np.nan`` values as is. """ + _accessors: Set[Any] = set() + # The `constructor*` properties are used by `dask` (and `dask_cudf`) @property def _constructor(self): @@ -1216,6 +1218,7 @@ def _binaryop( *args, **kwargs, ): + # Specialize binops to align indices. if isinstance(other, SingleColumnFrame): if ( # TODO: The can_reindex logic also needs to be applied for @@ -1238,8 +1241,14 @@ def _binaryop( else: lhs = self - # Note that we call the super on lhs, not self. - return super(Series, lhs)._binaryop(other, fn, fill_value, reflect) + operands = lhs._make_operands_for_binop(other, fill_value, reflect) + return ( + lhs._from_data( + data=lhs._colwise_binop(operands, fn), index=lhs._index, + ) + if operands is not NotImplemented + else NotImplemented + ) def add(self, other, fill_value=None, axis=0): """ @@ -2246,7 +2255,9 @@ def _concat(cls, objs, axis=0, index=True): if isinstance(objs[0].index, cudf.MultiIndex): index = cudf.MultiIndex._concat([o.index for o in objs]) else: - index = Index._concat([o.index for o in objs]) + index = cudf.core.index.GenericIndex._concat( + [o.index for o in objs] + ) names = {obj.name for obj in objs} if len(names) == 1: diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index f80bdec0ab5..29b39fbd195 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -672,11 +672,11 @@ def test_index_where(data, condition, other, error): else: assert_eq( ps.where(ps_condition, other=ps_other) - .fillna(gs._columns[0].default_na_value()) + .fillna(gs._values.default_na_value()) .values, gs.where(gs_condition, other=gs_other) .to_pandas() - .fillna(gs._columns[0].default_na_value()) + .fillna(gs._values.default_na_value()) .values, ) else: @@ -2098,6 +2098,35 @@ def test_get_loc_single_unique_numeric(idx, key, 
method): assert_eq(expected, got) +@pytest.mark.parametrize( + "idx", [pd.RangeIndex(3, 100, 4)], +) +@pytest.mark.parametrize("key", list(range(1, 110, 3))) +@pytest.mark.parametrize("method", [None, "ffill"]) +def test_get_loc_rangeindex(idx, key, method): + pi = idx + gi = cudf.from_pandas(pi) + + if ( + (key not in pi and method is None) + # Get key before the first element is KeyError + or (key < pi.start and method in "ffill") + # Get key after the last element is KeyError + or (key >= pi.stop and method in "bfill") + ): + assert_exceptions_equal( + lfunc=pi.get_loc, + rfunc=gi.get_loc, + lfunc_args_and_kwargs=([], {"key": key, "method": method}), + rfunc_args_and_kwargs=([], {"key": key, "method": method}), + ) + else: + expected = pi.get_loc(key, method=method) + got = gi.get_loc(key, method=method) + + assert_eq(expected, got) + + @pytest.mark.parametrize( "idx", [ From 549bcb7a0abace83e7f0b2ddac804598b3101b4c Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Tue, 31 Aug 2021 17:46:09 -0400 Subject: [PATCH 7/8] Import rapids-cmake modules using the correct cmake variable. (#9149) Use `rapids-cmake-dir` when computing the location of rapids-cmake modules. Corrects build errors that occur when `CUDF_BUILD_BENCHMARKS` is enabled. 
Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - David Wendt (https://github.com/davidwendt) - Mark Harris (https://github.com/harrism) URL: https://github.com/rapidsai/cudf/pull/9149 --- cpp/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 18af85c98e0..6e80d0b32fc 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -618,7 +618,7 @@ if(CUDF_BUILD_BENCHMARKS) "BENCHMARK_ENABLE_INSTALL OFF") # Find or install NVBench - include(${rapids-cmake}/cpm/nvbench.cmake) + include(${rapids-cmake-dir}/cpm/nvbench.cmake) rapids_cpm_nvbench() add_subdirectory(benchmarks) endif() From 1935a8a9de87152e70b7930c911e0a44da0560cc Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 31 Aug 2021 18:07:15 -0500 Subject: [PATCH 8/8] Add support for `get_group` in GroupBy (#9070) This PR adds `get_group` functionality to `GroupBy`. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Ashwin Srinath (https://github.com/shwina) URL: https://github.com/rapidsai/cudf/pull/9070 --- python/cudf/cudf/core/groupby/groupby.py | 37 ++++++++++++++++++ python/cudf/cudf/tests/test_groupby.py | 49 ++++++++++++++++++++++++ 2 files changed, 86 insertions(+) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index fd425d9de76..d98a78efb18 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -103,6 +103,43 @@ def groups(self): zip(group_names.to_pandas(), grouped_index._split(offsets[1:-1])) ) + def get_group(self, name, obj=None): + """ + Construct DataFrame from group with provided name. + + Parameters + ---------- + name : object + The name of the group to get as a DataFrame. + obj : DataFrame, default None + The DataFrame to take the DataFrame out of. If + it is None, the object groupby was called on will + be used. 
+ + Returns + ------- + group : same type as obj + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({"X": ["A", "B", "A", "B"], "Y": [1, 4, 3, 2]}) + >>> df + X Y + 0 A 1 + 1 B 4 + 2 A 3 + 3 B 2 + >>> df.groupby("X").get_group("A") + X Y + 0 A 1 + 2 A 3 + """ + if obj is None: + obj = self.obj + + return obj.loc[self.groups[name]] + def size(self): """ Return the size of each group. diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index df6a9336e97..7719df492f7 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -2114,3 +2114,52 @@ def foo(x): expect = make_frame(pd.DataFrame, 100).groupby("x").y.apply(foo) assert_groupby_results_equal(expect, got) + + +@pytest.mark.parametrize( + "pdf, group, name, obj", + [ + ( + pd.DataFrame({"X": ["A", "B", "A", "B"], "Y": [1, 4, 3, 2]}), + "X", + "A", + None, + ), + ( + pd.DataFrame({"X": ["A", "B", "A", "B"], "Y": [1, 4, 3, 2]}), + "X", + "B", + None, + ), + ( + pd.DataFrame({"X": ["A", "B", "A", "B"], "Y": [1, 4, 3, 2]}), + "X", + "A", + pd.DataFrame({"a": [1, 2, 4, 5, 10, 11]}), + ), + ( + pd.DataFrame({"X": ["A", "B", "A", "B"], "Y": [1, 4, 3, 2]}), + "Y", + 1, + pd.DataFrame({"a": [1, 2, 4, 5, 10, 11]}), + ), + ( + pd.DataFrame({"X": ["A", "B", "A", "B"], "Y": [1, 4, 3, 2]}), + "Y", + 3, + pd.DataFrame({"a": [1, 2, 0, 11]}), + ), + ], +) +def test_groupby_get_group(pdf, group, name, obj): + gdf = cudf.from_pandas(pdf) + + if isinstance(obj, pd.DataFrame): + gobj = cudf.from_pandas(obj) + else: + gobj = obj + + expected = pdf.groupby(group).get_group(name=name, obj=obj) + actual = gdf.groupby(group).get_group(name=name, obj=gobj) + + assert_groupby_results_equal(expected, actual)