diff --git a/conda/environments/cudf_dev_cuda10.1.yml b/conda/environments/cudf_dev_cuda10.1.yml index 76a9f8fd01d..3541ed1208c 100644 --- a/conda/environments/cudf_dev_cuda10.1.yml +++ b/conda/environments/cudf_dev_cuda10.1.yml @@ -17,7 +17,7 @@ dependencies: - python>=3.6,<3.8 - numba>=0.49.0,!=0.51.0 - numpy - - pandas>=1.0,<1.2.0dev0 + - pandas>=1.0,<1.3.0dev0 - pyarrow=1.0.1 - fastavro>=0.22.9 - notebook>=0.5.0 diff --git a/conda/environments/cudf_dev_cuda10.2.yml b/conda/environments/cudf_dev_cuda10.2.yml index a6a39ecdcba..839533516fb 100644 --- a/conda/environments/cudf_dev_cuda10.2.yml +++ b/conda/environments/cudf_dev_cuda10.2.yml @@ -17,7 +17,7 @@ dependencies: - python>=3.6,<3.8 - numba>=0.49,!=0.51.0 - numpy - - pandas>=1.0,<1.2.0dev0 + - pandas>=1.0,<1.3.0dev0 - pyarrow=1.0.1 - fastavro>=0.22.9 - notebook>=0.5.0 diff --git a/conda/environments/cudf_dev_cuda11.0.yml b/conda/environments/cudf_dev_cuda11.0.yml index 0afa36721c5..401eaea63da 100644 --- a/conda/environments/cudf_dev_cuda11.0.yml +++ b/conda/environments/cudf_dev_cuda11.0.yml @@ -17,7 +17,7 @@ dependencies: - python>=3.6,<3.8 - numba>=0.49,!=0.51.0 - numpy - - pandas>=1.0,<1.2.0dev0 + - pandas>=1.0,<1.3.0dev0 - pyarrow=1.0.1 - fastavro>=0.22.9 - notebook>=0.5.0 diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 85280181711..5635f54ba20 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -1,4 +1,4 @@ -# Copyright (c) 2018, NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %} {% set minor_version = version.split('.')[0] + '.' 
+ version.split('.')[1] %} @@ -35,7 +35,7 @@ requirements: - protobuf - python - typing_extensions - - pandas >=1.0,<1.2.0dev0 + - pandas >=1.0,<1.3.0dev0 - cupy >7.1.0,<9.0.0a0 - numba >=0.49.0 - numpy @@ -45,6 +45,7 @@ requirements: - fsspec>=0.6.0 - {{ pin_compatible('cudatoolkit', max_pin='x.x') }} - nvtx >=0.2.1 + - packaging - cachetools test: diff --git a/python/cudf/cudf/core/_compat.py b/python/cudf/cudf/core/_compat.py index e18a204eedb..0fedfcabb46 100644 --- a/python/cudf/cudf/core/_compat.py +++ b/python/cudf/cudf/core/_compat.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. import pandas as pd from packaging import version @@ -6,3 +6,4 @@ PANDAS_VERSION = version.parse(pd.__version__) PANDAS_GE_100 = PANDAS_VERSION >= version.parse("1.0") PANDAS_GE_110 = PANDAS_VERSION >= version.parse("1.1") +PANDAS_GE_120 = PANDAS_VERSION >= version.parse("1.2") diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 01c8dfb5f1b..c41a458f02b 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -9,6 +9,7 @@ Dict, Mapping, Optional, + Sequence, Tuple, Union, cast, @@ -867,6 +868,15 @@ def set_base_data(self, value): else: super().set_base_data(value) + def _process_values_for_isin( + self, values: Sequence + ) -> Tuple[ColumnBase, ColumnBase]: + lhs = self + # We need to convert values to same type as self, + # hence passing dtype=self.dtype + rhs = cudf.core.column.as_column(values, dtype=self.dtype) + return lhs, rhs + def set_base_mask(self, value: Optional[Buffer]): super().set_base_mask(value) self._codes = None @@ -936,6 +946,21 @@ def unary_operator(self, unaryop: str): ) def __setitem__(self, key, value): + if cudf.utils.dtypes.is_scalar( + value + ) and cudf._lib.scalar._is_null_host_scalar(value): + to_add_categories = 0 + else: + to_add_categories = len( + 
cudf.Index(value).difference(self.categories) + ) + + if to_add_categories > 0: + raise ValueError( + "Cannot setitem on a Categorical with a new " + "category, set the categories first" + ) + if cudf.utils.dtypes.is_scalar(value): value = self._encode(value) if value is not None else value else: @@ -1046,11 +1071,24 @@ def __cuda_array_interface__(self) -> Mapping[str, Any]: def to_pandas( self, index: ColumnLike = None, nullable: bool = False, **kwargs ) -> pd.Series: - signed_dtype = min_signed_type(len(self.categories)) - codes = self.cat().codes.astype(signed_dtype).fillna(-1).to_array() - categories = self.categories.to_pandas() + + if self.categories.dtype.kind == "f": + new_mask = bools_to_mask(self.notnull()) + col = column.build_categorical_column( + categories=self.categories, + codes=column.as_column(self.codes, dtype=self.codes.dtype), + mask=new_mask, + ordered=self.dtype.ordered, + size=self.codes.size, + ) + else: + col = self + + signed_dtype = min_signed_type(len(col.categories)) + codes = col.cat().codes.astype(signed_dtype).fillna(-1).to_array() + categories = col.categories.dropna(drop_nan=True).to_pandas() data = pd.Categorical.from_codes( - codes, categories=categories, ordered=self.ordered + codes, categories=categories, ordered=col.ordered ) return pd.Series(data, index=index) @@ -1180,6 +1218,38 @@ def find_and_replace( ordered=self.dtype.ordered, ) + def isnull(self) -> ColumnBase: + """ + Identify missing values in a CategoricalColumn. + """ + result = libcudf.unary.is_null(self) + + if self.categories.dtype.kind == "f": + # Need to consider `np.nan` values incase + # of an underlying float column + categories = libcudf.unary.is_nan(self.categories) + if categories.any(): + code = self._encode(np.nan) + result = result | (self.codes == cudf.Scalar(code)) + + return result + + def notnull(self) -> ColumnBase: + """ + Identify non-missing values in a CategoricalColumn. 
+ """ + result = libcudf.unary.is_valid(self) + + if self.categories.dtype.kind == "f": + # Need to consider `np.nan` values incase + # of an underlying float column + categories = libcudf.unary.is_nan(self.categories) + if categories.any(): + code = self._encode(np.nan) + result = result & (self.codes != cudf.Scalar(code)) + + return result + def fillna( self, fill_value: Any = None, method: Any = None, dtype: Dtype = None ) -> CategoricalColumn: @@ -1204,6 +1274,12 @@ def fillna( raise ValueError(err_msg) from err else: fill_value = column.as_column(fill_value, nan_as_null=False) + if isinstance(fill_value, CategoricalColumn): + if self.dtype != fill_value.dtype: + raise ValueError( + "Cannot set a Categorical with another, " + "without identical categories" + ) # TODO: only required if fill_value has a subset of the # categories: fill_value = fill_value.cat()._set_categories( diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index ed9d54a1283..1bad2c3a451 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1,4 +1,5 @@ # Copyright (c) 2018-2021, NVIDIA CORPORATION. + from __future__ import annotations import builtins @@ -49,12 +50,12 @@ get_time_unit, is_categorical_dtype, is_decimal_dtype, + is_interval_dtype, is_list_dtype, is_numerical_dtype, is_scalar, is_string_dtype, is_struct_dtype, - is_interval_dtype, min_signed_type, min_unsigned_type, np_to_pa_dtype, @@ -848,55 +849,65 @@ def isin(self, values: Sequence) -> ColumnBase: ------- result: Column Column of booleans indicating if each element is in values. 
- Raises - ------- - TypeError - If values is a string """ - if is_scalar(values): - raise TypeError( - "only list-like objects are allowed to be passed " - f"to isin(), you passed a [{type(values).__name__}]" - ) - lhs = self rhs = None try: - # We need to convert values to same type as self, - # hence passing dtype=self.dtype - rhs = as_column(values, dtype=self.dtype) - - # Short-circuit if rhs is all null. - if lhs.null_count == 0 and (rhs.null_count == len(rhs)): - return full(len(self), False, dtype="bool") + lhs, rhs = self._process_values_for_isin(values) + res = lhs._isin_earlystop(rhs) + if res is not None: + return res except ValueError: # pandas functionally returns all False when cleansing via # typecasting fails return full(len(self), False, dtype="bool") - # If categorical, combine categories first - if is_categorical_dtype(lhs): - lhs_cats = lhs.cat().categories._values - rhs_cats = rhs.cat().categories._values - - if not np.issubdtype(rhs_cats.dtype, lhs_cats.dtype): - # If they're not the same dtype, short-circuit if the values - # list doesn't have any nulls. If it does have nulls, make - # the values list a Categorical with a single null - if not rhs.has_nulls: - return full(len(self), False, dtype="bool") - rhs = as_column(pd.Categorical.from_codes([-1], categories=[])) - rhs = rhs.cat().set_categories(lhs_cats).astype(self.dtype) - - ldf = cudf.DataFrame({"x": lhs, "orig_order": arange(len(lhs))}) + res = lhs._obtain_isin_result(rhs) + + return res + + def _process_values_for_isin( + self, values: Sequence + ) -> Tuple[ColumnBase, ColumnBase]: + """ + Helper function for `isin` which pre-process `values` based on `self`. 
+ """ + lhs = self + rhs = as_column(values, nan_as_null=False) + if lhs.null_count == len(lhs): + lhs = lhs.astype(rhs.dtype) + elif rhs.null_count == len(rhs): + rhs = rhs.astype(lhs.dtype) + return lhs, rhs + + def _isin_earlystop(self, rhs: ColumnBase) -> Union[ColumnBase, None]: + """ + Helper function for `isin` which determines possibility of + early-stopping or not. + """ + if self.dtype != rhs.dtype: + if self.null_count and rhs.null_count: + return self.isna() + else: + return cudf.core.column.full(len(self), False, dtype="bool") + elif self.null_count == 0 and (rhs.null_count == len(rhs)): + return cudf.core.column.full(len(self), False, dtype="bool") + else: + return None + + def _obtain_isin_result(self, rhs: ColumnBase) -> ColumnBase: + """ + Helper function for `isin` which merges `self` & `rhs` + to determine what values of `rhs` exist in `self`. + """ + ldf = cudf.DataFrame({"x": self, "orig_order": arange(len(self))}) rdf = cudf.DataFrame( {"x": rhs, "bool": full(len(rhs), True, dtype="bool")} ) res = ldf.merge(rdf, on="x", how="left").sort_values(by="orig_order") res = res.drop_duplicates(subset="orig_order", ignore_index=True) res = res._data["bool"].fillna(False) - return res def as_mask(self) -> Buffer: @@ -1052,14 +1063,14 @@ def as_categorical_column(self, dtype, **kwargs) -> ColumnBase: # columns include null index in factorization; remove: if self.has_nulls: - cats = cats.dropna() + cats = cats._column.dropna(drop_nan=False) min_type = min_unsigned_type(len(cats), 8) labels = labels - 1 if np.dtype(min_type).itemsize < labels.dtype.itemsize: labels = labels.astype(min_type) return build_categorical_column( - categories=cats._column, + categories=cats, codes=labels._column, mask=self.mask, ordered=ordered, @@ -1250,7 +1261,7 @@ def sum( def product( self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0 ): - raise TypeError(f"cannot perform prod with type {self.dtype}") + raise TypeError(f"cannot perform product with type 
{self.dtype}") def mean(self, skipna: bool = None, dtype: Dtype = None): raise TypeError(f"cannot perform mean with type {self.dtype}") @@ -1262,7 +1273,7 @@ def var(self, skipna: bool = None, ddof=1, dtype: Dtype = np.float64): raise TypeError(f"cannot perform var with type {self.dtype}") def kurtosis(self, skipna: bool = None): - raise TypeError(f"cannot perform kurt with type {self.dtype}") + raise TypeError(f"cannot perform kurtosis with type {self.dtype}") def skew(self, skipna: bool = None): raise TypeError(f"cannot perform skew with type {self.dtype}") @@ -2066,9 +2077,11 @@ def _construct_array( arbitrary = cupy.asarray(arbitrary, dtype=dtype) except (TypeError, ValueError): native_dtype = dtype - if dtype is None and pd.api.types.infer_dtype(arbitrary) in ( - "mixed", - "mixed-integer", + if ( + dtype is None + and not cudf._lib.scalar._is_null_host_scalar(arbitrary) + and pd.api.types.infer_dtype(arbitrary) + in ("mixed", "mixed-integer",) ): native_dtype = "object" arbitrary = np.asarray( diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 6029052c1d3..7c5385b9bbf 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -1,4 +1,5 @@ # Copyright (c) 2019-2021, NVIDIA CORPORATION. 
+ from __future__ import annotations import datetime as dt @@ -13,11 +14,17 @@ import cudf from cudf import _lib as libcudf from cudf._typing import DatetimeLikeScalar, Dtype, DtypeObj, ScalarLike +from cudf.core._compat import PANDAS_GE_120 from cudf.core.buffer import Buffer from cudf.core.column import ColumnBase, column, string from cudf.utils.dtypes import is_scalar from cudf.utils.utils import _fillna_natwise +if PANDAS_GE_120: + _guess_datetime_format = pd.core.tools.datetimes.guess_datetime_format +else: + _guess_datetime_format = pd.core.tools.datetimes._guess_datetime_format + # nanoseconds per time_unit _numpy_to_pandas_conversion = { "ns": 1, @@ -235,6 +242,19 @@ def mean(self, skipna=None, dtype=np.float64) -> ScalarLike: unit=self.time_unit, ) + def std( + self, skipna: bool = None, ddof: int = 1, dtype: Dtype = np.float64 + ) -> pd.Timedelta: + return pd.Timedelta( + self.as_numerical.std(skipna=skipna, ddof=ddof, dtype=dtype) + * _numpy_to_pandas_conversion[self.time_unit], + ) + + def median(self, skipna: bool = None) -> pd.Timestamp: + return pd.Timestamp( + self.as_numerical.median(skipna=skipna), unit=self.time_unit + ) + def quantile( self, q: Union[float, Sequence[float]], interpolation: str, exact: bool ) -> ColumnBase: @@ -316,6 +336,9 @@ def find_last_value(self, value: ScalarLike, closest: bool = False) -> int: def is_unique(self) -> bool: return self.as_numerical.is_unique + def isin(self, values: Sequence) -> ColumnBase: + return cudf.core.tools.datetimes._isin_datetimelike(self, values) + def can_cast_safely(self, to_dtype: Dtype) -> bool: if np.issubdtype(to_dtype, np.datetime64): @@ -375,7 +398,7 @@ def infer_format(element: str, **kwargs) -> str: """ Infers datetime format from a string, also takes cares for `ms` and `ns` """ - fmt = pd.core.tools.datetimes._guess_datetime_format(element, **kwargs) + fmt = _guess_datetime_format(element, **kwargs) if fmt is not None: return fmt @@ -389,15 +412,11 @@ def infer_format(element: str, 
**kwargs) -> str: second_parts = re.split(r"(\D+)", element_parts[1], maxsplit=1) subsecond_fmt = ".%" + str(len(second_parts[0])) + "f" - first_part = pd.core.tools.datetimes._guess_datetime_format( - element_parts[0], **kwargs - ) + first_part = _guess_datetime_format(element_parts[0], **kwargs) # For the case where first_part is '00:00:03' if first_part is None: tmp = "1970-01-01 " + element_parts[0] - first_part = pd.core.tools.datetimes._guess_datetime_format( - tmp, **kwargs - ).split(" ", 1)[1] + first_part = _guess_datetime_format(tmp, **kwargs).split(" ", 1)[1] if first_part is None: raise ValueError("Unable to infer the timestamp format from the data") @@ -411,9 +430,7 @@ def infer_format(element: str, **kwargs) -> str: if len(second_part) > 1: # Only infer if second_parts is not an empty string. - second_part = pd.core.tools.datetimes._guess_datetime_format( - second_part, **kwargs - ) + second_part = _guess_datetime_format(second_part, **kwargs) else: second_part = "" diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 0a8d93c913b..f9b695e9ce3 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -3,7 +3,7 @@ from __future__ import annotations from numbers import Number -from typing import Any, Callable, Sequence, Union, cast +from typing import Any, Callable, Sequence, Tuple, Union, cast import numpy as np import pandas as pd @@ -248,6 +248,22 @@ def std( ) -> float: return self.reduce("std", skipna=skipna, dtype=dtype, ddof=ddof) + def _process_values_for_isin( + self, values: Sequence + ) -> Tuple[ColumnBase, ColumnBase]: + lhs = cast("cudf.core.column.ColumnBase", self) + rhs = as_column(values, nan_as_null=False) + + if isinstance(rhs, NumericalColumn): + rhs = rhs.astype(dtype=self.dtype) + + if lhs.null_count == len(lhs): + lhs = lhs.astype(rhs.dtype) + elif rhs.null_count == len(rhs): + rhs = rhs.astype(lhs.dtype) + + return lhs, rhs + def 
sum_of_squares(self, dtype: Dtype = None) -> float: return libcudf.reduce.reduce("sum_of_squares", self, dtype=dtype) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 21f504ea684..aa5172a9a89 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5189,7 +5189,7 @@ def _get_cols_list(parent_obj, others): ] return cols_list - elif others is not None: + elif others is not None and not isinstance(others, StringMethods): if ( parent_index is not None and isinstance(others, cudf.Series) diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 75509df4ec6..ac63192b692 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -1,4 +1,5 @@ # Copyright (c) 2020-2021, NVIDIA CORPORATION. + from __future__ import annotations import datetime as dt @@ -127,7 +128,7 @@ def _binary_op_floordiv( common_dtype = determine_out_dtype(self.dtype, rhs.dtype) lhs = lhs.astype(common_dtype).astype("float64") if isinstance(rhs, cudf.Scalar): - if rhs.is_valid: + if rhs.is_valid(): rhs = cudf.Scalar( np.timedelta64(rhs.value) .astype(common_dtype) @@ -367,6 +368,9 @@ def median(self, skipna: bool = None) -> pd.Timedelta: self.as_numerical.median(skipna=skipna), unit=self.time_unit ) + def isin(self, values: Sequence) -> ColumnBase: + return cudf.core.tools.datetimes._isin_datetimelike(self, values) + def quantile( self, q: Union[float, Sequence[float]], interpolation: str, exact: bool ) -> "column.ColumnBase": @@ -380,15 +384,12 @@ def quantile( def sum( self, skipna: bool = None, dtype: Dtype = None, min_count=0 ) -> pd.Timedelta: - if len(self) == 0: - return pd.Timedelta(None, unit=self.time_unit) - else: - return pd.Timedelta( - self.as_numerical.sum( - skipna=skipna, dtype=dtype, min_count=min_count - ), - unit=self.time_unit, - ) + return pd.Timedelta( + self.as_numerical.sum( + skipna=skipna, 
dtype=dtype, min_count=min_count + ), + unit=self.time_unit, + ) def std( self, skipna: bool = None, ddof: int = 1, dtype: Dtype = np.float64 diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index d04a0bcf62d..8bdb36fc27d 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -584,8 +584,8 @@ def deserialize(cls, header, frames): @property def dtypes(self): """Return the dtypes in this object.""" - return pd.Series( - [x.dtype for x in self._data.columns], index=self._data.names + return cudf.utils.utils._create_pandas_series( + data=[x.dtype for x in self._data.columns], index=self._data.names, ) @property @@ -690,7 +690,7 @@ def __getitem__(self, arg): elif can_convert_to_column(arg): mask = arg if is_list_like(mask): - mask = pd.Series(mask) + mask = cudf.utils.utils._create_pandas_series(data=mask) if mask.dtype == "bool": return self._apply_boolean_mask(mask) else: diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index f11f3692faf..8b7d54b6715 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -56,7 +56,12 @@ def to_pandas(self) -> pd.CategoricalDtype: if self.categories is None: categories = None else: - categories = self.categories.to_pandas() + if isinstance( + self.categories, (cudf.Float32Index, cudf.Float64Index) + ): + categories = self.categories.dropna().to_pandas() + else: + categories = self.categories.to_pandas() return pd.CategoricalDtype(categories=categories, ordered=self.ordered) def _init_categories(self, categories: Any): diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index a21201a7f10..dedefeaf9a2 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1,9 +1,9 @@ # Copyright (c) 2020-2021, NVIDIA CORPORATION. 
+ from __future__ import annotations import copy import functools -import operator import warnings from collections import OrderedDict, abc as abc from typing import TYPE_CHECKING, Any, Dict, Tuple, TypeVar, overload @@ -339,9 +339,11 @@ def _concat( np.intersect1d, all_columns_list ) # get column names not present in all objs - non_intersecting_columns = ( - functools.reduce(operator.or_, (obj.columns for obj in objs)) - ^ intersecting_columns + union_of_columns = functools.reduce( + pd.Index.union, [obj.columns for obj in objs] + ) + non_intersecting_columns = union_of_columns.symmetric_difference( + intersecting_columns ) names = OrderedDict.fromkeys(intersecting_columns).keys() diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index e3899a403f1..88f3f8c4c89 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1993,7 +1993,20 @@ def __repr__(self): # utilize `Index.to_string` once it is implemented # related issue : https://github.com/pandas-dev/pandas/issues/35389 if isinstance(preprocess, CategoricalIndex): - output = preprocess.to_pandas().__repr__() + if preprocess.categories.dtype.kind == "f": + output = ( + preprocess.astype("str") + .to_pandas() + .astype("category") + .__repr__() + ) + break_idx = output.find("ordered=") + output = ( + output[:break_idx].replace("'", "") + output[break_idx:] + ) + else: + output = preprocess.to_pandas().__repr__() + output = output.replace("nan", cudf._NA_REP) elif preprocess._values.nullable: output = self._clean_nulls_from_index().to_pandas().__repr__() diff --git a/python/cudf/cudf/core/indexing.py b/python/cudf/cudf/core/indexing.py index 4d685408df3..cf372286b7e 100755 --- a/python/cudf/cudf/core/indexing.py +++ b/python/cudf/cudf/core/indexing.py @@ -95,8 +95,10 @@ def __setitem__(self, key, value): else: value = column.as_column(value) - if hasattr(value, "dtype") and pd.api.types.is_numeric_dtype( - value.dtype + if ( + not 
is_categorical_dtype(self._sr._column.dtype) + and hasattr(value, "dtype") + and pd.api.types.is_numeric_dtype(value.dtype) ): # normalize types if necessary: if not pd.api.types.is_integer(key): diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 4e82a1f72b0..19c5b827d50 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1,4 +1,5 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. + import itertools import numbers import pickle @@ -15,6 +16,7 @@ import cudf from cudf import _lib as libcudf from cudf._typing import DataFrameOrSeries +from cudf.core._compat import PANDAS_GE_120 from cudf.core.column import column from cudf.core.frame import Frame from cudf.core.index import Index, as_index @@ -485,7 +487,28 @@ def __repr__(self): ) ) ) - preprocess = preprocess.to_pandas(nullable=True) + + if PANDAS_GE_120: + # TODO: Remove this whole `if` block, + # this is a workaround for the following issue: + # https://github.com/pandas-dev/pandas/issues/39984 + temp_df = preprocess._source_data + + preprocess_pdf = pd.DataFrame() + for col in temp_df.columns: + if temp_df[col].dtype.kind == "f": + preprocess_pdf[col] = temp_df[col].to_pandas( + nullable=False + ) + else: + preprocess_pdf[col] = temp_df[col].to_pandas( + nullable=True + ) + + preprocess_pdf.columns = preprocess.names + preprocess = pd.MultiIndex.from_frame(preprocess_pdf) + else: + preprocess = preprocess.to_pandas(nullable=True) preprocess.values[:] = tuples_list else: preprocess = preprocess.to_pandas(nullable=True) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 72e468002db..be03fb147ff 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1070,7 +1070,13 @@ def __repr__(self): else get_option("display.min_rows") ) show_dimensions = get_option("display.show_dimensions") - output = 
preprocess.to_pandas().to_string( + if preprocess._column.categories.dtype.kind == "f": + pd_series = ( + preprocess.astype("str").to_pandas().astype("category") + ) + else: + pd_series = preprocess.to_pandas() + output = pd_series.to_string( name=self.name, dtype=self.dtype, min_rows=min_rows, @@ -1085,6 +1091,15 @@ def __repr__(self): if isinstance(preprocess._column, cudf.core.column.CategoricalColumn): category_memory = lines[-1] + if preprocess._column.categories.dtype.kind == "f": + category_memory = category_memory.replace("'", "").split(": ") + category_memory = ( + category_memory[0].replace( + "object", preprocess._column.categories.dtype.name + ) + + ": " + + category_memory[1] + ) lines = lines[:-1] if len(lines) > 1: if lines[-1].startswith("Name: "): diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 206786fad42..4e5e4ce1987 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -1,6 +1,7 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. import warnings +from typing import Sequence, Union import numpy as np import pandas as pd @@ -497,3 +498,43 @@ def __setattr__(self, name, value): raise AttributeError("DateOffset objects are immutable.") else: object.__setattr__(self, name, value) + + +def _isin_datetimelike( + lhs: Union[column.TimeDeltaColumn, column.DatetimeColumn], values: Sequence +) -> column.ColumnBase: + """ + Check whether values are contained in the + DateTimeColumn or TimeDeltaColumn. + + Parameters + ---------- + lhs : TimeDeltaColumn or DatetimeColumn + Column to check whether the `values` exist in. + values : set or list-like + The sequence of values to test. Passing in a single string will + raise a TypeError. Instead, turn a single string into a list + of one element. + + Returns + ------- + result: Column + Column of booleans indicating if each element is in values. 
+ """ + rhs = None + try: + rhs = cudf.core.column.as_column(values) + + if rhs.dtype.kind in {"f", "i", "u"}: + return cudf.core.column.full(len(lhs), False, dtype="bool") + rhs = rhs.astype(lhs.dtype) + res = lhs._isin_earlystop(rhs) + if res is not None: + return res + except ValueError: + # pandas functionally returns all False when cleansing via + # typecasting fails + return cudf.core.column.full(len(lhs), False, dtype="bool") + + res = lhs._obtain_isin_result(rhs) + return res diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index 2048e574acc..bacab24a6f3 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. from __future__ import annotations @@ -8,6 +8,7 @@ import pandas as pd import cudf +from cudf.core._compat import PANDAS_GE_110 from cudf.utils.dtypes import is_categorical_dtype @@ -91,6 +92,8 @@ def assert_column_equal( check_datetimelike_compat=False, check_categorical=True, check_category_order=True, + rtol=1e-05, + atol=1e-08, obj="ColumnBase", ): """ @@ -122,6 +125,10 @@ def assert_column_equal( Whether to compare internal Categorical exactly. check_category_order : bool, default True Whether to compare category order of internal Categoricals + rtol : float, default 1e-5 + Relative tolerance. Only used when `check_exact` is False. + atol : float, default 1e-8 + Absolute tolerance. Only used when `check_exact` is False. obj : str, default ‘ColumnBase’ Specify object name being compared, internally used to show appropriate assertion message. 
@@ -165,6 +172,8 @@ def assert_column_equal( exact=check_dtype, check_exact=True, check_categorical=False, + rtol=rtol, + atol=atol, ) assert_column_equal( left.codes, @@ -173,6 +182,8 @@ def assert_column_equal( check_exact=True, check_categorical=False, check_category_order=False, + rtol=rtol, + atol=atol, ) if left.ordered != right.ordered: @@ -220,6 +231,9 @@ def assert_index_equal( check_less_precise: Union[bool, int] = False, check_exact: bool = True, check_categorical: bool = True, + check_order: bool = True, + rtol: float = 1e-5, + atol: float = 1e-8, obj: str = "Index", ): """ @@ -247,6 +261,17 @@ def assert_index_equal( Whether to compare number exactly. check_categorical : bool, default True Whether to compare internal Categorical exactly. + check_order : bool, default True + Whether to compare the order of index entries as + well as their values. + If True, both indexes must contain the same elements, + in the same order. + If False, both indexes must contain the same elements, + but in any order. + rtol : float, default 1e-5 + Relative tolerance. Only used when `check_exact` is False. + atol : float, default 1e-8 + Absolute tolerance. Only used when `check_exact` is False. obj : str, default ‘Index’ Specify object name being compared, internally used to show appropriate assertion message. 
@@ -293,6 +318,11 @@ def assert_index_equal( obj, "lengths are different", f"{len(left)}", f"{len(right)}" ) + # If order doesn't matter then sort the index entries + if not check_order: + left = left.sort_values() + right = right.sort_values() + if isinstance(left, cudf.MultiIndex): if left.nlevels != right.nlevels: raise AssertionError( @@ -309,8 +339,11 @@ def assert_index_equal( rlevel, exact=check_exact, check_names=check_names, - check_less_precise=check_less_precise, check_exact=check_exact, + check_less_precise=check_less_precise, + check_order=check_order, + rtol=rtol, + atol=atol, obj=mul_obj, ) return @@ -343,6 +376,8 @@ def assert_series_equal( check_datetimelike_compat=False, check_categorical=True, check_category_order=True, + rtol=1e-5, + atol=1e-8, obj="Series", ): """ @@ -380,6 +415,10 @@ def assert_series_equal( Whether to compare internal Categorical exactly. check_category_order : bool, default True Whether to compare category order of internal Categoricals + rtol : float, default 1e-5 + Relative tolerance. Only used when `check_exact` is False. + atol : float, default 1e-8 + Absolute tolerance. Only used when `check_exact` is False. obj : str, default ‘Series’ Specify object name being compared, internally used to show appropriate assertion message. 
@@ -431,6 +470,8 @@ def assert_series_equal( check_less_precise=check_less_precise, check_exact=check_exact, check_categorical=check_categorical, + rtol=rtol, + atol=atol, obj=f"{obj}.index", ) @@ -444,6 +485,8 @@ def assert_series_equal( check_datetimelike_compat=check_datetimelike_compat, check_categorical=check_categorical, check_category_order=check_category_order, + rtol=rtol, + atol=atol, ) # metadata comparison @@ -460,13 +503,14 @@ def assert_frame_equal( check_index_type="equiv", check_column_type="equiv", check_frame_type=True, - check_less_precise=False, - by_blocks=False, check_names=True, + by_blocks=False, check_exact=False, check_datetimelike_compat=False, check_categorical=True, check_like=False, + rtol=1e-5, + atol=1e-8, obj="DataFrame", ): """ @@ -493,8 +537,6 @@ def assert_frame_equal( and similar to pandas. check_frame_type : bool, default True Whether to check the DataFrame class is identical. - check_less_precise : bool or int, default False - Not yet supported check_names : bool, default True Whether to check that the names attribute for both the index and column attributes of the DataFrame is identical. @@ -512,6 +554,10 @@ def assert_frame_equal( If True, ignore the order of index & columns. Note: index labels must match their respective rows (same as in columns) - same labels must be with the same data. + rtol : float, default 1e-5 + Relative tolerance. Only used when `check_exact` is False. + atol : float, default 1e-8 + Absolute tolerance. Only used when `check_exact` is False. obj : str, default ‘DataFrame’ Specify object name being compared, internally used to show appropriate assertion message. 
@@ -568,40 +614,51 @@ def assert_frame_equal( left, right = left.reindex(index=right.index), right right = right[list(left._data.names)] - if check_less_precise: - raise NotImplementedError("check_less_precise is not yet supported") - # index comparison assert_index_equal( left.index, right.index, exact=check_index_type, check_names=check_names, - check_less_precise=check_less_precise, check_exact=check_exact, check_categorical=check_categorical, + rtol=rtol, + atol=atol, obj=f"{obj}.index", ) - pd.testing.assert_index_equal( - left.columns, - right.columns, - exact=check_column_type, - check_names=check_names, - check_less_precise=check_less_precise, - check_exact=check_exact, - check_categorical=check_categorical, - obj=f"{obj}.columns", - ) + if PANDAS_GE_110: + pd.testing.assert_index_equal( + left.columns, + right.columns, + exact=check_column_type, + check_names=check_names, + check_exact=check_exact, + check_categorical=check_categorical, + rtol=rtol, + atol=atol, + obj=f"{obj}.columns", + ) + else: + pd.testing.assert_index_equal( + left.columns, + right.columns, + exact=check_column_type, + check_names=check_names, + check_exact=check_exact, + check_categorical=check_categorical, + obj=f"{obj}.columns", + ) for col in left.columns: assert_column_equal( left._data[col], right._data[col], check_dtype=check_dtype, - check_less_precise=check_less_precise, check_exact=check_exact, check_datetimelike_compat=check_datetimelike_compat, check_categorical=check_categorical, + rtol=rtol, + atol=atol, obj=f'Column name="{col}"', ) diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py index 2d8130e6cb1..a117c15f14d 100644 --- a/python/cudf/cudf/tests/test_categorical.py +++ b/python/cudf/cudf/tests/test_categorical.py @@ -1,15 +1,14 @@ -# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. 
import operator +import string import numpy as np import pandas as pd import pytest -import cudf as gd -from cudf.core import DataFrame, Series +import cudf from cudf.core._compat import PANDAS_GE_110 -from cudf.core.index import as_index from cudf.tests.utils import assert_eq, assert_exceptions_equal @@ -22,10 +21,10 @@ def pd_str_cat(): def test_categorical_basic(): cat = pd.Categorical(["a", "a", "b", "c", "a"], categories=["a", "b", "c"]) - cudf_cat = as_index(cat) + cudf_cat = cudf.Index(cat) pdsr = pd.Series(cat, index=["p", "q", "r", "s", "t"]) - sr = Series(cat, index=["p", "q", "r", "s", "t"]) + sr = cudf.Series(cat, index=["p", "q", "r", "s", "t"]) assert_eq(pdsr.cat.codes, sr.cat.codes, check_dtype=False) # Test attributes @@ -53,7 +52,7 @@ def test_categorical_integer(): pytest.xfail(reason="pandas >=1.1 required") cat = pd.Categorical(["a", "_", "_", "c", "a"], categories=["a", "b", "c"]) pdsr = pd.Series(cat) - sr = Series(cat) + sr = cudf.Series(cat) np.testing.assert_array_equal( cat.codes, sr.cat.codes.astype(cat.codes.dtype).fillna(-1).to_array() ) @@ -81,7 +80,7 @@ def test_categorical_compare_unordered(): cat = pd.Categorical(["a", "a", "b", "c", "a"], categories=["a", "b", "c"]) pdsr = pd.Series(cat) - sr = Series(cat) + sr = cudf.Series(cat) # test equal out = sr == sr @@ -112,12 +111,12 @@ def test_categorical_compare_ordered(): ["a", "a", "b", "c", "a"], categories=["a", "b", "c"], ordered=True ) pdsr1 = pd.Series(cat1) - sr1 = Series(cat1) + sr1 = cudf.Series(cat1) cat2 = pd.Categorical( ["a", "b", "a", "c", "b"], categories=["a", "b", "c"], ordered=True ) pdsr2 = pd.Series(cat2) - sr2 = Series(cat2) + sr2 = cudf.Series(cat2) # test equal out = sr1 == sr1 @@ -142,7 +141,7 @@ def test_categorical_compare_ordered(): def test_categorical_binary_add(): cat = pd.Categorical(["a", "a", "b", "c", "a"], categories=["a", "b", "c"]) pdsr = pd.Series(cat) - sr = Series(cat) + sr = cudf.Series(cat) assert_exceptions_equal( lfunc=operator.add, @@ -157,7 
+156,7 @@ def test_categorical_binary_add(): def test_categorical_unary_ceil(): cat = pd.Categorical(["a", "a", "b", "c", "a"], categories=["a", "b", "c"]) pdsr = pd.Series(cat) - sr = Series(cat) + sr = cudf.Series(cat) assert_exceptions_equal( lfunc=getattr, @@ -176,7 +175,7 @@ def test_categorical_element_indexing(): """ cat = pd.Categorical(["a", "a", "b", "c", "a"], categories=["a", "b", "c"]) pdsr = pd.Series(cat) - sr = Series(cat) + sr = cudf.Series(cat) assert_eq(pdsr, sr) assert_eq(pdsr.cat.codes, sr.cat.codes, check_dtype=False) @@ -188,7 +187,7 @@ def test_categorical_masking(): """ cat = pd.Categorical(["a", "a", "b", "c", "a"], categories=["a", "b", "c"]) pdsr = pd.Series(cat) - sr = Series(cat) + sr = cudf.Series(cat) # check scalar comparison expect_matches = pdsr == "a" @@ -208,7 +207,7 @@ def test_categorical_masking(): def test_df_cat_set_index(): - df = DataFrame() + df = cudf.DataFrame() df["a"] = pd.Categorical(list("aababcabbc"), categories=list("abc")) df["b"] = np.arange(len(df)) got = df.set_index("a") @@ -220,7 +219,7 @@ def test_df_cat_set_index(): def test_df_cat_sort_index(): - df = DataFrame() + df = cudf.DataFrame() df["a"] = pd.Categorical(list("aababcabbc"), categories=list("abc")) df["b"] = np.arange(len(df)) @@ -231,7 +230,7 @@ def test_df_cat_sort_index(): def test_cat_series_binop_error(): - df = DataFrame() + df = cudf.DataFrame() df["a"] = pd.Categorical(list("aababcabbc"), categories=list("abc")) df["b"] = np.arange(len(df)) @@ -261,20 +260,20 @@ def test_cat_series_binop_error(): @pytest.mark.parametrize("num_elements", [10, 100, 1000]) def test_categorical_unique(num_elements): - from string import ascii_letters, digits - # create categorical series np.random.seed(12) pd_cat = pd.Categorical( pd.Series( - np.random.choice(list(ascii_letters + digits), num_elements), + np.random.choice( + list(string.ascii_letters + string.digits), num_elements + ), dtype="category", ) ) # gdf - gdf = DataFrame() - gdf["a"] = 
Series.from_categorical(pd_cat) + gdf = cudf.DataFrame() + gdf["a"] = cudf.Series.from_categorical(pd_cat) gdf_unique_sorted = np.sort(gdf["a"].unique().to_pandas()) # pandas @@ -288,20 +287,20 @@ def test_categorical_unique(num_elements): @pytest.mark.parametrize("nelem", [20, 50, 100]) def test_categorical_unique_count(nelem): - from string import ascii_letters, digits - # create categorical series np.random.seed(12) pd_cat = pd.Categorical( pd.Series( - np.random.choice(list(ascii_letters + digits), nelem), + np.random.choice( + list(string.ascii_letters + string.digits), nelem + ), dtype="category", ) ) # gdf - gdf = DataFrame() - gdf["a"] = Series.from_categorical(pd_cat) + gdf = cudf.DataFrame() + gdf["a"] = cudf.Series.from_categorical(pd_cat) gdf_unique_count = gdf["a"].nunique() # pandas @@ -316,7 +315,7 @@ def test_categorical_unique_count(nelem): def test_categorical_empty(): cat = pd.Categorical([]) pdsr = pd.Series(cat) - sr = Series(cat) + sr = cudf.Series(cat) np.testing.assert_array_equal(cat.codes, sr.cat.codes.to_array()) # Test attributes @@ -331,7 +330,7 @@ def test_categorical_empty(): def test_categorical_set_categories(): cat = pd.Categorical(["a", "a", "b", "c", "a"], categories=["a", "b", "c"]) psr = pd.Series(cat) - sr = Series.from_categorical(cat) + sr = cudf.Series.from_categorical(cat) # adding category expect = psr.cat.set_categories(["a", "b", "c", "d"]) @@ -349,7 +348,7 @@ def test_categorical_set_categories_preserves_order(): # reassigning categories should preserve element ordering assert_eq( series.cat.set_categories([1, 2]), - Series(series).cat.set_categories([1, 2]), + cudf.Series(series).cat.set_categories([1, 2]), ) @@ -357,7 +356,7 @@ def test_categorical_set_categories_preserves_order(): def test_categorical_as_ordered(pd_str_cat, inplace): pd_sr = pd.Series(pd_str_cat.copy().set_ordered(False)) - cd_sr = gd.Series(pd_str_cat.copy().set_ordered(False)) + cd_sr = cudf.Series(pd_str_cat.copy().set_ordered(False)) assert 
cd_sr.cat.ordered is False assert cd_sr.cat.ordered == pd_sr.cat.ordered @@ -376,7 +375,7 @@ def test_categorical_as_ordered(pd_str_cat, inplace): def test_categorical_as_unordered(pd_str_cat, inplace): pd_sr = pd.Series(pd_str_cat.copy().set_ordered(True)) - cd_sr = gd.Series(pd_str_cat.copy().set_ordered(True)) + cd_sr = cudf.Series(pd_str_cat.copy().set_ordered(True)) assert cd_sr.cat.ordered is True assert cd_sr.cat.ordered == pd_sr.cat.ordered @@ -399,7 +398,7 @@ def test_categorical_reorder_categories( ): pd_sr = pd.Series(pd_str_cat.copy().set_ordered(from_ordered)) - cd_sr = gd.Series(pd_str_cat.copy().set_ordered(from_ordered)) + cd_sr = cudf.Series(pd_str_cat.copy().set_ordered(from_ordered)) assert_eq(pd_sr, cd_sr) @@ -421,7 +420,7 @@ def test_categorical_reorder_categories( def test_categorical_add_categories(pd_str_cat, inplace): pd_sr = pd.Series(pd_str_cat.copy()) - cd_sr = gd.Series(pd_str_cat.copy()) + cd_sr = cudf.Series(pd_str_cat.copy()) assert_eq(pd_sr, cd_sr) @@ -442,7 +441,7 @@ def test_categorical_add_categories(pd_str_cat, inplace): def test_categorical_remove_categories(pd_str_cat, inplace): pd_sr = pd.Series(pd_str_cat.copy()) - cd_sr = gd.Series(pd_str_cat.copy()) + cd_sr = cudf.Series(pd_str_cat.copy()) assert_eq(pd_sr, cd_sr) @@ -470,7 +469,7 @@ def test_categorical_remove_categories(pd_str_cat, inplace): def test_categorical_dataframe_slice_copy(): pdf = pd.DataFrame({"g": pd.Series(["a", "b", "z"], dtype="category")}) - gdf = DataFrame.from_pandas(pdf) + gdf = cudf.from_pandas(pdf) exp = pdf[1:].copy() gdf = gdf[1:].copy() @@ -493,7 +492,7 @@ def test_categorical_dataframe_slice_copy(): pd.Series([1, 2, 3, 89], dtype="float64"), pd.Series([1, 2.5, 3.001, 89], dtype="float64"), pd.Series([None, None, None]), - pd.Series([]), + pd.Series([], dtype="float64"), ], ) @pytest.mark.parametrize( @@ -511,7 +510,7 @@ def test_categorical_dataframe_slice_copy(): ) def test_categorical_typecast(data, cat_type): pd_data = data.copy() - gd_data = 
gd.from_pandas(data) + gd_data = cudf.from_pandas(data) assert_eq(pd_data.astype(cat_type), gd_data.astype(cat_type)) @@ -526,7 +525,7 @@ def test_categorical_typecast(data, cat_type): pd.Series([1, 2, 3, 89], dtype="float64"), pd.Series([1, 2.5, 3.001, 89], dtype="float64"), pd.Series([None, None, None]), - pd.Series([]), + pd.Series([], dtype="float64"), ], ) @pytest.mark.parametrize( @@ -545,7 +544,7 @@ def test_categorical_typecast(data, cat_type): ) def test_categorical_set_categories_categoricals(data, new_categories): pd_data = data.copy().astype("category") - gd_data = gd.from_pandas(pd_data) + gd_data = cudf.from_pandas(pd_data) assert_eq( pd_data.cat.set_categories(new_categories=new_categories), @@ -557,7 +556,7 @@ def test_categorical_set_categories_categoricals(data, new_categories): new_categories=pd.Series(new_categories, dtype="category") ), gd_data.cat.set_categories( - new_categories=gd.Series(new_categories, dtype="category") + new_categories=cudf.Series(new_categories, dtype="category") ), ) @@ -590,14 +589,14 @@ def test_categorical_set_categories_categoricals(data, new_categories): ) def test_categorical_creation(data, dtype): expected = pd.Series(data, dtype=dtype) - got = gd.Series(data, dtype=dtype) + got = cudf.Series(data, dtype=dtype) assert_eq(expected, got) - got = gd.Series(data, dtype=gd.from_pandas(dtype)) + got = cudf.Series(data, dtype=cudf.from_pandas(dtype)) assert_eq(expected, got) expected = pd.Series(data, dtype="category") - got = gd.Series(data, dtype="category") + got = cudf.Series(data, dtype="category") assert_eq(expected, got) @@ -613,33 +612,33 @@ def test_categorical_creation(data, dtype): @pytest.mark.parametrize("ordered", [True, False]) def test_categorical_dtype(categories, ordered): expected = pd.CategoricalDtype(categories=categories, ordered=ordered) - got = gd.CategoricalDtype(categories=categories, ordered=ordered) + got = cudf.CategoricalDtype(categories=categories, ordered=ordered) assert_eq(expected, got) 
@pytest.mark.parametrize( ("data", "expected"), [ - (gd.Series([1]), np.uint8), - (gd.Series([1, None]), np.uint8), - (gd.Series(np.arange(np.iinfo(np.int8).max)), np.uint8), + (cudf.Series([1]), np.uint8), + (cudf.Series([1, None]), np.uint8), + (cudf.Series(np.arange(np.iinfo(np.int8).max)), np.uint8), ( - gd.Series(np.append(np.arange(np.iinfo(np.int8).max), [None])), + cudf.Series(np.append(np.arange(np.iinfo(np.int8).max), [None])), np.uint8, ), - (gd.Series(np.arange(np.iinfo(np.int16).max)), np.uint16), + (cudf.Series(np.arange(np.iinfo(np.int16).max)), np.uint16), ( - gd.Series(np.append(np.arange(np.iinfo(np.int16).max), [None])), + cudf.Series(np.append(np.arange(np.iinfo(np.int16).max), [None])), np.uint16, ), - (gd.Series(np.arange(np.iinfo(np.uint8).max)), np.uint8), + (cudf.Series(np.arange(np.iinfo(np.uint8).max)), np.uint8), ( - gd.Series(np.append(np.arange(np.iinfo(np.uint8).max), [None])), + cudf.Series(np.append(np.arange(np.iinfo(np.uint8).max), [None])), np.uint8, ), - (gd.Series(np.arange(np.iinfo(np.uint16).max)), np.uint16), + (cudf.Series(np.arange(np.iinfo(np.uint16).max)), np.uint16), ( - gd.Series(np.append(np.arange(np.iinfo(np.uint16).max), [None])), + cudf.Series(np.append(np.arange(np.iinfo(np.uint16).max), [None])), np.uint16, ), ], @@ -664,7 +663,7 @@ def test_astype_dtype(data, expected): ) def test_add_categories(data, add): pds = pd.Series(data, dtype="category") - gds = gd.Series(data, dtype="category") + gds = cudf.Series(data, dtype="category") expected = pds.cat.add_categories(add) actual = gds.cat.add_categories(add) @@ -692,7 +691,7 @@ def test_add_categories(data, add): ) def test_add_categories_error(data, add): pds = pd.Series(data, dtype="category") - gds = gd.Series(data, dtype="category") + gds = cudf.Series(data, dtype="category") assert_exceptions_equal( pds.cat.add_categories, @@ -704,12 +703,12 @@ def test_add_categories_error(data, add): def test_add_categories_mixed_error(): - gds = gd.Series(["a", "bd", 
"ef"], dtype="category") + gds = cudf.Series(["a", "bd", "ef"], dtype="category") with pytest.raises(TypeError): gds.cat.add_categories([1, 2, 3]) - gds = gd.Series([1, 2, 3], dtype="category") + gds = cudf.Series([1, 2, 3], dtype="category") with pytest.raises(TypeError): gds.cat.add_categories(["a", "bd", "ef"]) @@ -743,7 +742,7 @@ def test_add_categories_mixed_error(): def test_categorical_assignment(data, cat_dtype): pd_df = pd.DataFrame() pd_df["a"] = np.ones(len(data)) - cd_df = gd.from_pandas(pd_df) + cd_df = cudf.from_pandas(pd_df) pd_cat_series = pd.Series(data, dtype=cat_dtype) # assign categorical series @@ -757,9 +756,37 @@ def test_categorical_assignment(data, cat_dtype): # see issue: https://github.com/rapidsai/cudf/issues/2269 pd_df = pd.DataFrame() pd_df["a"] = np.ones(len(data)) - cd_df = gd.from_pandas(pd_df) + cd_df = cudf.from_pandas(pd_df) pd_categorical = pd.Categorical(data, dtype=cat_dtype) pd_df.assign(cat_col=pd_categorical) cd_df.assign(cat_col=pd_categorical) assert_eq(pd_df, cd_df) + + +def test_categorical_allow_nan(): + gs = cudf.Series([1, 2, np.nan, 10, np.nan, None], nan_as_null=False) + gs = gs.astype("category") + expected_codes = cudf.Series([0, 1, 3, 2, 3, None], dtype="uint8") + assert_eq(expected_codes, gs.cat.codes) + + expected_categories = cudf.Index([1.0, 2.0, 10.0, np.nan], dtype="float64") + assert_eq(expected_categories, gs.cat.categories) + + actual_ps = gs.to_pandas() + expected_ps = pd.Series( + [1.0, 2.0, np.nan, 10.0, np.nan, np.nan], dtype="category" + ) + assert_eq(actual_ps, expected_ps) + + +def test_categorical_setitem_with_nan(): + gs = cudf.Series( + [1, 2, np.nan, 10, np.nan, None], nan_as_null=False + ).astype("category") + gs[[1, 3]] = np.nan + + expected_series = cudf.Series( + [1, np.nan, np.nan, np.nan, np.nan, None], nan_as_null=False + ).astype(gs.dtype) + assert_eq(gs, expected_series) diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py index 
f8a7099f1bf..d0e31a82b28 100644 --- a/python/cudf/cudf/tests/test_concat.py +++ b/python/cudf/cudf/tests/test_concat.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. import re @@ -372,8 +372,8 @@ def test_concat_mixed_input(): [ [pd.Series([1, 2, 3]), pd.DataFrame({"a": [1, 2]})], [pd.Series([1, 2, 3]), pd.DataFrame({"a": []})], - [pd.Series([]), pd.DataFrame({"a": []})], - [pd.Series([]), pd.DataFrame({"a": [1, 2]})], + [pd.Series([], dtype="float64"), pd.DataFrame({"a": []})], + [pd.Series([], dtype="float64"), pd.DataFrame({"a": [1, 2]})], [pd.Series([1, 2, 3.0, 1.2], name="abc"), pd.DataFrame({"a": [1, 2]})], [ pd.Series( diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index 23a950bb72d..d972d2ad11c 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -1815,7 +1815,7 @@ def test_csv_reader_dtypes(dtype): @pytest.mark.parametrize( - "dtype", ["Int64", "UInt32", {"a": "UInt64", "b": "float64", "c": "Int32"}] + "dtype", ["Int64", "UInt32", {"a": "UInt64", "b": "Float64", "c": "Int32"}] ) def test_csv_reader_nullable_dtypes(dtype): buf = "a,b,c\n1,10,111\n2,11,112\n3,12,113\n4,13,114\n" @@ -1838,7 +1838,6 @@ def test_csv_reader_timedetla_dtypes(dtype): assert_eq(expected, actual) -@pytest.mark.xfail(reason="https://github.com/rapidsai/cudf/issues/6719") @pytest.mark.parametrize( "dtype", sorted(list(cudf.utils.dtypes.DATETIME_TYPES)) ) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index b45d71bd088..a3bad0ab5a6 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -1,11 +1,14 @@ # Copyright (c) 2018-2021, NVIDIA CORPORATION. 
import array as arr +import datetime import io import operator import random import re +import string import textwrap +from copy import copy import cupy import numpy as np @@ -14,8 +17,8 @@ import pytest from numba import cuda -import cudf as gd -from cudf.core._compat import PANDAS_GE_110 +import cudf +from cudf.core._compat import PANDAS_GE_110, PANDAS_GE_120 from cudf.core.column import column from cudf.tests import utils from cudf.tests.utils import ( @@ -37,7 +40,7 @@ def test_init_via_list_of_tuples(): ] pdf = pd.DataFrame(data) - gdf = gd.DataFrame(data) + gdf = cudf.DataFrame(data) assert_eq(pdf, gdf) @@ -70,9 +73,15 @@ def test_init_via_list_of_empty_tuples(rows): data = [()] * rows pdf = pd.DataFrame(data) - gdf = gd.DataFrame(data) + gdf = cudf.DataFrame(data) - assert_eq(pdf, gdf, check_like=True) + assert_eq( + pdf, + gdf, + check_like=True, + check_column_type=False, + check_index_type=False, + ) @pytest.mark.parametrize( @@ -103,15 +112,15 @@ def test_init_via_list_of_empty_tuples(rows): ) def test_init_from_series_align(dict_of_series): pdf = pd.DataFrame(dict_of_series) - gdf = gd.DataFrame(dict_of_series) + gdf = cudf.DataFrame(dict_of_series) assert_eq(pdf, gdf) for key in dict_of_series: if isinstance(dict_of_series[key], pd.Series): - dict_of_series[key] = gd.Series(dict_of_series[key]) + dict_of_series[key] = cudf.Series(dict_of_series[key]) - gdf = gd.DataFrame(dict_of_series) + gdf = cudf.DataFrame(dict_of_series) assert_eq(pdf, gdf) @@ -139,7 +148,7 @@ def test_init_from_series_align(dict_of_series): ) def test_init_from_series_align_nonunique(dict_of_series, expectation): with expectation: - gdf = gd.DataFrame(dict_of_series) + gdf = cudf.DataFrame(dict_of_series) if expectation == does_not_raise(): pdf = pd.DataFrame(dict_of_series) @@ -154,10 +163,10 @@ def test_init_unaligned_with_index(): }, index=[7, 8, 9], ) - gdf = gd.DataFrame( + gdf = cudf.DataFrame( { - "a": gd.Series([1.0, 2.0, 3.0], index=[4, 5, 6]), - "b": gd.Series([1.0, 2.0, 
3.0], index=[1, 2, 3]), + "a": cudf.Series([1.0, 2.0, 3.0], index=[4, 5, 6]), + "b": cudf.Series([1.0, 2.0, 3.0], index=[1, 2, 3]), }, index=[7, 8, 9], ) @@ -168,7 +177,7 @@ def test_init_unaligned_with_index(): def test_series_basic(): # Make series from buffer a1 = np.arange(10, dtype=np.float64) - series = gd.Series(a1) + series = cudf.Series(a1) assert len(series) == 10 np.testing.assert_equal(series.to_array(), np.hstack([a1])) @@ -177,8 +186,8 @@ def test_series_from_cupy_scalars(): data = [0.1, 0.2, 0.3] data_np = np.array(data) data_cp = cupy.array(data) - s_np = gd.Series([data_np[0], data_np[2]]) - s_cp = gd.Series([data_cp[0], data_cp[2]]) + s_np = cudf.Series([data_np[0], data_np[2]]) + s_cp = cudf.Series([data_cp[0], data_cp[2]]) assert_eq(s_np, s_cp) @@ -190,7 +199,7 @@ def test_append_index(a, b): df["a"] = a df["b"] = b - gdf = gd.DataFrame() + gdf = cudf.DataFrame() gdf["a"] = a gdf["b"] = b @@ -212,17 +221,17 @@ def test_series_init_none(): # test for creating empty series # 1: without initializing - sr1 = gd.Series() + sr1 = cudf.Series() got = sr1.to_string() - print(got) + expect = "Series([], dtype: float64)" # values should match despite whitespace difference assert got.split() == expect.split() # 2: Using `None` as an initializer - sr2 = gd.Series(None) + sr2 = cudf.Series(None) got = sr2.to_string() - print(got) + expect = "Series([], dtype: float64)" # values should match despite whitespace difference assert got.split() == expect.split() @@ -230,7 +239,7 @@ def test_series_init_none(): def test_dataframe_basic(): np.random.seed(0) - df = gd.DataFrame() + df = cudf.DataFrame() # Populate with cuda memory df["keys"] = np.arange(10, dtype=np.float64) @@ -245,12 +254,12 @@ def test_dataframe_basic(): assert tuple(df.columns) == ("keys", "vals") # Make another dataframe - df2 = gd.DataFrame() + df2 = cudf.DataFrame() df2["keys"] = np.array([123], dtype=np.float64) df2["vals"] = np.array([321], dtype=np.float64) # Concat - df = gd.concat([df, 
df2]) + df = cudf.concat([df, df2]) assert len(df) == 11 hkeys = np.asarray(np.arange(10, dtype=np.float64).tolist() + [123]) @@ -264,21 +273,19 @@ def test_dataframe_basic(): expect = np.vstack([hkeys, hvals]).T - print(expect) - print(mat) np.testing.assert_equal(mat, expect) # test dataframe with tuple name - df_tup = gd.DataFrame() + df_tup = cudf.DataFrame() data = np.arange(10) df_tup[(1, "foobar")] = data np.testing.assert_equal(data, df_tup[(1, "foobar")].to_array()) - df = gd.DataFrame(pd.DataFrame({"a": [1, 2, 3], "c": ["a", "b", "c"]})) + df = cudf.DataFrame(pd.DataFrame({"a": [1, 2, 3], "c": ["a", "b", "c"]})) pdf = pd.DataFrame(pd.DataFrame({"a": [1, 2, 3], "c": ["a", "b", "c"]})) assert_eq(df, pdf) - gdf = gd.DataFrame({"id": [0, 1], "val": [None, None]}) + gdf = cudf.DataFrame({"id": [0, 1], "val": [None, None]}) gdf["val"] = gdf["val"].astype("int") assert gdf["val"].isnull().all() @@ -299,7 +306,7 @@ def test_dataframe_basic(): @pytest.mark.parametrize("inplace", [True, False]) def test_dataframe_drop_columns(pdf, columns, inplace): pdf = pdf.copy() - gdf = gd.from_pandas(pdf) + gdf = cudf.from_pandas(pdf) expected = pdf.drop(columns=columns, inplace=inplace) actual = gdf.drop(columns=columns, inplace=inplace) @@ -327,7 +334,7 @@ def test_dataframe_drop_columns(pdf, columns, inplace): @pytest.mark.parametrize("inplace", [True, False]) def test_dataframe_drop_labels_axis_0(pdf, labels, inplace): pdf = pdf.copy() - gdf = gd.from_pandas(pdf) + gdf = cudf.from_pandas(pdf) expected = pdf.drop(labels=labels, axis=0, inplace=inplace) actual = gdf.drop(labels=labels, axis=0, inplace=inplace) @@ -355,7 +362,7 @@ def test_dataframe_drop_labels_axis_0(pdf, labels, inplace): @pytest.mark.parametrize("inplace", [True, False]) def test_dataframe_drop_index(pdf, index, inplace): pdf = pdf.copy() - gdf = gd.from_pandas(pdf) + gdf = cudf.from_pandas(pdf) expected = pdf.drop(index=index, inplace=inplace) actual = gdf.drop(index=index, inplace=inplace) @@ -420,7 
+427,7 @@ def test_dataframe_drop_index(pdf, index, inplace): @pytest.mark.parametrize("inplace", [True, False]) def test_dataframe_drop_multiindex(pdf, index, level, inplace): pdf = pdf.copy() - gdf = gd.from_pandas(pdf) + gdf = cudf.from_pandas(pdf) expected = pdf.drop(index=index, inplace=inplace, level=level) actual = gdf.drop(index=index, inplace=inplace, level=level) @@ -447,7 +454,7 @@ def test_dataframe_drop_multiindex(pdf, index, level, inplace): @pytest.mark.parametrize("inplace", [True, False]) def test_dataframe_drop_labels_axis_1(pdf, labels, inplace): pdf = pdf.copy() - gdf = gd.from_pandas(pdf) + gdf = cudf.from_pandas(pdf) expected = pdf.drop(labels=labels, axis=1, inplace=inplace) actual = gdf.drop(labels=labels, axis=1, inplace=inplace) @@ -460,7 +467,7 @@ def test_dataframe_drop_labels_axis_1(pdf, labels, inplace): def test_dataframe_drop_error(): - df = gd.DataFrame({"a": [1], "b": [2], "c": [3]}) + df = cudf.DataFrame({"a": [1], "b": [2], "c": [3]}) pdf = df.to_pandas() assert_exceptions_equal( @@ -505,7 +512,7 @@ def test_dataframe_drop_error(): def test_dataframe_drop_raises(): - df = gd.DataFrame( + df = cudf.DataFrame( {"a": [1, 2, 3], "c": [10, 20, 30]}, index=["x", "y", "z"] ) pdf = df.to_pandas() @@ -550,7 +557,7 @@ def test_dataframe_drop_raises(): def test_dataframe_column_add_drop_via_setitem(): - df = gd.DataFrame() + df = cudf.DataFrame() data = np.asarray(range(10)) df["a"] = data df["b"] = data @@ -567,7 +574,7 @@ def test_dataframe_column_set_via_attr(): data_0 = np.asarray([0, 2, 4, 5]) data_1 = np.asarray([1, 4, 2, 3]) data_2 = np.asarray([2, 0, 3, 0]) - df = gd.DataFrame({"a": data_0, "b": data_1, "c": data_2}) + df = cudf.DataFrame({"a": data_0, "b": data_1, "c": data_2}) for i in range(10): df.c = df.a @@ -580,7 +587,7 @@ def test_dataframe_column_set_via_attr(): def test_dataframe_column_drop_via_attr(): - df = gd.DataFrame({"a": []}) + df = cudf.DataFrame({"a": []}) with pytest.raises(AttributeError): del df.a @@ -591,7 
+598,7 @@ def test_dataframe_column_drop_via_attr(): @pytest.mark.parametrize("axis", [0, "index"]) def test_dataframe_index_rename(axis): pdf = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) - gdf = gd.DataFrame.from_pandas(pdf) + gdf = cudf.DataFrame.from_pandas(pdf) expect = pdf.rename(mapper={1: 5, 2: 6}, axis=axis) got = gdf.rename(mapper={1: 5, 2: 6}, axis=axis) @@ -615,7 +622,7 @@ def test_dataframe_index_rename(axis): def test_dataframe_MI_rename(): - gdf = gd.DataFrame( + gdf = cudf.DataFrame( {"a": np.arange(10), "b": np.arange(10), "c": np.arange(10)} ) gdg = gdf.groupby(["a", "b"]).count() @@ -630,7 +637,7 @@ def test_dataframe_MI_rename(): @pytest.mark.parametrize("axis", [1, "columns"]) def test_dataframe_column_rename(axis): pdf = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) - gdf = gd.DataFrame.from_pandas(pdf) + gdf = cudf.DataFrame.from_pandas(pdf) expect = pdf.rename(mapper=lambda name: 2 * name, axis=axis) got = gdf.rename(mapper=lambda name: 2 * name, axis=axis) @@ -653,7 +660,7 @@ def test_dataframe_pop(): pdf = pd.DataFrame( {"a": [1, 2, 3], "b": ["x", "y", "z"], "c": [7.0, 8.0, 9.0]} ) - gdf = gd.DataFrame.from_pandas(pdf) + gdf = cudf.DataFrame.from_pandas(pdf) # Test non-existing column error with pytest.raises(KeyError) as raises: @@ -680,7 +687,7 @@ def test_dataframe_pop(): # check empty dataframe edge case empty_pdf = pd.DataFrame(columns=["a", "b"]) - empty_gdf = gd.DataFrame(columns=["a", "b"]) + empty_gdf = cudf.DataFrame(columns=["a", "b"]) pb = empty_pdf.pop("b") gb = empty_gdf.pop("b") assert len(pb) == len(gb) @@ -689,7 +696,7 @@ def test_dataframe_pop(): @pytest.mark.parametrize("nelem", [0, 3, 100, 1000]) def test_dataframe_astype(nelem): - df = gd.DataFrame() + df = cudf.DataFrame() data = np.asarray(range(nelem), dtype=np.int32) df["a"] = data assert df["a"].dtype is np.dtype(np.int32) @@ -700,7 +707,7 @@ def test_dataframe_astype(nelem): @pytest.mark.parametrize("nelem", [0, 100]) def 
test_index_astype(nelem): - df = gd.DataFrame() + df = cudf.DataFrame() data = np.asarray(range(nelem), dtype=np.int32) df["a"] = data assert df.index.dtype is np.dtype(np.int64) @@ -719,13 +726,15 @@ def test_dataframe_to_string(): pd.options.display.max_rows = 5 pd.options.display.max_columns = 8 # Test basic - df = gd.DataFrame({"a": [1, 2, 3, 4, 5, 6], "b": [11, 12, 13, 14, 15, 16]}) + df = cudf.DataFrame( + {"a": [1, 2, 3, 4, 5, 6], "b": [11, 12, 13, 14, 15, 16]} + ) string = str(df) - print(string) + assert string.splitlines()[-1] == "[6 rows x 2 columns]" # Test skipped columns - df = gd.DataFrame( + df = cudf.DataFrame( { "a": [1, 2, 3, 4, 5, 6], "b": [11, 12, 13, 14, 15, 16], @@ -734,17 +743,19 @@ def test_dataframe_to_string(): } ) string = df.to_string() - print(string) + assert string.splitlines()[-1] == "[6 rows x 4 columns]" # Test masked - df = gd.DataFrame({"a": [1, 2, 3, 4, 5, 6], "b": [11, 12, 13, 14, 15, 16]}) + df = cudf.DataFrame( + {"a": [1, 2, 3, 4, 5, 6], "b": [11, 12, 13, 14, 15, 16]} + ) data = np.arange(6) - mask = np.zeros(1, dtype=gd.utils.utils.mask_dtype) + mask = np.zeros(1, dtype=cudf.utils.utils.mask_dtype) mask[0] = 0b00101101 - masked = gd.Series.from_masked_array(data, mask) + masked = cudf.Series.from_masked_array(data, mask) assert masked.null_count == 2 df["c"] = masked @@ -760,11 +771,11 @@ def test_dataframe_to_string(): # null position is correct for i in range(len(values)): if i not in validids: - assert values[i] is gd.NA + assert values[i] is cudf.NA pd.options.display.max_rows = 10 got = df.to_string() - print(got) + expect = """ a b c 0 1 11 0 @@ -779,14 +790,14 @@ def test_dataframe_to_string(): def test_dataframe_to_string_wide(monkeypatch): - monkeypatch.setenv("COLUMNS", 79) + monkeypatch.setenv("COLUMNS", "79") # Test basic - df = gd.DataFrame() + df = cudf.DataFrame() for i in range(100): df["a{}".format(i)] = list(range(3)) pd.options.display.max_columns = 0 got = df.to_string() - print(got) + expect = """ a0 
a1 a2 a3 a4 a5 a6 a7 ... a92 a93 a94 a95 a96 a97 a98 a99 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 @@ -800,9 +811,9 @@ def test_dataframe_to_string_wide(monkeypatch): def test_dataframe_empty_to_string(): # Test for printing empty dataframe - df = gd.DataFrame() + df = cudf.DataFrame() got = df.to_string() - print(got) + expect = "Empty DataFrame\nColumns: []\nIndex: []\n" # values should match despite whitespace difference assert got.split() == expect.split() @@ -810,11 +821,11 @@ def test_dataframe_empty_to_string(): def test_dataframe_emptycolumns_to_string(): # Test for printing dataframe having empty columns - df = gd.DataFrame() + df = cudf.DataFrame() df["a"] = [] df["b"] = [] got = df.to_string() - print(got) + expect = "Empty DataFrame\nColumns: [a, b]\nIndex: []\n" # values should match despite whitespace difference assert got.split() == expect.split() @@ -822,14 +833,12 @@ def test_dataframe_emptycolumns_to_string(): def test_dataframe_copy(): # Test for copying the dataframe using python copy pkg - from copy import copy - - df = gd.DataFrame() + df = cudf.DataFrame() df["a"] = [1, 2, 3] df2 = copy(df) df2["b"] = [4, 5, 6] got = df.to_string() - print(got) + expect = """ a 0 1 @@ -842,12 +851,12 @@ def test_dataframe_copy(): def test_dataframe_copy_shallow(): # Test for copy dataframe using class method - df = gd.DataFrame() + df = cudf.DataFrame() df["a"] = [1, 2, 3] df2 = df.copy() df2["b"] = [4, 2, 3] got = df.to_string() - print(got) + expect = """ a 0 1 @@ -862,7 +871,9 @@ def test_dataframe_dtypes(): dtypes = pd.Series( [np.int32, np.float32, np.float64], index=["c", "a", "b"] ) - df = gd.DataFrame({k: np.ones(10, dtype=v) for k, v in dtypes.iteritems()}) + df = cudf.DataFrame( + {k: np.ones(10, dtype=v) for k, v in dtypes.iteritems()} + ) assert df.dtypes.equals(dtypes) @@ -873,7 +884,7 @@ def test_dataframe_add_col_to_object_dataframe(): data = {k: v for (k, v) in zip(cols, [["a"] for _ in cols])} - gdf = gd.DataFrame(data) + gdf = 
cudf.DataFrame(data) gdf = gdf[:0] assert gdf.dtypes.equals(df.dtypes) @@ -886,7 +897,7 @@ def test_dataframe_add_col_to_object_dataframe(): def test_dataframe_dir_and_getattr(): - df = gd.DataFrame( + df = cudf.DataFrame( { "a": np.ones(10), "b": np.ones(10), @@ -908,13 +919,13 @@ def test_dataframe_dir_and_getattr(): @pytest.mark.parametrize("order", ["C", "F"]) def test_empty_dataframe_as_gpu_matrix(order): - df = gd.DataFrame() + df = cudf.DataFrame() # Check fully empty dataframe. mat = df.as_gpu_matrix(order=order).copy_to_host() assert mat.shape == (0, 0) - df = gd.DataFrame() + df = cudf.DataFrame() nelem = 123 for k in "abc": df[k] = np.random.random(nelem) @@ -926,7 +937,7 @@ def test_empty_dataframe_as_gpu_matrix(order): @pytest.mark.parametrize("order", ["C", "F"]) def test_dataframe_as_gpu_matrix(order): - df = gd.DataFrame() + df = cudf.DataFrame() nelem = 123 for k in "abcd": @@ -947,7 +958,7 @@ def test_dataframe_as_gpu_matrix(order): def test_dataframe_as_gpu_matrix_null_values(): - df = gd.DataFrame() + df = cudf.DataFrame() nelem = 123 na = -10000 @@ -983,7 +994,7 @@ def test_dataframe_append_empty(): "value": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], } ) - gdf = gd.DataFrame.from_pandas(pdf) + gdf = cudf.DataFrame.from_pandas(pdf) gdf["newcol"] = 100 pdf["newcol"] = 100 @@ -1000,30 +1011,30 @@ def test_dataframe_setitem_from_masked_object(): np.random.shuffle(mask) ary[mask] = np.nan - test1_null = gd.Series(ary, nan_as_null=True) + test1_null = cudf.Series(ary, nan_as_null=True) assert test1_null.nullable assert test1_null.null_count == 20 - test1_nan = gd.Series(ary, nan_as_null=False) + test1_nan = cudf.Series(ary, nan_as_null=False) assert test1_nan.null_count == 0 - test2_null = gd.DataFrame.from_pandas( + test2_null = cudf.DataFrame.from_pandas( pd.DataFrame({"a": ary}), nan_as_null=True ) assert test2_null["a"].nullable assert test2_null["a"].null_count == 20 - test2_nan = gd.DataFrame.from_pandas( + test2_nan = cudf.DataFrame.from_pandas( 
pd.DataFrame({"a": ary}), nan_as_null=False ) assert test2_nan["a"].null_count == 0 gpu_ary = cupy.asarray(ary) - test3_null = gd.Series(gpu_ary, nan_as_null=True) + test3_null = cudf.Series(gpu_ary, nan_as_null=True) assert test3_null.nullable assert test3_null.null_count == 20 - test3_nan = gd.Series(gpu_ary, nan_as_null=False) + test3_nan = cudf.Series(gpu_ary, nan_as_null=False) assert test3_nan.null_count == 0 - test4 = gd.DataFrame() + test4 = cudf.DataFrame() lst = [1, 2, None, 4, 5, 6, None, 8, 9] test4["lst"] = lst assert test4["lst"].nullable @@ -1035,7 +1046,7 @@ def test_dataframe_append_to_empty(): pdf["a"] = [] pdf["b"] = [1, 2, 3] - gdf = gd.DataFrame() + gdf = cudf.DataFrame() gdf["a"] = [] gdf["b"] = [1, 2, 3] @@ -1043,7 +1054,7 @@ def test_dataframe_append_to_empty(): def test_dataframe_setitem_index_len1(): - gdf = gd.DataFrame() + gdf = cudf.DataFrame() gdf["a"] = [1] gdf["b"] = gdf.index._values @@ -1051,7 +1062,7 @@ def test_dataframe_setitem_index_len1(): def test_assign(): - gdf = gd.DataFrame({"x": [1, 2, 3]}) + gdf = cudf.DataFrame({"x": [1, 2, 3]}) gdf2 = gdf.assign(y=gdf.x + 1) assert list(gdf.columns) == ["x"] assert list(gdf2.columns) == ["x", "y"] @@ -1061,7 +1072,7 @@ def test_assign(): @pytest.mark.parametrize("nrows", [1, 8, 100, 1000]) def test_dataframe_hash_columns(nrows): - gdf = gd.DataFrame() + gdf = cudf.DataFrame() data = np.asarray(range(nrows)) data[0] = data[-1] # make first and last the same gdf["a"] = data @@ -1079,7 +1090,7 @@ def test_dataframe_hash_columns(nrows): out_one = cupy.asnumpy(gdf.hash_columns(["a"])) # First matches last assert out_one[0] == out_one[-1] - # Equivalent to the gd.Series.hash_values() + # Equivalent to the cudf.Series.hash_values() np.testing.assert_array_equal(cupy.asnumpy(gdf.a.hash_values()), out_one) @@ -1088,7 +1099,7 @@ def test_dataframe_hash_columns(nrows): @pytest.mark.parametrize("nkeys", [1, 2]) def test_dataframe_hash_partition(nrows, nparts, nkeys): np.random.seed(123) - gdf = 
gd.DataFrame() + gdf = cudf.DataFrame() keycols = [] for i in range(nkeys): keyname = "key{}".format(i) @@ -1102,7 +1113,7 @@ def test_dataframe_hash_partition(nrows, nparts, nkeys): # Must have correct number of partitions assert len(got) == nparts # All partitions must be DataFrame type - assert all(isinstance(p, gd.DataFrame) for p in got) + assert all(isinstance(p, cudf.DataFrame) for p in got) # Check that all partitions have unique keys part_unique_keys = set() for p in got: @@ -1117,7 +1128,7 @@ def test_dataframe_hash_partition(nrows, nparts, nkeys): @pytest.mark.parametrize("nrows", [3, 10, 50]) def test_dataframe_hash_partition_masked_value(nrows): - gdf = gd.DataFrame() + gdf = cudf.DataFrame() gdf["key"] = np.arange(nrows) gdf["val"] = np.arange(nrows) + 100 bitmask = utils.random_bitmask(nrows) @@ -1138,7 +1149,7 @@ def test_dataframe_hash_partition_masked_value(nrows): @pytest.mark.parametrize("nrows", [3, 10, 50]) def test_dataframe_hash_partition_masked_keys(nrows): - gdf = gd.DataFrame() + gdf = cudf.DataFrame() gdf["key"] = np.arange(nrows) gdf["val"] = np.arange(nrows) + 100 bitmask = utils.random_bitmask(nrows) @@ -1161,14 +1172,14 @@ def test_dataframe_hash_partition_masked_keys(nrows): @pytest.mark.parametrize("keep_index", [True, False]) def test_dataframe_hash_partition_keep_index(keep_index): - gdf = gd.DataFrame( + gdf = cudf.DataFrame( {"val": [1, 2, 3, 4], "key": [3, 2, 1, 4]}, index=[4, 3, 2, 1] ) - expected_df1 = gd.DataFrame( + expected_df1 = cudf.DataFrame( {"val": [1], "key": [3]}, index=[4] if keep_index else None ) - expected_df2 = gd.DataFrame( + expected_df2 = cudf.DataFrame( {"val": [2, 3, 4], "key": [2, 1, 4]}, index=[3, 2, 1] if keep_index else range(1, 4), ) @@ -1181,7 +1192,7 @@ def test_dataframe_hash_partition_keep_index(keep_index): def test_dataframe_hash_partition_empty(): - gdf = gd.DataFrame({"val": [1, 2], "key": [3, 2]}, index=["a", "b"]) + gdf = cudf.DataFrame({"val": [1, 2], "key": [3, 2]}, index=["a", "b"]) 
parts = gdf.iloc[:0].partition_by_hash(["key"], nparts=3) assert len(parts) == 3 for part in parts: @@ -1195,33 +1206,33 @@ def test_dataframe_concat_different_numerical_columns(dtype1, dtype2): df2 = pd.DataFrame(dict(x=pd.Series(np.arange(5)).astype(dtype2))) if dtype1 != dtype2 and "datetime" in dtype1 or "datetime" in dtype2: with pytest.raises(TypeError): - gd.concat([df1, df2]) + cudf.concat([df1, df2]) else: pres = pd.concat([df1, df2]) - gres = gd.concat([gd.from_pandas(df1), gd.from_pandas(df2)]) - assert_eq(gd.from_pandas(pres), gres) + gres = cudf.concat([cudf.from_pandas(df1), cudf.from_pandas(df2)]) + assert_eq(cudf.from_pandas(pres), gres) def test_dataframe_concat_different_column_types(): - df1 = gd.Series([42], dtype=np.float) - df2 = gd.Series(["a"], dtype="category") + df1 = cudf.Series([42], dtype=np.float) + df2 = cudf.Series(["a"], dtype="category") with pytest.raises(ValueError): - gd.concat([df1, df2]) + cudf.concat([df1, df2]) - df2 = gd.Series(["a string"]) + df2 = cudf.Series(["a string"]) with pytest.raises(TypeError): - gd.concat([df1, df2]) + cudf.concat([df1, df2]) @pytest.mark.parametrize( - "df_1", [gd.DataFrame({"a": [1, 2], "b": [1, 3]}), gd.DataFrame({})] + "df_1", [cudf.DataFrame({"a": [1, 2], "b": [1, 3]}), cudf.DataFrame({})] ) @pytest.mark.parametrize( - "df_2", [gd.DataFrame({"a": [], "b": []}), gd.DataFrame({})] + "df_2", [cudf.DataFrame({"a": [], "b": []}), cudf.DataFrame({})] ) def test_concat_empty_dataframe(df_1, df_2): - got = gd.concat([df_1, df_2]) + got = cudf.concat([df_1, df_2]) expect = pd.concat([df_1.to_pandas(), df_2.to_pandas()], sort=False) # ignoring dtypes as pandas upcasts int to float @@ -1248,8 +1259,8 @@ def test_concat_empty_dataframe(df_1, df_2): ], ) def test_concat_different_column_dataframe(df1_d, df2_d): - got = gd.concat( - [gd.DataFrame(df1_d), gd.DataFrame(df2_d), gd.DataFrame(df1_d)], + got = cudf.concat( + [cudf.DataFrame(df1_d), cudf.DataFrame(df2_d), cudf.DataFrame(df1_d)], sort=False, ) 
@@ -1268,10 +1279,12 @@ def test_concat_different_column_dataframe(df1_d, df2_d): assert_eq(got, expect, check_dtype=False) -@pytest.mark.parametrize("ser_1", [pd.Series([1, 2, 3]), pd.Series([])]) -@pytest.mark.parametrize("ser_2", [pd.Series([])]) +@pytest.mark.parametrize( + "ser_1", [pd.Series([1, 2, 3]), pd.Series([], dtype="float64")] +) +@pytest.mark.parametrize("ser_2", [pd.Series([], dtype="float64")]) def test_concat_empty_series(ser_1, ser_2): - got = gd.concat([gd.Series(ser_1), gd.Series(ser_2)]) + got = cudf.concat([cudf.Series(ser_1), cudf.Series(ser_2)]) expect = pd.concat([ser_1, ser_2]) assert_eq(got, expect) @@ -1282,49 +1295,49 @@ def test_concat_with_axis(): df2 = pd.DataFrame(dict(a=np.arange(5), b=np.arange(5))) concat_df = pd.concat([df1, df2], axis=1) - cdf1 = gd.from_pandas(df1) - cdf2 = gd.from_pandas(df2) + cdf1 = cudf.from_pandas(df1) + cdf2 = cudf.from_pandas(df2) # concat only dataframes - concat_cdf = gd.concat([cdf1, cdf2], axis=1) + concat_cdf = cudf.concat([cdf1, cdf2], axis=1) assert_eq(concat_cdf, concat_df) # concat only series concat_s = pd.concat([df1.x, df1.y], axis=1) - cs1 = gd.Series.from_pandas(df1.x) - cs2 = gd.Series.from_pandas(df1.y) - concat_cdf_s = gd.concat([cs1, cs2], axis=1) + cs1 = cudf.Series.from_pandas(df1.x) + cs2 = cudf.Series.from_pandas(df1.y) + concat_cdf_s = cudf.concat([cs1, cs2], axis=1) assert_eq(concat_cdf_s, concat_s) # concat series and dataframes s3 = pd.Series(np.random.random(5)) - cs3 = gd.Series.from_pandas(s3) + cs3 = cudf.Series.from_pandas(s3) - concat_cdf_all = gd.concat([cdf1, cs3, cdf2], axis=1) + concat_cdf_all = cudf.concat([cdf1, cs3, cdf2], axis=1) concat_df_all = pd.concat([df1, s3, df2], axis=1) assert_eq(concat_cdf_all, concat_df_all) # concat manual multi index - midf1 = gd.from_pandas(df1) - midf1.index = gd.MultiIndex( + midf1 = cudf.from_pandas(df1) + midf1.index = cudf.MultiIndex( levels=[[0, 1, 2, 3], [0, 1]], codes=[[0, 1, 2, 3, 2], [0, 1, 0, 1, 0]] ) midf2 = midf1[2:] - 
midf2.index = gd.MultiIndex( + midf2.index = cudf.MultiIndex( levels=[[3, 4, 5], [2, 0]], codes=[[0, 1, 2], [1, 0, 1]] ) mipdf1 = midf1.to_pandas() mipdf2 = midf2.to_pandas() - assert_eq(gd.concat([midf1, midf2]), pd.concat([mipdf1, mipdf2])) - assert_eq(gd.concat([midf2, midf1]), pd.concat([mipdf2, mipdf1])) + assert_eq(cudf.concat([midf1, midf2]), pd.concat([mipdf1, mipdf2])) + assert_eq(cudf.concat([midf2, midf1]), pd.concat([mipdf2, mipdf1])) assert_eq( - gd.concat([midf1, midf2, midf1]), pd.concat([mipdf1, mipdf2, mipdf1]) + cudf.concat([midf1, midf2, midf1]), pd.concat([mipdf1, mipdf2, mipdf1]) ) # concat groupby multi index - gdf1 = gd.DataFrame( + gdf1 = cudf.DataFrame( { "x": np.random.randint(0, 10, 10), "y": np.random.randint(0, 10, 10), @@ -1338,8 +1351,8 @@ def test_concat_with_axis(): pdg1 = gdg1.to_pandas() pdg2 = gdg2.to_pandas() - assert_eq(gd.concat([gdg1, gdg2]), pd.concat([pdg1, pdg2])) - assert_eq(gd.concat([gdg2, gdg1]), pd.concat([pdg2, pdg1])) + assert_eq(cudf.concat([gdg1, gdg2]), pd.concat([pdg1, pdg2])) + assert_eq(cudf.concat([gdg2, gdg1]), pd.concat([pdg2, pdg1])) # series multi index concat gdgz1 = gdg1.z @@ -1347,15 +1360,15 @@ def test_concat_with_axis(): pdgz1 = gdgz1.to_pandas() pdgz2 = gdgz2.to_pandas() - assert_eq(gd.concat([gdgz1, gdgz2]), pd.concat([pdgz1, pdgz2])) - assert_eq(gd.concat([gdgz2, gdgz1]), pd.concat([pdgz2, pdgz1])) + assert_eq(cudf.concat([gdgz1, gdgz2]), pd.concat([pdgz1, pdgz2])) + assert_eq(cudf.concat([gdgz2, gdgz1]), pd.concat([pdgz2, pdgz1])) @pytest.mark.parametrize("nrows", [0, 3, 10, 100, 1000]) def test_nonmatching_index_setitem(nrows): np.random.seed(0) - gdf = gd.DataFrame() + gdf = cudf.DataFrame() gdf["a"] = np.random.randint(2147483647, size=nrows) gdf["b"] = np.random.randint(2147483647, size=nrows) gdf = gdf.set_index("b") @@ -1366,20 +1379,20 @@ def test_nonmatching_index_setitem(nrows): assert ( gdf["c"] .to_pandas() - .equals(gd.Series(test_values).set_index(gdf._index).to_pandas()) + 
.equals(cudf.Series(test_values).set_index(gdf._index).to_pandas()) ) def test_from_pandas(): df = pd.DataFrame({"x": [1, 2, 3]}, index=[4.0, 5.0, 6.0]) - gdf = gd.DataFrame.from_pandas(df) - assert isinstance(gdf, gd.DataFrame) + gdf = cudf.DataFrame.from_pandas(df) + assert isinstance(gdf, cudf.DataFrame) assert_eq(df, gdf) s = df.x - gs = gd.Series.from_pandas(s) - assert isinstance(gs, gd.Series) + gs = cudf.Series.from_pandas(s) + assert isinstance(gs, cudf.Series) assert_eq(s, gs) @@ -1389,14 +1402,14 @@ def test_from_records(dtypes): h_ary = np.ndarray(shape=(10, 4), dtype=dtypes) rec_ary = h_ary.view(np.recarray) - gdf = gd.DataFrame.from_records(rec_ary, columns=["a", "b", "c", "d"]) + gdf = cudf.DataFrame.from_records(rec_ary, columns=["a", "b", "c", "d"]) df = pd.DataFrame.from_records(rec_ary, columns=["a", "b", "c", "d"]) - assert isinstance(gdf, gd.DataFrame) + assert isinstance(gdf, cudf.DataFrame) assert_eq(df, gdf) - gdf = gd.DataFrame.from_records(rec_ary) + gdf = cudf.DataFrame.from_records(rec_ary) df = pd.DataFrame.from_records(rec_ary) - assert isinstance(gdf, gd.DataFrame) + assert isinstance(gdf, cudf.DataFrame) assert_eq(df, gdf) @@ -1418,9 +1431,9 @@ def test_from_records_index(columns, index): [("Rex", 9, 81.0), ("Fido", 3, 27.0)], dtype=[("name", "U10"), ("age", "i4"), ("weight", "f4")], ) - gdf = gd.DataFrame.from_records(rec_ary, columns=columns, index=index) + gdf = cudf.DataFrame.from_records(rec_ary, columns=columns, index=index) df = pd.DataFrame.from_records(rec_ary, columns=columns, index=index) - assert isinstance(gdf, gd.DataFrame) + assert isinstance(gdf, cudf.DataFrame) assert_eq(df, gdf) @@ -1428,37 +1441,37 @@ def test_dataframe_construction_from_cupy_arrays(): h_ary = np.array([[1, 2, 3], [4, 5, 6]], np.int32) d_ary = cupy.asarray(h_ary) - gdf = gd.DataFrame(d_ary, columns=["a", "b", "c"]) + gdf = cudf.DataFrame(d_ary, columns=["a", "b", "c"]) df = pd.DataFrame(h_ary, columns=["a", "b", "c"]) - assert isinstance(gdf, 
gd.DataFrame) + assert isinstance(gdf, cudf.DataFrame) assert_eq(df, gdf) - gdf = gd.DataFrame(d_ary) + gdf = cudf.DataFrame(d_ary) df = pd.DataFrame(h_ary) - assert isinstance(gdf, gd.DataFrame) + assert isinstance(gdf, cudf.DataFrame) assert_eq(df, gdf) - gdf = gd.DataFrame(d_ary, index=["a", "b"]) + gdf = cudf.DataFrame(d_ary, index=["a", "b"]) df = pd.DataFrame(h_ary, index=["a", "b"]) - assert isinstance(gdf, gd.DataFrame) + assert isinstance(gdf, cudf.DataFrame) assert_eq(df, gdf) - gdf = gd.DataFrame(d_ary) + gdf = cudf.DataFrame(d_ary) gdf = gdf.set_index(keys=0, drop=False) df = pd.DataFrame(h_ary) df = df.set_index(keys=0, drop=False) - assert isinstance(gdf, gd.DataFrame) + assert isinstance(gdf, cudf.DataFrame) assert_eq(df, gdf) - gdf = gd.DataFrame(d_ary) + gdf = cudf.DataFrame(d_ary) gdf = gdf.set_index(keys=1, drop=False) df = pd.DataFrame(h_ary) df = df.set_index(keys=1, drop=False) - assert isinstance(gdf, gd.DataFrame) + assert isinstance(gdf, cudf.DataFrame) assert_eq(df, gdf) @@ -1468,7 +1481,7 @@ def test_dataframe_cupy_wrong_dimensions(): with pytest.raises( ValueError, match="records dimension expected 1 or 2 but found: 3" ): - gd.DataFrame(d_ary) + cudf.DataFrame(d_ary) def test_dataframe_cupy_array_wrong_index(): @@ -1479,20 +1492,19 @@ def test_dataframe_cupy_array_wrong_index(): match="Length mismatch: Expected axis has 2 elements, " "new values have 1 elements", ): - gd.DataFrame(d_ary, index=["a"]) + cudf.DataFrame(d_ary, index=["a"]) with pytest.raises( ValueError, match="Length mismatch: Expected axis has 2 elements, " "new values have 1 elements", ): - gd.DataFrame(d_ary, index="a") + cudf.DataFrame(d_ary, index="a") -@pytest.mark.xfail(reason="constructor does not coerce index inputs") def test_index_in_dataframe_constructor(): a = pd.DataFrame({"x": [1, 2, 3]}, index=[4.0, 5.0, 6.0]) - b = gd.DataFrame({"x": [1, 2, 3]}, index=[4.0, 5.0, 6.0]) + b = cudf.DataFrame({"x": [1, 2, 3]}, index=[4.0, 5.0, 6.0]) assert_eq(a, b) 
assert_eq(a.loc[4:], b.loc[4:]) @@ -1513,14 +1525,14 @@ def test_from_arrow(nelem, data_type): padf = pa.Table.from_pandas( df, preserve_index=False ).replace_schema_metadata(None) - gdf = gd.DataFrame.from_arrow(padf) - assert isinstance(gdf, gd.DataFrame) + gdf = cudf.DataFrame.from_arrow(padf) + assert isinstance(gdf, cudf.DataFrame) assert_eq(df, gdf) s = pa.Array.from_pandas(df.a) - gs = gd.Series.from_arrow(s) - assert isinstance(gs, gd.Series) + gs = cudf.Series.from_arrow(s) + assert isinstance(gs, cudf.Series) # For some reason PyArrow to_pandas() converts to numpy array and has # better type compatibility @@ -1536,7 +1548,7 @@ def test_to_arrow(nelem, data_type): "b": np.random.randint(0, 1000, nelem).astype(data_type), } ) - gdf = gd.DataFrame.from_pandas(df) + gdf = cudf.DataFrame.from_pandas(df) pa_df = pa.Table.from_pandas( df, preserve_index=False @@ -1572,8 +1584,8 @@ def test_to_from_arrow_nulls(data_type): time_unit, _ = np.datetime_data(dtype) data_type = pa.timestamp(unit=time_unit) s1 = pa.array([1, None, 3, None, 5], type=data_type) - gs1 = gd.Series.from_arrow(s1) - assert isinstance(gs1, gd.Series) + gs1 = cudf.Series.from_arrow(s1) + assert isinstance(gs1, cudf.Series) # We have 64B padded buffers for nulls whereas Arrow returns a minimal # number of bytes, so only check the first byte in this case np.testing.assert_array_equal( @@ -1583,8 +1595,8 @@ def test_to_from_arrow_nulls(data_type): assert pa.Array.equals(s1, gs1.to_arrow()) s2 = pa.array([None, None, None, None, None], type=data_type) - gs2 = gd.Series.from_arrow(s2) - assert isinstance(gs2, gd.Series) + gs2 = cudf.Series.from_arrow(s2) + assert isinstance(gs2, cudf.Series) # We have 64B padded buffers for nulls whereas Arrow returns a minimal # number of bytes, so only check the first byte in this case np.testing.assert_array_equal( @@ -1597,7 +1609,7 @@ def test_to_from_arrow_nulls(data_type): def test_to_arrow_categorical(): df = pd.DataFrame() df["a"] = pd.Series(["a", "b", 
"c"], dtype="category") - gdf = gd.DataFrame.from_pandas(df) + gdf = cudf.DataFrame.from_pandas(df) pa_df = pa.Table.from_pandas( df, preserve_index=False @@ -1617,9 +1629,9 @@ def test_to_arrow_categorical(): def test_from_arrow_missing_categorical(): pd_cat = pd.Categorical(["a", "b", "c"], categories=["a", "b"]) pa_cat = pa.array(pd_cat, from_pandas=True) - gd_cat = gd.Series(pa_cat) + gd_cat = cudf.Series(pa_cat) - assert isinstance(gd_cat, gd.Series) + assert isinstance(gd_cat, cudf.Series) assert_eq( pd.Series(pa_cat.to_pandas()), # PyArrow returns a pd.Categorical gd_cat.to_pandas(), @@ -1629,9 +1641,9 @@ def test_from_arrow_missing_categorical(): def test_to_arrow_missing_categorical(): pd_cat = pd.Categorical(["a", "b", "c"], categories=["a", "b"]) pa_cat = pa.array(pd_cat, from_pandas=True) - gd_cat = gd.Series(pa_cat) + gd_cat = cudf.Series(pa_cat) - assert isinstance(gd_cat, gd.Series) + assert isinstance(gd_cat, cudf.Series) assert pa.Array.equals(pa_cat, gd_cat.to_arrow()) @@ -1644,14 +1656,12 @@ def test_from_scalar_typing(data_type): .astype("datetime64[ms]") ) elif data_type.startswith("datetime64"): - from datetime import date - - scalar = np.datetime64(date.today()).astype("datetime64[ms]") + scalar = np.datetime64(datetime.date.today()).astype("datetime64[ms]") data_type = "datetime64[ms]" else: scalar = np.dtype(data_type).type(np.random.randint(0, 5)) - gdf = gd.DataFrame() + gdf = cudf.DataFrame() gdf["a"] = [1, 2, 3, 4, 5] gdf["b"] = scalar assert gdf["b"].dtype == np.dtype(data_type) @@ -1664,35 +1674,35 @@ def test_from_python_array(data_type): data = memoryview(np_arr) data = arr.array(data.format, data) - gs = gd.Series(data) + gs = cudf.Series(data) np.testing.assert_equal(gs.to_array(), np_arr) def test_series_shape(): ps = pd.Series([1, 2, 3, 4]) - cs = gd.Series([1, 2, 3, 4]) + cs = cudf.Series([1, 2, 3, 4]) assert ps.shape == cs.shape def test_series_shape_empty(): - ps = pd.Series() - cs = gd.Series([]) + ps = 
pd.Series(dtype="float64") + cs = cudf.Series([]) assert ps.shape == cs.shape def test_dataframe_shape(): pdf = pd.DataFrame({"a": [0, 1, 2, 3], "b": [0.1, 0.2, None, 0.3]}) - gdf = gd.DataFrame.from_pandas(pdf) + gdf = cudf.DataFrame.from_pandas(pdf) assert pdf.shape == gdf.shape def test_dataframe_shape_empty(): pdf = pd.DataFrame() - gdf = gd.DataFrame() + gdf = cudf.DataFrame() assert pdf.shape == gdf.shape @@ -1702,14 +1712,12 @@ def test_dataframe_shape_empty(): @pytest.mark.parametrize("dtype", dtypes) @pytest.mark.parametrize("nulls", ["none", "some", "all"]) def test_dataframe_transpose(nulls, num_cols, num_rows, dtype): - pdf = pd.DataFrame() - from string import ascii_lowercase null_rep = np.nan if dtype in ["float32", "float64"] else None for i in range(num_cols): - colname = ascii_lowercase[i] + colname = string.ascii_lowercase[i] data = pd.Series(np.random.randint(0, 26, num_rows).astype(dtype)) if nulls == "some": idx = np.random.choice( @@ -1720,7 +1728,7 @@ def test_dataframe_transpose(nulls, num_cols, num_rows, dtype): data[:] = null_rep pdf[colname] = data - gdf = gd.DataFrame.from_pandas(pdf) + gdf = cudf.DataFrame.from_pandas(pdf) got_function = gdf.transpose() got_property = gdf.T @@ -1735,15 +1743,14 @@ def test_dataframe_transpose(nulls, num_cols, num_rows, dtype): @pytest.mark.parametrize("num_rows", [1, 2, 20]) def test_dataframe_transpose_category(num_cols, num_rows): pdf = pd.DataFrame() - from string import ascii_lowercase for i in range(num_cols): - colname = ascii_lowercase[i] - data = pd.Series(list(ascii_lowercase), dtype="category") + colname = string.ascii_lowercase[i] + data = pd.Series(list(string.ascii_lowercase), dtype="category") data = data.sample(num_rows, replace=True).reset_index(drop=True) pdf[colname] = data - gdf = gd.DataFrame.from_pandas(pdf) + gdf = cudf.DataFrame.from_pandas(pdf) got_function = gdf.transpose() got_property = gdf.T @@ -1755,7 +1762,7 @@ def test_dataframe_transpose_category(num_cols, num_rows): def 
test_generated_column(): - gdf = gd.DataFrame({"a": (i for i in range(5))}) + gdf = cudf.DataFrame({"a": (i for i in range(5))}) assert len(gdf) == 5 @@ -1766,7 +1773,7 @@ def pdf(): @pytest.fixture def gdf(pdf): - return gd.DataFrame.from_pandas(pdf) + return cudf.DataFrame.from_pandas(pdf) @pytest.mark.parametrize( @@ -1806,9 +1813,7 @@ def gdf(pdf): @pytest.mark.parametrize("skipna", [True, False, None]) def test_dataframe_reductions(data, func, skipna): pdf = pd.DataFrame(data=data) - print(func(pdf, skipna=skipna)) - gdf = gd.DataFrame.from_pandas(pdf) - print(func(gdf, skipna=skipna)) + gdf = cudf.DataFrame.from_pandas(pdf) assert_eq(func(pdf, skipna=skipna), func(gdf, skipna=skipna)) @@ -1825,7 +1830,7 @@ def test_dataframe_reductions(data, func, skipna): @pytest.mark.parametrize("func", [lambda df: df.count()]) def test_dataframe_count_reduction(data, func): pdf = pd.DataFrame(data=data) - gdf = gd.DataFrame.from_pandas(pdf) + gdf = cudf.DataFrame.from_pandas(pdf) assert_eq(func(pdf), func(gdf)) @@ -1845,7 +1850,10 @@ def test_dataframe_count_reduction(data, func): @pytest.mark.parametrize("min_count", [-10, -1, 0, 1, 2, 3, 10]) def test_dataframe_min_count_ops(data, ops, skipna, min_count): psr = pd.DataFrame(data) - gsr = gd.DataFrame(data) + gsr = cudf.DataFrame(data) + + if PANDAS_GE_120 and psr.shape[0] * psr.shape[1] < min_count: + pytest.xfail("https://github.com/pandas-dev/pandas/issues/39738") assert_eq( getattr(psr, ops)(skipna=skipna, min_count=min_count), @@ -1942,7 +1950,7 @@ def test_unary_operators(func, pdf, gdf): def test_is_monotonic(gdf): pdf = pd.DataFrame({"x": [1, 2, 3]}, index=[3, 1, 2]) - gdf = gd.DataFrame.from_pandas(pdf) + gdf = cudf.DataFrame.from_pandas(pdf) assert not gdf.index.is_monotonic assert not gdf.index.is_monotonic_increasing assert not gdf.index.is_monotonic_decreasing @@ -1955,7 +1963,7 @@ def test_iter(pdf, gdf): def test_iteritems(gdf): for k, v in gdf.iteritems(): assert k in gdf.columns - assert isinstance(v, 
gd.Series) + assert isinstance(v, cudf.Series) assert_eq(v, gdf[k]) @@ -1967,7 +1975,7 @@ def test_quantile(q, numeric_only): pdf = pd.DataFrame( {"date": ts, "delta": td, "val": np.random.randn(len(ts))} ) - gdf = gd.DataFrame.from_pandas(pdf) + gdf = cudf.DataFrame.from_pandas(pdf) assert_eq(pdf["date"].quantile(q), gdf["date"].quantile(q)) assert_eq(pdf["delta"].quantile(q), gdf["delta"].quantile(q)) @@ -1987,7 +1995,7 @@ def test_quantile(q, numeric_only): def test_empty_quantile(): pdf = pd.DataFrame({"x": []}) - df = gd.DataFrame({"x": []}) + df = cudf.DataFrame({"x": []}) actual = df.quantile() expected = pdf.quantile() @@ -1996,16 +2004,16 @@ def test_empty_quantile(): def test_from_pandas_function(pdf): - gdf = gd.from_pandas(pdf) - assert isinstance(gdf, gd.DataFrame) + gdf = cudf.from_pandas(pdf) + assert isinstance(gdf, cudf.DataFrame) assert_eq(pdf, gdf) - gdf = gd.from_pandas(pdf.x) - assert isinstance(gdf, gd.Series) + gdf = cudf.from_pandas(pdf.x) + assert isinstance(gdf, cudf.Series) assert_eq(pdf.x, gdf) with pytest.raises(TypeError): - gd.from_pandas(123) + cudf.from_pandas(123) @pytest.mark.parametrize("preserve_index", [True, False]) @@ -2020,7 +2028,7 @@ def test_arrow_pandas_compat(pdf, gdf, preserve_index): assert pa.Table.equals(pdf_arrow_table, gdf_arrow_table) - gdf2 = gd.DataFrame.from_arrow(pdf_arrow_table) + gdf2 = cudf.DataFrame.from_arrow(pdf_arrow_table) pdf2 = pdf_arrow_table.to_pandas() assert_eq(pdf2, gdf2) @@ -2033,11 +2041,11 @@ def test_series_hash_encode(nrows): # results in enc_with_name_arr and enc_arr to be same. # And there is no other better way to make hash return same value. # So using an integer name to get constant value back from hash. 
- s = gd.Series(data, name=1) + s = cudf.Series(data, name=1) num_features = 1000 encoded_series = s.hash_encode(num_features) - assert isinstance(encoded_series, gd.Series) + assert isinstance(encoded_series, cudf.Series) enc_arr = encoded_series.to_array() assert np.all(enc_arr >= 0) assert np.max(enc_arr) < num_features @@ -2053,10 +2061,10 @@ def test_cuda_array_interface(dtype): cupy_data = cupy.array(np_data) pd_data = pd.Series(np_data) - cudf_data = gd.Series(cupy_data) + cudf_data = cudf.Series(cupy_data) assert_eq(pd_data, cudf_data) - gdf = gd.DataFrame() + gdf = cudf.DataFrame() gdf["test"] = cupy_data pd_data.name = "test" assert_eq(pd_data, gdf["test"]) @@ -2073,7 +2081,7 @@ def test_from_arrow_chunked_arrays(nelem, nchunks, data_type): pa_chunk_array = pa.chunked_array(np_list_data) expect = pd.Series(pa_chunk_array.to_pandas()) - got = gd.Series(pa_chunk_array) + got = cudf.Series(pa_chunk_array) assert_eq(expect, got) @@ -2087,15 +2095,13 @@ def test_from_arrow_chunked_arrays(nelem, nchunks, data_type): ) expect = pa_table.to_pandas() - got = gd.DataFrame.from_arrow(pa_table) + got = cudf.DataFrame.from_arrow(pa_table) assert_eq(expect, got) @pytest.mark.skip(reason="Test was designed to be run in isolation") def test_gpu_memory_usage_with_boolmask(): - import cudf - ctx = cuda.current_context() def query_GPU_memory(note=""): @@ -2110,7 +2116,7 @@ def query_GPU_memory(note=""): colNames = ["col" + str(iCol) for iCol in range(nCols)] pandasDF = pd.DataFrame(data=dataNumpy, columns=colNames, dtype=np.float32) cudaDF = cudf.core.DataFrame.from_pandas(pandasDF) - boolmask = gd.Series(np.random.randint(1, 2, len(cudaDF)).astype("bool")) + boolmask = cudf.Series(np.random.randint(1, 2, len(cudaDF)).astype("bool")) memory_used = query_GPU_memory() cudaDF = cudaDF[boolmask] @@ -2153,8 +2159,8 @@ def test_dataframe_boolmask(mask_shape): pdf_mask = pd.DataFrame() for col in mask_shape[1]: pdf_mask[col] = np.random.randint(0, 2, mask_shape[0]) > 0 - gdf = 
gd.DataFrame.from_pandas(pdf) - gdf_mask = gd.DataFrame.from_pandas(pdf_mask) + gdf = cudf.DataFrame.from_pandas(pdf) + gdf_mask = cudf.DataFrame.from_pandas(pdf_mask) gdf = gdf[gdf_mask] pdf = pdf[pdf_mask] @@ -2170,7 +2176,7 @@ def test_dataframe_boolmask(mask_shape): [ [True, False, True], pytest.param( - gd.Series([True, False, True]), + cudf.Series([True, False, True]), marks=pytest.mark.xfail( reason="Pandas can't index a multiindex with a Series" ), @@ -2178,7 +2184,7 @@ def test_dataframe_boolmask(mask_shape): ], ) def test_dataframe_multiindex_boolmask(mask): - gdf = gd.DataFrame( + gdf = cudf.DataFrame( {"w": [3, 2, 1], "x": [1, 2, 3], "y": [0, 1, 0], "z": [1, 1, 1]} ) gdg = gdf.groupby(["w", "x"]).count() @@ -2190,7 +2196,7 @@ def test_dataframe_assignment(): pdf = pd.DataFrame() for col in "abc": pdf[col] = np.array([0, 1, 1, -2, 10]) - gdf = gd.DataFrame.from_pandas(pdf) + gdf = cudf.DataFrame.from_pandas(pdf) gdf[gdf < 0] = 999 pdf[pdf < 0] = 999 assert_eq(gdf, pdf) @@ -2202,7 +2208,7 @@ def test_1row_arrow_table(): table = pa.Table.from_batches([batch]) expect = table.to_pandas() - got = gd.DataFrame.from_arrow(table) + got = cudf.DataFrame.from_arrow(table) assert_eq(expect, got) @@ -2211,7 +2217,7 @@ def test_arrow_handle_no_index_name(pdf, gdf): pdf_arrow = pa.Table.from_pandas(pdf) assert pa.Table.equals(pdf_arrow, gdf_arrow) - got = gd.DataFrame.from_arrow(gdf_arrow) + got = cudf.DataFrame.from_arrow(gdf_arrow) expect = pdf_arrow.to_pandas() assert_eq(expect, got) @@ -2224,9 +2230,9 @@ def test_arrow_handle_no_index_name(pdf, gdf): def test_series_digitize(num_rows, num_bins, right, dtype, series_bins): data = np.random.randint(0, 100, num_rows).astype(dtype) bins = np.unique(np.sort(np.random.randint(2, 95, num_bins).astype(dtype))) - s = gd.Series(data) + s = cudf.Series(data) if series_bins: - s_bins = gd.Series(bins) + s_bins = cudf.Series(bins) indices = s.digitize(s_bins, right) else: indices = s.digitize(bins, right) @@ -2236,8 +2242,8 @@ 
def test_series_digitize(num_rows, num_bins, right, dtype, series_bins): def test_series_digitize_invalid_bins(): - s = gd.Series(np.random.randint(0, 30, 80), dtype="int32") - bins = gd.Series([2, None, None, 50, 90], dtype="int32") + s = cudf.Series(np.random.randint(0, 30, 80), dtype="int32") + bins = cudf.Series([2, None, None, 50, 90], dtype="int32") with pytest.raises( ValueError, match="`bins` cannot contain null entries." @@ -2252,7 +2258,7 @@ def test_pandas_non_contiguious(): for col in df.columns: assert df[col].values.flags["C_CONTIGUOUS"] is False - gdf = gd.DataFrame.from_pandas(df) + gdf = cudf.DataFrame.from_pandas(df) assert_eq(gdf.to_pandas(), df) @@ -2270,8 +2276,8 @@ def test_series_all_null(num_elements, null_type): data = [null_type] * num_elements # Typecast Pandas because None will return `object` dtype - expect = pd.Series(data).astype("float64") - got = gd.Series(data) + expect = pd.Series(data, dtype="float64") + got = cudf.Series(data) assert_eq(expect, got) @@ -2279,13 +2285,13 @@ def test_series_all_null(num_elements, null_type): @pytest.mark.parametrize("num_elements", [0, 2, 10, 100]) def test_series_all_valid_nan(num_elements): data = [np.nan] * num_elements - sr = gd.Series(data, nan_as_null=False) + sr = cudf.Series(data, nan_as_null=False) np.testing.assert_equal(sr.null_count, 0) def test_series_rename(): pds = pd.Series([1, 2, 3], name="asdf") - gds = gd.Series([1, 2, 3], name="asdf") + gds = cudf.Series([1, 2, 3], name="asdf") expect = pds.rename("new_name") got = gds.rename("new_name") @@ -2293,12 +2299,12 @@ def test_series_rename(): assert_eq(expect, got) pds = pd.Series(expect) - gds = gd.Series(got) + gds = cudf.Series(got) assert_eq(pds, gds) pds = pd.Series(expect, name="name name") - gds = gd.Series(got, name="name name") + gds = cudf.Series(got, name="name name") assert_eq(pds, gds) @@ -2319,7 +2325,7 @@ def check_frame_series_equality(left, right): check_index_equality(left, right) check_values_equality(left, right) 
- gdf = gd.DataFrame( + gdf = cudf.DataFrame( { "a": np.random.randint(0, 1000, nelem).astype(data_type), "b": np.random.randint(0, 1000, nelem).astype(data_type), @@ -2346,9 +2352,9 @@ def check_frame_series_equality(left, right): def test_tail_for_string(): - gdf = gd.DataFrame() - gdf["id"] = gd.Series(["a", "b"], dtype=np.object) - gdf["v"] = gd.Series([1, 2]) + gdf = cudf.DataFrame() + gdf["id"] = cudf.Series(["a", "b"], dtype=np.object) + gdf["v"] = cudf.Series([1, 2]) assert_eq(gdf.tail(3), gdf.to_pandas().tail(3)) @@ -2431,7 +2437,7 @@ def test_reset_index_inplace(pdf, gdf, drop): @pytest.mark.parametrize("append", [True, False]) @pytest.mark.parametrize("inplace", [True, False]) def test_set_index(data, index, drop, append, inplace): - gdf = gd.DataFrame(data) + gdf = cudf.DataFrame(data) pdf = gdf.to_pandas() expected = pdf.set_index(index, inplace=inplace, drop=drop, append=append) @@ -2457,7 +2463,7 @@ def test_set_index(data, index, drop, append, inplace): @pytest.mark.parametrize("verify_integrity", [True]) @pytest.mark.xfail def test_set_index_verify_integrity(data, index, verify_integrity): - gdf = gd.DataFrame(data) + gdf = cudf.DataFrame(data) gdf.set_index(index, verify_integrity=verify_integrity) @@ -2476,7 +2482,7 @@ def test_set_index_multi(drop, nelem): } ) df["e"] = df["d"].astype("category") - gdf = gd.DataFrame.from_pandas(df) + gdf = cudf.DataFrame.from_pandas(df) assert_eq(gdf.set_index("a", drop=drop), gdf.set_index(["a"], drop=drop)) assert_eq( @@ -2497,7 +2503,7 @@ def test_set_index_multi(drop, nelem): def test_dataframe_reindex_0(copy): # TODO (ptaylor): pandas changes `int` dtype to `float64` # when reindexing and filling new label indices with NaN - gdf = gd.datasets.randomdata( + gdf = cudf.datasets.randomdata( nrows=6, dtypes={ "a": "category", @@ -2514,7 +2520,7 @@ def test_dataframe_reindex_0(copy): @pytest.mark.parametrize("copy", [True, False]) def test_dataframe_reindex_1(copy): index = [-3, 0, 3, 0, -2, 1, 3, 4, 6] - gdf = 
gd.datasets.randomdata( + gdf = cudf.datasets.randomdata( nrows=6, dtypes={"a": "category", "c": float, "d": str} ) pdf = gdf.to_pandas() @@ -2525,7 +2531,7 @@ def test_dataframe_reindex_1(copy): @pytest.mark.parametrize("copy", [True, False]) def test_dataframe_reindex_2(copy): index = [-3, 0, 3, 0, -2, 1, 3, 4, 6] - gdf = gd.datasets.randomdata( + gdf = cudf.datasets.randomdata( nrows=6, dtypes={"a": "category", "c": float, "d": str} ) pdf = gdf.to_pandas() @@ -2539,7 +2545,7 @@ def test_dataframe_reindex_2(copy): @pytest.mark.parametrize("copy", [True, False]) def test_dataframe_reindex_3(copy): columns = ["a", "b", "c", "d", "e"] - gdf = gd.datasets.randomdata( + gdf = cudf.datasets.randomdata( nrows=6, dtypes={"a": "category", "c": float, "d": str} ) pdf = gdf.to_pandas() @@ -2553,7 +2559,7 @@ def test_dataframe_reindex_3(copy): @pytest.mark.parametrize("copy", [True, False]) def test_dataframe_reindex_4(copy): index = [-3, 0, 3, 0, -2, 1, 3, 4, 6] - gdf = gd.datasets.randomdata( + gdf = cudf.datasets.randomdata( nrows=6, dtypes={"a": "category", "c": float, "d": str} ) pdf = gdf.to_pandas() @@ -2567,7 +2573,7 @@ def test_dataframe_reindex_4(copy): @pytest.mark.parametrize("copy", [True, False]) def test_dataframe_reindex_5(copy): columns = ["a", "b", "c", "d", "e"] - gdf = gd.datasets.randomdata( + gdf = cudf.datasets.randomdata( nrows=6, dtypes={"a": "category", "c": float, "d": str} ) pdf = gdf.to_pandas() @@ -2581,7 +2587,7 @@ def test_dataframe_reindex_5(copy): @pytest.mark.parametrize("copy", [True, False]) def test_dataframe_reindex_6(copy): index = [-3, 0, 3, 0, -2, 1, 3, 4, 6] - gdf = gd.datasets.randomdata( + gdf = cudf.datasets.randomdata( nrows=6, dtypes={"a": "category", "c": float, "d": str} ) pdf = gdf.to_pandas() @@ -2595,7 +2601,7 @@ def test_dataframe_reindex_6(copy): @pytest.mark.parametrize("copy", [True, False]) def test_dataframe_reindex_7(copy): columns = ["a", "b", "c", "d", "e"] - gdf = gd.datasets.randomdata( + gdf = 
cudf.datasets.randomdata( nrows=6, dtypes={"a": "category", "c": float, "d": str} ) pdf = gdf.to_pandas() @@ -2609,7 +2615,7 @@ def test_dataframe_reindex_7(copy): @pytest.mark.parametrize("copy", [True, False]) def test_dataframe_reindex_8(copy): index = [-3, 0, 3, 0, -2, 1, 3, 4, 6] - gdf = gd.datasets.randomdata( + gdf = cudf.datasets.randomdata( nrows=6, dtypes={"a": "category", "c": float, "d": str} ) pdf = gdf.to_pandas() @@ -2623,7 +2629,7 @@ def test_dataframe_reindex_8(copy): @pytest.mark.parametrize("copy", [True, False]) def test_dataframe_reindex_9(copy): columns = ["a", "b", "c", "d", "e"] - gdf = gd.datasets.randomdata( + gdf = cudf.datasets.randomdata( nrows=6, dtypes={"a": "category", "c": float, "d": str} ) pdf = gdf.to_pandas() @@ -2638,7 +2644,7 @@ def test_dataframe_reindex_9(copy): def test_dataframe_reindex_10(copy): index = [-3, 0, 3, 0, -2, 1, 3, 4, 6] columns = ["a", "b", "c", "d", "e"] - gdf = gd.datasets.randomdata( + gdf = cudf.datasets.randomdata( nrows=6, dtypes={"a": "category", "c": float, "d": str} ) pdf = gdf.to_pandas() @@ -2658,7 +2664,7 @@ def test_dataframe_reindex_change_dtype(copy): kwargs = {} index = pd.date_range("12/29/2009", periods=10, freq="D") columns = ["a", "b", "c", "d", "e"] - gdf = gd.datasets.randomdata( + gdf = cudf.datasets.randomdata( nrows=6, dtypes={"a": "category", "c": float, "d": str} ) pdf = gdf.to_pandas() @@ -2674,7 +2680,7 @@ def test_dataframe_reindex_change_dtype(copy): @pytest.mark.parametrize("copy", [True, False]) def test_series_categorical_reindex(copy): index = [-3, 0, 3, 0, -2, 1, 3, 4, 6] - gdf = gd.datasets.randomdata(nrows=6, dtypes={"a": "category"}) + gdf = cudf.datasets.randomdata(nrows=6, dtypes={"a": "category"}) pdf = gdf.to_pandas() assert_eq(pdf["a"].reindex(copy=True), gdf["a"].reindex(copy=copy)) assert_eq( @@ -2689,7 +2695,7 @@ def test_series_categorical_reindex(copy): @pytest.mark.parametrize("copy", [True, False]) def test_series_float_reindex(copy): index = [-3, 0, 3, 0, 
-2, 1, 3, 4, 6] - gdf = gd.datasets.randomdata(nrows=6, dtypes={"c": float}) + gdf = cudf.datasets.randomdata(nrows=6, dtypes={"c": float}) pdf = gdf.to_pandas() assert_eq(pdf["c"].reindex(copy=True), gdf["c"].reindex(copy=copy)) assert_eq( @@ -2704,7 +2710,7 @@ def test_series_float_reindex(copy): @pytest.mark.parametrize("copy", [True, False]) def test_series_string_reindex(copy): index = [-3, 0, 3, 0, -2, 1, 3, 4, 6] - gdf = gd.datasets.randomdata(nrows=6, dtypes={"d": str}) + gdf = cudf.datasets.randomdata(nrows=6, dtypes={"d": str}) pdf = gdf.to_pandas() assert_eq(pdf["d"].reindex(copy=True), gdf["d"].reindex(copy=copy)) assert_eq( @@ -2733,7 +2739,7 @@ def test_to_frame(pdf, gdf): def test_dataframe_empty_sort_index(): pdf = pd.DataFrame({"x": []}) - gdf = gd.DataFrame.from_pandas(pdf) + gdf = cudf.DataFrame.from_pandas(pdf) expect = pdf.sort_index() got = gdf.sort_index() @@ -2753,7 +2759,7 @@ def test_dataframe_sort_index( {"b": [1, 3, 2], "a": [1, 4, 3], "c": [4, 1, 5]}, index=[3.0, 1.0, np.nan], ) - gdf = gd.DataFrame.from_pandas(pdf) + gdf = cudf.DataFrame.from_pandas(pdf) expected = pdf.sort_index( axis=axis, @@ -2809,7 +2815,7 @@ def test_dataframe_mulitindex_sort_index( "d": [1, 2, 8], } ).set_index(["b", "a", 1]) - gdf = gd.DataFrame.from_pandas(pdf) + gdf = cudf.DataFrame.from_pandas(pdf) # ignore_index is supported in v.1.0 expected = pdf.sort_index( @@ -2847,7 +2853,7 @@ def test_dataframe_0_row_dtype(dtype): else: data = np.array([1, 2, 3, 4, 5], dtype=dtype) - expect = gd.DataFrame() + expect = cudf.DataFrame() expect["x"] = data expect["y"] = data got = expect.head(0) @@ -2855,7 +2861,7 @@ def test_dataframe_0_row_dtype(dtype): for col_name in got.columns: assert expect[col_name].dtype == got[col_name].dtype - expect = gd.Series(data) + expect = cudf.Series(data) got = expect.head(0) assert expect.dtype == got.dtype @@ -2866,7 +2872,7 @@ def test_series_list_nanasnull(nan_as_null): data = [1.0, 2.0, 3.0, np.nan, None] expect = pa.array(data, 
from_pandas=nan_as_null) - got = gd.Series(data, nan_as_null=nan_as_null).to_arrow() + got = cudf.Series(data, nan_as_null=nan_as_null).to_arrow() # Bug in Arrow 0.14.1 where NaNs aren't handled expect = expect.cast("int64", safe=False) @@ -2876,7 +2882,7 @@ def test_series_list_nanasnull(nan_as_null): def test_column_assignment(): - gdf = gd.datasets.randomdata( + gdf = cudf.datasets.randomdata( nrows=20, dtypes={"a": "category", "b": int, "c": float} ) new_cols = ["q", "r", "s"] @@ -2885,7 +2891,7 @@ def test_column_assignment(): def test_select_dtype(): - gdf = gd.datasets.randomdata( + gdf = cudf.datasets.randomdata( nrows=20, dtypes={"a": "category", "b": int, "c": float, "d": str} ) pdf = gdf.to_pandas() @@ -2943,7 +2949,9 @@ def test_select_dtype(): ), ) - gdf = gd.DataFrame({"A": [3, 4, 5], "C": [1, 2, 3], "D": ["a", "b", "c"]}) + gdf = cudf.DataFrame( + {"A": [3, 4, 5], "C": [1, 2, 3], "D": ["a", "b", "c"]} + ) pdf = gdf.to_pandas() assert_eq( pdf.select_dtypes(include=["object", "int", "category"]), @@ -2954,7 +2962,7 @@ def test_select_dtype(): gdf.select_dtypes(include=["object"], exclude=["category"]), ) - gdf = gd.DataFrame({"a": range(10), "b": range(10, 20)}) + gdf = cudf.DataFrame({"a": range(10), "b": range(10, 20)}) pdf = gdf.to_pandas() assert_eq( pdf.select_dtypes(include=["category"]), @@ -2988,8 +2996,8 @@ def test_select_dtype(): lfunc=pdf.select_dtypes, rfunc=gdf.select_dtypes, ) - gdf = gd.DataFrame( - {"a": gd.Series([], dtype="int"), "b": gd.Series([], dtype="str")} + gdf = cudf.DataFrame( + {"a": cudf.Series([], dtype="int"), "b": cudf.Series([], dtype="str")} ) pdf = gdf.to_pandas() assert_eq( @@ -3003,7 +3011,7 @@ def test_select_dtype(): def test_select_dtype_datetime(): - gdf = gd.datasets.timeseries( + gdf = cudf.datasets.timeseries( start="2000-01-01", end="2000-01-02", freq="3600s", dtypes={"x": int} ) gdf = gdf.reset_index() @@ -3021,7 +3029,7 @@ def test_select_dtype_datetime(): def test_select_dtype_datetime_with_frequency(): 
- gdf = gd.datasets.timeseries( + gdf = cudf.datasets.timeseries( start="2000-01-01", end="2000-01-02", freq="3600s", dtypes={"x": int} ) gdf = gdf.reset_index() @@ -3036,7 +3044,7 @@ def test_select_dtype_datetime_with_frequency(): def test_array_ufunc(): - gdf = gd.DataFrame({"x": [2, 3, 4.0], "y": [9.0, 2.5, 1.1]}) + gdf = cudf.DataFrame({"x": [2, 3, 4.0], "y": [9.0, 2.5, 1.1]}) pdf = gdf.to_pandas() assert_eq(np.sqrt(gdf), np.sqrt(pdf)) @@ -3046,7 +3054,7 @@ def test_array_ufunc(): @pytest.mark.parametrize("nan_value", [-5, -5.0, 0, 5, 5.0, None, "pandas"]) def test_series_to_gpu_array(nan_value): - s = gd.Series([0, 1, None, 3]) + s = cudf.Series([0, 1, None, 3]) np.testing.assert_array_equal( s.to_array(nan_value), s.to_gpu_array(nan_value).copy_to_host() ) @@ -3056,7 +3064,7 @@ def test_dataframe_describe_exclude(): np.random.seed(12) data_length = 10000 - df = gd.DataFrame() + df = cudf.DataFrame() df["x"] = np.random.normal(10, 1, data_length) df["x"] = df.x.astype("int64") df["y"] = np.random.normal(10, 1, data_length) @@ -3071,7 +3079,7 @@ def test_dataframe_describe_include(): np.random.seed(12) data_length = 10000 - df = gd.DataFrame() + df = cudf.DataFrame() df["x"] = np.random.normal(10, 1, data_length) df["x"] = df.x.astype("int64") df["y"] = np.random.normal(10, 1, data_length) @@ -3086,7 +3094,7 @@ def test_dataframe_describe_default(): np.random.seed(12) data_length = 10000 - df = gd.DataFrame() + df = cudf.DataFrame() df["x"] = np.random.normal(10, 1, data_length) df["y"] = np.random.normal(10, 1, data_length) pdf = df.to_pandas() @@ -3100,7 +3108,7 @@ def test_series_describe_include_all(): np.random.seed(12) data_length = 10000 - df = gd.DataFrame() + df = cudf.DataFrame() df["x"] = np.random.normal(10, 1, data_length) df["x"] = df.x.astype("int64") df["y"] = np.random.normal(10, 1, data_length) @@ -3124,7 +3132,7 @@ def test_dataframe_describe_percentiles(): data_length = 10000 sample_percentiles = [0.0, 0.1, 0.33, 0.84, 0.4, 0.99] - df = 
gd.DataFrame() + df = cudf.DataFrame() df["x"] = np.random.normal(10, 1, data_length) df["y"] = np.random.normal(10, 1, data_length) pdf = df.to_pandas() @@ -3138,7 +3146,7 @@ def test_get_numeric_data(): pdf = pd.DataFrame( {"x": [1, 2, 3], "y": [1.0, 2.0, 3.0], "z": ["a", "b", "c"]} ) - gdf = gd.from_pandas(pdf) + gdf = cudf.from_pandas(pdf) assert_eq(pdf._get_numeric_data(), gdf._get_numeric_data()) @@ -3157,7 +3165,7 @@ def test_shift(dtype, period, data_empty): else: data = gen_rand(dtype, 100000) - gdf = gd.DataFrame({"a": gd.Series(data, dtype=dtype)}) + gdf = cudf.DataFrame({"a": cudf.Series(data, dtype=dtype)}) pdf = pd.DataFrame({"a": pd.Series(data, dtype=dtype)}) shifted_outcome = gdf.a.shift(period).fillna(0) @@ -3182,7 +3190,7 @@ def test_diff(dtype, period, data_empty): else: data = gen_rand(dtype, 100000) - gdf = gd.DataFrame({"a": gd.Series(data, dtype=dtype)}) + gdf = cudf.DataFrame({"a": cudf.Series(data, dtype=dtype)}) pdf = pd.DataFrame({"a": pd.Series(data, dtype=dtype)}) expected_outcome = pdf.a.diff(period) @@ -3198,7 +3206,7 @@ def test_diff(dtype, period, data_empty): @pytest.mark.parametrize("nan_as_null", [True, False, None]) def test_dataframe_isnull_isna(df, nan_as_null): - gdf = gd.DataFrame.from_pandas(df, nan_as_null=nan_as_null) + gdf = cudf.DataFrame.from_pandas(df, nan_as_null=nan_as_null) assert_eq(df.isnull(), gdf.isnull()) assert_eq(df.isna(), gdf.isna()) @@ -3213,7 +3221,7 @@ def test_dataframe_isnull_isna(df, nan_as_null): @pytest.mark.parametrize("nan_as_null", [True, False, None]) def test_dataframe_notna_notnull(df, nan_as_null): - gdf = gd.DataFrame.from_pandas(df, nan_as_null=nan_as_null) + gdf = cudf.DataFrame.from_pandas(df, nan_as_null=nan_as_null) assert_eq(df.notnull(), gdf.notnull()) assert_eq(df.notna(), gdf.notna()) @@ -3226,12 +3234,12 @@ def test_dataframe_notna_notnull(df, nan_as_null): def test_ndim(): pdf = pd.DataFrame({"x": range(5), "y": range(5, 10)}) - gdf = gd.DataFrame.from_pandas(pdf) + gdf = 
cudf.DataFrame.from_pandas(pdf) assert pdf.ndim == gdf.ndim assert pdf.x.ndim == gdf.x.ndim - s = pd.Series() - gs = gd.Series() + s = pd.Series(dtype="float64") + gs = cudf.Series() assert s.ndim == gs.ndim @@ -3242,7 +3250,7 @@ def test_ndim(): 0, 5, pd.Series([1, 4, 3, -6], index=["w", "x", "y", "z"]), - gd.Series([-4, -2, 12], index=["x", "y", "z"]), + cudf.Series([-4, -2, 12], index=["x", "y", "z"]), {"w": -1, "x": 15, "y": 2}, ], ) @@ -3268,9 +3276,9 @@ def test_dataframe_round(decimals): "z": np.repeat([-0.6459412758761901], 10), } ) - gdf = gd.DataFrame.from_pandas(pdf) + gdf = cudf.DataFrame.from_pandas(pdf) - if isinstance(decimals, gd.Series): + if isinstance(decimals, cudf.Series): pdecimals = decimals.to_pandas() else: pdecimals = decimals @@ -3323,11 +3331,13 @@ def test_all(data): # Pandas treats `None` in object type columns as True for some reason, so # replacing with `False` if np.array(data).ndim <= 1: - pdata = pd.Series(data).replace([None], False) - gdata = gd.Series.from_pandas(pdata) + pdata = cudf.utils.utils._create_pandas_series(data=data).replace( + [None], False + ) + gdata = cudf.Series.from_pandas(pdata) else: pdata = pd.DataFrame(data, columns=["a", "b"]).replace([None], False) - gdata = gd.DataFrame.from_pandas(pdata) + gdata = cudf.DataFrame.from_pandas(pdata) # test bool_only if pdata["b"].dtype == "bool": @@ -3376,8 +3386,8 @@ def test_all(data): @pytest.mark.parametrize("axis", [0, 1]) def test_any(data, axis): if np.array(data).ndim <= 1: - pdata = pd.Series(data) - gdata = gd.Series.from_pandas(pdata) + pdata = cudf.utils.utils._create_pandas_series(data=data) + gdata = cudf.Series.from_pandas(pdata) if axis == 1: with pytest.raises(NotImplementedError): @@ -3388,7 +3398,7 @@ def test_any(data, axis): assert_eq(got, expected) else: pdata = pd.DataFrame(data, columns=["a", "b"]) - gdata = gd.DataFrame.from_pandas(pdata) + gdata = cudf.DataFrame.from_pandas(pdata) # test bool_only if pdata["b"].dtype == "bool": @@ -3409,7 
+3419,7 @@ def test_any(data, axis): @pytest.mark.parametrize("axis", [0, 1]) def test_empty_dataframe_any(axis): pdf = pd.DataFrame({}, columns=["a", "b"]) - gdf = gd.DataFrame.from_pandas(pdf) + gdf = cudf.DataFrame.from_pandas(pdf) got = gdf.any(axis=axis) expected = pdf.any(axis=axis) assert_eq(got, expected, check_index_type=False) @@ -3420,7 +3430,7 @@ def test_dataframe_sizeof(indexed): rows = int(1e6) index = list(i for i in range(rows)) if indexed else None - gdf = gd.DataFrame({"A": [8] * rows, "B": [32] * rows}, index=index) + gdf = cudf.DataFrame({"A": [8] * rows, "B": [32] * rows}, index=index) for c in gdf._data.columns: assert gdf._index.__sizeof__() == gdf._index.__sizeof__() @@ -3437,19 +3447,19 @@ def test_dataframe_sizeof(indexed): @pytest.mark.parametrize("non_list_data", [123, "abc", "zyx", "rapids", 0.8]) def test_create_dataframe_cols_empty_data(a, b, misc_data, non_list_data): expected = pd.DataFrame({"a": a}) - actual = gd.DataFrame.from_pandas(expected) + actual = cudf.DataFrame.from_pandas(expected) expected["b"] = b actual["b"] = b assert_eq(actual, expected) expected = pd.DataFrame({"a": []}) - actual = gd.DataFrame.from_pandas(expected) + actual = cudf.DataFrame.from_pandas(expected) expected["b"] = misc_data actual["b"] = misc_data assert_eq(actual, expected) expected = pd.DataFrame({"a": a}) - actual = gd.DataFrame.from_pandas(expected) + actual = cudf.DataFrame.from_pandas(expected) expected["b"] = non_list_data actual["b"] = non_list_data assert_eq(actual, expected) @@ -3457,7 +3467,7 @@ def test_create_dataframe_cols_empty_data(a, b, misc_data, non_list_data): def test_empty_dataframe_describe(): pdf = pd.DataFrame({"a": [], "b": []}) - gdf = gd.from_pandas(pdf) + gdf = cudf.from_pandas(pdf) expected = pdf.describe() actual = gdf.describe() @@ -3466,75 +3476,77 @@ def test_empty_dataframe_describe(): def test_as_column_types(): - from cudf.core.column import column - - col = column.as_column(gd.Series([])) + col = 
column.as_column(cudf.Series([])) assert_eq(col.dtype, np.dtype("float64")) - gds = gd.Series(col) - pds = pd.Series(pd.Series([])) + gds = cudf.Series(col) + pds = pd.Series(pd.Series([], dtype="float64")) assert_eq(pds, gds) - col = column.as_column(gd.Series([]), dtype="float32") + col = column.as_column(cudf.Series([]), dtype="float32") assert_eq(col.dtype, np.dtype("float32")) - gds = gd.Series(col) + gds = cudf.Series(col) pds = pd.Series(pd.Series([], dtype="float32")) assert_eq(pds, gds) - col = column.as_column(gd.Series([]), dtype="str") + col = column.as_column(cudf.Series([]), dtype="str") assert_eq(col.dtype, np.dtype("object")) - gds = gd.Series(col) + gds = cudf.Series(col) pds = pd.Series(pd.Series([], dtype="str")) assert_eq(pds, gds) - col = column.as_column(gd.Series([]), dtype="object") + col = column.as_column(cudf.Series([]), dtype="object") assert_eq(col.dtype, np.dtype("object")) - gds = gd.Series(col) + gds = cudf.Series(col) pds = pd.Series(pd.Series([], dtype="object")) assert_eq(pds, gds) pds = pd.Series(np.array([1, 2, 3]), dtype="float32") - gds = gd.Series(column.as_column(np.array([1, 2, 3]), dtype="float32")) + gds = cudf.Series(column.as_column(np.array([1, 2, 3]), dtype="float32")) assert_eq(pds, gds) pds = pd.Series([1, 2, 3], dtype="float32") - gds = gd.Series([1, 2, 3], dtype="float32") + gds = cudf.Series([1, 2, 3], dtype="float32") assert_eq(pds, gds) - pds = pd.Series([]) - gds = gd.Series(column.as_column(pds)) + pds = pd.Series([], dtype="float64") + gds = cudf.Series(column.as_column(pds)) assert_eq(pds, gds) pds = pd.Series([1, 2, 4], dtype="int64") - gds = gd.Series(column.as_column(gd.Series([1, 2, 4]), dtype="int64")) + gds = cudf.Series(column.as_column(cudf.Series([1, 2, 4]), dtype="int64")) assert_eq(pds, gds) pds = pd.Series([1.2, 18.0, 9.0], dtype="float32") - gds = gd.Series( - column.as_column(gd.Series([1.2, 18.0, 9.0]), dtype="float32") + gds = cudf.Series( + column.as_column(cudf.Series([1.2, 18.0, 9.0]), 
dtype="float32") ) assert_eq(pds, gds) pds = pd.Series([1.2, 18.0, 9.0], dtype="str") - gds = gd.Series(column.as_column(gd.Series([1.2, 18.0, 9.0]), dtype="str")) + gds = cudf.Series( + column.as_column(cudf.Series([1.2, 18.0, 9.0]), dtype="str") + ) assert_eq(pds, gds) pds = pd.Series(pd.Index(["1", "18", "9"]), dtype="int") - gds = gd.Series(gd.core.index.StringIndex(["1", "18", "9"]), dtype="int") + gds = cudf.Series( + cudf.core.index.StringIndex(["1", "18", "9"]), dtype="int" + ) assert_eq(pds, gds) def test_one_row_head(): - gdf = gd.DataFrame({"name": ["carl"], "score": [100]}, index=[123]) + gdf = cudf.DataFrame({"name": ["carl"], "score": [100]}, index=[123]) pdf = gdf.to_pandas() head_gdf = gdf.head() @@ -3547,7 +3559,7 @@ def test_one_row_head(): @pytest.mark.parametrize("as_dtype", NUMERIC_TYPES) def test_series_astype_numeric_to_numeric(dtype, as_dtype): psr = pd.Series([1, 2, 4, 3], dtype=dtype) - gsr = gd.from_pandas(psr) + gsr = cudf.from_pandas(psr) assert_eq(psr.astype(as_dtype), gsr.astype(as_dtype)) @@ -3555,9 +3567,9 @@ def test_series_astype_numeric_to_numeric(dtype, as_dtype): @pytest.mark.parametrize("as_dtype", NUMERIC_TYPES) def test_series_astype_numeric_to_numeric_nulls(dtype, as_dtype): data = [1, 2, None, 3] - sr = gd.Series(data, dtype=dtype) + sr = cudf.Series(data, dtype=dtype) got = sr.astype(as_dtype) - expect = gd.Series([1, 2, None, 3], dtype=as_dtype) + expect = cudf.Series([1, 2, None, 3], dtype=as_dtype) assert_eq(expect, got) @@ -3575,7 +3587,7 @@ def test_series_astype_numeric_to_numeric_nulls(dtype, as_dtype): ) def test_series_astype_numeric_to_other(dtype, as_dtype): psr = pd.Series([1, 2, 3], dtype=dtype) - gsr = gd.from_pandas(psr) + gsr = cudf.from_pandas(psr) assert_eq(psr.astype(as_dtype), gsr.astype(as_dtype)) @@ -3599,7 +3611,7 @@ def test_series_astype_string_to_other(as_dtype): else: data = ["1", "2", "3"] psr = pd.Series(data) - gsr = gd.from_pandas(psr) + gsr = cudf.from_pandas(psr) 
assert_eq(psr.astype(as_dtype), gsr.astype(as_dtype)) @@ -3616,7 +3628,7 @@ def test_series_astype_string_to_other(as_dtype): def test_series_astype_datetime_to_other(as_dtype): data = ["2001-01-01", "2002-02-02", "2001-01-05"] psr = pd.Series(data) - gsr = gd.from_pandas(psr) + gsr = cudf.from_pandas(psr) assert_eq(psr.astype(as_dtype), gsr.astype(as_dtype)) @@ -3632,7 +3644,7 @@ def test_series_astype_datetime_to_other(as_dtype): def test_series_astype_datetime_to_string(inp): dtype, expect = inp base_date = "2011-01-01" - sr = gd.Series([base_date], dtype=dtype) + sr = cudf.Series([base_date], dtype=dtype) got = sr.astype(str)[0] assert expect == got @@ -3657,19 +3669,19 @@ def test_series_astype_categorical_to_other(as_dtype): else: data = [1, 2, 3, 1] psr = pd.Series(data, dtype="category") - gsr = gd.from_pandas(psr) + gsr = cudf.from_pandas(psr) assert_eq(psr.astype(as_dtype), gsr.astype(as_dtype)) @pytest.mark.parametrize("ordered", [True, False]) def test_series_astype_to_categorical_ordered(ordered): psr = pd.Series([1, 2, 3, 1], dtype="category") - gsr = gd.from_pandas(psr) + gsr = cudf.from_pandas(psr) ordered_dtype_pd = pd.CategoricalDtype( categories=[1, 2, 3], ordered=ordered ) - ordered_dtype_gd = gd.CategoricalDtype.from_pandas(ordered_dtype_pd) + ordered_dtype_gd = cudf.CategoricalDtype.from_pandas(ordered_dtype_pd) assert_eq( psr.astype("int32").astype(ordered_dtype_pd).astype("int32"), gsr.astype("int32").astype(ordered_dtype_gd).astype("int32"), @@ -3682,11 +3694,11 @@ def test_series_astype_cat_ordered_to_unordered(ordered): pd_to_dtype = pd.CategoricalDtype( categories=[1, 2, 3], ordered=not ordered ) - gd_dtype = gd.CategoricalDtype.from_pandas(pd_dtype) - gd_to_dtype = gd.CategoricalDtype.from_pandas(pd_to_dtype) + gd_dtype = cudf.CategoricalDtype.from_pandas(pd_dtype) + gd_to_dtype = cudf.CategoricalDtype.from_pandas(pd_to_dtype) psr = pd.Series([1, 2, 3], dtype=pd_dtype) - gsr = gd.Series([1, 2, 3], dtype=gd_dtype) + gsr = cudf.Series([1, 
2, 3], dtype=gd_dtype) expect = psr.astype(pd_to_dtype) got = gsr.astype(gd_to_dtype) @@ -3698,62 +3710,63 @@ def test_series_astype_null_cases(): data = [1, 2, None, 3] # numerical to other - assert_eq(gd.Series(data, dtype="str"), gd.Series(data).astype("str")) + assert_eq(cudf.Series(data, dtype="str"), cudf.Series(data).astype("str")) assert_eq( - gd.Series(data, dtype="category"), gd.Series(data).astype("category") + cudf.Series(data, dtype="category"), + cudf.Series(data).astype("category"), ) assert_eq( - gd.Series(data, dtype="float32"), - gd.Series(data, dtype="int32").astype("float32"), + cudf.Series(data, dtype="float32"), + cudf.Series(data, dtype="int32").astype("float32"), ) assert_eq( - gd.Series(data, dtype="float32"), - gd.Series(data, dtype="uint32").astype("float32"), + cudf.Series(data, dtype="float32"), + cudf.Series(data, dtype="uint32").astype("float32"), ) assert_eq( - gd.Series(data, dtype="datetime64[ms]"), - gd.Series(data).astype("datetime64[ms]"), + cudf.Series(data, dtype="datetime64[ms]"), + cudf.Series(data).astype("datetime64[ms]"), ) # categorical to other assert_eq( - gd.Series(data, dtype="str"), - gd.Series(data, dtype="category").astype("str"), + cudf.Series(data, dtype="str"), + cudf.Series(data, dtype="category").astype("str"), ) assert_eq( - gd.Series(data, dtype="float32"), - gd.Series(data, dtype="category").astype("float32"), + cudf.Series(data, dtype="float32"), + cudf.Series(data, dtype="category").astype("float32"), ) assert_eq( - gd.Series(data, dtype="datetime64[ms]"), - gd.Series(data, dtype="category").astype("datetime64[ms]"), + cudf.Series(data, dtype="datetime64[ms]"), + cudf.Series(data, dtype="category").astype("datetime64[ms]"), ) # string to other assert_eq( - gd.Series([1, 2, None, 3], dtype="int32"), - gd.Series(["1", "2", None, "3"]).astype("int32"), + cudf.Series([1, 2, None, 3], dtype="int32"), + cudf.Series(["1", "2", None, "3"]).astype("int32"), ) assert_eq( - gd.Series( + cudf.Series( ["2001-01-01", 
"2001-02-01", None, "2001-03-01"], dtype="datetime64[ms]", ), - gd.Series(["2001-01-01", "2001-02-01", None, "2001-03-01"]).astype( + cudf.Series(["2001-01-01", "2001-02-01", None, "2001-03-01"]).astype( "datetime64[ms]" ), ) assert_eq( - gd.Series(["a", "b", "c", None], dtype="category").to_pandas(), - gd.Series(["a", "b", "c", None]).astype("category").to_pandas(), + cudf.Series(["a", "b", "c", None], dtype="category").to_pandas(), + cudf.Series(["a", "b", "c", None]).astype("category").to_pandas(), ) # datetime to other @@ -3764,20 +3777,21 @@ def test_series_astype_null_cases(): "2001-03-01 00:00:00.000000", ] assert_eq( - gd.Series(data), gd.Series(data, dtype="datetime64[us]").astype("str"), + cudf.Series(data), + cudf.Series(data, dtype="datetime64[us]").astype("str"), ) assert_eq( pd.Series(data, dtype="datetime64[ns]").astype("category"), - gd.from_pandas(pd.Series(data, dtype="datetime64[ns]")).astype( + cudf.from_pandas(pd.Series(data, dtype="datetime64[ns]")).astype( "category" ), ) def test_series_astype_null_categorical(): - sr = gd.Series([None, None, None], dtype="category") - expect = gd.Series([None, None, None], dtype="int32") + sr = cudf.Series([None, None, None], dtype="category") + expect = cudf.Series([None, None, None], dtype="int32") got = sr.astype("int32") assert_eq(expect, got) @@ -3801,19 +3815,19 @@ def test_series_astype_null_categorical(): ) def test_create_dataframe_from_list_like(data): pdf = pd.DataFrame(data, index=["count", "mean", "std", "min"]) - gdf = gd.DataFrame(data, index=["count", "mean", "std", "min"]) + gdf = cudf.DataFrame(data, index=["count", "mean", "std", "min"]) assert_eq(pdf, gdf) pdf = pd.DataFrame(data) - gdf = gd.DataFrame(data) + gdf = cudf.DataFrame(data) assert_eq(pdf, gdf) def test_create_dataframe_column(): pdf = pd.DataFrame(columns=["a", "b", "c"], index=["A", "Z", "X"]) - gdf = gd.DataFrame(columns=["a", "b", "c"], index=["A", "Z", "X"]) + gdf = cudf.DataFrame(columns=["a", "b", "c"], index=["A", "Z", 
"X"]) assert_eq(pdf, gdf) @@ -3822,7 +3836,7 @@ def test_create_dataframe_column(): columns=["a", "b", "c"], index=["A", "Z", "X"], ) - gdf = gd.DataFrame( + gdf = cudf.DataFrame( {"a": [1, 2, 3], "b": [2, 3, 5]}, columns=["a", "b", "c"], index=["A", "Z", "X"], @@ -3842,8 +3856,8 @@ def test_create_dataframe_column(): ], ) def test_series_values_host_property(data): - pds = pd.Series(data) - gds = gd.Series(data) + pds = cudf.utils.utils._create_pandas_series(data=data) + gds = cudf.Series(data) np.testing.assert_array_equal(pds.values, gds.values_host) @@ -3865,8 +3879,8 @@ def test_series_values_host_property(data): ], ) def test_series_values_property(data): - pds = pd.Series(data) - gds = gd.Series(data) + pds = cudf.utils.utils._create_pandas_series(data=data) + gds = cudf.Series(data) gds_vals = gds.values assert isinstance(gds_vals, cupy.ndarray) np.testing.assert_array_equal(gds_vals.get(), pds.values) @@ -3911,7 +3925,7 @@ def test_series_values_property(data): ) def test_df_values_property(data): pdf = pd.DataFrame.from_dict(data) - gdf = gd.DataFrame.from_pandas(pdf) + gdf = cudf.DataFrame.from_pandas(pdf) pmtr = pdf.values gmtr = gdf.values.get() @@ -3927,7 +3941,7 @@ def test_value_counts(): } ) - gdf = gd.DataFrame( + gdf = cudf.DataFrame( { "numeric": [1, 2, 3, 4, 5, 6, 1, 2, 4] * 10, "alpha": ["u", "h", "d", "a", "m", "u", "h", "d", "a"] * 10, @@ -3973,11 +3987,12 @@ def test_value_counts(): ) def test_isin_numeric(data, values): index = np.random.randint(0, 100, len(data)) - psr = pd.Series(data, index=index) - gsr = gd.Series.from_pandas(psr) + psr = cudf.utils.utils._create_pandas_series(data=data, index=index) + gsr = cudf.Series.from_pandas(psr, nan_as_null=False) - got = gsr.isin(values) expected = psr.isin(values) + got = gsr.isin(values) + assert_eq(got, expected) @@ -4026,8 +4041,8 @@ def test_isin_numeric(data, values): ], ) def test_isin_datetime(data, values): - psr = pd.Series(data) - gsr = gd.Series.from_pandas(psr) + psr = 
cudf.utils.utils._create_pandas_series(data=data) + gsr = cudf.Series.from_pandas(psr) got = gsr.isin(values) expected = psr.isin(values) @@ -4052,19 +4067,18 @@ def test_isin_datetime(data, values): ["12", "14", "19"], pytest.param( [12, 14, 19], - marks=[ - pytest.mark.xfail( - reason="pandas's failure here seems like a bug " - "given the reverse succeeds" - ) - ], + marks=pytest.mark.xfail( + not PANDAS_GE_120, + reason="pandas's failure here seems like a bug(in < 1.2) " + "given the reverse succeeds", + ), ), ["is", "this", "is", "this", "is"], ], ) def test_isin_string(data, values): - psr = pd.Series(data) - gsr = gd.Series.from_pandas(psr) + psr = cudf.utils.utils._create_pandas_series(data=data) + gsr = cudf.Series.from_pandas(psr) got = gsr.isin(values) expected = psr.isin(values) @@ -4092,8 +4106,8 @@ def test_isin_string(data, values): ], ) def test_isin_categorical(data, values): - psr = pd.Series(data) - gsr = gd.Series.from_pandas(psr) + psr = cudf.utils.utils._create_pandas_series(data=data) + gsr = cudf.Series.from_pandas(psr) got = gsr.isin(values) expected = psr.isin(values) @@ -4126,8 +4140,8 @@ def test_isin_categorical(data, values): ], ) def test_isin_index(data, values): - psr = pd.Series(data) - gsr = gd.Series.from_pandas(psr) + psr = cudf.utils.utils._create_pandas_series(data=data) + gsr = cudf.Series.from_pandas(psr) got = gsr.index.isin(values) expected = psr.index.isin(values) @@ -4189,12 +4203,12 @@ def test_isin_index(data, values): ) def test_isin_multiindex(data, values, level, err): pmdx = data - gmdx = gd.from_pandas(data) + gmdx = cudf.from_pandas(data) if err is None: expected = pmdx.isin(values, level=level) if isinstance(values, pd.MultiIndex): - values = gd.from_pandas(values) + values = cudf.from_pandas(values) got = gmdx.isin(values, level=level) assert_eq(got, expected) @@ -4268,12 +4282,10 @@ def test_isin_multiindex(data, values, level, err): ], ) def test_isin_dataframe(data, values): - from cudf.utils.dtypes import 
is_scalar - pdf = data - gdf = gd.from_pandas(pdf) + gdf = cudf.from_pandas(pdf) - if is_scalar(values): + if cudf.utils.dtypes.is_scalar(values): assert_exceptions_equal( lfunc=pdf.isin, rfunc=gdf.isin, @@ -4285,18 +4297,20 @@ def test_isin_dataframe(data, values): expected = pdf.isin(values) except ValueError as e: if str(e) == "Lengths must match.": - # xref https://github.com/pandas-dev/pandas/issues/34256 pytest.xfail( - "https://github.com/pandas-dev/pandas/issues/34256" + not PANDAS_GE_110, + "https://github.com/pandas-dev/pandas/issues/34256", ) + if isinstance(values, (pd.DataFrame, pd.Series)): - values = gd.from_pandas(values) + values = cudf.from_pandas(values) + got = gdf.isin(values) assert_eq(got, expected) def test_constructor_properties(): - df = gd.DataFrame() + df = cudf.DataFrame() key1 = "a" key2 = "b" val1 = np.array([123], dtype=np.float64) @@ -4307,16 +4321,16 @@ def test_constructor_properties(): # Correct use of _constructor (for DataFrame) assert_eq(df, df._constructor({key1: val1, key2: val2})) - # Correct use of _constructor (for gd.Series) + # Correct use of _constructor (for cudf.Series) assert_eq(df[key1], df[key2]._constructor(val1, name=key1)) # Correct use of _constructor_sliced (for DataFrame) assert_eq(df[key1], df._constructor_sliced(val1, name=key1)) - # Correct use of _constructor_expanddim (for gd.Series) + # Correct use of _constructor_expanddim (for cudf.Series) assert_eq(df, df[key2]._constructor_expanddim({key1: val1, key2: val2})) - # Incorrect use of _constructor_sliced (Raises for gd.Series) + # Incorrect use of _constructor_sliced (Raises for cudf.Series) with pytest.raises(NotImplementedError): df[key1]._constructor_sliced @@ -4335,14 +4349,14 @@ def test_df_astype_numeric_to_all(dtype, as_dtype): elif "float" in dtype: data = [1.0, 2.0, None, 4.0, np.nan, -7.0] - gdf = gd.DataFrame() + gdf = cudf.DataFrame() - gdf["foo"] = gd.Series(data, dtype=dtype) - gdf["bar"] = gd.Series(data, dtype=dtype) + gdf["foo"] = 
cudf.Series(data, dtype=dtype) + gdf["bar"] = cudf.Series(data, dtype=dtype) - insert_data = gd.Series(data, dtype=dtype) + insert_data = cudf.Series(data, dtype=dtype) - expect = gd.DataFrame() + expect = cudf.DataFrame() expect["foo"] = insert_data.astype(as_dtype) expect["bar"] = insert_data.astype(as_dtype) @@ -4375,11 +4389,11 @@ def test_df_astype_string_to_other(as_dtype): elif "float" in as_dtype: data = [1.0, 2.0, 3.0, np.nan] - insert_data = gd.Series.from_pandas(pd.Series(data, dtype="str")) - expect_data = gd.Series(data, dtype=as_dtype) + insert_data = cudf.Series.from_pandas(pd.Series(data, dtype="str")) + expect_data = cudf.Series(data, dtype=as_dtype) - gdf = gd.DataFrame() - expect = gd.DataFrame() + gdf = cudf.DataFrame() + expect = cudf.DataFrame() gdf["foo"] = insert_data gdf["bar"] = insert_data @@ -4410,28 +4424,28 @@ def test_df_astype_datetime_to_other(as_dtype): None, ] - gdf = gd.DataFrame() - expect = gd.DataFrame() + gdf = cudf.DataFrame() + expect = cudf.DataFrame() - gdf["foo"] = gd.Series(data, dtype="datetime64[ms]") - gdf["bar"] = gd.Series(data, dtype="datetime64[ms]") + gdf["foo"] = cudf.Series(data, dtype="datetime64[ms]") + gdf["bar"] = cudf.Series(data, dtype="datetime64[ms]") if as_dtype == "int64": - expect["foo"] = gd.Series( + expect["foo"] = cudf.Series( [690595200000, 1102118400000, 1473724800000, None], dtype="int64" ) - expect["bar"] = gd.Series( + expect["bar"] = cudf.Series( [690595200000, 1102118400000, 1473724800000, None], dtype="int64" ) elif as_dtype == "str": - expect["foo"] = gd.Series(data, dtype="str") - expect["bar"] = gd.Series(data, dtype="str") + expect["foo"] = cudf.Series(data, dtype="str") + expect["bar"] = cudf.Series(data, dtype="str") elif as_dtype == "category": - expect["foo"] = gd.Series(gdf["foo"], dtype="category") - expect["bar"] = gd.Series(gdf["bar"], dtype="category") + expect["foo"] = cudf.Series(gdf["foo"], dtype="category") + expect["bar"] = cudf.Series(gdf["bar"], dtype="category") 
else: - expect["foo"] = gd.Series(data, dtype=as_dtype) - expect["bar"] = gd.Series(data, dtype=as_dtype) + expect["foo"] = cudf.Series(data, dtype=as_dtype) + expect["bar"] = cudf.Series(data, dtype=as_dtype) got = gdf.astype(as_dtype) @@ -4460,7 +4474,7 @@ def test_df_astype_categorical_to_other(as_dtype): pdf = pd.DataFrame() pdf["foo"] = psr pdf["bar"] = psr - gdf = gd.DataFrame.from_pandas(pdf) + gdf = cudf.DataFrame.from_pandas(pdf) assert_eq(pdf.astype(as_dtype), gdf.astype(as_dtype)) @@ -4470,12 +4484,12 @@ def test_df_astype_to_categorical_ordered(ordered): pdf = pd.DataFrame() pdf["foo"] = psr pdf["bar"] = psr - gdf = gd.DataFrame.from_pandas(pdf) + gdf = cudf.DataFrame.from_pandas(pdf) ordered_dtype_pd = pd.CategoricalDtype( categories=[1, 2, 3], ordered=ordered ) - ordered_dtype_gd = gd.CategoricalDtype.from_pandas(ordered_dtype_pd) + ordered_dtype_gd = cudf.CategoricalDtype.from_pandas(ordered_dtype_pd) assert_eq( pdf.astype(ordered_dtype_pd).astype("int32"), @@ -4489,7 +4503,7 @@ def test_df_astype_to_categorical_ordered(ordered): + [("category", {"ordered": True}), ("category", {"ordered": False})], ) def test_empty_df_astype(dtype, args): - df = gd.DataFrame() + df = cudf.DataFrame() kwargs = {} kwargs.update(args) assert_eq(df, df.astype(dtype=dtype, **kwargs)) @@ -4509,7 +4523,7 @@ def test_empty_df_astype(dtype, args): ], ) def test_series_astype_error_handling(errors): - sr = gd.Series(["random", "words"]) + sr = cudf.Series(["random", "words"]) got = sr.astype("datetime64", errors=errors) assert_eq(sr, got) @@ -4527,12 +4541,12 @@ def test_df_constructor_dtype(dtype): else: data = [1, 2, 3, None] - sr = gd.Series(data, dtype=dtype) + sr = cudf.Series(data, dtype=dtype) - expect = gd.DataFrame() + expect = cudf.DataFrame() expect["foo"] = sr expect["bar"] = sr - got = gd.DataFrame({"foo": data, "bar": data}, dtype=dtype) + got = cudf.DataFrame({"foo": data, "bar": data}, dtype=dtype) assert_eq(expect, got) @@ -4540,31 +4554,31 @@ def 
test_df_constructor_dtype(dtype): @pytest.mark.parametrize( "data", [ - gd.datasets.randomdata( + cudf.datasets.randomdata( nrows=10, dtypes={"a": "category", "b": int, "c": float, "d": int} ), - gd.datasets.randomdata( + cudf.datasets.randomdata( nrows=10, dtypes={"a": "category", "b": int, "c": float, "d": str} ), - gd.datasets.randomdata( + cudf.datasets.randomdata( nrows=10, dtypes={"a": bool, "b": int, "c": float, "d": str} ), - gd.DataFrame(), - gd.DataFrame({"a": [0, 1, 2], "b": [1, None, 3]}), - gd.DataFrame( + cudf.DataFrame(), + cudf.DataFrame({"a": [0, 1, 2], "b": [1, None, 3]}), + cudf.DataFrame( { "a": [1, 2, 3, 4], "b": [7, np.NaN, 9, 10], "c": [np.NaN, np.NaN, np.NaN, np.NaN], - "d": gd.Series([None, None, None, None], dtype="int64"), + "d": cudf.Series([None, None, None, None], dtype="int64"), "e": [100, None, 200, None], - "f": gd.Series([10, None, np.NaN, 11], nan_as_null=False), + "f": cudf.Series([10, None, np.NaN, 11], nan_as_null=False), } ), - gd.DataFrame( + cudf.DataFrame( { "a": [10, 11, 12, 13, 14, 15], - "b": gd.Series( + "b": cudf.Series( [10, None, np.NaN, 2234, None, np.NaN], nan_as_null=False ), } @@ -4586,25 +4600,25 @@ def test_rowwise_ops(data, op, skipna): expected = getattr(pdf, op)(axis=1, skipna=skipna) got = getattr(gdf, op)(axis=1, skipna=skipna) - assert_eq(expected, got, check_less_precise=7) + assert_eq(expected, got, check_exact=False) @pytest.mark.parametrize( "op", ["max", "min", "sum", "product", "mean", "var", "std"] ) def test_rowwise_ops_nullable_dtypes_all_null(op): - gdf = gd.DataFrame( + gdf = cudf.DataFrame( { "a": [1, 2, 3, 4], "b": [7, np.NaN, 9, 10], "c": [np.NaN, np.NaN, np.NaN, np.NaN], - "d": gd.Series([None, None, None, None], dtype="int64"), + "d": cudf.Series([None, None, None, None], dtype="int64"), "e": [100, None, 200, None], - "f": gd.Series([10, None, np.NaN, 11], nan_as_null=False), + "f": cudf.Series([10, None, np.NaN, 11], nan_as_null=False), } ) - expected = gd.Series([None, None, None, None], 
dtype="float64") + expected = cudf.Series([None, None, None, None], dtype="float64") if op in ("var", "std"): got = getattr(gdf, op)(axis=1, ddof=0, skipna=False) @@ -4620,7 +4634,7 @@ def test_rowwise_ops_nullable_dtypes_all_null(op): [ ( "max", - gd.Series( + cudf.Series( [10.0, None, np.NaN, 2234.0, None, np.NaN], dtype="float64", nan_as_null=False, @@ -4628,7 +4642,7 @@ def test_rowwise_ops_nullable_dtypes_all_null(op): ), ( "min", - gd.Series( + cudf.Series( [10.0, None, np.NaN, 13.0, None, np.NaN], dtype="float64", nan_as_null=False, @@ -4636,7 +4650,7 @@ def test_rowwise_ops_nullable_dtypes_all_null(op): ), ( "sum", - gd.Series( + cudf.Series( [20.0, None, np.NaN, 2247.0, None, np.NaN], dtype="float64", nan_as_null=False, @@ -4644,7 +4658,7 @@ def test_rowwise_ops_nullable_dtypes_all_null(op): ), ( "product", - gd.Series( + cudf.Series( [100.0, None, np.NaN, 29042.0, None, np.NaN], dtype="float64", nan_as_null=False, @@ -4652,7 +4666,7 @@ def test_rowwise_ops_nullable_dtypes_all_null(op): ), ( "mean", - gd.Series( + cudf.Series( [10.0, None, np.NaN, 1123.5, None, np.NaN], dtype="float64", nan_as_null=False, @@ -4660,7 +4674,7 @@ def test_rowwise_ops_nullable_dtypes_all_null(op): ), ( "var", - gd.Series( + cudf.Series( [0.0, None, np.NaN, 1233210.25, None, np.NaN], dtype="float64", nan_as_null=False, @@ -4668,7 +4682,7 @@ def test_rowwise_ops_nullable_dtypes_all_null(op): ), ( "std", - gd.Series( + cudf.Series( [0.0, None, np.NaN, 1110.5, None, np.NaN], dtype="float64", nan_as_null=False, @@ -4677,10 +4691,10 @@ def test_rowwise_ops_nullable_dtypes_all_null(op): ], ) def test_rowwise_ops_nullable_dtypes_partial_null(op, expected): - gdf = gd.DataFrame( + gdf = cudf.DataFrame( { "a": [10, 11, 12, 13, 14, 15], - "b": gd.Series( + "b": cudf.Series( [10, None, np.NaN, 2234, None, np.NaN], nan_as_null=False, ), } @@ -4698,38 +4712,44 @@ def test_rowwise_ops_nullable_dtypes_partial_null(op, expected): @pytest.mark.parametrize( "op,expected", [ - ("max", 
gd.Series([10, None, None, 2234, None, 453], dtype="int64",),), - ("min", gd.Series([10, None, None, 13, None, 15], dtype="int64",),), - ("sum", gd.Series([20, None, None, 2247, None, 468], dtype="int64",),), + ( + "max", + cudf.Series([10, None, None, 2234, None, 453], dtype="int64",), + ), + ("min", cudf.Series([10, None, None, 13, None, 15], dtype="int64",),), + ( + "sum", + cudf.Series([20, None, None, 2247, None, 468], dtype="int64",), + ), ( "product", - gd.Series([100, None, None, 29042, None, 6795], dtype="int64",), + cudf.Series([100, None, None, 29042, None, 6795], dtype="int64",), ), ( "mean", - gd.Series( + cudf.Series( [10.0, None, None, 1123.5, None, 234.0], dtype="float32", ), ), ( "var", - gd.Series( + cudf.Series( [0.0, None, None, 1233210.25, None, 47961.0], dtype="float32", ), ), ( "std", - gd.Series( + cudf.Series( [0.0, None, None, 1110.5, None, 219.0], dtype="float32", ), ), ], ) def test_rowwise_ops_nullable_int_dtypes(op, expected): - gdf = gd.DataFrame( + gdf = cudf.DataFrame( { "a": [10, 11, None, 13, None, 15], - "b": gd.Series( + "b": cudf.Series( [10, None, 323, 2234, None, 453], nan_as_null=False, ), } @@ -4748,62 +4768,62 @@ def test_rowwise_ops_nullable_int_dtypes(op, expected): "data", [ { - "t1": gd.Series( + "t1": cudf.Series( ["2020-08-01 09:00:00", "1920-05-01 10:30:00"], dtype=" 0: if nulls == "some": @@ -431,14 +429,14 @@ def test_datetime_unique(data, nulls): @pytest.mark.parametrize( "data", [ - [], + pd.Series([], dtype="datetime64[ns]"), pd.Series(pd.date_range("2010-01-01", "2010-02-01")), pd.Series([None, None], dtype="datetime64[ns]"), ], ) @pytest.mark.parametrize("nulls", ["none", "some"]) def test_datetime_nunique(data, nulls): - psr = pd.Series(data) + psr = data.copy() if len(data) > 0: if nulls == "some": @@ -540,7 +538,7 @@ def test_datetime_dataframe(): [ None, [], - pd.Series([]), + pd.Series([], dtype="float64"), pd.Index([]), pd.Series([1, 2, 3]), pd.Series([0, 1, -1]), @@ -673,7 +671,7 @@ def 
test_to_datetime_not_implemented(): [ 1, [], - pd.Series([]), + pd.Series([], dtype="float64"), pd.Index([]), pd.Series([1, 2, 3]), pd.Series([1, 2.4, 3]), @@ -1182,7 +1180,7 @@ def test_datetime_stats(data, dtype, stat): assert_eq(expected, actual) -@pytest.mark.parametrize("op", ["max", "min"]) +@pytest.mark.parametrize("op", ["max", "min", "std", "median"]) @pytest.mark.parametrize( "data", [ @@ -1201,10 +1199,14 @@ def test_datetime_reductions(data, op, dtype): actual = getattr(sr, op)() expected = getattr(psr, op)() - if np.isnat(expected.to_numpy()) and np.isnat(actual): + if ( + expected is pd.NaT + and actual is pd.NaT + or (np.isnat(expected.to_numpy()) and np.isnat(actual)) + ): assert True else: - assert_eq(expected.to_numpy(), actual) + assert_eq(expected, actual) @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_dropna.py b/python/cudf/cudf/tests/test_dropna.py index 08378361188..d01627309d6 100644 --- a/python/cudf/cudf/tests/test_dropna.py +++ b/python/cudf/cudf/tests/test_dropna.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. 
+ import numpy as np import pandas as pd import pytest @@ -21,7 +22,7 @@ @pytest.mark.parametrize("inplace", [True, False]) def test_dropna_series(data, nulls, inplace): - psr = pd.Series(data) + psr = cudf.utils.utils._create_pandas_series(data=data) if len(data) > 0: if nulls == "one": @@ -40,14 +41,12 @@ def test_dropna_series(data, nulls, inplace): if gsr.null_count == len(gsr): check_dtype = False + expected = psr.dropna() + actual = gsr.dropna() + if inplace: - psr.dropna() - gsr.dropna() expected = psr actual = gsr - else: - expected = psr.dropna() - actual = gsr.dropna() assert_eq(expected, actual, check_dtype=check_dtype) @@ -71,14 +70,12 @@ def test_dropna_dataframe(data, how, axis, inplace): pdf = pd.DataFrame(data) gdf = cudf.from_pandas(pdf) + expected = pdf.dropna(axis=axis, how=how, inplace=inplace) + actual = gdf.dropna(axis=axis, how=how, inplace=inplace) + if inplace: - pdf.dropna(axis=axis, how=how, inplace=inplace) - gdf.dropna(axis=axis, how=how, inplace=inplace) expected = pdf actual = gdf - else: - expected = pdf.dropna(axis=axis, how=how, inplace=inplace) - actual = gdf.dropna(axis=axis, how=how, inplace=inplace) assert_eq(expected, actual) @@ -192,18 +189,14 @@ def test_dropna_thresh_cols(thresh, subset, inplace): ) gdf = cudf.from_pandas(pdf) + expected = pdf.dropna( + axis=1, thresh=thresh, subset=subset, inplace=inplace + ) + actual = gdf.dropna(axis=1, thresh=thresh, subset=subset, inplace=inplace) + if inplace: - pdf.dropna(axis=1, thresh=thresh, subset=subset, inplace=inplace) - gdf.dropna(axis=1, thresh=thresh, subset=subset, inplace=inplace) expected = pdf actual = gdf - else: - expected = pdf.dropna( - axis=1, thresh=thresh, subset=subset, inplace=inplace - ) - actual = gdf.dropna( - axis=1, thresh=thresh, subset=subset, inplace=inplace - ) assert_eq( expected, actual, diff --git a/python/cudf/cudf/tests/test_duplicates.py b/python/cudf/cudf/tests/test_duplicates.py index 29f1c31a1ee..f721b7a28e5 100644 --- 
a/python/cudf/cudf/tests/test_duplicates.py +++ b/python/cudf/cudf/tests/test_duplicates.py @@ -1,4 +1,7 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. + +import itertools as it +import random import numpy as np import pytest @@ -56,7 +59,7 @@ def test_duplicated_with_misspelled_column_name(subset): ], ) def test_drop_duplicates_series(data, keep): - pds = Series(data) + pds = cudf.utils.utils._create_pandas_series(data) gds = cudf.from_pandas(pds) assert_df(pds.drop_duplicates(keep=keep), gds.drop_duplicates(keep=keep)) @@ -277,9 +280,6 @@ def test_drop_duplicates_empty(df): @pytest.mark.parametrize("num_columns", [3, 4, 5]) def test_dataframe_drop_duplicates_numeric_method(num_columns): - import itertools as it - import random - comb = list(it.permutations(range(num_columns), num_columns)) shuf = list(comb) random.Random(num_columns).shuffle(shuf) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 294443500a9..8011510d340 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -1,10 +1,12 @@ -# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. 
+import datetime import itertools import numpy as np import pandas as pd import pytest +from numba import cuda from numpy.testing import assert_array_equal import cudf @@ -284,8 +286,6 @@ def foo(df): def test_groupby_apply_grouped(): - from numba import cuda - np.random.seed(0) df = DataFrame() nelem = 20 @@ -732,12 +732,12 @@ def test_groupby_multi_agg_multi_groupby(): def test_groupby_datetime_multi_agg_multi_groupby(): - from datetime import datetime, timedelta - pdf = pd.DataFrame( { "a": pd.date_range( - datetime.now(), datetime.now() + timedelta(9), freq="D" + datetime.datetime.now(), + datetime.datetime.now() + datetime.timedelta(9), + freq="D", ), "b": np.random.randint(0, 5, 10), "c": np.random.randint(0, 5, 10), @@ -1496,7 +1496,8 @@ def test_groupby_apply_return_series_dataframe(cust_func): @pytest.mark.parametrize( - "pdf", [pd.DataFrame(), pd.DataFrame({"a": []}), pd.Series([])] + "pdf", + [pd.DataFrame(), pd.DataFrame({"a": []}), pd.Series([], dtype="float64")], ) def test_groupby_no_keys(pdf): gdf = cudf.from_pandas(pdf) @@ -1509,7 +1510,8 @@ def test_groupby_no_keys(pdf): @pytest.mark.parametrize( - "pdf", [pd.DataFrame(), pd.DataFrame({"a": []}), pd.Series([])] + "pdf", + [pd.DataFrame(), pd.DataFrame({"a": []}), pd.Series([], dtype="float64")], ) def test_groupby_apply_no_keys(pdf): gdf = cudf.from_pandas(pdf) diff --git a/python/cudf/cudf/tests/test_hdf.py b/python/cudf/cudf/tests/test_hdf.py index d5b18a08281..f908d5f51f5 100644 --- a/python/cudf/cudf/tests/test_hdf.py +++ b/python/cudf/cudf/tests/test_hdf.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018, NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. 
import os from string import ascii_letters @@ -14,7 +14,7 @@ import tables # noqa F401 except ImportError: pytest.skip( - "PyTables is not installed and is required for HDF " "reading/writing", + "PyTables is not installed and is required for HDF reading/writing", allow_module_level=True, ) @@ -34,7 +34,7 @@ def pdf(request): nrows=nrows, ncols=ncols, data_gen_f=lambda r, c: r, r_idx_type="i" ) # Delete the name of the column index, and rename the row index - del test_pdf.columns.name + test_pdf.columns.name = None test_pdf.index.name = "test_index" # Cast all the column dtypes to objects, rename them, and then cast to @@ -94,14 +94,16 @@ def test_hdf_reader(hdf_files, columns): expect_df = pd.read_hdf(hdf_df_file, columns=columns) got_df = cudf.read_hdf(hdf_df_file, columns=columns) - assert_eq(expect_df, got_df, check_categorical=False) + assert_eq( + expect_df, got_df, check_categorical=False, check_index_type=False + ) for column in hdf_series.keys(): expect_series = pd.read_hdf(hdf_series[column]) got_series = cudf.read_hdf(hdf_series[column]) - assert_eq(expect_series, got_series) + assert_eq(expect_series, got_series, check_index_type=False) @pytest.mark.parametrize("format", ["fixed", "table"]) @@ -130,7 +132,7 @@ def test_hdf_writer(tmpdir, pdf, gdf, complib, format): expect = pd.read_hdf(pdf_df_fname) got = pd.read_hdf(gdf_df_fname) - assert_eq(expect, got) + assert_eq(expect, got, check_index_type=False) for column in pdf.columns: pdf_series_fname = tmpdir.join(column + "_" + "pdf_series.hdf") @@ -149,4 +151,4 @@ def test_hdf_writer(tmpdir, pdf, gdf, complib, format): expect_series = pd.read_hdf(pdf_series_fname) got_series = pd.read_hdf(gdf_series_fname) - assert_eq(expect_series, got_series) + assert_eq(expect_series, got_series, check_index_type=False) diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index f806b0a912c..688efef555b 100644 --- a/python/cudf/cudf/tests/test_index.py +++ 
b/python/cudf/cudf/tests/test_index.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. """ Test related to Index @@ -11,7 +11,7 @@ import pytest import cudf -from cudf.core import DataFrame +from cudf.core._compat import PANDAS_GE_110 from cudf.core.index import ( CategoricalIndex, DatetimeIndex, @@ -34,7 +34,7 @@ def test_df_set_index_from_series(): - df = DataFrame() + df = cudf.DataFrame() df["a"] = list(range(10)) df["b"] = list(range(0, 20, 2)) @@ -48,7 +48,7 @@ def test_df_set_index_from_series(): def test_df_set_index_from_name(): - df = DataFrame() + df = cudf.DataFrame() df["a"] = list(range(10)) df["b"] = list(range(0, 20, 2)) @@ -64,7 +64,7 @@ def test_df_set_index_from_name(): def test_df_slice_empty_index(): - df = DataFrame() + df = cudf.DataFrame() assert isinstance(df.index, RangeIndex) assert isinstance(df.index[:1], RangeIndex) with pytest.raises(IndexError): @@ -152,10 +152,10 @@ def test_categorical_index(): pdf = pd.DataFrame() pdf["a"] = [1, 2, 3] pdf["index"] = pd.Categorical(["a", "b", "c"]) - initial_df = DataFrame.from_pandas(pdf) + initial_df = cudf.from_pandas(pdf) pdf = pdf.set_index("index") - gdf1 = DataFrame.from_pandas(pdf) - gdf2 = DataFrame() + gdf1 = cudf.from_pandas(pdf) + gdf2 = cudf.DataFrame() gdf2["a"] = [1, 2, 3] gdf2["index"] = pd.Categorical(["a", "b", "c"]) assert_eq(initial_df.index, gdf2.index) @@ -272,7 +272,7 @@ def test_index_rename_preserves_arg(): def test_set_index_as_property(): - cdf = DataFrame() + cdf = cudf.DataFrame() col1 = np.arange(10) col2 = np.arange(0, 20, 2) cdf["a"] = col1 @@ -565,7 +565,7 @@ def test_empty_df_head_tail_index(n): ( pd.CategoricalIndex(["a", "b", "c", "a", "b", "c"]), pd.CategoricalIndex(["a", "b", "c", "a", "b", "c"]) != "a", - "h", + "a", None, ), ( @@ -803,9 +803,12 @@ def test_index_difference(data, other, sort): and gd_other.dtype.kind != "f" or (gd_data.dtype.kind != "f" and gd_other.dtype.kind == "f") ): - 
pytest.xfail( - "Bug in Pandas: https://github.com/pandas-dev/pandas/issues/35217" + pytest.mark.xfail( + condition=not PANDAS_GE_110, + reason="Bug in Pandas: " + "https://github.com/pandas-dev/pandas/issues/35217", ) + expected = pd_data.difference(pd_other, sort=sort) actual = gd_data.difference(gd_other, sort=sort) assert_eq(expected, actual) @@ -867,9 +870,12 @@ def test_index_equals(data, other): if ( gd_data.dtype.kind == "f" or gd_other.dtype.kind == "f" ) and cudf.utils.dtypes.is_mixed_with_object_dtype(gd_data, gd_other): - pytest.xfail( - "Bug in Pandas: https://github.com/pandas-dev/pandas/issues/35217" + pytest.mark.xfail( + condition=not PANDAS_GE_110, + reason="Bug in Pandas: " + "https://github.com/pandas-dev/pandas/issues/35217", ) + expected = pd_data.equals(pd_other) actual = gd_data.equals(gd_other) assert_eq(expected, actual) @@ -921,8 +927,10 @@ def test_index_categories_equal(data, other): and gd_other.dtype.kind != "f" or (gd_data.dtype.kind != "f" and gd_other.dtype.kind == "f") ): - pytest.xfail( - "Bug in Pandas: https://github.com/pandas-dev/pandas/issues/35217" + pytest.mark.xfail( + condition=not PANDAS_GE_110, + reason="Bug in Pandas: " + "https://github.com/pandas-dev/pandas/issues/35217", ) expected = pd_data.equals(pd_other) @@ -983,7 +991,9 @@ def test_index_equal_misc(data, other): actual = gd_data.equals(np.array(gd_other)) assert_eq(expected, actual) - expected = pd_data.equals(pd.Series(pd_other)) + expected = pd_data.equals( + cudf.utils.utils._create_pandas_series(data=pd_other) + ) actual = gd_data.equals(cudf.Series(gd_other)) assert_eq(expected, actual) @@ -1408,7 +1418,7 @@ def test_multiindex_sample_basic(n, frac, replace, axis): "int": [1, 3, 5, 4, 2], }, ) - mul_index = cudf.Index(DataFrame.from_pandas(pdf)) + mul_index = cudf.Index(cudf.from_pandas(pdf)) random_state = 0 try: diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py index 5229881df25..73a074c0376 100644 --- 
a/python/cudf/cudf/tests/test_indexing.py +++ b/python/cudf/cudf/tests/test_indexing.py @@ -1,3 +1,5 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. + from itertools import combinations import cupy @@ -6,8 +8,7 @@ import pytest import cudf -from cudf import DataFrame, Series -from cudf.core._compat import PANDAS_GE_110 +from cudf.core._compat import PANDAS_GE_110, PANDAS_GE_120 from cudf.tests import utils from cudf.tests.utils import INTEGER_TYPES, assert_eq, assert_exceptions_equal @@ -59,7 +60,11 @@ def pdf_gdf_multi(): pd.Series(range(3, 12)), pd.Series(range(0, 9, 2)), ), - (Series(range(12)), Series(range(3, 12)), Series(range(0, 9, 2))), + ( + cudf.Series(range(12)), + cudf.Series(range(3, 12)), + cudf.Series(range(0, 9, 2)), + ), ( [i in range(12) for i in range(20)], [i in range(3, 12) for i in range(12)], @@ -96,7 +101,7 @@ def pdf_gdf_multi(): ) def test_series_indexing(i1, i2, i3): a1 = np.arange(20) - series = Series(a1) + series = cudf.Series(a1) # Indexing sr1 = series.iloc[i1] assert sr1.null_count == 0 @@ -123,7 +128,7 @@ def test_series_indexing_large_size(): gsr = cudf.Series(cupy.ones(n_elem)) gsr[0] = None got = gsr[gsr.isna()] - expect = Series([None], dtype="float64") + expect = cudf.Series([None], dtype="float64") assert_eq(expect, got) @@ -133,7 +138,7 @@ def test_series_indexing_large_size(): "arg", ["b", ["a", "c"], slice(1, 2, 1), [True, False, True]] ) def test_series_get_item(psr, arg): - gsr = Series.from_pandas(psr) + gsr = cudf.from_pandas(psr) expect = psr[arg] got = gsr[arg] @@ -142,7 +147,7 @@ def test_series_get_item(psr, arg): def test_dataframe_column_name_indexing(): - df = DataFrame() + df = cudf.DataFrame() data = np.asarray(range(10), dtype=np.int32) df["a"] = data df[1] = data @@ -159,7 +164,7 @@ def test_dataframe_column_name_indexing(): pdf["key2"] = np.random.randint(0, 3, nelem) pdf[1] = np.arange(1, 1 + nelem) pdf[2] = np.random.random(nelem) - df = DataFrame.from_pandas(pdf) + df = cudf.from_pandas(pdf) 
assert_eq(df[df.columns], df) assert_eq(df[df.columns[:1]], df[["key1"]]) @@ -172,7 +177,7 @@ def test_dataframe_column_name_indexing(): df = pd.DataFrame() for i in range(0, 10): df[i] = range(nelem) - gdf = DataFrame.from_pandas(df) + gdf = cudf.DataFrame.from_pandas(df) assert_eq(gdf, df) assert_eq(gdf[gdf.columns], gdf) @@ -180,7 +185,7 @@ def test_dataframe_column_name_indexing(): def test_dataframe_slicing(): - df = DataFrame() + df = cudf.DataFrame() size = 123 df["a"] = ha = np.random.randint(low=0, high=100, size=size).astype( np.int32 @@ -237,7 +242,7 @@ def test_dataframe_loc(scalar, step): } ) - df = DataFrame.from_pandas(pdf) + df = cudf.DataFrame.from_pandas(pdf) assert_eq(df.loc[:, ["a"]], pdf.loc[:, ["a"]]) @@ -309,7 +314,7 @@ def test_dataframe_loc(scalar, step): def test_dataframe_loc_duplicate_index_scalar(): pdf = pd.DataFrame({"a": [1, 2, 3, 4, 5]}, index=[1, 2, 1, 4, 2]) - gdf = DataFrame.from_pandas(pdf) + gdf = cudf.DataFrame.from_pandas(pdf) assert_eq(pdf.loc[2], gdf.loc[2]) @@ -323,13 +328,13 @@ def test_dataframe_loc_mask(mask, arg): pdf = pd.DataFrame( {"a": ["a", "b", "c", "d", "e"], "b": ["f", "g", "h", "i", "j"]} ) - gdf = DataFrame.from_pandas(pdf) + gdf = cudf.DataFrame.from_pandas(pdf) assert_eq(pdf.loc[mask, arg], gdf.loc[mask, arg]) def test_dataframe_loc_outbound(): - df = DataFrame() + df = cudf.DataFrame() size = 10 df["a"] = ha = np.random.randint(low=0, high=100, size=size).astype( np.int32 @@ -345,7 +350,7 @@ def test_dataframe_loc_outbound(): def test_series_loc_numerical(): ps = pd.Series([1, 2, 3, 4, 5], index=[5, 6, 7, 8, 9]) - gs = Series.from_pandas(ps) + gs = cudf.Series.from_pandas(ps) assert_eq(ps.loc[5], gs.loc[5]) assert_eq(ps.loc[6], gs.loc[6]) @@ -363,7 +368,7 @@ def test_series_loc_numerical(): def test_series_loc_float_index(): ps = pd.Series([1, 2, 3, 4, 5], index=[5.43, 6.34, 7.34, 8.0, 9.1]) - gs = Series.from_pandas(ps) + gs = cudf.Series.from_pandas(ps) assert_eq(ps.loc[5.43], gs.loc[5.43]) 
assert_eq(ps.loc[8], gs.loc[8]) @@ -381,7 +386,7 @@ def test_series_loc_string(): ps = pd.Series( [1, 2, 3, 4, 5], index=["one", "two", "three", "four", "five"] ) - gs = Series.from_pandas(ps) + gs = cudf.Series.from_pandas(ps) assert_eq(ps.loc["one"], gs.loc["one"]) assert_eq(ps.loc["five"], gs.loc["five"]) @@ -404,7 +409,7 @@ def test_series_loc_datetime(): ps = pd.Series( [1, 2, 3, 4, 5], index=pd.date_range("20010101", "20010105") ) - gs = Series.from_pandas(ps) + gs = cudf.Series.from_pandas(ps) # a few different ways of specifying a datetime label: assert_eq(ps.loc["20010101"], gs.loc["20010101"]) @@ -465,7 +470,7 @@ def test_series_loc_categorical(): ps = pd.Series( [1, 2, 3, 4, 5], index=pd.Categorical(["a", "b", "c", "d", "e"]) ) - gs = Series.from_pandas(ps) + gs = cudf.Series.from_pandas(ps) assert_eq(ps.loc["a"], gs.loc["a"]) assert_eq(ps.loc["e"], gs.loc["e"]) @@ -529,12 +534,12 @@ def test_dataframe_series_loc_multiindex(obj): @pytest.mark.parametrize("nelem", [2, 5, 20, 100]) def test_series_iloc(nelem): - # create random series + # create random cudf.Series np.random.seed(12) ps = pd.Series(np.random.sample(nelem)) - # gpu series - gs = Series(ps) + # gpu cudf.Series + gs = cudf.Series(ps) # positive tests for indexing np.testing.assert_allclose(gs.iloc[-1 * nelem], ps.iloc[-1 * nelem]) @@ -565,7 +570,7 @@ def test_series_iloc(nelem): @pytest.mark.parametrize("nelem", [2, 5, 20, 100]) def test_dataframe_iloc(nelem): - gdf = DataFrame() + gdf = cudf.DataFrame() gdf["a"] = ha = np.random.randint(low=0, high=100, size=nelem).astype( np.int32 @@ -617,7 +622,7 @@ def test_dataframe_iloc(nelem): @pytest.mark.xfail(raises=AssertionError, reason="Series.index are different") def test_dataframe_iloc_tuple(): - gdf = DataFrame() + gdf = cudf.DataFrame() nelem = 123 gdf["a"] = ha = np.random.randint(low=0, high=100, size=nelem).astype( np.int32 @@ -639,7 +644,7 @@ def test_dataframe_iloc_tuple(): raises=IndexError, reason="positional indexers are 
out-of-bounds" ) def test_dataframe_iloc_index_error(): - gdf = DataFrame() + gdf = cudf.DataFrame() nelem = 123 gdf["a"] = ha = np.random.randint(low=0, high=100, size=nelem).astype( np.int32 @@ -660,7 +665,7 @@ def assert_col(g, p): @pytest.mark.parametrize("ntake", [0, 1, 10, 123, 122, 200]) def test_dataframe_take(ntake): np.random.seed(0) - df = DataFrame() + df = cudf.DataFrame() nelem = 123 df["ii"] = np.random.randint(0, 20, nelem) @@ -679,7 +684,7 @@ def test_dataframe_take(ntake): @pytest.mark.parametrize("ntake", [1, 2, 8, 9]) def test_dataframe_take_with_multiIndex(ntake): np.random.seed(0) - df = DataFrame( + df = cudf.DataFrame( index=cudf.MultiIndex( levels=[["lama", "cow", "falcon"], ["speed", "weight", "length"]], codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]], @@ -705,7 +710,7 @@ def test_series_take(ntake, keep_index): nelem = 123 data = np.random.randint(0, 20, nelem) - sr = Series(data) + sr = cudf.Series(data) take_indices = np.random.randint(0, len(sr), ntake) @@ -723,7 +728,7 @@ def test_series_take(ntake, keep_index): def test_series_take_positional(): psr = pd.Series([1, 2, 3, 4, 5], index=["a", "b", "c", "d", "e"]) - gsr = Series.from_pandas(psr) + gsr = cudf.Series.from_pandas(psr) take_indices = [1, 2, 0, 3] @@ -737,7 +742,7 @@ def test_series_take_positional(): @pytest.mark.parametrize("slice_start", [None, 0, 1, 3, 10, -10]) @pytest.mark.parametrize("slice_end", [None, 0, 1, 30, 50, -1]) def test_dataframe_masked_slicing(nelem, slice_start, slice_end): - gdf = DataFrame() + gdf = cudf.DataFrame() gdf["a"] = list(range(nelem)) gdf["b"] = list(range(nelem, 2 * nelem)) gdf["a"] = gdf["a"].set_mask(utils.random_bitmask(nelem)) @@ -754,13 +759,13 @@ def do_slice(x): def test_dataframe_boolean_mask_with_None(): pdf = pd.DataFrame({"a": [0, 1, 2, 3], "b": [0.1, 0.2, None, 0.3]}) - gdf = DataFrame.from_pandas(pdf) + gdf = cudf.DataFrame.from_pandas(pdf) pdf_masked = pdf[[True, False, True, False]] gdf_masked = gdf[[True, 
False, True, False]] assert_eq(pdf_masked, gdf_masked) with pytest.raises(ValueError): - gdf[Series([True, False, None, False])] + gdf[cudf.Series([True, False, None, False])] @pytest.mark.parametrize("dtype", [int, float, str]) @@ -840,12 +845,12 @@ def test_dataframe_apply_boolean_mask(): "c": ["a", None, "b", "c"], } ) - gdf = DataFrame.from_pandas(pdf) + gdf = cudf.DataFrame.from_pandas(pdf) assert_eq(pdf[[True, False, True, False]], gdf[[True, False, True, False]]) """ -This test compares cudf and Pandas dataframe boolean indexing. +This test compares cudf and Pandas DataFrame boolean indexing. """ @@ -973,7 +978,10 @@ def test_series_setitem_datetime(): assert_eq(psr, gsr) -@pytest.mark.xfail(reason="Pandas will coerce to object datatype here") +@pytest.mark.xfail( + condition=not PANDAS_GE_120, + reason="Pandas will coerce to object datatype here", +) def test_series_setitem_datetime_coerced(): psr = pd.Series(["2001", "2002", "2003"], dtype="datetime64[ns]") gsr = cudf.from_pandas(psr) @@ -1156,7 +1164,7 @@ def test_sliced_indexing(): a = list(range(4, 4 + 150)) b = list(range(0, 0 + 150)) pdf = pd.DataFrame({"a": a, "b": b}) - gdf = DataFrame.from_pandas(pdf) + gdf = cudf.DataFrame.from_pandas(pdf) pdf = pdf.set_index("a") gdf = gdf.set_index("a") pidx = pdf.index[:75] diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py index d99897584ec..8692057aa58 100644 --- a/python/cudf/cudf/tests/test_joining.py +++ b/python/cudf/cudf/tests/test_joining.py @@ -5,7 +5,7 @@ import pytest import cudf -from cudf.core import DataFrame, Series +from cudf.core._compat import PANDAS_GE_120 from cudf.core.dtypes import CategoricalDtype from cudf.tests.utils import ( INTEGER_TYPES, @@ -71,7 +71,7 @@ def pd_odd_joins(left, right, join_type): @pytest.mark.parametrize("aa,bb,how,method", make_params()) def test_dataframe_join_how(aa, bb, how, method): - df = DataFrame() + df = cudf.DataFrame() df["a"] = aa df["b"] = bb @@ -132,8 +132,7 @@ 
def work_gdf(df): def _check_series(expect, got): magic = 0xDEADBEAF - # print("expect\n", expect) - # print("got\n", got.to_string(nrows=None)) + direct_equal = np.all(expect.values == got.to_array()) nanfilled_equal = np.all( expect.fillna(magic).values == got.fillna(magic).to_array() @@ -147,7 +146,7 @@ def _check_series(expect, got): def test_dataframe_join_suffix(): np.random.seed(0) - df = DataFrame() + df = cudf.DataFrame() for k in "abc": df[k] = np.random.randint(0, 5, 5) @@ -174,12 +173,12 @@ def test_dataframe_join_suffix(): def test_dataframe_join_cats(): - lhs = DataFrame() + lhs = cudf.DataFrame() lhs["a"] = pd.Categorical(list("aababcabbc"), categories=list("abc")) lhs["b"] = bb = np.arange(len(lhs)) lhs = lhs.set_index("a") - rhs = DataFrame() + rhs = cudf.DataFrame() rhs["a"] = pd.Categorical(list("abcac"), categories=list("abc")) rhs["c"] = cc = np.arange(len(rhs)) rhs = rhs.set_index("a") @@ -242,8 +241,8 @@ def test_dataframe_join_mismatch_cats(how): pdf1["join_col"] = pdf1["join_col"].astype("category") pdf2["join_col"] = pdf2["join_col"].astype("category") - gdf1 = DataFrame.from_pandas(pdf1) - gdf2 = DataFrame.from_pandas(pdf2) + gdf1 = cudf.from_pandas(pdf1) + gdf2 = cudf.from_pandas(pdf2) gdf1 = gdf1.set_index("join_col") gdf2 = gdf2.set_index("join_col") @@ -273,13 +272,13 @@ def test_dataframe_merge_on(on): np.random.seed(0) # Make cuDF - df_left = DataFrame() + df_left = cudf.DataFrame() nelem = 500 df_left["key1"] = np.random.randint(0, 40, nelem) df_left["key2"] = np.random.randint(0, 50, nelem) df_left["left_val"] = np.arange(nelem) - df_right = DataFrame() + df_right = cudf.DataFrame() nelem = 500 df_right["key1"] = np.random.randint(0, 30, nelem) df_right["key2"] = np.random.randint(0, 50, nelem) @@ -339,13 +338,13 @@ def test_dataframe_merge_on_unknown_column(): np.random.seed(0) # Make cuDF - df_left = DataFrame() + df_left = cudf.DataFrame() nelem = 500 df_left["key1"] = np.random.randint(0, 40, nelem) df_left["key2"] = 
np.random.randint(0, 50, nelem) df_left["left_val"] = np.arange(nelem) - df_right = DataFrame() + df_right = cudf.DataFrame() nelem = 500 df_right["key1"] = np.random.randint(0, 30, nelem) df_right["key2"] = np.random.randint(0, 50, nelem) @@ -360,13 +359,13 @@ def test_dataframe_merge_no_common_column(): np.random.seed(0) # Make cuDF - df_left = DataFrame() + df_left = cudf.DataFrame() nelem = 500 df_left["key1"] = np.random.randint(0, 40, nelem) df_left["key2"] = np.random.randint(0, 50, nelem) df_left["left_val"] = np.arange(nelem) - df_right = DataFrame() + df_right = cudf.DataFrame() nelem = 500 df_right["key3"] = np.random.randint(0, 30, nelem) df_right["key4"] = np.random.randint(0, 50, nelem) @@ -378,18 +377,18 @@ def test_dataframe_merge_no_common_column(): def test_dataframe_empty_merge(): - gdf1 = DataFrame({"a": [], "b": []}) - gdf2 = DataFrame({"a": [], "c": []}) + gdf1 = cudf.DataFrame({"a": [], "b": []}) + gdf2 = cudf.DataFrame({"a": [], "c": []}) - expect = DataFrame({"a": [], "b": [], "c": []}) + expect = cudf.DataFrame({"a": [], "b": [], "c": []}) got = gdf1.merge(gdf2, how="left", on=["a"]) assert_eq(expect, got) def test_dataframe_merge_order(): - gdf1 = DataFrame() - gdf2 = DataFrame() + gdf1 = cudf.DataFrame() + gdf2 = cudf.DataFrame() gdf1["id"] = [10, 11] gdf1["timestamp"] = [1, 2] gdf1["a"] = [3, 4] @@ -457,8 +456,8 @@ def test_dataframe_pairs_of_triples(pairs, max, rows, how): pdf_left[left_column] = np.random.randint(0, max, rows) for right_column in pairs[1]: pdf_right[right_column] = np.random.randint(0, max, rows) - gdf_left = DataFrame.from_pandas(pdf_left) - gdf_right = DataFrame.from_pandas(pdf_right) + gdf_left = cudf.from_pandas(pdf_left) + gdf_right = cudf.from_pandas(pdf_right) if not set(pdf_left.columns).intersection(pdf_right.columns): with pytest.raises( pd.core.reshape.merge.MergeError, @@ -493,10 +492,6 @@ def test_dataframe_pairs_of_triples(pairs, max, rows, how): def test_safe_merging_with_left_empty(): - import numpy as 
np - import pandas as pd - - from cudf import DataFrame np.random.seed(0) @@ -507,8 +502,8 @@ def test_safe_merging_with_left_empty(): pdf_left[left_column] = np.random.randint(0, 10, 0) for right_column in pairs[1]: pdf_right[right_column] = np.random.randint(0, 10, 5) - gdf_left = DataFrame.from_pandas(pdf_left) - gdf_right = DataFrame.from_pandas(pdf_right) + gdf_left = cudf.from_pandas(pdf_left) + gdf_right = cudf.from_pandas(pdf_right) pdf_result = pdf_left.merge(pdf_right) gdf_result = gdf_left.merge(gdf_right) @@ -541,16 +536,17 @@ def test_empty_joins(how, left_empty, right_empty): @pytest.mark.xfail( + condition=not PANDAS_GE_120, reason="left_on/right_on produces undefined results with 0" - "index and is disabled" + "index and is disabled", ) def test_merge_left_index_zero(): left = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6]}, index=[0, 1, 2, 3, 4, 5]) right = pd.DataFrame( {"y": [10, 20, 30, 6, 5, 4]}, index=[0, 1, 2, 3, 4, 6] ) - gleft = DataFrame.from_pandas(left) - gright = DataFrame.from_pandas(right) + gleft = cudf.from_pandas(left) + gright = cudf.from_pandas(right) pd_merge = left.merge(right, left_on="x", right_on="y") gd_merge = gleft.merge(gright, left_on="x", right_on="y") @@ -571,8 +567,8 @@ def test_merge_left_right_index_left_right_on_zero_kwargs(kwargs): right = pd.DataFrame( {"y": [10, 20, 30, 6, 5, 4]}, index=[0, 1, 2, 3, 4, 6] ) - gleft = DataFrame.from_pandas(left) - gright = DataFrame.from_pandas(right) + gleft = cudf.from_pandas(left) + gright = cudf.from_pandas(right) pd_merge = left.merge(right, **kwargs) gd_merge = gleft.merge(gright, **kwargs) assert_eq(pd_merge, gd_merge) @@ -592,8 +588,8 @@ def test_merge_left_right_index_left_right_on_kwargs(kwargs): right = pd.DataFrame( {"y": [10, 20, 30, 6, 5, 4]}, index=[1, 2, 3, 4, 5, 7] ) - gleft = DataFrame.from_pandas(left) - gright = DataFrame.from_pandas(right) + gleft = cudf.from_pandas(left) + gright = cudf.from_pandas(right) pd_merge = left.merge(right, **kwargs) gd_merge = 
gleft.merge(gright, **kwargs) assert_eq(pd_merge, gd_merge) @@ -670,8 +666,8 @@ def test_merge_on_index_retained(): def test_merge_left_right_index_left_right_on_kwargs2(kwargs): left = pd.DataFrame({"x": [1, 2, 3]}, index=[10, 20, 30]) right = pd.DataFrame({"y": [10, 20, 30]}, index=[1, 2, 30]) - gleft = DataFrame.from_pandas(left) - gright = DataFrame.from_pandas(right) + gleft = cudf.from_pandas(left) + gright = cudf.from_pandas(right) gd_merge = gleft.merge(gright, **kwargs) pd_merge = left.merge(right, **kwargs) if pd_merge.empty: @@ -705,8 +701,8 @@ def test_merge_sort(ons, hows): left.index = [6, 5, 4, 7, 5, 5, 5, 4, 4] right.index = [5, 4, 1, 9, 4, 3, 5, 4, 4] - gleft = DataFrame.from_pandas(left) - gright = DataFrame.from_pandas(right) + gleft = cudf.from_pandas(left) + gright = cudf.from_pandas(right) gd_merge = gleft.merge(gright, **kwargs) pd_merge = left.merge(right, **kwargs) @@ -751,8 +747,8 @@ def test_merge_sort_on_indexes(kwargs): left.index = [6, 5, 4, 7, 5, 5, 5, 4, 4] right.index = [5, 4, 1, 9, 4, 3, 5, 4, 4] - gleft = DataFrame.from_pandas(left) - gright = DataFrame.from_pandas(right) + gleft = cudf.from_pandas(left) + gright = cudf.from_pandas(right) gd_merge = gleft.merge(gright, **kwargs) if left_index and right_index: @@ -775,8 +771,8 @@ def test_join_datetimes_index(dtype): datetimes = pd.Series(pd.date_range("20010101", "20010102", freq="12h")) pdf_lhs = pd.DataFrame(index=[1, 0, 1, 2, 0, 0, 1]) pdf_rhs = pd.DataFrame({"d": datetimes}) - gdf_lhs = DataFrame.from_pandas(pdf_lhs) - gdf_rhs = DataFrame.from_pandas(pdf_rhs) + gdf_lhs = cudf.from_pandas(pdf_lhs) + gdf_rhs = cudf.from_pandas(pdf_rhs) gdf_rhs["d"] = gdf_rhs["d"].astype(dtype) @@ -791,8 +787,8 @@ def test_join_datetimes_index(dtype): def test_join_with_different_names(): left = pd.DataFrame({"a": [0, 1, 2.0, 3, 4, 5, 9]}) right = pd.DataFrame({"b": [12, 5, 3, 9.0, 5], "c": [1, 2, 3, 4, 5.0]}) - gleft = DataFrame.from_pandas(left) - gright = DataFrame.from_pandas(right) + gleft = 
cudf.from_pandas(left) + gright = cudf.from_pandas(right) pd_merge = left.merge(right, how="outer", left_on=["a"], right_on=["b"]) gd_merge = gleft.merge(gright, how="outer", left_on=["a"], right_on=["b"]) assert_eq(pd_merge, gd_merge.sort_values(by=["a"]).reset_index(drop=True)) @@ -801,8 +797,8 @@ def test_join_with_different_names(): def test_join_same_name_different_order(): left = pd.DataFrame({"a": [0, 0], "b": [1, 2]}) right = pd.DataFrame({"a": [1, 2], "b": [0, 0]}) - gleft = DataFrame.from_pandas(left) - gright = DataFrame.from_pandas(right) + gleft = cudf.from_pandas(left) + gright = cudf.from_pandas(right) pd_merge = left.merge(right, left_on=["a", "b"], right_on=["b", "a"]) gd_merge = gleft.merge(gright, left_on=["a", "b"], right_on=["b", "a"]) assert_eq( @@ -813,8 +809,8 @@ def test_join_same_name_different_order(): def test_join_empty_table_dtype(): left = pd.DataFrame({"a": []}) right = pd.DataFrame({"b": [12, 5, 3, 9.0, 5], "c": [1, 2, 3, 4, 5.0]}) - gleft = DataFrame.from_pandas(left) - gright = DataFrame.from_pandas(right) + gleft = cudf.from_pandas(left) + gright = cudf.from_pandas(right) pd_merge = left.merge(right, how="left", left_on=["a"], right_on=["b"]) gd_merge = gleft.merge(gright, how="left", left_on=["a"], right_on=["b"]) assert_eq(pd_merge["a"].dtype, gd_merge["a"].dtype) @@ -915,7 +911,7 @@ def test_join_multi(how, column_a, column_b, column_c): ) def test_merge_multi(kwargs): - left = DataFrame( + left = cudf.DataFrame( { "a": [1, 2, 3, 4, 3, 5, 6], "b": [1, 3, 5, 7, 5, 9, 0], @@ -923,7 +919,7 @@ def test_merge_multi(kwargs): "d": ["v", "w", "x", "y", "z", "1", "2"], } ) - right = DataFrame( + right = cudf.DataFrame( { "a": [0, 9, 3, 4, 3, 7, 8], "b": [2, 4, 5, 7, 5, 6, 8], @@ -979,19 +975,19 @@ def test_merge_multi(kwargs): def test_typecast_on_join_int_to_int(dtype_l, dtype_r): other_data = ["a", "b", "c"] - join_data_l = Series([1, 2, 3], dtype=dtype_l) - join_data_r = Series([1, 2, 4], dtype=dtype_r) + join_data_l = 
cudf.Series([1, 2, 3], dtype=dtype_l) + join_data_r = cudf.Series([1, 2, 4], dtype=dtype_r) - gdf_l = DataFrame({"join_col": join_data_l, "B": other_data}) - gdf_r = DataFrame({"join_col": join_data_r, "B": other_data}) + gdf_l = cudf.DataFrame({"join_col": join_data_l, "B": other_data}) + gdf_r = cudf.DataFrame({"join_col": join_data_r, "B": other_data}) exp_dtype = np.find_common_type([], [np.dtype(dtype_l), np.dtype(dtype_r)]) exp_join_data = [1, 2] exp_other_data = ["a", "b"] - exp_join_col = Series(exp_join_data, dtype=exp_dtype) + exp_join_col = cudf.Series(exp_join_data, dtype=exp_dtype) - expect = DataFrame( + expect = cudf.DataFrame( { "join_col": exp_join_col, "B_x": exp_other_data, @@ -1009,11 +1005,11 @@ def test_typecast_on_join_int_to_int(dtype_l, dtype_r): def test_typecast_on_join_float_to_float(dtype_l, dtype_r): other_data = ["a", "b", "c", "d", "e", "f"] - join_data_l = Series([1, 2, 3, 0.9, 4.5, 6], dtype=dtype_l) - join_data_r = Series([1, 2, 3, 0.9, 4.5, 7], dtype=dtype_r) + join_data_l = cudf.Series([1, 2, 3, 0.9, 4.5, 6], dtype=dtype_l) + join_data_r = cudf.Series([1, 2, 3, 0.9, 4.5, 7], dtype=dtype_r) - gdf_l = DataFrame({"join_col": join_data_l, "B": other_data}) - gdf_r = DataFrame({"join_col": join_data_r, "B": other_data}) + gdf_l = cudf.DataFrame({"join_col": join_data_l, "B": other_data}) + gdf_r = cudf.DataFrame({"join_col": join_data_r, "B": other_data}) exp_dtype = np.find_common_type([], [np.dtype(dtype_l), np.dtype(dtype_r)]) @@ -1024,9 +1020,9 @@ def test_typecast_on_join_float_to_float(dtype_l, dtype_r): exp_join_data = [1, 2, 3, 0.9, 4.5] exp_other_data = ["a", "b", "c", "d", "e"] - exp_join_col = Series(exp_join_data, dtype=exp_dtype) + exp_join_col = cudf.Series(exp_join_data, dtype=exp_dtype) - expect = DataFrame( + expect = cudf.DataFrame( { "join_col": exp_join_col, "B_x": exp_other_data, @@ -1050,19 +1046,19 @@ def test_typecast_on_join_mixed_int_float(dtype_l, dtype_r): other_data = ["a", "b", "c", "d", "e", "f"] - 
join_data_l = Series([1, 2, 3, 0.9, 4.5, 6], dtype=dtype_l) - join_data_r = Series([1, 2, 3, 0.9, 4.5, 7], dtype=dtype_r) + join_data_l = cudf.Series([1, 2, 3, 0.9, 4.5, 6], dtype=dtype_l) + join_data_r = cudf.Series([1, 2, 3, 0.9, 4.5, 7], dtype=dtype_r) - gdf_l = DataFrame({"join_col": join_data_l, "B": other_data}) - gdf_r = DataFrame({"join_col": join_data_r, "B": other_data}) + gdf_l = cudf.DataFrame({"join_col": join_data_l, "B": other_data}) + gdf_r = cudf.DataFrame({"join_col": join_data_r, "B": other_data}) exp_dtype = np.find_common_type([], [np.dtype(dtype_l), np.dtype(dtype_r)]) exp_join_data = [1, 2, 3] exp_other_data = ["a", "b", "c"] - exp_join_col = Series(exp_join_data, dtype=exp_dtype) + exp_join_col = cudf.Series(exp_join_data, dtype=exp_dtype) - expect = DataFrame( + expect = cudf.DataFrame( { "join_col": exp_join_col, "B_x": exp_other_data, @@ -1079,18 +1075,18 @@ def test_typecast_on_join_no_float_round(): other_data = ["a", "b", "c", "d", "e"] - join_data_l = Series([1, 2, 3, 4, 5], dtype="int8") - join_data_r = Series([1, 2, 3, 4.01, 4.99], dtype="float32") + join_data_l = cudf.Series([1, 2, 3, 4, 5], dtype="int8") + join_data_r = cudf.Series([1, 2, 3, 4.01, 4.99], dtype="float32") - gdf_l = DataFrame({"join_col": join_data_l, "B": other_data}) - gdf_r = DataFrame({"join_col": join_data_r, "B": other_data}) + gdf_l = cudf.DataFrame({"join_col": join_data_l, "B": other_data}) + gdf_r = cudf.DataFrame({"join_col": join_data_r, "B": other_data}) exp_join_data = [1, 2, 3, 4, 5] exp_Bx = ["a", "b", "c", "d", "e"] exp_By = ["a", "b", "c", None, None] - exp_join_col = Series(exp_join_data, dtype="float32") + exp_join_col = cudf.Series(exp_join_data, dtype="float32") - expect = DataFrame( + expect = cudf.DataFrame( {"join_col": exp_join_col, "B_x": exp_Bx, "B_y": exp_By} ) @@ -1143,23 +1139,23 @@ def test_typecast_on_join_overflow_unsafe(dtypes): ) def test_typecast_on_join_dt_to_dt(dtype_l, dtype_r): other_data = ["a", "b", "c", "d", "e"] - 
join_data_l = Series( + join_data_l = cudf.Series( ["1991-11-20", "1999-12-31", "2004-12-04", "2015-01-01", "2019-08-15"] ).astype(dtype_l) - join_data_r = Series( + join_data_r = cudf.Series( ["1991-11-20", "1999-12-31", "2004-12-04", "2015-01-01", "2019-08-16"] ).astype(dtype_r) - gdf_l = DataFrame({"join_col": join_data_l, "B": other_data}) - gdf_r = DataFrame({"join_col": join_data_r, "B": other_data}) + gdf_l = cudf.DataFrame({"join_col": join_data_l, "B": other_data}) + gdf_r = cudf.DataFrame({"join_col": join_data_r, "B": other_data}) exp_dtype = max(np.dtype(dtype_l), np.dtype(dtype_r)) exp_join_data = ["1991-11-20", "1999-12-31", "2004-12-04", "2015-01-01"] exp_other_data = ["a", "b", "c", "d"] - exp_join_col = Series(exp_join_data, dtype=exp_dtype) + exp_join_col = cudf.Series(exp_join_data, dtype=exp_dtype) - expect = DataFrame( + expect = cudf.DataFrame( { "join_col": exp_join_col, "B_x": exp_other_data, @@ -1181,21 +1177,21 @@ def test_typecast_on_join_categorical(dtype_l, dtype_r): pytest.skip("Can't determine which categorical to use") other_data = ["a", "b", "c", "d", "e"] - join_data_l = Series([1, 2, 3, 4, 5], dtype=dtype_l) - join_data_r = Series([1, 2, 3, 4, 6], dtype=dtype_r) + join_data_l = cudf.Series([1, 2, 3, 4, 5], dtype=dtype_l) + join_data_r = cudf.Series([1, 2, 3, 4, 6], dtype=dtype_r) if dtype_l == "category": exp_dtype = join_data_l.dtype.categories.dtype elif dtype_r == "category": exp_dtype = join_data_r.dtype.categories.dtype - gdf_l = DataFrame({"join_col": join_data_l, "B": other_data}) - gdf_r = DataFrame({"join_col": join_data_r, "B": other_data}) + gdf_l = cudf.DataFrame({"join_col": join_data_l, "B": other_data}) + gdf_r = cudf.DataFrame({"join_col": join_data_r, "B": other_data}) exp_join_data = [1, 2, 3, 4] exp_other_data = ["a", "b", "c", "d"] - exp_join_col = Series(exp_join_data, dtype=exp_dtype) + exp_join_col = cudf.Series(exp_join_data, dtype=exp_dtype) - expect = DataFrame( + expect = cudf.DataFrame( { "join_col": 
exp_join_col, "B_x": exp_other_data, @@ -1424,8 +1420,8 @@ def test_categorical_typecast_outer_one_cat(dtype): def test_index_join(lhs, rhs, how, level): l_pdf = pd.DataFrame({"a": [2, 3, 1, 4], "b": [3, 7, 8, 1]}) r_pdf = pd.DataFrame({"a": [1, 5, 4, 0], "b": [3, 9, 8, 4]}) - l_df = DataFrame.from_pandas(l_pdf) - r_df = DataFrame.from_pandas(r_pdf) + l_df = cudf.from_pandas(l_pdf) + r_df = cudf.from_pandas(r_pdf) p_lhs = l_pdf.set_index(lhs).index p_rhs = r_pdf.set_index(rhs).index g_lhs = l_df.set_index(lhs).index @@ -1452,8 +1448,8 @@ def test_index_join_corner_cases(): r_pdf = pd.DataFrame( {"a": [1, 5, 4, 0], "b": [3, 9, 8, 4], "c": [2, 3, 6, 0]} ) - l_df = DataFrame.from_pandas(l_pdf) - r_df = DataFrame.from_pandas(r_pdf) + l_df = cudf.from_pandas(l_pdf) + r_df = cudf.from_pandas(r_pdf) # Join when column name doesn't match with level lhs = ["a", "b"] @@ -1527,8 +1523,10 @@ def test_index_join_corner_cases(): def test_index_join_exception_cases(): - l_df = DataFrame({"a": [2, 3, 1, 4], "b": [3, 7, 8, 1]}) - r_df = DataFrame({"a": [1, 5, 4, 0], "b": [3, 9, 8, 4], "c": [2, 3, 6, 0]}) + l_df = cudf.DataFrame({"a": [2, 3, 1, 4], "b": [3, 7, 8, 1]}) + r_df = cudf.DataFrame( + {"a": [1, 5, 4, 0], "b": [3, 9, 8, 4], "c": [2, 3, 6, 0]} + ) # Join between two MultiIndex lhs = ["a", "b"] @@ -1551,12 +1549,12 @@ def test_index_join_exception_cases(): def test_typecast_on_join_indexes(): - join_data_l = Series([1, 2, 3, 4, 5], dtype="int8") - join_data_r = Series([1, 2, 3, 4, 6], dtype="int32") + join_data_l = cudf.Series([1, 2, 3, 4, 5], dtype="int8") + join_data_r = cudf.Series([1, 2, 3, 4, 6], dtype="int32") other_data = ["a", "b", "c", "d", "e"] - gdf_l = DataFrame({"join_col": join_data_l, "B": other_data}) - gdf_r = DataFrame({"join_col": join_data_r, "B": other_data}) + gdf_l = cudf.DataFrame({"join_col": join_data_l, "B": other_data}) + gdf_r = cudf.DataFrame({"join_col": join_data_r, "B": other_data}) gdf_l = gdf_l.set_index("join_col") gdf_r = 
gdf_r.set_index("join_col") @@ -1564,7 +1562,7 @@ def test_typecast_on_join_indexes(): exp_join_data = [1, 2, 3, 4] exp_other_data = ["a", "b", "c", "d"] - expect = DataFrame( + expect = cudf.DataFrame( { "join_col": exp_join_data, "B_x": exp_other_data, @@ -1579,17 +1577,17 @@ def test_typecast_on_join_indexes(): def test_typecast_on_join_multiindices(): - join_data_l_0 = Series([1, 2, 3, 4, 5], dtype="int8") - join_data_l_1 = Series([2, 3, 4.1, 5.9, 6], dtype="float32") - join_data_l_2 = Series([7, 8, 9, 0, 1], dtype="float32") + join_data_l_0 = cudf.Series([1, 2, 3, 4, 5], dtype="int8") + join_data_l_1 = cudf.Series([2, 3, 4.1, 5.9, 6], dtype="float32") + join_data_l_2 = cudf.Series([7, 8, 9, 0, 1], dtype="float32") - join_data_r_0 = Series([1, 2, 3, 4, 5], dtype="int32") - join_data_r_1 = Series([2, 3, 4, 5, 6], dtype="int32") - join_data_r_2 = Series([7, 8, 9, 0, 0], dtype="float64") + join_data_r_0 = cudf.Series([1, 2, 3, 4, 5], dtype="int32") + join_data_r_1 = cudf.Series([2, 3, 4, 5, 6], dtype="int32") + join_data_r_2 = cudf.Series([7, 8, 9, 0, 0], dtype="float64") other_data = ["a", "b", "c", "d", "e"] - gdf_l = DataFrame( + gdf_l = cudf.DataFrame( { "join_col_0": join_data_l_0, "join_col_1": join_data_l_1, @@ -1597,7 +1595,7 @@ def test_typecast_on_join_multiindices(): "B": other_data, } ) - gdf_r = DataFrame( + gdf_r = cudf.DataFrame( { "join_col_0": join_data_r_0, "join_col_1": join_data_r_1, @@ -1609,12 +1607,12 @@ def test_typecast_on_join_multiindices(): gdf_l = gdf_l.set_index(["join_col_0", "join_col_1", "join_col_2"]) gdf_r = gdf_r.set_index(["join_col_0", "join_col_1", "join_col_2"]) - exp_join_data_0 = Series([1, 2], dtype="int32") - exp_join_data_1 = Series([2, 3], dtype="float64") - exp_join_data_2 = Series([7, 8], dtype="float64") - exp_other_data = Series(["a", "b"]) + exp_join_data_0 = cudf.Series([1, 2], dtype="int32") + exp_join_data_1 = cudf.Series([2, 3], dtype="float64") + exp_join_data_2 = cudf.Series([7, 8], dtype="float64") + 
exp_other_data = cudf.Series(["a", "b"]) - expect = DataFrame( + expect = cudf.DataFrame( { "join_col_0": exp_join_data_0, "join_col_1": exp_join_data_1, @@ -1630,12 +1628,12 @@ def test_typecast_on_join_multiindices(): def test_typecast_on_join_indexes_matching_categorical(): - join_data_l = Series(["a", "b", "c", "d", "e"], dtype="category") - join_data_r = Series(["a", "b", "c", "d", "e"], dtype="str") + join_data_l = cudf.Series(["a", "b", "c", "d", "e"], dtype="category") + join_data_r = cudf.Series(["a", "b", "c", "d", "e"], dtype="str") other_data = [1, 2, 3, 4, 5] - gdf_l = DataFrame({"join_col": join_data_l, "B": other_data}) - gdf_r = DataFrame({"join_col": join_data_r, "B": other_data}) + gdf_l = cudf.DataFrame({"join_col": join_data_l, "B": other_data}) + gdf_r = cudf.DataFrame({"join_col": join_data_r, "B": other_data}) gdf_l = gdf_l.set_index("join_col") gdf_r = gdf_r.set_index("join_col") @@ -1643,7 +1641,7 @@ def test_typecast_on_join_indexes_matching_categorical(): exp_join_data = ["a", "b", "c", "d", "e"] exp_other_data = [1, 2, 3, 4, 5] - expect = DataFrame( + expect = cudf.DataFrame( { "join_col": exp_join_data, "B_x": exp_other_data, @@ -1697,9 +1695,9 @@ def test_series_dataframe_mixed_merging(lhs, rhs, how, kwargs): check_lhs = lhs.copy() check_rhs = rhs.copy() - if isinstance(lhs, Series): + if isinstance(lhs, cudf.Series): check_lhs = lhs.to_frame() - if isinstance(rhs, Series): + if isinstance(rhs, cudf.Series): check_rhs = rhs.to_frame() expect = check_lhs.merge(check_rhs, how=how, **kwargs) diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index e032309bdbd..e0a922f35fe 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. 
import copy import itertools @@ -11,6 +11,7 @@ import pytest import cudf +from cudf.core._compat import PANDAS_GE_110 from cudf.tests.utils import DATETIME_TYPES, NUMERIC_TYPES, assert_eq @@ -134,13 +135,17 @@ def test_json_writer(tmpdir, pdf, gdf): assert os.path.exists(gdf_series_fname) try: - # xref 'https://github.com/pandas-dev/pandas/pull/33373') + # xref 'https://github.com/pandas-dev/pandas/pull/33373' expect_series = pd.read_json(pdf_series_fname, typ="series") except TypeError as e: - if str(e) == " is not convertible to datetime": + if ( + not PANDAS_GE_110 + and str(e) == " is not convertible to datetime" + ): continue else: raise e + got_series = pd.read_json(gdf_series_fname, typ="series") assert_eq(expect_series, got_series) diff --git a/python/cudf/cudf/tests/test_numerical.py b/python/cudf/cudf/tests/test_numerical.py index a2afa9f0a97..6d9bcda2c0b 100644 --- a/python/cudf/cudf/tests/test_numerical.py +++ b/python/cudf/cudf/tests/test_numerical.py @@ -1,127 +1,133 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. 
+ import numpy as np import pandas as pd import pytest import cudf -from cudf import Series +from cudf.core._compat import PANDAS_GE_100 from cudf.tests.utils import assert_eq def test_can_cast_safely_same_kind(): # 'i' -> 'i' - data = Series([1, 2, 3], dtype="int32")._column + data = cudf.Series([1, 2, 3], dtype="int32")._column to_dtype = np.dtype("int64") assert data.can_cast_safely(to_dtype) - data = Series([1, 2, 3], dtype="int64")._column + data = cudf.Series([1, 2, 3], dtype="int64")._column to_dtype = np.dtype("int32") assert data.can_cast_safely(to_dtype) - data = Series([1, 2, 2 ** 31], dtype="int64")._column + data = cudf.Series([1, 2, 2 ** 31], dtype="int64")._column assert not data.can_cast_safely(to_dtype) # 'u' -> 'u' - data = Series([1, 2, 3], dtype="uint32")._column + data = cudf.Series([1, 2, 3], dtype="uint32")._column to_dtype = np.dtype("uint64") assert data.can_cast_safely(to_dtype) - data = Series([1, 2, 3], dtype="uint64")._column + data = cudf.Series([1, 2, 3], dtype="uint64")._column to_dtype = np.dtype("uint32") assert data.can_cast_safely(to_dtype) - data = Series([1, 2, 2 ** 33], dtype="uint64")._column + data = cudf.Series([1, 2, 2 ** 33], dtype="uint64")._column assert not data.can_cast_safely(to_dtype) # 'f' -> 'f' - data = Series([np.inf, 1.0], dtype="float64")._column + data = cudf.Series([np.inf, 1.0], dtype="float64")._column to_dtype = np.dtype("float32") assert data.can_cast_safely(to_dtype) - data = Series([np.finfo("float32").max * 2, 1.0], dtype="float64")._column + data = cudf.Series( + [np.finfo("float32").max * 2, 1.0], dtype="float64" + )._column to_dtype = np.dtype("float32") assert not data.can_cast_safely(to_dtype) def test_can_cast_safely_mixed_kind(): - data = Series([1, 2, 3], dtype="int32")._column + data = cudf.Series([1, 2, 3], dtype="int32")._column to_dtype = np.dtype("float32") assert data.can_cast_safely(to_dtype) # too big to fit into f32 exactly - data = Series([1, 2, 2 ** 24 + 1], dtype="int32")._column + 
data = cudf.Series([1, 2, 2 ** 24 + 1], dtype="int32")._column assert not data.can_cast_safely(to_dtype) - data = Series([1, 2, 3], dtype="uint32")._column + data = cudf.Series([1, 2, 3], dtype="uint32")._column to_dtype = np.dtype("float32") assert data.can_cast_safely(to_dtype) # too big to fit into f32 exactly - data = Series([1, 2, 2 ** 24 + 1], dtype="uint32")._column + data = cudf.Series([1, 2, 2 ** 24 + 1], dtype="uint32")._column assert not data.can_cast_safely(to_dtype) to_dtype = np.dtype("float64") assert data.can_cast_safely(to_dtype) - data = Series([1.0, 2.0, 3.0], dtype="float32")._column + data = cudf.Series([1.0, 2.0, 3.0], dtype="float32")._column to_dtype = np.dtype("int32") assert data.can_cast_safely(to_dtype) # not integer float - data = Series([1.0, 2.0, 3.5], dtype="float32")._column + data = cudf.Series([1.0, 2.0, 3.5], dtype="float32")._column assert not data.can_cast_safely(to_dtype) - data = Series([10.0, 11.0, 2000.0], dtype="float64")._column + data = cudf.Series([10.0, 11.0, 2000.0], dtype="float64")._column assert data.can_cast_safely(to_dtype) # float out of int range - data = Series([1.0, 2.0, 1.0 * (2 ** 31)], dtype="float32")._column + data = cudf.Series([1.0, 2.0, 1.0 * (2 ** 31)], dtype="float32")._column assert not data.can_cast_safely(to_dtype) # negative signed integers casting to unsigned integers - data = Series([-1, 0, 1], dtype="int32")._column + data = cudf.Series([-1, 0, 1], dtype="int32")._column to_dtype = np.dtype("uint32") assert not data.can_cast_safely(to_dtype) @pytest.mark.xfail( - reason="cuDF null <-> pd.NA compatibility not yet supported" + condition=not PANDAS_GE_100, + reason="cuDF null <-> pd.NA compatibility not yet supported", ) def test_to_pandas_nullable_integer(): - gsr_not_null = Series([1, 2, 3]) - gsr_has_null = Series([1, 2, None]) + gsr_not_null = cudf.Series([1, 2, 3]) + gsr_has_null = cudf.Series([1, 2, None]) psr_not_null = pd.Series([1, 2, 3], dtype="int64") psr_has_null = pd.Series([1, 2, 
None], dtype="Int64") assert_eq(gsr_not_null.to_pandas(), psr_not_null) - assert_eq(gsr_has_null.to_pandas(), psr_has_null) + assert_eq(gsr_has_null.to_pandas(nullable=True), psr_has_null) @pytest.mark.xfail( - reason="cuDF null <-> pd.NA compatibility not yet supported" + condition=not PANDAS_GE_100, + reason="cuDF null <-> pd.NA compatibility not yet supported", ) def test_to_pandas_nullable_bool(): - gsr_not_null = Series([True, False, True]) - gsr_has_null = Series([True, False, None]) + gsr_not_null = cudf.Series([True, False, True]) + gsr_has_null = cudf.Series([True, False, None]) psr_not_null = pd.Series([True, False, True], dtype="bool") psr_has_null = pd.Series([True, False, None], dtype="boolean") assert_eq(gsr_not_null.to_pandas(), psr_not_null) - assert_eq(gsr_has_null.to_pandas(), psr_has_null) + assert_eq(gsr_has_null.to_pandas(nullable=True), psr_has_null) def test_can_cast_safely_has_nulls(): - data = Series([1, 2, 3, None], dtype="float32")._column + data = cudf.Series([1, 2, 3, None], dtype="float32")._column to_dtype = np.dtype("int64") assert data.can_cast_safely(to_dtype) - data = Series([1, 2, 3.1, None], dtype="float32")._column + data = cudf.Series([1, 2, 3.1, None], dtype="float32")._column assert not data.can_cast_safely(to_dtype) diff --git a/python/cudf/cudf/tests/test_ops.py b/python/cudf/cudf/tests/test_ops.py index 888380bc559..8cdef19d9ba 100644 --- a/python/cudf/cudf/tests/test_ops.py +++ b/python/cudf/cudf/tests/test_ops.py @@ -1,3 +1,5 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. 
+ import numpy as np import pandas as pd import pytest @@ -27,6 +29,7 @@ def test_sqrt_integer(): def math_op_test( dtype, fn, nelem=128, test_df=False, positive_only=False, check_dtype=True ): + np.random.seed(0) randvals = gen_rand(dtype, nelem, positive_only=positive_only) h_series = pd.Series(randvals.astype(dtype)) d_series = cudf.Series(h_series) diff --git a/python/cudf/cudf/tests/test_reductions.py b/python/cudf/cudf/tests/test_reductions.py index faf895b8f42..80a2e89bf46 100644 --- a/python/cudf/cudf/tests/test_reductions.py +++ b/python/cudf/cudf/tests/test_reductions.py @@ -1,8 +1,9 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. from __future__ import division, print_function import random +import re from itertools import product import numpy as np @@ -166,15 +167,20 @@ def test_date_minmax(): @pytest.mark.parametrize( - "op", - ["sum", "product", "std", "var", "median", "kurt", "kurtosis", "skew"], + "op", ["sum", "product", "var", "kurt", "kurtosis", "skew"], ) def test_datetime_unsupported_reductions(op): gsr = cudf.Series([1, 2, 3, None], dtype="datetime64[ns]") psr = gsr.to_pandas() utils.assert_exceptions_equal( - lfunc=getattr(psr, op), rfunc=getattr(gsr, op), + lfunc=getattr(psr, op), + rfunc=getattr(gsr, op), + expected_error_message=re.escape( + "cannot perform " + + ("kurtosis" if op == "kurt" else op) + + " with type datetime64[ns]" + ), ) @@ -183,7 +189,15 @@ def test_timedelta_unsupported_reductions(op): gsr = cudf.Series([1, 2, 3, None], dtype="timedelta64[ns]") psr = gsr.to_pandas() - utils.assert_exceptions_equal(getattr(psr, op), getattr(gsr, op)) + utils.assert_exceptions_equal( + lfunc=getattr(psr, op), + rfunc=getattr(gsr, op), + expected_error_message=re.escape( + "cannot perform " + + ("kurtosis" if op == "kurt" else op) + + " with type timedelta64[ns]" + ), + ) @pytest.mark.parametrize("op", ["sum", "product", "std", "var"]) diff --git a/python/cudf/cudf/tests/test_replace.py 
b/python/cudf/cudf/tests/test_replace.py index f4713b19015..e7baa4ee926 100644 --- a/python/cudf/cudf/tests/test_replace.py +++ b/python/cudf/cudf/tests/test_replace.py @@ -333,7 +333,7 @@ def test_fillna_method_numerical(data, container, data_dtype, method, inplace): @pytest.mark.parametrize( - "psr", + "psr_data", [ pd.Series(["a", "b", "a", None, "c", None], dtype="category"), pd.Series( @@ -373,8 +373,8 @@ def test_fillna_method_numerical(data, container, data_dtype, method, inplace): ], ) @pytest.mark.parametrize("inplace", [True, False]) -def test_fillna_categorical(psr, fill_value, inplace): - +def test_fillna_categorical(psr_data, fill_value, inplace): + psr = psr_data.copy(deep=True) gsr = Series.from_pandas(psr) if isinstance(fill_value, pd.Series): @@ -382,18 +382,29 @@ def test_fillna_categorical(psr, fill_value, inplace): else: fill_value_cudf = fill_value - expected = psr.fillna(fill_value, inplace=inplace) - got = gsr.fillna(fill_value_cudf, inplace=inplace) + if ( + isinstance(fill_value_cudf, cudf.Series) + and gsr.dtype != fill_value_cudf.dtype + ): + assert_exceptions_equal( + lfunc=psr.fillna, + rfunc=gsr.fillna, + lfunc_args_and_kwargs=([fill_value], {"inplace": inplace}), + rfunc_args_and_kwargs=([fill_value_cudf], {"inplace": inplace}), + ) + else: + expected = psr.fillna(fill_value, inplace=inplace) + got = gsr.fillna(fill_value_cudf, inplace=inplace) - if inplace: - expected = psr - got = gsr + if inplace: + expected = psr + got = gsr - assert_eq(expected, got) + assert_eq(expected, got) @pytest.mark.parametrize( - "psr", + "psr_data", [ pd.Series(pd.date_range("2010-01-01", "2020-01-10", freq="1y")), pd.Series(["2010-01-01", None, "2011-10-10"], dtype="datetime64[ns]"), @@ -475,7 +486,8 @@ def test_fillna_categorical(psr, fill_value, inplace): ], ) @pytest.mark.parametrize("inplace", [True, False]) -def test_fillna_datetime(psr, fill_value, inplace): +def test_fillna_datetime(psr_data, fill_value, inplace): + psr = psr_data.copy(deep=True) 
gsr = cudf.from_pandas(psr) if isinstance(fill_value, pd.Series): @@ -634,7 +646,7 @@ def test_fillna_dataframe(df, value, inplace): @pytest.mark.parametrize( - "psr", + "ps_data", [ pd.Series(["a", "b", "c", "d"]), pd.Series([None] * 4, dtype="object"), @@ -655,7 +667,8 @@ def test_fillna_dataframe(df, value, inplace): ], ) @pytest.mark.parametrize("inplace", [True, False]) -def test_fillna_string(psr, fill_value, inplace): +def test_fillna_string(ps_data, fill_value, inplace): + psr = ps_data.copy(deep=True) gsr = cudf.from_pandas(psr) if isinstance(fill_value, pd.Series): diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index 8c09dc91253..66e09f61869 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -1,4 +1,5 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. + import textwrap import cupy as cp @@ -158,7 +159,7 @@ def test_integer_dataframe(x): @settings(deadline=None) def test_integer_series(x): sr = cudf.Series(x) - ps = pd.Series(x) + ps = cudf.utils.utils._create_pandas_series(data=x) assert sr.__repr__() == ps.__repr__() @@ -175,7 +176,7 @@ def test_float_dataframe(x): @settings(deadline=None) def test_float_series(x): sr = cudf.Series(x, nan_as_null=False) - ps = pd.Series(x) + ps = cudf.utils.utils._create_pandas_series(data=x) assert sr.__repr__() == ps.__repr__() @@ -261,6 +262,7 @@ def test_generic_index(length, dtype): psr = pd.Series( range(length), index=np.random.randint(0, high=100, size=length).astype(dtype), + dtype="float64" if length == 0 else None, ) gsr = cudf.Series.from_pandas(psr) @@ -1169,8 +1171,7 @@ def test_timedelta_index_repr(index, expected_repr): def test_mulitIndex_repr(pmi, max_seq_items): pd.set_option("display.max_seq_items", max_seq_items) gmi = cudf.from_pandas(pmi) - print(gmi) - print(pmi) + assert gmi.__repr__() == pmi.__repr__() pd.reset_option("display.max_seq_items") @@ -1416,3 +1417,59 @@ 
def test_mulitIndex_null_repr(gdi, expected_repr): actual_repr = gdi.__repr__() assert actual_repr.split() == expected_repr.split() + + +def test_categorical_series_with_nan_repr(): + series = cudf.Series( + [1, 2, np.nan, 10, np.nan, None], nan_as_null=False + ).astype("category") + + expected_repr = textwrap.dedent( + """ + 0 1.0 + 1 2.0 + 2 NaN + 3 10.0 + 4 NaN + 5 + dtype: category + Categories (4, float64): [1.0, 10.0, 2.0, NaN] + """ + ) + + assert series.__repr__().split() == expected_repr.split() + + +def test_categorical_dataframe_with_nan_repr(): + series = cudf.Series( + [1, 2, np.nan, 10, np.nan, None], nan_as_null=False + ).astype("category") + df = cudf.DataFrame({"a": series}) + expected_repr = textwrap.dedent( + """ + a + 0 1.0 + 1 2.0 + 2 NaN + 3 10.0 + 4 NaN + 5 + """ + ) + + assert df.__repr__().split() == expected_repr.split() + + +def test_categorical_index_with_nan_repr(): + cat_index = cudf.Index( + cudf.Series( + [1, 2, np.nan, 10, np.nan, None], nan_as_null=False + ).astype("category") + ) + + expected_repr = ( + "CategoricalIndex([1.0, 2.0, NaN, 10.0, NaN, ], " + "categories=[1.0, 10.0, 2.0, NaN], ordered=False, dtype='category')" + ) + + assert cat_index.__repr__() == expected_repr diff --git a/python/cudf/cudf/tests/test_reshape.py b/python/cudf/cudf/tests/test_reshape.py index 315762c931f..b030924779d 100644 --- a/python/cudf/cudf/tests/test_reshape.py +++ b/python/cudf/cudf/tests/test_reshape.py @@ -1,3 +1,5 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. 
+ import re import numpy as np @@ -6,7 +8,7 @@ import cudf from cudf import melt as cudf_melt -from cudf.core import DataFrame +from cudf.core._compat import PANDAS_GE_120 from cudf.tests.utils import ( ALL_TYPES, DATETIME_TYPES, @@ -53,7 +55,7 @@ def test_melt(nulls, num_id_vars, num_value_vars, num_rows, dtype): pdf[colname] = data value_vars.append(colname) - gdf = DataFrame.from_pandas(pdf) + gdf = cudf.from_pandas(pdf) got = cudf_melt(frame=gdf, id_vars=id_vars, value_vars=value_vars) got_from_melt_method = gdf.melt(id_vars=id_vars, value_vars=value_vars) @@ -73,7 +75,14 @@ def test_melt(nulls, num_id_vars, num_value_vars, num_rows, dtype): @pytest.mark.parametrize( "dtype", list(NUMERIC_TYPES + DATETIME_TYPES) - + [pytest.param("str", marks=pytest.mark.xfail())], + + [ + pytest.param( + "str", + marks=pytest.mark.xfail( + condition=not PANDAS_GE_120, reason="pandas bug" + ), + ) + ], ) @pytest.mark.parametrize("nulls", ["none", "some"]) def test_df_stack(nulls, num_cols, num_rows, dtype): @@ -91,7 +100,7 @@ def test_df_stack(nulls, num_cols, num_rows, dtype): data[idx] = np.nan pdf[colname] = data - gdf = DataFrame.from_pandas(pdf) + gdf = cudf.from_pandas(pdf) got = gdf.stack() @@ -102,7 +111,6 @@ def test_df_stack(nulls, num_cols, num_rows, dtype): ) assert_eq(expect, got) - pass @pytest.mark.parametrize("num_rows", [1, 2, 10, 1000]) @@ -128,7 +136,7 @@ def test_interleave_columns(nulls, num_cols, num_rows, dtype): data[idx] = np.nan pdf[colname] = data - gdf = DataFrame.from_pandas(pdf) + gdf = cudf.from_pandas(pdf) if dtype == "category": with pytest.raises(ValueError): @@ -167,7 +175,7 @@ def test_tile(nulls, num_cols, num_rows, dtype, count): data[idx] = np.nan pdf[colname] = data - gdf = DataFrame.from_pandas(pdf) + gdf = cudf.from_pandas(pdf) got = gdf.tile(count) expect = pd.DataFrame(pd.concat([pdf] * count)) @@ -347,7 +355,7 @@ def test_series_merge_sorted(nparts, key, na_position, ascending): ) def test_pivot_simple(index, column, data): pdf = 
pd.DataFrame({"index": index, "column": column, "data": data}) - gdf = cudf.DataFrame.from_pandas(pdf) + gdf = cudf.from_pandas(pdf) expect = pdf.pivot("index", "column") got = gdf.pivot("index", "column") diff --git a/python/cudf/cudf/tests/test_rolling.py b/python/cudf/cudf/tests/test_rolling.py index 1ae5bab0da4..fcc5591adda 100644 --- a/python/cudf/cudf/tests/test_rolling.py +++ b/python/cudf/cudf/tests/test_rolling.py @@ -1,3 +1,5 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. + import math import numpy as np @@ -37,7 +39,7 @@ def test_rolling_series_basic(data, index, agg, nulls, center): elif nulls == "all": data = [np.nan] * len(data) - psr = pd.Series(data, index=index) + psr = cudf.utils.utils._create_pandas_series(data=data, index=index) gsr = cudf.Series(psr) for window_size in range(1, len(data) + 1): for min_periods in range(1, window_size + 1): @@ -99,13 +101,7 @@ def test_rolling_dataframe_basic(data, agg, nulls, center): pytest.param("min"), pytest.param("max"), pytest.param("mean"), - pytest.param( - "count", # Does not follow similar conventions as - # with non-offset columns - marks=pytest.mark.xfail( - reason="Differs from pandas behaviour here" - ), - ), + pytest.param("count"), ], ) def test_rolling_with_offset(agg): @@ -218,7 +214,7 @@ def test_rolling_getitem_window(): @pytest.mark.parametrize("center", [True, False]) def test_rollling_series_numba_udf_basic(data, index, center): - psr = pd.Series(data, index=index) + psr = cudf.utils.utils._create_pandas_series(data=data, index=index) gsr = cudf.from_pandas(psr) def some_func(A): diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index a19b88caf4c..ab9d3d91f73 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -1,4 +1,5 @@ # Copyright (c) 2020-2021, NVIDIA CORPORATION. 
+ import operator import re from string import ascii_letters, digits @@ -28,7 +29,7 @@ def _series_na_data(): pd.Series([0, 1, 2, 3, 4]), pd.Series(["a", "b", "u", "h", "d"]), pd.Series([None, None, np.nan, None, np.inf, -np.inf]), - pd.Series([]), + pd.Series([], dtype="float64"), pd.Series( [pd.NaT, pd.Timestamp("1939-05-27"), pd.Timestamp("1940-04-25")] ), @@ -383,7 +384,7 @@ def test_series_tolist(data): [[], [None, None], ["a"], ["a", "b", "c"] * 500, [1.0, 2.0, 0.3] * 57], ) def test_series_size(data): - psr = pd.Series(data) + psr = cudf.utils.utils._create_pandas_series(data=data) gsr = cudf.Series(data) assert_eq(psr.size, gsr.size) @@ -481,7 +482,7 @@ def test_series_factorize(data, na_sentinel): @pytest.mark.parametrize( "data", [ - [], + pd.Series([], dtype="datetime64[ns]"), pd.Series(pd.date_range("2010-01-01", "2010-02-01")), pd.Series([None, None], dtype="datetime64[ns]"), ], @@ -490,7 +491,7 @@ def test_series_factorize(data, na_sentinel): @pytest.mark.parametrize("normalize", [True, False]) @pytest.mark.parametrize("nulls", ["none", "some"]) def test_series_datetime_value_counts(data, nulls, normalize, dropna): - psr = pd.Series(data) + psr = data.copy() if len(data) > 0: if nulls == "one": @@ -733,7 +734,8 @@ def test_series_notnull_notna(ps, nan_as_null): "sr1", [pd.Series([10, 11, 12], index=["a", "b", "z"]), pd.Series(["a"])] ) @pytest.mark.parametrize( - "sr2", [pd.Series([]), pd.Series(["a", "a", "c", "z", "A"])] + "sr2", + [pd.Series([], dtype="float64"), pd.Series(["a", "a", "c", "z", "A"])], ) @pytest.mark.parametrize( "op", @@ -852,6 +854,10 @@ def test_series_memory_usage(): dtype=pd.StringDtype(), ), ), + ( + cudf.Series([1, 2, None, 10.2, None], dtype="float32",), + pd.Series([1, 2, None, 10.2, None], dtype=pd.Float32Dtype(),), + ), ], ) def test_series_to_pandas_nullable_dtypes(sr, expected_psr): diff --git a/python/cudf/cudf/tests/test_setitem.py b/python/cudf/cudf/tests/test_setitem.py index 1b628142939..fc885a13808 100644 --- 
a/python/cudf/cudf/tests/test_setitem.py +++ b/python/cudf/cudf/tests/test_setitem.py @@ -1,11 +1,12 @@ -# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. import numpy as np import pandas as pd import pytest import cudf -from cudf.tests.utils import assert_eq +from cudf.core._compat import PANDAS_GE_120 +from cudf.tests.utils import assert_eq, assert_exceptions_equal @pytest.mark.parametrize("df", [pd.DataFrame({"a": [1, 2, 3]})]) @@ -19,10 +20,12 @@ def test_dataframe_setitem_bool_mask_scaler(df, arg, value): assert_eq(df, gdf) -# pandas incorrectly adds nulls with dataframes -# but works fine with scalers -@pytest.mark.xfail() -def test_dataframe_setitem_scaler_bool_inconsistency(): +@pytest.mark.xfail( + condition=not PANDAS_GE_120, + reason="pandas incorrectly adds nulls with dataframes " + "but works fine with scalars", +) +def test_dataframe_setitem_scaler_bool(): df = pd.DataFrame({"a": [1, 2, 3]}) df[[True, False, True]] = pd.DataFrame({"a": [-1, -2]}) @@ -114,7 +117,7 @@ def test_series_set_item(psr, arg): ], ) def test_setitem_dataframe_series_inplace(df): - pdf = df + pdf = df.copy(deep=True) gdf = cudf.from_pandas(pdf) pdf["a"].replace(1, 500, inplace=True) @@ -184,3 +187,15 @@ def test_column_set_equal_length_object_by_mask(): data[bool_col] = replace_data assert_eq(cudf.Series(data), cudf.Series([100, 0, 300, 1, 500])) + + +def test_categorical_setitem_invalid(): + ps = pd.Series([1, 2, 3], dtype="category") + gs = cudf.Series([1, 2, 3], dtype="category") + + assert_exceptions_equal( + lfunc=ps.__setitem__, + rfunc=gs.__setitem__, + lfunc_args_and_kwargs=([0, 5], {}), + rfunc_args_and_kwargs=([0, 5], {}), + ) diff --git a/python/cudf/cudf/tests/test_sorting.py b/python/cudf/cudf/tests/test_sorting.py index 8bab802d89c..b90aebc33dc 100644 --- a/python/cudf/cudf/tests/test_sorting.py +++ b/python/cudf/cudf/tests/test_sorting.py @@ -1,5 +1,6 @@ -# Copyright (c) 2018-2020, NVIDIA CORPORATION. 
+# Copyright (c) 2018-2021, NVIDIA CORPORATION. +import string from itertools import product import numpy as np @@ -140,7 +141,10 @@ def test_series_nsmallest(data, n): sr = Series(data) psr = pd.Series(data) assert_eq(sr.nsmallest(n), psr.nsmallest(n)) - assert_eq(sr.nsmallest(n, keep="last"), psr.nsmallest(n, keep="last")) + assert_eq( + sr.nsmallest(n, keep="last").sort_index(), + psr.nsmallest(n, keep="last").sort_index(), + ) assert_exceptions_equal( lfunc=psr.nsmallest, @@ -222,14 +226,12 @@ def test_dataframe_multi_column( num_cols, num_rows, dtype, ascending, na_position ): - from string import ascii_lowercase - np.random.seed(0) - by = list(ascii_lowercase[:num_cols]) + by = list(string.ascii_lowercase[:num_cols]) pdf = pd.DataFrame() for i in range(5): - colname = ascii_lowercase[i] + colname = string.ascii_lowercase[i] data = np.random.randint(0, 26, num_rows).astype(dtype) pdf[colname] = data @@ -253,14 +255,12 @@ def test_dataframe_multi_column_nulls( num_cols, num_rows, dtype, nulls, ascending, na_position ): - from string import ascii_lowercase - np.random.seed(0) - by = list(ascii_lowercase[:num_cols]) + by = list(string.ascii_lowercase[:num_cols]) pdf = pd.DataFrame() for i in range(3): - colname = ascii_lowercase[i] + colname = string.ascii_lowercase[i] data = np.random.randint(0, 26, num_rows).astype(dtype) if nulls == "some": idx = np.array([], dtype="int64") diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py index c06fdd4a48e..4e07c974280 100644 --- a/python/cudf/cudf/tests/test_stats.py +++ b/python/cudf/cudf/tests/test_stats.py @@ -1,12 +1,13 @@ -# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. 
import re +from concurrent.futures import ThreadPoolExecutor import numpy as np import pandas as pd import pytest -from cudf.core import Series +import cudf from cudf.datasets import randomdata from cudf.tests.utils import assert_eq, assert_exceptions_equal @@ -31,7 +32,7 @@ def test_series_reductions(method, dtype, skipna): arr = arr.astype(dtype) if dtype in (np.float32, np.float64): arr[[2, 5, 14, 19, 50, 70]] = np.nan - sr = Series.from_masked_array(arr, Series(mask).as_mask()) + sr = cudf.Series.from_masked_array(arr, cudf.Series(mask).as_mask()) psr = sr.to_pandas() psr[~mask] = np.nan @@ -43,18 +44,16 @@ def call_test(sr, skipna): return fn(skipna=skipna) expect, got = call_test(psr, skipna=skipna), call_test(sr, skipna=skipna) - print(expect, got) + np.testing.assert_approx_equal(expect, got) @pytest.mark.parametrize("method", methods) def test_series_reductions_concurrency(method): - from concurrent.futures import ThreadPoolExecutor - e = ThreadPoolExecutor(10) np.random.seed(0) - srs = [Series(np.random.random(10000)) for _ in range(1)] + srs = [cudf.Series(np.random.random(10000)) for _ in range(1)] def call_test(sr): fn = getattr(sr, method) @@ -73,7 +72,7 @@ def f(sr): def test_series_std(ddof): np.random.seed(0) arr = np.random.random(100) - 0.5 - sr = Series(arr) + sr = cudf.Series(arr) pd = sr.to_pandas() got = sr.std(ddof=ddof) expect = pd.std(ddof=ddof) @@ -84,7 +83,7 @@ def test_series_unique(): for size in [10 ** x for x in range(5)]: arr = np.random.randint(low=-1, high=10, size=size) mask = arr != -1 - sr = Series.from_masked_array(arr, Series(mask).as_mask()) + sr = cudf.Series.from_masked_array(arr, cudf.Series(mask).as_mask()) assert set(arr[mask]) == set(sr.unique().to_array()) assert len(set(arr[mask])) == sr.nunique() @@ -96,13 +95,13 @@ def test_series_unique(): def test_series_nunique(nan_as_null, dropna): # We remove nulls as opposed to NaNs using the dropna parameter, # so to test against pandas we replace NaN with another discrete 
value - cudf_series = Series([1, 2, 2, 3, 3], nan_as_null=nan_as_null) + cudf_series = cudf.Series([1, 2, 2, 3, 3], nan_as_null=nan_as_null) pd_series = pd.Series([1, 2, 2, 3, 3]) expect = pd_series.nunique(dropna=dropna) got = cudf_series.nunique(dropna=dropna) assert expect == got - cudf_series = Series( + cudf_series = cudf.Series( [1.0, 2.0, 3.0, np.nan, None], nan_as_null=nan_as_null ) if nan_as_null is True: @@ -114,7 +113,7 @@ def test_series_nunique(nan_as_null, dropna): got = cudf_series.nunique(dropna=dropna) assert expect == got - cudf_series = Series([1.0, np.nan, np.nan], nan_as_null=nan_as_null) + cudf_series = cudf.Series([1.0, np.nan, np.nan], nan_as_null=nan_as_null) if nan_as_null is True: pd_series = pd.Series([1.0, np.nan, np.nan]) else: @@ -126,7 +125,7 @@ def test_series_nunique(nan_as_null, dropna): def test_series_scale(): arr = pd.Series(np.random.randint(low=-10, high=10, size=100)) - sr = Series(arr) + sr = cudf.Series(arr) vmin = arr.min() vmax = arr.max() @@ -142,7 +141,7 @@ def test_exact_quantiles(int_method): quant_values = [0.0, 0.25, 0.33, 0.5, 1.0] df = pd.DataFrame(arr) - gdf_series = Series(arr) + gdf_series = cudf.Series(arr) q1 = gdf_series.quantile( quant_values, interpolation=int_method, exact=True @@ -161,7 +160,7 @@ def test_exact_quantiles_int(int_method): quant_values = [0.0, 0.25, 0.33, 0.5, 1.0] df = pd.DataFrame(arr) - gdf_series = Series(arr) + gdf_series = cudf.Series(arr) q1 = gdf_series.quantile( quant_values, interpolation=int_method, exact=True @@ -179,7 +178,7 @@ def test_approx_quantiles(): arr = np.asarray([6.8, 0.15, 3.4, 4.17, 2.13, 1.11, -1.01, 0.8, 5.7]) quant_values = [0.0, 0.25, 0.33, 0.5, 1.0] - gdf_series = Series(arr) + gdf_series = cudf.Series(arr) pdf_series = pd.Series(arr) q1 = gdf_series.quantile(quant_values, exact=False) @@ -193,7 +192,7 @@ def test_approx_quantiles_int(): quant_values = [0.5] approx_results = [2] - gdf_series = Series(arr) + gdf_series = cudf.Series(arr) q1 = 
gdf_series.quantile(quant_values, exact=False) @@ -204,8 +203,8 @@ def test_approx_quantiles_int(): @pytest.mark.parametrize("q", [[], 0.5, 1, 0.234, [0.345], [0.243, 0.5, 1]]) def test_misc_quantiles(data, q): - pdf_series = pd.Series(data) - gdf_series = Series(data) + pdf_series = cudf.utils.utils._create_pandas_series(data=data) + gdf_series = cudf.Series(data) expected = pdf_series.quantile(q) actual = gdf_series.quantile(q) @@ -215,17 +214,17 @@ def test_misc_quantiles(data, q): @pytest.mark.parametrize( "data", [ - Series(np.random.normal(-100, 100, 1000)), - Series(np.random.randint(-50, 50, 1000)), - Series(np.zeros(100)), - Series(np.repeat(np.nan, 100)), - Series(np.array([1.123, 2.343, np.nan, 0.0])), - Series( + cudf.Series(np.random.normal(-100, 100, 1000)), + cudf.Series(np.random.randint(-50, 50, 1000)), + cudf.Series(np.zeros(100)), + cudf.Series(np.repeat(np.nan, 100)), + cudf.Series(np.array([1.123, 2.343, np.nan, 0.0])), + cudf.Series( [5, 10, 53, None, np.nan, None, 12, 43, -423], nan_as_null=False ), - Series([1.1032, 2.32, 43.4, 13, -312.0], index=[0, 4, 3, 19, 6]), - Series([]), - Series([-3]), + cudf.Series([1.1032, 2.32, 43.4, 13, -312.0], index=[0, 4, 3, 19, 6]), + cudf.Series([]), + cudf.Series([-3]), randomdata( nrows=1000, dtypes={"a": float, "b": int, "c": float, "d": str} ), @@ -256,17 +255,17 @@ def test_kurtosis(data, null_flag): @pytest.mark.parametrize( "data", [ - Series(np.random.normal(-100, 100, 1000)), - Series(np.random.randint(-50, 50, 1000)), - Series(np.zeros(100)), - Series(np.repeat(np.nan, 100)), - Series(np.array([1.123, 2.343, np.nan, 0.0])), - Series( + cudf.Series(np.random.normal(-100, 100, 1000)), + cudf.Series(np.random.randint(-50, 50, 1000)), + cudf.Series(np.zeros(100)), + cudf.Series(np.repeat(np.nan, 100)), + cudf.Series(np.array([1.123, 2.343, np.nan, 0.0])), + cudf.Series( [5, 10, 53, None, np.nan, None, 12, 43, -423], nan_as_null=False ), - Series([1.1032, 2.32, 43.4, 13, -312.0], index=[0, 4, 3, 19, 
6]), - Series([]), - Series([-3]), + cudf.Series([1.1032, 2.32, 43.4, 13, -312.0], index=[0, 4, 3, 19, 6]), + cudf.Series([]), + cudf.Series([-3]), randomdata( nrows=1000, dtypes={"a": float, "b": int, "c": float, "d": str} ), @@ -299,13 +298,13 @@ def test_series_median(dtype, num_na): mask = np.arange(100) >= num_na arr = arr.astype(dtype) - sr = Series.from_masked_array(arr, Series(mask).as_mask()) + sr = cudf.Series.from_masked_array(arr, cudf.Series(mask).as_mask()) arr2 = arr[mask] ps = pd.Series(arr2, dtype=dtype) actual = sr.median(skipna=True) desired = ps.median(skipna=True) - print(actual, desired) + np.testing.assert_approx_equal(actual, desired) # only for float until integer null supported convert to pandas in cudf @@ -325,10 +324,10 @@ def test_series_median(dtype, num_na): np.zeros(100), np.repeat(np.nan, 100), np.array([1.123, 2.343, np.nan, 0.0]), - Series([5, 10, 53, None, np.nan, None], nan_as_null=False), - Series([1.1, 2.32, 43.4], index=[0, 4, 3]), - Series([]), - Series([-3]), + cudf.Series([5, 10, 53, None, np.nan, None], nan_as_null=False), + cudf.Series([1.1, 2.32, 43.4], index=[0, 4, 3]), + cudf.Series([]), + cudf.Series([-3]), ], ) @pytest.mark.parametrize( @@ -339,13 +338,13 @@ def test_series_median(dtype, num_na): np.zeros(100), np.repeat(np.nan, 100), np.array([1.123, 2.343, np.nan, 0.0]), - Series([1.1, 2.32, 43.4], index=[0, 500, 4000]), - Series([5]), + cudf.Series([1.1, 2.32, 43.4], index=[0, 500, 4000]), + cudf.Series([5]), ], ) def test_cov1d(data1, data2): - gs1 = Series(data1) - gs2 = Series(data2) + gs1 = cudf.Series(data1) + gs2 = cudf.Series(data2) ps1 = gs1.to_pandas() ps2 = gs2.to_pandas() @@ -363,10 +362,10 @@ def test_cov1d(data1, data2): np.zeros(100), np.repeat(np.nan, 100), np.array([1.123, 2.343, np.nan, 0.0]), - Series([5, 10, 53, None, np.nan, None], nan_as_null=False), - Series([1.1032, 2.32, 43.4], index=[0, 4, 3]), - Series([]), - Series([-3]), + cudf.Series([5, 10, 53, None, np.nan, None], 
nan_as_null=False), + cudf.Series([1.1032, 2.32, 43.4], index=[0, 4, 3]), + cudf.Series([]), + cudf.Series([-3]), ], ) @pytest.mark.parametrize( @@ -377,13 +376,13 @@ def test_cov1d(data1, data2): np.zeros(100), np.repeat(np.nan, 100), np.array([1.123, 2.343, np.nan, 0.0]), - Series([1.1, 2.32, 43.4], index=[0, 500, 4000]), - Series([5]), + cudf.Series([1.1, 2.32, 43.4], index=[0, 500, 4000]), + cudf.Series([5]), ], ) def test_corr1d(data1, data2): - gs1 = Series(data1) - gs2 = Series(data2) + gs1 = cudf.Series(data1) + gs2 = cudf.Series(data2) ps1 = gs1.to_pandas() ps2 = gs2.to_pandas() @@ -434,14 +433,14 @@ def test_df_corr(): ) @pytest.mark.parametrize("skipna", [True, False, None]) def test_nans_stats(data, ops, skipna): - psr = pd.Series(data) - gsr = Series(data) + psr = cudf.utils.utils._create_pandas_series(data=data) + gsr = cudf.Series(data) assert_eq( getattr(psr, ops)(skipna=skipna), getattr(gsr, ops)(skipna=skipna) ) - psr = pd.Series(data) - gsr = Series(data, nan_as_null=False) + psr = cudf.utils.utils._create_pandas_series(data=data) + gsr = cudf.Series(data, nan_as_null=False) # Since there is no concept of `nan_as_null` in pandas, # nulls will be returned in the operations. 
So only # testing for `skipna=True` when `nan_as_null=False` @@ -461,7 +460,7 @@ def test_nans_stats(data, ops, skipna): @pytest.mark.parametrize("min_count", [-10, -1, 0, 1, 2, 3, 5, 10]) def test_min_count_ops(data, ops, skipna, min_count): psr = pd.Series(data) - gsr = Series(data) + gsr = cudf.Series(data) assert_eq( getattr(psr, ops)(skipna=skipna, min_count=min_count), @@ -472,8 +471,8 @@ def test_min_count_ops(data, ops, skipna, min_count): @pytest.mark.parametrize( "gsr", [ - Series([1, 2, 3, 4], dtype="datetime64[ns]"), - Series([1, 2, 3, 4], dtype="timedelta64[ns]"), + cudf.Series([1, 2, 3, 4], dtype="datetime64[ns]"), + cudf.Series([1, 2, 3, 4], dtype="timedelta64[ns]"), ], ) def test_cov_corr_invalid_dtypes(gsr): diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index 080420c8f75..a015f3387b4 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -1,5 +1,7 @@ -# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. 
+ import re +import urllib.parse from contextlib import ExitStack as does_not_raise from sys import getsizeof @@ -11,7 +13,7 @@ import cudf from cudf import concat -from cudf.core import DataFrame, Series +from cudf.core._compat import PANDAS_GE_110 from cudf.core.column.string import StringColumn from cudf.core.index import StringIndex, as_index from cudf.tests.utils import ( @@ -55,7 +57,7 @@ def index(request): @pytest.fixture def ps_gs(data, index): ps = pd.Series(data, index=index, dtype="str", name="nice name") - gs = Series(data, index=index, dtype="str", name="nice name") + gs = cudf.Series(data, index=index, dtype="str", name="nice name") return (ps, gs) @@ -63,7 +65,7 @@ def ps_gs(data, index): def test_string_ingest(construct): expect = ["a", "a", "b", "c", "a"] data = construct(expect) - got = Series(data) + got = cudf.Series(data) assert got.dtype == np.dtype("object") assert len(got) == 5 for idx, val in enumerate(expect): @@ -104,7 +106,7 @@ def test_string_get_item(ps_gs, item): ps, gs = ps_gs got = gs.iloc[item] - if isinstance(got, Series): + if isinstance(got, cudf.Series): got = got.to_arrow() if isinstance(item, cupy.ndarray): @@ -138,7 +140,7 @@ def test_string_bool_mask(ps_gs, item): ps, gs = ps_gs got = gs.iloc[item] - if isinstance(got, Series): + if isinstance(got, cudf.Series): got = got.to_arrow() if isinstance(item, cupy.ndarray): @@ -194,7 +196,7 @@ def test_string_astype(dtype): elif dtype == "str" or dtype == "object": data = ["ab", "cd", "ef", "gh", "ij"] ps = pd.Series(data) - gs = Series(data) + gs = cudf.Series(data) # Pandas str --> bool typecasting always returns True if there's a string if dtype.startswith("bool"): @@ -212,7 +214,7 @@ def test_string_astype(dtype): def test_string_empty_astype(dtype): data = [] ps = pd.Series(data, dtype="str") - gs = Series(data, dtype="str") + gs = cudf.Series(data, dtype="str") expect = ps.astype(dtype) got = gs.astype(dtype) @@ -243,7 +245,7 @@ def test_string_numeric_astype(dtype): if not 
dtype.startswith("datetime64"): ps = pd.Series(data, dtype=dtype) - gs = Series(data, dtype=dtype) + gs = cudf.Series(data, dtype=dtype) expect = pd.Series(ps.astype("str")) got = gs.astype("str") @@ -259,7 +261,7 @@ def test_string_empty_numeric_astype(dtype): ps = pd.Series(data, dtype="datetime64[ns]") else: ps = pd.Series(data, dtype=dtype) - gs = Series(data, dtype=dtype) + gs = cudf.Series(data, dtype=dtype) expect = ps.astype("str") got = gs.astype("str") @@ -274,8 +276,8 @@ def test_string_concat(): ps1 = pd.Series(data1, index=index) ps2 = pd.Series(data2, index=index) - gs1 = Series(data1, index=index) - gs2 = Series(data2, index=index) + gs1 = cudf.Series(data1, index=index) + gs2 = cudf.Series(data2, index=index) expect = pd.concat([ps1, ps2]) got = concat([gs1, gs2]) @@ -343,13 +345,15 @@ def _cat_convert_seq_to_cudf(others): pytest.param( pd.Index(["f", "g", "h", "i", "j"]), marks=pytest.mark.xfail( - reason="https://github.com/pandas-dev/pandas/issues/33436" + condition=not PANDAS_GE_110, + reason="https://github.com/pandas-dev/pandas/issues/33436", ), ), pytest.param( pd.Index(["AbC", "de", "FGHI", "j", "kLm"]), marks=pytest.mark.xfail( - reason="https://github.com/pandas-dev/pandas/issues/33436" + condition=not PANDAS_GE_110, + reason="https://github.com/pandas-dev/pandas/issues/33436", ), ), ( @@ -388,7 +392,8 @@ def _cat_convert_seq_to_cudf(others): pd.Index(["f", "g", "h", "i", "j"]), ), marks=pytest.mark.xfail( - reason="https://github.com/pandas-dev/pandas/issues/33436" + condition=not PANDAS_GE_110, + reason="https://github.com/pandas-dev/pandas/issues/33436", ), ), pytest.param( @@ -403,7 +408,8 @@ def _cat_convert_seq_to_cudf(others): pd.Index(["f", "g", "h", "i", "j"]), ], marks=pytest.mark.xfail( - reason="https://github.com/pandas-dev/pandas/issues/33436" + condition=not PANDAS_GE_110, + reason="https://github.com/pandas-dev/pandas/issues/33436", ), ), [ @@ -510,13 +516,15 @@ def test_string_cat(ps_gs, others, sep, na_rep, index): 
pytest.param( pd.Index(["f", "g", "h", "i", "j"]), marks=pytest.mark.xfail( - reason="https://github.com/pandas-dev/pandas/issues/33436" + condition=not PANDAS_GE_110, + reason="https://github.com/pandas-dev/pandas/issues/33436", ), ), pytest.param( pd.Index(["AbC", "de", "FGHI", "j", "kLm"]), marks=pytest.mark.xfail( - reason="https://github.com/pandas-dev/pandas/issues/33436" + condition=not PANDAS_GE_110, + reason="https://github.com/pandas-dev/pandas/issues/33436", ), ), ( @@ -543,7 +551,8 @@ def test_string_cat(ps_gs, others, sep, na_rep, index): pd.Index(["f", "g", "h", "i", "j"]), ), marks=pytest.mark.xfail( - reason="https://github.com/pandas-dev/pandas/issues/33436" + condition=not PANDAS_GE_110, + reason="https://github.com/pandas-dev/pandas/issues/33436", ), ), pytest.param( @@ -558,7 +567,8 @@ def test_string_cat(ps_gs, others, sep, na_rep, index): pd.Index(["f", "g", "h", "i", "j"]), ], marks=pytest.mark.xfail( - reason="https://github.com/pandas-dev/pandas/issues/33436" + condition=not PANDAS_GE_110, + reason="https://github.com/pandas-dev/pandas/issues/33436", ), ), [ @@ -611,16 +621,7 @@ def test_string_index_str_cat(data, others, sep, na_rep, name): @pytest.mark.parametrize( - "data", - [ - pytest.param( - ["a", None, "c", None, "e"], - marks=pytest.mark.xfail( - reason="https://github.com/rapidsai/cudf/issues/5862" - ), - ), - ["a", "b", "c", "d", "a"], - ], + "data", [["a", None, "c", None, "e"], ["a", "b", "c", "d", "a"]], ) @pytest.mark.parametrize( "others", @@ -631,13 +632,15 @@ def test_string_index_str_cat(data, others, sep, na_rep, name): pytest.param( pd.Index(["f", "g", "h", "i", "j"]), marks=pytest.mark.xfail( - reason="https://github.com/pandas-dev/pandas/issues/33436" + condition=not PANDAS_GE_110, + reason="https://github.com/pandas-dev/pandas/issues/33436", ), ), pytest.param( pd.Index(["AbC", "de", "FGHI", "j", "kLm"]), marks=pytest.mark.xfail( - reason="https://github.com/pandas-dev/pandas/issues/33436" + condition=not 
PANDAS_GE_110, + reason="https://github.com/pandas-dev/pandas/issues/33436", ), ), [ @@ -732,16 +735,20 @@ def test_string_index_duplicate_str_cat(data, others, sep, na_rep, name): ) -@pytest.mark.xfail(raises=ValueError) -@pytest.mark.parametrize("sep", [None, "", " ", "|", ",", "|||"]) -@pytest.mark.parametrize("na_rep", [None, "", "null", "a"]) -def test_string_cat_str(ps_gs, sep, na_rep): - ps, gs = ps_gs - - got = gs.str.cat(gs.str, sep=sep, na_rep=na_rep) - expect = ps.str.cat(ps.str, sep=sep, na_rep=na_rep) - - assert_eq(expect, got) +def test_string_cat_str_error(): + gs = cudf.Series(["a", "v", "s"]) + # https://github.com/pandas-dev/pandas/issues/28277 + # ability to pass StringMethods is being removed in future. + with pytest.raises( + TypeError, + match=re.escape( + "others must be Series, Index, DataFrame, np.ndarrary " + "or list-like (either containing only strings or " + "containing only objects of type Series/Index/" + "np.ndarray[1-dim])" + ), + ): + gs.str.cat(gs.str) @pytest.mark.xfail(raises=(NotImplementedError, AttributeError)) @@ -847,12 +854,8 @@ def test_string_upper(ps_gs): @pytest.mark.parametrize("n", [-1, 0, 1, 3, 10]) @pytest.mark.parametrize("expand", [True, False, None]) def test_string_split(data, pat, n, expand): - - if data in (["a b", " c ", " d", "e ", "f"],) and pat is None: - pytest.xfail("None pattern split algorithm not implemented yet") - ps = pd.Series(data, dtype="str") - gs = Series(data, dtype="str") + gs = cudf.Series(data, dtype="str") expect = ps.str.split(pat=pat, n=n, expand=expand) got = gs.str.split(pat=pat, n=n, expand=expand) @@ -874,10 +877,10 @@ def test_string_join_key(str_data, str_data_raise, num_keys, how): other_data = [1, 2, 3, 4, 5][: len(str_data)] pdf = pd.DataFrame() - gdf = DataFrame() + gdf = cudf.DataFrame() for i in range(num_keys): pdf[i] = pd.Series(str_data, dtype="str") - gdf[i] = Series(str_data, dtype="str") + gdf[i] = cudf.Series(str_data, dtype="str") pdf["a"] = other_data gdf["a"] = 
other_data @@ -917,18 +920,18 @@ def test_string_join_key_nulls(str_data_nulls): other_data_nulls = [6, 7, 8, 9, 10][: len(str_data_nulls)] pdf = pd.DataFrame() - gdf = DataFrame() + gdf = cudf.DataFrame() pdf["key"] = pd.Series(str_data, dtype="str") - gdf["key"] = Series(str_data, dtype="str") + gdf["key"] = cudf.Series(str_data, dtype="str") pdf["vals"] = other_data gdf["vals"] = other_data pdf2 = pd.DataFrame() - gdf2 = DataFrame() + gdf2 = cudf.DataFrame() pdf2["key"] = pd.Series(str_data_nulls, dtype="str") - gdf2["key"] = Series(str_data_nulls, dtype="str") + gdf2["key"] = cudf.Series(str_data_nulls, dtype="str") pdf2["vals"] = pd.Series(other_data_nulls, dtype="int64") - gdf2["vals"] = Series(other_data_nulls, dtype="int64") + gdf2["vals"] = cudf.Series(other_data_nulls, dtype="int64") expect = pdf.merge(pdf2, on="key", how="left") got = gdf.merge(gdf2, on="key", how="left") @@ -952,10 +955,10 @@ def test_string_join_non_key(str_data, num_cols, how): other_data = [1, 2, 3, 4, 5][: len(str_data)] pdf = pd.DataFrame() - gdf = DataFrame() + gdf = cudf.DataFrame() for i in range(num_cols): pdf[i] = pd.Series(str_data, dtype="str") - gdf[i] = Series(str_data, dtype="str") + gdf[i] = cudf.Series(str_data, dtype="str") pdf["a"] = other_data gdf["a"] = other_data @@ -990,18 +993,18 @@ def test_string_join_non_key_nulls(str_data_nulls): other_data_nulls = [6, 7, 8, 9, 10][: len(str_data_nulls)] pdf = pd.DataFrame() - gdf = DataFrame() + gdf = cudf.DataFrame() pdf["vals"] = pd.Series(str_data, dtype="str") - gdf["vals"] = Series(str_data, dtype="str") + gdf["vals"] = cudf.Series(str_data, dtype="str") pdf["key"] = other_data gdf["key"] = other_data pdf2 = pd.DataFrame() - gdf2 = DataFrame() + gdf2 = cudf.DataFrame() pdf2["vals"] = pd.Series(str_data_nulls, dtype="str") - gdf2["vals"] = Series(str_data_nulls, dtype="str") + gdf2["vals"] = cudf.Series(str_data_nulls, dtype="str") pdf2["key"] = pd.Series(other_data_nulls, dtype="int64") - gdf2["key"] = 
Series(other_data_nulls, dtype="int64") + gdf2["key"] = cudf.Series(other_data_nulls, dtype="int64") expect = pdf.merge(pdf2, on="key", how="left") got = gdf.merge(gdf2, on="key", how="left") @@ -1041,8 +1044,8 @@ def test_string_join_values_nulls(): left_pdf = pd.DataFrame(left_dict) right_pdf = pd.DataFrame(right_dict) - left_gdf = DataFrame.from_pandas(left_pdf) - right_gdf = DataFrame.from_pandas(right_pdf) + left_gdf = cudf.DataFrame.from_pandas(left_pdf) + right_gdf = cudf.DataFrame.from_pandas(right_pdf) expect = left_pdf.merge(right_pdf, how="left", on="b") got = left_gdf.merge(right_gdf, how="left", on="b") @@ -1061,10 +1064,10 @@ def test_string_groupby_key(str_data, num_keys): other_data = [1, 2, 3, 4, 5][: len(str_data)] pdf = pd.DataFrame() - gdf = DataFrame() + gdf = cudf.DataFrame() for i in range(num_keys): pdf[i] = pd.Series(str_data, dtype="str") - gdf[i] = Series(str_data, dtype="str") + gdf[i] = cudf.Series(str_data, dtype="str") pdf["a"] = other_data gdf["a"] = other_data @@ -1086,10 +1089,10 @@ def test_string_groupby_non_key(str_data, num_cols, agg): other_data = [1, 2, 3, 4, 5][: len(str_data)] pdf = pd.DataFrame() - gdf = DataFrame() + gdf = cudf.DataFrame() for i in range(num_cols): pdf[i] = pd.Series(str_data, dtype="str") - gdf[i] = Series(str_data, dtype="str") + gdf[i] = cudf.Series(str_data, dtype="str") pdf["a"] = other_data gdf["a"] = other_data @@ -1111,9 +1114,9 @@ def test_string_groupby_key_index(): other_data = [1, 2, 3, 4, 5] pdf = pd.DataFrame() - gdf = DataFrame() + gdf = cudf.DataFrame() pdf["a"] = pd.Series(str_data, dtype="str") - gdf["a"] = Series(str_data, dtype="str") + gdf["a"] = cudf.Series(str_data, dtype="str") pdf["b"] = other_data gdf["b"] = other_data @@ -1127,7 +1130,7 @@ def test_string_groupby_key_index(): def test_string_set_scalar(scalar): pdf = pd.DataFrame() pdf["a"] = [1, 2, 3, 4, 5] - gdf = DataFrame.from_pandas(pdf) + gdf = cudf.DataFrame.from_pandas(pdf) pdf["b"] = "a" gdf["b"] = "a" @@ -1137,10 
+1140,8 @@ def test_string_set_scalar(scalar): def test_string_index(): - from cudf.core.column import as_column - pdf = pd.DataFrame(np.random.rand(5, 5)) - gdf = DataFrame.from_pandas(pdf) + gdf = cudf.DataFrame.from_pandas(pdf) stringIndex = ["a", "b", "c", "d", "e"] pdf.index = stringIndex gdf.index = stringIndex @@ -1153,7 +1154,9 @@ def test_string_index(): pdf.index = stringIndex.to_pandas() gdf.index = stringIndex assert_eq(pdf, gdf) - stringIndex = as_index(as_column(["a", "b", "c", "d", "e"]), name="name") + stringIndex = cudf.Index( + cudf.core.column.as_column(["a", "b", "c", "d", "e"]), name="name" + ) pdf.index = stringIndex.to_pandas() gdf.index = stringIndex assert_eq(pdf, gdf) @@ -1171,7 +1174,7 @@ def test_string_index(): ) def test_string_unique(item): ps = pd.Series(item) - gs = Series(item) + gs = cudf.Series(item) # Pandas `unique` returns a numpy array pres = pd.Series(ps.unique()) # cudf returns sorted unique with `None` placed before other strings @@ -1181,12 +1184,12 @@ def test_string_unique(item): def test_string_slice(): - df = DataFrame({"a": ["hello", "world"]}) + df = cudf.DataFrame({"a": ["hello", "world"]}) pdf = pd.DataFrame({"a": ["hello", "world"]}) a_slice_got = df.a.str.slice(0, 2) a_slice_expected = pdf.a.str.slice(0, 2) - assert isinstance(a_slice_got, Series) + assert isinstance(a_slice_got, cudf.Series) assert_eq(a_slice_expected, a_slice_got) @@ -1196,8 +1199,8 @@ def test_string_equality(): ps1 = pd.Series(data1) ps2 = pd.Series(data2) - gs1 = Series(data1) - gs2 = Series(data2) + gs1 = cudf.Series(data1) + gs2 = cudf.Series(data2) expect = ps1 == ps2 got = gs1 == gs2 @@ -1210,7 +1213,7 @@ def test_string_equality(): assert_eq(expect, got.fillna(False)) ps1 = pd.Series(["a"]) - gs1 = Series(["a"]) + gs1 = cudf.Series(["a"]) expect = ps1 == "m" got = gs1 == "m" @@ -1234,7 +1237,7 @@ def test_string_equality(): ) def test_string_binary_op_add(lhs, rhs): pds = pd.Series(lhs) + pd.Series(rhs) - gds = Series(lhs) + 
Series(rhs) + gds = cudf.Series(lhs) + cudf.Series(rhs) assert_eq(pds, gds) @@ -1279,7 +1282,7 @@ def test_string_no_children_properties(): ) def test_string_get(string, index): pds = pd.Series(string) - gds = Series(string) + gds = cudf.Series(string) assert_eq( pds.str.get(index).fillna(""), gds.str.get(index).fillna(""), @@ -1302,7 +1305,7 @@ def test_string_get(string, index): ) def test_string_slice_str(string, number, diff): pds = pd.Series(string) - gds = Series(string) + gds = cudf.Series(string) assert_eq(pds.str.slice(start=number), gds.str.slice(start=number)) assert_eq(pds.str.slice(stop=number), gds.str.slice(stop=number)) @@ -1320,11 +1323,11 @@ def test_string_slice_str(string, number, diff): def test_string_slice_from(): - gs = Series(["hello world", "holy accéntéd", "batman", None, ""]) - d_starts = Series([2, 3, 0, -1, -1], dtype=np.int32) - d_stops = Series([-1, -1, 0, -1, -1], dtype=np.int32) + gs = cudf.Series(["hello world", "holy accéntéd", "batman", None, ""]) + d_starts = cudf.Series([2, 3, 0, -1, -1], dtype=np.int32) + d_stops = cudf.Series([-1, -1, 0, -1, -1], dtype=np.int32) got = gs.str.slice_from(starts=d_starts._column, stops=d_stops._column) - expected = Series(["llo world", "y accéntéd", "", None, ""]) + expected = cudf.Series(["llo world", "y accéntéd", "", None, ""]) assert_eq(got, expected) @@ -1341,7 +1344,7 @@ def test_string_slice_from(): @pytest.mark.parametrize("repr", ["2", "!!"]) def test_string_slice_replace(string, number, diff, repr): pds = pd.Series(string) - gds = Series(string) + gds = cudf.Series(string) assert_eq( pds.str.slice_replace(start=number, repl=repr), @@ -1365,7 +1368,7 @@ def test_string_slice_replace(string, number, diff, repr): def test_string_insert(): - gs = Series(["hello world", "holy accéntéd", "batman", None, ""]) + gs = cudf.Series(["hello world", "holy accéntéd", "batman", None, ""]) ps = pd.Series(["hello world", "holy accéntéd", "batman", None, ""]) @@ -1419,7 +1422,7 @@ def 
test_string_insert(): ) @pytest.mark.parametrize("data", _string_char_types_data) def test_string_char_types(type_op, data): - gs = Series(data) + gs = cudf.Series(data) ps = pd.Series(data) assert_eq(getattr(gs.str, type_op)(), getattr(ps.str, type_op)()) @@ -1435,8 +1438,8 @@ def test_string_filter_alphanum(): rs = rs + c expected.append(rs) - gs = Series(data) - assert_eq(gs.str.filter_alphanum(), Series(expected)) + gs = cudf.Series(data) + assert_eq(gs.str.filter_alphanum(), cudf.Series(expected)) expected = [] for st in data: @@ -1445,7 +1448,7 @@ def test_string_filter_alphanum(): if not str.isalnum(c): rs = rs + c expected.append(rs) - assert_eq(gs.str.filter_alphanum(keep=False), Series(expected)) + assert_eq(gs.str.filter_alphanum(keep=False), cudf.Series(expected)) expected = [] for st in data: @@ -1456,7 +1459,7 @@ def test_string_filter_alphanum(): else: rs = rs + "*" expected.append(rs) - assert_eq(gs.str.filter_alphanum("*"), Series(expected)) + assert_eq(gs.str.filter_alphanum("*"), cudf.Series(expected)) expected = [] for st in data: @@ -1467,7 +1470,7 @@ def test_string_filter_alphanum(): else: rs = rs + "*" expected.append(rs) - assert_eq(gs.str.filter_alphanum("*", keep=False), Series(expected)) + assert_eq(gs.str.filter_alphanum("*", keep=False), cudf.Series(expected)) @pytest.mark.parametrize( @@ -1486,7 +1489,7 @@ def test_string_filter_alphanum(): ], ) def test_string_char_case(case_op, data): - gs = Series(data) + gs = cudf.Series(data) ps = pd.Series(data) s = gs.str @@ -1516,7 +1519,7 @@ def test_string_char_case(case_op, data): ], ) def test_strings_rpartition(data): - gs = Series(data) + gs = cudf.Series(data) ps = pd.Series(data) assert_eq(ps.str.rpartition(), gs.str.rpartition()) @@ -1535,7 +1538,7 @@ def test_strings_rpartition(data): ], ) def test_strings_partition(data): - gs = Series(data, name="str_name") + gs = cudf.Series(data, name="str_name") ps = pd.Series(data, name="str_name") assert_eq(ps.str.partition(), 
gs.str.partition()) @@ -1567,7 +1570,7 @@ def test_strings_partition(data): @pytest.mark.parametrize("n", [-1, 2, 1, 9]) @pytest.mark.parametrize("expand", [True, False, None]) def test_strings_rsplit(data, n, expand): - gs = Series(data) + gs = cudf.Series(data) ps = pd.Series(data) assert_eq( @@ -1603,7 +1606,7 @@ def test_strings_rsplit(data, n, expand): @pytest.mark.parametrize("n", [-1, 2, 1, 9]) @pytest.mark.parametrize("expand", [True, False, None]) def test_strings_split(data, n, expand): - gs = Series(data) + gs = cudf.Series(data) ps = pd.Series(data) assert_eq( @@ -1642,7 +1645,7 @@ def test_strings_split(data, n, expand): "to_strip", ["⅕", None, "123.", ".!? \n\t", "123.!? \n\t", " ", ".", ","] ) def test_strings_strip_tests(data, to_strip): - gs = Series(data) + gs = cudf.Series(data) ps = pd.Series(data) assert_eq(ps.str.strip(to_strip=to_strip), gs.str.strip(to_strip=to_strip)) @@ -1684,7 +1687,7 @@ def test_strings_strip_tests(data, to_strip): @pytest.mark.parametrize("width", [0, 1, 4, 9, 100]) @pytest.mark.parametrize("fillchar", ["⅕", "1", ".", "t", " ", ","]) def test_strings_filling_tests(data, width, fillchar): - gs = Series(data) + gs = cudf.Series(data) ps = pd.Series(data) assert_eq( @@ -1730,7 +1733,7 @@ def test_strings_filling_tests(data, width, fillchar): ) @pytest.mark.parametrize("width", [0, 1, 4, 6, 9, 100]) def test_strings_zfill_tests(data, width): - gs = Series(data) + gs = cudf.Series(data) ps = pd.Series(data) assert_eq(ps.str.zfill(width=width), gs.str.zfill(width=width)) @@ -1758,7 +1761,7 @@ def test_strings_zfill_tests(data, width): ) @pytest.mark.parametrize("fillchar", [" ", ".", "\n", "+", "\t"]) def test_strings_pad_tests(data, width, side, fillchar): - gs = Series(data) + gs = cudf.Series(data) ps = pd.Series(data) assert_eq( @@ -1788,7 +1791,7 @@ def test_strings_pad_tests(data, width, side, fillchar): ) @pytest.mark.parametrize("width", [1, 4, 8, 12, 100]) def test_string_wrap(data, width): - gs = Series(data) + gs = 
cudf.Series(data) ps = pd.Series(data) assert_eq( @@ -1832,7 +1835,7 @@ def test_string_wrap(data, width): ) @pytest.mark.parametrize("pat", ["a", " ", "\t", "another", "0", r"\$"]) def test_string_count(data, pat): - gs = Series(data) + gs = cudf.Series(data) ps = pd.Series(data) assert_eq(gs.str.count(pat=pat), ps.str.count(pat=pat), check_dtype=False) @@ -1841,7 +1844,7 @@ def test_string_count(data, pat): def test_string_findall(): ps = pd.Series(["Lion", "Monkey", "Rabbit"]) - gs = Series(["Lion", "Monkey", "Rabbit"]) + gs = cudf.Series(["Lion", "Monkey", "Rabbit"]) assert_eq(ps.str.findall("Monkey")[1][0], gs.str.findall("Monkey")[0][1]) assert_eq(ps.str.findall("on")[0][0], gs.str.findall("on")[0][0]) @@ -1852,21 +1855,21 @@ def test_string_findall(): def test_string_replace_multi(): ps = pd.Series(["hello", "goodbye"]) - gs = Series(["hello", "goodbye"]) + gs = cudf.Series(["hello", "goodbye"]) expect = ps.str.replace("e", "E").str.replace("o", "O") got = gs.str.replace(["e", "o"], ["E", "O"]) assert_eq(expect, got) ps = pd.Series(["foo", "fuz", np.nan]) - gs = Series.from_pandas(ps) + gs = cudf.Series.from_pandas(ps) expect = ps.str.replace("f.", "ba", regex=True) got = gs.str.replace(["f."], ["ba"], regex=True) assert_eq(expect, got) ps = pd.Series(["f.o", "fuz", np.nan]) - gs = Series.from_pandas(ps) + gs = cudf.Series.from_pandas(ps) expect = ps.str.replace("f.", "ba", regex=False) got = gs.str.replace(["f."], ["ba"], regex=False) @@ -1902,7 +1905,7 @@ def test_string_replace_with_backrefs(find, replace): "tést-string-again", ] ps = pd.Series(s) - gs = Series(s) + gs = cudf.Series(s) got = gs.str.replace_with_backrefs(find, replace) expected = ps.str.replace(find, replace, regex=True) assert_eq(got, expected) @@ -1915,7 +1918,7 @@ def test_string_replace_with_backrefs(find, replace): def test_string_table_view_creation(): data = ["hi"] * 25 + [None] * 2027 psr = pd.Series(data) - gsr = Series.from_pandas(psr) + gsr = cudf.Series.from_pandas(psr) expect 
= psr[:1] got = gsr[:1] @@ -1941,7 +1944,7 @@ def test_string_table_view_creation(): ) def test_string_starts_ends(data, pat): ps = pd.Series(data) - gs = Series(data) + gs = cudf.Series(data) assert_eq( ps.str.startswith(pat), gs.str.startswith(pat), check_dtype=False @@ -1978,7 +1981,7 @@ def test_string_starts_ends(data, pat): ], ) def test_string_starts_ends_list_like_pat(data, pat): - gs = Series(data) + gs = cudf.Series(data) starts_expected = [] ends_expected = [] @@ -2017,7 +2020,7 @@ def test_string_starts_ends_list_like_pat(data, pat): ) def test_string_find(data, sub): ps = pd.Series(data) - gs = Series(data) + gs = cudf.Series(data) got = gs.str.find(sub) expect = ps.str.find(sub) @@ -2087,7 +2090,7 @@ def test_string_find(data, sub): ) def test_string_str_index(data, sub, er): ps = pd.Series(data) - gs = Series(data) + gs = cudf.Series(data) if er is None: assert_eq(ps.str.index(sub), gs.str.index(sub), check_dtype=False) @@ -2126,7 +2129,7 @@ def test_string_str_index(data, sub, er): ) def test_string_str_rindex(data, sub, er): ps = pd.Series(data) - gs = Series(data) + gs = cudf.Series(data) if er is None: assert_eq(ps.str.rindex(sub), gs.str.rindex(sub), check_dtype=False) @@ -2183,10 +2186,10 @@ def test_string_str_rindex(data, sub, er): ], ) def test_string_contains_multi(data, sub, expect): - gs = Series(data) - sub = Series(sub) + gs = cudf.Series(data) + sub = cudf.Series(sub) got = gs.str.contains(sub) - expect = Series(expect) + expect = cudf.Series(expect) assert_eq(expect, got, check_dtype=False) @@ -2206,7 +2209,7 @@ def test_string_contains_multi(data, sub, expect): @pytest.mark.parametrize("pat", ["", " ", "a", "abc", "cat", "$", "\n"]) def test_string_str_match(data, pat): ps = pd.Series(data) - gs = Series(data) + gs = cudf.Series(data) assert_eq(ps.str.match(pat), gs.str.match(pat)) assert_eq( @@ -2229,7 +2232,7 @@ def test_string_str_match(data, pat): ) def test_string_str_translate(data): ps = pd.Series(data) - gs = Series(data) + 
gs = cudf.Series(data) assert_eq( ps.str.translate(str.maketrans({"a": "z"})), @@ -2284,15 +2287,17 @@ def test_string_str_filter_characters(): "$1.50", "", ] - gs = Series(data) - expected = Series(["helloworld", "ABCD", "", "accnt", None, "150", ""]) + gs = cudf.Series(data) + expected = cudf.Series( + ["helloworld", "ABCD", "", "accnt", None, "150", ""] + ) filter = {"a": "z", "A": "Z", "0": "9"} assert_eq(expected, gs.str.filter_characters(filter)) - expected = Series([" ", "+++", "?!@#$%^&*()", "é", None, "$.", ""]) + expected = cudf.Series([" ", "+++", "?!@#$%^&*()", "é", None, "$.", ""]) assert_eq(expected, gs.str.filter_characters(filter, False)) - expected = Series( + expected = cudf.Series( ["hello world", "A B C D", " ", "acc nt", None, " 1 50", ""] ) assert_eq(expected, gs.str.filter_characters(filter, True, " ")) @@ -2311,7 +2316,7 @@ def test_string_str_code_points(): " 1234 ", "XYZ", ] - gs = Series(data) + gs = cudf.Series(data) expected = [ 97, 98, @@ -2351,7 +2356,7 @@ def test_string_str_code_points(): 89, 90, ] - expected = Series(expected) + expected = cudf.Series(expected) assert_eq(expected, gs.str.code_points(), check_dtype=False) @@ -2366,9 +2371,7 @@ def test_string_str_code_points(): ], ) def test_string_str_url_encode(data): - import urllib.parse - - gs = Series(data) + gs = cudf.Series(data) got = gs.str.url_encode() expected = pd.Series([urllib.parse.quote(url, safe="~") for url in data]) @@ -2386,9 +2389,7 @@ def test_string_str_url_encode(data): ], ) def test_string_str_decode_url(data): - import urllib.parse - - gs = Series(data) + gs = cudf.Series(data) got = gs.str.url_decode() expected = pd.Series([urllib.parse.unquote(url) for url in data]) @@ -2410,7 +2411,7 @@ def test_string_str_decode_url(data): @pytest.mark.parametrize("obj_type", [None, "str", "category"]) def test_string_typecast(data, obj_type, dtype): psr = pd.Series(data, dtype=obj_type) - gsr = Series(data, dtype=obj_type) + gsr = cudf.Series(data, dtype=obj_type) 
expect = psr.astype(dtype=dtype) actual = gsr.astype(dtype=dtype) @@ -2449,7 +2450,7 @@ def test_string_typecast(data, obj_type, dtype): @pytest.mark.parametrize("obj_type", [None, "str", "category"]) def test_string_typecast_error(data, obj_type, dtype): psr = pd.Series(data, dtype=obj_type) - gsr = Series(data, dtype=obj_type) + gsr = cudf.Series(data, dtype=obj_type) assert_exceptions_equal( lfunc=psr.astype, @@ -2471,23 +2472,23 @@ def test_string_typecast_error(data, obj_type, dtype): ) def test_string_hex_to_int(data): - gsr = Series(data) + gsr = cudf.Series(data) got = gsr.str.htoi() - expected = Series([263988422296292, 0, 281474976710655]) + expected = cudf.Series([263988422296292, 0, 281474976710655]) assert_eq(expected, got) def test_string_ishex(): - gsr = Series(["", None, "0x01a2b3c4d5e6f", "0789", "ABCDEF0"]) + gsr = cudf.Series(["", None, "0x01a2b3c4d5e6f", "0789", "ABCDEF0"]) got = gsr.str.ishex() - expected = Series([False, None, True, True, True]) + expected = cudf.Series([False, None, True, True, True]) assert_eq(expected, got) def test_string_istimestamp(): - gsr = Series( + gsr = cudf.Series( [ "", None, @@ -2505,7 +2506,7 @@ def test_string_istimestamp(): ] ) got = gsr.str.istimestamp(r"%Y%m%d %H%M%S.%f%p%z") - expected = Series( + expected = cudf.Series( [ False, None, @@ -2526,8 +2527,10 @@ def test_string_istimestamp(): def test_string_ip4_to_int(): - gsr = Series(["", None, "hello", "41.168.0.1", "127.0.0.1", "41.197.0.1"]) - expected = Series([0, None, 0, 698875905, 2130706433, 700776449]) + gsr = cudf.Series( + ["", None, "hello", "41.168.0.1", "127.0.0.1", "41.197.0.1"] + ) + expected = cudf.Series([0, None, 0, 698875905, 2130706433, 700776449]) got = gsr.str.ip2int() @@ -2535,18 +2538,18 @@ def test_string_ip4_to_int(): def test_string_int_to_ipv4(): - gsr = Series([0, None, 0, 698875905, 2130706433, 700776449]) - expected = Series( + gsr = cudf.Series([0, None, 0, 698875905, 2130706433, 700776449]) + expected = cudf.Series( 
["0.0.0.0", None, "0.0.0.0", "41.168.0.1", "127.0.0.1", "41.197.0.1"] ) - got = Series(gsr._column.int2ip()) + got = cudf.Series(gsr._column.int2ip()) assert_eq(expected, got) def test_string_isipv4(): - gsr = Series( + gsr = cudf.Series( [ "", None, @@ -2562,7 +2565,7 @@ def test_string_isipv4(): ] ) got = gsr.str.isipv4() - expected = Series( + expected = cudf.Series( [ False, None, @@ -2584,7 +2587,7 @@ def test_string_isipv4(): "dtype", sorted(list(dtypeutils.NUMERIC_TYPES - {"int64", "uint64"})) ) def test_string_int_to_ipv4_dtype_fail(dtype): - gsr = Series([1, 2, 3, 4, 5]).astype(dtype) + gsr = cudf.Series([1, 2, 3, 4, 5]).astype(dtype) with pytest.raises(TypeError): gsr._column.int2ip() @@ -2615,7 +2618,7 @@ def test_string_int_to_ipv4_dtype_fail(dtype): ) def test_string_str_subscriptable(data, index): psr = pd.Series(data) - gsr = Series(data) + gsr = cudf.Series(data) assert_eq(psr.str[index], gsr.str[index]) @@ -2637,8 +2640,8 @@ def test_string_str_subscriptable(data, index): ], ) def test_string_str_byte_count(data, expected): - sr = Series(data) - expected = Series(expected, dtype="int32") + sr = cudf.Series(data) + expected = cudf.Series(expected, dtype="int32") actual = sr.str.byte_count() assert_eq(expected, actual) @@ -2686,8 +2689,8 @@ def test_string_str_byte_count(data, expected): ], ) def test_str_isinteger(data, expected): - sr = Series(data, dtype="str") - expected = Series(expected) + sr = cudf.Series(data, dtype="str") + expected = cudf.Series(expected) actual = sr.str.isinteger() assert_eq(expected, actual) @@ -2742,8 +2745,8 @@ def test_str_isinteger(data, expected): ], ) def test_str_isfloat(data, expected): - sr = Series(data, dtype="str") - expected = Series(expected) + sr = cudf.Series(data, dtype="str") + expected = cudf.Series(expected) actual = sr.str.isfloat() assert_eq(expected, actual) @@ -2773,7 +2776,7 @@ def test_str_isfloat(data, expected): ) def test_str_min(data): psr = pd.Series(data) - sr = Series(data) + sr = 
cudf.Series(data) assert_eq(psr.min(), sr.min()) @@ -2798,7 +2801,7 @@ def test_str_min(data): ) def test_str_max(data): psr = pd.Series(data) - sr = Series(data) + sr = cudf.Series(data) assert_eq(psr.max(), sr.max()) @@ -2823,13 +2826,13 @@ def test_str_max(data): ) def test_str_sum(data): psr = pd.Series(data) - sr = Series(data) + sr = cudf.Series(data) assert_eq(psr.sum(), sr.sum()) def test_str_mean(): - sr = Series(["a", "b", "c", "d", "e"]) + sr = cudf.Series(["a", "b", "c", "d", "e"]) with pytest.raises(TypeError): sr.mean() @@ -2837,20 +2840,20 @@ def test_str_mean(): def test_string_product(): psr = pd.Series(["1", "2", "3", "4", "5"]) - sr = Series(["1", "2", "3", "4", "5"]) + sr = cudf.Series(["1", "2", "3", "4", "5"]) assert_exceptions_equal( lfunc=psr.product, rfunc=sr.product, expected_error_message=re.escape( - f"cannot perform prod with type {sr.dtype}" + f"cannot perform product with type {sr.dtype}" ), ) def test_string_var(): psr = pd.Series(["1", "2", "3", "4", "5"]) - sr = Series(["1", "2", "3", "4", "5"]) + sr = cudf.Series(["1", "2", "3", "4", "5"]) assert_exceptions_equal( lfunc=psr.var, rfunc=sr.var, compare_error_message=False @@ -2859,7 +2862,7 @@ def test_string_var(): def test_string_std(): psr = pd.Series(["1", "2", "3", "4", "5"]) - sr = Series(["1", "2", "3", "4", "5"]) + sr = cudf.Series(["1", "2", "3", "4", "5"]) assert_exceptions_equal( lfunc=psr.std, rfunc=sr.std, compare_error_message=False diff --git a/python/cudf/cudf/tests/test_timedelta.py b/python/cudf/cudf/tests/test_timedelta.py index 3b625a5ad85..3efc30af01e 100644 --- a/python/cudf/cudf/tests/test_timedelta.py +++ b/python/cudf/cudf/tests/test_timedelta.py @@ -1,4 +1,5 @@ # Copyright (c) 2020-2021, NVIDIA CORPORATION. 
+ import datetime import operator import re @@ -9,6 +10,7 @@ import pytest import cudf +from cudf.core._compat import PANDAS_GE_120 from cudf.tests import utils as utils from cudf.tests.utils import assert_eq, assert_exceptions_equal @@ -423,7 +425,8 @@ def test_timedelta_dataframe_ops(df, op): pytest.param( np.timedelta64("nat"), marks=pytest.mark.xfail( - reason="https://github.com/pandas-dev/pandas/issues/35529" + condition=not PANDAS_GE_120, + reason="https://github.com/pandas-dev/pandas/issues/35529", ), ), np.timedelta64(1, "s"), @@ -443,7 +446,8 @@ def test_timedelta_dataframe_ops(df, op): pytest.param( "floordiv", marks=pytest.mark.xfail( - reason="https://github.com/pandas-dev/pandas/issues/35529" + condition=not PANDAS_GE_120, + reason="https://github.com/pandas-dev/pandas/issues/35529", ), ), ], @@ -521,14 +525,16 @@ def test_timedelta_series_ops_with_scalars(data, other_scalars, dtype, op): datetime.timedelta(microseconds=7), np.timedelta64(4, "s"), pytest.param( - np.timedelta64("nat"), + np.timedelta64("nat", "s"), marks=pytest.mark.xfail( - reason="https://github.com/pandas-dev/pandas/issues/35529" + condition=not PANDAS_GE_120, + reason="https://github.com/pandas-dev/pandas/issues/35529", ), ), np.timedelta64(1, "s"), np.timedelta64(1, "ms"), np.timedelta64(1, "us"), + np.timedelta64("nat", "ns"), np.timedelta64(1, "ns"), ], ) @@ -543,7 +549,8 @@ def test_timedelta_series_ops_with_scalars(data, other_scalars, dtype, op): pytest.param( "floordiv", marks=pytest.mark.xfail( - reason="https://github.com/pandas-dev/pandas/issues/35529" + condition=not PANDAS_GE_120, + reason="https://github.com/pandas-dev/pandas/issues/35529", ), ), ], @@ -597,18 +604,8 @@ def test_timedelta_series_ops_with_cudf_scalars(data, cpu_scalar, dtype, op): [1000000, 200000, 3000000], [1000000, 200000, None], [], - pytest.param( - [None], - marks=pytest.mark.xfail( - reason="https://github.com/pandas-dev/pandas/issues/35644" - ), - ), - pytest.param( - [None, None, None, None, 
None], - marks=pytest.mark.xfail( - reason="https://github.com/pandas-dev/pandas/issues/35644" - ), - ), + [None], + [None, None, None, None, None], [12, 12, 22, 343, 4353534, 435342], np.array([10, 20, 30, None, 100]), cp.asarray([10, 20, 30, 100]), @@ -872,7 +869,7 @@ def test_timedelta_index_ops_with_scalars(data, other_scalars, dtype, op): pytest.param( "floordiv", marks=pytest.mark.xfail( - reason="https://github.com/pandas-dev/pandas/issues/35529" + reason="https://github.com/rapidsai/cudf/issues/5938" ), ), ], diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 274285990a6..20c86b2a4b7 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -14,6 +14,7 @@ import cudf from cudf._lib.scalar import DeviceScalar, _is_null_host_scalar +from cudf.core._compat import PANDAS_GE_120 _NA_REP = "" _np_pa_dtypes = { @@ -73,6 +74,12 @@ pd.StringDtype(): np.dtype("object"), } +if PANDAS_GE_120: + cudf_dtypes_to_pandas_dtypes[np.dtype("float32")] = pd.Float32Dtype() + cudf_dtypes_to_pandas_dtypes[np.dtype("float64")] = pd.Float64Dtype() + pandas_dtypes_to_cudf_dtypes[pd.Float32Dtype()] = np.dtype("float32") + pandas_dtypes_to_cudf_dtypes[pd.Float64Dtype()] = np.dtype("float64") + SIGNED_INTEGER_TYPES = {"int8", "int16", "int32", "int64"} UNSIGNED_TYPES = {"uint8", "uint16", "uint32", "uint64"} INTEGER_TYPES = SIGNED_INTEGER_TYPES | UNSIGNED_TYPES diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index 74622a8ceb2..e8b8c53312a 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. 
+
 import functools
 from collections import OrderedDict
 from collections.abc import Sequence
@@ -622,3 +623,57 @@ def _categorical_scalar_broadcast_to(cat_scalar, size):
         offset=codes.offset,
         ordered=ordered,
     )
+
+
+def _create_pandas_series(
+    data=None, index=None, dtype=None, name=None, copy=False, fastpath=False
+):
+    """
+    Wrapper to create a Pandas Series. If data is None or has length 0
+    and dtype is not passed, this wrapper defaults the dtype to `float64`.
+
+    Parameters
+    ----------
+    data : array-like, Iterable, dict, or scalar value
+        Contains data stored in Series. If data is a dict, argument
+        order is maintained. Inputs without a length (for example
+        scalars) are never treated as empty.
+    index : array-like or Index (1d)
+        Values must be hashable and have the same length as data.
+        Non-unique index values are allowed. Will default to
+        RangeIndex (0, 1, 2, …, n) if not provided.
+        If data is dict-like and index is None, then the keys
+        in the data are used as the index. If the index is not None,
+        the resulting Series is reindexed with the index values.
+    dtype : str, numpy.dtype, or ExtensionDtype, optional
+        Data type for the output Series. If not specified, this
+        will be inferred from data. See the user guide for more usages.
+    name : str, optional
+        The name to give to the Series.
+    copy : bool, default False
+        Copy input data.
+    fastpath : bool, default False
+        Forwarded to ``pd.Series`` only when truthy.
+
+    Returns
+    -------
+    pd.Series
+    """
+    if dtype is None:
+        try:
+            is_empty = data is None or len(data) == 0
+        except TypeError:
+            # Scalars and other unsized inputs have no ``len``; they
+            # carry a value, so let pandas infer the dtype.
+            is_empty = False
+        if is_empty:
+            dtype = "float64"
+
+    kwargs = {}
+    if fastpath:
+        # Leaving the argument out when it is False keeps the default
+        # call on pandas' public ``Series`` signature.
+        kwargs["fastpath"] = fastpath
+    return pd.Series(
+        data=data, index=index, dtype=dtype, name=name, copy=copy, **kwargs
+    )
diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py
index 10719794843..e2b77ba192e 100644
--- a/python/dask_cudf/dask_cudf/tests/test_core.py
+++ b/python/dask_cudf/dask_cudf/tests/test_core.py
@@ -1,3 +1,5 @@
+# Copyright (c) 2021, NVIDIA CORPORATION.
+ import random import cupy as cp @@ -658,7 +660,7 @@ def test_make_meta_backends(index): @pytest.mark.parametrize( "data", [ - pd.Series([]), + pd.Series([], dtype="float64"), pd.DataFrame({"abc": [], "xyz": []}), pd.Series([1, 2, 10, 11]), pd.DataFrame({"abc": [1, 2, 10, 11], "xyz": [100, 12, 120, 1]}), @@ -717,7 +719,9 @@ def test_dataframe_describe(): ddf = dgd.from_cudf(df, npartitions=4) pddf = dd.from_pandas(pdf, npartitions=4) - dd.assert_eq(ddf.describe(), pddf.describe(), check_less_precise=3) + dd.assert_eq( + ddf.describe(), pddf.describe(), check_exact=False, atol=0.0001 + ) def test_index_map_partitions(): diff --git a/python/dask_cudf/dask_cudf/tests/test_groupby.py b/python/dask_cudf/dask_cudf/tests/test_groupby.py index 42ca4702987..f8ed00beb4f 100644 --- a/python/dask_cudf/dask_cudf/tests/test_groupby.py +++ b/python/dask_cudf/dask_cudf/tests/test_groupby.py @@ -1,3 +1,5 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. + import numpy as np import pandas as pd import pytest @@ -8,6 +10,7 @@ import dask_cudf import cudf +from cudf.core._compat import PANDAS_GE_120 @pytest.mark.parametrize("aggregation", ["sum", "mean", "count", "min", "max"]) @@ -126,10 +129,16 @@ def test_groupby_std(func): "func", [ pytest.param( - lambda df: df.groupby(["a", "b"]).x.sum(), marks=pytest.mark.xfail + lambda df: df.groupby(["a", "b"]).x.sum(), + marks=pytest.mark.xfail( + condition=not PANDAS_GE_120, reason="pandas bug" + ), ), pytest.param( - lambda df: df.groupby(["a", "b"]).sum(), marks=pytest.mark.xfail + lambda df: df.groupby(["a", "b"]).sum(), + marks=pytest.mark.xfail( + condition=not PANDAS_GE_120, reason="pandas bug" + ), ), pytest.param( lambda df: df.groupby(["a", "b"]).agg({"x", "sum"}), diff --git a/python/dask_cudf/dask_cudf/tests/test_reductions.py b/python/dask_cudf/dask_cudf/tests/test_reductions.py index 61a7ae8af1c..030b7717fbc 100644 --- a/python/dask_cudf/dask_cudf/tests/test_reductions.py +++ b/python/dask_cudf/dask_cudf/tests/test_reductions.py 
@@ -1,13 +1,15 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. + import numpy as np import pandas as pd import pytest -import dask.dataframe as dd - -import cudf as gd +from dask import dataframe as dd import dask_cudf as dgd +import cudf + def _make_random_frame(nelem, npartitions=2): df = pd.DataFrame( @@ -16,7 +18,7 @@ def _make_random_frame(nelem, npartitions=2): "y": np.random.normal(size=nelem) + 1, } ) - gdf = gd.DataFrame.from_pandas(df) + gdf = cudf.DataFrame.from_pandas(df) dgf = dgd.from_cudf(gdf, npartitions=npartitions) return df, dgf @@ -47,15 +49,15 @@ def test_series_reduce(reducer): @pytest.mark.parametrize( "data", [ - gd.datasets.randomdata( + cudf.datasets.randomdata( nrows=10000, dtypes={"a": "category", "b": int, "c": float, "d": int}, ), - gd.datasets.randomdata( + cudf.datasets.randomdata( nrows=10000, dtypes={"a": "category", "b": int, "c": float, "d": str}, ), - gd.datasets.randomdata( + cudf.datasets.randomdata( nrows=10000, dtypes={"a": bool, "b": int, "c": float, "d": str} ), ], @@ -75,4 +77,4 @@ def test_rowwise_reductions(data, op): expected = getattr(pddf, op)(axis=1) got = getattr(pddf, op)(axis=1) - dd.assert_eq(expected.compute(), got.compute(), check_less_precise=7) + dd.assert_eq(expected.compute(), got.compute(), check_exact=False)