From 60c7c876f9a6fa7be44ccbe81902c7c113d6df5b Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 4 Aug 2021 08:35:06 -0400 Subject: [PATCH 01/20] Replace cudf.dtype -> np.dtype --- python/cudf/cudf/__init__.py | 1 + python/cudf/cudf/api/types.py | 10 ++++++++++ python/cudf/cudf/core/column/column.py | 8 ++++---- python/cudf/cudf/core/column/datetime.py | 4 ++-- python/cudf/cudf/core/column/numerical.py | 6 +++--- python/cudf/cudf/core/column/string.py | 8 ++++---- python/cudf/cudf/core/column/timedelta.py | 4 ++-- python/cudf/cudf/core/scalar.py | 3 ++- python/cudf/cudf/core/series.py | 2 +- python/cudf/cudf/testing/_utils.py | 2 +- python/cudf/cudf/testing/dataset_generator.py | 10 +++++----- python/cudf/cudf/tests/test_binops.py | 12 ++++++------ python/cudf/cudf/tests/test_categorical.py | 2 +- python/cudf/cudf/tests/test_contains.py | 5 +++-- python/cudf/cudf/tests/test_joining.py | 2 +- python/cudf/cudf/tests/test_label_encode.py | 3 ++- python/cudf/cudf/tests/test_numerical.py | 2 +- python/cudf/cudf/tests/test_reductions.py | 14 +++++++------- python/cudf/cudf/tests/test_repr.py | 4 ++-- python/cudf/cudf/tests/test_scalar.py | 4 ++-- python/cudf/cudf/tests/test_udf_binops.py | 3 ++- python/cudf/cudf/tests/test_unaops.py | 4 ++-- python/cudf/cudf/utils/dtypes.py | 8 ++++---- python/cudf/cudf/utils/utils.py | 2 +- 24 files changed, 69 insertions(+), 54 deletions(-) diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index 2d52b517242..23621f1e315 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -15,6 +15,7 @@ register_index_accessor, register_series_accessor, ) +from cudf.api.types import dtype from cudf.core import ( NA, BaseIndex, diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py index 01af22f70bf..a1237e34366 100644 --- a/python/cudf/cudf/api/types.py +++ b/python/cudf/cudf/api/types.py @@ -27,6 +27,16 @@ ) +def dtype(arbitrary): + try: + return np.dtype(arbitrary) + except TypeError: + pass + if isinstance(arbitrary, cudf.core.dtypes._BaseDtype): + return arbitrary + return pd.api.types.pandas_type(arbitrary) + + def is_numeric_dtype(obj): """Check whether the provided array or dtype is of a numeric dtype. diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index d449d52927e..f2945694a16 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -432,7 +432,7 @@ def view(self, dtype: Dtype) -> ColumnBase: """ - dtype = np.dtype(dtype) + dtype = cudf.dtype(dtype) if dtype.kind in ("o", "u", "s"): raise TypeError( @@ -2078,11 +2078,11 @@ def as_column( data ) dtype = pd.api.types.pandas_dtype(dtype) - np_type = np.dtype(dtype).type + np_type = cudf.dtype(dtype).type if np_type == np.bool_: pa_type = pa.bool_() else: - pa_type = np_to_pa_dtype(np.dtype(dtype)) + pa_type = np_to_pa_dtype(cudf.dtype(dtype)) data = as_column( pa.array( arbitrary, @@ -2131,7 +2131,7 @@ def _construct_array( Construct a CuPy or NumPy array from `arbitrary` """ try: - dtype = dtype if dtype is None else np.dtype(dtype) + dtype = dtype if dtype is None else cudf.dtype(dtype) arbitrary = cupy.asarray(arbitrary, dtype=dtype) except (TypeError, ValueError): native_dtype = dtype diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index f3d1880b290..4d99308d128 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -71,7 +71,7 @@ def __init__( mask : Buffer; optional The validity mask """ - dtype = np.dtype(dtype) + dtype = cudf.dtype(dtype) if data.size % dtype.itemsize: raise ValueError("Buffer size must be divisible by element size") if size is None: @@ -236,7 +236,7 @@ def __cuda_array_interface__(self) -> Mapping[builtins.str, Any]: return output def as_datetime_column(self, dtype: Dtype, **kwargs) -> DatetimeColumn: - dtype = np.dtype(dtype) + dtype = cudf.dtype(dtype) if dtype == self.dtype: return self return libcudf.unary.cast(self, dtype=dtype) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index a3f4a82a7dc..0955039dafd 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -53,7 +53,7 @@ def __init__( The dtype associated with the data Buffer mask : Buffer, optional """ - dtype = np.dtype(dtype) + dtype = cudf.dtype(dtype) if data.size % dtype.itemsize: raise ValueError("Buffer size must be divisible by element size") if size is None: @@ -253,7 +253,7 @@ def as_decimal_column( return libcudf.unary.cast(self, dtype) def as_numerical_column(self, dtype: Dtype, **kwargs) -> NumericalColumn: - dtype = np.dtype(dtype) + dtype = cudf.dtype(dtype) if dtype == self.dtype: return self return libcudf.unary.cast(self, dtype) @@ -608,7 +608,7 @@ def _safe_cast_to_int(col: ColumnBase, dtype: DtypeObj) -> ColumnBase: else: raise TypeError( f"Cannot safely cast non-equivalent " - f"{col.dtype.type.__name__} to {np.dtype(dtype).type.__name__}" + f"{col.dtype.type.__name__} to {cudf.dtype(dtype).type.__name__}" ) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 92c57477465..ed776b62470 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5054,7 +5054,7 @@ def __contains__(self, item: ScalarLike) -> bool: def as_numerical_column( self, dtype: Dtype, **kwargs ) -> "cudf.core.column.NumericalColumn": - out_dtype = np.dtype(dtype) + out_dtype = cudf.dtype(dtype) if out_dtype.kind in {"i", "u"}: if not libstrings.is_integer(self).all(): @@ -5096,7 +5096,7 @@ def _as_datetime_or_timedelta_column(self, dtype, format): def as_datetime_column( self, dtype: Dtype, **kwargs ) -> "cudf.core.column.DatetimeColumn": - out_dtype = np.dtype(dtype) + out_dtype = cudf.dtype(dtype) # infer on host from the first not na element # or return all null column if all values @@ -5120,7 +5120,7 @@ def as_datetime_column( def as_timedelta_column( self, dtype: Dtype, **kwargs ) -> "cudf.core.column.TimeDeltaColumn": - out_dtype = np.dtype(dtype) + out_dtype = cudf.dtype(dtype) format = "%D days %H:%M:%S" return self._as_datetime_or_timedelta_column(out_dtype, format) @@ -5379,7 +5379,7 @@ def view(self, dtype) -> "cudf.core.column.ColumnBase": raise ValueError( "Can not produce a view of a string column with nulls" ) - dtype = np.dtype(dtype) + dtype = cudf.dtype(dtype) str_byte_offset = self.base_children[0].element_indexing(self.offset) str_end_byte_offset = self.base_children[0].element_indexing( self.offset + self.size diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index a27c20cc50c..2c893fafae7 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -60,7 +60,7 @@ def __init__( The number of null values. If None, it is calculated automatically. """ - dtype = np.dtype(dtype) + dtype = cudf.dtype(dtype) if data.size % dtype.itemsize: raise ValueError("Buffer size must be divisible by element size") if size is None: @@ -353,7 +353,7 @@ def as_string_column( ) def as_timedelta_column(self, dtype: Dtype, **kwargs) -> TimeDeltaColumn: - dtype = np.dtype(dtype) + dtype = cudf.dtype(dtype) if dtype == self.dtype: return self return libcudf.unary.cast(self, dtype=dtype) diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py index c6663a25684..75be36a1b16 100644 --- a/python/cudf/cudf/core/scalar.py +++ b/python/cudf/cudf/core/scalar.py @@ -5,6 +5,7 @@ import pyarrow as pa from pandas._libs.missing import NAType as pd_NAType +import cudf from cudf._lib.scalar import DeviceScalar, _is_null_host_scalar from cudf.core.column.column import ColumnBase from cudf.core.dtypes import Decimal64Dtype, ListDtype, StructDtype @@ -171,7 +172,7 @@ def _preprocess_host_value(self, value, dtype): dtype = value.dtype if not isinstance(dtype, Decimal64Dtype): - dtype = np.dtype(dtype) + dtype = cudf.dtype(dtype) if not valid: value = NA diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index fb197fbc90d..0080207c908 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -3774,7 +3774,7 @@ def one_hot_encoding(self, cats, dtype="float64"): cats = cats.to_pandas() else: cats = pd.Series(cats, dtype="object") - dtype = np.dtype(dtype) + dtype = cudf.dtype(dtype) def encode(cat): if cat is None: diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py index 672e83e6f64..68914c9b0e2 100644 --- a/python/cudf/cudf/testing/_utils.py +++ b/python/cudf/cudf/testing/_utils.py @@ -245,7 +245,7 @@ def _get_args_kwars_for_assert_exceptions(func_args_and_kwargs): def gen_rand(dtype, size, **kwargs): - dtype = np.dtype(dtype) + dtype = cudf.dtype(dtype) if dtype.kind == "f": res = np.random.random(size=size).astype(dtype) if kwargs.get("positive_only", False): diff --git a/python/cudf/cudf/testing/dataset_generator.py b/python/cudf/cudf/testing/dataset_generator.py index 5e03068f818..4a475c52777 100644 --- a/python/cudf/cudf/testing/dataset_generator.py +++ b/python/cudf/cudf/testing/dataset_generator.py @@ -380,7 +380,7 @@ def rand_dataframe( ) ) else: - dtype = np.dtype(dtype) + dtype = cudf.dtype(dtype) if dtype.kind in ("i", "u"): column_params.append( ColumnParameters( @@ -428,7 +428,7 @@ def rand_dataframe( dtype=dtype, size=cardinality ), is_sorted=False, - dtype=np.dtype(dtype), + dtype=cudf.dtype(dtype), ) ) elif dtype.kind == "m": @@ -440,7 +440,7 @@ def rand_dataframe( dtype=dtype, size=cardinality ), is_sorted=False, - dtype=np.dtype(dtype), + dtype=cudf.dtype(dtype), ) ) elif dtype.kind == "b": @@ -450,7 +450,7 @@ def rand_dataframe( null_frequency=null_frequency, generator=boolean_generator(cardinality), is_sorted=False, - dtype=np.dtype(dtype), + dtype=cudf.dtype(dtype), ) ) else: @@ -538,7 +538,7 @@ def get_values_for_nested_data(dtype, lists_max_length): Returns list of values based on dtype. """ cardinality = np.random.randint(0, lists_max_length) - dtype = np.dtype(dtype) + dtype = cudf.dtype(dtype) if dtype.kind in ("i", "u"): values = int_generator(dtype=dtype, size=cardinality)() elif dtype.kind == "f": diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index 8277b8e7b32..c49b6d794d9 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -931,7 +931,7 @@ def test_ufunc_ops(lhs, rhs, ops): def dtype_scalar(val, dtype): if dtype == "str": return str(val) - dtype = np.dtype(dtype) + dtype = cudf.dtype(dtype) if dtype.type in {np.datetime64, np.timedelta64}: res, _ = np.datetime_data(dtype) return dtype.type(val, res) @@ -1695,13 +1695,13 @@ def test_binops_with_lhs_numpy_scalar(frame, dtype): ) if dtype == "datetime64[s]": - val = np.dtype(dtype).type(4, "s") + val = cudf.dtype(dtype).type(4, "s") elif dtype == "timedelta64[s]": - val = np.dtype(dtype).type(4, "s") + val = cudf.dtype(dtype).type(4, "s") elif dtype == "category": val = np.int64(4) else: - val = np.dtype(dtype).type(4) + val = cudf.dtype(dtype).type(4) expected = val == data.to_pandas() got = val == data @@ -2793,11 +2793,11 @@ def test_column_null_scalar_comparison(dtype, null_scalar, cmpop): # a new series where all the elements are . if isinstance(null_scalar, np.datetime64): - if np.dtype(dtype).kind not in "mM": + if cudf.dtype(dtype).kind not in "mM": pytest.skip() null_scalar = null_scalar.astype(dtype) - dtype = np.dtype(dtype) + dtype = cudf.dtype(dtype) data = [1, 2, 3, 4, 5] sr = cudf.Series(data, dtype=dtype) diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py index d8e10a62a12..51327038c39 100644 --- a/python/cudf/cudf/tests/test_categorical.py +++ b/python/cudf/cudf/tests/test_categorical.py @@ -799,7 +799,7 @@ def test_categorical_setitem_with_nan(): @pytest.mark.parametrize("dtype", list(NUMERIC_TYPES) + ["object"]) @pytest.mark.parametrize("input_obj", [[1, cudf.NA, 3]]) def test_series_construction_with_nulls(input_obj, dtype): - dtype = np.dtype(dtype) + dtype = cudf.dtype(dtype) input_obj = [ dtype.type(v) if v is not cudf.NA else cudf.NA for v in input_obj ] diff --git a/python/cudf/cudf/tests/test_contains.py b/python/cudf/cudf/tests/test_contains.py index b6650600261..f06142f4cc9 100644 --- a/python/cudf/cudf/tests/test_contains.py +++ b/python/cudf/cudf/tests/test_contains.py @@ -4,6 +4,7 @@ import pandas as pd import pytest +import cudf from cudf import Series from cudf.core.index import RangeIndex, as_index from cudf.testing._utils import ( @@ -82,7 +83,7 @@ def test_rangeindex_contains(): @pytest.mark.parametrize("dtype", NUMERIC_TYPES) def test_lists_contains(dtype): - dtype = np.dtype(dtype) + dtype = cudf.dtype(dtype) inner_data = np.array([1, 2, 3], dtype=dtype) data = Series([inner_data]) @@ -96,7 +97,7 @@ def test_lists_contains(dtype): @pytest.mark.parametrize("dtype", DATETIME_TYPES + TIMEDELTA_TYPES) def test_lists_contains_datetime(dtype): - dtype = np.dtype(dtype) + dtype = cudf.dtype(dtype) inner_data = np.array([1, 2, 3]) unit, _ = np.datetime_data(dtype) diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py index 7b56f864272..da5b85b4e37 100644 --- a/python/cudf/cudf/tests/test_joining.py +++ b/python/cudf/cudf/tests/test_joining.py @@ -810,7 +810,7 @@ def test_join_datetimes_index(dtype): pdf = pdf_lhs.join(pdf_rhs, sort=True) gdf = gdf_lhs.join(gdf_rhs, sort=True) - assert gdf["d"].dtype == np.dtype(dtype) + assert gdf["d"].dtype == cudf.dtype(dtype) assert_join_results_equal(pdf, gdf, how="inner") diff --git a/python/cudf/cudf/tests/test_label_encode.py b/python/cudf/cudf/tests/test_label_encode.py index 29a787768f2..f513aa7a134 100644 --- a/python/cudf/cudf/tests/test_label_encode.py +++ b/python/cudf/cudf/tests/test_label_encode.py @@ -6,6 +6,7 @@ import numpy as np import pytest +import cudf from cudf.core import DataFrame, Series @@ -18,7 +19,7 @@ def _random_int(nelem, dtype): def _random(nelem, dtype): - dtype = np.dtype(dtype) + dtype = cudf.dtype(dtype) if dtype.kind in {"i", "u"}: return _random_int(nelem, dtype) elif dtype.kind == "f": diff --git a/python/cudf/cudf/tests/test_numerical.py b/python/cudf/cudf/tests/test_numerical.py index 7a766a49a62..2e1ce5cddfc 100644 --- a/python/cudf/cudf/tests/test_numerical.py +++ b/python/cudf/cudf/tests/test_numerical.py @@ -390,7 +390,7 @@ def test_to_numeric_error(data, errors): @pytest.mark.parametrize("dtype", NUMERIC_TYPES) @pytest.mark.parametrize("input_obj", [[1, cudf.NA, 3]]) def test_series_construction_with_nulls(dtype, input_obj): - dtype = np.dtype(dtype) + dtype = cudf.dtype(dtype) # numpy case expect = pd.Series(input_obj, dtype=cudf_dtypes_to_pandas_dtypes[dtype]) diff --git a/python/cudf/cudf/tests/test_reductions.py b/python/cudf/cudf/tests/test_reductions.py index 7cbc56f943c..b7bbefb8c58 100644 --- a/python/cudf/cudf/tests/test_reductions.py +++ b/python/cudf/cudf/tests/test_reductions.py @@ -25,7 +25,7 @@ @pytest.mark.parametrize("dtype,nelem", params) def test_sum(dtype, nelem): - dtype = np.dtype(dtype).type + dtype = cudf.dtype(dtype).type data = gen_rand(dtype, nelem) sr = Series(data) @@ -69,8 +69,8 @@ def test_sum_decimal(dtype, nelem): @pytest.mark.parametrize("dtype,nelem", params) def test_product(dtype, nelem): np.random.seed(0) - dtype = np.dtype(dtype).type - if np.dtype(dtype).kind in {"u", "i"}: + dtype = cudf.dtype(dtype).type + if cudf.dtype(dtype).kind in {"u", "i"}: data = np.ones(nelem, dtype=dtype) # Set at most 30 items to [0..2) to keep the value within 2^32 for _ in range(30): @@ -107,7 +107,7 @@ def test_product_decimal(dtype): @pytest.mark.parametrize("dtype,nelem", params) def test_sum_of_squares(dtype, nelem): - dtype = np.dtype(dtype).type + dtype = cudf.dtype(dtype).type data = gen_rand(dtype, nelem) sr = Series(data) @@ -115,7 +115,7 @@ def test_sum_of_squares(dtype, nelem): # got = dtype(got) expect = (data ** 2).sum() - if np.dtype(dtype).kind in {"u", "i"}: + if cudf.dtype(dtype).kind in {"u", "i"}: if 0 <= expect <= np.iinfo(dtype).max: np.testing.assert_array_almost_equal(expect, got) else: @@ -141,7 +141,7 @@ def test_sum_of_squares_decimal(dtype): @pytest.mark.parametrize("dtype,nelem", params) def test_min(dtype, nelem): - dtype = np.dtype(dtype).type + dtype = cudf.dtype(dtype).type data = gen_rand(dtype, nelem) sr = Series(data) @@ -167,7 +167,7 @@ def test_min_decimal(dtype, nelem): @pytest.mark.parametrize("dtype,nelem", params) def test_max(dtype, nelem): - dtype = np.dtype(dtype).type + dtype = cudf.dtype(dtype).type data = gen_rand(dtype, nelem) sr = Series(data) diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index 4906349ecba..3ef0e2edaed 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -24,11 +24,11 @@ def test_null_series(nrows, dtype): data = cudf.Series(np.random.randint(1, 9, size)) column = data.set_mask(mask) sr = cudf.Series(column).astype(dtype) - if dtype != "category" and np.dtype(dtype).kind in {"u", "i"}: + if dtype != "category" and cudf.dtype(dtype).kind in {"u", "i"}: ps = pd.Series( sr._column.data_array_view.copy_to_host(), dtype=cudf_dtypes_to_pandas_dtypes.get( - np.dtype(dtype), np.dtype(dtype) + cudf.dtype(dtype), cudf.dtype(dtype) ), ) ps[sr.isnull().to_pandas()] = pd.NA diff --git a/python/cudf/cudf/tests/test_scalar.py b/python/cudf/cudf/tests/test_scalar.py index 605005f41fc..a9919900256 100644 --- a/python/cudf/cudf/tests/test_scalar.py +++ b/python/cudf/cudf/tests/test_scalar.py @@ -198,7 +198,7 @@ def test_null_scalar(dtype): s = cudf.Scalar(None, dtype=dtype) assert s.value is cudf.NA assert s.dtype == ( - np.dtype(dtype) + cudf.dtype(dtype) if not isinstance(dtype, cudf.Decimal64Dtype) else dtype ) @@ -239,7 +239,7 @@ def test_generic_null_scalar_construction_fails(value): def test_scalar_dtype_and_validity(dtype): s = cudf.Scalar(1, dtype=dtype) - assert s.dtype == np.dtype(dtype) + assert s.dtype == cudf.dtype(dtype) assert s.is_valid() is True diff --git a/python/cudf/cudf/tests/test_udf_binops.py b/python/cudf/cudf/tests/test_udf_binops.py index df7361ab183..5a5aca615ba 100644 --- a/python/cudf/cudf/tests/test_udf_binops.py +++ b/python/cudf/cudf/tests/test_udf_binops.py @@ -6,6 +6,7 @@ from numba.cuda import compile_ptx from numba.np import numpy_support +import cudf from cudf import _lib as libcudf from cudf.core import Series from cudf.utils import dtypes as dtypeutils @@ -27,7 +28,7 @@ def test_generic_ptx(dtype): def generic_function(a, b): return a ** 3 + b - nb_type = numpy_support.from_dtype(np.dtype(dtype)) + nb_type = numpy_support.from_dtype(cudf.dtype(dtype)) type_signature = (nb_type, nb_type) ptx_code, output_type = compile_ptx( diff --git a/python/cudf/cudf/tests/test_unaops.py b/python/cudf/cudf/tests/test_unaops.py index 2089f764724..c549dd2712b 100644 --- a/python/cudf/cudf/tests/test_unaops.py +++ b/python/cudf/cudf/tests/test_unaops.py @@ -35,7 +35,7 @@ def test_series_invert(dtype): def test_series_not(dtype): import pandas as pd - dtype = np.dtype(dtype).type + dtype = cudf.dtype(dtype).type arr = pd.Series(np.random.choice([True, False], 1000)).astype(dtype) if dtype is not np.bool_: arr = arr * (np.random.random(1000) * 100).astype(dtype) @@ -134,7 +134,7 @@ def generate_valid_scalar_unaop_combos(): @pytest.mark.parametrize("slr,dtype,op", generate_valid_scalar_unaop_combos()) def test_scalar_unary_operations(slr, dtype, op): - slr_host = np.dtype(dtype).type(slr) + slr_host = cudf.dtype(dtype).type(slr) slr_device = cudf.Scalar(slr, dtype=dtype) expect = op(slr_host) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index e1ae87e5089..db7de4441ec 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -140,7 +140,7 @@ def np_to_pa_dtype(dtype): return pa.duration(time_unit) # default fallback unit is ns return pa.duration("ns") - return _np_pa_dtypes[np.dtype(dtype).type] + return _np_pa_dtypes[cudf.dtype(dtype).type] def get_numeric_type_info(dtype): @@ -202,7 +202,7 @@ def cudf_dtype_to_pa_type(dtype): ): return dtype.to_arrow() else: - return np_to_pa_dtype(np.dtype(dtype)) + return np_to_pa_dtype(cudf.dtype(dtype)) def cudf_dtype_from_pa_type(typ): @@ -404,7 +404,7 @@ def check_cast_unsupported_dtype(dtype): if isinstance(dtype, pd.core.arrays.numpy_.PandasDtype): dtype = dtype.numpy_dtype else: - dtype = np.dtype(dtype) + dtype = cudf.dtype(dtype) if dtype in cudf._lib.types.np_to_cudf_types: return dtype @@ -438,7 +438,7 @@ def get_time_unit(obj): def _get_nan_for_dtype(dtype): - dtype = np.dtype(dtype) + dtype = cudf.dtype(dtype) if pd.api.types.is_datetime64_dtype( dtype ) or pd.api.types.is_timedelta64_dtype(dtype): diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index 209f61ad399..5804c794f97 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -70,7 +70,7 @@ def scalar_broadcast_to(scalar, size, dtype=None): scalar = to_cudf_compatible_scalar(scalar, dtype=dtype) dtype = scalar.dtype - if np.dtype(dtype).kind in ("O", "U"): + if cudf.dtype(dtype).kind in ("O", "U"): gather_map = column.full(size, 0, dtype="int32") scalar_str_col = column.as_column([scalar], dtype="str") return scalar_str_col[gather_map] From 5e50f522b3d8b235b5a6d4362148b0bbb6dd94d5 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 4 Aug 2021 09:29:05 -0400 Subject: [PATCH 02/20] First stab at cudf.dtype --- python/cudf/cudf/api/types.py | 19 +++++++++++++++++-- python/cudf/cudf/core/dtypes.py | 6 ++++++ python/cudf/cudf/tests/test_dtypes.py | 21 +++++++++++++++++++++ 3 files changed, 44 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py index a1237e34366..fc999ae422b 100644 --- a/python/cudf/cudf/api/types.py +++ b/python/cudf/cudf/api/types.py @@ -29,12 +29,27 @@ def dtype(arbitrary): try: - return np.dtype(arbitrary) + np_dtype = np.dtype(arbitrary) + if np_dtype.name == "float16": + np_dtype = np.dtype("float32") + elif np_dtype.name in ("object", "str"): + np_dtype = np.dtype("object") + return np_dtype except TypeError: pass if isinstance(arbitrary, cudf.core.dtypes._BaseDtype): return arbitrary - return pd.api.types.pandas_type(arbitrary) + elif isinstance(arbitrary, pd.CategoricalDtype): + return cudf.CategoricalDtype.from_pandas(arbitrary) + elif isinstance(arbitrary, pd.IntervalDtype): + return cudf.IntervalDtype.from_pandas(arbitrary) + pd_dtype = pd.api.types.pandas_dtype(arbitrary) + try: + return pd_dtype.numpy_dtype + except AttributeError: + # no NumPy type corresponding to this type + # always object? + return np.dtype("object") def is_numeric_dtype(obj): diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 6dbe55d0bb8..4062b734bb3 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -559,6 +559,12 @@ def to_arrow(self): pa.from_numpy_dtype(self.subtype), self.closed ) + @classmethod + def from_pandas(cls, pd_dtype: pd.IntervalDtype) -> "IntervalDtype": + return cls( + subtype=pd_dtype.subtype + ) # TODO: needs `closed` when we upgrade Pandas + def is_categorical_dtype(obj): """Check whether an array-like or dtype is of the Categorical dtype. diff --git a/python/cudf/cudf/tests/test_dtypes.py b/python/cudf/cudf/tests/test_dtypes.py index 41d7f5d215e..d21f67d1def 100644 --- a/python/cudf/cudf/tests/test_dtypes.py +++ b/python/cudf/cudf/tests/test_dtypes.py @@ -257,3 +257,24 @@ def test_lists_of_structs_dtype(data): assert_column_array_dtype_equal(got._column, expected) assert expected.equals(got._column.to_arrow()) + + +@pytest.mark.parametrize( + "in_dtype,expect", + [ + (np.dtype("int8"), np.dtype("int8")), + (np.int8, np.dtype("int8")), + (np.float16, np.dtype("float32")), + (pd.Int8Dtype(), np.dtype("int8")), + (pd.StringDtype(), np.dtype("object")), + ("int8", np.dtype("int8")), + ("boolean", np.dtype("bool")), + (int, np.dtype("int64")), + (float, np.dtype("float64")), + (cudf.ListDtype("int64"), cudf.ListDtype("int64")), + ("float16", np.dtype("float32")), + (np.dtype("U"), np.dtype("object")), + ], +) +def test_dtype(in_dtype, expect): + assert_eq(cudf.dtype(in_dtype), expect) From 367b743167a29a8841eba42ddacf6ee0476b0d44 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 4 Aug 2021 10:12:04 -0400 Subject: [PATCH 03/20] Handle datetimes/timedeltas in cudf.dtype --- python/cudf/cudf/api/types.py | 4 ++++ python/cudf/cudf/tests/test_dtypes.py | 10 ++++++++++ 2 files changed, 14 insertions(+) diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py index fc999ae422b..f1d87a6761b 100644 --- a/python/cudf/cudf/api/types.py +++ b/python/cudf/cudf/api/types.py @@ -34,6 +34,10 @@ def dtype(arbitrary): np_dtype = np.dtype("float32") elif np_dtype.name in ("object", "str"): np_dtype = np.dtype("object") + elif np_dtype.str == " Date: Wed, 4 Aug 2021 12:02:49 -0400 Subject: [PATCH 04/20] Fix test --- python/cudf/cudf/tests/test_binops.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index c49b6d794d9..d8761057683 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -1700,6 +1700,8 @@ def test_binops_with_lhs_numpy_scalar(frame, dtype): val = cudf.dtype(dtype).type(4, "s") elif dtype == "category": val = np.int64(4) + elif dtype == "str": + val = str(4) else: val = cudf.dtype(dtype).type(4) From 85351e99bfd54c8dec2efbe4ef65fd6372ff1bc8 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 5 Aug 2021 12:35:30 -0400 Subject: [PATCH 05/20] Handle disallowed numpy types --- python/cudf/cudf/api/types.py | 7 +++++-- python/cudf/cudf/tests/test_dtypes.py | 8 ++++++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py index f1d87a6761b..8547ec0310c 100644 --- a/python/cudf/cudf/api/types.py +++ b/python/cudf/cudf/api/types.py @@ -32,15 +32,18 @@ def dtype(arbitrary): np_dtype = np.dtype(arbitrary) if np_dtype.name == "float16": np_dtype = np.dtype("float32") - elif np_dtype.name in ("object", "str"): + elif np_dtype.kind in ("OU"): np_dtype = np.dtype("object") elif np_dtype.str == " Date: Thu, 5 Aug 2021 14:09:05 -0400 Subject: [PATCH 06/20] Update python/cudf/cudf/tests/test_dtypes.py Co-authored-by: GALI PREM SAGAR --- python/cudf/cudf/tests/test_dtypes.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/cudf/cudf/tests/test_dtypes.py b/python/cudf/cudf/tests/test_dtypes.py index ad67327a105..36c05e40261 100644 --- a/python/cudf/cudf/tests/test_dtypes.py +++ b/python/cudf/cudf/tests/test_dtypes.py @@ -269,6 +269,8 @@ def test_lists_of_structs_dtype(data): (pd.StringDtype(), np.dtype("object")), ("int8", np.dtype("int8")), ("boolean", np.dtype("bool")), + ("bool_", np.dtype("bool")), + (np.bool_, np.dtype("bool")), (int, np.dtype("int64")), (float, np.dtype("float64")), (cudf.ListDtype("int64"), cudf.ListDtype("int64")), From a10eae01af842740ca88775650817deed095b9d1 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Fri, 6 Aug 2021 13:50:26 -0400 Subject: [PATCH 07/20] Some fixes --- python/cudf/cudf/api/types.py | 39 +++++++++++++------ python/cudf/cudf/core/column/column.py | 20 +++++----- python/cudf/cudf/testing/dataset_generator.py | 12 +++--- 3 files changed, 44 insertions(+), 27 deletions(-) diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py index 8547ec0310c..bbcbfcbbe17 100644 --- a/python/cudf/cudf/api/types.py +++ b/python/cudf/cudf/api/types.py @@ -28,35 +28,52 @@ def dtype(arbitrary): + """ + Returns the cuDF-supported dtype corresponding to `arbitrary` + + Inputs + ------ + arbitrary: dtype or scalar-like + + Returns + ------- + dtype: the cuDF-supported dtype that best matches `arbitrary` + """ + # first, try interpreting arbitrary as a NumPy dtype that we support: try: np_dtype = np.dtype(arbitrary) if np_dtype.name == "float16": np_dtype = np.dtype("float32") elif np_dtype.kind in ("OU"): np_dtype = np.dtype("object") - elif np_dtype.str == " ColumnBase: return self.as_numerical_column(dtype, **kwargs) elif is_categorical_dtype(dtype): return self.as_categorical_column(dtype, **kwargs) - elif pandas_dtype(dtype).type in { + elif cudf.dtype(dtype).type in { np.str_, np.object_, str, @@ -1299,7 +1298,7 @@ def column_empty( ) -> ColumnBase: """Allocate a new column like the given row_count and dtype. """ - dtype = pandas_dtype(dtype) + dtype = cudf.dtype(dtype) children = () # type: Tuple[ColumnBase, ...] if is_struct_dtype(dtype): @@ -1364,7 +1363,7 @@ def build_column( offset : int, optional children : tuple, optional """ - dtype = pandas_dtype(dtype) + dtype = cudf.dtype(dtype) if _is_non_decimal_numeric_dtype(dtype): assert data is not None @@ -1769,9 +1768,9 @@ def as_column( col = ColumnBase.from_arrow(arbitrary) if isinstance(arbitrary, pa.NullArray): if type(dtype) == str and dtype == "empty": - new_dtype = pandas_dtype(arbitrary.type.to_pandas_dtype()) + new_dtype = np.dtype(arbitrary.type.to_pandas_dtype()) else: - new_dtype = pandas_dtype(dtype) + new_dtype = np.dtype(dtype) col = col.astype(new_dtype) return col @@ -1865,7 +1864,7 @@ def as_column( arbitrary = np.ascontiguousarray(arbitrary) if dtype is not None: - arbitrary = arbitrary.astype(dtype) + arbitrary = arbitrary.astype(np.dtype(dtype)) if arb_dtype.kind == "M": @@ -2034,12 +2033,11 @@ def as_column( return cudf.core.column.Decimal32Column.from_arrow( data ) - dtype = pd.api.types.pandas_dtype(dtype) - np_type = cudf.dtype(dtype).type + np_type = np.dtype(dtype).type if np_type == np.bool_: pa_type = pa.bool_() else: - pa_type = np_to_pa_dtype(cudf.dtype(dtype)) + pa_type = np_to_pa_dtype(np.dtype(dtype)) data = as_column( pa.array( arbitrary, @@ -2280,7 +2278,7 @@ def full(size: int, fill_value: ScalarLike, dtype: Dtype = None) -> ColumnBase: def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: """Concatenate a sequence of columns.""" if len(objs) == 0: - dtype = pandas_dtype(None) + dtype = cudf.dtype(None) return column_empty(0, dtype=dtype, masked=True) # If all columns are `NumericalColumn` with different dtypes, diff --git a/python/cudf/cudf/testing/dataset_generator.py b/python/cudf/cudf/testing/dataset_generator.py index 4a475c52777..cdea22a05af 100644 --- a/python/cudf/cudf/testing/dataset_generator.py +++ b/python/cudf/cudf/testing/dataset_generator.py @@ -18,6 +18,7 @@ from pyarrow import parquet as pq import cudf +from cudf.utils.dtypes import np_to_pa_dtype class ColumnParameters: @@ -94,6 +95,7 @@ def _write(tbl, path, format): def _generate_column(column_params, num_rows): # If cardinality is specified, we create a set to sample from. # Otherwise, we simply use the given generator to generate each value. + if column_params.cardinality is not None: # Construct set of values to sample from where # set size = cardinality @@ -127,7 +129,7 @@ def _generate_column(column_params, num_rows): if hasattr(column_params.dtype, "to_arrow"): arrow_type = column_params.dtype.to_arrow() elif column_params.dtype is not None: - arrow_type = pa.from_numpy_dtype(column_params.dtype) + arrow_type = np_to_pa_dtype(cudf.dtype(column_params.dtype)) else: arrow_type = None @@ -227,15 +229,15 @@ def get_dataframe(parameters, use_threads): ): arrow_type = pa.dictionary( index_type=pa.int64(), - value_type=pa.from_numpy_dtype( - type(next(iter(column_params.generator))) + value_type=np_to_pa_dtype( + cudf.dtype(type(next(iter(column_params.generator)))) ), ) elif hasattr(column_params.dtype, "to_arrow"): arrow_type = column_params.dtype.to_arrow() else: - arrow_type = pa.from_numpy_dtype( - type(next(iter(column_params.generator))) + arrow_type = np_to_pa_dtype( + cudf.dtype(type(next(iter(column_params.generator)))) if column_params.dtype is None else column_params.dtype ) From 89ac918ce902aee3cb90e4772791130ae2b7e03e Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 9 Aug 2021 10:00:35 -0400 Subject: [PATCH 08/20] Remaining failures --- python/cudf/cudf/testing/_utils.py | 2 +- python/cudf/cudf/tests/test_dtypes.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py index 68914c9b0e2..b101835e626 100644 --- a/python/cudf/cudf/testing/_utils.py +++ b/python/cudf/cudf/testing/_utils.py @@ -284,7 +284,7 @@ def gen_rand(dtype, size, **kwargs): return pd.to_datetime( np.random.randint(low=low, high=high, size=size), unit=time_unit ) - elif dtype.kind == "U": + elif dtype.kind in ("O", "U"): return pd.util.testing.rands_array(10, size) raise NotImplementedError(f"dtype.kind={dtype.kind}") diff --git a/python/cudf/cudf/tests/test_dtypes.py b/python/cudf/cudf/tests/test_dtypes.py index ad67327a105..98a35312527 100644 --- a/python/cudf/cudf/tests/test_dtypes.py +++ b/python/cudf/cudf/tests/test_dtypes.py @@ -274,12 +274,12 @@ def test_lists_of_structs_dtype(data): (cudf.ListDtype("int64"), cudf.ListDtype("int64")), ("float16", np.dtype("float32")), (np.dtype("U"), np.dtype("object")), - ("timedelta64", np.dtype(" Date: Mon, 9 Aug 2021 17:38:51 -0400 Subject: [PATCH 09/20] Style --- python/cudf/cudf/api/types.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py index bbcbfcbbe17..5d495f3e21d 100644 --- a/python/cudf/cudf/api/types.py +++ b/python/cudf/cudf/api/types.py @@ -29,7 +29,7 @@ def dtype(arbitrary): """ - Returns the cuDF-supported dtype corresponding to `arbitrary` + Return the cuDF-supported dtype corresponding to `arbitrary`. Inputs ------ From a62ab3214f8dfe6f1d068eaf24dcbf446d87e73d Mon Sep 17 00:00:00 2001 From: Ashwin Srinath <3190405+shwina@users.noreply.github.com> Date: Mon, 9 Aug 2021 18:55:47 -0400 Subject: [PATCH 10/20] Update python/cudf/cudf/api/types.py Co-authored-by: Vyas Ramasubramani --- python/cudf/cudf/api/types.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py index 5d495f3e21d..e1a04d5ea81 100644 --- a/python/cudf/cudf/api/types.py +++ b/python/cudf/cudf/api/types.py @@ -31,8 +31,8 @@ def dtype(arbitrary): """ Return the cuDF-supported dtype corresponding to `arbitrary`. - Inputs - ------ + Parameters + ------------ arbitrary: dtype or scalar-like Returns From f79e59f12e9f2f82f0a95f74da27b1b1a4364cc3 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 10 Aug 2021 15:30:06 -0400 Subject: [PATCH 11/20] cudf.dtype -> np.dtype --- python/cudf/cudf/__init__.py | 19 ++-- python/cudf/cudf/_fuzz_testing/utils.py | 54 +++++------ python/cudf/cudf/_lib/__init__.py | 1 + python/cudf/cudf/_lib/aggregation.pyx | 6 +- python/cudf/cudf/_lib/binaryop.pyx | 3 +- python/cudf/cudf/_lib/copying.pyx | 2 +- python/cudf/cudf/_lib/orc.pyx | 2 +- python/cudf/cudf/_lib/parquet.pyx | 2 +- python/cudf/cudf/_lib/scalar.pyx | 4 +- python/cudf/cudf/_lib/string_casting.pyx | 24 ++--- python/cudf/cudf/_lib/transform.pyx | 7 +- python/cudf/cudf/_lib/types.pyx | 25 +++--- python/cudf/cudf/api/types.py | 53 +---------- python/cudf/cudf/comm/gpuarrow.py | 3 +- python/cudf/cudf/core/__init__.py | 30 ------- python/cudf/cudf/core/_internals/__init__.py | 2 - python/cudf/cudf/core/buffer.py | 5 +- python/cudf/cudf/core/column/column.py | 49 +++++----- python/cudf/cudf/core/column/datetime.py | 12 +-- python/cudf/cudf/core/column/numerical.py | 28 +++--- python/cudf/cudf/core/column/string.py | 66 +++++++------- python/cudf/cudf/core/column/timedelta.py | 14 +-- python/cudf/cudf/core/cut.py | 3 +- python/cudf/cudf/core/dataframe.py | 17 ++-- python/cudf/cudf/core/dtypes.py | 51 ++++++++++- python/cudf/cudf/core/frame.py | 7 +- python/cudf/cudf/core/index.py | 6 +- python/cudf/cudf/core/scalar.py | 21 +++-- python/cudf/cudf/core/tools/datetimes.py | 2 +- python/cudf/cudf/core/tools/numeric.py | 14 +-- python/cudf/cudf/tests/test_binops.py | 2 +- python/cudf/cudf/tests/test_column.py | 2 +- python/cudf/cudf/tests/test_copying.py | 2 +- python/cudf/cudf/tests/test_datetime.py | 2 +- python/cudf/cudf/tests/test_factorize.py | 2 +- python/cudf/cudf/tests/test_groupby.py | 2 +- python/cudf/cudf/tests/test_label_encode.py | 2 +- python/cudf/cudf/tests/test_monotonic.py | 2 +- python/cudf/cudf/tests/test_numpy_interop.py | 2 +- python/cudf/cudf/tests/test_onehot.py | 4 +- python/cudf/cudf/tests/test_pack.py | 2 +- python/cudf/cudf/tests/test_pandas_interop.py | 2 +- python/cudf/cudf/tests/test_pickling.py | 2 +- python/cudf/cudf/tests/test_query.py | 2 +- python/cudf/cudf/tests/test_rank.py | 2 +- python/cudf/cudf/tests/test_reductions.py | 2 +- python/cudf/cudf/tests/test_sorting.py | 2 +- python/cudf/cudf/tests/test_sparse_df.py | 2 +- python/cudf/cudf/tests/test_transform.py | 2 +- python/cudf/cudf/tests/test_udf_binops.py | 3 +- python/cudf/cudf/tests/test_unaops.py | 2 +- python/cudf/cudf/utils/dtypes.py | 90 ++++++++----------- python/cudf/cudf/utils/utils.py | 2 +- python/dask_cudf/dask_cudf/backends.py | 4 +- 54 files changed, 318 insertions(+), 355 deletions(-) diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index 23621f1e315..55145aaa166 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -8,6 +8,7 @@ import rmm +from cudf.api.types import dtype from cudf import core, datasets, testing from cudf._version import get_versions from cudf.api.extensions import ( @@ -15,35 +16,35 @@ register_index_accessor, register_series_accessor, ) -from cudf.api.types import dtype -from cudf.core import ( +from cudf.core.scalar import ( NA, + Scalar, +) +from cudf.core.index import ( BaseIndex, CategoricalIndex, - DataFrame, DatetimeIndex, Float32Index, Float64Index, Index, + GenericIndex, Int8Index, Int16Index, Int32Index, Int64Index, IntervalIndex, - MultiIndex, RangeIndex, - Scalar, - Series, TimedeltaIndex, UInt8Index, UInt16Index, UInt32Index, UInt64Index, - cut, - from_pandas, interval_range, - merge, ) +from cudf.core.dataframe import DataFrame, from_pandas, merge +from cudf.core.series import Series +from cudf.core.multiindex import MultiIndex +from cudf.core.cut import cut from cudf.core.algorithms import factorize from cudf.core.dtypes import ( CategoricalDtype, diff --git a/python/cudf/cudf/_fuzz_testing/utils.py b/python/cudf/cudf/_fuzz_testing/utils.py index fe9ed4d4934..83ab02351f2 100644 --- a/python/cudf/cudf/_fuzz_testing/utils.py +++ b/python/cudf/cudf/_fuzz_testing/utils.py @@ -18,44 +18,44 @@ ALL_POSSIBLE_VALUES = "ALL_POSSIBLE_VALUES" _PANDAS_TO_AVRO_SCHEMA_MAP = { - np.dtype("int8"): "int", + cudf.dtype("int8"): "int", pd.Int8Dtype(): ["int", "null"], pd.Int16Dtype(): ["int", "null"], pd.Int32Dtype(): ["int", "null"], pd.Int64Dtype(): ["long", "null"], pd.BooleanDtype(): ["boolean", "null"], pd.StringDtype(): ["string", "null"], - np.dtype("bool_"): "boolean", - np.dtype("int16"): "int", - np.dtype("int32"): "int", - np.dtype("int64"): "long", - np.dtype("O"): "string", - np.dtype("str"): "string", - np.dtype("float32"): "float", - np.dtype("float64"): "double", - np.dtype(" ( ( - np_to_cudf_types[np.dtype(dtype)] + np_to_cudf_types[cudf.dtype(dtype)] ) ) ) diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index fb58bf96098..a5789e4d0ae 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -787,7 +787,7 @@ cdef class _CPackedColumns: """ Construct a ``PackedColumns`` object from a ``cudf.DataFrame``. """ - from cudf.core import RangeIndex, dtypes + from cudf import RangeIndex, dtypes cdef _CPackedColumns p = _CPackedColumns.__new__(_CPackedColumns) diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx index e15b569ed85..ef392b164a0 100644 --- a/python/cudf/cudf/_lib/orc.pyx +++ b/python/cudf/cudf/_lib/orc.pyx @@ -97,7 +97,7 @@ cpdef read_orc(object filepaths_or_buffers, if timestamp_type is None else ( ( - np_to_cudf_types[np.dtype(timestamp_type)] + np_to_cudf_types[cudf.dtype(timestamp_type)] ) ) ), diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index 471aa3107d9..95ae2202f68 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -199,7 +199,7 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, meta_dtype = cols_dtype_map.get(col, None) df._data[col] = cudf.core.column.column_empty( row_count=0, - dtype=np.dtype(meta_dtype) + dtype=cudf.dtype(meta_dtype) ) # Set the index column diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index cf1d577bd8f..95fa5d4d20d 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -81,7 +81,7 @@ cdef class DeviceScalar: dtype : dtype A NumPy dtype. """ - self._dtype = dtype if dtype.kind != 'U' else np.dtype('object') + self._dtype = dtype if dtype.kind != 'U' else cudf.dtype('object') self._set_value(value, self._dtype) def _set_value(self, value, dtype): @@ -560,7 +560,7 @@ def _is_null_host_scalar(slr): def _create_proxy_nat_scalar(dtype): cdef DeviceScalar result = DeviceScalar.__new__(DeviceScalar) - dtype = np.dtype(dtype) + dtype = cudf.dtype(dtype) if dtype.char in 'mM': nat = dtype.type('NaT').astype(dtype) if dtype.type == np.datetime64: diff --git a/python/cudf/cudf/_lib/string_casting.pyx b/python/cudf/cudf/_lib/string_casting.pyx index 8f65cc9fee5..8d7e307c5fb 100644 --- a/python/cudf/cudf/_lib/string_casting.pyx +++ b/python/cudf/cudf/_lib/string_casting.pyx @@ -56,6 +56,8 @@ from cudf._lib.cpp.strings.convert.convert_urls cimport ( ) from cudf._lib.cpp.types cimport data_type, type_id +import cudf + def floating_to_string(Column input_col): cdef column_view input_column_view = input_col.view() @@ -115,7 +117,7 @@ def stod(Column input_col, **kwargs): A Column with strings cast to double """ - return string_to_floating(input_col, np.dtype("float64")) + return string_to_floating(input_col, cudf.dtype("float64")) def ftos(Column input_col): @@ -147,7 +149,7 @@ def stof(Column input_col, **kwargs): A Column with strings cast to float """ - return string_to_floating(input_col, np.dtype("float32")) + return string_to_floating(input_col, cudf.dtype("float32")) def integer_to_string(Column input_col): @@ -208,7 +210,7 @@ def stoi8(Column input_col, **kwargs): A Column with strings cast to int8 """ - return string_to_integer(input_col, np.dtype("int8")) + return string_to_integer(input_col, cudf.dtype("int8")) def i16tos(Column input_col): @@ -240,7 +242,7 @@ def stoi16(Column input_col): A Column with strings cast to int16 """ - return string_to_integer(input_col, np.dtype("int16")) + return string_to_integer(input_col, cudf.dtype("int16")) def itos(Column input_col): @@ -272,7 +274,7 @@ def stoi(Column input_col): A Column with strings cast to int32 """ - return string_to_integer(input_col, np.dtype("int32")) + return string_to_integer(input_col, cudf.dtype("int32")) def ltos(Column input_col): @@ -304,7 +306,7 @@ def stol(Column input_col, **kwargs): A Column with strings cast to int64 """ - return string_to_integer(input_col, np.dtype("int64")) + return string_to_integer(input_col, cudf.dtype("int64")) def ui8tos(Column input_col): @@ -336,7 +338,7 @@ def stoui8(Column input_col, **kwargs): A Column with strings cast to uint8 """ - return string_to_integer(input_col, np.dtype("uint8")) + return string_to_integer(input_col, cudf.dtype("uint8")) def ui16tos(Column input_col): @@ -368,7 +370,7 @@ def stoui16(Column input_col, **kwargs): A Column with strings cast to uint16 """ - return string_to_integer(input_col, np.dtype("uint16")) + return string_to_integer(input_col, cudf.dtype("uint16")) def uitos(Column input_col): @@ -400,7 +402,7 @@ def stoui(Column input_col, **kwargs): A Column with strings cast to uint32 """ - return string_to_integer(input_col, np.dtype("uint32")) + return string_to_integer(input_col, cudf.dtype("uint32")) def ultos(Column input_col): @@ -432,7 +434,7 @@ def stoul(Column input_col, **kwargs): A Column with strings cast to uint64 """ - return string_to_integer(input_col, np.dtype("uint64")) + return string_to_integer(input_col, cudf.dtype("uint64")) def _to_booleans(Column input_col, object string_true="True"): @@ -745,7 +747,7 @@ def htoi(Column input_col, **kwargs): cdef column_view input_column_view = input_col.view() cdef type_id tid = ( ( - np_to_cudf_types[kwargs.get('dtype', np.dtype("int64"))] + np_to_cudf_types[kwargs.get('dtype', cudf.dtype("int64"))] ) ) cdef data_type c_out_type = data_type(tid) diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx index 63abdb8314c..67fc1c441b0 100644 --- a/python/cudf/cudf/_lib/transform.pyx +++ b/python/cudf/cudf/_lib/transform.pyx @@ -58,8 +58,9 @@ def mask_to_bools(object mask_buffer, size_type begin_bit, size_type end_bit): Given a mask buffer, returns a boolean column representng bit 0 -> False and 1 -> True within range of [begin_bit, end_bit), """ - if not isinstance(mask_buffer, cudf.core.Buffer): - raise TypeError("mask_buffer is not an instance of cudf.core.Buffer") + if not isinstance(mask_buffer, cudf.core.buffer.Buffer): + raise TypeError("mask_buffer is not an instance of " + "cudf.core.buffer. Buffer") cdef bitmask_type* bit_mask = (mask_buffer.ptr) cdef unique_ptr[column] result @@ -98,7 +99,7 @@ def transform(Column input, op): nb_signature = (nb_type,) compiled_op = cudautils.compile_udf(op, nb_signature) c_str = compiled_op[0].encode('UTF-8') - np_dtype = np.dtype(compiled_op[1]) + np_dtype = cudf.dtype(compiled_op[1]) try: c_tid = ( diff --git a/python/cudf/cudf/_lib/types.pyx b/python/cudf/cudf/_lib/types.pyx index d93e1b75376..d3a4c45f213 100644 --- a/python/cudf/cudf/_lib/types.pyx +++ b/python/cudf/cudf/_lib/types.pyx @@ -30,6 +30,7 @@ from cudf.utils.dtypes import ( ) cimport cudf._lib.cpp.types as libcudf_types +import cudf class TypeId(IntEnum): @@ -188,11 +189,11 @@ cdef dtype_from_lists_column_view(column_view cv): cdef column_view child = lv.get()[0].child() if child.type().id() == libcudf_types.type_id.LIST: - return ListDtype(dtype_from_lists_column_view(child)) + return cudf.ListDtype(dtype_from_lists_column_view(child)) elif child.type().id() == libcudf_types.type_id.EMPTY: - return ListDtype(np.dtype("int8")) + return cudf.ListDtype("int8") else: - return ListDtype( + return cudf.ListDtype( dtype_from_column_view(child) ) @@ -201,7 +202,7 @@ cdef dtype_from_structs_column_view(column_view cv): str(i): dtype_from_column_view(cv.child(i)) for i in range(cv.num_children()) } - return StructDtype(fields) + return cudf.StructDtype(fields) cdef dtype_from_column_view(column_view cv): cdef libcudf_types.type_id tid = cv.type().id() @@ -210,26 +211,26 @@ cdef dtype_from_column_view(column_view cv): elif tid == libcudf_types.type_id.STRUCT: return dtype_from_structs_column_view(cv) elif tid == libcudf_types.type_id.DECIMAL64: - return Decimal64Dtype( - precision=Decimal64Dtype.MAX_PRECISION, + return cudf.Decimal64Dtype( + precision=cudf.Decimal64Dtype.MAX_PRECISION, scale=-cv.type().scale() ) elif tid == libcudf_types.type_id.DECIMAL32: - return Decimal32Dtype( - precision=Decimal32Dtype.MAX_PRECISION, + return cudf.Decimal32Dtype( + precision=cudf.Decimal32Dtype.MAX_PRECISION, scale=-cv.type().scale() ) else: return cudf_to_np_types[(tid)] cdef libcudf_types.data_type dtype_to_data_type(dtype) except *: - if is_list_dtype(dtype): + if cudf.api.types.is_list_dtype(dtype): tid = libcudf_types.type_id.LIST - elif is_struct_dtype(dtype): + elif cudf.api.types.is_struct_dtype(dtype): tid = libcudf_types.type_id.STRUCT - elif is_decimal64_dtype(dtype): + elif cudf.api.types.is_decimal64_dtype(dtype): tid = libcudf_types.type_id.DECIMAL64 - elif is_decimal32_dtype(dtype): + elif cudf.api.types.is_decimal32_dtype(dtype): tid = libcudf_types.type_id.DECIMAL32 else: tid = ( diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py index 5d495f3e21d..bf296e11178 100644 --- a/python/cudf/cudf/api/types.py +++ b/python/cudf/cudf/api/types.py @@ -14,9 +14,9 @@ from pandas.api import types as pd_types import cudf -from cudf._lib.scalar import DeviceScalar from cudf.core.dtypes import ( # noqa: F401 _BaseDtype, + dtype, is_categorical_dtype, is_decimal32_dtype, is_decimal64_dtype, @@ -27,55 +27,6 @@ ) -def dtype(arbitrary): - """ - Return the cuDF-supported dtype corresponding to `arbitrary`. - - Inputs - ------ - arbitrary: dtype or scalar-like - - Returns - ------- - dtype: the cuDF-supported dtype that best matches `arbitrary` - """ - # first, try interpreting arbitrary as a NumPy dtype that we support: - try: - np_dtype = np.dtype(arbitrary) - if np_dtype.name == "float16": - np_dtype = np.dtype("float32") - elif np_dtype.kind in ("OU"): - np_dtype = np.dtype("object") - except TypeError: - pass - else: - if np_dtype.kind not in "biufUOMm": - raise TypeError(f"Unsupported type {np_dtype}") - return np_dtype - - # next, check if `arbitrary` is one of our extension types: - if isinstance(arbitrary, cudf.core.dtypes._BaseDtype): - return arbitrary - - # use `pandas_dtype` to try and interpret - # `arbitrary` as a Pandas extension type. - # Return the corresponding NumPy/cuDF type. - pd_dtype = pd.api.types.pandas_dtype(arbitrary) - try: - return pd_dtype.numpy_dtype - except AttributeError: - if isinstance(pd_dtype, pd.CategoricalDtype): - return cudf.CategoricalDtype.from_pandas(pd_dtype) - elif isinstance(pd_dtype, pd.StringDtype): - return np.dtype("object") - elif isinstance(pd_dtype, pd.IntervalDtype): - return cudf.IntervalDtype.from_pandas(pd_dtype) - else: - raise TypeError( - f"Cannot interpret {arbitrary} as a valid cuDF dtype" - ) - - def is_numeric_dtype(obj): """Check whether the provided array or dtype is of a numeric dtype. @@ -173,7 +124,7 @@ def is_scalar(val): Return True if given object is scalar. """ return ( - isinstance(val, DeviceScalar) + isinstance(val, cudf._lib.scalar.DeviceScalar) or isinstance(val, cudf.Scalar) or isinstance(val, cudf.core.tools.datetimes.DateOffset) or pd_types.is_scalar(val) diff --git a/python/cudf/cudf/comm/gpuarrow.py b/python/cudf/cudf/comm/gpuarrow.py index 451572224c6..85b4bf20e5c 100644 --- a/python/cudf/cudf/comm/gpuarrow.py +++ b/python/cudf/cudf/comm/gpuarrow.py @@ -6,10 +6,11 @@ import pandas as pd import pyarrow as pa +from cudf import Series from cudf._lib.gpuarrow import ( CudaRecordBatchStreamReader as _CudaRecordBatchStreamReader, ) -from cudf.core import Series, column +from cudf.core import column from cudf.utils.utils import mask_bitsize, mask_dtype diff --git a/python/cudf/cudf/core/__init__.py b/python/cudf/cudf/core/__init__.py index 5eaa5b52fd4..ec4878b332d 100644 --- a/python/cudf/cudf/core/__init__.py +++ b/python/cudf/cudf/core/__init__.py @@ -1,31 +1 @@ # Copyright (c) 2018-2021, NVIDIA CORPORATION. - -from cudf.core import _internals, buffer, column, column_accessor, common -from cudf.core.buffer import Buffer -from cudf.core.dataframe import DataFrame, from_pandas, merge -from cudf.core.index import ( - BaseIndex, - CategoricalIndex, - DatetimeIndex, - Float32Index, - Float64Index, - GenericIndex, - Index, - Int8Index, - Int16Index, - Int32Index, - Int64Index, - IntervalIndex, - RangeIndex, - TimedeltaIndex, - UInt8Index, - UInt16Index, - UInt32Index, - UInt64Index, - interval_range, -) -from cudf.core.multiindex import MultiIndex -from cudf.core.scalar import NA, Scalar -from cudf.core.series import Series -import cudf.core.udf -from cudf.core.cut import cut diff --git a/python/cudf/cudf/core/_internals/__init__.py b/python/cudf/cudf/core/_internals/__init__.py index 53d186def85..6faeeffdbec 100644 --- a/python/cudf/cudf/core/_internals/__init__.py +++ b/python/cudf/cudf/core/_internals/__init__.py @@ -1,3 +1 @@ # Copyright (c) 2021, NVIDIA CORPORATION. - -from cudf.core._internals.where import where diff --git a/python/cudf/cudf/core/buffer.py b/python/cudf/cudf/core/buffer.py index c6875052685..bb121023a68 100644 --- a/python/cudf/cudf/core/buffer.py +++ b/python/cudf/cudf/core/buffer.py @@ -11,6 +11,7 @@ import rmm from rmm import DeviceBuffer +import cudf from cudf.core.abc import Serializable @@ -157,7 +158,7 @@ def _buffer_data_from_array_interface(array_interface): ptr = array_interface["data"][0] if ptr is None: ptr = 0 - itemsize = np.dtype(array_interface["typestr"]).itemsize + itemsize = cudf.dtype(array_interface["typestr"]).itemsize shape = ( array_interface["shape"] if len(array_interface["shape"]) > 0 else (1,) ) @@ -168,7 +169,7 @@ def _buffer_data_from_array_interface(array_interface): def confirm_1d_contiguous(array_interface): strides = array_interface["strides"] shape = array_interface["shape"] - itemsize = np.dtype(array_interface["typestr"]).itemsize + itemsize = cudf.dtype(array_interface["typestr"]).itemsize typestr = array_interface["typestr"] if typestr not in ("|i1", "|u1"): raise TypeError("Buffer data must be of uint8 type") diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 5f4f5702cd6..7c1eeb06a98 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -64,7 +64,6 @@ ) from cudf.utils import ioutils, utils from cudf.utils.dtypes import ( - check_cast_unsupported_dtype, cudf_dtype_from_pa_type, get_time_unit, min_unsigned_type, @@ -241,7 +240,12 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase: """ if not isinstance(array, (pa.Array, pa.ChunkedArray)): raise TypeError("array should be PyArrow array or chunked array") + + if array.type == pa.float16(): + array = pa.Array.from_pandas(array.to_numpy().astype("float32")) + data = pa.table([array], [None]) + if isinstance(array.type, pa.DictionaryType): indices_table = pa.table( { @@ -500,7 +504,10 @@ def slice(self, start: int, stop: int, stride: int = None) -> ColumnBase: else: # Need to create a gather map for given slice with stride gather_map = arange( - start=start, stop=stop, step=stride, dtype=np.dtype(np.int32), + start=start, + stop=stop, + step=stride, + dtype=cudf.dtype(np.int32), ) return self.take(gather_map) @@ -543,7 +550,7 @@ def __setitem__(self, key: Any, value: Any): start=key_start, stop=key_stop, step=key_stride, - dtype=np.dtype(np.int32), + dtype=cudf.dtype(np.int32), ) nelem = len(key) else: @@ -950,7 +957,7 @@ def as_categorical_column(self, dtype, **kwargs) -> ColumnBase: cats = cats._column.dropna(drop_nan=False) min_type = min_unsigned_type(len(cats), 8) labels = labels - 1 - if np.dtype(min_type).itemsize < labels.dtype.itemsize: + if cudf.dtype(min_type).itemsize < labels.dtype.itemsize: labels = labels.astype(min_type) return build_categorical_column( @@ -1311,7 +1318,7 @@ def column_empty( data = None children = ( build_column( - data=Buffer.empty(row_count * np.dtype("int32").itemsize), + data=Buffer.empty(row_count * cudf.dtype("int32").itemsize), dtype="int32", ), ) @@ -1320,7 +1327,7 @@ def column_empty( children = ( full(row_count + 1, 0, dtype="int32"), build_column( - data=Buffer.empty(row_count * np.dtype("int8").itemsize), + data=Buffer.empty(row_count * cudf.dtype("int8").itemsize), dtype="int8", ), ) @@ -1719,9 +1726,9 @@ def as_column( elif hasattr(arbitrary, "__cuda_array_interface__"): desc = arbitrary.__cuda_array_interface__ - current_dtype = np.dtype(desc["typestr"]) + current_dtype = cudf.dtype(desc["typestr"]) - arb_dtype = check_cast_unsupported_dtype(current_dtype) + arb_dtype = cudf.dtype(current_dtype) if desc.get("mask", None) is not None: # Extract and remove the mask from arbitrary before @@ -1768,9 +1775,9 @@ def as_column( col = ColumnBase.from_arrow(arbitrary) if isinstance(arbitrary, pa.NullArray): if type(dtype) == str and dtype == "empty": - new_dtype = np.dtype(arbitrary.type.to_pandas_dtype()) + new_dtype = cudf.dtype(arbitrary.type.to_pandas_dtype()) else: - new_dtype = np.dtype(dtype) + new_dtype = cudf.dtype(dtype) col = col.astype(new_dtype) return col @@ -1787,7 +1794,7 @@ def as_column( elif arbitrary.dtype == np.bool_: data = as_column(cupy.asarray(arbitrary), dtype=arbitrary.dtype) elif arbitrary.dtype.kind in ("f"): - arb_dtype = check_cast_unsupported_dtype(arbitrary.dtype) + arb_dtype = cudf.dtype(arbitrary.dtype) data = as_column( cupy.asarray(arbitrary, dtype=arb_dtype), nan_as_null=nan_as_null, @@ -1825,7 +1832,7 @@ def as_column( ): arbitrary = None if dtype is None: - dtype = np.dtype("float64") + dtype = cudf.dtype("float64") data = as_column( utils.scalar_broadcast_to(arbitrary, length, dtype=dtype) @@ -1840,7 +1847,7 @@ def as_column( # CUDF assumes values are always contiguous desc = arbitrary.__array_interface__ shape = desc["shape"] - arb_dtype = np.dtype(desc["typestr"]) + arb_dtype = cudf.dtype(desc["typestr"]) # CUDF assumes values are always contiguous if len(shape) > 1: raise ValueError("Data must be 1-dimensional") @@ -1872,7 +1879,7 @@ def as_column( cast_dtype = time_unit in ("D", "W", "M", "Y") if cast_dtype: - arbitrary = arbitrary.astype(np.dtype("datetime64[s]")) + arbitrary = arbitrary.astype(cudf.dtype("datetime64[s]")) buffer = Buffer(arbitrary.view("|u1")) mask = None @@ -1892,7 +1899,7 @@ def as_column( cast_dtype = time_unit in ("D", "W", "M", "Y") if cast_dtype: - arbitrary = arbitrary.astype(np.dtype("timedelta64[s]")) + arbitrary = arbitrary.astype(cudf.dtype("timedelta64[s]")) buffer = Buffer(arbitrary.view("|u1")) mask = None @@ -1931,9 +1938,7 @@ def as_column( if dtype is not None: data = data.astype(dtype) elif arb_dtype.kind in ("f"): - arb_dtype = check_cast_unsupported_dtype( - arb_dtype if dtype is None else dtype - ) + arb_dtype = cudf.dtype(arb_dtype if dtype is None else dtype) data = as_column( cupy.asarray(arbitrary, dtype=arb_dtype), nan_as_null=nan_as_null, @@ -1946,9 +1951,9 @@ def as_column( arb_dtype = arbitrary.dtype else: if arbitrary.dtype == pd.StringDtype(): - arb_dtype = np.dtype("O") + arb_dtype = cudf.dtype("O") else: - arb_dtype = check_cast_unsupported_dtype(arbitrary.dtype) + arb_dtype = cudf.dtype(arbitrary.dtype) if arb_dtype != arbitrary.dtype.numpy_dtype: arbitrary = arbitrary.astype(arb_dtype) if ( @@ -2100,7 +2105,7 @@ def _construct_array( arbitrary, dtype=native_dtype if native_dtype is None - else np.dtype(native_dtype), + else cudf.dtype(native_dtype), ) return arbitrary @@ -2109,7 +2114,7 @@ def _data_from_cuda_array_interface_desc(obj) -> Buffer: desc = obj.__cuda_array_interface__ ptr = desc["data"][0] nelem = desc["shape"][0] if len(desc["shape"]) > 0 else 1 - dtype = np.dtype(desc["typestr"]) + dtype = cudf.dtype(desc["typestr"]) data = Buffer(data=ptr, size=nelem * dtype.itemsize, owner=obj) return data diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 4d99308d128..bf5bba9d288 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -264,7 +264,7 @@ def as_string_column( ) if len(self) > 0: return string._datetime_to_str_typecast_functions[ - np.dtype(self.dtype) + cudf.dtype(self.dtype) ](self, format) else: return cast( @@ -316,7 +316,7 @@ def binary_operator( return rhs._datetime_binop(self, op, reflect=reflect) lhs: Union[ScalarLike, ColumnBase] = self if op in ("eq", "ne", "lt", "gt", "le", "ge", "NULL_EQUALS"): - out_dtype = np.dtype(np.bool_) # type: Dtype + out_dtype = cudf.dtype(np.bool_) # type: Dtype elif op == "add" and pd.api.types.is_timedelta64_dtype(rhs.dtype): out_dtype = cudf.core.column.timedelta._timedelta_add_result_dtype( rhs, lhs @@ -389,13 +389,13 @@ def can_cast_safely(self, to_dtype: Dtype) -> bool: to_res, _ = np.datetime_data(to_dtype) self_res, _ = np.datetime_data(self.dtype) - max_int = np.iinfo(np.dtype("int64")).max + max_int = np.iinfo(cudf.dtype("int64")).max max_dist = np.timedelta64( - self.max().astype(np.dtype("int64"), copy=False), self_res + self.max().astype(cudf.dtype("int64"), copy=False), self_res ) min_dist = np.timedelta64( - self.min().astype(np.dtype("int64"), copy=False), self_res + self.min().astype(cudf.dtype("int64"), copy=False), self_res ) self_delta_dtype = np.timedelta64(0, self_res).dtype @@ -408,7 +408,7 @@ def can_cast_safely(self, to_dtype: Dtype) -> bool: return True else: return False - elif to_dtype == np.dtype("int64") or to_dtype == np.dtype("O"): + elif to_dtype == cudf.dtype("int64") or to_dtype == cudf.dtype("O"): # can safely cast to representation, or string return True else: diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 0955039dafd..03033f86ffc 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -121,14 +121,14 @@ def binary_operator( self, binop: str, rhs: BinaryOperand, reflect: bool = False, ) -> ColumnBase: int_dtypes = [ - np.dtype("int8"), - np.dtype("int16"), - np.dtype("int32"), - np.dtype("int64"), - np.dtype("uint8"), - np.dtype("uint16"), - np.dtype("uint32"), - np.dtype("uint64"), + cudf.dtype("int8"), + cudf.dtype("int16"), + cudf.dtype("int32"), + cudf.dtype("int64"), + cudf.dtype("uint8"), + cudf.dtype("uint16"), + cudf.dtype("uint32"), + cudf.dtype("uint64"), ] if rhs is None: out_dtype = self.dtype @@ -158,7 +158,7 @@ def binary_operator( (np.isscalar(tmp) and (0 == tmp)) or ((isinstance(tmp, NumericalColumn)) and (0.0 in tmp)) ): - out_dtype = np.dtype("float64") + out_dtype = cudf.dtype("float64") if binop in {"lt", "gt", "le", "ge", "eq", "ne", "NULL_EQUALS"}: out_dtype = "bool" @@ -183,13 +183,13 @@ def normalize_binop_value( if isinstance(other, cudf.Scalar): return other other_dtype = np.promote_types(self.dtype, other_dtype) - if other_dtype == np.dtype("float16"): - other_dtype = np.dtype("float32") + if other_dtype == cudf.dtype("float16"): + other_dtype = cudf.dtype("float32") other = other_dtype.type(other) if self.dtype.kind == "b": other_dtype = min_signed_type(other) if np.isscalar(other): - other = np.dtype(other_dtype).type(other) + other = cudf.dtype(other_dtype).type(other) return other else: ary = utils.scalar_broadcast_to( @@ -202,7 +202,7 @@ def normalize_binop_value( raise TypeError(f"cannot broadcast {type(other)}") def int2ip(self) -> "cudf.core.column.StringColumn": - if self.dtype != np.dtype("int64"): + if self.dtype != cudf.dtype("int64"): raise TypeError("Only int64 type can be converted to ip") return libcudf.string_casting.int2ip(self) @@ -212,7 +212,7 @@ def as_string_column( ) -> "cudf.core.column.StringColumn": if len(self) > 0: return string._numeric_to_str_typecast_functions[ - np.dtype(self.dtype) + cudf.dtype(self.dtype) ](self) else: return cast( diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 5f38f2c698e..f22aee2fbf3 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -53,47 +53,47 @@ def str_to_boolean(column: StringColumn): _str_to_numeric_typecast_functions = { - np.dtype("int8"): str_cast.stoi8, - np.dtype("int16"): str_cast.stoi16, - np.dtype("int32"): str_cast.stoi, - np.dtype("int64"): str_cast.stol, - np.dtype("uint8"): str_cast.stoui8, - np.dtype("uint16"): str_cast.stoui16, - np.dtype("uint32"): str_cast.stoui, - np.dtype("uint64"): str_cast.stoul, - np.dtype("float32"): str_cast.stof, - np.dtype("float64"): str_cast.stod, - np.dtype("bool"): str_to_boolean, + cudf.dtype("int8"): str_cast.stoi8, + cudf.dtype("int16"): str_cast.stoi16, + cudf.dtype("int32"): str_cast.stoi, + cudf.dtype("int64"): str_cast.stol, + cudf.dtype("uint8"): str_cast.stoui8, + cudf.dtype("uint16"): str_cast.stoui16, + cudf.dtype("uint32"): str_cast.stoui, + cudf.dtype("uint64"): str_cast.stoul, + cudf.dtype("float32"): str_cast.stof, + cudf.dtype("float64"): str_cast.stod, + cudf.dtype("bool"): str_to_boolean, } _numeric_to_str_typecast_functions = { - np.dtype("int8"): str_cast.i8tos, - np.dtype("int16"): str_cast.i16tos, - np.dtype("int32"): str_cast.itos, - np.dtype("int64"): str_cast.ltos, - np.dtype("uint8"): str_cast.ui8tos, - np.dtype("uint16"): str_cast.ui16tos, - np.dtype("uint32"): str_cast.uitos, - np.dtype("uint64"): str_cast.ultos, - np.dtype("float32"): str_cast.ftos, - np.dtype("float64"): str_cast.dtos, - np.dtype("bool"): str_cast.from_booleans, + cudf.dtype("int8"): str_cast.i8tos, + cudf.dtype("int16"): str_cast.i16tos, + cudf.dtype("int32"): str_cast.itos, + cudf.dtype("int64"): str_cast.ltos, + cudf.dtype("uint8"): str_cast.ui8tos, + cudf.dtype("uint16"): str_cast.ui16tos, + cudf.dtype("uint32"): str_cast.uitos, + cudf.dtype("uint64"): str_cast.ultos, + cudf.dtype("float32"): str_cast.ftos, + cudf.dtype("float64"): str_cast.dtos, + cudf.dtype("bool"): str_cast.from_booleans, } _datetime_to_str_typecast_functions = { # TODO: support Date32 UNIX days - # np.dtype("datetime64[D]"): str_cast.int2timestamp, - np.dtype("datetime64[s]"): str_cast.int2timestamp, - np.dtype("datetime64[ms]"): str_cast.int2timestamp, - np.dtype("datetime64[us]"): str_cast.int2timestamp, - np.dtype("datetime64[ns]"): str_cast.int2timestamp, + # cudf.dtype("datetime64[D]"): str_cast.int2timestamp, + cudf.dtype("datetime64[s]"): str_cast.int2timestamp, + cudf.dtype("datetime64[ms]"): str_cast.int2timestamp, + cudf.dtype("datetime64[us]"): str_cast.int2timestamp, + cudf.dtype("datetime64[ns]"): str_cast.int2timestamp, } _timedelta_to_str_typecast_functions = { - np.dtype("timedelta64[s]"): str_cast.int2timedelta, - np.dtype("timedelta64[ms]"): str_cast.int2timedelta, - np.dtype("timedelta64[us]"): str_cast.int2timedelta, - np.dtype("timedelta64[ns]"): str_cast.int2timedelta, + cudf.dtype("timedelta64[s]"): str_cast.int2timedelta, + cudf.dtype("timedelta64[ms]"): str_cast.int2timedelta, + cudf.dtype("timedelta64[us]"): str_cast.int2timedelta, + cudf.dtype("timedelta64[ns]"): str_cast.int2timedelta, } @@ -4895,7 +4895,7 @@ def __init__( Two non-null columns containing the string data and offsets respectively """ - dtype = np.dtype("object") + dtype = cudf.dtype("object") if size is None: for child in children: @@ -5240,7 +5240,7 @@ def deserialize(cls, header: dict, frames: list) -> StringColumn: return col def can_cast_safely(self, to_dtype: Dtype) -> bool: - to_dtype = np.dtype(to_dtype) + to_dtype = cudf.dtype(to_dtype) if self.dtype == to_dtype: return True diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 2c893fafae7..37bff1907fa 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -137,7 +137,7 @@ def _binary_op_floordiv( rhs = cudf.Scalar(None, "float64") else: rhs = rhs.astype(common_dtype).astype("float64") - out_dtype = np.dtype("int64") + out_dtype = cudf.dtype("int64") elif rhs.dtype.kind in ("f", "i", "u"): out_dtype = self.dtype else: @@ -204,7 +204,7 @@ def _binary_op_truediv( else: rhs = rhs.astype(common_dtype).astype("float64") - out_dtype = np.dtype("float64") + out_dtype = cudf.dtype("float64") elif rhs.dtype.kind in ("f", "i", "u"): out_dtype = self.dtype else: @@ -344,7 +344,7 @@ def as_string_column( ) if len(self) > 0: return string._timedelta_to_str_typecast_functions[ - np.dtype(self.dtype) + cudf.dtype(self.dtype) ](self, format=format) else: return cast( @@ -575,9 +575,9 @@ def nanoseconds(self) -> "cudf.core.column.NumericalColumn": def determine_out_dtype(lhs_dtype: Dtype, rhs_dtype: Dtype) -> Dtype: - if np.can_cast(np.dtype(lhs_dtype), np.dtype(rhs_dtype)): + if np.can_cast(cudf.dtype(lhs_dtype), cudf.dtype(rhs_dtype)): return rhs_dtype - elif np.can_cast(np.dtype(rhs_dtype), np.dtype(lhs_dtype)): + elif np.can_cast(cudf.dtype(rhs_dtype), cudf.dtype(lhs_dtype)): return lhs_dtype else: raise TypeError(f"Cannot type-cast {lhs_dtype} and {rhs_dtype}") @@ -594,7 +594,7 @@ def _timedelta_add_result_dtype( lhs_unit = units.index(lhs_time_unit) rhs_time_unit = cudf.utils.dtypes.get_time_unit(rhs) rhs_unit = units.index(rhs_time_unit) - out_dtype = np.dtype(f"datetime64[{units[max(lhs_unit, rhs_unit)]}]") + out_dtype = cudf.dtype(f"datetime64[{units[max(lhs_unit, rhs_unit)]}]") else: raise TypeError( f"Addition of {lhs.dtype} with {rhs.dtype} " @@ -619,7 +619,7 @@ def _timedelta_sub_result_dtype( lhs_unit = units.index(lhs_time_unit) rhs_time_unit = cudf.utils.dtypes.get_time_unit(rhs) rhs_unit = units.index(rhs_time_unit) - out_dtype = np.dtype(f"datetime64[{units[max(lhs_unit, rhs_unit)]}]") + out_dtype = cudf.dtype(f"datetime64[{units[max(lhs_unit, rhs_unit)]}]") else: raise TypeError( f"Subtraction of {lhs.dtype} with {rhs.dtype} " diff --git a/python/cudf/cudf/core/cut.py b/python/cudf/cudf/core/cut.py index 7811f477170..91f623a3cd3 100644 --- a/python/cudf/cudf/core/cut.py +++ b/python/cudf/cudf/core/cut.py @@ -5,7 +5,6 @@ import pandas as pd import cudf -from cudf._lib.labeling import label_bins from cudf.core.column import as_column, build_categorical_column from cudf.core.index import IntervalIndex, interval_range from cudf.utils.dtypes import is_list_like @@ -240,7 +239,7 @@ def cut( # the input arr must be changed to the same type as the edges input_arr = input_arr.astype(left_edges.dtype) # get the indexes for the appropriate number - index_labels = label_bins( + index_labels = cudf._lib.labeling.label_bins( input_arr, left_edges, left_inclusive, right_edges, right_inclusive ) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index bc068413efb..7d2fe5dfb98 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -23,6 +23,7 @@ from pandas.io.formats.printing import pprint_thing import cudf +import cudf.core.common from cudf import _lib as libcudf from cudf.api.types import is_bool_dtype, is_dict_like from cudf.core import column, reshape @@ -3533,12 +3534,12 @@ def as_gpu_matrix(self, columns=None, order="F"): if ncol < 1: # This is the case for empty dataframe - construct empty cupy array matrix = cupy.empty( - shape=(0, 0), dtype=np.dtype("float64"), order=order + shape=(0, 0), dtype=cudf.dtype("float64"), order=order ) return cuda.as_cuda_array(matrix) if any( - (is_categorical_dtype(c) or np.issubdtype(c, np.dtype("object"))) + (is_categorical_dtype(c) or np.issubdtype(c, cudf.dtype("object"))) for c in cols ): raise TypeError("non-numeric data not yet supported") @@ -3552,7 +3553,7 @@ def as_gpu_matrix(self, columns=None, order="F"): ) cupy_dtype = dtype if np.issubdtype(cupy_dtype, np.datetime64): - cupy_dtype = np.dtype("int64") + cupy_dtype = cudf.dtype("int64") if order not in ("F", "C"): raise ValueError( @@ -5752,7 +5753,7 @@ def to_records(self, index=True): """ members = [("index", self.index.dtype)] if index else [] members += [(col, self[col].dtype) for col in self._data.names] - dtype = np.dtype(members) + dtype = cudf.dtype(members) ret = np.recarray(len(self), dtype=dtype) if index: ret["index"] = self.index.to_array() @@ -6137,12 +6138,12 @@ def isin(self, values): isinstance( self[col]._column, cudf.core.column.CategoricalColumn ) - or np.issubdtype(self[col].dtype, np.dtype("object")) + or np.issubdtype(self[col].dtype, cudf.dtype("object")) ) or ( isinstance( values._column, cudf.core.column.CategoricalColumn ) - or np.issubdtype(values.dtype, np.dtype("object")) + or np.issubdtype(values.dtype, cudf.dtype("object")) ): result[col] = utils.scalar_broadcast_to(False, len(self)) else: @@ -7209,7 +7210,7 @@ def _apply_support_method(self, method, axis=0, *args, **kwargs): prepared._data[col] ) if not is_datetime_dtype(common_dtype) - else np.dtype("float64") + else cudf.dtype("float64") ) .fillna(np.nan) ) @@ -8081,7 +8082,7 @@ def _get_union_of_indices(indexes): if len(indexes) == 1: return indexes[0] else: - merged_index = cudf.core.Index._concat(indexes) + merged_index = cudf.Index._concat(indexes) merged_index = merged_index.drop_duplicates() _, inds = merged_index._values.sort_by_values() return merged_index.take(inds) diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 4062b734bb3..03da08097e2 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -21,6 +21,55 @@ from cudf.core.buffer import Buffer +def dtype(arbitrary): + """ + Return the cuDF-supported dtype corresponding to `arbitrary`. + + Inputs + ------ + arbitrary: dtype or scalar-like + + Returns + ------- + dtype: the cuDF-supported dtype that best matches `arbitrary` + """ + # first, try interpreting arbitrary as a NumPy dtype that we support: + try: + np_dtype = np.dtype(arbitrary) + if np_dtype.name == "float16": + np_dtype = np.dtype("float32") + elif np_dtype.kind in ("OU"): + np_dtype = np.dtype("object") + except TypeError: + pass + else: + if np_dtype.kind not in "biufUOMm": + raise TypeError(f"Unsupported type {np_dtype}") + return np_dtype + + # next, check if `arbitrary` is one of our extension types: + if isinstance(arbitrary, cudf.core.dtypes._BaseDtype): + return arbitrary + + # use `pandas_dtype` to try and interpret + # `arbitrary` as a Pandas extension type. + # Return the corresponding NumPy/cuDF type. + pd_dtype = pd.api.types.pandas_dtype(arbitrary) + try: + return pd_dtype.numpy_dtype + except AttributeError: + if isinstance(pd_dtype, pd.CategoricalDtype): + return cudf.CategoricalDtype.from_pandas(pd_dtype) + elif isinstance(pd_dtype, pd.StringDtype): + return np.dtype("object") + elif isinstance(pd_dtype, pd.IntervalDtype): + return cudf.IntervalDtype.from_pandas(pd_dtype) + else: + raise TypeError( + f"Cannot interpret {arbitrary} as a valid cuDF dtype" + ) + + class _BaseDtype(ExtensionDtype, Serializable): # Base type for all cudf-specific dtypes pass @@ -157,7 +206,7 @@ def element_type(self) -> Dtype: elif isinstance(self._typ.value_type, pa.StructType): return StructDtype.from_arrow(self._typ.value_type) else: - return np.dtype(self._typ.value_type.to_pandas_dtype()).name + return cudf.dtype(self._typ.value_type.to_pandas_dtype()).name @property def leaf_type(self): diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 14b8ebe801f..e26248340f5 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -884,8 +884,9 @@ def where(self, cond, other=None, inplace=False): 4 dtype: int64 """ + import cudf.core._internals.where - return cudf.core._internals.where( + return cudf.core._internals.where.where( frame=self, cond=cond, other=other, inplace=inplace ) @@ -3304,7 +3305,7 @@ def _reindex( if index is not None: index = cudf.core.index.as_index(index) - if isinstance(index, cudf.core.MultiIndex): + if isinstance(index, cudf.MultiIndex): idx_dtype_match = ( df.index._source_data.dtypes == index._source_data.dtypes ).all() @@ -4016,7 +4017,7 @@ def _get_replacement_values_for_columns( col: [value] if _is_non_decimal_numeric_dtype(columns_dtype_map[col]) else cudf.utils.utils.scalar_broadcast_to( - value, (len(to_replace),), np.dtype(type(value)), + value, (len(to_replace),), cudf.dtype(type(value)), ) for col in columns_dtype_map } diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 97ee0948209..64041e23763 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -646,12 +646,12 @@ def append(self, other): if is_mixed_with_object_dtype(this, other): got_dtype = ( other.dtype - if this.dtype == np.dtype("object") + if this.dtype == cudf.dtype("object") else this.dtype ) raise TypeError( f"cudf does not support appending an Index of " - f"dtype `{np.dtype('object')}` with an Index " + f"dtype `{cudf.dtype('object')}` with an Index " f"of dtype `{got_dtype}`, please type-cast " f"either one of them to same dtypes." ) @@ -1629,7 +1629,7 @@ def dtype(self): """ `dtype` of the range of values in RangeIndex. """ - return np.dtype(np.int64) + return cudf.dtype(np.int64) @property def is_contiguous(self): diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py index 75be36a1b16..cda9e9414e4 100644 --- a/python/cudf/cudf/core/scalar.py +++ b/python/cudf/cudf/core/scalar.py @@ -6,7 +6,6 @@ from pandas._libs.missing import NAType as pd_NAType import cudf -from cudf._lib.scalar import DeviceScalar, _is_null_host_scalar from cudf.core.column.column import ColumnBase from cudf.core.dtypes import Decimal64Dtype, ListDtype, StructDtype from cudf.core.index import BaseIndex @@ -68,7 +67,7 @@ def __init__(self, value, dtype=None): self._host_dtype = value._host_dtype else: self._device_value = value._device_value - elif isinstance(value, DeviceScalar): + elif isinstance(value, cudf._lib.scalar.DeviceScalar): self._device_value = value else: self._host_value, self._host_dtype = self._preprocess_host_value( @@ -86,7 +85,7 @@ def _is_device_value_current(self): @property def device_value(self): if self._device_value is None: - self._device_value = DeviceScalar( + self._device_value = cudf._lib.scalar.DeviceScalar( self._host_value, self._host_dtype ) return self._device_value @@ -102,7 +101,7 @@ def value(self): def dtype(self): if self._is_host_value_current: if isinstance(self._host_value, str): - return np.dtype("object") + return cudf.dtype("object") else: return self._host_dtype else: @@ -111,13 +110,13 @@ def dtype(self): def is_valid(self): if not self._is_host_value_current: self._device_value_to_host() - return not _is_null_host_scalar(self._host_value) + return not cudf._lib.scalar._is_null_host_scalar(self._host_value) def _device_value_to_host(self): self._host_value = self._device_value._to_host_scalar() def _preprocess_host_value(self, value, dtype): - valid = not _is_null_host_scalar(value) + valid = not cudf._lib.scalar._is_null_host_scalar(value) if isinstance(value, list): if dtype is not None: @@ -187,7 +186,7 @@ def _sync(self): if self._is_host_value_current and self._is_device_value_current: return elif self._is_host_value_current and not self._is_device_value_current: - self._device_value = DeviceScalar( + self._device_value = cudf._lib.scalar.DeviceScalar( self._host_value, self._host_dtype ) elif self._is_device_value_current and not self._is_host_value_current: @@ -324,10 +323,10 @@ def _binop_result_dtype_or_error(self, other, op): and self.dtype.char == other.dtype.char == "M" ): res, _ = np.datetime_data(max(self.dtype, other.dtype)) - return np.dtype("m8" + f"[{res}]") + return cudf.dtype("m8" + f"[{res}]") return np.result_type(self.dtype, other.dtype) - return np.dtype(out_dtype) + return cudf.dtype(out_dtype) def _scalar_binop(self, other, op): if isinstance(other, (ColumnBase, Series, BaseIndex, np.ndarray)): @@ -358,9 +357,9 @@ def _unaop_result_type_or_error(self, op): if op in {"__ceil__", "__floor__"}: if self.dtype.char in "bBhHf?": - return np.dtype("float32") + return cudf.dtype("float32") else: - return np.dtype("float64") + return cudf.dtype("float64") return self.dtype def _scalar_unaop(self, op): diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 00f60cfc8b5..75ecb9b90be 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -495,7 +495,7 @@ def __init__(self, n=1, normalize=False, **kwds): dtype = "int16" else: unit = self._UNITS_TO_CODES[k] - dtype = np.dtype(f"timedelta64[{unit}]") + dtype = cudf.dtype(f"timedelta64[{unit}]") scalars[k] = cudf.Scalar(v, dtype=dtype) self._scalars = scalars diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py index 6d31c1ba74d..d5c4df12246 100644 --- a/python/cudf/cudf/core/tools/numeric.py +++ b/python/cudf/cudf/core/tools/numeric.py @@ -109,7 +109,7 @@ def to_numeric(arg, errors="raise", downcast=None): dtype = col.dtype if is_datetime_dtype(dtype) or is_timedelta_dtype(dtype): - col = col.as_numerical_column(np.dtype("int64")) + col = col.as_numerical_column(cudf.dtype("int64")) elif is_categorical_dtype(dtype): cat_dtype = col.dtype.type if _is_non_decimal_numeric_dtype(cat_dtype): @@ -140,7 +140,7 @@ def to_numeric(arg, errors="raise", downcast=None): raise ValueError("Unrecognized datatype") # str->float conversion may require lower precision - if col.dtype == np.dtype("f"): + if col.dtype == cudf.dtype("f"): col = col.as_numerical_column("d") if downcast: @@ -150,13 +150,13 @@ def to_numeric(arg, errors="raise", downcast=None): "unsigned": list(np.typecodes["UnsignedInteger"]), } float_types = list(np.typecodes["Float"]) - idx = float_types.index(np.dtype(np.float32).char) + idx = float_types.index(cudf.dtype(np.float32).char) downcast_type_map["float"] = float_types[idx:] type_set = downcast_type_map[downcast] for t in type_set: - downcast_dtype = np.dtype(t) + downcast_dtype = cudf.dtype(t) if downcast_dtype.itemsize <= col.dtype.itemsize: if col.can_cast_safely(downcast_dtype): col = libcudf.unary.cast(col, downcast_dtype) @@ -197,7 +197,7 @@ def _convert_str_col(col, errors, _downcast=None): is_integer = libstrings.is_integer(col) if is_integer.all(): - return col.as_numerical_column(dtype=np.dtype("i8")) + return col.as_numerical_column(dtype=cudf.dtype("i8")) col = _proc_inf_empty_strings(col) @@ -210,9 +210,9 @@ def _convert_str_col(col, errors, _downcast=None): "limited by float32 precision." ) ) - return col.as_numerical_column(dtype=np.dtype("f")) + return col.as_numerical_column(dtype=cudf.dtype("f")) else: - return col.as_numerical_column(dtype=np.dtype("d")) + return col.as_numerical_column(dtype=cudf.dtype("d")) else: if errors == "coerce": col = libcudf.string_casting.stod(col) diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index d8761057683..22dfd1aa145 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -13,7 +13,7 @@ import pytest import cudf -from cudf.core import Series +from cudf import Series from cudf.core.index import as_index from cudf.testing import _utils as utils from cudf.utils.dtypes import ( diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py index 761b2f32f18..cc4c98b611f 100644 --- a/python/cudf/cudf/tests/test_column.py +++ b/python/cudf/cudf/tests/test_column.py @@ -362,7 +362,7 @@ def test_column_view_string_slice(slc): ) def test_as_column_buffer(data, expected): actual_column = cudf.core.column.as_column( - cudf.core.Buffer(data), dtype=data.dtype + cudf.core.buffer.Buffer(data), dtype=data.dtype ) assert_eq(cudf.Series(actual_column), cudf.Series(expected)) diff --git a/python/cudf/cudf/tests/test_copying.py b/python/cudf/cudf/tests/test_copying.py index 0965b5298a4..21a6a9172db 100644 --- a/python/cudf/cudf/tests/test_copying.py +++ b/python/cudf/cudf/tests/test_copying.py @@ -5,7 +5,7 @@ import pytest import cudf -from cudf.core import Series +from cudf import Series from cudf.testing._utils import NUMERIC_TYPES, OTHER_TYPES, assert_eq diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 5f5a0a78414..4e17e4e52df 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -12,7 +12,7 @@ import pytest import cudf -from cudf.core import DataFrame, Series +from cudf import DataFrame, Series from cudf.core.index import DatetimeIndex from cudf.testing._utils import ( DATETIME_TYPES, diff --git a/python/cudf/cudf/tests/test_factorize.py b/python/cudf/cudf/tests/test_factorize.py index 3df0031745e..46cbc9d2b52 100644 --- a/python/cudf/cudf/tests/test_factorize.py +++ b/python/cudf/cudf/tests/test_factorize.py @@ -6,7 +6,7 @@ import pytest import cudf -from cudf.core import DataFrame, Index +from cudf import DataFrame, Index from cudf.testing._utils import assert_eq diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index de7d8e35bce..10217a2193f 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -13,7 +13,7 @@ import rmm import cudf -from cudf.core import DataFrame, Series +from cudf import DataFrame, Series from cudf.core._compat import PANDAS_GE_110 from cudf.testing._utils import ( DATETIME_TYPES, diff --git a/python/cudf/cudf/tests/test_label_encode.py b/python/cudf/cudf/tests/test_label_encode.py index f513aa7a134..bac324d9c1c 100644 --- a/python/cudf/cudf/tests/test_label_encode.py +++ b/python/cudf/cudf/tests/test_label_encode.py @@ -7,7 +7,7 @@ import pytest import cudf -from cudf.core import DataFrame, Series +from cudf import DataFrame, Series def _random_float(nelem, dtype): diff --git a/python/cudf/cudf/tests/test_monotonic.py b/python/cudf/cudf/tests/test_monotonic.py index e9c828ec0f5..7643bfdf050 100644 --- a/python/cudf/cudf/tests/test_monotonic.py +++ b/python/cudf/cudf/tests/test_monotonic.py @@ -8,7 +8,7 @@ import pytest import cudf -from cudf.core import MultiIndex, Series +from cudf import MultiIndex, Series from cudf.core.index import ( CategoricalIndex, DatetimeIndex, diff --git a/python/cudf/cudf/tests/test_numpy_interop.py b/python/cudf/cudf/tests/test_numpy_interop.py index e5efe2f027d..55b5a38c3e5 100644 --- a/python/cudf/cudf/tests/test_numpy_interop.py +++ b/python/cudf/cudf/tests/test_numpy_interop.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from cudf.core import DataFrame, Series +from cudf import DataFrame, Series from cudf.testing._utils import assert_eq diff --git a/python/cudf/cudf/tests/test_onehot.py b/python/cudf/cudf/tests/test_onehot.py index bbec4594e15..0a3ead6cf31 100644 --- a/python/cudf/cudf/tests/test_onehot.py +++ b/python/cudf/cudf/tests/test_onehot.py @@ -7,7 +7,7 @@ import pytest import cudf -from cudf.core import DataFrame, GenericIndex, Series +from cudf import DataFrame, Index, Series from cudf.testing import _utils as utils @@ -86,7 +86,7 @@ def test_onehot_generic_index(): indices = np.random.randint(low=0, high=100, size=size) df = DataFrame() values = np.random.randint(low=0, high=4, size=size) - df["fo"] = Series(values, index=GenericIndex(indices)) + df["fo"] = Series(values, index=Index(indices)) out = df.one_hot_encoding( "fo", cats=df.fo.unique(), prefix="fo", dtype=np.int32 ) diff --git a/python/cudf/cudf/tests/test_pack.py b/python/cudf/cudf/tests/test_pack.py index c735a71d5e1..8f54e17c0c3 100644 --- a/python/cudf/cudf/tests/test_pack.py +++ b/python/cudf/cudf/tests/test_pack.py @@ -18,8 +18,8 @@ import numpy as np import pandas as pd +from cudf import DataFrame, GenericIndex, Series from cudf._lib.copying import pack, unpack -from cudf.core import DataFrame, GenericIndex, Series from cudf.testing._utils import assert_eq diff --git a/python/cudf/cudf/tests/test_pandas_interop.py b/python/cudf/cudf/tests/test_pandas_interop.py index a8a45fc3c28..c90d6f23c2d 100644 --- a/python/cudf/cudf/tests/test_pandas_interop.py +++ b/python/cudf/cudf/tests/test_pandas_interop.py @@ -4,7 +4,7 @@ import pandas as pd import cudf -from cudf.core import DataFrame +from cudf import DataFrame from cudf.testing._utils import assert_eq diff --git a/python/cudf/cudf/tests/test_pickling.py b/python/cudf/cudf/tests/test_pickling.py index 48a25fcfadb..0f8b46cee35 100644 --- a/python/cudf/cudf/tests/test_pickling.py +++ b/python/cudf/cudf/tests/test_pickling.py @@ -6,7 +6,7 @@ import pandas as pd import pytest -from cudf.core import DataFrame, GenericIndex, Series +from cudf import DataFrame, GenericIndex, Series from cudf.core.buffer import Buffer from cudf.testing._utils import assert_eq diff --git a/python/cudf/cudf/tests/test_query.py b/python/cudf/cudf/tests/test_query.py index 8dc5df2dd7c..07c6cce5cd3 100644 --- a/python/cudf/cudf/tests/test_query.py +++ b/python/cudf/cudf/tests/test_query.py @@ -11,7 +11,7 @@ import pytest import cudf -from cudf.core import DataFrame +from cudf import DataFrame from cudf.testing._utils import assert_eq from cudf.utils import queryutils diff --git a/python/cudf/cudf/tests/test_rank.py b/python/cudf/cudf/tests/test_rank.py index 3c98496def3..563278e3a8f 100644 --- a/python/cudf/cudf/tests/test_rank.py +++ b/python/cudf/cudf/tests/test_rank.py @@ -6,7 +6,7 @@ import pandas as pd import pytest -from cudf.core import DataFrame +from cudf import DataFrame from cudf.testing._utils import assert_eq, assert_exceptions_equal diff --git a/python/cudf/cudf/tests/test_reductions.py b/python/cudf/cudf/tests/test_reductions.py index b7bbefb8c58..5b6a91e33ce 100644 --- a/python/cudf/cudf/tests/test_reductions.py +++ b/python/cudf/cudf/tests/test_reductions.py @@ -11,7 +11,7 @@ import pytest import cudf -from cudf.core import Series +from cudf import Series from cudf.core.dtypes import Decimal64Dtype from cudf.testing import _utils as utils from cudf.testing._utils import NUMERIC_TYPES, assert_eq, gen_rand diff --git a/python/cudf/cudf/tests/test_sorting.py b/python/cudf/cudf/tests/test_sorting.py index 759feedf2d5..ef9f853bd11 100644 --- a/python/cudf/cudf/tests/test_sorting.py +++ b/python/cudf/cudf/tests/test_sorting.py @@ -7,7 +7,7 @@ import pandas as pd import pytest -from cudf.core import DataFrame, Series +from cudf import DataFrame, Series from cudf.core.column import NumericalColumn from cudf.testing._utils import ( DATETIME_TYPES, diff --git a/python/cudf/cudf/tests/test_sparse_df.py b/python/cudf/cudf/tests/test_sparse_df.py index 50c8f3f41a8..e10ad8e5306 100644 --- a/python/cudf/cudf/tests/test_sparse_df.py +++ b/python/cudf/cudf/tests/test_sparse_df.py @@ -6,8 +6,8 @@ import pytest from numba import cuda +from cudf import DataFrame, Series from cudf.comm.gpuarrow import GpuArrowReader -from cudf.core import DataFrame, Series from cudf.testing._utils import assert_eq diff --git a/python/cudf/cudf/tests/test_transform.py b/python/cudf/cudf/tests/test_transform.py index 582d5a43edf..0c246554082 100644 --- a/python/cudf/cudf/tests/test_transform.py +++ b/python/cudf/cudf/tests/test_transform.py @@ -5,7 +5,7 @@ import numpy as np import pytest -from cudf.core import Series +from cudf import Series from cudf.testing._utils import NUMERIC_TYPES supported_types = NUMERIC_TYPES diff --git a/python/cudf/cudf/tests/test_udf_binops.py b/python/cudf/cudf/tests/test_udf_binops.py index 5a5aca615ba..4d6188acf8c 100644 --- a/python/cudf/cudf/tests/test_udf_binops.py +++ b/python/cudf/cudf/tests/test_udf_binops.py @@ -7,8 +7,7 @@ from numba.np import numpy_support import cudf -from cudf import _lib as libcudf -from cudf.core import Series +from cudf import Series, _lib as libcudf from cudf.utils import dtypes as dtypeutils diff --git a/python/cudf/cudf/tests/test_unaops.py b/python/cudf/cudf/tests/test_unaops.py index c549dd2712b..25ebe6fa710 100644 --- a/python/cudf/cudf/tests/test_unaops.py +++ b/python/cudf/cudf/tests/test_unaops.py @@ -9,7 +9,7 @@ import pytest import cudf -from cudf.core import Series +from cudf import Series from cudf.testing import _utils as utils _unaops = [operator.abs, operator.invert, operator.neg, np.ceil, np.floor] diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 71173faf9d7..9511bb389e7 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -54,16 +54,16 @@ } cudf_dtypes_to_pandas_dtypes = { - np.dtype("uint8"): pd.UInt8Dtype(), - np.dtype("uint16"): pd.UInt16Dtype(), - np.dtype("uint32"): pd.UInt32Dtype(), - np.dtype("uint64"): pd.UInt64Dtype(), - np.dtype("int8"): pd.Int8Dtype(), - np.dtype("int16"): pd.Int16Dtype(), - np.dtype("int32"): pd.Int32Dtype(), - np.dtype("int64"): pd.Int64Dtype(), - np.dtype("bool_"): pd.BooleanDtype(), - np.dtype("object"): pd.StringDtype(), + cudf.dtype("uint8"): pd.UInt8Dtype(), + cudf.dtype("uint16"): pd.UInt16Dtype(), + cudf.dtype("uint32"): pd.UInt32Dtype(), + cudf.dtype("uint64"): pd.UInt64Dtype(), + cudf.dtype("int8"): pd.Int8Dtype(), + cudf.dtype("int16"): pd.Int16Dtype(), + cudf.dtype("int32"): pd.Int32Dtype(), + cudf.dtype("int64"): pd.Int64Dtype(), + cudf.dtype("bool_"): pd.BooleanDtype(), + cudf.dtype("object"): pd.StringDtype(), } pyarrow_dtypes_to_pandas_dtypes = { @@ -80,16 +80,16 @@ } pandas_dtypes_to_cudf_dtypes = { - pd.UInt8Dtype(): np.dtype("uint8"), - pd.UInt16Dtype(): np.dtype("uint16"), - pd.UInt32Dtype(): np.dtype("uint32"), - pd.UInt64Dtype(): np.dtype("uint64"), - pd.Int8Dtype(): np.dtype("int8"), - pd.Int16Dtype(): np.dtype("int16"), - pd.Int32Dtype(): np.dtype("int32"), - pd.Int64Dtype(): np.dtype("int64"), - pd.BooleanDtype(): np.dtype("bool_"), - pd.StringDtype(): np.dtype("object"), + pd.UInt8Dtype(): cudf.dtype("uint8"), + pd.UInt16Dtype(): cudf.dtype("uint16"), + pd.UInt32Dtype(): cudf.dtype("uint32"), + pd.UInt64Dtype(): cudf.dtype("uint64"), + pd.Int8Dtype(): cudf.dtype("int8"), + pd.Int16Dtype(): cudf.dtype("int16"), + pd.Int32Dtype(): cudf.dtype("int32"), + pd.Int64Dtype(): cudf.dtype("int64"), + pd.BooleanDtype(): cudf.dtype("bool_"), + pd.StringDtype(): cudf.dtype("object"), } pandas_dtypes_alias_to_cudf_alias = { @@ -105,10 +105,10 @@ } if PANDAS_GE_120: - cudf_dtypes_to_pandas_dtypes[np.dtype("float32")] = pd.Float32Dtype() - cudf_dtypes_to_pandas_dtypes[np.dtype("float64")] = pd.Float64Dtype() - pandas_dtypes_to_cudf_dtypes[pd.Float32Dtype()] = np.dtype("float32") - pandas_dtypes_to_cudf_dtypes[pd.Float64Dtype()] = np.dtype("float64") + cudf_dtypes_to_pandas_dtypes[cudf.dtype("float32")] = pd.Float32Dtype() + cudf_dtypes_to_pandas_dtypes[cudf.dtype("float64")] = pd.Float64Dtype() + pandas_dtypes_to_cudf_dtypes[pd.Float32Dtype()] = cudf.dtype("float32") + pandas_dtypes_to_cudf_dtypes[pd.Float64Dtype()] = cudf.dtype("float64") pandas_dtypes_alias_to_cudf_alias["Float32"] = "float32" pandas_dtypes_alias_to_cudf_alias["Float64"] = "float64" @@ -351,7 +351,7 @@ def min_signed_type(x, min_size=8): that can represent the integer ``x`` """ for int_dtype in np.sctypes["int"]: - if (np.dtype(int_dtype).itemsize * 8) >= min_size: + if (cudf.dtype(int_dtype).itemsize * 8) >= min_size: if np.iinfo(int_dtype).min <= x <= np.iinfo(int_dtype).max: return int_dtype # resort to using `int64` and let numpy raise appropriate exception: @@ -364,7 +364,7 @@ def min_unsigned_type(x, min_size=8): that can represent the integer ``x`` """ for int_dtype in np.sctypes["uint"]: - if (np.dtype(int_dtype).itemsize * 8) >= min_size: + if (cudf.dtype(int_dtype).itemsize * 8) >= min_size: if 0 <= x <= np.iinfo(int_dtype).max: return int_dtype # resort to using `uint64` and let numpy raise appropriate exception: @@ -388,9 +388,9 @@ def min_column_type(x, expected_type): max_bound_dtype = np.min_scalar_type(x.max()) min_bound_dtype = np.min_scalar_type(x.min()) result_type = np.promote_types(max_bound_dtype, min_bound_dtype) - if result_type == np.dtype("float16"): + if result_type == cudf.dtype("float16"): # cuDF does not support float16 dtype - result_type = np.dtype("float32") + result_type = cudf.dtype("float32") return result_type if np.issubdtype(expected_type, np.integer): @@ -405,32 +405,12 @@ def get_min_float_dtype(col): max_bound_dtype = np.min_scalar_type(float(col.max())) min_bound_dtype = np.min_scalar_type(float(col.min())) result_type = np.promote_types(max_bound_dtype, min_bound_dtype) - if result_type == np.dtype("float16"): + if result_type == cudf.dtype("float16"): # cuDF does not support float16 dtype - result_type = np.dtype("float32") + result_type = cudf.dtype("float32") return result_type -def check_cast_unsupported_dtype(dtype): - if is_categorical_dtype(dtype): - return dtype - - if isinstance(dtype, pd.core.arrays.numpy_.PandasDtype): - dtype = dtype.numpy_dtype - else: - dtype = cudf.dtype(dtype) - - if dtype in cudf._lib.types.np_to_cudf_types: - return dtype - - if dtype == np.dtype("float16"): - return np.dtype("float32") - - raise NotImplementedError( - f"Cannot cast {dtype} dtype, as it is not supported by CuDF." - ) - - def is_mixed_with_object_dtype(lhs, rhs): return (lhs.dtype == "object" and rhs.dtype != "object") or ( rhs.dtype == "object" and lhs.dtype != "object" @@ -550,7 +530,7 @@ def find_common_type(dtypes): [dtype for dtype in dtypes if is_decimal_dtype(dtype)] ) else: - return np.dtype("O") + return cudf.dtype("O") # Corner case 1: # Resort to np.result_type to handle "M" and "m" types separately @@ -567,9 +547,9 @@ def find_common_type(dtypes): dtypes.add(np.result_type(*td_dtypes)) common_dtype = np.find_common_type(list(dtypes), []) - if common_dtype == np.dtype("float16"): + if common_dtype == cudf.dtype("float16"): # cuDF does not support float16 dtype - return np.dtype("float32") + return cudf.dtype("float32") else: return common_dtype @@ -582,9 +562,9 @@ def _can_cast(from_dtype, to_dtype): cudf specific dtypes. """ if isinstance(from_dtype, type): - from_dtype = np.dtype(from_dtype) + from_dtype = cudf.dtype(from_dtype) if isinstance(to_dtype, type): - to_dtype = np.dtype(to_dtype) + to_dtype = cudf.dtype(to_dtype) # TODO : Add precision & scale checking for # decimal types in future diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index 58d133d16d8..c9d38c8399e 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -17,7 +17,7 @@ from cudf.utils.dtypes import to_cudf_compatible_scalar # The size of the mask in bytes -mask_dtype = np.dtype(np.int32) +mask_dtype = cudf.dtype(np.int32) mask_bitsize = mask_dtype.itemsize * 8 diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index 53543b9e886..0cc2821e8ef 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -67,10 +67,10 @@ def _nonempty_index(idx): return cudf.core.index.GenericIndex( np.arange(2, dtype=idx.dtype), name=idx.name ) - elif isinstance(idx, cudf.core.MultiIndex): + elif isinstance(idx, cudf.core.multiindex.MultiIndex): levels = [meta_nonempty(lev) for lev in idx.levels] codes = [[0, 0] for i in idx.levels] - return cudf.core.MultiIndex( + return cudf.core.multiindex.MultiIndex( levels=levels, codes=codes, names=idx.names ) From 3eba47c14447ee5f36451fd80eebd9a9bcd18d0b Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 11 Aug 2021 13:57:09 -0400 Subject: [PATCH 12/20] Progress --- python/cudf/cudf/_lib/copying.pyx | 7 ++++--- python/cudf/cudf/core/column/column.py | 3 --- python/cudf/cudf/core/dataframe.py | 2 +- python/cudf/cudf/utils/dtypes.py | 23 +++++++---------------- 4 files changed, 12 insertions(+), 23 deletions(-) diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index a5789e4d0ae..aa279e84d91 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -787,12 +787,13 @@ cdef class _CPackedColumns: """ Construct a ``PackedColumns`` object from a ``cudf.DataFrame``. """ - from cudf import RangeIndex, dtypes + import cudf.core.dtypes + from cudf import RangeIndex cdef _CPackedColumns p = _CPackedColumns.__new__(_CPackedColumns) if keep_index and ( - not isinstance(input_table.index, RangeIndex) + not isinstance(input_table.index, cudf.RangeIndex) or input_table.index.start != 0 or input_table.index.stop != len(input_table) or input_table.index.step != 1 @@ -805,7 +806,7 @@ cdef class _CPackedColumns: p.column_names = input_table._column_names p.column_dtypes = {} for name, col in input_table._data.items(): - if isinstance(col.dtype, dtypes._BaseDtype): + if isinstance(col.dtype, cudf.core.dtypes._BaseDtype): p.column_dtypes[name] = col.dtype p.c_obj = move(cpp_copying.pack(input_table_view)) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 7b7bef0e2d1..7a02e98ef12 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -261,9 +261,6 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase: if not isinstance(array, (pa.Array, pa.ChunkedArray)): raise TypeError("array should be PyArrow array or chunked array") - if array.type == pa.float16(): - array = pa.Array.from_pandas(array.to_numpy().astype("float32")) - data = pa.table([array], [None]) if isinstance(array.type, pa.DictionaryType): diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index a9ddc77963c..d18c9adb7a1 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -5756,7 +5756,7 @@ def to_records(self, index=True): """ members = [("index", self.index.dtype)] if index else [] members += [(col, self[col].dtype) for col in self._data.names] - dtype = cudf.dtype(members) + dtype = np.dtype(members) ret = np.recarray(len(self), dtype=dtype) if index: ret["index"] = self.index.to_array() diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 9511bb389e7..81727aad9a2 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -388,27 +388,22 @@ def min_column_type(x, expected_type): max_bound_dtype = np.min_scalar_type(x.max()) min_bound_dtype = np.min_scalar_type(x.min()) result_type = np.promote_types(max_bound_dtype, min_bound_dtype) - if result_type == cudf.dtype("float16"): - # cuDF does not support float16 dtype - result_type = cudf.dtype("float32") - return result_type - if np.issubdtype(expected_type, np.integer): + elif np.issubdtype(expected_type, np.integer): max_bound_dtype = np.min_scalar_type(x.max()) min_bound_dtype = np.min_scalar_type(x.min()) - return np.promote_types(max_bound_dtype, min_bound_dtype) + result_type = np.promote_types(max_bound_dtype, min_bound_dtype) + else: + result_type = x.dtype - return x.dtype + return cudf.dtype(result_type) def get_min_float_dtype(col): max_bound_dtype = np.min_scalar_type(float(col.max())) min_bound_dtype = np.min_scalar_type(float(col.min())) result_type = np.promote_types(max_bound_dtype, min_bound_dtype) - if result_type == cudf.dtype("float16"): - # cuDF does not support float16 dtype - result_type = cudf.dtype("float32") - return result_type + return cudf.dtype(result_type) def is_mixed_with_object_dtype(lhs, rhs): @@ -547,11 +542,7 @@ def find_common_type(dtypes): dtypes.add(np.result_type(*td_dtypes)) common_dtype = np.find_common_type(list(dtypes), []) - if common_dtype == cudf.dtype("float16"): - # cuDF does not support float16 dtype - return cudf.dtype("float32") - else: - return common_dtype + return cudf.dtype(common_dtype) def _can_cast(from_dtype, to_dtype): From 048629c4b31db385e855c2eb0e1578186aaffa8f Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 11 Aug 2021 14:32:27 -0400 Subject: [PATCH 13/20] More fix --- python/cudf/cudf/core/dtypes.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 0e5646c6026..c5988d76207 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -38,6 +38,8 @@ def dtype(arbitrary): np_dtype = np.dtype(arbitrary) if np_dtype.name == "float16": np_dtype = np.dtype("float32") + elif np_dtype.name == "float128": + raise NotImplementedError() elif np_dtype.kind in ("OU"): np_dtype = np.dtype("object") except TypeError: @@ -56,7 +58,7 @@ def dtype(arbitrary): # Return the corresponding NumPy/cuDF type. pd_dtype = pd.api.types.pandas_dtype(arbitrary) try: - return pd_dtype.numpy_dtype + return dtype(pd_dtype.numpy_dtype) except AttributeError: if isinstance(pd_dtype, pd.CategoricalDtype): return cudf.CategoricalDtype.from_pandas(pd_dtype) From 40736c46cce9615e91f0895b31264a9a3cc7df53 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 11 Aug 2021 14:42:26 -0400 Subject: [PATCH 14/20] Early returns --- python/cudf/cudf/core/dtypes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index c5988d76207..ead0b6453c1 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -37,11 +37,11 @@ def dtype(arbitrary): try: np_dtype = np.dtype(arbitrary) if np_dtype.name == "float16": - np_dtype = np.dtype("float32") + return np.dtype("float32") elif np_dtype.name == "float128": raise NotImplementedError() elif np_dtype.kind in ("OU"): - np_dtype = np.dtype("object") + return np.dtype("object") except TypeError: pass else: From 550c7ba3a8e3105dd6852031e6fefb0d0b3c4dac Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 11 Aug 2021 14:54:29 -0400 Subject: [PATCH 15/20] More tests --- python/cudf/cudf/tests/test_dtypes.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/python/cudf/cudf/tests/test_dtypes.py b/python/cudf/cudf/tests/test_dtypes.py index 0c3769fab67..a5a9109e13c 100644 --- a/python/cudf/cudf/tests/test_dtypes.py +++ b/python/cudf/cudf/tests/test_dtypes.py @@ -286,6 +286,21 @@ def test_lists_of_structs_dtype(data): ("datetime64[ms]", np.dtype(" Date: Wed, 11 Aug 2021 18:25:26 -0400 Subject: [PATCH 16/20] Resolve circular import issues --- python/cudf/cudf/_lib/scalar.pyx | 11 +++++------ python/cudf/cudf/_lib/string_casting.pyx | 7 ++----- .../cudf/_lib/strings/convert/convert_fixed_point.pyx | 8 ++------ python/cudf/cudf/_lib/table.pyx | 6 +++--- python/cudf/cudf/core/column/categorical.py | 5 +++-- 5 files changed, 15 insertions(+), 22 deletions(-) diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 95fa5d4d20d..fe11d5e2627 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -35,6 +35,7 @@ from cudf._lib.types cimport dtype_from_column_view, underlying_type_t_type_id from cudf._lib.interop import from_arrow, to_arrow +cimport cudf._lib.cpp.types as libcudf_types from cudf._lib.cpp.scalar.scalar cimport ( duration_scalar, fixed_point_scalar, @@ -60,9 +61,7 @@ from cudf._lib.cpp.wrappers.timestamps cimport ( ) from cudf._lib.utils cimport data_from_table_view -from cudf.utils.dtypes import _decimal_to_int64, is_list_dtype, is_struct_dtype - -cimport cudf._lib.cpp.types as libcudf_types +import cudf cdef class DeviceScalar: @@ -120,9 +119,9 @@ cdef class DeviceScalar: def _to_host_scalar(self): if isinstance(self.dtype, cudf.Decimal64Dtype): result = _get_py_decimal_from_fixed_point(self.c_value) - elif is_struct_dtype(self.dtype): + elif cudf.api.types.is_struct_dtype(self.dtype): result = _get_py_dict_from_struct(self.c_value) - elif is_list_dtype(self.dtype): + elif cudf.api.types.is_list_dtype(self.dtype): result = _get_py_list_from_list(self.c_value) elif pd.api.types.is_string_dtype(self.dtype): result = _get_py_string_from_string(self.c_value) @@ -309,7 +308,7 @@ cdef _set_decimal64_from_scalar(unique_ptr[scalar]& s, object value, object dtype, bool valid=True): - value = _decimal_to_int64(value) if valid else 0 + value = cudf.utils.dtypes._decimal_to_int64(value) if valid else 0 s.reset( new fixed_point_scalar[decimal64]( np.int64(value), scale_type(-dtype.scale), valid diff --git a/python/cudf/cudf/_lib/string_casting.pyx b/python/cudf/cudf/_lib/string_casting.pyx index 8d7e307c5fb..25e4149183e 100644 --- a/python/cudf/cudf/_lib/string_casting.pyx +++ b/python/cudf/cudf/_lib/string_casting.pyx @@ -10,10 +10,6 @@ from cudf._lib.scalar cimport DeviceScalar from cudf._lib.types import np_to_cudf_types -from cudf._lib.types cimport underlying_type_t_type_id - -from cudf.core.column.column import as_column - from libcpp.memory cimport unique_ptr from libcpp.string cimport string from libcpp.utility cimport move @@ -55,6 +51,7 @@ from cudf._lib.cpp.strings.convert.convert_urls cimport ( url_encode as cpp_url_encode, ) from cudf._lib.cpp.types cimport data_type, type_id +from cudf._lib.types cimport underlying_type_t_type_id import cudf @@ -590,7 +587,7 @@ def istimestamp( """ if input_col.size == 0: - return as_column([], dtype=kwargs.get('dtype')) + return cudf.core.column.as_column([], dtype=kwargs.get('dtype')) cdef column_view input_column_view = input_col.view() cdef string c_timestamp_format = str(format).encode('UTF-8') cdef unique_ptr[column] c_result diff --git a/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx b/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx index 6eb8984b869..e35ab6489c6 100644 --- a/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx +++ b/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx @@ -6,11 +6,6 @@ from cudf._lib.column cimport Column from cudf._lib.types import np_to_cudf_types -from cudf._lib.cpp.types cimport DECIMAL64 -from cudf._lib.types cimport underlying_type_t_type_id - -from cudf.core.column.column import as_column - from libcpp.memory cimport unique_ptr from libcpp.string cimport string from libcpp.utility cimport move @@ -22,7 +17,8 @@ from cudf._lib.cpp.strings.convert.convert_fixed_point cimport ( is_fixed_point as cpp_is_fixed_point, to_fixed_point as cpp_to_fixed_point, ) -from cudf._lib.cpp.types cimport data_type, type_id +from cudf._lib.cpp.types cimport DECIMAL64, data_type, type_id +from cudf._lib.types cimport underlying_type_t_type_id def from_decimal(Column input_col): diff --git a/python/cudf/cudf/_lib/table.pyx b/python/cudf/cudf/_lib/table.pyx index 09cb05a076d..2981a46a54a 100644 --- a/python/cudf/cudf/_lib/table.pyx +++ b/python/cudf/cudf/_lib/table.pyx @@ -4,8 +4,6 @@ import itertools import numpy as np -from cudf.core.column_accessor import ColumnAccessor - from cython.operator cimport dereference from libc.stdint cimport uintptr_t from libcpp.memory cimport unique_ptr @@ -19,6 +17,8 @@ from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport mutable_table_view, table_view from cudf._lib.cpp.types cimport size_type +import cudf + cdef class Table: def __init__(self, object data=None, object index=None): @@ -34,7 +34,7 @@ cdef class Table: """ if data is None: data = {} - self._data = ColumnAccessor(data) + self._data = cudf.core.column_accessor.ColumnAccessor(data) self._index = index @property diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index f435e0fa88c..a486d70f047 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -22,7 +22,6 @@ import cudf from cudf import _lib as libcudf -from cudf._lib.scalar import as_device_scalar from cudf._lib.transform import bools_to_mask from cudf._typing import ColumnLike, Dtype, ScalarLike from cudf.core.buffer import Buffer @@ -884,7 +883,9 @@ def _fill( return self if inplace else self.copy() fill_code = self._encode(fill_value) - fill_scalar = as_device_scalar(fill_code, self.codes.dtype) + fill_scalar = cudf._lib.scalar.as_device_scalar( + fill_code, self.codes.dtype + ) result = self if inplace else self.copy() From c8925f55a9a797528e60059e63f873b04465dc60 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 12 Aug 2021 10:48:37 -0400 Subject: [PATCH 17/20] Unused import --- python/cudf/cudf/_lib/copying.pyx | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index aa279e84d91..e00ed6bc647 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -787,8 +787,7 @@ cdef class _CPackedColumns: """ Construct a ``PackedColumns`` object from a ``cudf.DataFrame``. """ - import cudf.core.dtypes - from cudf import RangeIndex + import cudf.core.dtype cdef _CPackedColumns p = _CPackedColumns.__new__(_CPackedColumns) From 26df99a62c63539ae5dd99b6173e3c3be22d3b04 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 12 Aug 2021 10:49:00 -0400 Subject: [PATCH 18/20] Space --- python/cudf/cudf/_lib/transform.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx index 67fc1c441b0..9fada59640e 100644 --- a/python/cudf/cudf/_lib/transform.pyx +++ b/python/cudf/cudf/_lib/transform.pyx @@ -60,7 +60,7 @@ def mask_to_bools(object mask_buffer, size_type begin_bit, size_type end_bit): """ if not isinstance(mask_buffer, cudf.core.buffer.Buffer): raise TypeError("mask_buffer is not an instance of " - "cudf.core.buffer. Buffer") + "cudf.core.buffer.Buffer") cdef bitmask_type* bit_mask = (mask_buffer.ptr) cdef unique_ptr[column] result From fec34d919f94af656aa0de814ed30b44083cf2d7 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 12 Aug 2021 10:58:40 -0400 Subject: [PATCH 19/20] Add interval tests --- python/cudf/cudf/tests/test_dtypes.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/cudf/cudf/tests/test_dtypes.py b/python/cudf/cudf/tests/test_dtypes.py index a5a9109e13c..ee6cc7b6df6 100644 --- a/python/cudf/cudf/tests/test_dtypes.py +++ b/python/cudf/cudf/tests/test_dtypes.py @@ -301,6 +301,9 @@ def test_lists_of_structs_dtype(data): pd.array([1], dtype="int16").dtype, np.dtype("int16"), ), + (pd.IntervalDtype("int"), cudf.IntervalDtype("int64")), + (cudf.IntervalDtype("int"), cudf.IntervalDtype("int64")), + (pd.IntervalDtype("int64"), cudf.IntervalDtype("int64")), ], ) def test_dtype(in_dtype, expect): From 5fc19a92e53089a68a5ad4526923b941bd495706 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 12 Aug 2021 13:28:58 -0400 Subject: [PATCH 20/20] :( --- python/cudf/cudf/_lib/copying.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index e00ed6bc647..ed31574b4a5 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -787,7 +787,7 @@ cdef class _CPackedColumns: """ Construct a ``PackedColumns`` object from a ``cudf.DataFrame``. """ - import cudf.core.dtype + import cudf.core.dtypes cdef _CPackedColumns p = _CPackedColumns.__new__(_CPackedColumns)