Skip to content

Commit

Permalink
cudf.dtype function (#8949)
Browse files Browse the repository at this point in the history
Closes #8915

Authors:
  - Ashwin Srinath (https://github.com/shwina)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Benjamin Zaitlen (https://github.com/quasiben)

URL: #8949
  • Loading branch information
shwina authored Aug 13, 2021
1 parent e25630a commit 2b92220
Show file tree
Hide file tree
Showing 67 changed files with 491 additions and 404 deletions.
18 changes: 10 additions & 8 deletions python/cudf/cudf/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,42 +8,44 @@

import rmm

from cudf.api.types import dtype
from cudf import core, datasets, testing
from cudf._version import get_versions
from cudf.api.extensions import (
register_dataframe_accessor,
register_index_accessor,
register_series_accessor,
)
from cudf.core import (
from cudf.core.scalar import (
NA,
Scalar,
)
from cudf.core.index import (
BaseIndex,
CategoricalIndex,
DataFrame,
DatetimeIndex,
Float32Index,
Float64Index,
Index,
GenericIndex,
Int8Index,
Int16Index,
Int32Index,
Int64Index,
IntervalIndex,
MultiIndex,
RangeIndex,
StringIndex,
Scalar,
Series,
TimedeltaIndex,
UInt8Index,
UInt16Index,
UInt32Index,
UInt64Index,
cut,
from_pandas,
interval_range,
merge,
)
from cudf.core.dataframe import DataFrame, from_pandas, merge
from cudf.core.series import Series
from cudf.core.multiindex import MultiIndex
from cudf.core.cut import cut
from cudf.core.algorithms import factorize
from cudf.core.dtypes import (
CategoricalDtype,
Expand Down
54 changes: 27 additions & 27 deletions python/cudf/cudf/_fuzz_testing/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,44 +18,44 @@
ALL_POSSIBLE_VALUES = "ALL_POSSIBLE_VALUES"

# Lookup table from dtype objects to the Avro schema type used for them.
# Plain dtype keys map to a bare Avro type name, the pd.*Dtype() keys map to
# a two-element list whose second element is "null", and the "<M8[...]" keys
# map to a dict carrying a "logicalType".
# NOTE(review): "<M8[ns]" is mapped to "timestamp-millis", the same value as
# "<M8[ms]" — confirm this is intentional rather than a copy/paste slip.
# NOTE(review): every np.dtype(...) key below has a matching cudf.dtype(...)
# key; this looks like the removed and added sides of a diff rendered
# together — confirm which set belongs in the file.
_PANDAS_TO_AVRO_SCHEMA_MAP = {
np.dtype("int8"): "int",
cudf.dtype("int8"): "int",
pd.Int8Dtype(): ["int", "null"],
pd.Int16Dtype(): ["int", "null"],
pd.Int32Dtype(): ["int", "null"],
pd.Int64Dtype(): ["long", "null"],
pd.BooleanDtype(): ["boolean", "null"],
pd.StringDtype(): ["string", "null"],
np.dtype("bool_"): "boolean",
np.dtype("int16"): "int",
np.dtype("int32"): "int",
np.dtype("int64"): "long",
np.dtype("O"): "string",
np.dtype("str"): "string",
np.dtype("float32"): "float",
np.dtype("float64"): "double",
np.dtype("<M8[ns]"): {"type": "long", "logicalType": "timestamp-millis"},
np.dtype("<M8[ms]"): {"type": "long", "logicalType": "timestamp-millis"},
np.dtype("<M8[us]"): {"type": "long", "logicalType": "timestamp-micros"},
cudf.dtype("bool_"): "boolean",
cudf.dtype("int16"): "int",
cudf.dtype("int32"): "int",
cudf.dtype("int64"): "long",
cudf.dtype("O"): "string",
cudf.dtype("str"): "string",
cudf.dtype("float32"): "float",
cudf.dtype("float64"): "double",
cudf.dtype("<M8[ns]"): {"type": "long", "logicalType": "timestamp-millis"},
cudf.dtype("<M8[ms]"): {"type": "long", "logicalType": "timestamp-millis"},
cudf.dtype("<M8[us]"): {"type": "long", "logicalType": "timestamp-micros"},
}

# Lookup table from dtype objects to pyorc type instances.
# All three "<M8[...]" resolutions (ns, ms, us) map to pyorc.Timestamp().
# NOTE(review): every np.dtype(...) key below has a matching cudf.dtype(...)
# key; this looks like the removed and added sides of a diff rendered
# together — confirm which set belongs in the file.
PANDAS_TO_ORC_TYPES = {
np.dtype("int8"): pyorc.TinyInt(),
cudf.dtype("int8"): pyorc.TinyInt(),
pd.Int8Dtype(): pyorc.TinyInt(),
pd.Int16Dtype(): pyorc.SmallInt(),
pd.Int32Dtype(): pyorc.Int(),
pd.Int64Dtype(): pyorc.BigInt(),
pd.BooleanDtype(): pyorc.Boolean(),
np.dtype("bool_"): pyorc.Boolean(),
np.dtype("int16"): pyorc.SmallInt(),
np.dtype("int32"): pyorc.Int(),
np.dtype("int64"): pyorc.BigInt(),
np.dtype("O"): pyorc.String(),
cudf.dtype("bool_"): pyorc.Boolean(),
cudf.dtype("int16"): pyorc.SmallInt(),
cudf.dtype("int32"): pyorc.Int(),
cudf.dtype("int64"): pyorc.BigInt(),
cudf.dtype("O"): pyorc.String(),
pd.StringDtype(): pyorc.String(),
np.dtype("float32"): pyorc.Float(),
np.dtype("float64"): pyorc.Double(),
np.dtype("<M8[ns]"): pyorc.Timestamp(),
np.dtype("<M8[ms]"): pyorc.Timestamp(),
np.dtype("<M8[us]"): pyorc.Timestamp(),
cudf.dtype("float32"): pyorc.Float(),
cudf.dtype("float64"): pyorc.Double(),
cudf.dtype("<M8[ns]"): pyorc.Timestamp(),
cudf.dtype("<M8[ms]"): pyorc.Timestamp(),
cudf.dtype("<M8[us]"): pyorc.Timestamp(),
}

ORC_TO_PANDAS_TYPES = {
Expand All @@ -64,10 +64,10 @@
pyorc.Boolean().name: pd.BooleanDtype(),
pyorc.SmallInt().name: pd.Int16Dtype(),
pyorc.BigInt().name: pd.Int64Dtype(),
pyorc.String().name: np.dtype("O"),
pyorc.Float().name: np.dtype("float32"),
pyorc.Double().name: np.dtype("float64"),
pyorc.Timestamp().name: np.dtype("<M8[ns]"),
pyorc.String().name: cudf.dtype("O"),
pyorc.Float().name: cudf.dtype("float32"),
pyorc.Double().name: cudf.dtype("float64"),
pyorc.Timestamp().name: cudf.dtype("<M8[ns]"),
}


Expand Down
1 change: 1 addition & 0 deletions python/cudf/cudf/_lib/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
interop,
join,
json,
labeling,
merge,
null_mask,
nvtext,
Expand Down
6 changes: 4 additions & 2 deletions python/cudf/cudf/_lib/aggregation.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ from cudf._lib.types import Interpolation
cimport cudf._lib.cpp.aggregation as libcudf_aggregation
cimport cudf._lib.cpp.types as libcudf_types

import cudf


class AggregationKind(Enum):
SUM = libcudf_aggregation.aggregation.Kind.SUM
Expand Down Expand Up @@ -277,7 +279,7 @@ cdef class Aggregation:
nb_type = numpy_support.from_dtype(kwargs['dtype'])
type_signature = (nb_type[:],)
compiled_op = cudautils.compile_udf(op, type_signature)
output_np_dtype = np.dtype(compiled_op[1])
output_np_dtype = cudf.dtype(compiled_op[1])
cpp_str = compiled_op[0].encode('UTF-8')
if output_np_dtype not in np_to_cudf_types:
raise TypeError(
Expand Down Expand Up @@ -421,7 +423,7 @@ cdef class RollingAggregation:
nb_type = numpy_support.from_dtype(kwargs['dtype'])
type_signature = (nb_type[:],)
compiled_op = cudautils.compile_udf(op, type_signature)
output_np_dtype = np.dtype(compiled_op[1])
output_np_dtype = cudf.dtype(compiled_op[1])
cpp_str = compiled_op[0].encode('UTF-8')
if output_np_dtype not in np_to_cudf_types:
raise TypeError(
Expand Down
3 changes: 2 additions & 1 deletion python/cudf/cudf/_lib/binaryop.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ from cudf.utils.dtypes import is_scalar, is_string_dtype

cimport cudf._lib.cpp.binaryop as cpp_binaryop
from cudf._lib.cpp.binaryop cimport binary_operator
import cudf


class BinaryOperation(IntEnum):
Expand Down Expand Up @@ -211,7 +212,7 @@ def binaryop_udf(Column lhs, Column rhs, udf_ptx, dtype):
cdef type_id tid = (
<type_id> (
<underlying_type_t_type_id> (
np_to_cudf_types[np.dtype(dtype)]
np_to_cudf_types[cudf.dtype(dtype)]
)
)
)
Expand Down
6 changes: 3 additions & 3 deletions python/cudf/cudf/_lib/copying.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -787,12 +787,12 @@ cdef class _CPackedColumns:
"""
Construct a ``PackedColumns`` object from a ``cudf.DataFrame``.
"""
from cudf.core import RangeIndex, dtypes
import cudf.core.dtypes

cdef _CPackedColumns p = _CPackedColumns.__new__(_CPackedColumns)

if keep_index and (
not isinstance(input_table.index, RangeIndex)
not isinstance(input_table.index, cudf.RangeIndex)
or input_table.index.start != 0
or input_table.index.stop != len(input_table)
or input_table.index.step != 1
Expand All @@ -805,7 +805,7 @@ cdef class _CPackedColumns:
p.column_names = input_table._column_names
p.column_dtypes = {}
for name, col in input_table._data.items():
if isinstance(col.dtype, dtypes._BaseDtype):
if isinstance(col.dtype, cudf.core.dtypes._BaseDtype):
p.column_dtypes[name] = col.dtype

p.c_obj = move(cpp_copying.pack(input_table_view))
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/_lib/orc.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ cpdef read_orc(object filepaths_or_buffers,
if timestamp_type is None else
<type_id>(
<underlying_type_t_type_id> (
np_to_cudf_types[np.dtype(timestamp_type)]
np_to_cudf_types[cudf.dtype(timestamp_type)]
)
)
),
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/_lib/parquet.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,
meta_dtype = cols_dtype_map.get(col, None)
df._data[col] = cudf.core.column.column_empty(
row_count=0,
dtype=np.dtype(meta_dtype)
dtype=cudf.dtype(meta_dtype)
)

# Set the index column
Expand Down
15 changes: 7 additions & 8 deletions python/cudf/cudf/_lib/scalar.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ from cudf._lib.types cimport dtype_from_column_view, underlying_type_t_type_id

from cudf._lib.interop import from_arrow, to_arrow

cimport cudf._lib.cpp.types as libcudf_types
from cudf._lib.cpp.scalar.scalar cimport (
duration_scalar,
fixed_point_scalar,
Expand All @@ -60,9 +61,7 @@ from cudf._lib.cpp.wrappers.timestamps cimport (
)
from cudf._lib.utils cimport data_from_table_view

from cudf.utils.dtypes import _decimal_to_int64, is_list_dtype, is_struct_dtype

cimport cudf._lib.cpp.types as libcudf_types
import cudf


cdef class DeviceScalar:
Expand All @@ -81,7 +80,7 @@ cdef class DeviceScalar:
dtype : dtype
A NumPy dtype.
"""
self._dtype = dtype if dtype.kind != 'U' else np.dtype('object')
self._dtype = dtype if dtype.kind != 'U' else cudf.dtype('object')
self._set_value(value, self._dtype)

def _set_value(self, value, dtype):
Expand Down Expand Up @@ -120,9 +119,9 @@ cdef class DeviceScalar:
def _to_host_scalar(self):
if isinstance(self.dtype, cudf.Decimal64Dtype):
result = _get_py_decimal_from_fixed_point(self.c_value)
elif is_struct_dtype(self.dtype):
elif cudf.api.types.is_struct_dtype(self.dtype):
result = _get_py_dict_from_struct(self.c_value)
elif is_list_dtype(self.dtype):
elif cudf.api.types.is_list_dtype(self.dtype):
result = _get_py_list_from_list(self.c_value)
elif pd.api.types.is_string_dtype(self.dtype):
result = _get_py_string_from_string(self.c_value)
Expand Down Expand Up @@ -309,7 +308,7 @@ cdef _set_decimal64_from_scalar(unique_ptr[scalar]& s,
object value,
object dtype,
bool valid=True):
value = _decimal_to_int64(value) if valid else 0
value = cudf.utils.dtypes._decimal_to_int64(value) if valid else 0
s.reset(
new fixed_point_scalar[decimal64](
<int64_t>np.int64(value), scale_type(-dtype.scale), valid
Expand Down Expand Up @@ -560,7 +559,7 @@ def _is_null_host_scalar(slr):
def _create_proxy_nat_scalar(dtype):
cdef DeviceScalar result = DeviceScalar.__new__(DeviceScalar)

dtype = np.dtype(dtype)
dtype = cudf.dtype(dtype)
if dtype.char in 'mM':
nat = dtype.type('NaT').astype(dtype)
if dtype.type == np.datetime64:
Expand Down
Loading

0 comments on commit 2b92220

Please sign in to comment.