diff --git a/CHANGELOG.md b/CHANGELOG.md index 44f31e89b9a..fcbe1b739ea 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -76,6 +76,7 @@ - PR #2406 Moved all existing `table` related files to a `legacy/` directory - PR #2350 Performance related changes to get_dummies - PR #2420 Remove `cudautils.astype` and replace with `typecast.apply_cast` +- PR #2456 Small improvement to typecast utility - PR #2458 Fix handling of thirdparty packages in `isort` config ## Bug Fixes diff --git a/python/cudf/cudf/bindings/copying.pyx b/python/cudf/cudf/bindings/copying.pyx index 7b9a9352239..5bb94156688 100644 --- a/python/cudf/cudf/bindings/copying.pyx +++ b/python/cudf/cudf/bindings/copying.pyx @@ -54,10 +54,9 @@ def apply_gather(in_cols, maps, out_cols=None): else: in_size = in_cols[0].data.size - import cudf.bindings.typecast as typecast from cudf.dataframe import columnops - col = typecast.apply_cast(columnops.as_column(maps), dtype=np.int32) - maps = col.data.mem + maps = columnops.as_column(maps).astype("int32") + maps = maps.data.mem # TODO: replace with libcudf pymod when available maps = modulo(maps, in_size) diff --git a/python/cudf/cudf/bindings/sort.pyx b/python/cudf/cudf/bindings/sort.pyx index 8e04f7a0d9c..f58e0a665ab 100644 --- a/python/cudf/cudf/bindings/sort.pyx +++ b/python/cudf/cudf/bindings/sort.pyx @@ -182,12 +182,8 @@ class SegmentedRadixSortPlan(object): # Note: .astype is required below because .copy_to_device # is just a plain memcpy - import cudf.bindings.typecast as typecast from cudf.dataframe import columnops - col = typecast.apply_cast( - columnops.as_column(segments), - dtype=seg_dtype - ) + col = columnops.as_column(segments).astype(seg_dtype) d_begins.copy_to_device(col.data.mem) d_ends[-1:].copy_to_device(np.require([self.nelem], dtype=seg_dtype)) diff --git a/python/cudf/cudf/bindings/typecast.pyx b/python/cudf/cudf/bindings/typecast.pyx index 321c3c31b77..a81a74842eb 100644 --- a/python/cudf/cudf/bindings/typecast.pyx +++ 
b/python/cudf/cudf/bindings/typecast.pyx @@ -12,10 +12,10 @@ from cudf.dataframe.column import Column from libc.stdlib cimport free import numpy as np - +import pandas as pd _time_unit = { - 'none': TIME_UNIT_NONE, + None: TIME_UNIT_NONE, 's': TIME_UNIT_s, 'ms': TIME_UNIT_ms, 'us': TIME_UNIT_us, @@ -23,32 +23,32 @@ _time_unit = { } -def apply_cast(incol, **kwargs): +def apply_cast(incol, dtype="float64", time_unit=None): """ - Cast from incol.dtype to outcol.dtype + Return a Column with values in `incol` cast to `dtype`. + Currently supports numeric and datetime dtypes. """ check_gdf_compatibility(incol) + dtype = pd.api.types.pandas_dtype(dtype).type cdef gdf_column* c_incol = column_view_from_column(incol) - npdtype = kwargs.get("dtype", np.float64) - cdef gdf_dtype dtype = dtypes[npdtype] - cdef uintptr_t category + cdef gdf_dtype c_dtype = dtypes[dtype] + cdef uintptr_t c_category cdef gdf_dtype_extra_info info = gdf_dtype_extra_info( time_unit=TIME_UNIT_NONE, - category=category + category=c_category ) - unit = kwargs.get("time_unit", 'none') - info.time_unit = _time_unit[unit] + info.time_unit = _time_unit[time_unit] cdef gdf_column result with nogil: result = cast( c_incol[0], - dtype, + c_dtype, info ) diff --git a/python/cudf/cudf/dataframe/buffer.py b/python/cudf/cudf/dataframe/buffer.py index 0fa4b6ec93b..32cb1efc221 100644 --- a/python/cudf/cudf/dataframe/buffer.py +++ b/python/cudf/cudf/dataframe/buffer.py @@ -126,14 +126,13 @@ def append(self, element): self.extend(np.asarray(element, dtype=self.dtype)) def extend(self, array): + from cudf.dataframe import columnops + needed = array.size self._sentry_capacity(needed) - import cudf.bindings.typecast as typecast - from cudf.dataframe import columnops - array = typecast.apply_cast( - columnops.as_column(array), dtype=self.dtype.type - ).data.mem + array = columnops.as_column(array).astype(self.dtype).data.mem + self.mem[self.size : self.size + needed].copy_to_device(array) self.size += needed diff 
--git a/python/cudf/cudf/dataframe/datetime.py b/python/cudf/cudf/dataframe/datetime.py index 146304b5020..88b473f93c2 100644 --- a/python/cudf/cudf/dataframe/datetime.py +++ b/python/cudf/cudf/dataframe/datetime.py @@ -123,20 +123,16 @@ def as_numerical(self): return self.view( numerical.NumericalColumn, dtype="int64", - data=typecast.apply_cast(self, dtype=np.int64).data, + data=typecast.apply_cast(self, np.int64).data, ) def as_datetime_column(self, dtype, **kwargs): import cudf.bindings.typecast as typecast - return typecast.apply_cast(self, dtype=np.dtype(dtype).type) + return typecast.apply_cast(self, dtype=dtype) def as_numerical_column(self, dtype, **kwargs): - import cudf.bindings.typecast as typecast - - return typecast.apply_cast( - self.as_numerical, dtype=np.dtype(dtype).type - ) + return self.as_numerical.astype(dtype) def as_string_column(self, dtype, **kwargs): from cudf.dataframe import string diff --git a/python/cudf/cudf/dataframe/numerical.py b/python/cudf/cudf/dataframe/numerical.py index ee48635172a..47abdce38ef 100644 --- a/python/cudf/cudf/dataframe/numerical.py +++ b/python/cudf/cudf/dataframe/numerical.py @@ -142,15 +142,14 @@ def as_datetime_column(self, dtype, **kwargs): return self.view( datetime.DatetimeColumn, dtype=dtype, - data=typecast.apply_cast(self, dtype=np.dtype(dtype).type).data, + data=typecast.apply_cast(self, dtype=dtype).data, ) def as_numerical_column(self, dtype, **kwargs): import cudf.bindings.typecast as typecast return self.replace( - data=typecast.apply_cast(self, dtype=np.dtype(dtype).type).data, - dtype=np.dtype(dtype), + data=typecast.apply_cast(self, dtype).data, dtype=np.dtype(dtype) ) def sort_by_values(self, ascending=True, na_position="last"):