diff --git a/CHANGELOG.md b/CHANGELOG.md index da2a140dc29..b46c27999a2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -71,6 +71,7 @@ - PR #5845 Add support for `mask_to_bools` - PR #5851 Add support for `Index.sort_values` - PR #5859 Add conversion form `fixed_point` to `bool` +- PR #5781 Add duration types support in cudf(python/cython) - PR #5815 LIST Support for ColumnVector ## Improvements diff --git a/cpp/include/cudf/detail/interop.hpp b/cpp/include/cudf/detail/interop.hpp index dd74c1cbf17..5d61724f680 100644 --- a/cpp/include/cudf/detail/interop.hpp +++ b/cpp/include/cudf/detail/interop.hpp @@ -74,6 +74,18 @@ std::shared_ptr to_arrow_array(cudf::type_id id, Ts&&... args) case type_id::TIMESTAMP_NANOSECONDS: return std::make_shared(arrow::timestamp(arrow::TimeUnit::NANO), std::forward(args)...); + case type_id::DURATION_SECONDS: + return std::make_shared(arrow::duration(arrow::TimeUnit::SECOND), + std::forward(args)...); + case type_id::DURATION_MILLISECONDS: + return std::make_shared(arrow::duration(arrow::TimeUnit::MILLI), + std::forward(args)...); + case type_id::DURATION_MICROSECONDS: + return std::make_shared(arrow::duration(arrow::TimeUnit::MICRO), + std::forward(args)...); + case type_id::DURATION_NANOSECONDS: + return std::make_shared(arrow::duration(arrow::TimeUnit::NANO), + std::forward(args)...); default: CUDF_FAIL("Unsupported type_id conversion to arrow"); } } diff --git a/cpp/src/binaryop/jit/code/operation.cpp b/cpp/src/binaryop/jit/code/operation.cpp index e1cc2509c59..5d930afc1c6 100644 --- a/cpp/src/binaryop/jit/code/operation.cpp +++ b/cpp/src/binaryop/jit/code/operation.cpp @@ -143,7 +143,7 @@ const char* operation = struct RTrueDiv { template static TypeOut operate(TypeLhs x, TypeRhs y) { - return (static_cast(y) / static_cast(x)); + return TrueDiv::operate(y, x); } }; @@ -157,7 +157,7 @@ const char* operation = struct RFloorDiv { template static TypeOut operate(TypeLhs x, TypeRhs y) { - return floor(static_cast(y) / static_cast(x)); + return FloorDiv::operate(y, x); } }; @@ -221,25 +221,20 @@ const char* operation = double y1 = static_cast(y); return fmod(fmod(x1, y1) + y1, y1); } - }; - struct RPyMod { template )>* = nullptr> + enable_if_t<(is_duration_v && is_duration_v)>* = nullptr> static TypeOut operate(TypeLhs x, TypeRhs y) { - return ((y % x) + x) % x; + return ((x % y) + y) % y; } + }; - template )>* = nullptr> + struct RPyMod { + template static TypeOut operate(TypeLhs x, TypeRhs y) { - double x1 = static_cast(x); - double y1 = static_cast(y); - return fmod(fmod(y1, x1) + x1, x1); + return PyMod::operate(y, x); } }; @@ -253,7 +248,7 @@ const char* operation = struct RPow { template static TypeOut operate(TypeLhs x, TypeRhs y) { - return pow(static_cast(y), static_cast(x)); + return Pow::operate(y, x); } }; diff --git a/cpp/src/interop/from_arrow.cu b/cpp/src/interop/from_arrow.cu index c051733634b..d4e76fcabe5 100644 --- a/cpp/src/interop/from_arrow.cu +++ b/cpp/src/interop/from_arrow.cu @@ -60,6 +60,16 @@ data_type arrow_to_cudf_type(arrow::DataType const& arrow_type) default: CUDF_FAIL("Unsupported timestamp unit in arrow"); } } + case arrow::Type::DURATION: { + arrow::DurationType const* type = static_cast(&arrow_type); + switch (type->unit()) { + case arrow::TimeUnit::type::SECOND: return data_type(type_id::DURATION_SECONDS); + case arrow::TimeUnit::type::MILLI: return data_type(type_id::DURATION_MILLISECONDS); + case arrow::TimeUnit::type::MICRO: return data_type(type_id::DURATION_MICROSECONDS); + case arrow::TimeUnit::type::NANO: return 
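Note on the operator refactor above: each reflected JIT operator now delegates to its forward counterpart with swapped arguments, and `PyMod` gains a duration overload using the sign-correcting form `((x % y) + y) % y`. A minimal Python sketch of the same identities (helper names are illustrative only; `math.fmod` stands in for the C-style `%`):

```python
import math

def c_mod(x, y):
    # C-style remainder: the result takes the sign of the dividend.
    return math.fmod(x, y)

def py_mod(x, y):
    # Python-style modulo: the result takes the sign of the divisor.
    # Same shape as the duration overload above: ((x % y) + y) % y.
    return c_mod(c_mod(x, y) + y, y)

def r_py_mod(x, y):
    # A reflected op simply delegates with swapped operands,
    # mirroring RPyMod::operate(x, y) -> PyMod::operate(y, x).
    return py_mod(y, x)

assert c_mod(-7, 3) == -1   # dividend-sign semantics
assert py_mod(-7, 3) == 2   # divisor-sign semantics, like Python's %
assert r_py_mod(3, -7) == py_mod(-7, 3)
```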
data_type(type_id::DURATION_NANOSECONDS); + default: CUDF_FAIL("Unsupported duration unit in arrow"); + } + } case arrow::Type::STRING: return data_type(type_id::STRING); case arrow::Type::DICTIONARY: return data_type(type_id::DICTIONARY32); case arrow::Type::LIST: return data_type(type_id::LIST); diff --git a/cpp/tests/interop/from_arrow_test.cpp b/cpp/tests/interop/from_arrow_test.cpp index 4b5c866124d..11e9efe15b4 100644 --- a/cpp/tests/interop/from_arrow_test.cpp +++ b/cpp/tests/interop/from_arrow_test.cpp @@ -53,6 +53,12 @@ std::unique_ptr get_cudf_table() struct FromArrowTest : public cudf::test::BaseFixture { }; +template +struct FromArrowTestDurationsTest : public cudf::test::BaseFixture { +}; + +TYPED_TEST_CASE(FromArrowTestDurationsTest, cudf::test::DurationTypes); + TEST_F(FromArrowTest, EmptyTable) { auto tables = get_tables(0); @@ -89,6 +95,39 @@ TEST_F(FromArrowTest, DateTimeTable) cudf::test::expect_tables_equal(expected_table_view, got_cudf_table->view()); } +TYPED_TEST(FromArrowTestDurationsTest, DurationTable) +{ + using T = TypeParam; + + auto data = {T{1}, T{2}, T{3}, T{4}, T{5}, T{6}}; + auto col = cudf::test::fixed_width_column_wrapper(data); + + std::shared_ptr arr; + cudf::table_view expected_table_view({col}); + arrow::TimeUnit::type arrow_unit; + + switch (cudf::type_to_id()) { + case cudf::type_id::DURATION_SECONDS: arrow_unit = arrow::TimeUnit::type::SECOND; break; + case cudf::type_id::DURATION_MILLISECONDS: arrow_unit = arrow::TimeUnit::type::MILLI; break; + case cudf::type_id::DURATION_MICROSECONDS: arrow_unit = arrow::TimeUnit::type::MICRO; break; + case cudf::type_id::DURATION_NANOSECONDS: arrow_unit = arrow::TimeUnit::type::NANO; break; + case cudf::type_id::DURATION_DAYS: return; + default: CUDF_FAIL("Unsupported duration unit in arrow"); + } + arrow::DurationBuilder duration_builder(duration(arrow_unit), arrow::default_memory_pool()); + duration_builder.AppendValues(std::vector{1, 2, 3, 4, 5, 6}); + CUDF_EXPECTS(duration_builder.Finish(&arr).ok(), "Failed to build array"); + + std::vector> schema_vector({arrow::field("a", arr->type())}); + auto schema = std::make_shared(schema_vector); + + auto arrow_table = arrow::Table::Make(schema, {arr}); + + auto got_cudf_table = cudf::from_arrow(*arrow_table); + + cudf::test::expect_tables_equal(expected_table_view, got_cudf_table->view()); +} + TEST_F(FromArrowTest, NestedList) { auto valids = diff --git a/cpp/tests/interop/to_arrow_test.cpp b/cpp/tests/interop/to_arrow_test.cpp index db008191732..88bc41ff279 100644 --- a/cpp/tests/interop/to_arrow_test.cpp +++ b/cpp/tests/interop/to_arrow_test.cpp @@ -97,6 +97,12 @@ std::pair, std::shared_ptr> get_table struct ToArrowTest : public cudf::test::BaseFixture { }; +template +struct ToArrowTestDurationsTest : public cudf::test::BaseFixture { +}; + +TYPED_TEST_CASE(ToArrowTestDurationsTest, cudf::test::DurationTypes); + TEST_F(ToArrowTest, EmptyTable) { auto tables = get_tables(0); @@ -133,6 +139,39 @@ TEST_F(ToArrowTest, DateTimeTable) ASSERT_EQ(expected_arrow_table->Equals(*got_arrow_table, true), true); } +TYPED_TEST(ToArrowTestDurationsTest, DurationTable) +{ + using T = TypeParam; + + auto data = {T{1}, T{2}, T{3}, T{4}, T{5}, T{6}}; + auto col = cudf::test::fixed_width_column_wrapper(data); + + cudf::table_view input_view({col}); + + std::shared_ptr arr; + arrow::TimeUnit::type arrow_unit; + switch (cudf::type_to_id()) { + case cudf::type_id::DURATION_SECONDS: arrow_unit = arrow::TimeUnit::type::SECOND; break; + case cudf::type_id::DURATION_MILLISECONDS: 
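The switches above map each cudf duration resolution to the matching `arrow::TimeUnit`. A small end-to-end sketch from the Python side, assuming a cudf build that includes this PR:

```python
import pyarrow as pa
import cudf  # assumes a build that includes this PR

arr = pa.array([1, 2, 3, 4, 5, 6], type=pa.duration("ms"))
tbl = pa.table({"a": arr})

# arrow duration(ms) -> cudf DURATION_MILLISECONDS -> timedelta64[ms]
gdf = cudf.DataFrame.from_arrow(tbl)
assert gdf["a"].dtype == "timedelta64[ms]"

# and back again through to_arrow
assert gdf["a"].to_arrow().type == pa.duration("ms")
```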
arrow_unit = arrow::TimeUnit::type::MILLI; break; + case cudf::type_id::DURATION_MICROSECONDS: arrow_unit = arrow::TimeUnit::type::MICRO; break; + case cudf::type_id::DURATION_NANOSECONDS: arrow_unit = arrow::TimeUnit::type::NANO; break; + case cudf::type_id::DURATION_DAYS: return; + default: CUDF_FAIL("Unsupported duration unit in arrow"); + } + arrow::DurationBuilder duration_builder(duration(arrow_unit), arrow::default_memory_pool()); + duration_builder.AppendValues(std::vector{1, 2, 3, 4, 5, 6}); + CUDF_EXPECTS(duration_builder.Finish(&arr).ok(), "Failed to build array"); + + std::vector> schema_vector({arrow::field("a", arr->type())}); + auto schema = std::make_shared(schema_vector); + + auto expected_arrow_table = arrow::Table::Make(schema, {arr}); + + auto got_arrow_table = cudf::to_arrow(input_view, {"a"}); + + ASSERT_EQ(expected_arrow_table->Equals(*got_arrow_table, true), true); +} + TEST_F(ToArrowTest, NestedList) { auto col = cudf::test::lists_column_wrapper({{{1, 2}, {3, 4}, {5}}, {{6}, {7, 8, 9}}}); diff --git a/docs/cudf/source/api.rst b/docs/cudf/source/api.rst index 67e55bb5281..7902a8532d0 100644 --- a/docs/cudf/source/api.rst +++ b/docs/cudf/source/api.rst @@ -169,6 +169,14 @@ DatetimeIndex :inherited-members: :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list +TimedeltaIndex +-------------- +.. currentmodule:: cudf.core.index +.. autoclass:: TimedeltaIndex + :members: + :inherited-members: + :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list + Categories ---------- .. currentmodule:: cudf.core.column.categorical diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index d953f517e4a..c142b5d89bc 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -24,6 +24,7 @@ MultiIndex, RangeIndex, Series, + TimedeltaIndex, UInt8Index, UInt16Index, UInt32Index, @@ -34,6 +35,7 @@ from cudf.core.dtypes import CategoricalDtype from cudf.core.groupby import Grouper from cudf.core.ops import ( + add, arccos, arcsin, arctan, @@ -44,10 +46,13 @@ logical_and, logical_not, logical_or, + multiply, remainder, sin, sqrt, + subtract, tan, + true_divide, ) from cudf.core.reshape import concat, get_dummies, melt, merge_sorted from cudf.core.series import isclose diff --git a/python/cudf/cudf/_lib/cpp/scalar/scalar.pxd b/python/cudf/cudf/_lib/cpp/scalar/scalar.pxd index 6811613c2dd..3eb11c2bfd0 100644 --- a/python/cudf/cudf/_lib/cpp/scalar/scalar.pxd +++ b/python/cudf/cudf/_lib/cpp/scalar/scalar.pxd @@ -35,6 +35,16 @@ cdef extern from "cudf/scalar/scalar.hpp" namespace "cudf" nogil: int32_t ticks_since_epoch_32 "ticks_since_epoch"() except + T value() except + + cdef cppclass duration_scalar[T](scalar): + duration_scalar() except + + duration_scalar(duration_scalar other) except + + duration_scalar(int64_t value) except + + duration_scalar(int64_t value, bool is_valid) except + + duration_scalar(int32_t value) except + + duration_scalar(int32_t value, bool is_valid) except + + int64_t ticks "count"() except + + T value() except + + cdef cppclass string_scalar(scalar): string_scalar() except + string_scalar(string st) except + diff --git a/python/cudf/cudf/_lib/cpp/strings/convert/convert_durations.pxd b/python/cudf/cudf/_lib/cpp/strings/convert/convert_durations.pxd new file mode 100644 index 00000000000..98faebfcaa2 --- /dev/null +++ 
b/python/cudf/cudf/_lib/cpp/strings/convert/convert_durations.pxd @@ -0,0 +1,19 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. + +from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.cpp.types cimport data_type + +from libcpp.memory cimport unique_ptr +from libcpp.string cimport string + +cdef extern from "cudf/strings/convert/convert_durations.hpp" namespace \ + "cudf::strings" nogil: + cdef unique_ptr[column] to_durations( + const column_view & strings_col, + data_type duration_type, + const string & format) except + + + cdef unique_ptr[column] from_durations( + const column_view & durations, + const string & format) except + diff --git a/python/cudf/cudf/_lib/cpp/types.pxd b/python/cudf/cudf/_lib/cpp/types.pxd index 903ab2a4be8..9f8646108ee 100644 --- a/python/cudf/cudf/_lib/cpp/types.pxd +++ b/python/cudf/cudf/_lib/cpp/types.pxd @@ -68,6 +68,10 @@ cdef extern from "cudf/types.hpp" namespace "cudf" nogil: STRING "cudf::type_id::STRING" LIST "cudf::type_id::LIST" NUM_TYPE_IDS "cudf::type_id::NUM_TYPE_IDS" + DURATION_SECONDS "cudf::type_id::DURATION_SECONDS" + DURATION_MILLISECONDS "cudf::type_id::DURATION_MILLISECONDS" + DURATION_MICROSECONDS "cudf::type_id::DURATION_MICROSECONDS" + DURATION_NANOSECONDS "cudf::type_id::DURATION_NANOSECONDS" cdef cppclass data_type: data_type() except + diff --git a/python/cudf/cudf/_lib/cpp/wrappers/durations.pxd b/python/cudf/cudf/_lib/cpp/wrappers/durations.pxd new file mode 100644 index 00000000000..7c648425eb5 --- /dev/null +++ b/python/cudf/cudf/_lib/cpp/wrappers/durations.pxd @@ -0,0 +1,10 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. + +from libc.stdint cimport int64_t + + +cdef extern from "cudf/wrappers/durations.hpp" namespace "cudf" nogil: + ctypedef int64_t duration_s + ctypedef int64_t duration_ms + ctypedef int64_t duration_us + ctypedef int64_t duration_ns diff --git a/python/cudf/cudf/_lib/scalar.pxd b/python/cudf/cudf/_lib/scalar.pxd index e36f74e8d6f..34dfea5431a 100644 --- a/python/cudf/cudf/_lib/scalar.pxd +++ b/python/cudf/cudf/_lib/scalar.pxd @@ -1,6 +1,7 @@ # Copyright (c) 2020, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr +from libcpp cimport bool from cudf._lib.cpp.scalar.scalar cimport scalar @@ -10,3 +11,5 @@ cdef class Scalar: @staticmethod cdef Scalar from_unique_ptr(unique_ptr[scalar] ptr) + + cpdef bool is_valid(Scalar s) diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index e10dde66a82..7757c5a8ad6 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -16,7 +16,9 @@ from libc.stdint cimport ( from libcpp.memory cimport unique_ptr from libcpp cimport bool -from cudf._lib.types import np_to_cudf_types, cudf_to_np_types +import cudf +from cudf._lib.types import cudf_to_np_types, duration_unit_map +from cudf._lib.types import datetime_unit_map from cudf._lib.types cimport underlying_type_t_type_id from cudf._lib.move cimport move @@ -26,10 +28,17 @@ from cudf._lib.cpp.wrappers.timestamps cimport ( timestamp_us, timestamp_ns ) +from cudf._lib.cpp.wrappers.durations cimport( + duration_s, + duration_ms, + duration_us, + duration_ns +) from cudf._lib.cpp.scalar.scalar cimport ( scalar, numeric_scalar, timestamp_scalar, + duration_scalar, string_scalar ) cimport cudf._lib.cpp.types as libcudf_types @@ -51,9 +60,9 @@ cdef class Scalar: dtype : dtype A NumPy dtype. 
""" - from cudf.utils.dtypes import to_cudf_compatible_scalar - value = to_cudf_compatible_scalar(value, dtype=dtype) + value = cudf.utils.dtypes.to_cudf_compatible_scalar(value, dtype=dtype) + valid = value is not None if dtype is None: @@ -74,11 +83,14 @@ cdef class Scalar: _set_datetime64_from_np_scalar( self.c_value, value, dtype, valid ) + elif pd.api.types.is_timedelta64_dtype(dtype): + _set_timedelta64_from_np_scalar( + self.c_value, value, dtype, valid + ) else: raise ValueError( - "Cannot convert value of type {} to cudf scalar".format( - type(value).__name__ - ) + f"Cannot convert value of type " + f"{type(value).__name__} to cudf scalar" ) @property @@ -101,11 +113,19 @@ cdef class Scalar: return _get_np_scalar_from_numeric(self.c_value) elif pd.api.types.is_datetime64_dtype(self.dtype): return _get_np_scalar_from_timestamp64(self.c_value) + elif pd.api.types.is_timedelta64_dtype(self.dtype): + return _get_np_scalar_from_timedelta64(self.c_value) else: raise ValueError( "Could not convert cudf::scalar to a Python value" ) + cpdef bool is_valid(self): + """ + Returns if the Scalar is valid or not(i.e., ). + """ + return self.c_value.get()[0].is_valid() + def __repr__(self): if self.value is None: return f"Scalar({self.value}, {self.dtype.__repr__()})" @@ -128,8 +148,8 @@ cdef _set_string_from_np_string(unique_ptr[scalar]& s, value, bool valid=True): cdef _set_numeric_from_np_scalar(unique_ptr[scalar]& s, - value, - dtype, + object value, + object dtype, bool valid=True): value = value if valid else 0 if dtype == "int8": @@ -155,14 +175,16 @@ cdef _set_numeric_from_np_scalar(unique_ptr[scalar]& s, elif dtype == "bool": s.reset(new numeric_scalar[bool](value, valid)) else: - raise ValueError("dtype not supported: {}".format(dtype)) + raise ValueError(f"dtype not supported: {dtype}") cdef _set_datetime64_from_np_scalar(unique_ptr[scalar]& s, - value, - dtype, + object value, + object dtype, bool valid=True): + value = value if valid else 0 + if dtype == "datetime64[s]": s.reset( new timestamp_scalar[timestamp_s](np.int64(value), valid) @@ -180,8 +202,33 @@ cdef _set_datetime64_from_np_scalar(unique_ptr[scalar]& s, new timestamp_scalar[timestamp_ns](np.int64(value), valid) ) else: - raise ValueError("dtype not supported: {}".format(dtype)) + raise ValueError(f"dtype not supported: {dtype}") + +cdef _set_timedelta64_from_np_scalar(unique_ptr[scalar]& s, + object value, + object dtype, + bool valid=True): + value = value if valid else 0 + + if dtype == "timedelta64[s]": + s.reset( + new duration_scalar[duration_s](np.int64(value), valid) + ) + elif dtype == "timedelta64[ms]": + s.reset( + new duration_scalar[duration_ms](np.int64(value), valid) + ) + elif dtype == "timedelta64[us]": + s.reset( + new duration_scalar[duration_us](np.int64(value), valid) + ) + elif dtype == "timedelta64[ns]": + s.reset( + new duration_scalar[duration_ns](np.int64(value), valid) + ) + else: + raise ValueError(f"dtype not supported: {dtype}") cdef _get_py_string_from_string(unique_ptr[scalar]& s): if not s.get()[0].is_valid(): @@ -236,28 +283,69 @@ cdef _get_np_scalar_from_timestamp64(unique_ptr[scalar]& s): ( s_ptr )[0].ticks_since_epoch_64(), - "s" + datetime_unit_map[(cdtype.id())] ) elif cdtype.id() == libcudf_types.TIMESTAMP_MILLISECONDS: return np.datetime64( ( s_ptr )[0].ticks_since_epoch_64(), - "ms" + datetime_unit_map[(cdtype.id())] ) elif cdtype.id() == libcudf_types.TIMESTAMP_MICROSECONDS: return np.datetime64( ( s_ptr )[0].ticks_since_epoch_64(), - "us" + datetime_unit_map[(cdtype.id())] ) elif 
cdtype.id() == libcudf_types.TIMESTAMP_NANOSECONDS: return np.datetime64( ( s_ptr )[0].ticks_since_epoch_64(), - "ns" + datetime_unit_map[(cdtype.id())] + ) + else: + raise ValueError("Could not convert cudf::scalar to numpy scalar") + + +cdef _get_np_scalar_from_timedelta64(unique_ptr[scalar]& s): + + cdef scalar* s_ptr = s.get() + + if not s_ptr[0].is_valid(): + return None + + cdef libcudf_types.data_type cdtype = s_ptr[0].type() + + if cdtype.id() == libcudf_types.DURATION_SECONDS: + return np.timedelta64( + ( + s_ptr + )[0].ticks(), + duration_unit_map[(cdtype.id())] + ) + elif cdtype.id() == libcudf_types.DURATION_MILLISECONDS: + return np.timedelta64( + ( + s_ptr + )[0].ticks(), + duration_unit_map[(cdtype.id())] + ) + elif cdtype.id() == libcudf_types.DURATION_MICROSECONDS: + return np.timedelta64( + ( + s_ptr + )[0].ticks(), + duration_unit_map[(cdtype.id())] + ) + elif cdtype.id() == libcudf_types.DURATION_NANOSECONDS: + return np.timedelta64( + ( + s_ptr + )[0].ticks(), + duration_unit_map[(cdtype.id())] ) else: raise ValueError("Could not convert cudf::scalar to numpy scalar") @@ -270,4 +358,4 @@ def as_scalar(val, dtype=None): else: return Scalar(val.value, dtype) else: - return Scalar(val, dtype) + return Scalar(value=val, dtype=dtype) diff --git a/python/cudf/cudf/_lib/string_casting.pyx b/python/cudf/cudf/_lib/string_casting.pyx index ef8f3846cd8..4dbb2d99db3 100644 --- a/python/cudf/cudf/_lib/string_casting.pyx +++ b/python/cudf/cudf/_lib/string_casting.pyx @@ -41,6 +41,10 @@ from cudf._lib.cpp.strings.convert.convert_urls cimport ( url_encode as cpp_url_encode, url_decode as cpp_url_decode ) +from cudf._lib.cpp.strings.convert.convert_durations cimport ( + to_durations as cpp_to_durations, + from_durations as cpp_from_durations +) from cudf._lib.cpp.types cimport ( type_id, data_type, @@ -569,6 +573,73 @@ def timestamp2int( return Column.from_unique_ptr(move(c_result)) +def timedelta2int( + Column input_col, + **kwargs): + """ + Converting/Casting input string column to TimeDelta column with specified + format + + Parameters + ---------- + input_col : input column of type string + + Returns + ------- + A Column with string represented in TimeDelta format + + """ + if input_col.size == 0: + return as_column([], dtype=kwargs.get('dtype')) + cdef column_view input_column_view = input_col.view() + cdef type_id tid = ( + ( + np_to_cudf_types[kwargs.get('dtype')] + ) + ) + cdef data_type out_type = data_type(tid) + cdef string c_duration_format = kwargs.get('format').encode('UTF-8') + cdef unique_ptr[column] c_result + with nogil: + c_result = move( + cpp_to_durations( + input_column_view, + out_type, + c_duration_format)) + + return Column.from_unique_ptr(move(c_result)) + + +def int2timedelta( + Column input_col, + **kwargs): + """ + Converting/Casting input Timedelta column to string + column with specified format + + Parameters + ---------- + input_col : input column of type Timedelta in integer format + + Returns + ------- + A Column with Timedelta represented in string format + + """ + + cdef column_view input_column_view = input_col.view() + cdef string c_duration_format = kwargs.get( + 'format', "%D days %H:%M:%S").encode('UTF-8') + cdef unique_ptr[column] c_result + with nogil: + c_result = move( + cpp_from_durations( + input_column_view, + c_duration_format)) + + return Column.from_unique_ptr(move(c_result)) + + def int2ip(Column input_col, **kwargs): """ Converting/Casting integer column to string column in ipv4 format diff --git a/python/cudf/cudf/_lib/types.pyx 
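With `duration_scalar` plumbed through, a device scalar round-trips a numpy `timedelta64` and recovers its unit via `duration_unit_map`. A quick sketch against the internal binding (internal API, so names and null handling may shift):

```python
import numpy as np
from cudf._lib.scalar import as_scalar  # internal binding

s = as_scalar(np.timedelta64(90, "ms"))
assert s.is_valid()
assert s.dtype == np.dtype("timedelta64[ms]")
assert s.value == np.timedelta64(90, "ms")   # ticks + duration_unit_map

# A null duration scalar keeps its dtype but reports invalid / None.
n = as_scalar(None, dtype="timedelta64[ns]")
assert not n.is_valid()
assert n.value is None
```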
b/python/cudf/cudf/_lib/types.pyx index af1abfa1c9e..d054013cc4f 100644 --- a/python/cudf/cudf/_lib/types.pyx +++ b/python/cudf/cudf/_lib/types.pyx @@ -48,6 +48,18 @@ class TypeId(IntEnum): ) STRING = libcudf_types.type_id.STRING BOOL8 = libcudf_types.type_id.BOOL8 + DURATION_SECONDS = ( + libcudf_types.type_id.DURATION_SECONDS + ) + DURATION_MILLISECONDS = ( + libcudf_types.type_id.DURATION_MILLISECONDS + ) + DURATION_MICROSECONDS = ( + libcudf_types.type_id.DURATION_MICROSECONDS + ) + DURATION_NANOSECONDS = ( + libcudf_types.type_id.DURATION_NANOSECONDS + ) np_to_cudf_types = { @@ -67,6 +79,10 @@ np_to_cudf_types = { np.dtype("datetime64[ns]"): TypeId.TIMESTAMP_NANOSECONDS, np.dtype("object"): TypeId.STRING, np.dtype("bool"): TypeId.BOOL8, + np.dtype("timedelta64[s]"): TypeId.DURATION_SECONDS, + np.dtype("timedelta64[ms]"): TypeId.DURATION_MILLISECONDS, + np.dtype("timedelta64[us]"): TypeId.DURATION_MICROSECONDS, + np.dtype("timedelta64[ns]"): TypeId.DURATION_NANOSECONDS, } cudf_to_np_types = { @@ -86,6 +102,24 @@ cudf_to_np_types = { TypeId.TIMESTAMP_NANOSECONDS: np.dtype("datetime64[ns]"), TypeId.STRING: np.dtype("object"), TypeId.BOOL8: np.dtype("bool"), + TypeId.DURATION_SECONDS: np.dtype("timedelta64[s]"), + TypeId.DURATION_MILLISECONDS: np.dtype("timedelta64[ms]"), + TypeId.DURATION_MICROSECONDS: np.dtype("timedelta64[us]"), + TypeId.DURATION_NANOSECONDS: np.dtype("timedelta64[ns]"), +} + +duration_unit_map = { + TypeId.DURATION_SECONDS: "s", + TypeId.DURATION_MILLISECONDS: "ms", + TypeId.DURATION_MICROSECONDS: "us", + TypeId.DURATION_NANOSECONDS: "ns" +} + +datetime_unit_map = { + TypeId.TIMESTAMP_SECONDS: "s", + TypeId.TIMESTAMP_MILLISECONDS: "ms", + TypeId.TIMESTAMP_MICROSECONDS: "us", + TypeId.TIMESTAMP_NANOSECONDS: "ns", } diff --git a/python/cudf/cudf/core/__init__.py b/python/cudf/cudf/core/__init__.py index 8ed237a6a6c..d30f949f72c 100644 --- a/python/cudf/cudf/core/__init__.py +++ b/python/cudf/cudf/core/__init__.py @@ -14,6 +14,7 @@ Int32Index, Int64Index, RangeIndex, + TimedeltaIndex, UInt8Index, UInt16Index, UInt32Index, diff --git a/python/cudf/cudf/core/column/__init__.py b/python/cudf/cudf/core/column/__init__.py index ab38665176d..d4532df8548 100644 --- a/python/cudf/cudf/core/column/__init__.py +++ b/python/cudf/cudf/core/column/__init__.py @@ -17,3 +17,4 @@ from cudf.core.column.lists import ListColumn # noqa: F401 from cudf.core.column.numerical import NumericalColumn # noqa: F401 from cudf.core.column.string import StringColumn # noqa: F401 +from cudf.core.column.timedelta import TimeDeltaColumn # noqa: F401 diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 2739a8193ff..4b4fca2b935 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -87,9 +87,7 @@ def categories(self): """ The categories of this categorical. 
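The new `TypeId` entries and dtype tables above make the four duration resolutions addressable from Python. A sketch of how the maps compose (internal module, shown for illustration):

```python
import numpy as np
from cudf._lib.types import TypeId, cudf_to_np_types, duration_unit_map

tid = TypeId.DURATION_MICROSECONDS
assert cudf_to_np_types[tid] == np.dtype("timedelta64[us]")
assert duration_unit_map[tid] == "us"

# Reconstructing a numpy scalar from raw ticks plus the unit map,
# as _get_np_scalar_from_timedelta64 does:
assert np.timedelta64(42, duration_unit_map[tid]) == np.timedelta64(42, "us")
```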
""" - from cudf.core.index import as_index - - return as_index(self._column.categories) + return cudf.core.index.as_index(self._column.categories) @property def codes(self): @@ -872,6 +870,26 @@ def ordered(self, value): def cat(self, parent=None): return CategoricalAccessor(self, parent=parent) + @classmethod + def from_arrow(cls, array): + codes_dtype = min_unsigned_type(len(array.indices)) + codes = column.as_column(array.indices).astype(codes_dtype) + if isinstance(array.dictionary, pa.NullArray): + categories = column.as_column([], dtype="object") + else: + categories = column.as_column(array.dictionary) + + dtype = CategoricalDtype( + categories=categories, ordered=array.type.ordered + ) + return CategoricalColumn( + dtype=dtype, + mask=codes.base_mask, + children=(codes,), + size=codes.size, + offset=codes.offset, + ) + def unary_operator(self, unaryop): raise TypeError( f"Series of dtype `category` cannot perform the operation: " @@ -897,9 +915,8 @@ def binary_operator(self, op, rhs, reflect=False): return self.as_numerical.binary_operator(op, rhs.as_numerical) def normalize_binop_value(self, other): - from cudf.utils import utils - ary = utils.scalar_broadcast_to( + ary = cudf.utils.utils.scalar_broadcast_to( self._encode(other), size=len(self), dtype=self.codes.dtype ) col = column.build_categorical_column( @@ -1163,6 +1180,11 @@ def as_datetime_column(self, dtype, **kwargs): dtype, **kwargs ) + def as_timedelta_column(self, dtype, **kwargs): + return self._get_decategorized_column().as_timedelta_column( + dtype, **kwargs + ) + def _get_decategorized_column(self): if self.null_count == len(self): # self.categories is empty; just return codes diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 863d3d10cb0..1e451e9308a 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -28,6 +28,7 @@ from cudf.utils import ioutils, utils from cudf.utils.dtypes import ( check_cast_unsupported_dtype, + get_time_unit, is_categorical_dtype, is_list_dtype, is_numerical_dtype, @@ -36,7 +37,7 @@ min_unsigned_type, np_to_pa_dtype, ) -from cudf.utils.utils import buffers_from_pyarrow, mask_dtype +from cudf.utils.utils import mask_dtype class ColumnBase(Column, Serializable): @@ -69,12 +70,10 @@ def __init__( ) def as_frame(self): - from cudf.core.frame import Frame - """ Converts a Column to Frame """ - return Frame({None: self.copy(deep=False)}) + return cudf.core.frame.Frame({None: self.copy(deep=False)}) @property def data_array_view(self): @@ -470,7 +469,6 @@ def element_indexing(self, index): return libcudf.copying.get_element(self, index).value def __getitem__(self, arg): - from cudf.core.column import column if isinstance(arg, Number): arg = int(arg) @@ -513,9 +511,9 @@ def __getitem__(self, arg): ) return self.take(gather_map) else: - arg = column.as_column(arg) + arg = as_column(arg) if len(arg) == 0: - arg = column.as_column([], dtype="int32") + arg = as_column([], dtype="int32") if pd.api.types.is_integer_dtype(arg.dtype): return self.take(arg) if pd.api.types.is_bool_dtype(arg.dtype): @@ -529,7 +527,6 @@ def __setitem__(self, key, value): If value and self are of different types, value is coerced to self.dtype """ - from cudf.core import column if isinstance(key, slice): key_start, key_stop, key_stride = key.indices(len(self)) @@ -554,15 +551,15 @@ def __setitem__(self, key, value): else: nelem = abs(key_stop - key_start) else: - key = column.as_column(key) + key = as_column(key) if 
pd.api.types.is_bool_dtype(key.dtype): if not len(key) == len(self): raise ValueError( "Boolean mask must be of same length as column" ) - key = column.as_column(cupy.arange(len(self)))[key] + key = as_column(cupy.arange(len(self)))[key] if hasattr(value, "__len__") and len(value) == len(self): - value = column.as_column(value)[key] + value = as_column(value)[key] nelem = len(key) if is_scalar(value): @@ -578,7 +575,7 @@ def __setitem__(self, key, value): f"{nelem}" ) raise ValueError(msg) - value = column.as_column(value).astype(self.dtype) + value = as_column(value).astype(self.dtype) if is_categorical_dtype(value.dtype): value = value.cat().set_categories(self.categories) assert self.dtype == value.dtype @@ -687,8 +684,6 @@ def find_last_value(self, value): return indices[-1] def append(self, other): - from cudf.core.column import as_column - return ColumnBase._concat([self, as_column(other)]) def quantile(self, q, interpolation, exact): @@ -807,9 +802,8 @@ def as_mask(self): @ioutils.doc_to_dlpack() def to_dlpack(self): """{docstring}""" - from cudf.io import dlpack as dlpack - return dlpack.to_dlpack(self) + return cudf.io.dlpack.to_dlpack(self) @property def is_unique(self): @@ -882,6 +876,8 @@ def astype(self, dtype, **kwargs): return self.as_categorical_column(dtype, **kwargs) elif np.issubdtype(dtype, np.datetime64): return self.as_datetime_column(dtype, **kwargs) + elif np.issubdtype(dtype, np.timedelta64): + return self.as_timedelta_column(dtype, **kwargs) elif pd.api.types.pandas_dtype(dtype).type in (np.str_, np.object_): return self.as_string_column(dtype, **kwargs) else: @@ -936,6 +932,9 @@ def as_numerical_column(self, dtype, **kwargs): def as_datetime_column(self, dtype, **kwargs): raise NotImplementedError + def as_timedelta_column(self, dtype, **kwargs): + raise NotImplementedError + def as_string_column(self, dtype, **kwargs): raise NotImplementedError @@ -1154,6 +1153,15 @@ def build_column( offset=offset, null_count=null_count, ) + elif dtype.type is np.timedelta64: + return cudf.core.column.TimeDeltaColumn( + data=data, + dtype=dtype, + mask=mask, + size=size, + offset=offset, + null_count=null_count, + ) elif dtype.type in (np.object_, np.str_): return cudf.core.column.StringColumn( mask=mask, @@ -1331,18 +1339,7 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None): elif isinstance(arbitrary, pa.Array): if isinstance(arbitrary, pa.StringArray): - pa_size, pa_offset, nbuf, obuf, sbuf = buffers_from_pyarrow( - arbitrary - ) - children = ( - build_column(data=obuf, dtype="int32"), - build_column(data=sbuf, dtype="int8"), - ) - - data = cudf.core.column.string.StringColumn( - mask=nbuf, children=children, size=pa_size, offset=pa_offset - ) - + data = cudf.core.column.StringColumn.from_arrow(arbitrary) elif isinstance(arbitrary, pa.NullArray): if type(dtype) == str and dtype == "empty": new_dtype = pd.api.types.pandas_dtype( @@ -1369,47 +1366,15 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None): arbitrary = arbitrary.cast(np_to_pa_dtype(new_dtype)) data = as_column(arbitrary, nan_as_null=nan_as_null) elif isinstance(arbitrary, pa.DictionaryArray): - codes_dtype = min_unsigned_type(len(arbitrary.indices)) - codes = as_column(arbitrary.indices).astype(codes_dtype) - if isinstance(arbitrary.dictionary, pa.NullArray): - categories = as_column([], dtype="object") - else: - categories = as_column(arbitrary.dictionary) - - dtype = CategoricalDtype( - categories=categories, ordered=arbitrary.type.ordered - ) - data = 
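With the `astype` dispatch above, integer ticks can be cast into the new `TimeDeltaColumn` and back. A sketch assuming this build (`Series._column` is internal):

```python
import cudf

s = cudf.Series([1_000_000_000, 2_000_000_000])   # int64 ticks
td = s.astype("timedelta64[ns]")                  # -> as_timedelta_column

assert type(td._column).__name__ == "TimeDeltaColumn"
assert td.dtype == "timedelta64[ns]"
assert td.astype("int64").sum() == 3_000_000_000  # round-trips the ticks
```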
cudf.core.column.categorical.CategoricalColumn( - dtype=dtype, - mask=codes.base_mask, - children=(codes,), - size=codes.size, - offset=codes.offset, - ) + data = cudf.core.column.CategoricalColumn.from_arrow(arbitrary) elif isinstance(arbitrary, pa.TimestampArray): - dtype = np.dtype("M8[{}]".format(arbitrary.type.unit)) - pa_size, pa_offset, pamask, padata, _ = buffers_from_pyarrow( - arbitrary, dtype=dtype - ) - - data = cudf.core.column.datetime.DatetimeColumn( - data=padata, - mask=pamask, - dtype=dtype, - size=pa_size, - offset=pa_offset, - ) + data = cudf.core.column.DatetimeColumn.from_arrow(arbitrary) + elif isinstance(arbitrary, pa.DurationArray): + data = cudf.core.column.TimeDeltaColumn.from_arrow(arbitrary) elif isinstance(arbitrary, pa.Date64Array): raise NotImplementedError - pa_size, pa_offset, pamask, padata, _ = buffers_from_pyarrow( - arbitrary, dtype="M8[ms]" - ) - data = cudf.core.column.datetime.DatetimeColumn( - data=padata, - mask=pamask, - dtype=np.dtype("M8[ms]"), - size=pa_size, - offset=pa_offset, + data = cudf.core.column.DatetimeColumn.from_arrow( + arbitrary, dtype=np.dtype("M8[ms]") ) elif isinstance(arbitrary, pa.Date32Array): # No equivalent np dtype and not yet supported @@ -1422,36 +1387,14 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None): elif isinstance(arbitrary, pa.BooleanArray): # Arrow uses 1 bit per value while we use int8 dtype = np.dtype(np.bool) - # Needed because of bug in PyArrow - # https://issues.apache.org/jira/browse/ARROW-4766 - if len(arbitrary) > 0: - arbitrary = arbitrary.cast(pa.int8()) - else: - arbitrary = pa.array([], type=pa.int8()) - - pa_size, pa_offset, pamask, padata, _ = buffers_from_pyarrow( + arbitrary = arbitrary.cast(pa.int8()) + data = cudf.core.column.NumericalColumn.from_arrow( arbitrary, dtype=dtype ) - data = cudf.core.column.NumericalColumn( - data=padata, - mask=pamask, - dtype=dtype, - size=pa_size, - offset=pa_offset, - ) elif isinstance(arbitrary, pa.ListArray): data = cudf.core.column.ListColumn.from_arrow(arbitrary) else: - pa_size, pa_offset, pamask, padata, _ = buffers_from_pyarrow( - arbitrary - ) - data = cudf.core.column.NumericalColumn( - data=padata, - dtype=np.dtype(arbitrary.type.to_pandas_dtype()), - mask=pamask, - size=pa_size, - offset=pa_offset, - ) + data = cudf.core.column.NumericalColumn.from_arrow(arbitrary) elif isinstance(arbitrary, pa.ChunkedArray): gpu_cols = [ @@ -1556,7 +1499,7 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None): if arb_dtype.kind == "M": - time_unit, _ = np.datetime_data(arbitrary.dtype) + time_unit = get_time_unit(arbitrary) cast_dtype = time_unit in ("D", "W", "M", "Y") if cast_dtype: @@ -1574,6 +1517,26 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None): data = cudf.core.column.datetime.DatetimeColumn( data=buffer, mask=mask, dtype=arbitrary.dtype ) + elif arb_dtype.kind == "m": + + time_unit = get_time_unit(arbitrary) + cast_dtype = time_unit in ("D", "W", "M", "Y") + + if cast_dtype: + arbitrary = arbitrary.astype(np.dtype("timedelta64[s]")) + + buffer = Buffer(arbitrary.view("|u1")) + mask = None + if nan_as_null is None or nan_as_null is True: + data = as_column( + buffer, dtype=arbitrary.dtype, nan_as_null=nan_as_null + ) + data = utils.time_col_replace_nulls(data) + mask = data.mask + + data = cudf.core.column.timedelta.TimeDeltaColumn( + data=buffer, mask=mask, dtype=arbitrary.dtype + ) elif arb_dtype.kind in ("O", "U"): data = as_column( pa.Array.from_pandas(arbitrary), dtype=arbitrary.dtype diff 
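`as_column` now recognizes host `timedelta64` arrays: coarse units (D/W/M/Y) are cast to seconds first, and `NaT` values are converted to nulls by default. A sketch assuming this build:

```python
import numpy as np
import cudf

host = np.array([1, 2, np.timedelta64("NaT"), 4], dtype="timedelta64[ms]")
s = cudf.Series(host)
assert s.dtype == np.dtype("timedelta64[ms]")
assert s.null_count == 1                 # NaT became a null

# Coarse units have no libcudf counterpart and are cast to seconds.
days = np.array([1, 2], dtype="timedelta64[D]")
assert cudf.Series(days).dtype == np.dtype("timedelta64[s]")
```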
--git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index a6afa4b887e..9581d1ded0c 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -1,16 +1,18 @@ # Copyright (c) 2019-2020, NVIDIA CORPORATION. import datetime as dt +import re import numpy as np import pandas as pd import pyarrow as pa +import cudf from cudf import _lib as libcudf from cudf._lib.nvtx import annotate -from cudf.core.buffer import Buffer -from cudf.core.column import column -from cudf.utils import utils +from cudf._lib.scalar import Scalar, as_scalar +from cudf.core.column import column, string from cudf.utils.dtypes import is_scalar, np_to_pa_dtype +from cudf.utils.utils import buffers_from_pyarrow # nanoseconds per time_unit _numpy_to_pandas_conversion = { @@ -20,7 +22,7 @@ "s": 1000000000, "m": 60000000000, "h": 3600000000000, - "D": 1000000000 * 86400, + "D": 86400000000000, } _dtype_to_format_conversion = { @@ -59,16 +61,21 @@ def __init__( offset=offset, null_count=null_count, ) - assert self.dtype.type is np.datetime64 + + if not (self.dtype.type is np.datetime64): + raise TypeError(f"{self.dtype} is not a supported datetime type") + self._time_unit, _ = np.datetime_data(self.dtype) def __contains__(self, item): - # Handles improper item types try: item = np.datetime64(item, self._time_unit) - except Exception: + except ValueError: + # If item cannot be converted to datetime type + # np.datetime64 raises ValueError, hence `item` + # cannot exist in `self`. return False - return item.astype("int_") in self.as_numerical + return item.astype("int64") in self.as_numerical @property def time_unit(self): @@ -108,29 +115,35 @@ def get_dt_field(self, field): def normalize_binop_value(self, other): if isinstance(other, dt.datetime): other = np.datetime64(other) + elif isinstance(other, dt.timedelta): + other = np.timedelta64(other) + elif isinstance(other, pd.Timestamp): + other = other.to_datetime64() + elif isinstance(other, pd.Timedelta): + other = other.to_timedelta64() + + if isinstance(other, np.datetime64): + if np.isnat(other): + return as_scalar(val=None, dtype=self.dtype) - if isinstance(other, pd.Timestamp): - m = _numpy_to_pandas_conversion[self.time_unit] - ary = utils.scalar_broadcast_to( - other.value * m, size=len(self), dtype=self.dtype - ) - elif isinstance(other, np.datetime64): other = other.astype(self.dtype) - ary = utils.scalar_broadcast_to( - other, size=len(self), dtype=self.dtype - ) - else: - raise TypeError("cannot broadcast {}".format(type(other))) + return as_scalar(other) + elif isinstance(other, np.timedelta64): + other_time_unit = cudf.utils.dtypes.get_time_unit(other) - return column.build_column( - data=Buffer(ary.data_array_view.view("|u1")), dtype=self.dtype - ) + if other_time_unit not in ("s", "ms", "ns", "us"): + other = other.astype("timedelta64[s]") + + if np.isnat(other): + return as_scalar(val=None, dtype=other.dtype) + + return as_scalar(other) + else: + raise TypeError("cannot normalize {}".format(type(other))) @property def as_numerical(self): - from cudf.core.column import build_column - - return build_column( + return column.build_column( data=self.base_data, dtype=np.int64, mask=self.base_mask, @@ -144,11 +157,15 @@ def as_datetime_column(self, dtype, **kwargs): return self return libcudf.unary.cast(self, dtype=dtype) + def as_timedelta_column(self, dtype, **kwargs): + raise TypeError( + f"cannot astype a datetimelike from [{self.dtype}] to [{dtype}]" + ) + def 
as_numerical_column(self, dtype, **kwargs): return self.as_numerical.astype(dtype) def as_string_column(self, dtype, **kwargs): - from cudf.core.column import string if not kwargs.get("format"): fmt = _dtype_to_format_conversion.get( @@ -163,6 +180,11 @@ def as_string_column(self, dtype, **kwargs): return column.column_empty(0, dtype="object", masked=False) def to_pandas(self, index=None, nullable_pd_dtype=False): + if nullable_pd_dtype: + raise NotImplementedError( + f"nullable_pd_dtype=True is not supported for {self.dtype}" + ) + return pd.Series( self.to_array(fillna="pandas").astype(self.dtype), index=index ) @@ -183,29 +205,44 @@ def to_arrow(self): def default_na_value(self): """Returns the default NA value for this column """ - dkind = self.dtype.kind - if dkind == "M": - return np.datetime64("nat", self.time_unit) - else: - raise TypeError( - "datetime column of {} has no NaN value".format(self.dtype) - ) + return np.datetime64("nat", self.time_unit) def binary_operator(self, op, rhs, reflect=False): lhs, rhs = self, rhs - if op in ("eq", "ne", "lt", "gt", "le", "ge"): out_dtype = np.bool + elif op == "add" and pd.api.types.is_timedelta64_dtype(rhs.dtype): + out_dtype = cudf.core.column.timedelta._timedelta_binary_op_add( + rhs, lhs + ) + elif op == "sub" and pd.api.types.is_timedelta64_dtype(rhs.dtype): + out_dtype = cudf.core.column.timedelta._timedelta_binary_op_sub( + rhs if reflect else lhs, lhs if reflect else rhs + ) + elif op == "sub" and pd.api.types.is_datetime64_dtype(rhs.dtype): + units = ["s", "ms", "us", "ns"] + lhs_time_unit = cudf.utils.dtypes.get_time_unit(lhs) + lhs_unit = units.index(lhs_time_unit) + rhs_time_unit = cudf.utils.dtypes.get_time_unit(rhs) + rhs_unit = units.index(rhs_time_unit) + out_dtype = np.dtype( + f"timedelta64[{units[max(lhs_unit, rhs_unit)]}]" + ) else: raise TypeError( f"Series of dtype {self.dtype} cannot perform " f" the operation {op}" ) + + if reflect: + lhs, rhs = rhs, lhs + return binop(lhs, rhs, op=op, out_dtype=out_dtype) def fillna(self, fill_value): if is_scalar(fill_value): - fill_value = np.datetime64(fill_value, self.time_unit) + if not isinstance(fill_value, Scalar): + fill_value = np.datetime64(fill_value, self.time_unit) else: fill_value = column.as_column(fill_value, nan_as_null=False) @@ -233,6 +270,21 @@ def find_last_value(self, value, closest=False): def is_unique(self): return self.as_numerical.is_unique + @classmethod + def from_arrow(cls, array, dtype=None): + if dtype is None: + dtype = np.dtype("M8[{}]".format(array.type.unit)) + + pa_size, pa_offset, pamask, padata, _ = buffers_from_pyarrow(array) + + return DatetimeColumn( + data=padata, + mask=pamask, + dtype=dtype, + size=pa_size, + offset=pa_offset, + ) + def can_cast_safely(self, to_dtype): if np.issubdtype(to_dtype, np.datetime64): @@ -271,7 +323,6 @@ def infer_format(element, **kwargs): """ Infers datetime format from a string, also takes cares for `ms` and `ns` """ - import re fmt = pd.core.tools.datetimes._guess_datetime_format(element, **kwargs) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index da2197fea21..5b6d19b826b 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -9,7 +9,7 @@ from cudf._lib.nvtx import annotate from cudf._lib.scalar import Scalar from cudf.core.buffer import Buffer -from cudf.core.column import as_column, column +from cudf.core.column import as_column, build_column, column, string from cudf.utils import cudautils, utils 
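The datetime column above now accepts timedelta operands for `add`/`sub`, and `datetime - datetime` resolves to the finer of the two time units. A sketch of the expected dtype algebra, assuming this build:

```python
import numpy as np
import cudf

ts = cudf.Series(np.array([10, 20], dtype="datetime64[ms]"))
td = cudf.Series(np.array([1, 2], dtype="timedelta64[us]"))

# datetime - datetime -> timedelta, in the finer unit of the operands
assert (ts - ts).dtype == np.dtype("timedelta64[ms]")
# datetime +/- timedelta -> datetime, again promoting to the finer unit
assert (ts + td).dtype == np.dtype("datetime64[us]")
assert (ts - td).dtype == np.dtype("datetime64[us]")
```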
from cudf.utils.dtypes import ( cudf_dtypes_to_pandas_dtypes, @@ -18,6 +18,7 @@ np_to_pa_dtype, numeric_normalize_types, ) +from cudf.utils.utils import buffers_from_pyarrow class NumericalColumn(column.ColumnBase): @@ -58,7 +59,7 @@ def __contains__(self, item): item = self.data_array_view.dtype.type(item) else: return False - except Exception: + except (TypeError, ValueError): return False # TODO: Use `scalar`-based `contains` wrapper return libcudf.search.contains( @@ -89,7 +90,7 @@ def binary_operator(self, binop, rhs, reflect=False): (np.isscalar(tmp) and (0 == tmp)) or ((isinstance(tmp, NumericalColumn)) and (0.0 in tmp)) ): - out_dtype = np.dtype("float_") + out_dtype = np.dtype("float64") elif rhs is None: out_dtype = self.dtype else: @@ -135,7 +136,6 @@ def int2ip(self): return libcudf.string_casting.int2ip(self) def as_string_column(self, dtype, **kwargs): - from cudf.core.column import as_column, string if len(self) > 0: return string._numeric_to_str_typecast_functions[ @@ -145,7 +145,16 @@ def as_string_column(self, dtype, **kwargs): return as_column([], dtype="object") def as_datetime_column(self, dtype, **kwargs): - from cudf.core.column import build_column + + return build_column( + data=self.astype("int64").base_data, + dtype=dtype, + mask=self.base_mask, + offset=self.offset, + size=self.size, + ) + + def as_timedelta_column(self, dtype, **kwargs): return build_column( data=self.astype("int64").base_data, @@ -161,6 +170,20 @@ def as_numerical_column(self, dtype, **kwargs): return self return libcudf.unary.cast(self, dtype) + @classmethod + def from_arrow(cls, array, dtype=None): + if dtype is None: + dtype = np.dtype(array.type.to_pandas_dtype()) + + pa_size, pa_offset, pamask, padata, _ = buffers_from_pyarrow(array) + return NumericalColumn( + data=padata, + mask=pamask, + dtype=dtype, + size=pa_size, + offset=pa_offset, + ) + def to_pandas(self, index=None, nullable_pd_dtype=False): if nullable_pd_dtype: arrow_data = self.to_arrow() diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 6939e652895..13235e3eccd 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -145,6 +145,7 @@ is_scalar, is_string_dtype, ) +from cudf.utils.utils import buffers_from_pyarrow _str_to_numeric_typecast_functions = { np.dtype("int8"): str_cast.stoi8, @@ -164,6 +165,10 @@ np.dtype("datetime64[ms]"): str_cast.timestamp2int, np.dtype("datetime64[us]"): str_cast.timestamp2int, np.dtype("datetime64[ns]"): str_cast.timestamp2int, + np.dtype("timedelta64[s]"): str_cast.timedelta2int, + np.dtype("timedelta64[ms]"): str_cast.timedelta2int, + np.dtype("timedelta64[us]"): str_cast.timedelta2int, + np.dtype("timedelta64[ns]"): str_cast.timedelta2int, } _numeric_to_str_typecast_functions = { @@ -184,6 +189,10 @@ np.dtype("datetime64[ms]"): str_cast.int2timestamp, np.dtype("datetime64[us]"): str_cast.int2timestamp, np.dtype("datetime64[ns]"): str_cast.int2timestamp, + np.dtype("timedelta64[s]"): str_cast.int2timedelta, + np.dtype("timedelta64[ms]"): str_cast.int2timedelta, + np.dtype("timedelta64[us]"): str_cast.int2timedelta, + np.dtype("timedelta64[ns]"): str_cast.int2timedelta, } @@ -4503,6 +4512,18 @@ def __len__(self): def _set_mask(self, value): super()._set_mask(value) + @classmethod + def from_arrow(cls, array): + pa_size, pa_offset, nbuf, obuf, sbuf = buffers_from_pyarrow(array) + children = ( + column.build_column(data=obuf, dtype="int32"), + column.build_column(data=sbuf, dtype="int8"), + ) + + return 
StringColumn( + mask=nbuf, children=children, size=pa_size, offset=pa_offset + ) + @property def _nbytes(self): if self.size == 0: @@ -4526,6 +4547,12 @@ def as_numerical_column(self, dtype, **kwargs): if len(self) > 0 and self.binary_operator("eq", "None").any(): raise ValueError("Could not convert `None` value to datetime") + boolean_match = self.binary_operator("eq", "NaT") + elif out_dtype.type is np.timedelta64: + if "format" not in kwargs: + if len(self) > 0: + kwargs.update(format="%D days %H:%M:%S") + boolean_match = self.binary_operator("eq", "NaT") elif out_dtype.kind in {"i", "u"}: if not cpp_is_integer(self).all(): @@ -4543,13 +4570,18 @@ def as_numerical_column(self, dtype, **kwargs): result_col = _str_to_numeric_typecast_functions[out_dtype]( self, **kwargs ) - if (out_dtype.type is np.datetime64) and boolean_match.any(): + if ( + out_dtype.type in (np.datetime64, np.timedelta64) + ) and boolean_match.any(): result_col[boolean_match] = None return result_col def as_datetime_column(self, dtype, **kwargs): return self.as_numerical_column(dtype, **kwargs) + def as_timedelta_column(self, dtype, **kwargs): + return self.as_numerical_column(dtype, **kwargs) + def as_string_column(self, dtype, **kwargs): return self diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py new file mode 100644 index 00000000000..c706f29a25d --- /dev/null +++ b/python/cudf/cudf/core/column/timedelta.py @@ -0,0 +1,621 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. +import datetime as dt +from numbers import Number + +import numpy as np +import pandas as pd +import pyarrow as pa + +import cudf +from cudf import _lib as libcudf +from cudf._lib.nvtx import annotate +from cudf._lib.scalar import Scalar, as_scalar +from cudf.core.column import column, string +from cudf.core.column.datetime import _numpy_to_pandas_conversion +from cudf.utils.dtypes import is_scalar, np_to_pa_dtype +from cudf.utils.utils import buffers_from_pyarrow + +_dtype_to_format_conversion = { + "timedelta64[ns]": "%D days %H:%M:%S", + "timedelta64[us]": "%D days %H:%M:%S", + "timedelta64[ms]": "%D days %H:%M:%S", + "timedelta64[s]": "%D days %H:%M:%S", +} + + +class TimeDeltaColumn(column.ColumnBase): + def __init__( + self, data, dtype, size, mask=None, offset=0, null_count=None + ): + """ + Parameters + ---------- + data : Buffer + The Timedelta values + dtype : np.dtype + The data type + size : int + Size of memory allocation. + mask : Buffer; optional + The validity mask + offset : int + Data offset + null_count : int, optional + The number of null values. + If None, it is calculated automatically. + """ + dtype = np.dtype(dtype) + if data.size % dtype.itemsize: + raise ValueError("Buffer size must be divisible by element size") + + super().__init__( + data, + size=size, + dtype=dtype, + mask=mask, + offset=offset, + null_count=null_count, + ) + + if not (self.dtype.type is np.timedelta64): + raise TypeError(f"{self.dtype} is not a supported duration type") + + self._time_unit, _ = np.datetime_data(self.dtype) + + def __contains__(self, item): + try: + item = np.timedelta64(item, self._time_unit) + except ValueError: + # If item cannot be converted to duration type + # np.timedelta64 raises ValueError, hence `item` + # cannot exist in `self`. + return False + return item.view("int64") in self.as_numerical + + def _repr_str_col(self): + # NOTE: This is for __repr__ purposes only. 
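String columns gain timedelta casts through the libcudf duration converters, defaulting to the `%D days %H:%M:%S` format; `"NaT"` strings become nulls. A sketch assuming this build:

```python
import cudf

s = cudf.Series(["0 days 00:00:01", "1 days 02:03:04", "NaT"])
td = s.astype("timedelta64[s]")

assert td.dtype == "timedelta64[s]"
assert td.null_count == 1   # the "NaT" string became a null

# The reverse cast goes through from_durations with the same default
# format, yielding strings like "1 days 02:03:04" again.
roundtrip = td.astype("str")
```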
+ # Typecasting to "str" yields different output + # when compared to __repr__ output of a duration type + # A Typecast to "str" preserves "Days" field even if + # the number of days in all rows are <= 0, whereas + # in __repr__ output the number of "Days" are truncated + # from the output if the number of days in all the rows + # are <= 0, hence this is an interal API which truncates "Days" + # from the output repr to both match pandas and have + # cleaner output. + + if not ( + self.binary_operator(op="gt", rhs=np.timedelta64(1, "D")) + ).any(): + format = "%H:%M:%S" + else: + format = None + + result = self.as_string_column(dtype="str", format=format) + if self.has_nulls: + result = result.fillna(cudf._NA_REP) + return result + + def to_pandas(self, index=None, nullable_pd_dtype=False): + if nullable_pd_dtype: + raise NotImplementedError( + f"nullable_pd_dtype=True is not supported for {self.dtype}" + ) + + return pd.Series( + self.to_array(fillna="pandas").astype(self.dtype), index=index + ) + + def to_arrow(self): + mask = None + if self.nullable: + mask = pa.py_buffer(self.mask_array_view.copy_to_host()) + data = pa.py_buffer(self.as_numerical.data_array_view.copy_to_host()) + pa_dtype = np_to_pa_dtype(self.dtype) + return pa.Array.from_buffers( + type=pa_dtype, + length=len(self), + buffers=[mask, data], + null_count=self.null_count, + ) + + def _binary_op_floordiv(self, rhs): + lhs, rhs = self, rhs + if pd.api.types.is_timedelta64_dtype(rhs.dtype): + common_dtype = determine_out_dtype(self.dtype, rhs.dtype) + lhs = lhs.astype(common_dtype).astype("float64") + + if isinstance(rhs, Scalar): + if rhs.is_valid(): + if isinstance(rhs, Scalar): + rhs = np.timedelta64(rhs.value) + + rhs = rhs.astype(common_dtype).astype("float64") + else: + rhs = as_scalar(None, "float64") + else: + rhs = rhs.astype(common_dtype).astype("float64") + + out_dtype = np.dtype("int64") + elif rhs.dtype.kind in ("f", "i", "u"): + out_dtype = self.dtype + else: + raise TypeError( + f"Floor Division of {self.dtype} with {rhs.dtype} " + f"cannot be performed." + ) + + return lhs, rhs, out_dtype + + def _binary_op_mul(self, rhs): + if rhs.dtype.kind in ("f", "i", "u"): + out_dtype = self.dtype + else: + raise TypeError( + f"Multiplication of {self.dtype} with {rhs.dtype} " + f"cannot be performed." + ) + return out_dtype + + def _binary_op_mod(self, rhs): + if pd.api.types.is_timedelta64_dtype(rhs.dtype): + out_dtype = determine_out_dtype(self.dtype, rhs.dtype) + elif rhs.dtype.kind in ("f", "i", "u"): + out_dtype = self.dtype + else: + raise TypeError( + f"Modulus of {self.dtype} with {rhs.dtype} " + f"cannot be performed." + ) + return out_dtype + + def _binary_op_eq_ne(self, rhs): + if pd.api.types.is_timedelta64_dtype(rhs.dtype): + out_dtype = np.bool + else: + raise TypeError( + f"Equality of {self.dtype} with {rhs.dtype} " + f"cannot be performed." 
+ ) + return out_dtype + + def _binary_op_lt_gt_le_ge(self, rhs): + if pd.api.types.is_timedelta64_dtype(rhs.dtype): + return np.bool + else: + raise TypeError( + f"Invalid comparison between dtype={self.dtype}" + f" and {rhs.dtype}" + ) + + def _binary_op_truediv(self, rhs): + lhs, rhs = self, rhs + if pd.api.types.is_timedelta64_dtype(rhs.dtype): + common_dtype = determine_out_dtype(self.dtype, rhs.dtype) + lhs = lhs.astype(common_dtype).astype("float64") + + if isinstance(rhs, Scalar): + if rhs.is_valid(): + if isinstance(rhs, Scalar): + rhs = np.timedelta64(rhs.value) + + rhs = rhs.astype(common_dtype).astype("float64") + else: + rhs = as_scalar(None, "float64") + else: + rhs = rhs.astype(common_dtype).astype("float64") + + out_dtype = np.dtype("float64") + elif rhs.dtype.kind in ("f", "i", "u"): + out_dtype = self.dtype + else: + raise TypeError( + f"Division of {self.dtype} with {rhs.dtype} " + f"cannot be performed." + ) + + return lhs, rhs, out_dtype + + def binary_operator(self, op, rhs, reflect=False): + lhs, rhs = self, rhs + + if op in ("eq", "ne"): + out_dtype = self._binary_op_eq_ne(rhs) + elif op in ("lt", "gt", "le", "ge"): + out_dtype = self._binary_op_lt_gt_le_ge(rhs) + elif op == "mul": + out_dtype = self._binary_op_mul(rhs) + elif op == "mod": + out_dtype = self._binary_op_mod(rhs) + elif op == "truediv": + lhs, rhs, out_dtype = self._binary_op_truediv(rhs) + elif op == "floordiv": + lhs, rhs, out_dtype = self._binary_op_floordiv(rhs) + op = "truediv" + elif op == "add": + out_dtype = _timedelta_binary_op_add(lhs, rhs) + elif op == "sub": + out_dtype = _timedelta_binary_op_sub(lhs, rhs) + else: + raise TypeError( + f"Series of dtype {self.dtype} cannot perform " + f"the operation {op}" + ) + + if reflect: + lhs, rhs = rhs, lhs + + return binop(lhs, rhs, op=op, out_dtype=out_dtype) + + def normalize_binop_value(self, other): + if isinstance(other, dt.timedelta): + other = np.timedelta64(other) + elif isinstance(other, pd.Timestamp): + other = other.to_datetime64() + elif isinstance(other, pd.Timedelta): + other = other.to_timedelta64() + + if isinstance(other, np.timedelta64): + other_time_unit = cudf.utils.dtypes.get_time_unit(other) + if np.isnat(other): + return as_scalar(val=None, dtype=self.dtype) + + if other_time_unit not in ("s", "ms", "ns", "us"): + other = other.astype("timedelta64[s]") + else: + common_dtype = determine_out_dtype(self.dtype, other.dtype) + other = other.astype(common_dtype) + return as_scalar(other) + elif np.isscalar(other): + return as_scalar(other) + else: + raise TypeError("cannot normalize {}".format(type(other))) + + @property + def as_numerical(self): + + return column.build_column( + data=self.base_data, + dtype=np.int64, + mask=self.base_mask, + offset=self.offset, + size=self.size, + ) + + @classmethod + def from_arrow(cls, array, dtype=None): + if dtype is None: + dtype = np.dtype("m8[{}]".format(array.type.unit)) + + pa_size, pa_offset, pamask, padata, _ = buffers_from_pyarrow(array) + + return TimeDeltaColumn( + data=padata, + mask=pamask, + dtype=dtype, + size=pa_size, + offset=pa_offset, + ) + + def default_na_value(self): + """Returns the default NA value for this column + """ + return np.timedelta64("nat", self.time_unit) + + @property + def time_unit(self): + return self._time_unit + + def fillna(self, fill_value): + col = self + if is_scalar(fill_value): + if isinstance(fill_value, np.timedelta64): + dtype = determine_out_dtype(self.dtype, fill_value.dtype) + fill_value = fill_value.astype(dtype) + col = col.astype(dtype) + 
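For the `truediv`/`floordiv` paths above, timedelta divided by timedelta is computed in a common dtype and yields a plain number, while timedelta divided by a number rescales the duration. The column rules mirror numpy's dtype algebra:

```python
import numpy as np

a = np.array([10, 20], dtype="timedelta64[ms]")
b = np.array([4, 8], dtype="timedelta64[ms]")

assert (a / b).dtype == np.dtype("float64")           # td / td -> float
assert (a // b).dtype == np.dtype("int64")            # td // td -> int ratio
assert (a / 2).dtype == np.dtype("timedelta64[ms]")   # td / number -> td
```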
elif not isinstance(fill_value, Scalar): + fill_value = np.timedelta64(fill_value) + else: + fill_value = column.as_column(fill_value, nan_as_null=False) + + result = libcudf.replace.replace_nulls(col, fill_value) + + return result + + def as_numerical_column(self, dtype, **kwargs): + return self.as_numerical.astype(dtype) + + def as_datetime_column(self, dtype, **kwargs): + raise TypeError( + f"cannot astype a timedelta from [{self.dtype}] to [{dtype}]" + ) + + def as_string_column(self, dtype, **kwargs): + + if not kwargs.get("format"): + fmt = _dtype_to_format_conversion.get( + self.dtype.name, "%D days %H:%M:%S" + ) + kwargs["format"] = fmt + if len(self) > 0: + return string._numeric_to_str_typecast_functions[ + np.dtype(self.dtype) + ](self, **kwargs) + else: + return column.column_empty(0, dtype="object", masked=False) + + def as_timedelta_column(self, dtype, **kwargs): + dtype = np.dtype(dtype) + if dtype == self.dtype: + return self + return libcudf.unary.cast(self, dtype=dtype) + + def mean(self, dtype=np.float64): + return pd.Timedelta( + self.as_numerical.mean(dtype=dtype), unit=self.time_unit + ) + + def median(self, dtype=np.float64): + return pd.Timedelta( + self.as_numerical.median(dtype=dtype), unit=self.time_unit + ) + + def quantile(self, q, interpolation, exact): + result = self.as_numerical.quantile( + q=q, interpolation=interpolation, exact=exact + ) + if isinstance(q, Number): + return [pd.Timedelta(result[0], unit=self.time_unit)] + return result.astype(self.dtype) + + def sum(self, dtype=None): + if len(self) == 0: + return pd.Timedelta(None, unit=self.time_unit) + else: + return pd.Timedelta( + self.as_numerical.sum(dtype=dtype), unit=self.time_unit + ) + + def components(self, index=None): + """ + Return a Dataframe of the components of the Timedeltas. + + Returns + ------- + DataFrame + + Examples + -------- + >>> s = pd.Series(pd.to_timedelta(np.arange(5), unit='s')) + >>> s = cudf.Series([12231312123, 1231231231, 1123236768712, 2135656, + ... 
3244334234], dtype='timedelta64[ms]') + >>> s + 0 141 days 13:35:12.123 + 1 14 days 06:00:31.231 + 2 13000 days 10:12:48.712 + 3 0 days 00:35:35.656 + 4 37 days 13:12:14.234 + dtype: timedelta64[ms] + >>> s.dt.components + days hours minutes seconds milliseconds microseconds nanoseconds + 0 141 13 35 12 123 0 0 + 1 14 6 0 31 231 0 0 + 2 13000 10 12 48 712 0 0 + 3 0 0 35 35 656 0 0 + 4 37 13 12 14 234 0 0 + """ # noqa: E501 + + return cudf.DataFrame( + data={ + "days": self.binary_operator( + "floordiv", + as_scalar( + np.timedelta64(_numpy_to_pandas_conversion["D"], "ns") + ), + ), + "hours": self.binary_operator( + "mod", + as_scalar( + np.timedelta64(_numpy_to_pandas_conversion["D"], "ns") + ), + ).binary_operator( + "floordiv", + as_scalar( + np.timedelta64(_numpy_to_pandas_conversion["h"], "ns") + ), + ), + "minutes": self.binary_operator( + "mod", + as_scalar( + np.timedelta64(_numpy_to_pandas_conversion["h"], "ns") + ), + ).binary_operator( + "floordiv", + as_scalar( + np.timedelta64(_numpy_to_pandas_conversion["m"], "ns") + ), + ), + "seconds": self.binary_operator( + "mod", + as_scalar( + np.timedelta64(_numpy_to_pandas_conversion["m"], "ns") + ), + ).binary_operator( + "floordiv", + as_scalar( + np.timedelta64(_numpy_to_pandas_conversion["s"], "ns") + ), + ), + "milliseconds": self.binary_operator( + "mod", + as_scalar( + np.timedelta64(_numpy_to_pandas_conversion["s"], "ns") + ), + ).binary_operator( + "floordiv", + as_scalar( + np.timedelta64(_numpy_to_pandas_conversion["ms"], "ns") + ), + ), + "microseconds": self.binary_operator( + "mod", + as_scalar( + np.timedelta64(_numpy_to_pandas_conversion["ms"], "ns") + ), + ).binary_operator( + "floordiv", + as_scalar( + np.timedelta64(_numpy_to_pandas_conversion["us"], "ns") + ), + ), + "nanoseconds": self.binary_operator( + "mod", + as_scalar( + np.timedelta64(_numpy_to_pandas_conversion["us"], "ns") + ), + ).binary_operator( + "floordiv", + as_scalar( + np.timedelta64(_numpy_to_pandas_conversion["ns"], "ns") + ), + ), + }, + index=index, + ) + + @property + def days(self): + """ + Number of days for each element. + + Returns + ------- + NumericalColumn + """ + return self.binary_operator( + "floordiv", + as_scalar(np.timedelta64(_numpy_to_pandas_conversion["D"], "ns")), + ) + + @property + def seconds(self): + """ + Number of seconds (>= 0 and less than 1 day). + + Returns + ------- + NumericalColumn + """ + # This property must return the number of seconds (>= 0 and + # less than 1 day) for each element, hence first performing + # mod operation to remove the number of days and then performing + # division operation to extract the number of seconds. + + return self.binary_operator( + "mod", + as_scalar(np.timedelta64(_numpy_to_pandas_conversion["D"], "ns")), + ).binary_operator( + "floordiv", + as_scalar(np.timedelta64(_numpy_to_pandas_conversion["s"], "ns")), + ) + + @property + def microseconds(self): + """ + Number of microseconds (>= 0 and less than 1 second). + + Returns + ------- + NumericalColumn + """ + # This property must return the number of microseconds (>= 0 and + # less than 1 second) for each element, hence first performing + # mod operation to remove the number of seconds and then performing + # division operation to extract the number of microseconds. 
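+        # Worked example (illustrative): for a value of 1,500,000,123 ns,
+        # `mod 1s` leaves 500,000,123 ns (whole seconds removed), and
+        # `floordiv 1us` then yields 500,000 whole microseconds.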
+
+        return self.binary_operator(
+            "mod",
+            as_scalar(np.timedelta64(_numpy_to_pandas_conversion["s"], "ns")),
+        ).binary_operator(
+            "floordiv",
+            as_scalar(np.timedelta64(_numpy_to_pandas_conversion["us"], "ns")),
+        )
+
+    @property
+    def nanoseconds(self):
+        """
+        Return the number of nanoseconds (n), where 0 <= n < 1 microsecond.
+
+        Returns
+        -------
+        NumericalColumn
+        """
+        # This property must return the number of nanoseconds (>= 0 and
+        # less than 1 microsecond) for each element, hence first performing
+        # mod operation to remove the number of microseconds and then
+        # performing division operation to extract the number
+        # of nanoseconds.
+
+        return self.binary_operator(
+            "mod",
+            as_scalar(np.timedelta64(_numpy_to_pandas_conversion["us"], "ns")),
+        ).binary_operator(
+            "floordiv",
+            as_scalar(np.timedelta64(_numpy_to_pandas_conversion["ns"], "ns")),
+        )
+
+
+@annotate("BINARY_OP", color="orange", domain="cudf_python")
+def binop(lhs, rhs, op, out_dtype):
+    out = libcudf.binaryop.binaryop(lhs, rhs, op, out_dtype)
+    return out
+
+
+def determine_out_dtype(lhs_dtype, rhs_dtype):
+    if np.can_cast(np.dtype(lhs_dtype), np.dtype(rhs_dtype)):
+        return rhs_dtype
+    elif np.can_cast(np.dtype(rhs_dtype), np.dtype(lhs_dtype)):
+        return lhs_dtype
+    else:
+        raise TypeError(f"Cannot type-cast {lhs_dtype} and {rhs_dtype}")
+
+
+def _timedelta_binary_op_add(lhs, rhs):
+    if pd.api.types.is_timedelta64_dtype(rhs.dtype):
+        out_dtype = determine_out_dtype(lhs.dtype, rhs.dtype)
+    elif pd.api.types.is_datetime64_dtype(rhs.dtype):
+        units = ["s", "ms", "us", "ns"]
+        lhs_time_unit = cudf.utils.dtypes.get_time_unit(lhs)
+        lhs_unit = units.index(lhs_time_unit)
+        rhs_time_unit = cudf.utils.dtypes.get_time_unit(rhs)
+        rhs_unit = units.index(rhs_time_unit)
+        out_dtype = np.dtype(f"datetime64[{units[max(lhs_unit, rhs_unit)]}]")
+    else:
+        raise TypeError(
+            f"Addition of {lhs.dtype} with {rhs.dtype} "
+            f"cannot be performed."
+        )
+
+    return out_dtype
+
+
+def _timedelta_binary_op_sub(lhs, rhs):
+    if pd.api.types.is_timedelta64_dtype(
+        lhs.dtype
+    ) and pd.api.types.is_timedelta64_dtype(rhs.dtype):
+        out_dtype = determine_out_dtype(lhs.dtype, rhs.dtype)
+    elif pd.api.types.is_timedelta64_dtype(
+        rhs.dtype
+    ) and pd.api.types.is_datetime64_dtype(lhs.dtype):
+        units = ["s", "ms", "us", "ns"]
+        lhs_time_unit = cudf.utils.dtypes.get_time_unit(lhs)
+        lhs_unit = units.index(lhs_time_unit)
+        rhs_time_unit = cudf.utils.dtypes.get_time_unit(rhs)
+        rhs_unit = units.index(rhs_time_unit)
+        out_dtype = np.dtype(f"datetime64[{units[max(lhs_unit, rhs_unit)]}]")
+    else:
+        raise TypeError(
+            f"Subtraction of {lhs.dtype} with {rhs.dtype} "
+            f"cannot be performed."
+        )
+
+    return out_dtype
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index c0fbb65001e..bbce1b41037 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -1164,10 +1164,12 @@ def _clean_nulls_from_dataframe(self, df):
         filling with `<NA>` values.
""" for col in df._data: - if is_list_dtype(self._data[col]): + if is_list_dtype(df._data[col]): # TODO we need to handle this pass - elif self._data[col].has_nulls: + elif isinstance(df._data[col], cudf.core.column.TimeDeltaColumn): + df[col] = df._data[col]._repr_str_col() + elif df._data[col].has_nulls: df[col] = df._data[col].astype("str").fillna(cudf._NA_REP) else: df[col] = df._data[col] @@ -4673,7 +4675,7 @@ def to_pandas(self, **kwargs): >>> type(pdf) """ - nullable_pd_dtype = kwargs.get("nullable_pd_dtype", True) + nullable_pd_dtype = kwargs.get("nullable_pd_dtype", None) out_data = {} out_index = self.index.to_pandas() diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 3c478b1963e..19d88cc6afe 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -2,10 +2,12 @@ from __future__ import division, print_function import pickle +from numbers import Number import cupy import numpy as np import pandas as pd +from pandas._config import get_option import cudf from cudf._lib.nvtx import annotate @@ -16,6 +18,7 @@ DatetimeColumn, NumericalColumn, StringColumn, + TimeDeltaColumn, column, ) from cudf.core.column.string import StringMethods as StringMethods @@ -48,8 +51,6 @@ def _to_frame(this_index, index=True, name=None): cudf DataFrame """ - from cudf import DataFrame - if name is not None: col_name = name elif this_index.name is None: @@ -57,7 +58,7 @@ def _to_frame(this_index, index=True, name=None): else: col_name = this_index.name - return DataFrame( + return cudf.DataFrame( {col_name: this_index._values}, index=this_index if index else None ) @@ -79,6 +80,14 @@ def __new__( return as_index(data, copy=copy, dtype=dtype, name=name, **kwargs) + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): + + if method == "__call__" and hasattr(cudf, ufunc.__name__): + func = getattr(cudf, ufunc.__name__) + return func(*inputs) + else: + return NotImplemented + def __init__( self, data=None, @@ -503,9 +512,8 @@ def tolist(self): @ioutils.doc_to_dlpack() def to_dlpack(self): """{docstring}""" - from cudf.io import dlpack as dlpack - return dlpack.to_dlpack(self) + return cudf.io.dlpack.to_dlpack(self) @property def gpu_values(self): @@ -1298,6 +1306,8 @@ def _from_table(cls, table): ) elif isinstance(values, DatetimeColumn): out = super(Index, DatetimeIndex).__new__(DatetimeIndex) + elif isinstance(values, TimeDeltaColumn): + out = super(Index, TimedeltaIndex).__new__(TimedeltaIndex) elif isinstance(values, StringColumn): out = super(Index, StringIndex).__new__(StringIndex) elif isinstance(values, CategoricalColumn): @@ -1414,9 +1424,9 @@ def _values(self): @property def _data(self): - from cudf.core.column_accessor import ColumnAccessor - - return ColumnAccessor({self.name: self._values}) + return cudf.core.column_accessor.ColumnAccessor( + {self.name: self._values} + ) def __contains__(self, item): if not isinstance( @@ -1451,8 +1461,6 @@ def __len__(self): return max(0, self._stop - self._start) def __getitem__(self, index): - from numbers import Number - if isinstance(index, slice): start, stop, step = index.indices(len(self)) sln = (stop - start) // step @@ -1717,8 +1725,6 @@ def __len__(self): return len(self._values) def __repr__(self): - from pandas._config import get_option - max_seq_items = get_option("max_seq_items") or len(self) mr = 0 if 2 * max_seq_items < len(self): @@ -1727,9 +1733,8 @@ def __repr__(self): if len(self) > mr and mr != 0: top = self[0:mr] bottom = self[-1 * mr :] - from cudf import concat - 
preprocess = concat([top, bottom]) + preprocess = cudf.concat([top, bottom]) else: preprocess = self @@ -1997,7 +2002,7 @@ def __new__( elif isinstance(data, pd.DatetimeIndex): data = column.as_column(data.values) elif isinstance(data, (list, tuple)): - data = column.as_column(np.array(data, dtype=">> import cudf + >>> cudf.TimedeltaIndex([1132223, 2023232, 342234324, 4234324], + ... dtype='timedelta64[ns]') + TimedeltaIndex(['00:00:00.001132', '00:00:00.002023', '00:00:00.342234', + '00:00:00.004234'], + dtype='timedelta64[ns]') + >>> cudf.TimedeltaIndex([1, 2, 3, 4], dtype='timedelta64[s]', + ... name="delta-index") + TimedeltaIndex(['00:00:01', '00:00:02', '00:00:03', '00:00:04'], + dtype='timedelta64[s]', name='delta-index') + """ + + def __new__( + cls, + data=None, + unit=None, + freq=None, + closed=None, + dtype="timedelta64[ns]", + copy=False, + name=None, + ) -> "TimedeltaIndex": + + out = Frame.__new__(cls) + + if freq is not None: + raise NotImplementedError("freq is not yet supported") + + if unit is not None: + raise NotImplementedError( + "unit is not yet supported, alternatively " + "dtype parameter is supported" + ) + + if copy: + data = column.as_column(data).copy() + kwargs = _setdefault_name(data, name=name) + if isinstance(data, np.ndarray) and data.dtype.kind == "m": + data = column.as_column(data) + elif isinstance(data, pd.TimedeltaIndex): + data = column.as_column(data.values) + elif isinstance(data, (list, tuple)): + data = column.as_column(np.array(data, dtype=dtype)) + out._initialize(data, **kwargs) + return out + + def to_pandas(self): + return pd.TimedeltaIndex( + self._values.to_pandas(nullable_pd_dtype=False), + name=self.name, + unit=self._values.time_unit, + ) + + @property + def days(self): + """ + Number of days for each element. + """ + return as_index(arbitrary=self._values.days, name=self.name) + + @property + def seconds(self): + """ + Number of seconds (>= 0 and less than 1 day) for each element. + """ + return as_index(arbitrary=self._values.seconds, name=self.name) + + @property + def microseconds(self): + """ + Number of microseconds (>= 0 and less than 1 second) for each element. + """ + return as_index(arbitrary=self._values.microseconds, name=self.name) + + @property + def nanoseconds(self): + """ + Number of nanoseconds (>= 0 and less than 1 microsecond) for each + element. + """ + return as_index(arbitrary=self._values.nanoseconds, name=self.name) + + @property + def components(self): + """ + Return a dataframe of the components (days, hours, minutes, + seconds, milliseconds, microseconds, nanoseconds) of the Timedeltas. + """ + return self._values.components() + + @property + def inferred_freq(self): + raise NotImplementedError("inferred_freq is not yet supported") + + def _clean_nulls_from_index(self): + """ + Convert all na values(if any) in Index object + to `` as a preprocessing step to `__repr__` methods. + + This will involve changing type of Index object + to StringIndex but it is the responsibility of the `__repr__` + methods using this method to replace or handle representation + of the actual types correctly. 
+ """ + return cudf.Index( + self._values._repr_str_col().fillna(cudf._NA_REP) + if self._values.has_nulls + else self._values._repr_str_col(), + name=self.name, + ) + + class CategoricalIndex(GenericIndex): """An categorical of orderable values that represent the indices of another Column @@ -2287,7 +2433,6 @@ def as_index(arbitrary, **kwargs): """ kwargs = _setdefault_name(arbitrary, **kwargs) - if isinstance(arbitrary, cudf.MultiIndex): return arbitrary elif isinstance(arbitrary, Index): @@ -2303,6 +2448,8 @@ def as_index(arbitrary, **kwargs): return StringIndex(arbitrary, **kwargs) elif isinstance(arbitrary, DatetimeColumn): return DatetimeIndex(arbitrary, **kwargs) + elif isinstance(arbitrary, TimeDeltaColumn): + return TimedeltaIndex(arbitrary, **kwargs) elif isinstance(arbitrary, CategoricalColumn): return CategoricalIndex(arbitrary, **kwargs) elif isinstance(arbitrary, cudf.Series): diff --git a/python/cudf/cudf/core/ops.py b/python/cudf/cudf/core/ops.py index 7bd4cb3fd10..fe9e012f406 100644 --- a/python/cudf/cudf/core/ops.py +++ b/python/cudf/cudf/core/ops.py @@ -1,8 +1,10 @@ -# Copyright (c) 2019, NVIDIA CORPORATION. +# Copyright (c) 2019-2020, NVIDIA CORPORATION. from numbers import Number import numpy as np +from cudf.core.frame import Frame + """ Global __array_ufunc__ methods """ @@ -94,12 +96,52 @@ def logical_or(lhs, rhs): def remainder(lhs, rhs): if isinstance(lhs, Number) and isinstance(rhs, Number): return np.mod(lhs, rhs) - else: + elif isinstance(lhs, Frame): return getattr(lhs, "remainder")(rhs) + else: + return getattr(rhs, "__rmod__")(lhs) def floor_divide(lhs, rhs): if isinstance(lhs, Number) and isinstance(rhs, Number): return np.floor_divide(lhs, rhs) - else: + elif isinstance(lhs, Frame): return getattr(lhs, "floordiv")(rhs) + else: + return getattr(rhs, "__rfloordiv__")(lhs) + + +def subtract(lhs, rhs): + if isinstance(lhs, Number) and isinstance(rhs, Number): + return np.subtract(lhs, rhs) + elif isinstance(lhs, Frame): + return getattr(lhs, "__sub__")(rhs) + else: + return getattr(rhs, "__rsub__")(lhs) + + +def add(lhs, rhs): + if isinstance(lhs, Number) and isinstance(rhs, Number): + return np.add(lhs, rhs) + elif isinstance(rhs, Frame): + return getattr(rhs, "__radd__")(lhs) + else: + return getattr(lhs, "__add__")(rhs) + + +def true_divide(lhs, rhs): + if isinstance(lhs, Number) and isinstance(rhs, Number): + return np.true_divide(lhs, rhs) + elif isinstance(rhs, Frame): + return getattr(rhs, "__rtruediv__")(lhs) + else: + return getattr(lhs, "__truediv__")(rhs) + + +def multiply(lhs, rhs): + if isinstance(lhs, Number) and isinstance(rhs, Number): + return np.multiply(lhs, rhs) + elif isinstance(rhs, Frame): + return getattr(rhs, "__rmul__")(lhs) + else: + return getattr(lhs, "__mul__")(rhs) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index ceca0177ff6..d80c3d59c50 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -4,6 +4,7 @@ from collections import abc as abc from numbers import Number from shutil import get_terminal_size +from uuid import uuid4 import cupy import numpy as np @@ -19,6 +20,7 @@ from cudf.core.column import ( ColumnBase, DatetimeColumn, + TimeDeltaColumn, as_column, column, column_empty_like, @@ -61,9 +63,7 @@ def _constructor_sliced(self): @property def _constructor_expanddim(self): - from cudf import DataFrame - - return DataFrame + return cudf.DataFrame @classmethod def from_categorical(cls, categorical, codes=None): @@ -71,9 +71,9 @@ def from_categorical(cls, 
categorical, codes=None): If ``codes`` is defined, use it instead of ``categorical.codes`` """ - from cudf.core.column.categorical import pandas_categorical_as_column - - col = pandas_categorical_as_column(categorical, codes=codes) + col = cudf.core.column.categorical.pandas_categorical_as_column( + categorical, codes=codes + ) return Series(data=col) @classmethod @@ -148,8 +148,6 @@ def __init__( if name is None: name = data.name if isinstance(data.index, pd.MultiIndex): - import cudf - index = cudf.from_pandas(data.index) else: index = as_index(data.index) @@ -379,6 +377,8 @@ def dt(self): """ if isinstance(self._column, DatetimeColumn): return DatetimeProperties(self) + elif isinstance(self._column, TimeDeltaColumn): + return TimedeltaProperties(self) else: raise AttributeError( "Can only use .dt accessor with datetimelike values" @@ -678,8 +678,6 @@ def to_frame(self, name=None): cudf DataFrame """ - from cudf import DataFrame - if name is not None: col = name elif self.name is None: @@ -687,7 +685,7 @@ def to_frame(self, name=None): else: col = self.name - return DataFrame({col: self._column}, index=self.index) + return cudf.DataFrame({col: self._column}, index=self.index) def set_mask(self, mask, null_count=None): """Create new Series by setting a mask array. @@ -761,8 +759,6 @@ def __len__(self): return len(self._column) def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): - import cudf - if method == "__call__" and hasattr(cudf, ufunc.__name__): func = getattr(cudf, ufunc.__name__) return func(*inputs) @@ -972,11 +968,13 @@ def __repr__(self): preprocess = cudf.concat([top, bottom]) else: - preprocess = self + preprocess = self.copy() preprocess.index = preprocess.index._clean_nulls_from_index() - - if ( + if isinstance(preprocess._column, cudf.core.column.TimeDeltaColumn): + preprocess._column = preprocess._column._repr_str_col() + output = preprocess.to_pandas().__repr__() + elif ( preprocess.nullable and not isinstance( preprocess._column, cudf.core.column.CategoricalColumn @@ -1045,9 +1043,7 @@ def _binaryop(self, other, fn, fill_value=None, reflect=False): If ``reflect`` is ``True``, swap the order of the operands. """ - from cudf import DataFrame - - if isinstance(other, DataFrame): + if isinstance(other, cudf.DataFrame): # TODO: fn is not the same as arg expected by _apply_op # e.g. 
for fn = 'and', _apply_op equivalent is '__and__' return other._apply_op(self, fn) @@ -1640,10 +1636,8 @@ def dtype(self): def _concat(cls, objs, axis=0, index=True): # Concatenate index if not provided if index is True: - from cudf.core.multiindex import MultiIndex - - if isinstance(objs[0].index, MultiIndex): - index = MultiIndex._concat([o.index for o in objs]) + if isinstance(objs[0].index, cudf.MultiIndex): + index = cudf.MultiIndex._concat([o.index for o in objs]) else: index = Index._concat([o.index for o in objs]) @@ -2006,7 +2000,14 @@ def to_pandas(self, index=True, **kwargs): >>> type(pds) """ - nullable_pd_dtype = kwargs.get("nullable_pd_dtype", True) + nullable_pd_dtype = kwargs.get("nullable_pd_dtype", None) + + if nullable_pd_dtype is None: + if self.dtype.kind in ("i", "u", "b"): + nullable_pd_dtype = True + else: + nullable_pd_dtype = False + if index is True: index = self.index.to_pandas() s = self._column.to_pandas( @@ -2414,7 +2415,6 @@ def label_encoding(self, cats, dtype=None, na_sentinel=-1): 4 -1 dtype: int8 """ - from cudf import DataFrame def _return_sentinel_series(): return Series( @@ -2444,8 +2444,8 @@ def _return_sentinel_series(): order = column.as_column(cupy.arange(len(self))) codes = column.as_column(cupy.arange(len(cats), dtype=dtype)) - value = DataFrame({"value": cats, "code": codes}) - codes = DataFrame( + value = cudf.DataFrame({"value": cats, "code": codes}) + codes = cudf.DataFrame( {"value": self._data.columns[0].copy(deep=False), "order": order} ) @@ -3825,8 +3825,6 @@ def quantile( self._column.quantile(q, interpolation, exact), name=self.name ) else: - from cudf.core.column import column_empty_like - np_array_q = np.asarray(q) if len(self) == 0: result = column_empty_like( @@ -3951,9 +3949,9 @@ def digitize(self, bins, right=False): ------- A new Series containing the indices. """ - from cudf.core.column import numerical - - return Series(numerical.digitize(self._column, bins, right)) + return Series( + cudf.core.column.numerical.digitize(self._column, bins, right) + ) def diff(self, periods=1): """Calculate the difference between values at positions i and i - N in @@ -4033,23 +4031,22 @@ def rolling( @ioutils.doc_to_json() def to_json(self, path_or_buf=None, *args, **kwargs): """{docstring}""" - from cudf.io import json as json - return json.to_json(self, path_or_buf=path_or_buf, *args, **kwargs) + return cudf.io.json.to_json( + self, path_or_buf=path_or_buf, *args, **kwargs + ) @ioutils.doc_to_hdf() def to_hdf(self, path_or_buf, key, *args, **kwargs): """{docstring}""" - from cudf.io import hdf as hdf - hdf.to_hdf(path_or_buf, key, self, *args, **kwargs) + cudf.io.hdf.to_hdf(path_or_buf, key, self, *args, **kwargs) @ioutils.doc_to_dlpack() def to_dlpack(self): """{docstring}""" - from cudf.io import dlpack as dlpack - return dlpack.to_dlpack(self) + return cudf.io.dlpack.to_dlpack(self) def rename(self, index=None, copy=True): """ @@ -4135,7 +4132,6 @@ def _align_to_index( """ Align to the given Index. See _align_indices below. """ - from uuid import uuid4 index = as_index(index) if self.index.equals(index): @@ -4264,6 +4260,55 @@ def keys(self): class DatetimeProperties(object): + """ + Accessor object for datetimelike properties of the Series values. + + Returns + ------- + Returns a Series indexed like the original Series. + + Examples + -------- + >>> import cudf + >>> import pandas as pd + >>> seconds_series = cudf.Series(pd.date_range("2000-01-01", periods=3, + ... 
freq="s")) + >>> seconds_series + 0 2000-01-01 00:00:00 + 1 2000-01-01 00:00:01 + 2 2000-01-01 00:00:02 + dtype: datetime64[ns] + >>> seconds_series.dt.second + 0 0 + 1 1 + 2 2 + dtype: int16 + >>> hours_series = cudf.Series(pd.date_range("2000-01-01", periods=3, + ... freq="h")) + >>> hours_series + 0 2000-01-01 00:00:00 + 1 2000-01-01 01:00:00 + 2 2000-01-01 02:00:00 + dtype: datetime64[ns] + >>> hours_series.dt.hour + 0 0 + 1 1 + 2 2 + dtype: int16 + >>> weekday_series = cudf.Series(pd.date_range("2000-01-01", periods=3, + ... freq="q")) + >>> weekday_series + 0 2000-03-31 + 1 2000-06-30 + 2 2000-09-30 + dtype: datetime64[ns] + >>> weekday_series.dt.weekday + 0 4 + 1 4 + 2 5 + dtype: int16 + """ + def __init__(self, series): self.series = series @@ -4302,6 +4347,244 @@ def _get_dt_field(self, field): ) +class TimedeltaProperties(object): + """ + Accessor object for timedeltalike properties of the Series values. + + Returns + ------- + Returns a Series indexed like the original Series. + + Examples + -------- + >>> import cudf + >>> seconds_series = cudf.Series([1, 2, 3], dtype='timedelta64[s]') + >>> seconds_series + 0 00:00:01 + 1 00:00:02 + 2 00:00:03 + dtype: timedelta64[s] + >>> seconds_series.dt.seconds + 0 1 + 1 2 + 2 3 + dtype: int64 + >>> series = cudf.Series([12231312123, 1231231231, 1123236768712, 2135656, + ... 3244334234], dtype='timedelta64[ms]') + >>> series + 0 141 days 13:35:12.123 + 1 14 days 06:00:31.231 + 2 13000 days 10:12:48.712 + 3 0 days 00:35:35.656 + 4 37 days 13:12:14.234 + dtype: timedelta64[ms] + >>> series.dt.components + days hours minutes seconds milliseconds microseconds nanoseconds + 0 141 13 35 12 123 0 0 + 1 14 6 0 31 231 0 0 + 2 13000 10 12 48 712 0 0 + 3 0 0 35 35 656 0 0 + 4 37 13 12 14 234 0 0 + >>> series.dt.days + 0 141 + 1 14 + 2 13000 + 3 0 + 4 37 + dtype: int64 + >>> series.dt.seconds + 0 48912 + 1 21631 + 2 36768 + 3 2135 + 4 47534 + dtype: int64 + >>> series.dt.microseconds + 0 123000 + 1 231000 + 2 712000 + 3 656000 + 4 234000 + dtype: int64 + >>> s.dt.nanoseconds + 0 0 + 1 0 + 2 0 + 3 0 + 4 0 + dtype: int64 + """ + + def __init__(self, series): + self.series = series + + @property + def days(self): + """ + Number of days. + + Returns + ------- + Series + + Examples + -------- + >>> import cudf + >>> s = cudf.Series([12231312123, 1231231231, 1123236768712, 2135656, + ... 3244334234], dtype='timedelta64[ms]') + >>> s + 0 141 days 13:35:12.123 + 1 14 days 06:00:31.231 + 2 13000 days 10:12:48.712 + 3 0 days 00:35:35.656 + 4 37 days 13:12:14.234 + dtype: timedelta64[ms] + >>> s.dt.days + 0 141 + 1 14 + 2 13000 + 3 0 + 4 37 + dtype: int64 + """ + return self._get_td_field("days") + + @property + def seconds(self): + """ + Number of seconds (>= 0 and less than 1 day). + + Returns + ------- + Series + + Examples + -------- + >>> import cudf + >>> s = cudf.Series([12231312123, 1231231231, 1123236768712, 2135656, + ... 3244334234], dtype='timedelta64[ms]') + >>> s + 0 141 days 13:35:12.123 + 1 14 days 06:00:31.231 + 2 13000 days 10:12:48.712 + 3 0 days 00:35:35.656 + 4 37 days 13:12:14.234 + dtype: timedelta64[ms] + >>> s.dt.seconds + 0 48912 + 1 21631 + 2 36768 + 3 2135 + 4 47534 + dtype: int64 + >>> s.dt.microseconds + 0 123000 + 1 231000 + 2 712000 + 3 656000 + 4 234000 + dtype: int64 + """ + return self._get_td_field("seconds") + + @property + def microseconds(self): + """ + Number of microseconds (>= 0 and less than 1 second). 
+ + Returns + ------- + Series + + Examples + -------- + >>> import cudf + >>> s = cudf.Series([12231312123, 1231231231, 1123236768712, 2135656, + ... 3244334234], dtype='timedelta64[ms]') + >>> s + 0 141 days 13:35:12.123 + 1 14 days 06:00:31.231 + 2 13000 days 10:12:48.712 + 3 0 days 00:35:35.656 + 4 37 days 13:12:14.234 + dtype: timedelta64[ms] + >>> s.dt.microseconds + 0 123000 + 1 231000 + 2 712000 + 3 656000 + 4 234000 + dtype: int64 + """ + return self._get_td_field("microseconds") + + @property + def nanoseconds(self): + """ + Return the number of nanoseconds (n), where 0 <= n < 1 microsecond. + + Returns + ------- + Series + + Examples + -------- + >>> import cudf + >>> s = cudf.Series([12231312123, 1231231231, 1123236768712, 2135656, + ... 3244334234], dtype='timedelta64[ns]') + >>> s + 0 00:00:12.231312123 + 1 00:00:01.231231231 + 2 00:18:43.236768712 + 3 00:00:00.002135656 + 4 00:00:03.244334234 + dtype: timedelta64[ns] + >>> s.dt.nanoseconds + 0 123 + 1 231 + 2 712 + 3 656 + 4 234 + dtype: int64 + """ + return self._get_td_field("nanoseconds") + + @property + def components(self): + """ + Return a Dataframe of the components of the Timedeltas. + + Returns + ------- + DataFrame + + Examples + -------- + >>> s = cudf.Series([12231312123, 1231231231, 1123236768712, 2135656, 3244334234], dtype='timedelta64[ms]') + >>> s + 0 141 days 13:35:12.123 + 1 14 days 06:00:31.231 + 2 13000 days 10:12:48.712 + 3 0 days 00:35:35.656 + 4 37 days 13:12:14.234 + dtype: timedelta64[ms] + >>> s.dt.components + days hours minutes seconds milliseconds microseconds nanoseconds + 0 141 13 35 12 123 0 0 + 1 14 6 0 31 231 0 0 + 2 13000 10 12 48 712 0 0 + 3 0 0 35 35 656 0 0 + 4 37 13 12 14 234 0 0 + """ # noqa: E501 + return self.series._column.components(index=self.series._index) + + def _get_td_field(self, field): + out_column = getattr(self.series._column, field) + return Series( + data=out_column, index=self.series._index, name=self.series.name + ) + + def _align_indices(series_list, how="outer", allow_non_unique=False): """ Internal util to align the indices of a list of Series objects diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 53a120d409f..fb34c3c2f49 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -259,6 +259,10 @@ def to_datetime( def _process_col(col, unit, dayfirst, infer_datetime_format, format): if col.dtype.kind == "M": return col + elif col.dtype.kind == "m": + raise TypeError( + f"dtype {col.dtype} cannot be converted to {_unit_dtype_map[unit]}" + ) if col.dtype.kind in ("f"): if unit not in (None, "ns"): diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index 47f37506edb..18b05dba439 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -728,17 +728,36 @@ def test_vector_to_none_binops(dtype): utils.assert_eq(expect, got) -@pytest.mark.parametrize("lhs", [pd.Series([0, 10, 20, 30, 3, 4, 5, 6, 2])]) +@pytest.mark.parametrize( + "lhs", + [ + 1, + 3, + 4, + pd.Series([5, 6, 2]), + pd.Series([0, 10, 20, 30, 3, 4, 5, 6, 2]), + 6, + ], +) @pytest.mark.parametrize("rhs", [1, 3, 4, pd.Series([5, 6, 2])]) @pytest.mark.parametrize( "ops", - [(np.remainder, cudf.remainder), (np.floor_divide, cudf.floor_divide)], + [ + (np.remainder, cudf.remainder), + (np.floor_divide, cudf.floor_divide), + (np.subtract, cudf.subtract), + (np.add, cudf.add), + (np.true_divide, cudf.true_divide), + (np.multiply, 
cudf.multiply), + ], ) def test_ufunc_ops(lhs, rhs, ops): np_op, cu_op = ops if isinstance(lhs, pd.Series): culhs = cudf.from_pandas(lhs) + else: + culhs = lhs if isinstance(rhs, pd.Series): curhs = cudf.from_pandas(rhs) @@ -747,6 +766,11 @@ def test_ufunc_ops(lhs, rhs, ops): expect = np_op(lhs, rhs) got = cu_op(culhs, curhs) - utils.assert_eq( - expect.fillna(got._column.default_na_value()), got, check_dtype=False - ) + if np.isscalar(expect): + assert got == expect + else: + utils.assert_eq( + expect.fillna(got._column.default_na_value()), + got, + check_dtype=False, + ) diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py index eb13ca8f8cd..ab356f8f094 100644 --- a/python/cudf/cudf/tests/test_column.py +++ b/python/cudf/cudf/tests/test_column.py @@ -6,15 +6,22 @@ import pytest import cudf +from cudf._lib.transform import mask_to_bools from cudf.core.column.column import as_column from cudf.tests.utils import assert_eq from cudf.utils import dtypes as dtypeutils -from cudf._lib.transform import mask_to_bools dtypes = sorted( list( dtypeutils.ALL_TYPES - - {"datetime64[s]", "datetime64[ms]", "datetime64[us]"} + - { + "datetime64[s]", + "datetime64[ms]", + "datetime64[us]", + "timedelta64[s]", + "timedelta64[ms]", + "timedelta64[us]", + } ) ) @@ -32,7 +39,7 @@ def str_host_view(list_of_str, to_dtype): @pytest.mark.parametrize("offset", [0, 1, 15]) -@pytest.mark.parametrize("size", [None, 50, 10, 0]) +@pytest.mark.parametrize("size", [50, 10, 0]) def test_column_offset_and_size(pandas_input, offset, size): col = cudf.core.column.as_column(pandas_input) col = cudf.core.column.build_column( diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 3d4c56d8f88..7d6e8f74789 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -1,4 +1,5 @@ # Copyright (c) 2019-2020, NVIDIA CORPORATION. 
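+# `datetime` is imported under its own name alongside the existing
+# `dt` alias because the new tests below construct objects such as
+# datetime.timedelta(days=768) directly.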
+import datetime import datetime as dt import re @@ -11,7 +12,7 @@ import cudf from cudf.core import DataFrame, Series from cudf.core.index import DatetimeIndex -from cudf.tests.utils import NUMERIC_TYPES, assert_eq +from cudf.tests.utils import DATETIME_TYPES, NUMERIC_TYPES, assert_eq def data1(): @@ -605,6 +606,7 @@ def test_cudf_to_datetime(data, dayfirst, infer_datetime_format): "2", ["1", "2", "3"], ["1/1/1", "2/2/2", "1"], + pd.Series([1, 2, 3], dtype="timedelta64[ns]"), pd.DataFrame( { "year": [2015, 2016], @@ -809,3 +811,188 @@ def test_str_null_to_datetime(): gsr.astype("datetime64[s]") else: raise AssertionError("Expected psr.astype('datetime64[s]') to fail") + + +@pytest.mark.parametrize( + "data", + [ + [1, 2, 3, 4, 10, 100, 20000], + [None] * 7, + [10, 20, 30, None, 100, 200, None], + [3223.234, 342.2332, 23423.23, 3343.23324, 23432.2323, 242.23, 233], + ], +) +@pytest.mark.parametrize( + "other", + [ + [1, 2, 3, 4, 10, 100, 20000], + [None] * 7, + [10, 20, 30, None, 100, 200, None], + [3223.234, 342.2332, 23423.23, 3343.23324, 23432.2323, 242.23, 233], + np.datetime64("2005-02"), + np.datetime64("2005-02-25"), + np.datetime64("2005-02-25T03:30"), + np.datetime64("nat"), + ], +) +@pytest.mark.parametrize("data_dtype", DATETIME_TYPES) +@pytest.mark.parametrize("other_dtype", DATETIME_TYPES) +def test_datetime_subtract(data, other, data_dtype, other_dtype): + + gsr = cudf.Series(data, dtype=data_dtype) + psr = gsr.to_pandas() + + if isinstance(other, np.datetime64): + gsr_other = other + psr_other = other + else: + gsr_other = cudf.Series(other, dtype=other_dtype) + psr_other = gsr_other.to_pandas() + + expected = psr - psr_other + actual = gsr - gsr_other + + assert_eq(expected, actual) + + expected = psr_other - psr + actual = gsr_other - gsr + + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "data", + [ + [1000000, 200000, 3000000], + [1000000, 200000, None], + [], + [None], + [None, None, None, None, None], + [12, 12, 22, 343, 4353534, 435342], + np.array([10, 20, 30, None, 100]), + cp.asarray([10, 20, 30, 100]), + [1000000, 200000, 3000000], + [1000000, 200000, None], + [1], + [12, 11, 232, 223432411, 2343241, 234324, 23234], + [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], + [1.321, 1132.324, 23223231.11, 233.41, 0.2434, 332, 323], + [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], + ], +) +@pytest.mark.parametrize( + "other_scalars", + [ + datetime.timedelta(days=768), + datetime.timedelta(seconds=768), + datetime.timedelta(microseconds=7), + datetime.timedelta(minutes=447), + datetime.timedelta(hours=447), + datetime.timedelta(weeks=734), + np.timedelta64(4, "s"), + np.timedelta64(456, "D"), + np.timedelta64(46, "h"), + np.timedelta64("nat"), + np.timedelta64(1, "s"), + np.timedelta64(1, "ms"), + np.timedelta64(1, "us"), + np.timedelta64(1, "ns"), + ], +) +@pytest.mark.parametrize("dtype", DATETIME_TYPES) +@pytest.mark.parametrize( + "op", ["add", "sub"], +) +def test_datetime_series_ops_with_scalars(data, other_scalars, dtype, op): + gsr = cudf.Series(data=data, dtype=dtype) + psr = gsr.to_pandas() + + if op == "add": + expected = psr + other_scalars + actual = gsr + other_scalars + elif op == "sub": + expected = psr - other_scalars + actual = gsr - other_scalars + + assert_eq(expected, actual) + + if op == "add": + expected = other_scalars + psr + actual = other_scalars + gsr + + assert_eq(expected, actual) + + elif op == "sub": + with pytest.raises(TypeError): + expected = other_scalars - psr + with pytest.raises(TypeError): + actual = 
other_scalars - gsr + + +def test_datetime_invalid_ops(): + sr = cudf.Series([1, 2, 3], dtype="datetime64[ns]") + psr = sr.to_pandas() + + try: + psr + pd.Timestamp(1513393355.5, unit="s") + except TypeError: + with pytest.raises(TypeError): + sr + pd.Timestamp(1513393355.5, unit="s") + else: + raise AssertionError("Expected psr + pd.Timestamp to fail") + + try: + psr / pd.Timestamp(1513393355.5, unit="s") + except TypeError: + with pytest.raises(TypeError): + sr / pd.Timestamp(1513393355.5, unit="s") + else: + raise AssertionError("Expected psr / pd.Timestamp to fail") + + try: + psr + psr + except TypeError: + with pytest.raises(TypeError): + sr + sr + else: + raise AssertionError("Expected psr + psr to fail") + + try: + psr // psr + except TypeError: + with pytest.raises(TypeError): + sr // sr + else: + raise AssertionError("Expected psr // psr to fail") + + try: + psr // pd.Timestamp(1513393355.5, unit="s") + except TypeError: + with pytest.raises(TypeError): + sr // pd.Timestamp(1513393355.5, unit="s") + else: + raise AssertionError("Expected psr // pd.Timestamp to fail") + + try: + psr + 1 + except TypeError: + with pytest.raises(TypeError): + sr + 1 + else: + raise AssertionError("Expected psr + 1 to fail") + + try: + psr / "a" + except TypeError: + with pytest.raises(TypeError): + sr / "a" + else: + raise AssertionError("Expected psr / 'a' to fail") + + try: + psr * 1 + except TypeError: + with pytest.raises(TypeError): + sr * 1 + else: + raise AssertionError("Expected psr * 1 to fail") diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 61647c157d3..7288b751867 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -559,8 +559,7 @@ def test_index_argsort(data): pd.Index([102, 1001, 1002, 0.0, 23], dtype="datetime64[ns]"), pd.Index([13240.2, 1001, 100.2, 0.0, 23], dtype="datetime64[ns]"), pd.RangeIndex(0, 10, 1), - # pd.Index([-10.2, 100.1, -100.2, 0.0, 0.23], dtype='timedelta64[ns]') - # TODO: Enable in timedelta PR. + pd.Index([-10.2, 100.1, -100.2, 0.0, 23], dtype="timedelta64[ns]"), ], ) @pytest.mark.parametrize("ascending", [True, False]) diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index 230d292cbff..3aa476c1ffe 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -1,4 +1,7 @@ # Copyright (c) 2019-2020, NVIDIA CORPORATION. +import textwrap + +import cupy as cp import numpy as np import pandas as pd import pytest @@ -560,3 +563,537 @@ def test_series_null_index_repr(sr, pandas_special_case): # to be printed as `None` everywhere. 
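+    # The token-wise comparison below is deliberately
+    # whitespace-insensitive, so only the rendered values matter.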
actual_repr = gsr.__repr__().replace("None", "") assert expected_repr.split() == actual_repr.split() + + +@pytest.mark.parametrize( + "data", + [ + [1000000, 200000, 3000000], + [1000000, 200000, None], + [], + [None], + [None, None, None, None, None], + [12, 12, 22, 343, 4353534, 435342], + np.array([10, 20, 30, None, 100]), + cp.asarray([10, 20, 30, 100]), + [1000000, 200000, 3000000], + [1000000, 200000, None], + [1], + [12, 11, 232, 223432411, 2343241, 234324, 23234], + [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], + [1.321, 1132.324, 23223231.11, 233.41, 0.2434, 332, 323], + [ + 136457654, + 134736784, + 245345345, + 223432411, + 2343241, + 3634548734, + 23234, + ], + [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], + ], +) +@pytest.mark.parametrize("dtype", ["timedelta64[s]", "timedelta64[us]"]) +def test_timedelta_series_s_us_repr(data, dtype): + sr = cudf.Series(data, dtype=dtype) + psr = sr.to_pandas() + + expected = ( + psr.__repr__().replace("timedelta64[ns]", dtype).replace("NaT", "") + ) + actual = sr.__repr__() + + assert expected.split() == actual.split() + + +@pytest.mark.parametrize( + "ser, expected_repr", + [ + ( + cudf.Series([], dtype="timedelta64[ns]"), + textwrap.dedent( + """ + Series([], dtype: timedelta64[ns]) + """ + ), + ), + ( + cudf.Series([], dtype="timedelta64[ms]"), + textwrap.dedent( + """ + Series([], dtype: timedelta64[ms]) + """ + ), + ), + ( + cudf.Series([1000000, 200000, 3000000], dtype="timedelta64[ns]"), + textwrap.dedent( + """ + 0 00:00:00.001000000 + 1 00:00:00.000200000 + 2 00:00:00.003000000 + dtype: timedelta64[ns] + """ + ), + ), + ( + cudf.Series([1000000, 200000, 3000000], dtype="timedelta64[ms]"), + textwrap.dedent( + """ + 0 00:16:40 + 1 00:03:20 + 2 00:50:00 + dtype: timedelta64[ms] + """ + ), + ), + ( + cudf.Series([1000000, 200000, None], dtype="timedelta64[ns]"), + textwrap.dedent( + """ + 0 00:00:00.001000000 + 1 00:00:00.000200000 + 2 + dtype: timedelta64[ns] + """ + ), + ), + ( + cudf.Series([1000000, 200000, None], dtype="timedelta64[ms]"), + textwrap.dedent( + """ + 0 00:16:40 + 1 00:03:20 + 2 + dtype: timedelta64[ms] + """ + ), + ), + ( + cudf.Series( + [None, None, None, None, None], dtype="timedelta64[ns]" + ), + textwrap.dedent( + """ + 0 + 1 + 2 + 3 + 4 + dtype: timedelta64[ns] + """ + ), + ), + ( + cudf.Series( + [None, None, None, None, None], dtype="timedelta64[ms]" + ), + textwrap.dedent( + """ + 0 + 1 + 2 + 3 + 4 + dtype: timedelta64[ms] + """ + ), + ), + ( + cudf.Series( + [12, 12, 22, 343, 4353534, 435342], dtype="timedelta64[ns]" + ), + textwrap.dedent( + """ + 0 00:00:00.000000012 + 1 00:00:00.000000012 + 2 00:00:00.000000022 + 3 00:00:00.000000343 + 4 00:00:00.004353534 + 5 00:00:00.000435342 + dtype: timedelta64[ns] + """ + ), + ), + ( + cudf.Series( + [12, 12, 22, 343, 4353534, 435342], dtype="timedelta64[ms]" + ), + textwrap.dedent( + """ + 0 00:00:00.012 + 1 00:00:00.012 + 2 00:00:00.022 + 3 00:00:00.343 + 4 01:12:33.534 + 5 00:07:15.342 + dtype: timedelta64[ms] + """ + ), + ), + ( + cudf.Series( + [1.321, 1132.324, 23223231.11, 233.41, 0.2434, 332, 323], + dtype="timedelta64[ns]", + ), + textwrap.dedent( + """ + 0 00:00:00.000000001 + 1 00:00:00.000001132 + 2 00:00:00.023223231 + 3 00:00:00.000000233 + 4 00:00:00 + 5 00:00:00.000000332 + 6 00:00:00.000000323 + dtype: timedelta64[ns] + """ + ), + ), + ( + cudf.Series( + [1.321, 1132.324, 23223231.11, 233.41, 0.2434, 332, 323], + dtype="timedelta64[ms]", + ), + textwrap.dedent( + """ + 0 00:00:00.001 + 1 00:00:01.132 + 2 06:27:03.231 + 3 
00:00:00.233 + 4 00:00:00 + 5 00:00:00.332 + 6 00:00:00.323 + dtype: timedelta64[ms] + """ + ), + ), + ( + cudf.Series( + [ + 13645765432432, + 134736784, + 245345345, + 223432411, + 999992343241, + 3634548734, + 23234, + ], + dtype="timedelta64[ms]", + ), + textwrap.dedent( + """ + 0 157937 days 02:23:52.432 + 1 1 days 13:25:36.784 + 2 2 days 20:09:05.345 + 3 2 days 14:03:52.411 + 4 11573 days 23:39:03.241 + 5 42 days 01:35:48.734 + 6 0 days 00:00:23.234 + dtype: timedelta64[ms] + """ + ), + ), + ( + cudf.Series( + [ + 13645765432432, + 134736784, + 245345345, + 223432411, + 999992343241, + 3634548734, + 23234, + ], + dtype="timedelta64[ns]", + ), + textwrap.dedent( + """ + 0 03:47:25.765432432 + 1 00:00:00.134736784 + 2 00:00:00.245345345 + 3 00:00:00.223432411 + 4 00:16:39.992343241 + 5 00:00:03.634548734 + 6 00:00:00.000023234 + dtype: timedelta64[ns] + """ + ), + ), + ( + cudf.Series( + [ + 13645765432432, + 134736784, + 245345345, + 223432411, + 999992343241, + 3634548734, + 23234, + ], + dtype="timedelta64[ms]", + name="abc", + ), + textwrap.dedent( + """ + 0 157937 days 02:23:52.432 + 1 1 days 13:25:36.784 + 2 2 days 20:09:05.345 + 3 2 days 14:03:52.411 + 4 11573 days 23:39:03.241 + 5 42 days 01:35:48.734 + 6 0 days 00:00:23.234 + Name: abc, dtype: timedelta64[ms] + """ + ), + ), + ( + cudf.Series( + [ + 13645765432432, + 134736784, + 245345345, + 223432411, + 999992343241, + 3634548734, + 23234, + ], + dtype="timedelta64[ns]", + index=["a", "b", "z", "x", "y", "l", "m"], + name="hello", + ), + textwrap.dedent( + """ + a 03:47:25.765432432 + b 00:00:00.134736784 + z 00:00:00.245345345 + x 00:00:00.223432411 + y 00:16:39.992343241 + l 00:00:03.634548734 + m 00:00:00.000023234 + Name: hello, dtype: timedelta64[ns] + """ + ), + ), + ], +) +def test_timedelta_series_ns_ms_repr(ser, expected_repr): + expected = expected_repr + actual = ser.__repr__() + + assert expected.split() == actual.split() + + +@pytest.mark.parametrize( + "df,expected_repr", + [ + ( + cudf.DataFrame( + { + "a": cudf.Series( + [1000000, 200000, 3000000], dtype="timedelta64[s]" + ) + } + ), + textwrap.dedent( + """ + a + 0 11 days 13:46:40 + 1 2 days 07:33:20 + 2 34 days 17:20:00 + """ + ), + ), + ( + cudf.DataFrame( + { + "a": cudf.Series( + [ + 136457654, + None, + 245345345, + 223432411, + None, + 3634548734, + 23234, + ], + dtype="timedelta64[s]", + ), + "b": [10, 11, 22, 33, 44, 55, 66], + } + ), + textwrap.dedent( + """ + a b + 0 1579 days 08:54:14 10 + 1 11 + 2 2839 days 15:29:05 22 + 3 2586 days 00:33:31 33 + 4 44 + 5 42066 days 12:52:14 55 + 6 0 days 06:27:14 66 + """ + ), + ), + ( + cudf.DataFrame( + { + "a": cudf.Series( + [ + 136457654, + None, + 245345345, + 223432411, + None, + 3634548734, + 23234, + ], + dtype="timedelta64[s]", + index=["a", "b", "c", "d", "e", "f", "g"], + ) + } + ), + textwrap.dedent( + """ + a + a 1579 days 08:54:14 + b + c 2839 days 15:29:05 + d 2586 days 00:33:31 + e + f 42066 days 12:52:14 + g 0 days 06:27:14 + """ + ), + ), + ( + cudf.DataFrame( + { + "a": cudf.Series( + [1, 2, 3, 4, 5, 6, 7], + index=cudf.Index( + [ + 136457654, + None, + 245345345, + 223432411, + None, + 3634548734, + 23234, + ], + dtype="timedelta64[ms]", + ), + ) + } + ), + textwrap.dedent( + """ + a + 1 days 13:54:17.654 1 + 2 + 2 days 20:09:05.345 3 + 2 days 14:03:52.411 4 + 5 + 42 days 01:35:48.734 6 + 0 days 00:00:23.234 7 + """ + ), + ), + ( + cudf.DataFrame( + { + "a": cudf.Series( + ["a", "f", "q", "e", "w", "e", "t"], + index=cudf.Index( + [ + 136457654, + None, + 245345345, + 223432411, + None, + 
3634548734, + 23234, + ], + dtype="timedelta64[ns]", + ), + ) + } + ), + textwrap.dedent( + """ + a + 00:00:00.136457654 a + f + 00:00:00.245345345 q + 00:00:00.223432411 e + w + 00:00:03.634548734 e + 00:00:00.000023234 t + """ + ), + ), + ], +) +def test_timedelta_dataframe_repr(df, expected_repr): + actual_repr = df.__repr__() + + assert actual_repr.split() == expected_repr.split() + + +@pytest.mark.parametrize( + "index, expected_repr", + [ + ( + cudf.Index([1000000, 200000, 3000000], dtype="timedelta64[ms]"), + "TimedeltaIndex(['00:16:40', '00:03:20', '00:50:00'], " + "dtype='timedelta64[ms]')", + ), + ( + cudf.Index( + [None, None, None, None, None], dtype="timedelta64[us]" + ), + "TimedeltaIndex([, , , , ], " + "dtype='timedelta64[us]')", + ), + ( + cudf.Index( + [ + 136457654, + None, + 245345345, + 223432411, + None, + 3634548734, + 23234, + ], + dtype="timedelta64[us]", + ), + "TimedeltaIndex([00:02:16.457654, , 00:04:05.345345, " + "00:03:43.432411, ," + " 01:00:34.548734, 00:00:00.023234]," + " dtype='timedelta64[us]')", + ), + ( + cudf.Index( + [ + 136457654, + None, + 245345345, + 223432411, + None, + 3634548734, + 23234, + ], + dtype="timedelta64[s]", + ), + "TimedeltaIndex([1579 days 08:54:14, , 2839 days 15:29:05," + " 2586 days 00:33:31, , 42066 days 12:52:14, " + "0 days 06:27:14]," + " dtype='timedelta64[s]')", + ), + ], +) +def test_timedelta_index_repr(index, expected_repr): + actual_repr = index.__repr__() + + assert actual_repr.split() == expected_repr.split() diff --git a/python/cudf/cudf/tests/test_scalar.py b/python/cudf/cudf/tests/test_scalar.py index dda910b9b33..c8fb5e40d23 100644 --- a/python/cudf/cudf/tests/test_scalar.py +++ b/python/cudf/cudf/tests/test_scalar.py @@ -1,8 +1,12 @@ +import datetime +import datetime as dt + import numpy as np +import pandas as pd import pytest from cudf._lib.scalar import Scalar -from cudf.tests.utils import DATETIME_TYPES, NUMERIC_TYPES +from cudf.tests.utils import DATETIME_TYPES, NUMERIC_TYPES, TIMEDELTA_TYPES @pytest.mark.parametrize( @@ -85,11 +89,58 @@ ) def test_round_trip_scalar(value): s = Scalar(value) + np.testing.assert_equal(s.value, value) + assert s.is_valid() is True -@pytest.mark.parametrize("dtype", NUMERIC_TYPES + DATETIME_TYPES + ["object"]) +@pytest.mark.parametrize( + "dtype", NUMERIC_TYPES + DATETIME_TYPES + TIMEDELTA_TYPES + ["object"] +) def test_null_scalar(dtype): s = Scalar(None, dtype=dtype) assert s.value is None assert s.dtype == np.dtype(dtype) + assert s.is_valid() is False + + +@pytest.mark.parametrize( + "dtype", NUMERIC_TYPES + DATETIME_TYPES + TIMEDELTA_TYPES + ["object"] +) +def test_valid_scalar(dtype): + s = Scalar(1, dtype=dtype) + + assert s.dtype == np.dtype(dtype) + assert s.is_valid() is True + + +@pytest.mark.parametrize( + "value", + [ + datetime.timedelta(seconds=76), + datetime.timedelta(microseconds=7), + datetime.timedelta(minutes=47), + datetime.timedelta(hours=4427), + datetime.timedelta(weeks=7134), + pd.Timestamp(15133.5, unit="s"), + pd.Timestamp(15133.5, unit="D"), + pd.Timedelta(1513393355.5, unit="s"), + pd.Timedelta(34765, unit="D"), + ], +) +def test_date_duration_scalars(value): + s = Scalar(value) + + actual = s.value + + if isinstance(value, dt.datetime): + expected = np.datetime64(value) + elif isinstance(value, dt.timedelta): + expected = np.timedelta64(value) + elif isinstance(value, pd.Timestamp): + expected = value.to_datetime64() + elif isinstance(value, pd.Timedelta): + expected = value.to_timedelta64() + + np.testing.assert_equal(actual, expected) + 
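+    # A scalar built from a concrete (non-NaT) host value must also
+    # round-trip its validity flag.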
assert s.is_valid() is True diff --git a/python/cudf/cudf/tests/test_timedelta.py b/python/cudf/cudf/tests/test_timedelta.py new file mode 100644 index 00000000000..2f58d3e4b19 --- /dev/null +++ b/python/cudf/cudf/tests/test_timedelta.py @@ -0,0 +1,1222 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. +import datetime +import re + +import cupy as cp +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.tests.utils import assert_eq +from cudf.utils import dtypes as dtypeutils + +_TIMEDELTA_DATA = [ + [1000000, 200000, 3000000], + [1000000, 200000, None], + [], + [None], + [None, None, None, None, None], + [12, 12, 22, 343, 4353534, 435342], + np.array([10, 20, 30, None, 100]), + cp.asarray([10, 20, 30, 100]), + [1000000, 200000, 3000000], + [1000000, 200000, None], + [1], + [12, 11, 232, 223432411, 2343241, 234324, 23234], + [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], + [1.321, 1132.324, 23223231.11, 233.41, 0.2434, 332, 323], + [ + 136457654736252, + 134736784364431, + 245345345545332, + 223432411, + 2343241, + 3634548734, + 23234, + ], + [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], +] + +_TIMEDELTA_DATA_NON_OVERFLOW = [ + [1000000, 200000, 3000000], + [1000000, 200000, None], + [], + [None], + [None, None, None, None, None], + [12, 12, 22, 343, 4353534, 435342], + np.array([10, 20, 30, None, 100]), + cp.asarray([10, 20, 30, 100]), + [1000000, 200000, 3000000], + [1000000, 200000, None], + [1], + [12, 11, 232, 223432411, 2343241, 234324, 23234], + [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], + [1.321, 1132.324, 23223231.11, 233.41, 0.2434, 332, 323], + [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], +] + + +@pytest.mark.parametrize( + "data", + [ + [1000000, 200000, 3000000], + [1000000, 200000, None], + [], + [None], + [None, None, None, None, None], + [12, 12, 22, 343, 4353534, 435342], + [0.3534, 12, 22, 343, 43.53534, 4353.42], + np.array([10, 20, 30, None, 100]), + cp.asarray([10, 20, 30, 100]), + ], +) +@pytest.mark.parametrize("dtype", dtypeutils.TIMEDELTA_TYPES) +def test_timedelta_series_create(data, dtype): + if dtype not in ("timedelta64[ns]"): + pytest.skip( + "Bug in pandas" "https://github.com/pandas-dev/pandas/issues/35465" + ) + psr = pd.Series( + cp.asnumpy(data) if isinstance(data, cp.ndarray) else data, dtype=dtype + ) + gsr = cudf.Series(data, dtype=dtype) + + assert_eq(psr, gsr) + + +@pytest.mark.parametrize( + "data", + [ + [1000000, 200000, 3000000], + [12, 12, 22, 343, 4353534, 435342], + [0.3534, 12, 22, 343, 43.53534, 4353.42], + cp.asarray([10, 20, 30, 100]), + ], +) +@pytest.mark.parametrize("dtype", dtypeutils.TIMEDELTA_TYPES) +@pytest.mark.parametrize("cast_dtype", ["int64", "category"]) +def test_timedelta_from_typecast(data, dtype, cast_dtype): + if dtype not in ("timedelta64[ns]"): + pytest.skip( + "Bug in pandas" "https://github.com/pandas-dev/pandas/issues/35465" + ) + psr = pd.Series( + cp.asnumpy(data) if isinstance(data, cp.ndarray) else data, dtype=dtype + ) + gsr = cudf.Series(data, dtype=dtype) + + assert_eq(psr.astype(cast_dtype), gsr.astype(cast_dtype)) + + +@pytest.mark.parametrize( + "data", + [ + [1000000, 200000, 3000000], + [12, 12, 22, 343, 4353534, 435342], + [0.3534, 12, 22, 343, 43.53534, 4353.42], + cp.asarray([10, 20, 30, 100]), + ], +) +@pytest.mark.parametrize("cast_dtype", dtypeutils.TIMEDELTA_TYPES) +def test_timedelta_to_typecast(data, cast_dtype): + psr = pd.Series(cp.asnumpy(data) if isinstance(data, cp.ndarray) else data) + gsr = cudf.Series(data) + + 
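+    # E.g. (illustrative) the int64 value 1000 viewed as
+    # timedelta64[ms] means 1000 milliseconds in both pandas and cudf.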
assert_eq(psr.astype(cast_dtype), gsr.astype(cast_dtype)) + + +@pytest.mark.parametrize( + "data", + [ + [1000000, 200000, 3000000], + [1000000, 200000, None], + [], + [None], + [None, None, None, None, None], + [12, 12, 22, 343, 4353534, 435342], + [0.3534, 12, 22, 343, 43.53534, 4353.42], + np.array([10, 20, 30, None, 100]), + cp.asarray([10, 20, 30, 100]), + ], +) +@pytest.mark.parametrize("dtype", dtypeutils.TIMEDELTA_TYPES) +def test_timedelta_from_pandas(data, dtype): + psr = pd.Series( + cp.asnumpy(data) if isinstance(data, cp.ndarray) else data, dtype=dtype + ) + gsr = cudf.from_pandas(psr) + + assert_eq(psr, gsr) + + +@pytest.mark.parametrize( + "data", + [ + [1000000, 200000, 3000000], + [1000000, 200000, None], + [], + [None], + [None, None, None, None, None], + [12, 12, 22, 343, 4353534, 435342], + np.array([10, 20, 30, None, 100]), + cp.asarray([10, 20, 30, 100]), + ], +) +@pytest.mark.parametrize("dtype", dtypeutils.TIMEDELTA_TYPES) +@pytest.mark.parametrize("fillna", [None, "pandas"]) +def test_timedelta_series_to_array(data, dtype, fillna): + gsr = cudf.Series(data, dtype=dtype) + + expected = np.array( + cp.asnumpy(data) if isinstance(data, cp.ndarray) else data, dtype=dtype + ) + if fillna is None: + expected = expected[~np.isnan(expected)] + + actual = gsr.to_array(fillna=fillna) + + np.testing.assert_array_equal(expected, actual) + + +@pytest.mark.parametrize( + "data", + [ + [1000000, 200000, 3000000], + [1000000, 200000, None], + [], + [None], + [None, None, None, None, None], + [12, 12, 22, 343, 4353534, 435342], + np.array([10, 20, 30, None, 100]), + cp.asarray([10, 20, 30, 100]), + ], +) +@pytest.mark.parametrize("dtype", dtypeutils.TIMEDELTA_TYPES) +def test_timedelta_series_to_pandas(data, dtype): + gsr = cudf.Series(data, dtype=dtype) + + expected = np.array( + cp.asnumpy(data) if isinstance(data, cp.ndarray) else data, dtype=dtype + ) + + expected = pd.Series(expected) + actual = gsr.to_pandas() + + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "data,other", + [ + ([1000000, 200000, 3000000], [1000000, 200000, 3000000]), + ([1000000, 200000, None], [1000000, 200000, None]), + ([], []), + ([None], [None]), + ([None, None, None, None, None], [None, None, None, None, None]), + ( + [12, 12, 22, 343, 4353534, 435342], + [12, 12, 22, 343, 4353534, 435342], + ), + (np.array([10, 20, 30, None, 100]), np.array([10, 20, 30, None, 100])), + (cp.asarray([10, 20, 30, 100]), cp.asarray([10, 20, 30, 100])), + ([1000000, 200000, 3000000], [200000, 34543, 3000000]), + ([1000000, 200000, None], [1000000, 200000, 3000000]), + ([None], [1]), + ( + [12, 12, 22, 343, 4353534, 435342], + [None, 1, 220, 3, 34, 4353423287], + ), + (np.array([10, 20, 30, None, 100]), np.array([10, 20, 30, None, 100])), + (cp.asarray([10, 20, 30, 100]), cp.asarray([10, 20, 30, 100])), + ], +) +@pytest.mark.parametrize("dtype", dtypeutils.TIMEDELTA_TYPES) +@pytest.mark.parametrize( + "ops", + [ + "eq", + "ne", + "lt", + "gt", + "le", + "ge", + "add", + "radd", + "sub", + "rsub", + "floordiv", + "truediv", + "mod", + ], +) +def test_timedelta_ops_misc_inputs(data, other, dtype, ops): + gsr = cudf.Series(data, dtype=dtype) + other_gsr = cudf.Series(other, dtype=dtype) + + psr = gsr.to_pandas() + other_psr = other_gsr.to_pandas() + + expected = getattr(psr, ops)(other_psr) + actual = getattr(gsr, ops)(other_gsr) + if ops in ("eq", "lt", "gt", "le", "ge"): + actual = actual.fillna(False) + elif ops == "ne": + actual = actual.fillna(True) + + if expected.dtype in 
cudf.utils.dtypes.cudf_dtypes_to_pandas_dtypes: + expected = expected.astype( + cudf.utils.dtypes.cudf_dtypes_to_pandas_dtypes[expected.dtype] + ) + + if ops == "floordiv": + expected[actual.isna().to_pandas()] = pd.NA + + assert_eq( + expected, + actual, + nullable_pd_dtype=False if actual.dtype.kind == "m" else True, + ) + + +@pytest.mark.parametrize( + "datetime_data,timedelta_data", + [ + ([1000000, 200000, 3000000], [1000000, 200000, 3000000]), + ([1000000, 200000, None], [1000000, 200000, None]), + ([], []), + ([None], [None]), + ([None, None, None, None, None], [None, None, None, None, None]), + ( + [12, 12, 22, 343, 4353534, 435342], + [12, 12, 22, 343, 4353534, 435342], + ), + (np.array([10, 20, 30, None, 100]), np.array([10, 20, 30, None, 100])), + (cp.asarray([10, 20, 30, 100]), cp.asarray([10, 20, 30, 100])), + ([1000000, 200000, 3000000], [200000, 34543, 3000000]), + ([1000000, 200000, None], [1000000, 200000, 3000000]), + ([None], [1]), + ( + [12, 12, 22, 343, 4353534, 435342], + [None, 1, 220, 3, 34, 4353423287], + ), + (np.array([10, 20, 30, None, 100]), np.array([10, 20, 30, None, 100])), + (cp.asarray([10, 20, 30, 100]), cp.asarray([10, 20, 30, 100])), + ( + [12, 11, 232, 223432411, 2343241, 234324, 23234], + [11, 1132324, 2322323111, 23341, 2434, 332, 323], + ), + ( + [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], + [11, 1132324, 2322323111, 23341, 2434, 332, 323], + ), + ( + [11, 1132324, 2322323111, 23341, 2434, 332, 323], + [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], + ), + ( + [1.321, 1132.324, 23223231.11, 233.41, 0.2434, 332, 323], + [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], + ), + ], +) +@pytest.mark.parametrize("datetime_dtype", dtypeutils.DATETIME_TYPES) +@pytest.mark.parametrize("timedelta_dtype", dtypeutils.TIMEDELTA_TYPES) +@pytest.mark.parametrize( + "ops", ["add", "sub"], +) +def test_timedelta_ops_datetime_inputs( + datetime_data, timedelta_data, datetime_dtype, timedelta_dtype, ops +): + gsr_datetime = cudf.Series(datetime_data, dtype=datetime_dtype) + gsr_timedelta = cudf.Series(timedelta_data, dtype=timedelta_dtype) + + psr_datetime = gsr_datetime.to_pandas() + psr_timedelta = gsr_timedelta.to_pandas() + + expected = getattr(psr_datetime, ops)(psr_timedelta) + actual = getattr(gsr_datetime, ops)(gsr_timedelta) + + assert_eq(expected, actual) + + if ops == "add": + expected = getattr(psr_timedelta, ops)(psr_datetime) + actual = getattr(gsr_timedelta, ops)(gsr_datetime) + + assert_eq(expected, actual) + elif ops == "sub": + with pytest.raises( + TypeError, + match=re.escape( + f"Subtraction of {gsr_timedelta.dtype} with " + f"{gsr_datetime.dtype} cannot be performed." 
+            ),
+        ):
+            actual = getattr(gsr_timedelta, ops)(gsr_datetime)
+
+
+@pytest.mark.parametrize(
+    "df",
+    [
+        pd.DataFrame(
+            {
+                "A": pd.Series(pd.date_range("2012-1-1", periods=3, freq="D")),
+                "B": pd.Series([pd.Timedelta(days=i) for i in range(3)]),
+            }
+        ),
+        pd.DataFrame(
+            {
+                "A": pd.Series(
+                    pd.date_range("1994-1-1", periods=50, freq="D")
+                ),
+                "B": pd.Series([pd.Timedelta(days=i) for i in range(50)]),
+            }
+        ),
+    ],
+)
+@pytest.mark.parametrize("op", ["add", "sub"])
+def test_timedelta_dataframe_ops(df, op):
+    # Copy so that the parametrized DataFrame is not mutated across runs.
+    pdf = df.copy()
+    gdf = cudf.from_pandas(pdf)
+
+    if op == "add":
+        pdf["C"] = pdf["A"] + pdf["B"]
+        gdf["C"] = gdf["A"] + gdf["B"]
+    elif op == "sub":
+        pdf["C"] = pdf["A"] - pdf["B"]
+        gdf["C"] = gdf["A"] - gdf["B"]
+
+    assert_eq(pdf, gdf)
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        [1000000, 200000, 3000000],
+        [1000000, 200000, None],
+        [],
+        [None],
+        [None, None, None, None, None],
+        [12, 12, 22, 343, 4353534, 435342],
+        np.array([10, 20, 30, None, 100]),
+        cp.asarray([10, 20, 30, 100]),
+        [1000000, 200000, 3000000],
+        [1000000, 200000, None],
+        [1],
+        [12, 11, 232, 223432411, 2343241, 234324, 23234],
+        [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234],
+        pytest.param(
+            [1.321, 1132.324, 23223231.11, 233.41, 0.2434, 332, 323],
+            marks=pytest.mark.xfail(
+                reason="https://github.com/rapidsai/cudf/issues/5938"
+            ),
+        ),
+        [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234],
+    ],
+)
+@pytest.mark.parametrize(
+    "other_scalars",
+    [
+        datetime.timedelta(days=768),
+        datetime.timedelta(seconds=768),
+        datetime.timedelta(microseconds=7),
+        datetime.timedelta(minutes=447),
+        datetime.timedelta(hours=447),
+        datetime.timedelta(weeks=734),
+        np.timedelta64(4, "s"),
+        np.timedelta64(456, "D"),
+        np.timedelta64(46, "h"),
+        pytest.param(
+            np.timedelta64("nat"),
+            marks=pytest.mark.xfail(
+                reason="https://github.com/pandas-dev/pandas/issues/35529"
+            ),
+        ),
+        np.timedelta64(1, "s"),
+        np.timedelta64(1, "ms"),
+        np.timedelta64(1, "us"),
+        np.timedelta64(1, "ns"),
+    ],
+)
+@pytest.mark.parametrize("dtype", dtypeutils.TIMEDELTA_TYPES)
+@pytest.mark.parametrize(
+    "op",
+    [
+        "add",
+        "sub",
+        "truediv",
+        "mod",
+        pytest.param(
+            "floordiv",
+            marks=pytest.mark.xfail(
+                reason="https://github.com/pandas-dev/pandas/issues/35529"
+            ),
+        ),
+    ],
+)
+def test_timedelta_series_ops_with_scalars(data, other_scalars, dtype, op):
+    gsr = cudf.Series(data=data, dtype=dtype)
+    psr = gsr.to_pandas()
+
+    if op == "add":
+        expected = psr + other_scalars
+        actual = gsr + other_scalars
+    elif op == "sub":
+        expected = psr - other_scalars
+        actual = gsr - other_scalars
+    elif op == "truediv":
+        expected = psr / other_scalars
+        actual = gsr / other_scalars
+    elif op == "floordiv":
+        expected = psr // other_scalars
+        actual = gsr // other_scalars
+    elif op == "mod":
+        expected = psr % other_scalars
+        actual = gsr % other_scalars
+
+    assert_eq(expected, actual)
+
+    if op == "add":
+        expected = other_scalars + psr
+        actual = other_scalars + gsr
+    elif op == "sub":
+        expected = other_scalars - psr
+        actual = other_scalars - gsr
+    elif op == "truediv":
+        expected = other_scalars / psr
+        actual = other_scalars / gsr
+    elif op == "floordiv":
+        expected = other_scalars // psr
+        actual = other_scalars // gsr
+    elif op == "mod":
+        expected = other_scalars % psr
+        actual = other_scalars % gsr
+
+    assert_eq(expected, actual)
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        [1000000, 200000, 3000000],
+        [1000000, 200000, None],
+        [],
+        pytest.param(
+            [None],
+            marks=pytest.mark.xfail(
+                reason="https://github.com/pandas-dev/pandas/issues/35644"
+            ),
+        ),
+        pytest.param(
+            [None, None, None, None, None],
+            marks=pytest.mark.xfail(
+                reason="https://github.com/pandas-dev/pandas/issues/35644"
+            ),
+        ),
+        [12, 12, 22, 343, 4353534, 435342],
+        np.array([10, 20, 30, None, 100]),
+        cp.asarray([10, 20, 30, 100]),
+        [1000000, 200000, 3000000],
+        [1000000, 200000, None],
+        [1],
+        [12, 11, 232, 223432411, 2343241, 234324, 23234],
+        [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234],
+        [1.321, 1132.324, 23223231.11, 233.41, 0.2434, 332, 323],
+        [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234],
+    ],
+)
+@pytest.mark.parametrize("dtype", dtypeutils.TIMEDELTA_TYPES)
+@pytest.mark.parametrize("reduction_op", ["sum", "mean", "median", "quantile"])
+def test_timedelta_reduction_ops(data, dtype, reduction_op):
+    gsr = cudf.Series(data, dtype=dtype)
+    psr = gsr.to_pandas()
+
+    expected = getattr(psr, reduction_op)()
+    actual = getattr(gsr, reduction_op)()
+    if pd.isna(expected) and pd.isna(actual):
+        pass
+    elif isinstance(expected, pd.Timedelta) and isinstance(
+        actual, pd.Timedelta
+    ):
+        assert (
+            expected.round(gsr._column.time_unit).value
+            == actual.round(gsr._column.time_unit).value
+        )
+    else:
+        assert_eq(expected, actual)
+
+
+@pytest.mark.parametrize(
+    "data", _TIMEDELTA_DATA,
+)
+@pytest.mark.parametrize("dtype", dtypeutils.TIMEDELTA_TYPES)
+def test_timedelta_dt_components(data, dtype):
+    gsr = cudf.Series(data, dtype=dtype)
+    psr = gsr.to_pandas()
+
+    expected = psr.dt.components
+    actual = gsr.dt.components
+
+    if gsr.isnull().any():
+        assert_eq(expected, actual.astype("float"))
+    else:
+        assert_eq(expected, actual)
+
+
+@pytest.mark.parametrize(
+    "data", _TIMEDELTA_DATA,
+)
+@pytest.mark.parametrize("dtype", dtypeutils.TIMEDELTA_TYPES)
+def test_timedelta_dt_properties(data, dtype):
+    gsr = cudf.Series(data, dtype=dtype)
+    psr = gsr.to_pandas()
+
+    def local_assert(expected, actual):
+        if gsr.isnull().any():
+            assert_eq(expected, actual.astype("float"))
+        else:
+            assert_eq(expected, actual)
+
+    expected_days = psr.dt.days
+    actual_days = gsr.dt.days
+
+    local_assert(expected_days, actual_days)
+
+    expected_seconds = psr.dt.seconds
+    actual_seconds = gsr.dt.seconds
+
+    local_assert(expected_seconds, actual_seconds)
+
+    expected_microseconds = psr.dt.microseconds
+    actual_microseconds = gsr.dt.microseconds
+
+    local_assert(expected_microseconds, actual_microseconds)
+
+    expected_nanoseconds = psr.dt.nanoseconds
+    actual_nanoseconds = gsr.dt.nanoseconds
+
+    local_assert(expected_nanoseconds, actual_nanoseconds)
+
+
+@pytest.mark.parametrize(
+    "data", _TIMEDELTA_DATA,
+)
+@pytest.mark.parametrize("dtype", dtypeutils.TIMEDELTA_TYPES)
+def test_timedelta_index(data, dtype):
+    gdi = cudf.Index(data, dtype=dtype)
+    pdi = gdi.to_pandas()
+
+    assert_eq(pdi, gdi)
+
+
+@pytest.mark.parametrize("data", _TIMEDELTA_DATA_NON_OVERFLOW)
+@pytest.mark.parametrize("datetime_dtype", dtypeutils.DATETIME_TYPES)
+@pytest.mark.parametrize("timedelta_dtype", dtypeutils.TIMEDELTA_TYPES)
+def test_timedelta_index_datetime_index_ops(
+    data, datetime_dtype, timedelta_dtype
+):
+    gdt = cudf.Index(data, dtype=datetime_dtype)
+    gtd = cudf.Index(data, dtype=timedelta_dtype)
+
+    pdt = gdt.to_pandas()
+    ptd = gtd.to_pandas()
+
+    assert_eq(gdt - gtd, pdt - ptd)
+    assert_eq(gdt + gtd, pdt + ptd)
+
+
+@pytest.mark.parametrize(
+    "datetime_data,timedelta_data",
+    [
+        ([1000000, 200000, 3000000], [1000000, 200000, 3000000]),
+        ([1000000, 200000, None], [1000000, 200000, None]),
+        ([], []),
+        ([None], [None]),
+        ([None, None, None, None, None], [None, None, None, None, None]),
+        (
+            [12, 12, 22, 343, 4353534, 435342],
+            [12, 12, 22, 343, 4353534, 435342],
+        ),
+        (np.array([10, 20, 30, None, 100]), np.array([10, 20, 30, None, 100])),
+        (cp.asarray([10, 20, 30, 100]), cp.asarray([10, 20, 30, 100])),
+        ([1000000, 200000, 3000000], [200000, 34543, 3000000]),
+        ([1000000, 200000, None], [1000000, 200000, 3000000]),
+        ([None], [1]),
+        (
+            [12, 12, 22, 343, 4353534, 435342],
+            [None, 1, 220, 3, 34, 4353423287],
+        ),
+        (np.array([10, 20, 30, None, 100]), np.array([10, 20, 30, None, 100])),
+        (cp.asarray([10, 20, 30, 100]), cp.asarray([10, 20, 30, 100])),
+        (
+            [12, 11, 232, 223432411, 2343241, 234324, 23234],
+            [11, 1132324, 2322323111, 23341, 2434, 332, 323],
+        ),
+        (
+            [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234],
+            [11, 1132324, 2322323111, 23341, 2434, 332, 323],
+        ),
+        (
+            [11, 1132324, 2322323111, 23341, 2434, 332, 323],
+            [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234],
+        ),
+        (
+            [1.321, 1132.324, 23223231.11, 233.41, 0.2434, 332, 323],
+            [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234],
+        ),
+    ],
+)
+@pytest.mark.parametrize("datetime_dtype", dtypeutils.DATETIME_TYPES)
+@pytest.mark.parametrize("timedelta_dtype", dtypeutils.TIMEDELTA_TYPES)
+def test_timedelta_datetime_index_ops_misc(
+    datetime_data, timedelta_data, datetime_dtype, timedelta_dtype
+):
+    gdt = cudf.Index(datetime_data, dtype=datetime_dtype)
+    gtd = cudf.Index(timedelta_data, dtype=timedelta_dtype)
+
+    pdt = gdt.to_pandas()
+    ptd = gtd.to_pandas()
+
+    assert_eq(gdt - gtd, pdt - ptd)
+    assert_eq(gdt + gtd, pdt + ptd)
+
+
+@pytest.mark.parametrize("data", _TIMEDELTA_DATA_NON_OVERFLOW)
+@pytest.mark.parametrize(
+    "other_scalars",
+    [
+        pd.Timedelta(1513393355.5, unit="s"),
+        pd.Timedelta(34765, unit="D"),
+        datetime.timedelta(days=768),
+        datetime.timedelta(seconds=768),
+        datetime.timedelta(microseconds=7),
+        datetime.timedelta(minutes=447),
+        datetime.timedelta(hours=447),
+        datetime.timedelta(weeks=734),
+        np.timedelta64(4, "s"),
+        np.timedelta64(456, "D"),
+        np.timedelta64(46, "h"),
+        np.timedelta64("nat"),
+        np.timedelta64(1, "s"),
+        np.timedelta64(1, "ms"),
+        np.timedelta64(1, "us"),
+        np.timedelta64(1, "ns"),
+    ],
+)
+@pytest.mark.parametrize("dtype", dtypeutils.TIMEDELTA_TYPES)
+@pytest.mark.parametrize(
+    "op",
+    [
+        "add",
+        "sub",
+        "truediv",
+        pytest.param(
+            "floordiv",
+            marks=pytest.mark.xfail(
+                reason="https://github.com/pandas-dev/pandas/issues/35529"
+            ),
+        ),
+    ],
+)
+def test_timedelta_index_ops_with_scalars(data, other_scalars, dtype, op):
+    gtdi = cudf.Index(data=data, dtype=dtype)
+    ptdi = gtdi.to_pandas()
+
+    if op == "add":
+        expected = ptdi + other_scalars
+        actual = gtdi + other_scalars
+    elif op == "sub":
+        expected = ptdi - other_scalars
+        actual = gtdi - other_scalars
+    elif op == "truediv":
+        expected = ptdi / other_scalars
+        actual = gtdi / other_scalars
+    elif op == "floordiv":
+        expected = ptdi // other_scalars
+        actual = gtdi // other_scalars
+
+    assert_eq(expected, actual)
+
+    if op == "add":
+        expected = other_scalars + ptdi
+        actual = other_scalars + gtdi
+    elif op == "sub":
+        expected = other_scalars - ptdi
+        actual = other_scalars - gtdi
+    elif op == "truediv":
+        expected = other_scalars / ptdi
+        actual = other_scalars / gtdi
+    elif op == "floordiv":
+        expected = other_scalars // ptdi
+        actual = other_scalars // gtdi
+
+    assert_eq(expected, actual)
+
+
+@pytest.mark.parametrize("data", _TIMEDELTA_DATA)
+@pytest.mark.parametrize("dtype", dtypeutils.TIMEDELTA_TYPES)
+@pytest.mark.parametrize("name", ["abcd", None])
+def test_timedelta_index_properties(data, dtype, name):
+    gdi = cudf.Index(data, dtype=dtype, name=name)
+    pdi = gdi.to_pandas()
+
+    def local_assert(expected, actual):
+        if actual._values.null_count:
+            assert_eq(expected, actual.astype("float64"))
+        else:
+            assert_eq(expected, actual)
+
+    expected_days = pdi.days
+    actual_days = gdi.days
+
+    local_assert(expected_days, actual_days)
+
+    expected_seconds = pdi.seconds
+    actual_seconds = gdi.seconds
+
+    local_assert(expected_seconds, actual_seconds)
+
+    expected_microseconds = pdi.microseconds
+    actual_microseconds = gdi.microseconds
+
+    local_assert(expected_microseconds, actual_microseconds)
+
+    expected_nanoseconds = pdi.nanoseconds
+    actual_nanoseconds = gdi.nanoseconds
+
+    local_assert(expected_nanoseconds, actual_nanoseconds)
+
+    expected_components = pdi.components
+    actual_components = gdi.components
+
+    if actual_components.isnull().any().any():
+        assert_eq(expected_components, actual_components.astype("float"))
+    else:
+        assert_eq(
+            expected_components,
+            actual_components,
+            check_index_type=not actual_components.empty,
+        )
+
+
+@pytest.mark.parametrize("data", _TIMEDELTA_DATA)
+@pytest.mark.parametrize("dtype", dtypeutils.TIMEDELTA_TYPES)
+@pytest.mark.parametrize(
+    "fill_value",
+    [
+        np.timedelta64(4, "s"),
+        np.timedelta64(456, "D"),
+        np.timedelta64(46, "h"),
+        np.timedelta64("nat"),
+        np.timedelta64(1, "s"),
+        np.timedelta64(1, "ms"),
+        np.timedelta64(1, "us"),
+        np.timedelta64(1, "ns"),
+    ],
+)
+def test_timedelta_fillna(data, dtype, fill_value):
+    sr = cudf.Series(data, dtype=dtype)
+    psr = sr.to_pandas()
+
+    expected = psr.fillna(fill_value)
+    actual = sr.fillna(fill_value)
+
+    assert_eq(expected, actual)
+
+
+@pytest.mark.parametrize(
+    "gsr,expected_series",
+    [
+        (
+            cudf.Series([1, 2, 3], dtype="timedelta64[ns]"),
+            cudf.Series(
+                [
+                    "0 days 00:00:00.000000001",
+                    "0 days 00:00:00.000000002",
+                    "0 days 00:00:00.000000003",
+                ]
+            ),
+        ),
+        (
+            cudf.Series([1000000, 200000, 3000000], dtype="timedelta64[ms]"),
+            cudf.Series(
+                ["0 days 00:16:40", "0 days 00:03:20", "0 days 00:50:00"]
+            ),
+        ),
+        (
+            cudf.Series([1000000, 200000, 3000000], dtype="timedelta64[s]"),
+            cudf.Series(
+                ["11 days 13:46:40", "2 days 07:33:20", "34 days 17:20:00"]
+            ),
+        ),
+        (
+            cudf.Series(
+                [None, None, None, None, None], dtype="timedelta64[us]"
+            ),
+            cudf.Series([None, None, None, None, None], dtype="str"),
+        ),
+        (
+            cudf.Series(
+                [
+                    136457654,
+                    None,
+                    245345345,
+                    223432411,
+                    None,
+                    3634548734,
+                    23234,
+                ],
+                dtype="timedelta64[us]",
+            ),
+            cudf.Series(
+                [
+                    "0 days 00:02:16.457654",
+                    None,
+                    "0 days 00:04:05.345345",
+                    "0 days 00:03:43.432411",
+                    None,
+                    "0 days 01:00:34.548734",
+                    "0 days 00:00:00.023234",
+                ]
+            ),
+        ),
+        (
+            cudf.Series(
+                [
+                    136457654,
+                    None,
+                    245345345,
+                    223432411,
+                    None,
+                    3634548734,
+                    23234,
+                ],
+                dtype="timedelta64[ms]",
+            ),
+            cudf.Series(
+                [
+                    "1 days 13:54:17.654",
+                    None,
+                    "2 days 20:09:05.345",
+                    "2 days 14:03:52.411",
+                    None,
+                    "42 days 01:35:48.734",
+                    "0 days 00:00:23.234",
+                ]
+            ),
+        ),
+        (
+            cudf.Series(
+                [
+                    136457654,
+                    None,
+                    245345345,
+                    223432411,
+                    None,
+                    3634548734,
+                    23234,
+                ],
+                dtype="timedelta64[s]",
+            ),
+            cudf.Series(
+                [
+                    "1579 days 08:54:14",
+                    None,
+                    "2839 days 15:29:05",
+                    "2586 days 00:33:31",
+                    None,
+                    "42066 days 12:52:14",
+                    "0 days 06:27:14",
+                ]
+            ),
+        ),
+        (
+            cudf.Series(
+                [
+                    136457654,
+                    None,
+                    245345345,
+                    223432411,
+                    None,
+                    3634548734,
+                    23234,
+                ],
+                dtype="timedelta64[ns]",
+            ),
+            cudf.Series(
+                [
+                    "0 days 00:00:00.136457654",
+                    None,
+                    "0 days 00:00:00.245345345",
+                    "0 days 00:00:00.223432411",
+                    None,
+                    "0 days 00:00:03.634548734",
+                    "0 days 00:00:00.000023234",
+                ]
+            ),
+        ),
+    ],
+)
+def test_timedelta_str_roundtrip(gsr, expected_series):
+    actual_series = gsr.astype("str")
+
+    assert_eq(expected_series, actual_series)
+
+    assert_eq(gsr, actual_series.astype(gsr.dtype))
+
+
+def test_timedelta_invalid_ops():
+    sr = cudf.Series([1, 2, 3], dtype="timedelta64[ns]")
+    psr = sr.to_pandas()
+
+    try:
+        psr + 1
+    except TypeError:
+        with pytest.raises(
+            TypeError,
+            match=re.escape(
+                f"Addition of {sr.dtype} with {np.dtype('int64')} "
+                f"cannot be performed."
+            ),
+        ):
+            sr + 1
+    else:
+        raise AssertionError("Expected psr + 1 to fail")
+
+    try:
+        psr + "a"
+    except TypeError:
+        with pytest.raises(
+            TypeError,
+            match=re.escape(
+                f"Addition of {sr.dtype} with {np.dtype('object')} "
+                f"cannot be performed."
+            ),
+        ):
+            sr + "a"
+    else:
+        raise AssertionError("Expected psr + 'a' to fail")
+
+    dt_sr = cudf.Series([1, 2, 3], dtype="datetime64[ns]")
+    dt_psr = dt_sr.to_pandas()
+
+    try:
+        psr % dt_psr
+    except TypeError:
+        with pytest.raises(
+            TypeError,
+            match=re.escape(
+                f"Modulus of {sr.dtype} with {dt_sr.dtype} "
+                f"cannot be performed."
+            ),
+        ):
+            sr % dt_sr
+    else:
+        raise AssertionError("Expected psr % dt_psr to fail")
+
+    try:
+        psr % "a"
+    except TypeError:
+        with pytest.raises(
+            TypeError,
+            match=re.escape(
+                f"Modulus of {sr.dtype} with {np.dtype('object')} "
+                f"cannot be performed."
+            ),
+        ):
+            sr % "a"
+    else:
+        raise AssertionError("Expected psr % 'a' to fail")
+
+    try:
+        psr > dt_psr
+    except TypeError:
+        with pytest.raises(
+            TypeError,
+            match=re.escape(
+                f"Invalid comparison between dtype={sr.dtype}"
+                f" and {dt_sr.dtype}"
+            ),
+        ):
+            sr > dt_sr
+    else:
+        raise AssertionError("Expected psr > dt_psr to fail")
+
+    try:
+        psr < dt_psr
+    except TypeError:
+        with pytest.raises(
+            TypeError,
+            match=re.escape(
+                f"Invalid comparison between dtype={sr.dtype}"
+                f" and {dt_sr.dtype}"
+            ),
+        ):
+            sr < dt_sr
+    else:
+        raise AssertionError("Expected psr < dt_psr to fail")
+
+    try:
+        psr >= dt_psr
+    except TypeError:
+        with pytest.raises(
+            TypeError,
+            match=re.escape(
+                f"Invalid comparison between dtype={sr.dtype}"
+                f" and {dt_sr.dtype}"
+            ),
+        ):
+            sr >= dt_sr
+    else:
+        raise AssertionError("Expected psr >= dt_psr to fail")
+
+    try:
+        psr <= dt_psr
+    except TypeError:
+        with pytest.raises(
+            TypeError,
+            match=re.escape(
+                f"Invalid comparison between dtype={sr.dtype}"
+                f" and {dt_sr.dtype}"
+            ),
+        ):
+            sr <= dt_sr
+    else:
+        raise AssertionError("Expected psr <= dt_psr to fail")
+
+    try:
+        psr / dt_psr
+    except TypeError:
+        with pytest.raises(
+            TypeError,
+            match=re.escape(
+                f"Division of {sr.dtype} with {dt_sr.dtype} "
+                f"cannot be performed."
+            ),
+        ):
+            sr / dt_sr
+    else:
+        raise AssertionError("Expected psr / dt_psr to fail")
+
+    try:
+        psr // dt_psr
+    except TypeError:
+        with pytest.raises(
+            TypeError,
+            match=re.escape(
+                f"Floor Division of {sr.dtype} with {dt_sr.dtype} "
+                f"cannot be performed."
+            ),
+        ):
+            sr // dt_sr
+    else:
+        raise AssertionError("Expected psr // dt_psr to fail")
+
+    try:
+        psr * dt_psr
+    except TypeError:
+        with pytest.raises(
+            TypeError,
+            match=re.escape(
+                f"Multiplication of {sr.dtype} with {dt_sr.dtype} "
+                f"cannot be performed."
+            ),
+        ):
+            sr * dt_sr
+    else:
+        raise AssertionError("Expected psr * dt_psr to fail")
+
+    try:
+        psr * psr
+    except TypeError:
+        with pytest.raises(
+            TypeError,
+            match=re.escape(
+                f"Multiplication of {sr.dtype} with {sr.dtype} "
+                f"cannot be performed."
+            ),
+        ):
+            sr * sr
+    else:
+        raise AssertionError("Expected psr * psr to fail")
+
+    try:
+        psr ^ psr
+    except TypeError:
+        with pytest.raises(
+            TypeError,
+            match=re.escape(
+                f"Series of dtype {sr.dtype} cannot perform "
+                f"the operation xor"
+            ),
+        ):
+            sr ^ sr
+    else:
+        raise AssertionError("Expected psr ^ psr to fail")
+
+
+def test_timedelta_datetime_cast_invalid():
+    sr = cudf.Series([1, 2, 3], dtype="timedelta64[ns]")
+    psr = sr.to_pandas()
+
+    try:
+        psr.astype("datetime64[ns]")
+    except TypeError as e:
+        with pytest.raises(type(e), match=re.escape(e.__str__())):
+            sr.astype("datetime64[ns]")
+    else:
+        raise AssertionError("Expected timedelta to datetime typecast to fail")
+
+    sr = cudf.Series([1, 2, 3], dtype="datetime64[ns]")
+    psr = sr.to_pandas()
+
+    try:
+        psr.astype("timedelta64[ns]")
+    except TypeError as e:
+        with pytest.raises(type(e), match=re.escape(e.__str__())):
+            sr.astype("timedelta64[ns]")
+    else:
+        raise AssertionError("Expected datetime to timedelta typecast to fail")
+
+
+@pytest.mark.parametrize("data", [[], [1, 2, 3, 4, 5]])
+@pytest.mark.parametrize("dtype", dtypeutils.NUMERIC_TYPES)
+@pytest.mark.parametrize("timedelta_dtype", dtypeutils.TIMEDELTA_TYPES)
+def test_numeric_to_timedelta(data, dtype, timedelta_dtype):
+    sr = cudf.Series(data, dtype=dtype)
+    psr = sr.to_pandas()
+
+    actual = sr.astype(timedelta_dtype)
+    expected = pd.Series(psr.to_numpy().astype(timedelta_dtype))
+
+    assert_eq(expected, actual)
+
+
+@pytest.mark.parametrize("data", [[], [1, 2, 3, 4, 5]])
+@pytest.mark.parametrize("dtype", dtypeutils.TIMEDELTA_TYPES)
+@pytest.mark.parametrize(
+    "scalar",
+    [
+        1,
+        2,
+        3,
+        "a",
+        np.timedelta64(1, "s"),
+        np.timedelta64(2, "s"),
+        np.timedelta64(2, "D"),
+        np.timedelta64(3, "ms"),
+        np.timedelta64(4, "us"),
+        np.timedelta64(5, "ns"),
+        np.timedelta64(6, "ns"),
+        np.datetime64(6, "s"),
+    ],
+)
+def test_timedelta_contains(data, dtype, scalar):
+    sr = cudf.Series(data, dtype=dtype)
+    psr = sr.to_pandas()
+
+    expected = scalar in psr
+    actual = scalar in sr
+
+    assert_eq(expected, actual)
diff --git a/python/cudf/cudf/tests/utils.py b/python/cudf/cudf/tests/utils.py
index 6e9a236e352..5e114bee173 100644
--- a/python/cudf/cudf/tests/utils.py
+++ b/python/cudf/cudf/tests/utils.py
@@ -28,6 +28,7 @@
 SIGNED_TYPES = sorted(list(dtypeutils.SIGNED_TYPES))
 NUMERIC_TYPES = sorted(list(dtypeutils.NUMERIC_TYPES))
 DATETIME_TYPES = sorted(list(dtypeutils.DATETIME_TYPES))
+TIMEDELTA_TYPES = sorted(list(dtypeutils.TIMEDELTA_TYPES))
 OTHER_TYPES = sorted(list(dtypeutils.OTHER_TYPES))
 ALL_TYPES = sorted(list(dtypeutils.ALL_TYPES))
 
diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py
index ec41bb4a60a..bddc9bb9064 100644
--- a/python/cudf/cudf/utils/dtypes.py
+++ b/python/cudf/cudf/utils/dtypes.py
@@ -1,3 +1,4 @@
+import datetime as dt
 import numbers
 from collections import namedtuple
 from collections.abc import Sequence
@@ -6,9 +7,11 @@
 import numpy as np
 import pandas as pd
 import pyarrow as pa
+from pandas.core.dtypes.common import infer_dtype_from_object
 from pandas.core.dtypes.dtypes import CategoricalDtype, CategoricalDtypeType
 
 import cudf
+from cudf._lib.scalar import Scalar
 
 _NA_REP = ""
 _np_pa_dtypes = {
@@ -54,8 +57,14 @@
     "datetime64[us]",
     "datetime64[ns]",
 }
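+# Timedelta dtypes supported by cudf; kept parallel to DATETIME_TYPES above.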
+TIMEDELTA_TYPES = {
+    "timedelta64[s]",
+    "timedelta64[ms]",
+    "timedelta64[us]",
+    "timedelta64[ns]",
+}
 OTHER_TYPES = {"bool", "category", "str"}
-ALL_TYPES = NUMERIC_TYPES | DATETIME_TYPES | OTHER_TYPES
+ALL_TYPES = NUMERIC_TYPES | DATETIME_TYPES | TIMEDELTA_TYPES | OTHER_TYPES
 
 
 def np_to_pa_dtype(dtype):
@@ -69,6 +78,13 @@ def np_to_pa_dtype(dtype):
             return pa.timestamp(time_unit)
         # default is int64_t UNIX ms
         return pa.date64()
+    elif dtype.kind == "m":
+        time_unit, _ = np.datetime_data(dtype)
+        if time_unit in ("s", "ms", "us", "ns"):
+            # return a pa.Duration of the appropriate unit
+            return pa.duration(time_unit)
+        # default fallback unit is ns
+        return pa.duration("ns")
     return _np_pa_dtypes[np.dtype(dtype).type]
 
@@ -175,7 +191,6 @@ def cudf_dtype_from_pydata_dtype(dtype):
     """ Given a numpy or pandas dtype, converts it into the equivalent cuDF
        Python dtype.
    """
-    from pandas.core.dtypes.common import infer_dtype_from_object
 
     if is_categorical_dtype(dtype):
         return cudf.core.dtypes.CategoricalDtype
 
@@ -188,12 +203,17 @@ def cudf_dtype_from_pydata_dtype(dtype):
 def is_scalar(val):
     return (
         val is None
+        or isinstance(val, Scalar)
         or isinstance(val, str)
        or isinstance(val, numbers.Number)
         or np.isscalar(val)
         or (isinstance(val, (np.ndarray, cp.ndarray)) and val.ndim == 0)
         or isinstance(val, pd.Timestamp)
         or (isinstance(val, pd.Categorical) and len(val) == 1)
+        or isinstance(val, pd.Timedelta)
+        or isinstance(val, dt.datetime)
+        or isinstance(val, dt.timedelta)
     )
 
@@ -219,6 +239,15 @@ def to_cudf_compatible_scalar(val, dtype=None):
     if ((dtype is None) and isinstance(val, str)) or is_string_dtype(dtype):
         dtype = "str"
 
+    if isinstance(val, dt.datetime):
+        val = np.datetime64(val)
+    elif isinstance(val, dt.timedelta):
+        val = np.timedelta64(val)
+    elif isinstance(val, pd.Timestamp):
+        val = val.to_datetime64()
+    elif isinstance(val, pd.Timedelta):
+        val = val.to_timedelta64()
+
     val = pd.api.types.pandas_dtype(type(val)).type(val)
 
     if dtype is not None:
@@ -228,6 +257,10 @@ def to_cudf_compatible_scalar(val, dtype=None):
         time_unit, _ = np.datetime_data(val.dtype)
         if time_unit in ("D", "W", "M", "Y"):
             val = val.astype("datetime64[s]")
+    elif val.dtype.type is np.timedelta64:
+        time_unit, _ = np.datetime_data(val.dtype)
+        if time_unit in ("D", "W", "M", "Y"):
+            val = val.astype("timedelta64[ns]")
 
     return val
 
@@ -344,9 +377,8 @@ def min_column_type(x, expected_type):
     If the column is not a subtype of `np.signedinteger` or `np.floating`
     returns the same dtype as the dtype of `x` without modification
     """
-    from cudf.core.column import NumericalColumn
-
-    if not isinstance(x, NumericalColumn):
+    if not isinstance(x, cudf.core.column.NumericalColumn):
         raise TypeError("Argument x must be of type column.NumericalColumn")
     if x.valid_count == 0:
         return x.dtype
 
@@ -369,8 +401,6 @@ def min_column_type(x, expected_type):
 
 
 def check_cast_unsupported_dtype(dtype):
-    from cudf._lib.types import np_to_cudf_types
-
     if is_categorical_dtype(dtype):
         return dtype
 
@@ -379,7 +409,7 @@ def check_cast_unsupported_dtype(dtype):
     else:
         dtype = np.dtype(dtype)
 
-    if dtype in np_to_cudf_types:
+    if dtype in cudf._lib.types.np_to_cudf_types:
         return dtype
 
     if dtype == np.dtype("float16"):
 
@@ -394,3 +424,17 @@ def is_mixed_with_object_dtype(lhs, rhs):
     return (lhs.dtype == "object" and rhs.dtype != "object") or (
         rhs.dtype == "object" and lhs.dtype != "object"
     )
+
+
+def get_time_unit(obj):
+    if isinstance(
+        obj,
+        (
+            cudf.core.column.datetime.DatetimeColumn,
+            cudf.core.column.timedelta.TimeDeltaColumn,
+        ),
+    ):
+        return obj.time_unit
+
+    time_unit, _ = np.datetime_data(obj.dtype)
+    return time_unit
diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py
index 6b8a199550b..df5f8546675 100644
--- a/python/cudf/cudf/utils/utils.py
+++ b/python/cudf/cudf/utils/utils.py
@@ -8,6 +8,7 @@
 import pandas as pd
 import pyarrow as pa
 from numba import njit
+from pyarrow.cuda import CudaBuffer as arrowCudaBuffer
 
 import rmm
 
@@ -65,7 +66,10 @@ def scalar_broadcast_to(scalar, size, dtype=None):
     if isinstance(size, (tuple, list)):
         size = size[0]
 
-    if scalar is None:
+    if scalar is None or (
+        isinstance(scalar, (np.datetime64, np.timedelta64))
+        and np.isnat(scalar)
+    ):
         if dtype is None:
             dtype = "object"
         return column.column_empty(size, dtype=dtype, masked=True)
 
@@ -100,7 +104,7 @@ def normalize_index(index, size, doraise=True):
 list_types_tuple = (list, np.array)
 
 
-def buffers_from_pyarrow(pa_arr, dtype=None):
+def buffers_from_pyarrow(pa_arr):
     """
     Given a pyarrow array returns a 5 length tuple of:
         - size
@@ -109,12 +113,13 @@ def buffers_from_pyarrow(pa_arr):
         - offset
        - cudf.Buffer --> mask
        - cudf.Buffer --> data
        - cudf.Buffer --> string characters
    """
-    from cudf._lib.null_mask import bitmask_allocation_size_bytes
 
     buffers = pa_arr.buffers()
 
     if pa_arr.null_count:
-        mask_size = bitmask_allocation_size_bytes(len(pa_arr))
+        mask_size = cudf._lib.null_mask.bitmask_allocation_size_bytes(
+            len(pa_arr)
+        )
         pamask = pyarrow_buffer_to_cudf_buffer(buffers[0], mask_size=mask_size)
     else:
         pamask = None
 
@@ -138,7 +143,6 @@ def pyarrow_buffer_to_cudf_buffer(arrow_buf, mask_size=0):
     Given a PyArrow Buffer backed by either host or device memory, convert it
     to a cuDF Buffer
     """
-    from pyarrow.cuda import CudaBuffer as arrowCudaBuffer
 
     # Try creating a PyArrow CudaBuffer from the PyArrow Buffer object, it
     # fails with an ArrowTypeError if it's a host based Buffer so we catch and
 
@@ -417,9 +421,9 @@ def time_col_replace_nulls(input_col):
         input_col,
         column.as_column(
             Buffer(
-                np.array([np.datetime64("NaT")], dtype=input_col.dtype).view(
-                    "|u1"
-                )
+                np.array(
+                    [input_col.default_na_value()], dtype=input_col.dtype
+                ).view("|u1")
             ),
             dtype=input_col.dtype,
         ),
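
Reviewer aid (not part of the patch): a minimal sketch of the Python-level behavior the tests above pin down. It assumes a cudf build that includes this change; the values in the comments are taken from the expected outputs encoded in the tests, not re-verified here.

    import cudf

    # None becomes NaT in a timedelta column (test_timedelta_series_to_pandas).
    gtd = cudf.Series([1000000, 200000, None], dtype="timedelta64[ms]")

    # String casts render pandas-style offsets and round-trip back
    # (test_timedelta_str_roundtrip): 1000000 ms -> "0 days 00:16:40".
    s = gtd.astype("str")
    back = s.astype("timedelta64[ms]")

    # datetime +/- timedelta follows pandas semantics
    # (test_timedelta_ops_datetime_inputs).
    gdt = cudf.Series([1000000, 200000, 3000000], dtype="datetime64[ms]")
    out = gdt + gtd

    # Component accessors mirror pandas' TimedeltaProperties
    # (test_timedelta_dt_components / test_timedelta_dt_properties).
    comps = gtd.dt.components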