From 8ff8f95da58813b83d1b8a3b08e285f0fd724163 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Sun, 18 Sep 2022 15:03:55 -0700 Subject: [PATCH 01/51] fixes --- cpp/include/cudf/datetime.hpp | 6 + cpp/include/cudf/detail/datetime.hpp | 5 + cpp/src/datetime/datetime_ops.cu | 16 ++ python/cudf/cudf/_lib/cpp/datetime.pxd | 1 + python/cudf/cudf/_lib/datetime.pyx | 2 + python/cudf/cudf/core/dataframe.py | 26 ++- python/cudf/cudf/core/index.py | 30 ++- python/cudf/cudf/core/series.py | 2 +- python/cudf/cudf/tests/test_repr.py | 274 ++++++++++++++++++++++++- python/cudf/cudf/utils/docutils.py | 20 +- 10 files changed, 358 insertions(+), 24 deletions(-) diff --git a/cpp/include/cudf/datetime.hpp b/cpp/include/cudf/datetime.hpp index a8955ffb17c..5d86b9f5fb7 100644 --- a/cpp/include/cudf/datetime.hpp +++ b/cpp/include/cudf/datetime.hpp @@ -133,6 +133,12 @@ std::unique_ptr extract_second( cudf::column_view const& column, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +std::unique_ptr extract_milli_second( + cudf::column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + + /** @} */ // end of group /** * @addtogroup datetime_compute diff --git a/cpp/include/cudf/detail/datetime.hpp b/cpp/include/cudf/detail/datetime.hpp index 7a2545fbdcf..40a7276f977 100644 --- a/cpp/include/cudf/detail/datetime.hpp +++ b/cpp/include/cudf/detail/datetime.hpp @@ -94,6 +94,11 @@ std::unique_ptr extract_second( rmm::cuda_stream_view stream = cudf::default_stream_value, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr extract_milli_second( + cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @copydoc cudf::last_day_of_month(cudf::column_view const&, rmm::mr::device_memory_resource *) * diff --git a/cpp/src/datetime/datetime_ops.cu 
b/cpp/src/datetime/datetime_ops.cu index ee026d6c395..5a4f3d02971 100644 --- a/cpp/src/datetime/datetime_ops.cu +++ b/cpp/src/datetime/datetime_ops.cu @@ -495,6 +495,15 @@ std::unique_ptr extract_second(column_view const& column, cudf::type_id::INT16>(column, stream, mr); } +std::unique_ptr extract_milli_second(column_view const& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return detail::apply_datetime_op< + detail::extract_component_operator, + cudf::type_id::INT64>(column, stream, mr); +} + std::unique_ptr last_day_of_month(column_view const& column, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) @@ -607,6 +616,13 @@ std::unique_ptr extract_second(column_view const& column, return detail::extract_second(column, cudf::default_stream_value, mr); } +std::unique_ptr extract_milli_second(column_view const& column, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::extract_milli_second(column, cudf::default_stream_value, mr); +} + std::unique_ptr last_day_of_month(column_view const& column, rmm::mr::device_memory_resource* mr) { diff --git a/python/cudf/cudf/_lib/cpp/datetime.pxd b/python/cudf/cudf/_lib/cpp/datetime.pxd index 498fc313cf9..e49b021966d 100644 --- a/python/cudf/cudf/_lib/cpp/datetime.pxd +++ b/python/cudf/cudf/_lib/cpp/datetime.pxd @@ -13,6 +13,7 @@ cdef extern from "cudf/datetime.hpp" namespace "cudf::datetime" nogil: cdef unique_ptr[column] extract_hour(const column_view& column) except + cdef unique_ptr[column] extract_minute(const column_view& column) except + cdef unique_ptr[column] extract_second(const column_view& column) except + + cdef unique_ptr[column] extract_milli_second(const column_view& column) except + ctypedef enum rounding_frequency "cudf::datetime::rounding_frequency": DAY "cudf::datetime::rounding_frequency::DAY" diff --git a/python/cudf/cudf/_lib/datetime.pyx b/python/cudf/cudf/_lib/datetime.pyx index e218400a2db..1e6b40c5027 100644 --- 
a/python/cudf/cudf/_lib/datetime.pyx +++ b/python/cudf/cudf/_lib/datetime.pyx @@ -49,6 +49,8 @@ def extract_datetime_component(Column col, object field): c_result = move(libcudf_datetime.extract_minute(col_view)) elif field == "second": c_result = move(libcudf_datetime.extract_second(col_view)) + elif field == "milli_second": + c_result = move(libcudf_datetime.extract_milli_second(col_view)) elif field == "day_of_year": c_result = move(libcudf_datetime.day_of_year(col_view)) else: diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 2f1695e4445..0b2f6545b6c 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -570,12 +570,12 @@ class DataFrame(IndexedFrame, Serializable, GetAttrGetItemMixin): ... [(t0+ timedelta(seconds=x)) for x in range(n)]) ... }) >>> df - id datetimes - 0 0 2018-10-07 12:00:00 - 1 1 2018-10-07 12:00:01 - 2 2 2018-10-07 12:00:02 - 3 3 2018-10-07 12:00:03 - 4 4 2018-10-07 12:00:04 + id datetimes + 0 0 2018-10-07 12:00:00.000000 + 1 1 2018-10-07 12:00:01.000000 + 2 2 2018-10-07 12:00:02.000000 + 3 3 2018-10-07 12:00:03.000000 + 4 4 2018-10-07 12:00:04.000000 Build DataFrame via list of rows as tuples: @@ -1048,8 +1048,8 @@ def dtypes(self): ... 'datetime': [pd.Timestamp('20180310')], ... 
'string': ['foo']}) >>> df - float int datetime string - 0 1.0 1 2018-03-10 foo + float int datetime string + 0 1.0 1 2018-03-10 00:00:00.000000 foo >>> df.dtypes float float64 int int64 @@ -1754,6 +1754,12 @@ def _clean_renderable_dataframe(self, output): width, _ = console.get_console_size() else: width = None + + for col_name, col in output._data.items(): + if isinstance(col, (cudf.core.column.timedelta.TimeDeltaColumn, cudf.core.column.datetime.DatetimeColumn)): + output._data[col_name] = output._data[col_name].astype('str') + else: + output._data[col_name] = output._data[col_name] output = output.to_pandas().to_string( max_rows=max_rows, @@ -3900,8 +3906,8 @@ def query(self, expr, local_dict=None): >>> df['datetimes'] = data >>> search_date = datetime.datetime.strptime('2018-10-08', '%Y-%m-%d') >>> df.query('datetimes==@search_date') - datetimes - 1 2018-10-08 + datetimes + 1 2018-10-08 00:00:00 Using local_dict: diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index d1995615e0c..dd44f3c1276 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1237,7 +1237,7 @@ def __repr__(self): output = repr(preprocess.to_pandas()) output = output.replace("nan", cudf._NA_REP) - elif preprocess._values.nullable: + elif preprocess._values.nullable or isinstance(preprocess, (DatetimeIndex, TimedeltaIndex)): output = repr(self._clean_nulls_from_index().to_pandas()) if not isinstance(self, StringIndex): @@ -2097,6 +2097,20 @@ def _get_dt_field(self, field): def is_boolean(self): return False + + def _clean_nulls_from_index(self): + """ + Convert all na values(if any) in Index object + to `` as a preprocessing step to `__repr__` methods. + + This will involve changing type of Index object + to StringIndex but it is the responsibility of the `__repr__` + methods using this method to replace or handle representation + of the actual types correctly. 
+ """ + return cudf.Index( + self._values.astype("str").fillna(cudf._NA_REP), name=self.name + ) @_cudf_nvtx_annotate def ceil(self, freq): @@ -2345,6 +2359,20 @@ def inferred_freq(self): def is_boolean(self): return False + + def _clean_nulls_from_index(self): + """ + Convert all na values(if any) in Index object + to `` as a preprocessing step to `__repr__` methods. + + This will involve changing type of Index object + to StringIndex but it is the responsibility of the `__repr__` + methods using this method to replace or handle representation + of the actual types correctly. + """ + return cudf.Index( + self._values.astype("str").fillna(cudf._NA_REP), name=self.name + ) class CategoricalIndex(GenericIndex): diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 4ab28cab5a0..c1bd0b99a8e 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1206,7 +1206,7 @@ def __repr__(self): and not is_decimal_dtype(preprocess.dtype) and not is_struct_dtype(preprocess.dtype) ) or isinstance( - preprocess._column, cudf.core.column.timedelta.TimeDeltaColumn + preprocess._column, (cudf.core.column.timedelta.TimeDeltaColumn, cudf.core.column.datetime.DatetimeColumn) ): output = repr( preprocess.astype("O").fillna(cudf._NA_REP).to_pandas() diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index c4985639173..b1ff159cca6 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -81,22 +81,60 @@ def test_null_dataframe(ncols): assert pdf_repr.split() == repr(gdf).split() pd.reset_option("display.max_columns") +def _assert_date_series_repr(ps, gs): + """ + This is a utility function to compare pandas & cudf + datetime series repr's. 
+ The repr's differ in the way text is spaced: + + >>> s = cudf.Series([100, 200, 300], dtype='datetime64[ns]') + >>> s + 0 1970-01-01 00:00:00.000000100 + 1 1970-01-01 00:00:00.000000200 + 2 1970-01-01 00:00:00.000000300 + dtype: datetime64[ns] + + >>> s.to_pandas() + 0 1970-01-01 00:00:00.000000100 + 1 1970-01-01 00:00:00.000000200 + 2 1970-01-01 00:00:00.000000300 + dtype: datetime64[ns] + """ + expected_list = repr(ps).split("\n") + actual_list = repr(gs).split("\n") + + new_actual_list = [] + for text in actual_list: + if not text.startswith(" "): + new_actual_list.append(text.replace(" ", " ", 1)) + else: + if "..." in text: + new_actual_list.append(" ".join(text.rsplit(" ", 1))) + else: + new_actual_list.append(text) + # import pdb;pdb.set_trace() + assert expected_list == new_actual_list @pytest.mark.parametrize("dtype", repr_categories) @pytest.mark.parametrize("nrows", [None, 0, 1, 2, 9, 10, 11, 19, 20, 21]) def test_full_series(nrows, dtype): + np.random.seed(0) size = 20 ps = pd.Series(np.random.randint(0, 100, size)).astype(dtype) sr = cudf.from_pandas(ps) pd.options.display.max_rows = nrows - assert repr(ps) == repr(sr) + + if cudf.api.types.is_datetime_dtype(dtype): + _assert_date_series_repr(ps , sr) + else: + assert repr(ps) == repr(sr) pd.reset_option("display.max_rows") @pytest.mark.parametrize("nrows", [5, 10, 15]) @pytest.mark.parametrize("ncols", [5, 10, 15]) @pytest.mark.parametrize("size", [20, 21]) -@pytest.mark.parametrize("dtype", repr_categories) +@pytest.mark.parametrize("dtype", sorted(list(set(repr_categories) - {"datetime64[ns]"}))) def test_full_dataframe_20(dtype, size, nrows, ncols): pdf = pd.DataFrame( {idx: np.random.randint(0, 100, size) for idx in range(size)} @@ -111,6 +149,201 @@ def test_full_dataframe_20(dtype, size, nrows, ncols): assert pdf._repr_latex_() == gdf._repr_latex_() +@pytest.mark.parametrize("nrows,ncols,data,expected_repr",[ + ( + 5, + 5, + {idx: np.arange(0, 100, 5) for idx in range(20)}, + textwrap.dedent( 
+ """ + 0 1 ... \\ +0 1970-01-01 00:00:00.000000000 1970-01-01 00:00:00.000000000 ... +1 1970-01-01 00:00:00.000000005 1970-01-01 00:00:00.000000005 ... +.. ... ... ... +18 1970-01-01 00:00:00.000000090 1970-01-01 00:00:00.000000090 ... +19 1970-01-01 00:00:00.000000095 1970-01-01 00:00:00.000000095 ... + + 18 19 +0 1970-01-01 00:00:00.000000000 1970-01-01 00:00:00.000000000 +1 1970-01-01 00:00:00.000000005 1970-01-01 00:00:00.000000005 +.. ... ... +18 1970-01-01 00:00:00.000000090 1970-01-01 00:00:00.000000090 +19 1970-01-01 00:00:00.000000095 1970-01-01 00:00:00.000000095 + +[20 rows x 20 columns] +""" + )), + (5, 15, + {idx: np.arange(0, 100, 5) for idx in range(20)}, + textwrap.dedent( + """ + 0 1 \\ +0 1970-01-01 00:00:00.000000000 1970-01-01 00:00:00.000000000 +1 1970-01-01 00:00:00.000000005 1970-01-01 00:00:00.000000005 +.. ... ... +18 1970-01-01 00:00:00.000000090 1970-01-01 00:00:00.000000090 +19 1970-01-01 00:00:00.000000095 1970-01-01 00:00:00.000000095 + + 2 3 \\ +0 1970-01-01 00:00:00.000000000 1970-01-01 00:00:00.000000000 +1 1970-01-01 00:00:00.000000005 1970-01-01 00:00:00.000000005 +.. ... ... +18 1970-01-01 00:00:00.000000090 1970-01-01 00:00:00.000000090 +19 1970-01-01 00:00:00.000000095 1970-01-01 00:00:00.000000095 + + 4 5 \\ +0 1970-01-01 00:00:00.000000000 1970-01-01 00:00:00.000000000 +1 1970-01-01 00:00:00.000000005 1970-01-01 00:00:00.000000005 +.. ... ... +18 1970-01-01 00:00:00.000000090 1970-01-01 00:00:00.000000090 +19 1970-01-01 00:00:00.000000095 1970-01-01 00:00:00.000000095 + + 6 ... 13 \\ +0 1970-01-01 00:00:00.000000000 ... 1970-01-01 00:00:00.000000000 +1 1970-01-01 00:00:00.000000005 ... 1970-01-01 00:00:00.000000005 +.. ... ... ... +18 1970-01-01 00:00:00.000000090 ... 1970-01-01 00:00:00.000000090 +19 1970-01-01 00:00:00.000000095 ... 1970-01-01 00:00:00.000000095 + + 14 15 \\ +0 1970-01-01 00:00:00.000000000 1970-01-01 00:00:00.000000000 +1 1970-01-01 00:00:00.000000005 1970-01-01 00:00:00.000000005 +.. ... ... 
+18 1970-01-01 00:00:00.000000090 1970-01-01 00:00:00.000000090 +19 1970-01-01 00:00:00.000000095 1970-01-01 00:00:00.000000095 + + 16 17 \\ +0 1970-01-01 00:00:00.000000000 1970-01-01 00:00:00.000000000 +1 1970-01-01 00:00:00.000000005 1970-01-01 00:00:00.000000005 +.. ... ... +18 1970-01-01 00:00:00.000000090 1970-01-01 00:00:00.000000090 +19 1970-01-01 00:00:00.000000095 1970-01-01 00:00:00.000000095 + + 18 19 +0 1970-01-01 00:00:00.000000000 1970-01-01 00:00:00.000000000 +1 1970-01-01 00:00:00.000000005 1970-01-01 00:00:00.000000005 +.. ... ... +18 1970-01-01 00:00:00.000000090 1970-01-01 00:00:00.000000090 +19 1970-01-01 00:00:00.000000095 1970-01-01 00:00:00.000000095 + +[20 rows x 20 columns] +""" + )), + (15, 15, + {idx: np.arange(0, 100, 5) for idx in range(20)}, + textwrap.dedent( + """ + 0 1 \\ +0 1970-01-01 00:00:00.000000000 1970-01-01 00:00:00.000000000 +1 1970-01-01 00:00:00.000000005 1970-01-01 00:00:00.000000005 +2 1970-01-01 00:00:00.000000010 1970-01-01 00:00:00.000000010 +3 1970-01-01 00:00:00.000000015 1970-01-01 00:00:00.000000015 +4 1970-01-01 00:00:00.000000020 1970-01-01 00:00:00.000000020 +.. ... ... +15 1970-01-01 00:00:00.000000075 1970-01-01 00:00:00.000000075 +16 1970-01-01 00:00:00.000000080 1970-01-01 00:00:00.000000080 +17 1970-01-01 00:00:00.000000085 1970-01-01 00:00:00.000000085 +18 1970-01-01 00:00:00.000000090 1970-01-01 00:00:00.000000090 +19 1970-01-01 00:00:00.000000095 1970-01-01 00:00:00.000000095 + + 2 3 \\ +0 1970-01-01 00:00:00.000000000 1970-01-01 00:00:00.000000000 +1 1970-01-01 00:00:00.000000005 1970-01-01 00:00:00.000000005 +2 1970-01-01 00:00:00.000000010 1970-01-01 00:00:00.000000010 +3 1970-01-01 00:00:00.000000015 1970-01-01 00:00:00.000000015 +4 1970-01-01 00:00:00.000000020 1970-01-01 00:00:00.000000020 +.. ... ... 
+15 1970-01-01 00:00:00.000000075 1970-01-01 00:00:00.000000075 +16 1970-01-01 00:00:00.000000080 1970-01-01 00:00:00.000000080 +17 1970-01-01 00:00:00.000000085 1970-01-01 00:00:00.000000085 +18 1970-01-01 00:00:00.000000090 1970-01-01 00:00:00.000000090 +19 1970-01-01 00:00:00.000000095 1970-01-01 00:00:00.000000095 + + 4 5 \\ +0 1970-01-01 00:00:00.000000000 1970-01-01 00:00:00.000000000 +1 1970-01-01 00:00:00.000000005 1970-01-01 00:00:00.000000005 +2 1970-01-01 00:00:00.000000010 1970-01-01 00:00:00.000000010 +3 1970-01-01 00:00:00.000000015 1970-01-01 00:00:00.000000015 +4 1970-01-01 00:00:00.000000020 1970-01-01 00:00:00.000000020 +.. ... ... +15 1970-01-01 00:00:00.000000075 1970-01-01 00:00:00.000000075 +16 1970-01-01 00:00:00.000000080 1970-01-01 00:00:00.000000080 +17 1970-01-01 00:00:00.000000085 1970-01-01 00:00:00.000000085 +18 1970-01-01 00:00:00.000000090 1970-01-01 00:00:00.000000090 +19 1970-01-01 00:00:00.000000095 1970-01-01 00:00:00.000000095 + + 6 ... 13 \\ +0 1970-01-01 00:00:00.000000000 ... 1970-01-01 00:00:00.000000000 +1 1970-01-01 00:00:00.000000005 ... 1970-01-01 00:00:00.000000005 +2 1970-01-01 00:00:00.000000010 ... 1970-01-01 00:00:00.000000010 +3 1970-01-01 00:00:00.000000015 ... 1970-01-01 00:00:00.000000015 +4 1970-01-01 00:00:00.000000020 ... 1970-01-01 00:00:00.000000020 +.. ... ... ... +15 1970-01-01 00:00:00.000000075 ... 1970-01-01 00:00:00.000000075 +16 1970-01-01 00:00:00.000000080 ... 1970-01-01 00:00:00.000000080 +17 1970-01-01 00:00:00.000000085 ... 1970-01-01 00:00:00.000000085 +18 1970-01-01 00:00:00.000000090 ... 1970-01-01 00:00:00.000000090 +19 1970-01-01 00:00:00.000000095 ... 
1970-01-01 00:00:00.000000095 + + 14 15 \\ +0 1970-01-01 00:00:00.000000000 1970-01-01 00:00:00.000000000 +1 1970-01-01 00:00:00.000000005 1970-01-01 00:00:00.000000005 +2 1970-01-01 00:00:00.000000010 1970-01-01 00:00:00.000000010 +3 1970-01-01 00:00:00.000000015 1970-01-01 00:00:00.000000015 +4 1970-01-01 00:00:00.000000020 1970-01-01 00:00:00.000000020 +.. ... ... +15 1970-01-01 00:00:00.000000075 1970-01-01 00:00:00.000000075 +16 1970-01-01 00:00:00.000000080 1970-01-01 00:00:00.000000080 +17 1970-01-01 00:00:00.000000085 1970-01-01 00:00:00.000000085 +18 1970-01-01 00:00:00.000000090 1970-01-01 00:00:00.000000090 +19 1970-01-01 00:00:00.000000095 1970-01-01 00:00:00.000000095 + + 16 17 \\ +0 1970-01-01 00:00:00.000000000 1970-01-01 00:00:00.000000000 +1 1970-01-01 00:00:00.000000005 1970-01-01 00:00:00.000000005 +2 1970-01-01 00:00:00.000000010 1970-01-01 00:00:00.000000010 +3 1970-01-01 00:00:00.000000015 1970-01-01 00:00:00.000000015 +4 1970-01-01 00:00:00.000000020 1970-01-01 00:00:00.000000020 +.. ... ... +15 1970-01-01 00:00:00.000000075 1970-01-01 00:00:00.000000075 +16 1970-01-01 00:00:00.000000080 1970-01-01 00:00:00.000000080 +17 1970-01-01 00:00:00.000000085 1970-01-01 00:00:00.000000085 +18 1970-01-01 00:00:00.000000090 1970-01-01 00:00:00.000000090 +19 1970-01-01 00:00:00.000000095 1970-01-01 00:00:00.000000095 + + 18 19 +0 1970-01-01 00:00:00.000000000 1970-01-01 00:00:00.000000000 +1 1970-01-01 00:00:00.000000005 1970-01-01 00:00:00.000000005 +2 1970-01-01 00:00:00.000000010 1970-01-01 00:00:00.000000010 +3 1970-01-01 00:00:00.000000015 1970-01-01 00:00:00.000000015 +4 1970-01-01 00:00:00.000000020 1970-01-01 00:00:00.000000020 +.. ... ... 
+15 1970-01-01 00:00:00.000000075 1970-01-01 00:00:00.000000075 +16 1970-01-01 00:00:00.000000080 1970-01-01 00:00:00.000000080 +17 1970-01-01 00:00:00.000000085 1970-01-01 00:00:00.000000085 +18 1970-01-01 00:00:00.000000090 1970-01-01 00:00:00.000000090 +19 1970-01-01 00:00:00.000000095 1970-01-01 00:00:00.000000095 + +[20 rows x 20 columns] +""" + ), + ) +]) +def test_full_datetime_dataframe(nrows, ncols, data, expected_repr): + pdf = pd.DataFrame( + data + ).astype('datetime64[ns]') + gdf = cudf.from_pandas(pdf) + + + with pd.option_context( + "display.max_rows", int(nrows), "display.max_columns", int(ncols) + ): + assert expected_repr.split() == repr(gdf).split() + assert pdf._repr_html_() == gdf._repr_html_() + assert pdf._repr_latex_() == gdf._repr_latex_() + + @given( st.lists( st.integers(-9223372036854775808, 9223372036854775807), @@ -1494,3 +1727,40 @@ def test_repr_struct_after_concat(): pdf = df.to_pandas() assert repr(df) == repr(pdf) + +@pytest.mark.parametrize("ser,expected_repr",[ + ( + cudf.Series( + [ + "1969-12-31 23:59:58.001001", "1839-12-24 03:58:56.000826", "1647-05-20 19:25:03.000638" + ], + dtype="datetime64[us]", + index=["a", "b", "z"], + name="hello", + ), + textwrap.dedent( + """ + a 1969-12-31 23:59:58.001001 + b 1839-12-24 03:58:56.000826 + z 1647-05-20 19:25:03.000638 + Name: hello, dtype: datetime64[us] + """ + ), + ), + ( + cudf.Series(["2499-12-01 01:00:00", "2499-11-01 01:30:00"], dtype="datetime64[s]"), + textwrap.dedent( + """ + 0 2499-12-01 01:00:00 + 1 2499-11-01 01:30:00 + dtype: datetime64[s] + """ + ) + ) + ], +) +def test_datetime_series_repr(ser, expected_repr): + expected = expected_repr + actual = repr(ser) + + assert expected.split() == actual.split() \ No newline at end of file diff --git a/python/cudf/cudf/utils/docutils.py b/python/cudf/cudf/utils/docutils.py index 9f04e30fb28..4982d61161c 100644 --- a/python/cudf/cudf/utils/docutils.py +++ b/python/cudf/cudf/utils/docutils.py @@ -220,18 +220,18 @@ def 
wrapper(func): ... np.datetime64("2010-01-01") ... ]) >>> s - 0 2000-01-01 - 1 2010-01-01 - 2 2010-01-01 + 0 2000-01-01 00:00:00 + 1 2010-01-01 00:00:00 + 2 2010-01-01 00:00:00 dtype: datetime64[s] >>> s.describe() - count 3 - mean 2006-09-01 08:00:00 - min 2000-01-01 00:00:00 - 25% 2004-12-31 12:00:00 - 50% 2010-01-01 00:00:00 - 75% 2010-01-01 00:00:00 - max 2010-01-01 00:00:00 + count 3 + mean 2006-09-01 08:00:00 + min 2000-01-01 00:00:00 + 25% 2004-12-31 12:00:00 + 50% 2010-01-01 00:00:00 + 75% 2010-01-01 00:00:00 + max 2010-01-01 00:00:00 dtype: object Describing a ``DataFrame``. By default only numeric fields are From 579b6b1c6885bf8e53ebcf53f5b753308f1d019a Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 22 Sep 2022 20:38:40 -0700 Subject: [PATCH 02/51] fix repr --- cpp/include/cudf/datetime.hpp | 38 ++++++- cpp/include/cudf/detail/datetime.hpp | 10 ++ cpp/src/datetime/datetime_ops.cu | 52 ++++++++-- python/cudf/cudf/_lib/cpp/datetime.pxd | 2 + python/cudf/cudf/_lib/datetime.pyx | 4 + python/cudf/cudf/core/column/datetime.py | 29 ++++++ python/cudf/cudf/core/column/timedelta.py | 28 +++++ python/cudf/cudf/core/dataframe.py | 34 ++++--- python/cudf/cudf/core/index.py | 31 +++++- python/cudf/cudf/core/series.py | 18 +++- python/cudf/cudf/tests/test_repr.py | 118 ++++++++++++---------- python/cudf/cudf/utils/docutils.py | 6 +- 12 files changed, 289 insertions(+), 81 deletions(-) diff --git a/cpp/include/cudf/datetime.hpp b/cpp/include/cudf/datetime.hpp index 5d86b9f5fb7..633772d9eeb 100644 --- a/cpp/include/cudf/datetime.hpp +++ b/cpp/include/cudf/datetime.hpp @@ -133,11 +133,47 @@ std::unique_ptr extract_second( cudf::column_view const& column, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - +/** + * @brief Extracts millisecond from any date time type and returns an int16_t + * cudf::column. 
+ * + * @param column cudf::column_view of the input datetime values + * @param mr Device memory resource used to allocate device memory of the returned column + * + * @returns cudf::column of the extracted int16_t milliseconds + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + */ std::unique_ptr extract_milli_second( cudf::column_view const& column, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Extracts microsecond from any date time type and returns an int16_t + * cudf::column. + * + * @param column cudf::column_view of the input datetime values + * @param mr Device memory resource used to allocate device memory of the returned column + * + * @returns cudf::column of the extracted int16_t microseconds + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + */ +std::unique_ptr extract_micro_second( + cudf::column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Extracts nanosecond from any date time type and returns an int16_t + * cudf::column. 
+ * + * @param column cudf::column_view of the input datetime values + * @param mr Device memory resource used to allocate device memory of the returned column + * + * @returns cudf::column of the extracted int16_t nanoseconds + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + */ +std::unique_ptr extract_nano_second( + cudf::column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group /** diff --git a/cpp/include/cudf/detail/datetime.hpp b/cpp/include/cudf/detail/datetime.hpp index 40a7276f977..266944f18ff 100644 --- a/cpp/include/cudf/detail/datetime.hpp +++ b/cpp/include/cudf/detail/datetime.hpp @@ -99,6 +99,16 @@ std::unique_ptr extract_milli_second( rmm::cuda_stream_view stream = cudf::default_stream_value, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr extract_micro_second( + cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +std::unique_ptr extract_nano_second( + cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @copydoc cudf::last_day_of_month(cudf::column_view const&, rmm::mr::device_memory_resource *) * diff --git a/cpp/src/datetime/datetime_ops.cu b/cpp/src/datetime/datetime_ops.cu index 5a4f3d02971..419ec251bae 100644 --- a/cpp/src/datetime/datetime_ops.cu +++ b/cpp/src/datetime/datetime_ops.cu @@ -76,9 +76,14 @@ struct extract_component_operator { if (time_since_midnight.count() < 0) { time_since_midnight += days(1); } - auto hrs_ = duration_cast(time_since_midnight); - auto mins_ = duration_cast(time_since_midnight - hrs_); - auto secs_ = duration_cast(time_since_midnight - hrs_ - mins_); + auto hrs_ = duration_cast(time_since_midnight); + auto mins_ = 
duration_cast(time_since_midnight - hrs_); + auto secs_ = duration_cast(time_since_midnight - hrs_ - mins_); + auto millisecs_ = duration_cast(time_since_midnight - hrs_ - mins_ - secs_); + auto microsecs_ = + duration_cast(time_since_midnight - hrs_ - mins_ - secs_ - millisecs_); + auto nanoosecs_ = duration_cast(time_since_midnight - hrs_ - mins_ - secs_ - + millisecs_ - microsecs_); switch (Component) { case datetime_component::YEAR: @@ -92,6 +97,9 @@ struct extract_component_operator { case datetime_component::HOUR: return hrs_.count(); case datetime_component::MINUTE: return mins_.count(); case datetime_component::SECOND: return secs_.count(); + case datetime_component::MILLISECOND: return millisecs_.count(); + case datetime_component::MICROSECOND: return microsecs_.count(); + case datetime_component::NANOSECOND: return nanoosecs_.count(); default: return 0; } } @@ -496,14 +504,32 @@ std::unique_ptr extract_second(column_view const& column, } std::unique_ptr extract_milli_second(column_view const& column, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { return detail::apply_datetime_op< detail::extract_component_operator, cudf::type_id::INT64>(column, stream, mr); } +std::unique_ptr extract_micro_second(column_view const& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return detail::apply_datetime_op< + detail::extract_component_operator, + cudf::type_id::INT64>(column, stream, mr); +} + +std::unique_ptr extract_nano_second(column_view const& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return detail::apply_datetime_op< + detail::extract_component_operator, + cudf::type_id::INT64>(column, stream, mr); +} + std::unique_ptr last_day_of_month(column_view const& column, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) @@ -617,12 +643,26 @@ std::unique_ptr 
extract_second(column_view const& column, } std::unique_ptr extract_milli_second(column_view const& column, - rmm::mr::device_memory_resource* mr) + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); return detail::extract_milli_second(column, cudf::default_stream_value, mr); } +std::unique_ptr extract_micro_second(column_view const& column, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::extract_micro_second(column, cudf::default_stream_value, mr); +} + +std::unique_ptr extract_nano_second(column_view const& column, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::extract_nano_second(column, cudf::default_stream_value, mr); +} + std::unique_ptr last_day_of_month(column_view const& column, rmm::mr::device_memory_resource* mr) { diff --git a/python/cudf/cudf/_lib/cpp/datetime.pxd b/python/cudf/cudf/_lib/cpp/datetime.pxd index fdca87f7b4a..a6af3e0874d 100644 --- a/python/cudf/cudf/_lib/cpp/datetime.pxd +++ b/python/cudf/cudf/_lib/cpp/datetime.pxd @@ -16,6 +16,8 @@ cdef extern from "cudf/datetime.hpp" namespace "cudf::datetime" nogil: cdef unique_ptr[column] extract_minute(const column_view& column) except + cdef unique_ptr[column] extract_second(const column_view& column) except + cdef unique_ptr[column] extract_milli_second(const column_view& column) except + + cdef unique_ptr[column] extract_micro_second(const column_view& column) except + + cdef unique_ptr[column] extract_nano_second(const column_view& column) except + ctypedef enum rounding_frequency "cudf::datetime::rounding_frequency": DAY "cudf::datetime::rounding_frequency::DAY" diff --git a/python/cudf/cudf/_lib/datetime.pyx b/python/cudf/cudf/_lib/datetime.pyx index 1e6b40c5027..a545877428c 100644 --- a/python/cudf/cudf/_lib/datetime.pyx +++ b/python/cudf/cudf/_lib/datetime.pyx @@ -51,6 +51,10 @@ def extract_datetime_component(Column col, object field): c_result = move(libcudf_datetime.extract_second(col_view)) elif field == 
"milli_second": c_result = move(libcudf_datetime.extract_milli_second(col_view)) + elif field == "micro_second": + c_result = move(libcudf_datetime.extract_micro_second(col_view)) + elif field == "nano_second": + c_result = move(libcudf_datetime.extract_nano_second(col_view)) elif field == "day_of_year": c_result = move(libcudf_datetime.day_of_year(col_view)) else: diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 1419b14e8c6..50f09d19543 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -11,6 +11,7 @@ import numpy as np import pandas as pd +from typing_extensions import Self import cudf from cudf import _lib as libcudf @@ -214,6 +215,34 @@ def to_pandas( index=index, ) + def _preprocess_column_for_repr(self): + has_hr = (self.get_dt_field("hour") > 0).any() + has_m = (self.get_dt_field("minute") > 0).any() + has_s = (self.get_dt_field("second") > 0).any() + has_ms = (self.get_dt_field("milli_second") > 0).any() + has_us = (self.get_dt_field("micro_second") > 0).any() + has_ns = (self.get_dt_field("nano_second") > 0).any() + has_ns = (self.get_dt_field("nano_second") > 0).any() + + if has_ns: + preprocess = self.astype("O") + elif has_us: + preprocess = self.astype( + "O", format=_dtype_to_format_conversion.get("datetime64[us]") + ) + elif has_ms: + preprocess = self.astype( + "O", format=_dtype_to_format_conversion.get("datetime64[ms]") + ) + elif has_s or has_m or has_hr: + preprocess = self.astype( + "O", format=_dtype_to_format_conversion.get("datetime64[s]") + ) + else: + preprocess = self.astype("O", format="%Y-%m-%d") + + return preprocess + @property def values(self): """ diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index e6d688014fa..a25713f7b8a 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -250,6 +250,34 @@ def as_numerical(self) -> 
"cudf.core.column.NumericalColumn": def time_unit(self) -> str: return self._time_unit + def _preprocess_column_for_repr(self): + components = self.components() + has_hr = (components.hours > 0).any() + has_m = (components.seconds > 0).any() + has_s = (components.seconds > 0).any() + has_ms = (components.milliseconds > 0).any() + has_us = (components.microseconds > 0).any() + has_ns = (components.nanoseconds > 0).any() + + if has_ns: + preprocess = self.astype("O") + elif has_us: + preprocess = self.astype( + "O", format=_dtype_to_format_conversion.get("timedelta64[us]") + ) + elif has_ms: + preprocess = self.astype( + "O", format=_dtype_to_format_conversion.get("timedelta64[ms]") + ) + elif has_s or has_m or has_hr: + preprocess = self.astype( + "O", format=_dtype_to_format_conversion.get("timedelta64[s]") + ) + else: + preprocess = self.astype("O", format="%D days") + + return preprocess + def fillna( self, fill_value: Any = None, method: str = None, dtype: Dtype = None ) -> TimeDeltaColumn: diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 77bed35fd4b..e6a3da11b80 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -570,12 +570,12 @@ class DataFrame(IndexedFrame, Serializable, GetAttrGetItemMixin): ... [(t0+ timedelta(seconds=x)) for x in range(n)]) ... }) >>> df - id datetimes - 0 0 2018-10-07 12:00:00.000000 - 1 1 2018-10-07 12:00:01.000000 - 2 2 2018-10-07 12:00:02.000000 - 3 3 2018-10-07 12:00:03.000000 - 4 4 2018-10-07 12:00:04.000000 + id datetimes + 0 0 2018-10-07 12:00:00 + 1 1 2018-10-07 12:00:01 + 2 2 2018-10-07 12:00:02 + 3 3 2018-10-07 12:00:03 + 4 4 2018-10-07 12:00:04 Build DataFrame via list of rows as tuples: @@ -1048,8 +1048,8 @@ def dtypes(self): ... 'datetime': [pd.Timestamp('20180310')], ... 
'string': ['foo']}) >>> df - float int datetime string - 0 1.0 1 2018-03-10 00:00:00.000000 foo + float int datetime string + 0 1.0 1 2018-03-10 foo >>> df.dtypes float float64 int int64 @@ -1754,10 +1754,18 @@ def _clean_renderable_dataframe(self, output): width, _ = console.get_console_size() else: width = None - + for col_name, col in output._data.items(): - if isinstance(col, (cudf.core.column.timedelta.TimeDeltaColumn, cudf.core.column.datetime.DatetimeColumn)): - output._data[col_name] = output._data[col_name].astype('str') + if isinstance( + col, + ( + cudf.core.column.timedelta.TimeDeltaColumn, + cudf.core.column.datetime.DatetimeColumn, + ), + ): + output._data[col_name] = output._data[ + col_name + ]._preprocess_column_for_repr() else: output._data[col_name] = output._data[col_name] @@ -3906,8 +3914,8 @@ def query(self, expr, local_dict=None): >>> df['datetimes'] = data >>> search_date = datetime.datetime.strptime('2018-10-08', '%Y-%m-%d') >>> df.query('datetimes==@search_date') - datetimes - 1 2018-10-08 00:00:00 + datetimes + 1 2018-10-08 Using local_dict: diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 9efae2abc2f..0dad86bc423 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1241,9 +1241,18 @@ def __repr__(self): output = repr(preprocess.to_pandas()) output = output.replace("nan", cudf._NA_REP) - elif preprocess._values.nullable or isinstance(preprocess, (DatetimeIndex, TimedeltaIndex)): + elif preprocess._values.nullable or isinstance( + preprocess, (DatetimeIndex, TimedeltaIndex) + ): output = repr(self._clean_nulls_from_index().to_pandas()) + import pdb + pdb.set_trace() + if isinstance(self, (DatetimeIndex, TimedeltaIndex)): + output = ( + output[: output.rfind("categories=[")] + + output[output.rfind(" dtype=") :] + ) if not isinstance(self, StringIndex): # We should remove all the single quotes # from the output due to the type-cast to @@ -1275,6 +1284,12 @@ def 
__repr__(self): else: lines[-1] = lines[-1] + ")" + if isinstance(preprocess, (DatetimeIndex, TimedeltaIndex)): + if len(lines) > 1: + lines[1:-1] = [ + line.replace(" ", "", 1) for line in lines[1:-1] + ] + lines[-1] = lines[-1].replace(" ", "", 1) return "\n".join(lines) @_cudf_nvtx_annotate @@ -2101,7 +2116,7 @@ def _get_dt_field(self, field): def is_boolean(self): return False - + def _clean_nulls_from_index(self): """ Convert all na values(if any) in Index object @@ -2113,7 +2128,10 @@ def _clean_nulls_from_index(self): of the actual types correctly. """ return cudf.Index( - self._values.astype("str").fillna(cudf._NA_REP), name=self.name + self._values._preprocess_column_for_repr() + .fillna(cudf._NA_REP) + .astype("category"), + name=self.name, ) @_cudf_nvtx_annotate @@ -2363,7 +2381,7 @@ def inferred_freq(self): def is_boolean(self): return False - + def _clean_nulls_from_index(self): """ Convert all na values(if any) in Index object @@ -2375,7 +2393,10 @@ def _clean_nulls_from_index(self): of the actual types correctly. 
""" return cudf.Index( - self._values.astype("str").fillna(cudf._NA_REP), name=self.name + self._values._preprocess_column_for_repr() + .fillna(cudf._NA_REP) + .astype("category"), + name=self.name, ) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index c1bd0b99a8e..4711abb7891 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1205,12 +1205,26 @@ def __repr__(self): and not is_struct_dtype(preprocess.dtype) and not is_decimal_dtype(preprocess.dtype) and not is_struct_dtype(preprocess.dtype) - ) or isinstance( - preprocess._column, (cudf.core.column.timedelta.TimeDeltaColumn, cudf.core.column.datetime.DatetimeColumn) ): output = repr( preprocess.astype("O").fillna(cudf._NA_REP).to_pandas() ) + elif isinstance( + preprocess._column, + ( + cudf.core.column.timedelta.TimeDeltaColumn, + cudf.core.column.datetime.DatetimeColumn, + ), + ): + output = repr( + Series( + preprocess._column._preprocess_column_for_repr(), + index=preprocess.index, + name=preprocess.name, + ) + .fillna(cudf._NA_REP) + .to_pandas() + ) elif isinstance( preprocess._column, cudf.core.column.CategoricalColumn ): diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index b1ff159cca6..1cf5ad0d460 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -81,6 +81,7 @@ def test_null_dataframe(ncols): assert pdf_repr.split() == repr(gdf).split() pd.reset_option("display.max_columns") + def _assert_date_series_repr(ps, gs): """ This is a utility function to compare pandas & cudf @@ -115,6 +116,7 @@ def _assert_date_series_repr(ps, gs): # import pdb;pdb.set_trace() assert expected_list == new_actual_list + @pytest.mark.parametrize("dtype", repr_categories) @pytest.mark.parametrize("nrows", [None, 0, 1, 2, 9, 10, 11, 19, 20, 21]) def test_full_series(nrows, dtype): @@ -125,7 +127,7 @@ def test_full_series(nrows, dtype): pd.options.display.max_rows = nrows if 
cudf.api.types.is_datetime_dtype(dtype): - _assert_date_series_repr(ps , sr) + _assert_date_series_repr(ps, sr) else: assert repr(ps) == repr(sr) pd.reset_option("display.max_rows") @@ -134,7 +136,9 @@ def test_full_series(nrows, dtype): @pytest.mark.parametrize("nrows", [5, 10, 15]) @pytest.mark.parametrize("ncols", [5, 10, 15]) @pytest.mark.parametrize("size", [20, 21]) -@pytest.mark.parametrize("dtype", sorted(list(set(repr_categories) - {"datetime64[ns]"}))) +@pytest.mark.parametrize( + "dtype", sorted(list(set(repr_categories) - {"datetime64[ns]"})) +) def test_full_dataframe_20(dtype, size, nrows, ncols): pdf = pd.DataFrame( {idx: np.random.randint(0, 100, size) for idx in range(size)} @@ -149,13 +153,15 @@ def test_full_dataframe_20(dtype, size, nrows, ncols): assert pdf._repr_latex_() == gdf._repr_latex_() -@pytest.mark.parametrize("nrows,ncols,data,expected_repr",[ - ( - 5, - 5, - {idx: np.arange(0, 100, 5) for idx in range(20)}, - textwrap.dedent( - """ +@pytest.mark.parametrize( + "nrows,ncols,data,expected_repr", + [ + ( + 5, + 5, + {idx: np.arange(0, 100, 5) for idx in range(20)}, + textwrap.dedent( + """ 0 1 ... \\ 0 1970-01-01 00:00:00.000000000 1970-01-01 00:00:00.000000000 ... 1 1970-01-01 00:00:00.000000005 1970-01-01 00:00:00.000000005 ... 
@@ -172,11 +178,14 @@ def test_full_dataframe_20(dtype, size, nrows, ncols): [20 rows x 20 columns] """ - )), - (5, 15, - {idx: np.arange(0, 100, 5) for idx in range(20)}, - textwrap.dedent( - """ + ), + ), + ( + 5, + 15, + {idx: np.arange(0, 100, 5) for idx in range(20)}, + textwrap.dedent( + """ 0 1 \\ 0 1970-01-01 00:00:00.000000000 1970-01-01 00:00:00.000000000 1 1970-01-01 00:00:00.000000005 1970-01-01 00:00:00.000000005 @@ -228,11 +237,14 @@ def test_full_dataframe_20(dtype, size, nrows, ncols): [20 rows x 20 columns] """ - )), - (15, 15, - {idx: np.arange(0, 100, 5) for idx in range(20)}, - textwrap.dedent( - """ + ), + ), + ( + 15, + 15, + {idx: np.arange(0, 100, 5) for idx in range(20)}, + textwrap.dedent( + """ 0 1 \\ 0 1970-01-01 00:00:00.000000000 1970-01-01 00:00:00.000000000 1 1970-01-01 00:00:00.000000005 1970-01-01 00:00:00.000000005 @@ -326,15 +338,13 @@ def test_full_dataframe_20(dtype, size, nrows, ncols): [20 rows x 20 columns] """ - ), - ) -]) + ), + ), + ], +) def test_full_datetime_dataframe(nrows, ncols, data, expected_repr): - pdf = pd.DataFrame( - data - ).astype('datetime64[ns]') + pdf = pd.DataFrame(data).astype("datetime64[ns]") gdf = cudf.from_pandas(pdf) - with pd.option_context( "display.max_rows", int(nrows), "display.max_columns", int(ncols) @@ -591,35 +601,33 @@ def test_dataframe_sliced(gdf, slice, max_seq_items, max_rows): ), ( cudf.Index(np.array([10, 20, 30, None], dtype="datetime64[ns]")), - "DatetimeIndex([1970-01-01 00:00:00.000000010, " - "1970-01-01 00:00:00.000000020," - "\n 1970-01-01 00:00:00.000000030, ],\n " - "dtype='datetime64[ns]')", + "DatetimeIndex([1970-01-01 00:00:00.000000010, 1970-01-01 00:00:00.000000020,\n" + " 1970-01-01 00:00:00.000000030, ],\n" + " dtype='datetime64[ns]')", ), ( cudf.Index(np.array([10, 20, 30, None], dtype="datetime64[s]")), - "DatetimeIndex([1970-01-01 00:00:10, " - "1970-01-01 00:00:20, 1970-01-01 00:00:30,\n" - " ],\n dtype='datetime64[s]')", + "DatetimeIndex([1970-01-01 00:00:10, 
1970-01-01 00:00:20, 1970-01-01 00:00:30,\n" + " ],\n" + " dtype='datetime64[s]')", ), ( cudf.Index(np.array([10, 20, 30, None], dtype="datetime64[us]")), - "DatetimeIndex([1970-01-01 00:00:00.000010, " - "1970-01-01 00:00:00.000020,\n " - "1970-01-01 00:00:00.000030, ],\n " - "dtype='datetime64[us]')", + "DatetimeIndex([1970-01-01 00:00:00.000010, 1970-01-01 00:00:00.000020,\n" + " 1970-01-01 00:00:00.000030, ],\n" + " dtype='datetime64[us]')", ), ( cudf.Index(np.array([10, 20, 30, None], dtype="datetime64[ms]")), - "DatetimeIndex([1970-01-01 00:00:00.010, " - "1970-01-01 00:00:00.020,\n " - "1970-01-01 00:00:00.030, ],\n " - "dtype='datetime64[ms]')", + "DatetimeIndex([1970-01-01 00:00:00.010, 1970-01-01 00:00:00.020,\n" + " 1970-01-01 00:00:00.030, ],\n" + " dtype='datetime64[ms]')", ), ( cudf.Index(np.array([None] * 10, dtype="datetime64[ms]")), - "DatetimeIndex([, , , , , , , , " - ",\n ],\n dtype='datetime64[ms]')", + "DatetimeIndex([, , , , , , , , ,\n" + " ],\n" + " dtype='datetime64[ms]')", ), ], ) @@ -1285,8 +1293,8 @@ def test_timedelta_dataframe_repr(df, expected_repr): [ ( cudf.Index([1000000, 200000, 3000000], dtype="timedelta64[ms]"), - "TimedeltaIndex(['0 days 00:16:40', " - "'0 days 00:03:20', '0 days 00:50:00'], " + "TimedeltaIndex([0 days 00:16:40, " + "0 days 00:03:20, 0 days 00:50:00], " "dtype='timedelta64[ms]')", ), ( @@ -1728,11 +1736,16 @@ def test_repr_struct_after_concat(): assert repr(df) == repr(pdf) -@pytest.mark.parametrize("ser,expected_repr",[ + +@pytest.mark.parametrize( + "ser,expected_repr", + [ ( cudf.Series( [ - "1969-12-31 23:59:58.001001", "1839-12-24 03:58:56.000826", "1647-05-20 19:25:03.000638" + "1969-12-31 23:59:58.001001", + "1839-12-24 03:58:56.000826", + "1647-05-20 19:25:03.000638", ], dtype="datetime64[us]", index=["a", "b", "z"], @@ -1748,19 +1761,22 @@ def test_repr_struct_after_concat(): ), ), ( - cudf.Series(["2499-12-01 01:00:00", "2499-11-01 01:30:00"], dtype="datetime64[s]"), + cudf.Series( + ["2499-12-01 
01:00:00", "2499-11-01 01:30:00"], + dtype="datetime64[s]", + ), textwrap.dedent( """ 0 2499-12-01 01:00:00 1 2499-11-01 01:30:00 dtype: datetime64[s] """ - ) - ) + ), + ), ], ) def test_datetime_series_repr(ser, expected_repr): expected = expected_repr actual = repr(ser) - assert expected.split() == actual.split() \ No newline at end of file + assert expected.split() == actual.split() diff --git a/python/cudf/cudf/utils/docutils.py b/python/cudf/cudf/utils/docutils.py index 4982d61161c..c373aa0b127 100644 --- a/python/cudf/cudf/utils/docutils.py +++ b/python/cudf/cudf/utils/docutils.py @@ -220,9 +220,9 @@ def wrapper(func): ... np.datetime64("2010-01-01") ... ]) >>> s - 0 2000-01-01 00:00:00 - 1 2010-01-01 00:00:00 - 2 2010-01-01 00:00:00 + 0 2000-01-01 + 1 2010-01-01 + 2 2010-01-01 dtype: datetime64[s] >>> s.describe() count 3 From c6250180b03547b77213d864afad350e7567a7a5 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Fri, 23 Sep 2022 10:06:08 -0700 Subject: [PATCH 03/51] fix --- python/cudf/cudf/core/index.py | 21 +- python/cudf/cudf/tests/test_repr.py | 316 ++++++++++++++-------------- 2 files changed, 172 insertions(+), 165 deletions(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 0dad86bc423..7f64535e37c 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1244,16 +1244,16 @@ def __repr__(self): elif preprocess._values.nullable or isinstance( preprocess, (DatetimeIndex, TimedeltaIndex) ): - output = repr(self._clean_nulls_from_index().to_pandas()) - import pdb + output = repr(preprocess._clean_nulls_from_index().to_pandas()) - pdb.set_trace() if isinstance(self, (DatetimeIndex, TimedeltaIndex)): output = ( output[: output.rfind("categories=[")] + output[output.rfind(" dtype=") :] ) - if not isinstance(self, StringIndex): + if not isinstance( + self, (StringIndex, DatetimeIndex, TimedeltaIndex) + ): # We should remove all the single quotes # from the output due to the type-cast to # 
object dtype happening above. @@ -1285,11 +1285,14 @@ def __repr__(self): lines[-1] = lines[-1] + ")" if isinstance(preprocess, (DatetimeIndex, TimedeltaIndex)): + replace_spaces = ( + " " if isinstance(preprocess, DatetimeIndex) else " " + ) if len(lines) > 1: lines[1:-1] = [ - line.replace(" ", "", 1) for line in lines[1:-1] + line.replace(replace_spaces, "", 1) for line in lines[1:-1] ] - lines[-1] = lines[-1].replace(" ", "", 1) + lines[-1] = lines[-1].replace(replace_spaces + " ", "", 1) return "\n".join(lines) @_cudf_nvtx_annotate @@ -2020,9 +2023,9 @@ def day_of_year(self): ... "2017-01-08", freq="D")) >>> datetime_index DatetimeIndex(['2016-12-31', '2017-01-01', '2017-01-02', '2017-01-03', - '2017-01-04', '2017-01-05', '2017-01-06', '2017-01-07', - '2017-01-08'], - dtype='datetime64[ns]') + '2017-01-04', '2017-01-05', '2017-01-06', '2017-01-07', + '2017-01-08'], + dtype='datetime64[ns]') >>> datetime_index.day_of_year Int16Index([366, 1, 2, 3, 4, 5, 6, 7, 8], dtype='int16') """ diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index 1cf5ad0d460..be68b8eca5f 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -163,18 +163,18 @@ def test_full_dataframe_20(dtype, size, nrows, ncols): textwrap.dedent( """ 0 1 ... \\ -0 1970-01-01 00:00:00.000000000 1970-01-01 00:00:00.000000000 ... -1 1970-01-01 00:00:00.000000005 1970-01-01 00:00:00.000000005 ... -.. ... ... ... -18 1970-01-01 00:00:00.000000090 1970-01-01 00:00:00.000000090 ... -19 1970-01-01 00:00:00.000000095 1970-01-01 00:00:00.000000095 ... - - 18 19 -0 1970-01-01 00:00:00.000000000 1970-01-01 00:00:00.000000000 -1 1970-01-01 00:00:00.000000005 1970-01-01 00:00:00.000000005 -.. ... ... -18 1970-01-01 00:00:00.000000090 1970-01-01 00:00:00.000000090 -19 1970-01-01 00:00:00.000000095 1970-01-01 00:00:00.000000095 +0 1970-01-01 00:00:00.000000000 1970-01-01 00:00:00.000000000 ... 
+1 1970-01-01 00:00:00.000000005 1970-01-01 00:00:00.000000005 ... +.. ... ... ... +18 1970-01-01 00:00:00.000000090 1970-01-01 00:00:00.000000090 ... +19 1970-01-01 00:00:00.000000095 1970-01-01 00:00:00.000000095 ... + + 18 19 +0 1970-01-01 00:00:00.000000000 1970-01-01 00:00:00.000000000 +1 1970-01-01 00:00:00.000000005 1970-01-01 00:00:00.000000005 +.. ... ... +18 1970-01-01 00:00:00.000000090 1970-01-01 00:00:00.000000090 +19 1970-01-01 00:00:00.000000095 1970-01-01 00:00:00.000000095 [20 rows x 20 columns] """ @@ -187,53 +187,53 @@ def test_full_dataframe_20(dtype, size, nrows, ncols): textwrap.dedent( """ 0 1 \\ -0 1970-01-01 00:00:00.000000000 1970-01-01 00:00:00.000000000 -1 1970-01-01 00:00:00.000000005 1970-01-01 00:00:00.000000005 -.. ... ... -18 1970-01-01 00:00:00.000000090 1970-01-01 00:00:00.000000090 -19 1970-01-01 00:00:00.000000095 1970-01-01 00:00:00.000000095 +0 1970-01-01 00:00:00.000000000 1970-01-01 00:00:00.000000000 +1 1970-01-01 00:00:00.000000005 1970-01-01 00:00:00.000000005 +.. ... ... +18 1970-01-01 00:00:00.000000090 1970-01-01 00:00:00.000000090 +19 1970-01-01 00:00:00.000000095 1970-01-01 00:00:00.000000095 2 3 \\ -0 1970-01-01 00:00:00.000000000 1970-01-01 00:00:00.000000000 -1 1970-01-01 00:00:00.000000005 1970-01-01 00:00:00.000000005 -.. ... ... -18 1970-01-01 00:00:00.000000090 1970-01-01 00:00:00.000000090 -19 1970-01-01 00:00:00.000000095 1970-01-01 00:00:00.000000095 +0 1970-01-01 00:00:00.000000000 1970-01-01 00:00:00.000000000 +1 1970-01-01 00:00:00.000000005 1970-01-01 00:00:00.000000005 +.. ... ... +18 1970-01-01 00:00:00.000000090 1970-01-01 00:00:00.000000090 +19 1970-01-01 00:00:00.000000095 1970-01-01 00:00:00.000000095 4 5 \\ -0 1970-01-01 00:00:00.000000000 1970-01-01 00:00:00.000000000 -1 1970-01-01 00:00:00.000000005 1970-01-01 00:00:00.000000005 -.. ... ... 
-18 1970-01-01 00:00:00.000000090 1970-01-01 00:00:00.000000090 -19 1970-01-01 00:00:00.000000095 1970-01-01 00:00:00.000000095 +0 1970-01-01 00:00:00.000000000 1970-01-01 00:00:00.000000000 +1 1970-01-01 00:00:00.000000005 1970-01-01 00:00:00.000000005 +.. ... ... +18 1970-01-01 00:00:00.000000090 1970-01-01 00:00:00.000000090 +19 1970-01-01 00:00:00.000000095 1970-01-01 00:00:00.000000095 6 ... 13 \\ -0 1970-01-01 00:00:00.000000000 ... 1970-01-01 00:00:00.000000000 -1 1970-01-01 00:00:00.000000005 ... 1970-01-01 00:00:00.000000005 -.. ... ... ... -18 1970-01-01 00:00:00.000000090 ... 1970-01-01 00:00:00.000000090 -19 1970-01-01 00:00:00.000000095 ... 1970-01-01 00:00:00.000000095 +0 1970-01-01 00:00:00.000000000 ... 1970-01-01 00:00:00.000000000 +1 1970-01-01 00:00:00.000000005 ... 1970-01-01 00:00:00.000000005 +.. ... ... ... +18 1970-01-01 00:00:00.000000090 ... 1970-01-01 00:00:00.000000090 +19 1970-01-01 00:00:00.000000095 ... 1970-01-01 00:00:00.000000095 14 15 \\ -0 1970-01-01 00:00:00.000000000 1970-01-01 00:00:00.000000000 -1 1970-01-01 00:00:00.000000005 1970-01-01 00:00:00.000000005 -.. ... ... -18 1970-01-01 00:00:00.000000090 1970-01-01 00:00:00.000000090 -19 1970-01-01 00:00:00.000000095 1970-01-01 00:00:00.000000095 +0 1970-01-01 00:00:00.000000000 1970-01-01 00:00:00.000000000 +1 1970-01-01 00:00:00.000000005 1970-01-01 00:00:00.000000005 +.. ... ... +18 1970-01-01 00:00:00.000000090 1970-01-01 00:00:00.000000090 +19 1970-01-01 00:00:00.000000095 1970-01-01 00:00:00.000000095 16 17 \\ -0 1970-01-01 00:00:00.000000000 1970-01-01 00:00:00.000000000 -1 1970-01-01 00:00:00.000000005 1970-01-01 00:00:00.000000005 -.. ... ... -18 1970-01-01 00:00:00.000000090 1970-01-01 00:00:00.000000090 -19 1970-01-01 00:00:00.000000095 1970-01-01 00:00:00.000000095 - - 18 19 -0 1970-01-01 00:00:00.000000000 1970-01-01 00:00:00.000000000 -1 1970-01-01 00:00:00.000000005 1970-01-01 00:00:00.000000005 -.. ... ... 
-18 1970-01-01 00:00:00.000000090 1970-01-01 00:00:00.000000090 -19 1970-01-01 00:00:00.000000095 1970-01-01 00:00:00.000000095 +0 1970-01-01 00:00:00.000000000 1970-01-01 00:00:00.000000000 +1 1970-01-01 00:00:00.000000005 1970-01-01 00:00:00.000000005 +.. ... ... +18 1970-01-01 00:00:00.000000090 1970-01-01 00:00:00.000000090 +19 1970-01-01 00:00:00.000000095 1970-01-01 00:00:00.000000095 + + 18 19 +0 1970-01-01 00:00:00.000000000 1970-01-01 00:00:00.000000000 +1 1970-01-01 00:00:00.000000005 1970-01-01 00:00:00.000000005 +.. ... ... +18 1970-01-01 00:00:00.000000090 1970-01-01 00:00:00.000000090 +19 1970-01-01 00:00:00.000000095 1970-01-01 00:00:00.000000095 [20 rows x 20 columns] """ @@ -246,95 +246,95 @@ def test_full_dataframe_20(dtype, size, nrows, ncols): textwrap.dedent( """ 0 1 \\ -0 1970-01-01 00:00:00.000000000 1970-01-01 00:00:00.000000000 -1 1970-01-01 00:00:00.000000005 1970-01-01 00:00:00.000000005 -2 1970-01-01 00:00:00.000000010 1970-01-01 00:00:00.000000010 -3 1970-01-01 00:00:00.000000015 1970-01-01 00:00:00.000000015 -4 1970-01-01 00:00:00.000000020 1970-01-01 00:00:00.000000020 -.. ... ... -15 1970-01-01 00:00:00.000000075 1970-01-01 00:00:00.000000075 -16 1970-01-01 00:00:00.000000080 1970-01-01 00:00:00.000000080 -17 1970-01-01 00:00:00.000000085 1970-01-01 00:00:00.000000085 -18 1970-01-01 00:00:00.000000090 1970-01-01 00:00:00.000000090 -19 1970-01-01 00:00:00.000000095 1970-01-01 00:00:00.000000095 +0 1970-01-01 00:00:00.000000000 1970-01-01 00:00:00.000000000 +1 1970-01-01 00:00:00.000000005 1970-01-01 00:00:00.000000005 +2 1970-01-01 00:00:00.000000010 1970-01-01 00:00:00.000000010 +3 1970-01-01 00:00:00.000000015 1970-01-01 00:00:00.000000015 +4 1970-01-01 00:00:00.000000020 1970-01-01 00:00:00.000000020 +.. ... ... 
+15 1970-01-01 00:00:00.000000075 1970-01-01 00:00:00.000000075 +16 1970-01-01 00:00:00.000000080 1970-01-01 00:00:00.000000080 +17 1970-01-01 00:00:00.000000085 1970-01-01 00:00:00.000000085 +18 1970-01-01 00:00:00.000000090 1970-01-01 00:00:00.000000090 +19 1970-01-01 00:00:00.000000095 1970-01-01 00:00:00.000000095 2 3 \\ -0 1970-01-01 00:00:00.000000000 1970-01-01 00:00:00.000000000 -1 1970-01-01 00:00:00.000000005 1970-01-01 00:00:00.000000005 -2 1970-01-01 00:00:00.000000010 1970-01-01 00:00:00.000000010 -3 1970-01-01 00:00:00.000000015 1970-01-01 00:00:00.000000015 -4 1970-01-01 00:00:00.000000020 1970-01-01 00:00:00.000000020 -.. ... ... -15 1970-01-01 00:00:00.000000075 1970-01-01 00:00:00.000000075 -16 1970-01-01 00:00:00.000000080 1970-01-01 00:00:00.000000080 -17 1970-01-01 00:00:00.000000085 1970-01-01 00:00:00.000000085 -18 1970-01-01 00:00:00.000000090 1970-01-01 00:00:00.000000090 -19 1970-01-01 00:00:00.000000095 1970-01-01 00:00:00.000000095 +0 1970-01-01 00:00:00.000000000 1970-01-01 00:00:00.000000000 +1 1970-01-01 00:00:00.000000005 1970-01-01 00:00:00.000000005 +2 1970-01-01 00:00:00.000000010 1970-01-01 00:00:00.000000010 +3 1970-01-01 00:00:00.000000015 1970-01-01 00:00:00.000000015 +4 1970-01-01 00:00:00.000000020 1970-01-01 00:00:00.000000020 +.. ... ... +15 1970-01-01 00:00:00.000000075 1970-01-01 00:00:00.000000075 +16 1970-01-01 00:00:00.000000080 1970-01-01 00:00:00.000000080 +17 1970-01-01 00:00:00.000000085 1970-01-01 00:00:00.000000085 +18 1970-01-01 00:00:00.000000090 1970-01-01 00:00:00.000000090 +19 1970-01-01 00:00:00.000000095 1970-01-01 00:00:00.000000095 4 5 \\ -0 1970-01-01 00:00:00.000000000 1970-01-01 00:00:00.000000000 -1 1970-01-01 00:00:00.000000005 1970-01-01 00:00:00.000000005 -2 1970-01-01 00:00:00.000000010 1970-01-01 00:00:00.000000010 -3 1970-01-01 00:00:00.000000015 1970-01-01 00:00:00.000000015 -4 1970-01-01 00:00:00.000000020 1970-01-01 00:00:00.000000020 -.. ... ... 
-15 1970-01-01 00:00:00.000000075 1970-01-01 00:00:00.000000075 -16 1970-01-01 00:00:00.000000080 1970-01-01 00:00:00.000000080 -17 1970-01-01 00:00:00.000000085 1970-01-01 00:00:00.000000085 -18 1970-01-01 00:00:00.000000090 1970-01-01 00:00:00.000000090 -19 1970-01-01 00:00:00.000000095 1970-01-01 00:00:00.000000095 +0 1970-01-01 00:00:00.000000000 1970-01-01 00:00:00.000000000 +1 1970-01-01 00:00:00.000000005 1970-01-01 00:00:00.000000005 +2 1970-01-01 00:00:00.000000010 1970-01-01 00:00:00.000000010 +3 1970-01-01 00:00:00.000000015 1970-01-01 00:00:00.000000015 +4 1970-01-01 00:00:00.000000020 1970-01-01 00:00:00.000000020 +.. ... ... +15 1970-01-01 00:00:00.000000075 1970-01-01 00:00:00.000000075 +16 1970-01-01 00:00:00.000000080 1970-01-01 00:00:00.000000080 +17 1970-01-01 00:00:00.000000085 1970-01-01 00:00:00.000000085 +18 1970-01-01 00:00:00.000000090 1970-01-01 00:00:00.000000090 +19 1970-01-01 00:00:00.000000095 1970-01-01 00:00:00.000000095 6 ... 13 \\ -0 1970-01-01 00:00:00.000000000 ... 1970-01-01 00:00:00.000000000 -1 1970-01-01 00:00:00.000000005 ... 1970-01-01 00:00:00.000000005 -2 1970-01-01 00:00:00.000000010 ... 1970-01-01 00:00:00.000000010 -3 1970-01-01 00:00:00.000000015 ... 1970-01-01 00:00:00.000000015 -4 1970-01-01 00:00:00.000000020 ... 1970-01-01 00:00:00.000000020 -.. ... ... ... -15 1970-01-01 00:00:00.000000075 ... 1970-01-01 00:00:00.000000075 -16 1970-01-01 00:00:00.000000080 ... 1970-01-01 00:00:00.000000080 -17 1970-01-01 00:00:00.000000085 ... 1970-01-01 00:00:00.000000085 -18 1970-01-01 00:00:00.000000090 ... 1970-01-01 00:00:00.000000090 -19 1970-01-01 00:00:00.000000095 ... 1970-01-01 00:00:00.000000095 +0 1970-01-01 00:00:00.000000000 ... 1970-01-01 00:00:00.000000000 +1 1970-01-01 00:00:00.000000005 ... 1970-01-01 00:00:00.000000005 +2 1970-01-01 00:00:00.000000010 ... 1970-01-01 00:00:00.000000010 +3 1970-01-01 00:00:00.000000015 ... 1970-01-01 00:00:00.000000015 +4 1970-01-01 00:00:00.000000020 ... 
1970-01-01 00:00:00.000000020 +.. ... ... ... +15 1970-01-01 00:00:00.000000075 ... 1970-01-01 00:00:00.000000075 +16 1970-01-01 00:00:00.000000080 ... 1970-01-01 00:00:00.000000080 +17 1970-01-01 00:00:00.000000085 ... 1970-01-01 00:00:00.000000085 +18 1970-01-01 00:00:00.000000090 ... 1970-01-01 00:00:00.000000090 +19 1970-01-01 00:00:00.000000095 ... 1970-01-01 00:00:00.000000095 14 15 \\ -0 1970-01-01 00:00:00.000000000 1970-01-01 00:00:00.000000000 -1 1970-01-01 00:00:00.000000005 1970-01-01 00:00:00.000000005 -2 1970-01-01 00:00:00.000000010 1970-01-01 00:00:00.000000010 -3 1970-01-01 00:00:00.000000015 1970-01-01 00:00:00.000000015 -4 1970-01-01 00:00:00.000000020 1970-01-01 00:00:00.000000020 -.. ... ... -15 1970-01-01 00:00:00.000000075 1970-01-01 00:00:00.000000075 -16 1970-01-01 00:00:00.000000080 1970-01-01 00:00:00.000000080 -17 1970-01-01 00:00:00.000000085 1970-01-01 00:00:00.000000085 -18 1970-01-01 00:00:00.000000090 1970-01-01 00:00:00.000000090 -19 1970-01-01 00:00:00.000000095 1970-01-01 00:00:00.000000095 +0 1970-01-01 00:00:00.000000000 1970-01-01 00:00:00.000000000 +1 1970-01-01 00:00:00.000000005 1970-01-01 00:00:00.000000005 +2 1970-01-01 00:00:00.000000010 1970-01-01 00:00:00.000000010 +3 1970-01-01 00:00:00.000000015 1970-01-01 00:00:00.000000015 +4 1970-01-01 00:00:00.000000020 1970-01-01 00:00:00.000000020 +.. ... ... 
+15 1970-01-01 00:00:00.000000075 1970-01-01 00:00:00.000000075 +16 1970-01-01 00:00:00.000000080 1970-01-01 00:00:00.000000080 +17 1970-01-01 00:00:00.000000085 1970-01-01 00:00:00.000000085 +18 1970-01-01 00:00:00.000000090 1970-01-01 00:00:00.000000090 +19 1970-01-01 00:00:00.000000095 1970-01-01 00:00:00.000000095 16 17 \\ -0 1970-01-01 00:00:00.000000000 1970-01-01 00:00:00.000000000 -1 1970-01-01 00:00:00.000000005 1970-01-01 00:00:00.000000005 -2 1970-01-01 00:00:00.000000010 1970-01-01 00:00:00.000000010 -3 1970-01-01 00:00:00.000000015 1970-01-01 00:00:00.000000015 -4 1970-01-01 00:00:00.000000020 1970-01-01 00:00:00.000000020 -.. ... ... -15 1970-01-01 00:00:00.000000075 1970-01-01 00:00:00.000000075 -16 1970-01-01 00:00:00.000000080 1970-01-01 00:00:00.000000080 -17 1970-01-01 00:00:00.000000085 1970-01-01 00:00:00.000000085 -18 1970-01-01 00:00:00.000000090 1970-01-01 00:00:00.000000090 -19 1970-01-01 00:00:00.000000095 1970-01-01 00:00:00.000000095 - - 18 19 -0 1970-01-01 00:00:00.000000000 1970-01-01 00:00:00.000000000 -1 1970-01-01 00:00:00.000000005 1970-01-01 00:00:00.000000005 -2 1970-01-01 00:00:00.000000010 1970-01-01 00:00:00.000000010 -3 1970-01-01 00:00:00.000000015 1970-01-01 00:00:00.000000015 -4 1970-01-01 00:00:00.000000020 1970-01-01 00:00:00.000000020 -.. ... ... -15 1970-01-01 00:00:00.000000075 1970-01-01 00:00:00.000000075 -16 1970-01-01 00:00:00.000000080 1970-01-01 00:00:00.000000080 -17 1970-01-01 00:00:00.000000085 1970-01-01 00:00:00.000000085 -18 1970-01-01 00:00:00.000000090 1970-01-01 00:00:00.000000090 -19 1970-01-01 00:00:00.000000095 1970-01-01 00:00:00.000000095 +0 1970-01-01 00:00:00.000000000 1970-01-01 00:00:00.000000000 +1 1970-01-01 00:00:00.000000005 1970-01-01 00:00:00.000000005 +2 1970-01-01 00:00:00.000000010 1970-01-01 00:00:00.000000010 +3 1970-01-01 00:00:00.000000015 1970-01-01 00:00:00.000000015 +4 1970-01-01 00:00:00.000000020 1970-01-01 00:00:00.000000020 +.. ... ... 
+15 1970-01-01 00:00:00.000000075 1970-01-01 00:00:00.000000075 +16 1970-01-01 00:00:00.000000080 1970-01-01 00:00:00.000000080 +17 1970-01-01 00:00:00.000000085 1970-01-01 00:00:00.000000085 +18 1970-01-01 00:00:00.000000090 1970-01-01 00:00:00.000000090 +19 1970-01-01 00:00:00.000000095 1970-01-01 00:00:00.000000095 + + 18 19 +0 1970-01-01 00:00:00.000000000 1970-01-01 00:00:00.000000000 +1 1970-01-01 00:00:00.000000005 1970-01-01 00:00:00.000000005 +2 1970-01-01 00:00:00.000000010 1970-01-01 00:00:00.000000010 +3 1970-01-01 00:00:00.000000015 1970-01-01 00:00:00.000000015 +4 1970-01-01 00:00:00.000000020 1970-01-01 00:00:00.000000020 +.. ... ... +15 1970-01-01 00:00:00.000000075 1970-01-01 00:00:00.000000075 +16 1970-01-01 00:00:00.000000080 1970-01-01 00:00:00.000000080 +17 1970-01-01 00:00:00.000000085 1970-01-01 00:00:00.000000085 +18 1970-01-01 00:00:00.000000090 1970-01-01 00:00:00.000000090 +19 1970-01-01 00:00:00.000000095 1970-01-01 00:00:00.000000095 [20 rows x 20 columns] """ @@ -601,33 +601,36 @@ def test_dataframe_sliced(gdf, slice, max_seq_items, max_rows): ), ( cudf.Index(np.array([10, 20, 30, None], dtype="datetime64[ns]")), - "DatetimeIndex([1970-01-01 00:00:00.000000010, 1970-01-01 00:00:00.000000020,\n" - " 1970-01-01 00:00:00.000000030, ],\n" - " dtype='datetime64[ns]')", + "DatetimeIndex(['1970-01-01 00:00:00.000000010',\n" + " '1970-01-01 00:00:00.000000020',\n" + " '1970-01-01 00:00:00.000000030', ''],\n" + " dtype='datetime64[ns]')", ), ( cudf.Index(np.array([10, 20, 30, None], dtype="datetime64[s]")), - "DatetimeIndex([1970-01-01 00:00:10, 1970-01-01 00:00:20, 1970-01-01 00:00:30,\n" - " ],\n" - " dtype='datetime64[s]')", + "DatetimeIndex(['1970-01-01 00:00:10', '1970-01-01 00:00:20',\n" + " '1970-01-01 00:00:30', ''],\n" + " dtype='datetime64[s]')", ), ( cudf.Index(np.array([10, 20, 30, None], dtype="datetime64[us]")), - "DatetimeIndex([1970-01-01 00:00:00.000010, 1970-01-01 00:00:00.000020,\n" - " 1970-01-01 00:00:00.000030, ],\n" - " 
dtype='datetime64[us]')", + "DatetimeIndex(['1970-01-01 00:00:00.000010', " + "'1970-01-01 00:00:00.000020',\n" + " '1970-01-01 00:00:00.000030', ''],\n" + " dtype='datetime64[us]')", ), ( cudf.Index(np.array([10, 20, 30, None], dtype="datetime64[ms]")), - "DatetimeIndex([1970-01-01 00:00:00.010, 1970-01-01 00:00:00.020,\n" - " 1970-01-01 00:00:00.030, ],\n" - " dtype='datetime64[ms]')", + "DatetimeIndex(['1970-01-01 00:00:00.010', " + "'1970-01-01 00:00:00.020',\n " + " '1970-01-01 00:00:00.030', ''],\n" + " dtype='datetime64[ms]')", ), ( cudf.Index(np.array([None] * 10, dtype="datetime64[ms]")), - "DatetimeIndex([, , , , , , , , ,\n" - " ],\n" - " dtype='datetime64[ms]')", + "DatetimeIndex(['', '', '', '', '', '', " + "'',\n '', '', ''],\n" + " dtype='datetime64[ms]')", ), ], ) @@ -1293,15 +1296,14 @@ def test_timedelta_dataframe_repr(df, expected_repr): [ ( cudf.Index([1000000, 200000, 3000000], dtype="timedelta64[ms]"), - "TimedeltaIndex([0 days 00:16:40, " - "0 days 00:03:20, 0 days 00:50:00], " - "dtype='timedelta64[ms]')", + "TimedeltaIndex(['0 days 00:16:40', '0 days 00:03:20', " + "'0 days 00:50:00'], dtype='timedelta64[ms]')", ), ( cudf.Index( [None, None, None, None, None], dtype="timedelta64[us]" ), - "TimedeltaIndex([, , , , ], " + "TimedeltaIndex(['', '', '', '', ''], " "dtype='timedelta64[us]')", ), ( @@ -1317,11 +1319,12 @@ def test_timedelta_dataframe_repr(df, expected_repr): ], dtype="timedelta64[us]", ), - "TimedeltaIndex([0 days 00:02:16.457654, , " - "0 days 00:04:05.345345, " - "0 days 00:03:43.432411, ," - " 0 days 01:00:34.548734, 0 days 00:00:00.023234]," - " dtype='timedelta64[us]')", + "TimedeltaIndex(['0 days 00:02:16.457654', '', " + "'0 days 00:04:05.345345',\n" + " '0 days 00:03:43.432411', '', " + "'0 days 01:00:34.548734',\n" + " '0 days 00:00:00.023234'],\n" + " dtype='timedelta64[us]')", ), ( cudf.Index( @@ -1336,10 +1339,11 @@ def test_timedelta_dataframe_repr(df, expected_repr): ], dtype="timedelta64[s]", ), - 
"TimedeltaIndex([1579 days 08:54:14, , 2839 days 15:29:05," - " 2586 days 00:33:31, , 42066 days 12:52:14, " - "0 days 06:27:14]," - " dtype='timedelta64[s]')", + "TimedeltaIndex(['1579 days 08:54:14', '', " + "'2839 days 15:29:05',\n" + " '2586 days 00:33:31', '', " + "'42066 days 12:52:14',\n '0 days 06:27:14'],\n" + " dtype='timedelta64[s]')", ), ], ) From 148d0f872c797e1084cbd22bdacc92d00f5d05df Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 23 Sep 2022 12:19:53 -0500 Subject: [PATCH 04/51] Update python/cudf/cudf/core/column/datetime.py --- python/cudf/cudf/core/column/datetime.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 50f09d19543..b2ab165d4e4 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -11,7 +11,6 @@ import numpy as np import pandas as pd -from typing_extensions import Self import cudf from cudf import _lib as libcudf From 97debb2f28f5647ef17e8ded3d95abaaab645fc0 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Fri, 23 Sep 2022 10:43:38 -0700 Subject: [PATCH 05/51] flake 8 --- python/cudf/cudf/_lib/cpp/datetime.pxd | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/_lib/cpp/datetime.pxd b/python/cudf/cudf/_lib/cpp/datetime.pxd index a6af3e0874d..5951ca34de3 100644 --- a/python/cudf/cudf/_lib/cpp/datetime.pxd +++ b/python/cudf/cudf/_lib/cpp/datetime.pxd @@ -15,9 +15,15 @@ cdef extern from "cudf/datetime.hpp" namespace "cudf::datetime" nogil: cdef unique_ptr[column] extract_hour(const column_view& column) except + cdef unique_ptr[column] extract_minute(const column_view& column) except + cdef unique_ptr[column] extract_second(const column_view& column) except + - cdef unique_ptr[column] extract_milli_second(const column_view& column) except + - cdef unique_ptr[column] extract_micro_second(const column_view& column) except + - cdef 
unique_ptr[column] extract_nano_second(const column_view& column) except + + cdef unique_ptr[column] extract_milli_second( + const column_view& column + ) except + + cdef unique_ptr[column] extract_micro_second( + const column_view& column + ) except + + cdef unique_ptr[column] extract_nano_second( + const column_view& column + ) except + ctypedef enum rounding_frequency "cudf::datetime::rounding_frequency": DAY "cudf::datetime::rounding_frequency::DAY" From af4a94d38f09eb9e681827b4bb992c986f10cf21 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Fri, 23 Sep 2022 10:47:34 -0700 Subject: [PATCH 06/51] docs --- cpp/include/cudf/detail/datetime.hpp | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/cpp/include/cudf/detail/datetime.hpp b/cpp/include/cudf/detail/datetime.hpp index 266944f18ff..554c6003485 100644 --- a/cpp/include/cudf/detail/datetime.hpp +++ b/cpp/include/cudf/detail/datetime.hpp @@ -94,16 +94,31 @@ std::unique_ptr extract_second( rmm::cuda_stream_view stream = cudf::default_stream_value, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @copydoc cudf::extract_milli_second(cudf::column_view const&, rmm::mr::device_memory_resource *) + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ std::unique_ptr extract_milli_second( cudf::column_view const& column, rmm::cuda_stream_view stream = cudf::default_stream_value, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @copydoc cudf::extract_micro_second(cudf::column_view const&, rmm::mr::device_memory_resource *) + * + * @param stream CUDA stream used for device memory operations and kernel launches. 
+ */ std::unique_ptr extract_micro_second( cudf::column_view const& column, rmm::cuda_stream_view stream = cudf::default_stream_value, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @copydoc cudf::extract_nano_second(cudf::column_view const&, rmm::mr::device_memory_resource *) + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ std::unique_ptr extract_nano_second( cudf::column_view const& column, rmm::cuda_stream_view stream = cudf::default_stream_value, From 2b26c15d64fc2e2f552bbd502f291a6bbb0e9582 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Fri, 23 Sep 2022 11:55:40 -0700 Subject: [PATCH 07/51] more changes --- cpp/src/datetime/datetime_ops.cu | 6 +- docs/cudf/source/api_docs/index_objects.rst | 3 + docs/cudf/source/api_docs/series.rst | 3 + python/cudf/cudf/core/column/timedelta.py | 14 ++-- python/cudf/cudf/core/index.py | 65 ++++++++++++++++++ python/cudf/cudf/core/series.py | 75 +++++++++++++++++++++ python/cudf/cudf/tests/test_datetime.py | 16 +++++ 7 files changed, 172 insertions(+), 10 deletions(-) diff --git a/cpp/src/datetime/datetime_ops.cu b/cpp/src/datetime/datetime_ops.cu index 419ec251bae..31abdf42e28 100644 --- a/cpp/src/datetime/datetime_ops.cu +++ b/cpp/src/datetime/datetime_ops.cu @@ -509,7 +509,7 @@ std::unique_ptr extract_milli_second(column_view const& column, { return detail::apply_datetime_op< detail::extract_component_operator, - cudf::type_id::INT64>(column, stream, mr); + cudf::type_id::INT16>(column, stream, mr); } std::unique_ptr extract_micro_second(column_view const& column, @@ -518,7 +518,7 @@ std::unique_ptr extract_micro_second(column_view const& column, { return detail::apply_datetime_op< detail::extract_component_operator, - cudf::type_id::INT64>(column, stream, mr); + cudf::type_id::INT16>(column, stream, mr); } std::unique_ptr extract_nano_second(column_view const& column, @@ -527,7 +527,7 @@ std::unique_ptr 
extract_nano_second(column_view const& column, { return detail::apply_datetime_op< detail::extract_component_operator, - cudf::type_id::INT64>(column, stream, mr); + cudf::type_id::INT16>(column, stream, mr); } std::unique_ptr last_day_of_month(column_view const& column, diff --git a/docs/cudf/source/api_docs/index_objects.rst b/docs/cudf/source/api_docs/index_objects.rst index 8e0e3bbd411..e20c9188f16 100644 --- a/docs/cudf/source/api_docs/index_objects.rst +++ b/docs/cudf/source/api_docs/index_objects.rst @@ -262,6 +262,9 @@ Time/date components DatetimeIndex.hour DatetimeIndex.minute DatetimeIndex.second + DatetimeIndex.millisecond + DatetimeIndex.microsecond + DatetimeIndex.nanosecond DatetimeIndex.dayofweek DatetimeIndex.dayofyear DatetimeIndex.day_of_year diff --git a/docs/cudf/source/api_docs/series.rst b/docs/cudf/source/api_docs/series.rst index 1e53c90b44d..9d98e1c66e9 100644 --- a/docs/cudf/source/api_docs/series.rst +++ b/docs/cudf/source/api_docs/series.rst @@ -266,8 +266,11 @@ Datetime properties days_in_month day_of_year hour + microsecond + millisecond minute month + nanosecond second weekday year diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index a25713f7b8a..b3f4877a790 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -251,13 +251,13 @@ def time_unit(self) -> str: return self._time_unit def _preprocess_column_for_repr(self): - components = self.components() - has_hr = (components.hours > 0).any() - has_m = (components.seconds > 0).any() - has_s = (components.seconds > 0).any() - has_ms = (components.milliseconds > 0).any() - has_us = (components.microseconds > 0).any() - has_ns = (components.nanoseconds > 0).any() + components = self.components() > 0 + has_hr = components.hours.any() + has_m = components.seconds.any() + has_s = components.seconds.any() + has_ms = components.milliseconds.any() + has_us = components.microseconds.any() + 
has_ns = components.nanoseconds.any() if has_ns: preprocess = self.astype("O") diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 7f64535e37c..2ce297442b4 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1941,6 +1941,71 @@ def second(self): """ return self._get_dt_field("second") + @property # type: ignore + @_cudf_nvtx_annotate + def millisecond(self): + """ + The milliseconds of the datetime. + + Examples + -------- + >>> import pandas as pd + >>> import cudf + >>> datetime_index = cudf.Index(pd.date_range("2000-01-01", + ... periods=3, freq="ms")) + >>> datetime_index + DatetimeIndex(['2000-01-01 00:00:00.000', '2000-01-01 00:00:00.001', + '2000-01-01 00:00:00.002'], + dtype='datetime64[ns]') + >>> datetime_index.millisecond + Int16Index([0, 1, 2], dtype='int16') + Int16Index([0, 1, 2], dtype='int16') + """ + return self._get_dt_field("milli_second") + + @property # type: ignore + @_cudf_nvtx_annotate + def microsecond(self): + """ + The microseconds of the datetime. + + Examples + -------- + >>> import pandas as pd + >>> import cudf + >>> datetime_index = cudf.Index(pd.date_range("2000-01-01", + ... periods=3, freq="us")) + >>> datetime_index + DatetimeIndex(['2000-01-01 00:00:00.000000', '2000-01-01 00:00:00.000001', + '2000-01-01 00:00:00.000002'], + dtype='datetime64[ns]') + >>> datetime_index.microsecond + Int16Index([0, 1, 2], dtype='int16') + """ # noqa: E501 + return self._get_dt_field("micro_second") + + @property # type: ignore + @_cudf_nvtx_annotate + def nanosecond(self): + """ + The nanoseconds of the datetime. + + Examples + -------- + >>> import pandas as pd + >>> import cudf + >>> datetime_index = cudf.Index(pd.date_range("2000-01-01", + ... 
periods=3, freq="ns")) + >>> datetime_index + DatetimeIndex(['2000-01-01 00:00:00.000000000', + '2000-01-01 00:00:00.000000001', + '2000-01-01 00:00:00.000000002'], + dtype='datetime64[ns]') + >>> datetime_index.nanosecond + Int16Index([0, 1, 2], dtype='int16') + """ + return self._get_dt_field("nano_second") + @property # type: ignore @_cudf_nvtx_annotate def weekday(self): diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 4711abb7891..b395e89c396 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -3562,6 +3562,81 @@ def second(self): """ return self._get_dt_field("second") + @property # type: ignore + @_cudf_nvtx_annotate + def millisecond(self): + """ + The millisecond of the datetime. + + Examples + -------- + >>> import pandas as pd + >>> import cudf + >>> datetime_series = cudf.Series(pd.date_range("2000-01-01", + ... periods=3, freq="ms")) + >>> datetime_series + 0 2000-01-01 00:00:00.000 + 1 2000-01-01 00:00:00.001 + 2 2000-01-01 00:00:00.002 + dtype: datetime64[ns] + >>> datetime_series.dt.millisecond + 0 0 + 1 1 + 2 2 + dtype: int16 + """ + return self._get_dt_field("milli_second") + + @property # type: ignore + @_cudf_nvtx_annotate + def microsecond(self): + """ + The microsecond of the datetime. + + Examples + -------- + >>> import pandas as pd + >>> import cudf + >>> datetime_series = cudf.Series(pd.date_range("2000-01-01", + ... periods=3, freq="us")) + >>> datetime_series + 0 2000-01-01 00:00:00.000000 + 1 2000-01-01 00:00:00.000001 + 2 2000-01-01 00:00:00.000002 + dtype: datetime64[ns] + >>> datetime_series.dt.microsecond + 0 0 + 1 1 + 2 2 + dtype: int16 + """ + return self._get_dt_field("micro_second") + + @property # type: ignore + @_cudf_nvtx_annotate + def nanosecond(self): + """ + The nanosecond of the datetime. + + Examples + -------- + >>> import pandas as pd + >>> import cudf + >>> datetime_series = cudf.Series(pd.date_range("2000-01-01", + ... 
periods=3, freq="ns")) + >>> datetime_series + 0 2000-01-01 00:00:00.000000000 + 1 2000-01-01 00:00:00.000000001 + 2 2000-01-01 00:00:00.000000002 + dtype: datetime64[ns] + >>> datetime_series.dt.nanosecond + 0 0 + 1 1 + 2 2 + dtype: int16 + """ + return self._get_dt_field("nano_second") + @property # type: ignore @_cudf_nvtx_annotate def weekday(self): diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 800a8aeeab5..3875467477f 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -81,6 +81,8 @@ def numerical_data(): "hour", "minute", "second", + "microsecond", + "nanosecond", "weekday", "dayofweek", "dayofyear", @@ -2040,3 +2042,17 @@ def test_datetime_constructor(data, dtype): actual = cudf.DatetimeIndex(data=cudf.Series(data), dtype=dtype) assert_eq(expected, actual) + + +# Pandas supports 'second', 'microsecond' & 'nanosecond' +# but weirdly left out 'millisecond', hence can't compare to +# a pandas API. 
+def test_datetime_millisecond_property(): + data = pd.date_range("2000-01-01", periods=3, freq="ms") + + gsr = cudf.Series(data) + + assert_eq(gsr.dt.millisecond, cudf.Series([0, 1, 2], dtype="int16")) + + gi = cudf.Index(data) + assert_eq(gi.millisecond, cudf.Index([0, 1, 2], dtype="int16")) From 87421b1f07357e1d7e4aabcfca4065f8a5844425 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Fri, 23 Sep 2022 12:04:57 -0700 Subject: [PATCH 08/51] fix docstring --- python/cudf/cudf/core/index.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 2ce297442b4..4d1ee79b397 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1959,7 +1959,6 @@ def millisecond(self): dtype='datetime64[ns]') >>> datetime_index.millisecond Int16Index([0, 1, 2], dtype='int16') - Int16Index([0, 1, 2], dtype='int16') """ return self._get_dt_field("milli_second") From cf5e6fb1c95536581e30a8b50bd63307d6dd7c3f Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Fri, 23 Sep 2022 12:23:56 -0700 Subject: [PATCH 09/51] add datetime overflow tests --- python/cudf/cudf/tests/test_repr.py | 82 ++++++++++++++++++++++++----- 1 file changed, 69 insertions(+), 13 deletions(-) diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index be68b8eca5f..0fc4914b927 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -1742,19 +1742,20 @@ def test_repr_struct_after_concat(): @pytest.mark.parametrize( - "ser,expected_repr", + "cudf_type,data,expected_repr", [ ( - cudf.Series( - [ + cudf.Series, + { + "data": [ "1969-12-31 23:59:58.001001", "1839-12-24 03:58:56.000826", "1647-05-20 19:25:03.000638", ], - dtype="datetime64[us]", - index=["a", "b", "z"], - name="hello", - ), + "dtype": "datetime64[us]", + "index": ["a", "b", "z"], + "name": "hello", + }, textwrap.dedent( """ a 1969-12-31 23:59:58.001001 @@ -1765,10 +1766,11 @@ def 
test_repr_struct_after_concat(): ), ), ( - cudf.Series( - ["2499-12-01 01:00:00", "2499-11-01 01:30:00"], - dtype="datetime64[s]", - ), + cudf.Series, + { + "data": ["2499-12-01 01:00:00", "2499-11-01 01:30:00"], + "dtype": "datetime64[s]", + }, textwrap.dedent( """ 0 2499-12-01 01:00:00 @@ -1777,10 +1779,64 @@ def test_repr_struct_after_concat(): """ ), ), + ( + cudf.Index, + { + "data": ["2499-12-01 01:00:00", "2499-11-01 01:30:00"], + "dtype": "datetime64[s]", + }, + textwrap.dedent( + """ + DatetimeIndex(['2499-12-01 01:00:00', '2499-11-01 01:30:00'], + dtype='datetime64[s]') + """ + ), + ), + ( + cudf.Series, + { + "data": ["2499-12-01 01:00:00", "2499-11-01 01:30:00"], + "dtype": "datetime64[s]", + "index": ["2499-12-01 01:00:00", "2499-11-01 01:30:00"], + }, + textwrap.dedent( + """ + 2499-12-01 01:00:00 2499-12-01 01:00:00 + 2499-11-01 01:30:00 2499-11-01 01:30:00 + dtype: datetime64[s] + """ + ), + ), + ( + cudf.DataFrame, + { + "data": { + "a": [ + "2499-12-01 01:00:00", + "2499-11-01 01:30:00", + "1647-05-20 19:25:03", + ], + "b": [ + "1969-12-31 23:59:58", + "1839-12-24 03:58:56", + "1647-05-20 19:25:03", + ], + }, + "dtype": "datetime64[s]", + }, + textwrap.dedent( + """ + a b + 0 2499-12-01 01:00:00 1969-12-31 23:59:58 + 1 2499-11-01 01:30:00 1839-12-24 03:58:56 + 2 1647-05-20 19:25:03 1647-05-20 19:25:03 + """ + ), + ), ], ) -def test_datetime_series_repr(ser, expected_repr): +def test_datetime_overflow_repr(cudf_type, data, expected_repr): expected = expected_repr - actual = repr(ser) + actual = repr(cudf_type(**data)) assert expected.split() == actual.split() From b5ea2cc7bb1b00f52c40ce060663a81441e685e4 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Fri, 23 Sep 2022 12:32:39 -0700 Subject: [PATCH 10/51] cleanup --- python/cudf/cudf/tests/test_repr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index 0fc4914b927..2297ed753d2 100644 --- 
a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -113,7 +113,7 @@ def _assert_date_series_repr(ps, gs): new_actual_list.append(" ".join(text.rsplit(" ", 1))) else: new_actual_list.append(text) - # import pdb;pdb.set_trace() + assert expected_list == new_actual_list From 5c3ac19273ddc33c163b8cc6e100d25bf077c196 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Fri, 23 Sep 2022 13:26:43 -0700 Subject: [PATCH 11/51] rename --- cpp/include/cudf/datetime.hpp | 6 ++--- cpp/include/cudf/detail/datetime.hpp | 12 ++++----- cpp/src/datetime/datetime_ops.cu | 36 +++++++++++++------------- python/cudf/cudf/_lib/cpp/datetime.pxd | 6 ++--- python/cudf/cudf/_lib/datetime.pyx | 6 ++--- 5 files changed, 33 insertions(+), 33 deletions(-) diff --git a/cpp/include/cudf/datetime.hpp b/cpp/include/cudf/datetime.hpp index 633772d9eeb..22e8d03fb1b 100644 --- a/cpp/include/cudf/datetime.hpp +++ b/cpp/include/cudf/datetime.hpp @@ -143,7 +143,7 @@ std::unique_ptr extract_second( * @returns cudf::column of the extracted int16_t milliseconds * @throw cudf::logic_error if input column datatype is not TIMESTAMP */ -std::unique_ptr extract_milli_second( +std::unique_ptr extract_millisecond( cudf::column_view const& column, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); @@ -157,7 +157,7 @@ std::unique_ptr extract_milli_second( * @returns cudf::column of the extracted int16_t microseconds * @throw cudf::logic_error if input column datatype is not TIMESTAMP */ -std::unique_ptr extract_micro_second( +std::unique_ptr extract_microsecond( cudf::column_view const& column, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); @@ -171,7 +171,7 @@ std::unique_ptr extract_micro_second( * @returns cudf::column of the extracted int16_t nanoseconds * @throw cudf::logic_error if input column datatype is not TIMESTAMP */ -std::unique_ptr extract_nano_second( +std::unique_ptr extract_nanosecond( cudf::column_view const& 
column, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/include/cudf/detail/datetime.hpp b/cpp/include/cudf/detail/datetime.hpp index 554c6003485..a953f81587f 100644 --- a/cpp/include/cudf/detail/datetime.hpp +++ b/cpp/include/cudf/detail/datetime.hpp @@ -95,31 +95,31 @@ std::unique_ptr extract_second( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @copydoc cudf::extract_milli_second(cudf::column_view const&, rmm::mr::device_memory_resource *) + * @copydoc cudf::extract_millisecond(cudf::column_view const&, rmm::mr::device_memory_resource *) * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr extract_milli_second( +std::unique_ptr extract_millisecond( cudf::column_view const& column, rmm::cuda_stream_view stream = cudf::default_stream_value, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @copydoc cudf::extract_micro_second(cudf::column_view const&, rmm::mr::device_memory_resource *) + * @copydoc cudf::extract_microsecond(cudf::column_view const&, rmm::mr::device_memory_resource *) * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr extract_micro_second( +std::unique_ptr extract_microsecond( cudf::column_view const& column, rmm::cuda_stream_view stream = cudf::default_stream_value, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @copydoc cudf::extract_nano_second(cudf::column_view const&, rmm::mr::device_memory_resource *) + * @copydoc cudf::extract_nanosecond(cudf::column_view const&, rmm::mr::device_memory_resource *) * * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ -std::unique_ptr extract_nano_second( +std::unique_ptr extract_nanosecond( cudf::column_view const& column, rmm::cuda_stream_view stream = cudf::default_stream_value, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/src/datetime/datetime_ops.cu b/cpp/src/datetime/datetime_ops.cu index 31abdf42e28..68921a226d6 100644 --- a/cpp/src/datetime/datetime_ops.cu +++ b/cpp/src/datetime/datetime_ops.cu @@ -503,27 +503,27 @@ std::unique_ptr extract_second(column_view const& column, cudf::type_id::INT16>(column, stream, mr); } -std::unique_ptr extract_milli_second(column_view const& column, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr extract_millisecond(column_view const& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { return detail::apply_datetime_op< detail::extract_component_operator, cudf::type_id::INT16>(column, stream, mr); } -std::unique_ptr extract_micro_second(column_view const& column, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr extract_microsecond(column_view const& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { return detail::apply_datetime_op< detail::extract_component_operator, cudf::type_id::INT16>(column, stream, mr); } -std::unique_ptr extract_nano_second(column_view const& column, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr extract_nanosecond(column_view const& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { return detail::apply_datetime_op< detail::extract_component_operator, @@ -642,25 +642,25 @@ std::unique_ptr extract_second(column_view const& column, return detail::extract_second(column, cudf::default_stream_value, mr); } -std::unique_ptr extract_milli_second(column_view const& column, - rmm::mr::device_memory_resource* mr) +std::unique_ptr extract_millisecond(column_view 
const& column, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::extract_milli_second(column, cudf::default_stream_value, mr); + return detail::extract_millisecond(column, cudf::default_stream_value, mr); } -std::unique_ptr extract_micro_second(column_view const& column, - rmm::mr::device_memory_resource* mr) +std::unique_ptr extract_microsecond(column_view const& column, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::extract_micro_second(column, cudf::default_stream_value, mr); + return detail::extract_microsecond(column, cudf::default_stream_value, mr); } -std::unique_ptr extract_nano_second(column_view const& column, - rmm::mr::device_memory_resource* mr) +std::unique_ptr extract_nanosecond(column_view const& column, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::extract_nano_second(column, cudf::default_stream_value, mr); + return detail::extract_nanosecond(column, cudf::default_stream_value, mr); } std::unique_ptr last_day_of_month(column_view const& column, diff --git a/python/cudf/cudf/_lib/cpp/datetime.pxd b/python/cudf/cudf/_lib/cpp/datetime.pxd index 5951ca34de3..0f97fc5635c 100644 --- a/python/cudf/cudf/_lib/cpp/datetime.pxd +++ b/python/cudf/cudf/_lib/cpp/datetime.pxd @@ -15,13 +15,13 @@ cdef extern from "cudf/datetime.hpp" namespace "cudf::datetime" nogil: cdef unique_ptr[column] extract_hour(const column_view& column) except + cdef unique_ptr[column] extract_minute(const column_view& column) except + cdef unique_ptr[column] extract_second(const column_view& column) except + - cdef unique_ptr[column] extract_milli_second( + cdef unique_ptr[column] extract_millisecond( const column_view& column ) except + - cdef unique_ptr[column] extract_micro_second( + cdef unique_ptr[column] extract_microsecond( const column_view& column ) except + - cdef unique_ptr[column] extract_nano_second( + cdef unique_ptr[column] extract_nanosecond( const column_view& column ) except + diff 
--git a/python/cudf/cudf/_lib/datetime.pyx b/python/cudf/cudf/_lib/datetime.pyx index a545877428c..df0f1f981c5 100644 --- a/python/cudf/cudf/_lib/datetime.pyx +++ b/python/cudf/cudf/_lib/datetime.pyx @@ -50,11 +50,11 @@ def extract_datetime_component(Column col, object field): elif field == "second": c_result = move(libcudf_datetime.extract_second(col_view)) elif field == "milli_second": - c_result = move(libcudf_datetime.extract_milli_second(col_view)) + c_result = move(libcudf_datetime.extract_millisecond(col_view)) elif field == "micro_second": - c_result = move(libcudf_datetime.extract_micro_second(col_view)) + c_result = move(libcudf_datetime.extract_microsecond(col_view)) elif field == "nano_second": - c_result = move(libcudf_datetime.extract_nano_second(col_view)) + c_result = move(libcudf_datetime.extract_nanosecond(col_view)) elif field == "day_of_year": c_result = move(libcudf_datetime.day_of_year(col_view)) else: From d69006306741284dd4b80e274477322557994a3d Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 26 Sep 2022 17:48:22 -0700 Subject: [PATCH 12/51] add cpp tests --- cpp/tests/datetime/datetime_ops_test.cpp | 54 ++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/cpp/tests/datetime/datetime_ops_test.cpp b/cpp/tests/datetime/datetime_ops_test.cpp index 2898a649e36..75ffcb1c537 100644 --- a/cpp/tests/datetime/datetime_ops_test.cpp +++ b/cpp/tests/datetime/datetime_ops_test.cpp @@ -60,6 +60,9 @@ TYPED_TEST(NonTimestampTest, TestThrowsOnNonTimestamp) EXPECT_THROW(extract_hour(col), cudf::logic_error); EXPECT_THROW(extract_minute(col), cudf::logic_error); EXPECT_THROW(extract_second(col), cudf::logic_error); + EXPECT_THROW(extract_millisecond(col), cudf::logic_error); + EXPECT_THROW(extract_microsecond(col), cudf::logic_error); + EXPECT_THROW(extract_nanosecond(col), cudf::logic_error); EXPECT_THROW(last_day_of_month(col), cudf::logic_error); EXPECT_THROW(day_of_year(col), cudf::logic_error); 
EXPECT_THROW(add_calendrical_months( @@ -97,12 +100,21 @@ TEST_F(BasicDatetimeOpsTest, TestExtractingDatetimeComponents) 1674631932929 // 2023-01-25 07:32:12.929 GMT }; + auto timestamps_ns = + cudf::test::fixed_width_column_wrapper{ + -23324234, // 1969-12-31 23:59:59.976675766 GMT + 23432424, // 1970-01-01 00:00:00.023432424 GMT + 987234623 // 1970-01-01 00:00:00.987234623 GMT + }; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_year(timestamps_D), fixed_width_column_wrapper{1965, 2018, 2023}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_year(timestamps_s), fixed_width_column_wrapper{1965, 2018, 2023}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_year(timestamps_ms), fixed_width_column_wrapper{1965, 2018, 2023}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_year(timestamps_ns), + fixed_width_column_wrapper{1969, 1970, 1970}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_month(timestamps_D), fixed_width_column_wrapper{10, 7, 1}); @@ -110,6 +122,8 @@ TEST_F(BasicDatetimeOpsTest, TestExtractingDatetimeComponents) fixed_width_column_wrapper{10, 7, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_month(timestamps_ms), fixed_width_column_wrapper{10, 7, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_month(timestamps_ns), + fixed_width_column_wrapper{12, 1, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_day(timestamps_D), fixed_width_column_wrapper{26, 4, 25}); @@ -117,6 +131,8 @@ TEST_F(BasicDatetimeOpsTest, TestExtractingDatetimeComponents) fixed_width_column_wrapper{26, 4, 25}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_day(timestamps_ms), fixed_width_column_wrapper{26, 4, 25}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_day(timestamps_ns), + fixed_width_column_wrapper{3, 1, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_weekday(timestamps_D), fixed_width_column_wrapper{2, 3, 3}); @@ -124,6 +140,8 @@ TEST_F(BasicDatetimeOpsTest, TestExtractingDatetimeComponents) fixed_width_column_wrapper{2, 3, 3}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_weekday(timestamps_ms), fixed_width_column_wrapper{2, 3, 3}); 
+ CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_weekday(timestamps_ms), + fixed_width_column_wrapper{2, 3, 3}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_hour(timestamps_D), fixed_width_column_wrapper{0, 0, 0}); @@ -131,6 +149,8 @@ TEST_F(BasicDatetimeOpsTest, TestExtractingDatetimeComponents) fixed_width_column_wrapper{14, 12, 7}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_hour(timestamps_ms), fixed_width_column_wrapper{14, 12, 7}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_hour(timestamps_ns), + fixed_width_column_wrapper{23, 0, 0}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_minute(timestamps_D), fixed_width_column_wrapper{0, 0, 0}); @@ -138,6 +158,8 @@ TEST_F(BasicDatetimeOpsTest, TestExtractingDatetimeComponents) fixed_width_column_wrapper{1, 0, 32}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_minute(timestamps_ms), fixed_width_column_wrapper{1, 0, 32}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_minute(timestamps_ns), + fixed_width_column_wrapper{59, 0, 0}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_second(timestamps_D), fixed_width_column_wrapper{0, 0, 0}); @@ -145,6 +167,35 @@ TEST_F(BasicDatetimeOpsTest, TestExtractingDatetimeComponents) fixed_width_column_wrapper{12, 0, 12}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_second(timestamps_ms), fixed_width_column_wrapper{12, 0, 12}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_minute(timestamps_ns), + fixed_width_column_wrapper{59, 0, 0}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_millisecond(timestamps_D), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_millisecond(timestamps_s), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_millisecond(timestamps_ms), + fixed_width_column_wrapper{762, 0, 929}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_millisecond(timestamps_ns), + fixed_width_column_wrapper{976, 23, 987}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_microsecond(timestamps_D), + fixed_width_column_wrapper{0, 0, 0}); + 
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_microsecond(timestamps_s), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_microsecond(timestamps_ms), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_microsecond(timestamps_ns), + fixed_width_column_wrapper{675, 432, 234}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_nanosecond(timestamps_D), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_nanosecond(timestamps_s), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_nanosecond(timestamps_ms), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_nanosecond(timestamps_ns), + fixed_width_column_wrapper{766, 424, 623}); } template @@ -175,6 +226,9 @@ TYPED_TEST(TypedDatetimeOpsTest, TestEmptyColumns) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_hour(timestamps), int16s); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_minute(timestamps), int16s); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_second(timestamps), int16s); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_millisecond(timestamps), int16s); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_microsecond(timestamps), int16s); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_nanosecond(timestamps), int16s); } TYPED_TEST(TypedDatetimeOpsTest, TestExtractingGeneratedDatetimeComponents) From 8d66fc00fb37ce70f6ccfaeae007a7d1b7b577d0 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 26 Sep 2022 19:23:24 -0700 Subject: [PATCH 13/51] fix test --- cpp/tests/datetime/datetime_ops_test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tests/datetime/datetime_ops_test.cpp b/cpp/tests/datetime/datetime_ops_test.cpp index 75ffcb1c537..07ed7f33303 100644 --- a/cpp/tests/datetime/datetime_ops_test.cpp +++ b/cpp/tests/datetime/datetime_ops_test.cpp @@ -132,7 +132,7 @@ TEST_F(BasicDatetimeOpsTest, TestExtractingDatetimeComponents) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_day(timestamps_ms), 
fixed_width_column_wrapper{26, 4, 25}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_day(timestamps_ns), - fixed_width_column_wrapper{3, 1, 1}); + fixed_width_column_wrapper{31, 1, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_weekday(timestamps_D), fixed_width_column_wrapper{2, 3, 3}); From 6d556e185808087e6308d836baac5f7b5dc58f74 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 27 Sep 2022 11:21:37 -0700 Subject: [PATCH 14/51] move logic to as_string_column --- python/cudf/cudf/core/column/datetime.py | 65 +++++++++++----------- python/cudf/cudf/core/column/timedelta.py | 67 +++++++++++------------ python/cudf/cudf/core/dataframe.py | 4 +- python/cudf/cudf/core/index.py | 8 +-- python/cudf/cudf/core/series.py | 14 +---- python/cudf/cudf/tests/test_concat.py | 4 +- python/cudf/cudf/tests/test_dataframe.py | 34 ++++++++---- python/cudf/cudf/tests/test_repr.py | 46 ++++++++-------- 8 files changed, 120 insertions(+), 122 deletions(-) diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index b2ab165d4e4..bcdce023c92 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -214,34 +214,6 @@ def to_pandas( index=index, ) - def _preprocess_column_for_repr(self): - has_hr = (self.get_dt_field("hour") > 0).any() - has_m = (self.get_dt_field("minute") > 0).any() - has_s = (self.get_dt_field("second") > 0).any() - has_ms = (self.get_dt_field("milli_second") > 0).any() - has_us = (self.get_dt_field("micro_second") > 0).any() - has_ns = (self.get_dt_field("nano_second") > 0).any() - has_ns = (self.get_dt_field("nano_second") > 0).any() - - if has_ns: - preprocess = self.astype("O") - elif has_us: - preprocess = self.astype( - "O", format=_dtype_to_format_conversion.get("datetime64[us]") - ) - elif has_ms: - preprocess = self.astype( - "O", format=_dtype_to_format_conversion.get("datetime64[ms]") - ) - elif has_s or has_m or has_hr: - preprocess = self.astype( - "O", 
format=_dtype_to_format_conversion.get("datetime64[s]") - ) - else: - preprocess = self.astype("O", format="%Y-%m-%d") - - return preprocess - @property def values(self): """ @@ -359,9 +331,16 @@ def as_string_column( self, dtype: Dtype, format=None, **kwargs ) -> "cudf.core.column.StringColumn": if format is None: - format = _dtype_to_format_conversion.get( - self.dtype.name, "%Y-%m-%d %H:%M:%S" - ) + if _has_ns(self): + format = _dtype_to_format_conversion.get(self.dtype.name) + elif _has_us(self): + format = _dtype_to_format_conversion.get("datetime64[us]") + elif _has_ms(self): + format = _dtype_to_format_conversion.get("datetime64[ms]") + elif _has_hr(self) or _has_m(self) or _has_s(self): + format = _dtype_to_format_conversion.get("datetime64[s]") + else: + format = "%Y-%m-%d" if format in _DATETIME_SPECIAL_FORMATS: names = as_column(_DATETIME_NAMES) else: @@ -609,3 +588,27 @@ def _resolve_mixed_dtypes( rhs_time_unit = cudf.utils.dtypes.get_time_unit(rhs) rhs_unit = units.index(rhs_time_unit) return cudf.dtype(f"{base_type}[{units[max(lhs_unit, rhs_unit)]}]") + + +def _has_hr(col: DatetimeColumn) -> bool: + return (col.get_dt_field("hour") > 0).any() + + +def _has_m(col: DatetimeColumn) -> bool: + return (col.get_dt_field("minute") > 0).any() + + +def _has_s(col: DatetimeColumn) -> bool: + return (col.get_dt_field("second") > 0).any() + + +def _has_ms(col: DatetimeColumn) -> bool: + return (col.get_dt_field("milli_second") > 0).any() + + +def _has_us(col: DatetimeColumn) -> bool: + return (col.get_dt_field("micro_second") > 0).any() + + +def _has_ns(col: DatetimeColumn) -> bool: + return (col.get_dt_field("nano_second") > 0).any() diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index b3f4877a790..f43644b9473 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -250,34 +250,6 @@ def as_numerical(self) -> "cudf.core.column.NumericalColumn": def 
time_unit(self) -> str: return self._time_unit - def _preprocess_column_for_repr(self): - components = self.components() > 0 - has_hr = components.hours.any() - has_m = components.seconds.any() - has_s = components.seconds.any() - has_ms = components.milliseconds.any() - has_us = components.microseconds.any() - has_ns = components.nanoseconds.any() - - if has_ns: - preprocess = self.astype("O") - elif has_us: - preprocess = self.astype( - "O", format=_dtype_to_format_conversion.get("timedelta64[us]") - ) - elif has_ms: - preprocess = self.astype( - "O", format=_dtype_to_format_conversion.get("timedelta64[ms]") - ) - elif has_s or has_m or has_hr: - preprocess = self.astype( - "O", format=_dtype_to_format_conversion.get("timedelta64[s]") - ) - else: - preprocess = self.astype("O", format="%D days") - - return preprocess - def fillna( self, fill_value: Any = None, method: str = None, dtype: Dtype = None ) -> TimeDeltaColumn: @@ -316,13 +288,40 @@ def as_string_column( self, dtype: Dtype, format=None, **kwargs ) -> "cudf.core.column.StringColumn": if format is None: - format = _dtype_to_format_conversion.get( - self.dtype.name, "%D days %H:%M:%S" - ) - if len(self) > 0: + components = self.components() > 0 + has_hr = components.hours.any() + has_m = components.seconds.any() + has_s = components.seconds.any() + has_ms = components.milliseconds.any() + has_us = components.microseconds.any() + has_ns = components.nanoseconds.any() + + if has_ns: + format = _dtype_to_format_conversion.get("timedelta64[ns]") + target_dtype = cudf.dtype("timedelta64[ns]") + elif has_us: + format = _dtype_to_format_conversion.get("timedelta64[us]") + target_dtype = cudf.dtype("timedelta64[us]") + elif has_ms: + format = _dtype_to_format_conversion.get("timedelta64[ms]") + target_dtype = cudf.dtype("timedelta64[ms]") + elif has_s or has_m or has_hr: + format = _dtype_to_format_conversion.get("timedelta64[s]") + target_dtype = cudf.dtype("timedelta64[s]") + else: + target_dtype = self.dtype + 
format = "%D days" + + if self.dtype != target_dtype: + timedelta_col = self.astype(target_dtype) + else: + timedelta_col = self + else: + timedelta_col = self + if len(timedelta_col) > 0: return string._timedelta_to_str_typecast_functions[ - cudf.dtype(self.dtype) - ](self, format=format) + cudf.dtype(timedelta_col.dtype) + ](timedelta_col, format=format) else: return cast( "cudf.core.column.StringColumn", diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 84f2a720dfe..bd89fd6e130 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1763,9 +1763,7 @@ def _clean_renderable_dataframe(self, output): cudf.core.column.datetime.DatetimeColumn, ), ): - output._data[col_name] = output._data[ - col_name - ]._preprocess_column_for_repr() + output._data[col_name] = output._data[col_name].astype("str") else: output._data[col_name] = output._data[col_name] diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index e9c5c00b6fa..6625d88077e 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -2215,9 +2215,7 @@ def _clean_nulls_from_index(self): of the actual types correctly. """ return cudf.Index( - self._values._preprocess_column_for_repr() - .fillna(cudf._NA_REP) - .astype("category"), + self._values.astype("str").fillna(cudf._NA_REP).astype("category"), name=self.name, ) @@ -2480,9 +2478,7 @@ def _clean_nulls_from_index(self): of the actual types correctly. 
""" return cudf.Index( - self._values._preprocess_column_for_repr() - .fillna(cudf._NA_REP) - .astype("category"), + self._values.astype("str").fillna(cudf._NA_REP).astype("category"), name=self.name, ) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index a10341a34b1..50d58bcfd4f 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1205,11 +1205,7 @@ def __repr__(self): and not is_struct_dtype(preprocess.dtype) and not is_decimal_dtype(preprocess.dtype) and not is_struct_dtype(preprocess.dtype) - ): - output = repr( - preprocess.astype("O").fillna(cudf._NA_REP).to_pandas() - ) - elif isinstance( + ) or isinstance( preprocess._column, ( cudf.core.column.timedelta.TimeDeltaColumn, @@ -1217,13 +1213,7 @@ def __repr__(self): ), ): output = repr( - Series( - preprocess._column._preprocess_column_for_repr(), - index=preprocess.index, - name=preprocess.name, - ) - .fillna(cudf._NA_REP) - .to_pandas() + preprocess.astype("O").fillna(cudf._NA_REP).to_pandas() ) elif isinstance( preprocess._column, cudf.core.column.CategoricalColumn diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py index 8f6dce4828a..7b0b18b7b55 100644 --- a/python/cudf/cudf/tests/test_concat.py +++ b/python/cudf/cudf/tests/test_concat.py @@ -1697,8 +1697,8 @@ def test_concat_decimal_numeric_series(s1, s2, s3, expected): [ "955.22", "8.20", - "2007-06-12 00:00:00", - "2006-03-14 00:00:00", + "2007-06-12", + "2006-03-14", ], index=[0, 1, 0, 1], ), diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index d9e9a4dbba1..37a6407cf85 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -4023,19 +4023,20 @@ def test_series_astype_datetime_to_other(as_dtype): @pytest.mark.parametrize( - "inp", + "dtype", [ - ("datetime64[ns]", "2011-01-01 00:00:00.000000000"), - ("datetime64[us]", "2011-01-01 00:00:00.000000"), - 
("datetime64[ms]", "2011-01-01 00:00:00.000"), - ("datetime64[s]", "2011-01-01 00:00:00"), + "datetime64[ns]", + "datetime64[us]", + "datetime64[ms]", + "datetime64[s]", ], ) -def test_series_astype_datetime_to_string(inp): - dtype, expect = inp +def test_series_astype_datetime_to_string(dtype): base_date = "2011-01-01" sr = cudf.Series([base_date], dtype=dtype) + psr = sr.to_pandas() got = sr.astype(str)[0] + expect = psr.astype(str)[0] assert expect == got @@ -4167,7 +4168,14 @@ def test_series_astype_null_cases(): "2001-03-01 00:00:00.000000", ] assert_eq( - cudf.Series(data), + cudf.Series( + [ + "2001-01-01", + "2001-02-01", + None, + "2001-03-01", + ] + ), cudf.Series(data, dtype="datetime64[us]").astype("str"), ) @@ -4565,8 +4573,14 @@ def test_df_astype_datetime_to_other(as_dtype): [690595200000, 1102118400000, 1473724800000, None], dtype="int64" ) elif as_dtype == "str": - expect["foo"] = cudf.Series(data, dtype="str") - expect["bar"] = cudf.Series(data, dtype="str") + expect["foo"] = cudf.Series( + gdf["foo"].to_pandas().astype("str").replace("NaT", None), + dtype="str", + ) + expect["bar"] = cudf.Series( + gdf["bar"].to_pandas().astype("str").replace("NaT", None), + dtype="str", + ) elif as_dtype == "category": expect["foo"] = cudf.Series(gdf["foo"], dtype="category") expect["bar"] = cudf.Series(gdf["bar"], dtype="category") diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index 2297ed753d2..de6977e6d06 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -817,16 +817,14 @@ def test_series_null_index_repr(sr, pandas_special_case): @pytest.mark.parametrize( "data", [ - [1000000, 200000, 3000000], - [1000000, 200000, None], + [1000000, 2000000, 3000000], + [1000000, 2000000, None], [], [None], [None, None, None, None, None], [12, 12, 22, 343, 4353534, 435342], np.array([10, 20, 30, None, 100]), cp.asarray([10, 20, 30, 100]), - [1000000, 200000, 3000000], - [1000000, 200000, 
None], [1], [12, 11, 232, 223432411, 2343241, 234324, 23234], [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], @@ -881,9 +879,9 @@ def test_timedelta_series_s_us_repr(data, dtype): cudf.Series([1000000, 200000, 3000000], dtype="timedelta64[ns]"), textwrap.dedent( """ - 0 0 days 00:00:00.001000000 - 1 0 days 00:00:00.000200000 - 2 0 days 00:00:00.003000000 + 0 0 days 00:00:00.001000 + 1 0 days 00:00:00.000200 + 2 0 days 00:00:00.003000 dtype: timedelta64[ns] """ ), @@ -903,9 +901,9 @@ def test_timedelta_series_s_us_repr(data, dtype): cudf.Series([1000000, 200000, None], dtype="timedelta64[ns]"), textwrap.dedent( """ - 0 0 days 00:00:00.001000000 - 1 0 days 00:00:00.000200000 - 2 + 0 0 days 00:00:00.001000 + 1 0 days 00:00:00.000200 + 2 dtype: timedelta64[ns] """ ), @@ -1034,13 +1032,13 @@ def test_timedelta_series_s_us_repr(data, dtype): ), textwrap.dedent( """ - 0 157937 days 02:23:52.432 - 1 1 days 13:25:36.784 - 2 2 days 20:09:05.345 - 3 2 days 14:03:52.411 - 4 11573 days 23:39:03.241 - 5 42 days 01:35:48.734 - 6 0 days 00:00:23.234 + 0 -55566 days 21:10:41.277551616 + 1 1 days 13:25:36.784000000 + 2 2 days 20:09:05.345000000 + 3 2 days 14:03:52.411000000 + 4 11573 days 23:39:03.241000000 + 5 42 days 01:35:48.734000000 + 6 0 days 00:00:23.234000000 dtype: timedelta64[ms] """ ), @@ -1087,13 +1085,13 @@ def test_timedelta_series_s_us_repr(data, dtype): ), textwrap.dedent( """ - 0 157937 days 02:23:52.432 - 1 1 days 13:25:36.784 - 2 2 days 20:09:05.345 - 3 2 days 14:03:52.411 - 4 11573 days 23:39:03.241 - 5 42 days 01:35:48.734 - 6 0 days 00:00:23.234 + 0 -55566 days 21:10:41.277551616 + 1 1 days 13:25:36.784000000 + 2 2 days 20:09:05.345000000 + 3 2 days 14:03:52.411000000 + 4 11573 days 23:39:03.241000000 + 5 42 days 01:35:48.734000000 + 6 0 days 00:00:23.234000000 Name: abc, dtype: timedelta64[ms] """ ), From 9c5a2dfaef1307bc48b0b494227b835f3e21f6f7 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 27 Sep 2022 11:23:04 -0700 Subject: [PATCH 15/51] todo 
--- python/cudf/cudf/core/column/timedelta.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index f43644b9473..205ce41327b 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -312,6 +312,8 @@ def as_string_column( target_dtype = self.dtype format = "%D days" + # TODO: Remove the type-casting after there is option from libcudf + # side to support %3S, %6S, %9S. if self.dtype != target_dtype: timedelta_col = self.astype(target_dtype) else: From b48155e80560292e93543f88f32b42efa56959d7 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 27 Sep 2022 13:39:17 -0500 Subject: [PATCH 16/51] Update cpp/src/datetime/datetime_ops.cu Co-authored-by: David Wendt <45795991+davidwendt@users.noreply.github.com> --- cpp/src/datetime/datetime_ops.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/datetime/datetime_ops.cu b/cpp/src/datetime/datetime_ops.cu index 68921a226d6..b35806afe29 100644 --- a/cpp/src/datetime/datetime_ops.cu +++ b/cpp/src/datetime/datetime_ops.cu @@ -82,7 +82,7 @@ struct extract_component_operator { auto millisecs_ = duration_cast(time_since_midnight - hrs_ - mins_ - secs_); auto microsecs_ = duration_cast(time_since_midnight - hrs_ - mins_ - secs_ - millisecs_); - auto nanoosecs_ = duration_cast(time_since_midnight - hrs_ - mins_ - secs_ - + auto nanosecs_ = duration_cast(time_since_midnight - hrs_ - mins_ - secs_ - millisecs_ - microsecs_); switch (Component) { From 09a998609e67ff5e5195f0b063e8571a622d603c Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 27 Sep 2022 13:39:49 -0500 Subject: [PATCH 17/51] Update cpp/src/datetime/datetime_ops.cu --- cpp/src/datetime/datetime_ops.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/datetime/datetime_ops.cu b/cpp/src/datetime/datetime_ops.cu index b35806afe29..eda942f51d6 100644 --- 
a/cpp/src/datetime/datetime_ops.cu +++ b/cpp/src/datetime/datetime_ops.cu @@ -99,7 +99,7 @@ struct extract_component_operator { case datetime_component::SECOND: return secs_.count(); case datetime_component::MILLISECOND: return millisecs_.count(); case datetime_component::MICROSECOND: return microsecs_.count(); - case datetime_component::NANOSECOND: return nanoosecs_.count(); + case datetime_component::NANOSECOND: return nanosecs_.count(); default: return 0; } } From 0438b71177972413e818c1a6cb90b6936beaec92 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 27 Sep 2022 11:50:57 -0700 Subject: [PATCH 18/51] rename API name --- cpp/include/cudf/datetime.hpp | 12 ++++---- cpp/include/cudf/detail/datetime.hpp | 15 ++++++---- cpp/src/datetime/datetime_ops.cu | 38 ++++++++++++------------ cpp/tests/datetime/datetime_ops_test.cpp | 36 +++++++++++----------- python/cudf/cudf/_lib/cpp/datetime.pxd | 6 ++-- python/cudf/cudf/_lib/datetime.pyx | 12 ++++++-- 6 files changed, 64 insertions(+), 55 deletions(-) diff --git a/cpp/include/cudf/datetime.hpp b/cpp/include/cudf/datetime.hpp index 22e8d03fb1b..f4af5bbd251 100644 --- a/cpp/include/cudf/datetime.hpp +++ b/cpp/include/cudf/datetime.hpp @@ -134,7 +134,7 @@ std::unique_ptr extract_second( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Extracts millisecond from any date time type and returns an int16_t + * @brief Extracts millisecond fraction from any date time type and returns an int16_t * cudf::column. 
* * @param column cudf::column_view of the input datetime values @@ -143,12 +143,12 @@ std::unique_ptr extract_second( * @returns cudf::column of the extracted int16_t milliseconds * @throw cudf::logic_error if input column datatype is not TIMESTAMP */ -std::unique_ptr extract_millisecond( +std::unique_ptr extract_millisecond_fraction( cudf::column_view const& column, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Extracts microsecond from any date time type and returns an int16_t + * @brief Extracts microsecond fraction from any date time type and returns an int16_t * cudf::column. * * @param column cudf::column_view of the input datetime values @@ -157,12 +157,12 @@ std::unique_ptr extract_millisecond( * @returns cudf::column of the extracted int16_t microseconds * @throw cudf::logic_error if input column datatype is not TIMESTAMP */ -std::unique_ptr extract_microsecond( +std::unique_ptr extract_microsecond_fraction( cudf::column_view const& column, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Extracts nanosecond from any date time type and returns an int16_t + * @brief Extracts nanosecond fraction from any date time type and returns an int16_t * cudf::column. 
* * @param column cudf::column_view of the input datetime values @@ -171,7 +171,7 @@ std::unique_ptr extract_microsecond( * @returns cudf::column of the extracted int16_t nanoseconds * @throw cudf::logic_error if input column datatype is not TIMESTAMP */ -std::unique_ptr extract_nanosecond( +std::unique_ptr extract_nanosecond_fraction( cudf::column_view const& column, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/include/cudf/detail/datetime.hpp b/cpp/include/cudf/detail/datetime.hpp index a953f81587f..d17e641533e 100644 --- a/cpp/include/cudf/detail/datetime.hpp +++ b/cpp/include/cudf/detail/datetime.hpp @@ -95,31 +95,34 @@ std::unique_ptr extract_second( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @copydoc cudf::extract_millisecond(cudf::column_view const&, rmm::mr::device_memory_resource *) + * @copydoc cudf::extract_millisecond_fraction(cudf::column_view const&, + * rmm::mr::device_memory_resource *) * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr extract_millisecond( +std::unique_ptr extract_millisecond_fraction( cudf::column_view const& column, rmm::cuda_stream_view stream = cudf::default_stream_value, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @copydoc cudf::extract_microsecond(cudf::column_view const&, rmm::mr::device_memory_resource *) + * @copydoc cudf::extract_microsecond_fraction(cudf::column_view const&, + * rmm::mr::device_memory_resource *) * * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ -std::unique_ptr extract_microsecond( +std::unique_ptr extract_microsecond_fraction( cudf::column_view const& column, rmm::cuda_stream_view stream = cudf::default_stream_value, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @copydoc cudf::extract_nanosecond(cudf::column_view const&, rmm::mr::device_memory_resource *) + * @copydoc cudf::extract_nanosecond_fraction(cudf::column_view const&, + * rmm::mr::device_memory_resource *) * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr extract_nanosecond( +std::unique_ptr extract_nanosecond_fraction( cudf::column_view const& column, rmm::cuda_stream_view stream = cudf::default_stream_value, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/src/datetime/datetime_ops.cu b/cpp/src/datetime/datetime_ops.cu index eda942f51d6..7f3b786693f 100644 --- a/cpp/src/datetime/datetime_ops.cu +++ b/cpp/src/datetime/datetime_ops.cu @@ -83,7 +83,7 @@ struct extract_component_operator { auto microsecs_ = duration_cast(time_since_midnight - hrs_ - mins_ - secs_ - millisecs_); auto nanosecs_ = duration_cast(time_since_midnight - hrs_ - mins_ - secs_ - - millisecs_ - microsecs_); + millisecs_ - microsecs_); switch (Component) { case datetime_component::YEAR: @@ -503,27 +503,27 @@ std::unique_ptr extract_second(column_view const& column, cudf::type_id::INT16>(column, stream, mr); } -std::unique_ptr extract_millisecond(column_view const& column, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr extract_millisecond_fraction(column_view const& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { return detail::apply_datetime_op< detail::extract_component_operator, cudf::type_id::INT16>(column, stream, mr); } -std::unique_ptr extract_microsecond(column_view const& column, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) 
+std::unique_ptr extract_microsecond_fraction(column_view const& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { return detail::apply_datetime_op< detail::extract_component_operator, cudf::type_id::INT16>(column, stream, mr); } -std::unique_ptr extract_nanosecond(column_view const& column, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr extract_nanosecond_fraction(column_view const& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { return detail::apply_datetime_op< detail::extract_component_operator, @@ -642,25 +642,25 @@ std::unique_ptr extract_second(column_view const& column, return detail::extract_second(column, cudf::default_stream_value, mr); } -std::unique_ptr extract_millisecond(column_view const& column, - rmm::mr::device_memory_resource* mr) +std::unique_ptr extract_millisecond_fraction(column_view const& column, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::extract_millisecond(column, cudf::default_stream_value, mr); + return detail::extract_millisecond_fraction(column, cudf::default_stream_value, mr); } -std::unique_ptr extract_microsecond(column_view const& column, - rmm::mr::device_memory_resource* mr) +std::unique_ptr extract_microsecond_fraction(column_view const& column, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::extract_microsecond(column, cudf::default_stream_value, mr); + return detail::extract_microsecond_fraction(column, cudf::default_stream_value, mr); } -std::unique_ptr extract_nanosecond(column_view const& column, - rmm::mr::device_memory_resource* mr) +std::unique_ptr extract_nanosecond_fraction(column_view const& column, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::extract_nanosecond(column, cudf::default_stream_value, mr); + return detail::extract_nanosecond_fraction(column, cudf::default_stream_value, mr); } std::unique_ptr 
last_day_of_month(column_view const& column, diff --git a/cpp/tests/datetime/datetime_ops_test.cpp b/cpp/tests/datetime/datetime_ops_test.cpp index 07ed7f33303..c6d36b2aa6e 100644 --- a/cpp/tests/datetime/datetime_ops_test.cpp +++ b/cpp/tests/datetime/datetime_ops_test.cpp @@ -60,9 +60,9 @@ TYPED_TEST(NonTimestampTest, TestThrowsOnNonTimestamp) EXPECT_THROW(extract_hour(col), cudf::logic_error); EXPECT_THROW(extract_minute(col), cudf::logic_error); EXPECT_THROW(extract_second(col), cudf::logic_error); - EXPECT_THROW(extract_millisecond(col), cudf::logic_error); - EXPECT_THROW(extract_microsecond(col), cudf::logic_error); - EXPECT_THROW(extract_nanosecond(col), cudf::logic_error); + EXPECT_THROW(extract_millisecond_fraction(col), cudf::logic_error); + EXPECT_THROW(extract_microsecond_fraction(col), cudf::logic_error); + EXPECT_THROW(extract_nanosecond_fraction(col), cudf::logic_error); EXPECT_THROW(last_day_of_month(col), cudf::logic_error); EXPECT_THROW(day_of_year(col), cudf::logic_error); EXPECT_THROW(add_calendrical_months( @@ -170,31 +170,31 @@ TEST_F(BasicDatetimeOpsTest, TestExtractingDatetimeComponents) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_minute(timestamps_ns), fixed_width_column_wrapper{59, 0, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_millisecond(timestamps_D), + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_millisecond_fraction(timestamps_D), fixed_width_column_wrapper{0, 0, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_millisecond(timestamps_s), + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_millisecond_fraction(timestamps_s), fixed_width_column_wrapper{0, 0, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_millisecond(timestamps_ms), + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_millisecond_fraction(timestamps_ms), fixed_width_column_wrapper{762, 0, 929}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_millisecond(timestamps_ns), + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_millisecond_fraction(timestamps_ns), fixed_width_column_wrapper{976, 23, 987}); - 
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_microsecond(timestamps_D), + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_microsecond_fraction(timestamps_D), fixed_width_column_wrapper{0, 0, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_microsecond(timestamps_s), + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_microsecond_fraction(timestamps_s), fixed_width_column_wrapper{0, 0, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_microsecond(timestamps_ms), + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_microsecond_fraction(timestamps_ms), fixed_width_column_wrapper{0, 0, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_microsecond(timestamps_ns), + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_microsecond_fraction(timestamps_ns), fixed_width_column_wrapper{675, 432, 234}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_nanosecond(timestamps_D), + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_nanosecond_fraction(timestamps_D), fixed_width_column_wrapper{0, 0, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_nanosecond(timestamps_s), + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_nanosecond_fraction(timestamps_s), fixed_width_column_wrapper{0, 0, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_nanosecond(timestamps_ms), + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_nanosecond_fraction(timestamps_ms), fixed_width_column_wrapper{0, 0, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_nanosecond(timestamps_ns), + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_nanosecond_fraction(timestamps_ns), fixed_width_column_wrapper{766, 424, 623}); } @@ -226,9 +226,9 @@ TYPED_TEST(TypedDatetimeOpsTest, TestEmptyColumns) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_hour(timestamps), int16s); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_minute(timestamps), int16s); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_second(timestamps), int16s); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_millisecond(timestamps), int16s); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_microsecond(timestamps), int16s); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_nanosecond(timestamps), int16s); + 
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_millisecond_fraction(timestamps), int16s); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_microsecond_fraction(timestamps), int16s); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_nanosecond_fraction(timestamps), int16s); } TYPED_TEST(TypedDatetimeOpsTest, TestExtractingGeneratedDatetimeComponents) diff --git a/python/cudf/cudf/_lib/cpp/datetime.pxd b/python/cudf/cudf/_lib/cpp/datetime.pxd index 0f97fc5635c..d03587745e1 100644 --- a/python/cudf/cudf/_lib/cpp/datetime.pxd +++ b/python/cudf/cudf/_lib/cpp/datetime.pxd @@ -15,13 +15,13 @@ cdef extern from "cudf/datetime.hpp" namespace "cudf::datetime" nogil: cdef unique_ptr[column] extract_hour(const column_view& column) except + cdef unique_ptr[column] extract_minute(const column_view& column) except + cdef unique_ptr[column] extract_second(const column_view& column) except + - cdef unique_ptr[column] extract_millisecond( + cdef unique_ptr[column] extract_millisecond_fraction( const column_view& column ) except + - cdef unique_ptr[column] extract_microsecond( + cdef unique_ptr[column] extract_microsecond_fraction( const column_view& column ) except + - cdef unique_ptr[column] extract_nanosecond( + cdef unique_ptr[column] extract_nanosecond_fraction( const column_view& column ) except + diff --git a/python/cudf/cudf/_lib/datetime.pyx b/python/cudf/cudf/_lib/datetime.pyx index df0f1f981c5..d37c9c24875 100644 --- a/python/cudf/cudf/_lib/datetime.pyx +++ b/python/cudf/cudf/_lib/datetime.pyx @@ -50,11 +50,17 @@ def extract_datetime_component(Column col, object field): elif field == "second": c_result = move(libcudf_datetime.extract_second(col_view)) elif field == "milli_second": - c_result = move(libcudf_datetime.extract_millisecond(col_view)) + c_result = move( + libcudf_datetime.extract_millisecond_fraction(col_view) + ) elif field == "micro_second": - c_result = move(libcudf_datetime.extract_microsecond(col_view)) + c_result = move( + libcudf_datetime.extract_microsecond_fraction(col_view) 
+ ) elif field == "nano_second": - c_result = move(libcudf_datetime.extract_nanosecond(col_view)) + c_result = move( + libcudf_datetime.extract_nanosecond_fraction(col_view) + ) elif field == "day_of_year": c_result = move(libcudf_datetime.day_of_year(col_view)) else: From 76d42348cec02d9381c4c54f2d7724825d20ff62 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 27 Sep 2022 13:57:36 -0500 Subject: [PATCH 19/51] Update cpp/include/cudf/datetime.hpp Co-authored-by: David Wendt <45795991+davidwendt@users.noreply.github.com> --- cpp/include/cudf/datetime.hpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cpp/include/cudf/datetime.hpp b/cpp/include/cudf/datetime.hpp index f4af5bbd251..40c2a09292b 100644 --- a/cpp/include/cudf/datetime.hpp +++ b/cpp/include/cudf/datetime.hpp @@ -151,6 +151,9 @@ std::unique_ptr extract_millisecond_fraction( * @brief Extracts microsecond fraction from any date time type and returns an int16_t * cudf::column. * + * A microsecond fraction is only the 3 digits that make up the microsecond portion of a duration. + * For example, the microsecond fraction of 1.234567890 seconds is 567. + * * @param column cudf::column_view of the input datetime values * @param mr Device memory resource used to allocate device memory of the returned column * From b0750ca516695c78c5cb9acaba5d00f60f0a4e05 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 27 Sep 2022 11:59:07 -0700 Subject: [PATCH 20/51] add examples --- cpp/include/cudf/datetime.hpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cpp/include/cudf/datetime.hpp b/cpp/include/cudf/datetime.hpp index 40c2a09292b..b7aefd382fa 100644 --- a/cpp/include/cudf/datetime.hpp +++ b/cpp/include/cudf/datetime.hpp @@ -137,6 +137,9 @@ std::unique_ptr extract_second( * @brief Extracts millisecond fraction from any date time type and returns an int16_t * cudf::column. * + * A millisecond fraction is only the 3 digits that make up the millisecond portion of a duration. 
+ * For example, the millisecond fraction of 1.234567890 seconds is 234. + * * @param column cudf::column_view of the input datetime values * @param mr Device memory resource used to allocate device memory of the returned column * @@ -168,6 +171,9 @@ std::unique_ptr extract_microsecond_fraction( * @brief Extracts nanosecond fraction from any date time type and returns an int16_t * cudf::column. * + * A nanosecond fraction is only the 3 digits that make up the nanosecond portion of a duration. + * For example, the nanosecond fraction of 1.234567890 seconds is 890. + * * @param column cudf::column_view of the input datetime values * @param mr Device memory resource used to allocate device memory of the returned column * From 55c5ab97c503672b0c74073f6d145a4a8874cc43 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 27 Sep 2022 12:45:13 -0700 Subject: [PATCH 21/51] add comments --- python/cudf/cudf/core/dataframe.py | 5 +++-- python/cudf/cudf/core/index.py | 12 ++++++++++++ python/cudf/cudf/core/series.py | 3 +++ 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index bd89fd6e130..92a102064d0 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1763,9 +1763,10 @@ def _clean_renderable_dataframe(self, output): cudf.core.column.datetime.DatetimeColumn, ), ): + # Converting to string column is necessary for Timedelta + # & DatetimeColumn's because larger values will easily + # overflow while being converted to pandas later.
output._data[col_name] = output._data[col_name].astype("str") - else: - output._data[col_name] = output._data[col_name] output = output.to_pandas().to_string( max_rows=max_rows, diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 6625d88077e..6dabe3ad317 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -2214,6 +2214,12 @@ def _clean_nulls_from_index(self): methods using this method to replace or handle representation of the actual types correctly. """ + # Converting to string Index is necessary for DatetimeColumn + # because larger values will easily overflow while being + # converted to pandas later. + # Converting to CategoricalIndex is necessary to maintain repr + # formatting, as StringIndex is resulting in drastically different + # output. return cudf.Index( self._values.astype("str").fillna(cudf._NA_REP).astype("category"), name=self.name, @@ -2477,6 +2483,12 @@ def _clean_nulls_from_index(self): methods using this method to replace or handle representation of the actual types correctly. """ + # Converting to string Index is necessary for TimedeltaColumn + # because larger values will easily overflow while being + # converted to pandas later. + # Converting to CategoricalIndex is necessary to maintain repr + # formatting, as StringIndex is resulting in drastically different + # output. return cudf.Index( self._values.astype("str").fillna(cudf._NA_REP).astype("category"), name=self.name, diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 50d58bcfd4f..47601013e12 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1212,6 +1212,9 @@ def __repr__(self): cudf.core.column.datetime.DatetimeColumn, ), ): + # Converting to string column is necessary for Timedelta + # & DatetimeColumn's because larger values will easily + # overflow while being converted to pandas later. 
output = repr( preprocess.astype("O").fillna(cudf._NA_REP).to_pandas() ) From c8e78cacf2e2835b0932b7da321201c5efbe5061 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 27 Sep 2022 13:18:01 -0700 Subject: [PATCH 22/51] remove redundant code and docstrings --- python/cudf/cudf/core/index.py | 28 ---------------------------- 1 file changed, 28 deletions(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 6dabe3ad317..8705d12548a 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -2205,15 +2205,6 @@ def is_boolean(self): return False def _clean_nulls_from_index(self): - """ - Convert all na values(if any) in Index object - to `` as a preprocessing step to `__repr__` methods. - - This will involve changing type of Index object - to StringIndex but it is the responsibility of the `__repr__` - methods using this method to replace or handle representation - of the actual types correctly. - """ # Converting to string Index is necessary for DatetimeColumn # because larger values will easily overflow while being # converted to pandas later. @@ -2474,15 +2465,6 @@ def is_boolean(self): return False def _clean_nulls_from_index(self): - """ - Convert all na values(if any) in Index object - to `` as a preprocessing step to `__repr__` methods. - - This will involve changing type of Index object - to StringIndex but it is the responsibility of the `__repr__` - methods using this method to replace or handle representation - of the actual types correctly. - """ # Converting to string Index is necessary for TimedeltaColumn # because larger values will easily overflow while being # converted to pandas later. @@ -2920,16 +2902,6 @@ def __repr__(self): def str(self): return StringMethods(parent=self) - def _clean_nulls_from_index(self): - """ - Convert all na values(if any) in Index object - to `` as a preprocessing step to `__repr__` methods. 
- """ - if self._values.has_nulls(): - return self.fillna(cudf._NA_REP) - else: - return self - def is_boolean(self): return False From d31060aba4946a42671fdf75c085fbf0cbd0065a Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 27 Sep 2022 13:55:49 -0700 Subject: [PATCH 23/51] document categorical index conversion with example --- python/cudf/cudf/core/index.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 8705d12548a..77397bbd8a4 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1255,6 +1255,26 @@ def __repr__(self): output = repr(preprocess._clean_nulls_from_index().to_pandas()) if isinstance(self, (DatetimeIndex, TimedeltaIndex)): + # Converting to CategoricalIndex was necessary to maintain repr + # formatting of 1 value per line, now we will have to remove + # the CategoricalIndex: + """ + >>> s = cudf.Index([2113, 1221, 12321], dtype='datetime64[ns]') + >>> s.to_pandas() + DatetimeIndex(['1970-01-01 00:00:00.000002113', + '1970-01-01 00:00:00.000001221', + '1970-01-01 00:00:00.000012321'], + dtype='datetime64[ns]', freq=None) + >>> s.to_pandas().astype('str') + Index(['1970-01-01 00:00:00.000002113', '1970-01-01 00:00:00.000001221', + '1970-01-01 00:00:00.000012321'], + dtype='object') + >>> s.to_pandas().astype('category') + CategoricalIndex(['1970-01-01 00:00:00.000002113', + '1970-01-01 00:00:00.000001221', + '1970-01-01 00:00:00.000012321'], + categories=[1970-01-01 00:00:00.000001221, 1970-01-01 00:00:00.000002113, 1970-01-01 00:00:00.000012321], ordered=False, dtype='category') + """ # noqa: E501 output = ( output[: output.rfind("categories=[")] + output[output.rfind(" dtype=") :] From 872a1b7c51d99cdabf1aa586a598e6ee9bdb0f14 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 27 Sep 2022 16:12:25 -0500 Subject: [PATCH 24/51] Update python/cudf/cudf/core/index.py Co-authored-by: Ashwin Srinath 
<3190405+shwina@users.noreply.github.com> --- python/cudf/cudf/core/index.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 77397bbd8a4..a7d825dbd96 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -2225,12 +2225,18 @@ def is_boolean(self): return False def _clean_nulls_from_index(self): - # Converting to string Index is necessary for DatetimeColumn - # because larger values will easily overflow while being - # converted to pandas later. - # Converting to CategoricalIndex is necessary to maintain repr - # formatting, as StringIndex is resulting in drastically different - # output. + # __repr__ for other data types works by converting + # to Pandas first, and relying on Pandas to convert + # values to strings. However, + # Pandas encounters issues with overflow for datetime + # and timedelta types because it does not support + # sub-nanosecond resolutions. Thus, we do the work of + # converting datetimes/timedeltas to strings before handing + # off to Pandas. + # + # Further, we need to cast the result to a `CategoricalIndex`, + # because `StringIndex` values in Pandas are printed on + # the same line, rather than one-per-line. return cudf.Index( self._values.astype("str").fillna(cudf._NA_REP).astype("category"), name=self.name, From 1337b94ec95cf3305655d66d7006cb8db42edc39 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 27 Sep 2022 14:14:45 -0700 Subject: [PATCH 25/51] style --- python/cudf/cudf/core/index.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index a7d825dbd96..74fd9053870 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -2233,7 +2233,7 @@ def _clean_nulls_from_index(self): # sub-nanosecond resolutions. 
Thus, we do the work of # converting datetimes/timedeltas to strings before handing # off to Pandas. - # + # # Further, we need to cast the result to a `CategoricalIndex`, # because `StringIndex` values in Pandas are printed on # the same line, rather than one-per-line. From 486bf42b0922e9bf78ef2f7fcd47a1bb518c74c9 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 12 Oct 2022 08:57:22 -0700 Subject: [PATCH 26/51] revert all repr changes --- python/cudf/cudf/core/column/datetime.py | 37 +- python/cudf/cudf/core/column/timedelta.py | 41 +- python/cudf/cudf/core/dataframe.py | 33 +- python/cudf/cudf/core/index.py | 102 +---- python/cudf/cudf/core/series.py | 8 +- python/cudf/cudf/tests/test_concat.py | 4 +- python/cudf/cudf/tests/test_dataframe.py | 34 +- python/cudf/cudf/tests/test_repr.py | 456 +++------------------- python/cudf/cudf/utils/docutils.py | 20 +- 9 files changed, 118 insertions(+), 617 deletions(-) diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index bcdce023c92..1419b14e8c6 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -331,16 +331,9 @@ def as_string_column( self, dtype: Dtype, format=None, **kwargs ) -> "cudf.core.column.StringColumn": if format is None: - if _has_ns(self): - format = _dtype_to_format_conversion.get(self.dtype.name) - elif _has_us(self): - format = _dtype_to_format_conversion.get("datetime64[us]") - elif _has_ms(self): - format = _dtype_to_format_conversion.get("datetime64[ms]") - elif _has_hr(self) or _has_m(self) or _has_s(self): - format = _dtype_to_format_conversion.get("datetime64[s]") - else: - format = "%Y-%m-%d" + format = _dtype_to_format_conversion.get( + self.dtype.name, "%Y-%m-%d %H:%M:%S" + ) if format in _DATETIME_SPECIAL_FORMATS: names = as_column(_DATETIME_NAMES) else: @@ -588,27 +581,3 @@ def _resolve_mixed_dtypes( rhs_time_unit = cudf.utils.dtypes.get_time_unit(rhs) rhs_unit = units.index(rhs_time_unit) 
return cudf.dtype(f"{base_type}[{units[max(lhs_unit, rhs_unit)]}]") - - -def _has_hr(col: DatetimeColumn) -> bool: - return (col.get_dt_field("hour") > 0).any() - - -def _has_m(col: DatetimeColumn) -> bool: - return (col.get_dt_field("minute") > 0).any() - - -def _has_s(col: DatetimeColumn) -> bool: - return (col.get_dt_field("second") > 0).any() - - -def _has_ms(col: DatetimeColumn) -> bool: - return (col.get_dt_field("milli_second") > 0).any() - - -def _has_us(col: DatetimeColumn) -> bool: - return (col.get_dt_field("micro_second") > 0).any() - - -def _has_ns(col: DatetimeColumn) -> bool: - return (col.get_dt_field("nano_second") > 0).any() diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 205ce41327b..e6d688014fa 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -288,42 +288,13 @@ def as_string_column( self, dtype: Dtype, format=None, **kwargs ) -> "cudf.core.column.StringColumn": if format is None: - components = self.components() > 0 - has_hr = components.hours.any() - has_m = components.seconds.any() - has_s = components.seconds.any() - has_ms = components.milliseconds.any() - has_us = components.microseconds.any() - has_ns = components.nanoseconds.any() - - if has_ns: - format = _dtype_to_format_conversion.get("timedelta64[ns]") - target_dtype = cudf.dtype("timedelta64[ns]") - elif has_us: - format = _dtype_to_format_conversion.get("timedelta64[us]") - target_dtype = cudf.dtype("timedelta64[us]") - elif has_ms: - format = _dtype_to_format_conversion.get("timedelta64[ms]") - target_dtype = cudf.dtype("timedelta64[ms]") - elif has_s or has_m or has_hr: - format = _dtype_to_format_conversion.get("timedelta64[s]") - target_dtype = cudf.dtype("timedelta64[s]") - else: - target_dtype = self.dtype - format = "%D days" - - # TODO: Remove the type-casting after there is option from libcudf - # side to support %3S, %6S, %9S. 
- if self.dtype != target_dtype: - timedelta_col = self.astype(target_dtype) - else: - timedelta_col = self - else: - timedelta_col = self - if len(timedelta_col) > 0: + format = _dtype_to_format_conversion.get( + self.dtype.name, "%D days %H:%M:%S" + ) + if len(self) > 0: return string._timedelta_to_str_typecast_functions[ - cudf.dtype(timedelta_col.dtype) - ](timedelta_col, format=format) + cudf.dtype(self.dtype) + ](self, format=format) else: return cast( "cudf.core.column.StringColumn", diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 56fa7e17c63..f00c7d1f2b5 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -570,12 +570,12 @@ class DataFrame(IndexedFrame, Serializable, GetAttrGetItemMixin): ... [(t0+ timedelta(seconds=x)) for x in range(n)]) ... }) >>> df - id datetimes - 0 0 2018-10-07 12:00:00 - 1 1 2018-10-07 12:00:01 - 2 2 2018-10-07 12:00:02 - 3 3 2018-10-07 12:00:03 - 4 4 2018-10-07 12:00:04 + id datetimes + 0 0 2018-10-07 12:00:00 + 1 1 2018-10-07 12:00:01 + 2 2 2018-10-07 12:00:02 + 3 3 2018-10-07 12:00:03 + 4 4 2018-10-07 12:00:04 Build DataFrame via list of rows as tuples: @@ -1048,8 +1048,8 @@ def dtypes(self): ... 'datetime': [pd.Timestamp('20180310')], ... 'string': ['foo']}) >>> df - float int datetime string - 0 1.0 1 2018-03-10 foo + float int datetime string + 0 1.0 1 2018-03-10 foo >>> df.dtypes float float64 int int64 @@ -1755,19 +1755,6 @@ def _clean_renderable_dataframe(self, output): else: width = None - for col_name, col in output._data.items(): - if isinstance( - col, - ( - cudf.core.column.timedelta.TimeDeltaColumn, - cudf.core.column.datetime.DatetimeColumn, - ), - ): - # Converting to string column is necessary for Timedelta - # & DatetimeColumn's because larger values will easily - # overflow while being converted to pandas later. 
- output._data[col_name] = output._data[col_name].astype("str") - output = output.to_pandas().to_string( max_rows=max_rows, min_rows=min_rows, @@ -3913,8 +3900,8 @@ def query(self, expr, local_dict=None): >>> df['datetimes'] = data >>> search_date = datetime.datetime.strptime('2018-10-08', '%Y-%m-%d') >>> df.query('datetimes==@search_date') - datetimes - 1 2018-10-08 + datetimes + 1 2018-10-08 Using local_dict: diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 640097fa886..b838baeddcc 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1282,39 +1282,10 @@ def __repr__(self): output = repr(preprocess.to_pandas()) output = output.replace("nan", cudf._NA_REP) - elif preprocess._values.nullable or isinstance( - preprocess, (DatetimeIndex, TimedeltaIndex) - ): - output = repr(preprocess._clean_nulls_from_index().to_pandas()) - - if isinstance(self, (DatetimeIndex, TimedeltaIndex)): - # Converting to CategoricalIndex was necessary to maintain repr - # formatting of 1 value per line, now we will have to remove - # the CategoricalIndex: - """ - >>> s = cudf.Index([2113, 1221, 12321], dtype='datetime64[ns]') - >>> s.to_pandas() - DatetimeIndex(['1970-01-01 00:00:00.000002113', - '1970-01-01 00:00:00.000001221', - '1970-01-01 00:00:00.000012321'], - dtype='datetime64[ns]', freq=None) - >>> s.to_pandas().astype('str') - Index(['1970-01-01 00:00:00.000002113', '1970-01-01 00:00:00.000001221', - '1970-01-01 00:00:00.000012321'], - dtype='object') - >>> s.to_pandas().astype('category') - CategoricalIndex(['1970-01-01 00:00:00.000002113', - '1970-01-01 00:00:00.000001221', - '1970-01-01 00:00:00.000012321'], - categories=[1970-01-01 00:00:00.000001221, 1970-01-01 00:00:00.000002113, 1970-01-01 00:00:00.000012321], ordered=False, dtype='category') - """ # noqa: E501 - output = ( - output[: output.rfind("categories=[")] - + output[output.rfind(" dtype=") :] - ) - if not isinstance( - self, (StringIndex, 
DatetimeIndex, TimedeltaIndex) - ): + elif preprocess._values.nullable: + output = repr(self._clean_nulls_from_index().to_pandas()) + + if not isinstance(self, StringIndex): # We should remove all the single quotes # from the output due to the type-cast to # object dtype happening above. @@ -1345,15 +1316,6 @@ def __repr__(self): else: lines[-1] = lines[-1] + ")" - if isinstance(preprocess, (DatetimeIndex, TimedeltaIndex)): - replace_spaces = ( - " " if isinstance(preprocess, DatetimeIndex) else " " - ) - if len(lines) > 1: - lines[1:-1] = [ - line.replace(replace_spaces, "", 1) for line in lines[1:-1] - ] - lines[-1] = lines[-1].replace(replace_spaces + " ", "", 1) return "\n".join(lines) @_cudf_nvtx_annotate @@ -2102,12 +2064,12 @@ def millisecond(self): >>> datetime_index = cudf.Index(pd.date_range("2000-01-01", ... periods=3, freq="ms")) >>> datetime_index - DatetimeIndex(['2000-01-01 00:00:00.000', '2000-01-01 00:00:00.001', - '2000-01-01 00:00:00.002'], + DatetimeIndex([ '2000-01-01 00:00:00', '2000-01-01 00:00:00.001000', + '2000-01-01 00:00:00.002000'], dtype='datetime64[ns]') >>> datetime_index.millisecond Int16Index([0, 1, 2], dtype='int16') - """ + """ # noqa: E501 return self._get_dt_field("milli_second") @property # type: ignore @@ -2123,9 +2085,9 @@ def microsecond(self): >>> datetime_index = cudf.Index(pd.date_range("2000-01-01", ... periods=3, freq="us")) >>> datetime_index - DatetimeIndex(['2000-01-01 00:00:00.000000', '2000-01-01 00:00:00.000001', - '2000-01-01 00:00:00.000002'], - dtype='datetime64[ns]') + DatetimeIndex([ '2000-01-01 00:00:00', '2000-01-01 00:00:00.000001', + '2000-01-01 00:00:00.000002'], + dtype='datetime64[ns]') >>> datetime_index.microsecond Int16Index([0, 1, 2], dtype='int16') """ # noqa: E501 @@ -2144,7 +2106,7 @@ def nanosecond(self): >>> datetime_index = cudf.Index(pd.date_range("2000-01-01", ... 
periods=3, freq="ns")) >>> datetime_index - DatetimeIndex(['2000-01-01 00:00:00.000000000', + DatetimeIndex([ '2000-01-01 00:00:00', '2000-01-01 00:00:00.000000001', '2000-01-01 00:00:00.000000002'], dtype='datetime64[ns]') @@ -2235,9 +2197,9 @@ def day_of_year(self): ... "2017-01-08", freq="D")) >>> datetime_index DatetimeIndex(['2016-12-31', '2017-01-01', '2017-01-02', '2017-01-03', - '2017-01-04', '2017-01-05', '2017-01-06', '2017-01-07', - '2017-01-08'], - dtype='datetime64[ns]') + '2017-01-04', '2017-01-05', '2017-01-06', '2017-01-07', + '2017-01-08'], + dtype='datetime64[ns]') >>> datetime_index.day_of_year Int16Index([366, 1, 2, 3, 4, 5, 6, 7, 8], dtype='int16') """ @@ -2332,24 +2294,6 @@ def _get_dt_field(self, field): def is_boolean(self): return False - def _clean_nulls_from_index(self): - # __repr__ for other data types works by converting - # to Pandas first, and relying on Pandas to convert - # values to strings. However, - # Pandas encounters issues with overflow for datetime - # and timedelta types because it does not support - # sub-nanosecond resolutions. Thus, we do the work of - # converting datetimes/timedeltas to strings before handing - # off to Pandas. - # - # Further, we need to cast the result to a `CategoricalIndex`, - # because `StringIndex` values in Pandas are printed on - # the same line, rather than one-per-line. - return cudf.Index( - self._values.astype("str").fillna(cudf._NA_REP).astype("category"), - name=self.name, - ) - @_cudf_nvtx_annotate def ceil(self, freq): """ @@ -2598,18 +2542,6 @@ def inferred_freq(self): def is_boolean(self): return False - def _clean_nulls_from_index(self): - # Converting to string Index is necessary for TimedeltaColumn - # because larger values will easily overflow while being - # converted to pandas later. - # Converting to CategoricalIndex is necessary to maintain repr - # formatting, as StringIndex is resulting in drastically different - # output. 
- return cudf.Index( - self._values.astype("str").fillna(cudf._NA_REP).astype("category"), - name=self.name, - ) - class CategoricalIndex(GenericIndex): """ @@ -3036,6 +2968,12 @@ def __repr__(self): def str(self): return StringMethods(parent=self) + def _clean_nulls_from_index(self): + if self._values.has_nulls(): + return self.fillna(cudf._NA_REP) + else: + return self + def is_boolean(self): return False diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 8eaddebccd7..35591091ef9 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1207,14 +1207,8 @@ def __repr__(self): and not is_struct_dtype(preprocess.dtype) ) or isinstance( preprocess._column, - ( - cudf.core.column.timedelta.TimeDeltaColumn, - cudf.core.column.datetime.DatetimeColumn, - ), + cudf.core.column.timedelta.TimeDeltaColumn, ): - # Converting to string column is necessary for Timedelta - # & DatetimeColumn's because larger values will easily - # overflow while being converted to pandas later. 
output = repr( preprocess.astype("O").fillna(cudf._NA_REP).to_pandas() ) diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py index 7b0b18b7b55..8f6dce4828a 100644 --- a/python/cudf/cudf/tests/test_concat.py +++ b/python/cudf/cudf/tests/test_concat.py @@ -1697,8 +1697,8 @@ def test_concat_decimal_numeric_series(s1, s2, s3, expected): [ "955.22", "8.20", - "2007-06-12", - "2006-03-14", + "2007-06-12 00:00:00", + "2006-03-14 00:00:00", ], index=[0, 1, 0, 1], ), diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 37a6407cf85..d9e9a4dbba1 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -4023,20 +4023,19 @@ def test_series_astype_datetime_to_other(as_dtype): @pytest.mark.parametrize( - "dtype", + "inp", [ - "datetime64[ns]", - "datetime64[us]", - "datetime64[ms]", - "datetime64[s]", + ("datetime64[ns]", "2011-01-01 00:00:00.000000000"), + ("datetime64[us]", "2011-01-01 00:00:00.000000"), + ("datetime64[ms]", "2011-01-01 00:00:00.000"), + ("datetime64[s]", "2011-01-01 00:00:00"), ], ) -def test_series_astype_datetime_to_string(dtype): +def test_series_astype_datetime_to_string(inp): + dtype, expect = inp base_date = "2011-01-01" sr = cudf.Series([base_date], dtype=dtype) - psr = sr.to_pandas() got = sr.astype(str)[0] - expect = psr.astype(str)[0] assert expect == got @@ -4168,14 +4167,7 @@ def test_series_astype_null_cases(): "2001-03-01 00:00:00.000000", ] assert_eq( - cudf.Series( - [ - "2001-01-01", - "2001-02-01", - None, - "2001-03-01", - ] - ), + cudf.Series(data), cudf.Series(data, dtype="datetime64[us]").astype("str"), ) @@ -4573,14 +4565,8 @@ def test_df_astype_datetime_to_other(as_dtype): [690595200000, 1102118400000, 1473724800000, None], dtype="int64" ) elif as_dtype == "str": - expect["foo"] = cudf.Series( - gdf["foo"].to_pandas().astype("str").replace("NaT", None), - dtype="str", - ) - expect["bar"] = cudf.Series( 
- gdf["bar"].to_pandas().astype("str").replace("NaT", None), - dtype="str", - ) + expect["foo"] = cudf.Series(data, dtype="str") + expect["bar"] = cudf.Series(data, dtype="str") elif as_dtype == "category": expect["foo"] = cudf.Series(gdf["foo"], dtype="category") expect["bar"] = cudf.Series(gdf["bar"], dtype="category") diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index de6977e6d06..c4985639173 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -82,63 +82,21 @@ def test_null_dataframe(ncols): pd.reset_option("display.max_columns") -def _assert_date_series_repr(ps, gs): - """ - This is a utility function to compare pandas & cudf - datetime series repr's. - The repr's differ in the way text is spaced: - - >>> s = cudf.Series([100, 200, 300], dtype='datetime64[ns]') - >>> s - 0 1970-01-01 00:00:00.000000100 - 1 1970-01-01 00:00:00.000000200 - 2 1970-01-01 00:00:00.000000300 - dtype: datetime64[ns] - - >>> s.to_pandas() - 0 1970-01-01 00:00:00.000000100 - 1 1970-01-01 00:00:00.000000200 - 2 1970-01-01 00:00:00.000000300 - dtype: datetime64[ns] - """ - expected_list = repr(ps).split("\n") - actual_list = repr(gs).split("\n") - - new_actual_list = [] - for text in actual_list: - if not text.startswith(" "): - new_actual_list.append(text.replace(" ", " ", 1)) - else: - if "..." 
in text: - new_actual_list.append(" ".join(text.rsplit(" ", 1))) - else: - new_actual_list.append(text) - - assert expected_list == new_actual_list - - @pytest.mark.parametrize("dtype", repr_categories) @pytest.mark.parametrize("nrows", [None, 0, 1, 2, 9, 10, 11, 19, 20, 21]) def test_full_series(nrows, dtype): - np.random.seed(0) size = 20 ps = pd.Series(np.random.randint(0, 100, size)).astype(dtype) sr = cudf.from_pandas(ps) pd.options.display.max_rows = nrows - - if cudf.api.types.is_datetime_dtype(dtype): - _assert_date_series_repr(ps, sr) - else: - assert repr(ps) == repr(sr) + assert repr(ps) == repr(sr) pd.reset_option("display.max_rows") @pytest.mark.parametrize("nrows", [5, 10, 15]) @pytest.mark.parametrize("ncols", [5, 10, 15]) @pytest.mark.parametrize("size", [20, 21]) -@pytest.mark.parametrize( - "dtype", sorted(list(set(repr_categories) - {"datetime64[ns]"})) -) +@pytest.mark.parametrize("dtype", repr_categories) def test_full_dataframe_20(dtype, size, nrows, ncols): pdf = pd.DataFrame( {idx: np.random.randint(0, 100, size) for idx in range(size)} @@ -153,207 +111,6 @@ def test_full_dataframe_20(dtype, size, nrows, ncols): assert pdf._repr_latex_() == gdf._repr_latex_() -@pytest.mark.parametrize( - "nrows,ncols,data,expected_repr", - [ - ( - 5, - 5, - {idx: np.arange(0, 100, 5) for idx in range(20)}, - textwrap.dedent( - """ - 0 1 ... \\ -0 1970-01-01 00:00:00.000000000 1970-01-01 00:00:00.000000000 ... -1 1970-01-01 00:00:00.000000005 1970-01-01 00:00:00.000000005 ... -.. ... ... ... -18 1970-01-01 00:00:00.000000090 1970-01-01 00:00:00.000000090 ... -19 1970-01-01 00:00:00.000000095 1970-01-01 00:00:00.000000095 ... - - 18 19 -0 1970-01-01 00:00:00.000000000 1970-01-01 00:00:00.000000000 -1 1970-01-01 00:00:00.000000005 1970-01-01 00:00:00.000000005 -.. ... ... 
-18 1970-01-01 00:00:00.000000090 1970-01-01 00:00:00.000000090 -19 1970-01-01 00:00:00.000000095 1970-01-01 00:00:00.000000095 - -[20 rows x 20 columns] -""" - ), - ), - ( - 5, - 15, - {idx: np.arange(0, 100, 5) for idx in range(20)}, - textwrap.dedent( - """ - 0 1 \\ -0 1970-01-01 00:00:00.000000000 1970-01-01 00:00:00.000000000 -1 1970-01-01 00:00:00.000000005 1970-01-01 00:00:00.000000005 -.. ... ... -18 1970-01-01 00:00:00.000000090 1970-01-01 00:00:00.000000090 -19 1970-01-01 00:00:00.000000095 1970-01-01 00:00:00.000000095 - - 2 3 \\ -0 1970-01-01 00:00:00.000000000 1970-01-01 00:00:00.000000000 -1 1970-01-01 00:00:00.000000005 1970-01-01 00:00:00.000000005 -.. ... ... -18 1970-01-01 00:00:00.000000090 1970-01-01 00:00:00.000000090 -19 1970-01-01 00:00:00.000000095 1970-01-01 00:00:00.000000095 - - 4 5 \\ -0 1970-01-01 00:00:00.000000000 1970-01-01 00:00:00.000000000 -1 1970-01-01 00:00:00.000000005 1970-01-01 00:00:00.000000005 -.. ... ... -18 1970-01-01 00:00:00.000000090 1970-01-01 00:00:00.000000090 -19 1970-01-01 00:00:00.000000095 1970-01-01 00:00:00.000000095 - - 6 ... 13 \\ -0 1970-01-01 00:00:00.000000000 ... 1970-01-01 00:00:00.000000000 -1 1970-01-01 00:00:00.000000005 ... 1970-01-01 00:00:00.000000005 -.. ... ... ... -18 1970-01-01 00:00:00.000000090 ... 1970-01-01 00:00:00.000000090 -19 1970-01-01 00:00:00.000000095 ... 1970-01-01 00:00:00.000000095 - - 14 15 \\ -0 1970-01-01 00:00:00.000000000 1970-01-01 00:00:00.000000000 -1 1970-01-01 00:00:00.000000005 1970-01-01 00:00:00.000000005 -.. ... ... -18 1970-01-01 00:00:00.000000090 1970-01-01 00:00:00.000000090 -19 1970-01-01 00:00:00.000000095 1970-01-01 00:00:00.000000095 - - 16 17 \\ -0 1970-01-01 00:00:00.000000000 1970-01-01 00:00:00.000000000 -1 1970-01-01 00:00:00.000000005 1970-01-01 00:00:00.000000005 -.. ... ... 
-18 1970-01-01 00:00:00.000000090 1970-01-01 00:00:00.000000090 -19 1970-01-01 00:00:00.000000095 1970-01-01 00:00:00.000000095 - - 18 19 -0 1970-01-01 00:00:00.000000000 1970-01-01 00:00:00.000000000 -1 1970-01-01 00:00:00.000000005 1970-01-01 00:00:00.000000005 -.. ... ... -18 1970-01-01 00:00:00.000000090 1970-01-01 00:00:00.000000090 -19 1970-01-01 00:00:00.000000095 1970-01-01 00:00:00.000000095 - -[20 rows x 20 columns] -""" - ), - ), - ( - 15, - 15, - {idx: np.arange(0, 100, 5) for idx in range(20)}, - textwrap.dedent( - """ - 0 1 \\ -0 1970-01-01 00:00:00.000000000 1970-01-01 00:00:00.000000000 -1 1970-01-01 00:00:00.000000005 1970-01-01 00:00:00.000000005 -2 1970-01-01 00:00:00.000000010 1970-01-01 00:00:00.000000010 -3 1970-01-01 00:00:00.000000015 1970-01-01 00:00:00.000000015 -4 1970-01-01 00:00:00.000000020 1970-01-01 00:00:00.000000020 -.. ... ... -15 1970-01-01 00:00:00.000000075 1970-01-01 00:00:00.000000075 -16 1970-01-01 00:00:00.000000080 1970-01-01 00:00:00.000000080 -17 1970-01-01 00:00:00.000000085 1970-01-01 00:00:00.000000085 -18 1970-01-01 00:00:00.000000090 1970-01-01 00:00:00.000000090 -19 1970-01-01 00:00:00.000000095 1970-01-01 00:00:00.000000095 - - 2 3 \\ -0 1970-01-01 00:00:00.000000000 1970-01-01 00:00:00.000000000 -1 1970-01-01 00:00:00.000000005 1970-01-01 00:00:00.000000005 -2 1970-01-01 00:00:00.000000010 1970-01-01 00:00:00.000000010 -3 1970-01-01 00:00:00.000000015 1970-01-01 00:00:00.000000015 -4 1970-01-01 00:00:00.000000020 1970-01-01 00:00:00.000000020 -.. ... ... 
-15 1970-01-01 00:00:00.000000075 1970-01-01 00:00:00.000000075 -16 1970-01-01 00:00:00.000000080 1970-01-01 00:00:00.000000080 -17 1970-01-01 00:00:00.000000085 1970-01-01 00:00:00.000000085 -18 1970-01-01 00:00:00.000000090 1970-01-01 00:00:00.000000090 -19 1970-01-01 00:00:00.000000095 1970-01-01 00:00:00.000000095 - - 4 5 \\ -0 1970-01-01 00:00:00.000000000 1970-01-01 00:00:00.000000000 -1 1970-01-01 00:00:00.000000005 1970-01-01 00:00:00.000000005 -2 1970-01-01 00:00:00.000000010 1970-01-01 00:00:00.000000010 -3 1970-01-01 00:00:00.000000015 1970-01-01 00:00:00.000000015 -4 1970-01-01 00:00:00.000000020 1970-01-01 00:00:00.000000020 -.. ... ... -15 1970-01-01 00:00:00.000000075 1970-01-01 00:00:00.000000075 -16 1970-01-01 00:00:00.000000080 1970-01-01 00:00:00.000000080 -17 1970-01-01 00:00:00.000000085 1970-01-01 00:00:00.000000085 -18 1970-01-01 00:00:00.000000090 1970-01-01 00:00:00.000000090 -19 1970-01-01 00:00:00.000000095 1970-01-01 00:00:00.000000095 - - 6 ... 13 \\ -0 1970-01-01 00:00:00.000000000 ... 1970-01-01 00:00:00.000000000 -1 1970-01-01 00:00:00.000000005 ... 1970-01-01 00:00:00.000000005 -2 1970-01-01 00:00:00.000000010 ... 1970-01-01 00:00:00.000000010 -3 1970-01-01 00:00:00.000000015 ... 1970-01-01 00:00:00.000000015 -4 1970-01-01 00:00:00.000000020 ... 1970-01-01 00:00:00.000000020 -.. ... ... ... -15 1970-01-01 00:00:00.000000075 ... 1970-01-01 00:00:00.000000075 -16 1970-01-01 00:00:00.000000080 ... 1970-01-01 00:00:00.000000080 -17 1970-01-01 00:00:00.000000085 ... 1970-01-01 00:00:00.000000085 -18 1970-01-01 00:00:00.000000090 ... 1970-01-01 00:00:00.000000090 -19 1970-01-01 00:00:00.000000095 ... 
1970-01-01 00:00:00.000000095 - - 14 15 \\ -0 1970-01-01 00:00:00.000000000 1970-01-01 00:00:00.000000000 -1 1970-01-01 00:00:00.000000005 1970-01-01 00:00:00.000000005 -2 1970-01-01 00:00:00.000000010 1970-01-01 00:00:00.000000010 -3 1970-01-01 00:00:00.000000015 1970-01-01 00:00:00.000000015 -4 1970-01-01 00:00:00.000000020 1970-01-01 00:00:00.000000020 -.. ... ... -15 1970-01-01 00:00:00.000000075 1970-01-01 00:00:00.000000075 -16 1970-01-01 00:00:00.000000080 1970-01-01 00:00:00.000000080 -17 1970-01-01 00:00:00.000000085 1970-01-01 00:00:00.000000085 -18 1970-01-01 00:00:00.000000090 1970-01-01 00:00:00.000000090 -19 1970-01-01 00:00:00.000000095 1970-01-01 00:00:00.000000095 - - 16 17 \\ -0 1970-01-01 00:00:00.000000000 1970-01-01 00:00:00.000000000 -1 1970-01-01 00:00:00.000000005 1970-01-01 00:00:00.000000005 -2 1970-01-01 00:00:00.000000010 1970-01-01 00:00:00.000000010 -3 1970-01-01 00:00:00.000000015 1970-01-01 00:00:00.000000015 -4 1970-01-01 00:00:00.000000020 1970-01-01 00:00:00.000000020 -.. ... ... -15 1970-01-01 00:00:00.000000075 1970-01-01 00:00:00.000000075 -16 1970-01-01 00:00:00.000000080 1970-01-01 00:00:00.000000080 -17 1970-01-01 00:00:00.000000085 1970-01-01 00:00:00.000000085 -18 1970-01-01 00:00:00.000000090 1970-01-01 00:00:00.000000090 -19 1970-01-01 00:00:00.000000095 1970-01-01 00:00:00.000000095 - - 18 19 -0 1970-01-01 00:00:00.000000000 1970-01-01 00:00:00.000000000 -1 1970-01-01 00:00:00.000000005 1970-01-01 00:00:00.000000005 -2 1970-01-01 00:00:00.000000010 1970-01-01 00:00:00.000000010 -3 1970-01-01 00:00:00.000000015 1970-01-01 00:00:00.000000015 -4 1970-01-01 00:00:00.000000020 1970-01-01 00:00:00.000000020 -.. ... ... 
-15 1970-01-01 00:00:00.000000075 1970-01-01 00:00:00.000000075 -16 1970-01-01 00:00:00.000000080 1970-01-01 00:00:00.000000080 -17 1970-01-01 00:00:00.000000085 1970-01-01 00:00:00.000000085 -18 1970-01-01 00:00:00.000000090 1970-01-01 00:00:00.000000090 -19 1970-01-01 00:00:00.000000095 1970-01-01 00:00:00.000000095 - -[20 rows x 20 columns] -""" - ), - ), - ], -) -def test_full_datetime_dataframe(nrows, ncols, data, expected_repr): - pdf = pd.DataFrame(data).astype("datetime64[ns]") - gdf = cudf.from_pandas(pdf) - - with pd.option_context( - "display.max_rows", int(nrows), "display.max_columns", int(ncols) - ): - assert expected_repr.split() == repr(gdf).split() - assert pdf._repr_html_() == gdf._repr_html_() - assert pdf._repr_latex_() == gdf._repr_latex_() - - @given( st.lists( st.integers(-9223372036854775808, 9223372036854775807), @@ -601,36 +358,35 @@ def test_dataframe_sliced(gdf, slice, max_seq_items, max_rows): ), ( cudf.Index(np.array([10, 20, 30, None], dtype="datetime64[ns]")), - "DatetimeIndex(['1970-01-01 00:00:00.000000010',\n" - " '1970-01-01 00:00:00.000000020',\n" - " '1970-01-01 00:00:00.000000030', ''],\n" - " dtype='datetime64[ns]')", + "DatetimeIndex([1970-01-01 00:00:00.000000010, " + "1970-01-01 00:00:00.000000020," + "\n 1970-01-01 00:00:00.000000030, ],\n " + "dtype='datetime64[ns]')", ), ( cudf.Index(np.array([10, 20, 30, None], dtype="datetime64[s]")), - "DatetimeIndex(['1970-01-01 00:00:10', '1970-01-01 00:00:20',\n" - " '1970-01-01 00:00:30', ''],\n" - " dtype='datetime64[s]')", + "DatetimeIndex([1970-01-01 00:00:10, " + "1970-01-01 00:00:20, 1970-01-01 00:00:30,\n" + " ],\n dtype='datetime64[s]')", ), ( cudf.Index(np.array([10, 20, 30, None], dtype="datetime64[us]")), - "DatetimeIndex(['1970-01-01 00:00:00.000010', " - "'1970-01-01 00:00:00.000020',\n" - " '1970-01-01 00:00:00.000030', ''],\n" - " dtype='datetime64[us]')", + "DatetimeIndex([1970-01-01 00:00:00.000010, " + "1970-01-01 00:00:00.000020,\n " + "1970-01-01 
00:00:00.000030, ],\n " + "dtype='datetime64[us]')", ), ( cudf.Index(np.array([10, 20, 30, None], dtype="datetime64[ms]")), - "DatetimeIndex(['1970-01-01 00:00:00.010', " - "'1970-01-01 00:00:00.020',\n " - " '1970-01-01 00:00:00.030', ''],\n" - " dtype='datetime64[ms]')", + "DatetimeIndex([1970-01-01 00:00:00.010, " + "1970-01-01 00:00:00.020,\n " + "1970-01-01 00:00:00.030, ],\n " + "dtype='datetime64[ms]')", ), ( cudf.Index(np.array([None] * 10, dtype="datetime64[ms]")), - "DatetimeIndex(['', '', '', '', '', '', " - "'',\n '', '', ''],\n" - " dtype='datetime64[ms]')", + "DatetimeIndex([, , , , , , , , " + ",\n ],\n dtype='datetime64[ms]')", ), ], ) @@ -817,14 +573,16 @@ def test_series_null_index_repr(sr, pandas_special_case): @pytest.mark.parametrize( "data", [ - [1000000, 2000000, 3000000], - [1000000, 2000000, None], + [1000000, 200000, 3000000], + [1000000, 200000, None], [], [None], [None, None, None, None, None], [12, 12, 22, 343, 4353534, 435342], np.array([10, 20, 30, None, 100]), cp.asarray([10, 20, 30, 100]), + [1000000, 200000, 3000000], + [1000000, 200000, None], [1], [12, 11, 232, 223432411, 2343241, 234324, 23234], [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], @@ -879,9 +637,9 @@ def test_timedelta_series_s_us_repr(data, dtype): cudf.Series([1000000, 200000, 3000000], dtype="timedelta64[ns]"), textwrap.dedent( """ - 0 0 days 00:00:00.001000 - 1 0 days 00:00:00.000200 - 2 0 days 00:00:00.003000 + 0 0 days 00:00:00.001000000 + 1 0 days 00:00:00.000200000 + 2 0 days 00:00:00.003000000 dtype: timedelta64[ns] """ ), @@ -901,9 +659,9 @@ def test_timedelta_series_s_us_repr(data, dtype): cudf.Series([1000000, 200000, None], dtype="timedelta64[ns]"), textwrap.dedent( """ - 0 0 days 00:00:00.001000 - 1 0 days 00:00:00.000200 - 2 + 0 0 days 00:00:00.001000000 + 1 0 days 00:00:00.000200000 + 2 dtype: timedelta64[ns] """ ), @@ -1032,13 +790,13 @@ def test_timedelta_series_s_us_repr(data, dtype): ), textwrap.dedent( """ - 0 -55566 days 21:10:41.277551616 
- 1 1 days 13:25:36.784000000 - 2 2 days 20:09:05.345000000 - 3 2 days 14:03:52.411000000 - 4 11573 days 23:39:03.241000000 - 5 42 days 01:35:48.734000000 - 6 0 days 00:00:23.234000000 + 0 157937 days 02:23:52.432 + 1 1 days 13:25:36.784 + 2 2 days 20:09:05.345 + 3 2 days 14:03:52.411 + 4 11573 days 23:39:03.241 + 5 42 days 01:35:48.734 + 6 0 days 00:00:23.234 dtype: timedelta64[ms] """ ), @@ -1085,13 +843,13 @@ def test_timedelta_series_s_us_repr(data, dtype): ), textwrap.dedent( """ - 0 -55566 days 21:10:41.277551616 - 1 1 days 13:25:36.784000000 - 2 2 days 20:09:05.345000000 - 3 2 days 14:03:52.411000000 - 4 11573 days 23:39:03.241000000 - 5 42 days 01:35:48.734000000 - 6 0 days 00:00:23.234000000 + 0 157937 days 02:23:52.432 + 1 1 days 13:25:36.784 + 2 2 days 20:09:05.345 + 3 2 days 14:03:52.411 + 4 11573 days 23:39:03.241 + 5 42 days 01:35:48.734 + 6 0 days 00:00:23.234 Name: abc, dtype: timedelta64[ms] """ ), @@ -1294,14 +1052,15 @@ def test_timedelta_dataframe_repr(df, expected_repr): [ ( cudf.Index([1000000, 200000, 3000000], dtype="timedelta64[ms]"), - "TimedeltaIndex(['0 days 00:16:40', '0 days 00:03:20', " - "'0 days 00:50:00'], dtype='timedelta64[ms]')", + "TimedeltaIndex(['0 days 00:16:40', " + "'0 days 00:03:20', '0 days 00:50:00'], " + "dtype='timedelta64[ms]')", ), ( cudf.Index( [None, None, None, None, None], dtype="timedelta64[us]" ), - "TimedeltaIndex(['', '', '', '', ''], " + "TimedeltaIndex([, , , , ], " "dtype='timedelta64[us]')", ), ( @@ -1317,12 +1076,11 @@ def test_timedelta_dataframe_repr(df, expected_repr): ], dtype="timedelta64[us]", ), - "TimedeltaIndex(['0 days 00:02:16.457654', '', " - "'0 days 00:04:05.345345',\n" - " '0 days 00:03:43.432411', '', " - "'0 days 01:00:34.548734',\n" - " '0 days 00:00:00.023234'],\n" - " dtype='timedelta64[us]')", + "TimedeltaIndex([0 days 00:02:16.457654, , " + "0 days 00:04:05.345345, " + "0 days 00:03:43.432411, ," + " 0 days 01:00:34.548734, 0 days 00:00:00.023234]," + " dtype='timedelta64[us]')", 
), ( cudf.Index( @@ -1337,11 +1095,10 @@ def test_timedelta_dataframe_repr(df, expected_repr): ], dtype="timedelta64[s]", ), - "TimedeltaIndex(['1579 days 08:54:14', '', " - "'2839 days 15:29:05',\n" - " '2586 days 00:33:31', '', " - "'42066 days 12:52:14',\n '0 days 06:27:14'],\n" - " dtype='timedelta64[s]')", + "TimedeltaIndex([1579 days 08:54:14, , 2839 days 15:29:05," + " 2586 days 00:33:31, , 42066 days 12:52:14, " + "0 days 06:27:14]," + " dtype='timedelta64[s]')", ), ], ) @@ -1737,104 +1494,3 @@ def test_repr_struct_after_concat(): pdf = df.to_pandas() assert repr(df) == repr(pdf) - - -@pytest.mark.parametrize( - "cudf_type,data,expected_repr", - [ - ( - cudf.Series, - { - "data": [ - "1969-12-31 23:59:58.001001", - "1839-12-24 03:58:56.000826", - "1647-05-20 19:25:03.000638", - ], - "dtype": "datetime64[us]", - "index": ["a", "b", "z"], - "name": "hello", - }, - textwrap.dedent( - """ - a 1969-12-31 23:59:58.001001 - b 1839-12-24 03:58:56.000826 - z 1647-05-20 19:25:03.000638 - Name: hello, dtype: datetime64[us] - """ - ), - ), - ( - cudf.Series, - { - "data": ["2499-12-01 01:00:00", "2499-11-01 01:30:00"], - "dtype": "datetime64[s]", - }, - textwrap.dedent( - """ - 0 2499-12-01 01:00:00 - 1 2499-11-01 01:30:00 - dtype: datetime64[s] - """ - ), - ), - ( - cudf.Index, - { - "data": ["2499-12-01 01:00:00", "2499-11-01 01:30:00"], - "dtype": "datetime64[s]", - }, - textwrap.dedent( - """ - DatetimeIndex(['2499-12-01 01:00:00', '2499-11-01 01:30:00'], - dtype='datetime64[s]') - """ - ), - ), - ( - cudf.Series, - { - "data": ["2499-12-01 01:00:00", "2499-11-01 01:30:00"], - "dtype": "datetime64[s]", - "index": ["2499-12-01 01:00:00", "2499-11-01 01:30:00"], - }, - textwrap.dedent( - """ - 2499-12-01 01:00:00 2499-12-01 01:00:00 - 2499-11-01 01:30:00 2499-11-01 01:30:00 - dtype: datetime64[s] - """ - ), - ), - ( - cudf.DataFrame, - { - "data": { - "a": [ - "2499-12-01 01:00:00", - "2499-11-01 01:30:00", - "1647-05-20 19:25:03", - ], - "b": [ - "1969-12-31 
23:59:58", - "1839-12-24 03:58:56", - "1647-05-20 19:25:03", - ], - }, - "dtype": "datetime64[s]", - }, - textwrap.dedent( - """ - a b - 0 2499-12-01 01:00:00 1969-12-31 23:59:58 - 1 2499-11-01 01:30:00 1839-12-24 03:58:56 - 2 1647-05-20 19:25:03 1647-05-20 19:25:03 - """ - ), - ), - ], -) -def test_datetime_overflow_repr(cudf_type, data, expected_repr): - expected = expected_repr - actual = repr(cudf_type(**data)) - - assert expected.split() == actual.split() diff --git a/python/cudf/cudf/utils/docutils.py b/python/cudf/cudf/utils/docutils.py index c373aa0b127..9f04e30fb28 100644 --- a/python/cudf/cudf/utils/docutils.py +++ b/python/cudf/cudf/utils/docutils.py @@ -220,18 +220,18 @@ def wrapper(func): ... np.datetime64("2010-01-01") ... ]) >>> s - 0 2000-01-01 - 1 2010-01-01 - 2 2010-01-01 + 0 2000-01-01 + 1 2010-01-01 + 2 2010-01-01 dtype: datetime64[s] >>> s.describe() - count 3 - mean 2006-09-01 08:00:00 - min 2000-01-01 00:00:00 - 25% 2004-12-31 12:00:00 - 50% 2010-01-01 00:00:00 - 75% 2010-01-01 00:00:00 - max 2010-01-01 00:00:00 + count 3 + mean 2006-09-01 08:00:00 + min 2000-01-01 00:00:00 + 25% 2004-12-31 12:00:00 + 50% 2010-01-01 00:00:00 + 75% 2010-01-01 00:00:00 + max 2010-01-01 00:00:00 dtype: object Describing a ``DataFrame``. By default only numeric fields are From 1bf8893c2a9e55fead4bcc71084defe5f713e632 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 12 Oct 2022 11:56:55 -0500 Subject: [PATCH 27/51] Update cpp/include/cudf/datetime.hpp --- cpp/include/cudf/datetime.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cudf/datetime.hpp b/cpp/include/cudf/datetime.hpp index b7aefd382fa..0d9b7d4d501 100644 --- a/cpp/include/cudf/datetime.hpp +++ b/cpp/include/cudf/datetime.hpp @@ -138,7 +138,7 @@ std::unique_ptr extract_second( * cudf::column. * * A millisecond fraction is only the 3 digits that make up the millisecond portion of a duration. - * For example, the microsecond fraction of 1.234567890 seconds is 234. 
+ * For example, the millisecond fraction of 1.234567890 seconds is 234. * * @param column cudf::column_view of the input datetime values * @param mr Device memory resource used to allocate device memory of the returned column From e3a181578c7649c53c580cfd8dd20fc53cf274ba Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 12 Oct 2022 12:49:44 -0500 Subject: [PATCH 28/51] Apply suggestions from code review Co-authored-by: Bradley Dice --- python/cudf/cudf/_lib/datetime.pyx | 6 +++--- python/cudf/cudf/core/index.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/_lib/datetime.pyx b/python/cudf/cudf/_lib/datetime.pyx index d37c9c24875..cb0a245b915 100644 --- a/python/cudf/cudf/_lib/datetime.pyx +++ b/python/cudf/cudf/_lib/datetime.pyx @@ -49,15 +49,15 @@ def extract_datetime_component(Column col, object field): c_result = move(libcudf_datetime.extract_minute(col_view)) elif field == "second": c_result = move(libcudf_datetime.extract_second(col_view)) - elif field == "milli_second": + elif field == "millisecond": c_result = move( libcudf_datetime.extract_millisecond_fraction(col_view) ) - elif field == "micro_second": + elif field == "microsecond": c_result = move( libcudf_datetime.extract_microsecond_fraction(col_view) ) - elif field == "nano_second": + elif field == "nanosecond": c_result = move( libcudf_datetime.extract_nanosecond_fraction(col_view) ) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index b838baeddcc..8b168554851 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -2070,7 +2070,7 @@ def millisecond(self): >>> datetime_index.millisecond Int16Index([0, 1, 2], dtype='int16') """ # noqa: E501 - return self._get_dt_field("milli_second") + return self._get_dt_field("millisecond") @property # type: ignore @_cudf_nvtx_annotate @@ -2091,7 +2091,7 @@ def microsecond(self): >>> datetime_index.microsecond Int16Index([0, 1, 2], dtype='int16') """ # noqa: 
E501 - return self._get_dt_field("micro_second") + return self._get_dt_field("microsecond") @property # type: ignore @_cudf_nvtx_annotate @@ -2113,7 +2113,7 @@ def nanosecond(self): >>> datetime_index.nanosecond Int16Index([0, 1, 2], dtype='int16') """ - return self._get_dt_field("nano_second") + return self._get_dt_field("nanosecond") @property # type: ignore @_cudf_nvtx_annotate From 1d618464552679b4ca6da8aae9892ccf7c61a842 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 12 Oct 2022 12:50:14 -0500 Subject: [PATCH 29/51] Update python/cudf/cudf/core/series.py Co-authored-by: Bradley Dice --- python/cudf/cudf/core/series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 35591091ef9..d9c9f09c187 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -3615,7 +3615,7 @@ def millisecond(self): 2 2 dtype: int16 """ - return self._get_dt_field("milli_second") + return self._get_dt_field("millisecond") @property # type: ignore @_cudf_nvtx_annotate From b79912b143a67380215b20423c6518a7b3a4848c Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 12 Oct 2022 12:50:22 -0500 Subject: [PATCH 30/51] Update python/cudf/cudf/core/series.py Co-authored-by: Bradley Dice --- python/cudf/cudf/core/series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index d9c9f09c187..3025c777a64 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -3640,7 +3640,7 @@ def microsecond(self): 2 2 dtype: int16 """ - return self._get_dt_field("micro_second") + return self._get_dt_field("microsecond") @property # type: ignore @_cudf_nvtx_annotate From 38b1bd12eb15be42d4cb65c9a6d605fdc4313522 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 12 Oct 2022 12:50:27 -0500 Subject: [PATCH 31/51] Update python/cudf/cudf/core/series.py Co-authored-by: 
Bradley Dice --- python/cudf/cudf/core/series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 3025c777a64..736880fa23e 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -3665,7 +3665,7 @@ def nanosecond(self): 2 2 dtype: int16 """ - return self._get_dt_field("nano_second") + return self._get_dt_field("nanosecond") @property # type: ignore @_cudf_nvtx_annotate From 88d118a2a68888805e86ecea873f57913dfd0018 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 12 Oct 2022 10:51:36 -0700 Subject: [PATCH 32/51] fix inconsistencies --- cpp/include/cudf/datetime.hpp | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/cpp/include/cudf/datetime.hpp b/cpp/include/cudf/datetime.hpp index 0d9b7d4d501..fb04336871f 100644 --- a/cpp/include/cudf/datetime.hpp +++ b/cpp/include/cudf/datetime.hpp @@ -36,7 +36,7 @@ namespace datetime { */ /** - * @brief Extracts year from any date time type and returns an int16_t + * @brief Extracts year from any datetime type and returns an int16_t * cudf::column. * * @param column cudf::column_view of the input datetime values @@ -50,7 +50,7 @@ std::unique_ptr extract_year( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Extracts month from any date time type and returns an int16_t + * @brief Extracts month from any datetime type and returns an int16_t * cudf::column. * * @param column cudf::column_view of the input datetime values @@ -64,7 +64,7 @@ std::unique_ptr extract_month( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Extracts day from any date time type and returns an int16_t + * @brief Extracts day from any datetime type and returns an int16_t * cudf::column. 
* * @param column cudf::column_view of the input datetime values @@ -78,7 +78,7 @@ std::unique_ptr extract_day( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Extracts day from any date time type and returns an int16_t + * @brief Extracts day from any datetime type and returns an int16_t * cudf::column. * * @param column cudf::column_view of the input datetime values @@ -92,7 +92,7 @@ std::unique_ptr extract_weekday( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Extracts hour from any date time type and returns an int16_t + * @brief Extracts hour from any datetime type and returns an int16_t * cudf::column. * * @param column cudf::column_view of the input datetime values @@ -106,7 +106,7 @@ std::unique_ptr extract_hour( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Extracts minute from any date time type and returns an int16_t + * @brief Extracts minute from any datetime type and returns an int16_t * cudf::column. * * @param column cudf::column_view of the input datetime values @@ -120,7 +120,7 @@ std::unique_ptr extract_minute( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Extracts second from any date time type and returns an int16_t + * @brief Extracts second from any datetime type and returns an int16_t * cudf::column. * * @param column cudf::column_view of the input datetime values @@ -134,7 +134,7 @@ std::unique_ptr extract_second( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Extracts millisecond fraction from any date time type and returns an int16_t + * @brief Extracts millisecond fraction from any datetime type and returns an int16_t * cudf::column. * * A millisecond fraction is only the 3 digits that make up the millisecond portion of a duration. 
@@ -151,7 +151,7 @@ std::unique_ptr extract_millisecond_fraction( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Extracts microsecond fraction from any date time type and returns an int16_t + * @brief Extracts microsecond fraction from any datetime type and returns an int16_t * cudf::column. * * A microsecond fraction is only the 3 digits that make up the microsecond portion of a duration. @@ -168,7 +168,7 @@ std::unique_ptr extract_microsecond_fraction( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Extracts nanosecond fraction from any date time type and returns an int16_t + * @brief Extracts nanosecond fraction from any datetime type and returns an int16_t * cudf::column. * * A nanosecond fraction is only the 3 digits that make up the nanosecond portion of a duration. @@ -192,7 +192,7 @@ std::unique_ptr extract_nanosecond_fraction( */ /** - * @brief Computes the last day of the month in date time type and returns a TIMESTAMP_DAYS + * @brief Computes the last day of the month in datetime type and returns a TIMESTAMP_DAYS * cudf::column. * * @param column cudf::column_view of the input datetime values @@ -220,7 +220,7 @@ std::unique_ptr day_of_year( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Adds or subtracts a number of months from the date time type and returns a + * @brief Adds or subtracts a number of months from the datetime type and returns a * timestamp column that is of the same type as the input `timestamps` column. 
* * For a given row, if the `timestamps` or the `months` column value is null, @@ -255,7 +255,7 @@ std::unique_ptr add_calendrical_months( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Adds or subtracts a number of months from the date time type and returns a + * @brief Adds or subtracts a number of months from the datetime type and returns a * timestamp column that is of the same type as the input `timestamps` column. * * For a given row, if the `timestamps` value is null, the output for that row is null. From 16e7b8f12c3d31c37501cc64b73a95491ba1da3c Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 12 Oct 2022 11:13:11 -0700 Subject: [PATCH 33/51] reduce inefficiency --- cpp/src/datetime/datetime_ops.cu | 37 +++++++++++++++++++------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/cpp/src/datetime/datetime_ops.cu b/cpp/src/datetime/datetime_ops.cu index 7f3b786693f..92ffba6abb9 100644 --- a/cpp/src/datetime/datetime_ops.cu +++ b/cpp/src/datetime/datetime_ops.cu @@ -76,15 +76,6 @@ struct extract_component_operator { if (time_since_midnight.count() < 0) { time_since_midnight += days(1); } - auto hrs_ = duration_cast(time_since_midnight); - auto mins_ = duration_cast(time_since_midnight - hrs_); - auto secs_ = duration_cast(time_since_midnight - hrs_ - mins_); - auto millisecs_ = duration_cast(time_since_midnight - hrs_ - mins_ - secs_); - auto microsecs_ = - duration_cast(time_since_midnight - hrs_ - mins_ - secs_ - millisecs_); - auto nanosecs_ = duration_cast(time_since_midnight - hrs_ - mins_ - secs_ - - millisecs_ - microsecs_); - switch (Component) { case datetime_component::YEAR: return static_cast(year_month_day(days_since_epoch).year()); @@ -94,12 +85,28 @@ struct extract_component_operator { return static_cast(year_month_day(days_since_epoch).day()); case datetime_component::WEEKDAY: return year_month_weekday(days_since_epoch).weekday().iso_encoding(); - case 
datetime_component::HOUR: return hrs_.count(); - case datetime_component::MINUTE: return mins_.count(); - case datetime_component::SECOND: return secs_.count(); - case datetime_component::MILLISECOND: return millisecs_.count(); - case datetime_component::MICROSECOND: return microsecs_.count(); - case datetime_component::NANOSECOND: return nanosecs_.count(); + case datetime_component::HOUR: + case datetime_component::MINUTE: + case datetime_component::SECOND: + case datetime_component::MILLISECOND: + case datetime_component::MICROSECOND: + case datetime_component::NANOSECOND: { + auto const hrs_ = duration_cast(time_since_midnight); + if (Component == datetime_component::HOUR) { return hrs_.count(); } + auto const mins_ = duration_cast(time_since_midnight - hrs_); + if (Component == datetime_component::MINUTE) { return mins_.count(); } + auto const secs_ = duration_cast(time_since_midnight - hrs_ - mins_); + if (Component == datetime_component::SECOND) { return secs_.count(); } + auto const millisecs_ = + duration_cast(time_since_midnight - hrs_ - mins_ - secs_); + if (Component == datetime_component::MILLISECOND) { return millisecs_.count(); } + auto const microsecs_ = + duration_cast(time_since_midnight - hrs_ - mins_ - secs_ - millisecs_); + if (Component == datetime_component::MICROSECOND) { return microsecs_.count(); } + auto const nanosecs_ = duration_cast(time_since_midnight - hrs_ - mins_ - + secs_ - millisecs_ - microsecs_); + if (Component == datetime_component::NANOSECOND) { return nanosecs_.count(); } + } default: return 0; } } From e78b8065fa69cd8fe0fc1cee45c686af5661b7e3 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 12 Oct 2022 11:16:59 -0700 Subject: [PATCH 34/51] add comment --- python/cudf/cudf/tests/test_datetime.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 3875467477f..f09891af0cd 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ 
b/python/cudf/cudf/tests/test_datetime.py @@ -82,6 +82,9 @@ def numerical_data(): "minute", "second", "microsecond", + # Pandas supports 'second', 'microsecond' & 'nanosecond' + # but weirdly left out 'millisecond', hence can't have 'millisecond' + # in this list. "nanosecond", "weekday", "dayofweek", From cc94b094733c5ae2d6578c5d626b03f4b2b4d750 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 12 Oct 2022 11:19:53 -0700 Subject: [PATCH 35/51] Revert "reduce inefficiency" This reverts commit 16e7b8f12c3d31c37501cc64b73a95491ba1da3c. --- cpp/src/datetime/datetime_ops.cu | 37 +++++++++++++------------------- 1 file changed, 15 insertions(+), 22 deletions(-) diff --git a/cpp/src/datetime/datetime_ops.cu b/cpp/src/datetime/datetime_ops.cu index 92ffba6abb9..7f3b786693f 100644 --- a/cpp/src/datetime/datetime_ops.cu +++ b/cpp/src/datetime/datetime_ops.cu @@ -76,6 +76,15 @@ struct extract_component_operator { if (time_since_midnight.count() < 0) { time_since_midnight += days(1); } + auto hrs_ = duration_cast(time_since_midnight); + auto mins_ = duration_cast(time_since_midnight - hrs_); + auto secs_ = duration_cast(time_since_midnight - hrs_ - mins_); + auto millisecs_ = duration_cast(time_since_midnight - hrs_ - mins_ - secs_); + auto microsecs_ = + duration_cast(time_since_midnight - hrs_ - mins_ - secs_ - millisecs_); + auto nanosecs_ = duration_cast(time_since_midnight - hrs_ - mins_ - secs_ - + millisecs_ - microsecs_); + switch (Component) { case datetime_component::YEAR: return static_cast(year_month_day(days_since_epoch).year()); @@ -85,28 +94,12 @@ struct extract_component_operator { return static_cast(year_month_day(days_since_epoch).day()); case datetime_component::WEEKDAY: return year_month_weekday(days_since_epoch).weekday().iso_encoding(); - case datetime_component::HOUR: - case datetime_component::MINUTE: - case datetime_component::SECOND: - case datetime_component::MILLISECOND: - case datetime_component::MICROSECOND: - case 
datetime_component::NANOSECOND: { - auto const hrs_ = duration_cast(time_since_midnight); - if (Component == datetime_component::HOUR) { return hrs_.count(); } - auto const mins_ = duration_cast(time_since_midnight - hrs_); - if (Component == datetime_component::MINUTE) { return mins_.count(); } - auto const secs_ = duration_cast(time_since_midnight - hrs_ - mins_); - if (Component == datetime_component::SECOND) { return secs_.count(); } - auto const millisecs_ = - duration_cast(time_since_midnight - hrs_ - mins_ - secs_); - if (Component == datetime_component::MILLISECOND) { return millisecs_.count(); } - auto const microsecs_ = - duration_cast(time_since_midnight - hrs_ - mins_ - secs_ - millisecs_); - if (Component == datetime_component::MICROSECOND) { return microsecs_.count(); } - auto const nanosecs_ = duration_cast(time_since_midnight - hrs_ - mins_ - - secs_ - millisecs_ - microsecs_); - if (Component == datetime_component::NANOSECOND) { return nanosecs_.count(); } - } + case datetime_component::HOUR: return hrs_.count(); + case datetime_component::MINUTE: return mins_.count(); + case datetime_component::SECOND: return secs_.count(); + case datetime_component::MILLISECOND: return millisecs_.count(); + case datetime_component::MICROSECOND: return microsecs_.count(); + case datetime_component::NANOSECOND: return nanosecs_.count(); default: return 0; } } From 1b8af1c811d19278f36257a3e3f5d97d4889934e Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 12 Oct 2022 11:21:52 -0700 Subject: [PATCH 36/51] use const --- cpp/src/datetime/datetime_ops.cu | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/cpp/src/datetime/datetime_ops.cu b/cpp/src/datetime/datetime_ops.cu index 7f3b786693f..522901b1a95 100644 --- a/cpp/src/datetime/datetime_ops.cu +++ b/cpp/src/datetime/datetime_ops.cu @@ -76,14 +76,14 @@ struct extract_component_operator { if (time_since_midnight.count() < 0) { time_since_midnight += days(1); } - auto hrs_ = 
duration_cast(time_since_midnight); - auto mins_ = duration_cast(time_since_midnight - hrs_); - auto secs_ = duration_cast(time_since_midnight - hrs_ - mins_); - auto millisecs_ = duration_cast(time_since_midnight - hrs_ - mins_ - secs_); - auto microsecs_ = + auto const hrs_ = duration_cast(time_since_midnight); + auto const mins_ = duration_cast(time_since_midnight - hrs_); + auto const secs_ = duration_cast(time_since_midnight - hrs_ - mins_); + auto const millisecs_ = duration_cast(time_since_midnight - hrs_ - mins_ - secs_); + auto const microsecs_ = duration_cast(time_since_midnight - hrs_ - mins_ - secs_ - millisecs_); - auto nanosecs_ = duration_cast(time_since_midnight - hrs_ - mins_ - secs_ - - millisecs_ - microsecs_); + auto const nanosecs_ = duration_cast(time_since_midnight - hrs_ - mins_ - secs_ - + millisecs_ - microsecs_); switch (Component) { case datetime_component::YEAR: From 9c8dbfb16951a27df9a0d8e666108c429ebc4be1 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 12 Oct 2022 11:33:24 -0700 Subject: [PATCH 37/51] use lambdas --- cpp/src/datetime/datetime_ops.cu | 44 ++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 14 deletions(-) diff --git a/cpp/src/datetime/datetime_ops.cu b/cpp/src/datetime/datetime_ops.cu index 522901b1a95..465595f9e2a 100644 --- a/cpp/src/datetime/datetime_ops.cu +++ b/cpp/src/datetime/datetime_ops.cu @@ -76,14 +76,30 @@ struct extract_component_operator { if (time_since_midnight.count() < 0) { time_since_midnight += days(1); } - auto const hrs_ = duration_cast(time_since_midnight); - auto const mins_ = duration_cast(time_since_midnight - hrs_); - auto const secs_ = duration_cast(time_since_midnight - hrs_ - mins_); - auto const millisecs_ = duration_cast(time_since_midnight - hrs_ - mins_ - secs_); - auto const microsecs_ = - duration_cast(time_since_midnight - hrs_ - mins_ - secs_ - millisecs_); - auto const nanosecs_ = duration_cast(time_since_midnight - hrs_ - mins_ - secs_ - - millisecs_ - 
microsecs_); + auto const hrs_ = [&] { return duration_cast(time_since_midnight); }; + auto const mins_ = [&] { return duration_cast(time_since_midnight) - hrs_(); }; + auto const secs_ = [&] { + return duration_cast(time_since_midnight) - hrs_(); + -mins_(); + }; + auto const millisecs_ = [&] { + return duration_cast(time_since_midnight) - hrs_(); + -mins_(); + -secs_(); + }; + auto const microsecs_ = [&] { + return duration_cast(time_since_midnight) - hrs_(); + -mins_(); + -secs_(); + -millisecs_(); + }; + auto const nanosecs_ = [&] { + return duration_cast(time_since_midnight) - hrs_(); + -mins_(); + -secs_(); + -millisecs_(); + -microsecs_(); + }; switch (Component) { case datetime_component::YEAR: @@ -94,12 +110,12 @@ struct extract_component_operator { return static_cast(year_month_day(days_since_epoch).day()); case datetime_component::WEEKDAY: return year_month_weekday(days_since_epoch).weekday().iso_encoding(); - case datetime_component::HOUR: return hrs_.count(); - case datetime_component::MINUTE: return mins_.count(); - case datetime_component::SECOND: return secs_.count(); - case datetime_component::MILLISECOND: return millisecs_.count(); - case datetime_component::MICROSECOND: return microsecs_.count(); - case datetime_component::NANOSECOND: return nanosecs_.count(); + case datetime_component::HOUR: return hrs_().count(); + case datetime_component::MINUTE: return mins_().count(); + case datetime_component::SECOND: return secs_().count(); + case datetime_component::MILLISECOND: return millisecs_().count(); + case datetime_component::MICROSECOND: return microsecs_().count(); + case datetime_component::NANOSECOND: return nanosecs_().count(); default: return 0; } } From 7c7cdfc40df848fff447def7d1ce1a763263fd9b Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 12 Oct 2022 11:40:38 -0700 Subject: [PATCH 38/51] remove ;s --- cpp/src/datetime/datetime_ops.cu | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git 
a/cpp/src/datetime/datetime_ops.cu b/cpp/src/datetime/datetime_ops.cu index 465595f9e2a..be877cbafeb 100644 --- a/cpp/src/datetime/datetime_ops.cu +++ b/cpp/src/datetime/datetime_ops.cu @@ -79,26 +79,18 @@ struct extract_component_operator { auto const hrs_ = [&] { return duration_cast(time_since_midnight); }; auto const mins_ = [&] { return duration_cast(time_since_midnight) - hrs_(); }; auto const secs_ = [&] { - return duration_cast(time_since_midnight) - hrs_(); - -mins_(); + return duration_cast(time_since_midnight) - hrs_() - mins_() }; auto const millisecs_ = [&] { - return duration_cast(time_since_midnight) - hrs_(); - -mins_(); - -secs_(); + return duration_cast(time_since_midnight) - hrs_() - mins_() - secs_() }; auto const microsecs_ = [&] { - return duration_cast(time_since_midnight) - hrs_(); - -mins_(); - -secs_(); - -millisecs_(); + return duration_cast(time_since_midnight) - hrs_() - mins_() - secs_() - + millisecs_() }; auto const nanosecs_ = [&] { - return duration_cast(time_since_midnight) - hrs_(); - -mins_(); - -secs_(); - -millisecs_(); - -microsecs_(); + return duration_cast(time_since_midnight) - hrs_() - mins_() - secs_() - + millisecs_() - microsecs_() }; switch (Component) { From 2c9de715a387692f319266f6c20ed8f9525a1fc5 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 12 Oct 2022 11:59:03 -0700 Subject: [PATCH 39/51] remove ;s --- cpp/src/datetime/datetime_ops.cu | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/datetime/datetime_ops.cu b/cpp/src/datetime/datetime_ops.cu index be877cbafeb..e89792525c9 100644 --- a/cpp/src/datetime/datetime_ops.cu +++ b/cpp/src/datetime/datetime_ops.cu @@ -79,18 +79,18 @@ struct extract_component_operator { auto const hrs_ = [&] { return duration_cast(time_since_midnight); }; auto const mins_ = [&] { return duration_cast(time_since_midnight) - hrs_(); }; auto const secs_ = [&] { - return duration_cast(time_since_midnight) - hrs_() - mins_() + return 
duration_cast(time_since_midnight) - hrs_() - mins_(); }; auto const millisecs_ = [&] { - return duration_cast(time_since_midnight) - hrs_() - mins_() - secs_() + return duration_cast(time_since_midnight) - hrs_() - mins_() - secs_(); }; auto const microsecs_ = [&] { return duration_cast(time_since_midnight) - hrs_() - mins_() - secs_() - - millisecs_() + millisecs_(); }; auto const nanosecs_ = [&] { return duration_cast(time_since_midnight) - hrs_() - mins_() - secs_() - - millisecs_() - microsecs_() + millisecs_() - microsecs_(); }; switch (Component) { From 90f1fee325e94edccbce38afd9ba4defd6c8376b Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 13 Oct 2022 13:50:54 -0500 Subject: [PATCH 40/51] Update python/cudf/cudf/core/series.py Co-authored-by: Bradley Dice --- python/cudf/cudf/core/series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 736880fa23e..e45805abc24 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -3596,7 +3596,7 @@ def second(self): @_cudf_nvtx_annotate def millisecond(self): """ - The millisecond of the datetime. + The milliseconds of the datetime. Examples -------- From 206cdc01acbd385853f4d2fed2e5ad0926ef492d Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 13 Oct 2022 13:51:00 -0500 Subject: [PATCH 41/51] Update python/cudf/cudf/core/series.py Co-authored-by: Bradley Dice --- python/cudf/cudf/core/series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index e45805abc24..5742fd1593e 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -3621,7 +3621,7 @@ def millisecond(self): @_cudf_nvtx_annotate def microsecond(self): """ - The microsecond of the datetime. + The microseconds of the datetime. 
Examples -------- From 82c51cfe2cf425faf9791cf8c16e21b8e7e0f04d Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 13 Oct 2022 13:51:16 -0500 Subject: [PATCH 42/51] Update python/cudf/cudf/core/series.py Co-authored-by: Bradley Dice --- python/cudf/cudf/core/series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 5742fd1593e..92d328449b9 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -3646,7 +3646,7 @@ def microsecond(self): @_cudf_nvtx_annotate def nanosecond(self): """ - The nanosecond of the datetime. + The nanoseconds of the datetime. Examples -------- From 5a0879c169e2489ab31de3b5733a32e435ef784c Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 13 Oct 2022 12:38:43 -0700 Subject: [PATCH 43/51] remove duplication in test data and expand the test coverage --- python/cudf/cudf/tests/test_datetime.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index f09891af0cd..ffa60a503a2 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -28,7 +28,9 @@ def data1(): def data2(): - return pd.date_range("20010101", "20020215", freq="400h", name="times") + return pd.date_range( + "20010101", freq="243434324423423234N", name="times", periods=10 + ) def timeseries_us_data(): @@ -81,7 +83,12 @@ def numerical_data(): "hour", "minute", "second", - "microsecond", + pytest.param( + "microsecond", + marks=pytest.mark.xfail( + reason="https://github.com/pandas-dev/pandas/issues/49073" + ), + ), # Pandas supports 'second', 'microsecond' & 'nanosecond' # but weirdly left out 'millisecond', hence can't have 'millisecond' # in this list. 
@@ -177,7 +184,7 @@ def test_dt_ops(data): # libcudf doesn't respect timezones -@pytest.mark.parametrize("data", [data1()]) +@pytest.mark.parametrize("data", [data1(), data2()]) @pytest.mark.parametrize("field", fields) def test_dt_series(data, field): pd_data = pd.Series(data.copy()) @@ -187,7 +194,7 @@ def test_dt_series(data, field): assert_eq(base, test) -@pytest.mark.parametrize("data", [data1()]) +@pytest.mark.parametrize("data", [data1(), data2()]) @pytest.mark.parametrize("field", fields) def test_dt_index(data, field): pd_data = data.copy() From 62cfcc81f74d20062ebb4d7c835f8692f2939ffc Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 13 Oct 2022 12:56:13 -0700 Subject: [PATCH 44/51] reorder --- docs/cudf/source/api_docs/series.rst | 30 ++++++++++++++-------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/docs/cudf/source/api_docs/series.rst b/docs/cudf/source/api_docs/series.rst index 345646bab13..6286fa317ea 100644 --- a/docs/cudf/source/api_docs/series.rst +++ b/docs/cudf/source/api_docs/series.rst @@ -260,28 +260,28 @@ Datetime properties .. autosummary:: :toctree: api/ + year + month day - dayofweek - dayofyear - days_in_month - day_of_year hour + minute + second microsecond millisecond - minute - month nanosecond - second + dayofweek weekday - year - is_leap_year + dayofyear + day_of_year + quarter is_month_start is_month_end is_quarter_start is_quarter_end is_year_start is_year_end - quarter + is_leap_year + days_in_month Datetime methods ^^^^^^^^^^^^^^^^ @@ -289,11 +289,11 @@ Datetime methods .. autosummary:: :toctree: api/ - strftime isocalendar - ceil - floor + strftime round + floor + ceil Timedelta properties @@ -303,11 +303,11 @@ Timedelta properties .. autosummary:: :toctree: api/ - components days + seconds microseconds nanoseconds - seconds + components .. _api.series.str: .. 
include:: string_handling.rst From c0c7aff6f0444ab256b9086530eca84a61654b2a Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 13 Oct 2022 12:58:22 -0700 Subject: [PATCH 45/51] reorder --- docs/cudf/source/api_docs/index_objects.rst | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/cudf/source/api_docs/index_objects.rst b/docs/cudf/source/api_docs/index_objects.rst index e20c9188f16..44a028a0d5d 100644 --- a/docs/cudf/source/api_docs/index_objects.rst +++ b/docs/cudf/source/api_docs/index_objects.rst @@ -265,12 +265,13 @@ Time/date components DatetimeIndex.millisecond DatetimeIndex.microsecond DatetimeIndex.nanosecond - DatetimeIndex.dayofweek - DatetimeIndex.dayofyear DatetimeIndex.day_of_year + DatetimeIndex.dayofyear + DatetimeIndex.dayofweek DatetimeIndex.weekday - DatetimeIndex.is_leap_year DatetimeIndex.quarter + DatetimeIndex.is_leap_year + DatetimeIndex.isocalendar Time-specific operations From c698435bf660684760511ed370efa1e7b3fc1f6d Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Fri, 14 Oct 2022 12:05:13 -0700 Subject: [PATCH 46/51] address reviews --- python/cudf/cudf/core/index.py | 8 ++++++-- python/cudf/cudf/core/series.py | 7 ++++++- python/cudf/cudf/tests/test_datetime.py | 24 +----------------------- 3 files changed, 13 insertions(+), 26 deletions(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 8b168554851..25d41aaf637 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -2089,9 +2089,13 @@ def microsecond(self): '2000-01-01 00:00:00.000002'], dtype='datetime64[ns]') >>> datetime_index.microsecond - Int16Index([0, 1, 2], dtype='int16') + Int32Index([0, 1, 2], dtype='int32') """ # noqa: E501 - return self._get_dt_field("microsecond") + return as_index( + (self._values.get_dt_field("millisecond") * 1000) + + self._values.get_dt_field("microsecond"), + name=self.name, + ) @property # type: ignore @_cudf_nvtx_annotate diff --git 
a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 92d328449b9..8eae41ff64a 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -3640,7 +3640,12 @@ def microsecond(self): 2 2 dtype: int16 """ - return self._get_dt_field("microsecond") + return Series( + data=(self.series._column.get_dt_field("millisecond") * 1000) + + self.series._column.get_dt_field("microsecond"), + index=self.series._index, + name=self.series.name, + ) @property # type: ignore @_cudf_nvtx_annotate diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index ffa60a503a2..bd3b3561701 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -83,15 +83,7 @@ def numerical_data(): "hour", "minute", "second", - pytest.param( - "microsecond", - marks=pytest.mark.xfail( - reason="https://github.com/pandas-dev/pandas/issues/49073" - ), - ), - # Pandas supports 'second', 'microsecond' & 'nanosecond' - # but weirdly left out 'millisecond', hence can't have 'millisecond' - # in this list. + "microsecond", "nanosecond", "weekday", "dayofweek", @@ -2052,17 +2044,3 @@ def test_datetime_constructor(data, dtype): actual = cudf.DatetimeIndex(data=cudf.Series(data), dtype=dtype) assert_eq(expected, actual) - - -# Pandas supports 'second', 'microsecond' & 'nanosecond' -# but weirdly left out 'millisecond', hence can't compare to -# a pandas API. 
-def test_datetime_millisecond_property(): - data = pd.date_range("2000-01-01", periods=3, freq="ms") - - gsr = cudf.Series(data) - - assert_eq(gsr.dt.millisecond, cudf.Series([0, 1, 2], dtype="int16")) - - gi = cudf.Index(data) - assert_eq(gi.millisecond, cudf.Index([0, 1, 2], dtype="int16")) From c3dc34c72ee752cccedbd98aac733f0860019d1e Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Fri, 14 Oct 2022 12:08:37 -0700 Subject: [PATCH 47/51] cleanup millisecond --- python/cudf/cudf/core/index.py | 21 --------------------- python/cudf/cudf/core/series.py | 25 ------------------------- 2 files changed, 46 deletions(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 25d41aaf637..d398a45f682 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -2051,27 +2051,6 @@ def second(self): """ return self._get_dt_field("second") - @property # type: ignore - @_cudf_nvtx_annotate - def millisecond(self): - """ - The milliseconds of the datetime. - - Examples - -------- - >>> import pandas as pd - >>> import cudf - >>> datetime_index = cudf.Index(pd.date_range("2000-01-01", - ... periods=3, freq="ms")) - >>> datetime_index - DatetimeIndex([ '2000-01-01 00:00:00', '2000-01-01 00:00:00.001000', - '2000-01-01 00:00:00.002000'], - dtype='datetime64[ns]') - >>> datetime_index.millisecond - Int16Index([0, 1, 2], dtype='int16') - """ # noqa: E501 - return self._get_dt_field("millisecond") - @property # type: ignore @_cudf_nvtx_annotate def microsecond(self): diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 8eae41ff64a..52b0b27e86c 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -3592,31 +3592,6 @@ def second(self): """ return self._get_dt_field("second") - @property # type: ignore - @_cudf_nvtx_annotate - def millisecond(self): - """ - The milliseconds of the datetime. 
- - Examples - -------- - >>> import pandas as pd - >>> import cudf - >>> datetime_series = cudf.Series(pd.date_range("2000-01-01", - ... periods=3, freq="ms")) - >>> datetime_series - 0 2000-01-01 00:00:00.000 - 1 2000-01-01 00:00:00.001 - 2 2000-01-01 00:00:00.002 - dtype: datetime64[ns] - >>> datetime_series.dt.millisecond - 0 0 - 1 1 - 2 2 - dtype: int16 - """ - return self._get_dt_field("millisecond") - @property # type: ignore @_cudf_nvtx_annotate def microsecond(self): From a67123b34aaa8706b00f479ccf98deae4b000411 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 14 Oct 2022 14:09:32 -0500 Subject: [PATCH 48/51] Update docs/cudf/source/api_docs/series.rst --- docs/cudf/source/api_docs/series.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/cudf/source/api_docs/series.rst b/docs/cudf/source/api_docs/series.rst index 6286fa317ea..842319338b3 100644 --- a/docs/cudf/source/api_docs/series.rst +++ b/docs/cudf/source/api_docs/series.rst @@ -267,7 +267,6 @@ Datetime properties minute second microsecond - millisecond nanosecond dayofweek weekday From 9c8b6638bae8c697b94e400f10493a166dc35242 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 14 Oct 2022 14:09:39 -0500 Subject: [PATCH 49/51] Update docs/cudf/source/api_docs/index_objects.rst --- docs/cudf/source/api_docs/index_objects.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/cudf/source/api_docs/index_objects.rst b/docs/cudf/source/api_docs/index_objects.rst index 44a028a0d5d..6edd15e7176 100644 --- a/docs/cudf/source/api_docs/index_objects.rst +++ b/docs/cudf/source/api_docs/index_objects.rst @@ -262,7 +262,6 @@ Time/date components DatetimeIndex.hour DatetimeIndex.minute DatetimeIndex.second - DatetimeIndex.millisecond DatetimeIndex.microsecond DatetimeIndex.nanosecond DatetimeIndex.day_of_year From cb1042c84b9dd21a3d0d28d78fee31bb8889532f Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 17 Oct 2022 10:12:56 -0700 Subject: [PATCH 50/51] fix docstring --- 
python/cudf/cudf/core/index.py | 5 ++++- python/cudf/cudf/core/series.py | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index d398a45f682..0628497fc29 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -2071,7 +2071,10 @@ def microsecond(self): Int32Index([0, 1, 2], dtype='int32') """ # noqa: E501 return as_index( - (self._values.get_dt_field("millisecond") * 1000) + ( + self._values.get_dt_field("millisecond") + * cudf.Scalar(1000, dtype="int32") + ) + self._values.get_dt_field("microsecond"), name=self.name, ) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 52b0b27e86c..5af26179450 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -3616,7 +3616,10 @@ def microsecond(self): dtype: int16 """ return Series( - data=(self.series._column.get_dt_field("millisecond") * 1000) + data=( + self.series._column.get_dt_field("millisecond") + * cudf.Scalar(1000, dtype="int32") + ) + self.series._column.get_dt_field("microsecond"), index=self.series._index, name=self.series.name, From a320c457963004a7e259caa7da202e67472e2e2d Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 17 Oct 2022 10:13:36 -0700 Subject: [PATCH 51/51] fix docstring --- python/cudf/cudf/core/series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 5af26179450..7493202a3d1 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -3613,7 +3613,7 @@ def microsecond(self): 0 0 1 1 2 2 - dtype: int16 + dtype: int32 """ return Series( data=(