Skip to content

Commit

Permalink
Add docstrings and test for strings.convert_durations APIs for pylibc…
Browse files Browse the repository at this point in the history
…udf (#16982)

Contributes to #15162

Since the implementation already existed:

* Added docstrings
* Like #16971, made the `format` parameter accept `str` instead
* Aligned parameter names closer to pylibcudf
* Added missing `move`s
* Moved `convert_duration` tests to `test_string_convert_duration.py` and added a new test for `from_durations`

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #16982
  • Loading branch information
mroeschke authored Oct 4, 2024
1 parent 119aa9d commit 77f3a5d
Show file tree
Hide file tree
Showing 6 changed files with 130 additions and 62 deletions.
7 changes: 2 additions & 5 deletions python/cudf/cudf/_lib/string_casting.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -525,12 +525,11 @@ def timedelta2int(Column input_col, dtype, format):
"""
dtype = dtype_to_pylibcudf_type(dtype)
cdef string c_timestamp_format = format.encode('UTF-8')
return Column.from_pylibcudf(
plc.strings.convert.convert_durations.to_durations(
input_col.to_pylibcudf(mode="read"),
dtype,
c_timestamp_format
format
)
)

Expand All @@ -549,12 +548,10 @@ def int2timedelta(Column input_col, str format):
A Column with Timedelta represented in string format
"""

cdef string c_duration_format = format.encode('UTF-8')
return Column.from_pylibcudf(
plc.strings.convert.convert_durations.from_durations(
input_col.to_pylibcudf(mode="read"),
c_duration_format
format
)
)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ from pylibcudf.libcudf.types cimport data_type
cdef extern from "cudf/strings/convert/convert_durations.hpp" namespace \
"cudf::strings" nogil:
cdef unique_ptr[column] to_durations(
const column_view & strings_col,
const column_view & input,
data_type duration_type,
const string & format) except +

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,10 @@ from pylibcudf.types cimport DataType
cpdef Column to_durations(
Column input,
DataType duration_type,
const string& format
str format
)

cpdef Column from_durations(
Column input,
const string& format
Column durations,
str format=*
)
73 changes: 63 additions & 10 deletions python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -15,27 +15,80 @@ from pylibcudf.types import DataType
cpdef Column to_durations(
Column input,
DataType duration_type,
const string& format
str format
):
"""
Returns a new duration column converting a strings column into
durations using the provided format pattern.
For details, see :cpp:func:`cudf::strings::to_durations`
Parameters
----------
input : Column
Strings instance for this operation.
duration_type : DataType
The duration type used for creating the output column.
format : str
String specifying the duration format in strings.
Returns
-------
Column
New duration column.
"""
cdef unique_ptr[column] c_result
cdef string c_format = format.encode()

with nogil:
c_result = cpp_convert_durations.to_durations(
input.view(),
duration_type.c_obj,
format
c_result = move(
cpp_convert_durations.to_durations(
input.view(),
duration_type.c_obj,
c_format
)
)

return Column.from_libcudf(move(c_result))

cpdef Column from_durations(
Column input,
const string& format
Column durations,
str format=None
):
"""
Returns a new strings column converting a duration column into
strings using the provided format pattern.
For details, see :cpp:func:`cudf::strings::from_durations`
Parameters
----------
durations : Column
Duration values to convert.
format : str
The string specifying output format.
Default format is "%D days %H:%M:%S".
Returns
-------
Column
New strings column with formatted durations.
"""
cdef unique_ptr[column] c_result

if format is None:
format = "%D days %H:%M:%S"
cdef string c_format = format.encode()

with nogil:
c_result = cpp_convert_durations.from_durations(
input.view(),
format
c_result = move(
cpp_convert_durations.from_durations(
durations.view(),
c_format
)
)

return Column.from_libcudf(move(c_result))
43 changes: 0 additions & 43 deletions python/pylibcudf/pylibcudf/tests/test_string_convert.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from datetime import datetime

import pyarrow as pa
import pylibcudf as plc
import pytest
Expand All @@ -21,39 +19,16 @@ def timestamp_type(request):
return request.param


@pytest.fixture(
scope="module",
params=[
pa.duration("ns"),
pa.duration("us"),
pa.duration("ms"),
pa.duration("s"),
],
)
def duration_type(request):
return request.param


@pytest.fixture(scope="module")
def pa_timestamp_col():
return pa.array(["2011-01-01", "2011-01-02", "2011-01-03"])


@pytest.fixture(scope="module")
def pa_duration_col():
return pa.array(["05:20:25"])


@pytest.fixture(scope="module")
def plc_timestamp_col(pa_timestamp_col):
return plc.interop.from_arrow(pa_timestamp_col)


@pytest.fixture(scope="module")
def plc_duration_col(pa_duration_col):
return plc.interop.from_arrow(pa_duration_col)


@pytest.mark.parametrize("format", ["%Y-%m-%d"])
def test_to_datetime(
pa_timestamp_col, plc_timestamp_col, timestamp_type, format
Expand All @@ -65,21 +40,3 @@ def test_to_datetime(
format,
)
assert_column_eq(expect, got)


@pytest.mark.parametrize("format", ["%H:%M:%S"])
def test_to_duration(pa_duration_col, plc_duration_col, duration_type, format):
def to_timedelta(duration_str):
date = datetime.strptime(duration_str, format)
return date - datetime(1900, 1, 1) # "%H:%M:%S" zero date

expect = pa.array([to_timedelta(d.as_py()) for d in pa_duration_col]).cast(
duration_type
)

got = plc.strings.convert.convert_durations.to_durations(
plc_duration_col,
plc.interop.from_arrow(duration_type),
format.encode(),
)
assert_column_eq(expect, got)
61 changes: 61 additions & 0 deletions python/pylibcudf/pylibcudf/tests/test_string_convert_durations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from datetime import datetime, timedelta

import pyarrow as pa
import pylibcudf as plc
import pytest
from utils import assert_column_eq


@pytest.fixture(
    params=[pa.duration(unit) for unit in ("ns", "us", "ms", "s")],
)
def duration_type(request):
    """Provide one Arrow duration type per parametrized run.

    The fixture is parametrized over the ``ns``, ``us``, ``ms`` and ``s``
    resolutions, so every test requesting it runs once per resolution.
    """
    return request.param


@pytest.fixture(scope="module")
def pa_duration_col():
    """Return an Arrow array holding a single ``HH:MM:SS`` string."""
    duration_strings = ["05:20:25"]
    return pa.array(duration_strings)


@pytest.fixture(scope="module")
def plc_duration_col(pa_duration_col):
    """Return ``pa_duration_col`` converted through ``plc.interop``."""
    converted = plc.interop.from_arrow(pa_duration_col)
    return converted


def test_to_duration(pa_duration_col, plc_duration_col, duration_type):
    """Check ``to_durations`` against durations parsed with ``strptime``.

    The expected values are built on the host by parsing each string with
    ``datetime.strptime`` and casting the resulting timedeltas to the
    parametrized Arrow duration type.
    """
    fmt = "%H:%M:%S"
    # A time-only format parses onto 1900-01-01, so subtracting that date
    # leaves just the elapsed time ("%H:%M:%S" zero date).
    zero_date = datetime(1900, 1, 1)

    parsed = [
        datetime.strptime(value.as_py(), fmt) - zero_date
        for value in pa_duration_col
    ]
    expect = pa.array(parsed).cast(duration_type)

    plc_type = plc.interop.from_arrow(duration_type)
    got = plc.strings.convert.convert_durations.to_durations(
        plc_duration_col,
        plc_type,
        fmt,
    )
    assert_column_eq(expect, got)


@pytest.mark.parametrize("format", [None, "%D days %H:%M:%S"])
def test_from_durations(format):
    """Check ``from_durations`` with ``None`` and with an explicit format.

    Both parametrized cases are compared against the same expected strings,
    and the null input row is expected to remain null in the output.
    """
    one_of_each = timedelta(days=1, hours=1, minutes=1, seconds=1)
    durations = pa.array([one_of_each, None])
    plc_durations = plc.interop.from_arrow(durations)

    result = plc.strings.convert.convert_durations.from_durations(
        plc_durations, format
    )

    expected = pa.array(["1 days 01:01:01", None])
    assert_column_eq(result, expected)

0 comments on commit 77f3a5d

Please sign in to comment.