Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add string.convert.convert_urls APIs to pylibcudf #17003

Merged
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions cpp/include/cudf/strings/convert/convert_urls.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ namespace strings {
*/

/**
* @brief Decodes each string using URL encoding.
* @brief Encodes each string using URL encoding.
*
* Converts mostly non-ascii characters and control characters into UTF-8 hex code-points
* prefixed with '%'. For example, the space character must be converted to characters '%20' where
Expand All @@ -49,7 +49,7 @@ std::unique_ptr<column> url_encode(
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

/**
* @brief Encodes each string using URL encoding.
* @brief Decodes each string using URL encoding.
*
* Converts all character sequences starting with '%' into character code-points
* interpreting the 2 following characters as hex values to create the code-point.
Expand Down
36 changes: 7 additions & 29 deletions python/cudf/cudf/_lib/strings/convert/convert_urls.pyx
Original file line number Diff line number Diff line change
@@ -1,17 +1,9 @@
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move
import pylibcudf as plc

from cudf.core.buffer import acquire_spill_lock

from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.column.column_view cimport column_view
from pylibcudf.libcudf.strings.convert.convert_urls cimport (
url_decode as cpp_url_decode,
url_encode as cpp_url_encode,
)

from cudf._lib.column cimport Column


Expand All @@ -28,17 +20,10 @@ def url_decode(Column source_strings):
-------
URL decoded string column
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

with nogil:
c_result = move(cpp_url_decode(
source_view
))

return Column.from_unique_ptr(
move(c_result)
plc_column = plc.strings.convert.convert_urls.url_decode(
source_strings.to_pylibcudf(mode="read")
)
return Column.from_pylibcudf(plc_column)


@acquire_spill_lock()
Expand All @@ -57,14 +42,7 @@ def url_encode(Column source_strings):
-------
URL encoded string column
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

with nogil:
c_result = move(cpp_url_encode(
source_view
))

return Column.from_unique_ptr(
move(c_result)
plc_column = plc.strings.convert.convert_urls.url_encode(
source_strings.to_pylibcudf(mode="read")
)
return Column.from_pylibcudf(plc_column)
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ from pylibcudf.libcudf.column.column_view cimport column_view
cdef extern from "cudf/strings/convert/convert_urls.hpp" namespace \
"cudf::strings" nogil:
cdef unique_ptr[column] url_encode(
column_view input_col) except +
column_view input) except +
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@vyasr should we be marking all of these APIs with `libcudf_exception_handler` as we go, even if the API currently has no documented exception cases?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that's probably a good best practice, yeah.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

xref #17036 to follow up in the future


cdef unique_ptr[column] url_decode(
column_view input_col) except +
column_view input) except +
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
# the License.
# =============================================================================

set(cython_sources convert_booleans.pyx convert_durations.pyx convert_datetime.pyx)
set(cython_sources convert_booleans.pyx convert_durations.pyx convert_datetime.pyx convert_urls.pyx)

set(linked_libraries cudf::cudf)
rapids_cython_create_modules(
Expand Down
7 changes: 6 additions & 1 deletion python/pylibcudf/pylibcudf/strings/convert/__init__.pxd
Original file line number Diff line number Diff line change
@@ -1,2 +1,7 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
from . cimport convert_booleans, convert_datetime, convert_durations
from . cimport (
convert_booleans,
convert_datetime,
convert_durations,
convert_urls,
)
7 changes: 6 additions & 1 deletion python/pylibcudf/pylibcudf/strings/convert/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,7 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
from . import convert_booleans, convert_datetime, convert_durations
from . import (
convert_booleans,
convert_datetime,
convert_durations,
convert_urls,
)
8 changes: 8 additions & 0 deletions python/pylibcudf/pylibcudf/strings/convert/convert_urls.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from pylibcudf.column cimport Column


# Declarations for the URL conversion functions implemented in
# convert_urls.pyx; the parameter name is kept identical to the
# definitions there (``input``).
cpdef Column url_encode(Column input)

cpdef Column url_decode(Column input)
63 changes: 63 additions & 0 deletions python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move
from pylibcudf.column cimport Column
from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.strings.convert cimport convert_urls as cpp_convert_urls


cpdef Column url_encode(Column input):
    """
    URL-encode every string in a strings column.

    For details, see :cpp:func:`cudf::strings::url_encode`

    Parameters
    ----------
    input : Column
        Strings column to encode.

    Returns
    -------
    Column
        New strings column holding the result.
    """
    cdef unique_ptr[column] encoded

    # The libcudf call is made with the GIL released.
    with nogil:
        encoded = move(cpp_convert_urls.url_encode(input.view()))

    return Column.from_libcudf(move(encoded))


cpdef Column url_decode(Column input):
    """
    URL-decode every string in a strings column.

    For details, see :cpp:func:`cudf::strings::url_decode`

    Parameters
    ----------
    input : Column
        Strings column to decode.

    Returns
    -------
    Column
        New strings column holding the result.
    """
    cdef unique_ptr[column] decoded

    # The libcudf call is made with the GIL released.
    with nogil:
        decoded = move(cpp_convert_urls.url_decode(input.view()))

    return Column.from_libcudf(move(decoded))
36 changes: 36 additions & 0 deletions python/pylibcudf/pylibcudf/tests/test_string_convert_urls.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
import urllib

import pyarrow as pa
import pylibcudf as plc
from utils import assert_column_eq


def test_url_encode():
    """Compare ``url_encode`` with ``urllib.parse.quote(..., safe="")``."""
    # Import the submodule explicitly: a bare ``import urllib`` does not
    # guarantee that ``urllib.parse`` has been loaded.
    from urllib.parse import quote

    data = ["/home/nfs", None]
    arr = pa.array(data)
    result = plc.strings.convert.convert_urls.url_encode(
        plc.interop.from_arrow(arr)
    )
    # Non-string entries (the None) are passed through to the expected
    # array unchanged.
    expected = pa.array(
        [
            quote(url, safe="") if isinstance(url, str) else url
            for url in data
        ]
    )
    assert_column_eq(result, expected)


def test_url_decode():
    """Compare ``url_decode`` with ``urllib.parse.unquote``."""
    # Import the submodule explicitly: a bare ``import urllib`` does not
    # guarantee that ``urllib.parse`` has been loaded.
    from urllib.parse import unquote

    # The input mixes upper- and lower-case hex digits ("%2F" and "%2f").
    data = ["%2Fhome%2fnfs", None]
    arr = pa.array(data)
    result = plc.strings.convert.convert_urls.url_decode(
        plc.interop.from_arrow(arr)
    )
    # Non-string entries (the None) are passed through to the expected
    # array unchanged.
    expected = pa.array(
        [unquote(url) if isinstance(url, str) else url for url in data]
    )
    assert_column_eq(result, expected)
Loading