Skip to content

Commit

Permalink
Add json APIs to pylibcudf (#17025)
Browse files Browse the repository at this point in the history
Contributes to #15162

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - James Lamb (https://github.com/jameslamb)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #17025
  • Loading branch information
mroeschke authored Oct 10, 2024
1 parent 69b0f66 commit 7d49df7
Show file tree
Hide file tree
Showing 15 changed files with 246 additions and 73 deletions.
1 change: 1 addition & 0 deletions docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ This page provides API documentation for pylibcudf.
groupby
interop
join
json
labeling
lists
merge
Expand Down
6 changes: 6 additions & 0 deletions docs/cudf/source/user_guide/api_docs/pylibcudf/json.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
====
json
====

.. automodule:: pylibcudf.json
    :members:
2 changes: 1 addition & 1 deletion python/cudf/cudf/_lib/strings/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@
)
from cudf._lib.strings.find_multiple import find_multiple
from cudf._lib.strings.findall import find_re, findall
from cudf._lib.strings.json import GetJsonObjectOptions, get_json_object
from cudf._lib.strings.json import get_json_object
from cudf._lib.strings.padding import center, ljust, pad, rjust, zfill
from cudf._lib.strings.repeat import repeat_scalar, repeat_sequence
from cudf._lib.strings.replace import (
Expand Down
80 changes: 11 additions & 69 deletions python/cudf/cudf/_lib/strings/json.pyx
Original file line number Diff line number Diff line change
@@ -1,84 +1,26 @@
# Copyright (c) 2021-2024, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move
import pylibcudf as plc
from pylibcudf.json cimport GetJsonObjectOptions

from cudf.core.buffer import acquire_spill_lock

from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.column.column_view cimport column_view
from pylibcudf.libcudf.scalar.scalar cimport string_scalar
from pylibcudf.libcudf.strings.json cimport (
get_json_object as cpp_get_json_object,
get_json_object_options,
)

from cudf._lib.column cimport Column
from cudf._lib.scalar cimport DeviceScalar


@acquire_spill_lock()
def get_json_object(
Column col, object py_json_path, GetJsonObjectOptions options):
Column col,
object py_json_path,
GetJsonObjectOptions options
):
"""
Apply a JSONPath string to all rows in an input column
of json strings.
"""
cdef unique_ptr[column] c_result

cdef column_view col_view = col.view()
cdef DeviceScalar json_path = py_json_path.device_value

cdef const string_scalar* scalar_json_path = <const string_scalar*>(
json_path.get_raw_ptr()
plc_column = plc.json.get_json_object(
col.to_pylibcudf(mode="read"),
py_json_path.device_value.c_value,
options
)

with nogil:
c_result = move(cpp_get_json_object(
col_view,
scalar_json_path[0],
options.options,
))

return Column.from_unique_ptr(move(c_result))


cdef class GetJsonObjectOptions:
cdef get_json_object_options options

def __init__(
self,
*,
allow_single_quotes=False,
strip_quotes_from_single_strings=True,
missing_fields_as_nulls=False
):
self.options.set_allow_single_quotes(allow_single_quotes)
self.options.set_strip_quotes_from_single_strings(
strip_quotes_from_single_strings
)
self.options.set_missing_fields_as_nulls(missing_fields_as_nulls)

@property
def allow_single_quotes(self):
return self.options.get_allow_single_quotes()

@property
def strip_quotes_from_single_strings(self):
return self.options.get_strip_quotes_from_single_strings()

@property
def missing_fields_as_nulls(self):
return self.options.get_missing_fields_as_nulls()

@allow_single_quotes.setter
def allow_single_quotes(self, val):
self.options.set_allow_single_quotes(val)

@strip_quotes_from_single_strings.setter
def strip_quotes_from_single_strings(self, val):
self.options.set_strip_quotes_from_single_strings(val)

@missing_fields_as_nulls.setter
def missing_fields_as_nulls(self, val):
self.options.set_missing_fields_as_nulls(val)
return Column.from_pylibcudf(plc_column)
3 changes: 1 addition & 2 deletions python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -2385,8 +2385,7 @@ def get_json_object(
0 [\n { "category": "reference",\n ...
dtype: object
"""

options = libstrings.GetJsonObjectOptions(
options = plc.json.GetJsonObjectOptions(
allow_single_quotes=allow_single_quotes,
strip_quotes_from_single_strings=(
strip_quotes_from_single_strings
Expand Down
1 change: 1 addition & 0 deletions python/pylibcudf/pylibcudf/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ set(cython_sources
groupby.pyx
interop.pyx
join.pyx
json.pyx
labeling.pyx
lists.pyx
merge.pyx
Expand Down
2 changes: 2 additions & 0 deletions python/pylibcudf/pylibcudf/__init__.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ from . cimport (
filling,
groupby,
join,
json,
labeling,
lists,
merge,
Expand Down Expand Up @@ -60,6 +61,7 @@ __all__ = [
"gpumemoryview",
"groupby",
"join",
"json",
"lists",
"merge",
"null_mask",
Expand Down
2 changes: 2 additions & 0 deletions python/pylibcudf/pylibcudf/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
interop,
io,
join,
json,
labeling,
lists,
merge,
Expand Down Expand Up @@ -73,6 +74,7 @@
"interop",
"io",
"join",
"json",
"labeling",
"lists",
"merge",
Expand Down
16 changes: 16 additions & 0 deletions python/pylibcudf/pylibcudf/json.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from pylibcudf.column cimport Column
from pylibcudf.libcudf.json cimport get_json_object_options
from pylibcudf.scalar cimport Scalar


# Declaration of the extension type that carries the settings passed to
# get_json_object(); the methods themselves live in json.pyx.
cdef class GetJsonObjectOptions:
# The wrapped get_json_object_options object (cimported above from
# pylibcudf.libcudf.json), held directly as a C-level attribute.
cdef get_json_object_options options


# Declaration of get_json_object(); the body is in json.pyx.
# `options=*` is the .pxd spelling for "this argument is optional"; the
# actual default value is supplied by the implementation in the .pyx file.
cpdef Column get_json_object(
Column col,
Scalar json_path,
GetJsonObjectOptions options=*
)
154 changes: 154 additions & 0 deletions python/pylibcudf/pylibcudf/json.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from cython.operator cimport dereference
from libcpp cimport bool
from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move
from pylibcudf.column cimport Column
from pylibcudf.libcudf cimport json as cpp_json
from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.scalar.scalar cimport string_scalar
from pylibcudf.scalar cimport Scalar


cdef class GetJsonObjectOptions:
    """
    Settings for ``get_json_object()``.

    Every flag is keyword-only in the constructor and is forwarded to the
    matching ``set_*`` method, which stores it on the wrapped
    ``get_json_object_options`` object.

    Parameters
    ----------
    allow_single_quotes : bool, default False
        Whether single-quotes for representing strings are allowed.
    strip_quotes_from_single_strings : bool, default True
        Whether individually returned string values have their quotes
        stripped.
    missing_fields_as_nulls : bool, default False
        Whether a field not contained by an object is interpreted as null.
    """
    def __init__(
        self,
        *,
        allow_single_quotes=False,
        strip_quotes_from_single_strings=True,
        missing_fields_as_nulls=False
    ):
        # Go through the public setters so construction and later updates
        # share one code path.
        self.set_allow_single_quotes(allow_single_quotes)
        self.set_strip_quotes_from_single_strings(
            strip_quotes_from_single_strings
        )
        self.set_missing_fields_as_nulls(missing_fields_as_nulls)

    def get_allow_single_quotes(self):
        """
        Report whether single-quotes for representing strings are allowed.

        Returns
        -------
        bool
            True if single-quotes are allowed, False otherwise.
        """
        cdef bool single_quotes_ok = self.options.get_allow_single_quotes()
        return single_quotes_ok

    def get_strip_quotes_from_single_strings(self):
        """
        Report whether individually returned string values have their
        quotes stripped.

        Returns
        -------
        bool
            True if individually returned string values have their quotes
            stripped.
        """
        cdef bool strips_quotes = (
            self.options.get_strip_quotes_from_single_strings()
        )
        return strips_quotes

    def get_missing_fields_as_nulls(self):
        """
        Report whether a field not contained by an object is interpreted
        as null.

        Returns
        -------
        bool
            True if missing fields are interpreted as null.
        """
        cdef bool missing_is_null = self.options.get_missing_fields_as_nulls()
        return missing_is_null

    def set_allow_single_quotes(self, bool val):
        """
        Set whether single-quotes for strings are allowed.

        Parameters
        ----------
        val : bool
            Whether to allow single quotes.

        Returns
        -------
        None
        """
        self.options.set_allow_single_quotes(val)

    def set_strip_quotes_from_single_strings(self, bool val):
        """
        Set whether individually returned string values have their quotes
        stripped.

        Parameters
        ----------
        val : bool
            Whether to strip quotes from single strings.

        Returns
        -------
        None
        """
        self.options.set_strip_quotes_from_single_strings(val)

    def set_missing_fields_as_nulls(self, bool val):
        """
        Set whether missing fields are interpreted as null.

        Parameters
        ----------
        val : bool
            Whether to treat missing fields as nulls.

        Returns
        -------
        None
        """
        self.options.set_missing_fields_as_nulls(val)


cpdef Column get_json_object(
    Column col,
    Scalar json_path,
    GetJsonObjectOptions options=None
):
    """
    Evaluate a JSONPath string against every row of a strings column.

    For details, see :cpp:func:`cudf::get_json_object`

    Parameters
    ----------
    col : Column
        The input strings column. Each row must contain a valid json string.
    json_path : Scalar
        The JSONPath string to be applied to each row. Its underlying
        scalar is cast to a ``string_scalar`` pointer as-is; no runtime
        type check is performed here.
    options : GetJsonObjectOptions, optional
        Options for controlling the behavior of the function. When
        ``None`` (the default), a default-constructed
        ``GetJsonObjectOptions`` is used.

    Returns
    -------
    Column
        New strings column containing the retrieved json object strings.
    """
    cdef unique_ptr[column] extracted
    cdef string_scalar* path_ptr = <string_scalar*>json_path.c_obj.get()

    # No options supplied: fall back to the class defaults.
    if options is None:
        options = GetJsonObjectOptions()

    # Take the C++ options into a plain C variable so nothing inside the
    # nogil section has to touch a Python object attribute.
    cdef cpp_json.get_json_object_options native_opts = options.options

    with nogil:
        extracted = move(
            cpp_json.get_json_object(
                col.view(), dereference(path_ptr), native_opts
            )
        )

    return Column.from_libcudf(move(extracted))
File renamed without changes.
4 changes: 4 additions & 0 deletions python/pylibcudf/pylibcudf/strings/__init__.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ from . cimport (
padding,
regex_flags,
regex_program,
repeat,
replace,
side_type,
slice,
Expand All @@ -33,9 +34,12 @@ __all__ = [
"convert",
"extract",
"find",
"find_multiple",
"findall",
"padding",
"regex_flags",
"regex_program",
"repeat",
"replace",
"slice",
"strip",
Expand Down
3 changes: 3 additions & 0 deletions python/pylibcudf/pylibcudf/strings/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,12 @@
"convert",
"extract",
"find",
"find_multiple",
"findall",
"padding",
"regex_flags",
"regex_program",
"repeat",
"replace",
"slice",
"strip",
Expand Down
Loading

0 comments on commit 7d49df7

Please sign in to comment.