Skip to content

Commit

Permalink
Add Python bindings for get_json_object (#7981)
Browse files Browse the repository at this point in the history
Fixes: [7916](#7916)

Authors:
  - Sheilah Kirui (https://github.com/skirui-source)

Approvers:
  - Keith Kraus (https://github.com/kkraus14)

URL: #7981
  • Loading branch information
skirui-source authored May 6, 2021
1 parent 96c0706 commit 8ae73d5
Show file tree
Hide file tree
Showing 4 changed files with 259 additions and 0 deletions.
16 changes: 16 additions & 0 deletions python/cudf/cudf/_lib/cpp/strings/json.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Copyright (c) 2021, NVIDIA CORPORATION.

from cudf._lib.cpp.column.column_view cimport column_view
from cudf._lib.cpp.scalar.scalar cimport string_scalar
from libcpp.memory cimport unique_ptr
from libcpp.string cimport string

from cudf._lib.cpp.column.column cimport column
from cudf._lib.cpp.scalar.scalar cimport scalar


cdef extern from "cudf/strings/json.hpp" namespace "cudf::strings" nogil:
    # Binding for cudf::strings::get_json_object. The enclosing block is
    # declared `nogil`, so the function may be called without holding the
    # GIL; `except +` makes Cython translate a thrown C++ exception into a
    # Python exception.
    cdef unique_ptr[column] get_json_object(
        column_view col, string_scalar json_path
    ) except +
36 changes: 36 additions & 0 deletions python/cudf/cudf/_lib/strings/json.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Copyright (c) 2021, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move
from cudf._lib.cpp.column.column_view cimport column_view
from cudf._lib.cpp.scalar.scalar cimport string_scalar
from cudf._lib.cpp.types cimport size_type
from cudf._lib.column cimport Column
from cudf._lib.scalar cimport DeviceScalar
from cudf._lib.cpp.column.column cimport column

from cudf._lib.cpp.strings.json cimport (
get_json_object as cpp_get_json_object
)


def get_json_object(Column col, object py_json_path):
    """
    Run a JSONPath query against every row of a column of json strings.

    Parameters
    ----------
    col : Column
        Input column; its view is handed to the C++ ``get_json_object``.
    py_json_path : object
        Object whose ``device_value`` attribute is a ``DeviceScalar``
        holding the JSONPath string.

    Returns
    -------
    Column
        Column built from the ``unique_ptr[column]`` returned by the
        C++ call.
    """
    cdef column_view input_view = col.view()
    cdef DeviceScalar path_scalar = py_json_path.device_value
    # The C++ API takes a string_scalar, so reinterpret the raw scalar
    # pointer held by the DeviceScalar.
    cdef const string_scalar* c_json_path = <const string_scalar*>(
        path_scalar.get_raw_ptr()
    )
    cdef unique_ptr[column] c_output

    # Only C++ objects are touched here, so the GIL is released for the
    # duration of the call.
    with nogil:
        c_output = move(cpp_get_json_object(input_view, c_json_path[0]))

    return Column.from_unique_ptr(move(c_output))
67 changes: 67 additions & 0 deletions python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@
startswith_multiple as cpp_startswith_multiple,
)
from cudf._lib.strings.findall import findall as cpp_findall
from cudf._lib.strings.json import get_json_object as cpp_get_json_object
from cudf._lib.strings.padding import (
PadSide,
center as cpp_center,
Expand Down Expand Up @@ -2180,6 +2181,72 @@ def get(self, i: int = 0) -> ParentType:

return self._return_or_inplace(cpp_string_get(self._column, i))

def get_json_object(self, json_path: str) -> ParentType:
    """
    Apply a JSONPath string to an input strings column
    where each row in the column is a valid json string.

    Parameters
    ----------
    json_path : str
        The JSONPath string to be applied to each row
        of the input column.

    Returns
    -------
    ParentType
        New strings object, as produced by ``_return_or_inplace``,
        containing the retrieved json object strings (a Series in
        the example below).

    Raises
    ------
    ValueError
        If the underlying call raises a ``RuntimeError`` whose message
        contains "Unrecognized JSONPath operator" or
        "Invalid empty name in JSONPath query string".
    RuntimeError
        Any other ``RuntimeError`` from the underlying call is
        re-raised unchanged.

    Examples
    --------
    >>> import cudf
    >>> s = cudf.Series(
    ...     [
    ...         \"\"\"
    ...         {
    ...             "store":{
    ...                 "book":[
    ...                     {
    ...                         "category":"reference",
    ...                         "author":"Nigel Rees",
    ...                         "title":"Sayings of the Century",
    ...                         "price":8.95
    ...                     },
    ...                     {
    ...                         "category":"fiction",
    ...                         "author":"Evelyn Waugh",
    ...                         "title":"Sword of Honour",
    ...                         "price":12.99
    ...                     }
    ...                 ]
    ...             }
    ...         }
    ...         \"\"\"
    ...     ])
    >>> s
    0    {"store": {\\n "book": [\\n { "cat...
    dtype: object
    >>> s.str.get_json_object("$.store.book")
    0    [\\n { "category": "reference",\\n ...
    dtype: object
    """
    path_scalar = cudf.Scalar(json_path, "str")

    # Keep the try body down to the one call whose RuntimeError is
    # translated below.
    try:
        result = cpp_get_json_object(self._column, path_scalar)
    except RuntimeError as e:
        # Messages that indicate the JSONPath itself was rejected; these
        # are surfaced as ValueError, anything else propagates untouched.
        matches = (
            "Unrecognized JSONPath operator",
            "Invalid empty name in JSONPath query string",
        )
        if any(match in str(e) for match in matches):
            raise ValueError("JSONPath value not found") from e
        raise

    return self._return_or_inplace(result)

def split(
self, pat: str = None, n: int = -1, expand: bool = None
) -> ParentType:
Expand Down
140 changes: 140 additions & 0 deletions python/cudf/cudf/tests/test_string.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Copyright (c) 2018-2021, NVIDIA CORPORATION.

import json
import re
import urllib.parse
from contextlib import ExitStack as does_not_raise
Expand Down Expand Up @@ -2932,6 +2933,145 @@ def test_string_slice_with_mask():
assert_eq(actual, expected)


@pytest.mark.parametrize(
    "data",
    [
        [
            """
{
"store":{
"book":[
{
"category":"reference",
"author":"Nigel Rees",
"title":"Sayings of the Century",
"price":8.95
},
{
"category":"fiction",
"author":"Evelyn Waugh",
"title":"Sword of Honour",
"price":12.99
}
]
}
}
"""
        ],
        [
            """
{
"store":{
"book":[
{
"category":"reference",
"author":"Nigel Rees",
"title":"Sayings of the Century",
"price":8.95
}
]
}
}
""",
            """
{
"store":{
"book":[
{
"category":"fiction",
"author":"Evelyn Waugh",
"title":"Sword of Honour",
"price":12.99
}
]
}
}
""",
        ],
    ],
)
def test_string_get_json_object_n(data):
    """Compare ``str.get_json_object`` with ``json.loads`` applied via pandas.

    For "$.store" and "$.store.book" only the first row is compared, after
    parsing the cudf result back into Python objects; for
    "$.store.book[0].category" the whole series is compared.
    """
    cudf_sr = cudf.Series(data)
    pandas_sr = pd.Series(data)

    def expected(extract):
        # Parse each row with the stdlib and pull out the wanted piece.
        return pandas_sr.apply(lambda raw: extract(json.loads(raw)))

    got_store = cudf_sr.str.get_json_object("$.store")
    assert_eq(
        json.loads(got_store[0]),
        expected(lambda doc: doc["store"])[0],
    )

    got_books = cudf_sr.str.get_json_object("$.store.book")
    assert_eq(
        json.loads(got_books[0]),
        expected(lambda doc: doc["store"]["book"])[0],
    )

    got_category = cudf_sr.str.get_json_object("$.store.book[0].category")
    assert_eq(
        got_category,
        expected(lambda doc: doc["store"]["book"][0]["category"]),
    )


@pytest.mark.parametrize(
    "json_path", ["$.store", "$.store.book", "$.store.book[*].category", " "]
)
def test_string_get_json_object_empty_json_strings(json_path):
    """Each parametrized path is expected to yield a single null row for a
    document whose keys are mostly empty strings."""
    document = """
{
"":{
"":[
{
"":"",
"":"",
"":""
},
{
"":"fiction",
"":"",
"title":""
}
]
}
}
"""
    input_sr = cudf.Series([document])

    actual = input_sr.str.get_json_object(json_path)
    expected = cudf.Series([None], dtype="object")

    assert_eq(actual, expected)


@pytest.mark.parametrize("json_path", ["a", ".", "/.store"])
def test_string_get_json_object_invalid_JSONPath(json_path):
    """Each parametrized path must make ``str.get_json_object`` raise
    ``ValueError``."""
    document = """
{
"store":{
"book":[
{
"category":"reference",
"author":"Nigel Rees",
"title":"Sayings of the Century",
"price":8.95
},
{
"category":"fiction",
"author":"Evelyn Waugh",
"title":"Sword of Honour",
"price":12.99
}
]
}
}
"""
    input_sr = cudf.Series([document])

    with pytest.raises(ValueError):
        input_sr.str.get_json_object(json_path)


def test_str_join_lists_error():
sr = cudf.Series([["a", "a"], ["b"], ["c"]])

Expand Down

0 comments on commit 8ae73d5

Please sign in to comment.