Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Python bindings for get_json_object #7981

Merged
merged 27 commits into from
May 6, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
c04739b
created json.pxd file
skirui-source Apr 16, 2021
af61b04
Merge branch 'branch-0.20' of https://github.com/rapidsai/cudf into j…
skirui-source Apr 16, 2021
a3e63f0
added python wrapper >>json.pyx
skirui-source Apr 17, 2021
3048766
Merge branch 'branch-0.20' of https://github.com/rapidsai/cudf into j…
skirui-source Apr 17, 2021
a5630d8
Merge branch 'branch-0.20' of https://github.com/rapidsai/cudf into j…
skirui-source Apr 21, 2021
b86b0d3
added extract_json.py
skirui-source Apr 22, 2021
93fd718
Merge branch 'branch-0.20' of https://github.com/rapidsai/cudf into j…
skirui-source Apr 22, 2021
dd6f773
added test, addressed review
skirui-source Apr 23, 2021
4296584
Merge branch 'branch-0.20' of https://github.com/rapidsai/cudf into j…
skirui-source Apr 23, 2021
09c4bcd
.
skirui-source Apr 23, 2021
b3e133d
Merge branch 'branch-0.20' of https://github.com/rapidsai/cudf into j…
skirui-source Apr 26, 2021
366d0ab
added more tests
skirui-source Apr 27, 2021
1392807
all tests passing, ready for review
skirui-source Apr 28, 2021
06c5302
fixed style issues
skirui-source Apr 28, 2021
239b207
Merge branch 'branch-0.20' of https://github.com/rapidsai/cudf into j…
skirui-source Apr 28, 2021
48c6216
.
skirui-source Apr 28, 2021
39bba46
fixed style issues with json strings
skirui-source Apr 28, 2021
8409dbc
addressed review
skirui-source Apr 30, 2021
c16496e
.
skirui-source Apr 30, 2021
45efb70
Merge branch 'branch-0.20' of https://github.com/rapidsai/cudf into j…
skirui-source Apr 30, 2021
4b88b72
all tests passing now
skirui-source May 3, 2021
d2c377a
Merge branch 'branch-0.20' of https://github.com/rapidsai/cudf into j…
skirui-source May 3, 2021
93c44cd
fixed tests, addressed reviews
skirui-source May 5, 2021
1e70807
Merge branch 'branch-0.20' of https://github.com/rapidsai/cudf into j…
skirui-source May 5, 2021
ad80419
removed print in tests
skirui-source May 5, 2021
db8bda6
.
skirui-source May 6, 2021
36a0cab
fixed format issue
skirui-source May 6, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions python/cudf/cudf/_lib/cpp/strings/json.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Copyright (c) 2021, NVIDIA CORPORATION.

from cudf._lib.cpp.column.column_view cimport column_view
from cudf._lib.cpp.scalar.scalar cimport string_scalar
from libcpp.memory cimport unique_ptr
from libcpp.string cimport string

from cudf._lib.cpp.column.column cimport column
from cudf._lib.cpp.scalar.scalar cimport scalar


# Declaration of the libcudf JSONPath API from "cudf/strings/json.hpp".
# The block is marked `nogil`, so the function may be called while the GIL
# is released.
cdef extern from "cudf/strings/json.hpp" namespace "cudf::strings" nogil:
    # Takes a column view and a string scalar holding the JSONPath query and
    # returns ownership of a newly created column.
    # `except +` lets Cython translate a thrown C++ exception into a Python
    # exception instead of aborting the process.
    cdef unique_ptr[column] get_json_object(
        column_view col, string_scalar json_path
    ) except +
36 changes: 36 additions & 0 deletions python/cudf/cudf/_lib/strings/json.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Copyright (c) 2021, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move
from cudf._lib.cpp.column.column_view cimport column_view
from cudf._lib.cpp.scalar.scalar cimport string_scalar
from cudf._lib.cpp.types cimport size_type
from cudf._lib.column cimport Column
from cudf._lib.scalar cimport DeviceScalar
from cudf._lib.cpp.column.column cimport column

from cudf._lib.cpp.strings.json cimport (
get_json_object as cpp_get_json_object
)


def get_json_object(Column col, object py_json_path):
    """
    Run a single JSONPath query over every row of a column of JSON
    strings and return the results as a new ``Column``.

    ``py_json_path`` must expose a ``device_value`` attribute that is a
    ``DeviceScalar``; its raw pointer is reinterpreted as a
    ``string_scalar`` and passed to libcudf.
    """
    cdef column_view c_input = col.view()
    cdef DeviceScalar path_scalar = py_json_path.device_value
    cdef const string_scalar* c_path = <const string_scalar*>(
        path_scalar.get_raw_ptr()
    )
    cdef unique_ptr[column] c_output

    # Only C-level objects are touched here, so the GIL can be released
    # for the duration of the libcudf call.
    with nogil:
        c_output = move(cpp_get_json_object(c_input, c_path[0]))

    return Column.from_unique_ptr(move(c_output))
67 changes: 67 additions & 0 deletions python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@
startswith_multiple as cpp_startswith_multiple,
)
from cudf._lib.strings.findall import findall as cpp_findall
from cudf._lib.strings.json import get_json_object as cpp_get_json_object
from cudf._lib.strings.padding import (
PadSide,
center as cpp_center,
Expand Down Expand Up @@ -2180,6 +2181,72 @@ def get(self, i: int = 0) -> ParentType:

return self._return_or_inplace(cpp_string_get(self._column, i))

def get_json_object(self, json_path):
    """
    Evaluate a JSONPath query against every row of a strings column
    in which each row is a JSON document.

    Parameters
    ----------
    json_path : str
        The JSONPath query applied to each row. It is wrapped in a
        ``cudf.Scalar`` of dtype ``"str"`` before being passed to the
        ``cpp_get_json_object`` binding.

    Returns
    -------
    The retrieved JSON object strings, wrapped by
    ``self._return_or_inplace``.

    Raises
    ------
    ValueError
        If the binding raises a ``RuntimeError`` whose message contains
        "Unrecognized JSONPath operator" or "Invalid empty name in
        JSONPath query string". Any other ``RuntimeError`` propagates
        unchanged.

    Examples
    --------
    >>> import cudf
    >>> s = cudf.Series(
    ...     [
    ...         \"\"\"
    ...         {
    ...             "store":{
    ...                 "book":[
    ...                     {
    ...                         "category":"reference",
    ...                         "author":"Nigel Rees",
    ...                         "title":"Sayings of the Century",
    ...                         "price":8.95
    ...                     },
    ...                     {
    ...                         "category":"fiction",
    ...                         "author":"Evelyn Waugh",
    ...                         "title":"Sword of Honour",
    ...                         "price":12.99
    ...                     }
    ...                 ]
    ...             }
    ...         }
    ...         \"\"\"
    ...     ]
    ... )
    >>> s
    0    {"store": {\\n "book": [\\n { "cat...
    dtype: object
    >>> s.str.get_json_object("$.store.book")
    0    [\\n { "category": "reference",\\n ...
    dtype: object
    """
    # Messages of RuntimeErrors that indicate a bad JSONPath query; these
    # are surfaced to the caller as ValueError.
    bad_path_messages = (
        "Unrecognized JSONPath operator",
        "Invalid empty name in JSONPath query string",
    )

    try:
        extracted = cpp_get_json_object(
            self._column, cudf.Scalar(json_path, "str")
        )
        result = self._return_or_inplace(extracted)
    except RuntimeError as e:
        message = str(e)
        for fragment in bad_path_messages:
            if fragment in message:
                raise ValueError("JSONPath value not found") from e
        # Not a recognised JSONPath failure: re-raise untouched.
        raise

    return result

def split(
self, pat: str = None, n: int = -1, expand: bool = None
) -> ParentType:
Expand Down
140 changes: 140 additions & 0 deletions python/cudf/cudf/tests/test_string.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Copyright (c) 2018-2021, NVIDIA CORPORATION.

import json
import re
import urllib.parse
from contextlib import ExitStack as does_not_raise
Expand Down Expand Up @@ -2932,6 +2933,145 @@ def test_string_slice_with_mask():
assert_eq(actual, expected)


@pytest.mark.parametrize(
    "data",
    [
        [
            """
            {
                "store":{
                    "book":[
                        {
                            "category":"reference",
                            "author":"Nigel Rees",
                            "title":"Sayings of the Century",
                            "price":8.95
                        },
                        {
                            "category":"fiction",
                            "author":"Evelyn Waugh",
                            "title":"Sword of Honour",
                            "price":12.99
                        }
                    ]
                }
            }
            """
        ],
        [
            """
            {
                "store":{
                    "book":[
                        {
                            "category":"reference",
                            "author":"Nigel Rees",
                            "title":"Sayings of the Century",
                            "price":8.95
                        }
                    ]
                }
            }
            """,
            """
            {
                "store":{
                    "book":[
                        {
                            "category":"fiction",
                            "author":"Evelyn Waugh",
                            "title":"Sword of Honour",
                            "price":12.99
                        }
                    ]
                }
            }
            """,
        ],
    ],
)
def test_string_get_json_object_n(data):
    """
    Check ``str.get_json_object`` against ``json.loads`` for one-row and
    multi-row inputs.

    Object/array results are parsed with ``json.loads`` before comparison
    so that only the JSON content, not its whitespace, is compared. Every
    row is checked, not just the first, so the multi-row parametrization
    actually exercises its second row.
    """
    gs = cudf.Series(data)
    ps = pd.Series(data)

    got_store = gs.str.get_json_object("$.store").to_pandas()
    got_book = gs.str.get_json_object("$.store.book").to_pandas()
    assert len(got_store) == len(data)
    assert len(got_book) == len(data)

    for raw, store, book in zip(data, got_store, got_book):
        expect_store = json.loads(raw)["store"]
        assert_eq(json.loads(store), expect_store)
        assert_eq(json.loads(book), expect_store["book"])

    # Scalar string results can be compared as whole series.
    assert_eq(
        gs.str.get_json_object("$.store.book[0].category"),
        ps.apply(lambda x: json.loads(x)["store"]["book"][0]["category"]),
    )


@pytest.mark.parametrize(
    "json_path", ["$.store", "$.store.book", "$.store.book[*].category", " "]
)
def test_string_get_json_object_empty_json_strings(json_path):
    """
    A document whose keys are almost all empty strings has no "store"
    entry, so each parametrized path is expected to produce a single
    null row.
    """
    document = """
        {
            "":{
                "":[
                    {
                        "":"",
                        "":"",
                        "":""
                    },
                    {
                        "":"fiction",
                        "":"",
                        "title":""
                    }
                ]
            }
        }
        """
    series = cudf.Series([document])

    actual = series.str.get_json_object(json_path)
    expected = cudf.Series([None], dtype="object")

    assert_eq(actual, expected)


@pytest.mark.parametrize("json_path", ["a", ".", "/.store"])
def test_string_get_json_object_invalid_JSONPath(json_path):
    """
    Each parametrized JSONPath string must make ``str.get_json_object``
    raise ``ValueError``.
    """
    document = """
        {
            "store":{
                "book":[
                    {
                        "category":"reference",
                        "author":"Nigel Rees",
                        "title":"Sayings of the Century",
                        "price":8.95
                    },
                    {
                        "category":"fiction",
                        "author":"Evelyn Waugh",
                        "title":"Sword of Honour",
                        "price":12.99
                    }
                ]
            }
        }
        """
    series = cudf.Series([document])

    with pytest.raises(ValueError):
        series.str.get_json_object(json_path)
skirui-source marked this conversation as resolved.
Show resolved Hide resolved


def test_str_join_lists_error():
sr = cudf.Series([["a", "a"], ["b"], ["c"]])

Expand Down