diff --git a/python/cudf/cudf/_lib/cpp/strings/json.pxd b/python/cudf/cudf/_lib/cpp/strings/json.pxd
new file mode 100644
index 00000000000..c0e215f2085
--- /dev/null
+++ b/python/cudf/cudf/_lib/cpp/strings/json.pxd
@@ -0,0 +1,16 @@
+# Copyright (c) 2021, NVIDIA CORPORATION.
+
+from cudf._lib.cpp.column.column_view cimport column_view
+from cudf._lib.cpp.scalar.scalar cimport string_scalar
+from libcpp.memory cimport unique_ptr
+from libcpp.string cimport string
+
+from cudf._lib.cpp.column.column cimport column
+from cudf._lib.cpp.scalar.scalar cimport scalar
+
+
+cdef extern from "cudf/strings/json.hpp" namespace "cudf::strings" nogil:
+    cdef unique_ptr[column] get_json_object(
+        column_view col,
+        string_scalar json_path,
+    ) except +
diff --git a/python/cudf/cudf/_lib/strings/json.pyx b/python/cudf/cudf/_lib/strings/json.pyx
new file mode 100644
index 00000000000..211bbe9d4f0
--- /dev/null
+++ b/python/cudf/cudf/_lib/strings/json.pyx
@@ -0,0 +1,36 @@
+# Copyright (c) 2021, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+from cudf._lib.cpp.column.column_view cimport column_view
+from cudf._lib.cpp.scalar.scalar cimport string_scalar
+from cudf._lib.cpp.types cimport size_type
+from cudf._lib.column cimport Column
+from cudf._lib.scalar cimport DeviceScalar
+from cudf._lib.cpp.column.column cimport column
+
+from cudf._lib.cpp.strings.json cimport (
+    get_json_object as cpp_get_json_object
+)
+
+
+def get_json_object(Column col, object py_json_path):
+    """
+    Apply a JSONPath string to all rows in an input column
+    of json strings.
+    """
+    cdef unique_ptr[column] c_result
+
+    cdef column_view col_view = col.view()
+    cdef DeviceScalar json_path = py_json_path.device_value
+
+    cdef const string_scalar* scalar_json_path = <const string_scalar*>(
+        json_path.get_raw_ptr()
+    )
+    with nogil:
+        c_result = move(cpp_get_json_object(
+            col_view,
+            scalar_json_path[0],
+        ))
+
+    return Column.from_unique_ptr(move(c_result))
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index 044088b68b5..14b71ad5528 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -110,6 +110,7 @@
     startswith_multiple as cpp_startswith_multiple,
 )
 from cudf._lib.strings.findall import findall as cpp_findall
+from cudf._lib.strings.json import get_json_object as cpp_get_json_object
 from cudf._lib.strings.padding import (
     PadSide,
     center as cpp_center,
@@ -2180,6 +2181,80 @@ def get(self, i: int = 0) -> ParentType:
 
         return self._return_or_inplace(cpp_string_get(self._column, i))
 
+    def get_json_object(self, json_path: str) -> ParentType:
+        """
+        Applies a JSONPath string to an input strings column
+        where each row in the column is a valid json string
+
+        Parameters
+        ----------
+        json_path: str
+            The JSONPath string to be applied to each row
+            of the input column
+
+        Returns
+        -------
+        Column: New strings column containing the retrieved json object strings
+
+        Raises
+        ------
+        ValueError
+            If ``json_path`` contains an unrecognized JSONPath operator
+            or an empty name
+
+        Examples
+        --------
+        >>> import cudf
+        >>> s = cudf.Series(
+            [
+                \"\"\"
+                {
+                    "store":{
+                        "book":[
+                            {
+                                "category":"reference",
+                                "author":"Nigel Rees",
+                                "title":"Sayings of the Century",
+                                "price":8.95
+                            },
+                            {
+                                "category":"fiction",
+                                "author":"Evelyn Waugh",
+                                "title":"Sword of Honour",
+                                "price":12.99
+                            }
+                        ]
+                    }
+                }
+                \"\"\"
+            ])
+        >>> s
+        0    {"store": {\\n "book": [\\n { "cat...
+        dtype: object
+        >>> s.str.get_json_object("$.store.book")
+        0    [\\n { "category": "reference",\\n ...
+        dtype: object
+        """
+
+        try:
+            res = self._return_or_inplace(
+                cpp_get_json_object(
+                    self._column, cudf.Scalar(json_path, "str")
+                )
+            )
+        except RuntimeError as e:
+            # Only JSONPath syntax errors are translated to ValueError;
+            # any other RuntimeError is re-raised unchanged.
+            matches = (
+                "Unrecognized JSONPath operator",
+                "Invalid empty name in JSONPath query string",
+            )
+            if any(match in str(e) for match in matches):
+                raise ValueError("Invalid JSONPath query string") from e
+            raise
+        else:
+            return res
+
     def split(
         self, pat: str = None, n: int = -1, expand: bool = None
     ) -> ParentType:
diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py
index 0ff5b81ce81..c6a64824a86 100644
--- a/python/cudf/cudf/tests/test_string.py
+++ b/python/cudf/cudf/tests/test_string.py
@@ -1,5 +1,6 @@
 # Copyright (c) 2018-2021, NVIDIA CORPORATION.
 
+import json
 import re
 import urllib.parse
 from contextlib import ExitStack as does_not_raise
@@ -2932,6 +2933,145 @@ def test_string_slice_with_mask():
     assert_eq(actual, expected)
 
 
+@pytest.mark.parametrize(
+    "data",
+    [
+        [
+            """
+            {
+                "store":{
+                    "book":[
+                        {
+                            "category":"reference",
+                            "author":"Nigel Rees",
+                            "title":"Sayings of the Century",
+                            "price":8.95
+                        },
+                        {
+                            "category":"fiction",
+                            "author":"Evelyn Waugh",
+                            "title":"Sword of Honour",
+                            "price":12.99
+                        }
+                    ]
+                }
+            }
+            """
+        ],
+        [
+            """
+            {
+                "store":{
+                    "book":[
+                        {
+                            "category":"reference",
+                            "author":"Nigel Rees",
+                            "title":"Sayings of the Century",
+                            "price":8.95
+                        }
+                    ]
+                }
+            }
+            """,
+            """
+            {
+                "store":{
+                    "book":[
+                        {
+                            "category":"fiction",
+                            "author":"Evelyn Waugh",
+                            "title":"Sword of Honour",
+                            "price":12.99
+                        }
+                    ]
+                }
+            }
+            """,
+        ],
+    ],
+)
+def test_string_get_json_object_n(data):
+    gs = cudf.Series(data)
+    ps = pd.Series(data)
+
+    assert_eq(
+        json.loads(gs.str.get_json_object("$.store")[0]),
+        ps.apply(lambda x: json.loads(x)["store"])[0],
+    )
+    assert_eq(
+        json.loads(gs.str.get_json_object("$.store.book")[0]),
+        ps.apply(lambda x: json.loads(x)["store"]["book"])[0],
+    )
+    assert_eq(
+        gs.str.get_json_object("$.store.book[0].category"),
+        ps.apply(lambda x: json.loads(x)["store"]["book"][0]["category"]),
+    )
+
+
+@pytest.mark.parametrize(
+    "json_path", ["$.store", "$.store.book", "$.store.book[*].category", " "]
+)
+def test_string_get_json_object_empty_json_strings(json_path):
+    gs = cudf.Series(
+        [
+            """
+            {
+                "":{
+                    "":[
+                        {
+                            "":"",
+                            "":"",
+                            "":""
+                        },
+                        {
+                            "":"fiction",
+                            "":"",
+                            "title":""
+                        }
+                    ]
+                }
+            }
+            """
+        ]
+    )
+
+    got = gs.str.get_json_object(json_path)
+    expect = cudf.Series([None], dtype="object")
+
+    assert_eq(got, expect)
+
+
+@pytest.mark.parametrize("json_path", ["a", ".", "/.store"])
+def test_string_get_json_object_invalid_JSONPath(json_path):
+    gs = cudf.Series(
+        [
+            """
+            {
+                "store":{
+                    "book":[
+                        {
+                            "category":"reference",
+                            "author":"Nigel Rees",
+                            "title":"Sayings of the Century",
+                            "price":8.95
+                        },
+                        {
+                            "category":"fiction",
+                            "author":"Evelyn Waugh",
+                            "title":"Sword of Honour",
+                            "price":12.99
+                        }
+                    ]
+                }
+            }
+            """
+        ]
+    )
+
+    with pytest.raises(ValueError):
+        gs.str.get_json_object(json_path)
+
+
 def test_str_join_lists_error():
     sr = cudf.Series([["a", "a"], ["b"], ["c"]])
 