diff --git a/python/cudf/cudf/_lib/cpp/strings/json.pxd b/python/cudf/cudf/_lib/cpp/strings/json.pxd
index 972e3c99d59..a017e1c5382 100644
--- a/python/cudf/cudf/_lib/cpp/strings/json.pxd
+++ b/python/cudf/cudf/_lib/cpp/strings/json.pxd
@@ -1,5 +1,6 @@
-# Copyright (c) 2021, NVIDIA CORPORATION.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION.
 
+from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
 
@@ -9,7 +10,19 @@ from cudf._lib.cpp.scalar.scalar cimport scalar, string_scalar
 
 
 cdef extern from "cudf/strings/json.hpp" namespace "cudf::strings" nogil:
+    cdef cppclass get_json_object_options:
+        get_json_object_options() except +
+        # getters
+        bool get_allow_single_quotes() except +
+        bool get_strip_quotes_from_single_strings() except +
+        bool get_missing_fields_as_nulls() except +
+        # setters
+        void set_allow_single_quotes(bool val) except +
+        void set_strip_quotes_from_single_strings(bool val) except +
+        void set_missing_fields_as_nulls(bool val) except +
+
     cdef unique_ptr[column] get_json_object(
         column_view col,
         string_scalar json_path,
+        get_json_object_options options,
     ) except +
diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py
index 15d5949b2cb..7e1c88c9258 100644
--- a/python/cudf/cudf/_lib/strings/__init__.py
+++ b/python/cudf/cudf/_lib/strings/__init__.py
@@ -62,7 +62,7 @@
     startswith_multiple,
 )
 from cudf._lib.strings.findall import findall, findall_record
-from cudf._lib.strings.json import get_json_object
+from cudf._lib.strings.json import GetJsonObjectOptions, get_json_object
 from cudf._lib.strings.padding import PadSide, center, ljust, pad, rjust, zfill
 from cudf._lib.strings.repeat import repeat_scalar, repeat_sequence
 from cudf._lib.strings.replace import (
diff --git a/python/cudf/cudf/_lib/strings/json.pyx b/python/cudf/cudf/_lib/strings/json.pyx
index c7545b6e481..1b1a9717e44 100644
--- a/python/cudf/cudf/_lib/strings/json.pyx
+++ b/python/cudf/cudf/_lib/strings/json.pyx
@@ -1,5 +1,6 @@
-# Copyright (c) 2021, NVIDIA CORPORATION.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION.
 
+from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 
@@ -7,12 +8,17 @@ from cudf._lib.column cimport Column
 from cudf._lib.cpp.column.column cimport column
 from cudf._lib.cpp.column.column_view cimport column_view
 from cudf._lib.cpp.scalar.scalar cimport string_scalar
-from cudf._lib.cpp.strings.json cimport get_json_object as cpp_get_json_object
+from cudf._lib.cpp.strings.json cimport (
+    get_json_object as cpp_get_json_object,
+    get_json_object_options,
+)
 from cudf._lib.cpp.types cimport size_type
 from cudf._lib.scalar cimport DeviceScalar
 
 
-def get_json_object(Column col, object py_json_path):
+def get_json_object(
+        Column col, object py_json_path,
+        GetJsonObjectOptions options not None):
     """
     Apply a JSONPath string to all rows in an input column
     of json strings.
@@ -25,10 +31,60 @@ def get_json_object(Column col, object py_json_path):
     cdef const string_scalar* scalar_json_path = (
         json_path.get_raw_ptr()
     )
+
     with nogil:
         c_result = move(cpp_get_json_object(
             col_view,
             scalar_json_path[0],
+            options.options,
         ))
 
     return Column.from_unique_ptr(move(c_result))
+
+
+cdef class GetJsonObjectOptions:
+    """
+    Python-facing holder for the C++ ``get_json_object_options`` object.
+
+    Each keyword-only flag is forwarded to the matching C++ setter and
+    can be read or changed afterwards through a property of the same
+    name. ``get_json_object`` passes the wrapped object to the C++ call.
+    """
+    cdef get_json_object_options options
+
+    def __init__(
+        self,
+        *,
+        allow_single_quotes=False,
+        strip_quotes_from_single_strings=True,
+        missing_fields_as_nulls=False
+    ):
+        self.options.set_allow_single_quotes(allow_single_quotes)
+        self.options.set_strip_quotes_from_single_strings(
+            strip_quotes_from_single_strings
+        )
+        self.options.set_missing_fields_as_nulls(missing_fields_as_nulls)
+
+    @property
+    def allow_single_quotes(self):
+        return self.options.get_allow_single_quotes()
+
+    @property
+    def strip_quotes_from_single_strings(self):
+        return self.options.get_strip_quotes_from_single_strings()
+
+    @property
+    def missing_fields_as_nulls(self):
+        return self.options.get_missing_fields_as_nulls()
+
+    @allow_single_quotes.setter
+    def allow_single_quotes(self, val):
+        self.options.set_allow_single_quotes(val)
+
+    @strip_quotes_from_single_strings.setter
+    def strip_quotes_from_single_strings(self, val):
+        self.options.set_strip_quotes_from_single_strings(val)
+
+    @missing_fields_as_nulls.setter
+    def missing_fields_as_nulls(self, val):
+        self.options.set_missing_fields_as_nulls(val)
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index edb27cc3473..2602ea65d85 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -2236,16 +2236,38 @@ def get(self, i: int = 0) -> SeriesOrIndex:
 
         return self._return_or_inplace(libstrings.get(self._column, i))
 
-    def get_json_object(self, json_path):
+    def get_json_object(
+        self,
+        json_path,
+        *,
+        allow_single_quotes=False,
+        strip_quotes_from_single_strings=True,
+        missing_fields_as_nulls=False,
+    ):
         r"""
         Applies a JSONPath string to an input strings column
         where each row in the column is a valid json string
 
         Parameters
         ----------
-        json_path: str
+        json_path : str
             The JSONPath string to be applied to each row
             of the input column
+        allow_single_quotes : bool, default False
+            If True, representing strings with single
+            quotes is allowed.
+            If False, strings must only be represented
+            with double quotes.
+        strip_quotes_from_single_strings : bool, default True
+            If True, strip the quotes from the return value of
+            a given row if it is a string.
+            If False, values returned for a given row include
+            quotes if they are strings.
+        missing_fields_as_nulls : bool, default False
+            If True, when an object is queried for a field
+            it does not contain, "null" is returned.
+            If False, when an object is queried for a field
+            it does not contain, None is returned.
 
         Returns
         -------
@@ -2286,9 +2308,16 @@ def get_json_object(self, json_path):
         """
 
         try:
+            options = libstrings.GetJsonObjectOptions(
+                allow_single_quotes=allow_single_quotes,
+                strip_quotes_from_single_strings=(
+                    strip_quotes_from_single_strings
+                ),
+                missing_fields_as_nulls=missing_fields_as_nulls,
+            )
             res = self._return_or_inplace(
                 libstrings.get_json_object(
-                    self._column, cudf.Scalar(json_path, "str")
+                    self._column, cudf.Scalar(json_path, "str"), options
                 )
             )
         except RuntimeError as e:
diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py
index d212c6b2072..47854368199 100644
--- a/python/cudf/cudf/tests/test_string.py
+++ b/python/cudf/cudf/tests/test_string.py
@@ -3129,6 +3129,157 @@ def test_string_get_json_object_invalid_JSONPath(json_path):
         gs.str.get_json_object(json_path)
 
 
+def test_string_get_json_object_allow_single_quotes():
+    gs = cudf.Series(
+        [
+            """
+            {
+                "store":{
+                    "book":[
+                        {
+                            'author':"Nigel Rees",
+                            "title":'Sayings of the Century',
+                            "price":8.95
+                        },
+                        {
+                            "category":"fiction",
+                            "author":"Evelyn Waugh",
+                            'title':"Sword of Honour",
+                            "price":12.99
+                        }
+                    ]
+                }
+            }
+            """
+        ]
+    )
+    assert_eq(
+        gs.str.get_json_object(
+            "$.store.book[0].author", allow_single_quotes=True
+        ),
+        cudf.Series(["Nigel Rees"]),
+    )
+    assert_eq(
+        gs.str.get_json_object(
+            "$.store.book[*].title", allow_single_quotes=True
+        ),
+        cudf.Series(["['Sayings of the Century',\"Sword of Honour\"]"]),
+    )
+
+    assert_eq(
+        gs.str.get_json_object(
+            "$.store.book[0].author", allow_single_quotes=False
+        ),
+        cudf.Series([None]),
+    )
+    assert_eq(
+        gs.str.get_json_object(
+            "$.store.book[*].title", allow_single_quotes=False
+        ),
+        cudf.Series([None]),
+    )
+
+
+def test_string_get_json_object_strip_quotes_from_single_strings():
+    gs = cudf.Series(
+        [
+            """
+            {
+                "store":{
+                    "book":[
+                        {
+                            "author":"Nigel Rees",
+                            "title":"Sayings of the Century",
+                            "price":8.95
+                        },
+                        {
+                            "category":"fiction",
+                            "author":"Evelyn Waugh",
+                            "title":"Sword of Honour",
+                            "price":12.99
+                        }
+                    ]
+                }
+            }
+            """
+        ]
+    )
+    assert_eq(
+        gs.str.get_json_object(
+            "$.store.book[0].author", strip_quotes_from_single_strings=True
+        ),
+        cudf.Series(["Nigel Rees"]),
+    )
+    assert_eq(
+        gs.str.get_json_object(
+            "$.store.book[*].title", strip_quotes_from_single_strings=True
+        ),
+        cudf.Series(['["Sayings of the Century","Sword of Honour"]']),
+    )
+    assert_eq(
+        gs.str.get_json_object(
+            "$.store.book[0].author", strip_quotes_from_single_strings=False
+        ),
+        cudf.Series(['"Nigel Rees"']),
+    )
+    assert_eq(
+        gs.str.get_json_object(
+            "$.store.book[*].title", strip_quotes_from_single_strings=False
+        ),
+        cudf.Series(['["Sayings of the Century","Sword of Honour"]']),
+    )
+
+
+def test_string_get_json_object_missing_fields_as_nulls():
+    gs = cudf.Series(
+        [
+            """
+            {
+                "store":{
+                    "book":[
+                        {
+                            "author":"Nigel Rees",
+                            "title":"Sayings of the Century",
+                            "price":8.95
+                        },
+                        {
+                            "category":"fiction",
+                            "author":"Evelyn Waugh",
+                            "title":"Sword of Honour",
+                            "price":12.99
+                        }
+                    ]
+                }
+            }
+            """
+        ]
+    )
+    assert_eq(
+        gs.str.get_json_object(
+            "$.store.book[0].category", missing_fields_as_nulls=True
+        ),
+        cudf.Series(["null"]),
+    )
+    assert_eq(
+        gs.str.get_json_object(
+            "$.store.book[*].category", missing_fields_as_nulls=True
+        ),
+        cudf.Series(['[null,"fiction"]']),
+    )
+    assert_eq(
+        gs.str.get_json_object(
+            "$.store.book[0].category", missing_fields_as_nulls=False
+        ),
+        cudf.Series([None]),
+    )
+    assert_eq(
+        gs.str.get_json_object(
+            "$.store.book[*].category", missing_fields_as_nulls=False
+        ),
+        cudf.Series(['["fiction"]']),
+    )
+
+
 def test_str_join_lists_error():
     sr = cudf.Series([["a", "a"], ["b"], ["c"]])
 