rapidsai · rapids-bot · Jul 11, 2022 · Jun 27, 2022 · Jun 28, 2022 · Jun 29, 2022
@@ -1,5 +1,6 @@
-# Copyright (c) 2021, NVIDIA CORPORATION.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION.
 
+from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
 
@@ -9,7 +10,19 @@ from cudf._lib.cpp.scalar.scalar cimport scalar, string_scalar
 
 
 cdef extern from "cudf/strings/json.hpp" namespace "cudf::strings" nogil:
+    cdef cppclass get_json_object_options:  # passed to get_json_object
+        get_json_object_options() except +
+        # getters: each returns the current bool value of one option flag
+        bool get_allow_single_quotes() except +
+        bool get_strip_quotes_from_single_strings() except +
+        bool get_missing_fields_as_nulls() except +
+        # setters: each takes the new bool value for one option flag
+        void set_allow_single_quotes(bool val) except +
+        void set_strip_quotes_from_single_strings(bool val) except +
+        void set_missing_fields_as_nulls(bool val) except +
+
     cdef unique_ptr[column] get_json_object(
         column_view col,
         string_scalar json_path,
+        get_json_object_options options,  # flags from the cppclass above
     ) except +
@@ -62,7 +62,7 @@
     startswith_multiple,
 )
 from cudf._lib.strings.findall import findall, findall_record
-from cudf._lib.strings.json import get_json_object
+from cudf._lib.strings.json import get_json_object, GetJsonObjectOptions
 from cudf._lib.strings.padding import PadSide, center, ljust, pad, rjust, zfill
 from cudf._lib.strings.repeat import repeat_scalar, repeat_sequence
 from cudf._lib.strings.replace import (

@@ -1,18 +1,23 @@
-# Copyright (c) 2021, NVIDIA CORPORATION.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION.
 
+from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 
 from cudf._lib.column cimport Column
 from cudf._lib.cpp.column.column cimport column
 from cudf._lib.cpp.column.column_view cimport column_view
 from cudf._lib.cpp.scalar.scalar cimport string_scalar
-from cudf._lib.cpp.strings.json cimport get_json_object as cpp_get_json_object
+from cudf._lib.cpp.strings.json cimport (
+    get_json_object as cpp_get_json_object,
+    get_json_object_options,
+)
 from cudf._lib.cpp.types cimport size_type
 from cudf._lib.scalar cimport DeviceScalar
 
 
-def get_json_object(Column col, object py_json_path):
+def get_json_object(
+        Column col, object py_json_path, GetJsonObjectOptions options):
     """
     Apply a JSONPath string to all rows in an input column
     of json strings.
@@ -25,10 +30,53 @@ def get_json_object(Column col, object py_json_path):
     cdef const string_scalar* scalar_json_path = <const string_scalar*>(
         json_path.get_raw_ptr()
     )
+
     with nogil:
         c_result = move(cpp_get_json_object(
             col_view,
             scalar_json_path[0],
+            options.options,
         ))
 
     return Column.from_unique_ptr(move(c_result))
+
+
+cdef class GetJsonObjectOptions:  # wraps C++ get_json_object_options
+    cdef get_json_object_options options  # C++ object stored by value
+
+    def __init__(
+        self,
+        *,  # every option below is keyword-only
+        allow_single_quotes=False,
+        strip_quotes_from_single_strings=True,
+        missing_fields_as_nulls=False
+    ):
+        self.options.set_allow_single_quotes(allow_single_quotes)
+        self.options.set_strip_quotes_from_single_strings(
+            strip_quotes_from_single_strings
+        )
+        self.options.set_missing_fields_as_nulls(missing_fields_as_nulls)
+
+    @property  # each getter reads straight from the C++ object
+    def allow_single_quotes(self):
+        return self.options.get_allow_single_quotes()
+
+    @property
+    def strip_quotes_from_single_strings(self):
+        return self.options.get_strip_quotes_from_single_strings()
+
+    @property
+    def missing_fields_as_nulls(self):
+        return self.options.get_missing_fields_as_nulls()
+
+    @allow_single_quotes.setter  # each setter forwards val to the C++ setter
+    def allow_single_quotes(self, val):
+        self.options.set_allow_single_quotes(val)
+
+    @strip_quotes_from_single_strings.setter
+    def strip_quotes_from_single_strings(self, val):
+        self.options.set_strip_quotes_from_single_strings(val)
+
+    @missing_fields_as_nulls.setter
+    def missing_fields_as_nulls(self, val):
+        self.options.set_missing_fields_as_nulls(val)
@@ -2236,16 +2236,38 @@ def get(self, i: int = 0) -> SeriesOrIndex:
 
         return self._return_or_inplace(libstrings.get(self._column, i))
 
-    def get_json_object(self, json_path):
+    def get_json_object(
+        self,
+        json_path,
+        *,
+        allow_single_quotes=False,
+        strip_quotes_from_single_strings=True,
+        missing_fields_as_nulls=False,
+    ):
         r"""
         Applies a JSONPath string to an input strings column
         where each row in the column is a valid json string
 
         Parameters
         ----------
-        json_path: str
+        json_path : str
             The JSONPath string to be applied to each row
             of the input column
+        allow_single_quotes : bool, default False
+            If True, representing strings with single
+            quotes is allowed.
+            If False, strings must only be represented
+            with double quotes.
+        strip_quotes_from_single_strings : bool, default True
+            If True, strip the quotes from the return value of
+            a given row if it is a string.
+            If False, values returned for a given row include
+            quotes if they are strings.
+        missing_fields_as_nulls : bool, default False
+            If True, when an object is queried for a field
+            it does not contain, "null" is returned.
+            If False, when an object is queried for a field
+            it does not contain, None is returned.
 
         Returns
         -------
@@ -2286,9 +2308,16 @@ def get_json_object(self, json_path):
         """
 
         try:
+            options = libstrings.GetJsonObjectOptions(
+                allow_single_quotes=allow_single_quotes,
+                strip_quotes_from_single_strings=(
+                    strip_quotes_from_single_strings
+                ),
+                missing_fields_as_nulls=missing_fields_as_nulls,
+            )
             res = self._return_or_inplace(
                 libstrings.get_json_object(
-                    self._column, cudf.Scalar(json_path, "str")
+                    self._column, cudf.Scalar(json_path, "str"), options
                 )
             )
         except RuntimeError as e:

@@ -3129,6 +3129,157 @@ def test_string_get_json_object_invalid_JSONPath(json_path):
         gs.str.get_json_object(json_path)
 
 
+def test_string_get_json_object_allow_single_quotes():
+    gs = cudf.Series(  # one JSON row; some strings use ' instead of "
+        [
+            """
+            {
+                "store":{
+                    "book":[
+                        {
+                            'author':"Nigel Rees",
+                            "title":'Sayings of the Century',
+                            "price":8.95
+                        },
+                        {
+                            "category":"fiction",
+                            "author":"Evelyn Waugh",
+                            'title':"Sword of Honour",
+                            "price":12.99
+                        }
+                    ]
+                }
+            }
+            """
+        ]
+    )
+    assert_eq(  # allowed: the single-quoted 'author' key is matched
+        gs.str.get_json_object(
+            "$.store.book[0].author", allow_single_quotes=True
+        ),
+        cudf.Series(["Nigel Rees"]),
+    )
+    assert_eq(  # allowed: each title keeps the quote style it had in the row
+        gs.str.get_json_object(
+            "$.store.book[*].title", allow_single_quotes=True
+        ),
+        cudf.Series(["['Sayings of the Century',\"Sword of Honour\"]"]),
+    )
+
+    assert_eq(  # not allowed: the same query yields a null row
+        gs.str.get_json_object(
+            "$.store.book[0].author", allow_single_quotes=False
+        ),
+        cudf.Series([None]),
+    )
+    assert_eq(  # not allowed: the wildcard query yields a null row too
+        gs.str.get_json_object(
+            "$.store.book[*].title", allow_single_quotes=False
+        ),
+        cudf.Series([None]),
+    )
+
+
+def test_string_get_json_object_strip_quotes_from_single_strings():
+    gs = cudf.Series(  # one JSON row using only double-quoted strings
+        [
+            """
+            {
+                "store":{
+                    "book":[
+                        {
+                            "author":"Nigel Rees",
+                            "title":"Sayings of the Century",
+                            "price":8.95
+                        },
+                        {
+                            "category":"fiction",
+                            "author":"Evelyn Waugh",
+                            "title":"Sword of Honour",
+                            "price":12.99
+                        }
+                    ]
+                }
+            }
+            """
+        ]
+    )
+    assert_eq(  # stripping on: a lone string result has no quotes
+        gs.str.get_json_object(
+            "$.store.book[0].author", strip_quotes_from_single_strings=True
+        ),
+        cudf.Series(["Nigel Rees"]),
+    )
+    assert_eq(  # stripping on: strings inside an array keep their quotes
+        gs.str.get_json_object(
+            "$.store.book[*].title", strip_quotes_from_single_strings=True
+        ),
+        cudf.Series(['["Sayings of the Century","Sword of Honour"]']),
+    )
+    assert_eq(  # stripping off: a lone string result keeps its quotes
+        gs.str.get_json_object(
+            "$.store.book[0].author", strip_quotes_from_single_strings=False
+        ),
+        cudf.Series(['"Nigel Rees"']),
+    )
+    assert_eq(  # stripping off: array output matches the stripping-on case
+        gs.str.get_json_object(
+            "$.store.book[*].title", strip_quotes_from_single_strings=False
+        ),
+        cudf.Series(['["Sayings of the Century","Sword of Honour"]']),
+    )
+
+
+def test_string_get_json_object_missing_fields_as_nulls():
+    gs = cudf.Series(  # one JSON row; only the second book has "category"
+        [
+            """
+            {
+                "store":{
+                    "book":[
+                        {
+                            "author":"Nigel Rees",
+                            "title":"Sayings of the Century",
+                            "price":8.95
+                        },
+                        {
+                            "category":"fiction",
+                            "author":"Evelyn Waugh",
+                            "title":"Sword of Honour",
+                            "price":12.99
+                        }
+                    ]
+                }
+            }
+            """
+        ]
+    )
+    assert_eq(  # flag on: a missing field comes back as the string "null"
+        gs.str.get_json_object(
+            "$.store.book[0].category", missing_fields_as_nulls=True
+        ),
+        cudf.Series(["null"]),
+    )
+    assert_eq(  # flag on: wildcard emits null for the book without the field
+        gs.str.get_json_object(
+            "$.store.book[*].category", missing_fields_as_nulls=True
+        ),
+        cudf.Series(['[null,"fiction"]']),
+    )
+    assert_eq(  # flag off: a missing field yields a null row
+        gs.str.get_json_object(
+            "$.store.book[0].category", missing_fields_as_nulls=False
+        ),
+        cudf.Series([None]),
+    )
+    assert_eq(  # flag off: wildcard omits the book without the field
+        gs.str.get_json_object(
+            "$.store.book[*].category", missing_fields_as_nulls=False
+        ),
+        cudf.Series(['["fiction"]']),
+    )
+
+
 def test_str_join_lists_error():
     sr = cudf.Series([["a", "a"], ["b"], ["c"]])