Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Expose get_json_object_options to Python #11180

15 changes: 14 additions & 1 deletion python/cudf/cudf/_lib/cpp/strings/json.pxd
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Copyright (c) 2021, NVIDIA CORPORATION.
# Copyright (c) 2021-2022, NVIDIA CORPORATION.

from libcpp cimport bool
from libcpp.memory cimport unique_ptr
from libcpp.string cimport string

Expand All @@ -9,7 +10,19 @@ from cudf._lib.cpp.scalar.scalar cimport scalar, string_scalar


cdef extern from "cudf/strings/json.hpp" namespace "cudf::strings" nogil:
cdef cppclass get_json_object_options:
get_json_object_options() except +
# getters
bool get_allow_single_quotes() except +
bool get_strip_quotes_from_single_strings() except +
bool get_missing_fields_as_nulls() except +
# setters
void set_allow_single_quotes(bool val) except +
void set_strip_quotes_from_single_strings(bool val) except +
void set_missing_fields_as_nulls(bool val) except +

cdef unique_ptr[column] get_json_object(
column_view col,
string_scalar json_path,
get_json_object_options options,
) except +
2 changes: 1 addition & 1 deletion python/cudf/cudf/_lib/strings/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@
startswith_multiple,
)
from cudf._lib.strings.findall import findall, findall_record
from cudf._lib.strings.json import get_json_object
from cudf._lib.strings.json import get_json_object, GetJsonObjectOptions
from cudf._lib.strings.padding import PadSide, center, ljust, pad, rjust, zfill
from cudf._lib.strings.repeat import repeat_scalar, repeat_sequence
from cudf._lib.strings.replace import (
Expand Down
35 changes: 32 additions & 3 deletions python/cudf/cudf/_lib/strings/json.pyx
Original file line number Diff line number Diff line change
@@ -1,18 +1,23 @@
# Copyright (c) 2021, NVIDIA CORPORATION.
# Copyright (c) 2021-2022, NVIDIA CORPORATION.

from libcpp cimport bool
from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move

from cudf._lib.column cimport Column
from cudf._lib.cpp.column.column cimport column
from cudf._lib.cpp.column.column_view cimport column_view
from cudf._lib.cpp.scalar.scalar cimport string_scalar
from cudf._lib.cpp.strings.json cimport get_json_object as cpp_get_json_object
from cudf._lib.cpp.strings.json cimport (
get_json_object as cpp_get_json_object,
get_json_object_options,
)
from cudf._lib.cpp.types cimport size_type
from cudf._lib.scalar cimport DeviceScalar


def get_json_object(Column col, object py_json_path):
def get_json_object(
Column col, object py_json_path, GetJsonObjectOptions options):
"""
Apply a JSONPath string to all rows in an input column
of json strings.
Expand All @@ -25,10 +30,34 @@ def get_json_object(Column col, object py_json_path):
cdef const string_scalar* scalar_json_path = <const string_scalar*>(
json_path.get_raw_ptr()
)

with nogil:
c_result = move(cpp_get_json_object(
col_view,
scalar_json_path[0],
options.options,
))

return Column.from_unique_ptr(move(c_result))


cdef class GetJsonObjectOptions:
cdef get_json_object_options options

def get_allow_single_quotes(self):
vyasr marked this conversation as resolved.
Show resolved Hide resolved
return self.options.get_allow_single_quotes()

def get_strip_quotes_from_single_strings(self):
return self.options.get_strip_quotes_from_single_strings()

def get_missing_fields_as_nulls(self):
return self.options.get_missing_fields_as_nulls()

def set_allow_single_quotes(self, val):
self.options.set_allow_single_quotes(val)

def set_strip_quotes_from_single_strings(self, val):
self.options.set_strip_quotes_from_single_strings(val)

def set_missing_fields_as_nulls(self, val):
self.options.set_missing_fields_as_nulls(val)
35 changes: 32 additions & 3 deletions python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -2236,16 +2236,38 @@ def get(self, i: int = 0) -> SeriesOrIndex:

return self._return_or_inplace(libstrings.get(self._column, i))

def get_json_object(self, json_path):
def get_json_object(
self,
json_path,
*,
allow_single_quotes=False,
strip_quotes_from_single_strings=True,
missing_fields_as_nulls=False,
):
r"""
Applies a JSONPath string to an input strings column
where each row in the column is a valid json string

Parameters
----------
json_path: str
json_path : str
The JSONPath string to be applied to each row
of the input column
allow_single_quotes : bool, default False
If True, representing strings with single
quotes is allowed.
If False, strings must only be represented
with double quotes.
strip_quotes_from_single_strings : bool, default True
If True, strip the quotes from the return value of
a given row if it is a string.
If False, values returned for a given row include
quotes if they are strings.
missing_fields_as_nulls : bool, default False
If True, when an object is queried for a field
it does not contain, "null" is returned.
If False, when an object is queried for a field
it does not contain, None is returned.

Returns
-------
Expand Down Expand Up @@ -2286,9 +2308,16 @@ def get_json_object(self, json_path):
"""

try:
options = libstrings.GetJsonObjectOptions()
options.set_allow_single_quotes(allow_single_quotes)
options.set_strip_quotes_from_single_strings(
strip_quotes_from_single_strings
)
options.set_missing_fields_as_nulls(missing_fields_as_nulls)

res = self._return_or_inplace(
libstrings.get_json_object(
self._column, cudf.Scalar(json_path, "str")
self._column, cudf.Scalar(json_path, "str"), options
)
)
except RuntimeError as e:
Expand Down
151 changes: 151 additions & 0 deletions python/cudf/cudf/tests/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -3129,6 +3129,157 @@ def test_string_get_json_object_invalid_JSONPath(json_path):
gs.str.get_json_object(json_path)


def test_string_get_json_object_allow_single_quotes():
gs = cudf.Series(
[
"""
{
"store":{
"book":[
{
'author':"Nigel Rees",
"title":'Sayings of the Century',
"price":8.95
},
{
"category":"fiction",
"author":"Evelyn Waugh",
'title':"Sword of Honour",
"price":12.99
}
]
}
}
"""
]
)
Comment on lines +3132 to +3155
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perhaps prefer

@pytest.fixture
def gs():
     return cudf.Series(...)


def test-string_get_json_object_allow_single_quotes(gs)
    ...

To avoid repeating the definition of the series object in each test.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good suggestion. @vyasr would this be in line with the recommendations for writing test? Or would this be better written in a pytest-case?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think @wence-'s suggestion is already a big improvement here. I would indeed prefer to use cases, but before making the argument for, I want to be clear on the fact that the difference between cases and fixtures is almost entirely one of semantics and providing nicer syntactic sugar. @wence- perhaps the discussion will help you give me some suggestions on how to improve with regards to your comment on my other PR, or maybe it'll provide you ammunition for a counterpoint that makes me change my mind!

Here's how I would write something like this with cases:

def json_object_case_single_quotes():
    return cudf.Series(...)

@parametrize_with_cases(gs, cases=".", prefix="json_object_case_")
def test_string_get_json_object(gs):
    ....

@parametrize_with_cases("gs", cases=".", prefix="json_object_case_")
def test_string_get_json_object(gs):
    ....

@parametrize_with_cases("gs", cases=".", prefix="json_object_case_")
def test_string_get_json_object_strip_quotes(gs):

Some notes:

  • I've included the cases in the same file here for brevity, but for a test file test_foo.py cases will be automatically detected from test_foo_cases.py. This has the benefit of enforcing some organization that you would otherwise have to do manually. The alternatives are cluttering the test file (or worse, conftest.py).
  • Cases are designed for parametrization, whereas fixtures are not. By that, what I mean is that a test runs on a single fixture, not many. (pytest-cases actually allows you to relax this restriction in some ways, but not as naturally as with cases. Also, it leads to cognitive dissonance since that's a way in which pytest-cases behaves differently from pytest rather than just adding functionality, so it could confuse users)
  • With cases you can select in various ways: prefix is an obvious choice, but you can also use globs or tag specific cases, for instance. That makes it much easier to create a suite of cases for a particular function, then write a set of tests for that function, and then create a many-to-many mapping between cases and the tests that those cases should be used for.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So let me see if I understand. In this instance, I think there is no difference in the behaviour. So I will try and explain what I think and then provide an example that shows differences?

A pytest.fixture is requested by a test by name.

@pytest.fixture
def foo():
     pass

def test_foo(foo):
     # fixture foo is invoked

So any parametrisation of fixtures happens by params:

@pytest.fixture(params=[1,2,3])
def foo(request):
     return request.param

def test_foo(foo):
    # invoked three times with foo=1, foo=2, foo=3

In the example we're discussing, the object we're testing is not parameterised by anything, so we're using a fixture for OAOO, and to avoid eager initialisation. This is better than just defining a module-level variable in a number of ways (including that the object is recreated for each test in turn, so stateful modifications between tests are not a problem).

In contrast, I think for the pytest-cases approach if I want a test argument to take multiple values I would write separate cases:

def my_case_the_first():
     ...

def my_case_the_second():
    ...

@parameterize_with_cases("foo", cases=".", prefix="my_case_")
def test_foo(foo):
    ... # test is called twice, first with foo=my_case_the_first(), then with foo=my_case_the_second()

So I can create a bunch of test cases and then select how I use them in my test functions.

So now my fixtures can have descriptive names, but I can bind them locally in the test to something short: I agree this makes fixtures more self-documenting.

I guess the other big thing that pytest-cases brings is that it augments the algebra of parameterized fixtures to union of sets rather than only cartesian product (though we're not using that here).

So. In sum, I can see the advantages for cases in the general case (hah!) when building lots of fixtures that one will use for many test functions (and need to manipulate in various ways).

In this particular instance, I'm not sold on it, since it seems to introduce more boilerplate than really necessary. Perhaps just using @fixture from pytest-cases rather than pytest would be reasonable in this case.

Copy link
Contributor

@vyasr vyasr Jul 14, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In this particular instance, I'm not sold on it, since it seems to introduce more boilerplate than really necessary.

Just to clarify, the boilerplate that you're referring to is the need to decorate each function with parametrize_with_cases, right? And there's not much benefit here because 1) we only have a single case, and 2) the exact same case is used in a large number of tests? If so, I think that I agree.

Using some of your points, let's try to come up with some reasonable guidelines for when using cases vs fixtures.

Use fixtures when:

  • Many tests must be run on exactly the same input
  • Many tests must be run on exactly the same set of inputs, and all of those inputs can be constructed as simple parametrizations.

Use cases when:

  • A test must be run on many different kinds of inputs
  • Given a set of cases, different tests need to run on different subsets with a nonempty intersection
  • Many tests must be run on exactly the same set of inputs, and all of those inputs can be constructed as simple complex parametrizations.

We would also need to clearly define "simple parametrization". To me, it's something where a single scalar parameter varies, and where that parameter is directly passed to some other function (like a constructor). Something like this:

@pytest.fixture(params=['a', 'b'])
def foo(request):
    if request.param == 'a':
        # Some complex initialization
    elif request.param == 'b':
        # Some other complex initialization

would constitute a fixture that should instead be implemented as separate cases.

Do these align with your expectations @wence- ?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I think that's what I expect. Although I note that the last two points of the "use X when:" lists are the same, was the cases one meant to say "complex parameterizations" ?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Correct, that is what I meant. Fixed.

assert_eq(
SrikarVanavasam marked this conversation as resolved.
Show resolved Hide resolved
gs.str.get_json_object(
"$.store.book[0].author", allow_single_quotes=True
),
cudf.Series(["Nigel Rees"]),
)
assert_eq(
gs.str.get_json_object(
"$.store.book[*].title", allow_single_quotes=True
),
cudf.Series(["['Sayings of the Century',\"Sword of Honour\"]"]),
)

assert_eq(
gs.str.get_json_object(
"$.store.book[0].author", allow_single_quotes=False
),
cudf.Series([None]),
)
assert_eq(
gs.str.get_json_object(
"$.store.book[*].title", allow_single_quotes=False
),
cudf.Series([None]),
)


def test_string_get_json_object_strip_quotes_from_single_strings():
gs = cudf.Series(
[
"""
{
"store":{
"book":[
{
"author":"Nigel Rees",
"title":"Sayings of the Century",
"price":8.95
},
{
"category":"fiction",
"author":"Evelyn Waugh",
"title":"Sword of Honour",
"price":12.99
}
]
}
}
"""
]
)
assert_eq(
gs.str.get_json_object(
"$.store.book[0].author", strip_quotes_from_single_strings=True
),
cudf.Series(["Nigel Rees"]),
)
assert_eq(
gs.str.get_json_object(
"$.store.book[*].title", strip_quotes_from_single_strings=True
),
cudf.Series(['["Sayings of the Century","Sword of Honour"]']),
)
assert_eq(
gs.str.get_json_object(
"$.store.book[0].author", strip_quotes_from_single_strings=False
),
cudf.Series(['"Nigel Rees"']),
)
assert_eq(
gs.str.get_json_object(
"$.store.book[*].title", strip_quotes_from_single_strings=False
),
cudf.Series(['["Sayings of the Century","Sword of Honour"]']),
)


def test_string_get_json_object_missing_fields_as_nulls():
gs = cudf.Series(
[
"""
{
"store":{
"book":[
{
"author":"Nigel Rees",
"title":"Sayings of the Century",
"price":8.95
},
{
"category":"fiction",
"author":"Evelyn Waugh",
"title":"Sword of Honour",
"price":12.99
}
]
}
}
"""
]
)
assert_eq(
gs.str.get_json_object(
"$.store.book[0].category", missing_fields_as_nulls=True
),
cudf.Series(["null"]),
)
assert_eq(
gs.str.get_json_object(
"$.store.book[*].category", missing_fields_as_nulls=True
),
cudf.Series(['[null,"fiction"]']),
)
assert_eq(
gs.str.get_json_object(
"$.store.book[0].category", missing_fields_as_nulls=False
),
cudf.Series([None]),
)
assert_eq(
gs.str.get_json_object(
"$.store.book[*].category", missing_fields_as_nulls=False
),
cudf.Series(['["fiction"]']),
)


def test_str_join_lists_error():
sr = cudf.Series([["a", "a"], ["b"], ["c"]])

Expand Down