diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst index 052479d6720..62e14a67ee5 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst @@ -21,6 +21,7 @@ This page provides API documentation for pylibcudf. groupby interop join + json labeling lists merge diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/json.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/json.rst new file mode 100644 index 00000000000..bb38d179a57 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/json.rst @@ -0,0 +1,6 @@ +==== +json +==== + +.. automodule:: pylibcudf.json + :members: diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py index e712937f816..ffa5e603408 100644 --- a/python/cudf/cudf/_lib/strings/__init__.py +++ b/python/cudf/cudf/_lib/strings/__init__.py @@ -72,7 +72,7 @@ ) from cudf._lib.strings.find_multiple import find_multiple from cudf._lib.strings.findall import find_re, findall -from cudf._lib.strings.json import GetJsonObjectOptions, get_json_object +from cudf._lib.strings.json import get_json_object from cudf._lib.strings.padding import center, ljust, pad, rjust, zfill from cudf._lib.strings.repeat import repeat_scalar, repeat_sequence from cudf._lib.strings.replace import ( diff --git a/python/cudf/cudf/_lib/strings/json.pyx b/python/cudf/cudf/_lib/strings/json.pyx index c9b0bba088d..374a104635a 100644 --- a/python/cudf/cudf/_lib/strings/json.pyx +++ b/python/cudf/cudf/_lib/strings/json.pyx @@ -1,84 +1,26 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move +import pylibcudf as plc +from pylibcudf.json cimport GetJsonObjectOptions from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.json cimport ( - get_json_object as cpp_get_json_object, - get_json_object_options, -) - from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar @acquire_spill_lock() def get_json_object( - Column col, object py_json_path, GetJsonObjectOptions options): + Column col, + object py_json_path, + GetJsonObjectOptions options +): """ Apply a JSONPath string to all rows in an input column of json strings. """ - cdef unique_ptr[column] c_result - - cdef column_view col_view = col.view() - cdef DeviceScalar json_path = py_json_path.device_value - - cdef const string_scalar* scalar_json_path = ( - json_path.get_raw_ptr() + plc_column = plc.json.get_json_object( + col.to_pylibcudf(mode="read"), + py_json_path.device_value.c_value, + options ) - - with nogil: - c_result = move(cpp_get_json_object( - col_view, - scalar_json_path[0], - options.options, - )) - - return Column.from_unique_ptr(move(c_result)) - - -cdef class GetJsonObjectOptions: - cdef get_json_object_options options - - def __init__( - self, - *, - allow_single_quotes=False, - strip_quotes_from_single_strings=True, - missing_fields_as_nulls=False - ): - self.options.set_allow_single_quotes(allow_single_quotes) - self.options.set_strip_quotes_from_single_strings( - strip_quotes_from_single_strings - ) - self.options.set_missing_fields_as_nulls(missing_fields_as_nulls) - - @property - def allow_single_quotes(self): - return self.options.get_allow_single_quotes() - - @property - def strip_quotes_from_single_strings(self): - return self.options.get_strip_quotes_from_single_strings() - - @property - def missing_fields_as_nulls(self): - return self.options.get_missing_fields_as_nulls() - - @allow_single_quotes.setter - def allow_single_quotes(self, val): - self.options.set_allow_single_quotes(val) - - @strip_quotes_from_single_strings.setter - def strip_quotes_from_single_strings(self, val): - self.options.set_strip_quotes_from_single_strings(val) - - @missing_fields_as_nulls.setter - def missing_fields_as_nulls(self, val): - self.options.set_missing_fields_as_nulls(val) + return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index b50e23bd52e..45d1a8b087b 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -2385,8 +2385,7 @@ def get_json_object( 0 [\n { "category": "reference",\n ... dtype: object """ - - options = libstrings.GetJsonObjectOptions( + options = plc.json.GetJsonObjectOptions( allow_single_quotes=allow_single_quotes, strip_quotes_from_single_strings=( strip_quotes_from_single_strings diff --git a/python/pylibcudf/pylibcudf/CMakeLists.txt b/python/pylibcudf/pylibcudf/CMakeLists.txt index 1d72eacac12..2854d7c42ac 100644 --- a/python/pylibcudf/pylibcudf/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/CMakeLists.txt @@ -27,6 +27,7 @@ set(cython_sources groupby.pyx interop.pyx join.pyx + json.pyx labeling.pyx lists.pyx merge.pyx diff --git a/python/pylibcudf/pylibcudf/__init__.pxd b/python/pylibcudf/pylibcudf/__init__.pxd index b98b37fe0fd..79c2f0c5455 100644 --- a/python/pylibcudf/pylibcudf/__init__.pxd +++ b/python/pylibcudf/pylibcudf/__init__.pxd @@ -13,6 +13,7 @@ from . cimport ( filling, groupby, join, + json, labeling, lists, merge, @@ -60,6 +61,7 @@ __all__ = [ "gpumemoryview", "groupby", "join", + "json", "lists", "merge", "null_mask", diff --git a/python/pylibcudf/pylibcudf/__init__.py b/python/pylibcudf/pylibcudf/__init__.py index 304f27be340..88e72418cda 100644 --- a/python/pylibcudf/pylibcudf/__init__.py +++ b/python/pylibcudf/pylibcudf/__init__.py @@ -24,6 +24,7 @@ interop, io, join, + json, labeling, lists, merge, @@ -73,6 +74,7 @@ "interop", "io", "join", + "json", "labeling", "lists", "merge", diff --git a/python/pylibcudf/pylibcudf/json.pxd b/python/pylibcudf/pylibcudf/json.pxd new file mode 100644 index 00000000000..87a87349b8a --- /dev/null +++ b/python/pylibcudf/pylibcudf/json.pxd @@ -0,0 +1,16 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.libcudf.json cimport get_json_object_options +from pylibcudf.scalar cimport Scalar + + +cdef class GetJsonObjectOptions: + cdef get_json_object_options options + + +cpdef Column get_json_object( + Column col, + Scalar json_path, + GetJsonObjectOptions options=* +) diff --git a/python/pylibcudf/pylibcudf/json.pyx b/python/pylibcudf/pylibcudf/json.pyx new file mode 100644 index 00000000000..4a8d11068f9 --- /dev/null +++ b/python/pylibcudf/pylibcudf/json.pyx @@ -0,0 +1,154 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from cython.operator cimport dereference +from libcpp cimport bool +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf cimport json as cpp_json +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.scalar cimport Scalar + + +cdef class GetJsonObjectOptions: + """Settings for ``get_json_object()``""" + def __init__( + self, + *, + allow_single_quotes=False, + strip_quotes_from_single_strings=True, + missing_fields_as_nulls=False + ): + self.set_allow_single_quotes(allow_single_quotes) + self.set_strip_quotes_from_single_strings( + strip_quotes_from_single_strings + ) + self.set_missing_fields_as_nulls(missing_fields_as_nulls) + + def get_allow_single_quotes(self): + """ + Returns true/false depending on whether single-quotes for representing strings + are allowed. + + Returns + ------- + bool + true if single-quotes are allowed, false otherwise. + """ + return self.options.get_allow_single_quotes() + + def get_strip_quotes_from_single_strings(self): + """ + Returns true/false depending on whether individually returned string values have + their quotes stripped. + + Returns + ------- + bool + true if individually returned string values have their quotes stripped. + """ + return self.options.get_strip_quotes_from_single_strings() + + def get_missing_fields_as_nulls(self): + """ + Whether a field not contained by an object is to be interpreted as null. + + Returns + ------- + bool + true if missing fields are interpreted as null. + """ + return self.options.get_missing_fields_as_nulls() + + def set_allow_single_quotes(self, bool val): + """ + Set whether single-quotes for strings are allowed. + + Parameters + ---------- + val : bool + Whether to allow single quotes + + Returns + ------- + None + """ + self.options.set_allow_single_quotes(val) + + def set_strip_quotes_from_single_strings(self, bool val): + """ + Set whether individually returned string values have their quotes stripped. + + Parameters + ---------- + val : bool + Whether to strip quotes from single strings. + + Returns + ------- + None + """ + self.options.set_strip_quotes_from_single_strings(val) + + def set_missing_fields_as_nulls(self, bool val): + """ + Set whether missing fields are interpreted as null. + + Parameters + ---------- + val : bool + Whether to treat missing fields as nulls. + + Returns + ------- + None + """ + self.options.set_missing_fields_as_nulls(val) + + +cpdef Column get_json_object( + Column col, + Scalar json_path, + GetJsonObjectOptions options=None +): + """ + Apply a JSONPath string to all rows in an input strings column. + + For details, see :cpp:func:`cudf::get_json_object` + + Parameters + ---------- + col : Column + The input strings column. Each row must contain a valid json string. + + json_path : Scalar + The JSONPath string to be applied to each row. + + options : GetJsonObjectOptions + Options for controlling the behavior of the function. + + Returns + ------- + Column + New strings column containing the retrieved json object strings. + """ + cdef unique_ptr[column] c_result + cdef string_scalar* c_json_path = ( + json_path.c_obj.get() + ) + if options is None: + options = GetJsonObjectOptions() + + cdef cpp_json.get_json_object_options c_options = options.options + + with nogil: + c_result = move( + cpp_json.get_json_object( + col.view(), + dereference(c_json_path), + c_options + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/json.pxd b/python/pylibcudf/pylibcudf/libcudf/json.pxd similarity index 100% rename from python/pylibcudf/pylibcudf/libcudf/strings/json.pxd rename to python/pylibcudf/pylibcudf/libcudf/json.pxd diff --git a/python/pylibcudf/pylibcudf/strings/__init__.pxd b/python/pylibcudf/pylibcudf/strings/__init__.pxd index 187ef113073..e45048a500f 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.pxd +++ b/python/pylibcudf/pylibcudf/strings/__init__.pxd @@ -14,6 +14,7 @@ from . cimport ( padding, regex_flags, regex_program, + repeat, replace, side_type, slice, @@ -33,9 +34,12 @@ __all__ = [ "convert", "extract", "find", + "find_multiple", "findall", + "padding", "regex_flags", "regex_program", + "repeat", "replace", "slice", "strip", diff --git a/python/pylibcudf/pylibcudf/strings/__init__.py b/python/pylibcudf/pylibcudf/strings/__init__.py index 6033cea0625..c6253d94b40 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/__init__.py @@ -34,9 +34,12 @@ "convert", "extract", "find", + "find_multiple", "findall", + "padding", "regex_flags", "regex_program", + "repeat", "replace", "slice", "strip", diff --git a/python/pylibcudf/pylibcudf/tests/test_json.py b/python/pylibcudf/pylibcudf/tests/test_json.py new file mode 100644 index 00000000000..3d2955211f8 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_json.py @@ -0,0 +1,42 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +import pyarrow as pa +import pylibcudf as plc +import pytest +from utils import assert_column_eq + + +@pytest.fixture(scope="module") +def plc_col(): + arr = pa.array( + ['{"foo": {"bar": [{"a": 1, "b": 2}, {"a": 3, "b": 4}]', None] + ) + return plc.interop.from_arrow(arr) + + +@pytest.fixture(scope="module") +def json_path(): + slr = pa.scalar("$.foo.bar") + return plc.interop.from_arrow(slr) + + +@pytest.mark.parametrize("allow_single_quotes", [True, False]) +@pytest.mark.parametrize("strip_quotes_from_single_strings", [True, False]) +@pytest.mark.parametrize("missing_fields_as_nulls", [True, False]) +def test_get_json_object( + plc_col, + json_path, + allow_single_quotes, + strip_quotes_from_single_strings, + missing_fields_as_nulls, +): + result = plc.json.get_json_object( + plc_col, + json_path, + plc.json.GetJsonObjectOptions( + allow_single_quotes=allow_single_quotes, + strip_quotes_from_single_strings=strip_quotes_from_single_strings, + missing_fields_as_nulls=missing_fields_as_nulls, + ), + ) + expected = pa.array(['[{"a": 1, "b": 2}, {"a": 3, "b": 4}]', None]) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml index c9a685de3e9..ea5b3065896 100644 --- a/python/pylibcudf/pyproject.toml +++ b/python/pylibcudf/pyproject.toml @@ -97,7 +97,8 @@ skip = [ ] [tool.pytest.ini_options] -addopts = "--tb=native --strict-config --strict-markers" +# --import-mode=importlib because two test_json.py exists and tests directory is not a structured module +addopts = "--tb=native --strict-config --strict-markers --import-mode=importlib" empty_parameter_set_mark = "fail_at_collect" filterwarnings = [ "error",