From 20aa4442d27ca858796c7890ad0542dbaee542e1 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Wed, 5 Jun 2024 15:25:51 -0400 Subject: [PATCH 1/4] DOC: Add documentation for cudf.pandas in the Developer Guide (#15889) This PR provides documentation for cudf.pandas in the Developer Guide. It will describe the fast-slow proxy wrapping scheme as well as document the `CUDF_PANDAS_DEBUGGING` environment variable created in PR #15837 for issue #14975. Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/15889 --- .../source/developer_guide/cudf_pandas.md | 121 ++++++++++++++++++ docs/cudf/source/developer_guide/index.md | 1 + 2 files changed, 122 insertions(+) create mode 100644 docs/cudf/source/developer_guide/cudf_pandas.md diff --git a/docs/cudf/source/developer_guide/cudf_pandas.md b/docs/cudf/source/developer_guide/cudf_pandas.md new file mode 100644 index 00000000000..aeb43f66b2d --- /dev/null +++ b/docs/cudf/source/developer_guide/cudf_pandas.md @@ -0,0 +1,121 @@ +# cudf.pandas +The use of the cuDF pandas accelerator mode (`cudf.pandas`) is explained [in the user guide](../cudf_pandas/index.rst). +The purpose of this document is to explain how the fast-slow proxy mechanism works and document internal environment variables that can be used to debug `cudf.pandas` itself. + +## fast-slow proxy mechanism +`cudf.pandas` works by wrapping each Pandas type and its corresponding cuDF type in a new proxy type also known as a fast-slow proxy type. +The purpose of proxy types is to attempt computations on the fast (cuDF) object first, and then fall back to running on the slow (Pandas) object if the fast version fails. + +### Types: +#### Wrapped Types and Proxy Types +The "wrapped" types/classes are the Pandas and cuDF specific types that have been wrapped into proxy types. 
+Wrapped objects and proxy objects are instances of wrapped types and proxy types, respectively. +In the snippet below `s1` and `s2` are wrapped objects and `s3` is a fast-slow proxy object. +Also note that the module `xpd` is a wrapped module and contains cuDF and Pandas modules as attributes. + ```python + import cudf.pandas + cudf.pandas.install() + import pandas as xpd + + cudf = xpd._fsproxy_fast + pd = xpd._fsproxy_slow + + s1 = cudf.Series([1,2]) + s2 = pd.Series([1,2]) + s3 = xpd.Series([1,2]) + ``` + +```{note} +Note that users should never have to interact with the wrapped objects directly in this way. +This code is purely for demonstrative purposes. +``` + +#### The Different Kinds of Proxy Types +In `cudf.pandas`, there are two main kinds of proxy types: final types and intermediate types. + +##### Final and Intermediate Proxy Types +Final types are types for which known operations exist for converting an object of a "fast" type to a "slow" type and vice versa. +For example, `cudf.DataFrame` can be converted to Pandas using the method `to_pandas`, and `pd.DataFrame` can be converted to cuDF using the function `cudf.from_pandas`. +Intermediate types are the types of the results of operations invoked on final types. +For example, `xpd.DataFrameGroupBy` is an intermediate type that will be created during a groupby operation on the final type `xpd.DataFrame`. + +##### Attributes and Callable Proxy Types +Final proxy types are typically classes or modules, both of which have attributes. +Classes also have methods. +These attributes and methods must be wrapped as well to support the fast-slow proxy scheme. + +#### Creating New Proxy Types +`_FinalProxy` and `_IntermediateProxy` types are created using the functions `make_final_proxy_type` and `make_intermediate_proxy_type`, respectively. +Creating a new final type looks like this. 
+ +```python +DataFrame = make_final_proxy_type( + "DataFrame", + cudf.DataFrame, + pd.DataFrame, + fast_to_slow=lambda fast: fast.to_pandas(), + slow_to_fast=cudf.from_pandas, +) +``` + +### The Fallback Mechanism +Proxied calls are implemented with fallback via [`_fast_slow_function_call`](https://github.com/rapidsai/cudf/blob/57aeeb78d85e169ac18b82f51d2b1cbd01b0608d/python/cudf/cudf/pandas/fast_slow_proxy.py#L869). This implements the mechanism by which we attempt operations the fast way (using cuDF) and then fall back to the slow way (using Pandas) on failure. +The function looks like this: +```python +def _fast_slow_function_call(func: Callable, *args, **kwargs): + try: + ... + fast_args, fast_kwargs = _fast_arg(args), _fast_arg(kwargs) + result = func(*fast_args, **fast_kwargs) + ... + except Exception: + ... + slow_args, slow_kwargs = _slow_arg(args), _slow_arg(kwargs) + result = func(*slow_args, **slow_kwargs) + ... + return _maybe_wrap_result(result, func, *args, **kwargs), fast +``` +As we can see, the function attempts to call `func` the fast way using cuDF, and if any `Exception` occurs, it calls the function using Pandas. +In essence, this `try-except` is what allows `cudf.pandas` to support the bulk of the Pandas API. + +At the end, the function wraps the result from either path in a fast-slow proxy object, if necessary. + +#### Converting Proxy Objects +Note that before the `func` is called, the proxy object and its attributes need to be converted to either their cuDF or Pandas implementations. +This conversion is handled in the function `_transform_arg`, which both `_fast_arg` and `_slow_arg` call. + +`_transform_arg` is a recursive function that will call itself depending on the type of argument passed to it (e.g., `_transform_arg` is called for each element in a list of arguments). 
+ +### Using Metaclasses +`cudf.pandas` uses a [metaclass](https://docs.python.org/3/glossary.html#term-metaclass) called `_FastSlowProxyMeta` to find class attributes and classmethods of fast-slow proxy types. +For example, in the snippet below, the `xpd.Series` type is an instance of `_FastSlowProxyMeta`. +Therefore, we can access the property `_fsproxy_fast` defined in the metaclass. +```python +import cudf.pandas +cudf.pandas.install() +import pandas as xpd + +print(xpd.Series._fsproxy_fast) # output is cudf.core.series.Series +``` + +## debugging `cudf.pandas` +Several environment variables are available for debugging purposes. + +Setting the environment variable `CUDF_PANDAS_DEBUGGING` produces a warning when the results from cuDF and Pandas differ from one another. +For example, the following snippet produces the warning shown after it. +```python +import cudf.pandas +cudf.pandas.install() +import pandas as pd +import numpy as np + +setattr(pd.Series.mean, "_fsproxy_slow", lambda self, *args, **kwargs: np.float64(1)) +s = pd.Series([1,2,3]) +s.mean() +``` +``` +UserWarning: The results from cudf and pandas were different. The exception was +Arrays are not almost equal to 7 decimals + ACTUAL: 1.0 + DESIRED: 2.0. 
+``` diff --git a/docs/cudf/source/developer_guide/index.md b/docs/cudf/source/developer_guide/index.md index 5cafa8f784c..5e099631fc5 100644 --- a/docs/cudf/source/developer_guide/index.md +++ b/docs/cudf/source/developer_guide/index.md @@ -27,4 +27,5 @@ testing benchmarking options pylibcudf +cudf_pandas ``` From d91380ef393e9156c34a078998041a6affca7923 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Wed, 5 Jun 2024 21:16:29 -0400 Subject: [PATCH 2/4] Allow tests to be built when stream util is disabled (#15933) Allows cudf to be built with `BUILD_SHARED_LIBS=OFF`, `CUDA_STATIC_RUNTIME=ON` and tests enabled Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - Bradley Dice (https://github.com/bdice) - Nghia Truong (https://github.com/ttnghia) - Gera Shegalov (https://github.com/gerashegalov) - Mike Wilson (https://github.com/hyperbolic2346) URL: https://github.com/rapidsai/cudf/pull/15933 --- cpp/tests/CMakeLists.txt | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 2f2c12f265c..a0d9083c4a4 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -68,12 +68,14 @@ function(ConfigureTest CMAKE_TEST_NAME) INSTALL_COMPONENT_SET testing ) - set_tests_properties( - ${CMAKE_TEST_NAME} - PROPERTIES - ENVIRONMENT - "GTEST_CUDF_STREAM_MODE=new_${_CUDF_TEST_STREAM_MODE}_default;LD_PRELOAD=$" - ) + if(CUDF_BUILD_STREAMS_TEST_UTIL) + set_tests_properties( + ${CMAKE_TEST_NAME} + PROPERTIES + ENVIRONMENT + "GTEST_CUDF_STREAM_MODE=new_${_CUDF_TEST_STREAM_MODE}_default;LD_PRELOAD=$" + ) + endif() endfunction() # ################################################################################################## @@ -401,14 +403,10 @@ ConfigureTest(SPAN_TEST utilities_tests/span_tests.cu) ConfigureTest(SPAN_TEST_DEVICE_VECTOR utilities_tests/span_tests.cu) # Overwrite the environments set by ConfigureTest -set_tests_properties( - 
SPAN_TEST - PROPERTIES - ENVIRONMENT - "GTEST_FILTER=-${_allowlist_filter};GTEST_CUDF_STREAM_MODE=new_cudf_default;LD_PRELOAD=$" -) -set_tests_properties( - SPAN_TEST_DEVICE_VECTOR PROPERTIES ENVIRONMENT "GTEST_FILTER=${_allowlist_filter}" +set_property( + TEST SPAN_TEST SPAN_TEST_DEVICE_VECTOR + APPEND + PROPERTY ENVIRONMENT "GTEST_FILTER=-${_allowlist_filter}" ) # ################################################################################################## @@ -671,9 +669,11 @@ target_include_directories(JIT_PARSER_TEST PRIVATE "$ Date: Wed, 5 Jun 2024 20:48:10 -0500 Subject: [PATCH 3/4] Migrate strings `contains` operations to `pylibcudf` (#15880) This PR creates pylibcudf strings `contains` APIs and migrates the cuDF cython to leverage them. Part of https://github.com/rapidsai/cudf/issues/15162. Authors: - https://github.com/brandon-b-miller Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/15880 --- .../api_docs/pylibcudf/strings/contains.rst | 6 ++ .../api_docs/pylibcudf/strings/index.rst | 1 + .../pylibcudf/libcudf/strings/CMakeLists.txt | 2 +- .../pylibcudf/libcudf/strings/regex_flags.pxd | 13 +++-- .../pylibcudf/libcudf/strings/regex_flags.pyx | 0 .../_lib/pylibcudf/strings/CMakeLists.txt | 4 +- .../cudf/_lib/pylibcudf/strings/__init__.pxd | 11 +++- .../cudf/_lib/pylibcudf/strings/__init__.py | 11 +++- .../cudf/_lib/pylibcudf/strings/contains.pxd | 7 +++ .../cudf/_lib/pylibcudf/strings/contains.pyx | 41 ++++++++++++++ .../_lib/pylibcudf/strings/regex_flags.pxd | 2 + .../_lib/pylibcudf/strings/regex_flags.pyx | 4 ++ .../_lib/pylibcudf/strings/regex_program.pxd | 10 ++++ .../_lib/pylibcudf/strings/regex_program.pyx | 37 +++++++++++++ python/cudf/cudf/_lib/strings/contains.pyx | 23 +++----- .../pylibcudf_tests/test_regex_program.py | 13 +++++ .../pylibcudf_tests/test_string_contains.py | 55 +++++++++++++++++++ 17 files changed, 215 insertions(+), 25 deletions(-) create mode 100644 
docs/cudf/source/user_guide/api_docs/pylibcudf/strings/contains.rst create mode 100644 python/cudf/cudf/_lib/pylibcudf/libcudf/strings/regex_flags.pyx create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/contains.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/contains.pyx create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/regex_flags.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/regex_flags.pyx create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pyx create mode 100644 python/cudf/cudf/pylibcudf_tests/test_regex_program.py create mode 100644 python/cudf/cudf/pylibcudf_tests/test_string_contains.py diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/contains.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/contains.rst new file mode 100644 index 00000000000..e5745331bc7 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/contains.rst @@ -0,0 +1,6 @@ +======== +contains +======== + +.. automodule:: cudf._lib.pylibcudf.strings.contains + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst index 8970fc80c0b..bfaef732555 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst @@ -4,4 +4,5 @@ strings .. toctree:: :maxdepth: 1 + contains replace diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/CMakeLists.txt index 930c22781d0..bd6e2e0af02 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/CMakeLists.txt @@ -12,7 +12,7 @@ # the License. 
# ============================================================================= -set(cython_sources char_types.pyx) +set(cython_sources char_types.pyx regex_flags.pyx) set(linked_libraries cudf::cudf) diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/regex_flags.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/regex_flags.pxd index 2a5701fa6a3..41617f157b7 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/regex_flags.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/regex_flags.pxd @@ -1,9 +1,12 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. + +from libc.stdint cimport int32_t + cdef extern from "cudf/strings/regex/flags.hpp" \ namespace "cudf::strings" nogil: - ctypedef enum regex_flags: - DEFAULT 'cudf::strings::regex_flags::DEFAULT' - MULTILINE 'cudf::strings::regex_flags::MULTILINE' - DOTALL 'cudf::strings::regex_flags::DOTALL' + cpdef enum class regex_flags(int32_t): + DEFAULT + MULTILINE + DOTALL diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/regex_flags.pyx b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/regex_flags.pyx new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt index c9a983e24f4..cb7f71b1912 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt @@ -12,7 +12,9 @@ # the License. 
# ============================================================================= -set(cython_sources capitalize.pyx case.pyx char_types.pyx find.pyx replace.pyx) +set(cython_sources capitalize.pyx case.pyx char_types.pyx contains.pyx find.pyx regex_flags.pyx + regex_program.pyx replace.pyx +) set(linked_libraries cudf::cudf) rapids_cython_create_modules( diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd index 7563df8a107..959aa94737d 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd @@ -1,3 +1,12 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . cimport capitalize, case, char_types, find, replace +from . cimport ( + capitalize, + case, + char_types, + contains, + find, + regex_flags, + regex_program, + replace, +) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py index cb4f0e38f97..b7384913286 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py @@ -1,3 +1,12 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . import capitalize, case, char_types, find, replace +from . import ( + capitalize, + case, + char_types, + contains, + find, + regex_flags, + regex_program, + replace, +) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/contains.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/contains.pxd new file mode 100644 index 00000000000..275aa95d97e --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/contains.pxd @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from cudf._lib.pylibcudf.column cimport Column +from cudf._lib.pylibcudf.strings.regex_program cimport RegexProgram + + +cpdef Column contains_re(Column input, RegexProgram prog) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/contains.pyx b/python/cudf/cudf/_lib/pylibcudf/strings/contains.pyx new file mode 100644 index 00000000000..8c598b7c953 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/contains.pyx @@ -0,0 +1,41 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move + +from cudf._lib.pylibcudf.column cimport Column +from cudf._lib.pylibcudf.libcudf.column.column cimport column +from cudf._lib.pylibcudf.libcudf.strings cimport contains as cpp_contains +from cudf._lib.pylibcudf.strings.regex_program cimport RegexProgram + + +cpdef Column contains_re( + Column input, + RegexProgram prog +): + """Returns a boolean column identifying rows which match the given + regex_program object. + + For details, see :cpp:func:`cudf::strings::contains_re`. + + Parameters + ---------- + input : Column + The input strings + prog : RegexProgram + Regex program instance + + Returns + ------- + pylibcudf.Column + New column of boolean results for each string + """ + + cdef unique_ptr[column] result + + with nogil: + result = cpp_contains.contains_re( + input.view(), + prog.c_obj.get()[0] + ) + + return Column.from_libcudf(move(result)) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/regex_flags.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/regex_flags.pxd new file mode 100644 index 00000000000..79937bf574a --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/regex_flags.pxd @@ -0,0 +1,2 @@ +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
+from cudf._lib.pylibcudf.libcudf.strings.regex_flags cimport regex_flags diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/regex_flags.pyx b/python/cudf/cudf/_lib/pylibcudf/strings/regex_flags.pyx new file mode 100644 index 00000000000..903c2ddd503 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/regex_flags.pyx @@ -0,0 +1,4 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from cudf._lib.pylibcudf.libcudf.strings.regex_flags import \ + regex_flags as RegexFlags # no-cython-lint diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pxd new file mode 100644 index 00000000000..61ed268fb2d --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pxd @@ -0,0 +1,10 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.string cimport string + +from cudf._lib.pylibcudf.libcudf.strings.regex_program cimport regex_program + + +cdef class RegexProgram: + cdef unique_ptr[regex_program] c_obj diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pyx b/python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pyx new file mode 100644 index 00000000000..d605b0aba02 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pyx @@ -0,0 +1,37 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ + +from libcpp.memory cimport unique_ptr +from libcpp.string cimport string +from libcpp.utility cimport move + +from cudf._lib.pylibcudf.libcudf.strings.regex_flags cimport regex_flags +from cudf._lib.pylibcudf.libcudf.strings.regex_program cimport regex_program + +from cudf._lib.pylibcudf.strings.regex_flags import RegexFlags +from cudf._lib.pylibcudf.strings.regex_flags cimport regex_flags + + +cdef class RegexProgram: + + def __init__(self, *args, **kwargs): + raise ValueError("Do not instantiate RegexProgram directly, use create") + + @staticmethod + def create(str pattern, int flags): + cdef unique_ptr[regex_program] c_prog + cdef regex_flags c_flags + cdef string c_pattern = pattern.encode() + + cdef RegexProgram ret = RegexProgram.__new__(RegexProgram) + if isinstance(flags, object): + if isinstance(flags, (int, RegexFlags)): + c_flags = flags + with nogil: + c_prog = regex_program.create(c_pattern, c_flags) + + ret.c_obj = move(c_prog) + else: + raise ValueError("flags must be of type RegexFlags") + + return ret diff --git a/python/cudf/cudf/_lib/strings/contains.pyx b/python/cudf/cudf/_lib/strings/contains.pyx index 087acd8062d..502a1d14696 100644 --- a/python/cudf/cudf/_lib/strings/contains.pyx +++ b/python/cudf/cudf/_lib/strings/contains.pyx @@ -14,7 +14,6 @@ from cudf._lib.pylibcudf.libcudf.column.column cimport column from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar from cudf._lib.pylibcudf.libcudf.strings.contains cimport ( - contains_re as cpp_contains_re, count_re as cpp_count_re, like as cpp_like, matches_re as cpp_matches_re, @@ -23,6 +22,9 @@ from cudf._lib.pylibcudf.libcudf.strings.regex_flags cimport regex_flags from cudf._lib.pylibcudf.libcudf.strings.regex_program cimport regex_program from cudf._lib.scalar cimport DeviceScalar +from cudf._lib.pylibcudf.strings import contains +from cudf._lib.pylibcudf.strings.regex_program import RegexProgram + 
@acquire_spill_lock() def contains_re(Column source_strings, object reg_ex, uint32_t flags): @@ -30,21 +32,10 @@ def contains_re(Column source_strings, object reg_ex, uint32_t flags): Returns a Column of boolean values with True for `source_strings` that contain regular expression `reg_ex`. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef string reg_ex_string = str(reg_ex).encode() - cdef regex_flags c_flags = flags - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(reg_ex_string, c_flags)) - c_result = move(cpp_contains_re( - source_view, - dereference(c_prog) - )) - - return Column.from_unique_ptr(move(c_result)) + prog = RegexProgram.create(str(reg_ex), flags) + return Column.from_pylibcudf( + contains.contains_re(source_strings.to_pylibcudf(mode="read"), prog) + ) @acquire_spill_lock() diff --git a/python/cudf/cudf/pylibcudf_tests/test_regex_program.py b/python/cudf/cudf/pylibcudf_tests/test_regex_program.py new file mode 100644 index 00000000000..3a9bcec3616 --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_regex_program.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pytest + +import cudf._lib.pylibcudf as plc + + +@pytest.mark.parametrize("pat", ["(", "*", "\\"]) +def test_regex_program_invalid(pat): + with pytest.raises(RuntimeError): + plc.strings.regex_program.RegexProgram.create( + pat, plc.strings.regex_flags.RegexFlags.DEFAULT + ) diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_contains.py b/python/cudf/cudf/pylibcudf_tests/test_string_contains.py new file mode 100644 index 00000000000..8cdb6f7c521 --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_string_contains.py @@ -0,0 +1,55 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +import pyarrow as pa +import pytest +from utils import assert_column_eq + +import cudf._lib.pylibcudf as plc + + +@pytest.fixture(scope="module") +def pa_target_col(): + return pa.array( + ["AbC", "de", "FGHI", "j", "kLm", "nOPq", None, "RsT", None, "uVw"] + ) + + +@pytest.fixture(scope="module") +def plc_target_col(pa_target_col): + return plc.interop.from_arrow(pa_target_col) + + +@pytest.fixture( + params=[ + "A", + "de", + ".*", + "^a", + "^A", + "[^a-z]", + "[a-z]{3,}", + "^[A-Z]{2,}", + "j|u", + ], + scope="module", +) +def pa_target_scalar(request): + return pa.scalar(request.param, type=pa.string()) + + +@pytest.fixture(scope="module") +def plc_target_pat(pa_target_scalar): + prog = plc.strings.regex_program.RegexProgram.create( + pa_target_scalar.as_py(), plc.strings.regex_flags.RegexFlags.DEFAULT + ) + return prog + + +def test_contains_re( + pa_target_col, plc_target_col, pa_target_scalar, plc_target_pat +): + got = plc.strings.contains.contains_re(plc_target_col, plc_target_pat) + expected = pa.compute.match_substring_regex( + pa_target_col, pa_target_scalar.as_py() + ) + assert_column_eq(got, expected) From 3b734ec2fd591f037fe1d8f8ce424c7049cb5a3e Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Thu, 6 Jun 2024 04:41:01 -0700 Subject: [PATCH 4/4] Start migrating I/O to pylibcudf (#15899) xref #15162 Starts migrating cudf I/O cython to use pylibcudf APIs, starting with avro. 
Authors: - Thomas Li (https://github.com/lithomas1) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/15899 --- .../user_guide/api_docs/pylibcudf/index.rst | 1 + .../user_guide/api_docs/pylibcudf/io/avro.rst | 6 + .../api_docs/pylibcudf/io/index.rst | 18 +++ python/cudf/cudf/_lib/avro.pyx | 50 ++----- python/cudf/cudf/_lib/csv.pyx | 8 +- python/cudf/cudf/_lib/parquet.pyx | 2 +- .../cudf/cudf/_lib/pylibcudf/CMakeLists.txt | 1 + .../cudf/_lib/pylibcudf/io/CMakeLists.txt | 25 ++++ .../cudf/cudf/_lib/pylibcudf/io/__init__.pxd | 4 + .../cudf/cudf/_lib/pylibcudf/io/__init__.py | 4 + python/cudf/cudf/_lib/pylibcudf/io/avro.pxd | 12 ++ python/cudf/cudf/_lib/pylibcudf/io/avro.pyx | 58 +++++++++ python/cudf/cudf/_lib/pylibcudf/io/types.pxd | 29 +++++ python/cudf/cudf/_lib/pylibcudf/io/types.pyx | 110 ++++++++++++++++ .../cudf/_lib/pylibcudf/libcudf/io/orc.pxd | 6 +- .../cudf/_lib/pylibcudf/libcudf/io/types.pxd | 58 ++++----- python/cudf/cudf/_lib/utils.pxd | 1 + python/cudf/cudf/_lib/utils.pyx | 11 ++ .../cudf/cudf/pylibcudf_tests/common/utils.py | 17 +++ python/cudf/cudf/pylibcudf_tests/test_avro.py | 123 ++++++++++++++++++ .../cudf/pylibcudf_tests/test_source_info.py | 69 ++++++++++ 21 files changed, 541 insertions(+), 72 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/io/avro.rst create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst create mode 100644 python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt create mode 100644 python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/io/__init__.py create mode 100644 python/cudf/cudf/_lib/pylibcudf/io/avro.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/io/avro.pyx create mode 100644 python/cudf/cudf/_lib/pylibcudf/io/types.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/io/types.pyx create mode 100644 
python/cudf/cudf/pylibcudf_tests/test_avro.py create mode 100644 python/cudf/cudf/pylibcudf_tests/test_source_info.py diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst index b6ad1157511..870ed8856d1 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst @@ -17,6 +17,7 @@ This page provides API documentation for pylibcudf. filling gpumemoryview groupby + io/index.rst join lists merge diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/avro.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/avro.rst new file mode 100644 index 00000000000..495bd505fdc --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/avro.rst @@ -0,0 +1,6 @@ +==== +Avro +==== + +.. automodule:: cudf._lib.pylibcudf.io.avro + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst new file mode 100644 index 00000000000..0d53ac92db9 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst @@ -0,0 +1,18 @@ +=== +I/O +=== + +I/O Utility Classes +=================== + +.. automodule:: cudf._lib.pylibcudf.io.types + :members: + + +I/O Functions +============= + +.. toctree:: + :maxdepth: 1 + + avro diff --git a/python/cudf/cudf/_lib/avro.pyx b/python/cudf/cudf/_lib/avro.pyx index ae17a5f1ab6..3c132b22880 100644 --- a/python/cudf/cudf/_lib/avro.pyx +++ b/python/cudf/cudf/_lib/avro.pyx @@ -1,20 +1,12 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
-from libcpp.string cimport string -from libcpp.utility cimport move -from libcpp.vector cimport vector +from cudf._lib.utils cimport data_from_pylibcudf_io -from cudf._lib.io.utils cimport make_source_info -from cudf._lib.pylibcudf.libcudf.io.avro cimport ( - avro_reader_options, - read_avro as libcudf_read_avro, -) -from cudf._lib.pylibcudf.libcudf.io.types cimport table_with_metadata -from cudf._lib.pylibcudf.libcudf.types cimport size_type -from cudf._lib.utils cimport data_from_unique_ptr +import cudf._lib.pylibcudf as plc +from cudf._lib.pylibcudf.io.types import SourceInfo -cpdef read_avro(datasource, columns=None, skip_rows=-1, num_rows=-1): +cpdef read_avro(datasource, columns=None, skip_rows=0, num_rows=-1): """ Cython function to call libcudf read_avro, see `read_avro`. @@ -28,28 +20,14 @@ cpdef read_avro(datasource, columns=None, skip_rows=-1, num_rows=-1): if not isinstance(num_rows, int) or num_rows < -1: raise TypeError("num_rows must be an int >= -1") - if not isinstance(skip_rows, int) or skip_rows < -1: - raise TypeError("skip_rows must be an int >= -1") - - cdef vector[string] c_columns - if columns is not None and len(columns) > 0: - c_columns.reserve(len(columns)) - for col in columns: - c_columns.push_back(str(col).encode()) - - cdef avro_reader_options options = move( - avro_reader_options.builder(make_source_info([datasource])) - .columns(c_columns) - .skip_rows( skip_rows) - .num_rows( num_rows) - .build() + if not isinstance(skip_rows, int) or skip_rows < 0: + raise TypeError("skip_rows must be an int >= 0") + + return data_from_pylibcudf_io( + plc.io.avro.read_avro( + SourceInfo([datasource]), + columns, + skip_rows, + num_rows + ) ) - - cdef table_with_metadata c_result - - with nogil: - c_result = move(libcudf_read_avro(options)) - - names = [info.name.decode() for info in c_result.metadata.schema_info] - - return data_from_unique_ptr(move(c_result.tbl), column_names=names) diff --git a/python/cudf/cudf/_lib/csv.pyx 
b/python/cudf/cudf/_lib/csv.pyx index aa771295607..0b0bbdb2589 100644 --- a/python/cudf/cudf/_lib/csv.pyx +++ b/python/cudf/cudf/_lib/csv.pyx @@ -151,14 +151,14 @@ cdef csv_reader_options make_csv_reader_options( ) if quoting == 1: - c_quoting = quote_style.QUOTE_ALL + c_quoting = quote_style.ALL elif quoting == 2: - c_quoting = quote_style.QUOTE_NONNUMERIC + c_quoting = quote_style.NONNUMERIC elif quoting == 3: - c_quoting = quote_style.QUOTE_NONE + c_quoting = quote_style.NONE else: # Default value - c_quoting = quote_style.QUOTE_MINIMAL + c_quoting = quote_style.MINIMAL cdef csv_reader_options csv_reader_options_c = move( csv_reader_options.builder(c_source_info) diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index f0eef9be124..ac592cedaac 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -491,7 +491,7 @@ def write_parquet( "Valid values are '1.0' and '2.0'" ) - dict_policy = ( + cdef cudf_io_types.dictionary_policy dict_policy = ( cudf_io_types.dictionary_policy.ADAPTIVE if use_dictionary else cudf_io_types.dictionary_policy.NEVER diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt index 7d0676f6def..6beb7b0f506 100644 --- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt @@ -50,3 +50,4 @@ link_to_pyarrow_headers(pylibcudf_interop) add_subdirectory(libcudf) add_subdirectory(strings) +add_subdirectory(io) diff --git a/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt new file mode 100644 index 00000000000..2cfec101bab --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt @@ -0,0 +1,25 @@ +# ============================================================================= +# Copyright (c) 2024, NVIDIA CORPORATION. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied. See the License for the specific language governing permissions and limitations under
+# the License.
+# =============================================================================
+
+set(cython_sources avro.pyx types.pyx)
+
+set(linked_libraries cudf::cudf)
+rapids_cython_create_modules(
+  CXX
+  SOURCE_FILES "${cython_sources}"
+  LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_io_ ASSOCIATED_TARGETS cudf
+)
+
+set(targets_using_arrow_headers pylibcudf_io_avro pylibcudf_io_types)
+link_to_pyarrow_headers("${targets_using_arrow_headers}")
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd
new file mode 100644
index 00000000000..250292746c1
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd
@@ -0,0 +1,4 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from . cimport avro, types
+from .types cimport SourceInfo, TableWithMetadata
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/__init__.py b/python/cudf/cudf/_lib/pylibcudf/io/__init__.py
new file mode 100644
index 00000000000..5242c741911
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/io/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from . import avro, types
+from .types import SourceInfo, TableWithMetadata
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/avro.pxd b/python/cudf/cudf/_lib/pylibcudf/io/avro.pxd
new file mode 100644
index 00000000000..3695f36a6e7
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/io/avro.pxd
@@ -0,0 +1,12 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+from cudf._lib.pylibcudf.io.types cimport SourceInfo, TableWithMetadata
+from cudf._lib.pylibcudf.libcudf.io.avro cimport avro_reader_options
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
+
+
+cpdef TableWithMetadata read_avro(
+    SourceInfo source_info,
+    list columns = *,
+    size_type skip_rows = *,
+    size_type num_rows = *
+)
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx b/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx
new file mode 100644
index 00000000000..946e0896fc8
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx
@@ -0,0 +1,58 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp.string cimport string
+from libcpp.utility cimport move
+from libcpp.vector cimport vector
+
+from cudf._lib.pylibcudf.io.types cimport SourceInfo, TableWithMetadata
+from cudf._lib.pylibcudf.libcudf.io.avro cimport (
+    avro_reader_options,
+    read_avro as cpp_read_avro,
+)
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
+
+
+cpdef TableWithMetadata read_avro(
+    SourceInfo source_info,
+    list columns = None,
+    size_type skip_rows = 0,
+    size_type num_rows = -1
+):
+    """
+    Reads an Avro dataset into a set of columns.
+
+    Parameters
+    ----------
+    source_info: SourceInfo
+        The SourceInfo object to read the avro dataset from.
+    columns: list, default None
+        Optional columns to read, if not provided, reads all columns in the file.
+    skip_rows: size_type, default 0
+        The number of rows to skip.
+    num_rows: size_type, default -1
+        The number of rows to read, after skipping rows.
+        If -1 is passed, all rows will be read.
+
+    Returns
+    -------
+    TableWithMetadata
+        The Table and its corresponding metadata that was read in.
+    """
+    cdef vector[string] c_columns
+    if columns is not None and len(columns) > 0:
+        c_columns.reserve(len(columns))
+        for col in columns:
+            c_columns.push_back(str(col).encode())
+
+    cdef avro_reader_options avro_opts = move(
+        avro_reader_options.builder(source_info.c_obj)
+        .columns(c_columns)
+        .skip_rows(skip_rows)
+        .num_rows(num_rows)
+        .build()
+    )
+
+    with nogil:
+        c_result = move(cpp_read_avro(avro_opts))
+
+    return TableWithMetadata.from_libcudf(c_result)
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/types.pxd b/python/cudf/cudf/_lib/pylibcudf/io/types.pxd
new file mode 100644
index 00000000000..aa846a47343
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/io/types.pxd
@@ -0,0 +1,31 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+from cudf._lib.pylibcudf.libcudf.io.types cimport (
+    column_encoding,
+    column_in_metadata,
+    column_name_info,
+    compression_type,
+    dictionary_policy,
+    io_type,
+    partition_info,
+    quote_style,
+    sink_info,
+    source_info,
+    statistics_freq,
+    table_input_metadata,
+    table_metadata,
+    table_with_metadata,
+)
+from cudf._lib.pylibcudf.table cimport Table
+
+
+cdef class TableWithMetadata:
+    cdef public Table tbl
+    cdef table_metadata metadata
+
+    @staticmethod
+    cdef TableWithMetadata from_libcudf(table_with_metadata& tbl)
+
+cdef class SourceInfo:
+    cdef source_info c_obj
+    # Python objects owning the memory that c_obj's host buffers point into
+    cdef list _keepalive
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/types.pyx b/python/cudf/cudf/_lib/pylibcudf/io/types.pyx
new file mode 100644
index 00000000000..cd777232b33
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/io/types.pyx
@@ -0,0 +1,128 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp.string cimport string
+from libcpp.utility cimport move
+from libcpp.vector cimport vector
+
+from cudf._lib.pylibcudf.libcudf.io.types cimport (
+    host_buffer,
+    source_info,
+    table_with_metadata,
+)
+
+import errno
+import io
+import os
+
+
+cdef class TableWithMetadata:
+    """A container holding a table and its associated metadata
+    (e.g. column names)
+
+    For details, see :cpp:class:`cudf::io::table_with_metadata`.
+    """
+
+    @property
+    def columns(self):
+        """
+        Return a list containing the columns of the table
+        """
+        return self.tbl.columns()
+
+    @property
+    def column_names(self):
+        """
+        Return a list containing the column names of the table
+        """
+        cdef list names = []
+        for col_info in self.metadata.schema_info:
+            # TODO: Handle nesting (columns with child columns)
+            assert col_info.children.size() == 0, "Child column names are not handled!"
+            names.append(col_info.name.decode())
+        return names
+
+    @staticmethod
+    cdef TableWithMetadata from_libcudf(table_with_metadata& tbl_with_meta):
+        """Create a Python TableWithMetadata from a libcudf table_with_metadata"""
+        cdef TableWithMetadata out = TableWithMetadata.__new__(TableWithMetadata)
+        out.tbl = Table.from_libcudf(move(tbl_with_meta.tbl))
+        out.metadata = tbl_with_meta.metadata
+        return out
+
+cdef class SourceInfo:
+    """A class containing details on a source to read from.
+
+    For details, see :cpp:class:`cudf::io::source_info`.
+
+    Parameters
+    ----------
+    sources : List[Union[str, os.PathLike, bytes, io.BytesIO]]
+        A homogeneous list of sources (this can be a string filename,
+        an os.PathLike, bytes, or an io.BytesIO) to read from.
+
+        Mixing different types of sources will raise a `ValueError`.
+
+    Raises
+    ------
+    ValueError
+        If ``sources`` is empty, mixes source types, or holds an
+        unsupported type.
+    FileNotFoundError
+        If a path source does not refer to an existing file.
+    """
+
+    def __init__(self, list sources):
+        if not sources:
+            raise ValueError("Need to pass at least one source")
+
+        cdef vector[string] c_files
+
+        if isinstance(sources[0], (os.PathLike, str)):
+            c_files.reserve(len(sources))
+
+            for src in sources:
+                if not isinstance(src, (os.PathLike, str)):
+                    raise ValueError("All sources must be of the same type!")
+                if not os.path.isfile(src):
+                    raise FileNotFoundError(errno.ENOENT,
+                                            os.strerror(errno.ENOENT),
+                                            src)
+
+                # os.fsencode applies the os.PathLike protocol, so the path
+                # handed to libcudf is the one os.path.isfile just checked.
+                c_files.push_back(os.fsencode(src))
+
+            self.c_obj = move(source_info(c_files))
+            return
+
+        # TODO: host_buffer is deprecated API, use host_span instead
+        cdef vector[host_buffer] c_host_buffers
+        cdef const unsigned char[::1] c_buffer
+        if isinstance(sources[0], bytes):
+            for buffer in sources:
+                if not isinstance(buffer, bytes):
+                    raise ValueError("All sources must be of the same type!")
+                # An empty source has no first byte to take the address of,
+                # so it contributes no host_buffer.
+                if len(buffer) > 0:
+                    c_buffer = buffer
+                    c_host_buffers.push_back(host_buffer(<char*>&c_buffer[0],
+                                                         c_buffer.shape[0]))
+        elif isinstance(sources[0], io.BytesIO):
+            for bio in sources:
+                if not isinstance(bio, io.BytesIO):
+                    raise ValueError("All sources must be of the same type!")
+                c_buffer = bio.getbuffer()
+                # Same guard as for bytes: skip empty buffers.
+                if c_buffer.shape[0] > 0:
+                    c_host_buffers.push_back(host_buffer(<char*>&c_buffer[0],
+                                                         c_buffer.shape[0]))
+        else:
+            raise ValueError(
+                "Sources must be a list of str/os.PathLike, bytes or io.BytesIO"
+            )
+
+        # NOTE(review): each host_buffer is built from a raw pointer into the
+        # Python object, presumably without copying; keep the objects alive.
+        self._keepalive = list(sources)
+        self.c_obj = source_info(c_host_buffers)
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/orc.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/orc.pxd
index e553515dfdf..25f91849dea 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/orc.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/orc.pxd
@@ -94,7 +94,9 @@ cdef extern from "cudf/io/orc.hpp" \
         orc_writer_options_builder& compression(
             cudf_io_types.compression_type comp
         ) except +
-        orc_writer_options_builder& enable_statistics(bool val) except +
+        orc_writer_options_builder& enable_statistics(
+            cudf_io_types.statistics_freq val
+        ) except +
         orc_writer_options_builder& stripe_size_bytes(size_t val) except +
         orc_writer_options_builder& stripe_size_rows(size_type val) except +
         orc_writer_options_builder& row_index_stride(size_type val) except +
@@ -147,7 +149,7 @@ cdef extern from "cudf/io/orc.hpp" \
             cudf_io_types.compression_type comp
         ) except +
         chunked_orc_writer_options_builder& enable_statistics(
-            bool val
+            cudf_io_types.statistics_freq val
         ) except +
         orc_writer_options_builder& stripe_size_bytes(size_t val) except +
         orc_writer_options_builder& stripe_size_rows(size_type val) except +
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pxd
index 38fae1df1e5..8d87deb1472 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pxd
@@ -20,45 +20,45 @@ from cudf._lib.pylibcudf.libcudf.types cimport size_type
 cdef extern from "cudf/io/types.hpp" \
         namespace "cudf::io" nogil:
 
-    ctypedef enum quote_style:
-        QUOTE_MINIMAL "cudf::io::quote_style::MINIMAL"
-        QUOTE_ALL "cudf::io::quote_style::ALL"
-        QUOTE_NONNUMERIC "cudf::io::quote_style::NONNUMERIC"
-        QUOTE_NONE "cudf::io::quote_style::NONE"
-
-    ctypedef enum compression_type:
-        NONE "cudf::io::compression_type::NONE"
-        AUTO "cudf::io::compression_type::AUTO"
-        SNAPPY "cudf::io::compression_type::SNAPPY"
-        GZIP "cudf::io::compression_type::GZIP"
-        BZIP2 "cudf::io::compression_type::BZIP2"
-        BROTLI "cudf::io::compression_type::BROTLI"
-        ZIP "cudf::io::compression_type::ZIP"
-        XZ "cudf::io::compression_type::XZ"
-        ZLIB "cudf::io::compression_type::ZLIB"
-        LZ4 "cudf::io::compression_type::LZ4"
-        LZO "cudf::io::compression_type::LZO"
-        ZSTD "cudf::io::compression_type::ZSTD"
-
-    ctypedef enum io_type:
-        FILEPATH "cudf::io::io_type::FILEPATH"
-        HOST_BUFFER "cudf::io::io_type::HOST_BUFFER"
-        VOID "cudf::io::io_type::VOID"
-        USER_IMPLEMENTED "cudf::io::io_type::USER_IMPLEMENTED"
-
-    ctypedef enum statistics_freq:
+    cpdef enum class quote_style(int32_t):
+        MINIMAL
+        ALL
+        NONNUMERIC
+        NONE
+
+    cpdef enum class compression_type(int32_t):
+        NONE
+        AUTO
+        SNAPPY
+        GZIP
+        BZIP2
+        BROTLI
+        ZIP
+        XZ
+        ZLIB
+        LZ4
+        LZO
+        ZSTD
+
+    cpdef enum class io_type(int32_t):
+        FILEPATH
+        HOST_BUFFER
+        VOID
+        USER_IMPLEMENTED
+
+    cpdef enum class statistics_freq(int32_t):
         STATISTICS_NONE = 0,
         STATISTICS_ROWGROUP = 1,
         STATISTICS_PAGE = 2,
         STATISTICS_COLUMN = 3,
 
-    ctypedef enum dictionary_policy:
+    cpdef enum class dictionary_policy(int32_t):
         NEVER = 0,
         ADAPTIVE = 1,
         ALWAYS = 2,
 
 cdef extern from "cudf/io/types.hpp" namespace "cudf::io" nogil:
-    cpdef enum class column_encoding:
+    cpdef enum class column_encoding(int32_t):
         USE_DEFAULT = -1
         DICTIONARY = 0
         PLAIN = 1
diff --git a/python/cudf/cudf/_lib/utils.pxd b/python/cudf/cudf/_lib/utils.pxd
index c5a1e7552b9..99850d549a1 100644
--- a/python/cudf/cudf/_lib/utils.pxd
+++ b/python/cudf/cudf/_lib/utils.pxd
@@ -11,6 +11,7 @@ from cudf._lib.pylibcudf.libcudf.table.table cimport table, table_view
 cdef data_from_unique_ptr(
     unique_ptr[table] c_tbl, column_names, index_names=*)
 cdef data_from_pylibcudf_table(tbl, column_names, index_names=*)
+cdef data_from_pylibcudf_io(tbl_with_meta)
 cdef data_from_table_view(
     table_view tv, object owner, object column_names, object index_names=*)
 cdef table_view table_view_from_columns(columns) except *
diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx
index 4c4cd48d6ed..de6b9f690b6 100644
--- a/python/cudf/cudf/_lib/utils.pyx
+++ b/python/cudf/cudf/_lib/utils.pyx
@@ -315,6 +315,17 @@ cdef data_from_pylibcudf_table(tbl, column_names, index_names=None):
         index_names
     )
 
+cdef data_from_pylibcudf_io(tbl_with_meta):
+    """
+    Unpacks the TableWithMetadata from libcudf I/O
+    into a dict of columns and an Index (cuDF format)
+    """
+    return _data_from_columns(
+        columns=[Column.from_pylibcudf(plc) for plc in tbl_with_meta.columns],
+        column_names=tbl_with_meta.column_names,
+        index_names=None
+    )
+
 cdef columns_from_table_view(
     table_view tv,
     object owners,
diff --git a/python/cudf/cudf/pylibcudf_tests/common/utils.py b/python/cudf/cudf/pylibcudf_tests/common/utils.py
index e00053529a8..54d38f1a8cf 100644
--- a/python/cudf/cudf/pylibcudf_tests/common/utils.py
+++ b/python/cudf/cudf/pylibcudf_tests/common/utils.py
@@ -63,6 +63,23 @@ def assert_table_eq(pa_table: pa.Table, plc_table: plc.Table) -> None:
         assert_column_eq(pa_col, plc_col)
 
 
+def assert_table_and_meta_eq(
+    plc_table_w_meta: plc.io.types.TableWithMetadata, pa_table: pa.Table
+) -> None:
+    """Verify that the pylibcudf TableWithMetadata and PyArrow table are equal"""
+
+    plc_table = plc_table_w_meta.tbl
+
+    plc_shape = (plc_table.num_rows(), plc_table.num_columns())
+    assert plc_shape == pa_table.shape
+
+    for plc_col, pa_col in zip(plc_table.columns(), pa_table.columns):
+        assert_column_eq(pa_col, plc_col)
+
+    # Check column name equality
+    assert plc_table_w_meta.column_names == pa_table.column_names
+
+
 def cudf_raises(expected_exception: BaseException, *args, **kwargs):
     # A simple wrapper around pytest.raises that defaults to looking for cudf exceptions
     match = kwargs.get("match", None)
diff --git a/python/cudf/cudf/pylibcudf_tests/test_avro.py b/python/cudf/cudf/pylibcudf_tests/test_avro.py
new file mode 100644
index 00000000000..d6cd86768cd
--- /dev/null
+++ b/python/cudf/cudf/pylibcudf_tests/test_avro.py
@@ -0,0 +1,119 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import io
+import itertools
+
+import fastavro
+import pyarrow as pa
+import pytest
+from utils import assert_table_and_meta_eq
+
+import cudf._lib.pylibcudf as plc
+
+avro_dtype_pairs = [
+    ("boolean", pa.bool_()),
+    ("int", pa.int32()),
+    ("long", pa.int64()),
+    ("float", pa.float32()),
+    ("double", pa.float64()),
+    ("bytes", pa.string()),
+    ("string", pa.string()),
+]
+
+
+@pytest.fixture(
+    scope="module", params=itertools.combinations(avro_dtype_pairs, 2)
+)
+def avro_dtypes(request):
+    """One pair of (avro type name, expected pyarrow type) entries."""
+    return request.param
+
+
+@pytest.fixture
+def avro_dtype_data(avro_dtypes):
+    """Sample values for the two avro types of the current pair."""
+    samples = {
+        "boolean": [True, False, True],
+        "int": [1, 2, -1],
+        "long": [1, 2, -1],
+        "float": [1.0, 3.1415, -3.1415],
+        "double": [1.0, 3.1415, -3.1415],
+        "bytes": [b"a", b"b", b"c"],
+        "string": ["Hello", "World!", ""],
+    }
+    # Unknown type names map to None, like falling off an if/elif chain
+    return tuple(samples.get(avro_type) for avro_type, _ in avro_dtypes)
+
+
+@pytest.fixture(
+    params=[
+        (0, 0),
+        (0, -1),
+        (1, -1),
+        (3, -1),
+    ]
+)
+def row_opts(request):
+    """
+    (skip_rows, num_rows) combos for the avro reader
+    """
+    return request.param
+
+
+@pytest.mark.parametrize("columns", [["prop1"], [], ["prop1", "prop2"]])
+@pytest.mark.parametrize("nullable", [True, False])
+def test_read_avro(avro_dtypes, avro_dtype_data, row_opts, columns, nullable):
+    """Check read_avro against the pyarrow table built from the same data."""
+    (avro_type1, expected_type1), (avro_type2, expected_type2) = avro_dtypes
+    skip_rows, num_rows = row_opts
+    values1, values2 = avro_dtype_data
+
+    if nullable:
+        # Make both fields unions with "null" and append one null row
+        avro_type1 = ["null", avro_type1]
+        avro_type2 = ["null", avro_type2]
+        values1 = values1 + [None]
+        values2 = values2 + [None]
+
+    schema = fastavro.parse_schema(
+        {
+            "type": "record",
+            "name": "test",
+            "fields": [
+                {"name": "prop1", "type": avro_type1},
+                {"name": "prop2", "type": avro_type2},
+            ],
+        }
+    )
+    records = [
+        {"prop1": val1, "prop2": val2} for val1, val2 in zip(values1, values2)
+    ]
+
+    buffer = io.BytesIO()
+    fastavro.writer(buffer, schema, records)
+    buffer.seek(0)
+
+    res = plc.io.avro.read_avro(
+        plc.io.types.SourceInfo([buffer]),
+        columns=columns,
+        skip_rows=skip_rows,
+        num_rows=num_rows,
+    )
+
+    expected = pa.Table.from_arrays(
+        [
+            pa.array(values1, type=expected_type1),
+            pa.array(values2, type=expected_type2),
+        ],
+        names=["prop1", "prop2"],
+    )
+
+    # Adjust for skip_rows/num_rows in result
+    length = None if num_rows == -1 else num_rows
+    expected = expected.slice(skip_rows, length=length)
+
+    # adjust for # of columns
+    if columns:
+        expected = expected.select(columns)
+
+    assert_table_and_meta_eq(res, expected)
diff --git a/python/cudf/cudf/pylibcudf_tests/test_source_info.py b/python/cudf/cudf/pylibcudf_tests/test_source_info.py
new file mode 100644
index 00000000000..71a3ecbcc30
--- /dev/null
+++ b/python/cudf/cudf/pylibcudf_tests/test_source_info.py
@@ -0,0 +1,69 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import io
+
+import pytest
+
+import cudf._lib.pylibcudf as plc
+
+
+@pytest.mark.parametrize(
+    "source", ["a.txt", b"hello world", io.BytesIO(b"hello world")]
+)
+def test_source_info_ctor(source, tmp_path):
+    """A single path, bytes or BytesIO source is accepted."""
+    if isinstance(source, str):
+        file = tmp_path / source
+        file.write_bytes(b"hello world")
+        source = str(file)
+
+    plc.io.SourceInfo([source])
+
+    # TODO: test contents of source_info buffer is correct
+    # once buffers are exposed on python side
+
+
+@pytest.mark.parametrize(
+    "sources",
+    [
+        ["a.txt", "a.txt"],
+        [b"hello world", b"hello there"],
+        [io.BytesIO(b"hello world"), io.BytesIO(b"hello there")],
+    ],
+)
+def test_source_info_ctor_multiple(sources, tmp_path):
+    """A homogeneous list of several sources is accepted."""
+    for i, source in enumerate(sources):
+        if isinstance(source, str):
+            file = tmp_path / source
+            file.write_bytes(b"hello world")
+            sources[i] = str(file)
+
+    plc.io.SourceInfo(sources)
+
+    # TODO: test contents of source_info buffer is correct
+    # once buffers are exposed on python side
+
+
+@pytest.mark.parametrize(
+    "sources",
+    [
+        ["awef.txt", b"hello world", io.BytesIO(b"hello world")],
+        [b"hello world", b"hello there", "awef.txt"],
+        [
+            io.BytesIO(b"hello world"),
+            io.BytesIO(b"hello there"),
+            b"hello world",
+        ],
+    ],
+)
+def test_source_info_ctor_mixing_invalid(sources, tmp_path):
+    """Mixing source types in one list raises ValueError."""
+    # Path sources are created on disk so a missing file is not the failure
+    for i, source in enumerate(sources):
+        if isinstance(source, str):
+            file = tmp_path / source
+            file.write_bytes(b"hello world")
+            sources[i] = str(file)
+    with pytest.raises(ValueError):
+        plc.io.SourceInfo(sources)