diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index d81bcf34a0c..826f879ddc0 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -68,12 +68,14 @@ function(ConfigureTest CMAKE_TEST_NAME) INSTALL_COMPONENT_SET testing ) - set_tests_properties( - ${CMAKE_TEST_NAME} - PROPERTIES - ENVIRONMENT - "GTEST_CUDF_STREAM_MODE=new_${_CUDF_TEST_STREAM_MODE}_default;LD_PRELOAD=$" - ) + if(CUDF_BUILD_STREAMS_TEST_UTIL) + set_tests_properties( + ${CMAKE_TEST_NAME} + PROPERTIES + ENVIRONMENT + "GTEST_CUDF_STREAM_MODE=new_${_CUDF_TEST_STREAM_MODE}_default;LD_PRELOAD=$" + ) + endif() endfunction() # ################################################################################################## @@ -401,14 +403,10 @@ ConfigureTest(SPAN_TEST utilities_tests/span_tests.cu) ConfigureTest(SPAN_TEST_DEVICE_VECTOR utilities_tests/span_tests.cu) # Overwrite the environments set by ConfigureTest -set_tests_properties( - SPAN_TEST - PROPERTIES - ENVIRONMENT - "GTEST_FILTER=-${_allowlist_filter};GTEST_CUDF_STREAM_MODE=new_cudf_default;LD_PRELOAD=$" -) -set_tests_properties( - SPAN_TEST_DEVICE_VECTOR PROPERTIES ENVIRONMENT "GTEST_FILTER=${_allowlist_filter}" +set_property( + TEST SPAN_TEST SPAN_TEST_DEVICE_VECTOR + APPEND + PROPERTY ENVIRONMENT "GTEST_FILTER=-${_allowlist_filter}" ) # ################################################################################################## @@ -672,9 +670,11 @@ target_include_directories(JIT_PARSER_TEST PRIVATE "$= -1") - if not isinstance(skip_rows, int) or skip_rows < -1: - raise TypeError("skip_rows must be an int >= -1") - - cdef vector[string] c_columns - if columns is not None and len(columns) > 0: - c_columns.reserve(len(columns)) - for col in columns: - c_columns.push_back(str(col).encode()) - - cdef avro_reader_options options = move( - avro_reader_options.builder(make_source_info([datasource])) - .columns(c_columns) - .skip_rows( skip_rows) - .num_rows( num_rows) - .build() + if not isinstance(skip_rows, int) or skip_rows < 0: + raise TypeError("skip_rows must be an int >= 0") + + return data_from_pylibcudf_io( + plc.io.avro.read_avro( + SourceInfo([datasource]), + columns, + skip_rows, + num_rows + ) ) - - cdef table_with_metadata c_result - - with nogil: - c_result = move(libcudf_read_avro(options)) - - names = [info.name.decode() for info in c_result.metadata.schema_info] - - return data_from_unique_ptr(move(c_result.tbl), column_names=names) diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx index aa771295607..0b0bbdb2589 100644 --- a/python/cudf/cudf/_lib/csv.pyx +++ b/python/cudf/cudf/_lib/csv.pyx @@ -151,14 +151,14 @@ cdef csv_reader_options make_csv_reader_options( ) if quoting == 1: - c_quoting = quote_style.QUOTE_ALL + c_quoting = quote_style.ALL elif quoting == 2: - c_quoting = quote_style.QUOTE_NONNUMERIC + c_quoting = quote_style.NONNUMERIC elif quoting == 3: - c_quoting = quote_style.QUOTE_NONE + c_quoting = quote_style.NONE else: # Default value - c_quoting = quote_style.QUOTE_MINIMAL + c_quoting = quote_style.MINIMAL cdef csv_reader_options csv_reader_options_c = move( csv_reader_options.builder(c_source_info) diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index f0eef9be124..ac592cedaac 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -491,7 +491,7 @@ def write_parquet( "Valid values are '1.0' and '2.0'" ) - dict_policy = ( + cdef cudf_io_types.dictionary_policy dict_policy = ( cudf_io_types.dictionary_policy.ADAPTIVE if use_dictionary else cudf_io_types.dictionary_policy.NEVER diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt index 7d0676f6def..6beb7b0f506 100644 --- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt @@ -50,3 +50,4 @@ link_to_pyarrow_headers(pylibcudf_interop) add_subdirectory(libcudf) add_subdirectory(strings) +add_subdirectory(io) diff --git a/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt new file mode 100644 index 00000000000..2cfec101bab --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt @@ -0,0 +1,25 @@ +# ============================================================================= +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +set(cython_sources avro.pyx types.pyx) + +set(linked_libraries cudf::cudf) +rapids_cython_create_modules( + CXX + SOURCE_FILES "${cython_sources}" + LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_io_ ASSOCIATED_TARGETS cudf +) + +set(targets_using_arrow_headers pylibcudf_io_avro pylibcudf_io_types) +link_to_pyarrow_headers("${targets_using_arrow_headers}") diff --git a/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd new file mode 100644 index 00000000000..250292746c1 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd @@ -0,0 +1,4 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from . cimport avro, types +from .types cimport SourceInfo, TableWithMetadata diff --git a/python/cudf/cudf/_lib/pylibcudf/io/__init__.py b/python/cudf/cudf/_lib/pylibcudf/io/__init__.py new file mode 100644 index 00000000000..5242c741911 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/io/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from . import avro, types +from .types import SourceInfo, TableWithMetadata diff --git a/python/cudf/cudf/_lib/pylibcudf/io/avro.pxd b/python/cudf/cudf/_lib/pylibcudf/io/avro.pxd new file mode 100644 index 00000000000..3695f36a6e7 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/io/avro.pxd @@ -0,0 +1,12 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from cudf._lib.pylibcudf.io.types cimport SourceInfo, TableWithMetadata +from cudf._lib.pylibcudf.libcudf.io.avro cimport avro_reader_options +from cudf._lib.pylibcudf.libcudf.types cimport size_type + + +cpdef TableWithMetadata read_avro( + SourceInfo source_info, + list columns = *, + size_type skip_rows = *, + size_type num_rows = * +) diff --git a/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx b/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx new file mode 100644 index 00000000000..946e0896fc8 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx @@ -0,0 +1,58 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.string cimport string +from libcpp.utility cimport move +from libcpp.vector cimport vector + +from cudf._lib.pylibcudf.io.types cimport SourceInfo, TableWithMetadata +from cudf._lib.pylibcudf.libcudf.io.avro cimport ( + avro_reader_options, + read_avro as cpp_read_avro, +) +from cudf._lib.pylibcudf.libcudf.types cimport size_type + + +cpdef TableWithMetadata read_avro( + SourceInfo source_info, + list columns = None, + size_type skip_rows = 0, + size_type num_rows = -1 +): + """ + Reads an Avro dataset into a set of columns. + + Parameters + ---------- + source_info: SourceInfo + The SourceInfo object to read the avro dataset from. + columns: list, default None + Optional columns to read, if not provided, reads all columns in the file. + skip_rows: size_type, default 0 + The number of rows to skip. + num_rows: size_type, default -1 + The number of rows to read, after skipping rows. + If -1 is passed, all rows will be read. + + Returns + ------- + TableWithMetadata + The Table and its corresponding metadata that was read in. + """ + cdef vector[string] c_columns + if columns is not None and len(columns) > 0: + c_columns.reserve(len(columns)) + for col in columns: + c_columns.push_back(str(col).encode()) + + cdef avro_reader_options avro_opts = move( + avro_reader_options.builder(source_info.c_obj) + .columns(c_columns) + .skip_rows(skip_rows) + .num_rows(num_rows) + .build() + ) + + with nogil: + c_result = move(cpp_read_avro(avro_opts)) + + return TableWithMetadata.from_libcudf(c_result) diff --git a/python/cudf/cudf/_lib/pylibcudf/io/types.pxd b/python/cudf/cudf/_lib/pylibcudf/io/types.pxd new file mode 100644 index 00000000000..aa846a47343 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/io/types.pxd @@ -0,0 +1,29 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from cudf._lib.pylibcudf.libcudf.io.types cimport ( + column_encoding, + column_in_metadata, + column_name_info, + compression_type, + dictionary_policy, + io_type, + partition_info, + quote_style, + sink_info, + source_info, + statistics_freq, + table_input_metadata, + table_metadata, + table_with_metadata, +) +from cudf._lib.pylibcudf.table cimport Table + + +cdef class TableWithMetadata: + cdef public Table tbl + cdef table_metadata metadata + + @staticmethod + cdef TableWithMetadata from_libcudf(table_with_metadata& tbl) + +cdef class SourceInfo: + cdef source_info c_obj diff --git a/python/cudf/cudf/_lib/pylibcudf/io/types.pyx b/python/cudf/cudf/_lib/pylibcudf/io/types.pyx new file mode 100644 index 00000000000..cd777232b33 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/io/types.pyx @@ -0,0 +1,110 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.string cimport string +from libcpp.utility cimport move +from libcpp.vector cimport vector + +from cudf._lib.pylibcudf.libcudf.io.types cimport ( + host_buffer, + source_info, + table_with_metadata, +) + +import errno +import io +import os + + +cdef class TableWithMetadata: + """A container holding a table and its associated metadata + (e.g. column names) + + For details, see :cpp:class:`cudf::io::table_with_metadata`. + """ + + @property + def columns(self): + """ + Return a list containing the columns of the table + """ + return self.tbl.columns() + + @property + def column_names(self): + """ + Return a list containing the column names of the table + """ + cdef list names = [] + for col_info in self.metadata.schema_info: + # TODO: Handle nesting (columns with child columns) + assert col_info.children.size() == 0, "Child column names are not handled!" + names.append(col_info.name.decode()) + return names + + @staticmethod + cdef TableWithMetadata from_libcudf(table_with_metadata& tbl_with_meta): + """Create a Python TableWithMetadata from a libcudf table_with_metadata""" + cdef TableWithMetadata out = TableWithMetadata.__new__(TableWithMetadata) + out.tbl = Table.from_libcudf(move(tbl_with_meta.tbl)) + out.metadata = tbl_with_meta.metadata + return out + +cdef class SourceInfo: + """A class containing details on a source to read from. + + For details, see :cpp:class:`cudf::io::source_info`. + + Parameters + ---------- + sources : List[Union[str, os.PathLike, bytes, io.BytesIO]] + A homogeneous list of sources (this can be a string filename, + an os.PathLike, bytes, or an io.BytesIO) to read from. + + Mixing different types of sources will raise a `ValueError`. + """ + + def __init__(self, list sources): + if not sources: + raise ValueError("Need to pass at least one source") + + cdef vector[string] c_files + + if isinstance(sources[0], (os.PathLike, str)): + c_files.reserve(len(sources)) + + for src in sources: + if not isinstance(src, (os.PathLike, str)): + raise ValueError("All sources must be of the same type!") + if not os.path.isfile(src): + raise FileNotFoundError(errno.ENOENT, + os.strerror(errno.ENOENT), + src) + + c_files.push_back( str(src).encode()) + + self.c_obj = move(source_info(c_files)) + return + + # TODO: host_buffer is deprecated API, use host_span instead + cdef vector[host_buffer] c_host_buffers + cdef const unsigned char[::1] c_buffer + cdef bint empty_buffer = False + if isinstance(sources[0], bytes): + empty_buffer = True + for buffer in sources: + if not isinstance(buffer, bytes): + raise ValueError("All sources must be of the same type!") + if (len(buffer) > 0): + c_buffer = buffer + c_host_buffers.push_back(host_buffer(&c_buffer[0], + c_buffer.shape[0])) + empty_buffer = False + elif isinstance(sources[0], io.BytesIO): + for bio in sources: + if not isinstance(bio, io.BytesIO): + raise ValueError("All sources must be of the same type!") + c_buffer = bio.getbuffer() # check if empty? + c_host_buffers.push_back(host_buffer(&c_buffer[0], + c_buffer.shape[0])) + + self.c_obj = source_info(c_host_buffers) diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/orc.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/orc.pxd index e553515dfdf..25f91849dea 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/orc.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/orc.pxd @@ -94,7 +94,9 @@ cdef extern from "cudf/io/orc.hpp" \ orc_writer_options_builder& compression( cudf_io_types.compression_type comp ) except + - orc_writer_options_builder& enable_statistics(bool val) except + + orc_writer_options_builder& enable_statistics( + cudf_io_types.statistics_freq val + ) except + orc_writer_options_builder& stripe_size_bytes(size_t val) except + orc_writer_options_builder& stripe_size_rows(size_type val) except + orc_writer_options_builder& row_index_stride(size_type val) except + @@ -147,7 +149,7 @@ cdef extern from "cudf/io/orc.hpp" \ cudf_io_types.compression_type comp ) except + chunked_orc_writer_options_builder& enable_statistics( - bool val + cudf_io_types.statistics_freq val ) except + orc_writer_options_builder& stripe_size_bytes(size_t val) except + orc_writer_options_builder& stripe_size_rows(size_type val) except + diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pxd index 38fae1df1e5..8d87deb1472 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pxd @@ -20,45 +20,45 @@ from cudf._lib.pylibcudf.libcudf.types cimport size_type cdef extern from "cudf/io/types.hpp" \ namespace "cudf::io" nogil: - ctypedef enum quote_style: - QUOTE_MINIMAL "cudf::io::quote_style::MINIMAL" - QUOTE_ALL "cudf::io::quote_style::ALL" - QUOTE_NONNUMERIC "cudf::io::quote_style::NONNUMERIC" - QUOTE_NONE "cudf::io::quote_style::NONE" - - ctypedef enum compression_type: - NONE "cudf::io::compression_type::NONE" - AUTO "cudf::io::compression_type::AUTO" - SNAPPY "cudf::io::compression_type::SNAPPY" - GZIP "cudf::io::compression_type::GZIP" - BZIP2 "cudf::io::compression_type::BZIP2" - BROTLI "cudf::io::compression_type::BROTLI" - ZIP "cudf::io::compression_type::ZIP" - XZ "cudf::io::compression_type::XZ" - ZLIB "cudf::io::compression_type::ZLIB" - LZ4 "cudf::io::compression_type::LZ4" - LZO "cudf::io::compression_type::LZO" - ZSTD "cudf::io::compression_type::ZSTD" - - ctypedef enum io_type: - FILEPATH "cudf::io::io_type::FILEPATH" - HOST_BUFFER "cudf::io::io_type::HOST_BUFFER" - VOID "cudf::io::io_type::VOID" - USER_IMPLEMENTED "cudf::io::io_type::USER_IMPLEMENTED" - - ctypedef enum statistics_freq: + cpdef enum class quote_style(int32_t): + MINIMAL + ALL + NONNUMERIC + NONE + + cpdef enum class compression_type(int32_t): + NONE + AUTO + SNAPPY + GZIP + BZIP2 + BROTLI + ZIP + XZ + ZLIB + LZ4 + LZO + ZSTD + + cpdef enum class io_type(int32_t): + FILEPATH + HOST_BUFFER + VOID + USER_IMPLEMENTED + + cpdef enum class statistics_freq(int32_t): STATISTICS_NONE = 0, STATISTICS_ROWGROUP = 1, STATISTICS_PAGE = 2, STATISTICS_COLUMN = 3, - ctypedef enum dictionary_policy: + cpdef enum class dictionary_policy(int32_t): NEVER = 0, ADAPTIVE = 1, ALWAYS = 2, cdef extern from "cudf/io/types.hpp" namespace "cudf::io" nogil: - cpdef enum class column_encoding: + cpdef enum class column_encoding(int32_t): USE_DEFAULT = -1 DICTIONARY = 0 PLAIN = 1 diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/CMakeLists.txt index 930c22781d0..bd6e2e0af02 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/CMakeLists.txt @@ -12,7 +12,7 @@ # the License. # ============================================================================= -set(cython_sources char_types.pyx) +set(cython_sources char_types.pyx regex_flags.pyx) set(linked_libraries cudf::cudf) diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/regex_flags.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/regex_flags.pxd index 2a5701fa6a3..41617f157b7 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/regex_flags.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/regex_flags.pxd @@ -1,9 +1,12 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. + +from libc.stdint cimport int32_t + cdef extern from "cudf/strings/regex/flags.hpp" \ namespace "cudf::strings" nogil: - ctypedef enum regex_flags: - DEFAULT 'cudf::strings::regex_flags::DEFAULT' - MULTILINE 'cudf::strings::regex_flags::MULTILINE' - DOTALL 'cudf::strings::regex_flags::DOTALL' + cpdef enum class regex_flags(int32_t): + DEFAULT + MULTILINE + DOTALL diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/regex_flags.pyx b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/regex_flags.pyx new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt index c9a983e24f4..cb7f71b1912 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt @@ -12,7 +12,9 @@ # the License. # ============================================================================= -set(cython_sources capitalize.pyx case.pyx char_types.pyx find.pyx replace.pyx) +set(cython_sources capitalize.pyx case.pyx char_types.pyx contains.pyx find.pyx regex_flags.pyx + regex_program.pyx replace.pyx +) set(linked_libraries cudf::cudf) rapids_cython_create_modules( diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd index 7563df8a107..959aa94737d 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd @@ -1,3 +1,12 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . cimport capitalize, case, char_types, find, replace +from . cimport ( + capitalize, + case, + char_types, + contains, + find, + regex_flags, + regex_program, + replace, +) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py index cb4f0e38f97..b7384913286 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py @@ -1,3 +1,12 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . import capitalize, case, char_types, find, replace +from . import ( + capitalize, + case, + char_types, + contains, + find, + regex_flags, + regex_program, + replace, +) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/contains.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/contains.pxd new file mode 100644 index 00000000000..275aa95d97e --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/contains.pxd @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from cudf._lib.pylibcudf.column cimport Column +from cudf._lib.pylibcudf.strings.regex_program cimport RegexProgram + + +cpdef Column contains_re(Column input, RegexProgram prog) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/contains.pyx b/python/cudf/cudf/_lib/pylibcudf/strings/contains.pyx new file mode 100644 index 00000000000..8c598b7c953 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/contains.pyx @@ -0,0 +1,41 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move + +from cudf._lib.pylibcudf.column cimport Column +from cudf._lib.pylibcudf.libcudf.column.column cimport column +from cudf._lib.pylibcudf.libcudf.strings cimport contains as cpp_contains +from cudf._lib.pylibcudf.strings.regex_program cimport RegexProgram + + +cpdef Column contains_re( + Column input, + RegexProgram prog +): + """Returns a boolean column identifying rows which match the given + regex_program object. + + For details, see :cpp:func:`cudf::strings::contains_re`. + + Parameters + ---------- + input : Column + The input strings + prog : RegexProgram + Regex program instance + + Returns + ------- + pylibcudf.Column + New column of boolean results for each string + """ + + cdef unique_ptr[column] result + + with nogil: + result = cpp_contains.contains_re( + input.view(), + prog.c_obj.get()[0] + ) + + return Column.from_libcudf(move(result)) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/regex_flags.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/regex_flags.pxd new file mode 100644 index 00000000000..79937bf574a --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/regex_flags.pxd @@ -0,0 +1,2 @@ +# Copyright (c) 2020-2024, NVIDIA CORPORATION. +from cudf._lib.pylibcudf.libcudf.strings.regex_flags cimport regex_flags diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/regex_flags.pyx b/python/cudf/cudf/_lib/pylibcudf/strings/regex_flags.pyx new file mode 100644 index 00000000000..903c2ddd503 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/regex_flags.pyx @@ -0,0 +1,4 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from cudf._lib.pylibcudf.libcudf.strings.regex_flags import \ + regex_flags as RegexFlags # no-cython-lint diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pxd new file mode 100644 index 00000000000..61ed268fb2d --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pxd @@ -0,0 +1,10 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.string cimport string + +from cudf._lib.pylibcudf.libcudf.strings.regex_program cimport regex_program + + +cdef class RegexProgram: + cdef unique_ptr[regex_program] c_obj diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pyx b/python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pyx new file mode 100644 index 00000000000..d605b0aba02 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pyx @@ -0,0 +1,37 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + + +from libcpp.memory cimport unique_ptr +from libcpp.string cimport string +from libcpp.utility cimport move + +from cudf._lib.pylibcudf.libcudf.strings.regex_flags cimport regex_flags +from cudf._lib.pylibcudf.libcudf.strings.regex_program cimport regex_program + +from cudf._lib.pylibcudf.strings.regex_flags import RegexFlags +from cudf._lib.pylibcudf.strings.regex_flags cimport regex_flags + + +cdef class RegexProgram: + + def __init__(self, *args, **kwargs): + raise ValueError("Do not instantiate RegexProgram directly, use create") + + @staticmethod + def create(str pattern, int flags): + cdef unique_ptr[regex_program] c_prog + cdef regex_flags c_flags + cdef string c_pattern = pattern.encode() + + cdef RegexProgram ret = RegexProgram.__new__(RegexProgram) + if isinstance(flags, object): + if isinstance(flags, (int, RegexFlags)): + c_flags = flags + with nogil: + c_prog = regex_program.create(c_pattern, c_flags) + + ret.c_obj = move(c_prog) + else: + raise ValueError("flags must be of type RegexFlags") + + return ret diff --git a/python/cudf/cudf/_lib/strings/contains.pyx b/python/cudf/cudf/_lib/strings/contains.pyx index 087acd8062d..502a1d14696 100644 --- a/python/cudf/cudf/_lib/strings/contains.pyx +++ b/python/cudf/cudf/_lib/strings/contains.pyx @@ -14,7 +14,6 @@ from cudf._lib.pylibcudf.libcudf.column.column cimport column from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar from cudf._lib.pylibcudf.libcudf.strings.contains cimport ( - contains_re as cpp_contains_re, count_re as cpp_count_re, like as cpp_like, matches_re as cpp_matches_re, @@ -23,6 +22,9 @@ from cudf._lib.pylibcudf.libcudf.strings.regex_flags cimport regex_flags from cudf._lib.pylibcudf.libcudf.strings.regex_program cimport regex_program from cudf._lib.scalar cimport DeviceScalar +from cudf._lib.pylibcudf.strings import contains +from cudf._lib.pylibcudf.strings.regex_program import RegexProgram + @acquire_spill_lock() def contains_re(Column source_strings, object reg_ex, uint32_t flags): @@ -30,21 +32,10 @@ def contains_re(Column source_strings, object reg_ex, uint32_t flags): Returns a Column of boolean values with True for `source_strings` that contain regular expression `reg_ex`. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef string reg_ex_string = str(reg_ex).encode() - cdef regex_flags c_flags = flags - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(reg_ex_string, c_flags)) - c_result = move(cpp_contains_re( - source_view, - dereference(c_prog) - )) - - return Column.from_unique_ptr(move(c_result)) + prog = RegexProgram.create(str(reg_ex), flags) + return Column.from_pylibcudf( + contains.contains_re(source_strings.to_pylibcudf(mode="read"), prog) + ) @acquire_spill_lock() diff --git a/python/cudf/cudf/_lib/utils.pxd b/python/cudf/cudf/_lib/utils.pxd index c5a1e7552b9..99850d549a1 100644 --- a/python/cudf/cudf/_lib/utils.pxd +++ b/python/cudf/cudf/_lib/utils.pxd @@ -11,6 +11,7 @@ from cudf._lib.pylibcudf.libcudf.table.table cimport table, table_view cdef data_from_unique_ptr( unique_ptr[table] c_tbl, column_names, index_names=*) cdef data_from_pylibcudf_table(tbl, column_names, index_names=*) +cdef data_from_pylibcudf_io(tbl_with_meta) cdef data_from_table_view( table_view tv, object owner, object column_names, object index_names=*) cdef table_view table_view_from_columns(columns) except * diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index 4c4cd48d6ed..de6b9f690b6 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -315,6 +315,17 @@ cdef data_from_pylibcudf_table(tbl, column_names, index_names=None): index_names ) +cdef data_from_pylibcudf_io(tbl_with_meta): + """ + Unpacks the TableWithMetadata from libcudf I/O + into a dict of columns and an Index (cuDF format) + """ + return _data_from_columns( + columns=[Column.from_pylibcudf(plc) for plc in tbl_with_meta.columns], + column_names=tbl_with_meta.column_names, + index_names=None + ) + cdef columns_from_table_view( table_view tv, object owners, diff --git a/python/cudf/cudf/pylibcudf_tests/common/utils.py b/python/cudf/cudf/pylibcudf_tests/common/utils.py index e00053529a8..54d38f1a8cf 100644 --- a/python/cudf/cudf/pylibcudf_tests/common/utils.py +++ b/python/cudf/cudf/pylibcudf_tests/common/utils.py @@ -63,6 +63,23 @@ def assert_table_eq(pa_table: pa.Table, plc_table: plc.Table) -> None: assert_column_eq(pa_col, plc_col) +def assert_table_and_meta_eq( + plc_table_w_meta: plc.io.types.TableWithMetadata, pa_table: pa.Table +) -> None: + """Verify that the pylibcudf TableWithMetadata and PyArrow table are equal""" + + plc_table = plc_table_w_meta.tbl + + plc_shape = (plc_table.num_rows(), plc_table.num_columns()) + assert plc_shape == pa_table.shape + + for plc_col, pa_col in zip(plc_table.columns(), pa_table.columns): + assert_column_eq(plc_col, pa_col) + + # Check column name equality + assert plc_table_w_meta.column_names == pa_table.column_names + + def cudf_raises(expected_exception: BaseException, *args, **kwargs): # A simple wrapper around pytest.raises that defaults to looking for cudf exceptions match = kwargs.get("match", None) diff --git a/python/cudf/cudf/pylibcudf_tests/test_avro.py b/python/cudf/cudf/pylibcudf_tests/test_avro.py new file mode 100644 index 00000000000..d6cd86768cd --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_avro.py @@ -0,0 +1,123 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import io +import itertools + +import fastavro +import pyarrow as pa +import pytest +from utils import assert_table_and_meta_eq + +import cudf._lib.pylibcudf as plc + +avro_dtype_pairs = [ + ("boolean", pa.bool_()), + ("int", pa.int32()), + ("long", pa.int64()), + ("float", pa.float32()), + ("double", pa.float64()), + ("bytes", pa.string()), + ("string", pa.string()), +] + + +@pytest.fixture( + scope="module", params=itertools.combinations(avro_dtype_pairs, 2) +) +def avro_dtypes(request): + return request.param + + +@pytest.fixture +def avro_dtype_data(avro_dtypes): + (avro_type1, _), (avro_type2, _) = avro_dtypes + + def _get_data(avro_type): + if avro_type == "boolean": + return [True, False, True] + elif avro_type in {"int", "long"}: + return [1, 2, -1] + elif avro_type in {"float", "double"}: + return [1.0, 3.1415, -3.1415] + elif avro_type == "bytes": + return [b"a", b"b", b"c"] + elif avro_type == "string": + return ["Hello", "World!", ""] + + return _get_data(avro_type1), _get_data(avro_type2) + + +@pytest.fixture( + params=[ + (0, 0), + (0, -1), + (1, -1), + (3, -1), + ] +) +def row_opts(request): + """ + (skip_rows, num_rows) combos for the avro reader + """ + return request.param + + +@pytest.mark.parametrize("columns", [["prop1"], [], ["prop1", "prop2"]]) +@pytest.mark.parametrize("nullable", [True, False]) +def test_read_avro(avro_dtypes, avro_dtype_data, row_opts, columns, nullable): + (avro_type1, expected_type1), (avro_type2, expected_type2) = avro_dtypes + + avro_type1 = avro_type1 if not nullable else ["null", avro_type1] + avro_type2 = avro_type2 if not nullable else ["null", avro_type2] + + skip_rows, num_rows = row_opts + + schema = fastavro.parse_schema( + { + "type": "record", + "name": "test", + "fields": [ + {"name": "prop1", "type": avro_type1}, + {"name": "prop2", "type": avro_type2}, + ], + } + ) + + if nullable: + avro_dtype_data = ( + avro_dtype_data[0] + [None], + avro_dtype_data[1] + [None], + ) + + records = [ + {"prop1": val1, "prop2": val2} for val1, val2 in zip(*avro_dtype_data) + ] + + buffer = io.BytesIO() + fastavro.writer(buffer, schema, records) + buffer.seek(0) + + res = plc.io.avro.read_avro( + plc.io.types.SourceInfo([buffer]), + columns=columns, + skip_rows=skip_rows, + num_rows=num_rows, + ) + + expected = pa.Table.from_arrays( + [ + pa.array(avro_dtype_data[0], type=expected_type1), + pa.array(avro_dtype_data[1], type=expected_type2), + ], + names=["prop1", "prop2"], + ) + + # Adjust for skip_rows/num_rows in result + length = num_rows if num_rows != -1 else None + expected = expected.slice(skip_rows, length=length) + + # adjust for # of columns + if columns != []: + expected = expected.select(columns) + + assert_table_and_meta_eq(res, expected) diff --git a/python/cudf/cudf/pylibcudf_tests/test_regex_program.py b/python/cudf/cudf/pylibcudf_tests/test_regex_program.py new file mode 100644 index 00000000000..3a9bcec3616 --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_regex_program.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pytest + +import cudf._lib.pylibcudf as plc + + +@pytest.mark.parametrize("pat", ["(", "*", "\\"]) +def test_regex_program_invalid(pat): + with pytest.raises(RuntimeError): + plc.strings.regex_program.RegexProgram.create( + pat, plc.strings.regex_flags.RegexFlags.DEFAULT + ) diff --git a/python/cudf/cudf/pylibcudf_tests/test_source_info.py b/python/cudf/cudf/pylibcudf_tests/test_source_info.py new file mode 100644 index 00000000000..71a3ecbcc30 --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_source_info.py @@ -0,0 +1,69 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import io + +import pytest + +import cudf._lib.pylibcudf as plc + + +@pytest.mark.parametrize( + "source", ["a.txt", b"hello world", io.BytesIO(b"hello world")] +) +def test_source_info_ctor(source, tmp_path): + if isinstance(source, str): + file = tmp_path / source + file.write_bytes("hello world".encode("utf-8")) + source = str(file) + + plc.io.SourceInfo([source]) + + # TODO: test contents of source_info buffer is correct + # once buffers are exposed on python side + + +@pytest.mark.parametrize( + "sources", + [ + ["a.txt", "a.txt"], + [b"hello world", b"hello there"], + [io.BytesIO(b"hello world"), io.BytesIO(b"hello there")], + ], +) +def test_source_info_ctor_multiple(sources, tmp_path): + for i in range(len(sources)): + source = sources[i] + if isinstance(source, str): + file = tmp_path / source + file.write_bytes("hello world".encode("utf-8")) + sources[i] = str(file) + + plc.io.SourceInfo(sources) + + # TODO: test contents of source_info buffer is correct + # once buffers are exposed on python side + + +@pytest.mark.parametrize( + "sources", + [ + ["awef.txt", b"hello world", io.BytesIO(b"hello world")], + [b"hello world", b"hello there", "awef.txt"], + [ + io.BytesIO(b"hello world"), + io.BytesIO(b"hello there"), + b"hello world", + ], + ], +) +def test_source_info_ctor_mixing_invalid(sources, tmp_path): + # Unlike the previous test + # don't create files so that they are missing + for i in range(len(sources)): + source = sources[i] + if isinstance(source, str): + file = tmp_path / source + file.write_bytes("hello world".encode("utf-8")) + sources[i] = str(file) + with pytest.raises(ValueError): + plc.io.SourceInfo(sources) diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_contains.py b/python/cudf/cudf/pylibcudf_tests/test_string_contains.py new file mode 100644 index 00000000000..8cdb6f7c521 --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_string_contains.py @@ -0,0 +1,55 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pytest +from utils import assert_column_eq + +import cudf._lib.pylibcudf as plc + + +@pytest.fixture(scope="module") +def pa_target_col(): + return pa.array( + ["AbC", "de", "FGHI", "j", "kLm", "nOPq", None, "RsT", None, "uVw"] + ) + + +@pytest.fixture(scope="module") +def plc_target_col(pa_target_col): + return plc.interop.from_arrow(pa_target_col) + + +@pytest.fixture( + params=[ + "A", + "de", + ".*", + "^a", + "^A", + "[^a-z]", + "[a-z]{3,}", + "^[A-Z]{2,}", + "j|u", + ], + scope="module", +) +def pa_target_scalar(request): + return pa.scalar(request.param, type=pa.string()) + + +@pytest.fixture(scope="module") +def plc_target_pat(pa_target_scalar): + prog = plc.strings.regex_program.RegexProgram.create( + pa_target_scalar.as_py(), plc.strings.regex_flags.RegexFlags.DEFAULT + ) + return prog + + +def test_contains_re( + pa_target_col, plc_target_col, pa_target_scalar, plc_target_pat +): + got = plc.strings.contains.contains_re(plc_target_col, plc_target_pat) + expected = pa.compute.match_substring_regex( + pa_target_col, pa_target_scalar.as_py() + ) + assert_column_eq(got, expected)