From e657537819181aa79e46903fbf6ca861bbca91b8 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Mon, 16 Dec 2024 13:30:13 -0800 Subject: [PATCH 1/3] Add ORC reader options structs to pylibcudf --- python/cudf/cudf/io/orc.py | 28 ++- python/pylibcudf/pylibcudf/io/orc.pxd | 32 ++- python/pylibcudf/pylibcudf/io/orc.pyi | 30 ++- python/pylibcudf/pylibcudf/io/orc.pyx | 238 +++++++++++++----- .../pylibcudf/pylibcudf/tests/io/test_orc.py | 17 +- 5 files changed, 240 insertions(+), 105 deletions(-) diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py index 5616413b7e4..3960df97748 100644 --- a/python/cudf/cudf/io/orc.py +++ b/python/cudf/cudf/io/orc.py @@ -240,15 +240,27 @@ def read_orc( elif not isinstance(num_rows, int) or num_rows < -1: raise TypeError("num_rows must be an int >= -1") - tbl_w_meta = plc.io.orc.read_orc( - plc.io.SourceInfo(filepaths_or_buffers), - columns, - stripes, - skiprows, - num_rows, - use_index, - dtype_to_pylibcudf_type(cudf.dtype(timestamp_type)), + options = ( + plc.io.orc.OrcReaderOptions.builder( + plc.io.types.SourceInfo(filepaths_or_buffers) + ) + .use_index(use_index) + .build() ) + if num_rows >= 0: + options.set_num_rows(num_rows) + if skiprows >= 0: + options.set_skip_rows(skiprows) + if stripes is not None: + options.set_stripes(stripes) + if timestamp_type is not None: + options.set_timestamp_type( + dtype_to_pylibcudf_type(cudf.dtype(timestamp_type)) + ) + if columns is not None and len(columns) > 0: + options.set_columns(columns) + + tbl_w_meta = plc.io.orc.read_orc(options) if isinstance(columns, list) and len(columns) == 0: # When `columns=[]`, index needs to be diff --git a/python/pylibcudf/pylibcudf/io/orc.pxd b/python/pylibcudf/pylibcudf/io/orc.pxd index 671f0692444..7531608519c 100644 --- a/python/pylibcudf/pylibcudf/io/orc.pxd +++ b/python/pylibcudf/pylibcudf/io/orc.pxd @@ -1,5 +1,5 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
-from libc.stdint cimport uint64_t +from libc.stdint cimport uint64_t, int64_t from libcpp cimport bool from libcpp.optional cimport optional from libcpp.string cimport string @@ -19,6 +19,8 @@ from pylibcudf.libcudf.io.orc_metadata cimport ( ) from pylibcudf.libcudf.io.orc cimport ( orc_chunked_writer, + orc_reader_options, + orc_reader_options_builder, orc_writer_options, orc_writer_options_builder, chunked_orc_writer_options, @@ -32,17 +34,23 @@ from pylibcudf.libcudf.io.types cimport ( statistics_freq, ) -cpdef TableWithMetadata read_orc( - SourceInfo source_info, - list columns = *, - list stripes = *, - size_type skip_rows = *, - size_type nrows = *, - bool use_index = *, - bool use_np_dtypes = *, - DataType timestamp_type = *, - list decimal128_columns = * -) +cdef class OrcReaderOptions: + cdef orc_reader_options c_obj + cdef SourceInfo source + cpdef void set_num_rows(self, int64_t nrows) + cpdef void set_skip_rows(self, int64_t skip_rows) + cpdef void set_stripes(self, list stripes) + cpdef void set_decimal128_columns(self, list val) + cpdef void set_timestamp_type(self, DataType type_) + cpdef void set_columns(self, list col_names) + +cdef class OrcReaderOptionsBuilder: + cdef orc_reader_options_builder c_obj + cdef SourceInfo source + cpdef OrcReaderOptionsBuilder use_index(self, bool use) + cpdef OrcReaderOptions build(self) + +cpdef TableWithMetadata read_orc(OrcReaderOptions options) cdef class OrcColumnStatistics: cdef optional[uint64_t] number_of_values_c diff --git a/python/pylibcudf/pylibcudf/io/orc.pyi b/python/pylibcudf/pylibcudf/io/orc.pyi index 516f97981e9..c496b7a2152 100644 --- a/python/pylibcudf/pylibcudf/io/orc.pyi +++ b/python/pylibcudf/pylibcudf/io/orc.pyi @@ -1,6 +1,8 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
-from typing import Any, Self +from typing import Any + +from typing_extensions import Self from pylibcudf.io.types import ( CompressionType, @@ -11,19 +13,21 @@ from pylibcudf.io.types import ( TableWithMetadata, ) from pylibcudf.table import Table -from pylibcudf.types import DataType -def read_orc( - source_info: SourceInfo, - columns: list[str] | None = None, - stripes: list[list[int]] | None = None, - skip_rows: int = 0, - nrows: int = -1, - use_index: bool = True, - use_np_dtypes: bool = True, - timestamp_type: DataType | None = None, - decimal128_columns: list[str] | None = None, -) -> TableWithMetadata: ... +class OrcReaderOptions: + def set_num_rows(self, nrows: int) -> None: ... + def set_skip_rows(self, skip_rows: int) -> None: ... + def set_stripes(self, stripes: list[list[int]]) -> None: ... + def set_decimal128_columns(self, val: list[str]) -> None: ... + def set_columns(self, col_names: list[str]) -> None: ... + @staticmethod + def builder(source: SourceInfo) -> OrcReaderOptionsBuilder: ... + +class OrcReaderOptionsBuilder: + def use_index(self, use: bool) -> Self: ... + def build(self) -> OrcReaderOptions: ... + +def read_orc(options: OrcReaderOptions) -> TableWithMetadata: ... class OrcColumnStatistics: def __init__(self): ... diff --git a/python/pylibcudf/pylibcudf/io/orc.pyx b/python/pylibcudf/pylibcudf/io/orc.pyx index 63eab4a9634..9319a244873 100644 --- a/python/pylibcudf/pylibcudf/io/orc.pyx +++ b/python/pylibcudf/pylibcudf/io/orc.pyx @@ -237,84 +237,190 @@ cdef class ParsedOrcStatistics: return out -cpdef TableWithMetadata read_orc( - SourceInfo source_info, - list columns = None, - list stripes = None, - size_type skip_rows = 0, - size_type nrows = -1, - bool use_index = True, - bool use_np_dtypes = True, - DataType timestamp_type = None, - list decimal128_columns = None, -): - """Reads an ORC file into a :py:class:`~.types.TableWithMetadata`. 
- - Parameters - ---------- - source_info : SourceInfo - The SourceInfo object to read the Parquet file from. - columns : list, default None - The string names of the columns to be read. - stripes : list[list[size_type]], default None - List of stripes to be read. - skip_rows : int64_t, default 0 - The number of rows to skip from the start of the file. - nrows : size_type, default -1 - The number of rows to read. By default, read the entire file. - use_index : bool, default True - Whether to use the row index to speed up reading. - use_np_dtypes : bool, default True - Whether to use numpy compatible dtypes. - timestamp_type : DataType, default None - The timestamp type to use for the timestamp columns. - decimal128_columns : list, default None - List of column names to be read as 128-bit decimals. +cdef class OrcReaderOptions: + """ + The settings to use for ``read_orc`` - Returns - ------- - TableWithMetadata - The Table and its corresponding metadata (column names) that were read in. + For details, see :cpp:class:`cudf::io::orc_reader_options` """ - cdef orc_reader_options opts - cdef vector[vector[size_type]] c_stripes - opts = ( - orc_reader_options.builder(source_info.c_obj) - .use_index(use_index) - .build() - ) - if nrows >= 0: - opts.set_num_rows(nrows) - if skip_rows >= 0: - opts.set_skip_rows(skip_rows) - if stripes is not None: - c_stripes = stripes - opts.set_stripes(c_stripes) - if timestamp_type is not None: - opts.set_timestamp_type(timestamp_type.c_obj) - - cdef vector[string] c_decimal128_columns - if decimal128_columns is not None and len(decimal128_columns) > 0: - c_decimal128_columns.reserve(len(decimal128_columns)) - for col in decimal128_columns: + @staticmethod + def builder(SourceInfo source): + """ + Create an OrcReaderOptionsBuilder object + + For details, see :cpp:func:`cudf::io::orc_reader_options::builder` + + Parameters + ---------- + source : SourceInfo + The source to read the ORC file from. 
+ + Returns + ------- + OrcReaderOptionsBuilder + Builder to build OrcReaderOptions + """ + cdef OrcReaderOptionsBuilder orc_builder = ( + OrcReaderOptionsBuilder.__new__(OrcReaderOptionsBuilder) + ) + orc_builder.c_obj = orc_reader_options.builder(source.c_obj) + orc_builder.source = source + return orc_builder + + cpdef void set_num_rows(self, int64_t nrows): + """ + Sets number of rows to read. + + Parameters + ---------- + nrows: int64_t + Number of rows + + Returns + ------- + None + """ + self.c_obj.set_num_rows(nrows) + + cpdef void set_skip_rows(self, int64_t skip_rows): + """ + Sets number of rows to skip from the start. + + Parameters + ---------- + skip_rows: int64_t + Number of rows + + Returns + ------- + None + """ + self.c_obj.set_skip_rows(skip_rows) + + cpdef void set_stripes(self, list stripes): + """ + Sets list of stripes to read for each input source. + + Parameters + ---------- + stripes: list[list[size_type]] + List of lists, mapping stripes to read to input sources + + Returns + ------- + None + """ + cdef vector[vector[size_type]] c_stripes + cdef vector[size_type] vec + for sub_list in stripes: + for x in sub_list: + vec.push_back(x) + c_stripes.push_back(vec) + vec.clear() + self.c_obj.set_stripes(c_stripes) + + cpdef void set_decimal128_columns(self, list val): + """ + Set columns that should be read as 128-bit Decimal. + + Parameters + ---------- + val: list[str] + List of fully qualified column names + + Returns + ------- + None + """ + cdef vector[string] c_decimal128_columns + c_decimal128_columns.reserve(len(val)) + for col in val: if not isinstance(col, str): raise TypeError("Decimal 128 column names must be strings!") c_decimal128_columns.push_back(col.encode()) - opts.set_decimal128_columns(c_decimal128_columns) + self.c_obj.set_decimal128_columns(c_decimal128_columns) + + cpdef void set_timestamp_type(self, DataType type_): + """ + Sets timestamp type to which timestamp column will be cast. 
+ + Parameters + ---------- + type_: DataType + Type of timestamp - cdef vector[string] c_column_names - if columns is not None and len(columns) > 0: - c_column_names.reserve(len(columns)) - for col in columns: + Returns + ------- + None + """ + self.c_obj.set_timestamp_type(type_.c_obj) + + cpdef void set_columns(self, list col_names): + """ + Sets names of the columns to read. + + Parameters + ---------- + col_names: list[str] + List of column names + + Returns + ------- + None + """ + cdef vector[string] c_column_names + c_column_names.reserve(len(col_names)) + for col in col_names: if not isinstance(col, str): raise TypeError("Column names must be strings!") c_column_names.push_back(col.encode()) - opts.set_columns(c_column_names) + self.c_obj.set_columns(c_column_names) + +cdef class OrcReaderOptionsBuilder: + cpdef OrcReaderOptionsBuilder use_index(self, bool use): + """ + Enable/Disable use of row index to speed-up reading. + + Parameters + ---------- + use : bool + Boolean value to enable/disable row index use + Returns + ------- + OrcReaderOptionsBuilder + """ + self.c_obj.use_index(use) + return self + + cpdef OrcReaderOptions build(self): + """Create an OrcReaderOptions object""" + cdef OrcReaderOptions orc_options = OrcReaderOptions.__new__( + OrcReaderOptions + ) + orc_options.c_obj = move(self.c_obj.build()) + orc_options.source = self.source + return orc_options + + +cpdef TableWithMetadata read_orc(OrcReaderOptions options): + """ + Read from ORC format. + + The source to read from and options are encapsulated + by the `options` object. + + For details, see :cpp:func:`read_orc`. 
+ + Parameters + ---------- + options: OrcReaderOptions + Settings for controlling reading behavior + """ cdef table_with_metadata c_result with nogil: - c_result = move(cpp_read_orc(opts)) + c_result = move(cpp_read_orc(options.c_obj)) return TableWithMetadata.from_libcudf(c_result) @@ -503,7 +609,7 @@ cpdef void write_orc(OrcWriterOptions options): The table to write, output paths, and options are encapsulated by the `options` object. - For details, see :cpp:func:`write_csv`. + For details, see :cpp:func:`write_orc`. Parameters ---------- diff --git a/python/pylibcudf/pylibcudf/tests/io/test_orc.py b/python/pylibcudf/pylibcudf/tests/io/test_orc.py index 2557e40c935..fe35255505c 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_orc.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_orc.py @@ -37,12 +37,17 @@ def test_read_orc_basic( binary_source_or_sink, pa_table, **_COMMON_ORC_SOURCE_KWARGS ) - res = plc.io.orc.read_orc( - plc.io.SourceInfo([source]), - nrows=nrows, - skip_rows=skiprows, - columns=columns, - ) + options = plc.io.orc.OrcReaderOptions.builder( + plc.io.types.SourceInfo([source]) + ).build() + if nrows >= 0: + options.set_num_rows(nrows) + if skiprows >= 0: + options.set_skip_rows(skiprows) + if columns is not None and len(columns) > 0: + options.set_columns(columns) + + res = plc.io.orc.read_orc(options) if columns is not None: pa_table = pa_table.select(columns) From 6eacae5b155ef53a8683e597110604800c168dfe Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Mon, 16 Dec 2024 13:36:12 -0800 Subject: [PATCH 2/3] add to __all__ --- python/pylibcudf/pylibcudf/io/orc.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/pylibcudf/pylibcudf/io/orc.pyx b/python/pylibcudf/pylibcudf/io/orc.pyx index 9319a244873..c125d7e76fa 100644 --- a/python/pylibcudf/pylibcudf/io/orc.pyx +++ b/python/pylibcudf/pylibcudf/io/orc.pyx @@ -46,6 +46,8 @@ __all__ = [ "read_orc", "read_parsed_orc_statistics", "write_orc", + "OrcReaderOptions", + 
"OrcReaderOptionsBuilder", "OrcWriterOptions", "OrcWriterOptionsBuilder", "OrcChunkedWriter", From 6e2a3a337b88af7d49e162aab0cbbbc369ea30a7 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Tue, 17 Dec 2024 20:24:30 -0500 Subject: [PATCH 3/3] add check for empty stripes --- python/cudf/cudf/io/orc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py index 3960df97748..5103137bc77 100644 --- a/python/cudf/cudf/io/orc.py +++ b/python/cudf/cudf/io/orc.py @@ -251,7 +251,7 @@ def read_orc( options.set_num_rows(num_rows) if skiprows >= 0: options.set_skip_rows(skiprows) - if stripes is not None: + if stripes is not None and len(stripes) > 0: options.set_stripes(stripes) if timestamp_type is not None: options.set_timestamp_type(