From 4e2f60c734e8fb2a3b38e9a41af52111ceea7915 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Tue, 12 Nov 2024 20:48:56 -0800 Subject: [PATCH 01/16] [WIP] Migrate ORC Writer to pylibcudf --- python/pylibcudf/pylibcudf/io/orc.pxd | 27 +++++++++++++++++++++++++- python/pylibcudf/pylibcudf/io/orc.pyi | 28 ++++++++++++++++++++++++++- 2 files changed, 53 insertions(+), 2 deletions(-) diff --git a/python/pylibcudf/pylibcudf/io/orc.pxd b/python/pylibcudf/pylibcudf/io/orc.pxd index b111d617b1b..db7dc5e99e8 100644 --- a/python/pylibcudf/pylibcudf/io/orc.pxd +++ b/python/pylibcudf/pylibcudf/io/orc.pxd @@ -4,12 +4,21 @@ from libcpp cimport bool from libcpp.optional cimport optional from libcpp.string cimport string from libcpp.vector cimport vector -from pylibcudf.io.types cimport SourceInfo, TableWithMetadata +from pylibcudf.io.types cimport ( + SourceInfo, + TableWithMetadata, + CompressionType, + StatisticsFreq, +) from pylibcudf.libcudf.io.orc_metadata cimport ( column_statistics, parsed_orc_statistics, statistics_type, ) +from pylibcudf.libcudf.io.orc cimport ( + orc_writer_options, + orc_writer_options_builder, +) from pylibcudf.libcudf.types cimport size_type from pylibcudf.types cimport DataType @@ -48,3 +57,19 @@ cdef class ParsedOrcStatistics: cpdef ParsedOrcStatistics read_parsed_orc_statistics( SourceInfo source_info ) + + +cdef class OrcWriterOptions: + cdef orc_writer_options c_obj + + @staticmethod + cdef OrcWriterOptionsBuilder builder(SinkInfo sink, Table table) + + +cdef class OrcWriterOptionsBuilder: + cdef orc_writer_options_builder c_obj + cpdef OrcWriterOptionsBuilder compression(self, CompressionType comp) + cpdef OrcWriterOptionsBuilder enable_statistics(self, StatisticsFreq val) + cpdef OrcWriterOptionsBuilder key_value_metadata(self, object kvm) + cpdef OrcWriterOptionsBuilder metadata(self, TableWithMetadata meta) + cpdef OrcWriterOptions build(self) diff --git a/python/pylibcudf/pylibcudf/io/orc.pyi b/python/pylibcudf/pylibcudf/io/orc.pyi index 4cf87f1a832..7e589f18d6f 100644 --- a/python/pylibcudf/pylibcudf/io/orc.pyi +++ b/python/pylibcudf/pylibcudf/io/orc.pyi @@ -2,7 +2,14 @@ from typing import Any -from pylibcudf.io.types import SourceInfo, TableWithMetadata +from pylibcudf.io.types import ( + CompressionType, + SinkInfo, + SourceInfo, + StatisticsFreq, + TableWithMetadata, +) +from pylibcudf.table import Table from pylibcudf.types import DataType def read_orc( @@ -39,3 +46,22 @@ class ParsedOrcStatistics: def read_parsed_orc_statistics( source_info: SourceInfo, ) -> ParsedOrcStatistics: ... + +class OrcWriterOptions: + def __init__(self): ... + @staticmethod + def builder(sink: SinkInfo, table: Table) -> OrcWriterOptionsBuilder: ... + +class OrcWriterOptionsBuilder: + def __init__(self): ... + def compression( + self, comp: CompressionType + ) -> OrcWriterOptionsBuilder: ... + def enable_statistics( + self, val: StatisticsFreq + ) -> OrcWriterOptionsBuilder: ... + def key_value_metadata(self, kvm: object) -> OrcWriterOptionsBuilder: ... + def metadata(self, meta: TableWithMetadata) -> OrcWriterOptionsBuilder: ... + def build(self) -> OrcWriterOptions: ... + +def write_orc(options: OrcWriterOptions) -> None: ... From 763b870d3122978d7826123a7027e0551fe6c802 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Mon, 18 Nov 2024 13:09:53 -0800 Subject: [PATCH 02/16] add orc chuncked writer --- python/cudf/cudf/_lib/orc.pyx | 165 ++++++++------ python/pylibcudf/pylibcudf/io/orc.pxd | 60 ++++- python/pylibcudf/pylibcudf/io/orc.pyi | 3 + python/pylibcudf/pylibcudf/io/orc.pyx | 134 ++++++++++- python/pylibcudf/pylibcudf/io/types.pxd | 36 +++ python/pylibcudf/pylibcudf/io/types.pyx | 211 +++++++++++++++++- .../pylibcudf/pylibcudf/tests/io/test_orc.py | 3 + 7 files changed, 516 insertions(+), 96 deletions(-) diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx index 32a5e463916..bb245dbd57b 100644 --- a/python/cudf/cudf/_lib/orc.pyx +++ b/python/cudf/cudf/_lib/orc.pyx @@ -3,11 +3,10 @@ from libc.stdint cimport int64_t from libcpp cimport bool, int from libcpp.map cimport map -from libcpp.memory cimport unique_ptr from libcpp.string cimport string from libcpp.utility cimport move from libcpp.vector cimport vector - +import itertools from collections import OrderedDict try: @@ -16,23 +15,13 @@ except ImportError: import json cimport pylibcudf.libcudf.lists.lists_column_view as cpp_lists_column_view -from pylibcudf.libcudf.io.data_sink cimport data_sink -from pylibcudf.libcudf.io.orc cimport ( - chunked_orc_writer_options, - orc_chunked_writer, - orc_writer_options, - write_orc as libcudf_write_orc, -) from pylibcudf.libcudf.io.types cimport ( column_in_metadata, - sink_info, - table_input_metadata, ) -from pylibcudf.libcudf.table.table_view cimport table_view from cudf._lib.column cimport Column -from cudf._lib.io.utils cimport make_sink_info, update_col_struct_field_names -from cudf._lib.utils cimport data_from_pylibcudf_io, table_view_from_table +from cudf._lib.io.utils cimport update_col_struct_field_names +from cudf._lib.utils cimport data_from_pylibcudf_io import pylibcudf as plc @@ -40,7 +29,8 @@ import cudf from cudf._lib.types import SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES from cudf._lib.utils import _index_level_name, generate_pandas_metadata from cudf.core.buffer import acquire_spill_lock - +from pylibcudf.io.types cimport TableInputMetadata, SinkInfo +from pylibcudf.io.orc cimport OrcChunkedWriter # TODO: Consider inlining this function since it seems to only be used in one place. cpdef read_parsed_orc_statistics(filepath_or_buffer): @@ -246,61 +236,70 @@ def write_orc( -------- cudf.read_orc """ - cdef unique_ptr[data_sink] data_sink_c - cdef sink_info sink_info_c = make_sink_info(path_or_buf, data_sink_c) - cdef table_input_metadata tbl_meta cdef map[string, string] user_data user_data[str.encode("pandas")] = str.encode(generate_pandas_metadata( table, index) ) - if index is True or ( index is None and not isinstance(table._index, cudf.RangeIndex) ): - tv = table_view_from_table(table) - tbl_meta = table_input_metadata(tv) + if table._index is not None: + plc_table = plc.Table( + [ + col.to_pylibcudf(mode="read") + for col in itertools.chain(table.index._columns, table._columns) + ] + ) + else: + plc_table = plc.Table( + [ + col.to_pylibcudf(mode="read") + for col in table._columns + ] + ) + tbl_meta = TableInputMetadata(plc_table) for level, idx_name in enumerate(table._index.names): - tbl_meta.column_metadata[level].set_name( - str.encode( - _index_level_name(idx_name, level, table._column_names) - ) + tbl_meta.c_obj.column_metadata[level].set_name( + str.encode(_index_level_name(idx_name, level, table._column_names)) ) num_index_cols_meta = len(table._index.names) else: - tv = table_view_from_table(table, ignore_index=True) - tbl_meta = table_input_metadata(tv) + plc_table = plc.Table( + [col.to_pylibcudf(mode="read") for col in table._columns] + ) + tbl_meta = TableInputMetadata(plc_table) num_index_cols_meta = 0 if cols_as_map_type is not None: cols_as_map_type = set(cols_as_map_type) for i, name in enumerate(table._column_names, num_index_cols_meta): - tbl_meta.column_metadata[i].set_name(name.encode()) + tbl_meta.c_obj.column_metadata[i].set_name(name.encode()) _set_col_children_metadata( table[name]._column, - tbl_meta.column_metadata[i], + tbl_meta.c_obj.column_metadata[i], (cols_as_map_type is not None) and (name in cols_as_map_type), ) - cdef orc_writer_options c_orc_writer_options = move( - orc_writer_options.builder( - sink_info_c, tv - ).metadata(tbl_meta) + options = ( + plc.io.orc.OrcWriterOptions.builder( + plc.io.SinkInfo([path_or_buf]), plc_table + ) + .metadata(tbl_meta) .key_value_metadata(move(user_data)) .compression(_get_comp_type(compression)) .enable_statistics(_get_orc_stat_freq(statistics)) .build() ) if stripe_size_bytes is not None: - c_orc_writer_options.set_stripe_size_bytes(stripe_size_bytes) + options.set_stripe_size_bytes(stripe_size_bytes) if stripe_size_rows is not None: - c_orc_writer_options.set_stripe_size_rows(stripe_size_rows) + options.set_stripe_size_rows(stripe_size_rows) if row_index_stride is not None: - c_orc_writer_options.set_row_index_stride(row_index_stride) + options.set_row_index_stride(row_index_stride) - with nogil: - libcudf_write_orc(c_orc_writer_options) + plc.io.orc.write_orc(options) cdef int64_t get_skiprows_arg(object arg) except*: @@ -326,13 +325,12 @@ cdef class ORCWriter: cudf.io.orc.to_orc """ cdef bool initialized - cdef unique_ptr[orc_chunked_writer] writer - cdef sink_info sink - cdef unique_ptr[data_sink] _data_sink + cdef OrcChunkedWriter writer + cdef SinkInfo sink cdef str statistics cdef object compression cdef object index - cdef table_input_metadata tbl_meta + cdef TableInputMetadata tbl_meta cdef object cols_as_map_type cdef object stripe_size_bytes cdef object stripe_size_rows @@ -347,8 +345,7 @@ cdef class ORCWriter: object stripe_size_bytes=None, object stripe_size_rows=None, object row_index_stride=None): - - self.sink = make_sink_info(path, self._data_sink) + self.sink = plc.io.SinkInfo([path]) self.statistics = statistics self.compression = compression self.index = index @@ -368,17 +365,23 @@ cdef class ORCWriter: table._index.name is not None or isinstance(table._index, cudf.core.multiindex.MultiIndex) ) - tv = table_view_from_table(table, not keep_index) + if keep_index: + columns = [ + col.to_pylibcudf(mode="read") + for col in itertools.chain(table.index._columns, table._columns) + ] + else: + columns = [col.to_pylibcudf(mode="read") for col in table._columns] - with nogil: - self.writer.get()[0].write(tv) + self.writer.write( + plc.Table(columns) + ) def close(self): if not self.initialized: return - with nogil: - self.writer.get()[0].close() + self.writer.close() def __dealloc__(self): self.close() @@ -387,35 +390,50 @@ cdef class ORCWriter: """ Prepare all the values required to build the chunked_orc_writer_options anb creates a writer""" - cdef table_view tv num_index_cols_meta = 0 - self.tbl_meta = table_input_metadata( - table_view_from_table(table, ignore_index=True), + plc_table = plc.Table( + [ + col.to_pylibcudf(mode="read") + for col in table._columns + ] ) + self.tbl_meta = TableInputMetadata(plc_table) if self.index is not False: if isinstance(table._index, cudf.core.multiindex.MultiIndex): - tv = table_view_from_table(table) - self.tbl_meta = table_input_metadata(tv) + plc_table = plc.Table( + [ + col.to_pylibcudf(mode="read") + for col in itertools.chain(table.index._columns, table._columns) + ] + ) + self.tbl_meta = TableInputMetadata(plc_table) for level, idx_name in enumerate(table._index.names): - self.tbl_meta.column_metadata[level].set_name( - (str.encode(idx_name)) + self.tbl_meta.c_obj.column_metadata[level].set_name( + str.encode(idx_name) ) num_index_cols_meta = len(table._index.names) else: if table._index.name is not None: - tv = table_view_from_table(table) - self.tbl_meta = table_input_metadata(tv) - self.tbl_meta.column_metadata[0].set_name( + plc_table = plc.Table( + [ + col.to_pylibcudf(mode="read") + for col in itertools.chain( + table.index._columns, table._columns + ) + ] + ) + self.tbl_meta = TableInputMetadata(plc_table) + self.tbl_meta.c_obj.column_metadata[0].set_name( str.encode(table._index.name) ) num_index_cols_meta = 1 for i, name in enumerate(table._column_names, num_index_cols_meta): - self.tbl_meta.column_metadata[i].set_name(name.encode()) + self.tbl_meta.c_obj.column_metadata[i].set_name(name.encode()) _set_col_children_metadata( table[name]._column, - self.tbl_meta.column_metadata[i], + self.tbl_meta.c_obj.column_metadata[i], (self.cols_as_map_type is not None) and (name in self.cols_as_map_type), ) @@ -424,23 +442,22 @@ cdef class ORCWriter: pandas_metadata = generate_pandas_metadata(table, self.index) user_data[str.encode("pandas")] = str.encode(pandas_metadata) - cdef chunked_orc_writer_options c_opts = move( - chunked_orc_writer_options.builder(self.sink) - .metadata(self.tbl_meta) - .key_value_metadata(move(user_data)) - .compression(_get_comp_type(self.compression)) - .enable_statistics(_get_orc_stat_freq(self.statistics)) - .build() - ) + options = ( + plc.io.orc.ChunkedOrcWriterOptions.builder(self.sink) + .metadata(self.tbl_meta) + .key_value_metadata(move(user_data)) + .compression(_get_comp_type(self.compression)) + .enable_statistics(_get_orc_stat_freq(self.statistics)) + .build() + ) if self.stripe_size_bytes is not None: - c_opts.set_stripe_size_bytes(self.stripe_size_bytes) + options.set_stripe_size_bytes(self.stripe_size_bytes) if self.stripe_size_rows is not None: - c_opts.set_stripe_size_rows(self.stripe_size_rows) + options.set_stripe_size_rows(self.stripe_size_rows) if self.row_index_stride is not None: - c_opts.set_row_index_stride(self.row_index_stride) + options.set_row_index_stride(self.row_index_stride) - with nogil: - self.writer.reset(new orc_chunked_writer(c_opts)) + self.writer = plc.io.orc.OrcChunkedWriter.from_options(options) self.initialized = True diff --git a/python/pylibcudf/pylibcudf/io/orc.pxd b/python/pylibcudf/pylibcudf/io/orc.pxd index db7dc5e99e8..0329bcecf54 100644 --- a/python/pylibcudf/pylibcudf/io/orc.pxd +++ b/python/pylibcudf/pylibcudf/io/orc.pxd @@ -4,11 +4,13 @@ from libcpp cimport bool from libcpp.optional cimport optional from libcpp.string cimport string from libcpp.vector cimport vector +from libcpp.memory cimport unique_ptr +from libcpp.map cimport map from pylibcudf.io.types cimport ( SourceInfo, + SinkInfo, TableWithMetadata, - CompressionType, - StatisticsFreq, + TableInputMetadata, ) from pylibcudf.libcudf.io.orc_metadata cimport ( column_statistics, @@ -16,12 +18,19 @@ from pylibcudf.libcudf.io.orc_metadata cimport ( statistics_type, ) from pylibcudf.libcudf.io.orc cimport ( + orc_chunked_writer, orc_writer_options, orc_writer_options_builder, + chunked_orc_writer_options, + chunked_orc_writer_options_builder, ) from pylibcudf.libcudf.types cimport size_type from pylibcudf.types cimport DataType - +from pylibcudf.table cimport Table +from pylibcudf.libcudf.io.types cimport ( + compression_type, + statistics_freq, +) cpdef TableWithMetadata read_orc( SourceInfo source_info, @@ -58,18 +67,45 @@ cpdef ParsedOrcStatistics read_parsed_orc_statistics( SourceInfo source_info ) - cdef class OrcWriterOptions: cdef orc_writer_options c_obj - - @staticmethod - cdef OrcWriterOptionsBuilder builder(SinkInfo sink, Table table) - + cdef Table table + cdef SinkInfo sink + cpdef void set_stripe_size_bytes(self, size_t size_bytes) + cpdef void set_stripe_size_rows(self, size_t size_rows) + cpdef void set_row_index_stride(self, size_t stride) cdef class OrcWriterOptionsBuilder: cdef orc_writer_options_builder c_obj - cpdef OrcWriterOptionsBuilder compression(self, CompressionType comp) - cpdef OrcWriterOptionsBuilder enable_statistics(self, StatisticsFreq val) - cpdef OrcWriterOptionsBuilder key_value_metadata(self, object kvm) - cpdef OrcWriterOptionsBuilder metadata(self, TableWithMetadata meta) + cdef Table table + cdef SinkInfo sink + cpdef OrcWriterOptionsBuilder compression(self, compression_type comp) + cpdef OrcWriterOptionsBuilder enable_statistics(self, statistics_freq val) + cpdef OrcWriterOptionsBuilder key_value_metadata(self, map[string, string] kvm) + cpdef OrcWriterOptionsBuilder metadata(self, TableInputMetadata meta) cpdef OrcWriterOptions build(self) + +cpdef void write_orc(OrcWriterOptions options) + +cdef class OrcChunkedWriter: + cdef unique_ptr[orc_chunked_writer] c_obj + cpdef void close(self) + cpdef write(self, Table table) + +cdef class ChunkedOrcWriterOptions: + cdef chunked_orc_writer_options c_obj + cdef SinkInfo sink + cpdef void set_stripe_size_bytes(self, size_t size_bytes) + cpdef void set_stripe_size_rows(self, size_t size_rows) + cpdef void set_row_index_stride(self, size_t stride) + +cdef class ChunkedOrcWriterOptionsBuilder: + cdef chunked_orc_writer_options_builder c_obj + cdef SinkInfo sink + cpdef ChunkedOrcWriterOptionsBuilder compression(self, compression_type comp) + cpdef ChunkedOrcWriterOptionsBuilder enable_statistics(self, statistics_freq val) + cpdef ChunkedOrcWriterOptionsBuilder key_value_metadata( + self, map[string, string] kvm + ) + cpdef ChunkedOrcWriterOptionsBuilder metadata(self, TableInputMetadata meta) + cpdef ChunkedOrcWriterOptions build(self) diff --git a/python/pylibcudf/pylibcudf/io/orc.pyi b/python/pylibcudf/pylibcudf/io/orc.pyi index 7e589f18d6f..3d8b273f7b0 100644 --- a/python/pylibcudf/pylibcudf/io/orc.pyi +++ b/python/pylibcudf/pylibcudf/io/orc.pyi @@ -49,6 +49,9 @@ def read_parsed_orc_statistics( class OrcWriterOptions: def __init__(self): ... + def set_stripe_size_bytes(self, size_bytes: int) -> None: ... + def set_stripe_size_rows(self, size_rows: int) -> None: ... + def set_row_index_stride(self, stride: int) -> None: ... @staticmethod def builder(sink: SinkInfo, table: Table) -> OrcWriterOptionsBuilder: ... diff --git a/python/pylibcudf/pylibcudf/io/orc.pyx b/python/pylibcudf/pylibcudf/io/orc.pyx index 4270f5b4f95..903f6e88a45 100644 --- a/python/pylibcudf/pylibcudf/io/orc.pyx +++ b/python/pylibcudf/pylibcudf/io/orc.pyx @@ -6,10 +6,11 @@ from libcpp.vector cimport vector import datetime -from pylibcudf.io.types cimport SourceInfo, TableWithMetadata +from pylibcudf.io.types cimport SourceInfo, TableWithMetadata, SinkInfo from pylibcudf.libcudf.io.orc cimport ( orc_reader_options, read_orc as cpp_read_orc, + write_orc as cpp_write_orc, ) from pylibcudf.libcudf.io.orc_metadata cimport ( binary_statistics, @@ -29,12 +30,22 @@ from pylibcudf.libcudf.io.types cimport table_with_metadata from pylibcudf.libcudf.types cimport size_type from pylibcudf.types cimport DataType from pylibcudf.variant cimport get_if, holds_alternative +from pylibcudf.libcudf.io.types cimport ( + compression_type, + statistics_freq, +) +from pylibcudf.libcudf.io.orc cimport ( + orc_chunked_writer, + orc_writer_options, + chunked_orc_writer_options, +) __all__ = [ "OrcColumnStatistics", "ParsedOrcStatistics", "read_orc", "read_parsed_orc_statistics", + "write_orc", ] cdef class OrcColumnStatistics: @@ -310,3 +321,124 @@ cpdef ParsedOrcStatistics read_parsed_orc_statistics( cpp_read_parsed_orc_statistics(source_info.c_obj) ) return ParsedOrcStatistics.from_libcudf(parsed) + + +cdef class OrcWriterOptions: + cpdef void set_stripe_size_bytes(self, size_t size_bytes): + self.c_obj.set_stripe_size_bytes(size_bytes) + + cpdef void set_stripe_size_rows(self, size_t size_rows): + self.c_obj.set_stripe_size_rows(size_rows) + + cpdef void set_row_index_stride(self, size_t stride): + self.c_obj.set_row_index_stride(stride) + + @staticmethod + def builder(SinkInfo sink, Table table): + cdef OrcWriterOptionsBuilder orc_builder = OrcWriterOptionsBuilder.__new__( + OrcWriterOptionsBuilder + ) + orc_builder.c_obj = orc_writer_options.builder(sink.c_obj, table.view()) + orc_builder.table = table + orc_builder.sink = sink + return orc_builder + + +cdef class OrcWriterOptionsBuilder: + cpdef OrcWriterOptionsBuilder compression(self, compression_type comp): + self.c_obj.compression(comp) + return self + + cpdef OrcWriterOptionsBuilder enable_statistics(self, statistics_freq val): + self.c_obj.enable_statistics(val) + return self + + cpdef OrcWriterOptionsBuilder key_value_metadata(self, map[string, string] kvm): + self.c_obj.key_value_metadata(kvm) + return self + + cpdef OrcWriterOptionsBuilder metadata(self, TableInputMetadata meta): + self.c_obj.metadata(meta.c_obj) + return self + + cpdef OrcWriterOptions build(self): + cdef OrcWriterOptions orc_options = OrcWriterOptions.__new__( + OrcWriterOptions + ) + orc_options.c_obj = move(self.c_obj.build()) + orc_options.table = self.table + orc_options.sink = self.sink + return orc_options + + +cpdef void write_orc(OrcWriterOptions options): + with nogil: + cpp_write_orc(move(options.c_obj)) + + +cdef class OrcChunkedWriter: + cpdef void close(self): + with nogil: + self.c_obj.get()[0].close() + + cpdef write(self, Table table): + with nogil: + self.c_obj.get()[0].write(table.view()) + + @staticmethod + def from_options(ChunkedOrcWriterOptions options): + cdef OrcChunkedWriter orc_writer = OrcChunkedWriter.__new__( + OrcChunkedWriter + ) + orc_writer.c_obj.reset(new orc_chunked_writer(options.c_obj)) + return orc_writer + + +cdef class ChunkedOrcWriterOptions: + cpdef void set_stripe_size_bytes(self, size_t size_bytes): + self.c_obj.set_stripe_size_bytes(size_bytes) + + cpdef void set_stripe_size_rows(self, size_t size_rows): + self.c_obj.set_stripe_size_rows(size_rows) + + cpdef void set_row_index_stride(self, size_t stride): + self.c_obj.set_row_index_stride(stride) + + @staticmethod + def builder(SinkInfo sink): + cdef ChunkedOrcWriterOptionsBuilder orc_builder = \ + ChunkedOrcWriterOptionsBuilder.__new__( + ChunkedOrcWriterOptionsBuilder + ) + orc_builder.c_obj = chunked_orc_writer_options.builder(sink.c_obj) + orc_builder.sink = sink + return orc_builder + + +cdef class ChunkedOrcWriterOptionsBuilder: + cpdef ChunkedOrcWriterOptionsBuilder compression(self, compression_type comp): + self.c_obj.compression(comp) + return self + + cpdef ChunkedOrcWriterOptionsBuilder enable_statistics(self, statistics_freq val): + self.c_obj.enable_statistics(val) + return self + + cpdef ChunkedOrcWriterOptionsBuilder key_value_metadata( + self, + map[string, string] kvm + ): + self.c_obj.key_value_metadata(kvm) + return self + + cpdef ChunkedOrcWriterOptionsBuilder metadata(self, TableInputMetadata meta): + self.c_obj.metadata(meta.c_obj) + return self + + cpdef ChunkedOrcWriterOptions build(self): + cdef ChunkedOrcWriterOptions orc_options = ChunkedOrcWriterOptions.__new__( + ChunkedOrcWriterOptions + ) + orc_options.c_obj = move(self.c_obj.build()) + orc_options.sink = self.sink + return orc_options diff --git a/python/pylibcudf/pylibcudf/io/types.pxd b/python/pylibcudf/pylibcudf/io/types.pxd index 0ab28cb0973..6e42c9c309b 100644 --- a/python/pylibcudf/pylibcudf/io/types.pxd +++ b/python/pylibcudf/pylibcudf/io/types.pxd @@ -1,6 +1,7 @@ # Copyright (c) 2024, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector +from libcpp cimport bool from pylibcudf.libcudf.io.data_sink cimport data_sink from pylibcudf.libcudf.io.types cimport ( column_encoding, @@ -45,3 +46,38 @@ cdef class SinkInfo: # This vector just exists to keep the unique_ptrs to the sinks alive cdef vector[unique_ptr[data_sink]] sink_storage cdef sink_info c_obj + +cdef class ColumnInMetadata: + cdef column_in_metadata c_obj + + cpdef ColumnInMetadata set_name(self, str name) + + cpdef ColumnInMetadata set_name(self, str name) + + cpdef ColumnInMetadata set_nullability(self, bool nullable) + + cpdef ColumnInMetadata set_list_column_as_map(self) + + cpdef ColumnInMetadata set_int96_timestamps(self, bool req) + + cpdef ColumnInMetadata set_decimal_precision(self, int req) + + cpdef ColumnInMetadata child(self, int i) + + cpdef ColumnInMetadata set_output_as_binary(self, bool binary) + + cpdef ColumnInMetadata set_type_length(self, int type_length) + + cpdef ColumnInMetadata set_skip_compression(self, bool skip) + + cpdef ColumnInMetadata set_encoding(self, column_encoding encoding) + + cpdef str get_name(self) + + @staticmethod + cdef ColumnInMetadata from_libcudf(column_in_metadata data) + +cdef class TableInputMetadata: + cdef public Table table + cdef table_input_metadata c_obj + cdef list column_metadata diff --git a/python/pylibcudf/pylibcudf/io/types.pyx b/python/pylibcudf/pylibcudf/io/types.pyx index 7a3f16c4c50..e55ca330d46 100644 --- a/python/pylibcudf/pylibcudf/io/types.pyx +++ b/python/pylibcudf/pylibcudf/io/types.pyx @@ -14,6 +14,8 @@ from pylibcudf.libcudf.io.types cimport ( host_buffer, source_info, table_with_metadata, + column_in_metadata, + table_input_metadata, ) import codecs @@ -287,25 +289,30 @@ cdef class SinkInfo: cdef object initial_sink_cls = type(sinks[0]) - if not all(isinstance(s, initial_sink_cls) for s in sinks): + if not all(isinstance(s, initial_sink_cls) or + (isinstance(sinks[0], io.IOBase) and isinstance(s, io.IOBase)) + for s in sinks): raise ValueError("All sinks must be of the same type!") - if initial_sink_cls in {io.StringIO, io.BytesIO, io.TextIOBase}: + if isinstance(sinks[0], io.IOBase): data_sinks.reserve(len(sinks)) - if isinstance(sinks[0], (io.StringIO, io.BytesIO)): - for s in sinks: + for s in sinks: + if isinstance(s, (io.StringIO, io.BytesIO)): self.sink_storage.push_back( unique_ptr[data_sink](new iobase_data_sink(s)) ) - elif isinstance(sinks[0], io.TextIOBase): - for s in sinks: - if codecs.lookup(s).name not in ('utf-8', 'ascii'): + elif isinstance(s, io.TextIOBase): + if codecs.lookup(s.encoding).name not in ('utf-8', 'ascii'): raise NotImplementedError(f"Unsupported encoding {s.encoding}") self.sink_storage.push_back( unique_ptr[data_sink](new iobase_data_sink(s.buffer)) ) - data_sinks.push_back(self.sink_storage.back().get()) - elif initial_sink_cls is str: + else: + self.sink_storage.push_back( + unique_ptr[data_sink](new iobase_data_sink(s)) + ) + data_sinks.push_back(self.sink_storage.back().get()) + elif isinstance(sinks[0], str): paths.reserve(len(sinks)) for s in sinks: paths.push_back( s.encode()) @@ -321,3 +328,189 @@ cdef class SinkInfo: self.c_obj = sink_info(paths) __hash__ = None + + +cdef class ColumnInMetadata: + """ + Metadata for a column + Parameters + ---------- + metadata : column_in_metadata + """ + + def __init__(self): + raise ValueError( + "ColumnInMetadata should not be constructed directly. " + "Use one of the factories." + ) + + @staticmethod + cdef ColumnInMetadata from_libcudf(column_in_metadata data): + """Create a Python ColumnInMetadata from a libcudf column_in_metadata.""" + cdef ColumnInMetadata out = ColumnInMetadata.__new__(ColumnInMetadata) + out.c_obj = move(data) + return out + + cpdef ColumnInMetadata set_name(self, str name): + """ + Set the name of this column. + Parameters + ---------- + name : str + Name of the column + Returns + ------- + Self + """ + self.c_obj.set_name(name.encode()) + return self + + cpdef ColumnInMetadata set_nullability(self, bool nullable): + """ + Set the nullability of this column. + Parameters + ---------- + nullable : bool + Whether this column is nullable + Returns + ------- + Self + """ + self.c_obj.set_nullability(nullable) + return self + + cpdef ColumnInMetadata set_list_column_as_map(self): + """ + Specify that this list column should be encoded as a map in the + written file. + Returns + ------- + Self + """ + self.c_obj.set_list_column_as_map() + return self + + cpdef ColumnInMetadata set_int96_timestamps(self, bool req): + """ + Specifies whether this timestamp column should be encoded using + the deprecated int96. + Parameters + ---------- + req : bool + True = use int96 physical type. False = use int64 physical type. + Returns + ------- + Self + """ + self.c_obj.set_int96_timestamps(req) + return self + + cpdef ColumnInMetadata set_decimal_precision(self, int precision): + """ + Set the decimal precision of this column. + Only valid if this column is a decimal (fixed-point) type. + Parameters + ---------- + precision : int + The integer precision to set for this decimal column + Returns + ------- + Self + """ + self.c_obj.set_decimal_precision(precision) + return self + + cpdef ColumnInMetadata child(self, int i): + """ + Get reference to a child of this column. + Parameters + ---------- + i : int + Index of the child to get. + Returns + ------- + ColumnInMetadata + """ + return ColumnInMetadata.from_libcudf( + move(self.c_obj.child(i)) + ) + + cpdef ColumnInMetadata set_output_as_binary(self, bool binary): + """ + Specifies whether this column should be written as binary or string data. + Parameters + ---------- + binary : bool + True = use binary data type. False = use string data type + Returns + ------- + Self + """ + self.c_obj.set_output_as_binary(binary) + return self + + cpdef ColumnInMetadata set_type_length(self, int type_length): + """ + Sets the length of fixed length data. + Parameters + ---------- + type_length : int + Size of the data type in bytes + Returns + ------- + Self + """ + self.c_obj.set_type_length(type_length) + return self + + cpdef ColumnInMetadata set_skip_compression(self, bool skip): + """ + Specifies whether this column should not be compressed + regardless of the compression. + Parameters + ---------- + skip : bool + If `true` do not compress this column + Returns + ------- + Self + """ + self.c_obj.set_skip_compression(skip) + return self + + cpdef ColumnInMetadata set_encoding(self, column_encoding encoding): + """ + Specifies whether this column should not be compressed + regardless of the compression. + Parameters + ---------- + encoding : ColumnEncoding + The encoding to use + Returns + ------- + ColumnInMetadata + """ + self.c_obj.set_encoding(encoding) + return self + + cpdef str get_name(self): + """ + Get the name of this column. + Returns + ------- + str + The name of this column + """ + return self.c_obj.get_name().decode() + + +cdef class TableInputMetadata: + """ + Metadata for a table + Parameters + ---------- + table : Table + The Table to construct metadata for + """ + def __cinit__(self, Table table): + self.c_obj = table_input_metadata(table.view()) diff --git a/python/pylibcudf/pylibcudf/tests/io/test_orc.py b/python/pylibcudf/pylibcudf/tests/io/test_orc.py index 5ed660ba6cf..dfa41840752 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_orc.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_orc.py @@ -52,3 +52,6 @@ def test_read_orc_basic( ) assert_table_and_meta_eq(pa_table, res, check_field_nullability=False) + +def test_write_orc(): + pass \ No newline at end of file From 120ec07bf0efe1f226bd9badc0e1fd66b6489bcb Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Mon, 18 Nov 2024 13:15:24 -0800 Subject: [PATCH 03/16] check style --- python/pylibcudf/pylibcudf/tests/io/test_orc.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/pylibcudf/pylibcudf/tests/io/test_orc.py b/python/pylibcudf/pylibcudf/tests/io/test_orc.py index dfa41840752..09b7d39714e 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_orc.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_orc.py @@ -53,5 +53,6 @@ def test_read_orc_basic( assert_table_and_meta_eq(pa_table, res, check_field_nullability=False) + def test_write_orc(): - pass \ No newline at end of file + pass From 9f208206ef7191f1b2c0118e623be06d5e016e8e Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Mon, 18 Nov 2024 19:50:33 -0800 Subject: [PATCH 04/16] add a test --- python/cudf/cudf/_lib/orc.pyx | 15 ++--- python/pylibcudf/pylibcudf/io/orc.pxd | 4 +- python/pylibcudf/pylibcudf/io/orc.pyx | 12 ++-- python/pylibcudf/pylibcudf/io/types.pyx | 2 +- .../pylibcudf/pylibcudf/tests/io/test_orc.py | 55 ++++++++++++++++++- 5 files changed, 70 insertions(+), 18 deletions(-) diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx index bb245dbd57b..a8006113863 100644 --- a/python/cudf/cudf/_lib/orc.pyx +++ b/python/cudf/cudf/_lib/orc.pyx @@ -4,7 +4,6 @@ from libc.stdint cimport int64_t from libcpp cimport bool, int from libcpp.map cimport map from libcpp.string cimport string -from libcpp.utility cimport move from libcpp.vector cimport vector import itertools from collections import OrderedDict @@ -236,10 +235,8 @@ def write_orc( -------- cudf.read_orc """ - cdef map[string, string] user_data - user_data[str.encode("pandas")] = str.encode(generate_pandas_metadata( - table, index) - ) + user_data = {} + user_data["pandas"] = generate_pandas_metadata(table, index) if index is True or ( index is None and not isinstance(table._index, cudf.RangeIndex) ): @@ -287,7 +284,7 @@ def write_orc( plc.io.SinkInfo([path_or_buf]), plc_table ) .metadata(tbl_meta) - .key_value_metadata(move(user_data)) + .key_value_metadata(user_data) .compression(_get_comp_type(compression)) .enable_statistics(_get_orc_stat_freq(statistics)) .build() @@ -438,14 +435,14 @@ cdef class ORCWriter: and (name in self.cols_as_map_type), ) - cdef map[string, string] user_data + user_data = {} pandas_metadata = generate_pandas_metadata(table, self.index) - user_data[str.encode("pandas")] = str.encode(pandas_metadata) + user_data["pandas"] = pandas_metadata options = ( plc.io.orc.ChunkedOrcWriterOptions.builder(self.sink) .metadata(self.tbl_meta) - .key_value_metadata(move(user_data)) + .key_value_metadata(user_data) .compression(_get_comp_type(self.compression)) .enable_statistics(_get_orc_stat_freq(self.statistics)) .build() diff --git a/python/pylibcudf/pylibcudf/io/orc.pxd b/python/pylibcudf/pylibcudf/io/orc.pxd index 0329bcecf54..dbc6e9dee10 100644 --- a/python/pylibcudf/pylibcudf/io/orc.pxd +++ b/python/pylibcudf/pylibcudf/io/orc.pxd @@ -81,7 +81,7 @@ cdef class OrcWriterOptionsBuilder: cdef SinkInfo sink cpdef OrcWriterOptionsBuilder compression(self, compression_type comp) cpdef OrcWriterOptionsBuilder enable_statistics(self, statistics_freq val) - cpdef OrcWriterOptionsBuilder key_value_metadata(self, map[string, string] kvm) + cpdef OrcWriterOptionsBuilder key_value_metadata(self, dict kvm) cpdef OrcWriterOptionsBuilder metadata(self, TableInputMetadata meta) cpdef OrcWriterOptions build(self) @@ -105,7 +105,7 @@ cdef class ChunkedOrcWriterOptionsBuilder: cpdef ChunkedOrcWriterOptionsBuilder compression(self, compression_type comp) cpdef ChunkedOrcWriterOptionsBuilder enable_statistics(self, statistics_freq val) cpdef ChunkedOrcWriterOptionsBuilder key_value_metadata( - self, map[string, string] kvm + self, dict kvm ) cpdef ChunkedOrcWriterOptionsBuilder metadata(self, TableInputMetadata meta) cpdef ChunkedOrcWriterOptions build(self) diff --git a/python/pylibcudf/pylibcudf/io/orc.pyx b/python/pylibcudf/pylibcudf/io/orc.pyx index 903f6e88a45..5e971418737 100644 --- a/python/pylibcudf/pylibcudf/io/orc.pyx +++ b/python/pylibcudf/pylibcudf/io/orc.pyx @@ -353,8 +353,10 @@ cdef class OrcWriterOptionsBuilder: self.c_obj.enable_statistics(val) return self - cpdef OrcWriterOptionsBuilder key_value_metadata(self, map[string, string] kvm): - self.c_obj.key_value_metadata(kvm) + cpdef OrcWriterOptionsBuilder key_value_metadata(self, dict kvm): + self.c_obj.key_value_metadata( + {key.encode(): value.encode() for key, value in kvm.items()} + ) return self cpdef OrcWriterOptionsBuilder metadata(self, TableInputMetadata meta): @@ -426,9 +428,11 @@ cdef class ChunkedOrcWriterOptionsBuilder: cpdef ChunkedOrcWriterOptionsBuilder key_value_metadata( self, - map[string, string] kvm + dict kvm ): - self.c_obj.key_value_metadata(kvm) + self.c_obj.key_value_metadata( + {key.encode(): value.encode() for key, value in kvm.items()} + ) return self cpdef ChunkedOrcWriterOptionsBuilder metadata(self, TableInputMetadata meta): diff --git a/python/pylibcudf/pylibcudf/io/types.pyx b/python/pylibcudf/pylibcudf/io/types.pyx index e55ca330d46..c72fe3d4aa0 100644 --- a/python/pylibcudf/pylibcudf/io/types.pyx +++ b/python/pylibcudf/pylibcudf/io/types.pyx @@ -512,5 +512,5 @@ cdef class TableInputMetadata: table : Table The Table to construct metadata for """ - def __cinit__(self, Table table): + def __init__(self, Table table): self.c_obj = table_input_metadata(table.view()) diff --git a/python/pylibcudf/pylibcudf/tests/io/test_orc.py b/python/pylibcudf/pylibcudf/tests/io/test_orc.py index 09b7d39714e..c70ed22dd6d 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_orc.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_orc.py @@ -1,4 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION. +import io + import pyarrow as pa import pytest from utils import _convert_types, assert_table_and_meta_eq, make_source @@ -54,5 +56,54 @@ def test_read_orc_basic( assert_table_and_meta_eq(pa_table, res, check_field_nullability=False) -def test_write_orc(): - pass +@pytest.mark.parametrize( + "compression", + [ + plc.io.types.CompressionType.NONE, + plc.io.types.CompressionType.SNAPPY, + ], +) +@pytest.mark.parametrize( + "statistics", + [ + plc.io.types.StatisticsFreq.STATISTICS_NONE, + plc.io.types.StatisticsFreq.STATISTICS_COLUMN, + ], +) +@pytest.mark.parametrize("stripe_size_bytes", [None, 65536]) +@pytest.mark.parametrize("stripe_size_rows", [None, 512]) +@pytest.mark.parametrize("row_index_stride", [None, 512]) +def test_write_orc( + compression, + statistics, + stripe_size_bytes, + stripe_size_rows, + row_index_stride, +): + names = ["a", "b"] + pa_table = pa.Table.from_arrays( + [pa.array([1.0, 2.0, None]), pa.array([True, None, False])], + names=names, + ) + plc_table = plc.interop.from_arrow(pa_table) + tbl_meta = plc.io.types.TableInputMetadata(plc_table) + sink = plc.io.SinkInfo([io.BytesIO()]) + user_data = {"foo": "{'bar': 'baz'}"} + options = ( + plc.io.orc.OrcWriterOptions.builder(sink, plc_table) + .metadata(tbl_meta) + .key_value_metadata(user_data) + .compression(compression) + .enable_statistics(statistics) + .build() + ) + if stripe_size_bytes is not None: + options.set_stripe_size_bytes(stripe_size_bytes) + if stripe_size_rows is not None: + options.set_stripe_size_rows(stripe_size_rows) + if row_index_stride is not None: + options.set_row_index_stride(row_index_stride) + + plc.io.orc.write_orc(options) + + # pd.read_orc(...) From b9047479ecf605065a4b048ea55440b2afd2370a Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Tue, 19 Nov 2024 06:31:33 -0800 Subject: [PATCH 05/16] clean up, address review --- python/cudf/cudf/_lib/orc.pyx | 22 +++--------- python/pylibcudf/pylibcudf/io/orc.pyi | 34 ++++++++++++++++++- python/pylibcudf/pylibcudf/io/orc.pyx | 5 +++ .../pylibcudf/pylibcudf/tests/io/test_orc.py | 31 +++++++++++------ 4 files changed, 64 insertions(+), 28 deletions(-) diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx index a8006113863..28339e6d3f6 100644 --- a/python/cudf/cudf/_lib/orc.pyx +++ b/python/cudf/cudf/_lib/orc.pyx @@ -240,20 +240,10 @@ def write_orc( if index is True or ( index is None and not isinstance(table._index, cudf.RangeIndex) ): - if table._index is not None: - plc_table = plc.Table( - [ - col.to_pylibcudf(mode="read") - for col in itertools.chain(table.index._columns, table._columns) - ] - ) - else: - plc_table = plc.Table( - [ - col.to_pylibcudf(mode="read") - for col in table._columns - ] - ) + columns = table._columns if table._index is None else [ + *table.index._columns, *table._columns + ] + plc_table = plc.Table([col.to_pylibcudf(mode="read") for col in columns]) tbl_meta = TableInputMetadata(plc_table) for level, idx_name in enumerate(table._index.names): tbl_meta.c_obj.column_metadata[level].set_name( @@ -370,9 +360,7 @@ cdef class ORCWriter: else: columns = [col.to_pylibcudf(mode="read") for col in table._columns] - self.writer.write( - plc.Table(columns) - ) + self.writer.write(plc.Table(columns)) def close(self): if not self.initialized: diff --git a/python/pylibcudf/pylibcudf/io/orc.pyi b/python/pylibcudf/pylibcudf/io/orc.pyi index 3d8b273f7b0..3165bb8675f 100644 --- a/python/pylibcudf/pylibcudf/io/orc.pyi +++ b/python/pylibcudf/pylibcudf/io/orc.pyi @@ -7,6 +7,7 @@ from pylibcudf.io.types import ( SinkInfo, SourceInfo, StatisticsFreq, + TableInputMetadata, TableWithMetadata, ) from pylibcudf.table import Table @@ -63,8 +64,39 @@ class OrcWriterOptionsBuilder: def enable_statistics( self, val: StatisticsFreq ) -> OrcWriterOptionsBuilder: ... - def key_value_metadata(self, kvm: object) -> OrcWriterOptionsBuilder: ... + def key_value_metadata( + self, kvm: dict[str, str] + ) -> OrcWriterOptionsBuilder: ... def metadata(self, meta: TableWithMetadata) -> OrcWriterOptionsBuilder: ... def build(self) -> OrcWriterOptions: ... def write_orc(options: OrcWriterOptions) -> None: ... + +class OrcChunkedWriter: + def __init__(self): ... + def close(self) -> None: ... + def write(self, table: Table) -> None: ... + +class ChunkedOrcWriterOptions: + def __init__(self): ... + def set_stripe_size_bytes(self, size_bytes: int) -> None: ... + def set_stripe_size_rows(self, size_rows: int) -> None: ... + def set_row_index_stride(self, stride: int) -> None: ... + @staticmethod + def builder(sink: SinkInfo) -> ChunkedOrcWriterOptionsBuilder: ... + +class ChunkedOrcWriterOptionsBuilder: + def __init__(self): ... + def compression( + self, comp: CompressionType + ) -> ChunkedOrcWriterOptionsBuilder: ... + def enable_statistics( + self, val: StatisticsFreq + ) -> ChunkedOrcWriterOptionsBuilder: ... + def key_value_metadata( + self, kvm: dict[str, str] + ) -> ChunkedOrcWriterOptionsBuilder: ... + def metadata( + self, meta: TableInputMetadata + ) -> ChunkedOrcWriterOptionsBuilder: ... + def build(self) -> ChunkedOrcWriterOptions: ... diff --git a/python/pylibcudf/pylibcudf/io/orc.pyx b/python/pylibcudf/pylibcudf/io/orc.pyx index 5e971418737..96a37ab1bff 100644 --- a/python/pylibcudf/pylibcudf/io/orc.pyx +++ b/python/pylibcudf/pylibcudf/io/orc.pyx @@ -46,6 +46,11 @@ __all__ = [ "read_orc", "read_parsed_orc_statistics", "write_orc", + "OrcWriterOptions", + "OrcWriterOptionsBuilder", + "OrcChunkedWriter", + "ChunkedOrcWriterOptions", + "ChunkedOrcWriterOptionsBuilder", ] cdef class OrcColumnStatistics: diff --git a/python/pylibcudf/pylibcudf/tests/io/test_orc.py b/python/pylibcudf/pylibcudf/tests/io/test_orc.py index c70ed22dd6d..5a967ba001e 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_orc.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_orc.py @@ -1,6 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -import io +import pandas as pd import pyarrow as pa import pytest from utils import _convert_types, assert_table_and_meta_eq, make_source @@ -73,22 +73,25 @@ def test_read_orc_basic( @pytest.mark.parametrize("stripe_size_bytes", [None, 65536]) @pytest.mark.parametrize("stripe_size_rows", [None, 512]) @pytest.mark.parametrize("row_index_stride", [None, 512]) -def test_write_orc( +def test_roundtrip_pd_dataframe( compression, statistics, stripe_size_bytes, stripe_size_rows, row_index_stride, + tmp_path, ): - names = ["a", "b"] - pa_table = pa.Table.from_arrays( - [pa.array([1.0, 2.0, None]), pa.array([True, None, False])], - names=names, - ) + pdf = pd.DataFrame({"a": [1.0, 2.0, None], "b": [True, None, False]}) + + pa_table = pa.Table.from_pandas(pdf) plc_table = plc.interop.from_arrow(pa_table) + + tmpfile_name = tmp_path / "test.orc" + + sink = plc.io.SinkInfo([str(tmpfile_name)]) + tbl_meta = plc.io.types.TableInputMetadata(plc_table) - sink = plc.io.SinkInfo([io.BytesIO()]) - user_data = {"foo": "{'bar': 'baz'}"} + user_data = {"a": "", "b": ""} options = ( plc.io.orc.OrcWriterOptions.builder(sink, plc_table) .metadata(tbl_meta) @@ -106,4 +109,12 @@ def test_write_orc( plc.io.orc.write_orc(options) - # pd.read_orc(...) + expected_pdf = pd.read_orc(tmpfile_name) + expected_pdf.columns = pdf.columns + + res = plc.io.types.TableWithMetadata( + plc.interop.from_arrow(pa.Table.from_pandas(expected_pdf)), + [(name, []) for name in pdf.columns], + ) + + assert_table_and_meta_eq(pa_table, res, check_field_nullability=False) From 5fc0eece0e73b8875d28656e192320dab7dc4521 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Tue, 19 Nov 2024 11:47:29 -0800 Subject: [PATCH 06/16] use a pointer instead --- python/cudf/cudf/_lib/orc.pyx | 29 +++++++++---------- python/pylibcudf/pylibcudf/io/types.pxd | 4 +-- python/pylibcudf/pylibcudf/io/types.pyx | 37 +++++++++++++++---------- 3 files changed, 37 insertions(+), 33 deletions(-) diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx index 28339e6d3f6..c829cac6409 100644 --- a/python/cudf/cudf/_lib/orc.pyx +++ b/python/cudf/cudf/_lib/orc.pyx @@ -14,9 +14,6 @@ except ImportError: import json cimport pylibcudf.libcudf.lists.lists_column_view as cpp_lists_column_view -from pylibcudf.libcudf.io.types cimport ( - column_in_metadata, -) from cudf._lib.column cimport Column from cudf._lib.io.utils cimport update_col_struct_field_names @@ -28,7 +25,7 @@ import cudf from cudf._lib.types import SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES from cudf._lib.utils import _index_level_name, generate_pandas_metadata from cudf.core.buffer import acquire_spill_lock -from pylibcudf.io.types cimport TableInputMetadata, SinkInfo +from pylibcudf.io.types cimport TableInputMetadata, SinkInfo, ColumnInMetadata from pylibcudf.io.orc cimport OrcChunkedWriter # TODO: Consider inlining this function since it seems to only be used in one place. @@ -246,8 +243,8 @@ def write_orc( plc_table = plc.Table([col.to_pylibcudf(mode="read") for col in columns]) tbl_meta = TableInputMetadata(plc_table) for level, idx_name in enumerate(table._index.names): - tbl_meta.c_obj.column_metadata[level].set_name( - str.encode(_index_level_name(idx_name, level, table._column_names)) + tbl_meta.column_metadata[level].set_name( + _index_level_name(idx_name, level, table._column_names) ) num_index_cols_meta = len(table._index.names) else: @@ -261,10 +258,10 @@ def write_orc( cols_as_map_type = set(cols_as_map_type) for i, name in enumerate(table._column_names, num_index_cols_meta): - tbl_meta.c_obj.column_metadata[i].set_name(name.encode()) + tbl_meta.column_metadata[i].set_name(name) _set_col_children_metadata( table[name]._column, - tbl_meta.c_obj.column_metadata[i], + tbl_meta.column_metadata[i], (cols_as_map_type is not None) and (name in cols_as_map_type), ) @@ -394,8 +391,8 @@ cdef class ORCWriter: ) self.tbl_meta = TableInputMetadata(plc_table) for level, idx_name in enumerate(table._index.names): - self.tbl_meta.c_obj.column_metadata[level].set_name( - str.encode(idx_name) + self.tbl_meta.column_metadata[level].set_name( + idx_name ) num_index_cols_meta = len(table._index.names) else: @@ -409,16 +406,16 @@ cdef class ORCWriter: ] ) self.tbl_meta = TableInputMetadata(plc_table) - self.tbl_meta.c_obj.column_metadata[0].set_name( - str.encode(table._index.name) + self.tbl_meta.column_metadata[0].set_name( + table._index.name ) num_index_cols_meta = 1 for i, name in enumerate(table._column_names, num_index_cols_meta): - self.tbl_meta.c_obj.column_metadata[i].set_name(name.encode()) + self.tbl_meta.column_metadata[i].set_name(name) _set_col_children_metadata( table[name]._column, - self.tbl_meta.c_obj.column_metadata[i], + self.tbl_meta.column_metadata[i], (self.cols_as_map_type is not None) and (name in self.cols_as_map_type), ) @@ -447,13 +444,13 @@ cdef class ORCWriter: self.initialized = True cdef _set_col_children_metadata(Column col, - column_in_metadata& col_meta, + ColumnInMetadata col_meta, list_column_as_map=False): if isinstance(col.dtype, cudf.StructDtype): for i, (child_col, name) in enumerate( zip(col.children, list(col.dtype.fields)) ): - col_meta.child(i).set_name(name.encode()) + col_meta.child(i).set_name(name) _set_col_children_metadata( child_col, col_meta.child(i), list_column_as_map ) diff --git a/python/pylibcudf/pylibcudf/io/types.pxd b/python/pylibcudf/pylibcudf/io/types.pxd index 6e42c9c309b..f8d1c515db1 100644 --- a/python/pylibcudf/pylibcudf/io/types.pxd +++ b/python/pylibcudf/pylibcudf/io/types.pxd @@ -48,7 +48,7 @@ cdef class SinkInfo: cdef sink_info c_obj cdef class ColumnInMetadata: - cdef column_in_metadata c_obj + cdef column_in_metadata* c_obj cpdef ColumnInMetadata set_name(self, str name) @@ -75,7 +75,7 @@ cdef class ColumnInMetadata: cpdef str get_name(self) @staticmethod - cdef ColumnInMetadata from_libcudf(column_in_metadata data) + cdef ColumnInMetadata from_libcudf(column_in_metadata* data) cdef class TableInputMetadata: cdef public Table table diff --git a/python/pylibcudf/pylibcudf/io/types.pyx b/python/pylibcudf/pylibcudf/io/types.pyx index c72fe3d4aa0..f8b73370ba7 100644 --- a/python/pylibcudf/pylibcudf/io/types.pyx +++ b/python/pylibcudf/pylibcudf/io/types.pyx @@ -33,6 +33,7 @@ from pylibcudf.libcudf.io.types import ( quote_style as QuoteStyle, # no-cython-lint statistics_freq as StatisticsFreq, # no-cython-lint ) +from cython.operator cimport dereference __all__ = [ "ColumnEncoding", @@ -345,10 +346,10 @@ cdef class ColumnInMetadata: ) @staticmethod - cdef ColumnInMetadata from_libcudf(column_in_metadata data): + cdef ColumnInMetadata from_libcudf(column_in_metadata* data): """Create a Python ColumnInMetadata from a libcudf column_in_metadata.""" cdef ColumnInMetadata out = ColumnInMetadata.__new__(ColumnInMetadata) - out.c_obj = move(data) + out.c_obj = data return out cpdef ColumnInMetadata set_name(self, str name): @@ -362,7 +363,7 @@ cdef class ColumnInMetadata: ------- Self """ - self.c_obj.set_name(name.encode()) + dereference(self.c_obj).set_name(name.encode()) return self cpdef ColumnInMetadata set_nullability(self, bool nullable): @@ -376,7 +377,7 @@ cdef class ColumnInMetadata: ------- Self """ - self.c_obj.set_nullability(nullable) + dereference(self.c_obj).set_nullability(nullable) return self cpdef ColumnInMetadata set_list_column_as_map(self): @@ -387,7 +388,7 @@ cdef class ColumnInMetadata: ------- Self """ - self.c_obj.set_list_column_as_map() + dereference(self.c_obj).set_list_column_as_map() return self cpdef ColumnInMetadata set_int96_timestamps(self, bool req): @@ -402,7 +403,7 @@ cdef class ColumnInMetadata: ------- Self """ - self.c_obj.set_int96_timestamps(req) + dereference(self.c_obj).set_int96_timestamps(req) return self cpdef ColumnInMetadata set_decimal_precision(self, int precision): @@ -417,7 +418,7 @@ cdef class ColumnInMetadata: ------- Self """ - self.c_obj.set_decimal_precision(precision) + dereference(self.c_obj).set_decimal_precision(precision) return self cpdef ColumnInMetadata child(self, int i): @@ -431,9 +432,8 @@ cdef class ColumnInMetadata: ------- ColumnInMetadata """ - return ColumnInMetadata.from_libcudf( - move(self.c_obj.child(i)) - ) + cdef column_in_metadata* child_c_obj = &dereference(self.c_obj).child(i) + return ColumnInMetadata.from_libcudf(child_c_obj) cpdef ColumnInMetadata set_output_as_binary(self, bool binary): """ @@ -446,7 +446,7 @@ cdef class ColumnInMetadata: ------- Self """ - self.c_obj.set_output_as_binary(binary) + dereference(self.c_obj).set_output_as_binary(binary) return self cpdef ColumnInMetadata set_type_length(self, int type_length): @@ -460,7 +460,7 @@ cdef class ColumnInMetadata: ------- Self """ - self.c_obj.set_type_length(type_length) + dereference(self.c_obj).set_type_length(type_length) return self cpdef ColumnInMetadata set_skip_compression(self, bool skip): @@ -475,7 +475,7 @@ cdef class ColumnInMetadata: ------- Self """ - self.c_obj.set_skip_compression(skip) + dereference(self.c_obj).set_skip_compression(skip) return self cpdef ColumnInMetadata set_encoding(self, column_encoding encoding): @@ -490,7 +490,7 @@ cdef class ColumnInMetadata: ------- ColumnInMetadata """ - self.c_obj.set_encoding(encoding) + dereference(self.c_obj).set_encoding(encoding) return self cpdef str get_name(self): @@ -501,7 +501,7 @@ cdef class ColumnInMetadata: str The name of this column """ - return self.c_obj.get_name().decode() + return dereference(self.c_obj).get_name().decode() cdef class TableInputMetadata: @@ -514,3 +514,10 @@ cdef class TableInputMetadata: """ def __init__(self, Table table): self.c_obj = table_input_metadata(table.view()) + self.column_metadata = [] + + cdef int num_columns = self.c_obj.column_metadata.size() + for i in range(num_columns): + self.column_metadata.append( + ColumnInMetadata.from_libcudf(&self.c_obj.column_metadata[i]) + ) From 866fdc2b3d989020663e46eecb959b168f46b068 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Wed, 20 Nov 2024 07:06:05 -0800 Subject: [PATCH 07/16] add doc strings --- python/pylibcudf/pylibcudf/io/orc.pxd | 10 +- python/pylibcudf/pylibcudf/io/orc.pyx | 280 +++++++++++++++++++++++- python/pylibcudf/pylibcudf/io/types.pyx | 25 ++- 3 files changed, 304 insertions(+), 11 deletions(-) diff --git a/python/pylibcudf/pylibcudf/io/orc.pxd b/python/pylibcudf/pylibcudf/io/orc.pxd index dbc6e9dee10..671f0692444 100644 --- a/python/pylibcudf/pylibcudf/io/orc.pxd +++ b/python/pylibcudf/pylibcudf/io/orc.pxd @@ -72,8 +72,8 @@ cdef class OrcWriterOptions: cdef Table table cdef SinkInfo sink cpdef void set_stripe_size_bytes(self, size_t size_bytes) - cpdef void set_stripe_size_rows(self, size_t size_rows) - cpdef void set_row_index_stride(self, size_t stride) + cpdef void set_stripe_size_rows(self, size_type size_rows) + cpdef void set_row_index_stride(self, size_type stride) cdef class OrcWriterOptionsBuilder: cdef orc_writer_options_builder c_obj @@ -90,14 +90,14 @@ cpdef void write_orc(OrcWriterOptions options) cdef class OrcChunkedWriter: cdef unique_ptr[orc_chunked_writer] c_obj cpdef void close(self) - cpdef write(self, Table table) + cpdef void write(self, Table table) cdef class ChunkedOrcWriterOptions: cdef chunked_orc_writer_options c_obj cdef SinkInfo sink cpdef void set_stripe_size_bytes(self, size_t size_bytes) - cpdef void set_stripe_size_rows(self, size_t size_rows) - cpdef void set_row_index_stride(self, size_t stride) + cpdef void set_stripe_size_rows(self, size_type size_rows) + cpdef void set_row_index_stride(self, size_type stride) cdef class ChunkedOrcWriterOptionsBuilder: cdef chunked_orc_writer_options_builder c_obj diff --git a/python/pylibcudf/pylibcudf/io/orc.pyx b/python/pylibcudf/pylibcudf/io/orc.pyx index 96a37ab1bff..0303cd8d9fe 100644 --- a/python/pylibcudf/pylibcudf/io/orc.pyx +++ b/python/pylibcudf/pylibcudf/io/orc.pyx @@ -330,16 +330,79 @@ cpdef ParsedOrcStatistics read_parsed_orc_statistics( cdef class OrcWriterOptions: cpdef void set_stripe_size_bytes(self, size_t size_bytes): + """ + Sets the maximum stripe size, in bytes. + + For details, see :cpp:func:`cudf::io::orc_writer_options::set_stripe_size_bytes` + + Parameters + ---------- + size_bytes: size_t + Sets the maximum stripe size, in bytes. + + Returns + ------- + None + """ self.c_obj.set_stripe_size_bytes(size_bytes) - cpdef void set_stripe_size_rows(self, size_t size_rows): + cpdef void set_stripe_size_rows(self, size_type size_rows): + """ + Sets the maximum stripe size, in rows. + + If the stripe size is smaller that the row group size, + row group size will be reduced to math the stripe size. + + For details, see :cpp:func:`cudf::io::orc_writer_options::set_stripe_size_rows` + + Parameters + ---------- + size_bytes: size_type + Maximum stripe size, in rows to be set + + Returns + ------- + None + """ self.c_obj.set_stripe_size_rows(size_rows) - cpdef void set_row_index_stride(self, size_t stride): + cpdef void set_row_index_stride(self, size_type stride): + """ + Sets the row index stride. + + Rounded down to a multiple of 8. + + For details, see :cpp:func:`cudf::io::orc_writer_options::set_row_index_stride` + + Parameters + ---------- + size_bytes: size_type + Maximum stripe size, in rows to be set + + Returns + ------- + None + """ self.c_obj.set_row_index_stride(stride) @staticmethod def builder(SinkInfo sink, Table table): + """ + Create builder to create OrcWriterOptions. + + For details, see :cpp:func:`cudf::io::orc_writer_options::builder` + + Parameters + ---------- + sink: SinkInfo + The sink used for writer output + table: Table + Table to be written to output + + Returns + ------- + OrcWriterOptionsBuilder + """ cdef OrcWriterOptionsBuilder orc_builder = OrcWriterOptionsBuilder.__new__( OrcWriterOptionsBuilder ) @@ -351,24 +414,79 @@ cdef class OrcWriterOptions: cdef class OrcWriterOptionsBuilder: cpdef OrcWriterOptionsBuilder compression(self, compression_type comp): + """ + Sets compression type. + + For details, see :cpp:func:`cudf::io::orc_writer_options_builder::compression` + + Parameters + ---------- + comp: CompressionType + The compression type to use + + Returns + ------- + OrcWriterOptionsBuilder + """ self.c_obj.compression(comp) return self cpdef OrcWriterOptionsBuilder enable_statistics(self, statistics_freq val): + """ + Choose granularity of column statistics to be written. + + For details, see :cpp:func:`enable_statistics` + + Parameters + ---------- + val: StatisticsFreq + Level of statistics collection + + Returns + ------- + OrcWriterOptionsBuilder + """ self.c_obj.enable_statistics(val) return self cpdef OrcWriterOptionsBuilder key_value_metadata(self, dict kvm): + """ + Sets Key-Value footer metadata. + + Parameters + ---------- + kvm: dict + Key-Value footer metadata + + Returns + ------- + OrcWriterOptionsBuilder + """ self.c_obj.key_value_metadata( {key.encode(): value.encode() for key, value in kvm.items()} ) return self cpdef OrcWriterOptionsBuilder metadata(self, TableInputMetadata meta): + """ + Sets associated metadata. + + For details, see :cpp:func:`cudf::io::orc_writer_options_builder::metadata` + + Parameters + ---------- + meta: TableInputMetadata + Associated metadata + + Returns + ------- + OrcWriterOptionsBuilder + """ self.c_obj.metadata(meta.c_obj) return self cpdef OrcWriterOptions build(self): + """Moves the ORC writer options builder""" cdef OrcWriterOptions orc_options = OrcWriterOptions.__new__( OrcWriterOptions ) @@ -379,21 +497,69 @@ cdef class OrcWriterOptionsBuilder: cpdef void write_orc(OrcWriterOptions options): + """ + Write to ORC format. + + The table to write, output paths, and options are encapsulated + by the `options` object. + + For details, see :cpp:func:`write_csv`. + + Parameters + ---------- + options: OrcWriterOptions + Settings for controlling writing behavior + + Returns + ------- + None + """ with nogil: cpp_write_orc(move(options.c_obj)) cdef class OrcChunkedWriter: cpdef void close(self): + """ + Closes the chunked ORC writer. + + Returns + ------- + None + """ with nogil: self.c_obj.get()[0].close() - cpdef write(self, Table table): + cpdef void write(self, Table table): + """ + Writes table to output. + + Parameters + ---------- + table: Table + able that needs to be written + + Returns + ------- + None + """ with nogil: self.c_obj.get()[0].write(table.view()) @staticmethod def from_options(ChunkedOrcWriterOptions options): + """ + Creates a chunked ORC writer from options + + Parameters + ---------- + options: ChunkedOrcWriterOptions + Settings for controlling writing behavior + + Returns + ------- + OrcChunkedWriter + """ cdef OrcChunkedWriter orc_writer = OrcChunkedWriter.__new__( OrcChunkedWriter ) @@ -403,16 +569,71 @@ cdef class OrcChunkedWriter: cdef class ChunkedOrcWriterOptions: cpdef void set_stripe_size_bytes(self, size_t size_bytes): + """ + Sets the maximum stripe size, in bytes. + + Parameters + ---------- + size_bytes: size_t + Sets the maximum stripe size, in bytes. + + Returns + ------- + None + """ self.c_obj.set_stripe_size_bytes(size_bytes) - cpdef void set_stripe_size_rows(self, size_t size_rows): + cpdef void set_stripe_size_rows(self, size_type size_rows): + """ + Sets the maximum stripe size, in rows. + + If the stripe size is smaller that the row group size, + row group size will be reduced to math the stripe size. + + Parameters + ---------- + size_bytes: size_type + Maximum stripe size, in rows to be set + + Returns + ------- + None + """ self.c_obj.set_stripe_size_rows(size_rows) - cpdef void set_row_index_stride(self, size_t stride): + cpdef void set_row_index_stride(self, size_type stride): + """ + Sets the row index stride. + + Rounded down to a multiple of 8. + + Parameters + ---------- + size_bytes: size_type + Maximum stripe size, in rows to be set + + Returns + ------- + None + """ self.c_obj.set_row_index_stride(stride) @staticmethod def builder(SinkInfo sink): + """ + Create builder to create ChunkedOrcWriterOptions. + + Parameters + ---------- + sink: SinkInfo + The sink used for writer output + table: Table + Table to be written to output + + Returns + ------- + ChunkedOrcWriterOptionsBuilder + """ cdef ChunkedOrcWriterOptionsBuilder orc_builder = \ ChunkedOrcWriterOptionsBuilder.__new__( ChunkedOrcWriterOptionsBuilder @@ -424,10 +645,34 @@ cdef class ChunkedOrcWriterOptions: cdef class ChunkedOrcWriterOptionsBuilder: cpdef ChunkedOrcWriterOptionsBuilder compression(self, compression_type comp): + """ + Sets compression type. + + Parameters + ---------- + comp: CompressionType + The compression type to use + + Returns + ------- + ChunkedOrcWriterOptionsBuilder + """ self.c_obj.compression(comp) return self cpdef ChunkedOrcWriterOptionsBuilder enable_statistics(self, statistics_freq val): + """ + Choose granularity of column statistics to be written. + + Parameters + ---------- + val: StatisticsFreq + Level of statistics collection + + Returns + ------- + ChunkedOrcWriterOptionsBuilder + """ self.c_obj.enable_statistics(val) return self @@ -435,16 +680,41 @@ cdef class ChunkedOrcWriterOptionsBuilder: self, dict kvm ): + """ + Sets Key-Value footer metadata. + + Parameters + ---------- + kvm: dict + Key-Value footer metadata + + Returns + ------- + ChunkedOrcWriterOptionsBuilder + """ self.c_obj.key_value_metadata( {key.encode(): value.encode() for key, value in kvm.items()} ) return self cpdef ChunkedOrcWriterOptionsBuilder metadata(self, TableInputMetadata meta): + """ + Sets associated metadata. + + Parameters + ---------- + meta: TableInputMetadata + Associated metadata + + Returns + ------- + ChunkedOrcWriterOptionsBuilder + """ self.c_obj.metadata(meta.c_obj) return self cpdef ChunkedOrcWriterOptions build(self): + """Moves the chunked ORC writer options builder""" cdef ChunkedOrcWriterOptions orc_options = ChunkedOrcWriterOptions.__new__( ChunkedOrcWriterOptions ) diff --git a/python/pylibcudf/pylibcudf/io/types.pyx b/python/pylibcudf/pylibcudf/io/types.pyx index f8b73370ba7..fca414a088a 100644 --- a/python/pylibcudf/pylibcudf/io/types.pyx +++ b/python/pylibcudf/pylibcudf/io/types.pyx @@ -334,9 +334,11 @@ cdef class SinkInfo: cdef class ColumnInMetadata: """ Metadata for a column + Parameters ---------- - metadata : column_in_metadata + metadata: + column_in_metadata """ def __init__(self): @@ -355,10 +357,12 @@ cdef class ColumnInMetadata: cpdef ColumnInMetadata set_name(self, str name): """ Set the name of this column. + Parameters ---------- name : str Name of the column + Returns ------- Self @@ -369,10 +373,12 @@ cdef class ColumnInMetadata: cpdef ColumnInMetadata set_nullability(self, bool nullable): """ Set the nullability of this column. + Parameters ---------- nullable : bool Whether this column is nullable + Returns ------- Self @@ -384,6 +390,7 @@ cdef class ColumnInMetadata: """ Specify that this list column should be encoded as a map in the written file. + Returns ------- Self @@ -395,10 +402,12 @@ cdef class ColumnInMetadata: """ Specifies whether this timestamp column should be encoded using the deprecated int96. + Parameters ---------- req : bool True = use int96 physical type. False = use int64 physical type. + Returns ------- Self @@ -410,10 +419,12 @@ cdef class ColumnInMetadata: """ Set the decimal precision of this column. Only valid if this column is a decimal (fixed-point) type. + Parameters ---------- precision : int The integer precision to set for this decimal column + Returns ------- Self @@ -424,10 +435,12 @@ cdef class ColumnInMetadata: cpdef ColumnInMetadata child(self, int i): """ Get reference to a child of this column. + Parameters ---------- i : int Index of the child to get. + Returns ------- ColumnInMetadata @@ -438,10 +451,12 @@ cdef class ColumnInMetadata: cpdef ColumnInMetadata set_output_as_binary(self, bool binary): """ Specifies whether this column should be written as binary or string data. + Parameters ---------- binary : bool True = use binary data type. False = use string data type + Returns ------- Self @@ -452,10 +467,12 @@ cdef class ColumnInMetadata: cpdef ColumnInMetadata set_type_length(self, int type_length): """ Sets the length of fixed length data. + Parameters ---------- type_length : int Size of the data type in bytes + Returns ------- Self @@ -467,10 +484,12 @@ cdef class ColumnInMetadata: """ Specifies whether this column should not be compressed regardless of the compression. + Parameters ---------- skip : bool If `true` do not compress this column + Returns ------- Self @@ -482,10 +501,12 @@ cdef class ColumnInMetadata: """ Specifies whether this column should not be compressed regardless of the compression. + Parameters ---------- encoding : ColumnEncoding The encoding to use + Returns ------- ColumnInMetadata @@ -496,6 +517,7 @@ cdef class ColumnInMetadata: cpdef str get_name(self): """ Get the name of this column. + Returns ------- str @@ -507,6 +529,7 @@ cdef class ColumnInMetadata: cdef class TableInputMetadata: """ Metadata for a table + Parameters ---------- table : Table From 50e5e161c4e7a6af22bbab9bc1dcb12329f49f6f Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Wed, 20 Nov 2024 09:15:27 -0800 Subject: [PATCH 08/16] skip test if pandas version < 2.2.3 --- python/pylibcudf/pylibcudf/tests/io/test_orc.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/python/pylibcudf/pylibcudf/tests/io/test_orc.py b/python/pylibcudf/pylibcudf/tests/io/test_orc.py index 5a967ba001e..af082870828 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_orc.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_orc.py @@ -3,6 +3,7 @@ import pandas as pd import pyarrow as pa import pytest +from packaging import version from utils import _convert_types, assert_table_and_meta_eq, make_source import pylibcudf as plc @@ -56,6 +57,10 @@ def test_read_orc_basic( assert_table_and_meta_eq(pa_table, res, check_field_nullability=False) +@pytest.mark.skipif( + version.parse(pd.__version__) < version.parse("2.2.3"), + reason="fails on a pandas version < 2.2.3", +) @pytest.mark.parametrize( "compression", [ From 52b1c77673cb5085732ffff08fb84190ee418bc7 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Wed, 20 Nov 2024 15:36:35 -0800 Subject: [PATCH 09/16] address review --- .../pylibcudf/pylibcudf/tests/io/test_orc.py | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/python/pylibcudf/pylibcudf/tests/io/test_orc.py b/python/pylibcudf/pylibcudf/tests/io/test_orc.py index af082870828..2557e40c935 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_orc.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_orc.py @@ -1,9 +1,7 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -import pandas as pd import pyarrow as pa import pytest -from packaging import version from utils import _convert_types, assert_table_and_meta_eq, make_source import pylibcudf as plc @@ -57,10 +55,6 @@ def test_read_orc_basic( assert_table_and_meta_eq(pa_table, res, check_field_nullability=False) -@pytest.mark.skipif( - version.parse(pd.__version__) < version.parse("2.2.3"), - reason="fails on a pandas version < 2.2.3", -) @pytest.mark.parametrize( "compression", [ @@ -78,7 +72,7 @@ def test_read_orc_basic( @pytest.mark.parametrize("stripe_size_bytes", [None, 65536]) @pytest.mark.parametrize("stripe_size_rows", [None, 512]) @pytest.mark.parametrize("row_index_stride", [None, 512]) -def test_roundtrip_pd_dataframe( +def test_roundtrip_pa_table( compression, statistics, stripe_size_bytes, @@ -86,9 +80,7 @@ def test_roundtrip_pd_dataframe( row_index_stride, tmp_path, ): - pdf = pd.DataFrame({"a": [1.0, 2.0, None], "b": [True, None, False]}) - - pa_table = pa.Table.from_pandas(pdf) + pa_table = pa.table({"a": [1.0, 2.0, None], "b": [True, None, False]}) plc_table = plc.interop.from_arrow(pa_table) tmpfile_name = tmp_path / "test.orc" @@ -114,12 +106,11 @@ def test_roundtrip_pd_dataframe( plc.io.orc.write_orc(options) - expected_pdf = pd.read_orc(tmpfile_name) - expected_pdf.columns = pdf.columns + read_table = pa.orc.read_table(str(tmpfile_name)) res = plc.io.types.TableWithMetadata( - plc.interop.from_arrow(pa.Table.from_pandas(expected_pdf)), - [(name, []) for name in pdf.columns], + plc.interop.from_arrow(read_table), + [(name, []) for name in pa_table.schema.names], ) assert_table_and_meta_eq(pa_table, res, check_field_nullability=False) From fd4c6cddf4c54023f631ee20b1bf217a247fee63 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Thu, 21 Nov 2024 05:37:23 -0800 Subject: [PATCH 10/16] address review --- python/pylibcudf/pylibcudf/io/orc.pyi | 32 +++++++------------------ python/pylibcudf/pylibcudf/io/orc.pyx | 2 +- python/pylibcudf/pylibcudf/io/types.pxd | 6 ++--- python/pylibcudf/pylibcudf/io/types.pyi | 18 +++++++++++++- python/pylibcudf/pylibcudf/io/types.pyx | 26 +++++++++++--------- 5 files changed, 45 insertions(+), 39 deletions(-) diff --git a/python/pylibcudf/pylibcudf/io/orc.pyi b/python/pylibcudf/pylibcudf/io/orc.pyi index 3165bb8675f..516f97981e9 100644 --- a/python/pylibcudf/pylibcudf/io/orc.pyi +++ b/python/pylibcudf/pylibcudf/io/orc.pyi @@ -1,6 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from typing import Any +from typing import Any, Self from pylibcudf.io.types import ( CompressionType, @@ -58,16 +58,10 @@ class OrcWriterOptions: class OrcWriterOptionsBuilder: def __init__(self): ... - def compression( - self, comp: CompressionType - ) -> OrcWriterOptionsBuilder: ... - def enable_statistics( - self, val: StatisticsFreq - ) -> OrcWriterOptionsBuilder: ... - def key_value_metadata( - self, kvm: dict[str, str] - ) -> OrcWriterOptionsBuilder: ... - def metadata(self, meta: TableWithMetadata) -> OrcWriterOptionsBuilder: ... + def compression(self, comp: CompressionType) -> Self: ... + def enable_statistics(self, val: StatisticsFreq) -> Self: ... + def key_value_metadata(self, kvm: dict[str, str]) -> Self: ... + def metadata(self, meta: TableWithMetadata) -> Self: ... def build(self) -> OrcWriterOptions: ... def write_orc(options: OrcWriterOptions) -> None: ... @@ -87,16 +81,8 @@ class ChunkedOrcWriterOptions: class ChunkedOrcWriterOptionsBuilder: def __init__(self): ... - def compression( - self, comp: CompressionType - ) -> ChunkedOrcWriterOptionsBuilder: ... - def enable_statistics( - self, val: StatisticsFreq - ) -> ChunkedOrcWriterOptionsBuilder: ... - def key_value_metadata( - self, kvm: dict[str, str] - ) -> ChunkedOrcWriterOptionsBuilder: ... - def metadata( - self, meta: TableInputMetadata - ) -> ChunkedOrcWriterOptionsBuilder: ... + def compression(self, comp: CompressionType) -> Self: ... + def enable_statistics(self, val: StatisticsFreq) -> Self: ... + def key_value_metadata(self, kvm: dict[str, str]) -> Self: ... + def metadata(self, meta: TableInputMetadata) -> Self: ... def build(self) -> ChunkedOrcWriterOptions: ... diff --git a/python/pylibcudf/pylibcudf/io/orc.pyx b/python/pylibcudf/pylibcudf/io/orc.pyx index 0303cd8d9fe..63eab4a9634 100644 --- a/python/pylibcudf/pylibcudf/io/orc.pyx +++ b/python/pylibcudf/pylibcudf/io/orc.pyx @@ -714,7 +714,7 @@ cdef class ChunkedOrcWriterOptionsBuilder: return self cpdef ChunkedOrcWriterOptions build(self): - """Moves the chunked ORC writer options builder""" + """Create a OrcWriterOptions object""" cdef ChunkedOrcWriterOptions orc_options = ChunkedOrcWriterOptions.__new__( ChunkedOrcWriterOptions ) diff --git a/python/pylibcudf/pylibcudf/io/types.pxd b/python/pylibcudf/pylibcudf/io/types.pxd index f8d1c515db1..08f27eae5e1 100644 --- a/python/pylibcudf/pylibcudf/io/types.pxd +++ b/python/pylibcudf/pylibcudf/io/types.pxd @@ -20,7 +20,7 @@ from pylibcudf.libcudf.io.types cimport ( table_with_metadata, ) from pylibcudf.table cimport Table - +from pylibcudf.libcudf.types cimport size_type cdef class TableWithMetadata: cdef public Table tbl @@ -62,7 +62,7 @@ cdef class ColumnInMetadata: cpdef ColumnInMetadata set_decimal_precision(self, int req) - cpdef ColumnInMetadata child(self, int i) + cpdef ColumnInMetadata child(self, size_type i) cpdef ColumnInMetadata set_output_as_binary(self, bool binary) @@ -75,7 +75,7 @@ cdef class ColumnInMetadata: cpdef str get_name(self) @staticmethod - cdef ColumnInMetadata from_libcudf(column_in_metadata* data) + cdef ColumnInMetadata from_libcudf_ptr(column_in_metadata* data) cdef class TableInputMetadata: cdef public Table table diff --git a/python/pylibcudf/pylibcudf/io/types.pyi b/python/pylibcudf/pylibcudf/io/types.pyi index a4f4fc13bdc..19ef5e9120e 100644 --- a/python/pylibcudf/pylibcudf/io/types.pyi +++ b/python/pylibcudf/pylibcudf/io/types.pyi @@ -3,7 +3,7 @@ import io import os from collections.abc import Mapping from enum import IntEnum -from typing import Any, Literal, TypeAlias, overload +from typing import Any, Literal, Self, TypeAlias, overload from pylibcudf.column import Column from pylibcudf.io.datasource import Datasource @@ -59,6 +59,22 @@ class QuoteStyle(IntEnum): ColumnNameSpec: TypeAlias = tuple[str, list[ColumnNameSpec]] ChildNameSpec: TypeAlias = Mapping[str, ChildNameSpec] +class TableInputMetadata: + def __init__(self, table: Table): ... + +class ColumnInMetadata: + def set_name(self, name: str) -> Self: ... + def set_nullability(self, nullable: bool) -> Self: ... + def set_list_column_as_map(self) -> Self: ... + def set_int96_timestamps(self, req: bool) -> Self: ... + def set_decimal_precision(self, precision: int) -> Self: ... + def child(self, i: int) -> Self: ... + def set_output_as_binary(self, binary: bool) -> Self: ... + def set_type_length(self, type_length: int) -> Self: ... + def set_skip_compression(self, skip: bool) -> Self: ... + def set_encoding(self, encoding: ColumnEncoding) -> Self: ... + def get_name(self) -> str: ... + class TableWithMetadata: tbl: Table def __init__( diff --git a/python/pylibcudf/pylibcudf/io/types.pyx b/python/pylibcudf/pylibcudf/io/types.pyx index fca414a088a..aaadb2f010b 100644 --- a/python/pylibcudf/pylibcudf/io/types.pyx +++ b/python/pylibcudf/pylibcudf/io/types.pyx @@ -34,9 +34,11 @@ from pylibcudf.libcudf.io.types import ( statistics_freq as StatisticsFreq, # no-cython-lint ) from cython.operator cimport dereference +from pylibcudf.libcudf.types cimport size_type __all__ = [ "ColumnEncoding", + "ColumnInMetadata", "CompressionType", "DictionaryPolicy", "JSONRecoveryMode", @@ -44,6 +46,7 @@ __all__ = [ "SinkInfo", "SourceInfo", "StatisticsFreq", + "TableInputMetadata", "TableWithMetadata", ] @@ -334,11 +337,6 @@ cdef class SinkInfo: cdef class ColumnInMetadata: """ Metadata for a column - - Parameters - ---------- - metadata: - column_in_metadata """ def __init__(self): @@ -348,10 +346,16 @@ cdef class ColumnInMetadata: ) @staticmethod - cdef ColumnInMetadata from_libcudf(column_in_metadata* data): - """Create a Python ColumnInMetadata from a libcudf column_in_metadata.""" + cdef ColumnInMetadata from_libcudf_ptr(column_in_metadata* metadata): + """ + Create a Python ColumnInMetadata from a libcudf column_in_metadata pointer. + + Parameters + ---------- + metadata : column_in_metadata* + """ cdef ColumnInMetadata out = ColumnInMetadata.__new__(ColumnInMetadata) - out.c_obj = data + out.c_obj = metadata return out cpdef ColumnInMetadata set_name(self, str name): @@ -432,7 +436,7 @@ cdef class ColumnInMetadata: dereference(self.c_obj).set_decimal_precision(precision) return self - cpdef ColumnInMetadata child(self, int i): + cpdef ColumnInMetadata child(self, size_type i): """ Get reference to a child of this column. @@ -446,7 +450,7 @@ cdef class ColumnInMetadata: ColumnInMetadata """ cdef column_in_metadata* child_c_obj = &dereference(self.c_obj).child(i) - return ColumnInMetadata.from_libcudf(child_c_obj) + return ColumnInMetadata.from_libcudf_ptr(child_c_obj) cpdef ColumnInMetadata set_output_as_binary(self, bool binary): """ @@ -542,5 +546,5 @@ cdef class TableInputMetadata: cdef int num_columns = self.c_obj.column_metadata.size() for i in range(num_columns): self.column_metadata.append( - ColumnInMetadata.from_libcudf(&self.c_obj.column_metadata[i]) + ColumnInMetadata.from_libcudf_ptr(&self.c_obj.column_metadata[i]) ) From 3bd27a7eac9543a9215feed9f1d9a46daad0fabe Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Mon, 25 Nov 2024 07:12:50 -0800 Subject: [PATCH 11/16] clean up --- python/pylibcudf/pylibcudf/io/types.pxd | 50 +-- python/pylibcudf/pylibcudf/io/types.pyx | 451 ++++++++++++------------ 2 files changed, 251 insertions(+), 250 deletions(-) diff --git a/python/pylibcudf/pylibcudf/io/types.pxd b/python/pylibcudf/pylibcudf/io/types.pxd index 90893e97772..161dd8949ca 100644 --- a/python/pylibcudf/pylibcudf/io/types.pxd +++ b/python/pylibcudf/pylibcudf/io/types.pxd @@ -28,31 +28,6 @@ from pylibcudf.libcudf.types cimport size_type cdef class PartitionInfo: cdef partition_info c_obj -cdef class TableWithMetadata: - cdef public Table tbl - cdef table_metadata metadata - - cdef vector[column_name_info] _make_column_info(self, list column_names) - - cdef list _make_columns_list(self, dict child_dict) - - @staticmethod - cdef dict _parse_col_names(vector[column_name_info] infos) - - @staticmethod - cdef TableWithMetadata from_libcudf(table_with_metadata& tbl) - -cdef class SourceInfo: - cdef source_info c_obj - # Keep the bytes converted from stringio alive - # (otherwise we end up with a use after free when they get gc'ed) - cdef list byte_sources - -cdef class SinkInfo: - # This vector just exists to keep the unique_ptrs to the sinks alive - cdef vector[unique_ptr[data_sink]] sink_storage - cdef sink_info c_obj - cdef class ColumnInMetadata: cdef column_in_metadata* c_obj @@ -89,3 +64,28 @@ cdef class TableInputMetadata: cdef public Table table cdef table_input_metadata c_obj cdef list column_metadata + +cdef class TableWithMetadata: + cdef public Table tbl + cdef table_metadata metadata + + cdef vector[column_name_info] _make_column_info(self, list column_names) + + cdef list _make_columns_list(self, dict child_dict) + + @staticmethod + cdef dict _parse_col_names(vector[column_name_info] infos) + + @staticmethod + cdef TableWithMetadata from_libcudf(table_with_metadata& tbl) + +cdef class SourceInfo: + cdef source_info c_obj + # Keep the bytes converted from stringio alive + # (otherwise we end up with a use after free when they get gc'ed) + cdef list byte_sources + +cdef class SinkInfo: + # This vector just exists to keep the unique_ptrs to the sinks alive + cdef vector[unique_ptr[data_sink]] sink_storage + cdef sink_info c_obj diff --git a/python/pylibcudf/pylibcudf/io/types.pyx b/python/pylibcudf/pylibcudf/io/types.pyx index ed93d70f0ab..72ebea19467 100644 --- a/python/pylibcudf/pylibcudf/io/types.pyx +++ b/python/pylibcudf/pylibcudf/io/types.pyx @@ -74,6 +74,232 @@ cdef class PartitionInfo: def __init__(self, size_type start_row, size_type num_rows): self.c_obj = partition_info(start_row, num_rows) + +cdef class ColumnInMetadata: + """ + Metadata for a column + """ + + def __init__(self): + raise ValueError( + "ColumnInMetadata should not be constructed directly. " + "Use one of the factories." + ) + + @staticmethod + cdef ColumnInMetadata from_libcudf_ptr(column_in_metadata* metadata): + """ + Create a Python ColumnInMetadata from a libcudf column_in_metadata pointer. + + Parameters + ---------- + metadata : column_in_metadata* + """ + cdef ColumnInMetadata out = ColumnInMetadata.__new__(ColumnInMetadata) + out.c_obj = metadata + return out + + cpdef ColumnInMetadata set_name(self, str name): + """ + Set the name of this column. + + Parameters + ---------- + name : str + Name of the column + + Returns + ------- + Self + """ + dereference(self.c_obj).set_name(name.encode()) + return self + + cpdef ColumnInMetadata set_nullability(self, bool nullable): + """ + Set the nullability of this column. + + Parameters + ---------- + nullable : bool + Whether this column is nullable + + Returns + ------- + Self + """ + dereference(self.c_obj).set_nullability(nullable) + return self + + cpdef ColumnInMetadata set_list_column_as_map(self): + """ + Specify that this list column should be encoded as a map in the + written file. + + Returns + ------- + Self + """ + dereference(self.c_obj).set_list_column_as_map() + return self + + cpdef ColumnInMetadata set_int96_timestamps(self, bool req): + """ + Specifies whether this timestamp column should be encoded using + the deprecated int96. + + Parameters + ---------- + req : bool + True = use int96 physical type. False = use int64 physical type. + + Returns + ------- + Self + """ + dereference(self.c_obj).set_int96_timestamps(req) + return self + + cpdef ColumnInMetadata set_decimal_precision(self, int precision): + """ + Set the decimal precision of this column. + Only valid if this column is a decimal (fixed-point) type. + + Parameters + ---------- + precision : int + The integer precision to set for this decimal column + + Returns + ------- + Self + """ + dereference(self.c_obj).set_decimal_precision(precision) + return self + + cpdef ColumnInMetadata child(self, size_type i): + """ + Get reference to a child of this column. + + Parameters + ---------- + i : int + Index of the child to get. + + Returns + ------- + ColumnInMetadata + """ + cdef column_in_metadata* child_c_obj = &dereference(self.c_obj).child(i) + return ColumnInMetadata.from_libcudf_ptr(child_c_obj) + + cpdef ColumnInMetadata set_output_as_binary(self, bool binary): + """ + Specifies whether this column should be written as binary or string data. + + Parameters + ---------- + binary : bool + True = use binary data type. False = use string data type + + Returns + ------- + Self + """ + dereference(self.c_obj).set_output_as_binary(binary) + return self + + cpdef ColumnInMetadata set_type_length(self, int type_length): + """ + Sets the length of fixed length data. + + Parameters + ---------- + type_length : int + Size of the data type in bytes + + Returns + ------- + Self + """ + dereference(self.c_obj).set_type_length(type_length) + return self + + cpdef ColumnInMetadata set_skip_compression(self, bool skip): + """ + Specifies whether this column should not be compressed + regardless of the compression. + + Parameters + ---------- + skip : bool + If `true` do not compress this column + + Returns + ------- + Self + """ + dereference(self.c_obj).set_skip_compression(skip) + return self + + cpdef ColumnInMetadata set_encoding(self, column_encoding encoding): + """ + Specifies whether this column should not be compressed + regardless of the compression. + + Parameters + ---------- + encoding : ColumnEncoding + The encoding to use + + Returns + ------- + ColumnInMetadata + """ + dereference(self.c_obj).set_encoding(encoding) + return self + + cpdef str get_name(self): + """ + Get the name of this column. + + Returns + ------- + str + The name of this column + """ + return dereference(self.c_obj).get_name().decode() + + def __del__(self): + # strictly for testing purposes + print("Deleting ColumnInMetadata object...") + + +cdef class TableInputMetadata: + """ + Metadata for a table + + Parameters + ---------- + table : Table + The Table to construct metadata for + """ + def __init__(self, Table table): + self.c_obj = table_input_metadata(table.view()) + self.column_metadata = [] + self.table = table + + cdef int num_columns = self.c_obj.column_metadata.size() + for i in range(num_columns): + col_meta = ColumnInMetadata.from_libcudf_ptr(&self.c_obj.column_metadata[i]) + col_meta.table = self + self.column_metadata.append(col_meta) + + def __del__(self): + # strictly for testing purposes + print("Deleting TableInputMetadata object...") + + cdef class TableWithMetadata: """A container holding a table and its associated metadata (e.g. column names) @@ -367,228 +593,3 @@ cdef class SinkInfo: self.c_obj = sink_info(paths) __hash__ = None - - -cdef class ColumnInMetadata: - """ - Metadata for a column - """ - - def __init__(self): - raise ValueError( - "ColumnInMetadata should not be constructed directly. " - "Use one of the factories." - ) - - @staticmethod - cdef ColumnInMetadata from_libcudf_ptr(column_in_metadata* metadata): - """ - Create a Python ColumnInMetadata from a libcudf column_in_metadata pointer. - - Parameters - ---------- - metadata : column_in_metadata* - """ - cdef ColumnInMetadata out = ColumnInMetadata.__new__(ColumnInMetadata) - out.c_obj = metadata - return out - - cpdef ColumnInMetadata set_name(self, str name): - """ - Set the name of this column. - - Parameters - ---------- - name : str - Name of the column - - Returns - ------- - Self - """ - dereference(self.c_obj).set_name(name.encode()) - return self - - cpdef ColumnInMetadata set_nullability(self, bool nullable): - """ - Set the nullability of this column. - - Parameters - ---------- - nullable : bool - Whether this column is nullable - - Returns - ------- - Self - """ - dereference(self.c_obj).set_nullability(nullable) - return self - - cpdef ColumnInMetadata set_list_column_as_map(self): - """ - Specify that this list column should be encoded as a map in the - written file. - - Returns - ------- - Self - """ - dereference(self.c_obj).set_list_column_as_map() - return self - - cpdef ColumnInMetadata set_int96_timestamps(self, bool req): - """ - Specifies whether this timestamp column should be encoded using - the deprecated int96. - - Parameters - ---------- - req : bool - True = use int96 physical type. False = use int64 physical type. - - Returns - ------- - Self - """ - dereference(self.c_obj).set_int96_timestamps(req) - return self - - cpdef ColumnInMetadata set_decimal_precision(self, int precision): - """ - Set the decimal precision of this column. - Only valid if this column is a decimal (fixed-point) type. - - Parameters - ---------- - precision : int - The integer precision to set for this decimal column - - Returns - ------- - Self - """ - dereference(self.c_obj).set_decimal_precision(precision) - return self - - cpdef ColumnInMetadata child(self, size_type i): - """ - Get reference to a child of this column. - - Parameters - ---------- - i : int - Index of the child to get. - - Returns - ------- - ColumnInMetadata - """ - cdef column_in_metadata* child_c_obj = &dereference(self.c_obj).child(i) - return ColumnInMetadata.from_libcudf_ptr(child_c_obj) - - cpdef ColumnInMetadata set_output_as_binary(self, bool binary): - """ - Specifies whether this column should be written as binary or string data. - - Parameters - ---------- - binary : bool - True = use binary data type. False = use string data type - - Returns - ------- - Self - """ - dereference(self.c_obj).set_output_as_binary(binary) - return self - - cpdef ColumnInMetadata set_type_length(self, int type_length): - """ - Sets the length of fixed length data. - - Parameters - ---------- - type_length : int - Size of the data type in bytes - - Returns - ------- - Self - """ - dereference(self.c_obj).set_type_length(type_length) - return self - - cpdef ColumnInMetadata set_skip_compression(self, bool skip): - """ - Specifies whether this column should not be compressed - regardless of the compression. - - Parameters - ---------- - skip : bool - If `true` do not compress this column - - Returns - ------- - Self - """ - dereference(self.c_obj).set_skip_compression(skip) - return self - - cpdef ColumnInMetadata set_encoding(self, column_encoding encoding): - """ - Specifies whether this column should not be compressed - regardless of the compression. - - Parameters - ---------- - encoding : ColumnEncoding - The encoding to use - - Returns - ------- - ColumnInMetadata - """ - dereference(self.c_obj).set_encoding(encoding) - return self - - cpdef str get_name(self): - """ - Get the name of this column. - - Returns - ------- - str - The name of this column - """ - return dereference(self.c_obj).get_name().decode() - - def __del__(self): - # strictly for testing purposes - print("Deleting ColumnInMetadata object...") - - -cdef class TableInputMetadata: - """ - Metadata for a table - - Parameters - ---------- - table : Table - The Table to construct metadata for - """ - def __init__(self, Table table): - self.c_obj = table_input_metadata(table.view()) - self.column_metadata = [] - self.table = table - - cdef int num_columns = self.c_obj.column_metadata.size() - for i in range(num_columns): - col_meta = ColumnInMetadata.from_libcudf_ptr(&self.c_obj.column_metadata[i]) - col_meta.table = self - self.column_metadata.append(col_meta) - - def __del__(self): - # strictly for testing purposes - print("Deleting TableInputMetadata object...") From 31853a87a31eee6f86ecac17167d33cf26604c83 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Mon, 25 Nov 2024 13:50:32 -0800 Subject: [PATCH 12/16] try a different approach in gc test --- python/pylibcudf/pylibcudf/io/types.pyx | 8 ---- .../pylibcudf/tests/io/test_types.py | 38 ++++++------------- 2 files changed, 12 insertions(+), 34 deletions(-) diff --git a/python/pylibcudf/pylibcudf/io/types.pyx b/python/pylibcudf/pylibcudf/io/types.pyx index 72ebea19467..5d209ee7071 100644 --- a/python/pylibcudf/pylibcudf/io/types.pyx +++ b/python/pylibcudf/pylibcudf/io/types.pyx @@ -270,10 +270,6 @@ cdef class ColumnInMetadata: """ return dereference(self.c_obj).get_name().decode() - def __del__(self): - # strictly for testing purposes - print("Deleting ColumnInMetadata object...") - cdef class TableInputMetadata: """ @@ -295,10 +291,6 @@ cdef class TableInputMetadata: col_meta.table = self self.column_metadata.append(col_meta) - def __del__(self): - # strictly for testing purposes - print("Deleting TableInputMetadata object...") - cdef class TableWithMetadata: """A container holding a table and its associated metadata diff --git a/python/pylibcudf/pylibcudf/tests/io/test_types.py b/python/pylibcudf/pylibcudf/tests/io/test_types.py index 1e25961968b..959c50053f5 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_types.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_types.py @@ -1,39 +1,25 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import gc -import io -from contextlib import redirect_stdout import pyarrow as pa import pylibcudf as plc -def test_gc_with_table_and_column_input_metadata(): - buffer = io.StringIO() +def test_gc_with_table_and_column_input_metadata(monkeypatch): + class A(plc.io.types.TableInputMetadata): + def __del__(self): + print("Deleting A...") - with redirect_stdout(buffer): - pa_table = pa.table( - {"a": pa.array([1, 2, 3]), "b": pa.array(["a", "b", "c"])} - ) + pa_table = pa.table( + {"a": pa.array([1, 2, 3]), "b": pa.array(["a", "b", "c"])} + ) + plc_table = plc.interop.from_arrow(pa_table) - plc_table = plc.interop.from_arrow(pa_table) + tbl_meta = A(plc_table) - tbl_meta = plc.io.types.TableInputMetadata(plc_table) + del tbl_meta - del tbl_meta - - collected = gc.collect() # force gc - - output = buffer.getvalue() - - # the circular reference creates one uncollectable object, 2+1+1 = 4 - assert ( - collected == 4 - ), f"Expected 4 collected objects, but got {collected} objects" - assert ( - output.count("ColumnInMetadata") == 2 - ), f"Expected 2 deleted column objects, but got {output.count("ColumnInMetadata")} objects" - assert ( - output.count("TableInputMetadata") == 1 - ), f"Expected 1 deleted table object, but got {output.count("TableInputMetadata")} objects" + # Circular reference creates one uncollectable object 1 + assert gc.collect() == 4 From b2a154a8bf60ad75d46dd7a0c315b1fa6bd9d2a2 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Mon, 25 Nov 2024 15:08:43 -0800 Subject: [PATCH 13/16] enable diable gc in test --- python/pylibcudf/pylibcudf/tests/io/test_types.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/python/pylibcudf/pylibcudf/tests/io/test_types.py b/python/pylibcudf/pylibcudf/tests/io/test_types.py index 959c50053f5..c1f31df1208 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_types.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_types.py @@ -19,7 +19,12 @@ def __del__(self): tbl_meta = A(plc_table) + gc.disable() + gc.collect() + del tbl_meta - # Circular reference creates one uncollectable object 1 + # Circular reference creates an additional uncollectable object assert gc.collect() == 4 + + gc.enable() From 8ae013fd9908aae112fd6a0c5267fd1b6d29b3a1 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Mon, 25 Nov 2024 19:44:14 -0800 Subject: [PATCH 14/16] address review --- .../pylibcudf/pylibcudf/tests/io/test_types.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/python/pylibcudf/pylibcudf/tests/io/test_types.py b/python/pylibcudf/pylibcudf/tests/io/test_types.py index c1f31df1208..44b01b21cce 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_types.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_types.py @@ -1,6 +1,7 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import gc +import weakref import pyarrow as pa @@ -8,23 +9,20 @@ def test_gc_with_table_and_column_input_metadata(monkeypatch): - class A(plc.io.types.TableInputMetadata): + class Foo(plc.io.types.TableInputMetadata): def __del__(self): - print("Deleting A...") + pass pa_table = pa.table( {"a": pa.array([1, 2, 3]), "b": pa.array(["a", "b", "c"])} ) plc_table = plc.interop.from_arrow(pa_table) - tbl_meta = A(plc_table) - - gc.disable() - gc.collect() + tbl_meta = Foo(plc_table) + weak_tbl_meta = weakref.ref(tbl_meta) del tbl_meta - # Circular reference creates an additional uncollectable object - assert gc.collect() == 4 + gc.collect() - gc.enable() + assert weak_tbl_meta() is None From b51c9263621be3d0aa24d7204cbdb3abf4673cad Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Mon, 25 Nov 2024 22:58:30 -0500 Subject: [PATCH 15/16] Update python/pylibcudf/pylibcudf/tests/io/test_types.py --- python/pylibcudf/pylibcudf/tests/io/test_types.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pylibcudf/pylibcudf/tests/io/test_types.py b/python/pylibcudf/pylibcudf/tests/io/test_types.py index 44b01b21cce..a7642556bf2 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_types.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_types.py @@ -8,7 +8,7 @@ import pylibcudf as plc -def test_gc_with_table_and_column_input_metadata(monkeypatch): +def test_gc_with_table_and_column_input_metadata(): class Foo(plc.io.types.TableInputMetadata): def __del__(self): pass From 5b12ef59776be0450d43efd288580e66d3b3dbd8 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Tue, 26 Nov 2024 05:05:09 -0800 Subject: [PATCH 16/16] address review --- python/pylibcudf/pylibcudf/io/types.pxd | 10 +++++---- python/pylibcudf/pylibcudf/io/types.pyx | 28 +++++++++++++------------ 2 files changed, 21 insertions(+), 17 deletions(-) diff --git a/python/pylibcudf/pylibcudf/io/types.pxd b/python/pylibcudf/pylibcudf/io/types.pxd index 161dd8949ca..a1f3b17936c 100644 --- a/python/pylibcudf/pylibcudf/io/types.pxd +++ b/python/pylibcudf/pylibcudf/io/types.pxd @@ -30,6 +30,7 @@ cdef class PartitionInfo: cdef class ColumnInMetadata: cdef column_in_metadata* c_obj + cdef TableInputMetadata owner cdef TableInputMetadata table @@ -43,13 +44,13 @@ cdef class ColumnInMetadata: cpdef ColumnInMetadata set_int96_timestamps(self, bool req) - cpdef ColumnInMetadata set_decimal_precision(self, int req) + cpdef ColumnInMetadata set_decimal_precision(self, uint8_t precision) cpdef ColumnInMetadata child(self, size_type i) cpdef ColumnInMetadata set_output_as_binary(self, bool binary) - cpdef ColumnInMetadata set_type_length(self, int type_length) + cpdef ColumnInMetadata set_type_length(self, int32_t type_length) cpdef ColumnInMetadata set_skip_compression(self, bool skip) @@ -58,10 +59,11 @@ cdef class ColumnInMetadata: cpdef str get_name(self) @staticmethod - cdef ColumnInMetadata from_libcudf_ptr(column_in_metadata* data) + cdef ColumnInMetadata from_libcudf( + column_in_metadata* metadata, TableInputMetadata owner + ) cdef class TableInputMetadata: - cdef public Table table cdef table_input_metadata c_obj cdef list column_metadata diff --git a/python/pylibcudf/pylibcudf/io/types.pyx b/python/pylibcudf/pylibcudf/io/types.pyx index 5d209ee7071..a2155829f2c 100644 --- a/python/pylibcudf/pylibcudf/io/types.pyx +++ b/python/pylibcudf/pylibcudf/io/types.pyx @@ -87,16 +87,22 @@ cdef class ColumnInMetadata: ) @staticmethod - cdef ColumnInMetadata from_libcudf_ptr(column_in_metadata* metadata): + cdef ColumnInMetadata from_libcudf( + column_in_metadata* metadata, TableInputMetadata owner + ): """ - Create a Python ColumnInMetadata from a libcudf column_in_metadata pointer. + A Python representation of `column_in_metadata`. Parameters ---------- metadata : column_in_metadata* + Raw pointer to C++ metadata. + owner : TableInputMetadata + Owning table input metadata that manages lifetime of the raw pointer. """ cdef ColumnInMetadata out = ColumnInMetadata.__new__(ColumnInMetadata) out.c_obj = metadata + out.owner = owner return out cpdef ColumnInMetadata set_name(self, str name): @@ -160,7 +166,7 @@ cdef class ColumnInMetadata: dereference(self.c_obj).set_int96_timestamps(req) return self - cpdef ColumnInMetadata set_decimal_precision(self, int precision): + cpdef ColumnInMetadata set_decimal_precision(self, uint8_t precision): """ Set the decimal precision of this column. Only valid if this column is a decimal (fixed-point) type. @@ -191,7 +197,7 @@ cdef class ColumnInMetadata: ColumnInMetadata """ cdef column_in_metadata* child_c_obj = &dereference(self.c_obj).child(i) - return ColumnInMetadata.from_libcudf_ptr(child_c_obj) + return ColumnInMetadata.from_libcudf(child_c_obj, self.owner) cpdef ColumnInMetadata set_output_as_binary(self, bool binary): """ @@ -209,7 +215,7 @@ cdef class ColumnInMetadata: dereference(self.c_obj).set_output_as_binary(binary) return self - cpdef ColumnInMetadata set_type_length(self, int type_length): + cpdef ColumnInMetadata set_type_length(self, int32_t type_length): """ Sets the length of fixed length data. @@ -282,14 +288,10 @@ cdef class TableInputMetadata: """ def __init__(self, Table table): self.c_obj = table_input_metadata(table.view()) - self.column_metadata = [] - self.table = table - - cdef int num_columns = self.c_obj.column_metadata.size() - for i in range(num_columns): - col_meta = ColumnInMetadata.from_libcudf_ptr(&self.c_obj.column_metadata[i]) - col_meta.table = self - self.column_metadata.append(col_meta) + self.column_metadata = [ + ColumnInMetadata.from_libcudf(&self.c_obj.column_metadata[i], self) + for i in range(self.c_obj.column_metadata.size()) + ] cdef class TableWithMetadata: