From 569cd04c3e521f04b7d128820c820d919de45ccb Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 31 Oct 2024 16:06:15 -0700 Subject: [PATCH 1/6] Start implementing io/text --- .../api_docs/pylibcudf/io/index.rst | 1 + .../user_guide/api_docs/pylibcudf/io/text.rst | 6 + python/cudf/cudf/_lib/text.pyx | 80 ++------ python/pylibcudf/pylibcudf/io/CMakeLists.txt | 2 +- python/pylibcudf/pylibcudf/io/__init__.pxd | 2 +- python/pylibcudf/pylibcudf/io/__init__.py | 2 +- python/pylibcudf/pylibcudf/io/text.pxd | 29 +++ python/pylibcudf/pylibcudf/io/text.pyx | 184 ++++++++++++++++++ 8 files changed, 244 insertions(+), 62 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/io/text.rst create mode 100644 python/pylibcudf/pylibcudf/io/text.pxd create mode 100644 python/pylibcudf/pylibcudf/io/text.pyx diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst index 53638f071cc..cd5c5a5f77e 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst @@ -19,4 +19,5 @@ I/O Functions csv json parquet + text timezone diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/text.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/text.rst new file mode 100644 index 00000000000..327ca043f36 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/text.rst @@ -0,0 +1,6 @@ +==== +text +==== + +.. 
automodule:: pylibcudf.io.text + :members: diff --git a/python/cudf/cudf/_lib/text.pyx b/python/cudf/cudf/_lib/text.pyx index b2c7232f549..c1204030124 100644 --- a/python/cudf/cudf/_lib/text.pyx +++ b/python/cudf/cudf/_lib/text.pyx @@ -2,32 +2,17 @@ from io import TextIOBase -from cython.operator cimport dereference -from libc.stdint cimport uint64_t -from libcpp.memory cimport unique_ptr -from libcpp.string cimport string -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.io.text cimport ( - byte_range_info, - data_chunk_source, - make_source, - make_source_from_bgzip_file, - make_source_from_file, - multibyte_split, - parse_options, -) +import pylibcudf as plc from cudf._lib.column cimport Column def read_text(object filepaths_or_buffers, - object delimiter=None, - object byte_range=None, - object strip_delimiters=False, - object compression=None, - object compression_offsets=None): + str delimiter, + object byte_range, + bool strip_delimiters, + object compression, + object compression_offsets): """ Cython function to call into libcudf API, see `multibyte_split`. 
@@ -35,24 +20,11 @@ def read_text(object filepaths_or_buffers, -------- cudf.io.text.read_text """ - cdef string delim = delimiter.encode() - - cdef unique_ptr[data_chunk_source] datasource - cdef unique_ptr[column] c_col - - cdef size_t c_byte_range_offset - cdef size_t c_byte_range_size - cdef uint64_t c_compression_begin_offset - cdef uint64_t c_compression_end_offset - cdef parse_options c_options - if compression is None: if isinstance(filepaths_or_buffers, TextIOBase): - datasource = move(make_source( - filepaths_or_buffers.read().encode())) + datasource = plc.io.text.make_source(filepaths_or_buffers.read()) else: - datasource = move(make_source_from_file( - filepaths_or_buffers.encode())) + datasource = plc.io.text.make_source_from_file(filepaths_or_buffers) elif compression == "bgzip": if isinstance(filepaths_or_buffers, TextIOBase): raise ValueError("bgzip compression requires a file path") @@ -60,30 +32,20 @@ def read_text(object filepaths_or_buffers, if len(compression_offsets) != 2: raise ValueError( "compression offsets need to consist of two elements") - c_compression_begin_offset = compression_offsets[0] - c_compression_end_offset = compression_offsets[1] - datasource = move(make_source_from_bgzip_file( - filepaths_or_buffers.encode(), - c_compression_begin_offset, - c_compression_end_offset)) + datasource = plc.io.text.make_source_from_bgzip_file( + filepaths_or_buffers, + compression_offsets[0], + compression_offsets[1] + ) else: - datasource = move(make_source_from_bgzip_file( - filepaths_or_buffers.encode())) + datasource = plc.io.text.make_source_from_bgzip_file( + filepaths_or_buffers, + ) else: raise ValueError("Only bgzip compression is supported at the moment") - c_options = parse_options() - if byte_range is not None: - c_byte_range_offset = byte_range[0] - c_byte_range_size = byte_range[1] - c_options.byte_range = byte_range_info( - c_byte_range_offset, - c_byte_range_size) - c_options.strip_delimiters = strip_delimiters - with nogil: - 
c_col = move(multibyte_split( - dereference(datasource), - delim, - c_options)) - - return Column.from_unique_ptr(move(c_col)) + options = plc.io.text.ParseOptions( + byte_range=byte_range, strip_delimiters=strip_delimiters + ) + plc_column = plc.io.text.multibyte_split(datasource, delimiter, options) + return Column.from_pylibcudf(plc_column) diff --git a/python/pylibcudf/pylibcudf/io/CMakeLists.txt b/python/pylibcudf/pylibcudf/io/CMakeLists.txt index 965724a47b1..f78d97ef4d1 100644 --- a/python/pylibcudf/pylibcudf/io/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/io/CMakeLists.txt @@ -13,7 +13,7 @@ # ============================================================================= set(cython_sources avro.pyx csv.pyx datasource.pyx json.pyx orc.pyx parquet.pyx timezone.pyx - types.pyx + text.pyx types.pyx ) set(linked_libraries cudf::cudf) diff --git a/python/pylibcudf/pylibcudf/io/__init__.pxd b/python/pylibcudf/pylibcudf/io/__init__.pxd index 1bcc0a3f963..6ba7f78a013 100644 --- a/python/pylibcudf/pylibcudf/io/__init__.pxd +++ b/python/pylibcudf/pylibcudf/io/__init__.pxd @@ -1,5 +1,5 @@ # Copyright (c) 2024, NVIDIA CORPORATION. # CSV is removed since it is def not cpdef (to force kw-only arguments) -from . cimport avro, datasource, json, orc, parquet, timezone, types +from . cimport avro, datasource, json, orc, parquet, timezone, text, types from .types cimport SourceInfo, TableWithMetadata diff --git a/python/pylibcudf/pylibcudf/io/__init__.py b/python/pylibcudf/pylibcudf/io/__init__.py index 2e4f215b12c..0fc77dd0f57 100644 --- a/python/pylibcudf/pylibcudf/io/__init__.py +++ b/python/pylibcudf/pylibcudf/io/__init__.py @@ -1,4 +1,4 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . import avro, csv, datasource, json, orc, parquet, timezone, types +from . 
import avro, csv, datasource, json, orc, parquet, timezone, text, types from .types import SinkInfo, SourceInfo, TableWithMetadata diff --git a/python/pylibcudf/pylibcudf/io/text.pxd b/python/pylibcudf/pylibcudf/io/text.pxd new file mode 100644 index 00000000000..051a8ae61aa --- /dev/null +++ b/python/pylibcudf/pylibcudf/io/text.pxd @@ -0,0 +1,29 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.libcudf.io.text cimport parse_options, + +cdef class ParseOptions: + cdef parse_options c_options + +cdef class DataChunkSource: + cdef data_chunk_source c_data_chunk_source + + cdef DataChunkSource from_source(data_chunk_source source) + + +cpdef Column multibyte_split( + source, + str delimiter, + ParseOptions options=* +) + +cpdef DataChunkSource make_source(str data) + +cpdef DataChunkSource make_source_from_file(str filename) + +cpdef DataChunkSource make_source_from_bgzip_file( + str filename, + int virtual_begin=*, + int virtual_end=*, +) diff --git a/python/pylibcudf/pylibcudf/io/text.pyx b/python/pylibcudf/pylibcudf/io/text.pyx new file mode 100644 index 00000000000..6e71536a09f --- /dev/null +++ b/python/pylibcudf/pylibcudf/io/text.pyx @@ -0,0 +1,184 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from libc.stdint cimport uint64_t +from libcpp.memory cimport unique_ptr +from libcpp.string cimport string +from libcpp.utility cimport move + +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.io cimport text as cpp_text + +cdef class ParseOptions: + """Parsing options for `multibyte_split`""" + def __init__( + self, + *, + byte_range=None, + strip_delimiters=False, + ): + self.c_options = cpp_text.parse_options() + if byte_range is not None: + c_byte_range_offset = byte_range[0] + c_byte_range_size = byte_range[1] + self.c_options.byte_range = cpp_text.byte_range_info( + c_byte_range_offset, + c_byte_range_size + ) + self.c_options.strip_delimiters = strip_delimiters + + +cdef class DataChunkSource: + """Data source for `multibyte_split`""" + + def __init__(self): + raise ValueError( + "This class cannot be instantiated directly. " + "Use one of the make_source functions instead" + ) + + @staticmethod + cdef DataChunkSource from_source(data_chunk_source source): + cdef DataChunkSource datasource = DataChunkSource.__new__(DataChunkSource) + datasource.c_data_chunk_source = source + return datasource + + +cpdef DataChunkSource make_source(str data): + """ + Creates a data source capable of producing device-buffered views + of the given string. + + Parameters + ---------- + data : str + The host data to be exposed as a data chunk source. + + Returns + ------- + DataChunkSource + The data chunk source for the provided host data. + """ + cdef data_chunk_source c_source + cdef string c_data = data.encode() + + with nogil: + c_source = cpp_text.make_source(c_data) + + return DataChunkSource.from_source(c_source) + + +cpdef DataChunkSource make_source_from_file(str filename): + """ + Creates a data source capable of producing device-buffered views of the file. + + Parameters + ---------- + filename : str + The filename of the file to be exposed as a data chunk source. 
+ + Returns + ------- + DataChunkSource + The data chunk source for the provided filename. + """ + cdef data_chunk_source c_source + cdef string c_filename = filename.encode() + + with nogil: + c_source = cpp_text.make_source_from_file(c_filename) + + return DataChunkSource.from_source(c_source) + +cpdef DataChunkSource make_source_from_bgzip_file( + str filename, + int virtual_begin=None, + int virtual_end=None, +): + """ + Creates a data source capable of producing device-buffered views of + a BGZIP compressed file with virtual record offsets. + + Parameters + ---------- + filename : str + The filename of the BGZIP-compressed file to be exposed as a data chunk source. + + virtual_begin : int, default None + The virtual (Tabix) offset of the first byte to be read. Its upper 48 bits + describe the offset into the compressed file, its lower 16 bits describe the + block-local offset. + + virtual_end : int, default None + The data chunk source for the provided filename. + + Returns + ------- + DataChunkSource + The data chunk source for the provided filename. + """ + cdef data_chunk_source c_source + cdef string c_filename = filename.encode() + + if virtual_begin is None and virtual_end is None: + with nogil: + c_source = cpp_text.make_source_from_bgzip_file(c_filename) + elif virtual_begin is not None and virtual_end is not None: + cdef uint64_t c_virtual_begin = virtual_begin + cdef uint64_t c_virtual_end = c_virtual_end + with nogil: + c_source = cpp_text.make_source_from_bgzip_file( + c_filename, + virtual_begin, + c_virtual_end + ) + else: + raise ValueError( + "virtual_begin and virtual_end must both be None or both be int" + ) + return DataChunkSource.from_source(c_source) + +cpdef Column multibyte_split( + DataChunkSource source, + str delimiter, + ParseOptions options=None +): + """ + Splits the source text into a strings column using a multiple byte delimiter. 
+ + For details, see :cpp:func:`cudf::io::text::multibyte_split` + + Parameters + ---------- + source : + The source string. + + delimiter : str + UTF-8 encoded string for which to find offsets in the source. + + options : ParseOptions + The parsing options to use (including byte range). + + Returns + ------- + Column + The strings found by splitting the source by the delimiter + within the relevant byte range. + """ + cdef unique_ptr[column] c_result + cdef data_chunk_source c_source = source.c_data_chunk_source + cdef string c_delimiter = delimiter.encode() + + if options is None: + options = ParseOptions() + + cdef cpp_text.parse_options c_options = options.c_options + + with nogil: + c_result = cpp_text.multibyte_split( + c_source + c_delimiter, + c_options + ) + + return Column.from_libcudf(move(c_result)) From 74240f30ee3b9a99c0949f228aa1c05a9b2d80f9 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 31 Oct 2024 17:31:34 -0700 Subject: [PATCH 2/6] Fix implementation --- python/cudf/cudf/_lib/text.pyx | 2 + python/pylibcudf/pylibcudf/io/text.pxd | 10 +++-- python/pylibcudf/pylibcudf/io/text.pyx | 43 ++++++++++--------- .../pylibcudf/pylibcudf/tests/io/test_text.py | 31 +++++++++++++ 4 files changed, 62 insertions(+), 24 deletions(-) create mode 100644 python/pylibcudf/pylibcudf/tests/io/test_text.py diff --git a/python/cudf/cudf/_lib/text.pyx b/python/cudf/cudf/_lib/text.pyx index c1204030124..7942d067c2b 100644 --- a/python/cudf/cudf/_lib/text.pyx +++ b/python/cudf/cudf/_lib/text.pyx @@ -1,5 +1,7 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. +from libcpp cimport bool + from io import TextIOBase import pylibcudf as plc diff --git a/python/pylibcudf/pylibcudf/io/text.pxd b/python/pylibcudf/pylibcudf/io/text.pxd index 051a8ae61aa..10538f48f68 100644 --- a/python/pylibcudf/pylibcudf/io/text.pxd +++ b/python/pylibcudf/pylibcudf/io/text.pxd @@ -1,19 +1,21 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
+from libcpp.memory cimport unique_ptr from pylibcudf.column cimport Column -from pylibcudf.libcudf.io.text cimport parse_options, +from pylibcudf.libcudf.io.text cimport parse_options, data_chunk_source cdef class ParseOptions: cdef parse_options c_options cdef class DataChunkSource: - cdef data_chunk_source c_data_chunk_source + cdef unique_ptr[data_chunk_source] c_data_chunk_source - cdef DataChunkSource from_source(data_chunk_source source) + @staticmethod + cdef DataChunkSource from_source(unique_ptr[data_chunk_source] source) cpdef Column multibyte_split( - source, + DataChunkSource source, str delimiter, ParseOptions options=* ) diff --git a/python/pylibcudf/pylibcudf/io/text.pyx b/python/pylibcudf/pylibcudf/io/text.pyx index 6e71536a09f..4c01c681437 100644 --- a/python/pylibcudf/pylibcudf/io/text.pyx +++ b/python/pylibcudf/pylibcudf/io/text.pyx @@ -1,5 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION. +from cython.operator cimport dereference from libc.stdint cimport uint64_t from libcpp.memory cimport unique_ptr from libcpp.string cimport string @@ -34,13 +35,13 @@ cdef class DataChunkSource: def __init__(self): raise ValueError( "This class cannot be instantiated directly. " - "Use one of the make_source functions instead" + "Use one of the make_source functions instead." ) @staticmethod - cdef DataChunkSource from_source(data_chunk_source source): + cdef DataChunkSource from_source(unique_ptr[data_chunk_source] source): cdef DataChunkSource datasource = DataChunkSource.__new__(DataChunkSource) - datasource.c_data_chunk_source = source + datasource.c_data_chunk_source = move(source) return datasource @@ -59,13 +60,13 @@ cpdef DataChunkSource make_source(str data): DataChunkSource The data chunk source for the provided host data. 
""" - cdef data_chunk_source c_source + cdef unique_ptr[data_chunk_source] c_source cdef string c_data = data.encode() with nogil: c_source = cpp_text.make_source(c_data) - return DataChunkSource.from_source(c_source) + return DataChunkSource.from_source(move(c_source)) cpdef DataChunkSource make_source_from_file(str filename): @@ -82,18 +83,18 @@ cpdef DataChunkSource make_source_from_file(str filename): DataChunkSource The data chunk source for the provided filename. """ - cdef data_chunk_source c_source + cdef unique_ptr[data_chunk_source] c_source cdef string c_filename = filename.encode() with nogil: c_source = cpp_text.make_source_from_file(c_filename) - return DataChunkSource.from_source(c_source) + return DataChunkSource.from_source(move(c_source)) cpdef DataChunkSource make_source_from_bgzip_file( str filename, - int virtual_begin=None, - int virtual_end=None, + int virtual_begin=-1, + int virtual_end=-1, ): """ Creates a data source capable of producing device-buffered views of @@ -104,39 +105,41 @@ cpdef DataChunkSource make_source_from_bgzip_file( filename : str The filename of the BGZIP-compressed file to be exposed as a data chunk source. - virtual_begin : int, default None + virtual_begin : int The virtual (Tabix) offset of the first byte to be read. Its upper 48 bits describe the offset into the compressed file, its lower 16 bits describe the block-local offset. virtual_end : int, default None - The data chunk source for the provided filename. + The virtual (Tabix) offset one past the last byte to be read Returns ------- DataChunkSource The data chunk source for the provided filename. 
""" - cdef data_chunk_source c_source + cdef unique_ptr[data_chunk_source] c_source cdef string c_filename = filename.encode() + cdef uint64_t c_virtual_begin + cdef uint64_t c_virtual_end - if virtual_begin is None and virtual_end is None: + if virtual_begin == -1 and virtual_end == -1: with nogil: c_source = cpp_text.make_source_from_bgzip_file(c_filename) - elif virtual_begin is not None and virtual_end is not None: - cdef uint64_t c_virtual_begin = virtual_begin - cdef uint64_t c_virtual_end = c_virtual_end + elif virtual_begin != -1 and virtual_end != -1: + c_virtual_begin = virtual_begin + c_virtual_end = virtual_end with nogil: c_source = cpp_text.make_source_from_bgzip_file( c_filename, - virtual_begin, + c_virtual_begin, c_virtual_end ) else: raise ValueError( "virtual_begin and virtual_end must both be None or both be int" ) - return DataChunkSource.from_source(c_source) + return DataChunkSource.from_source(move(c_source)) cpdef Column multibyte_split( DataChunkSource source, @@ -166,7 +169,7 @@ cpdef Column multibyte_split( within the relevant byte range. """ cdef unique_ptr[column] c_result - cdef data_chunk_source c_source = source.c_data_chunk_source + cdef unique_ptr[data_chunk_source] c_source = move(source.c_data_chunk_source) cdef string c_delimiter = delimiter.encode() if options is None: @@ -176,7 +179,7 @@ cpdef Column multibyte_split( with nogil: c_result = cpp_text.multibyte_split( - c_source + dereference(c_source), c_delimiter, c_options ) diff --git a/python/pylibcudf/pylibcudf/tests/io/test_text.py b/python/pylibcudf/pylibcudf/tests/io/test_text.py new file mode 100644 index 00000000000..b809b8fb9d6 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/io/test_text.py @@ -0,0 +1,31 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ + +import pyarrow as pa +import pytest +from utils import assert_column_eq + +import pylibcudf as plc + + +@pytest.mark.parametrize( + "source_func", + [ + "make_source", + "make_source_from_file", + ], +) +@pytest.mark.parametrize("options", [None, plc.io.text.ParseOptions()]) +def test_multibyte_split(source_func, options, tmp_path): + data = "x::y::z" + func = getattr(plc.io.text, source_func) + if source_func == "make_source": + source = func(data) + elif source_func == "make_source_from_file": + fle = tmp_path / "fle.txt" + fle.write_text(data) + source = func(str(fle)) + result = plc.io.text.multibyte_split(source, "::", options) + expected = pa.array(["x::", "y::", "z"]) + breakpoint() + assert_column_eq(result, expected) From abdbb0ad725982a7083684f68c510d779a0f4894 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 1 Nov 2024 10:31:40 -0700 Subject: [PATCH 3/6] Remove breakpoint --- python/pylibcudf/pylibcudf/tests/io/test_text.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/pylibcudf/pylibcudf/tests/io/test_text.py b/python/pylibcudf/pylibcudf/tests/io/test_text.py index b809b8fb9d6..f69e940e34e 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_text.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_text.py @@ -1,6 +1,5 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
- import pyarrow as pa import pytest from utils import assert_column_eq @@ -27,5 +26,4 @@ def test_multibyte_split(source_func, options, tmp_path): source = func(str(fle)) result = plc.io.text.multibyte_split(source, "::", options) expected = pa.array(["x::", "y::", "z"]) - breakpoint() assert_column_eq(result, expected) From 39a447553a99d9acdcdb407a1a78b170b6255bdb Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 4 Nov 2024 10:39:01 -0800 Subject: [PATCH 4/6] document parameters in ParseOptions --- python/pylibcudf/pylibcudf/io/text.pyx | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/python/pylibcudf/pylibcudf/io/text.pyx b/python/pylibcudf/pylibcudf/io/text.pyx index 4c01c681437..96baed0956c 100644 --- a/python/pylibcudf/pylibcudf/io/text.pyx +++ b/python/pylibcudf/pylibcudf/io/text.pyx @@ -11,7 +11,19 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.io cimport text as cpp_text cdef class ParseOptions: - """Parsing options for `multibyte_split`""" + """ + Parsing options for `multibyte_split` + + Parameters + ---------- + byte_range : list | tuple, default None + Only rows starting inside this byte range will be + part of the output column. + + strip_delimiters : bool, default False + Whether delimiters at the end of rows should + be stripped from the output column. 
+ """ def __init__( self, *, From b06d64e71b1a9704bd039ac550e1d9b4cf544529 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 4 Nov 2024 14:06:51 -0800 Subject: [PATCH 5/6] store reference of bytes data on DataChunkSource --- python/pylibcudf/pylibcudf/io/text.pxd | 7 ++-- python/pylibcudf/pylibcudf/io/text.pyx | 57 +++++++++++--------------- 2 files changed, 28 insertions(+), 36 deletions(-) diff --git a/python/pylibcudf/pylibcudf/io/text.pxd b/python/pylibcudf/pylibcudf/io/text.pxd index 10538f48f68..051e9bc0cde 100644 --- a/python/pylibcudf/pylibcudf/io/text.pxd +++ b/python/pylibcudf/pylibcudf/io/text.pxd @@ -1,6 +1,7 @@ # Copyright (c) 2024, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr +from libcpp.string cimport string from pylibcudf.column cimport Column from pylibcudf.libcudf.io.text cimport parse_options, data_chunk_source @@ -8,10 +9,8 @@ cdef class ParseOptions: cdef parse_options c_options cdef class DataChunkSource: - cdef unique_ptr[data_chunk_source] c_data_chunk_source - - @staticmethod - cdef DataChunkSource from_source(unique_ptr[data_chunk_source] source) + cdef unique_ptr[data_chunk_source] c_source + cdef string data_ref cpdef Column multibyte_split( diff --git a/python/pylibcudf/pylibcudf/io/text.pyx b/python/pylibcudf/pylibcudf/io/text.pyx index 96baed0956c..f969075cdf9 100644 --- a/python/pylibcudf/pylibcudf/io/text.pyx +++ b/python/pylibcudf/pylibcudf/io/text.pyx @@ -42,19 +42,17 @@ cdef class ParseOptions: cdef class DataChunkSource: - """Data source for `multibyte_split`""" + """ + Data source for `multibyte_split` - def __init__(self): - raise ValueError( - "This class cannot be instantiated directly. " - "Use one of the make_source functions instead." - ) + Parameters + ---------- + data : str + Filename or data itself. 
+ """ - @staticmethod - cdef DataChunkSource from_source(unique_ptr[data_chunk_source] source): - cdef DataChunkSource datasource = DataChunkSource.__new__(DataChunkSource) - datasource.c_data_chunk_source = move(source) - return datasource + def __cinit__(self, str data): + self.data_ref = data.encode() cpdef DataChunkSource make_source(str data): @@ -72,13 +70,10 @@ cpdef DataChunkSource make_source(str data): DataChunkSource The data chunk source for the provided host data. """ - cdef unique_ptr[data_chunk_source] c_source - cdef string c_data = data.encode() - + cdef DataChunkSource dcs = DataChunkSource(data) with nogil: - c_source = cpp_text.make_source(c_data) - - return DataChunkSource.from_source(move(c_source)) + dcs.c_source = move(cpp_text.make_source(dcs.data_ref)) + return dcs cpdef DataChunkSource make_source_from_file(str filename): @@ -95,13 +90,10 @@ cpdef DataChunkSource make_source_from_file(str filename): DataChunkSource The data chunk source for the provided filename. """ - cdef unique_ptr[data_chunk_source] c_source - cdef string c_filename = filename.encode() - + cdef DataChunkSource dcs = DataChunkSource(filename) with nogil: - c_source = cpp_text.make_source_from_file(c_filename) - - return DataChunkSource.from_source(move(c_source)) + dcs.c_source = move(cpp_text.make_source_from_file(dcs.data_ref)) + return dcs cpdef DataChunkSource make_source_from_bgzip_file( str filename, @@ -130,28 +122,29 @@ cpdef DataChunkSource make_source_from_bgzip_file( DataChunkSource The data chunk source for the provided filename. 
""" - cdef unique_ptr[data_chunk_source] c_source - cdef string c_filename = filename.encode() cdef uint64_t c_virtual_begin cdef uint64_t c_virtual_end + cdef DataChunkSource dcs = DataChunkSource(filename) if virtual_begin == -1 and virtual_end == -1: with nogil: - c_source = cpp_text.make_source_from_bgzip_file(c_filename) + dcs.c_source = move(cpp_text.make_source_from_bgzip_file(dcs.data_ref)) elif virtual_begin != -1 and virtual_end != -1: c_virtual_begin = virtual_begin c_virtual_end = virtual_end with nogil: - c_source = cpp_text.make_source_from_bgzip_file( - c_filename, - c_virtual_begin, - c_virtual_end + dcs.c_source = move( + cpp_text.make_source_from_bgzip_file( + dcs.data_ref, + c_virtual_begin, + c_virtual_end, + ) ) else: raise ValueError( "virtual_begin and virtual_end must both be None or both be int" ) - return DataChunkSource.from_source(move(c_source)) + return dcs cpdef Column multibyte_split( DataChunkSource source, @@ -181,7 +174,7 @@ cpdef Column multibyte_split( within the relevant byte range. """ cdef unique_ptr[column] c_result - cdef unique_ptr[data_chunk_source] c_source = move(source.c_data_chunk_source) + cdef unique_ptr[data_chunk_source] c_source = move(source.c_source) cdef string c_delimiter = delimiter.encode() if options is None: From 169604a4fb4b5f8e8a840f839ddd01caea9f7f36 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 4 Nov 2024 14:10:08 -0800 Subject: [PATCH 6/6] add comment about lifetime --- python/pylibcudf/pylibcudf/io/text.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/python/pylibcudf/pylibcudf/io/text.pyx b/python/pylibcudf/pylibcudf/io/text.pyx index f969075cdf9..667a054baaa 100644 --- a/python/pylibcudf/pylibcudf/io/text.pyx +++ b/python/pylibcudf/pylibcudf/io/text.pyx @@ -52,6 +52,7 @@ cdef class DataChunkSource: """ def __cinit__(self, str data): + # Need to keep a reference alive for make_source self.data_ref = data.encode()