From 569cd04c3e521f04b7d128820c820d919de45ccb Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 31 Oct 2024 16:06:15 -0700
Subject: [PATCH 1/6] Start implementing io/text

---
 .../api_docs/pylibcudf/io/index.rst           |   1 +
 .../user_guide/api_docs/pylibcudf/io/text.rst |   6 +
 python/cudf/cudf/_lib/text.pyx                |  80 ++------
 python/pylibcudf/pylibcudf/io/CMakeLists.txt  |   2 +-
 python/pylibcudf/pylibcudf/io/__init__.pxd    |   2 +-
 python/pylibcudf/pylibcudf/io/__init__.py     |   2 +-
 python/pylibcudf/pylibcudf/io/text.pxd        |  29 +++
 python/pylibcudf/pylibcudf/io/text.pyx        | 184 ++++++++++++++++++
 8 files changed, 244 insertions(+), 62 deletions(-)
 create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/io/text.rst
 create mode 100644 python/pylibcudf/pylibcudf/io/text.pxd
 create mode 100644 python/pylibcudf/pylibcudf/io/text.pyx

diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst
index 53638f071cc..cd5c5a5f77e 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst
@@ -19,4 +19,5 @@ I/O Functions
     csv
     json
     parquet
+    text
     timezone
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/text.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/text.rst
new file mode 100644
index 00000000000..327ca043f36
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/text.rst
@@ -0,0 +1,6 @@
+====
+text
+====
+
+.. automodule:: pylibcudf.io.text
+   :members:
diff --git a/python/cudf/cudf/_lib/text.pyx b/python/cudf/cudf/_lib/text.pyx
index b2c7232f549..c1204030124 100644
--- a/python/cudf/cudf/_lib/text.pyx
+++ b/python/cudf/cudf/_lib/text.pyx
@@ -2,32 +2,17 @@
 
 from io import TextIOBase
 
-from cython.operator cimport dereference
-from libc.stdint cimport uint64_t
-from libcpp.memory cimport unique_ptr
-from libcpp.string cimport string
-from libcpp.utility cimport move
-
-from pylibcudf.libcudf.column.column cimport column
-from pylibcudf.libcudf.io.text cimport (
-    byte_range_info,
-    data_chunk_source,
-    make_source,
-    make_source_from_bgzip_file,
-    make_source_from_file,
-    multibyte_split,
-    parse_options,
-)
+import pylibcudf as plc
 
 from cudf._lib.column cimport Column
 
 
 def read_text(object filepaths_or_buffers,
-              object delimiter=None,
-              object byte_range=None,
-              object strip_delimiters=False,
-              object compression=None,
-              object compression_offsets=None):
+              str delimiter,
+              object byte_range,
+              bool strip_delimiters,
+              object compression,
+              object compression_offsets):
     """
     Cython function to call into libcudf API, see `multibyte_split`.
 
@@ -35,24 +20,11 @@ def read_text(object filepaths_or_buffers,
     --------
     cudf.io.text.read_text
     """
-    cdef string delim = delimiter.encode()
-
-    cdef unique_ptr[data_chunk_source] datasource
-    cdef unique_ptr[column] c_col
-
-    cdef size_t c_byte_range_offset
-    cdef size_t c_byte_range_size
-    cdef uint64_t c_compression_begin_offset
-    cdef uint64_t c_compression_end_offset
-    cdef parse_options c_options
-
     if compression is None:
         if isinstance(filepaths_or_buffers, TextIOBase):
-            datasource = move(make_source(
-                filepaths_or_buffers.read().encode()))
+            datasource = plc.io.text.make_source(filepaths_or_buffers.read())
         else:
-            datasource = move(make_source_from_file(
-                filepaths_or_buffers.encode()))
+            datasource = plc.io.text.make_source_from_file(filepaths_or_buffers)
     elif compression == "bgzip":
         if isinstance(filepaths_or_buffers, TextIOBase):
             raise ValueError("bgzip compression requires a file path")
@@ -60,30 +32,20 @@ def read_text(object filepaths_or_buffers,
             if len(compression_offsets) != 2:
                 raise ValueError(
                     "compression offsets need to consist of two elements")
-            c_compression_begin_offset = compression_offsets[0]
-            c_compression_end_offset = compression_offsets[1]
-            datasource = move(make_source_from_bgzip_file(
-                filepaths_or_buffers.encode(),
-                c_compression_begin_offset,
-                c_compression_end_offset))
+            datasource = plc.io.text.make_source_from_bgzip_file(
+                filepaths_or_buffers,
+                compression_offsets[0],
+                compression_offsets[1]
+            )
         else:
-            datasource = move(make_source_from_bgzip_file(
-                filepaths_or_buffers.encode()))
+            datasource = plc.io.text.make_source_from_bgzip_file(
+                filepaths_or_buffers,
+            )
     else:
         raise ValueError("Only bgzip compression is supported at the moment")
 
-    c_options = parse_options()
-    if byte_range is not None:
-        c_byte_range_offset = byte_range[0]
-        c_byte_range_size = byte_range[1]
-        c_options.byte_range = byte_range_info(
-            c_byte_range_offset,
-            c_byte_range_size)
-    c_options.strip_delimiters = strip_delimiters
-    with nogil:
-        c_col = move(multibyte_split(
-            dereference(datasource),
-            delim,
-            c_options))
-
-    return Column.from_unique_ptr(move(c_col))
+    options = plc.io.text.ParseOptions(
+        byte_range=byte_range, strip_delimiters=strip_delimiters
+    )
+    plc_column = plc.io.text.multibyte_split(datasource, delimiter, options)
+    return Column.from_pylibcudf(plc_column)
diff --git a/python/pylibcudf/pylibcudf/io/CMakeLists.txt b/python/pylibcudf/pylibcudf/io/CMakeLists.txt
index 965724a47b1..f78d97ef4d1 100644
--- a/python/pylibcudf/pylibcudf/io/CMakeLists.txt
+++ b/python/pylibcudf/pylibcudf/io/CMakeLists.txt
@@ -13,7 +13,7 @@
 # =============================================================================
 
 set(cython_sources avro.pyx csv.pyx datasource.pyx json.pyx orc.pyx parquet.pyx timezone.pyx
-                   types.pyx
+                   text.pyx types.pyx
 )
 
 set(linked_libraries cudf::cudf)
diff --git a/python/pylibcudf/pylibcudf/io/__init__.pxd b/python/pylibcudf/pylibcudf/io/__init__.pxd
index 1bcc0a3f963..6ba7f78a013 100644
--- a/python/pylibcudf/pylibcudf/io/__init__.pxd
+++ b/python/pylibcudf/pylibcudf/io/__init__.pxd
@@ -1,5 +1,5 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 # CSV is removed since it is def not cpdef (to force kw-only arguments)
-from . cimport avro, datasource, json, orc, parquet, timezone, types
+from . cimport avro, datasource, json, orc, parquet, timezone, text, types
 from .types cimport SourceInfo, TableWithMetadata
diff --git a/python/pylibcudf/pylibcudf/io/__init__.py b/python/pylibcudf/pylibcudf/io/__init__.py
index 2e4f215b12c..0fc77dd0f57 100644
--- a/python/pylibcudf/pylibcudf/io/__init__.py
+++ b/python/pylibcudf/pylibcudf/io/__init__.py
@@ -1,4 +1,4 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from . import avro, csv, datasource, json, orc, parquet, timezone, types
+from . import avro, csv, datasource, json, orc, parquet, timezone, text, types
 from .types import SinkInfo, SourceInfo, TableWithMetadata
diff --git a/python/pylibcudf/pylibcudf/io/text.pxd b/python/pylibcudf/pylibcudf/io/text.pxd
new file mode 100644
index 00000000000..051a8ae61aa
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/io/text.pxd
@@ -0,0 +1,29 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.io.text cimport parse_options,
+
+cdef class ParseOptions:
+    cdef parse_options c_options
+
+cdef class DataChunkSource:
+    cdef data_chunk_source c_data_chunk_source
+
+    cdef DataChunkSource from_source(data_chunk_source source)
+
+
+cpdef Column multibyte_split(
+    source,
+    str delimiter,
+    ParseOptions options=*
+)
+
+cpdef DataChunkSource make_source(str data)
+
+cpdef DataChunkSource make_source_from_file(str filename)
+
+cpdef DataChunkSource make_source_from_bgzip_file(
+    str filename,
+    int virtual_begin=*,
+    int virtual_end=*,
+)
diff --git a/python/pylibcudf/pylibcudf/io/text.pyx b/python/pylibcudf/pylibcudf/io/text.pyx
new file mode 100644
index 00000000000..6e71536a09f
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/io/text.pyx
@@ -0,0 +1,184 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libc.stdint cimport uint64_t
+from libcpp.memory cimport unique_ptr
+from libcpp.string cimport string
+from libcpp.utility cimport move
+
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.io cimport text as cpp_text
+
+cdef class ParseOptions:
+    """Parsing options for `multibyte_split`"""
+    def __init__(
+        self,
+        *,
+        byte_range=None,
+        strip_delimiters=False,
+    ):
+        self.c_options = cpp_text.parse_options()
+        if byte_range is not None:
+            c_byte_range_offset = byte_range[0]
+            c_byte_range_size = byte_range[1]
+            self.c_options.byte_range = cpp_text.byte_range_info(
+                c_byte_range_offset,
+                c_byte_range_size
+            )
+        self.c_options.strip_delimiters = strip_delimiters
+
+
+cdef class DataChunkSource:
+    """Data source for `multibyte_split`"""
+
+    def __init__(self):
+        raise ValueError(
+            "This class cannot be instantiated directly. "
+            "Use one of the make_source functions instead"
+        )
+
+    @staticmethod
+    cdef DataChunkSource from_source(data_chunk_source source):
+        cdef DataChunkSource datasource = DataChunkSource.__new__(DataChunkSource)
+        datasource.c_data_chunk_source = source
+        return datasource
+
+
+cpdef DataChunkSource make_source(str data):
+    """
+    Creates a data source capable of producing device-buffered views
+    of the given string.
+
+    Parameters
+    ----------
+    data : str
+        The host data to be exposed as a data chunk source.
+
+    Returns
+    -------
+    DataChunkSource
+        The data chunk source for the provided host data.
+    """
+    cdef data_chunk_source c_source
+    cdef string c_data = data.encode()
+
+    with nogil:
+        c_source = cpp_text.make_source(c_data)
+
+    return DataChunkSource.from_source(c_source)
+
+
+cpdef DataChunkSource make_source_from_file(str filename):
+    """
+    Creates a data source capable of producing device-buffered views of the file.
+
+    Parameters
+    ----------
+    filename : str
+        The filename of the file to be exposed as a data chunk source.
+
+    Returns
+    -------
+    DataChunkSource
+        The data chunk source for the provided filename.
+    """
+    cdef data_chunk_source c_source
+    cdef string c_filename = filename.encode()
+
+    with nogil:
+        c_source = cpp_text.make_source_from_file(c_filename)
+
+    return DataChunkSource.from_source(c_source)
+
+cpdef DataChunkSource make_source_from_bgzip_file(
+    str filename,
+    int virtual_begin=None,
+    int virtual_end=None,
+):
+    """
+    Creates a data source capable of producing device-buffered views of
+    a BGZIP compressed file with virtual record offsets.
+
+    Parameters
+    ----------
+    filename : str
+        The filename of the BGZIP-compressed file to be exposed as a data chunk source.
+
+    virtual_begin : int, default None
+        The virtual (Tabix) offset of the first byte to be read. Its upper 48 bits
+        describe the offset into the compressed file, its lower 16 bits describe the
+        block-local offset.
+
+    virtual_end : int, default None
+        The data chunk source for the provided filename.
+
+    Returns
+    -------
+    DataChunkSource
+        The data chunk source for the provided filename.
+    """
+    cdef data_chunk_source c_source
+    cdef string c_filename = filename.encode()
+
+    if virtual_begin is None and virtual_end is None:
+        with nogil:
+            c_source = cpp_text.make_source_from_bgzip_file(c_filename)
+    elif virtual_begin is not None and virtual_end is not None:
+        cdef uint64_t c_virtual_begin = virtual_begin
+        cdef uint64_t c_virtual_end = c_virtual_end
+        with nogil:
+            c_source = cpp_text.make_source_from_bgzip_file(
+                c_filename,
+                virtual_begin,
+                c_virtual_end
+            )
+    else:
+        raise ValueError(
+            "virtual_begin and virtual_end must both be None or both be int"
+        )
+    return DataChunkSource.from_source(c_source)
+
+cpdef Column multibyte_split(
+    DataChunkSource source,
+    str delimiter,
+    ParseOptions options=None
+):
+    """
+    Splits the source text into a strings column using a multiple byte delimiter.
+
+    For details, see :cpp:func:`cudf::io::text::multibyte_split`
+
+    Parameters
+    ----------
+    source : DataChunkSource
+        The data chunk source to split.
+
+    delimiter : str
+        UTF-8 encoded string for which to find offsets in the source.
+
+    options : ParseOptions
+        The parsing options to use (including byte range).
+
+    Returns
+    -------
+    Column
+        The strings found by splitting the source by the delimiter
+        within the relevant byte range.
+    """
+    cdef unique_ptr[column] c_result
+    cdef data_chunk_source c_source = source.c_data_chunk_source
+    cdef string c_delimiter = delimiter.encode()
+
+    if options is None:
+        options = ParseOptions()
+
+    cdef cpp_text.parse_options c_options = options.c_options
+
+    with nogil:
+        c_result = cpp_text.multibyte_split(
+            c_source
+            c_delimiter,
+            c_options
+        )
+
+    return Column.from_libcudf(move(c_result))

From 74240f30ee3b9a99c0949f228aa1c05a9b2d80f9 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 31 Oct 2024 17:31:34 -0700
Subject: [PATCH 2/6] Fix implementation

---
 python/cudf/cudf/_lib/text.pyx                |  2 +
 python/pylibcudf/pylibcudf/io/text.pxd        | 10 +++--
 python/pylibcudf/pylibcudf/io/text.pyx        | 43 ++++++++++---------
 .../pylibcudf/pylibcudf/tests/io/test_text.py | 31 +++++++++++++
 4 files changed, 62 insertions(+), 24 deletions(-)
 create mode 100644 python/pylibcudf/pylibcudf/tests/io/test_text.py

diff --git a/python/cudf/cudf/_lib/text.pyx b/python/cudf/cudf/_lib/text.pyx
index c1204030124..7942d067c2b 100644
--- a/python/cudf/cudf/_lib/text.pyx
+++ b/python/cudf/cudf/_lib/text.pyx
@@ -1,5 +1,7 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
+from libcpp cimport bool
+
 from io import TextIOBase
 
 import pylibcudf as plc
diff --git a/python/pylibcudf/pylibcudf/io/text.pxd b/python/pylibcudf/pylibcudf/io/text.pxd
index 051a8ae61aa..10538f48f68 100644
--- a/python/pylibcudf/pylibcudf/io/text.pxd
+++ b/python/pylibcudf/pylibcudf/io/text.pxd
@@ -1,19 +1,21 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
+from libcpp.memory cimport unique_ptr
 from pylibcudf.column cimport Column
-from pylibcudf.libcudf.io.text cimport parse_options,
+from pylibcudf.libcudf.io.text cimport parse_options, data_chunk_source
 
 cdef class ParseOptions:
     cdef parse_options c_options
 
 cdef class DataChunkSource:
-    cdef data_chunk_source c_data_chunk_source
+    cdef unique_ptr[data_chunk_source] c_data_chunk_source
 
-    cdef DataChunkSource from_source(data_chunk_source source)
+    @staticmethod
+    cdef DataChunkSource from_source(unique_ptr[data_chunk_source] source)
 
 
 cpdef Column multibyte_split(
-    source,
+    DataChunkSource source,
     str delimiter,
     ParseOptions options=*
 )
diff --git a/python/pylibcudf/pylibcudf/io/text.pyx b/python/pylibcudf/pylibcudf/io/text.pyx
index 6e71536a09f..4c01c681437 100644
--- a/python/pylibcudf/pylibcudf/io/text.pyx
+++ b/python/pylibcudf/pylibcudf/io/text.pyx
@@ -1,5 +1,6 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
+from cython.operator cimport dereference
 from libc.stdint cimport uint64_t
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
@@ -34,13 +35,13 @@ cdef class DataChunkSource:
     def __init__(self):
         raise ValueError(
             "This class cannot be instantiated directly. "
-            "Use one of the make_source functions instead"
+            "Use one of the make_source functions instead."
         )
 
     @staticmethod
-    cdef DataChunkSource from_source(data_chunk_source source):
+    cdef DataChunkSource from_source(unique_ptr[data_chunk_source] source):
         cdef DataChunkSource datasource = DataChunkSource.__new__(DataChunkSource)
-        datasource.c_data_chunk_source = source
+        datasource.c_data_chunk_source = move(source)
         return datasource
 
 
@@ -59,13 +60,13 @@ cpdef DataChunkSource make_source(str data):
     DataChunkSource
         The data chunk source for the provided host data.
     """
-    cdef data_chunk_source c_source
+    cdef unique_ptr[data_chunk_source] c_source
     cdef string c_data = data.encode()
 
     with nogil:
         c_source = cpp_text.make_source(c_data)
 
-    return DataChunkSource.from_source(c_source)
+    return DataChunkSource.from_source(move(c_source))
 
 
 cpdef DataChunkSource make_source_from_file(str filename):
@@ -82,18 +83,18 @@ cpdef DataChunkSource make_source_from_file(str filename):
     DataChunkSource
         The data chunk source for the provided filename.
     """
-    cdef data_chunk_source c_source
+    cdef unique_ptr[data_chunk_source] c_source
     cdef string c_filename = filename.encode()
 
     with nogil:
         c_source = cpp_text.make_source_from_file(c_filename)
 
-    return DataChunkSource.from_source(c_source)
+    return DataChunkSource.from_source(move(c_source))
 
 cpdef DataChunkSource make_source_from_bgzip_file(
     str filename,
-    int virtual_begin=None,
-    int virtual_end=None,
+    int virtual_begin=-1,
+    int virtual_end=-1,
 ):
     """
     Creates a data source capable of producing device-buffered views of
@@ -104,39 +105,41 @@ cpdef DataChunkSource make_source_from_bgzip_file(
     filename : str
         The filename of the BGZIP-compressed file to be exposed as a data chunk source.
 
-    virtual_begin : int, default None
+    virtual_begin : int
         The virtual (Tabix) offset of the first byte to be read. Its upper 48 bits
         describe the offset into the compressed file, its lower 16 bits describe the
         block-local offset.
 
     virtual_end : int, default None
-        The data chunk source for the provided filename.
+        The virtual (Tabix) offset one past the last byte to be read.
 
     Returns
     -------
     DataChunkSource
         The data chunk source for the provided filename.
     """
-    cdef data_chunk_source c_source
+    cdef unique_ptr[data_chunk_source] c_source
     cdef string c_filename = filename.encode()
+    cdef uint64_t c_virtual_begin
+    cdef uint64_t c_virtual_end
 
-    if virtual_begin is None and virtual_end is None:
+    if virtual_begin == -1 and virtual_end == -1:
         with nogil:
             c_source = cpp_text.make_source_from_bgzip_file(c_filename)
-    elif virtual_begin is not None and virtual_end is not None:
-        cdef uint64_t c_virtual_begin = virtual_begin
-        cdef uint64_t c_virtual_end = c_virtual_end
+    elif virtual_begin != -1 and virtual_end != -1:
+        c_virtual_begin = virtual_begin
+        c_virtual_end = virtual_end
         with nogil:
             c_source = cpp_text.make_source_from_bgzip_file(
                 c_filename,
-                virtual_begin,
+                c_virtual_begin,
                 c_virtual_end
             )
     else:
         raise ValueError(
             "virtual_begin and virtual_end must both be None or both be int"
         )
-    return DataChunkSource.from_source(c_source)
+    return DataChunkSource.from_source(move(c_source))
 
 cpdef Column multibyte_split(
     DataChunkSource source,
@@ -166,7 +169,7 @@ cpdef Column multibyte_split(
         within the relevant byte range.
     """
     cdef unique_ptr[column] c_result
-    cdef data_chunk_source c_source = source.c_data_chunk_source
+    cdef unique_ptr[data_chunk_source] c_source = move(source.c_data_chunk_source)
     cdef string c_delimiter = delimiter.encode()
 
     if options is None:
@@ -176,7 +179,7 @@ cpdef Column multibyte_split(
 
     with nogil:
         c_result = cpp_text.multibyte_split(
-            c_source
+            dereference(c_source),
             c_delimiter,
             c_options
         )
diff --git a/python/pylibcudf/pylibcudf/tests/io/test_text.py b/python/pylibcudf/pylibcudf/tests/io/test_text.py
new file mode 100644
index 00000000000..b809b8fb9d6
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/io/test_text.py
@@ -0,0 +1,31 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+
+import pyarrow as pa
+import pytest
+from utils import assert_column_eq
+
+import pylibcudf as plc
+
+
+@pytest.mark.parametrize(
+    "source_func",
+    [
+        "make_source",
+        "make_source_from_file",
+    ],
+)
+@pytest.mark.parametrize("options", [None, plc.io.text.ParseOptions()])
+def test_multibyte_split(source_func, options, tmp_path):
+    data = "x::y::z"
+    func = getattr(plc.io.text, source_func)
+    if source_func == "make_source":
+        source = func(data)
+    elif source_func == "make_source_from_file":
+        fle = tmp_path / "fle.txt"
+        fle.write_text(data)
+        source = func(str(fle))
+    result = plc.io.text.multibyte_split(source, "::", options)
+    expected = pa.array(["x::", "y::", "z"])
+    breakpoint()
+    assert_column_eq(result, expected)

From abdbb0ad725982a7083684f68c510d779a0f4894 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 1 Nov 2024 10:31:40 -0700
Subject: [PATCH 3/6] Remove breakpoint

---
 python/pylibcudf/pylibcudf/tests/io/test_text.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/python/pylibcudf/pylibcudf/tests/io/test_text.py b/python/pylibcudf/pylibcudf/tests/io/test_text.py
index b809b8fb9d6..f69e940e34e 100644
--- a/python/pylibcudf/pylibcudf/tests/io/test_text.py
+++ b/python/pylibcudf/pylibcudf/tests/io/test_text.py
@@ -1,6 +1,5 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-
 import pyarrow as pa
 import pytest
 from utils import assert_column_eq
@@ -27,5 +26,4 @@ def test_multibyte_split(source_func, options, tmp_path):
         source = func(str(fle))
     result = plc.io.text.multibyte_split(source, "::", options)
     expected = pa.array(["x::", "y::", "z"])
-    breakpoint()
     assert_column_eq(result, expected)

From 39a447553a99d9acdcdb407a1a78b170b6255bdb Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 4 Nov 2024 10:39:01 -0800
Subject: [PATCH 4/6] document parameters in ParseOptions

---
 python/pylibcudf/pylibcudf/io/text.pyx | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/python/pylibcudf/pylibcudf/io/text.pyx b/python/pylibcudf/pylibcudf/io/text.pyx
index 4c01c681437..96baed0956c 100644
--- a/python/pylibcudf/pylibcudf/io/text.pyx
+++ b/python/pylibcudf/pylibcudf/io/text.pyx
@@ -11,7 +11,19 @@ from pylibcudf.libcudf.column.column cimport column
 from pylibcudf.libcudf.io cimport text as cpp_text
 
 cdef class ParseOptions:
-    """Parsing options for `multibyte_split`"""
+    """
+    Parsing options for `multibyte_split`
+
+    Parameters
+    ----------
+    byte_range : list | tuple, default None
+        Only rows starting inside this byte range will be
+        part of the output column.
+
+    strip_delimiters : bool, default False
+        Whether delimiters at the end of rows should
+        be stripped from the output column.
+    """
     def __init__(
         self,
         *,

From b06d64e71b1a9704bd039ac550e1d9b4cf544529 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 4 Nov 2024 14:06:51 -0800
Subject: [PATCH 5/6] store reference of bytes data on DataChunkSource

---
 python/pylibcudf/pylibcudf/io/text.pxd |  7 ++--
 python/pylibcudf/pylibcudf/io/text.pyx | 57 +++++++++++---------------
 2 files changed, 28 insertions(+), 36 deletions(-)

diff --git a/python/pylibcudf/pylibcudf/io/text.pxd b/python/pylibcudf/pylibcudf/io/text.pxd
index 10538f48f68..051e9bc0cde 100644
--- a/python/pylibcudf/pylibcudf/io/text.pxd
+++ b/python/pylibcudf/pylibcudf/io/text.pxd
@@ -1,6 +1,7 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
+from libcpp.string cimport string
 from pylibcudf.column cimport Column
 from pylibcudf.libcudf.io.text cimport parse_options, data_chunk_source
 
@@ -8,10 +9,8 @@ cdef class ParseOptions:
     cdef parse_options c_options
 
 cdef class DataChunkSource:
-    cdef unique_ptr[data_chunk_source] c_data_chunk_source
-
-    @staticmethod
-    cdef DataChunkSource from_source(unique_ptr[data_chunk_source] source)
+    cdef unique_ptr[data_chunk_source] c_source
+    cdef string data_ref
 
 
 cpdef Column multibyte_split(
diff --git a/python/pylibcudf/pylibcudf/io/text.pyx b/python/pylibcudf/pylibcudf/io/text.pyx
index 96baed0956c..f969075cdf9 100644
--- a/python/pylibcudf/pylibcudf/io/text.pyx
+++ b/python/pylibcudf/pylibcudf/io/text.pyx
@@ -42,19 +42,17 @@ cdef class ParseOptions:
 
 
 cdef class DataChunkSource:
-    """Data source for `multibyte_split`"""
+    """
+    Data source for `multibyte_split`
 
-    def __init__(self):
-        raise ValueError(
-            "This class cannot be instantiated directly. "
-            "Use one of the make_source functions instead."
-        )
+    Parameters
+    ----------
+    data : str
+        Filename or data itself.
+    """
 
-    @staticmethod
-    cdef DataChunkSource from_source(unique_ptr[data_chunk_source] source):
-        cdef DataChunkSource datasource = DataChunkSource.__new__(DataChunkSource)
-        datasource.c_data_chunk_source = move(source)
-        return datasource
+    def __cinit__(self, str data):
+        self.data_ref = data.encode()
 
 
 cpdef DataChunkSource make_source(str data):
@@ -72,13 +70,10 @@ cpdef DataChunkSource make_source(str data):
     DataChunkSource
         The data chunk source for the provided host data.
     """
-    cdef unique_ptr[data_chunk_source] c_source
-    cdef string c_data = data.encode()
-
+    cdef DataChunkSource dcs = DataChunkSource(data)
     with nogil:
-        c_source = cpp_text.make_source(c_data)
-
-    return DataChunkSource.from_source(move(c_source))
+        dcs.c_source = move(cpp_text.make_source(dcs.data_ref))
+    return dcs
 
 
 cpdef DataChunkSource make_source_from_file(str filename):
@@ -95,13 +90,10 @@ cpdef DataChunkSource make_source_from_file(str filename):
     DataChunkSource
         The data chunk source for the provided filename.
     """
-    cdef unique_ptr[data_chunk_source] c_source
-    cdef string c_filename = filename.encode()
-
+    cdef DataChunkSource dcs = DataChunkSource(filename)
     with nogil:
-        c_source = cpp_text.make_source_from_file(c_filename)
-
-    return DataChunkSource.from_source(move(c_source))
+        dcs.c_source = move(cpp_text.make_source_from_file(dcs.data_ref))
+    return dcs
 
 cpdef DataChunkSource make_source_from_bgzip_file(
     str filename,
@@ -130,28 +122,29 @@ cpdef DataChunkSource make_source_from_bgzip_file(
     DataChunkSource
         The data chunk source for the provided filename.
     """
-    cdef unique_ptr[data_chunk_source] c_source
-    cdef string c_filename = filename.encode()
     cdef uint64_t c_virtual_begin
     cdef uint64_t c_virtual_end
+    cdef DataChunkSource dcs = DataChunkSource(filename)
 
     if virtual_begin == -1 and virtual_end == -1:
         with nogil:
-            c_source = cpp_text.make_source_from_bgzip_file(c_filename)
+            dcs.c_source = move(cpp_text.make_source_from_bgzip_file(dcs.data_ref))
     elif virtual_begin != -1 and virtual_end != -1:
         c_virtual_begin = virtual_begin
         c_virtual_end = virtual_end
         with nogil:
-            c_source = cpp_text.make_source_from_bgzip_file(
-                c_filename,
-                c_virtual_begin,
-                c_virtual_end
+            dcs.c_source = move(
+                cpp_text.make_source_from_bgzip_file(
+                    dcs.data_ref,
+                    c_virtual_begin,
+                    c_virtual_end,
+                )
             )
     else:
         raise ValueError(
             "virtual_begin and virtual_end must both be None or both be int"
         )
-    return DataChunkSource.from_source(move(c_source))
+    return dcs
 
 cpdef Column multibyte_split(
     DataChunkSource source,
@@ -181,7 +174,7 @@ cpdef Column multibyte_split(
         within the relevant byte range.
     """
     cdef unique_ptr[column] c_result
-    cdef unique_ptr[data_chunk_source] c_source = move(source.c_data_chunk_source)
+    cdef unique_ptr[data_chunk_source] c_source = move(source.c_source)
     cdef string c_delimiter = delimiter.encode()
 
     if options is None:

From 169604a4fb4b5f8e8a840f839ddd01caea9f7f36 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 4 Nov 2024 14:10:08 -0800
Subject: [PATCH 6/6] add comment about lifetime

---
 python/pylibcudf/pylibcudf/io/text.pyx | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/pylibcudf/pylibcudf/io/text.pyx b/python/pylibcudf/pylibcudf/io/text.pyx
index f969075cdf9..667a054baaa 100644
--- a/python/pylibcudf/pylibcudf/io/text.pyx
+++ b/python/pylibcudf/pylibcudf/io/text.pyx
@@ -52,6 +52,7 @@ cdef class DataChunkSource:
     """
 
     def __cinit__(self, str data):
+        # Keep the encoded bytes alive for the source built by make_source
         self.data_ref = data.encode()