From 3b734ec2fd591f037fe1d8f8ce424c7049cb5a3e Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Thu, 6 Jun 2024 04:41:01 -0700
Subject: [PATCH] Start migrating I/O to pylibcudf (#15899)

xref #15162

Starts migrating cudf I/O cython to use pylibcudf APIs, starting with avro.

Authors:
  - Thomas Li (https://github.com/lithomas1)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/15899
---
 .../user_guide/api_docs/pylibcudf/index.rst   |   1 +
 .../user_guide/api_docs/pylibcudf/io/avro.rst |   6 +
 .../api_docs/pylibcudf/io/index.rst           |  18 +++
 python/cudf/cudf/_lib/avro.pyx                |  50 ++-----
 python/cudf/cudf/_lib/csv.pyx                 |   8 +-
 python/cudf/cudf/_lib/parquet.pyx             |   2 +-
 .../cudf/cudf/_lib/pylibcudf/CMakeLists.txt   |   1 +
 .../cudf/_lib/pylibcudf/io/CMakeLists.txt     |  25 ++++
 .../cudf/cudf/_lib/pylibcudf/io/__init__.pxd  |   4 +
 .../cudf/cudf/_lib/pylibcudf/io/__init__.py   |   4 +
 python/cudf/cudf/_lib/pylibcudf/io/avro.pxd   |  12 ++
 python/cudf/cudf/_lib/pylibcudf/io/avro.pyx   |  58 +++++++++
 python/cudf/cudf/_lib/pylibcudf/io/types.pxd  |  29 +++++
 python/cudf/cudf/_lib/pylibcudf/io/types.pyx  | 110 ++++++++++++++++
 .../cudf/_lib/pylibcudf/libcudf/io/orc.pxd    |   6 +-
 .../cudf/_lib/pylibcudf/libcudf/io/types.pxd  |  58 ++++-----
 python/cudf/cudf/_lib/utils.pxd               |   1 +
 python/cudf/cudf/_lib/utils.pyx               |  11 ++
 .../cudf/cudf/pylibcudf_tests/common/utils.py |  17 +++
 python/cudf/cudf/pylibcudf_tests/test_avro.py | 123 ++++++++++++++++++
 .../cudf/pylibcudf_tests/test_source_info.py  |  69 ++++++++++
 21 files changed, 541 insertions(+), 72 deletions(-)
 create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/io/avro.rst
 create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/io/__init__.py
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/io/avro.pxd
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/io/avro.pyx
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/io/types.pxd
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/io/types.pyx
 create mode 100644 python/cudf/cudf/pylibcudf_tests/test_avro.py
 create mode 100644 python/cudf/cudf/pylibcudf_tests/test_source_info.py

diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
index b6ad1157511..870ed8856d1 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
@@ -17,6 +17,7 @@ This page provides API documentation for pylibcudf.
     filling
     gpumemoryview
     groupby
+    io/index.rst
     join
     lists
     merge
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/avro.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/avro.rst
new file mode 100644
index 00000000000..495bd505fdc
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/avro.rst
@@ -0,0 +1,6 @@
+====
+Avro
+====
+
+.. automodule:: cudf._lib.pylibcudf.io.avro
+   :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst
new file mode 100644
index 00000000000..0d53ac92db9
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst
@@ -0,0 +1,18 @@
+===
+I/O
+===
+
+I/O Utility Classes
+===================
+
+.. automodule:: cudf._lib.pylibcudf.io.types
+   :members:
+
+
+I/O Functions
+=============
+
+.. toctree::
+    :maxdepth: 1
+
+    avro
diff --git a/python/cudf/cudf/_lib/avro.pyx b/python/cudf/cudf/_lib/avro.pyx
index ae17a5f1ab6..3c132b22880 100644
--- a/python/cudf/cudf/_lib/avro.pyx
+++ b/python/cudf/cudf/_lib/avro.pyx
@@ -1,20 +1,12 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
-from libcpp.string cimport string
-from libcpp.utility cimport move
-from libcpp.vector cimport vector
+from cudf._lib.utils cimport data_from_pylibcudf_io
 
-from cudf._lib.io.utils cimport make_source_info
-from cudf._lib.pylibcudf.libcudf.io.avro cimport (
-    avro_reader_options,
-    read_avro as libcudf_read_avro,
-)
-from cudf._lib.pylibcudf.libcudf.io.types cimport table_with_metadata
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
-from cudf._lib.utils cimport data_from_unique_ptr
+import cudf._lib.pylibcudf as plc
+from cudf._lib.pylibcudf.io.types import SourceInfo
 
 
-cpdef read_avro(datasource, columns=None, skip_rows=-1, num_rows=-1):
+cpdef read_avro(datasource, columns=None, skip_rows=0, num_rows=-1):
     """
     Cython function to call libcudf read_avro, see `read_avro`.
 
@@ -28,28 +20,14 @@ cpdef read_avro(datasource, columns=None, skip_rows=-1, num_rows=-1):
 
     if not isinstance(num_rows, int) or num_rows < -1:
         raise TypeError("num_rows must be an int >= -1")
-    if not isinstance(skip_rows, int) or skip_rows < -1:
-        raise TypeError("skip_rows must be an int >= -1")
-
-    cdef vector[string] c_columns
-    if columns is not None and len(columns) > 0:
-        c_columns.reserve(len(columns))
-        for col in columns:
-            c_columns.push_back(str(col).encode())
-
-    cdef avro_reader_options options = move(
-        avro_reader_options.builder(make_source_info([datasource]))
-        .columns(c_columns)
-        .skip_rows(<size_type> skip_rows)
-        .num_rows(<size_type> num_rows)
-        .build()
+    if not isinstance(skip_rows, int) or skip_rows < 0:
+        raise TypeError("skip_rows must be an int >= 0")
+
+    return data_from_pylibcudf_io(
+        plc.io.avro.read_avro(
+            SourceInfo([datasource]),
+            columns,
+            skip_rows,
+            num_rows
+        )
     )
-
-    cdef table_with_metadata c_result
-
-    with nogil:
-        c_result = move(libcudf_read_avro(options))
-
-    names = [info.name.decode() for info in c_result.metadata.schema_info]
-
-    return data_from_unique_ptr(move(c_result.tbl), column_names=names)
diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx
index aa771295607..0b0bbdb2589 100644
--- a/python/cudf/cudf/_lib/csv.pyx
+++ b/python/cudf/cudf/_lib/csv.pyx
@@ -151,14 +151,14 @@ cdef csv_reader_options make_csv_reader_options(
         )
 
     if quoting == 1:
-        c_quoting = quote_style.QUOTE_ALL
+        c_quoting = quote_style.ALL
     elif quoting == 2:
-        c_quoting = quote_style.QUOTE_NONNUMERIC
+        c_quoting = quote_style.NONNUMERIC
     elif quoting == 3:
-        c_quoting = quote_style.QUOTE_NONE
+        c_quoting = quote_style.NONE
     else:
         # Default value
-        c_quoting = quote_style.QUOTE_MINIMAL
+        c_quoting = quote_style.MINIMAL
 
     cdef csv_reader_options csv_reader_options_c = move(
         csv_reader_options.builder(c_source_info)
diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx
index f0eef9be124..ac592cedaac 100644
--- a/python/cudf/cudf/_lib/parquet.pyx
+++ b/python/cudf/cudf/_lib/parquet.pyx
@@ -491,7 +491,7 @@ def write_parquet(
             "Valid values are '1.0' and '2.0'"
         )
 
-    dict_policy = (
+    cdef cudf_io_types.dictionary_policy dict_policy = (
         cudf_io_types.dictionary_policy.ADAPTIVE
         if use_dictionary
         else cudf_io_types.dictionary_policy.NEVER
diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
index 7d0676f6def..6beb7b0f506 100644
--- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
@@ -50,3 +50,4 @@ link_to_pyarrow_headers(pylibcudf_interop)
 
 add_subdirectory(libcudf)
 add_subdirectory(strings)
+add_subdirectory(io)
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt
new file mode 100644
index 00000000000..2cfec101bab
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt
@@ -0,0 +1,25 @@
+# =============================================================================
+# Copyright (c) 2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied. See the License for the specific language governing permissions and limitations under
+# the License.
+# =============================================================================
+
+set(cython_sources avro.pyx types.pyx)
+
+set(linked_libraries cudf::cudf)
+rapids_cython_create_modules(
+  CXX
+  SOURCE_FILES "${cython_sources}"
+  LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_io_ ASSOCIATED_TARGETS cudf
+)
+
+set(targets_using_arrow_headers pylibcudf_io_avro pylibcudf_io_types)
+link_to_pyarrow_headers("${targets_using_arrow_headers}")
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd
new file mode 100644
index 00000000000..250292746c1
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd
@@ -0,0 +1,4 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from . cimport avro, types
+from .types cimport SourceInfo, TableWithMetadata
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/__init__.py b/python/cudf/cudf/_lib/pylibcudf/io/__init__.py
new file mode 100644
index 00000000000..5242c741911
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/io/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from . import avro, types
+from .types import SourceInfo, TableWithMetadata
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/avro.pxd b/python/cudf/cudf/_lib/pylibcudf/io/avro.pxd
new file mode 100644
index 00000000000..3695f36a6e7
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/io/avro.pxd
@@ -0,0 +1,12 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+from cudf._lib.pylibcudf.io.types cimport SourceInfo, TableWithMetadata
+from cudf._lib.pylibcudf.libcudf.io.avro cimport avro_reader_options
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
+
+
+cpdef TableWithMetadata read_avro(
+    SourceInfo source_info,
+    list columns = *,
+    size_type skip_rows = *,
+    size_type num_rows = *
+)
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx b/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx
new file mode 100644
index 00000000000..946e0896fc8
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx
@@ -0,0 +1,58 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp.string cimport string
+from libcpp.utility cimport move
+from libcpp.vector cimport vector
+
+from cudf._lib.pylibcudf.io.types cimport SourceInfo, TableWithMetadata
+from cudf._lib.pylibcudf.libcudf.io.avro cimport (
+    avro_reader_options,
+    read_avro as cpp_read_avro,
+)
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
+
+
+cpdef TableWithMetadata read_avro(
+    SourceInfo source_info,
+    list columns = None,
+    size_type skip_rows = 0,
+    size_type num_rows = -1
+):
+    """
+    Reads an Avro dataset into a set of columns.
+
+    Parameters
+    ----------
+    source_info: SourceInfo
+        The SourceInfo object to read the avro dataset from.
+    columns: list, default None
+        Optional columns to read, if not provided, reads all columns in the file.
+    skip_rows: size_type, default 0
+        The number of rows to skip.
+    num_rows: size_type, default -1
+        The number of rows to read, after skipping rows.
+        If -1 is passed, all rows will be read.
+
+    Returns
+    -------
+    TableWithMetadata
+        The Table and its corresponding metadata that was read in.
+    """
+    cdef vector[string] c_columns
+    if columns is not None and len(columns) > 0:
+        c_columns.reserve(len(columns))
+        for col in columns:
+            c_columns.push_back(str(col).encode())
+
+    cdef avro_reader_options avro_opts = move(
+        avro_reader_options.builder(source_info.c_obj)
+        .columns(c_columns)
+        .skip_rows(skip_rows)
+        .num_rows(num_rows)
+        .build()
+    )
+
+    with nogil:
+        c_result = move(cpp_read_avro(avro_opts))
+
+    return TableWithMetadata.from_libcudf(c_result)
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/types.pxd b/python/cudf/cudf/_lib/pylibcudf/io/types.pxd
new file mode 100644
index 00000000000..aa846a47343
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/io/types.pxd
@@ -0,0 +1,29 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+from cudf._lib.pylibcudf.libcudf.io.types cimport (
+    column_encoding,
+    column_in_metadata,
+    column_name_info,
+    compression_type,
+    dictionary_policy,
+    io_type,
+    partition_info,
+    quote_style,
+    sink_info,
+    source_info,
+    statistics_freq,
+    table_input_metadata,
+    table_metadata,
+    table_with_metadata,
+)
+from cudf._lib.pylibcudf.table cimport Table
+
+
+cdef class TableWithMetadata:
+    cdef public Table tbl
+    cdef table_metadata metadata
+
+    @staticmethod
+    cdef TableWithMetadata from_libcudf(table_with_metadata& tbl)
+
+cdef class SourceInfo:
+    cdef source_info c_obj
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/types.pyx b/python/cudf/cudf/_lib/pylibcudf/io/types.pyx
new file mode 100644
index 00000000000..cd777232b33
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/io/types.pyx
@@ -0,0 +1,110 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp.string cimport string
+from libcpp.utility cimport move
+from libcpp.vector cimport vector
+
+from cudf._lib.pylibcudf.libcudf.io.types cimport (
+    host_buffer,
+    source_info,
+    table_with_metadata,
+)
+
+import errno
+import io
+import os
+
+
+cdef class TableWithMetadata:
+    """A container holding a table and its associated metadata
+    (e.g. column names)
+
+    For details, see :cpp:class:`cudf::io::table_with_metadata`.
+    """
+
+    @property
+    def columns(self):
+        """
+        Return a list containing the columns of the table
+        """
+        return self.tbl.columns()
+
+    @property
+    def column_names(self):
+        """
+        Return a list containing the column names of the table
+        """
+        cdef list names = []
+        for col_info in self.metadata.schema_info:
+            # TODO: Handle nesting (columns with child columns)
+            assert col_info.children.size() == 0, "Child column names are not handled!"
+            names.append(col_info.name.decode())
+        return names
+
+    @staticmethod
+    cdef TableWithMetadata from_libcudf(table_with_metadata& tbl_with_meta):
+        """Create a Python TableWithMetadata from a libcudf table_with_metadata"""
+        cdef TableWithMetadata out = TableWithMetadata.__new__(TableWithMetadata)
+        out.tbl = Table.from_libcudf(move(tbl_with_meta.tbl))
+        out.metadata = tbl_with_meta.metadata
+        return out
+
+cdef class SourceInfo:
+    """A class containing details on a source to read from.
+
+    For details, see :cpp:class:`cudf::io::source_info`.
+
+    Parameters
+    ----------
+    sources : List[Union[str, os.PathLike, bytes, io.BytesIO]]
+        A homogeneous list of sources (this can be a string filename,
+        an os.PathLike, bytes, or an io.BytesIO) to read from.
+
+        Mixing different types of sources, or passing an unsupported
+        type, will raise a `ValueError`.
+    """
+
+    def __init__(self, list sources):
+        if not sources:
+            raise ValueError("Need to pass at least one source")
+
+        cdef vector[string] c_files
+        if isinstance(sources[0], (os.PathLike, str)):
+            c_files.reserve(len(sources))
+            for src in sources:
+                if not isinstance(src, (os.PathLike, str)):
+                    raise ValueError("All sources must be of the same type!")
+                if not os.path.isfile(src):
+                    raise FileNotFoundError(errno.ENOENT,
+                                            os.strerror(errno.ENOENT),
+                                            src)
+                c_files.push_back(<string> str(src).encode())
+
+            self.c_obj = move(source_info(c_files))
+            return
+
+        # NOTE(review): host_buffer holds only a pointer and size, and `sources` is
+        # not retained here - presumably callers must keep them alive; TODO confirm
+        # TODO: host_buffer is deprecated API, use host_span instead
+        cdef vector[host_buffer] c_host_buffers
+        cdef const unsigned char[::1] c_buffer
+        if isinstance(sources[0], bytes):
+            for buffer in sources:
+                if not isinstance(buffer, bytes):
+                    raise ValueError("All sources must be of the same type!")
+                # Empty bytes are skipped: there is no first byte to point at
+                if (len(buffer) > 0):
+                    c_buffer = buffer
+                    c_host_buffers.push_back(host_buffer(<char*>&c_buffer[0],
+                                                         c_buffer.shape[0]))
+        elif isinstance(sources[0], io.BytesIO):
+            for bio in sources:
+                if not isinstance(bio, io.BytesIO):
+                    raise ValueError("All sources must be of the same type!")
+                c_buffer = bio.getbuffer()  # check if empty?
+                c_host_buffers.push_back(host_buffer(<char*>&c_buffer[0],
+                                                     c_buffer.shape[0]))
+        else:
+            raise ValueError("Sources must be str, os.PathLike, bytes or io.BytesIO")
+
+        self.c_obj = source_info(c_host_buffers)
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/orc.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/orc.pxd
index e553515dfdf..25f91849dea 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/orc.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/orc.pxd
@@ -94,7 +94,9 @@ cdef extern from "cudf/io/orc.hpp" \
         orc_writer_options_builder& compression(
             cudf_io_types.compression_type comp
         ) except +
-        orc_writer_options_builder& enable_statistics(bool val) except +
+        orc_writer_options_builder& enable_statistics(
+            cudf_io_types.statistics_freq val
+        ) except +
         orc_writer_options_builder& stripe_size_bytes(size_t val) except +
         orc_writer_options_builder& stripe_size_rows(size_type val) except +
         orc_writer_options_builder& row_index_stride(size_type val) except +
@@ -147,7 +149,7 @@ cdef extern from "cudf/io/orc.hpp" \
             cudf_io_types.compression_type comp
         ) except +
         chunked_orc_writer_options_builder& enable_statistics(
-            bool val
+            cudf_io_types.statistics_freq val
         ) except +
         orc_writer_options_builder& stripe_size_bytes(size_t val) except +
         orc_writer_options_builder& stripe_size_rows(size_type val) except +
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pxd
index 38fae1df1e5..8d87deb1472 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pxd
@@ -20,45 +20,45 @@ from cudf._lib.pylibcudf.libcudf.types cimport size_type
 cdef extern from "cudf/io/types.hpp" \
         namespace "cudf::io" nogil:
 
-    ctypedef enum quote_style:
-        QUOTE_MINIMAL "cudf::io::quote_style::MINIMAL"
-        QUOTE_ALL "cudf::io::quote_style::ALL"
-        QUOTE_NONNUMERIC "cudf::io::quote_style::NONNUMERIC"
-        QUOTE_NONE "cudf::io::quote_style::NONE"
-
-    ctypedef enum compression_type:
-        NONE "cudf::io::compression_type::NONE"
-        AUTO "cudf::io::compression_type::AUTO"
-        SNAPPY "cudf::io::compression_type::SNAPPY"
-        GZIP "cudf::io::compression_type::GZIP"
-        BZIP2 "cudf::io::compression_type::BZIP2"
-        BROTLI "cudf::io::compression_type::BROTLI"
-        ZIP "cudf::io::compression_type::ZIP"
-        XZ "cudf::io::compression_type::XZ"
-        ZLIB "cudf::io::compression_type::ZLIB"
-        LZ4 "cudf::io::compression_type::LZ4"
-        LZO "cudf::io::compression_type::LZO"
-        ZSTD "cudf::io::compression_type::ZSTD"
-
-    ctypedef enum io_type:
-        FILEPATH "cudf::io::io_type::FILEPATH"
-        HOST_BUFFER "cudf::io::io_type::HOST_BUFFER"
-        VOID "cudf::io::io_type::VOID"
-        USER_IMPLEMENTED "cudf::io::io_type::USER_IMPLEMENTED"
-
-    ctypedef enum statistics_freq:
+    cpdef enum class quote_style(int32_t):
+        MINIMAL
+        ALL
+        NONNUMERIC
+        NONE
+
+    cpdef enum class compression_type(int32_t):
+        NONE
+        AUTO
+        SNAPPY
+        GZIP
+        BZIP2
+        BROTLI
+        ZIP
+        XZ
+        ZLIB
+        LZ4
+        LZO
+        ZSTD
+
+    cpdef enum class io_type(int32_t):
+        FILEPATH
+        HOST_BUFFER
+        VOID
+        USER_IMPLEMENTED
+
+    cpdef enum class statistics_freq(int32_t):
         STATISTICS_NONE = 0,
         STATISTICS_ROWGROUP = 1,
         STATISTICS_PAGE = 2,
         STATISTICS_COLUMN = 3,
 
-    ctypedef enum dictionary_policy:
+    cpdef enum class dictionary_policy(int32_t):
         NEVER = 0,
         ADAPTIVE = 1,
         ALWAYS = 2,
 
     cdef extern from "cudf/io/types.hpp" namespace "cudf::io" nogil:
-        cpdef enum class column_encoding:
+        cpdef enum class column_encoding(int32_t):
             USE_DEFAULT = -1
             DICTIONARY = 0
             PLAIN = 1
diff --git a/python/cudf/cudf/_lib/utils.pxd b/python/cudf/cudf/_lib/utils.pxd
index c5a1e7552b9..99850d549a1 100644
--- a/python/cudf/cudf/_lib/utils.pxd
+++ b/python/cudf/cudf/_lib/utils.pxd
@@ -11,6 +11,7 @@ from cudf._lib.pylibcudf.libcudf.table.table cimport table, table_view
 cdef data_from_unique_ptr(
     unique_ptr[table] c_tbl, column_names, index_names=*)
 cdef data_from_pylibcudf_table(tbl, column_names, index_names=*)
+cdef data_from_pylibcudf_io(tbl_with_meta)
 cdef data_from_table_view(
     table_view tv, object owner, object column_names, object index_names=*)
 cdef table_view table_view_from_columns(columns) except *
diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx
index 4c4cd48d6ed..de6b9f690b6 100644
--- a/python/cudf/cudf/_lib/utils.pyx
+++ b/python/cudf/cudf/_lib/utils.pyx
@@ -315,6 +315,17 @@ cdef data_from_pylibcudf_table(tbl, column_names, index_names=None):
         index_names
     )
 
+cdef data_from_pylibcudf_io(tbl_with_meta):
+    """
+    Unpack a pylibcudf I/O result (a TableWithMetadata) into cuDF format.
+
+    Column names come from the result's metadata; no index names are set.
+    """
+    columns = [Column.from_pylibcudf(c) for c in tbl_with_meta.columns]
+    return _data_from_columns(
+        columns=columns, column_names=tbl_with_meta.column_names, index_names=None
+    )
+
 cdef columns_from_table_view(
     table_view tv,
     object owners,
diff --git a/python/cudf/cudf/pylibcudf_tests/common/utils.py b/python/cudf/cudf/pylibcudf_tests/common/utils.py
index e00053529a8..54d38f1a8cf 100644
--- a/python/cudf/cudf/pylibcudf_tests/common/utils.py
+++ b/python/cudf/cudf/pylibcudf_tests/common/utils.py
@@ -63,6 +63,23 @@ def assert_table_eq(pa_table: pa.Table, plc_table: plc.Table) -> None:
         assert_column_eq(pa_col, plc_col)
 
 
+def assert_table_and_meta_eq(
+    plc_table_w_meta: plc.io.types.TableWithMetadata, pa_table: pa.Table
+) -> None:
+    """Verify that the pylibcudf TableWithMetadata and PyArrow table are equal"""
+
+    plc_table = plc_table_w_meta.tbl
+
+    plc_shape = (plc_table.num_rows(), plc_table.num_columns())
+    assert plc_shape == pa_table.shape
+
+    for plc_col, pa_col in zip(plc_table.columns(), pa_table.columns):
+        assert_column_eq(plc_col, pa_col)
+
+    # Check column name equality
+    assert plc_table_w_meta.column_names == pa_table.column_names
+
+
 def cudf_raises(expected_exception: BaseException, *args, **kwargs):
     # A simple wrapper around pytest.raises that defaults to looking for cudf exceptions
     match = kwargs.get("match", None)
diff --git a/python/cudf/cudf/pylibcudf_tests/test_avro.py b/python/cudf/cudf/pylibcudf_tests/test_avro.py
new file mode 100644
index 00000000000..d6cd86768cd
--- /dev/null
+++ b/python/cudf/cudf/pylibcudf_tests/test_avro.py
@@ -0,0 +1,123 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import io
+import itertools
+
+import fastavro
+import pyarrow as pa
+import pytest
+from utils import assert_table_and_meta_eq
+
+import cudf._lib.pylibcudf as plc
+
+avro_dtype_pairs = [
+    ("boolean", pa.bool_()),
+    ("int", pa.int32()),
+    ("long", pa.int64()),
+    ("float", pa.float32()),
+    ("double", pa.float64()),
+    ("bytes", pa.string()),
+    ("string", pa.string()),
+]
+
+
+@pytest.fixture(
+    scope="module", params=itertools.combinations(avro_dtype_pairs, 2)
+)
+def avro_dtypes(request):
+    return request.param
+
+
+@pytest.fixture
+def avro_dtype_data(avro_dtypes):
+    (avro_type1, _), (avro_type2, _) = avro_dtypes
+
+    def _get_data(avro_type):
+        if avro_type == "boolean":
+            return [True, False, True]
+        elif avro_type in {"int", "long"}:
+            return [1, 2, -1]
+        elif avro_type in {"float", "double"}:
+            return [1.0, 3.1415, -3.1415]
+        elif avro_type == "bytes":
+            return [b"a", b"b", b"c"]
+        elif avro_type == "string":
+            return ["Hello", "World!", ""]
+
+    return _get_data(avro_type1), _get_data(avro_type2)
+
+
+@pytest.fixture(
+    params=[
+        (0, 0),
+        (0, -1),
+        (1, -1),
+        (3, -1),
+    ]
+)
+def row_opts(request):
+    """
+    (skip_rows, num_rows) combos for the avro reader
+    """
+    return request.param
+
+
+@pytest.mark.parametrize("columns", [["prop1"], [], ["prop1", "prop2"]])
+@pytest.mark.parametrize("nullable", [True, False])
+def test_read_avro(avro_dtypes, avro_dtype_data, row_opts, columns, nullable):
+    (avro_type1, expected_type1), (avro_type2, expected_type2) = avro_dtypes
+
+    avro_type1 = avro_type1 if not nullable else ["null", avro_type1]
+    avro_type2 = avro_type2 if not nullable else ["null", avro_type2]
+
+    skip_rows, num_rows = row_opts
+
+    schema = fastavro.parse_schema(
+        {
+            "type": "record",
+            "name": "test",
+            "fields": [
+                {"name": "prop1", "type": avro_type1},
+                {"name": "prop2", "type": avro_type2},
+            ],
+        }
+    )
+
+    if nullable:
+        avro_dtype_data = (
+            avro_dtype_data[0] + [None],
+            avro_dtype_data[1] + [None],
+        )
+
+    records = [
+        {"prop1": val1, "prop2": val2} for val1, val2 in zip(*avro_dtype_data)
+    ]
+
+    buffer = io.BytesIO()
+    fastavro.writer(buffer, schema, records)
+    buffer.seek(0)
+
+    res = plc.io.avro.read_avro(
+        plc.io.types.SourceInfo([buffer]),
+        columns=columns,
+        skip_rows=skip_rows,
+        num_rows=num_rows,
+    )
+
+    expected = pa.Table.from_arrays(
+        [
+            pa.array(avro_dtype_data[0], type=expected_type1),
+            pa.array(avro_dtype_data[1], type=expected_type2),
+        ],
+        names=["prop1", "prop2"],
+    )
+
+    # Adjust for skip_rows/num_rows in result
+    length = num_rows if num_rows != -1 else None
+    expected = expected.slice(skip_rows, length=length)
+
+    # adjust for # of columns
+    if columns != []:
+        expected = expected.select(columns)
+
+    assert_table_and_meta_eq(res, expected)
diff --git a/python/cudf/cudf/pylibcudf_tests/test_source_info.py b/python/cudf/cudf/pylibcudf_tests/test_source_info.py
new file mode 100644
index 00000000000..71a3ecbcc30
--- /dev/null
+++ b/python/cudf/cudf/pylibcudf_tests/test_source_info.py
@@ -0,0 +1,69 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import io
+
+import pytest
+
+import cudf._lib.pylibcudf as plc
+
+
+@pytest.mark.parametrize(
+    "source", ["a.txt", b"hello world", io.BytesIO(b"hello world")]
+)
+def test_source_info_ctor(source, tmp_path):
+    if isinstance(source, str):
+        file = tmp_path / source
+        file.write_bytes("hello world".encode("utf-8"))
+        source = str(file)
+
+    plc.io.SourceInfo([source])
+
+    # TODO: test contents of source_info buffer is correct
+    # once buffers are exposed on python side
+
+
+@pytest.mark.parametrize(
+    "sources",
+    [
+        ["a.txt", "a.txt"],
+        [b"hello world", b"hello there"],
+        [io.BytesIO(b"hello world"), io.BytesIO(b"hello there")],
+    ],
+)
+def test_source_info_ctor_multiple(sources, tmp_path):
+    """Constructing a SourceInfo from several same-typed sources succeeds."""
+    for idx, src in enumerate(sources):
+        if isinstance(src, str):
+            path = tmp_path / src
+            path.write_bytes("hello world".encode("utf-8"))
+            sources[idx] = str(path)
+
+    plc.io.SourceInfo(sources)
+
+    # TODO: test contents of source_info buffer is correct
+    # once buffers are exposed on python side
+
+
+@pytest.mark.parametrize(
+    "sources",
+    [
+        ["awef.txt", b"hello world", io.BytesIO(b"hello world")],
+        [b"hello world", b"hello there", "awef.txt"],
+        [
+            io.BytesIO(b"hello world"),
+            io.BytesIO(b"hello there"),
+            b"hello world",
+        ],
+    ],
+)
+def test_source_info_ctor_mixing_invalid(sources, tmp_path):
+    # As in the previous test, create any file sources on disk, so that
+    # a missing file cannot be what makes the constructor fail:
+    # the ValueError below must come from mixing source types
+    for idx, src in enumerate(sources):
+        if isinstance(src, str):
+            path = tmp_path / src
+            path.write_bytes("hello world".encode("utf-8"))
+            sources[idx] = str(path)
+    with pytest.raises(ValueError):
+        plc.io.SourceInfo(sources)