diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx index 0b0bbdb2589..c706351a683 100644 --- a/python/cudf/cudf/_lib/csv.pyx +++ b/python/cudf/cudf/_lib/csv.pyx @@ -8,7 +8,7 @@ from libcpp.utility cimport move from libcpp.vector cimport vector cimport cudf._lib.pylibcudf.libcudf.types as libcudf_types -from cudf._lib.io.datasource cimport Datasource, NativeFileDatasource +from cudf._lib.pylibcudf.io.datasource cimport Datasource, NativeFileDatasource from cudf._lib.pylibcudf.libcudf.types cimport data_type from cudf._lib.types cimport dtype_to_data_type diff --git a/python/cudf/cudf/_lib/io/CMakeLists.txt b/python/cudf/cudf/_lib/io/CMakeLists.txt index 2408fa1c12f..620229a1275 100644 --- a/python/cudf/cudf/_lib/io/CMakeLists.txt +++ b/python/cudf/cudf/_lib/io/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -12,7 +12,7 @@ # the License. # ============================================================================= -set(cython_sources datasource.pyx utils.pyx) +set(cython_sources utils.pyx) set(linked_libraries cudf::cudf) rapids_cython_create_modules( CXX diff --git a/python/cudf/cudf/_lib/io/utils.pyx b/python/cudf/cudf/_lib/io/utils.pyx index 3c14ec46122..1d7c56888d9 100644 --- a/python/cudf/cudf/_lib/io/utils.pyx +++ b/python/cudf/cudf/_lib/io/utils.pyx @@ -8,7 +8,7 @@ from libcpp.utility cimport move from libcpp.vector cimport vector from cudf._lib.column cimport Column -from cudf._lib.io.datasource cimport Datasource +from cudf._lib.pylibcudf.io.datasource cimport Datasource from cudf._lib.pylibcudf.libcudf.io.data_sink cimport data_sink from cudf._lib.pylibcudf.libcudf.io.datasource cimport datasource from cudf._lib.pylibcudf.libcudf.io.types cimport ( diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx index d3e6053ef4b..9609e3131b4 100644 --- a/python/cudf/cudf/_lib/orc.pyx +++ b/python/cudf/cudf/_lib/orc.pyx @@ -23,12 +23,12 @@ except ImportError: cimport cudf._lib.pylibcudf.libcudf.io.types as cudf_io_types from cudf._lib.column cimport Column -from cudf._lib.io.datasource cimport NativeFileDatasource from cudf._lib.io.utils cimport ( make_sink_info, make_source_info, update_column_struct_field_names, ) +from cudf._lib.pylibcudf.io.datasource cimport NativeFileDatasource from cudf._lib.pylibcudf.libcudf.io.data_sink cimport data_sink from cudf._lib.pylibcudf.libcudf.io.orc cimport ( chunked_orc_writer_options, diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index f6f9cfa9a7c..7914ed7e9d9 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -37,12 +37,12 @@ cimport cudf._lib.pylibcudf.libcudf.io.types as cudf_io_types cimport cudf._lib.pylibcudf.libcudf.types as cudf_types from cudf._lib.column cimport Column from cudf._lib.expressions cimport Expression -from cudf._lib.io.datasource cimport NativeFileDatasource from cudf._lib.io.utils cimport ( make_sinks_info, make_source_info, update_struct_field_names, ) +from cudf._lib.pylibcudf.io.datasource cimport NativeFileDatasource from cudf._lib.pylibcudf.libcudf.expressions cimport expression from cudf._lib.pylibcudf.libcudf.io.parquet cimport ( chunked_parquet_reader as cpp_chunked_parquet_reader, diff --git a/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt index 2cfec101bab..32f0f5543e4 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt @@ -12,7 +12,7 @@ # the License. # ============================================================================= -set(cython_sources avro.pyx types.pyx) +set(cython_sources avro.pyx datasource.pyx types.pyx) set(linked_libraries cudf::cudf) rapids_cython_create_modules( @@ -21,5 +21,5 @@ rapids_cython_create_modules( LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_io_ ASSOCIATED_TARGETS cudf ) -set(targets_using_arrow_headers pylibcudf_io_avro pylibcudf_io_types) +set(targets_using_arrow_headers pylibcudf_io_avro pylibcudf_io_datasource pylibcudf_io_types) link_to_pyarrow_headers("${targets_using_arrow_headers}") diff --git a/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd index 250292746c1..cfd6d2cd281 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd @@ -1,4 +1,4 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . cimport avro, types +from . cimport avro, datasource, types from .types cimport SourceInfo, TableWithMetadata diff --git a/python/cudf/cudf/_lib/pylibcudf/io/__init__.py b/python/cudf/cudf/_lib/pylibcudf/io/__init__.py index 5242c741911..a54ba1834dc 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/io/__init__.py @@ -1,4 +1,4 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . import avro, types +from . import avro, datasource, types from .types import SourceInfo, TableWithMetadata diff --git a/python/cudf/cudf/_lib/io/datasource.pxd b/python/cudf/cudf/_lib/pylibcudf/io/datasource.pxd similarity index 100% rename from python/cudf/cudf/_lib/io/datasource.pxd rename to python/cudf/cudf/_lib/pylibcudf/io/datasource.pxd diff --git a/python/cudf/cudf/_lib/io/datasource.pyx b/python/cudf/cudf/_lib/pylibcudf/io/datasource.pyx similarity index 100% rename from python/cudf/cudf/_lib/io/datasource.pyx rename to python/cudf/cudf/_lib/pylibcudf/io/datasource.pyx diff --git a/python/cudf/cudf/_lib/pylibcudf/io/types.pyx b/python/cudf/cudf/_lib/pylibcudf/io/types.pyx index cd777232b33..ab3375da662 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/types.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/io/types.pyx @@ -4,6 +4,8 @@ from libcpp.string cimport string from libcpp.utility cimport move from libcpp.vector cimport vector +from cudf._lib.pylibcudf.io.datasource cimport Datasource +from cudf._lib.pylibcudf.libcudf.io.datasource cimport datasource from cudf._lib.pylibcudf.libcudf.io.types cimport ( host_buffer, source_info, @@ -56,9 +58,8 @@ cdef class SourceInfo: Parameters ---------- - sources : List[Union[str, os.PathLike, bytes, io.BytesIO]] - A homogeneous list of sources (this can be a string filename, - an os.PathLike, bytes, or an io.BytesIO) to read from. + sources : List[Union[str, os.PathLike, bytes, io.BytesIO, DataSource]] + A homogeneous list of sources to read from. Mixing different types of sources will raise a `ValueError`. """ @@ -68,6 +69,7 @@ cdef class SourceInfo: raise ValueError("Need to pass at least one source") cdef vector[string] c_files + cdef vector[datasource*] c_datasources if isinstance(sources[0], (os.PathLike, str)): c_files.reserve(len(sources)) @@ -84,6 +86,13 @@ cdef class SourceInfo: self.c_obj = move(source_info(c_files)) return + elif isinstance(sources[0], Datasource): + for csrc in sources: + if not isinstance(csrc, Datasource): + raise ValueError("All sources must be of the same type!") + c_datasources.push_back((csrc).get_datasource()) + self.c_obj = move(source_info(c_datasources)) + return # TODO: host_buffer is deprecated API, use host_span instead cdef vector[host_buffer] c_host_buffers @@ -106,5 +115,11 @@ cdef class SourceInfo: c_buffer = bio.getbuffer() # check if empty? c_host_buffers.push_back(host_buffer(&c_buffer[0], c_buffer.shape[0])) + else: + raise ValueError("Sources must be a list of str/paths, " + "bytes, io.BytesIO, or a Datasource") + + if empty_buffer is True: + c_host_buffers.push_back(host_buffer(NULL, 0)) - self.c_obj = source_info(c_host_buffers) + self.c_obj = move(source_info(c_host_buffers)) diff --git a/python/cudf/cudf/pylibcudf_tests/test_source_info.py b/python/cudf/cudf/pylibcudf_tests/test_source_info.py index 71a3ecbcc30..019321b7259 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_source_info.py +++ b/python/cudf/cudf/pylibcudf_tests/test_source_info.py @@ -2,13 +2,21 @@ import io +import pyarrow as pa import pytest import cudf._lib.pylibcudf as plc +from cudf._lib.pylibcudf.io.datasource import NativeFileDatasource @pytest.mark.parametrize( - "source", ["a.txt", b"hello world", io.BytesIO(b"hello world")] + "source", + [ + "a.txt", + b"hello world", + io.BytesIO(b"hello world"), + NativeFileDatasource(pa.PythonFile(io.BytesIO(), mode="r")), + ], ) def test_source_info_ctor(source, tmp_path): if isinstance(source, str): @@ -28,6 +36,10 @@ def test_source_info_ctor(source, tmp_path): ["a.txt", "a.txt"], [b"hello world", b"hello there"], [io.BytesIO(b"hello world"), io.BytesIO(b"hello there")], + [ + NativeFileDatasource(pa.PythonFile(io.BytesIO(), mode="r")), + NativeFileDatasource(pa.PythonFile(io.BytesIO(), mode="r")), + ], ], ) def test_source_info_ctor_multiple(sources, tmp_path): @@ -54,6 +66,11 @@ def test_source_info_ctor_multiple(sources, tmp_path): io.BytesIO(b"hello there"), b"hello world", ], + [ + NativeFileDatasource(pa.PythonFile(io.BytesIO(), mode="r")), + "awef.txt", + b"hello world", + ], ], ) def test_source_info_ctor_mixing_invalid(sources, tmp_path): @@ -67,3 +84,8 @@ def test_source_info_ctor_mixing_invalid(sources, tmp_path): sources[i] = str(file) with pytest.raises(ValueError): plc.io.SourceInfo(sources) + + +def test_source_info_invalid(): + with pytest.raises(ValueError): + plc.io.SourceInfo([123]) diff --git a/python/cudf_kafka/cudf_kafka/_lib/kafka.pxd b/python/cudf_kafka/cudf_kafka/_lib/kafka.pxd index 84a3a32646d..2de0bf39785 100644 --- a/python/cudf_kafka/cudf_kafka/_lib/kafka.pxd +++ b/python/cudf_kafka/cudf_kafka/_lib/kafka.pxd @@ -7,7 +7,7 @@ from libcpp.memory cimport unique_ptr from libcpp.string cimport string from libcpp.vector cimport vector -from cudf._lib.io.datasource cimport Datasource +from cudf._lib.pylibcudf.io.datasource cimport Datasource from cudf._lib.pylibcudf.libcudf.io.datasource cimport datasource