Skip to content

Commit

Permalink
Merge branch 'branch-24.08' of github.com:rapidsai/cudf into pylibcud…
Browse files Browse the repository at this point in the history
…f-io-json
  • Loading branch information
lithomas1 committed Jul 2, 2024
2 parents e1683a4 + 64325a1 commit 1751238
Show file tree
Hide file tree
Showing 49 changed files with 819 additions and 207 deletions.
13 changes: 7 additions & 6 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -136,11 +136,6 @@ repos:
.*test.*|
^CHANGELOG.md$
)
- repo: https://github.com/rapidsai/dependency-file-generator
rev: v1.13.11
hooks:
- id: rapids-dependency-file-generator
args: ["--clean"]
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.4.8
hooks:
Expand All @@ -149,7 +144,7 @@ repos:
- id: ruff-format
files: python/.*$
- repo: https://github.com/rapidsai/pre-commit-hooks
rev: v0.0.3
rev: v0.2.0
hooks:
- id: verify-copyright
exclude: |
Expand All @@ -158,6 +153,12 @@ repos:
cpp/src/io/parquet/ipc/Message_generated[.]h$|
cpp/src/io/parquet/ipc/Schema_generated[.]h$
)
- id: verify-alpha-spec
- repo: https://github.com/rapidsai/dependency-file-generator
rev: v1.13.11
hooks:
- id: rapids-dependency-file-generator
args: ["--clean"]

default_language_version:
python: python3
4 changes: 1 addition & 3 deletions ci/test_cudf_polars.sh
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,8 @@ rapids-logger "Install cudf wheel"
# echo to expand wildcard before adding `[extra]` requires for pip
python -m pip install $(echo ./dist/cudf*.whl)[test]

rapids-logger "Install polars (allow pre-release versions)"
python -m pip install 'polars>=1.0.0a0'

rapids-logger "Install cudf_polars"
python -m pip install 'polars>=1.0'
python -m pip install --no-deps python/cudf_polars

rapids-logger "Run cudf_polars tests"
Expand Down
5 changes: 2 additions & 3 deletions conda/environments/all_cuda-118_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ dependencies:
- cupy>=12.0.0
- cxx-compiler
- cython>=3.0.3
- dask-cuda==24.8.*
- dask-cuda==24.8.*,>=0.0.0a0
- dlpack>=0.8,<1.0
- doxygen=1.9.1
Expand All @@ -44,10 +43,10 @@ dependencies:
- libcufile=1.4.0.31
- libcurand-dev=10.3.0.86
- libcurand=10.3.0.86
- libkvikio==24.8.*
- libkvikio==24.8.*,>=0.0.0a0
- libparquet==16.1.0.*
- librdkafka>=1.9.0,<1.10.0a0
- librmm==24.8.*
- librmm==24.8.*,>=0.0.0a0
- make
- moto>=4.0.8
- msgpack-python
Expand Down
7 changes: 3 additions & 4 deletions conda/environments/all_cuda-122_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ dependencies:
- cupy>=12.0.0
- cxx-compiler
- cython>=3.0.3
- dask-cuda==24.8.*
- dask-cuda==24.8.*,>=0.0.0a0
- dlpack>=0.8,<1.0
- doxygen=1.9.1
Expand All @@ -43,10 +42,10 @@ dependencies:
- libarrow==16.1.0.*
- libcufile-dev
- libcurand-dev
- libkvikio==24.8.*
- libkvikio==24.8.*,>=0.0.0a0
- libparquet==16.1.0.*
- librdkafka>=1.9.0,<1.10.0a0
- librmm==24.8.*
- librmm==24.8.*,>=0.0.0a0
- make
- moto>=4.0.8
- msgpack-python
Expand All @@ -66,7 +65,7 @@ dependencies:
- pre-commit
- pyarrow==16.1.0.*
- pydata-sphinx-theme!=0.14.2
- pynvjitlink
- pynvjitlink>=0.0.0a0
- pytest-benchmark
- pytest-cases>=3.8.2
- pytest-cov
Expand Down
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -365,6 +365,7 @@ add_library(
src/interop/to_arrow_device.cu
src/interop/from_arrow_device.cu
src/interop/from_arrow_host.cu
src/interop/from_arrow_stream.cu
src/interop/to_arrow_schema.cpp
src/interop/detail/arrow_allocator.cpp
src/io/avro/avro.cpp
Expand Down
38 changes: 30 additions & 8 deletions cpp/include/cudf/interop.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ struct ArrowSchema;

struct ArrowArray;

struct ArrowArrayStream;

namespace cudf {
/**
* @addtogroup interop_dlpack
Expand Down Expand Up @@ -367,10 +369,11 @@ std::unique_ptr<cudf::scalar> from_arrow(
* @param mr Device memory resource used to allocate `cudf::table`
* @return cudf table generated from given arrow data
*/
std::unique_ptr<cudf::table> from_arrow(ArrowSchema const* schema,
ArrowArray const* input,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr);
std::unique_ptr<cudf::table> from_arrow(
ArrowSchema const* schema,
ArrowArray const* input,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Create `cudf::column` from a given ArrowArray and ArrowSchema input
Expand All @@ -385,10 +388,11 @@ std::unique_ptr<cudf::table> from_arrow(ArrowSchema const* schema,
* @param mr Device memory resource used to allocate `cudf::column`
* @return cudf column generated from given arrow data
*/
std::unique_ptr<cudf::column> from_arrow_column(ArrowSchema const* schema,
ArrowArray const* input,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr);
std::unique_ptr<cudf::column> from_arrow_column(
ArrowSchema const* schema,
ArrowArray const* input,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Create `cudf::table` from given ArrowDeviceArray input
Expand All @@ -414,6 +418,24 @@ std::unique_ptr<table> from_arrow_host(
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Create `cudf::table` from given ArrowArrayStream input
*
* @throws std::invalid_argument if input is NULL
*
* The conversion WILL release the input ArrowArrayStream and its constituent
* arrays or schema since Arrow streams are not suitable for multiple reads.
*
* @param input `ArrowArrayStream` pointer to object that will produce ArrowArray data
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to perform CUDA allocation
* @return cudf table generated from the given Arrow data
*/
std::unique_ptr<table> from_arrow_stream(
ArrowArrayStream* input,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Create `cudf::column` from given ArrowDeviceArray input
*
Expand Down
15 changes: 11 additions & 4 deletions cpp/include/cudf/io/text/byte_range_info.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION.
* Copyright (c) 2022-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -24,17 +24,22 @@
namespace cudf {
namespace io {
namespace text {
/**
* @addtogroup io_readers
* @{
* @file
*/

/**
* @brief stores offset and size used to indicate a byte range
*/
class byte_range_info {
private:
int64_t _offset; ///< offset in bytes
int64_t _size; ///< size in bytes
int64_t _offset{}; ///< offset in bytes
int64_t _size{}; ///< size in bytes

public:
constexpr byte_range_info() noexcept : _offset(0), _size(0) {}
constexpr byte_range_info() = default;
/**
* @brief Constructs a byte_range_info object
*
Expand Down Expand Up @@ -104,6 +109,8 @@ std::vector<byte_range_info> create_byte_range_infos_consecutive(int64_t total_b
*/
byte_range_info create_byte_range_info_max();

/** @} */ // end of group

} // namespace text
} // namespace io
} // namespace cudf
10 changes: 9 additions & 1 deletion cpp/include/cudf/io/text/data_chunk_source.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2022, NVIDIA CORPORATION.
* Copyright (c) 2021-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -25,6 +25,12 @@ namespace cudf {
namespace io {
namespace text {

/**
* @addtogroup io_readers
* @{
* @file
*/

/**
* @brief A contract guaranteeing stream-ordered memory access to the underlying device data.
*
Expand Down Expand Up @@ -110,6 +116,8 @@ class data_chunk_source {
[[nodiscard]] virtual std::unique_ptr<data_chunk_reader> create_reader() const = 0;
};

/** @} */ // end of group

} // namespace text
} // namespace io
} // namespace cudf
27 changes: 23 additions & 4 deletions cpp/include/cudf/io/text/multibyte_split.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,11 @@
namespace cudf {
namespace io {
namespace text {
/**
* @addtogroup io_readers
* @{
* @file
*/

/**
* @brief Parsing options for multibyte_split.
Expand Down Expand Up @@ -79,6 +84,7 @@ struct parse_options {
* @param source The source string
* @param delimiter UTF-8 encoded string for which to find offsets in the source
* @param options the parsing options to use (including byte range)
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Memory resource to use for the device memory allocation
* @return The strings found by splitting the source by the delimiter within the relevant byte
* range.
Expand All @@ -87,17 +93,30 @@ std::unique_ptr<cudf::column> multibyte_split(
data_chunk_source const& source,
std::string const& delimiter,
parse_options options = {},
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());

std::unique_ptr<cudf::column> multibyte_split(
/**
* @brief Splits the source text into a strings column using a multiple byte delimiter.
*
* @deprecated Since 24.08. Use the overload taking `parse_options` (which includes the byte range) instead.
*
* @param source The source input data encoded in UTF-8
* @param delimiter UTF-8 encoded string for which to find offsets in the source
* @param byte_range The position and size within `source` to produce the column from
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Memory resource to use for the device memory allocation
* @return The strings found by splitting the source by the delimiter within the relevant byte
* range.
*/
[[deprecated]] std::unique_ptr<cudf::column> multibyte_split(
data_chunk_source const& source,
std::string const& delimiter,
std::optional<byte_range_info> byte_range,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());

std::unique_ptr<cudf::column> multibyte_split(data_chunk_source const& source,
std::string const& delimiter,
rmm::device_async_resource_ref mr);
/** @} */ // end of group

} // namespace text
} // namespace io
Expand Down
12 changes: 6 additions & 6 deletions cpp/include/cudf/table/experimental/row_operators.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,7 @@ using optional_dremel_view = thrust::optional<detail::dremel_device_view const>;
*
* @tparam has_nested_columns compile-time optimization for primitive types.
* This template parameter is to be used by the developer by querying
* `cudf::detail::has_nested_columns(input)`. `true` compiles operator
* `cudf::has_nested_columns(input)`. `true` compiles operator
* overloads for nested types, while `false` only compiles operator
* overloads for primitive types.
* @tparam Nullate A cudf::nullate type describing whether to check for nulls.
Expand Down Expand Up @@ -1014,7 +1014,7 @@ class self_comparator {
*
* @tparam has_nested_columns compile-time optimization for primitive types.
* This template parameter is to be used by the developer by querying
* `cudf::detail::has_nested_columns(input)`. `true` compiles operator
* `cudf::has_nested_columns(input)`. `true` compiles operator
* overloads for nested types, while `false` only compiles operator
* overloads for primitive types.
* @tparam Nullate A cudf::nullate type describing whether to check for nulls.
Expand Down Expand Up @@ -1186,7 +1186,7 @@ class two_table_comparator {
*
* @tparam has_nested_columns compile-time optimization for primitive types.
* This template parameter is to be used by the developer by querying
* `cudf::detail::has_nested_columns(input)`. `true` compiles operator
* `cudf::has_nested_columns(input)`. `true` compiles operator
* overloads for nested types, while `false` only compiles operator
* overloads for primitive types.
* @tparam Nullate A cudf::nullate type describing whether to check for nulls.
Expand Down Expand Up @@ -1326,7 +1326,7 @@ struct nan_equal_physical_equality_comparator {
*
* @tparam has_nested_columns compile-time optimization for primitive types.
* This template parameter is to be used by the developer by querying
* `cudf::detail::has_nested_columns(input)`. `true` compiles operator
* `cudf::has_nested_columns(input)`. `true` compiles operator
* overloads for nested types, while `false` only compiles operator
* overloads for primitive types.
* @tparam Nullate A cudf::nullate type describing whether to check for nulls.
Expand Down Expand Up @@ -1643,7 +1643,7 @@ class self_comparator {
*
* @tparam has_nested_columns compile-time optimization for primitive types.
* This template parameter is to be used by the developer by querying
* `cudf::detail::has_nested_columns(input)`. `true` compiles operator
* `cudf::has_nested_columns(input)`. `true` compiles operator
* overloads for nested types, while `false` only compiles operator
* overloads for primitive types.
* @tparam Nullate A cudf::nullate type describing whether to check for nulls.
Expand Down Expand Up @@ -1757,7 +1757,7 @@ class two_table_comparator {
*
* @tparam has_nested_columns compile-time optimization for primitive types.
* This template parameter is to be used by the developer by querying
* `cudf::detail::has_nested_columns(input)`. `true` compiles operator
* `cudf::has_nested_columns(input)`. `true` compiles operator
* overloads for nested types, while `false` only compiles operator
* overloads for primitive types.
* @tparam Nullate A cudf::nullate type describing whether to check for nulls.
Expand Down
Loading

0 comments on commit 1751238

Please sign in to comment.