Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/branch-23.12' into fix/recover…
Browse files Browse the repository at this point in the history
…ing-json-lines-incomplete-lines
  • Loading branch information
elstehle committed Oct 11, 2023
2 parents bfb5397 + b17904d commit 47cb227
Show file tree
Hide file tree
Showing 73 changed files with 3,025 additions and 1,344 deletions.
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -401,6 +401,7 @@ add_library(
src/io/parquet/predicate_pushdown.cpp
src/io/parquet/reader.cpp
src/io/parquet/reader_impl.cpp
src/io/parquet/reader_impl_chunking.cu
src/io/parquet/reader_impl_helpers.cpp
src/io/parquet/reader_impl_preprocess.cu
src/io/parquet/writer_impl.cu
Expand Down
24 changes: 19 additions & 5 deletions cpp/benchmarks/copying/shift.cu
Original file line number Diff line number Diff line change
Expand Up @@ -56,18 +56,32 @@ static void BM_shift(benchmark::State& state)
cudf::size_type size = state.range(0);
cudf::size_type offset = size * (static_cast<double>(shift_factor) / 100.0);

auto const input_table =
create_sequence_table({cudf::type_to_id<int>()},
row_count{size},
use_validity ? std::optional<double>{1.0} : std::nullopt);
auto constexpr column_type_id = cudf::type_id::INT32;
using column_type = cudf::id_to_type<column_type_id>;

auto const input_table = create_sequence_table(
{column_type_id}, row_count{size}, use_validity ? std::optional<double>{1.0} : std::nullopt);
cudf::column_view input{input_table->get_column(0)};

auto fill = use_validity ? make_scalar<int>() : make_scalar<int>(777);
auto fill = use_validity ? make_scalar<column_type>() : make_scalar<column_type>(777);

for (auto _ : state) {
cuda_event_timer raii(state, true);
auto output = cudf::shift(input, offset, *fill);
}

auto const elems_read = (size - offset);
auto const bytes_read = elems_read * sizeof(column_type);

// If 'use_validity' is false, the fill value is a number, and the entire column
// (excluding the null bitmask) needs to be written. On the other hand, if 'use_validity'
// is true, only the elements that can be shifted are written, along with the full null bitmask.
auto const elems_written = use_validity ? (size - offset) : size;
auto const bytes_written = elems_written * sizeof(column_type);
auto const null_bytes = use_validity ? 2 * cudf::bitmask_allocation_size_bytes(size) : 0;

state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) *
(bytes_written + bytes_read + null_bytes));
}

class Shift : public cudf::benchmark {};
Expand Down
17 changes: 15 additions & 2 deletions cpp/benchmarks/transpose/transpose.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,17 +20,19 @@
#include <cudf/column/column_factories.hpp>
#include <cudf/table/table.hpp>
#include <cudf/transpose.hpp>
#include <cudf/types.hpp>

#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>

static void BM_transpose(benchmark::State& state)
{
auto count = state.range(0);
auto count = state.range(0);
constexpr auto column_type_id = cudf::type_id::INT32;
auto int_column_generator =
thrust::make_transform_iterator(thrust::counting_iterator(0), [count](int i) {
return cudf::make_numeric_column(
cudf::data_type{cudf::type_id::INT32}, count, cudf::mask_state::ALL_VALID);
cudf::data_type{column_type_id}, count, cudf::mask_state::ALL_VALID);
});

auto input_table = cudf::table(std::vector(int_column_generator, int_column_generator + count));
Expand All @@ -40,6 +42,17 @@ static void BM_transpose(benchmark::State& state)
cuda_event_timer raii(state, true);
auto output = cudf::transpose(input);
}

// Collect memory statistics.
auto const bytes_read = static_cast<uint64_t>(input.num_columns()) * input.num_rows() *
sizeof(cudf::id_to_type<column_type_id>);
auto const bytes_written = bytes_read;
// Account for nullability in input and output.
auto const null_bytes = 2 * static_cast<uint64_t>(input.num_columns()) *
cudf::bitmask_allocation_size_bytes(input.num_rows());

state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) *
(bytes_read + bytes_written + null_bytes));
}

class Transpose : public cudf::benchmark {};
Expand Down
8 changes: 4 additions & 4 deletions cpp/include/cudf/io/detail/parquet.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ class parquet_reader_options;
class parquet_writer_options;
class chunked_parquet_writer_options;

namespace detail::parquet {
namespace parquet::detail {

/**
* @brief Class to read Parquet dataset data into columns.
Expand Down Expand Up @@ -186,7 +186,7 @@ class writer {
*/
explicit writer(std::vector<std::unique_ptr<data_sink>> sinks,
parquet_writer_options const& options,
single_write_mode mode,
cudf::io::detail::single_write_mode mode,
rmm::cuda_stream_view stream);

/**
Expand All @@ -201,7 +201,7 @@ class writer {
*/
explicit writer(std::vector<std::unique_ptr<data_sink>> sinks,
chunked_parquet_writer_options const& options,
single_write_mode mode,
cudf::io::detail::single_write_mode mode,
rmm::cuda_stream_view stream);

/**
Expand Down Expand Up @@ -250,5 +250,5 @@ class writer {
* metadata.
*/
parquet_metadata read_parquet_metadata(host_span<std::unique_ptr<datasource> const> sources);
} // namespace detail::parquet
} // namespace parquet::detail
} // namespace cudf::io
4 changes: 2 additions & 2 deletions cpp/include/cudf/io/parquet.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -499,7 +499,7 @@ class chunked_parquet_reader {
[[nodiscard]] table_with_metadata read_chunk() const;

private:
std::unique_ptr<cudf::io::detail::parquet::chunked_reader> reader;
std::unique_ptr<cudf::io::parquet::detail::chunked_reader> reader;
};

/** @} */ // end of group
Expand Down Expand Up @@ -1750,7 +1750,7 @@ class parquet_chunked_writer {
std::vector<std::string> const& column_chunks_file_paths = {});

/// Unique pointer to impl writer class
std::unique_ptr<cudf::io::detail::parquet::writer> writer;
std::unique_ptr<parquet::detail::writer> writer;
};

/** @} */ // end of group
Expand Down
4 changes: 4 additions & 0 deletions cpp/include/cudf/lists/combine.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,13 +57,15 @@ enum class concatenate_null_policy { IGNORE, NULLIFY_OUTPUT_ROW };
* @param input Table of lists to be concatenated.
* @param null_policy The parameter to specify whether a null list element will be ignored from
* concatenation, or any concatenation involving a null element will result in a null list.
* @param stream CUDA stream used for device memory operations and kernel launches.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return A new column in which each row is a list resulting from concatenating all list elements in
* the corresponding row of the input table.
*/
std::unique_ptr<column> concatenate_rows(
table_view const& input,
concatenate_null_policy null_policy = concatenate_null_policy::IGNORE,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
Expand All @@ -86,13 +88,15 @@ std::unique_ptr<column> concatenate_rows(
* @param input The lists column containing lists of list elements to concatenate.
* @param null_policy The parameter to specify whether a null list element will be ignored from
* concatenation, or any concatenation involving a null element will result in a null list.
* @param stream CUDA stream used for device memory operations and kernel launches.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return A new column in which each row is a list resulting from concatenating all list elements in
* the corresponding row of the input lists column.
*/
std::unique_ptr<column> concatenate_list_elements(
column_view const& input,
concatenate_null_policy null_policy = concatenate_null_policy::IGNORE,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */ // end of group
Expand Down
14 changes: 12 additions & 2 deletions cpp/include/cudf/lists/contains.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,12 +42,14 @@ namespace lists {
*
* @param lists Lists column whose `n` rows are to be searched
* @param search_key The scalar key to be looked up in each list row
* @param stream CUDA stream used for device memory operations and kernel launches.
* @param mr Device memory resource used to allocate the returned column's device memory
* @return BOOL8 column of `n` rows with the result of the lookup
*/
std::unique_ptr<column> contains(
cudf::lists_column_view const& lists,
cudf::scalar const& search_key,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
Expand All @@ -63,13 +65,15 @@ std::unique_ptr<column> contains(
* 2. The list row `lists[i]` is null
*
* @param lists Lists column whose `n` rows are to be searched
* @param search_keys Column of elements to be looked up in each list row
* @param search_keys Column of elements to be looked up in each list row.
* @param stream CUDA stream used for device memory operations and kernel launches.
* @param mr Device memory resource used to allocate the returned column's device memory
* @return BOOL8 column of `n` rows with the result of the lookup
*/
std::unique_ptr<column> contains(
cudf::lists_column_view const& lists,
cudf::column_view const& search_keys,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
Expand All @@ -84,12 +88,14 @@ std::unique_ptr<column> contains(
* A row with an empty list will always return false.
* Nulls inside non-null nested elements (such as lists or structs) are not considered.
*
* @param lists Lists column whose `n` rows are to be searched
* @param lists Lists column whose `n` rows are to be searched.
* @param stream CUDA stream used for device memory operations and kernel launches.
* @param mr Device memory resource used to allocate the returned column's device memory
* @return BOOL8 column of `n` rows with the result of the lookup
*/
std::unique_ptr<column> contains_nulls(
cudf::lists_column_view const& lists,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
Expand Down Expand Up @@ -125,13 +131,15 @@ enum class duplicate_find_option : int32_t {
* @param search_key The scalar key to be looked up in each list row
* @param find_option Whether to return the position of the first match (`FIND_FIRST`) or
* last (`FIND_LAST`)
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return column of `n` rows with the location of the `search_key`
*/
std::unique_ptr<column> index_of(
cudf::lists_column_view const& lists,
cudf::scalar const& search_key,
duplicate_find_option find_option = duplicate_find_option::FIND_FIRST,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
Expand Down Expand Up @@ -160,13 +168,15 @@ std::unique_ptr<column> index_of(
* `lists`
* @param find_option Whether to return the position of the first match (`FIND_FIRST`) or
* last (`FIND_LAST`)
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return column of `n` rows with the location of the `search_key`
*/
std::unique_ptr<column> index_of(
cudf::lists_column_view const& lists,
cudf::column_view const& search_keys,
duplicate_find_option find_option = duplicate_find_option::FIND_FIRST,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */ // end of group
Expand Down
2 changes: 2 additions & 0 deletions cpp/include/cudf/lists/count_elements.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,11 +45,13 @@ namespace lists {
* in the output column.
*
* @param input Input lists column
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New column with the number of elements for each row
*/
std::unique_ptr<column> count_elements(
lists_column_view const& input,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */ // end of lists_elements group
Expand Down
14 changes: 9 additions & 5 deletions cpp/include/cudf/strings/padding.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -51,6 +51,7 @@ namespace strings {
* Default is pad right (left justify)
* @param fill_char Single UTF-8 character to use for padding;
* Default is the space character
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New column with padded strings
*/
Expand All @@ -59,6 +60,7 @@ std::unique_ptr<column> pad(
size_type width,
side_type side = side_type::RIGHT,
std::string_view fill_char = " ",
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
Expand All @@ -79,14 +81,16 @@ std::unique_ptr<column> pad(
* r is now ['001234','-09876','+00.34','-342567', '0002+2']
* @endcode
*
* @param input Strings instance for this operation.
* @param width The minimum number of characters for each string.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New column of strings.
* @param input Strings instance for this operation
* @param width The minimum number of characters for each string
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New column of strings
*/
std::unique_ptr<column> zfill(
strings_column_view const& input,
size_type width,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */ // end of doxygen group
Expand Down
30 changes: 17 additions & 13 deletions cpp/include/cudf/strings/slice.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,18 +50,20 @@ namespace strings {
* r2 is now ["lo","ob"]
* @endcode
*
* @param strings Strings column for this operation.
* @param start First character position to begin the substring.
* @param stop Last character position (exclusive) to end the substring.
* @param step Distance between input characters retrieved.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New strings column with sorted elements of this instance.
* @param input Strings column for this operation
* @param start First character position to begin the substring
* @param stop Last character position (exclusive) to end the substring
* @param step Distance between input characters retrieved
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New strings column containing the extracted substrings
*/
std::unique_ptr<column> slice_strings(
strings_column_view const& strings,
strings_column_view const& input,
numeric_scalar<size_type> const& start = numeric_scalar<size_type>(0, false),
numeric_scalar<size_type> const& stop = numeric_scalar<size_type>(0, false),
numeric_scalar<size_type> const& step = numeric_scalar<size_type>(1),
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
Expand Down Expand Up @@ -95,16 +97,18 @@ std::unique_ptr<column> slice_strings(
* @throw cudf::logic_error if starts and stops are not same integer type.
* @throw cudf::logic_error if starts or stops contains nulls.
*
* @param strings Strings column for this operation.
* @param starts First character positions to begin the substring.
* @param stops Last character (exclusive) positions to end the substring.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New strings column with sorted elements of this instance.
* @param input Strings column for this operation
* @param starts First character positions to begin the substring
* @param stops Last character (exclusive) positions to end the substring
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New strings column containing the extracted substrings
*/
std::unique_ptr<column> slice_strings(
strings_column_view const& strings,
strings_column_view const& input,
column_view const& starts,
column_view const& stops,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */ // end of doxygen group
Expand Down
4 changes: 3 additions & 1 deletion cpp/include/cudf/strings/strip.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -57,13 +57,15 @@ namespace strings {
* string; Default is both
* @param to_strip UTF-8 encoded characters to strip from each string;
* Default is empty string which indicates strip whitespace characters
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New strings column.
*/
std::unique_ptr<column> strip(
strings_column_view const& input,
side_type side = side_type::BOTH,
string_scalar const& to_strip = string_scalar(""),
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */ // end of doxygen group
Expand Down
Loading

0 comments on commit 47cb227

Please sign in to comment.