Skip to content

Commit

Permalink
Merge branch 'branch-24.06' into fea-json_filter_columns
Browse files Browse the repository at this point in the history
  • Loading branch information
karthikeyann authored May 2, 2024
2 parents 8def9db + 500cb29 commit e4fd7b7
Show file tree
Hide file tree
Showing 113 changed files with 1,537 additions and 825 deletions.
53 changes: 0 additions & 53 deletions .github/workflows/jni-docker-build.yml

This file was deleted.

9 changes: 5 additions & 4 deletions cpp/benchmarks/json/json.cu
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@

#include <cudf/column/column_factories.hpp>
#include <cudf/json/json.hpp>
#include <cudf/strings/detail/strings_children.cuh>
#include <cudf/strings/detail/strings_children_ex.cuh>
#include <cudf/strings/detail/utilities.cuh>
#include <cudf/strings/string_view.hpp>
#include <cudf/strings/strings_column_view.hpp>
Expand Down Expand Up @@ -77,8 +77,9 @@ struct json_benchmark_row_builder {
cudf::column_device_view const d_book_pct; // Book percentage
cudf::column_device_view const d_misc_order; // Misc-Store order
cudf::column_device_view const d_store_order; // Books-Bicycles order
int32_t* d_offsets{};
cudf::size_type* d_sizes{};
char* d_chars{};
cudf::detail::input_offsetalator d_offsets;
thrust::minstd_rand rng{5236};
thrust::uniform_int_distribution<int> dist{};

Expand Down Expand Up @@ -155,7 +156,7 @@ struct json_benchmark_row_builder {
output_str += Misc;
}
output_str += brace2;
if (!output_str.ptr) d_offsets[idx] = output_str.bytes;
if (!output_str.ptr) { d_sizes[idx] = output_str.bytes; }
}
};

Expand All @@ -177,7 +178,7 @@ auto build_json_string_column(int desired_bytes, int num_rows)
auto d_store_order = cudf::column_device_view::create(float_2bool_columns->get_column(2));
json_benchmark_row_builder jb{
desired_bytes, num_rows, {*d_books, *d_bicycles}, *d_book_pct, *d_misc_order, *d_store_order};
auto [offsets, chars] = cudf::strings::detail::make_strings_children(
auto [offsets, chars] = cudf::strings::detail::experimental::make_strings_children(
jb, num_rows, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
return cudf::make_strings_column(num_rows, std::move(offsets), chars.release(), 0, {});
}
Expand Down
15 changes: 11 additions & 4 deletions cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -943,13 +943,14 @@ Use the `CUDF_EXPECTS` macro to enforce runtime conditions necessary for correct
Example usage:

```c++
CUDF_EXPECTS(lhs.type() == rhs.type(), "Column type mismatch");
CUDF_EXPECTS(cudf::have_same_types(lhs, rhs), "Type mismatch", cudf::data_type_error);
```
The first argument is the conditional expression expected to resolve to `true` under normal
conditions. If the conditional evaluates to `false`, then an error has occurred and an instance of
`cudf::logic_error` is thrown. The second argument to `CUDF_EXPECTS` is a short description of the
error that has occurred and is used for the exception's `what()` message.
conditions. The second argument to `CUDF_EXPECTS` is a short description of the error that has
occurred and is used for the exception's `what()` message. If the conditional evaluates to
`false`, then an error has occurred and an instance of the exception class in the third argument
(or the default, `cudf::logic_error`) is thrown.
There are times when a particular code path, if reached, should indicate an error no matter what.
For example, often the `default` case of a `switch` statement represents an invalid alternative.
Expand Down Expand Up @@ -1048,6 +1049,12 @@ types such as numeric types and timestamps/durations, adding support for nested
Enabling an algorithm differently for different types uses either template specialization or SFINAE,
as discussed in [Specializing Type-Dispatched Code Paths](#specializing-type-dispatched-code-paths).

## Comparing Data Types

When comparing the data types of two columns or scalars, do not directly compare
`a.type() == b.type()`. Nested types such as lists of structs of integers will not be handled
properly if only the top-level type is compared. Instead, use the `cudf::have_same_types` function.

# Type Dispatcher

libcudf stores data (for columns and scalars) "type erased" in `void*` device memory. This
Expand Down
7 changes: 5 additions & 2 deletions cpp/include/cudf/detail/scatter.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,9 @@
#include <cudf/strings/detail/scatter.cuh>
#include <cudf/strings/string_view.cuh>
#include <cudf/utilities/default_stream.hpp>
#include <cudf/utilities/error.hpp>
#include <cudf/utilities/traits.hpp>
#include <cudf/utilities/type_checks.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/exec_policy.hpp>
Expand Down Expand Up @@ -213,8 +215,9 @@ struct column_scatterer_impl<dictionary32> {
// check the keys match
dictionary_column_view const source(source_in);
dictionary_column_view const target(target_in);
CUDF_EXPECTS(source.keys().type() == target.keys().type(),
"scatter dictionary keys must be the same type");
CUDF_EXPECTS(cudf::have_same_types(source.keys(), target.keys()),
"scatter dictionary keys must be the same type",
cudf::data_type_error);

// first combine keys so both dictionaries have the same set
auto target_matched = dictionary::detail::add_keys(target, source.keys(), stream, mr);
Expand Down
17 changes: 9 additions & 8 deletions cpp/include/cudf/io/detail/json.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

#pragma once

#include <cudf/io/datasource.hpp>
#include <cudf/io/json.hpp>

#include <rmm/cuda_stream_view.hpp>
Expand Down Expand Up @@ -56,22 +57,22 @@ void write_json(data_sink* sink,
/**
* @brief Normalize single quotes to double quotes using FST
*
* @param inbuf Input device buffer
* @param indata Input device buffer
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource to use for device memory allocation
*/
rmm::device_uvector<char> normalize_single_quotes(rmm::device_uvector<char>&& inbuf,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr);
void normalize_single_quotes(datasource::owning_buffer<rmm::device_uvector<char>>& indata,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr);

/**
* @brief Normalize unquoted whitespace (space and tab characters) using FST
*
* @param inbuf Input device buffer
* @param indata Input device buffer
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource to use for device memory allocation
*/
rmm::device_uvector<char> normalize_whitespace(rmm::device_uvector<char>&& inbuf,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr);
void normalize_whitespace(datasource::owning_buffer<rmm::device_uvector<char>>& indata,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr);
} // namespace cudf::io::json::detail
2 changes: 1 addition & 1 deletion cpp/include/cudf/lists/detail/scatter.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ std::unique_ptr<column> scatter_impl(rmm::device_uvector<unbound_list_view> cons
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
CUDF_EXPECTS(column_types_equal(source, target), "Mismatched column types.");
CUDF_EXPECTS(have_same_types(source, target), "Mismatched column types.");

auto const child_column_type = lists_column_view(target).child().type();

Expand Down
18 changes: 7 additions & 11 deletions cpp/include/cudf/strings/detail/gather.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -19,23 +19,19 @@
#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/offsets_iterator_factory.cuh>
#include <cudf/detail/sizes_to_offsets_iterator.cuh>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/strings/detail/strings_children.cuh>
#include <cudf/strings/detail/utilities.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/utilities/span.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/exec_policy.hpp>
#include <rmm/resource_ref.hpp>

#include <cuda/functional>
#include <thrust/advance.h>
#include <thrust/binary_search.h>
#include <thrust/distance.h>
#include <thrust/execution_policy.h>
#include <thrust/functional.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>

namespace cudf {
Expand Down Expand Up @@ -226,7 +222,7 @@ rmm::device_uvector<char> gather_chars(StringIterator strings_begin,
MapIterator map_begin,
MapIterator map_end,
cudf::detail::input_offsetalator const offsets,
size_type chars_bytes,
int64_t chars_bytes,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
Expand All @@ -239,9 +235,9 @@ rmm::device_uvector<char> gather_chars(StringIterator strings_begin,
constexpr int warps_per_threadblock = 4;
// String parallel strategy will be used if average string length is above this threshold.
// Otherwise, char parallel strategy will be used.
constexpr size_type string_parallel_threshold = 32;
constexpr int64_t string_parallel_threshold = 32;

size_type average_string_length = chars_bytes / output_count;
int64_t const average_string_length = chars_bytes / output_count;

if (average_string_length > string_parallel_threshold) {
constexpr int max_threadblocks = 65536;
Expand Down Expand Up @@ -302,16 +298,16 @@ std::unique_ptr<cudf::column> gather(strings_column_view const& strings,
strings.is_empty() ? make_empty_column(type_id::INT32)->view() : strings.offsets(),
strings.offset());
auto offsets_itr = thrust::make_transform_iterator(
auto sizes_itr = thrust::make_transform_iterator(
begin,
cuda::proclaim_return_type<size_type>(
[d_strings = *d_strings, d_in_offsets] __device__(size_type idx) {
if (NullifyOutOfBounds && (idx < 0 || idx >= d_strings.size())) { return 0; }
if (not d_strings.is_valid(idx)) { return 0; }
return static_cast<size_type>(d_in_offsets[idx + 1] - d_in_offsets[idx]);
}));
auto [out_offsets_column, total_bytes] =
cudf::detail::make_offsets_child_column(offsets_itr, offsets_itr + output_count, stream, mr);
auto [out_offsets_column, total_bytes] = cudf::strings::detail::make_offsets_child_column(
sizes_itr, sizes_itr + output_count, stream, mr);
// build chars column
auto const offsets_view =
Expand Down
11 changes: 1 addition & 10 deletions cpp/include/cudf/table/table_view.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -339,15 +339,6 @@ bool has_nested_nullable_columns(table_view const& input);
*/
std::vector<column_view> get_nullable_columns(table_view const& table);

/**
* @brief Checks if two `table_view`s have columns of same types
*
* @param lhs left-side table_view operand
* @param rhs right-side table_view operand
* @return boolean comparison result
*/
bool have_same_types(table_view const& lhs, table_view const& rhs);

/**
* @brief Copy column_views from a table_view into another table_view according to
* a column indices map.
Expand Down
Loading

0 comments on commit e4fd7b7

Please sign in to comment.