Skip to content

Commit

Permalink
Merge branch 'branch-24.08' into fix/cudf_pandas_excelwriter_hierarchy
Browse files Browse the repository at this point in the history
  • Loading branch information
galipremsagar authored May 31, 2024
2 parents 3d7c82b + 7949a9c commit c7196f7
Show file tree
Hide file tree
Showing 17 changed files with 230 additions and 47 deletions.
17 changes: 15 additions & 2 deletions cpp/src/io/json/json_column.cu
Original file line number Diff line number Diff line change
Expand Up @@ -594,8 +594,7 @@ void make_device_json_column(device_span<SymbolT const> input,
col.validity =
cudf::detail::create_null_mask(col.num_rows, cudf::mask_state::ALL_NULL, stream, mr);
col.type = json_col_t::StringColumn;
col.child_columns.clear(); // their references should be deleted too.
col.column_order.clear();
// destroy references of all child columns after this step, by calling remove_child_columns
};

path_from_tree tree_path{column_categories,
Expand Down Expand Up @@ -628,6 +627,19 @@ void make_device_json_column(device_span<SymbolT const> input,
std::vector<uint8_t> is_pruned(num_columns, 0);
columns.try_emplace(parent_node_sentinel, std::ref(root));

// Recursively detaches every descendant of `col`.
// For each child name listed in `col.column_order`, the child's column id is looked up in
// `mapped_columns`, the child is flagged in `is_mixed_type_column`, its own descendants are
// removed first, and then its entries in `mapped_columns` and `columns` are erased.
// Finally `col` itself forgets its children and their order.
// A std::function is used (instead of `auto`) so that the lambda can call itself by name.
//
// this_col_id: column id of `col`, used as the parent part of the `mapped_columns` key.
// col:         column whose children are removed.
std::function<void(NodeIndexT, device_json_column&)> remove_child_columns =
  [&](NodeIndexT this_col_id, device_json_column& col) {
    // Bind the name by const reference to avoid copying a string per child;
    // `col.column_order` is not modified until the loop has finished.
    for (auto const& col_name : col.column_order) {
      auto const child_id = mapped_columns[{this_col_id, col_name}];
      is_mixed_type_column[child_id] = 1;
      // depth-first: drop the grandchildren before dropping the child itself
      remove_child_columns(child_id, col.child_columns.at(col_name));
      mapped_columns.erase({this_col_id, col_name});
      columns.erase(child_id);
    }
    col.child_columns.clear();  // their references are deleted above.
    col.column_order.clear();
  };

auto name_and_parent_index = [&is_array_of_arrays,
&row_array_parent_col_id,
&column_parent_ids,
Expand Down Expand Up @@ -721,6 +733,7 @@ void make_device_json_column(device_span<SymbolT const> input,
auto& col = columns.at(old_col_id).get();
if (col.type == json_col_t::ListColumn or col.type == json_col_t::StructColumn) {
reinitialize_as_string(old_col_id, col);
remove_child_columns(old_col_id, col);
// all its children (which are already inserted) are ignored later.
}
col.forced_as_string_column = true;
Expand Down
6 changes: 4 additions & 2 deletions cpp/src/io/orc/dict_enc.cu
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

#include "orc_gpu.hpp"

#include <cudf/detail/offsets_iterator.cuh>
#include <cudf/detail/utilities/integer_utils.hpp>
#include <cudf/io/orc_types.hpp>
#include <cudf/table/experimental/row_operators.cuh>
Expand Down Expand Up @@ -43,11 +44,12 @@ CUDF_KERNEL void rowgroup_char_counts_kernel(device_2dspan<size_type> char_count
auto const start_row = rowgroup_bounds[row_group_idx][col_idx].begin + str_col.offset();
auto const num_rows = rowgroup_bounds[row_group_idx][col_idx].size();

auto const& offsets = str_col.child(strings_column_view::offsets_column_index);
auto const& offsets = str_col.child(strings_column_view::offsets_column_index);
auto const offsets_itr = cudf::detail::input_offsetalator(offsets.head(), offsets.type());
char_counts[str_col_idx][row_group_idx] =
(num_rows == 0)
? 0
: offsets.element<size_type>(start_row + num_rows) - offsets.element<size_type>(start_row);
: static_cast<size_type>(offsets_itr[start_row + num_rows] - offsets_itr[start_row]);
}

void rowgroup_char_counts(device_2dspan<size_type> counts,
Expand Down
49 changes: 37 additions & 12 deletions cpp/src/strings/replace/multi.cu
Original file line number Diff line number Diff line change
Expand Up @@ -30,23 +30,17 @@
#include <cudf/strings/string_view.cuh>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/utilities/default_stream.hpp>
#include <cudf/utilities/span.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/resource_ref.hpp>

#include <cuda/functional>
#include <thrust/binary_search.h>
#include <thrust/copy.h>
#include <thrust/count.h>
#include <thrust/distance.h>
#include <thrust/execution_policy.h>
#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/optional.h>
#include <thrust/scan.h>
#include <thrust/transform.h>

namespace cudf {
Expand Down Expand Up @@ -262,6 +256,38 @@ struct replace_multi_parallel_fn {
device_span<string_view const> d_replacements;
};

constexpr int64_t block_size = 512; // number of threads per block
constexpr size_type bytes_per_thread = 4; // bytes processed per thread

/**
 * @brief Count the number of targets in a strings column
 *
 * Each thread tests up to `bytes_per_thread` consecutive byte positions with
 * `fn.has_target()`. The per-thread counts are summed over the block with
 * `cub::BlockReduce` and thread 0 of each block adds the block total to `d_output`.
 *
 * The kernel must be launched with exactly `block_size` threads per block, since
 * that is the size `cub::BlockReduce` is instantiated with. The result is
 * accumulated into `*d_output`, so the caller has to set it to zero beforehand.
 *
 * @param fn Functor containing has_target() function
 * @param chars_bytes Number of bytes in the strings column
 * @param d_output Result of the count
 */
CUDF_KERNEL void count_targets(replace_multi_parallel_fn fn, int64_t chars_bytes, int64_t* d_output)
{
  auto const idx          = cudf::detail::grid_1d::global_thread_id();
  auto const byte_idx     = static_cast<int64_t>(idx) * bytes_per_thread;
  auto const tid_in_block = static_cast<cudf::size_type>(threadIdx.x);

  using block_reduce = cub::BlockReduce<int64_t, block_size>;
  __shared__ typename block_reduce::TempStorage temp_storage;

  int64_t count = 0;
  // each thread processes multiple bytes;
  // threads past the end of the data keep count == 0 but still join the reduction below
  for (auto i = byte_idx; (i < (byte_idx + bytes_per_thread)) && (i < chars_bytes); ++i) {
    count += fn.has_target(i, chars_bytes);
  }
  // the reduced value is only meaningful in thread 0 of the block
  auto const total = block_reduce(temp_storage).Reduce(count, cub::Sum());

  if ((tid_in_block == 0) && (total > 0)) {
    // Every block of the grid adds into the same global counter, so the atomic
    // must be device-scoped; a block-scoped atomic is only guaranteed to be
    // atomic with respect to threads of the same block.
    cuda::atomic_ref<int64_t, cuda::thread_scope_device> ref{*d_output};
    ref.fetch_add(total, cuda::std::memory_order_relaxed);
  }
}

/**
* @brief Used by the copy-if function to produce target_pair objects
*
Expand Down Expand Up @@ -308,12 +334,11 @@ std::unique_ptr<column> replace_character_parallel(strings_column_view const& in

// Count the number of targets in the entire column.
// Note this may over-count in the case where a target spans adjacent strings.
auto target_count = thrust::count_if(
rmm::exec_policy_nosync(stream),
thrust::make_counting_iterator<int64_t>(0),
thrust::make_counting_iterator<int64_t>(chars_bytes),
[fn, chars_bytes] __device__(int64_t idx) { return fn.has_target(idx, chars_bytes); });

rmm::device_scalar<int64_t> d_count(0, stream);
auto const num_blocks = util::div_rounding_up_safe(
util::div_rounding_up_safe(chars_bytes, static_cast<int64_t>(bytes_per_thread)), block_size);
count_targets<<<num_blocks, block_size, 0, stream.value()>>>(fn, chars_bytes, d_count.data());
auto target_count = d_count.value(stream);
// Create a vector of every target position in the chars column.
// These may also include overlapping targets which will be resolved later.
auto targets_positions = rmm::device_uvector<int64_t>(target_count, stream);
Expand Down
8 changes: 4 additions & 4 deletions cpp/src/text/vocabulary_tokenize.cu
Original file line number Diff line number Diff line change
Expand Up @@ -240,10 +240,10 @@ CUDF_KERNEL void token_counts_fn(cudf::column_device_view const d_strings,
return;
}

auto const offsets =
d_strings.child(cudf::strings_column_view::offsets_column_index).data<cudf::size_type>();
auto const offset = offsets[str_idx + d_strings.offset()] - offsets[d_strings.offset()];
auto const chars_begin = d_strings.data<char>() + offsets[d_strings.offset()];
auto const offsets = d_strings.child(cudf::strings_column_view::offsets_column_index);
auto const offsets_itr = cudf::detail::input_offsetalator(offsets.head(), offsets.type());
auto const offset = offsets_itr[str_idx + d_strings.offset()] - offsets_itr[d_strings.offset()];
auto const chars_begin = d_strings.data<char>() + offsets_itr[d_strings.offset()];

auto const begin = d_str.data();
auto const end = begin + d_str.size_bytes();
Expand Down
36 changes: 36 additions & 0 deletions cpp/tests/io/json_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2679,4 +2679,40 @@ TEST_F(JsonReaderTest, JsonNestedDtypeFilter)
}
}

TEST_F(JsonReaderTest, JSONMixedTypeChildren)
{
  // "Key" holds a list in the first and third rows and a struct in the second row.
  // With mixed_types_as_string enabled the reader is expected to return "Key" as a
  // STRING column holding the raw JSON text of each row.
  std::string const json_str = R"(
{ "Root": { "Key": [ { "EE": "A" } ] } }
{ "Root": { "Key": { } } }
{ "Root": { "Key": [{ "YY": 1}] } }
)";
  // Column "EE" is created and destroyed
  // Column "YY" should not be created

  cudf::io::json_reader_options reader_opts =
    cudf::io::json_reader_options::builder(cudf::io::source_info{json_str.c_str(), json_str.size()})
      .lines(true)
      .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL)
      .normalize_single_quotes(true)
      .normalize_whitespace(false)
      .mixed_types_as_string(true)
      .keep_quotes(true);

  auto parsed = cudf::io::read_json(reader_opts);

  // schema: a single top-level column "Root" with a single child "Key"
  ASSERT_EQ(parsed.tbl->num_columns(), 1);
  auto const& schema = parsed.metadata.schema_info;
  ASSERT_EQ(schema.size(), 1);
  auto const& root_info = schema[0];
  EXPECT_EQ(root_info.name, "Root");
  ASSERT_EQ(root_info.children.size(), 1);
  auto const& key_info = root_info.children[0];
  EXPECT_EQ(key_info.name, "Key");
  ASSERT_EQ(key_info.children.size(), 2);
  EXPECT_EQ(key_info.children[0].name, "offsets");

  // types
  auto const& root_col = parsed.tbl->get_column(0);
  EXPECT_EQ(root_col.type().id(), cudf::type_id::STRUCT);
  EXPECT_EQ(root_col.child(0).type().id(), cudf::type_id::STRING);

  // values: every row of "Key" is kept verbatim as text
  cudf::test::strings_column_wrapper expected_key(
    {R"([ { "EE": "A" } ])", "{ }", R"([{ "YY": 1}])"});

  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_key, root_col.child(0));
}

CUDF_TEST_PROGRAM_MAIN()
1 change: 1 addition & 0 deletions docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ This page provides API documentation for pylibcudf.
reduce
reshape
rolling
round
scalar
search
stream_compaction
Expand Down
6 changes: 6 additions & 0 deletions docs/cudf/source/user_guide/api_docs/pylibcudf/round.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
=====
round
=====

.. automodule:: cudf._lib.pylibcudf.round
:members:
1 change: 1 addition & 0 deletions python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ set(cython_sources
replace.pyx
reshape.pyx
rolling.pyx
round.pyx
scalar.pyx
search.pyx
stream_compaction.pyx
Expand Down
2 changes: 2 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/__init__.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ from . cimport (
replace,
reshape,
rolling,
round,
search,
sorting,
stream_compaction,
Expand Down Expand Up @@ -48,6 +49,7 @@ __all__ = [
"reduce",
"replace",
"rolling",
"round",
"search",
"stream_compaction",
"strings",
Expand Down
2 changes: 2 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
replace,
reshape,
rolling,
round,
search,
sorting,
stream_compaction,
Expand Down Expand Up @@ -48,6 +49,7 @@
"reduce",
"replace",
"rolling",
"round",
"search",
"stream_compaction",
"strings",
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
# the License.
# =============================================================================

set(cython_sources aggregation.pyx binaryop.pyx copying.pyx replace.pyx reduce.pxd
set(cython_sources aggregation.pyx binaryop.pyx copying.pyx replace.pyx reduce.pxd round.pyx
stream_compaction.pyx types.pyx unary.pyx
)

Expand Down
6 changes: 3 additions & 3 deletions python/cudf/cudf/_lib/pylibcudf/libcudf/round.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,9 @@ from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view

cdef extern from "cudf/round.hpp" namespace "cudf" nogil:

ctypedef enum rounding_method "cudf::rounding_method":
HALF_UP "cudf::rounding_method::HALF_UP"
HALF_EVEN "cudf::rounding_method::HALF_EVEN"
cpdef enum class rounding_method(int32_t):
HALF_UP
HALF_EVEN

cdef unique_ptr[column] round (
const column_view& input,
Expand Down
Empty file.
13 changes: 13 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/round.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
from libc.stdint cimport int32_t

from cudf._lib.pylibcudf.libcudf.round cimport rounding_method

from .column cimport Column


# Declaration of the pylibcudf `round` wrapper.
# `= *` marks an optional argument; the actual default value is given by the
# matching definition in the .pyx implementation file.
cpdef Column round(
Column source,
int32_t decimal_places = *,
rounding_method round_method = *
)
54 changes: 54 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/round.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
from libc.stdint cimport int32_t
from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move

from cudf._lib.pylibcudf.libcudf.round cimport (
round as cpp_round,
rounding_method,
)

from cudf._lib.pylibcudf.libcudf.round import \
rounding_method as RoundingMethod # no-cython-lint

from cudf._lib.pylibcudf.libcudf.column.column cimport column

from .column cimport Column


cpdef Column round(
    Column source,
    int32_t decimal_places = 0,
    rounding_method round_method = rounding_method.HALF_UP
):
    """Round every value of a column to a fixed number of decimal places.

    For details, see :cpp:func:`round`.

    Parameters
    ----------
    source : Column
        The Column whose values are rounded.
    decimal_places : int32_t, optional
        The number of decimal places to round to (default 0).
    round_method : rounding_method, optional
        The method by which each value is rounded.
        Can be one of { RoundingMethod.HALF_UP, RoundingMethod.HALF_EVEN }
        (default rounding_method.HALF_UP).

    Returns
    -------
    pylibcudf.Column
        A Column with values rounded.
    """
    cdef unique_ptr[column] rounded
    # the libcudf call does not touch Python objects, so the GIL is released for it
    with nogil:
        rounded = move(cpp_round(source.view(), decimal_places, round_method))

    return Column.from_libcudf(move(rounded))
36 changes: 13 additions & 23 deletions python/cudf/cudf/_lib/round.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,10 @@

from cudf.core.buffer import acquire_spill_lock

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move

from cudf._lib.column cimport Column
from cudf._lib.pylibcudf.libcudf.column.column cimport column
from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
from cudf._lib.pylibcudf.libcudf.round cimport (
round as cpp_round,
rounding_method as cpp_rounding_method,
)

import cudf._lib.pylibcudf as plc
from cudf._lib.pylibcudf.round import RoundingMethod


@acquire_spill_lock()
Expand All @@ -31,19 +25,15 @@ def round(Column input_col, int decimal_places=0, how="half_even"):
if how not in {"half_even", "half_up"}:
raise ValueError("'how' must be either 'half_even' or 'half_up'")

cdef column_view input_col_view = input_col.view()
cdef unique_ptr[column] c_result
cdef cpp_rounding_method c_how = (
cpp_rounding_method.HALF_EVEN if how == "half_even"
else cpp_rounding_method.HALF_UP
how = (
RoundingMethod.HALF_EVEN if how == "half_even"
else RoundingMethod.HALF_UP
)
with nogil:
c_result = move(
cpp_round(
input_col_view,
decimal_places,
c_how
)
)

return Column.from_unique_ptr(move(c_result))
return Column.from_pylibcudf(
plc.round.round(
input_col.to_pylibcudf(mode="read"),
decimal_places,
how
)
)
Loading

0 comments on commit c7196f7

Please sign in to comment.