Skip to content

Commit

Permalink
Move chars column to parent data buffer in strings column (#14202)
Browse files Browse the repository at this point in the history
Eliminates the chars child column and moves the chars data into the parent strings column's `_data` buffer.

Summary of changes 
- Remove the chars child column and add the chars buffer to the parent column
- Add a `stream` parameter to `chars_size()` and `chars_end()` in `strings_column_view`, and update their invocations
- Remove `chars_column_index` and deprecate `chars()` in `strings_column_view`
- Replace `chars_col.begin<char>()` with `static_cast<char*>(parent.head())`
- Add a strings column factory that accepts an `rmm::device_buffer` instead of a chars column
- Deprecate the strings column factory that accepts a chars column
- IO changes: contiguous split (from @nvdbaranec), to_arrow, parquet writer
- Fix binary ops, column_view, interleave columns, byte cast, strings APIs, text APIs
- Fix tests and benchmarks (mostly adding the `stream` parameter to `chars_size`)
- Java fixes (from @andygrove)
- Python changes
  - .data special case for string column
  - get size from offsets column for rmm.DeviceBuffer in column
  - special condition for string slice
  - Pickle file update for string column
  - a few unit test updates

Preparing for #13733

Authors:
  - Karthikeyan (https://github.com/karthikeyann)

Approvers:
  - Jason Lowe (https://github.com/jlowe)
  - David Wendt (https://github.com/davidwendt)
  - Nghia Truong (https://github.com/ttnghia)
  - Lawrence Mitchell (https://github.com/wence-)
  - Matthew Roeschke (https://github.com/mroeschke)
  - Ashwin Srinath (https://github.com/shwina)

URL: #14202
  • Loading branch information
karthikeyann authored Jan 17, 2024
1 parent 8f5e64d commit c7acdaa
Show file tree
Hide file tree
Showing 93 changed files with 519 additions and 378 deletions.
4 changes: 2 additions & 2 deletions cpp/benchmarks/hashing/hash.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2023, NVIDIA CORPORATION.
* Copyright (c) 2021-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -43,7 +43,7 @@ static void bench_hash(nvbench::state& state)

// collect statistics
cudf::strings_column_view input(data->get_column(1).view());
auto const chars_size = input.chars_size();
auto const chars_size = input.chars_size(stream);
// add memory read from string column
state.add_global_memory_reads<nvbench::int8_t>(chars_size);
// add memory read from int64_t column
Expand Down
4 changes: 2 additions & 2 deletions cpp/benchmarks/json/json.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2023, NVIDIA CORPORATION.
* Copyright (c) 2021-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -190,7 +190,7 @@ void BM_case(benchmark::State& state, std::string query_arg)
int desired_bytes = state.range(1);
auto input = build_json_string_column(desired_bytes, num_rows);
cudf::strings_column_view scv(input->view());
size_t num_chars = scv.chars().size();
size_t num_chars = scv.chars_size(cudf::get_default_stream());

std::string json_path(query_arg);

Expand Down
18 changes: 9 additions & 9 deletions cpp/benchmarks/string/case.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2023, NVIDIA CORPORATION.
* Copyright (c) 2021-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -43,28 +43,28 @@ void bench_case(nvbench::state& state)
if (encoding == "ascii") {
data_profile ascii_profile = data_profile_builder().no_validity().distribution(
cudf::type_id::INT8, distribution_id::UNIFORM, 32, 126); // nice ASCII range
auto input = cudf::strings_column_view(col_view);
auto ascii_column =
create_random_column(cudf::type_id::INT8, row_count{input.chars_size()}, ascii_profile);
auto input = cudf::strings_column_view(col_view);
auto ascii_column = create_random_column(
cudf::type_id::INT8, row_count{input.chars_size(cudf::get_default_stream())}, ascii_profile);
auto ascii_data = ascii_column->view();

col_view = cudf::column_view(col_view.type(),
col_view.size(),
nullptr,
ascii_data.data<char>(),
col_view.null_mask(),
col_view.null_count(),
0,
{input.offsets(), ascii_data});
{input.offsets()});

ascii_contents = ascii_column->release();
}
auto input = cudf::strings_column_view(col_view);

state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));

state.add_element_count(input.chars_size(), "chars_size");
state.add_global_memory_reads<nvbench::int8_t>(input.chars_size());
state.add_global_memory_writes<nvbench::int8_t>(input.chars_size());
state.add_element_count(input.chars_size(cudf::get_default_stream()), "chars_size");
state.add_global_memory_reads<nvbench::int8_t>(input.chars_size(cudf::get_default_stream()));
state.add_global_memory_writes<nvbench::int8_t>(input.chars_size(cudf::get_default_stream()));

state.exec(nvbench::exec_tag::sync,
[&](nvbench::launch& launch) { auto result = cudf::strings::to_lower(input); });
Expand Down
4 changes: 2 additions & 2 deletions cpp/benchmarks/string/char_types.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
* Copyright (c) 2023-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -42,7 +42,7 @@ static void bench_char_types(nvbench::state& state)

state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
// gather some throughput statistics as well
auto chars_size = input.chars_size();
auto chars_size = input.chars_size(cudf::get_default_stream());
state.add_global_memory_reads<nvbench::int8_t>(chars_size); // all bytes are read;
if (api_type == "all") {
state.add_global_memory_writes<nvbench::int8_t>(num_rows); // output is a bool8 per row
Expand Down
5 changes: 3 additions & 2 deletions cpp/benchmarks/string/combine.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2023, NVIDIA CORPORATION.
* Copyright (c) 2021-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -44,7 +44,8 @@ static void BM_combine(benchmark::State& state)
cudf::strings::concatenate(table->view(), separator);
}

state.SetBytesProcessed(state.iterations() * (input1.chars_size() + input2.chars_size()));
state.SetBytesProcessed(state.iterations() * (input1.chars_size(cudf::get_default_stream()) +
input2.chars_size(cudf::get_default_stream())));
}

static void generate_bench_args(benchmark::internal::Benchmark* b)
Expand Down
4 changes: 2 additions & 2 deletions cpp/benchmarks/string/contains.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2023, NVIDIA CORPORATION.
* Copyright (c) 2021-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -100,7 +100,7 @@ static void bench_contains(nvbench::state& state)
auto pattern = patterns[pattern_index];
auto program = cudf::strings::regex_program::create(pattern);

auto chars_size = input.chars_size();
auto chars_size = input.chars_size(cudf::get_default_stream());
state.add_element_count(chars_size, "chars_size");
state.add_global_memory_reads<nvbench::int8_t>(chars_size);
state.add_global_memory_writes<nvbench::int32_t>(input.size());
Expand Down
5 changes: 3 additions & 2 deletions cpp/benchmarks/string/convert_datetime.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2023, NVIDIA CORPORATION.
* Copyright (c) 2021-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -48,7 +48,8 @@ void BM_convert_datetime(benchmark::State& state, direction dir)
cudf::strings::from_timestamps(input, "%Y-%m-%d %H:%M:%S");
}

auto const bytes = dir == direction::to ? source_string.chars_size() : n_rows * sizeof(TypeParam);
auto const bytes = dir == direction::to ? source_string.chars_size(cudf::get_default_stream())
: n_rows * sizeof(TypeParam);
state.SetBytesProcessed(state.iterations() * bytes);
}

Expand Down
10 changes: 6 additions & 4 deletions cpp/benchmarks/string/convert_fixed_point.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2023, NVIDIA CORPORATION.
* Copyright (c) 2021-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -49,8 +49,9 @@ void convert_to_fixed_point(benchmark::State& state)
}

// bytes_processed = bytes_input + bytes_output
state.SetBytesProcessed(state.iterations() *
(strings_view.chars_size() + rows * cudf::size_of(dtype)));
state.SetBytesProcessed(
state.iterations() *
(strings_view.chars_size(cudf::get_default_stream()) + rows * cudf::size_of(dtype)));
}

class StringsFromFixedPoint : public cudf::benchmark {};
Expand All @@ -74,7 +75,8 @@ void convert_from_fixed_point(benchmark::State& state)
// bytes_processed = bytes_input + bytes_output
state.SetBytesProcessed(
state.iterations() *
(cudf::strings_column_view(results->view()).chars_size() + rows * cudf::size_of(dtype)));
(cudf::strings_column_view(results->view()).chars_size(cudf::get_default_stream()) +
rows * cudf::size_of(dtype)));
}

#define CONVERT_TO_FIXED_POINT_BMD(name, fixed_point_type) \
Expand Down
10 changes: 6 additions & 4 deletions cpp/benchmarks/string/convert_numerics.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2023, NVIDIA CORPORATION.
* Copyright (c) 2021-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -63,8 +63,9 @@ void convert_to_number(benchmark::State& state)
}

// bytes_processed = bytes_input + bytes_output
state.SetBytesProcessed(state.iterations() *
(strings_view.chars_size() + rows * sizeof(NumericType)));
state.SetBytesProcessed(
state.iterations() *
(strings_view.chars_size(cudf::get_default_stream()) + rows * sizeof(NumericType)));
}

class StringsFromNumeric : public cudf::benchmark {};
Expand All @@ -90,7 +91,8 @@ void convert_from_number(benchmark::State& state)
// bytes_processed = bytes_input + bytes_output
state.SetBytesProcessed(
state.iterations() *
(cudf::strings_column_view(results->view()).chars_size() + rows * sizeof(NumericType)));
(cudf::strings_column_view(results->view()).chars_size(cudf::get_default_stream()) +
rows * sizeof(NumericType)));
}

#define CONVERT_TO_NUMERICS_BD(name, type) \
Expand Down
7 changes: 4 additions & 3 deletions cpp/benchmarks/string/copy.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2023, NVIDIA CORPORATION.
* Copyright (c) 2021-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -64,8 +64,9 @@ static void BM_copy(benchmark::State& state, copy_type ct)
}
}

state.SetBytesProcessed(state.iterations() *
cudf::strings_column_view(source->view().column(0)).chars_size());
state.SetBytesProcessed(
state.iterations() *
cudf::strings_column_view(source->view().column(0)).chars_size(cudf::get_default_stream()));
}

static void generate_bench_args(benchmark::internal::Benchmark* b)
Expand Down
4 changes: 2 additions & 2 deletions cpp/benchmarks/string/count.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2023, NVIDIA CORPORATION.
* Copyright (c) 2021-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -47,7 +47,7 @@ static void bench_count(nvbench::state& state)

state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
// gather some throughput statistics as well
auto chars_size = input.chars_size();
auto chars_size = input.chars_size(cudf::get_default_stream());
state.add_element_count(chars_size, "chars_size");
state.add_global_memory_reads<nvbench::int8_t>(chars_size);
state.add_global_memory_writes<nvbench::int32_t>(input.size());
Expand Down
4 changes: 2 additions & 2 deletions cpp/benchmarks/string/extract.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2023, NVIDIA CORPORATION.
* Copyright (c) 2021-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -67,7 +67,7 @@ static void bench_extract(nvbench::state& state)

state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
// gather some throughput statistics as well
auto chars_size = strings_view.chars_size();
auto chars_size = strings_view.chars_size(cudf::get_default_stream());
state.add_element_count(chars_size, "chars_size"); // number of bytes;
state.add_global_memory_reads<nvbench::int8_t>(chars_size); // all bytes are read;
state.add_global_memory_writes<nvbench::int8_t>(chars_size); // all bytes are written
Expand Down
4 changes: 2 additions & 2 deletions cpp/benchmarks/string/factory.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2023, NVIDIA CORPORATION.
* Copyright (c) 2021-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -67,7 +67,7 @@ static void BM_factory(benchmark::State& state)
}

cudf::strings_column_view input(column->view());
state.SetBytesProcessed(state.iterations() * input.chars_size());
state.SetBytesProcessed(state.iterations() * input.chars_size(cudf::get_default_stream()));
}

static void generate_bench_args(benchmark::internal::Benchmark* b)
Expand Down
4 changes: 2 additions & 2 deletions cpp/benchmarks/string/filter.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2023, NVIDIA CORPORATION.
* Copyright (c) 2021-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -57,7 +57,7 @@ static void BM_filter_chars(benchmark::State& state, FilterAPI api)
}
}

state.SetBytesProcessed(state.iterations() * input.chars_size());
state.SetBytesProcessed(state.iterations() * input.chars_size(cudf::get_default_stream()));
}

static void generate_bench_args(benchmark::internal::Benchmark* b)
Expand Down
4 changes: 2 additions & 2 deletions cpp/benchmarks/string/find.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2023, NVIDIA CORPORATION.
* Copyright (c) 2021-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -56,7 +56,7 @@ static void BM_find_scalar(benchmark::State& state, FindAPI find_api)
}
}

state.SetBytesProcessed(state.iterations() * input.chars_size());
state.SetBytesProcessed(state.iterations() * input.chars_size(cudf::get_default_stream()));
}

static void generate_bench_args(benchmark::internal::Benchmark* b)
Expand Down
5 changes: 3 additions & 2 deletions cpp/benchmarks/string/gather.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
* Copyright (c) 2023-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -43,7 +43,8 @@ static void bench_gather(nvbench::state& state)
create_random_table({cudf::type_id::INT32}, row_count{num_rows}, map_profile);

state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
auto chars_size = cudf::strings_column_view(input_table->view().column(0)).chars_size();
auto chars_size =
cudf::strings_column_view(input_table->view().column(0)).chars_size(cudf::get_default_stream());
state.add_global_memory_reads<nvbench::int8_t>(chars_size); // all bytes are read;
state.add_global_memory_writes<nvbench::int8_t>(chars_size);

Expand Down
4 changes: 2 additions & 2 deletions cpp/benchmarks/string/join_strings.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
* Copyright (c) 2023-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -40,7 +40,7 @@ static void bench_join(nvbench::state& state)

state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
// gather some throughput statistics as well
auto const chars_size = input.chars_size();
auto const chars_size = input.chars_size(cudf::get_default_stream());
state.add_element_count(chars_size, "chars_size"); // number of bytes;
state.add_global_memory_reads<nvbench::int8_t>(chars_size); // all bytes are read;
state.add_global_memory_writes<nvbench::int8_t>(chars_size); // all bytes are written
Expand Down
4 changes: 2 additions & 2 deletions cpp/benchmarks/string/lengths.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
* Copyright (c) 2023-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -40,7 +40,7 @@ static void bench_lengths(nvbench::state& state)

state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
// gather some throughput statistics as well
auto chars_size = input.chars_size();
auto chars_size = input.chars_size(cudf::get_default_stream());
state.add_global_memory_reads<nvbench::int8_t>(chars_size); // all bytes are read;
state.add_global_memory_writes<nvbench::int32_t>(num_rows); // output is an integer per row

Expand Down
4 changes: 2 additions & 2 deletions cpp/benchmarks/string/like.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION.
* Copyright (c) 2022-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -99,7 +99,7 @@ static void bench_like(nvbench::state& state)

state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
// gather some throughput statistics as well
auto chars_size = input.chars_size();
auto chars_size = input.chars_size(cudf::get_default_stream());
state.add_element_count(chars_size, "chars_size"); // number of bytes;
state.add_global_memory_reads<nvbench::int8_t>(chars_size); // all bytes are read;
state.add_global_memory_writes<nvbench::int8_t>(n_rows); // writes are BOOL8
Expand Down
8 changes: 4 additions & 4 deletions cpp/benchmarks/string/repeat_strings.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2023, NVIDIA CORPORATION.
* Copyright (c) 2021-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -59,7 +59,7 @@ static void BM_repeat_strings_scalar_times(benchmark::State& state)
cudf::strings::repeat_strings(strings_col, default_repeat_times);
}

state.SetBytesProcessed(state.iterations() * strings_col.chars_size());
state.SetBytesProcessed(state.iterations() * strings_col.chars_size(cudf::get_default_stream()));
}

static void BM_repeat_strings_column_times(benchmark::State& state)
Expand All @@ -75,8 +75,8 @@ static void BM_repeat_strings_column_times(benchmark::State& state)
cudf::strings::repeat_strings(strings_col, repeat_times_col);
}

state.SetBytesProcessed(state.iterations() *
(strings_col.chars_size() + repeat_times_col.size() * sizeof(int32_t)));
state.SetBytesProcessed(state.iterations() * (strings_col.chars_size(cudf::get_default_stream()) +
repeat_times_col.size() * sizeof(int32_t)));
}

static void generate_bench_args(benchmark::internal::Benchmark* b)
Expand Down
Loading

0 comments on commit c7acdaa

Please sign in to comment.