Skip to content

Commit

Permalink
Merge branch 'branch-22.06' into list-get-sequence-input
Browse files Browse the repository at this point in the history
  • Loading branch information
shwina authored Apr 11, 2022
2 parents aee243d + 012af64 commit 0facf27
Show file tree
Hide file tree
Showing 42 changed files with 2,941 additions and 3,194 deletions.
260 changes: 258 additions & 2 deletions CHANGELOG.md

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions ci/benchmark/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ export GBENCH_BENCHMARKS_DIR="$WORKSPACE/cpp/build/gbenchmarks/"
export LIBCUDF_KERNEL_CACHE_PATH="$HOME/.jitify-cache"

# Dask & Distributed option to install main(nightly) or `conda-forge` packages.
export INSTALL_DASK_MAIN=0
export INSTALL_DASK_MAIN=1

function remove_libcudf_kernel_cache_dir {
EXITCODE=$?
Expand Down Expand Up @@ -82,8 +82,8 @@ if [[ "${INSTALL_DASK_MAIN}" == 1 ]]; then
gpuci_logger "gpuci_mamba_retry update dask"
gpuci_mamba_retry update dask
else
gpuci_logger "gpuci_mamba_retry install conda-forge::dask==2022.03.0 conda-forge::distributed==2022.03.0 conda-forge::dask-core==2022.03.0 --force-reinstall"
gpuci_mamba_retry install conda-forge::dask==2022.03.0 conda-forge::distributed==2022.03.0 conda-forge::dask-core==2022.03.0 --force-reinstall
gpuci_logger "gpuci_mamba_retry install conda-forge::dask>=2022.03.0 conda-forge::distributed>=2022.03.0 conda-forge::dask-core>=2022.03.0 --force-reinstall"
# Each spec is quoted: an unquoted `>=2022.03.0` is parsed by the shell as an output
# redirection to a file named `=2022.03.0`, which silently drops the version bound.
gpuci_mamba_retry install "conda-forge::dask>=2022.03.0" "conda-forge::distributed>=2022.03.0" "conda-forge::dask-core>=2022.03.0" --force-reinstall
fi

# Install the master version of streamz
Expand Down
6 changes: 3 additions & 3 deletions ci/gpu/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ export GIT_DESCRIBE_TAG=`git describe --tags`
export MINOR_VERSION=`echo $GIT_DESCRIBE_TAG | grep -o -E '([0-9]+\.[0-9]+)'`

# Dask & Distributed option to install main(nightly) or `conda-forge` packages.
export INSTALL_DASK_MAIN=0
export INSTALL_DASK_MAIN=1

# ucx-py version
export UCX_PY_VERSION='0.26.*'
Expand Down Expand Up @@ -112,8 +112,8 @@ function install_dask {
gpuci_mamba_retry update dask
conda list
else
gpuci_logger "gpuci_mamba_retry install conda-forge::dask==2022.03.0 conda-forge::distributed==2022.03.0 conda-forge::dask-core==2022.03.0 --force-reinstall"
gpuci_mamba_retry install conda-forge::dask==2022.03.0 conda-forge::distributed==2022.03.0 conda-forge::dask-core==2022.03.0 --force-reinstall
gpuci_logger "gpuci_mamba_retry install conda-forge::dask>=2022.03.0 conda-forge::distributed>=2022.03.0 conda-forge::dask-core>=2022.03.0 --force-reinstall"
# Each spec is quoted: an unquoted `>=2022.03.0` is parsed by the shell as an output
# redirection to a file named `=2022.03.0`, which silently drops the version bound.
gpuci_mamba_retry install "conda-forge::dask>=2022.03.0" "conda-forge::distributed>=2022.03.0" "conda-forge::dask-core>=2022.03.0" --force-reinstall
fi
# Install the main version of streamz
gpuci_logger "Install the main version of streamz"
Expand Down
4 changes: 2 additions & 2 deletions conda/environments/cudf_dev_cuda11.5.yml
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,8 @@ dependencies:
- pydocstyle=6.1.1
- typing_extensions
- pre-commit
- dask==2022.03.0
- distributed==2022.03.0
- dask>=2022.03.0
- distributed>=2022.03.0
- streamz
- arrow-cpp=7.0.0
- dlpack>=0.5,<0.6.0a0
Expand Down
4 changes: 2 additions & 2 deletions conda/recipes/custreamz/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,8 @@ requirements:
- python
- streamz
- cudf {{ version }}
- dask==2022.03.0
- distributed==2022.03.0
- dask>=2022.03.0
- distributed>=2022.03.0
- python-confluent-kafka >=1.7.0,<1.8.0a0
- cudf_kafka {{ version }}

Expand Down
8 changes: 4 additions & 4 deletions conda/recipes/dask-cudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,14 @@ requirements:
host:
- python
- cudf {{ version }}
- dask==2022.03.0
- distributed==2022.03.0
- dask>=2022.03.0
- distributed>=2022.03.0
- cudatoolkit {{ cuda_version }}
run:
- python
- cudf {{ version }}
- dask==2022.03.0
- distributed==2022.03.0
- dask>=2022.03.0
- distributed>=2022.03.0
- {{ pin_compatible('cudatoolkit', max_pin='x', min_pin='x') }}

test: # [linux64]
Expand Down
2 changes: 1 addition & 1 deletion cpp/benchmarks/sort/sort_structs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,5 +80,5 @@ void nvbench_sort_struct(nvbench::state& state)
NVBENCH_BENCH(nvbench_sort_struct)
.set_name("sort_struct")
.add_int64_power_of_two_axis("NumRows", {10, 18, 26})
.add_int64_axis("Depth", {1, 8})
.add_int64_axis("Depth", {0, 1, 8})
.add_int64_axis("Nulls", {0, 1});
85 changes: 68 additions & 17 deletions cpp/benchmarks/string/contains.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,48 +19,99 @@
#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

#include <cudf_test/column_wrapper.hpp>

#include <cudf/filling.hpp>
#include <cudf/strings/contains.hpp>
#include <cudf/strings/findall.hpp>
#include <cudf/strings/strings_column_view.hpp>

class StringContains : public cudf::benchmark {
};

/**
 * @brief Builds the strings column used as input for the regex benchmarks.
 *
 * Rows are gathered from a fixed set of ten strings. The string at index 0 is the one
 * annotated as matching the benchmark patterns; it is written into the gather-map at
 * evenly spaced row positions to approximate the requested hit rate. All remaining
 * gather-map entries are random indices drawn with the bounds 1 and `size - 1`
 * configured in the profile below, i.e. they select the non-matching strings.
 *
 * @param n_rows Number of rows in the returned column
 * @param hit_rate Percentage of rows that should hold the matching string
 * @return Strings column with `n_rows` rows
 */
std::unique_ptr<cudf::column> build_input_column(cudf::size_type n_rows, int32_t hit_rate)
{
  // build input table using the following data
  auto data = cudf::test::strings_column_wrapper({
    "123 abc 4567890 DEFGHI 0987 5W43",  // matches both patterns;
    "012345 6789 01234 56789 0123 456",  // the rest do not match
    "abc 4567890 DEFGHI 0987 Wxyz 123",
    "abcdefghijklmnopqrstuvwxyz 01234",
    "",
    "AbcéDEFGHIJKLMNOPQRSTUVWXYZ 01",
    "9876543210,abcdefghijklmnopqrstU",
    "9876543210,abcdefghijklmnopqrstU",
    "123 édf 4567890 DéFG 0987 X5",
    "1",
  });
  auto data_view = cudf::column_view(data);

  // compute number of rows in n_rows that should match;
  // the product is formed in 64 bits so that large row counts cannot overflow
  auto const matches =
    static_cast<int32_t>((static_cast<int64_t>(n_rows) * hit_rate) / 100);

  // Create a randomized gather-map to build a column out of the strings in data.
  data_profile gather_profile;
  gather_profile.set_distribution_params(
    cudf::type_id::INT32, distribution_id::UNIFORM, 1, data_view.size() - 1);
  gather_profile.set_null_frequency(0.0);  // no nulls for gather-map
  gather_profile.set_cardinality(0);
  auto map_table =
    create_random_table({cudf::type_id::INT32}, row_count{n_rows}, gather_profile);
  map_table->get_column(0).set_null_mask(rmm::device_buffer{}, 0);

  // Place 0-index values throughout the gather-map. This step is skipped when no row
  // should match: the stride `n_rows / matches` would otherwise divide by zero.
  if (matches > 0) {
    auto scatter_data = cudf::sequence(
      matches, cudf::numeric_scalar<int32_t>(0), cudf::numeric_scalar<int32_t>(n_rows / matches));
    auto zero_scalar = cudf::numeric_scalar<int32_t>(0);
    map_table        = cudf::scatter({zero_scalar}, scatter_data->view(), map_table->view());
  }

  auto gather_map = map_table->view().column(0);
  auto table      = cudf::gather(cudf::table_view({data_view}), gather_map);

  // front() yields an lvalue reference into the released vector, hence the explicit move
  return std::move(table->release().front());
}

// Selects which strings regex API BM_contains exercises in its timed loop.
enum contains_type { contains, count, findall };

// longer pattern lengths demand more working memory per string
// Indexed by the "patterns index" benchmark argument read in BM_contains.
std::string patterns[] = {"^\\d+ [a-z]+", "[A-Z ]+\\d+ +\\d+[A-Z]+\\d+$"};

/**
 * @brief Shared driver for the strings regex benchmarks.
 *
 * Reads the row count from `state.range(0)`, the index into `patterns` from
 * `state.range(1)` and the hit rate from `state.range(2)`, builds the input with
 * `build_input_column`, then on every benchmark iteration runs the API selected by `ct`
 * under a `cuda_event_timer`. Bytes processed are reported as the iteration count
 * multiplied by `input.chars_size()`.
 *
 * @param state Google Benchmark state supplying the arguments and the iteration loop
 * @param ct Which regex API to exercise
 */
static void BM_contains(benchmark::State& state, contains_type ct)
{
cudf::size_type const n_rows{(cudf::size_type)state.range(0)};
auto const table = create_random_table({cudf::type_id::STRING}, row_count{n_rows});
cudf::strings_column_view input(table->view().column(0));
auto const n_rows = static_cast<cudf::size_type>(state.range(0));
auto const pattern_index = static_cast<int32_t>(state.range(1));
auto const hit_rate = static_cast<int32_t>(state.range(2));

// input is built once, outside the timed loop
auto col = build_input_column(n_rows, hit_rate);
auto input = cudf::strings_column_view(col->view());

auto pattern = patterns[pattern_index];

for (auto _ : state) {
// constructed at the top of each iteration so the timer object covers the call below
cuda_event_timer raii(state, true, rmm::cuda_stream_default);
// contains_re(), matches_re(), and count_re() all have similar functions
// with count_re() being the most regex intensive
switch (ct) {
case contains_type::contains: // contains_re and matches_re use the same main logic
cudf::strings::contains_re(input, "\\d+");
cudf::strings::contains_re(input, pattern);
break;
case contains_type::count: // counts occurrences of pattern
cudf::strings::count_re(input, "\\d+");
case contains_type::count: // counts occurrences of matches
cudf::strings::count_re(input, pattern);
break;
case contains_type::findall: // returns occurrences of matches
cudf::strings::findall(input, "\\d+");
case contains_type::findall: // returns occurrences of all matches
cudf::strings::findall(input, pattern);
break;
}
}

state.SetBytesProcessed(state.iterations() * input.chars_size());
}

// Defines a StringContains fixture benchmark called `name` whose body forwards to
// BM_contains with contains_type::b, then registers it using manual timing and
// millisecond units.
#define STRINGS_BENCHMARK_DEFINE(name, b) \
BENCHMARK_DEFINE_F(StringContains, name) \
(::benchmark::State & st) { BM_contains(st, contains_type::b); } \
BENCHMARK_REGISTER_F(StringContains, name) \
->RangeMultiplier(8) \
->Ranges({{1 << 12, 1 << 24}}) \
->UseManualTime() \
#define STRINGS_BENCHMARK_DEFINE(name, b) \
BENCHMARK_DEFINE_F(StringContains, name) \
(::benchmark::State & st) { BM_contains(st, contains_type::b); } \
BENCHMARK_REGISTER_F(StringContains, name) \
->ArgsProduct({{4096, 32768, 262144, 2097152, 16777216}, /* row count */ \
{0, 1}, /* patterns index */ \
{1, 5, 10, 25, 70, 100}}) /* hit rate */ \
->UseManualTime() \
->Unit(benchmark::kMillisecond);

STRINGS_BENCHMARK_DEFINE(contains_re, contains)
Expand Down
6 changes: 3 additions & 3 deletions cpp/include/cudf/io/json.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -44,8 +44,8 @@ class json_reader_options_builder;
/**
* @brief Input arguments to the `read_json` interface.
*
* Available parameters and are closely patterned after PANDAS' `read_json` API.
* Not all parameters are unsupported. If the matching PANDAS' parameter
* Available parameters are closely patterned after PANDAS' `read_json` API.
* Not all parameters are supported. If the matching PANDAS' parameter
* has a default value of `None`, then a default value of `-1` or `0` may be
* used as the equivalent.
*
Expand Down
6 changes: 5 additions & 1 deletion cpp/include/cudf/table/experimental/row_operators.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,11 @@ class device_row_comparator {
return cuda::std::make_pair(state, depth);
}

// Structs have been modified to only have 1 child when using this.
if (lcol.num_child_columns() == 0) {
return cuda::std::make_pair(weak_ordering::EQUIVALENT, depth);
}

// Non-empty structs have been modified to only have 1 child when using this.
lcol = lcol.children()[0];
rcol = rcol.children()[0];
++depth;
Expand Down
13 changes: 13 additions & 0 deletions cpp/src/io/parquet/reader_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1179,6 +1179,19 @@ rmm::device_buffer reader::impl::decompress_page_data(
codec_stats{parquet::SNAPPY, 0, 0},
codec_stats{parquet::BROTLI, 0, 0}};

auto is_codec_supported = [&codecs](int8_t codec) {
if (codec == parquet::UNCOMPRESSED) return true;
return std::find_if(codecs.begin(), codecs.end(), [codec](auto& cstats) {
return codec == cstats.compression_type;
}) != codecs.end();
};
CUDF_EXPECTS(std::all_of(chunks.begin(),
chunks.end(),
[&is_codec_supported](auto const& chunk) {
return is_codec_supported(chunk.codec);
}),
"Unsupported compression type");

for (auto& codec : codecs) {
for_each_codec_page(codec.compression_type, [&](size_t page) {
auto page_uncomp_size = pages[page].uncompressed_page_size;
Expand Down
93 changes: 21 additions & 72 deletions cpp/src/strings/contains.cu
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
* limitations under the License.
*/

#include <strings/count_matches.hpp>
#include <strings/regex/dispatcher.hpp>
#include <strings/regex/regex.cuh>
#include <strings/utilities.hpp>
Expand Down Expand Up @@ -114,6 +115,26 @@ std::unique_ptr<column> matches_re(
return regex_dispatcher(*d_prog, contains_dispatch_fn{*d_prog, true}, input, stream, mr);
}

/**
 * @brief Produces the per-row result of `count_matches` for `pattern` over `input`.
 *
 * The pattern is compiled into a device regex program, `count_matches` is run over a
 * device view of the parent column, and, when the input contains nulls, a copy of the
 * input's null mask and its null count are attached to the result.
 *
 * @param input Strings column to search
 * @param pattern Regex pattern to compile
 * @param flags Regex flags forwarded to the compiler
 * @param stream CUDA stream used for the device view, the null-mask copy and `count_matches`
 * @param mr Device memory resource used for the returned column
 * @return Column returned by `count_matches`, carrying the input's null mask if it has nulls
 */
std::unique_ptr<column> count_re(strings_column_view const& input,
                                 std::string const& pattern,
                                 regex_flags const flags,
                                 rmm::cuda_stream_view stream,
                                 rmm::mr::device_memory_resource* mr)
{
  // compile regex into device object
  auto compiled_prog =
    reprog_device::create(pattern, flags, get_character_flags_table(), input.size(), stream);

  auto const device_input = column_device_view::create(input.parent(), stream);

  auto counts = count_matches(*device_input, *compiled_prog, input.size(), stream, mr);

  // nothing more to do when every row is valid
  if (!input.has_nulls()) { return counts; }

  // propagate the input's validity to the output
  counts->set_null_mask(cudf::detail::copy_bitmask(input.parent(), stream, mr),
                        input.null_count());
  return counts;
}

} // namespace detail

// external APIs
Expand All @@ -136,78 +157,6 @@ std::unique_ptr<column> matches_re(strings_column_view const& strings,
return detail::matches_re(strings, pattern, flags, rmm::cuda_stream_default, mr);
}

namespace detail {
namespace {
/**
 * @brief Device functor returning how many times the regex program matches in one row.
 *
 * Null rows yield 0. For other rows the search restarts after each match until
 * `find` reports no further match or the search position reaches the string length.
 */
template <int stack_size>
struct count_fn {
  reprog_device prog;
  column_device_view const d_strings;

  __device__ int32_t operator()(unsigned int idx)
  {
    if (d_strings.is_null(idx)) { return 0; }

    auto const d_str  = d_strings.element<string_view>(idx);
    auto const length = d_str.length();

    int32_t match_count = 0;
    for (int32_t pos = 0; pos < length;) {
      // `pos` and `match_end` are handed to find() as the search range and read back below
      auto match_end = static_cast<int32_t>(length);
      if (prog.find<stack_size>(idx, d_str, pos, match_end) <= 0) { break; }
      ++match_count;
      // always advance by at least one position so the loop makes progress
      pos = (match_end > pos) ? match_end : pos + 1;
    }
    return match_count;
  }
};

struct count_dispatch_fn {
reprog_device d_prog;

template <int stack_size>
std::unique_ptr<column> operator()(strings_column_view const& input,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
auto results = make_numeric_column(data_type{type_id::INT32},
input.size(),
cudf::detail::copy_bitmask(input.parent(), stream, mr),
input.null_count(),
stream,
mr);

auto const d_strings = column_device_view::create(input.parent(), stream);
thrust::transform(rmm::exec_policy(stream),
thrust::make_counting_iterator<size_type>(0),
thrust::make_counting_iterator<size_type>(input.size()),
results->mutable_view().data<int32_t>(),
count_fn<stack_size>{d_prog, *d_strings});
return results;
}
};

} // namespace

/**
 * @brief Compiles `pattern` and dispatches `count_dispatch_fn` over `input`.
 *
 * @param input Strings column to search
 * @param pattern Regex pattern to compile
 * @param flags Regex flags forwarded to the compiler
 * @param stream CUDA stream passed to the compiler and the dispatched functor
 * @param mr Device memory resource passed to the dispatched functor
 * @return Column produced by `count_dispatch_fn`
 */
std::unique_ptr<column> count_re(
  strings_column_view const& input,
  std::string const& pattern,
  regex_flags const flags,
  rmm::cuda_stream_view stream,
  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
{
  auto const num_rows = input.size();

  // compile regex into device object
  auto compiled_prog =
    reprog_device::create(pattern, flags, get_character_flags_table(), num_rows, stream);

  // regex_dispatcher chooses how the functor's templated call operator is instantiated
  return regex_dispatcher(
    *compiled_prog, count_dispatch_fn{*compiled_prog}, input, stream, mr);
}

} // namespace detail

// external API

std::unique_ptr<column> count_re(strings_column_view const& strings,
std::string const& pattern,
regex_flags const flags,
Expand Down
16 changes: 10 additions & 6 deletions cpp/src/strings/convert/convert_datetime.cu
Original file line number Diff line number Diff line change
Expand Up @@ -298,12 +298,14 @@ struct parse_datetime {
}
case 'z': {
// 'z' format is +hhmm -- single sign char followed by 2 chars each for hour and minute (5 chars, no colon)
auto const sign = *ptr == '-' ? 1 : -1;
auto const [hh, lh] = parse_int(ptr + 1, 2);
auto const [mm, lm] = parse_int(ptr + 3, 2);
// revert timezone back to UTC
timeparts.tz_minutes = sign * ((hh * 60) + mm);
bytes_read -= lh + lm;
if (item.length == 5) {
auto const sign = *ptr == '-' ? 1 : -1;
auto const [hh, lh] = parse_int(ptr + 1, 2);
auto const [mm, lm] = parse_int(ptr + 3, 2);
// revert timezone back to UTC
timeparts.tz_minutes = sign * ((hh * 60) + mm);
bytes_read -= lh + lm;
}
break;
}
case 'Z': break; // skip
Expand Down Expand Up @@ -574,6 +576,8 @@ struct check_datetime_format {
auto const cvm = check_value(ptr + 3, 2, 0, 59);
result = (*ptr == '-' || *ptr == '+') && cvh.first && cvm.first;
bytes_read -= cvh.second + cvm.second;
} else if (item.length == 1) {
result = *ptr == 'Z';
}
break;
}
Expand Down
Loading

0 comments on commit 0facf27

Please sign in to comment.