Skip to content

Commit

Permalink
Merge branch 'branch-23.04' into feat/no_message_parsing
Browse files Browse the repository at this point in the history
  • Loading branch information
vyasr authored Feb 25, 2023
2 parents 6ffcc8a + eb4da93 commit 7b5131b
Show file tree
Hide file tree
Showing 36 changed files with 702 additions and 364 deletions.
24 changes: 24 additions & 0 deletions .flake8
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Copyright (c) 2017-2023, NVIDIA CORPORATION.

[flake8]
filename = *.py, *.pyx, *.pxd, *.pxi
exclude = __init__.py, *.egg, build, docs, .git
force-check = True
ignore =
# line break before binary operator
W503,
# whitespace before :
E203
per-file-ignores =
# Rules ignored only in Cython:
# E211: whitespace before '(' (used in multi-line imports)
# E225: Missing whitespace around operators (breaks cython casting syntax like <int>)
# E226: Missing whitespace around arithmetic operators (breaks cython pointer syntax like int*)
# E227: Missing whitespace around bitwise or shift operator (Can also break casting syntax)
# E275: Missing whitespace after keyword (Doesn't work with Cython except?)
# E402: module level import not at top of file (works for Python, not Cython)
# E999: invalid syntax (works for Python, not Cython)
# W504: line break after binary operator (breaks lines that end with a pointer)
*.pyx: E211, E225, E226, E227, E275, E402, E999, W504
*.pxd: E211, E225, E226, E227, E275, E402, E999, W504
*.pxi: E211, E225, E226, E227, E275, E402, E999, W504
2 changes: 2 additions & 0 deletions .github/workflows/pr.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ jobs:
checks:
secrets: inherit
uses: rapidsai/shared-action-workflows/.github/workflows/[email protected]
with:
enable_check_generated_files: false
conda-cpp-build:
needs: checks
secrets: inherit
Expand Down
27 changes: 23 additions & 4 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ repos:
rev: 5.0.4
hooks:
- id: flake8
args: ["--config=setup.cfg"]
args: ["--config=.flake8"]
files: python/.*$
types: [file]
types_or: [python, cython]
Expand All @@ -48,7 +48,7 @@ repos:
hooks:
- id: mypy
additional_dependencies: [types-cachetools]
args: ["--config-file=setup.cfg",
args: ["--config-file=pyproject.toml",
"python/cudf/cudf",
"python/custreamz/custreamz",
"python/cudf_kafka/cudf_kafka",
Expand All @@ -58,7 +58,19 @@ repos:
rev: 6.1.1
hooks:
- id: pydocstyle
args: ["--config=setup.cfg"]
# https://github.com/PyCQA/pydocstyle/issues/603
additional_dependencies: [toml]
args: ["--config=pyproject.toml"]
- repo: https://github.com/nbQA-dev/nbQA
rev: 1.6.3
hooks:
- id: nbqa-isort
# Use the cudf_kafka isort orderings in notebooks so that dask
# and RAPIDS packages have their own sections.
args: ["--settings-file=python/cudf_kafka/pyproject.toml"]
- id: nbqa-black
# Explicitly specify the pyproject.toml at the repo root, not per-project.
args: ["--config=pyproject.toml"]
- repo: https://github.com/pre-commit/mirrors-clang-format
rev: v11.1.0
hooks:
Expand Down Expand Up @@ -138,14 +150,21 @@ repos:
pass_filenames: false
verbose: false
- repo: https://github.com/codespell-project/codespell
rev: v2.1.0
rev: v2.2.2
hooks:
- id: codespell
additional_dependencies: [tomli]
args: ["--toml", "pyproject.toml"]
exclude: |
(?x)^(
.*test.*|
^CHANGELOG.md$
)
- repo: https://github.com/rapidsai/dependency-file-generator
rev: v1.4.0
hooks:
- id: rapids-dependency-file-generator
args: ["--clean"]

default_language_version:
python: python3
4 changes: 2 additions & 2 deletions ci/check_style.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/bin/bash
# Copyright (c) 2020-2022, NVIDIA CORPORATION.
# Copyright (c) 2020-2023, NVIDIA CORPORATION.

set -euo pipefail

Expand All @@ -20,4 +20,4 @@ mkdir -p $(dirname ${RAPIDS_CMAKE_FORMAT_FILE})
wget -O ${RAPIDS_CMAKE_FORMAT_FILE} ${FORMAT_FILE_URL}

# Run pre-commit checks
pre-commit run --hook-stage manual --all-files --show-diff-on-failure
pre-commit run --all-files --show-diff-on-failure
2 changes: 1 addition & 1 deletion ci/release/update-version.sh
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ sed_runner "s/rmm==.*\",/rmm==${NEXT_SHORT_TAG_PEP440}.*\",/g" python/cudf/setup
sed_runner "s/cudf==.*\",/cudf==${NEXT_SHORT_TAG_PEP440}.*\",/g" python/dask_cudf/setup.py

# Dependency versions in pyproject.toml
sed_runner "s/rmm==.*\",/rmm==${NEXT_SHORT_TAG_PEP440}.*\",/g" python/pyproject.toml
sed_runner "s/rmm==.*\",/rmm==${NEXT_SHORT_TAG_PEP440}.*\",/g" python/cudf/pyproject.toml

for FILE in .github/workflows/*.yaml; do
sed_runner "/shared-action-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}"
Expand Down
2 changes: 1 addition & 1 deletion ci/test_cpp_memcheck.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ set +e
rapids-logger "Memcheck gtests with rmm_mode=cuda"
export GTEST_CUDF_RMM_MODE=cuda
COMPUTE_SANITIZER_CMD="compute-sanitizer --tool memcheck"
for gt in "$CONDA_PREFIX"/bin/gtests/{libcudf,libcudf_kafka}/* ; do
for gt in "$CONDA_PREFIX"/bin/gtests/libcudf/* ; do
test_name=$(basename ${gt})
if [[ "$test_name" == "ERROR_TEST" ]] || [[ "$test_name" == "STREAM_IDENTIFICATION_TEST" ]]; then
continue
Expand Down
4 changes: 2 additions & 2 deletions cpp/benchmarks/common/generate_input.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
* Copyright (c) 2020-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -501,7 +501,7 @@ std::unique_ptr<cudf::column> create_random_utf8_string_column(data_profile cons
rmm::device_uvector<cudf::size_type> offsets(num_rows + 1, cudf::get_default_stream());
thrust::exclusive_scan(
thrust::device, valid_lengths, valid_lengths + lengths.size(), offsets.begin());
// offfsets are ready.
// offsets are ready.
auto chars_length = *thrust::device_pointer_cast(offsets.end() - 1);
rmm::device_uvector<char> chars(chars_length, cudf::get_default_stream());
thrust::for_each_n(thrust::device,
Expand Down
6 changes: 3 additions & 3 deletions cpp/benchmarks/common/generate_input.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
* Copyright (c) 2020-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -373,13 +373,13 @@ class data_profile {

void set_bool_probability_true(double p)
{
CUDF_EXPECTS(p >= 0. and p <= 1., "probablity must be in range [0...1]");
CUDF_EXPECTS(p >= 0. and p <= 1., "probability must be in range [0...1]");
bool_probability_true = p;
}
void set_null_probability(std::optional<double> p)
{
CUDF_EXPECTS(p.value_or(0.) >= 0. and p.value_or(0.) <= 1.,
"probablity must be in range [0...1]");
"probability must be in range [0...1]");
null_probability = p;
}
void set_cardinality(cudf::size_type c) { cardinality = c; }
Expand Down
30 changes: 24 additions & 6 deletions cpp/include/cudf/lists/detail/dremel.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
* Copyright (c) 2022-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -183,16 +183,34 @@ struct dremel_data {
* - | - | -- | ---
* ```
*
* @param col Column of LIST type
* @param level_nullability Pre-determined nullability at each list level. Empty means infer from
* `col`
* @param input Column of LIST type
* @param nullability Pre-determined nullability at each list level. Empty means infer from
* `input`
* @param output_as_byte_array if `true`, then any nested list level that has a child of type
* `uint8_t` will be considered as the last level
* @param stream CUDA stream used for device memory operations and kernel launches.
*
* @return A struct containing dremel data
*/
dremel_data get_dremel_data(column_view h_col,
dremel_data get_dremel_data(column_view input,
std::vector<uint8_t> nullability,
bool output_as_byte_array,
rmm::cuda_stream_view stream);

/**
* @brief Get Dremel offsets, repetition levels, and modified definition levels to be used for
* lexicographical comparators.
*
* The modified definition levels are produced by treating each nested column in the input as
* nullable, so every level contributes to the definition levels whether or not `nullability`
* marks it as nullable.
*
* @param input Column of LIST type
* @param nullability Pre-determined nullability at each list level. Empty means infer from
* `input`
* @param output_as_byte_array if `true`, then any nested list level that has a child of type
* `uint8_t` will be considered as the last level
* @param stream CUDA stream used for device memory operations and kernel launches.
*
* @return A struct containing dremel data
*/
dremel_data get_comparator_data(column_view input,
std::vector<uint8_t> nullability,
bool output_as_byte_array,
rmm::cuda_stream_view stream);
} // namespace cudf::detail
3 changes: 2 additions & 1 deletion cpp/include/cudf/table/experimental/row_operators.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -487,7 +487,8 @@ class device_row_comparator {
// element_index because either both rows have a deeply nested NULL at the
// same position, and we'll "continue" in our iteration, or we will early
// exit if only one of the rows has a deeply nested NULL
if (lcol.nullable() and l_def_levels[l_dremel_index] == l_max_def_level - 1) {
if ((lcol.nullable() and l_def_levels[l_dremel_index] == l_max_def_level - 1) or
(rcol.nullable() and r_def_levels[r_dremel_index] == r_max_def_level - 1)) {
++element_index;
}
if (l_def_level == r_def_level) { continue; }
Expand Down
48 changes: 35 additions & 13 deletions cpp/src/lists/dremel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
#include <thrust/iterator/discard_iterator.h>

namespace cudf::detail {

namespace {
/**
* @brief Functor to get definition level value for a nested struct column until the leaf level or
* the first list level.
Expand All @@ -46,6 +46,7 @@ struct def_level_fn {
uint8_t const* d_nullability;
uint8_t sub_level_start;
uint8_t curr_def_level;
bool always_nullable;

__device__ uint32_t operator()(size_type i)
{
Expand All @@ -55,7 +56,7 @@ struct def_level_fn {
auto col = *parent_col;
do {
// If col not nullable then it does not contribute to def levels
if (d_nullability[l]) {
if (always_nullable or d_nullability[l]) {
if (not col.nullable() or bit_is_set(col.null_mask(), i)) {
++def;
} else { // We have found the shallowest level at which this row is null
Expand All @@ -72,10 +73,11 @@ struct def_level_fn {
}
};

dremel_data get_dremel_data(column_view h_col,
std::vector<uint8_t> nullability,
bool output_as_byte_array,
rmm::cuda_stream_view stream)
dremel_data get_encoding(column_view h_col,
std::vector<uint8_t> nullability,
bool output_as_byte_array,
bool always_nullable,
rmm::cuda_stream_view stream)
{
auto get_list_level = [](column_view col) {
while (col.type().id() == type_id::STRUCT) {
Expand Down Expand Up @@ -173,14 +175,14 @@ dremel_data get_dremel_data(column_view h_col,
uint32_t def = 0;
start_at_sub_level.push_back(curr_nesting_level_idx);
while (col.type().id() == type_id::STRUCT) {
def += (nullability[curr_nesting_level_idx]) ? 1 : 0;
def += (always_nullable or nullability[curr_nesting_level_idx]) ? 1 : 0;
col = col.child(0);
++curr_nesting_level_idx;
}
// At the end of all those structs is either a list column or the leaf. List column contributes
// at least one def level. Leaf contributes 1 level only if it is nullable.
def +=
(col.type().id() == type_id::LIST ? 1 : 0) + (nullability[curr_nesting_level_idx] ? 1 : 0);
def += (col.type().id() == type_id::LIST ? 1 : 0) +
(always_nullable or nullability[curr_nesting_level_idx] ? 1 : 0);
def_at_level.push_back(def);
++curr_nesting_level_idx;
};
Expand Down Expand Up @@ -209,7 +211,7 @@ dremel_data get_dremel_data(column_view h_col,
}
}

auto [device_view_owners, d_nesting_levels] =
[[maybe_unused]] auto [device_view_owners, d_nesting_levels] =
contiguous_copy_column_device_views<column_device_view>(nesting_levels, stream);

auto max_def_level = def_at_level.back();
Expand Down Expand Up @@ -297,7 +299,8 @@ dremel_data get_dremel_data(column_view h_col,
def_level_fn{d_nesting_levels + level,
d_nullability.data(),
start_at_sub_level[level],
def_at_level[level]});
def_at_level[level],
always_nullable});

// `nesting_levels.size()` == no of list levels + leaf. Max repetition level = no of list levels
auto input_child_rep_it = thrust::make_constant_iterator(nesting_levels.size() - 1);
Expand All @@ -306,7 +309,8 @@ dremel_data get_dremel_data(column_view h_col,
def_level_fn{d_nesting_levels + level + 1,
d_nullability.data(),
start_at_sub_level[level + 1],
def_at_level[level + 1]});
def_at_level[level + 1],
always_nullable});

// Zip the input and output value iterators so that merge operation is done only once
auto input_parent_zip_it =
Expand Down Expand Up @@ -389,7 +393,8 @@ dremel_data get_dremel_data(column_view h_col,
def_level_fn{d_nesting_levels + level,
d_nullability.data(),
start_at_sub_level[level],
def_at_level[level]});
def_at_level[level],
always_nullable});

// Zip the input and output value iterators so that merge operation is done only once
auto input_parent_zip_it =
Expand Down Expand Up @@ -459,5 +464,22 @@ dremel_data get_dremel_data(column_view h_col,
leaf_data_size,
max_def_level};
}
} // namespace

/**
 * @brief Public entry point for the standard Dremel encoding.
 *
 * Forwards to `get_encoding` with `always_nullable` disabled, so a nesting level only adds to
 * the definition levels when `nullability` flags it as nullable.
 */
dremel_data get_dremel_data(column_view h_col,
                            std::vector<uint8_t> nullability,
                            bool output_as_byte_array,
                            rmm::cuda_stream_view stream)
{
  // Name the flag at the call site instead of passing a bare boolean literal.
  auto constexpr treat_every_level_as_nullable = false;
  return get_encoding(
    h_col, nullability, output_as_byte_array, treat_every_level_as_nullable, stream);
}

/**
 * @brief Public entry point for the Dremel encoding used by lexicographical comparators.
 *
 * Forwards to `get_encoding` with `always_nullable` enabled, so every nesting level adds to
 * the definition levels regardless of what `nullability` says for that level.
 */
dremel_data get_comparator_data(column_view h_col,
                                std::vector<uint8_t> nullability,
                                bool output_as_byte_array,
                                rmm::cuda_stream_view stream)
{
  // Name the flag at the call site instead of passing a bare boolean literal.
  auto constexpr treat_every_level_as_nullable = true;
  return get_encoding(
    h_col, nullability, output_as_byte_array, treat_every_level_as_nullable, stream);
}

} // namespace cudf::detail
2 changes: 1 addition & 1 deletion cpp/src/table/row_operators.cu
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,7 @@ auto list_lex_preprocess(table_view table, rmm::cuda_stream_view stream)
std::vector<detail::dremel_device_view> dremel_device_views;
for (auto const& col : table) {
if (col.type().id() == type_id::LIST) {
dremel_data.push_back(detail::get_dremel_data(col, {}, false, stream));
dremel_data.push_back(detail::get_comparator_data(col, {}, false, stream));
dremel_device_views.push_back(dremel_data.back());
}
}
Expand Down
Loading

0 comments on commit 7b5131b

Please sign in to comment.