Skip to content

Commit

Permalink
Merge branch 'branch-22.10' into fea-nvcomp-zstd-comp
Browse files Browse the repository at this point in the history
  • Loading branch information
vuule authored Sep 12, 2022
2 parents f766f00 + dca285b commit 1f60695
Show file tree
Hide file tree
Showing 72 changed files with 1,756 additions and 389 deletions.
4 changes: 3 additions & 1 deletion .gitattributes
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
python/cudf/cudf/_version.py export-subst
CHANGELOG.md merge=union
python/cudf_kafka/cudf_kafka/_version.py export-subst
python/custreamz/custreamz/_version.py export-subst
python/dask_cudf/dask_cudf/_version.py export-subst
8 changes: 4 additions & 4 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -24,16 +24,16 @@ cudf.egg-info/
python/build
python/*/build
python/cudf/cudf-coverage.xml
python/cudf/*/_lib/**/*\.cpp
python/cudf/*/_lib/**/*.cpp
python/cudf/*/_lib/**/*.h
python/cudf/*/_lib/.nfs*
python/cudf/*/_cuda/*\.cpp
python/cudf/*/_cuda/*.cpp
python/cudf/*.ipynb
python/cudf/.ipynb_checkpoints
python/*/record.txt
python/cudf_kafka/*/_lib/**/*\.cpp
python/cudf_kafka/*/_lib/**/*.cpp
python/cudf_kafka/*/_lib/**/*.h
python/custreamz/*/_lib/**/*\.cpp
python/custreamz/*/_lib/**/*.cpp
python/custreamz/*/_lib/**/*.h
.Python
env/
Expand Down
21 changes: 19 additions & 2 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,15 @@ repos:
files: python/.*\.(py|pyx|pxd)$
types: [file]
- repo: https://github.com/pre-commit/mirrors-mypy
rev: 'v0.782'
rev: 'v0.971'
hooks:
- id: mypy
args: ["--config-file=setup.cfg", "python/cudf/cudf", "python/dask_cudf/dask_cudf", "python/custreamz/custreamz", "python/cudf_kafka/cudf_kafka"]
additional_dependencies: [types-cachetools]
args: ["--config-file=setup.cfg",
"python/cudf/cudf",
"python/custreamz/custreamz",
"python/cudf_kafka/cudf_kafka",
"python/dask_cudf/dask_cudf"]
pass_filenames: false
- repo: https://github.com/PyCQA/pydocstyle
rev: 6.1.1
Expand Down Expand Up @@ -88,6 +93,18 @@ repos:
language: system
pass_filenames: false
verbose: true
- id: headers-recipe-check
name: headers-recipe-check
entry: ./ci/checks/headers_test.sh
files: |
(?x)^(
^cpp/include/|
^conda/.*/meta.yaml
)
types_or: [file]
language: system
pass_filenames: false
verbose: false

default_language_version:
python: python3
5 changes: 1 addition & 4 deletions ci/checks/headers_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,9 @@ for DIRNAME in ${DIRNAMES[@]}; do
LIB_RETVAL=$?

if [ "$LIB_RETVAL" != "0" ]; then
echo -e "\n\n>>>> FAILED: lib${LIBNAME} header existence conda/recipes/lib${LIBNAME}/meta.yaml check; begin output\n\n"
echo -e ">>>> FAILED: lib${LIBNAME} has different headers in include/${DIRNAME}/ and conda/recipes/lib${LIBNAME}/meta.yaml. The diff is shown below:"
echo -e "$HEADER_DIFF"
echo -e "\n\n>>>> FAILED: lib${LIBNAME} header existence conda/recipes/lib${LIBNAME}/meta.yaml check; end output\n\n"
RETVAL=1
else
echo -e "\n\n>>>> PASSED: lib${LIBNAME} header existence conda/recipes/lib${LIBNAME}/meta.yaml check\n\n"
fi
done

Expand Down
7 changes: 1 addition & 6 deletions ci/checks/style.sh
Original file line number Diff line number Diff line change
Expand Up @@ -49,13 +49,8 @@ else
echo -e "\n\n>>>> PASSED: clang format check\n\n"
fi

# Run header meta.yml check and get results/return code
HEADER_META=`ci/checks/headers_test.sh`
HEADER_META_RETVAL=$?
echo -e "$HEADER_META"

RETVALS=(
$CR_RETVAL $PRE_COMMIT_RETVAL $CLANG_FORMAT_RETVAL $HEADER_META_RETVAL
$CR_RETVAL $PRE_COMMIT_RETVAL $CLANG_FORMAT_RETVAL
)
IFS=$'\n'
RETVAL=`echo "${RETVALS[*]}" | sort -nr | head -n1`
Expand Down
15 changes: 15 additions & 0 deletions ci/gpu/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,13 @@ if [[ -z "$PROJECT_FLASH" || "$PROJECT_FLASH" == "0" ]]; then
echo "Running GoogleTest $test_name"
${gt} --gtest_output=xml:"$WORKSPACE/test-results/"
done

# Test libcudf (csv, orc, and parquet) with `LIBCUDF_CUFILE_POLICY=KVIKIO`
for test_name in "CSV_TEST" "ORC_TEST" "PARQUET_TEST"; do
gt="$WORKSPACE/cpp/build/gtests/$test_name"
echo "Running GoogleTest $test_name (LIBCUDF_CUFILE_POLICY=KVIKIO)"
LIBCUDF_CUFILE_POLICY=KVIKIO ${gt} --gtest_output=xml:"$WORKSPACE/test-results/"
done
fi
else
#Project Flash
Expand Down Expand Up @@ -182,10 +189,18 @@ else
gpuci_logger "GoogleTests"
# Run libcudf and libcudf_kafka gtests from libcudf-tests package
for gt in "$CONDA_PREFIX/bin/gtests/libcudf"*/* ; do
test_name=$(basename ${gt})
echo "Running GoogleTest $test_name"
${gt} --gtest_output=xml:"$WORKSPACE/test-results/"
done

# Test libcudf (csv, orc, and parquet) with `LIBCUDF_CUFILE_POLICY=KVIKIO`
for test_name in "CSV_TEST" "ORC_TEST" "PARQUET_TEST"; do
gt="$CONDA_PREFIX/bin/gtests/libcudf/$test_name"
echo "Running GoogleTest $test_name (LIBCUDF_CUFILE_POLICY=KVIKIO)"
LIBCUDF_CUFILE_POLICY=KVIKIO ${gt} --gtest_output=xml:"$WORKSPACE/test-results/"
done

export LIB_BUILD_DIR="$WORKSPACE/ci/artifacts/cudf/cpu/libcudf_work/cpp/build"
# Copy libcudf build time results
echo "Checking for build time log $LIB_BUILD_DIR/ninja_log.xml"
Expand Down
5 changes: 3 additions & 2 deletions conda/environments/cudf_dev_cuda11.5.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,14 +36,15 @@ dependencies:
- nbsphinx
- numpydoc
- ipython
- pandoc=<2.0.0
- pandoc<=2.0.0
- cudatoolkit=11.5
- cuda-python >=11.5,<11.7.1
- pip
- flake8=3.8.3
- black=22.3.0
- isort=5.10.1
- mypy=0.782
- mypy=0.971
- types-cachetools
- doxygen=1.8.20
- pydocstyle=6.1.1
- typing_extensions
Expand Down
5 changes: 5 additions & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -749,6 +749,11 @@ if(CUDF_BUILD_BENCHMARKS)
add_subdirectory(benchmarks)
endif()

# build pretty-printer load script
if(Thrust_SOURCE_DIR AND rmm_SOURCE_DIR)
configure_file(scripts/load-pretty-printers.in load-pretty-printers @ONLY)
endif()

# ##################################################################################################
# * install targets -------------------------------------------------------------------------------
rapids_cmake_install_lib_dir(lib_dir)
Expand Down
2 changes: 1 addition & 1 deletion cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ ConfigureNVBench(SEARCH_NVBENCH search/contains.cpp)
# ##################################################################################################
# * sort benchmark --------------------------------------------------------------------------------
ConfigureBench(SORT_BENCH sort/rank.cpp sort/sort.cpp sort/sort_strings.cpp)
ConfigureNVBench(SORT_NVBENCH sort/sort_structs.cpp)
ConfigureNVBench(SORT_NVBENCH sort/sort_lists.cpp sort/sort_structs.cpp)

# ##################################################################################################
# * quantiles benchmark
Expand Down
49 changes: 49 additions & 0 deletions cpp/benchmarks/sort/sort_lists.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/fixture/rmm_pool_raii.hpp>

#include <cudf/detail/sorting.hpp>

#include <nvbench/nvbench.cuh>

/**
 * @brief nvbench body that times `cudf::detail::sorted_order` on a random single-column
 * LIST table.
 *
 * Reads the `size_bytes`, `depth` and `null_frequency` axis values from @p state, builds the
 * input table once from them, and then runs the sort inside the synchronous nvbench
 * measurement loop on the stream supplied by nvbench.
 *
 * @param state nvbench state providing the axis values and driving the measurement
 */
void nvbench_sort_lists(nvbench::state& state)
{
  // Kept alive for the whole benchmark body.
  cudf::rmm_pool_raii pool_raii;

  // Axis values for this run.
  auto const target_bytes  = static_cast<size_t>(state.get_int64("size_bytes"));
  auto const nesting_depth = static_cast<cudf::size_type>(state.get_int64("depth"));
  auto const null_prob     = state.get_float64("null_frequency");

  // Profile for the generated LIST column: uniform distribution with parameters 0 and 5
  // (presumably the list-length bounds -- confirm against data_profile), the requested
  // nesting depth and the requested null probability.
  data_profile list_profile;
  list_profile.set_distribution_params(cudf::type_id::LIST, distribution_id::UNIFORM, 0, 5);
  list_profile.set_list_depth(nesting_depth);
  list_profile.set_null_probability(null_prob);

  // The input is generated once, outside the timed region.
  auto const input =
    create_random_table({cudf::type_id::LIST}, table_size_bytes{target_bytes}, list_profile);

  auto const run_sort = [&](nvbench::launch& launch) {
    rmm::cuda_stream_view stream_view{launch.get_stream()};
    // Empty column-order and null-precedence vectors are passed; the result is discarded.
    cudf::detail::sorted_order(*input, {}, {}, stream_view, rmm::mr::get_current_device_resource());
  };
  state.exec(nvbench::exec_tag::sync, run_sort);
}

// Registers nvbench_sort_lists under the benchmark name "sort_list" and sweeps three axes:
//  - size_bytes: values 10, 18, 24 and 28 given to the power-of-two int64 axis
//  - depth: 1 and 4 (forwarded to the data profile's list depth)
//  - null_frequency: 0 and 0.2 (forwarded to the data profile's null probability)
NVBENCH_BENCH(nvbench_sort_lists)
.set_name("sort_list")
.add_int64_power_of_two_axis("size_bytes", {10, 18, 24, 28})
.add_int64_axis("depth", {1, 4})
.add_float64_axis("null_frequency", {0, 0.2});
74 changes: 74 additions & 0 deletions cpp/include/cudf/io/parquet.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,10 @@ class parquet_reader_options {

// List of individual row groups to read (ignored if empty)
std::vector<std::vector<size_type>> _row_groups;
// Number of rows to skip from the start
size_type _skip_rows = 0;
// Number of rows to read; -1 is all
size_type _num_rows = -1;

// Whether to store string data as categorical type
bool _convert_strings_to_categories = false;
Expand Down Expand Up @@ -127,6 +131,20 @@ class parquet_reader_options {
return _reader_column_schema;
}

/**
* @brief Returns number of rows to skip from the start.
*
* @return Number of rows to skip from the start
*/
[[nodiscard]] size_type get_skip_rows() const
{
  // Value last stored by set_skip_rows(); the member is initialized to 0.
  return _skip_rows;
}

/**
* @brief Returns number of rows to read.
*
* @return Number of rows to read
*/
[[nodiscard]] size_type get_num_rows() const
{
  // Value last stored by set_num_rows(); the member is initialized to -1, meaning "all rows".
  return _num_rows;
}

/**
* @brief Returns names of columns to be read, if set.
*
Expand Down Expand Up @@ -162,6 +180,10 @@ class parquet_reader_options {
*/
void set_row_groups(std::vector<std::vector<size_type>> row_groups)
{
  // Explicit row-group selection cannot be combined with a row window. An empty selection is
  // always accepted, and so is any selection while skip_rows / num_rows still hold their
  // defaults (0 and -1).
  bool const row_window_requested = (_skip_rows != 0) or (_num_rows != -1);
  if (row_window_requested and not row_groups.empty()) {
    CUDF_FAIL("row_groups can't be set along with skip_rows and num_rows");
  }

  // The argument is taken by value, so it can be moved into the member.
  _row_groups = std::move(row_groups);
}

Expand Down Expand Up @@ -190,6 +212,34 @@ class parquet_reader_options {
_reader_column_schema = std::move(val);
}

/**
* @brief Sets number of rows to skip.
*
* @param val Number of rows to skip from start
*/
void set_skip_rows(size_type val)
{
  // A non-zero skip cannot be combined with an explicit row-group selection; zero is always
  // accepted.
  bool const has_row_groups = not _row_groups.empty();
  if (has_row_groups and (val != 0)) {
    CUDF_FAIL("skip_rows can't be set along with a non-empty row_groups");
  }

  _skip_rows = val;
}

/**
* @brief Sets number of rows to read.
*
* @param val Number of rows to read after skip
*/
void set_num_rows(size_type val)
{
  // Any value other than -1 ("all rows") cannot be combined with an explicit row-group
  // selection; -1 is always accepted.
  bool const has_row_groups = not _row_groups.empty();
  if (has_row_groups and (val != -1)) {
    CUDF_FAIL("num_rows can't be set along with a non-empty row_groups");
  }

  _num_rows = val;
}

/**
* @brief Sets timestamp_type used to cast timestamp columns.
*
Expand Down Expand Up @@ -279,6 +329,30 @@ class parquet_reader_options_builder {
return *this;
}

/**
* @brief Sets number of rows to skip.
*
* @param val Number of rows to skip from start
* @return this for chaining
*/
parquet_reader_options_builder& skip_rows(size_type val)
{
// Forward to the options setter so its row_groups conflict check also applies here.
options.set_skip_rows(val);
// Return the builder itself so calls can be chained.
return *this;
}

/**
* @brief Sets number of rows to read.
*
* @param val Number of rows to read after skip
* @return this for chaining
*/
parquet_reader_options_builder& num_rows(size_type val)
{
// Forward to the options setter so its row_groups conflict check also applies here.
options.set_num_rows(val);
// Return the builder itself so calls can be chained.
return *this;
}

/**
* @brief timestamp_type used to cast timestamp columns.
*
Expand Down
14 changes: 5 additions & 9 deletions cpp/include/cudf/lists/detail/dremel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,15 +28,11 @@ namespace cudf::detail {
* @see the `dremel_data` struct for more info.
*/
struct dremel_device_view {
// TODO: These elements are default initializable to support default
// initialization of the object. This is currently exploited to create views
// that will never actually be used. We should consider whether this
// represents a serious issue that should be worked around more robustly.
size_type const* offsets{};
uint8_t const* rep_levels{};
uint8_t const* def_levels{};
size_type const leaf_data_size{};
uint8_t const max_def_level{};
size_type const* offsets;
uint8_t const* rep_levels;
uint8_t const* def_levels;
size_type const leaf_data_size;
uint8_t const max_def_level;
};

/**
Expand Down
Loading

0 comments on commit 1f60695

Please sign in to comment.