
Commit

Merge remote-tracking branch 'origin/branch-0.20' into refactor/oned_frames
vyasr committed Apr 29, 2021
2 parents 1bd0e2b + cea6c20 commit 501d7ef
Showing 47 changed files with 2,812 additions and 531 deletions.
6 changes: 6 additions & 0 deletions .pre-commit-config.yaml
@@ -41,6 +41,12 @@ repos:
entry: mypy --config-file=python/cudf/setup.cfg python/cudf/cudf
language: system
types: [python]
- repo: https://github.com/pycqa/pydocstyle
rev: 6.0.0
hooks:
- id: pydocstyle
args: ["--config=python/.flake8"]


default_language_version:
python: python3
16 changes: 14 additions & 2 deletions ci/checks/style.sh
@@ -1,5 +1,5 @@
#!/bin/bash
# Copyright (c) 2018, NVIDIA CORPORATION.
# Copyright (c) 2018-2021, NVIDIA CORPORATION.
#####################
# cuDF Style Tester #
#####################
@@ -33,6 +33,10 @@ FLAKE_CYTHON_RETVAL=$?
MYPY_CUDF=`mypy --config=python/cudf/setup.cfg python/cudf/cudf`
MYPY_CUDF_RETVAL=$?

# Run pydocstyle and get results/return code
PYDOCSTYLE=`pydocstyle --config=python/.flake8 python`
PYDOCSTYLE_RETVAL=$?

# Run clang-format and check for a consistent code format
CLANG_FORMAT=`python cpp/scripts/run-clang-format.py 2>&1`
CLANG_FORMAT_RETVAL=$?
@@ -78,6 +82,14 @@ else
echo -e "\n\n>>>> PASSED: mypy style check\n\n"
fi

if [ "$PYDOCSTYLE_RETVAL" != "0" ]; then
echo -e "\n\n>>>> FAILED: pydocstyle style check; begin output\n\n"
echo -e "$PYDOCSTYLE"
echo -e "\n\n>>>> FAILED: pydocstyle style check; end output\n\n"
else
echo -e "\n\n>>>> PASSED: pydocstyle style check\n\n"
fi

if [ "$CLANG_FORMAT_RETVAL" != "0" ]; then
echo -e "\n\n>>>> FAILED: clang format check; begin output\n\n"
echo -e "$CLANG_FORMAT"
@@ -91,7 +103,7 @@ HEADER_META=`ci/checks/headers_test.sh`
HEADER_META_RETVAL=$?
echo -e "$HEADER_META"

RETVALS=($ISORT_RETVAL $BLACK_RETVAL $FLAKE_RETVAL $FLAKE_CYTHON_RETVAL $CLANG_FORMAT_RETVAL $HEADER_META_RETVAL $MYPY_CUDF_RETVAL)
RETVALS=($ISORT_RETVAL $BLACK_RETVAL $FLAKE_RETVAL $FLAKE_CYTHON_RETVAL $PYDOCSTYLE_RETVAL $CLANG_FORMAT_RETVAL $HEADER_META_RETVAL $MYPY_CUDF_RETVAL)
IFS=$'\n'
RETVAL=`echo "${RETVALS[*]}" | sort -nr | head -n1`

1 change: 1 addition & 0 deletions conda/recipes/libcudf/meta.yaml
@@ -152,6 +152,7 @@ test:
- test -f $PREFIX/include/cudf/replace.hpp
- test -f $PREFIX/include/cudf/reshape.hpp
- test -f $PREFIX/include/cudf/rolling.hpp
- test -f $PREFIX/include/cudf/rolling/range_window_bounds.hpp
- test -f $PREFIX/include/cudf/round.hpp
- test -f $PREFIX/include/cudf/scalar/scalar_factories.hpp
- test -f $PREFIX/include/cudf/scalar/scalar.hpp
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
@@ -302,6 +302,7 @@ add_library(cudf
src/reshape/tile.cu
src/rolling/grouped_rolling.cu
src/rolling/rolling.cu
src/rolling/range_window_bounds.cpp
src/round/round.cu
src/scalar/scalar.cpp
src/scalar/scalar_factories.cpp
5 changes: 5 additions & 0 deletions cpp/benchmarks/CMakeLists.txt
@@ -124,6 +124,11 @@ ConfigureBench(REDUCTION_BENCH
reduction/scan_benchmark.cpp
reduction/minmax_benchmark.cpp)

###################################################################################################
# - filling benchmark -----------------------------------------------------------------------------
ConfigureBench(FILL_BENCH
filling/repeat_benchmark.cpp)

###################################################################################################
# - groupby benchmark -----------------------------------------------------------------------------
ConfigureBench(GROUPBY_BENCH
9 changes: 5 additions & 4 deletions cpp/benchmarks/common/generate_benchmark_input.cpp
@@ -18,6 +18,7 @@
#include "random_distribution_factory.hpp"

#include <cudf/column/column.hpp>
#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/table/table.hpp>
#include <cudf/utilities/bit.hpp>

@@ -26,7 +27,7 @@

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>
#include <rmm/device_vector.hpp>
#include <rmm/device_uvector.hpp>

#include <future>
#include <memory>
@@ -413,9 +414,9 @@ std::unique_ptr<cudf::column> create_random_column<cudf::string_view>(data_profi
}
}

rmm::device_vector<char> d_chars(out_col.chars);
rmm::device_vector<cudf::size_type> d_offsets(out_col.offsets);
rmm::device_vector<cudf::bitmask_type> d_null_mask(out_col.null_mask);
auto d_chars = cudf::detail::make_device_uvector_sync(out_col.chars);
auto d_offsets = cudf::detail::make_device_uvector_sync(out_col.offsets);
auto d_null_mask = cudf::detail::make_device_uvector_sync(out_col.null_mask);
return cudf::make_strings_column(d_chars, d_offsets, d_null_mask);
}

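Aside, not part of the commit: a minimal sketch of the host-to-device copy pattern the hunk above switches to, assuming the single-argument, container-accepting form of cudf::detail::make_device_uvector_sync that the diff itself uses (default stream and memory resource). The helper name and the offsets vector are illustrative only.

#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/types.hpp>

#include <rmm/device_uvector.hpp>

#include <vector>

// Hypothetical helper mirroring how d_chars/d_offsets/d_null_mask are built
// above: copy a host vector into freshly allocated device memory.
rmm::device_uvector<cudf::size_type> copy_offsets_to_device(
  std::vector<cudf::size_type> const& host_offsets)
{
  // The copy is synchronous, so the returned device_uvector is ready to use.
  return cudf::detail::make_device_uvector_sync(host_offsets);
}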
99 changes: 99 additions & 0 deletions cpp/benchmarks/filling/repeat_benchmark.cpp
@@ -0,0 +1,99 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmark/benchmark.h>

#include <cudf/filling.hpp>
#include <cudf/null_mask.hpp>

#include <cudf_test/base_fixture.hpp>
#include <cudf_test/column_wrapper.hpp>

#include <thrust/iterator/counting_iterator.h>

#include <random>

#include "../fixture/benchmark_fixture.hpp"
#include "../synchronization/synchronization.hpp"

class Repeat : public cudf::benchmark {
};

template <class TypeParam, bool nulls>
void BM_repeat(benchmark::State& state)
{
using column_wrapper = cudf::test::fixed_width_column_wrapper<TypeParam>;
auto const n_rows = static_cast<cudf::size_type>(state.range(0));
auto const n_cols = static_cast<cudf::size_type>(state.range(1));

auto idx_begin = thrust::make_counting_iterator<cudf::size_type>(0);
auto idx_end = thrust::make_counting_iterator<cudf::size_type>(n_rows);

std::vector<column_wrapper> columns;
columns.reserve(n_rows);
std::generate_n(std::back_inserter(columns), n_cols, [&]() {
return nulls ? column_wrapper(
idx_begin,
idx_end,
thrust::make_transform_iterator(idx_begin, [](auto idx) { return true; }))
: column_wrapper(idx_begin, idx_end);
});

// repeat counts
std::default_random_engine generator;
std::uniform_int_distribution<int> distribution(0, 3);

std::vector<cudf::size_type> host_repeat_count(n_rows);
std::generate(
host_repeat_count.begin(), host_repeat_count.end(), [&] { return distribution(generator); });

cudf::test::fixed_width_column_wrapper<cudf::size_type> repeat_count(host_repeat_count.begin(),
host_repeat_count.end());

// Create column views
auto const column_views = std::vector<cudf::column_view>(columns.begin(), columns.end());

// Create table view
auto input = cudf::table_view(column_views);

// warm up
auto output = cudf::repeat(input, repeat_count);

for (auto _ : state) {
cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0
cudf::repeat(input, repeat_count);
}

auto data_bytes =
(input.num_columns() * input.num_rows() + output->num_columns() * output->num_rows()) *
sizeof(TypeParam);
auto null_bytes =
nulls ? input.num_columns() * cudf::bitmask_allocation_size_bytes(input.num_rows()) +
output->num_columns() * cudf::bitmask_allocation_size_bytes(output->num_rows())
: 0;
state.SetBytesProcessed(state.iterations() * (data_bytes + null_bytes));
}

#define REPEAT_BENCHMARK_DEFINE(name, type, nulls) \
BENCHMARK_DEFINE_F(Repeat, name)(::benchmark::State & state) { BM_repeat<type, nulls>(state); } \
BENCHMARK_REGISTER_F(Repeat, name) \
->RangeMultiplier(8) \
->Ranges({{1 << 10, 1 << 26}, {1, 8}}) \
->UseManualTime() \
->Unit(benchmark::kMillisecond);

REPEAT_BENCHMARK_DEFINE(double_nulls, double, true);
REPEAT_BENCHMARK_DEFINE(double_no_nulls, double, false);
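For orientation, not part of the commit: a hedged sketch of the cudf::repeat call this benchmark times, using the same two-argument form and the same test column wrapper as above. The function name and literal values are placeholders; the expected output follows repeat's per-row count semantics.

#include <cudf/filling.hpp>
#include <cudf/table/table.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/types.hpp>

#include <cudf_test/column_wrapper.hpp>

#include <cstdint>
#include <memory>

// Hypothetical example: repeat each row of a single-column table according
// to a per-row count column, exactly as the benchmark loop does.
std::unique_ptr<cudf::table> repeat_example()
{
  cudf::test::fixed_width_column_wrapper<int32_t> values{10, 20, 30};
  cudf::test::fixed_width_column_wrapper<cudf::size_type> counts{1, 0, 2};

  cudf::table_view input({values});

  // Row 0 appears once, row 1 is dropped, row 2 appears twice, so the
  // resulting column should hold {10, 30, 30}.
  return cudf::repeat(input, counts);
}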
18 changes: 9 additions & 9 deletions cpp/include/cudf/detail/hashing.hpp
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2020, NVIDIA CORPORATION.
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -29,17 +29,17 @@ namespace detail {
*/
std::unique_ptr<column> hash(
table_view const& input,
hash_id hash_function = hash_id::HASH_MURMUR3,
std::vector<uint32_t> const& initial_hash = {},
uint32_t seed = 0,
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
hash_id hash_function = hash_id::HASH_MURMUR3,
cudf::host_span<uint32_t const> initial_hash = {},
uint32_t seed = 0,
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

std::unique_ptr<column> murmur_hash3_32(
table_view const& input,
std::vector<uint32_t> const& initial_hash = {},
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
cudf::host_span<uint32_t const> initial_hash = {},
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

std::unique_ptr<column> md5_hash(
table_view const& input,
15 changes: 8 additions & 7 deletions cpp/include/cudf/hashing.hpp
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2020, NVIDIA CORPORATION.
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -17,6 +17,7 @@

#include <cudf/table/table.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/utilities/span.hpp>

namespace cudf {
/**
@@ -29,18 +30,18 @@ namespace cudf {
* @brief Computes the hash value of each row in the input set of columns.
*
* @param input The table of columns to hash
* @param initial_hash Optional vector of initial hash values for each column.
* If this vector is empty then each element will be hashed as-is.
* @param initial_hash Optional host_span of initial hash values for each column.
* If this span is empty then each element will be hashed as-is.
* @param mr Device memory resource used to allocate the returned column's device memory.
*
* @returns A column where each row is the hash of a column from the input
*/
std::unique_ptr<column> hash(
table_view const& input,
hash_id hash_function = hash_id::HASH_MURMUR3,
std::vector<uint32_t> const& initial_hash = {},
uint32_t seed = DEFAULT_HASH_SEED,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
hash_id hash_function = hash_id::HASH_MURMUR3,
cudf::host_span<uint32_t const> initial_hash = {},
uint32_t seed = DEFAULT_HASH_SEED,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */ // end of group
} // namespace cudf
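A brief, hedged usage sketch of the public API after this change, not part of the commit: initial_hash is now a cudf::host_span<uint32_t const>, to which a std::vector of per-column seed values still converts implicitly at the call site. The function name and values below are placeholders.

#include <cudf/column/column.hpp>
#include <cudf/hashing.hpp>
#include <cudf/table/table_view.hpp>

#include <cudf_test/column_wrapper.hpp>

#include <cstdint>
#include <memory>
#include <vector>

// Hypothetical example: hash a one-column table with MurmurHash3, supplying
// one initial hash value per input column via the new host_span parameter.
std::unique_ptr<cudf::column> hash_example()
{
  cudf::test::fixed_width_column_wrapper<int32_t> col{1, 2, 3};
  cudf::table_view input({col});

  std::vector<uint32_t> initial_hash{42};  // one entry per input column

  // The std::vector binds to cudf::host_span<uint32_t const>; the seed and
  // memory resource keep their defaults.
  return cudf::hash(input, cudf::hash_id::HASH_MURMUR3, initial_hash);
}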
8 changes: 4 additions & 4 deletions cpp/include/cudf/lists/detail/scatter.cuh
@@ -46,8 +46,8 @@ namespace {
* also holding a reference to the list column.
*
* Analogous to the list_view, this class is default constructable,
* and can thus be stored in rmm::device_vector. It is used to represent
* the results of a `scatter()` operation; a device_vector may hold
* and can thus be stored in rmm::device_uvector. It is used to represent
* the results of a `scatter()` operation; a device_uvector may hold
* several instances of unbound_list_view, each with a flag indicating
* whether it came from the scatter source or target. Each instance
* may later be "bound" to the appropriate source/target column, to
@@ -131,7 +131,7 @@ struct unbound_list_view {
}

private:
// Note: Cannot store reference to list column, because of storage in device_vector.
// Note: Cannot store reference to list column, because of storage in device_uvector.
// Only keep track of whether this list row came from the source or target of scatter.

label_type _label{
@@ -247,7 +247,7 @@ void print(std::string const& msg,
* The protocol is as follows:
*
* Inputs:
* 1. list_vector: A device_vector of unbound_list_view, with each element
* 1. list_vector: A device_uvector of unbound_list_view, with each element
* indicating the position, size, and which column the list
* row came from.
* 2. list_offsets: The offsets column for the (outer) lists column, each offset
(Diff truncated: the remaining changed files are not shown.)
