Skip to content

Commit

Permalink
Merge branch 'branch-22.06' into distinct_tests
Browse files Browse the repository at this point in the history
# Conflicts:
#	cpp/tests/stream_compaction/distinct_tests.cpp
  • Loading branch information
ttnghia committed May 16, 2022
2 parents ff6d063 + d0d7193 commit e007814
Show file tree
Hide file tree
Showing 16 changed files with 304 additions and 80 deletions.
4 changes: 2 additions & 2 deletions ci/checks/copyright.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
re.compile(r"[.]flake8[.]cython$"),
re.compile(r"meta[.]yaml$")
]
ExemptFiles = []
ExemptFiles = ["cpp/include/cudf_test/cxxopts.hpp"]

# this will break starting at year 10000, which is probably OK :)
CheckSimple = re.compile(
Expand Down Expand Up @@ -230,4 +230,4 @@ def checkCopyright_main():

if __name__ == "__main__":
import sys
sys.exit(checkCopyright_main())
sys.exit(checkCopyright_main())
3 changes: 3 additions & 0 deletions conda/recipes/libcudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ outputs:
- test -f $PREFIX/include/cudf/detail/hashing.hpp
- test -f $PREFIX/include/cudf/detail/interop.hpp
- test -f $PREFIX/include/cudf/detail/is_element_valid.hpp
- test -f $PREFIX/include/cudf/detail/join.hpp
- test -f $PREFIX/include/cudf/detail/null_mask.hpp
- test -f $PREFIX/include/cudf/detail/nvtx/nvtx3.hpp
- test -f $PREFIX/include/cudf/detail/nvtx/ranges.hpp
Expand Down Expand Up @@ -168,6 +169,7 @@ outputs:
- test -f $PREFIX/include/cudf/lists/detail/interleave_columns.hpp
- test -f $PREFIX/include/cudf/lists/detail/sorting.hpp
- test -f $PREFIX/include/cudf/lists/detail/scatter_helper.cuh
- test -f $PREFIX/include/cudf/lists/detail/stream_compaction.hpp
- test -f $PREFIX/include/cudf/lists/combine.hpp
- test -f $PREFIX/include/cudf/lists/count_elements.hpp
- test -f $PREFIX/include/cudf/lists/explode.hpp
Expand All @@ -178,6 +180,7 @@ outputs:
- test -f $PREFIX/include/cudf/lists/gather.hpp
- test -f $PREFIX/include/cudf/lists/lists_column_view.hpp
- test -f $PREFIX/include/cudf/lists/sorting.hpp
- test -f $PREFIX/include/cudf/lists/stream_compaction.hpp
- test -f $PREFIX/include/cudf/merge.hpp
- test -f $PREFIX/include/cudf/null_mask.hpp
- test -f $PREFIX/include/cudf/partitioning.hpp
Expand Down
7 changes: 5 additions & 2 deletions cpp/doxygen/Doxyfile
Original file line number Diff line number Diff line change
Expand Up @@ -892,7 +892,9 @@ EXCLUDE_PATTERNS = */nvtx/* */detail/*
# Note that the wildcards are matched against the file with absolute path, so to
# exclude all test directories use the pattern */test/*

EXCLUDE_SYMBOLS = org::apache
EXCLUDE_SYMBOLS = org::apache \
*_impl \
*Impl

# The EXAMPLE_PATH tag can be used to specify one or more files or directories
# that contain example code fragments that are included (see the \include
Expand Down Expand Up @@ -2130,7 +2132,8 @@ INCLUDE_FILE_PATTERNS =
# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.

PREDEFINED = __device__= \
__host__=
__host__= \
DOXYGEN_SHOULD_SKIP_THIS

# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this
# tag can be used to specify a list of macro names that should be expanded. The
Expand Down
2 changes: 1 addition & 1 deletion cpp/include/cudf/column/column_device_view.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -1198,7 +1198,7 @@ struct optional_accessor {
/**
* @brief Constructor
*
* @param col Column on which to iterator over its elements.
* @param _col Column on which to iterate over its elements.
* @param with_nulls Indicates if the `_col` should be checked for nulls.
*/
optional_accessor(column_device_view const& _col, Nullate with_nulls)
Expand Down
2 changes: 1 addition & 1 deletion cpp/include/cudf/table/experimental/row_operators.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -420,7 +420,7 @@ class self_comparator {
* @brief Construct an owning object for performing a lexicographic comparison between two rows of
* the same table.
*
* @param table The table to compare
* @param t The table to compare
* @param column_order Optional, host array the same length as a row that indicates the desired
* ascending/descending order of each column in a row. If empty, it is assumed all columns are
* sorted in ascending order.
Expand Down
83 changes: 49 additions & 34 deletions cpp/include/cudf/utilities/type_dispatcher.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -156,41 +156,38 @@ constexpr bool is_fixed_point(cudf::type_id id)
template <> \
struct id_to_type_impl<Id> { \
using type = Type; \
}
};
#endif

/**
* @brief Defines all of the mappings between C++ types and their corresponding
* `cudf::type_id` values.
*/
CUDF_TYPE_MAPPING(bool, type_id::BOOL8);
CUDF_TYPE_MAPPING(int8_t, type_id::INT8);
CUDF_TYPE_MAPPING(int16_t, type_id::INT16);
CUDF_TYPE_MAPPING(int32_t, type_id::INT32);
CUDF_TYPE_MAPPING(int64_t, type_id::INT64);
CUDF_TYPE_MAPPING(uint8_t, type_id::UINT8);
CUDF_TYPE_MAPPING(uint16_t, type_id::UINT16);
CUDF_TYPE_MAPPING(uint32_t, type_id::UINT32);
CUDF_TYPE_MAPPING(uint64_t, type_id::UINT64);
CUDF_TYPE_MAPPING(float, type_id::FLOAT32);
CUDF_TYPE_MAPPING(double, type_id::FLOAT64);
CUDF_TYPE_MAPPING(cudf::string_view, type_id::STRING);
CUDF_TYPE_MAPPING(cudf::timestamp_D, type_id::TIMESTAMP_DAYS);
CUDF_TYPE_MAPPING(cudf::timestamp_s, type_id::TIMESTAMP_SECONDS);
CUDF_TYPE_MAPPING(cudf::timestamp_ms, type_id::TIMESTAMP_MILLISECONDS);
CUDF_TYPE_MAPPING(cudf::timestamp_us, type_id::TIMESTAMP_MICROSECONDS);
CUDF_TYPE_MAPPING(cudf::timestamp_ns, type_id::TIMESTAMP_NANOSECONDS);
CUDF_TYPE_MAPPING(cudf::duration_D, type_id::DURATION_DAYS);
CUDF_TYPE_MAPPING(cudf::duration_s, type_id::DURATION_SECONDS);
CUDF_TYPE_MAPPING(cudf::duration_ms, type_id::DURATION_MILLISECONDS);
CUDF_TYPE_MAPPING(cudf::duration_us, type_id::DURATION_MICROSECONDS);
CUDF_TYPE_MAPPING(cudf::duration_ns, type_id::DURATION_NANOSECONDS);
CUDF_TYPE_MAPPING(dictionary32, type_id::DICTIONARY32);
CUDF_TYPE_MAPPING(cudf::list_view, type_id::LIST);
CUDF_TYPE_MAPPING(numeric::decimal32, type_id::DECIMAL32);
CUDF_TYPE_MAPPING(numeric::decimal64, type_id::DECIMAL64);
CUDF_TYPE_MAPPING(numeric::decimal128, type_id::DECIMAL128);
CUDF_TYPE_MAPPING(cudf::struct_view, type_id::STRUCT);
// Defines all of the mappings between C++ types and their corresponding `cudf::type_id` values.
CUDF_TYPE_MAPPING(bool, type_id::BOOL8)
CUDF_TYPE_MAPPING(int8_t, type_id::INT8)
CUDF_TYPE_MAPPING(int16_t, type_id::INT16)
CUDF_TYPE_MAPPING(int32_t, type_id::INT32)
CUDF_TYPE_MAPPING(int64_t, type_id::INT64)
CUDF_TYPE_MAPPING(uint8_t, type_id::UINT8)
CUDF_TYPE_MAPPING(uint16_t, type_id::UINT16)
CUDF_TYPE_MAPPING(uint32_t, type_id::UINT32)
CUDF_TYPE_MAPPING(uint64_t, type_id::UINT64)
CUDF_TYPE_MAPPING(float, type_id::FLOAT32)
CUDF_TYPE_MAPPING(double, type_id::FLOAT64)
CUDF_TYPE_MAPPING(cudf::string_view, type_id::STRING)
CUDF_TYPE_MAPPING(cudf::timestamp_D, type_id::TIMESTAMP_DAYS)
CUDF_TYPE_MAPPING(cudf::timestamp_s, type_id::TIMESTAMP_SECONDS)
CUDF_TYPE_MAPPING(cudf::timestamp_ms, type_id::TIMESTAMP_MILLISECONDS)
CUDF_TYPE_MAPPING(cudf::timestamp_us, type_id::TIMESTAMP_MICROSECONDS)
CUDF_TYPE_MAPPING(cudf::timestamp_ns, type_id::TIMESTAMP_NANOSECONDS)
CUDF_TYPE_MAPPING(cudf::duration_D, type_id::DURATION_DAYS)
CUDF_TYPE_MAPPING(cudf::duration_s, type_id::DURATION_SECONDS)
CUDF_TYPE_MAPPING(cudf::duration_ms, type_id::DURATION_MILLISECONDS)
CUDF_TYPE_MAPPING(cudf::duration_us, type_id::DURATION_MICROSECONDS)
CUDF_TYPE_MAPPING(cudf::duration_ns, type_id::DURATION_NANOSECONDS)
CUDF_TYPE_MAPPING(dictionary32, type_id::DICTIONARY32)
CUDF_TYPE_MAPPING(cudf::list_view, type_id::LIST)
CUDF_TYPE_MAPPING(numeric::decimal32, type_id::DECIMAL32)
CUDF_TYPE_MAPPING(numeric::decimal64, type_id::DECIMAL64)
CUDF_TYPE_MAPPING(numeric::decimal128, type_id::DECIMAL128)
CUDF_TYPE_MAPPING(cudf::struct_view, type_id::STRUCT)

/**
* @brief Use this specialization on `type_dispatcher` whenever you only need to operate on the
Expand All @@ -210,6 +207,12 @@ struct type_to_scalar_type_impl {
using ScalarType = cudf::scalar;
};

/**
* @brief Macro used to define scalar type and scalar device type for
* `cudf::numeric_scalar` template class for numeric C++ types.
*
* @param Type The numeric C++ type
*/
#ifndef MAP_NUMERIC_SCALAR
#define MAP_NUMERIC_SCALAR(Type) \
template <> \
Expand All @@ -230,7 +233,7 @@ MAP_NUMERIC_SCALAR(uint32_t)
MAP_NUMERIC_SCALAR(uint64_t)
MAP_NUMERIC_SCALAR(float)
MAP_NUMERIC_SCALAR(double)
MAP_NUMERIC_SCALAR(bool);
MAP_NUMERIC_SCALAR(bool)

template <>
struct type_to_scalar_type_impl<std::string> {
Expand Down Expand Up @@ -281,6 +284,12 @@ struct type_to_scalar_type_impl<cudf::struct_view> {
// using ScalarDeviceType = cudf::struct_scalar_device_view; // CALEB: TODO!
};

/**
* @brief Macro used to define scalar type and scalar device type for
* `cudf::timestamp_scalar` template class for timestamp C++ types.
*
* @param Type The timestamp C++ type
*/
#ifndef MAP_TIMESTAMP_SCALAR
#define MAP_TIMESTAMP_SCALAR(Type) \
template <> \
Expand All @@ -296,6 +305,12 @@ MAP_TIMESTAMP_SCALAR(timestamp_ms)
MAP_TIMESTAMP_SCALAR(timestamp_us)
MAP_TIMESTAMP_SCALAR(timestamp_ns)

/**
* @brief Macro used to define scalar type and scalar device type for
* `cudf::duration_scalar` template class for duration C++ types.
*
* @param Type The duration C++ type
*/
#ifndef MAP_DURATION_SCALAR
#define MAP_DURATION_SCALAR(Type) \
template <> \
Expand Down
3 changes: 3 additions & 0 deletions cpp/include/cudf_test/cxxopts.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ THE SOFTWARE.
#ifndef CXXOPTS_HPP_INCLUDED
#define CXXOPTS_HPP_INCLUDED

#ifndef DOXYGEN_SHOULD_SKIP_THIS

#include <cctype>
#include <cstring>
#include <exception>
Expand Down Expand Up @@ -1498,4 +1500,5 @@ inline const HelpGroupDetails& Options::group_help(const std::string& group) con

} // namespace cxxopts

#endif // DOXYGEN_SHOULD_SKIP_THIS
#endif // CXXOPTS_HPP_INCLUDED
2 changes: 1 addition & 1 deletion cpp/include/nvtext/bpe_tokenize.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ std::unique_ptr<bpe_merge_pairs> load_merge_pairs_file(
* @throw cudf::logic_error if `separator` is invalid
*
* @param input Strings to encode.
* @param merge_pairs Created by a call to @ref nvtext::load_merge_pairs_file.
* @param merges_pairs Created by a call to @ref nvtext::load_merge_pairs_file.
* @param separator String used to build the output after encoding.
* Default is a space.
* @param mr Memory resource to allocate any returned objects.
Expand Down
25 changes: 19 additions & 6 deletions cpp/src/interop/dlpack.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -148,20 +148,33 @@ std::unique_ptr<table> from_dlpack(DLManagedTensor const* managed_tensor,
CUDF_EXPECTS(tensor.device.device_id == device_id, "DLTensor device ID must be current device");
}

// Currently only 1D and 2D tensors are supported
CUDF_EXPECTS(tensor.ndim > 0 && tensor.ndim <= 2, "DLTensor must be 1D or 2D");

// We only support 1D and 2D tensors with some restrictions on layout
if (tensor.ndim == 1) {
// 1D tensors must have dense layout (strides == nullptr <=> dense row-major)
CUDF_EXPECTS(nullptr == tensor.strides || tensor.strides[0] == 1,
"from_dlpack of 1D DLTensor only for unit-stride data");
} else if (tensor.ndim == 2) {
// 2D tensors must have column-major layout and the fastest dimension must have dense layout
CUDF_EXPECTS((
// 1D tensor reshaped into (N, 1) is fine
tensor.shape[1] == 1 && (nullptr == tensor.strides || tensor.strides[0] == 1))
// General case
|| (nullptr != tensor.strides && tensor.strides[0] == 1 &&
tensor.strides[1] >= tensor.shape[0]),
"from_dlpack of 2D DLTensor only for column-major unit-stride data");
} else {
CUDF_FAIL("DLTensor must be 1D or 2D");
}
CUDF_EXPECTS(tensor.shape[0] >= 0,
"DLTensor first dim should be of shape greater than or equal-to 0.");
"DLTensor first dim should be of shape greater than or equal to 0.");
CUDF_EXPECTS(tensor.shape[0] < std::numeric_limits<size_type>::max(),
"DLTensor first dim exceeds size supported by cudf");
if (tensor.ndim > 1) {
CUDF_EXPECTS(tensor.shape[1] >= 0,
"DLTensor second dim should be of shape greater than or equal-to 0.");
"DLTensor second dim should be of shape greater than or equal to 0.");
CUDF_EXPECTS(tensor.shape[1] < std::numeric_limits<size_type>::max(),
"DLTensor second dim exceeds size supported by cudf");
}

size_t const num_columns = (tensor.ndim == 2) ? static_cast<size_t>(tensor.shape[1]) : 1;

// Validate and convert data type to cudf
Expand Down
66 changes: 46 additions & 20 deletions cpp/src/io/csv/reader_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
#include <rmm/cuda_stream_view.hpp>

#include <thrust/host_vector.h>
#include <thrust/iterator/counting_iterator.h>

#include <algorithm>
#include <iostream>
Expand Down Expand Up @@ -696,37 +697,62 @@ table_with_metadata read_csv(cudf::io::datasource* source,

column_flags.resize(num_actual_columns, column_parse::enabled | column_parse::inferred);

std::vector<size_t> col_loop_order(column_names.size());
auto unnamed_it = std::copy_if(
thrust::make_counting_iterator<size_t>(0),
thrust::make_counting_iterator<size_t>(column_names.size()),
col_loop_order.begin(),
[&column_names](auto col_idx) -> bool { return not column_names[col_idx].empty(); });
// Rename empty column names to "Unnamed: col_index"
for (size_t col_idx = 0; col_idx < column_names.size(); ++col_idx) {
if (column_names[col_idx].empty()) {
column_names[col_idx] = string("Unnamed: ") + std::to_string(col_idx);
}
}
std::copy_if(thrust::make_counting_iterator<size_t>(0),
thrust::make_counting_iterator<size_t>(column_names.size()),
unnamed_it,
[&column_names](auto col_idx) -> bool {
auto is_empty = column_names[col_idx].empty();
if (is_empty)
column_names[col_idx] = string("Unnamed: ") + std::to_string(col_idx);
return is_empty;
});

// Looking for duplicates
std::unordered_map<string, int> col_names_histogram;
for (auto& col_name : column_names) {
// Operator [] inserts a default-initialized value if the given key is not
// present
if (++col_names_histogram[col_name] > 1) {
if (reader_opts.is_enabled_mangle_dupe_cols()) {
// Rename duplicates of column X as X.1, X.2, ...; First appearance
// stays as X
do {
col_name += "." + std::to_string(col_names_histogram[col_name] - 1);
} while (col_names_histogram[col_name]++);
} else {
std::unordered_map<string, int> col_names_counts;
if (!reader_opts.is_enabled_mangle_dupe_cols()) {
for (auto& col_name : column_names) {
if (++col_names_counts[col_name] > 1) {
// All duplicate columns will be ignored; First appearance is parsed
const auto idx = &col_name - column_names.data();
column_flags[idx] = column_parse::disabled;
}
}
} else {
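// Multiset of all current column names, used to check whether a candidate mangled name already
// exists (average constant-time, worst-case linear lookup).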
std::unordered_multiset<std::string> header(column_names.begin(), column_names.end());
for (auto const col_idx : col_loop_order) {
auto col = column_names[col_idx];
auto cur_count = col_names_counts[col];
if (cur_count > 0) {
auto const old_col = col;
// Rename duplicates of column X as X.1, X.2, ...; First appearance stays as X
while (cur_count > 0) {
col_names_counts[old_col] = cur_count + 1;
col = old_col + "." + std::to_string(cur_count);
if (header.find(col) != header.end()) {
cur_count++;
} else {
cur_count = col_names_counts[col];
}
}
if (auto pos = header.find(old_col); pos != header.end()) { header.erase(pos); }
header.insert(col);
column_names[col_idx] = col;
}
col_names_counts[col] = cur_count + 1;
}
}

// Update the number of columns to be processed, if some might have been
// removed
// Update the number of columns to be processed, if some might have been removed
if (!reader_opts.is_enabled_mangle_dupe_cols()) {
num_active_columns = col_names_histogram.size();
num_active_columns = col_names_counts.size();
}
}

Expand Down
2 changes: 1 addition & 1 deletion cpp/src/stream_compaction/distinct.cu
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ std::unique_ptr<table> distinct(table_view const& input,
auto keys_view = input.select(keys);
auto preprocessed_keys =
cudf::experimental::row::hash::preprocessed_table::create(keys_view, stream);
auto has_null = nullate::DYNAMIC{cudf::has_nulls(keys_view)};
auto const has_null = nullate::DYNAMIC{cudf::has_nested_nulls(keys_view)};
auto const num_rows{keys_view.num_rows()};

hash_map_type key_map{compute_hash_table_size(num_rows),
Expand Down
Loading

0 comments on commit e007814

Please sign in to comment.