From ae2062e8ce8086308622634b286f8241ed5d8222 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miguel=20Mart=C3=ADnez?= <26169771+miguelusque@users.noreply.github.com> Date: Sat, 14 May 2022 17:53:26 +0200 Subject: [PATCH 1/5] Remove typo in ngram documentation (#10859) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a typo in ngram() method documentation, consisting in a duplicated assignment. Authors: - Miguel Martínez (https://github.com/miguelusque) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/10859 --- python/cudf/cudf/core/column/string.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 09a4754f519..dc05e8d1937 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -4519,7 +4519,6 @@ def ngrams(self, n: int = 2, separator: str = "_") -> SeriesOrIndex: -------- >>> import cudf >>> str_series = cudf.Series(['this is my', 'favorite book']) - >>> str_series = cudf.Series(['this is my', 'favorite book']) >>> str_series.str.ngrams(2, "_") 0 this is my_favorite book dtype: object From 6591a6a1851196b497bf61f74064d88e98e70357 Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Sun, 15 May 2022 01:58:38 +0530 Subject: [PATCH 2/5] fix doxygen warnings (#10842) This PR fixes 433 doxygen warnings. As prerequisite for doxygen CI check https://github.com/rapidsai/cudf/issues/9373, the warnings from doxygen should be minimized as much as possible. This is one of the series of PRs to fix doxygen documentation warnings. Authors: - Karthikeyan (https://github.com/karthikeyann) Approvers: - Nghia Truong (https://github.com/ttnghia) - AJ Schmidt (https://github.com/ajschmidt8) URL: https://github.com/rapidsai/cudf/pull/10842 --- ci/checks/copyright.py | 4 +- conda/recipes/libcudf/meta.yaml | 3 + cpp/doxygen/Doxyfile | 7 +- .../cudf/column/column_device_view.cuh | 2 +- .../cudf/table/experimental/row_operators.cuh | 2 +- .../cudf/utilities/type_dispatcher.hpp | 83 +++++++++++-------- cpp/include/cudf_test/cxxopts.hpp | 3 + cpp/include/nvtext/bpe_tokenize.hpp | 2 +- 8 files changed, 65 insertions(+), 41 deletions(-) diff --git a/ci/checks/copyright.py b/ci/checks/copyright.py index d72fd95fea3..e3ff0d677ee 100644 --- a/ci/checks/copyright.py +++ b/ci/checks/copyright.py @@ -37,7 +37,7 @@ re.compile(r"[.]flake8[.]cython$"), re.compile(r"meta[.]yaml$") ] -ExemptFiles = [] +ExemptFiles = ["cpp/include/cudf_test/cxxopts.hpp"] # this will break starting at year 10000, which is probably OK :) CheckSimple = re.compile( @@ -230,4 +230,4 @@ def checkCopyright_main(): if __name__ == "__main__": import sys - sys.exit(checkCopyright_main()) \ No newline at end of file + sys.exit(checkCopyright_main()) diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 11874db0506..c8b5a9d373b 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -89,6 +89,7 @@ outputs: - test -f $PREFIX/include/cudf/detail/hashing.hpp - test -f $PREFIX/include/cudf/detail/interop.hpp - test -f $PREFIX/include/cudf/detail/is_element_valid.hpp + - test -f $PREFIX/include/cudf/detail/join.hpp - test -f $PREFIX/include/cudf/detail/null_mask.hpp - test -f $PREFIX/include/cudf/detail/nvtx/nvtx3.hpp - test -f $PREFIX/include/cudf/detail/nvtx/ranges.hpp @@ -168,6 +169,7 @@ outputs: - test -f $PREFIX/include/cudf/lists/detail/interleave_columns.hpp - test -f $PREFIX/include/cudf/lists/detail/sorting.hpp - test -f $PREFIX/include/cudf/lists/detail/scatter_helper.cuh + - test -f $PREFIX/include/cudf/lists/detail/stream_compaction.hpp - test -f $PREFIX/include/cudf/lists/combine.hpp - test -f $PREFIX/include/cudf/lists/count_elements.hpp - test -f $PREFIX/include/cudf/lists/explode.hpp @@ -178,6 +180,7 @@ outputs: - test -f $PREFIX/include/cudf/lists/gather.hpp - test -f $PREFIX/include/cudf/lists/lists_column_view.hpp - test -f $PREFIX/include/cudf/lists/sorting.hpp + - test -f $PREFIX/include/cudf/lists/stream_compaction.hpp - test -f $PREFIX/include/cudf/merge.hpp - test -f $PREFIX/include/cudf/null_mask.hpp - test -f $PREFIX/include/cudf/partitioning.hpp diff --git a/cpp/doxygen/Doxyfile b/cpp/doxygen/Doxyfile index 6929b529728..4aa0bfca78a 100644 --- a/cpp/doxygen/Doxyfile +++ b/cpp/doxygen/Doxyfile @@ -892,7 +892,9 @@ EXCLUDE_PATTERNS = */nvtx/* */detail/* # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories use the pattern */test/* -EXCLUDE_SYMBOLS = org::apache +EXCLUDE_SYMBOLS = org::apache \ + *_impl \ + *Impl # The EXAMPLE_PATH tag can be used to specify one or more files or directories # that contain example code fragments that are included (see the \include @@ -2130,7 +2132,8 @@ INCLUDE_FILE_PATTERNS = # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. PREDEFINED = __device__= \ - __host__= + __host__= \ + DOXYGEN_SHOULD_SKIP_THIS # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this # tag can be used to specify a list of macro names that should be expanded. The diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh index 070ca80858b..0e99fb52186 100644 --- a/cpp/include/cudf/column/column_device_view.cuh +++ b/cpp/include/cudf/column/column_device_view.cuh @@ -1198,7 +1198,7 @@ struct optional_accessor { /** * @brief Constructor * - * @param col Column on which to iterator over its elements. + * @param _col Column on which to iterator over its elements. * @param with_nulls Indicates if the `col` should be checked for nulls. */ optional_accessor(column_device_view const& _col, Nullate with_nulls) diff --git a/cpp/include/cudf/table/experimental/row_operators.cuh b/cpp/include/cudf/table/experimental/row_operators.cuh index 336420ed840..43f09ff55f0 100644 --- a/cpp/include/cudf/table/experimental/row_operators.cuh +++ b/cpp/include/cudf/table/experimental/row_operators.cuh @@ -420,7 +420,7 @@ class self_comparator { * @brief Construct an owning object for performing a lexicographic comparison between two rows of * the same table. * - * @param table The table to compare + * @param t The table to compare * @param column_order Optional, host array the same length as a row that indicates the desired * ascending/descending order of each column in a row. If empty, it is assumed all columns are * sorted in ascending order. diff --git a/cpp/include/cudf/utilities/type_dispatcher.hpp b/cpp/include/cudf/utilities/type_dispatcher.hpp index 5c3a6b128b5..920c5222552 100644 --- a/cpp/include/cudf/utilities/type_dispatcher.hpp +++ b/cpp/include/cudf/utilities/type_dispatcher.hpp @@ -156,41 +156,38 @@ constexpr bool is_fixed_point(cudf::type_id id) template <> \ struct id_to_type_impl { \ using type = Type; \ - } + }; #endif -/** - * @brief Defines all of the mappings between C++ types and their corresponding - * `cudf::type_id` values. - */ -CUDF_TYPE_MAPPING(bool, type_id::BOOL8); -CUDF_TYPE_MAPPING(int8_t, type_id::INT8); -CUDF_TYPE_MAPPING(int16_t, type_id::INT16); -CUDF_TYPE_MAPPING(int32_t, type_id::INT32); -CUDF_TYPE_MAPPING(int64_t, type_id::INT64); -CUDF_TYPE_MAPPING(uint8_t, type_id::UINT8); -CUDF_TYPE_MAPPING(uint16_t, type_id::UINT16); -CUDF_TYPE_MAPPING(uint32_t, type_id::UINT32); -CUDF_TYPE_MAPPING(uint64_t, type_id::UINT64); -CUDF_TYPE_MAPPING(float, type_id::FLOAT32); -CUDF_TYPE_MAPPING(double, type_id::FLOAT64); -CUDF_TYPE_MAPPING(cudf::string_view, type_id::STRING); -CUDF_TYPE_MAPPING(cudf::timestamp_D, type_id::TIMESTAMP_DAYS); -CUDF_TYPE_MAPPING(cudf::timestamp_s, type_id::TIMESTAMP_SECONDS); -CUDF_TYPE_MAPPING(cudf::timestamp_ms, type_id::TIMESTAMP_MILLISECONDS); -CUDF_TYPE_MAPPING(cudf::timestamp_us, type_id::TIMESTAMP_MICROSECONDS); -CUDF_TYPE_MAPPING(cudf::timestamp_ns, type_id::TIMESTAMP_NANOSECONDS); -CUDF_TYPE_MAPPING(cudf::duration_D, type_id::DURATION_DAYS); -CUDF_TYPE_MAPPING(cudf::duration_s, type_id::DURATION_SECONDS); -CUDF_TYPE_MAPPING(cudf::duration_ms, type_id::DURATION_MILLISECONDS); -CUDF_TYPE_MAPPING(cudf::duration_us, type_id::DURATION_MICROSECONDS); -CUDF_TYPE_MAPPING(cudf::duration_ns, type_id::DURATION_NANOSECONDS); -CUDF_TYPE_MAPPING(dictionary32, type_id::DICTIONARY32); -CUDF_TYPE_MAPPING(cudf::list_view, type_id::LIST); -CUDF_TYPE_MAPPING(numeric::decimal32, type_id::DECIMAL32); -CUDF_TYPE_MAPPING(numeric::decimal64, type_id::DECIMAL64); -CUDF_TYPE_MAPPING(numeric::decimal128, type_id::DECIMAL128); -CUDF_TYPE_MAPPING(cudf::struct_view, type_id::STRUCT); +// Defines all of the mappings between C++ types and their corresponding `cudf::type_id` values. +CUDF_TYPE_MAPPING(bool, type_id::BOOL8) +CUDF_TYPE_MAPPING(int8_t, type_id::INT8) +CUDF_TYPE_MAPPING(int16_t, type_id::INT16) +CUDF_TYPE_MAPPING(int32_t, type_id::INT32) +CUDF_TYPE_MAPPING(int64_t, type_id::INT64) +CUDF_TYPE_MAPPING(uint8_t, type_id::UINT8) +CUDF_TYPE_MAPPING(uint16_t, type_id::UINT16) +CUDF_TYPE_MAPPING(uint32_t, type_id::UINT32) +CUDF_TYPE_MAPPING(uint64_t, type_id::UINT64) +CUDF_TYPE_MAPPING(float, type_id::FLOAT32) +CUDF_TYPE_MAPPING(double, type_id::FLOAT64) +CUDF_TYPE_MAPPING(cudf::string_view, type_id::STRING) +CUDF_TYPE_MAPPING(cudf::timestamp_D, type_id::TIMESTAMP_DAYS) +CUDF_TYPE_MAPPING(cudf::timestamp_s, type_id::TIMESTAMP_SECONDS) +CUDF_TYPE_MAPPING(cudf::timestamp_ms, type_id::TIMESTAMP_MILLISECONDS) +CUDF_TYPE_MAPPING(cudf::timestamp_us, type_id::TIMESTAMP_MICROSECONDS) +CUDF_TYPE_MAPPING(cudf::timestamp_ns, type_id::TIMESTAMP_NANOSECONDS) +CUDF_TYPE_MAPPING(cudf::duration_D, type_id::DURATION_DAYS) +CUDF_TYPE_MAPPING(cudf::duration_s, type_id::DURATION_SECONDS) +CUDF_TYPE_MAPPING(cudf::duration_ms, type_id::DURATION_MILLISECONDS) +CUDF_TYPE_MAPPING(cudf::duration_us, type_id::DURATION_MICROSECONDS) +CUDF_TYPE_MAPPING(cudf::duration_ns, type_id::DURATION_NANOSECONDS) +CUDF_TYPE_MAPPING(dictionary32, type_id::DICTIONARY32) +CUDF_TYPE_MAPPING(cudf::list_view, type_id::LIST) +CUDF_TYPE_MAPPING(numeric::decimal32, type_id::DECIMAL32) +CUDF_TYPE_MAPPING(numeric::decimal64, type_id::DECIMAL64) +CUDF_TYPE_MAPPING(numeric::decimal128, type_id::DECIMAL128) +CUDF_TYPE_MAPPING(cudf::struct_view, type_id::STRUCT) /** * @brief Use this specialization on `type_dispatcher` whenever you only need to operate on the @@ -210,6 +207,12 @@ struct type_to_scalar_type_impl { using ScalarType = cudf::scalar; }; +/** + * @brief Macro used to define scalar type and scalar device type for + * `cudf::numeric_scalar` template class for numeric C++ types. + * + * @param Type The numeric C++ type + */ #ifndef MAP_NUMERIC_SCALAR #define MAP_NUMERIC_SCALAR(Type) \ template <> \ @@ -230,7 +233,7 @@ MAP_NUMERIC_SCALAR(uint32_t) MAP_NUMERIC_SCALAR(uint64_t) MAP_NUMERIC_SCALAR(float) MAP_NUMERIC_SCALAR(double) -MAP_NUMERIC_SCALAR(bool); +MAP_NUMERIC_SCALAR(bool) template <> struct type_to_scalar_type_impl { @@ -281,6 +284,12 @@ struct type_to_scalar_type_impl { // using ScalarDeviceType = cudf::struct_scalar_device_view; // CALEB: TODO! }; +/** + * @brief Macro used to define scalar type and scalar device type for + * `cudf::timestamp_scalar` template class for timestamp C++ types. + * + * @param Type The timestamp C++ type + */ #ifndef MAP_TIMESTAMP_SCALAR #define MAP_TIMESTAMP_SCALAR(Type) \ template <> \ @@ -296,6 +305,12 @@ MAP_TIMESTAMP_SCALAR(timestamp_ms) MAP_TIMESTAMP_SCALAR(timestamp_us) MAP_TIMESTAMP_SCALAR(timestamp_ns) +/** + * @brief Macro used to define scalar type and scalar device type for + * `cudf::duration_scalar` template class for duration C++ types. + * + * @param Type The duration C++ type + */ #ifndef MAP_DURATION_SCALAR #define MAP_DURATION_SCALAR(Type) \ template <> \ diff --git a/cpp/include/cudf_test/cxxopts.hpp b/cpp/include/cudf_test/cxxopts.hpp index e3ff4fccfe3..d0fc3c7e38c 100644 --- a/cpp/include/cudf_test/cxxopts.hpp +++ b/cpp/include/cudf_test/cxxopts.hpp @@ -20,6 +20,8 @@ THE SOFTWARE. #ifndef CXXOPTS_HPP_INCLUDED #define CXXOPTS_HPP_INCLUDED +#ifndef DOXYGEN_SHOULD_SKIP_THIS + #include #include #include @@ -1498,4 +1500,5 @@ inline const HelpGroupDetails& Options::group_help(const std::string& group) con } // namespace cxxopts +#endif // DOXYGEN_SHOULD_SKIP_THIS #endif // CXXOPTS_HPP_INCLUDED diff --git a/cpp/include/nvtext/bpe_tokenize.hpp b/cpp/include/nvtext/bpe_tokenize.hpp index 23fcd3acd03..dcd24674029 100644 --- a/cpp/include/nvtext/bpe_tokenize.hpp +++ b/cpp/include/nvtext/bpe_tokenize.hpp @@ -107,7 +107,7 @@ std::unique_ptr load_merge_pairs_file( * @throw cudf::logic_error if `separator` is invalid * * @param input Strings to encode. - * @param merge_pairs Created by a call to @ref nvtext::load_merge_pairs_file. + * @param merges_pairs Created by a call to @ref nvtext::load_merge_pairs_file. * @param separator String used to build the output after encoding. * Default is a space. * @param mr Memory resource to allocate any returned objects. From e58d0490472467c724cd9d9e5ec1c6f6efa4beae Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Mon, 16 May 2022 10:33:10 +0530 Subject: [PATCH 3/5] update mangle_dupe_cols behavior in csv reader to match pandas 1.4.0 behavior (#10749) Fixes https://github.com/rapidsai/cudf/issues/10618 Depends on https://github.com/rapidsai/cudf/pull/10584 Authors: - Karthikeyan (https://github.com/karthikeyann) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/10749 --- cpp/src/io/csv/reader_impl.cu | 66 +++++++++++++++++++++--------- python/cudf/cudf/tests/test_csv.py | 29 ++++++++----- 2 files changed, 64 insertions(+), 31 deletions(-) diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index cd070d28f38..d20155b4720 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -43,6 +43,7 @@ #include #include +#include #include #include @@ -696,37 +697,62 @@ table_with_metadata read_csv(cudf::io::datasource* source, column_flags.resize(num_actual_columns, column_parse::enabled | column_parse::inferred); + std::vector col_loop_order(column_names.size()); + auto unnamed_it = std::copy_if( + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(column_names.size()), + col_loop_order.begin(), + [&column_names](auto col_idx) -> bool { return not column_names[col_idx].empty(); }); // Rename empty column names to "Unnamed: col_index" - for (size_t col_idx = 0; col_idx < column_names.size(); ++col_idx) { - if (column_names[col_idx].empty()) { - column_names[col_idx] = string("Unnamed: ") + std::to_string(col_idx); - } - } + std::copy_if(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(column_names.size()), + unnamed_it, + [&column_names](auto col_idx) -> bool { + auto is_empty = column_names[col_idx].empty(); + if (is_empty) + column_names[col_idx] = string("Unnamed: ") + std::to_string(col_idx); + return is_empty; + }); // Looking for duplicates - std::unordered_map col_names_histogram; - for (auto& col_name : column_names) { - // Operator [] inserts a default-initialized value if the given key is not - // present - if (++col_names_histogram[col_name] > 1) { - if (reader_opts.is_enabled_mangle_dupe_cols()) { - // Rename duplicates of column X as X.1, X.2, ...; First appearance - // stays as X - do { - col_name += "." + std::to_string(col_names_histogram[col_name] - 1); - } while (col_names_histogram[col_name]++); - } else { + std::unordered_map col_names_counts; + if (!reader_opts.is_enabled_mangle_dupe_cols()) { + for (auto& col_name : column_names) { + if (++col_names_counts[col_name] > 1) { // All duplicate columns will be ignored; First appearance is parsed const auto idx = &col_name - column_names.data(); column_flags[idx] = column_parse::disabled; } } + } else { + // For constant/linear search. + std::unordered_multiset header(column_names.begin(), column_names.end()); + for (auto const col_idx : col_loop_order) { + auto col = column_names[col_idx]; + auto cur_count = col_names_counts[col]; + if (cur_count > 0) { + auto const old_col = col; + // Rename duplicates of column X as X.1, X.2, ...; First appearance stays as X + while (cur_count > 0) { + col_names_counts[old_col] = cur_count + 1; + col = old_col + "." + std::to_string(cur_count); + if (header.find(col) != header.end()) { + cur_count++; + } else { + cur_count = col_names_counts[col]; + } + } + if (auto pos = header.find(old_col); pos != header.end()) { header.erase(pos); } + header.insert(col); + column_names[col_idx] = col; + } + col_names_counts[col] = cur_count + 1; + } } - // Update the number of columns to be processed, if some might have been - // removed + // Update the number of columns to be processed, if some might have been removed if (!reader_opts.is_enabled_mangle_dupe_cols()) { - num_active_columns = col_names_histogram.size(); + num_active_columns = col_names_counts.size(); } } diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index acad2507292..6ddc973b1a0 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -473,20 +473,27 @@ def test_csv_reader_usecols_int_char(tmpdir, pd_mixed_dataframe): assert_eq(df_out, out, check_names=False) -def test_csv_reader_mangle_dupe_cols(tmpdir): - buffer = "abc,ABC,abc,abcd,abc\n1,2,3,4,5\n" - +@pytest.mark.parametrize( + "buffer", + [ + "abc,ABC,abc,abcd,abc\n1,2,3,4,5\n", + "A,A,A.1,A,A.2,A,A.4,A,A\n1,2,3.1,4,a.2,a,a.4,a,a", + "A,A,A.1,,Unnamed: 4,A,A.4,A,A\n1,2,3.1,4,a.2,a,a.4,a,a", + ], +) +@pytest.mark.parametrize("mangle_dupe_cols", [True, False]) +def test_csv_reader_mangle_dupe_cols(tmpdir, buffer, mangle_dupe_cols): # Default: mangle_dupe_cols=True - pd_df = pd.read_csv(StringIO(buffer)) - cu_df = read_csv(StringIO(buffer)) + cu_df = read_csv(StringIO(buffer), mangle_dupe_cols=mangle_dupe_cols) + if mangle_dupe_cols: + pd_df = pd.read_csv(StringIO(buffer)) + else: + # Pandas does not support mangle_dupe_cols=False + head = buffer.split("\n")[0].split(",") + first_cols = np.unique(head, return_index=True)[1] + pd_df = pd.read_csv(StringIO(buffer), usecols=first_cols) assert_eq(cu_df, pd_df) - # Pandas does not support mangle_dupe_cols=False - cu_df = read_csv(StringIO(buffer), mangle_dupe_cols=False) - # check that the dupe columns were removed - assert len(cu_df.columns) == 3 - np.testing.assert_array_equal(cu_df["abc"].to_numpy(), [1]) - def test_csv_reader_float_decimal(tmpdir): fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file12.csv") From 9e004c3be6f7e9287bcaba3bd90dc274c87b7f75 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 16 May 2022 16:30:16 +0100 Subject: [PATCH 4/5] More error checking in `from_dlpack` (#10850) The implementation copying a full column at a time relies on unit-stride, column-major data in the incoming dlpack tensor. Check these preconditions and report error messages, rather than either silently producing bad data (unit-stride, row-major 2D tensors) or invalid memory accesses (non-unit-stride in the fastest-varying dimension). Closes #10754 by providing an explicit error message for this unsupported case. Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - David Wendt (https://github.com/davidwendt) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/10850 --- cpp/src/interop/dlpack.cpp | 25 +++- cpp/tests/interop/dlpack_test.cpp | 114 ++++++++++++++++++ .../source/api_docs/general_functions.rst | 1 + 3 files changed, 134 insertions(+), 6 deletions(-) diff --git a/cpp/src/interop/dlpack.cpp b/cpp/src/interop/dlpack.cpp index e5da4794ca3..be64a9c9bc1 100644 --- a/cpp/src/interop/dlpack.cpp +++ b/cpp/src/interop/dlpack.cpp @@ -148,20 +148,33 @@ std::unique_ptr from_dlpack(DLManagedTensor const* managed_tensor, CUDF_EXPECTS(tensor.device.device_id == device_id, "DLTensor device ID must be current device"); } - // Currently only 1D and 2D tensors are supported - CUDF_EXPECTS(tensor.ndim > 0 && tensor.ndim <= 2, "DLTensor must be 1D or 2D"); - + // We only support 1D and 2D tensors with some restrictions on layout + if (tensor.ndim == 1) { + // 1D tensors must have dense layout (strides == nullptr <=> dense row-major) + CUDF_EXPECTS(nullptr == tensor.strides || tensor.strides[0] == 1, + "from_dlpack of 1D DLTensor only for unit-stride data"); + } else if (tensor.ndim == 2) { + // 2D tensors must have column-major layout and the fastest dimension must have dense layout + CUDF_EXPECTS(( + // 1D tensor reshaped into (N, 1) is fine + tensor.shape[1] == 1 && (nullptr == tensor.strides || tensor.strides[0] == 1)) + // General case + || (nullptr != tensor.strides && tensor.strides[0] == 1 && + tensor.strides[1] >= tensor.shape[0]), + "from_dlpack of 2D DLTensor only for column-major unit-stride data"); + } else { + CUDF_FAIL("DLTensor must be 1D or 2D"); + } CUDF_EXPECTS(tensor.shape[0] >= 0, - "DLTensor first dim should be of shape greater than or equal-to 0."); + "DLTensor first dim should be of shape greater than or equal to 0."); CUDF_EXPECTS(tensor.shape[0] < std::numeric_limits::max(), "DLTensor first dim exceeds size supported by cudf"); if (tensor.ndim > 1) { CUDF_EXPECTS(tensor.shape[1] >= 0, - "DLTensor second dim should be of shape greater than or equal-to 0."); + "DLTensor second dim should be of shape greater than or equal to 0."); CUDF_EXPECTS(tensor.shape[1] < std::numeric_limits::max(), "DLTensor second dim exceeds size supported by cudf"); } - size_t const num_columns = (tensor.ndim == 2) ? static_cast(tensor.shape[1]) : 1; // Validate and convert data type to cudf diff --git a/cpp/tests/interop/dlpack_test.cpp b/cpp/tests/interop/dlpack_test.cpp index 2528c3e5a83..a722f66951f 100644 --- a/cpp/tests/interop/dlpack_test.cpp +++ b/cpp/tests/interop/dlpack_test.cpp @@ -208,6 +208,120 @@ TEST_F(DLPackUntypedTests, UnsupportedLanesFromDlpack) EXPECT_THROW(cudf::from_dlpack(tensor.get()), cudf::logic_error); } +TEST_F(DLPackUntypedTests, UnsupportedBroadcast1DTensorFromDlpack) +{ + using T = float; + constexpr int ndim = 1; + // Broadcasted (stride-0) 1D tensor + auto const data = cudf::test::make_type_param_vector({1}); + int64_t shape[ndim] = {5}; + int64_t strides[ndim] = {0}; + + DLManagedTensor tensor{}; + tensor.dl_tensor.device.device_type = kDLCPU; + tensor.dl_tensor.dtype = get_dtype(); + tensor.dl_tensor.ndim = ndim; + tensor.dl_tensor.byte_offset = 0; + tensor.dl_tensor.shape = shape; + tensor.dl_tensor.strides = strides; + + thrust::host_vector host_vector(data.begin(), data.end()); + tensor.dl_tensor.data = host_vector.data(); + + EXPECT_THROW(cudf::from_dlpack(&tensor), cudf::logic_error); +} + +TEST_F(DLPackUntypedTests, UnsupportedStrided1DTensorFromDlpack) +{ + using T = float; + constexpr int ndim = 1; + // Strided 1D tensor + auto const data = cudf::test::make_type_param_vector({1, 2, 3, 4}); + int64_t shape[ndim] = {2}; + int64_t strides[ndim] = {2}; + + DLManagedTensor tensor{}; + tensor.dl_tensor.device.device_type = kDLCPU; + tensor.dl_tensor.dtype = get_dtype(); + tensor.dl_tensor.ndim = ndim; + tensor.dl_tensor.byte_offset = 0; + tensor.dl_tensor.shape = shape; + tensor.dl_tensor.strides = strides; + + thrust::host_vector host_vector(data.begin(), data.end()); + tensor.dl_tensor.data = host_vector.data(); + + EXPECT_THROW(cudf::from_dlpack(&tensor), cudf::logic_error); +} + +TEST_F(DLPackUntypedTests, UnsupportedImplicitRowMajor2DTensorFromDlpack) +{ + using T = float; + constexpr int ndim = 2; + // Row major 2D tensor + auto const data = cudf::test::make_type_param_vector({1, 2, 3, 4}); + int64_t shape[ndim] = {2, 2}; + + DLManagedTensor tensor{}; + tensor.dl_tensor.device.device_type = kDLCPU; + tensor.dl_tensor.dtype = get_dtype(); + tensor.dl_tensor.ndim = ndim; + tensor.dl_tensor.byte_offset = 0; + tensor.dl_tensor.shape = shape; + tensor.dl_tensor.strides = nullptr; + + thrust::host_vector host_vector(data.begin(), data.end()); + tensor.dl_tensor.data = host_vector.data(); + + EXPECT_THROW(cudf::from_dlpack(&tensor), cudf::logic_error); +} + +TEST_F(DLPackUntypedTests, UnsupportedExplicitRowMajor2DTensorFromDlpack) +{ + using T = float; + constexpr int ndim = 2; + // Row major 2D tensor with explicit strides + auto const data = cudf::test::make_type_param_vector({1, 2, 3, 4}); + int64_t shape[ndim] = {2, 2}; + int64_t strides[ndim] = {2, 1}; + + DLManagedTensor tensor{}; + tensor.dl_tensor.device.device_type = kDLCPU; + tensor.dl_tensor.dtype = get_dtype(); + tensor.dl_tensor.ndim = ndim; + tensor.dl_tensor.byte_offset = 0; + tensor.dl_tensor.shape = shape; + tensor.dl_tensor.strides = strides; + + thrust::host_vector host_vector(data.begin(), data.end()); + tensor.dl_tensor.data = host_vector.data(); + + EXPECT_THROW(cudf::from_dlpack(&tensor), cudf::logic_error); +} + +TEST_F(DLPackUntypedTests, UnsupportedStridedColMajor2DTensorFromDlpack) +{ + using T = float; + constexpr int ndim = 2; + // Column major, but strided in fastest dimension + auto const data = cudf::test::make_type_param_vector({1, 2, 3, 4, 5, 6, 7, 8}); + int64_t shape[ndim] = {2, 2}; + int64_t strides[ndim] = {2, 4}; + + DLManagedTensor tensor{}; + tensor.dl_tensor.device.device_type = kDLCPU; + tensor.dl_tensor.dtype = get_dtype(); + tensor.dl_tensor.ndim = ndim; + tensor.dl_tensor.byte_offset = 0; + tensor.dl_tensor.shape = shape; + tensor.dl_tensor.strides = strides; + + thrust::host_vector host_vector(data.begin(), data.end()); + tensor.dl_tensor.data = host_vector.data(); + + EXPECT_THROW(cudf::from_dlpack(&tensor), cudf::logic_error); +} + template class DLPackTimestampTests : public BaseFixture { }; diff --git a/docs/cudf/source/api_docs/general_functions.rst b/docs/cudf/source/api_docs/general_functions.rst index a4a08a6be7f..c666bcb147d 100644 --- a/docs/cudf/source/api_docs/general_functions.rst +++ b/docs/cudf/source/api_docs/general_functions.rst @@ -23,6 +23,7 @@ Top-level conversions :toctree: api/ cudf.to_numeric + cudf.from_dlpack Top-level dealing with datetimelike ----------------------------------- From d0d7193c901cf8c6e994535d18578b6f33a45bfd Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 16 May 2022 12:10:20 -0400 Subject: [PATCH 5/5] Fix a bug in `distinct`: using nested nulls logic (#10848) This PR fixes a bug in `cudf::distinct` where the `Nullate` of the comparator should be determined by whether nested nulls are present. Authors: - Yunsong Wang (https://github.com/PointKernel) Approvers: - Nghia Truong (https://github.com/ttnghia) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/10848 --- cpp/src/stream_compaction/distinct.cu | 2 +- .../stream_compaction/distinct_tests.cpp | 40 +++++++++++++++++++ 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/cpp/src/stream_compaction/distinct.cu b/cpp/src/stream_compaction/distinct.cu index 35c74178620..d698c547a61 100644 --- a/cpp/src/stream_compaction/distinct.cu +++ b/cpp/src/stream_compaction/distinct.cu @@ -59,7 +59,7 @@ std::unique_ptr
distinct(table_view const& input, auto keys_view = input.select(keys); auto preprocessed_keys = cudf::experimental::row::hash::preprocessed_table::create(keys_view, stream); - auto has_null = nullate::DYNAMIC{cudf::has_nulls(keys_view)}; + auto const has_null = nullate::DYNAMIC{cudf::has_nested_nulls(keys_view)}; auto const num_rows{keys_view.num_rows()}; hash_map_type key_map{compute_hash_table_size(num_rows), diff --git a/cpp/tests/stream_compaction/distinct_tests.cpp b/cpp/tests/stream_compaction/distinct_tests.cpp index 2c822b93444..5ce39b42fea 100644 --- a/cpp/tests/stream_compaction/distinct_tests.cpp +++ b/cpp/tests/stream_compaction/distinct_tests.cpp @@ -303,6 +303,46 @@ TEST_F(Distinct, StructOfStruct) CUDF_TEST_EXPECT_COLUMNS_EQUAL(sliced_expect->get_column(1), sorted_sliced_result->get_column(1)); } +TEST_F(Distinct, StructWithNullElement) +{ + using FWCW = cudf::test::fixed_width_column_wrapper; + using MASK = std::vector; + + /* + `@` indicates null + + /+-------------+ + |s1{s2{a,b}, c}| + +--------------+ + 0 | { {1, 1}, 2}| + 1 | {@{1, 1}, 2}| + +--------------+ + */ + + auto col_a = FWCW{1, 1}; + auto col_b = FWCW{1, 1}; + auto s2_mask = MASK{1, 0}; + auto col_c = FWCW{2, 2}; + auto s1_mask = MASK{1, 1}; + auto idx = FWCW{0, 1}; + + std::vector> s2_children; + s2_children.push_back(col_a.release()); + s2_children.push_back(col_b.release()); + auto s2 = cudf::test::structs_column_wrapper(std::move(s2_children), s2_mask); + + std::vector> s1_children; + s1_children.push_back(s2.release()); + s1_children.push_back(col_c.release()); + auto s1 = cudf::test::structs_column_wrapper(std::move(s1_children), s1_mask); + + auto input = cudf::table_view({idx, s1}); + + auto result = cudf::distinct(input, {1}); + auto sorted_result = cudf::sort_by_key(*result, result->select({0})); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(input.column(1), sorted_result->get_column(1)); +} + TEST_F(Distinct, ListOfEmptyStruct) { // 0. [] ==