diff --git a/conda/environments/cudf_dev_cuda10.1.yml b/conda/environments/cudf_dev_cuda10.1.yml index 35108ddd8ca..fa0b1126190 100644 --- a/conda/environments/cudf_dev_cuda10.1.yml +++ b/conda/environments/cudf_dev_cuda10.1.yml @@ -43,7 +43,7 @@ dependencies: - mypy=0.782 - typing_extensions - pre_commit - - dask>=2.22.0 + - dask>=2021.3.1 - distributed>=2.22.0 - streamz - dlpack diff --git a/conda/environments/cudf_dev_cuda10.2.yml b/conda/environments/cudf_dev_cuda10.2.yml index 3a24e38a397..52d82c4f4ef 100644 --- a/conda/environments/cudf_dev_cuda10.2.yml +++ b/conda/environments/cudf_dev_cuda10.2.yml @@ -43,7 +43,7 @@ dependencies: - mypy=0.782 - typing_extensions - pre_commit - - dask>=2.22.0 + - dask>=2021.3.1 - distributed>=2.22.0 - streamz - dlpack diff --git a/conda/environments/cudf_dev_cuda11.0.yml b/conda/environments/cudf_dev_cuda11.0.yml index 821c6f5320d..2e64365bdf6 100644 --- a/conda/environments/cudf_dev_cuda11.0.yml +++ b/conda/environments/cudf_dev_cuda11.0.yml @@ -43,7 +43,7 @@ dependencies: - mypy=0.782 - typing_extensions - pre_commit - - dask>=2.22.0 + - dask>=2021.3.1 - distributed>=2.22.0 - streamz - dlpack diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 5635f54ba20..a119040bbcf 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -28,7 +28,7 @@ requirements: - numba >=0.49.0 - dlpack - pyarrow 1.0.1 - - libcudf {{ version }} + - libcudf {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }} - rmm {{ minor_version }} - cudatoolkit {{ cuda_version }} run: diff --git a/conda/recipes/cudf_kafka/meta.yaml b/conda/recipes/cudf_kafka/meta.yaml index 0acd9ec4bb2..cc3f30091bf 100644 --- a/conda/recipes/cudf_kafka/meta.yaml +++ b/conda/recipes/cudf_kafka/meta.yaml @@ -29,12 +29,12 @@ requirements: - python - cython >=0.29,<0.30 - setuptools - - cudf {{ version }} - - libcudf_kafka {{ version }} + - cudf {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }} + - libcudf_kafka {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }} run: - - libcudf_kafka {{ version }} + - libcudf_kafka {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }} - python-confluent-kafka - - cudf {{ version }} + - cudf {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }} test: requires: diff --git a/conda/recipes/custreamz/meta.yaml b/conda/recipes/custreamz/meta.yaml index ffda6d0c3c6..8edca7a51d0 100644 --- a/conda/recipes/custreamz/meta.yaml +++ b/conda/recipes/custreamz/meta.yaml @@ -23,15 +23,15 @@ requirements: host: - python - python-confluent-kafka - - cudf_kafka {{ version }} + - cudf_kafka {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }} run: - python - - streamz - - cudf {{ version }} + - streamz + - cudf {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }} - dask >=2.22.0 - distributed >=2.22.0 - python-confluent-kafka - - cudf_kafka {{ version }} + - cudf_kafka {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }} test: requires: diff --git a/conda/recipes/dask-cudf/meta.yaml b/conda/recipes/dask-cudf/meta.yaml index 170075743bd..a8768e26056 100644 --- a/conda/recipes/dask-cudf/meta.yaml +++ b/conda/recipes/dask-cudf/meta.yaml @@ -22,15 +22,15 @@ build: requirements: host: - python - - cudf {{ version }} - - dask >=2.22.0 + - cudf {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }} + - dask>=2021.3.1 - distributed >=2.22.0 run: - python - - cudf {{ version }} - - dask >=2.22.0 + - cudf {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }} + - dask>=2021.3.1 - distributed >=2.22.0 - + test: requires: - cudatoolkit {{ cuda_version }}.* diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 39587b4bd05..75955428eab 
100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -178,12 +178,14 @@ test: - test -f $PREFIX/include/cudf/strings/detail/converters.hpp - test -f $PREFIX/include/cudf/strings/detail/copying.hpp - test -f $PREFIX/include/cudf/strings/detail/fill.hpp + - test -f $PREFIX/include/cudf/strings/detail/json.hpp - test -f $PREFIX/include/cudf/strings/detail/replace.hpp - test -f $PREFIX/include/cudf/strings/detail/utilities.hpp - test -f $PREFIX/include/cudf/strings/extract.hpp - test -f $PREFIX/include/cudf/strings/findall.hpp - test -f $PREFIX/include/cudf/strings/find.hpp - test -f $PREFIX/include/cudf/strings/find_multiple.hpp + - test -f $PREFIX/include/cudf/strings/json.hpp - test -f $PREFIX/include/cudf/strings/padding.hpp - test -f $PREFIX/include/cudf/strings/replace.hpp - test -f $PREFIX/include/cudf/strings/replace_re.hpp diff --git a/conda/recipes/libcudf_kafka/meta.yaml b/conda/recipes/libcudf_kafka/meta.yaml index 5348ec471e9..81ff922b8d7 100644 --- a/conda/recipes/libcudf_kafka/meta.yaml +++ b/conda/recipes/libcudf_kafka/meta.yaml @@ -25,7 +25,7 @@ requirements: build: - cmake >=3.17.0 host: - - libcudf {{ version }} + - libcudf {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }} - librdkafka >=1.5.0,<1.5.3 run: - {{ pin_compatible('librdkafka', max_pin='x.x') }} #TODO: librdkafka should be automatically included here by run_exports but is not diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 5cd82e52180..61cb13d3445 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -346,6 +346,7 @@ add_library(cudf src/strings/find.cu src/strings/find_multiple.cu src/strings/padding.cu + src/strings/json/json_path.cu src/strings/regex/regcomp.cpp src/strings/regex/regexec.cu src/strings/replace/backref_re.cu diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 5aa7e0132f8..11af408f1c5 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -202,3 +202,8 @@ ConfigureBench(STRINGS_BENCH string/substring_benchmark.cpp string/translate_benchmark.cpp string/url_decode_benchmark.cpp) + +################################################################################################### +# - json benchmark ------------------------------------------------------------------- +ConfigureBench(JSON_BENCH + string/json_benchmark.cpp) diff --git a/cpp/benchmarks/string/json_benchmark.cpp b/cpp/benchmarks/string/json_benchmark.cpp new file mode 100644 index 00000000000..6fb6a07a8d0 --- /dev/null +++ b/cpp/benchmarks/string/json_benchmark.cpp @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+
+class JsonPath : public cudf::benchmark {
+};
+
+float frand() { return static_cast<float>(rand()) / static_cast<float>(RAND_MAX); }
+
+int rand_range(int min, int max) { return min + static_cast<int>(frand() * (max - min)); }
+
+std::vector<std::string> Books{
+  "{\n\"category\": \"reference\",\n\"author\": \"Nigel Rees\",\n\"title\": \"Sayings of the "
+  "Century\",\n\"price\": 8.95\n}",
+  "{\n\"category\": \"fiction\",\n\"author\": \"Evelyn Waugh\",\n\"title\": \"Sword of "
+  "Honour\",\n\"price\": 12.99\n}",
+  "{\n\"category\": \"fiction\",\n\"author\": \"Herman Melville\",\n\"title\": \"Moby "
+  "Dick\",\n\"isbn\": \"0-553-21311-3\",\n\"price\": 8.99\n}",
+  "{\n\"category\": \"fiction\",\n\"author\": \"J. R. R. Tolkien\",\n\"title\": \"The Lord of the "
+  "Rings\",\n\"isbn\": \"0-395-19395-8\",\n\"price\": 22.99\n}"};
+constexpr int Approx_book_size = 110;
+std::vector<std::string> Bicycles{
+  "{\"color\": \"red\", \"price\": 9.95}",
+  "{\"color\": \"green\", \"price\": 29.95}",
+  "{\"color\": \"blue\", \"price\": 399.95}",
+  "{\"color\": \"yellow\", \"price\": 99.95}",
+  "{\"color\": \"mauve\", \"price\": 199.95}",
+};
+constexpr int Approx_bicycle_size = 33;
+std::string Misc{"\n\"expensive\": 10\n"};
+std::string generate_field(std::vector<std::string> const& values, int num_values)
+{
+  std::string res;
+  for (int idx = 0; idx < num_values; idx++) {
+    if (idx > 0) { res += std::string(",\n"); }
+    int vindex = std::min(static_cast<int>(floor(frand() * values.size())),
+                          static_cast<int>(values.size() - 1));
+    res += values[vindex];
+  }
+  return res;
+}
+
+std::string build_row(int desired_bytes)
+{
+  // always have at least 2 books and 2 bikes
+  int num_books    = 2;
+  int num_bicycles = 2;
+  int remaining_bytes =
+    desired_bytes - ((num_books * Approx_book_size) + (num_bicycles * Approx_bicycle_size));
+
+  // divide up the remainder between books and bikes
+  float book_pct    = frand();
+  float bicycle_pct = 1.0f - book_pct;
+  num_books += (remaining_bytes * book_pct) / Approx_book_size;
+  num_bicycles += (remaining_bytes * bicycle_pct) / Approx_bicycle_size;
+
+  std::string books    = "\"book\": [\n" + generate_field(Books, num_books) + "]\n";
+  std::string bicycles = "\"bicycle\": [\n" + generate_field(Bicycles, num_bicycles) + "]\n";
+
+  std::string store = "\"store\": {\n";
+  if (frand() <= 0.5f) {
+    store += books + std::string(",\n") + bicycles;
+  } else {
+    store += bicycles + std::string(",\n") + books;
+  }
+  store += std::string("}\n");
+
+  std::string row = std::string("{\n");
+  if (frand() <= 0.5f) {
+    row += store + std::string(",\n") + Misc;
+  } else {
+    row += Misc + std::string(",\n") + store;
+  }
+  row += std::string("}\n");
+  return row;
+}
+
+template <typename... QueryArg>
+static void BM_case(benchmark::State& state, QueryArg&&... query_arg)
+{
+  srand(5236);
+  auto iter = thrust::make_transform_iterator(
+    thrust::make_counting_iterator(0),
+    [desired_bytes = state.range(1)](int index) { return build_row(desired_bytes); });
+  int num_rows = state.range(0);
+  cudf::test::strings_column_wrapper input(iter, iter + num_rows);
+  cudf::strings_column_view scv(input);
+  size_t num_chars = scv.chars().size();
+
+  std::string json_path(query_arg...);
+
+  for (auto _ : state) {
+    cuda_event_timer raii(state, true, 0);
+    auto result = cudf::strings::get_json_object(scv, json_path);
+    cudaStreamSynchronize(0);
+  }
+
+  // this isn't strictly 100% accurate. a given query isn't necessarily
+  // going to visit every single incoming character. but in spirit it does.
+  state.SetBytesProcessed(state.iterations() * num_chars);
+}
+
+#define JSON_BENCHMARK_DEFINE(name, query)                          \
+  BENCHMARK_CAPTURE(BM_case, name, query)                           \
+    ->ArgsProduct({{100, 1000, 100000, 400000}, {300, 600, 4096}})  \
+    ->UseManualTime()                                               \
+    ->Unit(benchmark::kMillisecond);
+
+JSON_BENCHMARK_DEFINE(query0, "$");
+JSON_BENCHMARK_DEFINE(query1, "$.store");
+JSON_BENCHMARK_DEFINE(query2, "$.store.book");
+JSON_BENCHMARK_DEFINE(query3, "$.store.*");
+JSON_BENCHMARK_DEFINE(query4, "$.store.book[*]");
+JSON_BENCHMARK_DEFINE(query5, "$.store.book[*].category");
+JSON_BENCHMARK_DEFINE(query6, "$.store['bicycle']");
+JSON_BENCHMARK_DEFINE(query7, "$.store.book[*]['isbn']");
+JSON_BENCHMARK_DEFINE(query8, "$.store.bicycle[1]");
diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp
index 3c454c85720..74ce6e42d7e 100644
--- a/cpp/include/cudf/aggregation.hpp
+++ b/cpp/include/cudf/aggregation.hpp
@@ -230,10 +230,13 @@ std::unique_ptr<aggregation> make_collect_list_aggregation(
  * @param null_handling Indicates whether to include/exclude nulls during collection
  * @param nulls_equal Flag to specify whether null entries within each list should be considered
  * equal
+ * @param nans_equal Flag to specify whether NaN values in floating-point columns should be
+ * considered equal
  */
 std::unique_ptr<aggregation> make_collect_set_aggregation(
   null_policy null_handling = null_policy::INCLUDE,
-  null_equality null_equal  = null_equality::EQUAL);
+  null_equality nulls_equal = null_equality::EQUAL,
+  nan_equality nans_equal   = nan_equality::UNEQUAL);
 
 /// Factory to create a LAG aggregation
 std::unique_ptr<aggregation> make_lag_aggregation(size_type offset);
diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp
index 18bef301e03..0bfe6b84ae2 100644
--- a/cpp/include/cudf/detail/aggregation/aggregation.hpp
+++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp
@@ -345,24 +345,32 @@ struct collect_list_aggregation final : derived_aggregation<collect_list_aggregation>
  */
 struct collect_set_aggregation final : derived_aggregation<collect_set_aggregation> {
   explicit collect_set_aggregation(null_policy null_handling = null_policy::INCLUDE,
-                                   null_equality null_equal  = null_equality::EQUAL)
-    : derived_aggregation{COLLECT_SET}, _null_handling{null_handling}, _null_equal(null_equal)
+                                   null_equality nulls_equal = null_equality::EQUAL,
+                                   nan_equality nans_equal   = nan_equality::UNEQUAL)
+    : derived_aggregation{COLLECT_SET},
+      _null_handling{null_handling},
+      _nulls_equal(nulls_equal),
+      _nans_equal(nans_equal)
   {
   }
 
   null_policy _null_handling;  ///< include or exclude nulls
-  null_equality _null_equal;   ///< whether to consider nulls as equal values
+  null_equality _nulls_equal;  ///< whether to consider nulls as equal values
+  nan_equality _nans_equal;    ///< whether to consider NaNs as equal values (applicable only to
+                               ///< floating-point types)
 
  protected:
   friend class derived_aggregation<collect_set_aggregation>;
 
   bool operator==(collect_set_aggregation const& other) const
   {
-    return _null_handling == other._null_handling && _null_equal == other._null_equal;
+    return _null_handling == other._null_handling && _nulls_equal == other._nulls_equal &&
+           _nans_equal == other._nans_equal;
   }
 
   size_t hash_impl() const
   {
-    return std::hash<int>{}(static_cast<int>(_null_handling) ^ static_cast<int>(_null_equal));
+    return std::hash<int>{}(static_cast<int>(_null_handling) ^ static_cast<int>(_nulls_equal) ^
+                            static_cast<int>(_nans_equal));
   }
 };
diff --git a/cpp/include/cudf/detail/utilities/hash_functions.cuh b/cpp/include/cudf/detail/utilities/hash_functions.cuh
index 31533a69487..e79107e32cf
100644 --- a/cpp/include/cudf/detail/utilities/hash_functions.cuh +++ b/cpp/include/cudf/detail/utilities/hash_functions.cuh @@ -542,6 +542,22 @@ hash_value_type CUDA_DEVICE_CALLABLE MurmurHash3_32::operator()(double c return this->compute_floating_point(key); } +template <> +hash_value_type CUDA_DEVICE_CALLABLE +MurmurHash3_32::operator()(cudf::list_view const& key) const +{ + cudf_assert(false && "List column hashing is not supported"); + return 0; +} + +template <> +hash_value_type CUDA_DEVICE_CALLABLE +MurmurHash3_32::operator()(cudf::struct_view const& key) const +{ + cudf_assert(false && "Direct hashing of struct_view is not supported"); + return 0; +} + template struct SparkMurmurHash3_32 { using argument_type = Key; @@ -671,6 +687,22 @@ SparkMurmurHash3_32::operator()(numeric::decimal64 const& ke return this->compute(key.value()); } +template <> +hash_value_type CUDA_DEVICE_CALLABLE +SparkMurmurHash3_32::operator()(cudf::list_view const& key) const +{ + cudf_assert(false && "List column hashing is not supported"); + return 0; +} + +template <> +hash_value_type CUDA_DEVICE_CALLABLE +SparkMurmurHash3_32::operator()(cudf::struct_view const& key) const +{ + cudf_assert(false && "Direct hashing of struct_view is not supported"); + return 0; +} + /** * @brief Specialization of MurmurHash3_32 operator for strings. */ diff --git a/cpp/include/cudf/lists/detail/drop_list_duplicates.hpp b/cpp/include/cudf/lists/detail/drop_list_duplicates.hpp index ba3e1d17d7f..53b31015145 100644 --- a/cpp/include/cudf/lists/detail/drop_list_duplicates.hpp +++ b/cpp/include/cudf/lists/detail/drop_list_duplicates.hpp @@ -31,6 +31,7 @@ namespace detail { std::unique_ptr drop_list_duplicates( lists_column_view const& lists_column, null_equality nulls_equal, + nan_equality nans_equal, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/include/cudf/lists/drop_list_duplicates.hpp b/cpp/include/cudf/lists/drop_list_duplicates.hpp index 0939bd7956a..f1ce3b7f0e3 100644 --- a/cpp/include/cudf/lists/drop_list_duplicates.hpp +++ b/cpp/include/cudf/lists/drop_list_duplicates.hpp @@ -41,6 +41,8 @@ namespace lists { * * @param lists_column The input lists_column_view * @param nulls_equal Flag to specify whether null entries should be considered equal + * @param nans_equal Flag to specify whether NaN entries should be considered as equal value (only + * applicable for floating point data column) * @param mr Device resource used to allocate memory * * @code{.pseudo} @@ -56,6 +58,7 @@ namespace lists { std::unique_ptr drop_list_duplicates( lists_column_view const& lists_column, null_equality nulls_equal = null_equality::EQUAL, + nan_equality nans_equal = nan_equality::UNEQUAL, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/strings/detail/json.hpp b/cpp/include/cudf/strings/detail/json.hpp new file mode 100644 index 00000000000..e6a0b49f102 --- /dev/null +++ b/cpp/include/cudf/strings/detail/json.hpp @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include + +namespace cudf { +namespace strings { +namespace detail { + +/** + * @copydoc cudf::strings::get_json_object + * + * @param stream CUDA stream used for device memory operations and kernel launches + */ +std::unique_ptr get_json_object( + cudf::strings_column_view const& col, + cudf::string_scalar const& json_path, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +} // namespace detail +} // namespace strings +} // namespace cudf diff --git a/cpp/include/cudf/strings/json.hpp b/cpp/include/cudf/strings/json.hpp new file mode 100644 index 00000000000..b39e4a2027c --- /dev/null +++ b/cpp/include/cudf/strings/json.hpp @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +namespace cudf { +namespace strings { + +/** + * @addtogroup strings_json + * @{ + * @file + */ + +/** + * @brief Apply a JSONPath string to all rows in an input strings column. + * + * Applies a JSONPath string to an incoming strings column where each row in the column + * is a valid json string. The output is returned by row as a strings column. + * + * https://tools.ietf.org/id/draft-goessner-dispatch-jsonpath-00.html + * Implements only the operators: $ . [] * + * + * @param col The input strings column. Each row must contain a valid json string + * @param json_path The JSONPath string to be applied to each row + * @param mr Resource for allocating device memory. 
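+ *
+ * @code{.pseudo}
+ * // illustrative example; values are assumptions, not part of the patch
+ * s = ['{"a": {"b": 1}}', '{"a": {"b": 2}}']
+ * r = get_json_object(s, "$.a.b")
+ * r is now ["1", "2"]
+ * @endcode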
+ * @return New strings column containing the retrieved json object strings + */ +std::unique_ptr get_json_object( + cudf::strings_column_view const& col, + cudf::string_scalar const& json_path, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** @} */ // end of doxygen group +} // namespace strings +} // namespace cudf diff --git a/cpp/include/cudf/types.hpp b/cpp/include/cudf/types.hpp index 1b8d83883b3..789bb3037f4 100644 --- a/cpp/include/cudf/types.hpp +++ b/cpp/include/cudf/types.hpp @@ -137,6 +137,15 @@ enum class nan_policy : bool { NAN_IS_VALID ///< treat nans as valid elements (non-null) }; +/** + * @brief Enum to consider different elements (of floating point types) holding NaN value as equal + * or unequal + */ +enum class nan_equality /*unspecified*/ { + ALL_EQUAL, ///< All NaNs compare equal, regardless of sign + UNEQUAL ///< All NaNs compare unequal (IEEE754 behavior) +}; + /** * @brief */ diff --git a/cpp/include/doxygen_groups.h b/cpp/include/doxygen_groups.h index 65dd5c73475..f78ff98d49d 100644 --- a/cpp/include/doxygen_groups.h +++ b/cpp/include/doxygen_groups.h @@ -127,6 +127,7 @@ * @defgroup strings_modify Modifying * @defgroup strings_replace Replacing * @defgroup strings_split Splitting + * @defgroup strings_json JSON * @} * @defgroup dictionary_apis Dictionary * @{ diff --git a/cpp/src/aggregation/aggregation.cpp b/cpp/src/aggregation/aggregation.cpp index 33c19617308..3a044a42101 100644 --- a/cpp/src/aggregation/aggregation.cpp +++ b/cpp/src/aggregation/aggregation.cpp @@ -132,9 +132,10 @@ std::unique_ptr make_collect_list_aggregation(null_policy null_hand } /// Factory to create a COLLECT_SET aggregation std::unique_ptr make_collect_set_aggregation(null_policy null_handling, - null_equality null_equal) + null_equality nulls_equal, + nan_equality nans_equal) { - return std::make_unique(null_handling, null_equal); + return std::make_unique(null_handling, nulls_equal, nans_equal); } /// Factory to create a LAG aggregation std::unique_ptr make_lag_aggregation(size_type offset) diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp index 4e2303c8b9b..46185e07600 100644 --- a/cpp/src/groupby/sort/aggregate.cpp +++ b/cpp/src/groupby/sort/aggregate.cpp @@ -379,11 +379,14 @@ void aggregrate_result_functor::operator()(aggregation auto const collect_result = detail::group_collect( get_grouped_values(), helper.group_offsets(stream), helper.num_groups(stream), stream, mr); auto const nulls_equal = - static_cast(agg)._null_equal; - cache.add_result(col_idx, - agg, - lists::detail::drop_list_duplicates( - lists_column_view(collect_result->view()), nulls_equal, stream, mr)); + static_cast(agg)._nulls_equal; + auto const nans_equal = + static_cast(agg)._nans_equal; + cache.add_result( + col_idx, + agg, + lists::detail::drop_list_duplicates( + lists_column_view(collect_result->view()), nulls_equal, nans_equal, stream, mr)); }; } // namespace detail diff --git a/cpp/src/hash/hashing.cu b/cpp/src/hash/hashing.cu index 16efb666b3e..53be019f73b 100644 --- a/cpp/src/hash/hashing.cu +++ b/cpp/src/hash/hashing.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -29,6 +29,8 @@ #include +#include + namespace cudf { namespace { @@ -38,6 +40,22 @@ bool md5_type_check(data_type dt) return !is_chrono(dt) && (is_fixed_width(dt) || (dt.id() == type_id::STRING)); } +template +std::vector to_leaf_columns(IterType iter_begin, IterType iter_end) +{ + std::vector leaf_columns; + std::for_each(iter_begin, iter_end, [&leaf_columns](column_view const& col) { + if (is_nested(col.type())) { + CUDF_EXPECTS(col.type().id() == type_id::STRUCT, "unsupported nested type"); + auto child_columns = to_leaf_columns(col.child_begin(), col.child_end()); + leaf_columns.insert(leaf_columns.end(), child_columns.begin(), child_columns.end()); + } else { + leaf_columns.emplace_back(col); + } + }); + return leaf_columns; +} + } // namespace namespace detail { @@ -133,10 +151,11 @@ std::unique_ptr serial_murmur_hash3_32(table_view const& input, if (input.num_columns() == 0 || input.num_rows() == 0) { return output; } - auto const device_input = table_device_view::create(input, stream); + table_view const leaf_table(to_leaf_columns(input.begin(), input.end())); + auto const device_input = table_device_view::create(leaf_table, stream); auto output_view = output->mutable_view(); - if (has_nulls(input)) { + if (has_nulls(leaf_table)) { thrust::tabulate(rmm::exec_policy(stream), output_view.begin(), output_view.end(), diff --git a/cpp/src/io/csv/csv_gpu.cu b/cpp/src/io/csv/csv_gpu.cu index 86e5f1fdcae..44acc7fc55f 100644 --- a/cpp/src/io/csv/csv_gpu.cu +++ b/cpp/src/io/csv/csv_gpu.cu @@ -196,7 +196,7 @@ __global__ void __launch_bounds__(csvparse_block_dim) } else if (serialized_trie_contains(opts.trie_true, {field_start, field_len}) || serialized_trie_contains(opts.trie_false, {field_start, field_len})) { atomicAdd(&d_columnData[actual_col].bool_count, 1); - } else if (cudf::io::gpu::is_infinity(field_start, next_delimiter)) { + } else if (cudf::io::is_infinity(field_start, next_delimiter)) { atomicAdd(&d_columnData[actual_col].float_count, 1); } else { long countNumber = 0; @@ -277,7 +277,7 @@ __inline__ __device__ T decode_value(char const *begin, char const *end, parse_options_view const &opts) { - return cudf::io::gpu::parse_numeric(begin, end, opts); + return cudf::io::parse_numeric(begin, end, opts); } template @@ -285,7 +285,7 @@ __inline__ __device__ T decode_value(char const *begin, char const *end, parse_options_view const &opts) { - return cudf::io::gpu::parse_numeric(begin, end, opts); + return cudf::io::parse_numeric(begin, end, opts); } template <> diff --git a/cpp/src/io/json/json_gpu.cu b/cpp/src/io/json/json_gpu.cu index 5efb64fd4d5..75910ae6b5b 100644 --- a/cpp/src/io/json/json_gpu.cu +++ b/cpp/src/io/json/json_gpu.cu @@ -114,7 +114,7 @@ __inline__ __device__ T decode_value(const char *begin, uint64_t end, parse_options_view const &opts) { - return cudf::io::gpu::parse_numeric(begin, end, opts); + return cudf::io::parse_numeric(begin, end, opts); } /** @@ -131,7 +131,7 @@ __inline__ __device__ T decode_value(const char *begin, const char *end, parse_options_view const &opts) { - return cudf::io::gpu::parse_numeric(begin, end, opts); + return cudf::io::parse_numeric(begin, end, opts); } /** diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index 584d2c9a74a..b7719cba580 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -20,6 +20,8 @@ #include #include +#include + #include using cudf::device_span; @@ -82,67 +84,6 @@ struct parse_options { } }; -namespace gpu { -/** - * @brief 
CUDA kernel iterates over the data until the end of the current field - * - * Also iterates over (one or more) delimiter characters after the field. - * Function applies to formats with field delimiters and line terminators. - * - * @param begin Pointer to the first element of the string - * @param end Pointer to the first element after the string - * @param opts A set of parsing options - * @param escape_char A boolean value to signify whether to consider `\` as escape character or - * just a character. - * - * @return Pointer to the last character in the field, including the - * delimiter(s) following the field data - */ -__device__ __inline__ char const* seek_field_end(char const* begin, - char const* end, - parse_options_view const& opts, - bool escape_char = false) -{ - bool quotation = false; - auto current = begin; - bool escape_next = false; - while (true) { - // Use simple logic to ignore control chars between any quote seq - // Handles nominal cases including doublequotes within quotes, but - // may not output exact failures as PANDAS for malformed fields. - // Check for instances such as "a2\"bc" and "\\" if `escape_char` is true. - - if (*current == opts.quotechar and not escape_next) { - quotation = !quotation; - } else if (!quotation) { - if (*current == opts.delimiter) { - while (opts.multi_delimiter && current < end && *(current + 1) == opts.delimiter) { - ++current; - } - break; - } else if (*current == opts.terminator) { - break; - } else if (*current == '\r' && (current + 1 < end && *(current + 1) == '\n')) { - --end; - break; - } - } - - if (escape_char == true) { - // If a escape character is encountered, escape next character in next loop. - if (escape_next == false and *current == '\\') { - escape_next = true; - } else { - escape_next = false; - } - } - - if (current >= end) break; - current++; - } - return current; -} - /** * @brief Returns the numeric value of an ASCII/UTF-8 character. Specialization * for integral types. Handles hexadecimal digits, both uppercase and lowercase. @@ -155,7 +96,7 @@ __device__ __inline__ char const* seek_field_end(char const* begin, * @return uint8_t Numeric value of the character, or `0` */ template ::value>* = nullptr> -__device__ __forceinline__ uint8_t decode_digit(char c, bool* valid_flag) +constexpr uint8_t decode_digit(char c, bool* valid_flag) { if (c >= '0' && c <= '9') return c - '0'; if (c >= 'a' && c <= 'f') return c - 'a' + 10; @@ -176,7 +117,7 @@ __device__ __forceinline__ uint8_t decode_digit(char c, bool* valid_flag) * @return uint8_t Numeric value of the character, or `0` */ template ::value>* = nullptr> -__device__ __forceinline__ uint8_t decode_digit(char c, bool* valid_flag) +constexpr uint8_t decode_digit(char c, bool* valid_flag) { if (c >= '0' && c <= '9') return c - '0'; @@ -185,10 +126,7 @@ __device__ __forceinline__ uint8_t decode_digit(char c, bool* valid_flag) } // Converts character to lowercase. -__inline__ __device__ char to_lower(char const c) -{ - return c >= 'A' && c <= 'Z' ? c + ('a' - 'A') : c; -} +constexpr char to_lower(char const c) { return c >= 'A' && c <= 'Z' ? c + ('a' - 'A') : c; } /** * @brief Checks if string is infinity, case insensitive with/without sign @@ -199,7 +137,7 @@ __inline__ __device__ char to_lower(char const c) * @param end Pointer to the first element after the string * @return true if string is valid infinity, else false. 
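 *
 * For example (illustrative): "inf", "-INF", and "+Infinity" all parse as infinity, while
 * "infinite" does not.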
*/ -__inline__ __device__ bool is_infinity(char const* begin, char const* end) +constexpr bool is_infinity(char const* begin, char const* end) { if (*begin == '-' || *begin == '+') begin++; char const* cinf = "infinity"; @@ -223,9 +161,10 @@ __inline__ __device__ bool is_infinity(char const* begin, char const* end) * @return The parsed and converted value */ template -__inline__ __device__ T parse_numeric(const char* begin, - const char* end, - parse_options_view const& opts) +constexpr T parse_numeric(const char* begin, + const char* end, + parse_options_view const& opts, + T error_result = std::numeric_limits::quiet_NaN()) { T value{}; bool all_digits_valid = true; @@ -281,11 +220,72 @@ __inline__ __device__ T parse_numeric(const char* begin, if (exponent != 0) { value *= exp10(double(exponent * exponent_sign)); } } } - if (!all_digits_valid) { return std::numeric_limits::quiet_NaN(); } + if (!all_digits_valid) { return error_result; } return value * sign; } +namespace gpu { +/** + * @brief CUDA kernel iterates over the data until the end of the current field + * + * Also iterates over (one or more) delimiter characters after the field. + * Function applies to formats with field delimiters and line terminators. + * + * @param begin Pointer to the first element of the string + * @param end Pointer to the first element after the string + * @param opts A set of parsing options + * @param escape_char A boolean value to signify whether to consider `\` as escape character or + * just a character. + * + * @return Pointer to the last character in the field, including the + * delimiter(s) following the field data + */ +__device__ __inline__ char const* seek_field_end(char const* begin, + char const* end, + parse_options_view const& opts, + bool escape_char = false) +{ + bool quotation = false; + auto current = begin; + bool escape_next = false; + while (true) { + // Use simple logic to ignore control chars between any quote seq + // Handles nominal cases including doublequotes within quotes, but + // may not output exact failures as PANDAS for malformed fields. + // Check for instances such as "a2\"bc" and "\\" if `escape_char` is true. + + if (*current == opts.quotechar and not escape_next) { + quotation = !quotation; + } else if (!quotation) { + if (*current == opts.delimiter) { + while (opts.multi_delimiter && current < end && *(current + 1) == opts.delimiter) { + ++current; + } + break; + } else if (*current == opts.terminator) { + break; + } else if (*current == '\r' && (current + 1 < end && *(current + 1) == '\n')) { + --end; + break; + } + } + + if (escape_char == true) { + // If a escape character is encountered, escape next character in next loop. 
+ if (escape_next == false and *current == '\\') { + escape_next = true; + } else { + escape_next = false; + } + } + + if (current >= end) break; + current++; + } + return current; +} + /** * @brief Lexicographically compare digits in input against string * representing an integer diff --git a/cpp/src/lists/drop_list_duplicates.cu b/cpp/src/lists/drop_list_duplicates.cu index 584b9791d19..564d919b65d 100644 --- a/cpp/src/lists/drop_list_duplicates.cu +++ b/cpp/src/lists/drop_list_duplicates.cu @@ -16,15 +16,16 @@ #include #include -#include +#include #include #include #include #include #include -#include +#include #include +#include #include #include @@ -34,62 +35,100 @@ namespace cudf { namespace lists { namespace detail { namespace { +template +struct has_negative_nans { + column_device_view const d_entries; + bool const has_nulls; + + __device__ Type operator()(size_type idx) const noexcept + { + if (has_nulls && d_entries.is_null_nocheck(idx)) { return false; } + + auto const val = d_entries.element(idx); + return std::isnan(val) && std::signbit(val); // std::signbit(x) == true if x is negative + } +}; /** - * @brief Copy list entries and entry list offsets ignoring duplicates - * - * Given an array of all entries flattened from a list column and an array that maps each entry to - * the offset of the list containing that entry, those entries and list offsets are copied into - * new arrays such that the duplicated entries within each list will be ignored. - * - * @param all_lists_entries The input array containing all list entries - * @param entries_list_offsets A map from list entries to their corresponding list offsets - * @param nulls_equal Flag to specify whether null entries should be considered equal - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device resource used to allocate memory - * - * @return A pair of columns, the first one contains unique list entries and the second one - * contains their corresponding list offsets + * @brief A structure to be used along with type_dispatcher to check if a + * `column_view` has any negative NaN entry */ -template -std::vector> get_unique_entries_and_list_offsets( - column_view const& all_lists_entries, - column_view const& entries_list_offsets, - null_equality nulls_equal, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - // Create an intermediate table, since the comparator only work on tables - auto const device_input_table = - cudf::table_device_view::create(table_view{{all_lists_entries}}, stream); - auto const comp = row_equality_comparator( - *device_input_table, *device_input_table, nulls_equal == null_equality::EQUAL); +struct has_negative_nans_fn { + template >* = nullptr> + bool operator()(column_view const& lists_entries, rmm::cuda_stream_view stream) const noexcept + { + auto const d_entries = column_device_view::create(lists_entries, stream); + return thrust::count_if(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(lists_entries.size()), + detail::has_negative_nans{*d_entries, lists_entries.has_nulls()}); + } - auto const num_entries = all_lists_entries.size(); - // Allocate memory to store the indices of the unique entries - auto const unique_indices = cudf::make_numeric_column( - entries_list_offsets.type(), num_entries, mask_state::UNALLOCATED, stream); - auto const unique_indices_begin = unique_indices->mutable_view().begin(); + template >* = nullptr> + bool operator()(column_view const&, 
rmm::cuda_stream_view) const noexcept
+  {
+    // Columns of non-floating-point data will never contain NaN
+    return false;
+  }
+};
 
-  auto const copy_end = thrust::unique_copy(
-    rmm::exec_policy(stream),
-    thrust::make_counting_iterator(0),
-    thrust::make_counting_iterator(num_entries),
-    unique_indices_begin,
-    [list_offsets = entries_list_offsets.begin(), comp] __device__(auto i, auto j) {
-      return list_offsets[i] == list_offsets[j] && comp(i, j);
-    });
+template <typename Type>
+struct replace_negative_nans {
+  __device__ Type operator()(Type val) const noexcept
+  {
+    return std::isnan(val) ? std::numeric_limits<Type>::quiet_NaN() : val;
+  }
+};
 
-  // Collect unique entries and entry list offsets
-  auto const indices = cudf::detail::slice(
-    unique_indices->view(), 0, thrust::distance(unique_indices_begin, copy_end));
-  return cudf::detail::gather(table_view{{all_lists_entries, entries_list_offsets}},
-                              indices,
-                              cudf::out_of_bounds_policy::DONT_CHECK,
-                              cudf::detail::negative_index_policy::NOT_ALLOWED,
-                              stream,
-                              mr)
-    ->release();
+/**
+ * @brief A structure to be used along with type_dispatcher to replace -NaN by NaN for all entries
+ * of a floating-point data column
+ */
+struct replace_negative_nans_fn {
+  template <typename Type, std::enable_if_t<!std::is_floating_point<Type>::value>* = nullptr>
+  void operator()(column_view const&, mutable_column_view const&, rmm::cuda_stream_view) const
+  {
+    CUDF_FAIL("Cannot operate on a type that is not floating-point.");
+  }
+
+  template <typename Type, std::enable_if_t<std::is_floating_point<Type>::value>* = nullptr>
+  void operator()(column_view const& lists_entries,
+                  mutable_column_view const& new_entries,
+                  rmm::cuda_stream_view stream) const noexcept
+  {
+    // Do not care whether an entry is null or not, just consider it as a floating-point value
+    thrust::transform(rmm::exec_policy(stream),
+                      lists_entries.begin<Type>(),
+                      lists_entries.end<Type>(),
+                      new_entries.begin<Type>(),
+                      detail::replace_negative_nans<Type>{});
+  }
+};
+
+/**
+ * @brief Transform a given lists column to a new lists column in which all the list entries
+ * holding -NaN values are replaced by (positive) NaN
+ */
+std::unique_ptr<column> replace_negative_nans_entries(column_view const& lists_entries,
+                                                      lists_column_view const& lists_column,
+                                                      rmm::cuda_stream_view stream)
+{
+  auto new_offsets = std::make_unique<column>(lists_column.offsets());
+  auto new_entries = std::make_unique<column>(lists_entries);
+
+  type_dispatcher(lists_entries.type(),
+                  detail::replace_negative_nans_fn{},
+                  lists_entries,
+                  new_entries->mutable_view(),
+                  stream);
+
+  return make_lists_column(
+    lists_column.size(),
+    std::move(new_offsets),
+    std::move(new_entries),
+    lists_column.null_count(),
+    cudf::detail::copy_bitmask(
+      lists_column.parent(), stream, rmm::mr::get_current_device_resource()));
+}
 
 /**
@@ -165,6 +204,189 @@ std::unique_ptr<column> generate_entry_list_offsets(size_type num_entries,
   return entry_list_offsets;
 }
 
+/**
+ * @brief Performs an equality comparison between two entries in a lists column
+ *
+ * Two entries that are not in the same list are always considered unequal. If they are from the
+ * same list and their type is not floating point, this functor returns the same comparison
+ * result as `cudf::element_equality_comparator`.
+ *
+ * For floating-point types, entries holding NaN value can be considered as different values or the
+ * same value depending on the nans_equal parameter.
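+ *
+ * For example (illustrative): if nans_equal_ == true, two NaN entries from the same list compare
+ * equal (Pandas-style), so only one of them survives deduplication; if nans_equal_ == false, they
+ * compare unequal (Apache Spark-style) and both are kept.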
+ *
+ * @tparam Type The data type of entries
+ * @tparam nans_equal Flag to specify whether NaN entries should be considered equal (only
+ * applicable to floating-point data columns)
+ */
+template <typename Type, bool nans_equal_>
+class list_entry_comparator {
+ public:
+  list_entry_comparator(offset_type const* list_offsets,
+                        column_device_view d_view,
+                        null_equality nulls_equal,
+                        bool has_nulls)
+    : list_offsets(list_offsets), d_view{d_view}, nulls_equal{nulls_equal}, has_nulls(has_nulls)
+  {
+  }
+
+  template <typename T = Type>
+  std::enable_if_t<std::is_floating_point<T>::value and nans_equal_, bool> __device__
+  operator()(size_type i, size_type j) const noexcept
+  {
+    // Two entries are not considered for equality if they belong to different lists
+    if (list_offsets[i] != list_offsets[j]) { return false; }
+
+    if (has_nulls) {
+      bool const nullable = d_view.nullable();
+      bool const lhs_is_null{nullable and d_view.is_null_nocheck(i)};
+      bool const rhs_is_null{nullable and d_view.is_null_nocheck(j)};
+      if (lhs_is_null and rhs_is_null) {
+        return nulls_equal == null_equality::EQUAL;
+      } else if (lhs_is_null != rhs_is_null) {
+        return false;
+      }
+    }
+
+    // For floating-point types, if both element(i) and element(j) are NaNs then this comparison
+    // will return `true`. This is the desired behavior in Pandas.
+    auto const lhs = d_view.element<Type>(i);
+    auto const rhs = d_view.element<Type>(j);
+    if (std::isnan(lhs) and std::isnan(rhs)) { return true; }
+    return lhs == rhs;
+  }
+
+  template <typename T = Type>
+  std::enable_if_t<not std::is_floating_point<T>::value or not nans_equal_, bool> __device__
+  operator()(size_type i, size_type j) const noexcept
+  {
+    // Two entries are not considered for equality if they belong to different lists
+    if (list_offsets[i] != list_offsets[j]) { return false; }
+
+    if (has_nulls) {
+      bool const nullable = d_view.nullable();
+      bool const lhs_is_null{nullable and d_view.is_null_nocheck(i)};
+      bool const rhs_is_null{nullable and d_view.is_null_nocheck(j)};
+      if (lhs_is_null and rhs_is_null) {
+        return nulls_equal == null_equality::EQUAL;
+      } else if (lhs_is_null != rhs_is_null) {
+        return false;
+      }
+    }
+
+    // For floating-point types, if both element(i) and element(j) are NaNs then this comparison
+    // will return `false`. This is the desired behavior in Apache Spark.
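+    // (For floating-point types this plain comparison follows IEEE-754, where NaN != NaN, so
+    // duplicated NaNs are all retained by this overload.)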
+    return d_view.element<Type>(i) == d_view.element<Type>(j);
+  }
+
+ private:
+  offset_type const* list_offsets;
+  column_device_view d_view;
+  null_equality nulls_equal;
+  bool has_nulls;
+};
+
+/**
+ * @brief Construct type-dispatched function object for copying indices of the list entries
+ * ignoring duplicates
+ */
+struct get_unique_entries_fn {
+  template <typename Type,
+            std::enable_if_t<!cudf::is_equality_comparable<Type, Type>()>* = nullptr>
+  offset_type* operator()(offset_type const*,
+                          column_device_view&,
+                          size_type,
+                          offset_type*,
+                          null_equality,
+                          nan_equality,
+                          bool,
+                          rmm::cuda_stream_view) const
+  {
+    CUDF_FAIL("Cannot operate on types that are not equally comparable.");
+  }
+
+  template <typename Type,
+            std::enable_if_t<cudf::is_equality_comparable<Type, Type>()>* = nullptr>
+  offset_type* operator()(offset_type const* list_offsets,
+                          column_device_view& d_view,
+                          size_type num_entries,
+                          offset_type* output_begin,
+                          null_equality nulls_equal,
+                          nan_equality nans_equal,
+                          bool has_nulls,
+                          rmm::cuda_stream_view stream) const noexcept
+  {
+    if (nans_equal == nan_equality::ALL_EQUAL) {
+      list_entry_comparator<Type, true> const comp{list_offsets, d_view, nulls_equal, has_nulls};
+      return thrust::unique_copy(rmm::exec_policy(stream),
+                                 thrust::make_counting_iterator(0),
+                                 thrust::make_counting_iterator(num_entries),
+                                 output_begin,
+                                 comp);
+    } else {
+      list_entry_comparator<Type, false> const comp{list_offsets, d_view, nulls_equal, has_nulls};
+      return thrust::unique_copy(rmm::exec_policy(stream),
+                                 thrust::make_counting_iterator(0),
+                                 thrust::make_counting_iterator(num_entries),
+                                 output_begin,
+                                 comp);
+    }
+  }
+};
+
+/**
+ * @brief Copy list entries and entry list offsets ignoring duplicates
+ *
+ * Given an array of all entries flattened from a list column and an array that maps each entry to
+ * the offset of the list containing that entry, those entries and list offsets are copied into
+ * new arrays such that the duplicated entries within each list will be ignored.
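+ *
+ * @code{.pseudo}
+ * // illustrative
+ * entries       = [0, 1, 1, 2, 2]
+ * entry_offsets = [0, 0, 0, 1, 1]
+ * output        = ([0, 1, 2], [0, 0, 1])
+ * @endcode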
+ * + * @param all_lists_entries The input array containing all list entries + * @param entries_list_offsets A map from list entries to their corresponding list offsets + * @param nulls_equal Flag to specify whether null entries should be considered equal + * @param nans_equal Flag to specify whether NaN entries should be considered as equal + * value (only applicable for floating-point data column) + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device resource used to allocate memory + * + * @return A pair of columns, the first one contains unique list entries and the second one + * contains their corresponding list offsets + */ +std::vector> get_unique_entries_and_list_offsets( + column_view const& all_lists_entries, + column_view const& entries_list_offsets, + null_equality nulls_equal, + nan_equality nans_equal, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto const num_entries = all_lists_entries.size(); + auto const d_view_entries = column_device_view::create(all_lists_entries, stream); + + // Allocate memory to store the indices of the unique entries + auto unique_indices = rmm::device_uvector(num_entries, stream); + auto const output_begin = unique_indices.begin(); + auto const output_end = type_dispatcher(all_lists_entries.type(), + get_unique_entries_fn{}, + entries_list_offsets.begin(), + *d_view_entries, + num_entries, + output_begin, + nulls_equal, + nans_equal, + all_lists_entries.has_nulls(), + stream); + + // Collect unique entries and entry list offsets + // The new null_count and bitmask of the unique entries will also be generated + // by the gather function + return cudf::detail::gather(table_view{{all_lists_entries, entries_list_offsets}}, + output_begin, + output_end, + cudf::out_of_bounds_policy::DONT_CHECK, + stream, + mr) + ->release(); +} + /** * @brief Generate list offsets from entry offsets * @@ -225,6 +447,7 @@ void generate_offsets(size_type num_entries, return offsets[i - prefix_sum_empty_lists[i]]; }); } + } // anonymous namespace /** @@ -234,6 +457,7 @@ void generate_offsets(size_type num_entries, */ std::unique_ptr drop_list_duplicates(lists_column_view const& lists_column, null_equality nulls_equal, + nan_equality nans_equal, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -242,27 +466,40 @@ std::unique_ptr drop_list_duplicates(lists_column_view const& lists_colu CUDF_FAIL("Nested types are not supported in drop_list_duplicates."); } - // Call segmented sort on the list elements and store them in a temporary column sorted_list - auto const sorted_lists = - detail::sort_lists(lists_column, order::ASCENDING, null_order::AFTER, stream); - // Flatten all entries (depth = 1) of the lists column - auto const all_lists_entries = lists_column_view(sorted_lists->view()).get_sliced_child(stream); + auto const lists_entries = lists_column.get_sliced_child(stream); + + // sorted_lists will store the results of the original lists after calling segmented_sort + auto const sorted_lists = [&]() { + // If nans_equal == ALL_EQUAL and the column contains lists of floating-point data type, + // we need to replace -NaN by NaN before sorting + auto const replace_negative_nan = + nans_equal == nan_equality::ALL_EQUAL and + type_dispatcher(lists_entries.type(), detail::has_negative_nans_fn{}, lists_entries, stream); + if (replace_negative_nan) { + // The column new_lists_column is temporary, thus we will not pass in `mr` + auto const new_lists_column = + 
detail::replace_negative_nans_entries(lists_entries, lists_column, stream); + return detail::sort_lists( + lists_column_view(new_lists_column->view()), order::ASCENDING, null_order::AFTER, stream); + } else { + return detail::sort_lists(lists_column, order::ASCENDING, null_order::AFTER, stream); + } + }(); + + auto const sorted_lists_entries = + lists_column_view(sorted_lists->view()).get_sliced_child(stream); // Generate a 0-based offset column auto lists_offsets = detail::generate_clean_offsets(lists_column, stream, mr); // Generate a mapping from list entries to offsets of the lists containing those entries auto const entries_list_offsets = - detail::generate_entry_list_offsets(all_lists_entries.size(), lists_offsets->view(), stream); + detail::generate_entry_list_offsets(sorted_lists_entries.size(), lists_offsets->view(), stream); // Copy non-duplicated entries (along with their list offsets) to new arrays - auto unique_entries_and_list_offsets = - all_lists_entries.has_nulls() - ? detail::get_unique_entries_and_list_offsets( - all_lists_entries, entries_list_offsets->view(), nulls_equal, stream, mr) - : detail::get_unique_entries_and_list_offsets( - all_lists_entries, entries_list_offsets->view(), nulls_equal, stream, mr); + auto unique_entries_and_list_offsets = detail::get_unique_entries_and_list_offsets( + sorted_lists_entries, entries_list_offsets->view(), nulls_equal, nans_equal, stream, mr); // Generate offsets for the new lists column detail::generate_offsets(unique_entries_and_list_offsets.front()->size(), @@ -271,6 +508,10 @@ std::unique_ptr drop_list_duplicates(lists_column_view const& lists_colu stream); // Construct a new lists column without duplicated entries + // Reuse the null_count and bitmask of the lists_column: those are the null information for + // the list elements (rows) + // For the entries of those lists (rows), their null_count and bitmask were generated separately + // during the step `get_unique_entries_and_list_offsets` above return make_lists_column(lists_column.size(), std::move(lists_offsets), std::move(unique_entries_and_list_offsets.front()), @@ -285,10 +526,12 @@ std::unique_ptr drop_list_duplicates(lists_column_view const& lists_colu */ std::unique_ptr drop_list_duplicates(lists_column_view const& lists_column, null_equality nulls_equal, + nan_equality nans_equal, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::drop_list_duplicates(lists_column, nulls_equal, rmm::cuda_stream_default, mr); + return detail::drop_list_duplicates( + lists_column, nulls_equal, nans_equal, rmm::cuda_stream_default, mr); } } // namespace lists diff --git a/cpp/src/lists/explode.cu b/cpp/src/lists/explode.cu index 2b495deb47f..5f6f1c308ac 100644 --- a/cpp/src/lists/explode.cu +++ b/cpp/src/lists/explode.cu @@ -62,22 +62,27 @@ std::unique_ptr build_table( std::vector> columns = gathered_table.release()->release(); - columns.insert(columns.begin() + explode_column_idx, - explode_col_gather_map - ? std::move(detail::gather(table_view({sliced_child}), - explode_col_gather_map->begin(), - explode_col_gather_map->end(), - cudf::out_of_bounds_policy::NULLIFY, - stream, - mr) - ->release()[0]) - : std::make_unique(sliced_child, stream, mr)); + auto inserted = columns.insert(columns.begin() + explode_column_idx, + explode_col_gather_map + ? 
std::move(detail::gather(table_view({sliced_child}),
+                                                             explode_col_gather_map->begin(),
+                                                             explode_col_gather_map->end(),
+                                                             cudf::out_of_bounds_policy::NULLIFY,
+                                                             stream,
+                                                             mr)
+                                                ->release()[0])
+                                  : std::make_unique<column>(sliced_child, stream, mr));
 
   if (position_array) {
     size_type position_size = position_array->size();
+    // the null mask for position matches the exploded column's gather map, so copy it over
+    rmm::device_buffer nullmask =
+      explode_col_gather_map ? copy_bitmask(*inserted->get()) : rmm::device_buffer(0, stream);
     columns.insert(columns.begin() + explode_column_idx,
-                   std::make_unique<column>(
-                     data_type(type_to_id<size_type>()), position_size, position_array->release()));
+                   std::make_unique<column>(data_type(type_to_id<size_type>()),
+                                            position_size,
+                                            position_array->release(),
+                                            std::move(nullmask)));
   }
 
   return std::make_unique<table>
(std::move(columns)); diff --git a/cpp/src/replace/nulls.cu b/cpp/src/replace/nulls.cu index afc2bbb37bd..65750deaa57 100644 --- a/cpp/src/replace/nulls.cu +++ b/cpp/src/replace/nulls.cu @@ -426,7 +426,7 @@ std::unique_ptr replace_nulls(cudf::column_view const& input, if (input.is_empty()) { return cudf::empty_like(input); } if (!input.has_nulls()) { return std::make_unique(input); } - return cudf::type_dispatcher( + return cudf::type_dispatcher( input.type(), replace_nulls_column_kernel_forwarder{}, input, replacement, stream, mr); } diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu new file mode 100644 index 00000000000..cd8aae12070 --- /dev/null +++ b/cpp/src/strings/json/json_path.cu @@ -0,0 +1,952 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include + +namespace cudf { +namespace strings { +namespace detail { + +namespace { + +// debug accessibility + +// change to "\n" and 1 to make output more readable +#define DEBUG_NEWLINE +constexpr int DEBUG_NEWLINE_LEN = 0; + +/** + * @brief Result of calling a parse function. + * + * The primary use of this is to distinguish between "success" and + * "success but no data" return cases. For example, if you are reading the + * values of an array you might call a parse function in a while loop. You + * would want to continue doing this until you either encounter an error (parse_result::ERROR) + * or you get nothing back (parse_result::EMPTY) + */ +enum class parse_result { + ERROR, // failure + SUCCESS, // success + EMPTY, // success, but no data +}; + +/** + * @brief Base parser class inherited by the (device-side) json_state class and + * (host-side) path_state class. + * + * Contains a number of useful utility functions common to parsing json and + * JSONPath strings. 
+ */ +class parser { + protected: + CUDA_HOST_DEVICE_CALLABLE parser() : input(nullptr), input_len(0), pos(nullptr) {} + CUDA_HOST_DEVICE_CALLABLE parser(const char* _input, int64_t _input_len) + : input(_input), input_len(_input_len), pos(_input) + { + parse_whitespace(); + } + + CUDA_HOST_DEVICE_CALLABLE parser(parser const& p) + : input(p.input), input_len(p.input_len), pos(p.pos) + { + } + + CUDA_HOST_DEVICE_CALLABLE bool eof(const char* p) { return p - input >= input_len; } + CUDA_HOST_DEVICE_CALLABLE bool eof() { return eof(pos); } + + CUDA_HOST_DEVICE_CALLABLE bool parse_whitespace() + { + while (!eof()) { + if (is_whitespace(*pos)) { + pos++; + } else { + return true; + } + } + return false; + } + + CUDA_HOST_DEVICE_CALLABLE parse_result parse_string(string_view& str, + bool can_be_empty, + char quote) + { + str = string_view(nullptr, 0); + + if (parse_whitespace() && *pos == quote) { + const char* start = ++pos; + while (!eof()) { + if (*pos == quote) { + str = string_view(start, pos - start); + pos++; + return parse_result::SUCCESS; + } + pos++; + } + } + + return can_be_empty ? parse_result::EMPTY : parse_result::ERROR; + } + + // a name means: + // - a string followed by a : + // - no string + CUDA_HOST_DEVICE_CALLABLE parse_result parse_name(string_view& name, + bool can_be_empty, + char quote) + { + if (parse_string(name, can_be_empty, quote) == parse_result::ERROR) { + return parse_result::ERROR; + } + + // if we got a real string, the next char must be a : + if (name.size_bytes() > 0) { + if (!parse_whitespace()) { return parse_result::ERROR; } + if (*pos == ':') { + pos++; + return parse_result::SUCCESS; + } + } + return parse_result::EMPTY; + } + + // numbers, true, false, null. + // this function is not particularly strong. badly formed values will get + // consumed without throwing any errors + CUDA_HOST_DEVICE_CALLABLE parse_result parse_non_string_value(string_view& val) + { + if (!parse_whitespace()) { return parse_result::ERROR; } + + // parse to the end of the value + char const* start = pos; + char const* end = start; + while (!eof(end)) { + char const c = *end; + if (c == ',' || c == '}' || c == ']' || is_whitespace(c)) { break; } + + // illegal chars + if (c == '[' || c == '{' || c == ':' || c == '\"') { return parse_result::ERROR; } + end++; + } + pos = end; + + val = string_view(start, end - start); + + return parse_result::SUCCESS; + } + + protected: + char const* input; + int64_t input_len; + char const* pos; + + private: + CUDA_HOST_DEVICE_CALLABLE bool is_whitespace(char c) { return c <= ' '; } +}; + +/** + * @brief Output buffer object. Used during the preprocess/size-computation step + * and the actual output step. + * + * There is an important distinction between two cases: + * + * - producing no output at all. that is, the query matched nothing in the input. + * - producing empty output. the query matched something in the input, but the + * value of the result is an empty string. + * + * The `has_output` field is the flag which indicates whether or not the output + * from the query should be considered empty or null. 
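+ *
+ * For example (illustrative): the query $.store.book applied to a row with no "store" object
+ * produces no output at all (a null row), while a query matching a field whose value is ""
+ * produces empty but valid output.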
+ *
+ */
+struct json_output {
+  size_t output_max_len;
+  char* output;
+  thrust::optional<size_t> output_len;
+
+  __device__ void add_output(const char* str, size_t len)
+  {
+    if (output != nullptr) { memcpy(output + output_len.value_or(0), str, len); }
+    output_len = output_len.value_or(0) + len;
+  }
+
+  __device__ void add_output(string_view const& str) { add_output(str.data(), str.size_bytes()); }
+};
+
+enum json_element_type { NONE, OBJECT, ARRAY, VALUE };
+
+/**
+ * @brief Parsing class that holds the current state of the json to be parsed and provides
+ * functions for navigating through it.
+ */
+class json_state : private parser {
+ public:
+  __device__ json_state()
+    : parser(),
+      cur_el_start(nullptr),
+      cur_el_type(json_element_type::NONE),
+      parent_el_type(json_element_type::NONE)
+  {
+  }
+  __device__ json_state(const char* _input, int64_t _input_len)
+    : parser(_input, _input_len),
+      cur_el_start(nullptr),
+      cur_el_type(json_element_type::NONE),
+      parent_el_type(json_element_type::NONE)
+  {
+  }
+
+  __device__ json_state(json_state const& j)
+    : parser(j),
+      cur_el_start(j.cur_el_start),
+      cur_el_type(j.cur_el_type),
+      parent_el_type(j.parent_el_type)
+  {
+  }
+
+  // retrieve the entire current element into the output
+  __device__ parse_result extract_element(json_output* output, bool list_element)
+  {
+    char const* start = cur_el_start;
+    char const* end   = start;
+
+    // if we're a value type, do a simple value parse.
+    if (cur_el_type == VALUE) {
+      pos = cur_el_start;
+      if (parse_value() != parse_result::SUCCESS) { return parse_result::ERROR; }
+      end = pos;
+
+      // SPARK-specific behavior. if this is a non-list-element wrapped in quotes,
+      // strip them. we may need to make this behavior configurable in some way
+      // later on.
+      if (!list_element && *start == '\"' && *(end - 1) == '\"') {
+        start++;
+        end--;
+      }
+    }
+    // otherwise, march through everything inside
+    else {
+      int obj_count = 0;
+      int arr_count = 0;
+
+      while (!eof(end)) {
+        // could do some additional checks here. we know our current
+        // element type, so we could be more strict on what kinds of
+        // characters we expect to see.
+        switch (*end++) {
+          case '{': obj_count++; break;
+          case '}': obj_count--; break;
+          case '[': arr_count++; break;
+          case ']': arr_count--; break;
+          default: break;
+        }
+        if (obj_count == 0 && arr_count == 0) { break; }
+      }
+      if (obj_count > 0 || arr_count > 0) { return parse_result::ERROR; }
+      pos = end;
+    }
+
+    // parse trailing ,
+    if (parse_whitespace()) {
+      if (*pos == ',') { pos++; }
+    }
+
+    if (output != nullptr) { output->add_output({start, static_cast<size_type>(end - start)}); }
+    return parse_result::SUCCESS;
+  }
+
+  // skip the next element
+  __device__ parse_result skip_element() { return extract_element(nullptr, false); }
+
+  // advance to the next element
+  __device__ parse_result next_element() { return next_element_internal(false); }
+
+  // advance inside the current element
+  __device__ parse_result child_element(json_element_type expected_type)
+  {
+    if (expected_type != NONE && cur_el_type != expected_type) { return parse_result::ERROR; }
+
+    // if we succeed, record our parent element type.
+    auto const prev_el_type = cur_el_type;
+    auto const result       = next_element_internal(true);
+    if (result == parse_result::SUCCESS) { parent_el_type = prev_el_type; }
+    return result;
+  }
+
+  // return the next element that matches the specified name.
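+  // a name of "*" acts as a wildcard and matches any element; when inclusive
+  // is true the current element is considered first, otherwise the search
+  // starts at the element that follows it.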
+ __device__ parse_result next_matching_element(string_view const& name, bool inclusive) + { + // if we're not including the current element, skip it + if (!inclusive) { + parse_result result = next_element_internal(false); + if (result != parse_result::SUCCESS) { return result; } + } + // loop until we find a match or there's nothing left + do { + // wildcard matches anything + if (name.size_bytes() == 1 && name.data()[0] == '*') { + return parse_result::SUCCESS; + } else if (cur_el_name == name) { + return parse_result::SUCCESS; + } + + // next + parse_result result = next_element_internal(false); + if (result != parse_result::SUCCESS) { return result; } + } while (1); + + return parse_result::ERROR; + } + + private: + // parse a value - either a string or a number/null/bool + __device__ parse_result parse_value() + { + if (!parse_whitespace()) { return parse_result::ERROR; } + + // string or number? + string_view unused; + return *pos == '\"' ? parse_string(unused, false, '\"') : parse_non_string_value(unused); + } + + __device__ parse_result next_element_internal(bool child) + { + // if we're not getting a child element, skip the current element. + // this will leave pos as the first character -after- the close of + // the current element + if (!child && cur_el_start != nullptr) { + if (skip_element() == parse_result::ERROR) { return parse_result::ERROR; } + cur_el_start = nullptr; + } + // otherwise pos will be at the first character within the current element + + // can only get the child of an object or array. + // this could theoretically be handled as an error, but the evaluators I've found + // seem to treat this as "it's nothing" + if (child && (cur_el_type == VALUE || cur_el_type == NONE)) { return parse_result::EMPTY; } + + // what's next + if (!parse_whitespace()) { return parse_result::EMPTY; } + // if we're closing off a parent element, we're done + char const c = *pos; + if (c == ']' || c == '}') { return parse_result::EMPTY; } + + // if we're not accessing elements of an array, check for name. + bool const array_access = + (cur_el_type == ARRAY && child) || (parent_el_type == ARRAY && !child); + if (!array_access && parse_name(cur_el_name, true, '\"') == parse_result::ERROR) { + return parse_result::ERROR; + } + + // element type + if (!parse_whitespace()) { return parse_result::EMPTY; } + switch (*pos++) { + case '[': cur_el_type = ARRAY; break; + case '{': cur_el_type = OBJECT; break; + + case ',': + case ':': + case '\'': return parse_result::ERROR; + + // value type + default: cur_el_type = VALUE; break; + } + + // the start of the current element is always at the value, not the name + cur_el_start = pos - 1; + return parse_result::SUCCESS; + } + + const char* cur_el_start; // pointer to the first character of the -value- of the current + // element - not the name + string_view cur_el_name; // name of the current element (if applicable) + json_element_type cur_el_type; // type of the current element + json_element_type parent_el_type; // parent element type +}; + +enum class path_operator_type { ROOT, CHILD, CHILD_WILDCARD, CHILD_INDEX, ERROR, END }; + +/** + * @brief A "command" operator used to query a json string. 
A full query is
+ * an array of these operators applied to the incoming json string.
+ */
+struct path_operator {
+  CUDA_HOST_DEVICE_CALLABLE path_operator()
+    : type(path_operator_type::ERROR), index(-1), expected_type{NONE}
+  {
+  }
+  CUDA_HOST_DEVICE_CALLABLE path_operator(path_operator_type _type,
+                                          json_element_type _expected_type = NONE)
+    : type(_type), index(-1), expected_type{_expected_type}
+  {
+  }
+
+  path_operator_type type;  // operator type
+  // the expected element type we're applying this operation to.
+  // for example:
+  //    - you cannot retrieve a subscripted field (eg [5]) from an object.
+  //    - you cannot retrieve a field by name (eg .book) from an array.
+  //    - you -can- use .* for both arrays and objects
+  // a value of NONE implies any type is accepted
+  json_element_type expected_type;  // the expected type of the element we're working with
+  string_view name;                 // name to match against (if applicable)
+  int index;                        // index for subscript operator
+};
+
+/**
+ * @brief Parsing class that holds the current state of the JSONPath string to be parsed
+ * and provides functions for navigating through it. This is only called on the host
+ * during the preprocess step which builds a command buffer that the GPU uses.
+ */
+class path_state : private parser {
+ public:
+  path_state(const char* _path, size_t _path_len) : parser(_path, _path_len) {}
+
+  // get the next operator in the JSONPath string
+  path_operator get_next_operator()
+  {
+    if (eof()) { return {path_operator_type::END}; }
+
+    switch (*pos++) {
+      case '$': return {path_operator_type::ROOT};
+
+      case '.': {
+        path_operator op;
+        string_view term{".[", 2};
+        if (parse_path_name(op.name, term)) {
+          // this is another potential use case for __SPARK_BEHAVIORS / configurability
+          // Spark currently only handles the wildcard operator inside [*], it does
+          // not handle .*
+          if (op.name.size_bytes() == 1 && op.name.data()[0] == '*') {
+            op.type          = path_operator_type::CHILD_WILDCARD;
+            op.expected_type = NONE;
+          } else {
+            op.type          = path_operator_type::CHILD;
+            op.expected_type = OBJECT;
+          }
+          return op;
+        }
+      } break;
+
+      // 3 ways this can be used
+      //    indices:   [0]
+      //    name:      ['book']
+      //    wildcard:  [*]
+      case '[': {
+        path_operator op;
+        string_view term{"]", 1};
+        bool const is_string = *pos == '\'';
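+        // a quoted name (['book']) is treated as a field access on an object,
+        // while an unquoted token ([0]) must parse as a non-negative integer index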
+        if (parse_path_name(op.name, term)) {
+          pos++;
+          if (op.name.size_bytes() == 1 && op.name.data()[0] == '*') {
+            op.type          = path_operator_type::CHILD_WILDCARD;
+            op.expected_type = NONE;
+          } else {
+            if (is_string) {
+              op.type          = path_operator_type::CHILD;
+              op.expected_type = OBJECT;
+            } else {
+              op.type  = path_operator_type::CHILD_INDEX;
+              op.index = cudf::io::parse_numeric<int>(
+                op.name.data(), op.name.data() + op.name.size_bytes(), json_opts, -1);
+              CUDF_EXPECTS(op.index >= 0, "Invalid numeric index specified in JSONPath");
+              op.expected_type = ARRAY;
+            }
+          }
+          return op;
+        }
+      } break;
+
+      // wildcard operator
+      case '*': {
+        pos++;
+        return path_operator{path_operator_type::CHILD_WILDCARD};
+      } break;
+
+      default: CUDF_FAIL("Unrecognized JSONPath operator"); break;
+    }
+    return {path_operator_type::ERROR};
+  }
+
+ private:
+  cudf::io::parse_options_view json_opts{',', '\n', '\"', '.'};
+
+  bool parse_path_name(string_view& name, string_view const& terminators)
+  {
+    switch (*pos) {
+      case '*':
+        name = string_view(pos, 1);
+        pos++;
+        break;
+
+      case '\'':
+        if (parse_string(name, false, '\'') != parse_result::SUCCESS) { return false; }
+        break;
+
+      default: {
+        size_t const chars_left = input_len - (pos - input);
+        char const* end         = std::find_first_of(
+          pos, pos + chars_left, terminators.data(), terminators.data() + terminators.size_bytes());
+        if (end) {
+          name = string_view(pos, end - pos);
+          pos  = end;
+        } else {
+          name = string_view(pos, chars_left);
+          pos  = input + input_len;
+        }
+        break;
+      }
+    }
+
+    // an empty name is not valid
+    CUDF_EXPECTS(name.size_bytes() > 0, "Invalid empty name in JSONPath query string");
+
+    return true;
+  }
+};
+
+/**
+ * @brief Preprocess the incoming JSONPath string on the host to generate a
+ * command buffer for use by the GPU.
+ *
+ * @param json_path The incoming json path
+ * @param stream CUDA stream to perform any GPU actions on
+ * @returns A pair containing the command buffer and the maximum stack depth required.
+ */
+std::pair<thrust::optional<rmm::device_uvector<path_operator>>, int> build_command_buffer(
+  cudf::string_scalar const& json_path, rmm::cuda_stream_view stream)
+{
+  std::string h_json_path = json_path.to_string(stream);
+  path_state p_state(h_json_path.data(), static_cast<size_t>(h_json_path.size()));
+
+  std::vector<path_operator> h_operators;
+
+  path_operator op;
+  int max_stack_depth = 1;
+  do {
+    op = p_state.get_next_operator();
+    if (op.type == path_operator_type::ERROR) {
+      CUDF_FAIL("Encountered invalid JSONPath input string");
+    }
+    if (op.type == path_operator_type::CHILD_WILDCARD) { max_stack_depth++; }
+    // convert pointer to device pointer
+    if (op.name.size_bytes() > 0) {
+      op.name =
+        string_view(json_path.data() + (op.name.data() - h_json_path.data()), op.name.size_bytes());
+    }
+    if (op.type == path_operator_type::ROOT) {
+      CUDF_EXPECTS(h_operators.size() == 0, "Root operator ($) can only exist at the root");
+    }
+    // if we haven't gotten a root operator to start, and we're not empty, quietly push a
+    // root operator now.
+    if (h_operators.size() == 0 && op.type != path_operator_type::ROOT &&
+        op.type != path_operator_type::END) {
+      h_operators.push_back(path_operator{path_operator_type::ROOT});
+    }
+    h_operators.push_back(op);
+  } while (op.type != path_operator_type::END);
+
+  auto const is_empty = h_operators.size() == 1 && h_operators[0].type == path_operator_type::END;
+  return is_empty
+           ? 
std::make_pair(thrust::nullopt, 0)
+           : std::make_pair(
+               thrust::make_optional(cudf::detail::make_device_uvector_sync(h_operators, stream)),
+               max_stack_depth);
+}
+
+#define PARSE_TRY(_x)                                                       \
+  do {                                                                      \
+    last_result = _x;                                                       \
+    if (last_result == parse_result::ERROR) { return parse_result::ERROR; } \
+  } while (0)
+
+/**
+ * @brief Parse a single json string using the provided command buffer
+ *
+ * @param j_state The incoming json string and associated parser
+ * @param commands The command buffer to be applied to the string. Always ends with a
+ * path_operator_type::END
+ * @param output Buffer used to store the results of the query
+ * @returns A result code indicating success/fail/empty.
+ */
+template <int max_command_stack_depth>
+__device__ parse_result parse_json_path(json_state& j_state,
+                                        path_operator const* commands,
+                                        json_output& output)
+{
+  // manually maintained context stack in lieu of calling parse_json_path recursively.
+  struct context {
+    json_state j_state;
+    path_operator const* commands;
+    bool list_element;
+    bool state_flag;
+  };
+  context stack[max_command_stack_depth];
+  int stack_pos     = 0;
+  auto push_context = [&stack, &stack_pos](json_state const& _j_state,
+                                           path_operator const* _commands,
+                                           bool _list_element = false,
+                                           bool _state_flag   = false) {
+    if (stack_pos == max_command_stack_depth - 1) { return false; }
+    stack[stack_pos++] = context{_j_state, _commands, _list_element, _state_flag};
+    return true;
+  };
+  auto pop_context = [&stack, &stack_pos](context& c) {
+    if (stack_pos > 0) {
+      c = stack[--stack_pos];
+      return true;
+    }
+    return false;
+  };
+  push_context(j_state, commands, false);
+
+  parse_result last_result = parse_result::SUCCESS;
+  context ctx;
+  int element_count = 0;
+  while (pop_context(ctx)) {
+    path_operator op = *ctx.commands;
+
+    switch (op.type) {
+      // whatever the first object is
+      case path_operator_type::ROOT:
+        PARSE_TRY(ctx.j_state.next_element());
+        push_context(ctx.j_state, ctx.commands + 1);
+        break;
+
+      // .name
+      // ['name']
+      // [1]
+      // will return a single thing
+      case path_operator_type::CHILD: {
+        PARSE_TRY(ctx.j_state.child_element(op.expected_type));
+        if (last_result == parse_result::SUCCESS) {
+          PARSE_TRY(ctx.j_state.next_matching_element(op.name, true));
+          if (last_result == parse_result::SUCCESS) {
+            push_context(ctx.j_state, ctx.commands + 1, ctx.list_element);
+          }
+        }
+      } break;
+
+      // .*
+      // [*]
+      // will return an array of things
+      case path_operator_type::CHILD_WILDCARD: {
+        // if we're on the first element of this wildcard
+        if (!ctx.state_flag) {
+          // we will only ever be returning 1 array
+          if (!ctx.list_element) { output.add_output({"[" DEBUG_NEWLINE, 1 + DEBUG_NEWLINE_LEN}); }
+
+          // step into the child element
+          PARSE_TRY(ctx.j_state.child_element(op.expected_type));
+          if (last_result == parse_result::EMPTY) {
+            if (!ctx.list_element) {
+              output.add_output({"]" DEBUG_NEWLINE, 1 + DEBUG_NEWLINE_LEN});
+            }
+            last_result = parse_result::SUCCESS;
+            break;
+          }
+
+          // first element
+          PARSE_TRY(ctx.j_state.next_matching_element({"*", 1}, true));
+          if (last_result == parse_result::EMPTY) {
+            if (!ctx.list_element) {
+              output.add_output({"]" DEBUG_NEWLINE, 1 + DEBUG_NEWLINE_LEN});
+            }
+            last_result = parse_result::SUCCESS;
+            break;
+          }
+
+          // re-push ourselves
+          push_context(ctx.j_state, ctx.commands, ctx.list_element, true);
+          // push the next command
+          push_context(ctx.j_state, ctx.commands + 1, true);
+        } else {
+          // next element
+          PARSE_TRY(ctx.j_state.next_matching_element({"*", 1}, false));
+          if (last_result == parse_result::EMPTY) {
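+            // wildcard exhausted: close out the array below (if this context owns it)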
+            if (!ctx.list_element) {
+              output.add_output({"]" DEBUG_NEWLINE, 1 + DEBUG_NEWLINE_LEN});
+            }
+            last_result = parse_result::SUCCESS;
+            break;
+          }
+
+          // re-push ourselves
+          push_context(ctx.j_state, ctx.commands, ctx.list_element, true);
+          // push the next command
+          push_context(ctx.j_state, ctx.commands + 1, true);
+        }
+      } break;
+
+      // [0]
+      // [1]
+      // etc
+      // returns a single thing
+      case path_operator_type::CHILD_INDEX: {
+        PARSE_TRY(ctx.j_state.child_element(op.expected_type));
+        if (last_result == parse_result::SUCCESS) {
+          string_view const any{"*", 1};
+          PARSE_TRY(ctx.j_state.next_matching_element(any, true));
+          if (last_result == parse_result::SUCCESS) {
+            int idx;
+            for (idx = 1; idx <= op.index; idx++) {
+              PARSE_TRY(ctx.j_state.next_matching_element(any, false));
+              if (last_result == parse_result::EMPTY) { break; }
+            }
+            // if we didn't end up at the index we requested, this is an invalid index
+            if (idx - 1 != op.index) { return parse_result::ERROR; }
+            push_context(ctx.j_state, ctx.commands + 1, ctx.list_element);
+          }
+        }
+      } break;
+
+      // some sort of error.
+      case path_operator_type::ERROR: return parse_result::ERROR; break;
+
+      // END case
+      default: {
+        if (ctx.list_element && element_count > 0) {
+          output.add_output({"," DEBUG_NEWLINE, 1 + DEBUG_NEWLINE_LEN});
+        }
+        PARSE_TRY(ctx.j_state.extract_element(&output, ctx.list_element));
+        if (ctx.list_element && last_result != parse_result::EMPTY) { element_count++; }
+      } break;
+    }
+  }
+
+  return parse_result::SUCCESS;
+}
+
+// hardcoding this for now. to reach a stack depth of 8 would require
+// a JSONPath containing 7 nested wildcards so this is probably reasonable.
+constexpr int max_command_stack_depth = 8;
+
+/**
+ * @brief Parse a single json string using the provided command buffer
+ *
+ * This function exists primarily as a shim for debugging purposes.
+ *
+ * @param input The incoming json string
+ * @param input_len Size of the incoming json string
+ * @param commands The command buffer to be applied to the string. Always ends with a
+ * path_operator_type::END
+ * @param out_buf Buffer used to store the results of the query (nullptr in the size computation
+ * step)
+ * @param out_buf_size Size of the output buffer
+ * @returns A pair containing the result code and the output buffer.
+ */
+__device__ thrust::pair<parse_result, json_output> get_json_object_single(
+  char const* input,
+  size_t input_len,
+  path_operator const* const commands,
+  char* out_buf,
+  size_t out_buf_size)
+{
+  json_state j_state(input, input_len);
+  json_output output{out_buf_size, out_buf};
+
+  auto const result = parse_json_path<max_command_stack_depth>(j_state, commands, output);
+
+  return {result, output};
+}
+
+/**
+ * @brief Kernel for running the JSONPath query.
+ *
+ * This kernel operates in two passes. On the first pass, it computes
+ * output sizes. 
On the second pass it fills in the provided output buffers + * (chars and validity) + * + * @param col Device view of the incoming string + * @param commands JSONPath command buffer + * @param output_offsets Buffer used to store the string offsets for the results of the query + * @param out_buf Buffer used to store the results of the query + * @param out_validity Output validity buffer + * @param out_valid_count Output count of # of valid bits + */ +template +__launch_bounds__(block_size) __global__ + void get_json_object_kernel(column_device_view col, + path_operator const* const commands, + offset_type* output_offsets, + thrust::optional out_buf, + thrust::optional out_validity, + thrust::optional out_valid_count) +{ + size_type tid = threadIdx.x + (blockDim.x * blockIdx.x); + size_type stride = blockDim.x * gridDim.x; + + if (out_valid_count.has_value()) { *(out_valid_count.value()) = 0; } + size_type warp_valid_count{0}; + + auto active_threads = __ballot_sync(0xffffffff, tid < col.size()); + while (tid < col.size()) { + bool is_valid = false; + string_view const str = col.element(tid); + size_type output_size = 0; + if (str.size_bytes() > 0) { + char* dst = out_buf.has_value() ? out_buf.value() + output_offsets[tid] : nullptr; + size_t const dst_size = + out_buf.has_value() ? output_offsets[tid + 1] - output_offsets[tid] : 0; + + parse_result result; + json_output out; + thrust::tie(result, out) = + get_json_object_single(str.data(), str.size_bytes(), commands, dst, dst_size); + output_size = out.output_len.value_or(0); + if (out.output_len.has_value() && result == parse_result::SUCCESS) { is_valid = true; } + } + + // filled in only during the precompute step. during the compute step, the offsets + // are fed back in so we do -not- want to write them out + if (!out_buf.has_value()) { output_offsets[tid] = static_cast(output_size); } + + // validity filled in only during the output step + if (out_validity.has_value()) { + uint32_t mask = __ballot_sync(active_threads, is_valid); + // 0th lane of the warp writes the validity + if (!(tid % cudf::detail::warp_size)) { + out_validity.value()[cudf::word_index(tid)] = mask; + warp_valid_count += __popc(mask); + } + } + + tid += stride; + active_threads = __ballot_sync(active_threads, tid < col.size()); + } + + // sum the valid counts across the whole block + if (out_valid_count) { + size_type block_valid_count = + cudf::detail::single_lane_block_sum_reduce(warp_valid_count); + if (threadIdx.x == 0) { atomicAdd(out_valid_count.value(), block_valid_count); } + } +} + +/** + * @copydoc cudf::strings::detail::get_json_object + */ +std::unique_ptr get_json_object(cudf::strings_column_view const& col, + cudf::string_scalar const& json_path, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + // preprocess the json_path into a command buffer + auto preprocess = build_command_buffer(json_path, stream); + CUDF_EXPECTS(std::get<1>(preprocess) <= max_command_stack_depth, + "Encountered JSONPath string that is too complex"); + + // allocate output offsets buffer. 
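+  // (sized col.size() + 1: after the size-computation pass and the exclusive
+  // scan below, entry i is the starting offset of row i and the final entry is
+  // the total number of output chars)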
+ auto offsets = cudf::make_fixed_width_column( + data_type{type_id::INT32}, col.size() + 1, mask_state::UNALLOCATED, stream, mr); + cudf::mutable_column_view offsets_view(*offsets); + + // if the query is empty, return a string column containing all nulls + if (!std::get<0>(preprocess).has_value()) { + return std::make_unique( + data_type{type_id::STRING}, + col.size(), + rmm::device_buffer{0, stream, mr}, // no data + cudf::detail::create_null_mask(col.size(), mask_state::ALL_NULL, stream, mr), + col.size()); // null count + } + + constexpr int block_size = 512; + cudf::detail::grid_1d const grid{col.size(), block_size}; + + auto cdv = column_device_view::create(col.parent(), stream); + + // preprocess sizes (returned in the offsets buffer) + get_json_object_kernel + <<>>( + *cdv, + std::get<0>(preprocess).value().data(), + offsets_view.head(), + thrust::nullopt, + thrust::nullopt, + thrust::nullopt); + + // convert sizes to offsets + thrust::exclusive_scan(rmm::exec_policy(stream), + offsets_view.head(), + offsets_view.head() + col.size() + 1, + offsets_view.head(), + 0); + size_type const output_size = + cudf::detail::get_value(offsets_view, col.size(), stream); + + // allocate output string column + auto chars = cudf::make_fixed_width_column( + data_type{type_id::INT8}, output_size, mask_state::UNALLOCATED, stream, mr); + + // potential optimization : if we know that all outputs are valid, we could skip creating + // the validity mask altogether + rmm::device_buffer validity = + cudf::detail::create_null_mask(col.size(), mask_state::UNINITIALIZED, stream, mr); + + // compute results + cudf::mutable_column_view chars_view(*chars); + rmm::device_scalar d_valid_count{0, stream}; + get_json_object_kernel + <<>>( + *cdv, + std::get<0>(preprocess).value().data(), + offsets_view.head(), + chars_view.head(), + static_cast(validity.data()), + d_valid_count.data()); + + return make_strings_column(col.size(), + std::move(offsets), + std::move(chars), + col.size() - d_valid_count.value(), + std::move(validity), + stream, + mr); +} + +} // namespace +} // namespace detail + +/** + * @copydoc cudf::strings::get_json_object + */ +std::unique_ptr get_json_object(cudf::strings_column_view const& col, + cudf::string_scalar const& json_path, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::get_json_object(col, json_path, 0, mr); +} + +} // namespace strings +} // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 082f039054e..f9904dda49e 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -334,6 +334,7 @@ ConfigureTest(STRINGS_TEST strings/hash_string.cu strings/integers_tests.cu strings/ipv4_tests.cpp + strings/json_tests.cpp strings/pad_tests.cpp strings/replace_regex_tests.cpp strings/replace_tests.cpp diff --git a/cpp/tests/hashing/hash_test.cpp b/cpp/tests/hashing/hash_test.cpp index 5641d445ff3..d928a17b3d1 100644 --- a/cpp/tests/hashing/hash_test.cpp +++ b/cpp/tests/hashing/hash_test.cpp @@ -257,20 +257,35 @@ TEST_F(SerialMurmurHash3Test, MultiValueWithSeeds) fixed_width_column_wrapper const bools_col1({0, 1, 1, 1, 0}); fixed_width_column_wrapper const bools_col2({0, 1, 2, 255, 0}); - auto const input1 = cudf::table_view({strings_col}); - auto const input2 = cudf::table_view({ints_col}); - auto const input3 = cudf::table_view({strings_col, ints_col, bools_col1}); - auto const input4 = cudf::table_view({strings_col, ints_col, bools_col2}); - - auto const hashed_output1 = cudf::hash(input1, 
cudf::hash_id::HASH_SERIAL_MURMUR3, {}, 314); - auto const hashed_output2 = cudf::hash(input2, cudf::hash_id::HASH_SERIAL_MURMUR3, {}, 42); - auto const hashed_output3 = cudf::hash(input3, cudf::hash_id::HASH_SERIAL_MURMUR3, {}); - auto const hashed_output4 = cudf::hash(input4, cudf::hash_id::HASH_SERIAL_MURMUR3, {}); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(hashed_output1->view(), strings_col_result, true); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(hashed_output2->view(), ints_col_result, true); - EXPECT_EQ(input3.num_rows(), hashed_output3->size()); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(hashed_output3->view(), hashed_output4->view(), true); + std::vector> struct_field_cols; + struct_field_cols.emplace_back(std::make_unique(strings_col)); + struct_field_cols.emplace_back(std::make_unique(ints_col)); + struct_field_cols.emplace_back(std::make_unique(bools_col1)); + structs_column_wrapper structs_col(std::move(struct_field_cols)); + + auto const combo1 = cudf::table_view({strings_col, ints_col, bools_col1}); + auto const combo2 = cudf::table_view({strings_col, ints_col, bools_col2}); + + constexpr auto hasher = cudf::hash_id::HASH_SERIAL_MURMUR3; + auto const strings_hash = cudf::hash(cudf::table_view({strings_col}), hasher, {}, 314); + auto const ints_hash = cudf::hash(cudf::table_view({ints_col}), hasher, {}, 42); + auto const combo1_hash = cudf::hash(combo1, hasher, {}); + auto const combo2_hash = cudf::hash(combo2, hasher, {}); + auto const structs_hash = cudf::hash(cudf::table_view({structs_col}), hasher, {}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*strings_hash, strings_col_result, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*ints_hash, ints_col_result, true); + EXPECT_EQ(combo1.num_rows(), combo1_hash->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*combo1_hash, *combo2_hash, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*structs_hash, *combo1_hash, true); +} + +TEST_F(SerialMurmurHash3Test, ListThrows) +{ + lists_column_wrapper strings_list_col({{""}, {"abc"}, {"123"}}); + EXPECT_THROW( + cudf::hash(cudf::table_view({strings_list_col}), cudf::hash_id::HASH_SERIAL_MURMUR3, {}), + cudf::logic_error); } class SparkMurmurHash3Test : public cudf::test::BaseFixture { @@ -280,31 +295,38 @@ TEST_F(SparkMurmurHash3Test, MultiValueWithSeeds) { // The hash values were determined by running the following Scala code in Apache Spark: // import org.apache.spark.sql.catalyst.util.DateTimeUtils - // val schema = new StructType().add("strings",StringType).add("doubles",DoubleType) - // .add("timestamps",TimestampType).add("decimal64", DecimalType(18,7)).add("longs",LongType) - // .add("floats",FloatType).add("dates",DateType).add("decimal32", DecimalType(9,3)) - // .add("ints",IntegerType).add("shorts",ShortType).add("bytes",ByteType) - // .add("bools",BooleanType) + // val schema = new StructType().add("structs", new StructType().add("a",IntegerType) + // .add("b",StringType).add("c",new StructType().add("x",FloatType).add("y",LongType))) + // .add("strings",StringType).add("doubles",DoubleType).add("timestamps",TimestampType) + // .add("decimal64", DecimalType(18,7)).add("longs",LongType).add("floats",FloatType) + // .add("dates",DateType).add("decimal32", DecimalType(9,3)).add("ints",IntegerType) + // .add("shorts",ShortType).add("bytes",ByteType).add("bools",BooleanType) // val data = Seq( - // Row("", 0.toDouble, DateTimeUtils.toJavaTimestamp(0), BigDecimal(0), 0.toLong, 0.toFloat, - // DateTimeUtils.toJavaDate(0), BigDecimal(0), 0, 0.toShort, 0.toByte, false), - // Row("The quick brown fox", -(0.toDouble), 
DateTimeUtils.toJavaTimestamp(100), - // BigDecimal("0.00001"), 100.toLong, -(0.toFloat), DateTimeUtils.toJavaDate(100), - // BigDecimal("0.1"), 100, 100.toShort, 100.toByte, true), - // Row("jumps over the lazy dog.", -Double.NaN, DateTimeUtils.toJavaTimestamp(-100), - // BigDecimal("-0.00001"), -100.toLong, -Float.NaN, DateTimeUtils.toJavaDate(-100), - // BigDecimal("-0.1"), -100, -100.toShort, -100.toByte, true), - // Row("All work and no play makes Jack a dull boy", Double.MinValue, - // DateTimeUtils.toJavaTimestamp(Long.MinValue/1000000), BigDecimal("-99999999999.9999999"), - // Long.MinValue, Float.MinValue, DateTimeUtils.toJavaDate(Int.MinValue/100), - // BigDecimal("-999999.999"), Int.MinValue, Short.MinValue, Byte.MinValue, true), - // Row("!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\ud720\ud721", Double.MaxValue, - // DateTimeUtils.toJavaTimestamp(Long.MaxValue/1000000), BigDecimal("99999999999.9999999"), - // Long.MaxValue, Float.MaxValue, DateTimeUtils.toJavaDate(Int.MaxValue/100), - // BigDecimal("999999.999"), Int.MaxValue, Short.MaxValue, Byte.MaxValue, false)) + // Row(Row(0, "a", Row(0f, 0L)), "", 0.toDouble, DateTimeUtils.toJavaTimestamp(0), BigDecimal(0), + // 0.toLong, 0.toFloat, DateTimeUtils.toJavaDate(0), BigDecimal(0), 0, 0.toShort, 0.toByte, + // false), + // Row(Row(100, "bc", Row(100f, 100L)), "The quick brown fox", -(0.toDouble), + // DateTimeUtils.toJavaTimestamp(100), BigDecimal("0.00001"), 100.toLong, -(0.toFloat), + // DateTimeUtils.toJavaDate(100), BigDecimal("0.1"), 100, 100.toShort, 100.toByte, true), + // Row(Row(-100, "def", Row(-100f, -100L)), "jumps over the lazy dog.", -Double.NaN, + // DateTimeUtils.toJavaTimestamp(-100), BigDecimal("-0.00001"), -100.toLong, -Float.NaN, + // DateTimeUtils.toJavaDate(-100), BigDecimal("-0.1"), -100, -100.toShort, -100.toByte, + // true), + // Row(Row(0x12345678, "ghij", Row(Float.PositiveInfinity, 0x123456789abcdefL)), + // "All work and no play makes Jack a dull boy", Double.MinValue, + // DateTimeUtils.toJavaTimestamp(Long.MinValue/1000000), BigDecimal("-99999999999.9999999"), + // Long.MinValue, Float.MinValue, DateTimeUtils.toJavaDate(Int.MinValue/100), + // BigDecimal("-999999.999"), Int.MinValue, Short.MinValue, Byte.MinValue, true), + // Row(Row(-0x76543210, "klmno", Row(Float.NegativeInfinity, -0x123456789abcdefL)), + // "!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\ud720\ud721", Double.MaxValue, + // DateTimeUtils.toJavaTimestamp(Long.MaxValue/1000000), BigDecimal("99999999999.9999999"), + // Long.MaxValue, Float.MaxValue, DateTimeUtils.toJavaDate(Int.MaxValue/100), + // BigDecimal("999999.999"), Int.MaxValue, Short.MaxValue, Byte.MaxValue, false)) // val df = spark.createDataFrame(sc.parallelize(data), schema) // df.columns.foreach(c => println(s"$c => ${df.select(hash(col(c))).collect.mkString(",")}")) // df.select(hash(col("*"))).collect + fixed_width_column_wrapper const hash_structs_expected( + {-105406170, 90479889, -678041645, 1667387937, 301478567}); fixed_width_column_wrapper const hash_strings_expected( {1467149710, 723257560, -1620282500, -2001858707, 1588473657}); fixed_width_column_wrapper const hash_doubles_expected( @@ -330,18 +352,26 @@ TEST_F(SparkMurmurHash3Test, MultiValueWithSeeds) fixed_width_column_wrapper const hash_bools_expected( {933211791, -559580957, -559580957, -559580957, 933211791}); fixed_width_column_wrapper const hash_combined_expected( - {-1947042614, -1731440908, 807283935, 725489209, 822276819}); + {-1172364561, -442972638, 1213234395, 796626751, 214075225}); + + using double_limits = 
std::numeric_limits; + using long_limits = std::numeric_limits; + using float_limits = std::numeric_limits; + using int_limits = std::numeric_limits; + fixed_width_column_wrapper a_col{0, 100, -100, 0x12345678, -0x76543210}; + strings_column_wrapper b_col{"a", "bc", "def", "ghij", "klmno"}; + fixed_width_column_wrapper x_col{ + 0.f, 100.f, -100.f, float_limits::infinity(), -float_limits::infinity()}; + fixed_width_column_wrapper y_col{ + 0L, 100L, -100L, 0x123456789abcdefL, -0x123456789abcdefL}; + structs_column_wrapper c_col{{x_col, y_col}}; + structs_column_wrapper const structs_col{{a_col, b_col, c_col}}; strings_column_wrapper const strings_col({"", "The quick brown fox", "jumps over the lazy dog.", "All work and no play makes Jack a dull boy", "!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\ud720\ud721"}); - - using double_limits = std::numeric_limits; - using long_limits = std::numeric_limits; - using float_limits = std::numeric_limits; - using int_limits = std::numeric_limits; fixed_width_column_wrapper const doubles_col( {0., -0., -double_limits::quiet_NaN(), double_limits::lowest(), double_limits::max()}); fixed_width_column_wrapper const timestamps_col( @@ -364,6 +394,7 @@ TEST_F(SparkMurmurHash3Test, MultiValueWithSeeds) fixed_width_column_wrapper const bools_col2({0, 1, 2, 255, 0}); constexpr auto hasher = cudf::hash_id::HASH_SPARK_MURMUR3; + auto const hash_structs = cudf::hash(cudf::table_view({structs_col}), hasher, {}, 42); auto const hash_strings = cudf::hash(cudf::table_view({strings_col}), hasher, {}, 314); auto const hash_doubles = cudf::hash(cudf::table_view({doubles_col}), hasher, {}, 42); auto const hash_timestamps = cudf::hash(cudf::table_view({timestamps_col}), hasher, {}, 42); @@ -378,6 +409,7 @@ TEST_F(SparkMurmurHash3Test, MultiValueWithSeeds) auto const hash_bools1 = cudf::hash(cudf::table_view({bools_col1}), hasher, {}, 42); auto const hash_bools2 = cudf::hash(cudf::table_view({bools_col2}), hasher, {}, 42); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_structs, hash_structs_expected, true); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_strings, hash_strings_expected, true); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_doubles, hash_doubles_expected, true); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_timestamps, hash_timestamps_expected, true); @@ -392,7 +424,8 @@ TEST_F(SparkMurmurHash3Test, MultiValueWithSeeds) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_bools1, hash_bools_expected, true); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_bools2, hash_bools_expected, true); - auto const combined_table = cudf::table_view({strings_col, + auto const combined_table = cudf::table_view({structs_col, + strings_col, doubles_col, timestamps_col, decimal64_col, @@ -408,6 +441,14 @@ TEST_F(SparkMurmurHash3Test, MultiValueWithSeeds) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_combined, hash_combined_expected, true); } +TEST_F(SparkMurmurHash3Test, ListThrows) +{ + lists_column_wrapper strings_list_col({{""}, {"abc"}, {"123"}}); + EXPECT_THROW( + cudf::hash(cudf::table_view({strings_list_col}), cudf::hash_id::HASH_SPARK_MURMUR3, {}), + cudf::logic_error); +} + class MD5HashTest : public cudf::test::BaseFixture { }; diff --git a/cpp/tests/lists/drop_list_duplicates_tests.cpp b/cpp/tests/lists/drop_list_duplicates_tests.cpp index 0948ba96f62..bc413fd220a 100644 --- a/cpp/tests/lists/drop_list_duplicates_tests.cpp +++ b/cpp/tests/lists/drop_list_duplicates_tests.cpp @@ -14,174 +14,241 @@ * limitations under the License. 
*/ -#include - #include #include +#include + +#include +#include + +#include +#include -using float_type = float; using int_type = int32_t; -using INT_LCW = cudf::test::lists_column_wrapper; -using FLT_LCW = cudf::test::lists_column_wrapper; -using STR_LCW = cudf::test::lists_column_wrapper; +using float_type = float; + +using LIST_COL_FLT = cudf::test::lists_column_wrapper; +using LIST_COL_STR = cudf::test::lists_column_wrapper; -template +auto constexpr neg_NaN = -std::numeric_limits::quiet_NaN(); +auto constexpr neg_Inf = -std::numeric_limits::infinity(); +auto constexpr NaN = std::numeric_limits::quiet_NaN(); +auto constexpr Inf = std::numeric_limits::infinity(); + +template void test_once(cudf::column_view const& input, LCW const& expected, cudf::null_equality nulls_equal = cudf::null_equality::EQUAL) { auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{input}, nulls_equal); - if (equal_test) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, true); + if (cudf::is_floating_point(input.type())) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); } else { - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); } } struct DropListDuplicatesTest : public cudf::test::BaseFixture { }; -TEST_F(DropListDuplicatesTest, InvalidCasesTests) +TEST_F(DropListDuplicatesTest, FloatingPointTestsWithSignedZero) { - // Lists of nested types are not supported - EXPECT_THROW( - cudf::lists::drop_list_duplicates(cudf::lists_column_view{INT_LCW{INT_LCW{{1, 2}, {3}}}}), - cudf::logic_error); - EXPECT_THROW( - cudf::lists::drop_list_duplicates(cudf::lists_column_view{FLT_LCW{FLT_LCW{{1, 2}, {3}}}}), - cudf::logic_error); - EXPECT_THROW( - cudf::lists::drop_list_duplicates(cudf::lists_column_view{STR_LCW{STR_LCW{STR_LCW{"string"}}}}), - cudf::logic_error); + // -0.0 and 0.0 should be considered equal + test_once(LIST_COL_FLT{0.0, 1, 2, -0.0, 1, 2, 0.0, 1, 2, -0.0, -0.0, 0.0, 0.0}, + LIST_COL_FLT{0, 1, 2}); +} + +TEST_F(DropListDuplicatesTest, FloatingPointTestsWithInf) +{ + // Lists contain inf + test_once(LIST_COL_FLT{0, 1, 2, 0, 1, 2, 0, 1, 2, Inf, Inf, Inf}, LIST_COL_FLT{0, 1, 2, Inf}); + test_once(LIST_COL_FLT{Inf, 0, neg_Inf, 0, Inf, 0, neg_Inf, 0, Inf, 0, neg_Inf}, + LIST_COL_FLT{neg_Inf, 0, Inf}); +} + +// The position of NaN is undefined after sorting, thus we need to offload the data to CPU to +// check for validity +// We will not store NaN in the results_expected variable (an unordered_set) because we can't check +// for NaN existence in a set. Instead, we will count the number of NaNs in the input and compare +// with the number of NaNs in the output. +static void test_floating_point(std::vector const& h_input, + std::unordered_set const& results_expected, + cudf::nan_equality nans_equal) +{ + // If NaNs are considered as equal value, the final result should always contain at max ONE NaN + // entry per list + std::size_t const num_NaNs = + nans_equal == cudf::nan_equality::ALL_EQUAL + ? 
std::size_t{1} + : std::count_if(h_input.begin(), h_input.end(), [](auto x) { return std::isnan(x); }); + + auto const results_col = cudf::lists::drop_list_duplicates( + cudf::lists_column_view{LIST_COL_FLT(h_input.begin(), h_input.end())}, + cudf::null_equality::EQUAL, + nans_equal); + auto const results_arr = + cudf::test::to_host(cudf::lists_column_view(results_col->view()).child()).first; + + EXPECT_EQ(results_arr.size(), results_expected.size() + num_NaNs); + + std::size_t NaN_count{0}; + std::unordered_set results; + for (auto const x : results_arr) { + if (std::isnan(x)) { + ++NaN_count; + } else { + results.insert(x); + } + } + EXPECT_TRUE(results_expected.size() == results.size() && NaN_count == num_NaNs); } -TEST_F(DropListDuplicatesTest, FloatingPointTestsNonNull) +TEST_F(DropListDuplicatesTest, FloatingPointTestsWithNaNs) +{ + std::vector h_input{ + 0, -1, 1, NaN, 2, 0, neg_NaN, 1, -2, 2, 0, 1, 2, neg_NaN, NaN, NaN, NaN, neg_NaN}; + std::unordered_set results_expected{-2, -1, 0, 1, 2}; + test_floating_point(h_input, results_expected, cudf::nan_equality::UNEQUAL); + test_floating_point(h_input, results_expected, cudf::nan_equality::ALL_EQUAL); +} + +TEST_F(DropListDuplicatesTest, FloatingPointTestsWithInfsAndNaNs) +{ + std::vector h_input{neg_Inf, 0, neg_NaN, 1, -1, -2, NaN, NaN, Inf, NaN, + neg_NaN, 2, -1, 0, neg_NaN, 1, 2, Inf, 0, 1, + neg_Inf, 2, neg_NaN, Inf, neg_NaN, neg_NaN, NaN, neg_Inf}; + std::unordered_set results_expected{-2, -1, 0, 1, 2, neg_Inf, Inf}; + test_floating_point(h_input, results_expected, cudf::nan_equality::UNEQUAL); + test_floating_point(h_input, results_expected, cudf::nan_equality::ALL_EQUAL); +} + +TEST_F(DropListDuplicatesTest, StringTestsNonNull) { // Trivial cases - test_once(FLT_LCW{{}}, FLT_LCW{{}}); - test_once(FLT_LCW{{0, 1, 2, 3, 4, 5}, {}}, FLT_LCW{{0, 1, 2, 3, 4, 5}, {}}); + test_once(LIST_COL_STR{{}}, LIST_COL_STR{{}}); + test_once(LIST_COL_STR{"this", "is", "a", "string"}, LIST_COL_STR{"a", "is", "string", "this"}); - // Multiple empty lists - test_once(FLT_LCW{{}, {}, {5, 4, 3, 2, 1, 0}, {}, {6}, {}}, - FLT_LCW{{}, {}, {0, 1, 2, 3, 4, 5}, {}, {6}, {}}); + // One list column + test_once(LIST_COL_STR{"this", "is", "is", "is", "a", "string", "string"}, + LIST_COL_STR{"a", "is", "string", "this"}); - auto constexpr p_inf = std::numeric_limits::infinity(); - auto constexpr m_inf = -std::numeric_limits::infinity(); + // Multiple lists column + test_once( + LIST_COL_STR{LIST_COL_STR{"this", "is", "a", "no duplicate", "string"}, + LIST_COL_STR{"this", "is", "is", "a", "one duplicate", "string"}, + LIST_COL_STR{"this", "is", "is", "is", "a", "two duplicates", "string"}, + LIST_COL_STR{"this", "is", "is", "is", "is", "a", "three duplicates", "string"}}, + LIST_COL_STR{LIST_COL_STR{"a", "is", "no duplicate", "string", "this"}, + LIST_COL_STR{"a", "is", "one duplicate", "string", "this"}, + LIST_COL_STR{"a", "is", "string", "this", "two duplicates"}, + LIST_COL_STR{"a", "is", "string", "this", "three duplicates"}}); +} - // Lists contain inf - // We can't test for lists containing nan because the order of nan is - // undefined after sorting - test_once(FLT_LCW{0, 1, 2, 0, 1, 2, 0, 1, 2, p_inf, p_inf, p_inf}, - FLT_LCW{0, 1, 2, p_inf}); - test_once(FLT_LCW{p_inf, 0, m_inf, 0, p_inf, 0, m_inf, 0, p_inf, 0, m_inf}, - FLT_LCW{m_inf, 0, p_inf}); +TEST_F(DropListDuplicatesTest, StringTestsWithNulls) +{ + auto const null = std::string(""); + + // One list column with null entries + test_once( + LIST_COL_STR{{"this", null, "is", "is", "is", "a", null, 
"string", null, "string"}, + cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i != 1 && i != 6 && i != 8; })}, + LIST_COL_STR{{"a", "is", "string", "this", null}, + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 4; })}); + + // Multiple lists column with null lists and null entries + test_once( + LIST_COL_STR{ + {LIST_COL_STR{ + {"this", null, "is", null, "a", null, "no duplicate", null, "string"}, + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; })}, + LIST_COL_STR{}, + LIST_COL_STR{"this", "is", "is", "a", "one duplicate", "string"}}, + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 1; })}, + LIST_COL_STR{{LIST_COL_STR{{"a", "is", "no duplicate", "string", "this", null}, + cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i <= 4; })}, + LIST_COL_STR{}, + LIST_COL_STR{"a", "is", "one duplicate", "string", "this"}}, + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 1; })}); } -TEST_F(DropListDuplicatesTest, IntegerTestsNonNull) +template +struct DropListDuplicatesTypedTest : public cudf::test::BaseFixture { +}; +#define LIST_COL cudf::test::lists_column_wrapper + +using TypesForTest = + cudf::test::Concat; +TYPED_TEST_CASE(DropListDuplicatesTypedTest, TypesForTest); + +TYPED_TEST(DropListDuplicatesTypedTest, InvalidInputTests) { + // Lists of nested types are not supported + EXPECT_THROW( + cudf::lists::drop_list_duplicates(cudf::lists_column_view{LIST_COL{LIST_COL{{1, 2}, {3}}}}), + cudf::logic_error); +} + +TYPED_TEST(DropListDuplicatesTypedTest, TrivialInputTests) +{ + // Empty input + test_once(LIST_COL{{}}, LIST_COL{{}}); + // Trivial cases - test_once(INT_LCW{{}}, INT_LCW{{}}); - test_once(INT_LCW{{0, 1, 2, 3, 4, 5}, {}}, INT_LCW{{0, 1, 2, 3, 4, 5}, {}}); + test_once(LIST_COL{0, 1, 2, 3, 4, 5}, LIST_COL{0, 1, 2, 3, 4, 5}); // Multiple empty lists - test_once(INT_LCW{{}, {}, {5, 4, 3, 2, 1, 0}, {}, {6}, {}}, - INT_LCW{{}, {}, {0, 1, 2, 3, 4, 5}, {}, {6}, {}}); + test_once(LIST_COL{{}, {}, {5, 4, 3, 2, 1, 0}, {}, {6}, {}}, + LIST_COL{{}, {}, {0, 1, 2, 3, 4, 5}, {}, {6}, {}}); +} +TYPED_TEST(DropListDuplicatesTypedTest, NonNullInputTests) +{ // Adjacent lists containing the same entries - test_once( - INT_LCW{{1, 1, 1, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 1, 2, 2, 2}, {2, 2, 2, 2, 3, 3, 3, 3}}, - INT_LCW{{1}, {1, 2}, {2, 3}}); + test_once(LIST_COL{{1, 1, 1, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 1, 2, 2, 2}, {2, 2, 2, 2, 3, 3, 3, 3}}, + LIST_COL{{1}, {1, 2}, {2, 3}}); // Sliced list column - auto const list0 = INT_LCW{{1, 2, 3, 2, 3, 2, 3, 2, 3}, {3, 2, 1, 4, 1}, {5}, {10, 8, 9}, {6, 7}}; + auto const list0 = + LIST_COL{{1, 2, 3, 2, 3, 2, 3, 2, 3}, {3, 2, 1, 4, 1}, {5}, {10, 8, 9}, {6, 7}}; auto const list1 = cudf::slice(list0, {0, 5})[0]; auto const list2 = cudf::slice(list0, {1, 5})[0]; auto const list3 = cudf::slice(list0, {1, 3})[0]; auto const list4 = cudf::slice(list0, {0, 3})[0]; - test_once(list0, INT_LCW{{1, 2, 3}, {1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}); - test_once(list1, INT_LCW{{1, 2, 3}, {1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}); - test_once(list2, INT_LCW{{1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}); - test_once(list3, INT_LCW{{1, 2, 3, 4}, {5}}); - test_once(list4, INT_LCW{{1, 2, 3}, {1, 2, 3, 4}, {5}}); + test_once(list0, LIST_COL{{1, 2, 3}, {1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}); + test_once(list1, LIST_COL{{1, 2, 3}, {1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}); + test_once(list2, LIST_COL{{1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}); + 
test_once(list3, LIST_COL{{1, 2, 3, 4}, {5}}); + test_once(list4, LIST_COL{{1, 2, 3}, {1, 2, 3, 4}, {5}}); } -TEST_F(DropListDuplicatesTest, IntegerTestsWithNulls) +TYPED_TEST(DropListDuplicatesTypedTest, WithNullInputTests) { - auto constexpr null = std::numeric_limits::max(); + auto constexpr null = TypeParam{0}; // null lists - test_once(INT_LCW{{{3, 2, 1, 4, 1}, {5}, {}, {}, {10, 8, 9}, {6, 7}}, - cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return i != 2 && i != 3; })}, - INT_LCW{{{1, 2, 3, 4}, {5}, {}, {}, {8, 9, 10}, {6, 7}}, - cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return i != 2 && i != 3; })}); + test_once(LIST_COL{{{3, 2, 1, 4, 1}, {5}, {}, {}, {10, 8, 9}, {6, 7}}, + cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i != 2 && i != 3; })}, + LIST_COL{{{1, 2, 3, 4}, {5}, {}, {}, {8, 9, 10}, {6, 7}}, + cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i != 2 && i != 3; })}); // null entries are equal - test_once( - INT_LCW{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; })}, - INT_LCW{{1, 3, 5, 7, 9, null}, - std::initializer_list{true, true, true, true, true, false}}); + test_once( + LIST_COL{std::initializer_list{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; })}, + LIST_COL{std::initializer_list{1, 3, 5, 7, 9, null}, + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 5; })}); // nulls entries are not equal - test_once( - INT_LCW{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; })}, - INT_LCW{ - {1, 3, 5, 7, 9, null, null, null, null, null}, - std::initializer_list{true, true, true, true, true, false, false, false, false, false}}, + test_once( + LIST_COL{std::initializer_list{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; })}, + LIST_COL{std::initializer_list{1, 3, 5, 7, 9, null, null, null, null, null}, + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i < 5; })}, cudf::null_equality::UNEQUAL); } - -TEST_F(DropListDuplicatesTest, StringTestsNonNull) -{ - // Trivial cases - test_once(STR_LCW{{}}, STR_LCW{{}}); - test_once(STR_LCW{"this", "is", "a", "string"}, STR_LCW{"a", "is", "string", "this"}); - - // One list column - test_once(STR_LCW{"this", "is", "is", "is", "a", "string", "string"}, - STR_LCW{"a", "is", "string", "this"}); - - // Multiple lists column - test_once( - STR_LCW{STR_LCW{"this", "is", "a", "no duplicate", "string"}, - STR_LCW{"this", "is", "is", "a", "one duplicate", "string"}, - STR_LCW{"this", "is", "is", "is", "a", "two duplicates", "string"}, - STR_LCW{"this", "is", "is", "is", "is", "a", "three duplicates", "string"}}, - STR_LCW{STR_LCW{"a", "is", "no duplicate", "string", "this"}, - STR_LCW{"a", "is", "one duplicate", "string", "this"}, - STR_LCW{"a", "is", "string", "this", "two duplicates"}, - STR_LCW{"a", "is", "string", "this", "three duplicates"}}); -} - -TEST_F(DropListDuplicatesTest, StringTestsWithNulls) -{ - auto const null = std::string(""); - - // One list column with null entries - test_once( - STR_LCW{{"this", null, "is", "is", "is", "a", null, "string", null, "string"}, - cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return i != 1 && i != 6 && i != 8; })}, - STR_LCW{{"a", "is", "string", "this", null}, - 
cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 4; })}); - - // Multiple lists column with null lists and null entries - test_once( - STR_LCW{{STR_LCW{{"this", null, "is", null, "a", null, "no duplicate", null, "string"}, - cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return i % 2 == 0; })}, - STR_LCW{}, - STR_LCW{"this", "is", "is", "a", "one duplicate", "string"}}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 1; })}, - STR_LCW{ - {STR_LCW{{"a", "is", "no duplicate", "string", "this", null}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i <= 4; })}, - STR_LCW{}, - STR_LCW{"a", "is", "one duplicate", "string", "this"}}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 1; })}); -} diff --git a/cpp/tests/lists/explode_tests.cpp b/cpp/tests/lists/explode_tests.cpp index 4c7ded0efd7..ded3d2b9193 100644 --- a/cpp/tests/lists/explode_tests.cpp +++ b/cpp/tests/lists/explode_tests.cpp @@ -530,7 +530,7 @@ TEST_F(ExplodeOuterTest, SingleNull) auto ret = cudf::explode_outer(t, 0); CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); - FCW expected_pos_col{0, 0, 1, 0, 0, 1}; + FCW expected_pos_col{{0, 0, 1, 0, 0, 1}, {0, 1, 1, 0, 1, 1}}; cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); auto pos_ret = cudf::explode_outer_position(t, 0); @@ -561,7 +561,7 @@ TEST_F(ExplodeOuterTest, Nulls) auto ret = cudf::explode_outer(t, 0); CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); - FCW expected_pos_col{0, 1, 2, 0, 0, 1}; + FCW expected_pos_col{{0, 1, 2, 0, 0, 1}, {1, 1, 1, 0, 1, 1}}; cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); auto pos_ret = cudf::explode_outer_position(t, 0); @@ -591,7 +591,7 @@ TEST_F(ExplodeOuterTest, AllNulls) auto ret = cudf::explode_outer(t, 0); CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); - FCW expected_pos_col{0, 0, 0}; + FCW expected_pos_col{{0, 0, 0}, {0, 0, 0}}; cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); auto pos_ret = cudf::explode_outer_position(t, 0); @@ -624,7 +624,7 @@ TEST_F(ExplodeOuterTest, SequentialNulls) auto ret = cudf::explode_outer(t, 0); CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); - FCW expected_pos_col{0, 1, 2, 0, 1, 0, 0, 0, 1, 2}; + FCW expected_pos_col{{0, 1, 2, 0, 1, 0, 0, 0, 1, 2}, {1, 1, 0, 1, 1, 0, 0, 1, 1, 1}}; cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); auto pos_ret = cudf::explode_outer_position(t, 0); @@ -655,7 +655,7 @@ TEST_F(ExplodeOuterTest, MoreEmptyThanData) auto ret = cudf::explode_outer(t, 0); CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); - FCW expected_pos_col{0, 1, 0, 0, 0, 0, 0}; + FCW expected_pos_col{{0, 1, 0, 0, 0, 0, 0}, {1, 1, 0, 0, 0, 0, 1}}; cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); auto pos_ret = cudf::explode_outer_position(t, 0); @@ -685,7 +685,7 @@ TEST_F(ExplodeOuterTest, TrailingEmptys) auto ret = cudf::explode_outer(t, 0); CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); - FCW expected_pos_col{0, 1, 0, 0, 0, 0}; + FCW expected_pos_col{{0, 1, 0, 0, 0, 0}, {1, 1, 0, 0, 0, 0}}; cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); auto pos_ret = cudf::explode_outer_position(t, 0); @@ -718,7 +718,7 @@ TEST_F(ExplodeOuterTest, LeadingNulls) auto ret = cudf::explode_outer(t, 0); CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); - FCW expected_pos_col{0, 0, 0, 0, 0, 1}; + FCW 
expected_pos_col{{0, 0, 0, 0, 0, 1}, {0, 0, 0, 0, 1, 1}};
   cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b});
   auto pos_ret = cudf::explode_outer_position(t, 0);
@@ -753,7 +753,7 @@ TEST_F(ExplodeOuterTest, NullsInList)
   CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected);
-  FCW expected_pos_col{0, 1, 2, 0, 1, 2, 3, 0, 0, 1, 2};
+  FCW expected_pos_col{{0, 1, 2, 0, 1, 2, 3, 0, 0, 1, 2}, {1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1}};
   cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b});
   auto pos_ret = cudf::explode_outer_position(t, 0);
@@ -813,7 +813,7 @@ TEST_F(ExplodeOuterTest, NestedNulls)
   auto ret = cudf::explode_outer(t, 0);
   CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected);
-  FCW expected_pos_col{0, 1, 0, 0, 1, 2};
+  FCW expected_pos_col{{0, 1, 0, 0, 1, 2}, {1, 1, 0, 1, 1, 1}};
   cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b});
   auto pos_ret = cudf::explode_outer_position(t, 0);
@@ -884,7 +884,8 @@ TEST_F(ExplodeOuterTest, NullsInNestedDoubleExplode)
   CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected);
-  FCW expected_pos_col{0, 1, 0, 0, 1, 2, 0, 1, 0, 1, 0, 0, 1};
+  FCW expected_pos_col{{0, 1, 0, 0, 1, 2, 0, 1, 0, 1, 0, 0, 1},
+                       {1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0}};
   cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b});
   auto pos_ret = cudf::explode_outer_position(first_explode_ret->view(), 0);
diff --git a/cpp/tests/replace/replace_nulls_tests.cpp b/cpp/tests/replace/replace_nulls_tests.cpp
index bd3bf7ddd03..e969f53609e 100644
--- a/cpp/tests/replace/replace_nulls_tests.cpp
+++ b/cpp/tests/replace/replace_nulls_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright 2019, NVIDIA CORPORATION.
+ * Copyright 2019-2021, NVIDIA CORPORATION.
 *
 * Copyright 2018 BlazingDB, Inc.
 * Copyright 2018 Alexander Ocsa
@@ -23,6 +23,7 @@
 #include
 #include
+#include
 #include
 #include
 #include
@@ -437,6 +438,151 @@ TYPED_TEST(ReplaceNullsPolicyTest, FollowingFillTrailingNulls)
                              cudf::replace_policy::FOLLOWING);
 }
 
+template <typename T>
+struct ReplaceNullsFixedPointTest : public cudf::test::BaseFixture {
+};
+
+TYPED_TEST_CASE(ReplaceNullsFixedPointTest, cudf::test::FixedPointTypes);
+
+TYPED_TEST(ReplaceNullsFixedPointTest, ReplaceColumn)
+{
+  auto const scale = numeric::scale_type{0};
+  auto const sz = std::size_t{1000};
+  auto data_begin = cudf::detail::make_counting_transform_iterator(0, [&](auto i) {
+    return TypeParam{i, scale};
+  });
+  auto valid_begin =
+    cudf::detail::make_counting_transform_iterator(0, [&](auto i) { return i % 3 ? 1 : 0; });
+  auto replace_begin = cudf::detail::make_counting_transform_iterator(0, [&](auto i) {
+    return TypeParam{-2, scale};
+  });
+  auto expected_begin = cudf::detail::make_counting_transform_iterator(0, [&](auto i) {
+    int val = i % 3 ? static_cast<int>(i) : -2;
+    return TypeParam{val, scale};
+  });
+
+  ReplaceNullsColumn<TypeParam>(
+    cudf::test::fixed_width_column_wrapper<TypeParam>(data_begin, data_begin + sz, valid_begin),
+    cudf::test::fixed_width_column_wrapper<TypeParam>(replace_begin, replace_begin + sz),
+    cudf::test::fixed_width_column_wrapper<TypeParam>(expected_begin, expected_begin + sz));
+}
+
+TYPED_TEST(ReplaceNullsFixedPointTest, ReplaceColumn_Empty)
+{
+  ReplaceNullsColumn<TypeParam>(cudf::test::fixed_width_column_wrapper<TypeParam>{},
+                                cudf::test::fixed_width_column_wrapper<TypeParam>{},
+                                cudf::test::fixed_width_column_wrapper<TypeParam>{});
+}
+
+TYPED_TEST(ReplaceNullsFixedPointTest, ReplaceScalar)
+{
+  auto const scale = numeric::scale_type{0};
+  auto const sz = std::size_t{1000};
+  auto data_begin = cudf::detail::make_counting_transform_iterator(0, [&](auto i) {
+    return TypeParam{i, scale};
+  });
+  auto valid_begin =
+    cudf::detail::make_counting_transform_iterator(0, [&](auto i) { return i % 3 ? 1 : 0; });
+  auto expected_begin = cudf::detail::make_counting_transform_iterator(0, [&](auto i) {
+    int val = i % 3 ? static_cast<int>(i) : -2;
+    return TypeParam{val, scale};
+  });
+
+  cudf::fixed_point_scalar<TypeParam> replacement{-2, scale};
+
+  ReplaceNullsScalar<TypeParam>(
+    cudf::test::fixed_width_column_wrapper<TypeParam>(data_begin, data_begin + sz, valid_begin),
+    replacement,
+    cudf::test::fixed_width_column_wrapper<TypeParam>(expected_begin, expected_begin + sz));
+}
+
+TYPED_TEST(ReplaceNullsFixedPointTest, ReplacementHasNulls)
+{
+  auto const scale = numeric::scale_type{0};
+  auto const sz = std::size_t{1000};
+  auto data_begin = cudf::detail::make_counting_transform_iterator(0, [&](auto i) {
+    return TypeParam{i, scale};
+  });
+  auto data_valid_begin =
+    cudf::detail::make_counting_transform_iterator(0, [&](auto i) { return i % 3 ? 1 : 0; });
+  auto replace_begin = cudf::detail::make_counting_transform_iterator(0, [&](auto i) {
+    return TypeParam{-2, scale};
+  });
+  auto replace_valid_begin =
+    cudf::detail::make_counting_transform_iterator(0, [&](auto i) { return i % 2 ? 1 : 0; });
+  auto expected_begin = cudf::detail::make_counting_transform_iterator(0, [&](auto i) {
+    int val = i % 3 ? static_cast<int>(i) : -2;
+    return TypeParam{val, scale};
+  });
+  auto expected_valid_begin =
+    cudf::detail::make_counting_transform_iterator(0, [&](auto i) { return i % 6 ? 1 : 0; });
+
+  ReplaceNullsColumn<TypeParam>(cudf::test::fixed_width_column_wrapper<TypeParam>(
+                                  data_begin, data_begin + sz, data_valid_begin),
+                                cudf::test::fixed_width_column_wrapper<TypeParam>(
+                                  replace_begin, replace_begin + sz, replace_valid_begin),
+                                cudf::test::fixed_width_column_wrapper<TypeParam>(
+                                  expected_begin, expected_begin + sz, expected_valid_begin));
+}
+
+template <typename T>
+struct ReplaceNullsPolicyFixedPointTest : public cudf::test::BaseFixture {
+};
+
+TYPED_TEST_CASE(ReplaceNullsPolicyFixedPointTest, cudf::test::FixedPointTypes);
+
+TYPED_TEST(ReplaceNullsPolicyFixedPointTest, PrecedingFill)
+{
+  using fp = TypeParam;
+  auto const s = numeric::scale_type{0};
+  auto col = cudf::test::fixed_width_column_wrapper<fp>(
+    {fp{42, s}, fp{2, s}, fp{1, s}, fp{-10, s}, fp{20, s}, fp{-30, s}}, {1, 0, 0, 1, 0, 1});
+  auto expect_col = cudf::test::fixed_width_column_wrapper<fp>(
+    {fp{42, s}, fp{42, s}, fp{42, s}, fp{-10, s}, fp{-10, s}, fp{-30, s}}, {1, 1, 1, 1, 1, 1});
+
+  TestReplaceNullsWithPolicy(
+    std::move(col), std::move(expect_col), cudf::replace_policy::PRECEDING);
+}
+
+TYPED_TEST(ReplaceNullsPolicyFixedPointTest, FollowingFill)
+{
+  using fp = TypeParam;
+  auto const s = numeric::scale_type{0};
+  auto col = cudf::test::fixed_width_column_wrapper<fp>(
+    {fp{42, s}, fp{2, s}, fp{1, s}, fp{-10, s}, fp{20, s}, fp{-30, s}}, {1, 0, 0, 1, 0, 1});
+  auto expect_col = cudf::test::fixed_width_column_wrapper<fp>(
+    {fp{42, s}, fp{-10, s}, fp{-10, s}, fp{-10, s}, fp{-30, s}, fp{-30, s}}, {1, 1, 1, 1, 1, 1});
+
+  TestReplaceNullsWithPolicy(
+    std::move(col), std::move(expect_col), cudf::replace_policy::FOLLOWING);
+}
+
+TYPED_TEST(ReplaceNullsPolicyFixedPointTest, PrecedingFillLeadingNulls)
+{
+  using fp = TypeParam;
+  auto const s = numeric::scale_type{0};
+  auto col = cudf::test::fixed_width_column_wrapper<fp>(
+    {fp{1, s}, fp{2, s}, fp{3, s}, fp{4, s}, fp{5, s}}, {0, 0, 1, 0, 1});
+  auto expect_col = cudf::test::fixed_width_column_wrapper<fp>(
+    {fp{1, s}, fp{2, s}, fp{3, s}, fp{3, s}, fp{5, s}}, {0, 0, 1, 1, 1});
+
+  TestReplaceNullsWithPolicy(
+    std::move(col), std::move(expect_col), cudf::replace_policy::PRECEDING);
+}
+
+TYPED_TEST(ReplaceNullsPolicyFixedPointTest, FollowingFillTrailingNulls)
+{
+  using fp = TypeParam;
+  auto const s = numeric::scale_type{0};
+  auto col = cudf::test::fixed_width_column_wrapper<fp>(
+    {fp{1, s}, fp{2, s}, fp{3, s}, fp{4, s}, fp{5, s}}, {1, 0, 1, 0, 0});
+  auto expect_col = cudf::test::fixed_width_column_wrapper<fp>(
+    {fp{1, s}, fp{3, s}, fp{3, s}, fp{4, s}, fp{5, s}}, {1, 1, 1, 0, 0});
+
+  TestReplaceNullsWithPolicy(
+    std::move(col), std::move(expect_col), cudf::replace_policy::FOLLOWING);
+}
+
 struct ReplaceDictionaryTest : public cudf::test::BaseFixture {
 };
diff --git a/cpp/tests/strings/json_tests.cpp b/cpp/tests/strings/json_tests.cpp
new file mode 100644
index 00000000000..44eb35d4163
--- /dev/null
+++ b/cpp/tests/strings/json_tests.cpp
@@ -0,0 +1,761 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+// reference: https://jsonpath.herokuapp.com/
+
+// clang-format off
+std::string json_string{
+  "{"
+    "\"store\": {""\"book\": ["
+      "{"
+        "\"category\": \"reference\","
+        "\"author\": \"Nigel Rees\","
+        "\"title\": \"Sayings of the Century\","
+        "\"price\": 8.95"
+      "},"
+      "{"
+        "\"category\": \"fiction\","
+        "\"author\": \"Evelyn Waugh\","
+        "\"title\": \"Sword of Honour\","
+        "\"price\": 12.99"
+      "},"
+      "{"
+        "\"category\": \"fiction\","
+        "\"author\": \"Herman Melville\","
+        "\"title\": \"Moby Dick\","
+        "\"isbn\": \"0-553-21311-3\","
+        "\"price\": 8.99"
+      "},"
+      "{"
+        "\"category\": \"fiction\","
+        "\"author\": \"J. R. R. Tolkien\","
+        "\"title\": \"The Lord of the Rings\","
+        "\"isbn\": \"0-395-19395-8\","
+        "\"price\": 22.99"
+      "}"
+    "],"
+    "\"bicycle\": {"
+      "\"color\": \"red\","
+      "\"price\": 19.95"
+    "}"
+  "},"
+  "\"expensive\": 10"
+  "}"
+};
+// clang-format on
+
+std::unique_ptr<cudf::column> drop_whitespace(cudf::column_view const& col)
+{
+  cudf::test::strings_column_wrapper whitespace{"\n", "\r", "\t"};
+  cudf::test::strings_column_wrapper repl{"", "", ""};
+
+  cudf::strings_column_view strings(col);
+  cudf::strings_column_view targets(whitespace);
+  cudf::strings_column_view replacements(repl);
+  return cudf::strings::replace(strings, targets, replacements);
+}
+
+struct JsonTests : public cudf::test::BaseFixture {
+};
+
+TEST_F(JsonTests, GetJsonObjectRootOp)
+{
+  // root
+  cudf::test::strings_column_wrapper input{json_string};
+  std::string json_path("$");
+  auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
+  auto result = drop_whitespace(*result_raw);
+
+  auto expected = drop_whitespace(input);
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected);
+}
+
+TEST_F(JsonTests, GetJsonObjectChildOp)
+{
+  {
+    cudf::test::strings_column_wrapper input{json_string};
+    std::string json_path("$.store");
+    auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
+    auto result = drop_whitespace(*result_raw);
+
+    // clang-format off
+    cudf::test::strings_column_wrapper expected_raw{
+      "{"
+        "\"book\": ["
+          "{"
+            "\"category\": \"reference\","
+            "\"author\": \"Nigel Rees\","
+            "\"title\": \"Sayings of the Century\","
+            "\"price\": 8.95"
+          "},"
+          "{"
+            "\"category\": \"fiction\","
+            "\"author\": \"Evelyn Waugh\","
+            "\"title\": \"Sword of Honour\","
+            "\"price\": 12.99"
+          "},"
+          "{"
+            "\"category\": \"fiction\","
+            "\"author\": \"Herman Melville\","
+            "\"title\": \"Moby Dick\","
+            "\"isbn\": \"0-553-21311-3\","
+            "\"price\": 8.99"
+          "},"
+          "{"
+            "\"category\": \"fiction\","
+            "\"author\": \"J. R. R.
Tolkien\"," + "\"title\": \"The Lord of the Rings\"," + "\"isbn\": \"0-395-19395-8\"," + "\"price\": 22.99" + "}" + "]," + "\"bicycle\": {" + "\"color\": \"red\"," + "\"price\": 19.95" + "}" + "}" + }; + // clang-format on + auto expected = drop_whitespace(expected_raw); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); + } + + { + cudf::test::strings_column_wrapper input{json_string}; + std::string json_path("$.store.book"); + auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = drop_whitespace(*result_raw); + + // clang-format off + cudf::test::strings_column_wrapper expected_raw{ + "[" + "{" + "\"category\": \"reference\"," + "\"author\": \"Nigel Rees\"," + "\"title\": \"Sayings of the Century\"," + "\"price\": 8.95" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"Evelyn Waugh\"," + "\"title\": \"Sword of Honour\"," + "\"price\": 12.99" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"Herman Melville\"," + "\"title\": \"Moby Dick\"," + "\"isbn\": \"0-553-21311-3\"," + "\"price\": 8.99" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"J. R. R. Tolkien\"," + "\"title\": \"The Lord of the Rings\"," + "\"isbn\": \"0-395-19395-8\"," + "\"price\": 22.99" + "}" + "]" + }; + // clang-format on + auto expected = drop_whitespace(expected_raw); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); + } +} + +TEST_F(JsonTests, GetJsonObjectWildcardOp) +{ + { + cudf::test::strings_column_wrapper input{json_string}; + std::string json_path("$.store.*"); + auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = drop_whitespace(*result_raw); + + // clang-format off + cudf::test::strings_column_wrapper expected_raw{ + "[" + "[" + "{" + "\"category\": \"reference\"," + "\"author\": \"Nigel Rees\"," + "\"title\": \"Sayings of the Century\"," + "\"price\": 8.95" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"Evelyn Waugh\"," + "\"title\": \"Sword of Honour\"," + "\"price\": 12.99" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"Herman Melville\"," + "\"title\": \"Moby Dick\"," + "\"isbn\": \"0-553-21311-3\"," + "\"price\": 8.99" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"J. R. R. Tolkien\"," + "\"title\": \"The Lord of the Rings\"," + "\"isbn\": \"0-395-19395-8\"," + "\"price\": 22.99" + "}" + "]," + "{" + "\"color\": \"red\"," + "\"price\": 19.95" + "}" + "]" + }; + // clang-format on + auto expected = drop_whitespace(expected_raw); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); + } + + { + cudf::test::strings_column_wrapper input{json_string}; + std::string json_path("*"); + auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = drop_whitespace(*result_raw); + + // clang-format off + cudf::test::strings_column_wrapper expected_raw{ + "[" + "{" + "\"book\": [" + "{" + "\"category\": \"reference\"," + "\"author\": \"Nigel Rees\"," + "\"title\": \"Sayings of the Century\"," + "\"price\": 8.95" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"Evelyn Waugh\"," + "\"title\": \"Sword of Honour\"," + "\"price\": 12.99" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"Herman Melville\"," + "\"title\": \"Moby Dick\"," + "\"isbn\": \"0-553-21311-3\"," + "\"price\": 8.99" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"J. R. R. 
Tolkien\"," + "\"title\": \"The Lord of the Rings\"," + "\"isbn\": \"0-395-19395-8\"," + "\"price\": 22.99" + "}" + "]," + "\"bicycle\": {" + "\"color\": \"red\"," + "\"price\": 19.95" + "}" + "}," + "10" + "]" + }; + // clang-format on + auto expected = drop_whitespace(expected_raw); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); + } +} + +TEST_F(JsonTests, GetJsonObjectSubscriptOp) +{ + { + cudf::test::strings_column_wrapper input{json_string}; + std::string json_path("$.store.book[2]"); + auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = drop_whitespace(*result_raw); + + // clang-format off + cudf::test::strings_column_wrapper expected_raw{ + "{" + "\"category\": \"fiction\"," + "\"author\": \"Herman Melville\"," + "\"title\": \"Moby Dick\"," + "\"isbn\": \"0-553-21311-3\"," + "\"price\": 8.99" + "}" + }; + // clang-format on + auto expected = drop_whitespace(expected_raw); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); + } + + { + cudf::test::strings_column_wrapper input{json_string}; + std::string json_path("$.store['bicycle']"); + auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = drop_whitespace(*result_raw); + + // clang-format off + cudf::test::strings_column_wrapper expected_raw{ + "{" + "\"color\": \"red\"," + "\"price\": 19.95" + "}" + }; + // clang-format on + auto expected = drop_whitespace(expected_raw); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); + } + + { + cudf::test::strings_column_wrapper input{json_string}; + std::string json_path("$.store.book[*]"); + auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = drop_whitespace(*result_raw); + + // clang-format off + cudf::test::strings_column_wrapper expected_raw{ + "[" + "{" + "\"category\": \"reference\"," + "\"author\": \"Nigel Rees\"," + "\"title\": \"Sayings of the Century\"," + "\"price\": 8.95" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"Evelyn Waugh\"," + "\"title\": \"Sword of Honour\"," + "\"price\": 12.99" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"Herman Melville\"," + "\"title\": \"Moby Dick\"," + "\"isbn\": \"0-553-21311-3\"," + "\"price\": 8.99" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"J. R. R. 
Tolkien\"," + "\"title\": \"The Lord of the Rings\"," + "\"isbn\": \"0-395-19395-8\"," + "\"price\": 22.99" + "}" + "]" + }; + // clang-format on + auto expected = drop_whitespace(expected_raw); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); + } +} + +TEST_F(JsonTests, GetJsonObjectFilter) +{ + // queries that result in filtering/collating results (mostly meaning - generates new + // json instead of just returning parts of the existing string + + { + cudf::test::strings_column_wrapper input{json_string}; + std::string json_path("$.store.book[*]['isbn']"); + auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = drop_whitespace(*result_raw); + + cudf::test::strings_column_wrapper expected_raw{"[\"0-553-21311-3\",\"0-395-19395-8\"]"}; + auto expected = drop_whitespace(expected_raw); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); + } + + { + cudf::test::strings_column_wrapper input{json_string}; + std::string json_path("$.store.book[*].category"); + auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = drop_whitespace(*result_raw); + + cudf::test::strings_column_wrapper expected_raw{ + "[\"reference\",\"fiction\",\"fiction\",\"fiction\"]"}; + auto expected = drop_whitespace(expected_raw); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); + } + + { + cudf::test::strings_column_wrapper input{json_string}; + std::string json_path("$.store.book[*].title"); + auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = drop_whitespace(*result_raw); + + cudf::test::strings_column_wrapper expected_raw{ + "[\"Sayings of the Century\",\"Sword of Honour\",\"Moby Dick\",\"The Lord of the Rings\"]"}; + auto expected = drop_whitespace(expected_raw); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); + } + + { + cudf::test::strings_column_wrapper input{json_string}; + std::string json_path("$.store.book.*.price"); + auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = drop_whitespace(*result_raw); + + cudf::test::strings_column_wrapper expected_raw{"[8.95,12.99,8.99,22.99]"}; + auto expected = drop_whitespace(expected_raw); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); + } + + { + // spark behavioral difference. 
+ // standard: "fiction" + // spark: fiction + cudf::test::strings_column_wrapper input{json_string}; + std::string json_path("$.store.book[2].category"); + auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = drop_whitespace(*result_raw); + + cudf::test::strings_column_wrapper expected_raw{"fiction"}; + auto expected = drop_whitespace(expected_raw); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); + } +} + +TEST_F(JsonTests, GetJsonObjectNullInputs) +{ + { + std::string str("{\"a\" : \"b\"}"); + cudf::test::strings_column_wrapper input({str, str, str, str}, {1, 0, 1, 0}); + + std::string json_path("$.a"); + auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = drop_whitespace(*result_raw); + + cudf::test::strings_column_wrapper expected_raw({"b", "", "b", ""}, {1, 0, 1, 0}); + auto expected = drop_whitespace(expected_raw); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); + } +} + +TEST_F(JsonTests, GetJsonObjectEmptyQuery) +{ + // empty query -> null + { + cudf::test::strings_column_wrapper input{"{\"a\" : \"b\"}"}; + std::string json_path(""); + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + + cudf::test::strings_column_wrapper expected({""}, {0}); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); + } +} + +TEST_F(JsonTests, GetJsonObjectEmptyInputsAndOutputs) +{ + // empty input -> null + { + cudf::test::strings_column_wrapper input{""}; + std::string json_path("$"); + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + + cudf::test::strings_column_wrapper expected({""}, {0}); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); + } + + // slightly different from "empty output". in this case, we're + // returning something, but it happens to be empty. 
so we expect + // a valid, but empty row + { + cudf::test::strings_column_wrapper input{"{\"store\": { \"bicycle\" : \"\" } }"}; + std::string json_path("$.store.bicycle"); + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + + cudf::test::strings_column_wrapper expected({""}, {1}); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); + } +} + +// badly formed JSONpath strings +TEST_F(JsonTests, GetJsonObjectIllegalQuery) +{ + // can't have more than one root operator, or a root operator anywhere other + // than the beginning + { + cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"}; + std::string json_path("$$"); + auto query = [&]() { + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + }; + EXPECT_THROW(query(), cudf::logic_error); + } + + // invalid index + { + cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"}; + std::string json_path("$[auh46h-]"); + auto query = [&]() { + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + }; + EXPECT_THROW(query(), cudf::logic_error); + } + + // invalid index + { + cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"}; + std::string json_path("$[[]]"); + auto query = [&]() { + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + }; + EXPECT_THROW(query(), cudf::logic_error); + } + + // negative index + { + cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"}; + std::string json_path("$[-1]"); + auto query = [&]() { + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + }; + EXPECT_THROW(query(), cudf::logic_error); + } + + // child operator with no name specified + { + cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"}; + std::string json_path("."); + auto query = [&]() { + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + }; + EXPECT_THROW(query(), cudf::logic_error); + } + + { + cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"}; + std::string json_path("]["); + auto query = [&]() { + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + }; + EXPECT_THROW(query(), cudf::logic_error); + } + + { + cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"}; + std::string json_path("6hw6,56i3"); + auto query = [&]() { + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + }; + EXPECT_THROW(query(), cudf::logic_error); + } +} + +// queries that are legal, but reference invalid parts of the input +TEST_F(JsonTests, GetJsonObjectInvalidQuery) +{ + // non-existent field + { + cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"}; + std::string json_path("$[*].c"); + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + + cudf::test::strings_column_wrapper expected({""}, {0}); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); + } + + // non-existent field + { + cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"}; + std::string json_path("$[*].c[2]"); + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + + cudf::test::strings_column_wrapper expected({""}, {0}); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); + } + + // non-existent field + { + cudf::test::strings_column_wrapper input{json_string}; + std::string json_path("$.store.book.price"); + auto result = 
cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
+
+    cudf::test::strings_column_wrapper expected({""}, {0});
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected);
+  }
+
+  // out of bounds index
+  {
+    cudf::test::strings_column_wrapper input{json_string};
+    std::string json_path("$.store.book[4]");
+    auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
+
+    cudf::test::strings_column_wrapper expected({""}, {0});
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected);
+  }
+}
+
+TEST_F(JsonTests, MixedOutput)
+{
+  // various queries on:
+  // clang-format off
+  std::vector<std::string> input_strings {
+    "{\"a\": {\"b\" : \"c\"}}",
+
+    "{"
+      "\"a\": {\"b\" : \"c\"},"
+      "\"d\": [{\"e\":123}, {\"f\":-10}]"
+    "}",
+
+    "{"
+      "\"b\": 123"
+    "}",
+
+    "{"
+      "\"a\": [\"y\",500]"
+    "}",
+
+    "{"
+      "\"a\": \"\""
+    "}",
+
+    "{"
+      "\"a\": {"
+        "\"z\": {\"i\": 10, \"j\": 100},"
+        "\"b\": [\"c\",null,true,-1]"
+      "}"
+    "}"
+  };
+  // clang-format on
+  cudf::test::strings_column_wrapper input(input_strings.begin(), input_strings.end());
+  {
+    std::string json_path("$.a");
+    auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
+
+    // clang-format off
+    cudf::test::strings_column_wrapper expected({
+      "{\"b\" : \"c\"}",
+      "{\"b\" : \"c\"}",
+      "",
+      "[\"y\",500]",
+      "",
+      "{"
+        "\"z\": {\"i\": 10, \"j\": 100},"
+        "\"b\": [\"c\",null,true,-1]"
+      "}"
+      },
+      {1, 1, 0, 1, 1, 1});
+    // clang-format on
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected);
+  }
+
+  {
+    std::string json_path("$.a[1]");
+    auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
+
+    // clang-format off
+    cudf::test::strings_column_wrapper expected({
+      "",
+      "",
+      "",
+      "500",
+      "",
+      "",
+      },
+      {0, 0, 0, 1, 0, 0});
+    // clang-format on
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected);
+  }
+
+  {
+    std::string json_path("$.a.b");
+    auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
+
+    // clang-format off
+    cudf::test::strings_column_wrapper expected({
+      "c",
+      "c",
+      "",
+      "",
+      "",
+      "[\"c\",null,true,-1]"},
+      {1, 1, 0, 0, 0, 1});
+    // clang-format on
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected);
+  }
+
+  {
+    std::string json_path("$.a[*]");
+    auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
+
+    // clang-format off
+    cudf::test::strings_column_wrapper expected({
+      "[\"c\"]",
+      "[\"c\"]",
+      "",
+      "[\"y\",500]",
+      "[]",
+      "["
+        "{\"i\": 10, \"j\": 100},"
+        "[\"c\",null,true,-1]"
+      "]" },
+      {1, 1, 0, 1, 1, 1});
+    // clang-format on
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected);
+  }
+
+  {
+    std::string json_path("$.a.b[*]");
+    auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
+
+    // clang-format off
+    cudf::test::strings_column_wrapper expected({
+      "[]",
+      "[]",
+      "",
+      "",
+      "",
+      "[\"c\",null,true,-1]"},
+      {1, 1, 0, 0, 0, 1});
+    // clang-format on
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected);
+  }
+}
diff --git a/cpp/tests/utilities/column_utilities.cu b/cpp/tests/utilities/column_utilities.cu
index 78a67464654..a54c86405a5 100644
--- a/cpp/tests/utilities/column_utilities.cu
+++ b/cpp/tests/utilities/column_utilities.cu
@@ -71,7 +71,7 @@ struct column_property_comparator {
     // equivalent, but not exactly equal columns can have a different number of children if their
     // sizes are both 0.
Specifically, empty string columns may or may not have children. - if (check_exact_equality || lhs.size() > 0) { + if (check_exact_equality || (lhs.size() > 0 && lhs.null_count() < lhs.size())) { EXPECT_EQ(lhs.num_children(), rhs.num_children()); } } diff --git a/java/ci/build-in-docker.sh b/java/ci/build-in-docker.sh index eee943cde38..b2d0b066ce7 100755 --- a/java/ci/build-in-docker.sh +++ b/java/ci/build-in-docker.sh @@ -24,6 +24,7 @@ SKIP_JAVA_TESTS=${SKIP_JAVA_TESTS:-true} BUILD_CPP_TESTS=${BUILD_CPP_TESTS:-OFF} ENABLE_PTDS=${ENABLE_PTDS:-ON} RMM_LOGGING_LEVEL=${RMM_LOGGING_LEVEL:-OFF} +ENABLE_NVTX=${ENABLE_NVTX:-ON} OUT=${OUT:-out} SIGN_FILE=$1 @@ -35,6 +36,7 @@ echo "SIGN_FILE: $SIGN_FILE,\ SKIP_JAVA_TESTS: $SKIP_JAVA_TESTS,\ BUILD_CPP_TESTS: $BUILD_CPP_TESTS,\ ENABLED_PTDS: $ENABLE_PTDS,\ + ENABLE_NVTX: $ENABLE_NVTX,\ RMM_LOGGING_LEVEL: $RMM_LOGGING_LEVEL,\ OUT_PATH: $OUT_PATH" @@ -51,7 +53,7 @@ export PATH=/usr/local/cmake-3.19.0-Linux-x86_64/bin:$PATH rm -rf $WORKSPACE/cpp/build mkdir -p $WORKSPACE/cpp/build cd $WORKSPACE/cpp/build -cmake .. -DUSE_NVTX=OFF -DCUDF_USE_ARROW_STATIC=ON -DBoost_USE_STATIC_LIBS=ON -DBUILD_TESTS=$SKIP_CPP_TESTS -DPER_THREAD_DEFAULT_STREAM=$ENABLE_PTDS -DRMM_LOGGING_LEVEL=$RMM_LOGGING_LEVEL +cmake .. -DUSE_NVTX=$ENABLE_NVTX -DCUDF_USE_ARROW_STATIC=ON -DBoost_USE_STATIC_LIBS=ON -DBUILD_TESTS=$SKIP_CPP_TESTS -DPER_THREAD_DEFAULT_STREAM=$ENABLE_PTDS -DRMM_LOGGING_LEVEL=$RMM_LOGGING_LEVEL make -j$PARALLEL_LEVEL make install DESTDIR=$INSTALL_PREFIX diff --git a/java/src/main/java/ai/rapids/cudf/ColumnVector.java b/java/src/main/java/ai/rapids/cudf/ColumnVector.java index e6675591164..fcdb5d44ad3 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnVector.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnVector.java @@ -570,8 +570,7 @@ public static ColumnVector serial32BitMurmurHash3(int seed, ColumnView columns[] assert columns[i] != null : "Column vectors passed may not be null"; assert columns[i].getRowCount() == size : "Row count mismatch, all columns must be the same size"; assert !columns[i].getType().isDurationType() : "Unsupported column type Duration"; - assert !columns[i].getType().isTimestampType() : "Unsupported column type Timestamp"; - assert !columns[i].getType().isNestedType() : "Unsupported column of nested type"; + assert !columns[i].getType().equals(DType.LIST) : "List columns are not supported"; columnViews[i] = columns[i].getNativeView(); } return new ColumnVector(hash(columnViews, HashType.HASH_SERIAL_MURMUR3.getNativeId(), new int[0], seed)); @@ -606,7 +605,7 @@ public static ColumnVector spark32BitMurmurHash3(int seed, ColumnView columns[]) assert columns[i] != null : "Column vectors passed may not be null"; assert columns[i].getRowCount() == size : "Row count mismatch, all columns must be the same size"; assert !columns[i].getType().isDurationType() : "Unsupported column type Duration"; - assert !columns[i].getType().isNestedType() : "Unsupported column of nested type"; + assert !columns[i].getType().equals(DType.LIST) : "List columns are not supported"; columnViews[i] = columns[i].getNativeView(); } return new ColumnVector(hash(columnViews, HashType.HASH_SPARK_MURMUR3.getNativeId(), new int[0], seed)); diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index 5d869ab75fb..402c64dd83d 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -2083,6 +2083,23 @@ public final ColumnVector substring(ColumnView 
start, ColumnView end) { return new ColumnVector(substringColumn(getNativeView(), start.getNativeView(), end.getNativeView())); } + /** + * Apply a JSONPath string to all rows in an input strings column. + * + * Applies a JSONPath string to an incoming strings column where each row in the column + * is a valid json string. The output is returned by row as a strings column. + * + * For reference, https://tools.ietf.org/id/draft-goessner-dispatch-jsonpath-00.html + * Note: Only implements the operators: $ . [] * + * + * @param path The JSONPath string to be applied to each row + * @return new strings ColumnVector containing the retrieved json object strings + */ + public final ColumnVector getJSONObject(Scalar path) { + assert(type.equals(DType.STRING)) : "column type must be a String"; + return new ColumnVector(getJSONObject(getNativeView(), path.getScalarHandle())); + } + /** * Returns a new strings column where target string within each string is replaced with the specified * replacement string. @@ -2649,6 +2666,8 @@ static DeviceMemoryBufferView getOffsetsBuffer(long viewHandle) { */ private static native long stringTimestampToTimestamp(long viewHandle, int unit, String format); + private static native long getJSONObject(long viewHandle, long scalarHandle) throws CudfException; + /** * Native method to parse and convert a timestamp column vector to string column vector. A unix * timestamp is a long value representing how many units since 1970-01-01 00:00:00:000 in either diff --git a/java/src/main/java/ai/rapids/cudf/MemoryBuffer.java b/java/src/main/java/ai/rapids/cudf/MemoryBuffer.java index a1be9b561a0..9f0d9a451c0 100644 --- a/java/src/main/java/ai/rapids/cudf/MemoryBuffer.java +++ b/java/src/main/java/ai/rapids/cudf/MemoryBuffer.java @@ -146,6 +146,39 @@ public final long getAddress() { return address; } + /** + * Copy a subset of src to this buffer starting at destOffset using the specified CUDA stream. + * The copy has completed when this returns, but the memory copy could overlap with + * operations occurring on other streams. + * @param destOffset the offset in this to start copying from. + * @param src what to copy from + * @param srcOffset offset into src to start out + * @param length how many bytes to copy + * @param stream CUDA stream to use + */ + public final void copyFromMemoryBuffer( + long destOffset, MemoryBuffer src, long srcOffset, long length, Cuda.Stream stream) { + addressOutOfBoundsCheck(address + destOffset, length, "copy range dest"); + src.addressOutOfBoundsCheck(src.address + srcOffset, length, "copy range src"); + Cuda.memcpy(address + destOffset, src.address + srcOffset, length, CudaMemcpyKind.DEFAULT, stream); + } + + /** + * Copy a subset of src to this buffer starting at destOffset using the specified CUDA stream. + * The copy is async and may not have completed when this returns. + * @param destOffset the offset in this to start copying from. + * @param src what to copy from + * @param srcOffset offset into src to start out + * @param length how many bytes to copy + * @param stream CUDA stream to use + */ + public final void copyFromMemoryBufferAsync( + long destOffset, MemoryBuffer src, long srcOffset, long length, Cuda.Stream stream) { + addressOutOfBoundsCheck(address + destOffset, length, "copy range dest"); + src.addressOutOfBoundsCheck(src.address + srcOffset, length, "copy range src"); + Cuda.asyncMemcpy(address + destOffset, src.address + srcOffset, length, CudaMemcpyKind.DEFAULT, stream); + } + /** * Slice off a part of the buffer. 
Note that this is a zero copy operation and all * slices must be closed along with the original buffer before the memory is released. diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index dc1acc50b5f..cec3a1a92a6 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -54,6 +54,7 @@ #include #include #include +#include #include #include #include @@ -65,6 +66,8 @@ #include "cudf_jni_apis.hpp" #include "dtype_utils.hpp" +#include "jni.h" +#include "jni_utils.hpp" namespace { @@ -1835,4 +1838,24 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_copyColumnViewToCV(JNIEnv } CATCH_STD(env, 0) } + +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getJSONObject(JNIEnv *env, jclass, + jlong j_view_handle, jlong j_scalar_handle) { + + JNI_NULL_CHECK(env, j_view_handle, "view cannot be null", 0); + JNI_NULL_CHECK(env, j_scalar_handle, "path cannot be null", 0); + + try { + cudf::jni::auto_set_device(env); + cudf::column_view* n_column_view = reinterpret_cast(j_view_handle); + cudf::strings_column_view n_strings_col_view(*n_column_view); + cudf::string_scalar *n_scalar_path = reinterpret_cast(j_scalar_handle); + + auto result = cudf::strings::get_json_object(n_strings_col_view, *n_scalar_path); + + return reinterpret_cast(result.release()); + } + CATCH_STD(env, 0) + +} } // extern "C" diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index fe1cba5ceb1..36123704ae6 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -490,6 +490,25 @@ void testSerial32BitMurmur3HashMixed() { } } + @Test + void testSerial32BitMurmur3HashStruct() { + try (ColumnVector strings = ColumnVector.fromStrings( + "a", "B\n", "dE\"\u0100\t\u0101 \ud720\ud721", + "A very long (greater than 128 bytes/char string) to test a multi hash-step data point " + + "in the MD5 hash function. 
This string needed to be longer.", + null, null); + ColumnVector integers = ColumnVector.fromBoxedInts(0, 100, -100, Integer.MIN_VALUE, Integer.MAX_VALUE, null); + ColumnVector doubles = ColumnVector.fromBoxedDoubles( + 0.0, 100.0, -100.0, POSITIVE_DOUBLE_NAN_LOWER_RANGE, POSITIVE_DOUBLE_NAN_UPPER_RANGE, null); + ColumnVector floats = ColumnVector.fromBoxedFloats( + 0f, 100f, -100f, NEGATIVE_FLOAT_NAN_LOWER_RANGE, NEGATIVE_FLOAT_NAN_UPPER_RANGE, null); + ColumnVector bools = ColumnVector.fromBoxedBooleans(true, false, null, false, true, null); + ColumnVector result = ColumnVector.serial32BitMurmurHash3(1868, new ColumnVector[]{strings, integers, doubles, floats, bools}); + ColumnVector expected = ColumnVector.fromBoxedInts(387200465, 1988790727, 774895031, 814731646, -1073686048, 1868)) { + assertColumnsAreEqual(expected, result); + } + } + @Test void testSpark32BitMurmur3HashStrings() { try (ColumnVector v0 = ColumnVector.fromStrings( @@ -529,6 +548,8 @@ void testSpark32BitMurmur3HashDoubles() { @Test void testSpark32BitMurmur3HashTimestamps() { + // The hash values were derived from Apache Spark in a manner similar to the one documented at + // https://github.com/rapidsai/cudf/blob/aa7ca46dcd9e/cpp/tests/hashing/hash_test.cpp#L281-L307 try (ColumnVector v = ColumnVector.timestampMicroSecondsFromBoxedLongs( 0L, null, 100L, -100L, 0x123456789abcdefL, null, -0x123456789abcdefL); ColumnVector result = ColumnVector.spark32BitMurmurHash3(42, new ColumnVector[]{v}); @@ -539,6 +560,8 @@ void testSpark32BitMurmur3HashTimestamps() { @Test void testSpark32BitMurmur3HashDecimal64() { + // The hash values were derived from Apache Spark in a manner similar to the one documented at + // https://github.com/rapidsai/cudf/blob/aa7ca46dcd9e/cpp/tests/hashing/hash_test.cpp#L281-L307 try (ColumnVector v = ColumnVector.decimalFromLongs(-7, 0L, 100L, -100L, 0x123456789abcdefL, -0x123456789abcdefL); ColumnVector result = ColumnVector.spark32BitMurmurHash3(42, new ColumnVector[]{v}); @@ -549,6 +572,8 @@ void testSpark32BitMurmur3HashDecimal64() { @Test void testSpark32BitMurmur3HashDecimal32() { + // The hash values were derived from Apache Spark in a manner similar to the one documented at + // https://github.com/rapidsai/cudf/blob/aa7ca46dcd9e/cpp/tests/hashing/hash_test.cpp#L281-L307 try (ColumnVector v = ColumnVector.decimalFromInts(-3, 0, 100, -100, 0x12345678, -0x12345678); ColumnVector result = ColumnVector.spark32BitMurmurHash3(42, new ColumnVector[]{v}); @@ -559,6 +584,8 @@ void testSpark32BitMurmur3HashDecimal32() { @Test void testSpark32BitMurmur3HashDates() { + // The hash values were derived from Apache Spark in a manner similar to the one documented at + // https://github.com/rapidsai/cudf/blob/aa7ca46dcd9e/cpp/tests/hashing/hash_test.cpp#L281-L307 try (ColumnVector v = ColumnVector.timestampDaysFromBoxedInts( 0, null, 100, -100, 0x12345678, null, -0x12345678); ColumnVector result = ColumnVector.spark32BitMurmurHash3(42, new ColumnVector[]{v}); @@ -587,7 +614,6 @@ void testSpark32BitMurmur3HashBools() { ColumnVector result = ColumnVector.spark32BitMurmurHash3(0, new ColumnVector[]{v0, v1}); ColumnVector expected = ColumnVector.fromBoxedInts(0, -1589400010, -239939054, -68075478, 593689054, -1194558265)) { assertColumnsAreEqual(expected, result); - } } @@ -610,6 +636,26 @@ void testSpark32BitMurmur3HashMixed() { } } + @Test + void testSpark32BitMurmur3HashStruct() { + try (ColumnVector strings = ColumnVector.fromStrings( + "a", "B\n", "dE\"\u0100\t\u0101 
\ud720\ud721", + "A very long (greater than 128 bytes/char string) to test a multi hash-step data point " + + "in the MD5 hash function. This string needed to be longer.", + null, null); + ColumnVector integers = ColumnVector.fromBoxedInts(0, 100, -100, Integer.MIN_VALUE, Integer.MAX_VALUE, null); + ColumnVector doubles = ColumnVector.fromBoxedDoubles( + 0.0, 100.0, -100.0, POSITIVE_DOUBLE_NAN_LOWER_RANGE, POSITIVE_DOUBLE_NAN_UPPER_RANGE, null); + ColumnVector floats = ColumnVector.fromBoxedFloats( + 0f, 100f, -100f, NEGATIVE_FLOAT_NAN_LOWER_RANGE, NEGATIVE_FLOAT_NAN_UPPER_RANGE, null); + ColumnVector bools = ColumnVector.fromBoxedBooleans(true, false, null, false, true, null); + ColumnView structs = ColumnView.makeStructView(strings, integers, doubles, floats, bools); + ColumnVector result = ColumnVector.spark32BitMurmurHash3(1868, new ColumnView[]{structs}); + ColumnVector expected = ColumnVector.spark32BitMurmurHash3(1868, new ColumnVector[]{strings, integers, doubles, floats, bools})) { + assertColumnsAreEqual(expected, result); + } + } + @Test void testAndNullReconfigureNulls() { try (ColumnVector v0 = ColumnVector.fromBoxedInts(0, 100, null, null, Integer.MIN_VALUE, null); @@ -4132,6 +4178,50 @@ void testCopyToColumnVector() { } } + @Test + void testGetJSONObject() { + String jsonString = "{ \"store\": {\n" + + " \"book\": [\n" + + " { \"category\": \"reference\",\n" + + " \"author\": \"Nigel Rees\",\n" + + " \"title\": \"Sayings of the Century\",\n" + + " \"price\": 8.95\n" + + " },\n" + + " { \"category\": \"fiction\",\n" + + " \"author\": \"Evelyn Waugh\",\n" + + " \"title\": \"Sword of Honour\",\n" + + " \"price\": 12.99\n" + + " },\n" + + " { \"category\": \"fiction\",\n" + + " \"author\": \"Herman Melville\",\n" + + " \"title\": \"Moby Dick\",\n" + + " \"isbn\": \"0-553-21311-3\",\n" + + " \"price\": 8.99\n" + + " },\n" + + " { \"category\": \"fiction\",\n" + + " \"author\": \"J. R. R. Tolkien\",\n" + + " \"title\": \"The Lord of the Rings\",\n" + + " \"isbn\": \"0-395-19395-8\",\n" + + " \"price\": 22.99\n" + + " }\n" + + " ],\n" + + " \"bicycle\": {\n" + + " \"color\": \"red\",\n" + + " \"price\": 19.95\n" + + " }\n" + + " }\n" + + "}"; + + try (ColumnVector json = ColumnVector.fromStrings(jsonString, jsonString); + ColumnVector expectedAuthors = ColumnVector.fromStrings("[\"Nigel Rees\",\"Evelyn " + + "Waugh\",\"Herman Melville\",\"J. R. R. Tolkien\"]", "[\"Nigel Rees\",\"Evelyn " + + "Waugh\",\"Herman Melville\",\"J. R. R. Tolkien\"]"); + Scalar path = Scalar.fromString("$.store.book[*].author"); + ColumnVector gotAuthors = json.getJSONObject(path)) { + assertColumnsAreEqual(expectedAuthors, gotAuthors); + } + } + @Test void testMakeStructEmpty() { final int numRows = 10; diff --git a/java/src/test/java/ai/rapids/cudf/MemoryBufferTest.java b/java/src/test/java/ai/rapids/cudf/MemoryBufferTest.java new file mode 100644 index 00000000000..df710c71f63 --- /dev/null +++ b/java/src/test/java/ai/rapids/cudf/MemoryBufferTest.java @@ -0,0 +1,171 @@ +/* + * + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package ai.rapids.cudf; + +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.*; + +public class MemoryBufferTest extends CudfTestBase { + private static final byte[] BYTES = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + private static final byte[] EXPECTED = {0, 2, 3, 4, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + + @Test + public void testAddressOutOfBoundsExceptionWhenCopying() { + try (HostMemoryBuffer from = HostMemoryBuffer.allocate(16); + HostMemoryBuffer to = HostMemoryBuffer.allocate(16)) { + assertThrows(AssertionError.class, () -> to.copyFromMemoryBuffer(-1, from, 0, 16, Cuda.DEFAULT_STREAM)); + assertThrows(AssertionError.class, () -> to.copyFromMemoryBuffer(16, from, 0, 16, Cuda.DEFAULT_STREAM)); + assertThrows(AssertionError.class, () -> to.copyFromMemoryBuffer(0, from, -1, 16, Cuda.DEFAULT_STREAM)); + assertThrows(AssertionError.class, () -> to.copyFromMemoryBuffer(0, from, 16, 16, Cuda.DEFAULT_STREAM)); + assertThrows(AssertionError.class, () -> to.copyFromMemoryBuffer(0, from, 0, -1, Cuda.DEFAULT_STREAM)); + assertThrows(AssertionError.class, () -> to.copyFromMemoryBuffer(0, from, 0, 17, Cuda.DEFAULT_STREAM)); + assertThrows(AssertionError.class, () -> to.copyFromMemoryBuffer(1, from, 0, 16, Cuda.DEFAULT_STREAM)); + assertThrows(AssertionError.class, () -> to.copyFromMemoryBuffer(0, from, 1, 16, Cuda.DEFAULT_STREAM)); + } + } + + @Test + public void testAddressOutOfBoundsExceptionWhenCopyingAsync() { + try (HostMemoryBuffer from = HostMemoryBuffer.allocate(16); + HostMemoryBuffer to = HostMemoryBuffer.allocate(16)) { + assertThrows(AssertionError.class, () -> to.copyFromMemoryBufferAsync(-1, from, 0, 16, Cuda.DEFAULT_STREAM)); + assertThrows(AssertionError.class, () -> to.copyFromMemoryBufferAsync(16, from, 0, 16, Cuda.DEFAULT_STREAM)); + assertThrows(AssertionError.class, () -> to.copyFromMemoryBufferAsync(0, from, -1, 16, Cuda.DEFAULT_STREAM)); + assertThrows(AssertionError.class, () -> to.copyFromMemoryBufferAsync(0, from, 16, 16, Cuda.DEFAULT_STREAM)); + assertThrows(AssertionError.class, () -> to.copyFromMemoryBufferAsync(0, from, 0, -1, Cuda.DEFAULT_STREAM)); + assertThrows(AssertionError.class, () -> to.copyFromMemoryBufferAsync(0, from, 0, 17, Cuda.DEFAULT_STREAM)); + assertThrows(AssertionError.class, () -> to.copyFromMemoryBufferAsync(1, from, 0, 16, Cuda.DEFAULT_STREAM)); + assertThrows(AssertionError.class, () -> to.copyFromMemoryBufferAsync(0, from, 1, 16, Cuda.DEFAULT_STREAM)); + } + } + + @Test + public void testCopyingFromDeviceToDevice() { + try (HostMemoryBuffer in = HostMemoryBuffer.allocate(16); + DeviceMemoryBuffer from = DeviceMemoryBuffer.allocate(16); + DeviceMemoryBuffer to = DeviceMemoryBuffer.allocate(16); + HostMemoryBuffer out = HostMemoryBuffer.allocate(16)) { + in.setBytes(0, BYTES, 0, 16); + from.copyFromHostBuffer(in); + to.copyFromMemoryBuffer(0, from, 0, 16, Cuda.DEFAULT_STREAM); + to.copyFromMemoryBuffer(1, from, 2, 3, Cuda.DEFAULT_STREAM); + out.copyFromDeviceBuffer(to); + verifyOutput(out); + } + } + + @Test + public void testCopyingFromDeviceToDeviceAsync() { + try (HostMemoryBuffer in = HostMemoryBuffer.allocate(16); + DeviceMemoryBuffer from = DeviceMemoryBuffer.allocate(16); + DeviceMemoryBuffer to = DeviceMemoryBuffer.allocate(16); + HostMemoryBuffer out = HostMemoryBuffer.allocate(16)) { + in.setBytes(0, BYTES, 0, 16); + from.copyFromHostBuffer(in); + 
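// Both async copies below are enqueued on Cuda.DEFAULT_STREAM, so they execute in
+      // order; the stream sync before verifyOutput() guarantees they have completed.
+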
to.copyFromMemoryBufferAsync(0, from, 0, 16, Cuda.DEFAULT_STREAM); + to.copyFromMemoryBufferAsync(1, from, 2, 3, Cuda.DEFAULT_STREAM); + out.copyFromDeviceBufferAsync(to, Cuda.DEFAULT_STREAM); + Cuda.DEFAULT_STREAM.sync(); + verifyOutput(out); + } + } + + @Test + public void testCopyingFromHostToHost() { + try (HostMemoryBuffer from = HostMemoryBuffer.allocate(16); + HostMemoryBuffer to = HostMemoryBuffer.allocate(16)) { + from.setBytes(0, BYTES, 0, 16); + to.setBytes(0, BYTES, 0, 16); + to.copyFromMemoryBuffer(1, from, 2, 3, Cuda.DEFAULT_STREAM); + verifyOutput(to); + } + } + + @Test + public void testCopyingFromHostToHostAsync() { + try (HostMemoryBuffer from = HostMemoryBuffer.allocate(16); + HostMemoryBuffer to = HostMemoryBuffer.allocate(16)) { + from.setBytes(0, BYTES, 0, 16); + to.setBytes(0, BYTES, 0, 16); + to.copyFromMemoryBufferAsync(1, from, 2, 3, Cuda.DEFAULT_STREAM); + verifyOutput(to); + } + } + + @Test + public void testCopyingFromHostToDevice() { + try (HostMemoryBuffer from = HostMemoryBuffer.allocate(16); + DeviceMemoryBuffer to = DeviceMemoryBuffer.allocate(16); + HostMemoryBuffer out = HostMemoryBuffer.allocate(16)) { + from.setBytes(0, BYTES, 0, 16); + to.copyFromMemoryBuffer(0, from, 0, 16, Cuda.DEFAULT_STREAM); + to.copyFromMemoryBufferAsync(1, from, 2, 3, Cuda.DEFAULT_STREAM); + out.copyFromDeviceBuffer(to); + verifyOutput(out); + } + } + + @Test + public void testCopyingFromHostToDeviceAsync() { + try (HostMemoryBuffer from = HostMemoryBuffer.allocate(16); + DeviceMemoryBuffer to = DeviceMemoryBuffer.allocate(16); + HostMemoryBuffer out = HostMemoryBuffer.allocate(16)) { + from.setBytes(0, BYTES, 0, 16); + to.copyFromMemoryBufferAsync(0, from, 0, 16, Cuda.DEFAULT_STREAM); + to.copyFromMemoryBufferAsync(1, from, 2, 3, Cuda.DEFAULT_STREAM); + out.copyFromDeviceBufferAsync(to, Cuda.DEFAULT_STREAM); + Cuda.DEFAULT_STREAM.sync(); + verifyOutput(out); + } + } + + @Test + public void testCopyingFromDeviceToHost() { + try (HostMemoryBuffer in = HostMemoryBuffer.allocate(16); + DeviceMemoryBuffer from = DeviceMemoryBuffer.allocate(16); + HostMemoryBuffer to = HostMemoryBuffer.allocate(16)) { + in.setBytes(0, BYTES, 0, 16); + from.copyFromHostBuffer(in); + to.setBytes(0, BYTES, 0, 16); + to.copyFromMemoryBuffer(1, from, 2, 3, Cuda.DEFAULT_STREAM); + verifyOutput(to); + } + } + + @Test + public void testCopyingFromDeviceToHostAsync() { + try (HostMemoryBuffer in = HostMemoryBuffer.allocate(16); + DeviceMemoryBuffer from = DeviceMemoryBuffer.allocate(16); + HostMemoryBuffer to = HostMemoryBuffer.allocate(16)) { + in.setBytes(0, BYTES, 0, 16); + from.copyFromHostBuffer(in); + to.setBytes(0, BYTES, 0, 16); + to.copyFromMemoryBufferAsync(1, from, 2, 3, Cuda.DEFAULT_STREAM); + Cuda.DEFAULT_STREAM.sync(); + verifyOutput(to); + } + } + + private void verifyOutput(HostMemoryBuffer out) { + byte[] bytes = new byte[16]; + out.getBytes(bytes, 0, 0, 16); + assertArrayEquals(EXPECTED, bytes); + } +} diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 9c67966c16c..8b7ece5d60b 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -5067,7 +5067,7 @@ private Table[] buildExplodeTestTableWithPrimitiveTypes(boolean pos, boolean out .build()) { Table.TestBuilder expectedBuilder = new Table.TestBuilder(); if (pos) { - Integer[] posData = outer ? new Integer[]{0, 1, 2, 0, 1, 0, 0, 0} : new Integer[]{0, 1, 2, 0, 1, 0}; + Integer[] posData = outer ? 
new Integer[]{0, 1, 2, 0, 1, 0, null, null} : new Integer[]{0, 1, 2, 0, 1, 0};
       expectedBuilder.column(posData);
     }
     List<Object[]> expectedData = new ArrayList<Object[]>(){{
@@ -5109,10 +5109,11 @@ private Table[] buildExplodeTestTableWithNestedTypes(boolean pos, boolean outer)
         .build()) {
       Table.TestBuilder expectedBuilder = new Table.TestBuilder();
       if (pos) {
-        if (!outer)
+        if (outer) {
+          expectedBuilder.column(0, 1, 2, 0, 1, 0, null, null);
+        } else {
           expectedBuilder.column(0, 1, 2, 0, 1, 0, 0);
-        else
-          expectedBuilder.column(0, 1, 2, 0, 1, 0, 0, 0);
+        }
       }
       List<Object[]> expectedData = new ArrayList<Object[]>(){{
         if (!outer) {
diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx
index 4c72ba2e055..8f93866612e 100644
--- a/python/cudf/cudf/_lib/copying.pyx
+++ b/python/cudf/cudf/_lib/copying.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2021, NVIDIA CORPORATION.
 
 import pandas as pd
 
@@ -564,11 +564,11 @@ def copy_if_else(object lhs, object rhs, Column boolean_mask):
             return _copy_if_else_column_column(lhs, rhs, boolean_mask)
         else:
             return _copy_if_else_column_scalar(
-                lhs, as_device_scalar(rhs, lhs.dtype), boolean_mask)
+                lhs, as_device_scalar(rhs), boolean_mask)
     else:
         if isinstance(rhs, Column):
             return _copy_if_else_scalar_column(
-                as_device_scalar(lhs, rhs.dtype), rhs, boolean_mask)
+                as_device_scalar(lhs), rhs, boolean_mask)
         else:
             if lhs is None and rhs is None:
                 return lhs
diff --git a/python/cudf/cudf/_lib/cpp/lists/drop_list_duplicates.pxd b/python/cudf/cudf/_lib/cpp/lists/drop_list_duplicates.pxd
new file mode 100644
index 00000000000..40b1836f932
--- /dev/null
+++ b/python/cudf/cudf/_lib/cpp/lists/drop_list_duplicates.pxd
@@ -0,0 +1,15 @@
+# Copyright (c) 2021, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+
+from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view
+from cudf._lib.cpp.column.column cimport column
+from cudf._lib.cpp.types cimport null_equality, nan_equality
+
+cdef extern from "cudf/lists/drop_list_duplicates.hpp" \
+        namespace "cudf::lists" nogil:
+    cdef unique_ptr[column] drop_list_duplicates(
+        const lists_column_view lists_column,
+        null_equality nulls_equal,
+        nan_equality nans_equal
+    ) except +
diff --git a/python/cudf/cudf/_lib/cpp/types.pxd b/python/cudf/cudf/_lib/cpp/types.pxd
index bd1108b2cdf..1f2094b3958 100644
--- a/python/cudf/cudf/_lib/cpp/types.pxd
+++ b/python/cudf/cudf/_lib/cpp/types.pxd
@@ -46,6 +46,10 @@ cdef extern from "cudf/types.hpp" namespace "cudf" nogil:
         EQUAL "cudf::null_equality::EQUAL"
         UNEQUAL "cudf::null_equality::UNEQUAL"
 
+    ctypedef enum nan_equality "cudf::nan_equality":
+        ALL_EQUAL "cudf::nan_equality::ALL_EQUAL"
+        UNEQUAL "cudf::nan_equality::UNEQUAL"
+
     ctypedef enum type_id "cudf::type_id":
         EMPTY "cudf::type_id::EMPTY"
         INT8 "cudf::type_id::INT8"
diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx
index 7f745e58c67..e93cba20f65 100644
--- a/python/cudf/cudf/_lib/lists.pyx
+++ b/python/cudf/cudf/_lib/lists.pyx
@@ -10,6 +10,9 @@ from cudf._lib.cpp.lists.count_elements cimport (
 from cudf._lib.cpp.lists.explode cimport (
     explode_outer as cpp_explode_outer
 )
+from cudf._lib.cpp.lists.drop_list_duplicates cimport (
+    drop_list_duplicates as cpp_drop_list_duplicates
+)
 from cudf._lib.cpp.lists.sorting cimport (
     sort_lists as cpp_sort_lists
 )
@@ -22,7 +25,13 @@ from cudf._lib.cpp.scalar.scalar cimport scalar
 from cudf._lib.cpp.table.table cimport table
 from cudf._lib.cpp.table.table_view cimport table_view
 
-from cudf._lib.cpp.types cimport size_type,
order, null_order +from cudf._lib.cpp.types cimport ( + size_type, + null_equality, + order, + null_order, + nan_equality +) from cudf._lib.column cimport Column from cudf._lib.table cimport Table @@ -71,6 +80,34 @@ def explode_outer(Table tbl, int explode_column_idx, bool ignore_index=False): ) +def drop_list_duplicates(Column col, bool nulls_equal, bool nans_all_equal): + """ + nans_all_equal == True indicates that libcudf should treat any two elements + from {+nan, -nan} as equal, and as unequal otherwise. + nulls_equal == True indicates that libcudf should treat any two nulls as + equal, and as unequal otherwise. + """ + cdef shared_ptr[lists_column_view] list_view = ( + make_shared[lists_column_view](col.view()) + ) + cdef null_equality c_nulls_equal = ( + null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL + ) + cdef nan_equality c_nans_equal = ( + nan_equality.ALL_EQUAL if nans_all_equal else nan_equality.UNEQUAL + ) + + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_drop_list_duplicates(list_view.get()[0], + c_nulls_equal, + c_nans_equal) + ) + return Column.from_unique_ptr(move(c_result)) + + def sort_lists(Column col, bool ascending, str na_position): cdef shared_ptr[lists_column_view] list_view = ( make_shared[lists_column_view](col.view()) @@ -121,6 +158,5 @@ def contains_scalar(Column col, DeviceScalar search_key): list_view.get()[0], search_key_value[0], )) - result = Column.from_unique_ptr(move(c_result)) return result diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index d8b4fbbbe4b..4ea2adec23a 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -312,6 +312,9 @@ cpdef write_parquet( num_index_cols_meta = 0 for i, name in enumerate(table._column_names, num_index_cols_meta): + if not isinstance(name, str): + raise ValueError("parquet must have string column names") + tbl_meta.get().column_metadata[i].set_name(name.encode()) _set_col_metadata( table[name]._column, tbl_meta.get().column_metadata[i] diff --git a/python/cudf/cudf/core/__init__.py b/python/cudf/cudf/core/__init__.py index 91a369c31f8..59173cc0247 100644 --- a/python/cudf/cudf/core/__init__.py +++ b/python/cudf/cudf/core/__init__.py @@ -1,6 +1,6 @@ -# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. -from cudf.core import buffer, column, column_accessor, common +from cudf.core import _internals, buffer, column, column_accessor, common from cudf.core.buffer import Buffer from cudf.core.dataframe import DataFrame, from_pandas, merge from cudf.core.index import ( diff --git a/python/cudf/cudf/core/_internals/__init__.py b/python/cudf/cudf/core/_internals/__init__.py new file mode 100644 index 00000000000..53d186def85 --- /dev/null +++ b/python/cudf/cudf/core/_internals/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. + +from cudf.core._internals.where import where diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py new file mode 100644 index 00000000000..1fdc907875e --- /dev/null +++ b/python/cudf/cudf/core/_internals/where.py @@ -0,0 +1,383 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. 
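+#
+# Shared implementation of ``.where`` for DataFrame, Series and Index: ``cond``
+# is normalized to the caller's shape, ``other`` is cast to a compatible dtype
+# via ``_normalize_columns_and_scalars_type``, and each column is then combined
+# with ``cudf._lib.copying.copy_if_else``.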
+ +import warnings +from typing import Any, Optional, Tuple, Union, cast + +import numpy as np +import pandas as pd + +import cudf +from cudf._typing import ColumnLike, ScalarLike +from cudf.core.column import ColumnBase +from cudf.core.dataframe import DataFrame +from cudf.core.frame import Frame +from cudf.core.index import Index +from cudf.core.series import Series + + +def _normalize_scalars(col: ColumnBase, other: ScalarLike) -> ScalarLike: + """ + Try to normalize scalar values as per col dtype + """ + if (isinstance(other, float) and not np.isnan(other)) and ( + col.dtype.type(other) != other + ): + raise TypeError( + f"Cannot safely cast non-equivalent " + f"{type(other).__name__} to {col.dtype.name}" + ) + + return cudf.Scalar(other, dtype=col.dtype if other is None else None) + + +def _check_and_cast_columns_with_other( + source_col: ColumnBase, + other: Union[ScalarLike, ColumnBase], + inplace: bool, +) -> Tuple[ColumnBase, Union[ScalarLike, ColumnBase]]: + """ + Returns type-casted column `source_col` & scalar `other_scalar` + based on `inplace` parameter. + """ + if cudf.utils.dtypes.is_categorical_dtype(source_col.dtype): + return source_col, other + + if cudf.utils.dtypes.is_scalar(other): + device_obj = _normalize_scalars(source_col, other) + else: + device_obj = other + + if other is None: + return source_col, device_obj + elif cudf.utils.dtypes.is_mixed_with_object_dtype(device_obj, source_col): + raise TypeError( + "cudf does not support mixed types, please type-cast " + "the column of dataframe/series and other " + "to same dtypes." + ) + if inplace: + if not cudf.utils.dtypes._can_cast(device_obj.dtype, source_col.dtype): + warnings.warn( + f"Type-casting from {device_obj.dtype} " + f"to {source_col.dtype}, there could be potential data loss" + ) + return source_col, device_obj.astype(source_col.dtype) + else: + if ( + cudf.utils.dtypes.is_scalar(other) + and cudf.utils.dtypes.is_numerical_dtype(source_col.dtype) + and cudf.utils.dtypes._can_cast(other, source_col.dtype) + ): + common_dtype = source_col.dtype + return ( + source_col.astype(common_dtype), + cudf.Scalar(other, dtype=common_dtype), + ) + else: + common_dtype = cudf.utils.dtypes.find_common_type( + [ + source_col.dtype, + np.min_scalar_type(other) + if cudf.utils.dtypes.is_scalar(other) + else other.dtype, + ] + ) + if cudf.utils.dtypes.is_scalar(device_obj): + device_obj = cudf.Scalar(other, dtype=common_dtype) + else: + device_obj = device_obj.astype(common_dtype) + return source_col.astype(common_dtype), device_obj + + +def _normalize_columns_and_scalars_type( + frame: Union[Series, Index, DataFrame], other: Any, inplace: bool = False, +) -> Tuple[ + Union[Series, Index, DataFrame, ColumnLike], Any, +]: + """ + Try to normalize the other's dtypes as per frame. 
+ + Parameters + ---------- + + frame : Can be a DataFrame or Series or Index + other : Can be a DataFrame, Series, Index, Array + like object or a scalar value + + if frame is DataFrame, other can be only a + scalar or array like with size of number of columns + in DataFrame or a DataFrame with same dimension + + if frame is Series, other can be only a scalar or + a series like with same length as frame + + Returns: + -------- + A dataframe/series/list/scalar form of normalized other + """ + if isinstance(frame, DataFrame) and isinstance(other, DataFrame): + source_df = frame.copy(deep=False) + other_df = other.copy(deep=False) + for self_col in source_df._column_names: + source_col, other_col = _check_and_cast_columns_with_other( + source_col=source_df._data[self_col], + other=other_df._data[self_col], + inplace=inplace, + ) + source_df._data[self_col] = source_col + other_df._data[self_col] = other_col + return source_df, other_df + + elif isinstance( + frame, (Series, Index) + ) and not cudf.utils.dtypes.is_scalar(other): + other = cudf.core.column.as_column(other) + input_col = frame._data[frame.name] + return _check_and_cast_columns_with_other( + source_col=input_col, other=other, inplace=inplace + ) + else: + # Handles scalar or list/array like scalars + if isinstance(frame, (Series, Index)) and cudf.utils.dtypes.is_scalar( + other + ): + input_col = frame._data[frame.name] + return _check_and_cast_columns_with_other( + source_col=frame._data[frame.name], + other=other, + inplace=inplace, + ) + + elif isinstance(frame, DataFrame): + if cudf.utils.dtypes.is_scalar(other): + other = [other for i in range(len(frame._column_names))] + + source_df = frame.copy(deep=False) + others = [] + for col_name, other_sclr in zip(frame._column_names, other): + + ( + source_col, + other_scalar, + ) = _check_and_cast_columns_with_other( + source_col=source_df._data[col_name], + other=other_sclr, + inplace=inplace, + ) + source_df._data[col_name] = source_col + others.append(other_scalar) + return source_df, others + else: + raise ValueError( + f"Inappropriate input {type(frame)} " + f"and other {type(other)} combination" + ) + + +def where( + frame: Union[Series, Index, DataFrame], + cond: Any, + other: Any = None, + inplace: bool = False, +) -> Optional[Union[Frame]]: + """ + Replace values where the condition is False. + + Parameters + ---------- + cond : bool Series/DataFrame, array-like + Where cond is True, keep the original value. + Where False, replace with corresponding value from other. + Callables are not supported. + other: scalar, list of scalars, Series/DataFrame + Entries where cond is False are replaced with + corresponding value from other. Callables are not + supported. Default is None. + + DataFrame expects only Scalar or array like with scalars or + dataframe with same dimension as frame. + + Series expects only scalar or series like with same length + inplace : bool, default False + Whether to perform the operation in place on the data. 
+
+    Returns
+    -------
+    Same type as caller
+
+    Examples
+    --------
+    >>> import cudf
+    >>> df = cudf.DataFrame({"A": [1, 4, 5], "B": [3, 5, 8]})
+    >>> df.where(df % 2 == 0, [-1, -1])
+        A  B
+    0  -1 -1
+    1   4 -1
+    2  -1  8
+
+    >>> ser = cudf.Series([4, 3, 2, 1, 0])
+    >>> ser.where(ser > 2, 10)
+    0     4
+    1     3
+    2    10
+    3    10
+    4    10
+    dtype: int64
+    >>> ser.where(ser > 2)
+    0       4
+    1       3
+    2    <NA>
+    3    <NA>
+    4    <NA>
+    dtype: int64
+    """
+
+    if isinstance(frame, DataFrame):
+        if hasattr(cond, "__cuda_array_interface__"):
+            cond = DataFrame(
+                cond, columns=frame._column_names, index=frame.index
+            )
+        elif (
+            hasattr(cond, "__array_interface__")
+            and cond.__array_interface__["shape"] != frame.shape
+        ):
+            raise ValueError("conditional must be same shape as self")
+        elif not isinstance(cond, DataFrame):
+            cond = frame.from_pandas(pd.DataFrame(cond))
+
+        common_cols = set(frame._column_names).intersection(
+            set(cond._column_names)
+        )
+        if len(common_cols) > 0:
+            # If `frame` and `cond` have unequal indexes,
+            # then reindex `cond`.
+            if not frame.index.equals(cond.index):
+                cond = cond.reindex(frame.index)
+        else:
+            if cond.shape != frame.shape:
+                raise ValueError(
+                    """Array conditional must be same shape as self"""
+                )
+            # Setting `frame` column names to `cond`,
+            # as `cond` has no column names.
+            cond.columns = frame.columns
+
+        (source_df, others,) = _normalize_columns_and_scalars_type(
+            frame, other
+        )
+        if isinstance(other, Frame):
+            others = others._data.columns
+
+        out_df = DataFrame(index=frame.index)
+        if len(frame._columns) != len(others):
+            raise ValueError(
+                """Replacement list length or number of dataframe columns
+                should be equal to number of columns of dataframe"""
+            )
+        for i, column_name in enumerate(frame._column_names):
+            input_col = source_df._data[column_name]
+            other_column = others[i]
+            if column_name in cond._data:
+                if isinstance(input_col, cudf.core.column.CategoricalColumn):
+                    if cudf.utils.dtypes.is_scalar(other_column):
+                        try:
+                            other_column = input_col._encode(other_column)
+                        except ValueError:
+                            # When other is not present in categories,
+                            # fill with Null.
+                            other_column = None
+                        other_column = cudf.Scalar(
+                            other_column, dtype=input_col.codes.dtype
+                        )
+                    elif isinstance(
+                        other_column, cudf.core.column.CategoricalColumn
+                    ):
+                        other_column = other_column.codes
+                    input_col = input_col.codes
+
+                result = cudf._lib.copying.copy_if_else(
+                    input_col, other_column, cond._data[column_name]
+                )
+
+                if isinstance(
+                    frame._data[column_name],
+                    cudf.core.column.CategoricalColumn,
+                ):
+                    result = cudf.core.column.build_categorical_column(
+                        categories=frame._data[column_name].categories,
+                        codes=cudf.core.column.as_column(
+                            result.base_data, dtype=result.dtype
+                        ),
+                        mask=result.base_mask,
+                        size=result.size,
+                        offset=result.offset,
+                        ordered=frame._data[column_name].ordered,
+                    )
+            else:
+                out_mask = cudf._lib.null_mask.create_null_mask(
+                    len(input_col),
+                    state=cudf._lib.null_mask.MaskState.ALL_NULL,
+                )
+                result = input_col.set_mask(out_mask)
+            out_df[column_name] = frame[column_name].__class__(result)
+
+        return frame._mimic_inplace(out_df, inplace=inplace)
+
+    else:
+        if isinstance(other, DataFrame):
+            raise NotImplementedError(
+                "cannot align with a higher dimensional Frame"
+            )
+        input_col = frame._data[frame.name]
+        cond = cudf.core.column.as_column(cond)
+        if len(cond) != len(frame):
+            raise ValueError(
+                """Array conditional must be same shape as self"""
+            )
+
+        (input_col, other,) = _normalize_columns_and_scalars_type(
+            frame, other, inplace
+        )
+
+        if isinstance(input_col, cudf.core.column.CategoricalColumn):
+            if cudf.utils.dtypes.is_scalar(other):
+                try:
+                    other = input_col._encode(other)
+                except ValueError:
+                    # When other is not present in categories,
+                    # fill with Null.
+                    other = None
+                other = cudf.Scalar(other, dtype=input_col.codes.dtype)
+            elif isinstance(other, cudf.core.column.CategoricalColumn):
+                other = other.codes
+
+            input_col = input_col.codes
+
+        result = cudf._lib.copying.copy_if_else(input_col, other, cond)
+
+        if isinstance(
+            frame._data[frame.name], cudf.core.column.CategoricalColumn
+        ):
+            result = cudf.core.column.build_categorical_column(
+                categories=cast(
+                    cudf.core.column.CategoricalColumn,
+                    frame._data[frame.name],
+                ).categories,
+                codes=cudf.core.column.as_column(
+                    result.base_data, dtype=result.dtype
+                ),
+                mask=result.base_mask,
+                size=result.size,
+                offset=result.offset,
+                ordered=cast(
+                    cudf.core.column.CategoricalColumn,
+                    frame._data[frame.name],
+                ).ordered,
+            )
+
+        if isinstance(frame, Index):
+            result = Index(result, name=frame.name)
+        else:
+            result = frame._copy_construct(data=result)
+
+        return frame._mimic_inplace(result, inplace=inplace)
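As a quick illustration (not part of the patch, and assuming a cudf build that includes it): the scalar-normalization rules above accept a fill value that casts safely to the column dtype, and reject one that would change value under the cast, per `_normalize_scalars` and `_check_and_cast_columns_with_other`.

```python
import cudf

s = cudf.Series([1, 2, 3])  # int64 column

# An integer fill value casts safely, so the int64 dtype is preserved.
print(s.where(s > 1, 100).dtype)  # int64

# 100.5 is not equivalent after casting (int64(100.5) != 100.5), so
# _normalize_scalars raises instead of silently truncating.
try:
    s.where(s > 1, 100.5)
except TypeError as err:
    print(err)  # Cannot safely cast non-equivalent float to int64
```

Equivalent floats such as 100.0 pass normalization; depending on `_can_cast` they may still be promoted to a common dtype via `find_common_type`.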
diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py
index b7f34e8c007..364675cd035 100644
--- a/python/cudf/cudf/core/column/lists.py
+++ b/python/cudf/cudf/core/column/lists.py
@@ -10,6 +10,7 @@ from cudf._lib.lists import (
     contains_scalar,
     count_elements,
+    drop_list_duplicates,
     extract_element,
     sort_lists,
 )
@@ -361,6 +362,41 @@ def take(self, lists_indices):
         else:
             return res
 
+    def unique(self):
+        """
+        Returns the unique elements in each list of the column.
+        The order of the elements within each list is not guaranteed.
+
+        Returns
+        -------
+        ListColumn
+
+        Examples
+        --------
+        >>> s = cudf.Series([[1, 1, 2, None, None], None, [4, 4], []])
+        >>> s
+        0    [1.0, 1.0, 2.0, nan, nan]
+        1                         None
+        2                   [4.0, 4.0]
+        3                           []
+        dtype: list
+        >>> s.list.unique()  # Order of list elements is not guaranteed
+        0    [1.0, 2.0, nan]
+        1               None
+        2              [4.0]
+        3                 []
+        dtype: list
+        """
+
+        if is_list_dtype(self._column.children[1].dtype):
+            raise NotImplementedError("Nested lists unique is not supported.")
+
+        return self._return_or_inplace(
+            drop_list_duplicates(
+                self._column, nulls_equal=True, nans_all_equal=True
+            )
+        )
+
     def sort_values(
         self,
         ascending=True,
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 01b96151485..6639fc7c25c 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -1658,8 +1658,9 @@ def update(
         if not self.index.equals(other.index):
             other = other.reindex(self.index, axis=0)
 
-        for col in self.columns:
-            this = self[col]
+        source_df = self.copy(deep=False)
+        for col in source_df._column_names:
+            this = source_df[col]
             that = other[col]
 
             if errors == "raise":
@@ -1676,8 +1677,9 @@ def update(
             # don't overwrite columns unnecessarily
             if mask.all():
                 continue
+            source_df[col] = source_df[col].where(mask, that)
 
-            self[col] = this.where(mask, that)
+        self._mimic_inplace(source_df, inplace=True)
 
     def __add__(self, other):
         return self._apply_op("__add__", other)
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index fb746d6c794..bc43c367833 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -6,7 +6,7 @@
 import functools
 import warnings
 from collections import OrderedDict, abc as abc
-from typing import TYPE_CHECKING, Any, Dict, Tuple, TypeVar, Union, overload
+from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, TypeVar, Union
 
 import cupy
 import numpy as np
@@ -14,7 +14,6 @@
 import pyarrow as pa
 from nvtx import annotate
 from pandas.api.types import is_dict_like, is_dtype_equal
-from typing_extensions import Literal
 
 import cudf
 from cudf import _lib as libcudf
@@ -53,19 +52,9 @@ class Frame(libcudf.table.Table):
     def _from_table(cls, table: Frame):
         return cls(table._data, index=table._index)
 
-    @overload
-    def _mimic_inplace(self, result: Frame) -> Frame:
-        ...
-
-    @overload
-    def _mimic_inplace(self, result: Frame, inplace: Literal[True]):
-        ...
-
-    @overload
-    def _mimic_inplace(self, result: Frame, inplace: Literal[False]) -> Frame:
-        ...
- - def _mimic_inplace(self, result, inplace=False): + def _mimic_inplace( + self: T, result: Frame, inplace: bool = False + ) -> Optional[Frame]: if inplace: for col in self._data: if col in result._data: @@ -74,6 +63,7 @@ def _mimic_inplace(self, result, inplace=False): ) self._data = result._data self._index = result._index + return None else: return result @@ -796,87 +786,6 @@ def clip(self, lower=None, upper=None, inplace=False, axis=1): return self._mimic_inplace(output, inplace=inplace) - def _normalize_scalars(self, other): - """ - Try to normalizes scalar values as per self dtype - """ - if ( - other is not None - and (isinstance(other, float) and not np.isnan(other)) - ) and (self.dtype.type(other) != other): - raise TypeError( - f"Cannot safely cast non-equivalent " - f"{type(other).__name__} to {self.dtype.name}" - ) - - return ( - self.dtype.type(other) - if ( - other is not None - and (isinstance(other, float) and not np.isnan(other)) - ) - else other - ) - - def _normalize_columns_and_scalars_type(self, other): - """ - Try to normalize the other's dtypes as per self. - - Parameters - ---------- - - self : Can be a DataFrame or Series or Index - other : Can be a DataFrame, Series, Index, Array - like object or a scalar value - - if self is DataFrame, other can be only a - scalar or array like with size of number of columns - in DataFrame or a DataFrame with same dimension - - if self is Series, other can be only a scalar or - a series like with same length as self - - Returns: - -------- - A dataframe/series/list/scalar form of normalized other - """ - if isinstance(self, cudf.DataFrame) and isinstance( - other, cudf.DataFrame - ): - return [ - other[self_col].astype(self._data[self_col].dtype)._column - for self_col in self._data.names - ] - - elif isinstance(self, (cudf.Series, cudf.Index)) and not is_scalar( - other - ): - other = as_column(other) - return other.astype(self.dtype) - - else: - # Handles scalar or list/array like scalars - if isinstance(self, (cudf.Series, cudf.Index)) and is_scalar( - other - ): - return self._normalize_scalars(other) - - elif isinstance(self, cudf.DataFrame): - out = [] - if is_scalar(other): - other = [other for i in range(len(self._data.names))] - out = [ - self[in_col_name]._normalize_scalars(sclr) - for in_col_name, sclr in zip(self._data.names, other) - ] - - return out - else: - raise ValueError( - f"Inappropriate input {type(self)} " - f"and other {type(other)} combination" - ) - def where(self, cond, other=None, inplace=False): """ Replace values where the condition is False. @@ -930,133 +839,9 @@ def where(self, cond, other=None, inplace=False): dtype: int64 """ - if isinstance(self, cudf.DataFrame): - if hasattr(cond, "__cuda_array_interface__"): - cond = cudf.DataFrame( - cond, columns=self._data.names, index=self.index - ) - elif not isinstance(cond, cudf.DataFrame): - cond = self.from_pandas(pd.DataFrame(cond)) - - common_cols = set(self._data.names).intersection( - set(cond._data.names) - ) - if len(common_cols) > 0: - # If `self` and `cond` are having unequal index, - # then re-index `cond`. - if not self.index.equals(cond.index): - cond = cond.reindex(self.index) - else: - if cond.shape != self.shape: - raise ValueError( - """Array conditional must be same shape as self""" - ) - # Setting `self` column names to `cond` - # as `cond` has no column names. 
- cond.columns = self.columns - - other = self._normalize_columns_and_scalars_type(other) - out_df = cudf.DataFrame(index=self.index) - if len(self._columns) != len(other): - raise ValueError( - """Replacement list length or number of dataframe columns - should be equal to Number of columns of dataframe""" - ) - - for column_name, other_column in zip(self._data.names, other): - input_col = self._data[column_name] - if column_name in cond._data: - if isinstance( - input_col, cudf.core.column.CategoricalColumn - ): - if np.isscalar(other_column): - try: - other_column = input_col._encode(other_column) - except ValueError: - # When other is not present in categories, - # fill with Null. - other_column = None - elif hasattr(other_column, "codes"): - other_column = other_column.codes - input_col = input_col.codes - - result = libcudf.copying.copy_if_else( - input_col, other_column, cond._data[column_name] - ) - - if isinstance( - self._data[column_name], - cudf.core.column.CategoricalColumn, - ): - result = build_categorical_column( - categories=self._data[column_name].categories, - codes=as_column( - result.base_data, dtype=result.dtype - ), - mask=result.base_mask, - size=result.size, - offset=result.offset, - ordered=self._data[column_name].ordered, - ) - else: - from cudf._lib.null_mask import MaskState, create_null_mask - - out_mask = create_null_mask( - len(input_col), state=MaskState.ALL_NULL - ) - result = input_col.set_mask(out_mask) - out_df[column_name] = self[column_name].__class__(result) - - return self._mimic_inplace(out_df, inplace=inplace) - - else: - - if isinstance(other, cudf.DataFrame): - raise NotImplementedError( - "cannot align with a higher dimensional Frame" - ) - - other = self._normalize_columns_and_scalars_type(other) - - cond = as_column(cond) - if len(cond) != len(self): - raise ValueError( - """Array conditional must be same shape as self""" - ) - input_col = self._data[self.name] - if isinstance(input_col, cudf.core.column.CategoricalColumn): - if np.isscalar(other): - try: - other = input_col._encode(other) - except ValueError: - # When other is not present in categories, - # fill with Null. 
- other = None - elif hasattr(other, "codes"): - other = other.codes - - input_col = input_col.codes - - result = libcudf.copying.copy_if_else(input_col, other, cond) - - if is_categorical_dtype(self.dtype): - result = build_categorical_column( - categories=self._data[self.name].categories, - codes=as_column(result.base_data, dtype=result.dtype), - mask=result.base_mask, - size=result.size, - offset=result.offset, - ordered=self._data[self.name].ordered, - ) - - if isinstance(self, cudf.Index): - from cudf.core.index import as_index - - result = as_index(result, name=self.name) - else: - result = self._copy_construct(data=result) - - return self._mimic_inplace(result, inplace=inplace) + return cudf.core._internals.where( + frame=self, cond=cond, other=other, inplace=inplace + ) def mask(self, cond, other=None, inplace=False): """ @@ -2735,7 +2520,6 @@ def searchsorted( array([4, 4, 4, 0], dtype=int32) """ # Call libcudf++ search_sorted primitive - from cudf.utils.dtypes import is_scalar scalar_flag = None if is_scalar(values): diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 71a4a48a07a..955519d0b57 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -3923,6 +3923,110 @@ def replace( return self._mimic_inplace(result, inplace=inplace) + def update(self, other): + """ + Modify Series in place using values from passed Series. + Uses non-NA values from passed Series to make updates. Aligns + on index. + + Parameters + ---------- + other : Series, or object coercible into Series + + Examples + -------- + >>> import cudf + >>> s = cudf.Series([1, 2, 3]) + >>> s + 0 1 + 1 2 + 2 3 + dtype: int64 + >>> s.update(cudf.Series([4, 5, 6])) + >>> s + 0 4 + 1 5 + 2 6 + dtype: int64 + >>> s = cudf.Series(['a', 'b', 'c']) + >>> s + 0 a + 1 b + 2 c + dtype: object + >>> s.update(cudf.Series(['d', 'e'], index=[0, 2])) + >>> s + 0 d + 1 b + 2 e + dtype: object + >>> s = cudf.Series([1, 2, 3]) + >>> s + 0 1 + 1 2 + 2 3 + dtype: int64 + >>> s.update(cudf.Series([4, 5, 6, 7, 8])) + >>> s + 0 4 + 1 5 + 2 6 + dtype: int64 + + If ``other`` contains NaNs the corresponding values are not updated + in the original Series. 
+
+        >>> import numpy as np
+        >>> s = cudf.Series([1, 2, 3])
+        >>> s
+        0    1
+        1    2
+        2    3
+        dtype: int64
+        >>> s.update(cudf.Series([4, np.nan, 6], nan_as_null=False))
+        >>> s
+        0    4
+        1    2
+        2    6
+        dtype: int64
+
+        ``other`` can also be a non-Series object type
+        that is coercible into a Series
+
+        >>> s = cudf.Series([1, 2, 3])
+        >>> s
+        0    1
+        1    2
+        2    3
+        dtype: int64
+        >>> s.update([4, np.nan, 6])
+        >>> s
+        0    4
+        1    2
+        2    6
+        dtype: int64
+        >>> s = cudf.Series([1, 2, 3])
+        >>> s
+        0    1
+        1    2
+        2    3
+        dtype: int64
+        >>> s.update({1: 9})
+        >>> s
+        0    1
+        1    9
+        2    3
+        dtype: int64
+        """
+
+        if not isinstance(other, cudf.Series):
+            other = cudf.Series(other)
+
+        if not self.index.equals(other.index):
+            other = other.reindex(index=self.index)
+        mask = other.notna()
+
+        self.mask(mask, other, inplace=True)
+
     def reverse(self):
         """
         Reverse the Series
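Roughly, the `Series.update` implementation above reduces to a reindex followed by an in-place `mask`; the sketch below (hypothetical data, assuming this branch of cudf) spells that out.

```python
import cudf

s = cudf.Series([1, 2, 3])
other = cudf.Series([9], index=[1])

# What s.update(other) does internally: align `other` on s's index,
# then keep s's original value wherever the aligned `other` is NA.
aligned = other.reindex(index=s.index)  # <NA>, 9, <NA>
s.mask(aligned.notna(), aligned, inplace=True)
print(s.to_pandas().tolist())  # [1, 9, 3]
```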
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index d72b88f1713..f068d02d575 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -8215,9 +8215,6 @@ def test_agg_for_dataframe_with_string_columns(aggs):
 @pytest.mark.parametrize(
     "overwrite", [True, False],
 )
-@pytest.mark.parametrize(
-    "filter_func", [None],
-)
 @pytest.mark.parametrize(
     "errors", ["ignore"],
 )
@@ -8262,19 +8259,17 @@ def test_agg_for_dataframe_with_string_columns(aggs):
         },
     ],
 )
-def test_update_for_dataframes(
-    data, data2, join, overwrite, filter_func, errors
-):
+def test_update_for_dataframes(data, data2, join, overwrite, errors):
     pdf = pd.DataFrame(data)
     gdf = cudf.DataFrame(data)
 
     other_pd = pd.DataFrame(data2)
     other_gd = cudf.DataFrame(data2)
 
-    expect = pdf.update(other_pd, join, overwrite, filter_func, errors)
-    got = gdf.update(other_gd, join, overwrite, filter_func, errors)
+    pdf.update(other=other_pd, join=join, overwrite=overwrite, errors=errors)
+    gdf.update(other=other_gd, join=join, overwrite=overwrite, errors=errors)
 
-    assert_eq(expect, got)
+    assert_eq(pdf, gdf, check_dtype=False)
 
 
 @pytest.mark.parametrize(
diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py
index 5645ce60596..9906600304b 100644
--- a/python/cudf/cudf/tests/test_list.py
+++ b/python/cudf/cudf/tests/test_list.py
@@ -1,6 +1,7 @@
 # Copyright (c) 2020-2021, NVIDIA CORPORATION.
 
 import functools
 
+import numpy as np
 import pandas as pd
 import pyarrow as pa
 import pytest
@@ -162,6 +163,39 @@ def test_take_invalid(invalid, exception):
         gs.list.take(invalid)
 
 
+@pytest.mark.parametrize(
+    ("data", "expected"),
+    [
+        ([[1, 1, 2, 2], [], None, [3, 4, 5]], [[1, 2], [], None, [3, 4, 5]]),
+        (
+            [[1.233, np.nan, 1.234, 3.141, np.nan, 1.234]],
+            [[1.233, 1.234, np.nan, 3.141]],
+        ),  # duplicate nans
+        ([[1, 1, 2, 2, None, None]], [[1, 2, None]]),  # duplicate nulls
+        (
+            [[1.233, np.nan, None, 1.234, 3.141, np.nan, 1.234, None]],
+            [[1.233, 1.234, np.nan, None, 3.141]],
+        ),  # duplicate nans and nulls
+        ([[2, None, 1, None, 2]], [[1, 2, None]]),
+        ([[], []], [[], []]),
+        ([[], None], [[], None]),
+    ],
+)
+def test_unique(data, expected):
+    """
+    Pandas de-duplicates nans and nulls respectively in Series.unique.
+    `expected` is set up to mimic that behavior.
+    """
+    gs = cudf.Series(data, nan_as_null=False)
+
+    got = gs.list.unique()
+    expected = cudf.Series(expected, nan_as_null=False).list.sort_values()
+
+    got = got.list.sort_values()
+
+    assert_eq(expected, got)
+
+
 def key_func_builder(x, na_position):
     if x is None:
         if na_position == "first":
diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index fe418d1ade1..4781ff995b0 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -19,7 +19,7 @@
 import cudf
 from cudf.io.parquet import ParquetWriter, merge_parquet_filemetadata
 from cudf.tests import dataset_generator as dg
-from cudf.tests.utils import assert_eq
+from cudf.tests.utils import assert_eq, assert_exceptions_equal
 
 
 @pytest.fixture(scope="module")
@@ -1937,3 +1937,15 @@ def test_parquet_writer_decimal(tmpdir):
 
     got = pd.read_parquet(fname)
     assert_eq(gdf, got)
+
+
+def test_parquet_writer_column_validation():
+    df = cudf.DataFrame({1: [1, 2, 3], "1": ["a", "b", "c"]})
+    pdf = df.to_pandas()
+
+    assert_exceptions_equal(
+        lfunc=df.to_parquet,
+        rfunc=pdf.to_parquet,
+        lfunc_args_and_kwargs=(["cudf.parquet"],),
+        rfunc_args_and_kwargs=(["pandas.parquet"],),
+    )
diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py
index e7baa4ee926..65ce2a79992 100644
--- a/python/cudf/cudf/tests/test_replace.py
+++ b/python/cudf/cudf/tests/test_replace.py
@@ -709,25 +709,40 @@ def test_series_where(data_dtype, fill_value):
             sr.where(sr > 0, fill_value)
     else:
-        # Cast back to original dtype as pandas automatically upcasts
-        expect = psr.where(psr > 0, fill_value).astype(psr.dtype)
+        expect = psr.where(psr > 0, fill_value)
         got = sr.where(sr > 0, fill_value)
 
-        assert_eq(expect, got)
+        # pandas returns 'float16' dtype, which is not supported in cudf
+        assert_eq(
+            expect,
+            got,
+            check_dtype=expect.dtype.kind != "f",
+        )
 
     if sr.dtype.type(fill_value) != fill_value:
         with pytest.raises(TypeError):
             sr.where(sr < 0, fill_value)
     else:
-        expect = psr.where(psr < 0, fill_value).astype(psr.dtype)
+        expect = psr.where(psr < 0, fill_value)
         got = sr.where(sr < 0, fill_value)
 
-        assert_eq(expect, got)
+        # pandas returns 'float16' dtype, which is not supported in cudf
+        assert_eq(
+            expect,
+            got,
+            check_dtype=expect.dtype.kind != "f",
+        )
 
     if sr.dtype.type(fill_value) != fill_value:
         with pytest.raises(TypeError):
             sr.where(sr == 0, fill_value)
     else:
-        expect = psr.where(psr == 0, fill_value).astype(psr.dtype)
+        expect = psr.where(psr == 0, fill_value)
         got = sr.where(sr == 0, fill_value)
 
-        assert_eq(expect, got)
+        # pandas returns 'float16' dtype, which is not supported in cudf
+        assert_eq(
+            expect,
+            got,
+            check_dtype=expect.dtype.kind != "f",
+        )
 
 
 @pytest.mark.parametrize("fill_value", [100, 100.0, 100.5])
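The `check_dtype` relaxation above exists because pandas can land on float16 where cudf cannot. A small sketch of the dtype arithmetic, assuming the patched `find_common_type` from this change:

```python
import numpy as np
import cudf

# With an int8 operand and a small float scalar, NumPy's common type
# is float16:
np_common = np.find_common_type(
    [np.dtype("int8"), np.min_scalar_type(100.5)], []
)
print(np_common)  # float16

# The patched cudf.utils.dtypes.find_common_type promotes that result
# to float32, since cudf has no float16 support.
print(
    cudf.utils.dtypes.find_common_type(
        [np.dtype("int8"), np.min_scalar_type(100.5)]
    )
)  # float32
```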
diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
index beda14934ca..0dc53fa29e9 100644
--- a/python/cudf/cudf/tests/test_series.py
+++ b/python/cudf/cudf/tests/test_series.py
@@ -921,6 +921,42 @@ def custom_add_func(sr, val):
 )
 
 
+@pytest.mark.parametrize(
+    "data",
+    [cudf.Series([1, 2, 3]), cudf.Series([10, 11, 12], index=[1, 2, 3])],
+)
+@pytest.mark.parametrize(
+    "other",
+    [
+        cudf.Series([4, 5, 6]),
+        cudf.Series([4, 5, 6, 7, 8]),
+        cudf.Series([4, np.nan, 6], nan_as_null=False),
+        [4, np.nan, 6],
+        {1: 9},
+    ],
+)
+def test_series_update(data, other):
+    gs = data.copy(deep=True)
+    if isinstance(other, cudf.Series):
+        g_other = other.copy(deep=True)
+        p_other = g_other.to_pandas()
+    else:
+        g_other = other
+        p_other = other
+
+    ps = gs.to_pandas()
+
+    gs_column_before = gs._column
+    gs.update(g_other)
+    gs_column_after = gs._column
+
+    # The in-place update should propagate to pre-existing column
+    # references, so the old column object matches the new one.
+    assert_eq(gs_column_before.to_array(), gs_column_after.to_array())
+
+    ps.update(p_other)
+
+    assert_eq(gs, ps)
+
+
 @pytest.mark.parametrize(
     "data",
     [
@@ -942,6 +978,19 @@ def test_fillna_with_nan(data, nan_as_null, fill_value):
     assert_eq(expected, actual)
 
 
+def test_series_mask_mixed_dtypes_error():
+    s = cudf.Series(["a", "b", "c"])
+    with pytest.raises(
+        TypeError,
+        match=re.escape(
+            "cudf does not support mixed types, please type-cast "
+            "the column of dataframe/series and other "
+            "to same dtypes."
+        ),
+    ):
+        s.where([True, False, True], [1, 2, 3])
+
+
 @pytest.mark.parametrize(
     "ps",
     [
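The before/after column assertion in `test_series_update` leans on `_mimic_inplace(..., inplace=True)` syncing pre-existing column references before swapping in the new data. The sketch below uses internal attributes (`_column`, `to_array`), so treat it as illustrative only, not a supported API.

```python
import cudf

s = cudf.Series([1, 2, 3])
handle = s._column  # a pre-existing reference to the underlying column

s.update(cudf.Series([4, 5, 6]))

# Frame._mimic_inplace first mimics each old column in place, then
# replaces self._data, so the old handle also observes the new values.
assert (handle.to_array() == s._column.to_array()).all()
```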
diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py
index 8af225ecb58..be2b1bca2e0 100644
--- a/python/cudf/cudf/utils/dtypes.py
+++ b/python/cudf/cudf/utils/dtypes.py
@@ -345,7 +345,7 @@ def to_cudf_compatible_scalar(val, dtype=None):
     if not is_scalar(val):
         raise ValueError(
             f"Cannot convert value of type {type(val).__name__} "
-            " to cudf scalar"
+            "to cudf scalar"
         )
 
     if isinstance(val, (np.ndarray, cp.ndarray)) and val.ndim == 0:
@@ -637,6 +637,11 @@ def find_common_type(dtypes):
     # Aggregate same types
     dtypes = set(dtypes)
 
+    if any(is_decimal_dtype(dtype) for dtype in dtypes):
+        raise NotImplementedError(
+            "DecimalDtype is not yet supported in find_common_type"
+        )
+
     # Corner case 1:
     # Resort to np.result_type to handle "M" and "m" types separately
     dt_dtypes = set(filter(lambda t: is_datetime_dtype(t), dtypes))
@@ -651,7 +656,64 @@ def find_common_type(dtypes):
         dtypes = dtypes - td_dtypes
         dtypes.add(np.result_type(*td_dtypes))
 
-    return np.find_common_type(list(dtypes), [])
+    common_dtype = np.find_common_type(list(dtypes), [])
+    if common_dtype == np.dtype("float16"):
+        # cuDF does not support float16 dtype
+        return np.dtype("float32")
+    else:
+        return common_dtype
+
+
+def _can_cast(from_dtype, to_dtype):
+    """
+    Utility function to determine if we can cast
+    from `from_dtype` to `to_dtype`. This function primarily calls
+    `np.can_cast` but with some special handling around
+    cudf-specific dtypes.
+    """
+    if isinstance(from_dtype, type):
+        from_dtype = np.dtype(from_dtype)
+    if isinstance(to_dtype, type):
+        to_dtype = np.dtype(to_dtype)
+
+    # TODO: Add precision & scale checking for
+    # decimal types in future
+    if isinstance(from_dtype, cudf.core.dtypes.Decimal64Dtype):
+        if isinstance(to_dtype, cudf.core.dtypes.Decimal64Dtype):
+            return True
+        elif isinstance(to_dtype, np.dtype):
+            if to_dtype.kind in {"i", "f", "u", "U", "O"}:
+                return True
+            else:
+                return False
+    elif isinstance(from_dtype, np.dtype):
+        if isinstance(to_dtype, np.dtype):
+            return np.can_cast(from_dtype, to_dtype)
+        elif isinstance(to_dtype, cudf.core.dtypes.Decimal64Dtype):
+            if from_dtype.kind in {"i", "f", "u", "U", "O"}:
+                return True
+            else:
+                return False
+        elif isinstance(to_dtype, cudf.core.dtypes.CategoricalDtype):
+            return True
+        else:
+            return False
+    elif isinstance(from_dtype, cudf.core.dtypes.ListDtype):
+        # TODO: Add level based checks too once casting of
+        # list columns is supported
+        if isinstance(to_dtype, cudf.core.dtypes.ListDtype):
+            return np.can_cast(from_dtype.leaf_type, to_dtype.leaf_type)
+        else:
+            return False
+    elif isinstance(from_dtype, cudf.core.dtypes.CategoricalDtype):
+        if isinstance(to_dtype, cudf.core.dtypes.CategoricalDtype):
+            return True
+        elif isinstance(to_dtype, np.dtype):
+            return np.can_cast(from_dtype._categories.dtype, to_dtype)
+        else:
+            return False
+    else:
+        return np.can_cast(from_dtype, to_dtype)
 
 
 # Type dispatch loops similar to what are found in `np.add.types`
diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py
index bb52ebce262..66b06acc858 100644
--- a/python/dask_cudf/dask_cudf/backends.py
+++ b/python/dask_cudf/dask_cudf/backends.py
@@ -1,12 +1,20 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ +from distutils.version import LooseVersion + import cupy as cp import numpy as np import pandas as pd import pyarrow as pa +import dask from dask.dataframe.categorical import categorical_dtype_dispatch from dask.dataframe.core import get_parallel_type, make_meta, meta_nonempty -from dask.dataframe.methods import concat_dispatch, tolist_dispatch +from dask.dataframe.methods import ( + concat_dispatch, + is_categorical_dtype_dispatch, + tolist_dispatch, +) from dask.dataframe.utils import ( UNKNOWN_CATEGORIES, _nonempty_scalar, @@ -23,6 +31,7 @@ get_parallel_type.register(cudf.DataFrame, lambda _: DataFrame) get_parallel_type.register(cudf.Series, lambda _: Series) get_parallel_type.register(cudf.Index, lambda _: Index) +DASK_VERSION = LooseVersion(dask.__version__) @meta_nonempty.register(cudf.Index) @@ -196,18 +205,45 @@ def make_meta_object(x, index=None): raise TypeError(f"Don't know how to create metadata from {x}") -@concat_dispatch.register((cudf.DataFrame, cudf.Series, cudf.Index)) -def concat_cudf( - dfs, - axis=0, - join="outer", - uniform=False, - filter_warning=True, - sort=None, - ignore_index=False, -): - assert join == "outer" - return cudf.concat(dfs, axis=axis, ignore_index=ignore_index) +if DASK_VERSION > "2021.03.1": + + @concat_dispatch.register((cudf.DataFrame, cudf.Series, cudf.Index)) + def concat_cudf( + dfs, + axis=0, + join="outer", + uniform=False, + filter_warning=True, + sort=None, + ignore_index=False, + **kwargs, + ): + assert join == "outer" + + ignore_order = kwargs.get("ignore_order", False) + if ignore_order: + raise NotImplementedError( + "ignore_order parameter is not yet supported in dask-cudf" + ) + + return cudf.concat(dfs, axis=axis, ignore_index=ignore_index) + + +else: + + @concat_dispatch.register((cudf.DataFrame, cudf.Series, cudf.Index)) + def concat_cudf( + dfs, + axis=0, + join="outer", + uniform=False, + filter_warning=True, + sort=None, + ignore_index=False, + ): + assert join == "outer" + + return cudf.concat(dfs, axis=axis, ignore_index=ignore_index) @categorical_dtype_dispatch.register((cudf.DataFrame, cudf.Series, cudf.Index)) @@ -220,6 +256,13 @@ def tolist_cudf(obj): return obj.to_arrow().to_pylist() +@is_categorical_dtype_dispatch.register( + (cudf.Series, cudf.Index, cudf.CategoricalDtype, Series) +) +def is_categorical_dtype_cudf(obj): + return cudf.utils.dtypes.is_categorical_dtype(obj) + + try: from dask.dataframe.utils import group_split_dispatch, hash_object_dispatch diff --git a/python/dask_cudf/dask_cudf/tests/test_dispatch.py b/python/dask_cudf/dask_cudf/tests/test_dispatch.py new file mode 100644 index 00000000000..6bf4b956404 --- /dev/null +++ b/python/dask_cudf/dask_cudf/tests/test_dispatch.py @@ -0,0 +1,16 @@ +import pandas as pd + +from dask.dataframe.methods import is_categorical_dtype + +import cudf + + +def test_is_categorical_dispatch(): + assert is_categorical_dtype(pd.CategoricalDtype([1, 2, 3])) + assert is_categorical_dtype(cudf.CategoricalDtype([1, 2, 3])) + + assert is_categorical_dtype(cudf.Series([1, 2, 3], dtype="category")) + assert is_categorical_dtype(pd.Series([1, 2, 3], dtype="category")) + + assert is_categorical_dtype(pd.Index([1, 2, 3], dtype="category")) + assert is_categorical_dtype(cudf.Index([1, 2, 3], dtype="category")) diff --git a/python/dask_cudf/dask_cudf/tests/test_onehot.py b/python/dask_cudf/dask_cudf/tests/test_onehot.py index d5fb9e9a110..a9d88b5203c 100644 --- a/python/dask_cudf/dask_cudf/tests/test_onehot.py +++ b/python/dask_cudf/dask_cudf/tests/test_onehot.py @@ -3,10 +3,10 @@ 
from dask import dataframe as dd -import dask_cudf - import cudf +import dask_cudf + def test_get_dummies_cat(): df = pd.DataFrame({"C": [], "A": []}) @@ -101,3 +101,22 @@ def test_get_dummies_large(): dd.get_dummies(gddf).compute(), check_dtype=False, ) + + +def test_get_dummies_categorical(): + # https://github.com/rapidsai/cudf/issues/7111 + gdf = cudf.DataFrame({"A": ["a", "b", "b"], "B": [1, 2, 3]}) + pdf = gdf.to_pandas() + + gddf = dask_cudf.from_cudf(gdf, npartitions=1) + gddf = gddf.categorize(columns=["B"]) + + pddf = dd.from_pandas(pdf, npartitions=1) + pddf = pddf.categorize(columns=["B"]) + + expect = dd.get_dummies(pddf, columns=["B"]) + got = dd.get_dummies(gddf, columns=["B"]) + + dd.assert_eq( + expect, got, + )
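To close the loop, here is a sketch of the behavior the new `is_categorical_dtype` dispatch enables end to end (assuming dask, dask_cudf, and this patch are installed): dask's `get_dummies` can now recognize cudf-backed categorical columns, which is exactly the scenario exercised by `test_get_dummies_categorical` above.

```python
import cudf
import dask_cudf
from dask import dataframe as dd

gdf = cudf.DataFrame({"A": ["a", "b", "b"], "B": [1, 2, 3]})
gddf = dask_cudf.from_cudf(gdf, npartitions=1).categorize(columns=["B"])

# Previously this failed because dask's is_categorical_dtype did not
# understand cudf dtypes; with the dispatch registered it works.
print(dd.get_dummies(gddf, columns=["B"]).compute())
```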