diff --git a/CHANGELOG.md b/CHANGELOG.md index a5c3a013e1f..776cb25b2c3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,11 +14,13 @@ - PR #3667 Define and implement round-robin partition API. - PR #3690 Add bools_to_mask - PR #3683 Added support for multiple delimiters in `nvtext.token_count()` +- PR #3792 Adding is_nan and is_notnan ## Improvements - PR #3292 Port NVStrings regex contains function - PR #3409 Port NVStrings regex replace function +- PR #3417 Port NVStrings regex findall function - PR #3351 Add warning when filepath resolves to multiple files in cudf readers - PR #3370 Port NVStrings strip functions - PR #3453 Port NVStrings IPv4 convert functions to cudf strings column @@ -45,6 +47,7 @@ - PR #3640 Enable memory_usage in dask_cudf (also adds pd.Index from_pandas) - PR #3654 Update Jitify submodule ref to include gcc-8 fix - PR #3639 Define and implement `nans_to_nulls` +- PR #3561 Rework contains implementation in search - PR #3616 Add aggregation infrastructure for argmax/argmin. 
- PR #3699 Stringify libcudacxx headers for binary op JIT - PR #3697 Improve column insert performance for wide frames @@ -53,6 +56,7 @@ - PR #3657 Define and implement compiled binops for string column comparisons - PR #3520 Change read_parquet defaults and add warnings - PR #3780 Java APIs for selecting a GPU +- PR #3805 Avoid CuPy 7.1.0 for now ## Bug Fixes @@ -91,6 +95,8 @@ - PR #3783 Bind cuDF operators to Dask Dataframe - PR #3775 Fix segfault when reading compressed CSV files larger than 4GB - PR #3803 Keep name when unpickling Index objects +- PR #3804 Fix cuda crash in AVRO reader +- PR #3766 Remove references to cudf::type_id::CATEGORY from IO code # cuDF 0.11.0 (11 Dec 2019) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 949b17134c6..06fb6f26e27 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -60,7 +60,7 @@ source activate gdf conda install "rmm=$MINOR_VERSION.*" "cudatoolkit=$CUDA_REL" \ "dask>=2.1.0" "distributed>=2.1.0" "numpy>=1.16" "double-conversion" \ "rapidjson" "flatbuffers" "boost-cpp" "fsspec>=0.3.3" "dlpack" \ - "feather-format" "cupy>=6.6.0,<8.0.0a0" "arrow-cpp=0.15.0" "pyarrow=0.15.0" \ + "feather-format" "cupy>=6.6.0,<8.0.0a0,!=7.1.0" "arrow-cpp=0.15.0" "pyarrow=0.15.0" \ "fastavro>=0.22.0" "pandas>=0.25,<0.26" "hypothesis" "s3fs" "gcsfs" \ "boto3" "moto" "httpretty" "streamz" diff --git a/conda/environments/cudf_dev_cuda10.0.yml b/conda/environments/cudf_dev_cuda10.0.yml index 29ceaef7ca9..fd621f4895c 100644 --- a/conda/environments/cudf_dev_cuda10.0.yml +++ b/conda/environments/cudf_dev_cuda10.0.yml @@ -6,7 +6,7 @@ channels: - conda-forge - defaults dependencies: - - cupy>=6.6.0,<8.0.0a0 + - cupy>=6.6.0,<8.0.0a0,!=7.1.0 - rmm=0.12.* - cmake>=3.12 - cmake_setuptools>=0.1.3 diff --git a/conda/environments/cudf_dev_cuda10.1.yml b/conda/environments/cudf_dev_cuda10.1.yml index d6528dd1206..174226886d7 100644 --- a/conda/environments/cudf_dev_cuda10.1.yml +++ b/conda/environments/cudf_dev_cuda10.1.yml @@ -6,7 +6,7 @@ channels: - 
conda-forge - defaults dependencies: - - cupy>=6.6.0,<8.0.0a0 + - cupy>=6.6.0,<8.0.0a0,!=7.1.0 - rmm=0.12.* - cmake>=3.12 - cmake_setuptools>=0.1.3 diff --git a/conda/environments/cudf_dev_cuda9.2.yml b/conda/environments/cudf_dev_cuda9.2.yml index b69b2b0c8f8..990cb3e48ea 100644 --- a/conda/environments/cudf_dev_cuda9.2.yml +++ b/conda/environments/cudf_dev_cuda9.2.yml @@ -6,7 +6,7 @@ channels: - conda-forge - defaults dependencies: - - cupy>=6.6.0,<8.0.0a0 + - cupy>=6.6.0,<8.0.0a0,!=7.1.0 - rmm=0.12.* - cmake>=3.12 - cmake_setuptools>=0.1.3 diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index f8bcba787f9..f6dcb90e901 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -31,7 +31,7 @@ requirements: run: - python - pandas>=0.24.2,<0.25 - - cupy >=6.6.0,<8.0.0a0 + - cupy >=6.6.0,<8.0.0a0,!=7.1.0 - numba >=0.46.0 - pyarrow 0.15.0.* - fastavro >=0.22.0 diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 23a7a7af2ca..836ea404e9c 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -473,6 +473,7 @@ add_library(cudf src/merge/legacy/merge.cu src/unary/cast_ops.cu src/unary/null_ops.cu + src/unary/nan_ops.cu src/unary/legacy/math_ops.cu src/unary/legacy/cast_ops.cu src/unary/legacy/null_ops.cu @@ -576,6 +577,7 @@ add_library(cudf src/strings/copying/copying.cu src/strings/extract.cu src/strings/find.cu + src/strings/findall.cu src/strings/find_multiple.cu src/strings/filling/fill.cu src/strings/padding.cu diff --git a/cpp/include/cudf/strings/findall.hpp b/cpp/include/cudf/strings/findall.hpp new file mode 100644 index 00000000000..fce820353ba --- /dev/null +++ b/cpp/include/cudf/strings/findall.hpp @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +namespace cudf +{ +namespace strings +{ + +/** + * @brief Returns a table of strings columns for each matching occurrence of the + * regex pattern within each string. + * + * The number of output columns is determined by the string with the most + * matches. + * + * Any null string entries return corresponding null output column entries. + * + * @param strings Strings instance for this operation. + * @param pattern Regex pattern to match within each string. + * @param mr Resource for allocating device memory. + * @return New table of strings columns. + */ +std::unique_ptr findall_re( strings_column_view const& strings, + std::string const& pattern, + rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource()); + +} // namespace strings +} // namespace cudf diff --git a/cpp/include/cudf/unary.hpp b/cpp/include/cudf/unary.hpp index e16bd8acb75..b4107ec8b60 100644 --- a/cpp/include/cudf/unary.hpp +++ b/cpp/include/cudf/unary.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019, NVIDIA CORPORATION. + * Copyright (c) 2018-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -93,5 +93,39 @@ std::unique_ptr is_valid(cudf::column_view const& input, std::unique_ptr cast(column_view const& input, data_type out_type, rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource()); +/** + * @brief Creates a column of `BOOL8` elements indicating the presence of `NaN` values + * in a column of floating point values. + * The output element at row `i` is `true` if the element in `input` at row i is `NAN`, else `false` + * + * @throws cudf::logic_error if `input` is a non-floating point type + * + * @param input A column of floating-point elements + * @param mr Optional, The resource to use for allocating the device memory in the returned column. + * + * @returns unique_ptr A non-nullable column of `BOOL8` elements with `true` + * representing `NAN` values + */ +std::unique_ptr is_nan(cudf::column_view const& input, + rmm::mr::device_memory_resource* mr = + rmm::mr::get_default_resource()); + +/** + * @brief Creates a column of `BOOL8` elements indicating the absence of `NaN` values + * in a column of floating point values. + * The output element at row `i` is `false` if the element in `input` at row i is `NAN`, else `true` + * + * @throws cudf::logic_error if `input` is a non-floating point type + * + * @param input A column of floating-point elements + * @param mr Optional, The resource to use for allocating the device memory in the returned column. 
+ * + * @returns unique_ptr A non-nullable column of `BOOL8` elements with `false` + * representing `NAN` values + */ +std::unique_ptr is_not_nan(cudf::column_view const& input, + rmm::mr::device_memory_resource* mr = + rmm::mr::get_default_resource()); + } // namespace experimental } // namespace cudf diff --git a/cpp/src/io/avro/legacy/avro_reader_impl.cu b/cpp/src/io/avro/legacy/avro_reader_impl.cu index 1df6bef2b98..a0651af505e 100644 --- a/cpp/src/io/avro/legacy/avro_reader_impl.cu +++ b/cpp/src/io/avro/legacy/avro_reader_impl.cu @@ -452,7 +452,7 @@ void reader::Impl::decode_data( static_cast(block_list.data()), schema_desc.device_ptr(), reinterpret_cast(global_dictionary.device_ptr()), static_cast(block_data.data()), - static_cast(block_list.size()), + static_cast(md_->block_list.size()), static_cast(schema_desc.size()), static_cast(total_dictionary_entries), md_->num_rows, md_->skip_rows, min_row_data_size, 0)); diff --git a/cpp/src/io/avro/reader_impl.cu b/cpp/src/io/avro/reader_impl.cu index 6768659aec2..7c3dd94573f 100644 --- a/cpp/src/io/avro/reader_impl.cu +++ b/cpp/src/io/avro/reader_impl.cu @@ -329,7 +329,7 @@ void reader::impl::decode_data( static_cast(block_list.data()), schema_desc.device_ptr(), reinterpret_cast(global_dictionary.device_ptr()), static_cast(block_data.data()), - static_cast(block_list.size()), + static_cast(_metadata->block_list.size()), static_cast(schema_desc.size()), static_cast(total_dictionary_entries), _metadata->num_rows, _metadata->skip_rows, min_row_data_size, stream)); diff --git a/cpp/src/io/csv/csv_gpu.cu b/cpp/src/io/csv/csv_gpu.cu index 27d43af0cf4..ca1f92c257b 100644 --- a/cpp/src/io/csv/csv_gpu.cu +++ b/cpp/src/io/csv/csv_gpu.cu @@ -679,8 +679,7 @@ __global__ void convertCsvToGdf(const char *raw_csv, const ParseOptions opts, // Modify start & end to ignore whitespace and quotechars long tempPos = pos - 1; - if (!is_na && dtype[actual_col].id() != cudf::type_id::CATEGORY && - dtype[actual_col].id() != 
cudf::type_id::STRING) { + if (!is_na && dtype[actual_col].id() != cudf::type_id::STRING) { trim_field_start_end(raw_csv, &start, &tempPos, opts.quotechar); } diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index 4a9221a788b..480c0a2fed0 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -128,7 +128,7 @@ data_type convertStringToDtype(const std::string &dtype) { return data_type(cudf::type_id::TIMESTAMP_MICROSECONDS); if (dtype == "timestamp[ns]") return data_type(cudf::type_id::TIMESTAMP_NANOSECONDS); - if (dtype == "category") return data_type(cudf::type_id::CATEGORY); + if (dtype == "category") return data_type(cudf::type_id::INT32); if (dtype == "date32") return data_type(cudf::type_id::TIMESTAMP_DAYS); if (dtype == "bool" || dtype == "boolean") return data_type(cudf::type_id::BOOL8); diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 876c44e6403..33e0724f7fe 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -91,8 +91,6 @@ constexpr orc::TypeKind to_orc_type(cudf::type_id id) { case cudf::type_id::TIMESTAMP_MILLISECONDS: case cudf::type_id::TIMESTAMP_NANOSECONDS: return TypeKind::TIMESTAMP; - case cudf::type_id::CATEGORY: - return TypeKind::INT; case cudf::type_id::STRING: return TypeKind::STRING; default: diff --git a/cpp/src/io/parquet/reader_impl.cu b/cpp/src/io/parquet/reader_impl.cu index a70e95ee69a..85cec512cf3 100644 --- a/cpp/src/io/parquet/reader_impl.cu +++ b/cpp/src/io/parquet/reader_impl.cu @@ -97,7 +97,7 @@ constexpr type_id to_type_id(parquet::Type physical, case parquet::BYTE_ARRAY: case parquet::FIXED_LEN_BYTE_ARRAY: // Can be mapped to GDF_CATEGORY (32-bit hash) or GDF_STRING (nvstring) - return strings_to_categorical ? type_id::CATEGORY : type_id::STRING; + return strings_to_categorical ? type_id::INT32 : type_id::STRING; case parquet::INT96: return (timestamp_type_id != type_id::EMPTY) ? 
timestamp_type_id @@ -146,7 +146,7 @@ std::tuple conversion_info(type_id column_type_id, type_width = 1; // I32 -> I8 } else if (column_type_id == type_id::INT16) { type_width = 2; // I32 -> I16 - } else if (column_type_id == type_id::CATEGORY) { + } else if (column_type_id == type_id::INT32) { type_width = 4; // str -> hash32 } else if (is_timestamp(data_type{column_type_id})) { clock_rate = to_clockrate(timestamp_type_id); diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 395c3f499a9..a1fe99f8c9d 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -128,7 +128,6 @@ class parquet_column_view { _stats_dtype = statistics_dtype::dtype_int16; break; case cudf::type_id::INT32: - case cudf::type_id::CATEGORY: _physical_type = Type::INT32; _stats_dtype = statistics_dtype::dtype_int32; break; diff --git a/cpp/src/search/search.cu b/cpp/src/search/search.cu index f4e80ef076e..78f14790d4f 100644 --- a/cpp/src/search/search.cu +++ b/cpp/src/search/search.cu @@ -21,7 +21,9 @@ #include #include #include + #include +#include #include #include @@ -122,72 +124,28 @@ std::unique_ptr search_ordered(table_view const& t, return result; } -template -struct compare_with_value{ - compare_with_value(column_device_view c, Element val, bool val_is_valid, bool nulls_are_equal) - - : col{c}, value{val}, val_is_valid{val_is_valid}, nulls_are_equal{nulls_are_equal} {} - - __device__ bool operator()(size_type i) noexcept { - if (nullable) { - bool const col_is_null{col.nullable() and col.is_null(i)}; - if (col_is_null and not val_is_valid) - return nulls_are_equal; - else if (col_is_null == val_is_valid) - return false; - } - - return equality_compare(col.element(i), value); - } - - column_device_view col; - Element value; - bool val_is_valid; - bool nulls_are_equal; -}; - -template -void populate_element(scalar const& value, Element &e) { - using ScalarType = cudf::experimental::scalar_type_t; - auto s1 = 
static_cast(&value); - - e = s1->value(); -} - -template <> -void populate_element(scalar const& value, string_view &e) { - using ScalarType = cudf::experimental::scalar_type_t; - auto s1 = static_cast(&value); - - e = string_view{s1->data(), s1->size()}; -} - struct contains_scalar_dispatch { template bool operator()(column_view const& col, scalar const& value, - cudaStream_t stream, - rmm::mr::device_memory_resource *mr) { + cudaStream_t stream, rmm::mr::device_memory_resource *mr) { + using ScalarType = cudf::experimental::scalar_type_t; auto d_col = column_device_view::create(col, stream); - auto data_it = thrust::make_counting_iterator(0); - - bool element_is_valid{value.is_valid()}; - Element element; - - populate_element(value, element); + auto s = static_cast(&value); if (col.has_nulls()) { - auto eq_op = compare_with_value(*d_col, element, element_is_valid, true); + auto found_iter = thrust::find(rmm::exec_policy(stream)->on(stream), + d_col->pair_begin(), + d_col->pair_end(), + thrust::make_pair(s->value(), true)); - return thrust::any_of(rmm::exec_policy(stream)->on(stream), - data_it, data_it + col.size(), - eq_op); + return found_iter != d_col->pair_end(); } else { - auto eq_op = compare_with_value(*d_col, element, element_is_valid, true); + auto found_iter = thrust::find(rmm::exec_policy(stream)->on(stream), + d_col->begin(), + d_col->end(), s->value()); - return thrust::any_of(rmm::exec_policy(stream)->on(stream), - data_it, data_it + col.size(), - eq_op); + return found_iter != d_col->end(); } } }; diff --git a/cpp/src/strings/findall.cu b/cpp/src/strings/findall.cu new file mode 100644 index 00000000000..da0aba88f6e --- /dev/null +++ b/cpp/src/strings/findall.cu @@ -0,0 +1,205 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + + +namespace cudf +{ +namespace strings +{ +namespace detail +{ + +using string_index_pair = thrust::pair; +using findall_result = thrust::pair; + +namespace +{ + +/** + * @brief This functor handles extracting matched strings by applying the compiled regex pattern + * and creating string_index_pairs for all the substrings. + */ +template +struct findall_fn +{ + column_device_view const d_strings; + reprog_device prog; + size_type column_index; + size_type const* d_counts; + + findall_fn( column_device_view const& d_strings, + reprog_device& prog, + size_type column_index = -1, + size_type const* d_counts = nullptr ) + : d_strings(d_strings), prog(prog), column_index(column_index), d_counts(d_counts) {} + + // this will count columns as well as locate a specific string for a column + __device__ findall_result findall(size_type idx) + { + string_index_pair result{nullptr,0}; + if( d_strings.is_null(idx) || + (d_counts && (column_index >= d_counts[idx])) ) + return findall_result{0,result}; + u_char data1[stack_size]; + u_char data2[stack_size]; + prog.set_stack_mem(data1,data2); + string_view d_str = d_strings.element(idx); + auto nchars = d_str.length(); + size_type spos = 0; + size_type epos = nchars; + size_type column_count = 0; + while( spos <= nchars ) + { + if( prog.find(idx,d_str,spos,epos) <=0 ) + break; // no more matches found + if( column_count == column_index ) + break; // found our column + spos = epos > spos ? 
epos : spos + 1; + epos = nchars; + ++column_count; + } + if( spos <= epos ) + { + spos = d_str.byte_offset(spos); // convert + epos = d_str.byte_offset(epos); // to bytes + result = string_index_pair{d_str.data() + spos, (epos-spos)}; + } + // return the strings location and the column count + return findall_result{column_count,result}; + } + + __device__ string_index_pair operator()(size_type idx) + { + // this one only cares about the string + return findall(idx).second; + } +}; + +template +struct findall_count_fn : public findall_fn +{ + findall_count_fn( column_device_view const& strings, + reprog_device& prog) + : findall_fn{strings,prog} {} + + __device__ size_type operator()(size_type idx) + { + // this one only cares about the column count + return findall_fn::findall(idx).first; + } +}; + + +} // namespace + +// +std::unique_ptr findall_re( strings_column_view const& strings, + std::string const& pattern, + rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource(), + cudaStream_t stream = 0) +{ + auto strings_count = strings.size(); + auto strings_column = column_device_view::create(strings.parent(),stream); + auto d_strings = *strings_column; + + auto d_flags = detail::get_character_flags_table(); + // compile regex into device object + auto prog = reprog_device::create(pattern,d_flags,strings_count,stream); + auto d_prog = *prog; + auto execpol = rmm::exec_policy(stream); + int regex_insts = prog->insts_counts(); + + rmm::device_vector find_counts(strings_count); + auto d_find_counts = find_counts.data().get(); + + if( (regex_insts > MAX_STACK_INSTS) || (regex_insts <= RX_SMALL_INSTS) ) + thrust::transform(execpol->on(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(strings_count), + d_find_counts, findall_count_fn{d_strings,d_prog}); + else if( regex_insts <= RX_MEDIUM_INSTS ) + thrust::transform(execpol->on(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(strings_count), + 
d_find_counts, findall_count_fn{d_strings,d_prog}); + else + thrust::transform(execpol->on(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(strings_count), + d_find_counts, findall_count_fn{d_strings,d_prog}); + + std::vector> results; + + size_type columns = *thrust::max_element(execpol->on(stream), find_counts.begin(), find_counts.end() ); + // boundary case: if no columns, return all nulls column (issue #119) + if( columns==0 ) + results.push_back(std::make_unique( data_type{STRING}, strings_count, + rmm::device_buffer{0,stream,mr}, // no data + create_null_mask(strings_count,ALL_NULL,stream,mr), strings_count )); + + for( int32_t column_index=0; column_index < columns; ++column_index ) + { + rmm::device_vector indices(strings_count); + string_index_pair* d_indices = indices.data().get(); + + if( (regex_insts > MAX_STACK_INSTS) || (regex_insts <= RX_SMALL_INSTS) ) + thrust::transform(execpol->on(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(strings_count), + d_indices, findall_fn{d_strings, d_prog, column_index, d_find_counts}); + else if( regex_insts <= RX_MEDIUM_INSTS ) + thrust::transform(execpol->on(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(strings_count), + d_indices, findall_fn{d_strings, d_prog, column_index, d_find_counts}); + else + thrust::transform(execpol->on(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(strings_count), + d_indices, findall_fn{d_strings, d_prog, column_index, d_find_counts}); + // + auto column = make_strings_column(indices,stream,mr); + results.emplace_back(std::move(column)); + } + return std::make_unique(std::move(results)); +} + +} // namespace detail + +// external API + +std::unique_ptr findall_re( strings_column_view const& strings, + std::string const& pattern, + rmm::mr::device_memory_resource* mr) +{ + return detail::findall_re(strings, pattern, mr); +} + +} // namespace strings +} // 
namespace cudf diff --git a/cpp/src/unary/nan_ops.cu b/cpp/src/unary/nan_ops.cu new file mode 100644 index 00000000000..8e933ede7bf --- /dev/null +++ b/cpp/src/unary/nan_ops.cu @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include + +namespace cudf { +namespace experimental { + +namespace detail{ + +struct nan_dispatcher { + template + std::enable_if_t::value, std::unique_ptr> + operator()(cudf::column_view const& input, + Predicate predicate, + rmm::mr::device_memory_resource* mr, + cudaStream_t stream) { + + auto input_device_view = column_device_view::create(input); + + if (input.has_nulls()) { + auto input_pair_iterator = make_pair_iterator(*input_device_view); + return true_if(input_pair_iterator, + input_pair_iterator + input.size(), + input.size(), predicate, mr); + } else { + auto input_pair_iterator = make_pair_iterator(*input_device_view); + return true_if(input_pair_iterator, + input_pair_iterator + input.size(), + input.size(), predicate, mr); + } + } + + template + std::enable_if_t::value, std::unique_ptr> + operator()(cudf::column_view const& input, + Predicate predicate, + rmm::mr::device_memory_resource* mr, + cudaStream_t stream) { + + CUDF_FAIL("NAN is not supported in a Non-floating point type column"); + } +}; + +/** + * @copydoc cudf::experimental::is_nan + * + * @param[in] stream Optional CUDA stream 
on which to execute kernels + */ +std::unique_ptr is_nan(cudf::column_view const& input, + rmm::mr::device_memory_resource* mr= + rmm::mr::get_default_resource(), + cudaStream_t stream = 0) { + + auto predicate = [] __device__ (auto element_validity_pair) { + return element_validity_pair.second and std::isnan(element_validity_pair.first); + }; + + return cudf::experimental::type_dispatcher(input.type(), nan_dispatcher{}, + input, predicate, mr, stream); +} + +/** + * @copydoc cudf::experimental::is_not_nan + * + * @param[in] stream Optional CUDA stream on which to execute kernels + */ +std::unique_ptr is_not_nan(cudf::column_view const& input, + rmm::mr::device_memory_resource* mr= + rmm::mr::get_default_resource(), + cudaStream_t stream = 0) { + + auto predicate = [] __device__ (auto element_validity_pair) { + return !element_validity_pair.second or !std::isnan(element_validity_pair.first); + }; + + return cudf::experimental::type_dispatcher(input.type(), nan_dispatcher{}, + input, predicate, mr, stream); +} + +}// namespace detail + +std::unique_ptr is_nan(cudf::column_view const& input, + rmm::mr::device_memory_resource* mr) { + + return detail::is_nan(input, mr); +} + +std::unique_ptr is_not_nan(cudf::column_view const& input, + rmm::mr::device_memory_resource* mr) { + + return detail::is_not_nan(input, mr); +} + +}// namespace experimental +}// namespace cudf + diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 63d103ca044..d313e90bfdb 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -770,6 +770,7 @@ set(STRINGS_TEST_SRC "${CMAKE_CURRENT_SOURCE_DIR}/strings/datetime_tests.cu" "${CMAKE_CURRENT_SOURCE_DIR}/strings/extract_tests.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/strings/fill_tests.cu" + "${CMAKE_CURRENT_SOURCE_DIR}/strings/findall_tests.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/strings/find_tests.cu" "${CMAKE_CURRENT_SOURCE_DIR}/strings/find_multiple_tests.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/strings/floats_tests.cu" diff --git 
a/cpp/tests/strings/findall_tests.cpp b/cpp/tests/strings/findall_tests.cpp new file mode 100644 index 00000000000..71e4a3b1bcd --- /dev/null +++ b/cpp/tests/strings/findall_tests.cpp @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + + +struct StringsFindallTests : public cudf::test::BaseFixture {}; + + +TEST_F(StringsFindallTests, FindallTest) +{ + std::vector h_strings{ "First Last", "Joe Schmoe", "John Smith", "Jane Smith", "Beyonce", "Sting", nullptr, "" }; + + cudf::test::strings_column_wrapper strings( h_strings.begin(), h_strings.end(), + thrust::make_transform_iterator( h_strings.begin(), [] (auto str) { return str!=nullptr; })); + auto strings_view = cudf::strings_column_view(strings); + + std::vector h_expecteds{ "First", "Joe", "John", "Jane", "Beyonce", "Sting", nullptr, nullptr, + "Last", "Schmoe", "Smith", "Smith", nullptr, nullptr, nullptr, nullptr }; + + std::string pattern = "(\\w+)"; + auto results = cudf::strings::findall_re(strings_view,pattern); + EXPECT_TRUE( results->num_columns()==2 ); + + cudf::test::strings_column_wrapper expected1( h_expecteds.data(), h_expecteds.data() + h_strings.size(), + thrust::make_transform_iterator( h_expecteds.begin(), [] (auto str) { return str!=nullptr; })); + cudf::test::strings_column_wrapper expected2( h_expecteds.data()+h_strings.size(), 
h_expecteds.data() + h_expecteds.size(), + thrust::make_transform_iterator( h_expecteds.data()+h_strings.size(), [] (auto str) { return str!=nullptr; })); + std::vector> columns; + columns.push_back( expected1.release() ); + columns.push_back( expected2.release() ); + cudf::experimental::table expected(std::move(columns)); + cudf::test::expect_tables_equal(*results,expected); +} + +TEST_F(StringsFindallTests, MediumRegex) +{ + // This results in 15 regex instructions and falls in the 'medium' range. + std::string medium_regex = "(\\w+) (\\w+) (\\d+)"; + + std::vector h_strings{ "first words 1234 and just numbers 9876", "neither" }; + cudf::test::strings_column_wrapper strings( h_strings.begin(), h_strings.end(), + thrust::make_transform_iterator( h_strings.begin(), [] (auto str) { return str!=nullptr; })); + + auto strings_view = cudf::strings_column_view(strings); + auto results = cudf::strings::findall_re(strings_view, medium_regex); + EXPECT_TRUE( results->num_columns()==2 ); + + std::vector h_expected1{"first words 1234", nullptr }; + cudf::test::strings_column_wrapper expected1( h_expected1.begin(), h_expected1.end(), + thrust::make_transform_iterator( h_expected1.begin(), [] (auto str) { return str!=nullptr; })); + cudf::test::expect_columns_equal(results->get_column(0),expected1); + + std::vector h_expected2{"just numbers 9876", nullptr }; + cudf::test::strings_column_wrapper expected2( h_expected2.begin(), h_expected2.end(), + thrust::make_transform_iterator( h_expected2.begin(), [] (auto str) { return str!=nullptr; })); + cudf::test::expect_columns_equal(results->get_column(1),expected2); +} + +TEST_F(StringsFindallTests, LargeRegex) +{ + // This results in 115 regex instructions and falls in the 'large' range. 
+ std::string large_regex = "hello @abc @def world The quick brown @fox jumps over the lazy @dog hello http://www.world.com I'm here @home zzzz"; + + std::vector h_strings{ + "hello @abc @def world The quick brown @fox jumps over the lazy @dog hello http://www.world.com I'm here @home zzzz", + "12345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890", + "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz" + }; + cudf::test::strings_column_wrapper strings( h_strings.begin(), h_strings.end(), + thrust::make_transform_iterator( h_strings.begin(), [] (auto str) { return str!=nullptr; })); + + auto strings_view = cudf::strings_column_view(strings); + auto results = cudf::strings::findall_re(strings_view, large_regex); + EXPECT_TRUE( results->num_columns()==1 ); + + std::vector h_expected{large_regex.c_str(), nullptr, nullptr }; + cudf::test::strings_column_wrapper expected( h_expected.begin(), h_expected.end(), + thrust::make_transform_iterator( h_expected.begin(), [] (auto str) { return str!=nullptr; })); + cudf::test::expect_columns_equal(results->get_column(0),expected); +} diff --git a/cpp/tests/unary/unary_ops_test.cu b/cpp/tests/unary/unary_ops_test.cu index 7095ce3feca..72a668df963 100644 --- a/cpp/tests/unary/unary_ops_test.cu +++ b/cpp/tests/unary/unary_ops_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -709,3 +709,98 @@ TYPED_TEST(CastFromTimestamps, WithNulls) { validate_cast_result(timestamps_us_exp, *timestamps_us_got); validate_cast_result(timestamps_ns_exp, *timestamps_ns_got); } + +template +struct IsNAN : public cudf::test::BaseFixture {}; + +TYPED_TEST_CASE(IsNAN, cudf::test::FloatingPointTypes); + +TYPED_TEST(IsNAN, AllValid) { + using T = TypeParam; + + cudf::test::fixed_width_column_wrapper col {{1, 2, NAN, 4, NAN, 6, 7}}; + cudf::test::fixed_width_column_wrapper expected = {false, false, true, false, true, false, false}; + + std::unique_ptr got = cudf::experimental::is_nan(col); + + cudf::test::expect_columns_equal(expected, got->view()); +} + +TYPED_TEST(IsNAN, WithNull) { + using T = TypeParam; + + // The last NAN is null + cudf::test::fixed_width_column_wrapper col {{1, 2, NAN, 4, NAN, 6, 7}, {1, 0, 1, 1, 0, 1, 1}}; + cudf::test::fixed_width_column_wrapper expected = {false, false, true, false, false, false, false}; + + std::unique_ptr got = cudf::experimental::is_nan(col); + + cudf::test::expect_columns_equal(expected, got->view()); +} + +TYPED_TEST(IsNAN, EmptyColumn) { + using T = TypeParam; + + cudf::test::fixed_width_column_wrapper col {}; + cudf::test::fixed_width_column_wrapper expected = {}; + + std::unique_ptr got = cudf::experimental::is_nan(col); + + cudf::test::expect_columns_equal(expected, got->view()); +} + +TYPED_TEST(IsNAN, NonFloatingColumn) { + using T = TypeParam; + + cudf::test::fixed_width_column_wrapper col {{1, 2, 5, 3, 5, 6, 7}, {1, 0, 1, 1, 0, 1, 1}}; + + EXPECT_THROW(std::unique_ptr got = cudf::experimental::is_nan(col), cudf::logic_error); +} + +template +struct IsNotNAN : public cudf::test::BaseFixture {}; + +TYPED_TEST_CASE(IsNotNAN, cudf::test::FloatingPointTypes); + +TYPED_TEST(IsNotNAN, AllValid) { + using T = TypeParam; + + cudf::test::fixed_width_column_wrapper col {{1, 2, NAN, 4, NAN, 6, 7}}; + cudf::test::fixed_width_column_wrapper expected = {true, true, false, true, false, true, true}; + + std::unique_ptr 
got = cudf::experimental::is_not_nan(col); + + cudf::test::expect_columns_equal(expected, got->view()); +} + +TYPED_TEST(IsNotNAN, WithNull) { + using T = TypeParam; + + // The last NAN is null + cudf::test::fixed_width_column_wrapper col {{1, 2, NAN, 4, NAN, 6, 7}, {1, 0, 1, 1, 0, 1, 1}}; + cudf::test::fixed_width_column_wrapper expected = {true, true, false, true, true, true, true}; + + std::unique_ptr got = cudf::experimental::is_not_nan(col); + + cudf::test::expect_columns_equal(expected, got->view()); +} + +TYPED_TEST(IsNotNAN, EmptyColumn) { + using T = TypeParam; + + cudf::test::fixed_width_column_wrapper col {}; + cudf::test::fixed_width_column_wrapper expected = {}; + + std::unique_ptr got = cudf::experimental::is_not_nan(col); + + cudf::test::expect_columns_equal(expected, got->view()); +} + +TYPED_TEST(IsNotNAN, NonFloatingColumn) { + using T = TypeParam; + + cudf::test::fixed_width_column_wrapper col {{1, 2, 5, 3, 5, 6, 7}, {1, 0, 1, 1, 0, 1, 1}}; + + EXPECT_THROW(std::unique_ptr got = cudf::experimental::is_not_nan(col), cudf::logic_error); +} + diff --git a/python/cudf/cudf/tests/test_avro.py b/python/cudf/cudf/tests/test_avro.py index 6358eb994cb..4a8a8d1bbdb 100644 --- a/python/cudf/cudf/tests/test_avro.py +++ b/python/cudf/cudf/tests/test_avro.py @@ -44,10 +44,6 @@ def _make_path_or_buf(src): yield _make_path_or_buf -@pytest.mark.skip( - reason="This test segfaults occasionally. Skipping until " - "we can figure out why." -) @pytest.mark.filterwarnings("ignore:Using CPU") @pytest.mark.parametrize("engine", ["cudf"]) @pytest.mark.parametrize("inputfile, columns", [("example.avro", None)])