From ee78a916bd2410abeda8fdbad496840ba56b0fdb Mon Sep 17 00:00:00 2001 From: jeanp413 Date: Fri, 14 Feb 2020 10:00:30 -0500 Subject: [PATCH 01/79] Make fill/copy_range no-op on empty columns --- cpp/src/column/column_view.cpp | 3 +-- cpp/src/copying/copy_range.cu | 28 +++++++++----------------- cpp/src/filling/fill.cu | 10 ++++----- cpp/tests/copying/copy_range_tests.cpp | 17 ++++++++++++++-- cpp/tests/filling/fill_tests.cu | 18 +++++++++++++---- 5 files changed, 43 insertions(+), 33 deletions(-) diff --git a/cpp/src/column/column_view.cpp b/cpp/src/column/column_view.cpp index b7cad8b5013..3aff211dbe6 100644 --- a/cpp/src/column/column_view.cpp +++ b/cpp/src/column/column_view.cpp @@ -67,8 +67,7 @@ size_type column_view_base::null_count() const { } size_type column_view_base::null_count(size_type begin, size_type end) const { - CUDF_EXPECTS((begin <= end) && (begin >= 0) && (begin < size()) && - (end <= size()), + CUDF_EXPECTS((begin >= 0) && (end <= size()) && (begin <= end), "Range is out of bounds."); return (null_count() == 0) ? 0 : cudf::count_unset_bits(null_mask(), offset() + begin, offset() + end); diff --git a/cpp/src/copying/copy_range.cu b/cpp/src/copying/copy_range.cu index b87ca06880a..d8f4ce6f79a 100644 --- a/cpp/src/copying/copy_range.cu +++ b/cpp/src/copying/copy_range.cu @@ -162,16 +162,11 @@ void copy_range(column_view const& source, mutable_column_view& target, cudaStream_t stream) { CUDF_EXPECTS(cudf::is_fixed_width(target.type()) == true, "In-place copy_range does not support variable-sized types."); - CUDF_EXPECTS((source_begin <= source_end) && - (source_begin >= 0) && - (source_begin < source.size()) && - (source_end <= source.size()) && - (target_begin >= 0) && - (target_begin < target.size()) && - (target_begin + (source_end - source_begin) <= - target.size()) && - // overflow - (target_begin + (source_end - source_begin) >= target_begin), + CUDF_EXPECTS((source_begin >= 0) && + (source_end <= source.size()) && + (source_begin <= source_end) && + (target_begin >= 0) && + (target_begin <= target.size() - (source_end - source_begin)), "Range is out of bounds."); CUDF_EXPECTS(target.type() == source.type(), "Data type mismatch."); CUDF_EXPECTS((target.nullable() == true) || (source.has_nulls() == false), @@ -192,15 +187,10 @@ std::unique_ptr copy_range(column_view const& source, rmm::mr::device_memory_resource* mr, cudaStream_t stream) { CUDF_EXPECTS((source_begin >= 0) && - (source_begin <= source_end) && - (source_begin < source.size()) && - (source_end <= source.size()) && - (target_begin >= 0) && - (target_begin < target.size()) && - (target_begin + (source_end - source_begin) <= - target.size()) && - // overflow - (target_begin + (source_end - source_begin) >= target_begin), + (source_end <= source.size()) && + (source_begin <= source_end) && + (target_begin >= 0) && + (target_begin <= target.size() - (source_end - source_begin)), "Range is out of bounds."); CUDF_EXPECTS(target.type() == source.type(), "Data type mismatch."); diff --git a/cpp/src/filling/fill.cu b/cpp/src/filling/fill.cu index bf4d428db11..03654058a42 100644 --- a/cpp/src/filling/fill.cu +++ b/cpp/src/filling/fill.cu @@ -131,9 +131,8 @@ void fill_in_place(mutable_column_view& destination, CUDF_EXPECTS(cudf::is_fixed_width(destination.type()) == true, "In-place fill does not support variable-sized types."); CUDF_EXPECTS((begin >= 0) && - (begin <= end) && - (begin < destination.size()) && - (end <= destination.size()), + (end <= destination.size()) && + (begin <= end), "Range is out of bounds."); CUDF_EXPECTS((destination.nullable() == true) || (value.is_valid() == true), "destination should be nullable or value should be non-null."); @@ -156,9 +155,8 @@ std::unique_ptr fill(column_view const& input, rmm::mr::device_memory_resource* mr, cudaStream_t stream) { CUDF_EXPECTS((begin >= 0) && - (begin <= end) && - (begin < input.size()) && - (end <= input.size()), + (end <= input.size()) && + (begin <= end), "Range is out of bounds."); CUDF_EXPECTS(input.type() == value.type(), "Data type mismatch."); diff --git a/cpp/tests/copying/copy_range_tests.cpp b/cpp/tests/copying/copy_range_tests.cpp index b4e22ba00e8..53ca43ea8c1 100644 --- a/cpp/tests/copying/copy_range_tests.cpp +++ b/cpp/tests/copying/copy_range_tests.cpp @@ -422,6 +422,7 @@ TEST_F(CopyRangeErrorTestFixture, InvalidRange) thrust::make_counting_iterator(0) + size); cudf::mutable_column_view target_view{target}; + cudf::column_view source_view{source}; // empty_range == no-op, this is valid EXPECT_NO_THROW(cudf::experimental::copy_range( @@ -447,10 +448,10 @@ TEST_F(CopyRangeErrorTestFixture, InvalidRange) // source_begin >= source.size() EXPECT_THROW(cudf::experimental::copy_range( - source, target_view, 100, 100, 0), + source, target_view, 101, 100, 0), cudf::logic_error); EXPECT_THROW(auto p_ret = cudf::experimental::copy_range( - source, target, 100, 100, 0), + source, target, 101, 100, 0), cudf::logic_error); // source_end > source.size() @@ -484,6 +485,18 @@ TEST_F(CopyRangeErrorTestFixture, InvalidRange) EXPECT_THROW(auto p_ret = cudf::experimental::copy_range( source, target, 50, 100, 80), cudf::logic_error); + + // Empty column + target = cudf::test::fixed_width_column_wrapper{}; + source = cudf::test::fixed_width_column_wrapper{}; + target_view = target; + source_view = source; + + // empty column == no-op, this is valid + EXPECT_NO_THROW(cudf::experimental::copy_range( + source_view, target_view, 0, source_view.size(), 0)); + EXPECT_NO_THROW(auto p_ret = cudf::experimental::copy_range( + source_view, target, 0, source_view.size(), 0)); } TEST_F(CopyRangeErrorTestFixture, DTypeMismatch) diff --git a/cpp/tests/filling/fill_tests.cu b/cpp/tests/filling/fill_tests.cu index 2f5a2621850..f059524e5ea 100644 --- a/cpp/tests/filling/fill_tests.cu +++ b/cpp/tests/filling/fill_tests.cu @@ -352,10 +352,10 @@ TEST_F(FillErrorTestFixture, InvalidRange) *p_val), cudf::logic_error); - // out_begin >= destination.size() - EXPECT_THROW(cudf::experimental::fill_in_place(destination_view, 100, 100, *p_val), - cudf::logic_error); - EXPECT_THROW(auto p_ret = cudf::experimental::fill(destination, 100, 100, + // out_begin > destination.size() + EXPECT_THROW(cudf::experimental::fill_in_place(destination_view, 101, 100, *p_val), + cudf::logic_error); + EXPECT_THROW(auto p_ret = cudf::experimental::fill(destination, 101, 100, *p_val), cudf::logic_error); @@ -365,6 +365,16 @@ TEST_F(FillErrorTestFixture, InvalidRange) EXPECT_THROW(auto p_ret = cudf::experimental::fill(destination, 99, 101, *p_val), cudf::logic_error); + + // Empty Column + destination = cudf::test::fixed_width_column_wrapper{}; + destination_view = destination; + + // empty column, this is valid + EXPECT_NO_THROW(cudf::experimental::fill_in_place(destination_view, 0, + destination_view.size(), *p_val)); + EXPECT_NO_THROW(auto p_ret = cudf::experimental::fill(destination, 0, + destination_view.size(), *p_val)); } TEST_F(FillErrorTestFixture, DTypeMismatch) From 557e8cc5654cdb30a1d1e991b0514efd12be6ed8 Mon Sep 17 00:00:00 2001 From: jeanp413 Date: Fri, 14 Feb 2020 10:05:34 -0500 Subject: [PATCH 02/79] :lipstick: --- cpp/src/filling/fill.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/filling/fill.cu b/cpp/src/filling/fill.cu index 03654058a42..8d99d063fe1 100644 --- a/cpp/src/filling/fill.cu +++ b/cpp/src/filling/fill.cu @@ -156,7 +156,7 @@ std::unique_ptr fill(column_view const& input, cudaStream_t stream) { CUDF_EXPECTS((begin >= 0) && (end <= input.size()) && - (begin <= end), + (begin <= end), "Range is out of bounds."); CUDF_EXPECTS(input.type() == value.type(), "Data type mismatch."); From 1f48da7829079706eb95a4cb64906684f8b4305f Mon Sep 17 00:00:00 2001 From: jeanp413 Date: Fri, 14 Feb 2020 10:08:55 -0500 Subject: [PATCH 03/79] Update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 340abb10610..121b4b8aef7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -101,6 +101,7 @@ - PR #4125 Fix type enum to account for added Dictionary type in `types.hpp` - PR #4137 Update Java for mutating fill and rolling window changes - PR #4141 Fix NVStrings test_convert failure in 10.2 build +- PR #4156 Make fill/copy_range no-op on empty columns # cuDF 0.12.0 (04 Feb 2020) From bf18a0e2321b3317277b80a0a71c9850ba5e8569 Mon Sep 17 00:00:00 2001 From: Jean Pierre Date: Sun, 16 Feb 2020 19:58:07 -0500 Subject: [PATCH 04/79] Fix unit test --- cpp/tests/copying/copy_range_tests.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tests/copying/copy_range_tests.cpp b/cpp/tests/copying/copy_range_tests.cpp index 2c49dbc968c..225687296b3 100644 --- a/cpp/tests/copying/copy_range_tests.cpp +++ b/cpp/tests/copying/copy_range_tests.cpp @@ -448,7 +448,7 @@ TEST_F(CopyRangeErrorTestFixture, InvalidRange) // source_begin >= source.size() EXPECT_THROW(cudf::experimental::copy_range_in_place( - source, target_view, 100, 100, 0), + source, target_view, 101, 100, 0), cudf::logic_error); EXPECT_THROW(auto p_ret = cudf::experimental::copy_range( source, target, 101, 100, 0), From 55deaf8f3ec9784ec3868405c07dd63725a1a516 Mon Sep 17 00:00:00 2001 From: jeanp413 Date: Mon, 2 Mar 2020 16:10:47 -0500 Subject: [PATCH 05/79] Update docs --- cpp/include/cudf/copying.hpp | 10 ++++------ cpp/include/cudf/filling.hpp | 6 ++---- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/cpp/include/cudf/copying.hpp b/cpp/include/cudf/copying.hpp index bcfb167f80b..2a71a3a1fb5 100644 --- a/cpp/include/cudf/copying.hpp +++ b/cpp/include/cudf/copying.hpp @@ -243,9 +243,8 @@ std::unique_ptr empty_like(table_view const& input_table); * variable width types). * @throws `cudf::logic_error` for invalid range (if * @p source_begin > @p source_end, @p source_begin < 0, - * @p source_begin >= @p source.size(), @p source_end > @p source.size(), - * @p target_begin < 0, target_begin >= @p target.size(), or - * @p target_begin + (@p source_end - @p source_begin) > @p target.size()). + * @p source_end > @p source.size(), @p target_begin < 0, + * or @p target_begin + (@p source_end - @p source_begin) > @p target.size()). * @throws `cudf::logic_error` if @p target and @p source have different types. * @throws `cudf::logic_error` if @p source has null values and @p target is not * nullable. @@ -278,9 +277,8 @@ void copy_range_in_place(column_view const& source, * * @throws `cudf::logic_error` for invalid range (if * @p source_begin > @p source_end, @p source_begin < 0, - * @p source_begin >= @p source.size(), @p source_end > @p source.size(), - * @p target_begin < 0, target_begin >= @p target.size(), or - * @p target_begin + (@p source_end - @p source_begin) > @p target.size()). + * @p source_end > @p source.size(), @p target_begin < 0, + * or @p target_begin + (@p source_end - @p source_begin) > @p target.size()). * @throws `cudf::logic_error` if @p target and @p source have different types. * * @param source The column to copy from inside the range. diff --git a/cpp/include/cudf/filling.hpp b/cpp/include/cudf/filling.hpp index 0316a154835..f6d3c67dfb6 100644 --- a/cpp/include/cudf/filling.hpp +++ b/cpp/include/cudf/filling.hpp @@ -37,8 +37,7 @@ namespace experimental { * @throws `cudf::logic_error` if memory reallocation is required (e.g. for * variable width types). * @throws `cudf::logic_error` for invalid range (if @p begin < 0, - * @p begin > @p end, @p begin >= @p destination.size(), or - * @p end > @p destination.size()). + * @p begin > @p end, or @p end > @p destination.size()). * @throws `cudf::logic_error` if @p destination and @p value have different * types. * @throws `cudf::logic_error` if @p value is invalid but @p destination is not @@ -62,8 +61,7 @@ void fill_in_place(mutable_column_view& destination, size_type begin, * indicated by the indices [@p begin, @p end) were overwritten by @p value. * * @throws `cudf::logic_error` for invalid range (if @p begin < 0, - * @p begin > @p end, @p begin >= @p destination.size(), or - * @p end > @p destination.size()). + * @p begin > @p end, or @p end > @p destination.size()). * @throws `cudf::logic_error` if @p destination and @p value have different * types. * From 8dbc1d51e81e2d8c94821bf957247817e99e0fb1 Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Thu, 5 Mar 2020 00:57:07 +0530 Subject: [PATCH 06/79] Initial commit for binaryops cython port --- cpp/include/cudf/binaryop.hpp | 2 +- python/cudf/cudf/_libxx/__init__.py | 1 + python/cudf/cudf/_libxx/binaryop.pxd | 6 ++ python/cudf/cudf/_libxx/binaryop.pyx | 117 +++++++++++++++++++++++ python/cudf/cudf/_libxx/cpp/binaryop.pxd | 42 ++++++++ 5 files changed, 167 insertions(+), 1 deletion(-) create mode 100644 python/cudf/cudf/_libxx/binaryop.pxd create mode 100644 python/cudf/cudf/_libxx/binaryop.pyx create mode 100644 python/cudf/cudf/_libxx/cpp/binaryop.pxd diff --git a/cpp/include/cudf/binaryop.hpp b/cpp/include/cudf/binaryop.hpp index f383b4f24ff..891e40f93a5 100644 --- a/cpp/include/cudf/binaryop.hpp +++ b/cpp/include/cudf/binaryop.hpp @@ -27,7 +27,7 @@ namespace experimental { /** * @brief Types of binary operations that can be performed on data. */ -enum class binary_operator { +enum class binary_operator : int32_t { ADD, ///< operator + SUB, ///< operator - MUL, ///< operator * diff --git a/python/cudf/cudf/_libxx/__init__.py b/python/cudf/cudf/_libxx/__init__.py index 0b3aa602333..cb16fc6d169 100644 --- a/python/cudf/cudf/_libxx/__init__.py +++ b/python/cudf/cudf/_libxx/__init__.py @@ -4,6 +4,7 @@ from . import ( avro, + binaryop, copying, dlpack, gpuarrow, diff --git a/python/cudf/cudf/_libxx/binaryop.pxd b/python/cudf/cudf/_libxx/binaryop.pxd new file mode 100644 index 00000000000..3fb36055465 --- /dev/null +++ b/python/cudf/cudf/_libxx/binaryop.pxd @@ -0,0 +1,6 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. + +from libc.stdint cimport int32_t + + +ctypedef int32_t underlying_type_t_binary_operator diff --git a/python/cudf/cudf/_libxx/binaryop.pyx b/python/cudf/cudf/_libxx/binaryop.pyx new file mode 100644 index 00000000000..8ba5ead41ad --- /dev/null +++ b/python/cudf/cudf/_libxx/binaryop.pyx @@ -0,0 +1,117 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. + +import numpy as np +from enum import IntEnum + +from libcpp.memory cimport unique_ptr + +from cudf._libxx.binaryop cimport underlying_type_t_binary_operator +from cudf._libxx.column cimport Column +from cudf._libxx.move cimport move +from cudf._libxx.types import np_to_cudf_types + +from cudf._libxx.cpp.column.column cimport column +from cudf._libxx.cpp.column.column_view cimport column_view +from cudf._libxx.cpp.types cimport ( + data_type, + type_id, +) + +cimport cudf._libxx.cpp.binaryop as cpp_binaryop + + +class BinaryOperation(IntEnum): + ADD = ( + cpp_binaryop.binary_operator.ADD + ) + SUB = ( + cpp_binaryop.binary_operator.SUB + ) + MUL = ( + cpp_binaryop.binary_operator.MUL + ) + DIV = ( + cpp_binaryop.binary_operator.DIV + ) + TRUE_DIV = ( + cpp_binaryop.binary_operator.TRUE_DIV + ) + FLOOR_DIV = ( + cpp_binaryop.binary_operator.FLOOR_DIV + ) + MOD = ( + cpp_binaryop.binary_operator.MOD + ) + PYMOD = ( + cpp_binaryop.binary_operator.PYMOD + ) + POW = ( + cpp_binaryop.binary_operator.POW + ) + EQUAL = ( + cpp_binaryop.binary_operator.EQUAL + ) + NOT_EQUAL = ( + cpp_binaryop.binary_operator.NOT_EQUAL + ) + LESS = ( + cpp_binaryop.binary_operator.LESS + ) + GREATER = ( + cpp_binaryop.binary_operator.GREATER + ) + LESS_EQUAL = ( + cpp_binaryop.binary_operator.LESS_EQUAL + ) + GREATER_EQUAL = ( + cpp_binaryop.binary_operator.GREATER_EQUAL + ) + BITWISE_AND = ( + cpp_binaryop.binary_operator.BITWISE_AND + ) + BITWISE_OR = ( + cpp_binaryop.binary_operator.BITWISE_OR + ) + BITWISE_XOR = ( + cpp_binaryop.binary_operator.BITWISE_XOR + ) + LOGICAL_AND = ( + cpp_binaryop.binary_operator.LOGICAL_AND + ) + LOGICAL_OR = ( + cpp_binaryop.binary_operator.LOGICAL_OR + ) + COALESCE = ( + cpp_binaryop.binary_operator.COALESCE + ) + GENERIC_BINARY = ( + cpp_binaryop.binary_operator.GENERIC_BINARY + ) + + +def binaryop(Column lhs, Column rhs, object op, object dtype): + """ + Dispatches a binary op call to the appropriate libcudf function: + """ + cdef column_view c_lhs = lhs.view() + cdef column_view c_rhs = rhs.view() + cdef cpp_binaryop.binary_operator c_op = \ + ( + op + ) + cdef type_id tid = np_to_cudf_types[np.dtype(dtype)] + cdef data_type c_dtype = data_type(tid) + + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_binaryop.binary_operation( + c_lhs, + c_rhs, + c_op, + c_dtype + ) + ) + + return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_libxx/cpp/binaryop.pxd b/python/cudf/cudf/_libxx/cpp/binaryop.pxd new file mode 100644 index 00000000000..8eb3f65a313 --- /dev/null +++ b/python/cudf/cudf/_libxx/cpp/binaryop.pxd @@ -0,0 +1,42 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr + +from cudf._libxx.cpp.column.column cimport column +from cudf._libxx.cpp.column.column_view cimport column_view +from cudf._libxx.cpp.types cimport ( + data_type +) + +cdef extern from "cudf/binaryop.hpp" namespace "cudf::experimental" nogil: + ctypedef enum binary_operator: + ADD "cudf::experimental::binary_operator::ADD" + SUB "cudf::experimental::binary_operator::SUB" + MUL "cudf::experimental::binary_operator::MUL" + DIV "cudf::experimental::binary_operator::DIV" + TRUE_DIV "cudf::experimental::binary_operator::TRUE_DIV" + FLOOR_DIV "cudf::experimental::binary_operator::FLOOR_DIV" + MOD "cudf::experimental::binary_operator::MOD" + PYMOD "cudf::experimental::binary_operator::PYMOD" + POW "cudf::experimental::binary_operator::POW" + EQUAL "cudf::experimental::binary_operator::EQUAL" + NOT_EQUAL "cudf::experimental::binary_operator::NOT_EQUAL" + LESS "cudf::experimental::binary_operator::LESS" + GREATER "cudf::experimental::binary_operator::GREATER" + LESS_EQUAL "cudf::experimental::binary_operator::LESS_EQUAL" + GREATER_EQUAL "cudf::experimental::binary_operator::GREATER_EQUAL" + BITWISE_AND "cudf::experimental::binary_operator::BITWISE_AND" + BITWISE_OR "cudf::experimental::binary_operator::BITWISE_OR" + BITWISE_XOR "cudf::experimental::binary_operator::BITWISE_XOR" + LOGICAL_AND "cudf::experimental::binary_operator::LOGICAL_AND" + LOGICAL_OR "cudf::experimental::binary_operator::LOGICAL_OR" + COALESCE "cudf::experimental::binary_operator::COALESCE" + GENERIC_BINARY "cudf::experimental::binary_operator::GENERIC_BINARY" + + cdef unique_ptr[column] binary_operation ( + const column_view& lhs, + const column_view& rhs, + binary_operator op, + data_type output_type + ) except + + \ No newline at end of file From a0847eb3740aadf585ec41c575b2a6c5519e9af9 Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Fri, 6 Mar 2020 01:09:27 +0530 Subject: [PATCH 07/79] column-column binop with libxx working --- python/cudf/cudf/_libxx/binaryop.pyx | 29 ++++++++++------------- python/cudf/cudf/core/column/numerical.py | 22 ++--------------- 2 files changed, 15 insertions(+), 36 deletions(-) diff --git a/python/cudf/cudf/_libxx/binaryop.pyx b/python/cudf/cudf/_libxx/binaryop.pyx index 8ba5ead41ad..fe2ffeb2b23 100644 --- a/python/cudf/cudf/_libxx/binaryop.pyx +++ b/python/cudf/cudf/_libxx/binaryop.pyx @@ -33,52 +33,49 @@ class BinaryOperation(IntEnum): DIV = ( cpp_binaryop.binary_operator.DIV ) - TRUE_DIV = ( + TRUEDIV = ( cpp_binaryop.binary_operator.TRUE_DIV ) - FLOOR_DIV = ( + FLOORDIV = ( cpp_binaryop.binary_operator.FLOOR_DIV ) MOD = ( - cpp_binaryop.binary_operator.MOD - ) - PYMOD = ( cpp_binaryop.binary_operator.PYMOD ) POW = ( cpp_binaryop.binary_operator.POW ) - EQUAL = ( + EQ = ( cpp_binaryop.binary_operator.EQUAL ) - NOT_EQUAL = ( + NE = ( cpp_binaryop.binary_operator.NOT_EQUAL ) - LESS = ( + LT = ( cpp_binaryop.binary_operator.LESS ) - GREATER = ( + GT = ( cpp_binaryop.binary_operator.GREATER ) - LESS_EQUAL = ( + LE = ( cpp_binaryop.binary_operator.LESS_EQUAL ) - GREATER_EQUAL = ( + GE = ( cpp_binaryop.binary_operator.GREATER_EQUAL ) - BITWISE_AND = ( + AND = ( cpp_binaryop.binary_operator.BITWISE_AND ) - BITWISE_OR = ( + OR = ( cpp_binaryop.binary_operator.BITWISE_OR ) - BITWISE_XOR = ( + XOR = ( cpp_binaryop.binary_operator.BITWISE_XOR ) - LOGICAL_AND = ( + L_AND = ( cpp_binaryop.binary_operator.LOGICAL_AND ) - LOGICAL_OR = ( + L_OR = ( cpp_binaryop.binary_operator.LOGICAL_OR ) COALESCE = ( diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index b700200f67c..a1960a19d71 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -458,29 +458,11 @@ def _numeric_column_binop(lhs, rhs, op, out_dtype, reflect=False): if reflect: lhs, rhs = rhs, lhs libcudf.nvtx.nvtx_range_push("CUDF_BINARY_OP", "orange") - # Allocate output - masked = False - if np.isscalar(lhs): - masked = rhs.nullable - row_count = len(rhs) - elif np.isscalar(rhs): - masked = lhs.nullable - row_count = len(lhs) - elif rhs is None: - masked = True - row_count = len(lhs) - elif lhs is None: - masked = True - row_count = len(rhs) - else: - masked = lhs.nullable or rhs.nullable - row_count = len(lhs) is_op_comparison = op in ["lt", "gt", "le", "ge", "eq", "ne"] - out = column.column_empty(row_count, dtype=out_dtype, masked=masked) - - _ = libcudf.binops.apply_op(lhs, rhs, out, op) + operator = libcudfxx.binaryop.BinaryOperation[op.upper()] + out = libcudfxx.binaryop.binaryop(lhs, rhs, operator, out_dtype) if is_op_comparison: out = out.fillna(op == "ne") From 06b2609e7257c0a0c39eb0d74daeeb54cf1bbdd7 Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Fri, 6 Mar 2020 04:19:53 +0530 Subject: [PATCH 08/79] Scalar binops now working with libxx --- python/cudf/cudf/_libxx/binaryop.pyx | 129 +++++++++++++++++------ python/cudf/cudf/_libxx/cpp/binaryop.pxd | 15 +++ 2 files changed, 114 insertions(+), 30 deletions(-) diff --git a/python/cudf/cudf/_libxx/binaryop.pyx b/python/cudf/cudf/_libxx/binaryop.pyx index fe2ffeb2b23..4b9cc4d8afe 100644 --- a/python/cudf/cudf/_libxx/binaryop.pyx +++ b/python/cudf/cudf/_libxx/binaryop.pyx @@ -8,96 +8,110 @@ from libcpp.memory cimport unique_ptr from cudf._libxx.binaryop cimport underlying_type_t_binary_operator from cudf._libxx.column cimport Column from cudf._libxx.move cimport move +from cudf._libxx.scalar cimport Scalar from cudf._libxx.types import np_to_cudf_types from cudf._libxx.cpp.column.column cimport column +from cudf._libxx.cpp.scalar.scalar cimport scalar from cudf._libxx.cpp.column.column_view cimport column_view from cudf._libxx.cpp.types cimport ( data_type, type_id, ) +from cudf._libxx.cpp.binaryop cimport binary_operator cimport cudf._libxx.cpp.binaryop as cpp_binaryop class BinaryOperation(IntEnum): ADD = ( - cpp_binaryop.binary_operator.ADD + binary_operator.ADD ) SUB = ( - cpp_binaryop.binary_operator.SUB + binary_operator.SUB ) MUL = ( - cpp_binaryop.binary_operator.MUL + binary_operator.MUL ) DIV = ( - cpp_binaryop.binary_operator.DIV + binary_operator.DIV ) TRUEDIV = ( - cpp_binaryop.binary_operator.TRUE_DIV + binary_operator.TRUE_DIV ) FLOORDIV = ( - cpp_binaryop.binary_operator.FLOOR_DIV + binary_operator.FLOOR_DIV ) MOD = ( - cpp_binaryop.binary_operator.PYMOD + binary_operator.PYMOD ) POW = ( - cpp_binaryop.binary_operator.POW + binary_operator.POW ) EQ = ( - cpp_binaryop.binary_operator.EQUAL + binary_operator.EQUAL ) NE = ( - cpp_binaryop.binary_operator.NOT_EQUAL + binary_operator.NOT_EQUAL ) LT = ( - cpp_binaryop.binary_operator.LESS + binary_operator.LESS ) GT = ( - cpp_binaryop.binary_operator.GREATER + binary_operator.GREATER ) LE = ( - cpp_binaryop.binary_operator.LESS_EQUAL + binary_operator.LESS_EQUAL ) GE = ( - cpp_binaryop.binary_operator.GREATER_EQUAL + binary_operator.GREATER_EQUAL ) AND = ( - cpp_binaryop.binary_operator.BITWISE_AND + binary_operator.BITWISE_AND ) OR = ( - cpp_binaryop.binary_operator.BITWISE_OR + binary_operator.BITWISE_OR ) XOR = ( - cpp_binaryop.binary_operator.BITWISE_XOR + binary_operator.BITWISE_XOR ) L_AND = ( - cpp_binaryop.binary_operator.LOGICAL_AND + binary_operator.LOGICAL_AND ) L_OR = ( - cpp_binaryop.binary_operator.LOGICAL_OR + binary_operator.LOGICAL_OR ) COALESCE = ( - cpp_binaryop.binary_operator.COALESCE + binary_operator.COALESCE ) GENERIC_BINARY = ( - cpp_binaryop.binary_operator.GENERIC_BINARY + binary_operator.GENERIC_BINARY ) -def binaryop(Column lhs, Column rhs, object op, object dtype): - """ - Dispatches a binary op call to the appropriate libcudf function: - """ +cdef binaryop_v_v(Column lhs, Column rhs, + binary_operator c_op, data_type c_dtype): cdef column_view c_lhs = lhs.view() cdef column_view c_rhs = rhs.view() - cdef cpp_binaryop.binary_operator c_op = \ - ( - op + + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_binaryop.binary_operation( + c_lhs, + c_rhs, + c_op, + c_dtype + ) ) - cdef type_id tid = np_to_cudf_types[np.dtype(dtype)] - cdef data_type c_dtype = data_type(tid) + + return Column.from_unique_ptr(move(c_result)) + +cdef binaryop_v_s(Column lhs, Scalar rhs, + binary_operator c_op, data_type c_dtype): + cdef column_view c_lhs = lhs.view() + cdef scalar* c_rhs = rhs.c_value.get() cdef unique_ptr[column] c_result @@ -105,6 +119,25 @@ def binaryop(Column lhs, Column rhs, object op, object dtype): c_result = move( cpp_binaryop.binary_operation( c_lhs, + c_rhs[0], + c_op, + c_dtype + ) + ) + + return Column.from_unique_ptr(move(c_result)) + +cdef binaryop_s_v(Scalar lhs, Column rhs, + binary_operator c_op, data_type c_dtype): + cdef scalar* c_lhs = lhs.c_value.get() + cdef column_view c_rhs = rhs.view() + + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_binaryop.binary_operation( + c_lhs[0], c_rhs, c_op, c_dtype @@ -112,3 +145,39 @@ def binaryop(Column lhs, Column rhs, object op, object dtype): ) return Column.from_unique_ptr(move(c_result)) + +def binaryop(lhs, rhs, op, dtype): + """ + Dispatches a binary op call to the appropriate libcudf function: + """ + cdef binary_operator c_op = ( + op + ) + cdef type_id tid = np_to_cudf_types[np.dtype(dtype)] + cdef data_type c_dtype = data_type(tid) + + if np.isscalar(lhs) or lhs is None: + s_lhs = Scalar(lhs, dtype=rhs.dtype if lhs is None else None) + return binaryop_s_v( + s_lhs, + rhs, + c_op, + c_dtype + ) + + elif np.isscalar(rhs) or rhs is None: + s_rhs = Scalar(rhs, dtype=lhs.dtype if rhs is None else None) + return binaryop_v_s( + lhs, + s_rhs, + c_op, + c_dtype + ) + + else: + return binaryop_v_v( + lhs, + rhs, + c_op, + c_dtype + ) diff --git a/python/cudf/cudf/_libxx/cpp/binaryop.pxd b/python/cudf/cudf/_libxx/cpp/binaryop.pxd index 8eb3f65a313..93e95bf6aba 100644 --- a/python/cudf/cudf/_libxx/cpp/binaryop.pxd +++ b/python/cudf/cudf/_libxx/cpp/binaryop.pxd @@ -3,6 +3,7 @@ from libcpp.memory cimport unique_ptr from cudf._libxx.cpp.column.column cimport column +from cudf._libxx.cpp.scalar.scalar cimport scalar from cudf._libxx.cpp.column.column_view cimport column_view from cudf._libxx.cpp.types cimport ( data_type @@ -33,6 +34,20 @@ cdef extern from "cudf/binaryop.hpp" namespace "cudf::experimental" nogil: COALESCE "cudf::experimental::binary_operator::COALESCE" GENERIC_BINARY "cudf::experimental::binary_operator::GENERIC_BINARY" + cdef unique_ptr[column] binary_operation ( + const scalar& lhs, + const column_view& rhs, + binary_operator op, + data_type output_type + ) except + + + cdef unique_ptr[column] binary_operation ( + const column_view& lhs, + const scalar& rhs, + binary_operator op, + data_type output_type + ) except + + cdef unique_ptr[column] binary_operation ( const column_view& lhs, const column_view& rhs, From 79abc0c880d6ec6f3a45bc6d020ad8557ea24721 Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Fri, 6 Mar 2020 05:03:12 +0530 Subject: [PATCH 09/79] UDF binaryop working with libxx --- python/cudf/cudf/_libxx/binaryop.pyx | 30 +++++++++++++++++++++++ python/cudf/cudf/_libxx/cpp/binaryop.pxd | 12 +++++++-- python/cudf/cudf/tests/test_udf_binops.py | 4 +-- 3 files changed, 42 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/_libxx/binaryop.pyx b/python/cudf/cudf/_libxx/binaryop.pyx index 4b9cc4d8afe..9a7b8b9128d 100644 --- a/python/cudf/cudf/_libxx/binaryop.pyx +++ b/python/cudf/cudf/_libxx/binaryop.pyx @@ -4,6 +4,7 @@ import numpy as np from enum import IntEnum from libcpp.memory cimport unique_ptr +from libcpp.string cimport string from cudf._libxx.binaryop cimport underlying_type_t_binary_operator from cudf._libxx.column cimport Column @@ -181,3 +182,32 @@ def binaryop(lhs, rhs, op, dtype): c_op, c_dtype ) + +def binaryop_udf(Column lhs, Column rhs, udf_ptx, dtype): + """ + Apply a user-defined binary operator (a UDF) defined in `udf_ptx` on + the two input columns `lhs` and `rhs`. The output type of the UDF + has to be specified in `dtype`, a numpy data type. + Currently ONLY int32, int64, float32 and float64 are supported. + """ + cdef column_view c_lhs = lhs.view() + cdef column_view c_rhs = rhs.view() + + cdef type_id tid = np_to_cudf_types[np.dtype(dtype)] + cdef data_type c_dtype = data_type(tid) + + cdef string cpp_str = udf_ptx.encode("UTF-8") + + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_binaryop.binary_operation( + c_lhs, + c_rhs, + cpp_str, + c_dtype + ) + ) + + return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_libxx/cpp/binaryop.pxd b/python/cudf/cudf/_libxx/cpp/binaryop.pxd index 93e95bf6aba..37111fbacb7 100644 --- a/python/cudf/cudf/_libxx/cpp/binaryop.pxd +++ b/python/cudf/cudf/_libxx/cpp/binaryop.pxd @@ -1,6 +1,7 @@ # Copyright (c) 2020, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr +from libcpp.string cimport string from cudf._libxx.cpp.column.column cimport column from cudf._libxx.cpp.scalar.scalar cimport scalar @@ -40,18 +41,25 @@ cdef extern from "cudf/binaryop.hpp" namespace "cudf::experimental" nogil: binary_operator op, data_type output_type ) except + - + cdef unique_ptr[column] binary_operation ( const column_view& lhs, const scalar& rhs, binary_operator op, data_type output_type ) except + - + cdef unique_ptr[column] binary_operation ( const column_view& lhs, const column_view& rhs, binary_operator op, data_type output_type ) except + + + cdef unique_ptr[column] binary_operation ( + const column_view& lhs, + const column_view& rhs, + const string& op, + data_type output_type + ) except + \ No newline at end of file diff --git a/python/cudf/cudf/tests/test_udf_binops.py b/python/cudf/cudf/tests/test_udf_binops.py index 4693d61fbbd..d377ef332cd 100644 --- a/python/cudf/cudf/tests/test_udf_binops.py +++ b/python/cudf/cudf/tests/test_udf_binops.py @@ -7,7 +7,7 @@ import pytest from packaging.version import Version -import cudf._lib as libcudf +import cudf._libxx as libcudfxx from cudf.core import Series supported_types = ["int16", "int32", "int64", "float32", "float64"] @@ -41,7 +41,7 @@ def generic_function(a, b): output_type = numba.numpy_support.as_dtype(result.signature.return_type) - out_col = libcudf.binops.apply_op_udf( + out_col = libcudfxx.binaryop.binaryop_udf( lhs_col, rhs_col, ptx_code, output_type.type ) From 72191c2c0b42cc5dedb295aad745016b33102e47 Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Sat, 7 Mar 2020 01:22:05 +0530 Subject: [PATCH 10/79] style fixes --- python/cudf/cudf/_libxx/binaryop.pyx | 38 +++++++++++++----------- python/cudf/cudf/_libxx/cpp/binaryop.pxd | 1 - 2 files changed, 21 insertions(+), 18 deletions(-) diff --git a/python/cudf/cudf/_libxx/binaryop.pyx b/python/cudf/cudf/_libxx/binaryop.pyx index 9a7b8b9128d..d99a13f4fd1 100644 --- a/python/cudf/cudf/_libxx/binaryop.pyx +++ b/python/cudf/cudf/_libxx/binaryop.pyx @@ -109,6 +109,7 @@ cdef binaryop_v_v(Column lhs, Column rhs, return Column.from_unique_ptr(move(c_result)) + cdef binaryop_v_s(Column lhs, Scalar rhs, binary_operator c_op, data_type c_dtype): cdef column_view c_lhs = lhs.view() @@ -128,6 +129,7 @@ cdef binaryop_v_s(Column lhs, Scalar rhs, return Column.from_unique_ptr(move(c_result)) + cdef binaryop_s_v(Scalar lhs, Column rhs, binary_operator c_op, data_type c_dtype): cdef scalar* c_lhs = lhs.c_value.get() @@ -147,6 +149,7 @@ cdef binaryop_s_v(Scalar lhs, Column rhs, return Column.from_unique_ptr(move(c_result)) + def binaryop(lhs, rhs, op, dtype): """ Dispatches a binary op call to the appropriate libcudf function: @@ -160,28 +163,29 @@ def binaryop(lhs, rhs, op, dtype): if np.isscalar(lhs) or lhs is None: s_lhs = Scalar(lhs, dtype=rhs.dtype if lhs is None else None) return binaryop_s_v( - s_lhs, - rhs, - c_op, - c_dtype - ) - + s_lhs, + rhs, + c_op, + c_dtype + ) + elif np.isscalar(rhs) or rhs is None: s_rhs = Scalar(rhs, dtype=lhs.dtype if rhs is None else None) return binaryop_v_s( - lhs, - s_rhs, - c_op, - c_dtype - ) - + lhs, + s_rhs, + c_op, + c_dtype + ) + else: return binaryop_v_v( - lhs, - rhs, - c_op, - c_dtype - ) + lhs, + rhs, + c_op, + c_dtype + ) + def binaryop_udf(Column lhs, Column rhs, udf_ptx, dtype): """ diff --git a/python/cudf/cudf/_libxx/cpp/binaryop.pxd b/python/cudf/cudf/_libxx/cpp/binaryop.pxd index 37111fbacb7..07481ab2bca 100644 --- a/python/cudf/cudf/_libxx/cpp/binaryop.pxd +++ b/python/cudf/cudf/_libxx/cpp/binaryop.pxd @@ -62,4 +62,3 @@ cdef extern from "cudf/binaryop.hpp" namespace "cudf::experimental" nogil: const string& op, data_type output_type ) except + - \ No newline at end of file From e0c394273c355f3b706c9aaca6c0e554906ec379 Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Tue, 10 Mar 2020 23:12:44 +0530 Subject: [PATCH 11/79] Fix issue with MIN/MAX strings when val have nulls --- cpp/include/cudf/detail/replace.hpp | 2 +- cpp/src/groupby/hash/groupby.cu | 20 ++++++++++++++++---- cpp/tests/groupby/sort/group_max_test.cu | 14 ++++++++++++++ cpp/tests/groupby/sort/group_min_test.cu | 14 ++++++++++++++ 4 files changed, 45 insertions(+), 5 deletions(-) diff --git a/cpp/include/cudf/detail/replace.hpp b/cpp/include/cudf/detail/replace.hpp index 1818526ff53..413eb1f90ef 100644 --- a/cpp/include/cudf/detail/replace.hpp +++ b/cpp/include/cudf/detail/replace.hpp @@ -57,7 +57,7 @@ std::unique_ptr replace_nulls(column_view const& input, * @returns Copy of `input` with null values replaced by `replacement`. */ std::unique_ptr replace_nulls(column_view const& input, - scalar const* replacement, + scalar const& replacement, rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource(), cudaStream_t stream = 0); diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index cf281bc76d7..06a206f79b7 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -23,10 +23,12 @@ #include #include #include +#include #include #include #include #include +#include #include #include #include @@ -166,10 +168,20 @@ void sparse_to_dense_results( [&col, to_dense_agg_result, mr, stream] (auto const& agg_kind) { auto tranformed_agg = std::make_unique(agg_kind); - auto argmax_result = to_dense_agg_result(tranformed_agg); - auto transformed_result = experimental::detail::gather( - table_view({col}), *argmax_result, false, false, false, mr, stream); - return std::move(transformed_result->release()[0]); + auto arg_result = to_dense_agg_result(tranformed_agg); + if (arg_result->has_nulls()) { + auto replacement = numeric_scalar(-1, true, stream); + auto null_replaced_map = cudf::detail::replace_nulls( + *arg_result, replacement, rmm::mr::get_default_resource(), stream); + auto transformed_result = experimental::detail::gather( + table_view({col}), *null_replaced_map, false, true, false, mr, stream); + return std::move(transformed_result->release()[0]); + } + else { + auto transformed_result = experimental::detail::gather( + table_view({col}), *arg_result, false, false, false, mr, stream); + return std::move(transformed_result->release()[0]); + } }; for (auto &&agg : agg_v) { diff --git a/cpp/tests/groupby/sort/group_max_test.cu b/cpp/tests/groupby/sort/group_max_test.cu index 77937050422..b540d00bdea 100644 --- a/cpp/tests/groupby/sort/group_max_test.cu +++ b/cpp/tests/groupby/sort/group_max_test.cu @@ -133,5 +133,19 @@ TEST_F(groupby_max_string_test, basic) test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } +TEST_F(groupby_max_string_test, zero_valid_values) +{ + using K = int32_t; + + fixed_width_column_wrapper keys { 1, 1, 1}; + strings_column_wrapper vals ( { "año", "bit", "₹1"}, all_null() ); + + fixed_width_column_wrapper expect_keys { 1 }; + strings_column_wrapper expect_vals({ "" }, all_null()); + + auto agg = cudf::experimental::make_max_aggregation(); + test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); +} + } // namespace test } // namespace cudf diff --git a/cpp/tests/groupby/sort/group_min_test.cu b/cpp/tests/groupby/sort/group_min_test.cu index abe5f6f006f..c2ae89cb971 100644 --- a/cpp/tests/groupby/sort/group_min_test.cu +++ b/cpp/tests/groupby/sort/group_min_test.cu @@ -133,5 +133,19 @@ TEST_F(groupby_min_string_test, basic) test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } +TEST_F(groupby_min_string_test, zero_valid_values) +{ + using K = int32_t; + + fixed_width_column_wrapper keys { 1, 1, 1}; + strings_column_wrapper vals ( { "año", "bit", "₹1"}, all_null() ); + + fixed_width_column_wrapper expect_keys { 1 }; + strings_column_wrapper expect_vals({ "" }, all_null()); + + auto agg = cudf::experimental::make_min_aggregation(); + test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); +} + } // namespace test } // namespace cudf From 28f92451e6d5e8a83360a7c742f14def492cae20 Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Tue, 10 Mar 2020 23:23:39 +0530 Subject: [PATCH 12/79] changelog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c660d83500b..0eba669fe2c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -221,7 +221,7 @@ - PR #4358 Fix strings::concat where narep is an empty string - PR #4369 Fix race condition in gpuinflate - PR #4390 Disable ScatterValid and ScatterNull legacy tests - +- PR #4398 Fixes the bug in groupby in MIN/MAX on strings when strings some groups are empty # cuDF 0.12.0 (04 Feb 2020) From 39a7537fc97b31c5238f9debd2e65e7da7fa2cec Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 11 Mar 2020 16:47:37 -0400 Subject: [PATCH 13/79] Move BinaryOperation creation to Cython --- python/cudf/cudf/_libxx/binaryop.pyx | 1 + python/cudf/cudf/core/column/numerical.py | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/_libxx/binaryop.pyx b/python/cudf/cudf/_libxx/binaryop.pyx index d99a13f4fd1..40f124ce9ad 100644 --- a/python/cudf/cudf/_libxx/binaryop.pyx +++ b/python/cudf/cudf/_libxx/binaryop.pyx @@ -154,6 +154,7 @@ def binaryop(lhs, rhs, op, dtype): """ Dispatches a binary op call to the appropriate libcudf function: """ + op = BinaryOperation[op.upper()] cdef binary_operator c_op = ( op ) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index a1960a19d71..5bc3c106a12 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -461,8 +461,7 @@ def _numeric_column_binop(lhs, rhs, op, out_dtype, reflect=False): is_op_comparison = op in ["lt", "gt", "le", "ge", "eq", "ne"] - operator = libcudfxx.binaryop.BinaryOperation[op.upper()] - out = libcudfxx.binaryop.binaryop(lhs, rhs, operator, out_dtype) + out = libcudfxx.binaryop.binaryop(lhs, rhs, op, out_dtype) if is_op_comparison: out = out.fillna(op == "ne") From e21d82106823809bb8d94846e4c4a13def4c4f1b Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Thu, 12 Mar 2020 04:29:45 +0530 Subject: [PATCH 14/79] Remove extra replace_nulls operation --- cpp/src/groupby/hash/groupby.cu | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 06a206f79b7..a039000ef2a 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -169,12 +169,12 @@ void sparse_to_dense_results( (auto const& agg_kind) { auto tranformed_agg = std::make_unique(agg_kind); auto arg_result = to_dense_agg_result(tranformed_agg); - if (arg_result->has_nulls()) { - auto replacement = numeric_scalar(-1, true, stream); - auto null_replaced_map = cudf::detail::replace_nulls( - *arg_result, replacement, rmm::mr::get_default_resource(), stream); + if (arg_result->nullable()) { + column_view null_removed_map(data_type(type_to_id()), + arg_result->size(), + static_cast(arg_result->view().template data())); auto transformed_result = experimental::detail::gather( - table_view({col}), *null_replaced_map, false, true, false, mr, stream); + table_view({col}), null_removed_map, false, true, false, mr, stream); return std::move(transformed_result->release()[0]); } else { From 3e4a8d6587ad1beffdafbf7fdc56a4f8d2023cf5 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 12 Mar 2020 08:09:25 -0400 Subject: [PATCH 15/79] Remove Series._rbinaryop and Series._filled_binary_op --- python/cudf/cudf/core/series.py | 106 ++++++++++++++------------------ 1 file changed, 47 insertions(+), 59 deletions(-) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index e8306710a8d..ace410f665a 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1,5 +1,4 @@ # Copyright (c) 2018, NVIDIA CORPORATION. -import operator import pickle import warnings from numbers import Number @@ -597,7 +596,7 @@ def __repr__(self): lines.append(category_memory) return "\n".join(lines) - def _binaryop(self, other, fn, reflect=False): + def _binaryop(self, other, fn, fill_value=None, reflect=False): """ Internal util to call a binary operator *fn* on operands *self* and *other*. Return the output Series. The output dtype is @@ -619,48 +618,31 @@ def _binaryop(self, other, fn, reflect=False): else: lhs, rhs = self, other rhs = self._normalize_binop_value(rhs) - outcol = lhs._column.binary_operator(fn, rhs, reflect=reflect) - result = lhs._copy_construct(data=outcol, name=result_name) - libcudf.nvtx.nvtx_range_pop() - return result - - def _rbinaryop(self, other, fn): - """ - Internal util to call a binary operator *fn* on operands *self* - and *other* for reflected operations. Return the output Series. - The output dtype is determined by the input operands. - """ - return self._binaryop(other, fn, reflect=True) - - def _filled_binaryop(self, other, fn, fill_value=None, reflect=False): - def func(lhs, rhs): - return fn(rhs, lhs) if reflect else fn(lhs, rhs) - - if isinstance(other, Series): - lhs, rhs = _align_indices([self, other], allow_non_unique=True) - else: - lhs, rhs = self, other if fill_value is not None: - if isinstance(rhs, Series): + if is_scalar(rhs): + lhs = lhs.fillna(fill_value) + else: if lhs.nullable and rhs.nullable: lmask = Series(data=lhs.nullmask) rmask = Series(data=rhs.nullmask) mask = (lmask | rmask).data lhs = lhs.fillna(fill_value) rhs = rhs.fillna(fill_value) - result = func(lhs, rhs) + result = lhs._binaryop(rhs, fn=fn, reflect=reflect) data = column.build_column( data=result.data, dtype=result.dtype, mask=mask ) return lhs._copy_construct(data=data) elif lhs.nullable: - return func(lhs.fillna(fill_value), rhs) + lhs = lhs.fillna(fill_value) elif rhs.nullable: - return func(lhs, rhs.fillna(fill_value)) - elif is_scalar(rhs): - return func(lhs.fillna(fill_value), rhs) - return func(lhs, rhs) + rhs = rhs.fillna(fill_value) + + outcol = lhs._column.binary_operator(fn, rhs, reflect=reflect) + result = lhs._copy_construct(data=outcol, name=result_name) + libcudf.nvtx.nvtx_range_pop() + return result def add(self, other, fill_value=None, axis=0): """Addition of series and other, element-wise @@ -675,7 +657,7 @@ def add(self, other, fill_value=None, axis=0): """ if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") - return self._filled_binaryop(other, operator.add, fill_value) + return self._binaryop(other, "add", fill_value) def __add__(self, other): return self._binaryop(other, "add") @@ -693,10 +675,12 @@ def radd(self, other, fill_value=None, axis=0): """ if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") - return self._filled_binaryop(other, operator.add, fill_value, True) + return self._binaryop( + other, "add", fill_value=fill_value, reflect=True + ) def __radd__(self, other): - return self._rbinaryop(other, "add") + return self._binaryop(other, "add", reflect=True) def sub(self, other, fill_value=None, axis=0): """Subtraction of series and other, element-wise @@ -711,7 +695,7 @@ def sub(self, other, fill_value=None, axis=0): """ if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") - return self._filled_binaryop(other, operator.sub, fill_value) + return self._binaryop(other, "sub", fill_value) def __sub__(self, other): return self._binaryop(other, "sub") @@ -729,10 +713,10 @@ def rsub(self, other, fill_value=None, axis=0): """ if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") - return self._filled_binaryop(other, operator.sub, fill_value, True) + return self._binaryop(other, "sub", fill_value, reflect=True) def __rsub__(self, other): - return self._rbinaryop(other, "sub") + return self._binaryop(other, "sub", reflect=True) def mul(self, other, fill_value=None, axis=0): """Multiplication of series and other, element-wise @@ -747,7 +731,7 @@ def mul(self, other, fill_value=None, axis=0): """ if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") - return self._filled_binaryop(other, operator.mul, fill_value) + return self._binaryop(other, "mul", fill_value=fill_value) def __mul__(self, other): return self._binaryop(other, "mul") @@ -765,10 +749,10 @@ def rmul(self, other, fill_value=None, axis=0): """ if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") - return self._filled_binaryop(other, operator.mul, fill_value, True) + return self._binaryop(other, "mul", fill_value, True) def __rmul__(self, other): - return self._rbinaryop(other, "mul") + return self._binaryop(other, "mul", reflect=True) def mod(self, other, fill_value=None, axis=0): """Modulo of series and other, element-wise @@ -783,7 +767,7 @@ def mod(self, other, fill_value=None, axis=0): """ if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") - return self._filled_binaryop(other, operator.mod, fill_value) + return self._binaryop(other, "mod", fill_value) def __mod__(self, other): return self._binaryop(other, "mod") @@ -801,10 +785,10 @@ def rmod(self, other, fill_value=None, axis=0): """ if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") - return self._filled_binaryop(other, operator.mod, fill_value, True) + return self._binaryop(other, "mod", fill_value, True) def __rmod__(self, other): - return self._rbinaryop(other, "mod") + return self._binaryop(other, "mod", reflect=True) def pow(self, other, fill_value=None, axis=0): """Exponential power of series and other, element-wise @@ -819,7 +803,7 @@ def pow(self, other, fill_value=None, axis=0): """ if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") - return self._filled_binaryop(other, operator.pow, fill_value) + return self._binaryop(other, "pow", fill_value) def __pow__(self, other): return self._binaryop(other, "pow") @@ -837,10 +821,10 @@ def rpow(self, other, fill_value=None, axis=0): """ if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") - return self._filled_binaryop(other, operator.pow, fill_value, True) + return self._binaryop(other, "pow", fill_value, True) def __rpow__(self, other): - return self._rbinaryop(other, "pow") + return self._binaryop(other, "pow", reflect=True) def floordiv(self, other, fill_value=None, axis=0): """Integer division of series and other, element-wise @@ -855,7 +839,7 @@ def floordiv(self, other, fill_value=None, axis=0): """ if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") - return self._filled_binaryop(other, operator.floordiv, fill_value) + return self._binaryop(other, "floordiv", fill_value) def __floordiv__(self, other): return self._binaryop(other, "floordiv") @@ -873,12 +857,12 @@ def rfloordiv(self, other, fill_value=None, axis=0): """ if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") - return self._filled_binaryop( - other, operator.floordiv, fill_value, True + return self._binaryop( + other, "floordiv", fill_value, True ) def __rfloordiv__(self, other): - return self._rbinaryop(other, "floordiv") + return self._binaryop(other, "floordiv", reflect=True) def truediv(self, other, fill_value=None, axis=0): """Floating division of series and other, element-wise @@ -893,7 +877,7 @@ def truediv(self, other, fill_value=None, axis=0): """ if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") - return self._filled_binaryop(other, operator.truediv, fill_value) + return self._binaryop(other, "truediv", fill_value) def __truediv__(self, other): if self.dtype in list(truediv_int_dtype_corrections.keys()): @@ -915,14 +899,18 @@ def rtruediv(self, other, fill_value=None, axis=0): """ if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") - return self._filled_binaryop(other, operator.truediv, fill_value, True) + return self._binaryop(other, "truediv", fill_value, True) def __rtruediv__(self, other): if self.dtype in list(truediv_int_dtype_corrections.keys()): truediv_type = truediv_int_dtype_corrections[str(self.dtype)] - return self.astype(truediv_type)._rbinaryop(other, "truediv") + return self.astype(truediv_type)._binaryop( + other, + "truediv", + reflect=True + ) else: - return self._rbinaryop(other, "truediv") + return self._binaryop(other, "truediv", reflect=True) __div__ = __truediv__ @@ -1021,7 +1009,7 @@ def eq(self, other, fill_value=None, axis=0): """ if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") - return self._filled_binaryop(other, operator.eq, fill_value) + return self._binaryop(other, "eq", fill_value) def __eq__(self, other): return self._unordered_compare(other, "eq") @@ -1046,7 +1034,7 @@ def ne(self, other, fill_value=None, axis=0): """ if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") - return self._filled_binaryop(other, operator.ne, fill_value) + return self._binaryop(other, "ne", fill_value) def __ne__(self, other): return self._unordered_compare(other, "ne") @@ -1064,7 +1052,7 @@ def lt(self, other, fill_value=None, axis=0): """ if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") - return self._filled_binaryop(other, operator.lt, fill_value) + return self._binaryop(other, "lt", fill_value) def __lt__(self, other): return self._ordered_compare(other, "lt") @@ -1082,7 +1070,7 @@ def le(self, other, fill_value=None, axis=0): """ if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") - return self._filled_binaryop(other, operator.le, fill_value) + return self._binaryop(other, "le", fill_value) def __le__(self, other): return self._ordered_compare(other, "le") @@ -1100,7 +1088,7 @@ def gt(self, other, fill_value=None, axis=0): """ if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") - return self._filled_binaryop(other, operator.gt, fill_value) + return self._binaryop(other, "gt", fill_value) def __gt__(self, other): return self._ordered_compare(other, "gt") @@ -1118,7 +1106,7 @@ def ge(self, other, fill_value=None, axis=0): """ if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") - return self._filled_binaryop(other, operator.ge, fill_value) + return self._binaryop(other, "ge", fill_value) def __ge__(self, other): return self._ordered_compare(other, "ge") From ea3fb8928d7f63ffac7c65fc5f5473f42ec61ad4 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 12 Mar 2020 08:09:47 -0400 Subject: [PATCH 16/79] Fix to_cudf_compatible_scalar --- python/cudf/cudf/utils/dtypes.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 2150646e434..8741f3e7046 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -143,6 +143,8 @@ def to_cudf_compatible_scalar(val, dtype=None): if val is None: return val + dtype = "str" if is_string_dtype(dtype) else dtype + if not is_scalar(val): raise ValueError( f"Cannot convert value of type {type(val).__name__} " From e739c076aae284505f350e22d5d424c86995e9a5 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 12 Mar 2020 11:36:24 -0400 Subject: [PATCH 17/79] Remove (un)ordered_compare methods across all column types --- python/cudf/cudf/core/column/categorical.py | 31 +++++------ python/cudf/cudf/core/column/column.py | 2 +- python/cudf/cudf/core/column/datetime.py | 12 ++--- python/cudf/cudf/core/column/numerical.py | 18 +++---- python/cudf/cudf/core/column/string.py | 11 ++-- python/cudf/cudf/core/series.py | 57 ++++++--------------- python/cudf/cudf/tests/test_categorical.py | 6 +-- 7 files changed, 48 insertions(+), 89 deletions(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 69e89ab52cd..7778ecde5a9 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -140,7 +140,7 @@ def _categories_equal(self, new_categories, **kwargs): cur_categories = Series(cur_categories).sort_values() new_categories = Series(new_categories).sort_values() - return cur_categories.equals(new_categories) + return cur_categories._column.equals(new_categories._column) def _set_categories(self, new_categories, **kwargs): """Returns a new CategoricalColumn with the categories set to the @@ -362,13 +362,6 @@ def ordered(self, value): def cat(self, parent=None): return CategoricalAccessor(self, parent=parent) - def binary_operator(self, binop, rhs, reflect=False): - msg = ( - "Series of dtype `category` cannot perform the operation: " - "{}".format(binop) - ) - raise TypeError(msg) - def unary_operator(self, unaryop): msg = ( "Series of dtype `category` cannot perform the operation: " @@ -376,18 +369,20 @@ def unary_operator(self, unaryop): ) raise TypeError(msg) - def unordered_compare(self, cmpop, rhs): - if self.dtype != rhs.dtype: - raise TypeError("Categoricals can only compare with the same type") - return self.as_numerical.unordered_compare(cmpop, rhs.as_numerical) + def binary_operator(self, op, rhs, reflect=False): - def ordered_compare(self, cmpop, rhs): - if not (self.ordered and rhs.ordered): - msg = "Unordered Categoricals can only compare equality or not" - raise TypeError(msg) + if not (self.ordered and rhs.ordered) and op not in ("eq", "ne"): + if op in ("lt", "gt", "le", "ge"): + raise TypeError( + f"Unordered Categoricals can only compare equality or not" + ) + raise TypeError( + f"Series of dtype `{self.dtype}` cannot perform the " + f"operation: {op}" + ) if self.dtype != rhs.dtype: raise TypeError("Categoricals can only compare with the same type") - return self.as_numerical.ordered_compare(cmpop, rhs.as_numerical) + return self.as_numerical.binary_operator(op, rhs.as_numerical) def normalize_binop_value(self, other): ary = utils.scalar_broadcast_to( @@ -631,7 +626,7 @@ def pandas_categorical_as_column(categorical, codes=None): codes = categorical.codes if codes is None else codes codes = column.as_column(codes) - valid_codes = codes.unordered_compare("ne", codes.dtype.type(-1)) + valid_codes = codes.binary_operator("ne", codes.dtype.type(-1)) mask = None if not valid_codes.all(): diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 9bc9bab6087..57fd880c51b 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -140,7 +140,7 @@ def equals(self, other): if isinstance(val, np.ndarray): return val.all() return bool(val) - return self.unordered_compare("eq", other).min() + return self.binary_operator("eq", other).min() def __sizeof__(self): n = self.data.size diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 029bb8dbee7..1ac5a90335f 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -177,14 +177,6 @@ def as_string_column(self, dtype, **kwargs): else: return column.column_empty(0, dtype="object", masked=False) - def unordered_compare(self, cmpop, rhs): - lhs, rhs = self, rhs - return binop(lhs, rhs, op=cmpop, out_dtype=np.bool) - - def ordered_compare(self, cmpop, rhs): - lhs, rhs = self, rhs - return binop(lhs, rhs, op=cmpop, out_dtype=np.bool) - def to_pandas(self, index=None): return pd.Series( self.to_array(fillna="pandas").astype(self.dtype), index=index @@ -214,6 +206,10 @@ def default_na_value(self): "datetime column of {} has no NaN value".format(self.dtype) ) + def binary_operator(self, op, rhs, reflect=False): + lhs, rhs = self, rhs + return binop(lhs, rhs, op=op, out_dtype=np.bool) + def fillna(self, fill_value): if is_scalar(fill_value): fill_value = np.datetime64(fill_value, self.time_unit) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 5bc3c106a12..1595a54c92d 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -59,6 +59,9 @@ def __contains__(self, item): self, column.as_column([item], dtype=self.dtype) ).any() + def unary_operator(self, unaryop): + return _numeric_column_unaryop(self, op=unaryop) + def binary_operator(self, binop, rhs, reflect=False): int_dtypes = [ np.dtype("int8"), @@ -89,12 +92,6 @@ def binary_operator(self, binop, rhs, reflect=False): def unary_operator(self, unaryop): return _numeric_column_unaryop(self, op=unaryop) - def unordered_compare(self, cmpop, rhs): - return _numeric_column_compare(self, rhs, op=cmpop) - - def ordered_compare(self, cmpop, rhs): - return _numeric_column_compare(self, rhs, op=cmpop) - def _apply_scan_op(self, op): return libcudfxx.reduce.scan(op, self, True) @@ -461,6 +458,9 @@ def _numeric_column_binop(lhs, rhs, op, out_dtype, reflect=False): is_op_comparison = op in ["lt", "gt", "le", "ge", "eq", "ne"] + if is_op_comparison: + out_dtype = "bool" + out = libcudfxx.binaryop.binaryop(lhs, rhs, op, out_dtype) if is_op_comparison: @@ -478,10 +478,6 @@ def _numeric_column_unaryop(operand, op): return libcudfxx.unary.unary_operation(operand, op) -def _numeric_column_compare(lhs, rhs, op): - return _numeric_column_binop(lhs, rhs, op, out_dtype=np.bool_) - - def _safe_cast_to_int(col, dtype): """ Cast given NumericalColumn to given integer dtype safely. @@ -492,7 +488,7 @@ def _safe_cast_to_int(col, dtype): return col new_col = col.astype(dtype) - if new_col.unordered_compare("eq", col).all(): + if new_col.binary_operator("eq", col).all(): return new_col else: raise TypeError( diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 048e2dadb07..e280c86187a 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -1023,9 +1023,6 @@ def sort_by_values(self, ascending=True, na_position="last"): def copy(self, deep=True): return column.as_column(self.nvstrings.copy()) - def unordered_compare(self, cmpop, rhs): - return _string_column_binop(self, rhs, op=cmpop) - def find_and_replace(self, to_replace, replacement, all_nan): """ Return col with *to_replace* replaced with *value* @@ -1108,15 +1105,17 @@ def normalize_binop_value(self, other): def default_na_value(self): return None - def binary_operator(self, binop, rhs, reflect=False): + def binary_operator(self, op, rhs, reflect=False): lhs = self if reflect: lhs, rhs = rhs, lhs - if isinstance(rhs, StringColumn) and binop == "add": + if isinstance(rhs, StringColumn) and op == "add": return lhs.nvstrings.cat(others=rhs.nvstrings) + elif op in ("eq", "ne"): + return _string_column_binop(self, rhs, op=op) else: msg = "{!r} operator not supported between {} and {}" - raise TypeError(msg.format(binop, type(self), type(rhs))) + raise TypeError(msg.format(op, type(self), type(rhs))) def sum(self, dtype=None): # dtype is irrelevant it is needed to be in sync with diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index ace410f665a..93f6edcd4d2 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -619,6 +619,11 @@ def _binaryop(self, other, fn, fill_value=None, reflect=False): lhs, rhs = self, other rhs = self._normalize_binop_value(rhs) + if fn == "truediv": + if str(lhs.dtype) in truediv_int_dtype_corrections: + truediv_type = truediv_int_dtype_corrections[str(lhs.dtype)] + lhs = lhs.astype(truediv_type) + if fill_value is not None: if is_scalar(rhs): lhs = lhs.fillna(fill_value) @@ -857,9 +862,7 @@ def rfloordiv(self, other, fill_value=None, axis=0): """ if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") - return self._binaryop( - other, "floordiv", fill_value, True - ) + return self._binaryop(other, "floordiv", fill_value, True) def __rfloordiv__(self, other): return self._binaryop(other, "floordiv", reflect=True) @@ -880,11 +883,7 @@ def truediv(self, other, fill_value=None, axis=0): return self._binaryop(other, "truediv", fill_value) def __truediv__(self, other): - if self.dtype in list(truediv_int_dtype_corrections.keys()): - truediv_type = truediv_int_dtype_corrections[str(self.dtype)] - return self.astype(truediv_type)._binaryop(other, "truediv") - else: - return self._binaryop(other, "truediv") + return self._binaryop(other, "truediv") def rtruediv(self, other, fill_value=None, axis=0): """Floating division of series and other, element-wise @@ -902,15 +901,7 @@ def rtruediv(self, other, fill_value=None, axis=0): return self._binaryop(other, "truediv", fill_value, True) def __rtruediv__(self, other): - if self.dtype in list(truediv_int_dtype_corrections.keys()): - truediv_type = truediv_int_dtype_corrections[str(self.dtype)] - return self.astype(truediv_type)._binaryop( - other, - "truediv", - reflect=True - ) - else: - return self._binaryop(other, "truediv", reflect=True) + return self._binaryop(other, "truediv", reflect=True) __div__ = __truediv__ @@ -978,24 +969,6 @@ def _normalize_binop_value(self, other): else: return self._column.normalize_binop_value(other) - def _unordered_compare(self, other, cmpops): - libcudf.nvtx.nvtx_range_push("CUDF_UNORDERED_COMP", "orange") - result_name = utils.get_result_name(self, other) - other = self._normalize_binop_value(other) - outcol = self._column.unordered_compare(cmpops, other) - result = self._copy_construct(data=outcol, name=result_name) - libcudf.nvtx.nvtx_range_pop() - return result - - def _ordered_compare(self, other, cmpops): - libcudf.nvtx.nvtx_range_push("CUDF_ORDERED_COMP", "orange") - result_name = utils.get_result_name(self, other) - other = self._normalize_binop_value(other) - outcol = self._column.ordered_compare(cmpops, other) - result = self._copy_construct(data=outcol, name=result_name) - libcudf.nvtx.nvtx_range_pop() - return result - def eq(self, other, fill_value=None, axis=0): """Equal to of series and other, element-wise (binary operator eq). @@ -1012,14 +985,14 @@ def eq(self, other, fill_value=None, axis=0): return self._binaryop(other, "eq", fill_value) def __eq__(self, other): - return self._unordered_compare(other, "eq") + return self._binaryop(other, "eq") def equals(self, other): if self is other: return True if other is None or len(self) != len(other): return False - return self._unordered_compare(other, "eq").min() + return self._binaryop(other, "eq").min() def ne(self, other, fill_value=None, axis=0): """Not equal to of series and other, element-wise @@ -1037,7 +1010,7 @@ def ne(self, other, fill_value=None, axis=0): return self._binaryop(other, "ne", fill_value) def __ne__(self, other): - return self._unordered_compare(other, "ne") + return self._binaryop(other, "ne") def lt(self, other, fill_value=None, axis=0): """Less than of series and other, element-wise @@ -1055,7 +1028,7 @@ def lt(self, other, fill_value=None, axis=0): return self._binaryop(other, "lt", fill_value) def __lt__(self, other): - return self._ordered_compare(other, "lt") + return self._binaryop(other, "lt") def le(self, other, fill_value=None, axis=0): """Less than or equal to of series and other, element-wise @@ -1073,7 +1046,7 @@ def le(self, other, fill_value=None, axis=0): return self._binaryop(other, "le", fill_value) def __le__(self, other): - return self._ordered_compare(other, "le") + return self._binaryop(other, "le") def gt(self, other, fill_value=None, axis=0): """Greater than of series and other, element-wise @@ -1091,7 +1064,7 @@ def gt(self, other, fill_value=None, axis=0): return self._binaryop(other, "gt", fill_value) def __gt__(self, other): - return self._ordered_compare(other, "gt") + return self._binaryop(other, "gt") def ge(self, other, fill_value=None, axis=0): """Greater than or equal to of series and other, element-wise @@ -1109,7 +1082,7 @@ def ge(self, other, fill_value=None, axis=0): return self._binaryop(other, "ge", fill_value) def __ge__(self, other): - return self._ordered_compare(other, "ge") + return self._binaryop(other, "ge") def __invert__(self): """Bitwise invert (~) for each element. diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py index 2f857e7023a..54a35f58130 100644 --- a/python/cudf/cudf/tests/test_categorical.py +++ b/python/cudf/cudf/tests/test_categorical.py @@ -150,7 +150,7 @@ def test_categorical_binary_add(): with pytest.raises(TypeError) as raises: sr + sr raises.match( - "Series of dtype `category` cannot perform the operation: " "add" + "Series of dtype `category` cannot perform the operation: add" ) @@ -166,7 +166,7 @@ def test_categorical_unary_ceil(): with pytest.raises(TypeError) as raises: sr.ceil() raises.match( - "Series of dtype `category` cannot perform the operation: " "ceil" + "Series of dtype `category` cannot perform the operation: ceil" ) @@ -243,7 +243,7 @@ def test_cat_series_binop_error(): with pytest.raises(TypeError) as raises: dfa + dfb raises.match( - "Series of dtype `category` cannot perform the operation: " "add" + "Series of dtype `category` cannot perform the operation: add" ) # if lhs is a numerical with pytest.raises(TypeError) as raises: From 32b39d2c1041e1d314fe268ff86252fb74bcfaf5 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 12 Mar 2020 12:14:31 -0400 Subject: [PATCH 18/79] Port datetime/string ops to libcudfxx --- python/cudf/cudf/core/column/datetime.py | 4 +--- python/cudf/cudf/core/column/string.py | 12 ++++-------- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 1ac5a90335f..70728889852 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -288,8 +288,6 @@ def is_monotonic_decreasing(self): def binop(lhs, rhs, op, out_dtype): libcudf.nvtx.nvtx_range_push("CUDF_BINARY_OP", "orange") - masked = lhs.nullable or rhs.nullable - out = column.column_empty_like(lhs, dtype=out_dtype, masked=masked) - _ = libcudf.binops.apply_op(lhs, rhs, out, op) + out = libcudfxx.binaryop.binaryop(lhs, rhs, op, out_dtype) libcudf.nvtx.nvtx_range_pop() return out diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index e280c86187a..ba2811fbe75 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -1111,8 +1111,8 @@ def binary_operator(self, op, rhs, reflect=False): lhs, rhs = rhs, lhs if isinstance(rhs, StringColumn) and op == "add": return lhs.nvstrings.cat(others=rhs.nvstrings) - elif op in ("eq", "ne"): - return _string_column_binop(self, rhs, op=op) + elif op in ("eq", "ne", "gt", "lt", "ge", "le"): + return _string_column_binop(self, rhs, op=op, out_dtype="bool") else: msg = "{!r} operator not supported between {} and {}" raise TypeError(msg.format(op, type(self), type(rhs))) @@ -1164,12 +1164,8 @@ def _mimic_inplace(self, other_col, inplace=False): return out -def _string_column_binop(lhs, rhs, op): +def _string_column_binop(lhs, rhs, op, out_dtype): nvtx_range_push("CUDF_BINARY_OP", "orange") - # Allocate output - masked = lhs.nullable or rhs.nullable - out = column.column_empty_like(lhs, dtype="bool", masked=masked) - # Call and fix null_count - _ = libcudf.binops.apply_op(lhs=lhs, rhs=rhs, out=out, op=op) + out = libcudfxx.binaryop.binaryop(lhs=lhs, rhs=rhs, op=op, dtype=out_dtype) nvtx_range_pop() return out From acc60a55119f9a8a90fe6964f3a407c347acce4b Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 12 Mar 2020 14:53:07 -0700 Subject: [PATCH 19/79] fix .str.rsplit to be similar to .str.split and enable tests --- python/cudf/cudf/core/column/string.py | 16 +++++++++++----- python/cudf/cudf/tests/test_string.py | 21 ++++++++------------- 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 24984aadd4e..982777d526c 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -879,10 +879,11 @@ def split(self, pat=None, n=-1, expand=True, **kwargs): from cudf._libxx.scalar import Scalar result_table = cpp_split(self._column, Scalar(pat, "str"), n) - if len(result_table._data) == 1: if result_table._data[0].null_count == len(self._parent): result_table = [] + elif self._parent.null_count == len(self._parent): + result_table = [self._column.copy()] return self._return_or_inplace(result_table, **kwargs,) @@ -920,13 +921,18 @@ def rsplit(self, pat=None, n=-1, expand=True, **kwargs): kwargs.setdefault("expand", expand) if pat is None: - pat = " " + pat = "" from cudf._libxx.scalar import Scalar - return self._return_or_inplace( - cpp_rsplit(self._column, Scalar(pat), n), **kwargs - ) + result_table = cpp_rsplit(self._column, Scalar(pat), n) + if len(result_table._data) == 1: + if result_table._data[0].null_count == len(self._parent): + result_table = [] + elif self._parent.null_count == len(self._parent): + result_table = [self._column.copy()] + + return self._return_or_inplace(result_table, **kwargs) def partition(self, sep=" ", expand=True, **kwargs): """ diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index 96273e0a483..9764a20f0cf 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -469,11 +469,8 @@ def test_string_extract(ps_gs, pat, expand, flags, flags_raise): ("a", False), ("a", True), ("f", False), - # TODO, PREM: Analyse and uncomment the - # two tests as they seem to pass when run - # as independent test but seem to fail as a group test. - # (r"[a-z]", True), - # (r"[A-Z]", True), + (r"[a-z]", True), + (r"[A-Z]", True), ("hello", False), ("FGHI", False), ], @@ -537,8 +534,7 @@ def test_string_upper(ps_gs): ["a b", " c ", " d", "e ", "f"], ["a-b", "-c-", "---d", "e---", "f"], ["ab", "c", "d", "e", "f"], - # TODO, PREM: Uncomment in future PR - # [None, None, None, None, None], + [None, None, None, None, None], ], ) @pytest.mark.parametrize("pat", [None, " ", "-"]) @@ -1226,11 +1222,11 @@ def test_strings_rsplit(data, n, expand): gs = Series(data) ps = pd.Series(data) - # TODO: Uncomment this test once - # this is fixed: https://github.com/rapidsai/cudf/issues/4357 - # assert_eq( - # ps.str.rsplit(n=n, expand=expand), gs.str.rsplit(n=n, expand=expand) - # ) + pd.testing.assert_frame_equal( + ps.str.rsplit(n=n, expand=expand).reset_index(), + gs.str.rsplit(n=n, expand=expand).to_pandas().reset_index(), + check_index_type=False, + ) assert_eq( ps.str.rsplit(",", n=n, expand=expand), gs.str.rsplit(",", n=n, expand=expand), @@ -1591,7 +1587,6 @@ def test_string_starts_ends(data, pat): [ # TODO, PREM: Uncomment after this issue is fixed # '', - # None, " ", "a", "abc", From ade1f53fbfa4a39fe5b956b561b857497f56a88b Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 12 Mar 2020 16:55:39 -0500 Subject: [PATCH 20/79] Update CHANGELOG.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 76b6c91caeb..69b6fcce685 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -249,6 +249,7 @@ - PR #4434 Fix join_strings logic with all-null strings and non-null narep - PR #4464 Update Cmake to always link in libnvToolsExt - PR #4467 Fix dropna issue for a DataFrame having np.nan +- PR #4482 Fix `.str.rsplit` & `.str.split` and enable related tests # cuDF 0.12.0 (04 Feb 2020) From 7efdc49ab94a6e1dd92a9e70ada0e66dcd69ad24 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Fri, 13 Mar 2020 09:29:20 -0500 Subject: [PATCH 21/79] nvtext cython stubs --- .../cudf/cudf/_libxx/cpp/nvtext/__init__.pxd | 0 .../_libxx/cpp/nvtext/generate_ngrams.pxd | 16 ++++ .../_libxx/cpp/nvtext/ngrams_tokenize.pxd | 17 ++++ .../cudf/cudf/_libxx/cpp/nvtext/normalize.pxd | 12 +++ .../cudf/cudf/_libxx/cpp/nvtext/tokenize.pxd | 29 +++++++ python/cudf/cudf/_libxx/nvtext/__init__.py | 0 .../cudf/_libxx/nvtext/generate_ngrams.pyx | 32 +++++++ .../cudf/_libxx/nvtext/ngrams_tokenize.pyx | 39 +++++++++ python/cudf/cudf/_libxx/nvtext/normalize.pyx | 22 +++++ python/cudf/cudf/_libxx/nvtext/tokenize.pyx | 87 +++++++++++++++++++ 10 files changed, 254 insertions(+) create mode 100644 python/cudf/cudf/_libxx/cpp/nvtext/__init__.pxd create mode 100644 python/cudf/cudf/_libxx/cpp/nvtext/generate_ngrams.pxd create mode 100644 python/cudf/cudf/_libxx/cpp/nvtext/ngrams_tokenize.pxd create mode 100644 python/cudf/cudf/_libxx/cpp/nvtext/normalize.pxd create mode 100644 python/cudf/cudf/_libxx/cpp/nvtext/tokenize.pxd create mode 100644 python/cudf/cudf/_libxx/nvtext/__init__.py create mode 100644 python/cudf/cudf/_libxx/nvtext/generate_ngrams.pyx create mode 100644 python/cudf/cudf/_libxx/nvtext/ngrams_tokenize.pyx create mode 100644 python/cudf/cudf/_libxx/nvtext/normalize.pyx create mode 100644 python/cudf/cudf/_libxx/nvtext/tokenize.pyx diff --git a/python/cudf/cudf/_libxx/cpp/nvtext/__init__.pxd b/python/cudf/cudf/_libxx/cpp/nvtext/__init__.pxd new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/cudf/cudf/_libxx/cpp/nvtext/generate_ngrams.pxd b/python/cudf/cudf/_libxx/cpp/nvtext/generate_ngrams.pxd new file mode 100644 index 00000000000..f20209956b1 --- /dev/null +++ b/python/cudf/cudf/_libxx/cpp/nvtext/generate_ngrams.pxd @@ -0,0 +1,16 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr + +from cudf._libxx.cpp.column.column cimport column +from cudf._libxx.cpp.column.column_view cimport column_view +from cudf._libxx.cpp.scalar.scalar cimport scalar +from cudf._libxx.cpp.types cimport size_type + +cdef extern from "cudf/nvtext/generate_ngrams.hpp" namespace "nvtext" nogil: + + cdef unique_ptr[column] generate_ngrams( + const column_view &strings, + size_type ngrams, + const scalar & separator + ) except + diff --git a/python/cudf/cudf/_libxx/cpp/nvtext/ngrams_tokenize.pxd b/python/cudf/cudf/_libxx/cpp/nvtext/ngrams_tokenize.pxd new file mode 100644 index 00000000000..0fc892d3b9d --- /dev/null +++ b/python/cudf/cudf/_libxx/cpp/nvtext/ngrams_tokenize.pxd @@ -0,0 +1,17 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr + +from cudf._libxx.cpp.column.column cimport column +from cudf._libxx.cpp.column.column_view cimport column_view +from cudf._libxx.cpp.scalar.scalar cimport scalar +from cudf._libxx.cpp.types cimport size_type + +cdef extern from "cudf/nvtext/ngrams_tokenize.hpp" namespace "nvtext" nogil: + + cdef unique_ptr[column] ngrams_tokenize( + const column_view & strings, + size_type ngrams, + const scalar & delimiter, + const scalar & separator + ) except + diff --git a/python/cudf/cudf/_libxx/cpp/nvtext/normalize.pxd b/python/cudf/cudf/_libxx/cpp/nvtext/normalize.pxd new file mode 100644 index 00000000000..dc4b060d7f6 --- /dev/null +++ b/python/cudf/cudf/_libxx/cpp/nvtext/normalize.pxd @@ -0,0 +1,12 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr + +from cudf._libxx.cpp.column.column cimport column +from cudf._libxx.cpp.column.column_view cimport column_view + +cdef extern from "cudf/nvtext/normalize.hpp" namespace "nvtext" nogil: + + cdef unique_ptr[column] normalize_spaces( + const column_view & strings + ) except + diff --git a/python/cudf/cudf/_libxx/cpp/nvtext/tokenize.pxd b/python/cudf/cudf/_libxx/cpp/nvtext/tokenize.pxd new file mode 100644 index 00000000000..0c653c7afbe --- /dev/null +++ b/python/cudf/cudf/_libxx/cpp/nvtext/tokenize.pxd @@ -0,0 +1,29 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr + +from cudf._libxx.cpp.column.column cimport column +from cudf._libxx.cpp.column.column_view cimport column_view +from cudf._libxx.cpp.scalar.scalar cimport scalar + +cdef extern from "cudf/nvtext/tokenize.hpp" namespace "nvtext" nogil: + + cdef unique_ptr[column] tokenize( + const column_view & strings, + const scalar & delimiter + ) except + + + cdef unique_ptr[column] tokenize( + const column_view & strings, + const column_view & delimiters + ) except + + + cdef unique_ptr[column] count_tokens( + const column_view & strings, + const scalar & delimiter + ) except + + + cdef unique_ptr[column] count_tokens( + const column_view & strings, + const column_view & delimiters + ) except + diff --git a/python/cudf/cudf/_libxx/nvtext/__init__.py b/python/cudf/cudf/_libxx/nvtext/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/cudf/cudf/_libxx/nvtext/generate_ngrams.pyx b/python/cudf/cudf/_libxx/nvtext/generate_ngrams.pyx new file mode 100644 index 00000000000..51796e5411a --- /dev/null +++ b/python/cudf/cudf/_libxx/nvtext/generate_ngrams.pyx @@ -0,0 +1,32 @@ +# Copyright (c) 2018-2020, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from cudf._libxx.move cimport move + +from cudf._libxx.cpp.column.column cimport column +from cudf._libxx.cpp.scalar.scalar cimport scalar +from cudf._libxx.cpp.types cimport size_type +from cudf._libxx.cpp.column.column_view cimport column_view +from cudf._libxx.cpp.nvtext.generate_ngrams cimport ( + generate_ngrams as cpp_generate_ngrams +) +from cudf._libxx.column cimport Column +from cudf._libxx.scalar cimport Scalar + + +def generate_ngrams(Column strings, int ngrams, Scalar separator): + cdef column_view source_view = strings.view() + cdef size_type c_ngrams = ngrams + cdef scalar* c_separator = separator.c_value.get() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_generate_ngrams( + c_strings, + c_ngrams + c_separator[0] + ) + ) + + return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_libxx/nvtext/ngrams_tokenize.pyx b/python/cudf/cudf/_libxx/nvtext/ngrams_tokenize.pyx new file mode 100644 index 00000000000..1f7afa2bf09 --- /dev/null +++ b/python/cudf/cudf/_libxx/nvtext/ngrams_tokenize.pyx @@ -0,0 +1,39 @@ +# Copyright (c) 2018-2020, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from cudf._libxx.move cimport move + +from cudf._libxx.cpp.column.column cimport column +from cudf._libxx.cpp.scalar.scalar cimport scalar +from cudf._libxx.cpp.types cimport size_type +from cudf._libxx.cpp.column.column_view cimport column_view +from cudf._libxx.cpp.nvtext.ngrams_tokenize cimport ( + ngrams_tokenize as cpp_ngrams_tokenize +) +from cudf._libxx.column cimport Column +from cudf._libxx.scalar cimport Scalar + + +def ngrams_tokenize( + Column strings, + int ngrams, + Scalar delimiter, + Scalar separator +): + cdef column_view source_view = strings.view() + cdef size_type c_ngrams = ngrams + cdef scalar* c_separator = separator.c_value.get() + cdef scalar* c_delimiter = delimiter.c_value.get() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_ngrams_tokenize( + c_strings, + c_ngrams + c_delimiter[0] + c_separator[0] + ) + ) + + return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_libxx/nvtext/normalize.pyx b/python/cudf/cudf/_libxx/nvtext/normalize.pyx new file mode 100644 index 00000000000..7ed15b9dfc6 --- /dev/null +++ b/python/cudf/cudf/_libxx/nvtext/normalize.pyx @@ -0,0 +1,22 @@ +# Copyright (c) 2018-2020, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from cudf._libxx.move cimport move + +from cudf._libxx.cpp.column.column cimport column +from cudf._libxx.cpp.column.column_view cimport column_view +from cudf._libxx.cpp.nvtext.normalize cimport ( + normalize as cpp_normalize +) +from cudf._libxx.column cimport Column +from cudf._libxx.scalar cimport Scalar + + +def normalize_spaces(Column strings, int ngrams): + cdef column_view source_view = strings.view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move(cpp_normalize(c_strings)) + + return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_libxx/nvtext/tokenize.pyx b/python/cudf/cudf/_libxx/nvtext/tokenize.pyx new file mode 100644 index 00000000000..5383b8136b2 --- /dev/null +++ b/python/cudf/cudf/_libxx/nvtext/tokenize.pyx @@ -0,0 +1,87 @@ +# Copyright (c) 2018-2020, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from cudf._libxx.move cimport move + +from cudf._libxx.cpp.column.column cimport column +from cudf._libxx.cpp.scalar.scalar cimport scalar +from cudf._libxx.cpp.types cimport size_type +from cudf._libxx.cpp.column.column_view cimport column_view +from cudf._libxx.cpp.nvtext.tokenize cimport ( + tokenize as cpp_tokenize, + count_tokens as cpp_count_tokens +) +from cudf._libxx.column cimport Column +from cudf._libxx.scalar cimport Scalar + + +def tokenize(Column strings, Scalar delimiter): + cdef column_view c_strings = strings.view() + cdef scalar* c_delimiter = delimiter.c_value.get() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_tokenize( + c_strings, + c_ngrams + c_delimiter[0] + c_separator[0] + ) + ) + + return Column.from_unique_ptr(move(c_result)) + + +def tokenize(Column strings, Column delimiters): + cdef column_view c_strings = strings.view() + cdef column_view c_delimiter = delimiter.view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_tokenize( + c_strings, + c_ngrams + c_delimiter[0] + c_separator[0] + ) + ) + + return Column.from_unique_ptr(move(c_result)) + + +def count_tokens(Column strings, Scalar delimiter): + cdef column_view c_strings = strings.view() + cdef scalar* c_delimiter = delimiter.c_value.get() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_count_tokens( + c_strings, + c_ngrams + c_delimiter[0] + c_separator[0] + ) + ) + + return Column.from_unique_ptr(move(c_result)) + + +def count_tokens(Column strings, Column delimiters): + cdef column_view c_strings = strings.view() + cdef column_view c_delimiter = delimiter.view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_count_tokens( + c_strings, + c_ngrams + c_delimiter[0] + c_separator[0] + ) + ) + + return Column.from_unique_ptr(move(c_result)) From f33806c812abfe3b2e0365796ec4a9d8326e7ddf Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Fri, 13 Mar 2020 20:27:24 +0530 Subject: [PATCH 22/79] Chnagelog typo --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cdc3e7eb71d..2236e701ed9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -234,7 +234,7 @@ - PR #4358 Fix strings::concat where narep is an empty string - PR #4369 Fix race condition in gpuinflate - PR #4390 Disable ScatterValid and ScatterNull legacy tests -- PR #4398 Fixes the bug in groupby in MIN/MAX on strings when strings some groups are empty +- PR #4398 Fixes the failure in groupby in MIN/MAX on strings when some groups are empty - PR #4406 Fix sorted merge issue with null values and ascending=False - PR #4423 Tighten up Dask serialization checks - PR #4434 Fix join_strings logic with all-null strings and non-null narep From a77f47864acdcd278c60d5c72ed33b1da0bfb5e8 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 13 Mar 2020 10:13:45 -0500 Subject: [PATCH 23/79] Update python/cudf/cudf/core/column/string.py Co-Authored-By: Keith Kraus --- python/cudf/cudf/core/column/string.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 982777d526c..8196719f60f 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -882,7 +882,7 @@ def split(self, pat=None, n=-1, expand=True, **kwargs): if len(result_table._data) == 1: if result_table._data[0].null_count == len(self._parent): result_table = [] - elif self._parent.null_count == len(self._parent): + elif self._column.null_count == len(self._column): result_table = [self._column.copy()] return self._return_or_inplace(result_table, **kwargs,) From cfe1786a2ddbdfcc0c7bb1b2be52dbaf797231ed Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 13 Mar 2020 10:13:56 -0500 Subject: [PATCH 24/79] Update python/cudf/cudf/core/column/string.py Co-Authored-By: Keith Kraus --- python/cudf/cudf/core/column/string.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 8196719f60f..ebe338a8014 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -880,7 +880,7 @@ def split(self, pat=None, n=-1, expand=True, **kwargs): result_table = cpp_split(self._column, Scalar(pat, "str"), n) if len(result_table._data) == 1: - if result_table._data[0].null_count == len(self._parent): + if result_table._data[0].null_count == len(self._column): result_table = [] elif self._column.null_count == len(self._column): result_table = [self._column.copy()] From f198709ac65add6b0229ce1a62bda3e2c267427b Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Fri, 13 Mar 2020 08:37:00 -0700 Subject: [PATCH 25/79] enable replace_with_backrefs tests --- python/cudf/cudf/tests/test_string.py | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index 9764a20f0cf..abb0dd45cdc 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -1479,8 +1479,6 @@ def test_string_replace_multi(): assert_eq(expect, got) -# TODO, PREM: Uncomment this following tests after -# this is fixed: https://github.com/rapidsai/cudf/issues/4380 @pytest.mark.parametrize( "find", [ @@ -1488,22 +1486,15 @@ def test_string_replace_multi(): "(\\d)(\\d)", "(\\d)(\\d)", "(\\d)(\\d)", - # "([a-z])-([a-z])", + "([a-z])-([a-z])", "([a-z])-([a-zé])", "([a-z])-([a-z])", - # "([a-z])-([a-zé])", + "([a-z])-([a-zé])", ], ) @pytest.mark.parametrize( "replace", - [ - "\\1-\\2", - "V\\2-\\1", - "\\1 \\2", - "\\2 \\1", - # "X\\1+\\2Z", - # "X\\1+\\2Z" - ], + ["\\1-\\2", "V\\2-\\1", "\\1 \\2", "\\2 \\1", "X\\1+\\2Z", "X\\1+\\2Z"], ) def test_string_replace_with_backrefs(find, replace): s = [ @@ -1513,7 +1504,7 @@ def test_string_replace_with_backrefs(find, replace): None, "tést-string", "two-thréé four-fivé", - # "abcd-éfgh", + "abcd-éfgh", "tést-string-again", ] ps = pd.Series(s) From b3a4a2bd78c506a23285f4f0067c4f709c9c0c42 Mon Sep 17 00:00:00 2001 From: "Ram (Ramakrishna Prabhu)" Date: Fri, 13 Mar 2020 10:57:43 -0500 Subject: [PATCH 26/79] code changes --- python/cudf/cudf/_libxx/copying.pyx | 43 +++++++++++++++++++++++------ python/cudf/cudf/core/dataframe.py | 2 +- python/cudf/cudf/core/frame.py | 28 +++++++++++++++---- 3 files changed, 57 insertions(+), 16 deletions(-) diff --git a/python/cudf/cudf/_libxx/copying.pyx b/python/cudf/cudf/_libxx/copying.pyx index 4c692e7b210..03d3a017137 100644 --- a/python/cudf/cudf/_libxx/copying.pyx +++ b/python/cudf/cudf/_libxx/copying.pyx @@ -313,9 +313,14 @@ def column_allocate_like(Column input_column, size=None): return Column.from_unique_ptr(move(c_result)) -def table_empty_like(Table input_table): +def table_empty_like(Table input_table, bool keep_index=True): + + cdef table_view input_table_view + if keep_index is True: + input_table_view = input_table.view() + else: + input_table_view = input_table.data_view() - cdef table_view input_table_view = input_table.view() cdef unique_ptr[table] c_result with nogil: @@ -324,7 +329,9 @@ def table_empty_like(Table input_table): return Table.from_unique_ptr( move(c_result), column_names=input_table._column_names, - index_names=input_table._index._column_names + index_names=( + input_table._index._column_names if keep_index is True else None + ) ) @@ -357,9 +364,14 @@ def column_slice(Column input_column, object indices): return result -def table_slice(Table input_table, object indices): +def table_slice(Table input_table, object indices, bool keep_index=True): + + cdef table_view input_table_view + if keep_index is True: + input_table_view = input_table.view() + else: + input_table_view = input_table.data_view() - cdef table_view input_table_view = input_table.view() cdef vector[size_type] c_indices c_indices.reserve(len(indices)) @@ -382,7 +394,11 @@ def table_slice(Table input_table, object indices): c_result[i], input_table, column_names=input_table._column_names, - index_names=input_table._index._column_names + index_names=( + input_table._index._column_names if ( + keep_index is True) + else None + ) ) for i in range(num_of_result_cols)] return result @@ -419,9 +435,14 @@ def column_split(Column input_column, object splits): return result -def table_split(Table input_table, object splits): +def table_split(Table input_table, object splits, keep_index=True): + + cdef table_view input_table_view + if keep_index is True: + input_table_view = input_table.view() + else: + input_table_view = input_table.data_view() - cdef table_view input_table_view = input_table.view() cdef vector[size_type] c_splits c_splits.reserve(len(splits)) @@ -444,7 +465,11 @@ def table_split(Table input_table, object splits): c_result[i], input_table, column_names=input_table._column_names, - index_names=input_table._index._column_names + index_names=( + input_table._index._column_names if ( + keep_index is True) + else None + ) ) for i in range(num_of_result_cols)] return result diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 4c31547cb1b..2e18dcedc5f 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -3964,7 +3964,7 @@ def scatter_by_map(self, map_index, map_size=None, keep_index=True): # Append empty dataframes if map_size > len(tables) for i in range(map_size - len(tables)): - tables.append(self.take([])) + tables.append(self._empty_like(keep_index)) return tables def stack(self, level=-1, dropna=True): diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 72bf2af4cfc..72b5a4eacca 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -106,8 +106,10 @@ def _scatter(self, key, value): result._copy_categories(self) return result - def _empty_like(self): - result = self._from_table(libcudfxx.copying.table_empty_like(self)) + def _empty_like(self, keep_index=True): + result = self._from_table( + libcudfxx.copying.table_empty_like(self, keep_index) + ) result._copy_categories(self) return result @@ -121,6 +123,8 @@ def _slice(self, arg): arg : should always be of type slice and doesn't handle step """ + from cudf.core.index import RangeIndex + num_rows = len(self) if num_rows == 0: return self @@ -131,22 +135,34 @@ def _slice(self, arg): """Step size is not supported other than None and 1""" ) + # This is just to handle RangeIndex type, stop + # it from materializing unnecessarily + keep_index = True + if isinstance(self.index, RangeIndex): + keep_index = False + if start < 0: start = start + num_rows if stop < 0: stop = stop + num_rows if start > stop: - return self._empty_like() + return self._empty_like(keep_index) else: start = len(self) if start > num_rows else start stop = len(self) if stop > num_rows else stop result = self._from_table( - libcudfxx.copying.table_slice(self, [start, stop])[0] + libcudfxx.copying.table_slice(self, [start, stop], keep_index)[ + 0 + ] ) - result._copy_categories(self) + result._copy_categories(self, keep_index) + # Adding index of type RangeIndex back to + # result + if keep_index is False: + result.index = RangeIndex(start, stop) return result def _normalize_scalars(self, other): @@ -366,7 +382,7 @@ def _scatter_to_tables(self, scatter_map, keep_index=True): self, scatter_map, keep_index ) result = [self._from_table(tbl) for tbl in result] - [frame._copy_categories(self) for frame in result] + [frame._copy_categories(self, keep_index) for frame in result] return result From 39e1103b819bc644edf9bf77c2f65b371e43e905 Mon Sep 17 00:00:00 2001 From: "Ram (Ramakrishna Prabhu)" Date: Fri, 13 Mar 2020 11:03:59 -0500 Subject: [PATCH 27/79] CHANGELOG.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 54329da76a7..33918e3e8c2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -165,6 +165,7 @@ - PR #4316 Add Java and JNI bindings for substring expression - PR #4314 Add Java and JNI bindings for string contains - PR #4461 Port nvstrings Miscellaneous functions to cuDF Python/Cython +- PR #4499 Adding changes to handle include `keep_index` and `RangeIndex` ## Bug Fixes From 5b1d24ce4cbd51d50d7cae7868bf8024e298ee27 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Fri, 13 Mar 2020 09:17:04 -0700 Subject: [PATCH 28/79] clean up .str.join stale code --- python/cudf/cudf/core/column/string.py | 38 -------------------------- 1 file changed, 38 deletions(-) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index ebe338a8014..9296ebc9e70 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -295,44 +295,6 @@ def cat(self, others=None, sep=None, na_rep=None, **kwargs): out = out[0] return out - # TODO, PREM: Uncomment in future PR - # def join(self, sep, na_rep="", **kwargs): - # """ - # Join lists contained as elements in the Series/Index with passed - # delimiter. - - # Parameters - # ---------- - # sep : str - # Delimiter to use between list entries. - - # na_rep : str - # This character will take the place of any null strings - # (not empty strings) in either list. - - # Returns - # ------- - # Series/Index of str dtype - # The list entries concatenated by intervening - # occurrences of the delimiter. - - # """ - # from cudf._libxx.scalar import Scalar - # from cudf.core.series import Series - # # import pdb; pdb.set_trace() - - # data = cpp_join(self._column, Scalar(sep), Scalar(na_rep)) - # if len(data) != len(self._parent): - # data = column.as_column( - # utils.scalar_broadcast_to(data[0], - # len(self._parent), dtype='str') - # ) - # return Series( - # data=data, - # index=self._parent.index, - # dtype='str' - # ) - def join(self, sep): """ Join lists contained as elements in the Series/Index with passed From 56b0859311036af30193db45ec0748ac54648336 Mon Sep 17 00:00:00 2001 From: "Ram (Ramakrishna Prabhu)" Date: Fri, 13 Mar 2020 11:56:50 -0500 Subject: [PATCH 29/79] Review changes --- python/cudf/cudf/core/dataframe.py | 2 +- python/cudf/cudf/core/frame.py | 20 +++++++++++++------- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 2e18dcedc5f..ea57b9b92b7 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1490,7 +1490,6 @@ def reset_index(self, drop=False, inplace=False): result = self else: result = self.copy() - index_columns = self.index._data.columns if all(name is None for name in self.index.names): if isinstance(self.index, cudf.MultiIndex): names = tuple( @@ -1502,6 +1501,7 @@ def reset_index(self, drop=False, inplace=False): names = self.index.names if not drop: + index_columns = self.index._data.columns for name, index_column in zip( reversed(names), reversed(index_columns) ): diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 72b5a4eacca..7f86c55cf6c 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -84,7 +84,7 @@ def _hash_partition( self, columns_to_hash, num_partitions, keep_index ) output = self.__class__._from_table(output) - output._copy_categories(self) + output._copy_categories(self, include_index=keep_index) return output, offsets def _as_column(self): @@ -111,7 +111,7 @@ def _empty_like(self, keep_index=True): libcudfxx.copying.table_empty_like(self, keep_index) ) - result._copy_categories(self) + result._copy_categories(self, include_index=keep_index) return result def _slice(self, arg): @@ -124,6 +124,7 @@ def _slice(self, arg): """ from cudf.core.index import RangeIndex + from cudf import DataFrame, Series num_rows = len(self) if num_rows == 0: @@ -138,7 +139,9 @@ def _slice(self, arg): # This is just to handle RangeIndex type, stop # it from materializing unnecessarily keep_index = True - if isinstance(self.index, RangeIndex): + if isinstance(self, (DataFrame, Series)) and isinstance( + self.index, RangeIndex + ): keep_index = False if start < 0: @@ -158,11 +161,11 @@ def _slice(self, arg): ] ) - result._copy_categories(self, keep_index) + result._copy_categories(self, include_index=keep_index) # Adding index of type RangeIndex back to # result - if keep_index is False: - result.index = RangeIndex(start, stop) + if keep_index is False and isinstance(self, (DataFrame, Series)): + result.index = self.index[start:stop] return result def _normalize_scalars(self, other): @@ -382,7 +385,10 @@ def _scatter_to_tables(self, scatter_map, keep_index=True): self, scatter_map, keep_index ) result = [self._from_table(tbl) for tbl in result] - [frame._copy_categories(self, keep_index) for frame in result] + [ + frame._copy_categories(self, include_index=keep_index) + for frame in result + ] return result From 2b3e24da0dfa11202cc3bc393991fb96796a60c8 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Fri, 13 Mar 2020 10:14:22 -0700 Subject: [PATCH 30/79] special case handling for .str.startswith and .str.endswith --- python/cudf/cudf/core/column/string.py | 30 ++++++++++++++++++++------ python/cudf/cudf/tests/test_string.py | 13 +---------- 2 files changed, 25 insertions(+), 18 deletions(-) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 9296ebc9e70..d94df8265b5 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -1435,9 +1435,18 @@ def endswith(self, pat, **kwargs): from cudf._libxx.scalar import Scalar - return self._return_or_inplace( - cpp_endswith(self._column, Scalar(pat, "str")), **kwargs - ) + # TODO: Cleanup if/else blocks after this issue is fixed: + # https://github.com/rapidsai/cudf/issues/4500 + if pat == "": + result_col = column.as_column( + True, dtype="bool", length=len(self._column) + ).set_mask(self._column.mask) + elif pat is None: + result_col = column.as_column(np.nan, length=len(self._column)) + else: + result_col = cpp_endswith(self._column, Scalar(pat, "str")) + + return self._return_or_inplace(result_col, **kwargs) def startswith(self, pat, **kwargs): """ @@ -1463,9 +1472,18 @@ def startswith(self, pat, **kwargs): from cudf._libxx.scalar import Scalar - return self._return_or_inplace( - cpp_startswith(self._column, Scalar(pat, "str")), **kwargs - ) + # TODO: Cleanup if/else blocks after this issue is fixed: + # https://github.com/rapidsai/cudf/issues/4500 + if pat == "": + result_col = column.as_column( + True, dtype="bool", length=len(self._column) + ).set_mask(self._column.mask) + elif pat is None: + result_col = column.as_column(np.nan, length=len(self._column)) + else: + result_col = cpp_startswith(self._column, Scalar(pat, "str")) + + return self._return_or_inplace(result_col, **kwargs) def find(self, sub, start=0, end=None, **kwargs): """ diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index abb0dd45cdc..2db7587f8ad 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -1539,18 +1539,7 @@ def test_string_table_view_creation(): ], ) @pytest.mark.parametrize( - "pat", - [ - # TODO, PREM: Uncomment after this issue is fixed - # '', - # None, - " ", - "a", - "abc", - "cat", - "$", - "\n", - ], + "pat", ["", None, " ", "a", "abc", "cat", "$", "\n"], ) def test_string_starts_ends(data, pat): ps = pd.Series(data) From a8ef01d6ba21b8e562f7eb2c282179960df80cae Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Fri, 13 Mar 2020 13:11:29 -0500 Subject: [PATCH 31/79] nvtext: fix various syntax and import errors. --- CHANGELOG.md | 2 + .../_libxx/cpp/nvtext/generate_ngrams.pxd | 2 +- .../_libxx/cpp/nvtext/ngrams_tokenize.pxd | 2 +- .../cudf/cudf/_libxx/cpp/nvtext/normalize.pxd | 2 +- .../cudf/cudf/_libxx/cpp/nvtext/tokenize.pxd | 2 +- python/cudf/cudf/_libxx/nvtext/__init__.pxd | 0 .../cudf/_libxx/nvtext/generate_ngrams.pyx | 52 +++--- .../cudf/_libxx/nvtext/ngrams_tokenize.pyx | 66 +++---- python/cudf/cudf/_libxx/nvtext/normalize.pyx | 6 +- python/cudf/cudf/_libxx/nvtext/tokenize.pyx | 166 +++++++++--------- 10 files changed, 147 insertions(+), 153 deletions(-) create mode 100644 python/cudf/cudf/_libxx/nvtext/__init__.pxd diff --git a/CHANGELOG.md b/CHANGELOG.md index 54329da76a7..0de164a1fa1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -145,6 +145,7 @@ - PR #4244 Port nvstrings Substring Gather/Scatter functions to cuDF Python/Cython - PR #4280 Port nvstrings Numeric Handling functions to cuDF Python/Cython - PR #4278 Port filling.pyx to libcudf++ API +- PR #4278 Port filling.pyx to libcudf++ API - PR #4328 Add memory threshold callbacks for Java RMM event handler - PR #4336 Move a bunch of internal nvstrings code to use native StringColumns - PR #4166 Port `is_sorted.pyx` to use libcudf++ APIs @@ -165,6 +166,7 @@ - PR #4316 Add Java and JNI bindings for substring expression - PR #4314 Add Java and JNI bindings for string contains - PR #4461 Port nvstrings Miscellaneous functions to cuDF Python/Cython +- PR #4495 Port nvtext to cuDF Python/Cython ## Bug Fixes diff --git a/python/cudf/cudf/_libxx/cpp/nvtext/generate_ngrams.pxd b/python/cudf/cudf/_libxx/cpp/nvtext/generate_ngrams.pxd index f20209956b1..d75acb92c71 100644 --- a/python/cudf/cudf/_libxx/cpp/nvtext/generate_ngrams.pxd +++ b/python/cudf/cudf/_libxx/cpp/nvtext/generate_ngrams.pxd @@ -7,7 +7,7 @@ from cudf._libxx.cpp.column.column_view cimport column_view from cudf._libxx.cpp.scalar.scalar cimport scalar from cudf._libxx.cpp.types cimport size_type -cdef extern from "cudf/nvtext/generate_ngrams.hpp" namespace "nvtext" nogil: +cdef extern from "nvtext/generate_ngrams.hpp" namespace "nvtext" nogil: cdef unique_ptr[column] generate_ngrams( const column_view &strings, diff --git a/python/cudf/cudf/_libxx/cpp/nvtext/ngrams_tokenize.pxd b/python/cudf/cudf/_libxx/cpp/nvtext/ngrams_tokenize.pxd index 0fc892d3b9d..b34c1a2953d 100644 --- a/python/cudf/cudf/_libxx/cpp/nvtext/ngrams_tokenize.pxd +++ b/python/cudf/cudf/_libxx/cpp/nvtext/ngrams_tokenize.pxd @@ -7,7 +7,7 @@ from cudf._libxx.cpp.column.column_view cimport column_view from cudf._libxx.cpp.scalar.scalar cimport scalar from cudf._libxx.cpp.types cimport size_type -cdef extern from "cudf/nvtext/ngrams_tokenize.hpp" namespace "nvtext" nogil: +cdef extern from "nvtext/ngrams_tokenize.hpp" namespace "nvtext" nogil: cdef unique_ptr[column] ngrams_tokenize( const column_view & strings, diff --git a/python/cudf/cudf/_libxx/cpp/nvtext/normalize.pxd b/python/cudf/cudf/_libxx/cpp/nvtext/normalize.pxd index dc4b060d7f6..900b9e0b0b9 100644 --- a/python/cudf/cudf/_libxx/cpp/nvtext/normalize.pxd +++ b/python/cudf/cudf/_libxx/cpp/nvtext/normalize.pxd @@ -5,7 +5,7 @@ from libcpp.memory cimport unique_ptr from cudf._libxx.cpp.column.column cimport column from cudf._libxx.cpp.column.column_view cimport column_view -cdef extern from "cudf/nvtext/normalize.hpp" namespace "nvtext" nogil: +cdef extern from "nvtext/normalize.hpp" namespace "nvtext" nogil: cdef unique_ptr[column] normalize_spaces( const column_view & strings diff --git a/python/cudf/cudf/_libxx/cpp/nvtext/tokenize.pxd b/python/cudf/cudf/_libxx/cpp/nvtext/tokenize.pxd index 0c653c7afbe..7aa0fb2f12d 100644 --- a/python/cudf/cudf/_libxx/cpp/nvtext/tokenize.pxd +++ b/python/cudf/cudf/_libxx/cpp/nvtext/tokenize.pxd @@ -6,7 +6,7 @@ from cudf._libxx.cpp.column.column cimport column from cudf._libxx.cpp.column.column_view cimport column_view from cudf._libxx.cpp.scalar.scalar cimport scalar -cdef extern from "cudf/nvtext/tokenize.hpp" namespace "nvtext" nogil: +cdef extern from "nvtext/tokenize.hpp" namespace "nvtext" nogil: cdef unique_ptr[column] tokenize( const column_view & strings, diff --git a/python/cudf/cudf/_libxx/nvtext/__init__.pxd b/python/cudf/cudf/_libxx/nvtext/__init__.pxd new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/cudf/cudf/_libxx/nvtext/generate_ngrams.pyx b/python/cudf/cudf/_libxx/nvtext/generate_ngrams.pyx index 51796e5411a..f83262ae521 100644 --- a/python/cudf/cudf/_libxx/nvtext/generate_ngrams.pyx +++ b/python/cudf/cudf/_libxx/nvtext/generate_ngrams.pyx @@ -1,32 +1,32 @@ -# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# # Copyright (c) 2018-2020, NVIDIA CORPORATION. -from libcpp.memory cimport unique_ptr -from cudf._libxx.move cimport move +# from libcpp.memory cimport unique_ptr +# from cudf._libxx.move cimport move -from cudf._libxx.cpp.column.column cimport column -from cudf._libxx.cpp.scalar.scalar cimport scalar -from cudf._libxx.cpp.types cimport size_type -from cudf._libxx.cpp.column.column_view cimport column_view -from cudf._libxx.cpp.nvtext.generate_ngrams cimport ( - generate_ngrams as cpp_generate_ngrams -) -from cudf._libxx.column cimport Column -from cudf._libxx.scalar cimport Scalar +# from cudf._libxx.cpp.column.column cimport column +# from cudf._libxx.cpp.scalar.scalar cimport scalar +# from cudf._libxx.cpp.types cimport size_type +# from cudf._libxx.cpp.column.column_view cimport column_view +# from cudf._libxx.cpp.nvtext.generate_ngrams cimport ( +# generate_ngrams as cpp_generate_ngrams +# ) +# from cudf._libxx.column cimport Column +# from cudf._libxx.scalar cimport Scalar -def generate_ngrams(Column strings, int ngrams, Scalar separator): - cdef column_view source_view = strings.view() - cdef size_type c_ngrams = ngrams - cdef scalar* c_separator = separator.c_value.get() - cdef unique_ptr[column] c_result +# def generate_ngrams(Column strings, int ngrams, Scalar separator): +# cdef column_view c_strings = strings.view() +# cdef size_type c_ngrams = ngrams +# cdef scalar* c_separator = separator.c_value.get() +# cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_generate_ngrams( - c_strings, - c_ngrams - c_separator[0] - ) - ) +# with nogil: +# c_result = move( +# cpp_generate_ngrams( +# c_strings, +# c_ngrams, +# c_separator[0] +# ) +# ) - return Column.from_unique_ptr(move(c_result)) +# return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_libxx/nvtext/ngrams_tokenize.pyx b/python/cudf/cudf/_libxx/nvtext/ngrams_tokenize.pyx index 1f7afa2bf09..86d47382dea 100644 --- a/python/cudf/cudf/_libxx/nvtext/ngrams_tokenize.pyx +++ b/python/cudf/cudf/_libxx/nvtext/ngrams_tokenize.pyx @@ -1,39 +1,39 @@ -# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# # Copyright (c) 2018-2020, NVIDIA CORPORATION. -from libcpp.memory cimport unique_ptr -from cudf._libxx.move cimport move +# from libcpp.memory cimport unique_ptr +# from cudf._libxx.move cimport move -from cudf._libxx.cpp.column.column cimport column -from cudf._libxx.cpp.scalar.scalar cimport scalar -from cudf._libxx.cpp.types cimport size_type -from cudf._libxx.cpp.column.column_view cimport column_view -from cudf._libxx.cpp.nvtext.ngrams_tokenize cimport ( - ngrams_tokenize as cpp_ngrams_tokenize -) -from cudf._libxx.column cimport Column -from cudf._libxx.scalar cimport Scalar +# from cudf._libxx.cpp.column.column cimport column +# from cudf._libxx.cpp.scalar.scalar cimport scalar +# from cudf._libxx.cpp.types cimport size_type +# from cudf._libxx.cpp.column.column_view cimport column_view +# from cudf._libxx.cpp.nvtext.ngrams_tokenize cimport ( +# ngrams_tokenize as cpp_ngrams_tokenize +# ) +# from cudf._libxx.column cimport Column +# from cudf._libxx.scalar cimport Scalar -def ngrams_tokenize( - Column strings, - int ngrams, - Scalar delimiter, - Scalar separator -): - cdef column_view source_view = strings.view() - cdef size_type c_ngrams = ngrams - cdef scalar* c_separator = separator.c_value.get() - cdef scalar* c_delimiter = delimiter.c_value.get() - cdef unique_ptr[column] c_result +# def ngrams_tokenize( +# Column strings, +# int ngrams, +# Scalar delimiter, +# Scalar separator +# ): +# cdef column_view c_strings = strings.view() +# cdef size_type c_ngrams = ngrams +# cdef scalar* c_separator = separator.c_value.get() +# cdef scalar* c_delimiter = delimiter.c_value.get() +# cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_ngrams_tokenize( - c_strings, - c_ngrams - c_delimiter[0] - c_separator[0] - ) - ) +# with nogil: +# c_result = move( +# cpp_ngrams_tokenize( +# c_strings, +# c_ngrams, +# c_delimiter[0], +# c_separator[0] +# ) +# ) - return Column.from_unique_ptr(move(c_result)) +# return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_libxx/nvtext/normalize.pyx b/python/cudf/cudf/_libxx/nvtext/normalize.pyx index 7ed15b9dfc6..7b4432c54a4 100644 --- a/python/cudf/cudf/_libxx/nvtext/normalize.pyx +++ b/python/cudf/cudf/_libxx/nvtext/normalize.pyx @@ -6,17 +6,17 @@ from cudf._libxx.move cimport move from cudf._libxx.cpp.column.column cimport column from cudf._libxx.cpp.column.column_view cimport column_view from cudf._libxx.cpp.nvtext.normalize cimport ( - normalize as cpp_normalize + normalize_spaces as cpp_normalize_spaces ) from cudf._libxx.column cimport Column from cudf._libxx.scalar cimport Scalar def normalize_spaces(Column strings, int ngrams): - cdef column_view source_view = strings.view() + cdef column_view c_strings = strings.view() cdef unique_ptr[column] c_result with nogil: - c_result = move(cpp_normalize(c_strings)) + c_result = move(cpp_normalize_spaces(c_strings)) return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_libxx/nvtext/tokenize.pyx b/python/cudf/cudf/_libxx/nvtext/tokenize.pyx index 5383b8136b2..e3af10e2cac 100644 --- a/python/cudf/cudf/_libxx/nvtext/tokenize.pyx +++ b/python/cudf/cudf/_libxx/nvtext/tokenize.pyx @@ -1,87 +1,79 @@ -# Copyright (c) 2018-2020, NVIDIA CORPORATION. - -from libcpp.memory cimport unique_ptr -from cudf._libxx.move cimport move - -from cudf._libxx.cpp.column.column cimport column -from cudf._libxx.cpp.scalar.scalar cimport scalar -from cudf._libxx.cpp.types cimport size_type -from cudf._libxx.cpp.column.column_view cimport column_view -from cudf._libxx.cpp.nvtext.tokenize cimport ( - tokenize as cpp_tokenize, - count_tokens as cpp_count_tokens -) -from cudf._libxx.column cimport Column -from cudf._libxx.scalar cimport Scalar - - -def tokenize(Column strings, Scalar delimiter): - cdef column_view c_strings = strings.view() - cdef scalar* c_delimiter = delimiter.c_value.get() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_tokenize( - c_strings, - c_ngrams - c_delimiter[0] - c_separator[0] - ) - ) - - return Column.from_unique_ptr(move(c_result)) - - -def tokenize(Column strings, Column delimiters): - cdef column_view c_strings = strings.view() - cdef column_view c_delimiter = delimiter.view() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_tokenize( - c_strings, - c_ngrams - c_delimiter[0] - c_separator[0] - ) - ) - - return Column.from_unique_ptr(move(c_result)) - - -def count_tokens(Column strings, Scalar delimiter): - cdef column_view c_strings = strings.view() - cdef scalar* c_delimiter = delimiter.c_value.get() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_count_tokens( - c_strings, - c_ngrams - c_delimiter[0] - c_separator[0] - ) - ) - - return Column.from_unique_ptr(move(c_result)) - - -def count_tokens(Column strings, Column delimiters): - cdef column_view c_strings = strings.view() - cdef column_view c_delimiter = delimiter.view() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_count_tokens( - c_strings, - c_ngrams - c_delimiter[0] - c_separator[0] - ) - ) - - return Column.from_unique_ptr(move(c_result)) +# # Copyright (c) 2018-2020, NVIDIA CORPORATION. + +# from libcpp.memory cimport unique_ptr +# from cudf._libxx.move cimport move + +# from cudf._libxx.cpp.column.column cimport column +# from cudf._libxx.cpp.scalar.scalar cimport scalar +# from cudf._libxx.cpp.types cimport size_type +# from cudf._libxx.cpp.column.column_view cimport column_view +# from cudf._libxx.cpp.nvtext.tokenize cimport ( +# tokenize as cpp_tokenize, +# count_tokens as cpp_count_tokens +# ) +# from cudf._libxx.column cimport Column +# from cudf._libxx.scalar cimport Scalar + + +# def tokenize(Column strings, Scalar delimiter): +# cdef column_view c_strings = strings.view() +# cdef scalar* c_delimiter = delimiter.c_value.get() +# cdef unique_ptr[column] c_result + +# with nogil: +# c_result = move( +# cpp_tokenize( +# c_strings, +# c_delimiter[0], +# ) +# ) + +# return Column.from_unique_ptr(move(c_result)) + + +# def tokenize(Column strings, Column delimiters): +# cdef column_view c_strings = strings.view() +# cdef column_view c_delimiters = delimiters.view() +# cdef unique_ptr[column] c_result + +# with nogil: +# c_result = move( +# cpp_tokenize( +# c_strings, +# c_delimiters +# ) +# ) + +# return Column.from_unique_ptr(move(c_result)) + + +# def count_tokens(Column strings, Scalar delimiter): +# cdef column_view c_strings = strings.view() +# cdef scalar* c_delimiter = delimiter.c_value.get() +# cdef unique_ptr[column] c_result + +# with nogil: +# c_result = move( +# cpp_count_tokens( +# c_strings, +# c_delimiter[0] +# ) +# ) + +# return Column.from_unique_ptr(move(c_result)) + + +# def count_tokens(Column strings, Column delimiters): +# cdef column_view c_strings = strings.view() +# cdef column_view c_delimiters = delimiters.view() +# cdef unique_ptr[column] c_result + +# with nogil: +# c_result = move( +# cpp_count_tokens( +# c_strings, +# c_delimiters +# ) +# ) + +# return Column.from_unique_ptr(move(c_result)) From fd489e927624d1fe040a7caead05eecaecb11fd2 Mon Sep 17 00:00:00 2001 From: "Ram (Ramakrishna Prabhu)" Date: Fri, 13 Mar 2020 13:36:04 -0500 Subject: [PATCH 32/79] Review changes, rather than series which might create one more index, use column and update table.pyx --- python/cudf/cudf/_libxx/table.pyx | 4 +--- python/cudf/cudf/core/dataframe.py | 20 ++++++++++---------- python/cudf/cudf/core/frame.py | 7 ++----- 3 files changed, 13 insertions(+), 18 deletions(-) diff --git a/python/cudf/cudf/_libxx/table.pyx b/python/cudf/cudf/_libxx/table.pyx index c83123ee38b..9bfd413a028 100644 --- a/python/cudf/cudf/_libxx/table.pyx +++ b/python/cudf/cudf/_libxx/table.pyx @@ -51,9 +51,7 @@ cdef class Table: @property def _num_rows(self): if self._index is not None: - if len(self._index._data) == 0: - return 0 - return self._index._num_rows + return len(self._index) return len(self._data.columns[0]) @property diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index ea57b9b92b7..dd956f0dffa 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -3925,33 +3925,33 @@ def scatter_by_map(self, map_index, map_size=None, keep_index=True): """ # map_index might be a column name or array, - # make it a Series + # make it a Column if isinstance(map_index, str): - map_index = self[map_index] + map_index = self[map_index]._column + elif isinstance(map_index, Series): + map_index = map_index._column else: - map_index = Series(map_index) + map_index = as_column(map_index) # Convert float to integer if map_index.dtype == np.float: map_index = map_index.astype(np.int32) # Convert string or categorical to integer - if isinstance(map_index._column, StringColumn): - map_index = Series( - map_index._column.as_categorical_column(np.int32).as_numerical - ) + if isinstance(map_index, StringColumn): + map_index = map_index.as_categorical_column(np.int32).as_numerical warnings.warn( "Using StringColumn for map_index in scatter_by_map. " "Use an integer array/column for better performance." ) - elif isinstance(map_index._column, CategoricalColumn): - map_index = Series(map_index._column.as_numerical) + elif isinstance(map_index, CategoricalColumn): + map_index = map_index.as_numerical warnings.warn( "Using CategoricalColumn for map_index in scatter_by_map. " "Use an integer array/column for better performance." ) - tables = self._scatter_to_tables(map_index._column, keep_index) + tables = self._scatter_to_tables(map_index, keep_index) if map_size: # Make sure map_size is >= the number of uniques in map_index diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 7f86c55cf6c..73f416cdb9c 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -124,7 +124,6 @@ def _slice(self, arg): """ from cudf.core.index import RangeIndex - from cudf import DataFrame, Series num_rows = len(self) if num_rows == 0: @@ -139,9 +138,7 @@ def _slice(self, arg): # This is just to handle RangeIndex type, stop # it from materializing unnecessarily keep_index = True - if isinstance(self, (DataFrame, Series)) and isinstance( - self.index, RangeIndex - ): + if self.index is not None and isinstance(self.index, RangeIndex): keep_index = False if start < 0: @@ -164,7 +161,7 @@ def _slice(self, arg): result._copy_categories(self, include_index=keep_index) # Adding index of type RangeIndex back to # result - if keep_index is False and isinstance(self, (DataFrame, Series)): + if keep_index is False and self.index is not None: result.index = self.index[start:stop] return result From f8e61dfc062ff9bf91f3cffcc998b46f8f9297f0 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Fri, 13 Mar 2020 12:15:24 -0700 Subject: [PATCH 33/79] re-work .str.find, .str.rfind, .str.index, .str.rindex logic to handle special cases --- python/cudf/cudf/core/column/string.py | 89 ++++++++++++++++++++++---- python/cudf/cudf/tests/test_string.py | 48 ++++---------- 2 files changed, 88 insertions(+), 49 deletions(-) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index d94df8265b5..5b1d8acc182 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -1507,14 +1507,33 @@ def find(self, sub, start=0, end=None, **kwargs): Series or Index of int """ + if not isinstance(sub, str): + msg = "expected a string object, not {0}" + raise TypeError(msg.format(type(sub).__name__)) + from cudf._libxx.scalar import Scalar if end is None: end = -1 + mask = self._column.mask - return self._return_or_inplace( - cpp_find(self._column, Scalar(sub, "str"), start, end), **kwargs - ) + if sub == "": + result_col = column.as_column( + start, dtype="float", length=len(self._column) + ) + else: + result_col = cpp_find(self._column, Scalar(sub, "str"), start, end) + + result_col = result_col.set_mask(mask) + if self._column.has_nulls: + result_col = result_col.astype("float64") + else: + result_col = result_col.astype("int64") + + result = self._return_or_inplace(result_col, **kwargs) + if sub == "": + result[self._parent.str.len() < start] = -1 + return result def rfind(self, sub, start=0, end=None, **kwargs): """ @@ -1538,13 +1557,35 @@ def rfind(self, sub, start=0, end=None, **kwargs): Series or Index of int """ + if not isinstance(sub, str): + msg = "expected a string object, not {0}" + raise TypeError(msg.format(type(sub).__name__)) + from cudf._libxx.scalar import Scalar if end is None: end = -1 - return self._return_or_inplace( - cpp_rfind(self._column, Scalar(sub, "str"), start, end), **kwargs - ) + mask = self._column.mask + + if sub == "": + result_col = cpp_count_characters(self._column) + else: + result_col = cpp_rfind( + self._column, Scalar(sub, "str"), start, end + ) + + result_col = result_col.set_mask(mask) + if self._column.has_nulls: + result_col = result_col.astype("float64") + else: + result_col = result_col.astype("int64") + + result = self._return_or_inplace(result_col, **kwargs) + if sub == "": + result[result < start] = -1 + if end != -1: + result[result > end] = end + return result def index(self, sub, start=0, end=None, **kwargs): """ @@ -1569,14 +1610,25 @@ def index(self, sub, start=0, end=None, **kwargs): Series or Index of object """ + if not isinstance(sub, str): + msg = "expected a string object, not {0}" + raise TypeError(msg.format(type(sub).__name__)) + from cudf._libxx.scalar import Scalar if end is None: end = -1 - result = self._return_or_inplace( - cpp_find(self._column, Scalar(sub, "str"), start, end), **kwargs - ) + if sub == "": + result_col = column.as_column( + 0.0, dtype="float", length=len(self._column) + ).set_mask(self._column.mask) + else: + result_col = cpp_find(self._column, Scalar(sub, "str"), start, end) + + result = self._return_or_inplace(result_col, **kwargs) + if sub == "": + result[self._parent.str.len() < start] = -1 if (result == -1).any(): raise ValueError("substring not found") @@ -1606,14 +1658,27 @@ def rindex(self, sub, start=0, end=None, **kwargs): Series or Index of object """ + if not isinstance(sub, str): + msg = "expected a string object, not {0}" + raise TypeError(msg.format(type(sub).__name__)) + from cudf._libxx.scalar import Scalar if end is None: end = -1 - result = self._return_or_inplace( - cpp_rfind(self._column, Scalar(sub, "str"), start, end), **kwargs - ) + if sub == "": + result_col = cpp_count_characters(self._column) + else: + result_col = cpp_rfind( + self._column, Scalar(sub, "str"), start, end + ) + + result = self._return_or_inplace(result_col, **kwargs) + if sub == "": + result[result < start] = -1 + if end != -1: + result[result > end] = end if (result == -1).any(): raise ValueError("substring not found") diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index 2db7587f8ad..6e48a930d20 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -1563,56 +1563,28 @@ def test_string_starts_ends(data, pat): ], ) @pytest.mark.parametrize( - "sub", - [ - # TODO, PREM: Uncomment after this issue is fixed - # '', - " ", - "a", - "abc", - "cat", - "$", - "\n", - ], + "sub", ["", " ", "a", "abc", "cat", "$", "\n"], ) def test_string_find(data, sub): ps = pd.Series(data) gs = Series(data) - assert_eq(ps.str.find(sub).fillna(-1), gs.str.find(sub), check_dtype=False) - assert_eq( - ps.str.find(sub, start=1).fillna(-1), - gs.str.find(sub, start=1), - check_dtype=False, - ) + assert_eq(ps.str.find(sub), gs.str.find(sub)) assert_eq( - ps.str.find(sub, end=10).fillna(-1), - gs.str.find(sub, end=10), - check_dtype=False, + ps.str.find(sub, start=1), gs.str.find(sub, start=1), ) + assert_eq(ps.str.find(sub, end=10), gs.str.find(sub, end=10)) assert_eq( - ps.str.find(sub, start=2, end=10).fillna(-1), - gs.str.find(sub, start=2, end=10), - check_dtype=False, + ps.str.find(sub, start=2, end=10), gs.str.find(sub, start=2, end=10), ) + assert_eq(ps.str.rfind(sub), gs.str.rfind(sub)) + assert_eq(ps.str.rfind(sub, start=1), gs.str.rfind(sub, start=1)) assert_eq( - ps.str.rfind(sub).fillna(-1), gs.str.rfind(sub), check_dtype=False + ps.str.rfind(sub, end=10), gs.str.rfind(sub, end=10), ) assert_eq( - ps.str.rfind(sub, start=1).fillna(-1), - gs.str.rfind(sub, start=1), - check_dtype=False, - ) - assert_eq( - ps.str.rfind(sub, end=10).fillna(-1), - gs.str.rfind(sub, end=10), - check_dtype=False, - ) - assert_eq( - ps.str.rfind(sub, start=2, end=10).fillna(-1), - gs.str.rfind(sub, start=2, end=10), - check_dtype=False, + ps.str.rfind(sub, start=2, end=10), gs.str.rfind(sub, start=2, end=10), ) @@ -1630,6 +1602,7 @@ def test_string_find(data, sub): "+", ValueError, ), + (["line to be wrapped", "another line to be wrapped"], "", None), ], ) def test_string_str_index(data, sub, er): @@ -1668,6 +1641,7 @@ def test_string_str_index(data, sub, er): "+", ValueError, ), + (["line to be wrapped", "another line to be wrapped"], "", None), ], ) def test_string_str_rindex(data, sub, er): From 31b619b067bfaed75e033ebf602b7d0f3abc60d6 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 13 Mar 2020 14:19:03 -0500 Subject: [PATCH 34/79] Update CHANGELOG.md --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f8498170886..4b074c2b518 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -256,7 +256,7 @@ - PR #4480 Fix string_scalar.value to return an empty string_view for empty string-scalar - PR #4474 Fix to not materialize RangeIndex in copy_categories - PR #4494 Update Java memory event handler for new RMM resource API -- PR #4482 Fix `.str.rsplit` & `.str.split` and enable related tests +- PR #4482 Fix `.str.rsplit`, `.str.split`, `.str.find`, `.str.rfind`, `.str.index`, `.str.rindex` and enable related tests # cuDF 0.12.0 (04 Feb 2020) From 799fd02a72ce90f979d71be23052ffa19fa68d98 Mon Sep 17 00:00:00 2001 From: "Ram (Ramakrishna Prabhu)" <42624703+rgsl888prabhu@users.noreply.github.com> Date: Fri, 13 Mar 2020 14:38:19 -0500 Subject: [PATCH 35/79] Update python/cudf/cudf/_libxx/copying.pyx Co-Authored-By: GALI PREM SAGAR --- python/cudf/cudf/_libxx/copying.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/_libxx/copying.pyx b/python/cudf/cudf/_libxx/copying.pyx index 03d3a017137..63d8d4cbc9a 100644 --- a/python/cudf/cudf/_libxx/copying.pyx +++ b/python/cudf/cudf/_libxx/copying.pyx @@ -435,7 +435,7 @@ def column_split(Column input_column, object splits): return result -def table_split(Table input_table, object splits, keep_index=True): +def table_split(Table input_table, object splits, bool keep_index=True): cdef table_view input_table_view if keep_index is True: From 836e34fb3043540572db70dfe847373e82fbe11d Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Fri, 13 Mar 2020 13:24:19 -0500 Subject: [PATCH 36/79] nvtext cython: uncomment code and fix cython declaration errors --- .../_libxx/cpp/nvtext/generate_ngrams.pxd | 4 +- .../_libxx/cpp/nvtext/ngrams_tokenize.pxd | 6 +- .../cudf/cudf/_libxx/cpp/nvtext/tokenize.pxd | 10 +- python/cudf/cudf/_libxx/nvtext/__init__.py | 0 .../cudf/_libxx/nvtext/generate_ngrams.pyx | 52 ++--- .../cudf/_libxx/nvtext/ngrams_tokenize.pyx | 66 +++--- python/cudf/cudf/_libxx/nvtext/normalize.pyx | 1 - python/cudf/cudf/_libxx/nvtext/tokenize.pyx | 189 ++++++++++-------- 8 files changed, 179 insertions(+), 149 deletions(-) delete mode 100644 python/cudf/cudf/_libxx/nvtext/__init__.py diff --git a/python/cudf/cudf/_libxx/cpp/nvtext/generate_ngrams.pxd b/python/cudf/cudf/_libxx/cpp/nvtext/generate_ngrams.pxd index d75acb92c71..5505fda1f7d 100644 --- a/python/cudf/cudf/_libxx/cpp/nvtext/generate_ngrams.pxd +++ b/python/cudf/cudf/_libxx/cpp/nvtext/generate_ngrams.pxd @@ -4,7 +4,7 @@ from libcpp.memory cimport unique_ptr from cudf._libxx.cpp.column.column cimport column from cudf._libxx.cpp.column.column_view cimport column_view -from cudf._libxx.cpp.scalar.scalar cimport scalar +from cudf._libxx.cpp.scalar.scalar cimport string_scalar from cudf._libxx.cpp.types cimport size_type cdef extern from "nvtext/generate_ngrams.hpp" namespace "nvtext" nogil: @@ -12,5 +12,5 @@ cdef extern from "nvtext/generate_ngrams.hpp" namespace "nvtext" nogil: cdef unique_ptr[column] generate_ngrams( const column_view &strings, size_type ngrams, - const scalar & separator + const string_scalar & separator ) except + diff --git a/python/cudf/cudf/_libxx/cpp/nvtext/ngrams_tokenize.pxd b/python/cudf/cudf/_libxx/cpp/nvtext/ngrams_tokenize.pxd index b34c1a2953d..3c65358d777 100644 --- a/python/cudf/cudf/_libxx/cpp/nvtext/ngrams_tokenize.pxd +++ b/python/cudf/cudf/_libxx/cpp/nvtext/ngrams_tokenize.pxd @@ -4,7 +4,7 @@ from libcpp.memory cimport unique_ptr from cudf._libxx.cpp.column.column cimport column from cudf._libxx.cpp.column.column_view cimport column_view -from cudf._libxx.cpp.scalar.scalar cimport scalar +from cudf._libxx.cpp.scalar.scalar cimport string_scalar from cudf._libxx.cpp.types cimport size_type cdef extern from "nvtext/ngrams_tokenize.hpp" namespace "nvtext" nogil: @@ -12,6 +12,6 @@ cdef extern from "nvtext/ngrams_tokenize.hpp" namespace "nvtext" nogil: cdef unique_ptr[column] ngrams_tokenize( const column_view & strings, size_type ngrams, - const scalar & delimiter, - const scalar & separator + const string_scalar & delimiter, + const string_scalar & separator ) except + diff --git a/python/cudf/cudf/_libxx/cpp/nvtext/tokenize.pxd b/python/cudf/cudf/_libxx/cpp/nvtext/tokenize.pxd index 7aa0fb2f12d..21ea6dc09ae 100644 --- a/python/cudf/cudf/_libxx/cpp/nvtext/tokenize.pxd +++ b/python/cudf/cudf/_libxx/cpp/nvtext/tokenize.pxd @@ -4,26 +4,26 @@ from libcpp.memory cimport unique_ptr from cudf._libxx.cpp.column.column cimport column from cudf._libxx.cpp.column.column_view cimport column_view -from cudf._libxx.cpp.scalar.scalar cimport scalar +from cudf._libxx.cpp.scalar.scalar cimport string_scalar cdef extern from "nvtext/tokenize.hpp" namespace "nvtext" nogil: cdef unique_ptr[column] tokenize( const column_view & strings, - const scalar & delimiter + const string_scalar & delimiter ) except + - cdef unique_ptr[column] tokenize( + cdef unique_ptr[column] tokenize_multi "nvtext::tokenize" ( const column_view & strings, const column_view & delimiters ) except + cdef unique_ptr[column] count_tokens( const column_view & strings, - const scalar & delimiter + const string_scalar & delimiter ) except + - cdef unique_ptr[column] count_tokens( + cdef unique_ptr[column] count_tokens_multi "nvtext::count_tokens" ( const column_view & strings, const column_view & delimiters ) except + diff --git a/python/cudf/cudf/_libxx/nvtext/__init__.py b/python/cudf/cudf/_libxx/nvtext/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/_libxx/nvtext/generate_ngrams.pyx b/python/cudf/cudf/_libxx/nvtext/generate_ngrams.pyx index f83262ae521..b563ce9f884 100644 --- a/python/cudf/cudf/_libxx/nvtext/generate_ngrams.pyx +++ b/python/cudf/cudf/_libxx/nvtext/generate_ngrams.pyx @@ -1,32 +1,32 @@ -# # Copyright (c) 2018-2020, NVIDIA CORPORATION. +# Copyright (c) 2018-2020, NVIDIA CORPORATION. -# from libcpp.memory cimport unique_ptr -# from cudf._libxx.move cimport move +from libcpp.memory cimport unique_ptr +from cudf._libxx.move cimport move -# from cudf._libxx.cpp.column.column cimport column -# from cudf._libxx.cpp.scalar.scalar cimport scalar -# from cudf._libxx.cpp.types cimport size_type -# from cudf._libxx.cpp.column.column_view cimport column_view -# from cudf._libxx.cpp.nvtext.generate_ngrams cimport ( -# generate_ngrams as cpp_generate_ngrams -# ) -# from cudf._libxx.column cimport Column -# from cudf._libxx.scalar cimport Scalar +from cudf._libxx.cpp.column.column cimport column +from cudf._libxx.cpp.scalar.scalar cimport string_scalar +from cudf._libxx.cpp.types cimport size_type +from cudf._libxx.cpp.column.column_view cimport column_view +from cudf._libxx.cpp.nvtext.generate_ngrams cimport ( + generate_ngrams as cpp_generate_ngrams +) +from cudf._libxx.column cimport Column +from cudf._libxx.scalar cimport Scalar -# def generate_ngrams(Column strings, int ngrams, Scalar separator): -# cdef column_view c_strings = strings.view() -# cdef size_type c_ngrams = ngrams -# cdef scalar* c_separator = separator.c_value.get() -# cdef unique_ptr[column] c_result +def generate_ngrams(Column strings, int ngrams, Scalar separator): + cdef column_view c_strings = strings.view() + cdef size_type c_ngrams = ngrams + cdef string_scalar* c_separator = separator.c_value.get() + cdef unique_ptr[column] c_result -# with nogil: -# c_result = move( -# cpp_generate_ngrams( -# c_strings, -# c_ngrams, -# c_separator[0] -# ) -# ) + with nogil: + c_result = move( + cpp_generate_ngrams( + c_strings, + c_ngrams, + c_separator[0] + ) + ) -# return Column.from_unique_ptr(move(c_result)) + return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_libxx/nvtext/ngrams_tokenize.pyx b/python/cudf/cudf/_libxx/nvtext/ngrams_tokenize.pyx index 86d47382dea..f89ba1c8669 100644 --- a/python/cudf/cudf/_libxx/nvtext/ngrams_tokenize.pyx +++ b/python/cudf/cudf/_libxx/nvtext/ngrams_tokenize.pyx @@ -1,39 +1,39 @@ -# # Copyright (c) 2018-2020, NVIDIA CORPORATION. +# Copyright (c) 2018-2020, NVIDIA CORPORATION. -# from libcpp.memory cimport unique_ptr -# from cudf._libxx.move cimport move +from libcpp.memory cimport unique_ptr +from cudf._libxx.move cimport move -# from cudf._libxx.cpp.column.column cimport column -# from cudf._libxx.cpp.scalar.scalar cimport scalar -# from cudf._libxx.cpp.types cimport size_type -# from cudf._libxx.cpp.column.column_view cimport column_view -# from cudf._libxx.cpp.nvtext.ngrams_tokenize cimport ( -# ngrams_tokenize as cpp_ngrams_tokenize -# ) -# from cudf._libxx.column cimport Column -# from cudf._libxx.scalar cimport Scalar +from cudf._libxx.cpp.column.column cimport column +from cudf._libxx.cpp.scalar.scalar cimport string_scalar +from cudf._libxx.cpp.types cimport size_type +from cudf._libxx.cpp.column.column_view cimport column_view +from cudf._libxx.cpp.nvtext.ngrams_tokenize cimport ( + ngrams_tokenize as cpp_ngrams_tokenize +) +from cudf._libxx.column cimport Column +from cudf._libxx.scalar cimport Scalar -# def ngrams_tokenize( -# Column strings, -# int ngrams, -# Scalar delimiter, -# Scalar separator -# ): -# cdef column_view c_strings = strings.view() -# cdef size_type c_ngrams = ngrams -# cdef scalar* c_separator = separator.c_value.get() -# cdef scalar* c_delimiter = delimiter.c_value.get() -# cdef unique_ptr[column] c_result +def ngrams_tokenize( + Column strings, + int ngrams, + Scalar delimiter, + Scalar separator +): + cdef column_view c_strings = strings.view() + cdef size_type c_ngrams = ngrams + cdef string_scalar* c_separator = separator.c_value.get() + cdef string_scalar* c_delimiter = delimiter.c_value.get() + cdef unique_ptr[column] c_result -# with nogil: -# c_result = move( -# cpp_ngrams_tokenize( -# c_strings, -# c_ngrams, -# c_delimiter[0], -# c_separator[0] -# ) -# ) + with nogil: + c_result = move( + cpp_ngrams_tokenize( + c_strings, + c_ngrams, + c_delimiter[0], + c_separator[0] + ) + ) -# return Column.from_unique_ptr(move(c_result)) + return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_libxx/nvtext/normalize.pyx b/python/cudf/cudf/_libxx/nvtext/normalize.pyx index 7b4432c54a4..b2654446ab9 100644 --- a/python/cudf/cudf/_libxx/nvtext/normalize.pyx +++ b/python/cudf/cudf/_libxx/nvtext/normalize.pyx @@ -9,7 +9,6 @@ from cudf._libxx.cpp.nvtext.normalize cimport ( normalize_spaces as cpp_normalize_spaces ) from cudf._libxx.column cimport Column -from cudf._libxx.scalar cimport Scalar def normalize_spaces(Column strings, int ngrams): diff --git a/python/cudf/cudf/_libxx/nvtext/tokenize.pyx b/python/cudf/cudf/_libxx/nvtext/tokenize.pyx index e3af10e2cac..a1a3d81398d 100644 --- a/python/cudf/cudf/_libxx/nvtext/tokenize.pyx +++ b/python/cudf/cudf/_libxx/nvtext/tokenize.pyx @@ -1,79 +1,110 @@ -# # Copyright (c) 2018-2020, NVIDIA CORPORATION. - -# from libcpp.memory cimport unique_ptr -# from cudf._libxx.move cimport move - -# from cudf._libxx.cpp.column.column cimport column -# from cudf._libxx.cpp.scalar.scalar cimport scalar -# from cudf._libxx.cpp.types cimport size_type -# from cudf._libxx.cpp.column.column_view cimport column_view -# from cudf._libxx.cpp.nvtext.tokenize cimport ( -# tokenize as cpp_tokenize, -# count_tokens as cpp_count_tokens -# ) -# from cudf._libxx.column cimport Column -# from cudf._libxx.scalar cimport Scalar - - -# def tokenize(Column strings, Scalar delimiter): -# cdef column_view c_strings = strings.view() -# cdef scalar* c_delimiter = delimiter.c_value.get() -# cdef unique_ptr[column] c_result - -# with nogil: -# c_result = move( -# cpp_tokenize( -# c_strings, -# c_delimiter[0], -# ) -# ) - -# return Column.from_unique_ptr(move(c_result)) - - -# def tokenize(Column strings, Column delimiters): -# cdef column_view c_strings = strings.view() -# cdef column_view c_delimiters = delimiters.view() -# cdef unique_ptr[column] c_result - -# with nogil: -# c_result = move( -# cpp_tokenize( -# c_strings, -# c_delimiters -# ) -# ) - -# return Column.from_unique_ptr(move(c_result)) - - -# def count_tokens(Column strings, Scalar delimiter): -# cdef column_view c_strings = strings.view() -# cdef scalar* c_delimiter = delimiter.c_value.get() -# cdef unique_ptr[column] c_result - -# with nogil: -# c_result = move( -# cpp_count_tokens( -# c_strings, -# c_delimiter[0] -# ) -# ) - -# return Column.from_unique_ptr(move(c_result)) - - -# def count_tokens(Column strings, Column delimiters): -# cdef column_view c_strings = strings.view() -# cdef column_view c_delimiters = delimiters.view() -# cdef unique_ptr[column] c_result - -# with nogil: -# c_result = move( -# cpp_count_tokens( -# c_strings, -# c_delimiters -# ) -# ) - -# return Column.from_unique_ptr(move(c_result)) +# Copyright (c) 2018-2020, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from cudf._libxx.move cimport move + +from cudf._libxx.cpp.column.column cimport column +from cudf._libxx.cpp.scalar.scalar cimport string_scalar +from cudf._libxx.cpp.types cimport size_type +from cudf._libxx.cpp.column.column_view cimport column_view +from cudf._libxx.cpp.nvtext.tokenize cimport ( + tokenize as cpp_tokenize, + tokenize_multi as cpp_tokenize_multi, + count_tokens as cpp_count_tokens, + count_tokens_multi as cpp_count_tokens_multi, +) +from cudf._libxx.column cimport Column +from cudf._libxx.scalar cimport Scalar + + +def tokenize(Column strings, object delimiter): + if isinstance(delimiter, Scalar): + return _tokenize_scalar(strings, delimiter) + + if isinstance(delimiter, Column): + return _tokenize_column(strings, delimiter) + + raise TypeError( + "Expected a Scalar or Column for delimiters, but got {}".format( + type(delimiter) + ) + ) + + +def _tokenize_scalar(Column strings, Scalar delimiter): + + cdef column_view c_strings = strings.view() + cdef string_scalar* c_delimiter = delimiter.c_value.get() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_tokenize( + c_strings, + c_delimiter[0], + ) + ) + + return Column.from_unique_ptr(move(c_result)) + + +def _tokenize_column(Column strings, Column delimiters): + cdef column_view c_strings = strings.view() + cdef column_view c_delimiters = delimiters.view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_tokenize_multi( + c_strings, + c_delimiters + ) + ) + + return Column.from_unique_ptr(move(c_result)) + + +def count_tokens(Column strings, object delimiter): + if isinstance(delimiter, Scalar): + return _count_tokens_scalar(strings, delimiter) + + if isinstance(delimiter, Column): + return _count_tokens_column(strings, delimiter) + + raise TypeError( + "Expected a Scalar or Column for delimiters, but got {}".format( + type(delimiter) + ) + ) + + +def _count_tokens_scalar(Column strings, Scalar delimiter): + cdef column_view c_strings = strings.view() + cdef string_scalar* c_delimiter = delimiter.c_value.get() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_count_tokens( + c_strings, + c_delimiter[0] + ) + ) + + return Column.from_unique_ptr(move(c_result)) + + +def _count_tokens_column(Column strings, Column delimiters): + cdef column_view c_strings = strings.view() + cdef column_view c_delimiters = delimiters.view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_count_tokens_multi( + c_strings, + c_delimiters + ) + ) + + return Column.from_unique_ptr(move(c_result)) From 5066f65427fbc1b56c6b9251fb6be85ecb10447e Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Fri, 13 Mar 2020 14:47:51 -0500 Subject: [PATCH 37/79] changelog --- CHANGELOG.md | 1 - 1 file changed, 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0de164a1fa1..d1035f99499 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -145,7 +145,6 @@ - PR #4244 Port nvstrings Substring Gather/Scatter functions to cuDF Python/Cython - PR #4280 Port nvstrings Numeric Handling functions to cuDF Python/Cython - PR #4278 Port filling.pyx to libcudf++ API -- PR #4278 Port filling.pyx to libcudf++ API - PR #4328 Add memory threshold callbacks for Java RMM event handler - PR #4336 Move a bunch of internal nvstrings code to use native StringColumns - PR #4166 Port `is_sorted.pyx` to use libcudf++ APIs From cc6901fbbe445666bf6b77f15c9892ae64c7057d Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Fri, 13 Mar 2020 15:03:10 -0500 Subject: [PATCH 38/79] nvtext cython: remove explicit method names in favor of overload --- python/cudf/cudf/_libxx/cpp/nvtext/tokenize.pxd | 4 ++-- python/cudf/cudf/_libxx/nvtext/tokenize.pyx | 6 ++---- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/_libxx/cpp/nvtext/tokenize.pxd b/python/cudf/cudf/_libxx/cpp/nvtext/tokenize.pxd index 21ea6dc09ae..1e4ebbf3d43 100644 --- a/python/cudf/cudf/_libxx/cpp/nvtext/tokenize.pxd +++ b/python/cudf/cudf/_libxx/cpp/nvtext/tokenize.pxd @@ -13,7 +13,7 @@ cdef extern from "nvtext/tokenize.hpp" namespace "nvtext" nogil: const string_scalar & delimiter ) except + - cdef unique_ptr[column] tokenize_multi "nvtext::tokenize" ( + cdef unique_ptr[column] tokenize( const column_view & strings, const column_view & delimiters ) except + @@ -23,7 +23,7 @@ cdef extern from "nvtext/tokenize.hpp" namespace "nvtext" nogil: const string_scalar & delimiter ) except + - cdef unique_ptr[column] count_tokens_multi "nvtext::count_tokens" ( + cdef unique_ptr[column] count_tokens( const column_view & strings, const column_view & delimiters ) except + diff --git a/python/cudf/cudf/_libxx/nvtext/tokenize.pyx b/python/cudf/cudf/_libxx/nvtext/tokenize.pyx index a1a3d81398d..b755303e0ef 100644 --- a/python/cudf/cudf/_libxx/nvtext/tokenize.pyx +++ b/python/cudf/cudf/_libxx/nvtext/tokenize.pyx @@ -9,9 +9,7 @@ from cudf._libxx.cpp.types cimport size_type from cudf._libxx.cpp.column.column_view cimport column_view from cudf._libxx.cpp.nvtext.tokenize cimport ( tokenize as cpp_tokenize, - tokenize_multi as cpp_tokenize_multi, count_tokens as cpp_count_tokens, - count_tokens_multi as cpp_count_tokens_multi, ) from cudf._libxx.column cimport Column from cudf._libxx.scalar cimport Scalar @@ -55,7 +53,7 @@ def _tokenize_column(Column strings, Column delimiters): with nogil: c_result = move( - cpp_tokenize_multi( + cpp_tokenize( c_strings, c_delimiters ) @@ -101,7 +99,7 @@ def _count_tokens_column(Column strings, Column delimiters): with nogil: c_result = move( - cpp_count_tokens_multi( + cpp_count_tokens( c_strings, c_delimiters ) From 18f2fd2ec12ab85acbe8bec7689149f5bc0924de Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Fri, 13 Mar 2020 16:08:30 -0400 Subject: [PATCH 39/79] Add workaround for all-empty strings --- python/cudf/cudf/core/column/string.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 3c423496fc9..f16b0a6d260 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -1769,6 +1769,17 @@ def __init__(self, mask=None, size=None, offset=0, children=()): None, size, dtype, mask=mask, offset=offset, children=children ) + # For an "all empty" StringColumn (e.g., [""]) libcudf still + # needs the chars child column pointer to be non-null: + if self.size: + if self.null_count == 0 and self.children[1].size == 0: + self.set_base_children( + ( + self.base_children[0], + column_empty(1, dtype=self.base_children[1].dtype), + ) + ) + # TODO: Remove these once NVStrings is fully deprecated / removed self._nvstrings = None self._nvcategory = None From 3952f9a5f8f02695801cf9eeddccfbc9417c7a87 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Fri, 13 Mar 2020 16:11:12 -0400 Subject: [PATCH 40/79] Style --- python/cudf/cudf/core/column/numerical.py | 6 ------ python/cudf/cudf/core/column/string.py | 1 - 2 files changed, 7 deletions(-) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 3c392e06843..b75423fef08 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -56,9 +56,6 @@ def __contains__(self, item): self, column.as_column([item], dtype=self.dtype) ).any() - def unary_operator(self, unaryop): - return _numeric_column_unaryop(self, op=unaryop) - def binary_operator(self, binop, rhs, reflect=False): int_dtypes = [ np.dtype("int8"), @@ -86,9 +83,6 @@ def binary_operator(self, binop, rhs, reflect=False): lhs=self, rhs=rhs, op=binop, out_dtype=out_dtype, reflect=reflect ) - def unary_operator(self, unaryop): - return _numeric_column_unaryop(self, op=unaryop) - def _apply_scan_op(self, op): return libcudfxx.reduce.scan(op, self, True) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index f16b0a6d260..05fed9d0d36 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -11,7 +11,6 @@ import nvstrings -import cudf._lib as libcudf import cudf._libxx as libcudfxx import cudf._libxx.string_casting as str_cast from cudf._lib.nvtx import nvtx_range_pop, nvtx_range_push From 74e6d6a954d601fd4cdc7d45e7ef94736cffd16c Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Fri, 13 Mar 2020 16:32:11 -0400 Subject: [PATCH 41/79] Changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index f79724130ab..e1e62e4bb84 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -166,6 +166,7 @@ - PR #4316 Add Java and JNI bindings for substring expression - PR #4314 Add Java and JNI bindings for string contains - PR #4461 Port nvstrings Miscellaneous functions to cuDF Python/Cython +- PR #4503 Port binaryop.pyx to libcudf++ API ## Bug Fixes From 07a8b545abaf5884e97a8126df84865bc67ad280 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Fri, 13 Mar 2020 16:36:00 -0400 Subject: [PATCH 42/79] Restore lost unary_operator --- python/cudf/cudf/core/column/numerical.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index b75423fef08..6be5408e2ac 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -56,6 +56,9 @@ def __contains__(self, item): self, column.as_column([item], dtype=self.dtype) ).any() + def unary_operator(self, unaryop): + return _numeric_column_unaryop(self, op=unaryop) + def binary_operator(self, binop, rhs, reflect=False): int_dtypes = [ np.dtype("int8"), From ca17bc6c4d0cc3685b6318fbdac47f37e434dd70 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Fri, 13 Mar 2020 16:36:46 -0400 Subject: [PATCH 43/79] Remove legacy binops Cython bindings --- python/cudf/cudf/_lib/binops.pyx | 195 ---------------------- python/cudf/cudf/_lib/includes/binops.pxd | 67 -------- 2 files changed, 262 deletions(-) delete mode 100644 python/cudf/cudf/_lib/binops.pyx delete mode 100644 python/cudf/cudf/_lib/includes/binops.pxd diff --git a/python/cudf/cudf/_lib/binops.pyx b/python/cudf/cudf/_lib/binops.pyx deleted file mode 100644 index ac7c8240d03..00000000000 --- a/python/cudf/cudf/_lib/binops.pyx +++ /dev/null @@ -1,195 +0,0 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. - -from cudf._lib.cudf cimport * -from cudf._lib.cudf import * -from cudf._lib.GDFError import GDFError -from libcpp.vector cimport vector -from libc.stdlib cimport free - -from libcpp.string cimport string - -import rmm - -from cudf._lib.includes.binops cimport * - - -_BINARY_OP = { - 'add': GDF_ADD, - 'sub': GDF_SUB, - 'mul': GDF_MUL, - 'div': GDF_DIV, - 'truediv': GDF_TRUE_DIV, - 'floordiv': GDF_FLOOR_DIV, - 'mod': GDF_PYMOD, - 'pow': GDF_POW, - 'eq': GDF_EQUAL, - 'ne': GDF_NOT_EQUAL, - 'lt': GDF_LESS, - 'gt': GDF_GREATER, - 'le': GDF_LESS_EQUAL, - 'ge': GDF_GREATER_EQUAL, - 'and': GDF_BITWISE_AND, - 'or': GDF_BITWISE_OR, - 'xor': GDF_BITWISE_XOR, - 'l_and': GDF_LOGICAL_AND, - 'l_or': GDF_LOGICAL_OR, -} - -cdef apply_op_v_v(gdf_column* c_lhs, gdf_column* c_rhs, gdf_column* c_out, op): - """ - Call gdf binary ops between two columns. - """ - - cdef gdf_binary_operator c_op = _BINARY_OP[op] - with nogil: - binary_operation( - c_out, - c_lhs, - c_rhs, - c_op) - - cdef int nullct = c_out[0].null_count - - return nullct - -cdef apply_op_v_s(gdf_column* c_lhs, gdf_scalar* c_rhs, gdf_column* c_out, op): - """ - Call gdf binary ops between a column and a scalar. - """ - - cdef gdf_binary_operator c_op = _BINARY_OP[op] - with nogil: - binary_operation( - c_out, - c_lhs, - c_rhs, - c_op) - - cdef int nullct = c_out[0].null_count - - return nullct - - -cdef apply_op_s_v(gdf_scalar* c_lhs, gdf_column* c_rhs, gdf_column* c_out, op): - """ - Call gdf binary ops between a scalar and a column. - """ - - cdef gdf_binary_operator c_op = _BINARY_OP[op] - with nogil: - binary_operation( - c_out, - c_lhs, - c_rhs, - c_op) - - cdef int nullct = c_out[0].null_count - - return nullct - - -def apply_op(lhs, rhs, out, op): - """ - Dispatches a binary op call to the appropriate libcudf function: - """ - check_gdf_compatibility(out) - cdef gdf_column* c_lhs = NULL - cdef gdf_column* c_rhs = NULL - cdef gdf_scalar* c_scalar = NULL - cdef gdf_column* c_out = column_view_from_column(out) - - if np.isscalar(lhs): - check_gdf_compatibility(rhs) - c_rhs = column_view_from_column(rhs) - c_scalar = gdf_scalar_from_scalar(lhs) - nullct = apply_op_s_v( - c_scalar, - c_rhs, - c_out, - op - ) - elif lhs is None: - check_gdf_compatibility(rhs) - c_rhs = column_view_from_column(rhs) - c_scalar = gdf_scalar_from_scalar(lhs, rhs.dtype) - nullct = apply_op_s_v( - c_scalar, - c_rhs, - c_out, - op - ) - - elif np.isscalar(rhs): - check_gdf_compatibility(lhs) - c_lhs = column_view_from_column(lhs) - c_scalar = gdf_scalar_from_scalar(rhs) - nullct = apply_op_v_s( - c_lhs, - c_scalar, - c_out, - op - ) - - elif rhs is None: - check_gdf_compatibility(lhs) - c_lhs = column_view_from_column(lhs) - c_scalar = gdf_scalar_from_scalar(rhs, lhs.dtype) - nullct = apply_op_v_s( - c_lhs, - c_scalar, - c_out, - op - ) - - else: - check_gdf_compatibility(lhs) - check_gdf_compatibility(rhs) - c_lhs = column_view_from_column(lhs) - c_rhs = column_view_from_column(rhs) - - nullct = apply_op_v_v( - c_lhs, - c_rhs, - c_out, - op - ) - - free(c_scalar) - free_column(c_lhs) - free_column(c_rhs) - free_column(c_out) - - return nullct - - -def apply_op_udf(lhs, rhs, udf_ptx, np_dtype): - """ - Apply a user-defined binary operator (a UDF) defined in `udf_ptx` on - the two input columns `lhs` and `rhs`. The output type of the UDF - has to be specified in `np_dtype`, a numpy data type. - Currently ONLY int32, int64, float32 and float64 are supported. - """ - check_gdf_compatibility(lhs) - check_gdf_compatibility(rhs) - cdef gdf_column* c_lhs = column_view_from_column(lhs) - cdef gdf_column* c_rhs = column_view_from_column(rhs) - - # get the gdf_type related to the input np type - cdef gdf_dtype g_type = dtypes[np_dtype] - - cdef string cpp_str = udf_ptx.encode("UTF-8") - - cdef gdf_column c_out_col - - with nogil: - c_out_col = binary_operation( - c_lhs[0], - c_rhs[0], - cpp_str, - g_type - ) - - free_column(c_lhs) - free_column(c_rhs) - - return gdf_column_to_column(&c_out_col) diff --git a/python/cudf/cudf/_lib/includes/binops.pxd b/python/cudf/cudf/_lib/includes/binops.pxd deleted file mode 100644 index 3c462bb4048..00000000000 --- a/python/cudf/cudf/_lib/includes/binops.pxd +++ /dev/null @@ -1,67 +0,0 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. - -from cudf._lib.cudf cimport * - -from libcpp.string cimport string - -cdef extern from "cudf/legacy/binaryop.hpp" nogil: - - ctypedef enum gdf_binary_operator: - GDF_ADD, - GDF_SUB, - GDF_MUL, - GDF_DIV, - GDF_TRUE_DIV, - GDF_FLOOR_DIV, - GDF_MOD, - GDF_PYMOD, - GDF_POW, - GDF_EQUAL, - GDF_NOT_EQUAL, - GDF_LESS, - GDF_GREATER, - GDF_LESS_EQUAL, - GDF_GREATER_EQUAL, - GDF_BITWISE_AND, - GDF_BITWISE_OR, - GDF_BITWISE_XOR, - GDF_LOGICAL_AND, - GDF_LOGICAL_OR, - GDF_INVALID_BINARY - -cdef extern from "cudf/legacy/binaryop.hpp" namespace "cudf" nogil: - - cdef void binary_operation( - gdf_column* out, - gdf_scalar* lhs, - gdf_column* rhs, - gdf_binary_operator ope - ) except + - - cdef void binary_operation( - gdf_column* out, - gdf_column* lhs, - gdf_scalar* rhs, - gdf_binary_operator ope - ) except + - - cdef void binary_operation( - gdf_column* out, - gdf_column* lhs, - gdf_column* rhs, - gdf_binary_operator ope - ) except + - - cdef void binary_operation( - gdf_column* out, - gdf_column* lhs, - gdf_column* rhs, - const string& ptx - ) except + - - cdef gdf_column binary_operation( - const gdf_column& lhs, - const gdf_column& rhs, - const string& ptx, - gdf_dtype output_type - ) except + From 5c846b20f3fcd50c5d97a3de191d89e9d81d16c7 Mon Sep 17 00:00:00 2001 From: "Ram (Ramakrishna Prabhu)" Date: Fri, 13 Mar 2020 15:49:45 -0500 Subject: [PATCH 44/79] adding _num_rows property to RangeIndex --- python/cudf/cudf/core/index.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index ec3fb4336ef..0a9f2e84483 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -495,6 +495,10 @@ def name(self, value): def _num_columns(self): return 1 + @property + def _num_rows(self): + return len(self) + @cached_property def _values(self): if len(self) > 0: From 3f10098ae94b87e8d6abe33dadf8d973a8cfa3f7 Mon Sep 17 00:00:00 2001 From: Benjamin Zaitlen Date: Fri, 13 Mar 2020 14:05:33 -0700 Subject: [PATCH 45/79] check for zero length frames when serializing cuda buffers --- python/cudf/cudf/comm/serialize.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/comm/serialize.py b/python/cudf/cudf/comm/serialize.py index cff0fce550a..9fe5a55c0d0 100644 --- a/python/cudf/cudf/comm/serialize.py +++ b/python/cudf/cudf/comm/serialize.py @@ -40,9 +40,10 @@ def dask_serialize_cudf_object(x): def deserialize_cudf_object(header, frames): with log_errors(): if header["serializer"] == "cuda": - assert all( - hasattr(f, "__cuda_array_interface__") for f in frames - ) + for f in frames: + # some frames are empty -- meta/empty partitions/etc + if len(f) > 0: + assert hasattr(f, "__cuda_array_interface__") if header["serializer"] == "dask": frames = [memoryview(f) for f in frames] From f5adf23a8615472063a130557141dc16af3450c7 Mon Sep 17 00:00:00 2001 From: Benjamin Zaitlen Date: Fri, 13 Mar 2020 14:14:44 -0700 Subject: [PATCH 46/79] update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 09ed8f3bde1..a1fbe82bcef 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -259,6 +259,7 @@ - PR #4474 Fix to not materialize RangeIndex in copy_categories - PR #4496 Skip tests which require 2+ GPUs - PR #4494 Update Java memory event handler for new RMM resource API +- PR #4505 Fix 0 length buffers during serialization # cuDF 0.12.0 (04 Feb 2020) From 6d631de2afbab9990e34baea856566dca93dada0 Mon Sep 17 00:00:00 2001 From: "Ram (Ramakrishna Prabhu)" Date: Fri, 13 Mar 2020 18:19:44 -0500 Subject: [PATCH 47/79] changes --- python/cudf/cudf/_libxx/copying.pyx | 2 +- python/cudf/cudf/core/dataframe.py | 60 +++++++++++++++++------------ 2 files changed, 36 insertions(+), 26 deletions(-) diff --git a/python/cudf/cudf/_libxx/copying.pyx b/python/cudf/cudf/_libxx/copying.pyx index 995abc43295..32a82ae5322 100644 --- a/python/cudf/cudf/_libxx/copying.pyx +++ b/python/cudf/cudf/_libxx/copying.pyx @@ -315,7 +315,7 @@ def table_empty_like(Table input_table): return Table.from_unique_ptr( move(c_result), column_names=input_table._column_names, - index_names=input_table._index._column_names + index_names=input_table._index_names ) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 112975a7ac4..d0acd24a369 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -501,6 +501,7 @@ def mask(self, other): def __setitem__(self, arg, value): """Add/set column by *arg or DataFrame* """ + print("__setitem__ is being called ___________________") if isinstance(arg, DataFrame): # not handling set_item where arg = df & value = df if isinstance(value, DataFrame): @@ -513,10 +514,12 @@ def __setitem__(self, arg, value): for col_name in self._data: scatter_map = arg[col_name] if is_scalar(value): - value = utils.scalar_broadcast_to(value, len(self)) - self._data[col_name][scatter_map] = column.as_column( - value - )[scatter_map] + self._data[col_name][scatter_map] = value + else: + + self._data[col_name][scatter_map] = column.as_column( + value + )[scatter_map] elif is_scalar(arg) or isinstance(arg, tuple): if isinstance(value, DataFrame): _setitem_with_dataframe( @@ -527,30 +530,34 @@ def __setitem__(self, arg, value): ) else: if arg in self._data: - if is_scalar(value): - value = utils.scalar_broadcast_to(value, len(self)) if len(self) == 0: - if isinstance(value, (pd.Series, Series)): - self._index = as_index(value.index) - elif len(value) > 0: - self._index = RangeIndex(start=0, stop=len(value)) value = column.as_column(value) new_data = self._data.__class__() for key in self._data: - if key == arg: + if key in arg or key == arg: new_data[key] = value else: new_data[key] = column.column_empty_like( - self._data[key], - masked=True, - newsize=len(value), + self._data[key], + masked=True, + newsize=len(value), ) + self._data = new_data + return elif isinstance(value, (pd.Series, Series)): value = Series(value)._align_to_index( self._index, how="right", allow_non_unique=True ) - self._data[arg] = column.as_column(value) + if is_scalar(arg): + arg=[arg] + if is_scalar(value): + for key in arg: + self._data[key][:] = value + else: + value = as_column(value) + for key in arg: + self._data[key] = value else: # disc. with pandas here # pandas raises key error here @@ -562,13 +569,10 @@ def __setitem__(self, arg, value): mask = arg if isinstance(mask, list): mask = np.array(mask) - - if is_scalar(value): - value = column.as_column( - utils.scalar_broadcast_to(value, len(self)) - ) - + mask = np.array(arg) if mask.dtype == "bool": + mask = column.as_column(arg) + if isinstance(value, DataFrame): _setitem_with_dataframe( input_df=self, @@ -577,10 +581,10 @@ def __setitem__(self, arg, value): mask=mask, ) else: + if not is_scalar(value): + value = column.as_column(value)[mask] for col_name in self._data: - self._data[col_name][mask] = column.as_column(value)[ - mask - ] + self._data[col_name][mask] = value else: if isinstance(value, DataFrame): _setitem_with_dataframe( @@ -590,11 +594,17 @@ def __setitem__(self, arg, value): mask=None, ) else: + if not is_scalar(value): + value = column.as_column(value) for col in arg: # we will raise a key error if col not in dataframe # this behavior will make it # consistent to pandas >0.21.0 - self._data[col] = column.as_column(value) + if not is_scalar(value): + self._data[col] = value + else: + self._data[col][:] = value + else: msg = "__setitem__ on type {!r} is not supported" raise TypeError(msg.format(type(arg))) From 8dcd79b7087ea5045369cba53a554afaa35b4d26 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Fri, 13 Mar 2020 19:34:54 -0500 Subject: [PATCH 48/79] nvtext cython: add nvtext methods to `StringMethods`. --- python/cudf/cudf/_libxx/nvtext/normalize.pyx | 2 +- python/cudf/cudf/core/column/string.py | 67 ++++++++++++++++++-- 2 files changed, 63 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/_libxx/nvtext/normalize.pyx b/python/cudf/cudf/_libxx/nvtext/normalize.pyx index b2654446ab9..e8817495a81 100644 --- a/python/cudf/cudf/_libxx/nvtext/normalize.pyx +++ b/python/cudf/cudf/_libxx/nvtext/normalize.pyx @@ -11,7 +11,7 @@ from cudf._libxx.cpp.nvtext.normalize cimport ( from cudf._libxx.column cimport Column -def normalize_spaces(Column strings, int ngrams): +def normalize_spaces(Column strings): cdef column_view c_strings = strings.view() cdef unique_ptr[column] c_result diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 24984aadd4e..590902fd95c 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -1172,11 +1172,6 @@ def rjust(self, width, fillchar=" ", **kwargs): with fillchar. fillchar : str, default ' ' (whitespace) - Additional character for filling, default is whitespace. - - Returns - ------- - Series/Index of str dtype Returns Series or Index. """ @@ -1735,6 +1730,68 @@ def translate(self, table, **kwargs): cpp_translate(self._column, table), **kwargs ) + def normalize_spaces(self): + return libcudfxx.nvtext.normalize_spaces(self._column) + + def tokenize(self, delimiter=None): + delimiter = _massage_string_arg(delimiter, "delimiter", allow_col=True) + return libcudfxx.nvtext.tokenize(self._column, delimiter) + + def token_count(self, delimiter=None): + delimiter = _massage_string_arg(delimiter, "delimiter", allow_col=True) + return libcudfxx.nvtext.count_tokens(self._column, delimiter) + + def ngrams(self, ngrams=2, separator="_"): + separator = _massage_string_arg(separator, "separator") + return libcudfxx.nvtext.ngrams(self._column, ngrams, separator) + + def ngrams_tokenize(self, ngrams=2, delimiter=" ", separator="_"): + delimiter = _massage_string_arg(delimiter, "delimiter") + separator = _massage_string_arg(separator, "separator") + + return libcudfxx.nvtext.ngrams_tokenize( + self._column, ngrams, delimiter, separator + ) + + +def _massage_string_arg(value, name, allow_col=False): + from cudf._libxx.scalar import Scalar + from cudf._libxx.column import Column + from cudf.utils.dtypes import is_string_dtype + + allowed_types = ["Scalar"] + + if isinstance(value, str): + return Scalar(value, dtype="str") + + if isinstance(value, Scalar) and is_string_dtype(value.dtype): + return value + + if allow_col: + allowed_types += ["Column"] + + if isinstance(value, list): + return column.as_column(value, dtype="str") + + if isinstance(value, Column) and is_string_dtype(value.dtype): + return value + + raise ValueError( + "Expected {} for {} but got {}".format( + _expected_types_format(allowed_types), name, type(value) + ) + ) + + +def _expected_types_format(types): + if len(types) == 0: + raise ValueError + + if len(types) == 1: + return types[0] + + return ", ".join(types[:-1]) + ", or " + types[-1] + class StringColumn(column.ColumnBase): """Implements operations for Columns of String type From d0736567e553dce67846180b97108c61e98d7490 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Sat, 14 Mar 2020 07:19:03 -0400 Subject: [PATCH 49/79] Stale import --- python/cudf/cudf/_lib/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index 472a36064f6..e0beeeec493 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -2,7 +2,6 @@ from . import ( avro, - binops, concat, copying, csv, From 1d55fd700655566d2d2cf29d82f68786a4f2ee19 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Sat, 14 Mar 2020 07:19:10 -0400 Subject: [PATCH 50/79] Raise on unsupported datetime binop --- python/cudf/cudf/core/column/datetime.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index a36ed8e4bb6..89dffe17f78 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -213,7 +213,13 @@ def default_na_value(self): def binary_operator(self, op, rhs, reflect=False): lhs, rhs = self, rhs - return binop(lhs, rhs, op=op, out_dtype=np.bool) + + if op in ("eq", "ne", "lt", "gt", "le", "ge"): + out_dtype = np.bool + else: + raise TypeError(f"Series of dtype {self.dtype} cannot perform " + f" the operation {op}") + return binop(lhs, rhs, op=op, out_dtype=out_dtype) def fillna(self, fill_value): if is_scalar(fill_value): From 48b5f8c07dbe3d7550874274f346aa3941273f01 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Sat, 14 Mar 2020 07:19:22 -0400 Subject: [PATCH 51/79] Fix logic for empty string handling --- python/cudf/cudf/core/column/string.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 05fed9d0d36..a540dab0587 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -1771,13 +1771,12 @@ def __init__(self, mask=None, size=None, offset=0, children=()): # For an "all empty" StringColumn (e.g., [""]) libcudf still # needs the chars child column pointer to be non-null: if self.size: - if self.null_count == 0 and self.children[1].size == 0: - self.set_base_children( - ( - self.base_children[0], - column_empty(1, dtype=self.base_children[1].dtype), - ) + if self.children[1].size == 0 and self.null_count != self.size: + offsets = self.base_children[0] + chars = column_empty( + self.base_children[1].size + 1, dtype="int8" ) + self.set_base_children((offsets, chars)) # TODO: Remove these once NVStrings is fully deprecated / removed self._nvstrings = None From ba4269e3f1e4f8dfe2535775985ced73994f13bc Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Sat, 14 Mar 2020 07:19:32 -0400 Subject: [PATCH 52/79] Remove function introduced by bad merge --- python/cudf/cudf/core/column/string.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index a540dab0587..341a85add9d 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -2065,9 +2065,6 @@ def deserialize(cls, header, frames): ) return col - def copy(self, deep=True): - return column.as_column(self.nvstrings.copy()) - def find_and_replace(self, to_replace, replacement, all_nan): """ Return col with *to_replace* replaced with *value* From 956d2b1d64e4739db6f8ac749c92d18a2bbc87de Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Sat, 14 Mar 2020 07:19:50 -0400 Subject: [PATCH 53/79] Remove use of legacy bindings --- python/cudf/cudf/utils/applyutils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/utils/applyutils.py b/python/cudf/cudf/utils/applyutils.py index 7a0ddc9b6f5..0b5614e5c84 100644 --- a/python/cudf/cudf/utils/applyutils.py +++ b/python/cudf/cudf/utils/applyutils.py @@ -6,7 +6,7 @@ from numba import cuda, six from numba.utils import exec_, pysignature -import cudf._lib as libcudf +import cudf._libxx as libcudfxx from cudf.core.column import column from cudf.core.series import Series from cudf.utils import utils @@ -116,8 +116,8 @@ def make_aggregate_nullmask(df, columns=None, op="and"): ) continue - libcudf.binops.apply_op( - column.as_column(nullmask), out_mask, out_mask, op + out_mask = libcudfxx.binaryop.binaryop( + column.as_column(nullmask), out_mask, op, out_mask.dtype ) return out_mask From f61c713aa3d45f5fc6b3e4d16e6473238328b971 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Sat, 14 Mar 2020 10:28:43 -0400 Subject: [PATCH 54/79] Black --- python/cudf/cudf/core/column/datetime.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 89dffe17f78..5639e1dabe4 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -217,8 +217,10 @@ def binary_operator(self, op, rhs, reflect=False): if op in ("eq", "ne", "lt", "gt", "le", "ge"): out_dtype = np.bool else: - raise TypeError(f"Series of dtype {self.dtype} cannot perform " - f" the operation {op}") + raise TypeError( + f"Series of dtype {self.dtype} cannot perform " + f" the operation {op}" + ) return binop(lhs, rhs, op=op, out_dtype=out_dtype) def fillna(self, fill_value): From 08d892c0600d7a5d297236f9c4b8f623f8b70736 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Sat, 14 Mar 2020 11:34:35 -0500 Subject: [PATCH 55/79] nvtext cython: unit tests --- python/cudf/cudf/_libxx/__init__.py | 1 + python/cudf/cudf/core/column/string.py | 38 +++--- python/cudf/cudf/tests/test_text.py | 162 +++++++++++++++++++++++++ 3 files changed, 186 insertions(+), 15 deletions(-) create mode 100644 python/cudf/cudf/tests/test_text.py diff --git a/python/cudf/cudf/_libxx/__init__.py b/python/cudf/cudf/_libxx/__init__.py index 9e29af03850..9e95af0963d 100644 --- a/python/cudf/cudf/_libxx/__init__.py +++ b/python/cudf/cudf/_libxx/__init__.py @@ -12,6 +12,7 @@ join, merge, null_mask, + nvtext, orc, quantiles, reduce, diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 83f5a631501..09506e481e2 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -15,6 +15,19 @@ import cudf._libxx as libcudfxx import cudf._libxx.string_casting as str_cast from cudf._lib.nvtx import nvtx_range_pop, nvtx_range_push +from cudf._libxx.nvtext.generate_ngrams import ( + generate_ngrams as cpp_generate_ngrams, +) +from cudf._libxx.nvtext.ngrams_tokenize import ( + ngrams_tokenize as cpp_ngrams_tokenize, +) +from cudf._libxx.nvtext.normalize import ( + normalize_spaces as cpp_normalize_spaces, +) +from cudf._libxx.nvtext.tokenize import ( + count_tokens as cpp_count_tokens, + tokenize as cpp_tokenize, +) from cudf._libxx.strings.attributes import ( code_points as cpp_code_points, count_characters as cpp_count_characters, @@ -815,10 +828,7 @@ def split(self, pat=None, n=-1, expand=True, **kwargs): String to split on, does not yet support regular expressions. n : int, default -1 (all) Limit number of splits in output. `None`, 0, and -1 will all be - interpreted as "all splits". - - Returns - ------- + interpreted as "all splits".libcudfxx.nvtext.tokenize DataFrame Returns a DataFrame with each split as a column. @@ -1782,27 +1792,25 @@ def translate(self, table, **kwargs): ) def normalize_spaces(self): - return libcudfxx.nvtext.normalize_spaces(self._column) + return cpp_normalize_spaces(self._column) - def tokenize(self, delimiter=None): + def tokenize(self, delimiter=" "): delimiter = _massage_string_arg(delimiter, "delimiter", allow_col=True) - return libcudfxx.nvtext.tokenize(self._column, delimiter) + return cpp_tokenize(self._column, delimiter) - def token_count(self, delimiter=None): + def token_count(self, delimiter=" "): delimiter = _massage_string_arg(delimiter, "delimiter", allow_col=True) - return libcudfxx.nvtext.count_tokens(self._column, delimiter) + return cpp_count_tokens(self._column, delimiter) - def ngrams(self, ngrams=2, separator="_"): + def ngrams(self, n=2, separator="_"): separator = _massage_string_arg(separator, "separator") - return libcudfxx.nvtext.ngrams(self._column, ngrams, separator) + return cpp_generate_ngrams(self._column, n, separator) - def ngrams_tokenize(self, ngrams=2, delimiter=" ", separator="_"): + def ngrams_tokenize(self, n=2, delimiter=" ", separator="_"): delimiter = _massage_string_arg(delimiter, "delimiter") separator = _massage_string_arg(separator, "separator") - return libcudfxx.nvtext.ngrams_tokenize( - self._column, ngrams, delimiter, separator - ) + return cpp_ngrams_tokenize(self._column, n, delimiter, separator) def _massage_string_arg(value, name, allow_col=False): diff --git a/python/cudf/cudf/tests/test_text.py b/python/cudf/cudf/tests/test_text.py new file mode 100644 index 00000000000..16523054c17 --- /dev/null +++ b/python/cudf/cudf/tests/test_text.py @@ -0,0 +1,162 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. + +import pytest +from pandas.util.testing import assert_series_equal + +import cudf + + +def test_tokenize(): + strings = cudf.Series( + [ + "the quick fox jumped over the lazy dog", + "the siamésé cat jumped under the sofa", + None, + "", + ] + ) + + expected = cudf.Series( + [ + "the", + "quick", + "fox", + "jumped", + "over", + "the", + "lazy", + "dog", + "the", + "siamésé", + "cat", + "jumped", + "under", + "the", + "sofa", + ] + ) + + actual = strings.str.tokenize() + + assert_series_equal(expected.to_pandas(), actual.to_pandas()) + + +@pytest.mark.parametrize( + "delimiter, expected_token_counts", + [ + (" ", [10, 9, 0, 0, 1]), # TODO: verify last count should be 1, not 5 + ("o", [6, 3, 0, 0, 1]), + (["a", "e", "i", "o", "u"], [13, 13, 0, 0, 6]), + (["a", "e", "i", "o"], [12, 11, 0, 0, 6]), + # ([], [10, 9, 0, 0, 5]), # throws + ], +) +def test_token_count(delimiter, expected_token_counts): + strings = cudf.Series( + [ + "the quick brown fox jumped over the lazy brown dog", + "the sable siamésé cat jumped under the brown sofa", + None, + "", + "test_str\x01test_str\x02test_str\x03test_str\x04test_str\x05", + ] + ) + + expected = cudf.Series(expected_token_counts) + + actual = strings.str.token_count(delimiter) + + assert_series_equal( + expected.to_pandas(), actual.to_pandas(), check_dtype=False + ) + + +def test_normalize_spaces(): + strings = cudf.Series( + [ + " the\t quick fox jumped over the lazy dog", + "the siamésé cat\f jumped\t\tunder the sofa ", + None, + "", + ] + ) + expected = cudf.Series( + [ + "the quick fox jumped over the lazy dog", + "the siamésé cat jumped under the sofa", + None, + "", + ] + ) + + actual = strings.str.normalize_spaces() + + assert_series_equal(expected.to_pandas(), actual.to_pandas()) + + +@pytest.mark.parametrize( + "n, separator, expected_values", + [ + ( + 2, + "_", + [ + "this_is", + "is_my", + "my_favorite", + "favorite_book", + "book_on", + "on_my", + "my_bookshelf", + ], + ), + ( + 3, + "-", + [ + "this-is-my", + "is-my-favorite", + "my-favorite-book", + "favorite-book-on", + "book-on-my", + "on-my-bookshelf", + ], + ), + ], +) +def test_ngrams(n, separator, expected_values): + strings = cudf.Series( + ["this", "is", "my", "favorite", "book", "on", "my", "bookshelf"] + ) + expected = cudf.Series(expected_values) + actual = strings.str.ngrams(n=n, separator=separator) + assert_series_equal(expected.to_pandas(), actual.to_pandas()) + + +@pytest.mark.parametrize( + "n, separator, expected_values", + [ + ( + 2, + "_", + [ + "this_is", + "is_my", + "my_favorite", + "book_on", + "on_my", + "my_bookshelf", + ], + ), + ( + 3, + "-", + ["this-is-my", "is-my-favorite", "book-on-my", "on-my-bookshelf"], + ), + ], +) +def test_ngrams_tokenize(n, separator, expected_values): + strings = cudf.Series(["this is my favorite", "book on my bookshelf"]) + expected = cudf.Series(expected_values) + actual = strings.str.ngrams_tokenize(n=n, separator=separator) + assert_series_equal(expected.to_pandas(), actual.to_pandas()) From d224fc1bec5fa1a3251333710ca5faecff7e7af3 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Sat, 14 Mar 2020 11:37:40 -0500 Subject: [PATCH 56/79] string.py: add-back accidentally deleted lines. --- python/cudf/cudf/core/column/string.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 09506e481e2..f935a2a9f1f 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -828,7 +828,10 @@ def split(self, pat=None, n=-1, expand=True, **kwargs): String to split on, does not yet support regular expressions. n : int, default -1 (all) Limit number of splits in output. `None`, 0, and -1 will all be - interpreted as "all splits".libcudfxx.nvtext.tokenize + interpreted as "all splits". + + Returns + ------- DataFrame Returns a DataFrame with each split as a column. @@ -1150,6 +1153,11 @@ def rjust(self, width, fillchar=" ", **kwargs): with fillchar. fillchar : str, default ' ' (whitespace) + Additional character for filling, default is whitespace. + + Returns + ------- + Series/Index of str dtype Returns Series or Index. """ From 9dc0b3b7b8d9d98b9b082933be8e3a849c1cb41e Mon Sep 17 00:00:00 2001 From: Dave Baranec Date: Tue, 10 Mar 2020 12:50:41 -0500 Subject: [PATCH 57/79] Make scalar destructor virtual to ensure all derived class destructors properly get called. --- cpp/include/cudf/scalar/scalar.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cudf/scalar/scalar.hpp b/cpp/include/cudf/scalar/scalar.hpp index 9d9f5fc98fb..dbd4e0a52cc 100644 --- a/cpp/include/cudf/scalar/scalar.hpp +++ b/cpp/include/cudf/scalar/scalar.hpp @@ -40,7 +40,7 @@ namespace cudf { */ class scalar { public: - ~scalar() = default; + virtual ~scalar() = default; scalar(scalar&& other) = default; scalar(scalar const& other) = default; scalar& operator=(scalar const& other) = delete; From bfffdcd9467ceae0e240ab603e936c5ee7421d44 Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Sat, 14 Mar 2020 13:28:46 -0500 Subject: [PATCH 58/79] changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7bfc58c5222..ba6bc22b152 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -262,6 +262,7 @@ - PR #4494 Update Java memory event handler for new RMM resource API - PR #4505 Fix 0 length buffers during serialization - PR #4482 Fix `.str.rsplit`, `.str.split`, `.str.find`, `.str.rfind`, `.str.index`, `.str.rindex` and enable related tests +- PR #4513 Backport scalar virtual destructor fix # cuDF 0.12.0 (04 Feb 2020) From 943aa0f11f5145d2d48f9fee780d9f4e91dd8701 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Sat, 14 Mar 2020 16:22:45 -0500 Subject: [PATCH 59/79] strings.py: return correct subclass from nvtext methods --- python/cudf/cudf/core/column/string.py | 28 +++++++++++++++++--------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index f935a2a9f1f..df765e397a9 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -1799,26 +1799,34 @@ def translate(self, table, **kwargs): cpp_translate(self._column, table), **kwargs ) - def normalize_spaces(self): - return cpp_normalize_spaces(self._column) + def normalize_spaces(self, **kwargs): + return self._return_or_inplace( + cpp_normalize_spaces(self._column), **kwargs + ) - def tokenize(self, delimiter=" "): + def tokenize(self, delimiter=" ", **kwargs): delimiter = _massage_string_arg(delimiter, "delimiter", allow_col=True) - return cpp_tokenize(self._column, delimiter) + return self._return_or_inplace(cpp_tokenize(self._column, delimiter)) - def token_count(self, delimiter=" "): + def token_count(self, delimiter=" ", **kwargs): delimiter = _massage_string_arg(delimiter, "delimiter", allow_col=True) - return cpp_count_tokens(self._column, delimiter) + return self._return_or_inplace( + cpp_count_tokens(self._column, delimiter) + ) - def ngrams(self, n=2, separator="_"): + def ngrams(self, n=2, separator="_", **kwargs): separator = _massage_string_arg(separator, "separator") - return cpp_generate_ngrams(self._column, n, separator) + return self._return_or_inplace( + cpp_generate_ngrams(self._column, n, separator) + ) - def ngrams_tokenize(self, n=2, delimiter=" ", separator="_"): + def ngrams_tokenize(self, n=2, delimiter=" ", separator="_", **kwargs): delimiter = _massage_string_arg(delimiter, "delimiter") separator = _massage_string_arg(separator, "separator") - return cpp_ngrams_tokenize(self._column, n, delimiter, separator) + return self._return_or_inplace( + cpp_ngrams_tokenize(self._column, n, delimiter, separator) + ) def _massage_string_arg(value, name, allow_col=False): From 897b9413747b6445b778ebec4f3858c0d0865912 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Sat, 14 Mar 2020 16:23:57 -0500 Subject: [PATCH 60/79] string.py: move unnecessary code down past early returns. --- python/cudf/cudf/core/column/string.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index df765e397a9..a3c566c1d97 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -1834,23 +1834,23 @@ def _massage_string_arg(value, name, allow_col=False): from cudf._libxx.column import Column from cudf.utils.dtypes import is_string_dtype - allowed_types = ["Scalar"] - if isinstance(value, str): return Scalar(value, dtype="str") if isinstance(value, Scalar) and is_string_dtype(value.dtype): return value - if allow_col: - allowed_types += ["Column"] + allowed_types = ["Scalar"] + if allow_col: if isinstance(value, list): return column.as_column(value, dtype="str") if isinstance(value, Column) and is_string_dtype(value.dtype): return value + allowed_types.append("Column") + raise ValueError( "Expected {} for {} but got {}".format( _expected_types_format(allowed_types), name, type(value) @@ -1859,9 +1859,6 @@ def _massage_string_arg(value, name, allow_col=False): def _expected_types_format(types): - if len(types) == 0: - raise ValueError - if len(types) == 1: return types[0] From ef32f2cd4f5b00744bd4f8024fa762eb9ac36333 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Sat, 14 Mar 2020 17:04:24 -0500 Subject: [PATCH 61/79] strings.py: fix nvtext methods incomptable index errors, update unit tests --- python/cudf/cudf/core/column/string.py | 27 ++++++++++++++++++-------- python/cudf/cudf/tests/test_text.py | 11 +++++++++++ 2 files changed, 30 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index a3c566c1d97..0d360ecf13f 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -214,9 +214,15 @@ def _return_or_inplace(self, new_col, **kwargs): index=self._parent.index, ) elif isinstance(self._parent, Series): - return Series( - new_col, index=self._parent.index, name=self._parent.name - ) + retain_index = kwargs.get("retain_index", True) + if retain_index: + return Series( + new_col, + name=self._parent.name, + index=self._parent.index, + ) + else: + return Series(new_col, name=self._parent.name) elif isinstance(self._parent, Index): return as_index(new_col, name=self._parent.name) else: @@ -1806,26 +1812,31 @@ def normalize_spaces(self, **kwargs): def tokenize(self, delimiter=" ", **kwargs): delimiter = _massage_string_arg(delimiter, "delimiter", allow_col=True) - return self._return_or_inplace(cpp_tokenize(self._column, delimiter)) + kwargs.setdefault("retain_index", False) + return self._return_or_inplace( + cpp_tokenize(self._column, delimiter), **kwargs + ) def token_count(self, delimiter=" ", **kwargs): delimiter = _massage_string_arg(delimiter, "delimiter", allow_col=True) return self._return_or_inplace( - cpp_count_tokens(self._column, delimiter) + cpp_count_tokens(self._column, delimiter), **kwargs ) def ngrams(self, n=2, separator="_", **kwargs): separator = _massage_string_arg(separator, "separator") + kwargs.setdefault("retain_index", False) return self._return_or_inplace( - cpp_generate_ngrams(self._column, n, separator) + cpp_generate_ngrams(self._column, n, separator), **kwargs ) def ngrams_tokenize(self, n=2, delimiter=" ", separator="_", **kwargs): delimiter = _massage_string_arg(delimiter, "delimiter") separator = _massage_string_arg(separator, "separator") - + kwargs.setdefault("retain_index", False) return self._return_or_inplace( - cpp_ngrams_tokenize(self._column, n, delimiter, separator) + cpp_ngrams_tokenize(self._column, n, delimiter, separator), + **kwargs, ) diff --git a/python/cudf/cudf/tests/test_text.py b/python/cudf/cudf/tests/test_text.py index 16523054c17..166cc1fdbb2 100644 --- a/python/cudf/cudf/tests/test_text.py +++ b/python/cudf/cudf/tests/test_text.py @@ -38,6 +38,7 @@ def test_tokenize(): actual = strings.str.tokenize() + assert type(expected) == type(actual) assert_series_equal(expected.to_pandas(), actual.to_pandas()) @@ -66,6 +67,7 @@ def test_token_count(delimiter, expected_token_counts): actual = strings.str.token_count(delimiter) + assert type(expected) == type(actual) assert_series_equal( expected.to_pandas(), actual.to_pandas(), check_dtype=False ) @@ -91,6 +93,7 @@ def test_normalize_spaces(): actual = strings.str.normalize_spaces() + assert type(expected) == type(actual) assert_series_equal(expected.to_pandas(), actual.to_pandas()) @@ -128,8 +131,12 @@ def test_ngrams(n, separator, expected_values): strings = cudf.Series( ["this", "is", "my", "favorite", "book", "on", "my", "bookshelf"] ) + expected = cudf.Series(expected_values) + actual = strings.str.ngrams(n=n, separator=separator) + + assert type(expected) == type(actual) assert_series_equal(expected.to_pandas(), actual.to_pandas()) @@ -157,6 +164,10 @@ def test_ngrams(n, separator, expected_values): ) def test_ngrams_tokenize(n, separator, expected_values): strings = cudf.Series(["this is my favorite", "book on my bookshelf"]) + expected = cudf.Series(expected_values) + actual = strings.str.ngrams_tokenize(n=n, separator=separator) + + assert type(expected) == type(actual) assert_series_equal(expected.to_pandas(), actual.to_pandas()) From e6616af76ba271b5f13d5a674364116116438b78 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Sun, 15 Mar 2020 20:22:20 -0700 Subject: [PATCH 62/79] remove condition check for nsmallest & nlargest --- python/cudf/cudf/core/series.py | 13 +++++++---- python/cudf/cudf/tests/test_sorting.py | 32 +++++++++++--------------- 2 files changed, 23 insertions(+), 22 deletions(-) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 07fab5e2d9d..9bef799c132 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1462,13 +1462,18 @@ def sort_values(self, ascending=True, na_position="last"): return vals.set_index(index) def _n_largest_or_smallest(self, largest, n, keep): - if not (0 <= n <= len(self)): - raise ValueError("n out-of-bound") direction = largest if keep == "first": - return self.sort_values(ascending=not direction)[:n] + if n < 0: + n = 0 + return self.sort_values(ascending=not direction).head(n) elif keep == "last": - return self.sort_values(ascending=direction)[-n:].reverse() + data = self.sort_values(ascending=direction) + if n <= 0: + data = data[-n:-n] + else: + data = data.tail(n) + return data.reverse() else: raise ValueError('keep must be either "first", "last"') diff --git a/python/cudf/cudf/tests/test_sorting.py b/python/cudf/cudf/tests/test_sorting.py index 49e273b76e8..87a148553ec 100644 --- a/python/cudf/cudf/tests/test_sorting.py +++ b/python/cudf/cudf/tests/test_sorting.py @@ -75,34 +75,30 @@ def test_series_sort_index(nelem, asc): np.testing.assert_array_equal(orig, got) -def test_series_nlargest(): +@pytest.mark.parametrize("data", [[0, 1, 1, 2, 2, 2, 3, 3], [0], [1, 2, 3]]) +@pytest.mark.parametrize("n", [-100, -50, -12, -2, 0, 1, 2, 3, 4, 7]) +def test_series_nlargest(data, n): """Indirectly tests Series.sort_values() """ - sr = Series([0, 1, 1, 2, 2, 2, 3, 3]) - got = sr.nlargest(3) # default keep='first' - assert list(got) == [3, 3, 2] - assert list(got.index.values) == [6, 7, 3] - - got = sr.nlargest(3, keep="last") - assert list(got) == [3, 3, 2] - assert list(got.index.values) == [7, 6, 5] + sr = Series(data) + psr = pd.Series(data) + assert_eq(sr.nlargest(n), psr.nlargest(n)) + assert_eq(sr.nlargest(n, keep="last"), psr.nlargest(n, keep="last")) with pytest.raises(ValueError) as raises: sr.nlargest(3, keep="what") assert raises.match('keep must be either "first", "last"') -def test_series_nsmallest(): +@pytest.mark.parametrize("data", [[0, 1, 1, 2, 2, 2, 3, 3], [0], [1, 2, 3]]) +@pytest.mark.parametrize("n", [-100, -50, -12, -2, 0, 1, 2, 3, 4, 9]) +def test_series_nsmallest(data, n): """Indirectly tests Series.sort_values() """ - sr = Series([0, 1, 1, 2, 2, 2, 3, 3]) - got = sr.nsmallest(3) # default keep='first' - assert list(got) == [0, 1, 1] - assert list(got.index.values) == [0, 1, 2] - - got = sr.nsmallest(3, keep="last") - assert list(got) == [0, 1, 1] - assert list(got.index.values) == [0, 2, 1] + sr = Series(data) + psr = pd.Series(data) + assert_eq(sr.nsmallest(n), psr.nsmallest(n)) + assert_eq(sr.nsmallest(n, keep="last"), psr.nsmallest(n, keep="last")) with pytest.raises(ValueError) as raises: sr.nsmallest(3, keep="what") From 36a67835bf564bfccaf943bc80a464885e12fb42 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Sun, 15 Mar 2020 20:27:17 -0700 Subject: [PATCH 63/79] remove n range check in DataFrame._n_largest_or_smallest --- python/cudf/cudf/core/dataframe.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 4c31547cb1b..b22b5570c74 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -2137,8 +2137,7 @@ def _n_largest_or_smallest(self, method, n, columns, keep): [column] = columns else: column = columns - if not (0 <= n <= len(self)): - raise ValueError("n out-of-bound") + col = self[column].reset_index(drop=True) # Operate sorted_series = getattr(col, method)(n=n, keep=keep) From f81e63dee8526f29470d6df1978369e205906f04 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Sun, 15 Mar 2020 22:33:44 -0500 Subject: [PATCH 64/79] Update CHANGELOG.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index c2d1d394bac..a726fb40f9d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -258,6 +258,7 @@ - PR #4474 Fix to not materialize RangeIndex in copy_categories - PR #4496 Skip tests which require 2+ GPUs - PR #4494 Update Java memory event handler for new RMM resource API +- PR #4519 Remove `n` validation for `nlargest` & `nsmallest` and add negative support for `n` # cuDF 0.12.0 (04 Feb 2020) From f74adf84709922f684f2ce644f9da8b17bc157fe Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Mon, 16 Mar 2020 16:39:06 +0530 Subject: [PATCH 65/79] Document dropping mask of ARGMIN/MAX gather map --- cpp/src/groupby/hash/groupby.cu | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index a039000ef2a..ed4daec7f68 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -170,6 +170,11 @@ void sparse_to_dense_results( auto tranformed_agg = std::make_unique(agg_kind); auto arg_result = to_dense_agg_result(tranformed_agg); if (arg_result->nullable()) { + // We make a view of ARG(MIN/MAX) result without a null mask and gather + // using this map. The values in data buffer of ARG(MIN/MAX) result + // corresponding to null values was initialized to ARG(MIN/MAX)_SENTINEL + // which is an out of bounds index value (-1) and causes the gathered + // value to be null. column_view null_removed_map(data_type(type_to_id()), arg_result->size(), static_cast(arg_result->view().template data())); From 3a800ca7879210c9a1296637e9647a64f9843f2e Mon Sep 17 00:00:00 2001 From: "Ram (Ramakrishna Prabhu)" Date: Mon, 16 Mar 2020 11:50:47 -0500 Subject: [PATCH 66/79] test cases pass --- python/cudf/cudf/core/dataframe.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index d0acd24a369..5bc8a0bb802 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -501,7 +501,6 @@ def mask(self, other): def __setitem__(self, arg, value): """Add/set column by *arg or DataFrame* """ - print("__setitem__ is being called ___________________") if isinstance(arg, DataFrame): # not handling set_item where arg = df & value = df if isinstance(value, DataFrame): @@ -531,6 +530,10 @@ def __setitem__(self, arg, value): else: if arg in self._data: if len(self) == 0: + if isinstance(value, (pd.Series, Series)): + self._index = as_index(value.index) + elif len(value) > 0: + self._index = RangeIndex(start=0, stop=len(value)) value = column.as_column(value) new_data = self._data.__class__() for key in self._data: @@ -549,15 +552,11 @@ def __setitem__(self, arg, value): value = Series(value)._align_to_index( self._index, how="right", allow_non_unique=True ) - if is_scalar(arg): - arg=[arg] if is_scalar(value): - for key in arg: - self._data[key][:] = value + self._data[arg][:] = value else: value = as_column(value) - for key in arg: - self._data[key] = value + self._data[arg] = value else: # disc. with pandas here # pandas raises key error here From be55bc6c505e280029790525efc9a688b8b2764f Mon Sep 17 00:00:00 2001 From: "Ram (Ramakrishna Prabhu)" Date: Mon, 16 Mar 2020 12:23:19 -0500 Subject: [PATCH 67/79] CHANGELOG.md --- CHANGELOG.md | 1 + python/cudf/cudf/core/dataframe.py | 14 ++++++-------- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 95d5443981d..f8db84bb07e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -169,6 +169,7 @@ - PR #4503 Port binaryop.pyx to libcudf++ API - PR #4499 Adding changes to handle include `keep_index` and `RangeIndex` - PR #4493 Skip legacy testing in CI +- PR #4524 Updating `__setitem__` for DataFrame to use scalar scatter ## Bug Fixes diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 5ed38e1cf87..13143e4a927 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -537,13 +537,13 @@ def __setitem__(self, arg, value): value = column.as_column(value) new_data = self._data.__class__() for key in self._data: - if key in arg or key == arg: + if key == arg: new_data[key] = value else: new_data[key] = column.column_empty_like( - self._data[key], - masked=True, - newsize=len(value), + self._data[key], + masked=True, + newsize=len(value), ) self._data = new_data @@ -568,7 +568,7 @@ def __setitem__(self, arg, value): mask = arg if isinstance(mask, list): mask = np.array(mask) - mask = np.array(arg) + if mask.dtype == "bool": mask = column.as_column(arg) @@ -593,14 +593,12 @@ def __setitem__(self, arg, value): mask=None, ) else: - if not is_scalar(value): - value = column.as_column(value) for col in arg: # we will raise a key error if col not in dataframe # this behavior will make it # consistent to pandas >0.21.0 if not is_scalar(value): - self._data[col] = value + self._data[col] = column.as_column(value) else: self._data[col][:] = value From 76bc0e823a8abbf224b2a65862133dcd56298fc4 Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Mon, 16 Mar 2020 23:35:29 +0530 Subject: [PATCH 68/79] Review code cleanup requested by karthikeyan https://github.com/rapidsai/cudf/pull/4456#discussion_r393186736 --- cpp/src/groupby/hash/groupby.cu | 33 +++++++++++++-------------------- 1 file changed, 13 insertions(+), 20 deletions(-) diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index ed4daec7f68..66e8138d45b 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -167,26 +167,19 @@ void sparse_to_dense_results( auto transformed_result = [&col, to_dense_agg_result, mr, stream] (auto const& agg_kind) { - auto tranformed_agg = std::make_unique(agg_kind); - auto arg_result = to_dense_agg_result(tranformed_agg); - if (arg_result->nullable()) { - // We make a view of ARG(MIN/MAX) result without a null mask and gather - // using this map. The values in data buffer of ARG(MIN/MAX) result - // corresponding to null values was initialized to ARG(MIN/MAX)_SENTINEL - // which is an out of bounds index value (-1) and causes the gathered - // value to be null. - column_view null_removed_map(data_type(type_to_id()), - arg_result->size(), - static_cast(arg_result->view().template data())); - auto transformed_result = experimental::detail::gather( - table_view({col}), null_removed_map, false, true, false, mr, stream); - return std::move(transformed_result->release()[0]); - } - else { - auto transformed_result = experimental::detail::gather( - table_view({col}), *arg_result, false, false, false, mr, stream); - return std::move(transformed_result->release()[0]); - } + auto transformed_agg = std::make_unique(agg_kind); + auto arg_result = to_dense_agg_result(transformed_agg); + // We make a view of ARG(MIN/MAX) result without a null mask and gather + // using this map. The values in data buffer of ARG(MIN/MAX) result + // corresponding to null values was initialized to ARG(MIN/MAX)_SENTINEL + // which is an out of bounds index value (-1) and causes the gathered + // value to be null. + column_view null_removed_map(data_type(type_to_id()), + arg_result->size(), + static_cast(arg_result->view().template data())); + auto transformed_result = experimental::detail::gather(table_view({col}), + null_removed_map, false, arg_result->nullable(), false, mr, stream); + return std::move(transformed_result->release()[0]); }; for (auto &&agg : agg_v) { From 50ecc6253885890c77b2a028be2ff6cf86f79076 Mon Sep 17 00:00:00 2001 From: "Ram (Ramakrishna Prabhu)" <42624703+rgsl888prabhu@users.noreply.github.com> Date: Mon, 16 Mar 2020 14:15:53 -0500 Subject: [PATCH 69/79] Update dataframe.py --- python/cudf/cudf/core/dataframe.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 13143e4a927..bacb6bfdcc5 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -593,12 +593,14 @@ def __setitem__(self, arg, value): mask=None, ) else: + if not is_scalar(value): + value = column.as_column(value) for col in arg: # we will raise a key error if col not in dataframe # this behavior will make it # consistent to pandas >0.21.0 if not is_scalar(value): - self._data[col] = column.as_column(value) + self._data[col] = value else: self._data[col][:] = value From 38b19531cdd3d56f1b74a95d2622cf5a0bedd61f Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 16 Mar 2020 12:25:31 -0700 Subject: [PATCH 70/79] fix issue related to index slicing when the dataframe is empty --- python/cudf/cudf/core/indexing.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/indexing.py b/python/cudf/cudf/core/indexing.py index 63ba9ce0f89..ea17b7f5f62 100755 --- a/python/cudf/cudf/core/indexing.py +++ b/python/cudf/cudf/core/indexing.py @@ -313,7 +313,10 @@ def _getitem_tuple_arg(self, arg): if df.shape[0] == 0 and df.shape[1] == 0 and isinstance(arg[0], slice): from cudf.core.index import RangeIndex - slice_len = arg[0].stop or len(self._df) + if len(self._df) > 0: + slice_len = arg[0].stop or len(self._df) + else: + slice_len = len(self._df) start, stop, step = arg[0].indices(slice_len) df._index = RangeIndex(start, stop) return df From d944fd2e7943fc49f6efd21e6c8154f25a67b932 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 16 Mar 2020 12:26:22 -0700 Subject: [PATCH 71/79] add test related to empty dataframe head/tail --- python/cudf/cudf/tests/test_index.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 8291859fa93..3205b686863 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -323,3 +323,10 @@ def test_index_names(): ) def test_range_index_from_range(data): assert_eq(pd.Index(data), cudf.core.index.as_index(data)) + + +def test_empty_df_head_tail_index(): + df = cudf.DataFrame() + pdf = pd.DataFrame() + assert_eq(df.head().index.values, pdf.head().index.values) + assert_eq(df.tail().index.values, pdf.tail().index.values) From f8726c78d5aa9138ac6767276aa3bd2b66c635c2 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 16 Mar 2020 14:34:57 -0500 Subject: [PATCH 72/79] Update CHANGELOG.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index c2d1d394bac..f17f914d17b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -258,6 +258,7 @@ - PR #4474 Fix to not materialize RangeIndex in copy_categories - PR #4496 Skip tests which require 2+ GPUs - PR #4494 Update Java memory event handler for new RMM resource API +- PR #4526 Fix index slicing issue for index incase of an empty dataframe # cuDF 0.12.0 (04 Feb 2020) From 7e06a000016fa67da12b61afe607cd53c7be86f9 Mon Sep 17 00:00:00 2001 From: "Ram (Ramakrishna Prabhu)" <42624703+rgsl888prabhu@users.noreply.github.com> Date: Mon, 16 Mar 2020 15:06:01 -0500 Subject: [PATCH 73/79] Update dataframe.py --- python/cudf/cudf/core/dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index bacb6bfdcc5..56f84506356 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -593,7 +593,7 @@ def __setitem__(self, arg, value): mask=None, ) else: - if not is_scalar(value): + if not is_scalar(value): value = column.as_column(value) for col in arg: # we will raise a key error if col not in dataframe From 7bcacde03e3bbcb335d9cfcb63c250cfa80c5af5 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 16 Mar 2020 14:22:53 -0700 Subject: [PATCH 74/79] handling all cases --- python/cudf/cudf/core/indexing.py | 5 +---- python/cudf/cudf/tests/test_index.py | 19 ++++++++++++++++--- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/core/indexing.py b/python/cudf/cudf/core/indexing.py index ea17b7f5f62..201a331779a 100755 --- a/python/cudf/cudf/core/indexing.py +++ b/python/cudf/cudf/core/indexing.py @@ -313,10 +313,7 @@ def _getitem_tuple_arg(self, arg): if df.shape[0] == 0 and df.shape[1] == 0 and isinstance(arg[0], slice): from cudf.core.index import RangeIndex - if len(self._df) > 0: - slice_len = arg[0].stop or len(self._df) - else: - slice_len = len(self._df) + slice_len = len(self._df) start, stop, step = arg[0].indices(slice_len) df._index = RangeIndex(start, stop) return df diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 3205b686863..2bf2bb76dd3 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -325,8 +325,21 @@ def test_range_index_from_range(data): assert_eq(pd.Index(data), cudf.core.index.as_index(data)) -def test_empty_df_head_tail_index(): +@pytest.mark.parametrize( + "n", [-10, -5, -2, 0, 1, 0, 2, 5, 10], +) +def test_empty_df_head_tail_index(n): df = cudf.DataFrame() pdf = pd.DataFrame() - assert_eq(df.head().index.values, pdf.head().index.values) - assert_eq(df.tail().index.values, pdf.tail().index.values) + assert_eq(df.head(n).index.values, pdf.head(n).index.values) + assert_eq(df.tail(n).index.values, pdf.tail(n).index.values) + + df = cudf.DataFrame({"a": [11, 2, 33, 44, 55]}) + pdf = pd.DataFrame({"a": [11, 2, 33, 44, 55]}) + assert_eq(df.head(n).index.values, pdf.head(n).index.values) + assert_eq(df.tail(n).index.values, pdf.tail(n).index.values) + + df = cudf.DataFrame(index=[1, 2, 3]) + pdf = pd.DataFrame(index=[1, 2, 3]) + assert_eq(df.head(n).index.values, pdf.head(n).index.values) + assert_eq(df.tail(n).index.values, pdf.tail(n).index.values) From a4839caaafcc879e2a77699e89cd11498890beb7 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Mon, 16 Mar 2020 16:32:17 -0500 Subject: [PATCH 75/79] string.py: default tokenize arguments to `""`, meaning "all whitespace". --- python/cudf/cudf/core/column/string.py | 6 +++--- python/cudf/cudf/tests/test_text.py | 3 +-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 7d12c98e0c9..b34c6192d99 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -1801,11 +1801,11 @@ def translate(self, table, **kwargs): def normalize_spaces(self): return cpp_normalize_spaces(self._column) - def tokenize(self, delimiter=" "): + def tokenize(self, delimiter=""): delimiter = _massage_string_arg(delimiter, "delimiter", allow_col=True) return cpp_tokenize(self._column, delimiter) - def token_count(self, delimiter=" "): + def token_count(self, delimiter=""): delimiter = _massage_string_arg(delimiter, "delimiter", allow_col=True) return cpp_count_tokens(self._column, delimiter) @@ -1813,7 +1813,7 @@ def ngrams(self, n=2, separator="_"): separator = _massage_string_arg(separator, "separator") return cpp_generate_ngrams(self._column, n, separator) - def ngrams_tokenize(self, n=2, delimiter=" ", separator="_"): + def ngrams_tokenize(self, n=2, delimiter="", separator="_"): delimiter = _massage_string_arg(delimiter, "delimiter") separator = _massage_string_arg(separator, "separator") diff --git a/python/cudf/cudf/tests/test_text.py b/python/cudf/cudf/tests/test_text.py index 16523054c17..d6a2bfe48b3 100644 --- a/python/cudf/cudf/tests/test_text.py +++ b/python/cudf/cudf/tests/test_text.py @@ -44,11 +44,10 @@ def test_tokenize(): @pytest.mark.parametrize( "delimiter, expected_token_counts", [ - (" ", [10, 9, 0, 0, 1]), # TODO: verify last count should be 1, not 5 + ("", [10, 9, 0, 0, 5]), ("o", [6, 3, 0, 0, 1]), (["a", "e", "i", "o", "u"], [13, 13, 0, 0, 6]), (["a", "e", "i", "o"], [12, 11, 0, 0, 6]), - # ([], [10, 9, 0, 0, 5]), # throws ], ) def test_token_count(delimiter, expected_token_counts): From 20a03f9fe7cf5a232aeea615eb05ac1fdff42a28 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Mon, 16 Mar 2020 16:25:55 -0700 Subject: [PATCH 76/79] Import `tlz` for optional `cytoolz` support --- CHANGELOG.md | 1 + python/dask_cudf/dask_cudf/accessor.py | 2 +- python/dask_cudf/dask_cudf/core.py | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fc38ac15cbf..178df1cb738 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -168,6 +168,7 @@ - PR #4461 Port nvstrings Miscellaneous functions to cuDF Python/Cython - PR #4503 Port binaryop.pyx to libcudf++ API - PR #4499 Adding changes to handle include `keep_index` and `RangeIndex` +- PR #4533 Import `tlz` for optional `cytoolz` support - PR #4493 Skip legacy testing in CI ## Bug Fixes diff --git a/python/dask_cudf/dask_cudf/accessor.py b/python/dask_cudf/dask_cudf/accessor.py index eb6e50ea1ad..fceb2c74470 100644 --- a/python/dask_cudf/dask_cudf/accessor.py +++ b/python/dask_cudf/dask_cudf/accessor.py @@ -12,7 +12,7 @@ """ -from toolz import partial +from tlz import partial import cudf from cudf.core.column.categorical import ( diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py index b245970173d..549e319fc90 100644 --- a/python/dask_cudf/dask_cudf/core.py +++ b/python/dask_cudf/dask_cudf/core.py @@ -5,7 +5,7 @@ import numpy as np import pandas as pd -from toolz import partition_all +from tlz import partition_all import dask import dask.dataframe as dd From 27a0520b598e569c06b313f2a5d84a03a0592185 Mon Sep 17 00:00:00 2001 From: Jake Hemstad Date: Mon, 16 Mar 2020 19:16:10 -0500 Subject: [PATCH 77/79] Disable errors from deprecation warnings. --- cpp/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index fe0cce676ba..f3ab25bd243 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -48,7 +48,7 @@ set(CMAKE_CUDA_STANDARD 14) set(CMAKE_CUDA_STANDARD_REQUIRED ON) if(CMAKE_COMPILER_IS_GNUCXX) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -Wno-error=deprecated-declarations") # Suppress parentheses warning which causes gmock to fail set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -Wno-parentheses") @@ -110,7 +110,7 @@ set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda --expt-relaxed- # set warnings as errors # TODO: remove `no-maybe-unitialized` used to suppress warnings in rmm::exec_policy -set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Werror cross-execution-space-call -Xcompiler -Wall,-Werror") +set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Werror cross-execution-space-call -Xcompiler -Wall,-Werror,-Wno-error=deprecated-declarations") # Option to enable line info in CUDA device compilation to allow introspection when profiling / memchecking option(CMAKE_CUDA_LINEINFO "Enable the -lineinfo option for nvcc (useful for cuda-memcheck / profiler" OFF) From 1915d0a6c71b4b158bac3f037af7dbafc3ca9309 Mon Sep 17 00:00:00 2001 From: Jake Hemstad Date: Mon, 16 Mar 2020 19:37:15 -0500 Subject: [PATCH 78/79] changelog. --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index fc38ac15cbf..ad851d17160 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -169,6 +169,7 @@ - PR #4503 Port binaryop.pyx to libcudf++ API - PR #4499 Adding changes to handle include `keep_index` and `RangeIndex` - PR #4493 Skip legacy testing in CI +- PR #4534 Disable deprecation warnings as errors. ## Bug Fixes From c9a3acbdeecacf83ae2df0e430bb5c742351f1e9 Mon Sep 17 00:00:00 2001 From: Keith Kraus Date: Mon, 16 Mar 2020 22:35:11 -0400 Subject: [PATCH 79/79] Add deprecation warning handling to pyniNVStrings as well --- python/nvstrings/cpp/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/nvstrings/cpp/CMakeLists.txt b/python/nvstrings/cpp/CMakeLists.txt index 15c70849dcd..ade21cdfaa7 100644 --- a/python/nvstrings/cpp/CMakeLists.txt +++ b/python/nvstrings/cpp/CMakeLists.txt @@ -41,7 +41,7 @@ set(CMAKE_CXX_COMPILER $ENV{CXX}) set(CMAKE_CXX_STANDARD_REQUIRED ON) if(CMAKE_COMPILER_IS_GNUCXX) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -Wno-error=deprecated-declarations") option(CMAKE_CXX11_ABI "Enable the GLIBCXX11 ABI" ON) if(CMAKE_CXX11_ABI)