From 098b1d8bd4517d0b220712ce1fcdfd7c5e473411 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 8 Jul 2022 13:14:50 -0700 Subject: [PATCH 1/3] Remove `stable_sort_order` and use `cpp_distinct` --- python/cudf/cudf/_lib/cpp/sorting.pxd | 6 --- .../cudf/cudf/_lib/cpp/stream_compaction.pxd | 3 +- python/cudf/cudf/_lib/stream_compaction.pyx | 44 ++++--------------- 3 files changed, 10 insertions(+), 43 deletions(-) diff --git a/python/cudf/cudf/_lib/cpp/sorting.pxd b/python/cudf/cudf/_lib/cpp/sorting.pxd index c6c42c327ac..46fbf0b1e4d 100644 --- a/python/cudf/cudf/_lib/cpp/sorting.pxd +++ b/python/cudf/cudf/_lib/cpp/sorting.pxd @@ -20,12 +20,6 @@ cdef extern from "cudf/sorting.hpp" namespace "cudf" nogil: vector[libcudf_types.order] column_order, vector[libcudf_types.null_order] null_precedence) except + - cdef unique_ptr[table] stable_sort_by_key( - const table_view& values, - const table_view& keys, - vector[libcudf_types.order] column_order, - vector[libcudf_types.null_order] null_precedence) except + - cdef unique_ptr[column] rank( column_view input_view, rank_method method, diff --git a/python/cudf/cudf/_lib/cpp/stream_compaction.pxd b/python/cudf/cudf/_lib/cpp/stream_compaction.pxd index 61efd040807..186e8a81abb 100644 --- a/python/cudf/cudf/_lib/cpp/stream_compaction.pxd +++ b/python/cudf/cudf/_lib/cpp/stream_compaction.pxd @@ -20,6 +20,7 @@ from cudf._lib.cpp.types cimport ( cdef extern from "cudf/stream_compaction.hpp" namespace "cudf" \ nogil: ctypedef enum duplicate_keep_option: + KEEP_ANY 'cudf::duplicate_keep_option::KEEP_ANY' KEEP_FIRST 'cudf::duplicate_keep_option::KEEP_FIRST' KEEP_LAST 'cudf::duplicate_keep_option::KEEP_LAST' KEEP_NONE 'cudf::duplicate_keep_option::KEEP_NONE' @@ -33,7 +34,7 @@ cdef extern from "cudf/stream_compaction.hpp" namespace "cudf" \ column_view boolean_mask ) except + - cdef unique_ptr[table] unique( + cdef unique_ptr[table] distinct( table_view source_table, vector[size_type] keys, duplicate_keep_option keep, diff --git a/python/cudf/cudf/_lib/stream_compaction.pyx b/python/cudf/cudf/_lib/stream_compaction.pyx index b645fcd59d0..763b8fdc7e6 100644 --- a/python/cudf/cudf/_lib/stream_compaction.pyx +++ b/python/cudf/cudf/_lib/stream_compaction.pyx @@ -9,22 +9,19 @@ from libcpp.vector cimport vector from cudf._lib.column cimport Column from cudf._lib.cpp.column.column_view cimport column_view -from cudf._lib.cpp.sorting cimport stable_sort_by_key as cpp_stable_sort_by_key from cudf._lib.cpp.stream_compaction cimport ( apply_boolean_mask as cpp_apply_boolean_mask, + distinct as cpp_distinct, distinct_count as cpp_distinct_count, drop_nulls as cpp_drop_nulls, duplicate_keep_option, - unique as cpp_unique, ) from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport ( nan_policy, null_equality, - null_order, null_policy, - order, size_type, ) from cudf._lib.utils cimport ( @@ -132,14 +129,16 @@ def drop_duplicates(list columns, ) cdef duplicate_keep_option cpp_keep_option - if keep == 'first': - cpp_keep_option = duplicate_keep_option.KEEP_FIRST + if keep == 'any': + cpp_keep_option = duplicate_keep_option.KEEP_ANY + elif keep == 'first': + cpp_keep_option = duplicate_keep_option.KEEP_FIRST elif keep == 'last': cpp_keep_option = duplicate_keep_option.KEEP_LAST elif keep is False: cpp_keep_option = duplicate_keep_option.KEEP_NONE else: - raise ValueError('keep must be either "first", "last" or False') + raise ValueError('keep must be either "any", "first", "last" or False') # shifting the index number by number of index columns cdef null_equality cpp_nulls_equal = ( @@ -148,40 +147,13 @@ def drop_duplicates(list columns, else null_equality.UNEQUAL ) - cdef vector[order] column_order = ( - vector[order]( - cpp_keys.size(), - order.ASCENDING - ) - ) - cdef vector[null_order] null_precedence = ( - vector[null_order]( - cpp_keys.size(), - null_order.BEFORE - ) - ) - cdef table_view source_table_view = table_view_from_columns(columns) - cdef table_view keys_view = source_table_view.select(cpp_keys) - cdef unique_ptr[table] sorted_source_table cdef unique_ptr[table] c_result with nogil: - # cudf::unique keeps unique rows in each consecutive group of - # equivalent rows. To match the behavior of pandas.DataFrame. - # drop_duplicates, users need to stable sort the input first - # and then invoke cudf::unique. - sorted_source_table = move( - cpp_stable_sort_by_key( - source_table_view, - keys_view, - column_order, - null_precedence - ) - ) c_result = move( - cpp_unique( - sorted_source_table.get().view(), + cpp_distinct( + source_table_view, cpp_keys, cpp_keep_option, cpp_nulls_equal From d545cfd25b29c72aed86cd3079e069d5301854ac Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 8 Jul 2022 14:40:20 -0700 Subject: [PATCH 2/3] Fix style --- python/cudf/cudf/_lib/stream_compaction.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/stream_compaction.pyx b/python/cudf/cudf/_lib/stream_compaction.pyx index 763b8fdc7e6..3e53e8cb3a6 100644 --- a/python/cudf/cudf/_lib/stream_compaction.pyx +++ b/python/cudf/cudf/_lib/stream_compaction.pyx @@ -132,7 +132,7 @@ def drop_duplicates(list columns, if keep == 'any': cpp_keep_option = duplicate_keep_option.KEEP_ANY elif keep == 'first': - cpp_keep_option = duplicate_keep_option.KEEP_FIRST + cpp_keep_option = duplicate_keep_option.KEEP_FIRST elif keep == 'last': cpp_keep_option = duplicate_keep_option.KEEP_LAST elif keep is False: From 6e308eff2df6a2ea9b06064331c0910a86ba98fe Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 6 Sep 2022 14:05:51 -0700 Subject: [PATCH 3/3] Remove `any` option --- python/cudf/cudf/_lib/stream_compaction.pyx | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/_lib/stream_compaction.pyx b/python/cudf/cudf/_lib/stream_compaction.pyx index 3e53e8cb3a6..fa67f4f782b 100644 --- a/python/cudf/cudf/_lib/stream_compaction.pyx +++ b/python/cudf/cudf/_lib/stream_compaction.pyx @@ -129,16 +129,14 @@ def drop_duplicates(list columns, ) cdef duplicate_keep_option cpp_keep_option - if keep == 'any': - cpp_keep_option = duplicate_keep_option.KEEP_ANY - elif keep == 'first': + if keep == 'first': cpp_keep_option = duplicate_keep_option.KEEP_FIRST elif keep == 'last': cpp_keep_option = duplicate_keep_option.KEEP_LAST elif keep is False: cpp_keep_option = duplicate_keep_option.KEEP_NONE else: - raise ValueError('keep must be either "any", "first", "last" or False') + raise ValueError('keep must be either "first", "last" or False') # shifting the index number by number of index columns cdef null_equality cpp_nulls_equal = (