diff --git a/python/cudf/cudf/_lib/cpp/sorting.pxd b/python/cudf/cudf/_lib/cpp/sorting.pxd
index 68f01003fe6..86dc0f0de95 100644
--- a/python/cudf/cudf/_lib/cpp/sorting.pxd
+++ b/python/cudf/cudf/_lib/cpp/sorting.pxd
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
@@ -68,3 +68,8 @@ cdef extern from "cudf/sorting.hpp" namespace "cudf" nogil:
         table_view source_table,
         vector[libcudf_types.order] column_order,
         vector[libcudf_types.null_order] null_precedence) except +
+
+    cdef unique_ptr[table] stable_sort(
+        table_view source_table,
+        vector[libcudf_types.order] column_order,
+        vector[libcudf_types.null_order] null_precedence) except +
diff --git a/python/cudf/cudf/_lib/cpp/stream_compaction.pxd b/python/cudf/cudf/_lib/cpp/stream_compaction.pxd
index e8539ecb9c3..55854a9444f 100644
--- a/python/cudf/cudf/_lib/cpp/stream_compaction.pxd
+++ b/python/cudf/cudf/_lib/cpp/stream_compaction.pxd
@@ -30,21 +30,28 @@ cdef extern from "cudf/stream_compaction.hpp" namespace "cudf" nogil:
                                       vector[size_type] keys,
                                       size_type keep_threshold) except +
 
+    cdef unique_ptr[table] drop_nans(table_view source_table,
+                                     vector[size_type] keys,
+                                     size_type keep_threshold) except +
+
     cdef unique_ptr[table] apply_boolean_mask(
         table_view source_table,
         column_view boolean_mask
     ) except +
 
-    cdef size_type distinct_count(
-        column_view source_table,
-        null_policy null_handling,
-        nan_policy nan_handling) except +
+    cdef unique_ptr[table] unique(
+        table_view input,
+        vector[size_type] keys,
+        duplicate_keep_option keep,
+        null_equality nulls_equal,
+    ) except +
 
-    cdef unique_ptr[table] stable_distinct(
+    cdef unique_ptr[table] distinct(
         table_view input,
         vector[size_type] keys,
         duplicate_keep_option keep,
         null_equality nulls_equal,
+        nan_equality nans_equal,
     ) except +
 
     cdef unique_ptr[column] distinct_indices(
@@ -53,3 +60,29 @@ cdef extern from "cudf/stream_compaction.hpp" namespace "cudf" nogil:
         null_equality nulls_equal,
         nan_equality nans_equal,
     ) except +
+
+    cdef unique_ptr[table] stable_distinct(
+        table_view input,
+        vector[size_type] keys,
+        duplicate_keep_option keep,
+        null_equality nulls_equal,
+        nan_equality nans_equal,
+    ) except +
+
+    cdef size_type unique_count(
+        column_view column,
+        null_policy null_handling,
+        nan_policy nan_handling) except +
+
+    cdef size_type unique_count(
+        table_view source_table,
+        null_policy null_handling) except +
+
+    cdef size_type distinct_count(
+        column_view column,
+        null_policy null_handling,
+        nan_policy nan_handling) except +
+
+    cdef size_type distinct_count(
+        table_view source_table,
+        null_policy null_handling) except +
diff --git a/python/cudf/cudf/_lib/pylibcudf/sorting.pxd b/python/cudf/cudf/_lib/pylibcudf/sorting.pxd
index fb22da0b0fd..3ed241622c0 100644
--- a/python/cudf/cudf/_lib/pylibcudf/sorting.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/sorting.pxd
@@ -59,3 +59,5 @@ cpdef Table stable_sort_by_key(
 )
 
 cpdef Table sort(Table source_table, list column_order, list null_precedence)
+
+cpdef Table stable_sort(Table source_table, list column_order, list null_precedence)
diff --git a/python/cudf/cudf/_lib/pylibcudf/sorting.pyx b/python/cudf/cudf/_lib/pylibcudf/sorting.pyx
index 4e73760720a..1668a3efc7c 100644
--- a/python/cudf/cudf/_lib/pylibcudf/sorting.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/sorting.pyx
@@ -50,7 +50,8 @@ cpdef Column stable_sorted_order(
     list column_order,
     list null_precedence,
 ):
-    """Computes the row indices required to sort the table, maintaining input order.
+    """Computes the row indices required to sort the table,
+    preserving order of equal elements.
 
     Parameters
     ----------
@@ -206,7 +207,8 @@ cpdef Table stable_segmented_sort_by_key(
     list column_order,
     list null_precedence,
 ):
-    """Sorts the table by key, within segments, maintaining input order.
+    """Sorts the table by key preserving order of equal elements,
+    within segments.
 
     Parameters
     ----------
@@ -287,7 +289,7 @@ cpdef Table stable_sort_by_key(
     list column_order,
     list null_precedence,
 ):
-    """Sorts the table by key, maintaining input order.
+    """Sorts the table by key preserving order of equal elements.
 
     Parameters
     ----------
@@ -349,3 +351,34 @@ cpdef Table sort(Table source_table, list column_order, list null_precedence):
         )
     )
     return Table.from_libcudf(move(c_result))
+
+
+cpdef Table stable_sort(Table source_table, list column_order, list null_precedence):
+    """Sorts the table preserving order of equal elements.
+
+    Parameters
+    ----------
+    source_table : Table
+        The table to sort.
+    column_order : List[ColumnOrder]
+        Whether each column should be sorted in ascending or descending order.
+    null_precedence : List[NullOrder]
+        Whether nulls should come before or after non-nulls.
+
+    Returns
+    -------
+    Table
+        The sorted table.
+    """
+    cdef unique_ptr[table] c_result
+    cdef vector[order] c_orders = column_order
+    cdef vector[null_order] c_null_precedence = null_precedence
+    with nogil:
+        c_result = move(
+            cpp_sorting.stable_sort(
+                source_table.view(),
+                c_orders,
+                c_null_precedence,
+            )
+        )
+    return Table.from_libcudf(move(c_result))
diff --git a/python/cudf/cudf/_lib/pylibcudf/stream_compaction.pxd b/python/cudf/cudf/_lib/pylibcudf/stream_compaction.pxd
index 78adb20021c..29acc21fc05 100644
--- a/python/cudf/cudf/_lib/pylibcudf/stream_compaction.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/stream_compaction.pxd
@@ -15,19 +15,23 @@ from .table cimport Table
 
 cpdef Table drop_nulls(Table source_table, list keys, size_type keep_threshold)
 
+cpdef Table drop_nans(Table source_table, list keys, size_type keep_threshold)
+
 cpdef Table apply_boolean_mask(Table source_table, Column boolean_mask)
 
-cpdef size_type distinct_count(
-    Column source_table,
-    null_policy null_handling,
-    nan_policy nan_handling
+cpdef Table unique(
+    Table input,
+    list keys,
+    duplicate_keep_option keep,
+    null_equality nulls_equal,
 )
 
-cpdef Table stable_distinct(
+cpdef Table distinct(
     Table input,
     list keys,
     duplicate_keep_option keep,
     null_equality nulls_equal,
+    nan_equality nans_equal,
 )
 
 cpdef Column distinct_indices(
@@ -36,3 +40,23 @@ cpdef Column distinct_indices(
     null_equality nulls_equal,
     nan_equality nans_equal,
 )
+
+cpdef Table stable_distinct(
+    Table input,
+    list keys,
+    duplicate_keep_option keep,
+    null_equality nulls_equal,
+    nan_equality nans_equal,
+)
+
+cpdef size_type unique_count(
+    Column source,
+    null_policy null_handling,
+    nan_policy nan_handling
+)
+
+cpdef size_type distinct_count(
+    Column source,
+    null_policy null_handling,
+    nan_policy nan_handling
+)
diff --git a/python/cudf/cudf/_lib/pylibcudf/stream_compaction.pyx b/python/cudf/cudf/_lib/pylibcudf/stream_compaction.pyx
index 0357866980a..af7a85d31bf 100644
--- a/python/cudf/cudf/_lib/pylibcudf/stream_compaction.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/stream_compaction.pyx
@@ -51,6 +51,34 @@ cpdef Table drop_nulls(Table source_table, list keys, size_type keep_threshold):
     return Table.from_libcudf(move(c_result))
 
 
+cpdef Table drop_nans(Table source_table, list keys, size_type keep_threshold):
+    """Filters out rows from the input table based on the presence of NaNs.
+
+    Parameters
+    ----------
+    source_table : Table
+        The input table to filter.
+    keys : List[size_type]
+        The list of column indexes to consider for NaN filtering.
+    keep_threshold : size_type
+        The minimum number of non-NaNs required to keep a row.
+
+    Returns
+    -------
+    Table
+        A new table with rows removed based on NaNs.
+    """
+    cdef unique_ptr[table] c_result
+    cdef vector[size_type] c_keys = keys
+    with nogil:
+        c_result = move(
+            cpp_stream_compaction.drop_nans(
+                source_table.view(), c_keys, keep_threshold
+            )
+        )
+    return Table.from_libcudf(move(c_result))
+
+
 cpdef Table apply_boolean_mask(Table source_table, Column boolean_mask):
     """Filters out rows from the input table based on a boolean mask.
 
@@ -76,39 +104,55 @@ cpdef Table apply_boolean_mask(Table source_table, Column boolean_mask):
     return Table.from_libcudf(move(c_result))
 
 
-cpdef size_type distinct_count(
-    Column source_table,
-    null_policy null_handling,
-    nan_policy nan_handling
+cpdef Table unique(
+    Table input,
+    list keys,
+    duplicate_keep_option keep,
+    null_equality nulls_equal,
 ):
-    """Returns the number of unique elements in the input column.
+    """Filter duplicate consecutive rows from the input table.
 
     Parameters
     ----------
-    source_table : Column
-        The input column to count the unique elements of.
-    null_handling : null_policy
-        Flag to include or exclude nulls from the count.
-    nan_handling : nan_policy
-        Flag to include or exclude NaNs from the count.
+    input : Table
+        The input table to filter
+    keys : list[int]
+        The list of column indexes to consider for filtering.
+    keep : duplicate_keep_option
+        The option to specify which rows to keep in the case of duplicates.
+    nulls_equal : null_equality
+        The option to specify how nulls are handled in the comparison.
 
     Returns
     -------
-    size_type
-        The number of unique elements in the input column.
+    Table
+        New Table with unique rows from each sequence of equivalent rows
+        as specified by keep. In the same order as the input table.
+
+    Notes
+    -----
+    If the input columns to be filtered on are sorted, then
+    unique can produce the same result as stable_distinct, but faster.
     """
-    return cpp_stream_compaction.distinct_count(
-        source_table.view(), null_handling, nan_handling
-    )
+    cdef unique_ptr[table] c_result
+    cdef vector[size_type] c_keys = keys
+    with nogil:
+        c_result = move(
+            cpp_stream_compaction.unique(
+                input.view(), c_keys, keep, nulls_equal
+            )
+        )
+    return Table.from_libcudf(move(c_result))
 
 
-cpdef Table stable_distinct(
+cpdef Table distinct(
     Table input,
     list keys,
     duplicate_keep_option keep,
     null_equality nulls_equal,
+    nan_equality nans_equal,
 ):
-    """Get the distinct rows from the input table, preserving input order.
+    """Get the distinct rows from the input table.
 
     Parameters
     ----------
@@ -120,18 +164,21 @@ cpdef Table stable_distinct(
         The option to specify which rows to keep in the case of duplicates.
     nulls_equal : null_equality
         The option to specify how nulls are handled in the comparison.
+    nans_equal : nan_equality
+        The option to specify how NaNs are handled in the comparison.
 
     Returns
     -------
     Table
-        A new table with distinct rows from the input table.
+        A new table with distinct rows from the input table. The
+        output will not necessarily be in the same order as the input.
     """
     cdef unique_ptr[table] c_result
     cdef vector[size_type] c_keys = keys
     with nogil:
         c_result = move(
-            cpp_stream_compaction.stable_distinct(
-                input.view(), c_keys, keep, nulls_equal
+            cpp_stream_compaction.distinct(
+                input.view(), c_keys, keep, nulls_equal, nans_equal
             )
         )
     return Table.from_libcudf(move(c_result))
@@ -169,3 +216,99 @@ cpdef Column distinct_indices(
         )
     )
     return Column.from_libcudf(move(c_result))
+
+
+cpdef Table stable_distinct(
+    Table input,
+    list keys,
+    duplicate_keep_option keep,
+    null_equality nulls_equal,
+    nan_equality nans_equal,
+):
+    """Get the distinct rows from the input table, preserving input order.
+
+    Parameters
+    ----------
+    input : Table
+        The input table to filter.
+    keys : list
+        The list of column indexes to consider for distinct filtering.
+    keep : duplicate_keep_option
+        The option to specify which rows to keep in the case of duplicates.
+    nulls_equal : null_equality
+        The option to specify how nulls are handled in the comparison.
+    nans_equal : nan_equality
+        The option to specify how NaNs are handled in the comparison.
+
+    Returns
+    -------
+    Table
+        A new table with distinct rows from the input table, preserving
+        the input table order.
+    """
+    cdef unique_ptr[table] c_result
+    cdef vector[size_type] c_keys = keys
+    with nogil:
+        c_result = move(
+            cpp_stream_compaction.stable_distinct(
+                input.view(), c_keys, keep, nulls_equal, nans_equal
+            )
+        )
+    return Table.from_libcudf(move(c_result))
+
+
+cpdef size_type unique_count(
+    Column source,
+    null_policy null_handling,
+    nan_policy nan_handling
+):
+    """Returns the number of unique consecutive elements in the input column.
+
+    Parameters
+    ----------
+    source : Column
+        The input column to count the unique elements of.
+    null_handling : null_policy
+        Flag to include or exclude nulls from the count.
+    nan_handling : nan_policy
+        Flag to include or exclude NaNs from the count.
+
+    Returns
+    -------
+    size_type
+        The number of unique consecutive elements in the input column.
+
+    Notes
+    -----
+    If the input column is sorted, then unique_count can produce the
+    same result as distinct_count, but faster.
+    """
+    return cpp_stream_compaction.unique_count(
+        source.view(), null_handling, nan_handling
+    )
+
+
+cpdef size_type distinct_count(
+    Column source,
+    null_policy null_handling,
+    nan_policy nan_handling
+):
+    """Returns the number of distinct elements in the input column.
+
+    Parameters
+    ----------
+    source : Column
+        The input column to count the unique elements of.
+    null_handling : null_policy
+        Flag to include or exclude nulls from the count.
+    nan_handling : nan_policy
+        Flag to include or exclude NaNs from the count.
+
+    Returns
+    -------
+    size_type
+        The number of distinct elements in the input column.
+    """
+    return cpp_stream_compaction.distinct_count(
+        source.view(), null_handling, nan_handling
+    )
diff --git a/python/cudf/cudf/_lib/stream_compaction.pyx b/python/cudf/cudf/_lib/stream_compaction.pyx
index 04883eac559..834f91f48d9 100644
--- a/python/cudf/cudf/_lib/stream_compaction.pyx
+++ b/python/cudf/cudf/_lib/stream_compaction.pyx
@@ -109,6 +109,7 @@ def drop_duplicates(list columns,
             keep_option,
             pylibcudf.types.NullEquality.EQUAL
             if nulls_are_equal else pylibcudf.types.NullEquality.UNEQUAL,
+            pylibcudf.types.NanEquality.ALL_EQUAL,
         )
     )
 