Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use List of Columns as Input for drop_nulls, gather and drop_duplicates #9558

Merged
merged 46 commits into from
Nov 19, 2021
Merged
Show file tree
Hide file tree
Changes from 22 commits
Commits
Show all changes
46 commits
Select commit Hold shift + click to select a range
3e9963c
List_Interface_dropnull
isVoid Oct 21, 2021
5b5716c
Change take interface to accept list
isVoid Oct 28, 2021
ea4a5e6
Actually passing in col in column API
isVoid Oct 28, 2021
5d56753
Fixing duplicate names issue in multiindex
isVoid Oct 28, 2021
4ca4169
Initial pass for gather refactoring
isVoid Oct 28, 2021
f6af30b
refactors drop_duplicates
isVoid Oct 28, 2021
7e1dacd
running pre-commit hooks and some cleanups
isVoid Oct 28, 2021
c483c77
Merge branch 'branch-21.12' of https://github.com/rapidsai/cudf into …
isVoid Oct 28, 2021
9711816
fix performance regression
isVoid Oct 29, 2021
b3f00ff
style fixes
isVoid Oct 29, 2021
c1f7670
moor style fixes
isVoid Oct 29, 2021
36827b7
reword data with columns
isVoid Nov 1, 2021
905f6f0
Unwrap
isVoid Nov 1, 2021
cdc4e1b
Update python/cudf/cudf/_lib/utils.pyx
isVoid Nov 1, 2021
54eadb7
Merge branch 'dropna_list_interface' of github.com:isVoid/cudf into d…
isVoid Nov 1, 2021
f3cfd84
call from siblings
isVoid Nov 1, 2021
8a80446
move argument on calling
isVoid Nov 1, 2021
e835d38
use indexing
isVoid Nov 1, 2021
7f22ac8
remove duplicate implementation of column name to positions conversio…
isVoid Nov 1, 2021
de37cfa
Move index naming logic into frame factories
isVoid Nov 1, 2021
2322cba
replace inext(iter) idiom with indexing
isVoid Nov 1, 2021
5aee88b
explain nullify parameter
isVoid Nov 1, 2021
383c6b3
add dtype check in column take
isVoid Nov 2, 2021
2682785
remove trailing comma in tuple
isVoid Nov 2, 2021
7fdc320
make docstrings two lines
isVoid Nov 2, 2021
1f30ef0
refine n_index_columns handling
isVoid Nov 2, 2021
d07f8e3
set single level index name for index, don't do so for multiindex
isVoid Nov 2, 2021
9049e18
consolidating index naming logic and remove incorrect comments
isVoid Nov 2, 2021
e5c69a1
fix bug in gather map bound check
isVoid Nov 2, 2021
33c1d67
Update python/cudf/cudf/core/column/column.py
isVoid Nov 2, 2021
be2aac8
Update python/cudf/cudf/core/indexed_frame.py
isVoid Nov 2, 2021
3e2c588
Merge branch 'branch-21.12' of https://github.com/rapidsai/cudf into …
isVoid Nov 2, 2021
f703922
Add drop_duplicates specification for indexed_frame
isVoid Nov 2, 2021
8be1cbc
rename columns to frame factory method, add docstring
isVoid Nov 2, 2021
32876e7
rename include_index argument
isVoid Nov 3, 2021
59606c4
docstring
isVoid Nov 3, 2021
b09ecd5
move drop_duplicate to BaseIndex
isVoid Nov 3, 2021
f16f975
Add dropna in BaseIndex, rewrite some logic in _drop_na_rows and move…
isVoid Nov 4, 2021
38cbbb6
Revert moving two drop* functions
isVoid Nov 9, 2021
07beaf8
Update docstrings
isVoid Nov 9, 2021
fd49a6f
Add multiple TODOs to address open issues on the PR
isVoid Nov 9, 2021
498e222
Apply suggestions from code review
isVoid Nov 10, 2021
7fbdd3a
style
isVoid Nov 12, 2021
03d785e
apply doc review change
isVoid Nov 12, 2021
86ffcef
Merge branch 'branch-22.02' of github.com:rapidsai/cudf into dropna_l…
isVoid Nov 15, 2021
c047f60
Consolidate gather_map checks
isVoid Nov 15, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 9 additions & 27 deletions python/cudf/cudf/_lib/copying.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,12 @@ from cudf._lib.cpp.scalar.scalar cimport scalar
from cudf._lib.cpp.table.table cimport table
from cudf._lib.cpp.table.table_view cimport table_view
from cudf._lib.cpp.types cimport size_type
from cudf._lib.utils cimport data_from_table_view, data_from_unique_ptr
from cudf._lib.utils cimport (
columns_from_unique_ptr,
data_from_table_view,
data_from_unique_ptr,
table_view_from_columns,
)

# workaround for https://github.com/cython/cython/issues/3885
ctypedef const scalar constscalar
Expand Down Expand Up @@ -144,26 +149,12 @@ def copy_range(Column input_column,


def gather(
source_table,
columns: list,
Column gather_map,
bool keep_index=True,
bool nullify=False
):
if not pd.api.types.is_integer_dtype(gather_map.dtype):
raise ValueError("Gather map is not integer dtype.")

if len(gather_map) > 0 and not nullify:
gm_min, gm_max = minmax(gather_map)
if gm_min < -len(source_table) or gm_max >= len(source_table):
raise IndexError(f"Gather map index with min {gm_min},"
f" max {gm_max} is out of bounds in"
f" {type(source_table)} with {len(source_table)}"
f" rows.")

cdef unique_ptr[table] c_result
cdef table_view source_table_view = table_view_from_table(
source_table, not keep_index
)
cdef table_view source_table_view = table_view_from_columns(columns)
cdef column_view gather_map_view = gather_map.view()
cdef cpp_copying.out_of_bounds_policy policy = (
cpp_copying.out_of_bounds_policy.NULLIFY if nullify
Expand All @@ -179,16 +170,7 @@ def gather(
)
)

return data_from_unique_ptr(
move(c_result),
column_names=source_table._column_names,
index_names=(
None if (
source_table._index is None)
or keep_index is False
else source_table._index_names
)
)
return columns_from_unique_ptr(move(c_result))


def scatter(object source, Column scatter_map, Column target_column,
Expand Down
81 changes: 25 additions & 56 deletions python/cudf/cudf/_lib/stream_compaction.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -24,40 +24,34 @@ from cudf._lib.cpp.types cimport (
null_policy,
size_type,
)
from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table
from cudf._lib.utils cimport (
columns_from_unique_ptr,
data_from_unique_ptr,
table_view_from_columns,
table_view_from_table,
)


def drop_nulls(source_table, how="any", keys=None, thresh=None):
def drop_nulls(columns: list, how="any", keys=None, thresh=None):
"""
Drops null rows from cols depending on key columns.

Parameters
----------
source_table : source table whose null rows are dropped to form new table
columns : list of columns
how : "any" or "all". If thresh is None, drops rows of cols that have any
nulls or all nulls (respectively) in subset (default: "any")
keys : List of Column names. If set, then these columns are checked for
nulls rather than all of cols (optional)
keys : List of column indices. If set, then these columns are checked for
nulls rather than all of columns (optional)
thresh : Minimum number of non-nulls required to keep a row (optional)

Returns
-------
Frame with null rows dropped
columns with null rows dropped
"""

num_index_columns = (
0 if source_table._index is None else
source_table._index._num_columns)
# shifting the index number by number of index columns
cdef vector[size_type] cpp_keys = (
[
num_index_columns + source_table._column_names.index(name)
for name in keys
]
if keys is not None
else range(
num_index_columns, num_index_columns + source_table._num_columns
)
keys if keys is not None else range(len(columns))
)

cdef size_type c_keep_threshold = cpp_keys.size()
Expand All @@ -67,7 +61,7 @@ def drop_nulls(source_table, how="any", keys=None, thresh=None):
c_keep_threshold = 1

cdef unique_ptr[table] c_result
cdef table_view source_table_view = table_view_from_table(source_table)
cdef table_view source_table_view = table_view_from_columns(columns)

with nogil:
c_result = move(
Expand All @@ -78,13 +72,7 @@ def drop_nulls(source_table, how="any", keys=None, thresh=None):
)
)

return data_from_unique_ptr(
move(c_result),
column_names=source_table._column_names,
index_names=(
None if source_table._index is None
else source_table._index_names)
)
return columns_from_unique_ptr(move(c_result))


def apply_boolean_mask(source_table, Column boolean_mask):
Expand Down Expand Up @@ -124,26 +112,29 @@ def apply_boolean_mask(source_table, Column boolean_mask):
)


def drop_duplicates(source_table,
def drop_duplicates(columns: list,
object keys=None,
object keep='first',
bool nulls_are_equal=True,
bool ignore_index=False):
bool nulls_are_equal=True):
"""
Drops rows in columns as per duplicate rows in keys.

Parameters
----------
source_table : source_table whose rows gets dropped
keys : List of Column names belong to source_table
columns : List of columns
keys : List of column indices. If set, then these columns are checked for
duplicates rather than all of columns (optional)
keep : keep 'first' or 'last' or none of the duplicate rows
nulls_are_equal : if True, nulls are treated equal else not.

Returns
-------
Frame with duplicate dropped
columns with duplicates dropped
"""

cdef vector[size_type] cpp_keys = (
keys if keys is not None else range(len(columns))
)
cdef duplicate_keep_option cpp_keep_option

if keep == 'first':
Expand All @@ -155,30 +146,14 @@ def drop_duplicates(source_table,
else:
raise ValueError('keep must be either "first", "last" or False')

num_index_columns =(
0 if (source_table._index is None or ignore_index)
else source_table._index._num_columns)
# shifting the index number by number of index columns
cdef vector[size_type] cpp_keys = (
[
num_index_columns + source_table._column_names.index(name)
for name in keys
]
if keys is not None
else range(
num_index_columns, num_index_columns + source_table._num_columns
)
)

cdef null_equality cpp_nulls_equal = (
null_equality.EQUAL
if nulls_are_equal
else null_equality.UNEQUAL
)
cdef unique_ptr[table] c_result
cdef table_view source_table_view = table_view_from_table(
source_table, ignore_index
)
cdef table_view source_table_view = table_view_from_columns(columns)

with nogil:
c_result = move(
Expand All @@ -190,13 +165,7 @@ def drop_duplicates(source_table,
)
)

return data_from_unique_ptr(
move(c_result),
column_names=source_table._column_names,
index_names=(
None if (source_table._index is None or ignore_index)
else source_table._index_names)
)
return columns_from_unique_ptr(move(c_result))


def distinct_count(Column source_column, ignore_nulls=True, nan_as_null=False):
Expand Down
1 change: 1 addition & 0 deletions python/cudf/cudf/_lib/utils.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,4 @@ cdef data_from_table_view(
table_view tv, object owner, object column_names, object index_names=*)
cdef table_view table_view_from_columns(columns) except *
cdef table_view table_view_from_table(tbl, ignore_index=*) except*
cdef columns_from_unique_ptr(unique_ptr[table] c_tbl)
34 changes: 27 additions & 7 deletions python/cudf/cudf/_lib/utils.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@ PARQUET_META_TYPE_MAP = {
for cudf_dtype, pandas_dtype in np_dtypes_to_pandas_dtypes.items()
}


cdef table_view table_view_from_columns(columns) except*:
"""Create a cudf::table_view from an iterable of Columns."""
cdef vector[column_view] column_views
Expand Down Expand Up @@ -221,6 +220,32 @@ def _index_level_name(index_name, level, column_names):
return f"__index_level_{level}__"


cdef columns_from_unique_ptr(
    unique_ptr[table] c_tbl
):
    """Convert a libcudf table into a list of columns.

    Parameters
    ----------
    c_tbl : unique_ptr[cudf::table]
        The libcudf table whose columns will be extracted

    Returns
    -------
    list[Column]
        A list of columns, one per entry of the vector released from
        ``c_tbl``, in the same order.
    """
    # release() hands back the table's columns as a vector of
    # unique_ptr[column]; each entry is moved into a Column below.
    cdef vector[unique_ptr[column]] c_columns = move(c_tbl.get().release())
    cdef vector[unique_ptr[column]].iterator it = c_columns.begin()

    cdef size_t i

    # Walk the vector by offset from begin(), moving each unique_ptr into
    # Column.from_unique_ptr.
    columns = [Column.from_unique_ptr(move(dereference(it+i)))
               for i in range(c_columns.size())]

    return columns


cdef data_from_unique_ptr(
unique_ptr[table] c_tbl, column_names, index_names=None
):
Expand Down Expand Up @@ -255,13 +280,8 @@ cdef data_from_unique_ptr(
tuple(Dict[str, Column], Optional[Index])
A dict of the columns in the output table.
"""
cdef vector[unique_ptr[column]] c_columns = move(c_tbl.get().release())
cdef vector[unique_ptr[column]].iterator it = c_columns.begin()

cdef size_t i

columns = [Column.from_unique_ptr(move(dereference(it+i)))
for i in range(c_columns.size())]
columns = columns_from_unique_ptr(move(c_tbl))

# First construct the index, if any
index = (
Expand Down
51 changes: 19 additions & 32 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,11 @@
create_null_mask,
)
from cudf._lib.scalar import as_device_scalar
from cudf._lib.stream_compaction import distinct_count as cpp_distinct_count
from cudf._lib.stream_compaction import (
distinct_count as cpp_distinct_count,
drop_duplicates,
drop_nulls,
)
from cudf._lib.transform import bools_to_mask
from cudf._typing import BinaryOperand, ColumnLike, Dtype, ScalarLike
from cudf.api.types import (
Expand Down Expand Up @@ -71,7 +75,7 @@
pandas_dtypes_alias_to_cudf_alias,
pandas_dtypes_to_np_dtypes,
)
from cudf.utils.utils import mask_dtype
from cudf.utils.utils import _gather_map_is_valid, mask_dtype

T = TypeVar("T", bound="ColumnBase")

Expand Down Expand Up @@ -208,11 +212,8 @@ def __sizeof__(self) -> int:
return n

def dropna(self, drop_nan: bool = False) -> ColumnBase:
if drop_nan:
col = self.nans_to_nulls()
else:
col = self
return col.as_frame()._drop_na_rows(drop_nan=drop_nan)._as_column()
col = self.nans_to_nulls() if drop_nan else self
return drop_nulls([col])[0]

def to_arrow(self) -> pa.Array:
"""Convert to PyArrow Array
Expand Down Expand Up @@ -688,29 +689,19 @@ def quantile(
def median(self, skipna: bool = None) -> ScalarLike:
raise TypeError(f"cannot perform median with type {self.dtype}")

def take(
self: T,
indices: ColumnBase,
keep_index: bool = True,
nullify: bool = False,
) -> T:
"""Return Column by taking values from the corresponding *indices*."""
def take(self: T, indices: ColumnBase, nullify: bool = False,) -> T:
isVoid marked this conversation as resolved.
Show resolved Hide resolved
"""Return Column by taking values from the corresponding *indices*. Set
rows to null for all out-of-bounds indices if nullify is `True`.
"""
isVoid marked this conversation as resolved.
Show resolved Hide resolved
# Handle zero size
if indices.size == 0:
return cast(T, column_empty_like(self, newsize=0))
try:
return (
self.as_frame()
._gather(indices, keep_index=keep_index, nullify=nullify)
._as_column()
._with_type_metadata(self.dtype)
)
except RuntimeError as e:
if "out of bounds" in str(e):
raise IndexError(
f"index out of bounds for column of size {len(self)}"
) from e
raise
if not nullify and not _gather_map_is_valid(indices, len(self)):
vyasr marked this conversation as resolved.
Show resolved Hide resolved
vyasr marked this conversation as resolved.
Show resolved Hide resolved
raise IndexError("Gather map index is out of bounds.")

return libcudf.copying.gather([self], indices, nullify=nullify,)[
isVoid marked this conversation as resolved.
Show resolved Hide resolved
0
]._with_type_metadata(self.dtype)

def isin(self, values: Sequence) -> ColumnBase:
"""Check whether values are contained in the Column.
Expand Down Expand Up @@ -1099,11 +1090,7 @@ def unique(self) -> ColumnBase:
# the following issue resolved:
# https://github.com/rapidsai/cudf/issues/5286

return (
self.as_frame()
.drop_duplicates(keep="first", ignore_index=True)
._as_column()
)
return drop_duplicates([self], keep="first")[0]

def serialize(self) -> Tuple[dict, list]:
header: Dict[Any, Any] = {}
Expand Down
Loading