Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Simplify merge internals and reduce overhead #9516

Merged
merged 40 commits into from
Nov 17, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
26f02ae
Remove unnecessary trivial __init__ overrides.
vyasr Oct 17, 2021
3e8bea5
Some minor renames.
vyasr Oct 19, 2021
39c400a
Inline merge call.
vyasr Oct 19, 2021
477ab41
More simplifications.
vyasr Oct 19, 2021
1027a1e
Store left and right keys separately to get rid of the _JoinKeys name…
vyasr Oct 19, 2021
e2d5b08
Inline some Merge methods.
vyasr Oct 19, 2021
59d9510
Inline single categorical matching logic.
vyasr Oct 19, 2021
d1363aa
Stop creating column accessors and frames unnecessarily when subsetting.
vyasr Oct 19, 2021
54aef83
Minor comments.
vyasr Oct 19, 2021
2914670
Allow left_on and right_on to support both index levels and columns.
vyasr Oct 20, 2021
dbce92a
Cache key_columns_with_same_name on construction.
vyasr Oct 21, 2021
2bc258e
Remove self.on branch for sorting.
vyasr Oct 21, 2021
ea425e5
Remove self.on logic from join key computation.
vyasr Oct 21, 2021
5128a89
Disallow specifying left_on with left_index or right_on with right_in…
vyasr Oct 21, 2021
b601a82
Remove self.on.
vyasr Oct 21, 2021
46ec0cd
Remove DataFrame._from_columns.
vyasr Oct 21, 2021
229175b
Add error check for merge keys that are present in both a frame and i…
vyasr Oct 21, 2021
8da1115
Use the same logic for index and on cols.
vyasr Oct 21, 2021
aef3d91
Simplify key computation and inline in constructor.
vyasr Oct 21, 2021
35e66bc
Define joiner as class rather than instance variable.
vyasr Oct 21, 2021
0ca291e
Remove left_on and right_on to rely on left_keys and right_keys entir…
vyasr Oct 21, 2021
7f12230
Inline frame_select_by_indexers.
vyasr Oct 22, 2021
9992be6
Centralize more of the logic for generating output data and remove so…
vyasr Oct 22, 2021
ac621f2
Remove one more use of _Indexer.get.
vyasr Oct 22, 2021
e353096
Remove lhs/rhs members.
vyasr Oct 25, 2021
7452d70
Remove unnecessary host copy in tests.
vyasr Oct 25, 2021
8cca32c
Subclass _Indexer for columns and indexes rather than lumping all fun…
vyasr Oct 25, 2021
47311af
Rename output_lhs to lhs and output_rhs to rhs.
vyasr Oct 25, 2021
086d90d
Various internal simplifications.
vyasr Oct 25, 2021
c4bd790
Clean up output column renaming logic.
vyasr Oct 25, 2021
3d7ff7f
Minor comment cleanup.
vyasr Oct 25, 2021
8e1c908
Add tests for newly added checks.
vyasr Oct 25, 2021
cf9d242
Remove TODO.
vyasr Oct 25, 2021
3879261
Maintain index if 'on' is provided and names index columns.
vyasr Oct 25, 2021
ac9ce94
Change dask_cudf test since we now support this behavior (consistent …
vyasr Oct 25, 2021
0f0c2d9
Remove one more unnecessary assertion.
vyasr Oct 26, 2021
cd71262
Fix custreamz test.
vyasr Oct 29, 2021
2c104de
Merge branch 'branch-22.02' into refactor/merging_part1
vyasr Nov 12, 2021
040757e
Merge remote-tracking branch 'origin/branch-22.02' into refactor/merg…
vyasr Nov 16, 2021
0ff93b1
Merge remote-tracking branch 'origin/branch-22.02' into refactor/merg…
vyasr Nov 16, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 9 additions & 21 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -598,9 +598,12 @@ def __init__(self, data=None, index=None, columns=None, dtype=None):
else:
if is_list_like(data):
if len(data) > 0 and is_scalar(data[0]):
new_df = self._from_columns(
[data], index=index, columns=columns
)
if columns is not None:
data = dict(zip(columns, [data]))
else:
data = dict(enumerate([data]))
new_df = DataFrame(data=data, index=index)

self._data = new_df._data
self.index = new_df._index
self.columns = new_df.columns
Expand Down Expand Up @@ -3760,19 +3763,16 @@ def join(
FutureWarning,
)

lhs = self
rhs = other

df = lhs.merge(
rhs,
df = self.merge(
other,
left_index=True,
right_index=True,
how=how,
suffixes=(lsuffix, rsuffix),
sort=sort,
)
df.index.name = (
None if lhs.index.name != rhs.index.name else lhs.index.name
None if self.index.name != other.index.name else self.index.name
)
return df

Expand Down Expand Up @@ -5093,18 +5093,6 @@ def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False):
df._index = as_index(index)
return df

@classmethod
def _from_columns(cls, cols, index=None, columns=None):
"""
Construct a DataFrame from a list of Columns
"""
if columns is not None:
data = dict(zip(columns, cols))
else:
data = dict(enumerate(cols))

return cls(data=data, index=index,)

def interpolate(
self,
method="linear",
Expand Down
9 changes: 6 additions & 3 deletions python/cudf/cudf/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@
serialize_columns,
)
from cudf.core.column_accessor import ColumnAccessor
from cudf.core.join import merge
from cudf.core.join import Merge, MergeSemi
from cudf.core.udf.pipeline import compile_or_get, supported_cols_from_frame
from cudf.core.window import Rolling
from cudf.utils import ioutils
Expand Down Expand Up @@ -3755,15 +3755,18 @@ def _merge(
suffixes=("_x", "_y"),
):
lhs, rhs = self, right
merge_cls = Merge
if how == "right":
# Merge doesn't support right, so just swap
how = "left"
lhs, rhs = right, self
left_on, right_on = right_on, left_on
left_index, right_index = right_index, left_index
suffixes = (suffixes[1], suffixes[0])
elif how in {"leftsemi", "leftanti"}:
merge_cls = MergeSemi

return merge(
return merge_cls(
lhs,
rhs,
on=on,
Expand All @@ -3775,7 +3778,7 @@ def _merge(
sort=sort,
indicator=indicator,
suffixes=suffixes,
)
).perform_merge()

def _is_sorted(self, ascending=None, null_position=None):
"""
Expand Down
24 changes: 0 additions & 24 deletions python/cudf/cudf/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1178,18 +1178,6 @@ class DataFrameGroupBy(GroupBy, GetAttrGetItemMixin):

_PROTECTED_KEYS = frozenset(("obj",))

def __init__(
self, obj, by=None, level=None, sort=False, as_index=True, dropna=True
):
super().__init__(
obj=obj,
by=by,
level=level,
sort=sort,
as_index=as_index,
dropna=dropna,
)

def __getitem__(self, key):
return self.obj[key].groupby(
self.grouping, dropna=self._dropna, sort=self._sort
Expand Down Expand Up @@ -1262,18 +1250,6 @@ class SeriesGroupBy(GroupBy):
Name: Max Speed, dtype: float64
"""

def __init__(
self, obj, by=None, level=None, sort=False, as_index=True, dropna=True
):
super().__init__(
obj=obj,
by=by,
level=level,
sort=sort,
as_index=as_index,
dropna=dropna,
)

def agg(self, func):
result = super().agg(func)

Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/join/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
# Copyright (c) 2020, NVIDIA CORPORATION.
# Copyright (c) 2020-2021, NVIDIA CORPORATION.

from cudf.core.join.join import merge
from cudf.core.join.join import Merge, MergeSemi
118 changes: 38 additions & 80 deletions python/cudf/cudf/core/join/_join_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,17 @@

import collections
import warnings
from typing import TYPE_CHECKING, Any, Iterable, Tuple
from typing import TYPE_CHECKING, Any, Tuple, cast

import numpy as np
import pandas as pd

import cudf
from cudf.api.types import is_dtype_equal
from cudf.core.column import CategoricalColumn
from cudf.core.dtypes import CategoricalDtype

if TYPE_CHECKING:
from cudf.core.column import CategoricalColumn, ColumnBase
from cudf.core.column import ColumnBase
from cudf.core.frame import Frame


Expand All @@ -28,61 +29,36 @@ class _Indexer:
# >>> _Indexer("a", column=True).get(df) # returns column "a" of df
# >>> _Indexer("b", index=True).get(df) # returns index level "b" of df

def __init__(self, name: Any, column=False, index=False):
if column and index:
raise ValueError("Cannot specify both column and index")
def __init__(self, name: Any):
self.name = name
self.column, self.index = column, index


class _ColumnIndexer(_Indexer):
def get(self, obj: Frame) -> ColumnBase:
# get the column from `obj`
if self.column:
return obj._data[self.name]
else:
if obj._index is not None:
return obj._index._data[self.name]
raise KeyError()
return obj._data[self.name]

def set(self, obj: Frame, value: ColumnBase, validate=False):
# set the colum in `obj`
if self.column:
obj._data.set_by_label(self.name, value, validate=validate)
else:
if obj._index is not None:
obj._index._data.set_by_label(
self.name, value, validate=validate
)
else:
raise KeyError()


def _frame_select_by_indexers(
frame: Frame, indexers: Iterable[_Indexer]
) -> Frame:
# Select columns from the given `Frame` using `indexers`,
# and return a new `Frame`.
index_data = frame._data.__class__()
data = frame._data.__class__()

for idx in indexers:
if idx.index:
index_data.set_by_label(idx.name, idx.get(frame), validate=False)
else:
data.set_by_label(idx.name, idx.get(frame), validate=False)
obj._data.set_by_label(self.name, value, validate=validate)

result_index = (
cudf.core.index._index_from_data(index_data) if index_data else None
)
result = cudf.core.frame.Frame(data=data, index=result_index)
return result

class _IndexIndexer(_Indexer):
def get(self, obj: Frame) -> ColumnBase:
if obj._index is not None:
return obj._index._data[self.name]
raise KeyError

def set(self, obj: Frame, value: ColumnBase, validate=False):
if obj._index is not None:
obj._index._data.set_by_label(self.name, value, validate=validate)
else:
raise KeyError


def _match_join_keys(
lcol: ColumnBase, rcol: ColumnBase, how: str
) -> Tuple[ColumnBase, ColumnBase]:
# returns the common dtype that lcol and rcol should be casted to,
# before they can be used as left and right join keys.
# If no casting is necessary, returns None
# Casts lcol and rcol to a common dtype for use as join keys. If no casting
# is necessary, they are returned as is.

common_type = None

Expand All @@ -91,12 +67,22 @@ def _match_join_keys(
rtype = rcol.dtype

# if either side is categorical, different logic
if isinstance(ltype, CategoricalDtype) or isinstance(
rtype, CategoricalDtype
):
return _match_categorical_dtypes(lcol, rcol, how)
left_is_categorical = isinstance(ltype, CategoricalDtype)
right_is_categorical = isinstance(rtype, CategoricalDtype)
if left_is_categorical and right_is_categorical:
return _match_categorical_dtypes_both(
cast(CategoricalColumn, lcol), cast(CategoricalColumn, rcol), how
)
elif left_is_categorical or right_is_categorical:
if left_is_categorical:
if how in {"left", "leftsemi", "leftanti"}:
return lcol, rcol.astype(ltype)
common_type = ltype.categories.dtype
else:
common_type = rtype.categories.dtype
return lcol.astype(common_type), rcol.astype(common_type)

if pd.api.types.is_dtype_equal(ltype, rtype):
if is_dtype_equal(ltype, rtype):
return lcol, rcol

if isinstance(ltype, cudf.Decimal64Dtype) or isinstance(
Expand Down Expand Up @@ -131,34 +117,9 @@ def _match_join_keys(
return lcol.astype(common_type), rcol.astype(common_type)


def _match_categorical_dtypes(
lcol: ColumnBase, rcol: ColumnBase, how: str
) -> Tuple[ColumnBase, ColumnBase]:
# cast the keys lcol and rcol to a common dtype
# when at least one of them is a categorical type
ltype, rtype = lcol.dtype, rcol.dtype

if isinstance(lcol, cudf.core.column.CategoricalColumn) and isinstance(
rcol, cudf.core.column.CategoricalColumn
):
# if both are categoricals, logic is complicated:
return _match_categorical_dtypes_both(lcol, rcol, how)

if isinstance(ltype, CategoricalDtype):
if how in {"left", "leftsemi", "leftanti"}:
return lcol, rcol.astype(ltype)
common_type = ltype.categories.dtype
elif isinstance(rtype, CategoricalDtype):
common_type = rtype.categories.dtype
return lcol.astype(common_type), rcol.astype(common_type)


def _match_categorical_dtypes_both(
lcol: CategoricalColumn, rcol: CategoricalColumn, how: str
) -> Tuple[ColumnBase, ColumnBase]:
# The commontype depends on both `how` and the specifics of the
# categorical variables to be merged.

ltype, rtype = lcol.dtype, rcol.dtype

# when both are ordered and both have the same categories,
Expand All @@ -184,9 +145,6 @@ def _match_categorical_dtypes_both(
"neither side is ordered"
)

# the following should now always hold
assert not ltype.ordered and not rtype.ordered

if how == "inner":
# cast to category types -- we must cast them back later
return _match_join_keys(
Expand Down
Loading