rapidsai · rapids-bot · Nov 17, 2021 · Oct 17, 2021 · Oct 19, 2021 · Oct 19, 2021
@@ -598,9 +598,12 @@ def __init__(self, data=None, index=None, columns=None, dtype=None):
         else:
             if is_list_like(data):
                 if len(data) > 0 and is_scalar(data[0]):
-                    new_df = self._from_columns(
-                        [data], index=index, columns=columns
-                    )
+                    if columns is not None:
+                        data = dict(zip(columns, [data]))
+                    else:
+                        data = dict(enumerate([data]))
+                    new_df = DataFrame(data=data, index=index)
+
                     self._data = new_df._data
                     self.index = new_df._index
                     self.columns = new_df.columns
@@ -3760,19 +3763,16 @@ def join(
                 FutureWarning,
             )
 
-        lhs = self
-        rhs = other
-
-        df = lhs.merge(
-            rhs,
+        df = self.merge(
+            other,
             left_index=True,
             right_index=True,
             how=how,
             suffixes=(lsuffix, rsuffix),
             sort=sort,
         )
         df.index.name = (
-            None if lhs.index.name != rhs.index.name else lhs.index.name
+            None if self.index.name != other.index.name else self.index.name
         )
         return df
 
@@ -5093,18 +5093,6 @@ def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False):
             df._index = as_index(index)
         return df
 
-    @classmethod
-    def _from_columns(cls, cols, index=None, columns=None):
-        """
-        Construct a DataFrame from a list of Columns
-        """
-        if columns is not None:
-            data = dict(zip(columns, cols))
-        else:
-            data = dict(enumerate(cols))
-
-        return cls(data=data, index=index,)
-
     def interpolate(
         self,
         method="linear",

@@ -46,7 +46,7 @@
     serialize_columns,
 )
 from cudf.core.column_accessor import ColumnAccessor
-from cudf.core.join import merge
+from cudf.core.join import Merge, MergeSemi
 from cudf.core.udf.pipeline import compile_or_get, supported_cols_from_frame
 from cudf.core.window import Rolling
 from cudf.utils import ioutils
@@ -3755,15 +3755,18 @@ def _merge(
         suffixes=("_x", "_y"),
     ):
         lhs, rhs = self, right
+        merge_cls = Merge
         if how == "right":
             # Merge doesn't support right, so just swap
             how = "left"
             lhs, rhs = right, self
             left_on, right_on = right_on, left_on
             left_index, right_index = right_index, left_index
             suffixes = (suffixes[1], suffixes[0])
+        elif how in {"leftsemi", "leftanti"}:
+            merge_cls = MergeSemi
 
-        return merge(
+        return merge_cls(
             lhs,
             rhs,
             on=on,
@@ -3775,7 +3778,7 @@ def _merge(
             sort=sort,
             indicator=indicator,
             suffixes=suffixes,
-        )
+        ).perform_merge()
 
     def _is_sorted(self, ascending=None, null_position=None):
         """

@@ -1178,18 +1178,6 @@ class DataFrameGroupBy(GroupBy, GetAttrGetItemMixin):
 
     _PROTECTED_KEYS = frozenset(("obj",))
 
-    def __init__(
-        self, obj, by=None, level=None, sort=False, as_index=True, dropna=True
-    ):
-        super().__init__(
-            obj=obj,
-            by=by,
-            level=level,
-            sort=sort,
-            as_index=as_index,
-            dropna=dropna,
-        )
-
     def __getitem__(self, key):
         return self.obj[key].groupby(
             self.grouping, dropna=self._dropna, sort=self._sort
@@ -1262,18 +1250,6 @@ class SeriesGroupBy(GroupBy):
     Name: Max Speed, dtype: float64
     """
 
-    def __init__(
-        self, obj, by=None, level=None, sort=False, as_index=True, dropna=True
-    ):
-        super().__init__(
-            obj=obj,
-            by=by,
-            level=level,
-            sort=sort,
-            as_index=as_index,
-            dropna=dropna,
-        )
-
     def agg(self, func):
         result = super().agg(func)
 

@@ -1,3 +1,3 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2021, NVIDIA CORPORATION.
 
-from cudf.core.join.join import merge
+from cudf.core.join.join import Merge, MergeSemi
@@ -3,16 +3,17 @@
 
 import collections
 import warnings
-from typing import TYPE_CHECKING, Any, Iterable, Tuple
+from typing import TYPE_CHECKING, Any, Tuple, cast
 
 import numpy as np
-import pandas as pd
 
 import cudf
+from cudf.api.types import is_dtype_equal
+from cudf.core.column import CategoricalColumn
 from cudf.core.dtypes import CategoricalDtype
 
 if TYPE_CHECKING:
-    from cudf.core.column import CategoricalColumn, ColumnBase
+    from cudf.core.column import ColumnBase
     from cudf.core.frame import Frame
 
 
@@ -28,61 +29,36 @@ class _Indexer:
     # >>> _Indexer("a", column=True).get(df)  # returns column "a" of df
     # >>> _Indexer("b", index=True).get(df)  # returns index level "b" of df
 
-    def __init__(self, name: Any, column=False, index=False):
-        if column and index:
-            raise ValueError("Cannot specify both column and index")
+    def __init__(self, name: Any):
         self.name = name
-        self.column, self.index = column, index
 
+
+class _ColumnIndexer(_Indexer):
     def get(self, obj: Frame) -> ColumnBase:
-        # get the column from `obj`
-        if self.column:
-            return obj._data[self.name]
-        else:
-            if obj._index is not None:
-                return obj._index._data[self.name]
-        raise KeyError()
+        return obj._data[self.name]
 
     def set(self, obj: Frame, value: ColumnBase, validate=False):
-        # set the colum in `obj`
-        if self.column:
-            obj._data.set_by_label(self.name, value, validate=validate)
-        else:
-            if obj._index is not None:
-                obj._index._data.set_by_label(
-                    self.name, value, validate=validate
-                )
-            else:
-                raise KeyError()
-
-
-def _frame_select_by_indexers(
-    frame: Frame, indexers: Iterable[_Indexer]
-) -> Frame:
-    # Select columns from the given `Frame` using `indexers`,
-    # and return a new `Frame`.
-    index_data = frame._data.__class__()
-    data = frame._data.__class__()
-
-    for idx in indexers:
-        if idx.index:
-            index_data.set_by_label(idx.name, idx.get(frame), validate=False)
-        else:
-            data.set_by_label(idx.name, idx.get(frame), validate=False)
+        obj._data.set_by_label(self.name, value, validate=validate)
 
-    result_index = (
-        cudf.core.index._index_from_data(index_data) if index_data else None
-    )
-    result = cudf.core.frame.Frame(data=data, index=result_index)
-    return result
+
+class _IndexIndexer(_Indexer):
+    def get(self, obj: Frame) -> ColumnBase:
+        if obj._index is not None:
+            return obj._index._data[self.name]
+        raise KeyError
+
+    def set(self, obj: Frame, value: ColumnBase, validate=False):
+        if obj._index is not None:
+            obj._index._data.set_by_label(self.name, value, validate=validate)
+        else:
+            raise KeyError
 
 
 def _match_join_keys(
     lcol: ColumnBase, rcol: ColumnBase, how: str
 ) -> Tuple[ColumnBase, ColumnBase]:
-    # returns the common dtype that lcol and rcol should be casted to,
-    # before they can be used as left and right join keys.
-    # If no casting is necessary, returns None
+    # Casts lcol and rcol to a common dtype for use as join keys. If no casting
+    # is necessary, they are returned as is.
 
     common_type = None
 
@@ -91,12 +67,22 @@ def _match_join_keys(
     rtype = rcol.dtype
 
     # if either side is categorical, different logic
-    if isinstance(ltype, CategoricalDtype) or isinstance(
-        rtype, CategoricalDtype
-    ):
-        return _match_categorical_dtypes(lcol, rcol, how)
+    left_is_categorical = isinstance(ltype, CategoricalDtype)
+    right_is_categorical = isinstance(rtype, CategoricalDtype)
+    if left_is_categorical and right_is_categorical:
+        return _match_categorical_dtypes_both(
+            cast(CategoricalColumn, lcol), cast(CategoricalColumn, rcol), how
+        )
+    elif left_is_categorical or right_is_categorical:
+        if left_is_categorical:
+            if how in {"left", "leftsemi", "leftanti"}:
+                return lcol, rcol.astype(ltype)
+            common_type = ltype.categories.dtype
+        else:
+            common_type = rtype.categories.dtype
+        return lcol.astype(common_type), rcol.astype(common_type)
 
-    if pd.api.types.is_dtype_equal(ltype, rtype):
+    if is_dtype_equal(ltype, rtype):
         return lcol, rcol
 
     if isinstance(ltype, cudf.Decimal64Dtype) or isinstance(
@@ -131,34 +117,9 @@ def _match_join_keys(
     return lcol.astype(common_type), rcol.astype(common_type)
 
 
-def _match_categorical_dtypes(
-    lcol: ColumnBase, rcol: ColumnBase, how: str
-) -> Tuple[ColumnBase, ColumnBase]:
-    # cast the keys lcol and rcol to a common dtype
-    # when at least one of them is a categorical type
-    ltype, rtype = lcol.dtype, rcol.dtype
-
-    if isinstance(lcol, cudf.core.column.CategoricalColumn) and isinstance(
-        rcol, cudf.core.column.CategoricalColumn
-    ):
-        # if both are categoricals, logic is complicated:
-        return _match_categorical_dtypes_both(lcol, rcol, how)
-
-    if isinstance(ltype, CategoricalDtype):
-        if how in {"left", "leftsemi", "leftanti"}:
-            return lcol, rcol.astype(ltype)
-        common_type = ltype.categories.dtype
-    elif isinstance(rtype, CategoricalDtype):
-        common_type = rtype.categories.dtype
-    return lcol.astype(common_type), rcol.astype(common_type)
-
-
 def _match_categorical_dtypes_both(
     lcol: CategoricalColumn, rcol: CategoricalColumn, how: str
 ) -> Tuple[ColumnBase, ColumnBase]:
-    # The commontype depends on both `how` and the specifics of the
-    # categorical variables to be merged.
-
     ltype, rtype = lcol.dtype, rcol.dtype
 
     # when both are ordered and both have the same categories,
@@ -184,9 +145,6 @@ def _match_categorical_dtypes_both(
             "neither side is ordered"
         )
 
-    # the following should now always hold
-    assert not ltype.ordered and not rtype.ordered
-
     if how == "inner":
         # cast to category types -- we must cast them back later
         return _match_join_keys(