Skip to content

Commit

Permalink
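Update names: rename factorize_with_rizer to factorize_arrays and _factorize_with_other to _factorize_with_other_for_merge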
Browse files: browse the repository at this point in the history
  • Loading branch information
phofl committed Sep 7, 2023
1 parent 33170bb commit 5d7151d
Show file tree
Hide file tree
Showing 7 changed files with 36 additions and 52 deletions.
8 changes: 4 additions & 4 deletions pandas/core/arrays/arrow/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@
unpack_tuple_and_ellipses,
validate_indices,
)
from pandas.core.reshape.merge_utils import factorize_with_rizer
from pandas.core.reshape.merge_utils import factorize_arrays
from pandas.core.strings.base import BaseStringArrayMethods

from pandas.io._util import _arrow_dtype_mapping
Expand Down Expand Up @@ -2555,8 +2555,8 @@ def _dt_tz_convert(self, tz):
result = self._pa_array.cast(pa.timestamp(current_unit, tz))
return type(self)(result)

def _factorize_with_other(
self, other: ArrowExtensionArray, sort: bool = False # type: ignore[override]
def _factorize_with_other_for_merge(
self, other: Self, sort: bool = False # type: ignore[override]
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
if not isinstance(self.dtype, StringDtype) and (
pa.types.is_floating(self.dtype.pyarrow_dtype)
Expand All @@ -2573,7 +2573,7 @@ def _factorize_with_other(
lk = np.asarray(lk, dtype=np.int64)
rk = np.asarray(rk, dtype=np.int64)

return factorize_with_rizer(lk, rk, sort, self.isna(), other.isna())
return factorize_arrays(lk, rk, sort, self.isna(), other.isna())

len_left = len(self)
left = self._pa_array
Expand Down
26 changes: 4 additions & 22 deletions pandas/core/arrays/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@

from pandas._libs import (
algos as libalgos,
hashtable as libhashtable,
lib,
)
from pandas.compat import set_function_name
Expand Down Expand Up @@ -70,7 +69,7 @@
unique,
)
from pandas.core.array_algos.quantile import quantile_with_mask
from pandas.core.reshape.merge_utils import factorize_with_rizer
from pandas.core.reshape.merge_utils import factorize_arrays
from pandas.core.sorting import (
nargminmax,
nargsort,
Expand Down Expand Up @@ -104,23 +103,6 @@
from pandas import Index

_extension_array_shared_docs: dict[str, str] = {}
_factorizers = {
np.int64: libhashtable.Int64Factorizer,
np.longlong: libhashtable.Int64Factorizer,
np.int32: libhashtable.Int32Factorizer,
np.int16: libhashtable.Int16Factorizer,
np.int8: libhashtable.Int8Factorizer,
np.uint64: libhashtable.UInt64Factorizer,
np.uint32: libhashtable.UInt32Factorizer,
np.uint16: libhashtable.UInt16Factorizer,
np.uint8: libhashtable.UInt8Factorizer,
np.bool_: libhashtable.UInt8Factorizer,
np.float64: libhashtable.Float64Factorizer,
np.float32: libhashtable.Float32Factorizer,
np.complex64: libhashtable.Complex64Factorizer,
np.complex128: libhashtable.Complex128Factorizer,
np.object_: libhashtable.ObjectFactorizer,
}


class ExtensionArray:
Expand Down Expand Up @@ -2285,12 +2267,12 @@ def _groupby_op(
else:
raise NotImplementedError

def _factorize_with_other(
self, other: ExtensionArray, sort: bool = False
def _factorize_with_other_for_merge(
self, other: Self, sort: bool = False
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
lk, _ = self._values_for_factorize()
rk, _ = other._values_for_factorize()
return factorize_with_rizer(lk, rk, sort)
return factorize_arrays(lk, rk, sort)


class ExtensionArraySupportsAnyAll(ExtensionArray):
Expand Down
8 changes: 4 additions & 4 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@
sanitize_array,
)
from pandas.core.ops.common import unpack_zerodim_and_defer
from pandas.core.reshape.merge_utils import factorize_with_rizer
from pandas.core.reshape.merge_utils import factorize_arrays
from pandas.core.sorting import nargsort
from pandas.core.strings.object_array import ObjectStringArrayMixin

Expand Down Expand Up @@ -2718,14 +2718,14 @@ def _groupby_op(
res_values[result_mask == 1] = -1
return self._from_backing_data(res_values)

def _factorize_with_other(
self, other: Categorical, sort: bool = False # type: ignore[override]
def _factorize_with_other_for_merge(
self, other: Self, sort: bool = False # type: ignore[override]
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
other = self._encode_with_my_categories(other)

lk = ensure_int64(self.codes)
rk = ensure_int64(other.codes)
return factorize_with_rizer(lk, rk, sort)
return factorize_arrays(lk, rk, sort)


# The Series.cat accessor
Expand Down
10 changes: 4 additions & 6 deletions pandas/core/arrays/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@
invalid_comparison,
make_invalid_op,
)
from pandas.core.reshape.merge_utils import factorize_with_rizer
from pandas.core.reshape.merge_utils import factorize_arrays

from pandas.tseries import frequencies

Expand Down Expand Up @@ -1704,14 +1704,12 @@ def _groupby_op(
res_values = res_values.view(self._ndarray.dtype)
return self._from_backing_data(res_values)

def _factorize_with_other(
self,
other: DatetimeLikeArrayMixin, # type: ignore[override]
sort: bool = False,
def _factorize_with_other_for_merge(
self, other: Self, sort: bool = False
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
lk = np.asarray(self._ndarray, dtype=np.int64)
rk = np.asarray(other._ndarray, dtype=np.int64)
return factorize_with_rizer(lk, rk, sort)
return factorize_arrays(lk, rk, sort)


class DatelikeOps(DatetimeLikeArrayMixin):
Expand Down
10 changes: 4 additions & 6 deletions pandas/core/arrays/masked.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@
)
from pandas.core.indexers import check_array_indexer
from pandas.core.ops import invalid_comparison
from pandas.core.reshape.merge_utils import factorize_with_rizer
from pandas.core.reshape.merge_utils import factorize_arrays

if TYPE_CHECKING:
from collections.abc import (
Expand Down Expand Up @@ -1506,12 +1506,10 @@ def _groupby_op(
# wrap in a MaskedArray
return self._maybe_mask_result(res_values, result_mask)

def _factorize_with_other(
self, other: BaseMaskedArray, sort: bool = False # type: ignore[override]
def _factorize_with_other_for_merge(
self, other: Self, sort: bool = False # type: ignore[override]
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
return factorize_with_rizer(
self._data, other._data, sort, self._mask, other._mask
)
return factorize_arrays(self._data, other._data, sort, self._mask, other._mask)


def transpose_homogeneous_masked_arrays(
Expand Down
8 changes: 4 additions & 4 deletions pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@
)
from pandas.core.frame import _merge_doc
from pandas.core.indexes.api import default_index
from pandas.core.reshape.merge_utils import factorize_with_rizer
from pandas.core.reshape.merge_utils import factorize_arrays
from pandas.core.sorting import is_int64_overflow_possible

if TYPE_CHECKING:
Expand Down Expand Up @@ -2379,15 +2379,15 @@ def _factorize_keys(

if isinstance(lk, ExtensionArray) and lk.dtype == rk.dtype:
assert isinstance(rk, ExtensionArray)
llab, rlab, count = lk._factorize_with_other(rk, sort)
llab, rlab, count = lk._factorize_with_other_for_merge(rk, sort)
else:
lk, rk = _convert_arrays(lk, rk)

if isinstance(lk, ExtensionArray):
assert isinstance(rk, ExtensionArray)
llab, rlab, count = lk._factorize_with_other(rk, sort)
llab, rlab, count = lk._factorize_with_other_for_merge(rk, sort)
else:
llab, rlab, count = factorize_with_rizer(lk, rk, sort)
llab, rlab, count = factorize_arrays(lk, rk, sort)

if how == "right":
return rlab, llab, count
Expand Down
18 changes: 12 additions & 6 deletions pandas/core/reshape/merge_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,14 +42,20 @@ def _sort_labels(
return new_left, new_right


def factorize_with_rizer(lk, rk, sort: bool = False, lk_mask=None, rk_mask=None):
rizer = _factorizers[lk.dtype.type](max(len(lk), len(rk)))
llab = rizer.factorize(lk, mask=lk_mask)
rlab = rizer.factorize(rk, mask=rk_mask)
count = rizer.get_count()
def factorize_arrays(
lk: np.ndarray,
rk: np.ndarray,
sort: bool = False,
lk_mask: np.ndarray | None = None,
rk_mask: np.ndarray | None = None,
):
factorizer = _factorizers[lk.dtype.type](max(len(lk), len(rk)))
llab = factorizer.factorize(lk, mask=lk_mask)
rlab = factorizer.factorize(rk, mask=rk_mask)
count = factorizer.get_count()

if sort:
uniques = rizer.uniques.to_array()
uniques = factorizer.uniques.to_array()
llab, rlab = _sort_labels(uniques, llab, rlab)

lmask = llab == -1
Expand Down

0 comments on commit 5d7151d

Please sign in to comment.