diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 63a1f11021e69..908c3e328f731 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -59,7 +59,7 @@ unpack_tuple_and_ellipses, validate_indices, ) -from pandas.core.reshape.merge_utils import factorize_with_rizer +from pandas.core.reshape.merge_utils import factorize_arrays from pandas.core.strings.base import BaseStringArrayMethods from pandas.io._util import _arrow_dtype_mapping @@ -2555,8 +2555,8 @@ def _dt_tz_convert(self, tz): result = self._pa_array.cast(pa.timestamp(current_unit, tz)) return type(self)(result) - def _factorize_with_other( - self, other: ArrowExtensionArray, sort: bool = False # type: ignore[override] + def _factorize_with_other_for_merge( + self, other: Self, sort: bool = False # type: ignore[override] ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: if not isinstance(self.dtype, StringDtype) and ( pa.types.is_floating(self.dtype.pyarrow_dtype) @@ -2573,7 +2573,7 @@ def _factorize_with_other( lk = np.asarray(lk, dtype=np.int64) rk = np.asarray(rk, dtype=np.int64) - return factorize_with_rizer(lk, rk, sort, self.isna(), other.isna()) + return factorize_arrays(lk, rk, sort, self.isna(), other.isna()) len_left = len(self) left = self._pa_array diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 3abf7740ff51a..7a303fccd7a06 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -24,7 +24,6 @@ from pandas._libs import ( algos as libalgos, - hashtable as libhashtable, lib, ) from pandas.compat import set_function_name @@ -70,7 +69,7 @@ unique, ) from pandas.core.array_algos.quantile import quantile_with_mask -from pandas.core.reshape.merge_utils import factorize_with_rizer +from pandas.core.reshape.merge_utils import factorize_arrays from pandas.core.sorting import ( nargminmax, nargsort, @@ -104,23 +103,6 @@ from pandas import Index _extension_array_shared_docs: dict[str, str] = {} -_factorizers = { - np.int64: libhashtable.Int64Factorizer, - np.longlong: libhashtable.Int64Factorizer, - np.int32: libhashtable.Int32Factorizer, - np.int16: libhashtable.Int16Factorizer, - np.int8: libhashtable.Int8Factorizer, - np.uint64: libhashtable.UInt64Factorizer, - np.uint32: libhashtable.UInt32Factorizer, - np.uint16: libhashtable.UInt16Factorizer, - np.uint8: libhashtable.UInt8Factorizer, - np.bool_: libhashtable.UInt8Factorizer, - np.float64: libhashtable.Float64Factorizer, - np.float32: libhashtable.Float32Factorizer, - np.complex64: libhashtable.Complex64Factorizer, - np.complex128: libhashtable.Complex128Factorizer, - np.object_: libhashtable.ObjectFactorizer, -} class ExtensionArray: @@ -2285,12 +2267,12 @@ def _groupby_op( else: raise NotImplementedError - def _factorize_with_other( - self, other: ExtensionArray, sort: bool = False + def _factorize_with_other_for_merge( + self, other: Self, sort: bool = False ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: lk, _ = self._values_for_factorize() rk, _ = other._values_for_factorize() - return factorize_with_rizer(lk, rk, sort) + return factorize_arrays(lk, rk, sort) class ExtensionArraySupportsAnyAll(ExtensionArray): diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 28fc482cf3ae4..ee19fcf1c666d 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -84,7 +84,7 @@ sanitize_array, ) from pandas.core.ops.common import unpack_zerodim_and_defer -from pandas.core.reshape.merge_utils import factorize_with_rizer +from pandas.core.reshape.merge_utils import factorize_arrays from pandas.core.sorting import nargsort from pandas.core.strings.object_array import ObjectStringArrayMixin @@ -2718,14 +2718,14 @@ def _groupby_op( res_values[result_mask == 1] = -1 return self._from_backing_data(res_values) - def _factorize_with_other( - self, other: Categorical, sort: bool = False # type: ignore[override] + def _factorize_with_other_for_merge( + self, other: Self, sort: bool = False # type: ignore[override] ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: other = self._encode_with_my_categories(other) lk = ensure_int64(self.codes) rk = ensure_int64(other.codes) - return factorize_with_rizer(lk, rk, sort) + return factorize_arrays(lk, rk, sort) # The Series.cat accessor diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index bb2340433db2c..16fd3a1d8404e 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -139,7 +139,7 @@ invalid_comparison, make_invalid_op, ) -from pandas.core.reshape.merge_utils import factorize_with_rizer +from pandas.core.reshape.merge_utils import factorize_arrays from pandas.tseries import frequencies @@ -1704,14 +1704,12 @@ def _groupby_op( res_values = res_values.view(self._ndarray.dtype) return self._from_backing_data(res_values) - def _factorize_with_other( - self, - other: DatetimeLikeArrayMixin, # type: ignore[override] - sort: bool = False, + def _factorize_with_other_for_merge( + self, other: Self, sort: bool = False ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: lk = np.asarray(self._ndarray, dtype=np.int64) rk = np.asarray(other._ndarray, dtype=np.int64) - return factorize_with_rizer(lk, rk, sort) + return factorize_arrays(lk, rk, sort) class DatelikeOps(DatetimeLikeArrayMixin): diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index ff98493be73bf..5cdddb9278e26 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -85,7 +85,7 @@ ) from pandas.core.indexers import check_array_indexer from pandas.core.ops import invalid_comparison -from pandas.core.reshape.merge_utils import factorize_with_rizer +from pandas.core.reshape.merge_utils import factorize_arrays if TYPE_CHECKING: from collections.abc import ( @@ -1506,12 +1506,10 @@ def _groupby_op( # wrap in a MaskedArray return self._maybe_mask_result(res_values, result_mask) - def _factorize_with_other( - self, other: BaseMaskedArray, sort: bool = False # type: ignore[override] + def _factorize_with_other_for_merge( + self, other: Self, sort: bool = False # type: ignore[override] ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: - return factorize_with_rizer( - self._data, other._data, sort, self._mask, other._mask - ) + return factorize_arrays(self._data, other._data, sort, self._mask, other._mask) def transpose_homogeneous_masked_arrays( diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 839e8b05d404e..95431b497ecf0 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -94,7 +94,7 @@ ) from pandas.core.frame import _merge_doc from pandas.core.indexes.api import default_index -from pandas.core.reshape.merge_utils import factorize_with_rizer +from pandas.core.reshape.merge_utils import factorize_arrays from pandas.core.sorting import is_int64_overflow_possible if TYPE_CHECKING: @@ -2379,15 +2379,15 @@ def _factorize_keys( if isinstance(lk, ExtensionArray) and lk.dtype == rk.dtype: assert isinstance(rk, ExtensionArray) - llab, rlab, count = lk._factorize_with_other(rk, sort) + llab, rlab, count = lk._factorize_with_other_for_merge(rk, sort) else: lk, rk = _convert_arrays(lk, rk) if isinstance(lk, ExtensionArray): assert isinstance(rk, ExtensionArray) - llab, rlab, count = lk._factorize_with_other(rk, sort) + llab, rlab, count = lk._factorize_with_other_for_merge(rk, sort) else: - llab, rlab, count = factorize_with_rizer(lk, rk, sort) + llab, rlab, count = factorize_arrays(lk, rk, sort) if how == "right": return rlab, llab, count diff --git a/pandas/core/reshape/merge_utils.py b/pandas/core/reshape/merge_utils.py index e4be47454f0cf..1ec9c81250815 100644 --- a/pandas/core/reshape/merge_utils.py +++ b/pandas/core/reshape/merge_utils.py @@ -42,14 +42,20 @@ def _sort_labels( return new_left, new_right -def factorize_with_rizer(lk, rk, sort: bool = False, lk_mask=None, rk_mask=None): - rizer = _factorizers[lk.dtype.type](max(len(lk), len(rk))) - llab = rizer.factorize(lk, mask=lk_mask) - rlab = rizer.factorize(rk, mask=rk_mask) - count = rizer.get_count() +def factorize_arrays( + lk: np.ndarray, + rk: np.ndarray, + sort: bool = False, + lk_mask: np.ndarray | None = None, + rk_mask: np.ndarray | None = None, +): + factorizer = _factorizers[lk.dtype.type](max(len(lk), len(rk))) + llab = factorizer.factorize(lk, mask=lk_mask) + rlab = factorizer.factorize(rk, mask=rk_mask) + count = factorizer.get_count() if sort: - uniques = rizer.uniques.to_array() + uniques = factorizer.uniques.to_array() llab, rlab = _sort_labels(uniques, llab, rlab) lmask = llab == -1