Skip to content

Commit

Permalink
Faster factorize
Browse files Browse the repository at this point in the history
  • Loading branch information
dcherian committed Dec 9, 2023
1 parent 12cbef9 commit 03edc1f
Show file tree
Hide file tree
Showing 2 changed files with 107 additions and 2 deletions.
69 changes: 69 additions & 0 deletions asv_bench/benchmarks/factorize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#!/usr/bin/env python3

import numpy as np
import pandas as pd
from asv_runner.benchmarks.mark import parameterize

import flox

# Number of distinct group labels in the "small" benchmark inputs.
Nsmall = 4
# Number of distinct group labels in the "large" benchmark inputs.
Nlarge = 2000


class Factorize:
    """Time the core ``factorize_`` function.

    This is a template for concrete benchmark classes: subclasses override
    ``setup`` and assign ``self.by_small`` and ``self.by_large``, the
    group-label inputs that are handed to ``flox.core.factorize_``.

    Every timing method is run over the full grid of ``expected`` (the
    ``expected_groups`` argument), ``reindex`` and ``sort``.
    """

    def setup(self, *args, **kwargs):
        # The base class has no data of its own; subclasses build it.
        # NOTE(review): asv is documented to skip benchmarks whose setup
        # raises NotImplementedError -- confirm that is the intent here.
        raise NotImplementedError

    @parameterize(
        {
            "expected": (None, (pd.Index([1, 3]),), (pd.RangeIndex(Nsmall),)),
            "reindex": [True, False],
            "sort": [True, False],
        }
    )
    def time_factorize_small(self, expected, reindex, sort):
        """Factorize ``self.by_small`` along the last axis."""
        flox.core.factorize_(
            self.by_small,
            axes=(-1,),
            expected_groups=expected,
            reindex=reindex,
            sort=sort,
        )

    @parameterize(
        {
            "expected": (None, (pd.Index([1, 3]),), (pd.RangeIndex(Nsmall),)),
            "reindex": [True, False],
            "sort": [True, False],
        }
    )
    def time_factorize_large(self, expected, reindex, sort):
        """Factorize ``self.by_large`` along the last axis."""
        flox.core.factorize_(
            self.by_large,
            axes=(-1,),
            # Forward the parameter: it used to be hard-coded to None, which
            # made every ``expected`` variant time exactly the same call.
            expected_groups=expected,
            reindex=reindex,
            sort=sort,
        )


class SingleGrouper1D(Factorize):
    """Benchmark inputs made of a single one-dimensional label array."""

    def setup(self, *args, **kwargs):
        # Each of the Nsmall labels appears 250 times in a row.
        few_labels = np.arange(Nsmall)
        repeated = np.repeat(few_labels, 250)
        self.by_small = (repeated,)

        # Every one of the Nlarge labels appears once, in shuffled order.
        many_labels = np.arange(Nlarge)
        shuffled = np.random.permutation(many_labels)
        self.by_large = (shuffled,)


class SingleGrouper3D(Factorize):
    """Benchmark inputs made of a single three-dimensional label array.

    The label vectors are built the same way as the one-dimensional case
    and then broadcast to a leading shape of ``(5, 5)``.
    """

    def setup(self, *args, **kwargs):
        small_labels = np.repeat(np.arange(Nsmall), 250)
        large_labels = np.random.permutation(np.arange(Nlarge))
        # Take the trailing dimension from the label vectors themselves
        # instead of hard-coding 1000, so the shapes stay valid if Nsmall
        # (or the repeat count) is ever changed.
        self.by_small = (np.broadcast_to(small_labels, (5, 5, small_labels.size)),)
        self.by_large = (np.broadcast_to(large_labels, (5, 5, large_labels.size)),)


# class Multiple(Factorize):
# def setup(self, *args, **kwargs):
# pass

# class CFTimeFactorize(Factorize):
# pass
40 changes: 38 additions & 2 deletions flox/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -541,6 +541,31 @@ def offset_labels(labels: np.ndarray, ngroups: int) -> tuple[np.ndarray, int]:
return offset, size


def fast_isin(ar1, ar2, invert):
    """
    Faster version of numpy isin.

    1. Use pd.factorize instead of np.unique
    2. Skip a bunch of checks

    Parameters
    ----------
    ar1 : one-dimensional array-like
        Values to test. Passed to ``pd.factorize``.
    ar2 : array-like
        Values to test against. Concatenated with the unique values of
        ``ar1``.
    invert : bool
        If True, return "not in ``ar2``" instead of "in ``ar2``".

    Returns
    -------
    np.ndarray of bool
        One entry per element of ``ar1``. Elements that ``pd.factorize``
        reports as missing (code ``-1``) are treated as not present in
        ``ar2``, i.e. their entry equals ``invert``.
    """
    codes, uniques = pd.factorize(ar1, sort=False)

    combined = np.concatenate((uniques, ar2))

    # One extra trailing slot: the missing-value code -1 from pd.factorize
    # indexes it, so missing values always map to "not in ar2". The previous
    # version relied on the last real element happening to hold that value
    # and raised IndexError when ``combined`` was empty.
    membership = np.empty(combined.size + 1, dtype=bool)
    membership[-1] = bool(invert)

    if combined.size:
        # We need this to be a stable sort, so always use 'mergesort'
        # here. The values from the first array should always come before
        # the values from the second array.
        order = combined.argsort(kind="mergesort")
        sorted_vals = combined[order]
        # A unique value of ar1 is in ar2 exactly when the next value in
        # sorted order is equal to it.
        if invert:
            adjacent = sorted_vals[1:] != sorted_vals[:-1]
        else:
            adjacent = sorted_vals[1:] == sorted_vals[:-1]
        # The final sorted element has no successor, hence "not in ar2".
        membership[order] = np.concatenate((adjacent, [invert]))

    # Expand the per-unique answers back to the original positions of ar1.
    return membership[codes]


@overload
def factorize_(
by: T_Bys,
Expand Down Expand Up @@ -639,12 +664,23 @@ def factorize_(
if expect is not None and reindex:
sorter = np.argsort(expect)
groups = expect[(sorter,)] if sort else expect

idx = np.searchsorted(expect, flat, sorter=sorter)
mask = ~np.isin(flat, expect) | isnull(flat) | (idx == len(expect))
mask = fast_isin(flat, expect, invert=True)
if not np.issubdtype(flat.dtype, np.integer):
mask |= isnull(flat)
outside_last_elem_mask = idx == len(expect)
mask |= outside_last_elem_mask

# idx = np.full(flat.shape, -1)
# result = np.searchsorted(expect.values, flat[~mask], sorter=sorter)
# idx[~mask] = result
# idx = np.searchsorted(expect.values, flat, sorter=sorter)
# idx[mask] = -1
if not sort:
# idx is the index into the sorted array.
# if we didn't want sorting, unsort it back
idx[(idx == len(expect),)] = -1
idx[(outside_last_elem_mask)] = -1
idx = sorter[(idx,)]
idx[mask] = -1
else:
Expand Down

0 comments on commit 03edc1f

Please sign in to comment.