Skip to content

Commit

Permalink
BUG make hashtable.unique support readonly arrays (#18825)
Browse files Browse the repository at this point in the history
  • Loading branch information
hexgnu authored and jreback committed Dec 27, 2017
1 parent ee2e6de commit 80a5399
Show file tree
Hide file tree
Showing 3 changed files with 74 additions and 44 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.23.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -350,6 +350,7 @@ Reshaping
- Bug in :func:`DataFrame.stack` which fails trying to sort mixed type levels under Python 3 (:issue:`18310`)
- Fixed construction of a :class:`Series` from a ``dict`` containing ``NaN`` as key (:issue:`18480`)
- Bug in :func:`Series.rank` where ``Series`` containing ``NaT`` modifies the ``Series`` inplace (:issue:`18521`)
- Bug in :func:`cut` which fails when using readonly arrays (:issue:`18773`)
- Bug in :func:`DataFrame.pivot_table` which fails when the ``aggfunc`` arg is of type string. The behavior is now consistent with other methods like ``agg`` and ``apply`` (:issue:`18713`)


Expand Down
100 changes: 59 additions & 41 deletions pandas/_libs/hashtable_class_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -255,10 +255,56 @@ dtypes = [('Float64', 'float64', 'val != val', True),
('UInt64', 'uint64', 'False', False),
('Int64', 'int64', 'val == iNaT', False)]

def get_dispatch(dtypes):
    """Yield each dtype entry together with a rendered ``unique`` body.

    Parameters
    ----------
    dtypes : iterable of tuple
        Each entry is ``(name, dtype, null_condition, float_group)``.

    Yields
    ------
    tuple
        ``(name, dtype, null_condition, float_group, unique_template)``,
        where ``unique_template`` is the Cython source text below with its
        ``{name}``, ``{dtype}`` and ``{float_group}`` placeholders filled in
        for that entry.
    """
    for (name, dtype, null_condition, float_group) in dtypes:
        # Cython source kept as text so the identical body can be emitted
        # wherever the template file writes ``{{unique_template}}``.
        # The backslash right after the opening quotes suppresses the
        # leading newline of the literal.
        # After formatting, ``IF {float_group}:`` reads ``IF True:`` or
        # ``IF False:`` depending on the entry.
        # NOTE(review): the leading whitespace of the template lines is not
        # visible in this view; the lines are reproduced exactly as shown.
        # Confirm the real indentation against the repository file.
        unique_template = """\
cdef:
Py_ssize_t i, n = len(values)
int ret = 0
{dtype}_t val
khiter_t k
bint seen_na = 0
{name}Vector uniques = {name}Vector()
{name}VectorData *ud

ud = uniques.data

with nogil:
for i in range(n):
val = values[i]
IF {float_group}:
if val == val:
k = kh_get_{dtype}(self.table, val)
if k == self.table.n_buckets:
kh_put_{dtype}(self.table, val, &ret)
if needs_resize(ud):
with gil:
uniques.resize()
append_data_{dtype}(ud, val)
elif not seen_na:
seen_na = 1
if needs_resize(ud):
with gil:
uniques.resize()
append_data_{dtype}(ud, NAN)
ELSE:
k = kh_get_{dtype}(self.table, val)
if k == self.table.n_buckets:
kh_put_{dtype}(self.table, val, &ret)
if needs_resize(ud):
with gil:
uniques.resize()
append_data_{dtype}(ud, val)
return uniques.to_array()
"""

        # ``null_condition`` is supplied as a keyword, but the template has
        # no ``{null_condition}`` placeholder, so ``str.format`` ignores it.
        unique_template = unique_template.format(name=name, dtype=dtype, null_condition=null_condition, float_group=float_group)

        # Pass the original four fields through unchanged and append the
        # rendered template as a fifth element.
        yield (name, dtype, null_condition, float_group, unique_template)
}}


{{for name, dtype, null_condition, float_group in dtypes}}
{{for name, dtype, null_condition, float_group, unique_template in get_dispatch(dtypes)}}

cdef class {{name}}HashTable(HashTable):

Expand Down Expand Up @@ -450,48 +496,20 @@ cdef class {{name}}HashTable(HashTable):
return np.asarray(labels), arr_uniques

@cython.boundscheck(False)
def unique(self, {{dtype}}_t[:] values):
cdef:
Py_ssize_t i, n = len(values)
int ret = 0
{{dtype}}_t val
khiter_t k
bint seen_na = 0
{{name}}Vector uniques = {{name}}Vector()
{{name}}VectorData *ud
def unique(self, ndarray[{{dtype}}_t, ndim=1] values):
if values.flags.writeable:
# If the array is writeable (mutable), use the typed-memoryview version
return self.unique_memview(values)

ud = uniques.data

with nogil:
for i in range(n):
val = values[i]

{{if float_group}}
if val == val:
k = kh_get_{{dtype}}(self.table, val)
if k == self.table.n_buckets:
kh_put_{{dtype}}(self.table, val, &ret)
if needs_resize(ud):
with gil:
uniques.resize()
append_data_{{dtype}}(ud, val)
elif not seen_na:
seen_na = 1
if needs_resize(ud):
with gil:
uniques.resize()
append_data_{{dtype}}(ud, NAN)
{{else}}
k = kh_get_{{dtype}}(self.table, val)
if k == self.table.n_buckets:
kh_put_{{dtype}}(self.table, val, &ret)
if needs_resize(ud):
with gil:
uniques.resize()
append_data_{{dtype}}(ud, val)
{{endif}}
# We cannot use the memoryview version on readonly-buffers due to
# a limitation of Cython's typed memoryviews. Instead we can use
# the slightly slower Cython ndarray type directly.
# see https://github.com/cython/cython/issues/1605
{{unique_template}}

return uniques.to_array()
@cython.boundscheck(False)
def unique_memview(self, {{dtype}}_t[:] values):
{{unique_template}}

{{endfor}}

Expand Down
17 changes: 14 additions & 3 deletions pandas/tests/reshape/test_tile.py
Original file line number Diff line number Diff line change
Expand Up @@ -512,7 +512,18 @@ def f():
tm.assert_numpy_array_equal(
mask, np.array([False, True, True, True, True]))

@pytest.mark.parametrize(
"array_1_writeable, array_2_writeable",
[(True, True), (True, False), (False, False)])
def test_cut_read_only(self, array_1_writeable, array_2_writeable):
# issue 18773
array_1 = np.arange(0, 100, 10)
array_1.flags.writeable = array_1_writeable

def curpath():
pth, _ = os.path.split(os.path.abspath(__file__))
return pth
array_2 = np.arange(0, 100, 10)
array_2.flags.writeable = array_2_writeable

hundred_elements = np.arange(100)

tm.assert_categorical_equal(cut(hundred_elements, array_1),
cut(hundred_elements, array_2))

0 comments on commit 80a5399

Please sign in to comment.