Skip to content

Commit

Permalink
ARROW-12677: [Python] Add a mask argument to pyarrow.StructArray.from…
Browse files Browse the repository at this point in the history
…_arrays

This allows the user to supply an optional `mask` when creating a struct array.

 * The mask requirements are pretty strict (must be a boolean arrow array without nulls) compared with some of the other functions (e.g. `array.mask` accepts a wide variety of inputs).  I think this should be ok since this use case is probably rarer and there are plenty of other existing ways to convert other datatypes to an arrow array.
 * ~~Unfortunately, StructArray::Make interprets the "null buffer" as more of a validity buffer (1 = valid, 0 = null).  This is the opposite of everywhere else a `mask` is used.  I was torn between inverting the input buffer to mimic the python API and passing through directly to the C interface for simplicity.  I chose the simpler option but could be convinced otherwise.~~ Per request, I now invert the mask to align with the python API.

Closes #10272 from westonpace/feature/ARROW-12677--python-add-a-mask-argument-to-pyarrow-structarra

Authored-by: Weston Pace <[email protected]>
Signed-off-by: David Li <[email protected]>
  • Loading branch information
westonpace authored and lidavidm committed May 14, 2021
1 parent 527c346 commit f47703e
Show file tree
Hide file tree
Showing 2 changed files with 95 additions and 3 deletions.
57 changes: 54 additions & 3 deletions python/pyarrow/array.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -1593,6 +1593,39 @@ cdef class ListArray(BaseListArray):
Returns
-------
list_array : ListArray
Examples
--------
>>> values = pa.array([1, 2, 3, 4])
>>> offsets = pa.array([0, 2, 4])
>>> pa.ListArray.from_arrays(offsets, values)
<pyarrow.lib.ListArray object at 0x7fbde226bf40>
[
[
1,
2
],
[
3,
4
]
]
# nulls in the offsets array become null lists
>>> offsets = pa.array([0, None, 2, 4])
>>> pa.ListArray.from_arrays(offsets, values)
<pyarrow.lib.ListArray object at 0x7fbde226bf40>
[
[
1,
2
],
null,
[
3,
4
]
]
"""
cdef:
Array _offsets, _values
Expand Down Expand Up @@ -2153,7 +2186,8 @@ cdef class StructArray(Array):
return [pyarrow_wrap_array(arr) for arr in arrays]

@staticmethod
def from_arrays(arrays, names=None, fields=None):
def from_arrays(arrays, names=None, fields=None, mask=None,
memory_pool=None):
"""
Construct StructArray from collection of arrays representing
each field in the struct.
Expand All @@ -2167,13 +2201,18 @@ cdef class StructArray(Array):
Field names for each struct child.
fields : List[Field] (optional)
Field instances for each struct child.
mask : pyarrow.Array[bool] (optional)
Indicate which values are null (True) or not null (False).
memory_pool : MemoryPool (optional)
For memory allocations, if required, otherwise uses default pool.
Returns
-------
result : StructArray
"""
cdef:
shared_ptr[CArray] c_array
shared_ptr[CBuffer] c_mask
vector[shared_ptr[CArray]] c_arrays
vector[c_string] c_names
vector[shared_ptr[CField]] c_fields
Expand All @@ -2189,6 +2228,18 @@ cdef class StructArray(Array):
if names is not None and fields is not None:
raise ValueError('Must pass either names or fields, not both')

if mask is None:
c_mask = shared_ptr[CBuffer]()
elif isinstance(mask, Array):
if mask.type.id != Type_BOOL:
raise ValueError('Mask must be a pyarrow.Array of type bool')
if mask.null_count != 0:
raise ValueError('Mask must not contain nulls')
inverted_mask = _pc().invert(mask, memory_pool=memory_pool)
c_mask = pyarrow_unwrap_buffer(inverted_mask.buffers()[1])
else:
raise ValueError('Mask must be a pyarrow.Array of type bool')

arrays = [asarray(x) for x in arrays]
for arr in arrays:
c_array = pyarrow_unwrap_array(arr)
Expand All @@ -2215,10 +2266,10 @@ cdef class StructArray(Array):
# XXX Cannot pass "nullptr" for a shared_ptr<T> argument:
# https://github.com/cython/cython/issues/3020
c_result = CStructArray.MakeFromFieldNames(
c_arrays, c_names, shared_ptr[CBuffer](), -1, 0)
c_arrays, c_names, c_mask, -1, 0)
else:
c_result = CStructArray.MakeFromFields(
c_arrays, c_fields, shared_ptr[CBuffer](), -1, 0)
c_arrays, c_fields, c_mask, -1, 0)
cdef Array result = pyarrow_wrap_array(GetResultValue(c_result))
result.validate()
return result
Expand Down
41 changes: 41 additions & 0 deletions python/pyarrow/tests/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -668,6 +668,28 @@ def test_struct_from_arrays():
with pytest.raises(ValueError, match="int64 vs int32"):
pa.StructArray.from_arrays([a, b, c], fields=[fa2, fb, fc])

arrays = [a, b, c]
fields = [fa, fb, fc]
# With mask
mask = pa.array([True, False, False])
arr = pa.StructArray.from_arrays(arrays, fields=fields, mask=mask)
assert arr.to_pylist() == [None] + expected_list[1:]

arr = pa.StructArray.from_arrays(arrays, names=['a', 'b', 'c'], mask=mask)
assert arr.to_pylist() == [None] + expected_list[1:]

# Bad masks
with pytest.raises(ValueError, match='Mask must be'):
pa.StructArray.from_arrays(arrays, fields, mask=[True, False, False])

with pytest.raises(ValueError, match='not contain nulls'):
pa.StructArray.from_arrays(
arrays, fields, mask=pa.array([True, False, None]))

with pytest.raises(ValueError, match='Mask must be'):
pa.StructArray.from_arrays(
arrays, fields, mask=pa.chunked_array([mask]))


def test_struct_array_from_chunked():
# ARROW-11780
Expand Down Expand Up @@ -932,6 +954,25 @@ def test_fixed_size_list_from_arrays():
pa.FixedSizeListArray.from_arrays(values, 5)


def test_variable_list_from_arrays():
    """Check ListArray.from_arrays with plain, null and invalid offsets."""
    values = pa.array([1, 2, 3, 4], pa.int64())

    # Plain offsets: each consecutive pair delimits one list.
    list_arr = pa.ListArray.from_arrays(pa.array([0, 2, 4]), values)
    assert list_arr.to_pylist() == [[1, 2], [3, 4]]
    assert list_arr.type.equals(pa.list_(pa.int64()))

    # A null offset yields a null list at that position.
    list_arr = pa.ListArray.from_arrays(pa.array([0, None, 2, 4]), values)
    assert list_arr.to_pylist() == [[1, 2], None, [3, 4]]

    # raise if offset out of bounds
    for bad_offsets in ([-1, 2, 4], [0, 2, 5]):
        with pytest.raises(ValueError):
            pa.ListArray.from_arrays(pa.array(bad_offsets), values)


def test_union_from_dense():
binary = pa.array([b'a', b'b', b'c', b'd'], type='binary')
int64 = pa.array([1, 2, 3], type='int64')
Expand Down

0 comments on commit f47703e

Please sign in to comment.