From 8d436c5d509848376585b4f7dc95d8ef25060ca6 Mon Sep 17 00:00:00 2001 From: Miles Granger Date: Thu, 25 Aug 2022 11:00:13 +0200 Subject: [PATCH 1/4] Fix DictionaryArray.from_buffers, should not crash Requires modifying signature to take dictionary as well as the buffers. --- python/pyarrow/array.pxi | 49 ++++++++++++++++++++++++++++ python/pyarrow/includes/libarrow.pxd | 1 + python/pyarrow/tests/test_array.py | 7 ++++ 3 files changed, 57 insertions(+) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 7a4f040061669..b2f5e70b81d9f 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -2478,6 +2478,55 @@ cdef class DictionaryArray(Array): return self._indices + @staticmethod + def from_buffers(DataType type, int64_t length, buffers, dictionary, + int64_t null_count=-1, int64_t offset=0, memory_pool=None): + """ + Construct a DictionaryArray from buffers. + + type : pyarrow.DataType + length : int + The number of values in the array. + buffers : List[Buffer] + The buffers backing this array. + dictionary : pyarrow.Array, ndarray or pandas.Series + The array of values referenced by the indices. + null_count : int, default -1 + The number of null entries in the array. Negative value means that + the null count is not known. + offset : int, default 0 + The array's logical offset (in values, not in bytes) from the + start of each buffer. + memory_pool : MemoryPool, default None + For memory allocations, if required, otherwise uses default pool. + """ + cdef: + Array _dictionary + vector[shared_ptr[CBuffer]] c_buffers + shared_ptr[CDataType] c_type + shared_ptr[CArrayData] c_data + shared_ptr[CArray] c_result + + for buf in buffers: + c_buffers.push_back(pyarrow_unwrap_buffer(buf)) + + if isinstance(dictionary, Array): + _dictionary = dictionary + else: + _dictionary = array(dictionary, memory_pool=memory_pool) + + c_type = pyarrow_unwrap_data_type(type) + + with nogil: + c_data = CArrayData.Make( + c_type, length, c_buffers, null_count, offset) + c_data.get().dictionary = _dictionary.sp_array.get().data() + c_result.reset(new CDictionaryArray(c_data)) + + cdef Array result = pyarrow_wrap_array(c_result) + result.validate() + return result + @staticmethod def from_arrays(indices, dictionary, mask=None, bint ordered=False, bint from_pandas=False, bint safe=True, diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 781d2ce7ad6e8..31ac0d0ec6e85 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -250,6 +250,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: CDictionaryArray(const shared_ptr[CDataType]& type, const shared_ptr[CArray]& indices, const shared_ptr[CArray]& dictionary) + CDictionaryArray(const shared_ptr[CArrayData]& data) @staticmethod CResult[shared_ptr[CArray]] FromArrays( diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 814691c92d9b8..02e4bc276db44 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -725,6 +725,13 @@ def test_struct_array_from_chunked(): pa.StructArray.from_arrays([chunked_arr], ["foo"]) +def test_dictionary_from_buffers(): + a = pa.array(["one", "two", "three", "two", "one"]).dictionary_encode() + b = pa.DictionaryArray.from_buffers( + a.type, len(a), a.indices.buffers(), a.dictionary) + assert a == b + + def test_dictionary_from_numpy(): indices = np.repeat([0, 1, 2], 2) dictionary = np.array(['foo', 'bar', 'baz'], dtype=object) From ff683dcd07adf92a9a59778fbad9b68edfbd1373 Mon Sep 17 00:00:00 2001 From: Miles Granger Date: Mon, 29 Aug 2022 10:58:09 +0200 Subject: [PATCH 2/4] Address review comments --- python/pyarrow/array.pxi | 20 +++++++++----------- python/pyarrow/tests/test_array.py | 10 ++++++---- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index b2f5e70b81d9f..4da0ea79e1d1e 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -2479,11 +2479,13 @@ cdef class DictionaryArray(Array): return self._indices @staticmethod - def from_buffers(DataType type, int64_t length, buffers, dictionary, - int64_t null_count=-1, int64_t offset=0, memory_pool=None): + def from_buffers(DataType type, int64_t length, buffers, Array dictionary, + int64_t null_count=-1, int64_t offset=0): """ Construct a DictionaryArray from buffers. + Parameters + ---------- type : pyarrow.DataType length : int The number of values in the array. @@ -2497,11 +2499,12 @@ cdef class DictionaryArray(Array): offset : int, default 0 The array's logical offset (in values, not in bytes) from the start of each buffer. - memory_pool : MemoryPool, default None - For memory allocations, if required, otherwise uses default pool. + + Returns + ------- + dict_array : DictionaryArray """ cdef: - Array _dictionary vector[shared_ptr[CBuffer]] c_buffers shared_ptr[CDataType] c_type shared_ptr[CArrayData] c_data @@ -2510,17 +2513,12 @@ cdef class DictionaryArray(Array): for buf in buffers: c_buffers.push_back(pyarrow_unwrap_buffer(buf)) - if isinstance(dictionary, Array): - _dictionary = dictionary - else: - _dictionary = array(dictionary, memory_pool=memory_pool) - c_type = pyarrow_unwrap_data_type(type) with nogil: c_data = CArrayData.Make( c_type, length, c_buffers, null_count, offset) - c_data.get().dictionary = _dictionary.sp_array.get().data() + c_data.get().dictionary = dictionary.sp_array.get().data() c_result.reset(new CDictionaryArray(c_data)) cdef Array result = pyarrow_wrap_array(c_result) diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 02e4bc276db44..3a4bcc40f813b 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -725,11 +725,13 @@ def test_struct_array_from_chunked(): pa.StructArray.from_arrays([chunked_arr], ["foo"]) -def test_dictionary_from_buffers(): +@pytest.mark.parametrize("offset", (0, 1)) +def test_dictionary_from_buffers(offset): a = pa.array(["one", "two", "three", "two", "one"]).dictionary_encode() - b = pa.DictionaryArray.from_buffers( - a.type, len(a), a.indices.buffers(), a.dictionary) - assert a == b + b = pa.DictionaryArray.from_buffers(a.type, len(a)-offset, + a.indices.buffers(), a.dictionary, + offset=offset) + assert a[offset:] == b def test_dictionary_from_numpy(): From f6bbc5611e5aaa13ad811167fbfb41c1b5ff3056 Mon Sep 17 00:00:00 2001 From: Miles Granger Date: Tue, 30 Aug 2022 05:50:50 -0800 Subject: [PATCH 3/4] Update python/pyarrow/array.pxi [skip ci] Co-authored-by: Antoine Pitrou --- python/pyarrow/array.pxi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 4da0ea79e1d1e..30c2e7deeb3a5 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -2490,7 +2490,7 @@ cdef class DictionaryArray(Array): length : int The number of values in the array. buffers : List[Buffer] - The buffers backing this array. + The buffers backing the indices array. dictionary : pyarrow.Array, ndarray or pandas.Series The array of values referenced by the indices. null_count : int, default -1 From 79a01fbe3713c60f877a9d40d98be45ba87740df Mon Sep 17 00:00:00 2001 From: Miles Granger Date: Tue, 30 Aug 2022 05:51:02 -0800 Subject: [PATCH 4/4] Update python/pyarrow/array.pxi Co-authored-by: Antoine Pitrou --- python/pyarrow/array.pxi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 30c2e7deeb3a5..6c09e56e41341 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -2494,7 +2494,7 @@ cdef class DictionaryArray(Array): dictionary : pyarrow.Array, ndarray or pandas.Series The array of values referenced by the indices. null_count : int, default -1 - The number of null entries in the array. Negative value means that + The number of null entries in the indices array. Negative value means that the null count is not known. offset : int, default 0 The array's logical offset (in values, not in bytes) from the