From dd0988b49cb6726cf915bb9f53d7320e3a97b00b Mon Sep 17 00:00:00 2001 From: Miles Granger Date: Thu, 25 Aug 2022 10:53:29 +0200 Subject: [PATCH] ARROW-15277: [C++][Python] Use ChunkedArray::Make for chunked_array (#13950) Supersedes and will close #12096 [ARROW-15277](https://issues.apache.org/jira/browse/ARROW-15277) Lead-authored-by: Miles Granger Co-authored-by: Eduardo Ponce Co-authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- cpp/src/arrow/chunked_array.cc | 2 +- cpp/src/arrow/chunked_array_test.cc | 4 ++-- python/pyarrow/includes/libarrow.pxd | 4 ++++ python/pyarrow/table.pxi | 21 +++++---------------- python/pyarrow/tests/test_compute.py | 8 ++++++++ python/pyarrow/tests/test_table.py | 10 ++++------ 6 files changed, 24 insertions(+), 25 deletions(-) diff --git a/cpp/src/arrow/chunked_array.cc b/cpp/src/arrow/chunked_array.cc index 840dd04a5ad4e..c5e6d7fa4bdf0 100644 --- a/cpp/src/arrow/chunked_array.cc +++ b/cpp/src/arrow/chunked_array.cc @@ -72,7 +72,7 @@ Result<std::shared_ptr<ChunkedArray>> ChunkedArray::Make(ArrayVector chunks, } for (const auto& chunk : chunks) { if (!chunk->type()->Equals(*type)) { - return Status::Invalid("Array chunks must all be same type"); + return Status::TypeError("Array chunks must all be same type"); } } return std::make_shared<ChunkedArray>(std::move(chunks), std::move(type)); diff --git a/cpp/src/arrow/chunked_array_test.cc b/cpp/src/arrow/chunked_array_test.cc index d1dc69de274b6..08410b4cd5367 100644 --- a/cpp/src/arrow/chunked_array_test.cc +++ b/cpp/src/arrow/chunked_array_test.cc @@ -65,8 +65,8 @@ TEST_F(TestChunkedArray, Make) { ASSERT_OK_AND_ASSIGN(auto result2, ChunkedArray::Make({chunk0, chunk0}, int8())); AssertChunkedEqual(*result, *result2); - ASSERT_RAISES(Invalid, ChunkedArray::Make({chunk0, chunk1})); - ASSERT_RAISES(Invalid, ChunkedArray::Make({chunk0}, int16())); + ASSERT_RAISES(TypeError, ChunkedArray::Make({chunk0, chunk1})); + ASSERT_RAISES(TypeError, ChunkedArray::Make({chunk0}, int16())); } TEST_F(TestChunkedArray, MakeEmpty) { diff
--git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index a9b0a4bc71ab2..781d2ce7ad6e8 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -760,6 +760,10 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: CChunkedArray(const vector[shared_ptr[CArray]]& arrays) CChunkedArray(const vector[shared_ptr[CArray]]& arrays, const shared_ptr[CDataType]& type) + + @staticmethod + CResult[shared_ptr[CChunkedArray]] Make(vector[shared_ptr[CArray]] chunks, + shared_ptr[CDataType] type) int64_t length() int64_t null_count() int num_chunks() diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 5d84716fc9824..b8c98df1f0e0e 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -1303,7 +1303,8 @@ def chunked_array(arrays, type=None): cdef: Array arr vector[shared_ptr[CArray]] c_arrays - shared_ptr[CChunkedArray] sp_chunked_array + shared_ptr[CChunkedArray] c_result + shared_ptr[CDataType] c_type type = ensure_type(type, allow_none=True) @@ -1318,25 +1319,13 @@ def chunked_array(arrays, type=None): # subsequent arrays to the firstly inferred array type # it also spares the inference overhead after the first chunk type = arr.type - else: - if arr.type != type: - raise TypeError( - "All array chunks must have type {}".format(type) - ) c_arrays.push_back(arr.sp_array) - if c_arrays.size() == 0 and type is None: - raise ValueError("When passing an empty collection of arrays " - "you must also pass the data type") - - sp_chunked_array.reset( - new CChunkedArray(c_arrays, pyarrow_unwrap_data_type(type)) - ) + c_type = pyarrow_unwrap_data_type(type) with nogil: - check_status(sp_chunked_array.get().Validate()) - - return pyarrow_wrap_chunked_array(sp_chunked_array) + c_result = GetResultValue(CChunkedArray.Make(c_arrays, c_type)) + return pyarrow_wrap_chunked_array(c_result) cdef _schema_from_arrays(arrays, names, metadata, shared_ptr[CSchema]* schema): diff --git 
a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index 2bdec412f1f82..f2820b6e25f4c 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -394,6 +394,14 @@ def test_mode_chunked_array(): assert len(pc.mode(arr)) == 0 +def test_empty_chunked_array(): + msg = "cannot construct ChunkedArray from empty vector and omitted type" + with pytest.raises(pa.ArrowInvalid, match=msg): + pa.chunked_array([]) + + pa.chunked_array([], type=pa.int8()) + + def test_variance(): data = [1, 2, 3, 4, 5, 6, 7, 8] assert pc.variance(data).as_py() == 5.25 diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index dbd90ac907b80..c0c60da6272f2 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -97,10 +97,7 @@ def test_chunked_array_construction(): assert len(arr) == 3 assert len(arr.chunks) == 2 - msg = ( - "When passing an empty collection of arrays you must also pass the " - "data type" - ) + msg = "cannot construct ChunkedArray from empty vector and omitted type" with pytest.raises(ValueError, match=msg): assert pa.chunked_array([]) @@ -143,14 +140,15 @@ def test_chunked_array_to_numpy(): def test_chunked_array_mismatch_types(): - with pytest.raises(TypeError): + msg = "chunks must all be same type" + with pytest.raises(TypeError, match=msg): # Given array types are different pa.chunked_array([ pa.array([1, 2, 3]), pa.array([1., 2., 3.]) ]) - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): # Given array type is different from explicit type argument pa.chunked_array([pa.array([1, 2, 3])], type=pa.float64())