From 0dbbd43ca9133912d1809394727784560cc5e797 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Tue, 13 Feb 2024 18:15:10 +0000 Subject: [PATCH 01/46] GH-40052: [C++][FS][Azure] Fix CreateDir and DeleteDir trailing slash issues on hierarchical namespace accounts (#40054) ### Rationale for this change There were the following failure cases ``` fs->CreateDir("directory/") ``` ``` fs->DeleteDir("directory/") ``` They fail with ``` Failed to delete a directory: directory/: https://tomtesthns.blob.core.windows.net/ea119933-c9d3-11ee-989a-71cec6336ac8/directory/ Azure Error: [InvalidUri] 400 The request URI is invalid. The request URI is invalid. RequestId:c9ad826a-101f-0005-5be0-5d0db4000000 Time:2024-02-12T18:24:12.9974541Z Request ID: c9ad826a-101f-0005-5be0-5d0db4000000 ``` ### What changes are included in this PR? Add tests to cover these cases. Remove trailing slashes to avoid these issues. ### Are these changes tested? Yes. I added new test cases to cover these cases. I was rather torn about whether to add precise tests like I've done or to duplicate every test case we had and run it a second time with trailing slashes. ### Are there any user-facing changes? Fixed bug on `CreateDir` and `DeleteDir`. 
* Closes: #40052 Authored-by: Thomas Newton Signed-off-by: Felipe Oliveira Carvalho --- cpp/src/arrow/filesystem/azurefs.cc | 6 ++- cpp/src/arrow/filesystem/azurefs_test.cc | 64 ++++++++++++++++++++++++ 2 files changed, 68 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index d4bb445701444..11750591932e9 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -1635,7 +1635,8 @@ class AzureFileSystem::Impl { return CreateDirTemplate( adlfs_client, [](const auto& adlfs_client, const auto& location) { - auto directory_client = adlfs_client.GetDirectoryClient(location.path); + auto directory_client = adlfs_client.GetDirectoryClient( + std::string(internal::RemoveTrailingSlash(location.path))); directory_client.CreateIfNotExists(); }, location, recursive); @@ -1860,7 +1861,8 @@ class AzureFileSystem::Impl { Azure::Nullable lease_id = {}) { DCHECK(!location.container.empty()); DCHECK(!location.path.empty()); - auto directory_client = adlfs_client.GetDirectoryClient(location.path); + auto directory_client = adlfs_client.GetDirectoryClient( + std::string(internal::RemoveTrailingSlash(location.path))); DataLake::DeleteDirectoryOptions options; options.AccessConditions.LeaseId = std::move(lease_id); try { diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index c39a5b7d22bdd..42f38f1ed6ac7 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -698,6 +698,14 @@ class TestAzureFileSystem : public ::testing::Test { AssertFileInfo(fs(), dir1, FileType::Directory); } + void TestCreateDirOnRootWithTrailingSlash() { + auto dir1 = PreexistingData::RandomContainerName(rng_) + "/"; + + AssertFileInfo(fs(), dir1, FileType::NotFound); + ASSERT_OK(fs()->CreateDir(dir1, false)); + AssertFileInfo(fs(), dir1, FileType::Directory); + } + void TestCreateDirOnExistingContainer() { auto data = 
SetUpPreexistingData(); auto dir1 = data.RandomDirectoryPath(rng_); @@ -758,6 +766,15 @@ class TestAzureFileSystem : public ::testing::Test { AssertFileInfo(fs(), subdir5, FileType::Directory); } + void TestCreateDirOnExistingContainerWithTrailingSlash() { + auto data = SetUpPreexistingData(); + auto dir1 = data.RandomDirectoryPath(rng_) + "/"; + + AssertFileInfo(fs(), dir1, FileType::NotFound); + ASSERT_OK(fs()->CreateDir(dir1, /*recursive=*/false)); + AssertFileInfo(fs(), dir1, FileType::Directory); + } + void TestCreateDirOnMissingContainer() { auto container1 = PreexistingData::RandomContainerName(rng_); auto container2 = PreexistingData::RandomContainerName(rng_); @@ -844,6 +861,21 @@ class TestAzureFileSystem : public ::testing::Test { AssertFileInfo(fs(), blob_path, FileType::NotFound); } + void TestNonEmptyDirWithTrailingSlash() { + if (HasSubmitBatchBug()) { + GTEST_SKIP() << kSubmitBatchBugMessage; + } + auto data = SetUpPreexistingData(); + const auto directory_path = data.RandomDirectoryPath(rng_); + const auto blob_path = ConcatAbstractPath(directory_path, "hello.txt"); + ASSERT_OK_AND_ASSIGN(auto output, fs()->OpenOutputStream(blob_path)); + ASSERT_OK(output->Write("hello")); + ASSERT_OK(output->Close()); + AssertFileInfo(fs(), blob_path, FileType::File); + ASSERT_OK(fs()->DeleteDir(directory_path + "/")); + AssertFileInfo(fs(), blob_path, FileType::NotFound); + } + void TestDeleteDirSuccessHaveDirectory() { if (HasSubmitBatchBug()) { GTEST_SKIP() << kSubmitBatchBugMessage; @@ -873,6 +905,20 @@ class TestAzureFileSystem : public ::testing::Test { } } + void TestDeleteDirContentsSuccessExistWithTrailingSlash() { + if (HasSubmitBatchBug()) { + GTEST_SKIP() << kSubmitBatchBugMessage; + } + auto preexisting_data = SetUpPreexistingData(); + HierarchicalPaths paths; + CreateHierarchicalData(&paths); + ASSERT_OK(fs()->DeleteDirContents(paths.directory + "/")); + AssertFileInfo(fs(), paths.directory, FileType::Directory); + for (const auto& sub_path : 
paths.sub_paths) { + AssertFileInfo(fs(), sub_path, FileType::NotFound); + } + } + void TestDeleteDirContentsSuccessNonexistent() { if (HasSubmitBatchBug()) { GTEST_SKIP() << kSubmitBatchBugMessage; @@ -1466,6 +1512,10 @@ TYPED_TEST(TestAzureFileSystemOnAllEnvs, CreateDirWithEmptyPath) { TYPED_TEST(TestAzureFileSystemOnAllEnvs, CreateDirOnRoot) { this->TestCreateDirOnRoot(); } +TYPED_TEST(TestAzureFileSystemOnAllEnvs, CreateDirOnRootWithTrailingSlash) { + this->TestCreateDirOnRootWithTrailingSlash(); +} + // Tests using all the 3 environments (Azurite, Azure w/o HNS (flat), Azure w/ HNS) // combined with the two scenarios for AzureFileSystem::cached_hns_support_ -- unknown and // known according to the environment. @@ -1496,6 +1546,11 @@ TYPED_TEST(TestAzureFileSystemOnAllScenarios, CreateDirOnExistingContainer) { this->TestCreateDirOnExistingContainer(); } +TYPED_TEST(TestAzureFileSystemOnAllScenarios, + CreateDirOnExistingContainerWithTrailingSlash) { + this->TestCreateDirOnExistingContainerWithTrailingSlash(); +} + TYPED_TEST(TestAzureFileSystemOnAllScenarios, CreateDirOnMissingContainer) { this->TestCreateDirOnMissingContainer(); } @@ -1512,6 +1567,10 @@ TYPED_TEST(TestAzureFileSystemOnAllScenarios, DeleteDirSuccessHaveBlob) { this->TestDeleteDirSuccessHaveBlob(); } +TYPED_TEST(TestAzureFileSystemOnAllScenarios, NonEmptyDirWithTrailingSlash) { + this->TestNonEmptyDirWithTrailingSlash(); +} + TYPED_TEST(TestAzureFileSystemOnAllScenarios, DeleteDirSuccessHaveDirectory) { this->TestDeleteDirSuccessHaveDirectory(); } @@ -1520,6 +1579,11 @@ TYPED_TEST(TestAzureFileSystemOnAllScenarios, DeleteDirContentsSuccessExist) { this->TestDeleteDirContentsSuccessExist(); } +TYPED_TEST(TestAzureFileSystemOnAllScenarios, + DeleteDirContentsSuccessExistWithTrailingSlash) { + this->TestDeleteDirContentsSuccessExistWithTrailingSlash(); +} + TYPED_TEST(TestAzureFileSystemOnAllScenarios, DeleteDirContentsSuccessNonexistent) { this->TestDeleteDirContentsSuccessNonexistent(); } From 
967831b49d8ffeb9499c22aa3a812e46dc5cb1aa Mon Sep 17 00:00:00 2001 From: Yue Date: Wed, 14 Feb 2024 16:40:56 +0800 Subject: [PATCH 02/46] GH-40040: [C++][Gandiva] Make Gandiva's default cache size to be 5000 for object code cache (#40041) ### Rationale for this change Gandiva's default cache is object code cache, however, the default cache size is still the old value for LLVM module based cache, which is too small. More details about the `GANDIVA_ENABLE_OBJECT_CODE_CACHE` flag can be found in GH-40040 ### What changes are included in this PR? Remove the unused `GANDIVA_ENABLE_OBJECT_CODE_CACHE` flag and make the default cache size to be `5000` for object code cache. ### Are these changes tested? No ### Are there any user-facing changes? Yes, default cache size will be changed from 500 to 5000, and it may help the default deployment's performance. * Closes: #40040 Authored-by: Yue Ni Signed-off-by: Antoine Pitrou --- cpp/src/gandiva/CMakeLists.txt | 1 + cpp/src/gandiva/cache.cc | 6 +---- cpp/src/gandiva/cache_test.cc | 42 ++++++++++++++++++++++++++++ 3 files changed, 44 insertions(+), 5 deletions(-) create mode 100644 cpp/src/gandiva/cache_test.cc diff --git a/cpp/src/gandiva/CMakeLists.txt b/cpp/src/gandiva/CMakeLists.txt index d773fb5ff5895..9352ac5c4a938 100644 --- a/cpp/src/gandiva/CMakeLists.txt +++ b/cpp/src/gandiva/CMakeLists.txt @@ -243,6 +243,7 @@ endfunction() add_gandiva_test(internals-test SOURCES bitmap_accumulator_test.cc + cache_test.cc engine_llvm_test.cc function_registry_test.cc function_signature_test.cc diff --git a/cpp/src/gandiva/cache.cc b/cpp/src/gandiva/cache.cc index f7e3e5e9f8f1f..a1333ccdc5d43 100644 --- a/cpp/src/gandiva/cache.cc +++ b/cpp/src/gandiva/cache.cc @@ -23,11 +23,7 @@ namespace gandiva { -#ifdef GANDIVA_ENABLE_OBJECT_CODE_CACHE -static const size_t DEFAULT_CACHE_SIZE = 500000; -#else -static const size_t DEFAULT_CACHE_SIZE = 500; -#endif +static const size_t DEFAULT_CACHE_SIZE = 5000; int GetCapacity() { size_t 
capacity = DEFAULT_CACHE_SIZE; diff --git a/cpp/src/gandiva/cache_test.cc b/cpp/src/gandiva/cache_test.cc new file mode 100644 index 0000000000000..a146707079fa6 --- /dev/null +++ b/cpp/src/gandiva/cache_test.cc @@ -0,0 +1,42 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "gandiva/cache.h" + +#include + +namespace gandiva { +class TestCacheKey { + public: + explicit TestCacheKey(int value) : value_(value) {} + std::size_t Hash() const { return value_; } + bool operator==(const TestCacheKey& other) const { return value_ == other.value_; } + + private: + int value_; +}; + +TEST(TestCache, TestGetPut) { + Cache cache(2); + cache.PutObjectCode(TestCacheKey(1), "hello"); + cache.PutObjectCode(TestCacheKey(2), "world"); + ASSERT_EQ(cache.GetObjectCode(TestCacheKey(1)), "hello"); + ASSERT_EQ(cache.GetObjectCode(TestCacheKey(2)), "world"); +} + +TEST(TestCache, TestGetCacheCapacity) { ASSERT_EQ(GetCapacity(), 5000); } +} // namespace gandiva From 91bf1c9c170c1917ad47bb0dbb38aa5c9fbbbfb2 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 14 Feb 2024 04:55:27 -0400 Subject: [PATCH 03/46] GH-39984: [Python] Add ChunkedArray import/export to/from C (#39985) ### Rationale for this change ChunkedArrays have an unambiguous representation as a stream of arrays. #39455 added the ability to import/export in C++...this PR wires up the new functions in pyarrow. ### What changes are included in this PR? - Added `__arrow_c_stream__()` and `_import_from_c_capsule()` to the `ChunkedArray` ### Are these changes tested? Yes! Tests were added. ### Are there any user-facing changes? Yes! But I'm not sure where the protocol methods are documented. 
```python import pyarrow as pa import nanoarrow as na chunked = pa.chunked_array([pa.array([0, 1, 2]), pa.array([3, 4, 5])]) [na.c_array_view(item) for item in na.c_array_stream(chunked)] ``` [ - storage_type: 'int64' - length: 3 - offset: 0 - null_count: 0 - buffers[2]: - - - dictionary: NULL - children[0]:, - storage_type: 'int64' - length: 3 - offset: 0 - null_count: 0 - buffers[2]: - - - dictionary: NULL - children[0]:] ```python stream_capsule = chunked.__arrow_c_stream__() chunked2 = chunked._import_from_c_capsule(stream_capsule) chunked2 ``` [ [ 0, 1, 2 ], [ 3, 4, 5 ] ] * Closes: #39984 Lead-authored-by: Dewey Dunnington Co-authored-by: Dewey Dunnington Signed-off-by: Antoine Pitrou --- python/pyarrow/includes/libarrow.pxd | 3 ++ python/pyarrow/table.pxi | 61 ++++++++++++++++++++++++++++ python/pyarrow/tests/test_cffi.py | 26 ++++++++++++ 3 files changed, 90 insertions(+) diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 8056d99354965..935fb4d34b318 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -2930,6 +2930,9 @@ cdef extern from "arrow/c/bridge.h" namespace "arrow" nogil: CResult[shared_ptr[CRecordBatchReader]] ImportRecordBatchReader( ArrowArrayStream*) + CStatus ExportChunkedArray(shared_ptr[CChunkedArray], ArrowArrayStream*) + CResult[shared_ptr[CChunkedArray]] ImportChunkedArray(ArrowArrayStream*) + cdef extern from "arrow/util/byte_size.h" namespace "arrow::util" nogil: CResult[int64_t] ReferencedBufferSize(const CArray& array_data) diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index abda784fb7c18..ee3872aa3a242 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -1327,6 +1327,67 @@ cdef class ChunkedArray(_PandasConvertible): result += self.chunk(i).to_pylist() return result + def __arrow_c_stream__(self, requested_schema=None): + """ + Export to a C ArrowArrayStream PyCapsule. 
+ + Parameters + ---------- + requested_schema : PyCapsule, default None + The schema to which the stream should be casted, passed as a + PyCapsule containing a C ArrowSchema representation of the + requested schema. + + Returns + ------- + PyCapsule + A capsule containing a C ArrowArrayStream struct. + """ + cdef: + ArrowArrayStream* c_stream = NULL + + if requested_schema is not None: + out_type = DataType._import_from_c_capsule(requested_schema) + if self.type != out_type: + raise NotImplementedError("Casting to requested_schema") + + stream_capsule = alloc_c_stream(&c_stream) + + with nogil: + check_status(ExportChunkedArray(self.sp_chunked_array, c_stream)) + + return stream_capsule + + @staticmethod + def _import_from_c_capsule(stream): + """ + Import ChunkedArray from a C ArrowArrayStream PyCapsule. + + Parameters + ---------- + stream: PyCapsule + A capsule containing a C ArrowArrayStream PyCapsule. + + Returns + ------- + ChunkedArray + """ + cdef: + ArrowArrayStream* c_stream + shared_ptr[CChunkedArray] c_chunked_array + ChunkedArray self + + c_stream = PyCapsule_GetPointer( + stream, 'arrow_array_stream' + ) + + with nogil: + c_chunked_array = GetResultValue(ImportChunkedArray(c_stream)) + + self = ChunkedArray.__new__(ChunkedArray) + self.init(c_chunked_array) + return self + def chunked_array(arrays, type=None): """ diff --git a/python/pyarrow/tests/test_cffi.py b/python/pyarrow/tests/test_cffi.py index ff81b06440f03..3a0c7b5b7152f 100644 --- a/python/pyarrow/tests/test_cffi.py +++ b/python/pyarrow/tests/test_cffi.py @@ -601,3 +601,29 @@ def test_roundtrip_batch_reader_capsule(): assert imported_reader.read_next_batch().equals(batch) with pytest.raises(StopIteration): imported_reader.read_next_batch() + + +def test_roundtrip_chunked_array_capsule(): + chunked = pa.chunked_array([pa.array(["a", "b", "c"])]) + + capsule = chunked.__arrow_c_stream__() + assert PyCapsule_IsValid(capsule, b"arrow_array_stream") == 1 + imported_chunked = 
pa.ChunkedArray._import_from_c_capsule(capsule) + assert imported_chunked.type == chunked.type + assert imported_chunked == chunked + + +def test_roundtrip_chunked_array_capsule_requested_schema(): + chunked = pa.chunked_array([pa.array(["a", "b", "c"])]) + + # Requesting the same type should work + requested_capsule = chunked.type.__arrow_c_schema__() + capsule = chunked.__arrow_c_stream__(requested_capsule) + imported_chunked = pa.ChunkedArray._import_from_c_capsule(capsule) + assert imported_chunked == chunked + + # Casting to something else should error + requested_type = pa.binary() + requested_capsule = requested_type.__arrow_c_schema__() + with pytest.raises(NotImplementedError): + chunked.__arrow_c_stream__(requested_capsule) From 2422994de04cf4f5a989fec0f00fabccad15b03f Mon Sep 17 00:00:00 2001 From: Hyunseok Seo Date: Thu, 15 Feb 2024 01:16:16 +0900 Subject: [PATCH 04/46] GH-39463: [C++] Support cast kernel from large string, (large) binary to dictionary (#40017) ### Rationale for this change Support `cast` kernel from large string(`large_utf8()`, (large) binary(`binary()`, `large_binary()`) to `dictionary` ### What changes are included in this PR? - Support `cast` kernel - from large string(`large_utf8()`) to `dictionary` - from binary(`binary()`) to `dictionary` - from large binary(`large_binary()`) to `dictionary` ### Are these changes tested? Yes. It is passed by existing test cases. ### Are there any user-facing changes? No. 
* Closes: #39463 Authored-by: Hyunseok Seo Signed-off-by: Felipe Oliveira Carvalho --- .../compute/kernels/scalar_cast_dictionary.cc | 14 +++-- cpp/src/arrow/scalar_test.cc | 56 ++++++++++--------- 2 files changed, 38 insertions(+), 32 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_dictionary.cc b/cpp/src/arrow/compute/kernels/scalar_cast_dictionary.cc index f13aa26d969c1..ae88ef1cb7534 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_dictionary.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_dictionary.cc @@ -45,11 +45,12 @@ Status CastToDictionary(KernelContext* ctx, const ExecSpan& batch, ExecResult* o return Status::OK(); } - // If the input type is STRING, it is first encoded as a dictionary to facilitate - // processing. This approach allows the subsequent code to uniformly handle STRING - // inputs as if they were originally provided in dictionary format. Encoding as a - // dictionary helps in reusing the same logic for dictionary operations. - if (batch[0].type()->id() == Type::STRING) { + // If the input type is string or binary-like, it is first encoded as a dictionary to + // facilitate processing. This approach allows the subsequent code to uniformly handle + // string or binary-like inputs as if they were originally provided in dictionary + // format. Encoding as a dictionary helps in reusing the same logic for dictionary + // operations. 
+ if (is_base_binary_like(in_array->type->id())) { in_array = DictionaryEncode(in_array)->array(); } const auto& in_type = checked_cast(*in_array->type); @@ -98,6 +99,9 @@ std::vector> GetDictionaryCasts() { AddCommonCasts(Type::DICTIONARY, kOutputTargetType, cast_dict.get()); AddDictionaryCast(cast_dict.get()); AddDictionaryCast(cast_dict.get()); + AddDictionaryCast(cast_dict.get()); + AddDictionaryCast(cast_dict.get()); + AddDictionaryCast(cast_dict.get()); return {cast_dict}; } diff --git a/cpp/src/arrow/scalar_test.cc b/cpp/src/arrow/scalar_test.cc index d9fb3feaeea6e..09dfde3227109 100644 --- a/cpp/src/arrow/scalar_test.cc +++ b/cpp/src/arrow/scalar_test.cc @@ -1482,33 +1482,35 @@ TEST(TestDictionaryScalar, ValidateErrors) { TEST(TestDictionaryScalar, Cast) { for (auto index_ty : all_dictionary_index_types()) { - auto ty = dictionary(index_ty, utf8()); - auto dict = checked_pointer_cast( - ArrayFromJSON(utf8(), R"(["alpha", null, "gamma"])")); - - for (int64_t i = 0; i < dict->length(); ++i) { - auto alpha = - dict->IsValid(i) ? 
MakeScalar(dict->GetString(i)) : MakeNullScalar(utf8()); - // Cast string to dict(..., string) - ASSERT_OK_AND_ASSIGN(auto cast_alpha_datum, Cast(alpha, ty)); - const auto& cast_alpha = cast_alpha_datum.scalar(); - ASSERT_OK(cast_alpha->ValidateFull()); - ASSERT_OK_AND_ASSIGN( - auto roundtripped_alpha, - checked_cast(*cast_alpha).GetEncodedValue()); - - ASSERT_OK_AND_ASSIGN(auto i_scalar, MakeScalar(index_ty, i)); - auto alpha_dict = DictionaryScalar({i_scalar, dict}, ty); - ASSERT_OK(alpha_dict.ValidateFull()); - ASSERT_OK_AND_ASSIGN( - auto encoded_alpha, - checked_cast(alpha_dict).GetEncodedValue()); - - AssertScalarsEqual(*alpha, *roundtripped_alpha); - AssertScalarsEqual(*encoded_alpha, *roundtripped_alpha); - - // dictionaries differ, though encoded values are identical - ASSERT_FALSE(alpha_dict.Equals(*cast_alpha)); + for (auto value_ty : {utf8(), large_utf8(), binary(), large_binary()}) { + auto ty = dictionary(index_ty, value_ty); + auto dict = ArrayFromJSON(value_ty, R"(["alpha", null, "gamma"])"); + ASSERT_OK(dict->ValidateFull()); + + for (int64_t i = 0; i < dict->length(); ++i) { + ASSERT_OK_AND_ASSIGN(auto alpha, dict->GetScalar(i)); + + // Cast string to dict(..., string) + ASSERT_OK_AND_ASSIGN(auto cast_alpha_datum, Cast(alpha, ty)); + const auto& cast_alpha = cast_alpha_datum.scalar(); + ASSERT_OK(cast_alpha->ValidateFull()); + ASSERT_OK_AND_ASSIGN( + auto roundtripped_alpha, + checked_cast(*cast_alpha).GetEncodedValue()); + + ASSERT_OK_AND_ASSIGN(auto i_scalar, MakeScalar(index_ty, i)); + auto alpha_dict = DictionaryScalar({i_scalar, dict}, ty); + ASSERT_OK(alpha_dict.ValidateFull()); + ASSERT_OK_AND_ASSIGN( + auto encoded_alpha, + checked_cast(alpha_dict).GetEncodedValue()); + + AssertScalarsEqual(*alpha, *roundtripped_alpha); + AssertScalarsEqual(*encoded_alpha, *roundtripped_alpha); + + // dictionaries differ, though encoded values are identical + ASSERT_FALSE(alpha_dict.Equals(*cast_alpha)); + } } } } From 
621f707f62bee8bde128eed0ef1e239abe5eb8c0 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 15 Feb 2024 09:13:29 -0300 Subject: [PATCH 05/46] GH-40085: [C++][FS][Azure] Validate containers in AzureFileSystem::Impl::MovePaths() (#40086) ### Rationale for this change Cross container moves aren't supported yet (and might never be). ### What changes are included in this PR? - Check that containers are the same before calling a `Rename` that assumes `src` and `dest` are on the same container ### Are these changes tested? Yes, new tests were added. * Closes: #40085 Authored-by: Felipe Oliveira Carvalho Signed-off-by: Felipe Oliveira Carvalho --- cpp/src/arrow/filesystem/azurefs.cc | 5 ++++ cpp/src/arrow/filesystem/azurefs_test.cc | 32 +++++++++++++----------- 2 files changed, 23 insertions(+), 14 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 11750591932e9..23af67a33d688 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -2297,6 +2297,11 @@ class AzureFileSystem::Impl { } } + // Now that src and dest are validated, make sure they are on the same filesystem. + if (src.container != dest.container) { + return CrossContainerMoveNotImplemented(src, dest); + } + try { // NOTE: The Azure SDK provides a RenameDirectory() function, but the // implementation is the same as RenameFile() with the only difference being diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 42f38f1ed6ac7..e6bd80d1d2508 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -1234,30 +1234,32 @@ class TestAzureFileSystem : public ::testing::Test { void TestMovePath() { Status st; auto data = SetUpPreexistingData(); + auto another_container = PreexistingData::RandomContainerName(rng_); + CreateContainer(another_container); // When source doesn't exist. 
ASSERT_MOVE("missing-container/src-path", data.ContainerPath("dest-path"), ENOENT); auto missing_path1 = data.RandomDirectoryPath(rng_); ASSERT_MOVE(missing_path1, "missing-container/path", ENOENT); // But when source exists... - if (!WithHierarchicalNamespace()) { - // ...and containers are different, we get an error message telling cross-container - // moves are not implemented. - EXPECT_RAISES_WITH_MESSAGE_THAT( - NotImplemented, - HasCrossContainerNotImplementedMessage(data.ObjectPath(), - "missing-container/path"), - fs()->Move(data.ObjectPath(), "missing-container/path")); - GTEST_SKIP() << "The rest of TestMovePath is not implemented for non-HNS scenarios"; - } - auto adlfs_client = - datalake_service_client_->GetFileSystemClient(data.container_name); - // ...and dest.container doesn't exist. + // ...and containers are different, we get an error message telling cross-container + // moves are not implemented. EXPECT_RAISES_WITH_MESSAGE_THAT( - IOError, HasMissingParentDirMessage("missing-container/path"), + NotImplemented, + HasCrossContainerNotImplementedMessage(data.ObjectPath(), + "missing-container/path"), fs()->Move(data.ObjectPath(), "missing-container/path")); + EXPECT_RAISES_WITH_MESSAGE_THAT( + NotImplemented, + HasCrossContainerNotImplementedMessage( + data.ObjectPath(), ConcatAbstractPath(another_container, "path")), + fs()->Move(data.ObjectPath(), ConcatAbstractPath(another_container, "path"))); AssertFileInfo(fs(), data.ObjectPath(), FileType::File); + if (!WithHierarchicalNamespace()) { + GTEST_SKIP() << "The rest of TestMovePath is not implemented for non-HNS scenarios"; + } + EXPECT_RAISES_WITH_MESSAGE_THAT( IOError, HasMissingParentDirMessage(data.Path("missing-subdir/file")), fs()->Move(data.ObjectPath(), data.Path("missing-subdir/file"))); @@ -1271,6 +1273,8 @@ class TestAzureFileSystem : public ::testing::Test { // "file0" exists // src is a file and dest exists (as a file) + auto adlfs_client = + 
datalake_service_client_->GetFileSystemClient(data.container_name); CreateFile(adlfs_client, PreexistingData::kObjectName, PreexistingData::kLoremIpsum); CreateFile(adlfs_client, "file1", PreexistingData::kLoremIpsum); ASSERT_MOVE_OK(data.ObjectPath(), data.Path("file0")); From a03d957b5b8d0425f9d5b6c98b6ee1efa56a1248 Mon Sep 17 00:00:00 2001 From: david dali susanibar arce Date: Thu, 15 Feb 2024 08:11:44 -0500 Subject: [PATCH 06/46] GH-40055: [Java][Docs] Simplify use of Filter and Expression into Dataset Substrait (#40056) ### Rationale for this change Simplify creation of SQL Expression Filter and Projections into Arrow Java Dataset module using new [Substrait Feature for SQL Expressions](https://github.com/substrait-io/substrait-java/releases/tag/v0.26.0). ### What changes are included in this PR? Update Apache Arrow Java Dataset Substrait documentation ### Are these changes tested? Yes ### Are there any user-facing changes? No * Closes: #40055 Authored-by: david dali susanibar arce Signed-off-by: David Li --- docs/source/java/substrait.rst | 333 +++++---------------------------- 1 file changed, 42 insertions(+), 291 deletions(-) diff --git a/docs/source/java/substrait.rst b/docs/source/java/substrait.rst index d8d49a96e88f8..c5857dcc23f75 100644 --- a/docs/source/java/substrait.rst +++ b/docs/source/java/substrait.rst @@ -113,31 +113,19 @@ This requires the substrait-java library. This Java program: - Loads a Parquet file containing the "nation" table from the TPC-H benchmark. +- Applies a filter: + - ``N_NATIONKEY > 18`` - Projects two new columns: - - ``N_NAME || ' - ' || N_COMMENT`` - ``N_REGIONKEY + 10`` -- Applies a filter: ``N_NATIONKEY > 18`` + - ``N_NAME || ' - ' || N_COMMENT`` + + .. 
code-block:: Java - import io.substrait.extension.ExtensionCollector; - import io.substrait.proto.Expression; - import io.substrait.proto.ExpressionReference; + import com.google.common.collect.ImmutableList; + import io.substrait.isthmus.SqlExpressionToSubstrait; import io.substrait.proto.ExtendedExpression; - import io.substrait.proto.FunctionArgument; - import io.substrait.proto.SimpleExtensionDeclaration; - import io.substrait.proto.SimpleExtensionURI; - import io.substrait.type.NamedStruct; - import io.substrait.type.Type; - import io.substrait.type.TypeCreator; - import io.substrait.type.proto.TypeProtoConverter; - import java.nio.ByteBuffer; - import java.util.ArrayList; - import java.util.Arrays; - import java.util.Base64; - import java.util.HashMap; - import java.util.List; - import java.util.Optional; import org.apache.arrow.dataset.file.FileFormat; import org.apache.arrow.dataset.file.FileSystemDatasetFactory; import org.apache.arrow.dataset.jni.NativeMemoryPool; @@ -148,297 +136,60 @@ This Java program: import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; import org.apache.arrow.vector.ipc.ArrowReader; + import org.apache.calcite.sql.parser.SqlParseException; + + import java.nio.ByteBuffer; + import java.util.Base64; + import java.util.Optional; public class ClientSubstraitExtendedExpressionsCookbook { - public static void main(String[] args) throws Exception { - // project and filter dataset using extended expression definition - 03 Expressions: - // Expression 01 - CONCAT: N_NAME || ' - ' || N_COMMENT = col 1 || ' - ' || col 3 - // Expression 02 - ADD: N_REGIONKEY + 10 = col 1 + 10 - // Expression 03 - FILTER: N_NATIONKEY > 18 = col 3 > 18 + public static void main(String[] args) throws SqlParseException { projectAndFilterDataset(); } - public static void projectAndFilterDataset() { + private static void projectAndFilterDataset() throws SqlParseException { String uri = 
"file:///Users/data/tpch_parquet/nation.parquet"; - ScanOptions options = new ScanOptions.Builder(/*batchSize*/ 32768) - .columns(Optional.empty()) - .substraitFilter(getSubstraitExpressionFilter()) - .substraitProjection(getSubstraitExpressionProjection()) - .build(); - try ( - BufferAllocator allocator = new RootAllocator(); - DatasetFactory datasetFactory = new FileSystemDatasetFactory( - allocator, NativeMemoryPool.getDefault(), - FileFormat.PARQUET, uri); - Dataset dataset = datasetFactory.finish(); - Scanner scanner = dataset.newScan(options); - ArrowReader reader = scanner.scanBatches() - ) { + ScanOptions options = + new ScanOptions.Builder(/*batchSize*/ 32768) + .columns(Optional.empty()) + .substraitFilter(getByteBuffer(new String[]{"N_NATIONKEY > 18"})) + .substraitProjection(getByteBuffer(new String[]{"N_REGIONKEY + 10", + "N_NAME || CAST(' - ' as VARCHAR) || N_COMMENT"})) + .build(); + try (BufferAllocator allocator = new RootAllocator(); + DatasetFactory datasetFactory = + new FileSystemDatasetFactory( + allocator, NativeMemoryPool.getDefault(), FileFormat.PARQUET, uri); + Dataset dataset = datasetFactory.finish(); + Scanner scanner = dataset.newScan(options); + ArrowReader reader = scanner.scanBatches()) { while (reader.loadNextBatch()) { - System.out.println( - reader.getVectorSchemaRoot().contentToTSVString()); + System.out.println(reader.getVectorSchemaRoot().contentToTSVString()); } } catch (Exception e) { throw new RuntimeException(e); } } - private static ByteBuffer getSubstraitExpressionProjection() { - // Expression: N_REGIONKEY + 10 = col 3 + 10 - Expression.Builder selectionBuilderProjectOne = Expression.newBuilder(). - setSelection( - Expression.FieldReference.newBuilder(). - setDirectReference( - Expression.ReferenceSegment.newBuilder(). 
- setStructField( - Expression.ReferenceSegment.StructField.newBuilder().setField( - 2) - ) - ) - ); - Expression.Builder literalBuilderProjectOne = Expression.newBuilder() - .setLiteral( - Expression.Literal.newBuilder().setI32(10) - ); - io.substrait.proto.Type outputProjectOne = TypeCreator.NULLABLE.I32.accept( - new TypeProtoConverter(new ExtensionCollector())); - Expression.Builder expressionBuilderProjectOne = Expression. - newBuilder(). - setScalarFunction( - Expression. - ScalarFunction. - newBuilder(). - setFunctionReference(0). - setOutputType(outputProjectOne). - addArguments( - 0, - FunctionArgument.newBuilder().setValue( - selectionBuilderProjectOne) - ). - addArguments( - 1, - FunctionArgument.newBuilder().setValue( - literalBuilderProjectOne) - ) - ); - ExpressionReference.Builder expressionReferenceBuilderProjectOne = ExpressionReference.newBuilder(). - setExpression(expressionBuilderProjectOne) - .addOutputNames("ADD_TEN_TO_COLUMN_N_REGIONKEY"); - - // Expression: name || name = N_NAME || "-" || N_COMMENT = col 1 || col 3 - Expression.Builder selectionBuilderProjectTwo = Expression.newBuilder(). - setSelection( - Expression.FieldReference.newBuilder(). - setDirectReference( - Expression.ReferenceSegment.newBuilder(). - setStructField( - Expression.ReferenceSegment.StructField.newBuilder().setField( - 1) - ) - ) - ); - Expression.Builder selectionBuilderProjectTwoConcatLiteral = Expression.newBuilder() - .setLiteral( - Expression.Literal.newBuilder().setString(" - ") - ); - Expression.Builder selectionBuilderProjectOneToConcat = Expression.newBuilder(). - setSelection( - Expression.FieldReference.newBuilder(). - setDirectReference( - Expression.ReferenceSegment.newBuilder(). 
- setStructField( - Expression.ReferenceSegment.StructField.newBuilder().setField( - 3) - ) - ) - ); - io.substrait.proto.Type outputProjectTwo = TypeCreator.NULLABLE.STRING.accept( - new TypeProtoConverter(new ExtensionCollector())); - Expression.Builder expressionBuilderProjectTwo = Expression. - newBuilder(). - setScalarFunction( - Expression. - ScalarFunction. - newBuilder(). - setFunctionReference(1). - setOutputType(outputProjectTwo). - addArguments( - 0, - FunctionArgument.newBuilder().setValue( - selectionBuilderProjectTwo) - ). - addArguments( - 1, - FunctionArgument.newBuilder().setValue( - selectionBuilderProjectTwoConcatLiteral) - ). - addArguments( - 2, - FunctionArgument.newBuilder().setValue( - selectionBuilderProjectOneToConcat) - ) - ); - ExpressionReference.Builder expressionReferenceBuilderProjectTwo = ExpressionReference.newBuilder(). - setExpression(expressionBuilderProjectTwo) - .addOutputNames("CONCAT_COLUMNS_N_NAME_AND_N_COMMENT"); - - List columnNames = Arrays.asList("N_NATIONKEY", "N_NAME", - "N_REGIONKEY", "N_COMMENT"); - List dataTypes = Arrays.asList( - TypeCreator.NULLABLE.I32, - TypeCreator.NULLABLE.STRING, - TypeCreator.NULLABLE.I32, - TypeCreator.NULLABLE.STRING - ); - NamedStruct of = NamedStruct.of( - columnNames, - Type.Struct.builder().fields(dataTypes).nullable(false).build() - ); - // Extensions URI - HashMap extensionUris = new HashMap<>(); - extensionUris.put( - "key-001", - SimpleExtensionURI.newBuilder() - .setExtensionUriAnchor(1) - .setUri("/functions_arithmetic.yaml") - .build() - ); - // Extensions - ArrayList extensions = new ArrayList<>(); - SimpleExtensionDeclaration extensionFunctionAdd = SimpleExtensionDeclaration.newBuilder() - .setExtensionFunction( - SimpleExtensionDeclaration.ExtensionFunction.newBuilder() - .setFunctionAnchor(0) - .setName("add:i32_i32") - .setExtensionUriReference(1)) - .build(); - SimpleExtensionDeclaration extensionFunctionGreaterThan = SimpleExtensionDeclaration.newBuilder() - 
.setExtensionFunction( - SimpleExtensionDeclaration.ExtensionFunction.newBuilder() - .setFunctionAnchor(1) - .setName("concat:vchar") - .setExtensionUriReference(2)) - .build(); - extensions.add(extensionFunctionAdd); - extensions.add(extensionFunctionGreaterThan); - // Extended Expression - ExtendedExpression.Builder extendedExpressionBuilder = - ExtendedExpression.newBuilder(). - addReferredExpr(0, - expressionReferenceBuilderProjectOne). - addReferredExpr(1, - expressionReferenceBuilderProjectTwo). - setBaseSchema(of.toProto(new TypeProtoConverter( - new ExtensionCollector()))); - extendedExpressionBuilder.addAllExtensionUris(extensionUris.values()); - extendedExpressionBuilder.addAllExtensions(extensions); - ExtendedExpression extendedExpression = extendedExpressionBuilder.build(); - byte[] extendedExpressions = Base64.getDecoder().decode( - Base64.getEncoder().encodeToString( - extendedExpression.toByteArray())); - ByteBuffer substraitExpressionProjection = ByteBuffer.allocateDirect( - extendedExpressions.length); - substraitExpressionProjection.put(extendedExpressions); - return substraitExpressionProjection; - } - - private static ByteBuffer getSubstraitExpressionFilter() { - // Expression: Filter: N_NATIONKEY > 18 = col 1 > 18 - Expression.Builder selectionBuilderFilterOne = Expression.newBuilder(). - setSelection( - Expression.FieldReference.newBuilder(). - setDirectReference( - Expression.ReferenceSegment.newBuilder(). - setStructField( - Expression.ReferenceSegment.StructField.newBuilder().setField( - 0) - ) - ) - ); - Expression.Builder literalBuilderFilterOne = Expression.newBuilder() - .setLiteral( - Expression.Literal.newBuilder().setI32(18) - ); - io.substrait.proto.Type outputFilterOne = TypeCreator.NULLABLE.BOOLEAN.accept( - new TypeProtoConverter(new ExtensionCollector())); - Expression.Builder expressionBuilderFilterOne = Expression. - newBuilder(). - setScalarFunction( - Expression. - ScalarFunction. - newBuilder(). - setFunctionReference(1). 
- setOutputType(outputFilterOne). - addArguments( - 0, - FunctionArgument.newBuilder().setValue( - selectionBuilderFilterOne) - ). - addArguments( - 1, - FunctionArgument.newBuilder().setValue( - literalBuilderFilterOne) - ) - ); - ExpressionReference.Builder expressionReferenceBuilderFilterOne = ExpressionReference.newBuilder(). - setExpression(expressionBuilderFilterOne) - .addOutputNames("COLUMN_N_NATIONKEY_GREATER_THAN_18"); - - List columnNames = Arrays.asList("N_NATIONKEY", "N_NAME", - "N_REGIONKEY", "N_COMMENT"); - List dataTypes = Arrays.asList( - TypeCreator.NULLABLE.I32, - TypeCreator.NULLABLE.STRING, - TypeCreator.NULLABLE.I32, - TypeCreator.NULLABLE.STRING - ); - NamedStruct of = NamedStruct.of( - columnNames, - Type.Struct.builder().fields(dataTypes).nullable(false).build() - ); - // Extensions URI - HashMap extensionUris = new HashMap<>(); - extensionUris.put( - "key-001", - SimpleExtensionURI.newBuilder() - .setExtensionUriAnchor(1) - .setUri("/functions_comparison.yaml") - .build() - ); - // Extensions - ArrayList extensions = new ArrayList<>(); - SimpleExtensionDeclaration extensionFunctionLowerThan = SimpleExtensionDeclaration.newBuilder() - .setExtensionFunction( - SimpleExtensionDeclaration.ExtensionFunction.newBuilder() - .setFunctionAnchor(1) - .setName("gt:any_any") - .setExtensionUriReference(1)) - .build(); - extensions.add(extensionFunctionLowerThan); - // Extended Expression - ExtendedExpression.Builder extendedExpressionBuilder = - ExtendedExpression.newBuilder(). - addReferredExpr(0, - expressionReferenceBuilderFilterOne). 
- setBaseSchema(of.toProto(new TypeProtoConverter( - new ExtensionCollector()))); - extendedExpressionBuilder.addAllExtensionUris(extensionUris.values()); - extendedExpressionBuilder.addAllExtensions(extensions); - ExtendedExpression extendedExpression = extendedExpressionBuilder.build(); - byte[] extendedExpressions = Base64.getDecoder().decode( - Base64.getEncoder().encodeToString( - extendedExpression.toByteArray())); - ByteBuffer substraitExpressionFilter = ByteBuffer.allocateDirect( - extendedExpressions.length); - substraitExpressionFilter.put(extendedExpressions); - return substraitExpressionFilter; + private static ByteBuffer getByteBuffer(String[] sqlExpression) throws SqlParseException { + String schema = + "CREATE TABLE NATION (N_NATIONKEY INT NOT NULL, N_NAME VARCHAR, " + + "N_REGIONKEY INT NOT NULL, N_COMMENT VARCHAR)"; + SqlExpressionToSubstrait expressionToSubstrait = new SqlExpressionToSubstrait(); + ExtendedExpression expression = + expressionToSubstrait.convert(sqlExpression, ImmutableList.of(schema)); + byte[] expressionToByte = + Base64.getDecoder().decode(Base64.getEncoder().encodeToString(expression.toByteArray())); + ByteBuffer byteBuffer = ByteBuffer.allocateDirect(expressionToByte.length); + byteBuffer.put(expressionToByte); + return byteBuffer; } } .. code-block:: text - ADD_TEN_TO_COLUMN_N_REGIONKEY CONCAT_COLUMNS_N_NAME_AND_N_COMMENT + column-1 column-2 13 ROMANIA - ular asymptotes are about the furious multipliers. express dependencies nag above the ironically ironic account 14 SAUDI ARABIA - ts. silent requests haggle. closely express packages sleep across the blithely 12 VIETNAM - hely enticingly express accounts. 
even, final From ca67ec22440305d9cc9bfa22d3133cb064e5e257 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Mon, 19 Feb 2024 11:37:47 +0100 Subject: [PATCH 07/46] GH-40095: [C++][Parquet] Remove AVX512 variants of BYTE_STREAM_SPLIT encoding (#40127) Two reasons: * the SSE2 and AVX2 variants are already fast enough (on the order of 10 GB/s) * the AVX512 variants do not seem faster, and can even be slower, on tested Intel machines * Closes: #40095 Authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- .../arrow/util/byte_stream_split_internal.h | 222 +----------------- cpp/src/arrow/util/byte_stream_split_test.cc | 4 - cpp/src/parquet/encoding_benchmark.cc | 27 --- 3 files changed, 2 insertions(+), 251 deletions(-) diff --git a/cpp/src/arrow/util/byte_stream_split_internal.h b/cpp/src/arrow/util/byte_stream_split_internal.h index f70b3991473fa..cd43d8ec00b5d 100644 --- a/cpp/src/arrow/util/byte_stream_split_internal.h +++ b/cpp/src/arrow/util/byte_stream_split_internal.h @@ -332,226 +332,11 @@ void ByteStreamSplitEncodeAvx2(const uint8_t* raw_values, const int64_t num_valu } #endif // ARROW_HAVE_AVX2 -#if defined(ARROW_HAVE_AVX512) -template -void ByteStreamSplitDecodeAvx512(const uint8_t* data, int64_t num_values, int64_t stride, - uint8_t* out) { - static_assert(kNumStreams == 4 || kNumStreams == 8, "Invalid number of streams."); - constexpr int kNumStreamsLog2 = (kNumStreams == 8 ? 3 : 2); - constexpr int64_t kBlockSize = sizeof(__m512i) * kNumStreams; - - const int64_t size = num_values * kNumStreams; - if (size < kBlockSize) // Back to AVX2 for small size - return ByteStreamSplitDecodeAvx2(data, num_values, stride, out); - const int64_t num_blocks = size / kBlockSize; - - // First handle suffix. 
- const int64_t num_processed_elements = (num_blocks * kBlockSize) / kNumStreams; - for (int64_t i = num_processed_elements; i < num_values; ++i) { - uint8_t gathered_byte_data[kNumStreams]; - for (int b = 0; b < kNumStreams; ++b) { - const int64_t byte_index = b * stride + i; - gathered_byte_data[b] = data[byte_index]; - } - memcpy(out + i * kNumStreams, gathered_byte_data, kNumStreams); - } - - // Processed hierarchically using the unpack, then two shuffles. - __m512i stage[kNumStreamsLog2 + 1][kNumStreams]; - __m512i shuffle[kNumStreams]; - __m512i final_result[kNumStreams]; - constexpr int kNumStreamsHalf = kNumStreams / 2U; - - for (int64_t i = 0; i < num_blocks; ++i) { - for (int j = 0; j < kNumStreams; ++j) { - stage[0][j] = _mm512_loadu_si512( - reinterpret_cast(&data[i * sizeof(__m512i) + j * stride])); - } - - for (int step = 0; step < kNumStreamsLog2; ++step) { - for (int j = 0; j < kNumStreamsHalf; ++j) { - stage[step + 1][j * 2] = - _mm512_unpacklo_epi8(stage[step][j], stage[step][kNumStreamsHalf + j]); - stage[step + 1][j * 2 + 1] = - _mm512_unpackhi_epi8(stage[step][j], stage[step][kNumStreamsHalf + j]); - } - } - - if constexpr (kNumStreams == 8) { - // path for double, 128i index: - // {0x00, 0x04, 0x08, 0x0C}, {0x10, 0x14, 0x18, 0x1C}, - // {0x01, 0x05, 0x09, 0x0D}, {0x11, 0x15, 0x19, 0x1D}, - // {0x02, 0x06, 0x0A, 0x0E}, {0x12, 0x16, 0x1A, 0x1E}, - // {0x03, 0x07, 0x0B, 0x0F}, {0x13, 0x17, 0x1B, 0x1F}, - shuffle[0] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][0], - stage[kNumStreamsLog2][1], 0b01000100); - shuffle[1] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][2], - stage[kNumStreamsLog2][3], 0b01000100); - shuffle[2] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][4], - stage[kNumStreamsLog2][5], 0b01000100); - shuffle[3] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][6], - stage[kNumStreamsLog2][7], 0b01000100); - shuffle[4] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][0], - stage[kNumStreamsLog2][1], 0b11101110); - shuffle[5] = 
_mm512_shuffle_i32x4(stage[kNumStreamsLog2][2], - stage[kNumStreamsLog2][3], 0b11101110); - shuffle[6] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][4], - stage[kNumStreamsLog2][5], 0b11101110); - shuffle[7] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][6], - stage[kNumStreamsLog2][7], 0b11101110); - - final_result[0] = _mm512_shuffle_i32x4(shuffle[0], shuffle[1], 0b10001000); - final_result[1] = _mm512_shuffle_i32x4(shuffle[2], shuffle[3], 0b10001000); - final_result[2] = _mm512_shuffle_i32x4(shuffle[0], shuffle[1], 0b11011101); - final_result[3] = _mm512_shuffle_i32x4(shuffle[2], shuffle[3], 0b11011101); - final_result[4] = _mm512_shuffle_i32x4(shuffle[4], shuffle[5], 0b10001000); - final_result[5] = _mm512_shuffle_i32x4(shuffle[6], shuffle[7], 0b10001000); - final_result[6] = _mm512_shuffle_i32x4(shuffle[4], shuffle[5], 0b11011101); - final_result[7] = _mm512_shuffle_i32x4(shuffle[6], shuffle[7], 0b11011101); - } else { - // path for float, 128i index: - // {0x00, 0x04, 0x08, 0x0C}, {0x01, 0x05, 0x09, 0x0D} - // {0x02, 0x06, 0x0A, 0x0E}, {0x03, 0x07, 0x0B, 0x0F}, - shuffle[0] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][0], - stage[kNumStreamsLog2][1], 0b01000100); - shuffle[1] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][2], - stage[kNumStreamsLog2][3], 0b01000100); - shuffle[2] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][0], - stage[kNumStreamsLog2][1], 0b11101110); - shuffle[3] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][2], - stage[kNumStreamsLog2][3], 0b11101110); - - final_result[0] = _mm512_shuffle_i32x4(shuffle[0], shuffle[1], 0b10001000); - final_result[1] = _mm512_shuffle_i32x4(shuffle[0], shuffle[1], 0b11011101); - final_result[2] = _mm512_shuffle_i32x4(shuffle[2], shuffle[3], 0b10001000); - final_result[3] = _mm512_shuffle_i32x4(shuffle[2], shuffle[3], 0b11011101); - } - - for (int j = 0; j < kNumStreams; ++j) { - _mm512_storeu_si512( - reinterpret_cast<__m512i*>(out + (i * kNumStreams + j) * sizeof(__m512i)), - final_result[j]); - } - } -} - 
-template -void ByteStreamSplitEncodeAvx512(const uint8_t* raw_values, const int64_t num_values, - uint8_t* output_buffer_raw) { - static_assert(kNumStreams == 4 || kNumStreams == 8, "Invalid number of streams."); - constexpr int kBlockSize = sizeof(__m512i) * kNumStreams; - - const int64_t size = num_values * kNumStreams; - - if (size < kBlockSize) // Back to AVX2 for small size - return ByteStreamSplitEncodeAvx2(raw_values, num_values, - output_buffer_raw); - - const int64_t num_blocks = size / kBlockSize; - const __m512i* raw_values_simd = reinterpret_cast(raw_values); - __m512i* output_buffer_streams[kNumStreams]; - for (int i = 0; i < kNumStreams; ++i) { - output_buffer_streams[i] = - reinterpret_cast<__m512i*>(&output_buffer_raw[num_values * i]); - } - - // First handle suffix. - const int64_t num_processed_elements = (num_blocks * kBlockSize) / kNumStreams; - for (int64_t i = num_processed_elements; i < num_values; ++i) { - for (int j = 0; j < kNumStreams; ++j) { - const uint8_t byte_in_value = raw_values[i * kNumStreams + j]; - output_buffer_raw[j * num_values + i] = byte_in_value; - } - } - - constexpr int KNumUnpack = (kNumStreams == 8) ? 2 : 3; - __m512i final_result[kNumStreams]; - __m512i unpack[KNumUnpack + 1][kNumStreams]; - __m512i permutex[kNumStreams]; - __m512i permutex_mask; - if constexpr (kNumStreams == 8) { - // use _mm512_set_epi32, no _mm512_set_epi16 for some old gcc version. 
- permutex_mask = _mm512_set_epi32(0x001F0017, 0x000F0007, 0x001E0016, 0x000E0006, - 0x001D0015, 0x000D0005, 0x001C0014, 0x000C0004, - 0x001B0013, 0x000B0003, 0x001A0012, 0x000A0002, - 0x00190011, 0x00090001, 0x00180010, 0x00080000); - } else { - permutex_mask = _mm512_set_epi32(0x0F, 0x0B, 0x07, 0x03, 0x0E, 0x0A, 0x06, 0x02, 0x0D, - 0x09, 0x05, 0x01, 0x0C, 0x08, 0x04, 0x00); - } - - for (int64_t block_index = 0; block_index < num_blocks; ++block_index) { - for (int i = 0; i < kNumStreams; ++i) { - unpack[0][i] = _mm512_loadu_si512(&raw_values_simd[block_index * kNumStreams + i]); - } - - for (int unpack_lvl = 0; unpack_lvl < KNumUnpack; ++unpack_lvl) { - for (int i = 0; i < kNumStreams / 2; ++i) { - unpack[unpack_lvl + 1][i * 2] = _mm512_unpacklo_epi8( - unpack[unpack_lvl][i * 2], unpack[unpack_lvl][i * 2 + 1]); - unpack[unpack_lvl + 1][i * 2 + 1] = _mm512_unpackhi_epi8( - unpack[unpack_lvl][i * 2], unpack[unpack_lvl][i * 2 + 1]); - } - } - - if constexpr (kNumStreams == 8) { - // path for double - // 1. unpack to epi16 block - // 2. permutexvar_epi16 to 128i block - // 3. 
shuffle 128i to final 512i target, index: - // {0x00, 0x04, 0x08, 0x0C}, {0x10, 0x14, 0x18, 0x1C}, - // {0x01, 0x05, 0x09, 0x0D}, {0x11, 0x15, 0x19, 0x1D}, - // {0x02, 0x06, 0x0A, 0x0E}, {0x12, 0x16, 0x1A, 0x1E}, - // {0x03, 0x07, 0x0B, 0x0F}, {0x13, 0x17, 0x1B, 0x1F}, - for (int i = 0; i < kNumStreams; ++i) - permutex[i] = _mm512_permutexvar_epi16(permutex_mask, unpack[KNumUnpack][i]); - - __m512i shuffle[kNumStreams]; - shuffle[0] = _mm512_shuffle_i32x4(permutex[0], permutex[2], 0b01000100); - shuffle[1] = _mm512_shuffle_i32x4(permutex[4], permutex[6], 0b01000100); - shuffle[2] = _mm512_shuffle_i32x4(permutex[0], permutex[2], 0b11101110); - shuffle[3] = _mm512_shuffle_i32x4(permutex[4], permutex[6], 0b11101110); - shuffle[4] = _mm512_shuffle_i32x4(permutex[1], permutex[3], 0b01000100); - shuffle[5] = _mm512_shuffle_i32x4(permutex[5], permutex[7], 0b01000100); - shuffle[6] = _mm512_shuffle_i32x4(permutex[1], permutex[3], 0b11101110); - shuffle[7] = _mm512_shuffle_i32x4(permutex[5], permutex[7], 0b11101110); - - final_result[0] = _mm512_shuffle_i32x4(shuffle[0], shuffle[1], 0b10001000); - final_result[1] = _mm512_shuffle_i32x4(shuffle[0], shuffle[1], 0b11011101); - final_result[2] = _mm512_shuffle_i32x4(shuffle[2], shuffle[3], 0b10001000); - final_result[3] = _mm512_shuffle_i32x4(shuffle[2], shuffle[3], 0b11011101); - final_result[4] = _mm512_shuffle_i32x4(shuffle[4], shuffle[5], 0b10001000); - final_result[5] = _mm512_shuffle_i32x4(shuffle[4], shuffle[5], 0b11011101); - final_result[6] = _mm512_shuffle_i32x4(shuffle[6], shuffle[7], 0b10001000); - final_result[7] = _mm512_shuffle_i32x4(shuffle[6], shuffle[7], 0b11011101); - } else { - // Path for float. - // 1. Processed hierarchically to 32i block using the unpack intrinsics. - // 2. Pack 128i block using _mm256_permutevar8x32_epi32. - // 3. Pack final 256i block with _mm256_permute2x128_si256. 
- for (int i = 0; i < kNumStreams; ++i) - permutex[i] = _mm512_permutexvar_epi32(permutex_mask, unpack[KNumUnpack][i]); - - final_result[0] = _mm512_shuffle_i32x4(permutex[0], permutex[2], 0b01000100); - final_result[1] = _mm512_shuffle_i32x4(permutex[0], permutex[2], 0b11101110); - final_result[2] = _mm512_shuffle_i32x4(permutex[1], permutex[3], 0b01000100); - final_result[3] = _mm512_shuffle_i32x4(permutex[1], permutex[3], 0b11101110); - } - - for (int i = 0; i < kNumStreams; ++i) { - _mm512_storeu_si512(&output_buffer_streams[i][block_index], final_result[i]); - } - } -} -#endif // ARROW_HAVE_AVX512 - #if defined(ARROW_HAVE_SIMD_SPLIT) template void inline ByteStreamSplitDecodeSimd(const uint8_t* data, int64_t num_values, int64_t stride, uint8_t* out) { -#if defined(ARROW_HAVE_AVX512) - return ByteStreamSplitDecodeAvx512(data, num_values, stride, out); -#elif defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_AVX2) return ByteStreamSplitDecodeAvx2(data, num_values, stride, out); #elif defined(ARROW_HAVE_SSE4_2) return ByteStreamSplitDecodeSse2(data, num_values, stride, out); @@ -563,10 +348,7 @@ void inline ByteStreamSplitDecodeSimd(const uint8_t* data, int64_t num_values, template void inline ByteStreamSplitEncodeSimd(const uint8_t* raw_values, const int64_t num_values, uint8_t* output_buffer_raw) { -#if defined(ARROW_HAVE_AVX512) - return ByteStreamSplitEncodeAvx512(raw_values, num_values, - output_buffer_raw); -#elif defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_AVX2) return ByteStreamSplitEncodeAvx2(raw_values, num_values, output_buffer_raw); #elif defined(ARROW_HAVE_SSE4_2) diff --git a/cpp/src/arrow/util/byte_stream_split_test.cc b/cpp/src/arrow/util/byte_stream_split_test.cc index 71c6063179ea6..421edce4e0aa3 100644 --- a/cpp/src/arrow/util/byte_stream_split_test.cc +++ b/cpp/src/arrow/util/byte_stream_split_test.cc @@ -81,10 +81,6 @@ class TestByteStreamSplitSpecialized : public ::testing::Test { #if defined(ARROW_HAVE_AVX2) 
encode_funcs_.push_back({"avx2", &ByteStreamSplitEncodeAvx2}); decode_funcs_.push_back({"avx2", &ByteStreamSplitDecodeAvx2}); -#endif -#if defined(ARROW_HAVE_AVX512) - encode_funcs_.push_back({"avx512", &ByteStreamSplitEncodeAvx512}); - decode_funcs_.push_back({"avx512", &ByteStreamSplitDecodeAvx512}); #endif } diff --git a/cpp/src/parquet/encoding_benchmark.cc b/cpp/src/parquet/encoding_benchmark.cc index 76c411244b22d..dd258ab815244 100644 --- a/cpp/src/parquet/encoding_benchmark.cc +++ b/cpp/src/parquet/encoding_benchmark.cc @@ -468,33 +468,6 @@ BENCHMARK(BM_ByteStreamSplitEncode_Float_Avx2)->Range(MIN_RANGE, MAX_RANGE); BENCHMARK(BM_ByteStreamSplitEncode_Double_Avx2)->Range(MIN_RANGE, MAX_RANGE); #endif -#if defined(ARROW_HAVE_AVX512) -static void BM_ByteStreamSplitDecode_Float_Avx512(benchmark::State& state) { - BM_ByteStreamSplitDecode( - state, ::arrow::util::internal::ByteStreamSplitDecodeAvx512); -} - -static void BM_ByteStreamSplitDecode_Double_Avx512(benchmark::State& state) { - BM_ByteStreamSplitDecode( - state, ::arrow::util::internal::ByteStreamSplitDecodeAvx512); -} - -static void BM_ByteStreamSplitEncode_Float_Avx512(benchmark::State& state) { - BM_ByteStreamSplitEncode( - state, ::arrow::util::internal::ByteStreamSplitEncodeAvx512); -} - -static void BM_ByteStreamSplitEncode_Double_Avx512(benchmark::State& state) { - BM_ByteStreamSplitEncode( - state, ::arrow::util::internal::ByteStreamSplitEncodeAvx512); -} - -BENCHMARK(BM_ByteStreamSplitDecode_Float_Avx512)->Range(MIN_RANGE, MAX_RANGE); -BENCHMARK(BM_ByteStreamSplitDecode_Double_Avx512)->Range(MIN_RANGE, MAX_RANGE); -BENCHMARK(BM_ByteStreamSplitEncode_Float_Avx512)->Range(MIN_RANGE, MAX_RANGE); -BENCHMARK(BM_ByteStreamSplitEncode_Double_Avx512)->Range(MIN_RANGE, MAX_RANGE); -#endif - template static auto MakeDeltaBitPackingInputFixed(size_t length) { using T = typename DType::c_type; From d1e852f4d804d741422c258b8bdd0cb4ce7925b6 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Mon, 19 Feb 2024 
15:11:16 +0100 Subject: [PATCH 08/46] MINOR: [Archery] Output full Docker progress when --debug is passed (#40129) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Authored-by: Antoine Pitrou Signed-off-by: Raúl Cumplido --- dev/archery/archery/cli.py | 2 +- dev/archery/archery/docker/cli.py | 3 ++- dev/archery/archery/docker/core.py | 7 ++++++- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py index 0ad3eee14d1f3..32921afb2e61b 100644 --- a/dev/archery/archery/cli.py +++ b/dev/archery/archery/cli.py @@ -63,7 +63,7 @@ def archery(ctx, debug, pdb, quiet): if debug: logger.setLevel(logging.DEBUG) - ctx.debug = debug + ctx.obj['debug'] = debug if pdb: import pdb diff --git a/dev/archery/archery/docker/cli.py b/dev/archery/archery/docker/cli.py index 42caecd7427a4..162f73ec0ffe0 100644 --- a/dev/archery/archery/docker/cli.py +++ b/dev/archery/archery/docker/cli.py @@ -64,7 +64,8 @@ def docker(ctx, src, dry_run): # take the docker-compose parameters like PYTHON, PANDAS, UBUNTU from the # environment variables to keep the usage similar to docker-compose - compose = DockerCompose(config_path, params=os.environ) + compose = DockerCompose(config_path, params=os.environ, + debug=ctx.obj.get('debug', False)) if dry_run: _mock_compose_calls(compose) ctx.obj['compose'] = compose diff --git a/dev/archery/archery/docker/core.py b/dev/archery/archery/docker/core.py index b0e9d32552cbb..184d9808759b8 100644 --- a/dev/archery/archery/docker/core.py +++ b/dev/archery/archery/docker/core.py @@ -164,11 +164,12 @@ def __init__(self, docker_bin=None): class DockerCompose(Command): def __init__(self, config_path, dotenv_path=None, compose_bin=None, - params=None): + params=None, debug=False): compose_bin = default_bin(compose_bin, 'docker-compose') self.config = ComposeConfig(config_path, dotenv_path, compose_bin, params) self.bin = compose_bin + self.debug = debug self.pull_memory 
= set() def clear_pull_memory(self): @@ -296,6 +297,8 @@ def _build(service, use_cache): self._execute_docker("buildx", "build", *args) elif using_docker: # better for caching + if self.debug: + args.append("--progress=plain") for k, v in service['build'].get('args', {}).items(): args.extend(['--build-arg', '{}={}'.format(k, v)]) for img in cache_from: @@ -307,6 +310,8 @@ def _build(service, use_cache): ]) self._execute_docker("build", *args) else: + if self.debug: + args.append("--progress=plain") self._execute_compose("build", *args, service['name']) service = self.config.get(service_name) From 4b74b451d581e557765994a68ad87c575b4139b8 Mon Sep 17 00:00:00 2001 From: Jeremy Aguilon Date: Mon, 19 Feb 2024 09:54:57 -0500 Subject: [PATCH 09/46] GH-39803: [C++][Acero] Fix AsOfJoin with differently ordered schemas than the output (#39804) ### Rationale for this change Issue is described visually in https://github.com/apache/arrow/issues/39803. The key hasher works by hashing every row of the input tables' key columns. An important step is inspecting the [column metadata](https://github.com/apache/arrow/blob/main/cpp/src/arrow/acero/asof_join_node.cc#L412) for the asof-join key fields. This returns whether columns are fixed width, among other things. The issue is we are passing the `output_schema`, rather than the input's schema. If an input looks like ``` key_string_type,ts_int32_type,val ``` But our expected output schema looks like: ``` ts_int32,key_string_type,... ``` Then the hasher will think that the `key_string_type`'s type is an int32. This completely throws off hashes. Tests currently get away with it since we just use ints across the board. ### What changes are included in this PR? One line fix and test with string types. ### Are these changes tested? Yes. 
Can see the test run before and after changes here: https://gist.github.com/JerAguilon/953d82ed288d58f9ce24d1a925def2cc Before the change, notice that inputs 0 and 1 have mismatched hashes: ``` AsofjoinNode(0x16cf9e2d8): key hasher 1 got hashes [0, 9784892099856512926, 1050982531982388796, 10763536662319179482, 2029627098739957112, 11814237723602982167, 3080328155728858293, 12792882290360550483, 4058972722486426609, 13771526852823217039] ... AsofjoinNode(0x16cf9dd18): key hasher 0 got hashes [17528465654998409509, 12047706865972860560, 18017664240540048750, 12358837084497432044, 8151160321586084686, 8691136767698756332, 15973065724125580046, 9654919479117127288, 618127929167745505, 3403805303373270709] ``` And after, they do match: ``` AsofjoinNode(0x16f2ea2d8): key hasher 1 got hashes [17528465654998409509, 12047706865972860560, 18017664240540048750, 12358837084497432044, 8151160321586084686, 8691136767698756332, 15973065724125580046, 9654919479117127288, 618127929167745505, 3403805303373270709] ... AsofjoinNode(0x16f2e9d18): key hasher 0 got hashes [17528465654998409509, 12047706865972860560, 18017664240540048750, 12358837084497432044, 8151160321586084686, 8691136767698756332, 15973065724125580046, 9654919479117127288, 618127929167745505, 3403805303373270709] ``` ...which is exactly what you want, since the `key` column for both tables looks like `["0", "1", ..."9"]` ### Are there any user-facing changes? 
* Closes: #39803 Lead-authored-by: Jeremy Aguilon Co-authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- cpp/src/arrow/acero/asof_join_node.cc | 2 +- cpp/src/arrow/acero/asof_join_node_test.cc | 64 ++++++++++++++++++++++ 2 files changed, 65 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/acero/asof_join_node.cc b/cpp/src/arrow/acero/asof_join_node.cc index 2609905a0b552..e96d5ad44a9e6 100644 --- a/cpp/src/arrow/acero/asof_join_node.cc +++ b/cpp/src/arrow/acero/asof_join_node.cc @@ -1098,7 +1098,7 @@ class AsofJoinNode : public ExecNode { auto inputs = this->inputs(); for (size_t i = 0; i < inputs.size(); i++) { RETURN_NOT_OK(key_hashers_[i]->Init(plan()->query_context()->exec_context(), - output_schema())); + inputs[i]->output_schema())); ARROW_ASSIGN_OR_RAISE( auto input_state, InputState::Make(i, tolerance_, must_hash_, may_rehash_, key_hashers_[i].get(), diff --git a/cpp/src/arrow/acero/asof_join_node_test.cc b/cpp/src/arrow/acero/asof_join_node_test.cc index e400cc031693a..d95d2aaad3643 100644 --- a/cpp/src/arrow/acero/asof_join_node_test.cc +++ b/cpp/src/arrow/acero/asof_join_node_test.cc @@ -1582,6 +1582,70 @@ TEST(AsofJoinTest, BatchSequencing) { return TestSequencing(MakeIntegerBatches, /*num_batches=*/32, /*batch_size=*/1); } +template +void TestSchemaResolution(BatchesMaker maker, int num_batches, int batch_size) { + // GH-39803: The key hasher needs to resolve the types of key columns. All other + // tests use int32 for all columns, but this test converts the key columns to + // strings via a projection node to test that the column is correctly resolved + // to string. 
+ auto l_schema = + schema({field("time", int32()), field("key", int32()), field("l_value", int32())}); + auto r_schema = + schema({field("time", int32()), field("key", int32()), field("r0_value", int32())}); + + auto make_shift = [&maker, num_batches, batch_size]( + const std::shared_ptr& schema, int shift) { + return maker({[](int row) -> int64_t { return row; }, + [num_batches](int row) -> int64_t { return row / num_batches; }, + [shift](int row) -> int64_t { return row * 10 + shift; }}, + schema, num_batches, batch_size); + }; + ASSERT_OK_AND_ASSIGN(auto l_batches, make_shift(l_schema, 0)); + ASSERT_OK_AND_ASSIGN(auto r_batches, make_shift(r_schema, 1)); + + Declaration l_src = {"source", + SourceNodeOptions(l_schema, l_batches.gen(false, false))}; + Declaration r_src = {"source", + SourceNodeOptions(r_schema, r_batches.gen(false, false))}; + Declaration l_project = { + "project", + {std::move(l_src)}, + ProjectNodeOptions({compute::field_ref("time"), + compute::call("cast", {compute::field_ref("key")}, + compute::CastOptions::Safe(utf8())), + compute::field_ref("l_value")}, + {"time", "key", "l_value"})}; + Declaration r_project = { + "project", + {std::move(r_src)}, + ProjectNodeOptions({compute::call("cast", {compute::field_ref("key")}, + compute::CastOptions::Safe(utf8())), + compute::field_ref("r0_value"), compute::field_ref("time")}, + {"key", "r0_value", "time"})}; + + Declaration asofjoin = { + "asofjoin", {l_project, r_project}, GetRepeatedOptions(2, "time", {"key"}, 1000)}; + + QueryOptions query_options; + query_options.use_threads = false; + ASSERT_OK_AND_ASSIGN(auto table, DeclarationToTable(asofjoin, query_options)); + + Int32Builder expected_r0_b; + for (int i = 1; i <= 91; i += 10) { + ASSERT_OK(expected_r0_b.Append(i)); + } + ASSERT_OK_AND_ASSIGN(auto expected_r0, expected_r0_b.Finish()); + + auto actual_r0 = table->GetColumnByName("r0_value"); + std::vector> chunks = {expected_r0}; + auto expected_r0_chunked = std::make_shared(chunks); + 
ASSERT_TRUE(actual_r0->Equals(expected_r0_chunked)); +} + +TEST(AsofJoinTest, OutputSchemaResolution) { + return TestSchemaResolution(MakeIntegerBatches, /*num_batches=*/1, /*batch_size=*/10); +} + namespace { Result>> MakeIntegerBatchGenForTest( From ec3723e497d2ae338600cc22b977d0a3f1006886 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Mon, 19 Feb 2024 15:57:36 +0100 Subject: [PATCH 10/46] GH-40112: [CI][Python] Ensure CPython is selected, not PyPy (#40131) Sometimes, mamba might select PyPy over CPython in certain environment upgrade/downgrade scenarios. * Closes: #40112 Lead-authored-by: Antoine Pitrou Co-authored-by: Uwe L. Korn Signed-off-by: Antoine Pitrou --- ci/docker/conda-python.dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/docker/conda-python.dockerfile b/ci/docker/conda-python.dockerfile index 5d37c53386e7d..027fd589cecca 100644 --- a/ci/docker/conda-python.dockerfile +++ b/ci/docker/conda-python.dockerfile @@ -28,7 +28,7 @@ COPY ci/conda_env_python.txt \ RUN mamba install -q -y \ --file arrow/ci/conda_env_python.txt \ $([ "$python" == $(gdb --batch --eval-command 'python import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")') ] && echo "gdb") \ - python=${python} \ + "python=${python}.*=*_cpython" \ nomkl && \ mamba clean --all From b224c583f3c7dfacfecc356e0e91de867f044d22 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Mon, 19 Feb 2024 16:09:57 +0100 Subject: [PATCH 11/46] GH-39663: [C++] Ensure top-level benchmarks present informative metrics (#40091) ### Rationale for this change Some benchmarks may present only an iteration time, or not present sufficiently informative metrics. ### What changes are included in this PR? Add bytes/second and/or items/second metrics to top-level benchmarks where applicable. This PR only tackles miscellaneous benchmarks from the top-level Arrow directory, as well as IO, IPC and utilities. ### Are these changes tested? Manually. 
### Are there any user-facing changes? No. * Closes: #39663 Authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- cpp/src/arrow/builder_benchmark.cc | 12 ++++++++++++ cpp/src/arrow/memory_pool_benchmark.cc | 13 +++++++++++-- cpp/src/arrow/util/int_util_benchmark.cc | 4 ++++ cpp/src/arrow/util/range_benchmark.cc | 6 ++++++ 4 files changed, 33 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/builder_benchmark.cc b/cpp/src/arrow/builder_benchmark.cc index e639a900cc5b5..84f27d20ee038 100644 --- a/cpp/src/arrow/builder_benchmark.cc +++ b/cpp/src/arrow/builder_benchmark.cc @@ -56,6 +56,7 @@ constexpr int64_t kRounds = 256; static VectorType kData = AlmostU8CompressibleVector(); constexpr int64_t kBytesProcessPerRound = kNumberOfElements * sizeof(ValueType); constexpr int64_t kBytesProcessed = kRounds * kBytesProcessPerRound; +constexpr int64_t kItemsProcessed = kRounds * kNumberOfElements; static const char* kBinaryString = "12345678"; static std::string_view kBinaryView(kBinaryString); @@ -73,6 +74,7 @@ static void BuildIntArrayNoNulls(benchmark::State& state) { // NOLINT non-const } state.SetBytesProcessed(state.iterations() * kBytesProcessed); + state.SetItemsProcessed(state.iterations() * kItemsProcessed); } static void BuildAdaptiveIntNoNulls( @@ -89,6 +91,7 @@ static void BuildAdaptiveIntNoNulls( } state.SetBytesProcessed(state.iterations() * kBytesProcessed); + state.SetItemsProcessed(state.iterations() * kItemsProcessed); } static void BuildAdaptiveIntNoNullsScalarAppend( @@ -107,6 +110,7 @@ static void BuildAdaptiveIntNoNullsScalarAppend( } state.SetBytesProcessed(state.iterations() * kBytesProcessed); + state.SetItemsProcessed(state.iterations() * kItemsProcessed); } static void BuildBooleanArrayNoNulls( @@ -127,6 +131,7 @@ static void BuildBooleanArrayNoNulls( } state.SetBytesProcessed(state.iterations() * kBytesProcessed); + state.SetItemsProcessed(state.iterations() * kItemsProcessed); } static void BuildBinaryArray(benchmark::State& state) 
{ // NOLINT non-const reference @@ -142,6 +147,7 @@ static void BuildBinaryArray(benchmark::State& state) { // NOLINT non-const ref } state.SetBytesProcessed(state.iterations() * kBytesProcessed); + state.SetItemsProcessed(state.iterations() * kItemsProcessed); } static void BuildChunkedBinaryArray( @@ -161,6 +167,7 @@ static void BuildChunkedBinaryArray( } state.SetBytesProcessed(state.iterations() * kBytesProcessed); + state.SetItemsProcessed(state.iterations() * kItemsProcessed); } static void BuildFixedSizeBinaryArray( @@ -179,6 +186,7 @@ static void BuildFixedSizeBinaryArray( } state.SetBytesProcessed(state.iterations() * kBytesProcessed); + state.SetItemsProcessed(state.iterations() * kItemsProcessed); } static void BuildDecimalArray(benchmark::State& state) { // NOLINT non-const reference @@ -199,6 +207,7 @@ static void BuildDecimalArray(benchmark::State& state) { // NOLINT non-const re } state.SetBytesProcessed(state.iterations() * kRounds * kNumberOfElements * 16); + state.SetItemsProcessed(state.iterations() * kRounds * kNumberOfElements); } // ---------------------------------------------------------------------- @@ -317,6 +326,7 @@ static void BenchmarkDictionaryArray( fodder_nbytes = fodder.size() * sizeof(Scalar); } state.SetBytesProcessed(state.iterations() * fodder_nbytes * kRounds); + state.SetItemsProcessed(state.iterations() * fodder.size() * kRounds); } static void BuildInt64DictionaryArrayRandom( @@ -361,6 +371,7 @@ static void ArrayDataConstructDestruct( InitArrays(); arrays.clear(); } + state.SetItemsProcessed(state.iterations() * kNumArrays); } // ---------------------------------------------------------------------- @@ -430,6 +441,7 @@ static void ReferenceBuildVectorNoNulls( } state.SetBytesProcessed(state.iterations() * kBytesProcessed); + state.SetItemsProcessed(state.iterations() * kItemsProcessed); } BENCHMARK(ReferenceBuildVectorNoNulls); diff --git a/cpp/src/arrow/memory_pool_benchmark.cc b/cpp/src/arrow/memory_pool_benchmark.cc 
index ba39310a82ec0..fe7a3dd2f8ee0 100644 --- a/cpp/src/arrow/memory_pool_benchmark.cc +++ b/cpp/src/arrow/memory_pool_benchmark.cc @@ -23,6 +23,8 @@ namespace arrow { +static constexpr int64_t kCacheLineSize = 64; + struct SystemAlloc { static Result GetAllocator() { return system_memory_pool(); } }; @@ -51,8 +53,8 @@ static void TouchCacheLines(uint8_t* data, int64_t nbytes) { uint8_t total = 0; while (nbytes > 0) { total += *data; - data += 64; - nbytes -= 64; + data += kCacheLineSize; + nbytes -= kCacheLineSize; } benchmark::DoNotOptimize(total); } @@ -71,6 +73,8 @@ static void TouchArea(benchmark::State& state) { // NOLINT non-const reference } pool->Free(data, nbytes); + state.SetItemsProcessed(state.iterations()); + state.SetBytesProcessed(state.iterations() * nbytes); } // Benchmark the raw cost of allocating memory. @@ -88,6 +92,9 @@ static void AllocateDeallocate(benchmark::State& state) { // NOLINT non-const r ARROW_CHECK_OK(pool->Allocate(nbytes, &data)); pool->Free(data, nbytes); } + state.SetItemsProcessed(state.iterations()); + // SetBytesProcessed() would give nonsensical figures since the data is not + // actually processed. } // Benchmark the cost of allocating memory plus accessing it. 
@@ -103,6 +110,8 @@ static void AllocateTouchDeallocate( TouchCacheLines(data, nbytes); pool->Free(data, nbytes); } + state.SetItemsProcessed(state.iterations()); + state.SetBytesProcessed(state.iterations() * nbytes); } #define BENCHMARK_ALLOCATE_ARGS \ diff --git a/cpp/src/arrow/util/int_util_benchmark.cc b/cpp/src/arrow/util/int_util_benchmark.cc index 1eae604a7dab8..696a957c3ce85 100644 --- a/cpp/src/arrow/util/int_util_benchmark.cc +++ b/cpp/src/arrow/util/int_util_benchmark.cc @@ -64,6 +64,7 @@ static void DetectUIntWidthNoNulls( benchmark::DoNotOptimize(result); } state.SetBytesProcessed(state.iterations() * values.size() * sizeof(uint64_t)); + state.SetItemsProcessed(state.iterations() * values.size()); } static void DetectUIntWidthNulls(benchmark::State& state) { // NOLINT non-const reference @@ -76,6 +77,7 @@ static void DetectUIntWidthNulls(benchmark::State& state) { // NOLINT non-const benchmark::DoNotOptimize(result); } state.SetBytesProcessed(state.iterations() * values.size() * sizeof(uint64_t)); + state.SetItemsProcessed(state.iterations() * values.size()); } static void DetectIntWidthNoNulls( @@ -87,6 +89,7 @@ static void DetectIntWidthNoNulls( benchmark::DoNotOptimize(result); } state.SetBytesProcessed(state.iterations() * values.size() * sizeof(uint64_t)); + state.SetItemsProcessed(state.iterations() * values.size()); } static void DetectIntWidthNulls(benchmark::State& state) { // NOLINT non-const reference @@ -99,6 +102,7 @@ static void DetectIntWidthNulls(benchmark::State& state) { // NOLINT non-const benchmark::DoNotOptimize(result); } state.SetBytesProcessed(state.iterations() * values.size() * sizeof(uint64_t)); + state.SetItemsProcessed(state.iterations() * values.size()); } static void CheckIndexBoundsInt32( diff --git a/cpp/src/arrow/util/range_benchmark.cc b/cpp/src/arrow/util/range_benchmark.cc index 204fd24f791d0..ca9f675b9d5a7 100644 --- a/cpp/src/arrow/util/range_benchmark.cc +++ b/cpp/src/arrow/util/range_benchmark.cc @@ -46,6 +46,7 
@@ void for_loop(benchmark::State& state) { for (auto _ : state) { for (int64_t index = 0; index < kSize; ++index) target[index] = source[index] + 1; } + state.SetItemsProcessed(state.iterations() * kSize); } BENCHMARK(for_loop); @@ -58,6 +59,7 @@ void std_copy(benchmark::State& state) { for (auto _ : state) { std::copy(source.begin(), source.end(), target.begin()); } + state.SetItemsProcessed(state.iterations() * kSize); } BENCHMARK(std_copy); @@ -71,6 +73,7 @@ void std_copy_converting(benchmark::State& state) { for (auto _ : state) { std::copy(source.begin(), source.end(), target.begin()); } + state.SetItemsProcessed(state.iterations() * kSize); } BENCHMARK(std_copy_converting); @@ -85,6 +88,7 @@ void lazy_copy(benchmark::State& state) { for (auto _ : state) { std::copy(lazy_range.begin(), lazy_range.end(), target.begin()); } + state.SetItemsProcessed(state.iterations() * kSize); } BENCHMARK(lazy_copy); @@ -101,6 +105,7 @@ void lazy_copy_converting(benchmark::State& state) { for (auto _ : state) { std::copy(lazy_range.begin(), lazy_range.end(), target.begin()); } + state.SetItemsProcessed(state.iterations() * kSize); } BENCHMARK(lazy_copy_converting); @@ -119,6 +124,7 @@ void lazy_postinc(benchmark::State& state) { while (lazy_iter != lazy_end) *(target_iter++) = *(lazy_iter++); } + state.SetItemsProcessed(state.iterations() * kSize); } BENCHMARK(lazy_postinc); From 2456258bdb1cc0eeeed9448110dc9c0c51c38d7d Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Mon, 19 Feb 2024 17:16:25 +0100 Subject: [PATCH 12/46] GH-40133: [C++][Parquet][Tools] Print FIXED_LEN_BYTE_ARRAY length (#40132) In `ParquetFilePrinter`, when printing the type of the column, also print its byte width if the type is FIXED_LEN_BYTE_ARRAY. 
Before: ``` Column 0: float16_plain (FIXED_LEN_BYTE_ARRAY / Float16) Column 1: float16_byte_stream_split (FIXED_LEN_BYTE_ARRAY / Float16) Column 2: float_plain (FLOAT) Column 3: float_byte_stream_split (FLOAT) Column 4: double_plain (DOUBLE) Column 5: double_byte_stream_split (DOUBLE) Column 6: int32_plain (INT32) Column 7: int32_byte_stream_split (INT32) Column 8: int64_plain (INT64) Column 9: int64_byte_stream_split (INT64) Column 10: flba5_plain (FIXED_LEN_BYTE_ARRAY) Column 11: flba5_byte_stream_split (FIXED_LEN_BYTE_ARRAY) Column 12: decimal_plain (FIXED_LEN_BYTE_ARRAY / Decimal(precision=7, scale=3) / DECIMAL(7,3)) Column 13: decimal_byte_stream_split (FIXED_LEN_BYTE_ARRAY / Decimal(precision=7, scale=3) / DECIMAL(7,3)) ``` After: ``` Column 0: float16_plain (FIXED_LEN_BYTE_ARRAY(2) / Float16) Column 1: float16_byte_stream_split (FIXED_LEN_BYTE_ARRAY(2) / Float16) Column 2: float_plain (FLOAT) Column 3: float_byte_stream_split (FLOAT) Column 4: double_plain (DOUBLE) Column 5: double_byte_stream_split (DOUBLE) Column 6: int32_plain (INT32) Column 7: int32_byte_stream_split (INT32) Column 8: int64_plain (INT64) Column 9: int64_byte_stream_split (INT64) Column 10: flba5_plain (FIXED_LEN_BYTE_ARRAY(5)) Column 11: flba5_byte_stream_split (FIXED_LEN_BYTE_ARRAY(5)) Column 12: decimal_plain (FIXED_LEN_BYTE_ARRAY(4) / Decimal(precision=7, scale=3) / DECIMAL(7,3)) Column 13: decimal_byte_stream_split (FIXED_LEN_BYTE_ARRAY(4) / Decimal(precision=7, scale=3) / DECIMAL(7,3)) ``` * Closes: #40133 Authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- cpp/src/parquet/printer.cc | 2 +- cpp/src/parquet/types.cc | 10 ++++++++++ cpp/src/parquet/types.h | 2 ++ 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/cpp/src/parquet/printer.cc b/cpp/src/parquet/printer.cc index f11397ab96ed8..ce194f897e44d 100644 --- a/cpp/src/parquet/printer.cc +++ b/cpp/src/parquet/printer.cc @@ -105,7 +105,7 @@ void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list 
selecte for (auto i : selected_columns) { const ColumnDescriptor* descr = file_metadata->schema()->Column(i); stream << "Column " << i << ": " << descr->path()->ToDotString() << " (" - << TypeToString(descr->physical_type()); + << TypeToString(descr->physical_type(), descr->type_length()); const auto& logical_type = descr->logical_type(); if (!logical_type->is_none()) { stream << " / " << logical_type->ToString(); diff --git a/cpp/src/parquet/types.cc b/cpp/src/parquet/types.cc index 33fed01ba324f..7b50ed48d06b0 100644 --- a/cpp/src/parquet/types.cc +++ b/cpp/src/parquet/types.cc @@ -177,6 +177,16 @@ std::string TypeToString(Type::type t) { } } +std::string TypeToString(Type::type t, int type_length) { + auto s = TypeToString(t); + if (t == Type::FIXED_LEN_BYTE_ARRAY) { + s += '('; + s += std::to_string(type_length); + s += ')'; + } + return s; +} + std::string ConvertedTypeToString(ConvertedType::type t) { switch (t) { case ConvertedType::NONE: diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index 76dd0efc7cb4a..38529bceae85f 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -796,6 +796,8 @@ PARQUET_EXPORT std::string ConvertedTypeToString(ConvertedType::type t); PARQUET_EXPORT std::string TypeToString(Type::type t); +PARQUET_EXPORT std::string TypeToString(Type::type t, int type_length); + PARQUET_EXPORT std::string FormatStatValue(Type::type parquet_type, ::std::string_view val); From fee640c10bc23de17bb9962f193d5b61282a673f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 19 Feb 2024 09:13:31 -0800 Subject: [PATCH 13/46] MINOR: [C#] Bump xunit.runner.visualstudio from 2.5.6 to 2.5.7 in /csharp (#40140) Bumps [xunit.runner.visualstudio](https://github.com/xunit/visualstudio.xunit) from 2.5.6 to 2.5.7.
Commits

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=xunit.runner.visualstudio&package-manager=nuget&previous-version=2.5.6&new-version=2.5.7)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@ dependabot rebase` will rebase this PR - `@ dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@ dependabot merge` will merge this PR after your CI passes on it - `@ dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@ dependabot cancel merge` will cancel a previously requested merge and block automerging - `@ dependabot reopen` will reopen this PR if it is closed - `@ dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@ dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@ dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Curt Hagenlocher --- .../Apache.Arrow.Compression.Tests.csproj | 2 +- .../Apache.Arrow.Flight.Sql.Tests.csproj | 2 +- .../Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj | 2 +- csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj b/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj index dd27e790d9d4f..c85c770e49fb4 100644 --- a/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj +++ b/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj @@ -9,7 +9,7 @@ - + diff --git a/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj b/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj index 3d7b415599907..73c25479dc9f0 100644 --- a/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj +++ b/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj @@ -8,7 +8,7 @@ - + diff --git a/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj b/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj index 3038376327e70..5d6116f0479c0 100644 --- a/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj +++ b/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj @@ -8,7 +8,7 @@ - + diff --git a/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj b/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj index 48f5747b790b3..17c0e95fdf8d3 100644 --- a/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj +++ b/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj @@ -16,7 +16,7 @@ - + all runtime; build; native; contentfiles; analyzers From 
023c5a1bb1982b126edd94f68df12cadf32be5e8 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 19 Feb 2024 09:15:48 -0800 Subject: [PATCH 14/46] MINOR: [C#] Bump Google.Protobuf from 3.25.2 to 3.25.3 in /csharp (#40141) Bumps [Google.Protobuf](https://github.com/protocolbuffers/protobuf) from 3.25.2 to 3.25.3.
Commits
  • 4a2aef5 Updating version.json and repo version numbers to: 25.3
  • 7c6ba83 Merge pull request #15814 from protocolbuffers/cp-ruby-3.3
  • 25b1e81 Update Ruby GHA to test against Ruby 3.3.
  • 70e459f Merge pull request #15802 from protocolbuffers/cp-25.x
  • 17ec19d Bump python version to 3.9 for gcloud 460.0.0
  • 9dc736d [ObjC] Use a local to avoid warnings in 32bit builds.
  • 9d1bc10 [CPP] Add the privacy manifest to the C++ CocoaPod.
  • cec08dc [ObjC] Add the privacy manifest to the ObjC CocoaPod.
  • cf87faa Add PrivacyInfo.xcprivacy (#15557)
  • 76d05d4 remove use of mach_absolute_time (#15554)
  • Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=Google.Protobuf&package-manager=nuget&previous-version=3.25.2&new-version=3.25.3)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@ dependabot rebase` will rebase this PR - `@ dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@ dependabot merge` will merge this PR after your CI passes on it - `@ dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@ dependabot cancel merge` will cancel a previously requested merge and block automerging - `@ dependabot reopen` will reopen this PR if it is closed - `@ dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@ dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@ dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Curt Hagenlocher --- csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj b/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj index 3a6ae28b390d2..95752b0f64858 100644 --- a/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj +++ b/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj @@ -5,7 +5,7 @@ - + From 977e217adf4e7406b0dc478c1176617af9c347e4 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 19 Feb 2024 09:21:29 -0800 Subject: [PATCH 15/46] MINOR: [C#] Bump xunit from 2.6.6 to 2.7.0 in /csharp (#40139) Bumps [xunit](https://github.com/xunit/xunit) from 2.6.6 to 2.7.0.
Commits
  • d806fa1 v2.7.0
  • 13bb60b #2872: Expand special handling for sets in Assert.Contains/DoesNotContain (v2)
  • 203b4d9 Update to use the common build system (#2883)
  • a4f585b Remove UTF-8 byte order marks
  • 8b2c7b2 Add v3 build submodule
  • 2413c57 Throw when user tries to run a non-async test with a timeout
  • 9ebc10c #2573: Opting out of AsyncTestSyncContext (v2)
  • b060404 xunit/xunit#2880: Update XML documentation for string-based Assert.Equal (v2)
  • 6bbf922 Add conditions to EquivalenceAssertsTests for XUNIT_IMMUTABLE_COLLECTIONS
  • cab6a3e #2871: Inner exception stack trace is missing from Assert.Collection failure ...
  • Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=xunit&package-manager=nuget&previous-version=2.6.6&new-version=2.7.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@ dependabot rebase` will rebase this PR - `@ dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@ dependabot merge` will merge this PR after your CI passes on it - `@ dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@ dependabot cancel merge` will cancel a previously requested merge and block automerging - `@ dependabot reopen` will reopen this PR if it is closed - `@ dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@ dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@ dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Curt Hagenlocher --- .../Apache.Arrow.Compression.Tests.csproj | 2 +- .../Apache.Arrow.Flight.Sql.Tests.csproj | 2 +- .../Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj | 2 +- csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj b/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj index c85c770e49fb4..d4d124668e081 100644 --- a/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj +++ b/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj @@ -8,7 +8,7 @@ - + diff --git a/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj b/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj index 73c25479dc9f0..214553ad1ed22 100644 --- a/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj +++ b/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj @@ -7,7 +7,7 @@ - + diff --git a/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj b/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj index 5d6116f0479c0..d752c077c5521 100644 --- a/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj +++ b/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj @@ -7,7 +7,7 @@ - + diff --git a/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj b/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj index 17c0e95fdf8d3..3febfc92b97c8 100644 --- a/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj +++ b/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj @@ -15,7 +15,7 @@ - + all runtime; build; native; contentfiles; analyzers From 
5dc9a0d2afbb17fffce85c579a5cb97c88c247bf Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 20 Feb 2024 08:15:37 +0900 Subject: [PATCH 16/46] MINOR: [Java] Bump org.apache.hadoop:hadoop-client-api from 3.3.2 to 3.3.6 in /java (#40134) Bumps org.apache.hadoop:hadoop-client-api from 3.3.2 to 3.3.6. [![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=org.apache.hadoop:hadoop-client-api&package-manager=maven&previous-version=3.3.2&new-version=3.3.6)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@ dependabot rebase` will rebase this PR - `@ dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@ dependabot merge` will merge this PR after your CI passes on it - `@ dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@ dependabot cancel merge` will cancel a previously requested merge and block automerging - `@ dependabot reopen` will reopen this PR if it is closed - `@ dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@ dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@ dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- java/adapter/orc/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/adapter/orc/pom.xml b/java/adapter/orc/pom.xml index 79e51470a426e..e7a2bfe872eb3 100644 --- a/java/adapter/orc/pom.xml +++ b/java/adapter/orc/pom.xml @@ -70,7 +70,7 @@ org.apache.hadoop hadoop-client-api - 3.3.2 + 3.3.6 org.apache.hadoop From f2872444e6dd67203f611c599daffb98060b985b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 20 Feb 2024 08:15:54 +0900 Subject: [PATCH 17/46] MINOR: [Java] Bump dep.junit.jupiter.version from 5.10.1 to 5.10.2 in /java (#40135) Bumps `dep.junit.jupiter.version` from 5.10.1 to 5.10.2. Updates `org.junit.jupiter:junit-jupiter-engine` from 5.10.1 to 5.10.2
Release notes

Sourced from org.junit.jupiter:junit-jupiter-engine's releases.

JUnit 5.10.2 = Platform 1.10.2 + Jupiter 5.10.2 + Vintage 5.10.2

See Release Notes.

Full Changelog: https://github.com/junit-team/junit5/compare/r5.10.1...r5.10.2

Commits
  • 4c0ddda Release 5.10.2
  • 463a147 Finalize release notes for 5.10.2
  • 43c105a Revert "Apply method predicate before searching type hierarchy"
  • 63d464d Revert "Harmonize application of method and field filters in search algorithms"
  • 85ec2fc Revert "Apply field predicate before searching type hierarchy"
  • 6209006 Update release notes
  • 5ee499f Fix CI build
  • d919ba7 Namespace user-specific build parameters
  • e26cd83 Prepare release notes for 5.10.2
  • ec8d428 Include LauncherInterceptor in launcher module declaration
  • Additional commits viewable in compare view

Updates `org.junit.jupiter:junit-jupiter-api` from 5.10.1 to 5.10.2
Release notes

Sourced from org.junit.jupiter:junit-jupiter-api's releases.

JUnit 5.10.2 = Platform 1.10.2 + Jupiter 5.10.2 + Vintage 5.10.2

See Release Notes.

Full Changelog: https://github.com/junit-team/junit5/compare/r5.10.1...r5.10.2

Commits
  • 4c0ddda Release 5.10.2
  • 463a147 Finalize release notes for 5.10.2
  • 43c105a Revert "Apply method predicate before searching type hierarchy"
  • 63d464d Revert "Harmonize application of method and field filters in search algorithms"
  • 85ec2fc Revert "Apply field predicate before searching type hierarchy"
  • 6209006 Update release notes
  • 5ee499f Fix CI build
  • d919ba7 Namespace user-specific build parameters
  • e26cd83 Prepare release notes for 5.10.2
  • ec8d428 Include LauncherInterceptor in launcher module declaration
  • Additional commits viewable in compare view

Updates `org.junit.vintage:junit-vintage-engine` from 5.10.1 to 5.10.2
Release notes

Sourced from org.junit.vintage:junit-vintage-engine's releases.

JUnit 5.10.2 = Platform 1.10.2 + Jupiter 5.10.2 + Vintage 5.10.2

See Release Notes.

Full Changelog: https://github.com/junit-team/junit5/compare/r5.10.1...r5.10.2

Commits
  • 4c0ddda Release 5.10.2
  • 463a147 Finalize release notes for 5.10.2
  • 43c105a Revert "Apply method predicate before searching type hierarchy"
  • 63d464d Revert "Harmonize application of method and field filters in search algorithms"
  • 85ec2fc Revert "Apply field predicate before searching type hierarchy"
  • 6209006 Update release notes
  • 5ee499f Fix CI build
  • d919ba7 Namespace user-specific build parameters
  • e26cd83 Prepare release notes for 5.10.2
  • ec8d428 Include LauncherInterceptor in launcher module declaration
  • Additional commits viewable in compare view

Updates `org.junit.jupiter:junit-jupiter-params` from 5.10.1 to 5.10.2
Release notes

Sourced from org.junit.jupiter:junit-jupiter-params's releases.

JUnit 5.10.2 = Platform 1.10.2 + Jupiter 5.10.2 + Vintage 5.10.2

See Release Notes.

Full Changelog: https://github.com/junit-team/junit5/compare/r5.10.1...r5.10.2

Commits
  • 4c0ddda Release 5.10.2
  • 463a147 Finalize release notes for 5.10.2
  • 43c105a Revert "Apply method predicate before searching type hierarchy"
  • 63d464d Revert "Harmonize application of method and field filters in search algorithms"
  • 85ec2fc Revert "Apply field predicate before searching type hierarchy"
  • 6209006 Update release notes
  • 5ee499f Fix CI build
  • d919ba7 Namespace user-specific build parameters
  • e26cd83 Prepare release notes for 5.10.2
  • ec8d428 Include LauncherInterceptor in launcher module declaration
  • Additional commits viewable in compare view

Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@ dependabot rebase` will rebase this PR - `@ dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@ dependabot merge` will merge this PR after your CI passes on it - `@ dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@ dependabot cancel merge` will cancel a previously requested merge and block automerging - `@ dependabot reopen` will reopen this PR if it is closed - `@ dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@ dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@ dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- java/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/pom.xml b/java/pom.xml index accff4c9b9f69..79c4219d5a772 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -30,7 +30,7 @@ ${project.build.directory}/generated-sources 1.9.0 - 5.10.1 + 5.10.2 2.0.11 33.0.0-jre 4.1.106.Final From 31b8d29dfe6bc914a444c8a87e437628d6ed1d5a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 20 Feb 2024 08:16:27 +0900 Subject: [PATCH 18/46] MINOR: [Java] Bump kr.motd.maven:os-maven-plugin from 1.7.0 to 1.7.1 in /java (#40137) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [kr.motd.maven:os-maven-plugin](https://github.com/trustin/os-maven-plugin) from 1.7.0 to 1.7.1.
Release notes

Sourced from kr.motd.maven:os-maven-plugin's releases.

os-maven-plugin-1.7.1

Commits
  • 9913130 [maven-release-plugin] prepare release os-maven-plugin-1.7.1
  • 211db49 Override the default Maven repository when testing against old Maven versions
  • 88bae1e Detect more osx variants (#67)
  • cebc3e8 Add support for LoongArch64 architecture (#63)
  • 4df5494 Add riscv32 and riscv64 support (#62)
  • 6bd9cfa Support for E2K (Elbrus 2000) architecture (#57)
  • 6d81345 Update README.md
  • 52e547c Update the version in README.md
  • ad10438 [maven-release-plugin] prepare for next development iteration
  • See full diff in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=kr.motd.maven:os-maven-plugin&package-manager=maven&previous-version=1.7.0&new-version=1.7.1)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@ dependabot rebase` will rebase this PR - `@ dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@ dependabot merge` will merge this PR after your CI passes on it - `@ dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@ dependabot cancel merge` will cancel a previously requested merge and block automerging - `@ dependabot reopen` will reopen this PR if it is closed - `@ dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@ dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@ dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- java/flight/flight-core/pom.xml | 2 +- java/gandiva/pom.xml | 2 +- java/pom.xml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/java/flight/flight-core/pom.xml b/java/flight/flight-core/pom.xml index 0346172f610a6..e7b299fdbe850 100644 --- a/java/flight/flight-core/pom.xml +++ b/java/flight/flight-core/pom.xml @@ -142,7 +142,7 @@ kr.motd.maven os-maven-plugin - 1.7.0 + 1.7.1 diff --git a/java/gandiva/pom.xml b/java/gandiva/pom.xml index 6337efcf7e348..819baee11edec 100644 --- a/java/gandiva/pom.xml +++ b/java/gandiva/pom.xml @@ -125,7 +125,7 @@ kr.motd.maven os-maven-plugin - 1.7.0 + 1.7.1 diff --git a/java/pom.xml b/java/pom.xml index 79c4219d5a772..f713eb8d8e7b9 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -96,7 +96,7 @@ kr.motd.maven os-maven-plugin - 1.7.0 + 1.7.1 From 5a53e98bca762f0e251bc2d1a82dd45073160234 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 20 Feb 2024 08:16:54 +0900 Subject: [PATCH 19/46] MINOR: [Java] Bump org.apache.maven.surefire:surefire-junit-platform from 3.2.3 to 3.2.5 in /java (#40138) Bumps org.apache.maven.surefire:surefire-junit-platform from 3.2.3 to 3.2.5. [![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=org.apache.maven.surefire:surefire-junit-platform&package-manager=maven&previous-version=3.2.3&new-version=3.2.5)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@ dependabot rebase` will rebase this PR - `@ dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@ dependabot merge` will merge this PR after your CI passes on it - `@ dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@ dependabot cancel merge` will cancel a previously requested merge and block automerging - `@ dependabot reopen` will reopen this PR if it is closed - `@ dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@ dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@ dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- java/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/pom.xml b/java/pom.xml index f713eb8d8e7b9..69ee8a26d946f 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -464,7 +464,7 @@ org.apache.maven.surefire surefire-junit-platform - 3.2.3 + 3.2.5 From bfddfa24755f3871ec66beb119f177323f1380de Mon Sep 17 00:00:00 2001 From: Divyansh200102 <146909065+Divyansh200102@users.noreply.github.com> Date: Tue, 20 Feb 2024 04:54:23 +0530 Subject: [PATCH 20/46] GH-40145: [C++][Docs] Correct the console emitter link (#40146) ### Rationale for this change To fix the embedded link of `cmdr` in C++ development docs ### What changes are included in this PR? The embedded link of `cmdr` is fixed. ### Are these changes tested? Yes ### Are there any user-facing changes? Yes * Closes: #40145 Authored-by: Divyansh200102 Signed-off-by: Sutou Kouhei --- docs/source/developers/cpp/windows.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/developers/cpp/windows.rst b/docs/source/developers/cpp/windows.rst index d97b038ade5e0..251a45325fe0b 100644 --- a/docs/source/developers/cpp/windows.rst +++ b/docs/source/developers/cpp/windows.rst @@ -45,7 +45,7 @@ For Visual Studio 2019, the script is: "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\Common7\Tools\VsDevCmd.bat" -arch=amd64 -One can configure a console emulator like `cmder `_ to +One can configure a console emulator like `cmder `_ to automatically launch this when starting a new development console. Using conda-forge for build dependencies From 5d3f5b61b9585e88b0672840d13d0a065647b11e Mon Sep 17 00:00:00 2001 From: Zhen Wang <643348094@qq.com> Date: Tue, 20 Feb 2024 07:33:04 +0800 Subject: [PATCH 21/46] MINOR: [Docs][Java] Fix incorrect example (#39941) ### Rationale for this change `setRowCount` should be called after filling vectors. 
### What changes are included in this PR? Move `setRowCount` to after filling vectors. ### Are these changes tested? No ### Are there any user-facing changes? Documentation Authored-by: wforget <643348094@qq.com> Signed-off-by: Sutou Kouhei --- docs/source/java/quickstartguide.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/java/quickstartguide.rst b/docs/source/java/quickstartguide.rst index 5ce643db01999..e358681c57830 100644 --- a/docs/source/java/quickstartguide.rst +++ b/docs/source/java/quickstartguide.rst @@ -180,7 +180,6 @@ Example: Create a dataset of names (strings) and ages (32-bit signed integers). IntVector ageVector = (IntVector) root.getVector("age"); VarCharVector nameVector = (VarCharVector) root.getVector("name"); ){ - root.setRowCount(3); ageVector.allocateNew(3); ageVector.set(0, 10); ageVector.set(1, 20); @@ -189,6 +188,7 @@ Example: Create a dataset of names (strings) and ages (32-bit signed integers). nameVector.set(0, "Dave".getBytes(StandardCharsets.UTF_8)); nameVector.set(1, "Peter".getBytes(StandardCharsets.UTF_8)); nameVector.set(2, "Mary".getBytes(StandardCharsets.UTF_8)); + root.setRowCount(3); System.out.println("VectorSchemaRoot created: \n" + root.contentToTSVString()); } From b63770cbe2601908de3439a692b466e32df79392 Mon Sep 17 00:00:00 2001 From: Zhen Wang <643348094@qq.com> Date: Tue, 20 Feb 2024 07:33:51 +0800 Subject: [PATCH 22/46] MINOR: [Docs][Java] Fix installation documentation for BOM file (#39939) ### Rationale for this change We should import `arrow-bom` in `dependencyManagement`. ### What changes are included in this PR? Import `arrow-bom` module in `dependencyManagement` ### Are these changes tested? No ### Are there any user-facing changes? 
Documentation Authored-by: wforget <643348094@qq.com> Signed-off-by: Sutou Kouhei --- docs/source/java/install.rst | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/docs/source/java/install.rst b/docs/source/java/install.rst index 783687fb1f773..7ac1a4990f37d 100644 --- a/docs/source/java/install.rst +++ b/docs/source/java/install.rst @@ -134,11 +134,6 @@ every module. An alternative to the above would be: 15.0.0
- - org.apache.arrow - arrow-bom - ${arrow.version} - org.apache.arrow arrow-vector @@ -148,6 +143,17 @@ every module. An alternative to the above would be: arrow-memory-netty + + + + org.apache.arrow + arrow-bom + ${arrow.version} + pom + import + + + To use the Arrow Flight dependencies, also add the ``os-maven-plugin`` From 4dc3d04ae84d97d02443c0cef555a46535925c2b Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Tue, 20 Feb 2024 11:55:48 +0100 Subject: [PATCH 23/46] GH-40151: [C++] Make S3 narrative test more flexible (#40144) `arrow-s3fs-narrative-test` allows manual testing of the S3 filesystem implementation against an actual S3 backend such as AWS. This PR allows customizing the endpoint address, and adds a command to create a bucket for testing. Validated with the LocalStack S3 server. * Closes: #40151 Authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- .../arrow/filesystem/s3fs_narrative_test.cc | 35 +++++++++++++------ 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/cpp/src/arrow/filesystem/s3fs_narrative_test.cc b/cpp/src/arrow/filesystem/s3fs_narrative_test.cc index f75ca4bdfd04d..bbb3c32ee6bd2 100644 --- a/cpp/src/arrow/filesystem/s3fs_narrative_test.cc +++ b/cpp/src/arrow/filesystem/s3fs_narrative_test.cc @@ -34,6 +34,7 @@ #include "arrow/util/logging.h" DEFINE_bool(clear, false, "delete all bucket contents"); +DEFINE_bool(create, false, "create test bucket"); DEFINE_bool(test, false, "run narrative test against bucket"); DEFINE_bool(verbose, false, "be more verbose (includes AWS warnings)"); @@ -57,8 +58,7 @@ namespace fs { PrintError(context_msg, _status_or_result); \ } while (0) -std::shared_ptr MakeFileSystem() { - std::shared_ptr s3fs; +Result> MakeRootFileSystem() { S3Options options; if (!FLAGS_access_key.empty()) { options = S3Options::FromAccessKey(FLAGS_access_key, FLAGS_secret_key); @@ -68,8 +68,13 @@ std::shared_ptr MakeFileSystem() { options.endpoint_override = FLAGS_endpoint; options.scheme = FLAGS_scheme; 
options.region = FLAGS_region; - s3fs = S3FileSystem::Make(options).ValueOrDie(); - return std::make_shared(FLAGS_bucket, s3fs); + options.allow_bucket_creation = FLAGS_create; + return S3FileSystem::Make(options); +} + +Result> MakeFileSystem() { + ARROW_ASSIGN_OR_RAISE(auto fs, MakeRootFileSystem()); + return std::make_shared(FLAGS_bucket, fs); } void PrintError(const std::string& context_msg, const Status& st) { @@ -90,13 +95,17 @@ void CheckDirectory(FileSystem* fs, const std::string& path) { } void ClearBucket(int argc, char** argv) { - auto fs = MakeFileSystem(); - + ASSERT_OK_AND_ASSIGN(auto fs, MakeFileSystem()); ASSERT_OK(fs->DeleteRootDirContents()); } +void CreateBucket(int argc, char** argv) { + ASSERT_OK_AND_ASSIGN(auto fs, MakeRootFileSystem()); + ASSERT_OK(fs->CreateDir(FLAGS_bucket)); +} + void TestBucket(int argc, char** argv) { - auto fs = MakeFileSystem(); + ASSERT_OK_AND_ASSIGN(auto fs, MakeFileSystem()); std::vector infos; FileSelector select; std::shared_ptr is; @@ -221,13 +230,17 @@ void TestMain(int argc, char** argv) { : (FLAGS_verbose ? 
S3LogLevel::Warn : S3LogLevel::Fatal); ASSERT_OK(InitializeS3(options)); - if (FLAGS_region.empty()) { + if (FLAGS_region.empty() && FLAGS_endpoint.empty()) { ASSERT_OK_AND_ASSIGN(FLAGS_region, ResolveS3BucketRegion(FLAGS_bucket)); } + if (FLAGS_create) { + CreateBucket(argc, argv); + } if (FLAGS_clear) { ClearBucket(argc, argv); - } else if (FLAGS_test) { + } + if (FLAGS_test) { TestBucket(argc, argv); } @@ -244,8 +257,8 @@ int main(int argc, char** argv) { gflags::SetUsageMessage(ss.str()); gflags::ParseCommandLineFlags(&argc, &argv, true); - if (FLAGS_clear + FLAGS_test != 1) { - ARROW_LOG(ERROR) << "Need exactly one of --test and --clear"; + if (FLAGS_clear + FLAGS_test + FLAGS_create != 1) { + ARROW_LOG(ERROR) << "Need exactly one of --test, --clear and --create"; return 2; } if (FLAGS_bucket.empty()) { From df9e0c1264e3d7b83f913bccaec2c9a85fe6777e Mon Sep 17 00:00:00 2001 From: Florian Bernard Date: Tue, 20 Feb 2024 14:15:44 +0100 Subject: [PATCH 24/46] GH-20379: [Java] Dataset Failed to update reservation while freeing bytes (#40101) ### Rationale for this change Better controls JNI Thread management in java dataset module to fix #20379 Re-use the same code found in the java arrow-c-data module : https://github.com/apache/arrow/blob/main/java/c/src/main/cpp/jni_wrapper.cc#L107 May JNIEnvGuard class code can be put in a common place for both libraries ... ### What changes are included in this PR? N/A ### Are these changes tested? These changes has been tested with : https://gist.github.com/fb64/71880cde297bc5234b02b68b785670fd on Linux X86_64 architecture ### Are there any user-facing changes? 
N/A * Closes: #20379 Authored-by: Florian Bernard Signed-off-by: David Li --- java/dataset/src/main/cpp/jni_wrapper.cc | 62 +++++++++++++++++++----- 1 file changed, 50 insertions(+), 12 deletions(-) diff --git a/java/dataset/src/main/cpp/jni_wrapper.cc b/java/dataset/src/main/cpp/jni_wrapper.cc index d2d976677bd6b..19a43c8d2fa41 100644 --- a/java/dataset/src/main/cpp/jni_wrapper.cc +++ b/java/dataset/src/main/cpp/jni_wrapper.cc @@ -83,6 +83,40 @@ void ThrowIfError(const arrow::Status& status) { } } +class JNIEnvGuard { + public: + explicit JNIEnvGuard(JavaVM* vm) : vm_(vm), env_(nullptr), should_detach_(false) { + JNIEnv* env; + jint code = vm->GetEnv(reinterpret_cast(&env), JNI_VERSION); + if (code == JNI_EDETACHED) { + JavaVMAttachArgs args; + args.version = JNI_VERSION; + args.name = NULL; + args.group = NULL; + code = vm->AttachCurrentThread(reinterpret_cast(&env), &args); + should_detach_ = (code == JNI_OK); + } + if (code != JNI_OK) { + ThrowPendingException("Failed to attach the current thread to a Java VM"); + } + env_ = env; + } + + JNIEnv* env() { return env_; } + + ~JNIEnvGuard() { + if (should_detach_) { + vm_->DetachCurrentThread(); + should_detach_ = false; + } + } + + private: + JavaVM* vm_; + JNIEnv* env_; + bool should_detach_; +}; + template T JniGetOrThrow(arrow::Result result) { const arrow::Status& status = result.status(); @@ -126,23 +160,27 @@ class ReserveFromJava : public arrow::dataset::jni::ReservationListener { : vm_(vm), java_reservation_listener_(java_reservation_listener) {} arrow::Status OnReservation(int64_t size) override { - JNIEnv* env; - if (vm_->GetEnv(reinterpret_cast(&env), JNI_VERSION) != JNI_OK) { - return arrow::Status::Invalid("JNIEnv was not attached to current thread"); + try { + JNIEnvGuard guard(vm_); + JNIEnv* env = guard.env(); + env->CallObjectMethod(java_reservation_listener_, reserve_memory_method, size); + RETURN_NOT_OK(arrow::dataset::jni::CheckException(env)); + return arrow::Status::OK(); + } catch (const 
JniPendingException& e) { + return arrow::Status::Invalid(e.what()); } - env->CallObjectMethod(java_reservation_listener_, reserve_memory_method, size); - RETURN_NOT_OK(arrow::dataset::jni::CheckException(env)); - return arrow::Status::OK(); } arrow::Status OnRelease(int64_t size) override { - JNIEnv* env; - if (vm_->GetEnv(reinterpret_cast(&env), JNI_VERSION) != JNI_OK) { - return arrow::Status::Invalid("JNIEnv was not attached to current thread"); + try { + JNIEnvGuard guard(vm_); + JNIEnv* env = guard.env(); + env->CallObjectMethod(java_reservation_listener_, unreserve_memory_method, size); + RETURN_NOT_OK(arrow::dataset::jni::CheckException(env)); + return arrow::Status::OK(); + } catch (const JniPendingException& e) { + return arrow::Status::Invalid(e.what()); } - env->CallObjectMethod(java_reservation_listener_, unreserve_memory_method, size); - RETURN_NOT_OK(arrow::dataset::jni::CheckException(env)); - return arrow::Status::OK(); } jobject GetJavaReservationListener() { return java_reservation_listener_; } From 2e2bd8b9f231e1fc13ec4f76aebccae8f7a1ef8f Mon Sep 17 00:00:00 2001 From: Paul Nienaber Date: Tue, 20 Feb 2024 05:49:26 -0800 Subject: [PATCH 25/46] GH-34865: [C++][Java][Flight RPC] Add Session management messages (#34817) ### Rationale for this change Flight presently contains no formal mechanism for managing connection/query configuration options; instead, request headers and/or non-query SQL statements are often used in lieu, with unnecessary overhead and poor failure handling. A stateless (from Flight's perspective) Flight format extension is desirable to close this gap for server implementations that use/want connection state/context. ### What changes are included in this PR? "Session" set/get/close Actions and server-side helper middleware. ### Are these changes tested? Integration tests (C++ currently broken due to middleware-related framework issue) and some complex-case unit testing are included. ### Are there any user-facing changes? 
Non-breaking extensions to wire format and corresponding client/server Flight RPC API extensions. * Closes: #34865 Lead-authored-by: Paul Nienaber Co-authored-by: Paul Nienaber Co-authored-by: James Duong Co-authored-by: Sutou Kouhei Signed-off-by: David Li --- cpp/src/arrow/flight/client.cc | 41 ++ cpp/src/arrow/flight/client.h | 21 + .../flight_integration_test.cc | 2 + .../integration_tests/test_integration.cc | 154 ++++++++ .../arrow/flight/serialization_internal.cc | 159 ++++++++ cpp/src/arrow/flight/serialization_internal.h | 20 + cpp/src/arrow/flight/sql/CMakeLists.txt | 7 +- cpp/src/arrow/flight/sql/client.h | 27 ++ cpp/src/arrow/flight/sql/server.cc | 76 +++- cpp/src/arrow/flight/sql/server.h | 19 + .../flight/sql/server_session_middleware.cc | 235 ++++++++++++ .../flight/sql/server_session_middleware.h | 89 +++++ .../sql/server_session_middleware_factory.h | 61 +++ ...erver_session_middleware_internals_test.cc | 45 +++ .../flight/transport/grpc/grpc_server.cc | 26 +- cpp/src/arrow/flight/types.cc | 363 ++++++++++++++++++ cpp/src/arrow/flight/types.h | 197 ++++++++++ dev/archery/archery/integration/runner.py | 5 + docs/source/format/FlightSql.rst | 41 ++ format/Flight.proto | 114 ++++++ .../arrow/flight/CloseSessionRequest.java | 58 +++ .../arrow/flight/CloseSessionResult.java | 106 +++++ .../org/apache/arrow/flight/FlightClient.java | 96 +++++ .../apache/arrow/flight/FlightConstants.java | 14 + .../flight/GetSessionOptionsRequest.java | 60 +++ .../arrow/flight/GetSessionOptionsResult.java | 80 ++++ .../flight/NoOpSessionOptionValueVisitor.java | 72 ++++ .../arrow/flight/ServerSessionMiddleware.java | 227 +++++++++++ .../arrow/flight/SessionOptionValue.java | 94 +++++ .../flight/SessionOptionValueFactory.java | 284 ++++++++++++++ .../flight/SessionOptionValueVisitor.java | 58 +++ .../flight/SetSessionOptionsRequest.java | 81 ++++ .../arrow/flight/SetSessionOptionsResult.java | 152 ++++++++ java/flight/flight-integration-tests/pom.xml | 4 + 
.../flight/integration/tests/Scenarios.java | 1 + .../tests/SessionOptionsProducer.java | 110 ++++++ .../tests/SessionOptionsScenario.java | 107 ++++++ .../integration/tests/IntegrationTest.java | 5 + .../sql/CloseSessionResultListener.java | 46 +++ .../arrow/flight/sql/FlightSqlClient.java | 18 + .../arrow/flight/sql/FlightSqlProducer.java | 79 ++++ .../sql/GetSessionOptionsResultListener.java | 46 +++ .../sql/SetSessionOptionsResultListener.java | 46 +++ testing | 2 +- 44 files changed, 3527 insertions(+), 21 deletions(-) create mode 100644 cpp/src/arrow/flight/sql/server_session_middleware.cc create mode 100644 cpp/src/arrow/flight/sql/server_session_middleware.h create mode 100644 cpp/src/arrow/flight/sql/server_session_middleware_factory.h create mode 100644 cpp/src/arrow/flight/sql/server_session_middleware_internals_test.cc create mode 100644 java/flight/flight-core/src/main/java/org/apache/arrow/flight/CloseSessionRequest.java create mode 100644 java/flight/flight-core/src/main/java/org/apache/arrow/flight/CloseSessionResult.java create mode 100644 java/flight/flight-core/src/main/java/org/apache/arrow/flight/GetSessionOptionsRequest.java create mode 100644 java/flight/flight-core/src/main/java/org/apache/arrow/flight/GetSessionOptionsResult.java create mode 100644 java/flight/flight-core/src/main/java/org/apache/arrow/flight/NoOpSessionOptionValueVisitor.java create mode 100644 java/flight/flight-core/src/main/java/org/apache/arrow/flight/ServerSessionMiddleware.java create mode 100644 java/flight/flight-core/src/main/java/org/apache/arrow/flight/SessionOptionValue.java create mode 100644 java/flight/flight-core/src/main/java/org/apache/arrow/flight/SessionOptionValueFactory.java create mode 100644 java/flight/flight-core/src/main/java/org/apache/arrow/flight/SessionOptionValueVisitor.java create mode 100644 java/flight/flight-core/src/main/java/org/apache/arrow/flight/SetSessionOptionsRequest.java create mode 100644 
java/flight/flight-core/src/main/java/org/apache/arrow/flight/SetSessionOptionsResult.java create mode 100644 java/flight/flight-integration-tests/src/main/java/org/apache/arrow/flight/integration/tests/SessionOptionsProducer.java create mode 100644 java/flight/flight-integration-tests/src/main/java/org/apache/arrow/flight/integration/tests/SessionOptionsScenario.java create mode 100644 java/flight/flight-sql/src/main/java/org/apache/arrow/flight/sql/CloseSessionResultListener.java create mode 100644 java/flight/flight-sql/src/main/java/org/apache/arrow/flight/sql/GetSessionOptionsResultListener.java create mode 100644 java/flight/flight-sql/src/main/java/org/apache/arrow/flight/sql/SetSessionOptionsResultListener.java diff --git a/cpp/src/arrow/flight/client.cc b/cpp/src/arrow/flight/client.cc index 25da5e8007660..4d4f13a09fb26 100644 --- a/cpp/src/arrow/flight/client.cc +++ b/cpp/src/arrow/flight/client.cc @@ -713,6 +713,47 @@ arrow::Result FlightClient::DoExchange( return result; } +::arrow::Result FlightClient::SetSessionOptions( + const FlightCallOptions& options, const SetSessionOptionsRequest& request) { + RETURN_NOT_OK(CheckOpen()); + ARROW_ASSIGN_OR_RAISE(auto body, request.SerializeToString()); + Action action{ActionType::kSetSessionOptions.type, Buffer::FromString(body)}; + ARROW_ASSIGN_OR_RAISE(auto stream, DoAction(options, action)); + ARROW_ASSIGN_OR_RAISE(auto result, stream->Next()); + ARROW_ASSIGN_OR_RAISE( + auto set_session_options_result, + SetSessionOptionsResult::Deserialize(std::string_view(*result->body))); + ARROW_RETURN_NOT_OK(stream->Drain()); + return set_session_options_result; +} + +::arrow::Result FlightClient::GetSessionOptions( + const FlightCallOptions& options, const GetSessionOptionsRequest& request) { + RETURN_NOT_OK(CheckOpen()); + ARROW_ASSIGN_OR_RAISE(auto body, request.SerializeToString()); + Action action{ActionType::kGetSessionOptions.type, Buffer::FromString(body)}; + ARROW_ASSIGN_OR_RAISE(auto stream, DoAction(options, 
action)); + ARROW_ASSIGN_OR_RAISE(auto result, stream->Next()); + ARROW_ASSIGN_OR_RAISE( + auto get_session_options_result, + GetSessionOptionsResult::Deserialize(std::string_view(*result->body))); + ARROW_RETURN_NOT_OK(stream->Drain()); + return get_session_options_result; +} + +::arrow::Result FlightClient::CloseSession( + const FlightCallOptions& options, const CloseSessionRequest& request) { + RETURN_NOT_OK(CheckOpen()); + ARROW_ASSIGN_OR_RAISE(auto body, request.SerializeToString()); + Action action{ActionType::kCloseSession.type, Buffer::FromString(body)}; + ARROW_ASSIGN_OR_RAISE(auto stream, DoAction(options, action)); + ARROW_ASSIGN_OR_RAISE(auto result, stream->Next()); + ARROW_ASSIGN_OR_RAISE(auto close_session_result, + CloseSessionResult::Deserialize(std::string_view(*result->body))); + ARROW_RETURN_NOT_OK(stream->Drain()); + return close_session_result; +} + Status FlightClient::Close() { if (!closed_) { closed_ = true; diff --git a/cpp/src/arrow/flight/client.h b/cpp/src/arrow/flight/client.h index 1df71d2029f74..330fa8bad730d 100644 --- a/cpp/src/arrow/flight/client.h +++ b/cpp/src/arrow/flight/client.h @@ -383,6 +383,27 @@ class ARROW_FLIGHT_EXPORT FlightClient { return DoExchange({}, descriptor); } + /// \brief Set server session option(s) by name/value. Sessions are generally + /// persisted via HTTP cookies. + /// \param[in] options Per-RPC options + /// \param[in] request The server session options to set + ::arrow::Result SetSessionOptions( + const FlightCallOptions& options, const SetSessionOptionsRequest& request); + + /// \brief Get the current server session options. The session is generally + /// accessed via an HTTP cookie. + /// \param[in] options Per-RPC options + /// \param[in] request The (empty) GetSessionOptions request object. + ::arrow::Result GetSessionOptions( + const FlightCallOptions& options, const GetSessionOptionsRequest& request); + + /// \brief Close/invalidate the current server session. 
The session is generally + /// accessed via an HTTP cookie. + /// \param[in] options Per-RPC options + /// \param[in] request The (empty) CloseSession request object. + ::arrow::Result CloseSession(const FlightCallOptions& options, + const CloseSessionRequest& request); + /// \brief Explicitly shut down and clean up the client. /// /// For backwards compatibility, this will be implicitly called by diff --git a/cpp/src/arrow/flight/integration_tests/flight_integration_test.cc b/cpp/src/arrow/flight/integration_tests/flight_integration_test.cc index 67c7ee85f59d3..6f3115cc5ab8a 100644 --- a/cpp/src/arrow/flight/integration_tests/flight_integration_test.cc +++ b/cpp/src/arrow/flight/integration_tests/flight_integration_test.cc @@ -71,6 +71,8 @@ TEST(FlightIntegration, ExpirationTimeRenewFlightEndpoint) { ASSERT_OK(RunScenario("expiration_time:renew_flight_endpoint")); } +TEST(FlightIntegration, SessionOptions) { ASSERT_OK(RunScenario("session_options")); } + TEST(FlightIntegration, PollFlightInfo) { ASSERT_OK(RunScenario("poll_flight_info")); } TEST(FlightIntegration, AppMetadataFlightInfoEndpoint) { diff --git a/cpp/src/arrow/flight/integration_tests/test_integration.cc b/cpp/src/arrow/flight/integration_tests/test_integration.cc index b693662f60740..d4e0a2cda5bd8 100644 --- a/cpp/src/arrow/flight/integration_tests/test_integration.cc +++ b/cpp/src/arrow/flight/integration_tests/test_integration.cc @@ -28,11 +28,13 @@ #include "arrow/array/array_nested.h" #include "arrow/array/array_primitive.h" #include "arrow/array/builder_primitive.h" +#include "arrow/flight/client_cookie_middleware.h" #include "arrow/flight/client_middleware.h" #include "arrow/flight/server_middleware.h" #include "arrow/flight/sql/client.h" #include "arrow/flight/sql/column_metadata.h" #include "arrow/flight/sql/server.h" +#include "arrow/flight/sql/server_session_middleware.h" #include "arrow/flight/sql/types.h" #include "arrow/flight/test_util.h" #include "arrow/flight/types.h" @@ -744,6 
+746,155 @@ class ExpirationTimeRenewFlightEndpointScenario : public Scenario { } }; +/// \brief The server used for testing Session Options. +/// +/// SetSessionOptions has a blacklisted option name and string option value, +/// both "lol_invalid", which will result in errors attempting to set either. +class SessionOptionsServer : public sql::FlightSqlServerBase { + static inline const std::string invalid_option_name = "lol_invalid"; + static inline const SessionOptionValue invalid_option_value = "lol_invalid"; + + const std::string session_middleware_key; + // These will never be threaded so using a plain map and no lock + std::map session_store_; + + public: + explicit SessionOptionsServer(std::string session_middleware_key) + : FlightSqlServerBase(), + session_middleware_key(std::move(session_middleware_key)) {} + + arrow::Result SetSessionOptions( + const ServerCallContext& context, + const SetSessionOptionsRequest& request) override { + SetSessionOptionsResult res; + + auto* middleware = static_cast( + context.GetMiddleware(session_middleware_key)); + ARROW_ASSIGN_OR_RAISE(std::shared_ptr session, + middleware->GetSession()); + + for (const auto& [name, value] : request.session_options) { + // Blacklisted value name + if (name == invalid_option_name) { + res.errors.emplace(name, SetSessionOptionsResult::Error{ + SetSessionOptionErrorValue::kInvalidName}); + continue; + } + // Blacklisted option value + if (value == invalid_option_value) { + res.errors.emplace(name, SetSessionOptionsResult::Error{ + SetSessionOptionErrorValue::kInvalidValue}); + continue; + } + if (std::holds_alternative(value)) { + session->EraseSessionOption(name); + continue; + } + session->SetSessionOption(name, value); + } + + return res; + } + + arrow::Result GetSessionOptions( + const ServerCallContext& context, + const GetSessionOptionsRequest& request) override { + auto* middleware = static_cast( + context.GetMiddleware(session_middleware_key)); + if (!middleware->HasSession()) { + 
return Status::Invalid("No existing session to get options from."); + } + ARROW_ASSIGN_OR_RAISE(std::shared_ptr session, + middleware->GetSession()); + + return GetSessionOptionsResult{session->GetSessionOptions()}; + } + + arrow::Result CloseSession( + const ServerCallContext& context, const CloseSessionRequest& request) override { + // Broken (does not expire cookie) until C++ middleware handling (GH-39791) fixed: + auto* middleware = static_cast( + context.GetMiddleware(session_middleware_key)); + ARROW_RETURN_NOT_OK(middleware->CloseSession()); + return CloseSessionResult{CloseSessionStatus::kClosed}; + } +}; + +/// \brief The Session Options scenario. +/// +/// This tests Session Options functionality as well as ServerSessionMiddleware. +class SessionOptionsScenario : public Scenario { + static inline const std::string server_middleware_key = "sessionmiddleware"; + + Status MakeServer(std::unique_ptr* server, + FlightServerOptions* options) override { + *server = std::make_unique(server_middleware_key); + + auto id_gen_int = std::make_shared(1000); + options->middleware.emplace_back( + server_middleware_key, + sql::MakeServerSessionMiddlewareFactory( + [=]() -> std::string { return std::to_string((*id_gen_int)++); })); + + return Status::OK(); + } + + Status MakeClient(FlightClientOptions* options) override { + options->middleware.emplace_back(GetCookieFactory()); + return Status::OK(); + } + + Status RunClient(std::unique_ptr flight_client) override { + sql::FlightSqlClient client{std::move(flight_client)}; + + // Set + auto req1 = SetSessionOptionsRequest{ + {{"foolong", 123L}, + {"bardouble", 456.0}, + {"lol_invalid", "this won't get set"}, + {"key_with_invalid_value", "lol_invalid"}, + {"big_ol_string_list", std::vector{"a", "b", "sea", "dee", " ", + " ", "geee", "(づ。◕‿‿◕。)づ"}}}}; + ARROW_ASSIGN_OR_RAISE(auto res1, client.SetSessionOptions({}, req1)); + // Some errors + if (res1.errors != + std::map{ + {"lol_invalid", + 
SetSessionOptionsResult::Error{SetSessionOptionErrorValue::kInvalidName}}, + {"key_with_invalid_value", SetSessionOptionsResult::Error{ + SetSessionOptionErrorValue::kInvalidValue}}}) { + return Status::Invalid("res1 incorrect: " + res1.ToString()); + } + // Some set, some omitted due to above errors + ARROW_ASSIGN_OR_RAISE(auto res2, client.GetSessionOptions({}, {})); + if (res2.session_options != + std::map{ + {"foolong", 123L}, + {"bardouble", 456.0}, + {"big_ol_string_list", + std::vector{"a", "b", "sea", "dee", " ", " ", "geee", + "(づ。◕‿‿◕。)づ"}}}) { + return Status::Invalid("res2 incorrect: " + res2.ToString()); + } + // Update + ARROW_ASSIGN_OR_RAISE( + auto res3, + client.SetSessionOptions( + {}, SetSessionOptionsRequest{ + {{"foolong", std::monostate{}}, + {"big_ol_string_list", "a,b,sea,dee, , ,geee,(づ。◕‿‿◕。)づ"}}})); + ARROW_ASSIGN_OR_RAISE(auto res4, client.GetSessionOptions({}, {})); + if (res4.session_options != + std::map{ + {"bardouble", 456.0}, + {"big_ol_string_list", "a,b,sea,dee, , ,geee,(づ。◕‿‿◕。)づ"}}) { + return Status::Invalid("res4 incorrect: " + res4.ToString()); + } + + return Status::OK(); + } +}; + /// \brief The server used for testing PollFlightInfo(). 
class PollFlightInfoServer : public FlightServerBase { public: @@ -1952,6 +2103,9 @@ Status GetScenario(const std::string& scenario_name, std::shared_ptr* } else if (scenario_name == "expiration_time:renew_flight_endpoint") { *out = std::make_shared(); return Status::OK(); + } else if (scenario_name == "session_options") { + *out = std::make_shared(); + return Status::OK(); } else if (scenario_name == "poll_flight_info") { *out = std::make_shared(); return Status::OK(); diff --git a/cpp/src/arrow/flight/serialization_internal.cc b/cpp/src/arrow/flight/serialization_internal.cc index e5a7503a6386b..10600d055b3a8 100644 --- a/cpp/src/arrow/flight/serialization_internal.cc +++ b/cpp/src/arrow/flight/serialization_internal.cc @@ -27,6 +27,14 @@ #include "arrow/result.h" #include "arrow/status.h" +// Lambda helper & CTAD +template +struct overloaded : Ts... { + using Ts::operator()...; +}; +template // CTAD will not be needed for >=C++20 +overloaded(Ts...)->overloaded; + namespace arrow { namespace flight { namespace internal { @@ -380,6 +388,157 @@ Status ToPayload(const FlightDescriptor& descr, std::shared_ptr* out) { return Status::OK(); } +// SessionOptionValue + +Status FromProto(const pb::SessionOptionValue& pb_val, SessionOptionValue* val) { + switch (pb_val.option_value_case()) { + case pb::SessionOptionValue::OPTION_VALUE_NOT_SET: + *val = std::monostate{}; + break; + case pb::SessionOptionValue::kStringValue: + *val = pb_val.string_value(); + break; + case pb::SessionOptionValue::kBoolValue: + *val = pb_val.bool_value(); + break; + case pb::SessionOptionValue::kInt64Value: + *val = pb_val.int64_value(); + break; + case pb::SessionOptionValue::kDoubleValue: + *val = pb_val.double_value(); + break; + case pb::SessionOptionValue::kStringListValue: { + std::vector vec; + vec.reserve(pb_val.string_list_value().values_size()); + for (const std::string& s : pb_val.string_list_value().values()) { + vec.push_back(s); + } + (*val).emplace>(std::move(vec)); + break; + } 
+ } + return Status::OK(); +} + +Status ToProto(const SessionOptionValue& val, pb::SessionOptionValue* pb_val) { + std::visit(overloaded{[&](std::monostate v) { pb_val->clear_option_value(); }, + [&](std::string v) { pb_val->set_string_value(v); }, + [&](bool v) { pb_val->set_bool_value(v); }, + [&](int64_t v) { pb_val->set_int64_value(v); }, + [&](double v) { pb_val->set_double_value(v); }, + [&](std::vector v) { + auto* string_list_value = pb_val->mutable_string_list_value(); + for (const std::string& s : v) string_list_value->add_values(s); + }}, + val); + return Status::OK(); +} + +// map + +Status FromProto(const google::protobuf::Map& pb_map, + std::map* map) { + if (pb_map.empty()) { + return Status::OK(); + } + for (const auto& [name, pb_val] : pb_map) { + RETURN_NOT_OK(FromProto(pb_val, &(*map)[name])); + } + return Status::OK(); +} + +Status ToProto(const std::map& map, + google::protobuf::Map* pb_map) { + for (const auto& [name, val] : map) { + RETURN_NOT_OK(ToProto(val, &(*pb_map)[name])); + } + return Status::OK(); +} + +// SetSessionOptionsRequest + +Status FromProto(const pb::SetSessionOptionsRequest& pb_request, + SetSessionOptionsRequest* request) { + RETURN_NOT_OK(FromProto(pb_request.session_options(), &request->session_options)); + return Status::OK(); +} + +Status ToProto(const SetSessionOptionsRequest& request, + pb::SetSessionOptionsRequest* pb_request) { + RETURN_NOT_OK(ToProto(request.session_options, pb_request->mutable_session_options())); + return Status::OK(); +} + +// SetSessionOptionsResult + +Status FromProto(const pb::SetSessionOptionsResult& pb_result, + SetSessionOptionsResult* result) { + for (const auto& [k, pb_v] : pb_result.errors()) { + result->errors.insert({k, {static_cast(pb_v.value())}}); + } + return Status::OK(); +} + +Status ToProto(const SetSessionOptionsResult& result, + pb::SetSessionOptionsResult* pb_result) { + auto* pb_errors = pb_result->mutable_errors(); + for (const auto& [k, v] : result.errors) { + 
pb::SetSessionOptionsResult::Error e; + e.set_value(static_cast(v.value)); + (*pb_errors)[k] = std::move(e); + } + return Status::OK(); +} + +// GetSessionOptionsRequest + +Status FromProto(const pb::GetSessionOptionsRequest& pb_request, + GetSessionOptionsRequest* request) { + return Status::OK(); +} + +Status ToProto(const GetSessionOptionsRequest& request, + pb::GetSessionOptionsRequest* pb_request) { + return Status::OK(); +} + +// GetSessionOptionsResult + +Status FromProto(const pb::GetSessionOptionsResult& pb_result, + GetSessionOptionsResult* result) { + RETURN_NOT_OK(FromProto(pb_result.session_options(), &result->session_options)); + return Status::OK(); +} + +Status ToProto(const GetSessionOptionsResult& result, + pb::GetSessionOptionsResult* pb_result) { + RETURN_NOT_OK(ToProto(result.session_options, pb_result->mutable_session_options())); + return Status::OK(); +} + +// CloseSessionRequest + +Status FromProto(const pb::CloseSessionRequest& pb_request, + CloseSessionRequest* request) { + return Status::OK(); +} + +Status ToProto(const CloseSessionRequest& request, pb::CloseSessionRequest* pb_request) { + return Status::OK(); +} + +// CloseSessionResult + +Status FromProto(const pb::CloseSessionResult& pb_result, CloseSessionResult* result) { + result->status = static_cast(pb_result.status()); + return Status::OK(); +} + +Status ToProto(const CloseSessionResult& result, pb::CloseSessionResult* pb_result) { + pb_result->set_status(static_cast(result.status)); + return Status::OK(); +} + } // namespace internal } // namespace flight } // namespace arrow diff --git a/cpp/src/arrow/flight/serialization_internal.h b/cpp/src/arrow/flight/serialization_internal.h index 1ac7de83d1308..90dde87d3a5eb 100644 --- a/cpp/src/arrow/flight/serialization_internal.h +++ b/cpp/src/arrow/flight/serialization_internal.h @@ -66,6 +66,16 @@ Status FromProto(const pb::CancelFlightInfoRequest& pb_request, CancelFlightInfoRequest* request); Status FromProto(const 
pb::SchemaResult& pb_result, std::string* result); Status FromProto(const pb::BasicAuth& pb_basic_auth, BasicAuth* info); +Status FromProto(const pb::SetSessionOptionsRequest& pb_request, + SetSessionOptionsRequest* request); +Status FromProto(const pb::SetSessionOptionsResult& pb_result, + SetSessionOptionsResult* result); +Status FromProto(const pb::GetSessionOptionsRequest& pb_request, + GetSessionOptionsRequest* request); +Status FromProto(const pb::GetSessionOptionsResult& pb_result, + GetSessionOptionsResult* result); +Status FromProto(const pb::CloseSessionRequest& pb_request, CloseSessionRequest* request); +Status FromProto(const pb::CloseSessionResult& pb_result, CloseSessionResult* result); Status ToProto(const Timestamp& timestamp, google::protobuf::Timestamp* pb_timestamp); Status ToProto(const FlightDescriptor& descr, pb::FlightDescriptor* pb_descr); @@ -85,6 +95,16 @@ Status ToProto(const Criteria& criteria, pb::Criteria* pb_criteria); Status ToProto(const SchemaResult& result, pb::SchemaResult* pb_result); Status ToProto(const Ticket& ticket, pb::Ticket* pb_ticket); Status ToProto(const BasicAuth& basic_auth, pb::BasicAuth* pb_basic_auth); +Status ToProto(const SetSessionOptionsRequest& request, + pb::SetSessionOptionsRequest* pb_request); +Status ToProto(const SetSessionOptionsResult& result, + pb::SetSessionOptionsResult* pb_result); +Status ToProto(const GetSessionOptionsRequest& request, + pb::GetSessionOptionsRequest* pb_request); +Status ToProto(const GetSessionOptionsResult& result, + pb::GetSessionOptionsResult* pb_result); +Status ToProto(const CloseSessionRequest& request, pb::CloseSessionRequest* pb_request); +Status ToProto(const CloseSessionResult& result, pb::CloseSessionResult* pb_result); Status ToPayload(const FlightDescriptor& descr, std::shared_ptr* out); diff --git a/cpp/src/arrow/flight/sql/CMakeLists.txt b/cpp/src/arrow/flight/sql/CMakeLists.txt index b0a551a2bca77..b32f731496749 100644 --- 
a/cpp/src/arrow/flight/sql/CMakeLists.txt +++ b/cpp/src/arrow/flight/sql/CMakeLists.txt @@ -47,7 +47,8 @@ set(ARROW_FLIGHT_SQL_SRCS sql_info_internal.cc column_metadata.cc client.cc - protocol_internal.cc) + protocol_internal.cc + server_session_middleware.cc) add_arrow_lib(arrow_flight_sql CMAKE_PACKAGE_NAME @@ -104,7 +105,9 @@ if(ARROW_BUILD_TESTS OR ARROW_BUILD_EXAMPLES) example/sqlite_server.cc example/sqlite_tables_schema_batch_reader.cc) - set(ARROW_FLIGHT_SQL_TEST_SRCS server_test.cc) + set(ARROW_FLIGHT_SQL_TEST_SRCS server_test.cc + server_session_middleware_internals_test.cc) + set(ARROW_FLIGHT_SQL_TEST_LIBS ${SQLite3_LIBRARIES}) set(ARROW_FLIGHT_SQL_ACERO_SRCS example/acero_server.cc) diff --git a/cpp/src/arrow/flight/sql/client.h b/cpp/src/arrow/flight/sql/client.h index 5f3fc7d8574a9..9782611dbadcd 100644 --- a/cpp/src/arrow/flight/sql/client.h +++ b/cpp/src/arrow/flight/sql/client.h @@ -350,6 +350,33 @@ class ARROW_FLIGHT_SQL_EXPORT FlightSqlClient { ::arrow::Result CancelQuery(const FlightCallOptions& options, const FlightInfo& info); + /// \brief Sets session options. + /// + /// \param[in] options RPC-layer hints for this call. + /// \param[in] request The session options to set. + ::arrow::Result SetSessionOptions( + const FlightCallOptions& options, const SetSessionOptionsRequest& request) { + return impl_->SetSessionOptions(options, request); + } + + /// \brief Gets current session options. + /// + /// \param[in] options RPC-layer hints for this call. + /// \param[in] request The (empty) GetSessionOptions request object. + ::arrow::Result GetSessionOptions( + const FlightCallOptions& options, const GetSessionOptionsRequest& request) { + return impl_->GetSessionOptions(options, request); + } + + /// \brief Explicitly closes the session if applicable. + /// + /// \param[in] options RPC-layer hints for this call. + /// \param[in] request The (empty) CloseSession request object. 
+ ::arrow::Result CloseSession(const FlightCallOptions& options, + const CloseSessionRequest& request) { + return impl_->CloseSession(options, request); + } + /// \brief Extends the expiration of a FlightEndpoint. /// /// \param[in] options RPC-layer hints for this call. diff --git a/cpp/src/arrow/flight/sql/server.cc b/cpp/src/arrow/flight/sql/server.cc index a6d197d15b2c0..a5cb842de8f49 100644 --- a/cpp/src/arrow/flight/sql/server.cc +++ b/cpp/src/arrow/flight/sql/server.cc @@ -442,6 +442,21 @@ arrow::Result PackActionResult(ActionCreatePreparedStatementResult resul return PackActionResult(pb_result); } +arrow::Result PackActionResult(SetSessionOptionsResult result) { + ARROW_ASSIGN_OR_RAISE(auto serialized, result.SerializeToString()); + return Result{Buffer::FromString(std::move(serialized))}; +} + +arrow::Result PackActionResult(GetSessionOptionsResult result) { + ARROW_ASSIGN_OR_RAISE(auto serialized, result.SerializeToString()); + return Result{Buffer::FromString(std::move(serialized))}; +} + +arrow::Result PackActionResult(CloseSessionResult result) { + ARROW_ASSIGN_OR_RAISE(auto serialized, result.SerializeToString()); + return Result{Buffer::FromString(std::move(serialized))}; +} + } // namespace arrow::Result StatementQueryTicket::Deserialize( @@ -759,18 +774,19 @@ Status FlightSqlServerBase::DoPut(const ServerCallContext& context, Status FlightSqlServerBase::ListActions(const ServerCallContext& context, std::vector* actions) { - *actions = { - ActionType::kCancelFlightInfo, - ActionType::kRenewFlightEndpoint, - FlightSqlServerBase::kBeginSavepointActionType, - FlightSqlServerBase::kBeginTransactionActionType, - FlightSqlServerBase::kCancelQueryActionType, - FlightSqlServerBase::kCreatePreparedStatementActionType, - FlightSqlServerBase::kCreatePreparedSubstraitPlanActionType, - FlightSqlServerBase::kClosePreparedStatementActionType, - FlightSqlServerBase::kEndSavepointActionType, - FlightSqlServerBase::kEndTransactionActionType, - }; + *actions = 
{ActionType::kCancelFlightInfo, + ActionType::kRenewFlightEndpoint, + FlightSqlServerBase::kBeginSavepointActionType, + FlightSqlServerBase::kBeginTransactionActionType, + FlightSqlServerBase::kCancelQueryActionType, + FlightSqlServerBase::kCreatePreparedStatementActionType, + FlightSqlServerBase::kCreatePreparedSubstraitPlanActionType, + FlightSqlServerBase::kClosePreparedStatementActionType, + FlightSqlServerBase::kEndSavepointActionType, + FlightSqlServerBase::kEndTransactionActionType, + ActionType::kSetSessionOptions, + ActionType::kGetSessionOptions, + ActionType::kCloseSession}; return Status::OK(); } @@ -791,6 +807,27 @@ Status FlightSqlServerBase::DoAction(const ServerCallContext& context, ARROW_ASSIGN_OR_RAISE(auto renewed_endpoint, RenewFlightEndpoint(context, request)); ARROW_ASSIGN_OR_RAISE(auto packed_result, PackActionResult(renewed_endpoint)); + results.push_back(std::move(packed_result)); + } else if (action.type == ActionType::kSetSessionOptions.type) { + std::string_view body(*action.body); + ARROW_ASSIGN_OR_RAISE(auto request, SetSessionOptionsRequest::Deserialize(body)); + ARROW_ASSIGN_OR_RAISE(auto result, SetSessionOptions(context, request)); + ARROW_ASSIGN_OR_RAISE(auto packed_result, PackActionResult(std::move(result))); + + results.push_back(std::move(packed_result)); + } else if (action.type == ActionType::kGetSessionOptions.type) { + std::string_view body(*action.body); + ARROW_ASSIGN_OR_RAISE(auto request, GetSessionOptionsRequest::Deserialize(body)); + ARROW_ASSIGN_OR_RAISE(auto result, GetSessionOptions(context, request)); + ARROW_ASSIGN_OR_RAISE(auto packed_result, PackActionResult(std::move(result))); + + results.push_back(std::move(packed_result)); + } else if (action.type == ActionType::kCloseSession.type) { + std::string_view body(*action.body); + ARROW_ASSIGN_OR_RAISE(auto request, CloseSessionRequest::Deserialize(body)); + ARROW_ASSIGN_OR_RAISE(auto result, CloseSession(context, request)); + ARROW_ASSIGN_OR_RAISE(auto 
packed_result, PackActionResult(std::move(result))); + results.push_back(std::move(packed_result)); } else { google::protobuf::Any any; @@ -1098,6 +1135,11 @@ arrow::Result FlightSqlServerBase::RenewFlightEndpoint( return Status::NotImplemented("RenewFlightEndpoint not implemented"); } +arrow::Result FlightSqlServerBase::CloseSession( + const ServerCallContext& context, const CloseSessionRequest& request) { + return Status::NotImplemented("CloseSession not implemented"); +} + arrow::Result FlightSqlServerBase::CreatePreparedStatement( const ServerCallContext& context, @@ -1128,6 +1170,16 @@ Status FlightSqlServerBase::EndTransaction(const ServerCallContext& context, return Status::NotImplemented("EndTransaction not implemented"); } +arrow::Result FlightSqlServerBase::SetSessionOptions( + const ServerCallContext& context, const SetSessionOptionsRequest& request) { + return Status::NotImplemented("SetSessionOptions not implemented"); +} + +arrow::Result FlightSqlServerBase::GetSessionOptions( + const ServerCallContext& context, const GetSessionOptionsRequest& request) { + return Status::NotImplemented("GetSessionOptions not implemented"); +} + Status FlightSqlServerBase::DoPutPreparedStatementQuery( const ServerCallContext& context, const PreparedStatementQuery& command, FlightMessageReader* reader, FlightMetadataWriter* writer) { diff --git a/cpp/src/arrow/flight/sql/server.h b/cpp/src/arrow/flight/sql/server.h index 24f0aa2bd48cf..df46004474a39 100644 --- a/cpp/src/arrow/flight/sql/server.h +++ b/cpp/src/arrow/flight/sql/server.h @@ -20,6 +20,7 @@ #pragma once +#include #include #include #include @@ -601,6 +602,24 @@ class ARROW_FLIGHT_SQL_EXPORT FlightSqlServerBase : public FlightServerBase { virtual arrow::Result CancelFlightInfo( const ServerCallContext& context, const CancelFlightInfoRequest& request); + /// \brief Set server session option(s). + /// \param[in] context The call context. + /// \param[in] request The session options to set. 
+ virtual arrow::Result SetSessionOptions( + const ServerCallContext& context, const SetSessionOptionsRequest& request); + + /// \brief Get server session option(s). + /// \param[in] context The call context. + /// \param[in] request Request object. + virtual arrow::Result GetSessionOptions( + const ServerCallContext& context, const GetSessionOptionsRequest& request); + + /// \brief Close/invalidate the session. + /// \param[in] context The call context. + /// \param[in] request Request object. + virtual arrow::Result CloseSession( + const ServerCallContext& context, const CloseSessionRequest& request); + /// \brief Attempt to explicitly cancel a query. /// /// \param[in] context The call context. diff --git a/cpp/src/arrow/flight/sql/server_session_middleware.cc b/cpp/src/arrow/flight/sql/server_session_middleware.cc new file mode 100644 index 0000000000000..f3e02de232444 --- /dev/null +++ b/cpp/src/arrow/flight/sql/server_session_middleware.cc @@ -0,0 +1,235 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include + +#include "arrow/flight/sql/server_session_middleware.h" +#include "arrow/flight/sql/server_session_middleware_factory.h" + +namespace arrow { +namespace flight { +namespace sql { + +class ServerSessionMiddlewareImpl : public ServerSessionMiddleware { + protected: + std::shared_mutex mutex_; + ServerSessionMiddlewareFactory* factory_; + const CallHeaders& headers_; + std::shared_ptr session_; + std::string session_id_; + std::string closed_session_id_; + bool existing_session_; + + public: + ServerSessionMiddlewareImpl(ServerSessionMiddlewareFactory* factory, + const CallHeaders& headers) + : factory_(factory), headers_(headers), existing_session_(false) {} + + ServerSessionMiddlewareImpl(ServerSessionMiddlewareFactory* factory, + const CallHeaders& headers, + std::shared_ptr session, + std::string session_id, bool existing_session = true) + : factory_(factory), + headers_(headers), + session_(std::move(session)), + session_id_(std::move(session_id)), + existing_session_(existing_session) {} + + void SendingHeaders(AddCallHeaders* add_call_headers) override { + if (!existing_session_ && session_) { + add_call_headers->AddHeader( + "set-cookie", static_cast(kSessionCookieName) + "=" + session_id_); + } + if (!closed_session_id_.empty()) { + add_call_headers->AddHeader( + "set-cookie", static_cast(kSessionCookieName) + "=" + session_id_ + + "; Max-Age=0"); + } + } + + void CallCompleted(const Status&) override {} + + bool HasSession() const override { return static_cast(session_); } + + arrow::Result> GetSession() override { + const std::lock_guard l(mutex_); + if (!session_) { + auto [id, s] = factory_->CreateNewSession(); + session_ = std::move(s); + session_id_ = std::move(id); + } + if (!static_cast(session_)) { + return Status::UnknownError("Error creating session."); + } + return session_; + } + + Status CloseSession() override { + const std::lock_guard l(mutex_); + if (static_cast(session_)) { + return Status::Invalid("Nonexistent session cannot 
be closed."); + } + ARROW_RETURN_NOT_OK(factory_->CloseSession(session_id_)); + closed_session_id_ = std::move(session_id_); + session_id_.clear(); + session_.reset(); + existing_session_ = false; + + return Status::OK(); + } + + const CallHeaders& GetCallHeaders() const override { return headers_; } +}; + +std::vector> +ServerSessionMiddlewareFactory::ParseCookieString(const std::string_view& s) { + const std::string list_sep = "; "; + const std::string pair_sep = "="; + + std::vector> result; + + size_t cur = 0; + while (cur < s.length()) { + const size_t end = s.find(list_sep, cur); + const bool further_pairs = end != std::string::npos; + const size_t len = further_pairs ? end - cur : std::string::npos; + const std::string_view tok = s.substr(cur, len); + cur = further_pairs ? end + list_sep.length() : s.length(); + + const size_t val_pos = tok.find(pair_sep); + if (val_pos == std::string::npos) { + // The cookie header is somewhat malformed; ignore the key and continue parsing + continue; + } + const std::string_view cookie_name = tok.substr(0, val_pos); + std::string_view cookie_value = + tok.substr(val_pos + pair_sep.length(), std::string::npos); + if (cookie_name.empty()) { + continue; + } + // Strip doublequotes + if (cookie_value.length() >= 2 && cookie_value.front() == '"' && + cookie_value.back() == '"') { + cookie_value.remove_prefix(1); + cookie_value.remove_suffix(1); + } + result.emplace_back(cookie_name, cookie_value); + } + + return result; +} + +Status ServerSessionMiddlewareFactory::StartCall( + const CallInfo&, const CallHeaders& incoming_headers, + std::shared_ptr* middleware) { + std::string session_id; + + const std::pair& + headers_it_pr = incoming_headers.equal_range("cookie"); + for (auto itr = headers_it_pr.first; itr != headers_it_pr.second; ++itr) { + const std::string_view& cookie_header = itr->second; + const std::vector> cookies = + ParseCookieString(cookie_header); + for (const std::pair& cookie : cookies) { + if (cookie.first == 
kSessionCookieName) { + if (cookie.second.empty()) + return Status::Invalid("Empty ", kSessionCookieName, " cookie value."); + session_id = std::move(cookie.second); + } + } + if (!session_id.empty()) break; + } + + if (session_id.empty()) { + // No cookie was found + // Temporary workaround until middleware handling fixed + auto [id, s] = CreateNewSession(); + *middleware = std::make_shared(this, incoming_headers, + std::move(s), id, false); + } else { + const std::shared_lock l(session_store_lock_); + if (auto it = session_store_.find(session_id); it == session_store_.end()) { + return Status::Invalid("Invalid or expired ", kSessionCookieName, " cookie."); + } else { + auto session = it->second; + *middleware = std::make_shared( + this, incoming_headers, std::move(session), session_id); + } + } + + return Status::OK(); +} + +/// \brief Get a new, empty session option map & its id key; {"",NULLPTR} on collision. +std::pair> +ServerSessionMiddlewareFactory::CreateNewSession() { + auto new_id = id_generator_(); + auto session = std::make_shared(); + + const std::lock_guard l(session_store_lock_); + if (session_store_.count(new_id)) { + // Collision + return {"", NULLPTR}; + } + session_store_[new_id] = session; + + return {new_id, session}; +} + +Status ServerSessionMiddlewareFactory::CloseSession(std::string id) { + const std::lock_guard l(session_store_lock_); + if (!session_store_.erase(id)) { + return Status::KeyError("Invalid or nonexistent session cannot be closed."); + } + return Status::OK(); +} + +std::shared_ptr MakeServerSessionMiddlewareFactory( + std::function id_gen) { + return std::make_shared(std::move(id_gen)); +} + +std::optional FlightSession::GetSessionOption( + const std::string& name) { + const std::shared_lock l(map_lock_); + auto it = map_.find(name); + if (it != map_.end()) { + return it->second; + } else { + return std::nullopt; + } +} + +std::map FlightSession::GetSessionOptions() { + const std::shared_lock l(map_lock_); + return map_; +} 
+ +void FlightSession::SetSessionOption(const std::string& name, + const SessionOptionValue value) { + const std::lock_guard l(map_lock_); + map_[name] = std::move(value); +} + +void FlightSession::EraseSessionOption(const std::string& name) { + const std::lock_guard l(map_lock_); + map_.erase(name); +} + +} // namespace sql +} // namespace flight +} // namespace arrow diff --git a/cpp/src/arrow/flight/sql/server_session_middleware.h b/cpp/src/arrow/flight/sql/server_session_middleware.h new file mode 100644 index 0000000000000..021793de3de32 --- /dev/null +++ b/cpp/src/arrow/flight/sql/server_session_middleware.h @@ -0,0 +1,89 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Middleware for handling Flight SQL Sessions including session cookie handling. +// Currently experimental. 
+ +#pragma once + +#include +#include +#include +#include +#include + +#include "arrow/flight/server_middleware.h" +#include "arrow/flight/sql/types.h" +#include "arrow/status.h" + +namespace arrow { +namespace flight { +namespace sql { + +static constexpr char const kSessionCookieName[] = "arrow_flight_session_id"; + +class ARROW_FLIGHT_SQL_EXPORT FlightSession { + protected: + std::map map_; + std::shared_mutex map_lock_; + + public: + /// \brief Get session option by name + std::optional GetSessionOption(const std::string& name); + /// \brief Get a copy of the session options map. + /// + /// The returned options map may be modified by further calls to this FlightSession + std::map GetSessionOptions(); + /// \brief Set session option by name to given value + void SetSessionOption(const std::string& name, const SessionOptionValue value); + /// \brief Idempotently remove name from this session + void EraseSessionOption(const std::string& name); +}; + +/// \brief A middleware to handle session option persistence and related cookie headers. +/// +/// WARNING that client cookie invalidation does not currently work due to a gRPC +/// transport bug. +class ARROW_FLIGHT_SQL_EXPORT ServerSessionMiddleware : public ServerMiddleware { + public: + static constexpr char const kMiddlewareName[] = + "arrow::flight::sql::ServerSessionMiddleware"; + + std::string name() const override { return kMiddlewareName; } + + /// \brief Is there an existing session (either existing or new) + virtual bool HasSession() const = 0; + /// \brief Get existing or new call-associated session + /// + /// May return NULLPTR if there is an id generation collision. + virtual arrow::Result> GetSession() = 0; + /// Close the current session. + /// + /// This is presently unsupported in C++ until middleware handling can be fixed. + virtual Status CloseSession() = 0; + /// \brief Get request headers, in lieu of a provided or created session. 
+ virtual const CallHeaders& GetCallHeaders() const = 0; +}; + +/// \brief Returns a ServerMiddlewareFactory that handles session option storage. +/// \param[in] id_gen A thread-safe, collision-free generator for session id strings. +ARROW_FLIGHT_SQL_EXPORT std::shared_ptr +MakeServerSessionMiddlewareFactory(std::function id_gen); + +} // namespace sql +} // namespace flight +} // namespace arrow diff --git a/cpp/src/arrow/flight/sql/server_session_middleware_factory.h b/cpp/src/arrow/flight/sql/server_session_middleware_factory.h new file mode 100644 index 0000000000000..2613c572eefc2 --- /dev/null +++ b/cpp/src/arrow/flight/sql/server_session_middleware_factory.h @@ -0,0 +1,61 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// ServerSessionMiddlewareFactory, factored into a separate header for testability + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include + +namespace arrow { +namespace flight { +namespace sql { + +/// \brief A factory for ServerSessionMiddleware, itself storing session data. 
+class ARROW_FLIGHT_SQL_EXPORT ServerSessionMiddlewareFactory + : public ServerMiddlewareFactory { + protected: + std::map> session_store_; + std::shared_mutex session_store_lock_; + std::function id_generator_; + + static std::vector> ParseCookieString( + const std::string_view& s); + + public: + explicit ServerSessionMiddlewareFactory(std::function id_gen) + : id_generator_(id_gen) {} + Status StartCall(const CallInfo&, const CallHeaders& incoming_headers, + std::shared_ptr* middleware) override; + + /// \brief Get a new, empty session option map and its id key. + std::pair> CreateNewSession(); + /// \brief Close the session identified by 'id'. + /// \param id The string id of the session to close. + Status CloseSession(std::string id); +}; + +} // namespace sql +} // namespace flight +} // namespace arrow diff --git a/cpp/src/arrow/flight/sql/server_session_middleware_internals_test.cc b/cpp/src/arrow/flight/sql/server_session_middleware_internals_test.cc new file mode 100644 index 0000000000000..74e4d7845c699 --- /dev/null +++ b/cpp/src/arrow/flight/sql/server_session_middleware_internals_test.cc @@ -0,0 +1,45 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +// ---------------------------------------------------------------------- +// ServerSessionMiddleware{,Factory} tests not involing a client/server instance + +#include + +#include + +namespace arrow { +namespace flight { +namespace sql { + +class ServerSessionMiddlewareFactoryPrivate : public ServerSessionMiddlewareFactory { + public: + using ServerSessionMiddlewareFactory::ParseCookieString; +}; + +TEST(ServerSessionMiddleware, ParseCookieString) { + std::vector> r1 = + ServerSessionMiddlewareFactoryPrivate::ParseCookieString( + "k1=v1; k2=\"v2\"; kempty=; k3=v3"); + std::vector> e1 = { + {"k1", "v1"}, {"k2", "v2"}, {"kempty", ""}, {"k3", "v3"}}; + ASSERT_EQ(e1, r1); +} + +} // namespace sql +} // namespace flight +} // namespace arrow diff --git a/cpp/src/arrow/flight/transport/grpc/grpc_server.cc b/cpp/src/arrow/flight/transport/grpc/grpc_server.cc index bbd01155fe4a4..a9780b5eeb77e 100644 --- a/cpp/src/arrow/flight/transport/grpc/grpc_server.cc +++ b/cpp/src/arrow/flight/transport/grpc/grpc_server.cc @@ -290,7 +290,8 @@ class GrpcServiceHandler final : public FlightService::Service { // Authenticate the client (if applicable) and construct the call context ::grpc::Status CheckAuth(const FlightMethod& method, ServerContext* context, - GrpcServerCallContext& flight_context) { + GrpcServerCallContext& flight_context, + bool skip_headers = false) { if (!auth_handler_) { const auto auth_context = context->auth_context(); if (auth_context && auth_context->IsPeerAuthenticated()) { @@ -320,11 +321,11 @@ class GrpcServiceHandler final : public FlightService::Service { // Authenticate the client (if applicable) and construct the call context ::grpc::Status MakeCallContext(const FlightMethod& method, ServerContext* context, - GrpcServerCallContext& flight_context) { + GrpcServerCallContext& flight_context, + bool skip_headers = false) { // Run server middleware const CallInfo info{method}; - GrpcAddServerHeaders outgoing_headers(context); for (const auto& factory : 
middleware_) { std::shared_ptr instance; Status result = factory.second->StartCall(info, flight_context, &instance); @@ -336,13 +337,25 @@ class GrpcServiceHandler final : public FlightService::Service { if (instance != nullptr) { flight_context.middleware_.push_back(instance); flight_context.middleware_map_.insert({factory.first, instance}); - instance->SendingHeaders(&outgoing_headers); } } + // TODO factor this out after fixing all streaming and non-streaming handlers + if (!skip_headers) { + addMiddlewareHeaders(context, flight_context); + } + return ::grpc::Status::OK; } + void addMiddlewareHeaders(ServerContext* context, + GrpcServerCallContext& flight_context) { + GrpcAddServerHeaders outgoing_headers(context); + for (const std::shared_ptr& instance : flight_context.middleware_) { + instance->SendingHeaders(&outgoing_headers); + } + } + ::grpc::Status Handshake( ServerContext* context, ::grpc::ServerReaderWriter* stream) { @@ -399,8 +412,9 @@ class GrpcServiceHandler final : public FlightService::Service { SERVICE_RETURN_NOT_OK(flight_context, internal::FromProto(*request, &descr)); std::unique_ptr info; - SERVICE_RETURN_NOT_OK(flight_context, - impl_->base()->GetFlightInfo(flight_context, descr, &info)); + auto res = impl_->base()->GetFlightInfo(flight_context, descr, &info); + addMiddlewareHeaders(context, flight_context); + SERVICE_RETURN_NOT_OK(flight_context, res); if (!info) { // Treat null listing as no flights available diff --git a/cpp/src/arrow/flight/types.cc b/cpp/src/arrow/flight/types.cc index 1d43c41b69d9f..11b2baafad220 100644 --- a/cpp/src/arrow/flight/types.cc +++ b/cpp/src/arrow/flight/types.cc @@ -17,9 +17,11 @@ #include "arrow/flight/types.h" +#include #include #include #include +#include #include #include "arrow/buffer.h" @@ -473,6 +475,352 @@ arrow::Result CancelFlightInfoRequest::Deserialize( return out; } +static const char* const SetSessionOptionStatusNames[] = {"Unspecified", "InvalidName", + "InvalidValue", "Error"}; +static const 
char* const CloseSessionStatusNames[] = {"Unspecified", "Closed", "Closing", + "NotClosable"}; + +// Helpers for stringifying maps containing various types +std::string ToString(const SetSessionOptionErrorValue& error_value) { + return SetSessionOptionStatusNames[static_cast(error_value)]; +} + +std::ostream& operator<<(std::ostream& os, + const SetSessionOptionErrorValue& error_value) { + os << ToString(error_value); + return os; +} + +std::string ToString(const CloseSessionStatus& status) { + return CloseSessionStatusNames[static_cast(status)]; +} + +std::ostream& operator<<(std::ostream& os, const CloseSessionStatus& status) { + os << ToString(status); + return os; +} + +std::ostream& operator<<(std::ostream& os, std::vector values) { + os << '['; + std::string sep = ""; + for (const auto& v : values) { + os << sep << std::quoted(v); + sep = ", "; + } + os << ']'; + + return os; +} + +std::ostream& operator<<(std::ostream& os, const SessionOptionValue& v) { + if (std::holds_alternative(v)) { + os << ""; + } else { + std::visit( + [&](const auto& x) { + if constexpr (std::is_convertible_v, + std::string_view>) { + os << std::quoted(x); + } else { + os << x; + } + }, + v); + } + return os; +} + +std::ostream& operator<<(std::ostream& os, const SetSessionOptionsResult::Error& e) { + os << '{' << e.value << '}'; + return os; +} + +template +std::ostream& operator<<(std::ostream& os, std::map m) { + os << '{'; + std::string sep = ""; + if constexpr (std::is_convertible_v) { + // std::string, char*, std::string_view + for (const auto& [k, v] : m) { + os << sep << '[' << k << "]: " << std::quoted(v) << '"'; + sep = ", "; + } + } else { + for (const auto& [k, v] : m) { + os << sep << '[' << k << "]: " << v; + sep = ", "; + } + } + os << '}'; + + return os; +} + +namespace { +static bool CompareSessionOptionMaps(const std::map& a, + const std::map& b) { + if (a.size() != b.size()) { + return false; + } + for (const auto& [k, v] : a) { + if (const auto it = b.find(k); it 
== b.end()) { + return false; + } else { + const auto& b_v = it->second; + if (v.index() != b_v.index()) { + return false; + } + if (v != b_v) { + return false; + } + } + } + return true; +} +} // namespace + +// SetSessionOptionsRequest + +std::string SetSessionOptionsRequest::ToString() const { + std::stringstream ss; + + ss << " SetSessionOptionsRequest::Deserialize( + std::string_view serialized) { + // TODO these & SerializeToString should all be factored out to a superclass + pb::SetSessionOptionsRequest pb_request; + if (serialized.size() > static_cast(std::numeric_limits::max())) { + return Status::Invalid( + "Serialized SetSessionOptionsRequest size should not exceed 2 GiB"); + } + google::protobuf::io::ArrayInputStream input(serialized.data(), + static_cast(serialized.size())); + if (!pb_request.ParseFromZeroCopyStream(&input)) { + return Status::Invalid("Not a valid SetSessionOptionsRequest"); + } + SetSessionOptionsRequest out; + RETURN_NOT_OK(internal::FromProto(pb_request, &out)); + return out; +} + +// SetSessionOptionsResult + +std::string SetSessionOptionsResult::ToString() const { + std::stringstream ss; + + ss << " SetSessionOptionsResult::Deserialize( + std::string_view serialized) { + pb::SetSessionOptionsResult pb_result; + if (serialized.size() > static_cast(std::numeric_limits::max())) { + return Status::Invalid( + "Serialized SetSessionOptionsResult size should not exceed 2 GiB"); + } + google::protobuf::io::ArrayInputStream input(serialized.data(), + static_cast(serialized.size())); + if (!pb_result.ParseFromZeroCopyStream(&input)) { + return Status::Invalid("Not a valid SetSessionOptionsResult"); + } + SetSessionOptionsResult out; + RETURN_NOT_OK(internal::FromProto(pb_result, &out)); + return out; +} + +// GetSessionOptionsRequest + +std::string GetSessionOptionsRequest::ToString() const { + return ""; +} + +bool GetSessionOptionsRequest::Equals(const GetSessionOptionsRequest& other) const { + return true; +} + +arrow::Result 
GetSessionOptionsRequest::SerializeToString() const { + pb::GetSessionOptionsRequest pb_request; + RETURN_NOT_OK(internal::ToProto(*this, &pb_request)); + + std::string out; + if (!pb_request.SerializeToString(&out)) { + return Status::IOError("Serialized GetSessionOptionsRequest exceeded 2GiB limit"); + } + return out; +} + +arrow::Result GetSessionOptionsRequest::Deserialize( + std::string_view serialized) { + pb::GetSessionOptionsRequest pb_request; + if (serialized.size() > static_cast(std::numeric_limits::max())) { + return Status::Invalid( + "Serialized GetSessionOptionsRequest size should not exceed 2 GiB"); + } + google::protobuf::io::ArrayInputStream input(serialized.data(), + static_cast(serialized.size())); + if (!pb_request.ParseFromZeroCopyStream(&input)) { + return Status::Invalid("Not a valid GetSessionOptionsRequest"); + } + GetSessionOptionsRequest out; + RETURN_NOT_OK(internal::FromProto(pb_request, &out)); + return out; +} + +// GetSessionOptionsResult + +std::string GetSessionOptionsResult::ToString() const { + std::stringstream ss; + + ss << " GetSessionOptionsResult::Deserialize( + std::string_view serialized) { + pb::GetSessionOptionsResult pb_result; + if (serialized.size() > static_cast(std::numeric_limits::max())) { + return Status::Invalid( + "Serialized GetSessionOptionsResult size should not exceed 2 GiB"); + } + google::protobuf::io::ArrayInputStream input(serialized.data(), + static_cast(serialized.size())); + if (!pb_result.ParseFromZeroCopyStream(&input)) { + return Status::Invalid("Not a valid GetSessionOptionsResult"); + } + GetSessionOptionsResult out; + RETURN_NOT_OK(internal::FromProto(pb_result, &out)); + return out; +} + +// CloseSessionRequest + +std::string CloseSessionRequest::ToString() const { return ""; } + +bool CloseSessionRequest::Equals(const CloseSessionRequest& other) const { return true; } + +arrow::Result CloseSessionRequest::SerializeToString() const { + pb::CloseSessionRequest pb_request; + 
RETURN_NOT_OK(internal::ToProto(*this, &pb_request)); + + std::string out; + if (!pb_request.SerializeToString(&out)) { + return Status::IOError("Serialized CloseSessionRequest exceeded 2GiB limit"); + } + return out; +} + +arrow::Result CloseSessionRequest::Deserialize( + std::string_view serialized) { + pb::CloseSessionRequest pb_request; + if (serialized.size() > static_cast(std::numeric_limits::max())) { + return Status::Invalid("Serialized CloseSessionRequest size should not exceed 2 GiB"); + } + google::protobuf::io::ArrayInputStream input(serialized.data(), + static_cast(serialized.size())); + if (!pb_request.ParseFromZeroCopyStream(&input)) { + return Status::Invalid("Not a valid CloseSessionRequest"); + } + CloseSessionRequest out; + RETURN_NOT_OK(internal::FromProto(pb_request, &out)); + return out; +} + +// CloseSessionResult + +std::string CloseSessionResult::ToString() const { + std::stringstream ss; + + ss << " CloseSessionResult::Deserialize( + std::string_view serialized) { + pb::CloseSessionResult pb_result; + if (serialized.size() > static_cast(std::numeric_limits::max())) { + return Status::Invalid("Serialized CloseSessionResult size should not exceed 2 GiB"); + } + google::protobuf::io::ArrayInputStream input(serialized.data(), + static_cast(serialized.size())); + if (!pb_result.ParseFromZeroCopyStream(&input)) { + return Status::Invalid("Not a valid CloseSessionResult"); + } + CloseSessionResult out; + RETURN_NOT_OK(internal::FromProto(pb_result, &out)); + return out; +} + Location::Location() { uri_ = std::make_shared(); } arrow::Result Location::Parse(const std::string& uri_string) { @@ -648,6 +996,21 @@ const ActionType ActionType::kRenewFlightEndpoint = "Extend expiration time of the given FlightEndpoint.\n" "Request Message: RenewFlightEndpointRequest\n" "Response Message: Renewed FlightEndpoint"}; +const ActionType ActionType::kSetSessionOptions = + ActionType{"SetSessionOptions", + "Set client session options by name/value pairs.\n" + 
"Request Message: SetSessionOptionsRequest\n" + "Response Message: SetSessionOptionsResult"}; +const ActionType ActionType::kGetSessionOptions = + ActionType{"GetSessionOptions", + "Get current client session options\n" + "Request Message: GetSessionOptionsRequest\n" + "Response Message: GetSessionOptionsResult"}; +const ActionType ActionType::kCloseSession = + ActionType{"CloseSession", + "Explicitly close/invalidate the cookie-specified client session.\n" + "Request Message: CloseSessionRequest\n" + "Response Message: CloseSessionResult"}; bool ActionType::Equals(const ActionType& other) const { return type == other.type && description == other.description; diff --git a/cpp/src/arrow/flight/types.h b/cpp/src/arrow/flight/types.h index 790a2067dd705..4b17149aa2d46 100644 --- a/cpp/src/arrow/flight/types.h +++ b/cpp/src/arrow/flight/types.h @@ -28,6 +28,7 @@ #include #include #include +#include #include #include "arrow/flight/type_fwd.h" @@ -184,6 +185,9 @@ struct ARROW_FLIGHT_EXPORT ActionType { static const ActionType kCancelFlightInfo; static const ActionType kRenewFlightEndpoint; + static const ActionType kSetSessionOptions; + static const ActionType kGetSessionOptions; + static const ActionType kCloseSession; }; /// \brief Opaque selection criteria for ListFlights RPC @@ -761,6 +765,199 @@ struct ARROW_FLIGHT_EXPORT CancelFlightInfoRequest { static arrow::Result Deserialize(std::string_view serialized); }; +/// \brief Variant supporting all possible value types for {Set,Get}SessionOptions +/// +/// By convention, an attempt to set a valueless (std::monostate) SessionOptionValue +/// should attempt to unset or clear the named option value on the server. +using SessionOptionValue = std::variant>; + +/// \brief The result of setting a session option. +enum class SetSessionOptionErrorValue : int8_t { + /// \brief The status of setting the option is unknown. 
+ /// + /// Servers should avoid using this value (send a NOT_FOUND error if the requested + /// session is not known). Clients can retry the request. + kUnspecified, + /// \brief The given session option name is invalid. + kInvalidName, + /// \brief The session option value or type is invalid. + kInvalidValue, + /// \brief The session option cannot be set. + kError +}; +std::string ToString(const SetSessionOptionErrorValue& error_value); +std::ostream& operator<<(std::ostream& os, const SetSessionOptionErrorValue& error_value); + +/// \brief The result of closing a session. +enum class CloseSessionStatus : int8_t { + // \brief The session close status is unknown. + // + // Servers should avoid using this value (send a NOT_FOUND error if the requested + // session is not known). Clients can retry the request. + kUnspecified, + // \brief The session close request is complete. + // + // Subsequent requests with the same session produce a NOT_FOUND error. + kClosed, + // \brief The session close request is in progress. + // + // The client may retry the request. + kClosing, + // \brief The session is not closeable. + // + // The client should not retry the request. + kNotClosable +}; +std::string ToString(const CloseSessionStatus& status); +std::ostream& operator<<(std::ostream& os, const CloseSessionStatus& status); + +/// \brief A request to set a set of session options by name/value. +struct ARROW_FLIGHT_EXPORT SetSessionOptionsRequest { + std::map session_options; + + std::string ToString() const; + bool Equals(const SetSessionOptionsRequest& other) const; + + friend bool operator==(const SetSessionOptionsRequest& left, + const SetSessionOptionsRequest& right) { + return left.Equals(right); + } + friend bool operator!=(const SetSessionOptionsRequest& left, + const SetSessionOptionsRequest& right) { + return !(left == right); + } + + /// \brief Serialize this message to its wire-format representation. 
+ arrow::Result SerializeToString() const; + + /// \brief Deserialize this message from its wire-format representation. + static arrow::Result Deserialize(std::string_view serialized); +}; + +/// \brief The result(s) of setting session option(s). +struct ARROW_FLIGHT_EXPORT SetSessionOptionsResult { + struct Error { + SetSessionOptionErrorValue value; + + bool Equals(const Error& other) const { return value == other.value; } + friend bool operator==(const Error& left, const Error& right) { + return left.Equals(right); + } + friend bool operator!=(const Error& left, const Error& right) { + return !(left == right); + } + }; + + std::map errors; + + std::string ToString() const; + bool Equals(const SetSessionOptionsResult& other) const; + + friend bool operator==(const SetSessionOptionsResult& left, + const SetSessionOptionsResult& right) { + return left.Equals(right); + } + friend bool operator!=(const SetSessionOptionsResult& left, + const SetSessionOptionsResult& right) { + return !(left == right); + } + + /// \brief Serialize this message to its wire-format representation. + arrow::Result SerializeToString() const; + + /// \brief Deserialize this message from its wire-format representation. + static arrow::Result Deserialize(std::string_view serialized); +}; + +/// \brief A request to get current session options. +struct ARROW_FLIGHT_EXPORT GetSessionOptionsRequest { + std::string ToString() const; + bool Equals(const GetSessionOptionsRequest& other) const; + + friend bool operator==(const GetSessionOptionsRequest& left, + const GetSessionOptionsRequest& right) { + return left.Equals(right); + } + friend bool operator!=(const GetSessionOptionsRequest& left, + const GetSessionOptionsRequest& right) { + return !(left == right); + } + + /// \brief Serialize this message to its wire-format representation. + arrow::Result SerializeToString() const; + + /// \brief Deserialize this message from its wire-format representation. 
+ static arrow::Result Deserialize(std::string_view serialized); +}; + +/// \brief The current session options. +struct ARROW_FLIGHT_EXPORT GetSessionOptionsResult { + std::map session_options; + + std::string ToString() const; + bool Equals(const GetSessionOptionsResult& other) const; + + friend bool operator==(const GetSessionOptionsResult& left, + const GetSessionOptionsResult& right) { + return left.Equals(right); + } + friend bool operator!=(const GetSessionOptionsResult& left, + const GetSessionOptionsResult& right) { + return !(left == right); + } + + /// \brief Serialize this message to its wire-format representation. + arrow::Result SerializeToString() const; + + /// \brief Deserialize this message from its wire-format representation. + static arrow::Result Deserialize(std::string_view serialized); +}; + +/// \brief A request to close the open client session. +struct ARROW_FLIGHT_EXPORT CloseSessionRequest { + std::string ToString() const; + bool Equals(const CloseSessionRequest& other) const; + + friend bool operator==(const CloseSessionRequest& left, + const CloseSessionRequest& right) { + return left.Equals(right); + } + friend bool operator!=(const CloseSessionRequest& left, + const CloseSessionRequest& right) { + return !(left == right); + } + + /// \brief Serialize this message to its wire-format representation. + arrow::Result SerializeToString() const; + + /// \brief Deserialize this message from its wire-format representation. + static arrow::Result Deserialize(std::string_view serialized); +}; + +/// \brief The result of attempting to close the client session. 
+struct ARROW_FLIGHT_EXPORT CloseSessionResult { + CloseSessionStatus status; + + std::string ToString() const; + bool Equals(const CloseSessionResult& other) const; + + friend bool operator==(const CloseSessionResult& left, + const CloseSessionResult& right) { + return left.Equals(right); + } + friend bool operator!=(const CloseSessionResult& left, + const CloseSessionResult& right) { + return !(left == right); + } + + /// \brief Serialize this message to its wire-format representation. + arrow::Result SerializeToString() const; + + /// \brief Deserialize this message from its wire-format representation. + static arrow::Result Deserialize(std::string_view serialized); +}; + /// \brief An iterator to FlightInfo instances returned by ListFlights. class ARROW_FLIGHT_EXPORT FlightListing { public: diff --git a/dev/archery/archery/integration/runner.py b/dev/archery/archery/integration/runner.py index 299983f62f283..e984468bc5052 100644 --- a/dev/archery/archery/integration/runner.py +++ b/dev/archery/archery/integration/runner.py @@ -608,6 +608,11 @@ def run_all_tests(with_cpp=True, with_java=True, with_js=True, "RenewFlightEndpoint are working as expected."), skip_testers={"JS", "C#", "Rust"}, ), + Scenario( + "session_options", + description="Ensure Flight SQL Sessions work as expected.", + skip_testers={"JS", "C#", "Rust", "Go"} + ), Scenario( "poll_flight_info", description="Ensure PollFlightInfo is supported.", diff --git a/docs/source/format/FlightSql.rst b/docs/source/format/FlightSql.rst index add044c2d3621..6bb917271366c 100644 --- a/docs/source/format/FlightSql.rst +++ b/docs/source/format/FlightSql.rst @@ -170,6 +170,47 @@ the ``type`` should be ``ClosePreparedStatement``). When used with DoPut: execute the query and return the number of affected rows. +Flight Server Session Management +-------------------------------- + +Flight SQL provides commands to set and update server session variables +which affect the server behaviour in various ways. 
Common options may +include (depending on the server implementation) ``catalog`` and +``schema``, indicating the currently-selected catalog and schema for +queries to be run against. + +Clients should prefer, where possible, setting options prior to issuing +queries and other commands, as some server implementations may require +these options be set exactly once and prior to any other activity which +may trigger their implicit setting. + +For compatibility with Database Connectivity drivers (JDBC, ODBC, and +others), it is strongly recommended that server implementations accept +string representations of all option values which may be provided to the +driver as part of a server connection string and passed through to the +server without further conversion. For ease of use it is also recommended +to accept and convert other numeric types to the preferred type for an +option value, however this is not required. + +Sessions are persisted between the client and server using an +implementation-defined mechanism, which is typically RFC 6265 cookies. +Servers may also combine other connection state opaquely with the +session token: Consider that the lifespan and semantics of a session +should make sense for any additional uses, e.g. CloseSession would also +invalidate any authentication context persisted via the session context. +A session may be initiated upon a nonempty (or empty) SetSessionOptions +call, or at any other time of the server's choosing. + +``SetSessionOptions`` +Set server session option(s) by name/value. + +``GetSessionOptions`` +Get the current server session options, including those set by the client +and any defaulted or implicitly set by the server. + +``CloseSession`` +Close and invalidate the current session context. 
+ Sequence Diagrams ================= diff --git a/format/Flight.proto b/format/Flight.proto index de3794f05ba83..59714108e1cbc 100644 --- a/format/Flight.proto +++ b/format/Flight.proto @@ -525,3 +525,117 @@ message FlightData { message PutResult { bytes app_metadata = 1; } + +/* + * EXPERIMENTAL: Union of possible value types for a Session Option to be set to. + * + * By convention, an attempt to set a valueless SessionOptionValue should + * attempt to unset or clear the named option value on the server. + */ +message SessionOptionValue { + message StringListValue { + repeated string values = 1; + } + + oneof option_value { + string string_value = 1; + bool bool_value = 2; + sfixed64 int64_value = 3; + double double_value = 4; + StringListValue string_list_value = 5; + } +} + +/* + * EXPERIMENTAL: A request to set session options for an existing or new (implicit) + * server session. + * + * Sessions are persisted and referenced via a transport-level state management, typically + * RFC 6265 HTTP cookies when using an HTTP transport. The suggested cookie name or state + * context key is 'arrow_flight_session_id', although implementations may freely choose their + * own name. + * + * Session creation (if one does not already exist) is implied by this RPC request, however + * server implementations may choose to initiate a session that also contains client-provided + * session options at any other time, e.g. on authentication, or when any other call is made + * and the server wishes to use a session to persist any state (or lack thereof). + */ +message SetSessionOptionsRequest { + map session_options = 1; +} + +/* + * EXPERIMENTAL: The results (individually) of setting a set of session options. + * + * Option names should only be present in the response if they were not successfully + * set on the server; that is, a response without an Error for a name provided in the + * SetSessionOptionsRequest implies that the named option value was set successfully. 
+ */
+message SetSessionOptionsResult {
+  enum ErrorValue {
+    // Protobuf deserialization fallback value: The status is unknown or unrecognized.
+    // Servers should avoid using this value. The request may be retried by the client.
+    UNSPECIFIED = 0;
+    // The given session option name is invalid.
+    INVALID_NAME = 1;
+    // The session option value or type is invalid.
+    INVALID_VALUE = 2;
+    // The session option cannot be set.
+    ERROR = 3;
+  }
+
+  message Error {
+    ErrorValue value = 1;
+  }
+
+  map<string, Error> errors = 1;
+}
+
+/*
+ * EXPERIMENTAL: A request to access the session options for the current server session.
+ *
+ * The existing session is referenced via a cookie header or similar (see
+ * SetSessionOptionsRequest above); it is an error to make this request with a missing,
+ * invalid, or expired session cookie header or other implementation-defined session
+ * reference token.
+ */
+message GetSessionOptionsRequest {
+}
+
+/*
+ * EXPERIMENTAL: The result containing the current server session options.
+ */
+message GetSessionOptionsResult {
+  map<string, SessionOptionValue> session_options = 1;
+}
+
+/*
+ * Request message for the "Close Session" action.
+ *
+ * The existing session is referenced via a cookie header.
+ */
+message CloseSessionRequest {
+}
+
+/*
+ * The result of closing a session.
+ */
+message CloseSessionResult {
+  enum Status {
+    // Protobuf deserialization fallback value: The session close status is unknown or
+    // not recognized. Servers should avoid using this value (send a NOT_FOUND error if
+    // the requested session is not known or expired). Clients can retry the request.
+    UNSPECIFIED = 0;
+    // The session close request is complete. Subsequent requests with
+    // the same session produce a NOT_FOUND error.
+    CLOSED = 1;
+    // The session close request is in progress. The client may retry
+    // the close request.
+    CLOSING = 2;
+    // The session is not closeable. The client should not retry the
+    // close request.
+ NOT_CLOSEABLE = 3; + } + + Status status = 1; +} diff --git a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/CloseSessionRequest.java b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/CloseSessionRequest.java new file mode 100644 index 0000000000000..29eb3664f6286 --- /dev/null +++ b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/CloseSessionRequest.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.flight; + +import java.io.IOException; +import java.nio.ByteBuffer; + +import org.apache.arrow.flight.impl.Flight; + +/** A request to close/invalidate a server session context. */ +public class CloseSessionRequest { + public CloseSessionRequest() { + } + + CloseSessionRequest(Flight.CloseSessionRequest proto) { + } + + Flight.CloseSessionRequest toProtocol() { + return Flight.CloseSessionRequest.getDefaultInstance(); + } + + /** + * Get the serialized form of this protocol message. + * + *

Intended to help interoperability by allowing non-Flight services to still return Flight types. + */ + public ByteBuffer serialize() { + return ByteBuffer.wrap(toProtocol().toByteArray()); + } + + /** + * Parse the serialized form of this protocol message. + * + *

Intended to help interoperability by allowing Flight clients to obtain stream info from non-Flight services. + * + * @param serialized The serialized form of the message, as returned by {@link #serialize()}. + * @return The deserialized message. + * @throws IOException if the serialized form is invalid. + */ + public static CloseSessionRequest deserialize(ByteBuffer serialized) throws IOException { + return new CloseSessionRequest(Flight.CloseSessionRequest.parseFrom(serialized)); + } +} diff --git a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/CloseSessionResult.java b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/CloseSessionResult.java new file mode 100644 index 0000000000000..c3710a14b108a --- /dev/null +++ b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/CloseSessionResult.java @@ -0,0 +1,106 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.flight; + +import java.io.IOException; +import java.nio.ByteBuffer; + +import org.apache.arrow.flight.impl.Flight; + +/** The result of attempting to close/invalidate a server session context. */ +public class CloseSessionResult { + /** + * Close operation result status values. 
+ */ + public enum Status { + /** + * The session close status is unknown. Servers should avoid using this value + * (send a NOT_FOUND error if the requested session is not known). Clients can + * retry the request. + */ + UNSPECIFIED, + /** + * The session close request is complete. + */ + CLOSED, + /** + * The session close request is in progress. The client may retry the request. + */ + CLOSING, + /** + * The session is not closeable. + */ + NOT_CLOSABLE, + ; + + public static Status fromProtocol(Flight.CloseSessionResult.Status proto) { + return values()[proto.getNumber()]; + } + + public Flight.CloseSessionResult.Status toProtocol() { + return Flight.CloseSessionResult.Status.values()[ordinal()]; + } + } + + private final Status status; + + public CloseSessionResult(Status status) { + this.status = status; + } + + CloseSessionResult(Flight.CloseSessionResult proto) { + status = Status.fromProtocol(proto.getStatus()); + if (status == null) { + // Unreachable + throw new IllegalArgumentException(""); + } + } + + public Status getStatus() { + return status; + } + + Flight.CloseSessionResult toProtocol() { + Flight.CloseSessionResult.Builder b = Flight.CloseSessionResult.newBuilder(); + b.setStatus(status.toProtocol()); + return b.build(); + } + + /** + * Get the serialized form of this protocol message. + * + *

Intended to help interoperability by allowing non-Flight services to still return Flight types. + */ + public ByteBuffer serialize() { + return ByteBuffer.wrap(toProtocol().toByteArray()); + } + + /** + * Parse the serialized form of this protocol message. + * + *

Intended to help interoperability by allowing Flight clients to obtain stream info from non-Flight services. + * + * @param serialized The serialized form of the message, as returned by {@link #serialize()}. + * @return The deserialized message. + * @throws IOException if the serialized form is invalid. + */ + public static CloseSessionResult deserialize(ByteBuffer serialized) throws IOException { + return new CloseSessionResult(Flight.CloseSessionResult.parseFrom(serialized)); + } + +} diff --git a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightClient.java b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightClient.java index 8f251a7c7ef07..980a762e397f9 100644 --- a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightClient.java +++ b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightClient.java @@ -580,6 +580,102 @@ public FlightEndpoint renewFlightEndpoint(RenewFlightEndpointRequest request, Ca return result; } + /** + * Set server session option(s) by name/value. + * + * Sessions are generally persisted via HTTP cookies. + * + * @param request The session options to set on the server. + * @param options Call options. + * @return The result containing per-value error statuses, if any. + */ + public SetSessionOptionsResult setSessionOptions(SetSessionOptionsRequest request, CallOption... 
options) { + Action action = new Action(FlightConstants.SET_SESSION_OPTIONS.getType(), request.serialize().array()); + Iterator results = doAction(action, options); + if (!results.hasNext()) { + throw CallStatus.INTERNAL + .withDescription("Server did not return a response") + .toRuntimeException(); + } + + SetSessionOptionsResult result; + try { + result = SetSessionOptionsResult.deserialize(ByteBuffer.wrap(results.next().getBody())); + } catch (IOException e) { + throw CallStatus.INTERNAL + .withDescription("Failed to parse server response: " + e) + .withCause(e) + .toRuntimeException(); + } + results.forEachRemaining((ignored) -> { + }); + return result; + } + + /** + * Get the current server session options. + * + * The session is generally accessed via an HTTP cookie. + * + * @param request The (empty) GetSessionOptionsRequest. + * @param options Call options. + * @return The result containing the set of session options configured on the server. + */ + public GetSessionOptionsResult getSessionOptions(GetSessionOptionsRequest request, CallOption... options) { + Action action = new Action(FlightConstants.GET_SESSION_OPTIONS.getType(), request.serialize().array()); + Iterator results = doAction(action, options); + if (!results.hasNext()) { + throw CallStatus.INTERNAL + .withDescription("Server did not return a response") + .toRuntimeException(); + } + + GetSessionOptionsResult result; + try { + result = GetSessionOptionsResult.deserialize(ByteBuffer.wrap(results.next().getBody())); + } catch (IOException e) { + throw CallStatus.INTERNAL + .withDescription("Failed to parse server response: " + e) + .withCause(e) + .toRuntimeException(); + } + results.forEachRemaining((ignored) -> { + }); + return result; + } + + /** + * Close/invalidate the current server session. + * + * The session is generally accessed via an HTTP cookie. + * + * @param request The (empty) CloseSessionRequest. + * @param options Call options. 
+ * @return The result containing the status of the close operation. + */ + public CloseSessionResult closeSession(CloseSessionRequest request, CallOption... options) { + Action action = new Action(FlightConstants.CLOSE_SESSION.getType(), request.serialize().array()); + Iterator results = doAction(action, options); + if (!results.hasNext()) { + throw CallStatus.INTERNAL + .withDescription("Server did not return a response") + .toRuntimeException(); + } + + CloseSessionResult result; + try { + result = CloseSessionResult.deserialize(ByteBuffer.wrap(results.next().getBody())); + } catch (IOException e) { + throw CallStatus.INTERNAL + .withDescription("Failed to parse server response: " + e) + .withCause(e) + .toRuntimeException(); + } + results.forEachRemaining((ignored) -> { + }); + return result; + } + /** * Interface for writers to an Arrow data stream. */ diff --git a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightConstants.java b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightConstants.java index 2a240abad6d95..4456e3dae4949 100644 --- a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightConstants.java +++ b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightConstants.java @@ -35,4 +35,18 @@ public interface FlightConstants { "Extend expiration time of the given FlightEndpoint.\n" + "Request Message: RenewFlightEndpointRequest\n" + "Response Message: Renewed FlightEndpoint"); + + ActionType SET_SESSION_OPTIONS = new ActionType("SetSessionOptions", + "Set client session options by name/value pairs.\n" + + "Request Message: SetSessionOptionsRequest\n" + + "Response Message: SetSessionOptionsResult"); + + ActionType GET_SESSION_OPTIONS = new ActionType("GetSessionOptions", + "Get current client session options\n" + + "Request Message: GetSessionOptionsRequest\n" + + "Response Message: GetSessionOptionsResult"); + ActionType CLOSE_SESSION = new ActionType("CloseSession", + "Explicitly 
close/invalidate the cookie-specified client session.\n" + + "Request Message: CloseSessionRequest\n" + + "Response Message: CloseSessionResult"); } diff --git a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/GetSessionOptionsRequest.java b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/GetSessionOptionsRequest.java new file mode 100644 index 0000000000000..9d63e59027aac --- /dev/null +++ b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/GetSessionOptionsRequest.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.flight; + +import java.io.IOException; +import java.nio.ByteBuffer; + +import org.apache.arrow.flight.impl.Flight; + +/** + * A request to get current session options. + */ +public class GetSessionOptionsRequest { + public GetSessionOptionsRequest() { + } + + GetSessionOptionsRequest(Flight.GetSessionOptionsRequest proto) { + } + + Flight.GetSessionOptionsRequest toProtocol() { + return Flight.GetSessionOptionsRequest.getDefaultInstance(); + } + + /** + * Get the serialized form of this protocol message. + * + *

Intended to help interoperability by allowing non-Flight services to still return Flight types. + */ + public ByteBuffer serialize() { + return ByteBuffer.wrap(toProtocol().toByteArray()); + } + + /** + * Parse the serialized form of this protocol message. + * + *

Intended to help interoperability by allowing Flight clients to obtain stream info from non-Flight services. + * + * @param serialized The serialized form of the message, as returned by {@link #serialize()}. + * @return The deserialized message. + * @throws IOException if the serialized form is invalid. + */ + public static GetSessionOptionsRequest deserialize(ByteBuffer serialized) throws IOException { + return new GetSessionOptionsRequest(Flight.GetSessionOptionsRequest.parseFrom(serialized)); + } +} diff --git a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/GetSessionOptionsResult.java b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/GetSessionOptionsResult.java new file mode 100644 index 0000000000000..c777bd39bd032 --- /dev/null +++ b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/GetSessionOptionsResult.java @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.flight; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; +import java.util.stream.Collectors; + +import org.apache.arrow.flight.impl.Flight; + +/** The result containing the currently-set options for the current server session. */ +public class GetSessionOptionsResult { + private final Map sessionOptions; + + public GetSessionOptionsResult(Map sessionOptions) { + this.sessionOptions = Collections.unmodifiableMap(new HashMap(sessionOptions)); + } + + GetSessionOptionsResult(Flight.GetSessionOptionsResult proto) { + sessionOptions = Collections.unmodifiableMap( + proto.getSessionOptionsMap().entrySet().stream().collect(Collectors.toMap( + Map.Entry::getKey, (e) -> SessionOptionValueFactory.makeSessionOptionValue(e.getValue())))); + } + + /** + * Get the session options map contained in the result. + * + * @return An immutable view of the session options map. + */ + public Map getSessionOptions() { + return sessionOptions; + } + + Flight.GetSessionOptionsResult toProtocol() { + Flight.GetSessionOptionsResult.Builder b = Flight.GetSessionOptionsResult.newBuilder(); + b.putAllSessionOptions(sessionOptions.entrySet().stream().collect(Collectors.toMap( + Map.Entry::getKey, (e) -> e.getValue().toProtocol()))); + return b.build(); + } + + /** + * Get the serialized form of this protocol message. + + *

Intended to help interoperability by allowing non-Flight services to still return Flight types. + */ + public ByteBuffer serialize() { + return ByteBuffer.wrap(toProtocol().toByteArray()); + } + + /** + * Parse the serialized form of this protocol message. + * + *

Intended to help interoperability by allowing Flight clients to obtain stream info from non-Flight services. + * + * @param serialized The serialized form of the message, as returned by {@link #serialize()}. + * @return The deserialized message. + * @throws IOException if the serialized form is invalid. + */ + public static GetSessionOptionsResult deserialize(ByteBuffer serialized) throws IOException { + return new GetSessionOptionsResult(Flight.GetSessionOptionsResult.parseFrom(serialized)); + } +} diff --git a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/NoOpSessionOptionValueVisitor.java b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/NoOpSessionOptionValueVisitor.java new file mode 100644 index 0000000000000..c951cce0ed42d --- /dev/null +++ b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/NoOpSessionOptionValueVisitor.java @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.flight; + +/** + * A helper to facilitate easier anonymous subclass declaration. + * + * Implementations need only override callbacks for types they wish to do something with. + * + * @param Return type of the visit operation. 
+ */ +public class NoOpSessionOptionValueVisitor implements SessionOptionValueVisitor { + /** + * A callback to handle SessionOptionValue containing a String. + */ + public T visit(String value) { + return null; + } + + /** + * A callback to handle SessionOptionValue containing a boolean. + */ + public T visit(boolean value) { + return null; + } + + /** + * A callback to handle SessionOptionValue containing a long. + */ + public T visit(long value) { + return null; + } + + /** + * A callback to handle SessionOptionValue containing a double. + */ + public T visit(double value) { + return null; + } + + /** + * A callback to handle SessionOptionValue containing an array of String. + */ + public T visit(String[] value) { + return null; + } + + /** + * A callback to handle SessionOptionValue containing no value. + * + * By convention, an attempt to set a valueless SessionOptionValue should + * attempt to unset or clear the named option value on the server. + */ + public T visit(Void value) { + return null; + } +} diff --git a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/ServerSessionMiddleware.java b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/ServerSessionMiddleware.java new file mode 100644 index 0000000000000..7091caa5e98bc --- /dev/null +++ b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/ServerSessionMiddleware.java @@ -0,0 +1,227 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.flight; + +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; +import java.util.concurrent.Callable; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentMap; + +/** + * Middleware for handling Flight SQL Sessions including session cookie handling. + * + * Currently experimental. + */ +public class ServerSessionMiddleware implements FlightServerMiddleware { + Factory factory; + boolean existingSession; + private Session session; + private String closedSessionId = null; + + public static final String sessionCookieName = "arrow_flight_session_id"; + + /** + * Factory for managing and accessing ServerSessionMiddleware. + */ + public static class Factory implements FlightServerMiddleware.Factory { + private final ConcurrentMap sessionStore = + new ConcurrentHashMap<>(); + private final Callable idGenerator; + + /** + * Construct a factory for ServerSessionMiddleware. + * + * Factory manages and accesses persistent sessions based on HTTP cookies. + * + * @param idGenerator A Callable returning unique session id Strings. 
+ */ + public Factory(Callable idGenerator) { + this.idGenerator = idGenerator; + } + + private synchronized Session createNewSession() { + String id; + try { + id = idGenerator.call(); + } catch (Exception ignored) { + // Most impls aren't going to throw so don't make caller handle a nonexistent checked exception + throw CallStatus.INTERNAL.withDescription("Session creation error").toRuntimeException(); + } + + Session newSession = new Session(id); + if (sessionStore.putIfAbsent(id, newSession) != null) { + // Collision, should never happen + throw CallStatus.INTERNAL.withDescription("Session creation error").toRuntimeException(); + } + return newSession; + } + + private void closeSession(String id) { + if (sessionStore.remove(id) == null) { + throw CallStatus.NOT_FOUND.withDescription("Session id '" + id + "' not found.").toRuntimeException(); + } + } + + @Override + public ServerSessionMiddleware onCallStarted(CallInfo callInfo, CallHeaders incomingHeaders, + RequestContext context) { + String sessionId = null; + + final Iterable it = incomingHeaders.getAll("cookie"); + if (it != null) { + findIdCookie: + for (final String headerValue : it) { + for (final String cookie : headerValue.split(" ;")) { + final String[] cookiePair = cookie.split("="); + if (cookiePair.length != 2) { + // Soft failure: Ignore invalid cookie list field + break; + } + + if (sessionCookieName.equals(cookiePair[0]) && cookiePair[1].length() > 0) { + sessionId = cookiePair[1]; + break findIdCookie; + } + } + } + } + + if (sessionId == null) { + // No session cookie, create middleware instance without session. + return new ServerSessionMiddleware(this, incomingHeaders, null); + } + + Session session = sessionStore.get(sessionId); + // Cookie provided by caller, but invalid + if (session == null) { + // Can't soft-fail/proceed here, clients will get unexpected behaviour without options they thought were set. 
+ throw CallStatus.NOT_FOUND.withDescription("Invalid " + sessionCookieName + " cookie.").toRuntimeException(); + } + + return new ServerSessionMiddleware(this, incomingHeaders, session); + } + } + + /** + * A thread-safe container for named SessionOptionValues. + */ + public static class Session { + public final String id; + private ConcurrentMap sessionData = + new ConcurrentHashMap(); + + /** + * Construct a new Session with the given id. + * + * @param id The Session's id string, which is used as the session cookie value. + */ + private Session(String id) { + this.id = id; + } + + /** Get session option by name, or null if it does not exist. */ + public SessionOptionValue getSessionOption(String name) { + return sessionData.get(name); + } + + /** Get an immutable copy of the session options map. */ + public Map getSessionOptions() { + return Collections.unmodifiableMap(new HashMap(sessionData)); + } + + /** Set session option by name to given value. */ + public void setSessionOption(String name, SessionOptionValue value) { + sessionData.put(name, value); + } + + /** Idempotently remove name from this session. */ + public void eraseSessionOption(String name) { + sessionData.remove(name); + } + } + + private final CallHeaders headers; + + private ServerSessionMiddleware(ServerSessionMiddleware.Factory factory, + CallHeaders incomingHeaders, Session session) { + this.factory = factory; + headers = incomingHeaders; + this.session = session; + existingSession = (session != null); + } + + /** + * Check if there is an open session associated with this call. + * + * @return True iff there is an open session associated with this call. + */ + public boolean hasSession() { + return session != null; + } + + /** + * Get the existing or new session value map for this call. + * + * @return The session option value map, or null in case of an id generation collision. 
+ */ + public synchronized Session getSession() { + if (session == null) { + session = factory.createNewSession(); + } + + return session; + } + + /** + * Close the current session. + * + * It is an error to call this without a valid session specified via cookie or equivalent. + * */ + public synchronized void closeSession() { + if (session == null) { + throw CallStatus.NOT_FOUND.withDescription("No session found for the current call.").toRuntimeException(); + } + factory.closeSession(session.id); + closedSessionId = session.id; + session = null; + } + + public CallHeaders getCallHeaders() { + return headers; + } + + @Override + public void onBeforeSendingHeaders(CallHeaders outgoingHeaders) { + if (!existingSession && session != null) { + outgoingHeaders.insert("set-cookie", sessionCookieName + "=" + session.id); + } + if (closedSessionId != null) { + outgoingHeaders.insert("set-cookie", sessionCookieName + "=" + closedSessionId + "; Max-Age=0"); + } + } + + @Override + public void onCallCompleted(CallStatus status) { + } + + @Override + public void onCallErrored(Throwable err) { + } +} diff --git a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/SessionOptionValue.java b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/SessionOptionValue.java new file mode 100644 index 0000000000000..db22c736be182 --- /dev/null +++ b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/SessionOptionValue.java @@ -0,0 +1,94 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.flight; + +import java.util.Arrays; + +import org.apache.arrow.flight.impl.Flight; + +/** + * A union-like container interface for supported session option value types. + */ +public abstract class SessionOptionValue { + SessionOptionValue() { + } + + /** + * Value access via a caller-provided visitor/functor. + */ + public abstract T acceptVisitor(SessionOptionValueVisitor v); + + Flight.SessionOptionValue toProtocol() { + Flight.SessionOptionValue.Builder b = Flight.SessionOptionValue.newBuilder(); + SessionOptionValueToProtocolVisitor visitor = new SessionOptionValueToProtocolVisitor(b); + this.acceptVisitor(visitor); + return b.build(); + } + + /** Check whether the SessionOptionValue is empty/valueless. 
*/ + public boolean isEmpty() { + return false; + } + + private class SessionOptionValueToProtocolVisitor implements SessionOptionValueVisitor { + final Flight.SessionOptionValue.Builder b; + + SessionOptionValueToProtocolVisitor(Flight.SessionOptionValue.Builder b) { + this.b = b; + } + + @Override + public Void visit(String value) { + b.setStringValue(value); + return null; + } + + @Override + public Void visit(boolean value) { + b.setBoolValue(value); + return null; + } + + @Override + public Void visit(long value) { + b.setInt64Value(value); + return null; + } + + @Override + public Void visit(double value) { + b.setDoubleValue(value); + return null; + } + + @Override + public Void visit(String[] value) { + Flight.SessionOptionValue.StringListValue.Builder pbSLVBuilder = + Flight.SessionOptionValue.StringListValue.newBuilder(); + pbSLVBuilder.addAllValues(Arrays.asList(value)); + b.setStringListValue(pbSLVBuilder.build()); + return null; + } + + @Override + public Void visit(Void ignored) { + b.clearOptionValue(); + return null; + } + } +} diff --git a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/SessionOptionValueFactory.java b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/SessionOptionValueFactory.java new file mode 100644 index 0000000000000..47c82fa7bb7fd --- /dev/null +++ b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/SessionOptionValueFactory.java @@ -0,0 +1,284 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.flight; + +import java.nio.charset.StandardCharsets; +import java.util.Arrays; + +import org.apache.arrow.flight.impl.Flight; + +/** Abstract factory for concrete SessionOptionValue instances. */ +public class SessionOptionValueFactory { + public static SessionOptionValue makeSessionOptionValue(String value) { + return new SessionOptionValueString(value); + } + + public static SessionOptionValue makeSessionOptionValue(boolean value) { + return new SessionOptionValueBoolean(value); + } + + public static SessionOptionValue makeSessionOptionValue(long value) { + return new SessionOptionValueLong(value); + } + + public static SessionOptionValue makeSessionOptionValue(double value) { + return new SessionOptionValueDouble(value); + } + + public static SessionOptionValue makeSessionOptionValue(String[] value) { + return new SessionOptionValueStringList(value); + } + + public static SessionOptionValue makeEmptySessionOptionValue() { + return new SessionOptionValueEmpty(); + } + + /** Construct a SessionOptionValue from its Protobuf object representation. 
*/ + public static SessionOptionValue makeSessionOptionValue(Flight.SessionOptionValue proto) { + switch (proto.getOptionValueCase()) { + case STRING_VALUE: + return new SessionOptionValueString(proto.getStringValue()); + case BOOL_VALUE: + return new SessionOptionValueBoolean(proto.getBoolValue()); + case INT64_VALUE: + return new SessionOptionValueLong(proto.getInt64Value()); + case DOUBLE_VALUE: + return new SessionOptionValueDouble(proto.getDoubleValue()); + case STRING_LIST_VALUE: + // Using ByteString::toByteArray() here otherwise we still somehow get `ByteArray`s with broken .equals(String) + return new SessionOptionValueStringList(proto.getStringListValue().getValuesList().asByteStringList().stream() + .map((e) -> new String(e.toByteArray(), StandardCharsets.UTF_8)).toArray(String[]::new)); + case OPTIONVALUE_NOT_SET: + return new SessionOptionValueEmpty(); + default: + // Unreachable + throw new IllegalArgumentException(""); + } + } + + private static class SessionOptionValueString extends SessionOptionValue { + private final String value; + + SessionOptionValueString(String value) { + this.value = value; + } + + @Override + public T acceptVisitor(SessionOptionValueVisitor v) { + return v.visit(value); + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + SessionOptionValueString that = (SessionOptionValueString) o; + return value.equals(that.value); + } + + @Override + public int hashCode() { + return value.hashCode(); + } + + @Override + public String toString() { + return '"' + value + '"'; + } + } + + private static class SessionOptionValueBoolean extends SessionOptionValue { + private final boolean value; + + SessionOptionValueBoolean(boolean value) { + this.value = value; + } + + @Override + public T acceptVisitor(SessionOptionValueVisitor v) { + return v.visit(value); + } + + @Override + public boolean equals(Object o) { + if (this == o) { + 
return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + SessionOptionValueBoolean that = (SessionOptionValueBoolean) o; + return value == that.value; + } + + @Override + public int hashCode() { + return Boolean.hashCode(value); + } + + @Override + public String toString() { + return String.valueOf(value); + } + } + + private static class SessionOptionValueLong extends SessionOptionValue { + private final long value; + + SessionOptionValueLong(long value) { + this.value = value; + } + + @Override + public T acceptVisitor(SessionOptionValueVisitor v) { + return v.visit(value); + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + SessionOptionValueLong that = (SessionOptionValueLong) o; + return value == that.value; + } + + @Override + public int hashCode() { + return Long.hashCode(value); + } + + @Override + public String toString() { + return String.valueOf(value); + } + } + + private static class SessionOptionValueDouble extends SessionOptionValue { + private final double value; + + SessionOptionValueDouble(double value) { + this.value = value; + } + + @Override + public T acceptVisitor(SessionOptionValueVisitor v) { + return v.visit(value); + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + SessionOptionValueDouble that = (SessionOptionValueDouble) o; + return value == that.value; + } + + @Override + public int hashCode() { + return Double.hashCode(value); + } + + @Override + public String toString() { + return String.valueOf(value); + } + } + + private static class SessionOptionValueStringList extends SessionOptionValue { + private final String[] value; + + SessionOptionValueStringList(String[] value) { + this.value = value.clone(); + } + + @Override + public T acceptVisitor(SessionOptionValueVisitor v) { + return 
v.visit(value); + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + SessionOptionValueStringList that = (SessionOptionValueStringList) o; + return Arrays.deepEquals(value, that.value); + } + + @Override + public int hashCode() { + return Arrays.deepHashCode(value); + } + + @Override + public String toString() { + if (value.length == 0) { + return "[]"; + } + return "[\"" + String.join("\", \"", value) + "\"]"; + } + } + + private static class SessionOptionValueEmpty extends SessionOptionValue { + @Override + public T acceptVisitor(SessionOptionValueVisitor v) { + return v.visit((Void) null); + } + + @Override + public boolean isEmpty() { + return true; + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + return true; + } + + @Override + public int hashCode() { + return SessionOptionValueEmpty.class.hashCode(); + } + + @Override + public String toString() { + return ""; + } + } +} diff --git a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/SessionOptionValueVisitor.java b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/SessionOptionValueVisitor.java new file mode 100644 index 0000000000000..f2178224a0d29 --- /dev/null +++ b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/SessionOptionValueVisitor.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.flight; + +/** + * A visitor interface to access SessionOptionValue's contained value. + * + * @param Return type of the visit operation. + */ +public interface SessionOptionValueVisitor { + /** + * A callback to handle SessionOptionValue containing a String. + */ + T visit(String value); + + /** + * A callback to handle SessionOptionValue containing a boolean. + */ + T visit(boolean value); + + /** + * A callback to handle SessionOptionValue containing a long. + */ + T visit(long value); + + /** + * A callback to handle SessionOptionValue containing a double. + */ + T visit(double value); + + /** + * A callback to handle SessionOptionValue containing an array of String. + */ + T visit(String[] value); + + /** + * A callback to handle SessionOptionValue containing no value. + * + * By convention, an attempt to set a valueless SessionOptionValue should + * attempt to unset or clear the named option value on the server. + */ + T visit(Void value); +} diff --git a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/SetSessionOptionsRequest.java b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/SetSessionOptionsRequest.java new file mode 100644 index 0000000000000..8a5253e682162 --- /dev/null +++ b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/SetSessionOptionsRequest.java @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.flight; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; +import java.util.stream.Collectors; + +import org.apache.arrow.flight.impl.Flight; + +/** A request to set option(s) in an existing or implicitly-created server session. */ +public class SetSessionOptionsRequest { + private final Map sessionOptions; + + public SetSessionOptionsRequest(Map sessionOptions) { + this.sessionOptions = Collections.unmodifiableMap(new HashMap(sessionOptions)); + } + + SetSessionOptionsRequest(Flight.SetSessionOptionsRequest proto) { + sessionOptions = Collections.unmodifiableMap( + proto.getSessionOptionsMap().entrySet().stream().collect(Collectors.toMap( + Map.Entry::getKey, (e) -> SessionOptionValueFactory.makeSessionOptionValue(e.getValue())))); + } + + /** + * Get the session option map from the request. + * + * @return An immutable view of the session options map. 
+ */ + public Map getSessionOptions() { + return Collections.unmodifiableMap(sessionOptions); + } + + Flight.SetSessionOptionsRequest toProtocol() { + Flight.SetSessionOptionsRequest.Builder b = Flight.SetSessionOptionsRequest.newBuilder(); + b.putAllSessionOptions(sessionOptions.entrySet().stream().collect(Collectors.toMap( + Map.Entry::getKey, (e) -> e.getValue().toProtocol()))); + return b.build(); + } + + /** + * Get the serialized form of this protocol message. + * + *

Intended to help interoperability by allowing non-Flight services to still return Flight types. + */ + public ByteBuffer serialize() { + return ByteBuffer.wrap(toProtocol().toByteArray()); + } + + /** + * Parse the serialized form of this protocol message. + * + *

Intended to help interoperability by allowing Flight clients to obtain stream info from non-Flight services. + * + * @param serialized The serialized form of the message, as returned by {@link #serialize()}. + * @return The deserialized message. + * @throws IOException if the serialized form is invalid. + */ + public static SetSessionOptionsRequest deserialize(ByteBuffer serialized) throws IOException { + return new SetSessionOptionsRequest(Flight.SetSessionOptionsRequest.parseFrom(serialized)); + } + +} diff --git a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/SetSessionOptionsResult.java b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/SetSessionOptionsResult.java new file mode 100644 index 0000000000000..14d53cc6767e0 --- /dev/null +++ b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/SetSessionOptionsResult.java @@ -0,0 +1,152 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.flight; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; +import java.util.stream.Collectors; + +import org.apache.arrow.flight.impl.Flight; + +/** The result of attempting to set a set of session options. */ +public class SetSessionOptionsResult { + /** Error status value for per-option errors. */ + public enum ErrorValue { + /** + * The status of setting the option is unknown. Servers should avoid using this value + * (send a NOT_FOUND error if the requested session is not known). Clients can retry + * the request. + */ + UNSPECIFIED, + /** + * The given session option name is invalid. + */ + INVALID_NAME, + /** + * The session option value or type is invalid. + */ + INVALID_VALUE, + /** + * The session option cannot be set. + */ + ERROR, + ; + + static ErrorValue fromProtocol(Flight.SetSessionOptionsResult.ErrorValue s) { + return values()[s.getNumber()]; + } + + Flight.SetSessionOptionsResult.ErrorValue toProtocol() { + return Flight.SetSessionOptionsResult.ErrorValue.values()[ordinal()]; + } + } + + /** Per-option extensible error response container. 
*/ + public static class Error { + public ErrorValue value; + + public Error(ErrorValue value) { + this.value = value; + } + + Error(Flight.SetSessionOptionsResult.Error e) { + value = ErrorValue.fromProtocol(e.getValue()); + } + + Flight.SetSessionOptionsResult.Error toProtocol() { + Flight.SetSessionOptionsResult.Error.Builder b = Flight.SetSessionOptionsResult.Error.newBuilder(); + b.setValue(value.toProtocol()); + return b.build(); + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + Error that = (Error) o; + return value == that.value; + } + + @Override + public int hashCode() { + return value.hashCode(); + } + } + + private final Map errors; + + public SetSessionOptionsResult(Map errors) { + this.errors = Collections.unmodifiableMap(new HashMap(errors)); + } + + SetSessionOptionsResult(Flight.SetSessionOptionsResult proto) { + errors = Collections.unmodifiableMap(proto.getErrors().entrySet().stream().collect( + Collectors.toMap(Map.Entry::getKey, (e) -> new Error(e.getValue())))); + } + + /** Report whether the error map has nonzero length. */ + public boolean hasErrors() { + return errors.size() > 0; + } + + /** + * Get the error status map from the result object. + * + * @return An immutable view of the error status map. + */ + public Map getErrors() { + return errors; + } + + Flight.SetSessionOptionsResult toProtocol() { + Flight.SetSessionOptionsResult.Builder b = Flight.SetSessionOptionsResult.newBuilder(); + b.putAllErrors(errors.entrySet().stream().collect(Collectors.toMap( + Map.Entry::getKey, + (e) -> e.getValue().toProtocol()))); + return b.build(); + } + + /** + * Get the serialized form of this protocol message. + * + *

Intended to help interoperability by allowing non-Flight services to still return Flight types. + */ + public ByteBuffer serialize() { + return ByteBuffer.wrap(toProtocol().toByteArray()); + } + + /** + * Parse the serialized form of this protocol message. + * + *

Intended to help interoperability by allowing Flight clients to obtain stream info from non-Flight services. + * + * @param serialized The serialized form of the message, as returned by {@link #serialize()}. + * @return The deserialized message. + * @throws IOException if the serialized form is invalid. + */ + public static SetSessionOptionsResult deserialize(ByteBuffer serialized) throws IOException { + return new SetSessionOptionsResult(Flight.SetSessionOptionsResult.parseFrom(serialized)); + } +} diff --git a/java/flight/flight-integration-tests/pom.xml b/java/flight/flight-integration-tests/pom.xml index 944c624d630a2..905c8bdaf013b 100644 --- a/java/flight/flight-integration-tests/pom.xml +++ b/java/flight/flight-integration-tests/pom.xml @@ -45,6 +45,10 @@ com.google.protobuf protobuf-java + + com.google.guava + guava + commons-cli commons-cli diff --git a/java/flight/flight-integration-tests/src/main/java/org/apache/arrow/flight/integration/tests/Scenarios.java b/java/flight/flight-integration-tests/src/main/java/org/apache/arrow/flight/integration/tests/Scenarios.java index c61fd94a4d228..6878c22c5ccdc 100644 --- a/java/flight/flight-integration-tests/src/main/java/org/apache/arrow/flight/integration/tests/Scenarios.java +++ b/java/flight/flight-integration-tests/src/main/java/org/apache/arrow/flight/integration/tests/Scenarios.java @@ -50,6 +50,7 @@ private Scenarios() { scenarios.put("flight_sql", FlightSqlScenario::new); scenarios.put("flight_sql:extension", FlightSqlExtensionScenario::new); scenarios.put("app_metadata_flight_info_endpoint", AppMetadataFlightInfoEndpointScenario::new); + scenarios.put("session_options", SessionOptionsScenario::new); } private static Scenarios getInstance() { diff --git a/java/flight/flight-integration-tests/src/main/java/org/apache/arrow/flight/integration/tests/SessionOptionsProducer.java b/java/flight/flight-integration-tests/src/main/java/org/apache/arrow/flight/integration/tests/SessionOptionsProducer.java new file 
mode 100644 index 0000000000000..f29028547c452 --- /dev/null +++ b/java/flight/flight-integration-tests/src/main/java/org/apache/arrow/flight/integration/tests/SessionOptionsProducer.java @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.flight.integration.tests; + +import java.util.HashMap; +import java.util.Map; + +import org.apache.arrow.flight.CallStatus; +import org.apache.arrow.flight.CloseSessionRequest; +import org.apache.arrow.flight.CloseSessionResult; +import org.apache.arrow.flight.FlightRuntimeException; +import org.apache.arrow.flight.FlightServerMiddleware; +import org.apache.arrow.flight.GetSessionOptionsRequest; +import org.apache.arrow.flight.GetSessionOptionsResult; +import org.apache.arrow.flight.ServerSessionMiddleware; +import org.apache.arrow.flight.SessionOptionValue; +import org.apache.arrow.flight.SessionOptionValueFactory; +import org.apache.arrow.flight.SetSessionOptionsRequest; +import org.apache.arrow.flight.SetSessionOptionsResult; +import org.apache.arrow.flight.sql.NoOpFlightSqlProducer; + +/** The server used for testing Sessions. + *

+ * SetSessionOptions(), GetSessionOptions(), and CloseSession() operate on a + * simple SessionOptionValue store. + */ +final class SessionOptionsProducer extends NoOpFlightSqlProducer { + private static final SessionOptionValue invalidOptionValue = + SessionOptionValueFactory.makeSessionOptionValue("lol_invalid"); + private final FlightServerMiddleware.Key sessionMiddlewareKey; + + SessionOptionsProducer(FlightServerMiddleware.Key sessionMiddlewareKey) { + this.sessionMiddlewareKey = sessionMiddlewareKey; + } + + @Override + public void setSessionOptions(SetSessionOptionsRequest request, CallContext context, + StreamListener listener) { + Map errors = new HashMap(); + + ServerSessionMiddleware middleware = context.getMiddleware(sessionMiddlewareKey); + ServerSessionMiddleware.Session session = middleware.getSession(); + for (Map.Entry entry : request.getSessionOptions().entrySet()) { + // Blacklisted option name + if (entry.getKey().equals("lol_invalid")) { + errors.put(entry.getKey(), + new SetSessionOptionsResult.Error(SetSessionOptionsResult.ErrorValue.INVALID_NAME)); + continue; + } + // Blacklisted option value + // Recommend using a visitor to check polymorphic equality, but this check is easy + if (entry.getValue().equals(invalidOptionValue)) { + errors.put(entry.getKey(), + new SetSessionOptionsResult.Error(SetSessionOptionsResult.ErrorValue.INVALID_VALUE)); + continue; + } + // Business as usual: + if (entry.getValue().isEmpty()) { + session.eraseSessionOption(entry.getKey()); + continue; + } + session.setSessionOption(entry.getKey(), entry.getValue()); + } + listener.onNext(new SetSessionOptionsResult(errors)); + listener.onCompleted(); + } + + @Override + public void getSessionOptions(GetSessionOptionsRequest request, CallContext context, + StreamListener listener) { + ServerSessionMiddleware middleware = context.getMiddleware(sessionMiddlewareKey); + if (!middleware.hasSession()) { + // Attempt to get options without an existing session + 
listener.onError(CallStatus.NOT_FOUND.withDescription("No current server session").toRuntimeException()); + return; + } + final Map sessionOptions = middleware.getSession().getSessionOptions(); + listener.onNext(new GetSessionOptionsResult(sessionOptions)); + listener.onCompleted(); + } + + @Override + public void closeSession(CloseSessionRequest request, CallContext context, + StreamListener listener) { + ServerSessionMiddleware middleware = context.getMiddleware(sessionMiddlewareKey); + try { + middleware.closeSession(); + } catch (FlightRuntimeException fre) { + listener.onError(fre); + return; + } + listener.onNext(new CloseSessionResult(CloseSessionResult.Status.CLOSED)); + listener.onCompleted(); + } +} diff --git a/java/flight/flight-integration-tests/src/main/java/org/apache/arrow/flight/integration/tests/SessionOptionsScenario.java b/java/flight/flight-integration-tests/src/main/java/org/apache/arrow/flight/integration/tests/SessionOptionsScenario.java new file mode 100644 index 0000000000000..c150cfa6ef137 --- /dev/null +++ b/java/flight/flight-integration-tests/src/main/java/org/apache/arrow/flight/integration/tests/SessionOptionsScenario.java @@ -0,0 +1,107 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.flight.integration.tests; + +import java.util.concurrent.atomic.AtomicInteger; + +import org.apache.arrow.flight.FlightClient; +import org.apache.arrow.flight.FlightProducer; +import org.apache.arrow.flight.FlightServer; +import org.apache.arrow.flight.FlightServerMiddleware; +import org.apache.arrow.flight.GetSessionOptionsRequest; +import org.apache.arrow.flight.GetSessionOptionsResult; +import org.apache.arrow.flight.Location; +import org.apache.arrow.flight.ServerSessionMiddleware; +import org.apache.arrow.flight.SessionOptionValue; +import org.apache.arrow.flight.SessionOptionValueFactory; +import org.apache.arrow.flight.SetSessionOptionsRequest; +import org.apache.arrow.flight.SetSessionOptionsResult; +import org.apache.arrow.flight.client.ClientCookieMiddleware; +import org.apache.arrow.flight.sql.FlightSqlClient; +import org.apache.arrow.memory.BufferAllocator; + +import com.google.common.collect.ImmutableMap; + +/** + * Scenario to exercise Session Options functionality. 
+ */ +final class SessionOptionsScenario implements Scenario { + private final FlightServerMiddleware.Key key = + FlightServerMiddleware.Key.of("sessionmiddleware"); + + @Override + public FlightProducer producer(BufferAllocator allocator, Location location) throws Exception { + return new SessionOptionsProducer(key); + } + + @Override + public void buildServer(FlightServer.Builder builder) { + AtomicInteger counter = new AtomicInteger(1000); + builder.middleware(key, new ServerSessionMiddleware.Factory(() -> String.valueOf(counter.getAndIncrement()))); + } + + @Override + public void client(BufferAllocator allocator, Location location, FlightClient ignored) throws Exception { + final ClientCookieMiddleware.Factory factory = new ClientCookieMiddleware.Factory(); + try (final FlightClient flightClient = FlightClient.builder(allocator, location).intercept(factory).build()) { + final FlightSqlClient client = new FlightSqlClient(flightClient); + + // Set + SetSessionOptionsRequest req1 = new SetSessionOptionsRequest(ImmutableMap.builder() + .put("foolong", SessionOptionValueFactory.makeSessionOptionValue(123L)) + .put("bardouble", SessionOptionValueFactory.makeSessionOptionValue(456.0)) + .put("lol_invalid", SessionOptionValueFactory.makeSessionOptionValue("this won't get set")) + .put("key_with_invalid_value", SessionOptionValueFactory.makeSessionOptionValue("lol_invalid")) + .put("big_ol_string_list", SessionOptionValueFactory.makeSessionOptionValue( + new String[]{"a", "b", "sea", "dee", " ", " ", "geee", "(づ。◕‿‿◕。)づ"})) + .build()); + SetSessionOptionsResult res1 = client.setSessionOptions(req1); + // Some errors + IntegrationAssertions.assertEquals(ImmutableMap.builder() + .put("lol_invalid", new SetSessionOptionsResult.Error(SetSessionOptionsResult.ErrorValue.INVALID_NAME)) + .put("key_with_invalid_value", new SetSessionOptionsResult.Error( + SetSessionOptionsResult.ErrorValue.INVALID_VALUE)) + .build(), + res1.getErrors()); + // Some set, some omitted due to 
above errors + GetSessionOptionsResult res2 = client.getSessionOptions(new GetSessionOptionsRequest()); + IntegrationAssertions.assertEquals(ImmutableMap.builder() + .put("foolong", SessionOptionValueFactory.makeSessionOptionValue(123L)) + .put("bardouble", SessionOptionValueFactory.makeSessionOptionValue(456.0)) + .put("big_ol_string_list", SessionOptionValueFactory.makeSessionOptionValue( + new String[]{"a", "b", "sea", "dee", " ", " ", "geee", "(づ。◕‿‿◕。)づ"})) + .build(), + res2.getSessionOptions()); + // Update + client.setSessionOptions(new SetSessionOptionsRequest(ImmutableMap.builder() + // Delete + .put("foolong", SessionOptionValueFactory.makeEmptySessionOptionValue()) + // Update + .put("big_ol_string_list", + SessionOptionValueFactory.makeSessionOptionValue("a,b,sea,dee, , ,geee,(づ。◕‿‿◕。)づ")) + .build())); + GetSessionOptionsResult res4 = client.getSessionOptions(new GetSessionOptionsRequest()); + IntegrationAssertions.assertEquals(ImmutableMap.builder() + .put("bardouble", SessionOptionValueFactory.makeSessionOptionValue(456.0)) + .put("big_ol_string_list", + SessionOptionValueFactory.makeSessionOptionValue("a,b,sea,dee, , ,geee,(づ。◕‿‿◕。)づ")) + .build(), + res4.getSessionOptions()); + } + } +} diff --git a/java/flight/flight-integration-tests/src/test/java/org/apache/arrow/flight/integration/tests/IntegrationTest.java b/java/flight/flight-integration-tests/src/test/java/org/apache/arrow/flight/integration/tests/IntegrationTest.java index 477a56055cbbc..f814427567ae9 100644 --- a/java/flight/flight-integration-tests/src/test/java/org/apache/arrow/flight/integration/tests/IntegrationTest.java +++ b/java/flight/flight-integration-tests/src/test/java/org/apache/arrow/flight/integration/tests/IntegrationTest.java @@ -83,6 +83,11 @@ void appMetadataFlightInfoEndpoint() throws Exception { testScenario("app_metadata_flight_info_endpoint"); } + @Test + void sessionOptions() throws Exception { + testScenario("session_options"); + } + void testScenario(String 
scenarioName) throws Exception { try (final BufferAllocator allocator = new RootAllocator()) { final FlightServer.Builder builder = FlightServer.builder() diff --git a/java/flight/flight-sql/src/main/java/org/apache/arrow/flight/sql/CloseSessionResultListener.java b/java/flight/flight-sql/src/main/java/org/apache/arrow/flight/sql/CloseSessionResultListener.java new file mode 100644 index 0000000000000..e1a5b369fe16c --- /dev/null +++ b/java/flight/flight-sql/src/main/java/org/apache/arrow/flight/sql/CloseSessionResultListener.java @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.flight.sql; + +import org.apache.arrow.flight.CloseSessionResult; +import org.apache.arrow.flight.FlightProducer; +import org.apache.arrow.flight.Result; + +/** Typed StreamListener for closeSession. 
*/ +public class CloseSessionResultListener implements FlightProducer.StreamListener { + private final FlightProducer.StreamListener listener; + + CloseSessionResultListener(FlightProducer.StreamListener listener) { + this.listener = listener; + } + + @Override + public void onNext(CloseSessionResult val) { + listener.onNext(new Result(val.serialize().array())); + } + + @Override + public void onError(Throwable t) { + listener.onError(t); + } + + @Override + public void onCompleted() { + listener.onCompleted(); + } +} diff --git a/java/flight/flight-sql/src/main/java/org/apache/arrow/flight/sql/FlightSqlClient.java b/java/flight/flight-sql/src/main/java/org/apache/arrow/flight/sql/FlightSqlClient.java index 93d933f00f38f..e6eb28fe317e1 100644 --- a/java/flight/flight-sql/src/main/java/org/apache/arrow/flight/sql/FlightSqlClient.java +++ b/java/flight/flight-sql/src/main/java/org/apache/arrow/flight/sql/FlightSqlClient.java @@ -61,15 +61,21 @@ import org.apache.arrow.flight.CallStatus; import org.apache.arrow.flight.CancelFlightInfoRequest; import org.apache.arrow.flight.CancelFlightInfoResult; +import org.apache.arrow.flight.CloseSessionRequest; +import org.apache.arrow.flight.CloseSessionResult; import org.apache.arrow.flight.FlightClient; import org.apache.arrow.flight.FlightDescriptor; import org.apache.arrow.flight.FlightEndpoint; import org.apache.arrow.flight.FlightInfo; import org.apache.arrow.flight.FlightStream; +import org.apache.arrow.flight.GetSessionOptionsRequest; +import org.apache.arrow.flight.GetSessionOptionsResult; import org.apache.arrow.flight.PutResult; import org.apache.arrow.flight.RenewFlightEndpointRequest; import org.apache.arrow.flight.Result; import org.apache.arrow.flight.SchemaResult; +import org.apache.arrow.flight.SetSessionOptionsRequest; +import org.apache.arrow.flight.SetSessionOptionsResult; import org.apache.arrow.flight.SyncPutListener; import org.apache.arrow.flight.Ticket; import 
org.apache.arrow.flight.sql.impl.FlightSql.ActionCreatePreparedStatementResult; @@ -917,6 +923,18 @@ public FlightEndpoint renewFlightEndpoint(RenewFlightEndpointRequest request, Ca return client.renewFlightEndpoint(request, options); } + public SetSessionOptionsResult setSessionOptions(SetSessionOptionsRequest request, CallOption... options) { + return client.setSessionOptions(request, options); + } + + public GetSessionOptionsResult getSessionOptions(GetSessionOptionsRequest request, CallOption... options) { + return client.getSessionOptions(request, options); + } + + public CloseSessionResult closeSession(CloseSessionRequest request, CallOption... options) { + return client.closeSession(request, options); + } + @Override public void close() throws Exception { AutoCloseables.close(client); diff --git a/java/flight/flight-sql/src/main/java/org/apache/arrow/flight/sql/FlightSqlProducer.java b/java/flight/flight-sql/src/main/java/org/apache/arrow/flight/sql/FlightSqlProducer.java index dbe39ab1d07b4..f06c1b868f4fd 100644 --- a/java/flight/flight-sql/src/main/java/org/apache/arrow/flight/sql/FlightSqlProducer.java +++ b/java/flight/flight-sql/src/main/java/org/apache/arrow/flight/sql/FlightSqlProducer.java @@ -56,16 +56,22 @@ import org.apache.arrow.flight.CallStatus; import org.apache.arrow.flight.CancelFlightInfoRequest; import org.apache.arrow.flight.CancelStatus; +import org.apache.arrow.flight.CloseSessionRequest; +import org.apache.arrow.flight.CloseSessionResult; import org.apache.arrow.flight.FlightConstants; import org.apache.arrow.flight.FlightDescriptor; import org.apache.arrow.flight.FlightEndpoint; import org.apache.arrow.flight.FlightInfo; import org.apache.arrow.flight.FlightProducer; import org.apache.arrow.flight.FlightStream; +import org.apache.arrow.flight.GetSessionOptionsRequest; +import org.apache.arrow.flight.GetSessionOptionsResult; import org.apache.arrow.flight.PutResult; import org.apache.arrow.flight.RenewFlightEndpointRequest; import 
org.apache.arrow.flight.Result; import org.apache.arrow.flight.SchemaResult; +import org.apache.arrow.flight.SetSessionOptionsRequest; +import org.apache.arrow.flight.SetSessionOptionsResult; import org.apache.arrow.flight.Ticket; import org.apache.arrow.flight.sql.impl.FlightSql.ActionClosePreparedStatementRequest; import org.apache.arrow.flight.sql.impl.FlightSql.ActionCreatePreparedStatementRequest; @@ -383,6 +389,42 @@ default void doAction(CallContext context, Action action, StreamListener return; } renewFlightEndpoint(request, context, new FlightEndpointListener(listener)); + } else if (actionType.equals(FlightConstants.SET_SESSION_OPTIONS.getType())) { + final SetSessionOptionsRequest request; + try { + request = SetSessionOptionsRequest.deserialize(ByteBuffer.wrap(action.getBody())); + } catch (IOException e) { + listener.onError(CallStatus.INTERNAL + .withDescription("Could not unpack SetSessionOptionsRequest: " + e) + .withCause(e) + .toRuntimeException()); + return; + } + setSessionOptions(request, context, new SetSessionOptionsResultListener(listener)); + } else if (actionType.equals(FlightConstants.GET_SESSION_OPTIONS.getType())) { + final GetSessionOptionsRequest request; + try { + request = GetSessionOptionsRequest.deserialize(ByteBuffer.wrap(action.getBody())); + } catch (IOException e) { + listener.onError(CallStatus.INTERNAL + .withDescription("Could not unpack GetSessionOptionsRequest: " + e) + .withCause(e) + .toRuntimeException()); + return; + } + getSessionOptions(request, context, new GetSessionOptionsResultListener(listener)); + } else if (actionType.equals(FlightConstants.CLOSE_SESSION.getType())) { + final CloseSessionRequest request; + try { + request = CloseSessionRequest.deserialize(ByteBuffer.wrap(action.getBody())); + } catch (IOException e) { + listener.onError(CallStatus.INTERNAL + .withDescription("Could not unpack CloseSessionRequest: " + e) + .withCause(e) + .toRuntimeException()); + return; + } + closeSession(request, context, 
new CloseSessionResultListener(listener)); } else { throw CallStatus.INVALID_ARGUMENT .withDescription("Unrecognized request: " + action.getType()) @@ -472,6 +514,43 @@ public void onCompleted() { }); } + /** + * Set server session options(s). + * + * @param request The session options to set. For *DBC driver compatibility, servers + * should support converting values from strings. + * @param context Per-call context. + * @param listener An interface for sending data back to the client. + */ + default void setSessionOptions(SetSessionOptionsRequest request, CallContext context, + StreamListener listener) { + listener.onError(CallStatus.UNIMPLEMENTED.toRuntimeException()); + } + + /** + * Get server session option(s). + * + * @param request The (empty) GetSessionOptionsRequest. + * @param context Per-call context. + * @param listener An interface for sending data back to the client. + */ + default void getSessionOptions(GetSessionOptionsRequest request, CallContext context, + StreamListener listener) { + listener.onError(CallStatus.UNIMPLEMENTED.toRuntimeException()); + } + + /** + * Close/invalidate the session. + * + * @param request The (empty) CloseSessionRequest. + * @param context Per-call context. + * @param listener An interface for sending data back to the client. 
+ */ + default void closeSession(CloseSessionRequest request, CallContext context, + StreamListener listener) { + listener.onError(CallStatus.UNIMPLEMENTED.toRuntimeException()); + } + /** * Creates a prepared statement on the server and returns a handle and metadata for in a * {@link ActionCreatePreparedStatementResult} object in a {@link Result} diff --git a/java/flight/flight-sql/src/main/java/org/apache/arrow/flight/sql/GetSessionOptionsResultListener.java b/java/flight/flight-sql/src/main/java/org/apache/arrow/flight/sql/GetSessionOptionsResultListener.java new file mode 100644 index 0000000000000..4fdffd076243c --- /dev/null +++ b/java/flight/flight-sql/src/main/java/org/apache/arrow/flight/sql/GetSessionOptionsResultListener.java @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.flight.sql; + +import org.apache.arrow.flight.FlightProducer; +import org.apache.arrow.flight.GetSessionOptionsResult; +import org.apache.arrow.flight.Result; + +/** Typed StreamListener for getSessionOptions. 
*/ +public class GetSessionOptionsResultListener implements FlightProducer.StreamListener { + private final FlightProducer.StreamListener listener; + + GetSessionOptionsResultListener(FlightProducer.StreamListener listener) { + this.listener = listener; + } + + @Override + public void onNext(GetSessionOptionsResult val) { + listener.onNext(new Result(val.serialize().array())); + } + + @Override + public void onError(Throwable t) { + listener.onError(t); + } + + @Override + public void onCompleted() { + listener.onCompleted(); + } +} diff --git a/java/flight/flight-sql/src/main/java/org/apache/arrow/flight/sql/SetSessionOptionsResultListener.java b/java/flight/flight-sql/src/main/java/org/apache/arrow/flight/sql/SetSessionOptionsResultListener.java new file mode 100644 index 0000000000000..230be2bf1b316 --- /dev/null +++ b/java/flight/flight-sql/src/main/java/org/apache/arrow/flight/sql/SetSessionOptionsResultListener.java @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.flight.sql; + +import org.apache.arrow.flight.FlightProducer; +import org.apache.arrow.flight.Result; +import org.apache.arrow.flight.SetSessionOptionsResult; + +/** Typed StreamListener for setSessionOptions. */ +public class SetSessionOptionsResultListener implements FlightProducer.StreamListener { + private final FlightProducer.StreamListener listener; + + SetSessionOptionsResultListener(FlightProducer.StreamListener listener) { + this.listener = listener; + } + + @Override + public void onNext(SetSessionOptionsResult val) { + listener.onNext(new Result(val.serialize().array())); + } + + @Override + public void onError(Throwable t) { + listener.onError(t); + } + + @Override + public void onCompleted() { + listener.onCompleted(); + } +} diff --git a/testing b/testing index 25d16511e8d42..ad82a736c170e 160000 --- a/testing +++ b/testing @@ -1 +1 @@ -Subproject commit 25d16511e8d42c2744a1d94d90169e3a36e92631 +Subproject commit ad82a736c170e97b7c8c035ebd8a801c17eec170 From a8b9537764144baf5da8e22dfcd4e9ff0c70ab2f Mon Sep 17 00:00:00 2001 From: h-vetinari Date: Tue, 20 Feb 2024 15:49:16 +0100 Subject: [PATCH 26/46] GH-37931: [Python] Revert "GH-37803: [CI][Dev][Python] Release and merge script errors (#37819)" (#40150) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 79e49dbfb71efc70555417ba19cb612eb50924e8. #37931 should have been fixed as of https://github.com/pypa/setuptools_scm/commit/056584b49f039f0913bd6ee5bb5a5befdb396dec in setuptools_scm 8.0.4; I tested that this works in https://github.com/conda-forge/arrow-cpp-feedstock/pull/1314. CC @ AlenkaF @ raulcd * Closes: #37931 Authored-by: H. 
Vetinari Signed-off-by: Raúl Cumplido --- ci/conda_env_archery.txt | 2 +- ci/conda_env_crossbow.txt | 2 +- ci/conda_env_python.txt | 2 +- dev/archery/setup.py | 2 +- dev/tasks/conda-recipes/arrow-cpp/meta.yaml | 4 ++-- python/pyproject.toml | 2 +- python/requirements-build.txt | 2 +- python/requirements-wheel-build.txt | 2 +- python/setup.py | 2 +- 9 files changed, 10 insertions(+), 10 deletions(-) diff --git a/ci/conda_env_archery.txt b/ci/conda_env_archery.txt index 40875e0a55039..ace7a42acb026 100644 --- a/ci/conda_env_archery.txt +++ b/ci/conda_env_archery.txt @@ -25,7 +25,7 @@ jira pygit2 pygithub ruamel.yaml -setuptools_scm<8.0.0 +setuptools_scm toolz # benchmark diff --git a/ci/conda_env_crossbow.txt b/ci/conda_env_crossbow.txt index 59b799720f12b..347294650ca28 100644 --- a/ci/conda_env_crossbow.txt +++ b/ci/conda_env_crossbow.txt @@ -21,5 +21,5 @@ jinja2 jira pygit2 ruamel.yaml -setuptools_scm<8.0.0 +setuptools_scm toolz diff --git a/ci/conda_env_python.txt b/ci/conda_env_python.txt index 59e2def1bf339..19e94d7d3e5bd 100644 --- a/ci/conda_env_python.txt +++ b/ci/conda_env_python.txt @@ -27,4 +27,4 @@ pytest<8 pytest-faulthandler s3fs>=2023.10.0 setuptools -setuptools_scm<8.0.0 +setuptools_scm diff --git a/dev/archery/setup.py b/dev/archery/setup.py index 2ecc72e04e8aa..02a8b34299b1f 100755 --- a/dev/archery/setup.py +++ b/dev/archery/setup.py @@ -30,7 +30,7 @@ extras = { 'benchmark': ['pandas'], 'crossbow': ['github3.py', jinja_req, 'pygit2>=1.6.0', 'requests', - 'ruamel.yaml', 'setuptools_scm<8.0.0'], + 'ruamel.yaml', 'setuptools_scm'], 'crossbow-upload': ['github3.py', jinja_req, 'ruamel.yaml', 'setuptools_scm'], 'docker': ['ruamel.yaml', 'python-dotenv'], diff --git a/dev/tasks/conda-recipes/arrow-cpp/meta.yaml b/dev/tasks/conda-recipes/arrow-cpp/meta.yaml index 367445c595c4b..10ee9c28f8c78 100644 --- a/dev/tasks/conda-recipes/arrow-cpp/meta.yaml +++ b/dev/tasks/conda-recipes/arrow-cpp/meta.yaml @@ -240,7 +240,7 @@ outputs: - numpy - python - setuptools 
- - setuptools_scm <8.0.0 + - setuptools_scm run: - {{ pin_subpackage('libarrow', exact=True) }} - {{ pin_compatible('numpy') }} @@ -322,7 +322,7 @@ outputs: - numpy - python - setuptools - - setuptools_scm <8.0.0 + - setuptools_scm run: - {{ pin_subpackage('pyarrow', exact=True) }} - python diff --git a/python/pyproject.toml b/python/pyproject.toml index 9079618ad1c7d..1588e690a7247 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -24,7 +24,7 @@ requires = [ # continue using oldest-support-numpy. "oldest-supported-numpy>=0.14; python_version<'3.9'", "numpy>=1.25; python_version>='3.9'", - "setuptools_scm < 8.0.0", + "setuptools_scm", "setuptools >= 40.1.0", "wheel" ] diff --git a/python/requirements-build.txt b/python/requirements-build.txt index e1372e807f88d..87dcc148ad161 100644 --- a/python/requirements-build.txt +++ b/python/requirements-build.txt @@ -1,5 +1,5 @@ cython>=0.29.31 oldest-supported-numpy>=0.14; python_version<'3.9' numpy>=1.25; python_version>='3.9' -setuptools_scm<8.0.0 +setuptools_scm setuptools>=38.6.0 diff --git a/python/requirements-wheel-build.txt b/python/requirements-wheel-build.txt index 044f9de5f8214..9920a38a4e288 100644 --- a/python/requirements-wheel-build.txt +++ b/python/requirements-wheel-build.txt @@ -1,6 +1,6 @@ cython>=0.29.31 oldest-supported-numpy>=0.14; python_version<'3.9' numpy>=1.25; python_version>='3.9' -setuptools_scm<8.0.0 +setuptools_scm setuptools>=58 wheel diff --git a/python/setup.py b/python/setup.py index 098d75a3186af..423de708e8813 100755 --- a/python/setup.py +++ b/python/setup.py @@ -492,7 +492,7 @@ def has_ext_modules(foo): 'pyarrow/_generated_version.py'), 'version_scheme': guess_next_dev_version }, - setup_requires=['setuptools_scm < 8.0.0', 'cython >= 0.29.31'] + setup_requires, + setup_requires=['setuptools_scm', 'cython >= 0.29.31'] + setup_requires, install_requires=install_requires, tests_require=['pytest', 'pandas', 'hypothesis'], python_requires='>=3.8', From 
a690088193711447aa4d526f2257027f9a459efa Mon Sep 17 00:00:00 2001 From: wayne Date: Tue, 20 Feb 2024 08:38:06 -0700 Subject: [PATCH 27/46] GH-40097: [Go][FlightRPC] Enable disabling TLS (#40098) See https://github.com/apache/arrow/issues/40097 for more in-depth description about the problem that led me to file this PR. ### Rationale for this change Because it's annoying to not be able to connect to a non-TLS flightsql endpoint in my development environment just because my development environment happens to still use token authentication. ### What changes are included in this PR? Thread the flightsql `DriverConfig.TLSEnabled` parameter into the `grpcCredentials` type so that `grpcCredentials.RequireTransportSecurity` can return false if TLS is not enabled on the driver config. One thing that occurred to me about the `DriverConfig.TLSEnabled` field is that its semantics seem very mildly dangerous since golang `bool` types are `false` by default and golang doesn't require fields on structs to be explicitly initialized. It seems to me that `DriverConfig.TLSDisabled` would be better (semantically speaking) because then the API user doesn't have to explicitly enable TLS. But I suppose it's probably undesirable to change the name of a public field on a public type. ### Are these changes tested? I haven't written any tests, mostly because there weren't already any tests for the `grpcCredentials` type but I have manually verified this fixes the problem I described in https://github.com/apache/arrow/issues/40097 by rebuilding my tool and running it against the non-TLS listening thing in my development environment. ### Are there any user-facing changes? 
* Closes: #40097 Authored-by: wayne warren Signed-off-by: Matt Topol --- go/arrow/flight/flightsql/driver/driver.go | 9 +++++---- go/arrow/flight/flightsql/driver/utils.go | 11 ++++++----- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/go/arrow/flight/flightsql/driver/driver.go b/go/arrow/flight/flightsql/driver/driver.go index 852a97fb4d3ca..65068048ab3d8 100644 --- a/go/arrow/flight/flightsql/driver/driver.go +++ b/go/arrow/flight/flightsql/driver/driver.go @@ -364,10 +364,11 @@ func (c *Connector) Configure(config *DriverConfig) error { // Set authentication credentials rpcCreds := grpcCredentials{ - username: config.Username, - password: config.Password, - token: config.Token, - params: config.Params, + username: config.Username, + password: config.Password, + token: config.Token, + params: config.Params, + tlsEnabled: config.TLSEnabled, } c.options = append(c.options, grpc.WithPerRPCCredentials(rpcCreds)) diff --git a/go/arrow/flight/flightsql/driver/utils.go b/go/arrow/flight/flightsql/driver/utils.go index f7bd2a2e02113..a99c045e2ed02 100644 --- a/go/arrow/flight/flightsql/driver/utils.go +++ b/go/arrow/flight/flightsql/driver/utils.go @@ -27,10 +27,11 @@ import ( // *** GRPC helpers *** type grpcCredentials struct { - username string - password string - token string - params map[string]string + username string + password string + token string + params map[string]string + tlsEnabled bool } func (g grpcCredentials) GetRequestMetadata(ctx context.Context, uri ...string) (map[string]string, error) { @@ -53,7 +54,7 @@ func (g grpcCredentials) GetRequestMetadata(ctx context.Context, uri ...string) } func (g grpcCredentials) RequireTransportSecurity() bool { - return g.token != "" || g.username != "" + return g.tlsEnabled && (g.token != "" || g.username != "") } // *** Type conversions *** From 47f15b07080d62cd912bfbfd5d067cf70dfe6960 Mon Sep 17 00:00:00 2001 From: Yan Zhou Date: Wed, 21 Feb 2024 00:27:23 +0800 Subject: [PATCH 28/46] GH-40113 
[Go][Parquet] New RegisterCodec function (#40114) This is to allow addition/overwrite of custom codec implementation This allows other modules to provide alternative implementations for the compression algorithms, such as using libdeflate for Gzip, or CGO version of ZSTD. In addition, it allows others to supply codecs that cannot be easily supported by this library such as LZO due to license reasons or LZ4. ### Rationale for this change See #40113 ### What changes are included in this PR? A new RegisterCodec function added ### Are these changes tested? yes ### Are there any user-facing changes? It's an addition more targeted towards library writers. * Closes: #40113 Authored-by: Yan Zhou Signed-off-by: Matt Topol --- go/parquet/compress/brotli.go | 2 +- go/parquet/compress/compress.go | 20 ++++++++++++++++++++ go/parquet/compress/gzip.go | 2 +- go/parquet/compress/snappy.go | 2 +- go/parquet/compress/zstd.go | 2 +- 5 files changed, 24 insertions(+), 4 deletions(-) diff --git a/go/parquet/compress/brotli.go b/go/parquet/compress/brotli.go index 8a7e92a1403c3..3b1575a70cfc8 100644 --- a/go/parquet/compress/brotli.go +++ b/go/parquet/compress/brotli.go @@ -110,5 +110,5 @@ func (brotliCodec) NewWriterLevel(w io.Writer, level int) (io.WriteCloser, error } func init() { - codecs[Codecs.Brotli] = brotliCodec{} + RegisterCodec(Codecs.Brotli, brotliCodec{}) } diff --git a/go/parquet/compress/compress.go b/go/parquet/compress/compress.go index dc45b6ee9311f..2798defca9444 100644 --- a/go/parquet/compress/compress.go +++ b/go/parquet/compress/compress.go @@ -92,6 +92,26 @@ type Codec interface { var codecs = map[Compression]Codec{} +// RegisterCodec adds or overrides a codec implementation for a given compression algorithm. +// The intended use case is within the init() section of a package. 
For example, +// +// // inside a custom codec package, say czstd +// +// func init() { +// RegisterCodec(compress.Codecs.Zstd, czstdCodec{}) +// } +// +// type czstdCodec struct{} // implementing Codec interface using CGO based ZSTD wrapper +// +// And user of the custom codec can import the above package like below, +// +// package main +// +// import _ "package/path/to/czstd" +func RegisterCodec(compression Compression, codec Codec) { + codecs[compression] = codec +} + type nocodec struct{} func (nocodec) NewReader(r io.Reader) io.ReadCloser { diff --git a/go/parquet/compress/gzip.go b/go/parquet/compress/gzip.go index 31f1729e9b3af..4b43f8e906599 100644 --- a/go/parquet/compress/gzip.go +++ b/go/parquet/compress/gzip.go @@ -93,5 +93,5 @@ func (gzipCodec) NewWriterLevel(w io.Writer, level int) (io.WriteCloser, error) } func init() { - codecs[Codecs.Gzip] = gzipCodec{} + RegisterCodec(Codecs.Gzip, gzipCodec{}) } diff --git a/go/parquet/compress/snappy.go b/go/parquet/compress/snappy.go index b7fa1142c3a6c..5c82a2c8dc33e 100644 --- a/go/parquet/compress/snappy.go +++ b/go/parquet/compress/snappy.go @@ -57,5 +57,5 @@ func (s snappyCodec) NewWriterLevel(w io.Writer, _ int) (io.WriteCloser, error) } func init() { - codecs[Codecs.Snappy] = snappyCodec{} + RegisterCodec(Codecs.Snappy, snappyCodec{}) } diff --git a/go/parquet/compress/zstd.go b/go/parquet/compress/zstd.go index 02ffd2eae568a..be3fb507262d4 100644 --- a/go/parquet/compress/zstd.go +++ b/go/parquet/compress/zstd.go @@ -108,5 +108,5 @@ func (zstdCodec) CompressBound(len int64) int64 { } func init() { - codecs[Codecs.Zstd] = zstdCodec{} + RegisterCodec(Codecs.Zstd, zstdCodec{}) } From 09889330a296f6767734b42381693e5602419f36 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Tue, 20 Feb 2024 18:37:19 +0100 Subject: [PATCH 29/46] MINOR: [Dev][Archery] Reinstate version constraint on setuptools_scm for comment bot (#40162) The comment bot depends on an internal setuptools_scm API that was changed in 
setuptools_scm 8. We therefore need to reinstate the Archery version constraint that was removed in https://github.com/apache/arrow/pull/40150 See example failure at https://github.com/apache/arrow/actions/runs/7976567301/job/21777437575 Authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- dev/archery/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/archery/setup.py b/dev/archery/setup.py index 02a8b34299b1f..2ecc72e04e8aa 100755 --- a/dev/archery/setup.py +++ b/dev/archery/setup.py @@ -30,7 +30,7 @@ extras = { 'benchmark': ['pandas'], 'crossbow': ['github3.py', jinja_req, 'pygit2>=1.6.0', 'requests', - 'ruamel.yaml', 'setuptools_scm'], + 'ruamel.yaml', 'setuptools_scm<8.0.0'], 'crossbow-upload': ['github3.py', jinja_req, 'ruamel.yaml', 'setuptools_scm'], 'docker': ['ruamel.yaml', 'python-dotenv'], From 29d2b168a5c43c2cc0bed65d93d08d83fc1ca80f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20G=C3=B3rny?= Date: Tue, 20 Feb 2024 22:01:59 +0100 Subject: [PATCH 30/46] GH-40153: [Python] Fix OverflowError in foreign_buffer on 32-bit platforms (#40158) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use `uintptr_t` rather than `intptr_t` to fix `OverflowError`, visible e.g. when running `tests/interchange/test_conversion.py` tests on 32-bit platforms. ### Rationale for this change This fixes the `OverflowError`s from #40153, and makes `pyarrow/tests/interchange/` all pass on 32-bit x86. ### What changes are included in this PR? - change the type used to store pointer from `intptr_t` to `uintptr_t` to provide coverage for pointers above `0x80000000`. ### Are these changes tested? These changes are covered by the tests in `pyarrow/tests/interchange`. ### Are there any user-facing changes? It fixes `OverflowError` that can be triggered by working with pandas data types, possibly more (though I'm not sure if this qualifies as a "crash"). 
* Closes: #40153 Authored-by: Michał Górny Signed-off-by: Antoine Pitrou --- python/pyarrow/io.pxi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/io.pxi b/python/pyarrow/io.pxi index 1897e76efc2a0..b57980b3d68fd 100644 --- a/python/pyarrow/io.pxi +++ b/python/pyarrow/io.pxi @@ -1987,7 +1987,7 @@ def foreign_buffer(address, size, base=None): Object that owns the referenced memory. """ cdef: - intptr_t c_addr = address + uintptr_t c_addr = address int64_t c_size = size shared_ptr[CBuffer] buf From 11ef68d7dc2e15c81dfc75f4304070021ad42a1e Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Tue, 20 Feb 2024 22:08:58 +0100 Subject: [PATCH 31/46] GH-40159: [Python][CI] Add 32-bit Debian build on Crossbow (#40164) ### What changes are included in this PR? Add a Debian-based i386 test build for Python, similar to the existing one for C++. ### Are these changes tested? Yes. The test suite step in the new build will fail until GH-40153 is entirely fixed. ### Are there any user-facing changes? No. 
* Closes: #40159 Authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- dev/tasks/tasks.yml | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index c2321453052dc..cfc333c6b22f5 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -1192,7 +1192,7 @@ tasks: PYTHON: "3.10" image: conda-python-cython2 - test-debian-11-python-3: + test-debian-11-python-3-amd64: ci: azure template: docker-tests/azure.linux.yml params: @@ -1200,6 +1200,16 @@ tasks: DEBIAN: 11 image: debian-python + test-debian-11-python-3-i386: + ci: github + template: docker-tests/github.linux.yml + params: + env: + ARCH: i386 + DEBIAN: 11 + flags: "-e ARROW_S3=OFF -e ARROW_GANDIVA=OFF" + image: debian-python + test-ubuntu-20.04-python-3: ci: azure template: docker-tests/azure.linux.yml From aa6b39859261ab2f116d4d971127c53a8a5be2a5 Mon Sep 17 00:00:00 2001 From: David Li Date: Tue, 20 Feb 2024 19:01:25 -0500 Subject: [PATCH 32/46] GH-38573: [Java][FlightRPC] Try all locations in JDBC driver (#40104) ### Rationale for this change This brings the JDBC driver up to par with other Flight SQL clients. ### What changes are included in this PR? Try multiple locations for the Flight SQL driver. ### Are these changes tested? Yes ### Are there any user-facing changes? 
No * Closes: #38573 Authored-by: David Li Signed-off-by: David Li --- .../org/apache/arrow/flight/FlightClient.java | 3 +- .../client/ArrowFlightSqlClientHandler.java | 53 +++++++---- .../jdbc/utils/FlightEndpointDataQueue.java | 9 +- .../arrow/driver/jdbc/ResultSetTest.java | 91 ++++++++++++++++++- 4 files changed, 137 insertions(+), 19 deletions(-) diff --git a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightClient.java b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightClient.java index 980a762e397f9..49f9af4ebfbb7 100644 --- a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightClient.java +++ b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightClient.java @@ -909,7 +909,8 @@ public FlightClient build() { builder .maxTraceEvents(MAX_CHANNEL_TRACE_EVENTS) - .maxInboundMessageSize(maxInboundMessageSize); + .maxInboundMessageSize(maxInboundMessageSize) + .maxInboundMetadataSize(maxInboundMessageSize); return new FlightClient(allocator, builder.build(), middleware); } } diff --git a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/client/ArrowFlightSqlClientHandler.java b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/client/ArrowFlightSqlClientHandler.java index 234820bd41823..1b03f927d7fc6 100644 --- a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/client/ArrowFlightSqlClientHandler.java +++ b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/client/ArrowFlightSqlClientHandler.java @@ -116,26 +116,47 @@ public List getStreams(final FlightInfo flightInfo) sqlClient.getStream(endpoint.getTicket(), getOptions()), null)); } else { // Clone the builder and then set the new endpoint on it. - // GH-38573: This code currently only tries the first Location and treats a failure as fatal. - // This should be changed to try other Locations that are available. 
- + // GH-38574: Currently a new FlightClient will be made for each partition that returns a non-empty Location // then disposed of. It may be better to cache clients because a server may report the same Locations. // It would also be good to identify when the reported location is the same as the original connection's // Location and skip creating a FlightClient in that scenario. - final URI endpointUri = endpoint.getLocations().get(0).getUri(); - final Builder builderForEndpoint = new Builder(ArrowFlightSqlClientHandler.this.builder) - .withHost(endpointUri.getHost()) - .withPort(endpointUri.getPort()) - .withEncryption(endpointUri.getScheme().equals(LocationSchemes.GRPC_TLS)); - - final ArrowFlightSqlClientHandler endpointHandler = builderForEndpoint.build(); - try { - endpoints.add(new CloseableEndpointStreamPair( - endpointHandler.sqlClient.getStream(endpoint.getTicket(), - endpointHandler.getOptions()), endpointHandler.sqlClient)); - } catch (Exception ex) { - AutoCloseables.close(endpointHandler); + List exceptions = new ArrayList<>(); + CloseableEndpointStreamPair stream = null; + for (Location location : endpoint.getLocations()) { + final URI endpointUri = location.getUri(); + final Builder builderForEndpoint = new Builder(ArrowFlightSqlClientHandler.this.builder) + .withHost(endpointUri.getHost()) + .withPort(endpointUri.getPort()) + .withEncryption(endpointUri.getScheme().equals(LocationSchemes.GRPC_TLS)); + + ArrowFlightSqlClientHandler endpointHandler = null; + try { + endpointHandler = builderForEndpoint.build(); + stream = new CloseableEndpointStreamPair( + endpointHandler.sqlClient.getStream(endpoint.getTicket(), + endpointHandler.getOptions()), endpointHandler.sqlClient); + // Make sure we actually get data from the server + stream.getStream().getSchema(); + } catch (Exception ex) { + if (endpointHandler != null) { + AutoCloseables.close(endpointHandler); + } + exceptions.add(ex); + continue; + } + break; + } + if (stream != null) { + 
endpoints.add(stream); + } else if (exceptions.isEmpty()) { + // This should never happen... + throw new IllegalStateException("Could not connect to endpoint and no errors occurred"); + } else { + Exception ex = exceptions.remove(0); + while (!exceptions.isEmpty()) { + ex.addSuppressed(exceptions.remove(exceptions.size() - 1)); + } throw ex; } } diff --git a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/FlightEndpointDataQueue.java b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/FlightEndpointDataQueue.java index 1198d89c40aef..d617026c682d2 100644 --- a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/FlightEndpointDataQueue.java +++ b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/FlightEndpointDataQueue.java @@ -108,7 +108,14 @@ private CloseableEndpointStreamPair next(final EndpointStreamSupplier endpointSt if (endpoint != null) { return endpoint; } - } catch (final ExecutionException | InterruptedException | CancellationException e) { + } catch (final ExecutionException e) { + // Unwrap one layer + final Throwable cause = e.getCause(); + if (cause instanceof FlightRuntimeException) { + throw (FlightRuntimeException) cause; + } + throw AvaticaConnection.HELPER.wrap(e.getMessage(), e); + } catch (InterruptedException | CancellationException e) { throw AvaticaConnection.HELPER.wrap(e.getMessage(), e); } } diff --git a/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/ResultSetTest.java b/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/ResultSetTest.java index 0e3e015a04636..680803318e3a2 100644 --- a/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/ResultSetTest.java +++ b/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/ResultSetTest.java @@ -39,6 +39,7 @@ import java.sql.Statement; import java.util.ArrayList; import 
java.util.Arrays; +import java.util.Collections; import java.util.HashSet; import java.util.List; import java.util.Random; @@ -49,7 +50,10 @@ import org.apache.arrow.driver.jdbc.utils.PartitionedFlightSqlProducer; import org.apache.arrow.flight.FlightEndpoint; import org.apache.arrow.flight.FlightProducer; +import org.apache.arrow.flight.FlightRuntimeException; import org.apache.arrow.flight.FlightServer; +import org.apache.arrow.flight.FlightStatusCode; +import org.apache.arrow.flight.Location; import org.apache.arrow.flight.Ticket; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; @@ -63,6 +67,7 @@ import org.junit.ClassRule; import org.junit.Rule; import org.junit.Test; +import org.junit.jupiter.api.Assertions; import org.junit.rules.ErrorCollector; import com.google.common.collect.ImmutableSet; @@ -351,7 +356,7 @@ public void testShouldInterruptFlightStreamsIfQueryIsCancelledMidProcessingForTi .toString(), anyOf(is(format("Error while executing SQL \"%s\": Query canceled", query)), allOf(containsString(format("Error while executing SQL \"%s\"", query)), - containsString("CANCELLED")))); + anyOf(containsString("CANCELLED"), containsString("Cancelling"))))); } } @@ -455,6 +460,90 @@ allocator, forGrpcInsecure("localhost", 0), rootProducer) } } + @Test + public void testPartitionedFlightServerIgnoreFailure() throws Exception { + final Schema schema = new Schema( + Collections.singletonList(Field.nullablePrimitive("int_column", new ArrowType.Int(32, true)))); + try (BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE)) { + final FlightEndpoint firstEndpoint = + new FlightEndpoint(new Ticket("first".getBytes(StandardCharsets.UTF_8)), + Location.forGrpcInsecure("127.0.0.2", 1234), + Location.forGrpcInsecure("127.0.0.3", 1234)); + + try (final PartitionedFlightSqlProducer rootProducer = new PartitionedFlightSqlProducer( + schema, firstEndpoint); + FlightServer rootServer = FlightServer.builder( + allocator, 
forGrpcInsecure("localhost", 0), rootProducer) + .build() + .start(); + Connection newConnection = DriverManager.getConnection(String.format( + "jdbc:arrow-flight-sql://%s:%d/?useEncryption=false", + rootServer.getLocation().getUri().getHost(), rootServer.getPort())); + Statement newStatement = newConnection.createStatement()) { + final SQLException e = Assertions.assertThrows(SQLException.class, () -> { + ResultSet result = newStatement.executeQuery("Select partitioned_data"); + while (result.next()) { + } + }); + final Throwable cause = e.getCause(); + Assertions.assertTrue(cause instanceof FlightRuntimeException); + final FlightRuntimeException fre = (FlightRuntimeException) cause; + Assertions.assertEquals(FlightStatusCode.UNAVAILABLE, fre.status().code()); + } + } + } + + @Test + public void testPartitionedFlightServerAllFailure() throws Exception { + // Arrange + final Schema schema = new Schema( + Collections.singletonList(Field.nullablePrimitive("int_column", new ArrowType.Int(32, true)))); + try (BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE); + VectorSchemaRoot firstPartition = VectorSchemaRoot.create(schema, allocator)) { + firstPartition.setRowCount(1); + ((IntVector) firstPartition.getVector(0)).set(0, 1); + + // Construct the data-only nodes first. + FlightProducer firstProducer = new PartitionedFlightSqlProducer.DataOnlyFlightSqlProducer( + new Ticket("first".getBytes(StandardCharsets.UTF_8)), firstPartition); + + final FlightServer.Builder firstBuilder = FlightServer.builder( + allocator, forGrpcInsecure("localhost", 0), firstProducer); + + // Run the data-only nodes so that we can get the Locations they are running at. 
+ try (FlightServer firstServer = firstBuilder.build()) { + firstServer.start(); + final Location badLocation = Location.forGrpcInsecure("127.0.0.2", 1234); + final FlightEndpoint firstEndpoint = + new FlightEndpoint(new Ticket("first".getBytes(StandardCharsets.UTF_8)), + badLocation, firstServer.getLocation()); + + // Finally start the root node. + try (final PartitionedFlightSqlProducer rootProducer = new PartitionedFlightSqlProducer( + schema, firstEndpoint); + FlightServer rootServer = FlightServer.builder( + allocator, forGrpcInsecure("localhost", 0), rootProducer) + .build() + .start(); + Connection newConnection = DriverManager.getConnection(String.format( + "jdbc:arrow-flight-sql://%s:%d/?useEncryption=false", + rootServer.getLocation().getUri().getHost(), rootServer.getPort())); + Statement newStatement = newConnection.createStatement(); + // Act + ResultSet result = newStatement.executeQuery("Select partitioned_data")) { + List resultData = new ArrayList<>(); + while (result.next()) { + resultData.add(result.getInt(1)); + } + + // Assert + assertEquals(firstPartition.getRowCount(), resultData.size()); + assertTrue(resultData.contains(((IntVector) firstPartition.getVector(0)).get(0))); + } + } + } + } + @Test public void testShouldRunSelectQueryWithEmptyVectorsEmbedded() throws Exception { try (Statement statement = connection.createStatement(); From a2d072929fea956a77775671e2eaf20f9d0ed5fa Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Tue, 20 Feb 2024 22:04:21 -0300 Subject: [PATCH 33/46] GH-40074: [C++][FS][Azure] Implement `DeleteFile()` for flat-namespace storage accounts (#40075) ### Rationale for this change It was not implemented yet. ### What changes are included in this PR? 
- An implementation of `DeleteFile()` that is specialized to storage accounts that don't have HNS support enabled - This fixes a semantic issue: deleting a file should not delete the parent directory when the file deleted was the last one - Increased test coverage - Fix of a bug in the version that deletes files in HNS-enabled accounts (we shouldn't let `DeleteFile` delete directories even if they are empty) ### Are these changes tested? Yes. Tests were re-written and moved to `TestAzureFileSystemOnAllScenarios`. * Closes: #40074 Lead-authored-by: Felipe Oliveira Carvalho Co-authored-by: jerry.adair Signed-off-by: Felipe Oliveira Carvalho --- cpp/src/arrow/filesystem/azurefs.cc | 180 +++++++++++++++++++--- cpp/src/arrow/filesystem/azurefs_test.cc | 184 ++++++++++++++++------- 2 files changed, 283 insertions(+), 81 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 23af67a33d688..de7cdba245ada 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -1085,7 +1085,11 @@ class LeaseGuard { return Status::OK(); } - /// \brief Break the lease before deleting or renaming the resource. + /// \brief Break the lease before deleting or renaming the resource via the + /// DataLakeFileSystemClient API. + /// + /// NOTE: When using the Blobs API, this is not necessary -- you can release a + /// lease on a path after it's deleted with a lease on it. 
/// /// Calling this is recommended when the resource for which the lease was acquired is /// about to be deleted as there is no way of releasing the lease after that, we can @@ -1926,26 +1930,6 @@ class AzureFileSystem::Impl { } } - Status DeleteFile(const AzureLocation& location) { - RETURN_NOT_OK(ValidateFileLocation(location)); - auto file_client = datalake_service_client_->GetFileSystemClient(location.container) - .GetFileClient(location.path); - try { - auto response = file_client.Delete(); - // Only the "*IfExists" functions ever set Deleted to false. - // All the others either succeed or throw an exception. - DCHECK(response.Value.Deleted); - } catch (const Storage::StorageException& exception) { - if (exception.ErrorCode == "FilesystemNotFound" || - exception.ErrorCode == "PathNotFound") { - return PathNotFound(location); - } - return ExceptionToStatus(exception, "Failed to delete a file: ", location.path, - ": ", file_client.GetUrl()); - } - return Status::OK(); - } - private: /// \brief Create a BlobLeaseClient and acquire a lease on the container. /// @@ -1994,7 +1978,7 @@ class AzureFileSystem::Impl { /// optional (nullptr denotes blob not found) Result> AcquireBlobLease( const AzureLocation& location, std::chrono::seconds lease_duration, - bool allow_missing = false, bool retry_allowed = true) { + bool allow_missing, bool retry_allowed = true) { DCHECK(!location.container.empty() && !location.path.empty()); auto path = std::string{internal::RemoveTrailingSlash(location.path)}; auto blob_client = GetBlobClient(location.container, std::move(path)); @@ -2057,6 +2041,131 @@ class AzureFileSystem::Impl { static constexpr auto kTimeNeededForFileOrDirectoryRename = std::chrono::seconds{3}; public: + /// \pre location.container is not empty. + /// \pre location.path is not empty. 
+ Status DeleteFileOnFileSystem(const DataLake::DataLakeFileSystemClient& adlfs_client, + const AzureLocation& location, + bool require_file_to_exist) { + DCHECK(!location.container.empty()); + DCHECK(!location.path.empty()); + auto path_no_trailing_slash = + std::string{internal::RemoveTrailingSlash(location.path)}; + auto file_client = adlfs_client.GetFileClient(path_no_trailing_slash); + try { + // This is necessary to avoid deletion of directories via DeleteFile. + auto properties = file_client.GetProperties(); + if (properties.Value.IsDirectory) { + return internal::NotAFile(location.all); + } + if (internal::HasTrailingSlash(location.path)) { + return internal::NotADir(location.all); + } + auto response = file_client.Delete(); + // Only the "*IfExists" functions ever set Deleted to false. + // All the others either succeed or throw an exception. + DCHECK(response.Value.Deleted); + } catch (const Storage::StorageException& exception) { + if (exception.StatusCode == Http::HttpStatusCode::NotFound) { + // ErrorCode can be "FilesystemNotFound", "PathNotFound"... + if (require_file_to_exist) { + return PathNotFound(location); + } + return Status::OK(); + } + return ExceptionToStatus(exception, "Failed to delete a file: ", location.path, + ": ", file_client.GetUrl()); + } + return Status::OK(); + } + + /// \pre location.container is not empty. + /// \pre location.path is not empty. + Status DeleteFileOnContainer(const Blobs::BlobContainerClient& container_client, + const AzureLocation& location, bool require_file_to_exist, + const char* operation) { + DCHECK(!location.container.empty()); + DCHECK(!location.path.empty()); + constexpr auto kFileBlobLeaseTime = std::chrono::seconds{15}; + + // When it's known that the blob doesn't exist as a file, check if it exists as a + // directory to generate the appropriate error message. 
+ auto check_if_location_exists_as_dir = [&]() -> Status { + auto no_trailing_slash = location; + no_trailing_slash.path = internal::RemoveTrailingSlash(location.path); + no_trailing_slash.all = internal::RemoveTrailingSlash(location.all); + ARROW_ASSIGN_OR_RAISE(auto file_info, + GetFileInfo(container_client, no_trailing_slash)); + if (file_info.type() == FileType::NotFound) { + return require_file_to_exist ? PathNotFound(location) : Status::OK(); + } + if (file_info.type() == FileType::Directory) { + return internal::NotAFile(location.all); + } + return internal::HasTrailingSlash(location.path) ? internal::NotADir(location.all) + : internal::NotAFile(location.all); + }; + + // Paths ending with trailing slashes are never leading to a deletion, + // but the correct error message requires a check of the path. + if (internal::HasTrailingSlash(location.path)) { + return check_if_location_exists_as_dir(); + } + + // If the parent directory of a file is not the container itself, there is a + // risk that deleting the file also deletes the *implied directory* -- the + // directory that is implied by the existence of the file path. + // + // In this case, we must ensure that the deletion is not semantically + // equivalent to also deleting the directory. This is done by ensuring the + // directory marker blob exists before the file is deleted. + std::optional file_blob_lease_guard; + const auto parent = location.parent(); + if (!parent.path.empty()) { + // We have to check the existence of the file before checking the + // existence of the parent directory marker, so we acquire a lease on the + // file first. + ARROW_ASSIGN_OR_RAISE(auto file_blob_lease_client, + AcquireBlobLease(location, kFileBlobLeaseTime, + /*allow_missing=*/true)); + if (file_blob_lease_client) { + file_blob_lease_guard.emplace(std::move(file_blob_lease_client), + kFileBlobLeaseTime); + // Ensure the empty directory marker blob of the parent exists before the file is + // deleted. 
+        //
+        // There is no need to hold a lease on the directory marker because if
+        // a concurrent client deletes the directory marker right after we
+        // create it, the file deletion itself won't be the cause of the directory
+        // deletion. Additionally, the fact that a lease is held on the blob path
+        // semantically preserves the directory -- its existence is implied
+        // until the blob representing the file is deleted -- even if another
+        // client deletes the directory marker.
+        RETURN_NOT_OK(EnsureEmptyDirExists(container_client, parent, operation));
+      } else {
+        return check_if_location_exists_as_dir();
+      }
+    }
+
+    auto blob_client = container_client.GetBlobClient(location.path);
+    Blobs::DeleteBlobOptions options;
+    if (file_blob_lease_guard) {
+      options.AccessConditions.LeaseId = file_blob_lease_guard->LeaseId();
+    }
+    try {
+      auto response = blob_client.Delete(options);
+      // Only the "*IfExists" functions ever set Deleted to false.
+      // All the others either succeed or throw an exception.
+      DCHECK(response.Value.Deleted);
+    } catch (const Storage::StorageException& exception) {
+      if (exception.StatusCode == Http::HttpStatusCode::NotFound) {
+        return check_if_location_exists_as_dir();
+      }
+      return ExceptionToStatus(exception, "Failed to delete a file: ", location.all, ": ",
+                               blob_client.GetUrl());
+    }
+    return Status::OK();
+  }
+
   /// The conditions for a successful container rename are derived from the
   /// conditions for a successful `Move("/$src.container", "/$dest.container")`.
   /// The numbers here match the list in `Move`.
@@ -2238,7 +2347,8 @@ class AzureFileSystem::Impl {
     const auto dest_path = std::string{internal::RemoveTrailingSlash(dest.path)};
 
     // Ensure that src exists and, if path has a trailing slash, that it's a directory.
- ARROW_ASSIGN_OR_RAISE(auto src_lease_client, AcquireBlobLease(src, kLeaseDuration)); + ARROW_ASSIGN_OR_RAISE(auto src_lease_client, + AcquireBlobLease(src, kLeaseDuration, /*allow_missing=*/false)); LeaseGuard src_lease_guard{std::move(src_lease_client), kLeaseDuration}; // It might be necessary to check src is a directory 0-3 times in this function, // so we use a lazy evaluation function to avoid redundant calls to GetFileInfo(). @@ -2551,7 +2661,29 @@ Status AzureFileSystem::DeleteRootDirContents() { Status AzureFileSystem::DeleteFile(const std::string& path) { ARROW_ASSIGN_OR_RAISE(auto location, AzureLocation::FromString(path)); - return impl_->DeleteFile(location); + if (location.container.empty()) { + return Status::Invalid("DeleteFile requires a non-empty path."); + } + auto container_client = impl_->GetBlobContainerClient(location.container); + if (location.path.empty()) { + // Container paths (locations w/o path) are either not found or represent directories. + ARROW_ASSIGN_OR_RAISE(auto container_info, + GetContainerPropsAsFileInfo(location, container_client)); + return container_info.IsDirectory() ? 
NotAFile(location) : PathNotFound(location); + } + auto adlfs_client = impl_->GetFileSystemClient(location.container); + ARROW_ASSIGN_OR_RAISE(auto hns_support, + impl_->HierarchicalNamespaceSupport(adlfs_client)); + if (hns_support == HNSSupport::kContainerNotFound) { + return PathNotFound(location); + } + if (hns_support == HNSSupport::kEnabled) { + return impl_->DeleteFileOnFileSystem(adlfs_client, location, + /*require_file_to_exist=*/true); + } + return impl_->DeleteFileOnContainer(container_client, location, + /*require_file_to_exist=*/true, + /*operation=*/"DeleteFile"); } Status AzureFileSystem::Move(const std::string& src, const std::string& dest) { diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index e6bd80d1d2508..7f5cd247a8d35 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -641,6 +641,18 @@ class TestAzureFileSystem : public ::testing::Test { #endif } + static bool WithErrno(const Status& status, int expected_errno) { + auto* detail = status.detail().get(); + return detail && + arrow::internal::ErrnoFromStatusDetail(*detail).value_or(-1) == expected_errno; + } + +#define ASSERT_RAISES_ERRNO(expr, expected_errno) \ + for (::arrow::Status _st = ::arrow::internal::GenericToStatus((expr)); \ + !WithErrno(_st, (expected_errno));) \ + FAIL() << "'" ARROW_STRINGIFY(expr) "' did not fail with errno=" << #expected_errno \ + << ": " << _st.ToString() + // Tests that are called from more than one implementation of TestAzureFileSystem void TestDetectHierarchicalNamespace(bool trip_up_azurite); @@ -935,6 +947,106 @@ class TestAzureFileSystem : public ::testing::Test { ASSERT_RAISES(IOError, fs()->DeleteDirContents(directory_path, false)); } + void TestDeleteFileAtRoot() { + ASSERT_RAISES_ERRNO(fs()->DeleteFile("file0"), ENOENT); + ASSERT_RAISES_ERRNO(fs()->DeleteFile("file1/"), ENOENT); + const auto container_name = PreexistingData::RandomContainerName(rng_); + 
if (WithHierarchicalNamespace()) { + ARROW_UNUSED(CreateFilesystem(container_name)); + } else { + ARROW_UNUSED(CreateContainer(container_name)); + } + arrow::fs::AssertFileInfo(fs(), container_name, FileType::Directory); + EXPECT_RAISES_WITH_MESSAGE_THAT( + IOError, ::testing::HasSubstr("Not a regular file: '" + container_name + "'"), + fs()->DeleteFile(container_name)); + EXPECT_RAISES_WITH_MESSAGE_THAT( + IOError, ::testing::HasSubstr("Not a regular file: '" + container_name + "/'"), + fs()->DeleteFile(container_name + "/")); + } + + void TestDeleteFileAtContainerRoot() { + auto data = SetUpPreexistingData(); + + ASSERT_RAISES_ERRNO(fs()->DeleteFile(data.Path("nonexistent-path")), ENOENT); + ASSERT_RAISES_ERRNO(fs()->DeleteFile(data.Path("nonexistent-path/")), ENOENT); + + arrow::fs::AssertFileInfo(fs(), data.ObjectPath(), FileType::File); + ASSERT_OK(fs()->DeleteFile(data.ObjectPath())); + arrow::fs::AssertFileInfo(fs(), data.ObjectPath(), FileType::NotFound); + + if (WithHierarchicalNamespace()) { + auto adlfs_client = + datalake_service_client_->GetFileSystemClient(data.container_name); + CreateFile(adlfs_client, data.kObjectName, PreexistingData::kLoremIpsum); + } else { + auto container_client = CreateContainer(data.container_name); + CreateBlob(container_client, data.kObjectName, PreexistingData::kLoremIpsum); + } + arrow::fs::AssertFileInfo(fs(), data.ObjectPath(), FileType::File); + + ASSERT_RAISES_ERRNO(fs()->DeleteFile(data.ObjectPath() + "/"), ENOTDIR); + ASSERT_OK(fs()->DeleteFile(data.ObjectPath())); + arrow::fs::AssertFileInfo(fs(), data.ObjectPath(), FileType::NotFound); + } + + void TestDeleteFileAtSubdirectory(bool create_empty_dir_marker_first) { + auto data = SetUpPreexistingData(); + + auto setup_dir_file0 = [this, create_empty_dir_marker_first, &data]() { + if (WithHierarchicalNamespace()) { + ASSERT_FALSE(create_empty_dir_marker_first); + auto adlfs_client = + datalake_service_client_->GetFileSystemClient(data.container_name); + 
CreateFile(adlfs_client, "dir/file0", PreexistingData::kLoremIpsum); + } else { + auto container_client = CreateContainer(data.container_name); + if (create_empty_dir_marker_first) { + CreateBlob(container_client, "dir/", ""); + } + CreateBlob(container_client, "dir/file0", PreexistingData::kLoremIpsum); + } + }; + setup_dir_file0(); + + // Trying to delete a non-existing file in an existing directory should fail + EXPECT_RAISES_WITH_MESSAGE_THAT( + IOError, + ::testing::HasSubstr("Path does not exist '" + data.Path("dir/nonexistent-path") + + "'"), + fs()->DeleteFile(data.Path("dir/nonexistent-path"))); + EXPECT_RAISES_WITH_MESSAGE_THAT( + IOError, + ::testing::HasSubstr("Path does not exist '" + + data.Path("dir/nonexistent-path/") + "'"), + fs()->DeleteFile(data.Path("dir/nonexistent-path/"))); + + // Trying to delete the directory with DeleteFile should fail + EXPECT_RAISES_WITH_MESSAGE_THAT( + IOError, ::testing::HasSubstr("Not a regular file: '" + data.Path("dir") + "'"), + fs()->DeleteFile(data.Path("dir"))); + EXPECT_RAISES_WITH_MESSAGE_THAT( + IOError, ::testing::HasSubstr("Not a regular file: '" + data.Path("dir/") + "'"), + fs()->DeleteFile(data.Path("dir/"))); + + arrow::fs::AssertFileInfo(fs(), data.Path("dir"), FileType::Directory); + arrow::fs::AssertFileInfo(fs(), data.Path("dir/"), FileType::Directory); + arrow::fs::AssertFileInfo(fs(), data.Path("dir/file0"), FileType::File); + ASSERT_OK(fs()->DeleteFile(data.Path("dir/file0"))); + arrow::fs::AssertFileInfo(fs(), data.Path("dir"), FileType::Directory); + arrow::fs::AssertFileInfo(fs(), data.Path("dir/"), FileType::Directory); + arrow::fs::AssertFileInfo(fs(), data.Path("dir/file0"), FileType::NotFound); + + // Recreating the file on the same path gurantees leases were properly released/broken + setup_dir_file0(); + + EXPECT_RAISES_WITH_MESSAGE_THAT( + IOError, + ::testing::HasSubstr("Not a directory: '" + data.Path("dir/file0/") + "'"), + fs()->DeleteFile(data.Path("dir/file0/"))); + 
arrow::fs::AssertFileInfo(fs(), data.Path("dir/file0"), FileType::File); + } + private: using StringMatcher = ::testing::PolymorphicMatcher<::testing::internal::HasSubstrMatcher>; @@ -1092,12 +1204,6 @@ class TestAzureFileSystem : public ::testing::Test { AssertFileInfo(fs(), dest, type); } - static bool WithErrno(const Status& status, int expected_errno) { - auto* detail = status.detail().get(); - return detail && - arrow::internal::ErrnoFromStatusDetail(*detail).value_or(-1) == expected_errno; - } - std::optional MoveErrorMessageMatcher(const FileInfo& src_info, const std::string& src, const std::string& dest, @@ -1596,6 +1702,21 @@ TYPED_TEST(TestAzureFileSystemOnAllScenarios, DeleteDirContentsFailureNonexisten this->TestDeleteDirContentsFailureNonexistent(); } +TYPED_TEST(TestAzureFileSystemOnAllScenarios, DeleteFileAtRoot) { + this->TestDeleteFileAtRoot(); +} + +TYPED_TEST(TestAzureFileSystemOnAllScenarios, DeleteFileAtContainerRoot) { + this->TestDeleteFileAtContainerRoot(); +} + +TYPED_TEST(TestAzureFileSystemOnAllScenarios, DeleteFileAtSubdirectory) { + this->TestDeleteFileAtSubdirectory(/*create_empty_dir_marker_first=*/false); + if (!this->WithHierarchicalNamespace()) { + this->TestDeleteFileAtSubdirectory(/*create_empty_dir_marker_first=*/true); + } +} + TYPED_TEST(TestAzureFileSystemOnAllScenarios, RenameContainer) { this->TestRenameContainer(); } @@ -1884,57 +2005,6 @@ TEST_F(TestAzuriteFileSystem, DeleteDirContentsFailureNonexistent) { this->TestDeleteDirContentsFailureNonexistent(); } -TEST_F(TestAzuriteFileSystem, DeleteFileSuccess) { - const auto container_name = PreexistingData::RandomContainerName(rng_); - const auto file_name = ConcatAbstractPath(container_name, "filename"); - if (WithHierarchicalNamespace()) { - auto adlfs_client = CreateFilesystem(container_name); - CreateFile(adlfs_client, "filename", "data"); - } else { - auto container = CreateContainer(container_name); - CreateBlob(container, "filename", "data"); - } - 
arrow::fs::AssertFileInfo(fs(), file_name, FileType::File); - ASSERT_OK(fs()->DeleteFile(file_name)); - arrow::fs::AssertFileInfo(fs(), file_name, FileType::NotFound); -} - -TEST_F(TestAzuriteFileSystem, DeleteFileFailureNonexistent) { - const auto container_name = PreexistingData::RandomContainerName(rng_); - const auto nonexistent_file_name = ConcatAbstractPath(container_name, "nonexistent"); - if (WithHierarchicalNamespace()) { - ARROW_UNUSED(CreateFilesystem(container_name)); - } else { - ARROW_UNUSED(CreateContainer(container_name)); - } - ASSERT_RAISES(IOError, fs()->DeleteFile(nonexistent_file_name)); -} - -TEST_F(TestAzuriteFileSystem, DeleteFileFailureContainer) { - const auto container_name = PreexistingData::RandomContainerName(rng_); - if (WithHierarchicalNamespace()) { - ARROW_UNUSED(CreateFilesystem(container_name)); - } else { - ARROW_UNUSED(CreateContainer(container_name)); - } - arrow::fs::AssertFileInfo(fs(), container_name, FileType::Directory); - ASSERT_RAISES(IOError, fs()->DeleteFile(container_name)); -} - -TEST_F(TestAzuriteFileSystem, DeleteFileFailureDirectory) { - auto container_name = PreexistingData::RandomContainerName(rng_); - if (WithHierarchicalNamespace()) { - auto adlfs_client = CreateFilesystem(container_name); - CreateDirectory(adlfs_client, "directory"); - } else { - auto container = CreateContainer(container_name); - CreateBlob(container, "directory/"); - } - auto directory_path = ConcatAbstractPath(container_name, "directory"); - arrow::fs::AssertFileInfo(fs(), directory_path, FileType::Directory); - ASSERT_RAISES(IOError, fs()->DeleteFile(directory_path)); -} - TEST_F(TestAzuriteFileSystem, CopyFileSuccessDestinationNonexistent) { auto data = SetUpPreexistingData(); const auto destination_path = data.ContainerPath("copy-destionation"); From 29a0581f5bfcad86a6493854f8be8fcb6ffe2fbc Mon Sep 17 00:00:00 2001 From: Matthew McNew Date: Tue, 20 Feb 2024 19:59:57 -0600 Subject: [PATCH 34/46] GH-39870: [Go] Include buffered pages in 
TotalBytesWritten (#40105) ### Rationale for this change Currently, buffered data pages are not included in TotalBytesWritten this means that their is not an accurate estimate of the size of the current size. ### Are there any user-facing changes? `RowGroupTotalBytesWritten` will include the TotalBytes in buffered DataPages minus the buffered data pages headers. * Closes: #39870 Authored-by: Matthew McNew Signed-off-by: Matt Topol --- go/parquet/file/column_writer.go | 7 ++++++- go/parquet/file/column_writer_test.go | 14 ++++++++++---- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/go/parquet/file/column_writer.go b/go/parquet/file/column_writer.go index 4d603c547ca6a..91f5d18942958 100755 --- a/go/parquet/file/column_writer.go +++ b/go/parquet/file/column_writer.go @@ -198,7 +198,12 @@ func (w *columnWriter) TotalCompressedBytes() int64 { } func (w *columnWriter) TotalBytesWritten() int64 { - return w.totalBytesWritten + bufferedPagesBytes := int64(0) + for _, p := range w.pages { + bufferedPagesBytes += int64(len(p.Data())) + } + + return w.totalBytesWritten + bufferedPagesBytes } func (w *columnWriter) RowsWritten() int { diff --git a/go/parquet/file/column_writer_test.go b/go/parquet/file/column_writer_test.go index dd597e280b850..d78e1c6761be0 100755 --- a/go/parquet/file/column_writer_test.go +++ b/go/parquet/file/column_writer_test.go @@ -430,6 +430,11 @@ func (p *PrimitiveWriterTestSuite) testDictionaryFallbackEncoding(version parque } func (p *PrimitiveWriterTestSuite) testDictionaryFallbackAndCompressedSize(version parquet.Version) { + // skip boolean as dictionary encoding is not used + if p.Typ.Kind() == reflect.Bool { + return + } + p.GenerateData(SmallSize) props := parquet.DefaultColumnProperties() props.DictionaryEnabled = true @@ -440,13 +445,14 @@ func (p *PrimitiveWriterTestSuite) testDictionaryFallbackAndCompressedSize(versi props.Encoding = parquet.Encodings.RLEDict } - writer := p.buildWriter(SmallSize, props, 
parquet.WithVersion(version)) + writer := p.buildWriter(SmallSize, props, parquet.WithVersion(version), parquet.WithDataPageSize(SmallSize-1)) p.WriteBatchValues(writer, nil, nil) + p.NotZero(writer.TotalBytesWritten()) writer.FallbackToPlain() - p.NotEqual(0, writer.TotalCompressedBytes()) + p.NotZero(writer.TotalCompressedBytes()) writer.Close() - p.NotEqual(0, writer.TotalCompressedBytes()) - p.NotEqual(0, writer.TotalBytesWritten()) + p.NotZero(writer.TotalCompressedBytes()) + p.NotZero(writer.TotalBytesWritten()) } func (p *PrimitiveWriterTestSuite) TestRequiredPlain() { From 1ffed20f4008a1b3bd06deb904d94ff668cde42a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20G=C3=B3rny?= Date: Wed, 21 Feb 2024 09:58:42 +0100 Subject: [PATCH 35/46] GH-40153: [Python] Update size assumptions for 32-bit platforms (#40165) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change This fixes two tests on 32-bit platforms (tested on x86 specifically). ### What changes are included in this PR? - update the `pd.object_` size assumption to 4 bytes on 32-bit platforms - update the `pa.schema` size assumptions to be twice smaller on 32-bit platforms ### Are these changes tested? The changes fix tests. ### Are there any user-facing changes? Only test fixes. 
* Closes: #40153 Authored-by: Michał Górny Signed-off-by: Antoine Pitrou --- python/pyarrow/tests/test_pandas.py | 5 +++-- python/pyarrow/tests/test_schema.py | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index 676cc96151161..89a241a27efe0 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -2608,8 +2608,9 @@ def test_from_numpy_nested(self): ('yy', np.bool_)])), ('y', np.int16), ('z', np.object_)]) - # Note: itemsize is not a multiple of sizeof(object) - assert dt.itemsize == 12 + # Note: itemsize is not necessarily a multiple of sizeof(object) + # object_ is 8 bytes on 64-bit systems, 4 bytes on 32-bit systems + assert dt.itemsize == (12 if sys.maxsize > 2**32 else 8) ty = pa.struct([pa.field('x', pa.struct([pa.field('xx', pa.int8()), pa.field('yy', pa.bool_())])), pa.field('y', pa.int16()), diff --git a/python/pyarrow/tests/test_schema.py b/python/pyarrow/tests/test_schema.py index fa75fcea30db7..8793c9e773c1d 100644 --- a/python/pyarrow/tests/test_schema.py +++ b/python/pyarrow/tests/test_schema.py @@ -681,7 +681,8 @@ def test_schema_sizeof(): pa.field('bar', pa.string()), ]) - assert sys.getsizeof(schema) > 30 + # Note: pa.schema is twice as large on 64-bit systems + assert sys.getsizeof(schema) > (30 if sys.maxsize > 2**32 else 15) schema2 = schema.with_metadata({"key": "some metadata"}) assert sys.getsizeof(schema2) > sys.getsizeof(schema) From b51c318122ff5db1c6c4c70a69b0c804f9e31704 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Wed, 21 Feb 2024 11:24:06 +0100 Subject: [PATCH 36/46] MINOR: [Dev] Remove auto close link forced in PR description (#40178) For some reason, https://github.com/apache/arrow/pull/14783 changed the automatic GH issue link to a "Closes" reference that will forcefully close the linked issue *even if the committer chooses not to close the issue using the merge script*. 
Since the original change was done without discussion, this is a MINOR PR as well. Authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- .github/workflows/dev_pr/link.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/dev_pr/link.js b/.github/workflows/dev_pr/link.js index 174bd3bae650a..a70dbc604c377 100644 --- a/.github/workflows/dev_pr/link.js +++ b/.github/workflows/dev_pr/link.js @@ -82,7 +82,7 @@ async function commentJIRAURL(github, context, pullRequestNumber, jiraID) { async function commentGitHubURL(github, context, pullRequestNumber, issueID) { // Make the call to ensure issue exists before adding comment const issueInfo = await helpers.getGitHubInfo(github, context, issueID, pullRequestNumber); - const message = "* Closes: #" + issueInfo.number + const message = "* GitHub Issue: #" + issueInfo.number if (issueInfo) { const body = context.payload.pull_request.body || ""; if (body.includes(message)) { From 6a22a1dee78b0f7daa7e4d8793d663e29a5712a6 Mon Sep 17 00:00:00 2001 From: Divyansh200102 <146909065+Divyansh200102@users.noreply.github.com> Date: Wed, 21 Feb 2024 20:00:24 +0530 Subject: [PATCH 37/46] GH-39291: [Docs] Remove the "Show source" links from doc pages (#40167) ### Rationale for this change To fix the show source button links to 404 page problem ### What changes are included in this PR? The show source button link will be removed. ### Are these changes tested? Not yet ### Are there any user-facing changes? Yes * Closes: #39291 * GitHub Issue: #39291 Authored-by: Divyansh200102 Signed-off-by: Joris Van den Bossche --- docs/source/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 5af7b7955fdde..c6be6cb94cfb5 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -414,7 +414,7 @@ # If true, links to the reST sources are added to the pages. 
# -# html_show_sourcelink = True +html_show_sourcelink = False # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. # From c3444469570eb33f32a6f960ffa1d2e446c271f3 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Wed, 21 Feb 2024 15:56:38 +0100 Subject: [PATCH 38/46] GH-40153: [Python][C++] Fix large file handling on 32-bit Python build (#40176) ### Rationale for this change Python large file tests fail on 32-bit platforms. ### What changes are included in this PR? 1. Fix passing `int64_t` position to the Python file methods when a Python file object is wrapped in an Arrow `RandomAccessFile` 2. Disallow creating a `MemoryMappedFile` spanning more than the `size_t` maximum, instead of silently truncating its length ### Are these changes tested? Yes. ### Are there any user-facing changes? No. * GitHub Issue: #40153 Authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- cpp/src/arrow/io/file.cc | 22 ++++++++++++++-------- python/pyarrow/src/arrow/python/io.cc | 15 +++++++++------ python/pyarrow/tests/test_io.py | 26 +++++++++++++++++++------- 3 files changed, 42 insertions(+), 21 deletions(-) diff --git a/cpp/src/arrow/io/file.cc b/cpp/src/arrow/io/file.cc index 543fa90a86e9b..3b18bb7b0f0f4 100644 --- a/cpp/src/arrow/io/file.cc +++ b/cpp/src/arrow/io/file.cc @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -560,17 +561,22 @@ class MemoryMappedFile::MemoryMap RETURN_NOT_OK(::arrow::internal::FileTruncate(file_->fd(), initial_size)); } - size_t mmap_length = static_cast(initial_size); - if (length > initial_size) { - return Status::Invalid("mapping length is beyond file size"); - } - if (length >= 0 && length < initial_size) { + int64_t mmap_length = initial_size; + if (length >= 0) { // memory mapping a file region - mmap_length = static_cast(length); + if (length > initial_size) { + return Status::Invalid("mapping length is beyond file size"); + } + mmap_length = length; + } + if 
(static_cast(static_cast(mmap_length)) != mmap_length) { + return Status::CapacityError("Requested memory map length ", mmap_length, + " does not fit in a C size_t " + "(are you using a 32-bit build of Arrow?"); } - void* result = mmap(nullptr, mmap_length, prot_flags_, map_mode_, file_->fd(), - static_cast(offset)); + void* result = mmap(nullptr, static_cast(mmap_length), prot_flags_, map_mode_, + file_->fd(), static_cast(offset)); if (result == MAP_FAILED) { return Status::IOError("Memory mapping file failed: ", ::arrow::internal::ErrnoMessage(errno)); diff --git a/python/pyarrow/src/arrow/python/io.cc b/python/pyarrow/src/arrow/python/io.cc index 43f8297c5a7ec..197f8b9d39804 100644 --- a/python/pyarrow/src/arrow/python/io.cc +++ b/python/pyarrow/src/arrow/python/io.cc @@ -92,9 +92,12 @@ class PythonFile { Status Seek(int64_t position, int whence) { RETURN_NOT_OK(CheckClosed()); + // NOTE: `long long` is at least 64 bits in the C standard, the cast below is + // therefore safe. + // whence: 0 for relative to start of file, 2 for end of file - PyObject* result = cpp_PyObject_CallMethod(file_.obj(), "seek", "(ni)", - static_cast(position), whence); + PyObject* result = cpp_PyObject_CallMethod(file_.obj(), "seek", "(Li)", + static_cast(position), whence); Py_XDECREF(result); PY_RETURN_IF_ERROR(StatusCode::IOError); return Status::OK(); @@ -103,16 +106,16 @@ class PythonFile { Status Read(int64_t nbytes, PyObject** out) { RETURN_NOT_OK(CheckClosed()); - PyObject* result = cpp_PyObject_CallMethod(file_.obj(), "read", "(n)", - static_cast(nbytes)); + PyObject* result = cpp_PyObject_CallMethod(file_.obj(), "read", "(L)", + static_cast(nbytes)); PY_RETURN_IF_ERROR(StatusCode::IOError); *out = result; return Status::OK(); } Status ReadBuffer(int64_t nbytes, PyObject** out) { - PyObject* result = cpp_PyObject_CallMethod(file_.obj(), "read_buffer", "(n)", - static_cast(nbytes)); + PyObject* result = cpp_PyObject_CallMethod(file_.obj(), "read_buffer", "(L)", + 
static_cast(nbytes)); PY_RETURN_IF_ERROR(StatusCode::IOError); *out = result; return Status::OK(); diff --git a/python/pyarrow/tests/test_io.py b/python/pyarrow/tests/test_io.py index 5a495aa80abdf..17eab871a2575 100644 --- a/python/pyarrow/tests/test_io.py +++ b/python/pyarrow/tests/test_io.py @@ -36,7 +36,7 @@ import pyarrow as pa -def check_large_seeks(file_factory): +def check_large_seeks(file_factory, expected_error=None): if sys.platform in ('win32', 'darwin'): pytest.skip("need sparse file support") try: @@ -45,11 +45,16 @@ def check_large_seeks(file_factory): f.truncate(2 ** 32 + 10) f.seek(2 ** 32 + 5) f.write(b'mark\n') - with file_factory(filename) as f: - assert f.seek(2 ** 32 + 5) == 2 ** 32 + 5 - assert f.tell() == 2 ** 32 + 5 - assert f.read(5) == b'mark\n' - assert f.tell() == 2 ** 32 + 10 + if expected_error: + with expected_error: + file_factory(filename) + else: + with file_factory(filename) as f: + assert f.size() == 2 ** 32 + 10 + assert f.seek(2 ** 32 + 5) == 2 ** 32 + 5 + assert f.tell() == 2 ** 32 + 5 + assert f.read(5) == b'mark\n' + assert f.tell() == 2 ** 32 + 10 finally: os.unlink(filename) @@ -1137,7 +1142,14 @@ def test_memory_zero_length(tmpdir): def test_memory_map_large_seeks(): - check_large_seeks(pa.memory_map) + if sys.maxsize >= 2**32: + expected_error = None + else: + expected_error = pytest.raises( + pa.ArrowCapacityError, + match="Requested memory map length 4294967306 " + "does not fit in a C size_t") + check_large_seeks(pa.memory_map, expected_error=expected_error) def test_memory_map_close_remove(tmpdir): From 8a62f30d34a606c8edca6cfaad56846e0e7aceea Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Wed, 21 Feb 2024 16:29:18 +0000 Subject: [PATCH 39/46] GH-40037: [C++][FS][Azure] Make attempted reads and writes against directories fail fast (#40119) ### Rationale for this change Prevent confusion if a user attempts to read or write a directory. ### What changes are included in this PR? 
- Make `ObjectAppendStream::Flush` a noop if `ObjectAppendStream::Init` has not run successfully. This avoids an unhandled error when the destructor calls flush. - Check blob properties for directory marker metadata when initialising `ObjectInputFile` or `ObjectAppendStream`. - When initialising `ObjectAppendStream` call `GetFileInfo` if it is a flat namespace account. ### Are these changes tested? Add new tests `DisallowReadingOrWritingDirectoryMarkers` and `DisallowCreatingFileAndDirectoryWithTheSameName` to cover the new fail fast behaviour. Also updated `WriteMetadata` to ensure that my changes to Flush didn't break setting metadata without calling `Write` on the stream. ### Are there any user-facing changes? Yes. Invalid read and write operations will now fail fast and gracefully. Previously could get into a confusing state where there were files and directories at the same path and there were some un-graceful failures. * Closes: #40037 Authored-by: Thomas Newton Signed-off-by: Felipe Oliveira Carvalho --- cpp/src/arrow/filesystem/azurefs.cc | 95 +++++++++++++++++++----- cpp/src/arrow/filesystem/azurefs_test.cc | 60 +++++++++++++++ 2 files changed, 135 insertions(+), 20 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index de7cdba245ada..8ae33b8818827 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -347,6 +347,22 @@ bool IsContainerNotFound(const Storage::StorageException& e) { return false; } +const auto kHierarchicalNamespaceIsDirectoryMetadataKey = "hdi_isFolder"; +const auto kFlatNamespaceIsDirectoryMetadataKey = "is_directory"; + +bool MetadataIndicatesIsDirectory(const Storage::Metadata& metadata) { + // Inspired by + // https://github.com/Azure/azure-sdk-for-cpp/blob/12407e8bfcb9bc1aa43b253c1d0ec93bf795ae3b/sdk/storage/azure-storage-files-datalake/src/datalake_utilities.cpp#L86-L91 + auto hierarchical_directory_metadata = + 
metadata.find(kHierarchicalNamespaceIsDirectoryMetadataKey); + if (hierarchical_directory_metadata != metadata.end()) { + return hierarchical_directory_metadata->second == "true"; + } + auto flat_directory_metadata = metadata.find(kFlatNamespaceIsDirectoryMetadataKey); + return flat_directory_metadata != metadata.end() && + flat_directory_metadata->second == "true"; +} + template std::string FormatValue(typename TypeTraits::CType value) { struct StringAppender { @@ -512,11 +528,18 @@ class ObjectInputFile final : public io::RandomAccessFile { Status Init() { if (content_length_ != kNoSize) { + // When the user provides the file size we don't validate that its a file. This is + // only a read so its not a big deal if the user makes a mistake. DCHECK_GE(content_length_, 0); return Status::OK(); } try { + // To open an ObjectInputFile the Blob must exist and it must not represent + // a directory. Additionally we need to know the file size. auto properties = blob_client_->GetProperties(); + if (MetadataIndicatesIsDirectory(properties.Value.Metadata)) { + return NotAFile(location_); + } content_length_ = properties.Value.BlobSize; metadata_ = PropertiesToMetadata(properties.Value); return Status::OK(); @@ -698,11 +721,10 @@ class ObjectAppendStream final : public io::OutputStream { ObjectAppendStream(std::shared_ptr block_blob_client, const io::IOContext& io_context, const AzureLocation& location, const std::shared_ptr& metadata, - const AzureOptions& options, int64_t size = kNoSize) + const AzureOptions& options) : block_blob_client_(std::move(block_blob_client)), io_context_(io_context), - location_(location), - content_length_(size) { + location_(location) { if (metadata && metadata->size() != 0) { metadata_ = ArrowMetadataToAzureMetadata(metadata); } else if (options.default_metadata && options.default_metadata->size() != 0) { @@ -716,17 +738,31 @@ class ObjectAppendStream final : public io::OutputStream { io::internal::CloseFromDestructor(this); } - Status Init() 
{ - if (content_length_ != kNoSize) { - DCHECK_GE(content_length_, 0); - pos_ = content_length_; + Status Init(const bool truncate, + std::function ensure_not_flat_namespace_directory) { + if (truncate) { + content_length_ = 0; + pos_ = 0; + // We need to create an empty file overwriting any existing file, but + // fail if there is an existing directory. + RETURN_NOT_OK(ensure_not_flat_namespace_directory()); + // On hierarchical namespace CreateEmptyBlockBlob will fail if there is an existing + // directory so we don't need to check like we do on flat namespace. + RETURN_NOT_OK(CreateEmptyBlockBlob(*block_blob_client_)); } else { try { auto properties = block_blob_client_->GetProperties(); + if (MetadataIndicatesIsDirectory(properties.Value.Metadata)) { + return NotAFile(location_); + } content_length_ = properties.Value.BlobSize; pos_ = content_length_; } catch (const Storage::StorageException& exception) { if (exception.StatusCode == Http::HttpStatusCode::NotFound) { + // No file exists but on flat namespace its possible there is a directory + // marker or an implied directory. Ensure there is no directory before starting + // a new empty file. + RETURN_NOT_OK(ensure_not_flat_namespace_directory()); RETURN_NOT_OK(CreateEmptyBlockBlob(*block_blob_client_)); } else { return ExceptionToStatus( @@ -743,6 +779,7 @@ class ObjectAppendStream final : public io::OutputStream { block_ids_.push_back(block.Name); } } + initialised_ = true; return Status::OK(); } @@ -789,6 +826,11 @@ class ObjectAppendStream final : public io::OutputStream { Status Flush() override { RETURN_NOT_OK(CheckClosed("flush")); + if (!initialised_) { + // If the stream has not been successfully initialized then there is nothing to + // flush. This also avoids some unhandled errors when flushing in the destructor. 
+ return Status::OK(); + } return CommitBlockList(block_blob_client_, block_ids_, metadata_); } @@ -840,10 +882,11 @@ class ObjectAppendStream final : public io::OutputStream { std::shared_ptr block_blob_client_; const io::IOContext io_context_; const AzureLocation location_; + int64_t content_length_ = kNoSize; bool closed_ = false; + bool initialised_ = false; int64_t pos_ = 0; - int64_t content_length_ = kNoSize; std::vector block_ids_; Storage::Metadata metadata_; }; @@ -1666,20 +1709,32 @@ class AzureFileSystem::Impl { AzureFileSystem* fs) { RETURN_NOT_OK(ValidateFileLocation(location)); + const auto blob_container_client = GetBlobContainerClient(location.container); auto block_blob_client = std::make_shared( - blob_service_client_->GetBlobContainerClient(location.container) - .GetBlockBlobClient(location.path)); + blob_container_client.GetBlockBlobClient(location.path)); + + auto ensure_not_flat_namespace_directory = [this, location, + blob_container_client]() -> Status { + ARROW_ASSIGN_OR_RAISE( + auto hns_support, + HierarchicalNamespaceSupport(GetFileSystemClient(location.container))); + if (hns_support == HNSSupport::kDisabled) { + // Flat namespace so we need to GetFileInfo in-case its a directory. + ARROW_ASSIGN_OR_RAISE(auto status, GetFileInfo(blob_container_client, location)) + if (status.type() == FileType::Directory) { + return NotAFile(location); + } + } + // kContainerNotFound - it doesn't exist, so no need to check if its a directory. + // kEnabled - hierarchical namespace so Azure APIs will fail if its a directory. We + // don't need to explicitly check. 
+ return Status::OK(); + }; std::shared_ptr stream; - if (truncate) { - RETURN_NOT_OK(CreateEmptyBlockBlob(*block_blob_client)); - stream = std::make_shared(block_blob_client, fs->io_context(), - location, metadata, options_, 0); - } else { - stream = std::make_shared(block_blob_client, fs->io_context(), - location, metadata, options_); - } - RETURN_NOT_OK(stream->Init()); + stream = std::make_shared(block_blob_client, fs->io_context(), + location, metadata, options_); + RETURN_NOT_OK(stream->Init(truncate, ensure_not_flat_namespace_directory)); return stream; } @@ -1694,7 +1749,7 @@ class AzureFileSystem::Impl { // on directory marker blobs. // https://github.com/fsspec/adlfs/blob/32132c4094350fca2680155a5c236f2e9f991ba5/adlfs/spec.py#L855-L870 Blobs::UploadBlockBlobFromOptions blob_options; - blob_options.Metadata.emplace("is_directory", "true"); + blob_options.Metadata.emplace(kFlatNamespaceIsDirectoryMetadataKey, "true"); block_blob_client.UploadFrom(nullptr, 0, blob_options); } diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 7f5cd247a8d35..f21876f03cc95 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -838,6 +838,41 @@ class TestAzureFileSystem : public ::testing::Test { AssertFileInfo(fs(), subdir3, FileType::Directory); } + void TestDisallowReadingOrWritingDirectoryMarkers() { + auto data = SetUpPreexistingData(); + auto directory_path = data.Path("directory"); + + ASSERT_OK(fs()->CreateDir(directory_path)); + ASSERT_RAISES(IOError, fs()->OpenInputFile(directory_path)); + ASSERT_RAISES(IOError, fs()->OpenOutputStream(directory_path)); + ASSERT_RAISES(IOError, fs()->OpenAppendStream(directory_path)); + + auto directory_path_with_slash = directory_path + "/"; + ASSERT_RAISES(IOError, fs()->OpenInputFile(directory_path_with_slash)); + ASSERT_RAISES(IOError, fs()->OpenOutputStream(directory_path_with_slash)); + ASSERT_RAISES(IOError, 
fs()->OpenAppendStream(directory_path_with_slash)); + } + + void TestDisallowCreatingFileAndDirectoryWithTheSameName() { + auto data = SetUpPreexistingData(); + auto path1 = data.Path("directory1"); + ASSERT_OK(fs()->CreateDir(path1)); + ASSERT_RAISES(IOError, fs()->OpenOutputStream(path1)); + ASSERT_RAISES(IOError, fs()->OpenAppendStream(path1)); + AssertFileInfo(fs(), path1, FileType::Directory); + + auto path2 = data.Path("directory2"); + ASSERT_OK(fs()->OpenOutputStream(path2)); + // CreateDir returns OK even if there is already a file or directory at this + // location. Whether or not this is the desired behaviour is debatable. + ASSERT_OK(fs()->CreateDir(path2)); + AssertFileInfo(fs(), path2, FileType::File); + } + + void TestOpenOutputStreamWithMissingContainer() { + ASSERT_RAISES(IOError, fs()->OpenOutputStream("not-a-container/file", {})); + } + void TestDeleteDirSuccessEmpty() { if (HasSubmitBatchBug()) { GTEST_SKIP() << kSubmitBatchBugMessage; @@ -1665,6 +1700,19 @@ TYPED_TEST(TestAzureFileSystemOnAllScenarios, CreateDirOnMissingContainer) { this->TestCreateDirOnMissingContainer(); } +TYPED_TEST(TestAzureFileSystemOnAllScenarios, DisallowReadingOrWritingDirectoryMarkers) { + this->TestDisallowReadingOrWritingDirectoryMarkers(); +} + +TYPED_TEST(TestAzureFileSystemOnAllScenarios, + DisallowCreatingFileAndDirectoryWithTheSameName) { + this->TestDisallowCreatingFileAndDirectoryWithTheSameName(); +} + +TYPED_TEST(TestAzureFileSystemOnAllScenarios, OpenOutputStreamWithMissingContainer) { + this->TestOpenOutputStreamWithMissingContainer(); +} + TYPED_TEST(TestAzureFileSystemOnAllScenarios, DeleteDirSuccessEmpty) { this->TestDeleteDirSuccessEmpty(); } @@ -2232,6 +2280,18 @@ TEST_F(TestAzuriteFileSystem, WriteMetadata) { .Value.Metadata; // Defaults are overwritten and not merged. EXPECT_EQ(Core::CaseInsensitiveMap{std::make_pair("bar", "foo")}, blob_metadata); + + // Metadata can be written without writing any data. 
+ ASSERT_OK_AND_ASSIGN( + output, fs_with_defaults->OpenAppendStream( + full_path, /*metadata=*/arrow::key_value_metadata({{"bar", "baz"}}))); + ASSERT_OK(output->Close()); + blob_metadata = blob_service_client_->GetBlobContainerClient(data.container_name) + .GetBlockBlobClient(blob_path) + .GetProperties() + .Value.Metadata; + // Defaults are overwritten and not merged. + EXPECT_EQ(Core::CaseInsensitiveMap{std::make_pair("bar", "baz")}, blob_metadata); } TEST_F(TestAzuriteFileSystem, OpenOutputStreamSmall) { From 884a10931038b689fca6a85178f702e1045f4e61 Mon Sep 17 00:00:00 2001 From: mwish Date: Thu, 22 Feb 2024 00:54:04 +0800 Subject: [PATCH 40/46] GH-40174: [C++][CI][Parquet] Fixing parquet column_writer_test building (#40175) ### Rationale for this change Remove `ThrowsMessage` for CI build. ### What changes are included in this PR? Remove `ThrowsMessage` for CI build. ### Are these changes tested? no need ### Are there any user-facing changes? no * Closes: #40174 Authored-by: mwish Signed-off-by: Antoine Pitrou --- cpp/src/parquet/column_writer_test.cc | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/cpp/src/parquet/column_writer_test.cc b/cpp/src/parquet/column_writer_test.cc index a40e71ce30aec..86fe0965a6a7f 100644 --- a/cpp/src/parquet/column_writer_test.cc +++ b/cpp/src/parquet/column_writer_test.cc @@ -483,7 +483,6 @@ using TestByteArrayValuesWriter = TestPrimitiveWriter; using TestFixedLengthByteArrayValuesWriter = TestPrimitiveWriter; using ::testing::HasSubstr; -using ::testing::ThrowsMessage; TYPED_TEST(TestPrimitiveWriter, RequiredPlain) { this->TestRequiredWithEncoding(Encoding::PLAIN); @@ -918,20 +917,27 @@ TEST(TestPageWriter, ThrowsOnPagesTooLarge) { DataPageV1 over_compressed_limit(buffer, /*num_values=*/100, Encoding::BIT_PACKED, Encoding::BIT_PACKED, Encoding::BIT_PACKED, /*uncompressed_size=*/100); - EXPECT_THAT([&]() { pager->WriteDataPage(over_compressed_limit); }, - ThrowsMessage(HasSubstr("overflows 
INT32_MAX"))); + EXPECT_THROW_THAT([&]() { pager->WriteDataPage(over_compressed_limit); }, + ParquetException, + ::testing::Property(&ParquetException::what, + ::testing::HasSubstr("overflows INT32_MAX"))); DictionaryPage dictionary_over_compressed_limit(buffer, /*num_values=*/100, Encoding::PLAIN); - EXPECT_THAT([&]() { pager->WriteDictionaryPage(dictionary_over_compressed_limit); }, - ThrowsMessage(HasSubstr("overflows INT32_MAX"))); + EXPECT_THROW_THAT( + [&]() { pager->WriteDictionaryPage(dictionary_over_compressed_limit); }, + ParquetException, + ::testing::Property(&ParquetException::what, + ::testing::HasSubstr("overflows INT32_MAX"))); buffer = std::make_shared(&data, 1); DataPageV1 over_uncompressed_limit( buffer, /*num_values=*/100, Encoding::BIT_PACKED, Encoding::BIT_PACKED, Encoding::BIT_PACKED, /*uncompressed_size=*/std::numeric_limits::max() + int64_t{1}); - EXPECT_THAT([&]() { pager->WriteDataPage(over_compressed_limit); }, - ThrowsMessage(HasSubstr("overflows INT32_MAX"))); + EXPECT_THROW_THAT([&]() { pager->WriteDataPage(over_compressed_limit); }, + ParquetException, + ::testing::Property(&ParquetException::what, + ::testing::HasSubstr("overflows INT32_MAX"))); } TEST(TestColumnWriter, RepeatedListsUpdateSpacedBug) { From e198f309c577de9a265c04af2bc4644c33f54375 Mon Sep 17 00:00:00 2001 From: david dali susanibar arce Date: Wed, 21 Feb 2024 14:46:30 -0500 Subject: [PATCH 41/46] MINOR: [JAVA] Add unit test for float16 into cdata interface module (#40185) Add unit test for float16 into CData Interface module. 
Authored-by: david dali susanibar arce Signed-off-by: David Li --- .../java/org/apache/arrow/c/RoundtripTest.java | 9 +++++++++ .../vector/testing/ValueVectorDataPopulator.java | 15 +++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java b/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java index fe070400ad94f..a7e3cde2e7b4b 100644 --- a/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java +++ b/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java @@ -48,6 +48,7 @@ import org.apache.arrow.vector.ExtensionTypeVector; import org.apache.arrow.vector.FieldVector; import org.apache.arrow.vector.FixedSizeBinaryVector; +import org.apache.arrow.vector.Float2Vector; import org.apache.arrow.vector.Float4Vector; import org.apache.arrow.vector.Float8Vector; import org.apache.arrow.vector.IntVector; @@ -297,6 +298,14 @@ public void testFixedSizeBinaryVector() { } } + @Test + public void testFloat2Vector() { + try (final Float2Vector vector = new Float2Vector("v", allocator)) { + setVector(vector, 0.1f, 0.2f, 0.3f, null); + assertTrue(roundtrip(vector, Float2Vector.class)); + } + } + @Test public void testFloat4Vector() { try (final Float4Vector vector = new Float4Vector("v", allocator)) { diff --git a/java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java b/java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java index 9e96e75880522..9bfcb3c635d86 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java @@ -34,6 +34,7 @@ import org.apache.arrow.vector.DecimalVector; import org.apache.arrow.vector.DurationVector; import org.apache.arrow.vector.FixedSizeBinaryVector; +import org.apache.arrow.vector.Float2Vector; import org.apache.arrow.vector.Float4Vector; import 
org.apache.arrow.vector.Float8Vector; import org.apache.arrow.vector.IntVector; @@ -207,6 +208,20 @@ public static void setVector(FixedSizeBinaryVector vector, byte[]... values) { vector.setValueCount(length); } + /** + * Populate values for Float2Vector. + */ + public static void setVector(Float2Vector vector, Float... values) { + final int length = values.length; + vector.allocateNew(length); + for (int i = 0; i < length; i++) { + if (values[i] != null) { + vector.setWithPossibleTruncate(i, values[i]); + } + } + vector.setValueCount(length); + } + /** * Populate values for Float4Vector. */ From aa4512dc21a932064ac11969b5d274762a30c094 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 22 Feb 2024 11:34:27 +0900 Subject: [PATCH 42/46] MINOR: [Java] Bump org.apache.commons:commons-compress from 1.25.0 to 1.26.0 in /java/compression (#40169) Bumps org.apache.commons:commons-compress from 1.25.0 to 1.26.0. [![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=org.apache.commons:commons-compress&package-manager=maven&previous-version=1.25.0&new-version=1.26.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---

Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@ dependabot rebase` will rebase this PR - `@ dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@ dependabot merge` will merge this PR after your CI passes on it - `@ dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@ dependabot cancel merge` will cancel a previously requested merge and block automerging - `@ dependabot reopen` will reopen this PR if it is closed - `@ dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@ dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@ dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself) You can disable automated security fix PRs for this repo from the [Security Alerts page](https://github.com/apache/arrow/network/alerts).
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- java/compression/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/compression/pom.xml b/java/compression/pom.xml index dea8c778735a8..d38ad405b94e8 100644 --- a/java/compression/pom.xml +++ b/java/compression/pom.xml @@ -42,7 +42,7 @@ org.apache.commons commons-compress - 1.25.0 + 1.26.0 com.github.luben From 8e53451cc48081df20fdf52b82edcc52ea778ec5 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 22 Feb 2024 10:19:17 +0100 Subject: [PATCH 43/46] GH-40092: [Python] Support Binary/StringView conversion to numpy/pandas (#40093) Last step for Binary/StringView support in Python (https://github.com/apache/arrow/issues/39633), now adding it to the arrow->pandas/numpy conversion code path. * Closes: #40092 Authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- .../src/arrow/python/arrow_to_pandas.cc | 22 ++++++++++++++++--- python/pyarrow/tests/test_pandas.py | 14 ++++++++++++ 2 files changed, 33 insertions(+), 3 deletions(-) diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc index e979342b886da..2115cd8015cac 100644 --- a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc +++ b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc @@ -133,6 +133,13 @@ struct WrapBytes { } }; +template <> +struct WrapBytes { + static inline PyObject* Wrap(const char* data, int64_t length) { + return PyUnicode_FromStringAndSize(data, length); + } +}; + template <> struct WrapBytes { static inline PyObject* Wrap(const char* data, int64_t length) { @@ -147,6 +154,13 @@ struct WrapBytes { } }; +template <> +struct WrapBytes { + static inline PyObject* Wrap(const char* data, int64_t length) { + return PyBytes_FromStringAndSize(data, length); + } +}; + template <> struct WrapBytes { static inline PyObject* Wrap(const char* data, int64_t length) { 
@@ -1154,7 +1168,8 @@ struct ObjectWriterVisitor { } template - enable_if_t::value || is_fixed_size_binary_type::value, + enable_if_t::value || is_binary_view_like_type::value || + is_fixed_size_binary_type::value, Status> Visit(const Type& type) { auto WrapValue = [](const std::string_view& view, PyObject** out) { @@ -1355,8 +1370,7 @@ struct ObjectWriterVisitor { std::is_same::value || (std::is_base_of::value && !std::is_same::value) || - std::is_base_of::value || - std::is_base_of::value, + std::is_base_of::value, Status> Visit(const Type& type) { return Status::NotImplemented("No implemented conversion to object dtype: ", @@ -2086,8 +2100,10 @@ static Status GetPandasWriterType(const ChunkedArray& data, const PandasOptions& break; case Type::STRING: // fall through case Type::LARGE_STRING: // fall through + case Type::STRING_VIEW: // fall through case Type::BINARY: // fall through case Type::LARGE_BINARY: + case Type::BINARY_VIEW: case Type::NA: // fall through case Type::FIXED_SIZE_BINARY: // fall through case Type::STRUCT: // fall through diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index 89a241a27efe0..fdfd123a8c34f 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -1760,6 +1760,20 @@ def test_large_string(self): _check_pandas_roundtrip( df, schema=pa.schema([('a', pa.large_string())])) + def test_binary_view(self): + s = pd.Series([b'123', b'', b'a', None]) + _check_series_roundtrip(s, type_=pa.binary_view()) + df = pd.DataFrame({'a': s}) + _check_pandas_roundtrip( + df, schema=pa.schema([('a', pa.binary_view())])) + + def test_string_view(self): + s = pd.Series(['123', '', 'a', None]) + _check_series_roundtrip(s, type_=pa.string_view()) + df = pd.DataFrame({'a': s}) + _check_pandas_roundtrip( + df, schema=pa.schema([('a', pa.string_view())])) + def test_table_empty_str(self): values = ['', '', '', '', ''] df = pd.DataFrame({'strings': values}) From 
280bc112b23976d2f17c07c638bb62702ac89e8a Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Thu, 22 Feb 2024 16:09:42 +0100 Subject: [PATCH 44/46] MINOR: [C++] Add missing parenthesis in error message (#40201) Followup to https://github.com/apache/arrow/pull/40176 Authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- cpp/src/arrow/io/file.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/io/file.cc b/cpp/src/arrow/io/file.cc index 3b18bb7b0f0f4..00426f9957b1f 100644 --- a/cpp/src/arrow/io/file.cc +++ b/cpp/src/arrow/io/file.cc @@ -572,7 +572,7 @@ class MemoryMappedFile::MemoryMap if (static_cast(static_cast(mmap_length)) != mmap_length) { return Status::CapacityError("Requested memory map length ", mmap_length, " does not fit in a C size_t " - "(are you using a 32-bit build of Arrow?"); + "(are you using a 32-bit build of Arrow?)"); } void* result = mmap(nullptr, static_cast(mmap_length), prot_flags_, map_mode_, From f9995ac4c104c9df1577a8ac85bddff3c5eacd35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= Date: Thu, 22 Feb 2024 22:52:15 +0100 Subject: [PATCH 45/46] GH-31735: [Docs][Release] Move release verification guide to developers documentation (#39960) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change The current verification guide is obsolete and should not be on the wiki. ### What changes are included in this PR? Adding a new page with the release verification information. ### Are these changes tested? The documentation will be generated and will use the preview-docs job. ### Are there any user-facing changes? Yes but not relevant in terms of code changes. 
* Closes: #31735 Lead-authored-by: Raúl Cumplido Co-authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- dev/README.md | 19 --- dev/release/02-source-test.rb | 2 +- dev/release/02-source.sh | 2 +- dev/release/README.md | 6 +- dev/release/VERIFY.md | 113 -------------- docs/source/developers/index.rst | 8 + docs/source/developers/release.rst | 2 + .../developers/release_verification.rst | 144 ++++++++++++++++++ 8 files changed, 158 insertions(+), 138 deletions(-) delete mode 100644 dev/release/VERIFY.md create mode 100644 docs/source/developers/release_verification.rst diff --git a/dev/README.md b/dev/README.md index 24600a67db8c3..db9a10d527334 100644 --- a/dev/README.md +++ b/dev/README.md @@ -108,25 +108,6 @@ Status closed URL https://github.com/apache/arrow/issues/Y ``` -## Verifying Release Candidates - -We have provided a script to assist with verifying release candidates on Linux -and macOS: - -```shell -bash dev/release/verify-release-candidate.sh 0.7.0 0 -``` - -Read the script and check the notes in dev/release for information about system -dependencies. 
- -On Windows, we have a script that verifies C++ and Python (requires Visual -Studio 2015): - -``` -dev/release/verify-release-candidate.bat apache-arrow-0.7.0.tar.gz -``` - # Integration testing Build the following base image used by multiple tests: diff --git a/dev/release/02-source-test.rb b/dev/release/02-source-test.rb index b9e6a8505b72b..149a2b27ac94a 100644 --- a/dev/release/02-source-test.rb +++ b/dev/release/02-source-test.rb @@ -166,7 +166,7 @@ def test_vote [10]: https://apache.jfrog.io/artifactory/arrow/python-rc/#{@release_version}-rc0 [11]: https://apache.jfrog.io/artifactory/arrow/ubuntu-rc/ [12]: https://github.com/apache/arrow/blob/#{@current_commit}/CHANGELOG.md -[13]: https://cwiki.apache.org/confluence/display/ARROW/How+to+Verify+Release+Candidates +[13]: https://arrow.apache.org/docs/developers/release_verification.html [14]: #{verify_pr_url || "null"} VOTE end diff --git a/dev/release/02-source.sh b/dev/release/02-source.sh index a3441b23bf539..1bd3c0e19e04e 100755 --- a/dev/release/02-source.sh +++ b/dev/release/02-source.sh @@ -202,7 +202,7 @@ The vote will be open for at least 72 hours. 
[10]: https://apache.jfrog.io/artifactory/arrow/python-rc/${version}-rc${rc} [11]: https://apache.jfrog.io/artifactory/arrow/ubuntu-rc/ [12]: https://github.com/apache/arrow/blob/${release_hash}/CHANGELOG.md -[13]: https://cwiki.apache.org/confluence/display/ARROW/How+to+Verify+Release+Candidates +[13]: https://arrow.apache.org/docs/developers/release_verification.html [14]: ${verify_pr_url} MAIL echo "---------------------------------------------------------" diff --git a/dev/release/README.md b/dev/release/README.md index e1ecdd4332292..ce1eb82d7eba3 100644 --- a/dev/release/README.md +++ b/dev/release/README.md @@ -19,8 +19,6 @@ ## Release management scripts -To learn more, see the project wiki: +To learn more, see the project documentation: -https://cwiki.apache.org/confluence/display/ARROW/Release+Management+Guide - -and [VERIFY.md](VERIFY.md) +https://arrow.apache.org/docs/developers/release.html diff --git a/dev/release/VERIFY.md b/dev/release/VERIFY.md deleted file mode 100644 index 433e6fcb832fc..0000000000000 --- a/dev/release/VERIFY.md +++ /dev/null @@ -1,113 +0,0 @@ - - -# Verifying Arrow releases - -## Windows - -We've provided a convenience script for verifying the C++ and Python builds on -Windows. Read the comments in `verify-release-candidate.bat` for instructions. - -## Linux and macOS - -We've provided a convenience script for verifying the C++, C#, C GLib, Go, -Java, JavaScript, Ruby and Python builds on Linux and macOS. Read the script -`verify-release-candidate.sh` for further information. 
- -### C GLib - -You need the followings to verify C GLib build: - - * GLib - * GObject Introspection - * Ruby (not EOL-ed version is required) - * gobject-introspection gem - * test-unit gem - -You can install them by the followings on Debian GNU/Linux and Ubuntu: - -```console -% sudo apt install -y -V libgirepository1.0-dev ruby-dev -% sudo gem install gobject-introspection test-unit -``` - -You can install them by the followings on CentOS 7: - -```console -% sudo yum install -y gobject-introspection-devel -% git clone https://github.com/sstephenson/rbenv.git ~/.rbenv -% git clone https://github.com/sstephenson/ruby-build.git ~/.rbenv/plugins/ruby-build -% echo 'export PATH="$HOME/.rbenv/bin:$PATH"' >> ~/.bash_profile -% echo 'eval "$(rbenv init -)"' >> ~/.bash_profile -% exec ${SHELL} --login -% sudo yum install -y gcc make patch openssl-devel readline-devel zlib-devel -% rbenv install 2.4.2 -% rbenv global 2.4.2 -% gem install gobject-introspection test-unit -``` - -You can install them by the followings on macOS: - -```console -% brew install -y gobject-introspection -% gem install gobject-introspection test-unit -``` - -You need to set `PKG_CONFIG_PATH` to find libffi on macOS: - -```console -% export PKG_CONFIG_PATH=$(brew --prefix libffi)/lib/pkgconfig:$PKG_CONFIG_PATH -``` - -### C++, C#, C GLib, Go, Java, JavaScript, Python, Ruby - -Example scripts to install the dependencies to run the verification -script for verifying the source on Ubuntu 20.04, Rocky Linux 8 and -AlmaLinux 8 are in this folder and named `setup-ubuntu.sh` and -`setup-rhel-rebuilds.sh`. These can be adapted to different -situations. Go and JavaScript are installed by the verification -script in the testing environment. Verifying the apt and yum binaries -additionally requires installation of Docker. - -When verifying the source, by default the verification script will try -to verify all implementations and bindings. 
Should one of the -verification tests fail, the script will exit before running the other -tests. It can be helpful to repeat the failed test to see if it will -complete, since failures can occur for problems such as slow or failed -download of a dependency from the internet. It is possible to run -specific verification tests by setting environment variables, for example - -```console -% TEST_DEFAULT=0 TEST_SOURCE=1 dev/release/verify-release-candidate.sh 6.0.0 3 -% TEST_DEFAULT=0 TEST_BINARIES=1 dev/release/verify-release-candidate.sh 6.0.0 3 -% TEST_DEFAULT=0 TEST_GO=1 dev/release/verify-release-candidate.sh 6.0.0 3 -% TEST_DEFAULT=0 TEST_YUM=1 dev/release/verify-release-candidate.sh 6.0.0 3 -``` - -It is also possible to use -[Archery](https://arrow.apache.org/docs/developers/archery.html) to run -the verification process in a container, for example - -```console -% archery docker run -e VERIFY_VERSION=6.0.1 -e VERIFY_RC=1 almalinux-verify-rc-source -% archery docker run -e VERIFY_VERSION=6.0.1 -e VERIFY_RC=1 ubuntu-verify-rc-source -``` - -To improve software quality, you are encouraged to verify -on a variety of platforms. diff --git a/docs/source/developers/index.rst b/docs/source/developers/index.rst index 83dc556e1605a..fa63f66516e37 100644 --- a/docs/source/developers/index.rst +++ b/docs/source/developers/index.rst @@ -211,6 +211,13 @@ All participation in the Apache Arrow project is governed by the ASF's To learn about the detailed information on the steps followed to perform a release, see :ref:`release`. +.. dropdown:: Release Verification Process + :animate: fade-in-slide-down + :class-title: sd-fs-5 + :class-container: sd-shadow-none + + To learn how to verify a release, see :ref:`release_verification`. + .. 
toctree:: :maxdepth: 2 :hidden: @@ -226,3 +233,4 @@ All participation in the Apache Arrow project is governed by the ASF's benchmarks documentation release + release_verification diff --git a/docs/source/developers/release.rst b/docs/source/developers/release.rst index 0ff8e3a824ffc..1ecf747e36379 100644 --- a/docs/source/developers/release.rst +++ b/docs/source/developers/release.rst @@ -200,6 +200,8 @@ Verify the Release # on dev@arrow.apache.org. To regenerate the email template use SOURCE_DEFAULT=0 SOURCE_VOTE=1 dev/release/02-source.sh +See :ref:`release_verification` for details. + Voting and approval =================== diff --git a/docs/source/developers/release_verification.rst b/docs/source/developers/release_verification.rst new file mode 100644 index 0000000000000..53c8f54e5b5bd --- /dev/null +++ b/docs/source/developers/release_verification.rst @@ -0,0 +1,144 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. _release_verification: + +============================ +Release Verification Process +============================ + +This page provides detailed information on the steps followed to perform +a release verification on the major platforms. 
+ +Principles +========== + +The Apache Arrow Release Approval process follows the guidelines defined at the +`Apache Software Foundation Release Approval <https://www.apache.org/legal/release-policy.html#release-approval>`_. + +For a release vote to pass, a minimum of three positive binding votes and more +positive binding votes than negative binding votes MUST be cast. +Releases may not be vetoed. Votes cast by PMC members are binding, however, +non-binding votes are greatly encouraged and a sign of a healthy project. + +Running the release verification +================================ + +Linux and macOS +--------------- + +In order to run the verification script either for the source release or the +binary artifacts see the following guidelines: + +.. code-block:: + + # this will create and automatically clean up a temporary directory for the verification environment and will run the source verification + TEST_DEFAULT=0 TEST_SOURCE=1 dev/release/verify-release-candidate.sh $VERSION $RC_NUM + + # this will create and automatically clean up a temporary directory for the verification environment and will run the binary verification + TEST_DEFAULT=0 TEST_BINARIES=1 dev/release/verify-release-candidate.sh $VERSION $RC_NUM + + # to verify only certain implementations use the TEST_DEFAULT=0 and TEST_* variables + # here are a couple of examples, but see the source code for the available options + TEST_DEFAULT=0 TEST_CPP=1 dev/release/verify-release-candidate.sh $VERSION $RC_NUM # only C++ tests + TEST_DEFAULT=0 TEST_CPP=1 TEST_PYTHON=1 dev/release/verify-release-candidate.sh $VERSION $RC_NUM # C++ and Python tests + TEST_DEFAULT=0 TEST_INTEGRATION_CPP=1 TEST_INTEGRATION_JAVA=1 dev/release/verify-release-candidate.sh $VERSION $RC_NUM # C++ and Java integration tests + + # to verify certain binaries use the TEST_* variables as: + TEST_DEFAULT=0 TEST_WHEELS=1 dev/release/verify-release-candidate.sh $VERSION $RC_NUM # only Wheels + TEST_DEFAULT=0 TEST_APT=1 dev/release/verify-release-candidate.sh $VERSION $RC_NUM # only APT packages + TEST_DEFAULT=0 TEST_YUM=1 dev/release/verify-release-candidate.sh $VERSION $RC_NUM # 
only YUM packages + TEST_DEFAULT=0 TEST_JARS=1 dev/release/verify-release-candidate.sh $VERSION $RC_NUM # only JARS + +Windows +------- + +In order to run the verification script on Windows you have to download +the source tarball from the SVN dist system that you wish to verify: + +.. code-block:: + + dev\release\verify-release-candidate.bat %VERSION% %RC_NUM% + +System Configuration Instructions +================================= + +You will need some tools installed like curl, git, etcetera. + +Ubuntu +------ + +You might have to install some packages on your system. The following +utility script can be used to set up your Ubuntu system. This will install +the required packages to perform a source verification on a clean +Ubuntu: + +.. code-block:: + + # From the arrow clone + sudo dev/release/setup-ubuntu.sh + +macOS ARM +--------- + +.. code-block:: + + # From the arrow clone + brew install gpg + brew bundle --file=cpp/Brewfile + brew bundle --file=c_glib/Brewfile + brew uninstall node + # You might need to add node, ruby, java and maven to the PATH, follow + # instructions from brew after installing. + brew install node@20 + brew install ruby + brew install openjdk + brew install maven + +Windows 11 +---------- + +To be defined + +Casting your vote +================= + +Once you have performed the verification you can cast your vote by responding +to the vote thread on dev@arrow.apache.org and supplying your result. + +If the verification was successful you can send your +1 vote. We usually send +along with the vote the command that was executed and the local versions used. +As an example: + +.. 
code-block:: + +1 + + I've successfully verified the sources and binaries with: + + TEST_DEFAULT=0 TEST_SOURCE=1 dev/release/verify-release-candidate.sh 15.0.0 1 + TEST_DEFAULT=0 TEST_BINARIES=1 dev/release/verify-release-candidate.sh 15.0.0 1 + with: + * Python 3.10.12 + * gcc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 + * NVIDIA CUDA Build cuda_11.5.r11.5/compiler.30672275_0 + * openjdk version "17.0.9" 2023-10-17 + * ruby 3.0.2p107 (2021-07-07 revision 0db68f0233) [x86_64-linux-gnu] + * dotnet 7.0.115 + * Ubuntu 22.04 LTS + +If there were any issues during verification, please report them on the +mail thread to help diagnose the issue. From b089c6a77bdf2e542a647105ec6bfc3221df85ce Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Thu, 22 Feb 2024 22:16:31 +0000 Subject: [PATCH 46/46] GH-40079: [CI][Packaging] Enable Azure in more tests and builds (#40080) ### Rationale for this change We want python side tests of `AzureFileSystem` to run in CI. ### What changes are included in this PR? - Add missing `export` to enable Azure pyarrow tests - Enable azure in sdist tests. - Enable Azure on macos python builds - Enable azure in conda builds and install dependencies (Azure C++ SDK and azurite) - Enable retries on C++ tests to mitigate https://github.com/apache/arrow/issues/40121 Probably all of this should have been included in https://github.com/apache/arrow/pull/39971 ### Are these changes tested? There is no new functionality to test ### Are there any user-facing changes? 
No * Closes: #40079 * GitHub Issue: #40079 Authored-by: Thomas Newton Signed-off-by: Sutou Kouhei --- .github/workflows/python.yml | 1 + ci/conda_env_cpp.txt | 6 ++++++ ci/docker/conda-cpp.dockerfile | 8 ++++++++ ci/scripts/cpp_test.sh | 1 + ci/scripts/python_sdist_test.sh | 1 + ci/scripts/python_test.sh | 1 + 6 files changed, 18 insertions(+) diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 6e3797b29c21e..25d918bcc25aa 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -132,6 +132,7 @@ jobs: timeout-minutes: 60 env: ARROW_HOME: /usr/local + ARROW_AZURE: ON ARROW_DATASET: ON ARROW_FLIGHT: ON ARROW_GANDIVA: ON diff --git a/ci/conda_env_cpp.txt b/ci/conda_env_cpp.txt index ef00f7cf4751c..b8c792008a958 100644 --- a/ci/conda_env_cpp.txt +++ b/ci/conda_env_cpp.txt @@ -16,6 +16,11 @@ # under the License. aws-sdk-cpp=1.11.68 +azure-core-cpp>=1.10.3 +azure-identity-cpp>=1.6.0 +azure-storage-blobs-cpp>=12.10.0 +azure-storage-common-cpp>=12.5.0 +azure-storage-files-datalake-cpp>=12.9.0 benchmark>=1.6.0 boost-cpp>=1.68.0 brotli @@ -34,6 +39,7 @@ libutf8proc lz4-c make ninja +nodejs orc pkg-config python diff --git a/ci/docker/conda-cpp.dockerfile b/ci/docker/conda-cpp.dockerfile index 7a54dcc86f8fa..dff1f2224809a 100644 --- a/ci/docker/conda-cpp.dockerfile +++ b/ci/docker/conda-cpp.dockerfile @@ -42,6 +42,13 @@ RUN mamba install -q -y \ valgrind && \ mamba clean --all +# Ensure npm, node and azurite are on path. npm and node are required to install azurite, which will then need to +# be on the path for the tests to run. +ENV PATH=/opt/conda/envs/arrow/bin:$PATH + +COPY ci/scripts/install_azurite.sh /arrow/ci/scripts/ +RUN /arrow/ci/scripts/install_azurite.sh + # We want to install the GCS testbench using the same Python binary that the Conda code will use. 
COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts RUN /arrow/ci/scripts/install_gcs_testbench.sh default @@ -50,6 +57,7 @@ COPY ci/scripts/install_sccache.sh /arrow/ci/scripts/ RUN /arrow/ci/scripts/install_sccache.sh unknown-linux-musl /usr/local/bin ENV ARROW_ACERO=ON \ + ARROW_AZURE=ON \ ARROW_BUILD_TESTS=ON \ ARROW_DATASET=ON \ ARROW_DEPENDENCY_SOURCE=CONDA \ diff --git a/ci/scripts/cpp_test.sh b/ci/scripts/cpp_test.sh index 1d685c51a9326..a23ea8eb1cd34 100755 --- a/ci/scripts/cpp_test.sh +++ b/ci/scripts/cpp_test.sh @@ -86,6 +86,7 @@ ctest \ --label-regex unittest \ --output-on-failure \ --parallel ${n_jobs} \ + --repeat until-pass:3 \ --timeout ${ARROW_CTEST_TIMEOUT:-300} \ "${ctest_options[@]}" \ "$@" diff --git a/ci/scripts/python_sdist_test.sh b/ci/scripts/python_sdist_test.sh index d3c6f0e6ade89..1cd1000aa3903 100755 --- a/ci/scripts/python_sdist_test.sh +++ b/ci/scripts/python_sdist_test.sh @@ -28,6 +28,7 @@ export PARQUET_TEST_DATA=${arrow_dir}/cpp/submodules/parquet-testing/data export PYARROW_CMAKE_GENERATOR=${CMAKE_GENERATOR:-Ninja} export PYARROW_BUILD_TYPE=${CMAKE_BUILD_TYPE:-debug} export PYARROW_WITH_ACERO=${ARROW_ACERO:-ON} +export PYARROW_WITH_AZURE=${ARROW_AZURE:-OFF} export PYARROW_WITH_S3=${ARROW_S3:-OFF} export PYARROW_WITH_ORC=${ARROW_ORC:-OFF} export PYARROW_WITH_CUDA=${ARROW_CUDA:-OFF} diff --git a/ci/scripts/python_test.sh b/ci/scripts/python_test.sh index 20ca3300c0538..7b803518494ee 100755 --- a/ci/scripts/python_test.sh +++ b/ci/scripts/python_test.sh @@ -52,6 +52,7 @@ fi : ${PYARROW_TEST_S3:=${ARROW_S3:-ON}} export PYARROW_TEST_ACERO +export PYARROW_TEST_AZURE export PYARROW_TEST_CUDA export PYARROW_TEST_DATASET export PYARROW_TEST_FLIGHT