diff --git a/.gitignore b/.gitignore index dd69b6cec9c5f..e6dfe19bb9807 100644 --- a/.gitignore +++ b/.gitignore @@ -27,4 +27,5 @@ MANIFEST cpp/.idea/ python/.eggs/ -.vscode \ No newline at end of file +.vscode +.idea/ diff --git a/.travis.yml b/.travis.yml index cdf787c831b0f..9cc2b86c05cde 100644 --- a/.travis.yml +++ b/.travis.yml @@ -120,6 +120,27 @@ matrix: - $TRAVIS_BUILD_DIR/ci/travis_before_script_c_glib.sh script: - $TRAVIS_BUILD_DIR/ci/travis_script_c_glib.sh + - compiler: gcc + language: cpp + os: linux + group: deprecated + before_script: + - export CC="gcc-4.9" + - export CXX="g++-4.9" + - $TRAVIS_BUILD_DIR/ci/travis_before_script_cpp.sh + script: + - $TRAVIS_BUILD_DIR/ci/travis_script_cpp.sh + - $TRAVIS_BUILD_DIR/ci/travis_script_plasma.sh + - compiler: clang + osx_image: xcode6.4 + os: osx + cache: + addons: + before_script: + - $TRAVIS_BUILD_DIR/ci/travis_before_script_cpp.sh + script: + - $TRAVIS_BUILD_DIR/ci/travis_script_cpp.sh + - $TRAVIS_BUILD_DIR/ci/travis_script_plasma.sh before_install: - ulimit -c unlimited -S diff --git a/appveyor.yml b/appveyor.yml index c58e1dab7d8fd..91e9ee2649073 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -45,6 +45,13 @@ environment: PYTHON: "3.5" ARCH: "64" CONFIGURATION: "Release" + - JOB: "Build" + GENERATOR: Visual Studio 15 2017 Win64 + PYTHON: "3.5" + ARCH: "64" + CONFIGURATION: "Release" + APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017 + BOOST_ROOT: C:\Libraries\boost_1_64_0 MSVC_DEFAULT_OPTIONS: ON BOOST_ROOT: C:\Libraries\boost_1_63_0 diff --git a/ci/travis_script_manylinux.sh b/ci/travis_script_manylinux.sh index 4e6be62bd3e9d..844d5f719f15a 100755 --- a/ci/travis_script_manylinux.sh +++ b/ci/travis_script_manylinux.sh @@ -18,4 +18,4 @@ set -ex pushd python/manylinux1 git clone ../../ arrow docker build -t arrow-base-x86_64 -f Dockerfile-x86_64 . 
-docker run --rm -e PYARROW_PARALLEL=3 -v $PWD:/io arrow-base-x86_64 /io/build_arrow.sh +docker run --shm-size=2g --rm -e PYARROW_PARALLEL=3 -v $PWD:/io arrow-base-x86_64 /io/build_arrow.sh diff --git a/ci/travis_script_plasma.sh b/ci/travis_script_plasma.sh new file mode 100755 index 0000000000000..fa384ade89c2f --- /dev/null +++ b/ci/travis_script_plasma.sh @@ -0,0 +1,97 @@ +#!/usr/bin/env bash + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. See accompanying LICENSE file. + +set -e + +source $TRAVIS_BUILD_DIR/ci/travis_env_common.sh + +export ARROW_HOME=$ARROW_CPP_INSTALL +export PYARROW_WITH_PLASMA=1 + +pushd $ARROW_PYTHON_DIR + +function build_arrow_libraries() { + CPP_BUILD_DIR=$1 + CPP_DIR=$TRAVIS_BUILD_DIR/cpp + + mkdir $CPP_BUILD_DIR + pushd $CPP_BUILD_DIR + + cmake -DARROW_BUILD_TESTS=off \ + -DARROW_PYTHON=on \ + -DARROW_PLASMA=on \ + -DCMAKE_INSTALL_PREFIX=$2 \ + $CPP_DIR + + make -j4 + make install + + popd +} + +python_version_tests() { + PYTHON_VERSION=$1 + CONDA_ENV_DIR=$TRAVIS_BUILD_DIR/pyarrow-test-$PYTHON_VERSION + + export ARROW_HOME=$TRAVIS_BUILD_DIR/arrow-install-$PYTHON_VERSION + export LD_LIBRARY_PATH=$ARROW_HOME/lib:$PARQUET_HOME/lib + + conda create -y -q -p $CONDA_ENV_DIR python=$PYTHON_VERSION cmake curl + source activate $CONDA_ENV_DIR + + python --version + which python + + # faster builds, please + conda install -y -q nomkl + + # Expensive dependencies install from Continuum package repo + conda install -y -q pip numpy pandas cython + + # 
Build C++ libraries + build_arrow_libraries arrow-build-$PYTHON_VERSION $ARROW_HOME + + # Other stuff pip install + pip install -r requirements.txt + + python setup.py build_ext --inplace + + python -m pytest -vv -r sxX pyarrow + + # Build documentation once + if [[ "$PYTHON_VERSION" == "3.6" ]] + then + conda install -y -q --file=doc/requirements.txt + python setup.py build_sphinx -s doc/source + fi + + # Build and install pyarrow + pushd $TRAVIS_BUILD_DIR/python + python setup.py install + popd + + # Run Plasma tests + pushd $TRAVIS_BUILD_DIR/python + python -m pytest pyarrow/tests/test_plasma.py + if [ $TRAVIS_OS_NAME == "linux" ]; then + PLASMA_VALGRIND=1 python -m pytest pyarrow/tests/test_plasma.py + fi + popd +} + +# run tests for python 2.7 and 3.6 +python_version_tests 2.7 +python_version_tests 3.6 + +popd diff --git a/ci/travis_script_python.sh b/ci/travis_script_python.sh index ac64c548d8225..fdb5ad6a62c93 100755 --- a/ci/travis_script_python.sh +++ b/ci/travis_script_python.sh @@ -17,6 +17,7 @@ set -e source $TRAVIS_BUILD_DIR/ci/travis_env_common.sh export ARROW_HOME=$ARROW_CPP_INSTALL +export PYARROW_WITH_PLASMA=1 pushd $ARROW_PYTHON_DIR export PARQUET_HOME=$TRAVIS_BUILD_DIR/parquet-env @@ -71,9 +72,8 @@ function build_arrow_libraries() { pushd $CPP_BUILD_DIR cmake -DARROW_BUILD_TESTS=off \ - -DARROW_PYTHON=on \ - -DPLASMA_PYTHON=on \ -DARROW_PLASMA=on \ + -DARROW_PYTHON=on \ -DCMAKE_INSTALL_PREFIX=$2 \ $CPP_DIR diff --git a/cpp/.clang-format b/cpp/.clang-format index 33f282a20de20..06453dfbb25b7 100644 --- a/cpp/.clang-format +++ b/cpp/.clang-format @@ -15,67 +15,6 @@ # specific language governing permissions and limitations # under the License. 
--- -Language: Cpp -# BasedOnStyle: Google -AccessModifierOffset: -1 -AlignAfterOpenBracket: false -AlignConsecutiveAssignments: false -AlignEscapedNewlinesLeft: true -AlignOperands: true -AlignTrailingComments: true -AllowAllParametersOfDeclarationOnNextLine: true -AllowShortBlocksOnASingleLine: true -AllowShortCaseLabelsOnASingleLine: false -AllowShortFunctionsOnASingleLine: Inline -AllowShortIfStatementsOnASingleLine: true -AllowShortLoopsOnASingleLine: false -AlwaysBreakAfterDefinitionReturnType: None -AlwaysBreakBeforeMultilineStrings: true -AlwaysBreakTemplateDeclarations: true -BinPackArguments: true -BinPackParameters: true -BreakBeforeBinaryOperators: None -BreakBeforeBraces: Attach -BreakBeforeTernaryOperators: true -BreakConstructorInitializersBeforeComma: false -ColumnLimit: 90 -CommentPragmas: '^ IWYU pragma:' -ConstructorInitializerAllOnOneLineOrOnePerLine: true -ConstructorInitializerIndentWidth: 4 -ContinuationIndentWidth: 4 -Cpp11BracedListStyle: true +BasedOnStyle: Google DerivePointerAlignment: false -DisableFormat: false -ExperimentalAutoDetectBinPacking: false -ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ] -IndentCaseLabels: true -IndentWidth: 2 -IndentWrappedFunctionNames: false -KeepEmptyLinesAtTheStartOfBlocks: false -MacroBlockBegin: '' -MacroBlockEnd: '' -MaxEmptyLinesToKeep: 1 -NamespaceIndentation: None -ObjCBlockIndentWidth: 2 -ObjCSpaceAfterProperty: false -ObjCSpaceBeforeProtocolList: false -PenaltyBreakBeforeFirstCallParameter: 1000 -PenaltyBreakComment: 300 -PenaltyBreakFirstLessLess: 120 -PenaltyBreakString: 1000 -PenaltyExcessCharacter: 1000000 -PenaltyReturnTypeOnItsOwnLine: 200 -PointerAlignment: Left -SpaceAfterCStyleCast: false -SpaceBeforeAssignmentOperators: true -SpaceBeforeParens: ControlStatements -SpaceInEmptyParentheses: false -SpacesBeforeTrailingComments: 2 -SpacesInAngles: false -SpacesInContainerLiterals: true -SpacesInCStyleCastParentheses: false -SpacesInParentheses: false -SpacesInSquareBrackets: false 
-Standard: Cpp11 -TabWidth: 8 -UseTab: Never +ColumnLimit: 90 diff --git a/cpp/cmake_modules/SnappyConfig.h b/cpp/cmake_modules/SnappyConfig.h index 74eb77621626b..c998d1813aa7e 100644 --- a/cpp/cmake_modules/SnappyConfig.h +++ b/cpp/cmake_modules/SnappyConfig.h @@ -29,7 +29,7 @@ #ifndef SNAPPY_CONFIG_H #define SNAPPY_CONFIG_H 1 -#if defined(_MSC_VER) && (_MSC_VER <= 1900) +#if defined(_MSC_VER) && (_MSC_VER <= 1910) typedef __int64 ssize_t; #endif diff --git a/cpp/src/arrow/allocator-test.cc b/cpp/src/arrow/allocator-test.cc index 5a4e98d76600f..f3a80cdae818b 100644 --- a/cpp/src/arrow/allocator-test.cc +++ b/cpp/src/arrow/allocator-test.cc @@ -48,7 +48,7 @@ TEST(stl_allocator, FreeLargeMemory) { #ifndef NDEBUG EXPECT_EXIT(alloc.deallocate(data, 120), ::testing::ExitedWithCode(1), - ".*Check failed: \\(bytes_allocated_\\) >= \\(size\\)"); + ".*Check failed: \\(bytes_allocated_\\) >= \\(size\\)"); #endif alloc.deallocate(data, 100); diff --git a/cpp/src/arrow/array-decimal-test.cc b/cpp/src/arrow/array-decimal-test.cc index 0959d686498d5..436ce9cf7c312 100644 --- a/cpp/src/arrow/array-decimal-test.cc +++ b/cpp/src/arrow/array-decimal-test.cc @@ -28,12 +28,12 @@ namespace decimal { template class DecimalTestBase { public: - virtual std::vector data( - const std::vector& input, size_t byte_width) const = 0; + virtual std::vector data(const std::vector& input, + size_t byte_width) const = 0; void test(int precision, const std::vector& draw, - const std::vector& valid_bytes, - const std::vector& sign_bitmap = {}, int64_t offset = 0) const { + const std::vector& valid_bytes, + const std::vector& sign_bitmap = {}, int64_t offset = 0) const { auto type = std::make_shared(precision, 4); int byte_width = type->byte_width(); auto pool = default_memory_pool(); @@ -63,8 +63,9 @@ class DecimalTestBase { ASSERT_OK(BitUtil::BytesToBits(valid_bytes, &expected_null_bitmap)); int64_t expected_null_count = test::null_count(valid_bytes); - auto expected = std::make_shared(type, size, 
expected_data, - expected_null_bitmap, expected_null_count, offset, expected_sign_bitmap); + auto expected = + std::make_shared(type, size, expected_data, expected_null_bitmap, + expected_null_count, offset, expected_sign_bitmap); std::shared_ptr out; ASSERT_OK(builder->Finish(&out)); @@ -75,8 +76,8 @@ class DecimalTestBase { template class DecimalTest : public DecimalTestBase { public: - std::vector data( - const std::vector& input, size_t byte_width) const override { + std::vector data(const std::vector& input, + size_t byte_width) const override { std::vector result(input.size() * byte_width); // TODO(phillipc): There's probably a better way to do this constexpr static const size_t bytes_per_element = sizeof(T); @@ -90,8 +91,8 @@ class DecimalTest : public DecimalTestBase { template <> class DecimalTest : public DecimalTestBase { public: - std::vector data( - const std::vector& input, size_t byte_width) const override { + std::vector data(const std::vector& input, + size_t byte_width) const override { std::vector result; result.reserve(input.size() * byte_width); constexpr static const size_t bytes_per_element = 16; @@ -120,24 +121,24 @@ class Decimal128BuilderTest : public ::testing::TestWithParam, TEST_P(Decimal32BuilderTest, NoNulls) { int precision = GetParam(); - std::vector draw = { - Decimal32(1), Decimal32(2), Decimal32(2389), Decimal32(4), Decimal32(-12348)}; + std::vector draw = {Decimal32(1), Decimal32(2), Decimal32(2389), + Decimal32(4), Decimal32(-12348)}; std::vector valid_bytes = {true, true, true, true, true}; this->test(precision, draw, valid_bytes); } TEST_P(Decimal64BuilderTest, NoNulls) { int precision = GetParam(); - std::vector draw = { - Decimal64(1), Decimal64(2), Decimal64(2389), Decimal64(4), Decimal64(-12348)}; + std::vector draw = {Decimal64(1), Decimal64(2), Decimal64(2389), + Decimal64(4), Decimal64(-12348)}; std::vector valid_bytes = {true, true, true, true, true}; this->test(precision, draw, valid_bytes); } 
TEST_P(Decimal128BuilderTest, NoNulls) { int precision = GetParam(); - std::vector draw = { - Decimal128(1), Decimal128(-2), Decimal128(2389), Decimal128(4), Decimal128(-12348)}; + std::vector draw = {Decimal128(1), Decimal128(-2), Decimal128(2389), + Decimal128(4), Decimal128(-12348)}; std::vector valid_bytes = {true, true, true, true, true}; std::vector sign_bitmap = {false, true, false, false, true}; this->test(precision, draw, valid_bytes, sign_bitmap); @@ -145,41 +146,47 @@ TEST_P(Decimal128BuilderTest, NoNulls) { TEST_P(Decimal32BuilderTest, WithNulls) { int precision = GetParam(); - std::vector draw = { - Decimal32(1), Decimal32(2), Decimal32(-1), Decimal32(4), Decimal32(-1)}; + std::vector draw = {Decimal32(1), Decimal32(2), Decimal32(-1), Decimal32(4), + Decimal32(-1)}; std::vector valid_bytes = {true, true, false, true, false}; this->test(precision, draw, valid_bytes); } TEST_P(Decimal64BuilderTest, WithNulls) { int precision = GetParam(); - std::vector draw = { - Decimal64(-1), Decimal64(2), Decimal64(-1), Decimal64(4), Decimal64(-1)}; + std::vector draw = {Decimal64(-1), Decimal64(2), Decimal64(-1), Decimal64(4), + Decimal64(-1)}; std::vector valid_bytes = {true, true, false, true, false}; this->test(precision, draw, valid_bytes); } TEST_P(Decimal128BuilderTest, WithNulls) { int precision = GetParam(); - std::vector draw = {Decimal128(1), Decimal128(2), Decimal128(-1), - Decimal128(4), Decimal128(-1), Decimal128(1), Decimal128(2), - Decimal128("230342903942.234234"), Decimal128("-23049302932.235234")}; - std::vector valid_bytes = { - true, true, false, true, false, true, true, true, true}; - std::vector sign_bitmap = { - false, false, false, false, false, false, false, false, true}; + std::vector draw = {Decimal128(1), + Decimal128(2), + Decimal128(-1), + Decimal128(4), + Decimal128(-1), + Decimal128(1), + Decimal128(2), + Decimal128("230342903942.234234"), + Decimal128("-23049302932.235234")}; + std::vector valid_bytes = {true, true, false, true, 
false, + true, true, true, true}; + std::vector sign_bitmap = {false, false, false, false, false, + false, false, false, true}; this->test(precision, draw, valid_bytes, sign_bitmap); } INSTANTIATE_TEST_CASE_P(Decimal32BuilderTest, Decimal32BuilderTest, - ::testing::Range( - DecimalPrecision::minimum, DecimalPrecision::maximum)); + ::testing::Range(DecimalPrecision::minimum, + DecimalPrecision::maximum)); INSTANTIATE_TEST_CASE_P(Decimal64BuilderTest, Decimal64BuilderTest, - ::testing::Range( - DecimalPrecision::minimum, DecimalPrecision::maximum)); + ::testing::Range(DecimalPrecision::minimum, + DecimalPrecision::maximum)); INSTANTIATE_TEST_CASE_P(Decimal128BuilderTest, Decimal128BuilderTest, - ::testing::Range( - DecimalPrecision::minimum, DecimalPrecision::maximum)); + ::testing::Range(DecimalPrecision::minimum, + DecimalPrecision::maximum)); } // namespace decimal } // namespace arrow diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc index acb4819dd0949..5d63d921cdd52 100644 --- a/cpp/src/arrow/array-test.cc +++ b/cpp/src/arrow/array-test.cc @@ -64,8 +64,8 @@ TEST_F(TestArray, TestLength) { ASSERT_EQ(arr->length(), 100); } -Status MakeArrayFromValidBytes( - const vector& v, MemoryPool* pool, std::shared_ptr* out) { +Status MakeArrayFromValidBytes(const vector& v, MemoryPool* pool, + std::shared_ptr* out) { int64_t null_count = v.size() - std::accumulate(v.begin(), v.end(), 0); std::shared_ptr null_buf; @@ -147,7 +147,9 @@ TEST_F(TestArray, TestIsNull) { // clang-format on int64_t null_count = 0; for (uint8_t x : null_bitmap) { - if (x == 0) { ++null_count; } + if (x == 0) { + ++null_count; + } } std::shared_ptr null_buf; @@ -223,8 +225,8 @@ class TestPrimitiveBuilder : public TestBuilder { void Check(const std::unique_ptr& builder, bool nullable) { int64_t size = builder->length(); - auto ex_data = std::make_shared( - reinterpret_cast(draws_.data()), size * sizeof(T)); + auto ex_data = std::make_shared(reinterpret_cast(draws_.data()), + size 
* sizeof(T)); std::shared_ptr ex_null_bitmap; int64_t ex_null_count = 0; @@ -316,8 +318,8 @@ void TestPrimitiveBuilder::RandomData(int64_t N, double pct_null) { } template <> -void TestPrimitiveBuilder::Check( - const std::unique_ptr& builder, bool nullable) { +void TestPrimitiveBuilder::Check(const std::unique_ptr& builder, + bool nullable) { int64_t size = builder->length(); std::shared_ptr ex_data; @@ -351,7 +353,9 @@ void TestPrimitiveBuilder::Check( ASSERT_EQ(expected->length(), result->length()); for (int64_t i = 0; i < result->length(); ++i) { - if (nullable) { ASSERT_EQ(valid_bytes_[i] == 0, result->IsNull(i)) << i; } + if (nullable) { + ASSERT_EQ(valid_bytes_[i] == 0, result->IsNull(i)) << i; + } bool actual = BitUtil::GetBit(result->values()->data(), i); ASSERT_EQ(draws_[i] != 0, actual) << i; } @@ -359,7 +363,7 @@ void TestPrimitiveBuilder::Check( } typedef ::testing::Types + PInt32, PInt64, PFloat, PDouble> Primitives; TYPED_TEST_CASE(TestPrimitiveBuilder, Primitives); @@ -377,7 +381,7 @@ TYPED_TEST(TestPrimitiveBuilder, TestInit) { ASSERT_OK(this->builder_->Reserve(n)); ASSERT_EQ(BitUtil::NextPower2(n), this->builder_->capacity()); ASSERT_EQ(BitUtil::NextPower2(TypeTraits::bytes_required(n)), - this->builder_->data()->size()); + this->builder_->data()->size()); // unsure if this should go in all builder classes ASSERT_EQ(0, this->builder_->num_children()); @@ -440,8 +444,8 @@ TYPED_TEST(TestPrimitiveBuilder, Equality) { ASSERT_OK(MakeArray(valid_bytes, draws, size, builder, &equal_array)); // Make the not equal array by negating the first valid element with itself. 
- const auto first_valid = std::find_if( - valid_bytes.begin(), valid_bytes.end(), [](uint8_t valid) { return valid > 0; }); + const auto first_valid = std::find_if(valid_bytes.begin(), valid_bytes.end(), + [](uint8_t valid) { return valid > 0; }); const int64_t first_valid_idx = std::distance(valid_bytes.begin(), first_valid); // This should be true with a very high probability, but might introduce flakiness ASSERT_LT(first_valid_idx, size - 1); @@ -679,8 +683,8 @@ class TestStringArray : public ::testing::Test { ASSERT_OK(BitUtil::BytesToBits(valid_bytes_, &null_bitmap_)); null_count_ = test::null_count(valid_bytes_); - strings_ = std::make_shared( - length_, offsets_buf_, value_buf_, null_bitmap_, null_count_); + strings_ = std::make_shared(length_, offsets_buf_, value_buf_, + null_bitmap_, null_count_); } protected: @@ -723,8 +727,8 @@ TEST_F(TestStringArray, TestListFunctions) { } TEST_F(TestStringArray, TestDestructor) { - auto arr = std::make_shared( - length_, offsets_buf_, value_buf_, null_bitmap_, null_count_); + auto arr = std::make_shared(length_, offsets_buf_, value_buf_, + null_bitmap_, null_count_); } TEST_F(TestStringArray, TestGetString) { @@ -742,10 +746,10 @@ TEST_F(TestStringArray, TestEmptyStringComparison) { offsets_buf_ = test::GetBufferFromVector(offsets_); length_ = static_cast(offsets_.size() - 1); - auto strings_a = std::make_shared( - length_, offsets_buf_, nullptr, null_bitmap_, null_count_); - auto strings_b = std::make_shared( - length_, offsets_buf_, nullptr, null_bitmap_, null_count_); + auto strings_a = std::make_shared(length_, offsets_buf_, nullptr, + null_bitmap_, null_count_); + auto strings_b = std::make_shared(length_, offsets_buf_, nullptr, + null_bitmap_, null_count_); ASSERT_TRUE(strings_a->Equals(strings_b)); } @@ -893,8 +897,8 @@ class TestBinaryArray : public ::testing::Test { ASSERT_OK(BitUtil::BytesToBits(valid_bytes_, &null_bitmap_)); null_count_ = test::null_count(valid_bytes_); - strings_ = std::make_shared( - 
length_, offsets_buf_, value_buf_, null_bitmap_, null_count_); + strings_ = std::make_shared(length_, offsets_buf_, value_buf_, + null_bitmap_, null_count_); } protected: @@ -937,8 +941,8 @@ TEST_F(TestBinaryArray, TestListFunctions) { } TEST_F(TestBinaryArray, TestDestructor) { - auto arr = std::make_shared( - length_, offsets_buf_, value_buf_, null_bitmap_, null_count_); + auto arr = std::make_shared(length_, offsets_buf_, value_buf_, + null_bitmap_, null_count_); } TEST_F(TestBinaryArray, TestGetValue) { @@ -965,8 +969,9 @@ TEST_F(TestBinaryArray, TestEqualsEmptyStrings) { ASSERT_OK(builder.Finish(&left_arr)); const BinaryArray& left = static_cast(*left_arr); - std::shared_ptr right = std::make_shared(left.length(), - left.value_offsets(), nullptr, left.null_bitmap(), left.null_count()); + std::shared_ptr right = + std::make_shared(left.length(), left.value_offsets(), nullptr, + left.null_bitmap(), left.null_count()); ASSERT_TRUE(left.Equals(right)); ASSERT_TRUE(left.RangeEquals(0, left.length(), 0, right)); @@ -1082,17 +1087,11 @@ void CheckSliceEquality() { ASSERT_TRUE(array->RangeEquals(5, 25, 0, slice)); } -TEST_F(TestBinaryArray, TestSliceEquality) { - CheckSliceEquality(); -} +TEST_F(TestBinaryArray, TestSliceEquality) { CheckSliceEquality(); } -TEST_F(TestStringArray, TestSliceEquality) { - CheckSliceEquality(); -} +TEST_F(TestStringArray, TestSliceEquality) { CheckSliceEquality(); } -TEST_F(TestBinaryArray, LengthZeroCtor) { - BinaryArray array(0, nullptr, nullptr); -} +TEST_F(TestBinaryArray, LengthZeroCtor) { BinaryArray array(0, nullptr, nullptr); } // ---------------------------------------------------------------------- // FixedSizeBinary tests @@ -1126,8 +1125,8 @@ TEST_F(TestFWBinaryArray, Builder) { std::shared_ptr result; - auto CheckResult = [this, &length, &is_valid, &raw_data, &byte_width]( - const Array& result) { + auto CheckResult = [this, &length, &is_valid, &raw_data, + &byte_width](const Array& result) { // Verify output const auto& 
fw_result = static_cast(result); @@ -1135,8 +1134,8 @@ TEST_F(TestFWBinaryArray, Builder) { for (int64_t i = 0; i < result.length(); ++i) { if (is_valid[i]) { - ASSERT_EQ( - 0, memcmp(raw_data + byte_width * i, fw_result.GetValue(i), byte_width)); + ASSERT_EQ(0, + memcmp(raw_data + byte_width * i, fw_result.GetValue(i), byte_width)); } else { ASSERT_TRUE(fw_result.IsNull(i)); } @@ -1323,8 +1322,8 @@ TEST_F(TestAdaptiveIntBuilder, TestInt16) { SetUp(); ASSERT_OK(builder_->Append(std::numeric_limits::max())); ASSERT_OK(builder_->Append(std::numeric_limits::min())); - expected_values = { - std::numeric_limits::max(), std::numeric_limits::min()}; + expected_values = {std::numeric_limits::max(), + std::numeric_limits::min()}; Done(); ArrayFromVector(expected_values, &expected_); @@ -1354,8 +1353,8 @@ TEST_F(TestAdaptiveIntBuilder, TestInt32) { SetUp(); ASSERT_OK(builder_->Append(std::numeric_limits::max())); ASSERT_OK(builder_->Append(std::numeric_limits::min())); - expected_values = { - std::numeric_limits::max(), std::numeric_limits::min()}; + expected_values = {std::numeric_limits::max(), + std::numeric_limits::min()}; Done(); ArrayFromVector(expected_values, &expected_); @@ -1385,8 +1384,8 @@ TEST_F(TestAdaptiveIntBuilder, TestInt64) { SetUp(); ASSERT_OK(builder_->Append(std::numeric_limits::max())); ASSERT_OK(builder_->Append(std::numeric_limits::min())); - expected_values = { - std::numeric_limits::max(), std::numeric_limits::min()}; + expected_values = {std::numeric_limits::max(), + std::numeric_limits::min()}; Done(); ArrayFromVector(expected_values, &expected_); @@ -1505,7 +1504,7 @@ template class TestDictionaryBuilder : public TestBuilder {}; typedef ::testing::Types + UInt32Type, Int64Type, UInt64Type, FloatType, DoubleType> PrimitiveDictionaries; TYPED_TEST_CASE(TestDictionaryBuilder, PrimitiveDictionaries); @@ -1784,7 +1783,7 @@ TEST_F(TestListBuilder, TestAppendNull) { } void ValidateBasicListArray(const ListArray* result, const vector& values, - const 
vector& is_valid) { + const vector& is_valid) { ASSERT_OK(ValidateArray(*result)); ASSERT_EQ(1, result->null_count()); ASSERT_EQ(0, result->values()->null_count()); @@ -1997,9 +1996,12 @@ TEST(TestDictionary, Validate) { // Struct tests void ValidateBasicStructArray(const StructArray* result, - const vector& struct_is_valid, const vector& list_values, - const vector& list_is_valid, const vector& list_lengths, - const vector& list_offsets, const vector& int_values) { + const vector& struct_is_valid, + const vector& list_values, + const vector& list_is_valid, + const vector& list_lengths, + const vector& list_offsets, + const vector& int_values) { ASSERT_EQ(4, result->length()); ASSERT_OK(ValidateArray(*result)); @@ -2134,7 +2136,7 @@ TEST_F(TestStructBuilder, TestBasics) { Done(); ValidateBasicStructArray(result_.get(), struct_is_valid, list_values, list_is_valid, - list_lengths, list_offsets, int_values); + list_lengths, list_offsets, int_values); } TEST_F(TestStructBuilder, BulkAppend) { @@ -2166,7 +2168,7 @@ TEST_F(TestStructBuilder, BulkAppend) { Done(); ValidateBasicStructArray(result_.get(), struct_is_valid, list_values, list_is_valid, - list_lengths, list_offsets, int_values); + list_lengths, list_offsets, int_values); } TEST_F(TestStructBuilder, BulkAppendInvalid) { @@ -2280,7 +2282,7 @@ TEST_F(TestStructBuilder, TestEquality) { // setup an unequal one with unequal offsets ASSERT_OK(builder_->Append(struct_is_valid.size(), struct_is_valid.data())); ASSERT_OK(list_vb->Append(unequal_list_offsets.data(), unequal_list_offsets.size(), - unequal_list_is_valid.data())); + unequal_list_is_valid.data())); for (int8_t value : list_values) { char_vb->UnsafeAppend(value); } diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index 4a405f24342fb..61791c9457756 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -57,45 +57,57 @@ int64_t Array::null_count() const { bool Array::Equals(const Array& arr) const { bool are_equal = false; Status error = 
ArrayEquals(*this, arr, &are_equal); - if (!error.ok()) { DCHECK(false) << "Arrays not comparable: " << error.ToString(); } + if (!error.ok()) { + DCHECK(false) << "Arrays not comparable: " << error.ToString(); + } return are_equal; } bool Array::Equals(const std::shared_ptr& arr) const { - if (!arr) { return false; } + if (!arr) { + return false; + } return Equals(*arr); } bool Array::ApproxEquals(const Array& arr) const { bool are_equal = false; Status error = ArrayApproxEquals(*this, arr, &are_equal); - if (!error.ok()) { DCHECK(false) << "Arrays not comparable: " << error.ToString(); } + if (!error.ok()) { + DCHECK(false) << "Arrays not comparable: " << error.ToString(); + } return are_equal; } bool Array::ApproxEquals(const std::shared_ptr& arr) const { - if (!arr) { return false; } + if (!arr) { + return false; + } return ApproxEquals(*arr); } bool Array::RangeEquals(int64_t start_idx, int64_t end_idx, int64_t other_start_idx, - const std::shared_ptr& other) const { - if (!other) { return false; } + const std::shared_ptr& other) const { + if (!other) { + return false; + } return RangeEquals(*other, start_idx, end_idx, other_start_idx); } bool Array::RangeEquals(const Array& other, int64_t start_idx, int64_t end_idx, - int64_t other_start_idx) const { + int64_t other_start_idx) const { bool are_equal = false; Status error = ArrayRangeEquals(*this, other, start_idx, end_idx, other_start_idx, &are_equal); - if (!error.ok()) { DCHECK(false) << "Arrays not comparable: " << error.ToString(); } + if (!error.ok()) { + DCHECK(false) << "Arrays not comparable: " << error.ToString(); + } return are_equal; } // Last two parameters are in-out parameters -static inline void ConformSliceParams( - int64_t array_offset, int64_t array_length, int64_t* offset, int64_t* length) { +static inline void ConformSliceParams(int64_t array_offset, int64_t array_length, + int64_t* offset, int64_t* length) { DCHECK_LE(*offset, array_length); DCHECK_NE(offset, nullptr); *length = 
std::min(array_length - *offset, *length); @@ -113,8 +125,8 @@ std::string Array::ToString() const { return ss.str(); } -static inline std::shared_ptr SliceData( - const ArrayData& data, int64_t offset, int64_t length) { +static inline std::shared_ptr SliceData(const ArrayData& data, int64_t offset, + int64_t length) { ConformSliceParams(data.offset, data.length, &offset, &length); auto new_data = data.ShallowCopy(); @@ -139,8 +151,9 @@ std::shared_ptr NullArray::Slice(int64_t offset, int64_t length) const { // Primitive array base PrimitiveArray::PrimitiveArray(const std::shared_ptr& type, int64_t length, - const std::shared_ptr& data, const std::shared_ptr& null_bitmap, - int64_t null_count, int64_t offset) { + const std::shared_ptr& data, + const std::shared_ptr& null_bitmap, + int64_t null_count, int64_t offset) { BufferVector buffers = {null_bitmap, data}; SetData( std::make_shared(type, length, std::move(buffers), null_count, offset)); @@ -166,7 +179,8 @@ BooleanArray::BooleanArray(const std::shared_ptr& data) } BooleanArray::BooleanArray(int64_t length, const std::shared_ptr& data, - const std::shared_ptr& null_bitmap, int64_t null_count, int64_t offset) + const std::shared_ptr& null_bitmap, int64_t null_count, + int64_t offset) : PrimitiveArray(boolean(), length, data, null_bitmap, null_count, offset) {} std::shared_ptr BooleanArray::Slice(int64_t offset, int64_t length) const { @@ -182,8 +196,10 @@ ListArray::ListArray(const std::shared_ptr& data) { } ListArray::ListArray(const std::shared_ptr& type, int64_t length, - const std::shared_ptr& value_offsets, const std::shared_ptr& values, - const std::shared_ptr& null_bitmap, int64_t null_count, int64_t offset) { + const std::shared_ptr& value_offsets, + const std::shared_ptr& values, + const std::shared_ptr& null_bitmap, int64_t null_count, + int64_t offset) { BufferVector buffers = {null_bitmap, value_offsets}; auto internal_data = std::make_shared(type, length, std::move(buffers), null_count, offset); @@ 
-192,7 +208,7 @@ ListArray::ListArray(const std::shared_ptr& type, int64_t length, } Status ListArray::FromArrays(const Array& offsets, const Array& values, MemoryPool* pool, - std::shared_ptr* out) { + std::shared_ptr* out) { if (ARROW_PREDICT_FALSE(offsets.length() == 0)) { return Status::Invalid("List offsets must have non-zero length"); } @@ -205,12 +221,13 @@ Status ListArray::FromArrays(const Array& offsets, const Array& values, MemoryPo return Status::Invalid("List offsets must be signed int32"); } - BufferVector buffers = { - offsets.null_bitmap(), static_cast(offsets).values()}; + BufferVector buffers = {offsets.null_bitmap(), + static_cast(offsets).values()}; auto list_type = list(values.type()); - auto internal_data = std::make_shared(list_type, - offsets.length() - 1, std::move(buffers), offsets.null_count(), offsets.offset()); + auto internal_data = std::make_shared( + list_type, offsets.length() - 1, std::move(buffers), offsets.null_count(), + offsets.offset()); internal_data->child_data.push_back(values.data()); *out = std::make_shared(internal_data); @@ -230,14 +247,12 @@ std::shared_ptr ListArray::value_type() const { return static_cast(*type()).value_type(); } -std::shared_ptr ListArray::values() const { - return values_; -} +std::shared_ptr ListArray::values() const { return values_; } std::shared_ptr ListArray::Slice(int64_t offset, int64_t length) const { ConformSliceParams(data_->offset, data_->length, &offset, &length); return std::make_shared(type(), length, value_offsets(), values(), - null_bitmap(), kUnknownNullCount, offset); + null_bitmap(), kUnknownNullCount, offset); } // ---------------------------------------------------------------------- @@ -262,14 +277,17 @@ void BinaryArray::SetData(const std::shared_ptr& data) { } BinaryArray::BinaryArray(int64_t length, const std::shared_ptr& value_offsets, - const std::shared_ptr& data, const std::shared_ptr& null_bitmap, - int64_t null_count, int64_t offset) + const std::shared_ptr& data, + 
const std::shared_ptr& null_bitmap, int64_t null_count, + int64_t offset) : BinaryArray(kBinary, length, value_offsets, data, null_bitmap, null_count, offset) { } BinaryArray::BinaryArray(const std::shared_ptr& type, int64_t length, - const std::shared_ptr& value_offsets, const std::shared_ptr& data, - const std::shared_ptr& null_bitmap, int64_t null_count, int64_t offset) { + const std::shared_ptr& value_offsets, + const std::shared_ptr& data, + const std::shared_ptr& null_bitmap, int64_t null_count, + int64_t offset) { BufferVector buffers = {null_bitmap, value_offsets, data}; SetData( std::make_shared(type, length, std::move(buffers), null_count, offset)); @@ -285,8 +303,9 @@ StringArray::StringArray(const std::shared_ptr& data) { } StringArray::StringArray(int64_t length, const std::shared_ptr& value_offsets, - const std::shared_ptr& data, const std::shared_ptr& null_bitmap, - int64_t null_count, int64_t offset) + const std::shared_ptr& data, + const std::shared_ptr& null_bitmap, int64_t null_count, + int64_t offset) : BinaryArray(kString, length, value_offsets, data, null_bitmap, null_count, offset) { } @@ -304,8 +323,10 @@ FixedSizeBinaryArray::FixedSizeBinaryArray( } FixedSizeBinaryArray::FixedSizeBinaryArray(const std::shared_ptr& type, - int64_t length, const std::shared_ptr& data, - const std::shared_ptr& null_bitmap, int64_t null_count, int64_t offset) + int64_t length, + const std::shared_ptr& data, + const std::shared_ptr& null_bitmap, + int64_t null_count, int64_t offset) : PrimitiveArray(type, length, data, null_bitmap, null_count, offset), byte_width_(static_cast(*type).byte_width()) {} @@ -335,8 +356,9 @@ void DecimalArray::SetData(const std::shared_ptr& data) { } DecimalArray::DecimalArray(const std::shared_ptr& type, int64_t length, - const std::shared_ptr& data, const std::shared_ptr& null_bitmap, - int64_t null_count, int64_t offset, const std::shared_ptr& sign_bitmap) { + const std::shared_ptr& data, + const std::shared_ptr& null_bitmap, 
int64_t null_count, + int64_t offset, const std::shared_ptr& sign_bitmap) { BufferVector buffers = {null_bitmap, data, sign_bitmap}; SetData( std::make_shared(type, length, std::move(buffers), null_count, offset)); @@ -392,8 +414,9 @@ StructArray::StructArray(const std::shared_ptr& data) { } StructArray::StructArray(const std::shared_ptr& type, int64_t length, - const std::vector>& children, - std::shared_ptr null_bitmap, int64_t null_count, int64_t offset) { + const std::vector>& children, + std::shared_ptr null_bitmap, int64_t null_count, + int64_t offset) { BufferVector buffers = {null_bitmap}; SetData( std::make_shared(type, length, std::move(buffers), null_count, offset)); @@ -433,9 +456,11 @@ UnionArray::UnionArray(const std::shared_ptr& data) { } UnionArray::UnionArray(const std::shared_ptr& type, int64_t length, - const std::vector>& children, - const std::shared_ptr& type_ids, const std::shared_ptr& value_offsets, - const std::shared_ptr& null_bitmap, int64_t null_count, int64_t offset) { + const std::vector>& children, + const std::shared_ptr& type_ids, + const std::shared_ptr& value_offsets, + const std::shared_ptr& null_bitmap, int64_t null_count, + int64_t offset) { BufferVector buffers = {null_bitmap, type_ids, value_offsets}; auto internal_data = std::make_shared(type, length, std::move(buffers), null_count, offset); @@ -464,8 +489,8 @@ DictionaryArray::DictionaryArray(const std::shared_ptr& data) SetData(data); } -DictionaryArray::DictionaryArray( - const std::shared_ptr& type, const std::shared_ptr& indices) +DictionaryArray::DictionaryArray(const std::shared_ptr& type, + const std::shared_ptr& indices) : dict_type_(static_cast(type.get())) { DCHECK_EQ(type->id(), Type::DICTIONARY); DCHECK_EQ(indices->type_id(), dict_type_->index_type()->id()); @@ -482,9 +507,7 @@ void DictionaryArray::SetData(const std::shared_ptr& data) { DCHECK(internal::MakeArray(indices_data, &indices_).ok()); } -std::shared_ptr DictionaryArray::indices() const { - return 
indices_; -} +std::shared_ptr DictionaryArray::indices() const { return indices_; } std::shared_ptr DictionaryArray::dictionary() const { return dict_type_->dictionary(); @@ -517,7 +540,9 @@ struct ValidateVisitor { } Status Visit(const ListArray& array) { - if (array.length() < 0) { return Status::Invalid("Length was negative"); } + if (array.length() < 0) { + return Status::Invalid("Length was negative"); + } auto value_offsets = array.value_offsets(); if (array.length() && !value_offsets) { @@ -550,7 +575,9 @@ struct ValidateVisitor { } int32_t prev_offset = array.value_offset(0); - if (prev_offset != 0) { return Status::Invalid("The first offset wasn't zero"); } + if (prev_offset != 0) { + return Status::Invalid("The first offset wasn't zero"); + } for (int64_t i = 1; i <= array.length(); ++i) { int32_t current_offset = array.value_offset(i); if (array.IsNull(i - 1) && current_offset != prev_offset) { @@ -573,7 +600,9 @@ struct ValidateVisitor { } Status Visit(const StructArray& array) { - if (array.length() < 0) { return Status::Invalid("Length was negative"); } + if (array.length() < 0) { + return Status::Invalid("Length was negative"); + } if (array.null_count() > array.length()) { return Status::Invalid("Null count exceeds the length of this struct"); @@ -610,7 +639,9 @@ struct ValidateVisitor { } Status Visit(const UnionArray& array) { - if (array.length() < 0) { return Status::Invalid("Length was negative"); } + if (array.length() < 0) { + return Status::Invalid("Length was negative"); + } if (array.null_count() > array.length()) { return Status::Invalid("Null count exceeds the length of this struct"); @@ -661,8 +692,9 @@ Status MakeArray(const std::shared_ptr& data, std::shared_ptr* } // namespace internal Status MakePrimitiveArray(const std::shared_ptr& type, int64_t length, - const std::shared_ptr& data, const std::shared_ptr& null_bitmap, - int64_t null_count, int64_t offset, std::shared_ptr* out) { + const std::shared_ptr& data, + const 
std::shared_ptr& null_bitmap, int64_t null_count, + int64_t offset, std::shared_ptr* out) { BufferVector buffers = {null_bitmap, data}; auto internal_data = std::make_shared( type, length, std::move(buffers), null_count, offset); @@ -670,8 +702,9 @@ Status MakePrimitiveArray(const std::shared_ptr& type, int64_t length, } Status MakePrimitiveArray(const std::shared_ptr& type, - const std::vector>& buffers, int64_t length, - int64_t null_count, int64_t offset, std::shared_ptr* out) { + const std::vector>& buffers, + int64_t length, int64_t null_count, int64_t offset, + std::shared_ptr* out) { auto internal_data = std::make_shared(type, length, buffers, null_count, offset); return internal::MakeArray(internal_data, out); diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index c32d5e1c93ffd..a853f2bb5f93d 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -88,8 +88,8 @@ struct ARROW_EXPORT ArrayData { ArrayData() {} ArrayData(const std::shared_ptr& type, int64_t length, - const std::vector>& buffers, - int64_t null_count = kUnknownNullCount, int64_t offset = 0) + const std::vector>& buffers, + int64_t null_count = kUnknownNullCount, int64_t offset = 0) : type(type), length(length), buffers(buffers), @@ -97,8 +97,8 @@ struct ARROW_EXPORT ArrayData { offset(offset) {} ArrayData(const std::shared_ptr& type, int64_t length, - std::vector>&& buffers, - int64_t null_count = kUnknownNullCount, int64_t offset = 0) + std::vector>&& buffers, + int64_t null_count = kUnknownNullCount, int64_t offset = 0) : type(type), length(length), buffers(std::move(buffers)), @@ -145,8 +145,8 @@ struct ARROW_EXPORT ArrayData { std::vector> child_data; }; -Status ARROW_EXPORT MakeArray( - const std::shared_ptr& data, std::shared_ptr* out); +Status ARROW_EXPORT MakeArray(const std::shared_ptr& data, + std::shared_ptr* out); } // namespace internal @@ -211,10 +211,10 @@ class ARROW_EXPORT Array { /// Compare if the range of slots specified are equal for the given array and 
/// this array. end_idx exclusive. This methods does not bounds check. bool RangeEquals(int64_t start_idx, int64_t end_idx, int64_t other_start_idx, - const std::shared_ptr& other) const; + const std::shared_ptr& other) const; bool RangeEquals(const Array& other, int64_t start_idx, int64_t end_idx, - int64_t other_start_idx) const; + int64_t other_start_idx) const; Status Accept(ArrayVisitor* visitor) const; @@ -285,9 +285,9 @@ class ARROW_EXPORT NullArray : public FlatArray { class ARROW_EXPORT PrimitiveArray : public FlatArray { public: PrimitiveArray(const std::shared_ptr& type, int64_t length, - const std::shared_ptr& data, - const std::shared_ptr& null_bitmap = nullptr, int64_t null_count = 0, - int64_t offset = 0); + const std::shared_ptr& data, + const std::shared_ptr& null_bitmap = nullptr, + int64_t null_count = 0, int64_t offset = 0); /// Does not account for any slice offset std::shared_ptr values() const { return data_->buffers[1]; } @@ -328,7 +328,7 @@ class ARROW_EXPORT NumericArray : public PrimitiveArray { const std::shared_ptr& null_bitmap = nullptr, int64_t null_count = 0, int64_t offset = 0) : PrimitiveArray(TypeTraits::type_singleton(), length, data, null_bitmap, - null_count, offset) {} + null_count, offset) {} const value_type* raw_values() const { return reinterpret_cast(raw_values_) + data_->offset; @@ -349,14 +349,14 @@ class ARROW_EXPORT BooleanArray : public PrimitiveArray { explicit BooleanArray(const std::shared_ptr& data); BooleanArray(int64_t length, const std::shared_ptr& data, - const std::shared_ptr& null_bitmap = nullptr, int64_t null_count = 0, - int64_t offset = 0); + const std::shared_ptr& null_bitmap = nullptr, + int64_t null_count = 0, int64_t offset = 0); std::shared_ptr Slice(int64_t offset, int64_t length) const override; bool Value(int64_t i) const { - return BitUtil::GetBit( - reinterpret_cast(raw_values_), i + data_->offset); + return BitUtil::GetBit(reinterpret_cast(raw_values_), + i + data_->offset); } protected: @@ 
-373,9 +373,10 @@ class ARROW_EXPORT ListArray : public Array { explicit ListArray(const std::shared_ptr& data); ListArray(const std::shared_ptr& type, int64_t length, - const std::shared_ptr& value_offsets, const std::shared_ptr& values, - const std::shared_ptr& null_bitmap = nullptr, int64_t null_count = 0, - int64_t offset = 0); + const std::shared_ptr& value_offsets, + const std::shared_ptr& values, + const std::shared_ptr& null_bitmap = nullptr, int64_t null_count = 0, + int64_t offset = 0); /// \brief Construct ListArray from array of offsets and child value array /// @@ -388,7 +389,7 @@ class ARROW_EXPORT ListArray : public Array { /// allocated because of null values /// \param[out] out Will have length equal to offsets.length() - 1 static Status FromArrays(const Array& offsets, const Array& values, MemoryPool* pool, - std::shared_ptr* out); + std::shared_ptr* out); /// \brief Return array object containing the list's values std::shared_ptr values() const; @@ -428,9 +429,9 @@ class ARROW_EXPORT BinaryArray : public FlatArray { explicit BinaryArray(const std::shared_ptr& data); BinaryArray(int64_t length, const std::shared_ptr& value_offsets, - const std::shared_ptr& data, - const std::shared_ptr& null_bitmap = nullptr, int64_t null_count = 0, - int64_t offset = 0); + const std::shared_ptr& data, + const std::shared_ptr& null_bitmap = nullptr, + int64_t null_count = 0, int64_t offset = 0); // Return the pointer to the given elements bytes // TODO(emkornfield) introduce a StringPiece or something similar to capture zero-copy @@ -471,9 +472,10 @@ class ARROW_EXPORT BinaryArray : public FlatArray { // Constructor that allows sub-classes/builders to propagate there logical type up the // class hierarchy. 
BinaryArray(const std::shared_ptr& type, int64_t length, - const std::shared_ptr& value_offsets, const std::shared_ptr& data, - const std::shared_ptr& null_bitmap = nullptr, int64_t null_count = 0, - int64_t offset = 0); + const std::shared_ptr& value_offsets, + const std::shared_ptr& data, + const std::shared_ptr& null_bitmap = nullptr, + int64_t null_count = 0, int64_t offset = 0); const int32_t* raw_value_offsets_; const uint8_t* raw_data_; @@ -486,9 +488,9 @@ class ARROW_EXPORT StringArray : public BinaryArray { explicit StringArray(const std::shared_ptr& data); StringArray(int64_t length, const std::shared_ptr& value_offsets, - const std::shared_ptr& data, - const std::shared_ptr& null_bitmap = nullptr, int64_t null_count = 0, - int64_t offset = 0); + const std::shared_ptr& data, + const std::shared_ptr& null_bitmap = nullptr, + int64_t null_count = 0, int64_t offset = 0); // Construct a std::string // TODO: std::bad_alloc possibility @@ -511,9 +513,9 @@ class ARROW_EXPORT FixedSizeBinaryArray : public PrimitiveArray { explicit FixedSizeBinaryArray(const std::shared_ptr& data); FixedSizeBinaryArray(const std::shared_ptr& type, int64_t length, - const std::shared_ptr& data, - const std::shared_ptr& null_bitmap = nullptr, int64_t null_count = 0, - int64_t offset = 0); + const std::shared_ptr& data, + const std::shared_ptr& null_bitmap = nullptr, + int64_t null_count = 0, int64_t offset = 0); const uint8_t* GetValue(int64_t i) const; @@ -542,9 +544,10 @@ class ARROW_EXPORT DecimalArray : public FlatArray { explicit DecimalArray(const std::shared_ptr& data); DecimalArray(const std::shared_ptr& type, int64_t length, - const std::shared_ptr& data, - const std::shared_ptr& null_bitmap = nullptr, int64_t null_count = 0, - int64_t offset = 0, const std::shared_ptr& sign_bitmap = nullptr); + const std::shared_ptr& data, + const std::shared_ptr& null_bitmap = nullptr, + int64_t null_count = 0, int64_t offset = 0, + const std::shared_ptr& sign_bitmap = nullptr); bool 
IsNegative(int64_t i) const; @@ -582,9 +585,9 @@ class ARROW_EXPORT StructArray : public Array { explicit StructArray(const std::shared_ptr& data); StructArray(const std::shared_ptr& type, int64_t length, - const std::vector>& children, - std::shared_ptr null_bitmap = nullptr, int64_t null_count = 0, - int64_t offset = 0); + const std::vector>& children, + std::shared_ptr null_bitmap = nullptr, int64_t null_count = 0, + int64_t offset = 0); // Return a shared pointer in case the requestor desires to share ownership // with this array. @@ -604,11 +607,11 @@ class ARROW_EXPORT UnionArray : public Array { explicit UnionArray(const std::shared_ptr& data); UnionArray(const std::shared_ptr& type, int64_t length, - const std::vector>& children, - const std::shared_ptr& type_ids, - const std::shared_ptr& value_offsets = nullptr, - const std::shared_ptr& null_bitmap = nullptr, int64_t null_count = 0, - int64_t offset = 0); + const std::vector>& children, + const std::shared_ptr& type_ids, + const std::shared_ptr& value_offsets = nullptr, + const std::shared_ptr& null_bitmap = nullptr, int64_t null_count = 0, + int64_t offset = 0); /// Note that this buffer does not account for any slice offset std::shared_ptr type_ids() const { return data_->buffers[1]; } @@ -656,8 +659,8 @@ class ARROW_EXPORT DictionaryArray : public Array { explicit DictionaryArray(const std::shared_ptr& data); - DictionaryArray( - const std::shared_ptr& type, const std::shared_ptr& indices); + DictionaryArray(const std::shared_ptr& type, + const std::shared_ptr& indices); std::shared_ptr indices() const; std::shared_ptr dictionary() const; @@ -705,13 +708,16 @@ Status ARROW_EXPORT ValidateArray(const Array& array); /// Create new arrays for logical types that are backed by primitive arrays. 
Status ARROW_EXPORT MakePrimitiveArray(const std::shared_ptr& type, - int64_t length, const std::shared_ptr& data, - const std::shared_ptr& null_bitmap, int64_t null_count, int64_t offset, - std::shared_ptr* out); - -Status ARROW_EXPORT MakePrimitiveArray(const std::shared_ptr& type, - const std::vector>& buffers, int64_t length, - int64_t null_count, int64_t offset, std::shared_ptr* out); + int64_t length, + const std::shared_ptr& data, + const std::shared_ptr& null_bitmap, + int64_t null_count, int64_t offset, + std::shared_ptr* out); + +Status ARROW_EXPORT +MakePrimitiveArray(const std::shared_ptr& type, + const std::vector>& buffers, int64_t length, + int64_t null_count, int64_t offset, std::shared_ptr* out); } // namespace arrow diff --git a/cpp/src/arrow/buffer.cc b/cpp/src/arrow/buffer.cc index a1d119ecdcae5..b9c5897f8a228 100644 --- a/cpp/src/arrow/buffer.cc +++ b/cpp/src/arrow/buffer.cc @@ -27,8 +27,8 @@ namespace arrow { -Status Buffer::Copy( - int64_t start, int64_t nbytes, MemoryPool* pool, std::shared_ptr* out) const { +Status Buffer::Copy(int64_t start, int64_t nbytes, MemoryPool* pool, + std::shared_ptr* out) const { // Sanity checks DCHECK_LT(start, size_); DCHECK_LE(nbytes, size_ - start); @@ -47,25 +47,28 @@ Status Buffer::Copy(int64_t start, int64_t nbytes, std::shared_ptr* out) } bool Buffer::Equals(const Buffer& other, int64_t nbytes) const { - return this == &other || - (size_ >= nbytes && other.size_ >= nbytes && - (data_ == other.data_ || - !memcmp(data_, other.data_, static_cast(nbytes)))); + return this == &other || (size_ >= nbytes && other.size_ >= nbytes && + (data_ == other.data_ || + !memcmp(data_, other.data_, static_cast(nbytes)))); } bool Buffer::Equals(const Buffer& other) const { - return this == &other || (size_ == other.size_ && (data_ == other.data_ || - !memcmp(data_, other.data_, - static_cast(size_)))); + return this == &other || (size_ == other.size_ && + (data_ == other.data_ || + !memcmp(data_, other.data_, 
static_cast(size_)))); } PoolBuffer::PoolBuffer(MemoryPool* pool) : ResizableBuffer(nullptr, 0) { - if (pool == nullptr) { pool = default_memory_pool(); } + if (pool == nullptr) { + pool = default_memory_pool(); + } pool_ = pool; } PoolBuffer::~PoolBuffer() { - if (mutable_data_ != nullptr) { pool_->Free(mutable_data_, capacity_); } + if (mutable_data_ != nullptr) { + pool_->Free(mutable_data_, capacity_); + } } Status PoolBuffer::Reserve(int64_t new_capacity) { @@ -109,28 +112,28 @@ Status PoolBuffer::Resize(int64_t new_size, bool shrink_to_fit) { return Status::OK(); } -std::shared_ptr SliceMutableBuffer( - const std::shared_ptr& buffer, int64_t offset, int64_t length) { +std::shared_ptr SliceMutableBuffer(const std::shared_ptr& buffer, + int64_t offset, int64_t length) { return std::make_shared(buffer, offset, length); } -MutableBuffer::MutableBuffer( - const std::shared_ptr& parent, int64_t offset, int64_t size) +MutableBuffer::MutableBuffer(const std::shared_ptr& parent, int64_t offset, + int64_t size) : MutableBuffer(parent->mutable_data() + offset, size) { DCHECK(parent->is_mutable()) << "Must pass mutable buffer"; parent_ = parent; } -Status AllocateBuffer( - MemoryPool* pool, int64_t size, std::shared_ptr* out) { +Status AllocateBuffer(MemoryPool* pool, int64_t size, + std::shared_ptr* out) { auto buffer = std::make_shared(pool); RETURN_NOT_OK(buffer->Resize(size)); *out = buffer; return Status::OK(); } -Status AllocateResizableBuffer( - MemoryPool* pool, int64_t size, std::shared_ptr* out) { +Status AllocateResizableBuffer(MemoryPool* pool, int64_t size, + std::shared_ptr* out) { auto buffer = std::make_shared(pool); RETURN_NOT_OK(buffer->Resize(size)); *out = buffer; diff --git a/cpp/src/arrow/buffer.h b/cpp/src/arrow/buffer.h index 488a4c05334d5..09e539d162fb2 100644 --- a/cpp/src/arrow/buffer.h +++ b/cpp/src/arrow/buffer.h @@ -72,7 +72,7 @@ class ARROW_EXPORT Buffer { /// Copy a section of the buffer into a new Buffer. 
Status Copy(int64_t start, int64_t nbytes, MemoryPool* pool, - std::shared_ptr* out) const; + std::shared_ptr* out) const; /// Copy a section of the buffer using the default memory pool into a new Buffer. Status Copy(int64_t start, int64_t nbytes, std::shared_ptr* out) const; @@ -106,21 +106,21 @@ class ARROW_EXPORT Buffer { /// \param str std::string instance /// \return std::shared_ptr static inline std::shared_ptr GetBufferFromString(const std::string& str) { - return std::make_shared( - reinterpret_cast(str.c_str()), static_cast(str.size())); + return std::make_shared(reinterpret_cast(str.c_str()), + static_cast(str.size())); } /// Construct a view on passed buffer at the indicated offset and length. This /// function cannot fail and does not error checking (except in debug builds) -static inline std::shared_ptr SliceBuffer( - const std::shared_ptr& buffer, int64_t offset, int64_t length) { +static inline std::shared_ptr SliceBuffer(const std::shared_ptr& buffer, + int64_t offset, int64_t length) { return std::make_shared(buffer, offset, length); } /// Construct a mutable buffer slice. If the parent buffer is not mutable, this /// will abort in debug builds -std::shared_ptr ARROW_EXPORT SliceMutableBuffer( - const std::shared_ptr& buffer, int64_t offset, int64_t length); +std::shared_ptr ARROW_EXPORT +SliceMutableBuffer(const std::shared_ptr& buffer, int64_t offset, int64_t length); /// A Buffer whose contents can be mutated. May or may not own its data. 
class ARROW_EXPORT MutableBuffer : public Buffer { @@ -186,8 +186,12 @@ class ARROW_EXPORT BufferBuilder { /// Resizes the buffer to the nearest multiple of 64 bytes per Layout.md Status Resize(int64_t elements) { // Resize(0) is a no-op - if (elements == 0) { return Status::OK(); } - if (capacity_ == 0) { buffer_ = std::make_shared(pool_); } + if (elements == 0) { + return Status::OK(); + } + if (capacity_ == 0) { + buffer_ = std::make_shared(pool_); + } int64_t old_capacity = capacity_; RETURN_NOT_OK(buffer_->Resize(elements)); capacity_ = buffer_->capacity(); @@ -199,14 +203,18 @@ class ARROW_EXPORT BufferBuilder { } Status Append(const uint8_t* data, int64_t length) { - if (capacity_ < length + size_) { RETURN_NOT_OK(Resize(length + size_)); } + if (capacity_ < length + size_) { + RETURN_NOT_OK(Resize(length + size_)); + } UnsafeAppend(data, length); return Status::OK(); } // Advance pointer and zero out memory Status Advance(int64_t length) { - if (capacity_ < length + size_) { RETURN_NOT_OK(Resize(length + size_)); } + if (capacity_ < length + size_) { + RETURN_NOT_OK(Resize(length + size_)); + } memset(data_ + size_, 0, static_cast(length)); size_ += length; return Status::OK(); @@ -220,7 +228,9 @@ class ARROW_EXPORT BufferBuilder { Status Finish(std::shared_ptr* out) { // Do not shrink to fit to avoid unneeded realloc - if (size_ > 0) { RETURN_NOT_OK(buffer_->Resize(size_, false)); } + if (size_ > 0) { + RETURN_NOT_OK(buffer_->Resize(size_, false)); + } *out = buffer_; Reset(); return Status::OK(); @@ -250,29 +260,29 @@ class ARROW_EXPORT TypedBufferBuilder : public BufferBuilder { Status Append(T arithmetic_value) { static_assert(std::is_arithmetic::value, - "Convenience buffer append only supports arithmetic types"); - return BufferBuilder::Append( - reinterpret_cast(&arithmetic_value), sizeof(T)); + "Convenience buffer append only supports arithmetic types"); + return BufferBuilder::Append(reinterpret_cast(&arithmetic_value), + sizeof(T)); } Status 
Append(const T* arithmetic_values, int64_t num_elements) { static_assert(std::is_arithmetic::value, - "Convenience buffer append only supports arithmetic types"); - return BufferBuilder::Append( - reinterpret_cast(arithmetic_values), num_elements * sizeof(T)); + "Convenience buffer append only supports arithmetic types"); + return BufferBuilder::Append(reinterpret_cast(arithmetic_values), + num_elements * sizeof(T)); } void UnsafeAppend(T arithmetic_value) { static_assert(std::is_arithmetic::value, - "Convenience buffer append only supports arithmetic types"); + "Convenience buffer append only supports arithmetic types"); BufferBuilder::UnsafeAppend(reinterpret_cast(&arithmetic_value), sizeof(T)); } void UnsafeAppend(const T* arithmetic_values, int64_t num_elements) { static_assert(std::is_arithmetic::value, - "Convenience buffer append only supports arithmetic types"); - BufferBuilder::UnsafeAppend( - reinterpret_cast(arithmetic_values), num_elements * sizeof(T)); + "Convenience buffer append only supports arithmetic types"); + BufferBuilder::UnsafeAppend(reinterpret_cast(arithmetic_values), + num_elements * sizeof(T)); } const T* data() const { return reinterpret_cast(data_); } @@ -286,11 +296,11 @@ class ARROW_EXPORT TypedBufferBuilder : public BufferBuilder { /// \param[out] out the allocated buffer with padding /// /// \return Status message -Status ARROW_EXPORT AllocateBuffer( - MemoryPool* pool, int64_t size, std::shared_ptr* out); +Status ARROW_EXPORT AllocateBuffer(MemoryPool* pool, int64_t size, + std::shared_ptr* out); -Status ARROW_EXPORT AllocateResizableBuffer( - MemoryPool* pool, int64_t size, std::shared_ptr* out); +Status ARROW_EXPORT AllocateResizableBuffer(MemoryPool* pool, int64_t size, + std::shared_ptr* out); } // namespace arrow diff --git a/cpp/src/arrow/builder-benchmark.cc b/cpp/src/arrow/builder-benchmark.cc index 7ca7bb4999801..8ba9360e917fc 100644 --- a/cpp/src/arrow/builder-benchmark.cc +++ b/cpp/src/arrow/builder-benchmark.cc @@ -38,8 
+38,8 @@ static void BM_BuildPrimitiveArrayNoNulls( std::shared_ptr out; ABORT_NOT_OK(builder.Finish(&out)); } - state.SetBytesProcessed( - state.iterations() * data.size() * sizeof(int64_t) * kFinalSize); + state.SetBytesProcessed(state.iterations() * data.size() * sizeof(int64_t) * + kFinalSize); } static void BM_BuildVectorNoNulls( @@ -53,8 +53,8 @@ static void BM_BuildVectorNoNulls( builder.insert(builder.end(), data.cbegin(), data.cend()); } } - state.SetBytesProcessed( - state.iterations() * data.size() * sizeof(int64_t) * kFinalSize); + state.SetBytesProcessed(state.iterations() * data.size() * sizeof(int64_t) * + kFinalSize); } static void BM_BuildAdaptiveIntNoNulls( @@ -127,8 +127,8 @@ static void BM_BuildDictionary(benchmark::State& state) { // NOLINT non-const r std::shared_ptr out; ABORT_NOT_OK(builder.Finish(&out)); } - state.SetBytesProcessed( - state.iterations() * iterations * (iterations + 1) / 2 * sizeof(int64_t)); + state.SetBytesProcessed(state.iterations() * iterations * (iterations + 1) / 2 * + sizeof(int64_t)); } static void BM_BuildStringDictionary( @@ -152,8 +152,8 @@ static void BM_BuildStringDictionary( ABORT_NOT_OK(builder.Finish(&out)); } // Assuming a string here needs on average 2 bytes - state.SetBytesProcessed( - state.iterations() * iterations * (iterations + 1) / 2 * sizeof(int32_t)); + state.SetBytesProcessed(state.iterations() * iterations * (iterations + 1) / 2 * + sizeof(int32_t)); } BENCHMARK(BM_BuildPrimitiveArrayNoNulls)->Repetitions(3)->Unit(benchmark::kMicrosecond); diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index ee363b91d8fcc..d3a299e5412fc 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -69,7 +69,9 @@ Status ArrayBuilder::Init(int64_t capacity) { } Status ArrayBuilder::Resize(int64_t new_bits) { - if (!null_bitmap_) { return Init(new_bits); } + if (!null_bitmap_) { + return Init(new_bits); + } int64_t new_bytes = BitUtil::CeilByte(new_bits) / 8; int64_t old_bytes = 
null_bitmap_->size(); RETURN_NOT_OK(null_bitmap_->Resize(new_bytes)); @@ -78,8 +80,8 @@ Status ArrayBuilder::Resize(int64_t new_bits) { const int64_t byte_capacity = null_bitmap_->capacity(); capacity_ = new_bits; if (old_bytes < new_bytes) { - memset( - null_bitmap_data_ + old_bytes, 0, static_cast(byte_capacity - old_bytes)); + memset(null_bitmap_data_ + old_bytes, 0, + static_cast(byte_capacity - old_bytes)); } return Status::OK(); } @@ -140,7 +142,9 @@ void ArrayBuilder::UnsafeAppendToBitmap(const uint8_t* valid_bytes, int64_t leng bit_offset++; } - if (bit_offset != 0) { null_bitmap_data_[byte_offset] = bitset; } + if (bit_offset != 0) { + null_bitmap_data_[byte_offset] = bitset; + } length_ += length; } @@ -149,7 +153,9 @@ void ArrayBuilder::UnsafeSetNotNull(int64_t length) { // Fill up the bytes until we have a byte alignment int64_t pad_to_byte = std::min(8 - (length_ % 8), length); - if (pad_to_byte == 8) { pad_to_byte = 0; } + if (pad_to_byte == 8) { + pad_to_byte = 0; + } for (int64_t i = length_; i < length_ + pad_to_byte; ++i) { BitUtil::SetBit(null_bitmap_data_, i); } @@ -157,7 +163,7 @@ void ArrayBuilder::UnsafeSetNotNull(int64_t length) { // Fast bitsetting int64_t fast_length = (length - pad_to_byte) / 8; memset(null_bitmap_data_ + ((length_ + pad_to_byte) / 8), 0xFF, - static_cast(fast_length)); + static_cast(fast_length)); // Trailing bytes for (int64_t i = length_ + pad_to_byte + (fast_length * 8); i < new_length; ++i) { @@ -184,7 +190,9 @@ Status PrimitiveBuilder::Init(int64_t capacity) { template Status PrimitiveBuilder::Resize(int64_t capacity) { // XXX: Set floor size for now - if (capacity < kMinBuilderCapacity) { capacity = kMinBuilderCapacity; } + if (capacity < kMinBuilderCapacity) { + capacity = kMinBuilderCapacity; + } if (capacity_ == 0) { RETURN_NOT_OK(Init(capacity)); @@ -195,20 +203,20 @@ Status PrimitiveBuilder::Resize(int64_t capacity) { RETURN_NOT_OK(data_->Resize(new_bytes)); raw_data_ = reinterpret_cast(data_->mutable_data()); 
// TODO(emkornfield) valgrind complains without this - memset( - data_->mutable_data() + old_bytes, 0, static_cast(new_bytes - old_bytes)); + memset(data_->mutable_data() + old_bytes, 0, + static_cast(new_bytes - old_bytes)); } return Status::OK(); } template -Status PrimitiveBuilder::Append( - const value_type* values, int64_t length, const uint8_t* valid_bytes) { +Status PrimitiveBuilder::Append(const value_type* values, int64_t length, + const uint8_t* valid_bytes) { RETURN_NOT_OK(Reserve(length)); if (length > 0) { std::memcpy(raw_data_ + length_, values, - static_cast(TypeTraits::bytes_required(length))); + static_cast(TypeTraits::bytes_required(length))); } // length_ is update by these @@ -224,8 +232,8 @@ Status PrimitiveBuilder::Finish(std::shared_ptr* out) { // Trim buffers RETURN_NOT_OK(data_->Resize(bytes_required)); } - *out = std::make_shared::ArrayType>( - type_, length_, data_, null_bitmap_, null_count_); + *out = std::make_shared::ArrayType>(type_, length_, data_, + null_bitmap_, null_count_); data_ = null_bitmap_ = nullptr; capacity_ = length_ = null_count_ = 0; @@ -267,7 +275,9 @@ Status AdaptiveIntBuilderBase::Init(int64_t capacity) { Status AdaptiveIntBuilderBase::Resize(int64_t capacity) { // XXX: Set floor size for now - if (capacity < kMinBuilderCapacity) { capacity = kMinBuilderCapacity; } + if (capacity < kMinBuilderCapacity) { + capacity = kMinBuilderCapacity; + } if (capacity_ == 0) { RETURN_NOT_OK(Init(capacity)); @@ -278,8 +288,8 @@ Status AdaptiveIntBuilderBase::Resize(int64_t capacity) { RETURN_NOT_OK(data_->Resize(new_bytes)); raw_data_ = data_->mutable_data(); // TODO(emkornfield) valgrind complains without this - memset( - data_->mutable_data() + old_bytes, 0, static_cast(new_bytes - old_bytes)); + memset(data_->mutable_data() + old_bytes, 0, + static_cast(new_bytes - old_bytes)); } return Status::OK(); } @@ -298,16 +308,16 @@ Status AdaptiveIntBuilder::Finish(std::shared_ptr* out) { std::make_shared(int8(), length_, data_, 
null_bitmap_, null_count_); break; case 2: - *out = std::make_shared( - int16(), length_, data_, null_bitmap_, null_count_); + *out = std::make_shared(int16(), length_, data_, null_bitmap_, + null_count_); break; case 4: - *out = std::make_shared( - int32(), length_, data_, null_bitmap_, null_count_); + *out = std::make_shared(int32(), length_, data_, null_bitmap_, + null_count_); break; case 8: - *out = std::make_shared( - int64(), length_, data_, null_bitmap_, null_count_); + *out = std::make_shared(int64(), length_, data_, null_bitmap_, + null_count_); break; default: DCHECK(false); @@ -319,8 +329,8 @@ Status AdaptiveIntBuilder::Finish(std::shared_ptr* out) { return Status::OK(); } -Status AdaptiveIntBuilder::Append( - const int64_t* values, int64_t length, const uint8_t* valid_bytes) { +Status AdaptiveIntBuilder::Append(const int64_t* values, int64_t length, + const uint8_t* valid_bytes) { RETURN_NOT_OK(Reserve(length)); if (length > 0) { @@ -331,13 +341,15 @@ Status AdaptiveIntBuilder::Append( new_int_size = expanded_int_size(values[i], new_int_size); } } - if (new_int_size != int_size_) { RETURN_NOT_OK(ExpandIntSize(new_int_size)); } + if (new_int_size != int_size_) { + RETURN_NOT_OK(ExpandIntSize(new_int_size)); + } } } if (int_size_ == 8) { std::memcpy(reinterpret_cast(raw_data_) + length_, values, - sizeof(int64_t) * length); + sizeof(int64_t) * length); } else { #ifdef _MSC_VER #pragma warning(push) @@ -348,17 +360,17 @@ Status AdaptiveIntBuilder::Append( case 1: { int8_t* data_ptr = reinterpret_cast(raw_data_) + length_; std::transform(values, values + length, data_ptr, - [](int64_t x) { return static_cast(x); }); + [](int64_t x) { return static_cast(x); }); } break; case 2: { int16_t* data_ptr = reinterpret_cast(raw_data_) + length_; std::transform(values, values + length, data_ptr, - [](int64_t x) { return static_cast(x); }); + [](int64_t x) { return static_cast(x); }); } break; case 4: { int32_t* data_ptr = reinterpret_cast(raw_data_) + length_; 
std::transform(values, values + length, data_ptr, - [](int64_t x) { return static_cast(x); }); + [](int64_t x) { return static_cast(x); }); } break; default: DCHECK(false); @@ -449,20 +461,20 @@ Status AdaptiveUIntBuilder::Finish(std::shared_ptr* out) { } switch (int_size_) { case 1: - *out = std::make_shared( - uint8(), length_, data_, null_bitmap_, null_count_); + *out = std::make_shared(uint8(), length_, data_, null_bitmap_, + null_count_); break; case 2: - *out = std::make_shared( - uint16(), length_, data_, null_bitmap_, null_count_); + *out = std::make_shared(uint16(), length_, data_, null_bitmap_, + null_count_); break; case 4: - *out = std::make_shared( - uint32(), length_, data_, null_bitmap_, null_count_); + *out = std::make_shared(uint32(), length_, data_, null_bitmap_, + null_count_); break; case 8: - *out = std::make_shared( - uint64(), length_, data_, null_bitmap_, null_count_); + *out = std::make_shared(uint64(), length_, data_, null_bitmap_, + null_count_); break; default: DCHECK(false); @@ -474,8 +486,8 @@ Status AdaptiveUIntBuilder::Finish(std::shared_ptr* out) { return Status::OK(); } -Status AdaptiveUIntBuilder::Append( - const uint64_t* values, int64_t length, const uint8_t* valid_bytes) { +Status AdaptiveUIntBuilder::Append(const uint64_t* values, int64_t length, + const uint8_t* valid_bytes) { RETURN_NOT_OK(Reserve(length)); if (length > 0) { @@ -486,13 +498,15 @@ Status AdaptiveUIntBuilder::Append( new_int_size = expanded_uint_size(values[i], new_int_size); } } - if (new_int_size != int_size_) { RETURN_NOT_OK(ExpandIntSize(new_int_size)); } + if (new_int_size != int_size_) { + RETURN_NOT_OK(ExpandIntSize(new_int_size)); + } } } if (int_size_ == 8) { std::memcpy(reinterpret_cast(raw_data_) + length_, values, - sizeof(uint64_t) * length); + sizeof(uint64_t) * length); } else { #ifdef _MSC_VER #pragma warning(push) @@ -503,17 +517,17 @@ Status AdaptiveUIntBuilder::Append( case 1: { uint8_t* data_ptr = reinterpret_cast(raw_data_) + length_; 
std::transform(values, values + length, data_ptr, - [](uint64_t x) { return static_cast(x); }); + [](uint64_t x) { return static_cast(x); }); } break; case 2: { uint16_t* data_ptr = reinterpret_cast(raw_data_) + length_; std::transform(values, values + length, data_ptr, - [](uint64_t x) { return static_cast(x); }); + [](uint64_t x) { return static_cast(x); }); } break; case 4: { uint32_t* data_ptr = reinterpret_cast(raw_data_) + length_; std::transform(values, values + length, data_ptr, - [](uint64_t x) { return static_cast(x); }); + [](uint64_t x) { return static_cast(x); }); } break; default: DCHECK(false); @@ -616,7 +630,9 @@ Status BooleanBuilder::Init(int64_t capacity) { Status BooleanBuilder::Resize(int64_t capacity) { // XXX: Set floor size for now - if (capacity < kMinBuilderCapacity) { capacity = kMinBuilderCapacity; } + if (capacity < kMinBuilderCapacity) { + capacity = kMinBuilderCapacity; + } if (capacity_ == 0) { RETURN_NOT_OK(Init(capacity)); @@ -627,8 +643,8 @@ Status BooleanBuilder::Resize(int64_t capacity) { RETURN_NOT_OK(data_->Resize(new_bytes)); raw_data_ = reinterpret_cast(data_->mutable_data()); - memset( - data_->mutable_data() + old_bytes, 0, static_cast(new_bytes - old_bytes)); + memset(data_->mutable_data() + old_bytes, 0, + static_cast(new_bytes - old_bytes)); } return Status::OK(); } @@ -647,8 +663,8 @@ Status BooleanBuilder::Finish(std::shared_ptr* out) { return Status::OK(); } -Status BooleanBuilder::Append( - const uint8_t* values, int64_t length, const uint8_t* valid_bytes) { +Status BooleanBuilder::Append(const uint8_t* values, int64_t length, + const uint8_t* valid_bytes) { RETURN_NOT_OK(Reserve(length)); for (int64_t i = 0; i < length; ++i) { @@ -673,14 +689,16 @@ Status BooleanBuilder::Append( // DictionaryBuilder template -DictionaryBuilder::DictionaryBuilder( - MemoryPool* pool, const std::shared_ptr& type) +DictionaryBuilder::DictionaryBuilder(MemoryPool* pool, + const std::shared_ptr& type) : ArrayBuilder(pool, type), 
hash_table_(new PoolBuffer(pool)), hash_slots_(nullptr), dict_builder_(pool, type), values_builder_(pool) { - if (!::arrow::CpuInfo::initialized()) { ::arrow::CpuInfo::Init(); } + if (!::arrow::CpuInfo::initialized()) { + ::arrow::CpuInfo::Init(); + } } template @@ -699,7 +717,9 @@ Status DictionaryBuilder::Init(int64_t elements) { template Status DictionaryBuilder::Resize(int64_t capacity) { - if (capacity < kMinBuilderCapacity) { capacity = kMinBuilderCapacity; } + if (capacity < kMinBuilderCapacity) { + capacity = kMinBuilderCapacity; + } if (capacity_ == 0) { return Init(capacity); @@ -732,7 +752,9 @@ Status DictionaryBuilder::Append(const Scalar& value) { while (kHashSlotEmpty != index && SlotDifferent(index, value)) { // Linear probing ++j; - if (j == hash_table_size_) { j = 0; } + if (j == hash_table_size_) { + j = 0; + } index = hash_slots_[j]; } @@ -784,7 +806,9 @@ Status DictionaryBuilder::DoubleTableSize() { for (int i = 0; i < hash_table_size_; ++i) { hash_slot_t index = hash_slots_[i]; - if (index == kHashSlotEmpty) { continue; } + if (index == kHashSlotEmpty) { + continue; + } // Compute the hash value mod the new table size to start looking for an // empty slot @@ -796,7 +820,9 @@ Status DictionaryBuilder::DoubleTableSize() { while (kHashSlotEmpty != slot && SlotDifferent(slot, value)) { ++j; - if (j == new_size) { j = 0; } + if (j == new_size) { + j = 0; + } slot = new_hash_slots[j]; } @@ -870,8 +896,8 @@ Status DictionaryBuilder::AppendDictionary(const Scalar& value) { } \ \ template <> \ - bool DictionaryBuilder::SlotDifferent( \ - hash_slot_t index, const internal::WrappedBinary& value) { \ + bool DictionaryBuilder::SlotDifferent(hash_slot_t index, \ + const internal::WrappedBinary& value) { \ int32_t other_length; \ const uint8_t* other_value = \ dict_builder_.GetValue(static_cast(index), &other_length); \ @@ -951,7 +977,9 @@ Status DecimalBuilder::Init(int64_t capacity) { Status DecimalBuilder::Resize(int64_t capacity) { int64_t old_bytes = 
null_bitmap_ != nullptr ? null_bitmap_->size() : 0; - if (sign_bitmap_ == nullptr) { return Init(capacity); } + if (sign_bitmap_ == nullptr) { + return Init(capacity); + } RETURN_NOT_OK(FixedSizeBinaryBuilder::Resize(capacity)); if (byte_width_ == 16) { @@ -962,7 +990,7 @@ Status DecimalBuilder::Resize(int64_t capacity) { // The buffer might be overpadded to deal with padding according to the spec if (old_bytes < new_bytes) { memset(sign_bitmap_data_ + old_bytes, 0, - static_cast(sign_bitmap_->capacity() - old_bytes)); + static_cast(sign_bitmap_->capacity() - old_bytes)); } } return Status::OK(); @@ -973,8 +1001,8 @@ Status DecimalBuilder::Finish(std::shared_ptr* out) { RETURN_NOT_OK(byte_builder_.Finish(&data)); /// TODO(phillipc): not sure where to get the offset argument here - *out = std::make_shared( - type_, length_, data, null_bitmap_, null_count_, 0, sign_bitmap_); + *out = std::make_shared(type_, length_, data, null_bitmap_, null_count_, + 0, sign_bitmap_); return Status::OK(); } @@ -982,15 +1010,15 @@ Status DecimalBuilder::Finish(std::shared_ptr* out) { // ListBuilder ListBuilder::ListBuilder(MemoryPool* pool, std::unique_ptr value_builder, - const std::shared_ptr& type) - : ArrayBuilder( - pool, type ? type : std::static_pointer_cast( - std::make_shared(value_builder->type()))), + const std::shared_ptr& type) + : ArrayBuilder(pool, + type ? 
type : std::static_pointer_cast( + std::make_shared(value_builder->type()))), offsets_builder_(pool), value_builder_(std::move(value_builder)) {} -Status ListBuilder::Append( - const int32_t* offsets, int64_t length, const uint8_t* valid_bytes) { +Status ListBuilder::Append(const int32_t* offsets, int64_t length, + const uint8_t* valid_bytes) { RETURN_NOT_OK(Reserve(length)); UnsafeAppendToBitmap(valid_bytes, length); offsets_builder_.UnsafeAppend(offsets, length); @@ -1035,10 +1063,12 @@ Status ListBuilder::Finish(std::shared_ptr* out) { RETURN_NOT_OK(offsets_builder_.Finish(&offsets)); std::shared_ptr items = values_; - if (!items) { RETURN_NOT_OK(value_builder_->Finish(&items)); } + if (!items) { + RETURN_NOT_OK(value_builder_->Finish(&items)); + } - *out = std::make_shared( - type_, length_, offsets, items, null_bitmap_, null_count_); + *out = std::make_shared(type_, length_, offsets, items, null_bitmap_, + null_count_); Reset(); return Status::OK(); @@ -1111,8 +1141,8 @@ Status BinaryBuilder::FinishInternal(std::shared_ptr* out) RETURN_NOT_OK(value_data_builder_.Finish(&value_data)); BufferVector buffers = {null_bitmap_, offsets, value_data}; - *out = std::make_shared( - type_, length_, std::move(buffers), null_count_, 0); + *out = std::make_shared(type_, length_, std::move(buffers), + null_count_, 0); return Status::OK(); } @@ -1154,8 +1184,8 @@ Status StringBuilder::Finish(std::shared_ptr* out) { // ---------------------------------------------------------------------- // Fixed width binary -FixedSizeBinaryBuilder::FixedSizeBinaryBuilder( - MemoryPool* pool, const std::shared_ptr& type) +FixedSizeBinaryBuilder::FixedSizeBinaryBuilder(MemoryPool* pool, + const std::shared_ptr& type) : ArrayBuilder(pool, type), byte_width_(static_cast(*type).byte_width()), byte_builder_(pool) {} @@ -1166,8 +1196,8 @@ Status FixedSizeBinaryBuilder::Append(const uint8_t* value) { return byte_builder_.Append(value, byte_width_); } -Status FixedSizeBinaryBuilder::Append( - const 
uint8_t* data, int64_t length, const uint8_t* valid_bytes) { +Status FixedSizeBinaryBuilder::Append(const uint8_t* data, int64_t length, + const uint8_t* valid_bytes) { RETURN_NOT_OK(Reserve(length)); UnsafeAppendToBitmap(valid_bytes, length); return byte_builder_.Append(data, length * byte_width_); @@ -1196,8 +1226,8 @@ Status FixedSizeBinaryBuilder::Resize(int64_t capacity) { Status FixedSizeBinaryBuilder::Finish(std::shared_ptr* out) { std::shared_ptr data; RETURN_NOT_OK(byte_builder_.Finish(&data)); - *out = std::make_shared( - type_, length_, data, null_bitmap_, null_count_); + *out = std::make_shared(type_, length_, data, null_bitmap_, + null_count_); return Status::OK(); } @@ -1205,7 +1235,7 @@ Status FixedSizeBinaryBuilder::Finish(std::shared_ptr* out) { // Struct StructBuilder::StructBuilder(MemoryPool* pool, const std::shared_ptr& type, - std::vector>&& field_builders) + std::vector>&& field_builders) : ArrayBuilder(pool, type) { field_builders_ = std::move(field_builders); } @@ -1237,7 +1267,7 @@ Status StructBuilder::Finish(std::shared_ptr* out) { // // TODO(wesm): come up with a less monolithic strategy Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, - std::unique_ptr* out) { + std::unique_ptr* out) { switch (type->id()) { BUILDER_CASE(UINT8, UInt8Builder); BUILDER_CASE(INT8, Int8Builder); @@ -1292,7 +1322,7 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, return Status::OK(); Status MakeDictionaryBuilder(MemoryPool* pool, const std::shared_ptr& type, - std::shared_ptr* out) { + std::shared_ptr* out) { switch (type->id()) { DICTIONARY_BUILDER_CASE(UINT8, DictionaryBuilder); DICTIONARY_BUILDER_CASE(INT8, DictionaryBuilder); diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index 065e115ac5872..080a32900555c 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -186,8 +186,8 @@ class ARROW_EXPORT PrimitiveBuilder : public ArrayBuilder { /// /// If passed, valid_bytes is of equal length to 
values, and any zero byte /// will be considered as a null for that slot - Status Append( - const value_type* values, int64_t length, const uint8_t* valid_bytes = nullptr); + Status Append(const value_type* values, int64_t length, + const uint8_t* valid_bytes = nullptr); Status Finish(std::shared_ptr* out) override; Status Init(int64_t capacity) override; @@ -298,15 +298,15 @@ class ARROW_EXPORT AdaptiveIntBuilderBase : public ArrayBuilder { inline uint8_t expanded_uint_size(uint64_t val, uint8_t current_int_size) { if (current_int_size == 8 || (current_int_size < 8 && - (val > static_cast(std::numeric_limits::max())))) { + (val > static_cast(std::numeric_limits::max())))) { return 8; } else if (current_int_size == 4 || (current_int_size < 4 && - (val > static_cast(std::numeric_limits::max())))) { + (val > static_cast(std::numeric_limits::max())))) { return 4; } else if (current_int_size == 2 || (current_int_size == 1 && - (val > static_cast(std::numeric_limits::max())))) { + (val > static_cast(std::numeric_limits::max())))) { return 2; } else { return 1; @@ -325,7 +325,9 @@ class ARROW_EXPORT AdaptiveUIntBuilder : public AdaptiveIntBuilderBase { BitUtil::SetBit(null_bitmap_data_, length_); uint8_t new_int_size = expanded_uint_size(val, int_size_); - if (new_int_size != int_size_) { RETURN_NOT_OK(ExpandIntSize(new_int_size)); } + if (new_int_size != int_size_) { + RETURN_NOT_OK(ExpandIntSize(new_int_size)); + } switch (int_size_) { case 1: @@ -350,8 +352,8 @@ class ARROW_EXPORT AdaptiveUIntBuilder : public AdaptiveIntBuilderBase { /// /// If passed, valid_bytes is of equal length to values, and any zero byte /// will be considered as a null for that slot - Status Append( - const uint64_t* values, int64_t length, const uint8_t* valid_bytes = nullptr); + Status Append(const uint64_t* values, int64_t length, + const uint8_t* valid_bytes = nullptr); Status ExpandIntSize(uint8_t new_int_size); Status Finish(std::shared_ptr* out) override; @@ -374,18 +376,18 @@ class 
ARROW_EXPORT AdaptiveUIntBuilder : public AdaptiveIntBuilderBase { inline uint8_t expanded_int_size(int64_t val, uint8_t current_int_size) { if (current_int_size == 8 || (current_int_size < 8 && - (val > static_cast(std::numeric_limits::max()) || - val < static_cast(std::numeric_limits::min())))) { + (val > static_cast(std::numeric_limits::max()) || + val < static_cast(std::numeric_limits::min())))) { return 8; } else if (current_int_size == 4 || (current_int_size < 4 && - (val > static_cast(std::numeric_limits::max()) || - val < static_cast(std::numeric_limits::min())))) { + (val > static_cast(std::numeric_limits::max()) || + val < static_cast(std::numeric_limits::min())))) { return 4; } else if (current_int_size == 2 || (current_int_size == 1 && - (val > static_cast(std::numeric_limits::max()) || - val < static_cast(std::numeric_limits::min())))) { + (val > static_cast(std::numeric_limits::max()) || + val < static_cast(std::numeric_limits::min())))) { return 2; } else { return 1; @@ -404,7 +406,9 @@ class ARROW_EXPORT AdaptiveIntBuilder : public AdaptiveIntBuilderBase { BitUtil::SetBit(null_bitmap_data_, length_); uint8_t new_int_size = expanded_int_size(val, int_size_); - if (new_int_size != int_size_) { RETURN_NOT_OK(ExpandIntSize(new_int_size)); } + if (new_int_size != int_size_) { + RETURN_NOT_OK(ExpandIntSize(new_int_size)); + } switch (int_size_) { case 1: @@ -429,8 +433,8 @@ class ARROW_EXPORT AdaptiveIntBuilder : public AdaptiveIntBuilderBase { /// /// If passed, valid_bytes is of equal length to values, and any zero byte /// will be considered as a null for that slot - Status Append( - const int64_t* values, int64_t length, const uint8_t* valid_bytes = nullptr); + Status Append(const int64_t* values, int64_t length, + const uint8_t* valid_bytes = nullptr); Status ExpandIntSize(uint8_t new_int_size); Status Finish(std::shared_ptr* out) override; @@ -490,8 +494,8 @@ class ARROW_EXPORT BooleanBuilder : public ArrayBuilder { /// /// If passed, valid_bytes is 
of equal length to values, and any zero byte /// will be considered as a null for that slot - Status Append( - const uint8_t* values, int64_t length, const uint8_t* valid_bytes = nullptr); + Status Append(const uint8_t* values, int64_t length, + const uint8_t* valid_bytes = nullptr); Status Finish(std::shared_ptr* out) override; Status Init(int64_t capacity) override; @@ -526,7 +530,7 @@ class ARROW_EXPORT ListBuilder : public ArrayBuilder { /// Use this constructor to incrementally build the value array along with offsets and /// null bitmap. ListBuilder(MemoryPool* pool, std::unique_ptr value_builder, - const std::shared_ptr& type = nullptr); + const std::shared_ptr& type = nullptr); Status Init(int64_t elements) override; Status Resize(int64_t capacity) override; @@ -536,8 +540,8 @@ class ARROW_EXPORT ListBuilder : public ArrayBuilder { /// /// If passed, valid_bytes is of equal length to values, and any zero byte /// will be considered as a null for that slot - Status Append( - const int32_t* offsets, int64_t length, const uint8_t* valid_bytes = nullptr); + Status Append(const int32_t* offsets, int64_t length, + const uint8_t* valid_bytes = nullptr); /// \brief Start a new variable-length list slot /// @@ -626,8 +630,8 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { FixedSizeBinaryBuilder(MemoryPool* pool, const std::shared_ptr& type); Status Append(const uint8_t* value); - Status Append( - const uint8_t* data, int64_t length, const uint8_t* valid_bytes = nullptr); + Status Append(const uint8_t* data, int64_t length, + const uint8_t* valid_bytes = nullptr); Status Append(const std::string& value); Status AppendNull(); @@ -672,7 +676,7 @@ class ARROW_EXPORT DecimalBuilder : public FixedSizeBinaryBuilder { class ARROW_EXPORT StructBuilder : public ArrayBuilder { public: StructBuilder(MemoryPool* pool, const std::shared_ptr& type, - std::vector>&& field_builders); + std::vector>&& field_builders); Status Finish(std::shared_ptr* out) override; 
@@ -808,7 +812,7 @@ class ARROW_EXPORT BinaryDictionaryBuilder : public DictionaryBuilder(value.c_str()), - static_cast(value.size()))); + static_cast(value.size()))); } }; @@ -829,7 +833,7 @@ class ARROW_EXPORT StringDictionaryBuilder : public DictionaryBuilder(value.c_str()), - static_cast(value.size()))); + static_cast(value.size()))); } }; @@ -837,10 +841,11 @@ class ARROW_EXPORT StringDictionaryBuilder : public DictionaryBuilder& type, - std::unique_ptr* out); + std::unique_ptr* out); Status ARROW_EXPORT MakeDictionaryBuilder(MemoryPool* pool, - const std::shared_ptr& type, std::shared_ptr* out); + const std::shared_ptr& type, + std::shared_ptr* out); } // namespace arrow diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index 1465e0b414fe3..da10c2ad90177 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -41,7 +41,7 @@ namespace arrow { class RangeEqualsVisitor { public: RangeEqualsVisitor(const Array& right, int64_t left_start_idx, int64_t left_end_idx, - int64_t right_start_idx) + int64_t right_start_idx) : right_(right), left_start_idx_(left_start_idx), left_end_idx_(left_end_idx), @@ -71,7 +71,9 @@ class RangeEqualsVisitor { for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_; ++i, ++o_i) { const bool is_null = left.IsNull(i); - if (is_null != right.IsNull(o_i)) { return false; } + if (is_null != right.IsNull(o_i)) { + return false; + } if (is_null) continue; const int32_t begin_offset = left.value_offset(i); const int32_t end_offset = left.value_offset(i + 1); @@ -84,8 +86,8 @@ class RangeEqualsVisitor { if (end_offset - begin_offset > 0 && std::memcmp(left.value_data()->data() + begin_offset, - right.value_data()->data() + right_begin_offset, - static_cast(end_offset - begin_offset))) { + right.value_data()->data() + right_begin_offset, + static_cast(end_offset - begin_offset))) { return false; } } @@ -101,7 +103,9 @@ class RangeEqualsVisitor { for (int64_t i = left_start_idx_, o_i = 
right_start_idx_; i < left_end_idx_; ++i, ++o_i) { const bool is_null = left.IsNull(i); - if (is_null != right.IsNull(o_i)) { return false; } + if (is_null != right.IsNull(o_i)) { + return false; + } if (is_null) continue; const int32_t begin_offset = left.value_offset(i); const int32_t end_offset = left.value_offset(i + 1); @@ -111,8 +115,8 @@ class RangeEqualsVisitor { if (end_offset - begin_offset != right_end_offset - right_begin_offset) { return false; } - if (!left_values->RangeEquals( - begin_offset, end_offset, right_begin_offset, right_values)) { + if (!left_values->RangeEquals(begin_offset, end_offset, right_begin_offset, + right_values)) { return false; } } @@ -124,7 +128,9 @@ class RangeEqualsVisitor { bool equal_fields = true; for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_; ++i, ++o_i) { - if (left.IsNull(i) != right.IsNull(o_i)) { return false; } + if (left.IsNull(i) != right.IsNull(o_i)) { + return false; + } if (left.IsNull(i)) continue; for (int j = 0; j < left.num_fields(); ++j) { // TODO: really we should be comparing stretches of non-null data rather @@ -132,9 +138,11 @@ class RangeEqualsVisitor { const int64_t left_abs_index = i + left.offset(); const int64_t right_abs_index = o_i + right.offset(); - equal_fields = left.field(j)->RangeEquals( - left_abs_index, left_abs_index + 1, right_abs_index, right.field(j)); - if (!equal_fields) { return false; } + equal_fields = left.field(j)->RangeEquals(left_abs_index, left_abs_index + 1, + right_abs_index, right.field(j)); + if (!equal_fields) { + return false; + } } } return true; @@ -144,7 +152,9 @@ class RangeEqualsVisitor { const auto& right = static_cast(right_); const UnionMode union_mode = left.mode(); - if (union_mode != right.mode()) { return false; } + if (union_mode != right.mode()) { + return false; + } const auto& left_type = static_cast(*left.type()); @@ -154,7 +164,9 @@ class RangeEqualsVisitor { const std::vector& type_codes = left_type.type_codes(); for 
(size_t i = 0; i < type_codes.size(); ++i) { const uint8_t code = type_codes[i]; - if (code > max_code) { max_code = code; } + if (code > max_code) { + max_code = code; + } } // Store mapping in a vector for constant time lookups @@ -169,9 +181,13 @@ class RangeEqualsVisitor { uint8_t id, child_num; for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_; ++i, ++o_i) { - if (left.IsNull(i) != right.IsNull(o_i)) { return false; } + if (left.IsNull(i) != right.IsNull(o_i)) { + return false; + } if (left.IsNull(i)) continue; - if (left_ids[i] != right_ids[o_i]) { return false; } + if (left_ids[i] != right_ids[o_i]) { + return false; + } id = left_ids[i]; child_num = type_id_to_child_num[id]; @@ -183,14 +199,15 @@ class RangeEqualsVisitor { // rather than looking at one value at a time. if (union_mode == UnionMode::SPARSE) { if (!left.child(child_num)->RangeEquals(left_abs_index, left_abs_index + 1, - right_abs_index, right.child(child_num))) { + right_abs_index, + right.child(child_num))) { return false; } } else { const int32_t offset = left.raw_value_offsets()[i]; const int32_t o_offset = right.raw_value_offsets()[o_i]; - if (!left.child(child_num)->RangeEquals( - offset, offset + 1, o_offset, right.child(child_num))) { + if (!left.child(child_num)->RangeEquals(offset, offset + 1, o_offset, + right.child(child_num))) { return false; } } @@ -211,9 +228,13 @@ class RangeEqualsVisitor { const uint8_t* left_data = nullptr; const uint8_t* right_data = nullptr; - if (left.values()) { left_data = left.raw_values() + left.offset() * width; } + if (left.values()) { + left_data = left.raw_values() + left.offset() * width; + } - if (right.values()) { right_data = right.raw_values() + right.offset() * width; } + if (right.values()) { + right_data = right.raw_values() + right.offset() * width; + } for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_; ++i, ++o_i) { @@ -241,9 +262,13 @@ class RangeEqualsVisitor { const uint8_t* left_data 
= nullptr; const uint8_t* right_data = nullptr; - if (left.values()) { left_data = left.raw_values() + left.offset() * width; } + if (left.values()) { + left_data = left.raw_values() + left.offset() * width; + } - if (right.values()) { right_data = right.raw_values() + right.offset() * width; } + if (right.values()) { + right_data = right.raw_values() + right.offset() * width; + } for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_; ++i, ++o_i) { @@ -301,8 +326,8 @@ class RangeEqualsVisitor { result_ = false; return Status::OK(); } - result_ = left.indices()->RangeEquals( - left_start_idx_, left_end_idx_, right_start_idx_, right.indices()); + result_ = left.indices()->RangeEquals(left_start_idx_, left_end_idx_, + right_start_idx_, right.indices()); return Status::OK(); } @@ -324,7 +349,9 @@ static bool IsEqualPrimitive(const PrimitiveArray& left, const PrimitiveArray& r const uint8_t* left_data = nullptr; const uint8_t* right_data = nullptr; - if (left.values()) { left_data = left.values()->data() + left.offset() * byte_width; } + if (left.values()) { + left_data = left.values()->data() + left.offset() * byte_width; + } if (right.values()) { right_data = right.values()->data() + right.offset() * byte_width; } @@ -341,13 +368,13 @@ static bool IsEqualPrimitive(const PrimitiveArray& left, const PrimitiveArray& r return true; } else { return memcmp(left_data, right_data, - static_cast(byte_width * left.length())) == 0; + static_cast(byte_width * left.length())) == 0; } } template -static inline bool CompareBuiltIn( - const Array& left, const Array& right, const T* ldata, const T* rdata) { +static inline bool CompareBuiltIn(const Array& left, const Array& right, const T* ldata, + const T* rdata) { if (left.null_count() > 0) { for (int64_t i = 0; i < left.length(); ++i) { if (left.IsNull(i) != right.IsNull(i)) { @@ -369,17 +396,21 @@ static bool IsEqualDecimal(const DecimalArray& left, const DecimalArray& right) const uint8_t* left_data = 
nullptr; const uint8_t* right_data = nullptr; - if (left.values()) { left_data = left.values()->data(); } - if (right.values()) { right_data = right.values()->data(); } + if (left.values()) { + left_data = left.values()->data(); + } + if (right.values()) { + right_data = right.values()->data(); + } const int32_t byte_width = left.byte_width(); if (byte_width == 4) { - return CompareBuiltIn(left, right, - reinterpret_cast(left_data) + loffset, + return CompareBuiltIn( + left, right, reinterpret_cast(left_data) + loffset, reinterpret_cast(right_data) + roffset); } else if (byte_width == 8) { - return CompareBuiltIn(left, right, - reinterpret_cast(left_data) + loffset, + return CompareBuiltIn( + left, right, reinterpret_cast(left_data) + loffset, reinterpret_cast(right_data) + roffset); } else { // 128-bit @@ -387,8 +418,12 @@ static bool IsEqualDecimal(const DecimalArray& left, const DecimalArray& right) // Must also compare sign bitmap const uint8_t* left_sign = nullptr; const uint8_t* right_sign = nullptr; - if (left.sign_bitmap()) { left_sign = left.sign_bitmap()->data(); } - if (right.sign_bitmap()) { right_sign = right.sign_bitmap()->data(); } + if (left.sign_bitmap()) { + left_sign = left.sign_bitmap()->data(); + } + if (right.sign_bitmap()) { + right_sign = right.sign_bitmap()->data(); + } for (int64_t i = 0; i < left.length(); ++i) { bool left_null = left.IsNull(i); @@ -434,7 +469,7 @@ class ArrayEqualsVisitor : public RangeEqualsVisitor { result_ = true; } else { result_ = BitmapEquals(left.values()->data(), left.offset(), right.values()->data(), - right.offset(), left.length()); + right.offset(), left.length()); } return Status::OK(); } @@ -442,7 +477,7 @@ class ArrayEqualsVisitor : public RangeEqualsVisitor { template typename std::enable_if::value && !std::is_base_of::value, - Status>::type + Status>::type Visit(const T& left) { result_ = IsEqualPrimitive(left, static_cast(right_)); return Status::OK(); @@ -458,8 +493,8 @@ class ArrayEqualsVisitor : 
public RangeEqualsVisitor { const auto& right = static_cast(right_); if (left.offset() == 0 && right.offset() == 0) { - return left.value_offsets()->Equals( - *right.value_offsets(), (left.length() + 1) * sizeof(int32_t)); + return left.value_offsets()->Equals(*right.value_offsets(), + (left.length() + 1) * sizeof(int32_t)); } else { // One of the arrays is sliced; logic is more complicated because the // value offsets are not both 0-based @@ -482,10 +517,16 @@ class ArrayEqualsVisitor : public RangeEqualsVisitor { const auto& right = static_cast(right_); bool equal_offsets = ValueOffsetsEqual(left); - if (!equal_offsets) { return false; } + if (!equal_offsets) { + return false; + } - if (!left.value_data() && !(right.value_data())) { return true; } - if (left.value_offset(left.length()) == 0) { return true; } + if (!left.value_data() && !(right.value_data())) { + return true; + } + if (left.value_offset(left.length()) == 0) { + return true; + } const uint8_t* left_data = left.value_data()->data(); const uint8_t* right_data = right.value_data()->data(); @@ -493,23 +534,25 @@ class ArrayEqualsVisitor : public RangeEqualsVisitor { if (left.null_count() == 0) { // Fast path for null count 0, single memcmp if (left.offset() == 0 && right.offset() == 0) { - return std::memcmp( - left_data, right_data, left.raw_value_offsets()[left.length()]) == 0; + return std::memcmp(left_data, right_data, + left.raw_value_offsets()[left.length()]) == 0; } else { const int64_t total_bytes = left.value_offset(left.length()) - left.value_offset(0); return std::memcmp(left_data + left.value_offset(0), - right_data + right.value_offset(0), - static_cast(total_bytes)) == 0; + right_data + right.value_offset(0), + static_cast(total_bytes)) == 0; } } else { // ARROW-537: Only compare data in non-null slots const int32_t* left_offsets = left.raw_value_offsets(); const int32_t* right_offsets = right.raw_value_offsets(); for (int64_t i = 0; i < left.length(); ++i) { - if (left.IsNull(i)) { 
continue; } + if (left.IsNull(i)) { + continue; + } if (std::memcmp(left_data + left_offsets[i], right_data + right_offsets[i], - left.value_length(i))) { + left.value_length(i))) { return false; } } @@ -530,8 +573,9 @@ class ArrayEqualsVisitor : public RangeEqualsVisitor { return Status::OK(); } - result_ = left.values()->RangeEquals(left.value_offset(0), - left.value_offset(left.length()), right.value_offset(0), right.values()); + result_ = + left.values()->RangeEquals(left.value_offset(0), left.value_offset(left.length()), + right.value_offset(0), right.values()); return Status::OK(); } @@ -547,15 +591,15 @@ class ArrayEqualsVisitor : public RangeEqualsVisitor { template typename std::enable_if::value, - Status>::type + Status>::type Visit(const T& left) { return RangeEqualsVisitor::Visit(left); } }; template -inline bool FloatingApproxEquals( - const NumericArray& left, const NumericArray& right) { +inline bool FloatingApproxEquals(const NumericArray& left, + const NumericArray& right) { using T = typename TYPE::c_type; const T* left_data = left.raw_values(); @@ -566,11 +610,15 @@ inline bool FloatingApproxEquals( if (left.null_count() > 0) { for (int64_t i = 0; i < left.length(); ++i) { if (left.IsNull(i)) continue; - if (fabs(left_data[i] - right_data[i]) > EPSILON) { return false; } + if (fabs(left_data[i] - right_data[i]) > EPSILON) { + return false; + } } } else { for (int64_t i = 0; i < left.length(); ++i) { - if (fabs(left_data[i] - right_data[i]) > EPSILON) { return false; } + if (fabs(left_data[i] - right_data[i]) > EPSILON) { + return false; + } } } return true; @@ -601,7 +649,7 @@ static bool BaseDataEquals(const Array& left, const Array& right) { } if (left.null_count() > 0 && left.null_count() < left.length()) { return BitmapEquals(left.null_bitmap()->data(), left.offset(), - right.null_bitmap()->data(), right.offset(), left.length()); + right.null_bitmap()->data(), right.offset(), left.length()); } return true; } @@ -634,7 +682,7 @@ Status 
ArrayApproxEquals(const Array& left, const Array& right, bool* are_equal) } Status ArrayRangeEquals(const Array& left, const Array& right, int64_t left_start_idx, - int64_t left_end_idx, int64_t right_start_idx, bool* are_equal) { + int64_t left_end_idx, int64_t right_start_idx, bool* are_equal) { if (&left == &right) { *are_equal = true; } else if (left.type_id() != right.type_id()) { @@ -705,7 +753,7 @@ class TypeEqualsVisitor { template typename std::enable_if::value || std::is_base_of::value, - Status>::type + Status>::type Visit(const T& type) { result_ = true; return Status::OK(); @@ -714,7 +762,7 @@ class TypeEqualsVisitor { template typename std::enable_if::value || std::is_base_of::value, - Status>::type + Status>::type Visit(const T& left) { const auto& right = static_cast(right_); result_ = left.unit() == right.unit(); diff --git a/cpp/src/arrow/compare.h b/cpp/src/arrow/compare.h index 96a6435c5df33..a36b55320b5a2 100644 --- a/cpp/src/arrow/compare.h +++ b/cpp/src/arrow/compare.h @@ -34,21 +34,22 @@ class Tensor; /// Returns true if the arrays are exactly equal Status ARROW_EXPORT ArrayEquals(const Array& left, const Array& right, bool* are_equal); -Status ARROW_EXPORT TensorEquals( - const Tensor& left, const Tensor& right, bool* are_equal); +Status ARROW_EXPORT TensorEquals(const Tensor& left, const Tensor& right, + bool* are_equal); /// Returns true if the arrays are approximately equal. 
For non-floating point /// types, this is equivalent to ArrayEquals(left, right) -Status ARROW_EXPORT ArrayApproxEquals( - const Array& left, const Array& right, bool* are_equal); +Status ARROW_EXPORT ArrayApproxEquals(const Array& left, const Array& right, + bool* are_equal); /// Returns true if indicated equal-length segment of arrays is exactly equal Status ARROW_EXPORT ArrayRangeEquals(const Array& left, const Array& right, - int64_t start_idx, int64_t end_idx, int64_t other_start_idx, bool* are_equal); + int64_t start_idx, int64_t end_idx, + int64_t other_start_idx, bool* are_equal); /// Returns true if the type metadata are exactly equal -Status ARROW_EXPORT TypeEquals( - const DataType& left, const DataType& right, bool* are_equal); +Status ARROW_EXPORT TypeEquals(const DataType& left, const DataType& right, + bool* are_equal); } // namespace arrow diff --git a/cpp/src/arrow/io/file.cc b/cpp/src/arrow/io/file.cc index 936655f26dbda..82e3ba8109c23 100644 --- a/cpp/src/arrow/io/file.cc +++ b/cpp/src/arrow/io/file.cc @@ -123,8 +123,8 @@ constexpr const char* kRangeExceptionError = "Range exception during wide-char string conversion"; #endif -static inline Status CheckOpenResult( - int ret, int errno_actual, const char* filename, size_t filename_length) { +static inline Status CheckOpenResult(int ret, int errno_actual, const char* filename, + size_t filename_length) { if (ret == -1) { // TODO: errno codes to strings std::stringstream ss; @@ -134,12 +134,14 @@ static inline Status CheckOpenResult( // this requires c++11 std::wstring_convert, wchar_t> converter; - std::wstring wide_string( - reinterpret_cast(filename), filename_length / sizeof(wchar_t)); + std::wstring wide_string(reinterpret_cast(filename), + filename_length / sizeof(wchar_t)); try { std::string byte_string = converter.to_bytes(wide_string); ss << byte_string; - } catch (const std::range_error&) { ss << kRangeExceptionError; } + } catch (const std::range_error&) { + ss << kRangeExceptionError; + 
} #else ss << filename; #endif @@ -161,7 +163,9 @@ static inline int64_t lseek64_compat(int fd, int64_t pos, int whence) { #if defined(_MSC_VER) static inline Status ConvertToUtf16(const std::string& input, std::wstring* result) { - if (result == nullptr) { return Status::Invalid("Pointer to result is not valid"); } + if (result == nullptr) { + return Status::Invalid("Pointer to result is not valid"); + } if (input.empty()) { *result = std::wstring(); @@ -171,7 +175,9 @@ static inline Status ConvertToUtf16(const std::string& input, std::wstring* resu std::wstring_convert> utf16_converter; try { *result = utf16_converter.from_bytes(input); - } catch (const std::range_error&) { return Status::Invalid(kRangeExceptionError); } + } catch (const std::range_error&) { + return Status::Invalid(kRangeExceptionError); + } return Status::OK(); } #endif @@ -194,8 +200,8 @@ static inline Status FileOpenReadable(const std::string& filename, int* fd) { return CheckOpenResult(ret, errno_actual, filename.c_str(), filename.size()); } -static inline Status FileOpenWriteable( - const std::string& filename, bool write_only, bool truncate, int* fd) { +static inline Status FileOpenWriteable(const std::string& filename, bool write_only, + bool truncate, int* fd) { int ret; errno_t errno_actual = 0; @@ -205,9 +211,13 @@ static inline Status FileOpenWriteable( int oflag = _O_CREAT | _O_BINARY; int pmode = _S_IWRITE; - if (!write_only) { pmode |= _S_IREAD; } + if (!write_only) { + pmode |= _S_IREAD; + } - if (truncate) { oflag |= _O_TRUNC; } + if (truncate) { + oflag |= _O_TRUNC; + } if (write_only) { oflag |= _O_WRONLY; @@ -221,7 +231,9 @@ static inline Status FileOpenWriteable( #else int oflag = O_CREAT | O_BINARY; - if (truncate) { oflag |= O_TRUNC; } + if (truncate) { + oflag |= O_TRUNC; + } if (write_only) { oflag |= O_WRONLY; @@ -239,7 +251,9 @@ static inline Status FileTell(int fd, int64_t* pos) { #if defined(_MSC_VER) current_pos = _telli64(fd); - if (current_pos == -1) { return 
Status::IOError("_telli64 failed"); } + if (current_pos == -1) { + return Status::IOError("_telli64 failed"); + } #else current_pos = lseek64_compat(fd, 0, SEEK_CUR); CHECK_LSEEK(current_pos); @@ -255,10 +269,12 @@ static inline Status FileSeek(int fd, int64_t pos) { return Status::OK(); } -static inline Status FileRead( - int fd, uint8_t* buffer, int64_t nbytes, int64_t* bytes_read) { +static inline Status FileRead(int fd, uint8_t* buffer, int64_t nbytes, + int64_t* bytes_read) { #if defined(_MSC_VER) - if (nbytes > INT32_MAX) { return Status::IOError("Unable to read > 2GB blocks yet"); } + if (nbytes > INT32_MAX) { + return Status::IOError("Unable to read > 2GB blocks yet"); + } *bytes_read = static_cast(_read(fd, buffer, static_cast(nbytes))); #else *bytes_read = static_cast(read(fd, buffer, static_cast(nbytes))); @@ -323,7 +339,9 @@ static inline Status FileClose(int fd) { ret = static_cast(close(fd)); #endif - if (ret == -1) { return Status::IOError("error closing file"); } + if (ret == -1) { + return Status::IOError("error closing file"); + } return Status::OK(); } @@ -371,7 +389,9 @@ class OSFile { } Status Seek(int64_t pos) { - if (pos < 0) { return Status::Invalid("Invalid position"); } + if (pos < 0) { + return Status::Invalid("Invalid position"); + } return FileSeek(fd_, pos); } @@ -379,7 +399,9 @@ class OSFile { Status Write(const uint8_t* data, int64_t length) { std::lock_guard guard(lock_); - if (length < 0) { return Status::IOError("Length must be non-negative"); } + if (length < 0) { + return Status::IOError("Length must be non-negative"); + } return FileWrite(fd_, data, length); } @@ -421,7 +443,9 @@ class ReadableFile::ReadableFileImpl : public OSFile { int64_t bytes_read = 0; RETURN_NOT_OK(Read(nbytes, &bytes_read, buffer->mutable_data())); - if (bytes_read < nbytes) { RETURN_NOT_OK(buffer->Resize(bytes_read)); } + if (bytes_read < nbytes) { + RETURN_NOT_OK(buffer->Resize(bytes_read)); + } *out = buffer; return Status::OK(); } @@ -430,13 +454,9 
@@ class ReadableFile::ReadableFileImpl : public OSFile { MemoryPool* pool_; }; -ReadableFile::ReadableFile(MemoryPool* pool) { - impl_.reset(new ReadableFileImpl(pool)); -} +ReadableFile::ReadableFile(MemoryPool* pool) { impl_.reset(new ReadableFileImpl(pool)); } -ReadableFile::~ReadableFile() { - DCHECK(impl_->Close().ok()); -} +ReadableFile::~ReadableFile() { DCHECK(impl_->Close().ok()); } Status ReadableFile::Open(const std::string& path, std::shared_ptr* file) { *file = std::shared_ptr(new ReadableFile(default_memory_pool())); @@ -444,18 +464,14 @@ Status ReadableFile::Open(const std::string& path, std::shared_ptr } Status ReadableFile::Open(const std::string& path, MemoryPool* memory_pool, - std::shared_ptr* file) { + std::shared_ptr* file) { *file = std::shared_ptr(new ReadableFile(memory_pool)); return (*file)->impl_->Open(path); } -Status ReadableFile::Close() { - return impl_->Close(); -} +Status ReadableFile::Close() { return impl_->Close(); } -Status ReadableFile::Tell(int64_t* pos) { - return impl_->Tell(pos); -} +Status ReadableFile::Tell(int64_t* pos) { return impl_->Tell(pos); } Status ReadableFile::Read(int64_t nbytes, int64_t* bytes_read, uint8_t* out) { return impl_->Read(nbytes, bytes_read, out); @@ -470,17 +486,11 @@ Status ReadableFile::GetSize(int64_t* size) { return Status::OK(); } -Status ReadableFile::Seek(int64_t pos) { - return impl_->Seek(pos); -} +Status ReadableFile::Seek(int64_t pos) { return impl_->Seek(pos); } -bool ReadableFile::supports_zero_copy() const { - return false; -} +bool ReadableFile::supports_zero_copy() const { return false; } -int ReadableFile::file_descriptor() const { - return impl_->fd(); -} +int ReadableFile::file_descriptor() const { return impl_->fd(); } // ---------------------------------------------------------------------- // FileOutputStream @@ -492,42 +502,34 @@ class FileOutputStream::FileOutputStreamImpl : public OSFile { } }; -FileOutputStream::FileOutputStream() { - impl_.reset(new 
FileOutputStreamImpl()); -} +FileOutputStream::FileOutputStream() { impl_.reset(new FileOutputStreamImpl()); } FileOutputStream::~FileOutputStream() { // This can fail; better to explicitly call close DCHECK(impl_->Close().ok()); } -Status FileOutputStream::Open( - const std::string& path, std::shared_ptr* file) { +Status FileOutputStream::Open(const std::string& path, + std::shared_ptr* file) { return Open(path, false, file); } -Status FileOutputStream::Open( - const std::string& path, bool append, std::shared_ptr* file) { +Status FileOutputStream::Open(const std::string& path, bool append, + std::shared_ptr* file) { // private ctor *file = std::shared_ptr(new FileOutputStream()); return (*file)->impl_->Open(path, append); } -Status FileOutputStream::Close() { - return impl_->Close(); -} +Status FileOutputStream::Close() { return impl_->Close(); } -Status FileOutputStream::Tell(int64_t* pos) { - return impl_->Tell(pos); -} +Status FileOutputStream::Tell(int64_t* pos) { return impl_->Tell(pos); } Status FileOutputStream::Write(const uint8_t* data, int64_t length) { return impl_->Write(data, length); } -int FileOutputStream::file_descriptor() const { - return impl_->fd(); -} +int FileOutputStream::file_descriptor() const { return impl_->fd(); } // ---------------------------------------------------------------------- // Implement MemoryMappedFile @@ -567,7 +569,7 @@ class MemoryMappedFile::MemoryMap : public MutableBuffer { } void* result = mmap(nullptr, static_cast(file_->size()), prot_flags, map_mode, - file_->fd(), 0); + file_->fd(), 0); if (result == MAP_FAILED) { std::stringstream ss; ss << "Memory mapping file failed, errno: " << errno; @@ -585,7 +587,9 @@ class MemoryMappedFile::MemoryMap : public MutableBuffer { int64_t size() const { return size_; } Status Seek(int64_t position) { - if (position < 0) { return Status::Invalid("position is out of bounds"); } + if (position < 0) { + return Status::Invalid("position is out of bounds"); + } position_ = position; 
return Status::OK(); } @@ -610,8 +614,8 @@ class MemoryMappedFile::MemoryMap : public MutableBuffer { MemoryMappedFile::MemoryMappedFile() {} MemoryMappedFile::~MemoryMappedFile() {} -Status MemoryMappedFile::Create( - const std::string& path, int64_t size, std::shared_ptr* out) { +Status MemoryMappedFile::Create(const std::string& path, int64_t size, + std::shared_ptr* out) { std::shared_ptr file; RETURN_NOT_OK(FileOutputStream::Open(path, &file)); #ifdef _MSC_VER @@ -624,7 +628,7 @@ Status MemoryMappedFile::Create( } Status MemoryMappedFile::Open(const std::string& path, FileMode::type mode, - std::shared_ptr* out) { + std::shared_ptr* out) { std::shared_ptr result(new MemoryMappedFile()); result->memory_map_.reset(new MemoryMap()); @@ -644,9 +648,7 @@ Status MemoryMappedFile::Tell(int64_t* position) { return Status::OK(); } -Status MemoryMappedFile::Seek(int64_t position) { - return memory_map_->Seek(position); -} +Status MemoryMappedFile::Seek(int64_t position) { return memory_map_->Seek(position); } Status MemoryMappedFile::Close() { // munmap handled in pimpl dtor @@ -656,7 +658,9 @@ Status MemoryMappedFile::Close() { Status MemoryMappedFile::Read(int64_t nbytes, int64_t* bytes_read, uint8_t* out) { nbytes = std::max( 0, std::min(nbytes, memory_map_->size() - memory_map_->position())); - if (nbytes > 0) { std::memcpy(out, memory_map_->head(), static_cast(nbytes)); } + if (nbytes > 0) { + std::memcpy(out, memory_map_->head(), static_cast(nbytes)); + } *bytes_read = nbytes; memory_map_->advance(nbytes); return Status::OK(); @@ -675,9 +679,7 @@ Status MemoryMappedFile::Read(int64_t nbytes, std::shared_ptr* out) { return Status::OK(); } -bool MemoryMappedFile::supports_zero_copy() const { - return true; -} +bool MemoryMappedFile::supports_zero_copy() const { return true; } Status MemoryMappedFile::WriteAt(int64_t position, const uint8_t* data, int64_t nbytes) { std::lock_guard guard(lock_); @@ -708,9 +710,7 @@ Status MemoryMappedFile::WriteInternal(const uint8_t* 
data, int64_t nbytes) { return Status::OK(); } -int MemoryMappedFile::file_descriptor() const { - return memory_map_->fd(); -} +int MemoryMappedFile::file_descriptor() const { return memory_map_->fd(); } } // namespace io } // namespace arrow diff --git a/cpp/src/arrow/io/file.h b/cpp/src/arrow/io/file.h index f0be3cf980162..ba740f1e8f4a9 100644 --- a/cpp/src/arrow/io/file.h +++ b/cpp/src/arrow/io/file.h @@ -44,8 +44,8 @@ class ARROW_EXPORT FileOutputStream : public OutputStream { // truncated to 0 bytes, deleting any existing memory static Status Open(const std::string& path, std::shared_ptr* file); - static Status Open( - const std::string& path, bool append, std::shared_ptr* file); + static Status Open(const std::string& path, bool append, + std::shared_ptr* file); // OutputStream interface Status Close() override; @@ -73,7 +73,7 @@ class ARROW_EXPORT ReadableFile : public RandomAccessFile { // Open file with one's own memory pool for memory allocations static Status Open(const std::string& path, MemoryPool* memory_pool, - std::shared_ptr* file); + std::shared_ptr* file); Status Close() override; Status Tell(int64_t* position) override; @@ -107,11 +107,11 @@ class ARROW_EXPORT MemoryMappedFile : public ReadWriteFileInterface { ~MemoryMappedFile(); /// Create new file with indicated size, return in read/write mode - static Status Create( - const std::string& path, int64_t size, std::shared_ptr* out); + static Status Create(const std::string& path, int64_t size, + std::shared_ptr* out); static Status Open(const std::string& path, FileMode::type mode, - std::shared_ptr* out); + std::shared_ptr* out); Status Close() override; diff --git a/cpp/src/arrow/io/hdfs-internal.cc b/cpp/src/arrow/io/hdfs-internal.cc index 8b4a92b396789..8f42b1c817fe4 100644 --- a/cpp/src/arrow/io/hdfs-internal.cc +++ b/cpp/src/arrow/io/hdfs-internal.cc @@ -59,9 +59,9 @@ static std::vector get_potential_libhdfs_paths(); static std::vector get_potential_libhdfs3_paths(); static arrow::Status 
try_dlopen(std::vector potential_paths, const char* name, #ifndef _WIN32 - void*& out_handle); + void*& out_handle); #else - HINSTANCE& out_handle); + HINSTANCE& out_handle); #endif static std::vector get_potential_libhdfs_paths() { @@ -88,7 +88,9 @@ static std::vector get_potential_libhdfs_paths() { } const char* libhdfs_dir = std::getenv("ARROW_LIBHDFS_DIR"); - if (libhdfs_dir != nullptr) { search_paths.push_back(fs::path(libhdfs_dir)); } + if (libhdfs_dir != nullptr) { + search_paths.push_back(fs::path(libhdfs_dir)); + } // All paths with file name for (auto& path : search_paths) { @@ -115,7 +117,9 @@ static std::vector get_potential_libhdfs3_paths() { std::vector search_paths = {fs::path(""), fs::path(".")}; const char* libhdfs3_dir = std::getenv("ARROW_LIBHDFS3_DIR"); - if (libhdfs3_dir != nullptr) { search_paths.push_back(fs::path(libhdfs3_dir)); } + if (libhdfs3_dir != nullptr) { + search_paths.push_back(fs::path(libhdfs3_dir)); + } // All paths with file name for (auto& path : search_paths) { @@ -188,8 +192,8 @@ static std::vector get_potential_libjvm_paths() { } #ifndef _WIN32 -static arrow::Status try_dlopen( - std::vector potential_paths, const char* name, void*& out_handle) { +static arrow::Status try_dlopen(std::vector potential_paths, const char* name, + void*& out_handle) { std::vector error_messages; for (auto& i : potential_paths) { @@ -219,8 +223,8 @@ static arrow::Status try_dlopen( } #else -static arrow::Status try_dlopen( - std::vector potential_paths, const char* name, HINSTANCE& out_handle) { +static arrow::Status try_dlopen(std::vector potential_paths, const char* name, + HINSTANCE& out_handle) { std::vector error_messages; for (auto& i : potential_paths) { @@ -282,9 +286,7 @@ namespace io { static LibHdfsShim libhdfs_shim; static LibHdfsShim libhdfs3_shim; -hdfsBuilder* LibHdfsShim::NewBuilder(void) { - return this->hdfsNewBuilder(); -} +hdfsBuilder* LibHdfsShim::NewBuilder(void) { return this->hdfsNewBuilder(); } void 
LibHdfsShim::BuilderSetNameNode(hdfsBuilder* bld, const char* nn) { this->hdfsBuilderSetNameNode(bld, nn); @@ -298,8 +300,8 @@ void LibHdfsShim::BuilderSetUserName(hdfsBuilder* bld, const char* userName) { this->hdfsBuilderSetUserName(bld, userName); } -void LibHdfsShim::BuilderSetKerbTicketCachePath( - hdfsBuilder* bld, const char* kerbTicketCachePath) { +void LibHdfsShim::BuilderSetKerbTicketCachePath(hdfsBuilder* bld, + const char* kerbTicketCachePath) { this->hdfsBuilderSetKerbTicketCachePath(bld, kerbTicketCachePath); } @@ -307,12 +309,10 @@ hdfsFS LibHdfsShim::BuilderConnect(hdfsBuilder* bld) { return this->hdfsBuilderConnect(bld); } -int LibHdfsShim::Disconnect(hdfsFS fs) { - return this->hdfsDisconnect(fs); -} +int LibHdfsShim::Disconnect(hdfsFS fs) { return this->hdfsDisconnect(fs); } hdfsFile LibHdfsShim::OpenFile(hdfsFS fs, const char* path, int flags, int bufferSize, - short replication, tSize blocksize) { // NOLINT + short replication, tSize blocksize) { // NOLINT return this->hdfsOpenFile(fs, path, flags, bufferSize, replication, blocksize); } @@ -328,9 +328,7 @@ int LibHdfsShim::Seek(hdfsFS fs, hdfsFile file, tOffset desiredPos) { return this->hdfsSeek(fs, file, desiredPos); } -tOffset LibHdfsShim::Tell(hdfsFS fs, hdfsFile file) { - return this->hdfsTell(fs, file); -} +tOffset LibHdfsShim::Tell(hdfsFS fs, hdfsFile file) { return this->hdfsTell(fs, file); } tSize LibHdfsShim::Read(hdfsFS fs, hdfsFile file, void* buffer, tSize length) { return this->hdfsRead(fs, file, buffer, length); @@ -341,8 +339,8 @@ bool LibHdfsShim::HasPread() { return this->hdfsPread != nullptr; } -tSize LibHdfsShim::Pread( - hdfsFS fs, hdfsFile file, tOffset position, void* buffer, tSize length) { +tSize LibHdfsShim::Pread(hdfsFS fs, hdfsFile file, tOffset position, void* buffer, + tSize length) { GET_SYMBOL(this, hdfsPread); return this->hdfsPread(fs, file, position, buffer, length); } @@ -351,9 +349,7 @@ tSize LibHdfsShim::Write(hdfsFS fs, hdfsFile file, const void* buffer, 
tSize len return this->hdfsWrite(fs, file, buffer, length); } -int LibHdfsShim::Flush(hdfsFS fs, hdfsFile file) { - return this->hdfsFlush(fs, file); -} +int LibHdfsShim::Flush(hdfsFS fs, hdfsFile file) { return this->hdfsFlush(fs, file); } int LibHdfsShim::Available(hdfsFS fs, hdfsFile file) { GET_SYMBOL(this, hdfsAvailable); @@ -434,8 +430,8 @@ void LibHdfsShim::FreeFileInfo(hdfsFileInfo* hdfsFileInfo, int numEntries) { this->hdfsFreeFileInfo(hdfsFileInfo, numEntries); } -char*** LibHdfsShim::GetHosts( - hdfsFS fs, const char* path, tOffset start, tOffset length) { +char*** LibHdfsShim::GetHosts(hdfsFS fs, const char* path, tOffset start, + tOffset length) { GET_SYMBOL(this, hdfsGetHosts); if (this->hdfsGetHosts) { return this->hdfsGetHosts(fs, path, start, length); @@ -446,7 +442,9 @@ char*** LibHdfsShim::GetHosts( void LibHdfsShim::FreeHosts(char*** blockHosts) { GET_SYMBOL(this, hdfsFreeHosts); - if (this->hdfsFreeHosts) { this->hdfsFreeHosts(blockHosts); } + if (this->hdfsFreeHosts) { + this->hdfsFreeHosts(blockHosts); + } } tOffset LibHdfsShim::GetDefaultBlockSize(hdfsFS fs) { @@ -458,16 +456,12 @@ tOffset LibHdfsShim::GetDefaultBlockSize(hdfsFS fs) { } } -tOffset LibHdfsShim::GetCapacity(hdfsFS fs) { - return this->hdfsGetCapacity(fs); -} +tOffset LibHdfsShim::GetCapacity(hdfsFS fs) { return this->hdfsGetCapacity(fs); } -tOffset LibHdfsShim::GetUsed(hdfsFS fs) { - return this->hdfsGetUsed(fs); -} +tOffset LibHdfsShim::GetUsed(hdfsFS fs) { return this->hdfsGetUsed(fs); } -int LibHdfsShim::Chown( - hdfsFS fs, const char* path, const char* owner, const char* group) { +int LibHdfsShim::Chown(hdfsFS fs, const char* path, const char* owner, + const char* group) { GET_SYMBOL(this, hdfsChown); if (this->hdfsChown) { return this->hdfsChown(fs, path, owner, group); diff --git a/cpp/src/arrow/io/hdfs-internal.h b/cpp/src/arrow/io/hdfs-internal.h index c5ea397af0bd5..db6a21c2b36ac 100644 --- a/cpp/src/arrow/io/hdfs-internal.h +++ b/cpp/src/arrow/io/hdfs-internal.h @@ 
-45,22 +45,22 @@ struct LibHdfsShim { void (*hdfsBuilderSetNameNode)(hdfsBuilder* bld, const char* nn); void (*hdfsBuilderSetNameNodePort)(hdfsBuilder* bld, tPort port); void (*hdfsBuilderSetUserName)(hdfsBuilder* bld, const char* userName); - void (*hdfsBuilderSetKerbTicketCachePath)( - hdfsBuilder* bld, const char* kerbTicketCachePath); + void (*hdfsBuilderSetKerbTicketCachePath)(hdfsBuilder* bld, + const char* kerbTicketCachePath); hdfsFS (*hdfsBuilderConnect)(hdfsBuilder* bld); int (*hdfsDisconnect)(hdfsFS fs); hdfsFile (*hdfsOpenFile)(hdfsFS fs, const char* path, int flags, int bufferSize, - short replication, tSize blocksize); // NOLINT + short replication, tSize blocksize); // NOLINT int (*hdfsCloseFile)(hdfsFS fs, hdfsFile file); int (*hdfsExists)(hdfsFS fs, const char* path); int (*hdfsSeek)(hdfsFS fs, hdfsFile file, tOffset desiredPos); tOffset (*hdfsTell)(hdfsFS fs, hdfsFile file); tSize (*hdfsRead)(hdfsFS fs, hdfsFile file, void* buffer, tSize length); - tSize (*hdfsPread)( - hdfsFS fs, hdfsFile file, tOffset position, void* buffer, tSize length); + tSize (*hdfsPread)(hdfsFS fs, hdfsFile file, tOffset position, void* buffer, + tSize length); tSize (*hdfsWrite)(hdfsFS fs, hdfsFile file, const void* buffer, tSize length); int (*hdfsFlush)(hdfsFS fs, hdfsFile file); int (*hdfsAvailable)(hdfsFS fs, hdfsFile file); @@ -139,7 +139,7 @@ struct LibHdfsShim { int Disconnect(hdfsFS fs); hdfsFile OpenFile(hdfsFS fs, const char* path, int flags, int bufferSize, - short replication, tSize blocksize); // NOLINT + short replication, tSize blocksize); // NOLINT int CloseFile(hdfsFS fs, hdfsFile file); diff --git a/cpp/src/arrow/io/hdfs.cc b/cpp/src/arrow/io/hdfs.cc index 9ded9bc3f9902..500f42caf5277 100644 --- a/cpp/src/arrow/io/hdfs.cc +++ b/cpp/src/arrow/io/hdfs.cc @@ -61,8 +61,8 @@ static constexpr int kDefaultHdfsBufferSize = 1 << 16; class HdfsAnyFileImpl { public: - void set_members( - const std::string& path, LibHdfsShim* driver, hdfsFS fs, hdfsFile handle) { + 
void set_members(const std::string& path, LibHdfsShim* driver, hdfsFS fs, + hdfsFile handle) { path_ = path; driver_ = driver; fs_ = fs; @@ -118,7 +118,7 @@ class HdfsReadableFile::HdfsReadableFileImpl : public HdfsAnyFileImpl { tSize ret; if (driver_->HasPread()) { ret = driver_->Pread(fs_, file_, static_cast(position), - reinterpret_cast(buffer), static_cast(nbytes)); + reinterpret_cast(buffer), static_cast(nbytes)); } else { std::lock_guard guard(lock_); RETURN_NOT_OK(Seek(position)); @@ -136,7 +136,9 @@ class HdfsReadableFile::HdfsReadableFileImpl : public HdfsAnyFileImpl { int64_t bytes_read = 0; RETURN_NOT_OK(ReadAt(position, nbytes, &bytes_read, buffer->mutable_data())); - if (bytes_read < nbytes) { RETURN_NOT_OK(buffer->Resize(bytes_read)); } + if (bytes_read < nbytes) { + RETURN_NOT_OK(buffer->Resize(bytes_read)); + } *out = buffer; return Status::OK(); @@ -145,11 +147,14 @@ class HdfsReadableFile::HdfsReadableFileImpl : public HdfsAnyFileImpl { Status Read(int64_t nbytes, int64_t* bytes_read, uint8_t* buffer) { int64_t total_bytes = 0; while (total_bytes < nbytes) { - tSize ret = driver_->Read(fs_, file_, reinterpret_cast(buffer + total_bytes), + tSize ret = driver_->Read( + fs_, file_, reinterpret_cast(buffer + total_bytes), static_cast(std::min(buffer_size_, nbytes - total_bytes))); RETURN_NOT_OK(CheckReadResult(ret)); total_bytes += ret; - if (ret == 0) { break; } + if (ret == 0) { + break; + } } *bytes_read = total_bytes; @@ -162,7 +167,9 @@ class HdfsReadableFile::HdfsReadableFileImpl : public HdfsAnyFileImpl { int64_t bytes_read = 0; RETURN_NOT_OK(Read(nbytes, &bytes_read, buffer->mutable_data())); - if (bytes_read < nbytes) { RETURN_NOT_OK(buffer->Resize(bytes_read)); } + if (bytes_read < nbytes) { + RETURN_NOT_OK(buffer->Resize(bytes_read)); + } *out = buffer; return Status::OK(); @@ -170,7 +177,9 @@ class HdfsReadableFile::HdfsReadableFileImpl : public HdfsAnyFileImpl { Status GetSize(int64_t* size) { hdfsFileInfo* entry = 
driver_->GetPathInfo(fs_, path_.c_str()); - if (entry == nullptr) { return Status::IOError("HDFS: GetPathInfo failed"); } + if (entry == nullptr) { + return Status::IOError("HDFS: GetPathInfo failed"); + } *size = entry->mSize; driver_->FreeFileInfo(entry, 1); @@ -187,31 +196,27 @@ class HdfsReadableFile::HdfsReadableFileImpl : public HdfsAnyFileImpl { }; HdfsReadableFile::HdfsReadableFile(MemoryPool* pool) { - if (pool == nullptr) { pool = default_memory_pool(); } + if (pool == nullptr) { + pool = default_memory_pool(); + } impl_.reset(new HdfsReadableFileImpl(pool)); } -HdfsReadableFile::~HdfsReadableFile() { - DCHECK(impl_->Close().ok()); -} +HdfsReadableFile::~HdfsReadableFile() { DCHECK(impl_->Close().ok()); } -Status HdfsReadableFile::Close() { - return impl_->Close(); -} +Status HdfsReadableFile::Close() { return impl_->Close(); } -Status HdfsReadableFile::ReadAt( - int64_t position, int64_t nbytes, int64_t* bytes_read, uint8_t* buffer) { +Status HdfsReadableFile::ReadAt(int64_t position, int64_t nbytes, int64_t* bytes_read, + uint8_t* buffer) { return impl_->ReadAt(position, nbytes, bytes_read, buffer); } -Status HdfsReadableFile::ReadAt( - int64_t position, int64_t nbytes, std::shared_ptr* out) { +Status HdfsReadableFile::ReadAt(int64_t position, int64_t nbytes, + std::shared_ptr* out) { return impl_->ReadAt(position, nbytes, out); } -bool HdfsReadableFile::supports_zero_copy() const { - return false; -} +bool HdfsReadableFile::supports_zero_copy() const { return false; } Status HdfsReadableFile::Read(int64_t nbytes, int64_t* bytes_read, uint8_t* buffer) { return impl_->Read(nbytes, bytes_read, buffer); @@ -221,17 +226,11 @@ Status HdfsReadableFile::Read(int64_t nbytes, std::shared_ptr* buffer) { return impl_->Read(nbytes, buffer); } -Status HdfsReadableFile::GetSize(int64_t* size) { - return impl_->GetSize(size); -} +Status HdfsReadableFile::GetSize(int64_t* size) { return impl_->GetSize(size); } -Status HdfsReadableFile::Seek(int64_t position) { - return 
impl_->Seek(position); -} +Status HdfsReadableFile::Seek(int64_t position) { return impl_->Seek(position); } -Status HdfsReadableFile::Tell(int64_t* position) { - return impl_->Tell(position); -} +Status HdfsReadableFile::Tell(int64_t* position) { return impl_->Tell(position); } // ---------------------------------------------------------------------- // File writing @@ -259,28 +258,22 @@ class HdfsOutputStream::HdfsOutputStreamImpl : public HdfsAnyFileImpl { Status Write(const uint8_t* buffer, int64_t nbytes, int64_t* bytes_written) { std::lock_guard guard(lock_); - tSize ret = driver_->Write( - fs_, file_, reinterpret_cast(buffer), static_cast(nbytes)); + tSize ret = driver_->Write(fs_, file_, reinterpret_cast(buffer), + static_cast(nbytes)); CHECK_FAILURE(ret, "Write"); *bytes_written = ret; return Status::OK(); } }; -HdfsOutputStream::HdfsOutputStream() { - impl_.reset(new HdfsOutputStreamImpl()); -} +HdfsOutputStream::HdfsOutputStream() { impl_.reset(new HdfsOutputStreamImpl()); } -HdfsOutputStream::~HdfsOutputStream() { - DCHECK(impl_->Close().ok()); -} +HdfsOutputStream::~HdfsOutputStream() { DCHECK(impl_->Close().ok()); } -Status HdfsOutputStream::Close() { - return impl_->Close(); -} +Status HdfsOutputStream::Close() { return impl_->Close(); } -Status HdfsOutputStream::Write( - const uint8_t* buffer, int64_t nbytes, int64_t* bytes_read) { +Status HdfsOutputStream::Write(const uint8_t* buffer, int64_t nbytes, + int64_t* bytes_read) { return impl_->Write(buffer, nbytes, bytes_read); } @@ -289,13 +282,9 @@ Status HdfsOutputStream::Write(const uint8_t* buffer, int64_t nbytes) { return Write(buffer, nbytes, &bytes_written_dummy); } -Status HdfsOutputStream::Flush() { - return impl_->Flush(); -} +Status HdfsOutputStream::Flush() { return impl_->Flush(); } -Status HdfsOutputStream::Tell(int64_t* position) { - return impl_->Tell(position); -} +Status HdfsOutputStream::Tell(int64_t* position) { return impl_->Tell(position); } // 
---------------------------------------------------------------------- // HDFS client @@ -344,7 +333,9 @@ class HdfsClient::HdfsClientImpl { } fs_ = driver_->BuilderConnect(builder); - if (fs_ == nullptr) { return Status::IOError("HDFS connection failed"); } + if (fs_ == nullptr) { + return Status::IOError("HDFS connection failed"); + } namenode_host_ = config->host; port_ = config->port; user_ = config->user; @@ -395,7 +386,9 @@ class HdfsClient::HdfsClientImpl { Status GetPathInfo(const std::string& path, HdfsPathInfo* info) { hdfsFileInfo* entry = driver_->GetPathInfo(fs_, path.c_str()); - if (entry == nullptr) { return Status::IOError("HDFS: GetPathInfo failed"); } + if (entry == nullptr) { + return Status::IOError("HDFS: GetPathInfo failed"); + } SetPathInfo(entry, info); driver_->FreeFileInfo(entry, 1); @@ -435,7 +428,7 @@ class HdfsClient::HdfsClientImpl { } Status OpenReadable(const std::string& path, int32_t buffer_size, - std::shared_ptr* file) { + std::shared_ptr* file) { hdfsFile handle = driver_->OpenFile(fs_, path.c_str(), O_RDONLY, buffer_size, 0, 0); if (handle == nullptr) { @@ -454,13 +447,14 @@ class HdfsClient::HdfsClientImpl { } Status OpenWriteable(const std::string& path, bool append, int32_t buffer_size, - int16_t replication, int64_t default_block_size, - std::shared_ptr* file) { + int16_t replication, int64_t default_block_size, + std::shared_ptr* file) { int flags = O_WRONLY; if (append) flags |= O_APPEND; - hdfsFile handle = driver_->OpenFile(fs_, path.c_str(), flags, buffer_size, - replication, static_cast(default_block_size)); + hdfsFile handle = + driver_->OpenFile(fs_, path.c_str(), flags, buffer_size, replication, + static_cast(default_block_size)); if (handle == nullptr) { // TODO(wesm): determine cause of failure @@ -496,14 +490,12 @@ class HdfsClient::HdfsClientImpl { // ---------------------------------------------------------------------- // Public API for HDFSClient -HdfsClient::HdfsClient() { - impl_.reset(new 
HdfsClientImpl()); -} +HdfsClient::HdfsClient() { impl_.reset(new HdfsClientImpl()); } HdfsClient::~HdfsClient() {} -Status HdfsClient::Connect( - const HdfsConnectionConfig* config, std::shared_ptr* fs) { +Status HdfsClient::Connect(const HdfsConnectionConfig* config, + std::shared_ptr* fs) { // ctor is private, make_shared will not work *fs = std::shared_ptr(new HdfsClient()); @@ -519,50 +511,43 @@ Status HdfsClient::Delete(const std::string& path, bool recursive) { return impl_->Delete(path, recursive); } -Status HdfsClient::Disconnect() { - return impl_->Disconnect(); -} +Status HdfsClient::Disconnect() { return impl_->Disconnect(); } -bool HdfsClient::Exists(const std::string& path) { - return impl_->Exists(path); -} +bool HdfsClient::Exists(const std::string& path) { return impl_->Exists(path); } Status HdfsClient::GetPathInfo(const std::string& path, HdfsPathInfo* info) { return impl_->GetPathInfo(path, info); } -Status HdfsClient::GetCapacity(int64_t* nbytes) { - return impl_->GetCapacity(nbytes); -} +Status HdfsClient::GetCapacity(int64_t* nbytes) { return impl_->GetCapacity(nbytes); } -Status HdfsClient::GetUsed(int64_t* nbytes) { - return impl_->GetUsed(nbytes); -} +Status HdfsClient::GetUsed(int64_t* nbytes) { return impl_->GetUsed(nbytes); } -Status HdfsClient::ListDirectory( - const std::string& path, std::vector* listing) { +Status HdfsClient::ListDirectory(const std::string& path, + std::vector* listing) { return impl_->ListDirectory(path, listing); } Status HdfsClient::OpenReadable(const std::string& path, int32_t buffer_size, - std::shared_ptr* file) { + std::shared_ptr* file) { return impl_->OpenReadable(path, buffer_size, file); } -Status HdfsClient::OpenReadable( - const std::string& path, std::shared_ptr* file) { +Status HdfsClient::OpenReadable(const std::string& path, + std::shared_ptr* file) { return OpenReadable(path, kDefaultHdfsBufferSize, file); } Status HdfsClient::OpenWriteable(const std::string& path, bool append, - int32_t 
buffer_size, int16_t replication, int64_t default_block_size, - std::shared_ptr* file) { - return impl_->OpenWriteable( - path, append, buffer_size, replication, default_block_size, file); + int32_t buffer_size, int16_t replication, + int64_t default_block_size, + std::shared_ptr* file) { + return impl_->OpenWriteable(path, append, buffer_size, replication, default_block_size, + file); } -Status HdfsClient::OpenWriteable( - const std::string& path, bool append, std::shared_ptr* file) { +Status HdfsClient::OpenWriteable(const std::string& path, bool append, + std::shared_ptr* file) { return OpenWriteable(path, append, 0, 0, 0, file); } diff --git a/cpp/src/arrow/io/hdfs.h b/cpp/src/arrow/io/hdfs.h index f3de4a2bf174f..63c3ae0d53724 100644 --- a/cpp/src/arrow/io/hdfs.h +++ b/cpp/src/arrow/io/hdfs.h @@ -75,8 +75,8 @@ class ARROW_EXPORT HdfsClient : public FileSystemClient { // @param config (in): configuration for connecting // @param fs (out): the created client // @returns Status - static Status Connect( - const HdfsConnectionConfig* config, std::shared_ptr* fs); + static Status Connect(const HdfsConnectionConfig* config, + std::shared_ptr* fs); // Create directory and all parents // @@ -132,7 +132,7 @@ class ARROW_EXPORT HdfsClient : public FileSystemClient { // // @param path complete file path Status OpenReadable(const std::string& path, int32_t buffer_size, - std::shared_ptr* file); + std::shared_ptr* file); Status OpenReadable(const std::string& path, std::shared_ptr* file); @@ -142,11 +142,11 @@ class ARROW_EXPORT HdfsClient : public FileSystemClient { // @param replication, 0 for default // @param default_block_size, 0 for default Status OpenWriteable(const std::string& path, bool append, int32_t buffer_size, - int16_t replication, int64_t default_block_size, - std::shared_ptr* file); + int16_t replication, int64_t default_block_size, + std::shared_ptr* file); - Status OpenWriteable( - const std::string& path, bool append, std::shared_ptr* file); + Status 
OpenWriteable(const std::string& path, bool append, + std::shared_ptr* file); private: friend class HdfsReadableFile; @@ -173,8 +173,8 @@ class ARROW_EXPORT HdfsReadableFile : public RandomAccessFile { Status Read(int64_t nbytes, std::shared_ptr* out) override; - Status ReadAt( - int64_t position, int64_t nbytes, int64_t* bytes_read, uint8_t* buffer) override; + Status ReadAt(int64_t position, int64_t nbytes, int64_t* bytes_read, + uint8_t* buffer) override; Status ReadAt(int64_t position, int64_t nbytes, std::shared_ptr* out) override; diff --git a/cpp/src/arrow/io/interfaces.cc b/cpp/src/arrow/io/interfaces.cc index 06957d4de560d..57dc42d8a9b2a 100644 --- a/cpp/src/arrow/io/interfaces.cc +++ b/cpp/src/arrow/io/interfaces.cc @@ -29,32 +29,28 @@ namespace io { FileInterface::~FileInterface() {} -RandomAccessFile::RandomAccessFile() { - set_mode(FileMode::READ); -} +RandomAccessFile::RandomAccessFile() { set_mode(FileMode::READ); } -Status RandomAccessFile::ReadAt( - int64_t position, int64_t nbytes, int64_t* bytes_read, uint8_t* out) { +Status RandomAccessFile::ReadAt(int64_t position, int64_t nbytes, int64_t* bytes_read, + uint8_t* out) { std::lock_guard guard(lock_); RETURN_NOT_OK(Seek(position)); return Read(nbytes, bytes_read, out); } -Status RandomAccessFile::ReadAt( - int64_t position, int64_t nbytes, std::shared_ptr* out) { +Status RandomAccessFile::ReadAt(int64_t position, int64_t nbytes, + std::shared_ptr* out) { std::lock_guard guard(lock_); RETURN_NOT_OK(Seek(position)); return Read(nbytes, out); } Status Writeable::Write(const std::string& data) { - return Write( - reinterpret_cast(data.c_str()), static_cast(data.size())); + return Write(reinterpret_cast(data.c_str()), + static_cast(data.size())); } -Status Writeable::Flush() { - return Status::OK(); -} +Status Writeable::Flush() { return Status::OK(); } } // namespace io } // namespace arrow diff --git a/cpp/src/arrow/io/interfaces.h b/cpp/src/arrow/io/interfaces.h index b5a0bd85bf27b..e71a5c93baa32 
100644 --- a/cpp/src/arrow/io/interfaces.h +++ b/cpp/src/arrow/io/interfaces.h @@ -107,8 +107,8 @@ class ARROW_EXPORT RandomAccessFile : public InputStream, public Seekable { /// be overridden /// /// Default implementation is thread-safe - virtual Status ReadAt( - int64_t position, int64_t nbytes, int64_t* bytes_read, uint8_t* out); + virtual Status ReadAt(int64_t position, int64_t nbytes, int64_t* bytes_read, + uint8_t* out); /// Default implementation is thread-safe virtual Status ReadAt(int64_t position, int64_t nbytes, std::shared_ptr* out); diff --git a/cpp/src/arrow/io/io-file-test.cc b/cpp/src/arrow/io/io-file-test.cc index a077f8cb921c7..36c35700d6496 100644 --- a/cpp/src/arrow/io/io-file-test.cc +++ b/cpp/src/arrow/io/io-file-test.cc @@ -43,9 +43,10 @@ static bool FileExists(const std::string& path) { #if defined(_MSC_VER) void InvalidParamHandler(const wchar_t* expr, const wchar_t* func, - const wchar_t* source_file, unsigned int source_line, uintptr_t reserved) { + const wchar_t* source_file, unsigned int source_line, + uintptr_t reserved) { wprintf(L"Invalid parameter in funcion %s. 
Source: %s line %d expression %s", func, - source_file, source_line, expr); + source_file, source_line, expr); } #endif @@ -61,7 +62,9 @@ static bool FileIsClosed(int fd) { int ret = static_cast(_close(fd)); return (ret == -1); #else - if (-1 != fcntl(fd, F_GETFD)) { return false; } + if (-1 != fcntl(fd, F_GETFD)) { + return false; + } return errno == EBADF; #endif } @@ -76,7 +79,9 @@ class FileTestFixture : public ::testing::Test { void TearDown() { EnsureFileDeleted(); } void EnsureFileDeleted() { - if (FileExists(path_)) { std::remove(path_.c_str()); } + if (FileExists(path_)) { + std::remove(path_.c_str()); + } } protected: @@ -382,7 +387,9 @@ TEST_F(TestReadableFile, ThreadSafety) { for (int i = 0; i < niter; ++i) { ASSERT_OK(file_->ReadAt(0, 3, &buffer)); - if (0 == memcmp(data.c_str(), buffer->data(), 3)) { correct_count += 1; } + if (0 == memcmp(data.c_str(), buffer->data(), 3)) { + correct_count += 1; + } } }; @@ -547,8 +554,8 @@ TEST_F(TestMemoryMappedFile, InvalidFile) { std::string non_existent_path = "invalid-file-name-asfd"; std::shared_ptr result; - ASSERT_RAISES( - IOError, MemoryMappedFile::Open(non_existent_path, FileMode::READ, &result)); + ASSERT_RAISES(IOError, + MemoryMappedFile::Open(non_existent_path, FileMode::READ, &result)); } TEST_F(TestMemoryMappedFile, CastableToFileInterface) { @@ -563,8 +570,8 @@ TEST_F(TestMemoryMappedFile, ThreadSafety) { std::shared_ptr file; ASSERT_OK(MemoryMappedFile::Open(path, FileMode::READWRITE, &file)); - ASSERT_OK(file->Write( - reinterpret_cast(data.c_str()), static_cast(data.size()))); + ASSERT_OK(file->Write(reinterpret_cast(data.c_str()), + static_cast(data.size()))); std::atomic correct_count(0); const int niter = 10000; @@ -574,7 +581,9 @@ TEST_F(TestMemoryMappedFile, ThreadSafety) { for (int i = 0; i < niter; ++i) { ASSERT_OK(file->ReadAt(0, 3, &buffer)); - if (0 == memcmp(data.c_str(), buffer->data(), 3)) { correct_count += 1; } + if (0 == memcmp(data.c_str(), buffer->data(), 3)) { + correct_count 
+= 1; + } } }; diff --git a/cpp/src/arrow/io/io-hdfs-test.cc b/cpp/src/arrow/io/io-hdfs-test.cc index 74f80428c4561..c584cf5adeaf2 100644 --- a/cpp/src/arrow/io/io-hdfs-test.cc +++ b/cpp/src/arrow/io/io-hdfs-test.cc @@ -58,11 +58,11 @@ class TestHdfsClient : public ::testing::Test { } Status WriteDummyFile(const std::string& path, const uint8_t* buffer, int64_t size, - bool append = false, int buffer_size = 0, int16_t replication = 0, - int default_block_size = 0) { + bool append = false, int buffer_size = 0, int16_t replication = 0, + int default_block_size = 0) { std::shared_ptr file; - RETURN_NOT_OK(client_->OpenWriteable( - path, append, buffer_size, replication, default_block_size, &file)); + RETURN_NOT_OK(client_->OpenWriteable(path, append, buffer_size, replication, + default_block_size, &file)); RETURN_NOT_OK(file->Write(buffer, size)); RETURN_NOT_OK(file->Close()); @@ -87,9 +87,10 @@ class TestHdfsClient : public ::testing::Test { LibHdfsShim* driver_shim; client_ = nullptr; - scratch_dir_ = boost::filesystem::unique_path( - boost::filesystem::temp_directory_path() / "arrow-hdfs/scratch-%%%%") - .string(); + scratch_dir_ = + boost::filesystem::unique_path(boost::filesystem::temp_directory_path() / + "arrow-hdfs/scratch-%%%%") + .string(); loaded_driver_ = false; @@ -175,7 +176,9 @@ TYPED_TEST(TestHdfsClient, MakeDirectory) { std::string path = this->ScratchPath("create-directory"); - if (this->client_->Exists(path)) { ASSERT_OK(this->client_->Delete(path, true)); } + if (this->client_->Exists(path)) { + ASSERT_OK(this->client_->Delete(path, true)); + } ASSERT_OK(this->client_->MakeDirectory(path)); ASSERT_TRUE(this->client_->Exists(path)); @@ -396,7 +399,7 @@ TYPED_TEST(TestHdfsClient, ThreadSafety) { std::string data = "foobar"; ASSERT_OK(this->WriteDummyFile(src_path, reinterpret_cast(data.c_str()), - static_cast(data.size()))); + static_cast(data.size()))); std::shared_ptr file; ASSERT_OK(this->client_->OpenReadable(src_path, &file)); @@ -409,10 +412,14 
@@ TYPED_TEST(TestHdfsClient, ThreadSafety) { std::shared_ptr buffer; if (i % 2 == 0) { ASSERT_OK(file->ReadAt(3, 3, &buffer)); - if (0 == memcmp(data.c_str() + 3, buffer->data(), 3)) { correct_count += 1; } + if (0 == memcmp(data.c_str() + 3, buffer->data(), 3)) { + correct_count += 1; + } } else { ASSERT_OK(file->ReadAt(0, 4, &buffer)); - if (0 == memcmp(data.c_str() + 0, buffer->data(), 4)) { correct_count += 1; } + if (0 == memcmp(data.c_str() + 0, buffer->data(), 4)) { + correct_count += 1; + } } } }; diff --git a/cpp/src/arrow/io/memory.cc b/cpp/src/arrow/io/memory.cc index 4d8bf63757d64..b6c48ec39be89 100644 --- a/cpp/src/arrow/io/memory.cc +++ b/cpp/src/arrow/io/memory.cc @@ -46,7 +46,7 @@ BufferOutputStream::BufferOutputStream(const std::shared_ptr& b mutable_data_(buffer->mutable_data()) {} Status BufferOutputStream::Create(int64_t initial_capacity, MemoryPool* pool, - std::shared_ptr* out) { + std::shared_ptr* out) { std::shared_ptr buffer; RETURN_NOT_OK(AllocateResizableBuffer(pool, initial_capacity, &buffer)); *out = std::make_shared(buffer); @@ -55,7 +55,9 @@ Status BufferOutputStream::Create(int64_t initial_capacity, MemoryPool* pool, BufferOutputStream::~BufferOutputStream() { // This can fail, better to explicitly call close - if (buffer_) { DCHECK(Close().ok()); } + if (buffer_) { + DCHECK(Close().ok()); + } } Status BufferOutputStream::Close() { @@ -102,9 +104,7 @@ Status BufferOutputStream::Reserve(int64_t nbytes) { // ---------------------------------------------------------------------- // OutputStream that doesn't write anything -Status MockOutputStream::Close() { - return Status::OK(); -} +Status MockOutputStream::Close() { return Status::OK(); } Status MockOutputStream::Tell(int64_t* position) { *position = extent_bytes_written_; @@ -158,7 +158,7 @@ Status FixedSizeBufferWriter::Tell(int64_t* position) { Status FixedSizeBufferWriter::Write(const uint8_t* data, int64_t nbytes) { if (nbytes > memcopy_threshold_ && memcopy_num_threads_ > 1) { 
parallel_memcopy(mutable_data_ + position_, data, nbytes, memcopy_blocksize_, - memcopy_num_threads_); + memcopy_num_threads_); } else { memcpy(mutable_data_ + position_, data, nbytes); } @@ -166,8 +166,8 @@ Status FixedSizeBufferWriter::Write(const uint8_t* data, int64_t nbytes) { return Status::OK(); } -Status FixedSizeBufferWriter::WriteAt( - int64_t position, const uint8_t* data, int64_t nbytes) { +Status FixedSizeBufferWriter::WriteAt(int64_t position, const uint8_t* data, + int64_t nbytes) { std::lock_guard guard(lock_); RETURN_NOT_OK(Seek(position)); return Write(data, nbytes); @@ -206,9 +206,7 @@ Status BufferReader::Tell(int64_t* position) { return Status::OK(); } -bool BufferReader::supports_zero_copy() const { - return true; -} +bool BufferReader::supports_zero_copy() const { return true; } Status BufferReader::Read(int64_t nbytes, int64_t* bytes_read, uint8_t* buffer) { memcpy(buffer, data_ + position_, nbytes); diff --git a/cpp/src/arrow/io/memory.h b/cpp/src/arrow/io/memory.h index 06384f0d4c4b7..1f8177436471c 100644 --- a/cpp/src/arrow/io/memory.h +++ b/cpp/src/arrow/io/memory.h @@ -45,7 +45,7 @@ class ARROW_EXPORT BufferOutputStream : public OutputStream { explicit BufferOutputStream(const std::shared_ptr& buffer); static Status Create(int64_t initial_capacity, MemoryPool* pool, - std::shared_ptr* out); + std::shared_ptr* out); ~BufferOutputStream(); diff --git a/cpp/src/arrow/io/test-common.h b/cpp/src/arrow/io/test-common.h index 438f378085f65..a4974b77528de 100644 --- a/cpp/src/arrow/io/test-common.h +++ b/cpp/src/arrow/io/test-common.h @@ -73,8 +73,8 @@ class MemoryMapFixture { tmp_files_.push_back(path); } - Status InitMemoryMap( - int64_t size, const std::string& path, std::shared_ptr* mmap) { + Status InitMemoryMap(int64_t size, const std::string& path, + std::shared_ptr* mmap) { RETURN_NOT_OK(MemoryMappedFile::Create(path, size, mmap)); tmp_files_.push_back(path); return Status::OK(); diff --git a/cpp/src/arrow/ipc/feather-internal.h 
b/cpp/src/arrow/ipc/feather-internal.h index 646c3b2f9f2e3..36cfecc0493f5 100644 --- a/cpp/src/arrow/ipc/feather-internal.h +++ b/cpp/src/arrow/ipc/feather-internal.h @@ -49,7 +49,7 @@ struct ARROW_EXPORT ArrayMetadata { ArrayMetadata() {} ArrayMetadata(fbs::Type type, int64_t offset, int64_t length, int64_t null_count, - int64_t total_bytes) + int64_t total_bytes) : type(type), offset(offset), length(length), @@ -135,7 +135,9 @@ class ARROW_EXPORT TableMetadata { bool HasDescription() const { return table_->description() != 0; } std::string GetDescription() const { - if (!HasDescription()) { return std::string(""); } + if (!HasDescription()) { + return std::string(""); + } return table_->description()->str(); } @@ -153,7 +155,7 @@ class ARROW_EXPORT TableMetadata { static inline flatbuffers::Offset GetPrimitiveArray( FBB& fbb, const ArrayMetadata& array) { return fbs::CreatePrimitiveArray(fbb, array.type, fbs::Encoding_PLAIN, array.offset, - array.length, array.null_count, array.total_bytes); + array.length, array.null_count, array.total_bytes); } static inline fbs::TimeUnit ToFlatbufferEnum(TimeUnit::type unit) { diff --git a/cpp/src/arrow/ipc/feather-test.cc b/cpp/src/arrow/ipc/feather-test.cc index 029aae31ff52c..b76b518788b91 100644 --- a/cpp/src/arrow/ipc/feather-test.cc +++ b/cpp/src/arrow/ipc/feather-test.cc @@ -365,8 +365,8 @@ TEST_F(TestTableWriter, TimeTypes) { ArrayFromVector(is_valid, date_values_vec, &date_array); const auto& prim_values = static_cast(*values); - std::vector> buffers = { - prim_values.null_bitmap(), prim_values.values()}; + std::vector> buffers = {prim_values.null_bitmap(), + prim_values.values()}; std::vector> arrays; arrays.push_back(date_array->data()); @@ -400,7 +400,8 @@ TEST_F(TestTableWriter, PrimitiveNullRoundTrip) { ASSERT_OK(reader_->GetColumn(i, &col)); ASSERT_EQ(batch->column_name(i), col->name()); StringArray str_values(batch->column(i)->length(), nullptr, nullptr, - batch->column(i)->null_bitmap(), 
batch->column(i)->null_count()); + batch->column(i)->null_bitmap(), + batch->column(i)->null_count()); CheckArrays(str_values, *col->data()->chunk(0)); } } diff --git a/cpp/src/arrow/ipc/feather.cc b/cpp/src/arrow/ipc/feather.cc index 61b96e0c1dc3b..54771d3356b83 100644 --- a/cpp/src/arrow/ipc/feather.cc +++ b/cpp/src/arrow/ipc/feather.cc @@ -61,26 +61,30 @@ static int64_t GetOutputLength(int64_t nbytes) { } static Status WritePadded(io::OutputStream* stream, const uint8_t* data, int64_t length, - int64_t* bytes_written) { + int64_t* bytes_written) { RETURN_NOT_OK(stream->Write(data, length)); int64_t remainder = PaddedLength(length) - length; - if (remainder != 0) { RETURN_NOT_OK(stream->Write(kPaddingBytes, remainder)); } + if (remainder != 0) { + RETURN_NOT_OK(stream->Write(kPaddingBytes, remainder)); + } *bytes_written = length + remainder; return Status::OK(); } /// For compability, we need to write any data sometimes just to keep producing /// files that can be read with an older reader. 
-static Status WritePaddedBlank( - io::OutputStream* stream, int64_t length, int64_t* bytes_written) { +static Status WritePaddedBlank(io::OutputStream* stream, int64_t length, + int64_t* bytes_written) { const uint8_t null = 0; for (int64_t i = 0; i < length; i++) { RETURN_NOT_OK(stream->Write(&null, 1)); } int64_t remainder = PaddedLength(length) - length; - if (remainder != 0) { RETURN_NOT_OK(stream->Write(kPaddingBytes, remainder)); } + if (remainder != 0) { + RETURN_NOT_OK(stream->Write(kPaddingBytes, remainder)); + } *bytes_written = length + remainder; return Status::OK(); } @@ -90,20 +94,22 @@ static Status WritePaddedBlank( TableBuilder::TableBuilder(int64_t num_rows) : finished_(false), num_rows_(num_rows) {} -FBB& TableBuilder::fbb() { - return fbb_; -} +FBB& TableBuilder::fbb() { return fbb_; } Status TableBuilder::Finish() { - if (finished_) { return Status::Invalid("can only call this once"); } + if (finished_) { + return Status::Invalid("can only call this once"); + } FBString desc = 0; - if (!description_.empty()) { desc = fbb_.CreateString(description_); } + if (!description_.empty()) { + desc = fbb_.CreateString(description_); + } flatbuffers::Offset metadata = 0; - auto root = fbs::CreateCTable( - fbb_, desc, num_rows_, fbb_.CreateVector(columns_), kFeatherVersion, metadata); + auto root = fbs::CreateCTable(fbb_, desc, num_rows_, fbb_.CreateVector(columns_), + kFeatherVersion, metadata); fbb_.Finish(root); finished_ = true; @@ -111,17 +117,15 @@ Status TableBuilder::Finish() { } std::shared_ptr TableBuilder::GetBuffer() const { - return std::make_shared( - fbb_.GetBufferPointer(), static_cast(fbb_.GetSize())); + return std::make_shared(fbb_.GetBufferPointer(), + static_cast(fbb_.GetSize())); } void TableBuilder::SetDescription(const std::string& description) { description_ = description; } -void TableBuilder::SetNumRows(int64_t num_rows) { - num_rows_ = num_rows; -} +void TableBuilder::SetNumRows(int64_t num_rows) { num_rows_ = num_rows; } void 
TableBuilder::add_column(const flatbuffers::Offset& col) { columns_.push_back(col); @@ -177,21 +181,17 @@ Status ColumnBuilder::Finish() { flatbuffers::Offset metadata = CreateColumnMetadata(); auto column = fbs::CreateColumn(buf, buf.CreateString(name_), values, - ToFlatbufferEnum(type_), // metadata_type - metadata, buf.CreateString(user_metadata_)); + ToFlatbufferEnum(type_), // metadata_type + metadata, buf.CreateString(user_metadata_)); // bad coupling, but OK for now parent_->add_column(column); return Status::OK(); } -void ColumnBuilder::SetValues(const ArrayMetadata& values) { - values_ = values; -} +void ColumnBuilder::SetValues(const ArrayMetadata& values) { values_ = values; } -void ColumnBuilder::SetUserMetadata(const std::string& data) { - user_metadata_ = data; -} +void ColumnBuilder::SetUserMetadata(const std::string& data) { user_metadata_ = data; } void ColumnBuilder::SetCategory(const ArrayMetadata& levels, bool ordered) { type_ = ColumnType::CATEGORY; @@ -209,18 +209,14 @@ void ColumnBuilder::SetTimestamp(TimeUnit::type unit, const std::string& timezon meta_timestamp_.timezone = timezone; } -void ColumnBuilder::SetDate() { - type_ = ColumnType::DATE; -} +void ColumnBuilder::SetDate() { type_ = ColumnType::DATE; } void ColumnBuilder::SetTime(TimeUnit::type unit) { type_ = ColumnType::TIME; meta_time_.unit = unit; } -FBB& ColumnBuilder::fbb() { - return *fbb_; -} +FBB& ColumnBuilder::fbb() { return *fbb_; } std::unique_ptr TableBuilder::AddColumn(const std::string& name) { return std::unique_ptr(new ColumnBuilder(this, name)); @@ -272,7 +268,7 @@ class TableReader::TableReaderImpl { } Status GetDataType(const fbs::PrimitiveArray* values, fbs::TypeMetadata metadata_type, - const void* metadata, std::shared_ptr* out) { + const void* metadata, std::shared_ptr* out) { #define PRIMITIVE_CASE(CAP_TYPE, FACTORY_FUNC) \ case fbs::Type_##CAP_TYPE: \ *out = FACTORY_FUNC(); \ @@ -342,7 +338,7 @@ class TableReader::TableReaderImpl { // @returns: a Buffer 
instance, the precise type will depend on the kind of // input data source (which may or may not have memory-map like semantics) Status LoadValues(const fbs::PrimitiveArray* meta, fbs::TypeMetadata metadata_type, - const void* metadata, std::shared_ptr* out) { + const void* metadata, std::shared_ptr* out) { std::shared_ptr type; RETURN_NOT_OK(GetDataType(meta, metadata_type, metadata, &type)); @@ -394,8 +390,8 @@ class TableReader::TableReaderImpl { // if (user_meta->size() > 0) { user_metadata_ = user_meta->str(); } std::shared_ptr values; - RETURN_NOT_OK(LoadValues( - col_meta->values(), col_meta->metadata_type(), col_meta->metadata(), &values)); + RETURN_NOT_OK(LoadValues(col_meta->values(), col_meta->metadata_type(), + col_meta->metadata(), &values)); out->reset(new Column(col_meta->name()->str(), values)); return Status::OK(); } @@ -410,41 +406,27 @@ class TableReader::TableReaderImpl { // ---------------------------------------------------------------------- // TableReader public API -TableReader::TableReader() { - impl_.reset(new TableReaderImpl()); -} +TableReader::TableReader() { impl_.reset(new TableReaderImpl()); } TableReader::~TableReader() {} Status TableReader::Open(const std::shared_ptr& source, - std::unique_ptr* out) { + std::unique_ptr* out) { out->reset(new TableReader()); return (*out)->impl_->Open(source); } -bool TableReader::HasDescription() const { - return impl_->HasDescription(); -} +bool TableReader::HasDescription() const { return impl_->HasDescription(); } -std::string TableReader::GetDescription() const { - return impl_->GetDescription(); -} +std::string TableReader::GetDescription() const { return impl_->GetDescription(); } -int TableReader::version() const { - return impl_->version(); -} +int TableReader::version() const { return impl_->version(); } -int64_t TableReader::num_rows() const { - return impl_->num_rows(); -} +int64_t TableReader::num_rows() const { return impl_->num_rows(); } -int64_t TableReader::num_columns() const { - 
return impl_->num_columns(); -} +int64_t TableReader::num_columns() const { return impl_->num_columns(); } -std::string TableReader::GetColumnName(int i) const { - return impl_->GetColumnName(i); -} +std::string TableReader::GetColumnName(int i) const { return impl_->GetColumnName(i); } Status TableReader::GetColumn(int i, std::shared_ptr* out) { return impl_->GetColumn(i, out); @@ -501,8 +483,8 @@ static Status SanitizeUnsupportedTypes(const Array& values, std::shared_ptr( - values.length(), nullptr, nullptr, values.null_bitmap(), values.null_count()); + *out = std::make_shared(values.length(), nullptr, nullptr, + values.null_bitmap(), values.null_count()); return Status::OK(); } else { return MakeArray(values.data(), out); @@ -537,8 +519,8 @@ class TableWriter::TableWriterImpl : public ArrayVisitor { // Footer: metadata length, magic bytes RETURN_NOT_OK( stream_->Write(reinterpret_cast(&buffer_size), sizeof(uint32_t))); - return stream_->Write( - reinterpret_cast(kFeatherMagicBytes), strlen(kFeatherMagicBytes)); + return stream_->Write(reinterpret_cast(kFeatherMagicBytes), + strlen(kFeatherMagicBytes)); } Status LoadArrayMetadata(const Array& values, ArrayMetadata* meta) { @@ -571,7 +553,7 @@ class TableWriter::TableWriterImpl : public ArrayVisitor { // byte boundary, and we write this much data into the stream if (values.null_bitmap()) { RETURN_NOT_OK(WritePadded(stream_.get(), values.null_bitmap()->data(), - values.null_bitmap()->size(), &bytes_written)); + values.null_bitmap()->size(), &bytes_written)); } else { RETURN_NOT_OK(WritePaddedBlank( stream_.get(), BitUtil::BytesForBits(values.length()), &bytes_written)); @@ -592,15 +574,17 @@ class TableWriter::TableWriterImpl : public ArrayVisitor { values_bytes = bin_values.raw_value_offsets()[values.length()]; // Write the variable-length offsets - RETURN_NOT_OK(WritePadded(stream_.get(), - reinterpret_cast(bin_values.raw_value_offsets()), - offset_bytes, &bytes_written)); + 
RETURN_NOT_OK(WritePadded(stream_.get(), reinterpret_cast( + bin_values.raw_value_offsets()), + offset_bytes, &bytes_written)); } else { RETURN_NOT_OK(WritePaddedBlank(stream_.get(), offset_bytes, &bytes_written)); } meta->total_bytes += bytes_written; - if (bin_values.value_data()) { values_buffer = bin_values.value_data()->data(); } + if (bin_values.value_data()) { + values_buffer = bin_values.value_data()->data(); + } } else { const auto& prim_values = static_cast(values); const auto& fw_type = static_cast(*values.type()); @@ -612,7 +596,9 @@ class TableWriter::TableWriterImpl : public ArrayVisitor { values_bytes = values.length() * fw_type.bit_width() / 8; } - if (prim_values.values()) { values_buffer = prim_values.values()->data(); } + if (prim_values.values()) { + values_buffer = prim_values.values()->data(); + } } if (values_buffer) { RETURN_NOT_OK( @@ -710,9 +696,9 @@ class TableWriter::TableWriterImpl : public ArrayVisitor { Status CheckStarted() { if (!initialized_stream_) { int64_t bytes_written_unused; - RETURN_NOT_OK( - WritePadded(stream_.get(), reinterpret_cast(kFeatherMagicBytes), - strlen(kFeatherMagicBytes), &bytes_written_unused)); + RETURN_NOT_OK(WritePadded(stream_.get(), + reinterpret_cast(kFeatherMagicBytes), + strlen(kFeatherMagicBytes), &bytes_written_unused)); initialized_stream_ = true; } return Status::OK(); @@ -728,33 +714,25 @@ class TableWriter::TableWriterImpl : public ArrayVisitor { Status AppendPrimitive(const PrimitiveArray& values, ArrayMetadata* out); }; -TableWriter::TableWriter() { - impl_.reset(new TableWriterImpl()); -} +TableWriter::TableWriter() { impl_.reset(new TableWriterImpl()); } TableWriter::~TableWriter() {} -Status TableWriter::Open( - const std::shared_ptr& stream, std::unique_ptr* out) { +Status TableWriter::Open(const std::shared_ptr& stream, + std::unique_ptr* out) { out->reset(new TableWriter()); return (*out)->impl_->Open(stream); } -void TableWriter::SetDescription(const std::string& desc) { - 
impl_->SetDescription(desc); -} +void TableWriter::SetDescription(const std::string& desc) { impl_->SetDescription(desc); } -void TableWriter::SetNumRows(int64_t num_rows) { - impl_->SetNumRows(num_rows); -} +void TableWriter::SetNumRows(int64_t num_rows) { impl_->SetNumRows(num_rows); } Status TableWriter::Append(const std::string& name, const Array& values) { return impl_->Append(name, values); } -Status TableWriter::Finalize() { - return impl_->Finalize(); -} +Status TableWriter::Finalize() { return impl_->Finalize(); } } // namespace feather } // namespace ipc diff --git a/cpp/src/arrow/ipc/feather.h b/cpp/src/arrow/ipc/feather.h index 4d59a8bbd54a9..8abcb5c0f2599 100644 --- a/cpp/src/arrow/ipc/feather.h +++ b/cpp/src/arrow/ipc/feather.h @@ -56,7 +56,7 @@ class ARROW_EXPORT TableReader { ~TableReader(); static Status Open(const std::shared_ptr& source, - std::unique_ptr* out); + std::unique_ptr* out); // Optional table description // @@ -83,8 +83,8 @@ class ARROW_EXPORT TableWriter { public: ~TableWriter(); - static Status Open( - const std::shared_ptr& stream, std::unique_ptr* out); + static Status Open(const std::shared_ptr& stream, + std::unique_ptr* out); void SetDescription(const std::string& desc); void SetNumRows(int64_t num_rows); diff --git a/cpp/src/arrow/ipc/file-to-stream.cc b/cpp/src/arrow/ipc/file-to-stream.cc index a1feedc212618..4707c4fcdf0f7 100644 --- a/cpp/src/arrow/ipc/file-to-stream.cc +++ b/cpp/src/arrow/ipc/file-to-stream.cc @@ -15,11 +15,11 @@ // specific language governing permissions and limitations // under the License. 
+#include #include "arrow/io/file.h" #include "arrow/ipc/reader.h" #include "arrow/ipc/writer.h" #include "arrow/status.h" -#include #include "arrow/util/io-util.h" diff --git a/cpp/src/arrow/ipc/ipc-json-test.cc b/cpp/src/arrow/ipc/ipc-json-test.cc index 79344df46b243..35264fa02c5ba 100644 --- a/cpp/src/arrow/ipc/ipc-json-test.cc +++ b/cpp/src/arrow/ipc/ipc-json-test.cc @@ -77,7 +77,9 @@ void TestArrayRoundTrip(const Array& array) { rj::Document d; d.Parse(array_as_json); - if (d.HasParseError()) { FAIL() << "JSON parsing failed"; } + if (d.HasParseError()) { + FAIL() << "JSON parsing failed"; + } std::shared_ptr out; ASSERT_OK(internal::ReadArray(default_memory_pool(), d, array.type(), &out)); @@ -88,7 +90,8 @@ void TestArrayRoundTrip(const Array& array) { template void CheckPrimitive(const std::shared_ptr& type, - const std::vector& is_valid, const std::vector& values) { + const std::vector& is_valid, + const std::vector& values) { MemoryPool* pool = default_memory_pool(); typename TypeTraits::BuilderType builder(pool); @@ -108,16 +111,17 @@ void CheckPrimitive(const std::shared_ptr& type, TEST(TestJsonSchemaWriter, FlatTypes) { // TODO // field("f14", date32()) - std::vector> fields = {field("f0", int8()), - field("f1", int16(), false), field("f2", int32()), field("f3", int64(), false), - field("f4", uint8()), field("f5", uint16()), field("f6", uint32()), - field("f7", uint64()), field("f8", float32()), field("f9", float64()), - field("f10", utf8()), field("f11", binary()), field("f12", list(int32())), + std::vector> fields = { + field("f0", int8()), field("f1", int16(), false), field("f2", int32()), + field("f3", int64(), false), field("f4", uint8()), field("f5", uint16()), + field("f6", uint32()), field("f7", uint64()), field("f8", float32()), + field("f9", float64()), field("f10", utf8()), field("f11", binary()), + field("f12", list(int32())), field("f13", struct_({field("s1", int32()), field("s2", utf8())})), field("f15", date64()), field("f16", 
timestamp(TimeUnit::NANO)), field("f17", time64(TimeUnit::MICRO)), field("f18", union_({field("u1", int8()), field("u2", time32(TimeUnit::MILLI))}, - {0, 1}, UnionMode::DENSE))}; + {0, 1}, UnionMode::DENSE))}; Schema schema(fields); TestSchemaRoundTrip(schema); @@ -185,8 +189,8 @@ TEST(TestJsonArrayWriter, NestedTypes) { struct_({field("f1", int32()), field("f2", int32()), field("f3", int32())}); std::vector> fields = {values_array, values_array, values_array}; - StructArray struct_array( - struct_type, static_cast(struct_is_valid.size()), fields, struct_bitmap, 2); + StructArray struct_array(struct_type, static_cast(struct_is_valid.size()), fields, + struct_bitmap, 2); TestArrayRoundTrip(struct_array); } @@ -202,7 +206,7 @@ TEST(TestJsonArrayWriter, Unions) { // Data generation for test case below void MakeBatchArrays(const std::shared_ptr& schema, const int num_rows, - std::vector>* arrays) { + std::vector>* arrays) { std::vector is_valid; test::random_is_valid(num_rows, 0.25, &is_valid); @@ -266,8 +270,8 @@ TEST(TestJsonFileReadWrite, BasicRoundTrip) { std::unique_ptr reader; - auto buffer = std::make_shared( - reinterpret_cast(result.c_str()), static_cast(result.size())); + auto buffer = std::make_shared(reinterpret_cast(result.c_str()), + static_cast(result.size())); ASSERT_OK(JsonReader::Open(buffer, &reader)); ASSERT_TRUE(reader->schema()->Equals(*schema)); @@ -332,8 +336,8 @@ TEST(TestJsonFileReadWrite, MinimalFormatExample) { } )example"; - auto buffer = std::make_shared( - reinterpret_cast(example), strlen(example)); + auto buffer = std::make_shared(reinterpret_cast(example), + strlen(example)); std::unique_ptr reader; ASSERT_OK(JsonReader::Open(buffer, &reader)); @@ -361,9 +365,9 @@ TEST(TestJsonFileReadWrite, MinimalFormatExample) { #define BATCH_CASES() \ ::testing::Values(&MakeIntRecordBatch, &MakeListRecordBatch, &MakeNonNullRecordBatch, \ - &MakeZeroLengthRecordBatch, &MakeDeeplyNestedList, &MakeStringTypesRecordBatch, \ - &MakeStruct, &MakeUnion, 
&MakeDates, &MakeTimestamps, &MakeTimes, &MakeFWBinary, \ - &MakeDictionary); + &MakeZeroLengthRecordBatch, &MakeDeeplyNestedList, \ + &MakeStringTypesRecordBatch, &MakeStruct, &MakeUnion, &MakeDates, \ + &MakeTimestamps, &MakeTimes, &MakeFWBinary, &MakeDictionary); class TestJsonRoundTrip : public ::testing::TestWithParam { public: @@ -382,7 +386,7 @@ void CheckRoundtrip(const RecordBatch& batch) { ASSERT_OK(writer->Finish(&result)); auto buffer = std::make_shared(reinterpret_cast(result.c_str()), - static_cast(result.size())); + static_cast(result.size())); std::unique_ptr reader; ASSERT_OK(JsonReader::Open(buffer, &reader)); diff --git a/cpp/src/arrow/ipc/ipc-read-write-benchmark.cc b/cpp/src/arrow/ipc/ipc-read-write-benchmark.cc index c890d829849fd..a88120a248d2d 100644 --- a/cpp/src/arrow/ipc/ipc-read-write-benchmark.cc +++ b/cpp/src/arrow/ipc/ipc-read-write-benchmark.cc @@ -80,7 +80,7 @@ static void BM_WriteRecordBatch(benchmark::State& state) { // NOLINT non-const int32_t metadata_length; int64_t body_length; if (!ipc::WriteRecordBatch(*record_batch, 0, &stream, &metadata_length, &body_length, - default_memory_pool()) + default_memory_pool()) .ok()) { state.SkipWithError("Failed to write!"); } @@ -101,7 +101,7 @@ static void BM_ReadRecordBatch(benchmark::State& state) { // NOLINT non-const r int32_t metadata_length; int64_t body_length; if (!ipc::WriteRecordBatch(*record_batch, 0, &stream, &metadata_length, &body_length, - default_memory_pool()) + default_memory_pool()) .ok()) { state.SkipWithError("Failed to write!"); } diff --git a/cpp/src/arrow/ipc/ipc-read-write-test.cc b/cpp/src/arrow/ipc/ipc-read-write-test.cc index 2119ff74056f2..6c7051750b7cb 100644 --- a/cpp/src/arrow/ipc/ipc-read-write-test.cc +++ b/cpp/src/arrow/ipc/ipc-read-write-test.cc @@ -126,40 +126,45 @@ TEST_F(TestSchemaMetadata, NestedFields) { CheckRoundtrip(schema, &memo); } -#define BATCH_CASES() \ - ::testing::Values(&MakeIntRecordBatch, &MakeListRecordBatch, &MakeNonNullRecordBatch, \ 
- &MakeZeroLengthRecordBatch, &MakeDeeplyNestedList, &MakeStringTypesRecordBatch, \ - &MakeStruct, &MakeUnion, &MakeDictionary, &MakeDates, &MakeTimestamps, &MakeTimes, \ - &MakeFWBinary, &MakeBooleanBatch); +#define BATCH_CASES() \ + ::testing::Values(&MakeIntRecordBatch, &MakeListRecordBatch, &MakeNonNullRecordBatch, \ + &MakeZeroLengthRecordBatch, &MakeDeeplyNestedList, \ + &MakeStringTypesRecordBatch, &MakeStruct, &MakeUnion, \ + &MakeDictionary, &MakeDates, &MakeTimestamps, &MakeTimes, \ + &MakeFWBinary, &MakeBooleanBatch); static int g_file_number = 0; class IpcTestFixture : public io::MemoryMapFixture { public: Status DoStandardRoundTrip(const RecordBatch& batch, bool zero_data, - std::shared_ptr* batch_result) { + std::shared_ptr* batch_result) { int32_t metadata_length; int64_t body_length; const int64_t buffer_offset = 0; - if (zero_data) { RETURN_NOT_OK(ZeroMemoryMap(mmap_.get())); } + if (zero_data) { + RETURN_NOT_OK(ZeroMemoryMap(mmap_.get())); + } RETURN_NOT_OK(mmap_->Seek(0)); - RETURN_NOT_OK(WriteRecordBatch( - batch, buffer_offset, mmap_.get(), &metadata_length, &body_length, pool_)); + RETURN_NOT_OK(WriteRecordBatch(batch, buffer_offset, mmap_.get(), &metadata_length, + &body_length, pool_)); std::unique_ptr message; RETURN_NOT_OK(ReadMessage(0, metadata_length, mmap_.get(), &message)); io::BufferReader buffer_reader(message->body()); - return ReadRecordBatch( - *message->metadata(), batch.schema(), &buffer_reader, batch_result); + return ReadRecordBatch(*message->metadata(), batch.schema(), &buffer_reader, + batch_result); } - Status DoLargeRoundTrip( - const RecordBatch& batch, bool zero_data, std::shared_ptr* result) { - if (zero_data) { RETURN_NOT_OK(ZeroMemoryMap(mmap_.get())); } + Status DoLargeRoundTrip(const RecordBatch& batch, bool zero_data, + std::shared_ptr* result) { + if (zero_data) { + RETURN_NOT_OK(ZeroMemoryMap(mmap_.get())); + } RETURN_NOT_OK(mmap_->Seek(0)); std::shared_ptr file_writer; @@ -244,8 +249,8 @@ 
TEST_F(TestIpcRoundTrip, MetadataVersion) { const int64_t buffer_offset = 0; - ASSERT_OK(WriteRecordBatch( - *batch, buffer_offset, mmap_.get(), &metadata_length, &body_length, pool_)); + ASSERT_OK(WriteRecordBatch(*batch, buffer_offset, mmap_.get(), &metadata_length, + &body_length, pool_)); std::unique_ptr message; ASSERT_OK(ReadMessage(0, metadata_length, mmap_.get(), &message)); @@ -258,7 +263,9 @@ TEST_P(TestIpcRoundTrip, SliceRoundTrip) { ASSERT_OK((*GetParam())(&batch)); // NOLINT clang-tidy gtest issue // Skip the zero-length case - if (batch->num_rows() < 2) { return; } + if (batch->num_rows() < 2) { + return; + } auto sliced_batch = batch->Slice(2, 10); CheckRoundtrip(*sliced_batch, 1 << 20); @@ -282,8 +289,9 @@ TEST_P(TestIpcRoundTrip, ZeroLengthArrays) { ASSERT_OK(AllocateBuffer(pool_, sizeof(int32_t), &value_offsets)); *reinterpret_cast(value_offsets->mutable_data()) = 0; - std::shared_ptr bin_array = std::make_shared(0, value_offsets, - std::make_shared(nullptr, 0), std::make_shared(nullptr, 0)); + std::shared_ptr bin_array = std::make_shared( + 0, value_offsets, std::make_shared(nullptr, 0), + std::make_shared(nullptr, 0)); // null value_offsets std::shared_ptr bin_array2 = std::make_shared(0, nullptr, nullptr); @@ -357,8 +365,8 @@ TEST_F(TestWriteRecordBatch, SliceTruncatesBuffers) { std::shared_ptr offsets_buffer; ASSERT_OK( test::CopyBufferFromVector(type_offsets, default_memory_pool(), &offsets_buffer)); - a1 = std::make_shared( - dense_union_type, a0->length(), struct_children, ids_buffer, offsets_buffer); + a1 = std::make_shared(dense_union_type, a0->length(), struct_children, + ids_buffer, offsets_buffer); CheckArray(a1); } @@ -367,8 +375,8 @@ void TestGetRecordBatchSize(std::shared_ptr batch) { int32_t mock_metadata_length = -1; int64_t mock_body_length = -1; int64_t size = -1; - ASSERT_OK(WriteRecordBatch( - *batch, 0, &mock, &mock_metadata_length, &mock_body_length, default_memory_pool())); + ASSERT_OK(WriteRecordBatch(*batch, 0, &mock, 
&mock_metadata_length, &mock_body_length, + default_memory_pool())); ASSERT_OK(GetRecordBatchSize(*batch, &size)); ASSERT_EQ(mock.GetExtentBytesWritten(), size); } @@ -398,8 +406,8 @@ class RecursionLimits : public ::testing::Test, public io::MemoryMapFixture { void TearDown() { io::MemoryMapFixture::TearDown(); } Status WriteToMmap(int recursion_level, bool override_level, int32_t* metadata_length, - int64_t* body_length, std::shared_ptr* batch, - std::shared_ptr* schema) { + int64_t* body_length, std::shared_ptr* batch, + std::shared_ptr* schema) { const int batch_length = 5; TypePtr type = int32(); std::shared_ptr array; @@ -425,10 +433,10 @@ class RecursionLimits : public ::testing::Test, public io::MemoryMapFixture { if (override_level) { return WriteRecordBatch(**batch, 0, mmap_.get(), metadata_length, body_length, - pool_, recursion_level + 1); + pool_, recursion_level + 1); } else { - return WriteRecordBatch( - **batch, 0, mmap_.get(), metadata_length, body_length, pool_); + return WriteRecordBatch(**batch, 0, mmap_.get(), metadata_length, body_length, + pool_); } } @@ -442,8 +450,8 @@ TEST_F(RecursionLimits, WriteLimit) { int64_t body_length = -1; std::shared_ptr schema; std::shared_ptr batch; - ASSERT_RAISES(Invalid, - WriteToMmap((1 << 8) + 1, false, &metadata_length, &body_length, &batch, &schema)); + ASSERT_RAISES(Invalid, WriteToMmap((1 << 8) + 1, false, &metadata_length, &body_length, + &batch, &schema)); } TEST_F(RecursionLimits, ReadLimit) { @@ -454,8 +462,8 @@ TEST_F(RecursionLimits, ReadLimit) { const int recursion_depth = 64; std::shared_ptr batch; - ASSERT_OK(WriteToMmap( - recursion_depth, true, &metadata_length, &body_length, &batch, &schema)); + ASSERT_OK(WriteToMmap(recursion_depth, true, &metadata_length, &body_length, &batch, + &schema)); std::unique_ptr message; ASSERT_OK(ReadMessage(0, metadata_length, mmap_.get(), &message)); @@ -472,16 +480,16 @@ TEST_F(RecursionLimits, StressLimit) { int64_t body_length = -1; std::shared_ptr schema; 
std::shared_ptr batch; - ASSERT_OK(WriteToMmap( - recursion_depth, true, &metadata_length, &body_length, &batch, &schema)); + ASSERT_OK(WriteToMmap(recursion_depth, true, &metadata_length, &body_length, &batch, + &schema)); std::unique_ptr message; ASSERT_OK(ReadMessage(0, metadata_length, mmap_.get(), &message)); io::BufferReader reader(message->body()); std::shared_ptr result; - ASSERT_OK(ReadRecordBatch( - *message->metadata(), schema, recursion_depth + 1, &reader, &result)); + ASSERT_OK(ReadRecordBatch(*message->metadata(), schema, recursion_depth + 1, &reader, + &result)); *it_works = result->Equals(*batch); }; @@ -568,8 +576,8 @@ class TestStreamFormat : public ::testing::TestWithParam { } void TearDown() {} - Status RoundTripHelper( - const RecordBatch& batch, std::vector>* out_batches) { + Status RoundTripHelper(const RecordBatch& batch, + std::vector>* out_batches) { // Write the file std::shared_ptr writer; RETURN_NOT_OK(RecordBatchStreamWriter::Open(sink_.get(), batch.schema(), &writer)); @@ -589,7 +597,9 @@ class TestStreamFormat : public ::testing::TestWithParam { std::shared_ptr chunk; while (true) { RETURN_NOT_OK(reader->ReadNextRecordBatch(&chunk)); - if (chunk == nullptr) { break; } + if (chunk == nullptr) { + break; + } out_batches->emplace_back(chunk); } return Status::OK(); @@ -747,8 +757,8 @@ TEST_F(TestTensorRoundTrip, NonContiguous) { int32_t metadata_length; int64_t body_length; ASSERT_OK(mmap_->Seek(0)); - ASSERT_RAISES( - Invalid, WriteTensor(tensor, mmap_.get(), &metadata_length, &body_length)); + ASSERT_RAISES(Invalid, + WriteTensor(tensor, mmap_.get(), &metadata_length, &body_length)); } } // namespace ipc diff --git a/cpp/src/arrow/ipc/json-integration-test.cc b/cpp/src/arrow/ipc/json-integration-test.cc index 18f5dfaf57098..035f7086e7e53 100644 --- a/cpp/src/arrow/ipc/json-integration-test.cc +++ b/cpp/src/arrow/ipc/json-integration-test.cc @@ -40,7 +40,8 @@ DEFINE_string(arrow, "", "Arrow file name"); DEFINE_string(json, "", "JSON 
file name"); -DEFINE_string(mode, "VALIDATE", +DEFINE_string( + mode, "VALIDATE", "Mode of integration testing tool (ARROW_TO_JSON, JSON_TO_ARROW, VALIDATE)"); DEFINE_bool(integration, false, "Run in integration test mode"); DEFINE_bool(verbose, true, "Verbose output"); @@ -55,8 +56,8 @@ bool file_exists(const char* path) { } // Convert JSON file to IPC binary format -static Status ConvertJsonToArrow( - const std::string& json_path, const std::string& arrow_path) { +static Status ConvertJsonToArrow(const std::string& json_path, + const std::string& arrow_path) { std::shared_ptr in_file; std::shared_ptr out_file; @@ -89,8 +90,8 @@ static Status ConvertJsonToArrow( } // Convert IPC binary format to JSON -static Status ConvertArrowToJson( - const std::string& arrow_path, const std::string& json_path) { +static Status ConvertArrowToJson(const std::string& arrow_path, + const std::string& json_path) { std::shared_ptr in_file; std::shared_ptr out_file; @@ -116,11 +117,11 @@ static Status ConvertArrowToJson( std::string result; RETURN_NOT_OK(writer->Finish(&result)); return out_file->Write(reinterpret_cast(result.c_str()), - static_cast(result.size())); + static_cast(result.size())); } -static Status ValidateArrowVsJson( - const std::string& arrow_path, const std::string& json_path) { +static Status ValidateArrowVsJson(const std::string& arrow_path, + const std::string& json_path) { // Construct JSON reader std::shared_ptr json_file; RETURN_NOT_OK(io::ReadableFile::Open(json_path, &json_file)); @@ -151,7 +152,9 @@ static Status ValidateArrowVsJson( << "Arrow schema: \n" << arrow_schema->ToString(); - if (FLAGS_verbose) { std::cout << ss.str() << std::endl; } + if (FLAGS_verbose) { + std::cout << ss.str() << std::endl; + } return Status::Invalid("Schemas did not match"); } @@ -188,10 +191,14 @@ static Status ValidateArrowVsJson( } Status RunCommand(const std::string& json_path, const std::string& arrow_path, - const std::string& command) { - if (json_path == "") { return 
Status::Invalid("Must specify json file name"); } + const std::string& command) { + if (json_path == "") { + return Status::Invalid("Must specify json file name"); + } - if (arrow_path == "") { return Status::Invalid("Must specify arrow file name"); } + if (arrow_path == "") { + return Status::Invalid("Must specify arrow file name"); + } if (command == "ARROW_TO_JSON") { if (!file_exists(arrow_path.c_str())) { @@ -240,8 +247,8 @@ class TestJSONIntegration : public ::testing::Test { do { std::shared_ptr out; RETURN_NOT_OK(io::FileOutputStream::Open(path, &out)); - RETURN_NOT_OK(out->Write( - reinterpret_cast(data), static_cast(strlen(data)))); + RETURN_NOT_OK(out->Write(reinterpret_cast(data), + static_cast(strlen(data)))); } while (0); return Status::OK(); } diff --git a/cpp/src/arrow/ipc/json-internal.cc b/cpp/src/arrow/ipc/json-internal.cc index 69e4ae8d14a04..175d75b7d1e97 100644 --- a/cpp/src/arrow/ipc/json-internal.cc +++ b/cpp/src/arrow/ipc/json-internal.cc @@ -199,7 +199,7 @@ class SchemaWriter { typename std::enable_if::value || std::is_base_of::value || std::is_base_of::value, - void>::type + void>::type WriteTypeMetadata(const T& type) {} void WriteTypeMetadata(const Integer& type) { @@ -508,7 +508,7 @@ class ArrayWriter { } Status WriteChildren(const std::vector>& fields, - const std::vector>& arrays) { + const std::vector>& arrays) { writer_->Key("children"); writer_->StartArray(); for (size_t i = 0; i < fields.size(); ++i) { @@ -602,16 +602,16 @@ static Status GetObjectBool(const RjObject& obj, const std::string& key, bool* o return Status::OK(); } -static Status GetObjectString( - const RjObject& obj, const std::string& key, std::string* out) { +static Status GetObjectString(const RjObject& obj, const std::string& key, + std::string* out) { const auto& it = obj.FindMember(key); RETURN_NOT_STRING(key, it, obj); *out = it->value.GetString(); return Status::OK(); } -static Status GetInteger( - const rj::Value::ConstObject& json_type, std::shared_ptr* 
type) { +static Status GetInteger(const rj::Value::ConstObject& json_type, + std::shared_ptr* type) { const auto& it_bit_width = json_type.FindMember("bitWidth"); RETURN_NOT_INT("bitWidth", it_bit_width, json_type); @@ -642,8 +642,8 @@ static Status GetInteger( return Status::OK(); } -static Status GetFloatingPoint( - const RjObject& json_type, std::shared_ptr* type) { +static Status GetFloatingPoint(const RjObject& json_type, + std::shared_ptr* type) { const auto& it_precision = json_type.FindMember("precision"); RETURN_NOT_STRING("precision", it_precision, json_type); @@ -663,8 +663,8 @@ static Status GetFloatingPoint( return Status::OK(); } -static Status GetFixedSizeBinary( - const RjObject& json_type, std::shared_ptr* type) { +static Status GetFixedSizeBinary(const RjObject& json_type, + std::shared_ptr* type) { const auto& it_byte_width = json_type.FindMember("byteWidth"); RETURN_NOT_INT("byteWidth", it_byte_width, json_type); @@ -756,8 +756,8 @@ static Status GetTimestamp(const RjObject& json_type, std::shared_ptr* } static Status GetUnion(const RjObject& json_type, - const std::vector>& children, - std::shared_ptr* type) { + const std::vector>& children, + std::shared_ptr* type) { const auto& it_mode = json_type.FindMember("mode"); RETURN_NOT_STRING("mode", it_mode, json_type); @@ -790,8 +790,8 @@ static Status GetUnion(const RjObject& json_type, } static Status GetType(const RjObject& json_type, - const std::vector>& children, - std::shared_ptr* type) { + const std::vector>& children, + std::shared_ptr* type) { const auto& it_type_name = json_type.FindMember("name"); RETURN_NOT_STRING("name", it_type_name, json_type); @@ -831,10 +831,11 @@ static Status GetType(const RjObject& json_type, } static Status GetField(const rj::Value& obj, const DictionaryMemo* dictionary_memo, - std::shared_ptr* field); + std::shared_ptr* field); static Status GetFieldsFromArray(const rj::Value& obj, - const DictionaryMemo* dictionary_memo, std::vector>* fields) { + const 
DictionaryMemo* dictionary_memo, + std::vector>* fields) { const auto& values = obj.GetArray(); fields->resize(values.Size()); @@ -845,7 +846,7 @@ static Status GetFieldsFromArray(const rj::Value& obj, } static Status ParseDictionary(const RjObject& obj, int64_t* id, bool* is_ordered, - std::shared_ptr* index_type) { + std::shared_ptr* index_type) { int32_t int32_id; RETURN_NOT_OK(GetObjectInt(obj, "id", &int32_id)); *id = int32_id; @@ -866,8 +867,10 @@ static Status ParseDictionary(const RjObject& obj, int64_t* id, bool* is_ordered } static Status GetField(const rj::Value& obj, const DictionaryMemo* dictionary_memo, - std::shared_ptr* field) { - if (!obj.IsObject()) { return Status::Invalid("Field was not a JSON object"); } + std::shared_ptr* field) { + if (!obj.IsObject()) { + return Status::Invalid("Field was not a JSON object"); + } const auto& json_field = obj.GetObject(); std::string name; @@ -884,8 +887,8 @@ static Status GetField(const rj::Value& obj, const DictionaryMemo* dictionary_me int64_t dictionary_id; bool is_ordered; std::shared_ptr index_type; - RETURN_NOT_OK(ParseDictionary( - it_dictionary->value.GetObject(), &dictionary_id, &is_ordered, &index_type)); + RETURN_NOT_OK(ParseDictionary(it_dictionary->value.GetObject(), &dictionary_id, + &is_ordered, &index_type)); std::shared_ptr dictionary; RETURN_NOT_OK(dictionary_memo->GetDictionary(dictionary_id, &dictionary)); @@ -941,13 +944,13 @@ UnboxValue(const rj::Value& val) { class ArrayReader { public: explicit ArrayReader(const rj::Value& json_array, const std::shared_ptr& type, - MemoryPool* pool) + MemoryPool* pool) : json_array_(json_array), type_(type), pool_(pool) {} Status ParseTypeValues(const DataType& type); Status GetValidityBuffer(const std::vector& is_valid, int32_t* null_count, - std::shared_ptr* validity_buffer) { + std::shared_ptr* validity_buffer) { int length = static_cast(is_valid.size()); std::shared_ptr out_buffer; @@ -1024,7 +1027,9 @@ class ArrayReader { DCHECK(hex_string.size() 
% 2 == 0) << "Expected base16 hex string"; int32_t length = static_cast(hex_string.size()) / 2; - if (byte_buffer->size() < length) { RETURN_NOT_OK(byte_buffer->Resize(length)); } + if (byte_buffer->size() < length) { + RETURN_NOT_OK(byte_buffer->Resize(length)); + } const char* hex_data = hex_string.c_str(); uint8_t* byte_buffer_data = byte_buffer->mutable_data(); @@ -1078,8 +1083,8 @@ class ArrayReader { } template - Status GetIntArray( - const RjArray& json_array, const int32_t length, std::shared_ptr* out) { + Status GetIntArray(const RjArray& json_array, const int32_t length, + std::shared_ptr* out) { std::shared_ptr buffer; RETURN_NOT_OK(AllocateBuffer(pool_, length * sizeof(T), &buffer)); @@ -1102,15 +1107,15 @@ class ArrayReader { const auto& json_offsets = obj_->FindMember("OFFSET"); RETURN_NOT_ARRAY("OFFSET", json_offsets, *obj_); std::shared_ptr offsets_buffer; - RETURN_NOT_OK(GetIntArray( - json_offsets->value.GetArray(), length_ + 1, &offsets_buffer)); + RETURN_NOT_OK(GetIntArray(json_offsets->value.GetArray(), length_ + 1, + &offsets_buffer)); std::vector> children; RETURN_NOT_OK(GetChildren(*obj_, type, &children)); DCHECK_EQ(children.size(), 1); - result_ = std::make_shared( - type_, length_, offsets_buffer, children[0], validity_buffer, null_count); + result_ = std::make_shared(type_, length_, offsets_buffer, children[0], + validity_buffer, null_count); return Status::OK(); } @@ -1123,8 +1128,8 @@ class ArrayReader { std::vector> fields; RETURN_NOT_OK(GetChildren(*obj_, type, &fields)); - result_ = std::make_shared( - type_, length_, fields, validity_buffer, null_count); + result_ = std::make_shared(type_, length_, fields, validity_buffer, + null_count); return Status::OK(); } @@ -1154,7 +1159,7 @@ class ArrayReader { RETURN_NOT_OK(GetChildren(*obj_, type, &children)); result_ = std::make_shared(type_, length_, children, type_id_buffer, - offsets_buffer, validity_buffer, null_count); + offsets_buffer, validity_buffer, null_count); return 
Status::OK(); } @@ -1177,7 +1182,7 @@ class ArrayReader { } Status GetChildren(const RjObject& obj, const DataType& type, - std::vector>* array) { + std::vector>* array) { const auto& json_children = obj.FindMember("children"); RETURN_NOT_ARRAY("children", json_children, obj); const auto& json_children_arr = json_children->value.GetArray(); @@ -1280,7 +1285,8 @@ static Status GetDictionaryTypes(const RjArray& fields, DictionaryTypeMap* id_to } static Status ReadDictionary(const RjObject& obj, const DictionaryTypeMap& id_to_field, - MemoryPool* pool, int64_t* dictionary_id, std::shared_ptr* out) { + MemoryPool* pool, int64_t* dictionary_id, + std::shared_ptr* out) { int id; RETURN_NOT_OK(GetObjectInt(obj, "id", &id)); @@ -1312,7 +1318,7 @@ static Status ReadDictionary(const RjObject& obj, const DictionaryTypeMap& id_to } static Status ReadDictionaries(const rj::Value& doc, const DictionaryTypeMap& id_to_field, - MemoryPool* pool, DictionaryMemo* dictionary_memo) { + MemoryPool* pool, DictionaryMemo* dictionary_memo) { auto it = doc.FindMember("dictionaries"); if (it == doc.MemberEnd()) { // No dictionaries @@ -1334,8 +1340,8 @@ static Status ReadDictionaries(const rj::Value& doc, const DictionaryTypeMap& id return Status::OK(); } -Status ReadSchema( - const rj::Value& json_schema, MemoryPool* pool, std::shared_ptr* schema) { +Status ReadSchema(const rj::Value& json_schema, MemoryPool* pool, + std::shared_ptr* schema) { auto it = json_schema.FindMember("schema"); RETURN_NOT_OBJECT("schema", it, json_schema); const auto& obj_schema = it->value.GetObject(); @@ -1359,7 +1365,7 @@ Status ReadSchema( } Status ReadRecordBatch(const rj::Value& json_obj, const std::shared_ptr& schema, - MemoryPool* pool, std::shared_ptr* batch) { + MemoryPool* pool, std::shared_ptr* batch) { DCHECK(json_obj.IsObject()); const auto& batch_obj = json_obj.GetObject(); @@ -1409,14 +1415,16 @@ Status WriteArray(const std::string& name, const Array& array, RjWriter* json_wr } Status 
ReadArray(MemoryPool* pool, const rj::Value& json_array, - const std::shared_ptr& type, std::shared_ptr* array) { + const std::shared_ptr& type, std::shared_ptr* array) { ArrayReader converter(json_array, type, pool); return converter.GetArray(array); } Status ReadArray(MemoryPool* pool, const rj::Value& json_array, const Schema& schema, - std::shared_ptr* array) { - if (!json_array.IsObject()) { return Status::Invalid("Element was not a JSON object"); } + std::shared_ptr* array) { + if (!json_array.IsObject()) { + return Status::Invalid("Element was not a JSON object"); + } const auto& json_obj = json_array.GetObject(); diff --git a/cpp/src/arrow/ipc/json-internal.h b/cpp/src/arrow/ipc/json-internal.h index 5571d9233969c..9b641cd53329b 100644 --- a/cpp/src/arrow/ipc/json-internal.h +++ b/cpp/src/arrow/ipc/json-internal.h @@ -99,17 +99,17 @@ Status WriteSchema(const Schema& schema, RjWriter* writer); Status WriteRecordBatch(const RecordBatch& batch, RjWriter* writer); Status WriteArray(const std::string& name, const Array& array, RjWriter* writer); -Status ReadSchema( - const rj::Value& json_obj, MemoryPool* pool, std::shared_ptr* schema); +Status ReadSchema(const rj::Value& json_obj, MemoryPool* pool, + std::shared_ptr* schema); Status ReadRecordBatch(const rj::Value& json_obj, const std::shared_ptr& schema, - MemoryPool* pool, std::shared_ptr* batch); + MemoryPool* pool, std::shared_ptr* batch); Status ReadArray(MemoryPool* pool, const rj::Value& json_obj, - const std::shared_ptr& type, std::shared_ptr* array); + const std::shared_ptr& type, std::shared_ptr* array); Status ReadArray(MemoryPool* pool, const rj::Value& json_obj, const Schema& schema, - std::shared_ptr* array); + std::shared_ptr* array); } // namespace internal } // namespace json diff --git a/cpp/src/arrow/ipc/json.cc b/cpp/src/arrow/ipc/json.cc index 36e343e5fb5bc..f57101a31a97d 100644 --- a/cpp/src/arrow/ipc/json.cc +++ b/cpp/src/arrow/ipc/json.cc @@ -79,15 +79,13 @@ JsonWriter::JsonWriter(const 
std::shared_ptr& schema) { JsonWriter::~JsonWriter() {} -Status JsonWriter::Open( - const std::shared_ptr& schema, std::unique_ptr* writer) { +Status JsonWriter::Open(const std::shared_ptr& schema, + std::unique_ptr* writer) { *writer = std::unique_ptr(new JsonWriter(schema)); return (*writer)->impl_->Start(); } -Status JsonWriter::Finish(std::string* result) { - return impl_->Finish(result); -} +Status JsonWriter::Finish(std::string* result) { return impl_->Finish(result); } Status JsonWriter::WriteRecordBatch(const RecordBatch& batch) { return impl_->WriteRecordBatch(batch); @@ -103,8 +101,10 @@ class JsonReader::JsonReaderImpl { Status ParseAndReadSchema() { doc_.Parse(reinterpret_cast(data_->data()), - static_cast(data_->size())); - if (doc_.HasParseError()) { return Status::IOError("JSON parsing failed"); } + static_cast(data_->size())); + if (doc_.HasParseError()) { + return Status::IOError("JSON parsing failed"); + } RETURN_NOT_OK(json::internal::ReadSchema(doc_, pool_, &schema_)); @@ -120,8 +120,8 @@ class JsonReader::JsonReaderImpl { DCHECK_LT(i, static_cast(record_batches_->GetArray().Size())) << "i out of bounds"; - return json::internal::ReadRecordBatch( - record_batches_->GetArray()[i], schema_, pool_, batch); + return json::internal::ReadRecordBatch(record_batches_->GetArray()[i], schema_, pool_, + batch); } std::shared_ptr schema() const { return schema_; } @@ -145,24 +145,20 @@ JsonReader::JsonReader(MemoryPool* pool, const std::shared_ptr& data) { JsonReader::~JsonReader() {} -Status JsonReader::Open( - const std::shared_ptr& data, std::unique_ptr* reader) { +Status JsonReader::Open(const std::shared_ptr& data, + std::unique_ptr* reader) { return Open(default_memory_pool(), data, reader); } Status JsonReader::Open(MemoryPool* pool, const std::shared_ptr& data, - std::unique_ptr* reader) { + std::unique_ptr* reader) { *reader = std::unique_ptr(new JsonReader(pool, data)); return (*reader)->impl_->ParseAndReadSchema(); } -std::shared_ptr 
JsonReader::schema() const { - return impl_->schema(); -} +std::shared_ptr JsonReader::schema() const { return impl_->schema(); } -int JsonReader::num_record_batches() const { - return impl_->num_record_batches(); -} +int JsonReader::num_record_batches() const { return impl_->num_record_batches(); } Status JsonReader::ReadRecordBatch(int i, std::shared_ptr* batch) const { return impl_->ReadRecordBatch(i, batch); diff --git a/cpp/src/arrow/ipc/json.h b/cpp/src/arrow/ipc/json.h index 2ba27c7f2c37d..be26f0233ebeb 100644 --- a/cpp/src/arrow/ipc/json.h +++ b/cpp/src/arrow/ipc/json.h @@ -41,8 +41,8 @@ class ARROW_EXPORT JsonWriter { public: ~JsonWriter(); - static Status Open( - const std::shared_ptr& schema, std::unique_ptr* out); + static Status Open(const std::shared_ptr& schema, + std::unique_ptr* out); Status WriteRecordBatch(const RecordBatch& batch); Status Finish(std::string* result); @@ -61,11 +61,11 @@ class ARROW_EXPORT JsonReader { ~JsonReader(); static Status Open(MemoryPool* pool, const std::shared_ptr& data, - std::unique_ptr* reader); + std::unique_ptr* reader); // Use the default memory pool - static Status Open( - const std::shared_ptr& data, std::unique_ptr* reader); + static Status Open(const std::shared_ptr& data, + std::unique_ptr* reader); std::shared_ptr schema() const; diff --git a/cpp/src/arrow/ipc/metadata.cc b/cpp/src/arrow/ipc/metadata.cc index 49c24c72727b7..20fd280db6de6 100644 --- a/cpp/src/arrow/ipc/metadata.cc +++ b/cpp/src/arrow/ipc/metadata.cc @@ -58,8 +58,8 @@ static constexpr flatbuf::MetadataVersion kCurrentMetadataVersion = static constexpr flatbuf::MetadataVersion kMinMetadataVersion = flatbuf::MetadataVersion_V3; -static Status IntFromFlatbuffer( - const flatbuf::Int* int_data, std::shared_ptr* out) { +static Status IntFromFlatbuffer(const flatbuf::Int* int_data, + std::shared_ptr* out) { if (int_data->bitWidth() > 64) { return Status::NotImplemented("Integers with more than 64 bits not implemented"); } @@ -86,8 +86,8 @@ static 
Status IntFromFlatbuffer( return Status::OK(); } -static Status FloatFromFlatuffer( - const flatbuf::FloatingPoint* float_data, std::shared_ptr* out) { +static Status FloatFromFlatuffer(const flatbuf::FloatingPoint* float_data, + std::shared_ptr* out) { if (float_data->precision() == flatbuf::Precision_HALF) { *out = float16(); } else if (float_data->precision() == flatbuf::Precision_SINGLE) { @@ -100,7 +100,7 @@ static Status FloatFromFlatuffer( // Forward declaration static Status FieldToFlatbuffer(FBB& fbb, const std::shared_ptr& field, - DictionaryMemo* dictionary_memo, FieldOffset* offset); + DictionaryMemo* dictionary_memo, FieldOffset* offset); static Offset IntToFlatbuffer(FBB& fbb, int bitWidth, bool is_signed) { return flatbuf::CreateInt(fbb, bitWidth, is_signed).Union(); @@ -111,7 +111,8 @@ static Offset FloatToFlatbuffer(FBB& fbb, flatbuf::Precision precision) { } static Status AppendChildFields(FBB& fbb, const std::shared_ptr& type, - std::vector* out_children, DictionaryMemo* dictionary_memo) { + std::vector* out_children, + DictionaryMemo* dictionary_memo) { FieldOffset field; for (int i = 0; i < type->num_children(); ++i) { RETURN_NOT_OK(FieldToFlatbuffer(fbb, type->child(i), dictionary_memo, &field)); @@ -121,16 +122,16 @@ static Status AppendChildFields(FBB& fbb, const std::shared_ptr& type, } static Status ListToFlatbuffer(FBB& fbb, const std::shared_ptr& type, - std::vector* out_children, DictionaryMemo* dictionary_memo, - Offset* offset) { + std::vector* out_children, + DictionaryMemo* dictionary_memo, Offset* offset) { RETURN_NOT_OK(AppendChildFields(fbb, type, out_children, dictionary_memo)); *offset = flatbuf::CreateList(fbb).Union(); return Status::OK(); } static Status StructToFlatbuffer(FBB& fbb, const std::shared_ptr& type, - std::vector* out_children, DictionaryMemo* dictionary_memo, - Offset* offset) { + std::vector* out_children, + DictionaryMemo* dictionary_memo, Offset* offset) { RETURN_NOT_OK(AppendChildFields(fbb, type, 
out_children, dictionary_memo)); *offset = flatbuf::CreateStruct_(fbb).Union(); return Status::OK(); @@ -140,7 +141,8 @@ static Status StructToFlatbuffer(FBB& fbb, const std::shared_ptr& type // Union implementation static Status UnionFromFlatbuffer(const flatbuf::Union* union_data, - const std::vector>& children, std::shared_ptr* out) { + const std::vector>& children, + std::shared_ptr* out) { UnionMode mode = union_data->mode() == flatbuf::UnionMode_Sparse ? UnionMode::SPARSE : UnionMode::DENSE; @@ -163,8 +165,8 @@ static Status UnionFromFlatbuffer(const flatbuf::Union* union_data, } static Status UnionToFlatBuffer(FBB& fbb, const std::shared_ptr& type, - std::vector* out_children, DictionaryMemo* dictionary_memo, - Offset* offset) { + std::vector* out_children, + DictionaryMemo* dictionary_memo, Offset* offset) { RETURN_NOT_OK(AppendChildFields(fbb, type, out_children, dictionary_memo)); const auto& union_type = static_cast(*type); @@ -224,15 +226,16 @@ static inline TimeUnit::type FromFlatbufferUnit(flatbuf::TimeUnit unit) { } static Status TypeFromFlatbuffer(flatbuf::Type type, const void* type_data, - const std::vector>& children, std::shared_ptr* out) { + const std::vector>& children, + std::shared_ptr* out) { switch (type) { case flatbuf::Type_NONE: return Status::Invalid("Type metadata cannot be none"); case flatbuf::Type_Int: return IntFromFlatbuffer(static_cast(type_data), out); case flatbuf::Type_FloatingPoint: - return FloatFromFlatuffer( - static_cast(type_data), out); + return FloatFromFlatuffer(static_cast(type_data), + out); case flatbuf::Type_Binary: *out = binary(); return Status::OK(); @@ -301,8 +304,8 @@ static Status TypeFromFlatbuffer(flatbuf::Type type, const void* type_data, *out = std::make_shared(children); return Status::OK(); case flatbuf::Type_Union: - return UnionFromFlatbuffer( - static_cast(type_data), children, out); + return UnionFromFlatbuffer(static_cast(type_data), children, + out); default: return Status::Invalid("Unrecognized 
type"); } @@ -310,15 +313,17 @@ static Status TypeFromFlatbuffer(flatbuf::Type type, const void* type_data, // TODO(wesm): Convert this to visitor pattern static Status TypeToFlatbuffer(FBB& fbb, const std::shared_ptr& type, - std::vector* children, std::vector* layout, - flatbuf::Type* out_type, DictionaryMemo* dictionary_memo, Offset* offset) { + std::vector* children, + std::vector* layout, + flatbuf::Type* out_type, DictionaryMemo* dictionary_memo, + Offset* offset) { if (type->id() == Type::DICTIONARY) { // In this library, the dictionary "type" is a logical construct. Here we // pass through to the value type, as we've already captured the index // type in the DictionaryEncoding metadata in the parent field const auto& dict_type = static_cast(*type); return TypeToFlatbuffer(fbb, dict_type.dictionary()->type(), children, layout, - out_type, dictionary_memo, offset); + out_type, dictionary_memo, offset); } std::vector buffer_layout = type->GetBufferLayout(); @@ -436,7 +441,7 @@ static Status TypeToFlatbuffer(FBB& fbb, const std::shared_ptr& type, } static Status TensorTypeToFlatbuffer(FBB& fbb, const std::shared_ptr& type, - flatbuf::Type* out_type, Offset* offset) { + flatbuf::Type* out_type, Offset* offset) { switch (type->id()) { case Type::UINT8: INT_TO_FB_CASE(8, false); @@ -475,8 +480,8 @@ static Status TensorTypeToFlatbuffer(FBB& fbb, const std::shared_ptr& return Status::OK(); } -static DictionaryOffset GetDictionaryEncoding( - FBB& fbb, const DictionaryType& type, DictionaryMemo* memo) { +static DictionaryOffset GetDictionaryEncoding(FBB& fbb, const DictionaryType& type, + DictionaryMemo* memo) { int64_t dictionary_id = memo->GetId(type.dictionary()); // We assume that the dictionary index type (as an integer) has already been @@ -491,7 +496,7 @@ static DictionaryOffset GetDictionaryEncoding( } static Status FieldToFlatbuffer(FBB& fbb, const std::shared_ptr& field, - DictionaryMemo* dictionary_memo, FieldOffset* offset) { + DictionaryMemo* 
dictionary_memo, FieldOffset* offset) { auto fb_name = fbb.CreateString(field->name()); flatbuf::Type type_enum; @@ -500,8 +505,8 @@ static Status FieldToFlatbuffer(FBB& fbb, const std::shared_ptr& field, std::vector children; std::vector layout; - RETURN_NOT_OK(TypeToFlatbuffer( - fbb, field->type(), &children, &layout, &type_enum, dictionary_memo, &type_offset)); + RETURN_NOT_OK(TypeToFlatbuffer(fbb, field->type(), &children, &layout, &type_enum, + dictionary_memo, &type_offset)); auto fb_children = fbb.CreateVector(children); auto fb_layout = fbb.CreateVector(layout); @@ -513,13 +518,14 @@ static Status FieldToFlatbuffer(FBB& fbb, const std::shared_ptr& field, // TODO: produce the list of VectorTypes *offset = flatbuf::CreateField(fbb, fb_name, field->nullable(), type_enum, type_offset, - dictionary, fb_children, fb_layout); + dictionary, fb_children, fb_layout); return Status::OK(); } static Status FieldFromFlatbuffer(const flatbuf::Field* field, - const DictionaryMemo& dictionary_memo, std::shared_ptr* out) { + const DictionaryMemo& dictionary_memo, + std::shared_ptr* out) { std::shared_ptr type; const flatbuf::DictionaryEncoding* encoding = field->dictionary(); @@ -551,8 +557,8 @@ static Status FieldFromFlatbuffer(const flatbuf::Field* field, return Status::OK(); } -static Status FieldFromFlatbufferDictionary( - const flatbuf::Field* field, std::shared_ptr* out) { +static Status FieldFromFlatbufferDictionary(const flatbuf::Field* field, + std::shared_ptr* out) { // Need an empty memo to pass down for constructing children DictionaryMemo dummy_memo; @@ -584,7 +590,8 @@ flatbuf::Endianness endianness() { } static Status SchemaToFlatbuffer(FBB& fbb, const Schema& schema, - DictionaryMemo* dictionary_memo, flatbuffers::Offset* out) { + DictionaryMemo* dictionary_memo, + flatbuffers::Offset* out) { /// Fields std::vector field_offsets; for (int i = 0; i < schema.num_fields(); ++i) { @@ -609,8 +616,8 @@ static Status SchemaToFlatbuffer(FBB& fbb, const Schema& 
schema, key_value_offsets.push_back( flatbuf::CreateKeyValue(fbb, fbb.CreateString(key), fbb.CreateString(value))); } - *out = flatbuf::CreateSchema( - fbb, endianness(), fb_offsets, fbb.CreateVector(key_value_offsets)); + *out = flatbuf::CreateSchema(fbb, endianness(), fb_offsets, + fbb.CreateVector(key_value_offsets)); } else { *out = flatbuf::CreateSchema(fbb, endianness(), fb_offsets); } @@ -631,15 +638,16 @@ static Status WriteFlatbufferBuilder(FBB& fbb, std::shared_ptr* out) { } static Status WriteFBMessage(FBB& fbb, flatbuf::MessageHeader header_type, - flatbuffers::Offset header, int64_t body_length, std::shared_ptr* out) { - auto message = flatbuf::CreateMessage( - fbb, kCurrentMetadataVersion, header_type, header, body_length); + flatbuffers::Offset header, int64_t body_length, + std::shared_ptr* out) { + auto message = flatbuf::CreateMessage(fbb, kCurrentMetadataVersion, header_type, header, + body_length); fbb.Finish(message); return WriteFlatbufferBuilder(fbb, out); } -Status WriteSchemaMessage( - const Schema& schema, DictionaryMemo* dictionary_memo, std::shared_ptr* out) { +Status WriteSchemaMessage(const Schema& schema, DictionaryMemo* dictionary_memo, + std::shared_ptr* out) { FBB fbb; flatbuffers::Offset fb_schema; RETURN_NOT_OK(SchemaToFlatbuffer(fbb, schema, dictionary_memo, &fb_schema)); @@ -650,8 +658,8 @@ using FieldNodeVector = flatbuffers::Offset>; using BufferVector = flatbuffers::Offset>; -static Status WriteFieldNodes( - FBB& fbb, const std::vector& nodes, FieldNodeVector* out) { +static Status WriteFieldNodes(FBB& fbb, const std::vector& nodes, + FieldNodeVector* out) { std::vector fb_nodes; fb_nodes.reserve(nodes.size()); @@ -666,8 +674,8 @@ static Status WriteFieldNodes( return Status::OK(); } -static Status WriteBuffers( - FBB& fbb, const std::vector& buffers, BufferVector* out) { +static Status WriteBuffers(FBB& fbb, const std::vector& buffers, + BufferVector* out) { std::vector fb_buffers; fb_buffers.reserve(buffers.size()); @@ 
-680,8 +688,9 @@ static Status WriteBuffers( } static Status MakeRecordBatch(FBB& fbb, int64_t length, int64_t body_length, - const std::vector& nodes, const std::vector& buffers, - RecordBatchOffset* offset) { + const std::vector& nodes, + const std::vector& buffers, + RecordBatchOffset* offset) { FieldNodeVector fb_nodes; BufferVector fb_buffers; @@ -693,17 +702,18 @@ static Status MakeRecordBatch(FBB& fbb, int64_t length, int64_t body_length, } Status WriteRecordBatchMessage(int64_t length, int64_t body_length, - const std::vector& nodes, const std::vector& buffers, - std::shared_ptr* out) { + const std::vector& nodes, + const std::vector& buffers, + std::shared_ptr* out) { FBB fbb; RecordBatchOffset record_batch; RETURN_NOT_OK(MakeRecordBatch(fbb, length, body_length, nodes, buffers, &record_batch)); - return WriteFBMessage( - fbb, flatbuf::MessageHeader_RecordBatch, record_batch.Union(), body_length, out); + return WriteFBMessage(fbb, flatbuf::MessageHeader_RecordBatch, record_batch.Union(), + body_length, out); } -Status WriteTensorMessage( - const Tensor& tensor, int64_t buffer_start_offset, std::shared_ptr* out) { +Status WriteTensorMessage(const Tensor& tensor, int64_t buffer_start_offset, + std::shared_ptr* out) { using TensorDimOffset = flatbuffers::Offset; using TensorOffset = flatbuffers::Offset; @@ -727,19 +737,20 @@ Status WriteTensorMessage( TensorOffset fb_tensor = flatbuf::CreateTensor(fbb, fb_type_type, fb_type, fb_shape, fb_strides, &buffer); - return WriteFBMessage( - fbb, flatbuf::MessageHeader_Tensor, fb_tensor.Union(), body_length, out); + return WriteFBMessage(fbb, flatbuf::MessageHeader_Tensor, fb_tensor.Union(), + body_length, out); } Status WriteDictionaryMessage(int64_t id, int64_t length, int64_t body_length, - const std::vector& nodes, const std::vector& buffers, - std::shared_ptr* out) { + const std::vector& nodes, + const std::vector& buffers, + std::shared_ptr* out) { FBB fbb; RecordBatchOffset record_batch; 
RETURN_NOT_OK(MakeRecordBatch(fbb, length, body_length, nodes, buffers, &record_batch)); auto dictionary_batch = flatbuf::CreateDictionaryBatch(fbb, id, record_batch).Union(); - return WriteFBMessage( - fbb, flatbuf::MessageHeader_DictionaryBatch, dictionary_batch, body_length, out); + return WriteFBMessage(fbb, flatbuf::MessageHeader_DictionaryBatch, dictionary_batch, + body_length, out); } static flatbuffers::Offset> @@ -754,8 +765,8 @@ FileBlocksToFlatbuffer(FBB& fbb, const std::vector& blocks) { } Status WriteFileFooter(const Schema& schema, const std::vector& dictionaries, - const std::vector& record_batches, DictionaryMemo* dictionary_memo, - io::OutputStream* out) { + const std::vector& record_batches, + DictionaryMemo* dictionary_memo, io::OutputStream* out) { FBB fbb; flatbuffers::Offset fb_schema; @@ -764,8 +775,8 @@ Status WriteFileFooter(const Schema& schema, const std::vector& dicti auto fb_dictionaries = FileBlocksToFlatbuffer(fbb, dictionaries); auto fb_record_batches = FileBlocksToFlatbuffer(fbb, record_batches); - auto footer = flatbuf::CreateFooter( - fbb, kCurrentMetadataVersion, fb_schema, fb_dictionaries, fb_record_batches); + auto footer = flatbuf::CreateFooter(fbb, kCurrentMetadataVersion, fb_schema, + fb_dictionaries, fb_record_batches); fbb.Finish(footer); @@ -780,8 +791,8 @@ Status WriteFileFooter(const Schema& schema, const std::vector& dicti DictionaryMemo::DictionaryMemo() {} // Returns KeyError if dictionary not found -Status DictionaryMemo::GetDictionary( - int64_t id, std::shared_ptr* dictionary) const { +Status DictionaryMemo::GetDictionary(int64_t id, + std::shared_ptr* dictionary) const { auto it = id_to_dictionary_.find(id); if (it == id_to_dictionary_.end()) { std::stringstream ss; @@ -817,8 +828,8 @@ bool DictionaryMemo::HasDictionaryId(int64_t id) const { return it != id_to_dictionary_.end(); } -Status DictionaryMemo::AddDictionary( - int64_t id, const std::shared_ptr& dictionary) { +Status 
DictionaryMemo::AddDictionary(int64_t id, + const std::shared_ptr& dictionary) { if (HasDictionaryId(id)) { std::stringstream ss; ss << "Dictionary with id " << id << " already exists"; @@ -835,8 +846,8 @@ Status DictionaryMemo::AddDictionary( class Message::MessageImpl { public: - explicit MessageImpl( - const std::shared_ptr& metadata, const std::shared_ptr& body) + explicit MessageImpl(const std::shared_ptr& metadata, + const std::shared_ptr& body) : metadata_(metadata), message_(nullptr), body_(body) {} Status Open() { @@ -897,43 +908,35 @@ class Message::MessageImpl { std::shared_ptr body_; }; -Message::Message( - const std::shared_ptr& metadata, const std::shared_ptr& body) { +Message::Message(const std::shared_ptr& metadata, + const std::shared_ptr& body) { impl_.reset(new MessageImpl(metadata, body)); } Status Message::Open(const std::shared_ptr& metadata, - const std::shared_ptr& body, std::unique_ptr* out) { + const std::shared_ptr& body, std::unique_ptr* out) { out->reset(new Message(metadata, body)); return (*out)->impl_->Open(); } Message::~Message() {} -std::shared_ptr Message::body() const { - return impl_->body(); -} +std::shared_ptr Message::body() const { return impl_->body(); } -std::shared_ptr Message::metadata() const { - return impl_->metadata(); -} +std::shared_ptr Message::metadata() const { return impl_->metadata(); } -Message::Type Message::type() const { - return impl_->type(); -} +Message::Type Message::type() const { return impl_->type(); } -MetadataVersion Message::metadata_version() const { - return impl_->version(); -} +MetadataVersion Message::metadata_version() const { return impl_->version(); } -const void* Message::header() const { - return impl_->header(); -} +const void* Message::header() const { return impl_->header(); } bool Message::Equals(const Message& other) const { int64_t metadata_bytes = std::min(metadata()->size(), other.metadata()->size()); - if (!metadata()->Equals(*other.metadata(), metadata_bytes)) { return false; 
} + if (!metadata()->Equals(*other.metadata(), metadata_bytes)) { + return false; + } // Compare bodies, if they have them auto this_body = body(); @@ -1012,7 +1015,7 @@ Status GetDictionaryTypes(const void* opaque_schema, DictionaryTypeMap* id_to_fi } Status GetSchema(const void* opaque_schema, const DictionaryMemo& dictionary_memo, - std::shared_ptr* out) { + std::shared_ptr* out) { auto schema = static_cast(opaque_schema); int num_fields = static_cast(schema->fields()->size()); @@ -1036,8 +1039,8 @@ Status GetSchema(const void* opaque_schema, const DictionaryMemo& dictionary_mem } Status GetTensorMetadata(const Buffer& metadata, std::shared_ptr* type, - std::vector* shape, std::vector* strides, - std::vector* dim_names) { + std::vector* shape, std::vector* strides, + std::vector* dim_names) { auto message = flatbuf::GetMessage(metadata.data()); auto tensor = reinterpret_cast(message->header()); @@ -1068,7 +1071,8 @@ Status GetTensorMetadata(const Buffer& metadata, std::shared_ptr* type // Read and write messages static Status ReadFullMessage(const std::shared_ptr& metadata, - io::InputStream* stream, std::unique_ptr* message) { + io::InputStream* stream, + std::unique_ptr* message) { auto fb_message = flatbuf::GetMessage(metadata->data()); int64_t body_length = fb_message->bodyLength(); @@ -1087,7 +1091,7 @@ static Status ReadFullMessage(const std::shared_ptr& metadata, } Status ReadMessage(int64_t offset, int32_t metadata_length, io::RandomAccessFile* file, - std::unique_ptr* message) { + std::unique_ptr* message) { std::shared_ptr buffer; RETURN_NOT_OK(file->ReadAt(offset, metadata_length, &buffer)); @@ -1141,8 +1145,8 @@ InputStreamMessageReader::~InputStreamMessageReader() {} // ---------------------------------------------------------------------- // Implement message writing -Status WriteMessage( - const Buffer& message, io::OutputStream* file, int32_t* message_length) { +Status WriteMessage(const Buffer& message, io::OutputStream* file, + int32_t* 
message_length) { // Need to write 4 bytes (message size), the message, plus padding to // end on an 8-byte offset int64_t start_offset; @@ -1151,7 +1155,9 @@ Status WriteMessage( int32_t padded_message_length = static_cast(message.size()) + 4; const int32_t remainder = (padded_message_length + static_cast(start_offset)) % 8; - if (remainder != 0) { padded_message_length += 8 - remainder; } + if (remainder != 0) { + padded_message_length += 8 - remainder; + } // The returned message size includes the length prefix, the flatbuffer, // plus padding @@ -1167,7 +1173,9 @@ Status WriteMessage( // Write any padding int32_t padding = padded_message_length - static_cast(message.size()) - 4; - if (padding > 0) { RETURN_NOT_OK(file->Write(kPaddingBytes, padding)); } + if (padding > 0) { + RETURN_NOT_OK(file->Write(kPaddingBytes, padding)); + } return Status::OK(); } diff --git a/cpp/src/arrow/ipc/metadata.h b/cpp/src/arrow/ipc/metadata.h index 614f7a6a922cc..90e4defd6a300 100644 --- a/cpp/src/arrow/ipc/metadata.h +++ b/cpp/src/arrow/ipc/metadata.h @@ -133,11 +133,14 @@ Status GetDictionaryTypes(const void* opaque_schema, DictionaryTypeMap* id_to_fi // Construct a complete Schema from the message. 
May be expensive for very // large schemas if you are only interested in a few fields Status ARROW_EXPORT GetSchema(const void* opaque_schema, - const DictionaryMemo& dictionary_memo, std::shared_ptr* out); + const DictionaryMemo& dictionary_memo, + std::shared_ptr* out); Status ARROW_EXPORT GetTensorMetadata(const Buffer& metadata, - std::shared_ptr* type, std::vector* shape, - std::vector* strides, std::vector* dim_names); + std::shared_ptr* type, + std::vector* shape, + std::vector* strides, + std::vector* dim_names); /// \brief An IPC message including metadata and body class ARROW_EXPORT Message { @@ -157,7 +160,7 @@ class ARROW_EXPORT Message { /// \param[in] body a buffer containing the message body, which may be nullptr /// \param[out] out the created message static Status Open(const std::shared_ptr& metadata, - const std::shared_ptr& body, std::unique_ptr* out); + const std::shared_ptr& body, std::unique_ptr* out); /// \brief Write length-prefixed metadata and body to output stream /// @@ -242,22 +245,23 @@ class ARROW_EXPORT InputStreamMessageReader : public MessageReader { /// \param[out] message the message read /// \return Status success or failure Status ARROW_EXPORT ReadMessage(int64_t offset, int32_t metadata_length, - io::RandomAccessFile* file, std::unique_ptr* message); + io::RandomAccessFile* file, + std::unique_ptr* message); /// \brief Read encapulated RPC message (metadata and body) from InputStream /// /// Read length-prefixed message with as-yet unknown length. Returns nullptr if /// there are not enough bytes available or the message length is 0 (e.g. 
EOS /// in a stream) -Status ARROW_EXPORT ReadMessage( - io::InputStream* stream, std::unique_ptr* message); +Status ARROW_EXPORT ReadMessage(io::InputStream* stream, + std::unique_ptr* message); /// Write a serialized message metadata with a length-prefix and padding to an /// 8-byte offset /// /// -Status ARROW_EXPORT WriteMessage( - const Buffer& message, io::OutputStream* file, int32_t* message_length); +Status ARROW_EXPORT WriteMessage(const Buffer& message, io::OutputStream* file, + int32_t* message_length); // Serialize arrow::Schema as a Flatbuffer // @@ -266,23 +270,26 @@ Status ARROW_EXPORT WriteMessage( // dictionary ids // \param[out] out the serialized arrow::Buffer // \return Status outcome -Status ARROW_EXPORT WriteSchemaMessage( - const Schema& schema, DictionaryMemo* dictionary_memo, std::shared_ptr* out); +Status ARROW_EXPORT WriteSchemaMessage(const Schema& schema, + DictionaryMemo* dictionary_memo, + std::shared_ptr* out); Status ARROW_EXPORT WriteRecordBatchMessage(int64_t length, int64_t body_length, - const std::vector& nodes, const std::vector& buffers, - std::shared_ptr* out); + const std::vector& nodes, + const std::vector& buffers, + std::shared_ptr* out); -Status ARROW_EXPORT WriteTensorMessage( - const Tensor& tensor, int64_t buffer_start_offset, std::shared_ptr* out); +Status ARROW_EXPORT WriteTensorMessage(const Tensor& tensor, int64_t buffer_start_offset, + std::shared_ptr* out); Status WriteDictionaryMessage(int64_t id, int64_t length, int64_t body_length, - const std::vector& nodes, const std::vector& buffers, - std::shared_ptr* out); + const std::vector& nodes, + const std::vector& buffers, + std::shared_ptr* out); Status WriteFileFooter(const Schema& schema, const std::vector& dictionaries, - const std::vector& record_batches, DictionaryMemo* dictionary_memo, - io::OutputStream* out); + const std::vector& record_batches, + DictionaryMemo* dictionary_memo, io::OutputStream* out); } // namespace ipc } // namespace arrow diff --git 
a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index 88ab33087b637..8ae82804c3164 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -95,12 +95,12 @@ struct ArrayLoaderContext { }; static Status LoadArray(const std::shared_ptr& type, - ArrayLoaderContext* context, internal::ArrayData* out); + ArrayLoaderContext* context, internal::ArrayData* out); class ArrayLoader { public: ArrayLoader(const std::shared_ptr& type, internal::ArrayData* out, - ArrayLoaderContext* context) + ArrayLoaderContext* context) : type_(type), context_(context), out_(out) {} Status Load() { @@ -184,7 +184,7 @@ class ArrayLoader { typename std::enable_if::value && !std::is_base_of::value && !std::is_base_of::value, - Status>::type + Status>::type Visit(const T& type) { return LoadPrimitive(); } @@ -252,18 +252,18 @@ class ArrayLoader { }; static Status LoadArray(const std::shared_ptr& type, - ArrayLoaderContext* context, internal::ArrayData* out) { + ArrayLoaderContext* context, internal::ArrayData* out) { ArrayLoader loader(type, out, context); return loader.Load(); } Status ReadRecordBatch(const Buffer& metadata, const std::shared_ptr& schema, - io::RandomAccessFile* file, std::shared_ptr* out) { + io::RandomAccessFile* file, std::shared_ptr* out) { return ReadRecordBatch(metadata, schema, kMaxNestingDepth, file, out); } Status ReadRecordBatch(const Message& message, const std::shared_ptr& schema, - std::shared_ptr* out) { + std::shared_ptr* out) { io::BufferReader reader(message.body()); DCHECK_EQ(message.type(), Message::RECORD_BATCH); return ReadRecordBatch(*message.metadata(), schema, kMaxNestingDepth, &reader, out); @@ -273,8 +273,9 @@ Status ReadRecordBatch(const Message& message, const std::shared_ptr& sc // Array loading static Status LoadRecordBatchFromSource(const std::shared_ptr& schema, - int64_t num_rows, int max_recursion_depth, IpcComponentSource* source, - std::shared_ptr* out) { + int64_t num_rows, int max_recursion_depth, + 
IpcComponentSource* source, + std::shared_ptr* out) { ArrayLoaderContext context; context.source = source; context.field_index = 0; @@ -294,16 +295,17 @@ static Status LoadRecordBatchFromSource(const std::shared_ptr& schema, } static inline Status ReadRecordBatch(const flatbuf::RecordBatch* metadata, - const std::shared_ptr& schema, int max_recursion_depth, - io::RandomAccessFile* file, std::shared_ptr* out) { + const std::shared_ptr& schema, + int max_recursion_depth, io::RandomAccessFile* file, + std::shared_ptr* out) { IpcComponentSource source(metadata, file); - return LoadRecordBatchFromSource( - schema, metadata->length(), max_recursion_depth, &source, out); + return LoadRecordBatchFromSource(schema, metadata->length(), max_recursion_depth, + &source, out); } Status ReadRecordBatch(const Buffer& metadata, const std::shared_ptr& schema, - int max_recursion_depth, io::RandomAccessFile* file, - std::shared_ptr* out) { + int max_recursion_depth, io::RandomAccessFile* file, + std::shared_ptr* out) { auto message = flatbuf::GetMessage(metadata.data()); if (message->header_type() != flatbuf::MessageHeader_RecordBatch) { DCHECK_EQ(message->header_type(), flatbuf::MessageHeader_RecordBatch); @@ -313,7 +315,8 @@ Status ReadRecordBatch(const Buffer& metadata, const std::shared_ptr& sc } Status ReadDictionary(const Buffer& metadata, const DictionaryTypeMap& dictionary_types, - io::RandomAccessFile* file, int64_t* dictionary_id, std::shared_ptr* out) { + io::RandomAccessFile* file, int64_t* dictionary_id, + std::shared_ptr* out) { auto message = flatbuf::GetMessage(metadata.data()); auto dictionary_batch = reinterpret_cast(message->header()); @@ -347,7 +350,7 @@ Status ReadDictionary(const Buffer& metadata, const DictionaryTypeMap& dictionar } static Status ReadMessageAndValidate(MessageReader* reader, Message::Type expected_type, - bool allow_null, std::unique_ptr* message) { + bool allow_null, std::unique_ptr* message) { RETURN_NOT_OK(reader->ReadNextMessage(message)); 
if (!(*message) && !allow_null) { @@ -357,7 +360,9 @@ static Status ReadMessageAndValidate(MessageReader* reader, Message::Type expect return Status::Invalid(ss.str()); } - if ((*message) == nullptr) { return Status::OK(); } + if ((*message) == nullptr) { + return Status::OK(); + } if ((*message)->type() != expected_type) { std::stringstream ss; @@ -389,15 +394,15 @@ class RecordBatchStreamReader::RecordBatchStreamReaderImpl { Status ReadNextDictionary() { std::unique_ptr message; - RETURN_NOT_OK(ReadMessageAndValidate( - message_reader_.get(), Message::DICTIONARY_BATCH, false, &message)); + RETURN_NOT_OK(ReadMessageAndValidate(message_reader_.get(), Message::DICTIONARY_BATCH, + false, &message)); io::BufferReader reader(message->body()); std::shared_ptr dictionary; int64_t id; - RETURN_NOT_OK(ReadDictionary( - *message->metadata(), dictionary_types_, &reader, &id, &dictionary)); + RETURN_NOT_OK(ReadDictionary(*message->metadata(), dictionary_types_, &reader, &id, + &dictionary)); return dictionary_memo_.AddDictionary(id, dictionary); } @@ -420,8 +425,8 @@ class RecordBatchStreamReader::RecordBatchStreamReaderImpl { Status ReadNextRecordBatch(std::shared_ptr* batch) { std::unique_ptr message; - RETURN_NOT_OK(ReadMessageAndValidate( - message_reader_.get(), Message::RECORD_BATCH, true, &message)); + RETURN_NOT_OK(ReadMessageAndValidate(message_reader_.get(), Message::RECORD_BATCH, + true, &message)); if (message == nullptr) { // End of stream @@ -451,14 +456,14 @@ RecordBatchStreamReader::RecordBatchStreamReader() { RecordBatchStreamReader::~RecordBatchStreamReader() {} Status RecordBatchStreamReader::Open(std::unique_ptr message_reader, - std::shared_ptr* reader) { + std::shared_ptr* reader) { // Private ctor *reader = std::shared_ptr(new RecordBatchStreamReader()); return (*reader)->impl_->Open(std::move(message_reader)); } Status RecordBatchStreamReader::Open(const std::shared_ptr& stream, - std::shared_ptr* out) { + std::shared_ptr* out) { std::unique_ptr 
message_reader(new InputStreamMessageReader(stream)); return Open(std::move(message_reader), out); } @@ -502,8 +507,8 @@ class RecordBatchFileReader::RecordBatchFileReaderImpl { } // Now read the footer - RETURN_NOT_OK(file_->ReadAt( - footer_offset_ - footer_length - file_end_size, footer_length, &footer_buffer_)); + RETURN_NOT_OK(file_->ReadAt(footer_offset_ - footer_length - file_end_size, + footer_length, &footer_buffer_)); // TODO(wesm): Verify the footer footer_ = flatbuf::GetFooter(footer_buffer_->data()); @@ -568,7 +573,7 @@ class RecordBatchFileReader::RecordBatchFileReaderImpl { std::shared_ptr dictionary; int64_t dictionary_id; RETURN_NOT_OK(ReadDictionary(*message->metadata(), dictionary_fields_, &reader, - &dictionary_id, &dictionary)); + &dictionary_id, &dictionary)); RETURN_NOT_OK(dictionary_memo_->AddDictionary(dictionary_id, dictionary)); } @@ -610,37 +615,34 @@ RecordBatchFileReader::RecordBatchFileReader() { RecordBatchFileReader::~RecordBatchFileReader() {} Status RecordBatchFileReader::Open(const std::shared_ptr& file, - std::shared_ptr* reader) { + std::shared_ptr* reader) { int64_t footer_offset; RETURN_NOT_OK(file->GetSize(&footer_offset)); return Open(file, footer_offset, reader); } Status RecordBatchFileReader::Open(const std::shared_ptr& file, - int64_t footer_offset, std::shared_ptr* reader) { + int64_t footer_offset, + std::shared_ptr* reader) { *reader = std::shared_ptr(new RecordBatchFileReader()); return (*reader)->impl_->Open(file, footer_offset); } -std::shared_ptr RecordBatchFileReader::schema() const { - return impl_->schema(); -} +std::shared_ptr RecordBatchFileReader::schema() const { return impl_->schema(); } int RecordBatchFileReader::num_record_batches() const { return impl_->num_record_batches(); } -MetadataVersion RecordBatchFileReader::version() const { - return impl_->version(); -} +MetadataVersion RecordBatchFileReader::version() const { return impl_->version(); } -Status RecordBatchFileReader::ReadRecordBatch( - int i, 
std::shared_ptr* batch) { +Status RecordBatchFileReader::ReadRecordBatch(int i, + std::shared_ptr* batch) { return impl_->ReadRecordBatch(i, batch); } -static Status ReadContiguousPayload( - int64_t offset, io::RandomAccessFile* file, std::unique_ptr* message) { +static Status ReadContiguousPayload(int64_t offset, io::RandomAccessFile* file, + std::unique_ptr* message) { std::shared_ptr buffer; RETURN_NOT_OK(file->Seek(offset)); RETURN_NOT_OK(ReadMessage(file, message)); @@ -652,16 +654,16 @@ static Status ReadContiguousPayload( } Status ReadRecordBatch(const std::shared_ptr& schema, int64_t offset, - io::RandomAccessFile* file, std::shared_ptr* out) { + io::RandomAccessFile* file, std::shared_ptr* out) { std::unique_ptr message; RETURN_NOT_OK(ReadContiguousPayload(offset, file, &message)); io::BufferReader buffer_reader(message->body()); - return ReadRecordBatch( - *message->metadata(), schema, kMaxNestingDepth, &buffer_reader, out); + return ReadRecordBatch(*message->metadata(), schema, kMaxNestingDepth, &buffer_reader, + out); } -Status ReadTensor( - int64_t offset, io::RandomAccessFile* file, std::shared_ptr* out) { +Status ReadTensor(int64_t offset, io::RandomAccessFile* file, + std::shared_ptr* out) { // Respect alignment of Tensor messages (see WriteTensor) offset = PaddedLength(offset); std::unique_ptr message; diff --git a/cpp/src/arrow/ipc/reader.h b/cpp/src/arrow/ipc/reader.h index d6c261475014c..c0d3fb1f185f9 100644 --- a/cpp/src/arrow/ipc/reader.h +++ b/cpp/src/arrow/ipc/reader.h @@ -72,7 +72,7 @@ class ARROW_EXPORT RecordBatchStreamReader : public RecordBatchReader { /// \param(out) out the created RecordBatchStreamReader object /// \return Status static Status Open(std::unique_ptr message_reader, - std::shared_ptr* out); + std::shared_ptr* out); /// \Create Record batch stream reader from InputStream /// @@ -80,7 +80,7 @@ class ARROW_EXPORT RecordBatchStreamReader : public RecordBatchReader { /// \param(out) out the created RecordBatchStreamReader 
object /// \return Status static Status Open(const std::shared_ptr& stream, - std::shared_ptr* out); + std::shared_ptr* out); std::shared_ptr schema() const override; Status ReadNextRecordBatch(std::shared_ptr* batch) override; @@ -103,7 +103,7 @@ class ARROW_EXPORT RecordBatchFileReader { // need only locate the end of the Arrow file stream to discover the metadata // and then proceed to read the data into memory. static Status Open(const std::shared_ptr& file, - std::shared_ptr* reader); + std::shared_ptr* reader); // If the file is embedded within some larger file or memory region, you can // pass the absolute memory offset to the end of the file (which contains the @@ -113,7 +113,8 @@ class ARROW_EXPORT RecordBatchFileReader { // @param file: the data source // @param footer_offset: the position of the end of the Arrow "file" static Status Open(const std::shared_ptr& file, - int64_t footer_offset, std::shared_ptr* reader); + int64_t footer_offset, + std::shared_ptr* reader); /// The schema includes any dictionaries std::shared_ptr schema() const; @@ -148,8 +149,9 @@ class ARROW_EXPORT RecordBatchFileReader { /// \param(in) file a random access file /// \param(out) out the read record batch Status ARROW_EXPORT ReadRecordBatch(const Buffer& metadata, - const std::shared_ptr& schema, io::RandomAccessFile* file, - std::shared_ptr* out); + const std::shared_ptr& schema, + io::RandomAccessFile* file, + std::shared_ptr* out); /// \brief Read record batch from fully encapulated Message /// @@ -158,7 +160,8 @@ Status ARROW_EXPORT ReadRecordBatch(const Buffer& metadata, /// \param[out] out the resulting RecordBatch /// \return Status Status ARROW_EXPORT ReadRecordBatch(const Message& message, - const std::shared_ptr& schema, std::shared_ptr* out); + const std::shared_ptr& schema, + std::shared_ptr* out); /// Read record batch from file given metadata and schema /// @@ -168,8 +171,9 @@ Status ARROW_EXPORT ReadRecordBatch(const Message& message, /// \param(in) 
max_recursion_depth the maximum permitted nesting depth /// \param(out) out the read record batch Status ARROW_EXPORT ReadRecordBatch(const Buffer& metadata, - const std::shared_ptr& schema, int max_recursion_depth, - io::RandomAccessFile* file, std::shared_ptr* out); + const std::shared_ptr& schema, + int max_recursion_depth, io::RandomAccessFile* file, + std::shared_ptr* out); /// Read record batch as encapsulated IPC message with metadata size prefix and /// header @@ -179,15 +183,16 @@ Status ARROW_EXPORT ReadRecordBatch(const Buffer& metadata, /// \param(in) file the file where the batch is located /// \param(out) out the read record batch Status ARROW_EXPORT ReadRecordBatch(const std::shared_ptr& schema, int64_t offset, - io::RandomAccessFile* file, std::shared_ptr* out); + io::RandomAccessFile* file, + std::shared_ptr* out); /// EXPERIMENTAL: Read arrow::Tensor as encapsulated IPC message in file /// /// \param(in) offset the file location of the start of the message /// \param(in) file the file where the batch is located /// \param(out) out the read tensor -Status ARROW_EXPORT ReadTensor( - int64_t offset, io::RandomAccessFile* file, std::shared_ptr* out); +Status ARROW_EXPORT ReadTensor(int64_t offset, io::RandomAccessFile* file, + std::shared_ptr* out); /// Backwards-compatibility for Arrow < 0.4.0 /// diff --git a/cpp/src/arrow/ipc/stream-to-file.cc b/cpp/src/arrow/ipc/stream-to-file.cc index de65883910120..33719b3c89c9e 100644 --- a/cpp/src/arrow/ipc/stream-to-file.cc +++ b/cpp/src/arrow/ipc/stream-to-file.cc @@ -15,11 +15,11 @@ // specific language governing permissions and limitations // under the License. 
+#include #include "arrow/io/file.h" #include "arrow/ipc/reader.h" #include "arrow/ipc/writer.h" #include "arrow/status.h" -#include #include "arrow/util/io-util.h" diff --git a/cpp/src/arrow/ipc/test-common.h b/cpp/src/arrow/ipc/test-common.h index 67a41ba086b75..a8767926b2a07 100644 --- a/cpp/src/arrow/ipc/test-common.h +++ b/cpp/src/arrow/ipc/test-common.h @@ -69,8 +69,8 @@ static inline void CompareBatch(const RecordBatch& left, const RecordBatch& righ } } -static inline void CompareArraysDetailed( - int index, const Array& result, const Array& expected) { +static inline void CompareArraysDetailed(int index, const Array& result, + const Array& expected) { if (!expected.Equals(result)) { std::stringstream pp_result; std::stringstream pp_expected; @@ -83,8 +83,8 @@ static inline void CompareArraysDetailed( } } -static inline void CompareBatchColumnsDetailed( - const RecordBatch& result, const RecordBatch& expected) { +static inline void CompareBatchColumnsDetailed(const RecordBatch& result, + const RecordBatch& expected) { for (int i = 0; i < expected.num_columns(); ++i) { auto left = result.column(i); auto right = expected.column(i); @@ -95,16 +95,16 @@ static inline void CompareBatchColumnsDetailed( const auto kListInt32 = list(int32()); const auto kListListInt32 = list(kListInt32); -Status MakeRandomInt32Array( - int64_t length, bool include_nulls, MemoryPool* pool, std::shared_ptr* out) { +Status MakeRandomInt32Array(int64_t length, bool include_nulls, MemoryPool* pool, + std::shared_ptr* out) { std::shared_ptr data; RETURN_NOT_OK(test::MakeRandomInt32PoolBuffer(length, pool, &data)); Int32Builder builder(pool, int32()); if (include_nulls) { std::shared_ptr valid_bytes; RETURN_NOT_OK(test::MakeRandomBytePoolBuffer(length, pool, &valid_bytes)); - RETURN_NOT_OK(builder.Append( - reinterpret_cast(data->data()), length, valid_bytes->data())); + RETURN_NOT_OK(builder.Append(reinterpret_cast(data->data()), length, + valid_bytes->data())); return 
builder.Finish(out); } RETURN_NOT_OK(builder.Append(reinterpret_cast(data->data()), length)); @@ -112,7 +112,8 @@ Status MakeRandomInt32Array( } Status MakeRandomListArray(const std::shared_ptr& child_array, int num_lists, - bool include_nulls, MemoryPool* pool, std::shared_ptr* out) { + bool include_nulls, MemoryPool* pool, + std::shared_ptr* out) { // Create the null list values std::vector valid_lists(num_lists); const double null_percent = include_nulls ? 0.1 : 0; @@ -129,15 +130,16 @@ Status MakeRandomListArray(const std::shared_ptr& child_array, int num_li test::rand_uniform_int(num_lists, seed, 0, max_list_size, list_sizes.data()); // make sure sizes are consistent with null std::transform(list_sizes.begin(), list_sizes.end(), valid_lists.begin(), - list_sizes.begin(), - [](int32_t size, int32_t valid) { return valid == 0 ? 0 : size; }); + list_sizes.begin(), + [](int32_t size, int32_t valid) { return valid == 0 ? 0 : size; }); std::partial_sum(list_sizes.begin(), list_sizes.end(), ++offsets.begin()); // Force invariants const int32_t child_length = static_cast(child_array->length()); offsets[0] = 0; std::replace_if(offsets.begin(), offsets.end(), - [child_length](int32_t offset) { return offset > child_length; }, child_length); + [child_length](int32_t offset) { return offset > child_length; }, + child_length); } offsets[num_lists] = static_cast(child_array->length()); @@ -148,14 +150,14 @@ Status MakeRandomListArray(const std::shared_ptr& child_array, int num_li RETURN_NOT_OK(test::CopyBufferFromVector(offsets, pool, &offsets_buffer)); *out = std::make_shared(list(child_array->type()), num_lists, offsets_buffer, - child_array, null_bitmap, kUnknownNullCount); + child_array, null_bitmap, kUnknownNullCount); return ValidateArray(**out); } typedef Status MakeRecordBatch(std::shared_ptr* out); -Status MakeRandomBooleanArray( - const int length, bool include_nulls, std::shared_ptr* out) { +Status MakeRandomBooleanArray(const int length, bool include_nulls, + 
std::shared_ptr* out) { std::vector values(length); test::random_null_bytes(length, 0.5, values.data()); std::shared_ptr data; @@ -210,10 +212,10 @@ Status MakeIntRecordBatch(std::shared_ptr* out) { } template -Status MakeRandomBinaryArray( - int64_t length, bool include_nulls, MemoryPool* pool, std::shared_ptr* out) { - const std::vector values = { - "", "", "abc", "123", "efg", "456!@#!@#", "12312"}; +Status MakeRandomBinaryArray(int64_t length, bool include_nulls, MemoryPool* pool, + std::shared_ptr* out) { + const std::vector values = {"", "", "abc", "123", + "efg", "456!@#!@#", "12312"}; Builder builder(pool); const size_t values_len = values.size(); for (int64_t i = 0; i < length; ++i) { @@ -223,7 +225,7 @@ Status MakeRandomBinaryArray( } else { const std::string& value = values[values_index]; RETURN_NOT_OK(builder.Append(reinterpret_cast(value.data()), - static_cast(value.size()))); + static_cast(value.size()))); } } return builder.Finish(out); @@ -434,11 +436,12 @@ Status MakeUnion(std::shared_ptr* out) { // construct individual nullable/non-nullable struct arrays auto sparse_no_nulls = std::make_shared(sparse_type, length, sparse_children, type_ids_buffer); - auto sparse = std::make_shared( - sparse_type, length, sparse_children, type_ids_buffer, nullptr, null_bitmask, 1); + auto sparse = std::make_shared(sparse_type, length, sparse_children, + type_ids_buffer, nullptr, null_bitmask, 1); - auto dense = std::make_shared(dense_type, length, dense_children, - type_ids_buffer, offsets_buffer, null_bitmask, 1); + auto dense = + std::make_shared(dense_type, length, dense_children, type_ids_buffer, + offsets_buffer, null_bitmask, 1); // construct batch std::vector> arrays = {sparse_no_nulls, sparse, dense}; @@ -480,8 +483,8 @@ Status MakeDictionary(std::shared_ptr* out) { std::vector list_offsets = {0, 0, 2, 2, 5, 6, 9}; std::shared_ptr offsets, indices3; - ArrayFromVector( - std::vector(list_offsets.size(), true), list_offsets, &offsets); + 
ArrayFromVector(std::vector(list_offsets.size(), true), + list_offsets, &offsets); std::vector indices3_values = {0, 1, 2, 0, 1, 2, 0, 1, 2}; std::vector is_valid3(9, true); @@ -490,8 +493,8 @@ Status MakeDictionary(std::shared_ptr* out) { std::shared_ptr null_bitmap; RETURN_NOT_OK(test::GetBitmapFromVector(is_valid, &null_bitmap)); - std::shared_ptr a3 = std::make_shared(f3_type, length, - std::static_pointer_cast(offsets)->values(), + std::shared_ptr a3 = std::make_shared( + f3_type, length, std::static_pointer_cast(offsets)->values(), std::make_shared(f1_type, indices3), null_bitmap, 1); // Dictionary-encoded list of integer @@ -500,14 +503,15 @@ Status MakeDictionary(std::shared_ptr* out) { std::shared_ptr offsets4, values4, indices4; std::vector list_offsets4 = {0, 2, 2, 3}; - ArrayFromVector( - std::vector(4, true), list_offsets4, &offsets4); + ArrayFromVector(std::vector(4, true), list_offsets4, + &offsets4); std::vector list_values4 = {0, 1, 2}; ArrayFromVector(std::vector(3, true), list_values4, &values4); - auto dict3 = std::make_shared(f4_value_type, 3, - std::static_pointer_cast(offsets4)->values(), values4); + auto dict3 = std::make_shared( + f4_value_type, 3, std::static_pointer_cast(offsets4)->values(), + values4); std::vector indices4_values = {0, 1, 2, 0, 1, 2}; ArrayFromVector(is_valid, indices4_values, &indices4); @@ -516,9 +520,9 @@ Status MakeDictionary(std::shared_ptr* out) { auto a4 = std::make_shared(f4_type, indices4); // construct batch - std::shared_ptr schema(new Schema({field("dict1", f0_type), - field("sparse", f1_type), field("dense", f2_type), - field("list of encoded string", f3_type), field("encoded list", f4_type)})); + std::shared_ptr schema(new Schema( + {field("dict1", f0_type), field("sparse", f1_type), field("dense", f2_type), + field("list of encoded string", f3_type), field("encoded list", f4_type)})); std::vector> arrays = {a0, a1, a2, a3, a4}; @@ -575,7 +579,8 @@ Status MakeDates(std::shared_ptr* out) { 
ArrayFromVector(is_valid, date32_values, &date32_array); std::vector date64_values = {1489269000000, 1489270000000, 1489271000000, - 1489272000000, 1489272000000, 1489273000000, 1489274000000}; + 1489272000000, 1489272000000, 1489273000000, + 1489274000000}; std::shared_ptr date64_array; ArrayFromVector(is_valid, date64_values, &date64_array); @@ -592,7 +597,7 @@ Status MakeTimestamps(std::shared_ptr* out) { std::shared_ptr schema(new Schema({f0, f1, f2})); std::vector ts_values = {1489269000000, 1489270000000, 1489271000000, - 1489272000000, 1489272000000, 1489273000000}; + 1489272000000, 1489272000000, 1489273000000}; std::shared_ptr a0, a1, a2; ArrayFromVector(f0->type(), is_valid, ts_values, &a0); @@ -612,10 +617,10 @@ Status MakeTimes(std::shared_ptr* out) { auto f3 = field("f3", time64(TimeUnit::NANO)); std::shared_ptr schema(new Schema({f0, f1, f2, f3})); - std::vector t32_values = { - 1489269000, 1489270000, 1489271000, 1489272000, 1489272000, 1489273000}; + std::vector t32_values = {1489269000, 1489270000, 1489271000, + 1489272000, 1489272000, 1489273000}; std::vector t64_values = {1489269000000, 1489270000000, 1489271000000, - 1489272000000, 1489272000000, 1489273000000}; + 1489272000000, 1489272000000, 1489273000000}; std::shared_ptr a0, a1, a2, a3; ArrayFromVector(f0->type(), is_valid, t32_values, &a0); @@ -630,7 +635,7 @@ Status MakeTimes(std::shared_ptr* out) { template void AppendValues(const std::vector& is_valid, const std::vector& values, - BuilderType* builder) { + BuilderType* builder) { for (size_t i = 0; i < values.size(); ++i) { if (is_valid[i]) { ASSERT_OK(builder->Append(values[i])); diff --git a/cpp/src/arrow/ipc/writer.cc b/cpp/src/arrow/ipc/writer.cc index 14708a1e7a032..163b27b443351 100644 --- a/cpp/src/arrow/ipc/writer.cc +++ b/cpp/src/arrow/ipc/writer.cc @@ -45,8 +45,9 @@ namespace ipc { // Record batch write path static inline Status GetTruncatedBitmap(int64_t offset, int64_t length, - const std::shared_ptr input, MemoryPool* pool, 
- std::shared_ptr* buffer) { + const std::shared_ptr input, + MemoryPool* pool, + std::shared_ptr* buffer) { if (!input) { *buffer = input; return Status::OK(); @@ -63,8 +64,8 @@ static inline Status GetTruncatedBitmap(int64_t offset, int64_t length, template inline Status GetTruncatedBuffer(int64_t offset, int64_t length, - const std::shared_ptr input, MemoryPool* pool, - std::shared_ptr* buffer) { + const std::shared_ptr input, MemoryPool* pool, + std::shared_ptr* buffer) { if (!input) { *buffer = input; return Status::OK(); @@ -80,17 +81,19 @@ inline Status GetTruncatedBuffer(int64_t offset, int64_t length, return Status::OK(); } -static inline bool NeedTruncate( - int64_t offset, const Buffer* buffer, int64_t min_length) { +static inline bool NeedTruncate(int64_t offset, const Buffer* buffer, + int64_t min_length) { // buffer can be NULL - if (buffer == nullptr) { return false; } + if (buffer == nullptr) { + return false; + } return offset != 0 || min_length < buffer->size(); } class RecordBatchSerializer : public ArrayVisitor { public: RecordBatchSerializer(MemoryPool* pool, int64_t buffer_start_offset, - int max_recursion_depth, bool allow_64bit) + int max_recursion_depth, bool allow_64bit) : pool_(pool), max_recursion_depth_(max_recursion_depth), buffer_start_offset_(buffer_start_offset), @@ -114,8 +117,8 @@ class RecordBatchSerializer : public ArrayVisitor { if (arr.null_count() > 0) { std::shared_ptr bitmap; - RETURN_NOT_OK(GetTruncatedBitmap( - arr.offset(), arr.length(), arr.null_bitmap(), pool_, &bitmap)); + RETURN_NOT_OK(GetTruncatedBitmap(arr.offset(), arr.length(), arr.null_bitmap(), + pool_, &bitmap)); buffers_.push_back(bitmap); } else { // Push a dummy zero-length buffer, not to be copied @@ -175,14 +178,14 @@ class RecordBatchSerializer : public ArrayVisitor { } // Override this for writing dictionary metadata - virtual Status WriteMetadataMessage( - int64_t num_rows, int64_t body_length, std::shared_ptr* out) { - return WriteRecordBatchMessage( 
- num_rows, body_length, field_nodes_, buffer_meta_, out); + virtual Status WriteMetadataMessage(int64_t num_rows, int64_t body_length, + std::shared_ptr* out) { + return WriteRecordBatchMessage(num_rows, body_length, field_nodes_, buffer_meta_, + out); } Status Write(const RecordBatch& batch, io::OutputStream* dst, int32_t* metadata_length, - int64_t* body_length) { + int64_t* body_length) { RETURN_NOT_OK(Assemble(batch, body_length)); #ifndef NDEBUG @@ -216,9 +219,13 @@ class RecordBatchSerializer : public ArrayVisitor { padding = BitUtil::RoundUpToMultipleOf64(size) - size; } - if (size > 0) { RETURN_NOT_OK(dst->Write(buffer->data(), size)); } + if (size > 0) { + RETURN_NOT_OK(dst->Write(buffer->data(), size)); + } - if (padding > 0) { RETURN_NOT_OK(dst->Write(kPaddingBytes, padding)); } + if (padding > 0) { + RETURN_NOT_OK(dst->Write(kPaddingBytes, padding)); + } } #ifndef NDEBUG @@ -245,7 +252,7 @@ class RecordBatchSerializer : public ArrayVisitor { // Send padding if it's available const int64_t buffer_length = std::min(BitUtil::RoundUpToMultipleOf64(array.length() * type_width), - data->size() - byte_offset); + data->size() - byte_offset); data = SliceBuffer(data, byte_offset, buffer_length); } buffers_.push_back(data); @@ -253,8 +260,8 @@ class RecordBatchSerializer : public ArrayVisitor { } template - Status GetZeroBasedValueOffsets( - const ArrayType& array, std::shared_ptr* value_offsets) { + Status GetZeroBasedValueOffsets(const ArrayType& array, + std::shared_ptr* value_offsets) { // Share slicing logic between ListArray and BinaryArray auto offsets = array.value_offsets(); @@ -265,8 +272,8 @@ class RecordBatchSerializer : public ArrayVisitor { // b) slice the values array accordingly std::shared_ptr shifted_offsets; - RETURN_NOT_OK(AllocateBuffer( - pool_, sizeof(int32_t) * (array.length() + 1), &shifted_offsets)); + RETURN_NOT_OK(AllocateBuffer(pool_, sizeof(int32_t) * (array.length() + 1), + &shifted_offsets)); int32_t* dest_offsets = 
reinterpret_cast(shifted_offsets->mutable_data()); const int32_t start_offset = array.value_offset(0); @@ -392,13 +399,15 @@ class RecordBatchSerializer : public ArrayVisitor { const auto& type = static_cast(*array.type()); std::shared_ptr value_offsets; - RETURN_NOT_OK(GetTruncatedBuffer( - offset, length, array.value_offsets(), pool_, &value_offsets)); + RETURN_NOT_OK(GetTruncatedBuffer(offset, length, array.value_offsets(), + pool_, &value_offsets)); // The Union type codes are not necessary 0-indexed uint8_t max_code = 0; for (uint8_t code : type.type_codes()) { - if (code > max_code) { max_code = code; } + if (code > max_code) { + max_code = code; + } } // Allocate an array of child offsets. Set all to -1 to indicate that we @@ -424,7 +433,9 @@ class RecordBatchSerializer : public ArrayVisitor { for (int64_t i = 0; i < length; ++i) { const uint8_t code = type_ids[i]; int32_t shift = child_offsets[code]; - if (shift == -1) { child_offsets[code] = shift = unshifted_offsets[i]; } + if (shift == -1) { + child_offsets[code] = shift = unshifted_offsets[i]; + } shifted_offsets[i] = unshifted_offsets[i] - shift; // Update the child length to account for observed value @@ -486,14 +497,14 @@ class DictionaryWriter : public RecordBatchSerializer { public: using RecordBatchSerializer::RecordBatchSerializer; - Status WriteMetadataMessage( - int64_t num_rows, int64_t body_length, std::shared_ptr* out) override { - return WriteDictionaryMessage( - dictionary_id_, num_rows, body_length, field_nodes_, buffer_meta_, out); + Status WriteMetadataMessage(int64_t num_rows, int64_t body_length, + std::shared_ptr* out) override { + return WriteDictionaryMessage(dictionary_id_, num_rows, body_length, field_nodes_, + buffer_meta_, out); } Status Write(int64_t dictionary_id, const std::shared_ptr& dictionary, - io::OutputStream* dst, int32_t* metadata_length, int64_t* body_length) { + io::OutputStream* dst, int32_t* metadata_length, int64_t* body_length) { dictionary_id_ = 
dictionary_id; // Make a dummy record batch. A bit tedious as we have to make a schema @@ -516,27 +527,30 @@ Status AlignStreamPosition(io::OutputStream* stream) { int64_t position; RETURN_NOT_OK(stream->Tell(&position)); int64_t remainder = PaddedLength(position) - position; - if (remainder > 0) { return stream->Write(kPaddingBytes, remainder); } + if (remainder > 0) { + return stream->Write(kPaddingBytes, remainder); + } return Status::OK(); } Status WriteRecordBatch(const RecordBatch& batch, int64_t buffer_start_offset, - io::OutputStream* dst, int32_t* metadata_length, int64_t* body_length, - MemoryPool* pool, int max_recursion_depth, bool allow_64bit) { - RecordBatchSerializer writer( - pool, buffer_start_offset, max_recursion_depth, allow_64bit); + io::OutputStream* dst, int32_t* metadata_length, + int64_t* body_length, MemoryPool* pool, int max_recursion_depth, + bool allow_64bit) { + RecordBatchSerializer writer(pool, buffer_start_offset, max_recursion_depth, + allow_64bit); return writer.Write(batch, dst, metadata_length, body_length); } Status WriteLargeRecordBatch(const RecordBatch& batch, int64_t buffer_start_offset, - io::OutputStream* dst, int32_t* metadata_length, int64_t* body_length, - MemoryPool* pool) { + io::OutputStream* dst, int32_t* metadata_length, + int64_t* body_length, MemoryPool* pool) { return WriteRecordBatch(batch, buffer_start_offset, dst, metadata_length, body_length, - pool, kMaxNestingDepth, true); + pool, kMaxNestingDepth, true); } Status WriteTensor(const Tensor& tensor, io::OutputStream* dst, int32_t* metadata_length, - int64_t* body_length) { + int64_t* body_length) { if (!tensor.is_contiguous()) { return Status::Invalid("No support yet for writing non-contiguous tensors"); } @@ -556,8 +570,8 @@ Status WriteTensor(const Tensor& tensor, io::OutputStream* dst, int32_t* metadat } Status WriteDictionary(int64_t dictionary_id, const std::shared_ptr& dictionary, - int64_t buffer_start_offset, io::OutputStream* dst, int32_t* 
metadata_length, - int64_t* body_length, MemoryPool* pool) { + int64_t buffer_start_offset, io::OutputStream* dst, + int32_t* metadata_length, int64_t* body_length, MemoryPool* pool) { DictionaryWriter writer(pool, buffer_start_offset, kMaxNestingDepth, false); return writer.Write(dictionary_id, dictionary, dst, metadata_length, body_length); } @@ -568,7 +582,7 @@ Status GetRecordBatchSize(const RecordBatch& batch, int64_t* size) { int64_t body_length = 0; io::MockOutputStream dst; RETURN_NOT_OK(WriteRecordBatch(batch, 0, &dst, &metadata_length, &body_length, - default_memory_pool(), kMaxNestingDepth, true)); + default_memory_pool(), kMaxNestingDepth, true)); *size = dst.GetExtentBytesWritten(); return Status::OK(); } @@ -632,7 +646,9 @@ class RecordBatchStreamWriter::RecordBatchStreamWriterImpl { } Status CheckStarted() { - if (!started_) { return Start(); } + if (!started_) { + return Start(); + } return Status::OK(); } @@ -653,7 +669,7 @@ class RecordBatchStreamWriter::RecordBatchStreamWriterImpl { // Frame of reference in file format is 0, see ARROW-384 const int64_t buffer_start_offset = 0; RETURN_NOT_OK(WriteDictionary(entry.first, entry.second, buffer_start_offset, sink_, - &block->metadata_length, &block->body_length, pool_)); + &block->metadata_length, &block->body_length, pool_)); RETURN_NOT_OK(UpdatePosition()); DCHECK(position_ % 8 == 0) << "WriteDictionary did not perform aligned writes"; } @@ -668,9 +684,9 @@ class RecordBatchStreamWriter::RecordBatchStreamWriterImpl { // Frame of reference in file format is 0, see ARROW-384 const int64_t buffer_start_offset = 0; - RETURN_NOT_OK(arrow::ipc::WriteRecordBatch(batch, buffer_start_offset, sink_, - &block->metadata_length, &block->body_length, pool_, kMaxNestingDepth, - allow_64bit)); + RETURN_NOT_OK(arrow::ipc::WriteRecordBatch( + batch, buffer_start_offset, sink_, &block->metadata_length, &block->body_length, + pool_, kMaxNestingDepth, allow_64bit)); RETURN_NOT_OK(UpdatePosition()); DCHECK(position_ % 8 
== 0) << "WriteRecordBatch did not perform aligned writes"; @@ -681,15 +697,17 @@ class RecordBatchStreamWriter::RecordBatchStreamWriterImpl { Status WriteRecordBatch(const RecordBatch& batch, bool allow_64bit) { // Push an empty FileBlock. Can be written in the footer later record_batches_.push_back({0, 0, 0}); - return WriteRecordBatch( - batch, allow_64bit, &record_batches_[record_batches_.size() - 1]); + return WriteRecordBatch(batch, allow_64bit, + &record_batches_[record_batches_.size() - 1]); } // Adds padding bytes if necessary to ensure all memory blocks are written on // 64-byte (or other alignment) boundaries. Status Align(int64_t alignment = kArrowAlignment) { int64_t remainder = PaddedLength(position_, alignment) - position_; - if (remainder > 0) { return Write(kPaddingBytes, remainder); } + if (remainder > 0) { + return Write(kPaddingBytes, remainder); + } return Status::OK(); } @@ -725,8 +743,8 @@ RecordBatchStreamWriter::RecordBatchStreamWriter() { RecordBatchStreamWriter::~RecordBatchStreamWriter() {} -Status RecordBatchStreamWriter::WriteRecordBatch( - const RecordBatch& batch, bool allow_64bit) { +Status RecordBatchStreamWriter::WriteRecordBatch(const RecordBatch& batch, + bool allow_64bit) { return impl_->WriteRecordBatch(batch, allow_64bit); } @@ -735,16 +753,14 @@ void RecordBatchStreamWriter::set_memory_pool(MemoryPool* pool) { } Status RecordBatchStreamWriter::Open(io::OutputStream* sink, - const std::shared_ptr& schema, - std::shared_ptr* out) { + const std::shared_ptr& schema, + std::shared_ptr* out) { // ctor is private *out = std::shared_ptr(new RecordBatchStreamWriter()); return (*out)->impl_->Open(sink, schema); } -Status RecordBatchStreamWriter::Close() { - return impl_->Close(); -} +Status RecordBatchStreamWriter::Close() { return impl_->Close(); } // ---------------------------------------------------------------------- // File writer implementation @@ -756,8 +772,8 @@ class RecordBatchFileWriter::RecordBatchFileWriterImpl Status 
Start() override { // It is only necessary to align to 8-byte boundary at the start of the file - RETURN_NOT_OK(Write( - reinterpret_cast(kArrowMagicBytes), strlen(kArrowMagicBytes))); + RETURN_NOT_OK(Write(reinterpret_cast(kArrowMagicBytes), + strlen(kArrowMagicBytes))); RETURN_NOT_OK(Align(8)); // We write the schema at the start of the file (and the end). This also @@ -768,21 +784,23 @@ class RecordBatchFileWriter::RecordBatchFileWriterImpl Status Close() override { // Write metadata int64_t initial_position = position_; - RETURN_NOT_OK(WriteFileFooter( - *schema_, dictionaries_, record_batches_, &dictionary_memo_, sink_)); + RETURN_NOT_OK(WriteFileFooter(*schema_, dictionaries_, record_batches_, + &dictionary_memo_, sink_)); RETURN_NOT_OK(UpdatePosition()); // Write footer length int32_t footer_length = static_cast(position_ - initial_position); - if (footer_length <= 0) { return Status::Invalid("Invalid file footer"); } + if (footer_length <= 0) { + return Status::Invalid("Invalid file footer"); + } RETURN_NOT_OK( Write(reinterpret_cast(&footer_length), sizeof(int32_t))); // Write magic bytes to end file - return Write( - reinterpret_cast(kArrowMagicBytes), strlen(kArrowMagicBytes)); + return Write(reinterpret_cast(kArrowMagicBytes), + strlen(kArrowMagicBytes)); } }; @@ -793,20 +811,19 @@ RecordBatchFileWriter::RecordBatchFileWriter() { RecordBatchFileWriter::~RecordBatchFileWriter() {} Status RecordBatchFileWriter::Open(io::OutputStream* sink, - const std::shared_ptr& schema, std::shared_ptr* out) { + const std::shared_ptr& schema, + std::shared_ptr* out) { *out = std::shared_ptr( new RecordBatchFileWriter()); // ctor is private return (*out)->impl_->Open(sink, schema); } -Status RecordBatchFileWriter::WriteRecordBatch( - const RecordBatch& batch, bool allow_64bit) { +Status RecordBatchFileWriter::WriteRecordBatch(const RecordBatch& batch, + bool allow_64bit) { return impl_->WriteRecordBatch(batch, allow_64bit); } -Status RecordBatchFileWriter::Close() { - 
return impl_->Close(); -} +Status RecordBatchFileWriter::Close() { return impl_->Close(); } } // namespace ipc } // namespace arrow diff --git a/cpp/src/arrow/ipc/writer.h b/cpp/src/arrow/ipc/writer.h index 899a1b2cc1e30..c28dfe0afbb11 100644 --- a/cpp/src/arrow/ipc/writer.h +++ b/cpp/src/arrow/ipc/writer.h @@ -85,7 +85,7 @@ class ARROW_EXPORT RecordBatchStreamWriter : public RecordBatchWriter { /// \param(out) out the created stream writer /// \return Status indicating success or failure static Status Open(io::OutputStream* sink, const std::shared_ptr& schema, - std::shared_ptr* out); + std::shared_ptr* out); Status WriteRecordBatch(const RecordBatch& batch, bool allow_64bit = false) override; Status Close() override; @@ -113,7 +113,7 @@ class ARROW_EXPORT RecordBatchFileWriter : public RecordBatchStreamWriter { /// \param(out) out the created stream writer /// \return Status indicating success or failure static Status Open(io::OutputStream* sink, const std::shared_ptr& schema, - std::shared_ptr* out); + std::shared_ptr* out); Status WriteRecordBatch(const RecordBatch& batch, bool allow_64bit = false) override; Status Close() override; @@ -145,14 +145,16 @@ class ARROW_EXPORT RecordBatchFileWriter : public RecordBatchStreamWriter { /// \param(out) body_length: the size of the contiguous buffer block plus /// padding bytes Status ARROW_EXPORT WriteRecordBatch(const RecordBatch& batch, - int64_t buffer_start_offset, io::OutputStream* dst, int32_t* metadata_length, - int64_t* body_length, MemoryPool* pool, int max_recursion_depth = kMaxNestingDepth, - bool allow_64bit = false); + int64_t buffer_start_offset, io::OutputStream* dst, + int32_t* metadata_length, int64_t* body_length, + MemoryPool* pool, + int max_recursion_depth = kMaxNestingDepth, + bool allow_64bit = false); // Write Array as a DictionaryBatch message Status WriteDictionary(int64_t dictionary_id, const std::shared_ptr& dictionary, - int64_t buffer_start_offset, io::OutputStream* dst, int32_t* 
metadata_length, - int64_t* body_length, MemoryPool* pool); + int64_t buffer_start_offset, io::OutputStream* dst, + int32_t* metadata_length, int64_t* body_length, MemoryPool* pool); // Compute the precise number of bytes needed in a contiguous memory segment to // write the record batch. This involves generating the complete serialized @@ -166,13 +168,14 @@ Status ARROW_EXPORT GetTensorSize(const Tensor& tensor, int64_t* size); /// EXPERIMENTAL: Write RecordBatch allowing lengths over INT32_MAX. This data /// may not be readable by all Arrow implementations Status ARROW_EXPORT WriteLargeRecordBatch(const RecordBatch& batch, - int64_t buffer_start_offset, io::OutputStream* dst, int32_t* metadata_length, - int64_t* body_length, MemoryPool* pool); + int64_t buffer_start_offset, + io::OutputStream* dst, int32_t* metadata_length, + int64_t* body_length, MemoryPool* pool); /// EXPERIMENTAL: Write arrow::Tensor as a contiguous message /// Status ARROW_EXPORT WriteTensor(const Tensor& tensor, io::OutputStream* dst, - int32_t* metadata_length, int64_t* body_length); + int32_t* metadata_length, int64_t* body_length); /// Backwards-compatibility for Arrow < 0.4.0 /// diff --git a/cpp/src/arrow/memory_pool-test.cc b/cpp/src/arrow/memory_pool-test.cc index 8a185abca71cc..52e48dbefab9e 100644 --- a/cpp/src/arrow/memory_pool-test.cc +++ b/cpp/src/arrow/memory_pool-test.cc @@ -27,9 +27,7 @@ class TestDefaultMemoryPool : public ::arrow::test::TestMemoryPoolBase { ::arrow::MemoryPool* memory_pool() override { return ::arrow::default_memory_pool(); } }; -TEST_F(TestDefaultMemoryPool, MemoryTracking) { - this->TestMemoryTracking(); -} +TEST_F(TestDefaultMemoryPool, MemoryTracking) { this->TestMemoryTracking(); } TEST_F(TestDefaultMemoryPool, OOM) { #ifndef ADDRESS_SANITIZER @@ -37,9 +35,7 @@ TEST_F(TestDefaultMemoryPool, OOM) { #endif } -TEST_F(TestDefaultMemoryPool, Reallocate) { - this->TestReallocate(); -} +TEST_F(TestDefaultMemoryPool, Reallocate) { this->TestReallocate(); } // 
Death tests and valgrind are known to not play well 100% of the time. See // googletest documentation @@ -53,7 +49,7 @@ TEST(DefaultMemoryPoolDeathTest, FreeLargeMemory) { #ifndef NDEBUG EXPECT_EXIT(pool->Free(data, 120), ::testing::ExitedWithCode(1), - ".*Check failed: \\(bytes_allocated_\\) >= \\(size\\)"); + ".*Check failed: \\(bytes_allocated_\\) >= \\(size\\)"); #endif pool->Free(data, 100); diff --git a/cpp/src/arrow/memory_pool.cc b/cpp/src/arrow/memory_pool.cc index e7de5c4fc589a..769fc1037ee80 100644 --- a/cpp/src/arrow/memory_pool.cc +++ b/cpp/src/arrow/memory_pool.cc @@ -17,12 +17,12 @@ #include "arrow/memory_pool.h" +#include #include #include #include #include #include -#include #include "arrow/status.h" #include "arrow/util/logging.h" @@ -60,8 +60,8 @@ Status AllocateAligned(int64_t size, uint8_t** out) { return Status::OutOfMemory(ss.str()); } #else - const int result = posix_memalign( - reinterpret_cast(out), kAlignment, static_cast(size)); + const int result = posix_memalign(reinterpret_cast(out), kAlignment, + static_cast(size)); if (result == ENOMEM) { std::stringstream ss; ss << "malloc of size " << size << " failed"; @@ -82,13 +82,9 @@ MemoryPool::MemoryPool() {} MemoryPool::~MemoryPool() {} -int64_t MemoryPool::max_memory() const { - return -1; -} +int64_t MemoryPool::max_memory() const { return -1; } -DefaultMemoryPool::DefaultMemoryPool() : bytes_allocated_(0) { - max_memory_ = 0; -} +DefaultMemoryPool::DefaultMemoryPool() : bytes_allocated_(0) { max_memory_ = 0; } Status DefaultMemoryPool::Allocate(int64_t size, uint8_t** out) { RETURN_NOT_OK(AllocateAligned(size, out)); @@ -96,7 +92,9 @@ Status DefaultMemoryPool::Allocate(int64_t size, uint8_t** out) { { std::lock_guard guard(lock_); - if (bytes_allocated_ > max_memory_) { max_memory_ = bytes_allocated_.load(); } + if (bytes_allocated_ > max_memory_) { + max_memory_ = bytes_allocated_.load(); + } } return Status::OK(); } @@ -128,15 +126,15 @@ Status DefaultMemoryPool::Reallocate(int64_t 
old_size, int64_t new_size, uint8_t bytes_allocated_ += new_size - old_size; { std::lock_guard guard(lock_); - if (bytes_allocated_ > max_memory_) { max_memory_ = bytes_allocated_.load(); } + if (bytes_allocated_ > max_memory_) { + max_memory_ = bytes_allocated_.load(); + } } return Status::OK(); } -int64_t DefaultMemoryPool::bytes_allocated() const { - return bytes_allocated_.load(); -} +int64_t DefaultMemoryPool::bytes_allocated() const { return bytes_allocated_.load(); } void DefaultMemoryPool::Free(uint8_t* buffer, int64_t size) { DCHECK_GE(bytes_allocated_, size); @@ -150,9 +148,7 @@ void DefaultMemoryPool::Free(uint8_t* buffer, int64_t size) { bytes_allocated_ -= size; } -int64_t DefaultMemoryPool::max_memory() const { - return max_memory_.load(); -} +int64_t DefaultMemoryPool::max_memory() const { return max_memory_.load(); } DefaultMemoryPool::~DefaultMemoryPool() {} diff --git a/cpp/src/arrow/pretty_print-test.cc b/cpp/src/arrow/pretty_print-test.cc index 10a91f5e4e461..049f5a58a6841 100644 --- a/cpp/src/arrow/pretty_print-test.cc +++ b/cpp/src/arrow/pretty_print-test.cc @@ -57,7 +57,7 @@ void CheckArray(const Array& arr, int indent, const char* expected) { template void CheckPrimitive(int indent, const std::vector& is_valid, - const std::vector& values, const char* expected) { + const std::vector& values, const char* expected) { std::shared_ptr array; ArrayFromVector(is_valid, values, &array); CheckArray(*array, indent, expected); diff --git a/cpp/src/arrow/pretty_print.cc b/cpp/src/arrow/pretty_print.cc index 93f6ff0f363f0..aedad1228dfb2 100644 --- a/cpp/src/arrow/pretty_print.cc +++ b/cpp/src/arrow/pretty_print.cc @@ -42,7 +42,9 @@ class ArrayPrinter { const T& array) { const auto data = array.raw_values(); for (int i = 0; i < array.length(); ++i) { - if (i > 0) { (*sink_) << ", "; } + if (i > 0) { + (*sink_) << ", "; + } if (array.IsNull(i)) { (*sink_) << "null"; } else { @@ -56,7 +58,9 @@ class ArrayPrinter { const T& array) { const auto data = 
array.raw_values(); for (int i = 0; i < array.length(); ++i) { - if (i > 0) { (*sink_) << ", "; } + if (i > 0) { + (*sink_) << ", "; + } if (array.IsNull(i)) { Write("null"); } else { @@ -71,7 +75,9 @@ class ArrayPrinter { WriteDataValues(const T& array) { int32_t length; for (int i = 0; i < array.length(); ++i) { - if (i > 0) { (*sink_) << ", "; } + if (i > 0) { + (*sink_) << ", "; + } if (array.IsNull(i)) { Write("null"); } else { @@ -87,7 +93,9 @@ class ArrayPrinter { WriteDataValues(const T& array) { int32_t length; for (int i = 0; i < array.length(); ++i) { - if (i > 0) { (*sink_) << ", "; } + if (i > 0) { + (*sink_) << ", "; + } if (array.IsNull(i)) { Write("null"); } else { @@ -102,7 +110,9 @@ class ArrayPrinter { WriteDataValues(const T& array) { int32_t width = array.byte_width(); for (int i = 0; i < array.length(); ++i) { - if (i > 0) { (*sink_) << ", "; } + if (i > 0) { + (*sink_) << ", "; + } if (array.IsNull(i)) { Write("null"); } else { @@ -116,7 +126,9 @@ class ArrayPrinter { inline typename std::enable_if::value, void>::type WriteDataValues(const T& array) { for (int i = 0; i < array.length(); ++i) { - if (i > 0) { (*sink_) << ", "; } + if (i > 0) { + (*sink_) << ", "; + } if (array.IsNull(i)) { Write("null"); } else { @@ -138,7 +150,7 @@ class ArrayPrinter { typename std::enable_if::value || std::is_base_of::value || std::is_base_of::value, - Status>::type + Status>::type Visit(const T& array) { OpenArray(); WriteDataValues(array); @@ -157,8 +169,8 @@ class ArrayPrinter { Newline(); Write("-- value_offsets: "); - Int32Array value_offsets( - array.length() + 1, array.value_offsets(), nullptr, 0, array.offset()); + Int32Array value_offsets(array.length() + 1, array.value_offsets(), nullptr, 0, + array.offset()); RETURN_NOT_OK(PrettyPrint(value_offsets, indent_ + 2, sink_)); Newline(); @@ -170,8 +182,8 @@ class ArrayPrinter { return Status::OK(); } - Status PrintChildren( - const std::vector>& fields, int64_t offset, int64_t length) { + Status 
PrintChildren(const std::vector>& fields, int64_t offset, + int64_t length) { for (size_t i = 0; i < fields.size(); ++i) { Newline(); std::stringstream ss; @@ -179,7 +191,9 @@ class ArrayPrinter { Write(ss.str()); std::shared_ptr field = fields[i]; - if (offset != 0) { field = field->Slice(offset, length); } + if (offset != 0) { + field = field->Slice(offset, length); + } RETURN_NOT_OK(PrettyPrint(*field, indent_ + 2, sink_)); } @@ -207,8 +221,8 @@ class ArrayPrinter { if (array.mode() == UnionMode::DENSE) { Newline(); Write("-- value_offsets: "); - Int32Array value_offsets( - array.length(), array.value_offsets(), nullptr, 0, array.offset()); + Int32Array value_offsets(array.length(), array.value_offsets(), nullptr, 0, + array.offset()); RETURN_NOT_OK(PrettyPrint(value_offsets, indent_ + 2, sink_)); } @@ -247,8 +261,8 @@ Status ArrayPrinter::WriteValidityBitmap(const Array& array) { Write("-- is_valid: "); if (array.null_count() > 0) { - BooleanArray is_valid( - array.length(), array.null_bitmap(), nullptr, 0, array.offset()); + BooleanArray is_valid(array.length(), array.null_bitmap(), nullptr, 0, + array.offset()); return PrettyPrint(is_valid, indent_ + 2, sink_); } else { Write("all not null"); @@ -256,20 +270,12 @@ Status ArrayPrinter::WriteValidityBitmap(const Array& array) { } } -void ArrayPrinter::OpenArray() { - (*sink_) << "["; -} -void ArrayPrinter::CloseArray() { - (*sink_) << "]"; -} +void ArrayPrinter::OpenArray() { (*sink_) << "["; } +void ArrayPrinter::CloseArray() { (*sink_) << "]"; } -void ArrayPrinter::Write(const char* data) { - (*sink_) << data; -} +void ArrayPrinter::Write(const char* data) { (*sink_) << data; } -void ArrayPrinter::Write(const std::string& data) { - (*sink_) << data; -} +void ArrayPrinter::Write(const std::string& data) { (*sink_) << data; } void ArrayPrinter::Newline() { (*sink_) << "\n"; diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc index d40609fe3fad2..462bdb7b7d744 100644 
--- a/cpp/src/arrow/python/arrow_to_pandas.cc +++ b/cpp/src/arrow/python/arrow_to_pandas.cc @@ -147,8 +147,8 @@ static inline PyArray_Descr* GetSafeNumPyDtype(int type) { return PyArray_DescrFromType(type); } } -static inline PyObject* NewArray1DFromType( - DataType* arrow_type, int type, int64_t length, void* data) { +static inline PyObject* NewArray1DFromType(DataType* arrow_type, int type, int64_t length, + void* data) { npy_intp dims[1] = {length}; PyArray_Descr* descr = GetSafeNumPyDtype(type); @@ -159,7 +159,8 @@ static inline PyObject* NewArray1DFromType( set_numpy_metadata(type, arrow_type, descr); return PyArray_NewFromDescr(&PyArray_Type, descr, 1, dims, nullptr, data, - NPY_ARRAY_OWNDATA | NPY_ARRAY_CARRAY | NPY_ARRAY_WRITEABLE, nullptr); + NPY_ARRAY_OWNDATA | NPY_ARRAY_CARRAY | NPY_ARRAY_WRITEABLE, + nullptr); } class PandasBlock { @@ -188,7 +189,7 @@ class PandasBlock { virtual Status Allocate() = 0; virtual Status Write(const std::shared_ptr& col, int64_t abs_placement, - int64_t rel_placement) = 0; + int64_t rel_placement) = 0; PyObject* block_arr() const { return block_arr_.obj(); } @@ -408,7 +409,9 @@ inline Status ConvertFixedSizeBinary(const ChunkedArray& data, PyObject** out_va inline Status ConvertStruct(const ChunkedArray& data, PyObject** out_values) { PyAcquireGIL lock; - if (data.num_chunks() <= 0) { return Status::OK(); } + if (data.num_chunks() <= 0) { + return Status::OK(); + } // ChunkedArray has at least one chunk auto arr = static_cast(data.chunk(0).get()); // Use it to cache the struct type and number of fields for all chunks @@ -467,8 +470,8 @@ inline Status ConvertStruct(const ChunkedArray& data, PyObject** out_values) { } template -inline Status ConvertListsLike( - const std::shared_ptr& col, PyObject** out_values) { +inline Status ConvertListsLike(const std::shared_ptr& col, + PyObject** out_values) { const ChunkedArray& data = *col->data().get(); auto list_type = std::static_pointer_cast(col->type()); @@ -532,8 +535,8 @@ inline 
void ConvertNumericNullable(const ChunkedArray& data, T na_value, T* out_ } template -inline void ConvertNumericNullableCast( - const ChunkedArray& data, OutType na_value, OutType* out_values) { +inline void ConvertNumericNullableCast(const ChunkedArray& data, OutType na_value, + OutType* out_values) { for (int c = 0; c < data.num_chunks(); c++) { const std::shared_ptr arr = data.chunk(c); auto prim_arr = static_cast(arr.get()); @@ -602,8 +605,8 @@ Status ValidateDecimalPrecision(int precision) { } template -Status RawDecimalToString( - const uint8_t* bytes, int precision, int scale, std::string* result) { +Status RawDecimalToString(const uint8_t* bytes, int precision, int scale, + std::string* result) { DCHECK_NE(bytes, nullptr); DCHECK_NE(result, nullptr); RETURN_NOT_OK(ValidateDecimalPrecision(precision)); @@ -613,13 +616,13 @@ Status RawDecimalToString( return Status::OK(); } -template Status RawDecimalToString( - const uint8_t*, int, int, std::string* result); -template Status RawDecimalToString( - const uint8_t*, int, int, std::string* result); +template Status RawDecimalToString(const uint8_t*, int, int, + std::string* result); +template Status RawDecimalToString(const uint8_t*, int, int, + std::string* result); Status RawDecimalToString(const uint8_t* bytes, int precision, int scale, - bool is_negative, std::string* result) { + bool is_negative, std::string* result) { DCHECK_NE(bytes, nullptr); DCHECK_NE(result, nullptr); RETURN_NOT_OK(ValidateDecimalPrecision(precision)); @@ -684,7 +687,7 @@ class ObjectBlock : public PandasBlock { Status Allocate() override { return AllocateNDArray(NPY_OBJECT); } Status Write(const std::shared_ptr& col, int64_t abs_placement, - int64_t rel_placement) override { + int64_t rel_placement) override { Type::type type = col->type()->id(); PyObject** out_buffer = @@ -753,7 +756,7 @@ class IntBlock : public PandasBlock { } Status Write(const std::shared_ptr& col, int64_t abs_placement, - int64_t rel_placement) override { + 
int64_t rel_placement) override { Type::type type = col->type()->id(); C_TYPE* out_buffer = @@ -789,7 +792,7 @@ class Float32Block : public PandasBlock { Status Allocate() override { return AllocateNDArray(NPY_FLOAT32); } Status Write(const std::shared_ptr& col, int64_t abs_placement, - int64_t rel_placement) override { + int64_t rel_placement) override { Type::type type = col->type()->id(); if (type != Type::FLOAT) { @@ -813,7 +816,7 @@ class Float64Block : public PandasBlock { Status Allocate() override { return AllocateNDArray(NPY_FLOAT64); } Status Write(const std::shared_ptr& col, int64_t abs_placement, - int64_t rel_placement) override { + int64_t rel_placement) override { Type::type type = col->type()->id(); double* out_buffer = @@ -868,7 +871,7 @@ class BoolBlock : public PandasBlock { Status Allocate() override { return AllocateNDArray(NPY_BOOL); } Status Write(const std::shared_ptr& col, int64_t abs_placement, - int64_t rel_placement) override { + int64_t rel_placement) override { Type::type type = col->type()->id(); if (type != Type::BOOL) { @@ -903,7 +906,7 @@ class DatetimeBlock : public PandasBlock { Status Allocate() override { return AllocateDatetime(2); } Status Write(const std::shared_ptr& col, int64_t abs_placement, - int64_t rel_placement) override { + int64_t rel_placement) override { Type::type type = col->type()->id(); int64_t* out_buffer = @@ -981,14 +984,14 @@ class CategoricalBlock : public PandasBlock { constexpr int npy_type = arrow_traits::npy_type; if (!(npy_type == NPY_INT8 || npy_type == NPY_INT16 || npy_type == NPY_INT32 || - npy_type == NPY_INT64)) { + npy_type == NPY_INT64)) { return Status::Invalid("Category indices must be signed integers"); } return AllocateNDArray(npy_type, 1); } Status Write(const std::shared_ptr& col, int64_t abs_placement, - int64_t rel_placement) override { + int64_t rel_placement) override { using T = typename arrow_traits::T; T* out_values = reinterpret_cast(block_data_) + rel_placement * num_rows_; @@ 
-1036,7 +1039,7 @@ class CategoricalBlock : public PandasBlock { }; Status MakeBlock(PandasBlock::type type, int64_t num_rows, int num_columns, - std::shared_ptr* block) { + std::shared_ptr* block) { #define BLOCK_CASE(NAME, TYPE) \ case PandasBlock::NAME: \ *block = std::make_shared(num_rows, num_columns); \ @@ -1066,7 +1069,8 @@ Status MakeBlock(PandasBlock::type type, int64_t num_rows, int num_columns, } static inline Status MakeCategoricalBlock(const std::shared_ptr& type, - int64_t num_rows, std::shared_ptr* block) { + int64_t num_rows, + std::shared_ptr* block) { // All categoricals become a block with a single column auto dict_type = static_cast(type.get()); switch (dict_type->index_type()->id()) { @@ -1259,7 +1263,9 @@ class DataFrameBlockCreator { block = it->second; } else { auto it = this->blocks_.find(output_type); - if (it == this->blocks_.end()) { return Status::KeyError("No block allocated"); } + if (it == this->blocks_.end()) { + return Status::KeyError("No block allocated"); + } block = it->second; } return block->Write(col, i, rel_placement); @@ -1286,7 +1292,9 @@ class DataFrameBlockCreator { int column_num; while (!error_occurred) { column_num = task_counter.fetch_add(1); - if (column_num >= this->table_->num_columns()) { break; } + if (column_num >= this->table_->num_columns()) { + break; + } Status s = WriteColumn(column_num); if (!s.ok()) { std::lock_guard lock(error_mtx); @@ -1301,7 +1309,9 @@ class DataFrameBlockCreator { thread.join(); } - if (error_occurred) { return error; } + if (error_occurred) { + return error; + } } return Status::OK(); } @@ -1310,7 +1320,9 @@ class DataFrameBlockCreator { for (const auto& it : blocks) { PyObject* item; RETURN_NOT_OK(it.second->GetPyResult(&item)); - if (PyList_Append(list, item) < 0) { RETURN_IF_PYERROR(); } + if (PyList_Append(list, item) < 0) { + RETURN_IF_PYERROR(); + } // ARROW-1017; PyList_Append increments object refcount Py_DECREF(item); @@ -1432,7 +1444,7 @@ class ArrowDeserializer { 
template typename std::enable_if::value || std::is_base_of::value, - Status>::type + Status>::type Visit(const Type& type) { constexpr int TYPE = Type::type_id; using traits = arrow_traits; @@ -1603,22 +1615,22 @@ class ArrowDeserializer { PyObject* result_; }; -Status ConvertArrayToPandas( - const std::shared_ptr& arr, PyObject* py_ref, PyObject** out) { +Status ConvertArrayToPandas(const std::shared_ptr& arr, PyObject* py_ref, + PyObject** out) { static std::string dummy_name = "dummy"; auto field = std::make_shared(dummy_name, arr->type()); auto col = std::make_shared(field, arr); return ConvertColumnToPandas(col, py_ref, out); } -Status ConvertColumnToPandas( - const std::shared_ptr& col, PyObject* py_ref, PyObject** out) { +Status ConvertColumnToPandas(const std::shared_ptr& col, PyObject* py_ref, + PyObject** out) { ArrowDeserializer converter(col, py_ref); return converter.Convert(out); } -Status ConvertTableToPandas( - const std::shared_ptr& table, int nthreads, PyObject** out) { +Status ConvertTableToPandas(const std::shared_ptr
& table, int nthreads, + PyObject** out) { DataFrameBlockCreator helper(table); return helper.Convert(nthreads, out); } diff --git a/cpp/src/arrow/python/arrow_to_pandas.h b/cpp/src/arrow/python/arrow_to_pandas.h index c606dcbbe0aa7..5a99274a33ee0 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.h +++ b/cpp/src/arrow/python/arrow_to_pandas.h @@ -40,12 +40,12 @@ class Table; namespace py { ARROW_EXPORT -Status ConvertArrayToPandas( - const std::shared_ptr& arr, PyObject* py_ref, PyObject** out); +Status ConvertArrayToPandas(const std::shared_ptr& arr, PyObject* py_ref, + PyObject** out); ARROW_EXPORT -Status ConvertColumnToPandas( - const std::shared_ptr& col, PyObject* py_ref, PyObject** out); +Status ConvertColumnToPandas(const std::shared_ptr& col, PyObject* py_ref, + PyObject** out); struct PandasOptions { bool strings_to_categorical; @@ -58,8 +58,8 @@ struct PandasOptions { // // tuple item: (indices: ndarray[int32], block: ndarray[TYPE, ndim=2]) ARROW_EXPORT -Status ConvertTableToPandas( - const std::shared_ptr
& table, int nthreads, PyObject** out); +Status ConvertTableToPandas(const std::shared_ptr
& table, int nthreads, + PyObject** out); } // namespace py } // namespace arrow diff --git a/cpp/src/arrow/python/builtin_convert.cc b/cpp/src/arrow/python/builtin_convert.cc index a76b6ba25531c..6eaa37fb8ca93 100644 --- a/cpp/src/arrow/python/builtin_convert.cc +++ b/cpp/src/arrow/python/builtin_convert.cc @@ -44,8 +44,8 @@ static inline bool IsPyInteger(PyObject* obj) { #endif } -Status InvalidConversion( - PyObject* obj, const std::string& expected_types, std::ostream* out) { +Status InvalidConversion(PyObject* obj, const std::string& expected_types, + std::ostream* out) { OwnedRef type(PyObject_Type(obj)); RETURN_IF_PYERROR(); DCHECK_NE(type.obj(), nullptr); @@ -161,7 +161,9 @@ class SeqVisitor { // co-recursive with VisitElem Status Visit(PyObject* obj, int level = 0) { - if (level > max_nesting_level_) { max_nesting_level_ = level; } + if (level > max_nesting_level_) { + max_nesting_level_ = level; + } // Loop through either a sequence or an iterator. if (PySequence_Check(obj)) { Py_ssize_t size = PySequence_Size(obj); @@ -226,7 +228,9 @@ class SeqVisitor { int max_observed_level() const { int result = 0; for (int i = 0; i < MAX_NESTING_LEVELS; ++i) { - if (nesting_histogram_[i] > 0) { result = i; } + if (nesting_histogram_[i] > 0) { + result = i; + } } return result; } @@ -235,7 +239,9 @@ class SeqVisitor { int num_nesting_levels() const { int result = 0; for (int i = 0; i < MAX_NESTING_LEVELS; ++i) { - if (nesting_histogram_[i] > 0) { ++result; } + if (nesting_histogram_[i] > 0) { + ++result; + } } return result; } @@ -300,13 +306,15 @@ Status InferArrowType(PyObject* obj, std::shared_ptr* out_type) { RETURN_NOT_OK(seq_visitor.Validate()); *out_type = seq_visitor.GetType(); - if (*out_type == nullptr) { return Status::TypeError("Unable to determine data type"); } + if (*out_type == nullptr) { + return Status::TypeError("Unable to determine data type"); + } return Status::OK(); } -Status InferArrowTypeAndSize( - PyObject* obj, int64_t* size, 
std::shared_ptr* out_type) { +Status InferArrowTypeAndSize(PyObject* obj, int64_t* size, + std::shared_ptr* out_type) { RETURN_NOT_OK(InferArrowSize(obj, size)); // For 0-length sequences, refuse to guess @@ -372,7 +380,9 @@ class TypedConverterVisitor : public TypedConverter { RETURN_NOT_OK(static_cast(this)->AppendItem(ref)); ++i; } - if (size != i) { RETURN_NOT_OK(this->typed_builder_->Resize(i)); } + if (size != i) { + RETURN_NOT_OK(this->typed_builder_->Resize(i)); + } } else { return Status::TypeError("Object is not a sequence or iterable"); } @@ -487,8 +497,9 @@ class FixedWidthBytesConverter inline Status AppendItem(const OwnedRef& item) { PyObject* bytes_obj; OwnedRef tmp; - Py_ssize_t expected_length = std::dynamic_pointer_cast( - typed_builder_->type())->byte_width(); + Py_ssize_t expected_length = + std::dynamic_pointer_cast(typed_builder_->type()) + ->byte_width(); if (item.obj() == Py_None) { RETURN_NOT_OK(typed_builder_->AppendNull()); return Status::OK(); @@ -636,7 +647,7 @@ Status ListConverter::Init(ArrayBuilder* builder) { } Status AppendPySequence(PyObject* obj, int64_t size, - const std::shared_ptr& type, ArrayBuilder* builder) { + const std::shared_ptr& type, ArrayBuilder* builder) { PyDateTime_IMPORT; std::shared_ptr converter = GetConverter(type); if (converter == nullptr) { @@ -656,7 +667,7 @@ Status ConvertPySequence(PyObject* obj, MemoryPool* pool, std::shared_ptr } Status ConvertPySequence(PyObject* obj, MemoryPool* pool, std::shared_ptr* out, - const std::shared_ptr& type, int64_t size) { + const std::shared_ptr& type, int64_t size) { // Handle NA / NullType case if (type->id() == Type::NA) { out->reset(new NullArray(size)); @@ -671,7 +682,7 @@ Status ConvertPySequence(PyObject* obj, MemoryPool* pool, std::shared_ptr } Status ConvertPySequence(PyObject* obj, MemoryPool* pool, std::shared_ptr* out, - const std::shared_ptr& type) { + const std::shared_ptr& type) { int64_t size; RETURN_NOT_OK(InferArrowSize(obj, &size)); return 
ConvertPySequence(obj, pool, out, type, size); diff --git a/cpp/src/arrow/python/builtin_convert.h b/cpp/src/arrow/python/builtin_convert.h index 4f84fbb7caca9..cde7a1bd4cfdc 100644 --- a/cpp/src/arrow/python/builtin_convert.h +++ b/cpp/src/arrow/python/builtin_convert.h @@ -39,14 +39,15 @@ class Status; namespace py { -ARROW_EXPORT arrow::Status InferArrowType( - PyObject* obj, std::shared_ptr* out_type); +ARROW_EXPORT arrow::Status InferArrowType(PyObject* obj, + std::shared_ptr* out_type); ARROW_EXPORT arrow::Status InferArrowTypeAndSize( PyObject* obj, int64_t* size, std::shared_ptr* out_type); ARROW_EXPORT arrow::Status InferArrowSize(PyObject* obj, int64_t* size); ARROW_EXPORT arrow::Status AppendPySequence(PyObject* obj, int64_t size, - const std::shared_ptr& type, arrow::ArrayBuilder* builder); + const std::shared_ptr& type, + arrow::ArrayBuilder* builder); // Type and size inference ARROW_EXPORT @@ -55,19 +56,19 @@ Status ConvertPySequence(PyObject* obj, MemoryPool* pool, std::shared_ptr // Size inference ARROW_EXPORT Status ConvertPySequence(PyObject* obj, MemoryPool* pool, std::shared_ptr* out, - const std::shared_ptr& type); + const std::shared_ptr& type); // No inference ARROW_EXPORT Status ConvertPySequence(PyObject* obj, MemoryPool* pool, std::shared_ptr* out, - const std::shared_ptr& type, int64_t size); + const std::shared_ptr& type, int64_t size); ARROW_EXPORT -Status InvalidConversion( - PyObject* obj, const std::string& expected_type_name, std::ostream* out); +Status InvalidConversion(PyObject* obj, const std::string& expected_type_name, + std::ostream* out); -ARROW_EXPORT Status CheckPythonBytesAreFixedLength( - PyObject* obj, Py_ssize_t expected_length); +ARROW_EXPORT Status CheckPythonBytesAreFixedLength(PyObject* obj, + Py_ssize_t expected_length); } // namespace py } // namespace arrow diff --git a/cpp/src/arrow/python/config.cc b/cpp/src/arrow/python/config.cc index 3cec7c41a2f31..92ca9db9cc391 100644 --- a/cpp/src/arrow/python/config.cc 
+++ b/cpp/src/arrow/python/config.cc @@ -16,8 +16,6 @@ // under the License. #include "arrow/python/platform.h" -#include - #include "arrow/python/config.h" namespace arrow { diff --git a/cpp/src/arrow/python/helpers.cc b/cpp/src/arrow/python/helpers.cc index 76ec3a1ba8746..164e42e52e48e 100644 --- a/cpp/src/arrow/python/helpers.cc +++ b/cpp/src/arrow/python/helpers.cc @@ -89,8 +89,8 @@ Status PythonDecimalToString(PyObject* python_decimal, std::string* out) { return Status::OK(); } -Status InferDecimalPrecisionAndScale( - PyObject* python_decimal, int* precision, int* scale) { +Status InferDecimalPrecisionAndScale(PyObject* python_decimal, int* precision, + int* scale) { // Call Python's str(decimal_object) OwnedRef str_obj(PyObject_Str(python_decimal)); RETURN_IF_PYERROR(); @@ -102,12 +102,12 @@ Status InferDecimalPrecisionAndScale( auto size = str.size; std::string c_string(bytes, size); - return FromString( - c_string, static_cast(nullptr), precision, scale); + return FromString(c_string, static_cast(nullptr), precision, + scale); } -Status DecimalFromString( - PyObject* decimal_constructor, const std::string& decimal_string, PyObject** out) { +Status DecimalFromString(PyObject* decimal_constructor, const std::string& decimal_string, + PyObject** out) { DCHECK_NE(decimal_constructor, nullptr); DCHECK_NE(out, nullptr); @@ -117,8 +117,8 @@ Status DecimalFromString( auto string_bytes = decimal_string.c_str(); DCHECK_NE(string_bytes, nullptr); - *out = PyObject_CallFunction( - decimal_constructor, const_cast("s#"), string_bytes, string_size); + *out = PyObject_CallFunction(decimal_constructor, const_cast("s#"), string_bytes, + string_size); RETURN_IF_PYERROR(); return Status::OK(); } diff --git a/cpp/src/arrow/python/helpers.h b/cpp/src/arrow/python/helpers.h index e0656699ce4c2..8b8c6673c8ebb 100644 --- a/cpp/src/arrow/python/helpers.h +++ b/cpp/src/arrow/python/helpers.h @@ -36,16 +36,17 @@ class OwnedRef; ARROW_EXPORT std::shared_ptr GetPrimitiveType(Type::type 
type); Status ARROW_EXPORT ImportModule(const std::string& module_name, OwnedRef* ref); -Status ARROW_EXPORT ImportFromModule( - const OwnedRef& module, const std::string& module_name, OwnedRef* ref); +Status ARROW_EXPORT ImportFromModule(const OwnedRef& module, + const std::string& module_name, OwnedRef* ref); Status ARROW_EXPORT PythonDecimalToString(PyObject* python_decimal, std::string* out); -Status ARROW_EXPORT InferDecimalPrecisionAndScale( - PyObject* python_decimal, int* precision = nullptr, int* scale = nullptr); +Status ARROW_EXPORT InferDecimalPrecisionAndScale(PyObject* python_decimal, + int* precision = nullptr, + int* scale = nullptr); -Status ARROW_EXPORT DecimalFromString( - PyObject* decimal_constructor, const std::string& decimal_string, PyObject** out); +Status ARROW_EXPORT DecimalFromString(PyObject* decimal_constructor, + const std::string& decimal_string, PyObject** out); } // namespace py } // namespace arrow diff --git a/cpp/src/arrow/python/init.cc b/cpp/src/arrow/python/init.cc index db648915465a8..dba293bbe2366 100644 --- a/cpp/src/arrow/python/init.cc +++ b/cpp/src/arrow/python/init.cc @@ -21,6 +21,4 @@ #include "arrow/python/init.h" #include "arrow/python/numpy_interop.h" -int arrow_init_numpy() { - return arrow::py::import_numpy(); -} +int arrow_init_numpy() { return arrow::py::import_numpy(); } diff --git a/cpp/src/arrow/python/io.cc b/cpp/src/arrow/python/io.cc index a7193854c4d01..4c73fd6401cb6 100644 --- a/cpp/src/arrow/python/io.cc +++ b/cpp/src/arrow/python/io.cc @@ -33,23 +33,19 @@ namespace py { // ---------------------------------------------------------------------- // Python file -PythonFile::PythonFile(PyObject* file) : file_(file) { - Py_INCREF(file_); -} +PythonFile::PythonFile(PyObject* file) : file_(file) { Py_INCREF(file_); } -PythonFile::~PythonFile() { - Py_DECREF(file_); -} +PythonFile::~PythonFile() { Py_DECREF(file_); } // This is annoying: because C++11 does not allow implicit conversion of string // literals to 
non-const char*, we need to go through some gymnastics to use // PyObject_CallMethod without a lot of pain (its arguments are non-const // char*) template -static inline PyObject* cpp_PyObject_CallMethod( - PyObject* obj, const char* method_name, const char* argspec, ArgTypes... args) { - return PyObject_CallMethod( - obj, const_cast(method_name), const_cast(argspec), args...); +static inline PyObject* cpp_PyObject_CallMethod(PyObject* obj, const char* method_name, + const char* argspec, ArgTypes... args) { + return PyObject_CallMethod(obj, const_cast(method_name), + const_cast(argspec), args...); } Status PythonFile::Close() { @@ -103,9 +99,7 @@ Status PythonFile::Tell(int64_t* position) { // ---------------------------------------------------------------------- // Seekable input stream -PyReadableFile::PyReadableFile(PyObject* file) { - file_.reset(new PythonFile(file)); -} +PyReadableFile::PyReadableFile(PyObject* file) { file_.reset(new PythonFile(file)); } PyReadableFile::~PyReadableFile() {} @@ -167,9 +161,7 @@ Status PyReadableFile::GetSize(int64_t* size) { return Status::OK(); } -bool PyReadableFile::supports_zero_copy() const { - return false; -} +bool PyReadableFile::supports_zero_copy() const { return false; } // ---------------------------------------------------------------------- // Output stream diff --git a/cpp/src/arrow/python/numpy_convert.cc b/cpp/src/arrow/python/numpy_convert.cc index c391b5d7a1018..95d63b8fecb5b 100644 --- a/cpp/src/arrow/python/numpy_convert.cc +++ b/cpp/src/arrow/python/numpy_convert.cc @@ -38,7 +38,7 @@ namespace py { bool is_contiguous(PyObject* array) { if (PyArray_Check(array)) { return (PyArray_FLAGS(reinterpret_cast(array)) & - (NPY_ARRAY_C_CONTIGUOUS | NPY_ARRAY_F_CONTIGUOUS)) != 0; + (NPY_ARRAY_C_CONTIGUOUS | NPY_ARRAY_F_CONTIGUOUS)) != 0; } else { return false; } @@ -49,8 +49,12 @@ int cast_npy_type_compat(int type_num) { // U/LONGLONG to U/INT64 so things work properly. 
#if (NPY_INT64 == NPY_LONGLONG) && (NPY_SIZEOF_LONGLONG == 8) - if (type_num == NPY_LONGLONG) { type_num = NPY_INT64; } - if (type_num == NPY_ULONGLONG) { type_num = NPY_UINT64; } + if (type_num == NPY_LONGLONG) { + type_num = NPY_INT64; + } + if (type_num == NPY_ULONGLONG) { + type_num = NPY_UINT64; + } #endif return type_num; @@ -66,13 +70,13 @@ NumPyBuffer::NumPyBuffer(PyObject* ao) : Buffer(nullptr, 0) { size_ = PyArray_SIZE(ndarray) * PyArray_DESCR(ndarray)->elsize; capacity_ = size_; - if (PyArray_FLAGS(ndarray) & NPY_ARRAY_WRITEABLE) { is_mutable_ = true; } + if (PyArray_FLAGS(ndarray) & NPY_ARRAY_WRITEABLE) { + is_mutable_ = true; + } } } -NumPyBuffer::~NumPyBuffer() { - Py_XDECREF(arr_); -} +NumPyBuffer::~NumPyBuffer() { Py_XDECREF(arr_); } #define TO_ARROW_TYPE_CASE(NPY_NAME, FACTORY) \ case NPY_##NPY_NAME: \ @@ -198,7 +202,9 @@ Status NumPyDtypeToArrow(PyObject* dtype, std::shared_ptr* out) { #undef TO_ARROW_TYPE_CASE Status NdarrayToTensor(MemoryPool* pool, PyObject* ao, std::shared_ptr* out) { - if (!PyArray_Check(ao)) { return Status::TypeError("Did not pass ndarray object"); } + if (!PyArray_Check(ao)) { + return Status::TypeError("Did not pass ndarray object"); + } PyArrayObject* ndarray = reinterpret_cast(ao); @@ -242,18 +248,27 @@ Status TensorToNdarray(const Tensor& tensor, PyObject* base, PyObject** out) { } const void* immutable_data = nullptr; - if (tensor.data()) { immutable_data = tensor.data()->data(); } + if (tensor.data()) { + immutable_data = tensor.data()->data(); + } // Remove const =( void* mutable_data = const_cast(immutable_data); int array_flags = 0; - if (tensor.is_row_major()) { array_flags |= NPY_ARRAY_C_CONTIGUOUS; } - if (tensor.is_column_major()) { array_flags |= NPY_ARRAY_F_CONTIGUOUS; } - if (tensor.is_mutable()) { array_flags |= NPY_ARRAY_WRITEABLE; } + if (tensor.is_row_major()) { + array_flags |= NPY_ARRAY_C_CONTIGUOUS; + } + if (tensor.is_column_major()) { + array_flags |= NPY_ARRAY_F_CONTIGUOUS; + } + if 
(tensor.is_mutable()) { + array_flags |= NPY_ARRAY_WRITEABLE; + } - PyObject* result = PyArray_NewFromDescr(&PyArray_Type, dtype, tensor.ndim(), - npy_shape.data(), npy_strides.data(), mutable_data, array_flags, nullptr); + PyObject* result = + PyArray_NewFromDescr(&PyArray_Type, dtype, tensor.ndim(), npy_shape.data(), + npy_strides.data(), mutable_data, array_flags, nullptr); RETURN_IF_PYERROR() if (base != Py_None) { diff --git a/cpp/src/arrow/python/numpy_convert.h b/cpp/src/arrow/python/numpy_convert.h index a486646cdec64..7b3b3b7c9a2a0 100644 --- a/cpp/src/arrow/python/numpy_convert.h +++ b/cpp/src/arrow/python/numpy_convert.h @@ -63,8 +63,8 @@ Status GetTensorType(PyObject* dtype, std::shared_ptr* out); ARROW_EXPORT Status GetNumPyType(const DataType& type, int* type_num); -ARROW_EXPORT Status NdarrayToTensor( - MemoryPool* pool, PyObject* ao, std::shared_ptr* out); +ARROW_EXPORT Status NdarrayToTensor(MemoryPool* pool, PyObject* ao, + std::shared_ptr* out); ARROW_EXPORT Status TensorToNdarray(const Tensor& tensor, PyObject* base, PyObject** out); diff --git a/cpp/src/arrow/python/pandas_to_arrow.cc b/cpp/src/arrow/python/pandas_to_arrow.cc index 1368c3605a4e3..be5634b53bbfe 100644 --- a/cpp/src/arrow/python/pandas_to_arrow.cc +++ b/cpp/src/arrow/python/pandas_to_arrow.cc @@ -75,9 +75,7 @@ static inline bool PyObject_is_string(const PyObject* obj) { #endif } -static inline bool PyObject_is_float(const PyObject* obj) { - return PyFloat_Check(obj); -} +static inline bool PyObject_is_float(const PyObject* obj) { return PyFloat_Check(obj); } static inline bool PyObject_is_integer(const PyObject* obj) { return (!PyBool_Check(obj)) && PyArray_IsIntegerScalar(obj); @@ -120,8 +118,8 @@ static int64_t MaskToBitmap(PyArrayObject* mask, int64_t length, uint8_t* bitmap } template -static int64_t ValuesToValidBytes( - const void* data, int64_t length, uint8_t* valid_bytes) { +static int64_t ValuesToValidBytes(const void* data, int64_t length, + uint8_t* valid_bytes) { 
typedef npy_traits traits; typedef typename traits::value_type T; @@ -163,7 +161,8 @@ constexpr int64_t kBinaryMemoryLimit = std::numeric_limits::max(); /// be length of arr if fully consumed /// \param[out] have_bytes true if we encountered any PyBytes object static Status AppendObjectStrings(PyArrayObject* arr, PyArrayObject* mask, int64_t offset, - StringBuilder* builder, int64_t* end_offset, bool* have_bytes) { + StringBuilder* builder, int64_t* end_offset, + bool* have_bytes) { PyObject* obj; Ndarray1DIndexer objects(arr); @@ -210,8 +209,9 @@ static Status AppendObjectStrings(PyArrayObject* arr, PyArrayObject* mask, int64 } static Status AppendObjectFixedWidthBytes(PyArrayObject* arr, PyArrayObject* mask, - int byte_width, int64_t offset, FixedSizeBinaryBuilder* builder, - int64_t* end_offset) { + int byte_width, int64_t offset, + FixedSizeBinaryBuilder* builder, + int64_t* end_offset) { PyObject* obj; Ndarray1DIndexer objects(arr); @@ -245,8 +245,8 @@ static Status AppendObjectFixedWidthBytes(PyArrayObject* arr, PyArrayObject* mas } RETURN_NOT_OK(CheckPythonBytesAreFixedLength(obj, byte_width)); - if (ARROW_PREDICT_FALSE( - builder->value_data_length() + byte_width > kBinaryMemoryLimit)) { + if (ARROW_PREDICT_FALSE(builder->value_data_length() + byte_width > + kBinaryMemoryLimit)) { break; } RETURN_NOT_OK( @@ -263,13 +263,15 @@ static Status AppendObjectFixedWidthBytes(PyArrayObject* arr, PyArrayObject* mas class PandasConverter { public: - PandasConverter( - MemoryPool* pool, PyObject* ao, PyObject* mo, const std::shared_ptr& type) + PandasConverter(MemoryPool* pool, PyObject* ao, PyObject* mo, + const std::shared_ptr& type) : pool_(pool), type_(type), arr_(reinterpret_cast(ao)), mask_(nullptr) { - if (mo != nullptr && mo != Py_None) { mask_ = reinterpret_cast(mo); } + if (mo != nullptr && mo != Py_None) { + mask_ = reinterpret_cast(mo); + } length_ = static_cast(PyArray_SIZE(arr_)); } @@ -315,7 +317,9 @@ class PandasConverter { Status VisitNative() { using 
traits = arrow_traits; - if (mask_ != nullptr || traits::supports_nulls) { RETURN_NOT_OK(InitNullBitmap()); } + if (mask_ != nullptr || traits::supports_nulls) { + RETURN_NOT_OK(InitNullBitmap()); + } std::shared_ptr data; RETURN_NOT_OK(ConvertData(&data)); @@ -337,7 +341,7 @@ class PandasConverter { template typename std::enable_if::value || std::is_same::value, - Status>::type + Status>::type Visit(const T& type) { return VisitNative(); } @@ -373,7 +377,9 @@ class PandasConverter { return Status::Invalid("only handle 1-dimensional arrays"); } - if (type_ == nullptr) { return Status::Invalid("Must pass data type"); } + if (type_ == nullptr) { + return Status::Invalid("Must pass data type"); + } // Visit the type to perform conversion return VisitTypeInline(*type_, this); @@ -385,8 +391,8 @@ class PandasConverter { // Conversion logic for various object dtype arrays template - Status ConvertTypedLists( - const std::shared_ptr& type, ListBuilder* builder, PyObject* list); + Status ConvertTypedLists(const std::shared_ptr& type, ListBuilder* builder, + PyObject* list); template Status ConvertDates(); @@ -397,8 +403,8 @@ class PandasConverter { Status ConvertObjectFixedWidthBytes(const std::shared_ptr& type); Status ConvertObjectIntegers(); Status ConvertLists(const std::shared_ptr& type); - Status ConvertLists( - const std::shared_ptr& type, ListBuilder* builder, PyObject* list); + Status ConvertLists(const std::shared_ptr& type, ListBuilder* builder, + PyObject* list); Status ConvertObjects(); Status ConvertDecimals(); Status ConvertTimes(); @@ -428,12 +434,14 @@ void CopyStrided(T* input_data, int64_t length, int64_t stride, T* output_data) } template <> -void CopyStrided( - PyObject** input_data, int64_t length, int64_t stride, PyObject** output_data) { +void CopyStrided(PyObject** input_data, int64_t length, int64_t stride, + PyObject** output_data) { int64_t j = 0; for (int64_t i = 0; i < length; ++i) { output_data[i] = input_data[j]; - if (output_data[i] != 
nullptr) { Py_INCREF(output_data[i]); } + if (output_data[i] != nullptr) { + Py_INCREF(output_data[i]); + } j += stride; } } @@ -458,7 +466,7 @@ inline Status PandasConverter::ConvertData(std::shared_ptr* data) { auto new_buffer = std::make_shared(pool_); RETURN_NOT_OK(new_buffer->Resize(sizeof(T) * length_)); CopyStrided(reinterpret_cast(PyArray_DATA(arr_)), length_, stride_elements, - reinterpret_cast(new_buffer->mutable_data())); + reinterpret_cast(new_buffer->mutable_data())); *data = new_buffer; } else { // Can zero-copy @@ -479,7 +487,9 @@ inline Status PandasConverter::ConvertData(std::shared_ptr* memset(bitmap, 0, nbytes); for (int i = 0; i < length_; ++i) { - if (values[i] > 0) { BitUtil::SetBit(bitmap, i); } + if (values[i] > 0) { + BitUtil::SetBit(bitmap, i); + } } *data = buffer; @@ -913,8 +923,8 @@ Status LoopPySequence(PyObject* sequence, T func) { } template -inline Status PandasConverter::ConvertTypedLists( - const std::shared_ptr& type, ListBuilder* builder, PyObject* list) { +inline Status PandasConverter::ConvertTypedLists(const std::shared_ptr& type, + ListBuilder* builder, PyObject* list) { typedef npy_traits traits; typedef typename traits::value_type T; typedef typename traits::BuilderClass BuilderT; @@ -1002,8 +1012,8 @@ inline Status PandasConverter::ConvertTypedLists( RETURN_NOT_OK(CheckFlatNumpyArray(numpy_array, NPY_OBJECT)); int64_t offset = 0; - RETURN_NOT_OK(AppendObjectStrings( - numpy_array, nullptr, 0, value_builder, &offset, &have_bytes)); + RETURN_NOT_OK(AppendObjectStrings(numpy_array, nullptr, 0, value_builder, &offset, + &have_bytes)); if (offset < PyArray_SIZE(numpy_array)) { return Status::Invalid("Array cell value exceeded 2GB"); } @@ -1032,8 +1042,8 @@ inline Status PandasConverter::ConvertTypedLists( return ConvertTypedLists(type, builder, list); \ } -Status PandasConverter::ConvertLists( - const std::shared_ptr& type, ListBuilder* builder, PyObject* list) { +Status PandasConverter::ConvertLists(const std::shared_ptr& 
type, + ListBuilder* builder, PyObject* list) { switch (type->id()) { LIST_CASE(UINT8, NPY_UINT8, UInt8Type) LIST_CASE(INT8, NPY_INT8, Int8Type) @@ -1080,7 +1090,7 @@ Status PandasConverter::ConvertLists(const std::shared_ptr& type) { } Status PandasToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, - const std::shared_ptr& type, std::shared_ptr* out) { + const std::shared_ptr& type, std::shared_ptr* out) { PandasConverter converter(pool, ao, mo, type); RETURN_NOT_OK(converter.Convert()); *out = converter.result()[0]; @@ -1088,7 +1098,8 @@ Status PandasToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, } Status PandasObjectsToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, - const std::shared_ptr& type, std::shared_ptr* out) { + const std::shared_ptr& type, + std::shared_ptr* out) { PandasConverter converter(pool, ao, mo, type); RETURN_NOT_OK(converter.ConvertObjects()); *out = std::make_shared(converter.result()); diff --git a/cpp/src/arrow/python/pandas_to_arrow.h b/cpp/src/arrow/python/pandas_to_arrow.h index 8f1862470bc94..3e655ba3feec0 100644 --- a/cpp/src/arrow/python/pandas_to_arrow.h +++ b/cpp/src/arrow/python/pandas_to_arrow.h @@ -38,7 +38,7 @@ namespace py { ARROW_EXPORT Status PandasToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, - const std::shared_ptr& type, std::shared_ptr* out); + const std::shared_ptr& type, std::shared_ptr* out); /// Convert dtype=object arrays. 
If target data type is not known, pass a type /// with nullptr @@ -50,7 +50,8 @@ Status PandasToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, /// \param[out] out a ChunkedArray, to accommodate chunked output ARROW_EXPORT Status PandasObjectsToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, - const std::shared_ptr& type, std::shared_ptr* out); + const std::shared_ptr& type, + std::shared_ptr* out); } // namespace py } // namespace arrow diff --git a/cpp/src/arrow/python/platform.h b/cpp/src/arrow/python/platform.h index a354b38f04cea..ae394695fac0d 100644 --- a/cpp/src/arrow/python/platform.h +++ b/cpp/src/arrow/python/platform.h @@ -23,6 +23,7 @@ #include #include +#include // Work around C2528 error #if _MSC_VER >= 1900 diff --git a/cpp/src/arrow/python/pyarrow.cc b/cpp/src/arrow/python/pyarrow.cc index 5d88051117b78..d080cc0a8147f 100644 --- a/cpp/src/arrow/python/pyarrow.cc +++ b/cpp/src/arrow/python/pyarrow.cc @@ -31,13 +31,9 @@ namespace { namespace arrow { namespace py { -int import_pyarrow() { - return ::import_pyarrow__lib(); -} +int import_pyarrow() { return ::import_pyarrow__lib(); } -bool is_buffer(PyObject* buffer) { - return ::pyarrow_is_buffer(buffer) != 0; -} +bool is_buffer(PyObject* buffer) { return ::pyarrow_is_buffer(buffer) != 0; } Status unwrap_buffer(PyObject* buffer, std::shared_ptr* out) { *out = ::pyarrow_unwrap_buffer(buffer); @@ -52,9 +48,7 @@ PyObject* wrap_buffer(const std::shared_ptr& buffer) { return ::pyarrow_wrap_buffer(buffer); } -bool is_data_type(PyObject* data_type) { - return ::pyarrow_is_data_type(data_type) != 0; -} +bool is_data_type(PyObject* data_type) { return ::pyarrow_is_data_type(data_type) != 0; } Status unwrap_data_type(PyObject* object, std::shared_ptr* out) { *out = ::pyarrow_unwrap_data_type(object); @@ -69,9 +63,7 @@ PyObject* wrap_data_type(const std::shared_ptr& type) { return ::pyarrow_wrap_data_type(type); } -bool is_field(PyObject* field) { - return ::pyarrow_is_field(field) != 0; -} +bool 
is_field(PyObject* field) { return ::pyarrow_is_field(field) != 0; } Status unwrap_field(PyObject* field, std::shared_ptr* out) { *out = ::pyarrow_unwrap_field(field); @@ -86,9 +78,7 @@ PyObject* wrap_field(const std::shared_ptr& field) { return ::pyarrow_wrap_field(field); } -bool is_schema(PyObject* schema) { - return ::pyarrow_is_schema(schema) != 0; -} +bool is_schema(PyObject* schema) { return ::pyarrow_is_schema(schema) != 0; } Status unwrap_schema(PyObject* schema, std::shared_ptr* out) { *out = ::pyarrow_unwrap_schema(schema); @@ -103,9 +93,7 @@ PyObject* wrap_schema(const std::shared_ptr& schema) { return ::pyarrow_wrap_schema(schema); } -bool is_array(PyObject* array) { - return ::pyarrow_is_array(array) != 0; -} +bool is_array(PyObject* array) { return ::pyarrow_is_array(array) != 0; } Status unwrap_array(PyObject* array, std::shared_ptr* out) { *out = ::pyarrow_unwrap_array(array); @@ -120,9 +108,7 @@ PyObject* wrap_array(const std::shared_ptr& array) { return ::pyarrow_wrap_array(array); } -bool is_tensor(PyObject* tensor) { - return ::pyarrow_is_tensor(tensor) != 0; -} +bool is_tensor(PyObject* tensor) { return ::pyarrow_is_tensor(tensor) != 0; } Status unwrap_tensor(PyObject* tensor, std::shared_ptr* out) { *out = ::pyarrow_unwrap_tensor(tensor); @@ -137,9 +123,7 @@ PyObject* wrap_tensor(const std::shared_ptr& tensor) { return ::pyarrow_wrap_tensor(tensor); } -bool is_column(PyObject* column) { - return ::pyarrow_is_column(column) != 0; -} +bool is_column(PyObject* column) { return ::pyarrow_is_column(column) != 0; } Status unwrap_column(PyObject* column, std::shared_ptr* out) { *out = ::pyarrow_unwrap_column(column); @@ -154,9 +138,7 @@ PyObject* wrap_column(const std::shared_ptr& column) { return ::pyarrow_wrap_column(column); } -bool is_table(PyObject* table) { - return ::pyarrow_is_table(table) != 0; -} +bool is_table(PyObject* table) { return ::pyarrow_is_table(table) != 0; } Status unwrap_table(PyObject* table, std::shared_ptr
* out) { *out = ::pyarrow_unwrap_table(table); @@ -171,9 +153,7 @@ PyObject* wrap_table(const std::shared_ptr
& table) { return ::pyarrow_wrap_table(table); } -bool is_record_batch(PyObject* batch) { - return ::pyarrow_is_batch(batch) != 0; -} +bool is_record_batch(PyObject* batch) { return ::pyarrow_is_batch(batch) != 0; } Status unwrap_record_batch(PyObject* batch, std::shared_ptr* out) { *out = ::pyarrow_unwrap_batch(batch); diff --git a/cpp/src/arrow/python/pyarrow.h b/cpp/src/arrow/python/pyarrow.h index 7278d1c285785..e637627006177 100644 --- a/cpp/src/arrow/python/pyarrow.h +++ b/cpp/src/arrow/python/pyarrow.h @@ -74,8 +74,8 @@ ARROW_EXPORT Status unwrap_table(PyObject* table, std::shared_ptr
* out); ARROW_EXPORT PyObject* wrap_table(const std::shared_ptr
& table); ARROW_EXPORT bool is_record_batch(PyObject* batch); -ARROW_EXPORT Status unwrap_record_batch( - PyObject* batch, std::shared_ptr* out); +ARROW_EXPORT Status unwrap_record_batch(PyObject* batch, + std::shared_ptr* out); ARROW_EXPORT PyObject* wrap_record_batch(const std::shared_ptr& batch); } // namespace py diff --git a/cpp/src/arrow/python/python-test.cc b/cpp/src/arrow/python/python-test.cc index c0e555d4904d5..b50699d1ae9d4 100644 --- a/cpp/src/arrow/python/python-test.cc +++ b/cpp/src/arrow/python/python-test.cc @@ -36,9 +36,7 @@ namespace arrow { namespace py { -TEST(PyBuffer, InvalidInputObject) { - PyBuffer buffer(Py_None); -} +TEST(PyBuffer, InvalidInputObject) { PyBuffer buffer(Py_None); } TEST(DecimalTest, TestPythonDecimalToString) { PyAcquireGIL lock; @@ -58,8 +56,8 @@ TEST(DecimalTest, TestPythonDecimalToString) { auto c_string_size = decimal_string.size(); ASSERT_GT(c_string_size, 0); - OwnedRef pydecimal(PyObject_CallFunction( - Decimal.obj(), const_cast(format), c_string, c_string_size)); + OwnedRef pydecimal(PyObject_CallFunction(Decimal.obj(), const_cast(format), + c_string, c_string_size)); ASSERT_NE(pydecimal.obj(), nullptr); ASSERT_EQ(PyErr_Occurred(), nullptr); @@ -88,7 +86,8 @@ TEST(PandasConversionTest, TestObjectBlockWriteFails) { auto f3 = field("f3", utf8()); std::vector> fields = {f1, f2, f3}; std::vector> cols = {std::make_shared(f1, arr), - std::make_shared(f2, arr), std::make_shared(f3, arr)}; + std::make_shared(f2, arr), + std::make_shared(f3, arr)}; auto schema = std::make_shared(fields); auto table = std::make_shared
(schema, cols); diff --git a/cpp/src/arrow/python/util/datetime.h b/cpp/src/arrow/python/util/datetime.h index d32421e8e3652..de7515101518a 100644 --- a/cpp/src/arrow/python/util/datetime.h +++ b/cpp/src/arrow/python/util/datetime.h @@ -18,8 +18,8 @@ #ifndef PYARROW_UTIL_DATETIME_H #define PYARROW_UTIL_DATETIME_H -#include "arrow/python/platform.h" #include +#include "arrow/python/platform.h" namespace arrow { namespace py { @@ -31,8 +31,8 @@ static inline int64_t PyTime_to_us(PyObject* pytime) { PyDateTime_TIME_GET_MICROSECOND(pytime)); } -static inline Status PyTime_from_int( - int64_t val, const TimeUnit::type unit, PyObject** out) { +static inline Status PyTime_from_int(int64_t val, const TimeUnit::type unit, + PyObject** out) { int64_t hour = 0, minute = 0, second = 0, microsecond = 0; switch (unit) { case TimeUnit::NANO: @@ -65,7 +65,7 @@ static inline Status PyTime_from_int( break; } *out = PyTime_FromTime(static_cast(hour), static_cast(minute), - static_cast(second), static_cast(microsecond)); + static_cast(second), static_cast(microsecond)); return Status::OK(); } diff --git a/cpp/src/arrow/status.cc b/cpp/src/arrow/status.cc index 99897428eaed3..9b509b4835126 100644 --- a/cpp/src/arrow/status.cc +++ b/cpp/src/arrow/status.cc @@ -33,7 +33,9 @@ void Status::CopyFrom(const State* state) { } std::string Status::CodeAsString() const { - if (state_ == NULL) { return "OK"; } + if (state_ == NULL) { + return "OK"; + } const char* type; switch (code()) { @@ -70,7 +72,9 @@ std::string Status::CodeAsString() const { std::string Status::ToString() const { std::string result(CodeAsString()); - if (state_ == NULL) { return result; } + if (state_ == NULL) { + return result; + } result += ": "; result += state_->msg; return result; diff --git a/cpp/src/arrow/status.h b/cpp/src/arrow/status.h index 1bea1fca84ebb..a02752f21e4b9 100644 --- a/cpp/src/arrow/status.h +++ b/cpp/src/arrow/status.h @@ -23,10 +23,12 @@ #include "arrow/util/visibility.h" // Return the given status 
if it is not OK. -#define ARROW_RETURN_NOT_OK(s) \ - do { \ - ::arrow::Status _s = (s); \ - if (ARROW_PREDICT_FALSE(!_s.ok())) { return _s; } \ +#define ARROW_RETURN_NOT_OK(s) \ + do { \ + ::arrow::Status _s = (s); \ + if (ARROW_PREDICT_FALSE(!_s.ok())) { \ + return _s; \ + } \ } while (0) // If 'to_call' returns a bad status, CHECK immediately with a logged message @@ -43,10 +45,12 @@ namespace arrow { -#define RETURN_NOT_OK(s) \ - do { \ - Status _s = (s); \ - if (ARROW_PREDICT_FALSE(!_s.ok())) { return _s; } \ +#define RETURN_NOT_OK(s) \ + do { \ + Status _s = (s); \ + if (ARROW_PREDICT_FALSE(!_s.ok())) { \ + return _s; \ + } \ } while (0) #define RETURN_NOT_OK_ELSE(s, else_) \ @@ -187,7 +191,9 @@ inline Status::Status(const Status& s) inline void Status::operator=(const Status& s) { // The following condition catches both aliasing (when this == &s), // and the common case where both s and *this are ok. - if (state_ != s.state_) { CopyFrom(s.state_); } + if (state_ != s.state_) { + CopyFrom(s.state_); + } } } // namespace arrow diff --git a/cpp/src/arrow/table-test.cc b/cpp/src/arrow/table-test.cc index e46fdc77cf761..8dba8c052e922 100644 --- a/cpp/src/arrow/table-test.cc +++ b/cpp/src/arrow/table-test.cc @@ -198,11 +198,11 @@ class TestTable : public TestBase { schema_ = std::make_shared(fields); arrays_ = {MakePrimitive(length), MakePrimitive(length), - MakePrimitive(length)}; + MakePrimitive(length)}; columns_ = {std::make_shared(schema_->field(0), arrays_[0]), - std::make_shared(schema_->field(1), arrays_[1]), - std::make_shared(schema_->field(2), arrays_[2])}; + std::make_shared(schema_->field(1), arrays_[1]), + std::make_shared(schema_->field(2), arrays_[2])}; } protected: @@ -412,8 +412,8 @@ TEST_F(TestTable, AddColumn) { ASSERT_OK(table.AddColumn(0, columns_[0], &result)); auto ex_schema = std::shared_ptr(new Schema( {schema_->field(0), schema_->field(0), schema_->field(1), schema_->field(2)})); - std::vector> ex_columns = { - table.column(0), 
table.column(0), table.column(1), table.column(2)}; + std::vector> ex_columns = {table.column(0), table.column(0), + table.column(1), table.column(2)}; ASSERT_TRUE(result->Equals(Table(ex_schema, ex_columns))); ASSERT_OK(table.AddColumn(1, columns_[0], &result)); diff --git a/cpp/src/arrow/table.cc b/cpp/src/arrow/table.cc index c09628ed395c4..a0a25079e6ed7 100644 --- a/cpp/src/arrow/table.cc +++ b/cpp/src/arrow/table.cc @@ -43,8 +43,12 @@ ChunkedArray::ChunkedArray(const ArrayVector& chunks) : chunks_(chunks) { } bool ChunkedArray::Equals(const ChunkedArray& other) const { - if (length_ != other.length()) { return false; } - if (null_count_ != other.null_count()) { return false; } + if (length_ != other.length()) { + return false; + } + if (null_count_ != other.null_count()) { + return false; + } // Check contents of the underlying arrays. This checks for equality of // the underlying data independently of the chunk size. @@ -57,10 +61,10 @@ bool ChunkedArray::Equals(const ChunkedArray& other) const { while (elements_compared < length_) { const std::shared_ptr this_array = chunks_[this_chunk_idx]; const std::shared_ptr other_array = other.chunk(other_chunk_idx); - int64_t common_length = std::min( - this_array->length() - this_start_idx, other_array->length() - other_start_idx); + int64_t common_length = std::min(this_array->length() - this_start_idx, + other_array->length() - other_start_idx); if (!this_array->RangeEquals(this_start_idx, this_start_idx + common_length, - other_start_idx, other_array)) { + other_start_idx, other_array)) { return false; } @@ -85,8 +89,12 @@ bool ChunkedArray::Equals(const ChunkedArray& other) const { } bool ChunkedArray::Equals(const std::shared_ptr& other) const { - if (this == other.get()) { return true; } - if (!other) { return false; } + if (this == other.get()) { + return true; + } + if (!other) { + return false; + } return Equals(*other.get()); } @@ -107,18 +115,24 @@ Column::Column(const std::shared_ptr& field, const 
std::shared_ptr Column::Column(const std::string& name, const std::shared_ptr& data) : Column(::arrow::field(name, data->type()), data) {} -Column::Column( - const std::shared_ptr& field, const std::shared_ptr& data) +Column::Column(const std::shared_ptr& field, + const std::shared_ptr& data) : field_(field), data_(data) {} bool Column::Equals(const Column& other) const { - if (!field_->Equals(other.field())) { return false; } + if (!field_->Equals(other.field())) { + return false; + } return data_->Equals(other.data()); } bool Column::Equals(const std::shared_ptr& other) const { - if (this == other.get()) { return true; } - if (!other) { return false; } + if (this == other.get()) { + return true; + } + if (!other) { + return false; + } return Equals(*other.get()); } @@ -141,11 +155,13 @@ Status Column::ValidateData() { void AssertBatchValid(const RecordBatch& batch) { Status s = batch.Validate(); - if (!s.ok()) { DCHECK(false) << s.ToString(); } + if (!s.ok()) { + DCHECK(false) << s.ToString(); + } } RecordBatch::RecordBatch(const std::shared_ptr& schema, int64_t num_rows, - const std::vector>& columns) + const std::vector>& columns) : schema_(schema), num_rows_(num_rows), columns_(columns.size()) { for (size_t i = 0; i < columns.size(); ++i) { columns_[i] = columns[i]->data(); @@ -153,7 +169,7 @@ RecordBatch::RecordBatch(const std::shared_ptr& schema, int64_t num_rows } RecordBatch::RecordBatch(const std::shared_ptr& schema, int64_t num_rows, - std::vector>&& columns) + std::vector>&& columns) : schema_(schema), num_rows_(num_rows), columns_(columns.size()) { for (size_t i = 0; i < columns.size(); ++i) { columns_[i] = columns[i]->data(); @@ -161,11 +177,11 @@ RecordBatch::RecordBatch(const std::shared_ptr& schema, int64_t num_rows } RecordBatch::RecordBatch(const std::shared_ptr& schema, int64_t num_rows, - std::vector>&& columns) + std::vector>&& columns) : schema_(schema), num_rows_(num_rows), columns_(std::move(columns)) {} RecordBatch::RecordBatch(const 
std::shared_ptr& schema, int64_t num_rows, - const std::vector>& columns) + const std::vector>& columns) : schema_(schema), num_rows_(num_rows), columns_(columns) {} std::shared_ptr RecordBatch::column(int i) const { @@ -184,7 +200,9 @@ bool RecordBatch::Equals(const RecordBatch& other) const { } for (int i = 0; i < num_columns(); ++i) { - if (!column(i)->Equals(other.column(i))) { return false; } + if (!column(i)->Equals(other.column(i))) { + return false; + } } return true; @@ -196,7 +214,9 @@ bool RecordBatch::ApproxEquals(const RecordBatch& other) const { } for (int i = 0; i < num_columns(); ++i) { - if (!column(i)->ApproxEquals(other.column(i))) { return false; } + if (!column(i)->ApproxEquals(other.column(i))) { + return false; + } } return true; @@ -253,7 +273,7 @@ Status RecordBatch::Validate() const { // Table methods Table::Table(const std::shared_ptr& schema, - const std::vector>& columns) + const std::vector>& columns) : schema_(schema), columns_(columns) { if (columns.size() == 0) { num_rows_ = 0; @@ -263,7 +283,7 @@ Table::Table(const std::shared_ptr& schema, } Table::Table(const std::shared_ptr& schema, - const std::vector>& columns, int64_t num_rows) + const std::vector>& columns, int64_t num_rows) : schema_(schema), columns_(columns), num_rows_(num_rows) {} std::shared_ptr
Table::ReplaceSchemaMetadata( @@ -273,7 +293,7 @@ std::shared_ptr
Table::ReplaceSchemaMetadata( } Status Table::FromRecordBatches(const std::vector>& batches, - std::shared_ptr
* table) { + std::shared_ptr
* table) { if (batches.size() == 0) { return Status::Invalid("Must pass at least one record batch"); } @@ -307,9 +327,11 @@ Status Table::FromRecordBatches(const std::vector>& return Status::OK(); } -Status ConcatenateTables( - const std::vector>& tables, std::shared_ptr
* table) { - if (tables.size() == 0) { return Status::Invalid("Must pass at least one table"); } +Status ConcatenateTables(const std::vector>& tables, + std::shared_ptr
* table) { + if (tables.size() == 0) { + return Status::Invalid("Must pass at least one table"); + } std::shared_ptr schema = tables[0]->schema(); @@ -343,12 +365,20 @@ Status ConcatenateTables( } bool Table::Equals(const Table& other) const { - if (this == &other) { return true; } - if (!schema_->Equals(*other.schema())) { return false; } - if (static_cast(columns_.size()) != other.num_columns()) { return false; } + if (this == &other) { + return true; + } + if (!schema_->Equals(*other.schema())) { + return false; + } + if (static_cast(columns_.size()) != other.num_columns()) { + return false; + } for (int i = 0; i < static_cast(columns_.size()); i++) { - if (!columns_[i]->Equals(other.column(i))) { return false; } + if (!columns_[i]->Equals(other.column(i))) { + return false; + } } return true; } @@ -361,9 +391,11 @@ Status Table::RemoveColumn(int i, std::shared_ptr
* out) const { return Status::OK(); } -Status Table::AddColumn( - int i, const std::shared_ptr& col, std::shared_ptr
* out) const { - if (i < 0 || i > num_columns() + 1) { return Status::Invalid("Invalid column index."); } +Status Table::AddColumn(int i, const std::shared_ptr& col, + std::shared_ptr
* out) const { + if (i < 0 || i > num_columns() + 1) { + return Status::Invalid("Invalid column index."); + } if (col == nullptr) { std::stringstream ss; ss << "Column " << i << " was null"; @@ -407,7 +439,8 @@ Status Table::ValidateColumns() const { } Status ARROW_EXPORT MakeTable(const std::shared_ptr& schema, - const std::vector>& arrays, std::shared_ptr
* table) { + const std::vector>& arrays, + std::shared_ptr
* table) { // Make sure the length of the schema corresponds to the length of the vector if (schema->num_fields() != static_cast(arrays.size())) { std::stringstream ss; diff --git a/cpp/src/arrow/table.h b/cpp/src/arrow/table.h index 7ada0e9709f05..6afd618da043b 100644 --- a/cpp/src/arrow/table.h +++ b/cpp/src/arrow/table.h @@ -121,11 +121,11 @@ class ARROW_EXPORT RecordBatch { /// num_rows RecordBatch(const std::shared_ptr& schema, int64_t num_rows, - const std::vector>& columns); + const std::vector>& columns); /// \brief Deprecated move constructor for a vector of Array instances RecordBatch(const std::shared_ptr& schema, int64_t num_rows, - std::vector>&& columns); + std::vector>&& columns); /// \brief Construct record batch from vector of internal data structures /// \since 0.5.0 @@ -138,12 +138,12 @@ class ARROW_EXPORT RecordBatch { /// should be equal to the length of each field /// \param columns the data for the batch's columns RecordBatch(const std::shared_ptr& schema, int64_t num_rows, - std::vector>&& columns); + std::vector>&& columns); /// \brief Construct record batch by copying vector of array data /// \since 0.5.0 RecordBatch(const std::shared_ptr& schema, int64_t num_rows, - const std::vector>& columns); + const std::vector>& columns); bool Equals(const RecordBatch& other) const; @@ -194,14 +194,14 @@ class ARROW_EXPORT Table { public: // If columns is zero-length, the table's number of rows is zero Table(const std::shared_ptr& schema, - const std::vector>& columns); + const std::vector>& columns); // num_rows is a parameter to allow for tables of a particular size not // having any materialized columns. Each column should therefore have the // same length as num_rows -- you can validate this using // Table::ValidateColumns Table(const std::shared_ptr& schema, - const std::vector>& columns, int64_t num_rows); + const std::vector>& columns, int64_t num_rows); // Construct table from RecordBatch, but only if all of the batch schemas are // equal. 
Returns Status::Invalid if there is some problem @@ -221,8 +221,8 @@ class ARROW_EXPORT Table { Status RemoveColumn(int i, std::shared_ptr
* out) const; /// Add column to the table, producing a new Table - Status AddColumn( - int i, const std::shared_ptr& column, std::shared_ptr
* out) const; + Status AddColumn(int i, const std::shared_ptr& column, + std::shared_ptr
* out) const; /// \brief Replace schema key-value metadata with new metadata (EXPERIMENTAL) /// \since 0.5.0 @@ -252,11 +252,12 @@ class ARROW_EXPORT Table { // Construct table from multiple input tables. Return Status::Invalid if // schemas are not equal -Status ARROW_EXPORT ConcatenateTables( - const std::vector>& tables, std::shared_ptr
* table); +Status ARROW_EXPORT ConcatenateTables(const std::vector>& tables, + std::shared_ptr
* table); Status ARROW_EXPORT MakeTable(const std::shared_ptr& schema, - const std::vector>& arrays, std::shared_ptr
* table); + const std::vector>& arrays, + std::shared_ptr
* table); } // namespace arrow diff --git a/cpp/src/arrow/tensor.cc b/cpp/src/arrow/tensor.cc index bcd9d8d94c6b4..31b1a359219a6 100644 --- a/cpp/src/arrow/tensor.cc +++ b/cpp/src/arrow/tensor.cc @@ -35,7 +35,8 @@ namespace arrow { static void ComputeRowMajorStrides(const FixedWidthType& type, - const std::vector& shape, std::vector* strides) { + const std::vector& shape, + std::vector* strides) { int64_t remaining = type.bit_width() / 8; for (int64_t dimsize : shape) { remaining *= dimsize; @@ -53,7 +54,8 @@ static void ComputeRowMajorStrides(const FixedWidthType& type, } static void ComputeColumnMajorStrides(const FixedWidthType& type, - const std::vector& shape, std::vector* strides) { + const std::vector& shape, + std::vector* strides) { int64_t total = type.bit_width() / 8; for (int64_t dimsize : shape) { if (dimsize == 0) { @@ -69,8 +71,8 @@ static void ComputeColumnMajorStrides(const FixedWidthType& type, /// Constructor with strides and dimension names Tensor::Tensor(const std::shared_ptr& type, const std::shared_ptr& data, - const std::vector& shape, const std::vector& strides, - const std::vector& dim_names) + const std::vector& shape, const std::vector& strides, + const std::vector& dim_names) : type_(type), data_(data), shape_(shape), strides_(strides), dim_names_(dim_names) { DCHECK(is_tensor_supported(type->id())); if (shape.size() > 0 && strides.size() == 0) { @@ -79,11 +81,11 @@ Tensor::Tensor(const std::shared_ptr& type, const std::shared_ptr& type, const std::shared_ptr& data, - const std::vector& shape, const std::vector& strides) + const std::vector& shape, const std::vector& strides) : Tensor(type, data, shape, strides, {}) {} Tensor::Tensor(const std::shared_ptr& type, const std::shared_ptr& data, - const std::vector& shape) + const std::vector& shape) : Tensor(type, data, shape, {}, {}) {} const std::string& Tensor::dim_name(int i) const { @@ -100,9 +102,7 @@ int64_t Tensor::size() const { return std::accumulate(shape_.begin(), shape_.end(), 
1LL, std::multiplies()); } -bool Tensor::is_contiguous() const { - return is_row_major() || is_column_major(); -} +bool Tensor::is_contiguous() const { return is_row_major() || is_column_major(); } bool Tensor::is_row_major() const { std::vector c_strides; @@ -118,14 +118,14 @@ bool Tensor::is_column_major() const { return strides_ == f_strides; } -Type::type Tensor::type_id() const { - return type_->id(); -} +Type::type Tensor::type_id() const { return type_->id(); } bool Tensor::Equals(const Tensor& other) const { bool are_equal = false; Status error = TensorEquals(*this, other, &are_equal); - if (!error.ok()) { DCHECK(false) << "Tensors not comparable: " << error.ToString(); } + if (!error.ok()) { + DCHECK(false) << "Tensors not comparable: " << error.ToString(); + } return are_equal; } diff --git a/cpp/src/arrow/tensor.h b/cpp/src/arrow/tensor.h index 371f5911a4396..b074b8c309ba1 100644 --- a/cpp/src/arrow/tensor.h +++ b/cpp/src/arrow/tensor.h @@ -62,16 +62,16 @@ class ARROW_EXPORT Tensor { /// Constructor with no dimension names or strides, data assumed to be row-major Tensor(const std::shared_ptr& type, const std::shared_ptr& data, - const std::vector& shape); + const std::vector& shape); /// Constructor with non-negative strides Tensor(const std::shared_ptr& type, const std::shared_ptr& data, - const std::vector& shape, const std::vector& strides); + const std::vector& shape, const std::vector& strides); /// Constructor with strides and dimension names Tensor(const std::shared_ptr& type, const std::shared_ptr& data, - const std::vector& shape, const std::vector& strides, - const std::vector& dim_names); + const std::vector& shape, const std::vector& strides, + const std::vector& dim_names); std::shared_ptr type() const { return type_; } std::shared_ptr data() const { return data_; } diff --git a/cpp/src/arrow/test-util.h b/cpp/src/arrow/test-util.h index 2bc662526713e..1a3376cee6053 100644 --- a/cpp/src/arrow/test-util.h +++ b/cpp/src/arrow/test-util.h @@ 
-39,16 +39,20 @@ #include "arrow/util/logging.h" #include "arrow/util/random.h" -#define ASSERT_RAISES(ENUM, expr) \ - do { \ - ::arrow::Status s = (expr); \ - if (!s.Is##ENUM()) { FAIL() << s.ToString(); } \ +#define ASSERT_RAISES(ENUM, expr) \ + do { \ + ::arrow::Status s = (expr); \ + if (!s.Is##ENUM()) { \ + FAIL() << s.ToString(); \ + } \ } while (0) -#define ASSERT_OK(expr) \ - do { \ - ::arrow::Status s = (expr); \ - if (!s.ok()) { FAIL() << s.ToString(); } \ +#define ASSERT_OK(expr) \ + do { \ + ::arrow::Status s = (expr); \ + if (!s.ok()) { \ + FAIL() << s.ToString(); \ + } \ } while (0) #define ASSERT_OK_NO_THROW(expr) ASSERT_NO_THROW(ASSERT_OK(expr)) @@ -59,10 +63,12 @@ EXPECT_TRUE(s.ok()); \ } while (0) -#define ABORT_NOT_OK(s) \ - do { \ - ::arrow::Status _s = (s); \ - if (ARROW_PREDICT_FALSE(!_s.ok())) { exit(-1); } \ +#define ABORT_NOT_OK(s) \ + do { \ + ::arrow::Status _s = (s); \ + if (ARROW_PREDICT_FALSE(!_s.ok())) { \ + exit(-1); \ + } \ } while (0); namespace arrow { @@ -85,8 +91,8 @@ void randint(int64_t N, T lower, T upper, std::vector* out) { } template -void random_real( - int64_t n, uint32_t seed, T min_value, T max_value, std::vector* out) { +void random_real(int64_t n, uint32_t seed, T min_value, T max_value, + std::vector* out) { std::mt19937 gen(seed); std::uniform_real_distribution d(min_value, max_value); for (int64_t i = 0; i < n; ++i) { @@ -96,13 +102,13 @@ void random_real( template std::shared_ptr GetBufferFromVector(const std::vector& values) { - return std::make_shared( - reinterpret_cast(values.data()), values.size() * sizeof(T)); + return std::make_shared(reinterpret_cast(values.data()), + values.size() * sizeof(T)); } template -inline Status CopyBufferFromVector( - const std::vector& values, MemoryPool* pool, std::shared_ptr* result) { +inline Status CopyBufferFromVector(const std::vector& values, MemoryPool* pool, + std::shared_ptr* result) { int64_t nbytes = static_cast(values.size()) * sizeof(T); auto buffer = 
std::make_shared(pool); @@ -114,8 +120,8 @@ inline Status CopyBufferFromVector( } template -static inline Status GetBitmapFromVector( - const std::vector& is_valid, std::shared_ptr* result) { +static inline Status GetBitmapFromVector(const std::vector& is_valid, + std::shared_ptr* result) { size_t length = is_valid.size(); std::shared_ptr buffer; @@ -123,7 +129,9 @@ static inline Status GetBitmapFromVector( uint8_t* bitmap = buffer->mutable_data(); for (size_t i = 0; i < static_cast(length); ++i) { - if (is_valid[i]) { BitUtil::SetBit(bitmap, i); } + if (is_valid[i]) { + BitUtil::SetBit(bitmap, i); + } } *result = buffer; @@ -139,8 +147,8 @@ static inline void random_null_bytes(int64_t n, double pct_null, uint8_t* null_b } } -static inline void random_is_valid( - int64_t n, double pct_null, std::vector* is_valid) { +static inline void random_is_valid(int64_t n, double pct_null, + std::vector* is_valid) { Random rng(random_seed()); for (int64_t i = 0; i < n; ++i) { is_valid->push_back(rng.NextDoubleFraction() > pct_null); @@ -178,24 +186,28 @@ void rand_uniform_int(int64_t n, uint32_t seed, T min_value, T max_value, T* out static inline int64_t null_count(const std::vector& valid_bytes) { int64_t result = 0; for (size_t i = 0; i < valid_bytes.size(); ++i) { - if (valid_bytes[i] == 0) { ++result; } + if (valid_bytes[i] == 0) { + ++result; + } } return result; } Status MakeRandomInt32PoolBuffer(int64_t length, MemoryPool* pool, - std::shared_ptr* pool_buffer, uint32_t seed = 0) { + std::shared_ptr* pool_buffer, + uint32_t seed = 0) { DCHECK(pool); auto data = std::make_shared(pool); RETURN_NOT_OK(data->Resize(length * sizeof(int32_t))); test::rand_uniform_int(length, seed, 0, std::numeric_limits::max(), - reinterpret_cast(data->mutable_data())); + reinterpret_cast(data->mutable_data())); *pool_buffer = data; return Status::OK(); } Status MakeRandomBytePoolBuffer(int64_t length, MemoryPool* pool, - std::shared_ptr* pool_buffer, uint32_t seed = 0) { + std::shared_ptr* 
pool_buffer, + uint32_t seed = 0) { auto bytes = std::make_shared(pool); RETURN_NOT_OK(bytes->Resize(length)); test::random_bytes(length, seed, bytes->mutable_data()); @@ -207,8 +219,8 @@ Status MakeRandomBytePoolBuffer(int64_t length, MemoryPool* pool, template void ArrayFromVector(const std::shared_ptr& type, - const std::vector& is_valid, const std::vector& values, - std::shared_ptr* out) { + const std::vector& is_valid, const std::vector& values, + std::shared_ptr* out) { MemoryPool* pool = default_memory_pool(); typename TypeTraits::BuilderType builder(pool, type); for (size_t i = 0; i < values.size(); ++i) { @@ -223,7 +235,7 @@ void ArrayFromVector(const std::shared_ptr& type, template void ArrayFromVector(const std::vector& is_valid, const std::vector& values, - std::shared_ptr* out) { + std::shared_ptr* out) { MemoryPool* pool = default_memory_pool(); typename TypeTraits::BuilderType builder(pool); for (size_t i = 0; i < values.size(); ++i) { @@ -248,7 +260,7 @@ void ArrayFromVector(const std::vector& values, std::shared_ptr* template Status MakeArray(const std::vector& valid_bytes, const std::vector& values, - int64_t size, Builder* builder, std::shared_ptr* out) { + int64_t size, Builder* builder, std::shared_ptr* out) { // Append the first 1000 for (int64_t i = 0; i < size; ++i) { if (valid_bytes[i] > 0) { diff --git a/cpp/src/arrow/type-test.cc b/cpp/src/arrow/type-test.cc index 7f3adef633767..6b86b4d2f1024 100644 --- a/cpp/src/arrow/type-test.cc +++ b/cpp/src/arrow/type-test.cc @@ -345,16 +345,16 @@ TEST(TestTimestampType, ToString) { } TEST(TestNestedType, Equals) { - auto create_struct = []( - std::string inner_name, std::string struct_name) -> shared_ptr { + auto create_struct = [](std::string inner_name, + std::string struct_name) -> shared_ptr { auto f_type = field(inner_name, int32()); vector> fields = {f_type}; auto s_type = std::make_shared(fields); return field(struct_name, s_type); }; - auto create_union = []( - std::string inner_name, 
std::string union_name) -> shared_ptr { + auto create_union = [](std::string inner_name, + std::string union_name) -> shared_ptr { auto f_type = field(inner_name, int32()); vector> fields = {f_type}; vector codes = {Type::INT32}; diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 623c1934f875e..586da2d86d909 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -37,7 +37,7 @@ std::shared_ptr Field::AddMetadata( } Status Field::AddMetadata(const std::shared_ptr& metadata, - std::shared_ptr* out) const { + std::shared_ptr* out) const { *out = AddMetadata(metadata); return Status::OK(); } @@ -47,7 +47,9 @@ std::shared_ptr Field::RemoveMetadata() const { } bool Field::Equals(const Field& other) const { - if (this == &other) { return true; } + if (this == &other) { + return true; + } if (this->name_ == other.name_ && this->nullable_ == other.nullable_ && this->type_->Equals(*other.type_.get())) { if (metadata_ == nullptr && other.metadata_ == nullptr) { @@ -68,7 +70,9 @@ bool Field::Equals(const std::shared_ptr& other) const { std::string Field::ToString() const { std::stringstream ss; ss << this->name_ << ": " << this->type_->ToString(); - if (!this->nullable_) { ss << " not null"; } + if (!this->nullable_) { + ss << " not null"; + } return ss.str(); } @@ -77,34 +81,28 @@ DataType::~DataType() {} bool DataType::Equals(const DataType& other) const { bool are_equal = false; Status error = TypeEquals(*this, other, &are_equal); - if (!error.ok()) { DCHECK(false) << "Types not comparable: " << error.ToString(); } + if (!error.ok()) { + DCHECK(false) << "Types not comparable: " << error.ToString(); + } return are_equal; } bool DataType::Equals(const std::shared_ptr& other) const { - if (!other) { return false; } + if (!other) { + return false; + } return Equals(*other.get()); } -std::string BooleanType::ToString() const { - return name(); -} +std::string BooleanType::ToString() const { return name(); } -FloatingPoint::Precision 
HalfFloatType::precision() const { - return FloatingPoint::HALF; -} +FloatingPoint::Precision HalfFloatType::precision() const { return FloatingPoint::HALF; } -FloatingPoint::Precision FloatType::precision() const { - return FloatingPoint::SINGLE; -} +FloatingPoint::Precision FloatType::precision() const { return FloatingPoint::SINGLE; } -FloatingPoint::Precision DoubleType::precision() const { - return FloatingPoint::DOUBLE; -} +FloatingPoint::Precision DoubleType::precision() const { return FloatingPoint::DOUBLE; } -std::string StringType::ToString() const { - return std::string("string"); -} +std::string StringType::ToString() const { return std::string("string"); } std::string ListType::ToString() const { std::stringstream s; @@ -112,13 +110,9 @@ std::string ListType::ToString() const { return s.str(); } -std::string BinaryType::ToString() const { - return std::string("binary"); -} +std::string BinaryType::ToString() const { return std::string("binary"); } -int FixedSizeBinaryType::bit_width() const { - return CHAR_BIT * byte_width(); -} +int FixedSizeBinaryType::bit_width() const { return CHAR_BIT * byte_width(); } std::string FixedSizeBinaryType::ToString() const { std::stringstream ss; @@ -130,7 +124,9 @@ std::string StructType::ToString() const { std::stringstream s; s << "struct<"; for (int i = 0; i < this->num_children(); ++i) { - if (i > 0) { s << ", "; } + if (i > 0) { + s << ", "; + } std::shared_ptr field = this->child(i); s << field->name() << ": " << field->type()->ToString(); } @@ -148,13 +144,9 @@ Date32Type::Date32Type() : DateType(Type::DATE32, DateUnit::DAY) {} Date64Type::Date64Type() : DateType(Type::DATE64, DateUnit::MILLI) {} -std::string Date64Type::ToString() const { - return std::string("date64[ms]"); -} +std::string Date64Type::ToString() const { return std::string("date64[ms]"); } -std::string Date32Type::ToString() const { - return std::string("date32[day]"); -} +std::string Date32Type::ToString() const { return 
std::string("date32[day]"); } // ---------------------------------------------------------------------- // Time types @@ -190,7 +182,9 @@ std::string Time64Type::ToString() const { std::string TimestampType::ToString() const { std::stringstream ss; ss << "timestamp[" << this->unit_; - if (this->timezone_.size() > 0) { ss << ", tz=" << this->timezone_; } + if (this->timezone_.size() > 0) { + ss << ", tz=" << this->timezone_; + } ss << "]"; return ss.str(); } @@ -199,7 +193,7 @@ std::string TimestampType::ToString() const { // Union type UnionType::UnionType(const std::vector>& fields, - const std::vector& type_codes, UnionMode mode) + const std::vector& type_codes, UnionMode mode) : NestedType(Type::UNION), mode_(mode), type_codes_(type_codes) { children_ = fields; } @@ -214,7 +208,9 @@ std::string UnionType::ToString() const { } for (size_t i = 0; i < children_.size(); ++i) { - if (i) { s << ", "; } + if (i) { + s << ", "; + } s << children_[i]->ToString() << "=" << static_cast(type_codes_[i]); } s << ">"; @@ -225,7 +221,7 @@ std::string UnionType::ToString() const { // DictionaryType DictionaryType::DictionaryType(const std::shared_ptr& index_type, - const std::shared_ptr& dictionary, bool ordered) + const std::shared_ptr& dictionary, bool ordered) : FixedWidthType(Type::DICTIONARY), index_type_(index_type), dictionary_(dictionary), @@ -235,9 +231,7 @@ int DictionaryType::bit_width() const { return static_cast(index_type_.get())->bit_width(); } -std::shared_ptr DictionaryType::dictionary() const { - return dictionary_; -} +std::shared_ptr DictionaryType::dictionary() const { return dictionary_; } std::string DictionaryType::ToString() const { std::stringstream ss; @@ -249,23 +243,27 @@ std::string DictionaryType::ToString() const { // ---------------------------------------------------------------------- // Null type -std::string NullType::ToString() const { - return name(); -} +std::string NullType::ToString() const { return name(); } // 
---------------------------------------------------------------------- // Schema implementation Schema::Schema(const std::vector>& fields, - const std::shared_ptr& metadata) + const std::shared_ptr& metadata) : fields_(fields), metadata_(metadata) {} bool Schema::Equals(const Schema& other) const { - if (this == &other) { return true; } + if (this == &other) { + return true; + } - if (num_fields() != other.num_fields()) { return false; } + if (num_fields() != other.num_fields()) { + return false; + } for (int i = 0; i < num_fields(); ++i) { - if (!field(i)->Equals(*other.field(i).get())) { return false; } + if (!field(i)->Equals(*other.field(i).get())) { + return false; + } } return true; } @@ -290,8 +288,8 @@ int64_t Schema::GetFieldIndex(const std::string& name) const { } } -Status Schema::AddField( - int i, const std::shared_ptr& field, std::shared_ptr* out) const { +Status Schema::AddField(int i, const std::shared_ptr& field, + std::shared_ptr* out) const { DCHECK_GE(i, 0); DCHECK_LE(i, this->num_fields()); @@ -305,7 +303,7 @@ std::shared_ptr Schema::AddMetadata( } Status Schema::AddMetadata(const std::shared_ptr& metadata, - std::shared_ptr* out) const { + std::shared_ptr* out) const { *out = AddMetadata(metadata); return Status::OK(); } @@ -327,7 +325,9 @@ std::string Schema::ToString() const { int i = 0; for (auto field : fields_) { - if (i > 0) { buffer << std::endl; } + if (i > 0) { + buffer << std::endl; + } buffer << field->ToString(); ++i; } @@ -422,18 +422,18 @@ std::shared_ptr struct_(const std::vector>& fie } std::shared_ptr union_(const std::vector>& child_fields, - const std::vector& type_codes, UnionMode mode) { + const std::vector& type_codes, UnionMode mode) { return std::make_shared(child_fields, type_codes, mode); } std::shared_ptr dictionary(const std::shared_ptr& index_type, - const std::shared_ptr& dict_values) { + const std::shared_ptr& dict_values) { return std::make_shared(index_type, dict_values); } std::shared_ptr field(const 
std::string& name, - const std::shared_ptr& type, bool nullable, - const std::shared_ptr& metadata) { + const std::shared_ptr& type, bool nullable, + const std::shared_ptr& metadata) { return std::make_shared(name, type, nullable, metadata); } @@ -454,9 +454,7 @@ std::vector FixedWidthType::GetBufferLayout() const { return {kValidityBuffer, BufferDescr(BufferType::DATA, bit_width())}; } -std::vector NullType::GetBufferLayout() const { - return {}; -} +std::vector NullType::GetBufferLayout() const { return {}; } std::vector BinaryType::GetBufferLayout() const { return {kValidityBuffer, kOffsetBuffer, kValues8}; @@ -474,9 +472,7 @@ std::vector ListType::GetBufferLayout() const { return {kValidityBuffer, kOffsetBuffer}; } -std::vector StructType::GetBufferLayout() const { - return {kValidityBuffer}; -} +std::vector StructType::GetBufferLayout() const { return {kValidityBuffer}; } std::vector UnionType::GetBufferLayout() const { if (mode_ == UnionMode::SPARSE) { diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index fffb840e3cef7..e0df722e5668a 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -204,15 +204,15 @@ class NoExtraMeta {}; class ARROW_EXPORT Field { public: Field(const std::string& name, const std::shared_ptr& type, - bool nullable = true, - const std::shared_ptr& metadata = nullptr) + bool nullable = true, + const std::shared_ptr& metadata = nullptr) : name_(name), type_(type), nullable_(nullable), metadata_(metadata) {} std::shared_ptr metadata() const { return metadata_; } /// \deprecated Status AddMetadata(const std::shared_ptr& metadata, - std::shared_ptr* out) const; + std::shared_ptr* out) const; std::shared_ptr AddMetadata( const std::shared_ptr& metadata) const; @@ -489,7 +489,7 @@ class ARROW_EXPORT UnionType : public NestedType { static constexpr Type::type type_id = Type::UNION; UnionType(const std::vector>& fields, - const std::vector& type_codes, UnionMode mode = UnionMode::SPARSE); + const std::vector& type_codes, 
UnionMode mode = UnionMode::SPARSE); std::string ToString() const override; static std::string name() { return "union"; } @@ -669,7 +669,7 @@ class ARROW_EXPORT DictionaryType : public FixedWidthType { static constexpr Type::type type_id = Type::DICTIONARY; DictionaryType(const std::shared_ptr& index_type, - const std::shared_ptr& dictionary, bool ordered = false); + const std::shared_ptr& dictionary, bool ordered = false); int bit_width() const override; @@ -699,7 +699,7 @@ class ARROW_EXPORT DictionaryType : public FixedWidthType { class ARROW_EXPORT Schema { public: explicit Schema(const std::vector>& fields, - const std::shared_ptr& metadata = nullptr); + const std::shared_ptr& metadata = nullptr); virtual ~Schema() = default; /// Returns true if all of the schema fields are equal @@ -724,13 +724,13 @@ class ARROW_EXPORT Schema { /// \brief Render a string representation of the schema suitable for debugging std::string ToString() const; - Status AddField( - int i, const std::shared_ptr& field, std::shared_ptr* out) const; + Status AddField(int i, const std::shared_ptr& field, + std::shared_ptr* out) const; Status RemoveField(int i, std::shared_ptr* out) const; /// \deprecated Status AddMetadata(const std::shared_ptr& metadata, - std::shared_ptr* out) const; + std::shared_ptr* out) const; /// \brief Replace key-value metadata with new metadata /// @@ -761,8 +761,8 @@ std::shared_ptr ARROW_EXPORT list(const std::shared_ptr& value_ std::shared_ptr ARROW_EXPORT list(const std::shared_ptr& value_type); std::shared_ptr ARROW_EXPORT timestamp(TimeUnit::type unit); -std::shared_ptr ARROW_EXPORT timestamp( - TimeUnit::type unit, const std::string& timezone); +std::shared_ptr ARROW_EXPORT timestamp(TimeUnit::type unit, + const std::string& timezone); /// Unit can be either SECOND or MILLI std::shared_ptr ARROW_EXPORT time32(TimeUnit::type unit); @@ -770,18 +770,18 @@ std::shared_ptr ARROW_EXPORT time32(TimeUnit::type unit); /// Unit can be either MICRO or NANO 
std::shared_ptr ARROW_EXPORT time64(TimeUnit::type unit); -std::shared_ptr ARROW_EXPORT struct_( - const std::vector>& fields); +std::shared_ptr ARROW_EXPORT +struct_(const std::vector>& fields); -std::shared_ptr ARROW_EXPORT union_( - const std::vector>& child_fields, - const std::vector& type_codes, UnionMode mode = UnionMode::SPARSE); +std::shared_ptr ARROW_EXPORT +union_(const std::vector>& child_fields, + const std::vector& type_codes, UnionMode mode = UnionMode::SPARSE); std::shared_ptr ARROW_EXPORT dictionary( const std::shared_ptr& index_type, const std::shared_ptr& values); -std::shared_ptr ARROW_EXPORT field(const std::string& name, - const std::shared_ptr& type, bool nullable = true, +std::shared_ptr ARROW_EXPORT field( + const std::string& name, const std::shared_ptr& type, bool nullable = true, const std::shared_ptr& metadata = nullptr); // ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index 3e8ea23432b98..8be67b2a3829c 100644 --- a/cpp/src/arrow/type_traits.h +++ b/cpp/src/arrow/type_traits.h @@ -319,9 +319,10 @@ GET_ATTR(TypeClass, void); #undef GET_ATTR -#define PRIMITIVE_TRAITS(T) \ - using TypeClass = typename std::conditional::value, T, \ - typename GetAttr_TypeClass::type>::type; \ +#define PRIMITIVE_TRAITS(T) \ + using TypeClass = \ + typename std::conditional::value, T, \ + typename GetAttr_TypeClass::type>::type; \ using c_type = typename GetAttr_c_type::type; template diff --git a/cpp/src/arrow/util/bit-stream-utils.h b/cpp/src/arrow/util/bit-stream-utils.h index 537fdc3045ca5..318f5ba8b0e17 100644 --- a/cpp/src/arrow/util/bit-stream-utils.h +++ b/cpp/src/arrow/util/bit-stream-utils.h @@ -20,9 +20,9 @@ #ifndef ARROW_UTIL_BIT_STREAM_UTILS_H #define ARROW_UTIL_BIT_STREAM_UTILS_H +#include #include #include -#include #include "arrow/util/bit-util.h" #include "arrow/util/bpacking.h" @@ -229,13 +229,13 @@ inline bool BitWriter::PutVlqInt(uint32_t v) { 
template inline void GetValue_(int num_bits, T* v, int max_bytes, const uint8_t* buffer, - int* bit_offset, int* byte_offset, uint64_t* buffered_values) { + int* bit_offset, int* byte_offset, uint64_t* buffered_values) { #ifdef _MSC_VER #pragma warning(push) #pragma warning(disable : 4800) #endif - *v = static_cast( - BitUtil::TrailingBits(*buffered_values, *bit_offset + num_bits) >> *bit_offset); + *v = static_cast(BitUtil::TrailingBits(*buffered_values, *bit_offset + num_bits) >> + *bit_offset); #ifdef _MSC_VER #pragma warning(pop) #endif @@ -292,13 +292,14 @@ inline int BitReader::GetBatch(int num_bits, T* v, int batch_size) { if (UNLIKELY(bit_offset != 0)) { for (; i < batch_size && bit_offset != 0; ++i) { GetValue_(num_bits, &v[i], max_bytes, buffer, &bit_offset, &byte_offset, - &buffered_values); + &buffered_values); } } if (sizeof(T) == 4) { - int num_unpacked = unpack32(reinterpret_cast(buffer + byte_offset), - reinterpret_cast(v + i), batch_size - i, num_bits); + int num_unpacked = + unpack32(reinterpret_cast(buffer + byte_offset), + reinterpret_cast(v + i), batch_size - i, num_bits); i += num_unpacked; byte_offset += num_unpacked * num_bits / 8; } else { @@ -307,8 +308,10 @@ inline int BitReader::GetBatch(int num_bits, T* v, int batch_size) { while (i < batch_size) { int unpack_size = std::min(buffer_size, batch_size - i); int num_unpacked = unpack32(reinterpret_cast(buffer + byte_offset), - unpack_buffer, unpack_size, num_bits); - if (num_unpacked == 0) { break; } + unpack_buffer, unpack_size, num_bits); + if (num_unpacked == 0) { + break; + } for (int k = 0; k < num_unpacked; ++k) { #ifdef _MSC_VER #pragma warning(push) @@ -332,8 +335,8 @@ inline int BitReader::GetBatch(int num_bits, T* v, int batch_size) { } for (; i < batch_size; ++i) { - GetValue_( - num_bits, &v[i], max_bytes, buffer, &bit_offset, &byte_offset, &buffered_values); + GetValue_(num_bits, &v[i], max_bytes, buffer, &bit_offset, &byte_offset, + &buffered_values); } bit_offset_ = 
bit_offset; diff --git a/cpp/src/arrow/util/bit-util-test.cc b/cpp/src/arrow/util/bit-util-test.cc index cd945585ba210..231bf54a2a3b6 100644 --- a/cpp/src/arrow/util/bit-util-test.cc +++ b/cpp/src/arrow/util/bit-util-test.cc @@ -35,7 +35,9 @@ namespace arrow { static void EnsureCpuInfoInitialized() { - if (!CpuInfo::initialized()) { CpuInfo::Init(); } + if (!CpuInfo::initialized()) { + CpuInfo::Init(); + } } TEST(BitUtilTests, TestIsMultipleOf64) { @@ -68,11 +70,13 @@ TEST(BitUtilTests, TestNextPower2) { ASSERT_EQ(1LL << 62, NextPower2((1LL << 62) - 1)); } -static inline int64_t SlowCountBits( - const uint8_t* data, int64_t bit_offset, int64_t length) { +static inline int64_t SlowCountBits(const uint8_t* data, int64_t bit_offset, + int64_t length) { int64_t count = 0; for (int64_t i = bit_offset; i < bit_offset + length; ++i) { - if (BitUtil::GetBit(data, i)) { ++count; } + if (BitUtil::GetBit(data, i)) { + ++count; + } } return count; } @@ -175,9 +179,9 @@ TEST(BitUtil, TrailingBits) { EXPECT_EQ(BitUtil::TrailingBits(BOOST_BINARY(1 1 1 1 1 1 1 1), 0), 0); EXPECT_EQ(BitUtil::TrailingBits(BOOST_BINARY(1 1 1 1 1 1 1 1), 1), 1); EXPECT_EQ(BitUtil::TrailingBits(BOOST_BINARY(1 1 1 1 1 1 1 1), 64), - BOOST_BINARY(1 1 1 1 1 1 1 1)); + BOOST_BINARY(1 1 1 1 1 1 1 1)); EXPECT_EQ(BitUtil::TrailingBits(BOOST_BINARY(1 1 1 1 1 1 1 1), 100), - BOOST_BINARY(1 1 1 1 1 1 1 1)); + BOOST_BINARY(1 1 1 1 1 1 1 1)); EXPECT_EQ(BitUtil::TrailingBits(0, 1), 0); EXPECT_EQ(BitUtil::TrailingBits(0, 64), 0); EXPECT_EQ(BitUtil::TrailingBits(1LL << 63, 0), 0); @@ -193,12 +197,12 @@ TEST(BitUtil, ByteSwap) { EXPECT_EQ(BitUtil::ByteSwap(static_cast(0x11223344)), 0x44332211); EXPECT_EQ(BitUtil::ByteSwap(static_cast(0)), 0); - EXPECT_EQ( - BitUtil::ByteSwap(static_cast(0x1122334455667788)), 0x8877665544332211); + EXPECT_EQ(BitUtil::ByteSwap(static_cast(0x1122334455667788)), + 0x8877665544332211); EXPECT_EQ(BitUtil::ByteSwap(static_cast(0)), 0); - EXPECT_EQ( - 
BitUtil::ByteSwap(static_cast(0x1122334455667788)), 0x8877665544332211); + EXPECT_EQ(BitUtil::ByteSwap(static_cast(0x1122334455667788)), + 0x8877665544332211); EXPECT_EQ(BitUtil::ByteSwap(static_cast(0)), 0); EXPECT_EQ(BitUtil::ByteSwap(static_cast(0x1122)), 0x2211); diff --git a/cpp/src/arrow/util/bit-util.cc b/cpp/src/arrow/util/bit-util.cc index 5bbec6f23111e..f255f95f30a76 100644 --- a/cpp/src/arrow/util/bit-util.cc +++ b/cpp/src/arrow/util/bit-util.cc @@ -36,12 +36,14 @@ namespace arrow { void BitUtil::FillBitsFromBytes(const std::vector& bytes, uint8_t* bits) { for (size_t i = 0; i < bytes.size(); ++i) { - if (bytes[i] > 0) { SetBit(bits, i); } + if (bytes[i] > 0) { + SetBit(bits, i); + } } } -Status BitUtil::BytesToBits( - const std::vector& bytes, std::shared_ptr* out) { +Status BitUtil::BytesToBits(const std::vector& bytes, + std::shared_ptr* out) { int64_t bit_length = BitUtil::BytesForBits(bytes.size()); std::shared_ptr buffer; @@ -65,7 +67,9 @@ int64_t CountSetBits(const uint8_t* data, int64_t bit_offset, int64_t length) { // The number of bits until fast_count_start const int64_t initial_bits = std::min(length, fast_count_start - bit_offset); for (int64_t i = bit_offset; i < bit_offset + initial_bits; ++i) { - if (BitUtil::GetBit(data, i)) { ++count; } + if (BitUtil::GetBit(data, i)) { + ++count; + } } const int64_t fast_counts = (length - initial_bits) / pop_len; @@ -85,21 +89,23 @@ int64_t CountSetBits(const uint8_t* data, int64_t bit_offset, int64_t length) { // versions of popcount but the code complexity is likely not worth it) const int64_t tail_index = bit_offset + initial_bits + fast_counts * pop_len; for (int64_t i = tail_index; i < bit_offset + length; ++i) { - if (BitUtil::GetBit(data, i)) { ++count; } + if (BitUtil::GetBit(data, i)) { + ++count; + } } return count; } -Status GetEmptyBitmap( - MemoryPool* pool, int64_t length, std::shared_ptr* result) { +Status GetEmptyBitmap(MemoryPool* pool, int64_t length, + std::shared_ptr* result) { 
RETURN_NOT_OK(AllocateBuffer(pool, BitUtil::BytesForBits(length), result)); memset((*result)->mutable_data(), 0, static_cast((*result)->size())); return Status::OK(); } Status CopyBitmap(MemoryPool* pool, const uint8_t* data, int64_t offset, int64_t length, - std::shared_ptr* out) { + std::shared_ptr* out) { std::shared_ptr buffer; RETURN_NOT_OK(GetEmptyBitmap(pool, length, &buffer)); uint8_t* dest = buffer->mutable_data(); @@ -111,12 +117,14 @@ Status CopyBitmap(MemoryPool* pool, const uint8_t* data, int64_t offset, int64_t } bool BitmapEquals(const uint8_t* left, int64_t left_offset, const uint8_t* right, - int64_t right_offset, int64_t bit_length) { + int64_t right_offset, int64_t bit_length) { if (left_offset % 8 == 0 && right_offset % 8 == 0) { // byte aligned, can use memcmp bool bytes_equal = std::memcmp(left + left_offset / 8, right + right_offset / 8, - bit_length / 8) == 0; - if (!bytes_equal) { return false; } + bit_length / 8) == 0; + if (!bytes_equal) { + return false; + } for (int64_t i = (bit_length / 8) * 8; i < bit_length; ++i) { if (BitUtil::GetBit(left, left_offset + i) != BitUtil::GetBit(right, right_offset + i)) { diff --git a/cpp/src/arrow/util/bit-util.h b/cpp/src/arrow/util/bit-util.h index d055c751d16fa..f036763b8106e 100644 --- a/cpp/src/arrow/util/bit-util.h +++ b/cpp/src/arrow/util/bit-util.h @@ -101,17 +101,11 @@ static constexpr uint8_t kBitmask[] = {1, 2, 4, 8, 16, 32, 64, 128}; // the ~i byte version of kBitmaks static constexpr uint8_t kFlippedBitmask[] = {254, 253, 251, 247, 239, 223, 191, 127}; -static inline int64_t CeilByte(int64_t size) { - return (size + 7) & ~7; -} +static inline int64_t CeilByte(int64_t size) { return (size + 7) & ~7; } -static inline int64_t BytesForBits(int64_t size) { - return CeilByte(size) / 8; -} +static inline int64_t BytesForBits(int64_t size) { return CeilByte(size) / 8; } -static inline int64_t Ceil2Bytes(int64_t size) { - return (size + 15) & ~15; -} +static inline int64_t Ceil2Bytes(int64_t size) 
{ return (size + 15) & ~15; } static inline bool GetBit(const uint8_t* bits, int64_t i) { return (bits[i / 8] & kBitmask[i % 8]) != 0; @@ -125,13 +119,13 @@ static inline void ClearBit(uint8_t* bits, int64_t i) { bits[i / 8] &= kFlippedBitmask[i % 8]; } -static inline void SetBit(uint8_t* bits, int64_t i) { - bits[i / 8] |= kBitmask[i % 8]; -} +static inline void SetBit(uint8_t* bits, int64_t i) { bits[i / 8] |= kBitmask[i % 8]; } /// Set bit if is_set is true, but cannot clear bit static inline void SetArrayBit(uint8_t* bits, int i, bool is_set) { - if (is_set) { SetBit(bits, i); } + if (is_set) { + SetBit(bits, i); + } } static inline void SetBitTo(uint8_t* bits, int64_t i, bool bit_is_set) { @@ -168,13 +162,9 @@ static inline int64_t NextPower2(int64_t n) { return n; } -static inline bool IsMultipleOf64(int64_t n) { - return (n & 63) == 0; -} +static inline bool IsMultipleOf64(int64_t n) { return (n & 63) == 0; } -static inline bool IsMultipleOf8(int64_t n) { - return (n & 7) == 0; -} +static inline bool IsMultipleOf8(int64_t n) { return (n & 7) == 0; } /// Returns the ceil of value/divisor static inline int64_t Ceil(int64_t value, int64_t divisor) { @@ -206,34 +196,22 @@ static inline int RoundDownToPowerOf2(int value, int factor) { /// Specialized round up and down functions for frequently used factors, /// like 8 (bits->bytes), 32 (bits->i32), and 64 (bits->i64). /// Returns the rounded up number of bytes that fit the number of bits. -static inline uint32_t RoundUpNumBytes(uint32_t bits) { - return (bits + 7) >> 3; -} +static inline uint32_t RoundUpNumBytes(uint32_t bits) { return (bits + 7) >> 3; } /// Returns the rounded down number of bytes that fit the number of bits. -static inline uint32_t RoundDownNumBytes(uint32_t bits) { - return bits >> 3; -} +static inline uint32_t RoundDownNumBytes(uint32_t bits) { return bits >> 3; } /// Returns the rounded up to 32 multiple. Used for conversions of bits to i32. 
-static inline uint32_t RoundUpNumi32(uint32_t bits) { - return (bits + 31) >> 5; -} +static inline uint32_t RoundUpNumi32(uint32_t bits) { return (bits + 31) >> 5; } /// Returns the rounded up 32 multiple. -static inline uint32_t RoundDownNumi32(uint32_t bits) { - return bits >> 5; -} +static inline uint32_t RoundDownNumi32(uint32_t bits) { return bits >> 5; } /// Returns the rounded up to 64 multiple. Used for conversions of bits to i64. -static inline uint32_t RoundUpNumi64(uint32_t bits) { - return (bits + 63) >> 6; -} +static inline uint32_t RoundUpNumi64(uint32_t bits) { return (bits + 63) >> 6; } /// Returns the rounded down to 64 multiple. -static inline uint32_t RoundDownNumi64(uint32_t bits) { - return bits >> 6; -} +static inline uint32_t RoundDownNumi64(uint32_t bits) { return bits >> 6; } static inline int64_t RoundUpToMultipleOf64(int64_t num) { // TODO(wesm): is this definitely needed? @@ -242,7 +220,9 @@ static inline int64_t RoundUpToMultipleOf64(int64_t num) { constexpr int64_t force_carry_addend = round_to - 1; constexpr int64_t truncate_bitmask = ~(round_to - 1); constexpr int64_t max_roundable_num = std::numeric_limits::max() - round_to; - if (num <= max_roundable_num) { return (num + force_carry_addend) & truncate_bitmask; } + if (num <= max_roundable_num) { + return (num + force_carry_addend) & truncate_bitmask; + } // handle overflow case. This should result in a malloc error upstream return num; } @@ -252,8 +232,7 @@ static inline int64_t RoundUpToMultipleOf64(int64_t num) { /// might be a much faster way to implement this. static inline int PopcountNoHw(uint64_t x) { int count = 0; - for (; x != 0; ++count) - x &= x - 1; + for (; x != 0; ++count) x &= x - 1; return count; } @@ -297,21 +276,16 @@ static inline int Log2(uint64_t x) { // (floor(log2(n)) = MSB(n) (0-indexed)) --x; int result = 1; - while (x >>= 1) - ++result; + while (x >>= 1) ++result; return result; } /// Swaps the byte order (i.e. 
endianess) -static inline int64_t ByteSwap(int64_t value) { - return ARROW_BYTE_SWAP64(value); -} +static inline int64_t ByteSwap(int64_t value) { return ARROW_BYTE_SWAP64(value); } static inline uint64_t ByteSwap(uint64_t value) { return static_cast(ARROW_BYTE_SWAP64(value)); } -static inline int32_t ByteSwap(int32_t value) { - return ARROW_BYTE_SWAP32(value); -} +static inline int32_t ByteSwap(int32_t value) { return ARROW_BYTE_SWAP32(value); } static inline uint32_t ByteSwap(uint32_t value) { return static_cast(ARROW_BYTE_SWAP32(value)); } @@ -352,84 +326,36 @@ static inline void ByteSwap(void* dst, const void* src, int len) { /// Converts to big endian format (if not already in big endian) from the /// machine's native endian format. #if __BYTE_ORDER == __LITTLE_ENDIAN -static inline int64_t ToBigEndian(int64_t value) { - return ByteSwap(value); -} -static inline uint64_t ToBigEndian(uint64_t value) { - return ByteSwap(value); -} -static inline int32_t ToBigEndian(int32_t value) { - return ByteSwap(value); -} -static inline uint32_t ToBigEndian(uint32_t value) { - return ByteSwap(value); -} -static inline int16_t ToBigEndian(int16_t value) { - return ByteSwap(value); -} -static inline uint16_t ToBigEndian(uint16_t value) { - return ByteSwap(value); -} +static inline int64_t ToBigEndian(int64_t value) { return ByteSwap(value); } +static inline uint64_t ToBigEndian(uint64_t value) { return ByteSwap(value); } +static inline int32_t ToBigEndian(int32_t value) { return ByteSwap(value); } +static inline uint32_t ToBigEndian(uint32_t value) { return ByteSwap(value); } +static inline int16_t ToBigEndian(int16_t value) { return ByteSwap(value); } +static inline uint16_t ToBigEndian(uint16_t value) { return ByteSwap(value); } #else -static inline int64_t ToBigEndian(int64_t val) { - return val; -} -static inline uint64_t ToBigEndian(uint64_t val) { - return val; -} -static inline int32_t ToBigEndian(int32_t val) { - return val; -} -static inline uint32_t 
ToBigEndian(uint32_t val) { - return val; -} -static inline int16_t ToBigEndian(int16_t val) { - return val; -} -static inline uint16_t ToBigEndian(uint16_t val) { - return val; -} +static inline int64_t ToBigEndian(int64_t val) { return val; } +static inline uint64_t ToBigEndian(uint64_t val) { return val; } +static inline int32_t ToBigEndian(int32_t val) { return val; } +static inline uint32_t ToBigEndian(uint32_t val) { return val; } +static inline int16_t ToBigEndian(int16_t val) { return val; } +static inline uint16_t ToBigEndian(uint16_t val) { return val; } #endif /// Converts from big endian format to the machine's native endian format. #if __BYTE_ORDER == __LITTLE_ENDIAN -static inline int64_t FromBigEndian(int64_t value) { - return ByteSwap(value); -} -static inline uint64_t FromBigEndian(uint64_t value) { - return ByteSwap(value); -} -static inline int32_t FromBigEndian(int32_t value) { - return ByteSwap(value); -} -static inline uint32_t FromBigEndian(uint32_t value) { - return ByteSwap(value); -} -static inline int16_t FromBigEndian(int16_t value) { - return ByteSwap(value); -} -static inline uint16_t FromBigEndian(uint16_t value) { - return ByteSwap(value); -} +static inline int64_t FromBigEndian(int64_t value) { return ByteSwap(value); } +static inline uint64_t FromBigEndian(uint64_t value) { return ByteSwap(value); } +static inline int32_t FromBigEndian(int32_t value) { return ByteSwap(value); } +static inline uint32_t FromBigEndian(uint32_t value) { return ByteSwap(value); } +static inline int16_t FromBigEndian(int16_t value) { return ByteSwap(value); } +static inline uint16_t FromBigEndian(uint16_t value) { return ByteSwap(value); } #else -static inline int64_t FromBigEndian(int64_t val) { - return val; -} -static inline uint64_t FromBigEndian(uint64_t val) { - return val; -} -static inline int32_t FromBigEndian(int32_t val) { - return val; -} -static inline uint32_t FromBigEndian(uint32_t val) { - return val; -} -static inline int16_t 
FromBigEndian(int16_t val) { - return val; -} -static inline uint16_t FromBigEndian(uint16_t val) { - return val; -} +static inline int64_t FromBigEndian(int64_t val) { return val; } +static inline uint64_t FromBigEndian(uint64_t val) { return val; } +static inline int32_t FromBigEndian(int32_t val) { return val; } +static inline uint32_t FromBigEndian(uint32_t val) { return val; } +static inline int16_t FromBigEndian(int16_t val) { return val; } +static inline uint16_t FromBigEndian(uint16_t val) { return val; } #endif // Logical right shift for signed integer types @@ -449,8 +375,8 @@ ARROW_EXPORT Status BytesToBits(const std::vector&, std::shared_ptr* result); +Status ARROW_EXPORT GetEmptyBitmap(MemoryPool* pool, int64_t length, + std::shared_ptr* result); /// Copy a bit range of an existing bitmap /// @@ -462,7 +388,7 @@ Status ARROW_EXPORT GetEmptyBitmap( /// /// \return Status message Status ARROW_EXPORT CopyBitmap(MemoryPool* pool, const uint8_t* bitmap, int64_t offset, - int64_t length, std::shared_ptr* out); + int64_t length, std::shared_ptr* out); /// Compute the number of 1's in the given data array /// @@ -471,11 +397,12 @@ Status ARROW_EXPORT CopyBitmap(MemoryPool* pool, const uint8_t* bitmap, int64_t /// \param[in] length the number of bits to inspect in the bitmap relative to the offset /// /// \return The number of set (1) bits in the range -int64_t ARROW_EXPORT CountSetBits( - const uint8_t* data, int64_t bit_offset, int64_t length); +int64_t ARROW_EXPORT CountSetBits(const uint8_t* data, int64_t bit_offset, + int64_t length); bool ARROW_EXPORT BitmapEquals(const uint8_t* left, int64_t left_offset, - const uint8_t* right, int64_t right_offset, int64_t bit_length); + const uint8_t* right, int64_t right_offset, + int64_t bit_length); } // namespace arrow #endif // ARROW_UTIL_BIT_UTIL_H diff --git a/cpp/src/arrow/util/bpacking.h b/cpp/src/arrow/util/bpacking.h index fce5f55224cd4..4d25de0ab060c 100644 --- a/cpp/src/arrow/util/bpacking.h +++ 
b/cpp/src/arrow/util/bpacking.h @@ -3199,136 +3199,103 @@ inline int unpack32(const uint32_t* in, uint32_t* out, int batch_size, int num_b switch (num_bits) { case 0: - for (int i = 0; i < num_loops; ++i) - in = nullunpacker32(in, out + i * 32); + for (int i = 0; i < num_loops; ++i) in = nullunpacker32(in, out + i * 32); break; case 1: - for (int i = 0; i < num_loops; ++i) - in = unpack1_32(in, out + i * 32); + for (int i = 0; i < num_loops; ++i) in = unpack1_32(in, out + i * 32); break; case 2: - for (int i = 0; i < num_loops; ++i) - in = unpack2_32(in, out + i * 32); + for (int i = 0; i < num_loops; ++i) in = unpack2_32(in, out + i * 32); break; case 3: - for (int i = 0; i < num_loops; ++i) - in = unpack3_32(in, out + i * 32); + for (int i = 0; i < num_loops; ++i) in = unpack3_32(in, out + i * 32); break; case 4: - for (int i = 0; i < num_loops; ++i) - in = unpack4_32(in, out + i * 32); + for (int i = 0; i < num_loops; ++i) in = unpack4_32(in, out + i * 32); break; case 5: - for (int i = 0; i < num_loops; ++i) - in = unpack5_32(in, out + i * 32); + for (int i = 0; i < num_loops; ++i) in = unpack5_32(in, out + i * 32); break; case 6: - for (int i = 0; i < num_loops; ++i) - in = unpack6_32(in, out + i * 32); + for (int i = 0; i < num_loops; ++i) in = unpack6_32(in, out + i * 32); break; case 7: - for (int i = 0; i < num_loops; ++i) - in = unpack7_32(in, out + i * 32); + for (int i = 0; i < num_loops; ++i) in = unpack7_32(in, out + i * 32); break; case 8: - for (int i = 0; i < num_loops; ++i) - in = unpack8_32(in, out + i * 32); + for (int i = 0; i < num_loops; ++i) in = unpack8_32(in, out + i * 32); break; case 9: - for (int i = 0; i < num_loops; ++i) - in = unpack9_32(in, out + i * 32); + for (int i = 0; i < num_loops; ++i) in = unpack9_32(in, out + i * 32); break; case 10: - for (int i = 0; i < num_loops; ++i) - in = unpack10_32(in, out + i * 32); + for (int i = 0; i < num_loops; ++i) in = unpack10_32(in, out + i * 32); break; case 11: - for (int i = 0; i < 
num_loops; ++i) - in = unpack11_32(in, out + i * 32); + for (int i = 0; i < num_loops; ++i) in = unpack11_32(in, out + i * 32); break; case 12: - for (int i = 0; i < num_loops; ++i) - in = unpack12_32(in, out + i * 32); + for (int i = 0; i < num_loops; ++i) in = unpack12_32(in, out + i * 32); break; case 13: - for (int i = 0; i < num_loops; ++i) - in = unpack13_32(in, out + i * 32); + for (int i = 0; i < num_loops; ++i) in = unpack13_32(in, out + i * 32); break; case 14: - for (int i = 0; i < num_loops; ++i) - in = unpack14_32(in, out + i * 32); + for (int i = 0; i < num_loops; ++i) in = unpack14_32(in, out + i * 32); break; case 15: - for (int i = 0; i < num_loops; ++i) - in = unpack15_32(in, out + i * 32); + for (int i = 0; i < num_loops; ++i) in = unpack15_32(in, out + i * 32); break; case 16: - for (int i = 0; i < num_loops; ++i) - in = unpack16_32(in, out + i * 32); + for (int i = 0; i < num_loops; ++i) in = unpack16_32(in, out + i * 32); break; case 17: - for (int i = 0; i < num_loops; ++i) - in = unpack17_32(in, out + i * 32); + for (int i = 0; i < num_loops; ++i) in = unpack17_32(in, out + i * 32); break; case 18: - for (int i = 0; i < num_loops; ++i) - in = unpack18_32(in, out + i * 32); + for (int i = 0; i < num_loops; ++i) in = unpack18_32(in, out + i * 32); break; case 19: - for (int i = 0; i < num_loops; ++i) - in = unpack19_32(in, out + i * 32); + for (int i = 0; i < num_loops; ++i) in = unpack19_32(in, out + i * 32); break; case 20: - for (int i = 0; i < num_loops; ++i) - in = unpack20_32(in, out + i * 32); + for (int i = 0; i < num_loops; ++i) in = unpack20_32(in, out + i * 32); break; case 21: - for (int i = 0; i < num_loops; ++i) - in = unpack21_32(in, out + i * 32); + for (int i = 0; i < num_loops; ++i) in = unpack21_32(in, out + i * 32); break; case 22: - for (int i = 0; i < num_loops; ++i) - in = unpack22_32(in, out + i * 32); + for (int i = 0; i < num_loops; ++i) in = unpack22_32(in, out + i * 32); break; case 23: - for (int i = 0; i < 
num_loops; ++i) - in = unpack23_32(in, out + i * 32); + for (int i = 0; i < num_loops; ++i) in = unpack23_32(in, out + i * 32); break; case 24: - for (int i = 0; i < num_loops; ++i) - in = unpack24_32(in, out + i * 32); + for (int i = 0; i < num_loops; ++i) in = unpack24_32(in, out + i * 32); break; case 25: - for (int i = 0; i < num_loops; ++i) - in = unpack25_32(in, out + i * 32); + for (int i = 0; i < num_loops; ++i) in = unpack25_32(in, out + i * 32); break; case 26: - for (int i = 0; i < num_loops; ++i) - in = unpack26_32(in, out + i * 32); + for (int i = 0; i < num_loops; ++i) in = unpack26_32(in, out + i * 32); break; case 27: - for (int i = 0; i < num_loops; ++i) - in = unpack27_32(in, out + i * 32); + for (int i = 0; i < num_loops; ++i) in = unpack27_32(in, out + i * 32); break; case 28: - for (int i = 0; i < num_loops; ++i) - in = unpack28_32(in, out + i * 32); + for (int i = 0; i < num_loops; ++i) in = unpack28_32(in, out + i * 32); break; case 29: - for (int i = 0; i < num_loops; ++i) - in = unpack29_32(in, out + i * 32); + for (int i = 0; i < num_loops; ++i) in = unpack29_32(in, out + i * 32); break; case 30: - for (int i = 0; i < num_loops; ++i) - in = unpack30_32(in, out + i * 32); + for (int i = 0; i < num_loops; ++i) in = unpack30_32(in, out + i * 32); break; case 31: - for (int i = 0; i < num_loops; ++i) - in = unpack31_32(in, out + i * 32); + for (int i = 0; i < num_loops; ++i) in = unpack31_32(in, out + i * 32); break; case 32: - for (int i = 0; i < num_loops; ++i) - in = unpack32_32(in, out + i * 32); + for (int i = 0; i < num_loops; ++i) in = unpack32_32(in, out + i * 32); break; default: DCHECK(false) << "Unsupported num_bits"; diff --git a/cpp/src/arrow/util/compression-test.cc b/cpp/src/arrow/util/compression-test.cc index f7739fc6dd7c3..64896dd6a4a15 100644 --- a/cpp/src/arrow/util/compression-test.cc +++ b/cpp/src/arrow/util/compression-test.cc @@ -15,8 +15,8 @@ // specific language governing permissions and limitations // under the 
License. -#include #include +#include #include #include @@ -43,25 +43,25 @@ void CheckCodecRoundtrip(const vector& data) { // compress with c1 int64_t actual_size; - ASSERT_OK(c1->Compress( - data.size(), &data[0], max_compressed_len, &compressed[0], &actual_size)); + ASSERT_OK(c1->Compress(data.size(), &data[0], max_compressed_len, &compressed[0], + &actual_size)); compressed.resize(actual_size); // decompress with c2 - ASSERT_OK(c2->Decompress( - compressed.size(), &compressed[0], decompressed.size(), &decompressed[0])); + ASSERT_OK(c2->Decompress(compressed.size(), &compressed[0], decompressed.size(), + &decompressed[0])); ASSERT_EQ(data, decompressed); // compress with c2 int64_t actual_size2; - ASSERT_OK(c2->Compress( - data.size(), &data[0], max_compressed_len, &compressed[0], &actual_size2)); + ASSERT_OK(c2->Compress(data.size(), &data[0], max_compressed_len, &compressed[0], + &actual_size2)); ASSERT_EQ(actual_size2, actual_size); // decompress with c1 - ASSERT_OK(c1->Decompress( - compressed.size(), &compressed[0], decompressed.size(), &decompressed[0])); + ASSERT_OK(c1->Decompress(compressed.size(), &compressed[0], decompressed.size(), + &decompressed[0])); ASSERT_EQ(data, decompressed); } @@ -76,24 +76,14 @@ void CheckCodec() { } } -TEST(TestCompressors, Snappy) { - CheckCodec(); -} +TEST(TestCompressors, Snappy) { CheckCodec(); } -TEST(TestCompressors, Brotli) { - CheckCodec(); -} +TEST(TestCompressors, Brotli) { CheckCodec(); } -TEST(TestCompressors, GZip) { - CheckCodec(); -} +TEST(TestCompressors, GZip) { CheckCodec(); } -TEST(TestCompressors, ZSTD) { - CheckCodec(); -} +TEST(TestCompressors, ZSTD) { CheckCodec(); } -TEST(TestCompressors, Lz4) { - CheckCodec(); -} +TEST(TestCompressors, Lz4) { CheckCodec(); } } // namespace arrow diff --git a/cpp/src/arrow/util/compression.h b/cpp/src/arrow/util/compression.h index 19c61179a502a..ae187a7fcdf1c 100644 --- a/cpp/src/arrow/util/compression.h +++ b/cpp/src/arrow/util/compression.h @@ -37,10 +37,11 @@ 
class ARROW_EXPORT Codec { static Status Create(Compression::type codec, std::unique_ptr* out); virtual Status Decompress(int64_t input_len, const uint8_t* input, int64_t output_len, - uint8_t* output_buffer) = 0; + uint8_t* output_buffer) = 0; virtual Status Compress(int64_t input_len, const uint8_t* input, - int64_t output_buffer_len, uint8_t* output_buffer, int64_t* output_length) = 0; + int64_t output_buffer_len, uint8_t* output_buffer, + int64_t* output_length) = 0; virtual int64_t MaxCompressedLen(int64_t input_len, const uint8_t* input) = 0; diff --git a/cpp/src/arrow/util/compression_brotli.cc b/cpp/src/arrow/util/compression_brotli.cc index c03573bc46c1c..e4639083dfadb 100644 --- a/cpp/src/arrow/util/compression_brotli.cc +++ b/cpp/src/arrow/util/compression_brotli.cc @@ -33,8 +33,8 @@ namespace arrow { // ---------------------------------------------------------------------- // Brotli implementation -Status BrotliCodec::Decompress( - int64_t input_len, const uint8_t* input, int64_t output_len, uint8_t* output_buffer) { +Status BrotliCodec::Decompress(int64_t input_len, const uint8_t* input, + int64_t output_len, uint8_t* output_buffer) { size_t output_size = output_len; if (BrotliDecoderDecompress(input_len, input, &output_size, output_buffer) != BROTLI_DECODER_RESULT_SUCCESS) { @@ -48,12 +48,13 @@ int64_t BrotliCodec::MaxCompressedLen(int64_t input_len, const uint8_t* input) { } Status BrotliCodec::Compress(int64_t input_len, const uint8_t* input, - int64_t output_buffer_len, uint8_t* output_buffer, int64_t* output_length) { + int64_t output_buffer_len, uint8_t* output_buffer, + int64_t* output_length) { size_t output_len = output_buffer_len; // TODO: Make quality configurable. 
We use 8 as a default as it is the best // trade-off for Parquet workload if (BrotliEncoderCompress(8, BROTLI_DEFAULT_WINDOW, BROTLI_DEFAULT_MODE, input_len, - input, &output_len, output_buffer) == BROTLI_FALSE) { + input, &output_len, output_buffer) == BROTLI_FALSE) { return Status::IOError("Brotli compression failure."); } *output_length = output_len; diff --git a/cpp/src/arrow/util/compression_brotli.h b/cpp/src/arrow/util/compression_brotli.h index 08bd3379e3489..9e92cb106d422 100644 --- a/cpp/src/arrow/util/compression_brotli.h +++ b/cpp/src/arrow/util/compression_brotli.h @@ -30,10 +30,10 @@ namespace arrow { class ARROW_EXPORT BrotliCodec : public Codec { public: Status Decompress(int64_t input_len, const uint8_t* input, int64_t output_len, - uint8_t* output_buffer) override; + uint8_t* output_buffer) override; Status Compress(int64_t input_len, const uint8_t* input, int64_t output_buffer_len, - uint8_t* output_buffer, int64_t* output_length) override; + uint8_t* output_buffer, int64_t* output_length) override; int64_t MaxCompressedLen(int64_t input_len, const uint8_t* input) override; diff --git a/cpp/src/arrow/util/compression_lz4.cc b/cpp/src/arrow/util/compression_lz4.cc index 65eaa08946e79..295e9a438f799 100644 --- a/cpp/src/arrow/util/compression_lz4.cc +++ b/cpp/src/arrow/util/compression_lz4.cc @@ -32,12 +32,14 @@ namespace arrow { // ---------------------------------------------------------------------- // Lz4 implementation -Status Lz4Codec::Decompress( - int64_t input_len, const uint8_t* input, int64_t output_len, uint8_t* output_buffer) { - int64_t decompressed_size = LZ4_decompress_safe(reinterpret_cast(input), - reinterpret_cast(output_buffer), static_cast(input_len), - static_cast(output_len)); - if (decompressed_size < 1) { return Status::IOError("Corrupt Lz4 compressed data."); } +Status Lz4Codec::Decompress(int64_t input_len, const uint8_t* input, int64_t output_len, + uint8_t* output_buffer) { + int64_t decompressed_size = 
LZ4_decompress_safe( + reinterpret_cast(input), reinterpret_cast(output_buffer), + static_cast(input_len), static_cast(output_len)); + if (decompressed_size < 1) { + return Status::IOError("Corrupt Lz4 compressed data."); + } return Status::OK(); } @@ -46,11 +48,14 @@ int64_t Lz4Codec::MaxCompressedLen(int64_t input_len, const uint8_t* input) { } Status Lz4Codec::Compress(int64_t input_len, const uint8_t* input, - int64_t output_buffer_len, uint8_t* output_buffer, int64_t* output_length) { - *output_length = LZ4_compress_default(reinterpret_cast(input), - reinterpret_cast(output_buffer), static_cast(input_len), - static_cast(output_buffer_len)); - if (*output_length < 1) { return Status::IOError("Lz4 compression failure."); } + int64_t output_buffer_len, uint8_t* output_buffer, + int64_t* output_length) { + *output_length = LZ4_compress_default( + reinterpret_cast(input), reinterpret_cast(output_buffer), + static_cast(input_len), static_cast(output_buffer_len)); + if (*output_length < 1) { + return Status::IOError("Lz4 compression failure."); + } return Status::OK(); } diff --git a/cpp/src/arrow/util/compression_lz4.h b/cpp/src/arrow/util/compression_lz4.h index 9668fec126b12..0af228963f320 100644 --- a/cpp/src/arrow/util/compression_lz4.h +++ b/cpp/src/arrow/util/compression_lz4.h @@ -30,10 +30,10 @@ namespace arrow { class ARROW_EXPORT Lz4Codec : public Codec { public: Status Decompress(int64_t input_len, const uint8_t* input, int64_t output_len, - uint8_t* output_buffer) override; + uint8_t* output_buffer) override; Status Compress(int64_t input_len, const uint8_t* input, int64_t output_buffer_len, - uint8_t* output_buffer, int64_t* output_length) override; + uint8_t* output_buffer, int64_t* output_length) override; int64_t MaxCompressedLen(int64_t input_len, const uint8_t* input) override; diff --git a/cpp/src/arrow/util/compression_snappy.cc b/cpp/src/arrow/util/compression_snappy.cc index db2b67355109a..947ffe559bda6 100644 --- 
a/cpp/src/arrow/util/compression_snappy.cc +++ b/cpp/src/arrow/util/compression_snappy.cc @@ -37,10 +37,11 @@ namespace arrow { // ---------------------------------------------------------------------- // Snappy implementation -Status SnappyCodec::Decompress( - int64_t input_len, const uint8_t* input, int64_t output_len, uint8_t* output_buffer) { +Status SnappyCodec::Decompress(int64_t input_len, const uint8_t* input, + int64_t output_len, uint8_t* output_buffer) { if (!snappy::RawUncompress(reinterpret_cast(input), - static_cast(input_len), reinterpret_cast(output_buffer))) { + static_cast(input_len), + reinterpret_cast(output_buffer))) { return Status::IOError("Corrupt snappy compressed data."); } return Status::OK(); @@ -51,11 +52,12 @@ int64_t SnappyCodec::MaxCompressedLen(int64_t input_len, const uint8_t* input) { } Status SnappyCodec::Compress(int64_t input_len, const uint8_t* input, - int64_t output_buffer_len, uint8_t* output_buffer, int64_t* output_length) { + int64_t output_buffer_len, uint8_t* output_buffer, + int64_t* output_length) { size_t output_len; snappy::RawCompress(reinterpret_cast(input), - static_cast(input_len), reinterpret_cast(output_buffer), - &output_len); + static_cast(input_len), + reinterpret_cast(output_buffer), &output_len); *output_length = static_cast(output_len); return Status::OK(); } diff --git a/cpp/src/arrow/util/compression_snappy.h b/cpp/src/arrow/util/compression_snappy.h index 25281e1a97a16..5cc10c470af45 100644 --- a/cpp/src/arrow/util/compression_snappy.h +++ b/cpp/src/arrow/util/compression_snappy.h @@ -29,10 +29,10 @@ namespace arrow { class ARROW_EXPORT SnappyCodec : public Codec { public: Status Decompress(int64_t input_len, const uint8_t* input, int64_t output_len, - uint8_t* output_buffer) override; + uint8_t* output_buffer) override; Status Compress(int64_t input_len, const uint8_t* input, int64_t output_buffer_len, - uint8_t* output_buffer, int64_t* output_length) override; + uint8_t* output_buffer, int64_t* 
output_length) override; int64_t MaxCompressedLen(int64_t input_len, const uint8_t* input) override; diff --git a/cpp/src/arrow/util/compression_zlib.cc b/cpp/src/arrow/util/compression_zlib.cc index 3ff33b82028e8..ae6627ea6442f 100644 --- a/cpp/src/arrow/util/compression_zlib.cc +++ b/cpp/src/arrow/util/compression_zlib.cc @@ -69,7 +69,7 @@ class GZipCodec::GZipCodecImpl { window_bits += GZIP_CODEC; } if ((ret = deflateInit2(&stream_, Z_DEFAULT_COMPRESSION, Z_DEFLATED, window_bits, 9, - Z_DEFAULT_STRATEGY)) != Z_OK) { + Z_DEFAULT_STRATEGY)) != Z_OK) { std::stringstream ss; ss << "zlib deflateInit failed: " << std::string(stream_.msg); return Status::IOError(ss.str()); @@ -79,7 +79,9 @@ class GZipCodec::GZipCodecImpl { } void EndCompressor() { - if (compressor_initialized_) { (void)deflateEnd(&stream_); } + if (compressor_initialized_) { + (void)deflateEnd(&stream_); + } compressor_initialized_ = false; } @@ -100,13 +102,17 @@ class GZipCodec::GZipCodecImpl { } void EndDecompressor() { - if (decompressor_initialized_) { (void)inflateEnd(&stream_); } + if (decompressor_initialized_) { + (void)inflateEnd(&stream_); + } decompressor_initialized_ = false; } Status Decompress(int64_t input_length, const uint8_t* input, int64_t output_length, - uint8_t* output) { - if (!decompressor_initialized_) { RETURN_NOT_OK(InitDecompressor()); } + uint8_t* output) { + if (!decompressor_initialized_) { + RETURN_NOT_OK(InitDecompressor()); + } if (output_length == 0) { // The zlib library does not allow *output to be NULL, even when output_length // is 0 (inflate() will return Z_STREAM_ERROR). 
We don't consider this an @@ -168,8 +174,10 @@ class GZipCodec::GZipCodecImpl { } Status Compress(int64_t input_length, const uint8_t* input, int64_t output_buffer_len, - uint8_t* output, int64_t* output_length) { - if (!compressor_initialized_) { RETURN_NOT_OK(InitCompressor()); } + uint8_t* output, int64_t* output_length) { + if (!compressor_initialized_) { + RETURN_NOT_OK(InitCompressor()); + } stream_.next_in = const_cast(reinterpret_cast(input)); stream_.avail_in = static_cast(input_length); stream_.next_out = reinterpret_cast(output); @@ -218,14 +226,12 @@ class GZipCodec::GZipCodecImpl { bool decompressor_initialized_; }; -GZipCodec::GZipCodec(Format format) { - impl_.reset(new GZipCodecImpl(format)); -} +GZipCodec::GZipCodec(Format format) { impl_.reset(new GZipCodecImpl(format)); } GZipCodec::~GZipCodec() {} Status GZipCodec::Decompress(int64_t input_length, const uint8_t* input, - int64_t output_buffer_len, uint8_t* output) { + int64_t output_buffer_len, uint8_t* output) { return impl_->Decompress(input_length, input, output_buffer_len, output); } @@ -234,12 +240,11 @@ int64_t GZipCodec::MaxCompressedLen(int64_t input_length, const uint8_t* input) } Status GZipCodec::Compress(int64_t input_length, const uint8_t* input, - int64_t output_buffer_len, uint8_t* output, int64_t* output_length) { + int64_t output_buffer_len, uint8_t* output, + int64_t* output_length) { return impl_->Compress(input_length, input, output_buffer_len, output, output_length); } -const char* GZipCodec::name() const { - return "gzip"; -} +const char* GZipCodec::name() const { return "gzip"; } } // namespace arrow diff --git a/cpp/src/arrow/util/compression_zlib.h b/cpp/src/arrow/util/compression_zlib.h index 517a06175ec8f..f55d6689edfa9 100644 --- a/cpp/src/arrow/util/compression_zlib.h +++ b/cpp/src/arrow/util/compression_zlib.h @@ -40,10 +40,10 @@ class ARROW_EXPORT GZipCodec : public Codec { virtual ~GZipCodec(); Status Decompress(int64_t input_len, const uint8_t* input, int64_t 
output_len, - uint8_t* output_buffer) override; + uint8_t* output_buffer) override; Status Compress(int64_t input_len, const uint8_t* input, int64_t output_buffer_len, - uint8_t* output_buffer, int64_t* output_length) override; + uint8_t* output_buffer, int64_t* output_length) override; int64_t MaxCompressedLen(int64_t input_len, const uint8_t* input) override; diff --git a/cpp/src/arrow/util/compression_zstd.cc b/cpp/src/arrow/util/compression_zstd.cc index 5511cb9dd8f37..ac6e9065d22dd 100644 --- a/cpp/src/arrow/util/compression_zstd.cc +++ b/cpp/src/arrow/util/compression_zstd.cc @@ -32,10 +32,11 @@ namespace arrow { // ---------------------------------------------------------------------- // ZSTD implementation -Status ZSTDCodec::Decompress( - int64_t input_len, const uint8_t* input, int64_t output_len, uint8_t* output_buffer) { - int64_t decompressed_size = ZSTD_decompress(output_buffer, - static_cast(output_len), input, static_cast(input_len)); +Status ZSTDCodec::Decompress(int64_t input_len, const uint8_t* input, int64_t output_len, + uint8_t* output_buffer) { + int64_t decompressed_size = + ZSTD_decompress(output_buffer, static_cast(output_len), input, + static_cast(input_len)); if (decompressed_size != output_len) { return Status::IOError("Corrupt ZSTD compressed data."); } @@ -47,9 +48,10 @@ int64_t ZSTDCodec::MaxCompressedLen(int64_t input_len, const uint8_t* input) { } Status ZSTDCodec::Compress(int64_t input_len, const uint8_t* input, - int64_t output_buffer_len, uint8_t* output_buffer, int64_t* output_length) { + int64_t output_buffer_len, uint8_t* output_buffer, + int64_t* output_length) { *output_length = ZSTD_compress(output_buffer, static_cast(output_buffer_len), - input, static_cast(input_len), 1); + input, static_cast(input_len), 1); if (ZSTD_isError(*output_length)) { return Status::IOError("ZSTD compression failure."); } diff --git a/cpp/src/arrow/util/compression_zstd.h b/cpp/src/arrow/util/compression_zstd.h index 2356d5862e01a..6e40e19d280d7 
100644 --- a/cpp/src/arrow/util/compression_zstd.h +++ b/cpp/src/arrow/util/compression_zstd.h @@ -30,10 +30,10 @@ namespace arrow { class ARROW_EXPORT ZSTDCodec : public Codec { public: Status Decompress(int64_t input_len, const uint8_t* input, int64_t output_len, - uint8_t* output_buffer) override; + uint8_t* output_buffer) override; Status Compress(int64_t input_len, const uint8_t* input, int64_t output_buffer_len, - uint8_t* output_buffer, int64_t* output_length) override; + uint8_t* output_buffer, int64_t* output_length) override; int64_t MaxCompressedLen(int64_t input_len, const uint8_t* input) override; diff --git a/cpp/src/arrow/util/cpu-info.cc b/cpp/src/arrow/util/cpu-info.cc index c0fc8bdddf4bf..b0667cb33ada4 100644 --- a/cpp/src/arrow/util/cpu-info.cc +++ b/cpp/src/arrow/util/cpu-info.cc @@ -30,6 +30,10 @@ #include #endif +#ifdef _WIN32 +#include +#endif + #include #include @@ -62,7 +66,9 @@ static struct { string name; int64_t flag; } flag_mappings[] = { - {"ssse3", CpuInfo::SSSE3}, {"sse4_1", CpuInfo::SSE4_1}, {"sse4_2", CpuInfo::SSE4_2}, + {"ssse3", CpuInfo::SSSE3}, + {"sse4_1", CpuInfo::SSE4_1}, + {"sse4_2", CpuInfo::SSE4_2}, {"popcnt", CpuInfo::POPCNT}, }; static const int64_t num_flags = sizeof(flag_mappings) / sizeof(flag_mappings[0]); @@ -74,15 +80,66 @@ static const int64_t num_flags = sizeof(flag_mappings) / sizeof(flag_mappings[0] int64_t ParseCPUFlags(const string& values) { int64_t flags = 0; for (int i = 0; i < num_flags; ++i) { - if (contains(values, flag_mappings[i].name)) { flags |= flag_mappings[i].flag; } + if (contains(values, flag_mappings[i].name)) { + flags |= flag_mappings[i].flag; + } } return flags; } +#ifdef _WIN32 +bool RetrieveCacheSize(int64_t* cache_sizes) { + if (!cache_sizes) { + return false; + } + PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = nullptr; + PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer_position = nullptr; + DWORD buffer_size = 0; + DWORD offset = 0; + typedef BOOL(WINAPI * 
GetLogicalProcessorInformationFuncPointer)(void*, void*); + GetLogicalProcessorInformationFuncPointer func_pointer = + (GetLogicalProcessorInformationFuncPointer)GetProcAddress( + GetModuleHandle("kernel32"), "GetLogicalProcessorInformation"); + + if (!func_pointer) { + return false; + } + + // Get buffer size + if (func_pointer(buffer, &buffer_size) && GetLastError() != ERROR_INSUFFICIENT_BUFFER) + return false; + + buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(buffer_size); + + if (!buffer || !func_pointer(buffer, &buffer_size)) { + return false; + } + + buffer_position = buffer; + while (offset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= buffer_size) { + if (RelationCache == buffer_position->Relationship) { + PCACHE_DESCRIPTOR cache = &buffer_position->Cache; + if (cache->Level >= 1 && cache->Level <= 3) { + cache_sizes[cache->Level - 1] += cache->Size; + } + } + offset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION); + buffer_position++; + } + + if (buffer) { + free(buffer); + } + return true; +} +#endif + void CpuInfo::Init() { std::lock_guard cpuinfo_lock(cpuinfo_mutex); - if (initialized()) { return; } + if (initialized()) { + return; + } string line; string name; @@ -93,6 +150,16 @@ void CpuInfo::Init() { memset(&cache_sizes_, 0, sizeof(cache_sizes_)); +#ifdef _WIN32 + SYSTEM_INFO system_info; + GetSystemInfo(&system_info); + num_cores = system_info.dwNumberOfProcessors; + + LARGE_INTEGER performance_frequency; + if (QueryPerformanceFrequency(&performance_frequency)) { + max_mhz = static_cast(performance_frequency.QuadPart); + } +#else // Read from /proc/cpuinfo std::ifstream cpuinfo("/proc/cpuinfo", std::ios::in); while (cpuinfo) { @@ -120,6 +187,7 @@ void CpuInfo::Init() { } } if (cpuinfo.is_open()) cpuinfo.close(); +#endif #ifdef __APPLE__ // On Mac OS X use sysctl() to get the cache sizes @@ -131,22 +199,19 @@ void CpuInfo::Init() { for (size_t i = 0; i < 3; ++i) { cache_sizes_[i] = data[i]; } +#elif _WIN32 + if 
(!RetrieveCacheSize(cache_sizes_)) { + SetDefaultCacheSize(); + } #else -#ifndef _SC_LEVEL1_DCACHE_SIZE - // Provide reasonable default values if no info - cache_sizes_[0] = 32 * 1024; // Level 1: 32k - cache_sizes_[1] = 256 * 1024; // Level 2: 256k - cache_sizes_[2] = 3072 * 1024; // Level 3: 3M -#else - // Call sysconf to query for the cache sizes - cache_sizes_[0] = sysconf(_SC_LEVEL1_DCACHE_SIZE); - cache_sizes_[1] = sysconf(_SC_LEVEL2_CACHE_SIZE); - cache_sizes_[2] = sysconf(_SC_LEVEL3_CACHE_SIZE); -#endif + SetDefaultCacheSize(); #endif if (max_mhz != 0) { - cycles_per_ms_ = static_cast(max_mhz) * 1000; + cycles_per_ms_ = static_cast(max_mhz); +#ifndef _WIN32 + cycles_per_ms_ *= 1000; +#endif } else { cycles_per_ms_ = 1000000; } @@ -203,4 +268,18 @@ std::string CpuInfo::model_name() { return model_name_; } +void CpuInfo::SetDefaultCacheSize() { +#ifndef _SC_LEVEL1_DCACHE_SIZE + // Provide reasonable default values if no info + cache_sizes_[0] = 32 * 1024; // Level 1: 32k + cache_sizes_[1] = 256 * 1024; // Level 2: 256k + cache_sizes_[2] = 3072 * 1024; // Level 3: 3M +#else + // Call sysconf to query for the cache sizes + cache_sizes_[0] = sysconf(_SC_LEVEL1_DCACHE_SIZE); + cache_sizes_[1] = sysconf(_SC_LEVEL2_CACHE_SIZE); + cache_sizes_[2] = sysconf(_SC_LEVEL3_CACHE_SIZE); +#endif +} + } // namespace arrow diff --git a/cpp/src/arrow/util/cpu-info.h b/cpp/src/arrow/util/cpu-info.h index 06800fc275572..f4bc8c35e3447 100644 --- a/cpp/src/arrow/util/cpu-info.h +++ b/cpp/src/arrow/util/cpu-info.h @@ -78,6 +78,9 @@ class ARROW_EXPORT CpuInfo { static bool initialized() { return initialized_; } private: + /// Inits CPU cache size variables with default values + static void SetDefaultCacheSize(); + static bool initialized_; static int64_t hardware_flags_; static int64_t original_hardware_flags_; diff --git a/cpp/src/arrow/util/decimal.cc b/cpp/src/arrow/util/decimal.cc index 72ede35bef9b5..1a12e20f9f93f 100644 --- a/cpp/src/arrow/util/decimal.cc +++ 
b/cpp/src/arrow/util/decimal.cc @@ -21,8 +21,8 @@ namespace arrow { namespace decimal { template -ARROW_EXPORT Status FromString( - const std::string& s, Decimal* out, int* precision, int* scale) { +ARROW_EXPORT Status FromString(const std::string& s, Decimal* out, int* precision, + int* scale) { // Implements this regex: "(\\+?|-?)((0*)(\\d*))(\\.(\\d+))?"; if (s.empty()) { return Status::Invalid("Empty string cannot be converted to decimal"); @@ -34,7 +34,9 @@ ARROW_EXPORT Status FromString( char first_char = *charp; if (first_char == '+' || first_char == '-') { - if (first_char == '-') { sign = -1; } + if (first_char == '-') { + sign = -1; + } ++charp; } @@ -55,7 +57,9 @@ ARROW_EXPORT Status FromString( // all zeros and no decimal point if (charp == end) { - if (out != nullptr) { out->value = static_cast(0); } + if (out != nullptr) { + out->value = static_cast(0); + } // Not sure what other libraries assign precision to for this case (this case of // a string consisting only of one or more zeros) @@ -63,7 +67,9 @@ ARROW_EXPORT Status FromString( *precision = static_cast(charp - numeric_string_start); } - if (scale != nullptr) { *scale = 0; } + if (scale != nullptr) { + *scale = 0; + } return Status::OK(); } @@ -127,22 +133,26 @@ ARROW_EXPORT Status FromString( *precision = static_cast(whole_part.size() + fractional_part.size()); } - if (scale != nullptr) { *scale = static_cast(fractional_part.size()); } + if (scale != nullptr) { + *scale = static_cast(fractional_part.size()); + } - if (out != nullptr) { StringToInteger(whole_part, fractional_part, sign, &out->value); } + if (out != nullptr) { + StringToInteger(whole_part, fractional_part, sign, &out->value); + } return Status::OK(); } -template ARROW_EXPORT Status FromString( - const std::string& s, Decimal32* out, int* precision, int* scale); -template ARROW_EXPORT Status FromString( - const std::string& s, Decimal64* out, int* precision, int* scale); -template ARROW_EXPORT Status FromString( - const 
std::string& s, Decimal128* out, int* precision, int* scale); +template ARROW_EXPORT Status FromString(const std::string& s, Decimal32* out, + int* precision, int* scale); +template ARROW_EXPORT Status FromString(const std::string& s, Decimal64* out, + int* precision, int* scale); +template ARROW_EXPORT Status FromString(const std::string& s, Decimal128* out, + int* precision, int* scale); -void StringToInteger( - const std::string& whole, const std::string& fractional, int8_t sign, int32_t* out) { +void StringToInteger(const std::string& whole, const std::string& fractional, int8_t sign, + int32_t* out) { DCHECK(sign == -1 || sign == 1); DCHECK_NE(out, nullptr); DCHECK(!whole.empty() || !fractional.empty()); @@ -150,12 +160,14 @@ void StringToInteger( *out = std::stoi(whole, nullptr, 10) * static_cast(pow(10.0, static_cast(fractional.size()))); } - if (!fractional.empty()) { *out += std::stoi(fractional, nullptr, 10); } + if (!fractional.empty()) { + *out += std::stoi(fractional, nullptr, 10); + } *out *= sign; } -void StringToInteger( - const std::string& whole, const std::string& fractional, int8_t sign, int64_t* out) { +void StringToInteger(const std::string& whole, const std::string& fractional, int8_t sign, + int64_t* out) { DCHECK(sign == -1 || sign == 1); DCHECK_NE(out, nullptr); DCHECK(!whole.empty() || !fractional.empty()); @@ -163,12 +175,14 @@ void StringToInteger( *out = static_cast(std::stoll(whole, nullptr, 10)) * static_cast(pow(10.0, static_cast(fractional.size()))); } - if (!fractional.empty()) { *out += std::stoll(fractional, nullptr, 10); } + if (!fractional.empty()) { + *out += std::stoll(fractional, nullptr, 10); + } *out *= sign; } -void StringToInteger( - const std::string& whole, const std::string& fractional, int8_t sign, int128_t* out) { +void StringToInteger(const std::string& whole, const std::string& fractional, int8_t sign, + int128_t* out) { DCHECK(sign == -1 || sign == 1); DCHECK_NE(out, nullptr); DCHECK(!whole.empty() || 
!fractional.empty()); @@ -200,7 +214,9 @@ void FromBytes(const uint8_t* bytes, bool is_negative, Decimal128* decimal) { int128_t::backend_type& backend(decimal_value.backend()); backend.resize(LIMBS_IN_INT128, LIMBS_IN_INT128); std::memcpy(backend.limbs(), bytes, BYTES_IN_128_BITS); - if (is_negative) { decimal->value = -decimal->value; } + if (is_negative) { + decimal->value = -decimal->value; + } } void ToBytes(const Decimal32& value, uint8_t** bytes) { diff --git a/cpp/src/arrow/util/decimal.h b/cpp/src/arrow/util/decimal.h index 0d84ba89db973..20142faea3ec5 100644 --- a/cpp/src/arrow/util/decimal.h +++ b/cpp/src/arrow/util/decimal.h @@ -37,16 +37,16 @@ using boost::multiprecision::int128_t; template struct ARROW_EXPORT Decimal; -ARROW_EXPORT void StringToInteger( - const std::string& whole, const std::string& fractional, int8_t sign, int32_t* out); -ARROW_EXPORT void StringToInteger( - const std::string& whole, const std::string& fractional, int8_t sign, int64_t* out); -ARROW_EXPORT void StringToInteger( - const std::string& whole, const std::string& fractional, int8_t sign, int128_t* out); +ARROW_EXPORT void StringToInteger(const std::string& whole, const std::string& fractional, + int8_t sign, int32_t* out); +ARROW_EXPORT void StringToInteger(const std::string& whole, const std::string& fractional, + int8_t sign, int64_t* out); +ARROW_EXPORT void StringToInteger(const std::string& whole, const std::string& fractional, + int8_t sign, int128_t* out); template ARROW_EXPORT Status FromString(const std::string& s, Decimal* out, - int* precision = nullptr, int* scale = nullptr); + int* precision = nullptr, int* scale = nullptr); template struct ARROW_EXPORT Decimal { @@ -85,8 +85,8 @@ struct ARROW_EXPORT DecimalPrecision { }; template -ARROW_EXPORT std::string ToString( - const Decimal& decimal_value, int precision, int scale) { +ARROW_EXPORT std::string ToString(const Decimal& decimal_value, int precision, + int scale) { T value = decimal_value.value; // Decimal 
values are sent to clients as strings so in the interest of @@ -108,8 +108,8 @@ ARROW_EXPORT std::string ToString( if (scale > 0) { int remaining_scale = scale; do { - str[--last_char_idx] = static_cast( - (remaining_value % 10) + static_cast('0')); // Ascii offset + str[--last_char_idx] = static_cast((remaining_value % 10) + + static_cast('0')); // Ascii offset remaining_value /= 10; } while (--remaining_scale > 0); str[--last_char_idx] = '.'; diff --git a/cpp/src/arrow/util/key_value_metadata.cc b/cpp/src/arrow/util/key_value_metadata.cc index 8bddd5d0164c2..6877a6a5382fe 100644 --- a/cpp/src/arrow/util/key_value_metadata.cc +++ b/cpp/src/arrow/util/key_value_metadata.cc @@ -48,8 +48,8 @@ KeyValueMetadata::KeyValueMetadata( const std::unordered_map& map) : keys_(UnorderedMapKeys(map)), values_(UnorderedMapValues(map)) {} -KeyValueMetadata::KeyValueMetadata( - const std::vector& keys, const std::vector& values) +KeyValueMetadata::KeyValueMetadata(const std::vector& keys, + const std::vector& values) : keys_(keys), values_(values) { DCHECK_EQ(keys.size(), values.size()); } diff --git a/cpp/src/arrow/util/key_value_metadata.h b/cpp/src/arrow/util/key_value_metadata.h index a2a4623aee7cc..3d602131684f6 100644 --- a/cpp/src/arrow/util/key_value_metadata.h +++ b/cpp/src/arrow/util/key_value_metadata.h @@ -32,8 +32,8 @@ namespace arrow { class ARROW_EXPORT KeyValueMetadata { public: KeyValueMetadata(); - KeyValueMetadata( - const std::vector& keys, const std::vector& values); + KeyValueMetadata(const std::vector& keys, + const std::vector& values); explicit KeyValueMetadata(const std::unordered_map& map); virtual ~KeyValueMetadata() = default; diff --git a/cpp/src/arrow/util/logging.h b/cpp/src/arrow/util/logging.h index b6181219dbae6..89e69f932d52d 100644 --- a/cpp/src/arrow/util/logging.h +++ b/cpp/src/arrow/util/logging.h @@ -50,32 +50,25 @@ namespace arrow { #define DCHECK(condition) \ ARROW_IGNORE_EXPR(condition) \ - while (false) \ - ::arrow::internal::NullLog() + 
while (false) ::arrow::internal::NullLog() #define DCHECK_EQ(val1, val2) \ ARROW_IGNORE_EXPR(val1) \ - while (false) \ - ::arrow::internal::NullLog() + while (false) ::arrow::internal::NullLog() #define DCHECK_NE(val1, val2) \ ARROW_IGNORE_EXPR(val1) \ - while (false) \ - ::arrow::internal::NullLog() + while (false) ::arrow::internal::NullLog() #define DCHECK_LE(val1, val2) \ ARROW_IGNORE_EXPR(val1) \ - while (false) \ - ::arrow::internal::NullLog() + while (false) ::arrow::internal::NullLog() #define DCHECK_LT(val1, val2) \ ARROW_IGNORE_EXPR(val1) \ - while (false) \ - ::arrow::internal::NullLog() + while (false) ::arrow::internal::NullLog() #define DCHECK_GE(val1, val2) \ ARROW_IGNORE_EXPR(val1) \ - while (false) \ - ::arrow::internal::NullLog() + while (false) ::arrow::internal::NullLog() #define DCHECK_GT(val1, val2) \ ARROW_IGNORE_EXPR(val1) \ - while (false) \ - ::arrow::internal::NullLog() + while (false) ::arrow::internal::NullLog() #else #define ARROW_DFATAL ARROW_FATAL @@ -107,14 +100,20 @@ class CerrLog { has_logged_(false) {} virtual ~CerrLog() { - if (has_logged_) { std::cerr << std::endl; } - if (severity_ == ARROW_FATAL) { std::exit(1); } + if (has_logged_) { + std::cerr << std::endl; + } + if (severity_ == ARROW_FATAL) { + std::exit(1); + } } template CerrLog& operator<<(const T& t) { - has_logged_ = true; - std::cerr << t; + if (severity_ != ARROW_DEBUG) { + has_logged_ = true; + std::cerr << t; + } return *this; } @@ -131,7 +130,9 @@ class FatalLog : public CerrLog { : CerrLog(ARROW_FATAL){} // NOLINT [[noreturn]] ~FatalLog() { - if (has_logged_) { std::cerr << std::endl; } + if (has_logged_) { + std::cerr << std::endl; + } std::exit(1); } }; diff --git a/cpp/src/arrow/util/memory.h b/cpp/src/arrow/util/memory.h index c5c17ef907c22..fce9e19293249 100644 --- a/cpp/src/arrow/util/memory.h +++ b/cpp/src/arrow/util/memory.h @@ -31,7 +31,7 @@ uint8_t* pointer_logical_and(const uint8_t* address, uintptr_t bits) { // A helper function for doing memcpy 
with multiple threads. This is required // to saturate the memory bandwidth of modern cpus. void parallel_memcopy(uint8_t* dst, const uint8_t* src, int64_t nbytes, - uintptr_t block_size, int num_threads) { + uintptr_t block_size, int num_threads) { std::vector threadpool(num_threads); uint8_t* left = pointer_logical_and(src + block_size - 1, ~(block_size - 1)); uint8_t* right = pointer_logical_and(src + nbytes, ~(block_size - 1)); @@ -52,15 +52,17 @@ void parallel_memcopy(uint8_t* dst, const uint8_t* src, int64_t nbytes, // Start all threads first and handle leftovers while threads run. for (int i = 0; i < num_threads; i++) { - threadpool[i] = std::thread( - memcpy, dst + prefix + i * chunk_size, left + i * chunk_size, chunk_size); + threadpool[i] = std::thread(memcpy, dst + prefix + i * chunk_size, + left + i * chunk_size, chunk_size); } memcpy(dst, src, prefix); memcpy(dst + prefix + num_threads * chunk_size, right, suffix); for (auto& t : threadpool) { - if (t.joinable()) { t.join(); } + if (t.joinable()) { + t.join(); + } } } diff --git a/cpp/src/arrow/util/random.h b/cpp/src/arrow/util/random.h index 31f2b0680fe0a..ec48d5d4a529c 100644 --- a/cpp/src/arrow/util/random.h +++ b/cpp/src/arrow/util/random.h @@ -27,7 +27,9 @@ class Random { public: explicit Random(uint32_t s) : seed_(s & 0x7fffffffu) { // Avoid bad seeds. - if (seed_ == 0 || seed_ == random_internal::M) { seed_ = 1; } + if (seed_ == 0 || seed_ == random_internal::M) { + seed_ = 1; + } } // Next pseudo-random 32-bit unsigned integer. @@ -48,7 +50,9 @@ class Random { // The first reduction may overflow by 1 bit, so we may need to // repeat. mod == M is not possible; using > allows the faster // sign-bit-based test. 
- if (seed_ > random_internal::M) { seed_ -= random_internal::M; } + if (seed_ > random_internal::M) { + seed_ -= random_internal::M; + } return seed_; } @@ -97,9 +101,9 @@ class Random { double Normal(double mean, double std_dev) { double uniform1 = (Next() + 1.0) / (random_internal::M + 1.0); double uniform2 = (Next() + 1.0) / (random_internal::M + 1.0); - return ( - mean + - std_dev * sqrt(-2 * ::log(uniform1)) * cos(random_internal::kTwoPi * uniform2)); + return (mean + + std_dev * sqrt(-2 * ::log(uniform1)) * + cos(random_internal::kTwoPi * uniform2)); } // Return a random number between 0.0 and 1.0 inclusive. diff --git a/cpp/src/arrow/util/rle-encoding-test.cc b/cpp/src/arrow/util/rle-encoding-test.cc index 7c9b33c349496..7549b874355df 100644 --- a/cpp/src/arrow/util/rle-encoding-test.cc +++ b/cpp/src/arrow/util/rle-encoding-test.cc @@ -178,7 +178,7 @@ TEST(BitArray, TestMixed) { // exactly 'expected_encoding'. // if expected_len is not -1, it will validate the encoded size is correct. 
void ValidateRle(const vector& values, int bit_width, uint8_t* expected_encoding, - int expected_len) { + int expected_len) { const int len = 64 * 1024; uint8_t buffer[len]; EXPECT_LE(expected_len, len); @@ -190,7 +190,9 @@ void ValidateRle(const vector& values, int bit_width, uint8_t* expected_enc } int encoded_len = encoder.Flush(); - if (expected_len != -1) { EXPECT_EQ(encoded_len, expected_len); } + if (expected_len != -1) { + EXPECT_EQ(encoded_len, expected_len); + } if (expected_encoding != NULL) { EXPECT_EQ(memcmp(buffer, expected_encoding, expected_len), 0); } @@ -211,7 +213,7 @@ void ValidateRle(const vector& values, int bit_width, uint8_t* expected_enc RleDecoder decoder(buffer, len, bit_width); vector values_read(values.size()); ASSERT_EQ(values.size(), - decoder.GetBatch(values_read.data(), static_cast(values.size()))); + decoder.GetBatch(values_read.data(), static_cast(values.size()))); EXPECT_EQ(values, values_read); } } @@ -224,7 +226,9 @@ bool CheckRoundTrip(const vector& values, int bit_width) { RleEncoder encoder(buffer, len, bit_width); for (size_t i = 0; i < values.size(); ++i) { bool result = encoder.Put(values[i]); - if (!result) { return false; } + if (!result) { + return false; + } } int encoded_len = encoder.Flush(); int out = 0; @@ -233,7 +237,9 @@ bool CheckRoundTrip(const vector& values, int bit_width) { RleDecoder decoder(buffer, encoded_len, bit_width); for (size_t i = 0; i < values.size(); ++i) { EXPECT_TRUE(decoder.Get(&out)); - if (values[i] != out) { return false; } + if (values[i] != out) { + return false; + } } } @@ -245,7 +251,9 @@ bool CheckRoundTrip(const vector& values, int bit_width) { decoder.GetBatch(values_read.data(), static_cast(values.size()))) { return false; } - if (values != values_read) { return false; } + if (values != values_read) { + return false; + } } return true; @@ -294,8 +302,8 @@ TEST(Rle, SpecificSequences) { ValidateRle(values, 1, expected_buffer, 1 + num_groups); for (int width = 2; width <= MAX_WIDTH; 
++width) { int num_values = static_cast(BitUtil::Ceil(100, 8)) * 8; - ValidateRle( - values, width, NULL, 1 + static_cast(BitUtil::Ceil(width * num_values, 8))); + ValidateRle(values, width, NULL, + 1 + static_cast(BitUtil::Ceil(width * num_values, 8))); } } @@ -352,8 +360,7 @@ TEST(Rle, BitWidthZeroLiteral) { // group but flush before finishing. TEST(BitRle, Flush) { vector values; - for (int i = 0; i < 16; ++i) - values.push_back(1); + for (int i = 0; i < 16; ++i) values.push_back(1); values.push_back(0); ValidateRle(values, 1, NULL, -1); values.push_back(1); @@ -385,7 +392,9 @@ TEST(BitRle, Random) { for (int i = 0; i < ngroups; ++i) { int group_size = dist(gen); - if (group_size > max_group_size) { group_size = 1; } + if (group_size > max_group_size) { + group_size = 1; + } for (int i = 0; i < group_size; ++i) { values.push_back(parity); } diff --git a/cpp/src/arrow/util/rle-encoding.h b/cpp/src/arrow/util/rle-encoding.h index 9ec6235144665..e69077807df3a 100644 --- a/cpp/src/arrow/util/rle-encoding.h +++ b/cpp/src/arrow/util/rle-encoding.h @@ -21,8 +21,8 @@ #ifndef ARROW_UTIL_RLE_ENCODING_H #define ARROW_UTIL_RLE_ENCODING_H -#include #include +#include #include "arrow/util/bit-stream-utils.h" #include "arrow/util/bit-util.h" @@ -122,7 +122,8 @@ class RleDecoder { /// Like GetBatchWithDict but add spacing for null entries template int GetBatchWithDictSpaced(const T* dictionary, T* values, int batch_size, - int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset); + int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset); protected: BitReader bit_reader_; @@ -289,7 +290,7 @@ inline int RleDecoder::GetBatch(T* values, int batch_size) { int repeat_batch = std::min(batch_size - values_read, static_cast(repeat_count_)); std::fill(values + values_read, values + values_read + repeat_batch, - static_cast(current_value_)); + static_cast(current_value_)); repeat_count_ -= repeat_batch; values_read += repeat_batch; } else if (literal_count_ > 
0) { @@ -318,7 +319,7 @@ inline int RleDecoder::GetBatchWithDict(const T* dictionary, T* values, int batc int repeat_batch = std::min(batch_size - values_read, static_cast(repeat_count_)); std::fill(values + values_read, values + values_read + repeat_batch, - dictionary[current_value_]); + dictionary[current_value_]); repeat_count_ -= repeat_batch; values_read += repeat_batch; } else if (literal_count_ > 0) { @@ -345,8 +346,9 @@ inline int RleDecoder::GetBatchWithDict(const T* dictionary, T* values, int batc template inline int RleDecoder::GetBatchWithDictSpaced(const T* dictionary, T* values, - int batch_size, int null_count, const uint8_t* valid_bits, - int64_t valid_bits_offset) { + int batch_size, int null_count, + const uint8_t* valid_bits, + int64_t valid_bits_offset) { DCHECK_GE(bit_width_, 0); int values_read = 0; int remaining_nulls = null_count; @@ -379,8 +381,8 @@ inline int RleDecoder::GetBatchWithDictSpaced(const T* dictionary, T* values, std::fill(values + values_read, values + values_read + repeat_batch, value); values_read += repeat_batch; } else if (literal_count_ > 0) { - int literal_batch = std::min( - batch_size - values_read - remaining_nulls, static_cast(literal_count_)); + int literal_batch = std::min(batch_size - values_read - remaining_nulls, + static_cast(literal_count_)); // Decode the literals constexpr int kBufferSize = 1024; @@ -434,7 +436,7 @@ bool RleDecoder::NextCounts() { repeat_count_ = indicator_value >> 1; bool result = bit_reader_.GetAligned(static_cast(BitUtil::Ceil(bit_width_, 8)), - reinterpret_cast(¤t_value_)); + reinterpret_cast(¤t_value_)); DCHECK(result); } return true; @@ -509,8 +511,8 @@ inline void RleEncoder::FlushRepeatedRun() { // The lsb of 0 indicates this is a repeated run int32_t indicator_value = repeat_count_ << 1 | 0; result &= bit_writer_.PutVlqInt(indicator_value); - result &= bit_writer_.PutAligned( - current_value_, static_cast(BitUtil::Ceil(bit_width_, 8))); + result &= 
bit_writer_.PutAligned(current_value_, + static_cast(BitUtil::Ceil(bit_width_, 8))); DCHECK(result); num_buffered_values_ = 0; repeat_count_ = 0; @@ -552,7 +554,7 @@ inline void RleEncoder::FlushBufferedValues(bool done) { inline int RleEncoder::Flush() { if (literal_count_ > 0 || repeat_count_ > 0 || num_buffered_values_ > 0) { bool all_repeat = literal_count_ == 0 && (repeat_count_ == num_buffered_values_ || - num_buffered_values_ == 0); + num_buffered_values_ == 0); // There is something pending, figure out if it's a repeated or literal run if (repeat_count_ > 0 && all_repeat) { FlushRepeatedRun(); diff --git a/cpp/src/arrow/util/sse-util.h b/cpp/src/arrow/util/sse-util.h index 570c4057a7573..a0ec8a2e93911 100644 --- a/cpp/src/arrow/util/sse-util.h +++ b/cpp/src/arrow/util/sse-util.h @@ -53,8 +53,8 @@ static const int STRCMP_MODE = /// Precomputed mask values up to 16 bits. static const int SSE_BITMASK[CHARS_PER_128_BIT_REGISTER] = { - 1 << 0, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7, 1 << 8, 1 << 9, - 1 << 10, 1 << 11, 1 << 12, 1 << 13, 1 << 14, 1 << 15, + 1 << 0, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7, + 1 << 8, 1 << 9, 1 << 10, 1 << 11, 1 << 12, 1 << 13, 1 << 14, 1 << 15, }; } // namespace SSEUtil diff --git a/cpp/src/arrow/util/stl.h b/cpp/src/arrow/util/stl.h index d58689b748896..4b8916f6eaa54 100644 --- a/cpp/src/arrow/util/stl.h +++ b/cpp/src/arrow/util/stl.h @@ -40,8 +40,8 @@ inline std::vector DeleteVectorElement(const std::vector& values, size_t i } template -inline std::vector AddVectorElement( - const std::vector& values, size_t index, const T& new_element) { +inline std::vector AddVectorElement(const std::vector& values, size_t index, + const T& new_element) { DCHECK_LE(index, values.size()); std::vector out; out.reserve(values.size() + 1); diff --git a/cpp/src/arrow/util/string.h b/cpp/src/arrow/util/string.h index 5d9fdc88ced7e..6e70ddcccefec 100644 --- a/cpp/src/arrow/util/string.h +++ b/cpp/src/arrow/util/string.h 
@@ -46,7 +46,9 @@ static inline Status ParseHexValue(const char* data, uint8_t* out) { const char* pos2 = std::lower_bound(kAsciiTable, kAsciiTable + 16, c2); // Error checking - if (*pos1 != c1 || *pos2 != c2) { return Status::Invalid("Encountered non-hex digit"); } + if (*pos1 != c1 || *pos2 != c2) { + return Status::Invalid("Encountered non-hex digit"); + } *out = static_cast((pos1 - kAsciiTable) << 4 | (pos2 - kAsciiTable)); return Status::OK(); diff --git a/cpp/src/plasma/CMakeLists.txt b/cpp/src/plasma/CMakeLists.txt index 4ff3beba779c2..8bb7e71fdf11b 100644 --- a/cpp/src/plasma/CMakeLists.txt +++ b/cpp/src/plasma/CMakeLists.txt @@ -19,16 +19,13 @@ cmake_minimum_required(VERSION 2.8) project(plasma) +set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/../python/cmake_modules") + find_package(PythonLibsNew REQUIRED) find_package(Threads) -option(PLASMA_PYTHON - "Build the Plasma Python extensions" - OFF) - -if(APPLE) - SET(CMAKE_SHARED_LIBRARY_SUFFIX ".so") -endif(APPLE) +set(PLASMA_SO_VERSION "0") +set(PLASMA_ABI_VERSION "${PLASMA_SO_VERSION}.0.0") include_directories(SYSTEM ${PYTHON_INCLUDE_DIRS}) include_directories("${FLATBUFFERS_INCLUDE_DIR}" "${CMAKE_CURRENT_LIST_DIR}/" "${CMAKE_CURRENT_LIST_DIR}/thirdparty/" "${CMAKE_CURRENT_LIST_DIR}/../") @@ -40,7 +37,7 @@ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-conversion") # Compile flatbuffers set(PLASMA_FBS_SRC "${CMAKE_CURRENT_LIST_DIR}/format/plasma.fbs" "${CMAKE_CURRENT_LIST_DIR}/format/common.fbs") -set(OUTPUT_DIR ${CMAKE_CURRENT_LIST_DIR}/format/) +set(OUTPUT_DIR ${CMAKE_CURRENT_LIST_DIR}/) set(PLASMA_FBS_OUTPUT_FILES "${OUTPUT_DIR}/common_generated.h" @@ -69,8 +66,6 @@ endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC") -set_source_files_properties(extension.cc PROPERTIES COMPILE_FLAGS -Wno-strict-aliasing) - set(PLASMA_SRCS client.cc common.cc @@ -97,17 +92,33 @@ set_source_files_properties(malloc.cc PROPERTIES COMPILE_FLAGS "-Wno-error -O3") add_executable(plasma_store store.cc) 
target_link_libraries(plasma_store plasma_static) +# Headers: top level +install(FILES + common.h + common_generated.h + client.h + events.h + plasma.h + plasma_generated.h + protocol.h + DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/plasma") + +# Plasma store +install(TARGETS plasma_store DESTINATION ${CMAKE_INSTALL_BINDIR}) + +# pkg-config support +configure_file(plasma.pc.in + "${CMAKE_CURRENT_BINARY_DIR}/plasma.pc" + @ONLY) +install( + FILES "${CMAKE_CURRENT_BINARY_DIR}/plasma.pc" + DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig/") + +####################################### +# Unit tests +####################################### + ADD_ARROW_TEST(test/serialization_tests) ARROW_TEST_LINK_LIBRARIES(test/serialization_tests plasma_static) ADD_ARROW_TEST(test/client_tests) ARROW_TEST_LINK_LIBRARIES(test/client_tests plasma_static) - -if(PLASMA_PYTHON) - add_library(plasma_extension SHARED extension.cc) - - if(APPLE) - target_link_libraries(plasma_extension plasma_static "-undefined dynamic_lookup") - else(APPLE) - target_link_libraries(plasma_extension plasma_static -Wl,--whole-archive ${FLATBUFFERS_STATIC_LIB} -Wl,--no-whole-archive) - endif(APPLE) -endif() diff --git a/cpp/src/plasma/client.cc b/cpp/src/plasma/client.cc index b7b0ff0a3092d..44b94d6c5cf26 100644 --- a/cpp/src/plasma/client.cc +++ b/cpp/src/plasma/client.cc @@ -51,11 +51,31 @@ #define XXH64_DEFAULT_SEED 0 +namespace plasma { + // Number of threads used for memcopy and hash computations. constexpr int64_t kThreadPoolSize = 8; constexpr int64_t kBytesInMB = 1 << 20; static std::vector threadpool_(kThreadPoolSize); +struct ObjectInUseEntry { + /// A count of the number of times this client has called PlasmaClient::Create + /// or + /// PlasmaClient::Get on this object ID minus the number of calls to + /// PlasmaClient::Release. + /// When this count reaches zero, we remove the entry from the ObjectsInUse + /// and decrement a count in the relevant ClientMmapTableEntry. 
+ int count; + /// Cached information to read the object. + PlasmaObject object; + /// A flag representing whether the object has been sealed. + bool is_sealed; +}; + +PlasmaClient::PlasmaClient() {} + +PlasmaClient::~PlasmaClient() {} + // If the file descriptor fd has been mmapped in this client process before, // return the pointer that was returned by mmap, otherwise mmap it and store the // pointer in a hash table. @@ -68,7 +88,9 @@ uint8_t* PlasmaClient::lookup_or_mmap(int fd, int store_fd_val, int64_t map_size uint8_t* result = reinterpret_cast( mmap(NULL, map_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0)); // TODO(pcm): Don't fail here, instead return a Status. - if (result == MAP_FAILED) { ARROW_LOG(FATAL) << "mmap failed"; } + if (result == MAP_FAILED) { + ARROW_LOG(FATAL) << "mmap failed"; + } close(fd); ClientMmapTableEntry& entry = mmap_table_[store_fd_val]; entry.pointer = result; @@ -86,8 +108,8 @@ uint8_t* PlasmaClient::lookup_mmapped_file(int store_fd_val) { return entry->second.pointer; } -void PlasmaClient::increment_object_count( - const ObjectID& object_id, PlasmaObject* object, bool is_sealed) { +void PlasmaClient::increment_object_count(const ObjectID& object_id, PlasmaObject* object, + bool is_sealed) { // Increment the count of the object to track the fact that it is being used. // The corresponding decrement should happen in PlasmaClient::Release. 
auto elem = objects_in_use_.find(object_id); @@ -122,7 +144,7 @@ void PlasmaClient::increment_object_count( } Status PlasmaClient::Create(const ObjectID& object_id, int64_t data_size, - uint8_t* metadata, int64_t metadata_size, uint8_t** data) { + uint8_t* metadata, int64_t metadata_size, uint8_t** data) { ARROW_LOG(DEBUG) << "called plasma_create on conn " << store_conn_ << " with size " << data_size << " and metadata size " << metadata_size; RETURN_NOT_OK(SendCreateRequest(store_conn_, object_id, data_size, metadata_size)); @@ -163,7 +185,7 @@ Status PlasmaClient::Create(const ObjectID& object_id, int64_t data_size, } Status PlasmaClient::Get(const ObjectID* object_ids, int64_t num_objects, - int64_t timeout_ms, ObjectBuffer* object_buffers) { + int64_t timeout_ms, ObjectBuffer* object_buffers) { // Fill out the info for the objects that are already in use locally. bool all_present = true; for (int i = 0; i < num_objects; ++i) { @@ -193,7 +215,9 @@ Status PlasmaClient::Get(const ObjectID* object_ids, int64_t num_objects, } } - if (all_present) { return Status::OK(); } + if (all_present) { + return Status::OK(); + } // If we get here, then the objects aren't all currently in use by this // client, so we need to send a request to the plasma store. 
@@ -203,8 +227,9 @@ Status PlasmaClient::Get(const ObjectID* object_ids, int64_t num_objects, std::vector received_object_ids(num_objects); std::vector object_data(num_objects); PlasmaObject* object; - RETURN_NOT_OK(ReadGetReply( - buffer.data(), buffer.size(), received_object_ids.data(), object_data.data(), num_objects)); + RETURN_NOT_OK(ReadGetReply(buffer.data(), buffer.size(), + received_object_ids.data(), object_data.data(), + num_objects)); for (int i = 0; i < num_objects; ++i) { DCHECK(received_object_ids[i] == object_ids[i]); @@ -300,13 +325,17 @@ Status PlasmaClient::PerformRelease(const ObjectID& object_id) { } Status PlasmaClient::Release(const ObjectID& object_id) { + // If the client is already disconnected, ignore release requests. + if (store_conn_ < 0) { + return Status::OK(); + } // Add the new object to the release history. release_history_.push_front(object_id); // If there are too many bytes in use by the client or if there are too many // pending release calls, and there are at least some pending release calls in // the release_history list, then release some objects. while ((in_use_object_bytes_ > std::min(kL3CacheSizeBytes, store_capacity_ / 100) || - release_history_.size() > config_.release_delay) && + release_history_.size() > config_.release_delay) && release_history_.size() > 0) { // Perform a release for the object ID for the first pending release. RETURN_NOT_OK(PerformRelease(release_history_.back())); @@ -340,8 +369,9 @@ static void ComputeBlockHash(const unsigned char* data, int64_t nbytes, uint64_t *hash = XXH64_digest(&hash_state); } -static inline bool compute_object_hash_parallel( - XXH64_state_t* hash_state, const unsigned char* data, int64_t nbytes) { +static inline bool compute_object_hash_parallel(XXH64_state_t* hash_state, + const unsigned char* data, + int64_t nbytes) { // Note that this function will likely be faster if the address of data is // aligned on a 64-byte boundary. 
const int num_threads = kThreadPoolSize; @@ -356,16 +386,18 @@ static inline bool compute_object_hash_parallel( // Each thread gets a "chunk" of k blocks, except the suffix thread. for (int i = 0; i < num_threads; i++) { - threadpool_[i] = std::thread(ComputeBlockHash, - reinterpret_cast(data_address) + i * chunk_size, chunk_size, - &threadhash[i]); + threadpool_[i] = std::thread( + ComputeBlockHash, reinterpret_cast(data_address) + i * chunk_size, + chunk_size, &threadhash[i]); } - ComputeBlockHash( - reinterpret_cast(right_address), suffix, &threadhash[num_threads]); + ComputeBlockHash(reinterpret_cast(right_address), suffix, + &threadhash[num_threads]); // Join the threads. for (auto& t : threadpool_) { - if (t.joinable()) { t.join(); } + if (t.joinable()) { + t.join(); + } } XXH64_update(hash_state, (unsigned char*)threadhash, sizeof(threadhash)); @@ -376,32 +408,16 @@ static uint64_t compute_object_hash(const ObjectBuffer& obj_buffer) { XXH64_state_t hash_state; XXH64_reset(&hash_state, XXH64_DEFAULT_SEED); if (obj_buffer.data_size >= kBytesInMB) { - compute_object_hash_parallel( - &hash_state, (unsigned char*)obj_buffer.data, obj_buffer.data_size); + compute_object_hash_parallel(&hash_state, (unsigned char*)obj_buffer.data, + obj_buffer.data_size); } else { XXH64_update(&hash_state, (unsigned char*)obj_buffer.data, obj_buffer.data_size); } - XXH64_update( - &hash_state, (unsigned char*)obj_buffer.metadata, obj_buffer.metadata_size); + XXH64_update(&hash_state, (unsigned char*)obj_buffer.metadata, + obj_buffer.metadata_size); return XXH64_digest(&hash_state); } -bool plasma_compute_object_hash( - PlasmaClient* conn, ObjectID object_id, unsigned char* digest) { - // Get the plasma object data. We pass in a timeout of 0 to indicate that - // the operation should timeout immediately. - ObjectBuffer object_buffer; - ARROW_CHECK_OK(conn->Get(&object_id, 1, 0, &object_buffer)); - // If the object was not retrieved, return false. 
- if (object_buffer.data_size == -1) { return false; } - // Compute the hash. - uint64_t hash = compute_object_hash(object_buffer); - memcpy(digest, &hash, sizeof(hash)); - // Release the plasma object. - ARROW_CHECK_OK(conn->Release(object_id)); - return true; -} - Status PlasmaClient::Seal(const ObjectID& object_id) { // Make sure this client has a reference to the object before sending the // request to Plasma. @@ -413,7 +429,7 @@ Status PlasmaClient::Seal(const ObjectID& object_id) { object_entry->second->is_sealed = true; /// Send the seal request to Plasma. static unsigned char digest[kDigestSize]; - ARROW_CHECK(plasma_compute_object_hash(this, object_id, &digest[0])); + RETURN_NOT_OK(Hash(object_id, &digest[0])); RETURN_NOT_OK(SendSealRequest(store_conn_, object_id, &digest[0])); // We call PlasmaClient::Release to decrement the number of instances of this // object @@ -439,6 +455,22 @@ Status PlasmaClient::Evict(int64_t num_bytes, int64_t& num_bytes_evicted) { return ReadEvictReply(buffer.data(), buffer.size(), num_bytes_evicted); } +Status PlasmaClient::Hash(const ObjectID& object_id, uint8_t* digest) { + // Get the plasma object data. We pass in a timeout of 0 to indicate that + // the operation should timeout immediately. + ObjectBuffer object_buffer; + RETURN_NOT_OK(Get(&object_id, 1, 0, &object_buffer)); + // If the object was not retrieved, return false. + if (object_buffer.data_size == -1) { + return Status::PlasmaObjectNonexistent("Object not found"); + } + // Compute the hash. + uint64_t hash = compute_object_hash(object_buffer); + memcpy(digest, &hash, sizeof(hash)); + // Release the plasma object. + return Release(object_id); +} + Status PlasmaClient::Subscribe(int* fd) { int sock[2]; // Create a non-blocking socket pair. 
This will only be used to send @@ -459,8 +491,28 @@ Status PlasmaClient::Subscribe(int* fd) { return Status::OK(); } +Status PlasmaClient::GetNotification(int fd, ObjectID* object_id, int64_t* data_size, + int64_t* metadata_size) { + uint8_t* notification = read_message_async(fd); + if (notification == NULL) { + return Status::IOError("Failed to read object notification from Plasma socket"); + } + auto object_info = flatbuffers::GetRoot(notification); + ARROW_CHECK(object_info->object_id()->size() == sizeof(ObjectID)); + memcpy(object_id, object_info->object_id()->data(), sizeof(ObjectID)); + if (object_info->is_deletion()) { + *data_size = -1; + *metadata_size = -1; + } else { + *data_size = object_info->data_size(); + *metadata_size = object_info->metadata_size(); + } + delete[] notification; + return Status::OK(); +} + Status PlasmaClient::Connect(const std::string& store_socket_name, - const std::string& manager_socket_name, int release_delay) { + const std::string& manager_socket_name, int release_delay) { store_conn_ = connect_ipc_sock_retry(store_socket_name, -1, -1); if (manager_socket_name != "") { manager_conn_ = connect_ipc_sock_retry(manager_socket_name, -1, -1); @@ -485,7 +537,11 @@ Status PlasmaClient::Disconnect() { // Close the connections to Plasma. The Plasma store will release the objects // that were in use by us when handling the SIGPIPE. 
close(store_conn_); - if (manager_conn_ >= 0) { close(manager_conn_); } + store_conn_ = -1; + if (manager_conn_ >= 0) { + close(manager_conn_); + manager_conn_ = -1; + } return Status::OK(); } @@ -500,9 +556,7 @@ Status PlasmaClient::Fetch(int num_object_ids, const ObjectID* object_ids) { return SendFetchRequest(manager_conn_, object_ids, num_object_ids); } -int PlasmaClient::get_manager_fd() { - return manager_conn_; -} +int PlasmaClient::get_manager_fd() { return manager_conn_; } Status PlasmaClient::Info(const ObjectID& object_id, int* object_status) { ARROW_CHECK(manager_conn_ >= 0); @@ -517,7 +571,8 @@ Status PlasmaClient::Info(const ObjectID& object_id, int* object_status) { } Status PlasmaClient::Wait(int64_t num_object_requests, ObjectRequest* object_requests, - int num_ready_objects, int64_t timeout_ms, int* num_objects_ready) { + int num_ready_objects, int64_t timeout_ms, + int* num_objects_ready) { ARROW_CHECK(manager_conn_ >= 0); ARROW_CHECK(num_object_requests > 0); ARROW_CHECK(num_ready_objects > 0); @@ -529,7 +584,7 @@ Status PlasmaClient::Wait(int64_t num_object_requests, ObjectRequest* object_req } RETURN_NOT_OK(SendWaitRequest(manager_conn_, object_requests, num_object_requests, - num_ready_objects, timeout_ms)); + num_ready_objects, timeout_ms)); std::vector buffer; RETURN_NOT_OK(PlasmaReceive(manager_conn_, MessageType_PlasmaWaitReply, &buffer)); RETURN_NOT_OK(ReadWaitReply(buffer.data(), buffer.size(), object_requests, &num_ready_objects)); @@ -540,7 +595,9 @@ Status PlasmaClient::Wait(int64_t num_object_requests, ObjectRequest* object_req int status = object_requests[i].status; switch (type) { case PLASMA_QUERY_LOCAL: - if (status == ObjectStatus_Local) { *num_objects_ready += 1; } + if (status == ObjectStatus_Local) { + *num_objects_ready += 1; + } break; case PLASMA_QUERY_ANYWHERE: if (status == ObjectStatus_Local || status == ObjectStatus_Remote) { @@ -555,3 +612,5 @@ Status PlasmaClient::Wait(int64_t num_object_requests, ObjectRequest* 
object_req } return Status::OK(); } + +} // namespace plasma diff --git a/cpp/src/plasma/client.h b/cpp/src/plasma/client.h index fb3a161795d47..cc05a064511fe 100644 --- a/cpp/src/plasma/client.h +++ b/cpp/src/plasma/client.h @@ -22,12 +22,18 @@ #include #include +#include #include +#include -#include "plasma/plasma.h" +#include "arrow/status.h" +#include "arrow/util/visibility.h" +#include "plasma/common.h" using arrow::Status; +namespace plasma { + #define PLASMA_DEFAULT_RELEASE_DELAY 64 // Use 100MB as an overestimate of the L3 cache size. @@ -63,22 +69,16 @@ struct ClientMmapTableEntry { int count; }; -struct ObjectInUseEntry { - /// A count of the number of times this client has called PlasmaClient::Create - /// or - /// PlasmaClient::Get on this object ID minus the number of calls to - /// PlasmaClient::Release. - /// When this count reaches zero, we remove the entry from the ObjectsInUse - /// and decrement a count in the relevant ClientMmapTableEntry. - int count; - /// Cached information to read the object. - PlasmaObject object; - /// A flag representing whether the object has been sealed. - bool is_sealed; -}; +struct ObjectInUseEntry; +struct ObjectRequest; +struct PlasmaObject; -class PlasmaClient { +class ARROW_EXPORT PlasmaClient { public: + PlasmaClient(); + + ~PlasmaClient(); + /// Connect to the local plasma store and plasma manager. Return /// the resulting connection. /// @@ -91,7 +91,7 @@ class PlasmaClient { /// and not evicted to avoid too many munmaps. /// @return The return status. Status Connect(const std::string& store_socket_name, - const std::string& manager_socket_name, int release_delay); + const std::string& manager_socket_name, int release_delay); /// Create an object in the Plasma Store. Any metadata for this object must be /// be passed in when the object is created. @@ -108,7 +108,7 @@ class PlasmaClient { /// @param data The address of the newly created object will be written here. /// @return The return status. 
Status Create(const ObjectID& object_id, int64_t data_size, uint8_t* metadata, - int64_t metadata_size, uint8_t** data); + int64_t metadata_size, uint8_t** data); /// Get some objects from the Plasma Store. This function will block until the /// objects have all been created and sealed in the Plasma Store or the @@ -126,7 +126,7 @@ class PlasmaClient { /// size field is -1, then the object was not retrieved. /// @return The return status. Status Get(const ObjectID* object_ids, int64_t num_objects, int64_t timeout_ms, - ObjectBuffer* object_buffers); + ObjectBuffer* object_buffers); /// Tell Plasma that the client no longer needs the object. This should be /// called @@ -177,10 +177,18 @@ class PlasmaClient { /// @return The return status. Status Evict(int64_t num_bytes, int64_t& num_bytes_evicted); + /// Compute the hash of an object in the object store. + /// + /// @param conn The object containing the connection state. + /// @param object_id The ID of the object we want to hash. + /// @param digest A pointer at which to return the hash digest of the object. + /// The pointer must have at least kDigestSize bytes allocated. + /// @return The return status. + Status Hash(const ObjectID& object_id, uint8_t* digest); + /// Subscribe to notifications when objects are sealed in the object store. /// Whenever an object is sealed, a message will be written to the client - /// socket - /// that is returned by this method. + /// socket that is returned by this method. /// /// @param fd Out parameter for the file descriptor the client should use to /// read notifications @@ -188,6 +196,16 @@ class PlasmaClient { /// @return The return status. Status Subscribe(int* fd); + /// Receive next object notification for this client if Subscribe has been called. + /// + /// @param fd The file descriptor we are reading the notification from. + /// @param object_id Out parameter, the object_id of the object that was sealed. 
+ /// @param data_size Out parameter, the data size of the object that was sealed. + /// @param metadata_size Out parameter, the metadata size of the object that was sealed. + /// @return The return status. + Status GetNotification(int fd, ObjectID* object_id, int64_t* data_size, + int64_t* metadata_size); + /// Disconnect from the local plasma instance, including the local store and /// manager. /// @@ -253,7 +271,7 @@ class PlasmaClient { /// min_num_ready_objects this means that timeout expired. /// @return The return status. Status Wait(int64_t num_object_requests, ObjectRequest* object_requests, - int num_ready_objects, int64_t timeout_ms, int* num_objects_ready); + int num_ready_objects, int64_t timeout_ms, int* num_objects_ready); /// Transfer local object to a different plasma manager. /// @@ -297,8 +315,8 @@ class PlasmaClient { uint8_t* lookup_mmapped_file(int store_fd_val); - void increment_object_count( - const ObjectID& object_id, PlasmaObject* object, bool is_sealed); + void increment_object_count(const ObjectID& object_id, PlasmaObject* object, + bool is_sealed); /// File descriptor of the Unix domain socket that connects to the store. int store_conn_; @@ -330,14 +348,6 @@ class PlasmaClient { int64_t store_capacity_; }; -/// Compute the hash of an object in the object store. -/// -/// @param conn The object containing the connection state. -/// @param object_id The ID of the object we want to hash. -/// @param digest A pointer at which to return the hash digest of the object. -/// The pointer must have at least DIGEST_SIZE bytes allocated. -/// @return A boolean representing whether the hash operation succeeded. 
-bool plasma_compute_object_hash( - PlasmaClient* conn, ObjectID object_id, unsigned char* digest); +} // namespace plasma #endif // PLASMA_CLIENT_H diff --git a/cpp/src/plasma/common.cc b/cpp/src/plasma/common.cc index a09a963fa4769..d7a7965078533 100644 --- a/cpp/src/plasma/common.cc +++ b/cpp/src/plasma/common.cc @@ -19,7 +19,9 @@ #include -#include "format/plasma_generated.h" +#include "plasma/plasma_generated.h" + +namespace plasma { using arrow::Status; @@ -39,13 +41,9 @@ UniqueID UniqueID::from_binary(const std::string& binary) { return id; } -const uint8_t* UniqueID::data() const { - return id_; -} +const uint8_t* UniqueID::data() const { return id_; } -uint8_t* UniqueID::mutable_data() { - return id_; -} +uint8_t* UniqueID::mutable_data() { return id_; } std::string UniqueID::binary() const { return std::string(reinterpret_cast(id_), kUniqueIDSize); @@ -81,3 +79,8 @@ Status plasma_error_status(int plasma_error) { } return Status::OK(); } + +ARROW_EXPORT int ObjectStatusLocal = ObjectStatus_Local; +ARROW_EXPORT int ObjectStatusRemote = ObjectStatus_Remote; + +} // namespace plasma diff --git a/cpp/src/plasma/common.h b/cpp/src/plasma/common.h index 85dc74bf86e0d..2b71da67015cd 100644 --- a/cpp/src/plasma/common.h +++ b/cpp/src/plasma/common.h @@ -29,9 +29,11 @@ #include "arrow/status.h" #include "arrow/util/logging.h" +namespace plasma { + constexpr int64_t kUniqueIDSize = 20; -class UniqueID { +class ARROW_EXPORT UniqueID { public: static UniqueID from_random(); static UniqueID from_binary(const std::string& binary); @@ -60,4 +62,39 @@ typedef UniqueID ObjectID; arrow::Status plasma_error_status(int plasma_error); +/// Size of object hash digests. +constexpr int64_t kDigestSize = sizeof(uint64_t); + +/// Object request data structure. Used for Wait. +struct ObjectRequest { + /// The ID of the requested object. If ID_NIL request any object. + ObjectID object_id; + /// Request associated to the object. 
It can take one of the following values: + /// - PLASMA_QUERY_LOCAL: return if or when the object is available in the + /// local Plasma Store. + /// - PLASMA_QUERY_ANYWHERE: return if or when the object is available in + /// the system (i.e., either in the local or a remote Plasma Store). + int type; + /// Object status. Same as the status returned by plasma_status() function + /// call. This is filled in by plasma_wait_for_objects1(): + /// - ObjectStatus_Local: object is ready at the local Plasma Store. + /// - ObjectStatus_Remote: object is ready at a remote Plasma Store. + /// - ObjectStatus_Nonexistent: object does not exist in the system. + /// - PLASMA_CLIENT_IN_TRANSFER, if the object is currently being scheduled + /// for being transferred or it is transferring. + int status; +}; + +enum ObjectRequestType { + /// Query for object in the local plasma store. + PLASMA_QUERY_LOCAL = 1, + /// Query for object in the local plasma store or in a remote plasma store. + PLASMA_QUERY_ANYWHERE +}; + +extern int ObjectStatusLocal; +extern int ObjectStatusRemote; + +} // namespace plasma + #endif // PLASMA_COMMON_H diff --git a/cpp/src/plasma/events.cc b/cpp/src/plasma/events.cc index a9f7356e1f67e..f98ced2faf180 100644 --- a/cpp/src/plasma/events.cc +++ b/cpp/src/plasma/events.cc @@ -19,8 +19,10 @@ #include -void EventLoop::file_event_callback( - aeEventLoop* loop, int fd, void* context, int events) { +namespace plasma { + +void EventLoop::file_event_callback(aeEventLoop* loop, int fd, void* context, + int events) { FileCallback* callback = reinterpret_cast(context); (*callback)(events); } @@ -32,12 +34,12 @@ int EventLoop::timer_event_callback(aeEventLoop* loop, TimerID timer_id, void* c constexpr int kInitialEventLoopSize = 1024; -EventLoop::EventLoop() { - loop_ = aeCreateEventLoop(kInitialEventLoopSize); -} +EventLoop::EventLoop() { loop_ = aeCreateEventLoop(kInitialEventLoopSize); } bool EventLoop::add_file_event(int fd, int events, const FileCallback& callback) 
{ - if (file_callbacks_.find(fd) != file_callbacks_.end()) { return false; } + if (file_callbacks_.find(fd) != file_callbacks_.end()) { + return false; + } auto data = std::unique_ptr(new FileCallback(callback)); void* context = reinterpret_cast(data.get()); // Try to add the file descriptor. @@ -45,7 +47,9 @@ bool EventLoop::add_file_event(int fd, int events, const FileCallback& callback) // If it cannot be added, increase the size of the event loop. if (err == AE_ERR && errno == ERANGE) { err = aeResizeSetSize(loop_, 3 * aeGetSetSize(loop_) / 2); - if (err != AE_OK) { return false; } + if (err != AE_OK) { + return false; + } err = aeCreateFileEvent(loop_, fd, events, EventLoop::file_event_callback, context); } // In any case, test if there were errors. @@ -61,9 +65,7 @@ void EventLoop::remove_file_event(int fd) { file_callbacks_.erase(fd); } -void EventLoop::run() { - aeMain(loop_); -} +void EventLoop::run() { aeMain(loop_); } int64_t EventLoop::add_timer(int64_t timeout, const TimerCallback& callback) { auto data = std::unique_ptr(new TimerCallback(callback)); @@ -79,3 +81,5 @@ int EventLoop::remove_timer(int64_t timer_id) { timer_callbacks_.erase(timer_id); return err; } + +} // namespace plasma diff --git a/cpp/src/plasma/events.h b/cpp/src/plasma/events.h index bd93d6bb2a6fd..6cb5b73fe94eb 100644 --- a/cpp/src/plasma/events.h +++ b/cpp/src/plasma/events.h @@ -26,6 +26,8 @@ extern "C" { #include "ae/ae.h" } +namespace plasma { + /// Constant specifying that the timer is done and it will be removed. 
constexpr int kEventLoopTimerDone = AE_NOMORE; @@ -96,4 +98,6 @@ class EventLoop { std::unordered_map> timer_callbacks_; }; +} // namespace plasma + #endif // PLASMA_EVENTS diff --git a/cpp/src/plasma/eviction_policy.cc b/cpp/src/plasma/eviction_policy.cc index 4ae6384d42543..6c2309f1709d2 100644 --- a/cpp/src/plasma/eviction_policy.cc +++ b/cpp/src/plasma/eviction_policy.cc @@ -19,6 +19,8 @@ #include +namespace plasma { + void LRUCache::add(const ObjectID& key, int64_t size) { auto it = item_map_.find(key); ARROW_CHECK(it == item_map_.end()); @@ -34,8 +36,8 @@ void LRUCache::remove(const ObjectID& key) { item_map_.erase(it); } -int64_t LRUCache::choose_objects_to_evict( - int64_t num_bytes_required, std::vector* objects_to_evict) { +int64_t LRUCache::choose_objects_to_evict(int64_t num_bytes_required, + std::vector* objects_to_evict) { int64_t bytes_evicted = 0; auto it = item_list_.end(); while (bytes_evicted < num_bytes_required && it != item_list_.begin()) { @@ -49,8 +51,8 @@ int64_t LRUCache::choose_objects_to_evict( EvictionPolicy::EvictionPolicy(PlasmaStoreInfo* store_info) : memory_used_(0), store_info_(store_info) {} -int64_t EvictionPolicy::choose_objects_to_evict( - int64_t num_bytes_required, std::vector* objects_to_evict) { +int64_t EvictionPolicy::choose_objects_to_evict(int64_t num_bytes_required, + std::vector* objects_to_evict) { int64_t bytes_evicted = cache_.choose_objects_to_evict(num_bytes_required, objects_to_evict); /* Update the LRU cache. */ @@ -67,8 +69,8 @@ void EvictionPolicy::object_created(const ObjectID& object_id) { cache_.add(object_id, entry->info.data_size + entry->info.metadata_size); } -bool EvictionPolicy::require_space( - int64_t size, std::vector* objects_to_evict) { +bool EvictionPolicy::require_space(int64_t size, + std::vector* objects_to_evict) { /* Check if there is enough space to create the object. 
*/ int64_t required_space = memory_used_ + size - store_info_->memory_capacity; int64_t num_bytes_evicted; @@ -93,15 +95,17 @@ bool EvictionPolicy::require_space( return num_bytes_evicted >= required_space; } -void EvictionPolicy::begin_object_access( - const ObjectID& object_id, std::vector* objects_to_evict) { +void EvictionPolicy::begin_object_access(const ObjectID& object_id, + std::vector* objects_to_evict) { /* If the object is in the LRU cache, remove it. */ cache_.remove(object_id); } -void EvictionPolicy::end_object_access( - const ObjectID& object_id, std::vector* objects_to_evict) { +void EvictionPolicy::end_object_access(const ObjectID& object_id, + std::vector* objects_to_evict) { auto entry = store_info_->objects[object_id].get(); /* Add the object to the LRU cache.*/ cache_.add(object_id, entry->info.data_size + entry->info.metadata_size); } + +} // namespace plasma diff --git a/cpp/src/plasma/eviction_policy.h b/cpp/src/plasma/eviction_policy.h index 3815fc6652f0c..dd1c873466ec9 100644 --- a/cpp/src/plasma/eviction_policy.h +++ b/cpp/src/plasma/eviction_policy.h @@ -26,6 +26,8 @@ #include "plasma/common.h" #include "plasma/plasma.h" +namespace plasma { + // ==== The eviction policy ==== // // This file contains declaration for all functions and data structures that @@ -40,8 +42,8 @@ class LRUCache { void remove(const ObjectID& key); - int64_t choose_objects_to_evict( - int64_t num_bytes_required, std::vector* objects_to_evict); + int64_t choose_objects_to_evict(int64_t num_bytes_required, + std::vector* objects_to_evict); private: /// A doubly-linked list containing the items in the cache and @@ -93,8 +95,8 @@ class EvictionPolicy { /// @param objects_to_evict The object IDs that were chosen for eviction will /// be stored into this vector. /// @return Void. 
- void begin_object_access( - const ObjectID& object_id, std::vector* objects_to_evict); + void begin_object_access(const ObjectID& object_id, + std::vector* objects_to_evict); /// This method will be called whenever an object in the Plasma store that was /// being used is no longer being used. When this method is called, the @@ -105,8 +107,8 @@ class EvictionPolicy { /// @param objects_to_evict The object IDs that were chosen for eviction will /// be stored into this vector. /// @return Void. - void end_object_access( - const ObjectID& object_id, std::vector* objects_to_evict); + void end_object_access(const ObjectID& object_id, + std::vector* objects_to_evict); /// Choose some objects to evict from the Plasma store. When this method is /// called, the eviction policy will assume that the objects chosen to be @@ -119,8 +121,8 @@ class EvictionPolicy { /// @param objects_to_evict The object IDs that were chosen for eviction will /// be stored into this vector. /// @return The total number of bytes of space chosen to be evicted. - int64_t choose_objects_to_evict( - int64_t num_bytes_required, std::vector* objects_to_evict); + int64_t choose_objects_to_evict(int64_t num_bytes_required, + std::vector* objects_to_evict); private: /// The amount of memory (in bytes) currently being used. @@ -131,4 +133,6 @@ class EvictionPolicy { LRUCache cache_; }; +} // namespace plasma + #endif // PLASMA_EVICTION_POLICY_H diff --git a/cpp/src/plasma/extension.cc b/cpp/src/plasma/extension.cc deleted file mode 100644 index 5d61e337c108d..0000000000000 --- a/cpp/src/plasma/extension.cc +++ /dev/null @@ -1,456 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "plasma/extension.h" - -#include -#include - -#include "plasma/client.h" -#include "plasma/common.h" -#include "plasma/io.h" -#include "plasma/protocol.h" - -PyObject* PlasmaOutOfMemoryError; -PyObject* PlasmaObjectExistsError; - -PyObject* PyPlasma_connect(PyObject* self, PyObject* args) { - const char* store_socket_name; - const char* manager_socket_name; - int release_delay; - if (!PyArg_ParseTuple( - args, "ssi", &store_socket_name, &manager_socket_name, &release_delay)) { - return NULL; - } - PlasmaClient* client = new PlasmaClient(); - ARROW_CHECK_OK(client->Connect(store_socket_name, manager_socket_name, release_delay)); - - return PyCapsule_New(client, "plasma", NULL); -} - -PyObject* PyPlasma_disconnect(PyObject* self, PyObject* args) { - PyObject* client_capsule; - if (!PyArg_ParseTuple(args, "O", &client_capsule)) { return NULL; } - PlasmaClient* client; - ARROW_CHECK(PyObjectToPlasmaClient(client_capsule, &client)); - ARROW_CHECK_OK(client->Disconnect()); - /* We use the context of the connection capsule to indicate if the connection - * is still active (if the context is NULL) or if it is closed (if the context - * is (void*) 0x1). This is neccessary because the primary pointer of the - * capsule cannot be NULL. 
*/ - PyCapsule_SetContext(client_capsule, reinterpret_cast(0x1)); - Py_RETURN_NONE; -} - -PyObject* PyPlasma_create(PyObject* self, PyObject* args) { - PlasmaClient* client; - ObjectID object_id; - Py_ssize_t size; - PyObject* metadata; - if (!PyArg_ParseTuple(args, "O&O&nO", PyObjectToPlasmaClient, &client, - PyStringToUniqueID, &object_id, &size, &metadata)) { - return NULL; - } - if (!PyByteArray_Check(metadata)) { - PyErr_SetString(PyExc_TypeError, "metadata must be a bytearray"); - return NULL; - } - uint8_t* data; - Status s = client->Create(object_id, size, - reinterpret_cast(PyByteArray_AsString(metadata)), - PyByteArray_Size(metadata), &data); - if (s.IsPlasmaObjectExists()) { - PyErr_SetString(PlasmaObjectExistsError, - "An object with this ID already exists in the plasma " - "store."); - return NULL; - } - if (s.IsPlasmaStoreFull()) { - PyErr_SetString(PlasmaOutOfMemoryError, - "The plasma store ran out of memory and could not create " - "this object."); - return NULL; - } - ARROW_CHECK(s.ok()); - -#if PY_MAJOR_VERSION >= 3 - return PyMemoryView_FromMemory(reinterpret_cast(data), size, PyBUF_WRITE); -#else - return PyBuffer_FromReadWriteMemory(reinterpret_cast(data), size); -#endif -} - -PyObject* PyPlasma_hash(PyObject* self, PyObject* args) { - PlasmaClient* client; - ObjectID object_id; - if (!PyArg_ParseTuple(args, "O&O&", PyObjectToPlasmaClient, &client, PyStringToUniqueID, - &object_id)) { - return NULL; - } - unsigned char digest[kDigestSize]; - bool success = plasma_compute_object_hash(client, object_id, digest); - if (success) { - PyObject* digest_string = - PyBytes_FromStringAndSize(reinterpret_cast(digest), kDigestSize); - return digest_string; - } else { - Py_RETURN_NONE; - } -} - -PyObject* PyPlasma_seal(PyObject* self, PyObject* args) { - PlasmaClient* client; - ObjectID object_id; - if (!PyArg_ParseTuple(args, "O&O&", PyObjectToPlasmaClient, &client, PyStringToUniqueID, - &object_id)) { - return NULL; - } - 
ARROW_CHECK_OK(client->Seal(object_id)); - Py_RETURN_NONE; -} - -PyObject* PyPlasma_release(PyObject* self, PyObject* args) { - PlasmaClient* client; - ObjectID object_id; - if (!PyArg_ParseTuple(args, "O&O&", PyObjectToPlasmaClient, &client, PyStringToUniqueID, - &object_id)) { - return NULL; - } - ARROW_CHECK_OK(client->Release(object_id)); - Py_RETURN_NONE; -} - -PyObject* PyPlasma_get(PyObject* self, PyObject* args) { - PlasmaClient* client; - PyObject* object_id_list; - Py_ssize_t timeout_ms; - if (!PyArg_ParseTuple( - args, "O&On", PyObjectToPlasmaClient, &client, &object_id_list, &timeout_ms)) { - return NULL; - } - - Py_ssize_t num_object_ids = PyList_Size(object_id_list); - std::vector object_ids(num_object_ids); - std::vector object_buffers(num_object_ids); - - for (int i = 0; i < num_object_ids; ++i) { - PyStringToUniqueID(PyList_GetItem(object_id_list, i), &object_ids[i]); - } - - Py_BEGIN_ALLOW_THREADS; - ARROW_CHECK_OK( - client->Get(object_ids.data(), num_object_ids, timeout_ms, object_buffers.data())); - Py_END_ALLOW_THREADS; - - PyObject* returns = PyList_New(num_object_ids); - for (int i = 0; i < num_object_ids; ++i) { - if (object_buffers[i].data_size != -1) { - /* The object was retrieved, so return the object. 
*/ - PyObject* t = PyTuple_New(2); - Py_ssize_t data_size = static_cast(object_buffers[i].data_size); - Py_ssize_t metadata_size = static_cast(object_buffers[i].metadata_size); -#if PY_MAJOR_VERSION >= 3 - char* data = reinterpret_cast(object_buffers[i].data); - char* metadata = reinterpret_cast(object_buffers[i].metadata); - PyTuple_SET_ITEM(t, 0, PyMemoryView_FromMemory(data, data_size, PyBUF_READ)); - PyTuple_SET_ITEM( - t, 1, PyMemoryView_FromMemory(metadata, metadata_size, PyBUF_READ)); -#else - void* data = reinterpret_cast(object_buffers[i].data); - void* metadata = reinterpret_cast(object_buffers[i].metadata); - PyTuple_SET_ITEM(t, 0, PyBuffer_FromMemory(data, data_size)); - PyTuple_SET_ITEM(t, 1, PyBuffer_FromMemory(metadata, metadata_size)); -#endif - ARROW_CHECK(PyList_SetItem(returns, i, t) == 0); - } else { - /* The object was not retrieved, so just add None to the list of return - * values. */ - Py_INCREF(Py_None); - ARROW_CHECK(PyList_SetItem(returns, i, Py_None) == 0); - } - } - return returns; -} - -PyObject* PyPlasma_contains(PyObject* self, PyObject* args) { - PlasmaClient* client; - ObjectID object_id; - if (!PyArg_ParseTuple(args, "O&O&", PyObjectToPlasmaClient, &client, PyStringToUniqueID, - &object_id)) { - return NULL; - } - bool has_object; - ARROW_CHECK_OK(client->Contains(object_id, &has_object)); - - if (has_object) { - Py_RETURN_TRUE; - } else { - Py_RETURN_FALSE; - } -} - -PyObject* PyPlasma_fetch(PyObject* self, PyObject* args) { - PlasmaClient* client; - PyObject* object_id_list; - if (!PyArg_ParseTuple(args, "O&O", PyObjectToPlasmaClient, &client, &object_id_list)) { - return NULL; - } - if (client->get_manager_fd() == -1) { - PyErr_SetString(PyExc_RuntimeError, "Not connected to the plasma manager"); - return NULL; - } - Py_ssize_t n = PyList_Size(object_id_list); - ObjectID* object_ids = new ObjectID[n]; - for (int i = 0; i < n; ++i) { - PyStringToUniqueID(PyList_GetItem(object_id_list, i), &object_ids[i]); - } - 
ARROW_CHECK_OK(client->Fetch(static_cast(n), object_ids)); - delete[] object_ids; - Py_RETURN_NONE; -} - -PyObject* PyPlasma_wait(PyObject* self, PyObject* args) { - PlasmaClient* client; - PyObject* object_id_list; - Py_ssize_t timeout; - int num_returns; - if (!PyArg_ParseTuple(args, "O&Oni", PyObjectToPlasmaClient, &client, &object_id_list, - &timeout, &num_returns)) { - return NULL; - } - Py_ssize_t n = PyList_Size(object_id_list); - - if (client->get_manager_fd() == -1) { - PyErr_SetString(PyExc_RuntimeError, "Not connected to the plasma manager"); - return NULL; - } - if (num_returns < 0) { - PyErr_SetString( - PyExc_RuntimeError, "The argument num_returns cannot be less than zero."); - return NULL; - } - if (num_returns > n) { - PyErr_SetString(PyExc_RuntimeError, - "The argument num_returns cannot be greater than len(object_ids)"); - return NULL; - } - int64_t threshold = 1 << 30; - if (timeout > threshold) { - PyErr_SetString( - PyExc_RuntimeError, "The argument timeout cannot be greater than 2 ** 30."); - return NULL; - } - - std::vector object_requests(n); - for (int i = 0; i < n; ++i) { - ARROW_CHECK(PyStringToUniqueID(PyList_GetItem(object_id_list, i), - &object_requests[i].object_id) == 1); - object_requests[i].type = PLASMA_QUERY_ANYWHERE; - } - /* Drop the global interpreter lock while we are waiting, so other threads can - * run. 
*/ - int num_return_objects; - Py_BEGIN_ALLOW_THREADS; - ARROW_CHECK_OK( - client->Wait(n, object_requests.data(), num_returns, timeout, &num_return_objects)); - Py_END_ALLOW_THREADS; - - int num_to_return = std::min(num_return_objects, num_returns); - PyObject* ready_ids = PyList_New(num_to_return); - PyObject* waiting_ids = PySet_New(object_id_list); - int num_returned = 0; - for (int i = 0; i < n; ++i) { - if (num_returned == num_to_return) { break; } - if (object_requests[i].status == ObjectStatus_Local || - object_requests[i].status == ObjectStatus_Remote) { - PyObject* ready = PyBytes_FromStringAndSize( - reinterpret_cast(&object_requests[i].object_id), - sizeof(object_requests[i].object_id)); - PyList_SetItem(ready_ids, num_returned, ready); - PySet_Discard(waiting_ids, ready); - num_returned += 1; - } else { - ARROW_CHECK(object_requests[i].status == ObjectStatus_Nonexistent); - } - } - ARROW_CHECK(num_returned == num_to_return); - /* Return both the ready IDs and the remaining IDs. 
*/ - PyObject* t = PyTuple_New(2); - PyTuple_SetItem(t, 0, ready_ids); - PyTuple_SetItem(t, 1, waiting_ids); - return t; -} - -PyObject* PyPlasma_evict(PyObject* self, PyObject* args) { - PlasmaClient* client; - Py_ssize_t num_bytes; - if (!PyArg_ParseTuple(args, "O&n", PyObjectToPlasmaClient, &client, &num_bytes)) { - return NULL; - } - int64_t evicted_bytes; - ARROW_CHECK_OK(client->Evict(static_cast(num_bytes), evicted_bytes)); - return PyLong_FromSsize_t(static_cast(evicted_bytes)); -} - -PyObject* PyPlasma_delete(PyObject* self, PyObject* args) { - PlasmaClient* client; - ObjectID object_id; - if (!PyArg_ParseTuple(args, "O&O&", PyObjectToPlasmaClient, &client, PyStringToUniqueID, - &object_id)) { - return NULL; - } - ARROW_CHECK_OK(client->Delete(object_id)); - Py_RETURN_NONE; -} - -PyObject* PyPlasma_transfer(PyObject* self, PyObject* args) { - PlasmaClient* client; - ObjectID object_id; - const char* addr; - int port; - if (!PyArg_ParseTuple(args, "O&O&si", PyObjectToPlasmaClient, &client, - PyStringToUniqueID, &object_id, &addr, &port)) { - return NULL; - } - - if (client->get_manager_fd() == -1) { - PyErr_SetString(PyExc_RuntimeError, "Not connected to the plasma manager"); - return NULL; - } - - ARROW_CHECK_OK(client->Transfer(addr, port, object_id)); - Py_RETURN_NONE; -} - -PyObject* PyPlasma_subscribe(PyObject* self, PyObject* args) { - PlasmaClient* client; - if (!PyArg_ParseTuple(args, "O&", PyObjectToPlasmaClient, &client)) { return NULL; } - - int sock; - ARROW_CHECK_OK(client->Subscribe(&sock)); - return PyLong_FromLong(sock); -} - -PyObject* PyPlasma_receive_notification(PyObject* self, PyObject* args) { - int plasma_sock; - - if (!PyArg_ParseTuple(args, "i", &plasma_sock)) { return NULL; } - /* Receive object notification from the plasma connection socket. If the - * object was added, return a tuple of its fields: ObjectID, data_size, - * metadata_size. If the object was deleted, data_size and metadata_size will - * be set to -1. 
*/ - uint8_t* notification = read_message_async(plasma_sock); - if (notification == NULL) { - PyErr_SetString( - PyExc_RuntimeError, "Failed to read object notification from Plasma socket"); - return NULL; - } - auto object_info = flatbuffers::GetRoot(notification); - /* Construct a tuple from object_info and return. */ - PyObject* t = PyTuple_New(3); - PyTuple_SetItem(t, 0, PyBytes_FromStringAndSize(object_info->object_id()->data(), - object_info->object_id()->size())); - if (object_info->is_deletion()) { - PyTuple_SetItem(t, 1, PyLong_FromLong(-1)); - PyTuple_SetItem(t, 2, PyLong_FromLong(-1)); - } else { - PyTuple_SetItem(t, 1, PyLong_FromLong(object_info->data_size())); - PyTuple_SetItem(t, 2, PyLong_FromLong(object_info->metadata_size())); - } - - delete[] notification; - return t; -} - -static PyMethodDef plasma_methods[] = { - {"connect", PyPlasma_connect, METH_VARARGS, "Connect to plasma."}, - {"disconnect", PyPlasma_disconnect, METH_VARARGS, "Disconnect from plasma."}, - {"create", PyPlasma_create, METH_VARARGS, "Create a new plasma object."}, - {"hash", PyPlasma_hash, METH_VARARGS, "Compute the hash of a plasma object."}, - {"seal", PyPlasma_seal, METH_VARARGS, "Seal a plasma object."}, - {"get", PyPlasma_get, METH_VARARGS, "Get a plasma object."}, - {"contains", PyPlasma_contains, METH_VARARGS, - "Does the plasma store contain this plasma object?"}, - {"fetch", PyPlasma_fetch, METH_VARARGS, - "Fetch the object from another plasma manager instance."}, - {"wait", PyPlasma_wait, METH_VARARGS, - "Wait until num_returns objects in object_ids are ready."}, - {"evict", PyPlasma_evict, METH_VARARGS, - "Evict some objects until we recover some number of bytes."}, - {"release", PyPlasma_release, METH_VARARGS, "Release the plasma object."}, - {"delete", PyPlasma_delete, METH_VARARGS, "Delete a plasma object."}, - {"transfer", PyPlasma_transfer, METH_VARARGS, - "Transfer object to another plasma manager."}, - {"subscribe", PyPlasma_subscribe, METH_VARARGS, - 
"Subscribe to the plasma notification socket."}, - {"receive_notification", PyPlasma_receive_notification, METH_VARARGS, - "Receive next notification from plasma notification socket."}, - {NULL} /* Sentinel */ -}; - -#if PY_MAJOR_VERSION >= 3 -static struct PyModuleDef moduledef = { - PyModuleDef_HEAD_INIT, "libplasma", /* m_name */ - "A Python client library for plasma.", /* m_doc */ - 0, /* m_size */ - plasma_methods, /* m_methods */ - NULL, /* m_reload */ - NULL, /* m_traverse */ - NULL, /* m_clear */ - NULL, /* m_free */ -}; -#endif - -#if PY_MAJOR_VERSION >= 3 -#define INITERROR return NULL -#else -#define INITERROR return -#endif - -#ifndef PyMODINIT_FUNC /* declarations for DLL import/export */ -#define PyMODINIT_FUNC void -#endif - -#if PY_MAJOR_VERSION >= 3 -#define MOD_INIT(name) PyMODINIT_FUNC PyInit_##name(void) -#else -#define MOD_INIT(name) PyMODINIT_FUNC init##name(void) -#endif - -MOD_INIT(libplasma) { -#if PY_MAJOR_VERSION >= 3 - PyObject* m = PyModule_Create(&moduledef); -#else - PyObject* m = - Py_InitModule3("libplasma", plasma_methods, "A Python client library for plasma."); -#endif - - /* Create a custom exception for when an object ID is reused. */ - char plasma_object_exists_error[] = "plasma_object_exists.error"; - PlasmaObjectExistsError = PyErr_NewException(plasma_object_exists_error, NULL, NULL); - Py_INCREF(PlasmaObjectExistsError); - PyModule_AddObject(m, "plasma_object_exists_error", PlasmaObjectExistsError); - /* Create a custom exception for when the plasma store is out of memory. 
*/ - char plasma_out_of_memory_error[] = "plasma_out_of_memory.error"; - PlasmaOutOfMemoryError = PyErr_NewException(plasma_out_of_memory_error, NULL, NULL); - Py_INCREF(PlasmaOutOfMemoryError); - PyModule_AddObject(m, "plasma_out_of_memory_error", PlasmaOutOfMemoryError); - -#if PY_MAJOR_VERSION >= 3 - return m; -#endif -} diff --git a/cpp/src/plasma/extension.h b/cpp/src/plasma/extension.h deleted file mode 100644 index cee30abb3592d..0000000000000 --- a/cpp/src/plasma/extension.h +++ /dev/null @@ -1,50 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#ifndef PLASMA_EXTENSION_H -#define PLASMA_EXTENSION_H - -#undef _XOPEN_SOURCE -#undef _POSIX_C_SOURCE -#include - -#include "bytesobject.h" // NOLINT - -#include "plasma/client.h" -#include "plasma/common.h" - -static int PyObjectToPlasmaClient(PyObject* object, PlasmaClient** client) { - if (PyCapsule_IsValid(object, "plasma")) { - *client = reinterpret_cast(PyCapsule_GetPointer(object, "plasma")); - return 1; - } else { - PyErr_SetString(PyExc_TypeError, "must be a 'plasma' capsule"); - return 0; - } -} - -int PyStringToUniqueID(PyObject* object, ObjectID* object_id) { - if (PyBytes_Check(object)) { - memcpy(object_id, PyBytes_AsString(object), sizeof(ObjectID)); - return 1; - } else { - PyErr_SetString(PyExc_TypeError, "must be a 20 character string"); - return 0; - } -} - -#endif // PLASMA_EXTENSION_H diff --git a/cpp/src/plasma/io.cc b/cpp/src/plasma/io.cc index 5875ebb7ae611..e3b6b617fbc06 100644 --- a/cpp/src/plasma/io.cc +++ b/cpp/src/plasma/io.cc @@ -38,7 +38,9 @@ Status WriteBytes(int fd, uint8_t* cursor, size_t length) { * advance the cursor, and decrease the amount left to write. 
*/ nbytes = write(fd, cursor + offset, bytesleft); if (nbytes < 0) { - if (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR) { continue; } + if (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR) { + continue; + } return Status::IOError(std::string(strerror(errno))); } else if (nbytes == 0) { return Status::IOError("Encountered unexpected EOF"); @@ -67,7 +69,9 @@ Status ReadBytes(int fd, uint8_t* cursor, size_t length) { while (bytesleft > 0) { nbytes = read(fd, cursor + offset, bytesleft); if (nbytes < 0) { - if (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR) { continue; } + if (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR) { + continue; + } return Status::IOError(std::string(strerror(errno))); } else if (0 == nbytes) { return Status::IOError("Encountered unexpected EOF"); @@ -83,14 +87,16 @@ Status ReadBytes(int fd, uint8_t* cursor, size_t length) { Status ReadMessage(int fd, int64_t* type, std::vector* buffer) { int64_t version; RETURN_NOT_OK_ELSE(ReadBytes(fd, reinterpret_cast(&version), sizeof(version)), - *type = DISCONNECT_CLIENT); + *type = DISCONNECT_CLIENT); ARROW_CHECK(version == PLASMA_PROTOCOL_VERSION) << "version = " << version; size_t length; RETURN_NOT_OK_ELSE(ReadBytes(fd, reinterpret_cast(type), sizeof(*type)), - *type = DISCONNECT_CLIENT); + *type = DISCONNECT_CLIENT); RETURN_NOT_OK_ELSE(ReadBytes(fd, reinterpret_cast(&length), sizeof(length)), - *type = DISCONNECT_CLIENT); - if (length > buffer->size()) { buffer->resize(length); } + *type = DISCONNECT_CLIENT); + if (length > buffer->size()) { + buffer->resize(length); + } RETURN_NOT_OK_ELSE(ReadBytes(fd, buffer->data(), length), *type = DISCONNECT_CLIENT); return Status::OK(); } @@ -105,7 +111,7 @@ int bind_ipc_sock(const std::string& pathname, bool shall_listen) { /* Tell the system to allow the port to be reused. 
*/ int on = 1; if (setsockopt(socket_fd, SOL_SOCKET, SO_REUSEADDR, reinterpret_cast(&on), - sizeof(on)) < 0) { + sizeof(on)) < 0) { ARROW_LOG(ERROR) << "setsockopt failed for pathname " << pathname; close(socket_fd); return -1; @@ -134,16 +140,22 @@ int bind_ipc_sock(const std::string& pathname, bool shall_listen) { return socket_fd; } -int connect_ipc_sock_retry( - const std::string& pathname, int num_retries, int64_t timeout) { +int connect_ipc_sock_retry(const std::string& pathname, int num_retries, + int64_t timeout) { /* Pick the default values if the user did not specify. */ - if (num_retries < 0) { num_retries = NUM_CONNECT_ATTEMPTS; } - if (timeout < 0) { timeout = CONNECT_TIMEOUT_MS; } + if (num_retries < 0) { + num_retries = NUM_CONNECT_ATTEMPTS; + } + if (timeout < 0) { + timeout = CONNECT_TIMEOUT_MS; + } int fd = -1; for (int num_attempts = 0; num_attempts < num_retries; ++num_attempts) { fd = connect_ipc_sock(pathname); - if (fd >= 0) { break; } + if (fd >= 0) { + break; + } if (num_attempts == 0) { ARROW_LOG(ERROR) << "Connection to socket failed for pathname " << pathname; } @@ -151,7 +163,9 @@ int connect_ipc_sock_retry( usleep(static_cast(timeout * 1000)); } /* If we could not connect to the socket, exit. 
*/ - if (fd == -1) { ARROW_LOG(FATAL) << "Could not connect to socket " << pathname; } + if (fd == -1) { + ARROW_LOG(FATAL) << "Could not connect to socket " << pathname; + } return fd; } diff --git a/cpp/src/plasma/malloc.cc b/cpp/src/plasma/malloc.cc index 97c9a16c0c0bd..77a8afea75424 100644 --- a/cpp/src/plasma/malloc.cc +++ b/cpp/src/plasma/malloc.cc @@ -69,13 +69,9 @@ std::unordered_map mmap_records; constexpr int GRANULARITY_MULTIPLIER = 2; -static void* pointer_advance(void* p, ptrdiff_t n) { - return (unsigned char*)p + n; -} +static void* pointer_advance(void* p, ptrdiff_t n) { return (unsigned char*)p + n; } -static void* pointer_retreat(void* p, ptrdiff_t n) { - return (unsigned char*)p - n; -} +static void* pointer_retreat(void* p, ptrdiff_t n) { return (unsigned char*)p - n; } static ptrdiff_t pointer_distance(void const* pfrom, void const* pto) { return (unsigned char const*)pto - (unsigned char const*)pfrom; @@ -87,8 +83,8 @@ int create_buffer(int64_t size) { int fd; #ifdef _WIN32 if (!CreateFileMapping(INVALID_HANDLE_VALUE, NULL, PAGE_READWRITE, - (DWORD)((uint64_t)size >> (CHAR_BIT * sizeof(DWORD))), (DWORD)(uint64_t)size, - NULL)) { + (DWORD)((uint64_t)size >> (CHAR_BIT * sizeof(DWORD))), + (DWORD)(uint64_t)size, NULL)) { fd = -1; } #else @@ -127,7 +123,9 @@ void* fake_mmap(size_t size) { int fd = create_buffer(size); ARROW_CHECK(fd >= 0) << "Failed to create buffer during mmap"; void* pointer = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); - if (pointer == MAP_FAILED) { return pointer; } + if (pointer == MAP_FAILED) { + return pointer; + } /* Increase dlmalloc's allocation granularity directly. 
*/ mparams.granularity *= GRANULARITY_MULTIPLIER; @@ -156,7 +154,9 @@ int fake_munmap(void* addr, int64_t size) { } int r = munmap(addr, size); - if (r == 0) { close(entry->second.fd); } + if (r == 0) { + close(entry->second.fd); + } mmap_records.erase(entry); return r; diff --git a/cpp/src/plasma/plasma.cc b/cpp/src/plasma/plasma.cc index 559d8e7f2a65e..87082817f12e9 100644 --- a/cpp/src/plasma/plasma.cc +++ b/cpp/src/plasma/plasma.cc @@ -24,8 +24,12 @@ #include "plasma/common.h" #include "plasma/protocol.h" +namespace plasma { + int warn_if_sigpipe(int status, int client_sock) { - if (status >= 0) { return 0; } + if (status >= 0) { + return 0; + } if (errno == EPIPE || errno == EBADF || errno == ECONNRESET) { ARROW_LOG(WARNING) << "Received SIGPIPE, BAD FILE DESCRIPTOR, or ECONNRESET when " "sending a message to client on fd " @@ -56,9 +60,13 @@ uint8_t* create_object_info_buffer(ObjectInfoT* object_info) { return notification; } -ObjectTableEntry* get_object_table_entry( - PlasmaStoreInfo* store_info, const ObjectID& object_id) { +ObjectTableEntry* get_object_table_entry(PlasmaStoreInfo* store_info, + const ObjectID& object_id) { auto it = store_info->objects.find(object_id); - if (it == store_info->objects.end()) { return NULL; } + if (it == store_info->objects.end()) { + return NULL; + } return it->second.get(); } + +} // namespace plasma diff --git a/cpp/src/plasma/plasma.h b/cpp/src/plasma/plasma.h index 275d0c7a41687..d60e5a8363035 100644 --- a/cpp/src/plasma/plasma.h +++ b/cpp/src/plasma/plasma.h @@ -32,8 +32,10 @@ #include "arrow/status.h" #include "arrow/util/logging.h" -#include "format/common_generated.h" #include "plasma/common.h" +#include "plasma/common_generated.h" + +namespace plasma { #define HANDLE_SIGPIPE(s, fd_) \ do { \ @@ -54,47 +56,23 @@ /// Allocation granularity used in plasma for object allocation. #define BLOCK_SIZE 64 -/// Size of object hash digests. 
-constexpr int64_t kDigestSize = sizeof(uint64_t); - struct Client; -/// Object request data structure. Used in the plasma_wait_for_objects() -/// argument. -typedef struct { - /// The ID of the requested object. If ID_NIL request any object. - ObjectID object_id; - /// Request associated to the object. It can take one of the following values: - /// - PLASMA_QUERY_LOCAL: return if or when the object is available in the - /// local Plasma Store. - /// - PLASMA_QUERY_ANYWHERE: return if or when the object is available in - /// the system (i.e., either in the local or a remote Plasma Store). - int type; - /// Object status. Same as the status returned by plasma_status() function - /// call. This is filled in by plasma_wait_for_objects1(): - /// - ObjectStatus_Local: object is ready at the local Plasma Store. - /// - ObjectStatus_Remote: object is ready at a remote Plasma Store. - /// - ObjectStatus_Nonexistent: object does not exist in the system. - /// - PLASMA_CLIENT_IN_TRANSFER, if the object is currently being scheduled - /// for being transferred or it is transferring. - int status; -} ObjectRequest; - /// Mapping from object IDs to type and status of the request. typedef std::unordered_map ObjectRequestMap; /// Handle to access memory mapped file and map it into client address space. -typedef struct { +struct object_handle { /// The file descriptor of the memory mapped file in the store. It is used as /// a unique identifier of the file in the client to look up the corresponding /// file descriptor on the client's side. int store_fd; /// The size in bytes of the memory mapped file. int64_t mmap_size; -} object_handle; +}; // TODO(pcm): Replace this by the flatbuffers message PlasmaObjectSpec. -typedef struct { +struct PlasmaObject { /// Handle for memory mapped file the object is stored in. object_handle handle; /// The offset in bytes in the memory mapped file of the data. 
@@ -105,28 +83,21 @@ typedef struct { int64_t data_size; /// The size in bytes of the metadata. int64_t metadata_size; -} PlasmaObject; +}; -typedef enum { +enum object_state { /// Object was created but not sealed in the local Plasma Store. PLASMA_CREATED = 1, /// Object is sealed and stored in the local Plasma Store. PLASMA_SEALED -} object_state; +}; -typedef enum { +enum object_status { /// The object was not found. OBJECT_NOT_FOUND = 0, /// The object was found. OBJECT_FOUND = 1 -} object_status; - -typedef enum { - /// Query for object in the local plasma store. - PLASMA_QUERY_LOCAL = 1, - /// Query for object in the local plasma store or in a remote plasma store. - PLASMA_QUERY_ANYWHERE -} object_request_type; +}; /// This type is used by the Plasma store. It is here because it is exposed to /// the eviction policy. @@ -167,8 +138,8 @@ struct PlasmaStoreInfo { /// @param object_id The object_id of the entry we are looking for. /// @return The entry associated with the object_id or NULL if the object_id /// is not present. -ObjectTableEntry* get_object_table_entry( - PlasmaStoreInfo* store_info, const ObjectID& object_id); +ObjectTableEntry* get_object_table_entry(PlasmaStoreInfo* store_info, + const ObjectID& object_id); /// Print a warning if the status is less than zero. This should be used to check /// the success of messages sent to plasma clients. We print a warning instead of @@ -188,4 +159,6 @@ int warn_if_sigpipe(int status, int client_sock); uint8_t* create_object_info_buffer(ObjectInfoT* object_info); +} // namespace plasma + #endif // PLASMA_PLASMA_H diff --git a/cpp/src/plasma/plasma.pc.in b/cpp/src/plasma/plasma.pc.in new file mode 100644 index 0000000000000..d86868939f363 --- /dev/null +++ b/cpp/src/plasma/plasma.pc.in @@ -0,0 +1,30 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +prefix=@CMAKE_INSTALL_PREFIX@ +libdir=${prefix}/@CMAKE_INSTALL_LIBDIR@ +includedir=${prefix}/include + +so_version=@PLASMA_SO_VERSION@ +abi_version=@PLASMA_ABI_VERSION@ +executable=${prefix}/@CMAKE_INSTALL_BINDIR@/plasma_store + +Name: Plasma +Description: Plasma is an in-memory object store and cache for big data. 
+Version: @PLASMA_VERSION@ +Libs: -L${libdir} -lplasma +Cflags: -I${includedir} diff --git a/cpp/src/plasma/protocol.cc b/cpp/src/plasma/protocol.cc index c51475dea2981..91a279d5d14ce 100644 --- a/cpp/src/plasma/protocol.cc +++ b/cpp/src/plasma/protocol.cc @@ -18,16 +18,18 @@ #include "plasma/protocol.h" #include "flatbuffers/flatbuffers.h" -#include "format/plasma_generated.h" +#include "plasma/plasma_generated.h" #include "plasma/common.h" #include "plasma/io.h" +namespace plasma { + using flatbuffers::uoffset_t; flatbuffers::Offset>> to_flatbuffer(flatbuffers::FlatBufferBuilder* fbb, const ObjectID* object_ids, - int64_t num_objects) { + int64_t num_objects) { std::vector> results; for (int64_t i = 0; i < num_objects; i++) { results.push_back(fbb->CreateString(object_ids[i].binary())); @@ -45,23 +47,23 @@ Status PlasmaReceive(int sock, int64_t message_type, std::vector* buffe template Status PlasmaSend(int sock, int64_t message_type, flatbuffers::FlatBufferBuilder* fbb, - const Message& message) { + const Message& message) { fbb->Finish(message); return WriteMessage(sock, message_type, fbb->GetSize(), fbb->GetBufferPointer()); } // Create messages. 
-Status SendCreateRequest( - int sock, ObjectID object_id, int64_t data_size, int64_t metadata_size) { +Status SendCreateRequest(int sock, ObjectID object_id, int64_t data_size, + int64_t metadata_size) { flatbuffers::FlatBufferBuilder fbb; - auto message = CreatePlasmaCreateRequest( - fbb, fbb.CreateString(object_id.binary()), data_size, metadata_size); + auto message = CreatePlasmaCreateRequest(fbb, fbb.CreateString(object_id.binary()), + data_size, metadata_size); return PlasmaSend(sock, MessageType_PlasmaCreateRequest, &fbb, message); } -Status ReadCreateRequest( - uint8_t* data, size_t size, ObjectID* object_id, int64_t* data_size, int64_t* metadata_size) { +Status ReadCreateRequest(uint8_t* data, size_t size, ObjectID* object_id, + int64_t* data_size, int64_t* metadata_size) { DCHECK(data); auto message = flatbuffers::GetRoot(data); DCHECK(verify_flatbuffer(message, data, size)); @@ -71,14 +73,14 @@ Status ReadCreateRequest( return Status::OK(); } -Status SendCreateReply( - int sock, ObjectID object_id, PlasmaObject* object, int error_code) { +Status SendCreateReply(int sock, ObjectID object_id, PlasmaObject* object, + int error_code) { flatbuffers::FlatBufferBuilder fbb; PlasmaObjectSpec plasma_object(object->handle.store_fd, object->handle.mmap_size, - object->data_offset, object->data_size, object->metadata_offset, - object->metadata_size); - auto message = CreatePlasmaCreateReply( - fbb, fbb.CreateString(object_id.binary()), &plasma_object, (PlasmaError)error_code); + object->data_offset, object->data_size, + object->metadata_offset, object->metadata_size); + auto message = CreatePlasmaCreateReply(fbb, fbb.CreateString(object_id.binary()), + &plasma_object, (PlasmaError)error_code); return PlasmaSend(sock, MessageType_PlasmaCreateReply, &fbb, message); } @@ -118,8 +120,8 @@ Status ReadSealRequest(uint8_t* data, size_t size, ObjectID* object_id, unsigned Status SendSealReply(int sock, ObjectID object_id, int error) { flatbuffers::FlatBufferBuilder fbb; - 
auto message = CreatePlasmaSealReply( - fbb, fbb.CreateString(object_id.binary()), (PlasmaError)error); + auto message = CreatePlasmaSealReply(fbb, fbb.CreateString(object_id.binary()), + (PlasmaError)error); return PlasmaSend(sock, MessageType_PlasmaSealReply, &fbb, message); } @@ -149,8 +151,8 @@ Status ReadReleaseRequest(uint8_t* data, size_t size, ObjectID* object_id) { Status SendReleaseReply(int sock, ObjectID object_id, int error) { flatbuffers::FlatBufferBuilder fbb; - auto message = CreatePlasmaReleaseReply( - fbb, fbb.CreateString(object_id.binary()), (PlasmaError)error); + auto message = CreatePlasmaReleaseReply(fbb, fbb.CreateString(object_id.binary()), + (PlasmaError)error); return PlasmaSend(sock, MessageType_PlasmaReleaseReply, &fbb, message); } @@ -180,8 +182,8 @@ Status ReadDeleteRequest(uint8_t* data, size_t size, ObjectID* object_id) { Status SendDeleteReply(int sock, ObjectID object_id, int error) { flatbuffers::FlatBufferBuilder fbb; - auto message = CreatePlasmaDeleteReply( - fbb, fbb.CreateString(object_id.binary()), (PlasmaError)error); + auto message = CreatePlasmaDeleteReply(fbb, fbb.CreateString(object_id.binary()), + (PlasmaError)error); return PlasmaSend(sock, MessageType_PlasmaDeleteReply, &fbb, message); } @@ -212,12 +214,12 @@ Status ReadStatusRequest(uint8_t* data, size_t size, ObjectID object_ids[], int6 return Status::OK(); } -Status SendStatusReply( - int sock, ObjectID object_ids[], int object_status[], int64_t num_objects) { +Status SendStatusReply(int sock, ObjectID object_ids[], int object_status[], + int64_t num_objects) { flatbuffers::FlatBufferBuilder fbb; auto message = CreatePlasmaStatusReply(fbb, to_flatbuffer(&fbb, object_ids, num_objects), - fbb.CreateVector(object_status, num_objects)); + fbb.CreateVector(object_status, num_objects)); return PlasmaSend(sock, MessageType_PlasmaStatusReply, &fbb, message); } @@ -228,8 +230,8 @@ int64_t ReadStatusReply_num_objects(uint8_t* data, size_t size) { return 
message->object_ids()->size(); } -Status ReadStatusReply( - uint8_t* data, size_t size, ObjectID object_ids[], int object_status[], int64_t num_objects) { +Status ReadStatusReply(uint8_t* data, size_t size, ObjectID object_ids[], + int object_status[], int64_t num_objects) { DCHECK(data); auto message = flatbuffers::GetRoot(data); DCHECK(verify_flatbuffer(message, data, size)); @@ -282,9 +284,7 @@ Status SendConnectRequest(int sock) { return PlasmaSend(sock, MessageType_PlasmaConnectRequest, &fbb, message); } -Status ReadConnectRequest(uint8_t* data) { - return Status::OK(); -} +Status ReadConnectRequest(uint8_t* data) { return Status::OK(); } Status SendConnectReply(int sock, int64_t memory_capacity) { flatbuffers::FlatBufferBuilder fbb; @@ -332,16 +332,16 @@ Status ReadEvictReply(uint8_t* data, size_t size, int64_t& num_bytes) { // Get messages. -Status SendGetRequest( - int sock, const ObjectID* object_ids, int64_t num_objects, int64_t timeout_ms) { +Status SendGetRequest(int sock, const ObjectID* object_ids, int64_t num_objects, + int64_t timeout_ms) { flatbuffers::FlatBufferBuilder fbb; - auto message = CreatePlasmaGetRequest( - fbb, to_flatbuffer(&fbb, object_ids, num_objects), timeout_ms); + auto message = CreatePlasmaGetRequest(fbb, to_flatbuffer(&fbb, object_ids, num_objects), + timeout_ms); return PlasmaSend(sock, MessageType_PlasmaGetRequest, &fbb, message); } -Status ReadGetRequest( - uint8_t* data, size_t size, std::vector& object_ids, int64_t* timeout_ms) { +Status ReadGetRequest(uint8_t* data, size_t size, + std::vector& object_ids, int64_t* timeout_ms) { DCHECK(data); auto message = flatbuffers::GetRoot(data); DCHECK(verify_flatbuffer(message, data, size)); @@ -353,7 +353,8 @@ Status ReadGetRequest( return Status::OK(); } -Status SendGetReply(int sock, ObjectID object_ids[], +Status SendGetReply( + int sock, ObjectID object_ids[], std::unordered_map& plasma_objects, int64_t num_objects) { flatbuffers::FlatBufferBuilder fbb; @@ -362,16 +363,17 @@ 
Status SendGetReply(int sock, ObjectID object_ids[], for (int i = 0; i < num_objects; ++i) { const PlasmaObject& object = plasma_objects[object_ids[i]]; objects.push_back(PlasmaObjectSpec(object.handle.store_fd, object.handle.mmap_size, - object.data_offset, object.data_size, object.metadata_offset, - object.metadata_size)); + object.data_offset, object.data_size, + object.metadata_offset, object.metadata_size)); } - auto message = CreatePlasmaGetReply(fbb, to_flatbuffer(&fbb, object_ids, num_objects), - fbb.CreateVectorOfStructs(objects.data(), num_objects)); + auto message = + CreatePlasmaGetReply(fbb, to_flatbuffer(&fbb, object_ids, num_objects), + fbb.CreateVectorOfStructs(objects.data(), num_objects)); return PlasmaSend(sock, MessageType_PlasmaGetReply, &fbb, message); } -Status ReadGetReply(uint8_t* data, size_t size, ObjectID object_ids[], PlasmaObject plasma_objects[], - int64_t num_objects) { +Status ReadGetReply(uint8_t* data, size_t size, ObjectID object_ids[], + PlasmaObject plasma_objects[], int64_t num_objects) { DCHECK(data); auto message = flatbuffers::GetRoot(data); DCHECK(verify_flatbuffer(message, data, size)); @@ -412,23 +414,24 @@ Status ReadFetchRequest(uint8_t* data, size_t size, std::vector& objec // Wait messages. 
Status SendWaitRequest(int sock, ObjectRequest object_requests[], int64_t num_requests, - int num_ready_objects, int64_t timeout_ms) { + int num_ready_objects, int64_t timeout_ms) { flatbuffers::FlatBufferBuilder fbb; std::vector> object_request_specs; for (int i = 0; i < num_requests; i++) { - object_request_specs.push_back(CreateObjectRequestSpec(fbb, - fbb.CreateString(object_requests[i].object_id.binary()), + object_request_specs.push_back(CreateObjectRequestSpec( + fbb, fbb.CreateString(object_requests[i].object_id.binary()), object_requests[i].type)); } - auto message = CreatePlasmaWaitRequest( - fbb, fbb.CreateVector(object_request_specs), num_ready_objects, timeout_ms); + auto message = CreatePlasmaWaitRequest(fbb, fbb.CreateVector(object_request_specs), + num_ready_objects, timeout_ms); return PlasmaSend(sock, MessageType_PlasmaWaitRequest, &fbb, message); } -Status ReadWaitRequest(uint8_t* data, size_t size, ObjectRequestMap& object_requests, - int64_t* timeout_ms, int* num_ready_objects) { +Status ReadWaitRequest(uint8_t* data, size_t size, + ObjectRequestMap& object_requests, int64_t* timeout_ms, + int* num_ready_objects) { DCHECK(data); auto message = flatbuffers::GetRoot(data); DCHECK(verify_flatbuffer(message, data, size)); @@ -439,14 +442,14 @@ Status ReadWaitRequest(uint8_t* data, size_t size, ObjectRequestMap& object_requ ObjectID object_id = ObjectID::from_binary(message->object_requests()->Get(i)->object_id()->str()); ObjectRequest object_request({object_id, message->object_requests()->Get(i)->type(), - ObjectStatus_Nonexistent}); + ObjectStatus_Nonexistent}); object_requests[object_id] = object_request; } return Status::OK(); } -Status SendWaitReply( - int sock, const ObjectRequestMap& object_requests, int num_ready_objects) { +Status SendWaitReply(int sock, const ObjectRequestMap& object_requests, + int num_ready_objects) { flatbuffers::FlatBufferBuilder fbb; std::vector> object_replies; @@ -461,8 +464,8 @@ Status SendWaitReply( return 
PlasmaSend(sock, MessageType_PlasmaWaitReply, &fbb, message); } -Status ReadWaitReply( - uint8_t* data, size_t size, ObjectRequest object_requests[], int* num_ready_objects) { +Status ReadWaitReply(uint8_t* data, size_t size, + ObjectRequest object_requests[], int* num_ready_objects) { DCHECK(data); auto message = flatbuffers::GetRoot(data); @@ -505,16 +508,16 @@ Status ReadDataRequest(uint8_t* data, size_t size, ObjectID* object_id, char** a return Status::OK(); } -Status SendDataReply( - int sock, ObjectID object_id, int64_t object_size, int64_t metadata_size) { +Status SendDataReply(int sock, ObjectID object_id, int64_t object_size, + int64_t metadata_size) { flatbuffers::FlatBufferBuilder fbb; - auto message = CreatePlasmaDataReply( - fbb, fbb.CreateString(object_id.binary()), object_size, metadata_size); + auto message = CreatePlasmaDataReply(fbb, fbb.CreateString(object_id.binary()), + object_size, metadata_size); return PlasmaSend(sock, MessageType_PlasmaDataReply, &fbb, message); } -Status ReadDataReply( - uint8_t* data, size_t size, ObjectID* object_id, int64_t* object_size, int64_t* metadata_size) { +Status ReadDataReply(uint8_t* data, size_t size, ObjectID* object_id, + int64_t* object_size, int64_t* metadata_size) { DCHECK(data); auto message = flatbuffers::GetRoot(data); DCHECK(verify_flatbuffer(message, data, size)); @@ -523,3 +526,5 @@ Status ReadDataReply( *metadata_size = (int64_t)message->metadata_size(); return Status::OK(); } + +} // namespace plasma diff --git a/cpp/src/plasma/protocol.h b/cpp/src/plasma/protocol.h index 954a0ae8e18e0..c0c2fc3d90a92 100644 --- a/cpp/src/plasma/protocol.h +++ b/cpp/src/plasma/protocol.h @@ -21,8 +21,10 @@ #include #include "arrow/status.h" -#include "format/plasma_generated.h" #include "plasma/plasma.h" +#include "plasma/plasma_generated.h" + +namespace plasma { using arrow::Status; @@ -38,11 +40,11 @@ Status PlasmaReceive(int sock, int64_t message_type, std::vector* buffe /* Plasma Create message functions. 
*/ -Status SendCreateRequest( - int sock, ObjectID object_id, int64_t data_size, int64_t metadata_size); +Status SendCreateRequest(int sock, ObjectID object_id, int64_t data_size, + int64_t metadata_size); -Status ReadCreateRequest( - uint8_t* data, size_t size, ObjectID* object_id, int64_t* data_size, int64_t* metadata_size); +Status ReadCreateRequest(uint8_t* data, size_t size, ObjectID* object_id, + int64_t* data_size, int64_t* metadata_size); Status SendCreateReply(int sock, ObjectID object_id, PlasmaObject* object, int error); @@ -60,18 +62,19 @@ Status ReadSealReply(uint8_t* data, size_t size, ObjectID* object_id); /* Plasma Get message functions. */ -Status SendGetRequest( - int sock, const ObjectID* object_ids, int64_t num_objects, int64_t timeout_ms); +Status SendGetRequest(int sock, const ObjectID* object_ids, int64_t num_objects, + int64_t timeout_ms); -Status ReadGetRequest( - uint8_t* data, size_t size, std::vector& object_ids, int64_t* timeout_ms); +Status ReadGetRequest(uint8_t* data, size_t size, + std::vector& object_ids, int64_t* timeout_ms); -Status SendGetReply(int sock, ObjectID object_ids[], +Status SendGetReply( + int sock, ObjectID object_ids[], std::unordered_map& plasma_objects, int64_t num_objects); -Status ReadGetReply(uint8_t* data, size_t size, ObjectID object_ids[], PlasmaObject plasma_objects[], - int64_t num_objects); +Status ReadGetReply(uint8_t* data, size_t size, ObjectID object_ids[], + PlasmaObject plasma_objects[], int64_t num_objects); /* Plasma Release message functions. 
*/ @@ -99,13 +102,13 @@ Status SendStatusRequest(int sock, const ObjectID* object_ids, int64_t num_objec Status ReadStatusRequest(uint8_t* data, size_t size, ObjectID object_ids[], int64_t num_objects); -Status SendStatusReply( - int sock, ObjectID object_ids[], int object_status[], int64_t num_objects); +Status SendStatusReply(int sock, ObjectID object_ids[], int object_status[], + int64_t num_objects); int64_t ReadStatusReply_num_objects(uint8_t* data, size_t size); -Status ReadStatusReply( - uint8_t* data, size_t size, ObjectID object_ids[], int object_status[], int64_t num_objects); +Status ReadStatusReply(uint8_t* data, size_t size, ObjectID object_ids[], + int object_status[], int64_t num_objects); /* Plasma Constains message functions. */ @@ -146,16 +149,17 @@ Status ReadFetchRequest(uint8_t* data, size_t size, std::vector& objec /* Plasma Wait message functions. */ Status SendWaitRequest(int sock, ObjectRequest object_requests[], int64_t num_requests, - int num_ready_objects, int64_t timeout_ms); + int num_ready_objects, int64_t timeout_ms); -Status ReadWaitRequest(uint8_t* data, size_t size, ObjectRequestMap& object_requests, - int64_t* timeout_ms, int* num_ready_objects); +Status ReadWaitRequest(uint8_t* data, size_t size, + ObjectRequestMap& object_requests, + int64_t* timeout_ms, int* num_ready_objects); -Status SendWaitReply( - int sock, const ObjectRequestMap& object_requests, int num_ready_objects); +Status SendWaitReply(int sock, const ObjectRequestMap& object_requests, + int num_ready_objects); -Status ReadWaitReply( - uint8_t* data, size_t size, ObjectRequest object_requests[], int* num_ready_objects); +Status ReadWaitReply(uint8_t* data, size_t size, + ObjectRequest object_requests[], int* num_ready_objects); /* Plasma Subscribe message functions. 
*/ @@ -167,10 +171,13 @@ Status SendDataRequest(int sock, ObjectID object_id, const char* address, int po Status ReadDataRequest(uint8_t* data, size_t size, ObjectID* object_id, char** address, int* port); -Status SendDataReply( - int sock, ObjectID object_id, int64_t object_size, int64_t metadata_size); +Status SendDataReply(int sock, ObjectID object_id, int64_t object_size, + int64_t metadata_size); + +Status ReadDataReply(uint8_t* data, size_t size, + ObjectID* object_id, int64_t* object_size, + int64_t* metadata_size); -Status ReadDataReply( - uint8_t* data, size_t size, ObjectID* object_id, int64_t* object_size, int64_t* metadata_size); +} // namespace plasma #endif /* PLASMA_PROTOCOL */ diff --git a/cpp/src/plasma/store.cc b/cpp/src/plasma/store.cc index 5285ba1c4c52f..6bc6507953385 100644 --- a/cpp/src/plasma/store.cc +++ b/cpp/src/plasma/store.cc @@ -49,12 +49,14 @@ #include #include -#include "format/common_generated.h" #include "plasma/common.h" +#include "plasma/common_generated.h" #include "plasma/fling.h" #include "plasma/io.h" #include "plasma/malloc.h" +namespace plasma { + extern "C" { void* dlmalloc(size_t bytes); void* dlmemalign(size_t alignment, size_t bytes); @@ -87,8 +89,8 @@ GetRequest::GetRequest(Client* client, const std::vector& object_ids) object_ids(object_ids.begin(), object_ids.end()), objects(object_ids.size()), num_satisfied(0) { - std::unordered_set unique_ids( - object_ids.begin(), object_ids.end()); + std::unordered_set unique_ids(object_ids.begin(), + object_ids.end()); num_objects_to_wait_for = unique_ids.size(); } @@ -116,7 +118,9 @@ PlasmaStore::~PlasmaStore() { // object's list of clients, otherwise do nothing. void PlasmaStore::add_client_to_object_clients(ObjectTableEntry* entry, Client* client) { // Check if this client is already using the object. 
- if (entry->clients.find(client) != entry->clients.end()) { return; } + if (entry->clients.find(client) != entry->clients.end()) { + return; + } // If there are no other clients using this object, notify the eviction policy // that the object is being used. if (entry->clients.size() == 0) { @@ -131,7 +135,8 @@ void PlasmaStore::add_client_to_object_clients(ObjectTableEntry* entry, Client* // Create a new object buffer in the hash table. int PlasmaStore::create_object(const ObjectID& object_id, int64_t data_size, - int64_t metadata_size, Client* client, PlasmaObject* result) { + int64_t metadata_size, Client* client, + PlasmaObject* result) { ARROW_LOG(DEBUG) << "creating object " << object_id.hex(); if (store_info_.objects.count(object_id) != 0) { // There is already an object with the same ID in the Plasma Store, so @@ -158,7 +163,9 @@ int PlasmaStore::create_object(const ObjectID& object_id, int64_t data_size, delete_objects(objects_to_evict); // Return an error to the client if not enough space could be freed to // create the object. - if (!success) { return PlasmaError_OutOfMemory; } + if (!success) { + return PlasmaError_OutOfMemory; + } } } while (pointer == NULL); int fd; @@ -210,7 +217,7 @@ void PlasmaObject_init(PlasmaObject* object, ObjectTableEntry* entry) { void PlasmaStore::return_from_get(GetRequest* get_req) { // Send the get reply to the client. Status s = SendGetReply(get_req->client->fd, &get_req->object_ids[0], get_req->objects, - get_req->object_ids.size()); + get_req->object_ids.size()); warn_if_sigpipe(s.ok() ? 0 : -1, get_req->client->fd); // If we successfully sent the get reply message to the client, then also send // the file descriptors. @@ -247,10 +254,14 @@ void PlasmaStore::return_from_get(GetRequest* get_req) { auto& get_requests = object_get_requests_[object_id]; // Erase get_req from the vector. 
auto it = std::find(get_requests.begin(), get_requests.end(), get_req); - if (it != get_requests.end()) { get_requests.erase(it); } + if (it != get_requests.end()) { + get_requests.erase(it); + } } // Remove the get request. - if (get_req->timer != -1) { ARROW_CHECK(loop_->remove_timer(get_req->timer) == AE_OK); } + if (get_req->timer != -1) { + ARROW_CHECK(loop_->remove_timer(get_req->timer) == AE_OK); + } delete get_req; } @@ -285,8 +296,9 @@ void PlasmaStore::update_object_get_requests(const ObjectID& object_id) { object_get_requests_.erase(object_id); } -void PlasmaStore::process_get_request( - Client* client, const std::vector& object_ids, int64_t timeout_ms) { +void PlasmaStore::process_get_request(Client* client, + const std::vector& object_ids, + int64_t timeout_ms) { // Create a get request for this object. GetRequest* get_req = new GetRequest(client, object_ids); @@ -325,8 +337,8 @@ void PlasmaStore::process_get_request( } } -int PlasmaStore::remove_client_from_object_clients( - ObjectTableEntry* entry, Client* client) { +int PlasmaStore::remove_client_from_object_clients(ObjectTableEntry* entry, + Client* client) { auto it = entry->clients.find(client); if (it != entry->clients.end()) { entry->clients.erase(it); @@ -406,7 +418,9 @@ void PlasmaStore::connect_client(int listener_sock) { // TODO(pcm): Check return value. loop_->add_file_event(client_fd, kEventLoopRead, [this, client](int events) { Status s = process_message(client); - if (!s.ok()) { ARROW_LOG(FATAL) << "Failed to process file event: " << s; } + if (!s.ok()) { + ARROW_LOG(FATAL) << "Failed to process file event: " << s; + } }); ARROW_LOG(DEBUG) << "New connection with fd " << client_fd; } @@ -464,8 +478,9 @@ void PlasmaStore::send_notifications(int client_fd) { // at the end of the method. // TODO(pcm): Introduce status codes and check in case the file descriptor // is added twice. 
- loop_->add_file_event(client_fd, kEventLoopWrite, - [this, client_fd](int events) { send_notifications(client_fd); }); + loop_->add_file_event(client_fd, kEventLoopWrite, [this, client_fd](int events) { + send_notifications(client_fd); + }); break; } else { ARROW_LOG(WARNING) << "Failed to send notification to client on fd " << client_fd; @@ -480,7 +495,8 @@ void PlasmaStore::send_notifications(int client_fd) { delete[] notification; } // Remove the sent notifications from the array. - it->second.object_notifications.erase(it->second.object_notifications.begin(), + it->second.object_notifications.erase( + it->second.object_notifications.begin(), it->second.object_notifications.begin() + num_processed); // Stop sending notifications if the pipe was broken. @@ -490,7 +506,9 @@ void PlasmaStore::send_notifications(int client_fd) { } // If we have sent all notifications, remove the fd from the event loop. - if (it->second.object_notifications.empty()) { loop_->remove_file_event(client_fd); } + if (it->second.object_notifications.empty()) { + loop_->remove_file_event(client_fd); + } } void PlasmaStore::push_notification(ObjectInfoT* object_info) { @@ -549,8 +567,8 @@ Status PlasmaStore::process_message(Client* client) { RETURN_NOT_OK(ReadCreateRequest(input, input_size, &object_id, &data_size, &metadata_size)); int error_code = create_object(object_id, data_size, metadata_size, client, &object); - HANDLE_SIGPIPE( - SendCreateReply(client->fd, object_id, &object, error_code), client->fd); + HANDLE_SIGPIPE(SendCreateReply(client->fd, object_id, &object, error_code), + client->fd); if (error_code == PlasmaError_OK) { warn_if_sigpipe(send_fd(client->fd, object.handle.store_fd), client->fd); } @@ -592,8 +610,8 @@ Status PlasmaStore::process_message(Client* client) { subscribe_to_updates(client); break; case MessageType_PlasmaConnectRequest: { - HANDLE_SIGPIPE( - SendConnectReply(client->fd, store_info_.memory_capacity), client->fd); + 
HANDLE_SIGPIPE(SendConnectReply(client->fd, store_info_.memory_capacity), + client->fd); } break; case DISCONNECT_CLIENT: ARROW_LOG(DEBUG) << "Disconnecting client on fd " << client->fd; @@ -608,7 +626,9 @@ Status PlasmaStore::process_message(Client* client) { // Report "success" to valgrind. void signal_handler(int signal) { - if (signal == SIGTERM) { exit(0); } + if (signal == SIGTERM) { + exit(0); + } } void start_server(char* socket_name, int64_t system_memory) { @@ -622,12 +642,14 @@ void start_server(char* socket_name, int64_t system_memory) { ARROW_CHECK(socket >= 0); // TODO(pcm): Check return value. loop.add_file_event(socket, kEventLoopRead, - [&store, socket](int events) { store.connect_client(socket); }); + [&store, socket](int events) { store.connect_client(socket); }); loop.run(); } +} // namespace plasma + int main(int argc, char* argv[]) { - signal(SIGTERM, signal_handler); + signal(SIGTERM, plasma::signal_handler); char* socket_name = NULL; int64_t system_memory = -1; int c; @@ -678,7 +700,7 @@ int main(int argc, char* argv[]) { #endif // Make it so dlmalloc fails if we try to request more memory than is // available. - dlmalloc_set_footprint_limit((size_t)system_memory); + plasma::dlmalloc_set_footprint_limit((size_t)system_memory); ARROW_LOG(DEBUG) << "starting server listening on " << socket_name; - start_server(socket_name, system_memory); + plasma::start_server(socket_name, system_memory); } diff --git a/cpp/src/plasma/store.h b/cpp/src/plasma/store.h index 8bd94265410f6..fec25c133ce17 100644 --- a/cpp/src/plasma/store.h +++ b/cpp/src/plasma/store.h @@ -27,6 +27,8 @@ #include "plasma/plasma.h" #include "plasma/protocol.h" +namespace plasma { + struct GetRequest; struct NotificationQueue { @@ -64,7 +66,7 @@ class PlasmaStore { /// cannot create the object. In this case, the client should not call /// plasma_release. 
int create_object(const ObjectID& object_id, int64_t data_size, int64_t metadata_size, - Client* client, PlasmaObject* result); + Client* client, PlasmaObject* result); /// Delete objects that have been created in the hash table. This should only /// be called on objects that are returned by the eviction policy to evict. @@ -85,8 +87,8 @@ class PlasmaStore { /// @param object_ids Object IDs of the objects to be gotten. /// @param timeout_ms The timeout for the get request in milliseconds. /// @return Void. - void process_get_request( - Client* client, const std::vector& object_ids, int64_t timeout_ms); + void process_get_request(Client* client, const std::vector& object_ids, + int64_t timeout_ms); /// Seal an object. The object is now immutable and can be accessed with get. /// @@ -166,4 +168,6 @@ class PlasmaStore { std::unordered_map pending_notifications_; }; +} // namespace plasma + #endif // PLASMA_STORE_H diff --git a/cpp/src/plasma/test/client_tests.cc b/cpp/src/plasma/test/client_tests.cc index 29b5b135144c3..02b3832145186 100644 --- a/cpp/src/plasma/test/client_tests.cc +++ b/cpp/src/plasma/test/client_tests.cc @@ -29,7 +29,9 @@ #include "plasma/plasma.h" #include "plasma/protocol.h" -std::string g_test_executable; // NOLINT +namespace plasma { + +std::string test_executable; // NOLINT class TestPlasmaStore : public ::testing::Test { public: @@ -37,7 +39,7 @@ class TestPlasmaStore : public ::testing::Test { // stdout of the object store. Consider changing that. 
void SetUp() { std::string plasma_directory = - g_test_executable.substr(0, g_test_executable.find_last_of("/")); + test_executable.substr(0, test_executable.find_last_of("/")); std::string plasma_command = plasma_directory + "/plasma_store -m 1000000000 -s /tmp/store 1> /dev/null 2> /dev/null &"; @@ -125,8 +127,10 @@ TEST_F(TestPlasmaStore, MultipleGetTest) { ASSERT_EQ(object_buffer[1].data[0], 2); } +} // namespace plasma + int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); - g_test_executable = std::string(argv[0]); + plasma::test_executable = std::string(argv[0]); return RUN_ALL_TESTS(); } diff --git a/cpp/src/plasma/test/serialization_tests.cc b/cpp/src/plasma/test/serialization_tests.cc index 22916a246e4be..7fa8531e699e8 100644 --- a/cpp/src/plasma/test/serialization_tests.cc +++ b/cpp/src/plasma/test/serialization_tests.cc @@ -25,6 +25,8 @@ #include "plasma/plasma.h" #include "plasma/protocol.h" +namespace plasma { + /** * Create a temporary file. Needs to be closed by the caller. * @@ -165,11 +167,11 @@ TEST(PlasmaSerialization, GetReply) { ASSERT_EQ(object_ids[0], object_ids_return[0]); ASSERT_EQ(object_ids[1], object_ids_return[1]); ASSERT_EQ(memcmp(&plasma_objects[object_ids[0]], &plasma_objects_return[0], - sizeof(PlasmaObject)), - 0); + sizeof(PlasmaObject)), + 0); ASSERT_EQ(memcmp(&plasma_objects[object_ids[1]], &plasma_objects_return[1], - sizeof(PlasmaObject)), - 0); + sizeof(PlasmaObject)), + 0); close(fd); } @@ -301,15 +303,15 @@ TEST(PlasmaSerialization, WaitRequest) { const int num_ready_objects_in = 1; int64_t timeout_ms = 1000; - ARROW_CHECK_OK(SendWaitRequest( - fd, &object_requests_in[0], num_objects_in, num_ready_objects_in, timeout_ms)); + ARROW_CHECK_OK(SendWaitRequest(fd, &object_requests_in[0], num_objects_in, + num_ready_objects_in, timeout_ms)); /* Read message back. 
*/ std::vector data = read_message_from_file(fd, MessageType_PlasmaWaitRequest); int num_ready_objects_out; int64_t timeout_ms_read; ObjectRequestMap object_requests_out; - ARROW_CHECK_OK(ReadWaitRequest( - data.data(), data.size(), object_requests_out, &timeout_ms_read, &num_ready_objects_out)); + ARROW_CHECK_OK(ReadWaitRequest(data.data(), data.size(), object_requests_out, + &timeout_ms_read, &num_ready_objects_out)); ASSERT_EQ(num_objects_in, object_requests_out.size()); ASSERT_EQ(num_ready_objects_out, num_ready_objects_in); for (int i = 0; i < num_objects_in; i++) { @@ -386,3 +388,5 @@ TEST(PlasmaSerialization, DataReply) { ASSERT_EQ(object_size1, object_size2); ASSERT_EQ(metadata_size1, metadata_size2); } + +} // namespace plasma diff --git a/format/Schema.fbs b/format/Schema.fbs index a7e802b9dcba6..186f8e362bde2 100644 --- a/format/Schema.fbs +++ b/format/Schema.fbs @@ -44,6 +44,35 @@ table FixedSizeList { listSize: int; } +/// A Map is a logical nested type that is represented as +/// +/// List> +/// +/// In this layout, the keys and values are each respectively contiguous. We do +/// not constrain the key and value types, so the application is responsible +/// for ensuring that the keys are hashable and unique. Whether the keys are sorted +/// may be set in the metadata for this field +/// +/// In a Field with Map type, the Field has a child Struct field, which then +/// has two children: key type and the second the value type. The names of the +/// child fields may be respectively "entry", "key", and "value", but this is +/// not enforced +/// +/// Map +/// - child[0] entry: Struct +/// - child[0] key: K +/// - child[1] value: V +/// +/// Neither the "entry" field nor the "key" field may be nullable. +/// +/// The metadata is structured so that Arrow systems without special handling +/// for Map can make Map an alias for List. The "layout" attribute for the Map +/// field must have the same contents as a List. 
+table Map { + /// Set to true if the keys within each value are sorted + keysSorted: bool; +} + enum UnionMode:short { Sparse, Dense } /// A union is a complex type with children in Field @@ -170,7 +199,8 @@ union Type { Struct_, Union, FixedSizeBinary, - FixedSizeList + FixedSizeList, + Map } /// ---------------------------------------------------------------------- diff --git a/integration/integration_test.py b/integration/integration_test.py index 215ba58232a00..b7f1609935e79 100644 --- a/integration/integration_test.py +++ b/integration/integration_test.py @@ -945,7 +945,7 @@ def get_static_json_files(): def run_all_tests(debug=False): - testers = [CPPTester(debug=debug)] # , JavaTester(debug=debug)] + testers = [CPPTester(debug=debug), JavaTester(debug=debug)] static_json_files = get_static_json_files() generated_json_files = get_generated_json_files() json_files = static_json_files + generated_json_files diff --git a/java/format/pom.xml b/java/format/pom.xml index e09275476d175..af8ff609dbc9c 100644 --- a/java/format/pom.xml +++ b/java/format/pom.xml @@ -15,7 +15,7 @@ arrow-java-root org.apache.arrow - 0.5.0-SNAPSHOT + 0.6.0-SNAPSHOT arrow-format diff --git a/java/memory/pom.xml b/java/memory/pom.xml index dc4d0daf93d92..9a8d2d7c9b56f 100644 --- a/java/memory/pom.xml +++ b/java/memory/pom.xml @@ -14,7 +14,7 @@ org.apache.arrow arrow-java-root - 0.5.0-SNAPSHOT + 0.6.0-SNAPSHOT arrow-memory Arrow Memory diff --git a/java/pom.xml b/java/pom.xml index 1ec3d561121c3..81f80b00b563c 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -20,7 +20,7 @@ org.apache.arrow arrow-java-root - 0.5.0-SNAPSHOT + 0.6.0-SNAPSHOT pom Apache Arrow Java Root POM @@ -30,7 +30,7 @@ ${project.basedir}/target/generated-sources 4.11 - 1.7.6 + 1.7.25 18.0 2 2.7.1 @@ -231,7 +231,7 @@ pl.project13.maven git-commit-id-plugin - 2.1.9 + 2.2.2 for-jars @@ -520,7 +520,7 @@ ch.qos.logback logback-classic - 1.0.13 + 1.2.3 test diff --git a/java/tools/pom.xml b/java/tools/pom.xml index 
53331f2f24541..9d067ef1e9bc2 100644 --- a/java/tools/pom.xml +++ b/java/tools/pom.xml @@ -14,7 +14,7 @@ org.apache.arrow arrow-java-root - 0.5.0-SNAPSHOT + 0.6.0-SNAPSHOT arrow-tools Arrow Tools diff --git a/java/vector/pom.xml b/java/vector/pom.xml index a117a2fb3b7cb..e15ab9a2497fc 100644 --- a/java/vector/pom.xml +++ b/java/vector/pom.xml @@ -14,7 +14,7 @@ org.apache.arrow arrow-java-root - 0.5.0-SNAPSHOT + 0.6.0-SNAPSHOT arrow-vector Arrow Vectors diff --git a/java/vector/src/main/codegen/templates/NullableValueVectors.java b/java/vector/src/main/codegen/templates/NullableValueVectors.java index 092097bb2bd6d..5b993678012b5 100644 --- a/java/vector/src/main/codegen/templates/NullableValueVectors.java +++ b/java/vector/src/main/codegen/templates/NullableValueVectors.java @@ -540,7 +540,7 @@ public void set(int index, <#if type.major == "VarLen">byte[]<#elseif (type.widt <#if type.major == "VarLen"> - private void fillEmpties(int index){ + public void fillEmpties(int index){ final ${valuesName}.Mutator valuesMutator = values.getMutator(); for (int i = lastSet + 1; i < index; i++) { valuesMutator.setSafe(i, emptyByteArray); @@ -699,6 +699,22 @@ public void reset(){ setCount = 0; <#if type.major = "VarLen">lastSet = -1; } + + public void setLastSet(int value) { + <#if type.major = "VarLen"> + lastSet = value; + <#else> + throw new UnsupportedOperationException(); + + } + + public int getLastSet() { + <#if type.major != "VarLen"> + throw new UnsupportedOperationException(); + <#else> + return lastSet; + + } } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java index 4ab624f3694cb..6357294566017 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java @@ -393,6 +393,12 @@ public void setValueCount(int valueCount) { 
vector.getMutator().setValueCount(childValueCount); bits.getMutator().setValueCount(valueCount); } + + public void setLastSet(int value) { + lastSet = value; + } + + public int getLastSet() { return lastSet; } } } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java index 11be3298f7533..29ea7628f452b 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java @@ -17,15 +17,26 @@ */ package org.apache.arrow.vector; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNull; + import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.vector.complex.ListVector; import org.apache.arrow.vector.complex.impl.UnionListWriter; +import org.apache.arrow.vector.complex.impl.UnionListReader; import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.complex.writer.FieldWriter; +import org.apache.arrow.vector.holders.NullableBigIntHolder; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.Types.*; +import org.apache.arrow.vector.types.pojo.FieldType; import org.junit.After; import org.junit.Assert; import org.junit.Before; import org.junit.Test; +import java.util.List; + public class TestListVector { private BufferAllocator allocator; @@ -80,4 +91,147 @@ public void testCopyFrom() throws Exception { Assert.assertTrue("shouldn't be null", reader.isSet()); } } + + @Test + public void testSetLastSetUsage() throws Exception { + try (ListVector listVector = ListVector.empty("input", allocator)) { + + /* Explicitly add the dataVector */ + MinorType type = MinorType.BIGINT; + listVector.addOrGetVector(FieldType.nullable(type.getType())); + + /* allocate memory */ + listVector.allocateNew(); + + /* get inner vectors; bitVector and offsetVector */ + List innerVectors = 
listVector.getFieldInnerVectors(); + BitVector bitVector = (BitVector)innerVectors.get(0); + UInt4Vector offsetVector = (UInt4Vector)innerVectors.get(1); + + /* get the underlying data vector -- NullableBigIntVector */ + NullableBigIntVector dataVector = (NullableBigIntVector)listVector.getDataVector(); + + /* check current lastSet */ + assertEquals(Integer.toString(0), Integer.toString(listVector.getMutator().getLastSet())); + + int index = 0; + int offset = 0; + + /* write [10, 11, 12] to the list vector at index */ + bitVector.getMutator().setSafe(index, 1); + dataVector.getMutator().setSafe(0, 1, 10); + dataVector.getMutator().setSafe(1, 1, 11); + dataVector.getMutator().setSafe(2, 1, 12); + offsetVector.getMutator().setSafe(index + 1, 3); + + index += 1; + + /* write [13, 14] to the list vector at index 1 */ + bitVector.getMutator().setSafe(index, 1); + dataVector.getMutator().setSafe(3, 1, 13); + dataVector.getMutator().setSafe(4, 1, 14); + offsetVector.getMutator().setSafe(index + 1, 5); + + index += 1; + + /* write [15, 16, 17] to the list vector at index 2 */ + bitVector.getMutator().setSafe(index, 1); + dataVector.getMutator().setSafe(5, 1, 15); + dataVector.getMutator().setSafe(6, 1, 16); + dataVector.getMutator().setSafe(7, 1, 17); + offsetVector.getMutator().setSafe(index + 1, 8); + + /* check current lastSet */ + assertEquals(Integer.toString(0), Integer.toString(listVector.getMutator().getLastSet())); + + /* set lastset and arbitrary valuecount for list vector. + * + * NOTE: if we don't execute setLastSet() before setLastValueCount(), then + * the latter will corrupt the offsetVector and thus the accessor will not + * retrieve the correct values from underlying dataVector. Run the test + * by commenting out next line and we should see failures from 5th assert + * onwards. This is why doing setLastSet() is important before setValueCount() + * once the vector has been loaded. + * + * Another important thing to remember is the value of lastSet itself. 
+ * Even though the listVector has elements till index 2 only, the lastSet should + * be set as 3. This is because the offsetVector has valid offsets filled till index 3. + * If we do setLastSet(2), the offsetVector at index 3 will contain incorrect value + * after execution of setValueCount(). + * + * correct state of the listVector + * bitvector {1, 1, 1, 0, 0.... } + * offsetvector {0, 3, 5, 8, 8, 8.....} + * datavector { [10, 11, 12], + * [13, 14], + * [15, 16, 17] + * } + * + * if we don't do setLastSet() before setValueCount --> incorrect state + * bitvector {1, 1, 1, 0, 0.... } + * offsetvector {0, 0, 0, 0, 0, 0.....} + * datavector { [10, 11, 12], + * [13, 14], + * [15, 16, 17] + * } + * + * if we do setLastSet(2) before setValueCount --> incorrect state + * bitvector {1, 1, 1, 0, 0.... } + * offsetvector {0, 3, 5, 5, 5, 5.....} + * datavector { [10, 11, 12], + * [13, 14], + * [15, 16, 17] + * } + */ + listVector.getMutator().setLastSet(3); + listVector.getMutator().setValueCount(10); + + /* check the vector output */ + final UInt4Vector.Accessor offsetAccessor = offsetVector.getAccessor(); + final ValueVector.Accessor valueAccessor = dataVector.getAccessor(); + + index = 0; + offset = offsetAccessor.get(index); + assertEquals(Integer.toString(0), Integer.toString(offset)); + + Object actual = valueAccessor.getObject(offset); + assertEquals(new Long(10), (Long)actual); + offset++; + actual = valueAccessor.getObject(offset); + assertEquals(new Long(11), (Long)actual); + offset++; + actual = valueAccessor.getObject(offset); + assertEquals(new Long(12), (Long)actual); + + index++; + offset = offsetAccessor.get(index); + assertEquals(Integer.toString(3), Integer.toString(offset)); + + actual = valueAccessor.getObject(offset); + assertEquals(new Long(13), (Long)actual); + offset++; + actual = valueAccessor.getObject(offset); + assertEquals(new Long(14), (Long)actual); + + index++; + offset = offsetAccessor.get(index); + assertEquals(Integer.toString(5), 
Integer.toString(offset)); + + actual = valueAccessor.getObject(offsetAccessor.get(index)); + assertEquals(new Long(15), (Long)actual); + offset++; + actual = valueAccessor.getObject(offset); + assertEquals(new Long(16), (Long)actual); + offset++; + actual = valueAccessor.getObject(offset); + assertEquals(new Long(17), (Long)actual); + + index++; + offset = offsetAccessor.get(index); + assertEquals(Integer.toString(8), Integer.toString(offset)); + + actual = valueAccessor.getObject(offsetAccessor.get(index)); + assertNull(actual); + } + } } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java index 63543b0932908..0f41c2dd790e1 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java @@ -26,11 +26,15 @@ import java.nio.charset.Charset; import java.util.List; +import java.util.ArrayList; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; + +import org.apache.arrow.vector.schema.ArrowRecordBatch; import org.apache.arrow.vector.schema.TypeLayout; import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.Schema; import org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.Field; import org.junit.After; @@ -56,6 +60,9 @@ public void init() { private final static byte[] STR1 = "AAAAA1".getBytes(utf8Charset); private final static byte[] STR2 = "BBBBBBBBB2".getBytes(utf8Charset); private final static byte[] STR3 = "CCCC3".getBytes(utf8Charset); + private final static byte[] STR4 = "DDDDDDDD4".getBytes(utf8Charset); + private final static byte[] STR5 = "EEE5".getBytes(utf8Charset); + private final static byte[] STR6 = "FFFFF6".getBytes(utf8Charset); @After public void terminate() throws Exception { @@ -509,11 +516,231 @@ public void testCopyFromWithNulls() 
{ } else { assertEquals(Integer.toString(i), vector2.getAccessor().getObject(i).toString()); } + } + } + } + + @Test + public void testSetLastSetUsage() { + try (final NullableVarCharVector vector = new NullableVarCharVector("myvector", allocator)) { + + final NullableVarCharVector.Mutator mutator = vector.getMutator(); + vector.allocateNew(1024 * 10, 1024); + + setBytes(0, STR1, vector); + setBytes(1, STR2, vector); + setBytes(2, STR3, vector); + setBytes(3, STR4, vector); + setBytes(4, STR5, vector); + setBytes(5, STR6, vector); + + /* Check current lastSet */ + assertEquals(Integer.toString(-1), Integer.toString(mutator.getLastSet())); + + /* Check the vector output */ + final NullableVarCharVector.Accessor accessor = vector.getAccessor(); + assertArrayEquals(STR1, accessor.get(0)); + assertArrayEquals(STR2, accessor.get(1)); + assertArrayEquals(STR3, accessor.get(2)); + assertArrayEquals(STR4, accessor.get(3)); + assertArrayEquals(STR5, accessor.get(4)); + assertArrayEquals(STR6, accessor.get(5)); + + /* + * If we don't do setLastSe(5) before setValueCount(), then the latter will corrupt + * the value vector by filling in all positions [0,valuecount-1] will empty byte arrays. + * Run the test by commenting out next line and we should see incorrect vector output. 
+ */ + mutator.setLastSet(5); + mutator.setValueCount(20); + + /* Check the vector output again */ + assertArrayEquals(STR1, accessor.get(0)); + assertArrayEquals(STR2, accessor.get(1)); + assertArrayEquals(STR3, accessor.get(2)); + assertArrayEquals(STR4, accessor.get(3)); + assertArrayEquals(STR5, accessor.get(4)); + assertArrayEquals(STR6, accessor.get(5)); + } + } + + @Test + public void testVectorLoadUnload() { + + try (final NullableVarCharVector vector1 = new NullableVarCharVector("myvector", allocator)) { + + final NullableVarCharVector.Mutator mutator1 = vector1.getMutator(); + + vector1.allocateNew(1024 * 10, 1024); + + mutator1.set(0, STR1); + mutator1.set(1, STR2); + mutator1.set(2, STR3); + mutator1.set(3, STR4); + mutator1.set(4, STR5); + mutator1.set(5, STR6); + assertEquals(Integer.toString(5), Integer.toString(mutator1.getLastSet())); + mutator1.setValueCount(15); + assertEquals(Integer.toString(14), Integer.toString(mutator1.getLastSet())); + + /* Check the vector output */ + final NullableVarCharVector.Accessor accessor1 = vector1.getAccessor(); + assertArrayEquals(STR1, accessor1.get(0)); + assertArrayEquals(STR2, accessor1.get(1)); + assertArrayEquals(STR3, accessor1.get(2)); + assertArrayEquals(STR4, accessor1.get(3)); + assertArrayEquals(STR5, accessor1.get(4)); + assertArrayEquals(STR6, accessor1.get(5)); + + Field field = vector1.getField(); + String fieldName = field.getName(); + + List fields = new ArrayList(); + List fieldVectors = new ArrayList(); + + fields.add(field); + fieldVectors.add(vector1); + + Schema schema = new Schema(fields); + + VectorSchemaRoot schemaRoot1 = new VectorSchemaRoot(schema, fieldVectors, accessor1.getValueCount()); + VectorUnloader vectorUnloader = new VectorUnloader(schemaRoot1); + + try ( + ArrowRecordBatch recordBatch = vectorUnloader.getRecordBatch(); + BufferAllocator finalVectorsAllocator = allocator.newChildAllocator("new vector", 0, Long.MAX_VALUE); + VectorSchemaRoot schemaRoot2 = 
VectorSchemaRoot.create(schema, finalVectorsAllocator); + ) { + + VectorLoader vectorLoader = new VectorLoader(schemaRoot2); + vectorLoader.load(recordBatch); + + NullableVarCharVector vector2 = (NullableVarCharVector)schemaRoot2.getVector(fieldName); + NullableVarCharVector.Mutator mutator2 = vector2.getMutator(); + + /* + * lastSet would have internally been set by VectorLoader.load() when it invokes + * loadFieldBuffers. + */ + assertEquals(Integer.toString(14), Integer.toString(mutator2.getLastSet())); + mutator2.setValueCount(25); + assertEquals(Integer.toString(24), Integer.toString(mutator2.getLastSet())); + + /* Check the vector output */ + final NullableVarCharVector.Accessor accessor2 = vector2.getAccessor(); + assertArrayEquals(STR1, accessor2.get(0)); + assertArrayEquals(STR2, accessor2.get(1)); + assertArrayEquals(STR3, accessor2.get(2)); + assertArrayEquals(STR4, accessor2.get(3)); + assertArrayEquals(STR5, accessor2.get(4)); + assertArrayEquals(STR6, accessor2.get(5)); } + } + } + @Test + public void testFillEmptiesUsage() { + try (final NullableVarCharVector vector = new NullableVarCharVector("myvector", allocator)) { + final NullableVarCharVector.Mutator mutator = vector.getMutator(); + + vector.allocateNew(1024 * 10, 1024); + + setBytes(0, STR1, vector); + setBytes(1, STR2, vector); + setBytes(2, STR3, vector); + setBytes(3, STR4, vector); + setBytes(4, STR5, vector); + setBytes(5, STR6, vector); + + /* Check current lastSet */ + assertEquals(Integer.toString(-1), Integer.toString(mutator.getLastSet())); + + /* Check the vector output */ + final NullableVarCharVector.Accessor accessor = vector.getAccessor(); + assertArrayEquals(STR1, accessor.get(0)); + assertArrayEquals(STR2, accessor.get(1)); + assertArrayEquals(STR3, accessor.get(2)); + assertArrayEquals(STR4, accessor.get(3)); + assertArrayEquals(STR5, accessor.get(4)); + assertArrayEquals(STR6, accessor.get(5)); + + mutator.setLastSet(5); + /* fill empty byte arrays from index [6, 9] */ + 
mutator.fillEmpties(10); + + /* Check current lastSet */ + assertEquals(Integer.toString(9), Integer.toString(mutator.getLastSet())); + + /* Check the vector output */ + assertArrayEquals(STR1, accessor.get(0)); + assertArrayEquals(STR2, accessor.get(1)); + assertArrayEquals(STR3, accessor.get(2)); + assertArrayEquals(STR4, accessor.get(3)); + assertArrayEquals(STR5, accessor.get(4)); + assertArrayEquals(STR6, accessor.get(5)); + assertEquals(Integer.toString(0), Integer.toString(accessor.getValueLength(6))); + assertEquals(Integer.toString(0), Integer.toString(accessor.getValueLength(7))); + assertEquals(Integer.toString(0), Integer.toString(accessor.getValueLength(8))); + assertEquals(Integer.toString(0), Integer.toString(accessor.getValueLength(9))); + + setBytes(10, STR1, vector); + setBytes(11, STR2, vector); + + mutator.setLastSet(11); + /* fill empty byte arrays from index [12, 14] */ + mutator.setValueCount(15); + + /* Check current lastSet */ + assertEquals(Integer.toString(14), Integer.toString(mutator.getLastSet())); + + /* Check the vector output */ + assertArrayEquals(STR1, accessor.get(0)); + assertArrayEquals(STR2, accessor.get(1)); + assertArrayEquals(STR3, accessor.get(2)); + assertArrayEquals(STR4, accessor.get(3)); + assertArrayEquals(STR5, accessor.get(4)); + assertArrayEquals(STR6, accessor.get(5)); + assertEquals(Integer.toString(0), Integer.toString(accessor.getValueLength(6))); + assertEquals(Integer.toString(0), Integer.toString(accessor.getValueLength(7))); + assertEquals(Integer.toString(0), Integer.toString(accessor.getValueLength(8))); + assertEquals(Integer.toString(0), Integer.toString(accessor.getValueLength(9))); + assertArrayEquals(STR1, accessor.get(10)); + assertArrayEquals(STR2, accessor.get(11)); + assertEquals(Integer.toString(0), Integer.toString(accessor.getValueLength(12))); + assertEquals(Integer.toString(0), Integer.toString(accessor.getValueLength(13))); + assertEquals(Integer.toString(0), 
Integer.toString(accessor.getValueLength(14))); + + /* Check offsets */ + final UInt4Vector.Accessor offsetAccessor = vector.values.offsetVector.getAccessor(); + assertEquals(Integer.toString(0), Integer.toString(offsetAccessor.get(0))); + assertEquals(Integer.toString(6), Integer.toString(offsetAccessor.get(1))); + assertEquals(Integer.toString(16), Integer.toString(offsetAccessor.get(2))); + assertEquals(Integer.toString(21), Integer.toString(offsetAccessor.get(3))); + assertEquals(Integer.toString(30), Integer.toString(offsetAccessor.get(4))); + assertEquals(Integer.toString(34), Integer.toString(offsetAccessor.get(5))); + + assertEquals(Integer.toString(40), Integer.toString(offsetAccessor.get(6))); + assertEquals(Integer.toString(40), Integer.toString(offsetAccessor.get(7))); + assertEquals(Integer.toString(40), Integer.toString(offsetAccessor.get(8))); + assertEquals(Integer.toString(40), Integer.toString(offsetAccessor.get(9))); + assertEquals(Integer.toString(40), Integer.toString(offsetAccessor.get(10))); + + assertEquals(Integer.toString(46), Integer.toString(offsetAccessor.get(11))); + assertEquals(Integer.toString(56), Integer.toString(offsetAccessor.get(12))); + + assertEquals(Integer.toString(56), Integer.toString(offsetAccessor.get(13))); + assertEquals(Integer.toString(56), Integer.toString(offsetAccessor.get(14))); + assertEquals(Integer.toString(56), Integer.toString(offsetAccessor.get(15))); } } + public static void setBytes(int index, byte[] bytes, NullableVarCharVector vector) { + final int currentOffset = vector.values.offsetVector.getAccessor().get(index); + + vector.bits.getMutator().setToOne(index); + vector.values.offsetVector.getMutator().set(index + 1, currentOffset + bytes.length); + vector.values.data.setBytes(currentOffset, bytes, 0, bytes.length); + } } diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 224147d8b5c3b..6ff66462958ef 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -220,6 +220,12 @@ 
include_directories(SYSTEM find_package(Arrow REQUIRED) include_directories(SYSTEM ${ARROW_INCLUDE_DIR}) +## Plasma +find_package(Plasma) +if (PLASMA_FOUND) + include_directories(SYSTEM ${PLASMA_INCLUDE_DIR}) +endif() + function(bundle_arrow_lib library_path) get_filename_component(LIBRARY_DIR ${${library_path}} DIRECTORY) get_filename_component(LIBRARY_NAME ${${library_path}} NAME_WE) @@ -252,6 +258,9 @@ if (PYARROW_BUNDLE_ARROW_CPP) file(COPY ${ARROW_INCLUDE_DIR}/arrow DESTINATION ${BUILD_OUTPUT_ROOT_DIRECTORY}/include) bundle_arrow_lib(ARROW_SHARED_LIB) bundle_arrow_lib(ARROW_PYTHON_SHARED_LIB) + if (PLASMA_FOUND) + bundle_arrow_lib(PLASMA_SHARED_LIB) + endif() endif() if (MSVC) @@ -278,9 +287,14 @@ set(CYTHON_EXTENSIONS lib ) +if (PLASMA_FOUND) + set(CYTHON_EXTENSIONS ${CYTHON_EXTENSIONS} plasma) +endif() + set(LINK_LIBS arrow_shared arrow_python_shared + ${PLASMA_SHARED_LIB} ) if (PYARROW_BUILD_PARQUET) @@ -379,3 +393,7 @@ foreach(module ${CYTHON_EXTENSIONS}) target_link_libraries(${module_name} ${LINK_LIBS}) endforeach(module) + +if (PLASMA_FOUND) + file(COPY ${PLASMA_EXECUTABLE} DESTINATION ${BUILD_OUTPUT_ROOT_DIRECTORY}) +endif() diff --git a/python/cmake_modules/FindPlasma.cmake b/python/cmake_modules/FindPlasma.cmake new file mode 100644 index 0000000000000..3acaa348bffa8 --- /dev/null +++ b/python/cmake_modules/FindPlasma.cmake @@ -0,0 +1,99 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# - Find PLASMA (plasma/client.h, libplasma.a, libplasma.so) +# This module defines +# PLASMA_INCLUDE_DIR, directory containing headers +# PLASMA_LIBS, directory containing plasma libraries +# PLASMA_STATIC_LIB, path to libplasma.a +# PLASMA_SHARED_LIB, path to libplasma's shared library +# PLASMA_SHARED_IMP_LIB, path to libplasma's import library (MSVC only) +# PLASMA_FOUND, whether plasma has been found + +include(FindPkgConfig) + +if ("$ENV{ARROW_HOME}" STREQUAL "") + pkg_check_modules(PLASMA plasma) + if (PLASMA_FOUND) + pkg_get_variable(PLASMA_EXECUTABLE plasma executable) + pkg_get_variable(PLASMA_ABI_VERSION plasma abi_version) + message(STATUS "Plasma ABI version: ${PLASMA_ABI_VERSION}") + pkg_get_variable(PLASMA_SO_VERSION plasma so_version) + message(STATUS "Plasma SO version: ${PLASMA_SO_VERSION}") + set(PLASMA_INCLUDE_DIR ${PLASMA_INCLUDE_DIRS}) + set(PLASMA_LIBS ${PLASMA_LIBRARY_DIRS}) + set(PLASMA_SEARCH_LIB_PATH ${PLASMA_LIBRARY_DIRS}) + endif() +else() + set(PLASMA_HOME "$ENV{ARROW_HOME}") + + set(PLASMA_EXECUTABLE ${PLASMA_HOME}/bin/plasma_store) + + set(PLASMA_SEARCH_HEADER_PATHS + ${PLASMA_HOME}/include + ) + + set(PLASMA_SEARCH_LIB_PATH + ${PLASMA_HOME}/lib + ) + + find_path(PLASMA_INCLUDE_DIR plasma/client.h PATHS + ${PLASMA_SEARCH_HEADER_PATHS} + # make sure we don't accidentally pick up a different version + NO_DEFAULT_PATH + ) +endif() + +find_library(PLASMA_LIB_PATH NAMES plasma + PATHS + ${PLASMA_SEARCH_LIB_PATH} + NO_DEFAULT_PATH) +get_filename_component(PLASMA_LIBS ${PLASMA_LIB_PATH} DIRECTORY) + +if (PLASMA_INCLUDE_DIR AND 
PLASMA_LIBS) + set(PLASMA_FOUND TRUE) + set(PLASMA_LIB_NAME plasma) + + set(PLASMA_STATIC_LIB ${PLASMA_LIBS}/lib${PLASMA_LIB_NAME}.a) + + set(PLASMA_SHARED_LIB ${PLASMA_LIBS}/lib${PLASMA_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}) +endif() + +if (PLASMA_FOUND) + if (NOT Plasma_FIND_QUIETLY) + message(STATUS "Found the Plasma core library: ${PLASMA_LIB_PATH}") + message(STATUS "Found Plasma executable: ${PLASMA_EXECUTABLE}") + endif () +else () + if (NOT Plasma_FIND_QUIETLY) + set(PLASMA_ERR_MSG "Could not find the Plasma library. Looked for headers") + set(PLASMA_ERR_MSG "${PLASMA_ERR_MSG} in ${PLASMA_SEARCH_HEADER_PATHS}, and for libs") + set(PLASMA_ERR_MSG "${PLASMA_ERR_MSG} in ${PLASMA_SEARCH_LIB_PATH}") + if (Plasma_FIND_REQUIRED) + message(FATAL_ERROR "${PLASMA_ERR_MSG}") + else (Plasma_FIND_REQUIRED) + message(STATUS "${PLASMA_ERR_MSG}") + endif (Plasma_FIND_REQUIRED) + endif () + set(PLASMA_FOUND FALSE) +endif () + +mark_as_advanced( + PLASMA_INCLUDE_DIR + PLASMA_STATIC_LIB + PLASMA_SHARED_LIB +) diff --git a/python/doc/source/api.rst b/python/doc/source/api.rst index c52d400cef1c7..780aa4839610f 100644 --- a/python/doc/source/api.rst +++ b/python/doc/source/api.rst @@ -212,6 +212,21 @@ Type Classes Field Schema +.. currentmodule:: pyarrow.plasma + +.. _api.plasma: + +In-Memory Object Store +---------------------- + +.. autosummary:: + :toctree: generated/ + + ObjectID + PlasmaClient + PlasmaBuffer + MutablePlasmaBuffer + .. currentmodule:: pyarrow.parquet .. 
_api.parquet: diff --git a/python/manylinux1/build_arrow.sh b/python/manylinux1/build_arrow.sh index 8c6bda9550e87..85c096a5c11d0 100755 --- a/python/manylinux1/build_arrow.sh +++ b/python/manylinux1/build_arrow.sh @@ -35,6 +35,7 @@ cd /arrow/python # PyArrow build configuration export PYARROW_BUILD_TYPE='release' export PYARROW_WITH_PARQUET=1 +export PYARROW_WITH_PLASMA=1 export PYARROW_BUNDLE_ARROW_CPP=1 # Need as otherwise arrow_io is sometimes not linked export LDFLAGS="-Wl,--no-as-needed" @@ -52,7 +53,7 @@ for PYTHON in ${PYTHON_VERSIONS}; do ARROW_BUILD_DIR=/arrow/cpp/build-PY${PYTHON} mkdir -p "${ARROW_BUILD_DIR}" pushd "${ARROW_BUILD_DIR}" - PATH="$(cpython_path $PYTHON)/bin:$PATH" cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/arrow-dist -DARROW_BUILD_TESTS=OFF -DARROW_BUILD_SHARED=ON -DARROW_BOOST_USE_SHARED=OFF -DARROW_JEMALLOC=ON -DARROW_RPATH_ORIGIN=ON -DARROW_JEMALLOC_USE_SHARED=OFF -DARROW_PYTHON=ON -DPythonInterp_FIND_VERSION=${PYTHON} .. + PATH="$(cpython_path $PYTHON)/bin:$PATH" cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/arrow-dist -DARROW_BUILD_TESTS=OFF -DARROW_BUILD_SHARED=ON -DARROW_BOOST_USE_SHARED=OFF -DARROW_JEMALLOC=ON -DARROW_RPATH_ORIGIN=ON -DARROW_JEMALLOC_USE_SHARED=OFF -DARROW_PYTHON=ON -DPythonInterp_FIND_VERSION=${PYTHON} -DARROW_PLASMA=ON .. 
make -j5 install popd @@ -65,6 +66,7 @@ for PYTHON in ${PYTHON_VERSIONS}; do echo "=== (${PYTHON}) Test the existence of optional modules ===" $PIPI_IO -r requirements.txt PATH="$PATH:$(cpython_path $PYTHON)/bin" $PYTHON_INTERPRETER -c "import pyarrow.parquet" + PATH="$PATH:$(cpython_path $PYTHON)/bin" $PYTHON_INTERPRETER -c "import pyarrow.plasma" echo "=== (${PYTHON}) Tag the wheel with manylinux1 ===" mkdir -p repaired_wheels/ @@ -78,4 +80,3 @@ for PYTHON in ${PYTHON_VERSIONS}; do mv repaired_wheels/*.whl /io/dist done - diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index e3d783aee58b4..6d0ce204382e3 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -68,6 +68,7 @@ Date32Value, Date64Value, TimestampValue) from pyarrow.lib import (HdfsFile, NativeFile, PythonFile, + FixedSizeBufferOutputStream, Buffer, BufferReader, BufferOutputStream, OSFile, MemoryMappedFile, memory_map, frombuffer, @@ -99,7 +100,6 @@ open_file, serialize_pandas, deserialize_pandas) - localfs = LocalFilesystem.get_instance() diff --git a/python/pyarrow/error.pxi b/python/pyarrow/error.pxi index 259aeb074e3c2..8a3f57d209ac0 100644 --- a/python/pyarrow/error.pxi +++ b/python/pyarrow/error.pxi @@ -48,6 +48,18 @@ class ArrowNotImplementedError(NotImplementedError, ArrowException): pass +class PlasmaObjectExists(ArrowException): + pass + + +class PlasmaObjectNonexistent(ArrowException): + pass + + +class PlasmaStoreFull(ArrowException): + pass + + cdef int check_status(const CStatus& status) nogil except -1: if status.ok(): return 0 @@ -66,5 +78,11 @@ cdef int check_status(const CStatus& status) nogil except -1: raise ArrowNotImplementedError(message) elif status.IsTypeError(): raise ArrowTypeError(message) + elif status.IsPlasmaObjectExists(): + raise PlasmaObjectExists(message) + elif status.IsPlasmaObjectNonexistent(): + raise PlasmaObjectNonexistent(message) + elif status.IsPlasmaStoreFull(): + raise PlasmaStoreFull(message) else: raise 
ArrowException(message) diff --git a/python/pyarrow/includes/common.pxd b/python/pyarrow/includes/common.pxd index 3487d48ce9b52..637a133afb02b 100644 --- a/python/pyarrow/includes/common.pxd +++ b/python/pyarrow/includes/common.pxd @@ -50,6 +50,9 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: c_bool IsKeyError() c_bool IsNotImplemented() c_bool IsTypeError() + c_bool IsPlasmaObjectExists() + c_bool IsPlasmaObjectNonexistent() + c_bool IsPlasmaStoreFull() cdef inline object PyObject_to_object(PyObject* o): diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index edf50ad54e787..ffe867b0af0f5 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -148,9 +148,15 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: CLoggingMemoryPool(CMemoryPool*) cdef cppclass CBuffer" arrow::Buffer": + CBuffer(const uint8_t* data, int64_t size) uint8_t* data() int64_t size() shared_ptr[CBuffer] parent() + c_bool is_mutable() const + + cdef cppclass CMutableBuffer" arrow::MutableBuffer"(CBuffer): + CMutableBuffer(const uint8_t* data, int64_t size) + uint8_t* mutable_data() cdef cppclass ResizableBuffer(CBuffer): CStatus Resize(int64_t nbytes) @@ -558,6 +564,9 @@ cdef extern from "arrow/io/memory.h" namespace "arrow::io" nogil: CMockOutputStream() int64_t GetExtentBytesWritten() + cdef cppclass CFixedSizeBufferWriter" arrow::io::FixedSizeBufferWriter"(WriteableFile): + CFixedSizeBufferWriter(const shared_ptr[CBuffer]& buffer) + cdef extern from "arrow/ipc/api.h" namespace "arrow::ipc" nogil: enum MessageType" arrow::ipc::Message::Type": diff --git a/python/pyarrow/io.pxi b/python/pyarrow/io.pxi index 8b213a33053d4..181b0b18a712f 100644 --- a/python/pyarrow/io.pxi +++ b/python/pyarrow/io.pxi @@ -473,6 +473,15 @@ cdef class OSFile(NativeFile): self.wr_file = handle +cdef class FixedSizeBufferOutputStream(NativeFile): + + def __cinit__(self, Buffer buffer): + self.wr_file.reset(new 
CFixedSizeBufferWriter(buffer.buffer)) + self.is_readable = 0 + self.is_writeable = 1 + self.is_open = True + + # ---------------------------------------------------------------------- # Arrow buffers @@ -523,7 +532,10 @@ cdef class Buffer: buffer.len = self.size buffer.ndim = 1 buffer.obj = self - buffer.readonly = 1 + if self.buffer.get().is_mutable(): + buffer.readonly = 0 + else: + buffer.readonly = 1 buffer.shape = self.shape buffer.strides = self.strides buffer.suboffsets = NULL @@ -540,6 +552,15 @@ cdef class Buffer: p[0] = self.buffer.get().data() return self.size + def __getwritebuffer__(self, Py_ssize_t idx, void **p): + if not self.buffer.get().is_mutable(): + raise SystemError("trying to write an immutable buffer") + if idx != 0: + raise SystemError("accessing non-existent buffer segment") + if p != NULL: + p[0] = self.buffer.get().data() + return self.size + cdef shared_ptr[PoolBuffer] allocate_buffer(CMemoryPool* pool): cdef shared_ptr[PoolBuffer] result diff --git a/python/pyarrow/plasma.pyx b/python/pyarrow/plasma.pyx new file mode 100644 index 0000000000000..8aaca9963c131 --- /dev/null +++ b/python/pyarrow/plasma.pyx @@ -0,0 +1,566 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# cython: profile=False +# distutils: language = c++ +# cython: embedsignature = True + +from libcpp cimport bool as c_bool, nullptr +from libcpp.memory cimport shared_ptr, unique_ptr, make_shared +from libcpp.string cimport string as c_string +from libcpp.vector cimport vector as c_vector +from libc.stdint cimport int64_t, uint8_t, uintptr_t +from cpython.pycapsule cimport * + +from pyarrow.lib cimport Buffer, NativeFile, check_status +from pyarrow.includes.libarrow cimport (CMutableBuffer, CBuffer, + CFixedSizeBufferWriter, CStatus) + + +PLASMA_WAIT_TIMEOUT = 2 ** 30 + + +cdef extern from "plasma/common.h" nogil: + + cdef cppclass CUniqueID" plasma::UniqueID": + + @staticmethod + CUniqueID from_binary(const c_string& binary) + + c_bool operator==(const CUniqueID& rhs) const + + c_string hex() const + + c_string binary() const + + cdef struct CObjectRequest" plasma::ObjectRequest": + CUniqueID object_id + int type + int status + + +cdef extern from "plasma/common.h": + cdef int64_t kDigestSize" plasma::kDigestSize" + + cdef enum ObjectRequestType: + PLASMA_QUERY_LOCAL"plasma::PLASMA_QUERY_LOCAL", + PLASMA_QUERY_ANYWHERE"plasma::PLASMA_QUERY_ANYWHERE" + + cdef int ObjectStatusLocal"plasma::ObjectStatusLocal"; + cdef int ObjectStatusRemote"plasma::ObjectStatusRemote"; + +cdef extern from "plasma/client.h" nogil: + + cdef cppclass CPlasmaClient" plasma::PlasmaClient": + + CPlasmaClient() + + CStatus Connect(const c_string& store_socket_name, + const c_string& manager_socket_name, int release_delay) + + CStatus Create(const CUniqueID& object_id, int64_t data_size, + const uint8_t* metadata, int64_t metadata_size, + uint8_t** data) + + CStatus Get(const CUniqueID* object_ids, int64_t num_objects, + int64_t timeout_ms, CObjectBuffer* object_buffers) + + CStatus Seal(const CUniqueID& object_id) + + CStatus Evict(int64_t num_bytes, int64_t& num_bytes_evicted) + + CStatus Hash(const CUniqueID& object_id, uint8_t* digest) + + CStatus Release(const CUniqueID& object_id) + 
+ CStatus Contains(const CUniqueID& object_id, c_bool* has_object) + + CStatus Subscribe(int* fd) + + CStatus GetNotification(int fd, CUniqueID* object_id, + int64_t* data_size, int64_t* metadata_size) + + CStatus Disconnect() + + CStatus Fetch(int num_object_ids, const CUniqueID* object_ids) + + CStatus Wait(int64_t num_object_requests, CObjectRequest* object_requests, + int num_ready_objects, int64_t timeout_ms, int* num_objects_ready); + + CStatus Transfer(const char* addr, int port, const CUniqueID& object_id) + + +cdef extern from "plasma/client.h" nogil: + + cdef struct CObjectBuffer" plasma::ObjectBuffer": + int64_t data_size + uint8_t* data + int64_t metadata_size + uint8_t* metadata + + +def make_object_id(object_id): + return ObjectID(object_id) + + +cdef class ObjectID: + """ + An ObjectID represents a string of bytes used to identify Plasma objects. + """ + + cdef: + CUniqueID data + + def __cinit__(self, object_id): + self.data = CUniqueID.from_binary(object_id) + + def __richcmp__(ObjectID self, ObjectID object_id, operation): + if operation != 2: + raise ValueError("operation != 2 (only equality is supported)") + return self.data == object_id.data + + def __hash__(self): + return hash(self.data.binary()) + + def __repr__(self): + return "ObjectID(" + self.data.hex().decode() + ")" + + def __reduce__(self): + return (make_object_id, (self.data.binary(),)) + + def binary(self): + """ + Return the binary representation of this ObjectID. + + Returns + ------- + bytes + Binary representation of the ObjectID. + """ + return self.data.binary() + + +cdef class PlasmaBuffer(Buffer): + """ + This is the type returned by calls to get with a PlasmaClient. + + We define our own class instead of directly returning a buffer object so + that we can add a custom destructor which notifies Plasma that the object + is no longer being used, so the memory in the Plasma store backing the + object can potentially be freed. 
+ + Attributes + ---------- + object_id : ObjectID + The ID of the object in the buffer. + client : PlasmaClient + The PlasmaClient that we use to communicate with the store and manager. + """ + + cdef: + ObjectID object_id + PlasmaClient client + + def __cinit__(self, ObjectID object_id, PlasmaClient client): + """ + Initialize a PlasmaBuffer. + """ + self.object_id = object_id + self.client = client + + def __dealloc__(self): + """ + Notify Plasma that the object is no longer needed. + + If the plasma client has been shut down, then don't do anything. + """ + self.client.release(self.object_id) + + +cdef class PlasmaClient: + """ + The PlasmaClient is used to interface with a plasma store and manager. + + The PlasmaClient can ask the PlasmaStore to allocate a new buffer, seal a + buffer, and get a buffer. Buffers are referred to by object IDs, which are + strings. + """ + + cdef: + shared_ptr[CPlasmaClient] client + int notification_fd + c_string store_socket_name + c_string manager_socket_name + + def __cinit__(self): + self.client.reset(new CPlasmaClient()) + self.notification_fd = -1 + self.store_socket_name = "" + self.manager_socket_name = "" + + cdef _get_object_buffers(self, object_ids, int64_t timeout_ms, + c_vector[CObjectBuffer]* result): + cdef c_vector[CUniqueID] ids + cdef ObjectID object_id + for object_id in object_ids: + ids.push_back(object_id.data) + result[0].resize(ids.size()) + with nogil: + check_status(self.client.get().Get(ids.data(), ids.size(), + timeout_ms, result[0].data())) + + cdef _make_plasma_buffer(self, ObjectID object_id, uint8_t* data, + int64_t size): + cdef shared_ptr[CBuffer] buffer + buffer.reset(new CBuffer(data, size)) + result = PlasmaBuffer(object_id, self) + result.init(buffer) + return result + + cdef _make_mutable_plasma_buffer(self, ObjectID object_id, uint8_t* data, + int64_t size): + cdef shared_ptr[CBuffer] buffer + buffer.reset(new CMutableBuffer(data, size)) + result = PlasmaBuffer(object_id, self) + 
result.init(buffer) + return result + + @property + def store_socket_name(self): + return self.store_socket_name.decode() + + @property + def manager_socket_name(self): + return self.manager_socket_name.decode() + + def create(self, ObjectID object_id, int64_t data_size, c_string metadata=b""): + """ + Create a new buffer in the PlasmaStore for a particular object ID. + + The returned buffer is mutable until seal is called. + + Parameters + ---------- + object_id : ObjectID + The object ID used to identify an object. + size : int + The size in bytes of the created buffer. + metadata : bytes + An optional string of bytes encoding whatever metadata the user + wishes to encode. + + Raises + ------ + PlasmaObjectExists + This exception is raised if the object could not be created because + there already is an object with the same ID in the plasma store. + + PlasmaStoreFull: This exception is raised if the object could + not be created because the plasma store is unable to evict + enough objects to create room for it. + """ + cdef uint8_t* data + with nogil: + check_status(self.client.get().Create(object_id.data, data_size, + (metadata.data()), + metadata.size(), &data)) + return self._make_mutable_plasma_buffer(object_id, data, data_size) + + def get(self, object_ids, timeout_ms=-1): + """ + Returns data buffer from the PlasmaStore based on object ID. + + If the object has not been sealed yet, this call will block. The + retrieved buffer is immutable. + + Parameters + ---------- + object_ids : list + A list of ObjectIDs used to identify some objects. + timeout_ms :int + The number of milliseconds that the get call should block before + timing out and returning. Pass -1 if the call should block and 0 + if the call should return immediately. + + Returns + ------- + list + List of PlasmaBuffers for the data associated with the object_ids + and None if the object was not available. 
+ """ + cdef c_vector[CObjectBuffer] object_buffers + self._get_object_buffers(object_ids, timeout_ms, &object_buffers) + result = [] + for i in range(object_buffers.size()): + if object_buffers[i].data_size != -1: + result.append(self._make_plasma_buffer( + object_ids[i], object_buffers[i].data, + object_buffers[i].data_size)) + else: + result.append(None) + return result + + def get_metadata(self, object_ids, timeout_ms=-1): + """ + Returns metadata buffer from the PlasmaStore based on object ID. + + If the object has not been sealed yet, this call will block. The + retrieved buffer is immutable. + + Parameters + ---------- + object_ids : list + A list of ObjectIDs used to identify some objects. + timeout_ms : int + The number of milliseconds that the get call should block before + timing out and returning. Pass -1 if the call should block and 0 + if the call should return immediately. + + Returns + ------- + list + List of PlasmaBuffers for the metadata associated with the + object_ids and None if the object was not available. + """ + cdef c_vector[CObjectBuffer] object_buffers + self._get_object_buffers(object_ids, timeout_ms, &object_buffers) + result = [] + for i in range(object_buffers.size()): + result.append(self._make_plasma_buffer( + object_ids[i], object_buffers[i].metadata, + object_buffers[i].metadata_size)) + return result + + def seal(self, ObjectID object_id): + """ + Seal the buffer in the PlasmaStore for a particular object ID. + + Once a buffer has been sealed, the buffer is immutable and can only be + accessed through get. + + Parameters + ---------- + object_id : ObjectID + A string used to identify an object. + """ + with nogil: + check_status(self.client.get().Seal(object_id.data)) + + def release(self, ObjectID object_id): + """ + Notify Plasma that the object is no longer needed. + + Parameters + ---------- + object_id : ObjectID + A string used to identify an object. 
+ """ + with nogil: + check_status(self.client.get().Release(object_id.data)) + + def contains(self, ObjectID object_id): + """ + Check if the object is present and sealed in the PlasmaStore. + + Parameters + ---------- + object_id : ObjectID + A string used to identify an object. + """ + cdef c_bool is_contained + with nogil: + check_status(self.client.get().Contains(object_id.data, + &is_contained)) + return is_contained + + def hash(self, ObjectID object_id): + """ + Compute the checksum of an object in the object store. + + Parameters + ---------- + object_id : ObjectID + A string used to identify an object. + + Returns + ------- + bytes + A digest string of the object's hash. If the object isn't in the object + store, the string will have length zero. + """ + cdef c_vector[uint8_t] digest = c_vector[uint8_t](kDigestSize) + with nogil: + check_status(self.client.get().Hash(object_id.data, + digest.data())) + return bytes(digest[:]) + + def evict(self, int64_t num_bytes): + """ + Evict some objects to recover some bytes. + + Recover at least num_bytes bytes if possible. + + Parameters + ---------- + num_bytes : int + The number of bytes to attempt to recover. + """ + cdef int64_t num_bytes_evicted = -1 + with nogil: + check_status(self.client.get().Evict(num_bytes, num_bytes_evicted)) + return num_bytes_evicted + + def transfer(self, address, int port, ObjectID object_id): + """ + Transfer local object with id object_id to another plasma instance + + Parameters + ---------- + addr : str + IPv4 address of the plasma instance the object is sent to. + port : int + Port number of the plasma instance the object is sent to. + object_id : str + A string used to identify an object. + """ + cdef c_string addr = address.encode() + with nogil: + check_status(self.client.get().Transfer(addr.c_str(), port, object_id.data)) + + def fetch(self, object_ids): + """ + Fetch the objects with the given IDs from other plasma managers. 
+ + Parameters + ---------- + object_ids : list + A list of strings used to identify the objects. + """ + cdef c_vector[CUniqueID] ids + cdef ObjectID object_id + for object_id in object_ids: + ids.push_back(object_id.data) + with nogil: + check_status(self.client.get().Fetch(ids.size(), ids.data())) + + def wait(self, object_ids, int64_t timeout=PLASMA_WAIT_TIMEOUT, int num_returns=1): + """ + Wait until num_returns objects in object_ids are ready. + Currently, the object ID arguments to wait must be unique. + + Parameters + ---------- + object_ids : list + List of object IDs to wait for. + timeout :int + Return to the caller after timeout milliseconds. + num_returns : int + We are waiting for this number of objects to be ready. + + Returns + ------- + list + List of object IDs that are ready. + list + List of object IDs we might still wait on. + """ + # Check that the object ID arguments are unique. The plasma manager + # currently crashes if given duplicate object IDs. + if len(object_ids) != len(set(object_ids)): + raise Exception("Wait requires a list of unique object IDs.") + cdef int64_t num_object_requests = len(object_ids) + cdef c_vector[CObjectRequest] object_requests = c_vector[CObjectRequest](num_object_requests) + cdef int num_objects_ready = 0 + cdef ObjectID object_id + for i, object_id in enumerate(object_ids): + object_requests[i].object_id = object_id.data + object_requests[i].type = PLASMA_QUERY_ANYWHERE + with nogil: + check_status(self.client.get().Wait(num_object_requests, object_requests.data(), num_returns, timeout, &num_objects_ready)) + cdef int num_to_return = min(num_objects_ready, num_returns); + ready_ids = [] + waiting_ids = set(object_ids) + cdef int num_returned = 0 + for i in range(len(object_ids)): + if num_returned == num_to_return: + break + if object_requests[i].status == ObjectStatusLocal or object_requests[i].status == ObjectStatusRemote: + ready_ids.append(ObjectID(object_requests[i].object_id.binary())) + 
waiting_ids.discard(ObjectID(object_requests[i].object_id.binary())) + num_returned += 1 + return ready_ids, list(waiting_ids) + + def subscribe(self): + """Subscribe to notifications about sealed objects.""" + with nogil: + check_status(self.client.get().Subscribe(&self.notification_fd)) + + def get_next_notification(self): + """ + Get the next notification from the notification socket. + + Returns + ------- + ObjectID + The object ID of the object that was stored. + int + The data size of the object that was stored. + int + The metadata size of the object that was stored. + """ + cdef ObjectID object_id = ObjectID(20 * b"\0") + cdef int64_t data_size + cdef int64_t metadata_size + with nogil: + check_status(self.client.get().GetNotification(self.notification_fd, + &object_id.data, + &data_size, + &metadata_size)) + return object_id, data_size, metadata_size + + def to_capsule(self): + return PyCapsule_New(self.client.get(), "plasma", NULL) + + def disconnect(self): + """ + Disconnect this client from the Plasma store. + """ + with nogil: + check_status(self.client.get().Disconnect()) + +def connect(store_socket_name, manager_socket_name, int release_delay): + """ + Return a new PlasmaClient that is connected a plasma store and + optionally a manager. + + Parameters + ---------- + store_socket_name : str + Name of the socket the plasma store is listening at. + manager_socket_name : str + Name of the socket the plasma manager is listening at. + release_delay : int + The maximum number of objects that the client will keep and + delay releasing (for caching reasons). 
+ """ + cdef PlasmaClient result = PlasmaClient() + result.store_socket_name = store_socket_name.encode() + result.manager_socket_name = manager_socket_name.encode() + with nogil: + check_status(result.client.get().Connect(result.store_socket_name, + result.manager_socket_name, release_delay)) + return result diff --git a/python/pyarrow/tests/conftest.py b/python/pyarrow/tests/conftest.py index 2aeeab7294ccc..21288e4f35e74 100644 --- a/python/pyarrow/tests/conftest.py +++ b/python/pyarrow/tests/conftest.py @@ -18,11 +18,12 @@ from pytest import skip -groups = ['hdfs', 'parquet', 'large_memory'] +groups = ['hdfs', 'parquet', 'plasma', 'large_memory'] defaults = { 'hdfs': False, 'parquet': False, + 'plasma': False, 'large_memory': False } @@ -32,6 +33,11 @@ except ImportError: pass +try: + import pyarrow.plasma as plasma + defaults['plasma'] = True +except ImportError: + pass def pytest_configure(config): pass diff --git a/python/pyarrow/tests/test_plasma.py b/python/pyarrow/tests/test_plasma.py new file mode 100644 index 0000000000000..8f8d5b5ed607b --- /dev/null +++ b/python/pyarrow/tests/test_plasma.py @@ -0,0 +1,683 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import glob +import numpy as np +import os +import pytest +import random +import signal +import subprocess +import sys +import time +import unittest + +import pyarrow as pa +import pandas as pd + +DEFAULT_PLASMA_STORE_MEMORY = 10 ** 9 + +def random_name(): + return str(random.randint(0, 99999999)) + + +def random_object_id(): + import pyarrow.plasma as plasma + return plasma.ObjectID(np.random.bytes(20)) + + +def generate_metadata(length): + metadata = bytearray(length) + if length > 0: + metadata[0] = random.randint(0, 255) + metadata[-1] = random.randint(0, 255) + for _ in range(100): + metadata[random.randint(0, length - 1)] = random.randint(0, 255) + return metadata + + +def write_to_data_buffer(buff, length): + array = np.frombuffer(buff, dtype="uint8") + if length > 0: + array[0] = random.randint(0, 255) + array[-1] = random.randint(0, 255) + for _ in range(100): + array[random.randint(0, length - 1)] = random.randint(0, 255) + + +def create_object_with_id(client, object_id, data_size, metadata_size, + seal=True): + metadata = generate_metadata(metadata_size) + memory_buffer = client.create(object_id, data_size, metadata) + write_to_data_buffer(memory_buffer, data_size) + if seal: + client.seal(object_id) + return memory_buffer, metadata + + +def create_object(client, data_size, metadata_size, seal=True): + object_id = random_object_id() + memory_buffer, metadata = create_object_with_id(client, object_id, + data_size, metadata_size, + seal=seal) + return object_id, memory_buffer, metadata + + +def assert_get_object_equal(unit_test, client1, client2, object_id, + memory_buffer=None, metadata=None): + import pyarrow.plasma as plasma + client1_buff = client1.get([object_id])[0] + client2_buff = client2.get([object_id])[0] + client1_metadata = client1.get_metadata([object_id])[0] + client2_metadata = client2.get_metadata([object_id])[0] + assert 
len(client1_buff) == len(client2_buff) + assert len(client1_metadata) == len(client2_metadata) + # Check that the buffers from the two clients are the same. + assert plasma.buffers_equal(client1_buff, client2_buff) + # Check that the metadata buffers from the two clients are the same. + assert plasma.buffers_equal(client1_metadata, client2_metadata) + # If a reference buffer was provided, check that it is the same as well. + if memory_buffer is not None: + assert plasma.buffers_equal(memory_buffer, client1_buff) + # If reference metadata was provided, check that it is the same as well. + if metadata is not None: + assert plasma.buffers_equal(metadata, client1_metadata) + + +def start_plasma_store(plasma_store_memory=DEFAULT_PLASMA_STORE_MEMORY, + use_valgrind=False, use_profiler=False, + stdout_file=None, stderr_file=None): + """Start a plasma store process. + Args: + use_valgrind (bool): True if the plasma store should be started inside + of valgrind. If this is True, use_profiler must be False. + use_profiler (bool): True if the plasma store should be started inside + a profiler. If this is True, use_valgrind must be False. + stdout_file: A file handle opened for writing to redirect stdout to. If + no redirection should happen, then this should be None. + stderr_file: A file handle opened for writing to redirect stderr to. If + no redirection should happen, then this should be None. + Return: + A tuple of the name of the plasma store socket and the process ID of + the plasma store process. 
+ """ + if use_valgrind and use_profiler: + raise Exception("Cannot use valgrind and profiler at the same time.") + plasma_store_executable = os.path.join(pa.__path__[0], "plasma_store") + plasma_store_name = "/tmp/plasma_store{}".format(random_name()) + command = [plasma_store_executable, + "-s", plasma_store_name, + "-m", str(plasma_store_memory)] + if use_valgrind: + pid = subprocess.Popen(["valgrind", + "--track-origins=yes", + "--leak-check=full", + "--show-leak-kinds=all", + "--leak-check-heuristics=stdstring", + "--error-exitcode=1"] + command, + stdout=stdout_file, stderr=stderr_file) + time.sleep(1.0) + elif use_profiler: + pid = subprocess.Popen(["valgrind", "--tool=callgrind"] + command, + stdout=stdout_file, stderr=stderr_file) + time.sleep(1.0) + else: + pid = subprocess.Popen(command, stdout=stdout_file, stderr=stderr_file) + time.sleep(0.1) + return plasma_store_name, pid + + +@pytest.mark.plasma +class TestPlasmaClient(object): + + def setup_method(self, test_method): + import pyarrow.plasma as plasma + # Start Plasma store. + plasma_store_name, self.p = start_plasma_store( + use_valgrind=os.getenv("PLASMA_VALGRIND") == "1") + # Connect to Plasma. + self.plasma_client = plasma.connect(plasma_store_name, "", 64) + # For the eviction test + self.plasma_client2 = plasma.connect(plasma_store_name, "", 0) + + def teardown_method(self, test_method): + # Check that the Plasma store is still alive. + assert self.p.poll() == None + # Kill the plasma store process. + if os.getenv("PLASMA_VALGRIND") == "1": + self.p.send_signal(signal.SIGTERM) + self.p.wait() + if self.p.returncode != 0: + assert False + else: + self.p.kill() + + def test_create(self): + # Create an object id string. + object_id = random_object_id() + # Create a new buffer and write to it. + length = 50 + memory_buffer = np.frombuffer(self.plasma_client.create(object_id, + length), + dtype="uint8") + for i in range(length): + memory_buffer[i] = i % 256 + # Seal the object. 
+ self.plasma_client.seal(object_id) + # Get the object. + memory_buffer = np.frombuffer(self.plasma_client.get([object_id])[0], + dtype="uint8") + for i in range(length): + assert memory_buffer[i] == i % 256 + + def test_create_with_metadata(self): + for length in range(1000): + # Create an object id string. + object_id = random_object_id() + # Create a random metadata string. + metadata = generate_metadata(length) + # Create a new buffer and write to it. + memory_buffer = np.frombuffer(self.plasma_client.create(object_id, + length, + metadata), + dtype="uint8") + for i in range(length): + memory_buffer[i] = i % 256 + # Seal the object. + self.plasma_client.seal(object_id) + # Get the object. + memory_buffer = np.frombuffer( + self.plasma_client.get([object_id])[0], dtype="uint8") + for i in range(length): + assert memory_buffer[i] == i % 256 + # Get the metadata. + metadata_buffer = np.frombuffer( + self.plasma_client.get_metadata([object_id])[0], dtype="uint8") + assert len(metadata) == len(metadata_buffer) + for i in range(len(metadata)): + assert metadata[i] == metadata_buffer[i] + + def test_create_existing(self): + # This test is partially used to test the code path in which we create + # an object with an ID that already exists + length = 100 + for _ in range(1000): + object_id = random_object_id() + self.plasma_client.create(object_id, length, + generate_metadata(length)) + try: + self.plasma_client.create(object_id, length, + generate_metadata(length)) + # TODO(pcm): Introduce a more specific error type here. + except pa.lib.ArrowException as e: + pass + else: + assert False + + def test_get(self): + num_object_ids = 100 + # Test timing out of get with various timeouts. 
+ for timeout in [0, 10, 100, 1000]: + object_ids = [random_object_id() for _ in range(num_object_ids)] + results = self.plasma_client.get(object_ids, timeout_ms=timeout) + assert results == num_object_ids * [None] + + data_buffers = [] + metadata_buffers = [] + for i in range(num_object_ids): + if i % 2 == 0: + data_buffer, metadata_buffer = create_object_with_id( + self.plasma_client, object_ids[i], 2000, 2000) + data_buffers.append(data_buffer) + metadata_buffers.append(metadata_buffer) + + # Test timing out from some but not all get calls with various + # timeouts. + for timeout in [0, 10, 100, 1000]: + data_results = self.plasma_client.get(object_ids, + timeout_ms=timeout) + # metadata_results = self.plasma_client.get_metadata( + # object_ids, timeout_ms=timeout) + for i in range(num_object_ids): + if i % 2 == 0: + array1 = np.frombuffer(data_buffers[i // 2], dtype="uint8") + array2 = np.frombuffer(data_results[i], dtype="uint8") + np.testing.assert_equal(array1, array2) + # TODO(rkn): We should compare the metadata as well. But + # currently the types are different (e.g., memoryview + # versus bytearray). + # assert plasma.buffers_equal( + # metadata_buffers[i // 2], metadata_results[i]) + else: + assert results[i] is None + + def test_store_arrow_objects(self): + import pyarrow.plasma as plasma + data = np.random.randn(10, 4) + # Write an arrow object. + object_id = random_object_id() + tensor = pa.Tensor.from_numpy(data) + data_size = pa.get_tensor_size(tensor) + buf = self.plasma_client.create(object_id, data_size) + stream = pa.FixedSizeBufferOutputStream(buf) + pa.write_tensor(tensor, stream) + self.plasma_client.seal(object_id) + # Read the arrow object. + [tensor] = self.plasma_client.get([object_id]) + reader = pa.BufferReader(tensor) + array = pa.read_tensor(reader).to_numpy() + # Assert that they are equal. 
+ np.testing.assert_equal(data, array) + + def test_store_pandas_dataframe(self): + import pyarrow.plasma as plasma + d = {'one': pd.Series([1., 2., 3.], index=['a', 'b', 'c']), + 'two': pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])} + df = pd.DataFrame(d) + + # Write the DataFrame. + record_batch = pa.RecordBatch.from_pandas(df) + # Determine the size. + s = pa.MockOutputStream() + stream_writer = pa.RecordBatchStreamWriter(s, record_batch.schema) + stream_writer.write_batch(record_batch) + data_size = s.size() + object_id = plasma.ObjectID(np.random.bytes(20)) + + buf = self.plasma_client.create(object_id, data_size) + stream = pa.FixedSizeBufferOutputStream(buf) + stream_writer = pa.RecordBatchStreamWriter(stream, record_batch.schema) + stream_writer.write_batch(record_batch) + + self.plasma_client.seal(object_id) + + # Read the DataFrame. + [data] = self.plasma_client.get([object_id]) + reader = pa.RecordBatchStreamReader(pa.BufferReader(data)) + result = reader.get_next_batch().to_pandas() + + pd.util.testing.assert_frame_equal(df, result) + + def test_pickle_object_ids(self): + # This can be used for sharing object IDs between processes. + import pickle + object_id = random_object_id() + data = pickle.dumps(object_id) + object_id2 = pickle.loads(data) + assert object_id == object_id2 + + def test_store_full(self): + # The store is started with 1GB, so make sure that create throws an + # exception when it is full. + def assert_create_raises_plasma_full(unit_test, size): + partial_size = np.random.randint(size) + try: + _, memory_buffer, _ = create_object(unit_test.plasma_client, + partial_size, + size - partial_size) + # TODO(pcm): More specific error here. + except pa.lib.ArrowException as e: + pass + else: + # For some reason the above didn't throw an exception, so fail. + assert False + + # Create a list to keep some of the buffers in scope. 
+ memory_buffers = [] + _, memory_buffer, _ = create_object(self.plasma_client, 5 * 10 ** 8, 0) + memory_buffers.append(memory_buffer) + # Remaining space is 5 * 10 ** 8. Make sure that we can't create an + # object of size 5 * 10 ** 8 + 1, but we can create one of size + # 2 * 10 ** 8. + assert_create_raises_plasma_full(self, 5 * 10 ** 8 + 1) + _, memory_buffer, _ = create_object(self.plasma_client, 2 * 10 ** 8, 0) + del memory_buffer + _, memory_buffer, _ = create_object(self.plasma_client, 2 * 10 ** 8, 0) + del memory_buffer + assert_create_raises_plasma_full(self, 5 * 10 ** 8 + 1) + + _, memory_buffer, _ = create_object(self.plasma_client, 2 * 10 ** 8, 0) + memory_buffers.append(memory_buffer) + # Remaining space is 3 * 10 ** 8. + assert_create_raises_plasma_full(self, 3 * 10 ** 8 + 1) + + _, memory_buffer, _ = create_object(self.plasma_client, 10 ** 8, 0) + memory_buffers.append(memory_buffer) + # Remaining space is 2 * 10 ** 8. + assert_create_raises_plasma_full(self, 2 * 10 ** 8 + 1) + + def test_contains(self): + fake_object_ids = [random_object_id() for _ in range(100)] + real_object_ids = [random_object_id() for _ in range(100)] + for object_id in real_object_ids: + assert self.plasma_client.contains(object_id) == False + self.plasma_client.create(object_id, 100) + self.plasma_client.seal(object_id) + assert self.plasma_client.contains(object_id) + for object_id in fake_object_ids: + assert not self.plasma_client.contains(object_id) + for object_id in real_object_ids: + assert self.plasma_client.contains(object_id) + + def test_hash(self): + # Check the hash of an object that doesn't exist. + object_id1 = random_object_id() + try: + self.plasma_client.hash(object_id1) + # TODO(pcm): Introduce a more specific error type here + except pa.lib.ArrowException as e: + pass + else: + assert False + + length = 1000 + # Create a random object, and check that the hash function always + # returns the same value. 
+ metadata = generate_metadata(length) + memory_buffer = np.frombuffer(self.plasma_client.create(object_id1, + length, + metadata), + dtype="uint8") + for i in range(length): + memory_buffer[i] = i % 256 + self.plasma_client.seal(object_id1) + assert (self.plasma_client.hash(object_id1) == + self.plasma_client.hash(object_id1)) + + # Create a second object with the same value as the first, and check + # that their hashes are equal. + object_id2 = random_object_id() + memory_buffer = np.frombuffer(self.plasma_client.create(object_id2, + length, + metadata), + dtype="uint8") + for i in range(length): + memory_buffer[i] = i % 256 + self.plasma_client.seal(object_id2) + assert (self.plasma_client.hash(object_id1) == + self.plasma_client.hash(object_id2)) + + # Create a third object with a different value from the first two, and + # check that its hash is different. + object_id3 = random_object_id() + metadata = generate_metadata(length) + memory_buffer = np.frombuffer(self.plasma_client.create(object_id3, + length, + metadata), + dtype="uint8") + for i in range(length): + memory_buffer[i] = (i + 1) % 256 + self.plasma_client.seal(object_id3) + assert (self.plasma_client.hash(object_id1) != + self.plasma_client.hash(object_id3)) + + # Create a fourth object with the same value as the third, but + # different metadata. Check that its hash is different from any of the + # previous three. 
+ object_id4 = random_object_id() + metadata4 = generate_metadata(length) + memory_buffer = np.frombuffer(self.plasma_client.create(object_id4, + length, + metadata4), + dtype="uint8") + for i in range(length): + memory_buffer[i] = (i + 1) % 256 + self.plasma_client.seal(object_id4) + assert (self.plasma_client.hash(object_id1) != + self.plasma_client.hash(object_id4)) + assert (self.plasma_client.hash(object_id3) != + self.plasma_client.hash(object_id4)) + + def test_many_hashes(self): + hashes = [] + length = 2 ** 10 + + for i in range(256): + object_id = random_object_id() + memory_buffer = np.frombuffer(self.plasma_client.create(object_id, + length), + dtype="uint8") + for j in range(length): + memory_buffer[j] = i + self.plasma_client.seal(object_id) + hashes.append(self.plasma_client.hash(object_id)) + + # Create objects of varying length. Each pair has two bits different. + for i in range(length): + object_id = random_object_id() + memory_buffer = np.frombuffer(self.plasma_client.create(object_id, + length), + dtype="uint8") + for j in range(length): + memory_buffer[j] = 0 + memory_buffer[i] = 1 + self.plasma_client.seal(object_id) + hashes.append(self.plasma_client.hash(object_id)) + + # Create objects of varying length, all with value 0. + for i in range(length): + object_id = random_object_id() + memory_buffer = np.frombuffer(self.plasma_client.create(object_id, + i), + dtype="uint8") + for j in range(i): + memory_buffer[j] = 0 + self.plasma_client.seal(object_id) + hashes.append(self.plasma_client.hash(object_id)) + + # Check that all hashes were unique. + assert len(set(hashes)) == 256 + length + length + + # def test_individual_delete(self): + # length = 100 + # # Create an object id string. + # object_id = random_object_id() + # # Create a random metadata string. + # metadata = generate_metadata(100) + # # Create a new buffer and write to it. 
+ # memory_buffer = self.plasma_client.create(object_id, length, + # metadata) + # for i in range(length): + # memory_buffer[i] = chr(i % 256) + # # Seal the object. + # self.plasma_client.seal(object_id) + # # Check that the object is present. + # assert self.plasma_client.contains(object_id) + # # Delete the object. + # self.plasma_client.delete(object_id) + # # Make sure the object is no longer present. + # self.assertFalse(self.plasma_client.contains(object_id)) + # + # def test_delete(self): + # # Create some objects. + # object_ids = [random_object_id() for _ in range(100)] + # for object_id in object_ids: + # length = 100 + # # Create a random metadata string. + # metadata = generate_metadata(100) + # # Create a new buffer and write to it. + # memory_buffer = self.plasma_client.create(object_id, length, + # metadata) + # for i in range(length): + # memory_buffer[i] = chr(i % 256) + # # Seal the object. + # self.plasma_client.seal(object_id) + # # Check that the object is present. + # assert self.plasma_client.contains(object_id) + # + # # Delete the objects and make sure they are no longer present. + # for object_id in object_ids: + # # Delete the object. + # self.plasma_client.delete(object_id) + # # Make sure the object is no longer present. + # self.assertFalse(self.plasma_client.contains(object_id)) + + def test_illegal_functionality(self): + # Create an object id string. + object_id = random_object_id() + # Create a new buffer and write to it. + length = 1000 + memory_buffer = self.plasma_client.create(object_id, length) + # Make sure we cannot access memory out of bounds. + with pytest.raises(Exception): + memory_buffer[length] + # Seal the object. + self.plasma_client.seal(object_id) + # This test is commented out because it currently fails. + # # Make sure the object is ready only now. + # def illegal_assignment(): + # memory_buffer[0] = chr(0) + # with pytest.raises(Exception): + # illegal_assignment() + # Get the object. 
+ memory_buffer = self.plasma_client.get([object_id])[0] + + # Make sure the object is read only. + def illegal_assignment(): + memory_buffer[0] = chr(0) + with pytest.raises(Exception): + illegal_assignment() + + def test_evict(self): + client = self.plasma_client2 + object_id1 = random_object_id() + b1 = client.create(object_id1, 1000) + client.seal(object_id1) + del b1 + assert client.evict(1) == 1000 + + object_id2 = random_object_id() + object_id3 = random_object_id() + b2 = client.create(object_id2, 999) + b3 = client.create(object_id3, 998) + client.seal(object_id3) + del b3 + assert client.evict(1000) == 998 + + object_id4 = random_object_id() + b4 = client.create(object_id4, 997) + client.seal(object_id4) + del b4 + client.seal(object_id2) + del b2 + assert client.evict(1) == 997 + assert client.evict(1) == 999 + + object_id5 = random_object_id() + object_id6 = random_object_id() + object_id7 = random_object_id() + b5 = client.create(object_id5, 996) + b6 = client.create(object_id6, 995) + b7 = client.create(object_id7, 994) + client.seal(object_id5) + client.seal(object_id6) + client.seal(object_id7) + del b5 + del b6 + del b7 + assert client.evict(2000) == 996 + 995 + 994 + + def test_subscribe(self): + # Subscribe to notifications from the Plasma Store. + self.plasma_client.subscribe() + for i in [1, 10, 100, 1000, 10000]: + object_ids = [random_object_id() for _ in range(i)] + metadata_sizes = [np.random.randint(1000) for _ in range(i)] + data_sizes = [np.random.randint(1000) for _ in range(i)] + for j in range(i): + self.plasma_client.create( + object_ids[j], data_sizes[j], + metadata=bytearray(np.random.bytes(metadata_sizes[j]))) + self.plasma_client.seal(object_ids[j]) + # Check that we received notifications for all of the objects. 
+ for j in range(i): + notification_info = self.plasma_client.get_next_notification() + recv_objid, recv_dsize, recv_msize = notification_info + assert object_ids[j] == recv_objid + assert data_sizes[j] == recv_dsize + assert metadata_sizes[j] == recv_msize + + def test_subscribe_deletions(self): + # Subscribe to notifications from the Plasma Store. We use + # plasma_client2 to make sure that all used objects will get evicted + # properly. + self.plasma_client2.subscribe() + for i in [1, 10, 100, 1000, 10000]: + object_ids = [random_object_id() for _ in range(i)] + # Add 1 to the sizes to make sure we have nonzero object sizes. + metadata_sizes = [np.random.randint(1000) + 1 for _ in range(i)] + data_sizes = [np.random.randint(1000) + 1 for _ in range(i)] + for j in range(i): + x = self.plasma_client2.create( + object_ids[j], data_sizes[j], + metadata=bytearray(np.random.bytes(metadata_sizes[j]))) + self.plasma_client2.seal(object_ids[j]) + del x + # Check that we received notifications for creating all of the + # objects. + for j in range(i): + notification_info = self.plasma_client2.get_next_notification() + recv_objid, recv_dsize, recv_msize = notification_info + assert object_ids[j] == recv_objid + assert data_sizes[j] == recv_dsize + assert metadata_sizes[j] == recv_msize + + # Check that we receive notifications for deleting all objects, as + # we evict them. + for j in range(i): + assert (self.plasma_client2.evict(1) == + data_sizes[j] + metadata_sizes[j]) + notification_info = self.plasma_client2.get_next_notification() + recv_objid, recv_dsize, recv_msize = notification_info + assert object_ids[j] == recv_objid + assert -1 == recv_dsize + assert -1 == recv_msize + + # Test multiple deletion notifications. The first 9 object IDs have + # size 0, and the last has a nonzero size. When Plasma evicts 1 byte, + # it will evict all objects, so we should receive deletion + # notifications for each. 
+ num_object_ids = 10 + object_ids = [random_object_id() for _ in range(num_object_ids)] + metadata_sizes = [0] * (num_object_ids - 1) + data_sizes = [0] * (num_object_ids - 1) + metadata_sizes.append(np.random.randint(1000)) + data_sizes.append(np.random.randint(1000)) + for i in range(num_object_ids): + x = self.plasma_client2.create( + object_ids[i], data_sizes[i], + metadata=bytearray(np.random.bytes(metadata_sizes[i]))) + self.plasma_client2.seal(object_ids[i]) + del x + for i in range(num_object_ids): + notification_info = self.plasma_client2.get_next_notification() + recv_objid, recv_dsize, recv_msize = notification_info + assert object_ids[i] == recv_objid + assert data_sizes[i] == recv_dsize + assert metadata_sizes[i] == recv_msize + assert (self.plasma_client2.evict(1) == + data_sizes[-1] + metadata_sizes[-1]) + for i in range(num_object_ids): + notification_info = self.plasma_client2.get_next_notification() + recv_objid, recv_dsize, recv_msize = notification_info + assert object_ids[i] == recv_objid + assert -1 == recv_dsize + assert -1 == recv_msize diff --git a/python/setup.py b/python/setup.py index 1ea57ae2d858d..7425b71916001 100644 --- a/python/setup.py +++ b/python/setup.py @@ -99,6 +99,10 @@ def initialize_options(self): self.with_parquet = strtobool( os.environ.get('PYARROW_WITH_PARQUET', '0')) + self.with_plasma = strtobool( + os.environ.get('PYARROW_WITH_PLASMA', '0')) + if self.with_plasma and "plasma" not in self.CYTHON_MODULE_NAMES: + self.CYTHON_MODULE_NAMES.append("plasma") self.bundle_arrow_cpp = strtobool( os.environ.get('PYARROW_BUNDLE_ARROW_CPP', '0')) @@ -242,6 +246,8 @@ def move_lib(lib_name): shutil.move(pjoin(build_prefix, 'include'), pjoin(build_lib, 'pyarrow')) move_lib("arrow") move_lib("arrow_python") + if self.with_plasma: + move_lib("plasma") if self.with_parquet: move_lib("parquet") @@ -270,11 +276,20 @@ def move_lib(lib_name): shutil.move(self.get_ext_built_api_header(name), pjoin(os.path.dirname(ext_path), name + 
'_api.h')) + # Move the plasma store + if self.with_plasma: + build_py = self.get_finalized_command('build_py') + source = os.path.join(self.build_type, "plasma_store") + target = os.path.join(build_lib, build_py.get_package_dir('pyarrow'), "plasma_store") + shutil.move(source, target) + os.chdir(saved_cwd) def _failure_permitted(self, name): if name == '_parquet' and not self.with_parquet: return True + if name == 'plasma' and not self.with_plasma: + return True return False def _get_inplace_dir(self):