-
Notifications
You must be signed in to change notification settings - Fork 3.6k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
GH-43129: [C++][Compute] Fix the unnecessary allocation of extra byte…
…s when encoding row table (#43125) ### Rationale for this change As described in #43129, the current row table occupies more memory than expected: the memory consumption is double what is necessary. The reason is described below. When encoding var-length columns into the row table: https://github.com/apache/arrow/blob/e59832fb05dc40a85fa63297c77c8f134c9ac8e0/cpp/src/arrow/compute/row/encode_internal.cc#L155-L162 We first call `AppendEmpty` to reserve space for `x` rows but `0` bytes. This is to reserve enough size for the underlying fixed-length buffers: null masks and offsets (for var-length columns). Then we call `GetRowOffsetsSelected` to populate the offsets. Finally, we call `AppendEmpty` again with `0` rows but `y` bytes, where `y` is the last offset element, which is essentially the whole size of the var-length columns. All of this sounds reasonable so far. However, `AppendEmpty` calls `ResizeOptionalVaryingLengthBuffer`, in which: https://github.com/apache/arrow/blob/e59832fb05dc40a85fa63297c77c8f134c9ac8e0/cpp/src/arrow/compute/row/row_internal.cc#L294-L303 We calculate `bytes_capacity_new` by repeatedly doubling it until it is big enough for `num_bytes + num_extra_bytes`. Note that at this point `num_bytes == offsets()[num_rows_]` is already `y`, while `num_extra_bytes` is also `y`, hence the buffer ends up twice as large as necessary. ### What changes are included in this PR? Fix the wasted half of the buffer size in the row table. Also add tests to make sure the buffer size is as expected. ### Are these changes tested? UT included. ### Are there any user-facing changes? None. * GitHub Issue: #43129 Authored-by: Ruoxi Sun <[email protected]> Signed-off-by: Antoine Pitrou <[email protected]>
- Loading branch information
1 parent
031497d
commit 3b7ad9d
Showing
5 changed files
with
161 additions
and
12 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,129 @@ | ||
// Licensed to the Apache Software Foundation (ASF) under one | ||
// or more contributor license agreements. See the NOTICE file | ||
// distributed with this work for additional information | ||
// regarding copyright ownership. The ASF licenses this file | ||
// to you under the Apache License, Version 2.0 (the | ||
// "License"); you may not use this file except in compliance | ||
// with the License. You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, | ||
// software distributed under the License is distributed on an | ||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
// KIND, either express or implied. See the License for the | ||
// specific language governing permissions and limitations | ||
// under the License. | ||
|
||
#include <numeric> | ||
|
||
#include "arrow/compute/row/encode_internal.h" | ||
#include "arrow/compute/row/row_internal.h" | ||
#include "arrow/testing/generator.h" | ||
#include "arrow/testing/gtest_util.h" | ||
|
||
namespace arrow { | ||
namespace compute { | ||
|
||
namespace { | ||
|
||
// Encodes the first `num_rows` values of `column` into a freshly initialized
// row table and returns that table.
//
// `row_alignment` and `string_alignment` are forwarded unchanged to both the
// row table metadata and the row encoder. Any non-OK status produced while
// building the column descriptions, initializing the table, or encoding is
// propagated to the caller.
Result<RowTableImpl> MakeRowTableFromColumn(const std::shared_ptr<Array>& column,
                                            int64_t num_rows, int row_alignment,
                                            int string_alignment) {
  DCHECK_GE(column->length(), num_rows);
  MemoryPool* memory_pool = default_memory_pool();

  // Wrap the single column in an ExecBatch so that the batch-based helpers
  // can derive the column arrays and column metadata from it.
  std::vector<KeyColumnArray> key_columns;
  std::vector<Datum> datums{column};
  ExecBatch exec_batch(std::move(datums), num_rows);
  RETURN_NOT_OK(ColumnArraysFromExecBatch(exec_batch, &key_columns));

  std::vector<KeyColumnMetadata> key_column_metadatas;
  RETURN_NOT_OK(ColumnMetadatasFromExecBatch(exec_batch, &key_column_metadatas));

  RowTableMetadata row_metadata;
  row_metadata.FromColumnMetadataVector(key_column_metadatas, row_alignment,
                                        string_alignment);

  RowTableImpl encoded_rows;
  RETURN_NOT_OK(encoded_rows.Init(memory_pool, row_metadata));

  RowTableEncoder encoder;
  encoder.Init(key_column_metadatas, row_alignment, string_alignment);
  encoder.PrepareEncodeSelected(0, num_rows, key_columns);

  // Select every row in order: 0, 1, ..., num_rows - 1.
  std::vector<uint16_t> selection(num_rows);
  std::iota(selection.begin(), selection.end(), 0);

  const auto num_selected = static_cast<uint32_t>(num_rows);
  RETURN_NOT_OK(encoder.EncodeSelected(&encoded_rows, num_selected, selection.data()));

  return encoded_rows;
}
|
||
} // namespace | ||
|
||
// GH-43129: Guard against over-allocation in the row table. Each buffer is
// checked (after subtracting a fixed padding of 64 bytes) to be at least as
// large as the data it must hold, and strictly smaller than twice that size,
// i.e. the growth factor of 2 must not be applied more often than needed.
TEST(RowTableMemoryConsumption, Encode) {
  constexpr int64_t kMaxNumRows = 8192;
  constexpr int64_t kVectorPadding = 64;

  // Constant columns long enough to serve every row count exercised below.
  ASSERT_OK_AND_ASSIGN(
      auto uint32_column,
      ::arrow::gen::Constant(std::make_shared<UInt32Scalar>(0))->Generate(kMaxNumRows));
  ASSERT_OK_AND_ASSIGN(
      auto binary_column,
      ::arrow::gen::Constant(std::make_shared<BinaryScalar>("X"))->Generate(kMaxNumRows));

  // Row counts just below, exactly at, and just above two powers of two.
  for (int64_t row_count : {1023, 1024, 1025, 4095, 4096, 4097}) {
    // Fixed length column.
    {
      SCOPED_TRACE("encoding fixed length column of " + std::to_string(row_count) +
                   " rows");
      const auto value_width = uint32()->byte_width();
      ASSERT_OK_AND_ASSIGN(
          auto table, MakeRowTableFromColumn(uint32_column, row_count, value_width, 0));
      // Buffers 0 and 1 are expected to be allocated; buffer 2 is expected to
      // be absent for a fixed length column.
      ASSERT_NE(table.data(0), NULLPTR);
      ASSERT_NE(table.data(1), NULLPTR);
      ASSERT_EQ(table.data(2), NULLPTR);

      // Buffer 0 is compared against the null mask size.
      const auto null_masks_capacity = table.buffer_size(0) - kVectorPadding;
      int64_t null_masks_needed = row_count * table.metadata().null_masks_bytes_per_row;
      ASSERT_LE(null_masks_needed, null_masks_capacity);
      ASSERT_GT(null_masks_needed * 2, null_masks_capacity);

      // Buffer 1 is compared against the size of the fixed length rows.
      const auto rows_capacity = table.buffer_size(1) - kVectorPadding;
      int64_t rows_needed = row_count * value_width;
      ASSERT_LE(rows_needed, rows_capacity);
      ASSERT_GT(rows_needed * 2, rows_capacity);
    }

    // Var length column.
    {
      SCOPED_TRACE("encoding var length column of " + std::to_string(row_count) + " rows");
      ASSERT_OK_AND_ASSIGN(auto table,
                           MakeRowTableFromColumn(binary_column, row_count, 4, 4));
      // All three buffers are expected to be allocated for a var length column.
      ASSERT_NE(table.data(0), NULLPTR);
      ASSERT_NE(table.data(1), NULLPTR);
      ASSERT_NE(table.data(2), NULLPTR);

      // Buffer 0 is compared against the null mask size.
      const auto null_masks_capacity = table.buffer_size(0) - kVectorPadding;
      int64_t null_masks_needed = row_count * table.metadata().null_masks_bytes_per_row;
      ASSERT_LE(null_masks_needed, null_masks_capacity);
      ASSERT_GT(null_masks_needed * 2, null_masks_capacity);

      // Buffer 1 is compared against one uint32_t offset per row.
      const auto offsets_capacity = table.buffer_size(1) - kVectorPadding;
      int64_t offsets_needed = row_count * sizeof(uint32_t);
      ASSERT_LE(offsets_needed, offsets_capacity);
      ASSERT_GT(offsets_needed * 2, offsets_capacity);

      // Buffer 2 is compared against row_count times offsets()[1]; the column
      // is constant, so offsets()[1] is used as the per-row size.
      const auto rows_capacity = table.buffer_size(2) - kVectorPadding;
      int64_t rows_needed = row_count * table.offsets()[1];
      ASSERT_LE(rows_needed, rows_capacity);
      ASSERT_GT(rows_needed * 2, rows_capacity);
    }
  }
}
|
||
} // namespace compute | ||
} // namespace arrow |