Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/main' into apachegh-41480-pyar…
Browse files Browse the repository at this point in the history
…row-build-config
  • Loading branch information
jorisvandenbossche committed May 8, 2024
2 parents c3a2136 + 5385926 commit f9b2b3c
Show file tree
Hide file tree
Showing 31 changed files with 351 additions and 370 deletions.
3 changes: 2 additions & 1 deletion cpp/src/arrow/compute/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,8 @@ add_arrow_test(internals_test
light_array_test.cc
registry_test.cc
key_hash_test.cc
row/compare_test.cc)
row/compare_test.cc
row/grouper_test.cc)

add_arrow_compute_test(expression_test SOURCES expression_test.cc)

Expand Down
41 changes: 18 additions & 23 deletions cpp/src/arrow/compute/row/compare_internal.cc
Original file line number Diff line number Diff line change
Expand Up @@ -36,22 +36,22 @@ void KeyCompare::NullUpdateColumnToRow(uint32_t id_col, uint32_t num_rows_to_com
const uint32_t* left_to_right_map,
LightContext* ctx, const KeyColumnArray& col,
const RowTableImpl& rows,
uint8_t* match_bytevector,
bool are_cols_in_encoding_order) {
bool are_cols_in_encoding_order,
uint8_t* match_bytevector) {
if (!rows.has_any_nulls(ctx) && !col.data(0)) {
return;
}
uint32_t num_processed = 0;
#if defined(ARROW_HAVE_RUNTIME_AVX2)
if (ctx->has_avx2()) {
num_processed = NullUpdateColumnToRow_avx2(use_selection, id_col, num_rows_to_compare,
sel_left_maybe_null, left_to_right_map,
ctx, col, rows, match_bytevector);
num_processed = NullUpdateColumnToRow_avx2(
use_selection, id_col, num_rows_to_compare, sel_left_maybe_null,
left_to_right_map, ctx, col, rows, are_cols_in_encoding_order, match_bytevector);
}
#endif

uint32_t null_bit_id =
are_cols_in_encoding_order ? id_col : rows.metadata().pos_after_encoding(id_col);
const uint32_t null_bit_id =
ColIdInEncodingOrder(rows, id_col, are_cols_in_encoding_order);

if (!col.data(0)) {
// Remove rows from the result for which the column value is a null
Expand Down Expand Up @@ -363,10 +363,9 @@ void KeyCompare::CompareColumnsToRows(
continue;
}

uint32_t offset_within_row = rows.metadata().encoded_field_offset(
are_cols_in_encoding_order
? static_cast<uint32_t>(icol)
: rows.metadata().pos_after_encoding(static_cast<uint32_t>(icol)));
uint32_t offset_within_row =
rows.metadata().encoded_field_offset(ColIdInEncodingOrder(
rows, static_cast<uint32_t>(icol), are_cols_in_encoding_order));
if (col.metadata().is_fixed_length) {
if (sel_left_maybe_null) {
CompareBinaryColumnToRow<true>(
Expand All @@ -375,9 +374,8 @@ void KeyCompare::CompareColumnsToRows(
is_first_column ? match_bytevector_A : match_bytevector_B);
NullUpdateColumnToRow<true>(
static_cast<uint32_t>(icol), num_rows_to_compare, sel_left_maybe_null,
left_to_right_map, ctx, col, rows,
is_first_column ? match_bytevector_A : match_bytevector_B,
are_cols_in_encoding_order);
left_to_right_map, ctx, col, rows, are_cols_in_encoding_order,
is_first_column ? match_bytevector_A : match_bytevector_B);
} else {
// Version without using selection vector
CompareBinaryColumnToRow<false>(
Expand All @@ -386,9 +384,8 @@ void KeyCompare::CompareColumnsToRows(
is_first_column ? match_bytevector_A : match_bytevector_B);
NullUpdateColumnToRow<false>(
static_cast<uint32_t>(icol), num_rows_to_compare, sel_left_maybe_null,
left_to_right_map, ctx, col, rows,
is_first_column ? match_bytevector_A : match_bytevector_B,
are_cols_in_encoding_order);
left_to_right_map, ctx, col, rows, are_cols_in_encoding_order,
is_first_column ? match_bytevector_A : match_bytevector_B);
}
if (!is_first_column) {
AndByteVectors(ctx, num_rows_to_compare, match_bytevector_A, match_bytevector_B);
Expand All @@ -414,9 +411,8 @@ void KeyCompare::CompareColumnsToRows(
}
NullUpdateColumnToRow<true>(
static_cast<uint32_t>(icol), num_rows_to_compare, sel_left_maybe_null,
left_to_right_map, ctx, col, rows,
is_first_column ? match_bytevector_A : match_bytevector_B,
are_cols_in_encoding_order);
left_to_right_map, ctx, col, rows, are_cols_in_encoding_order,
is_first_column ? match_bytevector_A : match_bytevector_B);
} else {
if (ivarbinary == 0) {
CompareVarBinaryColumnToRow<false, true>(
Expand All @@ -429,9 +425,8 @@ void KeyCompare::CompareColumnsToRows(
}
NullUpdateColumnToRow<false>(
static_cast<uint32_t>(icol), num_rows_to_compare, sel_left_maybe_null,
left_to_right_map, ctx, col, rows,
is_first_column ? match_bytevector_A : match_bytevector_B,
are_cols_in_encoding_order);
left_to_right_map, ctx, col, rows, are_cols_in_encoding_order,
is_first_column ? match_bytevector_A : match_bytevector_B);
}
if (!is_first_column) {
AndByteVectors(ctx, num_rows_to_compare, match_bytevector_A, match_bytevector_B);
Expand Down
25 changes: 15 additions & 10 deletions cpp/src/arrow/compute/row/compare_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,13 +43,19 @@ class ARROW_EXPORT KeyCompare {
uint8_t* out_match_bitvector_maybe_null = NULLPTR);

private:
static uint32_t ColIdInEncodingOrder(const RowTableImpl& rows, uint32_t id_col,
bool are_cols_in_encoding_order) {
return are_cols_in_encoding_order ? id_col
: rows.metadata().pos_after_encoding(id_col);
}

template <bool use_selection>
static void NullUpdateColumnToRow(uint32_t id_col, uint32_t num_rows_to_compare,
const uint16_t* sel_left_maybe_null,
const uint32_t* left_to_right_map, LightContext* ctx,
const KeyColumnArray& col, const RowTableImpl& rows,
uint8_t* match_bytevector,
bool are_cols_in_encoding_order);
bool are_cols_in_encoding_order,
uint8_t* match_bytevector);

template <bool use_selection, class COMPARE_FN>
static void CompareBinaryColumnToRowHelper(
Expand Down Expand Up @@ -92,7 +98,8 @@ class ARROW_EXPORT KeyCompare {
static uint32_t NullUpdateColumnToRowImp_avx2(
uint32_t id_col, uint32_t num_rows_to_compare, const uint16_t* sel_left_maybe_null,
const uint32_t* left_to_right_map, LightContext* ctx, const KeyColumnArray& col,
const RowTableImpl& rows, uint8_t* match_bytevector);
const RowTableImpl& rows, bool are_cols_in_encoding_order,
uint8_t* match_bytevector);

template <bool use_selection, class COMPARE8_FN>
static uint32_t CompareBinaryColumnToRowHelper_avx2(
Expand All @@ -118,13 +125,11 @@ class ARROW_EXPORT KeyCompare {
static uint32_t AndByteVectors_avx2(uint32_t num_elements, uint8_t* bytevector_A,
const uint8_t* bytevector_B);

static uint32_t NullUpdateColumnToRow_avx2(bool use_selection, uint32_t id_col,
uint32_t num_rows_to_compare,
const uint16_t* sel_left_maybe_null,
const uint32_t* left_to_right_map,
LightContext* ctx, const KeyColumnArray& col,
const RowTableImpl& rows,
uint8_t* match_bytevector);
static uint32_t NullUpdateColumnToRow_avx2(
bool use_selection, uint32_t id_col, uint32_t num_rows_to_compare,
const uint16_t* sel_left_maybe_null, const uint32_t* left_to_right_map,
LightContext* ctx, const KeyColumnArray& col, const RowTableImpl& rows,
bool are_cols_in_encoding_order, uint8_t* match_bytevector);

static uint32_t CompareBinaryColumnToRow_avx2(
bool use_selection, uint32_t offset_within_row, uint32_t num_rows_to_compare,
Expand Down
20 changes: 11 additions & 9 deletions cpp/src/arrow/compute/row/compare_internal_avx2.cc
Original file line number Diff line number Diff line change
Expand Up @@ -39,12 +39,14 @@ template <bool use_selection>
uint32_t KeyCompare::NullUpdateColumnToRowImp_avx2(
uint32_t id_col, uint32_t num_rows_to_compare, const uint16_t* sel_left_maybe_null,
const uint32_t* left_to_right_map, LightContext* ctx, const KeyColumnArray& col,
const RowTableImpl& rows, uint8_t* match_bytevector) {
const RowTableImpl& rows, bool are_cols_in_encoding_order,
uint8_t* match_bytevector) {
if (!rows.has_any_nulls(ctx) && !col.data(0)) {
return num_rows_to_compare;
}

uint32_t null_bit_id = rows.metadata().pos_after_encoding(id_col);
const uint32_t null_bit_id =
ColIdInEncodingOrder(rows, id_col, are_cols_in_encoding_order);

if (!col.data(0)) {
// Remove rows from the result for which the column value is a null
Expand Down Expand Up @@ -569,7 +571,7 @@ uint32_t KeyCompare::NullUpdateColumnToRow_avx2(
bool use_selection, uint32_t id_col, uint32_t num_rows_to_compare,
const uint16_t* sel_left_maybe_null, const uint32_t* left_to_right_map,
LightContext* ctx, const KeyColumnArray& col, const RowTableImpl& rows,
uint8_t* match_bytevector) {
bool are_cols_in_encoding_order, uint8_t* match_bytevector) {
int64_t num_rows_safe =
TailSkipForSIMD::FixBitAccess(sizeof(uint32_t), col.length(), col.bit_offset(0));
if (sel_left_maybe_null) {
Expand All @@ -580,13 +582,13 @@ uint32_t KeyCompare::NullUpdateColumnToRow_avx2(
}

if (use_selection) {
return NullUpdateColumnToRowImp_avx2<true>(id_col, num_rows_to_compare,
sel_left_maybe_null, left_to_right_map,
ctx, col, rows, match_bytevector);
return NullUpdateColumnToRowImp_avx2<true>(
id_col, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, ctx, col,
rows, are_cols_in_encoding_order, match_bytevector);
} else {
return NullUpdateColumnToRowImp_avx2<false>(id_col, num_rows_to_compare,
sel_left_maybe_null, left_to_right_map,
ctx, col, rows, match_bytevector);
return NullUpdateColumnToRowImp_avx2<false>(
id_col, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, ctx, col,
rows, are_cols_in_encoding_order, match_bytevector);
}
}

Expand Down
68 changes: 68 additions & 0 deletions cpp/src/arrow/compute/row/grouper_test.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include <numeric>

#include "arrow/compute/exec.h"
#include "arrow/compute/row/grouper.h"
#include "arrow/testing/gtest_util.h"
#include "arrow/testing/random.h"

namespace arrow {
namespace compute {

// Specialized case for GH-40997
TEST(Grouper, ResortedColumnsWithLargeNullRows) {
const uint64_t num_rows = 1024;

// construct random array with plenty of null values
const int32_t kSeed = 42;
const int32_t min = 0;
const int32_t max = 100;
const double null_probability = 0.3;
const double true_probability = 0.5;
auto rng = random::RandomArrayGenerator(kSeed);
auto b_arr = rng.Boolean(num_rows, true_probability, null_probability);
auto i32_arr = rng.Int32(num_rows, min, max, null_probability);
auto i64_arr = rng.Int64(num_rows, min, max * 10, null_probability);

// construct batches with columns which will be resorted in the grouper make
std::vector<ExecBatch> exec_batches = {ExecBatch({i64_arr, i32_arr, b_arr}, num_rows),
ExecBatch({i32_arr, i64_arr, b_arr}, num_rows),
ExecBatch({i64_arr, b_arr, i32_arr}, num_rows),
ExecBatch({i32_arr, b_arr, i64_arr}, num_rows),
ExecBatch({b_arr, i32_arr, i64_arr}, num_rows),
ExecBatch({b_arr, i64_arr, i32_arr}, num_rows)};

const int num_batches = static_cast<int>(exec_batches.size());
std::vector<uint32_t> group_num_vec;
group_num_vec.reserve(num_batches);

for (const auto& exec_batch : exec_batches) {
ExecSpan span(exec_batch);
ASSERT_OK_AND_ASSIGN(auto grouper, Grouper::Make(span.GetTypes()));
ASSERT_OK_AND_ASSIGN(Datum group_ids, grouper->Consume(span));
group_num_vec.emplace_back(grouper->num_groups());
}

for (int i = 1; i < num_batches; i++) {
ASSERT_EQ(group_num_vec[i - 1], group_num_vec[i]);
}
}

} // namespace compute
} // namespace arrow
3 changes: 2 additions & 1 deletion cpp/src/arrow/compute/row/row_internal.cc
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,8 @@ void RowTableMetadata::FromColumnMetadataVector(
//
// Columns are sorted based on the size in bytes of their fixed-length part.
// For the varying-length column, the fixed-length part is the 32-bit field storing
// cumulative length of varying-length fields.
// cumulative length of varying-length fields. This is to make the memory access of
// each individual column within the encoded row alignment-friendly.
//
// The rules are:
//
Expand Down
25 changes: 15 additions & 10 deletions cpp/src/arrow/dataset/file_parquet_encryption_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -148,17 +148,22 @@ class DatasetEncryptionTestBase : public ::testing::Test {
FileSystemDatasetFactory::Make(file_system_, selector,
file_format, factory_options));

// Read dataset into table
// Create the dataset
ASSERT_OK_AND_ASSIGN(auto dataset, dataset_factory->Finish());
ASSERT_OK_AND_ASSIGN(auto scanner_builder, dataset->NewScan());
ASSERT_OK_AND_ASSIGN(auto scanner, scanner_builder->Finish());
ASSERT_OK_AND_ASSIGN(auto read_table, scanner->ToTable());

// Verify the data was read correctly
ASSERT_OK_AND_ASSIGN(auto combined_table, read_table->CombineChunks());
// Validate the table
ASSERT_OK(combined_table->ValidateFull());
AssertTablesEqual(*combined_table, *table_);

// Reuse the dataset above to scan it twice to make sure decryption works correctly.
for (size_t i = 0; i < 2; ++i) {
// Read dataset into table
ASSERT_OK_AND_ASSIGN(auto scanner_builder, dataset->NewScan());
ASSERT_OK_AND_ASSIGN(auto scanner, scanner_builder->Finish());
ASSERT_OK_AND_ASSIGN(auto read_table, scanner->ToTable());

// Verify the data was read correctly
ASSERT_OK_AND_ASSIGN(auto combined_table, read_table->CombineChunks());
// Validate the table
ASSERT_OK(combined_table->ValidateFull());
AssertTablesEqual(*combined_table, *table_);
}
}

protected:
Expand Down
Loading

0 comments on commit f9b2b3c

Please sign in to comment.