Skip to content

Commit

Permalink
GH-41667: [C++][Parquet] Refuse writing non-nullable column that cont…
Browse files Browse the repository at this point in the history
…ains nulls (#44921)

### Rationale for this change

A non-nullable column that contains nulls would result in an invalid Parquet file, so we'd rather raise an error when writing.

This detection is only implemented for leaf columns. Implementing it for non-leaf columns would be more involved, and also doesn't actually seem necessary.

### Are these changes tested?

Yes.

### Are there any user-facing changes?

Raising a clear error when trying to write invalid data to Parquet, instead of letting the Parquet writer silently generate an invalid file.

* GitHub Issue: #41667

Authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
  • Loading branch information
pitrou authored Dec 4, 2024
1 parent fb8e812 commit ded148c
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 5 deletions.
21 changes: 21 additions & 0 deletions cpp/src/parquet/arrow/arrow_reader_writer_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3349,6 +3349,27 @@ TEST(TestArrowWrite, CheckChunkSize) {
WriteTable(*table, ::arrow::default_memory_pool(), sink, chunk_size));
}

void CheckWritingNonNullableColumnWithNulls(std::shared_ptr<::arrow::Field> field,
std::string json_batch) {
ARROW_SCOPED_TRACE("field = ", field, ", json_batch = ", json_batch);
auto schema = ::arrow::schema({field});
auto table = ::arrow::TableFromJSON(schema, {json_batch});
auto sink = CreateOutputStream();
EXPECT_RAISES_WITH_MESSAGE_THAT(
Invalid, ::testing::HasSubstr("is declared non-nullable but contains nulls"),
WriteTable(*table, ::arrow::default_memory_pool(), sink));
}

TEST(TestArrowWrite, InvalidSchema) {
// GH-41667: nulls in non-nullable column
CheckWritingNonNullableColumnWithNulls(
::arrow::field("a", ::arrow::int32(), /*nullable=*/false),
R"([{"a": 456}, {"a": null}])");
CheckWritingNonNullableColumnWithNulls(
::arrow::field("a", ::arrow::utf8(), /*nullable=*/false),
R"([{"a": "foo"}, {"a": null}])");
}

void DoNestedValidate(const std::shared_ptr<::arrow::DataType>& inner_type,
const std::shared_ptr<::arrow::Field>& outer_field,
const std::shared_ptr<Buffer>& buffer,
Expand Down
4 changes: 4 additions & 0 deletions cpp/src/parquet/column_writer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1301,6 +1301,10 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter<
bool single_nullable_element =
(level_info_.def_level == level_info_.repeated_ancestor_def_level + 1) &&
leaf_field_nullable;
if (!leaf_field_nullable && leaf_array.null_count() != 0) {
return Status::Invalid("Column '", descr_->name(),
"' is declared non-nullable but contains nulls");
}
bool maybe_parent_nulls = level_info_.HasNullableValues() && !single_nullable_element;
if (maybe_parent_nulls) {
ARROW_ASSIGN_OR_RAISE(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -95,11 +95,9 @@ private VectorSchemaRoot generateAllTypesVector(BufferAllocator allocator) {
// DenseUnion
List<Field> childFields = new ArrayList<>();
childFields.add(
new Field(
"int-child", new FieldType(false, new ArrowType.Int(32, true), null, null), null));
new Field("int-child", FieldType.notNullable(new ArrowType.Int(32, true)), null));
Field structField =
new Field(
"struct", new FieldType(true, ArrowType.Struct.INSTANCE, null, null), childFields);
new Field("struct", FieldType.nullable(ArrowType.Struct.INSTANCE), childFields);
Field[] fields =
new Field[] {
Field.nullablePrimitive("null", ArrowType.Null.INSTANCE),
Expand Down Expand Up @@ -239,7 +237,11 @@ private VectorSchemaRoot generateAllTypesVector(BufferAllocator allocator) {
largeListWriter.integer().writeInt(1);
largeListWriter.endList();

((StructVector) root.getVector("struct")).getChild("int-child", IntVector.class).set(1, 1);
var intChildVector =
((StructVector) root.getVector("struct")).getChild("int-child", IntVector.class);
// set a non-null value at index 0 since the field is not nullable
intChildVector.set(0, 0);
intChildVector.set(1, 1);
return root;
}

Expand Down

0 comments on commit ded148c

Please sign in to comment.