Skip to content

Commit

Permalink
apacheGH-44006: [GLib][Parquet] Add `gparquet_arrow_file_writer_new_r…
Browse files Browse the repository at this point in the history
…ow_group()` (apache#44039)

### Rationale for this change

This adds a low-level API that lets callers control how data is written to a Parquet file. It is intended for advanced users.

### What changes are included in this PR?

`gparquet_arrow_file_writer_write_chunked_array()` is also added so that `gparquet_arrow_file_writer_new_row_group()` can be exercised by a test.

### Are these changes tested?

Yes.

### Are there any user-facing changes?

Yes.
* GitHub Issue: apache#44006

Authored-by: Sutou Kouhei <[email protected]>
Signed-off-by: Sutou Kouhei <[email protected]>
  • Loading branch information
kou authored Sep 11, 2024
1 parent 2a793d6 commit 8556001
Show file tree
Hide file tree
Showing 3 changed files with 90 additions and 4 deletions.
50 changes: 47 additions & 3 deletions c_glib/parquet-glib/arrow-file-writer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -548,13 +548,57 @@ gparquet_arrow_file_writer_write_record_batch(GParquetArrowFileWriter *writer,
/* Writes @table through the raw Parquet Arrow file writer: unwraps the raw
 * writer and the raw table, calls WriteTable(*arrow_table, chunk_size), and
 * returns the result of the error-check helper, which is given @error, the
 * returned status and a context tag.
 *
 * NOTE(review): this span is a rendered diff without +/- markers, so it
 * shows two chunk_size parameter lines and two return paths. Presumably the
 * guint64 parameter and the garrow_error_check() lines are the removed side
 * and the gsize / garrow::check() lines are the added side -- confirm
 * against the repository before relying on either. */
gboolean
gparquet_arrow_file_writer_write_table(GParquetArrowFileWriter *writer,
GArrowTable *table,
guint64 chunk_size,
gsize chunk_size,
GError **error)
{
auto parquet_arrow_file_writer = gparquet_arrow_file_writer_get_raw(writer);
/* Raw pointer taken from the shared pointer returned for @table. */
auto arrow_table = garrow_table_get_raw(table).get();
auto status = parquet_arrow_file_writer->WriteTable(*arrow_table, chunk_size);
return garrow_error_check(error, status, "[parquet][arrow][file-writer][write-table]");
/* Same WriteTable() call and context tag as above, passed straight to
 * garrow::check() instead of going through a named status variable. */
return garrow::check(error,
parquet_arrow_file_writer->WriteTable(*arrow_table, chunk_size),
"[parquet][arrow][file-writer][write-table]");
}

/**
 * gparquet_arrow_file_writer_new_row_group:
 * @writer: A #GParquetArrowFileWriter.
 * @chunk_size: The max number of rows in a row group.
 * @error: (nullable): Return location for a #GError or %NULL.
 *
 * Calls `NewRowGroup()` on the underlying Parquet Arrow file writer,
 * forwarding @chunk_size, and reports the resulting status through @error.
 *
 * Returns: %TRUE on success, %FALSE if there was an error.
 *
 * Since: 18.0.0
 */
gboolean
gparquet_arrow_file_writer_new_row_group(GParquetArrowFileWriter *writer,
                                         gsize chunk_size,
                                         GError **error)
{
  auto arrow_writer = gparquet_arrow_file_writer_get_raw(writer);
  // Keep the status in a named local so the check call stays on one line.
  auto status = arrow_writer->NewRowGroup(chunk_size);
  return garrow::check(error, status, "[parquet][arrow][file-writer][new-row-group]");
}

/**
 * gparquet_arrow_file_writer_write_chunked_array:
 * @writer: A #GParquetArrowFileWriter.
 * @chunked_array: A #GArrowChunkedArray to be written.
 * @error: (nullable): Return location for a #GError or %NULL.
 *
 * Passes the raw chunked array behind @chunked_array to `WriteColumnChunk()`
 * on the underlying Parquet Arrow file writer and reports the resulting
 * status through @error.
 *
 * Returns: %TRUE on success, %FALSE if there was an error.
 *
 * Since: 18.0.0
 */
gboolean
gparquet_arrow_file_writer_write_chunked_array(GParquetArrowFileWriter *writer,
                                               GArrowChunkedArray *chunked_array,
                                               GError **error)
{
  auto arrow_writer = gparquet_arrow_file_writer_get_raw(writer);
  auto arrow_data = garrow_chunked_array_get_raw(chunked_array);
  // Keep the status in a named local so the check call stays on one line.
  auto status = arrow_writer->WriteColumnChunk(arrow_data);
  return garrow::check(error,
                       status,
                       "[parquet][arrow][file-writer][write-chunked-array]");
}

/**
Expand Down
14 changes: 13 additions & 1 deletion c_glib/parquet-glib/arrow-file-writer.h
Original file line number Diff line number Diff line change
Expand Up @@ -130,9 +130,21 @@ GPARQUET_AVAILABLE_IN_0_11
gboolean
gparquet_arrow_file_writer_write_table(GParquetArrowFileWriter *writer,
GArrowTable *table,
guint64 chunk_size,
gsize chunk_size,
GError **error);

GPARQUET_AVAILABLE_IN_18_0
gboolean
gparquet_arrow_file_writer_new_row_group(GParquetArrowFileWriter *writer,
gsize chunk_size,
GError **error);

GPARQUET_AVAILABLE_IN_18_0
gboolean
gparquet_arrow_file_writer_write_chunked_array(GParquetArrowFileWriter *writer,
GArrowChunkedArray *chunked_array,
GError **error);

GPARQUET_AVAILABLE_IN_0_11
gboolean
gparquet_arrow_file_writer_close(GParquetArrowFileWriter *writer, GError **error);
Expand Down
30 changes: 30 additions & 0 deletions c_glib/test/parquet/test-arrow-file-writer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -82,4 +82,34 @@ def test_write_table
reader.unref
end
end

# Writes two row groups, one boolean chunked array per group, then reads the
# file back and checks both the row group count and the table contents.
def test_write_chunked_array
  writer = Parquet::ArrowFileWriter.new(build_schema("enabled" => :boolean),
                                        @file.path)
  # One entry per row group; each group is sized to the values it holds.
  [
    [true, nil],
    [false],
  ].each do |values|
    writer.new_row_group(values.size)
    column = Arrow::ChunkedArray.new([build_boolean_array(values)])
    writer.write_chunked_array(column)
  end
  writer.close

  reader = Parquet::ArrowFileReader.new(@file.path)
  begin
    reader.use_threads = true
    expected_table = build_table("enabled" => [
                                   build_boolean_array([true, nil]),
                                   build_boolean_array([false]),
                                 ])
    assert_equal([2, expected_table],
                 [reader.n_row_groups, reader.read_table])
  ensure
    reader.unref
  end
end
end

0 comments on commit 8556001

Please sign in to comment.