Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Simplify column adaptations for parquet in iceberg #16547

Merged
2 commits merged on Mar 20, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -17,21 +17,22 @@
import io.trino.parquet.ParquetCorruptionException;
import io.trino.parquet.ParquetDataSourceId;
import io.trino.parquet.reader.ParquetReader;
import io.trino.parquet.reader.ParquetReaderColumn;
import io.trino.spi.Page;
import io.trino.spi.TrinoException;
import io.trino.spi.block.Block;
import io.trino.spi.block.LongArrayBlock;
import io.trino.spi.block.RunLengthEncodedBlock;
import io.trino.spi.connector.ConnectorPageSource;
import io.trino.spi.metrics.Metrics;
import io.trino.spi.type.Type;

import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.List;
import java.util.Optional;
import java.util.OptionalLong;

import static com.google.common.base.Preconditions.checkArgument;
import static io.trino.plugin.base.util.Closables.closeAllSuppress;
import static io.trino.plugin.hive.HiveErrorCode.HIVE_BAD_DATA;
import static io.trino.plugin.hive.HiveErrorCode.HIVE_CURSOR_ERROR;
Expand All @@ -42,20 +43,19 @@ public class ParquetPageSource
implements ConnectorPageSource
{
private final ParquetReader parquetReader;
private final List<ColumnAdaptation> columnAdaptations;
private final boolean isColumnAdaptationRequired;

private boolean closed;
private long completedPositions;

/**
 * Use {@link #builder()} to construct instances; the builder assembles the
 * per-output-channel {@link ColumnAdaptation} list this constructor requires.
 *
 * @param parquetReader source of decoded parquet pages
 * @param columnAdaptations one adaptation per output channel, in output order
 */
private ParquetPageSource(
        ParquetReader parquetReader,
        List<ColumnAdaptation> columnAdaptations)
{
    this.parquetReader = requireNonNull(parquetReader, "parquetReader is null");
    this.columnAdaptations = ImmutableList.copyOf(requireNonNull(columnAdaptations, "columnAdaptations is null"));
    // Precomputed so getColumnAdaptationsPage can pass source pages through
    // untouched when the adaptations are a pure identity mapping.
    this.isColumnAdaptationRequired = isColumnAdaptationRequired(columnAdaptations);
}

@Override
Expand Down Expand Up @@ -131,29 +131,60 @@ public Metrics getMetrics()
return parquetReader.getMetrics();
}

/**
 * Creates a new {@link Builder} for assembling the column adaptations of a
 * {@code ParquetPageSource}.
 */
public static Builder builder()
{
return new Builder();
}

/**
 * Collects one {@link ColumnAdaptation} per output channel, in order, and then
 * produces the page source. Obtain via {@link ParquetPageSource#builder()}.
 */
public static class Builder
{
    private final ImmutableList.Builder<ColumnAdaptation> columns = ImmutableList.builder();

    private Builder() {}

    /** Adds a channel that repeats the given single-position constant block on every row. */
    public Builder addConstantColumn(Block value)
    {
        this.columns.add(new ConstantColumn(value));
        return this;
    }

    /** Adds a channel that passes through channel {@code sourceChannel} of the reader's page. */
    public Builder addSourceColumn(int sourceChannel)
    {
        this.columns.add(new SourceColumn(sourceChannel));
        return this;
    }

    /** Adds a channel that is entirely null, typed as {@code type}. */
    public Builder addNullColumn(Type type)
    {
        this.columns.add(new NullColumn(type));
        return this;
    }

    /** Adds a synthetic channel holding the row index within the file. */
    public Builder addRowIndexColumn()
    {
        this.columns.add(new RowIndexColumn());
        return this;
    }

    /** Builds the page source over {@code parquetReader} with the collected adaptations. */
    public ConnectorPageSource build(ParquetReader parquetReader)
    {
        return new ParquetPageSource(parquetReader, this.columns.build());
    }
}

private Page getColumnAdaptationsPage(Page page)
{
if (!areSyntheticColumnsPresent) {
if (!isColumnAdaptationRequired) {
return page;
}
if (page == null) {
return null;
}
int batchSize = page.getPositionCount();
Block[] blocks = new Block[parquetReaderColumns.size()];
int sourceColumn = 0;
for (int columnIndex = 0; columnIndex < parquetReaderColumns.size(); columnIndex++) {
ParquetReaderColumn column = parquetReaderColumns.get(columnIndex);
if (column.isRowIndexColumn()) {
blocks[columnIndex] = getRowIndexColumn(parquetReader.lastBatchStartRow(), batchSize);
}
else if (column.field().isEmpty()) {
blocks[columnIndex] = RunLengthEncodedBlock.create(column.type(), null, batchSize);
}
else {
blocks[columnIndex] = page.getBlock(sourceColumn);
sourceColumn++;
}
Block[] blocks = new Block[columnAdaptations.size()];
long startRowId = parquetReader.lastBatchStartRow();
for (int columnChannel = 0; columnChannel < columnAdaptations.size(); columnChannel++) {
blocks[columnChannel] = columnAdaptations.get(columnChannel).getBlock(page, startRowId);
}
return new Page(batchSize, blocks);
}
Expand All @@ -169,7 +200,100 @@ static TrinoException handleException(ParquetDataSourceId dataSourceId, Exceptio
return new TrinoException(HIVE_CURSOR_ERROR, format("Failed to read Parquet file: %s", dataSourceId), exception);
}

private static Block getRowIndexColumn(long baseIndex, int size)
private static boolean isColumnAdaptationRequired(List<ColumnAdaptation> columnAdaptations)
{
// If no synthetic columns are added and the source columns are in order, no adaptations are required
for (int columnChannel = 0; columnChannel < columnAdaptations.size(); columnChannel++) {
ColumnAdaptation column = columnAdaptations.get(columnChannel);
if (column instanceof SourceColumn) {
int delegateChannel = ((SourceColumn) column).getSourceChannel();
if (columnChannel != delegateChannel) {
return true;
}
}
else {
return true;
}
}
return false;
}

/**
 * Strategy for producing the block of one output channel from a page decoded
 * by the parquet reader.
 */
private interface ColumnAdaptation
{
// startRowId is the file row index of the first position in sourcePage
Block getBlock(Page sourcePage, long startRowId);
}

/**
 * Adaptation producing an all-null block of a fixed type, used for output
 * columns that have no corresponding data in the file.
 */
private static class NullColumn
        implements ColumnAdaptation
{
    // Single-position null block, expanded per page via run-length encoding
    private final Block nullValueBlock;

    private NullColumn(Type type)
    {
        this.nullValueBlock = type
                .createBlockBuilder(null, 1, 0)
                .appendNull()
                .build();
    }

    @Override
    public Block getBlock(Page sourcePage, long startRowId)
    {
        int positionCount = sourcePage.getPositionCount();
        return RunLengthEncodedBlock.create(nullValueBlock, positionCount);
    }
}

/**
 * Adaptation that passes through the block at a fixed channel of the source page.
 */
private static class SourceColumn
        implements ColumnAdaptation
{
    private final int sourceChannel;

    private SourceColumn(int sourceChannel)
    {
        checkArgument(sourceChannel >= 0, "sourceChannel is negative");
        this.sourceChannel = sourceChannel;
    }

    /** Channel index read from the source page; used for identity-mapping detection. */
    public int getSourceChannel()
    {
        return sourceChannel;
    }

    @Override
    public Block getBlock(Page sourcePage, long startRowId)
    {
        return sourcePage.getBlock(sourceChannel);
    }
}

private static class ConstantColumn
implements ColumnAdaptation
{
private final Block singleValueBlock;

private ConstantColumn(Block singleValueBlock)
raunaqmorarka marked this conversation as resolved.
Show resolved Hide resolved
{
checkArgument(singleValueBlock.getPositionCount() == 1, "ConstantColumnAdaptation singleValueBlock may only contain one position");
this.singleValueBlock = singleValueBlock;
}

@Override
public Block getBlock(Page sourcePage, long startRowId)
{
return RunLengthEncodedBlock.create(singleValueBlock, sourcePage.getPositionCount());
}
}

/**
 * Adaptation synthesizing a row-index column: position {@code i} of the output
 * block holds {@code startRowId + i}.
 */
private static class RowIndexColumn
        implements ColumnAdaptation
{
    @Override
    public Block getBlock(Page sourcePage, long startRowId)
    {
        int positionCount = sourcePage.getPositionCount();
        return createRowNumberBlock(startRowId, positionCount);
    }
}

private static Block createRowNumberBlock(long baseIndex, int size)
{
long[] rowIndices = new long[size];
for (int position = 0; position < size; position++) {
Expand Down
Loading