Simplify column adaptations for parquet in iceberg
Avoids having two different column adaptation layers
for parquet in iceberg
Makes ParquetPageSource more consistent with OrcPageSource
raunaqmorarka committed Mar 20, 2023
1 parent 6d3f562 commit f8e774a
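As a rough illustration of the single adaptation layer this commit introduces, a caller such as a Parquet page source factory could assemble the page source through the new builder. The sketch below is hypothetical: the helper name, the channel numbers, the BIGINT type, and the partitionValueBlock argument are assumptions, not code from this commit.

// Hypothetical usage sketch, not part of this commit.
// Assumes: parquetReader is an already-configured io.trino.parquet.reader.ParquetReader,
// partitionValueBlock is a single-position Block holding a constant value,
// and BIGINT is statically imported from io.trino.spi.type.BigintType.
private static ConnectorPageSource createPageSource(ParquetReader parquetReader, Block partitionValueBlock)
{
    return ParquetPageSource.builder()
            .addSourceColumn(0)                     // channel read directly from the Parquet file
            .addRowIndexColumn()                    // synthetic row-index column
            .addNullColumn(BIGINT)                  // column missing from this file's schema
            .addConstantColumn(partitionValueBlock) // e.g. a partition value
            .build(parquetReader);
}

Each output channel is described by exactly one adaptation, so the reader's page is remapped in a single pass by getColumnAdaptationsPage rather than going through a second adaptation layer.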
Showing 5 changed files with 231 additions and 153 deletions.

This file was deleted. (Its diff is not shown; the diff below belongs to a different file in this commit.)

@@ -17,21 +17,22 @@
import io.trino.parquet.ParquetCorruptionException;
import io.trino.parquet.ParquetDataSourceId;
import io.trino.parquet.reader.ParquetReader;
import io.trino.parquet.reader.ParquetReaderColumn;
import io.trino.spi.Page;
import io.trino.spi.TrinoException;
import io.trino.spi.block.Block;
import io.trino.spi.block.LongArrayBlock;
import io.trino.spi.block.RunLengthEncodedBlock;
import io.trino.spi.connector.ConnectorPageSource;
import io.trino.spi.metrics.Metrics;
import io.trino.spi.type.Type;

import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.List;
import java.util.Optional;
import java.util.OptionalLong;

import static com.google.common.base.Preconditions.checkArgument;
import static io.trino.plugin.base.util.Closables.closeAllSuppress;
import static io.trino.plugin.hive.HiveErrorCode.HIVE_BAD_DATA;
import static io.trino.plugin.hive.HiveErrorCode.HIVE_CURSOR_ERROR;
@@ -42,20 +43,19 @@ public class ParquetPageSource
implements ConnectorPageSource
{
private final ParquetReader parquetReader;
- private final List<ParquetReaderColumn> parquetReaderColumns;
- private final boolean areSyntheticColumnsPresent;
+ private final List<ColumnAdaptation> columnAdaptations;
+ private final boolean isColumnAdaptationRequired;

private boolean closed;
private long completedPositions;

- public ParquetPageSource(
+ private ParquetPageSource(
ParquetReader parquetReader,
- List<ParquetReaderColumn> parquetReaderColumns)
+ List<ColumnAdaptation> columnAdaptations)
{
this.parquetReader = requireNonNull(parquetReader, "parquetReader is null");
- this.parquetReaderColumns = ImmutableList.copyOf(requireNonNull(parquetReaderColumns, "parquetReaderColumns is null"));
- this.areSyntheticColumnsPresent = parquetReaderColumns.stream()
- .anyMatch(column -> column.isRowIndexColumn() || column.field().isEmpty());
+ this.columnAdaptations = ImmutableList.copyOf(requireNonNull(columnAdaptations, "columnAdaptations is null"));
+ this.isColumnAdaptationRequired = isColumnAdaptationRequired(columnAdaptations);
}

@Override
@@ -131,29 +131,60 @@ public Metrics getMetrics()
return parquetReader.getMetrics();
}

public static Builder builder()
{
return new Builder();
}

public static class Builder
{
private final ImmutableList.Builder<ColumnAdaptation> columns = ImmutableList.builder();

private Builder() {}

public Builder addConstantColumn(Block value)
{
columns.add(new ConstantColumn(value));
return this;
}

public Builder addSourceColumn(int sourceChannel)
{
columns.add(new SourceColumn(sourceChannel));
return this;
}

public Builder addNullColumn(Type type)
{
columns.add(new NullColumn(type));
return this;
}

public Builder addRowIndexColumn()
{
columns.add(new RowIndexColumn());
return this;
}

public ConnectorPageSource build(ParquetReader parquetReader)
{
return new ParquetPageSource(parquetReader, this.columns.build());
}
}

private Page getColumnAdaptationsPage(Page page)
{
- if (!areSyntheticColumnsPresent) {
+ if (!isColumnAdaptationRequired) {
return page;
}
if (page == null) {
return null;
}
int batchSize = page.getPositionCount();
- Block[] blocks = new Block[parquetReaderColumns.size()];
- int sourceColumn = 0;
- for (int columnIndex = 0; columnIndex < parquetReaderColumns.size(); columnIndex++) {
- ParquetReaderColumn column = parquetReaderColumns.get(columnIndex);
- if (column.isRowIndexColumn()) {
- blocks[columnIndex] = getRowIndexColumn(parquetReader.lastBatchStartRow(), batchSize);
- }
- else if (column.field().isEmpty()) {
- blocks[columnIndex] = RunLengthEncodedBlock.create(column.type(), null, batchSize);
- }
- else {
- blocks[columnIndex] = page.getBlock(sourceColumn);
- sourceColumn++;
- }
+ Block[] blocks = new Block[columnAdaptations.size()];
+ long startRowId = parquetReader.lastBatchStartRow();
+ for (int columnChannel = 0; columnChannel < columnAdaptations.size(); columnChannel++) {
+ blocks[columnChannel] = columnAdaptations.get(columnChannel).getBlock(page, startRowId);
}
return new Page(batchSize, blocks);
}
@@ -169,7 +200,100 @@ static TrinoException handleException(ParquetDataSourceId dataSourceId, Exceptio
return new TrinoException(HIVE_CURSOR_ERROR, format("Failed to read Parquet file: %s", dataSourceId), exception);
}

- private static Block getRowIndexColumn(long baseIndex, int size)
+ private static boolean isColumnAdaptationRequired(List<ColumnAdaptation> columnAdaptations)
{
// If no synthetic columns are added and the source columns are in order, no adaptations are required
for (int columnChannel = 0; columnChannel < columnAdaptations.size(); columnChannel++) {
ColumnAdaptation column = columnAdaptations.get(columnChannel);
if (column instanceof SourceColumn) {
int delegateChannel = ((SourceColumn) column).getSourceChannel();
if (columnChannel != delegateChannel) {
return true;
}
}
else {
return true;
}
}
return false;
}
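// Illustration (not part of the original source): for adaptations
// [SourceColumn(0), SourceColumn(1), SourceColumn(2)] this returns false and the
// reader's Page is passed through untouched, while [SourceColumn(1), SourceColumn(0)]
// or any list containing a NullColumn, ConstantColumn or RowIndexColumn returns true,
// and getColumnAdaptationsPage then assembles a new Page from the adaptations.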

private interface ColumnAdaptation
{
Block getBlock(Page sourcePage, long startRowId);
}

private static class NullColumn
implements ColumnAdaptation
{
private final Block nullBlock;

private NullColumn(Type type)
{
this.nullBlock = type.createBlockBuilder(null, 1, 0)
.appendNull()
.build();
}

@Override
public Block getBlock(Page sourcePage, long startRowId)
{
return RunLengthEncodedBlock.create(nullBlock, sourcePage.getPositionCount());
}
}

private static class SourceColumn
implements ColumnAdaptation
{
private final int sourceChannel;

private SourceColumn(int sourceChannel)
{
checkArgument(sourceChannel >= 0, "sourceChannel is negative");
this.sourceChannel = sourceChannel;
}

@Override
public Block getBlock(Page sourcePage, long startRowId)
{
return sourcePage.getBlock(sourceChannel);
}

public int getSourceChannel()
{
return sourceChannel;
}
}

private static class ConstantColumn
implements ColumnAdaptation
{
private final Block singleValueBlock;

private ConstantColumn(Block singleValueBlock)
{
checkArgument(singleValueBlock.getPositionCount() == 1, "ConstantColumnAdaptation singleValueBlock may only contain one position");
this.singleValueBlock = singleValueBlock;
}

@Override
public Block getBlock(Page sourcePage, long startRowId)
{
return RunLengthEncodedBlock.create(singleValueBlock, sourcePage.getPositionCount());
}
}

private static class RowIndexColumn
implements ColumnAdaptation
{
@Override
public Block getBlock(Page sourcePage, long startRowId)
{
return createRowNumberBlock(startRowId, sourcePage.getPositionCount());
}
}

private static Block createRowNumberBlock(long baseIndex, int size)
{
long[] rowIndices = new long[size];
for (int position = 0; position < size; position++) {
