Replace HiveMetastore get*Statistics with get*ColumnStatistics
It is simpler and more efficient to support only column statistics in
HiveMetastore, since the HiveBasicStatistics are actually extracted from
table/partition properties and are not loaded from the underlying
metastore implementation.
dain committed Feb 7, 2024
1 parent a6edc70 commit 52a17f1
Showing 22 changed files with 638 additions and 585 deletions.
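The rationale in the commit message rests on where Hive keeps basic statistics: row count, file count, and sizes are stored as plain string entries in the table or partition parameters map, so no extra metastore call is needed to read them. A minimal illustrative sketch of that parsing, assuming the standard Hive parameter keys (numRows, numFiles, totalSize, rawDataSize); the helper class below is hypothetical and is not code from this commit:

    import java.util.Map;
    import java.util.OptionalLong;

    // Hypothetical sketch: Hive basic statistics live in the parameters map of a
    // table or partition, keyed by the standard Hive statistic names.
    final class BasicStatisticsSketch
    {
        private BasicStatisticsSketch() {}

        static OptionalLong parseStatistic(Map<String, String> parameters, String key)
        {
            String value = parameters.get(key);
            if (value == null) {
                return OptionalLong.empty();
            }
            try {
                return OptionalLong.of(Long.parseLong(value));
            }
            catch (NumberFormatException e) {
                return OptionalLong.empty();
            }
        }

        static void printBasicStatistics(Map<String, String> parameters)
        {
            System.out.println("rows        = " + parseStatistic(parameters, "numRows"));
            System.out.println("files       = " + parseStatistic(parameters, "numFiles"));
            System.out.println("totalSize   = " + parseStatistic(parameters, "totalSize"));
            System.out.println("rawDataSize = " + parseStatistic(parameters, "rawDataSize"));
        }
    }

Because these values come straight from the parameters, only the per-column statistics actually need a metastore-specific code path, which is what the renamed methods in the diff below expose.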
Original file line number Diff line number Diff line change
@@ -13,13 +13,12 @@
*/
package io.trino.plugin.hive;

import com.google.common.collect.ImmutableList;
import io.trino.hive.thrift.metastore.DataOperationType;
import io.trino.plugin.hive.acid.AcidOperation;
import io.trino.plugin.hive.acid.AcidTransaction;
import io.trino.plugin.hive.metastore.AcidTransactionOwner;
import io.trino.plugin.hive.metastore.Column;
import io.trino.plugin.hive.metastore.Database;
import io.trino.plugin.hive.metastore.HiveColumnStatistics;
import io.trino.plugin.hive.metastore.HiveMetastore;
import io.trino.plugin.hive.metastore.HivePrincipal;
import io.trino.plugin.hive.metastore.HivePrivilegeInfo;
@@ -99,37 +98,23 @@ public Set<HiveColumnStatisticType> getSupportedColumnStatistics(Type type)
return delegate.getSupportedColumnStatistics(type);
}

public PartitionStatistics getTableStatistics(String databaseName, String tableName, Optional<Set<String>> columns)
public Map<String, HiveColumnStatistics> getTableColumnStatistics(String databaseName, String tableName, Set<String> columnNames, OptionalLong rowCount)
{
Table table = getExistingTable(databaseName, tableName);
if (columns.isPresent()) {
Set<String> requestedColumnNames = columns.get();
List<Column> requestedColumns = table.getDataColumns().stream()
.filter(column -> requestedColumnNames.contains(column.getName()))
.collect(toImmutableList());
table = Table.builder(table).setDataColumns(requestedColumns).build();
}
return delegate.getTableStatistics(table);
return delegate.getTableColumnStatistics(databaseName, tableName, columnNames, rowCount);
}

public Map<String, PartitionStatistics> getPartitionStatistics(String databaseName, String tableName, Set<String> partitionNames)
public Map<String, Map<String, HiveColumnStatistics>> getPartitionColumnStatistics(
String databaseName,
String tableName,
Map<String, OptionalLong> partitionNamesWithRowCount,
Set<String> columnNames)
{
return getPartitionStatistics(databaseName, tableName, partitionNames, Optional.empty());
return delegate.getPartitionColumnStatistics(databaseName, tableName, partitionNamesWithRowCount, columnNames);
}

public Map<String, PartitionStatistics> getPartitionStatistics(String databaseName, String tableName, Set<String> partitionNames, Optional<Set<String>> columns)
public boolean useSparkTableStatistics()
{
Table table = getExistingTable(databaseName, tableName);
List<Partition> partitions = getExistingPartitionsByNames(table, ImmutableList.copyOf(partitionNames));
if (columns.isPresent()) {
Set<String> requestedColumnNames = columns.get();
List<Column> requestedColumns = table.getDataColumns().stream()
.filter(column -> requestedColumnNames.contains(column.getName()))
.collect(toImmutableList());
table = Table.builder(table).setDataColumns(requestedColumns).build();
partitions = partitions.stream().map(partition -> Partition.builder(partition).setColumns(requestedColumns).build()).collect(toImmutableList());
}
return delegate.getPartitionStatistics(table, partitions);
return delegate.useSparkTableStatistics();
}

public void updateTableStatistics(String databaseName, String tableName, AcidTransaction transaction, StatisticsUpdateMode mode, PartitionStatistics statisticsUpdate)
@@ -267,7 +252,7 @@ public Optional<List<String>> getPartitionNamesByFilter(
return delegate.getPartitionNamesByFilter(databaseName, tableName, columnNames, partitionKeysFilter);
}

private List<Partition> getExistingPartitionsByNames(Table table, List<String> partitionNames)
public List<Partition> getExistingPartitionsByNames(Table table, List<String> partitionNames)
{
Map<String, Partition> partitions = getPartitionsByNames(table, partitionNames).entrySet().stream()
.map(entry -> immutableEntry(entry.getKey(), entry.getValue().orElseThrow(() ->
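Before this change, the wrapper above rebuilt the Table (and each Partition) with only the requested data columns before delegating; the new methods pass the requested column names and row counts straight through. A hypothetical usage sketch against the new signatures, assuming the wrapper class shown above is HiveMetastoreClosure (its file name is not visible in this view) and using made-up database, table, column, and partition names:

    // Assumed to live in io.trino.plugin.hive so the wrapper class is in scope.
    package io.trino.plugin.hive;

    import io.trino.plugin.hive.metastore.HiveColumnStatistics;

    import java.util.Map;
    import java.util.OptionalLong;
    import java.util.Set;

    // Hypothetical caller of the new column-statistics methods; the class name
    // HiveMetastoreClosure and all literal values below are assumptions.
    class ColumnStatisticsUsageSketch
    {
        Map<String, HiveColumnStatistics> tableStatistics(HiveMetastoreClosure metastore)
        {
            // The row count typically comes from the basic statistics stored in the
            // table parameters; it lets the implementation correct the distinct-value count.
            return metastore.getTableColumnStatistics(
                    "example_db", "orders", Set.of("orderkey", "totalprice"), OptionalLong.of(1_500_000));
        }

        Map<String, Map<String, HiveColumnStatistics>> partitionStatistics(HiveMetastoreClosure metastore)
        {
            // One row count per partition name, matching the new map-based parameter.
            return metastore.getPartitionColumnStatistics(
                    "example_db", "orders",
                    Map.of("orderdate=1995-01-01", OptionalLong.of(10_000)),
                    Set.of("orderkey"));
        }
    }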
Original file line number Diff line number Diff line change
@@ -24,7 +24,6 @@
import java.util.Objects;

import static com.google.common.base.MoreObjects.toStringHelper;
import static com.google.common.collect.ImmutableMap.toImmutableMap;
import static java.util.Objects.requireNonNull;

@Immutable
@@ -100,15 +99,6 @@ public PartitionStatistics withBasicStatistics(HiveBasicStatistics basicStatistics
return new PartitionStatistics(basicStatistics, columnStatistics);
}

public PartitionStatistics withEmptyColumnStatisticsRemoved()
{
return new PartitionStatistics(
basicStatistics,
columnStatistics.entrySet().stream()
.filter(entry -> !entry.getValue().equals(HiveColumnStatistics.empty()))
.collect(toImmutableMap(Map.Entry::getKey, Map.Entry::getValue)));
}

public static class Builder
{
private HiveBasicStatistics basicStatistics = HiveBasicStatistics.createEmptyStatistics();
Original file line number Diff line number Diff line change
@@ -48,9 +48,32 @@ public interface HiveMetastore

Set<HiveColumnStatisticType> getSupportedColumnStatistics(Type type);

PartitionStatistics getTableStatistics(Table table);
/**
* @param columnNames Must not be empty.
* @param rowCount row count is required to calculate the number of distinct values, because Hive treats a NULL as a distinct value resulting in a count that is off by one
*/
Map<String, HiveColumnStatistics> getTableColumnStatistics(String databaseName, String tableName, Set<String> columnNames, OptionalLong rowCount);

/**
* @param columnNames Must not be empty.
* @param partitionNamesWithRowCount row count for each partition is required to calculate the number of distinct values, because
* Hive treats a NULL as a distinct value resulting in a count that is off by one
*/
Map<String, Map<String, HiveColumnStatistics>> getPartitionColumnStatistics(
String databaseName,
String tableName,
Map<String, OptionalLong> partitionNamesWithRowCount,
Set<String> columnNames);

Map<String, PartitionStatistics> getPartitionStatistics(Table table, List<Partition> partitions);
/**
* If true, callers should inspect table and partition parameters for Spark statistics.
* This method really only exists for the ThriftHiveMetastore implementation. Spark mixes table and column statistics into the table parameters, and this breaks
* the abstractions of the metastore interface.
*/
default boolean useSparkTableStatistics()
{
return false;
}

void updateTableStatistics(String databaseName, String tableName, AcidTransaction transaction, StatisticsUpdateMode mode, PartitionStatistics statisticsUpdate);

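The new Javadoc above explains why a row count must accompany each statistics request: the distinct-value count Hive stores treats NULL as one of the values, so it is one too high for nullable columns, and the row count also bounds the result. A plausible sketch of that correction, assuming the nulls count and the metastore's raw distinct-value count are at hand; it illustrates the idea and is not necessarily the exact logic in this commit's implementations:

    import java.util.OptionalLong;

    final class DistinctValuesSketch
    {
        private DistinctValuesSketch() {}

        // Hive counts NULL as its own "distinct value", so when the column has any
        // nulls the stored count is one too high. The row count additionally lets
        // us cap the result at the number of non-null rows.
        static OptionalLong adjustedDistinctValuesCount(long metastoreDistinctCount, long nullsCount, OptionalLong rowCount)
        {
            long distinct = metastoreDistinctCount;
            if (nullsCount > 0 && distinct > 0) {
                distinct--; // drop the NULL "value" from the count
            }
            if (rowCount.isPresent()) {
                long nonNullRows = Math.max(rowCount.getAsLong() - nullsCount, 0);
                distinct = Math.min(distinct, nonNullRows);
            }
            return OptionalLong.of(distinct);
        }
    }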
