From e65585de567e7383a967bf3a86075c1508a9c8d6 Mon Sep 17 00:00:00 2001 From: Dain Sundstrom Date: Sat, 10 Dec 2022 18:43:41 -0800 Subject: [PATCH] Add Redshift statistics --- plugin/trino-redshift/pom.xml | 12 +- .../trino/plugin/redshift/RedshiftClient.java | 36 +- .../plugin/redshift/RedshiftClientModule.java | 3 + .../RedshiftTableStatisticsReader.java | 176 +++++++++ .../redshift/TestRedshiftConnectorTest.java | 73 ++++ .../TestRedshiftTableStatisticsReader.java | 349 ++++++++++++++++++ 6 files changed, 642 insertions(+), 7 deletions(-) create mode 100644 plugin/trino-redshift/src/main/java/io/trino/plugin/redshift/RedshiftTableStatisticsReader.java create mode 100644 plugin/trino-redshift/src/test/java/io/trino/plugin/redshift/TestRedshiftTableStatisticsReader.java diff --git a/plugin/trino-redshift/pom.xml b/plugin/trino-redshift/pom.xml index ebcd96279cda..2370ecf35c3e 100644 --- a/plugin/trino-redshift/pom.xml +++ b/plugin/trino-redshift/pom.xml @@ -49,6 +49,11 @@ javax.inject + + org.jdbi + jdbi3-core + + io.airlift @@ -68,12 +73,6 @@ runtime - - org.jdbi - jdbi3-core - runtime - - io.trino @@ -177,6 +176,7 @@ **/TestRedshiftConnectorTest.java + **/TestRedshiftTableStatisticsReader.java **/TestRedshiftTypeMapping.java diff --git a/plugin/trino-redshift/src/main/java/io/trino/plugin/redshift/RedshiftClient.java b/plugin/trino-redshift/src/main/java/io/trino/plugin/redshift/RedshiftClient.java index 1983260e43c9..2183e01bc4d7 100644 --- a/plugin/trino-redshift/src/main/java/io/trino/plugin/redshift/RedshiftClient.java +++ b/plugin/trino-redshift/src/main/java/io/trino/plugin/redshift/RedshiftClient.java @@ -22,6 +22,7 @@ import io.trino.plugin.jdbc.ColumnMapping; import io.trino.plugin.jdbc.ConnectionFactory; import io.trino.plugin.jdbc.JdbcColumnHandle; +import io.trino.plugin.jdbc.JdbcStatisticsConfig; import io.trino.plugin.jdbc.JdbcTableHandle; import io.trino.plugin.jdbc.JdbcTypeHandle; import io.trino.plugin.jdbc.LongWriteFunction; @@ -35,7 +36,10 @@ import io.trino.plugin.jdbc.logging.RemoteQueryModifier; import io.trino.plugin.jdbc.mapping.IdentifierMapping; import io.trino.spi.TrinoException; +import io.trino.spi.connector.ColumnHandle; import io.trino.spi.connector.ConnectorSession; +import io.trino.spi.predicate.TupleDomain; +import io.trino.spi.statistics.TableStatistics; import io.trino.spi.type.CharType; import io.trino.spi.type.Chars; import io.trino.spi.type.DecimalType; @@ -73,6 +77,7 @@ import java.util.function.BiFunction; import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Throwables.throwIfInstanceOf; import static com.google.common.base.Verify.verify; import static io.trino.plugin.jdbc.JdbcErrorCode.JDBC_ERROR; import static io.trino.plugin.jdbc.JdbcErrorCode.JDBC_NON_TRANSIENT_ERROR; @@ -194,10 +199,21 @@ public class RedshiftClient .toFormatter(); private static final OffsetDateTime REDSHIFT_MIN_SUPPORTED_TIMESTAMP_TZ = OffsetDateTime.of(-4712, 1, 1, 0, 0, 0, 0, ZoneOffset.UTC); + private final boolean statisticsEnabled; + private final RedshiftTableStatisticsReader statisticsReader; + @Inject - public RedshiftClient(BaseJdbcConfig config, ConnectionFactory connectionFactory, QueryBuilder queryBuilder, IdentifierMapping identifierMapping, RemoteQueryModifier queryModifier) + public RedshiftClient( + BaseJdbcConfig config, + ConnectionFactory connectionFactory, + JdbcStatisticsConfig statisticsConfig, + QueryBuilder queryBuilder, + IdentifierMapping identifierMapping, + RemoteQueryModifier queryModifier) { super(config, "\"", connectionFactory, queryBuilder, identifierMapping, queryModifier); + this.statisticsEnabled = requireNonNull(statisticsConfig, "statisticsConfig is null").isEnabled(); + this.statisticsReader = new RedshiftTableStatisticsReader(connectionFactory); } @Override @@ -207,6 +223,24 @@ public Optional getTableComment(ResultSet resultSet) return Optional.empty(); } + @Override + public TableStatistics getTableStatistics(ConnectorSession session, JdbcTableHandle handle, TupleDomain tupleDomain) + { + if (!statisticsEnabled) { + return TableStatistics.empty(); + } + if (!handle.isNamedRelation()) { + return TableStatistics.empty(); + } + try { + return statisticsReader.readTableStatistics(session, handle, () -> this.getColumns(session, handle)); + } + catch (SQLException | RuntimeException e) { + throwIfInstanceOf(e, TrinoException.class); + throw new TrinoException(JDBC_ERROR, "Failed fetching statistics for table: " + handle, e); + } + } + @Override protected void renameTable(ConnectorSession session, Connection connection, String catalogName, String remoteSchemaName, String remoteTableName, String newRemoteSchemaName, String newRemoteTableName) throws SQLException diff --git a/plugin/trino-redshift/src/main/java/io/trino/plugin/redshift/RedshiftClientModule.java b/plugin/trino-redshift/src/main/java/io/trino/plugin/redshift/RedshiftClientModule.java index 13635c88f69b..ef4153ee45ef 100644 --- a/plugin/trino-redshift/src/main/java/io/trino/plugin/redshift/RedshiftClientModule.java +++ b/plugin/trino-redshift/src/main/java/io/trino/plugin/redshift/RedshiftClientModule.java @@ -24,6 +24,7 @@ import io.trino.plugin.jdbc.DriverConnectionFactory; import io.trino.plugin.jdbc.ForBaseJdbc; import io.trino.plugin.jdbc.JdbcClient; +import io.trino.plugin.jdbc.JdbcStatisticsConfig; import io.trino.plugin.jdbc.credential.CredentialProvider; import io.trino.plugin.jdbc.ptf.Query; import io.trino.spi.ptf.ConnectorTableFunction; @@ -32,6 +33,7 @@ import static com.google.inject.Scopes.SINGLETON; import static com.google.inject.multibindings.Multibinder.newSetBinder; +import static io.airlift.configuration.ConfigBinder.configBinder; public class RedshiftClientModule extends AbstractConfigurationAwareModule @@ -41,6 +43,7 @@ public void setup(Binder binder) { binder.bind(JdbcClient.class).annotatedWith(ForBaseJdbc.class).to(RedshiftClient.class).in(SINGLETON); newSetBinder(binder, ConnectorTableFunction.class).addBinding().toProvider(Query.class).in(SINGLETON); + configBinder(binder).bindConfig(JdbcStatisticsConfig.class); install(new DecimalModule()); } diff --git a/plugin/trino-redshift/src/main/java/io/trino/plugin/redshift/RedshiftTableStatisticsReader.java b/plugin/trino-redshift/src/main/java/io/trino/plugin/redshift/RedshiftTableStatisticsReader.java new file mode 100644 index 000000000000..c576abdd109d --- /dev/null +++ b/plugin/trino-redshift/src/main/java/io/trino/plugin/redshift/RedshiftTableStatisticsReader.java @@ -0,0 +1,176 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.redshift; + +import io.trino.plugin.jdbc.ConnectionFactory; +import io.trino.plugin.jdbc.JdbcColumnHandle; +import io.trino.plugin.jdbc.JdbcTableHandle; +import io.trino.plugin.jdbc.RemoteTableName; +import io.trino.spi.connector.ConnectorSession; +import io.trino.spi.statistics.ColumnStatistics; +import io.trino.spi.statistics.Estimate; +import io.trino.spi.statistics.TableStatistics; +import org.jdbi.v3.core.Handle; +import org.jdbi.v3.core.Jdbi; + +import java.sql.Connection; +import java.sql.SQLException; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.function.Supplier; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.collect.ImmutableMap.toImmutableMap; +import static java.util.Objects.requireNonNull; +import static java.util.function.Function.identity; + +public class RedshiftTableStatisticsReader +{ + private final ConnectionFactory connectionFactory; + + public RedshiftTableStatisticsReader(ConnectionFactory connectionFactory) + { + this.connectionFactory = requireNonNull(connectionFactory, "connectionFactory is null"); + } + + public TableStatistics readTableStatistics(ConnectorSession session, JdbcTableHandle table, Supplier> columnSupplier) + throws SQLException + { + checkArgument(table.isNamedRelation(), "Relation is not a table: %s", table); + + try (Connection connection = connectionFactory.openConnection(session); + Handle handle = Jdbi.open(connection)) { + StatisticsDao statisticsDao = new StatisticsDao(handle); + + RemoteTableName remoteTableName = table.getRequiredNamedRelation().getRemoteTableName(); + Optional optionalRowCount = readRowCountTableStat(statisticsDao, table); + if (optionalRowCount.isEmpty()) { + // Table not found + return TableStatistics.empty(); + } + long rowCount = optionalRowCount.get(); + + TableStatistics.Builder tableStatistics = TableStatistics.builder() + .setRowCount(Estimate.of(rowCount)); + + if (rowCount == 0) { + return tableStatistics.build(); + } + + Map columnStatistics = statisticsDao.getColumnStatistics(remoteTableName.getSchemaName().orElse(null), remoteTableName.getTableName()).stream() + .collect(toImmutableMap(ColumnStatisticsResult::columnName, identity())); + + for (JdbcColumnHandle column : columnSupplier.get()) { + ColumnStatisticsResult result = columnStatistics.get(column.getColumnName()); + if (result == null) { + continue; + } + + ColumnStatistics statistics = ColumnStatistics.builder() + .setNullsFraction(result.nullsFraction() + .map(Estimate::of) + .orElseGet(Estimate::unknown)) + .setDistinctValuesCount(result.distinctValuesIndicator() + .map(distinctValuesIndicator -> { + // If the distinct value count is an estimate Redshift uses "the negative of the number of distinct values divided by the number of rows + // For example, -1 indicates a unique column in which the number of distinct values is the same as the number of rows." + // https://www.postgresql.org/docs/9.3/view-pg-stats.html + if (distinctValuesIndicator < 0.0) { + return Math.min(-distinctValuesIndicator * rowCount, rowCount); + } + return distinctValuesIndicator; + }) + .map(Estimate::of) + .orElseGet(Estimate::unknown)) + .setDataSize(result.averageColumnLength() + .flatMap(averageColumnLength -> + result.nullsFraction() + .map(nullsFraction -> 1.0 * averageColumnLength * rowCount * (1 - nullsFraction)) + .map(Estimate::of)) + .orElseGet(Estimate::unknown)) + .build(); + + tableStatistics.setColumnStatistics(column, statistics); + } + + return tableStatistics.build(); + } + } + + private static Optional readRowCountTableStat(StatisticsDao statisticsDao, JdbcTableHandle table) + { + RemoteTableName remoteTableName = table.getRequiredNamedRelation().getRemoteTableName(); + Optional rowCount = statisticsDao.getRowCountFromPgClass(remoteTableName.getSchemaName().orElse(null), remoteTableName.getTableName()); + if (rowCount.isEmpty()) { + // Table not found + return Optional.empty(); + } + + if (rowCount.get() == 0) { + // `pg_class.reltuples = 0` may mean an empty table or a recently populated table (CTAS, LOAD or INSERT) + // The `pg_stat_all_tables` view can be way off, so we use it only as a fallback + rowCount = statisticsDao.getRowCountFromPgStat(remoteTableName.getSchemaName().orElse(null), remoteTableName.getTableName()); + } + + return rowCount; + } + + private static class StatisticsDao + { + private final Handle handle; + + public StatisticsDao(Handle handle) + { + this.handle = requireNonNull(handle, "handle is null"); + } + + Optional getRowCountFromPgClass(String schema, String tableName) + { + return handle.createQuery("SELECT reltuples FROM pg_class WHERE relnamespace = (SELECT oid FROM pg_namespace WHERE nspname = :schema) AND relname = :table_name") + .bind("schema", schema) + .bind("table_name", tableName) + .mapTo(Long.class) + .findOne(); + } + + Optional getRowCountFromPgStat(String schema, String tableName) + { + // Redshift does not have the Postgres `n_live_tup`, so estimate from `inserts - deletes` + return handle.createQuery("SELECT n_tup_ins - n_tup_del FROM pg_stat_all_tables WHERE schemaname = :schema AND relname = :table_name") + .bind("schema", schema) + .bind("table_name", tableName) + .mapTo(Long.class) + .findOne(); + } + + List getColumnStatistics(String schema, String tableName) + { + return handle.createQuery("SELECT attname, null_frac, n_distinct, avg_width FROM pg_stats WHERE schemaname = :schema AND tablename = :table_name") + .bind("schema", schema) + .bind("table_name", tableName) + .map((rs, ctx) -> + new ColumnStatisticsResult( + requireNonNull(rs.getString("attname"), "attname is null"), + Optional.of(rs.getFloat("null_frac")), + Optional.of(rs.getFloat("n_distinct")), + Optional.of(rs.getInt("avg_width")))) + .list(); + } + } + + // TODO remove when error prone is updated for Java 17 records + @SuppressWarnings("unused") + private record ColumnStatisticsResult(String columnName, Optional nullsFraction, Optional distinctValuesIndicator, Optional averageColumnLength) {} +} diff --git a/plugin/trino-redshift/src/test/java/io/trino/plugin/redshift/TestRedshiftConnectorTest.java b/plugin/trino-redshift/src/test/java/io/trino/plugin/redshift/TestRedshiftConnectorTest.java index 7e2b12fe85f1..863f308b9554 100644 --- a/plugin/trino-redshift/src/test/java/io/trino/plugin/redshift/TestRedshiftConnectorTest.java +++ b/plugin/trino-redshift/src/test/java/io/trino/plugin/redshift/TestRedshiftConnectorTest.java @@ -29,8 +29,10 @@ import static io.trino.plugin.redshift.RedshiftQueryRunner.TEST_SCHEMA; import static io.trino.plugin.redshift.RedshiftQueryRunner.createRedshiftQueryRunner; +import static io.trino.plugin.redshift.RedshiftQueryRunner.executeInRedshift; import static io.trino.testing.TestingNames.randomNameSuffix; import static java.lang.String.format; +import static java.util.Locale.ENGLISH; import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatThrownBy; @@ -177,6 +179,77 @@ public void testDelete() } } + @Test(dataProvider = "testCaseColumnNamesDataProvider") + public void testCaseColumnNames(String tableName) + { + try { + assertUpdate( + "CREATE TABLE " + TEST_SCHEMA + "." + tableName + + " AS SELECT " + + " custkey AS CASE_UNQUOTED_UPPER, " + + " name AS case_unquoted_lower, " + + " address AS cASe_uNQuoTeD_miXED, " + + " nationkey AS \"CASE_QUOTED_UPPER\", " + + " phone AS \"case_quoted_lower\"," + + " acctbal AS \"CasE_QuoTeD_miXED\" " + + "FROM customer", + 1500); + gatherStats(tableName); + assertQuery( + "SHOW STATS FOR " + TEST_SCHEMA + "." + tableName, + "VALUES " + + "('case_unquoted_upper', NULL, 1485, 0, null, null, null)," + + "('case_unquoted_lower', 33000, 1470, 0, null, null, null)," + + "('case_unquoted_mixed', 42000, 1500, 0, null, null, null)," + + "('case_quoted_upper', NULL, 25, 0, null, null, null)," + + "('case_quoted_lower', 28500, 1483, 0, null, null, null)," + + "('case_quoted_mixed', NULL, 1483, 0, null, null, null)," + + "(null, null, null, null, 1500, null, null)"); + } + finally { + assertUpdate("DROP TABLE IF EXISTS " + tableName); + } + } + + private static void gatherStats(String tableName) + { + executeInRedshift(handle -> { + handle.execute("ANALYZE VERBOSE " + TEST_SCHEMA + "." + tableName); + for (int i = 0; i < 5; i++) { + long actualCount = handle.createQuery("SELECT count(*) FROM " + TEST_SCHEMA + "." + tableName) + .mapTo(Long.class) + .one(); + long estimatedCount = handle.createQuery(""" + SELECT reltuples FROM pg_class + WHERE relnamespace = (SELECT oid FROM pg_namespace WHERE nspname = :schema) + AND relname = :table_name + """) + .bind("schema", TEST_SCHEMA) + .bind("table_name", tableName.toLowerCase(ENGLISH).replace("\"", "")) + .mapTo(Long.class) + .one(); + if (actualCount == estimatedCount) { + return; + } + handle.execute("ANALYZE VERBOSE " + TEST_SCHEMA + "." + tableName); + } + throw new IllegalStateException("Stats not gathered"); // for small test tables reltuples should be exact + }); + } + + @DataProvider + public Object[][] testCaseColumnNamesDataProvider() + { + return new Object[][] { + {"TEST_STATS_MIXED_UNQUOTED_UPPER_" + randomNameSuffix()}, + {"test_stats_mixed_unquoted_lower_" + randomNameSuffix()}, + {"test_stats_mixed_uNQuoTeD_miXED_" + randomNameSuffix()}, + {"\"TEST_STATS_MIXED_QUOTED_UPPER_" + randomNameSuffix() + "\""}, + {"\"test_stats_mixed_quoted_lower_" + randomNameSuffix() + "\""}, + {"\"test_stats_mixed_QuoTeD_miXED_" + randomNameSuffix() + "\""} + }; + } + @Override @Test public void testReadMetadataWithRelationsConcurrentModifications() diff --git a/plugin/trino-redshift/src/test/java/io/trino/plugin/redshift/TestRedshiftTableStatisticsReader.java b/plugin/trino-redshift/src/test/java/io/trino/plugin/redshift/TestRedshiftTableStatisticsReader.java new file mode 100644 index 000000000000..ff713337ea53 --- /dev/null +++ b/plugin/trino-redshift/src/test/java/io/trino/plugin/redshift/TestRedshiftTableStatisticsReader.java @@ -0,0 +1,349 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.redshift; + +import com.amazon.redshift.Driver; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import io.trino.plugin.jdbc.BaseJdbcConfig; +import io.trino.plugin.jdbc.DriverConnectionFactory; +import io.trino.plugin.jdbc.JdbcColumnHandle; +import io.trino.plugin.jdbc.JdbcTableHandle; +import io.trino.plugin.jdbc.JdbcTypeHandle; +import io.trino.plugin.jdbc.RemoteTableName; +import io.trino.plugin.jdbc.credential.StaticCredentialProvider; +import io.trino.spi.connector.ColumnHandle; +import io.trino.spi.connector.SchemaTableName; +import io.trino.spi.statistics.ColumnStatistics; +import io.trino.spi.statistics.Estimate; +import io.trino.spi.statistics.TableStatistics; +import io.trino.spi.type.VarcharType; +import io.trino.testing.AbstractTestQueryFramework; +import io.trino.testing.QueryRunner; +import io.trino.testing.sql.TestTable; +import org.assertj.core.api.InstanceOfAssertFactories; +import org.assertj.core.api.SoftAssertions; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + +import java.sql.Types; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.function.Consumer; + +import static io.trino.plugin.redshift.RedshiftQueryRunner.JDBC_PASSWORD; +import static io.trino.plugin.redshift.RedshiftQueryRunner.JDBC_URL; +import static io.trino.plugin.redshift.RedshiftQueryRunner.JDBC_USER; +import static io.trino.plugin.redshift.RedshiftQueryRunner.TEST_SCHEMA; +import static io.trino.plugin.redshift.RedshiftQueryRunner.createRedshiftQueryRunner; +import static io.trino.plugin.redshift.RedshiftQueryRunner.executeInRedshift; +import static io.trino.spi.type.BigintType.BIGINT; +import static io.trino.spi.type.DoubleType.DOUBLE; +import static io.trino.testing.TestingConnectorSession.SESSION; +import static io.trino.testing.TestingNames.randomNameSuffix; +import static io.trino.testing.sql.TestTable.fromColumns; +import static io.trino.tpch.TpchTable.CUSTOMER; +import static java.util.Collections.emptyMap; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.from; +import static org.assertj.core.api.Assertions.withinPercentage; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertNull; + +public class TestRedshiftTableStatisticsReader + extends AbstractTestQueryFramework +{ + private static final JdbcTypeHandle BIGINT_TYPE_HANDLE = new JdbcTypeHandle(Types.BIGINT, Optional.of("int8"), Optional.empty(), Optional.empty(), Optional.empty(), Optional.empty()); + private static final JdbcTypeHandle DOUBLE_TYPE_HANDLE = new JdbcTypeHandle(Types.DOUBLE, Optional.of("double"), Optional.empty(), Optional.empty(), Optional.empty(), Optional.empty()); + + private static final List CUSTOMER_COLUMNS = ImmutableList.of( + new JdbcColumnHandle("custkey", BIGINT_TYPE_HANDLE, BIGINT), + createVarcharJdbcColumnHandle("name", 25), + createVarcharJdbcColumnHandle("address", 48), + new JdbcColumnHandle("nationkey", BIGINT_TYPE_HANDLE, BIGINT), + createVarcharJdbcColumnHandle("phone", 15), + new JdbcColumnHandle("acctbal", DOUBLE_TYPE_HANDLE, DOUBLE), + createVarcharJdbcColumnHandle("mktsegment", 10), + createVarcharJdbcColumnHandle("comment", 117)); + + private RedshiftTableStatisticsReader statsReader; + + @BeforeClass + public void setup() + { + DriverConnectionFactory connectionFactory = new DriverConnectionFactory( + new Driver(), + new BaseJdbcConfig().setConnectionUrl(JDBC_URL), + new StaticCredentialProvider(Optional.of(JDBC_USER), Optional.of(JDBC_PASSWORD))); + statsReader = new RedshiftTableStatisticsReader(connectionFactory); + } + + @Override + protected QueryRunner createQueryRunner() + throws Exception + { + return createRedshiftQueryRunner(Map.of(), Map.of(), ImmutableList.of(CUSTOMER)); + } + + @Test + public void testCustomerTable() + throws Exception + { + assertThat(collectStats("SELECT * FROM " + TEST_SCHEMA + ".customer", CUSTOMER_COLUMNS)) + .returns(Estimate.of(1500), from(TableStatistics::getRowCount)) + .extracting(TableStatistics::getColumnStatistics, InstanceOfAssertFactories.map(ColumnHandle.class, ColumnStatistics.class)) + .hasEntrySatisfying(CUSTOMER_COLUMNS.get(0), statsCloseTo(1500.0, 0.0, 8.0 * 1500)) + .hasEntrySatisfying(CUSTOMER_COLUMNS.get(1), statsCloseTo(1500.0, 0.0, 33000.0)) + .hasEntrySatisfying(CUSTOMER_COLUMNS.get(3), statsCloseTo(25.000, 0.0, 8.0 * 1500)) + .hasEntrySatisfying(CUSTOMER_COLUMNS.get(5), statsCloseTo(1499.0, 0.0, 8.0 * 1500)); + } + + @Test + public void testEmptyTable() + throws Exception + { + TableStatistics tableStatistics = collectStats("SELECT * FROM " + TEST_SCHEMA + ".customer WHERE false", CUSTOMER_COLUMNS); + assertThat(tableStatistics) + .returns(Estimate.of(0.0), from(TableStatistics::getRowCount)) + .returns(emptyMap(), from(TableStatistics::getColumnStatistics)); + } + + @Test + public void testAllNulls() + throws Exception + { + String tableName = "testallnulls_" + randomNameSuffix(); + String schemaAndTable = TEST_SCHEMA + "." + tableName; + try { + executeInRedshift("CREATE TABLE " + schemaAndTable + " (i BIGINT)"); + executeInRedshift("INSERT INTO " + schemaAndTable + " (i) VALUES (NULL)"); + executeInRedshift("ANALYZE VERBOSE " + schemaAndTable); + + TableStatistics stats = statsReader.readTableStatistics( + SESSION, + new JdbcTableHandle( + new SchemaTableName(TEST_SCHEMA, tableName), + new RemoteTableName(Optional.empty(), Optional.of(TEST_SCHEMA), tableName), + Optional.empty()), + () -> ImmutableList.of(new JdbcColumnHandle("i", BIGINT_TYPE_HANDLE, BIGINT))); + assertThat(stats) + .returns(Estimate.of(1.0), from(TableStatistics::getRowCount)) + .returns(emptyMap(), from(TableStatistics::getColumnStatistics)); + } + finally { + executeInRedshift("DROP TABLE IF EXISTS " + schemaAndTable); + } + } + + @Test + public void testNullsFraction() + throws Exception + { + JdbcColumnHandle custkeyColumnHandle = CUSTOMER_COLUMNS.get(0); + TableStatistics stats = collectStats( + "SELECT CASE custkey % 3 WHEN 0 THEN NULL ELSE custkey END FROM " + TEST_SCHEMA + ".customer", + ImmutableList.of(custkeyColumnHandle)); + assertEquals(stats.getRowCount(), Estimate.of(1500)); + + ColumnStatistics columnStatistics = stats.getColumnStatistics().get(custkeyColumnHandle); + assertThat(columnStatistics.getNullsFraction().getValue()).isCloseTo(1.0 / 3, withinPercentage(1)); + } + + @Test + public void testAverageColumnLength() + throws Exception + { + List columns = ImmutableList.of( + new JdbcColumnHandle("custkey", BIGINT_TYPE_HANDLE, BIGINT), + createVarcharJdbcColumnHandle("v3_in_3", 3), + createVarcharJdbcColumnHandle("v3_in_42", 42), + createVarcharJdbcColumnHandle("single_10v_value", 10), + createVarcharJdbcColumnHandle("half_10v_value", 10), + createVarcharJdbcColumnHandle("half_distinct_20v_value", 20), + createVarcharJdbcColumnHandle("all_nulls", 10)); + + assertThat( + collectStats( + "SELECT " + + " custkey, " + + " 'abc' v3_in_3, " + + " CAST('abc' AS varchar(42)) v3_in_42, " + + " CASE custkey WHEN 1 THEN '0123456789' ELSE NULL END single_10v_value, " + + " CASE custkey % 2 WHEN 0 THEN '0123456789' ELSE NULL END half_10v_value, " + + " CASE custkey % 2 WHEN 0 THEN CAST((1000000 - custkey) * (1000000 - custkey) AS varchar(20)) ELSE NULL END half_distinct_20v_value, " + // 12 chars each + " CAST(NULL AS varchar(10)) all_nulls " + + "FROM " + TEST_SCHEMA + ".customer " + + "ORDER BY custkey LIMIT 100", + columns)) + .returns(Estimate.of(100), from(TableStatistics::getRowCount)) + .extracting(TableStatistics::getColumnStatistics, InstanceOfAssertFactories.map(ColumnHandle.class, ColumnStatistics.class)) + .hasEntrySatisfying(columns.get(0), statsCloseTo(100.0, 0.0, 800)) + .hasEntrySatisfying(columns.get(1), statsCloseTo(1.0, 0.0, 700.0)) + .hasEntrySatisfying(columns.get(2), statsCloseTo(1.0, 0.0, 700)) + .hasEntrySatisfying(columns.get(3), statsCloseTo(1.0, 0.99, 14)) + .hasEntrySatisfying(columns.get(4), statsCloseTo(1.0, 0.5, 700)) + .hasEntrySatisfying(columns.get(5), statsCloseTo(51, 0.5, 800)) + .satisfies(stats -> assertNull(stats.get(columns.get(6)))); + } + + @Test + public void testView() + throws Exception + { + String tableName = "test_stats_view_" + randomNameSuffix(); + String schemaAndTable = TEST_SCHEMA + "." + tableName; + List columns = ImmutableList.of( + new JdbcColumnHandle("custkey", BIGINT_TYPE_HANDLE, BIGINT), + createVarcharJdbcColumnHandle("mktsegment", 10), + createVarcharJdbcColumnHandle("comment", 117)); + + try { + executeInRedshift("CREATE OR REPLACE VIEW " + schemaAndTable + " AS SELECT custkey, mktsegment, comment FROM " + TEST_SCHEMA + ".customer"); + TableStatistics tableStatistics = statsReader.readTableStatistics( + SESSION, + new JdbcTableHandle( + new SchemaTableName(TEST_SCHEMA, tableName), + new RemoteTableName(Optional.empty(), Optional.of(TEST_SCHEMA), tableName), + Optional.empty()), + () -> columns); + assertThat(tableStatistics).isEqualTo(TableStatistics.empty()); + } + finally { + executeInRedshift("DROP VIEW IF EXISTS " + schemaAndTable); + } + } + + @Test + public void testMaterializedView() + throws Exception + { + String tableName = "test_stats_materialized_view_" + randomNameSuffix(); + String schemaAndTable = TEST_SCHEMA + "." + tableName; + List columns = ImmutableList.of( + new JdbcColumnHandle("custkey", BIGINT_TYPE_HANDLE, BIGINT), + createVarcharJdbcColumnHandle("mktsegment", 10), + createVarcharJdbcColumnHandle("comment", 117)); + + try { + executeInRedshift("CREATE MATERIALIZED VIEW " + schemaAndTable + + " AS SELECT custkey, mktsegment, comment FROM " + TEST_SCHEMA + ".customer"); + executeInRedshift("REFRESH MATERIALIZED VIEW " + schemaAndTable); + executeInRedshift("ANALYZE VERBOSE " + schemaAndTable); + TableStatistics tableStatistics = statsReader.readTableStatistics( + SESSION, + new JdbcTableHandle( + new SchemaTableName(TEST_SCHEMA, tableName), + new RemoteTableName(Optional.empty(), Optional.of(TEST_SCHEMA), tableName), + Optional.empty()), + () -> columns); + assertThat(tableStatistics).isEqualTo(TableStatistics.empty()); + } + finally { + executeInRedshift("DROP MATERIALIZED VIEW " + schemaAndTable); + } + } + + @Test + public void testNumericCornerCases() + { + try (TestTable table = fromColumns( + getQueryRunner()::execute, + "test_numeric_corner_cases_", + ImmutableMap.>builder() + .put("only_negative_infinity double", List.of("-infinity()", "-infinity()", "-infinity()", "-infinity()")) + .put("only_positive_infinity double", List.of("infinity()", "infinity()", "infinity()", "infinity()")) + .put("mixed_infinities double", List.of("-infinity()", "infinity()", "-infinity()", "infinity()")) + .put("mixed_infinities_and_numbers double", List.of("-infinity()", "infinity()", "-5.0", "7.0")) + .put("nans_only double", List.of("nan()", "nan()")) + .put("nans_and_numbers double", List.of("nan()", "nan()", "-5.0", "7.0")) + .put("large_doubles double", List.of("CAST(-50371909150609548946090.0 AS DOUBLE)", "CAST(50371909150609548946090.0 AS DOUBLE)")) // 2^77 DIV 3 + .put("short_decimals_big_fraction decimal(16,15)", List.of("-1.234567890123456", "1.234567890123456")) + .put("short_decimals_big_integral decimal(16,1)", List.of("-123456789012345.6", "123456789012345.6")) + .put("long_decimals_big_fraction decimal(38,37)", List.of("-1.2345678901234567890123456789012345678", "1.2345678901234567890123456789012345678")) + .put("long_decimals_middle decimal(38,16)", List.of("-1234567890123456.7890123456789012345678", "1234567890123456.7890123456789012345678")) + .put("long_decimals_big_integral decimal(38,1)", List.of("-1234567890123456789012345678901234567.8", "1234567890123456789012345678901234567.8")) + .buildOrThrow(), + "null")) { + executeInRedshift("ANALYZE VERBOSE " + TEST_SCHEMA + "." + table.getName()); + assertQuery( + "SHOW STATS FOR " + table.getName(), + "VALUES " + + "('only_negative_infinity', null, 1, 0, null, null, null)," + + "('only_positive_infinity', null, 1, 0, null, null, null)," + + "('mixed_infinities', null, 2, 0, null, null, null)," + + "('mixed_infinities_and_numbers', null, 4.0, 0.0, null, null, null)," + + "('nans_only', null, 1.0, 0.5, null, null, null)," + + "('nans_and_numbers', null, 3.0, 0.0, null, null, null)," + + "('large_doubles', null, 2.0, 0.5, null, null, null)," + + "('short_decimals_big_fraction', null, 2.0, 0.5, null, null, null)," + + "('short_decimals_big_integral', null, 2.0, 0.5, null, null, null)," + + "('long_decimals_big_fraction', null, 2.0, 0.5, null, null, null)," + + "('long_decimals_middle', null, 2.0, 0.5, null, null, null)," + + "('long_decimals_big_integral', null, 2.0, 0.5, null, null, null)," + + "(null, null, null, null, 4, null, null)"); + } + } + + /** + * Assert that the given column is within 5% of each statistic in the parameters, and that it has no range + */ + private static Consumer statsCloseTo(double distinctValues, double nullsFraction, double dataSize) + { + return stats -> { + SoftAssertions softly = new SoftAssertions(); + + softly.assertThat(stats.getDistinctValuesCount().getValue()) + .isCloseTo(distinctValues, withinPercentage(5.0)); + + softly.assertThat(stats.getNullsFraction().getValue()) + .isCloseTo(nullsFraction, withinPercentage(5.0)); + + softly.assertThat(stats.getDataSize().getValue()) + .isCloseTo(dataSize, withinPercentage(5.0)); + + softly.assertThat(stats.getRange()).isEmpty(); + softly.assertAll(); + }; + } + + private TableStatistics collectStats(String values, List columnHandles) + throws Exception + { + String tableName = "testredshiftstatisticsreader_" + randomNameSuffix(); + String schemaAndTable = TEST_SCHEMA + "." + tableName; + try { + executeInRedshift("CREATE TABLE " + schemaAndTable + " AS " + values); + executeInRedshift("ANALYZE VERBOSE " + schemaAndTable); + return statsReader.readTableStatistics( + SESSION, + new JdbcTableHandle( + new SchemaTableName(TEST_SCHEMA, tableName), + new RemoteTableName(Optional.empty(), Optional.of(TEST_SCHEMA), tableName), + Optional.empty()), + () -> columnHandles); + } + finally { + executeInRedshift("DROP TABLE IF EXISTS " + schemaAndTable); + } + } + + private static JdbcColumnHandle createVarcharJdbcColumnHandle(String name, int length) + { + return new JdbcColumnHandle( + name, + new JdbcTypeHandle(Types.VARCHAR, Optional.of("varchar"), Optional.of(length), Optional.empty(), Optional.empty(), Optional.empty()), + VarcharType.createVarcharType(length)); + } +}