From b69938fa81d00b4e9db0abe86b6e2f9e5985feef Mon Sep 17 00:00:00 2001 From: "Adam J. Shook" Date: Wed, 19 Oct 2022 17:03:11 +0000 Subject: [PATCH] Add dataset cache to BigQuery connector When caseInsensitiveNameMatching=true, the BigQueryClient makes a call to listDatasets for each call on toRemoteDataset. This is problematic when running queries like SHOW SCHEMAS since listDatasets is called once to get the initial list, then once again for each schema. This is expensive if the BigQuery project has a large number of datasets. This commit adds a cache to listing datasets to drastically reduce the query time for SHOW SCHEMAS. --- docs/src/main/sphinx/connector/bigquery.rst | 2 + plugin/trino-bigquery/pom.xml | 2 + .../trino/plugin/bigquery/BigQueryClient.java | 26 +++++++- .../bigquery/BigQueryClientFactory.java | 5 +- .../trino/plugin/bigquery/BigQueryConfig.java | 17 ++++++ .../plugin/bigquery/TestBigQueryConfig.java | 4 ++ .../bigquery/TestBigQueryMetadataCaching.java | 61 +++++++++++++++++++ 7 files changed, 114 insertions(+), 3 deletions(-) create mode 100644 plugin/trino-bigquery/src/test/java/io/trino/plugin/bigquery/TestBigQueryMetadataCaching.java diff --git a/docs/src/main/sphinx/connector/bigquery.rst b/docs/src/main/sphinx/connector/bigquery.rst index 409a730c3766..25d8cf5a044e 100644 --- a/docs/src/main/sphinx/connector/bigquery.rst +++ b/docs/src/main/sphinx/connector/bigquery.rst @@ -133,6 +133,8 @@ Property Description ``BIGNUMERIC`` and ``TIMESTAMP`` types are unsupported. ``false`` ``bigquery.views-cache-ttl`` Duration for which the materialization of a view will be ``15m`` cached and reused. Set to ``0ms`` to disable the cache. +``bigquery.metadata.cache-ttl`` Duration for which metadata retrieved from BigQuery ``0ms`` + is cached and reused. Set to ``0ms`` to disable the cache. ``bigquery.max-read-rows-retries`` The number of retries in case of retryable server issues ``3`` ``bigquery.credentials-key`` The base64 encoded credentials key None. See the `requirements <#requirements>`_ section. ``bigquery.credentials-file`` The path to the JSON credentials file None. See the `requirements <#requirements>`_ section. diff --git a/plugin/trino-bigquery/pom.xml b/plugin/trino-bigquery/pom.xml index 66da8c5fdc99..d926ccf7f659 100644 --- a/plugin/trino-bigquery/pom.xml +++ b/plugin/trino-bigquery/pom.xml @@ -381,6 +381,7 @@ **/TestBigQueryConnectorTest.java + **/TestBigQueryMetadataCaching.java **/TestBigQueryTypeMapping.java **/TestBigQueryMetadata.java **/TestBigQueryInstanceCleaner.java @@ -415,6 +416,7 @@ **/TestBigQueryConnectorTest.java + **/TestBigQueryMetadataCaching.java **/TestBigQueryTypeMapping.java **/TestBigQueryMetadata.java **/TestBigQueryInstanceCleaner.java diff --git a/plugin/trino-bigquery/src/main/java/io/trino/plugin/bigquery/BigQueryClient.java b/plugin/trino-bigquery/src/main/java/io/trino/plugin/bigquery/BigQueryClient.java index 3e57a3300ff3..5deb24871df2 100644 --- a/plugin/trino-bigquery/src/main/java/io/trino/plugin/bigquery/BigQueryClient.java +++ b/plugin/trino-bigquery/src/main/java/io/trino/plugin/bigquery/BigQueryClient.java @@ -33,9 +33,12 @@ import com.google.cloud.bigquery.TableInfo; import com.google.cloud.bigquery.TableResult; import com.google.cloud.http.BaseHttpServiceException; +import com.google.common.cache.CacheLoader; +import com.google.common.cache.LoadingCache; import com.google.common.collect.ImmutableSet; import io.airlift.log.Logger; import io.airlift.units.Duration; +import io.trino.collect.cache.EvictableCacheBuilder; import io.trino.spi.TrinoException; import io.trino.spi.connector.TableNotFoundException; @@ -45,6 +48,7 @@ import java.util.Map; import java.util.Optional; import java.util.Set; +import java.util.concurrent.ExecutionException; import java.util.function.Supplier; import static com.google.cloud.bigquery.JobStatistics.QueryStatistics.StatementType.SELECT; @@ -58,9 +62,11 @@ import static io.trino.plugin.bigquery.BigQueryErrorCode.BIGQUERY_AMBIGUOUS_OBJECT_NAME; import static io.trino.plugin.bigquery.BigQueryErrorCode.BIGQUERY_FAILED_TO_EXECUTE_QUERY; import static io.trino.plugin.bigquery.BigQueryErrorCode.BIGQUERY_INVALID_STATEMENT; +import static io.trino.plugin.bigquery.BigQueryErrorCode.BIGQUERY_LISTING_DATASET_ERROR; import static java.lang.String.format; import static java.util.Locale.ENGLISH; import static java.util.Objects.requireNonNull; +import static java.util.concurrent.TimeUnit.MILLISECONDS; import static java.util.stream.Collectors.joining; public class BigQueryClient @@ -70,12 +76,17 @@ public class BigQueryClient private final BigQuery bigQuery; private final ViewMaterializationCache materializationCache; private final boolean caseInsensitiveNameMatching; + private final LoadingCache> remoteDatasetCache; - public BigQueryClient(BigQuery bigQuery, boolean caseInsensitiveNameMatching, ViewMaterializationCache materializationCache) + public BigQueryClient(BigQuery bigQuery, boolean caseInsensitiveNameMatching, ViewMaterializationCache materializationCache, Duration metadataCacheTtl) { this.bigQuery = requireNonNull(bigQuery, "bigQuery is null"); this.materializationCache = requireNonNull(materializationCache, "materializationCache is null"); this.caseInsensitiveNameMatching = caseInsensitiveNameMatching; + this.remoteDatasetCache = EvictableCacheBuilder.newBuilder() + .expireAfterWrite(metadataCacheTtl.toMillis(), MILLISECONDS) + .shareNothingWhenDisabled() + .build(CacheLoader.from(this::listDatasetsFromBigQuery)); } public Optional toRemoteDataset(String projectId, String datasetName) @@ -175,7 +186,18 @@ public String getProjectId() public Iterable listDatasets(String projectId) { - return bigQuery.listDatasets(projectId).iterateAll(); + try { + return remoteDatasetCache.get(projectId); + } + catch (ExecutionException e) { + throw new TrinoException(BIGQUERY_LISTING_DATASET_ERROR, "Failed to retrieve datasets from BigQuery", e); + } + } + + private List listDatasetsFromBigQuery(String projectId) + { + return stream(bigQuery.listDatasets(projectId).iterateAll()) + .collect(toImmutableList()); } public Iterable listTables(DatasetId remoteDatasetId, TableDefinition.Type... types) diff --git a/plugin/trino-bigquery/src/main/java/io/trino/plugin/bigquery/BigQueryClientFactory.java b/plugin/trino-bigquery/src/main/java/io/trino/plugin/bigquery/BigQueryClientFactory.java index 2d856f8d1575..edbcb266f593 100644 --- a/plugin/trino-bigquery/src/main/java/io/trino/plugin/bigquery/BigQueryClientFactory.java +++ b/plugin/trino-bigquery/src/main/java/io/trino/plugin/bigquery/BigQueryClientFactory.java @@ -19,6 +19,7 @@ import com.google.cloud.bigquery.BigQuery; import com.google.cloud.bigquery.BigQueryOptions; import com.google.common.cache.CacheBuilder; +import io.airlift.units.Duration; import io.trino.collect.cache.NonEvictableCache; import io.trino.spi.connector.ConnectorSession; @@ -40,6 +41,7 @@ public class BigQueryClientFactory private final ViewMaterializationCache materializationCache; private final HeaderProvider headerProvider; private final NonEvictableCache clientCache; + private final Duration metadataCacheTtl; @Inject public BigQueryClientFactory( @@ -56,6 +58,7 @@ public BigQueryClientFactory( this.caseInsensitiveNameMatching = bigQueryConfig.isCaseInsensitiveNameMatching(); this.materializationCache = requireNonNull(materializationCache, "materializationCache is null"); this.headerProvider = requireNonNull(headerProvider, "headerProvider is null"); + this.metadataCacheTtl = bigQueryConfig.getMetadataCacheTtl(); CacheBuilder cacheBuilder = CacheBuilder.newBuilder() .expireAfterWrite(bigQueryConfig.getServiceCacheTtl().toMillis(), MILLISECONDS); @@ -72,7 +75,7 @@ public BigQueryClient create(ConnectorSession session) protected BigQueryClient createBigQueryClient(ConnectorSession session) { - return new BigQueryClient(createBigQuery(session), caseInsensitiveNameMatching, materializationCache); + return new BigQueryClient(createBigQuery(session), caseInsensitiveNameMatching, materializationCache, metadataCacheTtl); } protected BigQuery createBigQuery(ConnectorSession session) diff --git a/plugin/trino-bigquery/src/main/java/io/trino/plugin/bigquery/BigQueryConfig.java b/plugin/trino-bigquery/src/main/java/io/trino/plugin/bigquery/BigQueryConfig.java index 81606f7cd4a0..8bc914eb8937 100644 --- a/plugin/trino-bigquery/src/main/java/io/trino/plugin/bigquery/BigQueryConfig.java +++ b/plugin/trino-bigquery/src/main/java/io/trino/plugin/bigquery/BigQueryConfig.java @@ -29,6 +29,7 @@ import static com.google.common.base.Preconditions.checkState; import static java.util.concurrent.TimeUnit.HOURS; +import static java.util.concurrent.TimeUnit.MILLISECONDS; import static java.util.concurrent.TimeUnit.MINUTES; @DefunctConfig("bigquery.case-insensitive-name-matching.cache-ttl") @@ -51,6 +52,7 @@ public class BigQueryConfig private boolean caseInsensitiveNameMatching; private Duration viewsCacheTtl = new Duration(15, MINUTES); private Duration serviceCacheTtl = new Duration(3, MINUTES); + private Duration metadataCacheTtl = new Duration(0, MILLISECONDS); private boolean queryResultsCacheEnabled; private int rpcInitialChannelCount = 1; @@ -222,6 +224,21 @@ public BigQueryConfig setServiceCacheTtl(Duration serviceCacheTtl) return this; } + @NotNull + @MinDuration("0ms") + public Duration getMetadataCacheTtl() + { + return metadataCacheTtl; + } + + @Config("bigquery.metadata.cache-ttl") + @ConfigDescription("Duration for which BigQuery client metadata is cached after listing") + public BigQueryConfig setMetadataCacheTtl(Duration metadataCacheTtl) + { + this.metadataCacheTtl = metadataCacheTtl; + return this; + } + public boolean isQueryResultsCacheEnabled() { return queryResultsCacheEnabled; diff --git a/plugin/trino-bigquery/src/test/java/io/trino/plugin/bigquery/TestBigQueryConfig.java b/plugin/trino-bigquery/src/test/java/io/trino/plugin/bigquery/TestBigQueryConfig.java index 285517849e71..08d4525112fd 100644 --- a/plugin/trino-bigquery/src/test/java/io/trino/plugin/bigquery/TestBigQueryConfig.java +++ b/plugin/trino-bigquery/src/test/java/io/trino/plugin/bigquery/TestBigQueryConfig.java @@ -24,6 +24,7 @@ import static io.airlift.configuration.testing.ConfigAssertions.recordDefaults; import static java.util.concurrent.TimeUnit.DAYS; import static java.util.concurrent.TimeUnit.HOURS; +import static java.util.concurrent.TimeUnit.MILLISECONDS; import static java.util.concurrent.TimeUnit.MINUTES; import static org.assertj.core.api.Assertions.assertThatThrownBy; @@ -44,6 +45,7 @@ public void testDefaults() .setCaseInsensitiveNameMatching(false) .setViewsCacheTtl(new Duration(15, MINUTES)) .setServiceCacheTtl(new Duration(3, MINUTES)) + .setMetadataCacheTtl(new Duration(0, MILLISECONDS)) .setViewsEnabled(false) .setQueryResultsCacheEnabled(false) .setRpcInitialChannelCount(1) @@ -69,6 +71,7 @@ public void testExplicitPropertyMappingsWithCredentialsKey() .put("bigquery.case-insensitive-name-matching", "true") .put("bigquery.views-cache-ttl", "1m") .put("bigquery.service-cache-ttl", "10d") + .put("bigquery.metadata.cache-ttl", "5d") .put("bigquery.query-results-cache.enabled", "true") .put("bigquery.channel-pool.initial-size", "11") .put("bigquery.channel-pool.min-size", "12") @@ -90,6 +93,7 @@ public void testExplicitPropertyMappingsWithCredentialsKey() .setCaseInsensitiveNameMatching(true) .setViewsCacheTtl(new Duration(1, MINUTES)) .setServiceCacheTtl(new Duration(10, DAYS)) + .setMetadataCacheTtl(new Duration(5, DAYS)) .setQueryResultsCacheEnabled(true) .setRpcInitialChannelCount(11) .setRpcMinChannelCount(12) diff --git a/plugin/trino-bigquery/src/test/java/io/trino/plugin/bigquery/TestBigQueryMetadataCaching.java b/plugin/trino-bigquery/src/test/java/io/trino/plugin/bigquery/TestBigQueryMetadataCaching.java new file mode 100644 index 000000000000..4bf645a0397d --- /dev/null +++ b/plugin/trino-bigquery/src/test/java/io/trino/plugin/bigquery/TestBigQueryMetadataCaching.java @@ -0,0 +1,61 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.bigquery; + +import com.google.common.collect.ImmutableMap; +import io.trino.testing.AbstractTestQueryFramework; +import io.trino.testing.QueryRunner; +import org.testng.annotations.Test; + +import static io.trino.plugin.bigquery.BigQueryQueryRunner.BigQuerySqlExecutor; +import static io.trino.testing.sql.TestTable.randomTableSuffix; +import static org.testng.Assert.assertEquals; + +public class TestBigQueryMetadataCaching + extends AbstractTestQueryFramework +{ + protected BigQuerySqlExecutor bigQuerySqlExecutor; + + @Override + protected QueryRunner createQueryRunner() + throws Exception + { + this.bigQuerySqlExecutor = new BigQuerySqlExecutor(); + return BigQueryQueryRunner.createQueryRunner( + ImmutableMap.of(), + ImmutableMap.of("bigquery.metadata.cache-ttl", "5m")); + } + + @Test + public void testMetadataCaching() + { + String schema = "test_metadata_caching_" + randomTableSuffix(); + try { + getQueryRunner().execute("CREATE SCHEMA " + schema); + assertEquals(getQueryRunner().execute("SHOW SCHEMAS IN bigquery LIKE '" + schema + "'").getOnlyValue(), schema); + + String schemaTableName = schema + ".test_metadata_caching"; + getQueryRunner().execute("CREATE TABLE " + schemaTableName + " AS SELECT * FROM tpch.tiny.region"); + assertEquals(getQueryRunner().execute("SELECT * FROM " + schemaTableName).getRowCount(), 5); + + bigQuerySqlExecutor.execute("DROP SCHEMA " + schema + " CASCADE"); + assertEquals(getQueryRunner().execute("SHOW SCHEMAS IN bigquery LIKE '" + schema + "'").getOnlyValue(), schema); + + assertQueryFails("SELECT * FROM " + schemaTableName, ".*Schema '.+' does not exist.*"); + } + finally { + bigQuerySqlExecutor.execute("DROP SCHEMA IF EXISTS " + schema + " CASCADE"); + } + } +}