From 75d7d38d9cde23a26cc92cc117201e8aa74bad94 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Tue, 14 Jan 2025 15:07:06 +0800 Subject: [PATCH] [#5361] improvment(hadoop-catalog): Introduce timeout mechanism to get Hadoop File System. (#6221) ### What changes were proposed in this pull request? Introduce a timeout mechanism when getting a Hadoop FileSystem instance. ### Why are the changes needed? Cloud filesystem like S3 and OSS(10 minutes) has a very long connection and can't be tune by configuration, this will cause deadlock as it will hold the tree lock for a long time Fix: #5361 Fix: #6156 ### Does this PR introduce _any_ user-facing change? N/A. ### How was this patch tested? Existing test. Co-authored-by: Qi Yu --- LICENSE.bin | 1 + catalogs/catalog-hadoop/build.gradle.kts | 1 + .../hadoop/HadoopCatalogOperations.java | 35 ++++++++++++++++++- .../HadoopCatalogPropertiesMetadata.java | 11 ++++++ .../apache/gravitino/lock/LockManager.java | 9 +++-- docs/hadoop-catalog.md | 9 ++--- 6 files changed, 59 insertions(+), 7 deletions(-) diff --git a/LICENSE.bin b/LICENSE.bin index effaa4ac4a2..d1dddd52795 100644 --- a/LICENSE.bin +++ b/LICENSE.bin @@ -374,6 +374,7 @@ Apache Arrow Rome Jettison + Awaitility This product bundles various third-party components also under the Apache Software Foundation License 1.1 diff --git a/catalogs/catalog-hadoop/build.gradle.kts b/catalogs/catalog-hadoop/build.gradle.kts index d599a5e72f1..3108d993c1a 100644 --- a/catalogs/catalog-hadoop/build.gradle.kts +++ b/catalogs/catalog-hadoop/build.gradle.kts @@ -54,6 +54,7 @@ dependencies { exclude("org.fusesource.leveldbjni") } implementation(libs.slf4j.api) + implementation(libs.awaitility) compileOnly(libs.guava) diff --git a/catalogs/catalog-hadoop/src/main/java/org/apache/gravitino/catalog/hadoop/HadoopCatalogOperations.java b/catalogs/catalog-hadoop/src/main/java/org/apache/gravitino/catalog/hadoop/HadoopCatalogOperations.java index 36177bea37f..6c032414be5 100644 --- a/catalogs/catalog-hadoop/src/main/java/org/apache/gravitino/catalog/hadoop/HadoopCatalogOperations.java +++ b/catalogs/catalog-hadoop/src/main/java/org/apache/gravitino/catalog/hadoop/HadoopCatalogOperations.java @@ -31,6 +31,8 @@ import java.util.List; import java.util.Map; import java.util.Optional; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicReference; import org.apache.commons.lang3.StringUtils; import org.apache.gravitino.Catalog; import org.apache.gravitino.Entity; @@ -71,6 +73,8 @@ import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; +import org.awaitility.Awaitility; +import org.awaitility.core.ConditionTimeoutException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -755,6 +759,35 @@ FileSystem getFileSystem(Path path, Map config) throws IOExcepti scheme, path, fileSystemProvidersMap.keySet(), fileSystemProvidersMap.values())); } - return provider.getFileSystem(path, config); + int timeoutSeconds = + (int) + propertiesMetadata + .catalogPropertiesMetadata() + .getOrDefault( + config, HadoopCatalogPropertiesMetadata.FILESYSTEM_CONNECTION_TIMEOUT_SECONDS); + try { + AtomicReference fileSystem = new AtomicReference<>(); + Awaitility.await() + .atMost(timeoutSeconds, TimeUnit.SECONDS) + .until( + () -> { + fileSystem.set(provider.getFileSystem(path, config)); + return true; + }); + return fileSystem.get(); + } catch (ConditionTimeoutException e) { + throw new IOException( + String.format( + "Failed to get FileSystem for path: %s, scheme: %s, provider: %s, config: %s within %s " + + "seconds, please check the configuration or increase the " + + "file system connection timeout time by setting catalog property: %s", + path, + scheme, + provider, + config, + timeoutSeconds, + HadoopCatalogPropertiesMetadata.FILESYSTEM_CONNECTION_TIMEOUT_SECONDS), + e); + } } } diff --git a/catalogs/catalog-hadoop/src/main/java/org/apache/gravitino/catalog/hadoop/HadoopCatalogPropertiesMetadata.java b/catalogs/catalog-hadoop/src/main/java/org/apache/gravitino/catalog/hadoop/HadoopCatalogPropertiesMetadata.java index 22cf0d5b2cd..3bdc125efc8 100644 --- a/catalogs/catalog-hadoop/src/main/java/org/apache/gravitino/catalog/hadoop/HadoopCatalogPropertiesMetadata.java +++ b/catalogs/catalog-hadoop/src/main/java/org/apache/gravitino/catalog/hadoop/HadoopCatalogPropertiesMetadata.java @@ -53,6 +53,9 @@ public class HadoopCatalogPropertiesMetadata extends BaseCatalogPropertiesMetada */ public static final String DEFAULT_FS_PROVIDER = "default-filesystem-provider"; + static final String FILESYSTEM_CONNECTION_TIMEOUT_SECONDS = "filesystem-conn-timeout-secs"; + static final int DEFAULT_GET_FILESYSTEM_TIMEOUT_SECONDS = 6; + public static final String BUILTIN_LOCAL_FS_PROVIDER = "builtin-local"; public static final String BUILTIN_HDFS_FS_PROVIDER = "builtin-hdfs"; @@ -82,6 +85,14 @@ public class HadoopCatalogPropertiesMetadata extends BaseCatalogPropertiesMetada false /* immutable */, BUILTIN_LOCAL_FS_PROVIDER, // please see LocalFileSystemProvider#name() false /* hidden */)) + .put( + FILESYSTEM_CONNECTION_TIMEOUT_SECONDS, + PropertyEntry.integerOptionalPropertyEntry( + FILESYSTEM_CONNECTION_TIMEOUT_SECONDS, + "Timeout to wait for to create the Hadoop file system client instance.", + false /* immutable */, + DEFAULT_GET_FILESYSTEM_TIMEOUT_SECONDS, + false /* hidden */)) // The following two are about authentication. .putAll(KERBEROS_PROPERTY_ENTRIES) .putAll(AuthenticationConfig.AUTHENTICATION_PROPERTY_ENTRIES) diff --git a/core/src/main/java/org/apache/gravitino/lock/LockManager.java b/core/src/main/java/org/apache/gravitino/lock/LockManager.java index 222dee8daad..d52c858cc43 100644 --- a/core/src/main/java/org/apache/gravitino/lock/LockManager.java +++ b/core/src/main/java/org/apache/gravitino/lock/LockManager.java @@ -26,6 +26,7 @@ import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.Lists; import com.google.common.util.concurrent.ThreadFactoryBuilder; +import java.text.SimpleDateFormat; import java.util.List; import java.util.concurrent.ScheduledThreadPoolExecutor; import java.util.concurrent.TimeUnit; @@ -136,10 +137,14 @@ void checkDeadLock(TreeLockNode node) { // If the thread is holding the lock for more than 30 seconds, we will log it. if (System.currentTimeMillis() - ts > 30000) { LOG.warn( - "Dead lock detected for thread with identifier {} on node {}, threads that holding the node: {} ", + "Thread with identifier {} holds the lock node {} for more than 30s since {}, please " + + "check if some dead lock or thread hang like io-connection hangs", threadIdentifier, node, - node.getHoldingThreadTimestamp()); + // SimpleDateFormat is not thread-safe, so we should create a new instance for + // each time + new SimpleDateFormat("yyyy-MM-dd HH:mm:ss") + .format(node.getHoldingThreadTimestamp())); } }); } diff --git a/docs/hadoop-catalog.md b/docs/hadoop-catalog.md index 99e1dd7854e..cbdae846899 100644 --- a/docs/hadoop-catalog.md +++ b/docs/hadoop-catalog.md @@ -23,10 +23,11 @@ Hadoop 3. If there's any compatibility issue, please create an [issue](https://g Besides the [common catalog properties](./gravitino-server-config.md#apache-gravitino-catalog-properties-configuration), the Hadoop catalog has the following properties: -| Property Name | Description | Default Value | Required | Since Version | -|------------------------|----------------------------------------------------|---------------|----------|------------------| -| `location` | The storage location managed by Hadoop catalog. | (none) | No | 0.5.0 | -| `credential-providers` | The credential provider types, separated by comma. | (none) | No | 0.8.0-incubating | +| Property Name | Description | Default Value | Required | Since Version | +|--------------------------------|-----------------------------------------------------------------------------------------------------|---------------|----------|------------------| +| `location` | The storage location managed by Hadoop catalog. | (none) | No | 0.5.0 | +| `filesystem-conn-timeout-secs` | The timeout of getting the file system using Hadoop FileSystem client instance. Time unit: seconds. | 6 | No | 0.8.0-incubating | +| `credential-providers` | The credential provider types, separated by comma. | (none) | No | 0.8.0-incubating | Please refer to [Credential vending](./security/credential-vending.md) for more details about credential vending.