Skip to content

Commit

Permalink
[#5361] improvment(hadoop-catalog): Introduce timeout mechanism to ge…
Browse files Browse the repository at this point in the history
…t Hadoop File System. (#6221)

### What changes were proposed in this pull request?

Introduce a timeout mechanism when getting a Hadoop FileSystem instance.

### Why are the changes needed?


Cloud filesystem like S3 and OSS(10 minutes) has a very long connection
and can't be tune by configuration, this will cause deadlock as it will
hold the tree lock for a long time

Fix: #5361 
Fix: #6156

### Does this PR introduce _any_ user-facing change?

N/A.

### How was this patch tested?

Existing test.

Co-authored-by: Qi Yu <[email protected]>
  • Loading branch information
github-actions[bot] and yuqi1129 authored Jan 14, 2025
1 parent 016e415 commit 75d7d38
Show file tree
Hide file tree
Showing 6 changed files with 59 additions and 7 deletions.
1 change: 1 addition & 0 deletions LICENSE.bin
Original file line number Diff line number Diff line change
Expand Up @@ -374,6 +374,7 @@
Apache Arrow
Rome
Jettison
Awaitility

This product bundles various third-party components also under the
Apache Software Foundation License 1.1
Expand Down
1 change: 1 addition & 0 deletions catalogs/catalog-hadoop/build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ dependencies {
exclude("org.fusesource.leveldbjni")
}
implementation(libs.slf4j.api)
implementation(libs.awaitility)

compileOnly(libs.guava)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicReference;
import org.apache.commons.lang3.StringUtils;
import org.apache.gravitino.Catalog;
import org.apache.gravitino.Entity;
Expand Down Expand Up @@ -71,6 +73,8 @@
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.awaitility.Awaitility;
import org.awaitility.core.ConditionTimeoutException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

Expand Down Expand Up @@ -755,6 +759,35 @@ FileSystem getFileSystem(Path path, Map<String, String> config) throws IOExcepti
scheme, path, fileSystemProvidersMap.keySet(), fileSystemProvidersMap.values()));
}

return provider.getFileSystem(path, config);
int timeoutSeconds =
(int)
propertiesMetadata
.catalogPropertiesMetadata()
.getOrDefault(
config, HadoopCatalogPropertiesMetadata.FILESYSTEM_CONNECTION_TIMEOUT_SECONDS);
try {
AtomicReference<FileSystem> fileSystem = new AtomicReference<>();
Awaitility.await()
.atMost(timeoutSeconds, TimeUnit.SECONDS)
.until(
() -> {
fileSystem.set(provider.getFileSystem(path, config));
return true;
});
return fileSystem.get();
} catch (ConditionTimeoutException e) {
throw new IOException(
String.format(
"Failed to get FileSystem for path: %s, scheme: %s, provider: %s, config: %s within %s "
+ "seconds, please check the configuration or increase the "
+ "file system connection timeout time by setting catalog property: %s",
path,
scheme,
provider,
config,
timeoutSeconds,
HadoopCatalogPropertiesMetadata.FILESYSTEM_CONNECTION_TIMEOUT_SECONDS),
e);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@ public class HadoopCatalogPropertiesMetadata extends BaseCatalogPropertiesMetada
*/
public static final String DEFAULT_FS_PROVIDER = "default-filesystem-provider";

static final String FILESYSTEM_CONNECTION_TIMEOUT_SECONDS = "filesystem-conn-timeout-secs";
static final int DEFAULT_GET_FILESYSTEM_TIMEOUT_SECONDS = 6;

public static final String BUILTIN_LOCAL_FS_PROVIDER = "builtin-local";
public static final String BUILTIN_HDFS_FS_PROVIDER = "builtin-hdfs";

Expand Down Expand Up @@ -82,6 +85,14 @@ public class HadoopCatalogPropertiesMetadata extends BaseCatalogPropertiesMetada
false /* immutable */,
BUILTIN_LOCAL_FS_PROVIDER, // please see LocalFileSystemProvider#name()
false /* hidden */))
.put(
FILESYSTEM_CONNECTION_TIMEOUT_SECONDS,
PropertyEntry.integerOptionalPropertyEntry(
FILESYSTEM_CONNECTION_TIMEOUT_SECONDS,
"Timeout to wait for to create the Hadoop file system client instance.",
false /* immutable */,
DEFAULT_GET_FILESYSTEM_TIMEOUT_SECONDS,
false /* hidden */))
// The following two are about authentication.
.putAll(KERBEROS_PROPERTY_ENTRIES)
.putAll(AuthenticationConfig.AUTHENTICATION_PROPERTY_ENTRIES)
Expand Down
9 changes: 7 additions & 2 deletions core/src/main/java/org/apache/gravitino/lock/LockManager.java
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.Lists;
import com.google.common.util.concurrent.ThreadFactoryBuilder;
import java.text.SimpleDateFormat;
import java.util.List;
import java.util.concurrent.ScheduledThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
Expand Down Expand Up @@ -136,10 +137,14 @@ void checkDeadLock(TreeLockNode node) {
// If the thread is holding the lock for more than 30 seconds, we will log it.
if (System.currentTimeMillis() - ts > 30000) {
LOG.warn(
"Dead lock detected for thread with identifier {} on node {}, threads that holding the node: {} ",
"Thread with identifier {} holds the lock node {} for more than 30s since {}, please "
+ "check if some dead lock or thread hang like io-connection hangs",
threadIdentifier,
node,
node.getHoldingThreadTimestamp());
// SimpleDateFormat is not thread-safe, so we should create a new instance for
// each time
new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
.format(node.getHoldingThreadTimestamp()));
}
});
}
Expand Down
9 changes: 5 additions & 4 deletions docs/hadoop-catalog.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,11 @@ Hadoop 3. If there's any compatibility issue, please create an [issue](https://g

Besides the [common catalog properties](./gravitino-server-config.md#apache-gravitino-catalog-properties-configuration), the Hadoop catalog has the following properties:

| Property Name | Description | Default Value | Required | Since Version |
|------------------------|----------------------------------------------------|---------------|----------|------------------|
| `location` | The storage location managed by Hadoop catalog. | (none) | No | 0.5.0 |
| `credential-providers` | The credential provider types, separated by comma. | (none) | No | 0.8.0-incubating |
| Property Name | Description | Default Value | Required | Since Version |
|--------------------------------|-----------------------------------------------------------------------------------------------------|---------------|----------|------------------|
| `location` | The storage location managed by Hadoop catalog. | (none) | No | 0.5.0 |
| `filesystem-conn-timeout-secs` | The timeout of getting the file system using Hadoop FileSystem client instance. Time unit: seconds. | 6 | No | 0.8.0-incubating |
| `credential-providers` | The credential provider types, separated by comma. | (none) | No | 0.8.0-incubating |

Please refer to [Credential vending](./security/credential-vending.md) for more details about credential vending.

Expand Down

0 comments on commit 75d7d38

Please sign in to comment.