From f0ef639b6377b15ad3221514e1a6764108b8bbac Mon Sep 17 00:00:00 2001 From: Alex Jo Date: Fri, 7 Jul 2023 11:36:36 -0400 Subject: [PATCH 1/2] Reduce Hive file system listing Reduce the number of times Hive needs to call the file system listing API when using common file systems. Additionally, avoid unnecessary directory-existence checks when the file listing returns a non-empty result. --- .../trino/filesystem/hdfs/HdfsFileSystem.java | 13 +++++++++++++ .../hive/BackgroundHiveSplitLoader.java | 19 ++++++++++--------- 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/lib/trino-hdfs/src/main/java/io/trino/filesystem/hdfs/HdfsFileSystem.java b/lib/trino-hdfs/src/main/java/io/trino/filesystem/hdfs/HdfsFileSystem.java index 163ab8979634..f9a6676de55a 100644 --- a/lib/trino-hdfs/src/main/java/io/trino/filesystem/hdfs/HdfsFileSystem.java +++ b/lib/trino-hdfs/src/main/java/io/trino/filesystem/hdfs/HdfsFileSystem.java @@ -13,6 +13,7 @@ */ package io.trino.filesystem.hdfs; +import com.google.common.collect.ImmutableMap; import io.airlift.stats.TimeStat; import io.trino.filesystem.FileIterator; import io.trino.filesystem.Location; @@ -48,6 +49,13 @@ class HdfsFileSystem implements TrinoFileSystem { + private static final Map KNOWN_HIERARCHICAL_FILESYSTEMS = ImmutableMap.builder() + .put("s3", false) + .put("s3a", false) + .put("s3n", false) + .put("hdfs", true) + .buildOrThrow(); + private final HdfsEnvironment environment; private final HdfsContext context; private final TrinoHdfsFileSystemStats stats; @@ -224,6 +232,11 @@ public Optional directoryExists(Location location) private boolean hierarchical(FileSystem fileSystem, Location rootLocation) { + Boolean knownResult = KNOWN_HIERARCHICAL_FILESYSTEMS.get(fileSystem.getScheme()); + if (knownResult != null) { + return knownResult; + } + Boolean cachedResult = hierarchicalFileSystemCache.get(fileSystem); if (cachedResult != null) { return cachedResult; diff --git 
a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/BackgroundHiveSplitLoader.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/BackgroundHiveSplitLoader.java index 1da5e9d99c15..99f56a74778d 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/BackgroundHiveSplitLoader.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/BackgroundHiveSplitLoader.java @@ -563,12 +563,12 @@ private ListenableFuture loadPartition(HivePartitionMetadata partition) private List listBucketFiles(TrinoFileSystem fs, Location location, String partitionName) { - if (!ignoreAbsentPartitions) { - checkPartitionLocationExists(fs, location); - } - try { - return ImmutableList.copyOf(new HiveFileIterator(table, location, fs, directoryLister, hdfsNamenodeStats, FAIL)); + HiveFileIterator fileIterator = new HiveFileIterator(table, location, fs, directoryLister, hdfsNamenodeStats, FAIL); + if (!fileIterator.hasNext() && !ignoreAbsentPartitions) { + checkPartitionLocationExists(fs, location); + } + return ImmutableList.copyOf(fileIterator); } catch (HiveFileIterator.NestedDirectoryNotAllowedException e) { // Fail here to be on the safe side. 
This seems to be the same as what Hive does @@ -651,9 +651,11 @@ Optional> buildManifestFileIterator( TrinoFileSystem trinoFileSystem = fileSystemFactory.create(session); Location location = Location.of(parent.toString()); - checkPartitionLocationExists(trinoFileSystem, location); Map fileStatuses = new HashMap<>(); HiveFileIterator fileStatusIterator = new HiveFileIterator(table, location, trinoFileSystem, directoryLister, hdfsNamenodeStats, IGNORED); + if (!fileStatusIterator.hasNext()) { + checkPartitionLocationExists(trinoFileSystem, location); + } fileStatusIterator.forEachRemaining(status -> fileStatuses.put(getPathWithoutSchemeAndAuthority(new Path(status.getPath())), status)); List locatedFileStatuses = new ArrayList<>(); @@ -814,11 +816,10 @@ private static boolean shouldUseFileSplitsFromInputFormat(InputFormat inpu private Iterator createInternalHiveSplitIterator(TrinoFileSystem fileSystem, Location location, InternalHiveSplitFactory splitFactory, boolean splittable, Optional acidInfo) { - if (!ignoreAbsentPartitions) { + Iterator iterator = new HiveFileIterator(table, location, fileSystem, directoryLister, hdfsNamenodeStats, recursiveDirWalkerEnabled ? RECURSE : IGNORED); + if (!iterator.hasNext() && !ignoreAbsentPartitions) { checkPartitionLocationExists(fileSystem, location); } - - Iterator iterator = new HiveFileIterator(table, location, fileSystem, directoryLister, hdfsNamenodeStats, recursiveDirWalkerEnabled ? RECURSE : IGNORED); return createInternalHiveSplitIterator(splitFactory, splittable, acidInfo, Streams.stream(iterator)); } From f1bee053fd7342fda1c75f4324f6a12a2b48357d Mon Sep 17 00:00:00 2001 From: Alex Jo Date: Fri, 14 Jul 2023 09:00:32 -0400 Subject: [PATCH 2/2] empty