-
Notifications
You must be signed in to change notification settings - Fork 3.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Improve performance of Iceberg remove_orphan_files #14383
Changes from all commits
9e86fa3
64c86c2
ee36f85
0323ebe
48d7095
9acbd5e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5260,11 +5260,14 @@ public void testRemoveOrphanFiles() | |
Session sessionWithShortRetentionUnlocked = prepareCleanUpSession(); | ||
assertUpdate("CREATE TABLE " + tableName + " (key varchar, value integer)"); | ||
assertUpdate("INSERT INTO " + tableName + " VALUES ('one', 1)", 1); | ||
assertUpdate("INSERT INTO " + tableName + " VALUES ('two', 2), ('three', 3)", 2); | ||
assertUpdate("DELETE FROM " + tableName + " WHERE key = 'two'", 1); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why change existing test? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It didn't exercise delete files, and also didn't validate that we didn't corrupt the table. Seemed like reasonable additions There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Agreed. Let's have it as a separate commit, so that we can capture the rationale in the commit message. otherwise it looks like a left-over from local testing |
||
String location = getTableLocation(tableName); | ||
Path orphanFile = Files.createFile(Path.of(getIcebergTableDataPath(location).toString(), "invalidData." + format)); | ||
List<String> initialDataFiles = getAllDataFilesFromTableDirectory(tableName); | ||
|
||
assertQuerySucceeds(sessionWithShortRetentionUnlocked, "ALTER TABLE " + tableName + " EXECUTE REMOVE_ORPHAN_FILES (retention_threshold => '0s')"); | ||
assertQuery("SELECT * FROM " + tableName, "VALUES ('one', 1), ('three', 3)"); | ||
|
||
List<String> updatedDataFiles = getAllDataFilesFromTableDirectory(tableName); | ||
assertThat(updatedDataFiles.size()).isLessThan(initialDataFiles.size()); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -49,6 +49,7 @@ | |
import static io.trino.plugin.tpch.TpchMetadata.TINY_SCHEMA_NAME; | ||
import static io.trino.testing.QueryAssertions.copyTpchTables; | ||
import static io.trino.testing.TestingSession.testSessionBuilder; | ||
import static io.trino.testing.sql.TestTable.randomTableSuffix; | ||
import static java.lang.String.format; | ||
import static java.util.Collections.nCopies; | ||
import static java.util.Objects.requireNonNull; | ||
|
@@ -384,10 +385,41 @@ public void testPredicateWithVarcharCastToDate() | |
assertUpdate("DROP TABLE test_varchar_as_date_predicate"); | ||
} | ||
|
||
@Test | ||
public void testRemoveOrphanFiles() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. are you planning on adding a test for snapshot expiration as well? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I can, probably in a separate PR There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
{ | ||
String tableName = "test_remove_orphan_files_" + randomTableSuffix(); | ||
Session sessionWithShortRetentionUnlocked = Session.builder(getSession()) | ||
.setCatalogSessionProperty("iceberg", "remove_orphan_files_min_retention", "0s") | ||
.build(); | ||
assertUpdate("CREATE TABLE " + tableName + " (key varchar, value integer)"); | ||
alexjo2144 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
assertUpdate("INSERT INTO " + tableName + " VALUES ('one', 1)", 1); | ||
assertUpdate("INSERT INTO " + tableName + " VALUES ('two', 2), ('three', 3)", 2); | ||
assertUpdate("DELETE FROM " + tableName + " WHERE key = 'two'", 1); | ||
|
||
assertFileSystemAccesses( | ||
sessionWithShortRetentionUnlocked, | ||
"ALTER TABLE " + tableName + " EXECUTE REMOVE_ORPHAN_FILES (retention_threshold => '0s')", | ||
ImmutableMultiset.builder() | ||
.add(new FileOperation(METADATA_JSON, INPUT_FILE_NEW_STREAM)) | ||
.addCopies(new FileOperation(SNAPSHOT, INPUT_FILE_GET_LENGTH), 4) | ||
.addCopies(new FileOperation(SNAPSHOT, INPUT_FILE_NEW_STREAM), 4) | ||
.addCopies(new FileOperation(MANIFEST, INPUT_FILE_GET_LENGTH), 6) | ||
.addCopies(new FileOperation(MANIFEST, INPUT_FILE_NEW_STREAM), 6) | ||
Comment on lines
+407
to
+408
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Looks like we're still reading each manifest twice, I'll take a look at that tomorrow. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think that first every manifest is read here:
and then again here:
but this is still huge improvement as previously these were the numbers:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Looks like it's part of how the ManifestReader works. The factory |
||
.build()); | ||
|
||
assertUpdate("DROP TABLE " + tableName); | ||
} | ||
|
||
private void assertFileSystemAccesses(@Language("SQL") String query, Multiset<Object> expectedAccesses) | ||
{ | ||
assertFileSystemAccesses(TEST_SESSION, query, expectedAccesses); | ||
} | ||
|
||
private void assertFileSystemAccesses(Session session, @Language("SQL") String query, Multiset<Object> expectedAccesses) | ||
{ | ||
resetCounts(); | ||
getDistributedQueryRunner().executeWithQueryId(TEST_SESSION, query); | ||
getDistributedQueryRunner().executeWithQueryId(session, query); | ||
assertThat(ImmutableMultiset.<Object>copyOf(getOperations())).containsExactlyInAnyOrderElementsOf(expectedAccesses); | ||
} | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This used to do
do we not need to populate
validMetadataFileNames
?is it because
metadataFileLocations(table, false).stream()
will do this for us?if so, maybe we don't need
validMetadataFileNames.add(fileName(snapshot.manifestListLocation()));
either?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That's what I get for thinking, I don't need to rerun the tests it was just a small refactor 😦