Skip to content

Commit

Permalink
Use bigger data set to address testStatsBasedRepartitionData flakiness
Browse files Browse the repository at this point in the history
Previously the test used `tpch.tiny.orders`. The tiny schema is sf0.01.
Now it uses sf0.03. Since the test requires the source table to have stats,
a copy of the table is created.

Despite the increased data size, the test now runs faster locally. This
is because the test internally uses retries to cover flakiness, and now
it needs fewer retries to succeed (usually no retries).
  • Loading branch information
findepi committed May 11, 2023
1 parent baa6d11 commit 7ed9b2e
Showing 1 changed file with 29 additions and 18 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@
import static io.trino.plugin.iceberg.IcebergFileFormat.ORC;
import static io.trino.plugin.iceberg.IcebergFileFormat.PARQUET;
import static io.trino.plugin.iceberg.IcebergQueryRunner.ICEBERG_CATALOG;
import static io.trino.plugin.iceberg.IcebergSessionProperties.COLLECT_EXTENDED_STATISTICS_ON_WRITE;
import static io.trino.plugin.iceberg.IcebergSessionProperties.EXTENDED_STATISTICS_ENABLED;
import static io.trino.plugin.iceberg.IcebergSplitManager.ICEBERG_DOMAIN_COMPACTION_THRESHOLD;
import static io.trino.plugin.iceberg.IcebergTestUtils.withSmallRowGroups;
Expand Down Expand Up @@ -4576,28 +4577,38 @@ public void testStatsBasedRepartitionDataOnInsert()

private void testStatsBasedRepartitionData(boolean ctas)
{
Session sessionRepartitionMany = Session.builder(getSession())
.setSystemProperty(SCALE_WRITERS, "false")
.setSystemProperty(USE_PREFERRED_WRITE_PARTITIONING, "false")
.build();
// Use DISTINCT to add data redistribution between source table and the writer. This makes it more likely that all writers get some data.
String sourceRelation = "(SELECT DISTINCT orderkey, custkey, orderstatus FROM tpch.tiny.orders)";
testRepartitionData(
getSession(),
sourceRelation,
ctas,
"'orderstatus'",
3);
// Test uses relatively small table (15K rows). When engine doesn't redistribute data for writes,
// occasionally a worker node doesn't get any data and fewer files get created.
assertEventually(new Duration(3, MINUTES), () -> {
String catalog = getSession().getCatalog().orElseThrow();
try (TestTable sourceTable = new TestTable(
sql -> assertQuerySucceeds(
Session.builder(getSession())
.setCatalogSessionProperty(catalog, COLLECT_EXTENDED_STATISTICS_ON_WRITE, "true")
.build(),
sql),
"temp_table_analyzed",
"AS SELECT orderkey, custkey, orderstatus FROM tpch.\"sf0.03\".orders")) {
Session sessionRepartitionMany = Session.builder(getSession())
.setSystemProperty(SCALE_WRITERS, "false")
.setSystemProperty(USE_PREFERRED_WRITE_PARTITIONING, "false")
.build();
// Use DISTINCT to add data redistribution between source table and the writer. This makes it more likely that all writers get some data.
String sourceRelation = "(SELECT DISTINCT orderkey, custkey, orderstatus FROM " + sourceTable.getName() + ")";
testRepartitionData(
sessionRepartitionMany,
getSession(),
sourceRelation,
ctas,
"'orderstatus'",
9);
});
3);
// Test uses relatively small table (45K rows). When engine doesn't redistribute data for writes,
// occasionally a worker node doesn't get any data and fewer files get created.
assertEventually(new Duration(3, MINUTES), () -> {
testRepartitionData(
sessionRepartitionMany,
sourceRelation,
ctas,
"'orderstatus'",
9);
});
}
}

private void testRepartitionData(Session session, String sourceRelation, boolean ctas, String partitioning, int expectedFiles)
Expand Down

0 comments on commit 7ed9b2e

Please sign in to comment.