Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use bigger data set to address testStatsBasedRepartitionData flakiness #17431

Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@
import static io.trino.plugin.iceberg.IcebergFileFormat.ORC;
import static io.trino.plugin.iceberg.IcebergFileFormat.PARQUET;
import static io.trino.plugin.iceberg.IcebergQueryRunner.ICEBERG_CATALOG;
import static io.trino.plugin.iceberg.IcebergSessionProperties.COLLECT_EXTENDED_STATISTICS_ON_WRITE;
import static io.trino.plugin.iceberg.IcebergSessionProperties.EXTENDED_STATISTICS_ENABLED;
import static io.trino.plugin.iceberg.IcebergSplitManager.ICEBERG_DOMAIN_COMPACTION_THRESHOLD;
import static io.trino.plugin.iceberg.IcebergTestUtils.withSmallRowGroups;
Expand Down Expand Up @@ -4576,28 +4577,38 @@ public void testStatsBasedRepartitionDataOnInsert()

private void testStatsBasedRepartitionData(boolean ctas)
{
Session sessionRepartitionMany = Session.builder(getSession())
.setSystemProperty(SCALE_WRITERS, "false")
.setSystemProperty(USE_PREFERRED_WRITE_PARTITIONING, "false")
.build();
// Use DISTINCT to add data redistribution between source table and the writer. This makes it more likely that all writers get some data.
String sourceRelation = "(SELECT DISTINCT orderkey, custkey, orderstatus FROM tpch.tiny.orders)";
testRepartitionData(
getSession(),
sourceRelation,
ctas,
"'orderstatus'",
3);
// Test uses relatively small table (60K rows). When engine doesn't redistribute data for writes,
// occasionally a worker node doesn't get any data and fewer files get created.
assertEventually(new Duration(3, MINUTES), () -> {
String catalog = getSession().getCatalog().orElseThrow();
try (TestTable sourceTable = new TestTable(
sql -> assertQuerySucceeds(
Session.builder(getSession())
.setCatalogSessionProperty(catalog, COLLECT_EXTENDED_STATISTICS_ON_WRITE, "true")
.build(),
sql),
"temp_table_analyzed",
"AS SELECT orderkey, custkey, orderstatus FROM tpch.\"sf0.03\".orders")) {
Session sessionRepartitionMany = Session.builder(getSession())
.setSystemProperty(SCALE_WRITERS, "false")
.setSystemProperty(USE_PREFERRED_WRITE_PARTITIONING, "false")
.build();
// Use DISTINCT to add data redistribution between source table and the writer. This makes it more likely that all writers get some data.
String sourceRelation = "(SELECT DISTINCT orderkey, custkey, orderstatus FROM " + sourceTable.getName() + ")";
testRepartitionData(
sessionRepartitionMany,
getSession(),
sourceRelation,
ctas,
"'orderstatus'",
9);
});
3);
// Test uses relatively small table (45K rows). When engine doesn't redistribute data for writes,
// occasionally a worker node doesn't get any data and fewer files get created.
assertEventually(new Duration(3, MINUTES), () -> {
testRepartitionData(
sessionRepartitionMany,
sourceRelation,
ctas,
"'orderstatus'",
9);
});
}
}

private void testRepartitionData(Session session, String sourceRelation, boolean ctas, String partitioning, int expectedFiles)
Expand Down