Add Iceberg tests with MinIO #10894

Closed
@@ -64,7 +64,7 @@ public HiveMinioDataLake(String bucketName, Map<String, String> hiveHadoopFilesToMount
this.hiveHadoop = closer.register(
HiveHadoop.builder()
.withFilesToMount(ImmutableMap.<String, String>builder()
.put("hive_s3_insert_overwrite/hive-core-site.xml", "/etc/hadoop/conf/core-site.xml")
.put("hive_minio_datalake/hive-core-site.xml", "/etc/hadoop/conf/core-site.xml")
.putAll(hiveHadoopFilesToMount)
.buildOrThrow())
.withImage(hiveHadoopImage)
@@ -20,4 +20,8 @@
<name>fs.s3a.path.style.access</name>
<value>true</value>
</property>
<property>
<name>fs.s3.impl</name>
<value>org.apache.hadoop.fs.s3a.S3AFileSystem</value>
</property>
</configuration>
12 changes: 12 additions & 0 deletions plugin/trino-iceberg/pom.xml
@@ -16,6 +16,13 @@
<properties>
<air.main.basedir>${project.parent.basedir}</air.main.basedir>
<dep.iceberg.version>0.13.1</dep.iceberg.version>
<!--
The project's default for air.test.parallel is 'methods'. By design, 'classes' makes TestNG run all tests from one class in a single thread.
As a side effect, it prevents TestNG from initializing multiple test instances upfront, which happens with 'methods'.
A potential downside is a long tail of single-threaded execution of one long test class.
This change works around a Maven Surefire issue, addressed in https://github.com/apache/maven-surefire/pull/407
-->
<air.test.parallel>classes</air.test.parallel>
</properties>
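For context, a minimal sketch of the Maven Surefire setup that air.test.parallel presumably maps to via the airbase parent pom (the plugin block below is illustrative only and not part of this change; the real values are wired by the parent build):

<plugin>
    <groupId>org.apache.maven.plugins</groupId>
    <artifactId>maven-surefire-plugin</artifactId>
    <configuration>
        <!-- 'classes' keeps all methods of a TestNG class on one thread,
             so only one test instance per class is active at a time -->
        <parallel>classes</parallel>
        <threadCount>2</threadCount>
    </configuration>
</plugin>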

<dependencies>
@@ -84,6 +91,11 @@
<artifactId>units</artifactId>
</dependency>

<dependency>
<groupId>com.amazonaws</groupId>
<artifactId>aws-java-sdk-s3</artifactId>
</dependency>

<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-core</artifactId>

Large diffs are not rendered by default.

@@ -0,0 +1,203 @@
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.trino.plugin.iceberg;

import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import io.trino.Session;
import io.trino.plugin.hive.containers.HiveMinioDataLake;
import io.trino.testing.QueryRunner;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;

import java.io.BufferedInputStream;
import java.util.ArrayList;
import java.util.List;

import static com.google.common.collect.ImmutableList.toImmutableList;
import static io.trino.plugin.hive.containers.HiveMinioDataLake.ACCESS_KEY;
import static io.trino.plugin.hive.containers.HiveMinioDataLake.SECRET_KEY;
import static io.trino.testing.sql.TestTable.randomTableSuffix;
import static java.lang.String.format;
import static org.assertj.core.api.Assertions.assertThat;

public abstract class BaseIcebergMinioConnectorTest
extends BaseIcebergConnectorTest
{
private static final List<String> TEST_REPARTITION_EXCLUSIONS = ImmutableList.of(
TEST_REPARTITION_COMPLEX,
TEST_REPARTITION_SAME_COLUMN_MULTIPLE_TIMES);

private final String bucketName;
private HiveMinioDataLake hiveMinioDataLake;

public BaseIcebergMinioConnectorTest(IcebergFileFormat format)
{
super(format);
this.bucketName = "test-iceberg-minio-integration-test-" + randomTableSuffix();
}

@Override
protected QueryRunner createQueryRunner()
throws Exception
{
this.hiveMinioDataLake = closeAfterClass(new HiveMinioDataLake(bucketName, ImmutableMap.of()));
this.hiveMinioDataLake.start();
return createQueryRunner(
ImmutableMap.<String, String>builder()
.put("iceberg.catalog.type", "HIVE_METASTORE")
.put("hive.metastore.uri", "thrift://" + hiveMinioDataLake.getHiveHadoop().getHiveMetastoreEndpoint())
.put("hive.s3.aws-access-key", ACCESS_KEY)
.put("hive.s3.aws-secret-key", SECRET_KEY)
.put("hive.s3.endpoint", "http://" + hiveMinioDataLake.getMinio().getMinioApiEndpoint())
.put("hive.s3.path-style-access", "true")
.put("hive.s3.streaming.part-size", "5MB")
.buildOrThrow());
}

@Override
protected SchemaInitializer.Builder createSchemaInitializer(String schemaName)
{
return super.createSchemaInitializer(schemaName)
.withSchemaProperties(ImmutableMap.of("location", "'s3://" + bucketName + "/" + schemaName + "'"));
}

@Override
protected String createSchemaSql(String schemaName)
{
return "CREATE SCHEMA IF NOT EXISTS " + schemaName + " WITH (location = 's3://" + bucketName + "/" + schemaName + "')";
Reviewer comment:
Having something like #9614 would make tests simpler. I wonder whether this means users' lives would be simpler too.
cc @alexjo2144 @jackye1995 @losipiuk

}

@Override
protected String createNewTableLocationBase(String schemaName)
{
return "s3://" + bucketName + "/" + schemaName;
}

@Override
protected Long calculateFileSystemFileSize(String filePath)
{
return this.hiveMinioDataLake.getS3Client()
.getObject(bucketName, filePath.replace("s3://" + bucketName + "/", ""))
.getObjectMetadata()
.getContentLength();
}

@Override
protected BufferedInputStream createDataFileStreamForFileSystem(String filePath)
{
return new BufferedInputStream(this.hiveMinioDataLake.getS3Client()
.getObject(bucketName, filePath.replace("s3://" + bucketName + "/", "")).getObjectContent());
}

@Override
protected List<String> getAllDataFilesFromTableDirectory(String tableName)
{
String tableLocation = getSession().getSchema().orElseThrow() + "/" + tableName + "/data";
return this.hiveMinioDataLake.getS3Client().listObjects(bucketName, tableLocation)
.getObjectSummaries()
.stream()
.map(object -> "s3://" + bucketName + "/" + object.getKey())
.filter(object -> !object.endsWith(".crc"))
.collect(toImmutableList());
}

/**
* DataProvider override to exclude 2 test cases from the generic tests:
* - {@link BaseIcebergConnectorTest#testRepartitionDataOnCtas(Session, String, int)}
* - {@link BaseIcebergConnectorTest#testRepartitionDataOnInsert(Session, String, int)}
* <p>
* 1. partitioning -> 'bucket(custkey, 4)', 'truncate(comment, 1)'
* The test environment shows higher memory usage, reaching over 1.5GB per test case.
* This leads to test flakiness when both test classes run in parallel and may consume
* over 3GB, which is the current hard limit for CI, and may end up with an OOM.
* <p>
* Limiting the dataset to 300 rows in a dedicated test method for MinIO solves this issue:
* - {@link BaseIcebergMinioConnectorTest#testRepartitionOnMinio()}
* <p>
* 2. partitioning -> 'truncate(comment, 1)', 'orderstatus', 'bucket(comment, 2)'
* The test environment causes HMS to take more time during the createTable operation.
* This leads to a read timeout, and the retried operation fails with TableAlreadyExists
* (HMS managed to complete the operation on its side). As this behaviour wasn't observed
* under normal circumstances, the test was adapted to avoid it.
* <p>
* Limiting the dataset to 300 rows in a dedicated test method for MinIO solves this issue:
* - {@link BaseIcebergMinioConnectorTest#testRepartitionOnMinio()}
*/
@Override
@DataProvider
public Object[][] repartitioningDataProvider()
{
Object[][] defaultTestsData = super.repartitioningDataProvider();
List<Object[]> minioTestData = new ArrayList<>();
for (Object[] testData : defaultTestsData) {
if (!TEST_REPARTITION_EXCLUSIONS.contains(testData[1])) {
minioTestData.add(testData);
}
}
return minioTestData.toArray(new Object[minioTestData.size()][]);
}

@Test
public void testRepartitionOnMinio()
{
String sourceQuery = "SELECT * FROM tpch.tiny.orders ORDER BY orderkey LIMIT 300";
// complex; would exceed 100 open writers limit in IcebergPageSink without write repartitioning
testRepartitionData(getSession(), sourceQuery, true, TEST_REPARTITION_COMPLEX, 84);
testRepartitionData(getSession(), sourceQuery, false, TEST_REPARTITION_COMPLEX, 84);
// with same column multiple times
testRepartitionData(getSession(), sourceQuery, true, TEST_REPARTITION_SAME_COLUMN_MULTIPLE_TIMES, 97);
testRepartitionData(getSession(), sourceQuery, false, TEST_REPARTITION_SAME_COLUMN_MULTIPLE_TIMES, 97);
}

@Test
@Override
public void testShowCreateTable()
{
assertThat((String) computeScalar("SHOW CREATE TABLE region"))
.isEqualTo("" +
"CREATE TABLE iceberg." + schemaName + ".region (\n" +
" regionkey bigint,\n" +
" name varchar,\n" +
" comment varchar\n" +
")\n" +
"WITH (\n" +
" format = '" + format + "',\n" +
" location = 's3://" + bucketName + "/" + schemaName + "/region'\n" +
")");
}

@Test
@Override
public void testRenameSchema()
{
// Overridden because the error message differs from the one in the parent test method
String schemaName = getSession().getSchema().orElseThrow();
assertQueryFails(
format("ALTER SCHEMA %s RENAME TO %s", schemaName, schemaName + randomTableSuffix()),
"Hive metastore does not support renaming schemas");
}

@Test
@Override
public void testShowCreateSchema()
{
assertThat(computeActual("SHOW CREATE SCHEMA " + schemaName).getOnlyValue().toString())
.matches("CREATE SCHEMA iceberg." + schemaName + "\n" +
"AUTHORIZATION USER user\n" +
"WITH \\(\n" +
"\\s+location = 's3://" + bucketName + "/" + schemaName + "'\n" +
"\\)");
}
}
@@ -24,13 +24,11 @@
import java.io.File;
import java.nio.file.Path;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;

import static com.google.common.base.Preconditions.checkState;
import static io.airlift.testing.Closeables.closeAllSuppress;
import static io.trino.plugin.tpch.TpchMetadata.TINY_SCHEMA_NAME;
import static io.trino.testing.QueryAssertions.copyTpchTables;
import static io.trino.testing.TestingSession.testSessionBuilder;
import static java.util.Objects.requireNonNull;

@@ -42,13 +40,20 @@ public final class IcebergQueryRunner

private IcebergQueryRunner() {}

public static DistributedQueryRunner createIcebergQueryRunner()
public static DistributedQueryRunner createIcebergQueryRunner(TpchTable<?>... tables)
throws Exception
{
return createIcebergQueryRunner(
Map.of(),
Map.of(),
List.of());
ImmutableList.copyOf(tables));
}

public static DistributedQueryRunner createIcebergQueryRunner(Iterable<TpchTable<?>> tables)
throws Exception
{
return createIcebergQueryRunner(
ImmutableMap.of(),
ImmutableMap.of(),
tables);
}

public static DistributedQueryRunner createIcebergQueryRunner(
@@ -70,11 +75,27 @@ public static DistributedQueryRunner createIcebergQueryRunner(
Iterable<TpchTable<?>> tables,
Optional<File> metastoreDirectory)
throws Exception
{
return createIcebergQueryRunner(
extraProperties,
connectorProperties,
SchemaInitializer.builder()
.withClonedTpchTables(tables)
.build(),
metastoreDirectory);
}

public static DistributedQueryRunner createIcebergQueryRunner(
Map<String, String> extraProperties,
Map<String, String> connectorProperties,
SchemaInitializer schemaInitializer,
Optional<File> metastoreDirectory)
throws Exception
{
Builder builder = builder()
.setExtraProperties(extraProperties)
.setIcebergProperties(connectorProperties)
.setInitialTables(tables);
.setSchemaInitializer(schemaInitializer);

metastoreDirectory.ifPresent(builder::setMetastoreDirectory);
return builder.build();
@@ -90,13 +111,12 @@ public static class Builder
{
private Optional<File> metastoreDirectory = Optional.empty();
private ImmutableMap.Builder<String, String> icebergProperties = ImmutableMap.builder();
private List<TpchTable<?>> initialTables = ImmutableList.of();
private Optional<SchemaInitializer> schemaInitializer = Optional.empty();

protected Builder()
{
super(testSessionBuilder()
.setCatalog(ICEBERG_CATALOG)
.setSchema("tpch")
.build());
}

@@ -121,7 +141,15 @@ public Builder addIcebergProperty(String key, String value)

public Builder setInitialTables(Iterable<TpchTable<?>> initialTables)
{
this.initialTables = ImmutableList.copyOf(requireNonNull(initialTables, "initialTables is null"));
setSchemaInitializer(SchemaInitializer.builder().withClonedTpchTables(initialTables).build());
return self();
}

public Builder setSchemaInitializer(SchemaInitializer schemaInitializer)
{
checkState(this.schemaInitializer.isEmpty(), "schemaInitializer is already set");
this.schemaInitializer = Optional.of(requireNonNull(schemaInitializer, "schemaInitializer is null"));
amendSession(sessionBuilder -> sessionBuilder.setSchema(schemaInitializer.getSchemaName()));
return self();
}

@@ -134,19 +162,16 @@ public DistributedQueryRunner build()
queryRunner.installPlugin(new TpchPlugin());
queryRunner.createCatalog("tpch", "tpch");

Path dataDir = metastoreDirectory.map(File::toPath).orElseGet(() -> queryRunner.getCoordinator().getBaseDataDir().resolve("iceberg_data"));

queryRunner.installPlugin(new IcebergPlugin());
Map<String, String> icebergProperties = new HashMap<>();
icebergProperties.put("iceberg.catalog.type", "TESTING_FILE_METASTORE");
icebergProperties.put("hive.metastore.catalog.dir", dataDir.toString());
icebergProperties.putAll(this.icebergProperties.buildOrThrow());
Map<String, String> icebergProperties = new HashMap<>(this.icebergProperties.buildOrThrow());
if (!icebergProperties.containsKey("iceberg.catalog.type")) {
Path dataDir = metastoreDirectory.map(File::toPath).orElseGet(() -> queryRunner.getCoordinator().getBaseDataDir().resolve("iceberg_data"));
icebergProperties.put("iceberg.catalog.type", "TESTING_FILE_METASTORE");
icebergProperties.put("hive.metastore.catalog.dir", dataDir.toString());
}

queryRunner.createCatalog(ICEBERG_CATALOG, "iceberg", icebergProperties);

queryRunner.execute("CREATE SCHEMA tpch");

copyTpchTables(queryRunner, "tpch", TINY_SCHEMA_NAME, queryRunner.getDefaultSession(), initialTables);
schemaInitializer.orElse(SchemaInitializer.builder().build()).accept(queryRunner);

return queryRunner;
}