Add CI infrastructure for trino-redshift
Use an ephemeral, throw-away Redshift cluster for running integration tests
of the `trino-redshift` connector.
Once the tests have run, the testing Redshift cluster is reclaimed.

The Redshift cluster is publicly accessible so that it can be reached
from the general-purpose GitHub runners.
findinpath authored and findepi committed Mar 9, 2023
1 parent 3106f8d commit fe5a925
Showing 8 changed files with 291 additions and 11 deletions.
34 changes: 34 additions & 0 deletions .github/bin/redshift/delete-aws-redshift.sh
@@ -0,0 +1,34 @@
#!/usr/bin/env bash

set -uo pipefail

REDSHIFT_SCRIPTS_DIR="${BASH_SOURCE%/*}"

if [[ ! -f "${REDSHIFT_SCRIPTS_DIR}/.cluster-identifier" ]]; then
    echo "Missing file ${REDSHIFT_SCRIPTS_DIR}/.cluster-identifier"
    exit 0
fi

REDSHIFT_CLUSTER_IDENTIFIER=$(cat "${REDSHIFT_SCRIPTS_DIR}/.cluster-identifier")

echo "Deleting Amazon Redshift cluster $REDSHIFT_CLUSTER_IDENTIFIER"
REDSHIFT_DELETE_CLUSTER_OUTPUT=$(aws redshift delete-cluster --cluster-identifier $REDSHIFT_CLUSTER_IDENTIFIER --skip-final-cluster-snapshot)

if [ -z "${REDSHIFT_DELETE_CLUSTER_OUTPUT}" ]; then
echo ${REDSHIFT_DELETE_CLUSTER_OUTPUT}
# Don't fail the build because of cleanup issues
exit 0
fi

echo "Waiting for the Amazon Redshift cluster $REDSHIFT_CLUSTER_IDENTIFIER to be deleted"
aws redshift wait cluster-deleted \
--cluster-identifier $REDSHIFT_CLUSTER_IDENTIFIER
if [ "$?" -ne 0 ]
then
echo "Amazon Redshift cluster $REDSHIFT_CLUSTER_IDENTIFIER deletion has timed out"
else
echo "Amazon Redshift cluster $REDSHIFT_CLUSTER_IDENTIFIER has been deleted"
fi

rm -f "${REDSHIFT_SCRIPTS_DIR}/.cluster-identifier"
exit 0
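
The cleanup script can also be run by hand against a leftover cluster. A minimal sketch, assuming the AWS CLI is configured with the CI credentials; the cluster identifier below is a placeholder:

```
# Hypothetical local cleanup run; requires credentials with redshift:DeleteCluster.
export AWS_REGION=us-east-2
echo "trino-redshift-ci-cluster-0123456789abcdef" > .github/bin/redshift/.cluster-identifier
.github/bin/redshift/delete-aws-redshift.sh
```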
54 changes: 54 additions & 0 deletions .github/bin/redshift/setup-aws-redshift.sh
@@ -0,0 +1,54 @@
#!/usr/bin/env bash

set -euo pipefail

REDSHIFT_SCRIPTS_DIR="${BASH_SOURCE%/*}"

# Redshift requires passwords to contain at least one digit, one lowercase letter and one uppercase letter.
# Since there is no guarantee that the openssl output satisfies this password policy,
# explicitly append the string 'Red1!' to the password.
REDSHIFT_PASSWORD="$(openssl rand -base64 16 | tr -dc 'a-zA-Z0-9')Red1!"

REDSHIFT_CLUSTER_IDENTIFIER=trino-redshift-ci-cluster-$(openssl rand -hex 8)

REDSHIFT_CLUSTER_TTL=$(date -u -d "+2 hours" +"%Y-%m-%dT%H:%M:%SZ" 2>/dev/null || date -u -v "+2H" +"%Y-%m-%dT%H:%M:%SZ")

echo "Creating the Amazon Redshift cluster ${REDSHIFT_CLUSTER_IDENTIFIER} on the region ${AWS_REGION}."
REDSHIFT_CREATE_CLUSTER_OUTPUT=$(aws redshift create-cluster \
    --db-name testdb \
    --region "${AWS_REGION}" \
    --node-type dc2.large \
    --number-of-nodes 1 \
    --master-username admin \
    --master-user-password "${REDSHIFT_PASSWORD}" \
    --cluster-identifier "${REDSHIFT_CLUSTER_IDENTIFIER}" \
    --cluster-subnet-group-name "${REDSHIFT_SUBNET_GROUP_NAME}" \
    --cluster-type single-node \
    --vpc-security-group-ids "${REDSHIFT_VPC_SECURITY_GROUP_IDS}" \
    --iam-roles "${REDSHIFT_IAM_ROLES}" \
    --automated-snapshot-retention-period 0 \
    --publicly-accessible \
    --tags Key=cloud,Value=aws Key=environment,Value=test Key=project,Value=trino-redshift Key=ttl,Value="${REDSHIFT_CLUSTER_TTL}")

if [ -z "${REDSHIFT_CREATE_CLUSTER_OUTPUT}" ]; then
# Only show errors
echo ${REDSHIFT_CREATE_CLUSTER_OUTPUT}
exit 1
fi

echo "${REDSHIFT_CLUSTER_IDENTIFIER}" > "${REDSHIFT_SCRIPTS_DIR}/.cluster-identifier"
echo "Waiting for the Amazon Redshift cluster ${REDSHIFT_CLUSTER_IDENTIFIER} in the region ${AWS_REGION} to become available."

# Wait for the cluster to become available
aws redshift wait cluster-available \
    --cluster-identifier "${REDSHIFT_CLUSTER_IDENTIFIER}"

echo "The Amazon Redshift cluster ${REDSHIFT_CLUSTER_IDENTIFIER} on the region ${AWS_REGION} is available for queries."

REDSHIFT_CLUSTER_DESCRIPTION=$(aws redshift describe-clusters --cluster-identifier "${REDSHIFT_CLUSTER_IDENTIFIER}")

export REDSHIFT_ENDPOINT=$(echo "${REDSHIFT_CLUSTER_DESCRIPTION}" | jq -r '.Clusters[0].Endpoint.Address')
export REDSHIFT_PORT=$(echo "${REDSHIFT_CLUSTER_DESCRIPTION}" | jq -r '.Clusters[0].Endpoint.Port')
export REDSHIFT_CLUSTER_DATABASE_NAME=$(echo "${REDSHIFT_CLUSTER_DESCRIPTION}" | jq -r '.Clusters[0].DBName')
export REDSHIFT_USER=$(echo "${REDSHIFT_CLUSTER_DESCRIPTION}" | jq -r '.Clusters[0].MasterUsername')
export REDSHIFT_PASSWORD
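
Because the script exports the connection variables instead of printing them, it is meant to be sourced. A sketch of checking the fresh cluster locally, assuming `psql` is installed (Redshift speaks the PostgreSQL wire protocol):

```
# Hypothetical local smoke check of the freshly created cluster.
source .github/bin/redshift/setup-aws-redshift.sh
PGPASSWORD="${REDSHIFT_PASSWORD}" psql \
    --host "${REDSHIFT_ENDPOINT}" \
    --port "${REDSHIFT_PORT}" \
    --username "${REDSHIFT_USER}" \
    --dbname "${REDSHIFT_CLUSTER_DATABASE_NAME}" \
    --command "SELECT 1"
```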
34 changes: 34 additions & 0 deletions .github/workflows/ci.yml
@@ -538,6 +538,7 @@ jobs:
- { modules: plugin/trino-postgresql }
- { modules: plugin/trino-raptor-legacy }
- { modules: plugin/trino-redis }
- { modules: plugin/trino-redshift, profile: cloud-tests }
- { modules: plugin/trino-singlestore }
- { modules: plugin/trino-sqlserver }
- { modules: testing/trino-faulttolerant-tests, profile: default }
@@ -597,6 +598,7 @@
&& ! (contains(matrix.modules, 'trino-delta-lake') && contains(matrix.profile, 'gcs-tests'))
&& ! (contains(matrix.modules, 'trino-iceberg') && contains(matrix.profile, 'cloud-tests'))
&& ! (contains(matrix.modules, 'trino-bigquery') && contains(matrix.profile, 'cloud-tests-arrow'))
&& ! (contains(matrix.modules, 'trino-redshift') && contains(matrix.profile, 'cloud-tests'))
run: $MAVEN test ${MAVEN_TEST} -pl ${{ matrix.modules }} ${{ matrix.profile != '' && format('-P {0}', matrix.profile) || '' }}
# Additional tests for selected modules
- name: Cloud Delta Lake Tests
@@ -678,6 +680,38 @@ jobs:
-Dhive.hadoop2.azure-abfs-container="${ABFS_CONTAINER}" \
-Dhive.hadoop2.azure-abfs-account="${ABFS_ACCOUNT}" \
-Dhive.hadoop2.azure-abfs-access-key="${ABFS_ACCESS_KEY}"
- name: Cloud Redshift Tests
env:
AWS_REGION: ${{ vars.REDSHIFT_AWS_REGION }}
AWS_ACCESS_KEY_ID: ${{ secrets.REDSHIFT_AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.REDSHIFT_AWS_SECRET_ACCESS_KEY }}
REDSHIFT_SUBNET_GROUP_NAME: ${{ vars.REDSHIFT_SUBNET_GROUP_NAME }}
REDSHIFT_IAM_ROLES: ${{ vars.REDSHIFT_IAM_ROLES }}
REDSHIFT_VPC_SECURITY_GROUP_IDS: ${{ vars.REDSHIFT_VPC_SECURITY_GROUP_IDS }}
REDSHIFT_S3_TPCH_TABLES_ROOT: ${{ vars.REDSHIFT_S3_TPCH_TABLES_ROOT }}
if: >-
contains(matrix.modules, 'trino-redshift') && contains(matrix.profile, 'cloud-tests') &&
(env.AWS_ACCESS_KEY_ID != '' || env.REDSHIFT_SUBNET_GROUP_NAME != '')
run: |
source .github/bin/redshift/setup-aws-redshift.sh
$MAVEN test ${MAVEN_TEST} -pl :trino-redshift ${{ format('-P {0}', matrix.profile) }} \
-Dtest.redshift.jdbc.user="${REDSHIFT_USER}" \
-Dtest.redshift.jdbc.password="${REDSHIFT_PASSWORD}" \
-Dtest.redshift.jdbc.endpoint="${REDSHIFT_ENDPOINT}:${REDSHIFT_PORT}/" \
-Dtest.redshift.s3.tpch.tables.root="${REDSHIFT_S3_TPCH_TABLES_ROOT}" \
-Dtest.redshift.iam.role="${REDSHIFT_IAM_ROLES}" \
-Dtest.redshift.aws.region="${AWS_REGION}" \
-Dtest.redshift.aws.access-key="${AWS_ACCESS_KEY_ID}" \
-Dtest.redshift.aws.secret-key="${AWS_SECRET_ACCESS_KEY}"
- name: Cleanup ephemeral Redshift Cluster
env:
AWS_REGION: ${{ vars.REDSHIFT_AWS_REGION }}
AWS_ACCESS_KEY_ID: ${{ secrets.REDSHIFT_AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.REDSHIFT_AWS_SECRET_ACCESS_KEY }}
# Cancelled workflows may have left the ephemeral cluster running
if: always()
run: .github/bin/redshift/delete-aws-redshift.sh
- name: Sanitize artifact name
if: always()
run: |
1 change: 1 addition & 0 deletions .gitignore
@@ -30,3 +30,4 @@ product-test-reports
/impacted-features.log
.github/test-matrix.yaml
.github/test-pt-matrix.yaml
.github/bin/redshift/.cluster-identifier
110 changes: 109 additions & 1 deletion plugin/trino-redshift/README.md
@@ -6,7 +6,7 @@ is a single dc2.large instance. Additionally, you will need an S3 bucket
containing TPCH tiny data in Parquet format. The files should be named:

```
s3://<your_bucket>/tpch/tiny/<table_name>.parquet
s3://<your_bucket>/tpch/tiny/<table_name>/*.parquet
```

To run the tests set the following system properties:
@@ -18,3 +18,111 @@ test.redshift.jdbc.password=<password>
test.redshift.s3.tpch.tables.root=<your_bucket>
test.redshift.iam.role=<your_iam_arn_to_access_bucket>
```

## Redshift Cluster CI Infrastructure setup

### AWS VPC setup
In the _AWS VPC_ service, create a VPC named `redshift-vpc`.
Key properties to configure on the VPC:

- `IPv4 CIDR`: `192.168.0.0/16`

Create an Internet Gateway `redshift-igw` for the `redshift-vpc`.

Create a subnet `redshift-public-subnet` in the VPC.
In the subnet's route table, make sure to add a route with
`Destination 0.0.0.0/0` targeting the previously created
internet gateway `redshift-igw`.
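
The same layout can be provisioned from the command line. A sketch, assuming the AWS CLI; the subnet CIDR is an arbitrary slice of the VPC range, and naming tags are omitted for brevity:

```
# Hypothetical CLI equivalent of the console steps above.
VPC_ID=$(aws ec2 create-vpc --cidr-block 192.168.0.0/16 \
    --query 'Vpc.VpcId' --output text)
IGW_ID=$(aws ec2 create-internet-gateway \
    --query 'InternetGateway.InternetGatewayId' --output text)
aws ec2 attach-internet-gateway --internet-gateway-id "${IGW_ID}" --vpc-id "${VPC_ID}"
SUBNET_ID=$(aws ec2 create-subnet --vpc-id "${VPC_ID}" --cidr-block 192.168.0.0/24 \
    --query 'Subnet.SubnetId' --output text)
RT_ID=$(aws ec2 create-route-table --vpc-id "${VPC_ID}" \
    --query 'RouteTable.RouteTableId' --output text)
# Route all outbound traffic through the internet gateway.
aws ec2 create-route --route-table-id "${RT_ID}" \
    --destination-cidr-block 0.0.0.0/0 --gateway-id "${IGW_ID}"
aws ec2 associate-route-table --route-table-id "${RT_ID}" --subnet-id "${SUBNET_ID}"
```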

Create a Security Group `redshift-sg`.
Make the following adjustments in the security group to allow access to the
Redshift cluster from the general-purpose GitHub CI runners:

- add an Inbound rule accepting `All traffic` from Source `0.0.0.0/0`
- add an Outbound rule for `All traffic` to destination `0.0.0.0/0`
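
A sketch of the equivalent CLI calls, reusing the `VPC_ID` from the VPC sketch above; note that outbound all-traffic is already the default on a newly created security group:

```
# Hypothetical CLI equivalent; the wide-open ingress mirrors the console rule above.
SG_ID=$(aws ec2 create-security-group --group-name redshift-sg \
    --description "Redshift CI access" --vpc-id "${VPC_ID}" \
    --query 'GroupId' --output text)
aws ec2 authorize-security-group-ingress --group-id "${SG_ID}" \
    --protocol all --cidr 0.0.0.0/0
```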

### Amazon Redshift setup

Create a subnet group `cluster-subnet-group-trino-ci` associated with
the VPC `redshift-vpc` and the VPC subnet `redshift-public-subnet`.
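
A sketch of the same step with the CLI, reusing the `SUBNET_ID` from the VPC sketch:

```
aws redshift create-cluster-subnet-group \
    --cluster-subnet-group-name cluster-subnet-group-trino-ci \
    --description "Subnet group for the Trino CI Redshift clusters" \
    --subnet-ids "${SUBNET_ID}"
```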

### AWS IAM setup

Create the AWS IAM role `redshift-ci` and attach the
`AmazonRedshiftAllCommandsFullAccess` policy to it.
This role is passed to the ephemeral Redshift cluster so that it can
execute `COPY` from the AWS S3 bucket.
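
A sketch of creating the role with the CLI; the trust policy allows the Redshift service to assume it:

```
# Hypothetical CLI equivalent of creating the role in the console.
aws iam create-role --role-name redshift-ci \
    --assume-role-policy-document '{
        "Version": "2012-10-17",
        "Statement": [{
            "Effect": "Allow",
            "Principal": {"Service": "redshift.amazonaws.com"},
            "Action": "sts:AssumeRole"
        }]
    }'
aws iam attach-role-policy --role-name redshift-ci \
    --policy-arn arn:aws:iam::aws:policy/AmazonRedshiftAllCommandsFullAccess
```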

Ensure that the AWS IAM user used by the CI process has the permissions
required to manage ephemeral Amazon Redshift clusters:

```
{
    "Version": "2012-10-17",
    "Statement": [
        {
            "Sid": "PassRoleToRedshiftCluster",
            "Effect": "Allow",
            "Action": "iam:PassRole",
            "Resource": "arn:aws:iam::894365193301:role/redshift-ci"
        },
        {
            "Sid": "RedshiftClusterManagement",
            "Effect": "Allow",
            "Action": [
                "redshift:DeleteTags",
                "redshift:DeleteCluster",
                "redshift:CreateTags",
                "redshift:CreateCluster",
                "redshift:DescribeClusters",
                "redshift:DescribeLoggingStatus"
            ],
            "Resource": "arn:aws:redshift:us-east-2:894365193301:cluster:trino-redshift-ci-cluster-*"
        },
        {
            "Sid": "DescribeRedshiftVpcComponents",
            "Effect": "Allow",
            "Action": [
                "ec2:DescribeInternetGateways",
                "ec2:DescribeAddresses",
                "ec2:DescribeAvailabilityZones",
                "ec2:DescribeVpcs",
                "ec2:DescribeAccountAttributes",
                "ec2:DescribeSubnets",
                "ec2:DescribeSecurityGroups"
            ],
            "Resource": "*"
        }
    ]
}
```
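
The policy can be attached to the CI user as an inline policy. A sketch, assuming the JSON above is saved as `redshift-ci-policy.json` and the CI user is named `trino-ci` (a placeholder):

```
aws iam put-user-policy --user-name trino-ci \
    --policy-name redshift-ci-cluster-management \
    --policy-document file://redshift-ci-policy.json
```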

### AWS S3 setup

The `trino-redshift` tests rely on a Redshift cluster whose TPCH tables
are filled with data.
Create an AWS S3 bucket and populate it with the Parquet content of the
`tpch` tables, exported locally through the `trino-hive` connector
via commands like:

```
CREATE TABLE hive.tiny.table_name WITH (format = 'parquet') AS TABLE tpch.tiny.table_name
```
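
For the Parquet files to land under the expected prefix, the `tiny` schema can first be pointed at the bucket. A sketch using the Trino CLI, assuming the `hive` catalog is configured with credentials for the target bucket:

```
# Hypothetical: create the hive schema with an explicit S3 location before the CTAS.
trino --execute "CREATE SCHEMA IF NOT EXISTS hive.tiny WITH (location = 's3://<your_bucket>/tpch/tiny')"
```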

The content of the S3 bucket should look like this:

```
s3://<your_bucket>/tpch/tiny/<table_name>/*.parquet
```

where `table_name` is one of:

- `customer`
- `lineitem`
- `nation`
- `orders`
- `part`
- `partsupp`
- `region`
- `supplier`
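
Exporting all eight tables can be scripted against the Trino CLI. A sketch, assuming a Trino server with the `hive` and `tpch` catalogs configured:

```
# Hypothetical export loop; one CTAS per TPCH tiny table.
for table in customer lineitem nation orders part partsupp region supplier; do
    trino --execute "CREATE TABLE hive.tiny.${table} WITH (format = 'parquet') AS TABLE tpch.tiny.${table}"
done
```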

16 changes: 8 additions & 8 deletions plugin/trino-redshift/pom.xml
@@ -174,22 +174,22 @@

<profiles>
<profile>
<id>default</id>
<id>cloud-tests</id>
<activation>
<activeByDefault>true</activeByDefault>
<activeByDefault>false</activeByDefault>
</activation>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<configuration>
<excludes>
<exclude>**/TestRedshiftAutomaticJoinPushdown.java</exclude>
<exclude>**/TestRedshiftConnectorTest.java</exclude>
<exclude>**/TestRedshiftTableStatisticsReader.java</exclude>
<exclude>**/TestRedshiftTypeMapping.java</exclude>
</excludes>
<includes>
<!-- Run only the smoke tests of the connector in the CI environment, because the -->
<!-- unpredictable locations of the GitHub runners can lead to increased client -->
<!-- latency on the JDBC operations performed on the ephemeral AWS Redshift cluster. -->
<include>**/TestRedshiftConnectorSmokeTest.java</include>
</includes>
</configuration>
</plugin>
</plugins>
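
With the `default` profile replaced by the opt-in `cloud-tests` profile, the Redshift tests no longer run in a plain build. A sketch of invoking the smoke test explicitly, with placeholder values for the cluster coordinates:

```
# Hypothetical local invocation; mirrors the CI step above.
./mvnw test -pl plugin/trino-redshift -P cloud-tests \
    -Dtest.redshift.jdbc.user="admin" \
    -Dtest.redshift.jdbc.password="<password>" \
    -Dtest.redshift.jdbc.endpoint="<endpoint>:5439/"
```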
4 changes: 2 additions & 2 deletions plugin/trino-redshift/src/test/java/io/trino/plugin/redshift/RedshiftQueryRunner.java
@@ -105,7 +105,7 @@ public static DistributedQueryRunner createRedshiftQueryRunner(
runner.installPlugin(new RedshiftPlugin());
runner.createCatalog(TEST_CATALOG, CONNECTOR_NAME, properties);

executeInRedshift("CREATE SCHEMA IF NOT EXISTS " + TEST_SCHEMA);
executeInRedshiftWithRetry("CREATE SCHEMA IF NOT EXISTS " + TEST_SCHEMA);
createUserIfNotExists(NON_GRANTED_USER, JDBC_PASSWORD);
createUserIfNotExists(GRANTED_USER, JDBC_PASSWORD);

@@ -197,7 +197,7 @@ private static synchronized void provisionTables(Session session, QueryRunner qu

private static void copyFromS3(QueryRunner queryRunner, Session session, String name)
{
String s3Path = format("%s/%s/%s.parquet", S3_TPCH_TABLES_ROOT, TPCH_CATALOG, name);
String s3Path = format("%s/%s/%s/%s/", S3_TPCH_TABLES_ROOT, TPCH_CATALOG, TINY_SCHEMA_NAME, name);
log.info("Creating table %s in Redshift copying from %s", name, s3Path);

// Create table in ephemeral Redshift cluster with no data
49 changes: 49 additions & 0 deletions plugin/trino-redshift/src/test/java/io/trino/plugin/redshift/TestRedshiftConnectorSmokeTest.java
@@ -0,0 +1,49 @@
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.trino.plugin.redshift;

import com.google.common.collect.ImmutableMap;
import io.trino.plugin.jdbc.BaseJdbcConnectorSmokeTest;
import io.trino.testing.QueryRunner;
import io.trino.testing.TestingConnectorBehavior;

import static io.trino.plugin.redshift.RedshiftQueryRunner.createRedshiftQueryRunner;
import static io.trino.testing.TestingConnectorBehavior.SUPPORTS_RENAME_TABLE_ACROSS_SCHEMAS;

public class TestRedshiftConnectorSmokeTest
extends BaseJdbcConnectorSmokeTest
{
@Override
@SuppressWarnings("DuplicateBranchesInSwitch") // options here are grouped per-feature
protected boolean hasBehavior(TestingConnectorBehavior connectorBehavior)
{
switch (connectorBehavior) {
case SUPPORTS_RENAME_TABLE_ACROSS_SCHEMAS:
return false;

default:
return super.hasBehavior(connectorBehavior);
}
}

@Override
protected QueryRunner createQueryRunner()
throws Exception
{
return createRedshiftQueryRunner(
ImmutableMap.of(),
ImmutableMap.of(),
REQUIRED_TPCH_TABLES);
}
}