From fe5a925d5a58bd184be97874fa41aa44f72f59fd Mon Sep 17 00:00:00 2001
From: Marius Grama
Date: Mon, 23 Jan 2023 22:54:16 +0100
Subject: [PATCH] Add CI infrastructure for `trino-redshift`

Use an ephemeral/throw-away Redshift cluster for running
integration tests on `trino-redshift` connector.
Once the tests are run, the testing Redshift cluster is reclaimed.

The Redshift cluster is publicly accessible in order to be
accessible from the general purpose GitHub runners.
---
 .github/bin/redshift/delete-aws-redshift.sh   |  38 ++++++
 .github/bin/redshift/setup-aws-redshift.sh    |  63 ++++++++++
 .github/workflows/ci.yml                      |  34 ++++++
 .gitignore                                    |   1 +
 plugin/trino-redshift/README.md               | 110 +++++++++++++++++-
 plugin/trino-redshift/pom.xml                 |  16 +--
 .../plugin/redshift/RedshiftQueryRunner.java  |   4 +-
 .../TestRedshiftConnectorSmokeTest.java       |  49 ++++++++
 8 files changed, 304 insertions(+), 11 deletions(-)
 create mode 100755 .github/bin/redshift/delete-aws-redshift.sh
 create mode 100755 .github/bin/redshift/setup-aws-redshift.sh
 create mode 100644 plugin/trino-redshift/src/test/java/io/trino/plugin/redshift/TestRedshiftConnectorSmokeTest.java

diff --git a/.github/bin/redshift/delete-aws-redshift.sh b/.github/bin/redshift/delete-aws-redshift.sh
new file mode 100755
index 000000000000..2a9ad0400b11
--- /dev/null
+++ b/.github/bin/redshift/delete-aws-redshift.sh
@@ -0,0 +1,38 @@
+#!/usr/bin/env bash
+
+# Deletes the Amazon Redshift cluster whose identifier is stored in the
+# .cluster-identifier file located next to this script (the file is written by
+# setup-aws-redshift.sh). Cleanup is best-effort: every exit path below returns
+# status 0, so that a failed cleanup does not fail the build.
+
+set -uo pipefail
+
+REDSHIFT_SCRIPTS_DIR="${BASH_SOURCE%/*}"
+
+if [[ ! -f "${REDSHIFT_SCRIPTS_DIR}/.cluster-identifier" ]]; then
+    echo "Missing file ${REDSHIFT_SCRIPTS_DIR}/.cluster-identifier"
+    exit 0
+fi
+
+REDSHIFT_CLUSTER_IDENTIFIER=$(cat "${REDSHIFT_SCRIPTS_DIR}/.cluster-identifier")
+
+echo "Deleting Amazon Redshift cluster $REDSHIFT_CLUSTER_IDENTIFIER"
+REDSHIFT_DELETE_CLUSTER_OUTPUT=$(aws redshift delete-cluster --cluster-identifier "${REDSHIFT_CLUSTER_IDENTIFIER}" --skip-final-cluster-snapshot)
+
+if [ -z "${REDSHIFT_DELETE_CLUSTER_OUTPUT}" ]; then
+    echo "No response from 'aws redshift delete-cluster' for cluster $REDSHIFT_CLUSTER_IDENTIFIER; skipping the wait for its deletion"
+    # Don't fail the build because of cleanup issues
+    exit 0
+fi
+
+echo "Waiting for the Amazon Redshift cluster $REDSHIFT_CLUSTER_IDENTIFIER to be deleted"
+if aws redshift wait cluster-deleted \
+    --cluster-identifier "${REDSHIFT_CLUSTER_IDENTIFIER}"
+then
+    echo "Amazon Redshift cluster $REDSHIFT_CLUSTER_IDENTIFIER has been deleted"
+else
+    echo "Amazon Redshift cluster $REDSHIFT_CLUSTER_IDENTIFIER deletion has timed out"
+fi
+
+rm -f "${REDSHIFT_SCRIPTS_DIR}/.cluster-identifier"
+exit 0
diff --git a/.github/bin/redshift/setup-aws-redshift.sh b/.github/bin/redshift/setup-aws-redshift.sh
new file mode 100755
index 000000000000..bc9189328596
--- /dev/null
+++ b/.github/bin/redshift/setup-aws-redshift.sh
@@ -0,0 +1,63 @@
+#!/usr/bin/env bash
+
+# Creates a single-node Amazon Redshift cluster and waits until it is available.
+# Intended to be sourced: on success it exports REDSHIFT_ENDPOINT, REDSHIFT_PORT,
+# REDSHIFT_CLUSTER_DATABASE_NAME, REDSHIFT_USER and REDSHIFT_PASSWORD.
+# Expects AWS_REGION, REDSHIFT_SUBNET_GROUP_NAME, REDSHIFT_VPC_SECURITY_GROUP_IDS and
+# REDSHIFT_IAM_ROLES to be set, and the aws, jq and openssl commands to be available.
+
+set -euo pipefail
+
+REDSHIFT_SCRIPTS_DIR="${BASH_SOURCE%/*}"
+
+# Redshift requires passwords containing at least a digit, a lower case letter and an upper case letter.
+# As there is no guarantee that the openssl output satisfies this password policy,
+# explicitly append the string 'Red1!' to the password
+REDSHIFT_PASSWORD="$(openssl rand -base64 16 | tr -dc 'a-zA-Z0-9')Red1!"
+
+REDSHIFT_CLUSTER_IDENTIFIER=trino-redshift-ci-cluster-$(openssl rand -hex 8)
+
+# GNU date syntax first, BSD date syntax as fallback
+REDSHIFT_CLUSTER_TTL=$(date -u -d "+2 hours" +"%Y-%m-%dT%H:%M:%SZ" 2>/dev/null || date -u -v "+2H" +"%Y-%m-%dT%H:%M:%SZ")
+
+echo "Creating the Amazon Redshift cluster ${REDSHIFT_CLUSTER_IDENTIFIER} on the region ${AWS_REGION}."
+# NOTE(review): REDSHIFT_IAM_ROLES stays unquoted, presumably so that several space-separated role ARNs become separate arguments - confirm
+REDSHIFT_CREATE_CLUSTER_OUTPUT=$(aws redshift create-cluster \
+  --db-name testdb \
+  --region "${AWS_REGION}" \
+  --node-type dc2.large \
+  --number-of-nodes 1 \
+  --master-username admin \
+  --master-user-password "${REDSHIFT_PASSWORD}" \
+  --cluster-identifier "${REDSHIFT_CLUSTER_IDENTIFIER}" \
+  --cluster-subnet-group-name "${REDSHIFT_SUBNET_GROUP_NAME}" \
+  --cluster-type single-node \
+  --vpc-security-group-ids "${REDSHIFT_VPC_SECURITY_GROUP_IDS}" \
+  --iam-roles ${REDSHIFT_IAM_ROLES} \
+  --automated-snapshot-retention-period 0 \
+  --publicly-accessible \
+  --tags Key=cloud,Value=aws Key=environment,Value=test Key=project,Value=trino-redshift Key=ttl,Value="${REDSHIFT_CLUSTER_TTL}")
+
+if [ -z "${REDSHIFT_CREATE_CLUSTER_OUTPUT}" ]; then
+    # Defensive check: without a response nothing confirms that the cluster creation was accepted
+    echo "No response from 'aws redshift create-cluster' for cluster ${REDSHIFT_CLUSTER_IDENTIFIER}" >&2
+    exit 1
+fi
+
+echo "${REDSHIFT_CLUSTER_IDENTIFIER}" > "${REDSHIFT_SCRIPTS_DIR}/.cluster-identifier"
+echo "Waiting for the Amazon Redshift cluster ${REDSHIFT_CLUSTER_IDENTIFIER} on the region ${AWS_REGION} to be available."
+
+# Wait for the cluster to become available
+aws redshift wait cluster-available \
+  --cluster-identifier "${REDSHIFT_CLUSTER_IDENTIFIER}"
+
+echo "The Amazon Redshift cluster ${REDSHIFT_CLUSTER_IDENTIFIER} on the region ${AWS_REGION} is available for queries."
+
+REDSHIFT_CLUSTER_DESCRIPTION=$(aws redshift describe-clusters --cluster-identifier "${REDSHIFT_CLUSTER_IDENTIFIER}")
+
+# Assign first and export afterwards, so that a failing jq is not masked by the exit status of 'export'
+REDSHIFT_ENDPOINT=$(echo "${REDSHIFT_CLUSTER_DESCRIPTION}" | jq -r '.Clusters[0].Endpoint.Address')
+REDSHIFT_PORT=$(echo "${REDSHIFT_CLUSTER_DESCRIPTION}" | jq -r '.Clusters[0].Endpoint.Port')
+REDSHIFT_CLUSTER_DATABASE_NAME=$(echo "${REDSHIFT_CLUSTER_DESCRIPTION}" | jq -r '.Clusters[0].DBName')
+REDSHIFT_USER=$(echo "${REDSHIFT_CLUSTER_DESCRIPTION}" | jq -r '.Clusters[0].MasterUsername')
+export REDSHIFT_ENDPOINT REDSHIFT_PORT REDSHIFT_CLUSTER_DATABASE_NAME REDSHIFT_USER REDSHIFT_PASSWORD
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7cb7d96aa947..320aedf67211 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -538,6 +538,7 @@ jobs: - { modules: plugin/trino-postgresql } - { modules: plugin/trino-raptor-legacy } - { modules: plugin/trino-redis } + - { modules: plugin/trino-redshift, profile: cloud-tests } - { modules: plugin/trino-singlestore } - { modules: plugin/trino-sqlserver } - { modules: testing/trino-faulttolerant-tests, profile: default } @@ -597,6 +598,7 @@ jobs: && ! (contains(matrix.modules, 'trino-delta-lake') && contains(matrix.profile, 'gcs-tests')) && ! (contains(matrix.modules, 'trino-iceberg') && contains(matrix.profile, 'cloud-tests')) && ! (contains(matrix.modules, 'trino-bigquery') && contains(matrix.profile, 'cloud-tests-arrow')) + && ! 
(contains(matrix.modules, 'trino-redshift') && contains(matrix.profile, 'cloud-tests')) run: $MAVEN test ${MAVEN_TEST} -pl ${{ matrix.modules }} ${{ matrix.profile != '' && format('-P {0}', matrix.profile) || '' }} # Additional tests for selected modules - name: Cloud Delta Lake Tests @@ -678,6 +680,38 @@ jobs: -Dhive.hadoop2.azure-abfs-container="${ABFS_CONTAINER}" \ -Dhive.hadoop2.azure-abfs-account="${ABFS_ACCOUNT}" \ -Dhive.hadoop2.azure-abfs-access-key="${ABFS_ACCESS_KEY}" + - name: Cloud Redshift Tests + env: + AWS_REGION: ${{ vars.REDSHIFT_AWS_REGION }} + AWS_ACCESS_KEY_ID: ${{ secrets.REDSHIFT_AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.REDSHIFT_AWS_SECRET_ACCESS_KEY }} + REDSHIFT_SUBNET_GROUP_NAME: ${{ vars.REDSHIFT_SUBNET_GROUP_NAME }} + REDSHIFT_IAM_ROLES: ${{ vars.REDSHIFT_IAM_ROLES }} + REDSHIFT_VPC_SECURITY_GROUP_IDS: ${{ vars.REDSHIFT_VPC_SECURITY_GROUP_IDS }} + REDSHIFT_S3_TPCH_TABLES_ROOT: ${{ vars.REDSHIFT_S3_TPCH_TABLES_ROOT }} + if: >- + contains(matrix.modules, 'trino-redshift') && contains(matrix.profile, 'cloud-tests') && + (env.AWS_ACCESS_KEY_ID != '' || env.REDSHIFT_SUBNET_GROUP_NAME != '') + run: | + source .github/bin/redshift/setup-aws-redshift.sh + + $MAVEN test ${MAVEN_TEST} -pl :trino-redshift ${{ format('-P {0}', matrix.profile) }} \ + -Dtest.redshift.jdbc.user="${REDSHIFT_USER}" \ + -Dtest.redshift.jdbc.password="${REDSHIFT_PASSWORD}" \ + -Dtest.redshift.jdbc.endpoint="${REDSHIFT_ENDPOINT}:${REDSHIFT_PORT}/" \ + -Dtest.redshift.s3.tpch.tables.root="${REDSHIFT_S3_TPCH_TABLES_ROOT}" \ + -Dtest.redshift.iam.role="${REDSHIFT_IAM_ROLES}" \ + -Dtest.redshift.aws.region="${AWS_REGION}" \ + -Dtest.redshift.aws.access-key="${AWS_ACCESS_KEY_ID}" \ + -Dtest.redshift.aws.secret-key="${AWS_SECRET_ACCESS_KEY}" + - name: Cleanup ephemeral Redshift Cluster + env: + AWS_REGION: ${{ vars.REDSHIFT_AWS_REGION }} + AWS_ACCESS_KEY_ID: ${{ secrets.REDSHIFT_AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ 
secrets.REDSHIFT_AWS_SECRET_ACCESS_KEY }} + # Cancelled workflows may have left the ephemeral cluster running + if: always() + run: .github/bin/redshift/delete-aws-redshift.sh - name: Sanitize artifact name if: always() run: | diff --git a/.gitignore b/.gitignore index 77a854990a1d..ea694efb023a 100644 --- a/.gitignore +++ b/.gitignore @@ -30,3 +30,4 @@ product-test-reports /impacted-features.log .github/test-matrix.yaml .github/test-pt-matrix.yaml +.github/bin/redshift/.cluster-identifier diff --git a/plugin/trino-redshift/README.md b/plugin/trino-redshift/README.md index 16229b145da1..e8b00a63765a 100644 --- a/plugin/trino-redshift/README.md +++ b/plugin/trino-redshift/README.md @@ -6,7 +6,7 @@ is a single dc2.large instance. Additionally, you will need a S3 bucket containing TPCH tiny data in Parquet format. The files should be named: ``` -s3://<your_bucket>/tpch/tiny/<table_name>.parquet +s3://<your_bucket>/tpch/tiny/<table_name>/*.parquet ``` To run the tests set the following system properties: @@ -18,3 +18,111 @@ test.redshift.jdbc.password= test.redshift.s3.tpch.tables.root= test.redshift.iam.role= ``` + +## Redshift Cluster CI Infrastructure setup + +### AWS VPC setup +On _AWS VPC_ service create a VPC - `redshift-vpc`. +Key properties to configure on the VPC: + +- `IPv4 CIDR`: `192.168.0.0/16` + +Create for the `redshift-vpc` an Internet Gateway - `redshift-igw`. + +Create a subnet for the VPC `redshift-public-subnet`. +In the route table of the subnet make sure to add the route +`Destination 0.0.0.0/0` to `Target` the previously created +internet gateway `redshift-igw`. + +Create a Security Group `redshift-sg`. 
+Make the following adjustments in the security group to allow access to the +Redshift cluster from the general purpose GitHub CI runners: + +- add an Inbound rule accepting `All traffic` from Source `0.0.0.0/0` +- add an Outbound rule for `All traffic` to destination `0.0.0.0/0` + +### Amazon Redshift setup + +Create a subnet group `cluster-subnet-group-trino-ci` associated with +the VPC `redshift-vpc` and the VPC subnet `redshift-public-subnet`. + +### AWS IAM setup + +Create the AWS IAM role `redshift-ci` and add to it +the `AmazonRedshiftAllCommandsFullAccess` policy. +This role will be passed to the ephemeral Redshift cluster to provide it with +the ability to execute `COPY` from an AWS S3 bucket. + +Ensure that the AWS IAM user used by the CI process has the ability to +create ephemeral Amazon Redshift clusters: + +``` +{ + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "PassRoleToRedshiftCluster", + "Effect": "Allow", + "Action": "iam:PassRole", + "Resource": "arn:aws:iam::894365193301:role/redshift-ci" + }, + { + "Sid": "RedshiftClusterManagement", + "Effect": "Allow", + "Action": [ + "redshift:DeleteTags", + "redshift:DeleteCluster", + "redshift:CreateTags", + "redshift:CreateCluster", + "redshift:DescribeClusters", + "redshift:DescribeLoggingStatus" + ], + "Resource": "arn:aws:redshift:us-east-2:894365193301:cluster:trino-redshift-ci-cluster-*" + }, + { + "Sid": "DescribeRedshiftVpcComponents", + "Effect": "Allow", + "Action": [ + "ec2:DescribeInternetGateways", + "ec2:DescribeAddresses", + "ec2:DescribeAvailabilityZones", + "ec2:DescribeVpcs", + "ec2:DescribeAccountAttributes", + "ec2:DescribeSubnets", + "ec2:DescribeSecurityGroups" + ], + "Resource": "*" + } + ] +} +``` + +### AWS S3 setup + +The `trino-redshift` tests rely on a Redshift cluster +having TPCH tables filled with data. 
+Create an AWS S3 bucket and add to it the parquet content +of `tpch` tables saved locally through the `trino-hive` connector +via commands like: + +``` +CREATE TABLE hive.tiny.table_name WITH (format= 'parquet') AS TABLE tpch.tiny.table_name +``` + +The content of the S3 bucket should look like this: + +``` +s3://<your_bucket>/tpch/tiny/<table_name>/*.parquet +``` + +where `table_name` is: + +- `customer` +- `lineitem` +- `nation` +- `orders` +- `part` +- `partsupp` +- `region` +- `supplier` + diff --git a/plugin/trino-redshift/pom.xml b/plugin/trino-redshift/pom.xml index f91986c0b196..0813777a3971 100644 --- a/plugin/trino-redshift/pom.xml +++ b/plugin/trino-redshift/pom.xml @@ -174,9 +174,9 @@ - default + cloud-tests - true + false @@ -184,12 +184,12 @@ org.apache.maven.plugins maven-surefire-plugin - - **/TestRedshiftAutomaticJoinPushdown.java - **/TestRedshiftConnectorTest.java - **/TestRedshiftTableStatisticsReader.java - **/TestRedshiftTypeMapping.java - + + + + + **/TestRedshiftConnectorSmokeTest.java + diff --git a/plugin/trino-redshift/src/test/java/io/trino/plugin/redshift/RedshiftQueryRunner.java b/plugin/trino-redshift/src/test/java/io/trino/plugin/redshift/RedshiftQueryRunner.java index a9e5a21bca4c..97a67c1ffea1 100644 --- a/plugin/trino-redshift/src/test/java/io/trino/plugin/redshift/RedshiftQueryRunner.java +++ b/plugin/trino-redshift/src/test/java/io/trino/plugin/redshift/RedshiftQueryRunner.java @@ -105,7 +105,7 @@ public static DistributedQueryRunner createRedshiftQueryRunner( runner.installPlugin(new RedshiftPlugin()); runner.createCatalog(TEST_CATALOG, CONNECTOR_NAME, properties); - executeInRedshift("CREATE SCHEMA IF NOT EXISTS " + TEST_SCHEMA); + executeInRedshiftWithRetry("CREATE SCHEMA IF NOT EXISTS " + TEST_SCHEMA); createUserIfNotExists(NON_GRANTED_USER, JDBC_PASSWORD); createUserIfNotExists(GRANTED_USER, JDBC_PASSWORD); @@ -197,7 +197,7 @@ private static synchronized void provisionTables(Session session, QueryRunner qu private static void copyFromS3(QueryRunner 
queryRunner, Session session, String name) { - String s3Path = format("%s/%s/%s.parquet", S3_TPCH_TABLES_ROOT, TPCH_CATALOG, name); + String s3Path = format("%s/%s/%s/%s/", S3_TPCH_TABLES_ROOT, TPCH_CATALOG, TINY_SCHEMA_NAME, name); log.info("Creating table %s in Redshift copying from %s", name, s3Path); // Create table in ephemeral Redshift cluster with no data diff --git a/plugin/trino-redshift/src/test/java/io/trino/plugin/redshift/TestRedshiftConnectorSmokeTest.java b/plugin/trino-redshift/src/test/java/io/trino/plugin/redshift/TestRedshiftConnectorSmokeTest.java new file mode 100644 index 000000000000..7d7f43c45212 --- /dev/null +++ b/plugin/trino-redshift/src/test/java/io/trino/plugin/redshift/TestRedshiftConnectorSmokeTest.java @@ -0,0 +1,49 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package io.trino.plugin.redshift;
+
+import com.google.common.collect.ImmutableMap;
+import io.trino.plugin.jdbc.BaseJdbcConnectorSmokeTest;
+import io.trino.testing.QueryRunner;
+import io.trino.testing.TestingConnectorBehavior;
+
+import static io.trino.plugin.redshift.RedshiftQueryRunner.createRedshiftQueryRunner;
+import static io.trino.testing.TestingConnectorBehavior.SUPPORTS_RENAME_TABLE_ACROSS_SCHEMAS;
+
+/** Redshift flavour of {@link BaseJdbcConnectorSmokeTest}. */
+public class TestRedshiftConnectorSmokeTest
+        extends BaseJdbcConnectorSmokeTest
+{
+    @Override
+    @SuppressWarnings("DuplicateBranchesInSwitch") // options here are grouped per-feature
+    protected boolean hasBehavior(TestingConnectorBehavior connectorBehavior)
+    {
+        switch (connectorBehavior) {
+            case SUPPORTS_RENAME_TABLE_ACROSS_SCHEMAS:
+                return false;
+            default:
+                // every other behavior is answered by the base class below
+                break;
+        }
+        return super.hasBehavior(connectorBehavior);
+    }
+
+    @Override
+    protected QueryRunner createQueryRunner()
+            throws Exception
+    {
+        // Both property maps are passed empty, together with REQUIRED_TPCH_TABLES
+        return createRedshiftQueryRunner(ImmutableMap.of(), ImmutableMap.of(), REQUIRED_TPCH_TABLES);
+    }
+}