Add CI infrastructure for trino-redshift
Use an ephemeral, throw-away Redshift cluster for running integration tests
of the `trino-redshift` connector.
Once the tests have run, the testing Redshift cluster is reclaimed.

The Redshift cluster is publicly accessible so that it can be reached
from the general-purpose GitHub runners.
findinpath authored and findepi committed Mar 9, 2023
1 parent 3106f8d commit fe5a925
Showing 8 changed files with 291 additions and 11 deletions.
34 changes: 34 additions & 0 deletions .github/bin/redshift/delete-aws-redshift.sh
@@ -0,0 +1,34 @@
#!/usr/bin/env bash

set -uo pipefail

REDSHIFT_SCRIPTS_DIR="${BASH_SOURCE%/*}"

if [[ ! -f "${REDSHIFT_SCRIPTS_DIR}/.cluster-identifier" ]]; then
    echo "Missing file ${REDSHIFT_SCRIPTS_DIR}/.cluster-identifier"
    exit 0
fi

REDSHIFT_CLUSTER_IDENTIFIER=$(cat "${REDSHIFT_SCRIPTS_DIR}/.cluster-identifier")

echo "Deleting Amazon Redshift cluster $REDSHIFT_CLUSTER_IDENTIFIER"
REDSHIFT_DELETE_CLUSTER_OUTPUT=$(aws redshift delete-cluster --cluster-identifier $REDSHIFT_CLUSTER_IDENTIFIER --skip-final-cluster-snapshot)

if [ -z "${REDSHIFT_DELETE_CLUSTER_OUTPUT}" ]; then
echo ${REDSHIFT_DELETE_CLUSTER_OUTPUT}
# Don't fail the build because of cleanup issues
exit 0
fi

echo "Waiting for the Amazon Redshift cluster $REDSHIFT_CLUSTER_IDENTIFIER to be deleted"
aws redshift wait cluster-deleted \
--cluster-identifier $REDSHIFT_CLUSTER_IDENTIFIER
if [ "$?" -ne 0 ]
then
echo "Amazon Redshift cluster $REDSHIFT_CLUSTER_IDENTIFIER deletion has timed out"
else
echo "Amazon Redshift cluster $REDSHIFT_CLUSTER_IDENTIFIER has been deleted"
fi

rm -f "${REDSHIFT_SCRIPTS_DIR}/.cluster-identifier"
exit 0
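
The cleanup script can also be run by hand against a leftover cluster. A minimal sketch, assuming the AWS CLI is configured with the CI credentials; the cluster identifier below is a placeholder:

```
# Hypothetical local cleanup run; requires credentials with redshift:DeleteCluster.
export AWS_REGION=us-east-2
echo "trino-redshift-ci-cluster-0123456789abcdef" > .github/bin/redshift/.cluster-identifier
.github/bin/redshift/delete-aws-redshift.sh
```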
54 changes: 54 additions & 0 deletions .github/bin/redshift/setup-aws-redshift.sh
@@ -0,0 +1,54 @@
#!/usr/bin/env bash

set -euo pipefail

REDSHIFT_SCRIPTS_DIR="${BASH_SOURCE%/*}"

# Redshift requires passwords to contain at least one digit, one lowercase letter and one uppercase letter.
# Since there is no guarantee that the openssl output satisfies this password policy,
# explicitly append the string 'Red1!' to the password.
REDSHIFT_PASSWORD="$(openssl rand -base64 16 | tr -dc 'a-zA-Z0-9')Red1!"

REDSHIFT_CLUSTER_IDENTIFIER=trino-redshift-ci-cluster-$(openssl rand -hex 8)

REDSHIFT_CLUSTER_TTL=$(date -u -d "+2 hours" +"%Y-%m-%dT%H:%M:%SZ" 2>/dev/null || date -u -v "+2H" +"%Y-%m-%dT%H:%M:%SZ")

echo "Creating the Amazon Redshift cluster ${REDSHIFT_CLUSTER_IDENTIFIER} on the region ${AWS_REGION}."
REDSHIFT_CREATE_CLUSTER_OUTPUT=$(aws redshift create-cluster \
    --db-name testdb \
    --region "${AWS_REGION}" \
    --node-type dc2.large \
    --number-of-nodes 1 \
    --master-username admin \
    --master-user-password "${REDSHIFT_PASSWORD}" \
    --cluster-identifier "${REDSHIFT_CLUSTER_IDENTIFIER}" \
    --cluster-subnet-group-name "${REDSHIFT_SUBNET_GROUP_NAME}" \
    --cluster-type single-node \
    --vpc-security-group-ids "${REDSHIFT_VPC_SECURITY_GROUP_IDS}" \
    --iam-roles "${REDSHIFT_IAM_ROLES}" \
    --automated-snapshot-retention-period 0 \
    --publicly-accessible \
    --tags Key=cloud,Value=aws Key=environment,Value=test Key=project,Value=trino-redshift Key=ttl,Value="${REDSHIFT_CLUSTER_TTL}")

if [ -z "${REDSHIFT_CREATE_CLUSTER_OUTPUT}" ]; then
# Only show errors
echo ${REDSHIFT_CREATE_CLUSTER_OUTPUT}
exit 1
fi

echo "${REDSHIFT_CLUSTER_IDENTIFIER}" > "${REDSHIFT_SCRIPTS_DIR}/.cluster-identifier"
echo "Waiting for the Amazon Redshift cluster ${REDSHIFT_CLUSTER_IDENTIFIER} in the region ${AWS_REGION} to become available."

# Wait for the cluster to become available
aws redshift wait cluster-available \
    --cluster-identifier "${REDSHIFT_CLUSTER_IDENTIFIER}"

echo "The Amazon Redshift cluster ${REDSHIFT_CLUSTER_IDENTIFIER} on the region ${AWS_REGION} is available for queries."

REDSHIFT_CLUSTER_DESCRIPTION=$(aws redshift describe-clusters --cluster-identifier "${REDSHIFT_CLUSTER_IDENTIFIER}")

export REDSHIFT_ENDPOINT=$(echo "${REDSHIFT_CLUSTER_DESCRIPTION}" | jq -r '.Clusters[0].Endpoint.Address')
export REDSHIFT_PORT=$(echo "${REDSHIFT_CLUSTER_DESCRIPTION}" | jq -r '.Clusters[0].Endpoint.Port')
export REDSHIFT_CLUSTER_DATABASE_NAME=$(echo "${REDSHIFT_CLUSTER_DESCRIPTION}" | jq -r '.Clusters[0].DBName')
export REDSHIFT_USER=$(echo "${REDSHIFT_CLUSTER_DESCRIPTION}" | jq -r '.Clusters[0].MasterUsername')
export REDSHIFT_PASSWORD
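
Because the script exports the connection variables instead of printing them, it is meant to be sourced. A sketch of checking the fresh cluster locally, assuming `psql` is installed (Redshift speaks the PostgreSQL wire protocol):

```
# Hypothetical local smoke check of the freshly created cluster.
source .github/bin/redshift/setup-aws-redshift.sh
PGPASSWORD="${REDSHIFT_PASSWORD}" psql \
    --host "${REDSHIFT_ENDPOINT}" \
    --port "${REDSHIFT_PORT}" \
    --username "${REDSHIFT_USER}" \
    --dbname "${REDSHIFT_CLUSTER_DATABASE_NAME}" \
    --command "SELECT 1"
```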
34 changes: 34 additions & 0 deletions .github/workflows/ci.yml
@@ -538,6 +538,7 @@ jobs:
- { modules: plugin/trino-postgresql }
- { modules: plugin/trino-raptor-legacy }
- { modules: plugin/trino-redis }
- { modules: plugin/trino-redshift, profile: cloud-tests }
- { modules: plugin/trino-singlestore }
- { modules: plugin/trino-sqlserver }
- { modules: testing/trino-faulttolerant-tests, profile: default }
@@ -597,6 +598,7 @@
&& ! (contains(matrix.modules, 'trino-delta-lake') && contains(matrix.profile, 'gcs-tests'))
&& ! (contains(matrix.modules, 'trino-iceberg') && contains(matrix.profile, 'cloud-tests'))
&& ! (contains(matrix.modules, 'trino-bigquery') && contains(matrix.profile, 'cloud-tests-arrow'))
&& ! (contains(matrix.modules, 'trino-redshift') && contains(matrix.profile, 'cloud-tests'))
run: $MAVEN test ${MAVEN_TEST} -pl ${{ matrix.modules }} ${{ matrix.profile != '' && format('-P {0}', matrix.profile) || '' }}
# Additional tests for selected modules
- name: Cloud Delta Lake Tests
@@ -678,6 +680,38 @@ jobs:
-Dhive.hadoop2.azure-abfs-container="${ABFS_CONTAINER}" \
-Dhive.hadoop2.azure-abfs-account="${ABFS_ACCOUNT}" \
-Dhive.hadoop2.azure-abfs-access-key="${ABFS_ACCESS_KEY}"
- name: Cloud Redshift Tests
env:
AWS_REGION: ${{ vars.REDSHIFT_AWS_REGION }}
AWS_ACCESS_KEY_ID: ${{ secrets.REDSHIFT_AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.REDSHIFT_AWS_SECRET_ACCESS_KEY }}
REDSHIFT_SUBNET_GROUP_NAME: ${{ vars.REDSHIFT_SUBNET_GROUP_NAME }}
REDSHIFT_IAM_ROLES: ${{ vars.REDSHIFT_IAM_ROLES }}
REDSHIFT_VPC_SECURITY_GROUP_IDS: ${{ vars.REDSHIFT_VPC_SECURITY_GROUP_IDS }}
REDSHIFT_S3_TPCH_TABLES_ROOT: ${{ vars.REDSHIFT_S3_TPCH_TABLES_ROOT }}
if: >-
contains(matrix.modules, 'trino-redshift') && contains(matrix.profile, 'cloud-tests') &&
(env.AWS_ACCESS_KEY_ID != '' || env.REDSHIFT_SUBNET_GROUP_NAME != '')
run: |
source .github/bin/redshift/setup-aws-redshift.sh
$MAVEN test ${MAVEN_TEST} -pl :trino-redshift ${{ format('-P {0}', matrix.profile) }} \
-Dtest.redshift.jdbc.user="${REDSHIFT_USER}" \
-Dtest.redshift.jdbc.password="${REDSHIFT_PASSWORD}" \
-Dtest.redshift.jdbc.endpoint="${REDSHIFT_ENDPOINT}:${REDSHIFT_PORT}/" \
-Dtest.redshift.s3.tpch.tables.root="${REDSHIFT_S3_TPCH_TABLES_ROOT}" \
-Dtest.redshift.iam.role="${REDSHIFT_IAM_ROLES}" \
-Dtest.redshift.aws.region="${AWS_REGION}" \
-Dtest.redshift.aws.access-key="${AWS_ACCESS_KEY_ID}" \
-Dtest.redshift.aws.secret-key="${AWS_SECRET_ACCESS_KEY}"
- name: Cleanup ephemeral Redshift Cluster
env:
AWS_REGION: ${{ vars.REDSHIFT_AWS_REGION }}
AWS_ACCESS_KEY_ID: ${{ secrets.REDSHIFT_AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.REDSHIFT_AWS_SECRET_ACCESS_KEY }}
# Cancelled workflows may have left the ephemeral cluster running
if: always()
run: .github/bin/redshift/delete-aws-redshift.sh
- name: Sanitize artifact name
if: always()
run: |
1 change: 1 addition & 0 deletions .gitignore
@@ -30,3 +30,4 @@ product-test-reports
/impacted-features.log
.github/test-matrix.yaml
.github/test-pt-matrix.yaml
.github/bin/redshift/.cluster-identifier
110 changes: 109 additions & 1 deletion plugin/trino-redshift/README.md
@@ -6,7 +6,7 @@ is a single dc2.large instance. Additionally, you will need an S3 bucket
containing TPCH tiny data in Parquet format. The files should be named:

```
s3://<your_bucket>/tpch/tiny/<table_name>.parquet
s3://<your_bucket>/tpch/tiny/<table_name>/*.parquet
```

To run the tests set the following system properties:
@@ -18,3 +18,111 @@ test.redshift.jdbc.password=<password>
test.redshift.s3.tpch.tables.root=<your_bucket>
test.redshift.iam.role=<your_iam_arn_to_access_bucket>
```

## Redshift Cluster CI Infrastructure setup

### AWS VPC setup
In the _AWS VPC_ service, create a VPC named `redshift-vpc`.
Key properties to configure on the VPC:

- `IPv4 CIDR`: `192.168.0.0/16`

Create an Internet Gateway `redshift-igw` for the `redshift-vpc`.

Create a subnet `redshift-public-subnet` in the VPC.
In the subnet's route table, make sure to add a route with
`Destination 0.0.0.0/0` targeting the previously created
internet gateway `redshift-igw`.
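
The same layout can be provisioned from the command line. A sketch, assuming the AWS CLI; the subnet CIDR is an arbitrary slice of the VPC range, and naming tags are omitted for brevity:

```
# Hypothetical CLI equivalent of the console steps above.
VPC_ID=$(aws ec2 create-vpc --cidr-block 192.168.0.0/16 \
    --query 'Vpc.VpcId' --output text)
IGW_ID=$(aws ec2 create-internet-gateway \
    --query 'InternetGateway.InternetGatewayId' --output text)
aws ec2 attach-internet-gateway --internet-gateway-id "${IGW_ID}" --vpc-id "${VPC_ID}"
SUBNET_ID=$(aws ec2 create-subnet --vpc-id "${VPC_ID}" --cidr-block 192.168.0.0/24 \
    --query 'Subnet.SubnetId' --output text)
RT_ID=$(aws ec2 create-route-table --vpc-id "${VPC_ID}" \
    --query 'RouteTable.RouteTableId' --output text)
# Route all outbound traffic through the internet gateway.
aws ec2 create-route --route-table-id "${RT_ID}" \
    --destination-cidr-block 0.0.0.0/0 --gateway-id "${IGW_ID}"
aws ec2 associate-route-table --route-table-id "${RT_ID}" --subnet-id "${SUBNET_ID}"
```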

Create a Security Group `redshift-sg`.
Make the following adjustments in the security group to allow access to the
Redshift cluster from the general-purpose GitHub CI runners:

- add an Inbound rule accepting `All traffic` from Source `0.0.0.0/0`
- add an Outbound rule for `All traffic` to destination `0.0.0.0/0`
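
A sketch of the equivalent CLI calls, reusing the `VPC_ID` from the VPC sketch above; note that outbound all-traffic is already the default on a newly created security group:

```
# Hypothetical CLI equivalent; the wide-open ingress mirrors the console rule above.
SG_ID=$(aws ec2 create-security-group --group-name redshift-sg \
    --description "Redshift CI access" --vpc-id "${VPC_ID}" \
    --query 'GroupId' --output text)
aws ec2 authorize-security-group-ingress --group-id "${SG_ID}" \
    --protocol all --cidr 0.0.0.0/0
```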

### Amazon Redshift setup

Create a subnet group `cluster-subnet-group-trino-ci` associated with
the VPC `redshift-vpc` and the VPC subnet `redshift-public-subnet`.
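
A sketch of the same step with the CLI, reusing the `SUBNET_ID` from the VPC sketch:

```
aws redshift create-cluster-subnet-group \
    --cluster-subnet-group-name cluster-subnet-group-trino-ci \
    --description "Subnet group for the Trino CI Redshift clusters" \
    --subnet-ids "${SUBNET_ID}"
```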

### AWS IAM setup

Create the AWS IAM role `redshift-ci` and attach the
`AmazonRedshiftAllCommandsFullAccess` policy to it.
This role is passed to the ephemeral Redshift cluster so that it can
execute `COPY` from the AWS S3 bucket.
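
A sketch of creating the role with the CLI; the trust policy allows the Redshift service to assume it:

```
# Hypothetical CLI equivalent of creating the role in the console.
aws iam create-role --role-name redshift-ci \
    --assume-role-policy-document '{
        "Version": "2012-10-17",
        "Statement": [{
            "Effect": "Allow",
            "Principal": {"Service": "redshift.amazonaws.com"},
            "Action": "sts:AssumeRole"
        }]
    }'
aws iam attach-role-policy --role-name redshift-ci \
    --policy-arn arn:aws:iam::aws:policy/AmazonRedshiftAllCommandsFullAccess
```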

Ensure that the AWS IAM user used by the CI process has the permissions
required to manage ephemeral Amazon Redshift clusters:

```
{
    "Version": "2012-10-17",
    "Statement": [
        {
            "Sid": "PassRoleToRedshiftCluster",
            "Effect": "Allow",
            "Action": "iam:PassRole",
            "Resource": "arn:aws:iam::894365193301:role/redshift-ci"
        },
        {
            "Sid": "RedshiftClusterManagement",
            "Effect": "Allow",
            "Action": [
                "redshift:DeleteTags",
                "redshift:DeleteCluster",
                "redshift:CreateTags",
                "redshift:CreateCluster",
                "redshift:DescribeClusters",
                "redshift:DescribeLoggingStatus"
            ],
            "Resource": "arn:aws:redshift:us-east-2:894365193301:cluster:trino-redshift-ci-cluster-*"
        },
        {
            "Sid": "DescribeRedshiftVpcComponents",
            "Effect": "Allow",
            "Action": [
                "ec2:DescribeInternetGateways",
                "ec2:DescribeAddresses",
                "ec2:DescribeAvailabilityZones",
                "ec2:DescribeVpcs",
                "ec2:DescribeAccountAttributes",
                "ec2:DescribeSubnets",
                "ec2:DescribeSecurityGroups"
            ],
            "Resource": "*"
        }
    ]
}
```
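
The policy can be attached to the CI user as an inline policy. A sketch, assuming the JSON above is saved as `redshift-ci-policy.json` and the CI user is named `trino-ci` (a placeholder):

```
aws iam put-user-policy --user-name trino-ci \
    --policy-name redshift-ci-cluster-management \
    --policy-document file://redshift-ci-policy.json
```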

### AWS S3 setup

The `trino-redshift` tests rely on a Redshift cluster whose TPCH tables
are filled with data.
Create an AWS S3 bucket and populate it with the Parquet content of the
`tpch` tables, exported locally through the `trino-hive` connector
via commands like:

```
CREATE TABLE hive.tiny.table_name WITH (format = 'parquet') AS TABLE tpch.tiny.table_name
```
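
For the Parquet files to land under the expected prefix, the `tiny` schema can first be pointed at the bucket. A sketch using the Trino CLI, assuming the `hive` catalog is configured with credentials for the target bucket:

```
# Hypothetical: create the hive schema with an explicit S3 location before the CTAS.
trino --execute "CREATE SCHEMA IF NOT EXISTS hive.tiny WITH (location = 's3://<your_bucket>/tpch/tiny')"
```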

The content of the S3 bucket should look like this:

```
s3://<your_bucket>/tpch/tiny/<table_name>/*.parquet
```

where `table_name` is one of:

- `customer`
- `lineitem`
- `nation`
- `orders`
- `part`
- `partsupp`
- `region`
- `supplier`
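
Exporting all eight tables can be scripted against the Trino CLI. A sketch, assuming a Trino server with the `hive` and `tpch` catalogs configured:

```
# Hypothetical export loop; one CTAS per TPCH tiny table.
for table in customer lineitem nation orders part partsupp region supplier; do
    trino --execute "CREATE TABLE hive.tiny.${table} WITH (format = 'parquet') AS TABLE tpch.tiny.${table}"
done
```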

16 changes: 8 additions & 8 deletions plugin/trino-redshift/pom.xml
@@ -174,22 +174,22 @@

<profiles>
<profile>
<id>default</id>
<id>cloud-tests</id>
<activation>
<activeByDefault>true</activeByDefault>
<activeByDefault>false</activeByDefault>
</activation>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<configuration>
<excludes>
<exclude>**/TestRedshiftAutomaticJoinPushdown.java</exclude>
<exclude>**/TestRedshiftConnectorTest.java</exclude>
<exclude>**/TestRedshiftTableStatisticsReader.java</exclude>
<exclude>**/TestRedshiftTypeMapping.java</exclude>
</excludes>
<includes>
<!-- Run only the smoke tests of the connector in the CI environment, because the -->
<!-- unpredictable locations of the GitHub runners can lead to increased client -->
<!-- latency on the JDBC operations performed on the ephemeral AWS Redshift cluster. -->
<include>**/TestRedshiftConnectorSmokeTest.java</include>
</includes>
</configuration>
</plugin>
</plugins>
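
With the `default` profile replaced by the opt-in `cloud-tests` profile, the Redshift tests no longer run in a plain build. A sketch of invoking the smoke test explicitly, with placeholder values for the cluster coordinates:

```
# Hypothetical local invocation; mirrors the CI step above.
./mvnw test -pl plugin/trino-redshift -P cloud-tests \
    -Dtest.redshift.jdbc.user="admin" \
    -Dtest.redshift.jdbc.password="<password>" \
    -Dtest.redshift.jdbc.endpoint="<endpoint>:5439/"
```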
4 changes: 2 additions & 2 deletions plugin/trino-redshift/src/test/java/io/trino/plugin/redshift/RedshiftQueryRunner.java
@@ -105,7 +105,7 @@ public static DistributedQueryRunner createRedshiftQueryRunner(
runner.installPlugin(new RedshiftPlugin());
runner.createCatalog(TEST_CATALOG, CONNECTOR_NAME, properties);

executeInRedshift("CREATE SCHEMA IF NOT EXISTS " + TEST_SCHEMA);
executeInRedshiftWithRetry("CREATE SCHEMA IF NOT EXISTS " + TEST_SCHEMA);
createUserIfNotExists(NON_GRANTED_USER, JDBC_PASSWORD);
createUserIfNotExists(GRANTED_USER, JDBC_PASSWORD);

@@ -197,7 +197,7 @@ private static synchronized void provisionTables(Session session, QueryRunner qu

private static void copyFromS3(QueryRunner queryRunner, Session session, String name)
{
String s3Path = format("%s/%s/%s.parquet", S3_TPCH_TABLES_ROOT, TPCH_CATALOG, name);
String s3Path = format("%s/%s/%s/%s/", S3_TPCH_TABLES_ROOT, TPCH_CATALOG, TINY_SCHEMA_NAME, name);
log.info("Creating table %s in Redshift copying from %s", name, s3Path);

// Create table in ephemeral Redshift cluster with no data
49 changes: 49 additions & 0 deletions plugin/trino-redshift/src/test/java/io/trino/plugin/redshift/TestRedshiftConnectorSmokeTest.java
@@ -0,0 +1,49 @@
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.trino.plugin.redshift;

import com.google.common.collect.ImmutableMap;
import io.trino.plugin.jdbc.BaseJdbcConnectorSmokeTest;
import io.trino.testing.QueryRunner;
import io.trino.testing.TestingConnectorBehavior;

import static io.trino.plugin.redshift.RedshiftQueryRunner.createRedshiftQueryRunner;
import static io.trino.testing.TestingConnectorBehavior.SUPPORTS_RENAME_TABLE_ACROSS_SCHEMAS;

public class TestRedshiftConnectorSmokeTest
extends BaseJdbcConnectorSmokeTest
{
@Override
@SuppressWarnings("DuplicateBranchesInSwitch") // options here are grouped per-feature
protected boolean hasBehavior(TestingConnectorBehavior connectorBehavior)
{
switch (connectorBehavior) {
case SUPPORTS_RENAME_TABLE_ACROSS_SCHEMAS:
return false;

default:
return super.hasBehavior(connectorBehavior);
}
}

@Override
protected QueryRunner createQueryRunner()
throws Exception
{
return createRedshiftQueryRunner(
ImmutableMap.of(),
ImmutableMap.of(),
REQUIRED_TPCH_TABLES);
}
}