Create RDS database snapshot before executing alembic migrations (#1267)
### Feature or Bugfix
- Feature

### Detail
Alembic migrations can get complex, and in some cases we use alembic not only for schema migrations but also for data migrations. When moving columns with data from one table to another, we might accidentally make a mistake in a migration script (a hypothetical example of such a script is sketched below). We strive to test all migration scripts and avoid bugs in such sensitive operations, but to protect users from the catastrophic case of a bug, a service issue, or any other exceptional situation, this PR introduces the creation of manual database snapshots before alembic migration scripts run.
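
To make the risk concrete, here is a minimal sketch of the kind of data migration this protects against. The tables, columns, and revision ids are hypothetical; the point is that a typo in the `UPDATE` or a premature `drop_column` in such a script would lose data with no way back unless a snapshot exists:

```python
"""Hypothetical alembic revision that moves a column from one table to another."""

import sqlalchemy as sa
from alembic import op

# Illustrative revision identifiers, not real revisions from this repository
revision = 'a1b2c3d4e5f6'
down_revision = '0f1e2d3c4b5a'


def upgrade():
    # 1. Create the destination column
    op.add_column('table_b', sa.Column('description', sa.String(), nullable=True))
    # 2. Copy the data across tables (PostgreSQL UPDATE ... FROM syntax)
    op.execute(
        'UPDATE table_b SET description = table_a.description '
        'FROM table_a WHERE table_b.a_uri = table_a.uri'
    )
    # 3. Drop the source column; irreversible once the transaction commits
    op.drop_column('table_a', 'description')
```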

This PR modifies the db_migration handler that is triggered with every backendStack update. It checks whether there are new migration scripts, i.e. whether the current head revision in the database differs from the new head revision in the code. If so, it creates a cluster snapshot.

Remarks:
- Snapshots cannot be created while the cluster is not `available`, so the PR introduces a check that waits for this condition. If the Lambda timeout is reached while waiting for the cluster, the CICD pipeline will fail and will need to be retried.
- Alembic migration scripts can still run while a snapshot is being created.
- Snapshots are incremental: the first one will take a long time, but subsequent snapshots will be faster. A hypothetical recovery sketch follows this list.
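
For completeness, recovering from one of these snapshots would look roughly like the sketch below. This is not part of the PR and all identifiers are illustrative; note that `restore_db_cluster_from_snapshot` restores into a brand-new cluster, which has no compute until at least one DB instance is added:

```python
import boto3

rds = boto3.client('rds', region_name='eu-west-1')  # illustrative region

# Restore the snapshot into a new cluster (the original cluster is untouched)
rds.restore_db_cluster_from_snapshot(
    DBClusterIdentifier='dataall-restored-cluster',  # hypothetical name
    SnapshotIdentifier='dataall-migration-abc123-2024-05-16-10-00-00',  # hypothetical id
    Engine='aurora-postgresql',
)

# An Aurora cluster needs at least one instance before it can serve traffic
rds.create_db_instance(
    DBInstanceIdentifier='dataall-restored-instance-1',  # hypothetical name
    DBClusterIdentifier='dataall-restored-cluster',
    DBInstanceClass='db.r5.large',
    Engine='aurora-postgresql',
)
```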

### Relates
- #1258 - a good example of complex data migration operations.

### Security
Please answer the questions below briefly where applicable, or write `N/A`. Based on [OWASP 10](https://owasp.org/Top10/en/).

- Does this PR introduce or modify any input fields or queries - this includes fetching data from storage outside the application (e.g. a database, an S3 bucket)?
  - Is the input sanitized?
  - What precautions are you taking before deserializing the data you consume?
  - Is injection prevented by parametrizing queries?
  - Have you ensured no `eval` or similar functions are used?
- Does this PR introduce any functionality or component that requires authorization?
  - How have you ensured it respects the existing AuthN/AuthZ mechanisms?
  - Are you logging failed auth attempts?
- Are you using or adding any cryptographic features?
  - Do you use standard, proven implementations?
  - Are the used keys controlled by the customer? Where are they stored?
- Are you introducing any new policies/roles/users?
  - Have you used the least-privilege principle? How?


By submitting this pull request, I confirm that my contribution is made
under the terms of the Apache 2.0 license.
dlpzx authored May 16, 2024
1 parent 91a1344 commit ee53816
Showing 6 changed files with 104 additions and 6 deletions.
Empty file.
File renamed without changes.
67 changes: 67 additions & 0 deletions backend/deployment_triggers/dbsnapshots_handler.py
@@ -0,0 +1,67 @@
"""
The handler of this module will be called once upon every deployment
"""

import datetime
import logging
import os
import time

import boto3
from alembic.config import Config
from alembic.migration import MigrationContext
from alembic.script import ScriptDirectory
from dataall.base.db.connection import ENVNAME, get_engine

logger = logging.getLogger()
logger.setLevel(os.environ.get('LOG_LEVEL', 'INFO'))


def handler(event, context) -> None:
    """
    This function will be called once upon every deployment.
    It checks if there are any alembic migration scripts to execute.
    If there are, it creates a snapshot of the database before the
    db_migrations trigger function executes the migration scripts.
    """
    alembic_cfg = Config('alembic.ini')
    alembic_cfg.set_main_option('script_location', './migrations')

    # Get head version from the migration scripts shipped with the code
    script = ScriptDirectory.from_config(alembic_cfg)
    head_rev = script.get_current_head()

    # Get current version from the database
    engine = get_engine(ENVNAME)
    with engine.engine.connect() as connection:
        migration_context = MigrationContext.configure(connection)
        current_rev = migration_context.get_current_revision()

    if head_rev != current_rev:
        snapshot_id = f'{os.environ.get("resource_prefix", "dataall")}-migration-{head_rev}-{datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")}'
        cluster_id = engine.dbconfig.host.split('.')[0]
        logger.info(
            f'Creating RDS snapshot for cluster {cluster_id}, head revision {head_rev} is ahead of {current_rev}...'
        )
        try:
            rds_client = boto3.client('rds', region_name=os.getenv('AWS_REGION'))
            # Edge case in which the cluster is performing backup and/or maintenance operations.
            # If it times out, the CICD pipeline fails and needs to be retried.
            while (
                cluster_status := rds_client.describe_db_clusters(DBClusterIdentifier=cluster_id)['DBClusters'][0][
                    'Status'
                ]
            ) != 'available':
                logger.info(f'Waiting until the cluster is available, {cluster_status=}')
                time.sleep(30)

            rds_client.create_db_cluster_snapshot(
                DBClusterSnapshotIdentifier=snapshot_id,
                DBClusterIdentifier=cluster_id,
                Tags=[
                    {'Key': 'Application', 'Value': 'dataall'},
                ],
            )
        except Exception as e:
            logger.exception(f'Failed to create RDS snapshot: {e}')
            raise Exception(f'Failed to create RDS snapshot: {e}')
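
For reference, the companion dbmigrations_handler that runs after this snapshot step is not shown in this diff (the file was renamed without changes). Assuming it mirrors the alembic configuration above, its core is essentially a sketch like this:

```python
from alembic import command
from alembic.config import Config


def handler(event, context) -> None:
    # Sketch of the migration step that runs once the snapshot exists:
    # apply every pending revision up to the head shipped with the code.
    alembic_cfg = Config('alembic.ini')
    alembic_cfg.set_main_option('script_location', './migrations')
    command.upgrade(alembic_cfg, 'head')
```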
File renamed without changes.
36 changes: 33 additions & 3 deletions deploy/stacks/backend_stack.py
@@ -310,25 +310,55 @@ def __init__(
            **kwargs,
        )

+       db_snapshots = TriggerFunctionStack(
+           self,
+           'DbSnapshots',
+           handler='deployment_triggers.dbsnapshots_handler.handler',
+           envname=envname,
+           resource_prefix=resource_prefix,
+           vpc=vpc,
+           vpce_connection=vpce_connection,
+           image_tag=image_tag,
+           ecr_repository=repo,
+           execute_after=[aurora_stack.cluster],
+           connectables=[aurora_stack.cluster],
+           additional_policy_statements=[
+               iam.PolicyStatement(
+                   effect=iam.Effect.ALLOW,
+                   actions=['rds:AddTagsToResource', 'rds:CreateDBClusterSnapshot'],
+                   resources=[
+                       f'arn:aws:rds:*:{self.account}:cluster-snapshot:{resource_prefix}*',
+                       f'arn:aws:rds:*:{self.account}:cluster:{resource_prefix}*',
+                   ],
+               ),
+               iam.PolicyStatement(
+                   effect=iam.Effect.ALLOW,
+                   actions=['rds:DescribeDBClusters'],
+                   resources=['*'],
+               ),
+           ],
+           **kwargs,
+       )

        db_migrations = TriggerFunctionStack(
            self,
            'DbMigrations',
-           handler='dbmigrations_handler.handler',
+           handler='deployment_triggers.dbmigrations_handler.handler',
            envname=envname,
            resource_prefix=resource_prefix,
            vpc=vpc,
            vpce_connection=vpce_connection,
            image_tag=image_tag,
            ecr_repository=repo,
-           execute_after=[aurora_stack.cluster],
+           execute_after=[db_snapshots.trigger_function],
            connectables=[aurora_stack.cluster],
            **kwargs,
        )

        TriggerFunctionStack(
            self,
            'SavePerms',
-           handler='saveperms_handler.handler',
+           handler='deployment_triggers.saveperms_handler.handler',
            envname=envname,
            resource_prefix=resource_prefix,
            vpc=vpc,
7 changes: 4 additions & 3 deletions deploy/stacks/trigger_function_stack.py
@@ -27,6 +27,7 @@ def __init__(
        vpce_connection: ec2.IConnectable = None,
        connectables: List[ec2.IConnectable] = [],
        execute_after: List[Construct] = [],
+       additional_policy_statements: List[iam.PolicyStatement] = [],
        **kwargs,
    ):
        super().__init__(scope, id, **kwargs)
@@ -35,16 +36,16 @@ def __init__(
        image_tag = self.node.try_get_context('image_tag')
        image_tag = f'lambdas-{image_tag}'

-       env = {'envname': envname, 'LOG_LEVEL': 'INFO'}
+       env = {'envname': envname, 'resource_prefix': resource_prefix, 'LOG_LEVEL': 'INFO'}

        function_sgs = self.create_lambda_sgs(envname, handler, resource_prefix, vpc)

+       statements = self.get_policy_statements(resource_prefix) + (additional_policy_statements or [])
        self.trigger_function = TriggerFunction(
            self,
            f'TriggerFunction-{handler}',
            function_name=f'{resource_prefix}-{envname}-{handler.replace(".", "_")}',
            description=f'dataall {handler} trigger function',
-           initial_policy=self.get_policy_statements(resource_prefix),
+           initial_policy=statements,
            code=_lambda.Code.from_ecr_image(repository=ecr_repository, tag=image_tag, cmd=[handler]),
            vpc=vpc,
            security_groups=[function_sgs],
