Create RDS database snapshot before executing alembic migrations (#1267)
### Feature or Bugfix
- Feature

### Detail
Alembic migrations can get complex, and in some cases we use alembic not only for schema migrations but also for data migrations. When moving columns with data from one table to another, we might accidentally make a mistake in a migration script (a hypothetical example of such a script is sketched below). We strive to test all migration scripts and avoid bugs in such sensitive operations, but to protect users from the catastrophic case of a bug, a service issue, or any other exceptional situation, this PR introduces the creation of manual database snapshots before alembic migration scripts run.
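
To make the risk concrete, here is a minimal sketch of the kind of data migration this protects against. The tables, columns, and revision ids are hypothetical; the point is that a typo in the `UPDATE` or a premature `drop_column` in such a script would lose data with no way back unless a snapshot exists:

```python
"""Hypothetical alembic revision that moves a column from one table to another."""

import sqlalchemy as sa
from alembic import op

# Illustrative revision identifiers, not real revisions from this repository
revision = 'a1b2c3d4e5f6'
down_revision = '0f1e2d3c4b5a'


def upgrade():
    # 1. Create the destination column
    op.add_column('table_b', sa.Column('description', sa.String(), nullable=True))
    # 2. Copy the data across tables (PostgreSQL UPDATE ... FROM syntax)
    op.execute(
        'UPDATE table_b SET description = table_a.description '
        'FROM table_a WHERE table_b.a_uri = table_a.uri'
    )
    # 3. Drop the source column; irreversible once the transaction commits
    op.drop_column('table_a', 'description')
```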

This PR modifies the db_migration handler that is triggered with every backendStack update. It checks whether there are new migration scripts, i.e. whether the current head revision in the database differs from the new head revision in the code. If so, it creates a cluster snapshot.

Remarks:
- Snapshots cannot be created while the cluster is not `available`, so the PR introduces a check that waits for this condition. If the Lambda timeout is reached while waiting for the cluster, the CICD pipeline will fail and will need to be retried.
- Alembic migration scripts can still run while a snapshot is being created.
- Snapshots are incremental: the first one will take a long time, but subsequent snapshots will be faster. A hypothetical recovery sketch follows this list.
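
For completeness, recovering from one of these snapshots would look roughly like the sketch below. This is not part of the PR and all identifiers are illustrative; note that `restore_db_cluster_from_snapshot` restores into a brand-new cluster, which has no compute until at least one DB instance is added:

```python
import boto3

rds = boto3.client('rds', region_name='eu-west-1')  # illustrative region

# Restore the snapshot into a new cluster (the original cluster is untouched)
rds.restore_db_cluster_from_snapshot(
    DBClusterIdentifier='dataall-restored-cluster',  # hypothetical name
    SnapshotIdentifier='dataall-migration-abc123-2024-05-16-10-00-00',  # hypothetical id
    Engine='aurora-postgresql',
)

# An Aurora cluster needs at least one instance before it can serve traffic
rds.create_db_instance(
    DBInstanceIdentifier='dataall-restored-instance-1',  # hypothetical name
    DBClusterIdentifier='dataall-restored-cluster',
    DBInstanceClass='db.r5.large',
    Engine='aurora-postgresql',
)
```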

### Relates
- #1258 - a good example of complex data migration operations.

### Security
Please answer the questions below briefly where applicable, or write `N/A`. Based on [OWASP 10](https://owasp.org/Top10/en/).

- Does this PR introduce or modify any input fields or queries - this includes fetching data from storage outside the application (e.g. a database, an S3 bucket)?
  - Is the input sanitized?
  - What precautions are you taking before deserializing the data you consume?
  - Is injection prevented by parametrizing queries?
  - Have you ensured no `eval` or similar functions are used?
- Does this PR introduce any functionality or component that requires authorization?
  - How have you ensured it respects the existing AuthN/AuthZ mechanisms?
  - Are you logging failed auth attempts?
- Are you using or adding any cryptographic features?
  - Do you use standard, proven implementations?
  - Are the used keys controlled by the customer? Where are they stored?
- Are you introducing any new policies/roles/users?
  - Have you used the least-privilege principle? How?


By submitting this pull request, I confirm that my contribution is made
under the terms of the Apache 2.0 license.
dlpzx authored May 16, 2024
1 parent 91a1344 commit ee53816
Showing 6 changed files with 104 additions and 6 deletions.
Empty file.
File renamed without changes.
67 changes: 67 additions & 0 deletions backend/deployment_triggers/dbsnapshots_handler.py
@@ -0,0 +1,67 @@
"""
The handler of this module will be called once upon every deployment
"""

import datetime
import logging
import os
import time

import boto3
from alembic.config import Config
from alembic.migration import MigrationContext
from alembic.script import ScriptDirectory
from dataall.base.db.connection import ENVNAME, get_engine

logger = logging.getLogger()
logger.setLevel(os.environ.get('LOG_LEVEL', 'INFO'))


def handler(event, context) -> None:
    """
    This function will be called once upon every deployment.
    It checks if there are any alembic migration scripts to execute.
    If there are, it creates a snapshot of the database before the
    db_migrations trigger function executes the migration scripts.
    """
    alembic_cfg = Config('alembic.ini')
    alembic_cfg.set_main_option('script_location', './migrations')

    # Get head version from the migration scripts shipped with the code
    script = ScriptDirectory.from_config(alembic_cfg)
    head_rev = script.get_current_head()

    # Get current version from the database
    engine = get_engine(ENVNAME)
    with engine.engine.connect() as connection:
        migration_context = MigrationContext.configure(connection)
        current_rev = migration_context.get_current_revision()

    if head_rev != current_rev:
        snapshot_id = f'{os.environ.get("resource_prefix", "dataall")}-migration-{head_rev}-{datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")}'
        cluster_id = engine.dbconfig.host.split('.')[0]
        logger.info(
            f'Creating RDS snapshot for cluster {cluster_id}, head revision {head_rev} is ahead of {current_rev}...'
        )
        try:
            rds_client = boto3.client('rds', region_name=os.getenv('AWS_REGION'))
            # Edge case in which the cluster is performing backup and/or maintenance operations.
            # If it times out, the CICD pipeline fails and needs to be retried.
            while (
                cluster_status := rds_client.describe_db_clusters(DBClusterIdentifier=cluster_id)['DBClusters'][0][
                    'Status'
                ]
            ) != 'available':
                logger.info(f'Waiting until the cluster is available, {cluster_status=}')
                time.sleep(30)

            rds_client.create_db_cluster_snapshot(
                DBClusterSnapshotIdentifier=snapshot_id,
                DBClusterIdentifier=cluster_id,
                Tags=[
                    {'Key': 'Application', 'Value': 'dataall'},
                ],
            )
        except Exception as e:
            logger.exception(f'Failed to create RDS snapshot: {e}')
            raise Exception(f'Failed to create RDS snapshot: {e}')
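
For reference, the companion dbmigrations_handler that runs after this snapshot step is not shown in this diff (the file was renamed without changes). Assuming it mirrors the alembic configuration above, its core is essentially a sketch like this:

```python
from alembic import command
from alembic.config import Config


def handler(event, context) -> None:
    # Sketch of the migration step that runs once the snapshot exists:
    # apply every pending revision up to the head shipped with the code.
    alembic_cfg = Config('alembic.ini')
    alembic_cfg.set_main_option('script_location', './migrations')
    command.upgrade(alembic_cfg, 'head')
```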
File renamed without changes.
36 changes: 33 additions & 3 deletions deploy/stacks/backend_stack.py
@@ -310,25 +310,55 @@ def __init__(
            **kwargs,
        )

+       db_snapshots = TriggerFunctionStack(
+           self,
+           'DbSnapshots',
+           handler='deployment_triggers.dbsnapshots_handler.handler',
+           envname=envname,
+           resource_prefix=resource_prefix,
+           vpc=vpc,
+           vpce_connection=vpce_connection,
+           image_tag=image_tag,
+           ecr_repository=repo,
+           execute_after=[aurora_stack.cluster],
+           connectables=[aurora_stack.cluster],
+           additional_policy_statements=[
+               iam.PolicyStatement(
+                   effect=iam.Effect.ALLOW,
+                   actions=['rds:AddTagsToResource', 'rds:CreateDBClusterSnapshot'],
+                   resources=[
+                       f'arn:aws:rds:*:{self.account}:cluster-snapshot:{resource_prefix}*',
+                       f'arn:aws:rds:*:{self.account}:cluster:{resource_prefix}*',
+                   ],
+               ),
+               iam.PolicyStatement(
+                   effect=iam.Effect.ALLOW,
+                   actions=['rds:DescribeDBClusters'],
+                   resources=['*'],
+               ),
+           ],
+           **kwargs,
+       )

        db_migrations = TriggerFunctionStack(
            self,
            'DbMigrations',
-           handler='dbmigrations_handler.handler',
+           handler='deployment_triggers.dbmigrations_handler.handler',
            envname=envname,
            resource_prefix=resource_prefix,
            vpc=vpc,
            vpce_connection=vpce_connection,
            image_tag=image_tag,
            ecr_repository=repo,
-           execute_after=[aurora_stack.cluster],
+           execute_after=[db_snapshots.trigger_function],
            connectables=[aurora_stack.cluster],
            **kwargs,
        )

        TriggerFunctionStack(
            self,
            'SavePerms',
-           handler='saveperms_handler.handler',
+           handler='deployment_triggers.saveperms_handler.handler',
            envname=envname,
            resource_prefix=resource_prefix,
            vpc=vpc,
7 changes: 4 additions & 3 deletions deploy/stacks/trigger_function_stack.py
@@ -27,6 +27,7 @@ def __init__(
        vpce_connection: ec2.IConnectable = None,
        connectables: List[ec2.IConnectable] = [],
        execute_after: List[Construct] = [],
+       additional_policy_statements: List[iam.PolicyStatement] = [],
        **kwargs,
    ):
        super().__init__(scope, id, **kwargs)
@@ -35,16 +36,16 @@ def __init__(
        image_tag = self.node.try_get_context('image_tag')
        image_tag = f'lambdas-{image_tag}'

-       env = {'envname': envname, 'LOG_LEVEL': 'INFO'}
+       env = {'envname': envname, 'resource_prefix': resource_prefix, 'LOG_LEVEL': 'INFO'}

        function_sgs = self.create_lambda_sgs(envname, handler, resource_prefix, vpc)

+       statements = self.get_policy_statements(resource_prefix) + (additional_policy_statements or [])
        self.trigger_function = TriggerFunction(
            self,
            f'TriggerFunction-{handler}',
            function_name=f'{resource_prefix}-{envname}-{handler.replace(".", "_")}',
            description=f'dataall {handler} trigger function',
-           initial_policy=self.get_policy_statements(resource_prefix),
+           initial_policy=statements,
            code=_lambda.Code.from_ecr_image(repository=ecr_repository, tag=image_tag, cmd=[handler]),
            vpc=vpc,
            security_groups=[function_sgs],
