From e9e78c5e1ffbe4bc2c6d1009f69cc66c579c27ff Mon Sep 17 00:00:00 2001 From: Alexandre Allard Date: Wed, 6 Oct 2021 09:25:11 +0200 Subject: [PATCH] salt: Schedule daily backup on bootstrap node --- CHANGELOG.md | 7 ++ buildchain/buildchain/salt_tree.py | 2 + .../bootstrap_backup_restore.rst | 2 +- salt/_modules/metalk8s.py | 24 +++++ salt/_utils/kubernetes_utils.py | 14 +++ salt/metalk8s/backup/configured.sls | 4 + .../orchestrate/backup/files/job.yaml.j2 | 59 +++++++++++++ .../orchestrate/backup/replication.sls | 16 ++++ salt/metalk8s/roles/bootstrap/init.sls | 1 + salt/tests/unit/formulas/config.yaml | 9 ++ .../unit/modules/files/test_metalk8s.yaml | 15 ++++ salt/tests/unit/modules/test_metalk8s.py | 23 +++++ scripts/backup.sh.in | 88 +++++-------------- scripts/downgrade.sh.in | 5 +- scripts/upgrade.sh.in | 5 +- 15 files changed, 204 insertions(+), 70 deletions(-) create mode 100644 salt/metalk8s/backup/configured.sls create mode 100644 salt/metalk8s/orchestrate/backup/files/job.yaml.j2 create mode 100644 salt/metalk8s/orchestrate/backup/replication.sls diff --git a/CHANGELOG.md b/CHANGELOG.md index 3d8d86557b..8f81e465ef 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,12 @@ # CHANGELOG ## Release 2.10.4 (in development) + +### Features Added + +- A daily backup of the bootstrap node is now automatically scheduled. + All the backups are also replicated onto all the master nodes. 
+ (PR [#3557](https://github.com/scality/metalk8s/pull/3557)) + ## Release 2.10.3 ### Enhancements diff --git a/buildchain/buildchain/salt_tree.py b/buildchain/buildchain/salt_tree.py index 356edc18eb..9bc1477e3d 100644 --- a/buildchain/buildchain/salt_tree.py +++ b/buildchain/buildchain/salt_tree.py @@ -522,6 +522,8 @@ def _get_parts(self) -> Iterator[str]: Path("salt/metalk8s/orchestrate/deploy_node.sls"), Path("salt/metalk8s/orchestrate/etcd.sls"), Path("salt/metalk8s/orchestrate/register_etcd.sls"), + Path("salt/metalk8s/orchestrate/backup/files/job.yaml.j2"), + Path("salt/metalk8s/orchestrate/backup/replication.sls"), Path("salt/metalk8s/orchestrate/bootstrap/init.sls"), Path("salt/metalk8s/orchestrate/bootstrap/accept-minion.sls"), Path("salt/metalk8s/orchestrate/bootstrap/pre-downgrade.sls"), diff --git a/docs/operation/disaster_recovery/bootstrap_backup_restore.rst b/docs/operation/disaster_recovery/bootstrap_backup_restore.rst index 76a42f55f6..88475e5e7e 100644 --- a/docs/operation/disaster_recovery/bootstrap_backup_restore.rst +++ b/docs/operation/disaster_recovery/bootstrap_backup_restore.rst @@ -25,7 +25,7 @@ To create a new backup file, run the following command: /srv/scality/metalk8s-|version|/backup.sh -Backup archives are stored in /var/lib/metalk8s/backups. +Backup archives are stored in /var/lib/metalk8s/backups on all master nodes. 
Restoring a Bootstrap Node ************************** diff --git a/salt/_modules/metalk8s.py b/salt/_modules/metalk8s.py index 1f8b456666..3af9803e87 100644 --- a/salt/_modules/metalk8s.py +++ b/salt/_modules/metalk8s.py @@ -691,3 +691,27 @@ def configure_archive(archive, remove=False): msg = "Archive '{0}' {1}".format(archive, msg) log.info(msg) return msg + + +def backup_node(): + metalk8s_version = __pillar__["metalk8s"]["cluster_version"] + archives = get_archives() + + try: + archive_path = archives[f"metalk8s-{metalk8s_version}"]["path"] + except KeyError as exc: + raise CommandExecutionError( + f"No MetalK8s archive found for version {metalk8s_version}" + ) from exc + + backup_script = f"{archive_path}/backup.sh" + result = __salt__["cmd.run_all"](cmd=backup_script) + log.debug("Result: %r", result) + + if result["retcode"] != 0: + output = result.get("stderr") or result["stdout"] + raise CommandExecutionError(f"Failed to run {backup_script}: {output}") + + msg = "Backup successfully generated" + log.info(msg) + return msg diff --git a/salt/_utils/kubernetes_utils.py b/salt/_utils/kubernetes_utils.py index bef203911a..ed30d20a50 100644 --- a/salt/_utils/kubernetes_utils.py +++ b/salt/_utils/kubernetes_utils.py @@ -296,6 +296,13 @@ def __init__(self, model, api_cls, name, method_names=None): name="api_service", ), # }}} + # /apis/batch/v1/ {{{ + ("batch/v1", "Job"): KindInfo( + model=k8s_client.V1Job, + api_cls=k8s_client.BatchV1Api, + name="namespaced_job", + ), + # }}} # /apis/batch/v1beta1/ {{{ ("batch/v1beta1", "CronJob"): KindInfo( model=k8s_client.V1beta1CronJob, @@ -303,6 +310,13 @@ def __init__(self, model, api_cls, name, method_names=None): name="namespaced_cron_job", ), # }}} + # /apis/networking.k8s.io/v1/ {{{ + ("networking.k8s.io/v1", "NetworkPolicy"): KindInfo( + model=k8s_client.V1NetworkPolicy, + api_cls=k8s_client.NetworkingV1Api, + name="namespaced_network_policy", + ), + # }}} # /apis/networking.k8s.io/v1beta1/ {{{ 
("networking.k8s.io/v1beta1", "Ingress"): KindInfo( model=k8s_client.NetworkingV1beta1Ingress, diff --git a/salt/metalk8s/backup/configured.sls b/salt/metalk8s/backup/configured.sls new file mode 100644 index 0000000000..1df97fe1e4 --- /dev/null +++ b/salt/metalk8s/backup/configured.sls @@ -0,0 +1,4 @@ +Schedule daily backup: + schedule.present: + - function: metalk8s.backup_node + - seconds: 86400 diff --git a/salt/metalk8s/orchestrate/backup/files/job.yaml.j2 b/salt/metalk8s/orchestrate/backup/files/job.yaml.j2 new file mode 100644 index 0000000000..5d875293c3 --- /dev/null +++ b/salt/metalk8s/orchestrate/backup/files/job.yaml.j2 @@ -0,0 +1,59 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: backup-replication-{{ node }} + namespace: kube-system + labels: + app.kubernetes.io/name: backup-replication + app.kubernetes.io/part-of: metalk8s + app.kubernetes.io/managed-by: salt +spec: + backoffLimit: 4 + parallelism: 1 + ttlSecondsAfterFinished: 120 + template: + metadata: + labels: + app.kubernetes.io/name: backup-replication + app.kubernetes.io/part-of: metalk8s + app.kubernetes.io/managed-by: salt + spec: + nodeName: {{ node }} + containers: + - name: backup-replication + image: {{ image }} + command: + - wget + - --accept="*.tar.gz" + - --no-host-directories + - --mirror + - --ca-certificate=/certificate/ca.crt + - --user=$(BACKUP_USERNAME) + - --password=$(BACKUP_PASSWORD) + - https://backup/ + env: + - name: BACKUP_USERNAME + valueFrom: + secretKeyRef: + name: backup-credentials + key: username + - name: BACKUP_PASSWORD + valueFrom: + secretKeyRef: + name: backup-credentials + key: password + volumeMounts: + - name: backups + mountPath: /backups + - name: ca-cert + mountPath: /certificate + workingDir: /backups + restartPolicy: Never + volumes: + - name: backups + hostPath: + path: /var/lib/metalk8s/backups + type: DirectoryOrCreate + - name: ca-cert + configMap: + name: backup-ca-cert diff --git a/salt/metalk8s/orchestrate/backup/replication.sls 
b/salt/metalk8s/orchestrate/backup/replication.sls new file mode 100644 index 0000000000..a6398711a3 --- /dev/null +++ b/salt/metalk8s/orchestrate/backup/replication.sls @@ -0,0 +1,16 @@ +{%- from "metalk8s/repo/macro.sls" import build_image_name with context %} + +{%- set master_nodes = salt.metalk8s.minions_by_role('master') %} +{%- set image = build_image_name("metalk8s-utils") %} + +{%- for node in master_nodes | sort %} + +Schedule backup replication Job on {{ node }}: + metalk8s_kubernetes.object_present: + - name: salt://{{ slspath }}/files/job.yaml.j2 + - template: jinja + - defaults: + node: {{ node }} + image: {{ image }} + +{%- endfor %} diff --git a/salt/metalk8s/roles/bootstrap/init.sls b/salt/metalk8s/roles/bootstrap/init.sls index e9fa8ede9f..f2c12ee3e8 100644 --- a/salt/metalk8s/roles/bootstrap/init.sls +++ b/salt/metalk8s/roles/bootstrap/init.sls @@ -3,3 +3,4 @@ include: - metalk8s.kubernetes.kubelet - metalk8s.salt.master - metalk8s.utils + - metalk8s.backup.configured diff --git a/salt/tests/unit/formulas/config.yaml b/salt/tests/unit/formulas/config.yaml index 35d30a8cf7..dc1119be7d 100644 --- a/salt/tests/unit/formulas/config.yaml +++ b/salt/tests/unit/formulas/config.yaml @@ -292,6 +292,15 @@ metalk8s: mode: master architecture: compact + backup: + files: + job.yaml.j2: + _cases: + "Create job manifest for a node": + extra_context: + node: master-1 + image: registry/some-image-name:tag + register_etcd.sls: _cases: "Target a new master node": &orch_target_master_node diff --git a/salt/tests/unit/modules/files/test_metalk8s.yaml b/salt/tests/unit/modules/files/test_metalk8s.yaml index 4bb1e8ac3a..cecde4d729 100644 --- a/salt/tests/unit/modules/files/test_metalk8s.yaml +++ b/salt/tests/unit/modules/files/test_metalk8s.yaml @@ -425,3 +425,18 @@ configure_archive: invalid_path: True raises: True result: Invalid archive path + +backup_node: + # 0. 
Ok - Backup succeed + - archives: &backup_node_archives + metalk8s-2.10.0: + path: /tmp + result: Backup successfully generated + # 1. Nok - Backup script failed + - archives: *backup_node_archives + raises: True + result: "Failed to run /tmp/backup.sh: Boom!" + # 2. Nok - No such archive for this MetalK8s version + - archives: {} + raises: True + result: No MetalK8s archive found for version 2.10.0 diff --git a/salt/tests/unit/modules/test_metalk8s.py b/salt/tests/unit/modules/test_metalk8s.py index d43b0e6705..cc7a3e0705 100644 --- a/salt/tests/unit/modules/test_metalk8s.py +++ b/salt/tests/unit/modules/test_metalk8s.py @@ -621,3 +621,26 @@ def test__read_bootstrap_config(self, raises=False, result=None): metalk8s._read_bootstrap_config(), "config", ) + + @utils.parameterized_from_cases(YAML_TESTS_CASES["backup_node"]) + def test_backup_node(self, result, version="2.10.0", archives=None, raises=False): + def _cmd_run_all(cmd): + ret = {"retcode": 0, "stdout": "OK", "stderr": "Boom!"} + if raises: + ret["retcode"] = 1 + return ret + + salt_dict = {"cmd.run_all": MagicMock(side_effect=_cmd_run_all)} + pillar_dict = {"metalk8s": {"cluster_version": version}} + + with patch.dict(metalk8s.__salt__, salt_dict), patch.dict( + metalk8s.__pillar__, pillar_dict + ), patch("metalk8s.get_archives", MagicMock(return_value=archives or {})): + if raises: + self.assertRaisesRegex( + CommandExecutionError, + result, + metalk8s.backup_node, + ) + else: + self.assertEqual(metalk8s.backup_node(), result) diff --git a/scripts/backup.sh.in b/scripts/backup.sh.in index b6fc3984b0..dad65b564d 100755 --- a/scripts/backup.sh.in +++ b/scripts/backup.sh.in @@ -12,7 +12,8 @@ TAR_OPTS=( "--atime-preserve" "--preserve-permissions" ) -BACKUP_ARCHIVE="/var/lib/metalk8s/backup_$(date -u +%Y%m%d_%H%M%S).tar.gz" +BACKUP_ARCHIVE="/var/lib/metalk8s/backups/$(date -u +%Y%m%d_%H%M%S).tar.gz" +REPLICATION=1 _usage() { echo "$(basename "$0") [options]" @@ -26,15 +27,18 @@ while (( "$#" )); do case "$1" 
in -v|--verbose) VERBOSE=1 - shift ;; -l|--log-file) LOGFILE="$2" - shift 2 + shift ;; -b|--backup-file) BACKUP_ARCHIVE="$2" - shift 2 + shift + ;; + # Disable the backup replication on other master nodes + -n|--no-replication) + REPLICATION=0 ;; *) # unsupported flags echo "Error: Unsupported flag $1" >&2 @@ -42,6 +46,7 @@ while (( "$#" )); do exit 1 ;; esac + shift done TMPFILES=$(mktemp -d) @@ -62,69 +67,9 @@ cleanup() { trap cleanup EXIT -run_quiet() { - local name=$1 - shift 1 - - echo -n "> ${name}..." - local start - start=$(date +%s) - set +e - "$@" 2>&1 | tee -ia "${LOGFILE}" > "${TMPFILES}/out" - local RC=$? - set -e - local end - end=$(date +%s) - - local duration=$(( end - start )) - - if [ $RC -eq 0 ]; then - echo " done [${duration}s]" - else - echo " fail [${duration}s]" - cat >/dev/stderr << EOM - -Failure while running step '${name}' - -Command: $@ - -Output: - -<< BEGIN >> -EOM - cat "${TMPFILES}/out" > /dev/stderr - - cat >/dev/stderr << EOM -<< END >> - -This script will now exit - -EOM - - exit 1 - fi -} - -run_verbose() { - local name=$1 - shift 1 - - echo "> ${name}..." - "$@" -} - -run() { - if [ "$VERBOSE" -eq 1 ]; then - run_verbose "${@}" - else - run_quiet "${@}" - fi -} - -die() { - echo 1>&2 "$@" - return 1 -} +BASE_DIR=$(dirname "$(readlink -f "${BASH_SOURCE[0]}")") +# shellcheck disable=SC1090 +. 
"$BASE_DIR"/common.sh _save_cp() { local -r src="$(readlink -f "$1")" @@ -225,7 +170,16 @@ EOF tar "${TAR_OPTS[@]}" -C "$BACKUP_DIR" -cz -f "$BACKUP_ARCHIVE" ./ } +replicate_archives() { + salt_master_exec=(crictl exec -i "$(get_salt_container)") + + "${salt_master_exec[@]}" salt-run --state-output=mixed state.orchestrate \ + metalk8s.orchestrate.backup.replication \ + saltenv=metalk8s-@@VERSION +} + run "Backing up MetalK8s configurations" backup_metalk8s_conf run "Backing up CAs certificates and keys" backup_cas run "Backing up etcd data" backup_etcd run "Creating backup archive '$BACKUP_ARCHIVE'" create_archive +if (( REPLICATION )); then run "Replicating backup archives on master nodes" replicate_archives; fi # not '&&': a skipped replication must still exit 0 diff --git a/scripts/downgrade.sh.in b/scripts/downgrade.sh.in index ee04f349cd..793b705ddd 100755 --- a/scripts/downgrade.sh.in +++ b/scripts/downgrade.sh.in @@ -190,4 +190,7 @@ run "Downgrading bootstrap" downgrade_bootstrap run "Launching the downgrade" launch_downgrade run "Launching the post-downgrade" launch_post_downgrade -"$BASE_DIR"/backup.sh +# NOTE: We use --no-replication flag since the backup-server is not available +# in the previous version of MetalK8s. +# This flag can safely be removed in MetalK8s 2.12. +"$BASE_DIR"/backup.sh --no-replication diff --git a/scripts/upgrade.sh.in b/scripts/upgrade.sh.in index cbe7199bca..4e5f850b4a 100755 --- a/scripts/upgrade.sh.in +++ b/scripts/upgrade.sh.in @@ -193,7 +193,10 @@ patch_kubesystem_namespace() { run "Performing Pre-Upgrade checks" precheck_upgrade [ $DRY_RUN -eq 1 ] && exit 0 -"$BASE_DIR"/backup.sh +# NOTE: We use --no-replication flag since the backup-server is not available +# in the previous version of MetalK8s. +# This flag can safely be removed in MetalK8s 2.12. +"$BASE_DIR"/backup.sh --no-replication run "Upgrading bootstrap" upgrade_bootstrap run "Setting cluster version to $DESTINATION_VERSION" patch_kubesystem_namespace