update to latest libs (#365)
* update to latest libs

* update libs

* fix HA tests

* fix mongos tests

* fix metrics test

* update channel for revision check

* wait for permissions to be resolved

* wait for model to settle before creating backup

* adding missing return

* chore: remove useless comma

* remove block until

* fix: Use context manager to stop looping forever

* revert earlier change

* Revert "remove block until"

This reverts commit 22446d1.

* fix: Delay setting the partition in case the leader is the unit to upgrade

---------

Co-authored-by: Mehdi-Bendriss <[email protected]>
Co-authored-by: Neha Oudin <[email protected]>
3 people authored Nov 14, 2024
1 parent ab4e9e7 commit 3fc7400
Showing 14 changed files with 101 additions and 69 deletions.
2 changes: 1 addition & 1 deletion lib/charms/mongodb/v0/config_server_interface.py
@@ -51,7 +51,7 @@

# Increment this PATCH version before using `charmcraft publish-lib` or reset
# to 0 if you are raising the major API version
LIBPATCH = 14
LIBPATCH = 15


class ClusterProvider(Object):
2 changes: 1 addition & 1 deletion lib/charms/mongodb/v1/mongodb_backups.py
@@ -41,7 +41,7 @@

# Increment this PATCH version before using `charmcraft publish-lib` or reset
# to 0 if you are raising the major API version
LIBPATCH = 6
LIBPATCH = 5

logger = logging.getLogger(__name__)

2 changes: 1 addition & 1 deletion lib/charms/mongodb/v1/mongodb_provider.py
@@ -37,7 +37,7 @@

# Increment this PATCH version before using `charmcraft publish-lib` or reset
# to 0 if you are raising the major API version
LIBPATCH = 15
LIBPATCH = 16

logger = logging.getLogger(__name__)
REL_NAME = "database"
24 changes: 13 additions & 11 deletions lib/charms/mongodb/v1/mongodb_tls.py
@@ -12,7 +12,7 @@
import logging
import re
import socket
from typing import Dict, List, Optional, Tuple
from typing import Optional, Tuple

from charms.tls_certificates_interface.v3.tls_certificates import (
CertificateAvailableEvent,
@@ -42,7 +42,7 @@

# Increment this PATCH version before using `charmcraft publish-lib` or reset
# to 0 if you are raising the major API version
LIBPATCH = 4
LIBPATCH = 5

WAIT_CERT_UPDATE = "wait-cert-updated"

@@ -105,9 +105,6 @@ def request_certificate(
internal: bool,
):
"""Request TLS certificate."""
if not self.charm.model.get_relation(Config.TLS.TLS_PEER_RELATION):
return

if param is None:
key = generate_private_key()
else:
@@ -234,7 +231,7 @@ def _on_certificate_available(self, event: CertificateAvailableEvent) -> None:
self.charm.cluster.update_ca_secret(new_ca=event.ca)
self.charm.config_server.update_ca_secret(new_ca=event.ca)

if self.waiting_for_both_certs():
if self.is_waiting_for_both_certs():
logger.debug(
"Defer till both internal and external TLS certificates available to avoid second restart."
)
@@ -256,7 +253,7 @@ def _on_certificate_available(self, event: CertificateAvailableEvent) -> None:
# clear waiting status if db service is ready
self.charm.status.set_and_share_status(ActiveStatus())

def waiting_for_both_certs(self):
def is_waiting_for_both_certs(self) -> bool:
"""Returns a boolean indicating whether additional certs are needed."""
if not self.get_tls_secret(internal=True, label_name=Config.TLS.SECRET_CERT_LABEL):
logger.debug("Waiting for internal certificate.")
@@ -295,6 +292,10 @@ def _on_certificate_expiring(self, event: CertificateExpiringEvent) -> None:
return

logger.debug("Generating a new Certificate Signing Request.")
self.request_new_certificates(internal)

def request_new_certificates(self, internal: bool) -> None:
"""Requests the renewel of a new certificate."""
key = self.get_tls_secret(internal, Config.TLS.SECRET_KEY_LABEL).encode("utf-8")
old_csr = self.get_tls_secret(internal, Config.TLS.SECRET_CSR_LABEL).encode("utf-8")
sans = self.get_new_sans()
@@ -313,8 +314,9 @@ def _on_certificate_expiring(self, event: CertificateExpiringEvent) -> None:
)

self.set_tls_secret(internal, Config.TLS.SECRET_CSR_LABEL, new_csr.decode("utf-8"))
self.set_waiting_for_cert_to_update(waiting=True, internal=internal)

def get_new_sans(self) -> Dict:
def get_new_sans(self) -> dict[str, list[str]]:
"""Create a list of DNS names for a MongoDB unit.
Returns:
@@ -341,7 +343,7 @@ def get_new_sans(self) -> Dict:

return sans

def get_current_sans(self, internal: bool) -> List[str] | None:
def get_current_sans(self, internal: bool) -> dict[str, list[str]] | None:
"""Gets the current SANs for the unit cert."""
# if unit has no certificates do not proceed.
if not self.is_tls_enabled(internal=internal):
@@ -411,9 +413,9 @@ def _get_subject_name(self) -> str:

def is_set_waiting_for_cert_to_update(
self,
internal=False,
internal: bool = False,
) -> bool:
"""Returns True we are waiting for a cert to update."""
"""Returns True if we are waiting for a cert to update."""
scope = "int" if internal else "ext"
label_name = f"{scope}-{WAIT_CERT_UPDATE}"

4 changes: 2 additions & 2 deletions lib/charms/mongodb/v1/shards_interface.py
@@ -58,7 +58,7 @@

# Increment this PATCH version before using `charmcraft publish-lib` or reset
# to 0 if you are raising the major API version
LIBPATCH = 11
LIBPATCH = 12

KEYFILE_KEY = "key-file"
HOSTS_KEY = "host"
@@ -711,7 +711,7 @@ def _on_relation_changed(self, event):

self.update_member_auth(event, (key_file_enabled, tls_enabled))

if tls_enabled and self.charm.tls.waiting_for_both_certs():
if tls_enabled and self.charm.tls.is_waiting_for_both_certs():
logger.info("Waiting for requested certs, before restarting and adding to cluster.")
event.defer()
return
30 changes: 26 additions & 4 deletions lib/charms/tls_certificates_interface/v3/tls_certificates.py
@@ -318,7 +318,7 @@ def _on_all_certificates_invalidated(self, event: AllCertificatesInvalidatedEven

# Increment this PATCH version before using `charmcraft publish-lib` or reset
# to 0 if you are raising the major API version
LIBPATCH = 20
LIBPATCH = 23

PYDEPS = ["cryptography", "jsonschema"]

@@ -1902,10 +1902,20 @@ def _on_relation_changed(self, event: RelationChangedEvent) -> None:
)
else:
try:
secret = self.model.get_secret(label=f"{LIBID}-{csr_in_sha256_hex}")
logger.debug(
"Setting secret with label %s", f"{LIBID}-{csr_in_sha256_hex}"
)
secret = self.model.get_secret(label=f"{LIBID}-{csr_in_sha256_hex}")
# Juju < 3.6 will create a new revision even if the content is the same
if (
secret.get_content(refresh=True).get("certificate", "")
== certificate.certificate
):
logger.debug(
"Secret %s with correct certificate already exists",
f"{LIBID}-{csr_in_sha256_hex}",
)
continue
secret.set_content(
{"certificate": certificate.certificate, "csr": certificate.csr}
)
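The guard added above avoids creating redundant secret revisions on older Juju controllers, since Juju < 3.6 writes a new revision even when `set_content()` receives identical content. A standalone sketch of the same pattern with the ops secret API (simplified; the library also manages expiry and other fields omitted here):

```python
import logging

import ops

logger = logging.getLogger(__name__)


def publish_certificate(model: ops.Model, label: str, certificate: str, csr: str) -> None:
    """Store a certificate in a Juju secret without creating redundant revisions (sketch)."""
    try:
        secret = model.get_secret(label=label)
    except ops.SecretNotFoundError:
        # No secret yet for this CSR: create it.
        model.app.add_secret({"certificate": certificate, "csr": csr}, label=label)
        return

    if secret.get_content(refresh=True).get("certificate", "") == certificate:
        # Writing the same content again would still create a revision on Juju < 3.6.
        logger.debug("Secret %s already holds the requested certificate", label)
        return

    secret.set_content({"certificate": certificate, "csr": csr})
```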
@@ -1986,11 +1996,19 @@ def _on_secret_expired(self, event: SecretExpiredEvent) -> None:
provider_certificate = self._find_certificate_in_relation_data(csr)
if not provider_certificate:
# A secret expired but we did not find matching certificate. Cleaning up
logger.warning(
"Failed to find matching certificate for csr, cleaning up secret %s",
event.secret.label,
)
event.secret.remove_all_revisions()
return

if not provider_certificate.expiry_time:
# A secret expired but matching certificate is invalid. Cleaning up
logger.warning(
"Certificate matching csr is invalid, cleaning up secret %s",
event.secret.label,
)
event.secret.remove_all_revisions()
return

@@ -2023,14 +2041,18 @@ def _find_certificate_in_relation_data(self, csr: str) -> Optional[ProviderCerti
return provider_certificate
return None

def _get_csr_from_secret(self, secret: Secret) -> str:
def _get_csr_from_secret(self, secret: Secret) -> Union[str, None]:
"""Extract the CSR from the secret label or content.
This function is a workaround to maintain backwards compatibility
and fix the issue reported in
https://github.com/canonical/tls-certificates-interface/issues/228
"""
if not (csr := secret.get_content().get("csr", "")):
try:
content = secret.get_content(refresh=True)
except SecretNotFoundError:
return None
if not (csr := content.get("csr", None)):
# In versions <14 of the Lib we were storing the CSR in the label of the secret
# The CSR is now stored in the content of the secret, which was a breaking change
# Here we get the CSR if the secret was created by an app using libpatch 14 or lower
10 changes: 7 additions & 3 deletions src/charm.py
@@ -636,15 +636,17 @@ def _configure_container(self, container: Container) -> None:
except FailedToUpdateFilesystem as err:
raise ContainerNotReadyError from err

self._configure_layers(container)

# when a network cuts and the pod restarts - reconnect to the exporter
try:
self._configure_layers(container)
# when a network cuts and the pod restarts - reconnect to the exporter and pbm
self._connect_mongodb_exporter()
self._connect_pbm_agent()
except MissingSecretError as e:
logger.error("Cannot connect mongodb exporter: %r", e)
raise ContainerNotReadyError
except ChangeError as e:
logger.error("Cannot configure container layers %r", e)
raise ContainerNotReadyError

# BEGIN: charm events
def _on_upgrade(self, event: UpgradeCharmEvent) -> None:
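Moving `_configure_layers` inside the `try` block means a failed Pebble replan now surfaces as `ChangeError` and is converted into `ContainerNotReadyError`, just like a missing secret. A hedged sketch of the layer/replan step that can raise it (the service definition is a placeholder, not the charm's real layer spec):

```python
import ops
from ops.pebble import ChangeError, Layer


class ContainerNotReadyError(Exception):
    """Local stand-in for the charm's exception of the same name."""


def configure_layers(container: ops.Container) -> None:
    """Add a Pebble layer and replan, mirroring the error handling added above (sketch only)."""
    layer = Layer(
        {
            "summary": "mongod layer (sketch)",
            "services": {
                "mongod": {
                    "override": "replace",
                    "command": "mongod --config /etc/mongod.conf",  # placeholder command
                    "startup": "enabled",
                }
            },
        }
    )
    try:
        container.add_layer("mongod", layer, combine=True)
        # replan() (re)starts changed services; Pebble surfaces failures as ChangeError.
        container.replan()
    except ChangeError as e:
        raise ContainerNotReadyError from e
```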
@@ -927,6 +929,8 @@ def mongodb_storage_detaching(self, event) -> None:
self.shard.wait_for_draining(mongos_hosts)
logger.info("Shard successfully drained storage.")

return

try:
# retries over a period of 10 minutes in an attempt to resolve race conditions it is
logger.debug("Removing %s from replica set", self.unit_host(self.unit))
45 changes: 24 additions & 21 deletions src/upgrades/kubernetes_upgrades.py
@@ -224,33 +224,36 @@ def reconcile_partition(
# This does not address the situation where another unit > 1 restarts and sets the
# partition during the `stop` event, but that is unlikely to occur in the small time window
# that causes the unit to hang.
if partition_ < self._partition:
self._partition = partition_
logger.debug(
f"Lowered partition to {partition_} {action_event=} {force=} {self.in_progress=}"
)
if action_event:
assert len(units) >= 2
if self._partition > unit_number(units[1]):
if partition_ > unit_number(units[1]):
message = "Highest number unit is unhealthy. Refresh will not resume."
logger.debug(f"Resume upgrade event failed: {message}")
action_event.fail(message)
return
if force:
# If a unit was unhealthy and the upgrade was forced, only the next unit will
# upgrade. As long as 1 or more units are unhealthy, the upgrade will need to be
# forced for each unit.

# Include "Attempting to" because (on Kubernetes) we only control the partition,
# not which units upgrade. Kubernetes may not upgrade a unit even if the partition
# allows it (e.g. if the charm container of a higher unit is not ready). This is
# also applicable `if not force`, but is unlikely to happen since all units are
# healthy `if not force`.
message = f"Attempting to refresh unit {self._partition}."
else:
message = f"Refresh resumed. Unit {self._partition} is refreshing next."
action_event.set_results({"result": message})
logger.debug(f"Resume refresh succeeded: {message}")
if force:
# If a unit was unhealthy and the upgrade was forced, only
# the next unit will upgrade. As long as 1 or more units
# are unhealthy, the upgrade will need to be forced for
# each unit.

# Include "Attempting to" because (on Kubernetes) we only
# control the partition, not which units upgrade.
# Kubernetes may not upgrade a unit even if the partition
# allows it (e.g. if the charm container of a higher unit
# is not ready). This is also applicable `if not force`,
# but is unlikely to happen since all units are healthy `if
# not force`.
message = f"Attempting to refresh unit {self._partition}."
else:
message = f"Refresh resumed. Unit {self._partition} is refreshing next."
action_event.set_results({"result": message})
logger.debug(f"Resume refresh succeeded: {message}")
if partition_ < self._partition:
self._partition = partition_
logger.debug(
f"Lowered partition to {partition_} {action_event=} {force=} {self.in_progress=}"
)


partition = _Partition()
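The reordering above delays lowering the partition until after the resume action has been answered. On Kubernetes the StatefulSet partition gates which pods may roll: pods with an ordinal >= the partition stay on the old revision, so lowering it can immediately restart the next unit, including the leader's own pod; that is presumably why the action result is now recorded first. A hedged sketch of what a partition setter typically does with lightkube (client and resource names are assumptions, not taken from this diff):

```python
from lightkube import Client
from lightkube.resources.apps_v1 import StatefulSet


def set_partition(app_name: str, namespace: str, value: int) -> None:
    """Patch the StatefulSet rolling-update partition (illustrative sketch).

    Pods with an ordinal >= value keep running the old revision; lowering the
    value allows the controller to roll the next unit.
    """
    Client().patch(
        StatefulSet,
        name=app_name,
        namespace=namespace,
        obj={"spec": {"updateStrategy": {"rollingUpdate": {"partition": value}}}},
    )
```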
31 changes: 13 additions & 18 deletions tests/integration/backup_tests/test_backups.py
@@ -12,13 +12,7 @@
import pytest_asyncio
import yaml
from pytest_operator.plugin import OpsTest
from tenacity import (
RetryError,
Retrying,
stop_after_attempt,
stop_after_delay,
wait_fixed,
)
from tenacity import RetryError, Retrying, stop_after_delay, wait_fixed

from ..ha_tests import helpers as ha_helpers
from ..helpers import (
@@ -244,7 +238,7 @@ async def test_multi_backup(ops_test: OpsTest, github_secrets, continuous_writes
db_unit = await helpers.get_leader_unit(ops_test)

# create first backup once ready
await ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=20),
await ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=20)

action = await db_unit.run_action(action_name="create-backup")
first_backup = await action.wait()
@@ -262,7 +256,7 @@ async def test_multi_backup(ops_test: OpsTest, github_secrets, continuous_writes
}
await ops_test.model.applications[S3_APP_NAME].set_config(configuration_parameters)

await ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=20),
await ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=20)

# create a backup as soon as possible. might not be immediately possible since only one backup
# can happen at a time.
@@ -279,7 +273,7 @@ async def test_multi_backup(ops_test: OpsTest, github_secrets, continuous_writes
# backup can take a lot of time so this function returns once the command was successfully
# sent to pbm. Therefore before checking, wait for Charmed MongoDB to finish creating the
# backup
await ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=20),
await ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=20)

# verify that backups was made in GCP bucket
try:
@@ -298,7 +292,7 @@ async def test_multi_backup(ops_test: OpsTest, github_secrets, continuous_writes
"endpoint": "https://s3.amazonaws.com",
}
await ops_test.model.applications[S3_APP_NAME].set_config(configuration_parameters)
await ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=20),
await ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=20)

# verify that backups was made on the AWS bucket
try:
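The trailing commas removed after `await ops_test.model.wait_for_idle(...)` in this test were harmless: the coroutine is still awaited, and the comma only turns the expression into a one-element tuple, so dropping it is purely cosmetic. A tiny self-contained illustration:

```python
import asyncio


async def settle() -> str:
    return "idle"


async def main() -> None:
    # With a trailing comma the coroutine is still awaited; the value just
    # becomes a one-element tuple.
    as_tuple = await settle(),
    assert as_tuple == ("idle",)

    plain = await settle()
    assert plain == "idle"


asyncio.run(main())
```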
@@ -448,13 +442,14 @@ async def test_restore_new_cluster(
), "Backups from old cluster are listed as failed"

# find most recent backup id and restore
for attempt in Retrying(stop=stop_after_attempt(120), wait=wait_fixed(1), reraise=True):
action = await leader_unit.run_action(action_name="list-backups")
list_result = await action.wait()
list_result = list_result.results["backups"]
most_recent_backup = list_result.split("\n")[-1]
backup_id = most_recent_backup.split()[0]
assert "-----" not in backup_id, "list of backups are empty."
action = await leader_unit.run_action(action_name="list-backups")
list_result = await action.wait()
list_result = list_result.results["backups"]
most_recent_backup = list_result.split("\n")[-1]
backup_id = most_recent_backup.split()[0]
action = await leader_unit.run_action(action_name="restore", **{"backup-id": backup_id})
restore = await action.wait()
assert restore.results["restore-status"] == "restore started", "restore not successful"

# verify all writes are present
try:
2 changes: 1 addition & 1 deletion tests/integration/backup_tests/test_sharding_backups.py
@@ -107,7 +107,7 @@ async def test_set_credentials_in_cluster(ops_test: OpsTest, github_secrets) ->

# apply new configuration options
await ops_test.model.applications[S3_APP_NAME].set_config(configuration_parameters)
await ops_test.model.wait_for_idle(apps=[S3_APP_NAME], status="active", timeout=TIMEOUT)
await ops_test.model.wait_for_idle(apps=CLUSTER_APPS, status="active", timeout=TIMEOUT)
await setup_cluster_and_s3(ops_test)


4 changes: 3 additions & 1 deletion tests/integration/ha_tests/test_ha.py
@@ -596,7 +596,9 @@ async def test_network_cut(ops_test: OpsTest, continuous_writes, chaos_mesh):

# we need to give juju some time to realize that the instance is back online
time.sleep(RESTART_DELAY)

await ops_test.model.wait_for_idle(
apps=[app], status="active", raise_on_blocked=False, timeout=1000
)
await wait_until_unit_in_status(ops_test, primary, active_unit, "SECONDARY")

# verify presence of primary, replica set member configuration, and number of primaries
3 changes: 3 additions & 0 deletions tests/integration/metrics_tests/test_metrics.py
@@ -124,6 +124,9 @@ async def test_endpoints_network_cut(ops_test: OpsTest, chaos_mesh):
time.sleep(60)

# Wait for the network to be restored
await ops_test.model.wait_for_idle(
apps=[app_name], status="active", raise_on_blocked=False, timeout=1000
)
await ha_helpers.wait_until_unit_in_status(ops_test, primary, active_unit, "SECONDARY")

for unit in ops_test.model.applications[app_name].units: