diff --git a/src/upgrade.py b/src/upgrade.py
index 795aacf060..ad025364ff 100644
--- a/src/upgrade.py
+++ b/src/upgrade.py
@@ -6,6 +6,7 @@
 import json
 import logging
 
+from charmlibs import snap
 from charms.data_platform_libs.v0.upgrade import (
     ClusterNotReadyError,
     DataUpgrade,
@@ -18,15 +19,17 @@
 from tenacity import RetryError, Retrying, stop_after_attempt, wait_fixed
 from typing_extensions import override
 
+from cluster import SwitchoverFailedError
 from constants import (
     APP_SCOPE,
     MONITORING_PASSWORD_KEY,
     MONITORING_USER,
     PATRONI_PASSWORD_KEY,
+    POSTGRESQL_SNAP_NAME,
     RAFT_PASSWORD_KEY,
     SNAP_PACKAGES,
 )
-from utils import new_password
+from utils import new_password, snap_refreshed
 
 logger = logging.getLogger(__name__)
 
@@ -141,8 +144,66 @@ def _on_upgrade_charm_check_legacy(self) -> None:
             self._prepare_upgrade_from_legacy()
             self.on.upgrade_charm.emit()
 
+    def _pre_upgrade_switchover(self, event: UpgradeGrantedEvent) -> bool:
+        """Switch the primary away from this unit before it is upgraded.
+
+        Moving the primary first minimizes client write downtime. Nothing is done when
+        the unit has no peers, the cluster is an async-replication standby, the snap is
+        already at the target revision, or this unit is not the current primary.
+
+        Args:
+            event: the upgrade-granted event, deferred after a successful switchover.
+
+        Returns:
+            True if the event was deferred after the switchover (caller should return),
+            False if the upgrade should proceed immediately.
+        """
+        # No peer units: skip the switchover.
+        if not self.peer_relation.units:
+            return False
+
+        if self.charm.async_replication.get_primary_cluster_endpoint():
+            logger.info("Standby cluster is read-only; skipping pre-upgrade switchover")
+            return False
+
+        if snap_refreshed(snap.SnapCache()[POSTGRESQL_SNAP_NAME].revision):
+            logger.info("Snap is already at the target revision, skipping pre-upgrade switchover")
+            return False
+
+        # The marker is written just before the deferral below; when it is present,
+        # clear it and proceed with the upgrade instead of switching over again.
+        if self.unit_upgrade_data.get("pre-upgrade-switchover-done"):
+            self.unit_upgrade_data.update({"pre-upgrade-switchover-done": ""})
+            return False
+
+        old_primary = self.charm._patroni.get_primary()
+        if old_primary is None or old_primary != self.charm.unit.name.replace("/", "-"):
+            return False
+
+        logger.info("Switching over primary before upgrading")
+        self.charm.unit.status = MaintenanceStatus("switching over primary")
+        self.charm._patroni.switchover()
+        # NOTE(review): the result of primary_changed() is discarded; confirm it raises
+        # (rather than returning False) when the primary did not move.
+        self.charm._patroni.primary_changed(old_primary)
+        logger.info("Primary switchover completed")
+
+        if self.charm.unit.is_leader():
+            self.charm._update_relation_endpoints()
+
+        self.unit_upgrade_data.update({"pre-upgrade-switchover-done": "true"})
+        logger.info("Deferring upgrade to let Juju commit endpoint changes to client relations")
+        event.defer()
+        return True
+
     @override
     def _on_upgrade_granted(self, event: UpgradeGrantedEvent) -> None:
+        try:
+            if self._pre_upgrade_switchover(event):
+                return
+        except (RetryError, SwitchoverFailedError) as e:
+            logger.warning("Pre-upgrade switchover failed: %s. Proceeding with upgrade.", e)
+
         # Refresh the charmed PostgreSQL snap and restart the database.
         # Update the configuration.
         self.charm.unit.status = MaintenanceStatus("updating configuration")
diff --git a/tests/unit/test_upgrade.py b/tests/unit/test_upgrade.py
index d9672a0a3a..914bd76de7 100644
--- a/tests/unit/test_upgrade.py
+++ b/tests/unit/test_upgrade.py
@@ -196,6 +196,25 @@ def test_on_upgrade_granted(harness):
         _updated_synchronous_node_count.assert_called_once_with()
 
 
+def test_pre_upgrade_switchover_skips_standby(harness):
+    with (
+        patch(
+            "charm.PostgreSQLAsyncReplication.get_primary_cluster_endpoint"
+        ) as _get_primary_cluster_endpoint,
+        patch("charm.Patroni.switchover") as _switchover,
+    ):
+        # A standby (async-replica) cluster is read-only: there is no client write
+        # downtime to avoid, so the pre-upgrade switchover must be skipped.
+        _get_primary_cluster_endpoint.return_value = "10.1.1.1"
+        mock_event = MagicMock()
+
+        assert harness.charm.upgrade._pre_upgrade_switchover(mock_event) is False
+        # Prove the standby guard itself was reached, not an earlier early return.
+        _get_primary_cluster_endpoint.assert_called_once_with()
+        _switchover.assert_not_called()
+        mock_event.defer.assert_not_called()
+
+
 def test_pre_upgrade_check(harness):
     with (
         patch(