From 38237b4525b7abc80f4a92f24e7e062a97d68101 Mon Sep 17 00:00:00 2001 From: Alex Lutay <1928266+taurus-forever@users.noreply.github.com> Date: Mon, 27 Apr 2026 09:46:23 +0000 Subject: [PATCH 1/2] [DPE-9964] Pre-upgrade switchover to minimize client write downtime during refresh MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the Patroni primary is also the Juju leader (upgraded last), clients lose write access for the entire upgrade cycle because no unit can update the relation endpoints. Perform a graceful Patroni switchover before the snap refresh so the endpoint is updated while the unit is still responsive. Falls back to the current automatic failover behavior if the switchover fails. Also, to commit the endpoint change to the client, the charm has to defer upgrade_granted, as Juju batches relation data changes and only commits them when the hook exits. The previous approach updated endpoints inside _on_upgrade_granted but the client wouldn't see the change until after the snap refresh completed — defeating the purpose. Now the switchover + endpoint update happens in the first invocation, which defers the event and returns. Juju commits the endpoint change, the client sees the new primary immediately, and the deferred event fires a second time to proceed with the snap refresh. Skip pre-upgrade switchover for single-unit applications or when the snap revision is unchanged. 
Assisted-by: Claude:claude-4.6-opus --- src/upgrade.py | 45 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) diff --git a/src/upgrade.py b/src/upgrade.py index 795aacf060b..66784bf9104 100644 --- a/src/upgrade.py +++ b/src/upgrade.py @@ -6,6 +6,7 @@ import json import logging +from charmlibs import snap from charms.data_platform_libs.v0.upgrade import ( ClusterNotReadyError, DataUpgrade, @@ -18,15 +19,17 @@ from tenacity import RetryError, Retrying, stop_after_attempt, wait_fixed from typing_extensions import override +from cluster import SwitchoverFailedError from constants import ( APP_SCOPE, MONITORING_PASSWORD_KEY, MONITORING_USER, PATRONI_PASSWORD_KEY, + POSTGRESQL_SNAP_NAME, RAFT_PASSWORD_KEY, SNAP_PACKAGES, ) -from utils import new_password +from utils import new_password, snap_refreshed logger = logging.getLogger(__name__) @@ -141,8 +144,48 @@ def _on_upgrade_charm_check_legacy(self) -> None: self._prepare_upgrade_from_legacy() self.on.upgrade_charm.emit() + def _pre_upgrade_switchover(self, event: UpgradeGrantedEvent) -> bool: + """Switchover primary before upgrading, to minimize client write downtime. + + Returns True if the event was deferred after the switchover (caller should return). 
+ """ + if len(self.peer_relation.units) == 0: + return False + + if snap_refreshed(snap.SnapCache()[POSTGRESQL_SNAP_NAME].revision): + logger.info("Snap is already at the target revision, skipping pre-upgrade switchover") + return False + + if self.unit_upgrade_data.get("pre-upgrade-switchover-done"): + self.unit_upgrade_data.update({"pre-upgrade-switchover-done": ""}) + return False + + old_primary = self.charm._patroni.get_primary() + if old_primary is None or old_primary != self.charm.unit.name.replace("/", "-"): + return False + + logger.info("Switching over primary before upgrading") + self.charm.unit.status = MaintenanceStatus("switching over primary") + self.charm._patroni.switchover() + self.charm._patroni.primary_changed(old_primary) + logger.info("Primary switchover completed") + + if self.charm.unit.is_leader(): + self.charm._update_relation_endpoints() + + self.unit_upgrade_data.update({"pre-upgrade-switchover-done": "true"}) + logger.info("Deferring upgrade to let Juju commit endpoint changes to client relations") + event.defer() + return True + @override def _on_upgrade_granted(self, event: UpgradeGrantedEvent) -> None: + try: + if self._pre_upgrade_switchover(event): + return + except (RetryError, SwitchoverFailedError) as e: + logger.warning("Pre-upgrade switchover failed: %s. Proceeding with upgrade.", e) + # Refresh the charmed PostgreSQL snap and restart the database. # Update the configuration. 
self.charm.unit.status = MaintenanceStatus("updating configuration") From 40b5b46a52c33fb8af544df04bbf4545237a933e Mon Sep 17 00:00:00 2001 From: Alex Lutay <1928266+taurus-forever@users.noreply.github.com> Date: Wed, 10 Jun 2026 11:15:54 +0000 Subject: [PATCH 2/2] [DPE-9964] Skip pre-upgrade switchover for standby clusters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A standby (async-replica) cluster is read-only — clients do not write to it, so there is no client write downtime for the pre-upgrade switchover to minimize. The switchover also relies on get_primary(), which returns None on a standby cluster (it has a standby_leader, not a leader), so the switchover would no-op anyway. Skip it explicitly when this app is a standby cluster, gated on async_replication.get_primary_cluster_endpoint(). Assisted-by: Claude:claude-4.6-opus --- src/upgrade.py | 4 ++++ tests/unit/test_upgrade.py | 17 +++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/src/upgrade.py b/src/upgrade.py index 66784bf9104..ad025364ff4 100644 --- a/src/upgrade.py +++ b/src/upgrade.py @@ -152,6 +152,10 @@ def _pre_upgrade_switchover(self, event: UpgradeGrantedEvent) -> bool: if len(self.peer_relation.units) == 0: return False + if self.charm.async_replication.get_primary_cluster_endpoint(): + logger.info("Standby cluster is read-only; skipping pre-upgrade switchover") + return False + if snap_refreshed(snap.SnapCache()[POSTGRESQL_SNAP_NAME].revision): logger.info("Snap is already at the target revision, skipping pre-upgrade switchover") return False diff --git a/tests/unit/test_upgrade.py b/tests/unit/test_upgrade.py index d9672a0a3ae..914bd76de75 100644 --- a/tests/unit/test_upgrade.py +++ b/tests/unit/test_upgrade.py @@ -196,6 +196,23 @@ def test_on_upgrade_granted(harness): _updated_synchronous_node_count.assert_called_once_with() +def test_pre_upgrade_switchover_skips_standby(harness): + with ( + patch( + 
"charm.PostgreSQLAsyncReplication.get_primary_cluster_endpoint" + ) as _get_primary_cluster_endpoint, + patch("charm.Patroni.switchover") as _switchover, + ): + # A standby (async-replica) cluster is read-only: there is no client write + # downtime to avoid, so the pre-upgrade switchover must be skipped. + _get_primary_cluster_endpoint.return_value = "10.1.1.1" + mock_event = MagicMock() + + assert harness.charm.upgrade._pre_upgrade_switchover(mock_event) is False + _switchover.assert_not_called() + mock_event.defer.assert_not_called() + + def test_pre_upgrade_check(harness): with ( patch(