From e11ce493ea06cb591ce21d3b9f036f05202e27e6 Mon Sep 17 00:00:00 2001 From: Marcelo Henrique Neppel Date: Thu, 11 Jun 2026 18:52:37 -0300 Subject: [PATCH 1/3] fix(charm): release storage on unit teardown The charm implemented no removal hooks, so on unit teardown the charmed-postgresql snap services kept the Juju storage mounts busy. Juju's unmount then failed with "target is busy", leaving storage stuck detaching and blocking machine and model removal (only destroy-model --force could clear it). Stop the workload in the storage-detaching hook, which Juju runs before stop, so the mounts are free by the time Juju unmounts them. This stops every charmed-postgresql snap service plus the charm's topology-observer and log-rotation processes, and is idempotent across the per-storage detaching events. Fixes #1550. Signed-off-by: Marcelo Henrique Neppel --- src/charm.py | 15 +++++++++++++++ src/cluster.py | 15 +++++++++++++++ src/rotate_logs.py | 12 ++++++++++++ tests/integration/test_storage.py | 28 ++++++++++++++++++++++++++++ tests/unit/test_charm.py | 21 +++++++++++++++++++++ tests/unit/test_cluster.py | 15 +++++++++++++++ tests/unit/test_rotate_logs.py | 31 +++++++++++++++++++++++++++++++ 7 files changed, 137 insertions(+) diff --git a/src/charm.py b/src/charm.py index 4a49576a137..dd56d1b7c35 100755 --- a/src/charm.py +++ b/src/charm.py @@ -374,6 +374,10 @@ def __init__(self, *args): # incorrectly # https://canonical-charm-refresh.readthedocs-hosted.com/latest/add-to-charm/status/#implementation self.framework.observe(self.on.collect_unit_status, self._reconcile_refresh_status) + for storage_name in self.meta.storages: + self.framework.observe( + self.on[storage_name].storage_detaching, self._on_storage_detaching + ) self.cluster_name = self.app.name self._member_name = self.unit.name.replace("/", "-") self._certs_path = "/usr/local/share/ca-certificates" @@ -2475,6 +2479,17 @@ def _install_snap_package( ) raise + def _on_storage_detaching(self, _) -> None: + """Release the storage so Juju can unmount it on unit teardown. + + ``storage-detaching`` runs before ``stop``, so this is the earliest hook + where the snap services and background processes that keep the storage + mounts busy can be stopped. + """ + self._observer.stop_observer() + self._rotate_logs.stop_log_rotation() + self._patroni.stop_all_services() + def _is_storage_attached(self) -> bool: """Returns if storage is attached.""" try: diff --git a/src/cluster.py b/src/cluster.py index c9d7a9749c0..5ad80b99d43 100644 --- a/src/cluster.py +++ b/src/cluster.py @@ -796,6 +796,21 @@ def stop_patroni(self) -> bool: logger.exception(error_message, exc_info=e) return False + def stop_all_services(self) -> None: + """Stop every charmed-postgresql snap service. + + Used on unit teardown so the snap releases the storage mounts (PostgreSQL, + pgBackRest, the exporters) before Juju unmounts them. + """ + try: + logger.debug("Stopping all charmed-postgresql services...") + cache = snap.SnapCache() + selected_snap = cache["charmed-postgresql"] + selected_snap.stop() + except snap.SnapError as e: + error_message = "Failed to stop charmed-postgresql snap services" + logger.exception(error_message, exc_info=e) + def switchover(self, candidate: str | None = None, async_cluster: bool = False) -> None: """Trigger a switchover.""" # Try to trigger the switchover. diff --git a/src/rotate_logs.py b/src/rotate_logs.py index 6ccc7816fa5..71e6da83d3c 100644 --- a/src/rotate_logs.py +++ b/src/rotate_logs.py @@ -5,6 +5,7 @@ import logging import os +import signal import subprocess from typing import TYPE_CHECKING @@ -57,3 +58,14 @@ def start_log_rotation(self): self._charm.unit_peer_data.update({"rotate-logs-pid": f"{pid}"}) logging.info(f"Started rotate logs process with PID {pid}") + + def stop_log_rotation(self) -> None: + """Stop the running rotate logs process if we have previously started it.""" + if stored_pid := self._charm.unit_peer_data.get("rotate-logs-pid"): + pid = int(stored_pid) + try: + os.kill(pid, signal.SIGINT) + logging.info(f"Stopped rotate logs process with PID {pid}") + self._charm.unit_peer_data.update({"rotate-logs-pid": ""}) + except OSError: + pass diff --git a/tests/integration/test_storage.py b/tests/integration/test_storage.py index 3cefedfccf9..4622af58ac2 100644 --- a/tests/integration/test_storage.py +++ b/tests/integration/test_storage.py @@ -31,3 +31,31 @@ def test_storage(juju: JujuFixture, charm): assert storage["key"] == f"{expected_storages[index]}/{index}", ( f"Storage {expected_storages[index]} not found" ) + + +@pytest.mark.abort_on_fail +def test_storage_released_on_removal(juju: JujuFixture, charm): + """Graceful removal must release the storage so teardown completes. + + Regression test for canonical/postgresql-operator#1550: without stopping the + workload in the ``storage-detaching`` hook the charmed-postgresql snap keeps + the storage mounts busy, so Juju's unmount fails ("target is busy") and the + unit, storage and machine removal hangs forever. + """ + if DATABASE_APP_NAME not in juju.status().apps: + with juju.ext.fast_forward(): + juju.ext.model.deploy(charm, num_units=1, config={"profile": "testing"}) + juju.ext.model.wait_for_idle(apps=[DATABASE_APP_NAME], status="active") + + logger.info("Removing %s and waiting for a clean teardown", DATABASE_APP_NAME) + juju.ext.model.remove_application( + DATABASE_APP_NAME, + block_until_done=True, + destroy_storage=True, + timeout=15 * 60, + ) + + detaching = [ + storage for storage in juju.ext.model.list_storage() if storage["life"] == "detaching" + ] + assert not detaching, f"storage stuck detaching after removal: {detaching}" diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index f29388e5d24..29179edb960 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -124,6 +124,27 @@ def test_on_install(harness): assert isinstance(harness.model.unit.status, WaitingStatus) +def test_on_storage_detaching(harness): + with ( + patch("charm.PostgresqlOperatorCharm._patroni", new_callable=PropertyMock) as _patroni, + patch("charm.ClusterTopologyObserver.stop_observer") as _stop_observer, + patch("charm.RotateLogs.stop_log_rotation") as _stop_log_rotation, + ): + # Every storage's detaching event releases the storage by stopping the + # background processes and all the snap services. + for storage_name in ("archive", "data", "logs", "temp"): + _stop_observer.reset_mock() + _stop_log_rotation.reset_mock() + _patroni.reset_mock() + + storage_id = harness.add_storage(storage_name, attach=True)[0] + harness.detach_storage(storage_id) + + _stop_observer.assert_called_once_with() + _stop_log_rotation.assert_called_once_with() + _patroni.return_value.stop_all_services.assert_called_once_with() + + def test_patroni_scrape_config(harness): result = harness.charm.patroni_scrape_config() diff --git a/tests/unit/test_cluster.py b/tests/unit/test_cluster.py index 582211a7e36..c08788ddef2 100644 --- a/tests/unit/test_cluster.py +++ b/tests/unit/test_cluster.py @@ -367,6 +367,21 @@ def test_stop_patroni(peers_ips, patroni): assert not patroni.stop_patroni() +def test_stop_all_services(peers_ips, patroni): + with patch("charm.snap.SnapCache") as _snap_cache: + _cache = _snap_cache.return_value + _selected_snap = _cache.__getitem__.return_value + + # All snap services are stopped (no services= argument means the whole snap). + patroni.stop_all_services() + _cache.__getitem__.assert_called_once_with("charmed-postgresql") + _selected_snap.stop.assert_called_once_with() + + # A snap error does not propagate out of the teardown helper. + _selected_snap.stop.side_effect = snap.SnapError + patroni.stop_all_services() + + def test_reinitialize_postgresql(peers_ips, patroni): with patch("requests.post") as _post: patroni.reinitialize_postgresql() diff --git a/tests/unit/test_rotate_logs.py b/tests/unit/test_rotate_logs.py index 70b870f0218..e399e9d2d43 100644 --- a/tests/unit/test_rotate_logs.py +++ b/tests/unit/test_rotate_logs.py @@ -1,5 +1,6 @@ # Copyright 2024 Canonical Ltd. # See LICENSE file for licensing details. +import signal from unittest.mock import Mock, PropertyMock, patch import pytest @@ -96,3 +97,33 @@ def test_start_log_rotation_already_running(harness): harness.charm.rotate_logs.start_log_rotation() _kill.assert_called_once_with(1234, 0) _popen.assert_called_once() + + +def test_stop_log_rotation(harness): + with ( + patch("os.kill") as _kill, + patch.object(MockCharm, "_peers", new_callable=PropertyMock) as _peers, + ): + # Nothing is done when the peer relation (and so the stored PID) is gone. + _peers.return_value = None + harness.charm.rotate_logs.stop_log_rotation() + _kill.assert_not_called() + + # Nothing is done when there is no process running. + _peers.return_value = Mock(data={harness.charm.unit: {}}) + harness.charm.rotate_logs.stop_log_rotation() + _kill.assert_not_called() + + # The process is killed and the stored PID is cleared. + peer_data = {"rotate-logs-pid": "1"} + _peers.return_value = Mock(data={harness.charm.unit: peer_data}) + harness.charm.rotate_logs.stop_log_rotation() + _kill.assert_called_once_with(1, signal.SIGINT) + assert peer_data["rotate-logs-pid"] == "" + _kill.reset_mock() + + # A dead process doesn't break the script. + _peers.return_value = Mock(data={harness.charm.unit: {"rotate-logs-pid": "1"}}) + _kill.side_effect = OSError + harness.charm.rotate_logs.stop_log_rotation() + _kill.assert_called_once_with(1, signal.SIGINT) From 1641aeb9d3be7a2f0e7808e9e70851ef1a97dccb Mon Sep 17 00:00:00 2001 From: Marcelo Henrique Neppel Date: Thu, 2 Jul 2026 17:43:28 -0300 Subject: [PATCH 2/3] test(integration): reproduce #1550 teardown on juju 4.0 The storage-detaching regression only manifests on Juju 4.0: 3.6 masks it with a cleanup_storage shortcut that removes still-Dying storage, and on 4.0 only rootfs (machine-scoped) storage reproduces the stuck unmount, so running the teardown check on 3.6 proves nothing. Move it out of test_storage.py into its own spread task pinned to a juju40 variant with rootfs storage (force-deployed because the charm still declares assumes: juju < 4). Also make the list_storage adapter tolerate Juju 4.0's empty list-storage output, which it prints instead of "{}" for a model with no storage. Signed-off-by: Marcelo Henrique Neppel --- tests/integration/adapters.py | 3 +- tests/integration/test_storage.py | 28 ---------- tests/integration/test_storage_detaching.py | 54 +++++++++++++++++++ .../test_storage_detaching.py/task.yaml | 13 +++++ 4 files changed, 69 insertions(+), 29 deletions(-) create mode 100644 tests/integration/test_storage_detaching.py create mode 100644 tests/spread/test_storage_detaching.py/task.yaml diff --git a/tests/integration/adapters.py b/tests/integration/adapters.py index 1892a638146..64cacb23f09 100644 --- a/tests/integration/adapters.py +++ b/tests/integration/adapters.py @@ -379,7 +379,8 @@ def destroy_unit( def list_storage(self, filesystem: bool = False, volume: bool = False) -> list[TStorageInfo]: """Lists storage details.""" raw = self._juju.cli("list-storage", "--format", "json") - json_ = json.loads(raw) + # Juju 4.0 prints nothing (not `{}`) for a model with no storage. + json_ = json.loads(raw) if raw.strip() else {} ret = [] for storage_key, storage_details in json_.get("storage", {}).items(): ret.append({"key": storage_key, **storage_details}) diff --git a/tests/integration/test_storage.py b/tests/integration/test_storage.py index 4622af58ac2..3cefedfccf9 100644 --- a/tests/integration/test_storage.py +++ b/tests/integration/test_storage.py @@ -31,31 +31,3 @@ def test_storage(juju: JujuFixture, charm): assert storage["key"] == f"{expected_storages[index]}/{index}", ( f"Storage {expected_storages[index]} not found" ) - - -@pytest.mark.abort_on_fail -def test_storage_released_on_removal(juju: JujuFixture, charm): - """Graceful removal must release the storage so teardown completes. - - Regression test for canonical/postgresql-operator#1550: without stopping the - workload in the ``storage-detaching`` hook the charmed-postgresql snap keeps - the storage mounts busy, so Juju's unmount fails ("target is busy") and the - unit, storage and machine removal hangs forever. - """ - if DATABASE_APP_NAME not in juju.status().apps: - with juju.ext.fast_forward(): - juju.ext.model.deploy(charm, num_units=1, config={"profile": "testing"}) - juju.ext.model.wait_for_idle(apps=[DATABASE_APP_NAME], status="active") - - logger.info("Removing %s and waiting for a clean teardown", DATABASE_APP_NAME) - juju.ext.model.remove_application( - DATABASE_APP_NAME, - block_until_done=True, - destroy_storage=True, - timeout=15 * 60, - ) - - detaching = [ - storage for storage in juju.ext.model.list_storage() if storage["life"] == "detaching" - ] - assert not detaching, f"storage stuck detaching after removal: {detaching}" diff --git a/tests/integration/test_storage_detaching.py b/tests/integration/test_storage_detaching.py new file mode 100644 index 00000000000..bb9bd2a64cc --- /dev/null +++ b/tests/integration/test_storage_detaching.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python3 +# Copyright 2025 Canonical Ltd. +# See LICENSE file for licensing details. + +import logging + +import pytest + +from .adapters import JujuFixture +from .jubilant_helpers import DATABASE_APP_NAME + +logger = logging.getLogger(__name__) + +# rootfs (machine-scoped) storage is the issue's scenario. It matters: +# - Juju 3.6 masks the bug via a cleanup_storage shortcut that removes still-Dying +# storage, so this only reproduces on Juju 4.0 (see tests/spread/.../task.yaml). +# - On 4.0 the detachable ``lxd`` pool lets Juju destroy the container before +# detaching, so only ``rootfs`` reproduces the stuck unmount. +ROOTFS_STORAGE = dict.fromkeys(("archive", "data", "logs", "temp"), "rootfs") + + +@pytest.mark.abort_on_fail +def test_storage_released_on_removal(juju: JujuFixture, charm): + """Graceful removal must release the storage so teardown completes. + + Regression test for canonical/postgresql-operator#1550: without stopping the + workload in the ``storage-detaching`` hook the charmed-postgresql snap keeps the + storage mounts busy, so Juju's unmount fails ("target is busy") and the unit, + storage and machine removal hangs forever. + + Pinned to Juju 4.0 with rootfs storage by its spread task. ``force=True`` is + required because the charm still declares ``assumes: juju < 4``. + """ + juju.ext.model.deploy( + charm, + num_units=1, + config={"profile": "testing"}, + storage=ROOTFS_STORAGE, + force=True, + ) + juju.ext.model.wait_for_idle(apps=[DATABASE_APP_NAME], status="active") + + logger.info("Removing %s and waiting for a clean teardown", DATABASE_APP_NAME) + juju.ext.model.remove_application( + DATABASE_APP_NAME, + block_until_done=True, + destroy_storage=True, + timeout=15 * 60, + ) + + detaching = [ + storage for storage in juju.ext.model.list_storage() if storage["life"] == "detaching" + ] + assert not detaching, f"storage stuck detaching after removal: {detaching}" diff --git a/tests/spread/test_storage_detaching.py/task.yaml b/tests/spread/test_storage_detaching.py/task.yaml new file mode 100644 index 00000000000..3f1f886a556 --- /dev/null +++ b/tests/spread/test_storage_detaching.py/task.yaml @@ -0,0 +1,13 @@ +summary: test_storage_detaching.py +environment: + TEST_MODULE: test_storage_detaching.py + # #1550 only reproduces on Juju 4.0: 3.6 masks it via a cleanup_storage shortcut + # that removes still-Dying storage. Pin this task to a 4.0 variant and drop the + # default juju36 variant so it runs ONLY on 4.0 (the rest of the suite stays 3.6). + CONCIERGE_JUJU_CHANNEL/juju40: 4.0/stable +variants: + - -juju36 +execute: | + tox run -e integration -- "tests/integration/$TEST_MODULE" --model testing --alluredir="$SPREAD_TASK/allure-results" +artifacts: + - allure-results From 1b7a4fb22c9063a34ff307cd86b2294be3db021e Mon Sep 17 00:00:00 2001 From: Marcelo Henrique Neppel Date: Thu, 2 Jul 2026 19:48:49 -0300 Subject: [PATCH 3/3] fix(charm): only release storage on full app teardown Stopping the workload in storage-detaching also fired on scale-down (remove-unit), where the surviving leader still needs the departing unit's Patroni reachable to remove it from the raft cluster. Stopping it early broke that reconfiguration, leaving the cluster unable to elect a primary ("Primary unit not found") and failing the HA/scaling integration tests. The storage unmount only actually hangs on full teardown (remove-application/destroy-model), so guard the handler on planned_units() == 0. On scale-down it now does nothing, restoring the pre-fix cluster behaviour, while destroy-model/remove-application still release the storage. Signed-off-by: Marcelo Henrique Neppel --- src/charm.py | 8 ++++++++ tests/unit/test_charm.py | 15 ++++++++++++++- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/src/charm.py b/src/charm.py index f53dad9ca5f..83494c6f0e2 100755 --- a/src/charm.py +++ b/src/charm.py @@ -2570,10 +2570,18 @@ def _install_snap_package( def _on_storage_detaching(self, _) -> None: """Release the storage so Juju can unmount it on unit teardown. + Only act when the whole application is going away (``planned_units == 0``, + i.e. remove-application/destroy-model), which is where the storage unmount + actually hangs. On scale-down the surviving units still need this unit's + Patroni reachable to remove it from the raft cluster, so stopping the + workload here would break primary election. + ``storage-detaching`` runs before ``stop``, so this is the earliest hook where the snap services and background processes that keep the storage mounts busy can be stopped. """ + if self.app.planned_units() > 0: + return self._observer.stop_observer() self._rotate_logs.stop_log_rotation() # Stop every charmed-postgresql snap service (PostgreSQL, pgBackRest, the diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index 17f54ff0d45..86cb7b0ed8a 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -130,10 +130,23 @@ def test_on_storage_detaching(harness): patch("charm.snap.SnapCache") as _snap_cache, patch("charm.ClusterTopologyObserver.stop_observer") as _stop_observer, patch("charm.RotateLogs.stop_log_rotation") as _stop_log_rotation, + patch.object(harness.charm.app, "planned_units") as _planned_units, ): _selected_snap = _snap_cache.return_value.__getitem__.return_value - # Every storage's detaching event releases the storage by stopping the + + # Scale-down (units remain): the surviving cluster still needs this unit's + # Patroni reachable to remove it from raft, so the workload must NOT be + # stopped here. + _planned_units.return_value = 2 + storage_id = harness.add_storage("data", attach=True)[0] + harness.detach_storage(storage_id) + _stop_observer.assert_not_called() + _stop_log_rotation.assert_not_called() + _selected_snap.stop.assert_not_called() + + # Full teardown (no units remain): release the storage by stopping the # background processes and all the snap services. + _planned_units.return_value = 0 for storage_name in ("archive", "data", "logs", "temp"): _stop_observer.reset_mock() _stop_log_rotation.reset_mock()