diff --git a/src/charm.py b/src/charm.py index 0b9626453f..83494c6f0e 100755 --- a/src/charm.py +++ b/src/charm.py @@ -413,6 +413,10 @@ def __init__(self, *args): # incorrectly # https://canonical-charm-refresh.readthedocs-hosted.com/latest/add-to-charm/status/#implementation self.framework.observe(self.on.collect_unit_status, self._reconcile_refresh_status) + for storage_name in self.meta.storages: + self.framework.observe( + self.on[storage_name].storage_detaching, self._on_storage_detaching + ) self.cluster_name = self.app.name self._member_name = self.unit.name.replace("/", "-") self._certs_path = "/usr/local/share/ca-certificates" @@ -2563,6 +2567,30 @@ def _install_snap_package( ) raise + def _on_storage_detaching(self, _) -> None: + """Release the storage so Juju can unmount it on unit teardown. + + Only act when the whole application is going away (``planned_units == 0``, + i.e. remove-application/destroy-model), which is where the storage unmount + actually hangs. On scale-down the surviving units still need this unit's + Patroni reachable to remove it from the raft cluster, so stopping the + workload here would break primary election. + + ``storage-detaching`` runs before ``stop``, so this is the earliest hook + where the snap services and background processes that keep the storage + mounts busy can be stopped. + """ + if self.app.planned_units() > 0: + return + self._observer.stop_observer() + self._rotate_logs.stop_log_rotation() + # Stop every charmed-postgresql snap service (PostgreSQL, pgBackRest, the + # exporters) so the snap releases the storage mounts before Juju unmounts them. + try: + snap.SnapCache()[charm_refresh.snap_name()].stop() + except snap.SnapError: + logger.exception("Failed to stop charmed-postgresql snap services") + def _is_storage_attached(self) -> bool: """Returns if storage is attached.""" try: diff --git a/src/rotate_logs.py b/src/rotate_logs.py index 39cdafc11e..f6dd621e26 100644 --- a/src/rotate_logs.py +++ b/src/rotate_logs.py @@ -5,6 +5,7 @@ import logging import os +import signal import subprocess from typing import TYPE_CHECKING @@ -56,3 +57,14 @@ def start_log_rotation(self): self._charm.unit_peer_data.update({"rotate-logs-pid": f"{pid}"}) logging.info(f"Started rotate logs process with PID {pid}") + + def stop_log_rotation(self) -> None: + """Stop the running rotate logs process if we have previously started it.""" + if stored_pid := self._charm.unit_peer_data.get("rotate-logs-pid"): + pid = int(stored_pid) + try: + os.kill(pid, signal.SIGINT) + logging.info(f"Stopped rotate logs process with PID {pid}") + self._charm.unit_peer_data.update({"rotate-logs-pid": ""}) + except OSError: + pass diff --git a/tests/integration/adapters.py b/tests/integration/adapters.py index 1892a63814..64cacb23f0 100644 --- a/tests/integration/adapters.py +++ b/tests/integration/adapters.py @@ -379,7 +379,8 @@ def destroy_unit( def list_storage(self, filesystem: bool = False, volume: bool = False) -> list[TStorageInfo]: """Lists storage details.""" raw = self._juju.cli("list-storage", "--format", "json") - json_ = json.loads(raw) + # Juju 4.0 prints nothing (not `{}`) for a model with no storage. + json_ = json.loads(raw) if raw.strip() else {} ret = [] for storage_key, storage_details in json_.get("storage", {}).items(): ret.append({"key": storage_key, **storage_details}) diff --git a/tests/integration/test_storage_detaching.py b/tests/integration/test_storage_detaching.py new file mode 100644 index 0000000000..bb9bd2a64c --- /dev/null +++ b/tests/integration/test_storage_detaching.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python3 +# Copyright 2025 Canonical Ltd. +# See LICENSE file for licensing details. + +import logging + +import pytest + +from .adapters import JujuFixture +from .jubilant_helpers import DATABASE_APP_NAME + +logger = logging.getLogger(__name__) + +# rootfs (machine-scoped) storage is the issue's scenario. It matters: +# - Juju 3.6 masks the bug via a cleanup_storage shortcut that removes still-Dying +# storage, so this only reproduces on Juju 4.0 (see tests/spread/.../task.yaml). +# - On 4.0 the detachable ``lxd`` pool lets Juju destroy the container before +# detaching, so only ``rootfs`` reproduces the stuck unmount. +ROOTFS_STORAGE = dict.fromkeys(("archive", "data", "logs", "temp"), "rootfs") + + +@pytest.mark.abort_on_fail +def test_storage_released_on_removal(juju: JujuFixture, charm): + """Graceful removal must release the storage so teardown completes. + + Regression test for canonical/postgresql-operator#1550: without stopping the + workload in the ``storage-detaching`` hook the charmed-postgresql snap keeps the + storage mounts busy, so Juju's unmount fails ("target is busy") and the unit, + storage and machine removal hangs forever. + + Pinned to Juju 4.0 with rootfs storage by its spread task. ``force=True`` is + required because the charm still declares ``assumes: juju < 4``. + """ + juju.ext.model.deploy( + charm, + num_units=1, + config={"profile": "testing"}, + storage=ROOTFS_STORAGE, + force=True, + ) + juju.ext.model.wait_for_idle(apps=[DATABASE_APP_NAME], status="active") + + logger.info("Removing %s and waiting for a clean teardown", DATABASE_APP_NAME) + juju.ext.model.remove_application( + DATABASE_APP_NAME, + block_until_done=True, + destroy_storage=True, + timeout=15 * 60, + ) + + detaching = [ + storage for storage in juju.ext.model.list_storage() if storage["life"] == "detaching" + ] + assert not detaching, f"storage stuck detaching after removal: {detaching}" diff --git a/tests/spread/test_storage_detaching.py/task.yaml b/tests/spread/test_storage_detaching.py/task.yaml new file mode 100644 index 0000000000..3f1f886a55 --- /dev/null +++ b/tests/spread/test_storage_detaching.py/task.yaml @@ -0,0 +1,13 @@ +summary: test_storage_detaching.py +environment: + TEST_MODULE: test_storage_detaching.py + # #1550 only reproduces on Juju 4.0: 3.6 masks it via a cleanup_storage shortcut + # that removes still-Dying storage. Pin this task to a 4.0 variant and drop the + # default juju36 variant so it runs ONLY on 4.0 (the rest of the suite stays 3.6). + CONCIERGE_JUJU_CHANNEL/juju40: 4.0/stable +variants: + - -juju36 +execute: | + tox run -e integration -- "tests/integration/$TEST_MODULE" --model testing --alluredir="$SPREAD_TASK/allure-results" +artifacts: + - allure-results diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index 7a9db219e5..86cb7b0ed8 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -125,6 +125,41 @@ def test_on_install(harness): assert isinstance(harness.model.unit.status, WaitingStatus) +def test_on_storage_detaching(harness): + with ( + patch("charm.snap.SnapCache") as _snap_cache, + patch("charm.ClusterTopologyObserver.stop_observer") as _stop_observer, + patch("charm.RotateLogs.stop_log_rotation") as _stop_log_rotation, + patch.object(harness.charm.app, "planned_units") as _planned_units, + ): + _selected_snap = _snap_cache.return_value.__getitem__.return_value + + # Scale-down (units remain): the surviving cluster still needs this unit's + # Patroni reachable to remove it from raft, so the workload must NOT be + # stopped here. + _planned_units.return_value = 2 + storage_id = harness.add_storage("data", attach=True)[0] + harness.detach_storage(storage_id) + _stop_observer.assert_not_called() + _stop_log_rotation.assert_not_called() + _selected_snap.stop.assert_not_called() + + # Full teardown (no units remain): release the storage by stopping the + # background processes and all the snap services. + _planned_units.return_value = 0 + for storage_name in ("archive", "data", "logs", "temp"): + _stop_observer.reset_mock() + _stop_log_rotation.reset_mock() + _selected_snap.reset_mock() + + storage_id = harness.add_storage(storage_name, attach=True)[0] + harness.detach_storage(storage_id) + + _stop_observer.assert_called_once_with() + _stop_log_rotation.assert_called_once_with() + _selected_snap.stop.assert_called_once_with() + + def test_patroni_scrape_config(harness): result = harness.charm.patroni_scrape_config() diff --git a/tests/unit/test_rotate_logs.py b/tests/unit/test_rotate_logs.py index 70b870f021..e399e9d2d4 100644 --- a/tests/unit/test_rotate_logs.py +++ b/tests/unit/test_rotate_logs.py @@ -1,5 +1,6 @@ # Copyright 2024 Canonical Ltd. # See LICENSE file for licensing details. +import signal from unittest.mock import Mock, PropertyMock, patch import pytest @@ -96,3 +97,33 @@ def test_start_log_rotation_already_running(harness): harness.charm.rotate_logs.start_log_rotation() _kill.assert_called_once_with(1234, 0) _popen.assert_called_once() + + +def test_stop_log_rotation(harness): + with ( + patch("os.kill") as _kill, + patch.object(MockCharm, "_peers", new_callable=PropertyMock) as _peers, + ): + # Nothing is done when the peer relation (and so the stored PID) is gone. + _peers.return_value = None + harness.charm.rotate_logs.stop_log_rotation() + _kill.assert_not_called() + + # Nothing is done when there is no process running. + _peers.return_value = Mock(data={harness.charm.unit: {}}) + harness.charm.rotate_logs.stop_log_rotation() + _kill.assert_not_called() + + # The process is killed and the stored PID is cleared. + peer_data = {"rotate-logs-pid": "1"} + _peers.return_value = Mock(data={harness.charm.unit: peer_data}) + harness.charm.rotate_logs.stop_log_rotation() + _kill.assert_called_once_with(1, signal.SIGINT) + assert peer_data["rotate-logs-pid"] == "" + _kill.reset_mock() + + # A dead process doesn't break the script. + _peers.return_value = Mock(data={harness.charm.unit: {"rotate-logs-pid": "1"}}) + _kill.side_effect = OSError + harness.charm.rotate_logs.stop_log_rotation() + _kill.assert_called_once_with(1, signal.SIGINT)