From e11ce493ea06cb591ce21d3b9f036f05202e27e6 Mon Sep 17 00:00:00 2001
From: Marcelo Henrique Neppel <marcelo.neppel@canonical.com>
Date: Thu, 11 Jun 2026 18:52:37 -0300
Subject: [PATCH 1/3] fix(charm): release storage on unit teardown

The charm implemented no removal hooks, so on unit teardown the
charmed-postgresql snap services kept the Juju storage mounts busy.
Juju's unmount then failed with "target is busy", leaving storage
stuck detaching and blocking machine and model removal (only
destroy-model --force could clear it).

Stop the workload in the storage-detaching hook, which Juju runs
before stop, so the mounts are free by the time Juju unmounts them.
This stops every charmed-postgresql snap service plus the charm's
topology-observer and log-rotation processes, and is idempotent
across the per-storage detaching events.

Fixes #1550.

Signed-off-by: Marcelo Henrique Neppel <marcelo.neppel@canonical.com>
---
 src/charm.py                      | 15 +++++++++++++++
 src/cluster.py                    | 15 +++++++++++++++
 src/rotate_logs.py                | 12 ++++++++++++
 tests/integration/test_storage.py | 28 ++++++++++++++++++++++++++++
 tests/unit/test_charm.py          | 21 +++++++++++++++++++++
 tests/unit/test_cluster.py        | 15 +++++++++++++++
 tests/unit/test_rotate_logs.py    | 31 +++++++++++++++++++++++++++++++
 7 files changed, 137 insertions(+)

diff --git a/src/charm.py b/src/charm.py
index 4a49576a137..dd56d1b7c35 100755
--- a/src/charm.py
+++ b/src/charm.py
@@ -374,6 +374,10 @@ def __init__(self, *args):
         # incorrectly
         # https://canonical-charm-refresh.readthedocs-hosted.com/latest/add-to-charm/status/#implementation
         self.framework.observe(self.on.collect_unit_status, self._reconcile_refresh_status)
+        for storage_name in self.meta.storages:
+            self.framework.observe(
+                self.on[storage_name].storage_detaching, self._on_storage_detaching
+            )
         self.cluster_name = self.app.name
         self._member_name = self.unit.name.replace("/", "-")
         self._certs_path = "/usr/local/share/ca-certificates"
@@ -2475,6 +2479,17 @@ def _install_snap_package(
             )
             raise
 
+    def _on_storage_detaching(self, _) -> None:
+        """Release the storage so Juju can unmount it on unit teardown.
+
+        ``storage-detaching`` runs before ``stop``, so this is the earliest hook
+        where the snap services and background processes that keep the storage
+        mounts busy can be stopped.
+        """
+        self._observer.stop_observer()
+        self._rotate_logs.stop_log_rotation()
+        self._patroni.stop_all_services()
+
     def _is_storage_attached(self) -> bool:
         """Returns if storage is attached."""
         try:
diff --git a/src/cluster.py b/src/cluster.py
index c9d7a9749c0..5ad80b99d43 100644
--- a/src/cluster.py
+++ b/src/cluster.py
@@ -796,6 +796,21 @@ def stop_patroni(self) -> bool:
             logger.exception(error_message, exc_info=e)
             return False
 
+    def stop_all_services(self) -> None:
+        """Stop every charmed-postgresql snap service.
+
+        Used on unit teardown so the snap releases the storage mounts (PostgreSQL,
+        pgBackRest, the exporters) before Juju unmounts them.
+        """
+        try:
+            logger.debug("Stopping all charmed-postgresql services...")
+            cache = snap.SnapCache()
+            selected_snap = cache["charmed-postgresql"]
+            selected_snap.stop()
+        except snap.SnapError as e:
+            error_message = "Failed to stop charmed-postgresql snap services"
+            logger.exception(error_message, exc_info=e)
+
     def switchover(self, candidate: str | None = None, async_cluster: bool = False) -> None:
         """Trigger a switchover."""
         # Try to trigger the switchover.
diff --git a/src/rotate_logs.py b/src/rotate_logs.py
index 6ccc7816fa5..71e6da83d3c 100644
--- a/src/rotate_logs.py
+++ b/src/rotate_logs.py
@@ -5,6 +5,7 @@
 
 import logging
 import os
+import signal
 import subprocess
 from typing import TYPE_CHECKING
 
@@ -57,3 +58,14 @@ def start_log_rotation(self):
 
         self._charm.unit_peer_data.update({"rotate-logs-pid": f"{pid}"})
         logging.info(f"Started rotate logs process with PID {pid}")
+
+    def stop_log_rotation(self) -> None:
+        """Stop the running rotate logs process if we have previously started it."""
+        if stored_pid := self._charm.unit_peer_data.get("rotate-logs-pid"):
+            pid = int(stored_pid)
+            try:
+                os.kill(pid, signal.SIGINT)
+                logging.info(f"Stopped rotate logs process with PID {pid}")
+                self._charm.unit_peer_data.update({"rotate-logs-pid": ""})
+            except OSError:
+                pass
diff --git a/tests/integration/test_storage.py b/tests/integration/test_storage.py
index 3cefedfccf9..4622af58ac2 100644
--- a/tests/integration/test_storage.py
+++ b/tests/integration/test_storage.py
@@ -31,3 +31,31 @@ def test_storage(juju: JujuFixture, charm):
             assert storage["key"] == f"{expected_storages[index]}/{index}", (
                 f"Storage {expected_storages[index]} not found"
             )
+
+
+@pytest.mark.abort_on_fail
+def test_storage_released_on_removal(juju: JujuFixture, charm):
+    """Graceful removal must release the storage so teardown completes.
+
+    Regression test for canonical/postgresql-operator#1550: without stopping the
+    workload in the ``storage-detaching`` hook the charmed-postgresql snap keeps
+    the storage mounts busy, so Juju's unmount fails ("target is busy") and the
+    unit, storage and machine removal hangs forever.
+    """
+    if DATABASE_APP_NAME not in juju.status().apps:
+        with juju.ext.fast_forward():
+            juju.ext.model.deploy(charm, num_units=1, config={"profile": "testing"})
+            juju.ext.model.wait_for_idle(apps=[DATABASE_APP_NAME], status="active")
+
+    logger.info("Removing %s and waiting for a clean teardown", DATABASE_APP_NAME)
+    juju.ext.model.remove_application(
+        DATABASE_APP_NAME,
+        block_until_done=True,
+        destroy_storage=True,
+        timeout=15 * 60,
+    )
+
+    detaching = [
+        storage for storage in juju.ext.model.list_storage() if storage["life"] == "detaching"
+    ]
+    assert not detaching, f"storage stuck detaching after removal: {detaching}"
diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py
index f29388e5d24..29179edb960 100644
--- a/tests/unit/test_charm.py
+++ b/tests/unit/test_charm.py
@@ -124,6 +124,27 @@ def test_on_install(harness):
         assert isinstance(harness.model.unit.status, WaitingStatus)
 
 
+def test_on_storage_detaching(harness):
+    with (
+        patch("charm.PostgresqlOperatorCharm._patroni", new_callable=PropertyMock) as _patroni,
+        patch("charm.ClusterTopologyObserver.stop_observer") as _stop_observer,
+        patch("charm.RotateLogs.stop_log_rotation") as _stop_log_rotation,
+    ):
+        # Every storage's detaching event releases the storage by stopping the
+        # background processes and all the snap services.
+        for storage_name in ("archive", "data", "logs", "temp"):
+            _stop_observer.reset_mock()
+            _stop_log_rotation.reset_mock()
+            _patroni.reset_mock()
+
+            storage_id = harness.add_storage(storage_name, attach=True)[0]
+            harness.detach_storage(storage_id)
+
+            _stop_observer.assert_called_once_with()
+            _stop_log_rotation.assert_called_once_with()
+            _patroni.return_value.stop_all_services.assert_called_once_with()
+
+
 def test_patroni_scrape_config(harness):
     result = harness.charm.patroni_scrape_config()
 
diff --git a/tests/unit/test_cluster.py b/tests/unit/test_cluster.py
index 582211a7e36..c08788ddef2 100644
--- a/tests/unit/test_cluster.py
+++ b/tests/unit/test_cluster.py
@@ -367,6 +367,21 @@ def test_stop_patroni(peers_ips, patroni):
         assert not patroni.stop_patroni()
 
 
+def test_stop_all_services(peers_ips, patroni):
+    with patch("charm.snap.SnapCache") as _snap_cache:
+        _cache = _snap_cache.return_value
+        _selected_snap = _cache.__getitem__.return_value
+
+        # All snap services are stopped (no services= argument means the whole snap).
+        patroni.stop_all_services()
+        _cache.__getitem__.assert_called_once_with("charmed-postgresql")
+        _selected_snap.stop.assert_called_once_with()
+
+        # A snap error does not propagate out of the teardown helper.
+        _selected_snap.stop.side_effect = snap.SnapError
+        patroni.stop_all_services()
+
+
 def test_reinitialize_postgresql(peers_ips, patroni):
     with patch("requests.post") as _post:
         patroni.reinitialize_postgresql()
diff --git a/tests/unit/test_rotate_logs.py b/tests/unit/test_rotate_logs.py
index 70b870f0218..e399e9d2d43 100644
--- a/tests/unit/test_rotate_logs.py
+++ b/tests/unit/test_rotate_logs.py
@@ -1,5 +1,6 @@
 # Copyright 2024 Canonical Ltd.
 # See LICENSE file for licensing details.
+import signal
 from unittest.mock import Mock, PropertyMock, patch
 
 import pytest
@@ -96,3 +97,33 @@ def test_start_log_rotation_already_running(harness):
         harness.charm.rotate_logs.start_log_rotation()
         _kill.assert_called_once_with(1234, 0)
         _popen.assert_called_once()
+
+
+def test_stop_log_rotation(harness):
+    with (
+        patch("os.kill") as _kill,
+        patch.object(MockCharm, "_peers", new_callable=PropertyMock) as _peers,
+    ):
+        # Nothing is done when the peer relation (and so the stored PID) is gone.
+        _peers.return_value = None
+        harness.charm.rotate_logs.stop_log_rotation()
+        _kill.assert_not_called()
+
+        # Nothing is done when there is no process running.
+        _peers.return_value = Mock(data={harness.charm.unit: {}})
+        harness.charm.rotate_logs.stop_log_rotation()
+        _kill.assert_not_called()
+
+        # The process is killed and the stored PID is cleared.
+        peer_data = {"rotate-logs-pid": "1"}
+        _peers.return_value = Mock(data={harness.charm.unit: peer_data})
+        harness.charm.rotate_logs.stop_log_rotation()
+        _kill.assert_called_once_with(1, signal.SIGINT)
+        assert peer_data["rotate-logs-pid"] == ""
+        _kill.reset_mock()
+
+        # A dead process doesn't break the script.
+        _peers.return_value = Mock(data={harness.charm.unit: {"rotate-logs-pid": "1"}})
+        _kill.side_effect = OSError
+        harness.charm.rotate_logs.stop_log_rotation()
+        _kill.assert_called_once_with(1, signal.SIGINT)

From 1641aeb9d3be7a2f0e7808e9e70851ef1a97dccb Mon Sep 17 00:00:00 2001
From: Marcelo Henrique Neppel <marcelo.neppel@canonical.com>
Date: Thu, 2 Jul 2026 17:43:28 -0300
Subject: [PATCH 2/3] test(integration): reproduce #1550 teardown on juju 4.0

The storage-detaching regression only manifests on Juju 4.0: 3.6 masks it
with a cleanup_storage shortcut that removes still-Dying storage, and on
4.0 only rootfs (machine-scoped) storage reproduces the stuck unmount, so
running the teardown check on 3.6 proves nothing. Move it out of
test_storage.py into its own spread task pinned to a juju40 variant with
rootfs storage (force-deployed because the charm still declares
assumes: juju < 4).

Also make the list_storage adapter tolerate Juju 4.0's empty list-storage
output, which it prints instead of "{}" for a model with no storage.

Signed-off-by: Marcelo Henrique Neppel <marcelo.neppel@canonical.com>
---
 tests/integration/adapters.py                 |  3 +-
 tests/integration/test_storage.py             | 28 ----------
 tests/integration/test_storage_detaching.py   | 54 +++++++++++++++++++
 .../test_storage_detaching.py/task.yaml       | 13 +++++
 4 files changed, 69 insertions(+), 29 deletions(-)
 create mode 100644 tests/integration/test_storage_detaching.py
 create mode 100644 tests/spread/test_storage_detaching.py/task.yaml

diff --git a/tests/integration/adapters.py b/tests/integration/adapters.py
index 1892a638146..64cacb23f09 100644
--- a/tests/integration/adapters.py
+++ b/tests/integration/adapters.py
@@ -379,7 +379,8 @@ def destroy_unit(
     def list_storage(self, filesystem: bool = False, volume: bool = False) -> list[TStorageInfo]:
         """Lists storage details."""
         raw = self._juju.cli("list-storage", "--format", "json")
-        json_ = json.loads(raw)
+        # Juju 4.0 prints nothing (not `{}`) for a model with no storage.
+        json_ = json.loads(raw) if raw.strip() else {}
         ret = []
         for storage_key, storage_details in json_.get("storage", {}).items():
             ret.append({"key": storage_key, **storage_details})
diff --git a/tests/integration/test_storage.py b/tests/integration/test_storage.py
index 4622af58ac2..3cefedfccf9 100644
--- a/tests/integration/test_storage.py
+++ b/tests/integration/test_storage.py
@@ -31,31 +31,3 @@ def test_storage(juju: JujuFixture, charm):
             assert storage["key"] == f"{expected_storages[index]}/{index}", (
                 f"Storage {expected_storages[index]} not found"
             )
-
-
-@pytest.mark.abort_on_fail
-def test_storage_released_on_removal(juju: JujuFixture, charm):
-    """Graceful removal must release the storage so teardown completes.
-
-    Regression test for canonical/postgresql-operator#1550: without stopping the
-    workload in the ``storage-detaching`` hook the charmed-postgresql snap keeps
-    the storage mounts busy, so Juju's unmount fails ("target is busy") and the
-    unit, storage and machine removal hangs forever.
-    """
-    if DATABASE_APP_NAME not in juju.status().apps:
-        with juju.ext.fast_forward():
-            juju.ext.model.deploy(charm, num_units=1, config={"profile": "testing"})
-            juju.ext.model.wait_for_idle(apps=[DATABASE_APP_NAME], status="active")
-
-    logger.info("Removing %s and waiting for a clean teardown", DATABASE_APP_NAME)
-    juju.ext.model.remove_application(
-        DATABASE_APP_NAME,
-        block_until_done=True,
-        destroy_storage=True,
-        timeout=15 * 60,
-    )
-
-    detaching = [
-        storage for storage in juju.ext.model.list_storage() if storage["life"] == "detaching"
-    ]
-    assert not detaching, f"storage stuck detaching after removal: {detaching}"
diff --git a/tests/integration/test_storage_detaching.py b/tests/integration/test_storage_detaching.py
new file mode 100644
index 00000000000..bb9bd2a64cc
--- /dev/null
+++ b/tests/integration/test_storage_detaching.py
@@ -0,0 +1,54 @@
+#!/usr/bin/env python3
+# Copyright 2025 Canonical Ltd.
+# See LICENSE file for licensing details.
+
+import logging
+
+import pytest
+
+from .adapters import JujuFixture
+from .jubilant_helpers import DATABASE_APP_NAME
+
+logger = logging.getLogger(__name__)
+
+# rootfs (machine-scoped) storage is the issue's scenario. It matters:
+# - Juju 3.6 masks the bug via a cleanup_storage shortcut that removes still-Dying
+#   storage, so this only reproduces on Juju 4.0 (see tests/spread/.../task.yaml).
+# - On 4.0 the detachable ``lxd`` pool lets Juju destroy the container before
+#   detaching, so only ``rootfs`` reproduces the stuck unmount.
+ROOTFS_STORAGE = dict.fromkeys(("archive", "data", "logs", "temp"), "rootfs")
+
+
+@pytest.mark.abort_on_fail
+def test_storage_released_on_removal(juju: JujuFixture, charm):
+    """Graceful removal must release the storage so teardown completes.
+
+    Regression test for canonical/postgresql-operator#1550: without stopping the
+    workload in the ``storage-detaching`` hook the charmed-postgresql snap keeps the
+    storage mounts busy, so Juju's unmount fails ("target is busy") and the unit,
+    storage and machine removal hangs forever.
+
+    Pinned to Juju 4.0 with rootfs storage by its spread task. ``force=True`` is
+    required because the charm still declares ``assumes: juju < 4``.
+    """
+    juju.ext.model.deploy(
+        charm,
+        num_units=1,
+        config={"profile": "testing"},
+        storage=ROOTFS_STORAGE,
+        force=True,
+    )
+    juju.ext.model.wait_for_idle(apps=[DATABASE_APP_NAME], status="active")
+
+    logger.info("Removing %s and waiting for a clean teardown", DATABASE_APP_NAME)
+    juju.ext.model.remove_application(
+        DATABASE_APP_NAME,
+        block_until_done=True,
+        destroy_storage=True,
+        timeout=15 * 60,
+    )
+
+    detaching = [
+        storage for storage in juju.ext.model.list_storage() if storage["life"] == "detaching"
+    ]
+    assert not detaching, f"storage stuck detaching after removal: {detaching}"
diff --git a/tests/spread/test_storage_detaching.py/task.yaml b/tests/spread/test_storage_detaching.py/task.yaml
new file mode 100644
index 00000000000..3f1f886a556
--- /dev/null
+++ b/tests/spread/test_storage_detaching.py/task.yaml
@@ -0,0 +1,13 @@
+summary: test_storage_detaching.py
+environment:
+  TEST_MODULE: test_storage_detaching.py
+  # #1550 only reproduces on Juju 4.0: 3.6 masks it via a cleanup_storage shortcut
+  # that removes still-Dying storage. Pin this task to a 4.0 variant and drop the
+  # default juju36 variant so it runs ONLY on 4.0 (the rest of the suite stays 3.6).
+  CONCIERGE_JUJU_CHANNEL/juju40: 4.0/stable
+variants:
+  - -juju36
+execute: |
+  tox run -e integration -- "tests/integration/$TEST_MODULE" --model testing --alluredir="$SPREAD_TASK/allure-results"
+artifacts:
+  - allure-results

From 1b7a4fb22c9063a34ff307cd86b2294be3db021e Mon Sep 17 00:00:00 2001
From: Marcelo Henrique Neppel <marcelo.neppel@canonical.com>
Date: Thu, 2 Jul 2026 19:48:49 -0300
Subject: [PATCH 3/3] fix(charm): only release storage on full app teardown

Stopping the workload in storage-detaching also fired on scale-down
(remove-unit), where the surviving leader still needs the departing
unit's Patroni reachable to remove it from the raft cluster. Stopping it
early broke that reconfiguration, leaving the cluster unable to elect a
primary ("Primary unit not found") and failing the HA/scaling
integration tests.

The storage unmount only actually hangs on full teardown
(remove-application/destroy-model), so guard the handler on
planned_units() == 0. On scale-down it now does nothing, restoring the
pre-fix cluster behaviour, while destroy-model/remove-application still
release the storage.

Signed-off-by: Marcelo Henrique Neppel <marcelo.neppel@canonical.com>
---
 src/charm.py             |  8 ++++++++
 tests/unit/test_charm.py | 15 ++++++++++++++-
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/src/charm.py b/src/charm.py
index f53dad9ca5f..83494c6f0e2 100755
--- a/src/charm.py
+++ b/src/charm.py
@@ -2570,10 +2570,18 @@ def _install_snap_package(
     def _on_storage_detaching(self, _) -> None:
         """Release the storage so Juju can unmount it on unit teardown.
 
+        Only act when the whole application is going away (``planned_units == 0``,
+        i.e. remove-application/destroy-model), which is where the storage unmount
+        actually hangs. On scale-down the surviving units still need this unit's
+        Patroni reachable to remove it from the raft cluster, so stopping the
+        workload here would break primary election.
+
         ``storage-detaching`` runs before ``stop``, so this is the earliest hook
         where the snap services and background processes that keep the storage
         mounts busy can be stopped.
         """
+        if self.app.planned_units() > 0:
+            return
         self._observer.stop_observer()
         self._rotate_logs.stop_log_rotation()
         # Stop every charmed-postgresql snap service (PostgreSQL, pgBackRest, the
diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py
index 17f54ff0d45..86cb7b0ed8a 100644
--- a/tests/unit/test_charm.py
+++ b/tests/unit/test_charm.py
@@ -130,10 +130,23 @@ def test_on_storage_detaching(harness):
         patch("charm.snap.SnapCache") as _snap_cache,
         patch("charm.ClusterTopologyObserver.stop_observer") as _stop_observer,
         patch("charm.RotateLogs.stop_log_rotation") as _stop_log_rotation,
+        patch.object(harness.charm.app, "planned_units") as _planned_units,
     ):
         _selected_snap = _snap_cache.return_value.__getitem__.return_value
-        # Every storage's detaching event releases the storage by stopping the
+
+        # Scale-down (units remain): the surviving cluster still needs this unit's
+        # Patroni reachable to remove it from raft, so the workload must NOT be
+        # stopped here.
+        _planned_units.return_value = 2
+        storage_id = harness.add_storage("data", attach=True)[0]
+        harness.detach_storage(storage_id)
+        _stop_observer.assert_not_called()
+        _stop_log_rotation.assert_not_called()
+        _selected_snap.stop.assert_not_called()
+
+        # Full teardown (no units remain): release the storage by stopping the
         # background processes and all the snap services.
+        _planned_units.return_value = 0
         for storage_name in ("archive", "data", "logs", "temp"):
             _stop_observer.reset_mock()
             _stop_log_rotation.reset_mock()