Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions src/charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -413,6 +413,10 @@ def __init__(self, *args):
# incorrectly
# https://canonical-charm-refresh.readthedocs-hosted.com/latest/add-to-charm/status/#implementation
self.framework.observe(self.on.collect_unit_status, self._reconcile_refresh_status)
for storage_name in self.meta.storages:
self.framework.observe(
self.on[storage_name].storage_detaching, self._on_storage_detaching
)
self.cluster_name = self.app.name
self._member_name = self.unit.name.replace("/", "-")
self._certs_path = "/usr/local/share/ca-certificates"
Expand Down Expand Up @@ -2563,6 +2567,30 @@ def _install_snap_package(
)
raise

def _on_storage_detaching(self, _) -> None:
"""Release the storage so Juju can unmount it on unit teardown.

Only act when the whole application is going away (``planned_units == 0``,
i.e. remove-application/destroy-model), which is where the storage unmount
actually hangs. On scale-down the surviving units still need this unit's
Patroni reachable to remove it from the raft cluster, so stopping the
workload here would break primary election.

``storage-detaching`` runs before ``stop``, so this is the earliest hook
where the snap services and background processes that keep the storage
mounts busy can be stopped.
"""
if self.app.planned_units() > 0:
return
self._observer.stop_observer()
self._rotate_logs.stop_log_rotation()
# Stop every charmed-postgresql snap service (PostgreSQL, pgBackRest, the
# exporters) so the snap releases the storage mounts before Juju unmounts them.
try:
snap.SnapCache()[charm_refresh.snap_name()].stop()
except snap.SnapError:
logger.exception("Failed to stop charmed-postgresql snap services")

def _is_storage_attached(self) -> bool:
"""Returns if storage is attached."""
try:
Expand Down
12 changes: 12 additions & 0 deletions src/rotate_logs.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import logging
import os
import signal
import subprocess
from typing import TYPE_CHECKING

Expand Down Expand Up @@ -56,3 +57,14 @@ def start_log_rotation(self):

self._charm.unit_peer_data.update({"rotate-logs-pid": f"{pid}"})
logging.info(f"Started rotate logs process with PID {pid}")

def stop_log_rotation(self) -> None:
"""Stop the running rotate logs process if we have previously started it."""
if stored_pid := self._charm.unit_peer_data.get("rotate-logs-pid"):
pid = int(stored_pid)
try:
os.kill(pid, signal.SIGINT)
logging.info(f"Stopped rotate logs process with PID {pid}")
self._charm.unit_peer_data.update({"rotate-logs-pid": ""})
except OSError:
pass
3 changes: 2 additions & 1 deletion tests/integration/adapters.py
Original file line number Diff line number Diff line change
Expand Up @@ -379,7 +379,8 @@ def destroy_unit(
def list_storage(self, filesystem: bool = False, volume: bool = False) -> list[TStorageInfo]:
"""Lists storage details."""
raw = self._juju.cli("list-storage", "--format", "json")
json_ = json.loads(raw)
# Juju 4.0 prints nothing (not `{}`) for a model with no storage.
json_ = json.loads(raw) if raw.strip() else {}
ret = []
for storage_key, storage_details in json_.get("storage", {}).items():
ret.append({"key": storage_key, **storage_details})
Expand Down
54 changes: 54 additions & 0 deletions tests/integration/test_storage_detaching.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#!/usr/bin/env python3
# Copyright 2025 Canonical Ltd.
# See LICENSE file for licensing details.

import logging

import pytest

from .adapters import JujuFixture
from .jubilant_helpers import DATABASE_APP_NAME

logger = logging.getLogger(__name__)

# rootfs (machine-scoped) storage is the issue's scenario. It matters:
# - Juju 3.6 masks the bug via a cleanup_storage shortcut that removes still-Dying
# storage, so this only reproduces on Juju 4.0 (see tests/spread/.../task.yaml).
# - On 4.0 the detachable ``lxd`` pool lets Juju destroy the container before
# detaching, so only ``rootfs`` reproduces the stuck unmount.
ROOTFS_STORAGE = dict.fromkeys(("archive", "data", "logs", "temp"), "rootfs")


@pytest.mark.abort_on_fail
def test_storage_released_on_removal(juju: JujuFixture, charm):
"""Graceful removal must release the storage so teardown completes.

Regression test for canonical/postgresql-operator#1550: without stopping the
workload in the ``storage-detaching`` hook the charmed-postgresql snap keeps the
storage mounts busy, so Juju's unmount fails ("target is busy") and the unit,
storage and machine removal hangs forever.

Pinned to Juju 4.0 with rootfs storage by its spread task. ``force=True`` is
required because the charm still declares ``assumes: juju < 4``.
"""
juju.ext.model.deploy(
charm,
num_units=1,
config={"profile": "testing"},
storage=ROOTFS_STORAGE,
force=True,
)
juju.ext.model.wait_for_idle(apps=[DATABASE_APP_NAME], status="active")

logger.info("Removing %s and waiting for a clean teardown", DATABASE_APP_NAME)
juju.ext.model.remove_application(
DATABASE_APP_NAME,
block_until_done=True,
destroy_storage=True,
timeout=15 * 60,
)

detaching = [
storage for storage in juju.ext.model.list_storage() if storage["life"] == "detaching"
]
assert not detaching, f"storage stuck detaching after removal: {detaching}"
13 changes: 13 additions & 0 deletions tests/spread/test_storage_detaching.py/task.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
summary: test_storage_detaching.py
environment:
TEST_MODULE: test_storage_detaching.py
# #1550 only reproduces on Juju 4.0: 3.6 masks it via a cleanup_storage shortcut
# that removes still-Dying storage. Pin this task to a 4.0 variant and drop the
# default juju36 variant so it runs ONLY on 4.0 (the rest of the suite stays 3.6).
CONCIERGE_JUJU_CHANNEL/juju40: 4.0/stable
variants:
- -juju36
execute: |
tox run -e integration -- "tests/integration/$TEST_MODULE" --model testing --alluredir="$SPREAD_TASK/allure-results"
artifacts:
- allure-results
35 changes: 35 additions & 0 deletions tests/unit/test_charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,41 @@ def test_on_install(harness):
assert isinstance(harness.model.unit.status, WaitingStatus)


def test_on_storage_detaching(harness):
with (
patch("charm.snap.SnapCache") as _snap_cache,
patch("charm.ClusterTopologyObserver.stop_observer") as _stop_observer,
patch("charm.RotateLogs.stop_log_rotation") as _stop_log_rotation,
patch.object(harness.charm.app, "planned_units") as _planned_units,
):
_selected_snap = _snap_cache.return_value.__getitem__.return_value

# Scale-down (units remain): the surviving cluster still needs this unit's
# Patroni reachable to remove it from raft, so the workload must NOT be
# stopped here.
_planned_units.return_value = 2
storage_id = harness.add_storage("data", attach=True)[0]
harness.detach_storage(storage_id)
_stop_observer.assert_not_called()
_stop_log_rotation.assert_not_called()
_selected_snap.stop.assert_not_called()

# Full teardown (no units remain): release the storage by stopping the
# background processes and all the snap services.
_planned_units.return_value = 0
for storage_name in ("archive", "data", "logs", "temp"):
_stop_observer.reset_mock()
_stop_log_rotation.reset_mock()
_selected_snap.reset_mock()

storage_id = harness.add_storage(storage_name, attach=True)[0]
harness.detach_storage(storage_id)

_stop_observer.assert_called_once_with()
_stop_log_rotation.assert_called_once_with()
_selected_snap.stop.assert_called_once_with()


def test_patroni_scrape_config(harness):
result = harness.charm.patroni_scrape_config()

Expand Down
31 changes: 31 additions & 0 deletions tests/unit/test_rotate_logs.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Copyright 2024 Canonical Ltd.
# See LICENSE file for licensing details.
import signal
from unittest.mock import Mock, PropertyMock, patch

import pytest
Expand Down Expand Up @@ -96,3 +97,33 @@ def test_start_log_rotation_already_running(harness):
harness.charm.rotate_logs.start_log_rotation()
_kill.assert_called_once_with(1234, 0)
_popen.assert_called_once()


def test_stop_log_rotation(harness):
with (
patch("os.kill") as _kill,
patch.object(MockCharm, "_peers", new_callable=PropertyMock) as _peers,
):
# Nothing is done when the peer relation (and so the stored PID) is gone.
_peers.return_value = None
harness.charm.rotate_logs.stop_log_rotation()
_kill.assert_not_called()

# Nothing is done when there is no process running.
_peers.return_value = Mock(data={harness.charm.unit: {}})
harness.charm.rotate_logs.stop_log_rotation()
_kill.assert_not_called()

# The process is killed and the stored PID is cleared.
peer_data = {"rotate-logs-pid": "1"}
_peers.return_value = Mock(data={harness.charm.unit: peer_data})
harness.charm.rotate_logs.stop_log_rotation()
_kill.assert_called_once_with(1, signal.SIGINT)
assert peer_data["rotate-logs-pid"] == ""
_kill.reset_mock()

# A dead process doesn't break the script.
_peers.return_value = Mock(data={harness.charm.unit: {"rotate-logs-pid": "1"}})
_kill.side_effect = OSError
harness.charm.rotate_logs.stop_log_rotation()
_kill.assert_called_once_with(1, signal.SIGINT)
Loading