From 76eee494aeb99495d6101765278fb3d7f8bc1c6a Mon Sep 17 00:00:00 2001 From: Frode Nordahl Date: Thu, 11 Jun 2026 14:40:10 +0000 Subject: [PATCH] Work around Juju unit error after reboot by awaiting symlink recreation After enable_hugepages_vfio_on_hvs_in_vms reboots a VM, Juju may still be recreating symlinks for unit tools when the first config-changed hook fires, causing the unit to enter an error state (LP: #2077936). Add a _wait_for_juju_symlinks_after_reboot method that monitors /var/log/juju/machine-*.log for both the Reboot and symlink recreation messages before allowing execution to continue. This is called after enable_hugepages returns from rebooting, both on the success and error-recovery paths. Assisted-by: GLM-5.1, opencode Signed-off-by: Frode Nordahl --- unit_tests/charm_tests/test_utils.py | 38 ++++++++++++++++++++++++ zaza/openstack/charm_tests/test_utils.py | 35 ++++++++++++++++++++++ 2 files changed, 73 insertions(+) diff --git a/unit_tests/charm_tests/test_utils.py b/unit_tests/charm_tests/test_utils.py index c89c01e0d..34c5ad65c 100644 --- a/unit_tests/charm_tests/test_utils.py +++ b/unit_tests/charm_tests/test_utils.py @@ -217,6 +217,7 @@ def test_enable_hugepages_vfio_on_hvs_in_vms(self): self.patch_object(test_utils.zaza.utilities.machine_os, 'enable_vfio_unsafe_noiommu_mode') self.patch_object(test_utils.model, 'wait_for_application_states') + self.patch_target('_wait_for_juju_symlinks_after_reboot') nr_hugepages = 4 unit = mock.MagicMock() @@ -239,6 +240,8 @@ def test_enable_hugepages_vfio_on_hvs_in_vms(self): unit, nr_hugepages, model_name=self.target.model_name) + self._wait_for_juju_symlinks_after_reboot.assert_called_once_with( + unit.name) self.enable_vfio_unsafe_noiommu_mode.assert_called_once_with( unit, model_name=self.target.model_name) @@ -256,6 +259,7 @@ def test_enable_hugepages_vfio_on_hvs_in_vms_kvm_kernel(self): self.patch_object(test_utils.zaza.utilities.machine_os, 'enable_vfio_unsafe_noiommu_mode') self.patch_object(test_utils.model, 'wait_for_application_states') + self.patch_target('_wait_for_juju_symlinks_after_reboot') nr_hugepages = 4 unit = mock.MagicMock() @@ -286,6 +290,8 @@ def test_enable_hugepages_vfio_on_hvs_in_vms_kvm_kernel(self): unit, nr_hugepages, model_name=self.target.model_name) + self._wait_for_juju_symlinks_after_reboot.assert_called_once_with( + unit.name) self.enable_vfio_unsafe_noiommu_mode.assert_called_once_with( unit, model_name=self.target.model_name) @@ -310,6 +316,7 @@ def test_enable_hugepages_vfio_on_hvs_in_vms_recover_unit_error(self): 'enable_vfio_unsafe_noiommu_mode') self.patch_object(test_utils.model, 'wait_for_application_states') self.patch_object(test_utils.model, 'resolve_units') + self.patch_target('_wait_for_juju_symlinks_after_reboot') nr_hugepages = 4 unit = mock.MagicMock() @@ -350,6 +357,37 @@ def test_enable_hugepages_vfio_on_hvs_in_vms_recover_unit_error(self): ] ) + def test_wait_for_juju_symlinks_after_reboot(self): + """Test waiting for Juju symlink recreation after reboot.""" + self.patch_object(test_utils.zaza.utilities.juju, 'remote_run') + self.target.model_name = 'zaza-123' + + self.target._wait_for_juju_symlinks_after_reboot('ovn-chassis/0') + + escaped_unit = 'ovn-chassis-0' + expected_cmd = ( + "grep -qPz '(?s)Reboot.*?\\n.*?symlinks.*{}' " + "/var/log/juju/machine-*.log".format(escaped_unit)) + self.remote_run.assert_called_once_with( + 'ovn-chassis/0', + expected_cmd, + model_name='zaza-123', + fatal=True) + + def test_wait_for_juju_symlinks_after_reboot_retries(self): + """Test that waiting for symlinks retries on failure.""" + self.patch_object(test_utils.zaza.utilities.juju, 'remote_run') + self.target.model_name = 'zaza-123' + + self.remote_run.side_effect = [ + test_utils.zaza.model.CommandRunFailed('grep', {'Code': '1'}), + None, + ] + + self.target._wait_for_juju_symlinks_after_reboot('ovn-chassis/0') + + self.assertEqual(self.remote_run.call_count, 2) + class TestOpenStackBaseTest(ut_utils.BaseTestCase): diff --git a/zaza/openstack/charm_tests/test_utils.py b/zaza/openstack/charm_tests/test_utils.py index bb6536cf9..7c5ed9040 100644 --- a/zaza/openstack/charm_tests/test_utils.py +++ b/zaza/openstack/charm_tests/test_utils.py @@ -719,6 +719,39 @@ def assert_unit_cpu_topology(self, unit, nr_1g_hugepages): mbtotal, self.assert_unit_cpu_topology.__doc__)) + def _wait_for_juju_symlinks_after_reboot(self, unit_name): + """Wait for Juju to recreate tool symlinks after a machine reboot. + + After a reboot triggered by Juju, symlinks for unit tools may not be + recreated until slightly after the reboot completes. Interacting with + a unit before symlink recreation finishes may cause the unit to enter + an error state (ref: https://launchpad.net/bugs/2077936). + + This function monitors the Juju machine log on the unit for evidence + that both the reboot and subsequent symlink recreation for the given + unit have been recorded, indicating it is safe to continue. + + :param unit_name: Name of the unit to wait for (e.g. 'ovn-chassis/0') + :type unit_name: str + """ + escaped_unit = unit_name.replace('/', '-') + grep_cmd = ( + 'grep -qPz \'(?s)Reboot.*?\\n.*?symlinks.*{}\' ' + '/var/log/juju/machine-*.log'.format(escaped_unit)) + logging.info( + 'Waiting for Juju reboot and symlink recreation to be ' + 'logged for {}'.format(unit_name)) + for attempt in tenacity.Retrying( + stop=tenacity.stop_after_attempt(30), + wait=tenacity.wait_exponential(multiplier=1, min=2, max=30)): + with attempt: + zaza.utilities.juju.remote_run( + unit_name, grep_cmd, + model_name=self.model_name, fatal=True) + logging.info( + 'Juju reboot and symlink recreation confirmed ' + 'for {}'.format(unit_name)) + def enable_hugepages_vfio_on_hvs_in_vms(self, nr_1g_hugepages): """Enable hugepages and unsafe VFIO NOIOMMU on virtual hypervisors.""" for unit in model.get_units( @@ -755,12 +788,14 @@ def enable_hugepages_vfio_on_hvs_in_vms(self, nr_1g_hugepages): try: zaza.utilities.machine_os.enable_hugepages( unit, nr_1g_hugepages, model_name=self.model_name) + self._wait_for_juju_symlinks_after_reboot(unit.name) except zaza.model.UnitError: logging.warn(f'Unit {unit.name} went into error state during' ' huge pages enablement. Attempting to recover.' ' Possible cause:' ' https://bugs.launchpad.net/juju/+bug/2077936') zaza.model.resolve_units() + self._wait_for_juju_symlinks_after_reboot(unit.name) try: logging.info('Enabling unsafe VFIO NOIOMMU mode on {}'