diff --git a/nerve/__init__.py b/nerve/__init__.py index d87ae80..c08ac23 100644 --- a/nerve/__init__.py +++ b/nerve/__init__.py @@ -1,3 +1,8 @@ """Nerve — Personal AI Assistant.""" +# Apply anyio hot-loop patch as early as possible — monkey-patches +# CancelScope._deliver_cancellation to prevent 100% CPU spins on +# unrecoverable cancellations. See nerve/_anyio_patch.py for details. +from nerve import _anyio_patch as _anyio_patch # noqa: F401 + __version__ = "0.1.0" diff --git a/nerve/_anyio_patch.py b/nerve/_anyio_patch.py new file mode 100644 index 0000000..7ee5091 --- /dev/null +++ b/nerve/_anyio_patch.py @@ -0,0 +1,189 @@ +"""Monkey-patch for anyio 4.13.0 _deliver_cancellation hot-loop bug. + +Upstream bug (anyio 4.13.0, _backends/_asyncio.py:582-616): + + for task in self._tasks: + should_retry = True # ← set unconditionally + if task._must_cancel: + continue + if task is not current and (task is self._host_task or _task_started(task)): + waiter = task._fut_waiter + if not isinstance(waiter, asyncio.Future) or not waiter.done(): + task.cancel(origin._cancel_reason) + ... + ... + if origin is self and should_retry: + self._cancel_handle = get_running_loop().call_soon( + self._deliver_cancellation, origin + ) + +``should_retry`` is set to True simply because there is *any* task in the +scope's ``_tasks`` set, regardless of whether cancellation could actually be +delivered. When every task in the scope is the *current* task (a scope that +contains only itself, or a TaskGroup whose only live member is running the +cancel) nothing gets ``task.cancel()``-ed, but the scheduler reschedules +``_deliver_cancellation`` via ``call_soon`` on every event-loop tick. Result: +one CPU core pinned at 100%, tens of thousands of ``epoll_pwait`` syscalls per +second, and no forward progress. + +We have seen this trigger at least three times (April 22, 23, and 24, 2026). +The SDK-side mitigation in ``nerve.agent.engine._safe_disconnect`` only runs +during ``client.disconnect()``, so spins originating elsewhere (telegram +polling, cron, an active SDK request hitting a broken pipe) are not covered. + +The fix below sets ``should_retry = True`` only when we *actually* called +``task.cancel()`` on a task that could still receive a cancellation. We +skip three categories of tasks that upstream blindly retries on: + +1. **Done tasks** (``task.done() is True``). Root cause of the April 24 + regression: if a task completes while the scope has ``cancel_called`` + and the scope never pops it out of ``_tasks``, every subsequent pass + sees ``_must_cancel=False`` (cleared when the task finished), + ``_task_started()=True``, ``_fut_waiter=None``. Our "waiter not done" + branch fires, ``task.cancel()`` is a no-op on a done task, and yet we + flagged ``should_retry=True`` anyway. Result: infinite ``call_soon`` + on a scope whose only inhabitant is already a corpse. We observed + three such zombie-scopes running simultaneously, all spinning at + ~55k epoll_pwait/sec combined (100% CPU, load 1.6, 60°C). + +2. **Tasks already flagged with ``_must_cancel``**. asyncio's + ``Task.__step`` checks the flag when the task next resumes and will + raise ``CancelledError`` without our help. Retrying in that case means + we re-queue ourselves on every event-loop tick while the task is + blocked (shielded, awaiting I/O). This was the second iteration of + the bug (April 23 evening). + +3. **The current task**, because a task cannot cancel itself from inside + a callback. (``current_task()`` is often ``None`` here because the + callback runs from ``call_soon`` outside any task context — the check + is a belt-and-suspenders guard for the other case.) + +Import this module once, before anyio is used (i.e. very early in the +process entry point — see ``nerve/__main__.py``). +""" + +from __future__ import annotations + +import asyncio +import logging +from asyncio import current_task +from asyncio import get_running_loop + +from anyio._backends import _asyncio as _anyio_asyncio + +logger = logging.getLogger(__name__) + +_APPLIED = False + + +def _patched_deliver_cancellation(self, origin): # type: ignore[no-untyped-def] + """Drop-in replacement for ``CancelScope._deliver_cancellation``. + + Semantics: ``should_retry`` is True **only when we actually called + ``task.cancel()`` on some task in this pass**. A task already flagged + with ``_must_cancel`` does not justify a retry — asyncio's + ``Task.__step`` checks the flag when the task next resumes and will + raise ``CancelledError`` without our help. Retrying in that case means + we re-queue ourselves on every event-loop tick while the task is + blocked (shielded, awaiting I/O, or — critically — is the *current + task* running this very callback), which is the hot loop we're trying + to kill. + """ + should_retry = False + current = current_task() + for task in self._tasks: + # Already finished. anyio doesn't always remove tasks from + # ``_tasks`` before the scope's cancel callback fires (done + # tasks can linger until the task group unwinds). ``task.cancel()`` + # is a no-op on a done task, so retrying is pure busy-loop. + # This is the root cause of the April 24, 2026 spin. + if task.done(): + continue + + # The current task is running this callback; it can't cancel + # itself from inside it. Whatever state it's in (including + # ``_must_cancel``) will be handled once we return and control + # flows back to ``Task.__step``. + if task is current: + continue + + # Already flagged for cancellation. asyncio will deliver the + # exception when the task resumes; no retry needed from us. + # (Upstream anyio 4.13.0 set should_retry=True here — that's the + # root-cause bug.) + if task._must_cancel: # type: ignore[attr-defined] + continue + + # The task is eligible for cancellation if it has started. + if task is self._host_task or _anyio_asyncio._task_started(task): + waiter = task._fut_waiter # type: ignore[attr-defined] + if not isinstance(waiter, asyncio.Future) or not waiter.done(): + task.cancel(origin._cancel_reason) + # We actually delivered a cancel — re-check next tick. + should_retry = True + if ( + task is origin._host_task + and origin._pending_uncancellations is not None + ): + origin._pending_uncancellations += 1 + + # Deliver cancellation to child scopes that aren't shielded or running + # their own cancellation callbacks. + for scope in self._child_scopes: + if not scope._shield and not scope.cancel_called: + should_retry = scope._deliver_cancellation(origin) or should_retry + + # Schedule another callback only if we actually did work this pass. + if origin is self: + if should_retry: + self._cancel_handle = get_running_loop().call_soon( + self._deliver_cancellation, origin + ) + else: + self._cancel_handle = None + + return should_retry + + +def apply() -> bool: + """Install the patch. Safe to call multiple times. + + Returns True if the patch was applied in this call, False if it was + already applied (or the target symbol is missing). + """ + global _APPLIED + if _APPLIED: + return False + + cancel_scope_cls = getattr(_anyio_asyncio, "CancelScope", None) + if cancel_scope_cls is None: + logger.warning( + "anyio patch: CancelScope not found in %s; patch NOT applied", + _anyio_asyncio.__name__, + ) + return False + + if not hasattr(cancel_scope_cls, "_deliver_cancellation"): + logger.warning( + "anyio patch: _deliver_cancellation not found on CancelScope; " + "patch NOT applied (anyio API changed?)" + ) + return False + + if not hasattr(_anyio_asyncio, "_task_started"): + logger.warning( + "anyio patch: _task_started helper not found; patch NOT applied" + ) + return False + + cancel_scope_cls._deliver_cancellation = _patched_deliver_cancellation + _APPLIED = True + logger.info( + "anyio patch applied: CancelScope._deliver_cancellation fixed " + "(prevents 100%% CPU spin on unrecoverable cancel)" + ) + return True + + +# Apply on import so anyone who imports this module gets the patch. +apply() diff --git a/tests/test_anyio_patch.py b/tests/test_anyio_patch.py new file mode 100644 index 0000000..45da9d7 --- /dev/null +++ b/tests/test_anyio_patch.py @@ -0,0 +1,245 @@ +"""Regression tests for the anyio _deliver_cancellation hot-loop patch. + +See nerve/_anyio_patch.py for the root-cause explanation. +""" + +from __future__ import annotations + +import asyncio + +import anyio +import pytest + +# Importing nerve applies the patch as a side effect of nerve/__init__.py. +import nerve # noqa: F401 +from nerve import _anyio_patch + + +def test_patch_is_applied(): + """The patch must be active after ``import nerve``.""" + from anyio._backends import _asyncio as anyio_asyncio + + assert _anyio_patch._APPLIED is True + assert ( + anyio_asyncio.CancelScope._deliver_cancellation.__module__ + == "nerve._anyio_patch" + ) + + +def test_apply_is_idempotent(): + """Calling apply() twice must not raise and must return False the 2nd time.""" + # Already applied from module import, so this should be a no-op. + assert _anyio_patch.apply() is False + + +@pytest.mark.asyncio +async def test_fail_after_still_works(): + """Cancellation via fail_after must still raise TimeoutError.""" + with pytest.raises(TimeoutError): + with anyio.fail_after(0.01): + await anyio.sleep(1) + + +@pytest.mark.asyncio +async def test_move_on_after_still_works(): + """Cancellation via move_on_after must exit cleanly.""" + with anyio.move_on_after(0.01) as scope: + await anyio.sleep(1) + assert scope.cancelled_caught + + +@pytest.mark.asyncio +async def test_task_group_cancellation_still_works(): + """Cancelling a task group must cancel its children.""" + cancelled = [] + + async def child(index): + try: + await anyio.sleep(10) + except asyncio.CancelledError: + cancelled.append(index) + raise + + async with anyio.create_task_group() as tg: + tg.start_soon(child, 0) + tg.start_soon(child, 1) + await anyio.sleep(0.01) + tg.cancel_scope.cancel() + + assert sorted(cancelled) == [0, 1] + + +@pytest.mark.asyncio +async def test_no_hot_loop_when_only_task_is_current(): + """Regression: a CancelScope whose only task is the current task must not + re-schedule ``_deliver_cancellation`` forever. + + This is the exact shape of the bug: ``should_retry`` used to be set to + True simply because ``self._tasks`` was non-empty, even when no task + could actually be cancelled. The scheduler then re-queued itself via + ``call_soon()`` on every event-loop tick, pinning one CPU core. + + After the patch, the first pass must leave ``_cancel_handle = None`` + because nothing was delivered and nothing is awaiting pickup. + """ + from anyio._backends._asyncio import CancelScope + + scope = CancelScope() + # Enter/exit manually so we have a host_task bound to the running task + # without letting anyio's normal finalization run cancellation for us. + current = asyncio.current_task() + scope._host_task = current + scope._cancel_called = True + scope._cancel_reason = "regression test" + # Force the "pathological" shape: only task in scope is the current task, + # so the upstream loop would spin forever without the patch. + scope._tasks = {current} + + loop = asyncio.get_running_loop() + # Drop any stray handle so we can assert cleanly against the post-state. + scope._cancel_handle = None + + should_retry = scope._deliver_cancellation(scope) + + assert should_retry is False, ( + "Patched _deliver_cancellation must not retry when the only task " + "in the scope is the current task (nothing to cancel)." + ) + assert scope._cancel_handle is None, ( + "Patched _deliver_cancellation must clear _cancel_handle when no " + "retry is needed — otherwise the hot loop returns." + ) + + # Sanity: event loop must not have a queued _deliver_cancellation + # callback parked in it. We can't easily introspect the ready queue, but + # we *can* confirm we didn't stash a handle on the scope ourselves. + assert not loop.is_closed() + + +@pytest.mark.asyncio +async def test_no_hot_loop_when_current_task_has_must_cancel(): + """Regression (April 23, 2026 evening): the first version of the patch + still spun when the *current task* already had ``_must_cancel=True``. + + In production this happens when a task that's about to be cancelled is + itself running the cancel callback (via a CancelScope it owns). The + previous patch unconditionally set ``should_retry = True`` whenever it + saw ``_must_cancel``, re-queuing ``_deliver_cancellation`` on every + event-loop tick. Result: ~20% CPU / ~61k epoll_pwait/sec, nerve kept + crowning the fan. + + The fix: skip the current task entirely, and only retry when we + actually called ``task.cancel()``. + """ + from anyio._backends._asyncio import CancelScope + + scope = CancelScope() + current = asyncio.current_task() + scope._host_task = current + scope._cancel_called = True + scope._cancel_reason = "regression test (must_cancel)" + scope._tasks = {current} + scope._cancel_handle = None + + # Simulate the production state: a task already flagged as + # "must cancel" is sitting in the scope. asyncio will deliver the + # CancelledError when the task next resumes — we must NOT re-queue + # ourselves in the meantime. + # + # The real ``_asyncio.Task._must_cancel`` attribute is read-only from + # Python, so we use a stub that mimics the shape the patch reads. + class _FakeTask: + def __init__(self): + self._must_cancel = True + self._fut_waiter = None + self._cancel_calls = 0 + + def done(self): + return False + + def cancel(self, *_args, **_kwargs): + self._cancel_calls += 1 + return True + + fake = _FakeTask() + scope._tasks = {fake} + + should_retry = scope._deliver_cancellation(scope) + + assert fake._cancel_calls == 0, ( + "Must not re-cancel a task already flagged with _must_cancel — " + "asyncio will deliver on resume." + ) + assert should_retry is False, ( + "Patched _deliver_cancellation must not retry on _must_cancel " + "alone. That was the April 23 regression: the callback re-queued " + "itself every tick while the task sat with _must_cancel=True, " + "burning ~20% CPU on 61k epoll_pwait/sec." + ) + assert scope._cancel_handle is None, ( + "No retry → no pending handle." + ) + + +@pytest.mark.asyncio +async def test_no_hot_loop_when_only_task_is_done(): + """Regression (April 24, 2026): a scope whose only task has already + ``done()==True`` must not re-queue ``_deliver_cancellation`` forever. + + Observed in production: anyio 4.13.0 does not always remove tasks from + ``CancelScope._tasks`` the instant they finish — a done task can linger + until the task group unwinds. For a done task ``_must_cancel`` is False + (cleared during final step), ``_task_started()`` is True, and + ``_fut_waiter`` is None. The previous patch therefore fell into the + "waiter not done → cancel()" branch, called ``task.cancel()`` (which is + a no-op on a done task), and still set ``should_retry=True``. Result: + a zombie-scope spinning ~55k epoll_pwait/sec per scope, pinning one + CPU core until the process was restarted. + + Three such zombies were found in a live process dump (scope IDs + 0x7ffec1774f50, 0x7ffec17ae090, 0x7ffec17ad6d0) — the live fix was to + ``discard()`` the done task and ``handle.cancel()`` the pending + ``call_soon``. This test guarantees the code-level fix holds. + """ + from anyio._backends._asyncio import CancelScope + + scope = CancelScope() + current = asyncio.current_task() + scope._host_task = current + scope._cancel_called = True + scope._cancel_reason = "regression test (done)" + scope._cancel_handle = None + + class _DoneTask: + """Mimics an asyncio.Task that has already completed.""" + + def __init__(self): + self._must_cancel = False # cleared when task finished + self._fut_waiter = None # done tasks have no waiter + self._cancel_calls = 0 + + def done(self): + return True + + def cancel(self, *_args, **_kwargs): + self._cancel_calls += 1 + return False # cancel() returns False on done tasks + + done_task = _DoneTask() + scope._tasks = {done_task} + + should_retry = scope._deliver_cancellation(scope) + + assert should_retry is False, ( + "Patched _deliver_cancellation must not retry on done tasks. " + "Retrying on a done task is the April 24 hot-loop: cancel() is a " + "no-op, but the callback keeps re-queuing itself every tick." + ) + assert done_task._cancel_calls == 0, ( + "Must not call task.cancel() on a done task — it's a no-op that " + "only exists to flag should_retry=True." + ) + assert scope._cancel_handle is None, ( + "No retry → no pending handle. Otherwise the zombie-scope spin " + "returns on the next rebuild." + )