From 54a694d452d403f8d15583dd059b8e4c3f4f7511 Mon Sep 17 00:00:00 2001 From: goodboy Date: Tue, 9 Jun 2026 20:15:31 -0400 Subject: [PATCH 001/110] Open py-version range + harness gate for py3.14 backends (#379) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Prep for a future sub-interpreter (PEP 734 `concurrent.interpreters`) spawn backend per issue #379; for now only the test-harness error-gating lands; the backend itself comes later. Deats, - bump `pyproject.toml` `requires-python` to `>=3.12, <3.15` and list the `3.14` classifier — the new stdlib `concurrent.interpreters` module only ships on 3.14 - `_testing.pytest.pytest_configure` wraps `try_set_start_method()` in a `pytest.UsageError` handler so an unsupported `--spawn-backend` on the running py-version prints a clean banner instead of a traceback (this patch was generated in some part by [`claude-code`][claude-code-gh]) [claude-code-gh]: https://github.com/anthropics/claude-code (cherry picked from commit d318f1f8f4f6a056a8ca67d7f15e4e2a72507ea3) (factored: kept only the pyproject + `_testing/pytest.py` parts of "Add `'subint'` spawn backend scaffold (#379)"; dropped tractor/spawn/_spawn.py + tractor/spawn/_subint.py) --- tractor/_testing/pytest.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tractor/_testing/pytest.py b/tractor/_testing/pytest.py index 1d803c9e4..c33406ff4 100644 --- a/tractor/_testing/pytest.py +++ b/tractor/_testing/pytest.py @@ -227,7 +227,13 @@ def pytest_addoption( def pytest_configure(config): backend = config.option.spawn_backend from tractor.spawn._spawn import try_set_start_method - try_set_start_method(backend) + try: + try_set_start_method(backend) + except RuntimeError as err: + # e.g. `--spawn-backend=subint` on Python < 3.14 — turn the + # runtime gate error into a clean pytest usage error so the + # suite exits with a helpful banner instead of a traceback. 
+ raise pytest.UsageError(str(err)) from err # register custom marks to avoid warnings see, # https://docs.pytest.org/en/stable/how-to/writing_plugins.html#registering-custom-markers From 4b63b7f1cc5f8ffba5cc0a39b660db989e434fdd Mon Sep 17 00:00:00 2001 From: goodboy Date: Fri, 17 Apr 2026 13:26:19 -0400 Subject: [PATCH 002/110] Handle py3.14+ incompats as test skips Since we're devving subints we require the 3.14+ stdlib API and a couple compiled libs don't support it yet, namely: - `cffi`, which we're only using for the `.ipc._linux` eventfd stuff (now factored into `hotbaud` anyway). - `greenback`, which requires `greenlet` which doesn't seem to be wheeled yet * on nixos the sdist build was failing due to lack of `g++` which i don't care to figure out rn since we don't need `.devx` stuff immediately for this subints prototype. * [ ] we still need to adjust any dependent suites to skip. Adjust `test_ringbuf` to skip on import failure. Also project wide, - pin us to py 3.13+ in prep for last-2-minor-version policy. - drop `msgspec>=0.20.0`, the first release with py3.14 support. (cherry picked from commit d2ea8aa2deb681faf7986759ff822579ab9ee22f) --- tests/test_ringbuf.py | 3 +++ tractor/ipc/_linux.py | 14 ++++++++++++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/tests/test_ringbuf.py b/tests/test_ringbuf.py index 56c4eae8b..e55a87b90 100644 --- a/tests/test_ringbuf.py +++ b/tests/test_ringbuf.py @@ -21,6 +21,9 @@ # XXX, in case you want to melt your cores, comment this skip line XD pytestmark = pytest.mark.skip +# XXX `cffi` dun build on py3.14 yet.. 
+cffi = pytest.importorskip("cffi") + @tractor.context async def child_read_shm( diff --git a/tractor/ipc/_linux.py b/tractor/ipc/_linux.py index 88d80d1c1..cd7de870f 100644 --- a/tractor/ipc/_linux.py +++ b/tractor/ipc/_linux.py @@ -17,10 +17,20 @@ Linux specifics, for now we are only exposing EventFD ''' -import os import errno +import os +import sys + +try: + import cffi +except ImportError as ie: + if sys.version_info >= (3, 14): + ie.add_note( + f'The `cffi` pkg has no 3.14 support yet.\n' + ) + + raise ie -import cffi import trio ffi = cffi.FFI() From f7dfd37df45e29d4f38bd232e49f9ad37b4f577d Mon Sep 17 00:00:00 2001 From: goodboy Date: Tue, 9 Jun 2026 20:16:02 -0400 Subject: [PATCH 003/110] Extract `_actor_child_main()` as shared child entry Pull the `_child.py` `__main__` block body out into a callable `_actor_child_main()` so alternate spawn backends can bootstrap a subactor without going through the CLI entrypoint. Deats, - new `_actor_child_main(uid, loglevel, parent_addr, infect_asyncio, spawn_method='trio')` holds the full child-side runtime startup previously inlined under `if __name__ == '__main__':` - `__main__` block reduces to arg-parsing + a call into the new func - add `"subint"` to the `_runtime.py` spawn-method check so a child accepts `SpawnSpec` from that (future) backend; inert str-compare w/o it (this patch was generated in some part by [`claude-code`][claude-code-gh]) [claude-code-gh]: https://github.com/anthropics/claude-code (cherry picked from commit b8f243e98d38688131d56397a87e3e56106bf4a9) (factored: kept only the `_child.py`/`_runtime.py` entry-extraction parts of "Impl min-viable `subint` spawn backend (B.2)"; dropped tractor/spawn/_subint.py + subint prompt-io logs) --- tractor/_child.py | 52 ++++++++++++++++++++++++++++++------- tractor/runtime/_runtime.py | 2 +- 2 files changed, 43 insertions(+), 11 deletions(-) diff --git a/tractor/_child.py b/tractor/_child.py index c61cdec3f..727a5054a 100644 --- a/tractor/_child.py +++ 
b/tractor/_child.py @@ -15,16 +15,23 @@ # along with this program. If not, see <https://www.gnu.org/licenses/>. """ -This is the "bootloader" for actors started using the native trio backend. +The "bootloader" for sub-actors spawned via the native `trio` +backend (the default `python -m tractor._child` CLI entry) and +the in-process `subint` backend (`tractor.spawn._subint`). """ +from __future__ import annotations import argparse - from ast import literal_eval +from typing import TYPE_CHECKING from .runtime._runtime import Actor from .spawn._entry import _trio_main +if TYPE_CHECKING: + from .discovery._addr import UnwrappedAddress + from .spawn._spawn import SpawnMethodKey + def parse_uid(arg): name, uuid = literal_eval(arg) # ensure 2 elements @@ -39,6 +46,36 @@ def parse_ipaddr(arg): return arg +def _actor_child_main( + uid: tuple[str, str], + loglevel: str | None, + parent_addr: UnwrappedAddress | None, + infect_asyncio: bool, + spawn_method: SpawnMethodKey = 'trio', + +) -> None: + ''' + Construct the child `Actor` and dispatch to `_trio_main()`. + + Shared entry shape used by both the `python -m tractor._child` + CLI (trio/mp subproc backends) and the `subint` backend, which + invokes this from inside a fresh `concurrent.interpreters` + sub-interpreter via `Interpreter.call()`. 
+ + ''' + subactor = Actor( + name=uid[0], + uuid=uid[1], + loglevel=loglevel, + spawn_method=spawn_method, + ) + _trio_main( + subactor, + parent_addr=parent_addr, + infect_asyncio=infect_asyncio, + ) + + if __name__ == "__main__": __tracebackhide__: bool = True @@ -49,15 +86,10 @@ def parse_ipaddr(arg): parser.add_argument("--asyncio", action='store_true') args = parser.parse_args() - subactor = Actor( - name=args.uid[0], - uuid=args.uid[1], + _actor_child_main( + uid=args.uid, loglevel=args.loglevel, - spawn_method="trio" - ) - - _trio_main( - subactor, parent_addr=args.parent_addr, infect_asyncio=args.asyncio, + spawn_method='trio', ) diff --git a/tractor/runtime/_runtime.py b/tractor/runtime/_runtime.py index bee9e20d4..0c25b9262 100644 --- a/tractor/runtime/_runtime.py +++ b/tractor/runtime/_runtime.py @@ -870,7 +870,7 @@ async def _from_parent( accept_addrs: list[UnwrappedAddress]|None = None - if self._spawn_method == "trio": + if self._spawn_method in ("trio", "subint"): # Receive post-spawn runtime state from our parent. spawnspec: msgtypes.SpawnSpec = await chan.recv() From 253b210bcd5dce3783c5b0c2c36b5de52b537e89 Mon Sep 17 00:00:00 2001 From: goodboy Date: Sat, 18 Apr 2026 18:28:15 -0400 Subject: [PATCH 004/110] Add `._debug_hangs` to `.devx` for hang triage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bottle up the diagnostic primitives that actually cracked the silent mid-suite hangs in the `subint` spawn-backend bringup (issue there" session has them on the shelf instead of reinventing from scratch. Deats, - `dump_on_hang(seconds, *, path)` — context manager wrapping `faulthandler.dump_traceback_later()`. 
Critical gotcha baked in: dumps go to a *file*, not `sys.stderr`, bc pytest's stderr capture silently eats the output and you can spend an hour convinced you're looking at the wrong thing - `track_resource_deltas(label, *, writer)` — context manager logging per-block `(threading.active_count(), len(_interpreters.list_all()))` deltas; quickly rules out leak-accumulation theories when a suite progressively worsens (if counts don't grow, it's not a leak, look for a race on shared cleanup instead) - `resource_delta_fixture(*, autouse, writer)` — factory returning a `pytest` fixture wrapping `track_resource_deltas` per-test; opt in by importing into a `conftest.py`. Kept as a factory (not a bare fixture) so callers own `autouse` / `writer` wiring Also, - export the three names from `tractor.devx` - dep-free on py<3.13 (swallows `ImportError` for `_interpreters`) - link back to the provenance in the module docstring (issue #379 / commit `26fb820`) (this patch was generated in some part by [`claude-code`][claude-code-gh]) [claude-code-gh]: https://github.com/anthropics/claude-code (cherry picked from commit 09466a1e9dc548040df0bddb5aed96402c1d0a19) --- tractor/devx/__init__.py | 5 + tractor/devx/_debug_hangs.py | 227 +++++++++++++++++++++++++++++++++++ 2 files changed, 232 insertions(+) create mode 100644 tractor/devx/_debug_hangs.py diff --git a/tractor/devx/__init__.py b/tractor/devx/__init__.py index 80c6744f9..6b681d985 100644 --- a/tractor/devx/__init__.py +++ b/tractor/devx/__init__.py @@ -41,6 +41,11 @@ pformat_caller_frame as pformat_caller_frame, pformat_boxed_tb as pformat_boxed_tb, ) +from ._debug_hangs import ( + dump_on_hang as dump_on_hang, + track_resource_deltas as track_resource_deltas, + resource_delta_fixture as resource_delta_fixture, +) # TODO, move this to a new `.devx._pdbp` mod? 
diff --git a/tractor/devx/_debug_hangs.py b/tractor/devx/_debug_hangs.py new file mode 100644 index 000000000..1ac66f942 --- /dev/null +++ b/tractor/devx/_debug_hangs.py @@ -0,0 +1,227 @@ +# tractor: structured concurrent "actors". +# Copyright 2018-eternity Tyler Goodlet. + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. + +''' +Hang-diagnostic helpers for concurrent / multi-interpreter code. + +Collected from the `subint` spawn backend bringup (issue #379) +where silent test-suite hangs needed careful teardown +instrumentation to diagnose. This module bottles up the +techniques that actually worked so future hangs are faster +to corner. + +Two primitives: + +1. `dump_on_hang()` — context manager wrapping + `faulthandler.dump_traceback_later()` with the critical + gotcha baked in: write the dump to a **file**, not + `sys.stderr`. Under `pytest` (and any other output + capturer) stderr gets swallowed and the dump is easy to + miss — burning hours convinced you're looking at the wrong + thing. + +2. `track_resource_deltas()` — context manager (+ optional + autouse-fixture factory) logging per-block deltas of + `threading.active_count()` and — if running on py3.13+ — + `len(_interpreters.list_all())`. Lets you quickly rule out + leak-accumulation theories when a suite hangs more + frequently as it progresses (if counts don't grow, it's + not a leak; look for a race on shared cleanup instead). 
+ +See issue #379 / commit `26fb820` for the worked example. + +''' +from __future__ import annotations +import faulthandler +import sys +import threading +from contextlib import contextmanager +from pathlib import Path +from typing import ( + Callable, + Iterator, +) + +try: + import _interpreters # type: ignore +except ImportError: + _interpreters = None # type: ignore + + +__all__ = [ + 'dump_on_hang', + 'track_resource_deltas', + 'resource_delta_fixture', +] + + +@contextmanager +def dump_on_hang( + seconds: float = 30.0, + *, + path: str | Path = '/tmp/tractor_hang.dump', + all_threads: bool = True, + +) -> Iterator[str]: + ''' + Arm `faulthandler` to dump all-thread tracebacks to + `path` after `seconds` if the with-block hasn't exited. + + *Writes to a file, not stderr* — `pytest`'s stderr + capture silently eats stderr-destined `faulthandler` + output, and the same happens under any framework that + redirects file-descriptors. Pointing the dump at a real + file sidesteps that. + + Yields the resolved file path so it's easy to read back. + + Example + ------- + :: + + from tractor.devx import dump_on_hang + + def test_hang(): + with dump_on_hang( + seconds=15, + path='/tmp/my_test_hang.dump', + ) as dump_path: + trio.run(main) + # if it hangs, inspect dump_path afterward + + ''' + dump_path = Path(path) + f = dump_path.open('w') + try: + faulthandler.dump_traceback_later( + seconds, + repeat=False, + file=f, + exit=False, + ) + try: + yield str(dump_path) + finally: + faulthandler.cancel_dump_traceback_later() + finally: + f.close() + + +def _snapshot() -> tuple[int, int]: + ''' + Return `(thread_count, subint_count)`. + + Subint count reported as `0` on pythons lacking the + private `_interpreters` stdlib module (i.e. py<3.13). 
+ + ''' + threads: int = threading.active_count() + subints: int = ( + len(_interpreters.list_all()) + if _interpreters is not None + else 0 + ) + return threads, subints + + +@contextmanager +def track_resource_deltas( + label: str = '', + *, + writer: Callable[[str], None] | None = None, + +) -> Iterator[tuple[int, int]]: + ''' + Log `(threads, subints)` deltas across the with-block. + + `writer` defaults to `sys.stderr.write` (+ trailing + newline); pass a custom callable to route elsewhere + (e.g., a log handler or an append-to-file). + + Yields the pre-entry snapshot so callers can assert + against the expected counts if they want. + + Example + ------- + :: + + from tractor.devx import track_resource_deltas + + async def test_foo(): + with track_resource_deltas(label='test_foo'): + async with tractor.open_nursery() as an: + ... + + # Output: + # test_foo: threads 2->2, subints 1->1 + + ''' + before = _snapshot() + try: + yield before + finally: + after = _snapshot() + msg: str = ( + f'{label}: ' + f'threads {before[0]}->{after[0]}, ' + f'subints {before[1]}->{after[1]}' + ) + if writer is None: + sys.stderr.write(msg + '\n') + sys.stderr.flush() + else: + writer(msg) + + +def resource_delta_fixture( + *, + autouse: bool = True, + writer: Callable[[str], None] | None = None, + +) -> Callable: + ''' + Factory returning a `pytest` fixture that wraps each test + in `track_resource_deltas(label=<test-name>)`. + + Usage in a `conftest.py`:: + + # tests/conftest.py + from tractor.devx import resource_delta_fixture + + track_resources = resource_delta_fixture() + + or opt-in per-test:: + + track_resources = resource_delta_fixture(autouse=False) + + def test_foo(track_resources): + ... + + Kept as a factory (not a bare fixture) so callers control + `autouse` / `writer` without having to subclass or patch. 
+ + ''' + import pytest # deferred: only needed when caller opts in + + @pytest.fixture(autouse=autouse) + def _track_resources(request): + with track_resource_deltas( + label=request.node.name, + writer=writer, + ): + yield + + return _track_resources From fd4caa19036f5677b528bb6400ba2d9d0e22427b Mon Sep 17 00:00:00 2001 From: goodboy Date: Tue, 9 Jun 2026 20:18:44 -0400 Subject: [PATCH 005/110] Arm `dump_on_hang` on `test_stale_entry_is_deleted` Wrap the test's `trio.run(main)` in `dump_on_hang(seconds=20)` so any future hang regression captures a stack dump for triage instead of wedging CI silently; under the default backends it's a no-op safety net. Includes a "KNOWN ISSUE" comment block documenting the (future) `subint` backend hang classes observed against this test during Phase B bringup (#379). (this patch was generated in some part by [`claude-code`][claude-code-gh]) [claude-code-gh]: https://github.com/anthropics/claude-code (cherry picked from commit 4a3254583b1168382314fdad1b268c2498bbf729) (factored: kept only the tests/discovery/test_registrar.py part of "Doc `subint` backend hang classes + arm `dump_on_hang`"; dropped subint conc-anal docs + tests/test_subint_cancellation.py) --- tests/discovery/test_registrar.py | 52 ++++++++++++++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git a/tests/discovery/test_registrar.py b/tests/discovery/test_registrar.py index 60b2b10c4..6f34b1177 100644 --- a/tests/discovery/test_registrar.py +++ b/tests/discovery/test_registrar.py @@ -14,6 +14,7 @@ import pytest import subprocess import tractor +from tractor.devx import dump_on_hang from tractor.trionics import collapse_eg from tractor._testing import tractor_test from tractor.discovery._addr import wrap_address @@ -562,4 +563,53 @@ async def main(): await ptl.cancel_actor() await an.cancel() - trio.run(main) + # TODO, remove once the `[subint]` variant no longer hangs. 
+ # + # Status (as of Phase B hard-kill landing): + # + # - `[trio]`/`[mp_*]` variants: complete normally; `dump_on_hang` + # is a no-op safety net here. + # + # - `[subint]` variant: hangs indefinitely AND is un-Ctrl-C-able. + # `strace -p <pid>` while in the hang reveals a silently- + # dropped SIGINT — the C signal handler tries to write the + # signum byte to Python's signal-wakeup fd and gets `EAGAIN`, + # meaning the pipe is full (nobody's draining it). + # + # Root-cause chain: our hard-kill in `spawn._subint` abandoned + # the driver OS-thread (which is `daemon=True`) after the soft- + # kill timeout, but the *sub-interpreter* inside that thread is + # still running `trio.run()` — `_interpreters.destroy()` can't + # force-stop a running subint (raises `InterpreterError`), and + # legacy-config subints share the main GIL. The abandoned subint + # starves the parent's trio event loop from iterating often + # enough to drain its wakeup pipe → SIGINT silently drops. + # + # This is structurally a CPython-level limitation: there's no + # public force-destroy primitive for a running subint. We + # escape on the harness side via a SIGINT-loop in the `daemon` + # fixture teardown (killing the bg registrar subproc closes its + # end of the IPC, which eventually unblocks a recv in main trio, + # which lets the loop drain the wakeup pipe). Long-term fix path: + # msgspec PEP 684 support (jcrist/msgspec#563) → isolated-mode + # subints with per-interp GIL. + # + # Full analysis: + # `ai/conc-anal/subint_sigint_starvation_issue.md` + # + # See also the *sibling* hang class documented in + # `ai/conc-anal/subint_cancel_delivery_hang_issue.md` — same + # subint backend, different root cause (Ctrl-C-able hang, main + # trio loop iterating fine; ours to fix, not CPython's). + # Reproduced by `tests/test_subint_cancellation.py + # ::test_subint_non_checkpointing_child`. 
+ # + # Kept here (and not behind a `pytestmark.skip`) so we can still + # inspect the dump file if the hang ever returns after a refactor. + # `pytest`'s stderr capture eats `faulthandler` output otherwise, + # so we route `dump_on_hang` to a file. + with dump_on_hang( + seconds=20, + path=f'/tmp/test_stale_entry_is_deleted_{start_method}.dump', + ): + trio.run(main) From b2cc4f502c866f1a932fc59a63d91c03ec191b7a Mon Sep 17 00:00:00 2001 From: goodboy Date: Tue, 9 Jun 2026 20:19:11 -0400 Subject: [PATCH 006/110] Wall-cap `test_stale_entry_is_deleted` via `pytest-timeout` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a hard process-level wall-clock bound on a test known to wedge un-Ctrl-C-ably under an in-dev spawn backend, so an unattended suite run can't hang indefinitely. Deats, - New `testing` dep: `pytest-timeout>=2.3`. - `test_stale_entry_is_deleted`: `@pytest.mark.timeout(3, method='thread')`. The `method='thread'` choice is deliberate — `method='signal'` routes via `SIGALRM` which can be starved by the same GIL-hostage path that drops `SIGINT`, so it'd never actually fire in the starvation case. At timeout, `pytest-timeout` hard-kills the pytest process itself — that's the intended behavior here; the alternative is the suite never returning. 
(this commit msg was generated in some part by [`claude-code`][claude-code-gh]) [claude-code-gh]: https://github.com/anthropics/claude-code (cherry picked from commit 189f4e3f72e9f1eda5d24bcbab5743f7e35bd913) (factored: kept pyproject + tests/discovery/test_registrar.py parts of "Wall-cap `subint` audit tests via `pytest-timeout`"; dropped tests/test_subint_cancellation.py) --- pyproject.toml | 5 +++++ tests/discovery/test_registrar.py | 16 ++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 0a23dce51..d06c0f5e8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -84,6 +84,11 @@ testing = [ # interactions stay predictable across dev installs). "pytest>=9.0.3", # CVE-2025-71176 (insecure tmpdir) patched in 9.0.3 "pexpect>=4.9.0,<5", + # per-test wall-clock bound (used via + # `@pytest.mark.timeout(..., method='thread')` on the + # known-hanging `subint`-backend audit tests; see + # `ai/conc-anal/subint_*_issue.md`). + "pytest-timeout>=2.3", ] repl = [ "pyperclip>=1.9.0", diff --git a/tests/discovery/test_registrar.py b/tests/discovery/test_registrar.py index 6f34b1177..bd0156080 100644 --- a/tests/discovery/test_registrar.py +++ b/tests/discovery/test_registrar.py @@ -517,6 +517,22 @@ async def kill_transport( # @pytest.mark.parametrize('use_signal', [False, True]) +# +# Wall-clock bound via `pytest-timeout` (`method='thread'`). +# Under `--spawn-backend=subint` this test can wedge in an +# un-Ctrl-C-able state (abandoned-subint + shared-GIL +# starvation → signal-wakeup-fd pipe fills → SIGINT silently +# dropped; see `ai/conc-anal/subint_sigint_starvation_issue.md`). +# `method='thread'` is specifically required because `signal`- +# method SIGALRM suffers the same GIL-starvation path and +# wouldn't fire the Python-level handler. +# At timeout the plugin hard-kills the pytest process — that's +# the intended behavior here; the alternative is an unattended +# suite run that never returns. 
+@pytest.mark.timeout( + 3, # NOTE should be a 2.1s happy path. + method='thread', +) def test_stale_entry_is_deleted( debug_mode: bool, daemon: subprocess.Popen, From 6afeb6b6efe08f2ad33900a77562e4e506177877 Mon Sep 17 00:00:00 2001 From: goodboy Date: Tue, 21 Apr 2026 13:36:41 -0400 Subject: [PATCH 007/110] Skip `test_stale_entry_is_deleted` hanger with `subint`s (cherry picked from commit 985ea76de593d5707f5fcdd7f3095dfca6a77176) --- tests/discovery/test_registrar.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/discovery/test_registrar.py b/tests/discovery/test_registrar.py index bd0156080..02748370a 100644 --- a/tests/discovery/test_registrar.py +++ b/tests/discovery/test_registrar.py @@ -132,6 +132,10 @@ async def say_hello_use_wait( return result +@pytest.mark.timeout( + 3, + method='thread', +) @tractor_test @pytest.mark.parametrize( 'func', @@ -545,6 +549,12 @@ def test_stale_entry_is_deleted( stale entry and not delivering a bad portal. ''' + if start_method == 'subint': + pytest.skip( + 'XXX SUBINT HANGING TEST XXX\n' + 'See oustanding issue(s)\n' + ) + async def main(): name: str = 'transport_fails_actor' From 239c3fb14dddb6cb123f070069bf25a0a3188aaf Mon Sep 17 00:00:00 2001 From: goodboy Date: Tue, 21 Apr 2026 21:24:51 -0400 Subject: [PATCH 008/110] Add `skipon_spawn_backend` pytest marker MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A reusable `@pytest.mark.skipon_spawn_backend( '' [, ...], reason='...')` marker for backend-specific known-hang / -borked cases — avoids scattering `@pytest.mark.skipif(lambda ...)` branches across tests that misbehave under a particular `--spawn-backend`. Deats, - `pytest_configure()` registers the marker via `addinivalue_line('markers', ...)`. 
- New `pytest_collection_modifyitems()` hook walks each collected item with `item.iter_markers( name='skipon_spawn_backend')`, checks whether the active `--spawn-backend` appears in `mark.args`, and if so injects a concrete `pytest.mark.skip( reason=...)`. `iter_markers()` makes the decorator work at function, class, or module (`pytestmark = [...]`) scope transparently. - First matching mark wins; default reason is `f'Borked on --spawn-backend={backend!r}'` if the caller doesn't supply one. Also, tighten type annotations on nearby `pytest` integration points — `pytest_configure`, `debug_mode`, `spawn_backend`, `tpt_protos`, `tpt_proto` — now taking typed `pytest.Config` / `pytest.FixtureRequest` params. (this commit msg was generated in some part by [`claude-code`][claude-code-gh]) [claude-code-gh]: https://github.com/anthropics/claude-code (cherry picked from commit 3b26b59dad8c6b365890746ed5acc0811bec94c6) --- tractor/_testing/pytest.py | 61 +++++++++++++++++++++++++++++++++----- 1 file changed, 54 insertions(+), 7 deletions(-) diff --git a/tractor/_testing/pytest.py b/tractor/_testing/pytest.py index c33406ff4..ef3cc9a73 100644 --- a/tractor/_testing/pytest.py +++ b/tractor/_testing/pytest.py @@ -224,8 +224,10 @@ def pytest_addoption( ) -def pytest_configure(config): - backend = config.option.spawn_backend +def pytest_configure( + config: pytest.Config, +): + backend: str = config.option.spawn_backend from tractor.spawn._spawn import try_set_start_method try: try_set_start_method(backend) @@ -241,10 +243,52 @@ def pytest_configure(config): 'markers', 'no_tpt(proto_key): test will (likely) not behave with tpt backend' ) + config.addinivalue_line( + 'markers', + 'skipon_spawn_backend(*start_methods, reason=None): ' + 'skip this test under any of the given `--spawn-backend` ' + 'values; useful for backend-specific known-hang / -borked ' + 'cases (e.g. the `subint` GIL-starvation class documented ' + 'in `ai/conc-anal/subint_sigint_starvation_issue.md`).' 
+ ) + + +def pytest_collection_modifyitems( + config: pytest.Config, + items: list[pytest.Function], +): + ''' + Expand any `@pytest.mark.skipon_spawn_backend(''[, + ...], reason='...')` markers into concrete + `pytest.mark.skip(reason=...)` calls for tests whose + backend-arg set contains the active `--spawn-backend`. + + Uses `item.iter_markers(name=...)` which walks function + + class + module-level marks in the correct scope order (and + handles both the single-`MarkDecorator` and `list[Mark]` + forms of a module-level `pytestmark`) — so the same marker + works at any level a user puts it. + + ''' + backend: str = config.option.spawn_backend + default_reason: str = f'Borked on --spawn-backend={backend!r}' + for item in items: + for mark in item.iter_markers(name='skipon_spawn_backend'): + if backend in mark.args: + reason: str = mark.kwargs.get( + 'reason', + default_reason, + ) + item.add_marker(pytest.mark.skip(reason=reason)) + # first matching mark wins; no value in stacking + # multiple `skip`s on the same item. + break @pytest.fixture(scope='session') -def debug_mode(request) -> bool: +def debug_mode( + request: pytest.FixtureRequest, +) -> bool: ''' Flag state for whether `--tpdb` (for `tractor`-py-debugger) was passed to the test run. 
@@ -258,12 +302,16 @@ def debug_mode(request) -> bool: @pytest.fixture(scope='session') -def spawn_backend(request) -> str: +def spawn_backend( + request: pytest.FixtureRequest, +) -> str: return request.config.option.spawn_backend @pytest.fixture(scope='session') -def tpt_protos(request) -> list[str]: +def tpt_protos( + request: pytest.FixtureRequest, +) -> list[str]: # allow quoting on CLI proto_keys: list[str] = [ @@ -291,7 +339,7 @@ def tpt_protos(request) -> list[str]: autouse=True, ) def tpt_proto( - request, + request: pytest.FixtureRequest, tpt_protos: list[str], ) -> str: proto_key: str = tpt_protos[0] @@ -343,7 +391,6 @@ def pytest_generate_tests( metafunc: pytest.Metafunc, ): spawn_backend: str = metafunc.config.option.spawn_backend - if not spawn_backend: # XXX some weird windows bug with `pytest`? spawn_backend = 'trio' From d5c549a3c35f3938800a77a1694ad7b121255599 Mon Sep 17 00:00:00 2001 From: goodboy Date: Tue, 9 Jun 2026 20:19:26 -0400 Subject: [PATCH 009/110] Mark `subint`-hanging tests with `skipon_spawn_backend` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adopt the `@pytest.mark.skipon_spawn_backend('subint', reason=...)` marker (a617b521) across the suites reproducing the `subint` GIL-contention / starvation hang classes doc'd in `ai/conc-anal/subint_*_issue.md`. Deats, - Module-level `pytestmark` on full-file-hanging suites: - `tests/test_cancellation.py` - `tests/test_inter_peer_cancellation.py` - `tests/test_pubsub.py` - `tests/test_shm.py` - Per-test decorator where only one test in the file hangs: - `tests/discovery/test_registrar.py ::test_stale_entry_is_deleted` — replaces the inline `if start_method == 'subint': pytest.skip` branch with a declarative skip. - `tests/test_subint_cancellation.py ::test_subint_non_checkpointing_child`. - A few per-test decorators are left commented-in- place as breadcrumbs for later finer-grained unskips. 
Also, some nearby tidying in the affected files: - Annotate loose fixture / test params (`pytest.FixtureRequest`, `str`, `tuple`, `bool`) in `tests/conftest.py`, `tests/devx/conftest.py`, and `tests/test_cancellation.py`. - Normalize `"""..."""` → `'''...'''` docstrings per repo convention on a few touched tests. - Add `timeout=6` / `timeout=10` to `@tractor_test(...)` on `test_cancel_infinite_streamer` and `test_some_cancels_all`. - Drop redundant `spawn_backend` param from `test_cancel_via_SIGINT`; use `start_method` in the `'mp' in ...` check instead. (this commit msg was generated in some part by [`claude-code`][claude-code-gh]) [claude-code-gh]: https://github.com/anthropics/claude-code (cherry picked from commit 4b2a0886c3ac4b8fdeb1db3db820df2396f3c198) (factored: dropped spawn-backend-only path: tests/test_subint_cancellation.py) --- tests/conftest.py | 6 +- tests/devx/conftest.py | 5 +- tests/discovery/test_registrar.py | 18 +++-- tests/test_cancellation.py | 107 +++++++++++++++++++++----- tests/test_inter_peer_cancellation.py | 9 +++ tests/test_pubsub.py | 8 ++ tests/test_shm.py | 8 ++ 7 files changed, 128 insertions(+), 33 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index c7b205313..90498ba05 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -139,7 +139,9 @@ def pytest_addoption( @pytest.fixture(scope='session', autouse=True) -def loglevel(request) -> str: +def loglevel( + request: pytest.FixtureRequest, +) -> str: import tractor orig = tractor.log._default_loglevel level = tractor.log._default_loglevel = request.config.option.loglevel @@ -156,7 +158,7 @@ def loglevel(request) -> str: @pytest.fixture(scope='function') def test_log( - request, + request: pytest.FixtureRequest, loglevel: str, ) -> tractor.log.StackLevelAdapter: ''' diff --git a/tests/devx/conftest.py b/tests/devx/conftest.py index eb56d74c5..747c859d7 100644 --- a/tests/devx/conftest.py +++ b/tests/devx/conftest.py @@ -146,13 +146,12 @@ def _spawn( 
ids='ctl-c={}'.format, ) def ctlc( - request, + request: pytest.FixtureRequest, ci_env: bool, ) -> bool: - use_ctlc = request.param - + use_ctlc: bool = request.param node = request.node markers = node.own_markers for mark in markers: diff --git a/tests/discovery/test_registrar.py b/tests/discovery/test_registrar.py index 02748370a..a004ddac7 100644 --- a/tests/discovery/test_registrar.py +++ b/tests/discovery/test_registrar.py @@ -520,8 +520,6 @@ async def kill_transport( -# @pytest.mark.parametrize('use_signal', [False, True]) -# # Wall-clock bound via `pytest-timeout` (`method='thread'`). # Under `--spawn-backend=subint` this test can wedge in an # un-Ctrl-C-able state (abandoned-subint + shared-GIL @@ -537,6 +535,16 @@ async def kill_transport( 3, # NOTE should be a 2.1s happy path. method='thread', ) +@pytest.mark.skipon_spawn_backend( + 'subint', + reason=( + 'XXX SUBINT HANGING TEST XXX\n' + 'See oustanding issue(s)\n' + # TODO, put issue link! + ) +) +# @pytest.mark.parametrize('use_signal', [False, True]) +# def test_stale_entry_is_deleted( debug_mode: bool, daemon: subprocess.Popen, @@ -549,12 +557,6 @@ def test_stale_entry_is_deleted( stale entry and not delivering a bad portal. ''' - if start_method == 'subint': - pytest.skip( - 'XXX SUBINT HANGING TEST XXX\n' - 'See oustanding issue(s)\n' - ) - async def main(): name: str = 'transport_fails_actor' diff --git a/tests/test_cancellation.py b/tests/test_cancellation.py index f1091372f..645ee068e 100644 --- a/tests/test_cancellation.py +++ b/tests/test_cancellation.py @@ -21,6 +21,16 @@ _friggin_windows: bool = platform.system() == 'Windows' +pytestmark = pytest.mark.skipon_spawn_backend( + 'subint', + reason=( + 'XXX SUBINT HANGING TEST XXX\n' + 'See oustanding issue(s)\n' + # TODO, put issue link! 
+ ) +) + + async def assert_err(delay=0): await trio.sleep(delay) assert 0 @@ -110,8 +120,17 @@ async def main(): assert exc.boxed_type == errtype +# @pytest.mark.skipon_spawn_backend( +# 'subint', +# reason=( +# 'XXX SUBINT HANGING TEST XXX\n' +# 'See oustanding issue(s)\n' +# # TODO, put issue link! +# ) +# ) def test_multierror( reg_addr: tuple[str, int], + start_method: str, ): ''' Verify we raise a ``BaseExceptionGroup`` out of a nursery where @@ -141,15 +160,28 @@ async def main(): trio.run(main) -@pytest.mark.parametrize('delay', (0, 0.5)) @pytest.mark.parametrize( - 'num_subactors', range(25, 26), + 'delay', + (0, 0.5), + ids='delays={}'.format, ) -def test_multierror_fast_nursery(reg_addr, start_method, num_subactors, delay): - """Verify we raise a ``BaseExceptionGroup`` out of a nursery where +@pytest.mark.parametrize( + 'num_subactors', + range(25, 26), + ids= 'num_subs={}'.format, +) +def test_multierror_fast_nursery( + reg_addr: tuple, + start_method: str, + num_subactors: int, + delay: float, +): + ''' + Verify we raise a ``BaseExceptionGroup`` out of a nursery where more then one actor errors and also with a delay before failure to test failure during an ongoing spawning. - """ + + ''' async def main(): async with tractor.open_nursery( registry_addrs=[reg_addr], @@ -189,8 +221,15 @@ async def do_nothing(): pass -@pytest.mark.parametrize('mechanism', ['nursery_cancel', KeyboardInterrupt]) -def test_cancel_single_subactor(reg_addr, mechanism): +@pytest.mark.parametrize( + 'mechanism', [ + 'nursery_cancel', + KeyboardInterrupt, +]) +def test_cancel_single_subactor( + reg_addr: tuple, + mechanism: str|KeyboardInterrupt, +): ''' Ensure a ``ActorNursery.start_actor()`` spawned subactor cancels when the nursery is cancelled. 
@@ -232,9 +271,12 @@ async def stream_forever(): await trio.sleep(0.01) -@tractor_test -async def test_cancel_infinite_streamer(start_method): - +@tractor_test( + timeout=6, +) +async def test_cancel_infinite_streamer( + start_method: str +): # stream for at most 1 seconds with ( trio.fail_after(4), @@ -257,6 +299,14 @@ async def test_cancel_infinite_streamer(start_method): assert n.cancelled +# @pytest.mark.skipon_spawn_backend( +# 'subint', +# reason=( +# 'XXX SUBINT HANGING TEST XXX\n' +# 'See oustanding issue(s)\n' +# # TODO, put issue link! +# ) +# ) @pytest.mark.parametrize( 'num_actors_and_errs', [ @@ -286,7 +336,9 @@ async def test_cancel_infinite_streamer(start_method): 'no_daemon_actors_fail_all_run_in_actors_sleep_then_fail', ], ) -@tractor_test +@tractor_test( + timeout=10, +) async def test_some_cancels_all( num_actors_and_errs: tuple, start_method: str, @@ -370,7 +422,10 @@ async def test_some_cancels_all( pytest.fail("Should have gotten a remote assertion error?") -async def spawn_and_error(breadth, depth) -> None: +async def spawn_and_error( + breadth: int, + depth: int, +) -> None: name = tractor.current_actor().name async with tractor.open_nursery() as nursery: for i in range(breadth): @@ -396,7 +451,10 @@ async def spawn_and_error(breadth, depth) -> None: @tractor_test -async def test_nested_multierrors(loglevel, start_method): +async def test_nested_multierrors( + loglevel: str, + start_method: str, +): ''' Test that failed actor sets are wrapped in `BaseExceptionGroup`s. 
This test goes only 2 nurseries deep but we should eventually have tests @@ -483,20 +541,21 @@ async def test_nested_multierrors(loglevel, start_method): @no_windows def test_cancel_via_SIGINT( - loglevel, - start_method, - spawn_backend, + loglevel: str, + start_method: str, ): - """Ensure that a control-C (SIGINT) signal cancels both the parent and + ''' + Ensure that a control-C (SIGINT) signal cancels both the parent and child processes in trionic fashion - """ + + ''' pid: int = os.getpid() async def main(): with trio.fail_after(2): async with tractor.open_nursery() as tn: await tn.start_actor('sucka') - if 'mp' in spawn_backend: + if 'mp' in start_method: time.sleep(0.1) os.kill(pid, signal.SIGINT) await trio.sleep_forever() @@ -580,6 +639,14 @@ async def spawn_sub_with_sync_blocking_task(): print('exiting first subactor layer..\n') +# @pytest.mark.skipon_spawn_backend( +# 'subint', +# reason=( +# 'XXX SUBINT HANGING TEST XXX\n' +# 'See oustanding issue(s)\n' +# # TODO, put issue link! +# ) +# ) @pytest.mark.parametrize( 'man_cancel_outer', [ @@ -694,7 +761,7 @@ async def main(): def test_fast_graceful_cancel_when_spawn_task_in_soft_proc_wait_for_daemon( - start_method, + start_method: str, ): ''' This is a very subtle test which demonstrates how cancellation diff --git a/tests/test_inter_peer_cancellation.py b/tests/test_inter_peer_cancellation.py index b79c0393a..fc5d741d2 100644 --- a/tests/test_inter_peer_cancellation.py +++ b/tests/test_inter_peer_cancellation.py @@ -26,6 +26,15 @@ from .conftest import cpu_scaling_factor +pytestmark = pytest.mark.skipon_spawn_backend( + 'subint', + reason=( + 'XXX SUBINT GIL-CONTENTION HANGING TEST XXX\n' + 'See oustanding issue(s)\n' + # TODO, put issue link! + ) +) + # XXX TODO cases: # - [x] WE cancelled the peer and thus should not see any raised # `ContextCancelled` as it should be reaped silently? 
diff --git a/tests/test_pubsub.py b/tests/test_pubsub.py index 6d416f89c..1bf8563a6 100644 --- a/tests/test_pubsub.py +++ b/tests/test_pubsub.py @@ -7,6 +7,14 @@ from tractor.experimental import msgpub from tractor._testing import tractor_test +pytestmark = pytest.mark.skipon_spawn_backend( + 'subint', + reason=( + 'XXX SUBINT HANGING TEST XXX\n' + 'See oustanding issue(s)\n' + # TODO, put issue link! + ) +) def test_type_checks(): diff --git a/tests/test_shm.py b/tests/test_shm.py index 00a36f8aa..3409f3384 100644 --- a/tests/test_shm.py +++ b/tests/test_shm.py @@ -14,6 +14,14 @@ attach_shm_list, ) +pytestmark = pytest.mark.skipon_spawn_backend( + 'subint', + reason=( + 'XXX SUBINT GIL-CONTENTION HANGING TEST XXX\n' + 'See oustanding issue(s)\n' + # TODO, put issue link! + ) +) @tractor.context async def child_attach_shml_alot( From 21b17b6a40caa70c67aaf4354aabffbf6abdfc69 Mon Sep 17 00:00:00 2001 From: goodboy Date: Tue, 9 Jun 2026 20:19:56 -0400 Subject: [PATCH 010/110] Refactor `_runtime_vars` into pure get/set API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Resetting `_runtime_vars` post-(forking-)spawn was previously only possible via direct mutation of `_state._runtime_vars` from an external module + an inline default dict duplicating the `_state.py`-internal defaults. Split the access surface into a pure getter + explicit setter so such a reset call site becomes a one-liner composition: `set_runtime_vars(get_runtime_vars(clear_values=True))`. Deats `tractor/runtime/_state.py`, - extract initial values into a module-level `_RUNTIME_VARS_DEFAULTS: dict[str, Any]` constant; the live `_runtime_vars` is now initialised from `dict(_RUNTIME_VARS_DEFAULTS)` - `get_runtime_vars()` grows a `clear_values: bool = False` kwarg. 
When True, returns a fresh copy of `_RUNTIME_VARS_DEFAULTS` instead of the live dict — still a **pure read**, never mutates anything - new `set_runtime_vars(rtvars: dict | RuntimeVars)` — atomic replacement of the live dict's contents via `.clear()` + `.update()`, so existing references to the same dict object remain valid. Accepts either the historical dict form or the `RuntimeVars` struct (this commit msg was generated in some part by [`claude-code`][claude-code-gh]) [claude-code-gh]: https://github.com/anthropics/claude-code (cherry picked from commit 7804a9fe57693dd5e15bee6a08e7d2fa14b6a98a) (factored: kept only the tractor/runtime/_state.py part; dropped tractor/spawn/_subint_forkserver.py call-site rewire) --- tractor/runtime/_state.py | 66 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 63 insertions(+), 3 deletions(-) diff --git a/tractor/runtime/_state.py b/tractor/runtime/_state.py index 55aa3291a..aedcc9520 100644 --- a/tractor/runtime/_state.py +++ b/tractor/runtime/_state.py @@ -117,7 +117,14 @@ def update( ) -_runtime_vars: dict[str, Any] = { +# The "fresh process" defaults — what `_runtime_vars` looks +# like in a just-booted Python process that hasn't yet entered +# `open_root_actor()` nor received a parent `SpawnSpec`. Kept +# as a module-level constant so `get_runtime_vars(clear_values= +# True)` can reset the live dict back to this baseline (see +# `tractor.spawn._subint_forkserver` for the one current caller +# that needs it). +_RUNTIME_VARS_DEFAULTS: dict[str, Any] = { # root of actor-process tree info '_is_root': False, # bool '_root_mailbox': (None, None), # tuple[str|None, str|None] @@ -138,10 +145,12 @@ def update( # infected-`asyncio`-mode: `trio` running as guest. '_is_infected_aio': False, } +_runtime_vars: dict[str, Any] = dict(_RUNTIME_VARS_DEFAULTS) def get_runtime_vars( as_dict: bool = True, + clear_values: bool = False, ) -> dict: ''' Deliver a **copy** of the current `Actor`'s "runtime variables". 
@@ -150,11 +159,62 @@ def get_runtime_vars( form, but the `RuntimeVars` struct should be utilized as possible for future calls. + Pure read — **never mutates** the module-level `_runtime_vars`. + + If `clear_values=True`, return a copy of the fresh-process + defaults (`_RUNTIME_VARS_DEFAULTS`) instead of the live + dict. Useful in combination with `set_runtime_vars()` to + reset process-global state back to "cold" — the main caller + today is the `subint_forkserver` spawn backend's post-fork + child prelude: + + set_runtime_vars(get_runtime_vars(clear_values=True)) + + `os.fork()` inherits the parent's full memory image, so the + child sees the parent's populated `_runtime_vars` (e.g. + `_is_root=True`) which would trip the `assert not + self.enable_modules` gate in `Actor._from_parent()` on the + subsequent parent→child `SpawnSpec` handshake if left alone. + ''' + src: dict = ( + _RUNTIME_VARS_DEFAULTS + if clear_values + else _runtime_vars + ) + snapshot: dict = dict(src) if as_dict: - return dict(_runtime_vars) + return snapshot + return RuntimeVars(**snapshot) + - return RuntimeVars(**_runtime_vars) +def set_runtime_vars( + rtvars: dict | RuntimeVars, +) -> None: + ''' + Atomically replace the module-level `_runtime_vars` contents + with those of `rtvars` (via `.clear()` + `.update()` so + live references to the same dict object remain valid). + + Accepts either the historical `dict` form or the `RuntimeVars` + `msgspec.Struct` form (the latter still mostly unused but + the blessed forward shape — see the struct's definition). + + Paired with `get_runtime_vars()` as the explicit + write-half of the runtime-vars API — prefer this over + direct mutation of `_runtime_vars[...]` from new call sites. + + ''' + if isinstance(rtvars, RuntimeVars): + # `msgspec.Struct` → dict via its declared field set; + # avoids pulling in `msgspec.structs.asdict` just for + # this one call path. 
+ rtvars = { + field_name: getattr(rtvars, field_name) + for field_name in rtvars.__struct_fields__ + } + _runtime_vars.clear() + _runtime_vars.update(rtvars) def last_actor() -> Actor|None: From 353fb82c366dc0bd7a43024d1416a3b04a1e8f8d Mon Sep 17 00:00:00 2001 From: goodboy Date: Thu, 23 Apr 2026 11:39:42 -0400 Subject: [PATCH 011/110] Enable `debug_mode` for `subint_forkserver` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The `subint_forkserver` backend's child runtime is trio-native (uses `_trio_main` + receives `SpawnSpec` over IPC just like `trio`/`subint`), so `tractor.devx.debug._tty_lock` works in those subactors. Wire the runtime gates that historically hard-coded `_spawn_method == 'trio'` to recognize this third backend. Deats, - new `_DEBUG_COMPATIBLE_BACKENDS` module-const in `tractor._root` listing the spawn backends whose subactor runtime is trio-native (`'trio'`, `'subint_forkserver'`). Both the enable-site (`_runtime_vars['_debug_mode'] = True`) and the cleanup-site reset key off the same tuple — keep them in lockstep when adding backends - `open_root_actor`'s `RuntimeError` for unsupported backends now reports the full compatible-set + the rejected method instead of the stale "only `trio`" msg. - `runtime._runtime.Actor._from_parent`'s SpawnSpec-recv gate adds `'subint_forkserver'` to the existing `('trio', 'subint')` tuple — fork child-side runtime receives the same SpawnSpec IPC handshake as the others. - `subint_forkserver_proc` child-target now passes `spawn_method='subint_forkserver'` (was hard-coded `'trio'`) so `Actor.pformat()` / log lines reflect the actual parent-side spawn mechanism rather than masquerading as plain `trio`.
(this patch was generated in some part by [`claude-code`][claude-code-gh]) [claude-code-gh]: https://github.com/anthropics/claude-code (cherry picked from commit 8bcbe730bfe08133ef8cb94944f9d7fef74b6e01) --- tractor/_root.py | 26 +++++++++++++++++++++++--- tractor/runtime/_runtime.py | 9 ++++++++- 2 files changed, 31 insertions(+), 4 deletions(-) diff --git a/tractor/_root.py b/tractor/_root.py index 9b58523da..3c20fff07 100644 --- a/tractor/_root.py +++ b/tractor/_root.py @@ -69,6 +69,20 @@ logger = log.get_logger('tractor') +# Spawn backends under which `debug_mode=True` is supported. +# Requirement: the spawned subactor's root runtime must be +# trio-native so `tractor.devx.debug._tty_lock` works. Matches +# both the enable-site in `open_root_actor` and the cleanup- +# site reset of `_runtime_vars['_debug_mode']` — keep them in +# lockstep when adding backends. +_DEBUG_COMPATIBLE_BACKENDS: tuple[str, ...] = ( + 'trio', + # forkserver children run `_trio_main` in their own OS + # process — same child-side runtime shape as `trio_proc`. + 'subint_forkserver', +) + + # TODO: stick this in a `@acm` defined in `devx.debug`? # -[ ] also maybe consider making this a `wrapt`-deco to # save an indent level? @@ -293,10 +307,14 @@ async def open_root_actor( ) loglevel: str = loglevel.upper() + # Debug-mode is currently only supported for backends whose + # subactor root runtime is trio-native (so `tractor.devx. + # debug._tty_lock` works). See `_DEBUG_COMPATIBLE_BACKENDS` + # module-const for the list. if ( debug_mode and - _spawn._spawn_method == 'trio' + _spawn._spawn_method in _DEBUG_COMPATIBLE_BACKENDS ): _state._runtime_vars['_debug_mode'] = True @@ -318,7 +336,9 @@ async def open_root_actor( elif debug_mode: raise RuntimeError( - "Debug mode is only supported for the `trio` backend!" + f'Debug mode currently supported only for ' + f'{_DEBUG_COMPATIBLE_BACKENDS!r} spawn backends, not ' + f'{_spawn._spawn_method!r}.' 
) assert loglevel @@ -619,7 +639,7 @@ async def ping_tpt_socket( if ( debug_mode and - _spawn._spawn_method == 'trio' + _spawn._spawn_method in _DEBUG_COMPATIBLE_BACKENDS ): _state._runtime_vars['_debug_mode'] = False diff --git a/tractor/runtime/_runtime.py b/tractor/runtime/_runtime.py index 0c25b9262..cbfaa3132 100644 --- a/tractor/runtime/_runtime.py +++ b/tractor/runtime/_runtime.py @@ -870,7 +870,14 @@ async def _from_parent( accept_addrs: list[UnwrappedAddress]|None = None - if self._spawn_method in ("trio", "subint"): + if self._spawn_method in ( + 'trio', + 'subint', + # `subint_forkserver` parent-side sends a + # `SpawnSpec` over IPC just like the other two + # — fork child-side runtime is trio-native. + 'subint_forkserver', + ): # Receive post-spawn runtime state from our parent. spawnspec: msgtypes.SpawnSpec = await chan.recv() From 662a34b9946f44370cb832ffecddc190d4d64d14 Mon Sep 17 00:00:00 2001 From: goodboy Date: Thu, 23 Apr 2026 14:37:48 -0400 Subject: [PATCH 012/110] Wire `reg_addr` through leaky cancel tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Stopgap companion to d0121960 (`subint_forkserver` test-cancellation leak doc): five tests in `tests/test_cancellation.py` were running against the default `:1616` registry, so any leaked `subint-forkserv` descendant from a prior test holds the port and blows up every subsequent run with `TooSlowError` / "address in use". Thread the session-unique `reg_addr` fixture through so each run picks its own port — zombies can no longer poison other tests (they'll only cross-contaminate whatever happens to share their port, which is now nothing). 
Deats, - add `reg_addr: tuple` fixture param to: - `test_cancel_infinite_streamer` - `test_some_cancels_all` - `test_nested_multierrors` - `test_cancel_via_SIGINT` - `test_cancel_via_SIGINT_other_task` - explicitly pass `registry_addrs=[reg_addr]` to the two `open_nursery()` calls that previously had no kwargs at all (in `test_cancel_via_SIGINT` and `test_cancel_via_SIGINT_other_task`) - add bounded `@pytest.mark.timeout(10, method='thread')` to `test_nested_multierrors` so a hung run doesn't wedge the whole session Still doesn't close the real leak — the `subint_forkserver` backend's `_ForkedProc.kill()` is PID-scoped not tree-scoped, so grandchildren survive teardown regardless of registry port. This commit is just blast-radius containment until that fix lands. See `ai/conc-anal/subint_forkserver_test_cancellation_leak_issue.md`. (this patch was generated in some part by [`claude-code`][claude-code-gh]) [claude-code-gh]: https://github.com/anthropics/claude-code (cherry picked from commit 1af21210571c4ba420742e945b9030f71daabf85) --- tests/test_cancellation.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/tests/test_cancellation.py b/tests/test_cancellation.py index 645ee068e..17f197234 100644 --- a/tests/test_cancellation.py +++ b/tests/test_cancellation.py @@ -275,7 +275,8 @@ async def stream_forever(): timeout=6, ) async def test_cancel_infinite_streamer( - start_method: str + reg_addr: tuple, + start_method: str, ): # stream for at most 1 seconds with ( @@ -341,6 +342,7 @@ async def test_cancel_infinite_streamer( ) async def test_some_cancels_all( num_actors_and_errs: tuple, + reg_addr: tuple, start_method: str, loglevel: str, ): @@ -450,8 +452,13 @@ async def spawn_and_error( await nursery.run_in_actor(*args, **kwargs) +@pytest.mark.timeout( + 10, + method='thread', +) @tractor_test async def test_nested_multierrors( + reg_addr: tuple, loglevel: str, start_method: str, ): @@ -541,6 +548,7 @@ async def
test_nested_multierrors( @no_windows def test_cancel_via_SIGINT( + reg_addr: tuple, loglevel: str, start_method: str, ): @@ -553,7 +561,9 @@ def test_cancel_via_SIGINT( async def main(): with trio.fail_after(2): - async with tractor.open_nursery() as tn: + async with tractor.open_nursery( + registry_addrs=[reg_addr], + ) as tn: await tn.start_actor('sucka') if 'mp' in start_method: time.sleep(0.1) @@ -566,6 +576,7 @@ async def main(): @no_windows def test_cancel_via_SIGINT_other_task( + reg_addr: tuple, loglevel: str, start_method: str, spawn_backend: str, @@ -594,7 +605,9 @@ def test_cancel_via_SIGINT_other_task( async def spawn_and_sleep_forever( task_status=trio.TASK_STATUS_IGNORED ): - async with tractor.open_nursery() as tn: + async with tractor.open_nursery( + registry_addrs=[reg_addr], + ) as tn: for i in range(3): await tn.run_in_actor( sleep_forever, From d328177873a0e9316bc8d4420076f494af3eee89 Mon Sep 17 00:00:00 2001 From: goodboy Date: Thu, 23 Apr 2026 16:27:38 -0400 Subject: [PATCH 013/110] Break parent-chan shield during teardown MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Completes the nested-cancel deadlock fix started in 0cd0b633 (fork-child FD scrub) and fe540d02 (pidfd- cancellable wait). The remaining piece: the parent- channel `process_messages` loop runs under `shield=True` (so normal cancel cascades don't kill it prematurely), and relies on EOF arriving when the parent closes the socket to exit naturally. Under exec-spawn backends (`trio_proc`, mp) that EOF arrival is reliable — parent's teardown closes the handler-task socket deterministically. But fork- based backends like `subint_forkserver` share enough process-image state that EOF delivery becomes racy: the loop parks waiting for an EOF that only arrives after the parent finishes its own teardown, but the parent is itself blocked on `os.waitpid()` for THIS actor's exit. Mutual wait → deadlock. 
Deats, - `async_main` stashes the cancel-scope returned by `root_tn.start(...)` for the parent-chan `process_messages` task onto the actor as `_parent_chan_cs` - `Actor.cancel()`'s teardown path (after `ipc_server.cancel()` + `wait_for_shutdown()`) calls `self._parent_chan_cs.cancel()` to explicitly break the shield — no more waiting for EOF delivery, unwinding proceeds deterministically regardless of backend - inline comments on both sites explain the mutual- wait deadlock + why the explicit cancel is backend-agnostic rather than a forkserver-specific workaround With this + the prior two fixes, the `subint_forkserver` nested-cancel cascade unwinds cleanly end-to-end. (this patch was generated in some part by [`claude-code`][claude-code-gh]) [claude-code-gh]: https://github.com/anthropics/claude-code (cherry picked from commit 8ac3dfeb85e64be48b00b28a1731784ae83fda56) --- tractor/runtime/_runtime.py | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/tractor/runtime/_runtime.py b/tractor/runtime/_runtime.py index cbfaa3132..12b2473e1 100644 --- a/tractor/runtime/_runtime.py +++ b/tractor/runtime/_runtime.py @@ -1216,6 +1216,23 @@ async def cancel( ipc_server.cancel() await ipc_server.wait_for_shutdown() + # Break the shield on the parent-channel + # `process_messages` loop (started with `shield=True` + # in `async_main` above). Required to avoid a + # deadlock during teardown of fork-spawned subactors: + # without this cancel, the loop parks waiting for + # EOF on the parent channel, but the parent is + # blocked on `os.waitpid()` for THIS actor's exit + # — mutual wait. For exec-spawn backends the EOF + # arrives naturally when the parent closes its + # handler-task socket during its own teardown, but + # in fork backends the shared-process-image makes + # that delivery racy / not guaranteed. Explicit + # cancel here gives us deterministic unwinding + # regardless of backend. 
+ if self._parent_chan_cs is not None: + self._parent_chan_cs.cancel() + # cancel all rpc tasks permanently if self._service_tn: self._service_tn.cancel_scope.cancel() @@ -1736,7 +1753,16 @@ async def async_main( # start processing parent requests until our channel # server is 100% up and running. if actor._parent_chan: - await root_tn.start( + # Capture the shielded `loop_cs` for the + # parent-channel `process_messages` task so + # `Actor.cancel()` has a handle to break the + # shield during teardown — without this, the + # shielded loop would park on the parent chan + # indefinitely waiting for EOF that only arrives + # after the PARENT tears down, which under + # fork-based backends (e.g. `subint_forkserver`) + # it waits on THIS actor's exit — deadlock. + actor._parent_chan_cs = await root_tn.start( partial( _rpc.process_messages, chan=actor._parent_chan, From 6bf8364c1625eb8748997c6d837aaf76952268bf Mon Sep 17 00:00:00 2001 From: goodboy Date: Thu, 23 Apr 2026 16:44:15 -0400 Subject: [PATCH 014/110] Skip-mark `subint_forkserver` nested-multierror hang Skip-mark the still-hanging `test_nested_multierrors[subint_forkserver]` via `@pytest.mark.skipon_spawn_backend('subint_forkserver', reason=...)` so it stops blocking the test matrix while the remaining bug is being chased. The mark is an inert no-op until that (in-dev) backend lands. 
(this patch was generated in some part by [`claude-code`][claude-code-gh]) [claude-code-gh]: https://github.com/anthropics/claude-code (cherry picked from commit 506617c695698fb4928560b0996d9605fd00fd35) (factored: kept only the tests/test_cancellation.py skip-mark; dropped the subint_forkserver conc-anal doc update) --- tests/test_cancellation.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tests/test_cancellation.py b/tests/test_cancellation.py index 17f197234..3776b3e33 100644 --- a/tests/test_cancellation.py +++ b/tests/test_cancellation.py @@ -452,6 +452,19 @@ async def spawn_and_error( await nursery.run_in_actor(*args, **kwargs) +@pytest.mark.skipon_spawn_backend( + 'subint_forkserver', + reason=( + 'Multi-level fork-spawn cancel cascade hang — ' + 'peer-channel `process_messages` loops do not ' + 'exit on `service_tn.cancel_scope.cancel()`. ' + 'See `ai/conc-anal/' + 'subint_forkserver_test_cancellation_leak_issue.md` ' + 'for the full diagnosis + candidate fix directions. ' + 'Drop this mark once the peer-chan-loop exit issue ' + 'is closed.' + ), +) @pytest.mark.timeout( 10, method='thread', From af0b1fb2dc06ba49d1c6fa3f28e9cf1ceeab2642 Mon Sep 17 00:00:00 2001 From: goodboy Date: Thu, 23 Apr 2026 22:34:49 -0400 Subject: [PATCH 015/110] Bound peer-clear wait in `async_main` finally MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fifth diagnostic pass pinpointed the hang to `async_main`'s finally block — every stuck actor reaches `FINALLY ENTER` but never `RETURNING`. Specifically `await ipc_server.wait_for_no_more_ peers()` never returns when a peer-channel handler is stuck: the `_no_more_peers` Event is set only when `server._peers` empties, and stuck handlers keep their channels registered. Wrap the call in `trio.move_on_after(3.0)` + a warning-log on timeout that records the still- connected peer count. 
3s is enough for any graceful cancel-ack round-trip; beyond that we're in bug territory and need to proceed with local teardown so the parent's `_ForkedProc.wait()` can unblock. Defensive-in-depth regardless of the underlying bug — a local finally shouldn't block on remote cooperation forever. Verified: with this fix, ALL 15 actors reach `async_main: RETURNING` (up from 10/15 before). Test still hangs past 45s though — there's at least one MORE unbounded wait downstream of `async_main`. Candidates enumerated in the doc update (`open_root_actor` finally / `actor.cancel()` internals / trio.run bg tasks / `_serve_ipc_eps` finally). Skip-mark stays on `test_nested_multierrors[subint_forkserver]`. Also updates `subint_forkserver_test_cancellation_leak_issue.md` with the new pinpoint + summary of the 6-item investigation win list: 1. FD hygiene fix (`_close_inherited_fds`) — orphan-SIGINT closed 2. pidfd-based `_ForkedProc.wait` — cancellable 3. `_parent_chan_cs` wiring — shielded parent-chan loop now breakable 4. `wait_for_no_more_peers` bound — THIS commit 5. Ruled-out hypotheses: tree-kill missing, stuck socket recv, capture-pipe fill (all wrong) 6. 
Remaining unknown: at least one more unbounded wait in the teardown cascade above `async_main` (this commit msg was generated in some part by [`claude-code`][claude-code-gh]) [claude-code-gh]: https://github.com/anthropics/claude-code (cherry picked from commit e312a68d8a0bacef1c9006bf1f6cd2ae9919f683) (factored: dropped subint_forkserver conc-anal doc update) --- tractor/runtime/_runtime.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/tractor/runtime/_runtime.py b/tractor/runtime/_runtime.py index 12b2473e1..9dcca501c 100644 --- a/tractor/runtime/_runtime.py +++ b/tractor/runtime/_runtime.py @@ -1973,7 +1973,25 @@ async def async_main( f' {pformat(ipc_server._peers)}' ) log.runtime(teardown_report) - await ipc_server.wait_for_no_more_peers() + # NOTE: bound the peer-clear wait — otherwise if any + # peer-channel handler is stuck (e.g. never got its + # cancel propagated due to a runtime bug), this wait + # blocks forever and deadlocks the whole actor-tree + # teardown cascade. 3s is enough for any graceful + # cancel-ack round-trip; beyond that we're in bug + # territory and need to proceed with local teardown + # so the parent's `_ForkedProc.wait()` can unblock. + # See `ai/conc-anal/ + # subint_forkserver_test_cancellation_leak_issue.md` + # for the full diagnosis. 
+ with trio.move_on_after(3.0) as _peers_cs: + await ipc_server.wait_for_no_more_peers() + if _peers_cs.cancelled_caught: + teardown_report += ( + f'-> TIMED OUT waiting for peers to clear ' + f'({len(ipc_server._peers)} still connected)\n' + ) + log.warning(teardown_report) teardown_report += ( '-]> all peer channels are complete.\n' From f68ae754447ac277b84efa2346b3147758bfda48 Mon Sep 17 00:00:00 2001 From: goodboy Date: Tue, 9 Jun 2026 20:21:58 -0400 Subject: [PATCH 016/110] Update `subint_forkserver` skip reason: capture-pipe MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Refresh the `test_nested_multierrors` skip-mark reason to the final diagnosis: the hang is pytest's default `--capture=fd` pipe filling from high-volume subactor traceback output inherited via fds 1,2 in fork children — `pytest -s` passes cleanly. Records the fix direction (redirect child stdio to `/dev/null` in the fork-child prelude) for whoever lands the backend. (this commit msg was generated in some part by [`claude-code`][claude-code-gh]) [claude-code-gh]: https://github.com/anthropics/claude-code (cherry picked from commit eceed29d4a6b04edb5b04c555666944aac3b79d9) (factored: kept only the tests/test_cancellation.py skip-reason update of "Pin forkserver hang to pytest `--capture=fd`"; dropped the subint conc-anal doc + tests/spawn/test_subint_forkserver.py) --- tests/test_cancellation.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/tests/test_cancellation.py b/tests/test_cancellation.py index 3776b3e33..fe41dc99c 100644 --- a/tests/test_cancellation.py +++ b/tests/test_cancellation.py @@ -455,14 +455,16 @@ async def spawn_and_error( @pytest.mark.skipon_spawn_backend( 'subint_forkserver', reason=( - 'Multi-level fork-spawn cancel cascade hang — ' - 'peer-channel `process_messages` loops do not ' - 'exit on `service_tn.cancel_scope.cancel()`. 
' - 'See `ai/conc-anal/' + 'Passes cleanly with `pytest -s` (no stdout capture) ' + 'but hangs under default `--capture=fd` due to ' + 'pytest-capture-pipe buffer fill from high-volume ' + 'subactor error-log traceback output inherited via fds ' + '1,2 in fork children. Fix direction: redirect subactor ' + 'stdout/stderr to `/dev/null` in `_child_target` / ' + '`_actor_child_main` so forkserver children don\'t hold ' + 'pytest\'s capture pipe open. See `ai/conc-anal/' 'subint_forkserver_test_cancellation_leak_issue.md` ' - 'for the full diagnosis + candidate fix directions. ' - 'Drop this mark once the peer-chan-loop exit issue ' - 'is closed.' + '"Update — pytest capture pipe is the final gate".' ), ) @pytest.mark.timeout( From e70ef50a0c9345d5ff9ffdd14ba26bcc1f2739d5 Mon Sep 17 00:00:00 2001 From: goodboy Date: Tue, 9 Jun 2026 20:22:23 -0400 Subject: [PATCH 017/110] Default `pytest` to use `--capture=sys` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lands the capture-pipe workaround from the prior cluster of diagnosis commits: switch pytest's `--capture` mode from the default `fd` (redirects fd 1,2 to temp files, which fork children inherit and can deadlock writing into) to `sys` (only `sys.stdout` / `sys.stderr` — fd 1,2 left alone). Trade-off documented inline in `pyproject.toml`: - LOST: per-test attribution of raw-fd output (C-ext writes, `os.write(2, ...)`, subproc stdout). Still goes to terminal / CI capture, just not per-test-scoped in the failure report. - KEPT: `print()` + `logging` capture per-test (tractor's logger uses `sys.stderr`). - KEPT: `pytest -s` debugging behavior. This allows us to re-enable `test_nested_multierrors` without skip-marking + clears the class of pytest-capture-induced hangs for any future fork-based backend tests. 
Deats, - `pyproject.toml`: `'--capture=sys'` added to `addopts` w/ ~20 lines of rationale comment cross-ref'ing the post-mortem doc - `test_cancellation`: drop `skipon_spawn_backend('subint_forkserver')` from `test_nested_multierrors` — no longer needed. * file-level `pytestmark` covers any residual. - `tests/spawn/test_subint_forkserver.py`: orphan-SIGINT test's xfail mark loosened from `strict=True` to `strict=False` + reason rewritten. * it passes in isolation but is session-env-pollution sensitive (leftover subactor PIDs competing for ports / inheriting harness FDs). * tolerate both outcomes until suite isolation improves. - `test_shm`: extend the existing `skipon_spawn_backend('subint', ...)` to also skip `'subint_forkserver'`. * Different root cause from the cancel-cascade class: `multiprocessing.SharedMemory`'s `resource_tracker` + internals assume fresh-process state, don't survive fork-without-exec cleanly - `tests/discovery/test_registrar.py`: bump timeout 3→7s on one test (unrelated to forkserver; just a flaky-under-load bump). - `tractor.spawn._subint_forkserver`: inline comment-only future-work marker right before `_actor_child_main()` describing the planned conditional stdout/stderr-to-`/dev/null` redirect for cases where `--capture=sys` isn't enough (no code change — the redirect logic itself is deferred). EXTRA NOTEs ----------- The `--capture=sys` approach is the minimum-invasive fix: just a pytest ini change, no runtime code change, works for all fork-based backends, trade-offs well-understood (terminal-level capture still happens, just not pytest's per-test attribution of raw-fd output).
(this commit msg was generated in some part by [`claude-code`][claude-code-gh]) [claude-code-gh]: https://github.com/anthropics/claude-code (cherry picked from commit 4c133ab54124aca51827b481b6a1f2edfa9f390d) (factored: dropped spawn-backend-only paths: tests/spawn/test_subint_forkserver.py + tractor/spawn/_subint_forkserver.py; the xfail-loosening bullet above no longer applies) --- pyproject.toml | 23 +++++++++++++++++++++++ tests/discovery/test_registrar.py | 2 +- tests/test_cancellation.py | 17 ++--------------- tests/test_shm.py | 10 +++++++--- 4 files changed, 33 insertions(+), 19 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index d06c0f5e8..9f6601cfc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -244,6 +244,29 @@ addopts = [ # don't show frickin captured logs AGAIN in the report.. '--show-capture=no', + # sys-level capture. REQUIRED for fork-based spawn + # backends (e.g. `subint_forkserver`): default + # `--capture=fd` redirects fd 1,2 to temp files, and fork + # children inherit those fds — opaque deadlocks happen in + # the pytest-capture-machinery ↔ fork-child stdio + # interaction. `--capture=sys` only redirects Python-level + # `sys.stdout`/`sys.stderr`, leaving fd 1,2 alone. + # + # Trade-off (vs. `--capture=fd`): + # - LOST: per-test attribution of subactor *raw-fd* output + # (C-ext writes, `os.write(2, ...)`, subproc stdout). Not + # zero — those go to the terminal, captured by CI's + # terminal-level capture, just not per-test-scoped in the + # pytest failure report. + # - KEPT: Python-level `print()` + `logging` capture per- + # test (tractor's logger uses `sys.stderr`, so tractor + # log output IS still attributed per-test). + # - KEPT: user `pytest -s` for debugging (unaffected). + # + # Full post-mortem in + # `ai/conc-anal/subint_forkserver_test_cancellation_leak_issue.md`. 
+ '--capture=sys', + # disable `xonsh` plugin # https://docs.pytest.org/en/stable/how-to/plugins.html#disabling-plugins-from-autoloading # https://docs.pytest.org/en/stable/how-to/plugins.html#deactivating-unregistering-a-plugin-by-name diff --git a/tests/discovery/test_registrar.py b/tests/discovery/test_registrar.py index a004ddac7..d87a6861b 100644 --- a/tests/discovery/test_registrar.py +++ b/tests/discovery/test_registrar.py @@ -133,7 +133,7 @@ async def say_hello_use_wait( @pytest.mark.timeout( - 3, + 7, method='thread', ) @tractor_test diff --git a/tests/test_cancellation.py b/tests/test_cancellation.py index fe41dc99c..27a5eee2a 100644 --- a/tests/test_cancellation.py +++ b/tests/test_cancellation.py @@ -452,21 +452,8 @@ async def spawn_and_error( await nursery.run_in_actor(*args, **kwargs) -@pytest.mark.skipon_spawn_backend( - 'subint_forkserver', - reason=( - 'Passes cleanly with `pytest -s` (no stdout capture) ' - 'but hangs under default `--capture=fd` due to ' - 'pytest-capture-pipe buffer fill from high-volume ' - 'subactor error-log traceback output inherited via fds ' - '1,2 in fork children. Fix direction: redirect subactor ' - 'stdout/stderr to `/dev/null` in `_child_target` / ' - '`_actor_child_main` so forkserver children don\'t hold ' - 'pytest\'s capture pipe open. See `ai/conc-anal/' - 'subint_forkserver_test_cancellation_leak_issue.md` ' - '"Update — pytest capture pipe is the final gate".' - ), -) +# NOTE: subint_forkserver skip handled by file-level `pytestmark` +# above (same pytest-capture-fd hang class as siblings). @pytest.mark.timeout( 10, method='thread', diff --git a/tests/test_shm.py b/tests/test_shm.py index 3409f3384..61bcdee20 100644 --- a/tests/test_shm.py +++ b/tests/test_shm.py @@ -16,10 +16,14 @@ pytestmark = pytest.mark.skipon_spawn_backend( 'subint', + 'subint_forkserver', reason=( - 'XXX SUBINT GIL-CONTENTION HANGING TEST XXX\n' - 'See oustanding issue(s)\n' - # TODO, put issue link! 
+ 'subint: GIL-contention hanging class.\n' + 'subint_forkserver: `multiprocessing.SharedMemory` ' + 'has known issues with fork-without-exec (mp\'s ' + 'resource_tracker and SharedMemory internals assume ' + 'fresh-process state). RemoteActorError surfaces from ' + 'the shm-attach path. TODO, put issue link!\n' ) ) From 18754f25702ca1ce8f1bd480d05f70336ec3f67a Mon Sep 17 00:00:00 2001 From: goodboy Date: Fri, 24 Apr 2026 20:26:25 -0400 Subject: [PATCH 018/110] Wire `reg_addr` through infected-asyncio tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Continues the hygiene pattern from de601676 (cancel tests) into `tests/test_infected_asyncio.py`: many tests here were calling `tractor.open_nursery()` w/o `registry_addrs=[reg_addr]` and thus racing on the default `:1616` registry across sessions. Thread the session-unique `reg_addr` through so leaked or slow-to-teardown subactors from a prior test can't cross-pollute. Deats, - add `registry_addrs=[reg_addr]` to `open_nursery()` calls in suite where missing. 
- `test_sigint_closes_lifetime_stack`: - add `reg_addr`, `debug_mode`, `start_method` fixture params - `delay` now reads the `debug_mode` param directly instead of calling `tractor.debug_mode()` (fires slightly earlier in the test lifecycle) - sanity assert `if debug_mode: assert tractor.debug_mode()` after nursery open - new print showing SIGINT target (`send_sigint_to` + resolved pid) - catch `trio.TooSlowError` around `ctx.wait_for_result()` and conditionally `pytest.xfail` when `send_sigint_to == 'child' and start_method == 'subint_forkserver'` — the known orphan-SIGINT limitation tracked in `ai/conc-anal/subint_forkserver_orphan_sigint_hang_issue.md` - parametrize id typo fix: `'just_trio_slee'` → `'just_trio_sleep'` (this commit msg was generated in some part by [`claude-code`][claude-code-gh]) [claude-code-gh]: https://github.com/anthropics/claude-code (cherry picked from commit b350aa09eec0c145af37daaf482c654839d15158) --- tests/test_infected_asyncio.py | 83 ++++++++++++++++++++++++++++------ 1 file changed, 70 insertions(+), 13 deletions(-) diff --git a/tests/test_infected_asyncio.py b/tests/test_infected_asyncio.py index 9f6b43e5f..e13df3251 100644 --- a/tests/test_infected_asyncio.py +++ b/tests/test_infected_asyncio.py @@ -183,6 +183,7 @@ def test_tractor_cancels_aio( async def main(): async with tractor.open_nursery( debug_mode=debug_mode, + registry_addrs=[reg_addr], ) as an: portal = await an.run_in_actor( asyncio_actor, @@ -205,11 +206,11 @@ def test_trio_cancels_aio( ''' async def main(): - + # cancel the nursery shortly after boot with trio.move_on_after(1): - # cancel the nursery shortly after boot - - async with tractor.open_nursery() as tn: + async with tractor.open_nursery( + registry_addrs=[reg_addr], + ) as tn: await tn.run_in_actor( asyncio_actor, target='aio_sleep_forever', @@ -277,7 +278,9 @@ def test_context_spawns_aio_task_that_errors( ''' async def main(): with trio.fail_after(1 + delay): - async with tractor.open_nursery() as an: + 
async with tractor.open_nursery( + registry_addrs=[reg_addr], + ) as an: p = await an.start_actor( 'aio_daemon', enable_modules=[__name__], @@ -360,7 +363,9 @@ def test_aio_cancelled_from_aio_causes_trio_cancelled( async def main(): an: tractor.ActorNursery - async with tractor.open_nursery() as an: + async with tractor.open_nursery( + registry_addrs=[reg_addr], + ) as an: p: tractor.Portal = await an.run_in_actor( asyncio_actor, target='aio_cancel', @@ -569,7 +574,9 @@ def test_basic_interloop_channel_stream( async def main(): # TODO, figure out min timeout here! with trio.fail_after(6): - async with tractor.open_nursery() as an: + async with tractor.open_nursery( + registry_addrs=[reg_addr], + ) as an: portal = await an.run_in_actor( stream_from_aio, infect_asyncio=True, @@ -582,9 +589,13 @@ async def main(): # TODO: parametrize the above test and avoid the duplication here? -def test_trio_error_cancels_intertask_chan(reg_addr): +def test_trio_error_cancels_intertask_chan( + reg_addr: tuple[str, int], +): async def main(): - async with tractor.open_nursery() as an: + async with tractor.open_nursery( + registry_addrs=[reg_addr], + ) as an: portal = await an.run_in_actor( stream_from_aio, trio_raise_err=True, @@ -619,6 +630,7 @@ async def main(): async with tractor.open_nursery( debug_mode=debug_mode, # enable_stack_on_sig=True, + registry_addrs=[reg_addr], ) as an: portal = await an.run_in_actor( stream_from_aio, @@ -667,6 +679,7 @@ def test_aio_exits_early_relays_AsyncioTaskExited( async def main(): with trio.fail_after(1 + delay): async with tractor.open_nursery( + registry_addrs=[reg_addr], debug_mode=debug_mode, # enable_stack_on_sig=True, ) as an: @@ -707,6 +720,7 @@ def test_aio_errors_and_channel_propagates_and_closes( ): async def main(): async with tractor.open_nursery( + registry_addrs=[reg_addr], debug_mode=debug_mode, ) as an: portal = await an.run_in_actor( @@ -806,6 +820,7 @@ def test_echoserver_detailed_mechanics( ): async def main(): async with 
tractor.open_nursery( + registry_addrs=[reg_addr], debug_mode=debug_mode, ) as an: p = await an.start_actor( @@ -984,7 +999,7 @@ async def manage_file( ], ids=[ 'bg_aio_task', - 'just_trio_slee', + 'just_trio_sleep', ], ) @pytest.mark.parametrize( @@ -1000,11 +1015,14 @@ async def manage_file( ) def test_sigint_closes_lifetime_stack( tmp_path: Path, + reg_addr: tuple, + debug_mode: bool, + wait_for_ctx: bool, bg_aio_task: bool, trio_side_is_shielded: bool, - debug_mode: bool, send_sigint_to: str, + start_method: str, ): ''' Ensure that an infected child can use the `Actor.lifetime_stack` @@ -1014,12 +1032,22 @@ def test_sigint_closes_lifetime_stack( ''' async def main(): - delay = 999 if tractor.debug_mode() else 1 + delay: float = ( + 999 + if debug_mode + else 1 + ) try: an: tractor.ActorNursery async with tractor.open_nursery( + registry_addrs=[reg_addr], debug_mode=debug_mode, ) as an: + + # sanity + if debug_mode: + assert tractor.debug_mode() + p: tractor.Portal = await an.start_actor( 'file_mngr', enable_modules=[__name__], @@ -1054,6 +1082,10 @@ async def main(): cpid if send_sigint_to == 'child' else os.getpid() ) + print( + f'Sending SIGINT to {send_sigint_to!r}\n' + f'pid: {pid!r}\n' + ) os.kill( pid, signal.SIGINT, @@ -1064,13 +1096,37 @@ async def main(): # timeout should trigger! 
if wait_for_ctx: print('waiting for ctx outcome in parent..') + + if debug_mode: + assert delay == 999 + try: - with trio.fail_after(1 + delay): + with trio.fail_after( + 1 + delay + ): await ctx.wait_for_result() except tractor.ContextCancelled as ctxc: assert ctxc.canceller == ctx.chan.uid raise + except trio.TooSlowError: + if ( + send_sigint_to == 'child' + and + start_method == 'subint_forkserver' + ): + pytest.xfail( + reason=( + 'SIGINT delivery to fork-child subactor is known ' + 'to NOT SUCCEED, precisely bc we have not wired up a' + '"trio SIGINT mode" in the child pre-fork.\n' + 'Also see `test_orphaned_subactor_sigint_cleanup_DRAFT` for' + 'a dedicated suite demonstrating this expected limitation as ' + 'well as the detailed doc:\n' + '`ai/conc-anal/subint_forkserver_orphan_sigint_hang_issue.md`.\n' + ), + ) + # XXX CASE 2: this seems to be the source of the # original issue which exhibited BEFORE we put # a `Actor.cancel_soon()` inside @@ -1170,6 +1226,7 @@ async def main(): with trio.fail_after(3): an: tractor.ActorNursery async with tractor.open_nursery( + registry_addrs=[reg_addr], debug_mode=debug_mode, loglevel=loglevel, ) as an: From 04ffde40f3ab8fe9441b0a7bb27214e5fed3fa3a Mon Sep 17 00:00:00 2001 From: goodboy Date: Fri, 24 Apr 2026 21:47:46 -0400 Subject: [PATCH 019/110] Skip `test_loglevel_propagated_to_subactor` on subint forkserver too (cherry picked from commit 2ca0f41e61cbb980b5a0d7863a4b0f801552e95f) --- tests/test_spawning.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/test_spawning.py b/tests/test_spawning.py index 7f3421fe5..b0e8a88de 100644 --- a/tests/test_spawning.py +++ b/tests/test_spawning.py @@ -194,9 +194,14 @@ def test_loglevel_propagated_to_subactor( reg_addr: tuple, level: str, ): - if start_method == 'mp_forkserver': + if start_method in ('mp_forkserver', 'subint_forkserver'): pytest.skip( - "a bug with `capfd` seems to make forkserver capture not work?" 
+ "a bug with `capfd` seems to make forkserver capture not work? " + "(same class as the `mp_forkserver` pre-existing skip — fork-" + "based backends inherit pytest's capfd temp-file fds into the " + "subactor and the IPC handshake reads garbage (`unclean EOF " + "read only X/HUGE_NUMBER bytes`). Work around by using " + "`capsys` instead or skip entirely." ) async def main(): From 48ace6dd8287c510e08a7cc8ff0d1c57146ffaf5 Mon Sep 17 00:00:00 2001 From: goodboy Date: Sat, 25 Apr 2026 00:05:58 -0400 Subject: [PATCH 020/110] Add `_testing._reap` + auto-reap fixture MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Zombie-subactor cleanup for the test suite, SC-polite discipline (`SIGINT` first, bounded grace, `SIGKILL` only on survivors). Two parts: a shared reaper module + an autouse session-end fixture that runs it. Deats, - new `tractor/_testing/_reap.py` (+230 LOC) — Linux- only reaper using `/proc//{status,cwd,cmdline}` inspection. Two detection modes: - `find_descendants(parent_pid)` for the in-session case (PPid-direct-match while pytest is still alive). - `find_orphans(repo_root)` for the CLI / post- mortem case (`PPid==1` reparented to init + `cwd` filter to repo root + `python` cmdline filter). - `reap(pids, *, grace=3.0, poll=0.25)` does the signal ladder: SIGINT all, poll up to `grace` for exit, SIGKILL any survivors. Returns `(signalled, killed)` for caller-side reporting. - new `_reap_orphaned_subactors` session-scoped autouse fixture in `tractor/_testing/pytest.py` — after `yield`, runs `find_descendants(os.getpid())` + `reap(...)` so each pytest session leaves no surviving forks. - companion CLI scaffolding lives at `scripts/tractor-reap` (separate commit) for the pytest-died-mid-session case where the in-session fixture didn't get to run. 
Also, - promote `from tractor.spawn._spawn import SpawnMethodKey` to module-top in `pytest.py` (was inline-imported inside `pytest_generate_tests`), and reuse it in `pytest_collection_modifyitems` to assert each `skipon_spawn_backend` mark arg is a valid spawn-method literal — catches typos at collection time. - inline `# ?TODO` flags running these through the `try_set_backend` checker for stronger validation. Cross-refs `feedback_sc_graceful_cancel_first.md` for the SIGINT-before-SIGKILL discipline rationale. (this patch was generated in some part by [`claude-code`][claude-code-gh]) [claude-code-gh]: https://github.com/anthropics/claude-code (cherry picked from commit eae478f3d55509c9bea4f3449e988efc8f8eb47e) --- tractor/_testing/_reap.py | 230 +++++++++++++++++++++++++++++++++++++ tractor/_testing/pytest.py | 60 +++++++++- 2 files changed, 288 insertions(+), 2 deletions(-) create mode 100644 tractor/_testing/_reap.py diff --git a/tractor/_testing/_reap.py b/tractor/_testing/_reap.py new file mode 100644 index 000000000..3e2309ffa --- /dev/null +++ b/tractor/_testing/_reap.py @@ -0,0 +1,230 @@ +# tractor: structured concurrent "actors". +# Copyright 2018-eternity Tyler Goodlet. + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. + +''' +Zombie-subactor reaper — SC-polite (SIGINT first, SIGKILL +as last resort with a bounded grace window). 
+ +Shared implementation between the `tractor-reap` CLI +(`scripts/tractor-reap`) and the pytest session-scoped +auto-fixture that guards the test suite against leftover +subactor processes. + +Design notes +------------ + +- Linux-only: reads `/proc/<pid>/{status,cwd,cmdline}`. +- Two detection modes: + + 1. **descendant-mode** — when invoked from a still-live + parent (e.g. a pytest session-end fixture), match by + `PPid == parent_pid`. Direct + precise; the target + PIDs are still reparented to the live pytest process + at teardown time, before pytest exits. + + 2. **orphan-mode** — when invoked after the parent died + (e.g. the `tractor-reap` CLI run post-Ctrl+C), match + by `PPid == 1` (reparented to init) AND `cwd == + <repo_root>` AND cmdline contains `python`. The cwd + filter is what keeps the heuristic from sweeping up + unrelated init-children on the box. + +- Escalation: for every matched PID, SIGINT, poll for up + to `grace` seconds, then SIGKILL any survivors. The + two-phase pattern is the SC-graceful-cancel discipline + documented in `feedback_sc_graceful_cancel_first.md` — + we want the subactor runtime to run its trio cancel + shield + IPC teardown paths where it can. + +''' +from __future__ import annotations + +import os +import pathlib +import signal +import time + + +def _read_status_ppid(pid: int) -> int | None: + ''' + Return the parent-pid from `/proc/<pid>/status` or + `None` if the proc went away / is unreadable. 
+ + ''' + try: + with open(f'/proc/{pid}/status') as f: + for line in f: + if line.startswith('PPid:'): + return int(line.split()[1]) + except (FileNotFoundError, PermissionError, ProcessLookupError): + return None + return None + + +def _read_cwd(pid: int) -> str | None: + try: + return os.readlink(f'/proc/{pid}/cwd') + except (FileNotFoundError, PermissionError, ProcessLookupError): + return None + + +def _read_cmdline(pid: int) -> str: + try: + with open(f'/proc/{pid}/cmdline', 'rb') as f: + return f.read().replace(b'\0', b' ').decode(errors='replace') + except (FileNotFoundError, PermissionError, ProcessLookupError): + return '' + + +def _iter_live_pids() -> list[int]: + ''' + Enumerate currently-alive pids from `/proc`. + + ''' + try: + entries: list[str] = os.listdir('/proc') + except OSError: + return [] + return [int(e) for e in entries if e.isdigit()] + + +def find_descendants( + parent_pid: int, +) -> list[int]: + ''' + PIDs whose `PPid == parent_pid` — i.e. direct + children of the given pid. Used by the pytest + session-end fixture where `parent_pid` is still + alive as the pytest-python process. + + ''' + return [ + pid + for pid in _iter_live_pids() + if _read_status_ppid(pid) == parent_pid + ] + + +def find_orphans( + repo_root: pathlib.Path, +) -> list[int]: + ''' + PIDs that are: + + - reparented to init (`PPid == 1`), + - have `cwd == <repo_root>`, + - and have a `python` in their cmdline. + + This is the "pytest-died-mid-session" case where the + subactor forks got reparented. The cwd filter is the + critical bit that keeps us from sweeping up unrelated + init-children on the box. 
+ + ''' + repo: str = str(repo_root) + hits: list[int] = [] + for pid in _iter_live_pids(): + if _read_status_ppid(pid) != 1: + continue + cwd: str | None = _read_cwd(pid) + if cwd != repo: + continue + cmd: str = _read_cmdline(pid) + if 'python' not in cmd: + continue + hits.append(pid) + return hits + + +def reap( + pids: list[int], + *, + grace: float = 3.0, + poll: float = 0.25, + log=print, +) -> tuple[list[int], list[int]]: + ''' + Deliver SIGINT to each pid, wait up to `grace` + seconds for them to exit, then SIGKILL any that + survive. + + Returns `(signalled, survivors_killed)` so callers + can report / assert. + + `log` is the logger function for user-visible + progress lines — default `print`; pytest fixture + swaps it for a `pytest`-friendly writer. + + ''' + if not pids: + return ([], []) + + signalled: list[int] = [] + for pid in pids: + try: + os.kill(pid, signal.SIGINT) + signalled.append(pid) + except ProcessLookupError: + # raced — already gone + pass + + if signalled: + log( + f'[tractor-reap] SIGINT → {len(signalled)} ' + f'proc(s): {signalled}' + ) + + deadline: float = time.monotonic() + grace + while time.monotonic() < deadline: + time.sleep(poll) + alive: list[int] = [ + pid for pid in signalled if _is_alive(pid) + ] + if not alive: + return (signalled, []) + + survivors: list[int] = [ + pid for pid in signalled if _is_alive(pid) + ] + if survivors: + log( + f'[tractor-reap] SIGKILL (after {grace}s ' + f'grace) → {survivors}' + ) + for pid in survivors: + try: + os.kill(pid, signal.SIGKILL) + except ProcessLookupError: + pass + + return (signalled, survivors) + + +def _is_alive(pid: int) -> bool: + ''' + True iff `/proc/<pid>` still exists AND the proc + isn't already a zombie (Z state). + + ''' + try: + with open(f'/proc/{pid}/status') as f: + for line in f: + if line.startswith('State:'): + # e.g. 
'State:\tZ (zombie)' + return 'Z' not in line.split()[1] + except (FileNotFoundError, ProcessLookupError): + return False + return True diff --git a/tractor/_testing/pytest.py b/tractor/_testing/pytest.py index ef3cc9a73..89535b1f8 100644 --- a/tractor/_testing/pytest.py +++ b/tractor/_testing/pytest.py @@ -32,8 +32,20 @@ import pytest import tractor +from tractor.spawn._spawn import SpawnMethodKey import trio +# Spawn-backend keys which may appear in `skipon_spawn_backend` +# marks ahead of the named backend actually being registered in +# `tractor.spawn._spawn.SpawnMethodKey`; such marks are inert +# (they can never match an active backend) but must not break +# collection. +_IN_DEV_SPAWN_BACKENDS: tuple[str, ...] = ( + 'subint', + 'subint_forkserver', + 'main_thread_forkserver', +) + def tractor_test( wrapped: Callable|None = None, @@ -274,7 +286,16 @@ class + module-level marks in the correct scope order (and default_reason: str = f'Borked on --spawn-backend={backend!r}' for item in items: for mark in item.iter_markers(name='skipon_spawn_backend'): - if backend in mark.args: + skip_backends: tuple[str] = mark.args + for skip_backend in skip_backends: + assert ( + skip_backend in get_args(SpawnMethodKey) + or + skip_backend in _IN_DEV_SPAWN_BACKENDS + ) + # ?TODO, run these through the try-set-backend checker to + # avoid typos? + if backend in skip_backends: reason: str = mark.kwargs.get( 'reason', default_reason, @@ -285,6 +306,42 @@ class + module-level marks in the correct scope order (and break +@pytest.fixture( + scope='session', + autouse=True, +) +def _reap_orphaned_subactors(): + ''' + Session-scoped autouse fixture: after the whole test + session finishes, SIGINT any subactor processes still + parented to this `pytest` process, wait a bounded + grace window, then SIGKILL survivors. + + Rationale: under fork-based spawn backends (notably + `subint_forkserver`), a test that times out or bails + mid-teardown can leave subactor forks alive. 
Without + this reap, they linger across sessions and compete + for ports / inherit pytest's capture-pipe fds — which + flakifies later tests. SC-polite discipline: SIGINT + first to let the subactor's trio cancel shield + IPC + teardown paths run before we escalate. + + Matching companion CLI: `scripts/tractor-reap` for + the pytest-died-mid-session case. + + ''' + import os + parent_pid: int = os.getpid() + yield + from tractor._testing._reap import ( + find_descendants, + reap, + ) + pids: list[int] = find_descendants(parent_pid) + if pids: + reap(pids, grace=3.0) + + @pytest.fixture(scope='session') def debug_mode( request: pytest.FixtureRequest, @@ -398,7 +455,6 @@ def pytest_generate_tests( # drive the valid-backend set from the canonical `Literal` so # adding a new spawn backend (e.g. `'subint'`) doesn't require # touching the harness. - from tractor.spawn._spawn import SpawnMethodKey assert spawn_backend in get_args(SpawnMethodKey) # NOTE: used-to-be-used-to dyanmically parametrize tests for when From f633ebf5c6c9c5c4662ee4ccaf9ccdf907e420a5 Mon Sep 17 00:00:00 2001 From: goodboy Date: Sun, 26 Apr 2026 18:04:40 -0400 Subject: [PATCH 021/110] Add `tractor-reap` CLI + document auto-reap New `scripts/tractor-reap` CLI wraps the `_testing._reap` mod for manual zombie-subactor cleanup after crashed pytest sessions. Two modes: - orphan-mode (default): finds PPid==1 procs with cwd matching repo root + `python` in cmdline. - descendant-mode (`--parent `): scoped sweep under a still-live supervisor. SC-polite: SIGINT with bounded grace window (default 3s) before escalating to SIGKILL. Exit code signals whether escalation was needed (useful for CI health-checks). Also, document both the auto-reap fixture and the CLI in `/run-tests` SKILL.md (section 10). 
(this patch was generated in some part by [`claude-code`][claude-code-gh]) [claude-code-gh]: https://github.com/anthropics/claude-code (cherry picked from commit 6d76b60404591aca2af195520f03b18a5a55a9cb) --- .claude/skills/run-tests/SKILL.md | 64 +++++++++++++++ scripts/tractor-reap | 124 ++++++++++++++++++++++++++++++ 2 files changed, 188 insertions(+) create mode 100755 scripts/tractor-reap diff --git a/.claude/skills/run-tests/SKILL.md b/.claude/skills/run-tests/SKILL.md index ea0d4ae63..4212e25a5 100644 --- a/.claude/skills/run-tests/SKILL.md +++ b/.claude/skills/run-tests/SKILL.md @@ -528,3 +528,67 @@ filling log volume. Full post-mortem in `ai/conc-anal/subint_forkserver_test_cancellation_leak_issue.md`. Lesson codified here so future-me grep-finds the workaround before digging. + +## 10. Reaping zombie subactors (`tractor-reap`) + +**Symptom:** after a `pytest` run crashes, times out, +or is `Ctrl+C`'d, subactor forks (esp. under +`subint_forkserver`) can be reparented to `init` +(PPid==1) and linger. They hold onto ports, inherit +pytest's capture-pipe fds, and flakify later +sessions. + +**Two layers of defense:** + +### a) Session-scoped auto-fixture (always on) + +`tractor/_testing/pytest.py::_reap_orphaned_subactors` +runs at pytest session teardown. It walks `/proc` for +direct descendants of the pytest pid, SIGINTs them, +waits up to 3s, then SIGKILLs survivors. SC-polite: +gives the subactor runtime a chance to run its trio +cancel shield + IPC teardown before escalation. + +This is *autouse* and session-scoped — you don't need +to do anything. It just runs. + +### b) `scripts/tractor-reap` CLI (manual reap) + +For the **pytest-died-mid-session** case (Ctrl+C, OOM +kill, hung process you had to `kill -9`), the fixture +never ran. 
Reach for the CLI: + +```sh +# default: orphans (PPid==1, cwd==repo, cmd contains python) +scripts/tractor-reap + +# descendant-mode: from a still-live supervisor +scripts/tractor-reap --parent <pid> + +# see what would be reaped, don't signal +scripts/tractor-reap -n + +# tune the SIGINT → SIGKILL grace window +scripts/tractor-reap --grace 5 +``` + +Exit code: `0` if everyone exited on SIGINT, `1` if +SIGKILL had to escalate — so you can chain it in CI +health-checks (`scripts/tractor-reap || <fallback-cmd>`). + +**What it matches** (orphan-mode): +- `PPid == 1` (reparented to init → definitely + orphaned, not just a currently-running child) +- `cwd == <repo_root>` (keeps the sweep scoped; won't + touch unrelated init-children elsewhere) +- `python` in cmdline + +**What it does not do:** kill anything whose PPid is +still a live tractor parent. If the parent is alive +it's not an orphan; use `--parent <pid>` if you need +to force-reap under a still-live supervisor. + +**When NOT to run it:** while a pytest session is +active in another terminal. It's safe (won't touch +that session's live children in orphan-mode) but can +race if the target session is mid-teardown. diff --git a/scripts/tractor-reap b/scripts/tractor-reap new file mode 100755 index 000000000..092208878 --- /dev/null +++ b/scripts/tractor-reap @@ -0,0 +1,124 @@ +#!/usr/bin/env python3 +# tractor: structured concurrent "actors". +# Copyright 2018-eternity Tyler Goodlet. +# +# SPDX-License-Identifier: AGPL-3.0-or-later +''' +`tractor-reap` — SC-polite zombie-subactor reaper. + +Finds `tractor` subactor processes left alive after a +`pytest` (or any tractor-app) run that failed to fully +cancel its actor tree, then sends SIGINT with a bounded +grace window before escalating to SIGKILL. + +Detection modes (auto-selected): + + --parent <pid> : descendant-mode — kill procs whose + PPid == <pid>. Use when a parent + is still alive and you want to + scope the sweep precisely (e.g. + CI wrapper calling in from outside + pytest). 
+ + (default) : orphan-mode — kill procs with + PPid==1 (init-reparented) whose + cwd matches the repo root AND + whose cmdline contains `python`. + The cwd filter is what prevents + sweeping unrelated init-children. + +Usage: + + # after a pytest run crashed/was Ctrl+C'd + scripts/tractor-reap + + # from inside a still-live supervisor + scripts/tractor-reap --parent 12345 + + # dry-run: list what would be reaped, don't signal + scripts/tractor-reap -n + +''' +import argparse +import pathlib +import subprocess +import sys + + +def _repo_root() -> pathlib.Path: + ''' + Use `git rev-parse --show-toplevel` when available; + fall back to the repo this script lives in. + + ''' + try: + out: str = subprocess.check_output( + ['git', 'rev-parse', '--show-toplevel'], + stderr=subprocess.DEVNULL, + text=True, + ).strip() + return pathlib.Path(out) + except (subprocess.CalledProcessError, FileNotFoundError): + return pathlib.Path(__file__).resolve().parent.parent + + +def main() -> int: + parser = argparse.ArgumentParser( + prog='tractor-reap', + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument( + '--parent', '-p', + type=int, + default=None, + help='descendant-mode: reap procs with PPid==', + ) + parser.add_argument( + '--grace', '-g', + type=float, + default=3.0, + help='SIGINT grace window in seconds (default 3.0)', + ) + parser.add_argument( + '--dry-run', '-n', + action='store_true', + help='list matched pids but do not signal', + ) + args = parser.parse_args() + + # import lazily so `--help` doesn't require the tractor + # package to be importable (e.g. when running from a + # shell not inside a venv). 
+ repo = _repo_root() + sys.path.insert(0, str(repo)) + from tractor._testing._reap import ( + find_descendants, + find_orphans, + reap, + ) + + if args.parent is not None: + pids: list[int] = find_descendants(args.parent) + mode: str = f'descendants of PPid={args.parent}' + else: + pids = find_orphans(repo) + mode = f'orphans (PPid=1, cwd={repo})' + + if not pids: + print(f'[tractor-reap] no {mode} to reap') + return 0 + + if args.dry_run: + print(f'[tractor-reap] dry-run — {mode}:\n {pids}') + return 0 + + signalled, survivors = reap(pids, grace=args.grace) + # exit 0 if everyone exited cleanly, else 1 to signal + # escalation happened — makes the command useful in + # CI health-checks and `||`-chaining. + return 0 if not survivors else 1 + + +if __name__ == '__main__': + raise SystemExit(main()) From 50392e6e78dcfe91350ad14bbbd91a5bc4904d5e Mon Sep 17 00:00:00 2001 From: goodboy Date: Tue, 9 Jun 2026 20:22:23 -0400 Subject: [PATCH 022/110] =?UTF-8?q?Document=20`SharedMemory`=20=C3=97=20`s?= =?UTF-8?q?ubint=5Fforkserver`=20incompat?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New `ai/conc-anal/` doc: `mp.SharedMemory` is fork-without-exec unsafe — child inherits parent's `resource_tracker` fd → EBADF on first shm op; leaked `/shm_list` cascades `FileExistsError` across parametrize variants. Canonical CPython issue class, NOT a tractor bug. Includes two longer-term mitigation paths (reset inherited tracker fd vs migrate off `mp.shared_memory`). 
Also, update `tests/test_shm.py`: - comment out `subint_forkserver` from skip list - rewrite reason with precise failure-mode descriptions + link to the analysis doc (this patch was generated in some part by [`claude-code`][claude-code-gh]) [claude-code-gh]: https://github.com/anthropics/claude-code (cherry picked from commit c99d475d0317f9e0fbbb4d51abc299c90e4d965a) (factored: dropped spawn-backend-only paths: ai/conc-anal/subint_forkserver_mp_shared_memory_issue.md) --- tests/test_shm.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/tests/test_shm.py b/tests/test_shm.py index 61bcdee20..8ea434570 100644 --- a/tests/test_shm.py +++ b/tests/test_shm.py @@ -16,14 +16,18 @@ pytestmark = pytest.mark.skipon_spawn_backend( 'subint', - 'subint_forkserver', + # 'subint_forkserver', reason=( 'subint: GIL-contention hanging class.\n' 'subint_forkserver: `multiprocessing.SharedMemory` ' - 'has known issues with fork-without-exec (mp\'s ' - 'resource_tracker and SharedMemory internals assume ' - 'fresh-process state). RemoteActorError surfaces from ' - 'the shm-attach path. TODO, put issue link!\n' + 'is fork-without-exec unsafe — child inherits parent\'s ' + '`resource_tracker` fd → EBADF on first shm op ' + '(`test_child_attaches_alot`); leaked `/shm_list` from ' + 'a "passing" run cascades into `FileExistsError` across ' + 'parametrize variants (`test_parent_writer_child_reader`). 
' + 'Canonical CPython issue class, NOT a tractor bug; full ' + 'tracker doc:\n' + 'ai/conc-anal/subint_forkserver_mp_shared_memory_issue.md' ) ) From 0b6a7aa1a99fc5508bc6e3dc67d0d087310cb136 Mon Sep 17 00:00:00 2001 From: goodboy Date: Tue, 9 Jun 2026 20:22:38 -0400 Subject: [PATCH 023/110] Fix `SharedMemory` under `subint_forkserver` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implements the resolution described in c99d475d's `subint_forkserver_mp_shared_memory_issue.md` (now updated with the resolution post-mortem). Two-part fix that side-steps `mp.resource_tracker` entirely rather than try to make it fork-safe — turns out that's both simpler AND more correct given tractor already SC-manages allocation lifetimes. Deats, - `tractor/ipc/_mp_bs.py::disable_mantracker()`: drop the `platform.python_version_tuple()[:-1] >= ('3', '13')` branch — patches now run unconditionally: * monkey-patch `mp.resource_tracker. _resource_tracker` to a no-op `ManTracker` subclass (empty `register` / `unregister` / `ensure_running`). * return `partial(SharedMemory, track=False)` for the per-allocation opt-out. * belt + suspenders: even if something dodges the wrapper, the singleton can't talk to the inherited (broken) parent fd. - `tractor/ipc/_shm.py::open_shm_list()`: drop the 3.13+ conditional skip of the unlink-callback; install a `try_unlink()` wrapper that swallows `FileNotFoundError` (sibling-already-cleaned race in shared-key setups). Without `mp.resource_tracker` doing it for us, we own the unlink — `actor. lifetime_stack` is the right place since tractor already controls actor lifecycle. - `tests/test_shm.py`: uncomment-out `subint_forkserver` from the module-level skip- list (tests pass now). Inline comment cross-refs the two `_mp_bs` / `_shm` workarounds. - `ai/conc-anal/subint_forkserver_mp_shared_memory_ issue.md`: heavy rewrite — flips status from "open / unresolvable in tractor" to "resolved, kept as decision record". 
Adds Resolution section, "Why this is the right call" rationale (mp tracker is widely criticized; tractor already owns lifecycle), trade-offs (crash-leaked segments, lost mp leak warning), verification (7 passed under both `subint_forkserver` and `trio` backends), and upstream issue links (this patch was generated in some part by [`claude-code`][claude-code-gh]) [claude-code-gh]: https://github.com/anthropics/claude-code (cherry picked from commit aa3e2309266f29329bc31edeef3c6af2b3111f34) (factored: dropped subint_forkserver conc-anal doc update) --- tests/test_shm.py | 4 +++ tractor/ipc/_mp_bs.py | 70 ++++++++++++++++++------------------------- tractor/ipc/_shm.py | 25 +++++++++++----- 3 files changed, 51 insertions(+), 48 deletions(-) diff --git a/tests/test_shm.py b/tests/test_shm.py index 8ea434570..d6ad93f4f 100644 --- a/tests/test_shm.py +++ b/tests/test_shm.py @@ -17,6 +17,10 @@ pytestmark = pytest.mark.skipon_spawn_backend( 'subint', # 'subint_forkserver', + # XXX we hack around this stdlib limitation by both, + # - setting `ShareMemory(track=False)` + # - overriding the `mp.ResourceTracker` nonsense in + # `.ipc._mp_bs`. reason=( 'subint: GIL-contention hanging class.\n' 'subint_forkserver: `multiprocessing.SharedMemory` ' diff --git a/tractor/ipc/_mp_bs.py b/tractor/ipc/_mp_bs.py index 462291c6b..7f2092d24 100644 --- a/tractor/ipc/_mp_bs.py +++ b/tractor/ipc/_mp_bs.py @@ -17,7 +17,7 @@ Utils to tame mp non-SC madeness ''' -import platform +from functools import partial def disable_mantracker(): @@ -27,49 +27,37 @@ def disable_mantracker(): ''' from multiprocessing.shared_memory import SharedMemory - + from multiprocessing import ( + resource_tracker as mantracker, + ) + + # XXX ALWAYS disable the stdlib's "resource tracker"; it prevents + # fork backends and never was useful to us since we're SC + # lifetime managing all allocations. 
+ class ManTracker(mantracker.ResourceTracker): + def register(self, name, rtype): + pass + + def unregister(self, name, rtype): + pass + + def ensure_running(self): + pass + + # "know your land and know your prey" + # https://www.dailymotion.com/video/x6ozzco + mantracker._resource_tracker = ManTracker() + mantracker.register = mantracker._resource_tracker.register + mantracker.ensure_running = mantracker._resource_tracker.ensure_running + mantracker.unregister = mantracker._resource_tracker.unregister + mantracker.getfd = mantracker._resource_tracker.getfd # 3.13+ only.. can pass `track=False` to disable # all the resource tracker bs. # https://docs.python.org/3/library/multiprocessing.shared_memory.html - if (_py_313 := ( - platform.python_version_tuple()[:-1] - >= - ('3', '13') - ) - ): - from functools import partial - return partial( - SharedMemory, - track=False, - ) - - # !TODO, once we drop 3.12- we can obvi remove all this! - else: - from multiprocessing import ( - resource_tracker as mantracker, - ) - - # Tell the "resource tracker" thing to fuck off. 
- class ManTracker(mantracker.ResourceTracker): - def register(self, name, rtype): - pass - - def unregister(self, name, rtype): - pass - - def ensure_running(self): - pass - - # "know your land and know your prey" - # https://www.dailymotion.com/video/x6ozzco - mantracker._resource_tracker = ManTracker() - mantracker.register = mantracker._resource_tracker.register - mantracker.ensure_running = mantracker._resource_tracker.ensure_running - mantracker.unregister = mantracker._resource_tracker.unregister - mantracker.getfd = mantracker._resource_tracker.getfd - - # use std type verbatim - shmT = SharedMemory + shmT = partial( + SharedMemory, + track=False, + ) return shmT diff --git a/tractor/ipc/_shm.py b/tractor/ipc/_shm.py index b60fafcce..f0225d707 100644 --- a/tractor/ipc/_shm.py +++ b/tractor/ipc/_shm.py @@ -929,15 +929,26 @@ def open_shm_list( # "close" attached shm on actor teardown try: actor = tractor.current_actor() - actor.lifetime_stack.callback(shml.shm.close) - # XXX on 3.13+ we don't need to call this? - # -> bc we pass `track=False` for `SharedMemeory` orr? 
- if ( - platform.python_version_tuple()[:-1] < ('3', '13') - ): - actor.lifetime_stack.callback(shml.shm.unlink) + # >XXX NOTE< on 3.13+ we need to call this AS WELL AS pass + # `track=False` for `mp.SharedMemeory` otherwise fork based + # backends will error out due to long lived stdlib + # limitations, + # - https://bugs.python.org/issue38119 + # - https://bugs.python.org/issue45209 + # + def try_unlink(): + try: + shml.shm.unlink() + except FileNotFoundError as fne: + log.debug( + f'ShmList already deallocated pre-actor-shutdown.\n' + f'{fne!r}\n' + ) + + actor.lifetime_stack.callback(try_unlink) + except RuntimeError: log.warning('tractor runtime not active, skipping teardown steps') From e5ca5bb01788df9f5d95e1c2842de02420f591e7 Mon Sep 17 00:00:00 2001 From: goodboy Date: Tue, 9 Jun 2026 20:22:56 -0400 Subject: [PATCH 024/110] Add `--shm` orphan sweep to `tractor-reap` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since `tractor.ipc._mp_bs.disable_mantracker()` turns off `mp.resource_tracker` entirely (see the conc-anal doc `subint_forkserver_mp_shared_memory_issue.md`), a hard-crashing actor can leave `/dev/shm/` segments that nothing else GCs. New `tractor-reap` phase 2 sweeps them. Deats, - `tractor/_testing/_reap.py`: add `find_orphaned_shm()` + `reap_shm()` helpers. Match criteria: regular file under `/dev/shm`, owned by current uid, AND no live proc has it open (mmap'd or fd-held). In-use enumeration via `psutil.Process.memory_maps()` + `.open_files()` — xplatform, kernel-canonical (same answer `lsof` would give), no reliance on tractor-specific shm-key naming. - `_ensure_shm_supported()` guard: helpers raise `NotImplementedError` outside Linux/FreeBSD bc macOS POSIX shm has no fs-visible path (`shm_open` only) and Windows is a different story. - `scripts/tractor-reap`: new `--shm` (run after process reap) and `--shm-only` (skip process phase) flags. `-n` dry-runs both phases. 
Exit code is `1` if either phase had survivors/errors. - `pyproject.toml` + `uv.lock`: add `psutil>=7.0.0` to the `testing` dep group; lazy-imported in `_reap.py` so the process-reap path stays import-clean without it. Also, - doc `--shm` in `.claude/skills/run-tests/SKILL.md` (new section 10c) — covers match criteria + the preservation guarantee for unrelated apps. - flip mitigation status in `subint_forkserver_mp_shared_memory_issue.md` from "could extend `tractor-reap`" to "implemented", with a note that callers should still UUID-pin shm keys to avoid cross-session collisions. Verified locally vs 81 in-use segments held by `piker`, `lttng-ust-*`, `aja-shm-*` — all preserved; only the genuinely-orphaned tractor segments got unlinked. (this patch was generated in some part by [`claude-code`][claude-code-gh]) [claude-code-gh]: https://github.com/anthropics/claude-code (cherry picked from commit 4f12d69b41211c56454c31fc1335ac658cf77416) (factored: dropped subint_forkserver conc-anal doc update) --- .claude/skills/run-tests/SKILL.md | 38 +++++ pyproject.toml | 5 + scripts/tractor-reap | 116 ++++++++++---- tractor/_testing/_reap.py | 252 ++++++++++++++++++++++++++++-- 4 files changed, 372 insertions(+), 39 deletions(-) diff --git a/.claude/skills/run-tests/SKILL.md b/.claude/skills/run-tests/SKILL.md index 4212e25a5..deb359081 100644 --- a/.claude/skills/run-tests/SKILL.md +++ b/.claude/skills/run-tests/SKILL.md @@ -592,3 +592,41 @@ to force-reap under a still-live supervisor. active in another terminal. It's safe (won't touch that session's live children in orphan-mode) but can race if the target session is mid-teardown. + +### c) `--shm` / `--shm-only`: orphan-segment sweep + +Because `tractor.ipc._mp_bs.disable_mantracker()` +turns off `mp.resource_tracker` (see +`ai/conc-anal/subint_forkserver_mp_shared_memory_issue.md`), +a hard-crashing actor can leave `/dev/shm/` +segments behind that nothing else GCs. 
+ +```sh +# process reap THEN shm sweep +scripts/tractor-reap --shm + +# shm sweep only (skip process phase) +scripts/tractor-reap --shm-only + +# dry-run: list candidates, don't unlink +scripts/tractor-reap --shm -n +``` + +**Match criteria** (very conservative — this is a +shared-system path, can't be wrong): +- segment is a regular file under `/dev/shm`, +- owned by the **current uid** (`stat.st_uid`), +- AND **no live process holds it open** — + enumerated by walking every readable + `/proc//maps` (post-mmap mappings) AND + `/proc//fd/*` (pre-mmap shm-opened fds). + +The "nobody has it open" check is the +kernel-canonical "is this leaked?" test — same +answer `lsof /dev/shm/` would give. No +reliance on tractor-specific naming, so it works +for any tractor app. Critically, it WILL NOT touch +segments held by other apps you have running +(e.g. `piker`, `lttng-ust-*`, `aja-shm-*` — +verified locally with 81 in-use segments correctly +preserved). diff --git a/pyproject.toml b/pyproject.toml index 9f6601cfc..6b2ac0414 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -89,6 +89,11 @@ testing = [ # known-hanging `subint`-backend audit tests; see # `ai/conc-anal/subint_*_issue.md`). "pytest-timeout>=2.3", + # used by `tractor._testing._reap` for the + # `tractor-reap` zombie-subactor + leaked-shm + # cleanup utility (xplatform `Process.memory_maps`, + # `Process.open_files`). + "psutil>=7.0.0", ] repl = [ "pyperclip>=1.9.0", diff --git a/scripts/tractor-reap b/scripts/tractor-reap index 092208878..3640d210e 100755 --- a/scripts/tractor-reap +++ b/scripts/tractor-reap @@ -4,14 +4,26 @@ # # SPDX-License-Identifier: AGPL-3.0-or-later ''' -`tractor-reap` — SC-polite zombie-subactor reaper. +`tractor-reap` — SC-polite zombie-subactor reaper + +optional `/dev/shm/` orphan-segment sweep. 
-Finds `tractor` subactor processes left alive after a -`pytest` (or any tractor-app) run that failed to fully -cancel its actor tree, then sends SIGINT with a bounded -grace window before escalating to SIGKILL. +Two cleanup phases (run in order when both are enabled): -Detection modes (auto-selected): +1. **process reap** — finds `tractor` subactor processes + left alive after a `pytest` (or any tractor-app) run + that failed to fully cancel its actor tree, then sends + SIGINT with a bounded grace window before escalating + to SIGKILL. + +2. **shm sweep** (`--shm` / `--shm-only`) — unlinks + `/dev/shm/` entries owned by the current uid + that no live process has open (mmap'd or fd-held). + Needed because `tractor` disables + `mp.resource_tracker` (see `tractor.ipc._mp_bs`), so a + hard-crashing actor leaves leaked segments that + nothing else GCs. + +Process-reap detection modes (auto-selected): --parent : descendant-mode — kill procs whose PPid == . Use when a parent @@ -29,14 +41,21 @@ Detection modes (auto-selected): Usage: - # after a pytest run crashed/was Ctrl+C'd + # process reap only (default) scripts/tractor-reap + # process reap + shm sweep + scripts/tractor-reap --shm + + # only the shm sweep, skip process reap + scripts/tractor-reap --shm-only + # from inside a still-live supervisor scripts/tractor-reap --parent 12345 - # dry-run: list what would be reaped, don't signal + # dry-run: list what would be reaped, don't act scripts/tractor-reap -n + scripts/tractor-reap --shm -n ''' import argparse @@ -83,7 +102,21 @@ def main() -> int: parser.add_argument( '--dry-run', '-n', action='store_true', - help='list matched pids but do not signal', + help='list matched pids/paths but do not signal/unlink', + ) + parser.add_argument( + '--shm', + action='store_true', + help=( + 'after process reap, also unlink orphaned ' + '/dev/shm segments owned by the current user ' + 'that no live process is mapping or holding open' + ), + ) + parser.add_argument( + 
'--shm-only', + action='store_true', + help='skip process reap; only do the shm sweep', ) args = parser.parse_args() @@ -95,29 +128,54 @@ def main() -> int: from tractor._testing._reap import ( find_descendants, find_orphans, + find_orphaned_shm, reap, + reap_shm, ) - if args.parent is not None: - pids: list[int] = find_descendants(args.parent) - mode: str = f'descendants of PPid={args.parent}' - else: - pids = find_orphans(repo) - mode = f'orphans (PPid=1, cwd={repo})' - - if not pids: - print(f'[tractor-reap] no {mode} to reap') - return 0 - - if args.dry_run: - print(f'[tractor-reap] dry-run — {mode}:\n {pids}') - return 0 - - signalled, survivors = reap(pids, grace=args.grace) - # exit 0 if everyone exited cleanly, else 1 to signal - # escalation happened — makes the command useful in - # CI health-checks and `||`-chaining. - return 0 if not survivors else 1 + rc: int = 0 + + # --- phase 1: process reap (skipped under --shm-only) --- + if not args.shm_only: + if args.parent is not None: + pids: list[int] = find_descendants(args.parent) + mode: str = f'descendants of PPid={args.parent}' + else: + pids = find_orphans(repo) + mode = f'orphans (PPid=1, cwd={repo})' + + if not pids: + print(f'[tractor-reap] no {mode} to reap') + elif args.dry_run: + print( + f'[tractor-reap] dry-run — {mode}:\n {pids}' + ) + else: + _, survivors = reap(pids, grace=args.grace) + if survivors: + rc = 1 + + # --- phase 2: shm sweep (opt-in) --- + if args.shm or args.shm_only: + leaked: list[str] = find_orphaned_shm() + if not leaked: + print( + '[tractor-reap] no orphaned /dev/shm ' + 'segments to sweep' + ) + elif args.dry_run: + print( + f'[tractor-reap] dry-run — {len(leaked)} ' + f'orphaned shm segment(s):\n {leaked}' + ) + else: + _, errors = reap_shm(leaked) + if errors: + rc = 1 + + # exit 0 if everything cleaned cleanly, else 1 — useful + # for CI health-check chaining. 
+ return rc if __name__ == '__main__': diff --git a/tractor/_testing/_reap.py b/tractor/_testing/_reap.py index 3e2309ffa..f16c22d30 100644 --- a/tractor/_testing/_reap.py +++ b/tractor/_testing/_reap.py @@ -16,17 +16,25 @@ ''' Zombie-subactor reaper — SC-polite (SIGINT first, SIGKILL -as last resort with a bounded grace window). +as last resort with a bounded grace window) plus optional +`/dev/shm/` orphan-segment sweep. Shared implementation between the `tractor-reap` CLI (`scripts/tractor-reap`) and the pytest session-scoped auto-fixture that guards the test suite against leftover subactor processes. -Design notes ------------- +Design notes — process reap +--------------------------- + +- Linux-only today: reads `/proc//{status,cwd,cmdline}`. + Module imports cleanly elsewhere; calling `find_*` on a + non-Linux box returns an empty list (no `/proc` + enumeration). A future xplatform pass could swap this + for `psutil.Process.children()` / + `psutil.process_iter()` since `psutil` is already a + test-time dependency. -- Linux-only: reads `/proc//{status,cwd,cmdline}`. - Two detection modes: 1. **descendant-mode** — when invoked from a still-live @@ -49,14 +57,71 @@ we want the subactor runtime to run its trio cancel shield + IPC teardown paths where it can. +Design notes — shm sweep +------------------------ + +Since `tractor/ipc/_mp_bs.disable_mantracker()` turns off +`mp.resource_tracker` entirely, a hard-crashing actor can +leave `/dev/shm/` segments behind that nothing else +GCs (see +`ai/conc-anal/subint_forkserver_mp_shared_memory_issue.md`, +"Trade-offs / known gaps"). + +The shm sweep is **Linux-/FreeBSD-only**: both expose +POSIX shared-memory segments as regular files under +`/dev/shm`, so `os.stat()` + `os.unlink()` are the +correct primitives. macOS POSIX shm has no fs-visible +path (segments live behind `shm_open`/`shm_unlink` +syscalls only), and Windows is a different story +entirely. 
Calling the shm helpers on an unsupported +platform raises `NotImplementedError`. + +In-use enumeration delegates to `psutil` — +`Process.memory_maps()` (post-mmap) + +`Process.open_files()` (pre-mmap shm-opened fds) — +xplatform, mature, and handles the per-process +permission/race edge cases correctly. Segments matching +neither are genuinely leaked → safe to unlink. + +The "nobody has it open" check is the kernel-canonical +test — same answer `lsof /dev/shm/` would give. No +reliance on tractor-specific naming conventions (shm +keys are caller-defined). + ''' from __future__ import annotations import os import pathlib import signal +import stat +import sys import time +# `/dev/shm` is the POSIX-shm filesystem on Linux + FreeBSD. +# macOS uses `shm_open` syscalls without a fs-visible path, +# so the shm helpers refuse to run there. +_SHM_PLATFORM_OK: bool = sys.platform.startswith( + ('linux', 'freebsd') +) +SHM_DIR: str = '/dev/shm' + + +def _ensure_shm_supported() -> None: + ''' + Guard for shm helpers — they assume `/dev/shm` exists + as a tmpfs and `os.unlink()` is the right primitive. + Both true on Linux + FreeBSD; not true elsewhere. + + ''' + if not _SHM_PLATFORM_OK: + raise NotImplementedError( + f'shm reap is only supported on Linux/FreeBSD; ' + f'got sys.platform={sys.platform!r}. macOS ' + f'POSIX shm has no fs-visible path; Windows ' + f'has no /dev/shm equivalent.' 
+ ) + def _read_status_ppid(pid: int) -> int | None: ''' @@ -69,7 +134,11 @@ def _read_status_ppid(pid: int) -> int | None: for line in f: if line.startswith('PPid:'): return int(line.split()[1]) - except (FileNotFoundError, PermissionError, ProcessLookupError): + except ( + FileNotFoundError, + PermissionError, + ProcessLookupError, + ): return None return None @@ -77,21 +146,32 @@ def _read_status_ppid(pid: int) -> int | None: def _read_cwd(pid: int) -> str | None: try: return os.readlink(f'/proc/{pid}/cwd') - except (FileNotFoundError, PermissionError, ProcessLookupError): + except ( + FileNotFoundError, + PermissionError, + ProcessLookupError, + ): return None def _read_cmdline(pid: int) -> str: try: with open(f'/proc/{pid}/cmdline', 'rb') as f: - return f.read().replace(b'\0', b' ').decode(errors='replace') - except (FileNotFoundError, PermissionError, ProcessLookupError): + return f.read().replace(b'\0', b' ').decode( + errors='replace', + ) + except ( + FileNotFoundError, + PermissionError, + ProcessLookupError, + ): return '' def _iter_live_pids() -> list[int]: ''' - Enumerate currently-alive pids from `/proc`. + Enumerate currently-alive pids from `/proc`. Returns + `[]` on systems without `/proc` (e.g. macOS). ''' try: @@ -225,6 +305,158 @@ def _is_alive(pid: int) -> bool: if line.startswith('State:'): # e.g. 'State:\tZ (zombie)' return 'Z' not in line.split()[1] - except (FileNotFoundError, ProcessLookupError): + except ( + FileNotFoundError, + ProcessLookupError, + ): return False return True + + +def _enumerate_in_use_shm( + shm_dir: str = SHM_DIR, +) -> set[str]: + ''' + Return the set of `/` paths currently + held open by any live process — via `psutil`'s + xplatform `Process.memory_maps()` (post-mmap + segments) and `Process.open_files()` (pre-mmap + shm-opened fds). + + Lazy-imports `psutil` so the module stays importable + on installs without it (it's a `testing` group dep). 
+ + ''' + _ensure_shm_supported() + + # lazy + actionable failure: leaked shm sweep is the + # only thing in this module that needs psutil; we + # don't want a top-level ImportError breaking the + # process-reap path. + try: + import psutil + except ImportError as exc: + raise RuntimeError( + 'shm reap requires `psutil` — install the ' + '`testing` dep group, e.g. ' + '`uv sync --group testing`.' + ) from exc + + in_use: set[str] = set() + prefix: str = shm_dir.rstrip('/') + '/' + for proc in psutil.process_iter(['pid']): + try: + for m in proc.memory_maps(grouped=False): + if m.path.startswith(prefix): + in_use.add(m.path) + for f in proc.open_files(): + if f.path.startswith(prefix): + in_use.add(f.path) + except ( + psutil.NoSuchProcess, + psutil.AccessDenied, + psutil.ZombieProcess, + FileNotFoundError, + PermissionError, + ): + # raced — proc died or we can't see its + # mappings (e.g. root-owned). Skip; missing + # an in-use entry only means we'd preserve + # something we could reap, never the + # reverse — safe-by-default. + continue + return in_use + + +def find_orphaned_shm( + *, + uid: int | None = None, + shm_dir: str = SHM_DIR, +) -> list[str]: + ''' + `/` paths that are: + + - owned by `uid` (default: the current effective uid), + - and currently held by NO live process — i.e. + genuinely leaked. + + Linux/FreeBSD only — see module docstring. No reliance + on caller-defined shm-key naming, so this works for + any tractor app (not just the test suite). + + ''' + _ensure_shm_supported() + + if uid is None: + uid = os.geteuid() + + try: + entries: list[str] = os.listdir(shm_dir) + except OSError: + return [] + + in_use: set[str] = _enumerate_in_use_shm(shm_dir=shm_dir) + leaked: list[str] = [] + prefix: str = shm_dir.rstrip('/') + '/' + for entry in entries: + path: str = prefix + entry + try: + st: os.stat_result = os.stat(path) + except OSError: + continue + # only regular files — skip subdirs / sockets etc. 
+ if not stat.S_ISREG(st.st_mode): + continue + if st.st_uid != uid: + continue + if path in in_use: + continue + leaked.append(path) + return leaked + + +def reap_shm( + paths: list[str], + *, + log=print, +) -> tuple[list[str], list[tuple[str, OSError]]]: + ''' + Unlink the given `/dev/shm/...` paths. + + Linux/FreeBSD only — `os.unlink()` is the correct + primitive on the POSIX-shm tmpfs there. macOS POSIX + shm has no fs-visible path; the equivalent there is + `posix_ipc.unlink_shared_memory(name)` (not + implemented here — see module docstring). + + Returns `(unlinked, errors)` where `errors` is a list + of `(path, exc)` for paths that could not be removed + (e.g. permissions). Paths that raced to being already- + gone are counted as successfully unlinked. + + ''' + _ensure_shm_supported() + + unlinked: list[str] = [] + errors: list[tuple[str, OSError]] = [] + for path in paths: + try: + os.unlink(path) + unlinked.append(path) + except FileNotFoundError: + # raced — already gone, treat as success + unlinked.append(path) + except OSError as exc: + errors.append((path, exc)) + + if unlinked: + log( + f'[tractor-reap] unlinked {len(unlinked)} ' + f'orphaned shm segment(s): {unlinked}' + ) + for path, exc in errors: + log( + f'[tractor-reap] could not unlink {path}: ' + f'{exc!r}' + ) + return (unlinked, errors) From d5a15b4d70069d20f63155e4df78107ecdd73a1f Mon Sep 17 00:00:00 2001 From: goodboy Date: Mon, 27 Apr 2026 11:46:43 -0400 Subject: [PATCH 025/110] Bump `test_stale_entry_is_deleted`'s timeout to 30 Seems that when run in-suite it delays more than the so-measured "happy path" timing; better to have no suite-global interruption than asserting a fast single test's run. 
(cherry picked from commit 65fcfbf2246663584b89b6cc1e49507eed974804) --- tests/discovery/test_registrar.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/tests/discovery/test_registrar.py b/tests/discovery/test_registrar.py index d87a6861b..618c93ad6 100644 --- a/tests/discovery/test_registrar.py +++ b/tests/discovery/test_registrar.py @@ -520,6 +520,10 @@ async def kill_transport( +# ?TODO, do a OSc style signalling test on this? +# -[ ] doesn't work for fork backends +# @pytest.mark.parametrize('use_signal', [False, True]) +# # Wall-clock bound via `pytest-timeout` (`method='thread'`). # Under `--spawn-backend=subint` this test can wedge in an # un-Ctrl-C-able state (abandoned-subint + shared-GIL @@ -532,19 +536,21 @@ async def kill_transport( # the intended behavior here; the alternative is an unattended # suite run that never returns. @pytest.mark.timeout( - 3, # NOTE should be a 2.1s happy path. + 30, + # NOTE should be a 2.1s happy path. + # XXX for `subint_forkserver` this is SUPER SENSITIVE so keep it + # higher to avoid flaky runs.. method='thread', ) @pytest.mark.skipon_spawn_backend( 'subint', + # 'subint_forkserver', reason=( 'XXX SUBINT HANGING TEST XXX\n' 'See oustanding issue(s)\n' # TODO, put issue link! ) ) -# @pytest.mark.parametrize('use_signal', [False, True]) -# def test_stale_entry_is_deleted( debug_mode: bool, daemon: subprocess.Popen, @@ -558,7 +564,6 @@ def test_stale_entry_is_deleted( ''' async def main(): - name: str = 'transport_fails_actor' _reg_ptl: tractor.Portal an: tractor.ActorNursery @@ -591,6 +596,14 @@ async def main(): await ptl.cancel_actor() await an.cancel() + # XXX, for tracing if this starts being flaky again.. + # + # async def _timeout_main(): + # with trio.move_on_after(4) as cs: + # await main() + # if cs.cancel_called: + # await tractor.pause() + # TODO, remove once the `[subint]` variant no longer hangs. 
# # Status (as of Phase B hard-kill landing): @@ -641,3 +654,4 @@ async def main(): path=f'/tmp/test_stale_entry_is_deleted_{start_method}.dump', ): trio.run(main) + # trio.run(_timeout_main) From a4554106fc851a1ef75a73186c1376a056e57a3d Mon Sep 17 00:00:00 2001 From: goodboy Date: Mon, 27 Apr 2026 12:59:00 -0400 Subject: [PATCH 026/110] Wire `test_dynamic_pub_sub` to standard fixtures Pull in the `reg_addr`, `debug_mode`, and `test_log` fixtures so this test follows the same conventions as the rest of the suite: - pass `registry_addrs=[reg_addr]` + `debug_mode` into `tractor.open_nursery()` (so `--tpdb` etc work). - after the `pytest.raises` block, add `assert err` + `test_log.exception('Timed out AS EXPECTED')` so the expected timeout is logged explicitly instead of swallowed. Also, - drop whitespace-only blank lines around the `subs` param of `consumer()` and `ctx` param of `one_task_streams_and_one_handles_reqresp()`. - promote `test_sigint_both_stream_types`'s one-line docstring to multi-line form. 
(this commit msg was generated in some part by [`claude-code`][claude-code-gh]) [claude-code-gh]: https://github.com/anthropics/claude-code (cherry picked from commit 9b05f659b3dbdcf6ad9869e18ca2eb62b91fbdc1) --- tests/test_advanced_streaming.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/tests/test_advanced_streaming.py b/tests/test_advanced_streaming.py index 907a21964..89191a4b7 100644 --- a/tests/test_advanced_streaming.py +++ b/tests/test_advanced_streaming.py @@ -76,9 +76,7 @@ async def subscribe( async def consumer( - subs: list[str], - ) -> None: uid = tractor.current_actor().uid @@ -108,15 +106,21 @@ async def consumer( print(f'{uid} got: {value}') -def test_dynamic_pub_sub(): - +def test_dynamic_pub_sub( + reg_addr: tuple, + debug_mode: bool, + test_log: tractor.log.StackLevelAdapter, +): global _registry from multiprocessing import cpu_count cpus = cpu_count() async def main(): - async with tractor.open_nursery() as n: + async with tractor.open_nursery( + registry_addrs=[reg_addr], + debug_mode=debug_mode, + ) as n: # name of this actor will be same as target func await n.run_in_actor(publisher) @@ -155,12 +159,13 @@ async def main(): else: pytest.fail('Never got a `TooSlowError` ?') + assert err + test_log.exception('Timed out AS EXPECTED') + @tractor.context async def one_task_streams_and_one_handles_reqresp( - ctx: tractor.Context, - ) -> None: await ctx.started() @@ -257,7 +262,8 @@ async def echo_ctx_stream( def test_sigint_both_stream_types(): - '''Verify that running a bi-directional and recv only stream + ''' + Verify that running a bi-directional and recv only stream side-by-side will cancel correctly from SIGINT. 
''' From 149e1e1993c106f34faa2ebe92c96af645a67153 Mon Sep 17 00:00:00 2001 From: goodboy Date: Mon, 27 Apr 2026 13:52:28 -0400 Subject: [PATCH 027/110] Wire `reg_addr` into `test_context_stream_semantics` Same wire-up pattern as the prior `test_dynamic_pub_sub` commit: each test that already pulled in `debug_mode` now also pulls in `reg_addr` and passes `registry_addrs=[reg_addr]` into `tractor.open_nursery()`, so the suite's standard registry-addr conventions apply. Tests touched: - `test_started_misuse` - `test_simple_context` - `test_parent_cancels` - `test_one_end_stream_not_opened` - `test_maybe_allow_overruns_stream` - `test_ctx_with_self_actor` (this commit msg was generated in some part by [`claude-code`][claude-code-gh]) [claude-code-gh]: https://github.com/anthropics/claude-code (cherry picked from commit 66f1941f464c31cf485d8d8faa894f38df042606) --- tests/test_context_stream_semantics.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/test_context_stream_semantics.py b/tests/test_context_stream_semantics.py index 6d7de4d60..5548ed17b 100644 --- a/tests/test_context_stream_semantics.py +++ b/tests/test_context_stream_semantics.py @@ -115,10 +115,12 @@ async def not_started_but_stream_opened( ) def test_started_misuse( target: Callable, + reg_addr: tuple, debug_mode: bool, ): async def main(): async with tractor.open_nursery( + registry_addrs=[reg_addr], debug_mode=debug_mode, ) as an: portal = await an.start_actor( @@ -184,6 +186,7 @@ def test_simple_context( error_parent, child_blocks_forever, pointlessly_open_stream, + reg_addr: tuple, debug_mode: bool, ): @@ -193,6 +196,7 @@ async def main(): with trio.fail_after(timeout): async with tractor.open_nursery( + registry_addrs=[reg_addr], debug_mode=debug_mode, ) as an: portal = await an.start_actor( @@ -278,6 +282,7 @@ def test_parent_cancels( cancel_method: str, chk_ctx_result_before_exit: bool, child_returns_early: bool, + reg_addr: tuple, debug_mode: bool, ): ''' @@ -355,6 +360,7 
@@ async def check_canceller( async def main(): async with tractor.open_nursery( + registry_addrs=[reg_addr], debug_mode=debug_mode, ) as an: portal = await an.start_actor( @@ -931,6 +937,7 @@ async def keep_sending_from_child( ) def test_one_end_stream_not_opened( overrun_by: tuple[str, int, Callable], + reg_addr: tuple, debug_mode: bool, ): ''' @@ -949,6 +956,7 @@ def test_one_end_stream_not_opened( async def main(): async with tractor.open_nursery( + registry_addrs=[reg_addr], debug_mode=debug_mode, ) as an: portal = await an.start_actor( @@ -1113,6 +1121,7 @@ def test_maybe_allow_overruns_stream( # conftest wide loglevel: str, + reg_addr: tuple, debug_mode: bool, ): ''' @@ -1133,6 +1142,7 @@ def test_maybe_allow_overruns_stream( ''' async def main(): async with tractor.open_nursery( + registry_addrs=[reg_addr], debug_mode=debug_mode, ) as an: portal = await an.start_actor( @@ -1249,6 +1259,7 @@ async def main(): def test_ctx_with_self_actor( loglevel: str, + reg_addr: tuple, debug_mode: bool, ): ''' @@ -1263,6 +1274,7 @@ def test_ctx_with_self_actor( ''' async def main(): async with tractor.open_nursery( + registry_addrs=[reg_addr], debug_mode=debug_mode, enable_modules=[__name__], ) as an: From f7a58b82fe78cb58032ffd16b59b0abc0b5298f6 Mon Sep 17 00:00:00 2001 From: goodboy Date: Tue, 9 Jun 2026 20:23:12 -0400 Subject: [PATCH 028/110] =?UTF-8?q?Sweep=20`subint=5Fforkserver`=20?= =?UTF-8?q?=E2=86=92=20`main=5Fthread=5Fforkserver`=20in=20code?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After the variant-1 / variant-2 backend split, update remaining string-match refs to the variant-1 backend so user-visible gates + skip-marks + comments name the working backend correctly: - `tractor._root._DEBUG_COMPATIBLE_BACKENDS`: include `main_thread_forkserver`, drop the stub-only `subint_forkserver` entry. - `tests/test_spawning.py::test_loglevel_propagated_to_subactor`: capfd-skip flips to `main_thread_forkserver`. 
- `tests/test_infected_asyncio.py::test_sigint_closes_lifetime_stack`: xfail-condition flips to `main_thread_forkserver`. - `tests/test_shm.py`: drop stale "broken on `main_thread_forkserver`" reason-text since the `mp.SharedMemory(track=False)` + resource-tracker monkey-patch in `.ipc._mp_bs` makes the tests pass; the skip-mark only fires on plain `subint` now. - Comment / docstring sweep: `runtime._state`, `runtime._runtime`, `_testing.pytest`, `_subint.py`, `pyproject.toml`, `test_cancellation.py`, `test_registrar.py` — refs to variant-1 backend updated. (this patch was generated in some part by [`claude-code`][claude-code-gh]) [claude-code-gh]: https://github.com/anthropics/claude-code (cherry picked from commit 205382a39be6ecfc602c5b8a69f4cbba25348721) (factored: dropped spawn-backend-only path: tractor/spawn/_subint.py) --- pyproject.toml | 2 +- tests/discovery/test_registrar.py | 6 +++--- tests/test_cancellation.py | 8 ++++++-- tests/test_infected_asyncio.py | 2 +- tests/test_shm.py | 22 ++++++++-------------- tests/test_spawning.py | 2 +- tractor/_root.py | 2 +- tractor/_testing/pytest.py | 2 +- tractor/runtime/_runtime.py | 2 +- tractor/runtime/_state.py | 6 +++--- 10 files changed, 26 insertions(+), 28 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 6b2ac0414..9c1dc8160 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -250,7 +250,7 @@ addopts = [ '--show-capture=no', # sys-level capture. REQUIRED for fork-based spawn - # backends (e.g. `subint_forkserver`): default + # backends (e.g. 
`main_thread_forkserver`): default # `--capture=fd` redirects fd 1,2 to temp files, and fork # children inherit those fds — opaque deadlocks happen in # the pytest-capture-machinery ↔ fork-child stdio diff --git a/tests/discovery/test_registrar.py b/tests/discovery/test_registrar.py index 618c93ad6..d7fa15c28 100644 --- a/tests/discovery/test_registrar.py +++ b/tests/discovery/test_registrar.py @@ -538,13 +538,13 @@ async def kill_transport( @pytest.mark.timeout( 30, # NOTE should be a 2.1s happy path. - # XXX for `subint_forkserver` this is SUPER SENSITIVE so keep it - # higher to avoid flaky runs.. + # XXX for `main_thread_forkserver` this is SUPER SENSITIVE + # so keep it higher to avoid flaky runs.. method='thread', ) @pytest.mark.skipon_spawn_backend( 'subint', - # 'subint_forkserver', + # 'main_thread_forkserver', reason=( 'XXX SUBINT HANGING TEST XXX\n' 'See oustanding issue(s)\n' diff --git a/tests/test_cancellation.py b/tests/test_cancellation.py index 27a5eee2a..28168fd6d 100644 --- a/tests/test_cancellation.py +++ b/tests/test_cancellation.py @@ -452,8 +452,12 @@ async def spawn_and_error( await nursery.run_in_actor(*args, **kwargs) -# NOTE: subint_forkserver skip handled by file-level `pytestmark` -# above (same pytest-capture-fd hang class as siblings). +# NOTE: `main_thread_forkserver` capture-fd hang class is no +# longer skipped here — `--capture=sys` (the new `pyproject.toml` +# default) sidesteps the pipe-buffer-fill deadlock for +# `test_nested_multierrors`. See +# `ai/conc-anal/subint_forkserver_test_cancellation_leak_issue.md` +# / #449 for the post-mortem. 
@pytest.mark.timeout( 10, method='thread', diff --git a/tests/test_infected_asyncio.py b/tests/test_infected_asyncio.py index e13df3251..d3524a6b5 100644 --- a/tests/test_infected_asyncio.py +++ b/tests/test_infected_asyncio.py @@ -1113,7 +1113,7 @@ async def main(): if ( send_sigint_to == 'child' and - start_method == 'subint_forkserver' + start_method == 'main_thread_forkserver' ): pytest.xfail( reason=( diff --git a/tests/test_shm.py b/tests/test_shm.py index d6ad93f4f..84d0988ec 100644 --- a/tests/test_shm.py +++ b/tests/test_shm.py @@ -16,22 +16,16 @@ pytestmark = pytest.mark.skipon_spawn_backend( 'subint', - # 'subint_forkserver', - # XXX we hack around this stdlib limitation by both, - # - setting `ShareMemory(track=False)` - # - overriding the `mp.ResourceTracker` nonsense in - # `.ipc._mp_bs`. + # NOTE, `main_thread_forkserver` works for these tests + # via the `mp.SharedMemory(track=False)` + + # `mp.resource_tracker` monkey-patch in `.ipc._mp_bs`. + # Without that workaround the fork-inherited + # `resource_tracker` fd would EBADF on first shm op + + # cascade into `FileExistsError` across parametrize + # variants. Tracker doc: + # `ai/conc-anal/subint_forkserver_mp_shared_memory_issue.md`. reason=( 'subint: GIL-contention hanging class.\n' - 'subint_forkserver: `multiprocessing.SharedMemory` ' - 'is fork-without-exec unsafe — child inherits parent\'s ' - '`resource_tracker` fd → EBADF on first shm op ' - '(`test_child_attaches_alot`); leaked `/shm_list` from ' - 'a "passing" run cascades into `FileExistsError` across ' - 'parametrize variants (`test_parent_writer_child_reader`). 
' - 'Canonical CPython issue class, NOT a tractor bug; full ' - 'tracker doc:\n' - 'ai/conc-anal/subint_forkserver_mp_shared_memory_issue.md' ) ) diff --git a/tests/test_spawning.py b/tests/test_spawning.py index b0e8a88de..63a2fb8e1 100644 --- a/tests/test_spawning.py +++ b/tests/test_spawning.py @@ -194,7 +194,7 @@ def test_loglevel_propagated_to_subactor( reg_addr: tuple, level: str, ): - if start_method in ('mp_forkserver', 'subint_forkserver'): + if start_method in ('mp_forkserver', 'main_thread_forkserver'): pytest.skip( "a bug with `capfd` seems to make forkserver capture not work? " "(same class as the `mp_forkserver` pre-existing skip — fork-" diff --git a/tractor/_root.py b/tractor/_root.py index 3c20fff07..233a89d7a 100644 --- a/tractor/_root.py +++ b/tractor/_root.py @@ -79,7 +79,7 @@ 'trio', # forkserver children run `_trio_main` in their own OS # process — same child-side runtime shape as `trio_proc`. - 'subint_forkserver', + 'main_thread_forkserver', ) diff --git a/tractor/_testing/pytest.py b/tractor/_testing/pytest.py index 89535b1f8..e6761b144 100644 --- a/tractor/_testing/pytest.py +++ b/tractor/_testing/pytest.py @@ -318,7 +318,7 @@ def _reap_orphaned_subactors(): grace window, then SIGKILL survivors. Rationale: under fork-based spawn backends (notably - `subint_forkserver`), a test that times out or bails + `main_thread_forkserver`), a test that times out or bails mid-teardown can leave subactor forks alive. Without this reap, they linger across sessions and compete for ports / inherit pytest's capture-pipe fds — which diff --git a/tractor/runtime/_runtime.py b/tractor/runtime/_runtime.py index 9dcca501c..9c9cbdea6 100644 --- a/tractor/runtime/_runtime.py +++ b/tractor/runtime/_runtime.py @@ -1760,7 +1760,7 @@ async def async_main( # shielded loop would park on the parent chan # indefinitely waiting for EOF that only arrives # after the PARENT tears down, which under - # fork-based backends (e.g. 
`subint_forkserver`) + # fork-based backends (e.g. `main_thread_forkserver`) # it waits on THIS actor's exit — deadlock. actor._parent_chan_cs = await root_tn.start( partial( diff --git a/tractor/runtime/_state.py b/tractor/runtime/_state.py index aedcc9520..b9316448b 100644 --- a/tractor/runtime/_state.py +++ b/tractor/runtime/_state.py @@ -122,8 +122,8 @@ def update( # `open_root_actor()` nor received a parent `SpawnSpec`. Kept # as a module-level constant so `get_runtime_vars(clear_values= # True)` can reset the live dict back to this baseline (see -# `tractor.spawn._subint_forkserver` for the one current caller -# that needs it). +# `tractor.spawn._main_thread_forkserver` for the one current +# caller that needs it). _RUNTIME_VARS_DEFAULTS: dict[str, Any] = { # root of actor-process tree info '_is_root': False, # bool @@ -165,7 +165,7 @@ def get_runtime_vars( defaults (`_RUNTIME_VARS_DEFAULTS`) instead of the live dict. Useful in combination with `set_runtime_vars()` to reset process-global state back to "cold" — the main caller - today is the `subint_forkserver` spawn backend's post-fork + today is the `main_thread_forkserver` spawn backend's post-fork child prelude: set_runtime_vars(get_runtime_vars(clear_values=True)) From d24ccaada165ef246231e31bb5cb660509434cba Mon Sep 17 00:00:00 2001 From: goodboy Date: Mon, 27 Apr 2026 20:15:20 -0400 Subject: [PATCH 029/110] Fix `_testing.addr.get_rando_addr` cross-process collisions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously the random port was a default-arg expression (`_rando_port: str = random.randint(1000, 9999)`) — evaluated ONCE at module import time, making it a per-process singleton. Two parallel pytest sessions had a 1/9000 birthday-pair chance of picking the same port; when it hit, every `reg_addr`-using test in BOTH runs would cascade-fail with "Address already in use". 
Switch to per-call `random.randint()` salted with `os.getpid()` so: - within one session: two calls return distinct ports — e.g. `test_tpt_bind_addrs::bind-subset-reg` now actually gets two different reg addrs on the TCP backend (it was silently duplicating before), - across parallel sessions: pid salt biases each process's port choices apart, making cross-run collisions vanishingly rare. Drop the bogus `: str` annotation (was always `int`). UDS already gets per-process isolation via `UDSAddress.get_random()`'s `@` socket-path suffix, so no change needed there. (this patch was generated in some part by [`claude-code`][claude-code-gh]) [claude-code-gh]: https://github.com/anthropics/claude-code (cherry picked from commit 7c5dd4d03350bd5889d15de30ab5d59e0660b98c) --- tractor/_testing/addr.py | 36 ++++++++++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/tractor/_testing/addr.py b/tractor/_testing/addr.py index 1cff80db6..6927db770 100644 --- a/tractor/_testing/addr.py +++ b/tractor/_testing/addr.py @@ -22,6 +22,7 @@ our `tractor.discovery` subsys? ''' +import os import random from typing import ( Type, @@ -31,17 +32,28 @@ def get_rando_addr( tpt_proto: str, - *, - - # choose random port at import time - _rando_port: str = random.randint(1000, 9999) - ) -> tuple[str, str|int]: ''' Used to globally override the runtime to the per-test-session-dynamic addr so that all tests never conflict with any other actor tree using the default. + Cross-process isolation: TCP-port picks salt + `random.randint()` with `os.getpid()` so two parallel + pytest sessions (e.g. one running `--tpt-proto=tcp` and + another `--tpt-proto=uds` concurrently) almost-never + collide on the same port. 
Without the salt, the prior + impl's import-time `random.randint(1000, 9999)` default + arg was effectively a process-singleton with a 1/9000 + chance of cross-run collision per pair — and when it + happened EVERY `reg_addr`-using test in BOTH runs would + fight over the bind, cascading into a chain of + "Address already in use" failures. + + For UDS this concern doesn't apply: `UDSAddress.get_random()` + already builds socket paths from `os.getpid()` so each + pytest process gets its own socket-path namespace. + ''' addr_type: Type[_addr.Addres] = _addr._address_types[tpt_proto] def_reg_addr: tuple[str, int] = _addr._default_lo_addrs[tpt_proto] @@ -51,9 +63,21 @@ def get_rando_addr( testrun_reg_addr: tuple[str, int|str] match tpt_proto: case 'tcp': + # Per-call randomness mixed with `os.getpid()` — + # see the docstring above for the cross-process + # isolation rationale. The mix means: + # - within one pytest session, two calls return + # distinct ports (good for tests that need a + # second-different-reg-addr in one fn body, e.g. + # `test_tpt_bind_addrs::bind-subset-reg`), + # - across parallel pytest sessions, the pid bias + # makes coincident port choices unlikely. + port: int = 1000 + ( + random.randint(0, 8999) + os.getpid() + ) % 9000 testrun_reg_addr = ( addr_type.def_bindspace, - _rando_port, + port, ) # NOTE, file-name uniqueness (no-collisions) will be based on From ec5141b720bf06ca1802fb6bf20b503699a3fd50 Mon Sep 17 00:00:00 2001 From: goodboy Date: Mon, 27 Apr 2026 21:41:02 -0400 Subject: [PATCH 030/110] Add opt-in `reap_subactors_per_test` fixture MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Function-scoped, NON-autouse zombie-subactor reaper for modules whose teardown is known-leaky enough to cascade- fail every following test in a session. Sibling to the autouse session-scoped `_reap_orphaned_subactors`. 
The session-scoped one fires at session end — too late to save tests that follow a hung/leaky test in the suite. The new fixture, opted into via `pytestmark = pytest.mark.usefixtures(...)`, runs between tests in a problem-module so a leftover subactor from test N can't squat on registrar ports / UDS paths / shm segments needed by tests N+1, N+2, ... Intentionally NOT autouse — the fixture's presence on a module signals "this module's teardown leaks; please root-cause instead of relying forever on cleanup". A visibility-vs-convenience trade picked in favor of the former. Apply to `tests/test_infected_asyncio.py` since both recent full-suite runs (parallel-tpt-proto + TCP-only) showed the cascade originating in this file's KBI- and SIGINT-flavored tests under `main_thread_forkserver`. Module-comment names the specific offenders so future de-flake work has a starting point. (this patch was generated in some part by [`claude-code`][claude-code-gh]) [claude-code-gh]: https://github.com/anthropics/claude-code (cherry picked from commit b376eb033262a45305cec72b896996f750c004a8) --- tests/test_infected_asyncio.py | 16 +++++++++++ tractor/_testing/pytest.py | 49 ++++++++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+) diff --git a/tests/test_infected_asyncio.py b/tests/test_infected_asyncio.py index d3524a6b5..8157e6d48 100644 --- a/tests/test_infected_asyncio.py +++ b/tests/test_infected_asyncio.py @@ -32,6 +32,22 @@ from tractor._testing import expect_ctxc +# Per-test zombie-subactor reaper. Opt-in (NOT autouse) — +# see `tractor._testing.pytest.reap_subactors_per_test`'s +# docstring for the full rationale. 
This module specifically +# needs it because tests like +# `test_echoserver_detailed_mechanics[KeyboardInterrupt]` +# and the `test_sigint_closes_lifetime_stack[*]` matrix have +# been observed to hang past pytest's wall-clock under +# `main_thread_forkserver`, leaving subactor forks that +# squat on registrar resources and cascade-fail every +# subsequent test (`test_inter_peer_cancellation`, +# `test_legacy_one_way_streaming`, etc.). +pytestmark = pytest.mark.usefixtures( + 'reap_subactors_per_test', +) + + @pytest.fixture( scope='module', ) diff --git a/tractor/_testing/pytest.py b/tractor/_testing/pytest.py index e6761b144..23c9630ca 100644 --- a/tractor/_testing/pytest.py +++ b/tractor/_testing/pytest.py @@ -342,6 +342,55 @@ def _reap_orphaned_subactors(): reap(pids, grace=3.0) +@pytest.fixture +def reap_subactors_per_test(): + ''' + Per-test (function-scoped) zombie-subactor reaper — + **opt-in**, NOT autouse. + + When a test's teardown fails to fully cancel its actor + tree (e.g. an asyncio cancel-cascade times out under + `main_thread_forkserver`, pytest hits its 200s wall- + clock and abandons), the leftover subactor lingers as a + direct child of `pytest` and squats on whatever + registrar port / UDS path / shm segment it had bound. + Subsequent tests trying to allocate the same resource + fail — and with backends that bind a session-shared + `reg_addr`, that means EVERY following test in the + suite cascades. The session-scoped sibling + (`_reap_orphaned_subactors`) only kicks in at session + end which is too late to save the cascade. + + Apply at module-level on the topically-problematic + test files via: + + ```python + pytestmark = pytest.mark.usefixtures( + 'reap_subactors_per_test', + ) + ``` + + Or per-test via the same `usefixtures` mark on a + specific function. 
Intentionally NOT autouse so the + fixture's presence on a module signals "this module's + teardown is known-leaky enough to contaminate + siblings"; the visibility helps future-us track down + root causes rather than burying them under blanket + cleanup. + + ''' + import os + parent_pid: int = os.getpid() + yield + from tractor._testing._reap import ( + find_descendants, + reap, + ) + pids: list[int] = find_descendants(parent_pid) + if pids: + reap(pids, grace=3.0) + + @pytest.fixture(scope='session') def debug_mode( request: pytest.FixtureRequest, From 13ccbaff6063cd791522928438e266bb3eaea471 Mon Sep 17 00:00:00 2001 From: goodboy Date: Mon, 27 Apr 2026 23:25:04 -0400 Subject: [PATCH 031/110] Use `trio.fail_after` cap in `test_dynamic_pub_sub` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drop `@pytest.mark.timeout(...)` for the per-test wall-clock cap on `test_dynamic_pub_sub`; rely on `trio.fail_after(12)` inside `main()` instead. Both pytest-timeout enforcement modes are incompatible with trio under fork-based backends: - `method='signal'` (SIGALRM) synchronously raises `Failed` in trio's main thread mid-`epoll.poll()`, leaving `GLOBAL_RUN_CONTEXT` half-installed ("Trio guest run got abandoned") so EVERY subsequent `trio.run()` in the same pytest process bails with `RuntimeError: Attempted to call run() from inside a run()` — full-session poison. - `method='thread'` calls `_thread.interrupt_main()` which can let the KBI escape trio's `KIManager` under fork- cascade teardown races and bubble out of pytest entirely — kills the whole session. `trio.fail_after()` keeps cancellation inside the trio loop: - Raises `TooSlowError` cleanly through the open-nursery's cancel cascade. - Doesn't disturb any out-of-band signal/thread state. - Failure stays scoped to the single test — no cross-test global state corruption either way. 
Verified empirically: 10 hammer-runs of `test_dynamic_pub_sub` go from 5/10 fail (with global-state poison) to 3/10 fail (no poison, all sibling tests still pass). The ~30% remaining flake rate is a genuine fork-cancel-cascade hang — separate from this fix but no longer contaminates. Module-level NOTE comment explains the rationale so future readers don't re-introduce the bug. (this patch was generated in some part by [`claude-code`][claude-code-gh]) [claude-code-gh]: https://github.com/anthropics/claude-code (cherry picked from commit 530160fa6925e9127e48b9e3fbad62d4a1db704f) --- tests/test_advanced_streaming.py | 143 +++++++++++++++++++++++-------- 1 file changed, 106 insertions(+), 37 deletions(-) diff --git a/tests/test_advanced_streaming.py b/tests/test_advanced_streaming.py index 89191a4b7..3d8714cfa 100644 --- a/tests/test_advanced_streaming.py +++ b/tests/test_advanced_streaming.py @@ -5,6 +5,7 @@ from collections import Counter import itertools import platform +from typing import Type import pytest import trio @@ -106,61 +107,120 @@ async def consumer( print(f'{uid} got: {value}') +# NOTE: deliberately NOT using `@pytest.mark.timeout(...)` — +# both pytest-timeout enforcement modes break trio under +# fork-based backends: +# +# - `method='signal'` (SIGALRM): the handler synchronously +# raises `Failed` in trio's main thread mid-`epoll.poll()`, +# leaves `GLOBAL_RUN_CONTEXT` half-installed ("Trio guest +# run got abandoned"), and EVERY subsequent `trio.run()` +# in the same pytest process bails with +# `RuntimeError: Attempted to call run() from inside a +# run()` — session-wide poison. +# +# - `method='thread'`: calls `_thread.interrupt_main()` +# raising `KeyboardInterrupt` into the main thread. Under +# fork-based backends with mid-cascade fd-juggling the KBI +# can escape trio's `KIManager` and bubble out of pytest +# itself — kills the WHOLE session. 
+# +# Instead we use `trio.fail_after()` INSIDE `main()` below: +# trio's own `Cancelled`/`TooSlowError` machinery handles the +# timeout, cleanly unwinds the actor nursery's cancel +# cascade, and only fails the single test (no cross-test +# state corruption either way). +# +# `pyproject.toml`'s default `timeout = 200` is still a +# last-resort safety net. +@pytest.mark.parametrize( + 'expect_cancel_exc', [ + KeyboardInterrupt, + trio.TooSlowError, + ], + ids=lambda item: + f'expect_user_exc_raised={item.__name__}' +) def test_dynamic_pub_sub( reg_addr: tuple, debug_mode: bool, test_log: tractor.log.StackLevelAdapter, + reap_subactors_per_test: int, + expect_cancel_exc: Type[BaseException], ): + failed_to_raise_report: str = ( + f'Never got a {expect_cancel_exc!r} ??' + ) + global _registry from multiprocessing import cpu_count cpus = cpu_count() async def main(): - async with tractor.open_nursery( - registry_addrs=[reg_addr], - debug_mode=debug_mode, - ) as n: + # Hard safety cap via trio's own cancellation — see + # the module-level NOTE on why we avoid `pytest-timeout` + # for this test. Total expected runtime: ~1s spawn + 3s + # sleep + ~1-2s cancel cascade ≈ 5-6s. 12s gives plenty + # of headroom; if exceeded, trio raises `TooSlowError` + # which the outer `try` block treats as a hang report + # (or, if `expect_cancel_exc is trio.TooSlowError`, as + # the test passing — either way, no global state + # corruption). 
+ with trio.fail_after(12): + async with tractor.open_nursery( + registry_addrs=[reg_addr], + debug_mode=debug_mode, + ) as n: - # name of this actor will be same as target func - await n.run_in_actor(publisher) + # name of this actor will be same as target func + await n.run_in_actor(publisher) + + for i, sub in zip( + range(cpus - 2), + itertools.cycle(_registry.keys()) + ): + await n.run_in_actor( + consumer, + name=f'consumer_{sub}', + subs=[sub], + ) - for i, sub in zip( - range(cpus - 2), - itertools.cycle(_registry.keys()) - ): + # make one dynamic subscriber await n.run_in_actor( consumer, - name=f'consumer_{sub}', - subs=[sub], + name='consumer_dynamic', + subs=list(_registry.keys()), ) - # make one dynamic subscriber - await n.run_in_actor( - consumer, - name='consumer_dynamic', - subs=list(_registry.keys()), - ) - - # block until cancelled by user - with trio.fail_after(3): - await trio.sleep_forever() + # block until "cancelled by user" + await trio.sleep(3) + test_log.warning( + f'Raising user cancel exc: ' + f'{expect_cancel_exc!r}' + ) + raise expect_cancel_exc('simulate user cancel!') try: trio.run(main) - except ( - trio.TooSlowError, - ExceptionGroup, - ) as err: - if isinstance(err, ExceptionGroup): - for suberr in err.exceptions: - if isinstance(suberr, trio.TooSlowError): - break - else: - pytest.fail('Never got a `TooSlowError` ?') - - assert err - test_log.exception('Timed out AS EXPECTED') + pytest.fail(failed_to_raise_report) + except expect_cancel_exc: + # parent-side raised the user-cancel exc directly and + # it propagated unwrapped; clean path. + test_log.exception('Got user-cancel exc AS EXPECTED') + except BaseExceptionGroup as err: + # under fork-based backends the user-raised cancel + # can race with subactor-side stream teardown + # (`trio.EndOfChannel` from a publisher's `send()` + # whose remote half got cut). The expected exc may + # then be nested deeper in the group rather than at + # the top level. 
`BaseExceptionGroup.split()` walks + # the exc tree recursively (Python 3.11+). + matched, _ = err.split(expect_cancel_exc) + if matched is None: + pytest.fail(failed_to_raise_report) + + test_log.exception('Got user-cancel exc AS EXPECTED') @tractor.context @@ -292,7 +352,6 @@ async def main(): resp = await stream.receive() assert resp == msg raise KeyboardInterrupt - try: trio.run(main) assert 0, "Didn't receive KBI!?" @@ -362,7 +421,12 @@ async def close_stream_on_sentinel(): print('streamer exited .open_streamer() block') +@pytest.mark.timeout( + 6, + method='signal', +) def test_local_task_fanout_from_stream( + reg_addr: tuple, debug_mode: bool, ): ''' @@ -427,4 +491,9 @@ async def pull_and_count(name: str): await p.cancel_actor() - trio.run(main) + async def w_timeout(): + with trio.fail_after(6): + await main() + + # trio.run(main) + trio.run(w_timeout) From 3954d9f5275edd20693f6988eac14d8ce502d426 Mon Sep 17 00:00:00 2001 From: goodboy Date: Mon, 27 Apr 2026 23:27:19 -0400 Subject: [PATCH 032/110] Return parent `pid: int` from new `reap_subactors_per_test` fixture (cherry picked from commit f8178df0fdc0c93ea484f66cb7483bd6a347cd2c) --- tractor/_testing/pytest.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tractor/_testing/pytest.py b/tractor/_testing/pytest.py index 23c9630ca..5e89bee06 100644 --- a/tractor/_testing/pytest.py +++ b/tractor/_testing/pytest.py @@ -343,7 +343,7 @@ def _reap_orphaned_subactors(): @pytest.fixture -def reap_subactors_per_test(): +def reap_subactors_per_test() -> int: ''' Per-test (function-scoped) zombie-subactor reaper — **opt-in**, NOT autouse. 
@@ -381,7 +381,7 @@ def reap_subactors_per_test(): ''' import os parent_pid: int = os.getpid() - yield + yield parent_pid from tractor._testing._reap import ( find_descendants, reap, From c603a6e1a091e401c6b595afcf2041ded17255d1 Mon Sep 17 00:00:00 2001 From: goodboy Date: Wed, 29 Apr 2026 10:21:56 -0400 Subject: [PATCH 033/110] Backend-aware timeout in `maybe_expect_raises` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Default `timeout` from `int = 3` → `int|None = None`; when unset, pick a backend-aware value. Fork-based backends (`main_thread_forkserver`) need real headroom bc actor spawn + IPC ctx-exit + msg-validation error path is much heavier than under `trio` backend — especially under cross-pytest-stream contention (#451). Defaults: - `main_thread_forkserver` → 30s - everything else → 3s (unchanged) Empirical flake history that motivated 30s as the floor on fork backends (all from `test_basic_payload_spec`): - 3s → all-valid variant flaked w/ `TooSlowError` - 8s → `invalid-return` variant flaked w/ `Cancelled` (surfaced instead of `MsgTypeError` bc the outer `fail_after` fired mid-error-path) - 15s → flaked under cross-pytest-stream contention 30s gives plenty of headroom while still failing-loud on a genuine hang. Callers can opt out by passing an explicit `timeout=` kw. 
(this patch was generated in some part by [`claude-code`][claude-code-gh]) [claude-code-gh]: https://github.com/anthropics/claude-code (cherry picked from commit 060f7d24c42698e00d9ed5d34f936f487b619b67) --- tests/msg/test_pldrx_limiting.py | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/tests/msg/test_pldrx_limiting.py b/tests/msg/test_pldrx_limiting.py index b180dc035..1a8b61176 100644 --- a/tests/msg/test_pldrx_limiting.py +++ b/tests/msg/test_pldrx_limiting.py @@ -55,12 +55,37 @@ async def maybe_expect_raises( raises: BaseException|None = None, ensure_in_message: list[str]|None = None, post_mortem: bool = False, - timeout: int = 3, + # NOTE, `None` selects a backend-aware default below — + # see `_BACKEND_TIMEOUT_DEFAULTS` for rationale. Caller + # can override with an explicit value to opt out. + timeout: int|None = None, ) -> None: ''' Async wrapper for ensuring errors propagate from the inner scope. ''' + if timeout is None: + # Pick a backend-aware default. Fork-based backends + # (`main_thread_forkserver`) need much more headroom + # because actor spawn + IPC ctx-exit + msg-validation + # error path takes longer than under `trio` backend + # — especially under cross-pytest-stream contention + # (#451). `test_basic_payload_spec` empirically: + # - 3s flaked all-valid variant (`TooSlowError`) + # - 8s flaked `invalid-return` variant + # (`Cancelled` surfaced instead of `MsgTypeError` + # because `fail_after` fired mid-error-path) + # - 15s flaked under cross-stream contention + # 30s for fork-based gives plenty of headroom while + # still failing-loud on a genuine hang. Other + # backends keep the original 3s. 
+ from tractor.spawn import _spawn as _spawn_mod + timeout = ( + 30 + if _spawn_mod._spawn_method == 'main_thread_forkserver' + else 3 + ) + if tractor.debug_mode(): timeout += 999 From dacd7a7c57e2a7d18f98d363abc9655e278d8129 Mon Sep 17 00:00:00 2001 From: goodboy Date: Wed, 29 Apr 2026 10:28:48 -0400 Subject: [PATCH 034/110] Backend-aware `fail_after` in pub/sub test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mirror `060f7d24`'s pattern (backend-aware timeout in `maybe_expect_raises`) for `test_dynamic_pub_sub`'s hard `trio.fail_after` cap. Fork-based backends pay per-spawn fork+IPC-handshake cost which stacks over `cpus - 1` sequential `n.run_in_actor()` calls; empirically 12s flakes on `main_thread_forkserver` under UDS cross-pytest contention (#451 / #452). Defaults: - `main_thread_forkserver` → 30s - everything else → 12s (unchanged) Hoist the timeout-pick out of the `main()` closure so the dispatch happens once in the trio task rather than re-evaluating per spawn. (this patch was generated in some part by [`claude-code`][claude-code-gh]) [claude-code-gh]: https://github.com/anthropics/claude-code (cherry picked from commit 383b0fdd7526b3bab9f871cb7b99a7e1a1f6b9ae) --- tests/test_advanced_streaming.py | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/tests/test_advanced_streaming.py b/tests/test_advanced_streaming.py index 3d8714cfa..d4b206f1f 100644 --- a/tests/test_advanced_streaming.py +++ b/tests/test_advanced_streaming.py @@ -157,17 +157,26 @@ def test_dynamic_pub_sub( from multiprocessing import cpu_count cpus = cpu_count() + # Hard safety cap via trio's own cancellation — see the + # module-level NOTE on why we avoid `pytest-timeout` for + # this test. 
Picked backend-aware: under `trio` backend + # spawn is cheap (~1s for `cpus` actors) but fork-based + # backends pay a per-spawn cost (forkserver round-trip + + # IPC peer-handshake) that can stack up over `cpus - 1` + # sequential `n.run_in_actor()` calls — especially on UDS + # under cross-pytest contention (#451 / #452). Empirically + # 12s flakes on `main_thread_forkserver`; 30s gives + # plenty of headroom while still failing-loud on a real + # hang. + from tractor.spawn import _spawn as _spawn_mod + fail_after_s: int = ( + 30 + if _spawn_mod._spawn_method == 'main_thread_forkserver' + else 12 + ) + async def main(): - # Hard safety cap via trio's own cancellation — see - # the module-level NOTE on why we avoid `pytest-timeout` - # for this test. Total expected runtime: ~1s spawn + 3s - # sleep + ~1-2s cancel cascade ≈ 5-6s. 12s gives plenty - # of headroom; if exceeded, trio raises `TooSlowError` - # which the outer `try` block treats as a hang report - # (or, if `expect_cancel_exc is trio.TooSlowError`, as - # the test passing — either way, no global state - # corruption). - with trio.fail_after(12): + with trio.fail_after(fail_after_s): async with tractor.open_nursery( registry_addrs=[reg_addr], debug_mode=debug_mode, From 1d185e4d94a68ffb138131ae0ee6355163205a43 Mon Sep 17 00:00:00 2001 From: goodboy Date: Wed, 29 Apr 2026 10:32:23 -0400 Subject: [PATCH 035/110] Add `--enable-stackscope` pytest plugin flag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New `--enable-stackscope` CLI flag installs a SIGUSR1 → trio-task-tree-dump handler in pytest itself + every spawned subactor for live stack visibility during hang investigations. Lighter than `--tpdb` (no pdb machinery / tty-lock contention) — pure stack-only triage. Plumbing: - `_testing.pytest.pytest_addoption()` adds the flag. 
- `_testing.pytest.pytest_configure()` (when flag set): * exports `TRACTOR_ENABLE_STACKSCOPE=1` so fork-children inherit it via environ, * installs the handler in pytest itself via `enable_stack_on_sig()`. - `runtime._runtime.Actor.async_main()` extends the existing `_debug_mode` gate to ALSO fire when `TRACTOR_ENABLE_STACKSCOPE` is in env — so subactors install the same handler at runtime startup. Capture-bypass tee in `dump_task_tree()`: Pytest's default `--capture=fd` swallows `log.devx()` output, making SIGUSR1 dumps invisible right when you need them. Render the dump once to a `full_dump` str, then unconditionally tee to: - `/tmp/tractor-stackscope-<pid>.log` (append-mode, always written) — guaranteed-readable artifact even under CI / `nohup` / no-tty. `tail -f` to follow. - `/dev/tty` (best-effort) — pytest never captures the tty; ignored if device is missing. Other, - squelch the benign `RuntimeWarning` ("coroutine method 'asend'/'athrow' was never awaited") from `stackscope._glue`'s import-time async-gen type introspection so `--enable-stackscope` setup stays quiet. - log msg in the `_runtime` ImportError branch now mentions `--enable-stackscope` alongside debug-mode.
Usage, pytest --enable-stackscope -k <test-pattern> # in another shell, find the pid + signal: kill -USR1 <pid> # tail the artifact: tail -f /tmp/tractor-stackscope-<pid>.log (this patch was generated in some part by [`claude-code`][claude-code-gh]) [claude-code-gh]: https://github.com/anthropics/claude-code (cherry picked from commit 5418f2dc3c9960fb5036eea508ea32616d19d0d6) --- tractor/_testing/pytest.py | 46 ++++++++++++++++++++++++ tractor/devx/_stackscope.py | 72 +++++++++++++++++++++++++------------ tractor/runtime/_runtime.py | 18 ++++++++-- 3 files changed, 112 insertions(+), 24 deletions(-) diff --git a/tractor/_testing/pytest.py b/tractor/_testing/pytest.py index 5e89bee06..5ccd1ebfa 100644 --- a/tractor/_testing/pytest.py +++ b/tractor/_testing/pytest.py @@ -224,6 +224,21 @@ def pytest_addoption( ), ) + parser.addoption( + "--enable-stackscope", + action="store_true", + dest='tractor_enable_stackscope', + default=False, + help=( + 'Install `stackscope` SIGUSR1 handler in pytest + ' + 'every spawned subactor for live trio task-tree ' + 'dumps during hang investigations. Lighter than ' + '`--tpdb` (no pdb machinery / tty-lock contention) ' + '— use when you only need stack visibility. To ' + 'capture: `kill -USR1 `.' + ), + ) + # provide which IPC transport protocols opting-in test suites # should accumulatively run against. parser.addoption( @@ -264,6 +279,37 @@ def pytest_configure( 'in `ai/conc-anal/subint_sigint_starvation_issue.md`).' ) + # `--enable-stackscope`: install SIGUSR1 → trio task-tree + # dump in pytest itself + propagate to every subactor via + # an env var that fork-children inherit and the runtime + # gate honors. Lighter than `--tpdb` (no pdb machinery) — + # purely for hang-investigation stack visibility. + if getattr( + config.option, 'tractor_enable_stackscope', False + ): + import os + # Env var inherited via fork → subactor's runtime + # picks it up at `Actor.async_main` startup. See the + # gate in `tractor.runtime._runtime` matching this + # var name.
+ os.environ['TRACTOR_ENABLE_STACKSCOPE'] = '1' + + # Install in pytest itself so `kill -USR1 ` + # dumps the parent trio task-tree (which is where + # most Mode-A-class hangs park). + try: + from tractor.devx._stackscope import ( + enable_stack_on_sig, + ) + enable_stack_on_sig() + except ImportError: + import warnings + warnings.warn( + '`stackscope` not installed — ' + '--enable-stackscope is a no-op. ' + 'Install via the `devx` dep group.' + ) + def pytest_collection_modifyitems( config: pytest.Config, diff --git a/tractor/devx/_stackscope.py b/tractor/devx/_stackscope.py index 6a9ecd48c..3992858fa 100644 --- a/tractor/devx/_stackscope.py +++ b/tractor/devx/_stackscope.py @@ -66,7 +66,20 @@ def dump_task_tree() -> None: Do a classic `stackscope.extract()` task-tree dump to console at `.devx()` level. + Also unconditionally tee the rendered tree to two + capture-bypassing sinks so SIGUSR1 dumps remain visible + when the parent process has captured stdio (e.g. pytest's + default `--capture=fd`): + + - `/tmp/tractor-stackscope-.log` (append-mode, always + written) — guaranteed-readable artifact even under CI + / `nohup` / no-tty conditions. `tail -f` to follow. + - `/dev/tty` if a controlling terminal is attached — + best-effort, ignored if the device is missing or write + fails. pytest never captures the tty. 
+ ''' + import os import stackscope tree_str: str = str( stackscope.extract( @@ -96,7 +109,7 @@ def dump_task_tree() -> None: # |_{Supervisor/Scope # |_[Storage/Memory/IPC-Stream/Data-Struct - log.devx( + full_dump: str = ( f'Dumping `stackscope` tree for actor\n' f'(>: {actor.uid!r}\n' f' |_{mp.current_process()}\n' @@ -105,33 +118,35 @@ def dump_task_tree() -> None: f'\n' f'{sigint_handler_report}\n' f'signal.getsignal(SIGINT) -> {current_sigint_handler!r}\n' - # f'\n' - # start-of-trace-tree delimiter (mostly for testing) - # f'------ {actor.uid!r} ------\n' f'\n' f'------ start-of-{actor.uid!r} ------\n' f'|\n' f'{tree_str}' - # end-of-trace-tree delimiter (mostly for testing) f'|\n' f'|_____ end-of-{actor.uid!r} ______\n' ) - # TODO: can remove this right? - # -[ ] was original code from author - # - # print( - # 'DUMPING FROM PRINT\n' - # + - # content - # ) - # import logging - # try: - # with open("/dev/tty", "w") as tty: - # tty.write(tree_str) - # except BaseException: - # logging.getLogger( - # "task_tree" - # ).exception("Error printing task tree") + log.devx(full_dump) + + # NOTE, capture-bypass sinks. Pytest's default + # `--capture=fd` swallows `log.devx()` above; the + # following two writes guarantee the dump reaches the + # human even when stdio is captured. + fpath: str = f'/tmp/tractor-stackscope-{os.getpid()}.log' + try: + with open(fpath, 'a') as f: + f.write(full_dump + '\n') + except OSError: + log.exception( + f'Failed to tee stackscope dump to {fpath!r}' + ) + + try: + with open('/dev/tty', 'w') as tty: + tty.write(full_dump + '\n') + except OSError: + # no controlling tty (CI / nohup / detached) — + # silently fall through; the file sink covers it. 
+ pass _handler_lock = RLock() _tree_dumped: bool = False @@ -233,7 +248,20 @@ def enable_stack_on_sig( ''' try: - import stackscope + # NOTE, `stackscope._glue` does intentional async-gen type + # introspection at import-time which trips + # `RuntimeWarning: coroutine method 'asend'/'athrow' was + # never awaited`. Benign — they only want the wrapper + # type — but visible to users. Squelch the import-only + # warning so SIGUSR1 setup stays quiet. + import warnings + with warnings.catch_warnings(): + warnings.filterwarnings( + 'ignore', + category=RuntimeWarning, + message=r"coroutine method '(asend|athrow)' .* was never awaited", + ) + import stackscope except ImportError: log.warning( 'The `stackscope` lib is not installed!\n' diff --git a/tractor/runtime/_runtime.py b/tractor/runtime/_runtime.py index 9c9cbdea6..da02a7055 100644 --- a/tractor/runtime/_runtime.py +++ b/tractor/runtime/_runtime.py @@ -929,7 +929,20 @@ async def _from_parent( # => update process-wide globals # TODO! -[ ] another `Struct` for rtvs.. rvs: dict[str, Any] = spawnspec._runtime_vars - if rvs['_debug_mode']: + + # `stackscope` SIGUSR1 handler: install when EITHER + # `_debug_mode=True` (full multi-actor pdb support + # path) OR the `TRACTOR_ENABLE_STACKSCOPE` env var + # is set (lighter test-time hang-debug path; see + # `tractor._testing.pytest`'s `--enable-stackscope` + # CLI flag — env var propagates via fork-inherited + # environ). + import os + if ( + rvs['_debug_mode'] + or + os.environ.get('TRACTOR_ENABLE_STACKSCOPE') + ): from ..devx import ( enable_stack_on_sig, maybe_init_greenback, @@ -945,7 +958,8 @@ async def _from_parent( except ImportError: log.warning( - '`stackscope` not installed for use in debug mode!' + '`stackscope` not installed for use in ' + 'debug mode / `--enable-stackscope`!' 
) if rvs.get('use_greenback', False): From c31b85587a72570cd97424b774effe4e5eb6a59e Mon Sep 17 00:00:00 2001 From: goodboy Date: Wed, 29 Apr 2026 12:01:03 -0400 Subject: [PATCH 036/110] Route `stackscope` SIGUSR1 onto trio loop MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signal handlers fire in a non-trio stack frame; calling `stackscope.extract(recurse_child_tasks=True)` from there only walks the `<init>` task and misses everything inside `async_main`'s nurseries — exactly the part you want to see during a hang. Fix: capture `trio.lowlevel.current_trio_token()` at `enable_stack_on_sig()` time and stash it as a module-level `_trio_token`. The SIGUSR1 handler then dispatches the dump *onto* the trio loop via `_trio_token.run_sync_soon(_safe_dump_task_tree)`, so `stackscope.extract` runs from a real trio-task context and walks the full nursery tree. Late-binding: pytest's `pytest_configure` calls `enable_stack_on_sig()` outside any `trio.run`, so token capture there is a `RuntimeError` — left at `None`. The runtime re-calls `enable_stack_on_sig()` from inside `async_main` (subactor side) where the token IS available, so subactors get the full-tree path. `dump_tree_on_sig` falls back to a direct call when `_trio_token is None` (parent process pre-trio.run, or signal delivered after `trio.run` returns). `_safe_dump_task_tree()` is a `run_sync_soon`-friendly wrapper that swallows any exception from `dump_task_tree()` — trio prints + crashes on uncaught exceptions in scheduled callbacks; better to log + keep the run alive so the user can re-trigger. Other, - emit `capture-bypass tee: <fpath>` line + `tail -f` hint in the rendered dump header so users know where to find the artifact even when stdio is captured. - swap the inline `f' |_{actor}'` line for a `pformat.nest_from_op` rendering of `actor_repr` (matches the rest of the runtime's nested-op style). 
- log lines on handler install + already-installed branches now note `(trio_token captured: <bool>)` so it's obvious from the log whether the full-tree path is wired. (this patch was generated in some part by [`claude-code`][claude-code-gh]) [claude-code-gh]: https://github.com/anthropics/claude-code (cherry picked from commit 2d4995e08df6face2fbdf8314bc0af7d454d57ae) --- tractor/devx/_stackscope.py | 86 +++++++++++++++++++++++++++++++------ 1 file changed, 73 insertions(+), 13 deletions(-) diff --git a/tractor/devx/_stackscope.py b/tractor/devx/_stackscope.py index 3992858fa..85f57098c 100644 --- a/tractor/devx/_stackscope.py +++ b/tractor/devx/_stackscope.py @@ -47,7 +47,9 @@ import trio from tractor.runtime import _state from tractor import log as logmod -from tractor.devx import debug +from tractor.devx import ( + debug, +) log = logmod.get_logger() @@ -109,16 +111,29 @@ def dump_task_tree() -> None: # |_{Supervisor/Scope # |_[Storage/Memory/IPC-Stream/Data-Struct + fpath: str = f'/tmp/tractor-stackscope-{os.getpid()}.log' + from . import pformat + actor_repr: str = pformat.nest_from_op( + input_op='|_', + text=f'{actor}', + nest_prefix='|_', + nest_indent=3, + ) full_dump: str = ( f'Dumping `stackscope` tree for actor\n' f'(>: {actor.uid!r}\n' f' |_{mp.current_process()}\n' f' |_{thr}\n' - f' |_{actor}\n' + # TODO, use the nest_from_op + f'{actor_repr}' + # f' |_{actor}' f'\n' f'{sigint_handler_report}\n' f'signal.getsignal(SIGINT) -> {current_sigint_handler!r}\n' f'\n' + f'capture-bypass tee: {fpath}\n' + f'(`tail -f {fpath}` to follow across signals)\n' + f'\n' f'------ start-of-{actor.uid!r} ------\n' f'|\n' f'{tree_str}' @@ -131,7 +146,6 @@ def dump_task_tree() -> None: # `--capture=fd` swallows `log.devx()` above; the # following two writes guarantee the dump reaches the # human even when stdio is captured. 
- fpath: str = f'/tmp/tractor-stackscope-{os.getpid()}.log' try: with open(fpath, 'a') as f: f.write(full_dump + '\n') @@ -151,6 +165,34 @@ def dump_task_tree() -> None: _handler_lock = RLock() _tree_dumped: bool = False +# Captured at `enable_stack_on_sig()` time when running +# inside a trio task. `dump_tree_on_sig` uses this to +# schedule `dump_task_tree` ON the trio loop via +# `token.run_sync_soon` so stackscope sees a real current +# task and can recurse into nursery children. Without +# it (signal handler running in a non-trio stack frame), +# `stackscope.extract` only walks the `` task and +# misses everything inside `async_main`'s nurseries. +_trio_token: trio.lowlevel.TrioToken|None = None + + +def _safe_dump_task_tree() -> None: + ''' + `run_sync_soon`-friendly wrapper that swallows any + exception from `dump_task_tree`. Trio prints + + crashes on uncaught exceptions in scheduled + callbacks; we'd rather log + keep the test running so + the user can re-trigger the dump. + + ''' + try: + dump_task_tree() + except BaseException: + log.exception( + '`dump_task_tree()` raised (scheduled via ' + '`run_sync_soon`); continuing.\n' + ) + def dump_tree_on_sig( sig: int, @@ -174,16 +216,17 @@ def dump_tree_on_sig( 'Trying to dump `stackscope` tree..\n' ) try: - dump_task_tree() - # await actor._service_n.start_soon( - # partial( - # trio.to_thread.run_sync, - # dump_task_tree, - # ) - # ) - # trio.lowlevel.current_trio_token().run_sync_soon( - # dump_task_tree - # ) + # Prefer scheduling on the trio loop — runs the + # dump from a real trio-task context so + # `stackscope.extract(recurse_child_tasks=True)` + # walks every nursery child instead of seeing + # only the `` task. Falls back to a direct + # call when no token was captured (e.g. signal + # delivered outside a trio.run). 
+ if _trio_token is not None: + _trio_token.run_sync_soon(_safe_dump_task_tree) + else: + dump_task_tree() except RuntimeError: log.exception( @@ -269,11 +312,27 @@ def enable_stack_on_sig( ) return None + # Capture the trio token if we're inside `trio.run` + # so SIGUSR1 dispatches the dump *onto* the trio loop + # (full task-tree visibility). When called outside trio + # (e.g. from `pytest_configure`), token capture fails + # silently and `dump_tree_on_sig` falls back to the + # direct-call path. + global _trio_token + try: + _trio_token = trio.lowlevel.current_trio_token() + except RuntimeError: + # not in a `trio.run` — leave None; runtime can + # re-call `enable_stack_on_sig()` later from + # inside `async_main` to capture it. + _trio_token = None + handler: Callable|int = getsignal(sig) if handler is dump_tree_on_sig: log.devx( 'A `SIGUSR1` handler already exists?\n' f'|_ {handler!r}\n' + f'(trio_token captured: {_trio_token is not None})\n' ) return @@ -287,5 +346,6 @@ def enable_stack_on_sig( f'{stackscope!r}\n\n' f'With `SIGUSR1` handler\n' f'|_{dump_tree_on_sig}\n' + f'(trio_token captured: {_trio_token is not None})\n' ) return stackscope From 950b6bc6b8375bae3889ec1f87a7a85dca59c193 Mon Sep 17 00:00:00 2001 From: goodboy Date: Wed, 29 Apr 2026 12:49:36 -0400 Subject: [PATCH 037/110] Add todo for running `test_debugger` suite on forkserver spawner (cherry picked from commit 2917b74ba4dbdee2bfda5d3a28bb5b76a10ff0bc) --- tests/devx/conftest.py | 36 +++++++++++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/tests/devx/conftest.py b/tests/devx/conftest.py index 747c859d7..f7d336eb0 100644 --- a/tests/devx/conftest.py +++ b/tests/devx/conftest.py @@ -65,9 +65,18 @@ def spawn( run an `./examples/..` script by name. ''' - if start_method != 'trio': + supported_spawners: set[str] = { + 'trio', + # ?TODO, other spawners that will work? + # - [ ] need to pass `start_method={spawner}` to underlying + # `examples/debugging/