diff --git a/scripts/coding_discovery_tools/ai_tools_discovery.py b/scripts/coding_discovery_tools/ai_tools_discovery.py index 91d3dee..e7c6a4c 100644 --- a/scripts/coding_discovery_tools/ai_tools_discovery.py +++ b/scripts/coding_discovery_tools/ai_tools_discovery.py @@ -65,7 +65,7 @@ CursorSkillsExtractorFactory, ClineSkillsExtractorFactory, ) - from .utils import send_report_to_backend, send_scan_event, send_discovery_metrics, get_user_info, get_all_users_macos, get_all_users_windows, get_all_users_linux, load_pending_reports, save_failed_reports, report_to_sentry, get_claude_subscription_type, get_cursor_subscription_type, QUEUE_FILE + from .utils import send_report_to_backend, send_scan_event, send_discovery_metrics, get_user_info, get_all_users_macos, get_all_users_windows, get_all_users_linux, load_pending_reports, save_failed_reports, report_to_sentry, get_claude_subscription_type, get_cursor_subscription_type, in_container, QUEUE_FILE from .linux_extraction_helpers import linux_home_for_user from .logging_helpers import configure_logger, log_rules_details, log_mcp_details, log_settings_details from .settings_transformers import transform_settings_to_backend_format @@ -119,7 +119,7 @@ CursorSkillsExtractorFactory, ClineSkillsExtractorFactory, ) - from scripts.coding_discovery_tools.utils import send_report_to_backend, send_scan_event, send_discovery_metrics, get_user_info, get_all_users_macos, get_all_users_windows, get_all_users_linux, load_pending_reports, save_failed_reports, report_to_sentry, get_claude_subscription_type, get_cursor_subscription_type, QUEUE_FILE + from scripts.coding_discovery_tools.utils import send_report_to_backend, send_scan_event, send_discovery_metrics, get_user_info, get_all_users_macos, get_all_users_windows, get_all_users_linux, load_pending_reports, save_failed_reports, report_to_sentry, get_claude_subscription_type, get_cursor_subscription_type, in_container, QUEUE_FILE from scripts.coding_discovery_tools.linux_extraction_helpers import linux_home_for_user from scripts.coding_discovery_tools.logging_helpers import configure_logger, log_rules_details, log_mcp_details, log_settings_details from scripts.coding_discovery_tools.settings_transformers import transform_settings_to_backend_format @@ -2134,6 +2134,7 @@ def generate_single_tool_report(self, tool: Dict, device_id: str, home_user: str "home_user": home_user, "system_user": system_user or home_user, "device_id": device_id, + "is_container": in_container(), "tools": [tool_for_report] } diff --git a/scripts/coding_discovery_tools/linux/device_id.py b/scripts/coding_discovery_tools/linux/device_id.py index 25bdcb3..7dd2642 100644 --- a/scripts/coding_discovery_tools/linux/device_id.py +++ b/scripts/coding_discovery_tools/linux/device_id.py @@ -1,10 +1,13 @@ """Device ID extraction for Linux.""" import logging +import os +import tempfile +import uuid from pathlib import Path +from .. import cache from ..coding_tool_base import BaseDeviceIdExtractor -from ..utils import get_hostname logger = logging.getLogger(__name__) @@ -13,6 +16,8 @@ Path("/var/lib/dbus/machine-id"), # older dbus fallback ] +_DEVICE_ID_FILENAME = "device-id" + class LinuxDeviceIdExtractor(BaseDeviceIdExtractor): """Device ID extractor for Linux systems.""" @@ -20,7 +25,13 @@ class LinuxDeviceIdExtractor(BaseDeviceIdExtractor): def extract_device_id(self) -> str: """ Return the machine-id from /etc/machine-id (systemd standard). - Falls back to hostname if the file is absent or unreadable. + + When no machine-id is available (common in containers, which often have + an empty/absent /etc/machine-id), fall back to a UUID persisted in the + home-user's ``~/.unbound/`` directory. A restarted container that mounts + a persistent ``~/.unbound`` (the primary ``unbound login`` flow) then + keeps a single stable device row instead of exploding into one row per + launch — which an ephemeral hostname fallback would produce. """ for path in _MACHINE_ID_PATHS: try: @@ -31,4 +42,69 @@ def extract_device_id(self) -> str: except Exception as e: logger.debug(f"Could not read {path}: {e}") - return get_hostname() + return self._persisted_device_id() + + @staticmethod + def _persisted_device_id() -> str: + """Read (or generate-and-persist) a stable UUID under ``~/.unbound/``. + + Reuses the repo's canonical state-dir resolver (``cache._ensure_state_dir``) + rather than a bare ``Path.home()`` so we honour the writable-fallback chain + and land next to the API key written by ``unbound login``. If the state dir + cannot be resolved or the write fails, return an unpersisted uuid4 — no + worse than the previous ephemeral hostname behaviour, and never raises. + """ + try: + if not cache._ensure_state_dir(): + logger.warning("No usable state dir for device-id; using ephemeral uuid") + return str(uuid.uuid4()) + + device_id_path = cache.UNBOUND_DIR / _DEVICE_ID_FILENAME + + try: + if device_id_path.exists() and device_id_path.is_file(): + existing = device_id_path.read_text(encoding="utf-8").strip() + if existing: + # Validate the persisted value is a well-formed UUID. A + # truncated/partial write (pre-atomic-write), a manual edit, + # or another tool clobbering the file can leave a non-UUID + # string here; returning it would create a backend device + # row that no valid UUID can ever match. Treat corrupt as + # absent and regenerate. + try: + uuid.UUID(existing) + return existing + except ValueError: + logger.warning( + f"Corrupt (non-UUID) persisted device-id at " + f"{device_id_path!s}; regenerating" + ) + except Exception as e: + logger.warning(f"Could not read persisted device-id at {device_id_path!s}: {e}") + + new_id = str(uuid.uuid4()) + try: + # Atomic write: a SIGKILL/OOM/power-loss mid-write must never leave + # a partial UUID on disk. Write to a temp file in the same dir then + # os.replace() (atomic rename on the same filesystem), mirroring + # cache.atomic_write_cache(). + cache.UNBOUND_DIR.mkdir(parents=True, exist_ok=True) + fd, tmp = tempfile.mkstemp( + prefix=".device-id.", suffix=".tmp", dir=str(cache.UNBOUND_DIR) + ) + try: + with os.fdopen(fd, "w", encoding="utf-8") as f: + f.write(new_id) + os.replace(tmp, str(device_id_path)) + finally: + if os.path.exists(tmp): + try: + os.unlink(tmp) + except OSError: + pass + except Exception as e: + logger.warning(f"Could not persist device-id at {device_id_path!s}: {e}") + return new_id + except Exception as e: + logger.warning(f"device-id fallback failed: {e}") + return str(uuid.uuid4()) diff --git a/scripts/coding_discovery_tools/utils.py b/scripts/coding_discovery_tools/utils.py index 8b5d9d5..7b09014 100644 --- a/scripts/coding_discovery_tools/utils.py +++ b/scripts/coding_discovery_tools/utils.py @@ -2,6 +2,7 @@ Utility functions shared across the AI tools discovery system """ +import functools import json import logging import os @@ -101,6 +102,47 @@ def get_hostname() -> str: return platform.node() +@functools.lru_cache(maxsize=1) +def in_container() -> bool: + """Best-effort detection of whether we're running inside a container. + + Combines several signals because no single one is reliable across runtimes + and kernels: + - ``/.dockerenv`` / ``/run/.containerenv`` — Docker / Podman runtime markers. + - root filesystem mounted as ``overlay`` — cgroup-version-agnostic. + - ``/proc/1/cgroup`` docker/lxc/kube markers — cgroup v1 ONLY (v2 shows + ``0::/`` from inside, so this is a fallback, not the primary check). + + This is for honest behavioural branching, not security — every marker here + is forgeable by whoever controls the container. Result is cached for the + process lifetime. + """ + try: + if os.path.exists("/.dockerenv") or os.path.exists("/run/.containerenv"): + return True + except OSError: + pass + + try: + with open("/proc/mounts", encoding="utf-8") as f: + for line in f: + parts = line.split() + if len(parts) >= 3 and parts[1] == "/" and parts[2] == "overlay": + return True + except OSError: + pass + + try: + with open("/proc/1/cgroup", encoding="utf-8") as f: + blob = f.read() + if any(marker in blob for marker in ("/docker", "/lxc", "kubepods", "/containerd")): + return True + except OSError: + pass + + return False + + class DsclBatchData(NamedTuple): uid_map: Dict[str, int] shell_map: Dict[str, str] diff --git a/tests/test_container_attribution.py b/tests/test_container_attribution.py new file mode 100644 index 0000000..97050c5 --- /dev/null +++ b/tests/test_container_attribution.py @@ -0,0 +1,245 @@ +"""Tests for Approach B coding-discovery-tool changes. + +Covers: + - ``is_container`` top-level field in the report payload built by + ``generate_single_tool_report``. + - The persisted-UUID device_id fallback in ``LinuxDeviceIdExtractor`` when + /etc/machine-id is empty/absent (read existing, else generate+write). + - The ``in_container()`` helper salvaged into utils.py. + +Per CLAUDE.md: tests mock filesystem reads/writes; ``in_container`` is +lru_cached so we clear its cache around container-state toggles. +""" + +import tempfile +import unittest +import uuid +from pathlib import Path +from unittest.mock import patch + +from scripts.coding_discovery_tools.ai_tools_discovery import AIToolsDetector +import scripts.coding_discovery_tools.ai_tools_discovery as ai_tools_discovery +import scripts.coding_discovery_tools.utils as utils_mod +from scripts.coding_discovery_tools.utils import in_container + + +class TestIsContainerInReport(unittest.TestCase): + """``is_container`` is always present as a top-level report field.""" + + def _make_report(self): + detector = AIToolsDetector() + tool = {"name": "dummy-tool", "version": "1.0", "_internal": "hidden"} + return detector.generate_single_tool_report( + tool, device_id="DEV-123", home_user="testuser" + ) + + def test_is_container_true(self): + with patch.object(ai_tools_discovery, "in_container", return_value=True): + report = self._make_report() + self.assertIn("is_container", report) + self.assertIs(report["is_container"], True) + + def test_is_container_false(self): + with patch.object(ai_tools_discovery, "in_container", return_value=False): + report = self._make_report() + self.assertIn("is_container", report) + self.assertIs(report["is_container"], False) + + def test_is_container_is_top_level_not_in_tool(self): + """The flag lives in the report dict, not inside the tool payload.""" + with patch.object(ai_tools_discovery, "in_container", return_value=True): + report = self._make_report() + self.assertIn("is_container", report) + self.assertEqual(len(report["tools"]), 1) + self.assertNotIn("is_container", report["tools"][0]) + # Internal keys (leading underscore) are still filtered out. + self.assertNotIn("_internal", report["tools"][0]) + + def test_home_user_not_namespaced(self): + """Approach B: plain home_user, no PR #165 container namespacing.""" + with patch.object(ai_tools_discovery, "in_container", return_value=True): + report = self._make_report() + self.assertEqual(report["home_user"], "testuser") + + +class TestInContainerHelper(unittest.TestCase): + """The salvaged in_container() helper. lru_cached -> clear around toggles.""" + + def setUp(self): + in_container.cache_clear() + self.addCleanup(in_container.cache_clear) + + def test_detects_dockerenv(self): + def fake_exists(p): + return p == "/.dockerenv" + + with patch.object(utils_mod.os.path, "exists", side_effect=fake_exists): + self.assertTrue(in_container()) + + def test_no_markers_returns_false(self): + with patch.object(utils_mod.os.path, "exists", return_value=False), \ + patch("builtins.open", side_effect=OSError("no proc")): + self.assertFalse(in_container()) + + def test_lru_cache_clear_resets_state(self): + # Without cache_clear() the first result would stick and lie. + with patch.object(utils_mod.os.path, "exists", return_value=False), \ + patch("builtins.open", side_effect=OSError): + self.assertFalse(in_container()) + in_container.cache_clear() + with patch.object(utils_mod.os.path, "exists", + side_effect=lambda p: p == "/run/.containerenv"): + self.assertTrue(in_container()) + + +class TestLinuxDeviceIdFallback(unittest.TestCase): + """Persisted-UUID device_id fallback when machine-id is empty/absent. + + These tests use a REAL temp dir (patched onto ``cache.UNBOUND_DIR``) so that + the atomic ``tempfile.mkstemp`` + ``os.replace`` write path is exercised for + real rather than mocked away. + """ + + def setUp(self): + from scripts.coding_discovery_tools.linux.device_id import LinuxDeviceIdExtractor + self.extractor = LinuxDeviceIdExtractor() + self._tmp = tempfile.TemporaryDirectory() + self.addCleanup(self._tmp.cleanup) + self.state_dir = Path(self._tmp.name) / ".unbound" + self.state_dir.mkdir(parents=True, exist_ok=True) + self.device_id_path = self.state_dir / "device-id" + + def test_machine_id_takes_precedence(self): + """Existing behavior preserved: machine-id wins, no UUID fallback.""" + from scripts.coding_discovery_tools.linux import device_id as did_mod + + class FakeMachineIdPath: + def exists(self): + return True + + def is_file(self): + return True + + def read_text(self, encoding="utf-8"): + return " abc123machineid \n" + + with patch.object(did_mod, "_MACHINE_ID_PATHS", [FakeMachineIdPath()]): + self.assertEqual(self.extractor.extract_device_id(), "abc123machineid") + + def test_fallback_generates_and_persists_uuid(self): + """First call with empty machine-id -> generate uuid4 and write it. + + Reads the file back off the real fs to prove the atomic write landed. + """ + from scripts.coding_discovery_tools.linux import device_id as did_mod + + fixed = "11111111-2222-3333-4444-555555555555" + + with patch.object(did_mod, "_MACHINE_ID_PATHS", []), \ + patch.object(did_mod.cache, "_ensure_state_dir", return_value=True), \ + patch.object(did_mod.cache, "UNBOUND_DIR", self.state_dir), \ + patch.object(did_mod.uuid, "uuid4", return_value=uuid.UUID(fixed)): + result = self.extractor.extract_device_id() + + self.assertEqual(result, fixed) + self.assertTrue(self.device_id_path.exists()) + self.assertEqual(self.device_id_path.read_text(encoding="utf-8"), fixed) + # No leftover temp files from the atomic write. + leftovers = [p for p in self.state_dir.iterdir() if p.name != "device-id"] + self.assertEqual(leftovers, []) + + def test_fallback_reads_existing_uuid(self): + """Second/restarted run -> read the persisted uuid, do NOT regenerate.""" + from scripts.coding_discovery_tools.linux import device_id as did_mod + + persisted = "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee" + self.device_id_path.write_text(f" {persisted}\n", encoding="utf-8") + + # uuid4 would raise if called — proving we read instead of generate. + with patch.object(did_mod, "_MACHINE_ID_PATHS", []), \ + patch.object(did_mod.cache, "_ensure_state_dir", return_value=True), \ + patch.object(did_mod.cache, "UNBOUND_DIR", self.state_dir), \ + patch.object(did_mod.uuid, "uuid4", + side_effect=AssertionError("uuid4 should not be called")): + result = self.extractor.extract_device_id() + + self.assertEqual(result, persisted) + # File untouched (still has the original whitespace-padded content). + self.assertEqual(self.device_id_path.read_text(encoding="utf-8"), + f" {persisted}\n") + + def test_fallback_corrupt_partial_uuid_regenerates(self): + """A truncated/partial (non-UUID) persisted value is treated as absent. + + Simulates a pre-atomic-write partial write or a manual edit. The extractor + must reject it, mint a fresh valid UUID, and atomically overwrite the file. + """ + from scripts.coding_discovery_tools.linux import device_id as did_mod + + # Partial UUID left by an interrupted write. + self.device_id_path.write_text("11111111-222", encoding="utf-8") + fixed = "33333333-4444-5555-6666-777777777777" + + with patch.object(did_mod, "_MACHINE_ID_PATHS", []), \ + patch.object(did_mod.cache, "_ensure_state_dir", return_value=True), \ + patch.object(did_mod.cache, "UNBOUND_DIR", self.state_dir), \ + patch.object(did_mod.uuid, "uuid4", return_value=uuid.UUID(fixed)): + result = self.extractor.extract_device_id() + + # Regenerated, and the corrupt value is gone — replaced by a valid UUID. + self.assertEqual(result, fixed) + uuid.UUID(result) # parses -> well-formed + on_disk = self.device_id_path.read_text(encoding="utf-8") + self.assertEqual(on_disk, fixed) + uuid.UUID(on_disk) # the persisted value is now valid + + def test_fallback_garbage_non_uuid_regenerates(self): + """An arbitrary non-UUID string (manual edit / other tool) is rejected.""" + from scripts.coding_discovery_tools.linux import device_id as did_mod + + self.device_id_path.write_text("not-a-uuid-at-all", encoding="utf-8") + fixed = "44444444-5555-6666-7777-888888888888" + + with patch.object(did_mod, "_MACHINE_ID_PATHS", []), \ + patch.object(did_mod.cache, "_ensure_state_dir", return_value=True), \ + patch.object(did_mod.cache, "UNBOUND_DIR", self.state_dir), \ + patch.object(did_mod.uuid, "uuid4", return_value=uuid.UUID(fixed)): + result = self.extractor.extract_device_id() + + self.assertEqual(result, fixed) + self.assertEqual(self.device_id_path.read_text(encoding="utf-8"), fixed) + + def test_fallback_unwritable_state_dir_returns_uuid_unpersisted(self): + """No usable state dir -> return an (unpersisted) uuid, never raise.""" + from scripts.coding_discovery_tools.linux import device_id as did_mod + + fixed = "99999999-8888-7777-6666-555555555555" + with patch.object(did_mod, "_MACHINE_ID_PATHS", []), \ + patch.object(did_mod.cache, "_ensure_state_dir", return_value=False), \ + patch.object(did_mod.uuid, "uuid4", return_value=uuid.UUID(fixed)): + result = self.extractor.extract_device_id() + self.assertEqual(result, fixed) + + def test_fallback_write_failure_still_returns_uuid(self): + """A write error must not crash; the freshly-minted uuid is returned. + + Patches the atomic-write primitive (mkstemp) to raise. + """ + from scripts.coding_discovery_tools.linux import device_id as did_mod + + fixed = "12121212-3434-5656-7878-909090909090" + + with patch.object(did_mod, "_MACHINE_ID_PATHS", []), \ + patch.object(did_mod.cache, "_ensure_state_dir", return_value=True), \ + patch.object(did_mod.cache, "UNBOUND_DIR", self.state_dir), \ + patch.object(did_mod.tempfile, "mkstemp", + side_effect=OSError("read-only fs")), \ + patch.object(did_mod.uuid, "uuid4", return_value=uuid.UUID(fixed)): + result = self.extractor.extract_device_id() + self.assertEqual(result, fixed) + # Write failed -> nothing persisted. + self.assertFalse(self.device_id_path.exists()) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_copilot_cli_discovery.py b/tests/test_copilot_cli_discovery.py index 51f2ff6..53e78fa 100644 --- a/tests/test_copilot_cli_discovery.py +++ b/tests/test_copilot_cli_discovery.py @@ -251,7 +251,6 @@ def test_session_store_db_marker_detected(self): copilot_dir = self._make_copilot_dir() (copilot_dir / "session-store.db").write_text("", encoding="utf-8") self.assertIsNotNone(self.detector.detect()) - def test_real_install_with_strong_and_shared_markers_detected(self): """A real install (strong markers config.json + session-store.db) that ALSO has shared skills/ is detected — shared markers never veto a strong one."""