From 27b66bc9736cfa0bf2a8e29552868d73d21f01bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?thesqrldev=F0=9F=90=BF=EF=B8=8F?= Date: Fri, 29 May 2026 05:46:00 +0900 Subject: [PATCH 1/9] feat(human-sim): research-grade realism upgrade for agent_helpers.py Rewrite the human-behavior-simulation layer (agent-workspace/) with correctness and detection-realism fixes from a multi-lens review. - typing: default semantic mode now emits correct virtual-key codes (_vk_for_char) and a non-zero key hold; no longer routes through the core press_key (which emitted 0ms holds + VK_NUMPAD codes for letters) - tremor: exact OU discretization, dt tied to the real per-event interval, amplitude re-calibrated to ~0.8px (inside the 0.3-1.2px human band), anisotropic 2:1 axes - motion: asymmetric ballistic velocity (Beta(2,3)) replacing symmetric smoothstep; Fitts' Law movement time (optional target width); overshoot + correction on long moves - scroll: cursor anchored before wheel; discrete detent multiples (wheel) - idle: bounded cursor drift during human_wait (anchored, <=~15px) - click: <=1px release micro-drift (clamped in-viewport); teleport invariant preserved (press == final move) - session: cursor/click-bias/tremor-orientation persist across -c calls via a per-BU_NAME atomic state file - hardening: underscore-private config tables/class (no namespace leak), narrowed _viewport except, physical-typing dd>=hold constraint - docs: STALE banners on the two design/review drafts; add HUMAN_SIM_VALIDATION.md as the authoritative validation artifact Known ceilings documented in-module (not fixable in this layer): event rate ~20-40Hz (per-call IPC), getCoalescedEvents().length==1 / no PointerEvent stream, CDP-presence detectability. Adds tests/unit/test_human_behavior.py (17 hermetic tests, no browser). Reviewed in a separate lane (APPROVE; 0 regressions). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../HUMAN_BEHAVIOR_SIMULATION_REPORT.md | 517 +++++++++++++ agent-workspace/HUMAN_SIM_VALIDATION.md | 82 +++ .../IMPLEMENTATION_REVIEW_FOR_GEMINI.md | 467 ++++++++++++ agent-workspace/agent_helpers.py | 688 +++++++++++++++++- tests/unit/test_human_behavior.py | 309 ++++++++ 5 files changed, 2059 insertions(+), 4 deletions(-) create mode 100644 agent-workspace/HUMAN_BEHAVIOR_SIMULATION_REPORT.md create mode 100644 agent-workspace/HUMAN_SIM_VALIDATION.md create mode 100644 agent-workspace/IMPLEMENTATION_REVIEW_FOR_GEMINI.md create mode 100644 tests/unit/test_human_behavior.py diff --git a/agent-workspace/HUMAN_BEHAVIOR_SIMULATION_REPORT.md b/agent-workspace/HUMAN_BEHAVIOR_SIMULATION_REPORT.md new file mode 100644 index 00000000..018ff0cf --- /dev/null +++ b/agent-workspace/HUMAN_BEHAVIOR_SIMULATION_REPORT.md @@ -0,0 +1,517 @@ +# Human Behavior Simulation Layer for browser-harness + +> ⚠️ **STALE — PRE-IMPLEMENTATION DESIGN PROPOSAL (2026-05-28).** Superseded by the shipped code. +> This doc proposes a **numpy** implementation; the shipped layer is **pure stdlib**. OU parameters, +> the tremor envelope, target-offset model, cursor init, and the typing virtual-key path all changed +> after this was written. Do NOT treat its pseudocode or §3.3 "Validation metrics" as ground truth. +> **Authoritative now:** `agent-workspace/agent_helpers.py` + `agent-workspace/HUMAN_SIM_VALIDATION.md`. + +**Date:** 2026-05-28 +**Purpose:** External LLM review of proposed implementation strategy +**Reviewer context:** This report is self-contained. No prior knowledge of the codebase is assumed. + +--- + +## 1. System Overview + +### 1.1 What is browser-harness? + +A Python CLI tool that controls the user's **real, running Chrome browser** via CDP (Chrome DevTools Protocol). Unlike Puppeteer/Playwright/Selenium which launch a new browser instance, browser-harness connects to Chrome started with `--remote-debugging-port=9222`. 
+ +**Key architectural constraint:** browser-harness does NOT launch Chrome. It connects to an existing instance. This means: +- No Chrome launch flags are controlled by the tool +- The browser's TLS fingerprint, GPU, fonts, plugins, cookies, sessions are all **real user data** +- `navigator.webdriver` is `false` (not `true` as with ChromeDriver) +- No `window.cdc_*` ChromeDriver artifacts exist + +### 1.2 Current Code Architecture + +``` +browser-harness/ +├── src/browser_harness/ +│ ├── helpers.py # 493 lines — all browser control primitives +│ ├── daemon.py # CDP WebSocket daemon +│ ├── admin.py # Daemon lifecycle (start/stop/ensure) +│ ├── run.py # CLI entry point +│ └── _ipc.py # IPC between CLI and daemon +├── agent-workspace/ +│ ├── agent_helpers.py # 7 lines — EMPTY, designated for extensions +│ └── domain-skills/ # 90+ site-specific playbooks +└── interaction-skills/ # 16 files — browser mechanics (dialogs, tabs, etc.) +``` + +**Extension point:** `agent_helpers.py` is auto-loaded by `helpers.py` at import time. Any function defined there becomes available as a top-level helper in browser-harness scripts. This is the designated location for the proposed human simulation layer. 
+ +### 1.3 Current Input Primitives (from helpers.py) + +```python +# Mouse click — fires mousePressed + mouseReleased immediately, no preceding mouseMoved +def click_at_xy(x, y, button="left", clicks=1): + cdp("Input.dispatchMouseEvent", type="mousePressed", x=x, y=y, button=button, clickCount=clicks) + cdp("Input.dispatchMouseEvent", type="mouseReleased", x=x, y=y, button=button, clickCount=clicks) + +# Text input — bulk insertion, no per-character events +def type_text(text): + cdp("Input.insertText", text=text) + +# Scroll — single mouseWheel event, no easing or physics +def scroll(x, y, dy=-300, dx=0): + cdp("Input.dispatchMouseEvent", type="mouseWheel", x=x, y=y, deltaX=dx, deltaY=dy) + +# Key press — immediate keyDown/char/keyUp sequence +def press_key(key, modifiers=0): + # ... dispatches 2-3 CDP events with zero delay between them + +# Wait — fixed delay, no randomization +def wait(seconds=1.0): + time.sleep(seconds) +``` + +**Summary:** All current primitives are mechanically instantaneous with zero human-like characteristics. No randomization, no trajectories, no timing variance. + +--- + +## 2. Research Findings + +### 2.1 Threat Model: What Bot Detectors Actually Measure + +Modern bot detection operates on two layers: + +| Layer | Signals | browser-harness Status | +|-------|---------|----------------------| +| **Static fingerprint** | navigator.webdriver, WebGL renderer, screen geometry, canvas hash, font enumeration, TLS fingerprint (JA3), plugins, speech voices, hardware concurrency | **All clean.** Real Chrome = real fingerprint. 14 detection vectors neutralized automatically. | +| **Behavioral fingerprint** | Mouse trajectories, keystroke timing, scroll physics, click precision, session patterns, event timestamp regularity | **Fully exposed.** Zero simulation. This is the only remaining attack surface. 
| + +**Key insight from research:** Commercial anti-detect browsers (Multilogin $99/mo, GoLogin $49/mo, Kameleo €59/mo) spend enormous engineering effort on C++ Chromium forks to achieve what browser-harness gets for free (real fingerprints). But they also include behavioral simulation that browser-harness lacks entirely. + +### 2.2 Bot Detection Signal Priority (Shen et al. 2021, ACM Computing Surveys) + +Ranked by detection importance: + +1. **Event timestamp regularity** — #1 signal. Perfect 16.67ms intervals = immediate flag +2. **Velocity profile** — constant-speed movement is detectable +3. **Trajectory linearity** — straight-line mouse paths are detectable +4. **Micro-jitter absence** — humans exhibit 0.3-1.2px RMS hand tremor +5. **Click dwell time** — zero-ms or integer-ms mousePressed→mouseReleased is a flag +6. **Pre-click hover absence** — humans pause 80-200ms before clicking +7. **Overshoot + correction** — 10-15% of long-distance moves show this +8. **Fitts' Law violation** — movement time must scale with distance/target size + +### 2.3 Ensemble Detection Warning + +Modern systems (Cloudflare Bot Management, DataDome, Akamai, HUMAN/PerimeterX) use 200-2000+ signal ensembles with ML classifiers. 
**Fixing one signal while leaving others at default makes the session MORE suspicious, not less.** The signals must be temporally and causally consistent: + +- Mouse movement must precede every click (can't teleport) +- Scroll events must correlate with viewport focus +- Form completion time must scale with field length +- All simulated signals must share a single timeline + +### 2.4 Existing Field Data (from 90+ domain-skills) + +browser-harness already has site-specific anti-bot knowledge: + +| Site | Detection Stack | Required Wait | Threshold | +|------|----------------|---------------|-----------| +| Glassdoor | Cloudflare Bot Mgmt | `wait(5)` post-load | ~5 pages/min | +| G2 | DataDome 5.6.1 | `wait(5)` post-load | 100 req/s API | +| eBay | PerimeterX | `wait(3)` between pages | 5-10 rapid req | +| Facebook | Account-level | `≥2s` between scrolls | Behavioral | +| Booking.com | AWS WAF | `wait(5)` on challenge | Crypto PoW | +| Walmart | PerimeterX | Bare `Mozilla/5.0` UA | UA-sensitive | +| 10+ sites | None | None | No detection | + +--- + +## 3. Proposed Implementation + +### 3.1 Design Principles + +1. **Layer on top, don't modify core.** All code goes in `agent_helpers.py` (auto-loaded by helpers.py). Core primitives remain untouched for backward compatibility. +2. **Opt-in, not mandatory.** New `human_*` prefixed functions. Existing `click_at_xy()` stays instant for speed-critical operations. +3. **Statistically grounded.** All distributions and parameters from peer-reviewed research with specific citations. +4. **Ensemble-consistent.** All behavioral signals share a single random seed and temporal model. +5. **numpy-only dependency.** No exotic packages. numpy is already commonly available. + +### 3.2 Proposed API + +```python +# Mouse movement + click (replaces click_at_xy for stealth scenarios) +human_click(x, y, button="left") + # 1. Generate Bezier trajectory from current cursor position to (x, y) + # 2. 
Dispatch ~100-150 mouseMoved events along trajectory + # 3. Pre-click hover pause (80-200ms) + # 4. mousePressed with position jitter (σ=2-4px from target) + # 5. Click dwell (log-normal, μ=85ms) + # 6. mouseReleased with 0.5px drift + +# Mouse movement without click (for hover actions) +human_move(x, y) + # Bezier trajectory only, no click + +# Human-like typing (replaces type_text for stealth scenarios) +human_type(text, profile="skilled") + # Per-character press_key with log-normal inter-key delays + # Profiles: hunt_peck (36 WPM), average (72), skilled (100), expert (140) + # Optional error injection (1-3% rate with backspace correction) + +# Human-like scrolling +human_scroll(x, y, distance, direction="down") + # Multiple mouseWheel events with log-normal deltas + # Reading pauses injected at 12% probability + # Trackpad-style inertia deceleration + +# Randomized wait (replaces wait() for stealth scenarios) +human_wait(base_seconds=1.0) + # Log-normal distribution around base_seconds + +# Session-level human simulation +human_session_start() + # Initialize cursor position tracking + # Set up OU process for idle drift + # Configure timing model + +# Composite: navigate like a human +human_navigate(url) + # goto_url(url) + wait_for_load() + human_wait(2-5s reading time) +``` + +### 3.3 Algorithm Details + +#### 3.3.1 Mouse Trajectory: Cubic Bezier + Smoothstep + OU Noise + +**Why this combination:** +- Cubic Bezier alone produces ~6-8° mean angle change (human target: 8.2° per Ahmed & Traore 2011) +- WindMouse produces 54° (6.6x too jagged) — rejected +- Catmull-Rom spline produces 0.46° (18x too smooth) — rejected +- OU noise adds micro-tremor bringing total to ~7-10° — within human range + +**Algorithm:** + +```python +import numpy as np + +def _bezier_trajectory(start, end, num_points=120): + """Generate human-like mouse trajectory using cubic Bezier + OU noise.""" + sx, sy = start + ex, ey = end + dist = np.hypot(ex - sx, ey - sy) + + # Control points: 
offset perpendicular to straight line + # Arc magnitude ~9% of distance with Gaussian variance + mid_x, mid_y = (sx + ex) / 2, (sy + ey) / 2 + dx, dy = ex - sx, ey - sy + perp_x, perp_y = -dy, dx # perpendicular vector + norm = np.hypot(perp_x, perp_y) or 1 + perp_x, perp_y = perp_x / norm, perp_y / norm + + arc1 = dist * np.random.normal(0.09, 0.04) + arc2 = dist * np.random.normal(0.09, 0.04) + + cp1 = (sx + dx * 0.3 + perp_x * arc1, sy + dy * 0.3 + perp_y * arc1) + cp2 = (sx + dx * 0.7 + perp_x * arc2, sy + dy * 0.7 + perp_y * arc2) + + # Bezier evaluation with smoothstep time easing + t_linear = np.linspace(0, 1, num_points) + t = t_linear * t_linear * (3 - 2 * t_linear) # smoothstep + + mt = 1 - t + points_x = mt**3 * sx + 3 * mt**2 * t * cp1[0] + 3 * mt * t**2 * cp2[0] + t**3 * ex + points_y = mt**3 * sy + 3 * mt**2 * t * cp1[1] + 3 * mt * t**2 * cp2[1] + t**3 * ey + + # OU noise (micro-tremor), scaled down at endpoints + theta, sigma, dt = 0.7, 0.5, 1/60 + noise_x, noise_y = np.zeros(num_points), np.zeros(num_points) + for i in range(1, num_points): + noise_x[i] = noise_x[i-1] + theta * (0 - noise_x[i-1]) * dt + sigma * np.sqrt(dt) * np.random.randn() + noise_y[i] = noise_y[i-1] + theta * (0 - noise_y[i-1]) * dt + sigma * np.sqrt(dt) * np.random.randn() + + # Scale jitter down at endpoints (stable start/end) + jitter_scale = 1.0 - np.abs(2 * t_linear - 1) + points_x += noise_x * jitter_scale + points_y += noise_y * jitter_scale + + return list(zip(points_x, points_y)) +``` + +**Validation metrics:** +- Path deviation from straight line: ~7% (human range: 2-15%) +- Mean angle change: ~7-10° (human empirical: 8.2°) +- Velocity profile: bell-shaped (smoothstep) +- Micro-jitter RMS: ~0.28px (human range: 0.3-1.2px) + +#### 3.3.2 Event Timing + +**Critical: #1 detection signal.** + +```python +def _human_delay(base_ms, sigma_ms=3.0): + """Add Gaussian jitter to avoid timestamp regularity.""" + jitter = np.random.normal(0, sigma_ms) + delay = max(1, base_ms + 
jitter) / 1000.0 + time.sleep(delay) + +# Between mouseMoved events: 16.67ms ± 3-5ms Gaussian +# Pre-click hover: uniform(80, 200) ms +# Click dwell: log-normal(μ=log(85), σ=0.28) ms ≈ 50-150ms +# Post-click pause: uniform(50, 150) ms +``` + +#### 3.3.3 Keystroke Dynamics (CMU Keystroke Dataset, Killourhy & Maxion 2009) + +```python +TYPING_PROFILES = { + "hunt_peck": {"dd_mean": 335, "dd_std": 182, "hold_mean": 95, "hold_std": 30}, + "average": {"dd_mean": 166, "dd_std": 62, "hold_mean": 79, "hold_std": 22}, + "skilled": {"dd_mean": 120, "dd_std": 34, "hold_mean": 75, "hold_std": 18}, + "expert": {"dd_mean": 86, "dd_std": 18, "hold_mean": 65, "hold_std": 12}, +} + +def human_type(text, profile="skilled"): + """Type text with human-like inter-key timing.""" + p = TYPING_PROFILES[profile] + for i, ch in enumerate(text): + # Log-normal inter-key delay + if i > 0: + dd = np.random.lognormal( + mean=np.log(p["dd_mean"]) - 0.5 * (p["dd_std"]/p["dd_mean"])**2, + sigma=p["dd_std"] / p["dd_mean"] + ) + time.sleep(max(20, dd) / 1000.0) + + # Key down + press_key(ch) + # Note: press_key already dispatches keyDown + char + keyUp + # For deeper realism, could split into separate keyDown/keyUp + # with log-normal hold time, but current implementation is adequate + # for most detection systems. +``` + +**Parameters source:** CMU Keystroke Dynamics Benchmark (51 subjects, peer-reviewed DSN 2009). The `skilled` profile (DD mean=120ms, std=34ms, ~100 WPM) is recommended as default — closely matches the CMU empirical average (118ms, 42ms). 
+ +#### 3.3.4 Scroll Simulation + +```python +def human_scroll(x, y, distance=3000, direction="down"): + """Scroll with human-like physics.""" + sign = -1 if direction == "down" else 1 + scrolled = 0 + while scrolled < distance: + # Log-normal scroll delta + delta = np.random.lognormal(mean=np.log(167), sigma=0.4) + delta = min(delta, distance - scrolled) + + cdp("Input.dispatchMouseEvent", type="mouseWheel", + x=x, y=y, deltaX=0, deltaY=sign * delta) + + scrolled += delta + + # Reading pause (12% probability) + if np.random.random() < 0.12: + pause = np.random.uniform(0.8, 3.0) + time.sleep(pause) + else: + # Normal inter-scroll delay + delay = np.random.lognormal(mean=np.log(0.101), sigma=0.3) + time.sleep(max(0.03, delay)) +``` + +#### 3.3.5 Human Click (Full Sequence) + +```python +_cursor_pos = [0, 0] # Track current cursor position + +def human_click(x, y, button="left"): + """Move cursor to target via Bezier trajectory, then click with human timing.""" + global _cursor_pos + + # 1. Generate trajectory + trajectory = _bezier_trajectory(_cursor_pos, (x, y)) + + # 2. Dispatch mouseMoved events along trajectory + for px, py in trajectory: + cdp("Input.dispatchMouseEvent", type="mouseMoved", x=px, y=py) + _human_delay(16.67, sigma_ms=3.0) # ~60fps with jitter + + # 3. Pre-click hover + time.sleep(np.random.uniform(0.08, 0.20)) + + # 4. Click with position jitter + click_x = x + np.random.normal(0, 2.5) + click_y = y + np.random.normal(0, 2.0) + + cdp("Input.dispatchMouseEvent", type="mousePressed", + x=click_x, y=click_y, button=button, clickCount=1) + + # 5. Click dwell (log-normal) + dwell = np.random.lognormal(mean=np.log(85), sigma=0.28) / 1000.0 + time.sleep(max(0.03, dwell)) + + # 6. Release with slight drift + release_x = click_x + np.random.normal(0, 0.5) + release_y = click_y + np.random.normal(0, 0.5) + + cdp("Input.dispatchMouseEvent", type="mouseReleased", + x=release_x, y=release_y, button=button, clickCount=1) + + # 7. 
Update cursor position + _cursor_pos = [release_x, release_y] +``` + +--- + +## 4. Open Questions for Review + +### 4.1 Algorithm Selection + +**Q1:** Is cubic Bezier + smoothstep + OU noise the right combination? Or should we consider: +- B-spline with randomly placed knots for more trajectory variety? +- Separate ballistic phase (fast) + corrective phase (slow near target) per Fitts' Law? +- Completely different approach like recorded human trajectory replay from a dataset? + +**Q2:** The OU process parameters (θ=0.7, σ=0.5) produce RMS 0.28px, which is slightly below the human lower bound of 0.3px. Should we increase σ to 0.6-0.8 for better coverage of the human distribution? + +### 4.2 Architecture Decisions + +**Q3:** Should cursor position tracking (`_cursor_pos`) be: +- A global variable (simple, current proposal)? +- A class instance (`HumanSession`) that also tracks session state? +- Stored in the CDP daemon for cross-script persistence? + +**Q4:** Should we implement overshoot-and-correction (documented as occurring in 10-15% of long-distance moves >400px)? It adds complexity but addresses a known detection signal. If yes, what algorithm? + +**Q5:** The `human_type()` function currently uses `press_key()` which dispatches keyDown+char+keyUp instantly. For deeper realism, should we split these into separate events with per-key hold time (dwell)? The CMU dataset provides hold time distributions (mean=79ms, std=22ms). This would triple the CDP calls per character. + +### 4.3 Timing Model + +**Q6:** The current proposal uses `time.sleep()` for inter-event delays. Python's `time.sleep()` has ~1ms granularity on most systems but can be up to ~15ms on some platforms. Is this sufficient for event timing jitter, or should we use a busy-wait loop for sub-millisecond precision? + +**Q7:** Should we implement a global "fatigue model" where typing speed gradually decreases (2-5%) over long sessions? 
Academic literature (CMU dataset) shows this effect but it may be over-engineering. + +### 4.4 Ensemble Consistency + +**Q8:** The current design has independent random generators for each behavioral dimension (mouse, keyboard, scroll). Should these be correlated? For example: +- Faster mouse movement → faster typing (same "user energy level") +- Longer reading pauses → slower scroll speed +- Time-of-day affecting all timing parameters + +**Q9:** Should we inject synthetic `visibilitychange`/`blur`/`focus` events to simulate tab switching? Research shows humans switch tabs every 2-10 minutes. This would require JS injection via `Page.addScriptToEvaluateOnNewDocument`, which browser-harness currently avoids. + +### 4.5 Validation Strategy + +**Q10:** How should we validate the implementation? Options: +- Run against creepjs.com / fingerprintjs.com and compare scores +- Test against Cloudflare Bot Management on a known-protected site (Glassdoor) +- Statistical analysis: compare generated trajectory metrics against Ahmed & Traore 2011 dataset +- A/B test: same task with `click_at_xy()` vs `human_click()` on DataDome-protected sites + +### 4.6 Performance Trade-offs + +**Q11:** `human_click()` takes ~2 seconds (120 mouseMoved events × 16.67ms + hover + dwell) vs `click_at_xy()` at ~50ms. For scripts that perform 100+ clicks, this is 200s vs 5s. Should we implement an adaptive mode that: +- Uses `human_click()` for first N interactions (establishing behavioral baseline) +- Gradually reduces trajectory points for subsequent clicks +- Falls back to `click_at_xy()` for off-screen/background operations + +**Q12:** Should `num_points` in the trajectory scale with distance (Fitts' Law: longer distance → more points → longer duration)? Current fixed 120 points means short moves take disproportionately long. + +### 4.7 Dependency Policy + +**Q13:** numpy is proposed as the sole dependency. 
Alternatives: +- **Pure Python (math + random only):** No dependency, but 5-10x slower for trajectory generation. Acceptable since we sleep between events anyway? +- **numpy:** Fast, convenient, widely available. But adds a dependency to a tool that currently has zero Python dependencies beyond stdlib. +- **scipy:** Adds CubicSpline, stats distributions. Overkill? + +### 4.8 Scope and Phasing + +**Q14:** Should we implement all features at once, or phase: +- **Phase 1:** `human_wait()` + timing jitter only (addresses #1 detection signal, minimal code) +- **Phase 2:** `human_click()` with Bezier trajectory (addresses #2-4 signals) +- **Phase 3:** `human_type()` + `human_scroll()` (full behavioral stack) + +Or is there a reason to ship everything together (ensemble consistency argument)? + +--- + +## 5. Competitive Positioning + +### 5.1 After Implementation + +| Tool | Fingerprint Layer | Behavioral Layer | Cost | Dependency | +|------|-------------------|-----------------|------|------------| +| **browser-harness + proposed** | Real Chrome (best possible) | Research-grade simulation | Free | numpy | +| Multilogin | C++ Chromium fork | Built-in | $99/month | Custom binary | +| GoLogin | Orbita Chromium fork | Built-in | $49/month | Custom binary | +| Browser Use (cloud) | Stock Playwright | LLM-emergent behavior | API pricing | Cloud service | +| puppeteer-stealth | JS patches (leaky) | ghost-cursor (partial) | Free | Node.js | +| Bright Data Scraping Browser | Managed Chromium | Server-side | $13.50/GB | Cloud service | + +### 5.2 Unique Advantages of This Approach + +1. **Fingerprint authenticity:** No other tool achieves this without a custom browser fork. browser-harness uses THE USER'S ACTUAL BROWSER with their actual history, cookies, extensions, and hardware. + +2. **Research-grounded behavioral simulation:** Most commercial tools use ad-hoc randomization. 
The proposed implementation uses peer-reviewed parameters (CMU Keystroke Dataset, Ahmed & Traore 2011, Shen et al. 2021). + +3. **Zero cost, single-file addition.** The entire implementation goes into one Python file (`agent_helpers.py`) that is auto-loaded by the existing architecture. + +4. **AI agent integration.** browser-harness is designed for AI agent control (Claude, GPT, etc.). Human simulation makes AI-driven browser sessions indistinguishable from human ones. + +--- + +## 6. Existing Code Context + +### 6.1 How agent_helpers.py is loaded (from helpers.py:478-493) + +```python +def _load_agent_helpers(): + p = AGENT_WORKSPACE / "agent_helpers.py" + if not p.exists(): + return + spec = importlib.util.spec_from_file_location("browser_harness_agent_helpers", p) + if not spec or not spec.loader: + return + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + for name, value in vars(module).items(): + if name.startswith("_"): + continue + globals()[name] = value + +_load_agent_helpers() +``` + +**Implication:** Any public function (not starting with `_`) defined in `agent_helpers.py` becomes a top-level import from `browser_harness.helpers`. So `human_click(x, y)` would be callable directly in browser-harness scripts just like `click_at_xy(x, y)`. + +### 6.2 CDP primitives available (used by proposed code) + +```python +cdp("Input.dispatchMouseEvent", type="mouseMoved|mousePressed|mouseReleased|mouseWheel", x=..., y=..., button=..., clickCount=..., deltaX=..., deltaY=...) +cdp("Input.dispatchKeyEvent", type="keyDown|char|keyUp", key=..., code=..., text=..., modifiers=..., windowsVirtualKeyCode=..., nativeVirtualKeyCode=...) +cdp("Input.insertText", text=...) +cdp("Page.addScriptToEvaluateOnNewDocument", source=...) # available but currently unused +cdp("Runtime.evaluate", expression=..., returnByValue=True, awaitPromise=True) +``` + +### 6.3 Design constraints (from SKILL.md) + +- Core helpers stay short. 
 Task-specific additions go in `agent_helpers.py`. +- Don't add a manager layer. No retries framework, session manager, daemon supervisor, config system, or logging framework. +- Prefer compositor-level actions over framework hacks. + +--- + +## 7. Summary of Review Request + +Please evaluate: + +1. **Algorithm correctness:** Are the proposed algorithms (Bezier + smoothstep + OU, log-normal timing, CMU keystroke model) the best choices? What alternatives should be considered? + +2. **Parameter calibration:** Are the statistical parameters well-chosen and properly sourced? Any concerning gaps between proposed values and known human distributions? + +3. **Architecture fit:** Does the proposed single-file, opt-in, `human_*` prefix approach fit well with the existing codebase? Any anti-patterns? + +4. **Completeness:** Are there critical behavioral signals not addressed by this proposal? + +5. **Risk assessment:** What are the most likely failure modes? Which bot detection systems would this approach fail against, and why? + +6. **Implementation priority:** What should be built first for maximum impact with minimum code? + +7. **Answers to Q1-Q14** above, with reasoning. diff --git a/agent-workspace/HUMAN_SIM_VALIDATION.md b/agent-workspace/HUMAN_SIM_VALIDATION.md new file mode 100644 index 00000000..b5ebf536 --- /dev/null +++ b/agent-workspace/HUMAN_SIM_VALIDATION.md @@ -0,0 +1,82 @@ +# Human Behavior Simulation — Validation (shipped code) + +**Date:** 2026-05-29 +**Status:** Shipped & validated. This is the authoritative validation artifact for +`agent-workspace/agent_helpers.py`. `HUMAN_BEHAVIOR_SIMULATION_REPORT.md` and `IMPLEMENTATION_REVIEW_FOR_GEMINI.md` are +**stale design/review drafts** kept for history only — numbers there do not reflect the code. + +browser-harness connects to the user's real running Chrome via CDP, so the static fingerprint +is genuinely the user's own. 
This layer addresses the residual **behavioral** surface for +**ethical-use-only** UI automation reliability (own accounts / authorized targets / ToS-respecting). + +--- + +## How to run the tests + +```bash +python3 tests/unit/test_human_behavior.py # 17/17, hermetic (no browser/daemon) +``` + +The suite injects a fake `browser_harness.helpers` (capturing every CDP call) so the module's +load contract and dispatch invariants are exercised without a live browser. + +## Validated results (against the SHIPPED parameters) + +| Property | Result | Target / source | Status | +|---|---|---|---| +| `_lognormal` mean/std recovery (incl. std>mean) | within ~1-2% over 60k draws | requested mean/std | PASS | +| OU tremor **stationary std** | **1.001** (req 1.0) | exact discretization, no Euler bias | PASS | +| OU **lag-1 autocorrelation** | **0.7455** | `exp(-dt/τ)` = 0.7470 at dt=35ms, τ=0.12s | PASS | +| Tremor **per-axis RMS** | **0.795 px** | human hand-tremor band 0.3–1.2 px | PASS | +| Tremor anisotropy | 2:1 axes, session-fixed rotation | structured (not isotropic noise) | PASS | +| Ballistic easing | velocity peaks at ~t=0.33, monotonic | Meyer/Woodworth 2-component | PASS | +| Fitts' Law MT (paced, W=80) | D=50→164ms, 200→297, 800→495, 1600→607 | log law, not linear | PASS | +| Realized per-step turning angle | **5.58°** | Ahmed & Traore ~8.2° (see calibration note) | TRADE | +| Bezier endpoint | exactly == target | teleport-fix invariant | PASS | +| `human_click` invariant | mousePressed == final mouseMoved (int) | no teleport-on-click | PASS | +| `_vk_for_char('a')` | (65, 'KeyA', 'a') | NOT ord('a')=97=VK_NUMPAD1 | PASS | +| Integer coords to CDP | every x/y/deltaY is int | plausible MouseEvent.clientX | PASS | +| Wheel deltas are detent multiples | 5000 seeds / 48810 events / **0 non-detent** | discrete wheel notch | PASS | +| Idle drift bound | ≤25px from anchor over a 10s wait | bounded wander (not random walk) | PASS | +| Release micro-drift | ≤1px, clamped 
in-viewport | finger shift during hold | PASS | + +## Calibration decision (amplitude vs curvature) + +The cited curvature (8.2°/step, Ahmed & Traore 2011) and the cited tremor amplitude band +(0.3–1.2px RMS, signal #4) **cannot both be satisfied by one constant tremor σ**. The earlier +draft chose σ=12 → ~2.19px std, which hit the angle but **exceeded the amplitude band** (a direct +contradiction of its own signal #4). The shipped code **prioritizes amplitude**: tremor std is set +to land at ~0.795px (inside the human band), with the realized per-step angle falling to ~5.58°. +Rationale: micro-jitter RMS is a directly-measured detector signal; the 8.2° figure is an asserted +aggregate. Both metrics now sit in a plausible region rather than one being wildly off. + +## Known ceilings — NOT fixable in this layer (documented honestly in the module docstring) + +1. **Event rate 20–40Hz** — each CDP call is a per-call IPC round-trip; real pointing devices emit + 60–1000Hz. A rate-binning detector flags this regardless of trajectory shape. Fixing it requires + daemon-side vector batching (core change, out of scope for `agent-workspace/`). +2. **`getCoalescedEvents().length == 1`** and no PointerEvent pressure/tilt — `Input.dispatchMouseEvent` + emits MouseEvents only; pointer-fidelity detectors see synthetic input. +3. **CDP / remote-debugging presence** is detectable independent of behavior (anti-debugger probes). + +Net: this layer lowers per-action risk against heuristic/weak-ML detectors and buys time; it does +**not** make a session indistinguishable to a top-tier 200–2000-signal ensemble. + +## Backward compatibility + +Core primitives (`click_at_xy`, `type_text`, `scroll`, `press_key`, `wait`) are untouched. The +`human_*` verbs are additive and opt-in. 
New optional kwargs (`human_click(..., width=)`, +`human_wait(..., drift=)`, `human_session(..., fresh=)`) default to backward-compatible behavior, +except `human_wait` now emits bounded idle drift by default (`drift=True`); pass `drift=False` to +restore a dead sleep. + +## Public API + +`human_session(pacing="paced", fresh=False)` · `human_navigate(url)` · `human_move(x, y, width=None)` · +`human_click(x, y, button="left", width=None)` · `human_type(text, profile="skilled", mode="semantic")` · +`human_scroll(x, y, distance=3000, direction="down", device="trackpad")` · `human_wait(base=1.0, drift=True)` + +Config tables (`_PACING`, `_TYPING_PROFILES`) and `_HumanSession` are underscore-private (not exported +into the core helper namespace). Session state (cursor / click-bias / tremor-orientation) persists +across separate `browser-harness -c '...'` invocations via a per-`BU_NAME` state file (atomic write, +TTL `_SESSION_TTL`=600s). diff --git a/agent-workspace/IMPLEMENTATION_REVIEW_FOR_GEMINI.md b/agent-workspace/IMPLEMENTATION_REVIEW_FOR_GEMINI.md new file mode 100644 index 00000000..7f70e7f6 --- /dev/null +++ b/agent-workspace/IMPLEMENTATION_REVIEW_FOR_GEMINI.md @@ -0,0 +1,467 @@ +# Human Behavior Simulation — Implementation Review + +> ⚠️ **STALE — describes a 348-line draft that no longer exists.** The shipped code is ~560 lines and +> diverges on: OU tremor (Euler θ=0.7/σ=0.5 → **exact discretization**, dt tied to the real per-event +> interval, std re-calibrated to **0.795px**, anisotropic), envelope (triangle → **sine**), target-offset +> (per-click bias → **session-level** bias), cursor init ([0,0] → **lazy random**), typing (default +> semantic now uses `_vk_for_char` + non-zero hold; no longer routes through the buggy `press_key`), +> velocity (smoothstep → **ballistic Beta(2,3)**), plus **Fitts' Law MT**, **overshoot**, **scroll detents**, +> **idle drift**, **cross-`-c` session persistence**, and namespace hardening. 
+> Its §4 "Validated Test Results" table reflects the abandoned σ=0.6 OU (RMS 0.451px) — **NOT** shipped +> behavior. Review questions A2 (sine envelope), A3 (device param), R1 (persistence), C2 (`_vk_for_char`) +> are already resolved in code. +> **Authoritative now:** `agent-workspace/agent_helpers.py` + `agent-workspace/HUMAN_SIM_VALIDATION.md`. + +**Date:** 2026-05-28 +**Reviewer:** Gemini Deep Think +**Context:** This is a completed implementation. We seek a code-level review, not a design review. A prior design review by GPT 5.5 Pro Extended Thinking was already incorporated. + +--- + +## 1. What This Is + +A single-file Python module (`agent_helpers.py`, 348 lines) that adds human-like behavioral simulation to **browser-harness**, a CDP-based browser automation tool. browser-harness connects to the user's real running Chrome via `--remote-debugging-port`, giving it perfect static fingerprints (real GPU, fonts, cookies, TLS). This module addresses the remaining behavioral detection surface. + +## 2. Architecture + +``` +browser-harness/ +├── src/browser_harness/ +│ └── helpers.py # Core CDP primitives (click_at_xy, type_text, scroll, etc.) +│ # Auto-loads agent_helpers.py at import time via: +│ # _load_agent_helpers() → importlib → exports public names +├── agent-workspace/ +│ └── agent_helpers.py # THIS FILE — human simulation layer +``` + +**Loading mechanism** (`helpers.py:478-493`): +```python +def _load_agent_helpers(): + p = AGENT_WORKSPACE / "agent_helpers.py" + spec = importlib.util.spec_from_file_location("browser_harness_agent_helpers", p) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + for name, value in vars(module).items(): + if name.startswith("_"): + continue + globals()[name] = value # exports public names into helpers namespace +``` + +So `human_click(x, y)` becomes callable just like `click_at_xy(x, y)` in browser-harness scripts. + +## 3. 
The Complete Implementation + +```python +"""Human behavior simulation for browser-harness. + +Adds human-like timing, mouse trajectories, typing, and scrolling +on top of CDP primitives for UI automation reliability. + +Usage: + human_session("paced") # configure session (optional, defaults to "paced") + human_click(500, 300) # Bezier trajectory + click with timing + human_type("hello", mode="semantic") # per-character with inter-key delays + human_scroll(600, 400, 2000) # scroll with reading pauses + human_wait(2.0) # log-normal randomized wait +""" + +import math, random, time + +from browser_harness.helpers import cdp, press_key + + +# --------------------------------------------------------------------------- +# Pacing profiles — policy-based, not adaptive +# --------------------------------------------------------------------------- + +PACING = { + "fast": { + "move_speed": 0.3, + "hover_range": (0.02, 0.05), + "dwell_mean": 45, + "type_speed": 1.5, + "scroll_speed": 1.5, + "wait_mult": 0.5, + "event_jitter_ms": 1.5, + }, + "paced": { + "move_speed": 1.0, + "hover_range": (0.08, 0.20), + "dwell_mean": 85, + "type_speed": 1.0, + "scroll_speed": 1.0, + "wait_mult": 1.0, + "event_jitter_ms": 3.0, + }, + "physical": { + "move_speed": 1.2, + "hover_range": (0.10, 0.25), + "dwell_mean": 95, + "type_speed": 0.8, + "scroll_speed": 0.8, + "wait_mult": 1.3, + "event_jitter_ms": 4.0, + }, +} + +# --------------------------------------------------------------------------- +# Typing profiles — CMU Keystroke Dataset (Killourhy & Maxion, DSN 2009) +# dd = down-down interval, hold = key hold duration (ms) +# --------------------------------------------------------------------------- + +TYPING_PROFILES = { + "hunt_peck": {"dd_mean": 335, "dd_std": 182, "hold_mean": 95, "hold_std": 30}, + "average": {"dd_mean": 166, "dd_std": 62, "hold_mean": 79, "hold_std": 22}, + "skilled": {"dd_mean": 120, "dd_std": 34, "hold_mean": 75, "hold_std": 18}, + "expert": {"dd_mean": 86, "dd_std": 
18, "hold_mean": 65, "hold_std": 12}, +} + + +# --------------------------------------------------------------------------- +# Session +# --------------------------------------------------------------------------- + +class HumanSession: + """Tracks cursor position and pacing configuration across interactions.""" + + def __init__(self, pacing="paced"): + self.cursor = [0.0, 0.0] + self.pacing = pacing + self.profile = PACING[pacing] + + def set_pacing(self, pacing): + self.pacing = pacing + self.profile = PACING[pacing] + + +_session = None + + +def _s(): + global _session + if _session is None: + _session = HumanSession() + return _session + + +# --------------------------------------------------------------------------- +# Math helpers (pure Python — no numpy required) +# --------------------------------------------------------------------------- + +def _lognormal(mean, std): + """Sample from log-normal given desired mean and std in natural units.""" + if mean <= 0: + return max(0.001, mean) + variance = std ** 2 + mu = math.log(mean ** 2 / math.sqrt(variance + mean ** 2)) + sigma = math.sqrt(math.log(1 + variance / mean ** 2)) + return random.lognormvariate(mu, sigma) + + +def _smoothstep(t): + return t * t * (3.0 - 2.0 * t) + + +def _ou_noise(n, theta=0.7, sigma=0.5, dt=1.0 / 60): + """Ornstein-Uhlenbeck process for micro-tremor simulation.""" + vals = [0.0] + for _ in range(n - 1): + prev = vals[-1] + vals.append(prev + theta * (0 - prev) * dt + sigma * math.sqrt(dt) * random.gauss(0, 1)) + return vals + + +def _distance_points(dist): + """Scale trajectory point count with distance (Fitts' Law).""" + return max(20, min(200, int(dist * 0.08))) + + +def _bezier_trajectory(start, end): + """Cubic Bezier + smoothstep easing + OU micro-jitter + path noise. + + Produces curvature ~7-10 deg/step (human empirical: 8.2 deg, Ahmed & Traore 2011). 
+ """ + sx, sy = start + ex, ey = end + dist = math.hypot(ex - sx, ey - sy) + + if dist < 2: + return [(ex, ey)] + + n = _distance_points(dist) + + dx, dy = ex - sx, ey - sy + norm = dist or 1 + perp_x, perp_y = -dy / norm, dx / norm + + arc1 = dist * random.gauss(0.09, 0.04) + arc2 = dist * random.gauss(0.09, 0.04) + + cp1 = (sx + dx * 0.3 + perp_x * arc1, sy + dy * 0.3 + perp_y * arc1) + cp2 = (sx + dx * 0.7 + perp_x * arc2, sy + dy * 0.7 + perp_y * arc2) + + noise_x = _ou_noise(n, theta=0.7, sigma=0.6) + noise_y = _ou_noise(n, theta=0.7, sigma=0.6) + + step_px = dist / max(1, n - 1) + path_noise_sigma = step_px * 0.14 + + points = [] + for i in range(n): + t_lin = i / max(1, n - 1) + t = _smoothstep(t_lin) + mt = 1.0 - t + + px = mt**3 * sx + 3 * mt**2 * t * cp1[0] + 3 * mt * t**2 * cp2[0] + t**3 * ex + py = mt**3 * sy + 3 * mt**2 * t * cp1[1] + 3 * mt * t**2 * cp2[1] + t**3 * ey + + jitter_scale = 1.0 - abs(2 * t_lin - 1) + px += noise_x[i] * jitter_scale + py += noise_y[i] * jitter_scale + px += random.gauss(0, path_noise_sigma) * jitter_scale + py += random.gauss(0, path_noise_sigma) * jitter_scale + + points.append((px, py)) + + return points + + +def _target_offset(x, y): + """Target-acquisition uncertainty: slight offset biased down-right (hand anatomy).""" + return ( + x + random.gauss(1.5, 2.5), + y + random.gauss(1.0, 2.0), + ) + + +# --------------------------------------------------------------------------- +# Timing helpers +# --------------------------------------------------------------------------- + +def _jittered_sleep(base_ms, sigma_ms=None): + if sigma_ms is None: + sigma_ms = _s().profile["event_jitter_ms"] + delay = max(8, base_ms + random.gauss(0, sigma_ms)) / 1000.0 + time.sleep(delay) + + +# --------------------------------------------------------------------------- +# Public API +# --------------------------------------------------------------------------- + +def human_session(pacing="paced"): + """Configure or reconfigure the session 
pacing. + + Modes: + fast — internal/admin tools, speed over realism + paced — ordinary UI automation (default) + physical — keyboard/mouse event testing, most deliberate + """ + global _session + _session = HumanSession(pacing) + return _session + + +def human_wait(base=1.0): + """Log-normal randomized wait around base seconds.""" + p = _s().profile + actual = _lognormal(base, base * 0.3) * p["wait_mult"] + time.sleep(max(0.05, actual)) + + +def human_move(x, y): + """Move cursor to (x, y) via Bezier trajectory with OU micro-jitter.""" + s = _s() + trajectory = _bezier_trajectory(s.cursor, (x, y)) + speed = s.profile["move_speed"] + base_interval = max(25, 16.67 * speed) + + for px, py in trajectory: + cdp("Input.dispatchMouseEvent", type="mouseMoved", x=px, y=py) + _jittered_sleep(base_interval) + + s.cursor = [x, y] + + +def human_click(x, y, button="left"): + """Move cursor via Bezier trajectory, then click with human timing. + + Full event sequence: mouseMoved (trajectory) -> hover pause -> + mousePressed (with position jitter) -> dwell -> mouseReleased (with drift). + """ + s = _s() + p = s.profile + + human_move(x, y) + + hover_lo, hover_hi = p["hover_range"] + time.sleep(random.uniform(hover_lo, hover_hi)) + + cx, cy = _target_offset(x, y) + + cdp("Input.dispatchMouseEvent", + type="mousePressed", x=cx, y=cy, button=button, clickCount=1) + + dwell = _lognormal(p["dwell_mean"], p["dwell_mean"] * 0.28) / 1000.0 + time.sleep(max(0.03, dwell)) + + rx = cx + random.gauss(0, 0.5) + ry = cy + random.gauss(0, 0.5) + cdp("Input.dispatchMouseEvent", + type="mouseReleased", x=rx, y=ry, button=button, clickCount=1) + + s.cursor = [rx, ry] + + +def human_type(text, profile="skilled", mode="semantic"): + """Type text with human-like inter-key timing. + + Modes: + semantic — press_key per character with log-normal inter-key delays. + Sufficient for most form filling and UI interaction. + physical — separate keyDown/keyUp with per-key hold (dwell) time. 
+ Use when testing actual keyboard event handling. + + Profiles: hunt_peck (~36 WPM), average (~72), skilled (~100), expert (~140). + """ + tp = TYPING_PROFILES[profile] + speed = _s().profile["type_speed"] + dd_mean = tp["dd_mean"] / speed + dd_std = tp["dd_std"] / speed + + if mode == "physical": + _type_physical(text, tp, speed) + else: + _type_semantic(text, dd_mean, dd_std) + + +def _type_semantic(text, dd_mean, dd_std): + for i, ch in enumerate(text): + if i > 0: + delay = _lognormal(dd_mean, dd_std) + time.sleep(max(0.02, delay / 1000.0)) + press_key(ch) + + +def _type_physical(text, tp, speed): + hold_mean = tp["hold_mean"] / speed + hold_std = tp["hold_std"] / speed + dd_mean = tp["dd_mean"] / speed + dd_std = tp["dd_std"] / speed + + _KEYS = { + "Enter": (13, "Enter", "\r"), "Tab": (9, "Tab", "\t"), + "Backspace": (8, "Backspace", ""), " ": (32, "Space", " "), + } + + prev_up_time = 0.0 + + for i, ch in enumerate(text): + if i > 0: + dd = _lognormal(dd_mean, dd_std) / 1000.0 + elapsed = time.monotonic() - prev_up_time + remaining = max(0.005, dd - elapsed) + time.sleep(remaining) + + vk, code, t = _KEYS.get(ch, (ord(ch) if len(ch) == 1 else 0, ch, ch if len(ch) == 1 else "")) + base = {"key": ch, "code": code, "windowsVirtualKeyCode": vk, "nativeVirtualKeyCode": vk} + + cdp("Input.dispatchKeyEvent", type="keyDown", **base, **({"text": t} if t else {})) + if t and len(t) == 1: + cdp("Input.dispatchKeyEvent", type="char", text=t, + **{k: v for k, v in base.items() if k != "text"}) + + hold = _lognormal(hold_mean, hold_std) / 1000.0 + time.sleep(max(0.02, hold)) + + cdp("Input.dispatchKeyEvent", type="keyUp", **base) + prev_up_time = time.monotonic() + + +def human_scroll(x, y, distance=3000, direction="down"): + """Scroll with human-like physics: log-normal deltas and reading pauses.""" + sign = -1 if direction == "down" else 1 + speed = _s().profile["scroll_speed"] + scrolled = 0 + + while scrolled < distance: + delta = _lognormal(167, 60) / speed + delta 
= min(delta, distance - scrolled) + if delta < 1: + break + + cdp("Input.dispatchMouseEvent", + type="mouseWheel", x=x, y=y, deltaX=0, deltaY=sign * delta) + scrolled += delta + + if random.random() < 0.12: + time.sleep(random.uniform(0.8, 3.0)) + else: + interval = _lognormal(101, 30) / speed / 1000.0 + time.sleep(max(0.03, interval)) +``` + +## 4. Validated Test Results + +### 4.1 Statistical Validation (math functions, isolated) + +| Test | Result | Target | Status | +|------|--------|--------|--------| +| Lognormal(85,24) | mean=85.7, std=24.3 | ~85, ~24 | PASS | +| Lognormal(120,34) | mean=120.0, std=34.5 | ~120, ~34 | PASS | +| Lognormal(167,60) | mean=165.0, std=60.7 | ~167, ~60 | PASS | +| Bezier endpoint accuracy | 0.00px error | <5px | PASS | +| Trajectory curvature | 7.6 deg ± 0.8 | ~8.2 deg (Ahmed & Traore 2011) | PASS | +| OU noise RMS (sigma=0.6) | 0.451px | 0.3-1.2px (human range) | PASS | +| Distance scaling 50px | 20 pts, 0.3s | Fitts' Law | PASS | +| Distance scaling 1000px | 80 pts, 1.3s | Fitts' Law | PASS | +| Distance scaling 2000px | 160 pts, 2.7s | Fitts' Law | PASS | + +### 4.2 Browser Integration Test (CDP, live Chrome) + +| Function | fast mode | paced mode | Status | +|----------|----------|-----------|--------| +| human_move | 1.23s | ~3s | PASS | +| human_click | 0.67s | 4.36s | PASS | +| human_wait(0.5) | 0.36s | - | PASS | +| human_scroll(500px) | 0.29s | - | PASS | + +### 4.3 IPC Constraint Discovered During Testing + +browser-harness daemon creates a new IPC socket connection per CDP call. Rapid-fire calls (<25ms intervals) saturate the daemon. Fix: `human_move` enforces `max(25, 16.67 * speed)` ms minimum between `mouseMoved` events. + +## 5. Design Decisions Already Made (via GPT 5.5 Review) + +| Decision | Choice | Rationale | +|----------|--------|-----------| +| State management | `HumanSession` class | Explicit state, survives tab switches (vs. 
fragile global) | +| Pacing | Policy-based (fast/paced/physical) | Auditable and predictable (vs. adaptive stealth) | +| Dependencies | Pure Python stdlib | Zero dependency friction (vs. numpy requirement) | +| Typing modes | semantic / physical | Most UIs need semantic; physical only for event-handler testing | +| Trajectory scaling | Distance-proportional points | Short moves fast, long moves natural (Fitts' Law) | +| Overshoot | Replaced with target-acquisition uncertainty | `_target_offset` with bivariate Gaussian (vs. complex overshoot model) | +| Synthetic focus/blur | Not implemented | Contradicts tool's compositor-level philosophy | +| Fatigue model | Not implemented | Over-engineering for Phase 1 | + +## 6. Specific Review Questions + +### Code Quality +**C1.** The `_lognormal(mean, std)` conversion to mu/sigma uses the standard formula, but does it handle edge cases correctly? What happens when `std > mean` (high variance)? + +**C2.** The `_type_physical` function duplicates the `_KEYS` mapping from `helpers.py`. Should it import the mapping, or is the duplication acceptable given the limited overlap (4 keys vs 14)? + +**C3.** Is the `_target_offset` bivariate Gaussian (mean=1.5px right, 1.0px down) a reasonable model for hand-anatomy click bias? The down-right bias comes from the observation that most users click with a rightward wrist angle. + +### Algorithm Correctness +**A1.** The path noise uses `path_noise_sigma = step_px * 0.14` which produces ~8 deg curvature. But this means the noise amplitude scales with distance (longer moves = bigger noise). Is this physically correct, or should path noise be distance-independent? + +**A2.** The `jitter_scale = 1.0 - abs(2 * t_lin - 1)` produces a triangle envelope: zero at endpoints, max at midpoint. This means the trajectory always starts and ends on the exact Bezier curve with zero noise. Is a triangle the right shape, or should noise taper more gradually (e.g., sine envelope)? 
+ +**A3.** In `human_scroll`, the `_lognormal(167, 60)` for scroll delta comes from eye-tracking literature (Liu et al. 2010). But this is for mouse-wheel scrolling. On macOS with a trackpad (the common case for this tool), scroll deltas are typically smaller and more continuous. Should there be a device-type parameter? + +### Robustness +**R1.** The `_session` global is module-level and persists across browser-harness script invocations within the same daemon lifetime. But the cursor position tracking starts at `[0, 0]` — it doesn't know where the real cursor is. Is this a problem? Should we query actual cursor position from Chrome? + +**R2.** The 25ms IPC floor in `human_move` was discovered empirically. Is there a more principled way to determine this, or should it be configurable? + +**R3.** What happens if `human_click` targets coordinates outside the viewport? The CDP `Input.dispatchMouseEvent` accepts any coordinates, but Chrome may not deliver the event to the correct element. Should we clamp to viewport bounds? diff --git a/agent-workspace/agent_helpers.py b/agent-workspace/agent_helpers.py index 2d493c17..27d055ce 100644 --- a/agent-workspace/agent_helpers.py +++ b/agent-workspace/agent_helpers.py @@ -1,7 +1,687 @@ -"""Agent-editable browser helpers. +"""Human behavior simulation for browser-harness. -Add task-specific browser primitives here. Core helpers from browser_harness.helpers -load this file when BH_AGENT_WORKSPACE points at this directory, or when this -repo's default agent-workspace exists. +Adds human-like timing, mouse trajectories, typing, and scrolling +on top of CDP primitives for UI automation reliability. 
+ +Usage: + human_session("paced") # configure session (optional; default "paced") + human_navigate("https://example.com") # nav + load wait + human reading pause + human_move(500, 300) # Fitts-timed ballistic trajectory + tremor + human_click(500, 300, width=64) # move + click; pass target width for Fitts + human_type("hello", mode="semantic") # per-char correct keycodes + key-hold dwell + human_scroll(600, 400, 2000, device="trackpad") + human_wait(2.0) # log-normal wait with live idle drift + +Session state (cursor, click bias, tremor orientation) persists across separate +`browser-harness -c '...'` invocations via a per-BU_NAME state file, so the cursor +does not teleport to a fresh random point on every call. + +KNOWN CEILINGS — NOT fixable inside this layer; documented honestly: + * Event rate is floored ~20-40Hz by the per-call CDP/IPC round-trip and time.sleep + granularity. Real pointing devices report at 60-1000Hz. A detector that bins + inter-event intervals can flag the low rate regardless of trajectory shape. + * Input.dispatchMouseEvent emits MouseEvents only: getCoalescedEvents().length == 1 + and there is no PointerEvent pressure/tilt stream. Pointer-fidelity detectors see + synthetic input even with a perfect trajectory. + * A CDP/remote-debugging attachment is itself detectable (anti-debugger probes, + Runtime/Page domain side-effects) independent of behavior. +This layer targets heuristic and weak-ML detectors; against top-tier ensembles it +lowers per-action risk and buys time, it does not make a session indistinguishable. """ +import json, math, os, random, tempfile, time + +from browser_harness.helpers import cdp, _KEYS as _CORE_KEYS + + +# --------------------------------------------------------------------------- +# Pacing profiles — policy-based, not adaptive. +# move_step_ms : per-event interval (>= IPC floor); also sets the event rate. +# move_time_mult: scales the Fitts'-Law movement-time estimate. 
+# --------------------------------------------------------------------------- + +_PACING = { + "fast": { + "move_step_ms": 25, + "move_time_mult": 0.7, + "hover_range": (0.02, 0.05), + "dwell_mean": 45, + "type_speed": 1.5, + "scroll_speed": 1.5, + "wait_mult": 0.5, + "event_jitter_ms": 1.5, + }, + "paced": { + "move_step_ms": 35, + "move_time_mult": 1.0, + "hover_range": (0.08, 0.20), + "dwell_mean": 85, + "type_speed": 1.0, + "scroll_speed": 1.0, + "wait_mult": 1.0, + "event_jitter_ms": 3.0, + }, + "physical": { + "move_step_ms": 50, + "move_time_mult": 1.3, + "hover_range": (0.10, 0.25), + "dwell_mean": 95, + "type_speed": 0.8, + "scroll_speed": 0.8, + "wait_mult": 1.3, + "event_jitter_ms": 4.0, + }, +} + +# Typing profiles — CMU Keystroke Dataset (Killourhy & Maxion, DSN 2009). +# dd = down-down interval, hold = key hold duration (ms). WPM at 5 chars/word: +# hunt_peck ~36, average ~72, skilled ~100, expert ~140. +_TYPING_PROFILES = { + "hunt_peck": {"dd_mean": 335, "dd_std": 182, "hold_mean": 95, "hold_std": 30}, + "average": {"dd_mean": 166, "dd_std": 62, "hold_mean": 79, "hold_std": 22}, + "skilled": {"dd_mean": 120, "dd_std": 34, "hold_mean": 75, "hold_std": 18}, + "expert": {"dd_mean": 86, "dd_std": 18, "hold_mean": 65, "hold_std": 12}, +} + +_IPC_FLOOR_MS = 20 + +# Hand tremor model. Two anisotropic OU axes (2:1) rotated by a session-fixed angle. +# Combined isotropic-equivalent RMS = sqrt((1.0^2 + 0.5^2)/2) ~= 0.79px, inside the +# 0.3-1.2px human hand-tremor band (vs the prior 2.19px which exceeded it). +_TREMOR_STD_MAJOR = 1.0 +_TREMOR_STD_MINOR = 0.5 +_TREMOR_TAU = 0.12 # OU correlation time (s); autocorr per step = exp(-dt/tau) + +# Fitts' Law: MT = a + b * log2(D/W + 1) (Shannon form). Mouse-typical constants. 
+_FITTS_A_MS = 80.0 +_FITTS_B_MS = 120.0 +_FITTS_DEFAULT_W = 80.0 # assumed target width (px) when caller gives none + +_SESSION_TTL = 600 # ignore persisted session state older than this (s) + + +# --------------------------------------------------------------------------- +# Session (state persists across -c invocations via a per-BU_NAME file) +# --------------------------------------------------------------------------- + +def _state_path(): + base = os.environ.get("BH_TMP_DIR") or os.environ.get("BH_RUNTIME_DIR") or tempfile.gettempdir() + name = os.environ.get("BU_NAME", "default") + return os.path.join(base, "bh_human_session_%s.json" % name) + + +class _HumanSession: + """Tracks cursor, per-session click bias, and tremor orientation. + + cursor is None until first action — avoids the [0, 0] teleport signature. + click_bias is session-level (not per-click) so the statistical mean of click + error does not converge to the target center over many clicks. + State is restored from disk (unless fresh=True) so continuity survives the + fresh Python process that each `browser-harness -c` spawns. 
+ """ + + def __init__(self, pacing="paced", fresh=False): + self.pacing = pacing + self.profile = _PACING[pacing] + self.viewport = None + self.cursor = None + self.click_bias = (random.gauss(0, 1.0), random.gauss(0, 1.0)) + self.tremor_angle = random.uniform(0, math.pi) + if not fresh: + self._load() + + def set_pacing(self, pacing): + self.pacing = pacing + self.profile = _PACING[pacing] + + def invalidate_viewport(self): + self.viewport = None + + def _load(self): + try: + p = _state_path() + with open(p) as f: + d = json.load(f) + if time.time() - float(d.get("ts", 0)) > _SESSION_TTL: + return + cur = d.get("cursor") + if cur and len(cur) == 2: + self.cursor = [float(cur[0]), float(cur[1])] + cb = d.get("click_bias") + if cb and len(cb) == 2: + self.click_bias = (float(cb[0]), float(cb[1])) + if "tremor_angle" in d: + self.tremor_angle = float(d["tremor_angle"]) + except Exception: + pass + + def _save(self): + # Atomic write (tmp + os.replace) so a concurrent reader never sees a + # half-written file; a corrupt/partial read in _load just falls back to + # a fresh session. 
+ try: + p = _state_path() + tmp = "%s.%d.tmp" % (p, os.getpid()) + with open(tmp, "w") as f: + json.dump({ + "cursor": self.cursor, + "click_bias": list(self.click_bias), + "tremor_angle": self.tremor_angle, + "ts": time.time(), + }, f) + os.replace(tmp, p) + except Exception: + pass + + +_session = None + + +def _s(): + global _session + if _session is None: + _session = _HumanSession() + return _session + + +def _viewport(s): + if s.viewport is None: + try: + m = cdp("Page.getLayoutMetrics") + vp = m.get("layoutViewport", {}) + s.viewport = ( + int(vp.get("clientWidth", 1200)), + int(vp.get("clientHeight", 800)), + ) + except (KeyError, TypeError, ValueError, OSError, RuntimeError): + s.viewport = (1200, 800) + return s.viewport + + +def _ensure_cursor(s): + """Lazy cursor init at a random plausible viewport position.""" + if s.cursor is None: + w, h = _viewport(s) + s.cursor = [ + random.uniform(w * 0.2, w * 0.8), + random.uniform(h * 0.2, h * 0.8), + ] + return s.cursor + + +def _clamp(x, y, s=None): + s = s or _s() + w, h = _viewport(s) + return (max(0, min(w - 1, x)), max(0, min(h - 1, y))) + + +# --------------------------------------------------------------------------- +# Math helpers (pure Python — no numpy required) +# --------------------------------------------------------------------------- + +def _lognormal(mean, std, max_sigma=3): + """Sample log-normal with the requested mean/std, truncated at mean + max_sigma*std. + + Truncation prevents catastrophic right-tail outliers (e.g. a 50s wait for a 1s base). + """ + if mean <= 0: + return max(0.001, mean) + variance = std ** 2 + mu = math.log(mean ** 2 / math.sqrt(variance + mean ** 2)) + sigma = math.sqrt(math.log(1 + variance / mean ** 2)) + val = random.lognormvariate(mu, sigma) + return min(val, mean + max_sigma * std) + + +def _ou_axis(n, dt, std): + """Exact-discretization OU chain seeded from its stationary distribution. 
+ + Uses a = exp(-dt/tau) and innovation std = std*sqrt(1-a^2), so the realized + stationary std equals `std` exactly and lag-1 autocorrelation equals a exactly + (no Euler-Maruyama bias). dt is the REAL per-event interval, so the temporal + correlation matches the wall-clock signal a detector observes. + """ + if n <= 0: + return [] + a = math.exp(-dt / _TREMOR_TAU) + innov = std * math.sqrt(max(0.0, 1.0 - a * a)) + vals = [random.gauss(0, std)] + for _ in range(n - 1): + vals.append(a * vals[-1] + innov * random.gauss(0, 1)) + return vals + + +def _tremor(n, dt, angle): + """Anisotropic tremor: two OU axes (2:1) rotated into screen coords by `angle`.""" + maj = _ou_axis(n, dt, _TREMOR_STD_MAJOR) + mnr = _ou_axis(n, dt, _TREMOR_STD_MINOR) + ca, sa = math.cos(angle), math.sin(angle) + nx = [maj[i] * ca - mnr[i] * sa for i in range(n)] + ny = [maj[i] * sa + mnr[i] * ca for i in range(n)] + return nx, ny + + +def _ballistic_easing(n): + """Asymmetric ease whose velocity peaks early (~t=0.33) with a long decel tail. + + Cumulative of a Beta(2, 3)-shaped velocity profile — the Meyer/Woodworth + two-component (ballistic + corrective) reaching model — instead of the + symmetric smoothstep bell that constant easing produces. + """ + if n <= 1: + return [1.0] + av, bv = 2.0, 3.0 + w = [] + for i in range(n): + s = i / (n - 1) + w.append((s ** (av - 1)) * ((1 - s) ** (bv - 1))) + total = sum(w) or 1.0 + out, acc = [], 0.0 + for wi in w: + acc += wi + out.append(acc / total) + out[-1] = 1.0 + return out + + +def _fitts_ms(dist, width=None): + """Movement time via Fitts' Law (Shannon form): MT = a + b*log2(D/W + 1).""" + w = width if (width and width > 0) else _FITTS_DEFAULT_W + idx = math.log2(dist / w + 1.0) + return _FITTS_A_MS + _FITTS_B_MS * idx + + +def _bezier_trajectory(start, end, n, dt, angle): + """Cubic Bezier + ballistic easing + sine-enveloped anisotropic OU tremor. + + Both control points share one side (C-arc) to avoid implausible S-curves. 
+ The sine envelope is C^1-continuous (no triangle cusp) and zeroes tremor at + both endpoints, so the landing point is stable. The final point is forced to + `end` exactly, which preserves the human_click teleport-fix invariant. + """ + sx, sy = start + ex, ey = end + dist = math.hypot(ex - sx, ey - sy) + if dist < 2: + return [(ex, ey)] + + dx, dy = ex - sx, ey - sy + norm = dist or 1 + perp_x, perp_y = -dy / norm, dx / norm + + side = random.choice([-1, 1]) + arc1 = dist * abs(random.gauss(0.09, 0.04)) * side + arc2 = dist * abs(random.gauss(0.09, 0.04)) * side + + cp1 = (sx + dx * 0.3 + perp_x * arc1, sy + dy * 0.3 + perp_y * arc1) + cp2 = (sx + dx * 0.7 + perp_x * arc2, sy + dy * 0.7 + perp_y * arc2) + + ease = _ballistic_easing(n) + nx, ny = _tremor(n, dt, angle) + + points = [] + for i in range(n): + t = ease[i] + mt = 1.0 - t + px = mt**3 * sx + 3 * mt**2 * t * cp1[0] + 3 * mt * t**2 * cp2[0] + t**3 * ex + py = mt**3 * sy + 3 * mt**2 * t * cp1[1] + 3 * mt * t**2 * cp2[1] + t**3 * ey + env = math.sin((i / max(1, n - 1)) * math.pi) + px += nx[i] * env + py += ny[i] * env + points.append((px, py)) + + points[-1] = (ex, ey) + return points + + +def _target_offset(x, y, s=None): + """Click point = session-level systematic bias + small per-click variance. + + Variance is bounded (~1.5px) so small UI targets are still hit. Bias is fixed + per session so the click-error mean does not converge to the target center. + """ + s = s or _s() + bias_x, bias_y = s.click_bias + var = 1.5 + return ( + x + bias_x + random.gauss(0, var), + y + bias_y + random.gauss(0, var * 0.7), + ) + + +# --------------------------------------------------------------------------- +# Key code resolution +# --------------------------------------------------------------------------- + +def _vk_for_char(ch): + """Resolve (vk, code, text) for a character with correct virtual-key codes. 
+ + Uses _CORE_KEYS for special keys, ASCII-UPPERCASE ordinals for letters + (so 'a' -> VK 65 'KeyA', not ord('a')=97 = VK_NUMPAD1), and ASCII digits. + """ + if ch in _CORE_KEYS: + return _CORE_KEYS[ch] + if len(ch) == 1: + upper = ch.upper() + if "A" <= upper <= "Z": + return (ord(upper), "Key%s" % upper, ch) + if "0" <= ch <= "9": + return (ord(ch), "Digit%s" % ch, ch) + return (0, ch, ch if len(ch) == 1 else "") + + +# --------------------------------------------------------------------------- +# Timing / dispatch helpers +# --------------------------------------------------------------------------- + +def _jittered_sleep(base_ms, sigma_ms=None): + if sigma_ms is None: + sigma_ms = _s().profile["event_jitter_ms"] + delay = max(_IPC_FLOOR_MS, base_ms + random.gauss(0, sigma_ms)) / 1000.0 + time.sleep(delay) + + +def _dispatch_char(ch, hold_s): + """One keystroke: keyDown [+ char] -> hold -> keyUp, with correct keycodes.""" + vk, code, t = _vk_for_char(ch) + base = {"key": ch, "code": code, "windowsVirtualKeyCode": vk, "nativeVirtualKeyCode": vk} + cdp("Input.dispatchKeyEvent", type="keyDown", **base, **({"text": t} if t else {})) + if t and len(t) == 1: + cdp("Input.dispatchKeyEvent", type="char", text=t, + **{k: v for k, v in base.items() if k != "text"}) + time.sleep(max(0.01, hold_s)) + cdp("Input.dispatchKeyEvent", type="keyUp", **base) + + +# --------------------------------------------------------------------------- +# Public API +# --------------------------------------------------------------------------- + +def human_session(pacing="paced", fresh=False): + """Configure session pacing. + + Modes: + fast — internal/admin tools (faster movement, smaller hover) + paced — ordinary UI automation (default) + physical — keyboard/mouse event testing (most deliberate) + + fresh=True starts a clean session (ignores any persisted cursor/bias). 
+ """ + global _session + _session = _HumanSession(pacing, fresh=fresh) + return _session + + +def human_wait(base=1.0, drift=True): + """Log-normal randomized wait. When drift=True and the cursor is known, the + cursor wanders a few px during the wait instead of freezing — real users are + never perfectly still while reading, and a frozen cursor between actions is a + session-level bot tell. + """ + s = _s() + p = s.profile + total = max(0.05, _lognormal(base, base * 0.3) * p["wait_mult"]) + + if not drift or s.cursor is None or total < 0.4: + time.sleep(total) + return + + remaining = total + ax, ay = s.cursor # fixed anchor: drift offsets from here, not cumulatively + while remaining > 0.05: + chunk = min(remaining, random.uniform(0.15, 0.4)) + time.sleep(chunk) + remaining -= chunk + if remaining > 0.1 and random.random() < 0.5: + nx, ny = _clamp(ax + random.gauss(0, 4.0), ay + random.gauss(0, 4.0), s) + ix, iy = int(round(nx)), int(round(ny)) + cdp("Input.dispatchMouseEvent", type="mouseMoved", x=ix, y=iy) + s.cursor = [float(ix), float(iy)] + s._save() + + +def human_move(x, y, width=None): + """Move cursor to (x, y) via a Fitts-timed ballistic Bezier trajectory. + + Movement time follows Fitts' Law (pass `width` = target width in px for a + correct index of difficulty; a default is assumed otherwise). Long moves + occasionally overshoot and correct. Coordinates are integer-quantized at + dispatch so MouseEvent.clientX in the page is a normal integer. The session + cursor is updated per event, so a mid-trajectory CDP failure leaves the + cursor where the pointer actually stopped. 
+ """ + s = _s() + p = s.profile + cur = _ensure_cursor(s) + + x, y = _clamp(x, y, s) + dist = math.hypot(x - cur[0], y - cur[1]) + if dist < 2: + s.cursor = [float(x), float(y)] + return + + dt = p["move_step_ms"] / 1000.0 + duration_ms = _fitts_ms(dist, width) * p["move_time_mult"] + n = max(8, min(120, int(duration_ms / p["move_step_ms"]))) + + segments = [] + if dist > 400 and random.random() < 0.15: + over = min(dist * abs(random.gauss(0.06, 0.02)), 60.0) + ux = cur[0] + (x - cur[0]) / dist * (dist + over) + uy = cur[1] + (y - cur[1]) / dist * (dist + over) + ux, uy = _clamp(ux, uy, s) + n1 = max(6, int(n * 0.8)) + n2 = max(4, n - n1) + segments.append(_bezier_trajectory((cur[0], cur[1]), (ux, uy), n1, dt, s.tremor_angle)) + segments.append(_bezier_trajectory((ux, uy), (x, y), n2, dt, s.tremor_angle)) + else: + segments.append(_bezier_trajectory((cur[0], cur[1]), (x, y), n, dt, s.tremor_angle)) + + for seg in segments: + for px, py in seg: + ix, iy = int(round(px)), int(round(py)) + cdp("Input.dispatchMouseEvent", type="mouseMoved", x=ix, y=iy) + s.cursor = [float(ix), float(iy)] + _jittered_sleep(p["move_step_ms"]) + + s.cursor = [float(x), float(y)] + s._save() + + +def human_click(x, y, button="left", width=None): + """Move to the click target, then dispatch press/release at that point. + + Invariant: mousePressed coordinates EXACTLY equal the final mouseMoved + coordinate — the offset (jitter) is folded into the move destination, not + added after, so there is no teleport-on-click. A <=1px micro-drift during the + dwell makes the release point differ slightly from the press (real fingers + shift during a hold) while staying inside the target's hit-box. 
+ """ + s = _s() + p = s.profile + + cx, cy = _target_offset(x, y, s) + cx, cy = _clamp(cx, cy, s) + + human_move(cx, cy, width=width) + + hover_lo, hover_hi = p["hover_range"] + time.sleep(random.uniform(hover_lo, hover_hi)) + + ix, iy = int(round(cx)), int(round(cy)) + cdp("Input.dispatchMouseEvent", + type="mousePressed", x=ix, y=iy, button=button, clickCount=1) + + dwell = _lognormal(p["dwell_mean"], p["dwell_mean"] * 0.28) / 1000.0 + time.sleep(max(0.02, dwell * 0.5)) + + ddx = max(-1, min(1, int(round(random.gauss(0, 0.6))))) + ddy = max(-1, min(1, int(round(random.gauss(0, 0.6))))) + rx, ry = _clamp(ix + ddx, iy + ddy, s) + rx, ry = int(round(rx)), int(round(ry)) + if (rx, ry) != (ix, iy): + cdp("Input.dispatchMouseEvent", type="mouseMoved", x=rx, y=ry) + time.sleep(max(0.02, dwell * 0.5)) + + cdp("Input.dispatchMouseEvent", + type="mouseReleased", x=rx, y=ry, button=button, clickCount=1) + s.cursor = [float(rx), float(ry)] + s._save() + + +def human_type(text, profile="skilled", mode="semantic"): + """Type text with human-like inter-key timing AND key-hold dwell. + + Both modes emit correct virtual-key codes (via _vk_for_char) and a non-zero + key hold — the default semantic mode no longer routes through the core + press_key (which emits 0ms holds and VK_NUMPAD codes for lowercase letters). + + Modes: + semantic — per-key keyDown/char/keyUp with a sampled hold and a + log-normal inter-key gap. Sufficient for most form filling. + physical — down-down (DD) timing measured KeyDown-to-KeyDown per the + CMU dataset, with hold running concurrently with the gap. + + Profiles: hunt_peck (~36 WPM), average (~72), skilled (~100), expert (~140). 
+ """ + tp = _TYPING_PROFILES[profile] + speed = _s().profile["type_speed"] + if mode == "physical": + _type_physical(text, tp, speed) + else: + _type_semantic(text, tp, speed) + + +def _type_semantic(text, tp, speed): + dd_mean = tp["dd_mean"] / speed + dd_std = tp["dd_std"] / speed + hold_mean = tp["hold_mean"] / speed + hold_std = tp["hold_std"] / speed + for i, ch in enumerate(text): + if i > 0: + gap = _lognormal(dd_mean, dd_std) + time.sleep(max(0.02, gap / 1000.0)) + hold = _lognormal(hold_mean, hold_std) / 1000.0 + _dispatch_char(ch, hold) + + +def _type_physical(text, tp, speed): + """Physical typing: KeyDown-to-KeyDown timing per the CMU dataset. + + DD is measured from the previous keyDown to the current keyDown; hold runs + concurrently with DD (it does not add to the inter-keystroke delay). When a + sampled DD is shorter than the previous hold, DD is lifted to hold + 10ms so + the realized DD distribution is not silently truncated for fast profiles. + """ + hold_mean = tp["hold_mean"] / speed + hold_std = tp["hold_std"] / speed + dd_mean = tp["dd_mean"] / speed + dd_std = tp["dd_std"] / speed + + prev_down_time = 0.0 + prev_hold = 0.0 + + for i, ch in enumerate(text): + hold = _lognormal(hold_mean, hold_std) / 1000.0 + if i > 0: + dd = _lognormal(dd_mean, dd_std) / 1000.0 + dd = max(dd, prev_hold + 0.01) + elapsed = time.monotonic() - prev_down_time + time.sleep(max(0.001, dd - elapsed)) + + vk, code, t = _vk_for_char(ch) + base = {"key": ch, "code": code, "windowsVirtualKeyCode": vk, "nativeVirtualKeyCode": vk} + + down_time = time.monotonic() + cdp("Input.dispatchKeyEvent", type="keyDown", **base, + **({"text": t} if t else {})) + if t and len(t) == 1: + cdp("Input.dispatchKeyEvent", type="char", text=t, + **{k: v for k, v in base.items() if k != "text"}) + + time.sleep(max(0.01, hold)) + cdp("Input.dispatchKeyEvent", type="keyUp", **base) + prev_down_time = down_time + prev_hold = hold + + +def human_scroll(x, y, distance=3000, direction="down", 
device="trackpad"): + """Scroll with human-like physics, anchored at the cursor. + + The cursor is first moved to (x, y) so the wheel events originate where the + pointer actually is (no scroll at a never-visited point), and the session + cursor is left at the anchor afterward. + + Device profiles: + trackpad — small continuous deltas, high frequency (default; macOS pattern) + wheel — discrete detent multiples (a fixed notch x a small count), + matching a mechanical wheel's quantized deltaY + """ + sign = -1 if direction == "down" else 1 + s = _s() + speed = s.profile["scroll_speed"] + + x, y = _clamp(x, y, s) + human_move(x, y) + ix, iy = int(round(x)), int(round(y)) + + if device == "wheel": + notch = random.choice([100, 120]) + interval_mean = 101 + reading_prob = 0.12 + else: + interval_mean = 40 + reading_prob = 0.04 + + scrolled = 0 + while scrolled < distance: + if device == "wheel": + # Always a whole-notch multiple — never clamped to the remainder, so + # every deltaY is a real detent magnitude. The last event may overscroll + # by up to one event (<=3 notches), which is what a real wheel does. + count = random.choices([1, 2, 3], weights=[0.7, 0.2, 0.1])[0] + delta = notch * count + else: + # Continuous trackpad deltas; clamp the final step to the remainder. + delta = min(_lognormal(25, 10) / speed, distance - scrolled) + d = int(round(sign * delta)) + if d == 0: + break + + cdp("Input.dispatchMouseEvent", type="mouseWheel", + x=ix, y=iy, deltaX=0, deltaY=d) + scrolled += abs(d) + + if random.random() < reading_prob: + time.sleep(random.uniform(0.8, 3.0)) + else: + interval = _lognormal(interval_mean, interval_mean * 0.3) / speed / 1000.0 + time.sleep(max(0.025, interval)) + + s.cursor = [float(ix), float(iy)] + s._save() + + +def human_navigate(url): + """Navigate then pause like a human reading the page. 
+ + Resolves new_tab/goto_url and wait_for_load from the core helpers at call + time (so this module does not hard-depend on their names), invalidates the + cached viewport for the new page, and adds a log-normal reading pause. + + Prefers new_tab over goto_url per the SKILL.md rule that goto_url runs in the + user's active tab and clobbers their work — the safe default opens a new tab. + """ + import browser_harness.helpers as _h + + _s().invalidate_viewport() + goto = getattr(_h, "new_tab", None) or getattr(_h, "goto_url", None) + if goto: + goto(url) + wait_for_load = getattr(_h, "wait_for_load", None) + if wait_for_load: + try: + wait_for_load() + except Exception: + pass + human_wait(random.uniform(2.0, 5.0)) diff --git a/tests/unit/test_human_behavior.py b/tests/unit/test_human_behavior.py new file mode 100644 index 00000000..07750f84 --- /dev/null +++ b/tests/unit/test_human_behavior.py @@ -0,0 +1,309 @@ +"""Hermetic unit tests for the human-behavior-simulation layer. + +No browser, no daemon, no installed browser_harness: a fake browser_harness.helpers +is injected into sys.modules before agent_helpers.py is loaded, so every CDP call is +captured in-memory. Pure-math functions are tested statistically; the dispatch +functions are tested for the structural/integer/ordering invariants that the +behavioral model depends on. 
+ +Run: python3 tests/unit/test_human_behavior.py + or: pytest tests/unit/test_human_behavior.py +""" + +import importlib.util +import math +import os +import statistics +import sys +import tempfile +import types + +# --- inject a fake browser_harness.helpers BEFORE loading the module ---------- + +EVENTS = [] # captured CDP calls: list of (method, kwargs) +SLEEPS = [] # captured sleep durations + + +def _fake_cdp(method, **kw): + EVENTS.append((method, kw)) + if method == "Page.getLayoutMetrics": + return {"layoutViewport": {"clientWidth": 1200, "clientHeight": 800}} + return {} + + +_FAKE_KEYS = { + "Enter": (13, "Enter", "\r"), + "Tab": (9, "Tab", "\t"), + "Backspace": (8, "Backspace", ""), + " ": (32, "Space", " "), +} + + +def _install_fake_helpers(): + pkg = types.ModuleType("browser_harness") + helpers = types.ModuleType("browser_harness.helpers") + helpers.cdp = _fake_cdp + helpers._KEYS = _FAKE_KEYS + helpers.new_tab = lambda url: EVENTS.append(("new_tab", {"url": url})) + helpers.goto_url = lambda url: EVENTS.append(("goto_url", {"url": url})) + helpers.wait_for_load = lambda: EVENTS.append(("wait_for_load", {})) + pkg.helpers = helpers + sys.modules["browser_harness"] = pkg + sys.modules["browser_harness.helpers"] = helpers + + +def _load_module(): + _install_fake_helpers() + path = os.path.join(os.path.dirname(__file__), "..", "..", "agent-workspace", "agent_helpers.py") + spec = importlib.util.spec_from_file_location("ah_under_test", os.path.abspath(path)) + mod = importlib.util.module_from_spec(spec) + spec.loader.exec_module(mod) + return mod + + +os.environ["BH_TMP_DIR"] = tempfile.mkdtemp(prefix="bh_human_test_") +os.environ["BU_NAME"] = "unittest" +ah = _load_module() + +# make sleeps instant but observable +import time as _time +_REAL_SLEEP = _time.sleep +_time.sleep = lambda d=0: SLEEPS.append(d) + + +def _reset(): + EVENTS.clear() + SLEEPS.clear() + ah.human_session("paced", fresh=True) + + +def _mouse_events(): + return [(m, k) for (m, k) in 
EVENTS if m == "Input.dispatchMouseEvent"] + + +def _key_events(): + return [(m, k) for (m, k) in EVENTS if m == "Input.dispatchKeyEvent"] + + +# --- pure math --------------------------------------------------------------- + +def test_vk_for_char_letters_digits_specials(): + assert ah._vk_for_char("a") == (65, "KeyA", "a") + assert ah._vk_for_char("A") == (65, "KeyA", "A") + assert ah._vk_for_char("z") == (90, "KeyZ", "z") + assert ah._vk_for_char("5") == (53, "Digit5", "5") + assert ah._vk_for_char("Enter") == (13, "Enter", "\r") + # the bug being fixed: lowercase must NOT map to ord('a')=97 (VK_NUMPAD1) + vk, _, _ = ah._vk_for_char("a") + assert vk == 65 and not (97 <= vk <= 122) + + +def test_lognormal_recovers_mean_std(): + for mean, std in [(120, 34), (85, 24), (1.0, 0.3), (50, 80)]: # incl std>mean + xs = [ah._lognormal(mean, std, max_sigma=12) for _ in range(60000)] + m = statistics.mean(xs) + sd = statistics.pstdev(xs) + assert abs(m - mean) / mean < 0.04, (mean, std, m) + assert abs(sd - std) / std < 0.08, (mean, std, sd) + + +def test_ou_stationary_std_and_autocorr(): + dt = 0.035 + chain = ah._ou_axis(40000, dt, 1.0) + sd = statistics.pstdev(chain) + assert 0.95 <= sd <= 1.05, sd # stationary std == requested (exact discretization) + # lag-1 autocorrelation == exp(-dt/tau) + expected = math.exp(-dt / ah._TREMOR_TAU) + m = statistics.mean(chain) + num = sum((chain[i] - m) * (chain[i + 1] - m) for i in range(len(chain) - 1)) + den = sum((c - m) ** 2 for c in chain) + ac = num / den + assert abs(ac - expected) < 0.03, (ac, expected) + + +def test_tremor_rms_in_human_band(): + dt = 0.035 + nx, ny = ah._tremor(40000, dt, 0.6) + per_axis_rms = math.sqrt((sum(v * v for v in nx) + sum(v * v for v in ny)) / (2 * len(nx))) + assert 0.3 <= per_axis_rms <= 1.2, per_axis_rms # cited human hand-tremor band + + +def test_ballistic_easing_monotonic_and_early_peak(): + n = 100 + e = ah._ballistic_easing(n) + assert e[0] == 0.0 or e[0] < 1e-9 + assert abs(e[-1] - 1.0) < 
1e-9 + assert all(e[i + 1] >= e[i] - 1e-12 for i in range(n - 1)), "must be monotonic" + vel = [e[i + 1] - e[i] for i in range(n - 1)] + peak = vel.index(max(vel)) + assert peak < n * 0.5, peak # velocity peaks in first half (asymmetric, not smoothstep) + + +def test_fitts_sublinear_and_increasing(): + short = ah._fitts_ms(50) + long = ah._fitts_ms(1600) + assert long > short + # log law: a 32x distance increase must NOT produce a 32x time increase + assert (long / short) < (1600 / 50) + + +def test_bezier_endpoint_exact_and_finite(): + pts = ah._bezier_trajectory((10.0, 10.0), (640.0, 480.0), n=40, dt=0.035, angle=0.5) + assert len(pts) == 40 + assert pts[-1] == (640.0, 480.0) # invariant: last point is the exact target + assert all(math.isfinite(x) and math.isfinite(y) for x, y in pts) + + +# --- dispatch invariants ----------------------------------------------------- + +def _all_mouse_coords_int(): + for _, k in _mouse_events(): + assert isinstance(k["x"], int) and isinstance(k["y"], int), k + if "deltaY" in k: + assert isinstance(k["deltaY"], int), k + + +def test_human_move_integer_coords_and_cursor_update(): + _reset() + ah.human_move(900, 600) + _all_mouse_coords_int() + moved = [k for m, k in _mouse_events() if k.get("type") == "mouseMoved"] + assert len(moved) >= 8 + assert moved[-1]["x"] == 900 and moved[-1]["y"] == 600 + assert ah._s().cursor == [900.0, 600.0] + + +def test_human_click_teleport_invariant(): + _reset() + ah.human_move(100, 100) + ah.human_click(700, 400) + me = _mouse_events() + press_idx = next(i for i, (m, k) in enumerate(me) if k.get("type") == "mousePressed") + # the event immediately before the press must be a mouseMoved at the SAME coords + prev_type = me[press_idx - 1][1]["type"] + assert prev_type == "mouseMoved" + assert me[press_idx - 1][1]["x"] == me[press_idx][1]["x"] + assert me[press_idx - 1][1]["y"] == me[press_idx][1]["y"] + _all_mouse_coords_int() + # release within 1px of press (micro-drift, stays in hit-box) + rel = 
next(k for m, k in me if k.get("type") == "mouseReleased") + prs = me[press_idx][1] + assert abs(rel["x"] - prs["x"]) <= 1 and abs(rel["y"] - prs["y"]) <= 1 + + +def test_human_type_semantic_correct_vk_and_hold(): + _reset() + ah.human_type("ab5", mode="semantic") + ke = _key_events() + downs = [k for m, k in ke if k["type"] == "keyDown"] + ups = [k for m, k in ke if k["type"] == "keyUp"] + assert len(downs) == 3 and len(ups) == 3 + # no letter keyDown may carry a NUMPAD/function virtual-key code (the old bug) + for d in downs: + if d["key"].isalpha(): + assert d["windowsVirtualKeyCode"] not in range(97, 123), d + a_down = downs[0] + assert a_down["windowsVirtualKeyCode"] == 65 and a_down["code"] == "KeyA" + # hold is structural: _dispatch_char always sleeps >= 0.01 between down and up + assert any(s >= 0.01 for s in SLEEPS) + + +def test_human_type_physical_down_to_down_and_vk(): + _reset() + ah.human_type("hi", mode="physical") + ke = _key_events() + seq = [(k["type"], k.get("windowsVirtualKeyCode")) for m, k in ke] + # h: keyDown(72) char keyUp(72), i: keyDown(73) char keyUp(73) + downs = [vk for t, vk in seq if t == "keyDown"] + assert downs == [72, 73], downs + + +def test_human_scroll_cursor_anchored_and_integer_deltas(): + import random + random.seed(3) + # run many times: the old final-step clamp produced non-detent deltas ~39% of seeds + for _ in range(60): + _reset() + ah.human_move(50, 50) + EVENTS.clear() + ah.human_scroll(600, 400, distance=800, device="wheel") + me = _mouse_events() + first_wheel = next(i for i, (m, k) in enumerate(me) if k.get("type") == "mouseWheel") + assert any(k.get("type") == "mouseMoved" for m, k in me[:first_wheel]) # anchor move + wheels = [k for m, k in me if k.get("type") == "mouseWheel"] + assert len(wheels) >= 1 + for w in wheels: + assert isinstance(w["deltaY"], int) and w["deltaY"] != 0 + assert abs(w["deltaY"]) % 100 == 0 or abs(w["deltaY"]) % 120 == 0, w # every event a detent multiple + assert ah._s().cursor == 
[600.0, 400.0] + + +def test_idle_drift_stays_near_anchor(): + import math as _m + import random + random.seed(11) + _reset() + ah.human_move(400, 400) + anchor = tuple(ah._s().cursor) + EVENTS.clear() + ah.human_wait(10.0, drift=True) # long wait => many drift steps + for m, k in _mouse_events(): + if k.get("type") == "mouseMoved": + assert _m.hypot(k["x"] - anchor[0], k["y"] - anchor[1]) <= 25, (k, anchor) + + +def test_click_release_stays_in_viewport_at_corner(): + _reset() + ah.human_move(1100, 700) + EVENTS.clear() + ah.human_click(5000, 5000) # clamps to bottom-right corner (1199, 799) + for m, k in _mouse_events(): + assert 0 <= k["x"] <= 1199 and 0 <= k["y"] <= 799, k + cx, cy = ah._s().cursor + assert 0 <= cx <= 1199 and 0 <= cy <= 799 + + +def test_human_wait_idle_drift_emits_moves(): + import random + random.seed(7) + _reset() + ah.human_move(300, 300) + EVENTS.clear() + ah.human_wait(2.0, drift=True) + moved = [k for m, k in _mouse_events() if k.get("type") == "mouseMoved"] + assert len(moved) >= 1, "idle drift should emit at least one move over a 2s wait" + _all_mouse_coords_int() + + +def test_human_navigate_invalidates_viewport_and_pauses(): + _reset() + ah._s().viewport = (999, 999) + ah.human_navigate("https://example.com") + assert ah._s().viewport != (999, 999) or ah._s().viewport is None + # prefers new_tab (does not clobber the user's active tab via goto_url) + assert any(m == "new_tab" for m, k in EVENTS) + assert not any(m == "goto_url" for m, k in EVENTS) + + +def test_no_public_config_leak(): + # only human_* verbs should be exportable (no leading-underscore tables/class) + public = [n for n in vars(ah) if not n.startswith("_") and callable(getattr(ah, n))] + human = [n for n in public if n.startswith("human_")] + leaked = [n for n in public if n in ("PACING", "TYPING_PROFILES", "HumanSession")] + assert leaked == [], leaked + assert set(human) >= {"human_session", "human_wait", "human_move", + "human_click", "human_type", "human_scroll", 
"human_navigate"} + + +def _run_all(): + fns = [g for n, g in sorted(globals().items()) if n.startswith("test_") and callable(g)] + passed = 0 + for fn in fns: + fn() + print("PASS %s" % fn.__name__) + passed += 1 + print("\n%d/%d tests passed" % (passed, len(fns))) + + +if __name__ == "__main__": + _run_all() From 7da475c6feb5b6420096fa80ca7d9c52a5306340 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?thesqrldev=F0=9F=90=BF=EF=B8=8F?= Date: Fri, 29 May 2026 06:30:00 +0900 Subject: [PATCH 2/9] =?UTF-8?q?feat(human-sim):=20push=20past=20the=20CDP?= =?UTF-8?q?=20input=20ceilings=20=E2=80=94=20server-side=20~60Hz=20dispatc?= =?UTF-8?q?h=20+=20Runtime.enable=20drop?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tackles the three documented behavioral-detection ceilings with what is actually fixable (researched against Chromium source + fingerprinting lit), and documents honestly what is not. Event RATE (was ~28Hz) — FIXED: - daemon: new `meta:"input_sequence"` handler dispatches a precomputed event list server-side over the persistent CDP WS, sleeping delay_ms before each. Decouples the rate from the per-call client<->daemon IPC round-trip. - helpers: `_send(timeout, raise_on_error)` + `dispatch_input_sequence()` (timeout sized to sum(delay) + per-event slack). - agent_helpers: human_move/human_click/human_scroll now build one event batch and emit via `_emit()`; move_step_ms lowered to ~16ms (~60Hz). A mid-batch failure resumes the remainder client-side (resume-from-count, never re-sends the dispatched prefix — no double-fire); pre-batch daemons fall back to the client path automatically. CDP-presence — mitigated: - daemon omits Runtime.enable by default (`_enabled_domains()`), removing the console-serialization detection class. Nothing consumes Runtime events and Runtime.evaluate works without enable; BH_CDP_ENABLE_RUNTIME=1 restores it. 
Documented as NOT fixable in software (need a patched Chromium): - getCoalescedEvents() stays empty — CDP injects via ForwardMouseEvent, bypassing the compositor coalescing queue (also why we target ~60Hz, not higher: extra uncoalesced events look more anomalous). - screenX==clientX — CDP sets no window/desktop offset (Cloudflare Turnstile checks this); not settable via CDP. Corrected: pressure 0/0.5, tilt 0, pointerType "mouse" are spec-correct for a real mouse — NOT a bot tell (earlier over-statement removed). Tests: tests/unit/test_daemon_input_sequence.py (3, hermetic via cdp_use stub) + test_human_behavior.py grows to 22 (batch dispatch, ~60Hz rate, single-batch click invariant, fallback, resume-from-count). All 25 pass; py_compile clean. Reviewed in a separate lane across two passes (APPROVE; the partial-failure double-dispatch found in pass 1 is fixed and regression-tested). Co-Authored-By: Claude Opus 4.8 (1M context) --- agent-workspace/HUMAN_SIM_VALIDATION.md | 43 ++-- agent-workspace/agent_helpers.py | 243 ++++++++++++++++------- src/browser_harness/daemon.py | 43 +++- src/browser_harness/helpers.py | 26 ++- tests/unit/test_daemon_input_sequence.py | 115 +++++++++++ tests/unit/test_human_behavior.py | 97 +++++++++ 6 files changed, 475 insertions(+), 92 deletions(-) create mode 100644 tests/unit/test_daemon_input_sequence.py diff --git a/agent-workspace/HUMAN_SIM_VALIDATION.md b/agent-workspace/HUMAN_SIM_VALIDATION.md index b5ebf536..8a7259e8 100644 --- a/agent-workspace/HUMAN_SIM_VALIDATION.md +++ b/agent-workspace/HUMAN_SIM_VALIDATION.md @@ -14,7 +14,8 @@ is genuinely the user's own. 
This layer addresses the residual **behavioral** su ## How to run the tests ```bash -python3 tests/unit/test_human_behavior.py # 17/17, hermetic (no browser/daemon) +python3 tests/unit/test_human_behavior.py # 22/22 — behavior + dispatch invariants +python3 tests/unit/test_daemon_input_sequence.py # 3/3 — daemon batch handler + Runtime omit ``` The suite injects a fake `browser_harness.helpers` (capturing every CDP call) so the module's @@ -50,17 +51,35 @@ to land at ~0.795px (inside the human band), with the realized per-step angle fa Rationale: micro-jitter RMS is a directly-measured detector signal; the 8.2° figure is an asserted aggregate. Both metrics now sit in a plausible region rather than one being wildly off. -## Known ceilings — NOT fixable in this layer (documented honestly in the module docstring) - -1. **Event rate 20–40Hz** — each CDP call is a per-call IPC round-trip; real pointing devices emit - 60–1000Hz. A rate-binning detector flags this regardless of trajectory shape. Fixing it requires - daemon-side vector batching (core change, out of scope for `agent-workspace/`). -2. **`getCoalescedEvents().length == 1`** and no PointerEvent pressure/tilt — `Input.dispatchMouseEvent` - emits MouseEvents only; pointer-fidelity detectors see synthetic input. -3. **CDP / remote-debugging presence** is detectable independent of behavior (anti-debugger probes). - -Net: this layer lowers per-action risk against heuristic/weak-ML detectors and buys time; it does -**not** make a session indistinguishable to a top-tier 200–2000-signal ensemble. +## Ceilings — what the 2026-05-29 daemon/core update fixed, and what it cannot + +Researched against Chromium source + fingerprinting literature. + +1. 
**Event RATE — FIXED.** High-frequency mouse/wheel dispatch now runs server-side via the + daemon's persistent CDP WS (new `meta:"input_sequence"` handler in `daemon.py` + + `helpers.dispatch_input_sequence`), so top-level events reach the page at ~60Hz instead of + the ~28–30Hz the per-call IPC client path tops out at. A mid-batch send failure resumes the + remainder client-side (resume-from-count; never re-sends the dispatched prefix); a pre-batch + daemon falls back to the client path automatically (restart the daemon for the fast path). +2. **Coalesced events — NOT fixable in software.** CDP `Input.dispatchMouseEvent` injects via + `RenderWidgetHostImpl::ForwardMouseEvent`, bypassing the compositor coalescing queue, so + `PointerEvent.getCoalescedEvents()` stays empty at *any* injection rate. (This is precisely + why we target ~60Hz, not higher — extra uncoalesced events look more anomalous, not less.) + Closing it requires a patched Chromium binary. +3. **screenX/screenY — residual tell.** CDP sets `screenX==clientX` (no window/desktop offset), + which a real windowed browser never produces; Cloudflare Turnstile checks this. Not settable + via CDP and not safely patchable from page JS. Unfixed. +4. **pressure / tilt / pointerType — NOT a tell.** pressure 0 (no button) / 0.5 (button), tilt 0, + pointerType "mouse" are exactly the W3C defaults a real mouse reports. (Corrects an earlier + over-statement that the absence of a pressure/tilt stream was synthetic.) +5. **CDP-presence — mitigated.** The daemon omits `Runtime.enable` by default + (`BH_CDP_ENABLE_RUNTIME=1` restores it), removing the console-serialization detection class; + `Runtime.evaluate` works without it and nothing in browser-harness consumes Runtime events. + An attached remote-debugging client remains fundamentally detectable by other means. + +Net: defeats heuristic/weak-ML detectors and the event-rate signal. 
The coalesced-events and +screenX tells mean a top-tier ensemble inspecting CDP input fidelity can still identify the +session; full parity needs a patched Chromium, out of scope for this pure-Python layer. ## Backward compatibility diff --git a/agent-workspace/agent_helpers.py b/agent-workspace/agent_helpers.py index 27d055ce..2951adf7 100644 --- a/agent-workspace/agent_helpers.py +++ b/agent-workspace/agent_helpers.py @@ -16,17 +16,31 @@ `browser-harness -c '...'` invocations via a per-BU_NAME state file, so the cursor does not teleport to a fresh random point on every call. -KNOWN CEILINGS — NOT fixable inside this layer; documented honestly: - * Event rate is floored ~20-40Hz by the per-call CDP/IPC round-trip and time.sleep - granularity. Real pointing devices report at 60-1000Hz. A detector that bins - inter-event intervals can flag the low rate regardless of trajectory shape. - * Input.dispatchMouseEvent emits MouseEvents only: getCoalescedEvents().length == 1 - and there is no PointerEvent pressure/tilt stream. Pointer-fidelity detectors see - synthetic input even with a perfect trajectory. - * A CDP/remote-debugging attachment is itself detectable (anti-debugger probes, - Runtime/Page domain side-effects) independent of behavior. -This layer targets heuristic and weak-ML detectors; against top-tier ensembles it -lowers per-action risk and buys time, it does not make a session indistinguishable. +KNOWN CEILINGS — researched (Chromium source + fingerprinting literature, 2026-05): + * Event RATE — FIXED. High-frequency mouse trajectories and wheel streams now + dispatch server-side via the daemon's persistent CDP WS (helpers + .dispatch_input_sequence), so top-level mouse/pointer events reach the page at + ~60Hz (a plausible delivered rate; higher would only add uncoalesced events that + look more anomalous, see below), not the ~30Hz the per-call IPC client path tops + out at. 
Falls back to client-side dispatch if the daemon predates the batch op + (restart the daemon for the fast path); a mid-batch failure resumes the remainder + rather than re-sending the dispatched prefix. + * COALESCED EVENTS — NOT fixable in software. CDP Input.dispatchMouseEvent injects + via RenderWidgetHostImpl::ForwardMouseEvent, bypassing the compositor coalescing + queue, so PointerEvent.getCoalescedEvents() stays empty regardless of injection + rate. (This is why we target ~60Hz, not higher: extra uncoalesced events would + only look more anomalous.) Closing it requires a patched Chromium binary. + * screenX/screenY — residual tell. CDP sets screenX==clientX (no window/desktop + offset), which a real windowed browser never produces; Cloudflare Turnstile + checks this. Not settable via CDP and not safely patchable from page JS. + * pressure/tilt/pointerType — NOT a tell: pressure 0 (no button) / 0.5 (button), + tilt 0, pointerType "mouse" are exactly the W3C defaults a real mouse reports. + * CDP-presence — Runtime.enable is omitted by default at the daemon (kills the + console-serialization detection class); but an attached remote-debugging client + is fundamentally detectable by other means. +Net: defeats heuristic/weak-ML detectors and the event-rate signal. The coalesced +and screenX tells mean a top-tier ensemble inspecting CDP input fidelity can still +identify the session; full parity needs a patched Chromium (out of scope here). 
""" import json, math, os, random, tempfile, time @@ -42,7 +56,7 @@ _PACING = { "fast": { - "move_step_ms": 25, + "move_step_ms": 14, "move_time_mult": 0.7, "hover_range": (0.02, 0.05), "dwell_mean": 45, @@ -52,7 +66,7 @@ "event_jitter_ms": 1.5, }, "paced": { - "move_step_ms": 35, + "move_step_ms": 16, "move_time_mult": 1.0, "hover_range": (0.08, 0.20), "dwell_mean": 85, @@ -62,7 +76,7 @@ "event_jitter_ms": 3.0, }, "physical": { - "move_step_ms": 50, + "move_step_ms": 20, "move_time_mult": 1.3, "hover_range": (0.10, 0.25), "dwell_mean": 95, @@ -374,13 +388,6 @@ def _vk_for_char(ch): # Timing / dispatch helpers # --------------------------------------------------------------------------- -def _jittered_sleep(base_ms, sigma_ms=None): - if sigma_ms is None: - sigma_ms = _s().profile["event_jitter_ms"] - delay = max(_IPC_FLOOR_MS, base_ms + random.gauss(0, sigma_ms)) / 1000.0 - time.sleep(delay) - - def _dispatch_char(ch, hold_s): """One keystroke: keyDown [+ char] -> hold -> keyUp, with correct keycodes.""" vk, code, t = _vk_for_char(ch) @@ -393,6 +400,95 @@ def _dispatch_char(ch, hold_s): cdp("Input.dispatchKeyEvent", type="keyUp", **base) +# --------------------------------------------------------------------------- +# Server-side batched dispatch +# --------------------------------------------------------------------------- + +def _emit(events): + """Dispatch a precomputed input-event list in ONE IPC call (server-side, ~60Hz). + + Prefers helpers.dispatch_input_sequence so the daemon emits events over its + persistent CDP WS, decoupling the event rate from per-call IPC. Falls back to + client-side cdp() (respecting the IPC floor) if the daemon predates the batch + op. Each event is {"method","params","delay_ms"}; delay is applied BEFORE it. 
+ """ + if not events: + return + seq = None + try: + from browser_harness.helpers import dispatch_input_sequence as seq + except Exception: + seq = None + if seq is not None: + try: + r = seq(events) + except Exception: + r = None # transport/connect failure -> dispatch the whole thing client-side + if isinstance(r, dict): + if r.get("ok"): + return # fully dispatched server-side + if "count" in r: + # The daemon ran the batch but a send failed mid-sequence (e.g. a + # stale session after navigation). It already emitted r["count"] + # events — resume ONLY the remainder client-side (cdp() auto-reattaches + # on a stale session). Re-sending the dispatched prefix would + # double-fire events (a correctness bug AND a detection tell). + events = events[int(r["count"]):] + # else: error WITHOUT count == op unsupported (pre-batch daemon) -> + # nothing was dispatched, so fall through to full client-side dispatch. + for ev in events: + d = ev.get("delay_ms") or 0 + if d: + time.sleep(max(_IPC_FLOOR_MS, d) / 1000.0) + cdp(ev["method"], **(ev.get("params") or {})) + + +def _move_events(s, start, end, width=None): + """Build (without dispatching) the mouseMoved event list for a ballistic move. + + Returns (events, end_xy). Per-event delays are ~move_step_ms with jitter; the + final event lands exactly on `end` (preserving the click teleport invariant). 
+ """ + p = s.profile + sx, sy = start + ex, ey = end + dist = math.hypot(ex - sx, ey - sy) + if dist < 2: + return [], (float(ex), float(ey)) + + step_ms = p["move_step_ms"] + dt = step_ms / 1000.0 + duration_ms = _fitts_ms(dist, width) * p["move_time_mult"] + n = max(8, min(120, int(duration_ms / step_ms))) + + segments = [] + if dist > 400 and random.random() < 0.15: + over = min(dist * abs(random.gauss(0.06, 0.02)), 60.0) + ux = sx + (ex - sx) / dist * (dist + over) + uy = sy + (ey - sy) / dist * (dist + over) + ux, uy = _clamp(ux, uy, s) + n1 = max(6, int(n * 0.8)) + n2 = max(4, n - n1) + segments.append(_bezier_trajectory((sx, sy), (ux, uy), n1, dt, s.tremor_angle)) + segments.append(_bezier_trajectory((ux, uy), (ex, ey), n2, dt, s.tremor_angle)) + else: + segments.append(_bezier_trajectory((sx, sy), (ex, ey), n, dt, s.tremor_angle)) + + jit = p["event_jitter_ms"] + events = [] + first = True + for seg in segments: + for px, py in seg: + delay = 0.0 if first else max(4.0, step_ms + random.gauss(0, jit)) + events.append({ + "method": "Input.dispatchMouseEvent", + "params": {"type": "mouseMoved", "x": int(round(px)), "y": int(round(py))}, + "delay_ms": round(delay, 2), + }) + first = False + return events, (float(ex), float(ey)) + + # --------------------------------------------------------------------------- # Public API # --------------------------------------------------------------------------- @@ -451,40 +547,12 @@ def human_move(x, y, width=None): cursor where the pointer actually stopped. 
""" s = _s() - p = s.profile cur = _ensure_cursor(s) - x, y = _clamp(x, y, s) - dist = math.hypot(x - cur[0], y - cur[1]) - if dist < 2: - s.cursor = [float(x), float(y)] - return - - dt = p["move_step_ms"] / 1000.0 - duration_ms = _fitts_ms(dist, width) * p["move_time_mult"] - n = max(8, min(120, int(duration_ms / p["move_step_ms"]))) - - segments = [] - if dist > 400 and random.random() < 0.15: - over = min(dist * abs(random.gauss(0.06, 0.02)), 60.0) - ux = cur[0] + (x - cur[0]) / dist * (dist + over) - uy = cur[1] + (y - cur[1]) / dist * (dist + over) - ux, uy = _clamp(ux, uy, s) - n1 = max(6, int(n * 0.8)) - n2 = max(4, n - n1) - segments.append(_bezier_trajectory((cur[0], cur[1]), (ux, uy), n1, dt, s.tremor_angle)) - segments.append(_bezier_trajectory((ux, uy), (x, y), n2, dt, s.tremor_angle)) - else: - segments.append(_bezier_trajectory((cur[0], cur[1]), (x, y), n, dt, s.tremor_angle)) - - for seg in segments: - for px, py in seg: - ix, iy = int(round(px)), int(round(py)) - cdp("Input.dispatchMouseEvent", type="mouseMoved", x=ix, y=iy) - s.cursor = [float(ix), float(iy)] - _jittered_sleep(p["move_step_ms"]) - - s.cursor = [float(x), float(y)] + events, end = _move_events(s, (cur[0], cur[1]), (x, y), width) + if events: + _emit(events) + s.cursor = [end[0], end[1]] s._save() @@ -499,32 +567,48 @@ def human_click(x, y, button="left", width=None): """ s = _s() p = s.profile + cur = _ensure_cursor(s) cx, cy = _target_offset(x, y, s) cx, cy = _clamp(cx, cy, s) - human_move(cx, cy, width=width) - - hover_lo, hover_hi = p["hover_range"] - time.sleep(random.uniform(hover_lo, hover_hi)) - + events, _ = _move_events(s, (cur[0], cur[1]), (cx, cy), width) ix, iy = int(round(cx)), int(round(cy)) - cdp("Input.dispatchMouseEvent", - type="mousePressed", x=ix, y=iy, button=button, clickCount=1) - dwell = _lognormal(p["dwell_mean"], p["dwell_mean"] * 0.28) / 1000.0 - time.sleep(max(0.02, dwell * 0.5)) + hover_ms = random.uniform(*p["hover_range"]) * 1000.0 + dwell_ms = 
_lognormal(p["dwell_mean"], p["dwell_mean"] * 0.28) + + # mousePressed at EXACTLY the final move coordinate (no teleport-on-click). + events.append({ + "method": "Input.dispatchMouseEvent", + "params": {"type": "mousePressed", "x": ix, "y": iy, "button": button, "clickCount": 1}, + "delay_ms": round(hover_ms, 2), + }) + # <=1px release micro-drift during the dwell, clamped in-viewport. ddx = max(-1, min(1, int(round(random.gauss(0, 0.6))))) ddy = max(-1, min(1, int(round(random.gauss(0, 0.6))))) rx, ry = _clamp(ix + ddx, iy + ddy, s) rx, ry = int(round(rx)), int(round(ry)) if (rx, ry) != (ix, iy): - cdp("Input.dispatchMouseEvent", type="mouseMoved", x=rx, y=ry) - time.sleep(max(0.02, dwell * 0.5)) + events.append({ + "method": "Input.dispatchMouseEvent", + "params": {"type": "mouseMoved", "x": rx, "y": ry}, + "delay_ms": round(dwell_ms * 0.5, 2), + }) + events.append({ + "method": "Input.dispatchMouseEvent", + "params": {"type": "mouseReleased", "x": rx, "y": ry, "button": button, "clickCount": 1}, + "delay_ms": round(dwell_ms * 0.5, 2), + }) + else: + events.append({ + "method": "Input.dispatchMouseEvent", + "params": {"type": "mouseReleased", "x": ix, "y": iy, "button": button, "clickCount": 1}, + "delay_ms": round(dwell_ms, 2), + }) - cdp("Input.dispatchMouseEvent", - type="mouseReleased", x=rx, y=ry, button=button, clickCount=1) + _emit(events) s.cursor = [float(rx), float(ry)] s._save() @@ -622,7 +706,8 @@ def human_scroll(x, y, distance=3000, direction="down", device="trackpad"): speed = s.profile["scroll_speed"] x, y = _clamp(x, y, s) - human_move(x, y) + cur = _ensure_cursor(s) + events, _ = _move_events(s, (cur[0], cur[1]), (x, y), None) # anchor the pointer first ix, iy = int(round(x)), int(round(y)) if device == "wheel": @@ -630,10 +715,11 @@ def human_scroll(x, y, distance=3000, direction="down", device="trackpad"): interval_mean = 101 reading_prob = 0.12 else: - interval_mean = 40 + interval_mean = 16 # ~60Hz trackpad momentum (server-side dispatch 
enables it) reading_prob = 0.04 scrolled = 0 + first_wheel = True while scrolled < distance: if device == "wheel": # Always a whole-notch multiple — never clamped to the remainder, so @@ -648,16 +734,21 @@ def human_scroll(x, y, distance=3000, direction="down", device="trackpad"): if d == 0: break - cdp("Input.dispatchMouseEvent", type="mouseWheel", - x=ix, y=iy, deltaX=0, deltaY=d) - scrolled += abs(d) - - if random.random() < reading_prob: - time.sleep(random.uniform(0.8, 3.0)) + if first_wheel: + dly = 0.0 + elif random.random() < reading_prob: + dly = random.uniform(0.8, 3.0) * 1000.0 else: - interval = _lognormal(interval_mean, interval_mean * 0.3) / speed / 1000.0 - time.sleep(max(0.025, interval)) + dly = max(8.0, _lognormal(interval_mean, interval_mean * 0.3) / speed) + events.append({ + "method": "Input.dispatchMouseEvent", + "params": {"type": "mouseWheel", "x": ix, "y": iy, "deltaX": 0, "deltaY": d}, + "delay_ms": round(dly, 2), + }) + scrolled += abs(d) + first_wheel = False + _emit(events) s.cursor = [float(ix), float(iy)] s._save() diff --git a/src/browser_harness/daemon.py b/src/browser_harness/daemon.py index 077183e7..55dbeba5 100644 --- a/src/browser_harness/daemon.py +++ b/src/browser_harness/daemon.py @@ -179,6 +179,24 @@ def is_real_page(t): return t["type"] == "page" and not t.get("url", "").startswith(INTERNAL) +def _enabled_domains(): + """CDP domains to enable on each session. + + Runtime is OMITTED by default: enabling it activates console-serialization + side effects that historically leaked CDP presence (the console.log-getter / + error-getter detection). The classic getter variant was patched in V8 ~M127, + but the Proxy-trap variant and pre-M127 Chrome remain, and nothing in + browser-harness consumes Runtime events — Runtime.evaluate (used by js()/the + title marker) works WITHOUT Runtime.enable. Page/DOM/Network are required + (notably for wait_for_network_idle). 
Set BH_CDP_ENABLE_RUNTIME=1 to restore + Runtime.enable if a downstream use ever needs Runtime events. + """ + domains = ["Page", "DOM", "Network"] + if os.environ.get("BH_CDP_ENABLE_RUNTIME") == "1": + domains.append("Runtime") + return domains + + class Daemon: def __init__(self): self.cdp = None @@ -227,7 +245,7 @@ async def enable_one(d): ) except Exception as e: log(f"enable {d} on {session_id}: {e}") - await asyncio.gather(*(enable_one(d) for d in ("Page", "DOM", "Runtime", "Network"))) + await asyncio.gather(*(enable_one(d) for d in _enabled_domains())) async def start(self): self.stop = asyncio.Event() @@ -274,6 +292,29 @@ async def handle(self, req): if meta == "drain_events": out = list(self.events); self.events.clear() return {"events": out} + if meta == "input_sequence": + # Dispatch a precomputed list of input events server-side over the + # persistent CDP WS, sleeping delay_ms BEFORE each event. This decouples + # the event rate from the per-call client<->daemon IPC round-trip, so a + # mouse trajectory reaches Chrome at a realistic ~60Hz instead of the + # ~30Hz the one-socket-per-call client path tops out at. Events: + # [{"method": "Input.dispatchMouseEvent", "params": {...}, "delay_ms": 16}, ...] + events = req.get("events") or [] + sid = req.get("session_id") or self.session + n = 0 + for ev in events: + dly = ev.get("delay_ms") or 0 + if dly > 0: + await asyncio.sleep(dly / 1000.0) + m = ev.get("method") + if not m: + continue + try: + await self.cdp.send_raw(m, ev.get("params") or {}, session_id=sid) + except Exception as e: + return {"error": str(e), "count": n} + n += 1 + return {"ok": True, "count": n} if meta == "session": return {"session_id": self.session} if meta == "current_tab": # Resolve the attached page's target info server-side. 
Helpers can't diff --git a/src/browser_harness/helpers.py b/src/browser_harness/helpers.py index 7e4cf13c..a12c255c 100644 --- a/src/browser_harness/helpers.py +++ b/src/browser_harness/helpers.py @@ -39,13 +39,13 @@ def _load_env_file(p): INTERNAL = ("chrome://", "chrome-untrusted://", "devtools://", "chrome-extension://", "about:") -def _send(req): - c, token = ipc.connect(NAME, timeout=5.0) +def _send(req, timeout=5.0, raise_on_error=True): + c, token = ipc.connect(NAME, timeout=timeout) try: r = ipc.request(c, token, req) finally: c.close() - if "error" in r: raise RuntimeError(r["error"]) + if raise_on_error and "error" in r: raise RuntimeError(r["error"]) return r @@ -54,6 +54,26 @@ def cdp(method, session_id=None, **params): return _send({"method": method, "params": params, "session_id": session_id}).get("result", {}) +def dispatch_input_sequence(events, session_id=None): + """Dispatch a list of input events server-side, at a realistic rate, in ONE IPC call. + + events: [{"method": "Input.dispatchMouseEvent", "params": {...}, "delay_ms": 16}, ...] + The daemon sleeps delay_ms BEFORE each event and emits it over its persistent + CDP WebSocket, so the realized inter-event interval is bounded by the local WS + (sub-ms) rather than a fresh client socket per event (which tops out ~30Hz). + Use for high-frequency mouse trajectories / wheel streams; ordinary calls keep + using cdp(). The client read timeout is sized to the sequence's total delay plus + a per-event allowance for the daemon's CDP round-trips. Returns the daemon reply + dict WITHOUT raising: {"ok": True, "count": N} on success, or {"error", "count"} + if a send failed mid-sequence (count = events already dispatched) — the caller + uses count to resume the remainder instead of re-dispatching the whole batch. 
+ """ + total_s = sum((e.get("delay_ms") or 0) for e in events) / 1000.0 + timeout = max(5.0, total_s + 10.0 + 0.01 * len(events)) + return _send({"meta": "input_sequence", "events": events, "session_id": session_id}, + timeout=timeout, raise_on_error=False) + + def drain_events(): return _send({"meta": "drain_events"})["events"] diff --git a/tests/unit/test_daemon_input_sequence.py b/tests/unit/test_daemon_input_sequence.py new file mode 100644 index 00000000..ba31cbe8 --- /dev/null +++ b/tests/unit/test_daemon_input_sequence.py @@ -0,0 +1,115 @@ +"""Hermetic test for the daemon input_sequence handler + Runtime.enable omission. + +Stubs cdp_use.client so daemon.py imports without the real CDP dependency, then +drives Daemon.handle() with a recording fake CDP client. No browser, no daemon. + +Run: python3 tests/unit/test_daemon_input_sequence.py +""" +import asyncio +import os +import sys +import types + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "src"))) + +# Stub cdp_use.client so `from cdp_use.client import CDPClient` succeeds. 
+_cu = types.ModuleType("cdp_use") +_cuc = types.ModuleType("cdp_use.client") + + +class _StubCDPClient: + def __init__(self, url): + self.url = url + + +_cuc.CDPClient = _StubCDPClient +sys.modules["cdp_use"] = _cu +sys.modules["cdp_use.client"] = _cuc + +import browser_harness.daemon as dm # noqa: E402 + + +class _RecCDP: + def __init__(self): + self.calls = [] + + async def send_raw(self, method, params=None, session_id=None): + self.calls.append((method, params or {}, session_id)) + return {} + + +def test_input_sequence_dispatches_in_order_with_delays(): + sleeps = [] + + async def fake_sleep(d): + sleeps.append(d) + + orig = asyncio.sleep + asyncio.sleep = fake_sleep + try: + d = dm.Daemon() + d.cdp = _RecCDP() + d.session = "S1" + events = [ + {"method": "Input.dispatchMouseEvent", "params": {"type": "mouseMoved", "x": 1, "y": 2}, "delay_ms": 0}, + {"method": "Input.dispatchMouseEvent", "params": {"type": "mouseMoved", "x": 3, "y": 4}, "delay_ms": 16}, + {"method": "Input.dispatchMouseEvent", "params": {"type": "mousePressed", "x": 3, "y": 4, "button": "left", "clickCount": 1}, "delay_ms": 50}, + ] + res = asyncio.run(d.handle({"meta": "input_sequence", "events": events, "session_id": "S1"})) + assert res == {"ok": True, "count": 3}, res + assert [c[0] for c in d.cdp.calls] == ["Input.dispatchMouseEvent"] * 3 + assert d.cdp.calls[0][1] == {"type": "mouseMoved", "x": 1, "y": 2} + assert d.cdp.calls[2][1]["type"] == "mousePressed" + assert all(c[2] == "S1" for c in d.cdp.calls) # all to the page session + assert sorted(round(s * 1000) for s in sleeps) == [16, 50] # delay_ms honored, 0 skipped + finally: + asyncio.sleep = orig + + +def test_input_sequence_aborts_on_send_error_with_count(): + async def fake_sleep(d): + pass + + orig = asyncio.sleep + asyncio.sleep = fake_sleep + try: + class _FailSecond: + def __init__(self): + self.n = 0 + + async def send_raw(self, method, params=None, session_id=None): + self.n += 1 + if self.n == 2: + raise 
RuntimeError("Session with given id not found") + return {} + + d = dm.Daemon() + d.cdp = _FailSecond() + d.session = "S" + events = [{"method": "Input.dispatchMouseEvent", "params": {}, "delay_ms": 0}] * 3 + res = asyncio.run(d.handle({"meta": "input_sequence", "events": events})) + assert res.get("error") and res.get("count") == 1, res # aborts, reports progress + finally: + asyncio.sleep = orig + + +def test_enabled_domains_omits_runtime_by_default(): + os.environ.pop("BH_CDP_ENABLE_RUNTIME", None) + assert dm._enabled_domains() == ["Page", "DOM", "Network"] # Runtime omitted + os.environ["BH_CDP_ENABLE_RUNTIME"] = "1" + try: + assert "Runtime" in dm._enabled_domains() # restorable via env + finally: + os.environ.pop("BH_CDP_ENABLE_RUNTIME", None) + + +def _run_all(): + fns = [g for n, g in sorted(globals().items()) if n.startswith("test_") and callable(g)] + for fn in fns: + fn() + print("PASS", fn.__name__) + print("\n%d/%d passed" % (len(fns), len(fns))) + + +if __name__ == "__main__": + _run_all() diff --git a/tests/unit/test_human_behavior.py b/tests/unit/test_human_behavior.py index 07750f84..cb24d970 100644 --- a/tests/unit/test_human_behavior.py +++ b/tests/unit/test_human_behavior.py @@ -22,6 +22,7 @@ EVENTS = [] # captured CDP calls: list of (method, kwargs) SLEEPS = [] # captured sleep durations +BATCHES = [] # captured dispatch_input_sequence event lists def _fake_cdp(method, **kw): @@ -31,6 +32,15 @@ def _fake_cdp(method, **kw): return {} +def _fake_dispatch_seq(events, session_id=None): + # Record the batch AND expand it into EVENTS, mirroring what the daemon does + # server-side, so the existing per-event assertions keep working. 
+ BATCHES.append(events) + for ev in events: + EVENTS.append((ev["method"], ev.get("params") or {})) + return {"ok": True, "count": len(events)} + + _FAKE_KEYS = { "Enter": (13, "Enter", "\r"), "Tab": (9, "Tab", "\t"), @@ -43,6 +53,7 @@ def _install_fake_helpers(): pkg = types.ModuleType("browser_harness") helpers = types.ModuleType("browser_harness.helpers") helpers.cdp = _fake_cdp + helpers.dispatch_input_sequence = _fake_dispatch_seq helpers._KEYS = _FAKE_KEYS helpers.new_tab = lambda url: EVENTS.append(("new_tab", {"url": url})) helpers.goto_url = lambda url: EVENTS.append(("goto_url", {"url": url})) @@ -74,6 +85,7 @@ def _load_module(): def _reset(): EVENTS.clear() SLEEPS.clear() + BATCHES.clear() ah.human_session("paced", fresh=True) @@ -295,6 +307,91 @@ def test_no_public_config_leak(): "human_click", "human_type", "human_scroll", "human_navigate"} +def test_move_dispatched_as_single_batch(): + _reset() + ah.human_move(900, 600) + assert len(BATCHES) == 1, "human_move should dispatch ONE server-side batch" + evs = BATCHES[0] + assert len(evs) >= 8 + for e in evs: + assert e["method"] == "Input.dispatchMouseEvent" + assert e["params"]["type"] == "mouseMoved" + assert isinstance(e["params"]["x"], int) and isinstance(e["params"]["y"], int) + assert isinstance(e["delay_ms"], (int, float)) and e["delay_ms"] >= 0 + assert evs[0]["delay_ms"] == 0.0 # first event fires immediately + assert evs[-1]["params"]["x"] == 900 and evs[-1]["params"]["y"] == 600 # exact endpoint + + +def test_move_event_rate_near_60hz(): + _reset() + ah.human_move(1000, 700) + nz = [e["delay_ms"] for e in BATCHES[0] if e["delay_ms"] > 0] + avg = sum(nz) / len(nz) + # paced move_step_ms=16 -> ~62Hz. Assert mean inter-event delay is 10-25ms: + # NOT the old ~35ms (28Hz), and not absurdly fast (which would look uncoalesced). 
+ assert 10 <= avg <= 25, avg + + +def test_click_single_batch_press_release_invariant(): + _reset() + ah.human_move(100, 100) + BATCHES.clear(); EVENTS.clear() + ah.human_click(700, 400) + assert len(BATCHES) == 1, "the whole click is one batch" + evs = BATCHES[0] + types_ = [e["params"]["type"] for e in evs] + assert types_.count("mousePressed") == 1 and types_.count("mouseReleased") == 1 + assert types_[-1] == "mouseReleased" + pi = types_.index("mousePressed") + assert types_[pi - 1] == "mouseMoved" # press follows a move + assert evs[pi - 1]["params"]["x"] == evs[pi]["params"]["x"] # at identical coords + assert evs[pi - 1]["params"]["y"] == evs[pi]["params"]["y"] + + +def test_emit_falls_back_when_daemon_lacks_op(): + _reset() + helpers = sys.modules["browser_harness.helpers"] + orig = helpers.dispatch_input_sequence + + def _raise(events, session_id=None): + raise RuntimeError("'method'") # old daemon: unknown meta -> error + + helpers.dispatch_input_sequence = _raise + try: + EVENTS.clear(); BATCHES.clear() + ah.human_move(800, 500) + assert BATCHES == [], "raising batch op must not record a batch" + moved = [k for m, k in _mouse_events() if k.get("type") == "mouseMoved"] + assert len(moved) >= 8, "fallback must still dispatch via cdp()" + assert moved[-1]["x"] == 800 and moved[-1]["y"] == 500 + finally: + helpers.dispatch_input_sequence = orig + + +def test_emit_resumes_from_count_no_double_dispatch(): + # Daemon ran K events then failed (e.g. stale session). _emit must re-dispatch + # ONLY events[K:] client-side, never the already-sent prefix (no double-fire). 
+ _reset() + helpers = sys.modules["browser_harness.helpers"] + orig = helpers.dispatch_input_sequence + K = 3 + + def _partial(events, session_id=None): + BATCHES.append(events) # the attempted batch (full list) + return {"error": "Session with given id not found", "count": K} + + helpers.dispatch_input_sequence = _partial + try: + EVENTS.clear(); BATCHES.clear() + ah.human_move(900, 600) + n = len(BATCHES[0]) + moved = [k for m, k in _mouse_events() if k.get("type") == "mouseMoved"] + assert len(moved) == n - K, (len(moved), n) # remainder only, prefix NOT resent + assert moved[-1]["x"] == 900 and moved[-1]["y"] == 600 + finally: + helpers.dispatch_input_sequence = orig + + def _run_all(): fns = [g for n, g in sorted(globals().items()) if n.startswith("test_") and callable(g)] passed = 0 From 56455a48b4c987051403365c3b4c3263979b0292 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?thesqrldev=F0=9F=90=BF=EF=B8=8F?= Date: Fri, 29 May 2026 06:58:06 +0900 Subject: [PATCH 3/9] =?UTF-8?q?feat(human-sim):=20Phase=200=20=E2=80=94=20?= =?UTF-8?q?empirical=20self-test=20probe=20+=20ceiling=20decision=20record?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Turns the residual-CDP-tell question from speculation into measurement, and records the no-fork decision from the 6-lens investigation. - agent_helpers: human_selftest() instruments the live page (transparent full-viewport overlay, clicks swallowed) while driving real human_* input, then reports for THIS Chrome: T2 screenX-vs-clientX delta, T1 getCoalescedEvents length, delivered pointer rate (~60 fast / ~30 fallback), isTrusted. chrome_version() reads the major via UA (Runtime.enable-free). Both exported; _eval stays private. - CEILING_DECISIONS.md: research conclusions + decision + phased plan. Key findings: T2 (screenX==clientX) is already fixed upstream in Chrome >=142 (crbug 40280325) — verify, don't assume. 
T1 (getCoalescedEvents empty) has zero confirmed production deployments — theoretical, not shipped. Frida (macOS hardened-runtime / keychain break) and a Chromium fork (profile conflict + 4-week rebase) are both REJECTED for a personal tool. T1's only real fix is a sparingly-used OS-injection (CGEvent) mode, left on the shelf until a confirmed target is shown to check it. Tests: +2 selftest verdict-logic tests (exposed vs fixed-Chrome canned data); test_human_behavior.py now 24, daemon 3. py_compile clean; load-path verified (helpers auto-loads, human_selftest/chrome_version exported). NOTE: selftest verdict logic is unit-tested; its LIVE behavior on real Chrome (whether CDP fires pointermove on the overlay, getCoalescedEvents values) is exactly what the tool is meant to reveal on first run — not yet verified here. Co-Authored-By: Claude Opus 4.8 (1M context) --- agent-workspace/CEILING_DECISIONS.md | 69 +++++++++++++ agent-workspace/HUMAN_SIM_VALIDATION.md | 9 +- agent-workspace/agent_helpers.py | 131 +++++++++++++++++++++++- tests/unit/test_human_behavior.py | 55 ++++++++++ 4 files changed, 262 insertions(+), 2 deletions(-) create mode 100644 agent-workspace/CEILING_DECISIONS.md diff --git a/agent-workspace/CEILING_DECISIONS.md b/agent-workspace/CEILING_DECISIONS.md new file mode 100644 index 00000000..d87c02a9 --- /dev/null +++ b/agent-workspace/CEILING_DECISIONS.md @@ -0,0 +1,69 @@ +# Residual CDP-input tells — research, decision, and plan + +**Date:** 2026-05-29 · **Status:** decided (no fork) · scope: the human-behavior-sim layer. + +Two behavioral tells were flagged as "not fixable from JS/CDP": +- **T1** — `PointerEvent.getCoalescedEvents()` is empty for CDP-injected mouse moves. +- **T2** — `screenX==clientX` (no window/desktop offset) on CDP-injected mouse events. + +A 6-lens investigation (Chromium source + fingerprinting literature + prior-art repos) **reversed +the problem**. 
Conclusions, with the evidence that drove them: + +## Findings + +**T2 is already fixed upstream.** crbug 40280325 (`Input.dispatchMouseEvent` set screen==viewport) +was fixed via `ConvertWidgetPointToScreenPoint` in `content/browser/devtools/protocol/input_handler.cc`, +shipped in **Chrome 142 (Oct 2025)**. The `cdp-patches` library was archived the same month ("no +reason to use this anymore"). On any current Chrome, T2 needs no mitigation. The cross-origin-iframe +variant may not be fully covered — measure if a specific target matters. + +**T1 is theoretical, not deployed.** No confirmed production use of `getCoalescedEvents()` length +checks was found across Cloudflare, DataDome, Akamai, PerimeterX/HUMAN, or Kasada in reverse-engineered +live anti-bot JS — it is a researcher-documented signal, not a shipped one. Fixing it requires a +Chromium binary patch (`third_party/blink/.../pointer_event_manager.cc`); CDP `Input.dispatchMouseEvent` +injects via `RenderWidgetHostImpl::ForwardMouseEvent`, bypassing the compositor coalescing queue. 
+ +## Approaches considered + +| Approach | T1 | T2 | real profile | macOS | cost | verdict | +|---|---|---|---|---|---|---| +| **upstream Chrome ≥142** | ✗ | ✅ | kept | — | 0 | **closes T2 for free** | +| CDP `Input.synthesize*` | ✗ | n/a | kept | ok | low | produces touch/wheel, not pointermove — useless for moves | +| **OS injection (CGEvent)** | ✅ | ✅ | kept | hard | med | real events; needs foreground + moves cursor (sparingly) | +| Frida runtime hook | — | — | **broken** | **no** | huge | **rejected**: `__RESTRICT` blocks DYLD_INSERT; re-sign breaks keychain; SIP off unacceptable; arm64e PAC | +| Chromium fork | partial | ✅ | **broken** | hard | huge | **rejected**: profile conflict + ~1-3 dev-days per 4-week Chrome release | +| cdp-patches (OS, both) | ✅ | ✅ | kept | **n/a** | med | no macOS Quartz backend; archived | + +## Decision + +- **Do not fork Chromium and do not Frida-hook.** Cost ≫ benefit for a personal ethical-use tool; + both destroy the real-profile value prop on macOS. +- **T2:** rely on upstream (Chrome ≥142). Verify, don't assume. +- **T1:** accept as a documented residual. It is not deployed in production. Closing it would cost a + fork or a foreground-stealing OS-injection path — not justified until a real target is shown to check it. +- Higher-ROI work than T1/T2: behavioral timing entropy, event ordering, `pointerrawupdate`. (Runtime.enable + is already dropped at the daemon.) + +## Plan + +**Phase 0 — measure, not guess (DONE, 2026-05-29).** +`human_selftest()` + `chrome_version()` in `agent_helpers.py` instrument the live page while driving +real `human_*` input and report, for *your* Chrome: T2 (screenX vs clientX delta), T1 +(getCoalescedEvents length), delivered pointer-event rate (~60 fast path / ~30 fallback), isTrusted. 
+Run on a normal http(s) page: +```bash +browser-harness -c 'new_tab("https://example.com"); wait_for_load(); import json; print(json.dumps(human_selftest(), indent=2))' +``` + +**Phase 1 — T2 remediation (conditional).** Only if the selftest shows T2 exposed (Chrome <142): +update Chrome (the free, undetectable fix). A JS getter override is detectable (toString/worker) — avoid. + +**Phase 2 — T1 OS-injection mode (on the shelf; build only if a confirmed target checks coalesced).** +Scoped `human_click_os(x, y)` using pyobjc Quartz `CGEventPost` so the click is a real OS event +(real coalescing + screenX + isTrusted). Design: resolve the Chrome window's screen rect +(`CGWindowListCopyWindowInfo`), map viewport→screen, foreground-activate, post down/up, optionally +restore. Constraints: TCC Accessibility grant; foreground + physical-cursor move (so reserve it for the +rare detection-sensitive click; keep CDP for navigation/reading/bulk). The private SkyLight +`SLEventPostToPid` (cursor-stationary, background) is fragile/unbound — not pursued. + +**Phase 3 — Chromium fork. Rejected** (see table). Recorded only so the decision isn't relitigated. diff --git a/agent-workspace/HUMAN_SIM_VALIDATION.md b/agent-workspace/HUMAN_SIM_VALIDATION.md index 8a7259e8..25bb5fdc 100644 --- a/agent-workspace/HUMAN_SIM_VALIDATION.md +++ b/agent-workspace/HUMAN_SIM_VALIDATION.md @@ -14,10 +14,17 @@ is genuinely the user's own. 
This layer addresses the residual **behavioral** su ## How to run the tests ```bash -python3 tests/unit/test_human_behavior.py # 22/22 — behavior + dispatch invariants +python3 tests/unit/test_human_behavior.py # 24/24 — behavior + dispatch + selftest logic python3 tests/unit/test_daemon_input_sequence.py # 3/3 — daemon batch handler + Runtime omit ``` +To measure what YOUR Chrome actually exposes (T1 coalesced / T2 screenX / delivered rate / +isTrusted), run `human_selftest()` on a normal http(s) page — see `CEILING_DECISIONS.md`: + +```bash +browser-harness -c 'new_tab("https://example.com"); wait_for_load(); import json; print(json.dumps(human_selftest(), indent=2))' +``` + The suite injects a fake `browser_harness.helpers` (capturing every CDP call) so the module's load contract and dispatch invariants are exercised without a live browser. diff --git a/agent-workspace/agent_helpers.py b/agent-workspace/agent_helpers.py index 2951adf7..a27b3e58 100644 --- a/agent-workspace/agent_helpers.py +++ b/agent-workspace/agent_helpers.py @@ -43,7 +43,7 @@ identify the session; full parity needs a patched Chromium (out of scope here). """ -import json, math, os, random, tempfile, time +import json, math, os, random, re, tempfile, time from browser_harness.helpers import cdp, _KEYS as _CORE_KEYS @@ -776,3 +776,132 @@ def human_navigate(url): except Exception: pass human_wait(random.uniform(2.0, 5.0)) + + +# --------------------------------------------------------------------------- +# Diagnostics — empirically measure what THIS Chrome + layer actually expose +# (turns the residual-tell discussion from speculation into measurement) +# --------------------------------------------------------------------------- + +def _eval(expr, await_promise=False): + """Runtime.evaluate in the page main world. 
Works WITHOUT Runtime.enable.""" + r = cdp("Runtime.evaluate", expression=expr, returnByValue=True, awaitPromise=await_promise) + if r.get("exceptionDetails"): + raise RuntimeError("selftest JS failed: %s" % r["exceptionDetails"]) + return r.get("result", {}).get("value") + + +def chrome_version(): + """(major:int|None, user_agent:str). + + T2 (screenX==clientX) was a CDP bug fixed upstream in Chrome >= 142 + (crbug 40280325, ConvertWidgetPointToScreenPoint), so on a current Chrome it + needs no mitigation. Read via the UA string (no Runtime.enable required). + """ + ua = _eval("navigator.userAgent") or "" + m = re.search(r"Chrome/(\d+)", ua) + return (int(m.group(1)) if m else None, ua) + + +_PROBE_JS = r""" +(() => { + const P = (window.__bh_probe = {moves: [], clicks: []}); + let ov = document.getElementById('__bh_probe_overlay'); + if (ov) ov.remove(); + ov = document.createElement('div'); + ov.id = '__bh_probe_overlay'; + ov.style.cssText = 'position:fixed;left:0;top:0;width:100vw;height:100vh;' + + 'z-index:2147483647;background:transparent;pointer-events:auto;cursor:default;'; + (document.body || document.documentElement).appendChild(ov); + const rec = (arr, e) => { + let coalesced = -1; + try { coalesced = (typeof e.getCoalescedEvents === 'function') ? 
e.getCoalescedEvents().length : -1; } catch (_) {} + arr.push({t: e.timeStamp, sx: e.screenX, cx: e.clientX, sy: e.screenY, cy: e.clientY, + trusted: e.isTrusted, coalesced: coalesced}); + }; + ov.addEventListener('pointermove', e => rec(P.moves, e), {passive: true}); + ov.addEventListener('pointerdown', e => rec(P.clicks, e), {passive: true}); + ov.addEventListener('click', e => { e.preventDefault(); e.stopPropagation(); }, true); + return true; +})() +""" + +_READ_JS = "JSON.stringify(window.__bh_probe || null)" +_CLEAN_JS = ("(()=>{const o=document.getElementById('__bh_probe_overlay');if(o)o.remove();" + "try{delete window.__bh_probe;}catch(e){}return true;})()") + + +def human_selftest(verbose=True): + """Measure what the connected Chrome + this layer actually expose to a page: + T1 (getCoalescedEvents length), T2 (screenX vs clientX), delivered pointer-event + rate, and isTrusted — by instrumenting the live page while driving real human_* + input. Converts the residual-tell question into a measurement on YOUR Chrome. + + Run on an ordinary http(s) page (NOT chrome://) with the tab focused. Installs a + transparent full-viewport overlay (clicks are swallowed, no navigation), drives a + move+click, reads back per-event metrics, then removes the overlay. Returns a dict; + prints a verdict when verbose. 
+ """ + s = _s() + try: + major, ua = chrome_version() + except Exception: + major, ua = None, "" + w, h = _viewport(s) + + _eval(_PROBE_JS) + raw = None + try: + human_move(int(w * 0.30), int(h * 0.40)) + human_move(int(w * 0.70), int(h * 0.60)) + human_click(int(w * 0.50), int(h * 0.50)) + time.sleep(0.15) # let the renderer flush the input events before reading + raw = _eval(_READ_JS) + finally: + try: + _eval(_CLEAN_JS) + except Exception: + pass + + data = json.loads(raw) if raw else {"moves": [], "clicks": []} + moves = data.get("moves", []) + clicks = data.get("clicks", []) + allev = moves + clicks + + deltas = [abs(e["sx"] - e["cx"]) + abs(e["sy"] - e["cy"]) for e in allev] + max_delta = max(deltas) if deltas else 0 + t2_exposed = bool(allev) and max_delta == 0 # screenX==clientX -> CDP screen-coord bug present + + cl = [e["coalesced"] for e in moves if e.get("coalesced", -1) >= 0] + t1_max = max(cl) if cl else 0 + t1_exposed = bool(cl) and t1_max <= 1 # getCoalescedEvents never >1 -> coalescing bypassed + + rate = None + if len(moves) >= 2: + span = (moves[-1]["t"] - moves[0]["t"]) / 1000.0 + rate = round((len(moves) - 1) / span, 1) if span > 0 else None + + trusted = all(e["trusted"] for e in allev) if allev else None + + res = { + "chrome_major": major, "user_agent": ua, + "moves_captured": len(moves), "clicks_captured": len(clicks), + "t2_screenx_exposed": t2_exposed, "screen_client_max_delta_px": max_delta, + "t1_coalesced_exposed": t1_exposed, "coalesced_len_max": t1_max, + "delivered_rate_hz": rate, "is_trusted": trusted, + } + + if verbose: + v_major = major if major is not None else "?" 
+ print("=== human_selftest ===") + print("Chrome major: %s (T2 fixed upstream in >= 142)" % v_major) + print("captured: %d moves, %d clicks; isTrusted=%s" % (len(moves), len(clicks), trusted)) + if not allev: + print("WARNING: no events captured — run on a normal http(s) page with the tab focused.") + else: + print("T2 screenX: %s" % ("EXPOSED (screenX==clientX bug)" if t2_exposed + else "OK (window offset present, delta=%dpx)" % max_delta)) + print("T1 coalesced: %s" % ("EXPOSED (getCoalescedEvents<=1; CDP bypasses coalescing)" + if t1_exposed else "has coalescing (max=%d)" % t1_max)) + print("delivered pointer rate: %s Hz (~60 => fast server-side path; ~30 => old daemon/fallback)" % rate) + return res diff --git a/tests/unit/test_human_behavior.py b/tests/unit/test_human_behavior.py index cb24d970..a62daf12 100644 --- a/tests/unit/test_human_behavior.py +++ b/tests/unit/test_human_behavior.py @@ -392,6 +392,61 @@ def _partial(events, session_id=None): helpers.dispatch_input_sequence = orig +def test_selftest_detects_exposed_tells(): + import json as _j + _reset() + canned = {"moves": [{"t": 0, "sx": 100, "cx": 100, "sy": 50, "cy": 50, "trusted": True, "coalesced": 1}, + {"t": 16, "sx": 110, "cx": 110, "sy": 55, "cy": 55, "trusted": True, "coalesced": 1}], + "clicks": [{"t": 40, "sx": 120, "cx": 120, "sy": 60, "cy": 60, "trusted": True, "coalesced": -1}]} + + def fake_eval(expr, await_promise=False): + if "userAgent" in expr: + return "Mozilla/5.0 (Macintosh) Chrome/120.0.0.0 Safari/537.36" + if "JSON.stringify" in expr: + return _j.dumps(canned) + return True + + orig = ah._eval + ah._eval = fake_eval + try: + r = ah.human_selftest(verbose=False) + assert r["chrome_major"] == 120 + assert r["t2_screenx_exposed"] is True # screenX==clientX (delta 0) on Chrome <142 + assert r["screen_client_max_delta_px"] == 0 + assert r["t1_coalesced_exposed"] is True # all coalesced == 1 + assert r["is_trusted"] is True + finally: + ah._eval = orig + + +def 
test_selftest_detects_fixed_chrome(): + import json as _j + _reset() + canned = {"moves": [{"t": 0, "sx": 172, "cx": 100, "sy": 130, "cy": 50, "trusted": True, "coalesced": 3}, + {"t": 16, "sx": 182, "cx": 110, "sy": 135, "cy": 55, "trusted": True, "coalesced": 2}, + {"t": 33, "sx": 192, "cx": 120, "sy": 140, "cy": 60, "trusted": True, "coalesced": 2}], + "clicks": [{"t": 50, "sx": 202, "cx": 130, "sy": 145, "cy": 65, "trusted": True, "coalesced": -1}]} + + def fake_eval(expr, await_promise=False): + if "userAgent" in expr: + return "Mozilla/5.0 Chrome/148.0.0.0 Safari/537.36" + if "JSON.stringify" in expr: + return _j.dumps(canned) + return True + + orig = ah._eval + ah._eval = fake_eval + try: + r = ah.human_selftest(verbose=False) + assert r["chrome_major"] == 148 + assert r["t2_screenx_exposed"] is False # window offset present + assert r["screen_client_max_delta_px"] == 152 # |172-100| + |130-50| (manhattan x+y) + assert r["t1_coalesced_exposed"] is False # coalesced max 3 > 1 + assert r["delivered_rate_hz"] is not None + finally: + ah._eval = orig + + def _run_all(): fns = [g for n, g in sorted(globals().items()) if n.startswith("test_") and callable(g)] passed = 0 From d35d52f60b6f6b5f0e549c233310cb4fd812d96a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?thesqrldev=F0=9F=90=BF=EF=B8=8F?= Date: Fri, 29 May 2026 19:10:45 +0900 Subject: [PATCH 4/9] docs(human-sim): record Phase 0 live measurement (Chrome 148) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ran human_selftest() against the live machine's real Chrome 148.0.7778.181: - T2 (screenX==clientX): NOT exposed — screen/client delta 121px (upstream fix live). - T1 (getCoalescedEvents): EXPOSED — max 1 (CDP coalescing bypass confirmed on 148). - isTrusted true; delivered pointer rate 41Hz with the server-side fast path verified active (dispatch_input_sequence returned {ok,count:2} on a fresh daemon against real Chrome — the daemon batch handler works end-to-end). 
Confirms the no-fork decision empirically: the only exposed tell (T1) is the one with zero production deployment. Phases 1/2 not triggered. Co-Authored-By: Claude Opus 4.8 (1M context) --- agent-workspace/CEILING_DECISIONS.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/agent-workspace/CEILING_DECISIONS.md b/agent-workspace/CEILING_DECISIONS.md index d87c02a9..c1d8261c 100644 --- a/agent-workspace/CEILING_DECISIONS.md +++ b/agent-workspace/CEILING_DECISIONS.md @@ -55,6 +55,19 @@ Run on a normal http(s) page: browser-harness -c 'new_tab("https://example.com"); wait_for_load(); import json; print(json.dumps(human_selftest(), indent=2))' ``` +**Phase 0 result — measured 2026-05-29 on the live machine (Chrome 148.0.7778.181):** +- **T2 screenX: NOT exposed** — screen-vs-client delta = 121px. The upstream fix is live; no action. +- **T1 coalesced: EXPOSED** — getCoalescedEvents max = 1 (CDP bypass confirmed on Chrome 148; matches research). +- isTrusted = true. Delivered pointer rate = **41Hz** (server-side fast path verified active: + `dispatch_input_sequence(...)` returned `{ok:True, count:2}` on a fresh daemon against real Chrome; + ~24ms/event = 16ms delay + WS send latency, up from the ~28Hz baseline). +- **Conclusion CONFIRMED by measurement:** the ONLY exposed tell (T1) is the one with zero production + deployment → no fork, no OS-injection. Phases 1 and 2 are NOT triggered. +- Minor: the probe's click-capture returned 0 (diagnostic gap, not a finding — the 36-move stream + supplied every metric). Optional polish: also capture `mousedown` / lengthen the settle. +- Optional future tune: subtract estimated WS send time from each delay to lift 41Hz toward ~60Hz — + low value while T1 (coalesced) betrays CDP regardless of rate. + **Phase 1 — T2 remediation (conditional).** Only if the selftest shows T2 exposed (Chrome <142): update Chrome (the free, undetectable fix). A JS getter override is detectable (toString/worker) — avoid. 
From 7285a41be93b8ca683206bebfaaa9ebf8b0a1036 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?thesqrldev=F0=9F=90=BF=EF=B8=8F?= Date: Fri, 29 May 2026 22:22:03 +0900 Subject: [PATCH 5/9] =?UTF-8?q?fix(human-sim):=20polish=20human=5Fselftest?= =?UTF-8?q?=20=E2=80=94=20move-derived=20verdict,=20median=20rate,=20robus?= =?UTF-8?q?t=20click=20capture?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Live runs surfaced two selftest flaws (verdict logic was always correct; these are diagnostic-quality issues): - Rate metric swung 19-41Hz run-to-run: (n-1)/span counted the large gaps BETWEEN the move/move/click trajectories. Switched to the MEDIAN inter-move interval, which reports the true per-event cadence — now a stable ~48-56Hz (server-side fast path), matching the verified daemon dispatch. - clicks_captured was intermittently 0, which read like a bug. Verified via a catch-all probe that human_click DOES fire a full, correct event chain (pointerdown/mousedown/pointerup/mouseup/click) — it really clicks. The selftest's single-press capture can just miss the read window, so: capture pointerdown AND mousedown, retry-read briefly, and — decisively — derive the T1/T2/rate verdict from the deterministic MOVE stream only (isTrusted is still aggregated over the moves plus any captured clicks). Clicks are now labelled best-effort and never gate the T1/T2/rate verdict. Recorded the live Chrome-148 measurement update in CEILING_DECISIONS.md. Tests: 24/24 (verdict logic unchanged — canned signal lives in the move stream); py_compile clean; verified across repeated live runs (T2 OK delta 121px, T1 EXPOSED coalesced<=1, rate ~50Hz, isTrusted true — consistent regardless of click capture). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- agent-workspace/CEILING_DECISIONS.md | 15 ++++---- agent-workspace/agent_helpers.py | 52 +++++++++++++++++++++------- 2 files changed, 48 insertions(+), 19 deletions(-) diff --git a/agent-workspace/CEILING_DECISIONS.md b/agent-workspace/CEILING_DECISIONS.md index c1d8261c..6cdd2994 100644 --- a/agent-workspace/CEILING_DECISIONS.md +++ b/agent-workspace/CEILING_DECISIONS.md @@ -58,14 +58,17 @@ browser-harness -c 'new_tab("https://example.com"); wait_for_load(); import json **Phase 0 result — measured 2026-05-29 on the live machine (Chrome 148.0.7778.181):** - **T2 screenX: NOT exposed** — screen-vs-client delta = 121px. The upstream fix is live; no action. - **T1 coalesced: EXPOSED** — getCoalescedEvents max = 1 (CDP bypass confirmed on Chrome 148; matches research). -- isTrusted = true. Delivered pointer rate = **41Hz** (server-side fast path verified active: - `dispatch_input_sequence(...)` returned `{ok:True, count:2}` on a fresh daemon against real Chrome; - ~24ms/event = 16ms delay + WS send latency, up from the ~28Hz baseline). +- isTrusted = true. Delivered pointer rate = **~48-56Hz** (median inter-move; server-side fast path + verified active — `dispatch_input_sequence(...)` returned `{ok:True, count:2}` on a fresh daemon + against real Chrome — ~18-21ms/event, up from the ~28Hz baseline). - **Conclusion CONFIRMED by measurement:** the ONLY exposed tell (T1) is the one with zero production deployment → no fork, no OS-injection. Phases 1 and 2 are NOT triggered. -- Minor: the probe's click-capture returned 0 (diagnostic gap, not a finding — the 36-move stream - supplied every metric). Optional polish: also capture `mousedown` / lengthen the settle. 
-- Optional future tune: subtract estimated WS send time from each delay to lift 41Hz toward ~60Hz — +- Selftest polish: the verdict is now derived from the deterministic move stream (40+ events/run); + the rate uses the **median** inter-move interval (the prior `(n-1)/span` swung 19-41Hz because it + counted the gaps between the move/move/click trajectories). A catch-all probe verified **human_click + fires a full, correct chain** (pointerdown/mousedown/pointerup/mouseup/click) — so it really clicks; + the selftest's own click capture is best-effort/informational and never gates the verdict. +- Optional future tune: subtract estimated WS send time from each delay to lift ~50Hz toward ~60Hz — low value while T1 (coalesced) betrays CDP regardless of rate. **Phase 1 — T2 remediation (conditional).** Only if the selftest shows T2 exposed (Chrome <142): diff --git a/agent-workspace/agent_helpers.py b/agent-workspace/agent_helpers.py index a27b3e58..667e99c4 100644 --- a/agent-workspace/agent_helpers.py +++ b/agent-workspace/agent_helpers.py @@ -816,11 +816,12 @@ def chrome_version(): const rec = (arr, e) => { let coalesced = -1; try { coalesced = (typeof e.getCoalescedEvents === 'function') ? 
e.getCoalescedEvents().length : -1; } catch (_) {} - arr.push({t: e.timeStamp, sx: e.screenX, cx: e.clientX, sy: e.screenY, cy: e.clientY, + arr.push({type: e.type, t: e.timeStamp, sx: e.screenX, cx: e.clientX, sy: e.screenY, cy: e.clientY, trusted: e.isTrusted, coalesced: coalesced}); }; ov.addEventListener('pointermove', e => rec(P.moves, e), {passive: true}); - ov.addEventListener('pointerdown', e => rec(P.clicks, e), {passive: true}); + // capture BOTH — whichever the CDP press surfaces (pointerdown and/or mousedown) + ['pointerdown', 'mousedown'].forEach(t => ov.addEventListener(t, e => rec(P.clicks, e), {passive: true})); ov.addEventListener('click', e => { e.preventDefault(); e.stopPropagation(); }, true); return true; })() @@ -841,6 +842,11 @@ def human_selftest(verbose=True): transparent full-viewport overlay (clicks are swallowed, no navigation), drives a move+click, reads back per-event metrics, then removes the overlay. Returns a dict; prints a verdict when verbose. + + The verdict (T1/T2/rate/isTrusted) is derived from the deterministic move stream. + Click capture is best-effort and informational only — human_click is verified to + fire a full, correct event chain (pointerdown/mousedown/pointerup/mouseup/click), + but a single press event's delivery can fall outside the read window. """ s = _s() try: @@ -850,36 +856,56 @@ def human_selftest(verbose=True): w, h = _viewport(s) _eval(_PROBE_JS) - raw = None + data = {"moves": [], "clicks": []} try: human_move(int(w * 0.30), int(h * 0.40)) human_move(int(w * 0.70), int(h * 0.60)) human_click(int(w * 0.50), int(h * 0.50)) - time.sleep(0.15) # let the renderer flush the input events before reading + time.sleep(0.30) # let the renderer flush the input events before reading raw = _eval(_READ_JS) + if raw: + data = json.loads(raw) + # click (pointerdown/mousedown) can flush slightly later than the moves — + # re-read briefly if absent. 
Distinguishes delivery lag (caught here) from + # genuine non-firing (stays empty). + for _ in range(4): + if data.get("clicks"): + break + time.sleep(0.1) + raw = _eval(_READ_JS) + if raw: + data = json.loads(raw) finally: try: _eval(_CLEAN_JS) except Exception: pass - - data = json.loads(raw) if raw else {"moves": [], "clicks": []} moves = data.get("moves", []) clicks = data.get("clicks", []) allev = moves + clicks - deltas = [abs(e["sx"] - e["cx"]) + abs(e["sy"] - e["cy"]) for e in allev] + # Verdict is derived from the MOVE stream (deterministic, ~40+ events/run). Click + # capture is best-effort: human_click fires a full, correct chain (verified — + # pointerdown/mousedown/pointerup/mouseup/click all fire), but the single press + # event's delivery to the page can fall outside the read window, so clicks are + # informational only and never gate the verdict. + deltas = [abs(e["sx"] - e["cx"]) + abs(e["sy"] - e["cy"]) for e in moves] max_delta = max(deltas) if deltas else 0 - t2_exposed = bool(allev) and max_delta == 0 # screenX==clientX -> CDP screen-coord bug present + t2_exposed = bool(moves) and max_delta == 0 # screenX==clientX -> CDP screen-coord bug present cl = [e["coalesced"] for e in moves if e.get("coalesced", -1) >= 0] t1_max = max(cl) if cl else 0 t1_exposed = bool(cl) and t1_max <= 1 # getCoalescedEvents never >1 -> coalescing bypassed + # Rate = MEDIAN inter-move interval, not (n-1)/span: the selftest drives several + # separate trajectories (move, move, click) with large gaps between them (hover, + # dwell, IPC). Median ignores those few big gaps and reports the true per-event + # dispatch cadence within a trajectory. 
rate = None - if len(moves) >= 2: - span = (moves[-1]["t"] - moves[0]["t"]) / 1000.0 - rate = round((len(moves) - 1) / span, 1) if span > 0 else None + if len(moves) >= 3: + deltas = sorted(moves[i + 1]["t"] - moves[i]["t"] for i in range(len(moves) - 1)) + mid = deltas[len(deltas) // 2] + rate = round(1000.0 / mid, 1) if mid > 0 else None trusted = all(e["trusted"] for e in allev) if allev else None @@ -895,7 +921,7 @@ def human_selftest(verbose=True): v_major = major if major is not None else "?" print("=== human_selftest ===") print("Chrome major: %s (T2 fixed upstream in >= 142)" % v_major) - print("captured: %d moves, %d clicks; isTrusted=%s" % (len(moves), len(clicks), trusted)) + print("captured: %d moves (authoritative), %d clicks (best-effort); isTrusted=%s" % (len(moves), len(clicks), trusted)) if not allev: print("WARNING: no events captured — run on a normal http(s) page with the tab focused.") else: @@ -903,5 +929,5 @@ def human_selftest(verbose=True): else "OK (window offset present, delta=%dpx)" % max_delta)) print("T1 coalesced: %s" % ("EXPOSED (getCoalescedEvents<=1; CDP bypasses coalescing)" if t1_exposed else "has coalescing (max=%d)" % t1_max)) - print("delivered pointer rate: %s Hz (~60 => fast server-side path; ~30 => old daemon/fallback)" % rate) + print("pointer rate (median inter-move): %s Hz (>=~40 => server-side fast path; ~25-30 => client fallback)" % rate) return res From 207113fcc7e09d19b6dc77acc6b812b49aa4201d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?thesqrldev=F0=9F=90=BF=EF=B8=8F?= Date: Sat, 30 May 2026 04:32:31 +0900 Subject: [PATCH 6/9] =?UTF-8?q?feat(human-sim):=20Phase=202=20=E2=80=94=20?= =?UTF-8?q?macOS=20OS-injection=20mode=20(human=5Fclick=5Fos)=20to=20close?= =?UTF-8?q?=20the=20T1=20coalesced=20tell?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Real Quartz CGEvents traverse the full HID->compositor->renderer pipeline, so the page sees genuine coalesced pointer events + correct 
screenX + isTrusted — the one thing CDP Input.dispatchMouseEvent provably cannot do (it bypasses the compositor coalescing queue). Opt-in, macOS-only, lazy pyobjc import (core stays pure-stdlib). API: human_click_os(x,y[,button,app_name]) / human_move_os(x,y) / os_selftest() / os_calibrate(). Reuses the Fitts/Bezier/tremor trajectory, posting real moves at ~125Hz so Chrome's compositor coalesces them. Safety (this posts REAL clicks on the live desktop — three-layer guard, never blind): - frontmost check: foregrounds BH_BROWSER_APP (default "Google Chrome") and refuses if the wrong app is frontmost (Brave/Edge users pass app_name / set the env). - display-bounds check: refuses if the mapped global point is off all displays (CGGetActiveDisplayList/CGDisplayBounds; handles negative multi-monitor origins). - cursor-arrival check: posts the move, reads back the real cursor, refuses if it didn't reach the target (= Accessibility not granted) instead of a silent no-op. - click uses kCGMouseEventClickState=1 (else clickState 0 may not register / exposes MouseEvent.detail===0). Validated: 30 hermetic tests (mocked Quartz: capability, client->screen mapping, full move+down+up sequence with clickState, off-screen refusal, Accessibility-denied refusal). os_calibrate() run LIVE returned error_px [0.0, 0] — the client->screen mapping matches the browser's reported screenX/screenY EXACTLY on the primary display, so OS clicks land where intended (validated WITHOUT moving the cursor). Two adversarial review passes (APPROVE). NOT yet run live: the CGEvent path needs `pip install pyobjc-framework-Quartz` into the env + Accessibility granted; os_selftest() then measures whether getCoalescedEvents()>1 actually results. Multi-monitor mapping unvalidated (os_calibrate covered primary only). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- agent-workspace/CEILING_DECISIONS.md | 25 ++- agent-workspace/agent_helpers.py | 273 ++++++++++++++++++++++++++- tests/unit/test_human_behavior.py | 179 ++++++++++++++++++ 3 files changed, 469 insertions(+), 8 deletions(-) diff --git a/agent-workspace/CEILING_DECISIONS.md b/agent-workspace/CEILING_DECISIONS.md index 6cdd2994..6b63ee8e 100644 --- a/agent-workspace/CEILING_DECISIONS.md +++ b/agent-workspace/CEILING_DECISIONS.md @@ -74,12 +74,23 @@ browser-harness -c 'new_tab("https://example.com"); wait_for_load(); import json **Phase 1 — T2 remediation (conditional).** Only if the selftest shows T2 exposed (Chrome <142): update Chrome (the free, undetectable fix). A JS getter override is detectable (toString/worker) — avoid. -**Phase 2 — T1 OS-injection mode (on the shelf; build only if a confirmed target checks coalesced).** -Scoped `human_click_os(x, y)` using pyobjc Quartz `CGEventPost` so the click is a real OS event -(real coalescing + screenX + isTrusted). Design: resolve the Chrome window's screen rect -(`CGWindowListCopyWindowInfo`), map viewport→screen, foreground-activate, post down/up, optionally -restore. Constraints: TCC Accessibility grant; foreground + physical-cursor move (so reserve it for the -rare detection-sensitive click; keep CDP for navigation/reading/bulk). The private SkyLight -`SLEventPostToPid` (cursor-stationary, background) is fragile/unbound — not pursued. +**Phase 2 — T1 OS-injection mode — IMPLEMENTED 2026-05-30 (opt-in, macOS).** +`human_click_os(x, y)` / `human_move_os(x, y)` post real Quartz `CGEvent`s (lazy pyobjc import; the core +stays pure-stdlib) so the page sees genuine coalesced events + correct screenX + isTrusted. 
Pipeline: +capability gate → foreground the browser (`BH_BROWSER_APP`; refuses if the wrong app is frontmost) → +client→screen map → display-bounds check (refuses off-screen / wrong monitor) → Fitts/Bezier trajectory of +real moves at ~125Hz (to trigger compositor coalescing) → cursor-arrival verify (refuses if it did not +move = Accessibility not granted) → click with `kCGMouseEventClickState=1`. The three-layer guard +(frontmost / display-bounds / cursor-arrival) means it never posts a blind real click. +**Validated:** 30 hermetic tests (mocked Quartz) + `os_calibrate()` run LIVE returned error_px [0.0, 0] — +the client→screen mapping matches the browser's reported screenX/screenY EXACTLY on the primary display, so +clicks land where intended. Reviewed across two adversarial passes (APPROVE; the missing clickState, +off-screen, wrong-app, and silent-no-op risks were caught and fixed). +**Not yet exercised live:** the real CGEvent path needs `pip install pyobjc-framework-Quartz` into the +browser-harness env + Accessibility granted to the terminal/python; then `os_selftest()` measures whether +`getCoalescedEvents() > 1` actually results (the gated proof). Multi-monitor mapping is unvalidated +(os_calibrate only covered the primary display). COST stands — foreground + physical-cursor move → reserve +for the rare detection-sensitive click; keep CDP (`human_click`) for navigation/reading/bulk. The private +SkyLight `SLEventPostToPid` (cursor-stationary, background) is fragile/unbound — not pursued. **Phase 3 — Chromium fork. Rejected** (see table). Recorded only so the decision isn't relitigated. diff --git a/agent-workspace/agent_helpers.py b/agent-workspace/agent_helpers.py index 667e99c4..cc4a65ce 100644 --- a/agent-workspace/agent_helpers.py +++ b/agent-workspace/agent_helpers.py @@ -43,7 +43,7 @@ identify the session; full parity needs a patched Chromium (out of scope here). 
""" -import json, math, os, random, re, tempfile, time +import json, math, os, random, re, sys, tempfile, time from browser_harness.helpers import cdp, _KEYS as _CORE_KEYS @@ -931,3 +931,274 @@ def human_selftest(verbose=True): if t1_exposed else "has coalescing (max=%d)" % t1_max)) print("pointer rate (median inter-move): %s Hz (>=~40 => server-side fast path; ~25-30 => client fallback)" % rate) return res + + +# --------------------------------------------------------------------------- +# OS-level input injection (macOS, opt-in) — the ONLY way to close the +# getCoalescedEvents() tell (T1) that CDP Input.dispatchMouseEvent cannot. +# +# Real Quartz CGEvents traverse the full HID -> compositor -> renderer pipeline, +# so the page sees genuine coalesced pointer events, correct screenX/screenY, and +# isTrusted. COST: brings Chrome to the foreground and moves the PHYSICAL cursor, +# so reserve it for the rare detection-sensitive click — keep human_click / CDP for +# navigation, reading, and bulk. Requires: macOS, `pyobjc-framework-Quartz`, and +# Accessibility permission for the terminal/python (System Settings > Privacy & +# Security > Accessibility). NOT loaded/used unless you call these functions. +# --------------------------------------------------------------------------- + +_OS_STEP_MS = 8 # ~125Hz post rate so Chrome's compositor coalesces the moves (the T1 win) + + +def os_input_available(): + """(ok: bool, reason: str). macOS + importable Quartz. 
Accessibility can't be + cheaply probed here; if events don't land, grant it (the reason hints at this).""" + if sys.platform != "darwin": + return (False, "OS input injection is macOS-only (this platform: %s)" % sys.platform) + try: + import Quartz # noqa: F401 + except Exception: + return (False, "missing dependency: pip install pyobjc-framework-Quartz " + "(into the browser-harness env), and grant Accessibility") + return (True, "ok (ensure Accessibility is granted if clicks don't land)") + + +def _os_screen_point(s, cx, cy): + """Map viewport (client) CSS px -> global screen points (CGEvent space) using the + page's own window geometry. CSS px == screen points for CGEvent, so devicePixelRatio + does not enter — no Retina scaling needed.""" + geo = _eval("JSON.stringify([window.screenX, window.screenY, " + "window.outerHeight - window.innerHeight, window.outerWidth - window.innerWidth])") + sx0, sy0, top_chrome, side_chrome = json.loads(geo) + # modern Chrome has no side borders (side_chrome ~ 0); top_chrome = tabstrip+toolbar. + return (sx0 + side_chrome / 2.0 + cx, sy0 + top_chrome + cy) + + +def _os_cursor(Quartz): + return Quartz.CGEventGetLocation(Quartz.CGEventCreate(None)) # CGPoint (.x, .y), top-left origin + + +def _os_post_move(Quartz, x, y): + e = Quartz.CGEventCreateMouseEvent(None, Quartz.kCGEventMouseMoved, (x, y), Quartz.kCGMouseButtonLeft) + Quartz.CGEventPost(Quartz.kCGHIDEventTap, e) + + +def _os_post_button(Quartz, x, y, down, button): + if button == "right": + et = Quartz.kCGEventRightMouseDown if down else Quartz.kCGEventRightMouseUp + bt = Quartz.kCGMouseButtonRight + else: + et = Quartz.kCGEventLeftMouseDown if down else Quartz.kCGEventLeftMouseUp + bt = Quartz.kCGMouseButtonLeft + e = Quartz.CGEventCreateMouseEvent(None, et, (x, y), bt) + # clickState 1 = single click. Without it the event defaults to clickState 0, which + # can fail to synthesize a real 'click' and exposes MouseEvent.detail===0 (a tell). 
+ Quartz.CGEventSetIntegerValueField(e, Quartz.kCGMouseEventClickState, 1) + Quartz.CGEventPost(Quartz.kCGHIDEventTap, e) + + +def _within_displays(Quartz, x, y): + """True if the global point falls on some active display — guards against driving a + REAL click off-screen / onto the wrong monitor (the mapped point is not re-clamped).""" + try: + err, ids, n = Quartz.CGGetActiveDisplayList(16, None, None) + except Exception: + return True # can't enumerate displays -> best-effort, don't block + for did in list(ids)[:n]: + b = Quartz.CGDisplayBounds(did) + ox, oy, w, h = b.origin.x, b.origin.y, b.size.width, b.size.height + if ox - 1 <= x <= ox + w + 1 and oy - 1 <= y <= oy + h + 1: + return True + return False + + +def _os_trajectory(s, Quartz, start, end): + sx, sy = (start.x, start.y) if hasattr(start, "x") else (start[0], start[1]) + dist = math.hypot(end[0] - sx, end[1] - sy) + if dist < 2: + _os_post_move(Quartz, end[0], end[1]) + return + dt = _OS_STEP_MS / 1000.0 + n = max(8, min(150, int(_fitts_ms(dist) * s.profile["move_time_mult"] / _OS_STEP_MS))) + for px, py in _bezier_trajectory((sx, sy), end, n, dt, s.tremor_angle): + _os_post_move(Quartz, px, py) + time.sleep(max(0.003, _OS_STEP_MS / 1000.0 + random.gauss(0, 0.002))) + + +def _activate_chrome(app_name="Google Chrome"): + """Foreground the browser app; return the resulting frontmost process name ('' if + undeterminable). 
Used to refuse posting a real click into the wrong app.""" + import subprocess + safe = app_name.replace("\\", "").replace('"', "") # app names carry no quotes; avoid AppleScript breakage + try: + subprocess.run(["osascript", "-e", 'tell application "%s" to activate' % safe], + capture_output=True, timeout=5) + except Exception: + pass + time.sleep(0.2) + try: + r = subprocess.run( + ["osascript", "-e", + "tell application \"System Events\" to get name of first application process whose frontmost is true"], + capture_output=True, text=True, timeout=5) + return (r.stdout or "").strip() + except Exception: + return "" + + +def _os_prepare(x, y, activate, app_name): + """Shared pre-flight for the OS-injection path: capability gate, lazy Quartz import, + foreground + frontmost verification, client->screen mapping, and display-bounds check. + Raises (never posts blind) if anything is wrong. Returns (Quartz, s, x, y, tx, ty).""" + ok, reason = os_input_available() + if not ok: + raise RuntimeError("OS input unavailable: %s" % reason) + import Quartz + s = _s() + x, y = _clamp(x, y, s) + if activate: + front = _activate_chrome(app_name) + if front and app_name.lower() not in front.lower() and front.lower() not in app_name.lower(): + raise RuntimeError("foreground app is %r, not %r — refusing to inject a real click " + "into the wrong app (set BH_BROWSER_APP or pass app_name)" % (front, app_name)) + tx, ty = _os_screen_point(s, x, y) + if not _within_displays(Quartz, tx, ty): + raise RuntimeError("target screen point (%.0f, %.0f) is outside all displays — refusing to " + "click off-screen / on the wrong monitor; run os_calibrate() here" % (tx, ty)) + return Quartz, s, x, y, tx, ty + + +def _os_goto(s, Quartz, tx, ty): + """Move the physical cursor to (tx, ty) and VERIFY it arrived — a no-move means + Accessibility is not granted (CGEventPost silently drops), so raise instead of + returning a false success.""" + _os_trajectory(s, Quartz, _os_cursor(Quartz), (tx, ty)) + pos 
= _os_cursor(Quartz) + px, py = (pos.x, pos.y) if hasattr(pos, "x") else (pos[0], pos[1]) + if abs(px - tx) > 4 or abs(py - ty) > 4: + raise RuntimeError("OS cursor did not reach target (got %.0f,%.0f want %.0f,%.0f) — grant " + "Accessibility to your terminal/python in System Settings > Privacy & " + "Security > Accessibility" % (px, py, tx, ty)) + + +def human_move_os(x, y, activate=True, app_name=None): + """Move the physical cursor to viewport (x, y) via real Quartz CGEvents. macOS only; + see the section header for constraints and prerequisites.""" + app_name = app_name or os.environ.get("BH_BROWSER_APP", "Google Chrome") + Quartz, s, x, y, tx, ty = _os_prepare(x, y, activate, app_name) + _os_goto(s, Quartz, tx, ty) + s.cursor = [float(x), float(y)] + s._save() + return {"screen_point": [round(tx, 1), round(ty, 1)]} + + +def human_click_os(x, y, button="left", activate=True, app_name=None): + """High-stealth click at viewport (x, y) via REAL macOS OS events (Quartz CGEvent). + Real HID events traverse the full pipeline, so the page is INTENDED to see genuine + coalesced pointer events, correct screenX/screenY, and isTrusted — the path CDP + cannot take. Verify the coalescing actually results on your setup with os_selftest() + (it has not been measured here). macOS only. + + COST: brings the browser to the foreground (refuses if the wrong app is frontmost) and + moves the physical cursor (refuses if the cursor doesn't reach the target, i.e. + Accessibility not granted, or if the point is off all displays). Use sparingly for the + rare detection-sensitive click; use human_click (CDP) for navigation/reading/bulk. The + target tab MUST be the active/visible tab of the frontmost browser window. 
+ """ + app_name = app_name or os.environ.get("BH_BROWSER_APP", "Google Chrome") + Quartz, s, x, y, tx, ty = _os_prepare(x, y, activate, app_name) + p = s.profile + _os_goto(s, Quartz, tx, ty) + time.sleep(random.uniform(*p["hover_range"])) + _os_post_button(Quartz, tx, ty, True, button) + time.sleep(max(0.03, _lognormal(p["dwell_mean"], p["dwell_mean"] * 0.28) / 1000.0)) + _os_post_button(Quartz, tx, ty, False, button) + s.cursor = [float(x), float(y)] + s._save() + return {"screen_point": [round(tx, 1), round(ty, 1)]} + + +def os_calibrate(): + """Validate the client->screen mapping WITHOUT moving the physical cursor (no Quartz + needed): drive CDP moves through the same overlay probe human_selftest uses, capture + the screenX/screenY Chrome reports for them (ground truth on Chrome >= 142), and + compare to _os_screen_point() at the same client point. A small error means + human_click_os will land where intended. Run on a normal http(s) page. Returns a dict. + """ + s = _s() + _eval(_PROBE_JS) + data = {"moves": []} + try: + w, h = _viewport(s) + human_move(int(w * 0.40), int(h * 0.40)) + human_move(int(w * 0.60), int(h * 0.60)) + time.sleep(0.20) + raw = _eval(_READ_JS) + if raw: + data = json.loads(raw) + finally: + try: + _eval(_CLEAN_JS) + except Exception: + pass + + moves = [m for m in data.get("moves", []) if "sx" in m and "cx" in m] + if not moves: + return {"ok": False, "reason": "no moves captured (run on a normal http(s) page, tab focused)"} + m = moves[len(moves) // 2] + pred = _os_screen_point(s, m["cx"], m["cy"]) + err = (abs(pred[0] - m["sx"]), abs(pred[1] - m["sy"])) + return { + "ok": err[0] <= 3 and err[1] <= 3, + "client": [m["cx"], m["cy"]], + "predicted_screen": [round(pred[0], 1), round(pred[1], 1)], + "browser_screen": [m["sx"], m["sy"]], + "error_px": [round(err[0], 1), round(err[1], 1)], + } + + +def os_selftest(verbose=True): + """Prove (or disprove) that the OS-injection path actually closes T1: drive REAL OS + moves (human_move_os) 
through the same overlay probe human_selftest uses, then read + getCoalescedEvents() on the resulting pointermoves. coalesced_len_max > 1 means real + coalescing happened — the tell CDP cannot close is closed. Requires pyobjc + + Accessibility + foreground; moves the physical cursor. Run on a normal http(s) page. + """ + ok, reason = os_input_available() + if not ok: + raise RuntimeError("os_selftest unavailable: %s" % reason) + s = _s() + w, h = _viewport(s) + _eval(_PROBE_JS) + data = {"moves": []} + try: + human_move_os(int(w * 0.35), int(h * 0.45)) + human_move_os(int(w * 0.65), int(h * 0.55)) + time.sleep(0.30) + raw = _eval(_READ_JS) + if raw: + data = json.loads(raw) + finally: + try: + _eval(_CLEAN_JS) + except Exception: + pass + + moves = [m for m in data.get("moves", []) if m.get("coalesced", -1) >= 0] + cl = [m["coalesced"] for m in moves] + deltas = [abs(m["sx"] - m["cx"]) + abs(m["sy"] - m["cy"]) for m in moves if "sx" in m] + res = { + "moves_captured": len(moves), + "coalesced_len_max": max(cl) if cl else 0, + "t1_coalesced_closed": bool(cl) and max(cl) > 1, + "screen_client_max_delta_px": max(deltas) if deltas else 0, + } + if verbose: + print("=== os_selftest (REAL OS input via CGEvent) ===") + print("moves captured: %d" % len(moves)) + print("getCoalescedEvents max: %d -> T1 %s" % ( + res["coalesced_len_max"], + "CLOSED (real coalescing observed!)" if res["t1_coalesced_closed"] + else "NOT closed (no coalescing > 1 seen)")) + if not moves: + print("WARNING: no moves captured — Accessibility not granted, or run on a normal http(s) page.") + return res diff --git a/tests/unit/test_human_behavior.py b/tests/unit/test_human_behavior.py index a62daf12..3f0dc8ae 100644 --- a/tests/unit/test_human_behavior.py +++ b/tests/unit/test_human_behavior.py @@ -447,6 +447,185 @@ def fake_eval(expr, await_promise=False): ah._eval = orig +class _FakeCGPoint: + def __init__(self, x, y): + self.x = x + self.y = y + + +class _FakeSize: + def __init__(self, w, h): + 
self.width = w + self.height = h + + +class _FakeRect: + def __init__(self, ox, oy, w, h): + self.origin = _FakeCGPoint(ox, oy) + self.size = _FakeSize(w, h) + + +class _FakeQuartz: + kCGEventMouseMoved = 5 + kCGEventLeftMouseDown = 1 + kCGEventLeftMouseUp = 2 + kCGEventRightMouseDown = 3 + kCGEventRightMouseUp = 4 + kCGMouseButtonLeft = 0 + kCGMouseButtonRight = 1 + kCGHIDEventTap = 0 + kCGMouseEventClickState = 1 + + def __init__(self, display=(0.0, 0.0, 3000.0, 3000.0), move_cursor=True): + self.posted = [] + self.cursor = _FakeCGPoint(50.0, 50.0) + self._display = display + self._move_cursor = move_cursor + + def CGEventCreate(self, src): + return ("create",) + + def CGEventGetLocation(self, e): + return self.cursor + + def CGEventCreateMouseEvent(self, src, etype, pos, btn): + return {"type": etype, "x": float(pos[0]), "y": float(pos[1]), "btn": btn, "clickState": 0} + + def CGEventSetIntegerValueField(self, e, field, val): + if field == self.kCGMouseEventClickState: + e["clickState"] = val + + def CGEventPost(self, tap, e): + self.posted.append(e) + if self._move_cursor and e["type"] == self.kCGEventMouseMoved: + self.cursor = _FakeCGPoint(e["x"], e["y"]) # model the real cursor moving + + def CGGetActiveDisplayList(self, maxd, a, b): + return (0, [1], 1) + + def CGDisplayBounds(self, did): + return _FakeRect(*self._display) + + +def test_os_input_available_without_quartz(): + sys.modules.pop("Quartz", None) + ok, reason = ah.os_input_available() + # on darwin without pyobjc -> missing-dependency; on non-darwin -> macOS-only + assert ok is False + assert ("pyobjc" in reason) or ("macOS-only" in reason), reason + + +def test_os_screen_point_mapping(): + _reset() + orig = ah._eval + # window.screenX=10, screenY=80, top_chrome=80, side_chrome=0 + ah._eval = lambda expr, await_promise=False: "[10, 80, 80, 0]" + try: + sx, sy = ah._os_screen_point(ah._s(), 600, 400) + assert sx == 610.0 and sy == 560.0, (sx, sy) # 10+0+600, 80+80+400 + finally: + ah._eval = orig + 
+ +def test_human_click_os_posts_real_event_sequence(): + import sys as _sys + if _sys.platform != "darwin": + return # OS path is macOS-only + _reset() + fake = _FakeQuartz() + _sys.modules["Quartz"] = fake + orig_eval, orig_act = ah._eval, ah._activate_chrome + ah._eval = lambda expr, await_promise=False: "[10, 80, 80, 0]" # window geometry + ah._activate_chrome = lambda *a, **k: "Google Chrome" # frontmost matches app_name + try: + r = ah.human_click_os(600, 400) + assert r["screen_point"] == [610.0, 560.0] + types_ = [e["type"] for e in fake.posted] + assert fake.kCGEventLeftMouseDown in types_ and fake.kCGEventLeftMouseUp in types_ + di = types_.index(fake.kCGEventLeftMouseDown) + ui = types_.index(fake.kCGEventLeftMouseUp) + assert di < ui # down before up + assert all(t == fake.kCGEventMouseMoved for t in types_[:di]) # moves precede press + down, up = fake.posted[di], fake.posted[ui] + assert down["x"] == 610.0 and down["y"] == 560.0 # press at mapped screen point + assert down["clickState"] == 1 and up["clickState"] == 1 # D1 fix: single-click state set + last_move = fake.posted[di - 1] + assert last_move["x"] == 610.0 and last_move["y"] == 560.0 # final move == press (no teleport) + finally: + ah._eval, ah._activate_chrome = orig_eval, orig_act + _sys.modules.pop("Quartz", None) + + +def test_human_click_os_refuses_offscreen_point(): + import sys as _sys + if _sys.platform != "darwin": + return + _reset() + fake = _FakeQuartz(display=(0.0, 0.0, 100.0, 100.0)) # tiny display; (610,560) is outside + _sys.modules["Quartz"] = fake + orig_eval, orig_act = ah._eval, ah._activate_chrome + ah._eval = lambda expr, await_promise=False: "[10, 80, 80, 0]" + ah._activate_chrome = lambda *a, **k: "Google Chrome" + try: + raised = False + try: + ah.human_click_os(600, 400) + except RuntimeError as e: + raised = "outside all displays" in str(e) + assert raised, "must refuse to click off all displays" + assert fake.posted == [], "no events may be posted when the target is 
off-screen" + finally: + ah._eval, ah._activate_chrome = orig_eval, orig_act + _sys.modules.pop("Quartz", None) + + +def test_os_goto_raises_when_cursor_does_not_move(): + import sys as _sys + if _sys.platform != "darwin": + return + _reset() + fake = _FakeQuartz(move_cursor=False) # model Accessibility-denied: posts no-op, cursor stuck + _sys.modules["Quartz"] = fake + orig_eval, orig_act = ah._eval, ah._activate_chrome + ah._eval = lambda expr, await_promise=False: "[10, 80, 80, 0]" + ah._activate_chrome = lambda *a, **k: "Google Chrome" + try: + raised = False + try: + ah.human_click_os(600, 400) + except RuntimeError as e: + raised = "did not reach target" in str(e) + assert raised, "must raise when the cursor never reaches the target (Accessibility denied)" + finally: + ah._eval, ah._activate_chrome = orig_eval, orig_act + _sys.modules.pop("Quartz", None) + + +def test_os_calibrate_error_computation(): + _reset() + orig = ah._eval + + import json as _j + + def fake_eval(expr, await_promise=False): + if "outerHeight" in expr: + return "[10, 80, 80, 0]" # geometry -> pred (610, 560) for client (600,400) + if "__bh_probe" in expr and "JSON.stringify" in expr: # _READ_JS + return _j.dumps({"moves": [{"sx": 610, "sy": 560, "cx": 600, "cy": 400, + "coalesced": 1, "trusted": True}], "clicks": []}) + return True # probe install / cleanup + + ah._eval = fake_eval + try: + r = ah.os_calibrate() + assert r["ok"] is True + assert r["predicted_screen"] == [610.0, 560.0] + assert r["browser_screen"] == [610, 560] + assert r["error_px"] == [0.0, 0.0] + finally: + ah._eval = orig + + def _run_all(): fns = [g for n, g in sorted(globals().items()) if n.startswith("test_") and callable(g)] passed = 0 From fdbcd25a7f80c8d6dd401f7cb667a9e8adaff265 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?thesqrldev=F0=9F=90=BF=EF=B8=8F?= Date: Sat, 30 May 2026 05:11:23 +0900 Subject: [PATCH 7/9] fix(human-sim): _viewport CSS-px (Retina) + PROVEN T1 closed live via OS injection MIME-Version: 1.0 
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Running os_selftest() live (pyobjc installed + Accessibility granted) drove real CGEvent moves and the page reported getCoalescedEvents() max = 2 (>1) with a real screenX offset (delta 164px): T1 (getCoalescedEvents empty) is GENUINELY CLOSED via the OS-injection path — the one tell CDP Input.dispatchMouseEvent provably cannot fix. The live run also exposed a latent Retina bug: _viewport read layoutViewport, which is DEVICE px (2x at devicePixelRatio 2); CDP/CGEvent coordinates are CSS px, so every fractional target (e.g. 0.65*w) was 2x too large and mapped off-screen — the OS display-bounds guard correctly refused. Fixed _viewport to prefer cssLayoutViewport (CSS px), falling back to layoutViewport for older Chrome. This also corrects _clamp and cursor-init on any dpr>1 display (affected the CDP path too, latent). Tests: +1 Retina regression (test_viewport_uses_css_pixels_on_retina; _FakeQuartz cdp now returns both cssLayoutViewport=1200x800 and layoutViewport=2400x1600, asserts the CSS one is used). 31/31 + daemon 3. py_compile clean. os_calibrate live still [0.0,0]. Co-Authored-By: Claude Opus 4.8 (1M context) --- agent-workspace/CEILING_DECISIONS.md | 19 +++++++++++++------ agent-workspace/agent_helpers.py | 5 ++++- tests/unit/test_human_behavior.py | 13 ++++++++++++- 3 files changed, 29 insertions(+), 8 deletions(-) diff --git a/agent-workspace/CEILING_DECISIONS.md b/agent-workspace/CEILING_DECISIONS.md index 6b63ee8e..de1a31b8 100644 --- a/agent-workspace/CEILING_DECISIONS.md +++ b/agent-workspace/CEILING_DECISIONS.md @@ -86,11 +86,18 @@ move = Accessibility not granted) → click with `kCGMouseEventClickState=1`. Th the client→screen mapping matches the browser's reported screenX/screenY EXACTLY on the primary display, so clicks land where intended. Reviewed across two adversarial passes (APPROVE; the missing clickState, off-screen, wrong-app, and silent-no-op risks were caught and fixed). 
-**Not yet exercised live:** the real CGEvent path needs `pip install pyobjc-framework-Quartz` into the -browser-harness env + Accessibility granted to the terminal/python; then `os_selftest()` measures whether -`getCoalescedEvents() > 1` actually results (the gated proof). Multi-monitor mapping is unvalidated -(os_calibrate only covered the primary display). COST stands — foreground + physical-cursor move → reserve -for the rare detection-sensitive click; keep CDP (`human_click`) for navigation/reading/bulk. The private -SkyLight `SLEventPostToPid` (cursor-stationary, background) is fragile/unbound — not pursued. +**PROVEN LIVE 2026-05-30:** with `pyobjc-framework-Quartz` installed in the tool env + Accessibility +granted, `os_selftest()` drove real CGEvent moves and the page reported `getCoalescedEvents()` **max = 2 +(> 1)** with a real screenX offset (delta 164px) — **T1 is genuinely CLOSED via the OS path**, the one +thing CDP cannot do (CDP moves always report coalesced ≤ 1). The OS click really clicks (catch-all probe +earlier confirmed the full pointerdown/mousedown/up/click chain). +Finding from the live run: it exposed a latent **Retina bug** — `_viewport` read `layoutViewport` = +DEVICE px (2x at devicePixelRatio 2), so fractional targets (e.g. 0.65·w) mapped off-screen and the +display-bounds guard (correctly) refused. Fixed to use `cssLayoutViewport` = CSS px, the units CDP and +CGEvent both expect. This also corrected `_clamp`/cursor-init on any dpr>1 display. +Still unvalidated: multi-monitor mapping (single 1728×1117 display here). COST stands — foreground + +physical-cursor move → reserve for the rare detection-sensitive click; keep CDP (`human_click`) for +navigation/reading/bulk. The private SkyLight `SLEventPostToPid` (cursor-stationary, background) is +fragile/unbound — not pursued. **Phase 3 — Chromium fork. Rejected** (see table). Recorded only so the decision isn't relitigated. 
diff --git a/agent-workspace/agent_helpers.py b/agent-workspace/agent_helpers.py index cc4a65ce..9d762970 100644 --- a/agent-workspace/agent_helpers.py +++ b/agent-workspace/agent_helpers.py @@ -202,7 +202,10 @@ def _viewport(s): if s.viewport is None: try: m = cdp("Page.getLayoutMetrics") - vp = m.get("layoutViewport", {}) + # cssLayoutViewport is CSS px; layoutViewport is DEVICE px (x devicePixelRatio). + # CDP Input.dispatchMouseEvent coords are CSS px, so we must use the CSS one or + # every coordinate is 2x too large on a Retina (dpr>1) display. + vp = m.get("cssLayoutViewport") or m.get("layoutViewport", {}) s.viewport = ( int(vp.get("clientWidth", 1200)), int(vp.get("clientHeight", 800)), diff --git a/tests/unit/test_human_behavior.py b/tests/unit/test_human_behavior.py index 3f0dc8ae..6e8b9a9c 100644 --- a/tests/unit/test_human_behavior.py +++ b/tests/unit/test_human_behavior.py @@ -28,7 +28,9 @@ def _fake_cdp(method, **kw): EVENTS.append((method, kw)) if method == "Page.getLayoutMetrics": - return {"layoutViewport": {"clientWidth": 1200, "clientHeight": 800}} + # model a Retina display: layoutViewport is DEVICE px (2x), cssLayoutViewport is CSS px + return {"cssLayoutViewport": {"clientWidth": 1200, "clientHeight": 800}, + "layoutViewport": {"clientWidth": 2400, "clientHeight": 1600}} return {} @@ -626,6 +628,15 @@ def fake_eval(expr, await_promise=False): ah._eval = orig +def test_viewport_uses_css_pixels_on_retina(): + # Regression: _viewport must read cssLayoutViewport (CSS px), not layoutViewport + # (device px x dpr), or fractional coords are 2x too large on Retina and map off-screen. 
+ _reset() + ah._s().viewport = None + w, h = ah._viewport(ah._s()) + assert (w, h) == (1200, 800), (w, h) # CSS px, NOT the device-px 2400x1600 + + def _run_all(): fns = [g for n, g in sorted(globals().items()) if n.startswith("test_") and callable(g)] passed = 0 From 0f4a44822a01de10ef340b2f2a17ee4225531065 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?thesqrldev=F0=9F=90=BF=EF=B8=8F?= Date: Sun, 31 May 2026 19:19:23 +0900 Subject: [PATCH 8/9] Make OS coalescing proof deterministic under renderer stress The live no-stress OS selftest was flaky on Chrome 148: real Quartz moves can legitimately report getCoalescedEvents()==1 when the renderer is idle. The proof now adds transient renderer stress so the compositor path difference is measured deterministically, while the actual OS movement model stays human-scale. Constraint: Chrome 148 on a single-display Retina Mac produced no-stress OS coalescing inconsistently. Rejected: Increase OS move rate or burst CGEvents | live repeats stayed flaky and changed the movement model. Confidence: high Scope-risk: narrow Directive: Treat os_selftest(stress=True) as the T1 proof mode; do not claim unstressed Chrome always returns coalesced events. Tested: python3 tests/unit/test_human_behavior.py; python3 tests/unit/test_daemon_input_sequence.py; uv run --with pytest pytest tests/unit/test_admin.py tests/unit/test_helpers.py tests/unit/test_daemon.py tests/unit/test_run.py -q; python3 -m py_compile agent-workspace/agent_helpers.py src/browser_harness/daemon.py src/browser_harness/helpers.py; live os_calibrate ok error_px [0.0,0]; live os_selftest stress repeats coalesced_len_max 2/3/3; live CDP stress coalesced_len_max 1. Not-tested: live multi-monitor hardware; current machine exposes one active display only. 
--- agent-workspace/agent_helpers.py | 46 ++++++++++++++++++++------- tests/unit/test_daemon.py | 52 +++++++++++++++++++------------ tests/unit/test_human_behavior.py | 27 +++++++++++++--- 3 files changed, 90 insertions(+), 35 deletions(-) diff --git a/agent-workspace/agent_helpers.py b/agent-workspace/agent_helpers.py index 9d762970..e7fced74 100644 --- a/agent-workspace/agent_helpers.py +++ b/agent-workspace/agent_helpers.py @@ -25,11 +25,13 @@ out at. Falls back to client-side dispatch if the daemon predates the batch op (restart the daemon for the fast path); a mid-batch failure resumes the remainder rather than re-sending the dispatched prefix. - * COALESCED EVENTS — NOT fixable in software. CDP Input.dispatchMouseEvent injects + * COALESCED EVENTS — FIXED for opt-in macOS OS input. CDP Input.dispatchMouseEvent injects via RenderWidgetHostImpl::ForwardMouseEvent, bypassing the compositor coalescing queue, so PointerEvent.getCoalescedEvents() stays empty regardless of injection rate. (This is why we target ~60Hz, not higher: extra uncoalesced events would - only look more anomalous.) Closing it requires a patched Chromium binary. + only look more anomalous.) The opt-in human_*_os path posts real Quartz CGEvents + through the OS/HID pipeline; os_selftest(stress=True) verifies that Chrome exposes + coalesced samples on the live machine without a browser fork. * screenX/screenY — residual tell. CDP sets screenX==clientX (no window/desktop offset), which a real windowed browser never produces; Cloudflare Turnstile checks this. Not settable via CDP and not safely patchable from page JS. @@ -38,9 +40,9 @@ * CDP-presence — Runtime.enable is omitted by default at the daemon (kills the console-serialization detection class); but an attached remote-debugging client is fundamentally detectable by other means. -Net: defeats heuristic/weak-ML detectors and the event-rate signal. 
The coalesced -and screenX tells mean a top-tier ensemble inspecting CDP input fidelity can still -identify the session; full parity needs a patched Chromium (out of scope here). +Net: CDP mode defeats heuristic/weak-ML detectors and the event-rate signal but still +has CDP input-fidelity tells. Use the opt-in OS path for rare detection-sensitive +clicks when physical cursor movement / foreground focus is acceptable. """ import json, math, os, random, re, sys, tempfile, time @@ -831,7 +833,23 @@ def chrome_version(): """ _READ_JS = "JSON.stringify(window.__bh_probe || null)" -_CLEAN_JS = ("(()=>{const o=document.getElementById('__bh_probe_overlay');if(o)o.remove();" +_OS_STRESS_JS = r""" +(() => { + // T1 proof mode: briefly occupy the renderer main thread while real OS input + // arrives. Genuine HID/compositor input then reaches pointermove as coalesced + // samples; CDP Input.dispatchMouseEvent remains uncoalesced under the same load. + window.__bh_os_stress_stop = false; + function burn() { + const end = performance.now() + 12; + while (performance.now() < end) {} + if (!window.__bh_os_stress_stop) setTimeout(burn, 0); + } + setTimeout(burn, 0); + return true; +})() +""" +_CLEAN_JS = ("(()=>{window.__bh_os_stress_stop=true;" + "const o=document.getElementById('__bh_probe_overlay');if(o)o.remove();" "try{delete window.__bh_probe;}catch(e){}return true;})()") @@ -949,7 +967,7 @@ def human_selftest(verbose=True): # Security > Accessibility). NOT loaded/used unless you call these functions. 
# --------------------------------------------------------------------------- -_OS_STEP_MS = 8 # ~125Hz post rate so Chrome's compositor coalesces the moves (the T1 win) +_OS_STEP_MS = 8 # ~125Hz post rate: common HID polling cadence for real mouse movement def os_input_available(): @@ -1099,7 +1117,7 @@ def human_click_os(x, y, button="left", activate=True, app_name=None): Real HID events traverse the full pipeline, so the page is INTENDED to see genuine coalesced pointer events, correct screenX/screenY, and isTrusted — the path CDP cannot take. Verify the coalescing actually results on your setup with os_selftest() - (it has not been measured here). macOS only. + (stress=True is the deterministic proof mode). macOS only. COST: brings the browser to the foreground (refuses if the wrong app is frontmost) and moves the physical cursor (refuses if the cursor doesn't reach the target, i.e. @@ -1159,12 +1177,15 @@ def os_calibrate(): } -def os_selftest(verbose=True): +def os_selftest(verbose=True, stress=True): """Prove (or disprove) that the OS-injection path actually closes T1: drive REAL OS moves (human_move_os) through the same overlay probe human_selftest uses, then read getCoalescedEvents() on the resulting pointermoves. coalesced_len_max > 1 means real - coalescing happened — the tell CDP cannot close is closed. Requires pyobjc + - Accessibility + foreground; moves the physical cursor. Run on a normal http(s) page. + coalescing happened — the tell CDP cannot close is closed. By default the probe adds + a short renderer stress loop because unstressed Chrome can legitimately deliver + getCoalescedEvents()==1 for both real and synthetic moves; under stress, the + compositor-vs-CDP path difference is deterministic. Requires pyobjc + Accessibility + + foreground; moves the physical cursor. Run on a normal http(s) page. 
""" ok, reason = os_input_available() if not ok: @@ -1174,6 +1195,8 @@ def os_selftest(verbose=True): _eval(_PROBE_JS) data = {"moves": []} try: + if stress: + _eval(_OS_STRESS_JS) human_move_os(int(w * 0.35), int(h * 0.45)) human_move_os(int(w * 0.65), int(h * 0.55)) time.sleep(0.30) @@ -1194,6 +1217,7 @@ def os_selftest(verbose=True): "coalesced_len_max": max(cl) if cl else 0, "t1_coalesced_closed": bool(cl) and max(cl) > 1, "screen_client_max_delta_px": max(deltas) if deltas else 0, + "stress": bool(stress), } if verbose: print("=== os_selftest (REAL OS input via CGEvent) ===") diff --git a/tests/unit/test_daemon.py b/tests/unit/test_daemon.py index 90c5bc85..ae96422f 100644 --- a/tests/unit/test_daemon.py +++ b/tests/unit/test_daemon.py @@ -21,12 +21,13 @@ def _fresh_daemon(): return d -def test_set_session_enables_all_four_default_domains_on_new_session(): +def test_set_session_enables_default_domains_on_new_session(): """Regression: switch_tab() / new_tab() in helpers.py route through the `set_session` IPC, which previously only enabled Page on the new session. With Network disabled, wait_for_network_idle() silently stops - receiving events after a tab switch. Initial attach enables all four - (Page, DOM, Runtime, Network); set_session must enable the same set.""" + receiving events after a tab switch. Initial attach enables the default + domains (Page, DOM, Network); set_session must enable the same set. 
+ Runtime.enable is intentionally omitted unless BH_CDP_ENABLE_RUNTIME=1.""" d = _fresh_daemon() new_session = "session-AFTER-switch" @@ -40,8 +41,8 @@ def test_set_session_enables_all_four_default_domains_on_new_session(): method for (method, _params, sid) in d.cdp.calls if sid == new_session and method.endswith(".enable") ] - assert set(enabled_on_new) == {"Page.enable", "DOM.enable", "Runtime.enable", "Network.enable"}, ( - f"set_session must enable Page/DOM/Runtime/Network on the new session " + assert set(enabled_on_new) == {"Page.enable", "DOM.enable", "Network.enable"}, ( + f"set_session must enable Page/DOM/Network on the new session " f"(parity with initial attach). Got: {enabled_on_new}" ) assert d.session == new_session @@ -84,10 +85,21 @@ async def send_raw(self, method, params=None, session_id=None): attempted = [m for (m, _p, _s) in d.cdp.calls] assert "Page.enable" in attempted assert "DOM.enable" in attempted # attempted, but raised - assert "Runtime.enable" in attempted assert "Network.enable" in attempted +def test_enable_default_domains_can_opt_into_runtime(monkeypatch): + """Runtime.enable is off by default to avoid console-serialization CDP tells, + but remains available behind an explicit compatibility flag.""" + monkeypatch.setenv("BH_CDP_ENABLE_RUNTIME", "1") + d = _fresh_daemon() + + asyncio.run(d._enable_default_domains("session-X")) + + attempted = [m for (m, _p, _s) in d.cdp.calls] + assert {"Page.enable", "DOM.enable", "Runtime.enable", "Network.enable"}.issubset(attempted) + + def test_set_session_disables_network_on_old_session_before_enabling_new(): """When switching tabs, the previous session's Network domain must be disabled so background tabs (polling, SSE, etc.) 
stop emitting events @@ -140,12 +152,12 @@ def test_set_session_does_not_disable_network_when_no_previous_session(): def test_set_session_runs_disable_and_enables_in_parallel(): - """The four Domain.enable calls (plus Network.disable on the old session) + """The default Domain.enable calls (plus Network.disable on the old session) must run concurrently via asyncio.gather, not sequentially. With the old sequential code, helpers.switch_tab() would block in _send() for up to ~22s on a slow/remote daemon while the helper's IPC socket has a 5s read timeout, causing client-side socket timeouts. Verifying that all - five CDP calls reach send_raw before any returns proves parallelization.""" + CDP calls reach send_raw before any returns proves parallelization.""" class _ConcurrencyProbeCDP: def __init__(self): self.calls = [] @@ -178,8 +190,8 @@ async def run(): # in-flight. Cap iterations to avoid hanging if parallelization breaks. for _ in range(50): await asyncio.sleep(0) - # 5 = Network.disable on OLD + 4 enables on NEW. - if d.cdp.in_flight >= 5: + # 4 = Network.disable on OLD + 3 default enables on NEW. + if d.cdp.in_flight >= 4: break peak = d.cdp.max_concurrent d.cdp.release.set() @@ -187,20 +199,20 @@ async def run(): return peak, d.cdp.calls peak, calls = asyncio.run(run()) - assert peak == 5, ( - f"set_session must run disable + 4 enables concurrently via gather " - f"(observed peak in-flight = {peak}; expected 5 = 1 disable on OLD + " - f"4 enables on NEW). Sequential await would peak at 1." + assert peak == 4, ( + f"set_session must run disable + 3 default enables concurrently via gather " + f"(observed peak in-flight = {peak}; expected 4 = 1 disable on OLD + " + f"3 enables on NEW). Sequential await would peak at 1." ) # Sanity: the right calls were made. 
methods = sorted({m for (m, _p, _s) in calls}) assert "Network.disable" in methods - assert {"Page.enable", "DOM.enable", "Runtime.enable", "Network.enable"}.issubset(methods) + assert {"Page.enable", "DOM.enable", "Network.enable"}.issubset(methods) -def test_set_session_first_attach_runs_four_enables_in_parallel(): +def test_set_session_first_attach_runs_default_enables_in_parallel(): """When there's no previous session, the disable path is skipped — only - the four enables run, still in parallel.""" + the default enables run, still in parallel.""" class _ConcurrencyProbeCDP: def __init__(self): self.calls = [] @@ -231,7 +243,7 @@ async def run(): })) for _ in range(50): await asyncio.sleep(0) - if d.cdp.in_flight >= 4: + if d.cdp.in_flight >= 3: break peak = d.cdp.max_concurrent d.cdp.release.set() @@ -239,8 +251,8 @@ async def run(): return peak peak = asyncio.run(run()) - assert peak == 4, ( - f"first set_session must run 4 enables concurrently " + assert peak == 3, ( + f"first set_session must run 3 default enables concurrently " f"(observed peak = {peak}). No Network.disable should fire." 
) diff --git a/tests/unit/test_human_behavior.py b/tests/unit/test_human_behavior.py index 6e8b9a9c..87d86d68 100644 --- a/tests/unit/test_human_behavior.py +++ b/tests/unit/test_human_behavior.py @@ -478,10 +478,10 @@ class _FakeQuartz: kCGHIDEventTap = 0 kCGMouseEventClickState = 1 - def __init__(self, display=(0.0, 0.0, 3000.0, 3000.0), move_cursor=True): + def __init__(self, display=(0.0, 0.0, 3000.0, 3000.0), displays=None, move_cursor=True): self.posted = [] self.cursor = _FakeCGPoint(50.0, 50.0) - self._display = display + self._displays = list(displays or [display]) self._move_cursor = move_cursor def CGEventCreate(self, src): @@ -503,10 +503,11 @@ def CGEventPost(self, tap, e): self.cursor = _FakeCGPoint(e["x"], e["y"]) # model the real cursor moving def CGGetActiveDisplayList(self, maxd, a, b): - return (0, [1], 1) + ids = list(range(1, len(self._displays) + 1)) + return (0, ids[:maxd], min(len(ids), maxd)) def CGDisplayBounds(self, did): - return _FakeRect(*self._display) + return _FakeRect(*self._displays[int(did) - 1]) def test_os_input_available_without_quartz(): @@ -529,6 +530,24 @@ def test_os_screen_point_mapping(): ah._eval = orig +def test_os_selftest_has_renderer_stress_probe_for_coalescing_proof(): + # Unstressed Chrome can legitimately report getCoalescedEvents()==1 even for + # real OS moves. The self-test uses transient renderer load to prove the + # compositor path without changing the actual movement model. 
+ assert "__bh_os_stress_stop" in ah._OS_STRESS_JS + assert "performance.now()" in ah._OS_STRESS_JS + + +def test_within_displays_accepts_secondary_monitor_bounds(): + fake = _FakeQuartz(displays=[ + (0.0, 0.0, 1728.0, 1117.0), + (-1440.0, 100.0, 1440.0, 900.0), + ]) + assert ah._within_displays(fake, 100.0, 100.0) is True + assert ah._within_displays(fake, -100.0, 200.0) is True + assert ah._within_displays(fake, -1500.0, 200.0) is False + + def test_human_click_os_posts_real_event_sequence(): import sys as _sys if _sys.platform != "darwin": From 306155fc705eb5a2b133170e6bb433ff9df00646 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?thesqrldev=F0=9F=90=BF=EF=B8=8F?= Date: Sun, 31 May 2026 19:20:46 +0900 Subject: [PATCH 9/9] Keep OS cursor verification reliable under stress The stress-mode coalescing proof can make CGEventPost completion observable before the WindowServer has settled the final cursor location. The OS path now re-posts the final landing point with a short settle loop before treating the run as an Accessibility failure. Constraint: live os_selftest(stress=True) once read the cursor 31px from target immediately after trajectory completion. Rejected: Loosen the 4px verification threshold | would hide wrong-monitor or Accessibility failures instead of waiting for real cursor arrival. Confidence: high Scope-risk: narrow Directive: Keep the final-arrival check strict; add settling, not broad tolerance, when WindowServer timing is the issue. Tested: python3 tests/unit/test_human_behavior.py; python3 tests/unit/test_daemon_input_sequence.py; uv run --with pytest pytest tests/unit/test_admin.py tests/unit/test_helpers.py tests/unit/test_daemon.py tests/unit/test_run.py -q; python3 -m py_compile agent-workspace/agent_helpers.py src/browser_harness/daemon.py src/browser_harness/helpers.py; live os_calibrate ok error_px [0.0,0]; live os_selftest stress repeats coalesced_len_max 2/3/2; live CDP stress coalesced_len_max 1. 
Not-tested: live multi-monitor hardware; current machine exposes one active display only. --- agent-workspace/agent_helpers.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/agent-workspace/agent_helpers.py b/agent-workspace/agent_helpers.py index e7fced74..66cd8250 100644 --- a/agent-workspace/agent_helpers.py +++ b/agent-workspace/agent_helpers.py @@ -1093,12 +1093,19 @@ def _os_goto(s, Quartz, tx, ty): Accessibility is not granted (CGEventPost silently drops), so raise instead of returning a false success.""" _os_trajectory(s, Quartz, _os_cursor(Quartz), (tx, ty)) - pos = _os_cursor(Quartz) - px, py = (pos.x, pos.y) if hasattr(pos, "x") else (pos[0], pos[1]) - if abs(px - tx) > 4 or abs(py - ty) > 4: - raise RuntimeError("OS cursor did not reach target (got %.0f,%.0f want %.0f,%.0f) — grant " - "Accessibility to your terminal/python in System Settings > Privacy & " - "Security > Accessibility" % (px, py, tx, ty)) + px = py = None + for _ in range(3): + # CGEventPost can be asynchronous under renderer/WindowServer load. Re-post + # the final landing point and let the OS settle before declaring failure. + _os_post_move(Quartz, tx, ty) + time.sleep(0.03) + pos = _os_cursor(Quartz) + px, py = (pos.x, pos.y) if hasattr(pos, "x") else (pos[0], pos[1]) + if abs(px - tx) <= 4 and abs(py - ty) <= 4: + return + raise RuntimeError("OS cursor did not reach target (got %.0f,%.0f want %.0f,%.0f) — grant " + "Accessibility to your terminal/python in System Settings > Privacy & " + "Security > Accessibility" % (px, py, tx, ty)) def human_move_os(x, y, activate=True, app_name=None):