diff --git a/Dockerfile.base b/Dockerfile.base
index 9e2b8f4..de6cc47 100644
--- a/Dockerfile.base
+++ b/Dockerfile.base
@@ -21,6 +21,10 @@ WORKDIR /openclaw
 ARG OPENCLAW_GIT_REF=main
 RUN git clone --depth 1 --branch "${OPENCLAW_GIT_REF}" https://github.com/openclaw/openclaw.git .
 
+# Patch: fix heartbeat model override ignored (#56788)
+COPY patches/apply-heartbeat-model-fix.sh /tmp/apply-heartbeat-model-fix.sh
+RUN bash /tmp/apply-heartbeat-model-fix.sh
+
 # Patch: relax version requirements for packages using workspace protocol.
 RUN set -eux; \
     find ./extensions -name 'package.json' -type f | while read -r f; do \
@@ -29,6 +33,11 @@ RUN set -eux; \
     done
 
 RUN pnpm install --no-frozen-lockfile
+
+# Patch: move context files to after cache boundary (0% → ~80% cache hit rate)
+COPY patches/apply-prompt-cache-context-files-fix.sh /tmp/apply-prompt-cache-context-files-fix.sh
+RUN bash /tmp/apply-prompt-cache-context-files-fix.sh
+
 RUN pnpm build
 ENV OPENCLAW_PREFER_PNPM=1
 RUN pnpm ui:install && pnpm ui:build
diff --git a/patches/apply-heartbeat-model-fix.sh b/patches/apply-heartbeat-model-fix.sh
new file mode 100644
index 0000000..d82fa2b
--- /dev/null
+++ b/patches/apply-heartbeat-model-fix.sh
@@ -0,0 +1,229 @@
+#!/usr/bin/env bash
+# Fix: heartbeat model override ignored (#56788)
+# Applies changes from upstream PRs #57094 and #57076 (both unmerged as of 2026-04-05).
+#
+# Root causes (4 loss points in the model resolution chain):
+#   1. runtime-system.ts strips heartbeat.model when forwarding to runHeartbeatOnceInternal
+#   2. live-model-switch.ts ignores caller-provided defaults, uses config default instead
+#   3. model-fallback.ts swallows LiveSessionModelSwitchError as candidate failure
+#   4. get-reply.ts unconditionally overwrites heartbeat model after directive resolution
+#
+# Every step is best-effort: a missing file or a pattern that no longer matches is
+# reported as a WARNING and counted as skipped instead of aborting the build.
+# Set PATCH_STRICT=1 to turn any skipped file into a failure (exit 1).
+set -euo pipefail
+
+echo "[patch] Applying heartbeat model override fix (#56788)..."
+echo "[patch] Based on upstream PRs #57094 + #57076"
+
+APPLIED=0
+SKIPPED=0
+
+# Prints a checksum of file $1; taken before a step so a no-op can be detected after it.
+file_sum() {
+  cksum < "$1"
+}
+
+# Counts file $1 as patched if its checksum differs from $2, otherwise as skipped.
+report_patch() {
+  if [ "$(file_sum "$1")" != "$2" ]; then
+    echo "[patch] Fixed $1"
+    APPLIED=$((APPLIED + 1))
+  else
+    echo "[patch] WARNING: no pattern matched in $1 (upstream source may have changed)"
+    SKIPPED=$((SKIPPED + 1))
+  fi
+}
+
+# Counts a target file $1 that is absent from the checkout as skipped.
+report_missing() {
+  echo "[patch] WARNING: $1 not found"
+  SKIPPED=$((SKIPPED + 1))
+}
+
+# Shared prelude for the node-based steps: reads the target file into "code" and
+# defines patch(), which warns on stderr when a replacement anchor is missing.
+JS_PRELUDE='
+const fs = require("fs");
+const file = process.argv[1];
+let code = fs.readFileSync(file, "utf8");
+function patch(label, pattern, replacement) {
+  const next = code.replace(pattern, replacement);
+  if (next === code) {
+    console.error("[patch] WARNING: step " + label + " did not match in " + file);
+  }
+  code = next;
+}
+'
+
+# ── Fix 1: runtime-system.ts — pass model field through (#57076) ────────────
+FILE="src/plugins/runtime/runtime-system.ts"
+if [ -f "$FILE" ]; then
+  BEFORE="$(file_sum "$FILE")"
+  perl -i -pe 's/heartbeat: heartbeat \? \{ target: heartbeat\.target \} : undefined/heartbeat: heartbeat ? { target: heartbeat.target, model: heartbeat.model } : undefined/' "$FILE"
+  report_patch "$FILE" "$BEFORE"
+else
+  report_missing "$FILE"
+fi
+
+# ── Fix 1b: types-core.ts — add model to heartbeat type (#57076) ───────────
+FILE="src/plugins/runtime/types-core.ts"
+if [ -f "$FILE" ]; then
+  BEFORE="$(file_sum "$FILE")"
+  perl -i -pe 's/heartbeat\?: \{ target\?: string \}/heartbeat?: { target?: string; model?: string }/' "$FILE"
+  report_patch "$FILE" "$BEFORE"
+else
+  report_missing "$FILE"
+fi
+
+# ── Fix 2: live-model-switch.ts — prefer caller-provided defaults (#57076) ──
+FILE="src/agents/live-model-switch.ts"
+if [ -f "$FILE" ]; then
+  BEFORE="$(file_sum "$FILE")"
+  perl -0777 -i -pe 's{
+    const\s+defaultModelRef\s*=\s*agentId\s*\n\s*\?\s*resolveDefaultModelForAgent\(\{\s*\n\s*cfg,\s*\n\s*agentId,\s*\n\s*\}\)\s*\n\s*:\s*\{\s*provider:\s*params\.defaultProvider,\s*model:\s*params\.defaultModel\s*\};
+}{const defaultModelRef =
+    params.defaultProvider \&\& params.defaultModel
+      ? { provider: params.defaultProvider, model: params.defaultModel }
+      : agentId
+        ? resolveDefaultModelForAgent({ cfg, agentId })
+        : { provider: params.defaultProvider, model: params.defaultModel };}xms' "$FILE"
+  report_patch "$FILE" "$BEFORE"
+else
+  report_missing "$FILE"
+fi
+
+# ── Fix 3: model-fallback.ts — rethrow LiveSessionModelSwitchError (#57094) ─
+FILE="src/agents/model-fallback.ts"
+if [ -f "$FILE" ]; then
+  BEFORE="$(file_sum "$FILE")"
+  # Use node for complex multi-site patching — safer than nested perl
+  node -e "${JS_PRELUDE}"'
+// 3a: Add isLiveSessionModelSwitchError export after the log line
+const checkFn = `
+
+/**
+ * Structural check for LiveSessionModelSwitchError that works across
+ * module-boundary duplicates where instanceof would fail.
+ */
+export function isLiveSessionModelSwitchError(err) {
+  return (
+    typeof err === "object" &&
+    err !== null &&
+    err.name === "LiveSessionModelSwitchError" &&
+    typeof err.provider === "string" &&
+    typeof err.model === "string"
+  );
+}`;
+
+patch(
+  "3a",
+  /const log = createSubsystemLogger\("model-fallback"\);/,
+  `const log = createSubsystemLogger("model-fallback");${checkFn}`
+);
+
+// 3b: Add rethrowLiveSwitch to runFallbackCandidate params and catch block.
+// The match ends at the closing brace of the params type, so the replacement ends
+// there too. Generic parameter lists are optional in the pattern.
+patch(
+  "3b-params",
+  /(async function runFallbackCandidate(?:<[^>]*>)?\(params: \{\n(\s+)run: ModelFallbackRunFn(?:<[^>]*>)?;\n\s+provider: string;\n\s+model: string;\n\s+options\?: ModelFallbackRunOptions;)\n\}/,
+  "$1\n$2rethrowLiveSwitch?: boolean;\n}"
+);
+
+// Add rethrow before the normalize line
+patch(
+  "3b-catch",
+  /([ \t]*\} catch \(err\) \{\n)([ \t]*)(\/\/ Normalize abort-wrapped rate-limit errors)/,
+  "$1$2if (params.rethrowLiveSwitch && isLiveSessionModelSwitchError(err)) {\n$2  throw err;\n$2}\n$2$3"
+);
+
+// 3c: Add rethrowLiveSwitch to runFallbackAttempt params and passthrough
+patch(
+  "3c-params",
+  /(async function runFallbackAttempt(?:<[^>]*>)?\(params: \{\n(\s+)run: ModelFallbackRunFn(?:<[^>]*>)?;\n\s+provider: string;\n\s+model: string;\n\s+attempts: FallbackAttempt\[\];\n\s+options\?: ModelFallbackRunOptions;)\n\}/,
+  "$1\n$2rethrowLiveSwitch?: boolean;\n}"
+);
+
+patch(
+  "3c-call",
+  /(const runResult = await runFallbackCandidate(?:<[^>]*>)?\(\{\n(\s+)run: params\.run,\n\s+provider: params\.provider,\n\s+model: params\.model,\n\s+options: params\.options,\n)(\s+\}\);)/,
+  "$1$2rethrowLiveSwitch: params.rethrowLiveSwitch,\n$3"
+);
+
+// 3d: Add rethrowLiveSwitch to runWithModelFallback signature.
+// Anchored on the onError field and the start of the return type only, so the
+// match does not depend on how the returned Promise is parameterised.
+patch(
+  "3d",
+  /([ \t]*)(onError\?: ModelFallbackErrorHandler;\n)(\}\): Promise)/,
+  "$1$2$1rethrowLiveSwitch?: boolean;\n$3"
+);
+
+// 3e: Pass rethrowLiveSwitch in first runFallbackAttempt call (with options: runOptions)
+patch(
+  "3e",
+  /(const attemptRun = await runFallbackAttempt(?:<[^>]*>)?\(\{\n(\s+)run: params\.run,\n\s+\.\.\.candidate,\n\s+attempts,\n\s+options: runOptions,\n)(\s+\}\);)/,
+  "$1$2rethrowLiveSwitch: params.rethrowLiveSwitch,\n$3"
+);
+
+fs.writeFileSync(file, code, "utf8");
+' "$FILE"
+  report_patch "$FILE" "$BEFORE"
+else
+  report_missing "$FILE"
+fi
+
+# ── Fix 4: agent-runner-execution.ts — structural check + rethrowLiveSwitch (#57094) ─
+FILE="src/auto-reply/reply/agent-runner-execution.ts"
+if [ -f "$FILE" ]; then
+  BEFORE="$(file_sum "$FILE")"
+  node -e "${JS_PRELUDE}"'
+// 4a: Replace import — remove LiveSessionModelSwitchError, add isLiveSessionModelSwitchError
+patch(
+  "4a-remove-import",
+  /import \{ LiveSessionModelSwitchError \} from "\.\.\/\.\.\/agents\/live-model-switch-error\.js";\n/,
+  ""
+);
+patch(
+  "4a-add-import",
+  /import \{ runWithModelFallback, isFallbackSummaryError \} from "\.\.\/\.\.\/agents\/model-fallback\.js";/,
+  `import {\n  runWithModelFallback,\n  isFallbackSummaryError,\n  isLiveSessionModelSwitchError,\n} from "../../agents/model-fallback.js";`
+);
+
+// 4b: Pass rethrowLiveSwitch: true to runWithModelFallback
+patch(
+  "4b",
+  /\.\.\.resolveModelFallbackOptions\(params\.followupRun\.run\),\n(\s+)runId,/,
+  `...resolveModelFallbackOptions(params.followupRun.run),\n$1runId,\n$1rethrowLiveSwitch: true,`
+);
+
+// 4c: Replace instanceof check with structural check
+patch(
+  "4c",
+  /if \(err instanceof LiveSessionModelSwitchError\) \{/g,
+  "if (isLiveSessionModelSwitchError(err)) {"
+);
+
+fs.writeFileSync(file, code, "utf8");
+' "$FILE"
+  report_patch "$FILE" "$BEFORE"
+else
+  report_missing "$FILE"
+fi
+
+# ── Fix 5: get-reply.ts — guard post-directive model overwrite (#57076) ─────
+FILE="src/auto-reply/reply/get-reply.ts"
+if [ -f "$FILE" ]; then
+  BEFORE="$(file_sum "$FILE")"
+  perl -0777 -i -pe 's{(\} = directiveResult\.result;\n)([ \t]*)provider = resolvedProvider;\n[ \t]*model = resolvedModel;}{${1}${2}if (!hasResolvedHeartbeatModelOverride) \{\n${2}  provider = resolvedProvider;\n${2}  model = resolvedModel;\n${2}\}}s' "$FILE"
+  report_patch "$FILE" "$BEFORE"
+else
+  report_missing "$FILE"
+fi
+
+echo "[patch] Heartbeat model override fix finished: $APPLIED file(s) patched, $SKIPPED skipped."
+if [ "$SKIPPED" -gt 0 ] && [ "${PATCH_STRICT:-0}" = "1" ]; then
+  echo "[patch] ERROR: PATCH_STRICT=1 and $SKIPPED target(s) were not patched" >&2
+  exit 1
+fi
diff --git a/patches/apply-prompt-cache-context-files-fix.sh b/patches/apply-prompt-cache-context-files-fix.sh
new file mode 100644
index 0000000..005b41e
--- /dev/null
+++ b/patches/apply-prompt-cache-context-files-fix.sh
@@ -0,0 +1,71 @@
+#!/usr/bin/env bash
+# Fix: 0% prompt cache hit rate — context files invalidate stable prefix
+#
+# Root cause: context files (MEMORY.md, SOUL.md, USER.md, etc.) are embedded in the
+# STABLE PREFIX (the part of the system prompt emitted before
+# SYSTEM_PROMPT_CACHE_BOUNDARY). When the agent writes to MEMORY.md during a turn,
+# the stable prefix changes on the next call → cache miss.
+#
+# Fix: Move the "# Project Context" section from before to after the cache boundary.
+# Impact: cache hit rate goes from ~0% to ~80-90%.
+set -euo pipefail
+
+echo "[patch] Applying prompt cache context files fix..."
+
+FILE="src/agents/system-prompt.ts"
+if [ ! -f "$FILE" ]; then
+  echo "[patch] ERROR: $FILE not found"
+  exit 1
+fi
+
+python3 - "$FILE" << 'PYEOF'
+import re, sys
+
+path = sys.argv[1]
+
+# Read and write as UTF-8 explicitly so the result does not depend on the build locale.
+with open(path, encoding="utf-8") as f:
+    code = f.read()
+
+# The block to move — starts at the "const contextFiles" line and ends just before
+# the "// Skip silent replies" line. Indentation is matched loosely on purpose.
+BLOCK_PATTERN = r'^[ \t]*const contextFiles = params\.contextFiles.*?\n(?=[ \t]*// Skip silent replies for subagent)'
+match = re.search(BLOCK_PATTERN, code, re.DOTALL | re.MULTILINE)
+if not match:
+    print("[patch] ERROR: Could not find context files block. Source may have changed.")
+    sys.exit(1)
+
+context_block = match.group(0)
+print(f"[patch] Found context files block ({len(context_block)} chars)")
+
+# Step 1: Remove it from before the cache boundary (only the matched occurrence)
+code = code[:match.start()] + code[match.end():]
+
+# Step 2: Find the cache boundary push and insert the block after it
+boundary = re.search(r'^([ \t]*)lines\.push\(SYSTEM_PROMPT_CACHE_BOUNDARY\);\n', code, re.MULTILINE)
+if not boundary:
+    print("[patch] ERROR: Could not find cache boundary push line.")
+    sys.exit(1)
+
+# Nothing to do when the block already sits after the boundary; leave the file untouched.
+if boundary.start() < match.start():
+    print("[patch] Context files block is already after the cache boundary; nothing to do.")
+    sys.exit(0)
+
+indent = boundary.group(1)
+INSERTION = (
+    boundary.group(0)
+    + "\n"
+    + indent + "// Context files (MEMORY.md, SOUL.md, etc.) are placed AFTER the cache boundary\n"
+    + indent + "// so that agent memory writes between turns do not invalidate the cached stable prefix.\n"
+    + context_block
+)
+
+code = code[:boundary.start()] + INSERTION + code[boundary.end():]
+
+with open(path, "w", encoding="utf-8") as f:
+    f.write(code)
+
+print("[patch] Moved context files block to after cache boundary.")
+print("[patch] Impact: cache hit rate ~0% → ~80-90%.")
+PYEOF
diff --git a/scripts/entrypoint.sh b/scripts/entrypoint.sh
index 864eb0c..a18ad5d 100644
--- a/scripts/entrypoint.sh
+++ b/scripts/entrypoint.sh
@@ -162,12 +162,12 @@ HOOKS_LOCATION_BLOCK=""
 if [ -n "$HOOKS_PATH" ]; then
     HOOKS_LOCATION_BLOCK="location ${HOOKS_PATH} {
         proxy_pass http://127.0.0.1:${GATEWAY_PORT};
-        proxy_set_header Authorization \\\$http_authorization;
+        proxy_set_header Authorization \$http_authorization;
 
-        proxy_set_header Host \\\$host;
-        proxy_set_header X-Real-IP \\\$remote_addr;
-        proxy_set_header X-Forwarded-For \\\$proxy_add_x_forwarded_for;
-        proxy_set_header X-Forwarded-Proto \\\$scheme;
+        proxy_set_header Host \$host;
+        proxy_set_header X-Real-IP \$remote_addr;
+        proxy_set_header X-Forwarded-For \$proxy_add_x_forwarded_for;
+        proxy_set_header X-Forwarded-Proto \$scheme;
 
         proxy_http_version 1.1;
 
@@ -302,6 +302,76 @@ nginx
 rm -f /tmp/openclaw-gateway.lock 2>/dev/null || true
 rm -f "$STATE_DIR/gateway.lock" 2>/dev/null || true
 
+# ── LINE webhook route hot-reload workaround ─────────────────────────────────
+# Bug: openclaw/openclaw#49803 — LINE (and other webhook-based channels) return
+# 404 on cold start because the bundler splits runtime.ts into two chunks. The
+# chunk that registers LINE routes and the chunk the HTTP server uses to look up
+# routes both initialise the same global "route registry" object — but whichever
+# chunk runs first creates the object without the fields the other chunk expects,
+# so they end up referencing different registries. Result: routes are registered
+# successfully (no error logged) but the HTTP server can't find them → 404.
+#
+# After a config hot-reload the channel restarts and re-registers routes into the
+# correct registry, so the 404 goes away. The workaround below triggers that
+# hot-reload automatically 20 seconds after the gateway starts.
+#
+# How it works:
+#   1. Waits 20 s for the gateway to fully start and load all channel plugins.
+#   2. Writes a temporary "_reloadTs" field into openclaw.json — the gateway's
+#      file-watcher sees the change and hot-reloads the LINE channel.
+#   3. Removes the temporary field 5 s later (clean-up).
+#
+# This is a background process — it does NOT block gateway startup.
+# Remove this block once openclaw/openclaw#49803 is fixed upstream and the
+# upstream fix is merged and shipped in a release we build from.
+#
+# Upstream PRs tracking the real fix:
+#   - openclaw/openclaw#53642 (bundle-split registry mismatch fix)
+#   - openclaw/openclaw#54686 (syncPluginRegistry at runtime boundaries)
+CONFIG_FILE="${STATE_DIR}/openclaw.json"
+
+# Sets ("set") or removes ("clear") the temporary _reloadTs field in $CONFIG_FILE.
+# Returns non-zero, with the reason on stderr, when the config cannot be rewritten.
+toggle_reload_marker() {
+  node -e "
+    const fs = require('fs');
+    const [action, file] = process.argv.slice(1);
+    try {
+      const c = JSON.parse(fs.readFileSync(file, 'utf8'));
+      if (action === 'set') {
+        c._reloadTs = Date.now();
+      } else {
+        delete c._reloadTs;
+      }
+      fs.writeFileSync(file, JSON.stringify(c, null, 2));
+    } catch (e) {
+      console.error('[entrypoint] WARNING: could not update ' + file + ': ' + e.message);
+      process.exit(1);
+    }
+  " "$1" "$CONFIG_FILE"
+}
+
+(
+  sleep 20
+  if [ -f "$CONFIG_FILE" ]; then
+    echo "[entrypoint] applying LINE webhook route hot-reload workaround (#49803)..."
+    # Add a temporary field to trigger the file-watcher hot-reload
+    if toggle_reload_marker set; then
+      sleep 5
+      # Remove the temporary field
+      if toggle_reload_marker clear; then
+        echo "[entrypoint] LINE webhook route hot-reload complete."
+      else
+        echo "[entrypoint] WARNING: temporary _reloadTs field was not removed from $CONFIG_FILE"
+      fi
+    else
+      echo "[entrypoint] WARNING: LINE hot-reload workaround skipped — config could not be rewritten"
+    fi
+  else
+    echo "[entrypoint] WARNING: config not found at $CONFIG_FILE — skipping LINE hot-reload workaround"
+  fi
+) &
+
 # ── Start openclaw gateway ───────────────────────────────────────────────────
 echo "[entrypoint] starting openclaw gateway on port $GATEWAY_PORT..."
 