diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 000000000..1d2fc8319 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,2 @@ +*.sh text eol=lf + diff --git a/.gitignore b/.gitignore index fad199338..04ec53c91 100644 --- a/.gitignore +++ b/.gitignore @@ -57,4 +57,8 @@ terraform.tfvars test/e2e/.dev.vars # Temporary e2e wrangler configs -.wrangler-e2e-*.jsonc \ No newline at end of file +.wrangler-e2e-*.jsonc + +# Local PR / helper docs (not part of the repo) +.pr-error-info.md +PR-DESCRIPTION.md \ No newline at end of file diff --git a/AGENTS.md b/AGENTS.md index ec30d2c18..2ae6a5294 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -246,6 +246,16 @@ npx wrangler secret list Enable debug routes with `DEBUG_ROUTES=true` and check `/debug/processes`. +## Troubleshooting + +- **Health OK but no reply from agent / Control UI hangs** + - Check `GET /api/status`: `gatewayProcess.status`, `gatewayProcess.exitCode`, and `lastStderrPreview` (if present) for the last failed gateway run. + - Check `GET /debug/processes?logs=true` or `GET /debug/processes?logs=true&failed=1` for full stderr of gateway/start-openclaw (or start-moltbot) processes. + - Run `npx wrangler tail` and look for `[WS] close` / `[WS] error` JSON lines (code, reason, side) when reproducing the issue. + +- **Gateway exits with code 126** + - Usually caused by the script not being executable or by CRLF line endings in `start-openclaw.sh` (or `start-moltbot.sh`). Ensure the Dockerfile runs `tr -d '\r'` on the script and `chmod 755` before use. Keep `*.sh` as LF in `.gitattributes`. + ## R2 Storage Notes R2 is mounted via s3fs at `/data/moltbot`. 
Important gotchas: diff --git a/src/index.ts b/src/index.ts index 3a615dd61..e34518598 100644 --- a/src/index.ts +++ b/src/index.ts @@ -385,18 +385,14 @@ app.all('*', async (c) => { } }); - // Handle close events + // Handle close events (structured log for wrangler tail / debugging) serverWs.addEventListener('close', (event) => { - if (debugLogs) { - console.log('[WS] Client closed:', event.code, event.reason); - } + console.error('[WS] close', JSON.stringify({ side: 'client', code: event.code, reason: event.reason || '(none)' })); containerWs.close(event.code, event.reason); }); containerWs.addEventListener('close', (event) => { - if (debugLogs) { - console.log('[WS] Container closed:', event.code, event.reason); - } + console.error('[WS] close', JSON.stringify({ side: 'container', code: event.code, reason: event.reason || '(none)' })); // Transform the close reason (truncate to 123 bytes max for WebSocket spec) let reason = transformErrorMessage(event.reason, url.host); if (reason.length > 123) { @@ -407,15 +403,14 @@ app.all('*', async (c) => { } serverWs.close(event.code, reason); }); - - // Handle errors + // Handle errors (structured log for wrangler tail / debugging) serverWs.addEventListener('error', (event) => { - console.error('[WS] Client error:', event); + console.error('[WS] error', JSON.stringify({ side: 'client', message: event instanceof ErrorEvent ? event.message : String(event) })); containerWs.close(1011, 'Client error'); }); containerWs.addEventListener('error', (event) => { - console.error('[WS] Container error:', event); + console.error('[WS] error', JSON.stringify({ side: 'container', message: event instanceof ErrorEvent ? 
event.message : String(event) })); serverWs.close(1011, 'Container error'); }); diff --git a/src/routes/debug.ts b/src/routes/debug.ts index 8ffc05bfb..c6f65622d 100644 --- a/src/routes/debug.ts +++ b/src/routes/debug.ts @@ -1,6 +1,7 @@ import { Hono } from 'hono'; import type { AppEnv } from '../types'; -import { findExistingMoltbotProcess, waitForProcess } from '../gateway'; +import { ensureMoltbotGateway, findExistingMoltbotProcess, waitForProcess } from '../gateway'; +import { sanitizeStderr } from '../utils/sanitize'; /** * Debug routes for inspecting container state @@ -9,6 +10,49 @@ import { findExistingMoltbotProcess, waitForProcess } from '../gateway'; */ const debug = new Hono(); +// GET /debug/start-gateway - Force start the Moltbot gateway (for triggering 126 logs in wrangler tail) +debug.get('/start-gateway', async (c) => { + const sandbox = c.get('sandbox'); + try { + const process = await ensureMoltbotGateway(sandbox, c.env); + return c.json({ + success: true, + processId: process.id, + status: process.status, + message: 'Gateway started successfully', + }); + } catch (err) { + const errorMessage = err instanceof Error ? err.message : String(err); + // Try to attach logs from the most recent failed start-moltbot process + let lastFailed: Record | null = null; + try { + const processes = await sandbox.listProcesses(); + const starter = processes + .filter(p => (p.command.includes('start-openclaw.sh') || p.command.includes('start-moltbot.sh')) && + (p.status === 'failed' || p.status === 'completed')) + .sort((a, b) => (b.startTime?.getTime() ?? 0) - (a.startTime?.getTime() ?? 
0))[0]; + if (starter) { + const logs = await starter.getLogs(); + lastFailed = { + id: starter.id, + command: starter.command, + status: starter.status, + exitCode: starter.exitCode, + stdout: logs.stdout || '', + stderr: logs.stderr || '', + }; + } + } catch { + // ignore + } + return c.json({ + success: false, + error: errorMessage, + lastFailedProcess: lastFailed, + }, 503); + } +}); + // GET /debug/version - Returns version info from inside the container debug.get('/version', async (c) => { const sandbox = c.get('sandbox'); @@ -36,11 +80,13 @@ debug.get('/version', async (c) => { }); // GET /debug/processes - List all processes with optional logs +// Query: logs=true (include stdout/stderr), failed=1 (only gateway-related failed/completed with non-zero exit) debug.get('/processes', async (c) => { const sandbox = c.get('sandbox'); try { const processes = await sandbox.listProcesses(); const includeLogs = c.req.query('logs') === 'true'; + const failedOnly = c.req.query('failed') === '1'; const processData = await Promise.all( processes.map(async (p) => { @@ -67,28 +113,60 @@ debug.get('/processes', async (c) => { }), ); - // Sort by status (running first, then starting, completed, failed) - // Within each status, sort by startTime descending (newest first) + // Optionally filter to gateway-related failed/completed only + const isGatewayRelated = (d: Record) => { + const cmd = (d.command as string) || ''; + const status = d.status as string; + const exitCode = d.exitCode as number | undefined; + return ( + (cmd.includes('start-openclaw.sh') || cmd.includes('start-moltbot.sh') || + cmd.includes('openclaw gateway') || cmd.includes('clawdbot gateway')) && + !cmd.includes('openclaw devices') && !cmd.includes('clawdbot devices') && + (status === 'failed' || (status === 'completed' && exitCode != null && exitCode !== 0)) + ); + }; + let list = processData; + if (failedOnly) { + list = processData.filter(isGatewayRelated); + } + + // Sort by status (running first, then 
starting, completed, failed), then by startTime descending const statusOrder: Record = { running: 0, starting: 1, completed: 2, failed: 3, }; - - processData.sort((a, b) => { + list.sort((a, b) => { const statusA = statusOrder[a.status as string] ?? 99; const statusB = statusOrder[b.status as string] ?? 99; - if (statusA !== statusB) { - return statusA - statusB; - } - // Within same status, sort by startTime descending + if (statusA !== statusB) return statusA - statusB; const timeA = (a.startTime as string) || ''; const timeB = (b.startTime as string) || ''; return timeB.localeCompare(timeA); }); - return c.json({ count: processes.length, processes: processData }); + // Last failed gateway stderr preview (sanitized) when logs=true or failed=1 + let lastFailedStderrPreview: string | undefined; + if (includeLogs || failedOnly) { + const failedStarter = processes + .filter(p => (p.command.includes('start-openclaw.sh') || p.command.includes('start-moltbot.sh')) && + (p.status === 'failed' || (p.status === 'completed' && p.exitCode != null && p.exitCode !== 0))) + .sort((a, b) => (b.startTime?.getTime() ?? 0) - (a.startTime?.getTime() ?? 0))[0]; + if (failedStarter) { + try { + const logs = await failedStarter.getLogs(); + const stderr = logs.stderr || ''; + if (stderr) lastFailedStderrPreview = sanitizeStderr(stderr, 500); + } catch { + lastFailedStderrPreview = '(failed to retrieve logs)'; + } + } + } + + const payload: Record = { count: list.length, processes: list }; + if (lastFailedStderrPreview != null) payload.lastFailedStderrPreview = lastFailedStderrPreview; + return c.json(payload); } catch (error) { const errorMessage = error instanceof Error ? 
error.message : 'Unknown error'; return c.json({ error: errorMessage }, 500); diff --git a/src/routes/public.ts b/src/routes/public.ts index c2f769c7d..68015adc2 100644 --- a/src/routes/public.ts +++ b/src/routes/public.ts @@ -2,6 +2,7 @@ import { Hono } from 'hono'; import type { AppEnv } from '../types'; import { MOLTBOT_PORT } from '../config'; import { findExistingMoltbotProcess } from '../gateway'; +import { sanitizeStderr } from '../utils/sanitize'; /** * Public routes - NO Cloudflare Access authentication required @@ -30,30 +31,80 @@ publicRoutes.get('/logo-small.png', (c) => { return c.env.ASSETS.fetch(c.req.raw); }); +// GET /favicon.ico - ゲートウェイに流さず 204 で返す(503 を防ぐ) +publicRoutes.get('/favicon.ico', async (c) => { + const res = await c.env.ASSETS.fetch(new Request(new URL('/favicon.ico', c.req.url))); + if (res.ok) return res; + return new Response(null, { status: 204 }); +}); + // GET /api/status - Public health check for gateway status (no auth required) +// デバッグ用: プロセス数・ゲートウェイプロセス状態・exitCode を返す publicRoutes.get('/api/status', async (c) => { const sandbox = c.get('sandbox'); + const debugInfo: Record = { + ok: false, + status: 'unknown', + processId: null as string | null, + processCount: 0, + gatewayProcess: null as { command: string; status: string; exitCode?: number } | null, + hint: '', + }; + try { + const processes = await sandbox.listProcesses(); + debugInfo.processCount = processes.length; + + const gatewayProc = processes.find( + (p) => + (p.command.includes('start-openclaw.sh') || p.command.includes('start-moltbot.sh') || + p.command.includes('openclaw gateway') || p.command.includes('clawdbot gateway')) && + !p.command.includes('openclaw devices') && !p.command.includes('clawdbot devices') + ); + if (gatewayProc) { + debugInfo.gatewayProcess = { + command: gatewayProc.command, + status: gatewayProc.status, + exitCode: gatewayProc.exitCode, + }; + } + const process = await findExistingMoltbotProcess(sandbox); if (!process) { - return c.json({ 
ok: false, status: 'not_running' }); + debugInfo.status = 'not_running'; + debugInfo.hint = gatewayProc?.exitCode != null + ? `Gateway process exited with code ${gatewayProc.exitCode}. Check wrangler tail or /debug/processes?logs=true` + : 'No gateway process. Visit / or /debug/start-gateway to start.'; + if (gatewayProc && (gatewayProc.status === 'failed' || (gatewayProc.status === 'completed' && gatewayProc.exitCode != null && gatewayProc.exitCode !== 0))) { + try { + const logs = await gatewayProc.getLogs(); + const stderr = logs.stderr || ''; + if (stderr) debugInfo.lastStderrPreview = sanitizeStderr(stderr, 300); + } catch { + // ignore + } + } + return c.json(debugInfo); } - // Process exists, check if it's actually responding - // Try to reach the gateway with a short timeout + debugInfo.processId = process.id; + try { await process.waitForPort(18789, { mode: 'tcp', timeout: 5000 }); - return c.json({ ok: true, status: 'running', processId: process.id }); + debugInfo.ok = true; + debugInfo.status = 'running'; + debugInfo.hint = 'Gateway is up. If UI shows "Pairing required", visit /_admin/ to approve this device.'; + return c.json(debugInfo); } catch { - return c.json({ ok: false, status: 'not_responding', processId: process.id }); + debugInfo.status = 'not_responding'; + debugInfo.hint = 'Process exists but port 18789 not responding. Gateway may be starting or crashed.'; + return c.json(debugInfo); } } catch (err) { - return c.json({ - ok: false, - status: 'error', - error: err instanceof Error ? err.message : 'Unknown error', - }); + debugInfo.status = 'error'; + debugInfo.hint = err instanceof Error ? err.message : 'Unknown error'; + return c.json(debugInfo); } }); diff --git a/src/utils/sanitize.ts b/src/utils/sanitize.ts new file mode 100644 index 000000000..1aa97e7ca --- /dev/null +++ b/src/utils/sanitize.ts @@ -0,0 +1,13 @@ +/** + * Sanitize stderr/log text for safe exposure in API responses (redact secrets). 
/**
 * Redaction rules applied, in order, by `sanitizeStderr`.
 * Each entry pairs a global pattern with its replacement string.
 */
const STDERR_REDACTION_RULES: ReadonlyArray<readonly [RegExp, string]> = [
  // Anthropic-style keys. `_` is part of the character class so that a key
  // containing an underscore is not left partially (or wholly) unredacted.
  [/\bsk-ant-[A-Za-z0-9_-]{20,}/g, 'sk-ant-***REDACTED***'],
  // Slack-style tokens (xoxb, xoxp and sibling xox? prefixes); the prefix is
  // kept so the log still shows which kind of token was present.
  [/\b(xox[abeprs])-[A-Za-z0-9-]+/g, '$1-***REDACTED***'],
  // Long opaque identifier immediately followed by "@host.tld".
  [/\b[A-Za-z0-9_-]{20,}@[a-zA-Z]+\.[a-zA-Z]+/g, '***REDACTED***'],
];

/**
 * Sanitize stderr/log text for safe exposure in API responses.
 *
 * Known secret shapes are replaced with a `***REDACTED***` marker first, and
 * only then is the result truncated, so a secret can never survive by being
 * cut in half at the length limit.
 *
 * @param text - Raw stderr/log text. Empty or non-string input yields `''`.
 * @param maxLen - Maximum number of UTF-16 code units to keep before `'...'`
 *   is appended. Negative values are treated as `0`.
 * @returns The redacted text, truncated to `maxLen` with a trailing `'...'`
 *   when it was longer.
 */
export function sanitizeStderr(text: string, maxLen = 500): string {
  if (!text || typeof text !== 'string') return '';

  let out = text;
  for (const [pattern, replacement] of STDERR_REDACTION_RULES) {
    out = out.replace(pattern, replacement);
  }

  // Clamp so a negative limit cannot make slice() count from the end.
  const limit = Math.max(0, Math.floor(maxLen));
  if (out.length > limit) {
    let end = limit;
    // Do not cut between the two halves of a surrogate pair: a lone high
    // surrogate would be emitted as an invalid character in the JSON response.
    if (end > 0) {
      const last = out.charCodeAt(end - 1);
      if (last >= 0xd800 && last <= 0xdbff) end -= 1;
    }
    out = out.slice(0, end) + '...';
  }
  return out;
}