From 655fbe002a4acb44fa9b51205660c57c3758c02e Mon Sep 17 00:00:00 2001 From: Alan Pena Date: Wed, 25 Mar 2026 00:17:46 -0400 Subject: [PATCH 1/3] fix: auto-reset telegram bridge session after lock failure --- scripts/telegram-bridge.js | 44 +++++-- test/telegram-bridge-session-reset.test.js | 126 +++++++++++++++++++++ 2 files changed, 162 insertions(+), 8 deletions(-) create mode 100644 test/telegram-bridge-session-reset.test.js diff --git a/scripts/telegram-bridge.js b/scripts/telegram-bridge.js index c51a5529a..c023c2d6b 100755 --- a/scripts/telegram-bridge.js +++ b/scripts/telegram-bridge.js @@ -140,20 +140,36 @@ function runAgentInSandbox(message, sessionId) { const response = responseLines.join("\n").trim(); if (response) { - resolve(response); + resolve({ response, exitCode: code, stderr }); } else if (code !== 0) { - resolve(`Agent exited with code ${code}. ${stderr.trim().slice(0, 500)}`); + resolve({ response: `Agent exited with code ${code}. ${stderr.trim().slice(0, 500)}`, exitCode: code, stderr }); } else { - resolve("(no response)"); + resolve({ response: "(no response)", exitCode: code, stderr }); } }); proc.on("error", (err) => { - resolve(`Error: ${err.message}`); + resolve({ response: `Error: ${err.message}`, exitCode: 1, stderr: err.message }); }); }); } +// ── Session lock detection ──────────────────────────────────────── + +/** + * Returns true when an agent result indicates a session-file lock collision. + * Only matches the specific lock/corruption class of errors — not general failures. + * Exit code 255 alone is not sufficient (SSH uses it for connection errors too), + * so we require either an explicit lock message in stderr/response, or code 255 + * combined with lock-related output. + */ +function isSessionLockFailure(result) { + const output = `${result.stderr || ""} ${result.response || ""}`; + if (output.includes("session file locked")) return true; + if (result.exitCode === 255 && /lock|session.*corrupt/i.test(output)) return true; + return false; +} + // ── Poll loop ───────────────────────────────────────────────────── async function poll() { @@ -205,10 +221,20 @@ async function poll() { const typingInterval = setInterval(() => sendTyping(chatId), 4000); try { - const response = await runAgentInSandbox(msg.text, chatId); + const result = await runAgentInSandbox(msg.text, chatId); clearInterval(typingInterval); - console.log(`[${chatId}] agent: ${response.slice(0, 100)}...`); - await sendMessage(chatId, response, msg.message_id); + + // Detect session lock failures and auto-reset to prevent corrupted + // context from persisting into subsequent messages (#833). + if (isSessionLockFailure(result)) { + activeSessions.delete(chatId); + console.error(`[${chatId}] session lock failure — session reset`); + await sendMessage(chatId, "⚠️ Session error detected — your session has been automatically reset. Please resend your message.", msg.message_id); + continue; + } + + console.log(`[${chatId}] agent: ${result.response.slice(0, 100)}...`); + await sendMessage(chatId, result.response, msg.message_id); } catch (err) { clearInterval(typingInterval); await sendMessage(chatId, `Error: ${err.message}`, msg.message_id); @@ -249,4 +275,6 @@ async function main() { poll(); } -main(); +if (require.main === module) { + main(); +} diff --git a/test/telegram-bridge-session-reset.test.js b/test/telegram-bridge-session-reset.test.js new file mode 100644 index 000000000..5eadedb24 --- /dev/null +++ b/test/telegram-bridge-session-reset.test.js @@ -0,0 +1,126 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 +// +// Regression tests for #833: session lock failures must auto-reset the session +// so the next message starts with clean context instead of resuming corrupted +// conversation history. + +import fs from "node:fs"; +import path from "node:path"; +import { describe, it, expect } from "vitest"; + +const BRIDGE_SRC = fs.readFileSync( + path.join(import.meta.dirname, "..", "scripts", "telegram-bridge.js"), + "utf-8", +); + +// Extract and evaluate the isSessionLockFailure function from the source +// so we test the real implementation without triggering the script's +// top-level side effects (process.exit, openshell resolution, etc.). + +const fnMatch = BRIDGE_SRC.match( + /function isSessionLockFailure\(result\)\s*\{[\s\S]*?^\}/m, +); +if (!fnMatch) throw new Error("Could not extract isSessionLockFailure from telegram-bridge.js"); +const isSessionLockFailure = new Function( + `${fnMatch[0]}; return isSessionLockFailure;`, +)(); + +describe("isSessionLockFailure", () => { + it("detects exit code 255 as a lock failure when stderr mentions lock", () => { + const result = { response: "some output", exitCode: 255, stderr: "lock timeout" }; + expect(isSessionLockFailure(result)).toBe(true); + }); + + it("does not flag exit code 255 alone without lock indicators", () => { + const result = { response: "some output", exitCode: 255, stderr: "ssh: connect to host openshell-nemoclaw port 22: Connection refused" }; + expect(isSessionLockFailure(result)).toBe(false); + }); + + it("detects 'session file locked' in stderr regardless of exit code", () => { + const result = { + response: "", + exitCode: 1, + stderr: "Error: session file locked (timeout 10000ms)", + }; + expect(isSessionLockFailure(result)).toBe(true); + }); + + it("detects 'session file locked' in response when stderr is empty", () => { + const result = { + response: "Agent exited with code 1. session file locked (timeout 10000ms)", + exitCode: 1, + stderr: "", + }; + expect(isSessionLockFailure(result)).toBe(true); + }); + + it("detects exit code 255 with session corruption message", () => { + const result = { + response: "", + exitCode: 255, + stderr: "session file locked (timeout 10000ms)", + }; + expect(isSessionLockFailure(result)).toBe(true); + }); + + it("does not flag a normal successful result", () => { + const result = { response: "Hello!", exitCode: 0, stderr: "" }; + expect(isSessionLockFailure(result)).toBe(false); + }); + + it("does not flag a generic non-lock failure (e.g. OOM, timeout)", () => { + const result = { + response: "Agent exited with code 137. Killed", + exitCode: 137, + stderr: "Killed", + }; + expect(isSessionLockFailure(result)).toBe(false); + }); + + it("does not flag a normal non-zero exit without lock indicators", () => { + const result = { + response: "Error: connection refused", + exitCode: 1, + stderr: "ssh: connect to host openshell-nemoclaw port 22: Connection refused", + }; + expect(isSessionLockFailure(result)).toBe(false); + }); + + it("handles missing stderr and response gracefully", () => { + const result = { response: undefined, exitCode: 0, stderr: undefined }; + expect(isSessionLockFailure(result)).toBe(false); + }); +}); + +describe("telegram-bridge session reset wiring", () => { + it("calls isSessionLockFailure in the poll handler", () => { + expect(BRIDGE_SRC).toContain("isSessionLockFailure(result)"); + }); + + it("deletes activeSessions on lock failure", () => { + // The lock-failure branch must clear the session so the next message + // starts fresh instead of resuming corrupted context. + const lockBlock = BRIDGE_SRC.slice( + BRIDGE_SRC.indexOf("if (isSessionLockFailure(result))"), + ); + expect(lockBlock).toContain("activeSessions.delete(chatId)"); + }); + + it("does not delete activeSessions for normal responses", () => { + // Outside the lock-failure branch, the session should be preserved. + // Find the line after the lock-failure block that logs the normal response. + const normalPath = BRIDGE_SRC.match( + /console\.log\(`\[\$\{chatId\}\] agent:.*result\.response/, + ); + expect(normalPath).not.toBeNull(); + }); + + it("notifies the user when a session is auto-reset", () => { + const lockBlock = BRIDGE_SRC.slice( + BRIDGE_SRC.indexOf("if (isSessionLockFailure(result))"), + ); + expect(lockBlock).toContain("sendMessage(chatId"); + expect(lockBlock).toMatch(/session.*reset/i); + }); +}); From 6d93a8c61d63f059a6dfb18763e11bd3e16d9535 Mon Sep 17 00:00:00 2001 From: Alan Pena Date: Wed, 25 Mar 2026 00:34:56 -0400 Subject: [PATCH 2/3] fix: guard lock detection against false positives and rotate session ID on reset - Only check result.response for lock text when exitCode !== 0, so a successful reply quoting the error string is not misidentified. - Replace activeSessions.delete() with rotateSession() that assigns a new unique session suffix, ensuring the next runAgentInSandbox call uses a different --session-id and avoids reopening the corrupted remote session file. - Update tests: add false-positive coverage, verify rotateSession wiring and getSessionId usage. --- scripts/telegram-bridge.js | 32 ++++++++++++++++------ test/telegram-bridge-session-reset.test.js | 24 ++++++++-------- 2 files changed, 37 insertions(+), 19 deletions(-) diff --git a/scripts/telegram-bridge.js b/scripts/telegram-bridge.js index c023c2d6b..783223125 100755 --- a/scripts/telegram-bridge.js +++ b/scripts/telegram-bridge.js @@ -39,7 +39,19 @@ if (!TOKEN) { console.error("TELEGRAM_BOT_TOKEN required"); process.exit(1); } if (!API_KEY) { console.error("NVIDIA_API_KEY required"); process.exit(1); } let offset = 0; -const activeSessions = new Map(); // chatId → message history +const activeSessions = new Map(); // chatId → session suffix +let sessionCounter = 0; + +function getSessionId(chatId) { + if (!activeSessions.has(chatId)) { + activeSessions.set(chatId, `${chatId}-${++sessionCounter}`); + } + return activeSessions.get(chatId); +} + +function rotateSession(chatId) { + activeSessions.set(chatId, `${chatId}-${++sessionCounter}`); +} // ── Telegram API helpers ────────────────────────────────────────── @@ -164,9 +176,13 @@ function runAgentInSandbox(message, sessionId) { * combined with lock-related output. */ function isSessionLockFailure(result) { - const output = `${result.stderr || ""} ${result.response || ""}`; - if (output.includes("session file locked")) return true; - if (result.exitCode === 255 && /lock|session.*corrupt/i.test(output)) return true; + const stderr = result.stderr || ""; + // Always check stderr regardless of exit code. + if (stderr.includes("session file locked")) return true; + // Only check response text when the process actually failed — a successful + // reply that quotes the error string (e.g. explaining the error) is not a lock failure. + if (result.exitCode !== 0 && (result.response || "").includes("session file locked")) return true; + if (result.exitCode === 255 && /lock|session.*corrupt/i.test(`${stderr} ${result.response || ""}`)) return true; return false; } @@ -209,7 +225,7 @@ async function poll() { // Handle /reset if (msg.text === "/reset") { - activeSessions.delete(chatId); + rotateSession(chatId); await sendMessage(chatId, "Session reset.", msg.message_id); continue; } @@ -221,14 +237,14 @@ async function poll() { const typingInterval = setInterval(() => sendTyping(chatId), 4000); try { - const result = await runAgentInSandbox(msg.text, chatId); + const result = await runAgentInSandbox(msg.text, getSessionId(chatId)); clearInterval(typingInterval); // Detect session lock failures and auto-reset to prevent corrupted // context from persisting into subsequent messages (#833). if (isSessionLockFailure(result)) { - activeSessions.delete(chatId); - console.error(`[${chatId}] session lock failure — session reset`); + rotateSession(chatId); + console.error(`[${chatId}] session lock failure — session rotated`); await sendMessage(chatId, "⚠️ Session error detected — your session has been automatically reset. Please resend your message.", msg.message_id); continue; } diff --git a/test/telegram-bridge-session-reset.test.js b/test/telegram-bridge-session-reset.test.js index 5eadedb24..af1e64295 100644 --- a/test/telegram-bridge-session-reset.test.js +++ b/test/telegram-bridge-session-reset.test.js @@ -69,6 +69,15 @@ describe("isSessionLockFailure", () => { expect(isSessionLockFailure(result)).toBe(false); }); + it("does not flag a successful reply that quotes the error text", () => { + const result = { + response: 'The error "session file locked" means another process is using the file.', + exitCode: 0, + stderr: "", + }; + expect(isSessionLockFailure(result)).toBe(false); + }); + it("does not flag a generic non-lock failure (e.g. OOM, timeout)", () => { const result = { response: "Agent exited with code 137. Killed", @@ -98,22 +107,15 @@ describe("telegram-bridge session reset wiring", () => { expect(BRIDGE_SRC).toContain("isSessionLockFailure(result)"); }); - it("deletes activeSessions on lock failure", () => { - // The lock-failure branch must clear the session so the next message - // starts fresh instead of resuming corrupted context. + it("rotates session on lock failure instead of just deleting", () => { const lockBlock = BRIDGE_SRC.slice( BRIDGE_SRC.indexOf("if (isSessionLockFailure(result))"), ); - expect(lockBlock).toContain("activeSessions.delete(chatId)"); + expect(lockBlock).toContain("rotateSession(chatId)"); }); - it("does not delete activeSessions for normal responses", () => { - // Outside the lock-failure branch, the session should be preserved. - // Find the line after the lock-failure block that logs the normal response. - const normalPath = BRIDGE_SRC.match( - /console\.log\(`\[\$\{chatId\}\] agent:.*result\.response/, - ); - expect(normalPath).not.toBeNull(); + it("uses getSessionId when calling runAgentInSandbox", () => { + expect(BRIDGE_SRC).toContain("runAgentInSandbox(msg.text, getSessionId(chatId))"); }); it("notifies the user when a session is auto-reset", () => { From a91573ebae529e77c250e9b7e8fc09de48a36eda Mon Sep 17 00:00:00 2001 From: Alan Pena Date: Wed, 25 Mar 2026 00:46:16 -0400 Subject: [PATCH 3/3] fix: use collision-resistant session IDs, tighten lock regex, guard top-level init - Replace deterministic chatId-counter with crypto.randomUUID() so restarted bridges never recreate abandoned session IDs. - Tighten exit-255 fallback regex to require session-specific phrasing (session.*lock, lock.*session, session.*corrupt) instead of bare 'lock'. - Move env/OpenShell validation into init(), called only when require.main === module, so requiring the file in tests is safe. - Add test for bare 'lock' false positive on exit 255. --- scripts/telegram-bridge.js | 49 +++++++++++++--------- test/telegram-bridge-session-reset.test.js | 9 +++- 2 files changed, 37 insertions(+), 21 deletions(-) diff --git a/scripts/telegram-bridge.js b/scripts/telegram-bridge.js index 783223125..9f1b0929b 100755 --- a/scripts/telegram-bridge.js +++ b/scripts/telegram-bridge.js @@ -16,41 +16,51 @@ * ALLOWED_CHAT_IDS — comma-separated Telegram chat IDs to accept (optional, accepts all if unset) */ +const crypto = require("crypto"); const https = require("https"); const { execFileSync, spawn } = require("child_process"); const { resolveOpenshell } = require("../bin/lib/resolve-openshell"); const { shellQuote, validateName } = require("../bin/lib/runner"); -const OPENSHELL = resolveOpenshell(); -if (!OPENSHELL) { - console.error("openshell not found on PATH or in common locations"); - process.exit(1); -} +// Lazy-initialized by init() so requiring the module in tests does not +// trigger process.exit from missing env vars or openshell resolution. +let OPENSHELL; +let TOKEN; +let API_KEY; +let SANDBOX; +let ALLOWED_CHATS; + +function init() { + OPENSHELL = resolveOpenshell(); + if (!OPENSHELL) { + console.error("openshell not found on PATH or in common locations"); + process.exit(1); + } -const TOKEN = process.env.TELEGRAM_BOT_TOKEN; -const API_KEY = process.env.NVIDIA_API_KEY; -const SANDBOX = process.env.SANDBOX_NAME || "nemoclaw"; -try { validateName(SANDBOX, "SANDBOX_NAME"); } catch (e) { console.error(e.message); process.exit(1); } -const ALLOWED_CHATS = process.env.ALLOWED_CHAT_IDS - ? process.env.ALLOWED_CHAT_IDS.split(",").map((s) => s.trim()) - : null; + TOKEN = process.env.TELEGRAM_BOT_TOKEN; + API_KEY = process.env.NVIDIA_API_KEY; + SANDBOX = process.env.SANDBOX_NAME || "nemoclaw"; + try { validateName(SANDBOX, "SANDBOX_NAME"); } catch (e) { console.error(e.message); process.exit(1); } + ALLOWED_CHATS = process.env.ALLOWED_CHAT_IDS + ? process.env.ALLOWED_CHAT_IDS.split(",").map((s) => s.trim()) + : null; -if (!TOKEN) { console.error("TELEGRAM_BOT_TOKEN required"); process.exit(1); } -if (!API_KEY) { console.error("NVIDIA_API_KEY required"); process.exit(1); } + if (!TOKEN) { console.error("TELEGRAM_BOT_TOKEN required"); process.exit(1); } + if (!API_KEY) { console.error("NVIDIA_API_KEY required"); process.exit(1); } +} let offset = 0; -const activeSessions = new Map(); // chatId → session suffix -let sessionCounter = 0; +const activeSessions = new Map(); // chatId → session UUID suffix function getSessionId(chatId) { if (!activeSessions.has(chatId)) { - activeSessions.set(chatId, `${chatId}-${++sessionCounter}`); + activeSessions.set(chatId, `${chatId}-${crypto.randomUUID()}`); } return activeSessions.get(chatId); } function rotateSession(chatId) { - activeSessions.set(chatId, `${chatId}-${++sessionCounter}`); + activeSessions.set(chatId, `${chatId}-${crypto.randomUUID()}`); } // ── Telegram API helpers ────────────────────────────────────────── @@ -182,7 +192,7 @@ function isSessionLockFailure(result) { // Only check response text when the process actually failed — a successful // reply that quotes the error string (e.g. explaining the error) is not a lock failure. if (result.exitCode !== 0 && (result.response || "").includes("session file locked")) return true; - if (result.exitCode === 255 && /lock|session.*corrupt/i.test(`${stderr} ${result.response || ""}`)) return true; + if (result.exitCode === 255 && /(session.*lock|lock.*session|session.*corrupt|corrupt.*session)/i.test(`${stderr} ${result.response || ""}`)) return true; return false; } @@ -292,5 +302,6 @@ async function main() { } if (require.main === module) { + init(); main(); } diff --git a/test/telegram-bridge-session-reset.test.js b/test/telegram-bridge-session-reset.test.js index af1e64295..1b7f2dfec 100644 --- a/test/telegram-bridge-session-reset.test.js +++ b/test/telegram-bridge-session-reset.test.js @@ -27,11 +27,16 @@ const isSessionLockFailure = new Function( )(); describe("isSessionLockFailure", () => { - it("detects exit code 255 as a lock failure when stderr mentions lock", () => { - const result = { response: "some output", exitCode: 255, stderr: "lock timeout" }; + it("detects exit code 255 as a lock failure when stderr mentions session lock", () => { + const result = { response: "some output", exitCode: 255, stderr: "session lock timeout" }; expect(isSessionLockFailure(result)).toBe(true); }); + it("does not flag exit code 255 with bare 'lock' unrelated to session", () => { + const result = { response: "some output", exitCode: 255, stderr: "lock timeout on resource" }; + expect(isSessionLockFailure(result)).toBe(false); + }); + it("does not flag exit code 255 alone without lock indicators", () => { const result = { response: "some output", exitCode: 255, stderr: "ssh: connect to host openshell-nemoclaw port 22: Connection refused" }; expect(isSessionLockFailure(result)).toBe(false);