NVIDIA · ericksoa · Mar 25, 2026 · Mar 25, 2026 · Mar 26, 2026
diff --git a/Dockerfile b/Dockerfile
@@ -54,6 +54,11 @@ ARG CHAT_UI_URL=http://127.0.0.1:18789
 ARG NEMOCLAW_INFERENCE_BASE_URL=https://inference.local/v1
 ARG NEMOCLAW_INFERENCE_API=openai-completions
 ARG NEMOCLAW_INFERENCE_COMPAT_B64=e30=
+# EXPERIMENTAL: small model mode reduces the system prompt for local inference.
+# When set to 1, writes compact workspace files and lowers bootstrap character budgets
+# so small local models (e.g. qwen2.5:0.5b via Ollama) have more capacity for
+# actual conversation instead of digesting a large system prompt.
+ARG NEMOCLAW_SMALL_MODEL_MODE=0
 # Unique per build to ensure each image gets a fresh auth token.
 # Pass --build-arg NEMOCLAW_BUILD_ID=$(date +%s) to bust the cache.
 ARG NEMOCLAW_BUILD_ID=default
@@ -67,7 +72,8 @@ ENV NEMOCLAW_MODEL=${NEMOCLAW_MODEL} \
     CHAT_UI_URL=${CHAT_UI_URL} \
     NEMOCLAW_INFERENCE_BASE_URL=${NEMOCLAW_INFERENCE_BASE_URL} \
     NEMOCLAW_INFERENCE_API=${NEMOCLAW_INFERENCE_API} \
-    NEMOCLAW_INFERENCE_COMPAT_B64=${NEMOCLAW_INFERENCE_COMPAT_B64}
+    NEMOCLAW_INFERENCE_COMPAT_B64=${NEMOCLAW_INFERENCE_COMPAT_B64} \
+    NEMOCLAW_SMALL_MODEL_MODE=${NEMOCLAW_SMALL_MODEL_MODE}
 
 WORKDIR /sandbox
 USER sandbox
@@ -99,8 +105,13 @@ providers = { \
         'models': [{**({'compat': inference_compat} if inference_compat else {}), 'id': model, 'name': primary_model_ref, 'reasoning': False, 'input': ['text'], 'cost': {'input': 0, 'output': 0, 'cacheRead': 0, 'cacheWrite': 0}, 'contextWindow': 131072, 'maxTokens': 4096}] \
     } \
 }; \
+small_model = os.environ.get('NEMOCLAW_SMALL_MODEL_MODE', '0') == '1'; \
+# Docker joins these continued lines into ONE physical line for the inline Python script,
+# and Python rejects a compound if-statement after a semicolon (SyntaxError). Merge the
+# reduced bootstrap budgets with a conditional dict instead, like the compat merge above.
+agent_defaults = {'model': {'primary': primary_model_ref}, **({'bootstrapMaxChars': 4000, 'bootstrapTotalMaxChars': 8000} if small_model else {})}; \
 config = { \
-    'agents': {'defaults': {'model': {'primary': primary_model_ref}}}, \
+    'agents': {'defaults': agent_defaults}, \
     'models': {'mode': 'merge', 'providers': providers}, \
     'channels': {'defaults': {'configWrites': False}}, \
     'gateway': { \
@@ -122,6 +133,46 @@ os.chmod(path, 0o600)"
 RUN openclaw doctor --fix > /dev/null 2>&1 || true \
     && openclaw plugins install /opt/nemoclaw > /dev/null 2>&1 || true
 
+# EXPERIMENTAL: small model mode (NEMOCLAW_SMALL_MODEL_MODE=1 only) — overwrite SOUL.md and
+# AGENTS.md with compact versions so local models spend fewer tokens on the system prompt.
+# printf emits one line per argument. Users can still edit these files inside the sandbox.
+# hadolint ignore=SC2016
+RUN if [ "$NEMOCLAW_SMALL_MODEL_MODE" = "1" ]; then \
+      echo '[experimental] Small model mode: writing compact workspace files'; \
+      printf '%s\n' \
+        '# SOUL' \
+        '' \
+        'You are a helpful AI assistant running locally. Be concise and direct.' \
+        '' \
+        '- Answer questions accurately' \
+        '- Admit when you don'\''t know something' \
+        '- Keep responses short unless asked to elaborate' \
+        '- Use tools when available and appropriate' \
+        '- Never fabricate information' \
+        > /sandbox/.openclaw-data/workspace/SOUL.md; \
+      printf '%s\n' \
+        '# Agents' \
+        '' \
+        '## Startup' \
+        '' \
+        'Read these files if they exist:' \
+        '- `SOUL.md` — your behavioral rules' \
+        '- `USER.md` — context about the person you are helping' \
+        '' \
+        '## Rules' \
+        '' \
+        '- Safe to do freely: read files, search, organize workspace' \
+        '- Ask before: sending messages, deleting files, any external action' \
+        '- Never exfiltrate private data' \
+        '- Use `trash` instead of `rm`' \
+        '' \
+        '## Memory' \
+        '' \
+        'Write notes to `memory/YYYY-MM-DD.md` to remember things across sessions.' \
+        'Curate important facts into `MEMORY.md`.' \
+        > /sandbox/.openclaw-data/workspace/AGENTS.md; \
+    fi
+
 # Lock openclaw.json via DAC: chown to root so the sandbox user cannot modify
 # it at runtime.  This works regardless of Landlock enforcement status.
 # The Landlock policy (/sandbox/.openclaw in read_only) provides defense-in-depth

diff --git a/bin/lib/onboard.js b/bin/lib/onboard.js
@@ -458,6 +458,7 @@
   let inferenceBaseUrl = "https://inference.local/v1";
   let inferenceApi = preferredInferenceApi || "openai-completions";
   let inferenceCompat = null;
+  let smallModelMode = false;
 
   switch (provider) {
     case "openai-api":
@@ -485,6 +486,12 @@
         supportsStore: false,
       };
       break;
+    case "ollama-local":
+    case "vllm-local":
+      providerKey = "inference";
+      primaryModelRef = `inference/${model}`;
+      smallModelMode = true;
+      break;
     case "nvidia-prod":
     case "nvidia-nim":
     default:
@@ -493,7 +500,7 @@
       break;
   }
 
-  return { providerKey, primaryModelRef, inferenceBaseUrl, inferenceApi, inferenceCompat };
+  return { providerKey, primaryModelRef, inferenceBaseUrl, inferenceApi, inferenceCompat, smallModelMode };
 }
 
 function patchStagedDockerfile(dockerfilePath, model, chatUiUrl, buildId = String(Date.now()), provider = null, preferredInferenceApi = null) {
@@ -503,6 +510,7 @@
     inferenceBaseUrl,
     inferenceApi,
     inferenceCompat,
+    smallModelMode,
   } = getSandboxInferenceConfig(model, provider, preferredInferenceApi);
   let dockerfile = fs.readFileSync(dockerfilePath, "utf8");
   dockerfile = dockerfile.replace(
@@ -533,6 +541,10 @@
     /^ARG NEMOCLAW_INFERENCE_COMPAT_B64=.*$/m,
     `ARG NEMOCLAW_INFERENCE_COMPAT_B64=${encodeDockerJsonArg(inferenceCompat)}`
   );
+  dockerfile = dockerfile.replace(
+    /^ARG NEMOCLAW_SMALL_MODEL_MODE=.*$/m,
+    `ARG NEMOCLAW_SMALL_MODEL_MODE=${smallModelMode ? "1" : "0"}`
+  );
   dockerfile = dockerfile.replace(
     /^ARG NEMOCLAW_BUILD_ID=.*$/m,
     `ARG NEMOCLAW_BUILD_ID=${buildId}`
@@ -1426,7 +1438,7 @@

 // ── Step 3: Sandbox ──────────────────────────────────────────────

 async function createSandbox(gpu, model, provider, preferredInferenceApi = null) {
  step(5, 7, "Creating sandbox");

  const nameAnswer = await promptOrDefault(
@@ -1490,6 +1502,10 @@
   console.log(`  Creating sandbox '${sandboxName}' (this takes a few minutes on first run)...`);
   const chatUiUrl = process.env.CHAT_UI_URL || "http://127.0.0.1:18789";
   patchStagedDockerfile(stagedDockerfile, model, chatUiUrl, String(Date.now()), provider, preferredInferenceApi);
+  const { smallModelMode } = getSandboxInferenceConfig(model, provider, preferredInferenceApi);
+  if (smallModelMode) {
+    console.log("  [experimental] Small model mode: reduced system prompt for local inference");
+  }
   // Only pass non-sensitive env vars to the sandbox. NVIDIA_API_KEY is NOT
   // needed inside the sandbox — inference is proxied through the OpenShell
   // gateway which injects the stored credential server-side. The gateway

diff --git a/test/local-inference.test.js b/test/local-inference.test.js
@@ -164,4 +164,5 @@ describe("local inference helpers", () => {
     );
     expect(result).toEqual({ ok: true });
   });
+
 });
diff --git a/test/onboard.test.js b/test/onboard.test.js
@@ -50,6 +50,7 @@ describe("onboard helpers", () => {
         "ARG NEMOCLAW_PRIMARY_MODEL_REF=nvidia/nemotron-3-super-120b-a12b",
         "ARG CHAT_UI_URL=http://127.0.0.1:18789",
         "ARG NEMOCLAW_INFERENCE_COMPAT_B64=e30=",
+        "ARG NEMOCLAW_SMALL_MODEL_MODE=0",
         "ARG NEMOCLAW_BUILD_ID=default",
       ].join("\n")
     );
@@ -76,6 +77,7 @@ describe("onboard helpers", () => {
         inferenceBaseUrl: "https://inference.local/v1",
         inferenceApi: "openai-completions",
         inferenceCompat: null,
+        smallModelMode: false,
       }
     );
   });
@@ -93,6 +95,7 @@ describe("onboard helpers", () => {
         "ARG NEMOCLAW_INFERENCE_BASE_URL=https://inference.local/v1",
         "ARG NEMOCLAW_INFERENCE_API=openai-completions",
         "ARG NEMOCLAW_INFERENCE_COMPAT_B64=e30=",
+        "ARG NEMOCLAW_SMALL_MODEL_MODE=0",
         "ARG NEMOCLAW_BUILD_ID=default",
       ].join("\n")
     );
@@ -127,6 +130,7 @@ describe("onboard helpers", () => {
         inferenceCompat: {
           supportsStore: false,
         },
+        smallModelMode: false,
       }
     );
   });
@@ -140,10 +144,75 @@ describe("onboard helpers", () => {
         inferenceBaseUrl: "https://inference.local/v1",
         inferenceApi: "openai-responses",
         inferenceCompat: null,
+        smallModelMode: false,
       }
     );
   });
 
+  it("maps ollama-local to the routed inference provider with small model mode", () => {
+    // ollama-local must resolve to the routed "inference" provider with small model mode on.
+    const expected = {
+      providerKey: "inference",
+      primaryModelRef: "inference/qwen2.5:7b",
+      inferenceBaseUrl: "https://inference.local/v1",
+      inferenceApi: "openai-completions",
+      inferenceCompat: null,
+      smallModelMode: true,
+    };
+    const actual = getSandboxInferenceConfig("qwen2.5:7b", "ollama-local", "openai-completions");
+    assert.deepEqual(actual, expected);
+  });
+
+  it("maps vllm-local to the routed inference provider with small model mode", () => {
+    // vllm-local must resolve to the routed "inference" provider with small model mode on.
+    const expected = {
+      providerKey: "inference",
+      primaryModelRef: "inference/nemotron-3-nano:30b",
+      inferenceBaseUrl: "https://inference.local/v1",
+      inferenceApi: "openai-completions",
+      inferenceCompat: null,
+      smallModelMode: true,
+    };
+    const actual = getSandboxInferenceConfig("nemotron-3-nano:30b", "vllm-local", "openai-completions");
+    assert.deepEqual(actual, expected);
+  });
+
+  it("patches the staged Dockerfile with small model mode for ollama-local", () => {
+    // Stage a minimal Dockerfile made up only of ARG lines.
+    const stagedLines = [
+      "ARG NEMOCLAW_MODEL=nvidia/nemotron-3-super-120b-a12b",
+      "ARG NEMOCLAW_PROVIDER_KEY=nvidia",
+      "ARG NEMOCLAW_PRIMARY_MODEL_REF=nvidia/nemotron-3-super-120b-a12b",
+      "ARG CHAT_UI_URL=http://127.0.0.1:18789",
+      "ARG NEMOCLAW_INFERENCE_BASE_URL=https://inference.local/v1",
+      "ARG NEMOCLAW_INFERENCE_API=openai-completions",
+      "ARG NEMOCLAW_INFERENCE_COMPAT_B64=e30=",
+      "ARG NEMOCLAW_SMALL_MODEL_MODE=0",
+      "ARG NEMOCLAW_BUILD_ID=default",
+    ];
+    const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-onboard-dockerfile-ollama-"));
+    const dockerfilePath = path.join(tmpDir, "Dockerfile");
+    fs.writeFileSync(dockerfilePath, stagedLines.join("\n"));
+
+    // Lines the patched file must contain for the ollama-local provider.
+    const expectedPatterns = [
+      /^ARG NEMOCLAW_MODEL=qwen2\.5:0\.5b$/m,
+      /^ARG NEMOCLAW_PROVIDER_KEY=inference$/m,
+      /^ARG NEMOCLAW_PRIMARY_MODEL_REF=inference\/qwen2\.5:0\.5b$/m,
+      /^ARG NEMOCLAW_SMALL_MODEL_MODE=1$/m,
+    ];
+
+    try {
+      patchStagedDockerfile(dockerfilePath, "qwen2.5:0.5b", "http://127.0.0.1:18789", "build-ollama", "ollama-local");
+      const patched = fs.readFileSync(dockerfilePath, "utf8");
+      for (const pattern of expectedPatterns) {
+        assert.match(patched, pattern);
+      }
+    } finally {
+      fs.rmSync(tmpDir, { recursive: true, force: true });
+    }
+  });
+
   it("pins the gateway image to the installed OpenShell release version", () => {
     expect(getInstalledOpenshellVersion("openshell 0.0.12")).toBe("0.0.12");
     expect(getInstalledOpenshellVersion("openshell 0.0.13-dev.8+gbbcaed2ea")).toBe("0.0.13");
-Original file line number
+Diff line change
@@ Expand Up / @@ -164,4 +164,5 @@ describe("local inference helpers", () => { @@
         );
         expect(result).toEqual({ ok: true });
       });
     });