From 0edda59fe4c56c30000c3b0ad9da9120acae8054 Mon Sep 17 00:00:00 2001
From: Qian Lin <qianl@nvidia.com>
Date: Wed, 13 May 2026 18:55:23 +0800
Subject: [PATCH 01/41] Add LLMAgent code

---
 docker/run_docker.sh                     |   6 +
 isaaclab_arena/llm_env_gen/__init__.py   |   4 +
 isaaclab_arena/llm_env_gen/llm_agent.py  | 164 +++++++++++++++++++++++
 isaaclab_arena/llm_env_gen/schema.py     | 107 +++++++++++++++
 isaaclab_arena/llm_env_gen/try_schema.py |  69 ++++++++++
 setup.py                                 |   2 +
 6 files changed, 352 insertions(+)
 create mode 100644 isaaclab_arena/llm_env_gen/__init__.py
 create mode 100644 isaaclab_arena/llm_env_gen/llm_agent.py
 create mode 100644 isaaclab_arena/llm_env_gen/schema.py
 create mode 100644 isaaclab_arena/llm_env_gen/try_schema.py

diff --git a/docker/run_docker.sh b/docker/run_docker.sh
index baeb038cc..10dbb02f4 100755
--- a/docker/run_docker.sh
+++ b/docker/run_docker.sh
@@ -185,6 +185,12 @@ else
         fi
     fi
 
+    # pass through API keys used by the LLM scene-gen prototype; values are
+    # inherited from the host shell so the key never lives in the repo.
+    if [ -n "$NV_API_KEY" ]; then
+        DOCKER_RUN_ARGS+=("--env" "NV_API_KEY")
+    fi
+
     # if gr00t is installed, mount the gr00t directory in case anything needs to change there
     if [ "$INSTALL_GROOT" = "true" ]; then
         DOCKER_RUN_ARGS+=("-v" "./submodules/Isaac-GR00T:${WORKDIR}/submodules/Isaac-GR00T")
diff --git a/isaaclab_arena/llm_env_gen/__init__.py b/isaaclab_arena/llm_env_gen/__init__.py
new file mode 100644
index 000000000..16ea4c218
--- /dev/null
+++ b/isaaclab_arena/llm_env_gen/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) 2026, The Isaac Lab Arena Project Developers (https://github.com/isaac-sim/IsaacLab-Arena/blob/main/CONTRIBUTORS.md).
+# All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
diff --git a/isaaclab_arena/llm_env_gen/llm_agent.py b/isaaclab_arena/llm_env_gen/llm_agent.py
new file mode 100644
index 000000000..5d9575c45
--- /dev/null
+++ b/isaaclab_arena/llm_env_gen/llm_agent.py
@@ -0,0 +1,164 @@
+# Copyright (c) 2025-2026, The Isaac Lab Arena Project Developers (https://github.com/isaac-sim/IsaacLab-Arena/blob/main/CONTRIBUTORS.md).
+# All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""LLM agent for parsing natural-language scene prompts into a SceneSpec.
+
+Uses a chat model (see DEFAULT_MODEL) via NVIDIA's OpenAI-compatible inference API. Emits the
+SceneSpec Pydantic bundle so asset resolution stays deterministic.
+"""
+
+from __future__ import annotations
+
+import contextlib
+import json
+import os
+
+from .schema import SceneSpec
+
+DEFAULT_BASE_URL = "https://inference-api.nvidia.com"
+DEFAULT_MODEL = "nvidia/deepseek-ai/deepseek-v4-flash"
+
+
+def build_catalog_text() -> str:
+    """Introspect AssetRegistry and build the vocabulary the LLM is allowed to use."""
+    from isaaclab_arena.assets.registries import AssetRegistry
+
+    registry = AssetRegistry()
+    backgrounds: list[str] = []
+    objects: list[dict] = []
+    embodiments: list[str] = []
+    for name in registry.get_all_keys():
+        cls = registry.get_asset_by_name(name)
+        tags = list(getattr(cls, "tags", []))
+        if "embodiment" in tags:
+            embodiments.append(name)
+        elif "background" in tags:
+            backgrounds.append(name)
+        elif "object" in tags:
+            objects.append({"name": name, "tags": [t for t in tags if t != "object"]})
+
+    obj_lines = "\n".join(f"- {o['name']}  tags={o['tags']}" for o in sorted(objects, key=lambda o: o["name"]))
+    return (
+        f"EMBODIMENTS: {', '.join(sorted(embodiments))}\n\n"
+        f"BACKGROUNDS: {', '.join(sorted(backgrounds))}\n\n"
+        f"OBJECTS ({len(objects)}):\n{obj_lines}"
+    )
+
+
+class LLMAgent:
+    """Parses a natural-language prompt into a SceneSpec."""
+
+    def __init__(
+        self,
+        api_key: str | None = None,
+        model: str = DEFAULT_MODEL,
+        base_url: str = DEFAULT_BASE_URL,
+    ):
+        from openai import OpenAI
+
+        self.api_key = api_key or os.getenv("NV_API_KEY")
+        assert self.api_key, "API key required: set NV_API_KEY or pass api_key."
+        self.model = model
+        self.client = OpenAI(api_key=self.api_key, base_url=base_url)
+
+    def generate_spec(
+        self,
+        prompt: str,
+        catalog_text: str | None = None,
+        temperature: float = 0.2,
+        max_tokens: int = 2000,
+    ) -> tuple[SceneSpec, str]:
+        """Return (validated SceneSpec, raw LLM response)."""
+        catalog_text = catalog_text or build_catalog_text()
+        system = self._system_prompt()
+        user = f"{catalog_text}\n\nUSER PROMPT:\n{prompt}\n\nReturn ONLY a JSON object matching the SceneSpec schema."
+
+        resp = self.client.chat.completions.create(
+            model=self.model,
+            messages=[
+                {"role": "system", "content": system},
+                {"role": "user", "content": user},
+            ],
+            temperature=temperature,
+            max_tokens=max_tokens,
+        )
+        raw = resp.choices[0].message.content
+        data = self._extract_json(raw)
+        spec = SceneSpec.model_validate(data)
+        return spec, raw
+
+    def _system_prompt(self) -> str:
+        schema = json.dumps(SceneSpec.model_json_schema(), indent=2)
+        return (
+            "You are a scene-generation parser for robot manipulation tasks.\n"
+            "Convert a natural-language prompt into a SceneSpec JSON object that matches the schema below.\n\n"
+            "RULES:\n"
+            "- item.query: the short human name as it appears in the prompt (e.g. 'avocado', 'bowl').\n"
+            "  The resolver fuzzy-matches this against the OBJECTS catalog; you do NOT need to emit the\n"
+            "  exact registered name.\n"
+            "- item.role: 'foreground' for objects the task acts on; 'distractor' for extras mentioned as\n"
+            "  clutter; 'anchor' for reference surfaces (rare — the background usually covers this).\n"
+            "- item.category_tags: tags that semantically narrow the query, preferring assets with those\n"
+            "  tags. This is a PREFERENCE, not a hard filter — the resolver will fall back to the full\n"
+            "  catalog if the tag pool is empty or yields no close match. Err toward emitting useful tags;\n"
+            "  the trace will report what was relaxed.\n"
+            "- relation.kind ∈ {on, in, next_to, at_position, is_anchor, open, closed}.\n"
+            "  subject/target reference items by their query string or the background name.\n"
+            "  * 'on' / 'in' / other spatial relations describe object placement in the initial scene.\n"
+            "  * 'open' and 'closed' are UNARY state markers for articulated objects (microwave, fridge,\n"
+            "    cabinet) in the initial scene. Their target MUST be null. They describe the *initial*\n"
+            "    state; the task list (below) specifies state changes.\n"
+            "  * Articulated objects (microwave etc.) need both a spatial 'on(microwave, background)'\n"
+            "    relation AND an 'open(microwave, null)' or 'closed(microwave, null)' state marker.\n"
+            "  * Distractor items around the appliance need 'on(distractor, background)' relations.\n"
+            "- initial_scene_graph: FULL snapshot of all relations in the starting state. Every persistent\n"
+            "  relation (e.g. bowl on table, distractors present) must appear here. Relations that change\n"
+            "  via tasks are still listed here in their starting form.\n"
+            "- tasks: a list of atomic actions to perform in order. Each task has:\n"
+            '    * kind ∈ {"pick_and_place", "open_door", "close_door"}\n'
+            "    * subject: the primary object being acted on (e.g. 'avocado', 'microwave')\n"
+            "    * target: the secondary object/location (e.g. 'bowl' for pick_and_place, null for open/close)\n"
+            "    * description: natural-language summary of the task\n"
+            "  Examples:\n"
+            '    * Pick-and-place: {"kind": "pick_and_place", "subject": "avocado", "target": "bowl",\n'
+            '                       "description": "pick up the avocado and place it in the bowl"}\n'
+            '    * Open door: {"kind": "open_door", "subject": "microwave", "target": null,\n'
+            '                  "description": "open the microwave door"}\n'
+            '    * Close door: {"kind": "close_door", "subject": "microwave", "target": null,\n'
+            '                   "description": "close the microwave door"}\n'
+            "  The tasks implicitly define the final scene: apply each task's transformation in order\n"
+            "  to determine what relations hold at completion.\n"
+            "- embodiment: use a bare robot family name ('franka', 'droid', 'g1', 'gr1') when the prompt\n"
+            "  does not specify a control mode — the resolver defaults each to its IK variant. Use a\n"
+            "  full registered name (e.g. 'franka_joint_pos') only when the prompt requests joint control.\n"
+            "- Emit ONLY the JSON object. No prose, no markdown fences.\n\n"
+            f"SCHEMA:\n{schema}"
+        )
+
+    @staticmethod
+    def _extract_json(content: str) -> dict:
+        content = content.strip()
+        if content.startswith("```"):
+            lines = content.split("\n")
+            if lines and lines[0].startswith("```"):
+                lines = lines[1:]
+            if lines and lines[-1].startswith("```"):
+                lines = lines[:-1]
+            content = "\n".join(lines)
+
+        with contextlib.suppress(json.JSONDecodeError):
+            return json.loads(content)
+
+        start = content.find("{")
+        assert start != -1, f"No JSON object in LLM response: {content!r}"
+        depth = 0
+        for i in range(start, len(content)):
+            if content[i] == "{":
+                depth += 1
+            elif content[i] == "}":
+                depth -= 1
+                if depth == 0:
+                    return json.loads(content[start : i + 1])
+        raise AssertionError(f"Unbalanced JSON in LLM response: {content!r}")
diff --git a/isaaclab_arena/llm_env_gen/schema.py b/isaaclab_arena/llm_env_gen/schema.py
new file mode 100644
index 000000000..33f707503
--- /dev/null
+++ b/isaaclab_arena/llm_env_gen/schema.py
@@ -0,0 +1,107 @@
+# Copyright (c) 2025-2026, The Isaac Lab Arena Project Developers (https://github.com/isaac-sim/IsaacLab-Arena/blob/main/CONTRIBUTORS.md).
+# All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""Schema the LLM must fill in when parsing a natural-language scene prompt.
+
+The LLM sees a list of the *available* asset tags / embodiment names pulled
+from the registries at call time, and must return a SceneSpec that only uses
+those vocabularies. Concrete asset names are resolved by the Resolver in a
+second, deterministic step — the LLM never invents USD paths.
+"""
+
+from __future__ import annotations
+
+from typing import Literal
+
+from pydantic import BaseModel, Field, model_validator
+
+# Relation kinds currently surfaced to the LLM. Mirror the subset of
+# isaaclab_arena.relations.relations that makes sense for tabletop prompts.
+# "in" has no In class in isaaclab_arena.relations.relations yet — see the
+# TODO there. The scene builder materializes goal-state "in" relations as
+# the task's success predicate.
+RelationKind = Literal["on", "in", "next_to", "at_position", "is_anchor", "open", "closed"]
+
+ItemRole = Literal["foreground", "distractor", "anchor"]
+
+# Task kinds the LLM can propose as atomic actions in a plan.
+TaskKind = Literal["pick_and_place", "open_door", "close_door"]
+
+
+class Item(BaseModel):
+    """One object the LLM wants in the scene.
+
+    `query` is the short human name from the prompt ("avocado", "bowl"). The
+    resolver maps it to a registered asset. `category_tags` narrow the search
+    and act as a fallback when the exact name does not resolve — e.g. a
+    distractor "vegetable" resolves to any asset tagged "vegetable".
+    """
+
+    query: str
+    role: ItemRole
+    category_tags: list[str] = Field(default_factory=list)
+    instance_name: str | None = None
+    # Uniform spawn scale. ``None`` (the default) lets the placement
+    # proposer auto-fit the asset against the tabletop bbox; an explicit
+    # positive float overrides the auto-fit.
+    scale: float | None = None
+
+
+class Relation(BaseModel):
+    """A spatial / structural relation between two items (or on one item)."""
+
+    kind: RelationKind
+    subject: str
+    target: str | None = None
+    params: dict = Field(default_factory=dict)
+
+    def identity(self) -> tuple[str, str, str | None]:
+        """Hashable identity for diffing scene graphs — ignores params."""
+        return (self.kind, self.subject, self.target)
+
+
+class Task(BaseModel):
+    """One atomic task in the plan that transforms the scene state.
+
+    A task specifies what action to perform (kind), what object it acts on
+    (subject), and optionally where it goes (target). The description provides
+    natural-language context for the task.
+    """
+
+    kind: TaskKind
+    subject: str  # object instance name (e.g. 'avocado', 'microwave')
+    target: str | None = None  # target object/location (e.g. 'bowl', 'background')
+    description: str  # natural-language task description
+
+
+class SceneSpec(BaseModel):
+    """LLM output — a structured plan for the scene and a list of tasks.
+
+    The language prompt is decomposed into:
+
+      * ``initial_scene_graph`` — every relation that holds at env reset.
+        This configures where objects spawn. This is a FULL snapshot
+        including all relations that persist throughout all tasks.
+      * ``tasks`` — a list of atomic actions to execute in sequence. Each
+        task specifies what to do (kind), what object(s) it acts on
+        (subject/target), and a natural-language description. The task
+        sequence implicitly defines the intermediate scene graphs by applying
+        each task's transformations in order.
+    """
+
+    task_description: str
+    background: str
+    embodiment: str = "franka_ik"
+    items: list[Item]
+    initial_scene_graph: list[Relation]
+    tasks: list[Task]
+
+    @model_validator(mode="after")
+    def _tasks_must_be_non_empty(self) -> SceneSpec:
+        if not self.tasks:
+            raise ValueError(
+                "tasks list is empty — at least one task must be specified to define the scene transformation."
+            )
+        return self
diff --git a/isaaclab_arena/llm_env_gen/try_schema.py b/isaaclab_arena/llm_env_gen/try_schema.py
new file mode 100644
index 000000000..07afdb234
--- /dev/null
+++ b/isaaclab_arena/llm_env_gen/try_schema.py
@@ -0,0 +1,69 @@
+# Copyright (c) 2025-2026, The Isaac Lab Arena Project Developers (https://github.com/isaac-sim/IsaacLab-Arena/blob/main/CONTRIBUTORS.md).
+# All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""Run the LLM parser on a prompt and print the resulting SceneSpec.
+
+Must run inside the Docker container (needs AssetRegistry). Requires
+NV_API_KEY and the `openai` pip package.
+
+Examples:
+    # Print the Pydantic SceneSpec JSON schema (no LLM call):
+    /isaac-sim/python.sh -m isaaclab_arena.llm_env_gen.try_schema --print-schema
+
+    # Print the catalog sent to the LLM (no LLM call):
+    /isaac-sim/python.sh -m isaaclab_arena.llm_env_gen.try_schema --print-catalog
+
+    # Call the LLM and print the parsed SceneSpec:
+    /isaac-sim/python.sh -m isaaclab_arena.llm_env_gen.try_schema \
+        --prompt "franka pick up avocado from the table and place it into a bowl on the table. there are other veggies on the table as distractor"
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+
+DEFAULT_PROMPT = (
+    "franka pick up avocado from the table and place it into a bowl on the table. "
+    "there are other veggies on the table as distractor"
+)
+SEQUENTIAL_PROMPT = (
+    "franka opens a microwave, picks up avocado on the table, place it into the microwave and close the microwave door."
+    " There are other utensils on the table as distractor"
+)
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--prompt", type=str, default=DEFAULT_PROMPT)
+    parser.add_argument("--model", type=str, default=None)
+    parser.add_argument("--temperature", type=float, default=0.2)
+    parser.add_argument("--print-schema", action="store_true")
+    parser.add_argument("--print-catalog", action="store_true")
+    args = parser.parse_args()
+
+    from isaaclab_arena.llm_env_gen.schema import SceneSpec
+
+    if args.print_schema:
+        print(json.dumps(SceneSpec.model_json_schema(), indent=2))
+        return
+
+    from isaaclab_arena.llm_env_gen.llm_agent import LLMAgent, build_catalog_text
+
+    catalog = build_catalog_text()
+    if args.print_catalog:
+        print(catalog)
+        return
+
+    kwargs = {"model": args.model} if args.model else {}
+    agent = LLMAgent(**kwargs)
+    spec, raw = agent.generate_spec(args.prompt, catalog_text=catalog, temperature=args.temperature)
+
+    print("=== raw LLM response ===")
+    print(raw)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/setup.py b/setup.py
index 82cd92b56..80e447adb 100644
--- a/setup.py
+++ b/setup.py
@@ -14,6 +14,8 @@
     "vuer[all]",
     "lightwheel-sdk",
     "pytest",
+    # Used lazily by isaaclab_arena/llm_env_gen/* for NV_API_KEY-based LLM calls.
+    "openai",
 ]
 
 DEV_DEPS = [

From d4bdaa6ea54582b2299fca644d1bcc6240ff9b73 Mon Sep 17 00:00:00 2001
From: Qian Lin <qianl@nvidia.com>
Date: Tue, 26 May 2026 00:28:48 +0800
Subject: [PATCH 02/41] Add asset resolver

The resolver matches object/robot proposals from the LLM (based on name/tag)
to the exact entries in the arena registry.
It outputs an ArenaEnvGraphSpec with the nodes, tasks and initial
state scene graph filled in based on LLM output + resolver match.
---
 .../environments/arena_env_graph_spec.py      |  43 +-
 isaaclab_arena/llm_env_gen/resolver.py        | 425 ++++++++++++++++++
 isaaclab_arena/llm_env_gen/try_schema.py      |  84 +++-
 3 files changed, 549 insertions(+), 3 deletions(-)
 create mode 100644 isaaclab_arena/llm_env_gen/resolver.py

diff --git a/isaaclab_arena/environments/arena_env_graph_spec.py b/isaaclab_arena/environments/arena_env_graph_spec.py
index 2f69f97d6..c6827a7ac 100644
--- a/isaaclab_arena/environments/arena_env_graph_spec.py
+++ b/isaaclab_arena/environments/arena_env_graph_spec.py
@@ -4,7 +4,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import yaml
-from dataclasses import dataclass, field
+from dataclasses import asdict, dataclass, field
 from enum import Enum
 from pathlib import Path
 from typing import Any
@@ -171,6 +171,27 @@ def from_dict(cls, data: dict[str, Any]) -> "ArenaEnvGraphSpec":
             state_specs=state_specs,
         )
 
+    def to_dict(self) -> dict[str, Any]:
+        """Return a YAML/JSON-serializable dict.
+
+        Output shape round-trips through :meth:`from_dict` / :meth:`from_yaml`:
+        enums become their ``.value`` strings and ``None`` / empty-dict fields
+        are omitted so the optional-field parsers fall back to their defaults.
+        """
+        return asdict(self, dict_factory=_yaml_dict_factory)
+
+    def to_yaml(self, path: str | Path) -> Path:
+        """Write this spec to ``path`` as YAML. Creates parent dirs as needed.
+
+        Returns the resolved :class:`Path` written. Symmetric with
+        :meth:`from_yaml`.
+        """
+        out_path = Path(path)
+        out_path.parent.mkdir(parents=True, exist_ok=True)
+        with out_path.open("w", encoding="utf-8") as f:
+            yaml.safe_dump(self.to_dict(), f, sort_keys=False)
+        return out_path
+
     @property
     def nodes_by_id(self) -> dict[str, ArenaEnvGraphNodeSpec]:
         return {node.id: node for node in self.nodes}
@@ -254,3 +275,23 @@ def _parse_task(data: Any) -> ArenaEnvGraphTaskSpec:
         success_state_spec_id=required_str(data, "success_state_spec_id"),
         task_args=optional_dict(data, "task_args"),
     )
+
+
+def _yaml_dict_factory(pairs: list[tuple[str, Any]]) -> dict[str, Any]:
+    """``dataclasses.asdict`` hook used by :meth:`ArenaEnvGraphSpec.to_dict`.
+
+    Two responsibilities:
+      * convert :class:`Enum` field values to their ``.value`` strings so
+        ``yaml.safe_dump`` can serialize them, and
+      * drop ``None`` / empty-dict fields so the emitted YAML stays clean
+        and ``optional_str`` / ``optional_dict`` parsers pick up defaults
+        instead of seeing redundant keys.
+    """
+    out: dict[str, Any] = {}
+    for key, value in pairs:
+        if isinstance(value, Enum):
+            value = value.value
+        if value is None or (isinstance(value, dict) and not value):
+            continue
+        out[key] = value
+    return out
diff --git a/isaaclab_arena/llm_env_gen/resolver.py b/isaaclab_arena/llm_env_gen/resolver.py
new file mode 100644
index 000000000..616475b6c
--- /dev/null
+++ b/isaaclab_arena/llm_env_gen/resolver.py
@@ -0,0 +1,425 @@
+# Copyright (c) 2025-2026, The Isaac Lab Arena Project Developers (https://github.com/isaac-sim/IsaacLab-Arena/blob/main/CONTRIBUTORS.md).
+# All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""Deterministic resolver that turns a SceneSpec into an ArenaEnvGraphSpec.
+
+The LLM emits a SceneSpec. Resolver.resolve() walks that spec, binds each
+query string to a registered Asset (preferring exact name, then fuzzy match
+filtered by tags), and emits a fully-formed :class:`ArenaEnvGraphSpec`:
+
+  * ``nodes`` — background, embodiment, and objects.
+  * ``state_specs`` — one initial state spec derived from
+    ``SceneSpec.initial_scene_graph``, plus one empty success state spec
+    per task as a placeholder for downstream synthesis.
+  * ``tasks`` — one task per LLM task, wired to its initial / success
+    state spec ids.
+
+Per-step "why-this-binding" decisions accumulate on ``self.trace`` so the
+caller can inspect resolution after the fact.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from difflib import get_close_matches
+
+from isaaclab_arena.assets.registries import AssetRegistry
+from isaaclab_arena.environments.arena_env_graph_spec import (
+    ArenaEnvGraphNodeSpec,
+    ArenaEnvGraphNodeType,
+    ArenaEnvGraphSpatialConstraintSpec,
+    ArenaEnvGraphSpatialConstraintType,
+    ArenaEnvGraphSpec,
+    ArenaEnvGraphStateSpec,
+    ArenaEnvGraphTaskSpec,
+)
+from isaaclab_arena.environments.graph_spec_utils import assert_references_exist, assert_unique_ids
+
+from .schema import Item, Relation, SceneSpec, Task
+
+# When the LLM emits a bare robot family name, pick the IK variant.
+IK_DEFAULTS: dict[str, str] = {
+    "franka": "franka_ik",
+    "droid": "droid_differential_ik",
+    "g1": "g1_wbc_pink",
+    "gr1": "gr1_pink",
+}
+
+# SceneSpec relation kinds that have no ArenaEnvGraphSpatialConstraintType
+# counterpart yet. Open/closed are task-state goals on articulated assets and
+# only become meaningful inside the task class, not as scene-graph edges.
+_UNSUPPORTED_RELATION_KINDS: frozenset[str] = frozenset({"open", "closed"})
+
+# id used for the single initial state spec the resolver emits.
+_INITIAL_STATE_SPEC_ID = "state_initial"
+
+
+# id pattern used for the per-task success state spec placeholders. Each is
+# emitted as an empty ArenaEnvGraphStateSpec so that ArenaEnvGraphSpec
+# reference-existence assertions hold; downstream task-graph synthesis is
+# responsible for populating them.
+def _success_state_spec_id(task_index: int) -> str:
+    return f"state_success_{task_index}"
+
+
+# Mapping from SceneSpec relation kinds to the spatial-constraint types used
+# inside an ArenaEnvGraphStateSpec. Keys must stay in sync with
+# isaaclab_arena.llm_env_gen.schema.RelationKind.
+_RELATION_KIND_TO_CONSTRAINT_TYPE: dict[str, ArenaEnvGraphSpatialConstraintType] = {
+    "on": ArenaEnvGraphSpatialConstraintType.ON,
+    "in": ArenaEnvGraphSpatialConstraintType.IN,
+    "next_to": ArenaEnvGraphSpatialConstraintType.NEXT_TO,
+    "at_position": ArenaEnvGraphSpatialConstraintType.AT_POSITION,
+    "is_anchor": ArenaEnvGraphSpatialConstraintType.IS_ANCHOR,
+}
+
+# Relation kinds whose semantic anchor is the *subject* (no child). For these
+# we set parent=subject and leave child=None. Everything else uses
+# parent=target, child=subject (e.g. on/in/next_to).
+_SUBJECT_AS_PARENT_KINDS: frozenset[str] = frozenset({"is_anchor", "at_position"})
+
+
+@dataclass
+class TraceEvent:
+    """One step in the resolution pipeline — emitted to a structured log."""
+
+    stage: str
+    query: str
+    chosen: str | None
+    candidates: list[str] = field(default_factory=list)
+    note: str = ""
+
+
+class Resolver:
+    """Resolves SceneSpec fields against AssetRegistry.
+
+    Design notes:
+      * Never raises on LLM mistakes — instead records a trace event with
+        chosen=None so the caller can decide (retry LLM, ask user, fall back).
+      * Exact name match wins. Otherwise substring containment, then difflib
+        fuzzy, within a tag-filtered pool.
+      * category_tags is a PREFERENCE, not a hard filter: if the tag pool is
+        empty or yields no close match, we relax to the full object pool and
+        record the relaxation in the trace.
+      * The trace lives on the Resolver instance (``self.trace``) and is
+        cleared at the start of every ``resolve()`` call.
+    """
+
+    def __init__(self, registry: AssetRegistry | None = None):
+        self.registry = registry or AssetRegistry()
+        # Populated incrementally by every resolution call. Caller reads after
+        # ``resolve()`` returns.
+        self.trace: list[TraceEvent] = []
+
+    def resolve(self, spec: SceneSpec, env_name: str | None = None) -> ArenaEnvGraphSpec:
+        """Resolve a SceneSpec into a full :class:`ArenaEnvGraphSpec`.
+
+        If ``env_name`` is not provided, it is derived from the background and
+        the first task's kind. The success state of each task is NOT derived
+        here — downstream code is responsible for filling in the per-task
+        success state specs that this resolver emits as empty placeholders.
+        """
+        self.trace = []
+
+        nodes: list[ArenaEnvGraphNodeSpec] = []
+
+        background_node = self._resolve_background_node(spec.background)
+        if background_node is not None:
+            nodes.append(background_node)
+
+        nodes.append(self._resolve_embodiment_node(spec.embodiment))
+
+        for item in spec.items:
+            item_node = self._resolve_item_node(item)
+            if item_node is not None:
+                nodes.append(item_node)
+
+        known_ids = {node.id for node in nodes}
+
+        initial_state_spec = self._build_initial_state_spec(spec.initial_scene_graph, known_ids)
+        success_state_specs = [ArenaEnvGraphStateSpec(id=_success_state_spec_id(i)) for i in range(len(spec.tasks))]
+        state_specs = [initial_state_spec, *success_state_specs]
+        tasks = self._build_task_specs(spec.tasks, known_ids)
+
+        env_graph_spec = ArenaEnvGraphSpec(
+            env_name=env_name or self._derive_env_name(spec),
+            nodes=nodes,
+            tasks=tasks,
+            state_specs=state_specs,
+        )
+
+        # Defensive: the resolver owns every id and reference it emits, so
+        # these invariants should always hold. Catching a violation here
+        # surfaces resolver bugs eagerly rather than at downstream consumers.
+        assert_unique_ids(env_graph_spec.nodes, env_graph_spec.tasks, env_graph_spec.state_specs)
+        assert_references_exist(env_graph_spec.nodes, env_graph_spec.tasks, env_graph_spec.state_specs)
+
+        return env_graph_spec
+
+    @staticmethod
+    def _derive_env_name(spec: SceneSpec) -> str:
+        first_kind = spec.tasks[0].kind if spec.tasks else "task"
+        return f"llm_gen_{spec.background}_{first_kind}"
+
+    # ------------------------------------------------------------------
+    # Node construction
+    # ------------------------------------------------------------------
+
+    def _resolve_background_node(self, query: str) -> ArenaEnvGraphNodeSpec | None:
+        cls = self._resolve_name(query, required_tag="background")
+        if cls is None:
+            return None
+        return ArenaEnvGraphNodeSpec(
+            id=query,
+            name=cls.name,
+            type=ArenaEnvGraphNodeType.BACKGROUND,
+        )
+
+    def _resolve_embodiment_node(self, query: str) -> ArenaEnvGraphNodeSpec:
+        embodiment_name = self._resolve_embodiment(query)
+        return ArenaEnvGraphNodeSpec(
+            id=embodiment_name,
+            name=embodiment_name,
+            type=ArenaEnvGraphNodeType.EMBODIMENT,
+        )
+
+    def _resolve_item_node(self, item: Item) -> ArenaEnvGraphNodeSpec | None:
+        cls = self._resolve_item(item)
+        if cls is None:
+            return None
+        params: dict = {}
+        if item.scale is not None:
+            params["scale"] = item.scale
+        return ArenaEnvGraphNodeSpec(
+            id=item.instance_name or item.query,
+            name=cls.name,
+            type=ArenaEnvGraphNodeType.OBJECT,
+            params=params,
+        )
+
+    # ------------------------------------------------------------------
+    # State spec + task spec construction
+    # ------------------------------------------------------------------
+
+    def _build_initial_state_spec(self, graph: list[Relation], known_ids: set[str]) -> ArenaEnvGraphStateSpec:
+        """Translate the LLM's initial scene graph into an ArenaEnvGraphStateSpec."""
+        constraints: list[ArenaEnvGraphSpatialConstraintSpec] = []
+        for index, rel in enumerate(graph):
+            constraint = self._build_spatial_constraint(rel, index, known_ids)
+            if constraint is not None:
+                constraints.append(constraint)
+        return ArenaEnvGraphStateSpec(
+            id=_INITIAL_STATE_SPEC_ID,
+            spatial_constraints=constraints,
+            task_constraints=[],
+        )
+
+    def _build_spatial_constraint(
+        self, rel: Relation, index: int, known_ids: set[str]
+    ) -> ArenaEnvGraphSpatialConstraintSpec | None:
+        stage_prefix = "relation.initial"
+        if rel.kind in _UNSUPPORTED_RELATION_KINDS:
+            self.trace.append(
+                TraceEvent(
+                    f"{stage_prefix}.unsupported_kind",
+                    rel.subject,
+                    None,
+                    note=f"kind={rel.kind!r} has no spatial-constraint counterpart; skipping",
+                )
+            )
+            return None
+        if rel.kind == "in":
+            self.trace.append(
+                TraceEvent(
+                    f"{stage_prefix}.in_skipped",
+                    rel.subject,
+                    rel.target,
+                    note="'in' has no initial-state semantics; specify placement changes via tasks instead.",
+                )
+            )
+            return None
+        if rel.kind not in _RELATION_KIND_TO_CONSTRAINT_TYPE:
+            self.trace.append(
+                TraceEvent(
+                    f"{stage_prefix}.unknown_kind",
+                    rel.subject,
+                    None,
+                    note=f"kind={rel.kind!r} has no constraint mapping; skipping",
+                )
+            )
+            return None
+        if rel.subject not in known_ids:
+            self.trace.append(TraceEvent(f"{stage_prefix}.unknown_subject", rel.subject, None, note=rel.kind))
+            return None
+        if rel.target is not None and rel.target not in known_ids:
+            self.trace.append(TraceEvent(f"{stage_prefix}.unknown_target", rel.target, None, note=rel.kind))
+            return None
+
+        constraint_type = _RELATION_KIND_TO_CONSTRAINT_TYPE[rel.kind]
+        if rel.kind in _SUBJECT_AS_PARENT_KINDS:
+            parent, child = rel.subject, None
+        else:
+            if rel.target is None:
+                self.trace.append(
+                    TraceEvent(
+                        f"{stage_prefix}.missing_target",
+                        rel.subject,
+                        None,
+                        note=f"kind={rel.kind!r} requires a target; skipping",
+                    )
+                )
+                return None
+            parent, child = rel.target, rel.subject
+
+        child_part = f"_{child}" if child is not None else ""
+        constraint_id = f"{_INITIAL_STATE_SPEC_ID}_{index}_{rel.kind}_{parent}{child_part}"
+        self.trace.append(TraceEvent(f"{stage_prefix}.ok", rel.subject, rel.target, note=rel.kind))
+        return ArenaEnvGraphSpatialConstraintSpec(
+            id=constraint_id,
+            type=constraint_type,
+            parent=parent,
+            child=child,
+            params=dict(rel.params),
+        )
+
+    def _build_task_specs(self, tasks: list[Task], known_ids: set[str]) -> list[ArenaEnvGraphTaskSpec]:
+        out: list[ArenaEnvGraphTaskSpec] = []
+        for index, task in enumerate(tasks):
+            self.trace.append(
+                TraceEvent(
+                    "task.resolve",
+                    task.kind,
+                    task.kind,
+                    note=f"subject={task.subject}, target={task.target}",
+                )
+            )
+            if task.subject not in known_ids:
+                self.trace.append(TraceEvent("task.unknown_subject", task.subject, None, note=f"task kind={task.kind}"))
+            if task.target is not None and task.target not in known_ids:
+                self.trace.append(TraceEvent("task.unknown_target", task.target, None, note=f"task kind={task.kind}"))
+            out.append(
+                ArenaEnvGraphTaskSpec(
+                    id=f"task_{index}_{task.kind}",
+                    type=task.kind,
+                    initial_state_spec_id=_INITIAL_STATE_SPEC_ID,
+                    # Points at an empty placeholder state spec emitted by
+                    # resolve(); downstream task-graph synthesis fills it in.
+                    success_state_spec_id=_success_state_spec_id(index),
+                    task_args={
+                        "subject": task.subject,
+                        "target": task.target,
+                        "description": task.description,
+                    },
+                )
+            )
+        return out
+
+    # ------------------------------------------------------------------
+    # Asset binding helpers (use self.trace directly)
+    # ------------------------------------------------------------------
+
+    def _resolve_item(self, item: Item) -> type | None:
+        if self.registry.is_registered(item.query):
+            self.trace.append(TraceEvent("item.exact", item.query, item.query))
+            return self.registry.get_asset_by_name(item.query)
+
+        object_pool = self._pool_for(["object"])
+
+        if item.category_tags:
+            pool = self._pool_for(item.category_tags)
+            if not pool:
+                self.trace.append(
+                    TraceEvent(
+                        "item.tag_pool_empty",
+                        item.query,
+                        None,
+                        note=f"no assets matched tags={item.category_tags}; relaxing to objects",
+                    )
+                )
+            else:
+                cls = self._best_match(item.query, pool, stage_prefix="item.in_tags", note=f"tags={item.category_tags}")
+                if cls is not None:
+                    return cls
+                self.trace.append(
+                    TraceEvent(
+                        "item.no_match_in_tags",
+                        item.query,
+                        None,
+                        candidates=pool[:10],
+                        note=f"tags={item.category_tags}; relaxing to objects",
+                    )
+                )
+
+        cls = self._best_match(
+            item.query, object_pool, stage_prefix="item.relaxed", note="closest object; category ignored"
+        )
+        if cls is not None:
+            return cls
+
+        self.trace.append(TraceEvent("item.miss", item.query, None, candidates=object_pool[:10]))
+        return None
+
+    def _best_match(self, query: str, pool: list[str], stage_prefix: str, note: str) -> type | None:
+        """Prefer substring containment (e.g. 'bowl' → 'bowl_ycb_robolab'), then difflib fuzzy."""
+        q = query.lower()
+        substrs = [p for p in pool if q in p.lower()]
+        if substrs:
+            chosen = min(substrs, key=len)
+            self.trace.append(TraceEvent(f"{stage_prefix}.substring", query, chosen, candidates=substrs[:5], note=note))
+            return self.registry.get_asset_by_name(chosen)
+
+        matches = get_close_matches(query, pool, n=3, cutoff=0.5)
+        if matches:
+            self.trace.append(TraceEvent(f"{stage_prefix}.fuzzy", query, matches[0], candidates=matches, note=note))
+            return self.registry.get_asset_by_name(matches[0])
+        return None
+
+    def _pool_for(self, tags: list[str]) -> list[str]:
+        # Intersection across tags — an item tagged {"vegetable", "graspable"}
+        # must satisfy both.
+        assets = None
+        for tag in tags:
+            tagged = {a.name for a in self.registry.get_assets_by_tag(tag)}
+            assets = tagged if assets is None else assets & tagged
+        return sorted(assets or [])
+
+    def _resolve_name(self, name: str, required_tag: str | None) -> type | None:
+        if self.registry.is_registered(name):
+            cls = self.registry.get_asset_by_name(name)
+            if required_tag and required_tag not in getattr(cls, "tags", []):
+                self.trace.append(TraceEvent("name.wrong_tag", name, None, note=f"expected tag {required_tag!r}"))
+                return None
+            self.trace.append(TraceEvent("name.exact", name, name))
+            return cls
+
+        pool = self._pool_for([required_tag]) if required_tag else self.registry.get_all_keys()
+        matches = get_close_matches(name, pool, n=3, cutoff=0.5)
+        if matches:
+            self.trace.append(TraceEvent("name.fuzzy", name, matches[0], candidates=matches))
+            return self.registry.get_asset_by_name(matches[0])
+
+        self.trace.append(TraceEvent("name.miss", name, None, candidates=pool[:10]))
+        return None
+
+    def _resolve_embodiment(self, name: str) -> str:
+        if self.registry.is_registered(name):
+            self.trace.append(TraceEvent("embodiment.exact", name, name))
+            return name
+
+        lower = name.lower()
+        if lower in IK_DEFAULTS:
+            chosen = IK_DEFAULTS[lower]
+            self.trace.append(
+                TraceEvent("embodiment.ik_default", name, chosen, note=f"bare family {name!r} → IK variant")
+            )
+            return chosen
+
+        embodiment_pool = self._pool_for(["embodiment"])
+        matches = get_close_matches(name, embodiment_pool, n=3, cutoff=0.5)
+        if matches:
+            self.trace.append(TraceEvent("embodiment.fuzzy", name, matches[0], candidates=matches))
+            return matches[0]
+        self.trace.append(TraceEvent("embodiment.miss", name, None, note="falling back to franka_ik"))
+        return "franka_ik"
diff --git a/isaaclab_arena/llm_env_gen/try_schema.py b/isaaclab_arena/llm_env_gen/try_schema.py
index 07afdb234..a45784663 100644
--- a/isaaclab_arena/llm_env_gen/try_schema.py
+++ b/isaaclab_arena/llm_env_gen/try_schema.py
@@ -3,11 +3,15 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-"""Run the LLM parser on a prompt and print the resulting SceneSpec.
+"""Run the LLM parser on a prompt and dump the resolved ArenaEnvGraphSpec.
 
 Must run inside the Docker container (needs AssetRegistry). Requires
 NV_API_KEY and the `openai` pip package.
 
+Output: the resolved spec is always written to
+``isaaclab_arena_environments/llm_generated/<env_name>_proposal.yaml`` (in
+addition to being printed to stdout).
+
 Examples:
     # Print the Pydantic SceneSpec JSON schema (no LLM call):
     /isaac-sim/python.sh -m isaaclab_arena.llm_env_gen.try_schema --print-schema
@@ -15,7 +19,7 @@
     # Print the catalog sent to the LLM (no LLM call):
     /isaac-sim/python.sh -m isaaclab_arena.llm_env_gen.try_schema --print-catalog
 
-    # Call the LLM and print the parsed SceneSpec:
+    # Call the LLM, resolve, print, and dump YAML:
     /isaac-sim/python.sh -m isaaclab_arena.llm_env_gen.try_schema \
         --prompt "franka pick up avocado from the table and place it into a bowl on the table. there are other veggies on the table as distractor"
 """
@@ -24,6 +28,7 @@
 
 import argparse
 import json
+from pathlib import Path
 
 DEFAULT_PROMPT = (
     "franka pick up avocado from the table and place it into a bowl on the table. "
@@ -34,6 +39,11 @@
     " There are other utensils on the table as distractor"
 )
 
+# Resolved-spec dumps land here so they're easy to find next to the existing
+# auto-generated env modules. Path is computed from this file so it works
+# inside the container (/workspaces/isaaclab_arena) and outside.
+_LLM_GENERATED_DIR = Path(__file__).resolve().parents[2] / "isaaclab_arena_environments" / "llm_generated"
+
 
 def main() -> None:
     parser = argparse.ArgumentParser(description=__doc__)
@@ -42,6 +52,18 @@ def main() -> None:
     parser.add_argument("--temperature", type=float, default=0.2)
     parser.add_argument("--print-schema", action="store_true")
     parser.add_argument("--print-catalog", action="store_true")
+    parser.add_argument(
+        "--background",
+        type=str,
+        default="maple_table_robolab",
+        help=(
+            "Override the background chosen by the LLM (e.g. 'office_table' "
+            "or 'kitchen'). Default is 'maple_table_robolab' because its "
+            "tabletop ObjectReference yields a clean bbox and stable "
+            "placement, unlike the rotated plain 'table' background. Pass "
+            "an empty string ('') to keep the LLM's choice."
+        ),
+    )
     args = parser.parse_args()
 
     from isaaclab_arena.llm_env_gen.schema import SceneSpec
@@ -64,6 +86,64 @@ def main() -> None:
     print("=== raw LLM response ===")
     print(raw)
 
+    if args.background and args.background != spec.background:
+        # Swap the background name wherever it appears so downstream code
+        # (resolver, proposer) sees a consistent scene. Relations whose
+        # target was the old background get rewired to the new one.
+        old_bg = spec.background
+        new_bg = args.background
+        for rel in spec.initial_scene_graph:
+            if rel.target == old_bg:
+                rel.target = new_bg
+        # Note: tasks don't directly reference background in target (typically None or items),
+        # so no background substitution needed in task.target
+        spec.background = new_bg
+        print(f"\n=== background override applied: {old_bg!r} -> {new_bg!r} ===")
+
+    print("\n=== parsed SceneSpec ===")
+    print(spec.model_dump_json(indent=2))
+
+    from isaaclab_arena.llm_env_gen.resolver import Resolver
+
+    resolver = Resolver()
+    env_graph_spec = resolver.resolve(spec)
+
+    print(f"\n=== resolved ArenaEnvGraphSpec (env_name={env_graph_spec.env_name!r}) ===")
+
+    print("\nnodes:")
+    for node in env_graph_spec.nodes:
+        params_str = f"  params={node.params}" if node.params else ""
+        print(f"  {node.id:24s} type={node.type.value:18s} name={node.name}{params_str}")
+
+    print("\nstate_specs:")
+    for state_spec in env_graph_spec.state_specs:
+        s_count = len(state_spec.spatial_constraints)
+        t_count = len(state_spec.task_constraints)
+        print(f"  {state_spec.id:24s} spatial={s_count} task={t_count}")
+        for c in state_spec.spatial_constraints:
+            child_str = f", child={c.child}" if c.child else ""
+            params_str = f"  params={c.params}" if c.params else ""
+            print(f"    {c.type.value:16s} parent={c.parent}{child_str}{params_str}")
+        for c in state_spec.task_constraints:
+            print(f"    {c.type.value:16s} parent={c.parent}  child={c.child}")
+
+    print("\ntasks:")
+    for task in env_graph_spec.tasks:
+        print(
+            f"  {task.id:28s} type={task.type:18s} "
+            f"initial={task.initial_state_spec_id!r} success={task.success_state_spec_id!r}"
+        )
+        print(f"    task_args: {task.task_args}")
+
+    print("\n=== trace ===")
+    for t in resolver.trace:
+        chosen = t.chosen if t.chosen is not None else "<none>"
+        extra = f"  [{t.note}]" if t.note else ""
+        print(f"  {t.stage:34s} {t.query!s:24s} -> {chosen}{extra}")
+
+    out_path = env_graph_spec.to_yaml(_LLM_GENERATED_DIR / f"{env_graph_spec.env_name}_proposal.yaml")
+    print(f"\n=== wrote ArenaEnvGraphSpec YAML to {out_path} ===")
+
 
 if __name__ == "__main__":
     main()

From 3162272477d2108cd535538b29096f0fde0f13d0 Mon Sep 17 00:00:00 2001
From: Qian Lin <qianl@nvidia.com>
Date: Tue, 26 May 2026 18:16:06 +0800
Subject: [PATCH 03/41] Move assert_unique_ids + assert_references_exist to
 __post_init__ so they always run, not only when loading from yaml/dict

---
 .../environments/arena_env_graph_spec.py      | 23 +++++++++++--------
 isaaclab_arena/llm_env_gen/resolver.py        |  7 ------
 2 files changed, 13 insertions(+), 17 deletions(-)

diff --git a/isaaclab_arena/environments/arena_env_graph_spec.py b/isaaclab_arena/environments/arena_env_graph_spec.py
index c6827a7ac..bf80433f4 100644
--- a/isaaclab_arena/environments/arena_env_graph_spec.py
+++ b/isaaclab_arena/environments/arena_env_graph_spec.py
@@ -149,6 +149,16 @@ class ArenaEnvGraphSpec:
     tasks: list[ArenaEnvGraphTaskSpec] = field(default_factory=list)
     state_specs: list[ArenaEnvGraphStateSpec] = field(default_factory=list)
 
+    def __post_init__(self) -> None:
+        # Enforce graph invariants on EVERY construction path (YAML parse, direct
+        # dataclass instantiation, programmatic build, ...). Centralizing here means
+        # downstream consumers — including ``nodes_by_id`` / ``tasks_by_id`` /
+        # ``state_specs_by_id``, which collapse duplicates silently in their dict
+        # comprehensions — can rely on globally-unique ids and valid references
+        # without re-validating.
+        assert_unique_ids(self.nodes, self.tasks, self.state_specs)
+        assert_references_exist(self.nodes, self.tasks, self.state_specs)
+
     @classmethod
     def from_yaml(cls, path: str | Path) -> "ArenaEnvGraphSpec":
         with Path(path).open("r", encoding="utf-8") as f:
@@ -157,18 +167,11 @@ def from_yaml(cls, path: str | Path) -> "ArenaEnvGraphSpec":
     @classmethod
     def from_dict(cls, data: dict[str, Any]) -> "ArenaEnvGraphSpec":
         data = as_dict(data, "Env graph spec")
-        nodes = parse_list(data, "nodes", _parse_node)
-        tasks = parse_list(data, "tasks", _parse_task)
-        state_specs = parse_list(data, "state_specs", _parse_state_spec)
-
-        assert_unique_ids(nodes, tasks, state_specs)
-        assert_references_exist(nodes, tasks, state_specs)
-
         return cls(
             env_name=required_str(data, "env_name"),
-            nodes=nodes,
-            tasks=tasks,
-            state_specs=state_specs,
+            nodes=parse_list(data, "nodes", _parse_node),
+            tasks=parse_list(data, "tasks", _parse_task),
+            state_specs=parse_list(data, "state_specs", _parse_state_spec),
         )
 
     def to_dict(self) -> dict[str, Any]:
diff --git a/isaaclab_arena/llm_env_gen/resolver.py b/isaaclab_arena/llm_env_gen/resolver.py
index 616475b6c..04f051442 100644
--- a/isaaclab_arena/llm_env_gen/resolver.py
+++ b/isaaclab_arena/llm_env_gen/resolver.py
@@ -35,7 +35,6 @@
     ArenaEnvGraphStateSpec,
     ArenaEnvGraphTaskSpec,
 )
-from isaaclab_arena.environments.graph_spec_utils import assert_references_exist, assert_unique_ids
 
 from .schema import Item, Relation, SceneSpec, Task
 
@@ -150,12 +149,6 @@ def resolve(self, spec: SceneSpec, env_name: str | None = None) -> ArenaEnvGraph
             state_specs=state_specs,
         )
 
-        # Defensive: the resolver owns every id and reference it emits, so
-        # these invariants should always hold. Catching a violation here
-        # surfaces resolver bugs eagerly rather than at downstream consumers.
-        assert_unique_ids(env_graph_spec.nodes, env_graph_spec.tasks, env_graph_spec.state_specs)
-        assert_references_exist(env_graph_spec.nodes, env_graph_spec.tasks, env_graph_spec.state_specs)
-
         return env_graph_spec
 
     @staticmethod

From 06d4b0e0d7818e159d307edbf39942b5a761823f Mon Sep 17 00:00:00 2001
From: Qian Lin <qianl@nvidia.com>
Date: Tue, 26 May 2026 19:02:12 +0800
Subject: [PATCH 04/41] Update schema to better support unary relations

---
 isaaclab_arena/llm_env_gen/resolver.py | 21 +++++----------------
 isaaclab_arena/llm_env_gen/schema.py   | 14 +++++++++++++-
 2 files changed, 18 insertions(+), 17 deletions(-)

diff --git a/isaaclab_arena/llm_env_gen/resolver.py b/isaaclab_arena/llm_env_gen/resolver.py
index 04f051442..a2c8484fa 100644
--- a/isaaclab_arena/llm_env_gen/resolver.py
+++ b/isaaclab_arena/llm_env_gen/resolver.py
@@ -74,11 +74,6 @@ def _success_state_spec_id(task_index: int) -> str:
     "is_anchor": ArenaEnvGraphSpatialConstraintType.IS_ANCHOR,
 }
 
-# Relation kinds whose semantic anchor is the *subject* (no child). For these
-# we set parent=subject and leave child=None. Everything else uses
-# parent=target, child=subject (e.g. on/in/next_to).
-_SUBJECT_AS_PARENT_KINDS: frozenset[str] = frozenset({"is_anchor", "at_position"})
-
 
 @dataclass
 class TraceEvent:
@@ -251,19 +246,13 @@ def _build_spatial_constraint(
             return None
 
         constraint_type = _RELATION_KIND_TO_CONSTRAINT_TYPE[rel.kind]
-        if rel.kind in _SUBJECT_AS_PARENT_KINDS:
+        # ``target is None`` is the unary signal from the schema (e.g. ``is_anchor``,
+        # ``at_position``) — see ``Relation.target`` in ``schema.py``. Binary
+        # relations (on / in / next_to / ...) provide a target; the subject
+        # becomes the child anchored on the target.
+        if rel.target is None:
             parent, child = rel.subject, None
         else:
-            if rel.target is None:
-                self.trace.append(
-                    TraceEvent(
-                        f"{stage_prefix}.missing_target",
-                        rel.subject,
-                        None,
-                        note=f"kind={rel.kind!r} requires a target; skipping",
-                    )
-                )
-                return None
             parent, child = rel.target, rel.subject
 
         child_part = f"_{child}" if child is not None else ""
diff --git a/isaaclab_arena/llm_env_gen/schema.py b/isaaclab_arena/llm_env_gen/schema.py
index 33f707503..c22370173 100644
--- a/isaaclab_arena/llm_env_gen/schema.py
+++ b/isaaclab_arena/llm_env_gen/schema.py
@@ -50,10 +50,22 @@ class Item(BaseModel):
 
 
 class Relation(BaseModel):
-    """A spatial / structural relation between two items (or on one item)."""
+    """A spatial / structural relation between items.
+
+    Binary kinds (``on``, ``in``, ``next_to``, ...) must set ``target`` to the
+    other item — semantics is "subject is in relation to target". Unary kinds
+    (``is_anchor``, ``at_position``, ...) describe an intrinsic property of
+    ``subject`` alone and must leave ``target`` as ``None``. The downstream
+    resolver uses ``target is None`` as the single signal to distinguish the
+    two — see ``Resolver._build_spatial_constraint``.
+    """
 
     kind: RelationKind
     subject: str
+    # ``None`` for unary relations (the subject is the anchor); a string for
+    # binary relations (subject is anchored on this target). The resolver
+    # branches on this field rather than maintaining a kind-specific allowlist,
+    # so populating it correctly is part of the LLM's contract.
     target: str | None = None
     params: dict = Field(default_factory=dict)
 

From 087c498553bf3e3d0242559a61a69eb53ad53b11 Mon Sep 17 00:00:00 2001
From: Qian Lin <qianl@nvidia.com>
Date: Tue, 26 May 2026 22:23:29 +0800
Subject: [PATCH 05/41] Simplify relation types in resolver

---
 isaaclab_arena/llm_env_gen/llm_agent.py | 11 +++----
 isaaclab_arena/llm_env_gen/resolver.py  | 42 +++++++------------------
 isaaclab_arena/llm_env_gen/schema.py    |  6 +++-
 3 files changed, 20 insertions(+), 39 deletions(-)

diff --git a/isaaclab_arena/llm_env_gen/llm_agent.py b/isaaclab_arena/llm_env_gen/llm_agent.py
index 5d9575c45..65daff5c9 100644
--- a/isaaclab_arena/llm_env_gen/llm_agent.py
+++ b/isaaclab_arena/llm_env_gen/llm_agent.py
@@ -104,14 +104,11 @@ def _system_prompt(self) -> str:
             "  tags. This is a PREFERENCE, not a hard filter — the resolver will fall back to the full\n"
             "  catalog if the tag pool is empty or yields no close match. Err toward emitting useful tags;\n"
             "  the trace will report what was relaxed.\n"
-            "- relation.kind ∈ {on, in, next_to, at_position, is_anchor, open, closed}.\n"
+            "- relation.kind ∈ {on, in, next_to, at_position, is_anchor}. Spatial relations only —\n"
+            "  articulated-state changes are expressed via tasks below, not as relations.\n"
             "  subject/target reference items by their query string or the background name.\n"
-            "  * 'on' / 'in' / other spatial relations describe object placement in the initial scene.\n"
-            "  * 'open' and 'closed' are UNARY state markers for articulated objects (microwave, fridge,\n"
-            "    cabinet) in the initial scene. Their target MUST be null. They describe the *initial*\n"
-            "    state; the task list (below) specifies state changes.\n"
-            "  * Articulated objects (microwave etc.) need both a spatial 'on(microwave, background)'\n"
-            "    relation AND an 'open(microwave, null)' or 'closed(microwave, null)' state marker.\n"
+            "  * Articulated objects (microwave, fridge, cabinet) still need a spatial\n"
+            "    'on(<object>, background)' relation to anchor them.\n"
             "  * Distractor items around the appliance need 'on(distractor, background)' relations.\n"
             "- initial_scene_graph: FULL snapshot of all relations in the starting state. Every persistent\n"
             "  relation (e.g. bowl on table, distractors present) must appear here. Relations that change\n"
diff --git a/isaaclab_arena/llm_env_gen/resolver.py b/isaaclab_arena/llm_env_gen/resolver.py
index a2c8484fa..3dda87bdb 100644
--- a/isaaclab_arena/llm_env_gen/resolver.py
+++ b/isaaclab_arena/llm_env_gen/resolver.py
@@ -46,11 +46,6 @@
     "gr1": "gr1_pink",
 }
 
-# SceneSpec relation kinds that have no ArenaEnvGraphSpatialConstraintType
-# counterpart yet. Open/closed are task-state goals on articulated assets and
-# only become meaningful inside the task class, not as scene-graph edges.
-_UNSUPPORTED_RELATION_KINDS: frozenset[str] = frozenset({"open", "closed"})
-
 # id used for the single initial state spec the resolver emits.
 _INITIAL_STATE_SPEC_ID = "state_initial"
 
@@ -63,18 +58,6 @@ def _success_state_spec_id(task_index: int) -> str:
     return f"state_success_{task_index}"
 
 
-# Mapping from SceneSpec relation kinds to the spatial-constraint types used
-# inside an ArenaEnvGraphStateSpec. Keys must stay in sync with
-# isaaclab_arena.llm_env_gen.schema.RelationKind.
-_RELATION_KIND_TO_CONSTRAINT_TYPE: dict[str, ArenaEnvGraphSpatialConstraintType] = {
-    "on": ArenaEnvGraphSpatialConstraintType.ON,
-    "in": ArenaEnvGraphSpatialConstraintType.IN,
-    "next_to": ArenaEnvGraphSpatialConstraintType.NEXT_TO,
-    "at_position": ArenaEnvGraphSpatialConstraintType.AT_POSITION,
-    "is_anchor": ArenaEnvGraphSpatialConstraintType.IS_ANCHOR,
-}
-
-
 @dataclass
 class TraceEvent:
     """One step in the resolution pipeline — emitted to a structured log."""
@@ -208,16 +191,6 @@ def _build_spatial_constraint(
         self, rel: Relation, index: int, known_ids: set[str]
     ) -> ArenaEnvGraphSpatialConstraintSpec | None:
         stage_prefix = "relation.initial"
-        if rel.kind in _UNSUPPORTED_RELATION_KINDS:
-            self.trace.append(
-                TraceEvent(
-                    f"{stage_prefix}.unsupported_kind",
-                    rel.subject,
-                    None,
-                    note=f"kind={rel.kind!r} has no spatial-constraint counterpart; skipping",
-                )
-            )
-            return None
         if rel.kind == "in":
             self.trace.append(
                 TraceEvent(
@@ -228,13 +201,21 @@ def _build_spatial_constraint(
                 )
             )
             return None
-        if rel.kind not in _RELATION_KIND_TO_CONSTRAINT_TYPE:
+        # ``ArenaEnvGraphSpatialConstraintType(value)`` is the built-in value-based
+        # enum lookup — the schema's ``RelationKind`` literal strings are kept in
+        # 1:1 sync with this enum's values, so this single call replaces what used
+        # to be a hand-maintained dict + membership check. A ``ValueError`` here
+        # means the LLM produced a kind that pydantic's ``Literal`` should have
+        # rejected upstream — we still trace defensively in case of bypass.
+        try:
+            constraint_type = ArenaEnvGraphSpatialConstraintType(rel.kind)
+        except ValueError:
             self.trace.append(
                 TraceEvent(
-                    f"{stage_prefix}.unknown_kind",
+                    f"{stage_prefix}.unsupported_kind",
                     rel.subject,
                     None,
-                    note=f"kind={rel.kind!r} has no constraint mapping; skipping",
+                    note=f"kind={rel.kind!r} has no ArenaEnvGraphSpatialConstraintType counterpart; skipping",
                 )
             )
             return None
@@ -245,7 +226,6 @@ def _build_spatial_constraint(
             self.trace.append(TraceEvent(f"{stage_prefix}.unknown_target", rel.target, None, note=rel.kind))
             return None
 
-        constraint_type = _RELATION_KIND_TO_CONSTRAINT_TYPE[rel.kind]
         # ``target is None`` is the unary signal from the schema (e.g. ``is_anchor``,
         # ``at_position``) — see ``Relation.target`` in ``schema.py``. Binary
         # relations (on / in / next_to / ...) provide a target; the subject
diff --git a/isaaclab_arena/llm_env_gen/schema.py b/isaaclab_arena/llm_env_gen/schema.py
index c22370173..80509b46c 100644
--- a/isaaclab_arena/llm_env_gen/schema.py
+++ b/isaaclab_arena/llm_env_gen/schema.py
@@ -19,10 +19,14 @@
 
 # Relation kinds currently surfaced to the LLM. Mirror the subset of
 # isaaclab_arena.relations.relations that makes sense for tabletop prompts.
+# Values must match the corresponding ``ArenaEnvGraphSpatialConstraintType``
+# enum values one-to-one — the resolver looks the constraint type up via
+# ``ArenaEnvGraphSpatialConstraintType(kind)`` rather than maintaining a
+# parallel dict.
 # "in" has no In class in isaaclab_arena.relations.relations yet — see the
 # TODO there. The scene builder materializes goal-state "in" relations as
 # the task's success predicate.
-RelationKind = Literal["on", "in", "next_to", "at_position", "is_anchor", "open", "closed"]
+RelationKind = Literal["on", "in", "next_to", "at_position", "is_anchor"]
 
 ItemRole = Literal["foreground", "distractor", "anchor"]
 

From cf38ec3050f73fa6aef2e4b41b8af8bb45731540 Mon Sep 17 00:00:00 2001
From: Qian Lin <qianl@nvidia.com>
Date: Tue, 26 May 2026 22:24:39 +0800
Subject: [PATCH 06/41] Fix copyright years

---
 isaaclab_arena/llm_env_gen/llm_agent.py  | 2 +-
 isaaclab_arena/llm_env_gen/resolver.py   | 2 +-
 isaaclab_arena/llm_env_gen/schema.py     | 2 +-
 isaaclab_arena/llm_env_gen/try_schema.py | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/isaaclab_arena/llm_env_gen/llm_agent.py b/isaaclab_arena/llm_env_gen/llm_agent.py
index 65daff5c9..f6402b409 100644
--- a/isaaclab_arena/llm_env_gen/llm_agent.py
+++ b/isaaclab_arena/llm_env_gen/llm_agent.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2025-2026, The Isaac Lab Arena Project Developers (https://github.com/isaac-sim/IsaacLab-Arena/blob/main/CONTRIBUTORS.md).
+# Copyright (c) 2026, The Isaac Lab Arena Project Developers (https://github.com/isaac-sim/IsaacLab-Arena/blob/main/CONTRIBUTORS.md).
 # All rights reserved.
 #
 # SPDX-License-Identifier: Apache-2.0
diff --git a/isaaclab_arena/llm_env_gen/resolver.py b/isaaclab_arena/llm_env_gen/resolver.py
index 3dda87bdb..6377a1a05 100644
--- a/isaaclab_arena/llm_env_gen/resolver.py
+++ b/isaaclab_arena/llm_env_gen/resolver.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2025-2026, The Isaac Lab Arena Project Developers (https://github.com/isaac-sim/IsaacLab-Arena/blob/main/CONTRIBUTORS.md).
+# Copyright (c) 2026, The Isaac Lab Arena Project Developers (https://github.com/isaac-sim/IsaacLab-Arena/blob/main/CONTRIBUTORS.md).
 # All rights reserved.
 #
 # SPDX-License-Identifier: Apache-2.0
diff --git a/isaaclab_arena/llm_env_gen/schema.py b/isaaclab_arena/llm_env_gen/schema.py
index 80509b46c..47b8cc821 100644
--- a/isaaclab_arena/llm_env_gen/schema.py
+++ b/isaaclab_arena/llm_env_gen/schema.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2025-2026, The Isaac Lab Arena Project Developers (https://github.com/isaac-sim/IsaacLab-Arena/blob/main/CONTRIBUTORS.md).
+# Copyright (c) 2026, The Isaac Lab Arena Project Developers (https://github.com/isaac-sim/IsaacLab-Arena/blob/main/CONTRIBUTORS.md).
 # All rights reserved.
 #
 # SPDX-License-Identifier: Apache-2.0
diff --git a/isaaclab_arena/llm_env_gen/try_schema.py b/isaaclab_arena/llm_env_gen/try_schema.py
index a45784663..656eb3ad4 100644
--- a/isaaclab_arena/llm_env_gen/try_schema.py
+++ b/isaaclab_arena/llm_env_gen/try_schema.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2025-2026, The Isaac Lab Arena Project Developers (https://github.com/isaac-sim/IsaacLab-Arena/blob/main/CONTRIBUTORS.md).
+# Copyright (c) 2026, The Isaac Lab Arena Project Developers (https://github.com/isaac-sim/IsaacLab-Arena/blob/main/CONTRIBUTORS.md).
 # All rights reserved.
 #
 # SPDX-License-Identifier: Apache-2.0

From 7e5f58e5dbb809d3ec5a833c064afe702db912e5 Mon Sep 17 00:00:00 2001
From: Qian Lin <qianl@nvidia.com>
Date: Tue, 26 May 2026 22:34:57 +0800
Subject: [PATCH 07/41] Add unit test

---
 isaaclab_arena/tests/test_resolver.py | 417 ++++++++++++++++++++++++++
 1 file changed, 417 insertions(+)
 create mode 100644 isaaclab_arena/tests/test_resolver.py

diff --git a/isaaclab_arena/tests/test_resolver.py b/isaaclab_arena/tests/test_resolver.py
new file mode 100644
index 000000000..1db0a2bdf
--- /dev/null
+++ b/isaaclab_arena/tests/test_resolver.py
@@ -0,0 +1,417 @@
+# Copyright (c) 2026, The Isaac Lab Arena Project Developers (https://github.com/isaac-sim/IsaacLab-Arena/blob/main/CONTRIBUTORS.md).
+# All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""Unit tests for :class:`isaaclab_arena.llm_env_gen.resolver.Resolver`.
+
+The resolver is pure Python — no Isaac Sim / Kit / pxr dependency — so these
+tests run as plain pytest functions against an injected fake AssetRegistry.
+They exercise the resolver's deterministic logic in isolation: asset binding
+strategies (exact / substring / fuzzy / tag-pool relaxation / miss),
+embodiment family defaults, spatial constraint construction (binary vs unary
+relations, ``in`` skipping, unknown-node defensive traces), task spec wiring,
+and trace lifecycle.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+from isaaclab_arena.environments.arena_env_graph_spec import ArenaEnvGraphNodeType, ArenaEnvGraphSpatialConstraintType
+from isaaclab_arena.llm_env_gen.resolver import IK_DEFAULTS, Resolver
+from isaaclab_arena.llm_env_gen.schema import Item, Relation, SceneSpec, Task
+
+# ---------------------------------------------------------------------------
+# Test fixtures
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class FakeAsset:
+    """Minimal stand-in for the asset classes the resolver inspects.
+
+    Real asset classes are decorated classes pulled in via
+    ``ensure_assets_registered()``. The resolver only ever reads ``.name`` and
+    ``.tags`` off them, so a simple dataclass suffices and keeps the tests
+    independent of isaaclab / Kit.
+    """
+
+    name: str
+    tags: list[str]
+
+
+class FakeAssetRegistry:
+    """Duck-typed AssetRegistry for unit tests.
+
+    Implements the four methods the resolver calls — ``is_registered``,
+    ``get_asset_by_name``, ``get_assets_by_tag``, ``get_all_keys`` — without
+    pulling in isaaclab. We deliberately don't subclass :class:`AssetRegistry`
+    directly because it uses ``SingletonMeta``, which would force test
+    isolation gymnastics. Duck-typing via the resolver's ``registry`` argument
+    is the supported injection point.
+    """
+
+    def __init__(self, assets: list[FakeAsset]):
+        self._by_name: dict[str, FakeAsset] = {a.name: a for a in assets}
+
+    def is_registered(self, key: str) -> bool:
+        return key in self._by_name
+
+    def get_asset_by_name(self, name: str) -> FakeAsset:
+        assert name in self._by_name, f"unregistered asset: {name}"
+        return self._by_name[name]
+
+    def get_assets_by_tag(self, tag: str) -> list[FakeAsset]:
+        return [a for a in self._by_name.values() if tag in a.tags]
+
+    def get_all_keys(self) -> list[str]:
+        return list(self._by_name)
+
+
+def _default_assets() -> list[FakeAsset]:
+    """Small but representative catalog covering all three asset categories.
+
+    Object names intentionally include the suffix conventions seen in the
+    real registry (e.g. ``bowl_ycb_robolab``) so substring-match tests
+    exercise realistic fuzzy/substring behaviour.
+    """
+    return [
+        FakeAsset(name="maple_table", tags=["background"]),
+        FakeAsset(name="franka_ik", tags=["embodiment"]),
+        FakeAsset(name="franka_joint_pos", tags=["embodiment"]),
+        FakeAsset(name="bowl_ycb_robolab", tags=["object", "bowl"]),
+        FakeAsset(name="avocado01_fruits_robolab", tags=["object", "fruit"]),
+        FakeAsset(name="apple01_fruits_robolab", tags=["object", "fruit"]),
+        FakeAsset(name="cracker_box", tags=["object", "graspable"]),
+    ]
+
+
+def _make_resolver(assets: list[FakeAsset] | None = None) -> Resolver:
+    return Resolver(registry=FakeAssetRegistry(assets or _default_assets()))
+
+
+def _make_scene(
+    *,
+    background: str = "maple_table",
+    embodiment: str = "franka_ik",
+    items: list[Item] | None = None,
+    initial_scene_graph: list[Relation] | None = None,
+    tasks: list[Task] | None = None,
+) -> SceneSpec:
+    """Build a :class:`SceneSpec` with sane defaults for tests that don't care."""
+    return SceneSpec(
+        task_description="test scene",
+        background=background,
+        embodiment=embodiment,
+        items=items or [],
+        initial_scene_graph=initial_scene_graph or [],
+        # ``tasks`` must be non-empty per SceneSpec validator.
+        tasks=tasks
+        or [Task(kind="pick_and_place", subject="placeholder", target="placeholder", description="placeholder")],
+    )
+
+
+# ---------------------------------------------------------------------------
+# Top-level resolve()
+# ---------------------------------------------------------------------------
+
+
+def test_resolve_happy_path():
+    items = [
+        Item(query="bowl", role="foreground", category_tags=["bowl"]),
+        Item(query="avocado", role="foreground", category_tags=["fruit"]),
+    ]
+    initial = [
+        Relation(kind="is_anchor", subject="maple_table"),
+        Relation(kind="on", subject="bowl", target="maple_table"),
+        Relation(kind="on", subject="avocado", target="maple_table"),
+    ]
+    tasks = [Task(kind="pick_and_place", subject="avocado", target="bowl", description="put avocado in bowl")]
+    spec = _make_resolver().resolve(_make_scene(items=items, initial_scene_graph=initial, tasks=tasks))
+
+    # Auto-derived env_name: f"llm_gen_{background}_{first_task_kind}".
+    assert spec.env_name == "llm_gen_maple_table_pick_and_place"
+
+    # Node order: background, embodiment, items in declaration order.
+    node_ids = [n.id for n in spec.nodes]
+    assert node_ids == ["maple_table", "franka_ik", "bowl", "avocado"]
+    assert spec.nodes_by_id["maple_table"].type == ArenaEnvGraphNodeType.BACKGROUND
+    assert spec.nodes_by_id["franka_ik"].type == ArenaEnvGraphNodeType.EMBODIMENT
+    # Item node.name reflects the *resolved* asset name, not the query.
+    assert spec.nodes_by_id["bowl"].name == "bowl_ycb_robolab"
+    assert spec.nodes_by_id["avocado"].name == "avocado01_fruits_robolab"
+
+    # State specs: 1 initial + 1 success placeholder per task.
+    assert len(spec.state_specs) == 2
+    initial_state = spec.state_specs_by_id["state_initial"]
+    assert len(initial_state.spatial_constraints) == 3
+
+    is_anchor = initial_state.spatial_constraints[0]
+    assert is_anchor.type == ArenaEnvGraphSpatialConstraintType.IS_ANCHOR
+    assert is_anchor.parent == "maple_table"
+    assert is_anchor.child is None
+    assert is_anchor.id == "state_initial_0_is_anchor_maple_table"
+
+    on_bowl = initial_state.spatial_constraints[1]
+    assert on_bowl.type == ArenaEnvGraphSpatialConstraintType.ON
+    # Binary relations: parent=target, child=subject (the LLM says "bowl on
+    # table" — the resolver inverts so the table is the anchor).
+    assert on_bowl.parent == "maple_table"
+    assert on_bowl.child == "bowl"
+    assert on_bowl.id == "state_initial_1_on_maple_table_bowl"
+
+    # Tasks.
+    assert len(spec.tasks) == 1
+    task = spec.tasks_by_id["task_0_pick_and_place"]
+    assert task.initial_state_spec_id == "state_initial"
+    assert task.success_state_spec_id == "state_success_0"
+    assert task.task_args == {"subject": "avocado", "target": "bowl", "description": "put avocado in bowl"}
+
+
+def test_resolve_overrides_env_name():
+    resolver = _make_resolver()
+    spec = resolver.resolve(_make_scene(), env_name="my_custom_env")
+    assert spec.env_name == "my_custom_env"
+
+
+def test_resolve_clears_trace_between_calls():
+    resolver = _make_resolver()
+    resolver.resolve(_make_scene())
+    n_after_first = len(resolver.trace)
+    # Sanity: at least background, embodiment, and task events should be present.
+    assert n_after_first > 0
+
+    resolver.resolve(_make_scene())
+    n_after_second = len(resolver.trace)
+    # If trace persisted across calls, the second count would be > the first.
+    # Deterministic input → identical trace length when the trace is cleared.
+    assert n_after_second == n_after_first
+
+
+def test_resolve_with_empty_initial_scene_graph():
+    spec = _make_resolver().resolve(_make_scene(initial_scene_graph=[]))
+    initial_state = spec.state_specs_by_id["state_initial"]
+    assert initial_state.spatial_constraints == []
+    # Even with no constraints, the spec should still be well-formed.
+    assert initial_state.task_constraints == []
+
+
+# ---------------------------------------------------------------------------
+# Item resolution strategies
+# ---------------------------------------------------------------------------
+
+
+def test_item_exact_name_match():
+    # Query that's already a registered asset name skips fuzzy matching.
+    items = [Item(query="cracker_box", role="foreground", category_tags=["graspable"])]
+    resolver = _make_resolver()
+    spec = resolver.resolve(_make_scene(items=items))
+    assert spec.nodes_by_id["cracker_box"].name == "cracker_box"
+    assert any(e.stage == "item.exact" for e in resolver.trace)
+
+
+def test_item_substring_match_in_tag_pool():
+    items = [Item(query="bowl", role="foreground", category_tags=["bowl"])]
+    resolver = _make_resolver()
+    spec = resolver.resolve(_make_scene(items=items))
+    assert spec.nodes_by_id["bowl"].name == "bowl_ycb_robolab"
+    assert any(e.stage == "item.in_tags.substring" for e in resolver.trace)
+
+
+def test_item_relaxes_when_tag_pool_yields_no_match():
+    # category_tags points to a real tag pool ('fruit') but the query
+    # ('cracker') doesn't substring-match either fruit. The resolver should
+    # relax to the full object pool and find cracker_box.
+    items = [Item(query="cracker", role="foreground", category_tags=["fruit"])]
+    resolver = _make_resolver()
+    spec = resolver.resolve(_make_scene(items=items))
+    assert spec.nodes_by_id["cracker"].name == "cracker_box"
+    trace_stages = [e.stage for e in resolver.trace]
+    assert "item.no_match_in_tags" in trace_stages
+    assert any(s.startswith("item.relaxed") for s in trace_stages)
+
+
+def test_item_relaxes_when_tag_pool_empty():
+    # Unknown tag → empty tag pool → resolver short-circuits the pool-search
+    # and relaxes immediately.
+    items = [Item(query="cracker", role="foreground", category_tags=["nonexistent"])]
+    resolver = _make_resolver()
+    spec = resolver.resolve(_make_scene(items=items))
+    assert spec.nodes_by_id["cracker"].name == "cracker_box"
+    assert any(e.stage == "item.tag_pool_empty" for e in resolver.trace)
+
+
+def test_item_miss_omits_node():
+    # Query that matches no asset (and no substring / fuzzy candidate) is
+    # silently dropped — the resolver records a trace but doesn't raise.
+    items = [Item(query="zzz_no_match_anywhere", role="foreground", category_tags=["object"])]
+    resolver = _make_resolver()
+    spec = resolver.resolve(_make_scene(items=items))
+    assert "zzz_no_match_anywhere" not in spec.nodes_by_id
+    assert any(e.stage == "item.miss" for e in resolver.trace)
+
+
+def test_item_scale_param_passed_through():
+    items = [Item(query="bowl", role="foreground", category_tags=["bowl"], scale=0.75)]
+    spec = _make_resolver().resolve(_make_scene(items=items))
+    assert spec.nodes_by_id["bowl"].params == {"scale": 0.75}
+
+
+def test_item_instance_name_overrides_query_for_node_id():
+    items = [Item(query="bowl", role="foreground", category_tags=["bowl"], instance_name="serving_bowl")]
+    spec = _make_resolver().resolve(_make_scene(items=items))
+    # ``instance_name`` controls the *node id*; ``name`` still reflects the
+    # resolved asset, so the same asset can appear twice under different ids.
+    assert "serving_bowl" in spec.nodes_by_id
+    assert "bowl" not in spec.nodes_by_id
+    assert spec.nodes_by_id["serving_bowl"].name == "bowl_ycb_robolab"
+
+
+# ---------------------------------------------------------------------------
+# Embodiment resolution
+# ---------------------------------------------------------------------------
+
+
+def test_embodiment_exact_match():
+    spec = _make_resolver().resolve(_make_scene(embodiment="franka_joint_pos"))
+    assert spec.nodes_by_id["franka_joint_pos"].type == ArenaEnvGraphNodeType.EMBODIMENT
+
+
+def test_embodiment_ik_default_for_bare_family():
+    resolver = _make_resolver()
+    spec = resolver.resolve(_make_scene(embodiment="franka"))
+    # The mapping is exported from the resolver so callers can introspect it.
+    assert IK_DEFAULTS["franka"] == "franka_ik"
+    assert spec.nodes_by_id["franka_ik"].type == ArenaEnvGraphNodeType.EMBODIMENT
+    assert any(e.stage == "embodiment.ik_default" for e in resolver.trace)
+
+
+def test_embodiment_unknown_falls_back_to_franka_ik():
+    # Unknown family names never raise — they fall back to franka_ik and
+    # record a miss trace. ``franka_ik`` must therefore be registered.
+    resolver = _make_resolver()
+    spec = resolver.resolve(_make_scene(embodiment="totally_unknown_robot"))
+    assert spec.nodes_by_id["franka_ik"].type == ArenaEnvGraphNodeType.EMBODIMENT
+    assert any(e.stage == "embodiment.miss" for e in resolver.trace)
+
+
+# ---------------------------------------------------------------------------
+# Background resolution
+# ---------------------------------------------------------------------------
+
+
+def test_background_with_wrong_tag_omitted():
+    # An asset registered under the name "maple_table" but NOT tagged
+    # "background" is rejected with a name.wrong_tag trace, so the background
+    # node is absent from the resulting spec.
+    assets = [
+        FakeAsset(name="franka_ik", tags=["embodiment"]),
+        FakeAsset(name="maple_table", tags=["object"]),  # wrong tag
+    ]
+    resolver = _make_resolver(assets)
+    spec = resolver.resolve(_make_scene(background="maple_table"))
+    assert "maple_table" not in spec.nodes_by_id
+    assert any(e.stage == "name.wrong_tag" for e in resolver.trace)
+
+
+# ---------------------------------------------------------------------------
+# Spatial constraint construction
+# ---------------------------------------------------------------------------
+
+
+def test_spatial_constraint_binary_relation_id_and_parent_child():
+    items = [Item(query="cracker_box", role="foreground", category_tags=["graspable"])]
+    initial = [Relation(kind="on", subject="cracker_box", target="maple_table")]
+    spec = _make_resolver().resolve(_make_scene(items=items, initial_scene_graph=initial))
+    constraint = spec.state_specs_by_id["state_initial"].spatial_constraints[0]
+    # Binary: parent=target, child=subject.
+    assert constraint.parent == "maple_table"
+    assert constraint.child == "cracker_box"
+    assert constraint.id == "state_initial_0_on_maple_table_cracker_box"
+
+
+def test_spatial_constraint_unary_relation_id_and_parent_child():
+    initial = [Relation(kind="is_anchor", subject="maple_table")]
+    spec = _make_resolver().resolve(_make_scene(initial_scene_graph=initial))
+    constraint = spec.state_specs_by_id["state_initial"].spatial_constraints[0]
+    # Unary (target is None): parent=subject, child=None.
+    assert constraint.type == ArenaEnvGraphSpatialConstraintType.IS_ANCHOR
+    assert constraint.parent == "maple_table"
+    assert constraint.child is None
+    # No "_{child}" suffix when child is None.
+    assert constraint.id == "state_initial_0_is_anchor_maple_table"
+
+
+def test_spatial_constraint_in_relation_skipped():
+    items = [Item(query="cracker_box", role="foreground", category_tags=["graspable"])]
+    initial = [Relation(kind="in", subject="cracker_box", target="maple_table")]
+    resolver = _make_resolver()
+    spec = resolver.resolve(_make_scene(items=items, initial_scene_graph=initial))
+    # "in" has no initial-state semantics — see Resolver._build_spatial_constraint.
+    assert spec.state_specs_by_id["state_initial"].spatial_constraints == []
+    assert any(e.stage == "relation.initial.in_skipped" for e in resolver.trace)
+
+
+def test_spatial_constraint_unknown_subject_skipped():
+    initial = [Relation(kind="on", subject="not_a_node", target="maple_table")]
+    resolver = _make_resolver()
+    spec = resolver.resolve(_make_scene(initial_scene_graph=initial))
+    assert spec.state_specs_by_id["state_initial"].spatial_constraints == []
+    assert any(e.stage == "relation.initial.unknown_subject" for e in resolver.trace)
+
+
+def test_spatial_constraint_unknown_target_skipped():
+    initial = [Relation(kind="on", subject="maple_table", target="missing_node")]
+    resolver = _make_resolver()
+    spec = resolver.resolve(_make_scene(initial_scene_graph=initial))
+    assert spec.state_specs_by_id["state_initial"].spatial_constraints == []
+    assert any(e.stage == "relation.initial.unknown_target" for e in resolver.trace)
+
+
+def test_spatial_constraint_params_passed_through():
+    items = [Item(query="cracker_box", role="foreground", category_tags=["graspable"])]
+    initial = [
+        Relation(
+            kind="at_position",
+            subject="cracker_box",
+            params={"position_xyz": [0.1, 0.2, 0.3]},
+        ),
+    ]
+    spec = _make_resolver().resolve(_make_scene(items=items, initial_scene_graph=initial))
+    constraint = spec.state_specs_by_id["state_initial"].spatial_constraints[0]
+    assert constraint.type == ArenaEnvGraphSpatialConstraintType.AT_POSITION
+    # ``params`` are passed through verbatim — the resolver doesn't validate
+    # the schema of relation-kind-specific params; that's the downstream
+    # builder's job.
+    assert constraint.params == {"position_xyz": [0.1, 0.2, 0.3]}
+
+
+# ---------------------------------------------------------------------------
+# Task spec construction
+# ---------------------------------------------------------------------------
+
+
+def test_multiple_tasks_get_distinct_success_state_ids():
+    tasks = [
+        Task(kind="pick_and_place", subject="bowl", target="maple_table", description="d1"),
+        Task(kind="open_door", subject="bowl", target=None, description="d2"),
+        Task(kind="close_door", subject="bowl", target=None, description="d3"),
+    ]
+    items = [Item(query="bowl", role="foreground", category_tags=["bowl"])]
+    spec = _make_resolver().resolve(_make_scene(items=items, tasks=tasks))
+
+    # Task ids follow ``task_{index}_{kind}``.
+    task_ids = [t.id for t in spec.tasks]
+    assert task_ids == ["task_0_pick_and_place", "task_1_open_door", "task_2_close_door"]
+
+    # Each task points at its own per-task placeholder success state.
+    success_ids = [t.success_state_spec_id for t in spec.tasks]
+    assert success_ids == ["state_success_0", "state_success_1", "state_success_2"]
+
+    # state_specs contains 1 initial + 3 placeholder success specs.
+    assert len(spec.state_specs) == 4
+    for i in range(3):
+        # Placeholders are empty — downstream synthesis is responsible for them.
+        assert spec.state_specs_by_id[f"state_success_{i}"].spatial_constraints == []
+        assert spec.state_specs_by_id[f"state_success_{i}"].task_constraints == []

From ae99097f55ea8858482559ca43c6a0ca1ff9822c Mon Sep 17 00:00:00 2001
From: Qian Lin <qianl@nvidia.com>
Date: Wed, 27 May 2026 15:54:35 +0800
Subject: [PATCH 08/41] Add pydantic to runtime deps
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

schema.py imports BaseModel, Field and model_validator at module load —
not lazily — so pydantic must be declared in RUNTIME_DEPS. Caught by the
PR #718 reviewer; declaring it explicitly removes the silent reliance on
a transitive dependency.

Signed-off-by: Qian Lin <qianl@nvidia.com>
---
 setup.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/setup.py b/setup.py
index 80e447adb..1848b437c 100644
--- a/setup.py
+++ b/setup.py
@@ -16,6 +16,9 @@
     "pytest",
     # Used lazily by isaaclab_arena/llm_env_gen/* for NV_API_KEY-based LLM calls.
     "openai",
+    # Hard dependency of isaaclab_arena/llm_env_gen/schema.py (BaseModel / Field /
+    # model_validator imported at module load — not lazy).
+    "pydantic>=2.0",
 ]
 
 DEV_DEPS = [

From ef8f6945675af802aab7b132022a7441f595114d Mon Sep 17 00:00:00 2001
From: Qian Lin <qianl@nvidia.com>
Date: Wed, 27 May 2026 15:55:13 +0800
Subject: [PATCH 09/41] Use explicit ValueError for missing LLM API key

assert is a no-op under python -O, so the api_key guard in LLMAgent's
constructor could silently disappear in optimised runs. Switch to an
explicit raise so the failure mode is the same regardless of interpreter
flags. Surfaced by PR #718 review.

Signed-off-by: Qian Lin <qianl@nvidia.com>
---
 isaaclab_arena/llm_env_gen/llm_agent.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/isaaclab_arena/llm_env_gen/llm_agent.py b/isaaclab_arena/llm_env_gen/llm_agent.py
index f6402b409..d84ba0248 100644
--- a/isaaclab_arena/llm_env_gen/llm_agent.py
+++ b/isaaclab_arena/llm_env_gen/llm_agent.py
@@ -59,7 +59,11 @@ def __init__(
         from openai import OpenAI
 
         self.api_key = api_key or os.getenv("NV_API_KEY")
-        assert self.api_key, "API key required: set NV_API_KEY or pass api_key."
+        # Use an explicit raise instead of ``assert`` so the guard survives
+        # ``python -O`` (which strips asserts) — missing-key failures must be
+        # loud regardless of interpreter flags.
+        if not self.api_key:
+            raise ValueError("API key required: set NV_API_KEY or pass api_key.")
         self.model = model
         self.client = OpenAI(api_key=self.api_key, base_url=base_url)
 

From 1a07ba64c9ed9978845b4966e0eeba2128e94331 Mon Sep 17 00:00:00 2001
From: Qian Lin <qianl@nvidia.com>
Date: Wed, 27 May 2026 15:56:03 +0800
Subject: [PATCH 10/41] Rewrite unary relation subjects in background override

The --background override loop in try_schema.py only rewrote
``rel.target == old_bg``, missing unary relations like
``is_anchor(old_bg)`` whose subject is the old background name and
whose target is None. After the override, the unary constraint
referenced the now-unknown old background, the resolver emitted a
``relation.initial.unknown_subject`` trace, and the anchor declaration
was silently dropped from the resolved scene graph.

Also rewrite ``rel.subject == old_bg`` so the anchor relation survives
the rename. Greptile P1 from PR #718.

Signed-off-by: Qian Lin <qianl@nvidia.com>
---
 isaaclab_arena/llm_env_gen/try_schema.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/isaaclab_arena/llm_env_gen/try_schema.py b/isaaclab_arena/llm_env_gen/try_schema.py
index 656eb3ad4..f57a8c66e 100644
--- a/isaaclab_arena/llm_env_gen/try_schema.py
+++ b/isaaclab_arena/llm_env_gen/try_schema.py
@@ -88,11 +88,18 @@ def main() -> None:
 
     if args.background and args.background != spec.background:
         # Swap the background name wherever it appears so downstream code
-        # (resolver, proposer) sees a consistent scene. Relations whose
-        # target was the old background get rewired to the new one.
+        # (resolver, proposer) sees a consistent scene. Rewrite both
+        # ``rel.target`` (binary relations like ``on(bowl, table)``) AND
+        # ``rel.subject`` (unary relations like ``is_anchor(table)``);
+        # missing the subject case would leave the unary constraint
+        # pointing at the old background name, after which the resolver
+        # would emit a ``relation.initial.unknown_subject`` trace and
+        # silently drop the constraint.
         old_bg = spec.background
         new_bg = args.background
         for rel in spec.initial_scene_graph:
+            if rel.subject == old_bg:
+                rel.subject = new_bg
             if rel.target == old_bg:
                 rel.target = new_bg
         # Note: tasks don't directly reference background in target (typically None or items),

From 7fc2ba0c6170b29d3c18c2f894423b3ec405fba2 Mon Sep 17 00:00:00 2001
From: Qian Lin <qianl@nvidia.com>
Date: Wed, 27 May 2026 16:24:30 +0800
Subject: [PATCH 11/41] Expose at_pose as an LLM-emittable relation kind
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

ArenaEnvGraphSpatialConstraintType has an AT_POSE entry but RelationKind
omitted it, breaking the 1:1 contract that the resolver relies on for
its ``ArenaEnvGraphSpatialConstraintType(rel.kind)`` lookup. Add
"at_pose" to the Literal so the LLM can emit exact-pose placements (the
test fixtures already round-trip at_pose constraints through
ArenaEnvGraphSpec — they just couldn't originate from the LLM step).

Also expand the comment block above RelationKind to explain why the
solver-internal kinds (position_limits, random_around_solution,
rotate_around_solution) remain hidden from the LLM. xyao-nv comment on
PR #718.

The system prompt's hardcoded enumeration of relation kinds is updated
by hand in this commit; a follow-up commit replaces it with one that is
auto-derived from typing.get_args(RelationKind).

Signed-off-by: Qian Lin <qianl@nvidia.com>
---
 isaaclab_arena/llm_env_gen/llm_agent.py |  2 +-
 isaaclab_arena/llm_env_gen/schema.py    | 13 ++++++++-----
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/isaaclab_arena/llm_env_gen/llm_agent.py b/isaaclab_arena/llm_env_gen/llm_agent.py
index d84ba0248..82569408e 100644
--- a/isaaclab_arena/llm_env_gen/llm_agent.py
+++ b/isaaclab_arena/llm_env_gen/llm_agent.py
@@ -108,7 +108,7 @@ def _system_prompt(self) -> str:
             "  tags. This is a PREFERENCE, not a hard filter — the resolver will fall back to the full\n"
             "  catalog if the tag pool is empty or yields no close match. Err toward emitting useful tags;\n"
             "  the trace will report what was relaxed.\n"
-            "- relation.kind ∈ {on, in, next_to, at_position, is_anchor}. Spatial relations only —\n"
+            "- relation.kind ∈ {on, in, next_to, at_position, at_pose, is_anchor}. Spatial relations only —\n"
             "  articulated-state changes are expressed via tasks below, not as relations.\n"
             "  subject/target reference items by their query string or the background name.\n"
             "  * Articulated objects (microwave, fridge, cabinet) still need a spatial\n"
diff --git a/isaaclab_arena/llm_env_gen/schema.py b/isaaclab_arena/llm_env_gen/schema.py
index 47b8cc821..6ba0650e9 100644
--- a/isaaclab_arena/llm_env_gen/schema.py
+++ b/isaaclab_arena/llm_env_gen/schema.py
@@ -18,15 +18,18 @@
 from pydantic import BaseModel, Field, model_validator
 
 # Relation kinds currently surfaced to the LLM. Mirror the subset of
-# isaaclab_arena.relations.relations that makes sense for tabletop prompts.
-# Values must match the corresponding ``ArenaEnvGraphSpatialConstraintType``
-# enum values one-to-one — the resolver looks the constraint type up via
+# ``ArenaEnvGraphSpatialConstraintType`` that makes sense for tabletop
+# prompts; values must match the enum's values one-to-one because the
+# resolver looks the constraint type up via
 # ``ArenaEnvGraphSpatialConstraintType(kind)`` rather than maintaining a
-# parallel dict.
+# parallel dict. Solver-internal kinds (``position_limits``,
+# ``random_around_solution``, ``rotate_around_solution``) are intentionally
+# omitted — they describe how the placement solver explores poses and are
+# not natural for an LLM to emit.
 # "in" has no In class in isaaclab_arena.relations.relations yet — see the
 # TODO there. The scene builder materializes goal-state "in" relations as
 # the task's success predicate.
-RelationKind = Literal["on", "in", "next_to", "at_position", "is_anchor"]
+RelationKind = Literal["on", "in", "next_to", "at_position", "at_pose", "is_anchor"]
 
 ItemRole = Literal["foreground", "distractor", "anchor"]
 

From c941d644a3bca20a84fb3823f436a5a6916795da Mon Sep 17 00:00:00 2001
From: Qian Lin <qianl@nvidia.com>
Date: Wed, 27 May 2026 17:32:10 +0800
Subject: [PATCH 12/41] Derive prompt kind enumerations from schema literals
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The system prompt hardcoded the {on, in, next_to, at_position, at_pose,
is_anchor} and {pick_and_place, open_door, close_door} lists that the
LLM is allowed to emit, duplicating the pydantic Literal types in
schema.py. The previous commit had to remember to manually add at_pose
to both places — exactly the kind of drift this duplication invites.

Use typing.get_args(RelationKind) and typing.get_args(TaskKind) to
generate the enumerations in-place; the prompt now follows the schema
without intervention. xyao-nv comment on PR #718.

Signed-off-by: Qian Lin <qianl@nvidia.com>
---
 isaaclab_arena/llm_env_gen/llm_agent.py | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/isaaclab_arena/llm_env_gen/llm_agent.py b/isaaclab_arena/llm_env_gen/llm_agent.py
index 82569408e..641601f7d 100644
--- a/isaaclab_arena/llm_env_gen/llm_agent.py
+++ b/isaaclab_arena/llm_env_gen/llm_agent.py
@@ -14,8 +14,9 @@
 import contextlib
 import json
 import os
+from typing import get_args
 
-from .schema import SceneSpec
+from .schema import RelationKind, SceneSpec, TaskKind
 
 DEFAULT_BASE_URL = "https://inference-api.nvidia.com"
 DEFAULT_MODEL = "nvidia/deepseek-ai/deepseek-v4-flash"
@@ -95,6 +96,14 @@ def generate_spec(
 
     def _system_prompt(self) -> str:
         schema = json.dumps(SceneSpec.model_json_schema(), indent=2)
+        # Derive the enumerations the LLM is allowed to emit directly from
+        # the pydantic literal types so the prompt cannot drift out of sync
+        # when RelationKind / TaskKind change. Bare identifiers for
+        # relation kinds (e.g. ``on``), JSON-style quoted strings for task
+        # kinds (e.g. ``"pick_and_place"``) — matching the surrounding
+        # prose style.
+        relation_kinds = ", ".join(get_args(RelationKind))
+        task_kinds = ", ".join(f'"{k}"' for k in get_args(TaskKind))
         return (
             "You are a scene-generation parser for robot manipulation tasks.\n"
             "Convert a natural-language prompt into a SceneSpec JSON object that matches the schema below.\n\n"
@@ -108,7 +117,7 @@ def _system_prompt(self) -> str:
             "  tags. This is a PREFERENCE, not a hard filter — the resolver will fall back to the full\n"
             "  catalog if the tag pool is empty or yields no close match. Err toward emitting useful tags;\n"
             "  the trace will report what was relaxed.\n"
-            "- relation.kind ∈ {on, in, next_to, at_position, at_pose, is_anchor}. Spatial relations only —\n"
+            f"- relation.kind ∈ {{{relation_kinds}}}. Spatial relations only —\n"
             "  articulated-state changes are expressed via tasks below, not as relations.\n"
             "  subject/target reference items by their query string or the background name.\n"
             "  * Articulated objects (microwave, fridge, cabinet) still need a spatial\n"
@@ -118,7 +127,7 @@ def _system_prompt(self) -> str:
             "  relation (e.g. bowl on table, distractors present) must appear here. Relations that change\n"
             "  via tasks are still listed here in their starting form.\n"
             "- tasks: a list of atomic actions to perform in order. Each task has:\n"
-            '    * kind ∈ {"pick_and_place", "open_door", "close_door"}\n'
+            f"    * kind ∈ {{{task_kinds}}}\n"
             "    * subject: the primary object being acted on (e.g. 'avocado', 'microwave')\n"
             "    * target: the secondary object/location (e.g. 'bowl' for pick_and_place, null for open/close)\n"
             "    * description: natural-language summary of the task\n"

From 2d27374e03a1724ab027e3dc0b3518fed7478252 Mon Sep 17 00:00:00 2001
From: Qian Lin <qianl@nvidia.com>
Date: Wed, 27 May 2026 17:47:34 +0800
Subject: [PATCH 13/41] Replace _extract_json asserts with
 LLMResponseParseError
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The two error paths in LLMAgent._extract_json used an ``assert``
statement and an explicit ``raise AssertionError`` to signal "couldn't
find balanced JSON". The ``assert`` guard is brittle under ``python -O``
(assert statements are stripped, so the check silently disappears), and
in both cases the ``AssertionError`` type doesn't tell callers anything
about *why* the response was rejected.

Introduce ``LLMResponseParseError(ValueError)`` so:
  * the guard survives optimised mode,
  * callers can ``except LLMResponseParseError`` to retry only
    parse failures (without also catching pydantic validation errors
    from SceneSpec.model_validate), and
  * existing ``except ValueError`` clauses still match because the new
    exception is a ValueError subclass.

Truncate the raw response in the error message at 500 chars — enough
to debug a misbehaving prompt without making stack traces unreadable
when the model returns a long screed.

Addresses an isaaclab-review-bot comment on PR #718.

Signed-off-by: Qian Lin <qianl@nvidia.com>
---
 isaaclab_arena/llm_env_gen/llm_agent.py | 27 +++++++++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

diff --git a/isaaclab_arena/llm_env_gen/llm_agent.py b/isaaclab_arena/llm_env_gen/llm_agent.py
index 641601f7d..605277f7d 100644
--- a/isaaclab_arena/llm_env_gen/llm_agent.py
+++ b/isaaclab_arena/llm_env_gen/llm_agent.py
@@ -21,6 +21,21 @@
 DEFAULT_BASE_URL = "https://inference-api.nvidia.com"
 DEFAULT_MODEL = "nvidia/deepseek-ai/deepseek-v4-flash"
 
+# Truncate raw LLM responses to this many characters when including them in
+# error messages — long enough to diagnose the failure, short enough to keep
+# stack traces readable.
+_RAW_RESPONSE_PREVIEW_CHARS = 500
+
+
+class LLMResponseParseError(ValueError):
+    """Raised when an LLM response cannot be parsed into a JSON object.
+
+    Subclasses ``ValueError`` so existing ``except ValueError`` clauses
+    (e.g. around ``SceneSpec.model_validate``) still catch it, but the
+    distinct type lets callers that want to retry the LLM call separate
+    parse failures from validation failures.
+    """
+
 
 def build_catalog_text() -> str:
     """Introspect AssetRegistry and build the vocabulary the LLM is allowed to use."""
@@ -161,8 +176,16 @@ def _extract_json(content: str) -> dict:
         with contextlib.suppress(json.JSONDecodeError):
             return json.loads(content)
 
+        # ``raise LLMResponseParseError`` rather than ``assert`` so the guard
+        # survives ``python -O`` (which strips asserts), and so callers can
+        # distinguish parse failures from validation failures by exception
+        # type. The truncated raw response is the most useful field for
+        # debugging a misbehaving prompt.
         start = content.find("{")
-        assert start != -1, f"No JSON object in LLM response: {content!r}"
+        if start == -1:
+            raise LLMResponseParseError(
+                f"No JSON object found in LLM response: {content[:_RAW_RESPONSE_PREVIEW_CHARS]!r}"
+            )
         depth = 0
         for i in range(start, len(content)):
             if content[i] == "{":
@@ -171,4 +194,4 @@ def _extract_json(content: str) -> dict:
                 depth -= 1
                 if depth == 0:
                     return json.loads(content[start : i + 1])
-        raise AssertionError(f"Unbalanced JSON in LLM response: {content!r}")
+        raise LLMResponseParseError(f"Unbalanced braces in LLM response: {content[:_RAW_RESPONSE_PREVIEW_CHARS]!r}")

From c6f2cfa84710aa7f836c79b3d78065cbaa586adf Mon Sep 17 00:00:00 2001
From: Qian Lin <qianl@nvidia.com>
Date: Wed, 27 May 2026 17:48:50 +0800
Subject: [PATCH 14/41] Document __init__ and generate_spec arguments on
 LLMAgent

generate_spec previously had only a one-line docstring and __init__ had
none, so neither explained what each parameter controlled, where the
defaults came from, or which exceptions callers should expect. xyao-nv
flagged this twice on PR #718.

For __init__: describe the NV_API_KEY fallback, point at
build.nvidia.com for the model catalogue, and call out that base_url is
the swap point for self-hosted OpenAI-compatible endpoints.

For generate_spec: document why temperature is kept low, why max_tokens
is generous, what catalog_text overrides enable, and explicitly list
both the parse-failure (LLMResponseParseError) and validation-failure
(pydantic.ValidationError) exception types so callers can handle them
differently.

Signed-off-by: Qian Lin <qianl@nvidia.com>
---
 isaaclab_arena/llm_env_gen/llm_agent.py | 48 ++++++++++++++++++++++++-
 1 file changed, 47 insertions(+), 1 deletion(-)

diff --git a/isaaclab_arena/llm_env_gen/llm_agent.py b/isaaclab_arena/llm_env_gen/llm_agent.py
index 605277f7d..7e2e7e96a 100644
--- a/isaaclab_arena/llm_env_gen/llm_agent.py
+++ b/isaaclab_arena/llm_env_gen/llm_agent.py
@@ -72,6 +72,22 @@ def __init__(
         model: str = DEFAULT_MODEL,
         base_url: str = DEFAULT_BASE_URL,
     ):
+        """Configure the OpenAI-compatible client used to call the LLM.
+
+        Args:
+            api_key: Bearer token for the inference endpoint. Falls back
+                to the ``NV_API_KEY`` environment variable when ``None``;
+                raises ``ValueError`` if neither is set.
+            model: Model identifier as understood by the endpoint at
+                ``base_url`` (e.g. ``"nvidia/deepseek-ai/deepseek-v4-flash"``).
+                See https://build.nvidia.com for the catalogue of NVIDIA-hosted
+                models.
+            base_url: OpenAI-compatible API root. Defaults to
+                ``DEFAULT_BASE_URL`` (NVIDIA's hosted inference endpoint);
+                override to point at a self-hosted vLLM / Ollama / etc.
+                deployment that exposes the same OpenAI chat-completions
+                wire format.
+        """
         from openai import OpenAI
 
         self.api_key = api_key or os.getenv("NV_API_KEY")
@@ -90,7 +106,37 @@ def generate_spec(
         temperature: float = 0.2,
         max_tokens: int = 2000,
     ) -> tuple[SceneSpec, str]:
-        """Return (validated SceneSpec, raw LLM response)."""
+        """Call the LLM and return the parsed SceneSpec plus the raw response.
+
+        Args:
+            prompt: Natural-language scene description from the end user.
+                Concatenated with the asset catalog and the JSON-only
+                instruction to form the chat ``user`` message.
+            catalog_text: Pre-built asset vocabulary (the output of
+                ``build_catalog_text()``). When ``None``, the catalog is
+                rebuilt from the live ``AssetRegistry``. Pass an explicit
+                value to (a) avoid the cost of rebuilding it across
+                repeated calls, or (b) experiment with a restricted /
+                augmented catalog without mutating the registry.
+            temperature: Sampling temperature forwarded to the LLM. Kept
+                low by default (0.2) because SceneSpec generation is a
+                deterministic-ish translation task — high temperature
+                yields creative but invalid schemas.
+            max_tokens: Hard cap on the response length. Set generously
+                (2000) so multi-task SceneSpecs aren't truncated mid-JSON;
+                shrink if the endpoint enforces a tighter quota.
+
+        Returns:
+            A ``(SceneSpec, raw_response)`` tuple. The raw text is useful
+            for debugging when ``model_validate`` rejects the parsed
+            JSON.
+
+        Raises:
+            LLMResponseParseError: when the response can't be parsed as a
+                JSON object (no opening brace, unbalanced braces).
+            pydantic.ValidationError: when the parsed JSON is well-formed
+                but doesn't match the SceneSpec schema.
+        """
         catalog_text = catalog_text or build_catalog_text()
         system = self._system_prompt()
         user = f"{catalog_text}\n\nUSER PROMPT:\n{prompt}\n\nReturn ONLY a JSON object matching the SceneSpec schema."

From 9188c1b4cddec51441f3bda7a21c648577bf2bde Mon Sep 17 00:00:00 2001
From: Qian Lin <qianl@nvidia.com>
Date: Wed, 27 May 2026 18:02:53 +0800
Subject: [PATCH 15/41] Rename schema.py to llm_schema.py and SceneSpec to
 LLMEnvSpec
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

xyao-nv asked for env-centric naming across the LLM agent's docstrings
and prompt to match the broader llm_env_gen package terminology
(env graph spec, env writer, env name). The class name SceneSpec was
the highest-friction mismatch, so the rename goes all the way:

  * schema.py             -> llm_schema.py (git mv)
  * class SceneSpec       -> class LLMEnvSpec
  * .schema imports       -> .llm_schema imports across llm_agent.py,
                              resolver.py, try_schema.py,
                              tests/test_resolver.py
  * "scene" prose         -> "env" / "env-generation" in module
                              docstrings, class one-liners, the system
                              prompt's opening line, and the rendered
                              user message ("matching the LLMEnvSpec
                              schema")
  * setup.py M1 comment   -> updated path

Kept intentionally unchanged:
  * the ``initial_scene_graph`` field name on LLMEnvSpec — renaming it
    would change the JSON schema the LLM is prompted against and is
    out of scope here. Called out in the LLMEnvSpec docstring.
  * the ``try_schema.py`` CLI script name — same module path so
    existing invocations keep working; renaming to try_llm_env_gen.py
    is a natural follow-up.

Also corrects the module docstring's stale claim that LLMAgent uses
Claude — the default model has long been deepseek-v4-flash, and the
client is OpenAI-compatible. Describe the endpoint instead of naming a
specific model. (xyao-nv comment "Emm did you write below using
Claude?")

25 resolver + env-graph-spec tests pass; ``try_schema --print-schema``
prints the LLMEnvSpec JSON schema; the rendered system prompt opens
with "You are an env-generation parser...".

Signed-off-by: Qian Lin <qianl@nvidia.com>
---
 isaaclab_arena/llm_env_gen/llm_agent.py       | 40 ++++++++++---------
 .../llm_env_gen/{schema.py => llm_schema.py}  | 25 +++++++-----
 isaaclab_arena/llm_env_gen/resolver.py        | 16 ++++----
 isaaclab_arena/llm_env_gen/try_schema.py      |  8 ++--
 isaaclab_arena/tests/test_resolver.py         | 10 ++---
 setup.py                                      |  2 +-
 6 files changed, 53 insertions(+), 48 deletions(-)
 rename isaaclab_arena/llm_env_gen/{schema.py => llm_schema.py} (84%)

diff --git a/isaaclab_arena/llm_env_gen/llm_agent.py b/isaaclab_arena/llm_env_gen/llm_agent.py
index 7e2e7e96a..848e3a732 100644
--- a/isaaclab_arena/llm_env_gen/llm_agent.py
+++ b/isaaclab_arena/llm_env_gen/llm_agent.py
@@ -3,10 +3,11 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-"""LLM agent for parsing natural-language scene prompts into a SceneSpec.
+"""LLM agent for parsing natural-language env-generation prompts into an LLMEnvSpec.
 
-Uses Claude via NVIDIA's OpenAI-compatible inference API. Emits the
-SceneSpec Pydantic bundle so asset resolution stays deterministic.
+Calls an OpenAI-compatible chat-completions endpoint (NVIDIA's hosted
+inference by default) and validates the response against the LLMEnvSpec
+pydantic bundle so asset resolution stays deterministic.
 """
 
 from __future__ import annotations
@@ -16,7 +17,7 @@
 import os
 from typing import get_args
 
-from .schema import RelationKind, SceneSpec, TaskKind
+from .llm_schema import LLMEnvSpec, RelationKind, TaskKind
 
 DEFAULT_BASE_URL = "https://inference-api.nvidia.com"
 DEFAULT_MODEL = "nvidia/deepseek-ai/deepseek-v4-flash"
@@ -31,7 +32,7 @@ class LLMResponseParseError(ValueError):
     """Raised when an LLM response cannot be parsed into a JSON object.
 
     Subclasses ``ValueError`` so existing ``except ValueError`` clauses
-    (e.g. around ``SceneSpec.model_validate``) still catch it, but the
+    (e.g. around ``LLMEnvSpec.model_validate``) still catch it, but the
     distinct type lets callers that want to retry the LLM call separate
     parse failures from validation failures.
     """
@@ -64,7 +65,7 @@ def build_catalog_text() -> str:
 
 
 class LLMAgent:
-    """Parses a natural-language prompt into a SceneSpec."""
+    """Parses a natural-language env-generation prompt into an LLMEnvSpec."""
 
     def __init__(
         self,
@@ -105,11 +106,11 @@ def generate_spec(
         catalog_text: str | None = None,
         temperature: float = 0.2,
         max_tokens: int = 2000,
-    ) -> tuple[SceneSpec, str]:
-        """Call the LLM and return the parsed SceneSpec plus the raw response.
+    ) -> tuple[LLMEnvSpec, str]:
+        """Call the LLM and return the parsed LLMEnvSpec plus the raw response.
 
         Args:
-            prompt: Natural-language scene description from the end user.
+            prompt: Natural-language env description from the end user.
                 Concatenated with the asset catalog and the JSON-only
                 instruction to form the chat ``user`` message.
             catalog_text: Pre-built asset vocabulary (the output of
@@ -119,15 +120,16 @@ def generate_spec(
                 repeated calls, or (b) experiment with a restricted /
                 augmented catalog without mutating the registry.
             temperature: Sampling temperature forwarded to the LLM. Kept
-                low by default (0.2) because SceneSpec generation is a
+                low by default (0.2) because LLMEnvSpec generation is a
                 deterministic-ish translation task — high temperature
                 yields creative but invalid schemas.
             max_tokens: Hard cap on the response length. Set generously
-                (2000) so multi-task SceneSpecs aren't truncated mid-JSON;
-                shrink if the endpoint enforces a tighter quota.
+                (2000) so multi-task LLMEnvSpecs aren't truncated
+                mid-JSON; shrink if the endpoint enforces a tighter
+                quota.
 
         Returns:
-            A ``(SceneSpec, raw_response)`` tuple. The raw text is useful
+            A ``(LLMEnvSpec, raw_response)`` tuple. The raw text is useful
             for debugging when ``model_validate`` rejects the parsed
             JSON.
 
@@ -135,11 +137,11 @@ def generate_spec(
             LLMResponseParseError: when the response can't be parsed as a
                 JSON object (no opening brace, unbalanced braces).
             pydantic.ValidationError: when the parsed JSON is well-formed
-                but doesn't match the SceneSpec schema.
+                but doesn't match the LLMEnvSpec schema.
         """
         catalog_text = catalog_text or build_catalog_text()
         system = self._system_prompt()
-        user = f"{catalog_text}\n\nUSER PROMPT:\n{prompt}\n\nReturn ONLY a JSON object matching the SceneSpec schema."
+        user = f"{catalog_text}\n\nUSER PROMPT:\n{prompt}\n\nReturn ONLY a JSON object matching the LLMEnvSpec schema."
 
         resp = self.client.chat.completions.create(
             model=self.model,
@@ -152,11 +154,11 @@ def generate_spec(
         )
         raw = resp.choices[0].message.content
         data = self._extract_json(raw)
-        spec = SceneSpec.model_validate(data)
+        spec = LLMEnvSpec.model_validate(data)
         return spec, raw
 
     def _system_prompt(self) -> str:
-        schema = json.dumps(SceneSpec.model_json_schema(), indent=2)
+        schema = json.dumps(LLMEnvSpec.model_json_schema(), indent=2)
         # Derive the enumerations the LLM is allowed to emit directly from
         # the pydantic literal types so the prompt cannot drift out of sync
         # when RelationKind / TaskKind change. Bare identifiers for
@@ -166,8 +168,8 @@ def _system_prompt(self) -> str:
         relation_kinds = ", ".join(get_args(RelationKind))
         task_kinds = ", ".join(f'"{k}"' for k in get_args(TaskKind))
         return (
-            "You are a scene-generation parser for robot manipulation tasks.\n"
-            "Convert a natural-language prompt into a SceneSpec JSON object that matches the schema below.\n\n"
+            "You are an env-generation parser for robot manipulation tasks.\n"
+            "Convert a natural-language prompt into an LLMEnvSpec JSON object that matches the schema below.\n\n"
             "RULES:\n"
             "- item.query: the short human name as it appears in the prompt (e.g. 'avocado', 'bowl').\n"
             "  The resolver fuzzy-matches this against the OBJECTS catalog; you do NOT need to emit the\n"
diff --git a/isaaclab_arena/llm_env_gen/schema.py b/isaaclab_arena/llm_env_gen/llm_schema.py
similarity index 84%
rename from isaaclab_arena/llm_env_gen/schema.py
rename to isaaclab_arena/llm_env_gen/llm_schema.py
index 6ba0650e9..7d3765e71 100644
--- a/isaaclab_arena/llm_env_gen/schema.py
+++ b/isaaclab_arena/llm_env_gen/llm_schema.py
@@ -3,10 +3,10 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-"""Schema the LLM must fill in when parsing a natural-language scene prompt.
+"""Schema the LLM must fill in when parsing a natural-language env-generation prompt.
 
 The LLM sees a list of the *available* asset tags / embodiment names pulled
-from the registries at call time, and must return a SceneSpec that only uses
+from the registries at call time, and must return an LLMEnvSpec that only uses
 those vocabularies. Concrete asset names are resolved by the Resolver in a
 second, deterministic step — the LLM never invents USD paths.
 """
@@ -27,8 +27,8 @@
 # omitted — they describe how the placement solver explores poses and are
 # not natural for an LLM to emit.
 # "in" has no In class in isaaclab_arena.relations.relations yet — see the
-# TODO there. The scene builder materializes goal-state "in" relations as
-# the task's success predicate.
+# TODO there. The downstream env builder materializes goal-state "in"
+# relations as the task's success predicate.
 RelationKind = Literal["on", "in", "next_to", "at_position", "at_pose", "is_anchor"]
 
 ItemRole = Literal["foreground", "distractor", "anchor"]
@@ -82,7 +82,7 @@ def identity(self) -> tuple[str, str, str | None]:
 
 
 class Task(BaseModel):
-    """One atomic task in the plan that transforms the scene state.
+    """One atomic task in the plan that transforms the env state.
 
     A task specifies what action to perform (kind), what object it acts on
     (subject), and optionally where it goes (target). The description provides
@@ -95,18 +95,21 @@ class Task(BaseModel):
     description: str  # natural-language task description
 
 
-class SceneSpec(BaseModel):
-    """LLM output — a structured plan for the scene and a list of tasks.
+class LLMEnvSpec(BaseModel):
+    """LLM output — a structured plan for the env and a list of tasks.
 
     The language prompt is decomposed into:
 
       * ``initial_scene_graph`` — every relation that holds at env reset.
         This configures where objects spawn. This is a FULL snapshot
-        including all relations that persist throughout all tasks.
+        including all relations that persist throughout all tasks. (Field
+        name kept as ``initial_scene_graph`` even though the class is now
+        ``LLMEnvSpec`` — renaming the field would change the JSON schema
+        the LLM is prompted against and is out of scope here.)
       * ``tasks`` — a list of atomic actions to execute in sequence. Each
         task specifies what to do (kind), what object(s) it acts on
         (subject/target), and a natural-language description. The task
-        sequence implicitly defines the intermediate scene graphs by applying
+        sequence implicitly defines the intermediate env graphs by applying
         each task's transformations in order.
     """
 
@@ -118,9 +121,9 @@ class SceneSpec(BaseModel):
     tasks: list[Task]
 
     @model_validator(mode="after")
-    def _tasks_must_be_non_empty(self) -> SceneSpec:
+    def _tasks_must_be_non_empty(self) -> LLMEnvSpec:
         if not self.tasks:
             raise ValueError(
-                "tasks list is empty — at least one task must be specified to define the scene transformation."
+                "tasks list is empty — at least one task must be specified to define the env transformation."
             )
         return self
diff --git a/isaaclab_arena/llm_env_gen/resolver.py b/isaaclab_arena/llm_env_gen/resolver.py
index 6377a1a05..e88fec619 100644
--- a/isaaclab_arena/llm_env_gen/resolver.py
+++ b/isaaclab_arena/llm_env_gen/resolver.py
@@ -3,15 +3,15 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-"""Deterministic resolver that turns a SceneSpec into an ArenaEnvGraphSpec.
+"""Deterministic resolver that turns an LLMEnvSpec into an ArenaEnvGraphSpec.
 
-The LLM emits a SceneSpec. Resolver.resolve() walks that spec, binds each
+The LLM emits an LLMEnvSpec. Resolver.resolve() walks that spec, binds each
 query string to a registered Asset (preferring exact name, then fuzzy match
 filtered by tags), and emits a fully-formed :class:`ArenaEnvGraphSpec`:
 
   * ``nodes`` — background, embodiment, and objects.
   * ``state_specs`` — one initial state spec derived from
-    ``SceneSpec.initial_scene_graph``, plus one empty success state spec
+    ``LLMEnvSpec.initial_scene_graph``, plus one empty success state spec
     per task as a placeholder for downstream synthesis.
   * ``tasks`` — one task per LLM task, wired to its initial / success
     state spec ids.
@@ -36,7 +36,7 @@
     ArenaEnvGraphTaskSpec,
 )
 
-from .schema import Item, Relation, SceneSpec, Task
+from .llm_schema import Item, LLMEnvSpec, Relation, Task
 
 # When the LLM emits a bare robot family name, pick the IK variant.
 IK_DEFAULTS: dict[str, str] = {
@@ -70,7 +70,7 @@ class TraceEvent:
 
 
 class Resolver:
-    """Resolves SceneSpec fields against AssetRegistry.
+    """Resolves LLMEnvSpec fields against AssetRegistry.
 
     Design notes:
       * Never raises on LLM mistakes — instead records a trace event with
@@ -90,8 +90,8 @@ def __init__(self, registry: AssetRegistry | None = None):
         # ``resolve()`` returns.
         self.trace: list[TraceEvent] = []
 
-    def resolve(self, spec: SceneSpec, env_name: str | None = None) -> ArenaEnvGraphSpec:
-        """Resolve a SceneSpec into a full :class:`ArenaEnvGraphSpec`.
+    def resolve(self, spec: LLMEnvSpec, env_name: str | None = None) -> ArenaEnvGraphSpec:
+        """Resolve an LLMEnvSpec into a full :class:`ArenaEnvGraphSpec`.
 
         ``env_name`` is derived from the first task and background if not
         provided. The success state of each task is NOT derived here —
@@ -130,7 +130,7 @@ def resolve(self, spec: SceneSpec, env_name: str | None = None) -> ArenaEnvGraph
         return env_graph_spec
 
     @staticmethod
-    def _derive_env_name(spec: SceneSpec) -> str:
+    def _derive_env_name(spec: LLMEnvSpec) -> str:
         first_kind = spec.tasks[0].kind if spec.tasks else "task"
         return f"llm_gen_{spec.background}_{first_kind}"
 
diff --git a/isaaclab_arena/llm_env_gen/try_schema.py b/isaaclab_arena/llm_env_gen/try_schema.py
index f57a8c66e..e23a04d1e 100644
--- a/isaaclab_arena/llm_env_gen/try_schema.py
+++ b/isaaclab_arena/llm_env_gen/try_schema.py
@@ -13,7 +13,7 @@
 addition to being printed to stdout).
 
 Examples:
-    # Print the Pydantic SceneSpec JSON schema (no LLM call):
+    # Print the Pydantic LLMEnvSpec JSON schema (no LLM call):
     /isaac-sim/python.sh -m isaaclab_arena.llm_env_gen.try_schema --print-schema
 
     # Print the catalog sent to the LLM (no LLM call):
@@ -66,10 +66,10 @@ def main() -> None:
     )
     args = parser.parse_args()
 
-    from isaaclab_arena.llm_env_gen.schema import SceneSpec
+    from isaaclab_arena.llm_env_gen.llm_schema import LLMEnvSpec
 
     if args.print_schema:
-        print(json.dumps(SceneSpec.model_json_schema(), indent=2))
+        print(json.dumps(LLMEnvSpec.model_json_schema(), indent=2))
         return
 
     from isaaclab_arena.llm_env_gen.llm_agent import LLMAgent, build_catalog_text
@@ -107,7 +107,7 @@ def main() -> None:
         spec.background = new_bg
         print(f"\n=== background override applied: {old_bg!r} -> {new_bg!r} ===")
 
-    print("\n=== parsed SceneSpec ===")
+    print("\n=== parsed LLMEnvSpec ===")
     print(spec.model_dump_json(indent=2))
 
     from isaaclab_arena.llm_env_gen.resolver import Resolver
diff --git a/isaaclab_arena/tests/test_resolver.py b/isaaclab_arena/tests/test_resolver.py
index 1db0a2bdf..1f1b70746 100644
--- a/isaaclab_arena/tests/test_resolver.py
+++ b/isaaclab_arena/tests/test_resolver.py
@@ -19,8 +19,8 @@
 from dataclasses import dataclass
 
 from isaaclab_arena.environments.arena_env_graph_spec import ArenaEnvGraphNodeType, ArenaEnvGraphSpatialConstraintType
+from isaaclab_arena.llm_env_gen.llm_schema import Item, LLMEnvSpec, Relation, Task
 from isaaclab_arena.llm_env_gen.resolver import IK_DEFAULTS, Resolver
-from isaaclab_arena.llm_env_gen.schema import Item, Relation, SceneSpec, Task
 
 # ---------------------------------------------------------------------------
 # Test fixtures
@@ -98,15 +98,15 @@ def _make_scene(
     items: list[Item] | None = None,
     initial_scene_graph: list[Relation] | None = None,
     tasks: list[Task] | None = None,
-) -> SceneSpec:
-    """Build a :class:`SceneSpec` with sane defaults for tests that don't care."""
-    return SceneSpec(
+) -> LLMEnvSpec:
+    """Build a :class:`LLMEnvSpec` with sane defaults for tests that don't care."""
+    return LLMEnvSpec(
         task_description="test scene",
         background=background,
         embodiment=embodiment,
         items=items or [],
         initial_scene_graph=initial_scene_graph or [],
-        # ``tasks`` must be non-empty per SceneSpec validator.
+        # ``tasks`` must be non-empty per LLMEnvSpec validator.
         tasks=tasks
         or [Task(kind="pick_and_place", subject="placeholder", target="placeholder", description="placeholder")],
     )
diff --git a/setup.py b/setup.py
index 1848b437c..721c9b3c8 100644
--- a/setup.py
+++ b/setup.py
@@ -16,7 +16,7 @@
     "pytest",
     # Used lazily by isaaclab_arena/llm_env_gen/* for NV_API_KEY-based LLM calls.
     "openai",
-    # Hard dependency of isaaclab_arena/llm_env_gen/schema.py (BaseModel / Field /
+    # Hard dependency of isaaclab_arena/llm_env_gen/llm_schema.py (BaseModel / Field /
     # model_validator imported at module load — not lazy).
     "pydantic>=2.0",
 ]

From decd8788d324b8e51a2b5a85f30fc518ca31472b Mon Sep 17 00:00:00 2001
From: Qian Lin <qianl@nvidia.com>
Date: Wed, 27 May 2026 18:05:58 +0800
Subject: [PATCH 16/41] Surface resolution failures via has_resolution_errors
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Callers of Resolver.resolve() get an ArenaEnvGraphSpec back regardless
of what went wrong upstream — by design, the resolver records trace
events and keeps going instead of raising. That's the right default
for batch usage, but interactive callers (try_schema.py, the web
review tool, future LLM-retry loops) need an easy way to ask "should I
trust this output?" without re-walking the spec to look for missing
nodes.

Add two read-only properties:

  * ``resolution_errors`` — list of TraceEvent entries from the last
    resolve() call whose stage indicates dropped or invalidated data
    (item.miss, name.wrong_tag, name.miss, the three
    relation.initial.unknown_* / unsupported_kind stages, and the two
    task.unknown_* stages).
  * ``has_resolution_errors`` — bool wrapper.

The set is centralised on the class as ``_ERROR_TRACE_STAGES`` and
spelled out in a comment so future additions to ``_resolve_*`` /
``_build_*`` know to update it. Crucially, ``embodiment.miss`` (which
falls back to a usable embodiment) and the relaxation events
(``item.tag_pool_empty``, ``item.no_match_in_tags``) are NOT included
— they don't cost the caller any data.

Three tests cover the clean-run / item-miss / relaxation-only cases,
using a small ``_clean_scene_kwargs`` helper because the existing
``_make_scene`` default uses a deliberately-unresolvable "placeholder"
task subject/target.

Addresses an isaaclab-review-bot comment on PR #718.

Signed-off-by: Qian Lin <qianl@nvidia.com>
---
 isaaclab_arena/llm_env_gen/resolver.py | 36 +++++++++++++++
 isaaclab_arena/tests/test_resolver.py  | 62 ++++++++++++++++++++++++++
 2 files changed, 98 insertions(+)

diff --git a/isaaclab_arena/llm_env_gen/resolver.py b/isaaclab_arena/llm_env_gen/resolver.py
index e88fec619..ff3f7d2bb 100644
--- a/isaaclab_arena/llm_env_gen/resolver.py
+++ b/isaaclab_arena/llm_env_gen/resolver.py
@@ -82,14 +82,50 @@ class Resolver:
         record the relaxation in the trace.
       * The trace lives on the Resolver instance (``self.trace``) and is
         cleared at the start of every ``resolve()`` call.
+      * Callers that want to bail out / retry on bad resolutions can poll
+        ``has_resolution_errors`` / ``resolution_errors`` after
+        ``resolve()``; relaxation events and the embodiment-fallback miss
+        are NOT considered errors because the resolver still produced
+        usable output for them.
     """
 
+    # Trace stages emitted only when the resolver had to drop or invalidate
+    # data — i.e. the resulting spec is semantically incomplete. Distinct
+    # from advisory stages like ``item.tag_pool_empty`` (successful
+    # relaxation) and ``embodiment.miss`` (falls back to a usable
+    # embodiment). Updated alongside ``_resolve_*`` / ``_build_*`` whenever
+    # a new failure mode is added.
+    _ERROR_TRACE_STAGES: frozenset[str] = frozenset({
+        "item.miss",
+        "name.wrong_tag",
+        "name.miss",
+        "relation.initial.unsupported_kind",
+        "relation.initial.unknown_subject",
+        "relation.initial.unknown_target",
+        "task.unknown_subject",
+        "task.unknown_target",
+    })
+
     def __init__(self, registry: AssetRegistry | None = None):
         self.registry = registry or AssetRegistry()
         # Populated incrementally by every resolution call. Caller reads after
         # ``resolve()`` returns.
         self.trace: list[TraceEvent] = []
 
+    @property
+    def resolution_errors(self) -> list[TraceEvent]:
+        """Trace events flagged as failures of the last ``resolve()`` call.
+
+        See ``_ERROR_TRACE_STAGES`` for the exact set. The list preserves
+        trace order so callers can show "what went wrong, in order".
+        """
+        return [e for e in self.trace if e.stage in self._ERROR_TRACE_STAGES]
+
+    @property
+    def has_resolution_errors(self) -> bool:
+        """``True`` iff the last ``resolve()`` produced an incomplete spec."""
+        return bool(self.resolution_errors)
+
     def resolve(self, spec: LLMEnvSpec, env_name: str | None = None) -> ArenaEnvGraphSpec:
         """Resolve an LLMEnvSpec into a full :class:`ArenaEnvGraphSpec`.
 
diff --git a/isaaclab_arena/tests/test_resolver.py b/isaaclab_arena/tests/test_resolver.py
index 1f1b70746..894d722ad 100644
--- a/isaaclab_arena/tests/test_resolver.py
+++ b/isaaclab_arena/tests/test_resolver.py
@@ -197,6 +197,68 @@ def test_resolve_with_empty_initial_scene_graph():
     assert initial_state.task_constraints == []
 
 
+# ---------------------------------------------------------------------------
+# Resolution-error reporting
+# ---------------------------------------------------------------------------
+
+
+def _clean_scene_kwargs() -> dict:
+    """Scene where every node resolves and every task arg references a known node.
+
+    The default ``_make_scene`` uses a "placeholder" task subject/target that
+    deliberately doesn't resolve — fine for tests that only care about node
+    counts but unsuitable for resolution-error tests where we need a baseline
+    where the resolver succeeds completely.
+    """
+    return dict(
+        items=[Item(query="bowl", role="foreground", category_tags=["bowl"])],
+        tasks=[Task(kind="pick_and_place", subject="bowl", target="maple_table", description="d")],
+    )
+
+
+def test_has_resolution_errors_false_on_clean_run():
+    # Fully resolvable env; no error-bearing trace events should appear.
+    resolver = _make_resolver()
+    resolver.resolve(_make_scene(**_clean_scene_kwargs()))
+    assert resolver.resolution_errors == []
+    assert resolver.has_resolution_errors is False
+
+
+def test_has_resolution_errors_true_when_item_unresolvable():
+    # Add an unresolvable item on top of the clean baseline so the *only*
+    # error stage that fires is ``item.miss``.
+    kwargs = _clean_scene_kwargs()
+    kwargs["items"] = kwargs["items"] + [
+        Item(query="zzz_no_match_anywhere", role="foreground", category_tags=["object"])
+    ]
+    resolver = _make_resolver()
+    resolver.resolve(_make_scene(**kwargs))
+    assert resolver.has_resolution_errors is True
+    assert [e.stage for e in resolver.resolution_errors] == ["item.miss"]
+
+
+def test_has_resolution_errors_false_when_only_relaxation_or_fallback():
+    # Both events the bot's heuristic would mistakenly flag: tag-pool
+    # relaxation (successful) and embodiment fallback (franka_ik). Neither
+    # drops data from the resolved spec, so neither should count as an
+    # error. ``cracker`` is in the default catalog but tagged ``graspable``,
+    # not ``fruit`` — so the fruit-tag pool yields no match and the resolver
+    # relaxes to the full object pool.
+    kwargs = _clean_scene_kwargs()
+    kwargs["items"] = [Item(query="cracker", role="foreground", category_tags=["fruit"])]
+    # Switch the task subject to match the new item id so task args still resolve.
+    kwargs["tasks"] = [Task(kind="pick_and_place", subject="cracker", target="maple_table", description="d")]
+    resolver = _make_resolver()
+    resolver.resolve(_make_scene(embodiment="totally_unknown_robot", **kwargs))
+    trace_stages = [e.stage for e in resolver.trace]
+    # Sanity: the warning events actually fired in this run.
+    assert "item.no_match_in_tags" in trace_stages
+    assert "embodiment.miss" in trace_stages
+    # But neither shows up as an error.
+    assert resolver.has_resolution_errors is False
+    assert resolver.resolution_errors == []
+
+
 # ---------------------------------------------------------------------------
 # Item resolution strategies
 # ---------------------------------------------------------------------------

From 0ad51da7b0fbed83b30e63a50918609adb707b49 Mon Sep 17 00:00:00 2001
From: Qian Lin <qianl@nvidia.com>
Date: Wed, 27 May 2026 23:22:12 +0800
Subject: [PATCH 17/41] Add ping() health-check method to LLMAgent
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Sends a minimal chat completion (temperature=0, max_tokens=8) to verify
the API key authenticates, the configured model exists at the endpoint,
and the network path is reachable. Intended for CI startup probes and
local key-setup checks: callers guard `agent.ping()` with `except Exception`
to distinguish a misconfigured environment from downstream resolver errors.

Returns the response text for diagnostics but does not assert on it —
different endpoints phrase the acknowledgment differently, and a reply
at all means the wire is healthy. Exceptions from the openai client
propagate unchanged so callers can branch on AuthenticationError /
NotFoundError / APIConnectionError / RateLimitError as needed.

Addresses C3 feedback on PR #718.

Signed-off-by: Qian Lin <qianl@nvidia.com>
---
 isaaclab_arena/llm_env_gen/llm_agent.py | 43 +++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/isaaclab_arena/llm_env_gen/llm_agent.py b/isaaclab_arena/llm_env_gen/llm_agent.py
index 848e3a732..81765a196 100644
--- a/isaaclab_arena/llm_env_gen/llm_agent.py
+++ b/isaaclab_arena/llm_env_gen/llm_agent.py
@@ -157,6 +157,49 @@ def generate_spec(
         spec = LLMEnvSpec.model_validate(data)
         return spec, raw
 
+    def ping(self) -> str:
+        """Smoke-test the configured endpoint + API key with a minimal request.
+
+        Sends a one-shot chat completion to verify:
+          * the API key authenticates,
+          * the configured model exists at ``base_url``,
+          * the network path is reachable.
+
+        Intended for CI startup probes and local key-setup checks; the
+        success signal is "we got a response without raising". The
+        response *content* is returned for diagnostics but intentionally
+        not asserted on — different models phrase the acknowledgment
+        differently, and a quirky reply still means the wire is working.
+
+        Returns:
+            The model's response text (typically "OK" or similar). Empty
+            string if the model returned no content (still a successful
+            round-trip).
+
+        Raises:
+            Any exception raised by the underlying ``openai`` client.
+            Common ones at this layer are ``AuthenticationError``
+            (bad key), ``NotFoundError`` (wrong ``model``),
+            ``APIConnectionError`` (unreachable endpoint), and
+            ``RateLimitError`` (quota exhausted). Callers typically
+            ``except Exception`` here and report the failure to the
+            operator.
+
+        Example:
+            >>> agent = LLMAgent()
+            >>> try:
+            ...     agent.ping()
+            ... except Exception as e:
+            ...     raise SystemExit(f"LLM endpoint health-check failed: {e}")
+        """
+        resp = self.client.chat.completions.create(
+            model=self.model,
+            messages=[{"role": "user", "content": "Respond with exactly: OK"}],
+            temperature=0,
+            max_tokens=8,
+        )
+        return resp.choices[0].message.content or ""
+
     def _system_prompt(self) -> str:
         schema = json.dumps(LLMEnvSpec.model_json_schema(), indent=2)
         # Derive the enumerations the LLM is allowed to emit directly from

From 16f8224829fad67eadb4c860b991d18210361141 Mon Sep 17 00:00:00 2001
From: Qian Lin <qianl@nvidia.com>
Date: Wed, 27 May 2026 23:43:00 +0800
Subject: [PATCH 18/41] Add unit + remote-e2e tests for LLMAgent

Covers all four pure-Python concerns in the agent: __init__ argument /
env-var precedence and the missing-key guard, _extract_json behaviour
across well-behaved, fenced, prose-wrapped, nested, and malformed
responses (including the LLMResponseParseError -> ValueError MRO
contract so existing except ValueError callers keep working),
generate_spec request shape and error propagation, ping minimal-request
params and exception propagation, and _system_prompt enumeration of
every RelationKind / TaskKind literal so the prompt cannot drift from
the schema silently.

The 28 mocked tests patch openai.OpenAI so they never hit the wire and
run in ~0.3s. A single live test (test_generate_spec_against_live_endpoint)
exercises the default DEFAULT_MODEL / DEFAULT_BASE_URL configuration
end-to-end and is gated by both a new llm_remote_e2e pytest marker and
a skipif on NV_API_KEY so default runs stay offline.

Addresses C4 feedback on PR #718.

Signed-off-by: Qian Lin <qianl@nvidia.com>
---
 isaaclab_arena/tests/test_llm_agent.py | 363 +++++++++++++++++++++++++
 pytest.ini                             |   1 +
 2 files changed, 364 insertions(+)
 create mode 100644 isaaclab_arena/tests/test_llm_agent.py

diff --git a/isaaclab_arena/tests/test_llm_agent.py b/isaaclab_arena/tests/test_llm_agent.py
new file mode 100644
index 000000000..9ec274637
--- /dev/null
+++ b/isaaclab_arena/tests/test_llm_agent.py
@@ -0,0 +1,363 @@
+# Copyright (c) 2026, The Isaac Lab Arena Project Developers (https://github.com/isaac-sim/IsaacLab-Arena/blob/main/CONTRIBUTORS.md).
+# All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""Unit tests for :class:`isaaclab_arena.llm_env_gen.llm_agent.LLMAgent`.
+
+The agent's behaviour decomposes into four pure-Python concerns that we exercise
+without ever hitting the wire:
+
+* ``__init__`` argument / env-var precedence and the missing-key guard.
+* ``_extract_json`` parsing of well-behaved, fenced, prose-wrapped, and malformed
+  LLM responses (including the ``LLMResponseParseError`` → ``ValueError`` MRO
+  contract so callers can still ``except ValueError``).
+* ``generate_spec`` / ``ping`` — the openai client is replaced with a
+  ``MagicMock`` so we assert on the request shape (model, messages,
+  temperature, max_tokens) and the error-propagation contract.
+* ``_system_prompt`` is asserted to enumerate every ``RelationKind`` /
+  ``TaskKind`` literal so prompt and schema cannot drift apart silently.
+"""
+
+from __future__ import annotations
+
+import json
+import os
+from typing import get_args
+from unittest.mock import MagicMock, patch
+
+import pytest
+from pydantic import ValidationError
+
+from isaaclab_arena.llm_env_gen.llm_agent import (
+    _RAW_RESPONSE_PREVIEW_CHARS,
+    DEFAULT_BASE_URL,
+    DEFAULT_MODEL,
+    LLMAgent,
+    LLMResponseParseError,
+)
+from isaaclab_arena.llm_env_gen.llm_schema import RelationKind, TaskKind
+
+# ---------------------------------------------------------------------------
+# Fixtures
+# ---------------------------------------------------------------------------
+
+
+@pytest.fixture
+def stub_openai():
+    """Patch the ``openai.OpenAI`` constructor so ``LLMAgent()`` never hits the wire.
+
+    The agent does a deferred ``from openai import OpenAI`` inside
+    ``__init__`` to avoid pulling the dependency at module import time, so we
+    patch the symbol on the ``openai`` module itself rather than on the
+    ``llm_agent`` namespace.
+    """
+    with patch("openai.OpenAI") as mock_cls:
+        mock_cls.return_value = MagicMock()
+        yield mock_cls
+
+
+@pytest.fixture
+def agent(stub_openai):
+    """A constructed ``LLMAgent`` with a fully mocked openai client.
+
+    Tests should set ``agent.client.chat.completions.create.return_value`` (or
+    ``.side_effect``) to control the simulated LLM response.
+    """
+    return LLMAgent(api_key="test-key")
+
+
+def _chat_response(content: str | None):
+    """Build the nested mock that mimics the openai chat-completion response shape."""
+    resp = MagicMock()
+    resp.choices = [MagicMock()]
+    resp.choices[0].message.content = content
+    return resp
+
+
+# Minimal LLMEnvSpec payload — exercises every required field plus one task so
+# the ``tasks_must_be_non_empty`` validator passes. Reused across the
+# generate_spec happy-path tests.
+_MINIMAL_SPEC: dict = {
+    "task_description": "pick up the avocado and place it in the bowl",
+    "background": "kitchen",
+    "embodiment": "franka_ik",
+    "items": [
+        {"query": "avocado", "role": "foreground", "category_tags": []},
+        {"query": "bowl", "role": "foreground", "category_tags": []},
+    ],
+    "initial_scene_graph": [
+        {"kind": "on", "subject": "avocado", "target": "kitchen"},
+        {"kind": "on", "subject": "bowl", "target": "kitchen"},
+    ],
+    "tasks": [{
+        "kind": "pick_and_place",
+        "subject": "avocado",
+        "target": "bowl",
+        "description": "pick up the avocado and place it in the bowl",
+    }],
+}
+
+
+# ---------------------------------------------------------------------------
+# __init__
+# ---------------------------------------------------------------------------
+
+
+class TestInit:
+    def test_explicit_api_key_overrides_env(self, monkeypatch, stub_openai):
+        monkeypatch.setenv("NV_API_KEY", "env-key")
+        a = LLMAgent(api_key="explicit-key")
+        assert a.api_key == "explicit-key"
+
+    def test_falls_back_to_env_var(self, monkeypatch, stub_openai):
+        monkeypatch.setenv("NV_API_KEY", "env-key")
+        a = LLMAgent()
+        assert a.api_key == "env-key"
+
+    def test_raises_when_no_key_anywhere(self, monkeypatch, stub_openai):
+        monkeypatch.delenv("NV_API_KEY", raising=False)
+        with pytest.raises(ValueError, match="API key required"):
+            LLMAgent()
+
+    def test_default_model_and_base_url(self, stub_openai):
+        a = LLMAgent(api_key="k")
+        assert a.model == DEFAULT_MODEL
+        stub_openai.assert_called_once_with(api_key="k", base_url=DEFAULT_BASE_URL)
+
+    def test_custom_model_and_base_url(self, stub_openai):
+        a = LLMAgent(api_key="k", model="custom-model", base_url="http://localhost:8000")
+        assert a.model == "custom-model"
+        stub_openai.assert_called_once_with(api_key="k", base_url="http://localhost:8000")
+
+
+# ---------------------------------------------------------------------------
+# _extract_json
+# ---------------------------------------------------------------------------
+
+
+class TestExtractJson:
+    def test_plain_json_object(self):
+        assert LLMAgent._extract_json('{"a": 1}') == {"a": 1}
+
+    def test_strips_fenced_json_block(self):
+        assert LLMAgent._extract_json('```json\n{"a": 1}\n```') == {"a": 1}
+
+    def test_strips_bare_triple_backticks(self):
+        assert LLMAgent._extract_json('```\n{"a": 1}\n```') == {"a": 1}
+
+    def test_extracts_object_from_prose(self):
+        text = 'Sure! Here is the JSON: {"a": 1} -- hope that helps.'
+        assert LLMAgent._extract_json(text) == {"a": 1}
+
+    def test_handles_nested_braces(self):
+        text = 'prefix {"outer": {"inner": [1, 2, 3]}} suffix'
+        assert LLMAgent._extract_json(text) == {"outer": {"inner": [1, 2, 3]}}
+
+    def test_raises_when_no_opening_brace(self):
+        with pytest.raises(LLMResponseParseError, match="No JSON object found"):
+            LLMAgent._extract_json("plain text with no braces at all")
+
+    def test_raises_on_unbalanced_braces(self):
+        with pytest.raises(LLMResponseParseError, match="Unbalanced braces"):
+            LLMAgent._extract_json('prefix {"a": 1 with no closing brace')
+
+    def test_parse_error_is_a_value_error(self):
+        # MRO contract: LLMResponseParseError subclasses ValueError so existing
+        # ``except ValueError`` clauses (e.g. wrapping model_validate) still
+        # catch parse failures. Asserting via ``except ValueError`` rather than
+        # ``issubclass`` keeps the test grounded in how callers actually use it.
+        with pytest.raises(ValueError):
+            LLMAgent._extract_json("no braces here")
+
+    def test_truncates_long_raw_response_in_error(self):
+        # Confirm the preview cap really clips the embedded raw response —
+        # otherwise a megabyte-scale LLM hallucination would bury the
+        # stack trace. We allow a small wrapper budget for the surrounding
+        # error message (repr quotes + "No JSON object found in ..." prefix).
+        huge = "x" * 5000
+        with pytest.raises(LLMResponseParseError) as exc_info:
+            LLMAgent._extract_json(huge)
+        msg = str(exc_info.value)
+        wrapper_budget = 200
+        assert len(msg) <= _RAW_RESPONSE_PREVIEW_CHARS + wrapper_budget
+        # ...and a 4000-char run from deep inside ``huge`` must not have leaked.
+        assert "x" * 4000 not in msg
+
+
+# ---------------------------------------------------------------------------
+# generate_spec
+# ---------------------------------------------------------------------------
+
+
+class TestGenerateSpec:
+    def test_happy_path_returns_spec_and_raw(self, agent):
+        raw = json.dumps(_MINIMAL_SPEC)
+        agent.client.chat.completions.create.return_value = _chat_response(raw)
+        spec, returned_raw = agent.generate_spec("avocado on kitchen", catalog_text="catalog")
+        assert spec.embodiment == "franka_ik"
+        assert spec.background == "kitchen"
+        assert len(spec.tasks) == 1
+        assert returned_raw == raw
+
+    def test_handles_fenced_response(self, agent):
+        raw = f"```json\n{json.dumps(_MINIMAL_SPEC)}\n```"
+        agent.client.chat.completions.create.return_value = _chat_response(raw)
+        spec, _ = agent.generate_spec("p", catalog_text="catalog")
+        assert spec.embodiment == "franka_ik"
+
+    def test_propagates_parse_error_for_garbage_response(self, agent):
+        agent.client.chat.completions.create.return_value = _chat_response("not json at all")
+        with pytest.raises(LLMResponseParseError):
+            agent.generate_spec("p", catalog_text="catalog")
+
+    def test_propagates_validation_error_for_schema_violation(self, agent):
+        # Well-formed JSON but missing every required LLMEnvSpec field — pydantic
+        # surfaces this as a ``ValidationError`` distinct from a parse error.
+        agent.client.chat.completions.create.return_value = _chat_response('{"missing": "fields"}')
+        with pytest.raises(ValidationError):
+            agent.generate_spec("p", catalog_text="catalog")
+
+    def test_request_uses_configured_model(self, agent):
+        agent.client.chat.completions.create.return_value = _chat_response(json.dumps(_MINIMAL_SPEC))
+        agent.generate_spec("p", catalog_text="catalog")
+        kwargs = agent.client.chat.completions.create.call_args.kwargs
+        assert kwargs["model"] == agent.model
+
+    def test_forwards_temperature_and_max_tokens(self, agent):
+        agent.client.chat.completions.create.return_value = _chat_response(json.dumps(_MINIMAL_SPEC))
+        agent.generate_spec("p", catalog_text="catalog", temperature=0.7, max_tokens=500)
+        kwargs = agent.client.chat.completions.create.call_args.kwargs
+        assert kwargs["temperature"] == 0.7
+        assert kwargs["max_tokens"] == 500
+
+    def test_user_message_contains_catalog_and_prompt(self, agent):
+        agent.client.chat.completions.create.return_value = _chat_response(json.dumps(_MINIMAL_SPEC))
+        agent.generate_spec("user wants avocado on kitchen", catalog_text="<<CATALOG-MARKER>>")
+        msgs = agent.client.chat.completions.create.call_args.kwargs["messages"]
+        assert [m["role"] for m in msgs] == ["system", "user"]
+        user_msg = msgs[1]["content"]
+        assert "<<CATALOG-MARKER>>" in user_msg
+        assert "user wants avocado on kitchen" in user_msg
+        # The "JSON-only" instruction is the contract that lets _extract_json
+        # work — if it disappears the LLM tends to wrap in prose.
+        assert "JSON" in user_msg
+
+
+# ---------------------------------------------------------------------------
+# ping
+# ---------------------------------------------------------------------------
+
+
+class TestPing:
+    def test_returns_response_content(self, agent):
+        agent.client.chat.completions.create.return_value = _chat_response("OK")
+        assert agent.ping() == "OK"
+
+    def test_returns_empty_string_when_content_is_none(self, agent):
+        # Some providers return ``None`` content alongside a finish_reason — we
+        # treat that as a successful round-trip (the wire works) rather than
+        # raising, since the caller's contract is "did this raise?".
+        agent.client.chat.completions.create.return_value = _chat_response(None)
+        assert agent.ping() == ""
+
+    def test_uses_minimal_request_params(self, agent):
+        agent.client.chat.completions.create.return_value = _chat_response("OK")
+        agent.ping()
+        kwargs = agent.client.chat.completions.create.call_args.kwargs
+        assert kwargs["model"] == agent.model
+        assert kwargs["temperature"] == 0
+        assert kwargs["max_tokens"] == 8
+        # Single user message — no system prompt / catalog payload. Keeping the
+        # request small is the whole point: ping must stay cheap enough to run
+        # on every CI job startup.
+        assert len(kwargs["messages"]) == 1
+        assert kwargs["messages"][0]["role"] == "user"
+
+    def test_propagates_client_exceptions(self, agent):
+        class FakeAuthError(Exception):
+            pass
+
+        agent.client.chat.completions.create.side_effect = FakeAuthError("invalid api key")
+        with pytest.raises(FakeAuthError, match="invalid api key"):
+            agent.ping()
+
+
+# ---------------------------------------------------------------------------
+# _system_prompt
+# ---------------------------------------------------------------------------
+
+
+class TestSystemPrompt:
+    def test_enumerates_every_relation_kind(self, agent):
+        # The prompt derives its bullet list from ``get_args(RelationKind)``;
+        # this assertion fails the moment someone adds a kind to the literal
+        # without rebuilding the prompt, which would silently teach the LLM a
+        # vocabulary the resolver doesn't accept.
+        prompt = agent._system_prompt()
+        for kind in get_args(RelationKind):
+            assert kind in prompt, f"relation kind {kind!r} missing from system prompt"
+
+    def test_enumerates_every_task_kind(self, agent):
+        # Task kinds are quoted in the prompt (JSON-style) to disambiguate from
+        # surrounding prose — keep the quoting in sync with the source.
+        prompt = agent._system_prompt()
+        for kind in get_args(TaskKind):
+            assert f'"{kind}"' in prompt, f"task kind {kind!r} missing from system prompt"
+
+    def test_embeds_llm_env_spec_schema(self, agent):
+        # We assert on field names rather than diffing the full JSON schema so
+        # the test isn't brittle to pydantic's schema-generation tweaks across
+        # versions.
+        prompt = agent._system_prompt()
+        for field in ("task_description", "background", "embodiment", "items", "initial_scene_graph", "tasks"):
+            assert field in prompt
+
+
+# ---------------------------------------------------------------------------
+# Live endpoint (opt-in, network + auth required)
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.llm_remote_e2e
+@pytest.mark.skipif(not os.getenv("NV_API_KEY"), reason="NV_API_KEY not set; skipping live LLM endpoint test")
+def test_generate_spec_against_live_endpoint():
+    """End-to-end smoke test against the real OpenAI-compatible endpoint.
+
+    Exercises the full pipeline with default ``model`` / ``base_url`` /
+    system prompt:
+
+        auth → HTTPS → model response → JSON extract → LLMEnvSpec validation
+
+    Two layers gate this from default ``pytest`` runs:
+
+      * ``llm_remote_e2e`` marker — registered in ``pytest.ini`` next to
+        ``gr00t_remote_e2e``. Run explicitly with
+        ``pytest -m llm_remote_e2e isaaclab_arena/tests/test_llm_agent.py``.
+      * ``skipif`` on ``NV_API_KEY`` — belt-and-braces so a forgotten
+        marker filter still skips when no key is configured locally.
+
+    The asset catalog is supplied inline rather than via ``AssetRegistry``
+    so the test doesn't depend on Isaac Lab asset registration state — we
+    only want to validate the LLM wire here, not the catalog builder.
+
+    Assertions are intentionally loose: we check shape (non-empty raw,
+    non-empty tasks, populated background/embodiment) rather than exact
+    content, since LLM output drifts between model versions.
+    """
+    agent = LLMAgent()
+    catalog = (
+        "EMBODIMENTS: franka_ik\n\n"
+        "BACKGROUNDS: maple_table_kitchen\n\n"
+        "OBJECTS (2):\n"
+        "- avocado_robolab  tags=['vegetable']\n"
+        "- bowl_robolab  tags=['container']"
+    )
+    spec, raw = agent.generate_spec(
+        "pick up the avocado and place it in the bowl on the kitchen table",
+        catalog_text=catalog,
+    )
+    assert isinstance(raw, str) and raw, "LLM returned empty raw response"
+    assert spec.tasks, "LLMEnvSpec must contain at least one task"
+    assert spec.background, "LLMEnvSpec.background must be populated"
+    assert spec.embodiment, "LLMEnvSpec.embodiment must be populated"
diff --git a/pytest.ini b/pytest.ini
index d9d330ca9..a4747c4cc 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -5,3 +5,4 @@ markers =
     with_newton: test uses Newton physics
     gr00t_policy: test exercises GR00T policy/data code that runs in the base container (lightweight gr00t deps only)
     gr00t_remote_e2e: test requires a live GR00T remote policy server
+    llm_remote_e2e: test requires a live OpenAI-compatible LLM endpoint (needs NV_API_KEY)

From 85a7f83c362819ceeb68a83d5b343cfd3c00a5dc Mon Sep 17 00:00:00 2001
From: Qian Lin <qianl@nvidia.com>
Date: Wed, 27 May 2026 23:43:12 +0800
Subject: [PATCH 19/41] Add llm_remote_e2e job to CI
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Runs the live LLMAgent end-to-end test against the OpenAI-compatible
endpoint on every PR, in parallel with the existing test /
tests_with_subprocess / GR00T E2E jobs. Reuses the arena container only
for the bundled openai + pydantic dependencies and /isaac-sim/python.sh
interpreter — the job itself is pure-Python and skips the nvidia-smi /
kit-cache setup steps.

The NV_API_KEY env var (read by LLMAgent) is wired from a new
ARENA_NV_API_KEY repo secret, mirroring the ARENA_NGC_API_KEY ->
NGC_API_KEY convention. A pre-test guard fails the job loudly when the
secret is unset: without it, the test's skipif(not NV_API_KEY) would
produce a silently-green job with zero LLM coverage.

Requires ARENA_NV_API_KEY to be configured in repo secrets before this
job will succeed.

Signed-off-by: Qian Lin <qianl@nvidia.com>
---
 .github/workflows/ci.yml | 47 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 20cbdbddc..e8bd220b9 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -265,6 +265,53 @@ jobs:
             isaaclab_arena_gr00t/tests/test_gr00t_remote_closedloop_policy_runner.py
 
 
+  test_llm_remote_e2e:
+    name: LLM remote E2E
+    runs-on: [self-hosted, gpu-arena]
+    timeout-minutes: 20
+    needs: [pre_commit]
+    env:
+      # NV_API_KEY is the variable LLMAgent reads at runtime (see
+      # isaaclab_arena/llm_env_gen/llm_agent.py). The repo-level secret is
+      # named ARENA_NV_API_KEY to mirror ARENA_NGC_API_KEY and avoid
+      # collisions with other consumers of NV_API_KEY in the runner env.
+      NV_API_KEY: ${{ secrets.ARENA_NV_API_KEY }}
+
+    container:
+      image: nvcr.io/nvstaging/isaac-amr/isaaclab_arena:latest
+      credentials:
+        username: $oauthtoken
+        password: ${{ env.NGC_API_KEY }}
+
+    steps:
+      # No nvidia-smi / kit cache setup: this job is pure-Python (openai +
+      # pydantic) and never touches Isaac Sim. We reuse the arena image
+      # only because it already has the deps + the /isaac-sim/python.sh
+      # interpreter the rest of the suite calls into.
+      - *install_git_step
+      - *cleanup_step
+      - *mark_repo_safe_step
+      - *checkout_step
+      - *git_lfs_step
+      - *install_project_step
+
+      # Fail loudly when the secret isn't wired up — the test itself
+      # ``skipif``s when NV_API_KEY is empty, so without this guard a
+      # missing secret would silently produce a green job with zero LLM
+      # coverage.
+      - name: Verify ARENA_NV_API_KEY is configured
+        run: |
+          if [ -z "${NV_API_KEY}" ]; then
+            echo "::error::ARENA_NV_API_KEY repo secret is not set; cannot run llm_remote_e2e tests."
+            exit 1
+          fi
+
+      - name: Run LLM remote E2E test
+        run: |
+          /isaac-sim/python.sh -m pytest -sv --durations=0 -m llm_remote_e2e \
+            isaaclab_arena/tests/test_llm_agent.py
+
+
   build_docs_pre_merge:
     name: Build the docs (pre-merge)
     runs-on: [self-hosted, gpu-arena]

From 472a48b56526c36866f1c0a036b5af486e1451c7 Mon Sep 17 00:00:00 2001
From: Qian Lin <qianl@nvidia.com>
Date: Wed, 27 May 2026 23:51:58 +0800
Subject: [PATCH 20/41] Move Resolver into a new branch

---
 isaaclab_arena/llm_env_gen/resolver.py   | 423 --------------------
 isaaclab_arena/llm_env_gen/try_schema.py |  47 ---
 isaaclab_arena/tests/test_resolver.py    | 479 -----------------------
 3 files changed, 949 deletions(-)
 delete mode 100644 isaaclab_arena/llm_env_gen/resolver.py
 delete mode 100644 isaaclab_arena/tests/test_resolver.py

diff --git a/isaaclab_arena/llm_env_gen/resolver.py b/isaaclab_arena/llm_env_gen/resolver.py
deleted file mode 100644
index ff3f7d2bb..000000000
--- a/isaaclab_arena/llm_env_gen/resolver.py
+++ /dev/null
@@ -1,423 +0,0 @@
-# Copyright (c) 2026, The Isaac Lab Arena Project Developers (https://github.com/isaac-sim/IsaacLab-Arena/blob/main/CONTRIBUTORS.md).
-# All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-"""Deterministic resolver that turns an LLMEnvSpec into an ArenaEnvGraphSpec.
-
-The LLM emits an LLMEnvSpec. Resolver.resolve() walks that spec, binds each
-query string to a registered Asset (preferring exact name, then fuzzy match
-filtered by tags), and emits a fully-formed :class:`ArenaEnvGraphSpec`:
-
-  * ``nodes`` — background, embodiment, and objects.
-  * ``state_specs`` — one initial state spec derived from
-    ``LLMEnvSpec.initial_scene_graph``, plus one empty success state spec
-    per task as a placeholder for downstream synthesis.
-  * ``tasks`` — one task per LLM task, wired to its initial / success
-    state spec ids.
-
-Per-step "why-this-binding" decisions accumulate on ``self.trace`` so the
-caller can inspect resolution after the fact.
-"""
-
-from __future__ import annotations
-
-from dataclasses import dataclass, field
-from difflib import get_close_matches
-
-from isaaclab_arena.assets.registries import AssetRegistry
-from isaaclab_arena.environments.arena_env_graph_spec import (
-    ArenaEnvGraphNodeSpec,
-    ArenaEnvGraphNodeType,
-    ArenaEnvGraphSpatialConstraintSpec,
-    ArenaEnvGraphSpatialConstraintType,
-    ArenaEnvGraphSpec,
-    ArenaEnvGraphStateSpec,
-    ArenaEnvGraphTaskSpec,
-)
-
-from .llm_schema import Item, LLMEnvSpec, Relation, Task
-
-# When the LLM emits a bare robot family name, pick the IK variant.
-IK_DEFAULTS: dict[str, str] = {
-    "franka": "franka_ik",
-    "droid": "droid_differential_ik",
-    "g1": "g1_wbc_pink",
-    "gr1": "gr1_pink",
-}
-
-# id used for the single initial state spec the resolver emits.
-_INITIAL_STATE_SPEC_ID = "state_initial"
-
-
-# id pattern used for the per-task success state spec placeholders. Each is
-# emitted as an empty ArenaEnvGraphStateSpec so that ArenaEnvGraphSpec
-# reference-existence assertions hold; downstream task-graph synthesis is
-# responsible for populating them.
-def _success_state_spec_id(task_index: int) -> str:
-    return f"state_success_{task_index}"
-
-
-@dataclass
-class TraceEvent:
-    """One step in the resolution pipeline — emitted to a structured log."""
-
-    stage: str
-    query: str
-    chosen: str | None
-    candidates: list[str] = field(default_factory=list)
-    note: str = ""
-
-
-class Resolver:
-    """Resolves LLMEnvSpec fields against AssetRegistry.
-
-    Design notes:
-      * Never raises on LLM mistakes — instead records a trace event with
-        chosen=None so the caller can decide (retry LLM, ask user, fall back).
-      * Exact name match wins. Otherwise substring containment, then difflib
-        fuzzy, within a tag-filtered pool.
-      * category_tags is a PREFERENCE, not a hard filter: if the tag pool is
-        empty or yields no close match, we relax to the full object pool and
-        record the relaxation in the trace.
-      * The trace lives on the Resolver instance (``self.trace``) and is
-        cleared at the start of every ``resolve()`` call.
-      * Callers that want to bail out / retry on bad resolutions can poll
-        ``has_resolution_errors`` / ``resolution_errors`` after
-        ``resolve()``; relaxation events and the embodiment-fallback miss
-        are NOT considered errors because the resolver still produced
-        usable output for them.
-    """
-
-    # Trace stages emitted only when the resolver had to drop or invalidate
-    # data — i.e. the resulting spec is semantically incomplete. Distinct
-    # from advisory stages like ``item.tag_pool_empty`` (successful
-    # relaxation) and ``embodiment.miss`` (falls back to a usable
-    # embodiment). Updated alongside ``_resolve_*`` / ``_build_*`` whenever
-    # a new failure mode is added.
-    _ERROR_TRACE_STAGES: frozenset[str] = frozenset({
-        "item.miss",
-        "name.wrong_tag",
-        "name.miss",
-        "relation.initial.unsupported_kind",
-        "relation.initial.unknown_subject",
-        "relation.initial.unknown_target",
-        "task.unknown_subject",
-        "task.unknown_target",
-    })
-
-    def __init__(self, registry: AssetRegistry | None = None):
-        self.registry = registry or AssetRegistry()
-        # Populated incrementally by every resolution call. Caller reads after
-        # ``resolve()`` returns.
-        self.trace: list[TraceEvent] = []
-
-    @property
-    def resolution_errors(self) -> list[TraceEvent]:
-        """Trace events flagged as failures of the last ``resolve()`` call.
-
-        See ``_ERROR_TRACE_STAGES`` for the exact set. The list preserves
-        trace order so callers can show "what went wrong, in order".
-        """
-        return [e for e in self.trace if e.stage in self._ERROR_TRACE_STAGES]
-
-    @property
-    def has_resolution_errors(self) -> bool:
-        """``True`` iff the last ``resolve()`` produced an incomplete spec."""
-        return bool(self.resolution_errors)
-
-    def resolve(self, spec: LLMEnvSpec, env_name: str | None = None) -> ArenaEnvGraphSpec:
-        """Resolve an LLMEnvSpec into a full :class:`ArenaEnvGraphSpec`.
-
-        ``env_name`` is derived from the first task and background if not
-        provided. The success state of each task is NOT derived here —
-        downstream code is responsible for filling in the per-task success
-        state specs that this resolver emits as empty placeholders.
-        """
-        self.trace = []
-
-        nodes: list[ArenaEnvGraphNodeSpec] = []
-
-        background_node = self._resolve_background_node(spec.background)
-        if background_node is not None:
-            nodes.append(background_node)
-
-        nodes.append(self._resolve_embodiment_node(spec.embodiment))
-
-        for item in spec.items:
-            item_node = self._resolve_item_node(item)
-            if item_node is not None:
-                nodes.append(item_node)
-
-        known_ids = {node.id for node in nodes}
-
-        initial_state_spec = self._build_initial_state_spec(spec.initial_scene_graph, known_ids)
-        success_state_specs = [ArenaEnvGraphStateSpec(id=_success_state_spec_id(i)) for i in range(len(spec.tasks))]
-        state_specs = [initial_state_spec, *success_state_specs]
-        tasks = self._build_task_specs(spec.tasks, known_ids)
-
-        env_graph_spec = ArenaEnvGraphSpec(
-            env_name=env_name or self._derive_env_name(spec),
-            nodes=nodes,
-            tasks=tasks,
-            state_specs=state_specs,
-        )
-
-        return env_graph_spec
-
-    @staticmethod
-    def _derive_env_name(spec: LLMEnvSpec) -> str:
-        first_kind = spec.tasks[0].kind if spec.tasks else "task"
-        return f"llm_gen_{spec.background}_{first_kind}"
-
-    # ------------------------------------------------------------------
-    # Node construction
-    # ------------------------------------------------------------------
-
-    def _resolve_background_node(self, query: str) -> ArenaEnvGraphNodeSpec | None:
-        cls = self._resolve_name(query, required_tag="background")
-        if cls is None:
-            return None
-        return ArenaEnvGraphNodeSpec(
-            id=query,
-            name=cls.name,
-            type=ArenaEnvGraphNodeType.BACKGROUND,
-        )
-
-    def _resolve_embodiment_node(self, query: str) -> ArenaEnvGraphNodeSpec:
-        embodiment_name = self._resolve_embodiment(query)
-        return ArenaEnvGraphNodeSpec(
-            id=embodiment_name,
-            name=embodiment_name,
-            type=ArenaEnvGraphNodeType.EMBODIMENT,
-        )
-
-    def _resolve_item_node(self, item: Item) -> ArenaEnvGraphNodeSpec | None:
-        cls = self._resolve_item(item)
-        if cls is None:
-            return None
-        params: dict = {}
-        if item.scale is not None:
-            params["scale"] = item.scale
-        return ArenaEnvGraphNodeSpec(
-            id=item.instance_name or item.query,
-            name=cls.name,
-            type=ArenaEnvGraphNodeType.OBJECT,
-            params=params,
-        )
-
-    # ------------------------------------------------------------------
-    # State spec + task spec construction
-    # ------------------------------------------------------------------
-
-    def _build_initial_state_spec(self, graph: list[Relation], known_ids: set[str]) -> ArenaEnvGraphStateSpec:
-        """Translate the LLM's initial scene graph into an ArenaEnvGraphStateSpec."""
-        constraints: list[ArenaEnvGraphSpatialConstraintSpec] = []
-        for index, rel in enumerate(graph):
-            constraint = self._build_spatial_constraint(rel, index, known_ids)
-            if constraint is not None:
-                constraints.append(constraint)
-        return ArenaEnvGraphStateSpec(
-            id=_INITIAL_STATE_SPEC_ID,
-            spatial_constraints=constraints,
-            task_constraints=[],
-        )
-
-    def _build_spatial_constraint(
-        self, rel: Relation, index: int, known_ids: set[str]
-    ) -> ArenaEnvGraphSpatialConstraintSpec | None:
-        stage_prefix = "relation.initial"
-        if rel.kind == "in":
-            self.trace.append(
-                TraceEvent(
-                    f"{stage_prefix}.in_skipped",
-                    rel.subject,
-                    rel.target,
-                    note="'in' has no initial-state semantics; specify placement changes via tasks instead.",
-                )
-            )
-            return None
-        # ``ArenaEnvGraphSpatialConstraintType(value)`` is the built-in value-based
-        # enum lookup — the schema's ``RelationKind`` literal strings are kept in
-        # 1:1 sync with this enum's values, so this single call replaces what used
-        # to be a hand-maintained dict + membership check. A ``ValueError`` here
-        # means the LLM produced a kind that pydantic's ``Literal`` should have
-        # rejected upstream — we still trace defensively in case of bypass.
-        try:
-            constraint_type = ArenaEnvGraphSpatialConstraintType(rel.kind)
-        except ValueError:
-            self.trace.append(
-                TraceEvent(
-                    f"{stage_prefix}.unsupported_kind",
-                    rel.subject,
-                    None,
-                    note=f"kind={rel.kind!r} has no ArenaEnvGraphSpatialConstraintType counterpart; skipping",
-                )
-            )
-            return None
-        if rel.subject not in known_ids:
-            self.trace.append(TraceEvent(f"{stage_prefix}.unknown_subject", rel.subject, None, note=rel.kind))
-            return None
-        if rel.target is not None and rel.target not in known_ids:
-            self.trace.append(TraceEvent(f"{stage_prefix}.unknown_target", rel.target, None, note=rel.kind))
-            return None
-
-        # ``target is None`` is the unary signal from the schema (e.g. ``is_anchor``,
-        # ``at_position``) — see ``Relation.target`` in ``schema.py``. Binary
-        # relations (on / in / next_to / ...) provide a target; the subject
-        # becomes the child anchored on the target.
-        if rel.target is None:
-            parent, child = rel.subject, None
-        else:
-            parent, child = rel.target, rel.subject
-
-        child_part = f"_{child}" if child is not None else ""
-        constraint_id = f"{_INITIAL_STATE_SPEC_ID}_{index}_{rel.kind}_{parent}{child_part}"
-        self.trace.append(TraceEvent(f"{stage_prefix}.ok", rel.subject, rel.target, note=rel.kind))
-        return ArenaEnvGraphSpatialConstraintSpec(
-            id=constraint_id,
-            type=constraint_type,
-            parent=parent,
-            child=child,
-            params=dict(rel.params),
-        )
-
-    def _build_task_specs(self, tasks: list[Task], known_ids: set[str]) -> list[ArenaEnvGraphTaskSpec]:
-        out: list[ArenaEnvGraphTaskSpec] = []
-        for index, task in enumerate(tasks):
-            self.trace.append(
-                TraceEvent(
-                    "task.resolve",
-                    task.kind,
-                    task.kind,
-                    note=f"subject={task.subject}, target={task.target}",
-                )
-            )
-            if task.subject not in known_ids:
-                self.trace.append(TraceEvent("task.unknown_subject", task.subject, None, note=f"task kind={task.kind}"))
-            if task.target is not None and task.target not in known_ids:
-                self.trace.append(TraceEvent("task.unknown_target", task.target, None, note=f"task kind={task.kind}"))
-            out.append(
-                ArenaEnvGraphTaskSpec(
-                    id=f"task_{index}_{task.kind}",
-                    type=task.kind,
-                    initial_state_spec_id=_INITIAL_STATE_SPEC_ID,
-                    # Points at an empty placeholder state spec emitted by
-                    # resolve(); downstream task-graph synthesis fills it in.
-                    success_state_spec_id=_success_state_spec_id(index),
-                    task_args={
-                        "subject": task.subject,
-                        "target": task.target,
-                        "description": task.description,
-                    },
-                )
-            )
-        return out
-
-    # ------------------------------------------------------------------
-    # Asset binding helpers (use self.trace directly)
-    # ------------------------------------------------------------------
-
-    def _resolve_item(self, item: Item) -> type | None:
-        if self.registry.is_registered(item.query):
-            self.trace.append(TraceEvent("item.exact", item.query, item.query))
-            return self.registry.get_asset_by_name(item.query)
-
-        object_pool = self._pool_for(["object"])
-
-        if item.category_tags:
-            pool = self._pool_for(item.category_tags)
-            if not pool:
-                self.trace.append(
-                    TraceEvent(
-                        "item.tag_pool_empty",
-                        item.query,
-                        None,
-                        note=f"no assets matched tags={item.category_tags}; relaxing to objects",
-                    )
-                )
-            else:
-                cls = self._best_match(item.query, pool, stage_prefix="item.in_tags", note=f"tags={item.category_tags}")
-                if cls is not None:
-                    return cls
-                self.trace.append(
-                    TraceEvent(
-                        "item.no_match_in_tags",
-                        item.query,
-                        None,
-                        candidates=pool[:10],
-                        note=f"tags={item.category_tags}; relaxing to objects",
-                    )
-                )
-
-        cls = self._best_match(
-            item.query, object_pool, stage_prefix="item.relaxed", note="closest object; category ignored"
-        )
-        if cls is not None:
-            return cls
-
-        self.trace.append(TraceEvent("item.miss", item.query, None, candidates=object_pool[:10]))
-        return None
-
-    def _best_match(self, query: str, pool: list[str], stage_prefix: str, note: str) -> type | None:
-        """Prefer substring containment (e.g. 'bowl' → 'bowl_ycb_robolab'), then difflib fuzzy."""
-        q = query.lower()
-        substrs = [p for p in pool if q in p.lower()]
-        if substrs:
-            chosen = min(substrs, key=len)
-            self.trace.append(TraceEvent(f"{stage_prefix}.substring", query, chosen, candidates=substrs[:5], note=note))
-            return self.registry.get_asset_by_name(chosen)
-
-        matches = get_close_matches(query, pool, n=3, cutoff=0.5)
-        if matches:
-            self.trace.append(TraceEvent(f"{stage_prefix}.fuzzy", query, matches[0], candidates=matches, note=note))
-            return self.registry.get_asset_by_name(matches[0])
-        return None
-
-    def _pool_for(self, tags: list[str]) -> list[str]:
-        # Intersection across tags — an item tagged {"vegetable", "graspable"}
-        # must satisfy both.
-        assets = None
-        for tag in tags:
-            tagged = {a.name for a in self.registry.get_assets_by_tag(tag)}
-            assets = tagged if assets is None else assets & tagged
-        return sorted(assets or [])
-
-    def _resolve_name(self, name: str, required_tag: str | None) -> type | None:
-        if self.registry.is_registered(name):
-            cls = self.registry.get_asset_by_name(name)
-            if required_tag and required_tag not in getattr(cls, "tags", []):
-                self.trace.append(TraceEvent("name.wrong_tag", name, None, note=f"expected tag {required_tag!r}"))
-                return None
-            self.trace.append(TraceEvent("name.exact", name, name))
-            return cls
-
-        pool = self._pool_for([required_tag]) if required_tag else self.registry.get_all_keys()
-        matches = get_close_matches(name, pool, n=3, cutoff=0.5)
-        if matches:
-            self.trace.append(TraceEvent("name.fuzzy", name, matches[0], candidates=matches))
-            return self.registry.get_asset_by_name(matches[0])
-
-        self.trace.append(TraceEvent("name.miss", name, None, candidates=pool[:10]))
-        return None
-
-    def _resolve_embodiment(self, name: str) -> str:
-        if self.registry.is_registered(name):
-            self.trace.append(TraceEvent("embodiment.exact", name, name))
-            return name
-
-        lower = name.lower()
-        if lower in IK_DEFAULTS:
-            chosen = IK_DEFAULTS[lower]
-            self.trace.append(
-                TraceEvent("embodiment.ik_default", name, chosen, note=f"bare family {name!r} → IK variant")
-            )
-            return chosen
-
-        embodiment_pool = self._pool_for(["embodiment"])
-        matches = get_close_matches(name, embodiment_pool, n=3, cutoff=0.5)
-        if matches:
-            self.trace.append(TraceEvent("embodiment.fuzzy", name, matches[0], candidates=matches))
-            return matches[0]
-        self.trace.append(TraceEvent("embodiment.miss", name, None, note="falling back to franka_ik"))
-        return "franka_ik"
diff --git a/isaaclab_arena/llm_env_gen/try_schema.py b/isaaclab_arena/llm_env_gen/try_schema.py
index e23a04d1e..54004ac42 100644
--- a/isaaclab_arena/llm_env_gen/try_schema.py
+++ b/isaaclab_arena/llm_env_gen/try_schema.py
@@ -28,7 +28,6 @@
 
 import argparse
 import json
-from pathlib import Path
 
 DEFAULT_PROMPT = (
     "franka pick up avocado from the table and place it into a bowl on the table. "
@@ -39,11 +38,6 @@
     " There are other utensils on the table as distractor"
 )
 
-# Resolved-spec dumps land here so they're easy to find next to the existing
-# auto-generated env modules. Path is computed from this file so it works
-# inside the container (/workspaces/isaaclab_arena) and outside.
-_LLM_GENERATED_DIR = Path(__file__).resolve().parents[2] / "isaaclab_arena_environments" / "llm_generated"
-
 
 def main() -> None:
     parser = argparse.ArgumentParser(description=__doc__)
@@ -110,47 +104,6 @@ def main() -> None:
     print("\n=== parsed LLMEnvSpec ===")
     print(spec.model_dump_json(indent=2))
 
-    from isaaclab_arena.llm_env_gen.resolver import Resolver
-
-    resolver = Resolver()
-    env_graph_spec = resolver.resolve(spec)
-
-    print(f"\n=== resolved ArenaEnvGraphSpec (env_name={env_graph_spec.env_name!r}) ===")
-
-    print("\nnodes:")
-    for node in env_graph_spec.nodes:
-        params_str = f"  params={node.params}" if node.params else ""
-        print(f"  {node.id:24s} type={node.type.value:18s} name={node.name}{params_str}")
-
-    print("\nstate_specs:")
-    for state_spec in env_graph_spec.state_specs:
-        s_count = len(state_spec.spatial_constraints)
-        t_count = len(state_spec.task_constraints)
-        print(f"  {state_spec.id:24s} spatial={s_count} task={t_count}")
-        for c in state_spec.spatial_constraints:
-            child_str = f", child={c.child}" if c.child else ""
-            params_str = f"  params={c.params}" if c.params else ""
-            print(f"    {c.type.value:16s} parent={c.parent}{child_str}{params_str}")
-        for c in state_spec.task_constraints:
-            print(f"    {c.type.value:16s} parent={c.parent}  child={c.child}")
-
-    print("\ntasks:")
-    for task in env_graph_spec.tasks:
-        print(
-            f"  {task.id:28s} type={task.type:18s} "
-            f"initial={task.initial_state_spec_id!r} success={task.success_state_spec_id!r}"
-        )
-        print(f"    task_args: {task.task_args}")
-
-    print("\n=== trace ===")
-    for t in resolver.trace:
-        chosen = t.chosen if t.chosen is not None else "<none>"
-        extra = f"  [{t.note}]" if t.note else ""
-        print(f"  {t.stage:34s} {t.query!s:24s} -> {chosen}{extra}")
-
-    out_path = env_graph_spec.to_yaml(_LLM_GENERATED_DIR / f"{env_graph_spec.env_name}_proposal.yaml")
-    print(f"\n=== wrote ArenaEnvGraphSpec YAML to {out_path} ===")
-
 
 if __name__ == "__main__":
     main()
diff --git a/isaaclab_arena/tests/test_resolver.py b/isaaclab_arena/tests/test_resolver.py
deleted file mode 100644
index 894d722ad..000000000
--- a/isaaclab_arena/tests/test_resolver.py
+++ /dev/null
@@ -1,479 +0,0 @@
-# Copyright (c) 2026, The Isaac Lab Arena Project Developers (https://github.com/isaac-sim/IsaacLab-Arena/blob/main/CONTRIBUTORS.md).
-# All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-"""Unit tests for :class:`isaaclab_arena.llm_env_gen.resolver.Resolver`.
-
-The resolver is pure Python — no Isaac Sim / Kit / pxr dependency — so these
-tests run as plain pytest functions against an injected fake AssetRegistry.
-They exercise the resolver's deterministic logic in isolation: asset binding
-strategies (exact / substring / fuzzy / tag-pool relaxation / miss),
-embodiment family defaults, spatial constraint construction (binary vs unary
-relations, ``in`` skipping, unknown-node defensive traces), task spec wiring,
-and trace lifecycle.
-"""
-
-from __future__ import annotations
-
-from dataclasses import dataclass
-
-from isaaclab_arena.environments.arena_env_graph_spec import ArenaEnvGraphNodeType, ArenaEnvGraphSpatialConstraintType
-from isaaclab_arena.llm_env_gen.llm_schema import Item, LLMEnvSpec, Relation, Task
-from isaaclab_arena.llm_env_gen.resolver import IK_DEFAULTS, Resolver
-
-# ---------------------------------------------------------------------------
-# Test fixtures
-# ---------------------------------------------------------------------------
-
-
-@dataclass
-class FakeAsset:
-    """Minimal stand-in for the asset classes the resolver inspects.
-
-    Real asset classes are decorated classes pulled in via
-    ``ensure_assets_registered()``. The resolver only ever reads ``.name`` and
-    ``.tags`` off them, so a simple dataclass suffices and keeps the tests
-    independent of isaaclab / Kit.
-    """
-
-    name: str
-    tags: list[str]
-
-
-class FakeAssetRegistry:
-    """Duck-typed AssetRegistry for unit tests.
-
-    Implements the four methods the resolver calls — ``is_registered``,
-    ``get_asset_by_name``, ``get_assets_by_tag``, ``get_all_keys`` — without
-    pulling in isaaclab. We deliberately don't subclass :class:`AssetRegistry`
-    directly because it uses ``SingletonMeta``, which would force test
-    isolation gymnastics. Duck-typing via the resolver's ``registry`` argument
-    is the supported injection point.
-    """
-
-    def __init__(self, assets: list[FakeAsset]):
-        self._by_name: dict[str, FakeAsset] = {a.name: a for a in assets}
-
-    def is_registered(self, key: str) -> bool:
-        return key in self._by_name
-
-    def get_asset_by_name(self, name: str) -> FakeAsset:
-        assert name in self._by_name, f"unregistered asset: {name}"
-        return self._by_name[name]
-
-    def get_assets_by_tag(self, tag: str) -> list[FakeAsset]:
-        return [a for a in self._by_name.values() if tag in a.tags]
-
-    def get_all_keys(self) -> list[str]:
-        return list(self._by_name)
-
-
-def _default_assets() -> list[FakeAsset]:
-    """Small but representative catalog covering all three asset categories.
-
-    Object names intentionally include the suffix conventions seen in the
-    real registry (e.g. ``bowl_ycb_robolab``) so substring-match tests
-    exercise realistic fuzzy/substring behaviour.
-    """
-    return [
-        FakeAsset(name="maple_table", tags=["background"]),
-        FakeAsset(name="franka_ik", tags=["embodiment"]),
-        FakeAsset(name="franka_joint_pos", tags=["embodiment"]),
-        FakeAsset(name="bowl_ycb_robolab", tags=["object", "bowl"]),
-        FakeAsset(name="avocado01_fruits_robolab", tags=["object", "fruit"]),
-        FakeAsset(name="apple01_fruits_robolab", tags=["object", "fruit"]),
-        FakeAsset(name="cracker_box", tags=["object", "graspable"]),
-    ]
-
-
-def _make_resolver(assets: list[FakeAsset] | None = None) -> Resolver:
-    return Resolver(registry=FakeAssetRegistry(assets or _default_assets()))
-
-
-def _make_scene(
-    *,
-    background: str = "maple_table",
-    embodiment: str = "franka_ik",
-    items: list[Item] | None = None,
-    initial_scene_graph: list[Relation] | None = None,
-    tasks: list[Task] | None = None,
-) -> LLMEnvSpec:
-    """Build a :class:`LLMEnvSpec` with sane defaults for tests that don't care."""
-    return LLMEnvSpec(
-        task_description="test scene",
-        background=background,
-        embodiment=embodiment,
-        items=items or [],
-        initial_scene_graph=initial_scene_graph or [],
-        # ``tasks`` must be non-empty per LLMEnvSpec validator.
-        tasks=tasks
-        or [Task(kind="pick_and_place", subject="placeholder", target="placeholder", description="placeholder")],
-    )
-
-
-# ---------------------------------------------------------------------------
-# Top-level resolve()
-# ---------------------------------------------------------------------------
-
-
-def test_resolve_happy_path():
-    items = [
-        Item(query="bowl", role="foreground", category_tags=["bowl"]),
-        Item(query="avocado", role="foreground", category_tags=["fruit"]),
-    ]
-    initial = [
-        Relation(kind="is_anchor", subject="maple_table"),
-        Relation(kind="on", subject="bowl", target="maple_table"),
-        Relation(kind="on", subject="avocado", target="maple_table"),
-    ]
-    tasks = [Task(kind="pick_and_place", subject="avocado", target="bowl", description="put avocado in bowl")]
-    spec = _make_resolver().resolve(_make_scene(items=items, initial_scene_graph=initial, tasks=tasks))
-
-    # Auto-derived env_name: f"llm_gen_{background}_{first_task_kind}".
-    assert spec.env_name == "llm_gen_maple_table_pick_and_place"
-
-    # Node order: background, embodiment, items in declaration order.
-    node_ids = [n.id for n in spec.nodes]
-    assert node_ids == ["maple_table", "franka_ik", "bowl", "avocado"]
-    assert spec.nodes_by_id["maple_table"].type == ArenaEnvGraphNodeType.BACKGROUND
-    assert spec.nodes_by_id["franka_ik"].type == ArenaEnvGraphNodeType.EMBODIMENT
-    # Item node.name reflects the *resolved* asset name, not the query.
-    assert spec.nodes_by_id["bowl"].name == "bowl_ycb_robolab"
-    assert spec.nodes_by_id["avocado"].name == "avocado01_fruits_robolab"
-
-    # State specs: 1 initial + 1 success placeholder per task.
-    assert len(spec.state_specs) == 2
-    initial_state = spec.state_specs_by_id["state_initial"]
-    assert len(initial_state.spatial_constraints) == 3
-
-    is_anchor = initial_state.spatial_constraints[0]
-    assert is_anchor.type == ArenaEnvGraphSpatialConstraintType.IS_ANCHOR
-    assert is_anchor.parent == "maple_table"
-    assert is_anchor.child is None
-    assert is_anchor.id == "state_initial_0_is_anchor_maple_table"
-
-    on_bowl = initial_state.spatial_constraints[1]
-    assert on_bowl.type == ArenaEnvGraphSpatialConstraintType.ON
-    # Binary relations: parent=target, child=subject (the LLM says "bowl on
-    # table" — the resolver inverts so the table is the anchor).
-    assert on_bowl.parent == "maple_table"
-    assert on_bowl.child == "bowl"
-    assert on_bowl.id == "state_initial_1_on_maple_table_bowl"
-
-    # Tasks.
-    assert len(spec.tasks) == 1
-    task = spec.tasks_by_id["task_0_pick_and_place"]
-    assert task.initial_state_spec_id == "state_initial"
-    assert task.success_state_spec_id == "state_success_0"
-    assert task.task_args == {"subject": "avocado", "target": "bowl", "description": "put avocado in bowl"}
-
-
-def test_resolve_overrides_env_name():
-    resolver = _make_resolver()
-    spec = resolver.resolve(_make_scene(), env_name="my_custom_env")
-    assert spec.env_name == "my_custom_env"
-
-
-def test_resolve_clears_trace_between_calls():
-    resolver = _make_resolver()
-    resolver.resolve(_make_scene())
-    n_after_first = len(resolver.trace)
-    # Sanity: at least background, embodiment, and task events should be present.
-    assert n_after_first > 0
-
-    resolver.resolve(_make_scene())
-    n_after_second = len(resolver.trace)
-    # If trace persisted across calls, the second count would be > the first.
-    # Deterministic input → identical trace length when the trace is cleared.
-    assert n_after_second == n_after_first
-
-
-def test_resolve_with_empty_initial_scene_graph():
-    spec = _make_resolver().resolve(_make_scene(initial_scene_graph=[]))
-    initial_state = spec.state_specs_by_id["state_initial"]
-    assert initial_state.spatial_constraints == []
-    # Even with no constraints, the spec should still be well-formed.
-    assert initial_state.task_constraints == []
-
-
-# ---------------------------------------------------------------------------
-# Resolution-error reporting
-# ---------------------------------------------------------------------------
-
-
-def _clean_scene_kwargs() -> dict:
-    """Scene where every node resolves and every task arg references a known node.
-
-    The default ``_make_scene`` uses a "placeholder" task subject/target that
-    deliberately doesn't resolve — fine for tests that only care about node
-    counts but unsuitable for resolution-error tests where we need a baseline
-    where the resolver succeeds completely.
-    """
-    return dict(
-        items=[Item(query="bowl", role="foreground", category_tags=["bowl"])],
-        tasks=[Task(kind="pick_and_place", subject="bowl", target="maple_table", description="d")],
-    )
-
-
-def test_has_resolution_errors_false_on_clean_run():
-    # Fully resolvable env; no error-bearing trace events should appear.
-    resolver = _make_resolver()
-    resolver.resolve(_make_scene(**_clean_scene_kwargs()))
-    assert resolver.resolution_errors == []
-    assert resolver.has_resolution_errors is False
-
-
-def test_has_resolution_errors_true_when_item_unresolvable():
-    # Add an unresolvable item on top of the clean baseline so the *only*
-    # error stage that fires is ``item.miss``.
-    kwargs = _clean_scene_kwargs()
-    kwargs["items"] = kwargs["items"] + [
-        Item(query="zzz_no_match_anywhere", role="foreground", category_tags=["object"])
-    ]
-    resolver = _make_resolver()
-    resolver.resolve(_make_scene(**kwargs))
-    assert resolver.has_resolution_errors is True
-    assert [e.stage for e in resolver.resolution_errors] == ["item.miss"]
-
-
-def test_has_resolution_errors_false_when_only_relaxation_or_fallback():
-    # Both events the bot's heuristic would mistakenly flag: tag-pool
-    # relaxation (successful) and embodiment fallback (franka_ik). Neither
-    # drops data from the resolved spec, so neither should count as an
-    # error. ``cracker`` is in the default catalog but tagged ``graspable``,
-    # not ``fruit`` — so the fruit-tag pool yields no match and the resolver
-    # relaxes to the full object pool.
-    kwargs = _clean_scene_kwargs()
-    kwargs["items"] = [Item(query="cracker", role="foreground", category_tags=["fruit"])]
-    # Switch the task subject to match the new item id so task args still resolve.
-    kwargs["tasks"] = [Task(kind="pick_and_place", subject="cracker", target="maple_table", description="d")]
-    resolver = _make_resolver()
-    resolver.resolve(_make_scene(embodiment="totally_unknown_robot", **kwargs))
-    trace_stages = [e.stage for e in resolver.trace]
-    # Sanity: the warning events actually fired in this run.
-    assert "item.no_match_in_tags" in trace_stages
-    assert "embodiment.miss" in trace_stages
-    # But neither shows up as an error.
-    assert resolver.has_resolution_errors is False
-    assert resolver.resolution_errors == []
-
-
-# ---------------------------------------------------------------------------
-# Item resolution strategies
-# ---------------------------------------------------------------------------
-
-
-def test_item_exact_name_match():
-    # Query that's already a registered asset name skips fuzzy matching.
-    items = [Item(query="cracker_box", role="foreground", category_tags=["graspable"])]
-    resolver = _make_resolver()
-    spec = resolver.resolve(_make_scene(items=items))
-    assert spec.nodes_by_id["cracker_box"].name == "cracker_box"
-    assert any(e.stage == "item.exact" for e in resolver.trace)
-
-
-def test_item_substring_match_in_tag_pool():
-    items = [Item(query="bowl", role="foreground", category_tags=["bowl"])]
-    resolver = _make_resolver()
-    spec = resolver.resolve(_make_scene(items=items))
-    assert spec.nodes_by_id["bowl"].name == "bowl_ycb_robolab"
-    assert any(e.stage == "item.in_tags.substring" for e in resolver.trace)
-
-
-def test_item_relaxes_when_tag_pool_yields_no_match():
-    # category_tags points to a real tag pool ('fruit') but the query
-    # ('cracker') doesn't substring-match either fruit. The resolver should
-    # relax to the full object pool and find cracker_box.
-    items = [Item(query="cracker", role="foreground", category_tags=["fruit"])]
-    resolver = _make_resolver()
-    spec = resolver.resolve(_make_scene(items=items))
-    assert spec.nodes_by_id["cracker"].name == "cracker_box"
-    trace_stages = [e.stage for e in resolver.trace]
-    assert "item.no_match_in_tags" in trace_stages
-    assert any(s.startswith("item.relaxed") for s in trace_stages)
-
-
-def test_item_relaxes_when_tag_pool_empty():
-    # Unknown tag → empty tag pool → resolver short-circuits the pool-search
-    # and relaxes immediately.
-    items = [Item(query="cracker", role="foreground", category_tags=["nonexistent"])]
-    resolver = _make_resolver()
-    spec = resolver.resolve(_make_scene(items=items))
-    assert spec.nodes_by_id["cracker"].name == "cracker_box"
-    assert any(e.stage == "item.tag_pool_empty" for e in resolver.trace)
-
-
-def test_item_miss_omits_node():
-    # Query that matches no asset (and no substring / fuzzy candidate) is
-    # silently dropped — the resolver records a trace but doesn't raise.
-    items = [Item(query="zzz_no_match_anywhere", role="foreground", category_tags=["object"])]
-    resolver = _make_resolver()
-    spec = resolver.resolve(_make_scene(items=items))
-    assert "zzz_no_match_anywhere" not in spec.nodes_by_id
-    assert any(e.stage == "item.miss" for e in resolver.trace)
-
-
-def test_item_scale_param_passed_through():
-    items = [Item(query="bowl", role="foreground", category_tags=["bowl"], scale=0.75)]
-    spec = _make_resolver().resolve(_make_scene(items=items))
-    assert spec.nodes_by_id["bowl"].params == {"scale": 0.75}
-
-
-def test_item_instance_name_overrides_query_for_node_id():
-    items = [Item(query="bowl", role="foreground", category_tags=["bowl"], instance_name="serving_bowl")]
-    spec = _make_resolver().resolve(_make_scene(items=items))
-    # ``instance_name`` controls the *node id*; ``name`` still reflects the
-    # resolved asset, so the same asset can appear twice under different ids.
-    assert "serving_bowl" in spec.nodes_by_id
-    assert "bowl" not in spec.nodes_by_id
-    assert spec.nodes_by_id["serving_bowl"].name == "bowl_ycb_robolab"
-
-
-# ---------------------------------------------------------------------------
-# Embodiment resolution
-# ---------------------------------------------------------------------------
-
-
-def test_embodiment_exact_match():
-    spec = _make_resolver().resolve(_make_scene(embodiment="franka_joint_pos"))
-    assert spec.nodes_by_id["franka_joint_pos"].type == ArenaEnvGraphNodeType.EMBODIMENT
-
-
-def test_embodiment_ik_default_for_bare_family():
-    resolver = _make_resolver()
-    spec = resolver.resolve(_make_scene(embodiment="franka"))
-    # The mapping is exported from the resolver so callers can introspect it.
-    assert IK_DEFAULTS["franka"] == "franka_ik"
-    assert spec.nodes_by_id["franka_ik"].type == ArenaEnvGraphNodeType.EMBODIMENT
-    assert any(e.stage == "embodiment.ik_default" for e in resolver.trace)
-
-
-def test_embodiment_unknown_falls_back_to_franka_ik():
-    # Unknown family names never raise — they fall back to franka_ik and
-    # record a miss trace. ``franka_ik`` must therefore be registered.
-    resolver = _make_resolver()
-    spec = resolver.resolve(_make_scene(embodiment="totally_unknown_robot"))
-    assert spec.nodes_by_id["franka_ik"].type == ArenaEnvGraphNodeType.EMBODIMENT
-    assert any(e.stage == "embodiment.miss" for e in resolver.trace)
-
-
-# ---------------------------------------------------------------------------
-# Background resolution
-# ---------------------------------------------------------------------------
-
-
-def test_background_with_wrong_tag_omitted():
-    # An asset registered under the name "maple_table" but NOT tagged
-    # "background" is rejected with a name.wrong_tag trace, so the background
-    # node is absent from the resulting spec.
-    assets = [
-        FakeAsset(name="franka_ik", tags=["embodiment"]),
-        FakeAsset(name="maple_table", tags=["object"]),  # wrong tag
-    ]
-    resolver = _make_resolver(assets)
-    spec = resolver.resolve(_make_scene(background="maple_table"))
-    assert "maple_table" not in spec.nodes_by_id
-    assert any(e.stage == "name.wrong_tag" for e in resolver.trace)
-
-
-# ---------------------------------------------------------------------------
-# Spatial constraint construction
-# ---------------------------------------------------------------------------
-
-
-def test_spatial_constraint_binary_relation_id_and_parent_child():
-    items = [Item(query="cracker_box", role="foreground", category_tags=["graspable"])]
-    initial = [Relation(kind="on", subject="cracker_box", target="maple_table")]
-    spec = _make_resolver().resolve(_make_scene(items=items, initial_scene_graph=initial))
-    constraint = spec.state_specs_by_id["state_initial"].spatial_constraints[0]
-    # Binary: parent=target, child=subject.
-    assert constraint.parent == "maple_table"
-    assert constraint.child == "cracker_box"
-    assert constraint.id == "state_initial_0_on_maple_table_cracker_box"
-
-
-def test_spatial_constraint_unary_relation_id_and_parent_child():
-    initial = [Relation(kind="is_anchor", subject="maple_table")]
-    spec = _make_resolver().resolve(_make_scene(initial_scene_graph=initial))
-    constraint = spec.state_specs_by_id["state_initial"].spatial_constraints[0]
-    # Unary (target is None): parent=subject, child=None.
-    assert constraint.type == ArenaEnvGraphSpatialConstraintType.IS_ANCHOR
-    assert constraint.parent == "maple_table"
-    assert constraint.child is None
-    # No "_{child}" suffix when child is None.
-    assert constraint.id == "state_initial_0_is_anchor_maple_table"
-
-
-def test_spatial_constraint_in_relation_skipped():
-    items = [Item(query="cracker_box", role="foreground", category_tags=["graspable"])]
-    initial = [Relation(kind="in", subject="cracker_box", target="maple_table")]
-    resolver = _make_resolver()
-    spec = resolver.resolve(_make_scene(items=items, initial_scene_graph=initial))
-    # "in" has no initial-state semantics — see Resolver._build_spatial_constraint.
-    assert spec.state_specs_by_id["state_initial"].spatial_constraints == []
-    assert any(e.stage == "relation.initial.in_skipped" for e in resolver.trace)
-
-
-def test_spatial_constraint_unknown_subject_skipped():
-    initial = [Relation(kind="on", subject="not_a_node", target="maple_table")]
-    resolver = _make_resolver()
-    spec = resolver.resolve(_make_scene(initial_scene_graph=initial))
-    assert spec.state_specs_by_id["state_initial"].spatial_constraints == []
-    assert any(e.stage == "relation.initial.unknown_subject" for e in resolver.trace)
-
-
-def test_spatial_constraint_unknown_target_skipped():
-    initial = [Relation(kind="on", subject="maple_table", target="missing_node")]
-    resolver = _make_resolver()
-    spec = resolver.resolve(_make_scene(initial_scene_graph=initial))
-    assert spec.state_specs_by_id["state_initial"].spatial_constraints == []
-    assert any(e.stage == "relation.initial.unknown_target" for e in resolver.trace)
-
-
-def test_spatial_constraint_params_passed_through():
-    items = [Item(query="cracker_box", role="foreground", category_tags=["graspable"])]
-    initial = [
-        Relation(
-            kind="at_position",
-            subject="cracker_box",
-            params={"position_xyz": [0.1, 0.2, 0.3]},
-        ),
-    ]
-    spec = _make_resolver().resolve(_make_scene(items=items, initial_scene_graph=initial))
-    constraint = spec.state_specs_by_id["state_initial"].spatial_constraints[0]
-    assert constraint.type == ArenaEnvGraphSpatialConstraintType.AT_POSITION
-    # ``params`` are passed through verbatim — the resolver doesn't validate
-    # the schema of relation-kind-specific params; that's the downstream
-    # builder's job.
-    assert constraint.params == {"position_xyz": [0.1, 0.2, 0.3]}
-
-
-# ---------------------------------------------------------------------------
-# Task spec construction
-# ---------------------------------------------------------------------------
-
-
-def test_multiple_tasks_get_distinct_success_state_ids():
-    tasks = [
-        Task(kind="pick_and_place", subject="bowl", target="maple_table", description="d1"),
-        Task(kind="open_door", subject="bowl", target=None, description="d2"),
-        Task(kind="close_door", subject="bowl", target=None, description="d3"),
-    ]
-    items = [Item(query="bowl", role="foreground", category_tags=["bowl"])]
-    spec = _make_resolver().resolve(_make_scene(items=items, tasks=tasks))
-
-    # Task ids follow ``task_{index}_{kind}``.
-    task_ids = [t.id for t in spec.tasks]
-    assert task_ids == ["task_0_pick_and_place", "task_1_open_door", "task_2_close_door"]
-
-    # Each task points at its own per-task placeholder success state.
-    success_ids = [t.success_state_spec_id for t in spec.tasks]
-    assert success_ids == ["state_success_0", "state_success_1", "state_success_2"]
-
-    # state_specs contains 1 initial + 3 placeholder success specs.
-    assert len(spec.state_specs) == 4
-    for i in range(3):
-        # Placeholders are empty — downstream synthesis is responsible for them.
-        assert spec.state_specs_by_id[f"state_success_{i}"].spatial_constraints == []
-        assert spec.state_specs_by_id[f"state_success_{i}"].task_constraints == []

From d5cadd7b2c6b804a54a33ccb1a0bd1df1eca0d31 Mon Sep 17 00:00:00 2001
From: Qian Lin <qianl@nvidia.com>
Date: Thu, 28 May 2026 00:31:02 +0800
Subject: [PATCH 21/41] Do not skip test_generate_spec_against_live_endpoint

---
 isaaclab_arena/tests/test_llm_agent.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/isaaclab_arena/tests/test_llm_agent.py b/isaaclab_arena/tests/test_llm_agent.py
index 9ec274637..ef5f0f755 100644
--- a/isaaclab_arena/tests/test_llm_agent.py
+++ b/isaaclab_arena/tests/test_llm_agent.py
@@ -22,7 +22,6 @@
 from __future__ import annotations
 
 import json
-import os
 from typing import get_args
 from unittest.mock import MagicMock, patch
 
@@ -320,7 +319,6 @@ def test_embeds_llm_env_spec_schema(self, agent):
 
 
 @pytest.mark.llm_remote_e2e
-@pytest.mark.skipif(not os.getenv("NV_API_KEY"), reason="NV_API_KEY not set; skipping live LLM endpoint test")
 def test_generate_spec_against_live_endpoint():
     """End-to-end smoke test against the real OpenAI-compatible endpoint.
 

From f3a36c71aa9d302f8f66ec1d7834c6db6a297674 Mon Sep 17 00:00:00 2001
From: Qian Lin <qianl@nvidia.com>
Date: Thu, 28 May 2026 00:36:09 +0800
Subject: [PATCH 22/41] Revert "Move assert_unique_ids +
 assert_references_exist to post_init so it's always called when loading from
 yaml/dict"

This reverts commit 3162272477d2108cd535538b29096f0fde0f13d0.
---
 .../environments/arena_env_graph_spec.py      | 23 ++++++++-----------
 1 file changed, 10 insertions(+), 13 deletions(-)

diff --git a/isaaclab_arena/environments/arena_env_graph_spec.py b/isaaclab_arena/environments/arena_env_graph_spec.py
index bf80433f4..c6827a7ac 100644
--- a/isaaclab_arena/environments/arena_env_graph_spec.py
+++ b/isaaclab_arena/environments/arena_env_graph_spec.py
@@ -149,16 +149,6 @@ class ArenaEnvGraphSpec:
     tasks: list[ArenaEnvGraphTaskSpec] = field(default_factory=list)
     state_specs: list[ArenaEnvGraphStateSpec] = field(default_factory=list)
 
-    def __post_init__(self) -> None:
-        # Enforce graph invariants on EVERY construction path (YAML parse, direct
-        # dataclass instantiation, programmatic build, ...). Centralizing here means
-        # downstream consumers — including ``nodes_by_id`` / ``tasks_by_id`` /
-        # ``state_specs_by_id``, which collapse duplicates silently in their dict
-        # comprehensions — can rely on globally-unique ids and valid references
-        # without re-validating.
-        assert_unique_ids(self.nodes, self.tasks, self.state_specs)
-        assert_references_exist(self.nodes, self.tasks, self.state_specs)
-
     @classmethod
     def from_yaml(cls, path: str | Path) -> "ArenaEnvGraphSpec":
         with Path(path).open("r", encoding="utf-8") as f:
@@ -167,11 +157,18 @@ def from_yaml(cls, path: str | Path) -> "ArenaEnvGraphSpec":
     @classmethod
     def from_dict(cls, data: dict[str, Any]) -> "ArenaEnvGraphSpec":
         data = as_dict(data, "Env graph spec")
+        nodes = parse_list(data, "nodes", _parse_node)
+        tasks = parse_list(data, "tasks", _parse_task)
+        state_specs = parse_list(data, "state_specs", _parse_state_spec)
+
+        assert_unique_ids(nodes, tasks, state_specs)
+        assert_references_exist(nodes, tasks, state_specs)
+
         return cls(
             env_name=required_str(data, "env_name"),
-            nodes=parse_list(data, "nodes", _parse_node),
-            tasks=parse_list(data, "tasks", _parse_task),
-            state_specs=parse_list(data, "state_specs", _parse_state_spec),
+            nodes=nodes,
+            tasks=tasks,
+            state_specs=state_specs,
         )
 
     def to_dict(self) -> dict[str, Any]:

From 421a282ba3561dfbee800236f5038ff79993cfe6 Mon Sep 17 00:00:00 2001
From: Qian Lin <qianl@nvidia.com>
Date: Thu, 28 May 2026 00:42:24 +0800
Subject: [PATCH 23/41] Revert ArenaEnvGraphSpec YAML-serialization additions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Restore isaaclab_arena/environments/arena_env_graph_spec.py to match
main: drop the to_dict / to_yaml methods and the _yaml_dict_factory
helper that were added earlier on this branch. Nothing on the branch
consumes them anymore — try_schema.py now prints model_dump_json
instead of dumping the resolved spec to YAML, and no tests depend on
the methods — so the additions were dead code on tip.

Removes the dataclasses.asdict import that was only used by the
serialization helpers.

Signed-off-by: Qian Lin <qianl@nvidia.com>
---
 .../environments/arena_env_graph_spec.py      | 43 +------------------
 isaaclab_arena/llm_env_gen/try_schema.py      |  7 +--
 2 files changed, 2 insertions(+), 48 deletions(-)

diff --git a/isaaclab_arena/environments/arena_env_graph_spec.py b/isaaclab_arena/environments/arena_env_graph_spec.py
index c6827a7ac..2f69f97d6 100644
--- a/isaaclab_arena/environments/arena_env_graph_spec.py
+++ b/isaaclab_arena/environments/arena_env_graph_spec.py
@@ -4,7 +4,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import yaml
-from dataclasses import asdict, dataclass, field
+from dataclasses import dataclass, field
 from enum import Enum
 from pathlib import Path
 from typing import Any
@@ -171,27 +171,6 @@ def from_dict(cls, data: dict[str, Any]) -> "ArenaEnvGraphSpec":
             state_specs=state_specs,
         )
 
-    def to_dict(self) -> dict[str, Any]:
-        """Return a YAML/JSON-serializable dict.
-
-        Output shape round-trips through :meth:`from_dict` / :meth:`from_yaml`:
-        enums become their ``.value`` strings and ``None`` / empty-dict fields
-        are omitted so the optional-field parsers fall back to their defaults.
-        """
-        return asdict(self, dict_factory=_yaml_dict_factory)
-
-    def to_yaml(self, path: str | Path) -> Path:
-        """Write this spec to ``path`` as YAML. Creates parent dirs as needed.
-
-        Returns the resolved :class:`Path` written. Symmetric with
-        :meth:`from_yaml`.
-        """
-        out_path = Path(path)
-        out_path.parent.mkdir(parents=True, exist_ok=True)
-        with out_path.open("w", encoding="utf-8") as f:
-            yaml.safe_dump(self.to_dict(), f, sort_keys=False)
-        return out_path
-
     @property
     def nodes_by_id(self) -> dict[str, ArenaEnvGraphNodeSpec]:
         return {node.id: node for node in self.nodes}
@@ -275,23 +254,3 @@ def _parse_task(data: Any) -> ArenaEnvGraphTaskSpec:
         success_state_spec_id=required_str(data, "success_state_spec_id"),
         task_args=optional_dict(data, "task_args"),
     )
-
-
-def _yaml_dict_factory(pairs: list[tuple[str, Any]]) -> dict[str, Any]:
-    """``dataclasses.asdict`` hook used by :meth:`ArenaEnvGraphSpec.to_dict`.
-
-    Two responsibilities:
-      * convert :class:`Enum` field values to their ``.value`` strings so
-        ``yaml.safe_dump`` can serialize them, and
-      * drop ``None`` / empty-dict fields so the emitted YAML stays clean
-        and ``optional_str`` / ``optional_dict`` parsers pick up defaults
-        instead of seeing redundant keys.
-    """
-    out: dict[str, Any] = {}
-    for key, value in pairs:
-        if isinstance(value, Enum):
-            value = value.value
-        if value is None or (isinstance(value, dict) and not value):
-            continue
-        out[key] = value
-    return out
diff --git a/isaaclab_arena/llm_env_gen/try_schema.py b/isaaclab_arena/llm_env_gen/try_schema.py
index 54004ac42..a5cb5114f 100644
--- a/isaaclab_arena/llm_env_gen/try_schema.py
+++ b/isaaclab_arena/llm_env_gen/try_schema.py
@@ -5,12 +5,7 @@
 
 """Run the LLM parser on a prompt and dump the resolved ArenaEnvGraphSpec.
 
-Must run inside the Docker container (needs AssetRegistry). Requires
-NV_API_KEY and the `openai` pip package.
-
-Output: the resolved spec is always written to
-``isaaclab_arena_environments/llm_generated/<env_name>_proposal.yaml`` (in
-addition to being printed to stdout).
+Requires NV_API_KEY environment variable.
 
 Examples:
     # Print the Pydantic LLMEnvSpec JSON schema (no LLM call):

From f04ba3ed1a93da17f96d84f614c5f9bc1717d87f Mon Sep 17 00:00:00 2001
From: Qian Lin <qianl@nvidia.com>
Date: Thu, 28 May 2026 01:01:20 +0800
Subject: [PATCH 24/41] Sanity-check LLM endpoint on LLMAgent construction
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Invoke ping() at the end of LLMAgent.__init__ so a bad key, wrong
model name, or unreachable base_url surfaces from the constructor
itself rather than deep inside the first generate_spec call. Trades a
~hundreds-of-ms one-shot completion at startup for a much clearer
stack trace at the point of misconfiguration.

The existing mocked unit tests stay green because MagicMock
autovivifies the chat.completions.create chain — the ping call against
an un-stubbed mock returns a MagicMock (truthy, no exception). Two new
TestInit tests lock in the contract so a future refactor cannot
silently drop the connection check:

  * test_init_pings_to_verify_connection — asserts __init__ issues
    exactly one chat-completion call with the canonical ping shape
    (single user message, temperature=0, max_tokens=8), guarding
    against accidental startup-cost inflation.
  * test_init_propagates_ping_failure — asserts openai exceptions
    raised during the constructor ping surface from LLMAgent()
    itself, not from a silently-broken instance later.

Updates the __init__ docstring with a Raises block covering both the
existing ValueError (missing key) and the new openai-client exceptions
forwarded from ping.

Signed-off-by: Qian Lin <qianl@nvidia.com>
---
 isaaclab_arena/llm_env_gen/llm_agent.py | 13 ++++++++++++
 isaaclab_arena/tests/test_llm_agent.py  | 28 +++++++++++++++++++++++++
 2 files changed, 41 insertions(+)

diff --git a/isaaclab_arena/llm_env_gen/llm_agent.py b/isaaclab_arena/llm_env_gen/llm_agent.py
index 81765a196..3e4992be5 100644
--- a/isaaclab_arena/llm_env_gen/llm_agent.py
+++ b/isaaclab_arena/llm_env_gen/llm_agent.py
@@ -88,6 +88,13 @@ def __init__(
                 override to point at a self-hosted vLLM / Ollama / etc.
                 deployment that exposes the same OpenAI chat-completions
                 wire format.
+
+        Raises:
+            ValueError: when no API key is available (neither argument
+                nor ``NV_API_KEY`` env var).
+            Any exception raised by the underlying ``openai`` client
+                during the startup ``ping()``. See :meth:`ping` for the
+                common failure modes.
         """
         from openai import OpenAI
 
@@ -99,6 +106,12 @@ def __init__(
             raise ValueError("API key required: set NV_API_KEY or pass api_key.")
         self.model = model
         self.client = OpenAI(api_key=self.api_key, base_url=base_url)
+        # Fail-fast connection check. Costs ~hundreds of ms on hot paths and
+        # converts a deferred ``AuthenticationError`` (or ``NotFoundError`` /
+        # ``APIConnectionError``) into a constructor-time failure with a clear
+        # call stack, which is much easier to diagnose than the same error
+        # surfacing mid-pipeline inside ``generate_spec``.
+        self.ping()
 
     def generate_spec(
         self,
diff --git a/isaaclab_arena/tests/test_llm_agent.py b/isaaclab_arena/tests/test_llm_agent.py
index ef5f0f755..028ee104b 100644
--- a/isaaclab_arena/tests/test_llm_agent.py
+++ b/isaaclab_arena/tests/test_llm_agent.py
@@ -129,6 +129,34 @@ def test_custom_model_and_base_url(self, stub_openai):
         assert a.model == "custom-model"
         stub_openai.assert_called_once_with(api_key="k", base_url="http://localhost:8000")
 
+    def test_init_pings_to_verify_connection(self, stub_openai):
+        # ``__init__`` is contracted to run a ping round-trip before returning
+        # so a bad key / wrong model / dead endpoint fails at construction time
+        # rather than deep inside the first generate_spec. Locking in the
+        # request shape (single user message, max_tokens=8, temperature=0)
+        # guarantees we don't accidentally inflate the startup cost.
+        a = LLMAgent(api_key="k")
+        a.client.chat.completions.create.assert_called_once()
+        kwargs = a.client.chat.completions.create.call_args.kwargs
+        assert kwargs["temperature"] == 0
+        assert kwargs["max_tokens"] == 8
+        assert len(kwargs["messages"]) == 1
+
+    def test_init_propagates_ping_failure(self):
+        # If the openai client raises during the constructor ping (bad key,
+        # unreachable endpoint, ...), the exception must surface from
+        # ``LLMAgent()`` itself — not be swallowed into a silently-broken
+        # instance that fails later when generate_spec is called.
+        class FakeAuthError(Exception):
+            pass
+
+        with patch("openai.OpenAI") as mock_cls:
+            client = MagicMock()
+            client.chat.completions.create.side_effect = FakeAuthError("bad key")
+            mock_cls.return_value = client
+            with pytest.raises(FakeAuthError, match="bad key"):
+                LLMAgent(api_key="k")
+
 
 # ---------------------------------------------------------------------------
 # _extract_json

From 0409075efec648e5f953958dddee7bf028f1b1ba Mon Sep 17 00:00:00 2001
From: Qian Lin <qianl@nvidia.com>
Date: Thu, 28 May 2026 01:20:30 +0800
Subject: [PATCH 25/41] Fix llm_remote_e2e test

---
 .github/workflows/ci.yml               | 3 ++-
 isaaclab_arena/tests/test_llm_agent.py | 2 --
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index e8bd220b9..096d645a0 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -156,7 +156,8 @@ jobs:
 
       - name: Run in-process PhysX tests without cameras
         run: |
-          /isaac-sim/python.sh -m pytest -sv --durations=0 -m "not with_cameras and not with_subprocess and not with_newton" \
+          /isaac-sim/python.sh -m pytest -sv --durations=0 \
+            -m "not with_cameras and not with_subprocess and not with_newton and not llm_remote_e2e" \
             isaaclab_arena/tests/
 
       - name: Run GR00T policy/data tests (lightweight gr00t deps only)
diff --git a/isaaclab_arena/tests/test_llm_agent.py b/isaaclab_arena/tests/test_llm_agent.py
index 028ee104b..4ad24a75a 100644
--- a/isaaclab_arena/tests/test_llm_agent.py
+++ b/isaaclab_arena/tests/test_llm_agent.py
@@ -360,8 +360,6 @@ def test_generate_spec_against_live_endpoint():
       * ``llm_remote_e2e`` marker — registered in ``pytest.ini`` next to
         ``gr00t_remote_e2e``. Run explicitly with
         ``pytest -m llm_remote_e2e isaaclab_arena/tests/test_llm_agent.py``.
-      * ``skipif`` on ``NV_API_KEY`` — belt-and-braces so a forgotten
-        marker filter still skips when no key is configured locally.
 
     The asset catalog is supplied inline rather than via ``AssetRegistry``
     so the test doesn't depend on Isaac Lab asset registration state — we

From e40955347153efde62a5684596aaa9cb0e8ad2da Mon Sep 17 00:00:00 2001
From: Qian Lin <qianl@nvidia.com>
Date: Thu, 28 May 2026 11:41:43 +0800
Subject: [PATCH 26/41] Use "atomic task" consistently in env-gen prompt and
 schema
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

xyao-nv flagged that the env-gen prompt and LLMEnvSpec docstrings mixed
"atomic action" and "atomic task" interchangeably when referring to the
entries of the ``tasks`` field. Unify on "atomic task(s)" everywhere
since the field is ``tasks`` and the class is ``Task`` — saying
"action" introduces a third noun for the same concept. Also aligns with
the existing ``isaaclab_arena/tasks/sequential_task_base.py`` wording.

Addresses thread #3313780564 on PR #718.

Signed-off-by: Qian Lin <qianl@nvidia.com>
---
 isaaclab_arena/llm_env_gen/llm_agent.py  | 2 +-
 isaaclab_arena/llm_env_gen/llm_schema.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/isaaclab_arena/llm_env_gen/llm_agent.py b/isaaclab_arena/llm_env_gen/llm_agent.py
index 3e4992be5..2de5c9dd4 100644
--- a/isaaclab_arena/llm_env_gen/llm_agent.py
+++ b/isaaclab_arena/llm_env_gen/llm_agent.py
@@ -245,7 +245,7 @@ def _system_prompt(self) -> str:
             "- initial_scene_graph: FULL snapshot of all relations in the starting state. Every persistent\n"
             "  relation (e.g. bowl on table, distractors present) must appear here. Relations that change\n"
             "  via tasks are still listed here in their starting form.\n"
-            "- tasks: a list of atomic actions to perform in order. Each task has:\n"
+            "- tasks: a list of atomic tasks to perform in order. Each task has:\n"
             f"    * kind ∈ {{{task_kinds}}}\n"
             "    * subject: the primary object being acted on (e.g. 'avocado', 'microwave')\n"
             "    * target: the secondary object/location (e.g. 'bowl' for pick_and_place, null for open/close)\n"
diff --git a/isaaclab_arena/llm_env_gen/llm_schema.py b/isaaclab_arena/llm_env_gen/llm_schema.py
index 7d3765e71..8a11abe3a 100644
--- a/isaaclab_arena/llm_env_gen/llm_schema.py
+++ b/isaaclab_arena/llm_env_gen/llm_schema.py
@@ -33,7 +33,7 @@
 
 ItemRole = Literal["foreground", "distractor", "anchor"]
 
-# Task kinds the LLM can propose as atomic actions in a plan.
+# Task kinds the LLM can propose as an atomic task.
 TaskKind = Literal["pick_and_place", "open_door", "close_door"]
 
 
@@ -106,7 +106,7 @@ class LLMEnvSpec(BaseModel):
         name kept as ``initial_scene_graph`` even though the class is now
         ``LLMEnvSpec`` — renaming the field would change the JSON schema
         the LLM is prompted against and is out of scope here.)
-      * ``tasks`` — a list of atomic actions to execute in sequence. Each
+      * ``tasks`` — a list of atomic tasks to execute in sequence. Each
         task specifies what to do (kind), what object(s) it acts on
         (subject/target), and a natural-language description. The task
         sequence implicitly defines the intermediate env graphs by applying

From 0cde09496b938af28af67faf9710fca184d1f49d Mon Sep 17 00:00:00 2001
From: Qian Lin <qianl@nvidia.com>
Date: Thu, 28 May 2026 11:51:45 +0800
Subject: [PATCH 27/41] Drop implementation details from Item.scale comment
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two PR threads converged on the same 3-line comment, so addressing
them together:

* "Uniform spawn scale" misled by implying a 1.0 baseline — some
  object_library assets already carry intrinsic scaling, so the
  override is not strictly uniform. Drop the qualifier.
* "auto-fit ... against the tabletop bbox" leaks implementation
  details of the current placement proposer; @zhx06 is expanding
  the proposer beyond bbox-based fitting, so the comment would go
  stale. Describe the effect (auto-fit) without the mechanism.

Addresses threads #3313785736 and #3313791294 on PR #718.

Signed-off-by: Qian Lin <qianl@nvidia.com>
---
 isaaclab_arena/llm_env_gen/llm_schema.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/isaaclab_arena/llm_env_gen/llm_schema.py b/isaaclab_arena/llm_env_gen/llm_schema.py
index 8a11abe3a..9693f399f 100644
--- a/isaaclab_arena/llm_env_gen/llm_schema.py
+++ b/isaaclab_arena/llm_env_gen/llm_schema.py
@@ -50,9 +50,9 @@ class Item(BaseModel):
     role: ItemRole
     category_tags: list[str] = Field(default_factory=list)
     instance_name: str | None = None
-    # Uniform spawn scale. ``None`` (the default) lets the placement
-    # proposer auto-fit the asset against the tabletop bbox; an explicit
-    # positive float overrides the auto-fit.
+    # Spawn scale. ``None`` (the default) lets the placement proposer
+    # auto-fit the asset; an explicit positive float overrides the
+    # auto-fit.
     scale: float | None = None
 
 

From 58dbc137f7ed74b90ada6467ab91fb885e350e24 Mon Sep 17 00:00:00 2001
From: Qian Lin <qianl@nvidia.com>
Date: Thu, 28 May 2026 11:58:02 +0800
Subject: [PATCH 28/41] Drop Resolver cross-references from LLMEnvSpec schema
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The Resolver was moved out of this PR into a follow-up branch, so the
three docstrings / comments in llm_schema.py that name-checked it
either point at a class that is not in this PR (the Resolver class
itself, Resolver._build_spatial_constraint) or duplicate contract text
already covered by the surrounding class docstring.

Removed:

* Module docstring — drop "by the Resolver" from the second-step
  description; the architectural intent (deterministic second pass)
  reads fine without the class reference.
* Relation class docstring — drop the trailing sentence cross-ref to
  Resolver._build_spatial_constraint; the binary-vs-unary contract is
  fully stated in the preceding sentences.
* Relation.target inline comment — drop entirely; it restated the
  binary-vs-unary contract that the class docstring already documents
  authoritatively.

Addresses thread #3313795576 on PR #718.

Signed-off-by: Qian Lin <qianl@nvidia.com>
---
 isaaclab_arena/llm_env_gen/llm_schema.py | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/isaaclab_arena/llm_env_gen/llm_schema.py b/isaaclab_arena/llm_env_gen/llm_schema.py
index 9693f399f..2452d8232 100644
--- a/isaaclab_arena/llm_env_gen/llm_schema.py
+++ b/isaaclab_arena/llm_env_gen/llm_schema.py
@@ -7,8 +7,8 @@
 
 The LLM sees a list of the *available* asset tags / embodiment names pulled
 from the registries at call time, and must return a LLMEnvSpec that only uses
-those vocabularies. Concrete asset names are resolved by the Resolver in a
-second, deterministic step — the LLM never invents USD paths.
+those vocabularies. Concrete asset names are resolved in a second, deterministic
+step — the LLM never invents USD paths.
 """
 
 from __future__ import annotations
@@ -62,17 +62,11 @@ class Relation(BaseModel):
     Binary kinds (``on``, ``in``, ``next_to``, ...) must set ``target`` to the
     other item — semantics is "subject is in relation to target". Unary kinds
     (``is_anchor``, ``at_position``, ...) describe an intrinsic property of
-    ``subject`` alone and must leave ``target`` as ``None``. The downstream
-    resolver uses ``target is None`` as the single signal to distinguish the
-    two — see ``Resolver._build_spatial_constraint``.
+    ``subject`` alone and must leave ``target`` as ``None``.
     """
 
     kind: RelationKind
     subject: str
-    # ``None`` for unary relations (the subject is the anchor); a string for
-    # binary relations (subject is anchored on this target). The resolver
-    # branches on this field rather than maintaining a kind-specific allowlist,
-    # so populating it correctly is part of the LLM's contract.
     target: str | None = None
     params: dict = Field(default_factory=dict)
 

From 1989c8f127b21d91048c3dee0dd76f43fd0bfcd1 Mon Sep 17 00:00:00 2001
From: Qian Lin <qianl@nvidia.com>
Date: Thu, 28 May 2026 14:26:18 +0800
Subject: [PATCH 29/41] Move per-field LLM prompt rules into pydantic Field
 descriptions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

xyao-nv asked why the system prompt restated information that the
schema could express structurally, and suggested moving rules into
``Field(description=...)`` so they live in one place instead of being
duplicated across prompt prose and pydantic types.

Rebalance:

* Per-field semantics (what each field means, defaults, optionality)
  now live as ``Field(description=...)`` strings on every member of
  Item / Relation / Task / LLMEnvSpec. pydantic embeds these in
  ``model_json_schema()`` automatically, so the existing SCHEMA block
  at the end of the prompt now carries the per-field guidance the
  RULES section used to spell out.
* Enum members no longer need explicit ``∈ {kinds}`` lines — they are
  already enforced by the ``Literal[...]`` types and show up as
  ``enum:`` arrays inside the schema JSON.
* The prompt's RULES section shrinks to GUIDANCE: only cross-cutting
  invariants (articulated objects need spatial ``on(...)`` anchors;
  distractors need ``on(distractor, background)``), few-shot task
  examples, and the output-format directive remain — these either
  span multiple fields or change global LLM behaviour and don't fit a
  single-field description.
* New anti-hallucination guidance (also from xyao-nv's review): the
  GUIDANCE block now tells the LLM to output null for unspecified
  optional fields instead of guessing, and the same nudge is echoed
  in the optional-field descriptions on ``instance_name``, ``scale``,
  and ``Task.target`` so the LLM sees it both globally and per-field.

llm_agent.py no longer imports RelationKind / TaskKind / get_args; the
schema dump is the only mechanism by which enum members reach the
prompt now.

Net line count grows (+45) because Field descriptions are verbose, but
the single source of truth for any given rule is now the schema —
xyao-nv's explicit ask. Token cost in the actual API request stays
roughly flat: what left the prompt mostly reappeared inside the JSON
schema dump.

Addresses thread #3313636935 on PR #718.

Signed-off-by: Qian Lin <qianl@nvidia.com>
---
 isaaclab_arena/llm_env_gen/llm_agent.py  |  57 +++-----
 isaaclab_arena/llm_env_gen/llm_schema.py | 168 ++++++++++++++++-------
 2 files changed, 135 insertions(+), 90 deletions(-)

diff --git a/isaaclab_arena/llm_env_gen/llm_agent.py b/isaaclab_arena/llm_env_gen/llm_agent.py
index 2de5c9dd4..6b0ad15f4 100644
--- a/isaaclab_arena/llm_env_gen/llm_agent.py
+++ b/isaaclab_arena/llm_env_gen/llm_agent.py
@@ -15,9 +15,8 @@
 import contextlib
 import json
 import os
-from typing import get_args
 
-from .llm_schema import LLMEnvSpec, RelationKind, TaskKind
+from .llm_schema import LLMEnvSpec
 
 DEFAULT_BASE_URL = "https://inference-api.nvidia.com"
 DEFAULT_MODEL = "nvidia/deepseek-ai/deepseek-v4-flash"
@@ -215,53 +214,31 @@ def ping(self) -> str:
 
     def _system_prompt(self) -> str:
         schema = json.dumps(LLMEnvSpec.model_json_schema(), indent=2)
-        # Derive the enumerations the LLM is allowed to emit directly from
-        # the pydantic literal types so the prompt cannot drift out of sync
-        # when RelationKind / TaskKind change. Bare identifiers for
-        # relation kinds (e.g. ``on``), JSON-style quoted strings for task
-        # kinds (e.g. ``"pick_and_place"``) — matching the surrounding
-        # prose style.
-        relation_kinds = ", ".join(get_args(RelationKind))
-        task_kinds = ", ".join(f'"{k}"' for k in get_args(TaskKind))
+        # Per-field guidance (what each field means, enum members, default
+        # behaviours) lives on the ``Field(description=...)`` entries in
+        # llm_schema.py and is surfaced to the LLM via the SCHEMA block
+        # below. Only cross-cutting rules (those that span multiple fields
+        # or change LLM output behaviour globally) and few-shot examples
+        # belong here.
         return (
             "You are an env-generation parser for robot manipulation tasks.\n"
             "Convert a natural-language prompt into an LLMEnvSpec JSON object that matches the schema below.\n\n"
-            "RULES:\n"
-            "- item.query: the short human name as it appears in the prompt (e.g. 'avocado', 'bowl').\n"
-            "  The resolver fuzzy-matches this against the OBJECTS catalog; you do NOT need to emit the\n"
-            "  exact registered name.\n"
-            "- item.role: 'foreground' for objects the task acts on; 'distractor' for extras mentioned as\n"
-            "  clutter; 'anchor' for reference surfaces (rare — the background usually covers this).\n"
-            "- item.category_tags: tags that semantically narrow the query, preferring assets with those\n"
-            "  tags. This is a PREFERENCE, not a hard filter — the resolver will fall back to the full\n"
-            "  catalog if the tag pool is empty or yields no close match. Err toward emitting useful tags;\n"
-            "  the trace will report what was relaxed.\n"
-            f"- relation.kind ∈ {{{relation_kinds}}}. Spatial relations only —\n"
-            "  articulated-state changes are expressed via tasks below, not as relations.\n"
-            "  subject/target reference items by their query string or the background name.\n"
-            "  * Articulated objects (microwave, fridge, cabinet) still need a spatial\n"
-            "    'on(<object>, background)' relation to anchor them.\n"
-            "  * Distractor items around the appliance need 'on(distractor, background)' relations.\n"
-            "- initial_scene_graph: FULL snapshot of all relations in the starting state. Every persistent\n"
-            "  relation (e.g. bowl on table, distractors present) must appear here. Relations that change\n"
-            "  via tasks are still listed here in their starting form.\n"
-            "- tasks: a list of atomic tasks to perform in order. Each task has:\n"
-            f"    * kind ∈ {{{task_kinds}}}\n"
-            "    * subject: the primary object being acted on (e.g. 'avocado', 'microwave')\n"
-            "    * target: the secondary object/location (e.g. 'bowl' for pick_and_place, null for open/close)\n"
-            "    * description: natural-language summary of the task\n"
-            "  Examples:\n"
+            "GUIDANCE:\n"
+            "- Follow the per-field ``description`` strings in SCHEMA for what each field expects.\n"
+            "- If the prompt does not specify a value for an optional field, output null.\n"
+            "  Do NOT hallucinate values — the resolver tolerates nulls; it cannot fix invented data.\n"
+            "- Articulated objects (microwave, fridge, cabinet) still need a spatial\n"
+            "  'on(<object>, background)' relation in initial_scene_graph to anchor them; their\n"
+            "  open/close behaviour is expressed via tasks, not via relations.\n"
+            "- Distractor items around the appliance need 'on(distractor, background)' relations\n"
+            "  in initial_scene_graph as well.\n"
+            "- Task examples (showing kind + subject + target + description shape):\n"
             '    * Pick-and-place: {"kind": "pick_and_place", "subject": "avocado", "target": "bowl",\n'
             '                       "description": "pick up the avocado and place it in the bowl"}\n'
             '    * Open door: {"kind": "open_door", "subject": "microwave", "target": null,\n'
             '                  "description": "open the microwave door"}\n'
             '    * Close door: {"kind": "close_door", "subject": "microwave", "target": null,\n'
             '                   "description": "close the microwave door"}\n'
-            "  The tasks implicitly define the final scene: apply each task's transformation in order\n"
-            "  to determine what relations hold at completion.\n"
-            "- embodiment: use a bare robot family name ('franka', 'droid', 'g1', 'gr1') when the prompt\n"
-            "  does not specify a control mode — the resolver defaults each to its IK variant. Use a\n"
-            "  full registered name (e.g. 'franka_joint_pos') only when the prompt requests joint control.\n"
             "- Emit ONLY the JSON object. No prose, no markdown fences.\n\n"
             f"SCHEMA:\n{schema}"
         )
diff --git a/isaaclab_arena/llm_env_gen/llm_schema.py b/isaaclab_arena/llm_env_gen/llm_schema.py
index 2452d8232..648e8bb99 100644
--- a/isaaclab_arena/llm_env_gen/llm_schema.py
+++ b/isaaclab_arena/llm_env_gen/llm_schema.py
@@ -38,22 +38,45 @@
 
 
 class Item(BaseModel):
-    """One object the LLM wants in the scene.
-
-    `query` is the short human name from the prompt ("avocado", "bowl"). The
-    resolver maps it to a registered asset. `category_tags` narrow the search
-    and act as a fallback when the exact name does not resolve — e.g. a
-    distractor "vegetable" resolves to any asset tagged "vegetable".
-    """
-
-    query: str
-    role: ItemRole
-    category_tags: list[str] = Field(default_factory=list)
-    instance_name: str | None = None
-    # Spawn scale. ``None`` (the default) lets the placement proposer
-    # auto-fit the asset; an explicit positive float overrides the
-    # auto-fit.
-    scale: float | None = None
+    """One object the LLM wants in the scene."""
+
+    query: str = Field(
+        description=(
+            "Short human name for the object as it appears in the prompt "
+            "(e.g. 'avocado', 'bowl'). The downstream resolver fuzzy-matches "
+            "this against the asset catalog — do NOT emit the exact "
+            "registered name."
+        ),
+    )
+    role: ItemRole = Field(
+        description=(
+            "Role the item plays in the env: 'foreground' for objects the "
+            "task acts on; 'distractor' for extras mentioned as clutter; "
+            "'anchor' for reference surfaces (rare — the background usually "
+            "covers this)."
+        ),
+    )
+    category_tags: list[str] = Field(
+        default_factory=list,
+        description=(
+            "Tags that semantically narrow the query, preferring assets with "
+            "those tags. PREFERENCE only, not a hard filter — the resolver "
+            "falls back to the full catalog if the tag pool is empty or "
+            "yields no close match. Err toward emitting useful tags."
+        ),
+    )
+    instance_name: str | None = Field(
+        default=None,
+        description="Optional explicit instance label for the item; leave null if the prompt does not name one.",
+    )
+    scale: float | None = Field(
+        default=None,
+        description=(
+            "Spawn scale. Leave null (the default) so the placement proposer "
+            "auto-fits the asset; only set a positive float when the prompt "
+            "explicitly demands a size override."
+        ),
+    )
 
 
 class Relation(BaseModel):
@@ -65,10 +88,26 @@ class Relation(BaseModel):
     ``subject`` alone and must leave ``target`` as ``None``.
     """
 
-    kind: RelationKind
-    subject: str
-    target: str | None = None
-    params: dict = Field(default_factory=dict)
+    kind: RelationKind = Field(
+        description=(
+            "Spatial relation only — articulated-state changes (open/close) are expressed via tasks, not via relations."
+        ),
+    )
+    subject: str = Field(
+        description="Item the relation applies to, named by its Item.query string or the background name.",
+    )
+    target: str | None = Field(
+        default=None,
+        description=(
+            "The other item the relation is anchored on for binary kinds "
+            "(on / in / next_to / at_position / at_pose); leave null for "
+            "unary kinds (is_anchor)."
+        ),
+    )
+    params: dict = Field(
+        default_factory=dict,
+        description="Optional kind-specific parameters; leave empty by default.",
+    )
 
     def identity(self) -> tuple[str, str, str | None]:
         """Hashable identity for diffing scene graphs — ignores params."""
@@ -76,43 +115,72 @@ def identity(self) -> tuple[str, str, str | None]:
 
 
 class Task(BaseModel):
-    """One atomic task in the plan that transforms the env state.
-
-    A task specifies what action to perform (kind), what object it acts on
-    (subject), and optionally where it goes (target). The description provides
-    natural-language context for the task.
-    """
-
-    kind: TaskKind
-    subject: str  # object instance name (e.g. 'avocado', 'microwave')
-    target: str | None = None  # target object/location (e.g. 'bowl', 'background')
-    description: str  # natural-language task description
+    """One atomic task in the plan that transforms the env state."""
+
+    kind: TaskKind = Field(description="The action to perform.")
+    subject: str = Field(
+        description=(
+            "The primary object the task acts on, named by its Item.query string (e.g. 'avocado', 'microwave')."
+        ),
+    )
+    target: str | None = Field(
+        default=None,
+        description=(
+            "The secondary object or location, named by its Item.query "
+            "string or the background name. Leave null for unary tasks "
+            "(open_door / close_door)."
+        ),
+    )
+    description: str = Field(
+        description="Natural-language summary of the task (e.g. 'pick up the avocado and place it in the bowl').",
+    )
 
 
 class LLMEnvSpec(BaseModel):
     """LLM output — a structured plan for the env and a list of tasks.
 
-    The language prompt is decomposed into:
-
-      * ``initial_scene_graph`` — every relation that holds at env reset.
-        This configures where objects spawn. This is a FULL snapshot
-        including all relations that persist throughout all tasks. (Field
-        name kept as ``initial_scene_graph`` even though the class is now
-        ``LLMEnvSpec`` — renaming the field would change the JSON schema
-        the LLM is prompted against and is out of scope here.)
-      * ``tasks`` — a list of atomic tasks to execute in sequence. Each
-        task specifies what to do (kind), what object(s) it acts on
-        (subject/target), and a natural-language description. The task
-        sequence implicitly defines the intermediate env graphs by applying
-        each task's transformations in order.
+    Field-level guidance lives on the individual ``Field(description=...)``
+    entries below and is surfaced to the LLM via ``model_json_schema()``;
+    only cross-cutting rules and few-shot examples are kept in the
+    prompt text (see ``LLMAgent._system_prompt``).
     """
 
-    task_description: str
-    background: str
-    embodiment: str = "franka_ik"
-    items: list[Item]
-    initial_scene_graph: list[Relation]
-    tasks: list[Task]
+    task_description: str = Field(
+        description="One-sentence natural-language summary of what the env exercises overall."
+    )
+    background: str = Field(
+        description="Background asset name from the BACKGROUNDS catalog (e.g. 'maple_table_kitchen').",
+    )
+    embodiment: str = Field(
+        default="franka_ik",
+        description=(
+            "Robot embodiment to control. Use a bare family name ('franka', "
+            "'droid', 'g1', 'gr1') when the prompt does not specify a "
+            "control mode — the resolver defaults each to its IK variant. "
+            "Use a full registered name (e.g. 'franka_joint_pos') only when "
+            "the prompt explicitly requests joint control."
+        ),
+    )
+    items: list[Item] = Field(description="Objects to place in the env.")
+    initial_scene_graph: list[Relation] = Field(
+        description=(
+            "FULL snapshot of all relations in the starting state. Every "
+            "persistent relation (e.g. bowl on table, distractors present) "
+            "must appear here. Relations that change via tasks are still "
+            "listed here in their starting form."
+            # NOTE: field name kept as ``initial_scene_graph`` even though
+            # the class is now ``LLMEnvSpec`` — renaming the field would
+            # change the JSON schema the LLM is prompted against and is
+            # out of scope here.
+        ),
+    )
+    tasks: list[Task] = Field(
+        description=(
+            "Tasks to execute in sequence. The task sequence implicitly "
+            "defines the intermediate env graphs by applying each task's "
+            "transformations in order."
+        ),
+    )
 
     @model_validator(mode="after")
     def _tasks_must_be_non_empty(self) -> LLMEnvSpec:

From 3de65f80659aea9896ef6470a913890fafa0ef3d Mon Sep 17 00:00:00 2001
From: Qian Lin <qianl@nvidia.com>
Date: Thu, 28 May 2026 14:36:47 +0800
Subject: [PATCH 30/41] Add required reasoning field to LLMEnvSpec for forced
 CoT

Add a required ``reasoning: str`` field as the FIRST entry of
LLMEnvSpec. Instruction-tuned models respect schema field order, so
listing it first forces the LLM to write a step-by-step analysis
before committing to any structured field (the "think then commit"
pattern, which measurably improves structured-output quality).

As a bonus, the reasoning trace makes downstream failures
debuggable: when the resolver drops an item or picks the wrong
asset, the trace shows which step the model got wrong instead of
leaving the malformed spec as the only signal.

The Field description prescribes a 4-step analysis (task /
foreground / background / distractors) and asks the model not to
duplicate the analysis into ``task_description``. Verified
end-to-end against the live endpoint: the model produces a
self-contained 200-500 char reasoning block that walks all four
steps, and ``task_description`` stays a one-sentence summary.

Surface the field on its own line in ``try_schema.py`` so it's
easy to spot when iterating on prompts.

Signed-off-by: Qian Lin <qianl@nvidia.com>
---
 isaaclab_arena/llm_env_gen/llm_schema.py | 20 ++++++++++++++++++++
 isaaclab_arena/llm_env_gen/try_schema.py |  6 ++++++
 isaaclab_arena/tests/test_llm_agent.py   | 14 +++++++++++++-
 3 files changed, 39 insertions(+), 1 deletion(-)

diff --git a/isaaclab_arena/llm_env_gen/llm_schema.py b/isaaclab_arena/llm_env_gen/llm_schema.py
index 648e8bb99..1b6f67ff1 100644
--- a/isaaclab_arena/llm_env_gen/llm_schema.py
+++ b/isaaclab_arena/llm_env_gen/llm_schema.py
@@ -145,6 +145,26 @@ class LLMEnvSpec(BaseModel):
     prompt text (see ``LLMAgent._system_prompt``).
     """
 
+    # Forced chain-of-thought field, listed FIRST so the LLM emits its
+    # analysis before committing to any structured field. Instruction-tuned
+    # models respect schema field order, and writing reasoning before
+    # answers measurably improves structured-output quality (the
+    # "think step by step then commit" pattern). Bonus debuggability:
+    # when a downstream resolver step fails, the reasoning trace shows
+    # which step the model got wrong (e.g. it picked "tomato" because
+    # it misidentified the foreground object as a vegetable) — without
+    # this, the only signal is the malformed spec itself.
+    reasoning: str = Field(
+        description=(
+            "Step-by-step analysis of the user prompt, written BEFORE the "
+            "structured fields below. Identify (1) the task / intent, (2) "
+            "the foreground objects the task acts on, (3) the background "
+            "surface or scene, (4) any distractors. For each object, "
+            "briefly justify the catalog query and tags you will pick. "
+            "Resolve any ambiguity here before filling the structured "
+            "fields — do not restate this analysis in ``task_description``."
+        ),
+    )
     task_description: str = Field(
         description="One-sentence natural-language summary of what the env exercises overall."
     )
diff --git a/isaaclab_arena/llm_env_gen/try_schema.py b/isaaclab_arena/llm_env_gen/try_schema.py
index a5cb5114f..d659f211e 100644
--- a/isaaclab_arena/llm_env_gen/try_schema.py
+++ b/isaaclab_arena/llm_env_gen/try_schema.py
@@ -75,6 +75,12 @@ def main() -> None:
     print("=== raw LLM response ===")
     print(raw)
 
+    # Surface the forced chain-of-thought field on its own so it's easy to
+    # spot when debugging a bad spec — without this, ``reasoning`` is
+    # buried inside the multi-hundred-line model_dump_json below.
+    print("\n=== LLM reasoning ===")
+    print(spec.reasoning)
+
     if args.background and args.background != spec.background:
         # Swap the background name wherever it appears so downstream code
         # (resolver, proposer) sees a consistent scene. Rewrite both
diff --git a/isaaclab_arena/tests/test_llm_agent.py b/isaaclab_arena/tests/test_llm_agent.py
index 4ad24a75a..669770822 100644
--- a/isaaclab_arena/tests/test_llm_agent.py
+++ b/isaaclab_arena/tests/test_llm_agent.py
@@ -78,6 +78,10 @@ def _chat_response(content: str | None):
 # the ``tasks_must_be_non_empty`` validator passes. Reused across the
 # generate_spec happy-path tests.
 _MINIMAL_SPEC: dict = {
+    "reasoning": (
+        "User wants a pick-and-place: foreground object is 'avocado', "
+        "target container is 'bowl', background is the kitchen table."
+    ),
     "task_description": "pick up the avocado and place it in the bowl",
     "background": "kitchen",
     "embodiment": "franka_ik",
@@ -337,7 +341,15 @@ def test_embeds_llm_env_spec_schema(self, agent):
         # the test isn't brittle to pydantic's schema-generation tweaks across
         # versions.
         prompt = agent._system_prompt()
-        for field in ("task_description", "background", "embodiment", "items", "initial_scene_graph", "tasks"):
+        for field in (
+            "reasoning",
+            "task_description",
+            "background",
+            "embodiment",
+            "items",
+            "initial_scene_graph",
+            "tasks",
+        ):
             assert field in prompt
 
 

From d67fd12e0a6829686a93db57bfd2c70fb37facb8 Mon Sep 17 00:00:00 2001
From: Qian Lin <qianl@nvidia.com>
Date: Thu, 28 May 2026 14:45:21 +0800
Subject: [PATCH 31/41] Move llm_env_gen package under
 environments/agentic_env_gen

Pure relocation requested in PR review: the package conceptually
sits alongside the other env-construction code in
isaaclab_arena/environments/, and the directory name shifts from
llm_env_gen to agentic_env_gen to align with the broader
"agent/agentic" terminology direction (the LLM is one possible
backend; a VLM-driven agent could follow later).

Files moved:
* isaaclab_arena/llm_env_gen/{__init__,llm_agent,llm_schema,try_schema}.py
  -> isaaclab_arena/environments/agentic_env_gen/...

Updated absolute import paths in try_schema.py (CLI module path in
docstring + 2 imports) and test_llm_agent.py (docstring + 2
imports), plus comment-only path references in setup.py and
.github/workflows/ci.yml.

Strictly a directory move; class names (LLMAgent, LLMEnvSpec,
LLMResponseParseError), file names within the package, the
llm_remote_e2e pytest marker, and the test file name are all
unchanged. Those renames follow in subsequent commits to keep each
diff reviewable.

Signed-off-by: Qian Lin <qianl@nvidia.com>
---
 .github/workflows/ci.yml                               |  2 +-
 .../agentic_env_gen}/__init__.py                       |  0
 .../agentic_env_gen}/llm_agent.py                      |  0
 .../agentic_env_gen}/llm_schema.py                     |  0
 .../agentic_env_gen}/try_schema.py                     | 10 +++++-----
 isaaclab_arena/tests/test_llm_agent.py                 |  6 +++---
 setup.py                                               |  4 ++--
 7 files changed, 11 insertions(+), 11 deletions(-)
 rename isaaclab_arena/{llm_env_gen => environments/agentic_env_gen}/__init__.py (100%)
 rename isaaclab_arena/{llm_env_gen => environments/agentic_env_gen}/llm_agent.py (100%)
 rename isaaclab_arena/{llm_env_gen => environments/agentic_env_gen}/llm_schema.py (100%)
 rename isaaclab_arena/{llm_env_gen => environments/agentic_env_gen}/try_schema.py (89%)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 096d645a0..216c48ed0 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -273,7 +273,7 @@ jobs:
     needs: [pre_commit]
     env:
       # NV_API_KEY is the variable LLMAgent reads at runtime (see
-      # isaaclab_arena/llm_env_gen/llm_agent.py). The repo-level secret is
+      # isaaclab_arena/environments/agentic_env_gen/llm_agent.py). The repo-level secret is
       # named ARENA_NV_API_KEY to mirror ARENA_NGC_API_KEY and avoid
       # collisions with other consumers of NV_API_KEY in the runner env.
       NV_API_KEY: ${{ secrets.ARENA_NV_API_KEY }}
diff --git a/isaaclab_arena/llm_env_gen/__init__.py b/isaaclab_arena/environments/agentic_env_gen/__init__.py
similarity index 100%
rename from isaaclab_arena/llm_env_gen/__init__.py
rename to isaaclab_arena/environments/agentic_env_gen/__init__.py
diff --git a/isaaclab_arena/llm_env_gen/llm_agent.py b/isaaclab_arena/environments/agentic_env_gen/llm_agent.py
similarity index 100%
rename from isaaclab_arena/llm_env_gen/llm_agent.py
rename to isaaclab_arena/environments/agentic_env_gen/llm_agent.py
diff --git a/isaaclab_arena/llm_env_gen/llm_schema.py b/isaaclab_arena/environments/agentic_env_gen/llm_schema.py
similarity index 100%
rename from isaaclab_arena/llm_env_gen/llm_schema.py
rename to isaaclab_arena/environments/agentic_env_gen/llm_schema.py
diff --git a/isaaclab_arena/llm_env_gen/try_schema.py b/isaaclab_arena/environments/agentic_env_gen/try_schema.py
similarity index 89%
rename from isaaclab_arena/llm_env_gen/try_schema.py
rename to isaaclab_arena/environments/agentic_env_gen/try_schema.py
index d659f211e..05f822d5a 100644
--- a/isaaclab_arena/llm_env_gen/try_schema.py
+++ b/isaaclab_arena/environments/agentic_env_gen/try_schema.py
@@ -9,13 +9,13 @@
 
 Examples:
     # Print the Pydantic LLMEnvSpec JSON schema (no LLM call):
-    /isaac-sim/python.sh -m isaaclab_arena.llm_env_gen.try_schema --print-schema
+    /isaac-sim/python.sh -m isaaclab_arena.environments.agentic_env_gen.try_schema --print-schema
 
     # Print the catalog sent to the LLM (no LLM call):
-    /isaac-sim/python.sh -m isaaclab_arena.llm_env_gen.try_schema --print-catalog
+    /isaac-sim/python.sh -m isaaclab_arena.environments.agentic_env_gen.try_schema --print-catalog
 
     # Call the LLM, resolve, print, and dump YAML:
-    /isaac-sim/python.sh -m isaaclab_arena.llm_env_gen.try_schema \
+    /isaac-sim/python.sh -m isaaclab_arena.environments.agentic_env_gen.try_schema \
         --prompt "franka pick up avocado from the table and place it into a bowl on the table. there are other veggies on the table as distractor"
 """
 
@@ -55,13 +55,13 @@ def main() -> None:
     )
     args = parser.parse_args()
 
-    from isaaclab_arena.llm_env_gen.llm_schema import LLMEnvSpec
+    from isaaclab_arena.environments.agentic_env_gen.llm_schema import LLMEnvSpec
 
     if args.print_schema:
         print(json.dumps(LLMEnvSpec.model_json_schema(), indent=2))
         return
 
-    from isaaclab_arena.llm_env_gen.llm_agent import LLMAgent, build_catalog_text
+    from isaaclab_arena.environments.agentic_env_gen.llm_agent import LLMAgent, build_catalog_text
 
     catalog = build_catalog_text()
     if args.print_catalog:
diff --git a/isaaclab_arena/tests/test_llm_agent.py b/isaaclab_arena/tests/test_llm_agent.py
index 669770822..2731e80f7 100644
--- a/isaaclab_arena/tests/test_llm_agent.py
+++ b/isaaclab_arena/tests/test_llm_agent.py
@@ -3,7 +3,7 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-"""Unit tests for :class:`isaaclab_arena.llm_env_gen.llm_agent.LLMAgent`.
+"""Unit tests for :class:`isaaclab_arena.environments.agentic_env_gen.llm_agent.LLMAgent`.
 
 The agent's behaviour decomposes into four pure-Python concerns that we exercise
 without ever hitting the wire:
@@ -28,14 +28,14 @@
 import pytest
 from pydantic import ValidationError
 
-from isaaclab_arena.llm_env_gen.llm_agent import (
+from isaaclab_arena.environments.agentic_env_gen.llm_agent import (
     _RAW_RESPONSE_PREVIEW_CHARS,
     DEFAULT_BASE_URL,
     DEFAULT_MODEL,
     LLMAgent,
     LLMResponseParseError,
 )
-from isaaclab_arena.llm_env_gen.llm_schema import RelationKind, TaskKind
+from isaaclab_arena.environments.agentic_env_gen.llm_schema import RelationKind, TaskKind
 
 # ---------------------------------------------------------------------------
 # Fixtures
diff --git a/setup.py b/setup.py
index 721c9b3c8..b77a2e629 100644
--- a/setup.py
+++ b/setup.py
@@ -14,9 +14,9 @@
     "vuer[all]",
     "lightwheel-sdk",
     "pytest",
-    # Used lazily by isaaclab_arena/llm_env_gen/* for NV_API_KEY-based LLM calls.
+    # Used lazily by isaaclab_arena/environments/agentic_env_gen/* for NV_API_KEY-based LLM calls.
     "openai",
-    # Hard dependency of isaaclab_arena/llm_env_gen/llm_schema.py (BaseModel / Field /
+    # Hard dependency of isaaclab_arena/environments/agentic_env_gen/llm_schema.py (BaseModel / Field /
     # model_validator imported at module load — not lazy).
     "pydantic>=2.0",
 ]

From 1d1f8a48a0994c1c67390126f7864ab10d797100 Mon Sep 17 00:00:00 2001
From: Qian Lin <qianl@nvidia.com>
Date: Thu, 28 May 2026 14:57:41 +0800
Subject: [PATCH 32/41] Rename LLMEnvSpec to EnvIntentSpec
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adopt the name xyao-nv proposed in PR review: "Intent means it's a
blueprint, comparing with raw response from agent. LLM is too
restricted, AgenticEnvSpec sounds too broad."

The class is the structured "env intent" the agent commits to
after parsing a natural-language prompt — calling it an *intent*
spec (rather than an *LLM* spec) keeps the name accurate when the
backend is later swapped for a VLM or a hand-written rule-based
agent.

Renames:
* class LLMEnvSpec -> EnvIntentSpec
* file llm_schema.py -> env_intent_spec.py
* test_embeds_llm_env_spec_schema -> test_embeds_env_intent_spec_schema

Updates 24 references across llm_agent.py, try_schema.py,
test_llm_agent.py, setup.py, and the renamed schema file itself
(class definition, validator return annotation, docstrings, prompt
templates, import statements, console labels, comment paths).

Refreshes the historical comment that explained why the field
initial_scene_graph was kept after the previous SceneSpec ->
LLMEnvSpec rename; now reads SceneSpec -> LLMEnvSpec ->
EnvIntentSpec so the rationale survives this rename too.

LLMAgent, LLMResponseParseError, llm_agent.py, the llm_remote_e2e
pytest marker, and the test_llm_agent.py file name are deliberately
NOT touched here — they follow in the later steps F8c (agent rename)
and F8d (marker / test file / prose sweep) to keep each diff
reviewable.

Signed-off-by: Qian Lin <qianl@nvidia.com>
---
 .../{llm_schema.py => env_intent_spec.py}     | 14 ++++----
 .../environments/agentic_env_gen/llm_agent.py | 34 ++++++++++---------
 .../agentic_env_gen/try_schema.py             |  8 ++---
 isaaclab_arena/tests/test_llm_agent.py        | 16 ++++-----
 setup.py                                      |  2 +-
 5 files changed, 38 insertions(+), 36 deletions(-)
 rename isaaclab_arena/environments/agentic_env_gen/{llm_schema.py => env_intent_spec.py} (94%)

diff --git a/isaaclab_arena/environments/agentic_env_gen/llm_schema.py b/isaaclab_arena/environments/agentic_env_gen/env_intent_spec.py
similarity index 94%
rename from isaaclab_arena/environments/agentic_env_gen/llm_schema.py
rename to isaaclab_arena/environments/agentic_env_gen/env_intent_spec.py
index 1b6f67ff1..5562574c4 100644
--- a/isaaclab_arena/environments/agentic_env_gen/llm_schema.py
+++ b/isaaclab_arena/environments/agentic_env_gen/env_intent_spec.py
@@ -6,7 +6,7 @@
 """Schema the LLM must fill in when parsing a natural-language env-generation prompt.
 
 The LLM sees a list of the *available* asset tags / embodiment names pulled
-from the registries at call time, and must return a LLMEnvSpec that only uses
+from the registries at call time, and must return an EnvIntentSpec that only uses
 those vocabularies. Concrete asset names are resolved in a second, deterministic
 step — the LLM never invents USD paths.
 """
@@ -136,8 +136,8 @@ class Task(BaseModel):
     )
 
 
-class LLMEnvSpec(BaseModel):
-    """LLM output — a structured plan for the env and a list of tasks.
+class EnvIntentSpec(BaseModel):
+    """Agent output — a structured "env intent" (blueprint) for the env and a list of tasks.
 
     Field-level guidance lives on the individual ``Field(description=...)``
     entries below and is surfaced to the LLM via ``model_json_schema()``;
@@ -189,9 +189,9 @@ class LLMEnvSpec(BaseModel):
             "must appear here. Relations that change via tasks are still "
             "listed here in their starting form."
             # NOTE: field name kept as ``initial_scene_graph`` even though
-            # the class is now ``LLMEnvSpec`` — renaming the field would
-            # change the JSON schema the LLM is prompted against and is
-            # out of scope here.
+            # the class has been renamed (SceneSpec -> LLMEnvSpec ->
+            # EnvIntentSpec) — renaming the field would change the JSON
+            # schema the agent is prompted against and is out of scope here.
         ),
     )
     tasks: list[Task] = Field(
@@ -203,7 +203,7 @@ class LLMEnvSpec(BaseModel):
     )
 
     @model_validator(mode="after")
-    def _tasks_must_be_non_empty(self) -> LLMEnvSpec:
+    def _tasks_must_be_non_empty(self) -> EnvIntentSpec:
         if not self.tasks:
             raise ValueError(
                 "tasks list is empty — at least one task must be specified to define the env transformation."
diff --git a/isaaclab_arena/environments/agentic_env_gen/llm_agent.py b/isaaclab_arena/environments/agentic_env_gen/llm_agent.py
index 6b0ad15f4..daa6ac065 100644
--- a/isaaclab_arena/environments/agentic_env_gen/llm_agent.py
+++ b/isaaclab_arena/environments/agentic_env_gen/llm_agent.py
@@ -3,10 +3,10 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-"""LLM agent for parsing natural-language env-generation prompts into an LLMEnvSpec.
+"""LLM agent for parsing natural-language env-generation prompts into an EnvIntentSpec.
 
 Calls an OpenAI-compatible chat-completions endpoint (NVIDIA's hosted
-inference by default) and validates the response against the LLMEnvSpec
+inference by default) and validates the response against the EnvIntentSpec
 pydantic bundle so asset resolution stays deterministic.
 """
 
@@ -16,7 +16,7 @@
 import json
 import os
 
-from .llm_schema import LLMEnvSpec
+from .env_intent_spec import EnvIntentSpec
 
 DEFAULT_BASE_URL = "https://inference-api.nvidia.com"
 DEFAULT_MODEL = "nvidia/deepseek-ai/deepseek-v4-flash"
@@ -31,7 +31,7 @@ class LLMResponseParseError(ValueError):
     """Raised when an LLM response cannot be parsed into a JSON object.
 
     Subclasses ``ValueError`` so existing ``except ValueError`` clauses
-    (e.g. around ``LLMEnvSpec.model_validate``) still catch it, but the
+    (e.g. around ``EnvIntentSpec.model_validate``) still catch it, but the
     distinct type lets callers that want to retry the LLM call separate
     parse failures from validation failures.
     """
@@ -64,7 +64,7 @@ def build_catalog_text() -> str:
 
 
 class LLMAgent:
-    """Parses a natural-language env-generation prompt into an LLMEnvSpec."""
+    """Parses a natural-language env-generation prompt into an EnvIntentSpec."""
 
     def __init__(
         self,
@@ -118,8 +118,8 @@ def generate_spec(
         catalog_text: str | None = None,
         temperature: float = 0.2,
         max_tokens: int = 2000,
-    ) -> tuple[LLMEnvSpec, str]:
-        """Call the LLM and return the parsed LLMEnvSpec plus the raw response.
+    ) -> tuple[EnvIntentSpec, str]:
+        """Call the LLM and return the parsed EnvIntentSpec plus the raw response.
 
         Args:
             prompt: Natural-language env description from the end user.
@@ -132,16 +132,16 @@ def generate_spec(
                 repeated calls, or (b) experiment with a restricted /
                 augmented catalog without mutating the registry.
             temperature: Sampling temperature forwarded to the LLM. Kept
-                low by default (0.2) because LLMEnvSpec generation is a
+                low by default (0.2) because EnvIntentSpec generation is a
                 deterministic-ish translation task — high temperature
                 yields creative but invalid schemas.
             max_tokens: Hard cap on the response length. Set generously
-                (2000) so multi-task LLMEnvSpecs aren't truncated
+                (2000) so multi-task EnvIntentSpecs aren't truncated
                 mid-JSON; shrink if the endpoint enforces a tighter
                 quota.
 
         Returns:
-            A ``(LLMEnvSpec, raw_response)`` tuple. The raw text is useful
+            A ``(EnvIntentSpec, raw_response)`` tuple. The raw text is useful
             for debugging when ``model_validate`` rejects the parsed
             JSON.
 
@@ -149,11 +149,13 @@ def generate_spec(
             LLMResponseParseError: when the response can't be parsed as a
                 JSON object (no opening brace, unbalanced braces).
             pydantic.ValidationError: when the parsed JSON is well-formed
-                but doesn't match the LLMEnvSpec schema.
+                but doesn't match the EnvIntentSpec schema.
         """
         catalog_text = catalog_text or build_catalog_text()
         system = self._system_prompt()
-        user = f"{catalog_text}\n\nUSER PROMPT:\n{prompt}\n\nReturn ONLY a JSON object matching the LLMEnvSpec schema."
+        user = (
+            f"{catalog_text}\n\nUSER PROMPT:\n{prompt}\n\nReturn ONLY a JSON object matching the EnvIntentSpec schema."
+        )
 
         resp = self.client.chat.completions.create(
             model=self.model,
@@ -166,7 +168,7 @@ def generate_spec(
         )
         raw = resp.choices[0].message.content
         data = self._extract_json(raw)
-        spec = LLMEnvSpec.model_validate(data)
+        spec = EnvIntentSpec.model_validate(data)
         return spec, raw
 
     def ping(self) -> str:
@@ -213,16 +215,16 @@ def ping(self) -> str:
         return resp.choices[0].message.content or ""
 
     def _system_prompt(self) -> str:
-        schema = json.dumps(LLMEnvSpec.model_json_schema(), indent=2)
+        schema = json.dumps(EnvIntentSpec.model_json_schema(), indent=2)
         # Per-field guidance (what each field means, enum members, default
         # behaviours) lives on the ``Field(description=...)`` entries in
-        # llm_schema.py and is surfaced to the LLM via the SCHEMA block
+        # env_intent_spec.py and is surfaced to the LLM via the SCHEMA block
         # below. Only cross-cutting rules (those that span multiple fields
         # or change LLM output behaviour globally) and few-shot examples
         # belong here.
         return (
             "You are an env-generation parser for robot manipulation tasks.\n"
-            "Convert a natural-language prompt into an LLMEnvSpec JSON object that matches the schema below.\n\n"
+            "Convert a natural-language prompt into an EnvIntentSpec JSON object that matches the schema below.\n\n"
             "GUIDANCE:\n"
             "- Follow the per-field ``description`` strings in SCHEMA for what each field expects.\n"
             "- If the prompt does not specify a value for an optional field, output null.\n"
diff --git a/isaaclab_arena/environments/agentic_env_gen/try_schema.py b/isaaclab_arena/environments/agentic_env_gen/try_schema.py
index 05f822d5a..af44fbf37 100644
--- a/isaaclab_arena/environments/agentic_env_gen/try_schema.py
+++ b/isaaclab_arena/environments/agentic_env_gen/try_schema.py
@@ -8,7 +8,7 @@
 Requires NV_API_KEY environment variable.
 
 Examples:
-    # Print the Pydantic LLMEnvSpec JSON schema (no LLM call):
+    # Print the Pydantic EnvIntentSpec JSON schema (no LLM call):
     /isaac-sim/python.sh -m isaaclab_arena.environments.agentic_env_gen.try_schema --print-schema
 
     # Print the catalog sent to the LLM (no LLM call):
@@ -55,10 +55,10 @@ def main() -> None:
     )
     args = parser.parse_args()
 
-    from isaaclab_arena.environments.agentic_env_gen.llm_schema import LLMEnvSpec
+    from isaaclab_arena.environments.agentic_env_gen.env_intent_spec import EnvIntentSpec
 
     if args.print_schema:
-        print(json.dumps(LLMEnvSpec.model_json_schema(), indent=2))
+        print(json.dumps(EnvIntentSpec.model_json_schema(), indent=2))
         return
 
     from isaaclab_arena.environments.agentic_env_gen.llm_agent import LLMAgent, build_catalog_text
@@ -102,7 +102,7 @@ def main() -> None:
         spec.background = new_bg
         print(f"\n=== background override applied: {old_bg!r} -> {new_bg!r} ===")
 
-    print("\n=== parsed LLMEnvSpec ===")
+    print("\n=== parsed EnvIntentSpec ===")
     print(spec.model_dump_json(indent=2))
 
 
diff --git a/isaaclab_arena/tests/test_llm_agent.py b/isaaclab_arena/tests/test_llm_agent.py
index 2731e80f7..afc9b032b 100644
--- a/isaaclab_arena/tests/test_llm_agent.py
+++ b/isaaclab_arena/tests/test_llm_agent.py
@@ -28,6 +28,7 @@
 import pytest
 from pydantic import ValidationError
 
+from isaaclab_arena.environments.agentic_env_gen.env_intent_spec import RelationKind, TaskKind
 from isaaclab_arena.environments.agentic_env_gen.llm_agent import (
     _RAW_RESPONSE_PREVIEW_CHARS,
     DEFAULT_BASE_URL,
@@ -35,7 +36,6 @@
     LLMAgent,
     LLMResponseParseError,
 )
-from isaaclab_arena.environments.agentic_env_gen.llm_schema import RelationKind, TaskKind
 
 # ---------------------------------------------------------------------------
 # Fixtures
@@ -74,7 +74,7 @@ def _chat_response(content: str | None):
     return resp
 
 
-# Minimal LLMEnvSpec payload — exercises every required field plus one task so
+# Minimal EnvIntentSpec payload — exercises every required field plus one task so
 # the ``tasks_must_be_non_empty`` validator passes. Reused across the
 # generate_spec happy-path tests.
 _MINIMAL_SPEC: dict = {
@@ -243,7 +243,7 @@ def test_propagates_parse_error_for_garbage_response(self, agent):
             agent.generate_spec("p", catalog_text="catalog")
 
     def test_propagates_validation_error_for_schema_violation(self, agent):
-        # Well-formed JSON but missing every required LLMEnvSpec field — pydantic
+        # Well-formed JSON but missing every required EnvIntentSpec field — pydantic
         # surfaces this as a ``ValidationError`` distinct from a parse error.
         agent.client.chat.completions.create.return_value = _chat_response('{"missing": "fields"}')
         with pytest.raises(ValidationError):
@@ -336,7 +336,7 @@ def test_enumerates_every_task_kind(self, agent):
         for kind in get_args(TaskKind):
             assert f'"{kind}"' in prompt, f"task kind {kind!r} missing from system prompt"
 
-    def test_embeds_llm_env_spec_schema(self, agent):
+    def test_embeds_env_intent_spec_schema(self, agent):
         # We assert on field names rather than diffing the full JSON schema so
         # the test isn't brittle to pydantic's schema-generation tweaks across
         # versions.
@@ -365,7 +365,7 @@ def test_generate_spec_against_live_endpoint():
     Exercises the full pipeline with default ``model`` / ``base_url`` /
     system prompt:
 
-        auth → HTTPS → model response → JSON extract → LLMEnvSpec validation
+        auth → HTTPS → model response → JSON extract → EnvIntentSpec validation
 
     Two layers gate this from default ``pytest`` runs:
 
@@ -394,6 +394,6 @@ def test_generate_spec_against_live_endpoint():
         catalog_text=catalog,
     )
     assert isinstance(raw, str) and raw, "LLM returned empty raw response"
-    assert spec.tasks, "LLMEnvSpec must contain at least one task"
-    assert spec.background, "LLMEnvSpec.background must be populated"
-    assert spec.embodiment, "LLMEnvSpec.embodiment must be populated"
+    assert spec.tasks, "EnvIntentSpec must contain at least one task"
+    assert spec.background, "EnvIntentSpec.background must be populated"
+    assert spec.embodiment, "EnvIntentSpec.embodiment must be populated"
diff --git a/setup.py b/setup.py
index b77a2e629..6db01162b 100644
--- a/setup.py
+++ b/setup.py
@@ -16,7 +16,7 @@
     "pytest",
     # Used lazily by isaaclab_arena/environments/agentic_env_gen/* for NV_API_KEY-based LLM calls.
     "openai",
-    # Hard dependency of isaaclab_arena/environments/agentic_env_gen/llm_schema.py (BaseModel / Field /
+    # Hard dependency of isaaclab_arena/environments/agentic_env_gen/env_intent_spec.py (BaseModel / Field /
     # model_validator imported at module load — not lazy).
     "pydantic>=2.0",
 ]

From 3336f262c8ea28f2879dbbf8255c1931ff4051c2 Mon Sep 17 00:00:00 2001
From: Qian Lin <qianl@nvidia.com>
Date: Thu, 28 May 2026 15:12:39 +0800
Subject: [PATCH 33/41] Rename LLMAgent to EnvGenAgent and sweep LLM naming
 across env-gen package
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Continues the agentic terminology shift requested in PR review:
"Let's just use agent/agentic instead of LLM all over the code."
With this commit, the env-gen package surface no longer mentions
"LLM" in any name — the backend stays LLM-driven today, but the
contract is generic enough that a VLM- or rule-based agent could
slot in without touching consumers.

Class / file / marker / test-file renames:
* class LLMAgent              -> EnvGenAgent
* class LLMResponseParseError -> AgentResponseParseError
* file  llm_agent.py          -> env_gen_agent.py
* file  test_llm_agent.py     -> test_env_gen_agent.py
* mark  llm_remote_e2e        -> agent_remote_e2e
* job   test_llm_remote_e2e   -> test_agent_remote_e2e

The AgentResponseParseError docstring is rewritten to explicitly
contrast with json.JSONDecodeError (addresses xyao's review question
"AgenticResponseParseJSON error? Shall it be non overlapping with
JSON's errors?"). It now spells out that:
  * JSONDecodeError = we found a JSON-shaped payload but it is
    malformed (e.g. unbalanced quotes, trailing comma)
  * AgentResponseParseError = we couldn't even find a JSON payload
    (prose, a refusal, a partial response, or an unbalanced {...}
    block)
so callers can attribute failures correctly (retry the agent vs. fix
the schema) without string-matching error messages.

Prose sweep across the package (~60 sites): LLM -> agent in
docstrings, comments, console banners, error messages, and CI step
names. Three references in EnvGenAgent that describe the OpenAI
chat-completions wire (constructor docstring, generate_spec
docstring, temperature/max_tokens args) became "model" rather than
"agent" because that's the layer the OpenAI client targets — the
agent is the wrapper class.

pytest.ini marker description tightened: "live OpenAI-compatible
LLM endpoint" -> "live OpenAI-compatible chat-completions
endpoint" (model-neutral, matches the wire contract).

Signed-off-by: Qian Lin <qianl@nvidia.com>
---
 .github/workflows/ci.yml                      | 20 ++---
 docker/run_docker.sh                          |  2 +-
 .../{llm_agent.py => env_gen_agent.py}        | 52 ++++++-----
 .../agentic_env_gen/env_intent_spec.py        | 24 +++---
 .../agentic_env_gen/try_schema.py             | 20 ++---
 ...est_llm_agent.py => test_env_gen_agent.py} | 86 +++++++++----------
 pytest.ini                                    |  2 +-
 setup.py                                      |  2 +-
 8 files changed, 107 insertions(+), 101 deletions(-)
 rename isaaclab_arena/environments/agentic_env_gen/{llm_agent.py => env_gen_agent.py} (85%)
 rename isaaclab_arena/tests/{test_llm_agent.py => test_env_gen_agent.py} (85%)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 216c48ed0..90d536ce7 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -157,7 +157,7 @@ jobs:
       - name: Run in-process PhysX tests without cameras
         run: |
           /isaac-sim/python.sh -m pytest -sv --durations=0 \
-            -m "not with_cameras and not with_subprocess and not with_newton and not llm_remote_e2e" \
+            -m "not with_cameras and not with_subprocess and not with_newton and not agent_remote_e2e" \
             isaaclab_arena/tests/
 
       - name: Run GR00T policy/data tests (lightweight gr00t deps only)
@@ -266,14 +266,14 @@ jobs:
             isaaclab_arena_gr00t/tests/test_gr00t_remote_closedloop_policy_runner.py
 
 
-  test_llm_remote_e2e:
-    name: LLM remote E2E
+  test_agent_remote_e2e:
+    name: Agent remote E2E
     runs-on: [self-hosted, gpu-arena]
     timeout-minutes: 20
     needs: [pre_commit]
     env:
-      # NV_API_KEY is the variable LLMAgent reads at runtime (see
-      # isaaclab_arena/environments/agentic_env_gen/llm_agent.py). The repo-level secret is
+      # NV_API_KEY is the variable EnvGenAgent reads at runtime (see
+      # isaaclab_arena/environments/agentic_env_gen/env_gen_agent.py). The repo-level secret is
       # named ARENA_NV_API_KEY to mirror ARENA_NGC_API_KEY and avoid
       # collisions with other consumers of NV_API_KEY in the runner env.
       NV_API_KEY: ${{ secrets.ARENA_NV_API_KEY }}
@@ -298,19 +298,19 @@ jobs:
 
       # Fail loudly when the secret isn't wired up — the test itself
       # ``skipif``s when NV_API_KEY is empty, so without this guard a
-      # missing secret would silently produce a green job with zero LLM
+      # missing secret would silently produce a green job with zero agent
       # coverage.
       - name: Verify ARENA_NV_API_KEY is configured
         run: |
           if [ -z "${NV_API_KEY}" ]; then
-            echo "::error::ARENA_NV_API_KEY repo secret is not set; cannot run llm_remote_e2e tests."
+            echo "::error::ARENA_NV_API_KEY repo secret is not set; cannot run agent_remote_e2e tests."
             exit 1
           fi
 
-      - name: Run LLM remote E2E test
+      - name: Run agent remote E2E test
         run: |
-          /isaac-sim/python.sh -m pytest -sv --durations=0 -m llm_remote_e2e \
-            isaaclab_arena/tests/test_llm_agent.py
+          /isaac-sim/python.sh -m pytest -sv --durations=0 -m agent_remote_e2e \
+            isaaclab_arena/tests/test_env_gen_agent.py
 
 
   build_docs_pre_merge:
diff --git a/docker/run_docker.sh b/docker/run_docker.sh
index 10dbb02f4..c8af3e7ce 100755
--- a/docker/run_docker.sh
+++ b/docker/run_docker.sh
@@ -185,7 +185,7 @@ else
         fi
     fi
 
-    # pass through API keys used by the LLM scene-gen prototype; values are
+    # pass through API keys used by the agentic env-gen prototype; values are
     # inherited from the host shell so the key never lives in the repo.
     if [ -n "$NV_API_KEY" ]; then
         DOCKER_RUN_ARGS+=("--env" "NV_API_KEY")
diff --git a/isaaclab_arena/environments/agentic_env_gen/llm_agent.py b/isaaclab_arena/environments/agentic_env_gen/env_gen_agent.py
similarity index 85%
rename from isaaclab_arena/environments/agentic_env_gen/llm_agent.py
rename to isaaclab_arena/environments/agentic_env_gen/env_gen_agent.py
index daa6ac065..d315c1cf7 100644
--- a/isaaclab_arena/environments/agentic_env_gen/llm_agent.py
+++ b/isaaclab_arena/environments/agentic_env_gen/env_gen_agent.py
@@ -3,7 +3,7 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-"""LLM agent for parsing natural-language env-generation prompts into an EnvIntentSpec.
+"""Agent for parsing natural-language env-generation prompts into an EnvIntentSpec.
 
 Calls an OpenAI-compatible chat-completions endpoint (NVIDIA's hosted
 inference by default) and validates the response against the EnvIntentSpec
@@ -21,24 +21,34 @@
 DEFAULT_BASE_URL = "https://inference-api.nvidia.com"
 DEFAULT_MODEL = "nvidia/deepseek-ai/deepseek-v4-flash"
 
-# Truncate raw LLM responses to this many characters when including them in
+# Truncate raw agent responses to this many characters when including them in
 # error messages — long enough to diagnose the failure, short enough to keep
 # stack traces readable.
 _RAW_RESPONSE_PREVIEW_CHARS = 500
 
 
-class LLMResponseParseError(ValueError):
-    """Raised when an LLM response cannot be parsed into a JSON object.
+class AgentResponseParseError(ValueError):
+    """Raised when an agent's response envelope cannot be located in the model output.
+
+    This is distinct from (and not a subclass of) ``json.JSONDecodeError``:
+      * ``JSONDecodeError`` means we found a JSON-shaped payload but it's
+        malformed (unbalanced quotes, trailing comma, …) — that's the
+        model emitting bad JSON.
+      * ``AgentResponseParseError`` means we couldn't even *find* a JSON
+        payload to hand to ``json.loads`` — the model returned prose,
+        a refusal, a partial response, or an unbalanced ``{...}`` block.
+
+    Keeping the two non-overlapping lets callers attribute failures
+    correctly (retry the agent vs. fix the schema) without disambiguating
+    error messages by string-matching.
 
     Subclasses ``ValueError`` so existing ``except ValueError`` clauses
-    (e.g. around ``EnvIntentSpec.model_validate``) still catch it, but the
-    distinct type lets callers that want to retry the LLM call separate
-    parse failures from validation failures.
+    (e.g. around ``EnvIntentSpec.model_validate``) still catch it.
     """
 
 
 def build_catalog_text() -> str:
-    """Introspect AssetRegistry and build the vocabulary the LLM is allowed to use."""
+    """Introspect AssetRegistry and build the vocabulary the agent is allowed to use."""
     from isaaclab_arena.assets.registries import AssetRegistry
 
     registry = AssetRegistry()
@@ -63,7 +73,7 @@ def build_catalog_text() -> str:
     )
 
 
-class LLMAgent:
+class EnvGenAgent:
     """Parses a natural-language env-generation prompt into an EnvIntentSpec."""
 
     def __init__(
@@ -72,7 +82,7 @@ def __init__(
         model: str = DEFAULT_MODEL,
         base_url: str = DEFAULT_BASE_URL,
     ):
-        """Configure the OpenAI-compatible client used to call the LLM.
+        """Configure the OpenAI-compatible client used to call the model.
 
         Args:
             api_key: Bearer token for the inference endpoint. Falls back
@@ -119,7 +129,7 @@ def generate_spec(
         temperature: float = 0.2,
         max_tokens: int = 2000,
     ) -> tuple[EnvIntentSpec, str]:
-        """Call the LLM and return the parsed EnvIntentSpec plus the raw response.
+        """Call the model and return the parsed EnvIntentSpec plus the raw response.
 
         Args:
             prompt: Natural-language env description from the end user.
@@ -131,7 +141,7 @@ def generate_spec(
                 value to (a) avoid the cost of rebuilding it across
                 repeated calls, or (b) experiment with a restricted /
                 augmented catalog without mutating the registry.
-            temperature: Sampling temperature forwarded to the LLM. Kept
+            temperature: Sampling temperature forwarded to the model. Kept
                 low by default (0.2) because EnvIntentSpec generation is a
                 deterministic-ish translation task — high temperature
                 yields creative but invalid schemas.
@@ -146,7 +156,7 @@ def generate_spec(
             JSON.
 
         Raises:
-            LLMResponseParseError: when the response can't be parsed as a
+            AgentResponseParseError: when the response can't be parsed as a
                 JSON object (no opening brace, unbalanced braces).
             pydantic.ValidationError: when the parsed JSON is well-formed
                 but doesn't match the EnvIntentSpec schema.
@@ -200,11 +210,11 @@ def ping(self) -> str:
             operator.
 
         Example:
-            >>> agent = LLMAgent()
+            >>> agent = EnvGenAgent()
             >>> try:
             ...     agent.ping()
             ... except Exception as e:
-            ...     sys.exit(f"LLM endpoint health-check failed: {e}")
+            ...     sys.exit(f"Agent endpoint health-check failed: {e}")
         """
         resp = self.client.chat.completions.create(
             model=self.model,
@@ -218,9 +228,9 @@ def _system_prompt(self) -> str:
         schema = json.dumps(EnvIntentSpec.model_json_schema(), indent=2)
         # Per-field guidance (what each field means, enum members, default
         # behaviours) lives on the ``Field(description=...)`` entries in
-        # env_intent_spec.py and is surfaced to the LLM via the SCHEMA block
+        # env_intent_spec.py and is surfaced to the agent via the SCHEMA block
         # below. Only cross-cutting rules (those that span multiple fields
-        # or change LLM output behaviour globally) and few-shot examples
+        # or change agent output behaviour globally) and few-shot examples
         # belong here.
         return (
             "You are an env-generation parser for robot manipulation tasks.\n"
@@ -259,15 +269,15 @@ def _extract_json(content: str) -> dict:
         with contextlib.suppress(json.JSONDecodeError):
             return json.loads(content)
 
-        # ``raise LLMResponseParseError`` rather than ``assert`` so the guard
+        # ``raise AgentResponseParseError`` rather than ``assert`` so the guard
         # survives ``python -O`` (which strips asserts), and so callers can
         # distinguish parse failures from validation failures by exception
         # type. The truncated raw response is the most useful field for
         # debugging a misbehaving prompt.
         start = content.find("{")
         if start == -1:
-            raise LLMResponseParseError(
-                f"No JSON object found in LLM response: {content[:_RAW_RESPONSE_PREVIEW_CHARS]!r}"
+            raise AgentResponseParseError(
+                f"No JSON object found in agent response: {content[:_RAW_RESPONSE_PREVIEW_CHARS]!r}"
             )
         depth = 0
         for i in range(start, len(content)):
@@ -277,4 +287,4 @@ def _extract_json(content: str) -> dict:
                 depth -= 1
                 if depth == 0:
                     return json.loads(content[start : i + 1])
-        raise LLMResponseParseError(f"Unbalanced braces in LLM response: {content[:_RAW_RESPONSE_PREVIEW_CHARS]!r}")
+        raise AgentResponseParseError(f"Unbalanced braces in agent response: {content[:_RAW_RESPONSE_PREVIEW_CHARS]!r}")
diff --git a/isaaclab_arena/environments/agentic_env_gen/env_intent_spec.py b/isaaclab_arena/environments/agentic_env_gen/env_intent_spec.py
index 5562574c4..ef5de2488 100644
--- a/isaaclab_arena/environments/agentic_env_gen/env_intent_spec.py
+++ b/isaaclab_arena/environments/agentic_env_gen/env_intent_spec.py
@@ -3,12 +3,12 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-"""Schema the LLM must fill in when parsing a natural-language env-generation prompt.
+"""Schema the agent must fill in when parsing a natural-language env-generation prompt.
 
-The LLM sees a list of the *available* asset tags / embodiment names pulled
+The agent sees a list of the *available* asset tags / embodiment names pulled
 from the registries at call time, and must return an EnvIntentSpec that only uses
 those vocabularies. Concrete asset names are resolved in a second, deterministic
-step — the LLM never invents USD paths.
+step — the agent never invents USD paths.
 """
 
 from __future__ import annotations
@@ -17,7 +17,7 @@
 
 from pydantic import BaseModel, Field, model_validator
 
-# Relation kinds currently surfaced to the LLM. Mirror the subset of
+# Relation kinds currently surfaced to the agent. Mirror the subset of
 # ``ArenaEnvGraphSpatialConstraintType`` that makes sense for tabletop
 # prompts; values must match the enum's values one-to-one because the
 # resolver looks the constraint type up via
@@ -25,7 +25,7 @@
 # parallel dict. Solver-internal kinds (``position_limits``,
 # ``random_around_solution``, ``rotate_around_solution``) are intentionally
 # omitted — they describe how the placement solver explores poses and are
-# not natural for an LLM to emit.
+# not natural for an agent to emit.
 # "in" has no In class in isaaclab_arena.relations.relations yet — see the
 # TODO there. The downstream env builder materializes goal-state "in"
 # relations as the task's success predicate.
@@ -33,12 +33,12 @@
 
 ItemRole = Literal["foreground", "distractor", "anchor"]
 
-# Task kinds the LLM can propose as an atomic task.
+# Task kinds the agent can propose as an atomic task.
 TaskKind = Literal["pick_and_place", "open_door", "close_door"]
 
 
 class Item(BaseModel):
-    """One object the LLM wants in the scene."""
+    """One object the agent wants in the scene."""
 
     query: str = Field(
         description=(
@@ -140,12 +140,12 @@ class EnvIntentSpec(BaseModel):
     """Agent output — a structured "env intent" (blueprint) for the env and a list of tasks.
 
     Field-level guidance lives on the individual ``Field(description=...)``
-    entries below and is surfaced to the LLM via ``model_json_schema()``;
+    entries below and is surfaced to the agent via ``model_json_schema()``;
     only cross-cutting rules and few-shot examples are kept in the
-    prompt text (see ``LLMAgent._system_prompt``).
+    prompt text (see ``EnvGenAgent._system_prompt``).
     """
 
-    # Forced chain-of-thought field, listed FIRST so the LLM emits its
+    # Forced chain-of-thought field, listed FIRST so the agent emits its
     # analysis before committing to any structured field. Instruction-tuned
     # models respect schema field order, and writing reasoning before
     # answers measurably improves structured-output quality (the
@@ -188,10 +188,6 @@ class EnvIntentSpec(BaseModel):
             "persistent relation (e.g. bowl on table, distractors present) "
             "must appear here. Relations that change via tasks are still "
             "listed here in their starting form."
-            # NOTE: field name kept as ``initial_scene_graph`` even though
-            # the class has been renamed (SceneSpec -> LLMEnvSpec ->
-            # EnvIntentSpec) — renaming the field would change the JSON
-            # schema the agent is prompted against and is out of scope here.
         ),
     )
     tasks: list[Task] = Field(
diff --git a/isaaclab_arena/environments/agentic_env_gen/try_schema.py b/isaaclab_arena/environments/agentic_env_gen/try_schema.py
index af44fbf37..57bef2dbd 100644
--- a/isaaclab_arena/environments/agentic_env_gen/try_schema.py
+++ b/isaaclab_arena/environments/agentic_env_gen/try_schema.py
@@ -3,18 +3,18 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-"""Run the LLM parser on a prompt and dump the resolved ArenaEnvGraphSpec.
+"""Run the agent on a prompt and dump the resolved ArenaEnvGraphSpec.
 
 Requires NV_API_KEY environment variable.
 
 Examples:
-    # Print the Pydantic EnvIntentSpec JSON schema (no LLM call):
+    # Print the Pydantic EnvIntentSpec JSON schema (no agent call):
     /isaac-sim/python.sh -m isaaclab_arena.environments.agentic_env_gen.try_schema --print-schema
 
-    # Print the catalog sent to the LLM (no LLM call):
+    # Print the catalog sent to the agent (no agent call):
     /isaac-sim/python.sh -m isaaclab_arena.environments.agentic_env_gen.try_schema --print-catalog
 
-    # Call the LLM, resolve, print, and dump YAML:
+    # Call the agent, resolve, print, and dump YAML:
     /isaac-sim/python.sh -m isaaclab_arena.environments.agentic_env_gen.try_schema \
         --prompt "franka pick up avocado from the table and place it into a bowl on the table. there are other veggies on the table as distractor"
 """
@@ -46,11 +46,11 @@ def main() -> None:
         type=str,
         default="maple_table_robolab",
         help=(
-            "Override the background chosen by the LLM (e.g. 'office_table' "
+            "Override the background chosen by the agent (e.g. 'office_table' "
             "or 'kitchen'). Default is 'maple_table_robolab' because its "
             "tabletop ObjectReference yields a clean bbox and stable "
             "placement, unlike the rotated plain 'table' background. Pass "
-            "an empty string ('') to keep the LLM's choice."
+            "an empty string ('') to keep the agent's choice."
         ),
     )
     args = parser.parse_args()
@@ -61,7 +61,7 @@ def main() -> None:
         print(json.dumps(EnvIntentSpec.model_json_schema(), indent=2))
         return
 
-    from isaaclab_arena.environments.agentic_env_gen.llm_agent import LLMAgent, build_catalog_text
+    from isaaclab_arena.environments.agentic_env_gen.env_gen_agent import EnvGenAgent, build_catalog_text
 
     catalog = build_catalog_text()
     if args.print_catalog:
@@ -69,16 +69,16 @@ def main() -> None:
         return
 
     kwargs = {"model": args.model} if args.model else {}
-    agent = LLMAgent(**kwargs)
+    agent = EnvGenAgent(**kwargs)
     spec, raw = agent.generate_spec(args.prompt, catalog_text=catalog, temperature=args.temperature)
 
-    print("=== raw LLM response ===")
+    print("=== raw agent response ===")
     print(raw)
 
     # Surface the forced chain-of-thought field on its own so it's easy to
     # spot when debugging a bad spec — without this, ``reasoning`` is
     # buried inside the multi-hundred-line model_dump_json below.
-    print("\n=== LLM reasoning ===")
+    print("\n=== agent reasoning ===")
     print(spec.reasoning)
 
     if args.background and args.background != spec.background:
diff --git a/isaaclab_arena/tests/test_llm_agent.py b/isaaclab_arena/tests/test_env_gen_agent.py
similarity index 85%
rename from isaaclab_arena/tests/test_llm_agent.py
rename to isaaclab_arena/tests/test_env_gen_agent.py
index afc9b032b..ea63f2539 100644
--- a/isaaclab_arena/tests/test_llm_agent.py
+++ b/isaaclab_arena/tests/test_env_gen_agent.py
@@ -3,14 +3,14 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-"""Unit tests for :class:`isaaclab_arena.environments.agentic_env_gen.llm_agent.LLMAgent`.
+"""Unit tests for :class:`isaaclab_arena.environments.agentic_env_gen.env_gen_agent.EnvGenAgent`.
 
 The agent's behaviour decomposes into four pure-Python concerns that we exercise
 without ever hitting the wire:
 
 * ``__init__`` argument / env-var precedence and the missing-key guard.
 * ``_extract_json`` parsing of well-behaved, fenced, prosed, and malformed
-  LLM responses (including the ``LLMResponseParseError`` → ``ValueError`` MRO
+  agent responses (including the ``AgentResponseParseError`` → ``ValueError`` MRO
   contract so callers can still ``except ValueError``).
 * ``generate_spec`` / ``ping`` — the openai client is replaced with a
   ``MagicMock`` so we assert on the request shape (model, messages,
@@ -28,14 +28,14 @@
 import pytest
 from pydantic import ValidationError
 
-from isaaclab_arena.environments.agentic_env_gen.env_intent_spec import RelationKind, TaskKind
-from isaaclab_arena.environments.agentic_env_gen.llm_agent import (
+from isaaclab_arena.environments.agentic_env_gen.env_gen_agent import (
     _RAW_RESPONSE_PREVIEW_CHARS,
     DEFAULT_BASE_URL,
     DEFAULT_MODEL,
-    LLMAgent,
-    LLMResponseParseError,
+    AgentResponseParseError,
+    EnvGenAgent,
 )
+from isaaclab_arena.environments.agentic_env_gen.env_intent_spec import RelationKind, TaskKind
 
 # ---------------------------------------------------------------------------
 # Fixtures
@@ -44,12 +44,12 @@
 
 @pytest.fixture
 def stub_openai():
-    """Patch the ``openai.OpenAI`` constructor so ``LLMAgent()`` never hits the wire.
+    """Patch the ``openai.OpenAI`` constructor so ``EnvGenAgent()`` never hits the wire.
 
     The agent does a deferred ``from openai import OpenAI`` inside
     ``__init__`` to avoid pulling the dependency at module import time, so we
     patch the symbol on the ``openai`` module itself rather than on the
-    ``llm_agent`` namespace.
+    ``env_gen_agent`` namespace.
     """
     with patch("openai.OpenAI") as mock_cls:
         mock_cls.return_value = MagicMock()
@@ -58,12 +58,12 @@ def stub_openai():
 
 @pytest.fixture
 def agent(stub_openai):
-    """A constructed ``LLMAgent`` with a fully mocked openai client.
+    """A constructed ``EnvGenAgent`` with a fully mocked openai client.
 
     Tests should set ``agent.client.chat.completions.create.return_value`` (or
-    ``.side_effect``) to control the simulated LLM response.
+    ``.side_effect``) to control the simulated agent response.
     """
-    return LLMAgent(api_key="test-key")
+    return EnvGenAgent(api_key="test-key")
 
 
 def _chat_response(content: str | None):
@@ -110,26 +110,26 @@ def _chat_response(content: str | None):
 class TestInit:
     def test_explicit_api_key_overrides_env(self, monkeypatch, stub_openai):
         monkeypatch.setenv("NV_API_KEY", "env-key")
-        a = LLMAgent(api_key="explicit-key")
+        a = EnvGenAgent(api_key="explicit-key")
         assert a.api_key == "explicit-key"
 
     def test_falls_back_to_env_var(self, monkeypatch, stub_openai):
         monkeypatch.setenv("NV_API_KEY", "env-key")
-        a = LLMAgent()
+        a = EnvGenAgent()
         assert a.api_key == "env-key"
 
     def test_raises_when_no_key_anywhere(self, monkeypatch, stub_openai):
         monkeypatch.delenv("NV_API_KEY", raising=False)
         with pytest.raises(ValueError, match="API key required"):
-            LLMAgent()
+            EnvGenAgent()
 
     def test_default_model_and_base_url(self, stub_openai):
-        a = LLMAgent(api_key="k")
+        a = EnvGenAgent(api_key="k")
         assert a.model == DEFAULT_MODEL
         stub_openai.assert_called_once_with(api_key="k", base_url=DEFAULT_BASE_URL)
 
     def test_custom_model_and_base_url(self, stub_openai):
-        a = LLMAgent(api_key="k", model="custom-model", base_url="http://localhost:8000")
+        a = EnvGenAgent(api_key="k", model="custom-model", base_url="http://localhost:8000")
         assert a.model == "custom-model"
         stub_openai.assert_called_once_with(api_key="k", base_url="http://localhost:8000")
 
@@ -139,7 +139,7 @@ def test_init_pings_to_verify_connection(self, stub_openai):
         # rather than deep inside the first generate_spec. Locking in the
         # request shape (single user message, max_tokens=8, temperature=0)
         # guarantees we don't accidentally inflate the startup cost.
-        a = LLMAgent(api_key="k")
+        a = EnvGenAgent(api_key="k")
         a.client.chat.completions.create.assert_called_once()
         kwargs = a.client.chat.completions.create.call_args.kwargs
         assert kwargs["temperature"] == 0
@@ -149,7 +149,7 @@ def test_init_pings_to_verify_connection(self, stub_openai):
     def test_init_propagates_ping_failure(self):
         # If the openai client raises during the constructor ping (bad key,
         # unreachable endpoint, ...), the exception must surface from
-        # ``LLMAgent()`` itself — not be swallowed into a silently-broken
+        # ``EnvGenAgent()`` itself — not be swallowed into a silently-broken
         # instance that fails later when generate_spec is called.
         class FakeAuthError(Exception):
             pass
@@ -159,7 +159,7 @@ class FakeAuthError(Exception):
             client.chat.completions.create.side_effect = FakeAuthError("bad key")
             mock_cls.return_value = client
             with pytest.raises(FakeAuthError, match="bad key"):
-                LLMAgent(api_key="k")
+                EnvGenAgent(api_key="k")
 
 
 # ---------------------------------------------------------------------------
@@ -169,46 +169,46 @@ class FakeAuthError(Exception):
 
 class TestExtractJson:
     def test_plain_json_object(self):
-        assert LLMAgent._extract_json('{"a": 1}') == {"a": 1}
+        assert EnvGenAgent._extract_json('{"a": 1}') == {"a": 1}
 
     def test_strips_fenced_json_block(self):
-        assert LLMAgent._extract_json('```json\n{"a": 1}\n```') == {"a": 1}
+        assert EnvGenAgent._extract_json('```json\n{"a": 1}\n```') == {"a": 1}
 
     def test_strips_bare_triple_backticks(self):
-        assert LLMAgent._extract_json('```\n{"a": 1}\n```') == {"a": 1}
+        assert EnvGenAgent._extract_json('```\n{"a": 1}\n```') == {"a": 1}
 
     def test_extracts_object_from_prose(self):
         text = 'Sure! Here is the JSON: {"a": 1} -- hope that helps.'
-        assert LLMAgent._extract_json(text) == {"a": 1}
+        assert EnvGenAgent._extract_json(text) == {"a": 1}
 
     def test_handles_nested_braces(self):
         text = 'prefix {"outer": {"inner": [1, 2, 3]}} suffix'
-        assert LLMAgent._extract_json(text) == {"outer": {"inner": [1, 2, 3]}}
+        assert EnvGenAgent._extract_json(text) == {"outer": {"inner": [1, 2, 3]}}
 
     def test_raises_when_no_opening_brace(self):
-        with pytest.raises(LLMResponseParseError, match="No JSON object found"):
-            LLMAgent._extract_json("plain text with no braces at all")
+        with pytest.raises(AgentResponseParseError, match="No JSON object found"):
+            EnvGenAgent._extract_json("plain text with no braces at all")
 
     def test_raises_on_unbalanced_braces(self):
-        with pytest.raises(LLMResponseParseError, match="Unbalanced braces"):
-            LLMAgent._extract_json('prefix {"a": 1 with no closing brace')
+        with pytest.raises(AgentResponseParseError, match="Unbalanced braces"):
+            EnvGenAgent._extract_json('prefix {"a": 1 with no closing brace')
 
     def test_parse_error_is_a_value_error(self):
-        # MRO contract: LLMResponseParseError subclasses ValueError so existing
+        # MRO contract: AgentResponseParseError subclasses ValueError so existing
         # ``except ValueError`` clauses (e.g. wrapping model_validate) still
         # catch parse failures. Asserting via ``except ValueError`` rather than
         # ``issubclass`` keeps the test grounded in how callers actually use it.
         with pytest.raises(ValueError):
-            LLMAgent._extract_json("no braces here")
+            EnvGenAgent._extract_json("no braces here")
 
     def test_truncates_long_raw_response_in_error(self):
         # Confirm the preview cap really clips the embedded raw response —
-        # otherwise a megabyte-scale LLM hallucination would bury the
+        # otherwise a megabyte-scale agent hallucination would bury the
         # stack trace. We allow a small wrapper budget for the surrounding
         # error message (repr quotes + "No JSON object found in ..." prefix).
         huge = "x" * 5000
-        with pytest.raises(LLMResponseParseError) as exc_info:
-            LLMAgent._extract_json(huge)
+        with pytest.raises(AgentResponseParseError) as exc_info:
+            EnvGenAgent._extract_json(huge)
         msg = str(exc_info.value)
         wrapper_budget = 200
         assert len(msg) <= _RAW_RESPONSE_PREVIEW_CHARS + wrapper_budget
@@ -239,7 +239,7 @@ def test_handles_fenced_response(self, agent):
 
     def test_propagates_parse_error_for_garbage_response(self, agent):
         agent.client.chat.completions.create.return_value = _chat_response("not json at all")
-        with pytest.raises(LLMResponseParseError):
+        with pytest.raises(AgentResponseParseError):
             agent.generate_spec("p", catalog_text="catalog")
 
     def test_propagates_validation_error_for_schema_violation(self, agent):
@@ -271,7 +271,7 @@ def test_user_message_contains_catalog_and_prompt(self, agent):
         assert "<<CATALOG-MARKER>>" in user_msg
         assert "user wants avocado on kitchen" in user_msg
         # The "JSON-only" instruction is the contract that lets _extract_json
-        # work — if it disappears the LLM tends to wrap in prose.
+        # work — if it disappears the agent tends to wrap in prose.
         assert "JSON" in user_msg
 
 
@@ -323,7 +323,7 @@ class TestSystemPrompt:
     def test_enumerates_every_relation_kind(self, agent):
         # The prompt derives its bullet list from ``get_args(RelationKind)``;
         # this assertion fails the moment someone adds a kind to the literal
-        # without rebuilding the prompt, which would silently teach the LLM a
+        # without rebuilding the prompt, which would silently teach the agent a
         # vocabulary the resolver doesn't accept.
         prompt = agent._system_prompt()
         for kind in get_args(RelationKind):
@@ -358,7 +358,7 @@ def test_embeds_env_intent_spec_schema(self, agent):
 # ---------------------------------------------------------------------------
 
 
-@pytest.mark.llm_remote_e2e
+@pytest.mark.agent_remote_e2e
 def test_generate_spec_against_live_endpoint():
     """End-to-end smoke test against the real OpenAI-compatible endpoint.
 
@@ -369,19 +369,19 @@ def test_generate_spec_against_live_endpoint():
 
     Two layers gate this from default ``pytest`` runs:
 
-      * ``llm_remote_e2e`` marker — registered in ``pytest.ini`` next to
+      * ``agent_remote_e2e`` marker — registered in ``pytest.ini`` next to
         ``gr00t_remote_e2e``. Run explicitly with
-        ``pytest -m llm_remote_e2e isaaclab_arena/tests/test_llm_agent.py``.
+        ``pytest -m agent_remote_e2e isaaclab_arena/tests/test_env_gen_agent.py``.
 
     The asset catalog is supplied inline rather than via ``AssetRegistry``
     so the test doesn't depend on Isaac Lab asset registration state — we
-    only want to validate the LLM wire here, not the catalog builder.
+    only want to validate the agent wire here, not the catalog builder.
 
     Assertions are intentionally loose: we check shape (non-empty raw,
     non-empty tasks, populated background/embodiment) rather than exact
-    content, since LLM output drifts between model versions.
+    content, since agent output drifts between model versions.
     """
-    agent = LLMAgent()
+    agent = EnvGenAgent()
     catalog = (
         "EMBODIMENTS: franka_ik\n\n"
         "BACKGROUNDS: maple_table_kitchen\n\n"
@@ -393,7 +393,7 @@ def test_generate_spec_against_live_endpoint():
         "pick up the avocado and place it in the bowl on the kitchen table",
         catalog_text=catalog,
     )
-    assert isinstance(raw, str) and raw, "LLM returned empty raw response"
+    assert isinstance(raw, str) and raw, "agent returned empty raw response"
     assert spec.tasks, "EnvIntentSpec must contain at least one task"
     assert spec.background, "EnvIntentSpec.background must be populated"
     assert spec.embodiment, "EnvIntentSpec.embodiment must be populated"
diff --git a/pytest.ini b/pytest.ini
index a4747c4cc..2c26acc38 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -5,4 +5,4 @@ markers =
     with_newton: test uses Newton physics
     gr00t_policy: test exercises GR00T policy/data code that runs in the base container (lightweight gr00t deps only)
     gr00t_remote_e2e: test requires a live GR00T remote policy server
-    llm_remote_e2e: test requires a live OpenAI-compatible LLM endpoint (needs NV_API_KEY)
+    agent_remote_e2e: test requires a live OpenAI-compatible chat-completions endpoint (needs NV_API_KEY)
diff --git a/setup.py b/setup.py
index 6db01162b..b8afc5af7 100644
--- a/setup.py
+++ b/setup.py
@@ -14,7 +14,7 @@
     "vuer[all]",
     "lightwheel-sdk",
     "pytest",
-    # Used lazily by isaaclab_arena/environments/agentic_env_gen/* for NV_API_KEY-based LLM calls.
+    # Used lazily by isaaclab_arena/environments/agentic_env_gen/* for NV_API_KEY-based agent calls.
     "openai",
     # Hard dependency of isaaclab_arena/environments/agentic_env_gen/env_intent_spec.py (BaseModel / Field /
     # model_validator imported at module load — not lazy).

From fb4259691b4be0fa606d94f413df2321e02dedd4 Mon Sep 17 00:00:00 2001
From: Qian Lin <qianl@nvidia.com>
Date: Thu, 28 May 2026 17:09:18 +0800
Subject: [PATCH 34/41] Adopt structured outputs and wire-validate at
 construction time
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Switch EnvGenAgent from prose-extraction to OpenAI-compatible
structured outputs (response_format=json_schema). The wire now
guarantees a JSON envelope matching EnvIntentSpec, so the agent
drops:

* _extract_json (markdown-fence stripping + brace-counting heuristic)
* AgentResponseParseError (the "no JSON found" / "unbalanced braces"
  exception class is unreachable under structured outputs)
* the "Emit ONLY the JSON object" instruction in the prompt
* the prompt's embedded JSON schema dump (the wire carries it)

Addresses xyao's review question "Can we leverage the structured
outputs API?".

Two endpoint quirks surfaced during validation against the requested
models (nvidia/deepseek-ai/deepseek-v4-flash and
aws/anthropic/bedrock-claude-opus-4-6) and are handled cleanly:

* NVIDIA DeepSeek-v4-flash routes structured outputs into
  message.reasoning_content instead of message.content. The new
  extract_response_text() reads either channel transparently.
* AWS Bedrock requires additionalProperties:false on every object
  node, which pydantic's default schema doesn't emit. The new
  build_strict_schema() deep-walks the schema and applies the
  strict-mode constraints (additionalProperties:false, every
  property required, drop default keys).

DeepSeek-v4-flash also emits literal control chars (e.g. \t) inside
JSON strings despite the structured-outputs contract. json.loads
with strict=False accepts them, whereas pydantic's stricter
model_validate_json would reject them. This is documented inline.

A new structured_output_utils module owns all wire-level
diagnostics (pydantic-model-agnostic, BaseModel-typed):

* ping(client, model) — cheap liveness probe (no response_format)
* check_structured_output_support(client, model, spec_class) —
  full capability probe; returns a StructuredOutputSupport
  dataclass that separates api_error (4xx at the wire) from
  parse_error (JSON / schema validation), so deployment
  validators can attribute failures correctly without
  string-matching error messages.
* build_strict_schema / apply_strict_constraints — schema munging
* extract_response_text — channel fallback

EnvGenAgent.__init__ now runs both probes in order:

  1. ping (cheap, fails fast on dead wire / bad key)
  2. check_structured_output_support (heavier; carries the
     EnvIntentSpec schema)

A model that can't honour response_format raises RuntimeError at
construction with the diagnostic payload in the message so the
operator can attribute the cause (api_error vs parse_error vs
route='empty'). No silently-broken agent instances.

Tests:
* 28 new unit tests in test_structured_output_utils.py covering
  schema munging (incl. an EnvIntentSpec-walk that confirms every
  object node is strict-mode-compatible), channel fallback, ping,
  and check_structured_output_support failure modes.
* 16 unit tests in test_env_gen_agent.py rewritten for the new
  __init__ + generate_spec pipeline. Locks in: call order
  (ping then probe), early exit when ping fails (probe never
  fires), construction failure when probe returns unsupported
  (RuntimeError carries the diagnostic fields).
* 2 live tests guarded by the agent_remote_e2e marker:
  test_default_model_supports_structured_output pins the
  capability contract against the default model;
  test_generate_spec_against_live_endpoint exercises the full
  pipeline.

Verified end-to-end against the live nvidia/deepseek-ai/deepseek-v4-flash
endpoint (parses the EnvIntentSpec out of reasoning_content). The
schema munging was independently confirmed against
aws/anthropic/bedrock-claude-opus-4-6: the request got past Bedrock's
additionalProperties validator. That model is geo-restricted from our
test environment, but the wire path is sound.

Signed-off-by: Qian Lin <qianl@nvidia.com>
---
 .../agentic_env_gen/env_gen_agent.py          | 256 +++++------
 .../structured_output_utils.py                | 283 ++++++++++++
 isaaclab_arena/tests/test_env_gen_agent.py    | 407 +++++++++--------
 .../tests/test_structured_output_utils.py     | 421 ++++++++++++++++++
 4 files changed, 1031 insertions(+), 336 deletions(-)
 create mode 100644 isaaclab_arena/environments/agentic_env_gen/structured_output_utils.py
 create mode 100644 isaaclab_arena/tests/test_structured_output_utils.py

diff --git a/isaaclab_arena/environments/agentic_env_gen/env_gen_agent.py b/isaaclab_arena/environments/agentic_env_gen/env_gen_agent.py
index d315c1cf7..63e044a49 100644
--- a/isaaclab_arena/environments/agentic_env_gen/env_gen_agent.py
+++ b/isaaclab_arena/environments/agentic_env_gen/env_gen_agent.py
@@ -6,46 +6,25 @@
 """Agent for parsing natural-language env-generation prompts into an EnvIntentSpec.
 
 Calls an OpenAI-compatible chat-completions endpoint (NVIDIA's hosted
-inference by default) and validates the response against the EnvIntentSpec
-pydantic bundle so asset resolution stays deterministic.
+inference by default) and uses the **structured-outputs** API
+(``response_format={"type": "json_schema", ...}``) so the wire
+guarantees a valid JSON envelope matching EnvIntentSpec. There is no
+prose-parsing fallback — if the configured model/endpoint does not
+support structured outputs, :class:`EnvGenAgent` will refuse to
+construct.
 """
 
 from __future__ import annotations
 
-import contextlib
 import json
 import os
 
 from .env_intent_spec import EnvIntentSpec
+from .structured_output_utils import build_strict_schema, check_structured_output_support, extract_response_text, ping
 
 DEFAULT_BASE_URL = "https://inference-api.nvidia.com"
 DEFAULT_MODEL = "nvidia/deepseek-ai/deepseek-v4-flash"
 
-# Truncate raw agent responses to this many characters when including them in
-# error messages — long enough to diagnose the failure, short enough to keep
-# stack traces readable.
-_RAW_RESPONSE_PREVIEW_CHARS = 500
-
-
-class AgentResponseParseError(ValueError):
-    """Raised when an agent's response envelope cannot be located in the model output.
-
-    This is distinct from (and not a subclass of) ``json.JSONDecodeError``:
-      * ``JSONDecodeError`` means we found a JSON-shaped payload but it's
-        malformed (unbalanced quotes, trailing comma, …) — that's the
-        model emitting bad JSON.
-      * ``AgentResponseParseError`` means we couldn't even *find* a JSON
-        payload to hand to ``json.loads`` — the model returned prose,
-        a refusal, a partial response, or an unbalanced ``{...}`` block.
-
-    Keeping the two non-overlapping lets callers attribute failures
-    correctly (retry the agent vs. fix the schema) without disambiguating
-    error messages by string-matching.
-
-    Subclasses ``ValueError`` so existing ``except ValueError`` clauses
-    (e.g. around ``EnvIntentSpec.model_validate``) still catch it.
-    """
-
 
 def build_catalog_text() -> str:
     """Introspect AssetRegistry and build the vocabulary the agent is allowed to use."""
@@ -74,7 +53,16 @@ def build_catalog_text() -> str:
 
 
 class EnvGenAgent:
-    """Parses a natural-language env-generation prompt into an EnvIntentSpec."""
+    """Parses a natural-language env-generation prompt into an EnvIntentSpec.
+
+    The agent is **structured-outputs only**: every call to
+    ``generate_spec`` passes ``response_format={"type": "json_schema",
+    ...}`` to the chat-completions endpoint, and the response is
+    parsed directly as JSON. There is no prose / markdown-fence
+    fallback — if the configured model/endpoint doesn't honour
+    ``response_format``, the constructor raises before the agent is
+    usable.
+    """
 
     def __init__(
         self,
@@ -82,7 +70,23 @@ def __init__(
         model: str = DEFAULT_MODEL,
         base_url: str = DEFAULT_BASE_URL,
     ):
-        """Configure the OpenAI-compatible client used to call the model.
+        """Configure the OpenAI-compatible client and validate the model.
+
+        Construction runs two fail-fast wire checks in order:
+
+          1. :func:`.structured_output_utils.ping` — cheap liveness
+             probe (no ``response_format``). Confirms the API key
+             authenticates, the model name resolves at ``base_url``,
+             and the network path is reachable.
+          2. :func:`.structured_output_utils.check_structured_output_support`
+             — sends ``response_format=json_schema`` with the
+             ``EnvIntentSpec`` schema and asserts a valid envelope
+             comes back. Confirms the model actually honours the
+             structured-outputs contract ``generate_spec`` relies on.
+
+        Both run at construction time so a misconfigured model fails
+        immediately with a clear stack — not mid-pipeline inside the
+        first ``generate_spec`` call.
 
         Args:
             api_key: Bearer token for the inference endpoint. Falls back
@@ -91,7 +95,10 @@ def __init__(
             model: Model identifier as understood by the endpoint at
                 ``base_url`` (e.g. ``"nvidia/deepseek-ai/deepseek-v4-flash"``).
                 See https://build.nvidia.com for the catalogue of NVIDIA-hosted
-                models.
+                models. Must support OpenAI-compatible structured
+                outputs (``response_format=json_schema``) — the
+                constructor validates this and refuses to proceed
+                otherwise.
             base_url: OpenAI-compatible API root. Defaults to
                 ``DEFAULT_BASE_URL`` (NVIDIA's hosted inference endpoint);
                 override to point at a self-hosted vLLM / Ollama / etc.
@@ -101,9 +108,13 @@ def __init__(
         Raises:
             ValueError: when no API key is available (neither argument
                 nor ``NV_API_KEY`` env var).
+            RuntimeError: when the configured model does not support
+                structured outputs (probe came back unsupported).
             Any exception raised by the underlying ``openai`` client
-                during the startup ``ping()``. See :meth:`ping` for the
-                common failure modes.
+                during the ping probe — typically
+                ``AuthenticationError`` (bad key), ``NotFoundError``
+                (wrong model), ``APIConnectionError`` (unreachable
+                endpoint), or ``RateLimitError`` (quota exhausted).
         """
         from openai import OpenAI
 
@@ -115,12 +126,31 @@ def __init__(
             raise ValueError("API key required: set NV_API_KEY or pass api_key.")
         self.model = model
         self.client = OpenAI(api_key=self.api_key, base_url=base_url)
-        # Fail-fast connection check. Costs ~hundreds of ms on hot paths and
-        # converts a deferred ``AuthenticationError`` (or ``NotFoundError`` /
-        # ``APIConnectionError``) into a constructor-time failure with a clear
-        # call stack, which is much easier to diagnose than the same error
-        # surfacing mid-pipeline inside ``generate_spec``.
-        self.ping()
+        # Cached on the instance because the schema is non-trivial to walk
+        # (~10 nested object nodes) and ``generate_spec`` may be called many
+        # times. Munged once per agent lifetime.
+        self._spec_schema = build_strict_schema(EnvIntentSpec)
+
+        # 1) Cheap liveness probe first. If the wire is down or the key is
+        # bad we don't want to waste tokens on the heavier structured-output
+        # probe below — ``ping`` is the right tool for "is the endpoint
+        # talking to us at all?".
+        ping(self.client, self.model)
+
+        # 2) Structured-output capability check. ``generate_spec`` is
+        # structured-outputs-only, so a model that can't honour
+        # ``response_format=json_schema`` is fundamentally unusable for
+        # this agent. Surface the failure at construction time with the
+        # full diagnostic payload from the probe (api_error vs
+        # parse_error vs route) so deployment validators can attribute
+        # the cause without grepping logs.
+        support = check_structured_output_support(self.client, self.model, EnvIntentSpec)
+        if not support.supported:
+            raise RuntimeError(
+                f"Model {self.model!r} at {base_url!r} does not support structured outputs: "
+                f"api_error={support.api_error!r} parse_error={support.parse_error!r} "
+                f"route={support.response_route!r}"
+            )
 
     def generate_spec(
         self,
@@ -131,10 +161,15 @@ def generate_spec(
     ) -> tuple[EnvIntentSpec, str]:
         """Call the model and return the parsed EnvIntentSpec plus the raw response.
 
+        Uses OpenAI-compatible structured outputs: the request includes
+        ``response_format={"type": "json_schema", ...}`` with the
+        EnvIntentSpec schema, and the response is parsed directly as
+        JSON. No prose / markdown-fence fallback.
+
         Args:
             prompt: Natural-language env description from the end user.
-                Concatenated with the asset catalog and the JSON-only
-                instruction to form the chat ``user`` message.
+                Concatenated with the asset catalog to form the chat
+                ``user`` message.
             catalog_text: Pre-built asset vocabulary (the output of
                 ``build_catalog_text()``). When ``None``, the catalog is
                 rebuilt from the live ``AssetRegistry``. Pass an explicit
@@ -151,21 +186,27 @@ def generate_spec(
                 quota.
 
         Returns:
-            A ``(EnvIntentSpec, raw_response)`` tuple. The raw text is useful
-            for debugging when ``model_validate`` rejects the parsed
-            JSON.
+            A ``(EnvIntentSpec, raw_response)`` tuple. The raw text is
+            useful for debugging when validation rejects the parsed
+            JSON (or for inspecting the model's reasoning chain).
 
         Raises:
-            AgentResponseParseError: when the response can't be parsed as a
-                JSON object (no opening brace, unbalanced braces).
-            pydantic.ValidationError: when the parsed JSON is well-formed
-                but doesn't match the EnvIntentSpec schema.
+            RuntimeError: when the model returns an empty response on
+                both ``content`` and ``reasoning_content`` channels
+                (the structured-outputs envelope dropped). Indicates
+                the endpoint or model does not actually honour
+                ``response_format`` — run
+                :meth:`check_structured_output_support` to confirm.
+            json.JSONDecodeError: when the model returned non-JSON
+                text despite the structured-outputs guarantee
+                (vanishingly rare; usually a transport/proxy issue).
+            pydantic.ValidationError: when the parsed JSON is
+                well-formed but violates EnvIntentSpec's semantic
+                constraints (e.g. empty ``tasks`` list).
         """
         catalog_text = catalog_text or build_catalog_text()
         system = self._system_prompt()
-        user = (
-            f"{catalog_text}\n\nUSER PROMPT:\n{prompt}\n\nReturn ONLY a JSON object matching the EnvIntentSpec schema."
-        )
+        user = f"{catalog_text}\n\nUSER PROMPT:\n{prompt}"
 
         resp = self.client.chat.completions.create(
             model=self.model,
@@ -173,70 +214,41 @@ def generate_spec(
                 {"role": "system", "content": system},
                 {"role": "user", "content": user},
             ],
+            response_format={
+                "type": "json_schema",
+                "json_schema": {"name": "EnvIntentSpec", "strict": True, "schema": self._spec_schema},
+            },
             temperature=temperature,
             max_tokens=max_tokens,
         )
-        raw = resp.choices[0].message.content
-        data = self._extract_json(raw)
+        text, route = extract_response_text(resp.choices[0].message)
+        if route == "empty":
+            raise RuntimeError(
+                f"Model {self.model!r} returned an empty structured-outputs envelope. "
+                "Run check_structured_output_support() to verify the endpoint/model "
+                "actually honours response_format=json_schema."
+            )
+        # ``strict=False`` lets json.loads accept unescaped control characters
+        # (e.g. literal tabs) inside JSON strings — DeepSeek-v4-flash is known
+        # to emit these despite the structured-outputs contract. Pydantic's
+        # own ``model_validate_json`` is stricter and would reject them.
+        data = json.loads(text, strict=False)
         spec = EnvIntentSpec.model_validate(data)
-        return spec, raw
-
-    def ping(self) -> str:
-        """Smoke-test the configured endpoint + API key with a minimal request.
-
-        Sends a one-shot chat completion to verify:
-          * the API key authenticates,
-          * the configured model exists at ``base_url``,
-          * the network path is reachable.
-
-        Intended for CI startup probes and local key-setup checks; the
-        success signal is "we got a response without raising". The
-        response *content* is returned for diagnostics but intentionally
-        not asserted on — different models phrase the acknowledgment
-        differently, and a quirky reply still means the wire is working.
-
-        Returns:
-            The model's response text (typically "OK" or similar). Empty
-            string if the model returned no content (still a successful
-            round-trip).
-
-        Raises:
-            Any exception raised by the underlying ``openai`` client.
-            Common ones at this layer are ``AuthenticationError``
-            (bad key), ``NotFoundError`` (wrong ``model``),
-            ``APIConnectionError`` (unreachable endpoint), and
-            ``RateLimitError`` (quota exhausted). Callers typically
-            ``except Exception`` here and report the failure to the
-            operator.
-
-        Example:
-            >>> agent = EnvGenAgent()
-            >>> try:
-            ...     agent.ping()
-            ... except Exception as e:
-            ...     sys.exit(f"Agent endpoint health-check failed: {e}")
-        """
-        resp = self.client.chat.completions.create(
-            model=self.model,
-            messages=[{"role": "user", "content": "Respond with exactly: OK"}],
-            temperature=0,
-            max_tokens=8,
-        )
-        return resp.choices[0].message.content or ""
+        return spec, text
 
     def _system_prompt(self) -> str:
-        schema = json.dumps(EnvIntentSpec.model_json_schema(), indent=2)
         # Per-field guidance (what each field means, enum members, default
         # behaviours) lives on the ``Field(description=...)`` entries in
-        # env_intent_spec.py and is surfaced to the agent via the SCHEMA block
-        # below. Only cross-cutting rules (those that span multiple fields
-        # or change agent output behaviour globally) and few-shot examples
-        # belong here.
+        # env_intent_spec.py and is surfaced to the agent via the SCHEMA
+        # the structured-outputs API embeds in every request. Only
+        # cross-cutting rules and few-shot examples belong here. The
+        # "emit ONLY JSON" instruction is intentionally absent —
+        # structured outputs enforce the envelope at the wire level.
         return (
             "You are an env-generation parser for robot manipulation tasks.\n"
-            "Convert a natural-language prompt into an EnvIntentSpec JSON object that matches the schema below.\n\n"
+            "Convert a natural-language prompt into an EnvIntentSpec.\n\n"
             "GUIDANCE:\n"
-            "- Follow the per-field ``description`` strings in SCHEMA for what each field expects.\n"
+            "- Follow the per-field ``description`` strings in the schema for what each field expects.\n"
             "- If the prompt does not specify a value for an optional field, output null.\n"
             "  Do NOT hallucinate values — the resolver tolerates nulls; it cannot fix invented data.\n"
             "- Articulated objects (microwave, fridge, cabinet) still need a spatial\n"
@@ -251,40 +263,4 @@ def _system_prompt(self) -> str:
             '                  "description": "open the microwave door"}\n'
             '    * Close door: {"kind": "close_door", "subject": "microwave", "target": null,\n'
             '                   "description": "close the microwave door"}\n'
-            "- Emit ONLY the JSON object. No prose, no markdown fences.\n\n"
-            f"SCHEMA:\n{schema}"
         )
-
-    @staticmethod
-    def _extract_json(content: str) -> dict:
-        content = content.strip()
-        if content.startswith("```"):
-            lines = content.split("\n")
-            if lines and lines[0].startswith("```"):
-                lines = lines[1:]
-            if lines and lines[-1].startswith("```"):
-                lines = lines[:-1]
-            content = "\n".join(lines)
-
-        with contextlib.suppress(json.JSONDecodeError):
-            return json.loads(content)
-
-        # ``raise AgentResponseParseError`` rather than ``assert`` so the guard
-        # survives ``python -O`` (which strips asserts), and so callers can
-        # distinguish parse failures from validation failures by exception
-        # type. The truncated raw response is the most useful field for
-        # debugging a misbehaving prompt.
-        start = content.find("{")
-        if start == -1:
-            raise AgentResponseParseError(
-                f"No JSON object found in agent response: {content[:_RAW_RESPONSE_PREVIEW_CHARS]!r}"
-            )
-        depth = 0
-        for i in range(start, len(content)):
-            if content[i] == "{":
-                depth += 1
-            elif content[i] == "}":
-                depth -= 1
-                if depth == 0:
-                    return json.loads(content[start : i + 1])
-        raise AgentResponseParseError(f"Unbalanced braces in agent response: {content[:_RAW_RESPONSE_PREVIEW_CHARS]!r}")
diff --git a/isaaclab_arena/environments/agentic_env_gen/structured_output_utils.py b/isaaclab_arena/environments/agentic_env_gen/structured_output_utils.py
new file mode 100644
index 000000000..69f542f47
--- /dev/null
+++ b/isaaclab_arena/environments/agentic_env_gen/structured_output_utils.py
@@ -0,0 +1,283 @@
+# Copyright (c) 2026, The Isaac Lab Arena Project Developers (https://github.com/isaac-sim/IsaacLab-Arena/blob/main/CONTRIBUTORS.md).
+# All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""Utilities for OpenAI-compatible structured outputs (``response_format=json_schema``).
+
+The functions here are the building blocks the env-gen agent uses to
+send strict-mode-compatible schemas, handle provider-specific response
+routing (NVIDIA DeepSeek's ``reasoning_content`` quirk), and probe a
+candidate model's structured-output capability before deployment.
+
+They are intentionally pydantic-model-agnostic: pass any
+``pydantic.BaseModel`` subclass as ``spec_class`` and the utility
+adapts. The agent module wires :class:`EnvIntentSpec` in as the
+production default.
+"""
+
+from __future__ import annotations
+
+import copy
+import json
+from dataclasses import dataclass
+from typing import Any
+
+from pydantic import BaseModel
+
+# Truncate echoed response payloads in diagnostic results to this many
+# characters — long enough to diagnose a failure, short enough to keep
+# error messages and probe results readable.
+_RESPONSE_PREVIEW_CHARS = 500
+
+
+@dataclass(frozen=True)
+class StructuredOutputSupport:
+    """Result of probing a model for structured-outputs capability.
+
+    The probe sends a one-shot request asking the configured model to return
+    a payload matching ``spec_class``'s strict schema. Fields:
+
+      * ``supported``: True iff a valid ``spec_class`` instance came back
+        end-to-end (wire ok, schema honoured, pydantic validation passed).
+      * ``model``: the model identifier that was probed.
+      * ``finish_reason``: the first choice's ``finish_reason``; ``None``
+        when the API call itself raised.
+      * ``response_route``: which channel held the structured output
+        (``"content"`` for OpenAI-compatible models, ``"reasoning_content"``
+        for NVIDIA DeepSeek, ``"empty"`` when no text came back).
+      * ``api_error`` / ``parse_error``: exactly one is set when
+        ``supported`` is False, so the caller can attribute the failure.
+      * ``sample_payload``: the response text truncated to
+        ``_RESPONSE_PREVIEW_CHARS`` characters; ``None`` when there was none.
+    """
+
+    supported: bool
+    model: str
+    finish_reason: str | None
+    response_route: str
+    api_error: str | None
+    parse_error: str | None
+    sample_payload: str | None
+
+
+def build_strict_schema(model_cls: type[BaseModel]) -> dict[str, Any]:
+    """Return ``model_cls``'s JSON schema munged for OpenAI strict mode.
+
+    OpenAI's structured outputs strict mode (and AWS Bedrock's
+    Anthropic models, which surface the same constraint) require:
+
+      * ``additionalProperties: false`` on every object schema.
+      * Every property listed in ``required`` (use a nullable type
+        union — e.g. ``str | None`` — for fields that should be
+        emittable as ``null``).
+      * No ``default`` keys in the schema (defaults are nonsensical
+        when every field is required).
+
+    Pydantic's ``model_json_schema()`` does not guarantee these on its own,
+    so :func:`apply_strict_constraints` deep-walks the schema and enforces
+    all three, letting it pass both NVIDIA and Bedrock validation.
+
+    The returned dict is a deep copy — mutating it never leaks back
+    into pydantic's internal schema cache.
+    """
+    schema = copy.deepcopy(model_cls.model_json_schema())
+    apply_strict_constraints(schema)
+    return schema
+
+
+def apply_strict_constraints(node: Any) -> None:
+    """Recursively apply OpenAI strict-mode constraints to a JSON-schema node.
+
+    Mutates ``node`` in place; idempotent. Name-to-schema maps (``properties``,
+    ``$defs``) are not schemas, so a field literally named ``default`` survives.
+    """
+    if isinstance(node, list):
+        for item in node:
+            apply_strict_constraints(item)
+    elif isinstance(node, dict):
+        if node.get("type") == "object" and isinstance(node.get("properties"), dict):
+            node["additionalProperties"] = False
+            node["required"] = list(node["properties"])
+        # Strict mode forbids ``default`` (every field is required); drop it.
+        node.pop("default", None)
+        for key, child in node.items():
+            is_name_map = key in ("properties", "$defs", "definitions") and isinstance(child, dict)
+            apply_strict_constraints(list(child.values()) if is_name_map else child)
+
+
+def ping(client: Any, model: str) -> str:
+    """Send a tiny chat completion to confirm the endpoint, key and model work.
+
+    A single plain request (no ``response_format``) is issued. Getting any
+    response back without an exception shows that:
+
+      * the API key is accepted,
+      * ``model`` is known at the client's ``base_url``,
+      * the endpoint can be reached over the network.
+
+    Meant for CI startup probes and fail-fast checks in constructors.
+    The reply text is handed back purely for diagnostics and is never
+    asserted on: models word the acknowledgment differently, and an odd
+    reply still proves the wire works.
+
+    This is the *cheap* check. Combine it with
+    :func:`check_structured_output_support` for full deployment
+    validation — ``ping`` covers the wire, the probe covers the model's
+    ability to emit structured outputs.
+
+    Args:
+        client: An OpenAI-compatible client (typically
+            ``openai.OpenAI`` or a compatible mock).
+        model: Model identifier passed through to
+            ``client.chat.completions.create(model=...)``.
+
+    Returns:
+        The text of the first choice (typically "OK" or similar), or
+        an empty string when the model sent back no content — that is
+        still a successful round-trip.
+
+    Raises:
+        Any exception raised by the underlying ``openai`` client.
+        Typical ones here are ``AuthenticationError`` (bad key),
+        ``NotFoundError`` (wrong ``model``), ``APIConnectionError``
+        (unreachable endpoint) and ``RateLimitError`` (quota exhausted).
+    """
+    probe_messages = [{"role": "user", "content": "Respond with exactly: OK"}]
+    completion = client.chat.completions.create(
+        model=model,
+        messages=probe_messages,
+        temperature=0,
+        max_tokens=8,
+    )
+    reply = completion.choices[0].message.content
+    return reply or ""
+
+
+def extract_response_text(message: Any) -> tuple[str, str]:
+    """Return the structured-output text of a chat-completion message and its channel.
+
+    Channels are tried in order and the first non-empty one wins. The
+    result is ``(text, route)``, with ``route`` being one of:
+
+      * ``"content"`` — the standard OpenAI-compatible channel.
+      * ``"reasoning_content"`` — NVIDIA DeepSeek's provider-specific
+        channel, where that model emits structured outputs instead of
+        ``content``. It is treated as equivalent.
+      * ``"empty"`` — neither channel held any text (missing, ``None``
+        or empty); ``text`` is ``""`` and the caller should surface a
+        clear error.
+    """
+    for route in ("content", "reasoning_content"):
+        text = getattr(message, route, None)
+        if text:
+            return text, route
+    return "", "empty"
+
+
+def check_structured_output_support(
+    client: Any,
+    model: str,
+    spec_class: type[BaseModel],
+) -> StructuredOutputSupport:
+    """Probe whether ``model`` can produce ``spec_class``-shaped structured outputs.
+
+    Sends a single chat-completion against ``client`` with
+    ``response_format=json_schema`` carrying ``spec_class``'s strict schema
+    and a minimal user prompt asking the model to fabricate a valid
+    instance. Reports diagnostics rather than raising so deployment
+    validators can decide how to react (warn, fall back, abort).
+
+    Two failure modes are reported separately:
+
+      * ``api_error`` — the request was rejected at the wire
+        (400/401/etc). The endpoint or its proxy doesn't understand
+        ``response_format``, or the schema violates a
+        provider-specific constraint (e.g. Bedrock requiring
+        ``additionalProperties: false`` everywhere — we munge for
+        this, but other constraints can still surface here).
+      * ``parse_error`` — the request succeeded, but the response had
+        no choice / no text on either channel, or the text doesn't
+        parse as JSON or doesn't validate against the schema.
+
+    Args:
+        client: An OpenAI-compatible client (typically ``openai.OpenAI`` or a compatible mock).
+        model: Model identifier as understood by the client's base_url. Forwarded verbatim
+            to ``client.chat.completions.create(model=...)``.
+        spec_class: The pydantic model whose strict schema will be
+            sent to the endpoint.
+
+    Returns:
+        A :class:`StructuredOutputSupport` capturing the outcome.
+    """
+    schema = build_strict_schema(spec_class)
+    # The user prompt is deliberately content-free; the schema itself
+    # plus the system prompt below carry all the structural
+    # information. We just want a valid envelope back.
+    system = (
+        f"Return a valid {spec_class.__name__} JSON object. Every required field must be "
+        "populated — use realistic dummy values where the prompt doesn't specify one."
+    )
+    try:
+        resp = client.chat.completions.create(
+            model=model,
+            messages=[
+                {"role": "system", "content": system},
+                {"role": "user", "content": "Generate a minimal valid example."},
+            ],
+            response_format={
+                "type": "json_schema",
+                "json_schema": {"name": spec_class.__name__, "strict": True, "schema": schema},
+            },
+            temperature=0,
+            max_tokens=2000,
+        )
+    except Exception as exc:
+        return StructuredOutputSupport(
+            supported=False,
+            model=model,
+            finish_reason=None,
+            response_route="empty",
+            api_error=f"{type(exc).__name__}: {str(exc)[:_RESPONSE_PREVIEW_CHARS]}",
+            parse_error=None,
+            sample_payload=None,
+        )
+
+    # A successful HTTP round-trip can still carry an empty or missing
+    # ``choices`` list; report that as an empty envelope instead of raising.
+    choice = resp.choices[0] if getattr(resp, "choices", None) else None
+    finish_reason = getattr(choice, "finish_reason", None)
+    text, route = extract_response_text(getattr(choice, "message", None))
+    sample = text[:_RESPONSE_PREVIEW_CHARS] if text else None
+    if not text:
+        return StructuredOutputSupport(
+            supported=False,
+            model=model,
+            finish_reason=finish_reason,
+            response_route=route,
+            api_error=None,
+            parse_error="Model returned an empty envelope on both content and reasoning_content.",
+            sample_payload=None,
+        )
+    try:
+        data = json.loads(text, strict=False)
+        spec_class.model_validate(data)
+    except Exception as exc:
+        return StructuredOutputSupport(
+            supported=False,
+            model=model,
+            finish_reason=finish_reason,
+            response_route=route,
+            api_error=None,
+            parse_error=f"{type(exc).__name__}: {str(exc)[:_RESPONSE_PREVIEW_CHARS]}",
+            sample_payload=sample,
+        )
+    return StructuredOutputSupport(
+        supported=True,
+        model=model,
+        finish_reason=finish_reason,
+        response_route=route,
+        api_error=None,
+        parse_error=None,
+        sample_payload=sample,
+    )
diff --git a/isaaclab_arena/tests/test_env_gen_agent.py b/isaaclab_arena/tests/test_env_gen_agent.py
index ea63f2539..2624374f8 100644
--- a/isaaclab_arena/tests/test_env_gen_agent.py
+++ b/isaaclab_arena/tests/test_env_gen_agent.py
@@ -5,54 +5,79 @@
 
 """Unit tests for :class:`isaaclab_arena.environments.agentic_env_gen.env_gen_agent.EnvGenAgent`.
 
-The agent's behaviour decomposes into four pure-Python concerns that we exercise
-without ever hitting the wire:
-
-* ``__init__`` argument / env-var precedence and the missing-key guard.
-* ``_extract_json`` parsing of well-behaved, fenced, prosed, and malformed
-  agent responses (including the ``AgentResponseParseError`` → ``ValueError`` MRO
-  contract so callers can still ``except ValueError``).
-* ``generate_spec`` / ``ping`` — the openai client is replaced with a
+The agent's behaviour decomposes into three agent-level concerns that
+we exercise without ever hitting the wire:
+
+* ``__init__`` argument / env-var precedence, the missing-key guard,
+  and the two constructor-time validations (``ping`` then
+  ``check_structured_output_support``) that convert late wire /
+  capability failures into fail-fast errors.
+* ``generate_spec`` — the openai client is replaced with a
   ``MagicMock`` so we assert on the request shape (model, messages,
-  temperature, max_tokens) and the error-propagation contract.
-* ``_system_prompt`` is asserted to enumerate every ``RelationKind`` /
-  ``TaskKind`` literal so prompt and schema cannot drift apart silently.
+  ``response_format``, temperature, max_tokens) and the
+  error-propagation contract.
+* ``_system_prompt`` keeps its cross-cutting guidance intact;
+  per-field schema details ride on the wire via
+  ``response_format=json_schema`` rather than the prompt text.
+
+Schema munging, the ``ping`` and ``check_structured_output_support``
+helpers, and their failure-mode coverage all live in
+:mod:`test_structured_output_utils`.
 """
 
 from __future__ import annotations
 
 import json
-from typing import get_args
 from unittest.mock import MagicMock, patch
 
 import pytest
 from pydantic import ValidationError
 
-from isaaclab_arena.environments.agentic_env_gen.env_gen_agent import (
-    _RAW_RESPONSE_PREVIEW_CHARS,
-    DEFAULT_BASE_URL,
-    DEFAULT_MODEL,
-    AgentResponseParseError,
-    EnvGenAgent,
-)
-from isaaclab_arena.environments.agentic_env_gen.env_intent_spec import RelationKind, TaskKind
+from isaaclab_arena.environments.agentic_env_gen.env_gen_agent import DEFAULT_BASE_URL, DEFAULT_MODEL, EnvGenAgent
+from isaaclab_arena.environments.agentic_env_gen.structured_output_utils import apply_strict_constraints
 
 # ---------------------------------------------------------------------------
 # Fixtures
 # ---------------------------------------------------------------------------
 
 
+def _chat_response(content: str | None = None, reasoning_content: str | None = None, finish_reason: str = "stop"):
+    """Return a mock shaped like an openai chat-completion response with one choice.
+
+    ``content`` and ``reasoning_content`` are set independently, so a test can
+    mimic models (e.g. NVIDIA DeepSeek) that leave ``content`` empty.
+    """
+    choice = MagicMock()
+    choice.finish_reason = finish_reason
+    choice.message.content = content
+    choice.message.reasoning_content = reasoning_content
+    resp = MagicMock()
+    resp.choices = [choice]
+    return resp
+
+
 @pytest.fixture
 def stub_openai():
-    """Patch the ``openai.OpenAI`` constructor so ``EnvGenAgent()`` never hits the wire.
+    """Patch ``openai.OpenAI`` so ``EnvGenAgent()`` never hits the wire.
 
     The agent does a deferred ``from openai import OpenAI`` inside
-    ``__init__`` to avoid pulling the dependency at module import time, so we
-    patch the symbol on the ``openai`` module itself rather than on the
-    ``env_gen_agent`` namespace.
+    ``__init__`` to avoid pulling the dependency at module import
+    time, so we patch the symbol on the ``openai`` module itself.
+
+    The patched client is pre-loaded to satisfy the two constructor
+    probes (cheap ``ping`` then full structured-output check):
+    ``side_effect`` returns an "OK" ping response then a
+    ``_MINIMAL_SPEC`` probe response. Tests that want to assert on a
+    failing ``__init__`` reach for ``patch("openai.OpenAI")``
+    directly with a custom ``side_effect``.
     """
     with patch("openai.OpenAI") as mock_cls:
-        mock_cls.return_value = MagicMock()
+        client = MagicMock()
+        client.chat.completions.create.side_effect = [
+            _chat_response(content="OK"),
+            _chat_response(content=json.dumps(_MINIMAL_SPEC)),
+        ]
+        mock_cls.return_value = client
         yield mock_cls
 
 
@@ -60,18 +85,17 @@ def stub_openai():
 def agent(stub_openai):
     """A constructed ``EnvGenAgent`` with a fully mocked openai client.
 
-    Tests should set ``agent.client.chat.completions.create.return_value`` (or
-    ``.side_effect``) to control the simulated agent response.
+    ``__init__``'s two calls (ping + structured-output probe) are
+    served by ``stub_openai``'s pre-loaded ``side_effect``. After
+    construction we *reset the mock* so per-test assertions on
+    ``call_args`` / ``call_count`` start from a clean slate; tests
+    can then set ``.return_value`` (or a fresh ``.side_effect``) to
+    drive whichever method they're exercising.
     """
-    return EnvGenAgent(api_key="test-key")
-
-
-def _chat_response(content: str | None):
-    """Build the nested mock that mimics the openai chat-completion response shape."""
-    resp = MagicMock()
-    resp.choices = [MagicMock()]
-    resp.choices[0].message.content = content
-    return resp
+    a = EnvGenAgent(api_key="test-key")
+    a.client.chat.completions.create.side_effect = None
+    a.client.chat.completions.create.reset_mock()
+    return a
 
 
 # Minimal EnvIntentSpec payload — exercises every required field plus one task so
@@ -86,8 +110,8 @@ def _chat_response(content: str | None):
     "background": "kitchen",
     "embodiment": "franka_ik",
     "items": [
-        {"query": "avocado", "role": "foreground", "category_tags": []},
-        {"query": "bowl", "role": "foreground", "category_tags": []},
+        {"query": "avocado", "role": "foreground", "category_tags": [], "instance_name": None, "scale": None},
+        {"query": "bowl", "role": "foreground", "category_tags": [], "instance_name": None, "scale": None},
     ],
     "initial_scene_graph": [
         {"kind": "on", "subject": "avocado", "target": "kitchen"},
@@ -133,24 +157,33 @@ def test_custom_model_and_base_url(self, stub_openai):
         assert a.model == "custom-model"
         stub_openai.assert_called_once_with(api_key="k", base_url="http://localhost:8000")
 
-    def test_init_pings_to_verify_connection(self, stub_openai):
-        # ``__init__`` is contracted to run a ping round-trip before returning
-        # so a bad key / wrong model / dead endpoint fails at construction time
-        # rather than deep inside the first generate_spec. Locking in the
-        # request shape (single user message, max_tokens=8, temperature=0)
-        # guarantees we don't accidentally inflate the startup cost.
+    def test_init_runs_ping_then_structured_output_probe(self, stub_openai):
+        # ``__init__`` is contracted to run TWO wire checks in order:
+        # (1) the cheap ``ping`` so a dead endpoint / bad key fails before
+        # we spend tokens on (2) the heavier structured-output probe.
+        # Asserting the order matters because reversing it would waste a
+        # full schema probe on every misconfigured deployment.
         a = EnvGenAgent(api_key="k")
-        a.client.chat.completions.create.assert_called_once()
-        kwargs = a.client.chat.completions.create.call_args.kwargs
-        assert kwargs["temperature"] == 0
-        assert kwargs["max_tokens"] == 8
-        assert len(kwargs["messages"]) == 1
+        assert a.client.chat.completions.create.call_count == 2
+        first, second = a.client.chat.completions.create.call_args_list
+        # First call = ping: small message, no response_format.
+        assert first.kwargs["temperature"] == 0
+        assert first.kwargs["max_tokens"] == 8
+        assert len(first.kwargs["messages"]) == 1
+        assert "response_format" not in first.kwargs
+        # Second call = structured-output probe: carries the EnvIntentSpec
+        # schema, signalling the model has to actually honour
+        # ``response_format=json_schema``.
+        assert second.kwargs["response_format"]["type"] == "json_schema"
+        assert second.kwargs["response_format"]["json_schema"]["name"] == "EnvIntentSpec"
 
     def test_init_propagates_ping_failure(self):
-        # If the openai client raises during the constructor ping (bad key,
-        # unreachable endpoint, ...), the exception must surface from
-        # ``EnvGenAgent()`` itself — not be swallowed into a silently-broken
-        # instance that fails later when generate_spec is called.
+        # If the openai client raises on the FIRST (ping) call — bad key,
+        # unreachable endpoint, etc. — the exception must surface from
+        # ``EnvGenAgent()`` itself, not be swallowed into a silently-broken
+        # instance that fails later when generate_spec is called. The
+        # structured-output probe must NOT be attempted (otherwise we'd
+        # waste a schema-carrying request on a dead wire).
         class FakeAuthError(Exception):
             pass
 
@@ -160,60 +193,44 @@ class FakeAuthError(Exception):
             mock_cls.return_value = client
             with pytest.raises(FakeAuthError, match="bad key"):
                 EnvGenAgent(api_key="k")
-
-
-# ---------------------------------------------------------------------------
-# _extract_json
-# ---------------------------------------------------------------------------
-
-
-class TestExtractJson:
-    def test_plain_json_object(self):
-        assert EnvGenAgent._extract_json('{"a": 1}') == {"a": 1}
-
-    def test_strips_fenced_json_block(self):
-        assert EnvGenAgent._extract_json('```json\n{"a": 1}\n```') == {"a": 1}
-
-    def test_strips_bare_triple_backticks(self):
-        assert EnvGenAgent._extract_json('```\n{"a": 1}\n```') == {"a": 1}
-
-    def test_extracts_object_from_prose(self):
-        text = 'Sure! Here is the JSON: {"a": 1} -- hope that helps.'
-        assert EnvGenAgent._extract_json(text) == {"a": 1}
-
-    def test_handles_nested_braces(self):
-        text = 'prefix {"outer": {"inner": [1, 2, 3]}} suffix'
-        assert EnvGenAgent._extract_json(text) == {"outer": {"inner": [1, 2, 3]}}
-
-    def test_raises_when_no_opening_brace(self):
-        with pytest.raises(AgentResponseParseError, match="No JSON object found"):
-            EnvGenAgent._extract_json("plain text with no braces at all")
-
-    def test_raises_on_unbalanced_braces(self):
-        with pytest.raises(AgentResponseParseError, match="Unbalanced braces"):
-            EnvGenAgent._extract_json('prefix {"a": 1 with no closing brace')
-
-    def test_parse_error_is_a_value_error(self):
-        # MRO contract: AgentResponseParseError subclasses ValueError so existing
-        # ``except ValueError`` clauses (e.g. wrapping model_validate) still
-        # catch parse failures. Asserting via ``except ValueError`` rather than
-        # ``issubclass`` keeps the test grounded in how callers actually use it.
-        with pytest.raises(ValueError):
-            EnvGenAgent._extract_json("no braces here")
-
-    def test_truncates_long_raw_response_in_error(self):
-        # Confirm the preview cap really clips the embedded raw response —
-        # otherwise a megabyte-scale agent hallucination would bury the
-        # stack trace. We allow a small wrapper budget for the surrounding
-        # error message (repr quotes + "No JSON object found in ..." prefix).
-        huge = "x" * 5000
-        with pytest.raises(AgentResponseParseError) as exc_info:
-            EnvGenAgent._extract_json(huge)
-        msg = str(exc_info.value)
-        wrapper_budget = 200
-        assert len(msg) <= _RAW_RESPONSE_PREVIEW_CHARS + wrapper_budget
-        # ...and a 4000-char run from deep inside ``huge`` must not have leaked.
-        assert "x" * 4000 not in msg
+            # Exactly one create() call — the ping. The probe never ran.
+            assert client.chat.completions.create.call_count == 1
+
+    def test_init_raises_when_structured_output_unsupported(self):
+        # The agent is structured-outputs-only — a model that can't honour
+        # ``response_format=json_schema`` is fundamentally unusable. The
+        # constructor must refuse rather than letting downstream
+        # ``generate_spec`` blow up later. The error message must surface
+        # the diagnostic fields from the probe so the operator can attribute
+        # the cause (api_error vs parse_error vs empty envelope).
+        with patch("openai.OpenAI") as mock_cls:
+            client = MagicMock()
+            client.chat.completions.create.side_effect = [
+                _chat_response(content="OK"),  # ping passes
+                _chat_response(content=None, reasoning_content=None),  # probe empty
+            ]
+            mock_cls.return_value = client
+            with pytest.raises(RuntimeError, match="does not support structured outputs") as exc_info:
+                EnvGenAgent(api_key="k")
+            msg = str(exc_info.value)
+            # Empty-envelope probe → parse_error populated, api_error None,
+            # route="empty". All three should appear in the message so the
+            # caller can distinguish from a 4xx (api_error populated) case.
+            assert "parse_error=" in msg
+            assert "api_error=" in msg
+            assert "route='empty'" in msg
+
+    def test_init_caches_strict_schema(self, stub_openai):
+        # The strict schema munging walks ~10 nested object nodes; caching it
+        # on the instance avoids redoing the walk on every generate_spec call.
+        # The cached schema must already be munged — re-running the munger
+        # should be a no-op (idempotent).
+        a = EnvGenAgent(api_key="k")
+        assert isinstance(a._spec_schema, dict)
+        before = json.dumps(a._spec_schema, sort_keys=True)
+        apply_strict_constraints(a._spec_schema)
+        after = json.dumps(a._spec_schema, sort_keys=True)
+        assert before == after
 
 
 # ---------------------------------------------------------------------------
@@ -224,94 +241,85 @@ def test_truncates_long_raw_response_in_error(self):
 class TestGenerateSpec:
     def test_happy_path_returns_spec_and_raw(self, agent):
         raw = json.dumps(_MINIMAL_SPEC)
-        agent.client.chat.completions.create.return_value = _chat_response(raw)
+        agent.client.chat.completions.create.return_value = _chat_response(content=raw)
         spec, returned_raw = agent.generate_spec("avocado on kitchen", catalog_text="catalog")
         assert spec.embodiment == "franka_ik"
         assert spec.background == "kitchen"
         assert len(spec.tasks) == 1
         assert returned_raw == raw
 
-    def test_handles_fenced_response(self, agent):
-        raw = f"```json\n{json.dumps(_MINIMAL_SPEC)}\n```"
-        agent.client.chat.completions.create.return_value = _chat_response(raw)
-        spec, _ = agent.generate_spec("p", catalog_text="catalog")
+    def test_reads_from_reasoning_content_channel(self, agent):
+        # DeepSeek quirk: when structured outputs are requested, the model
+        # puts the JSON in ``reasoning_content`` instead of ``content``.
+        raw = json.dumps(_MINIMAL_SPEC)
+        agent.client.chat.completions.create.return_value = _chat_response(content=None, reasoning_content=raw)
+        spec, returned_raw = agent.generate_spec("p", catalog_text="catalog")
         assert spec.embodiment == "franka_ik"
+        assert returned_raw == raw
 
-    def test_propagates_parse_error_for_garbage_response(self, agent):
-        agent.client.chat.completions.create.return_value = _chat_response("not json at all")
-        with pytest.raises(AgentResponseParseError):
+    def test_request_sets_response_format_to_json_schema(self, agent):
+        agent.client.chat.completions.create.return_value = _chat_response(content=json.dumps(_MINIMAL_SPEC))
+        agent.generate_spec("p", catalog_text="catalog")
+        kwargs = agent.client.chat.completions.create.call_args.kwargs
+        assert kwargs["response_format"]["type"] == "json_schema"
+        assert kwargs["response_format"]["json_schema"]["name"] == "EnvIntentSpec"
+        assert kwargs["response_format"]["json_schema"]["strict"] is True
+        # The schema sent on the wire is the cached, strict-mode-munged copy.
+        assert kwargs["response_format"]["json_schema"]["schema"] is agent._spec_schema
+
+    def test_raises_runtime_error_on_empty_envelope(self, agent):
+        # Both channels empty — the endpoint accepted ``response_format`` but
+        # the model dropped the structured output (the canonical "endpoint
+        # doesn't actually support structured outputs" failure mode).
+        agent.client.chat.completions.create.return_value = _chat_response(content=None, reasoning_content=None)
+        with pytest.raises(RuntimeError, match="empty structured-outputs envelope"):
             agent.generate_spec("p", catalog_text="catalog")
 
+    def test_tolerates_unescaped_control_chars(self, agent):
+        # DeepSeek-v4-flash emits literal tab/newline characters inside JSON
+        # strings despite the structured-outputs contract. Python's default
+        # ``json.loads`` rejects them; we pass ``strict=False`` to accept.
+        payload = dict(_MINIMAL_SPEC)
+        payload["task_description"] = "pick up\tthe\tavocado"
+        raw = json.dumps(payload).replace("\\t", "\t")
+        assert "\t" in raw  # raw payload now has literal tab chars in a string
+        agent.client.chat.completions.create.return_value = _chat_response(content=raw)
+        spec, _ = agent.generate_spec("p", catalog_text="catalog")
+        assert "\t" in spec.task_description
+
     def test_propagates_validation_error_for_schema_violation(self, agent):
-        # Well-formed JSON but missing every required EnvIntentSpec field — pydantic
-        # surfaces this as a ``ValidationError`` distinct from a parse error.
-        agent.client.chat.completions.create.return_value = _chat_response('{"missing": "fields"}')
+        # Well-formed JSON but missing every required EnvIntentSpec field —
+        # pydantic surfaces this as a ``ValidationError`` distinct from a
+        # transport or parse error.
+        agent.client.chat.completions.create.return_value = _chat_response(content='{"missing": "fields"}')
         with pytest.raises(ValidationError):
             agent.generate_spec("p", catalog_text="catalog")
 
     def test_request_uses_configured_model(self, agent):
-        agent.client.chat.completions.create.return_value = _chat_response(json.dumps(_MINIMAL_SPEC))
+        agent.client.chat.completions.create.return_value = _chat_response(content=json.dumps(_MINIMAL_SPEC))
         agent.generate_spec("p", catalog_text="catalog")
         kwargs = agent.client.chat.completions.create.call_args.kwargs
         assert kwargs["model"] == agent.model
 
     def test_forwards_temperature_and_max_tokens(self, agent):
-        agent.client.chat.completions.create.return_value = _chat_response(json.dumps(_MINIMAL_SPEC))
+        agent.client.chat.completions.create.return_value = _chat_response(content=json.dumps(_MINIMAL_SPEC))
         agent.generate_spec("p", catalog_text="catalog", temperature=0.7, max_tokens=500)
         kwargs = agent.client.chat.completions.create.call_args.kwargs
         assert kwargs["temperature"] == 0.7
         assert kwargs["max_tokens"] == 500
 
     def test_user_message_contains_catalog_and_prompt(self, agent):
-        agent.client.chat.completions.create.return_value = _chat_response(json.dumps(_MINIMAL_SPEC))
+        agent.client.chat.completions.create.return_value = _chat_response(content=json.dumps(_MINIMAL_SPEC))
         agent.generate_spec("user wants avocado on kitchen", catalog_text="<<CATALOG-MARKER>>")
         msgs = agent.client.chat.completions.create.call_args.kwargs["messages"]
         assert [m["role"] for m in msgs] == ["system", "user"]
         user_msg = msgs[1]["content"]
         assert "<<CATALOG-MARKER>>" in user_msg
         assert "user wants avocado on kitchen" in user_msg
-        # The "JSON-only" instruction is the contract that lets _extract_json
-        # work — if it disappears the agent tends to wrap in prose.
-        assert "JSON" in user_msg
-
-
-# ---------------------------------------------------------------------------
-# ping
-# ---------------------------------------------------------------------------
-
-
-class TestPing:
-    def test_returns_response_content(self, agent):
-        agent.client.chat.completions.create.return_value = _chat_response("OK")
-        assert agent.ping() == "OK"
-
-    def test_returns_empty_string_when_content_is_none(self, agent):
-        # Some providers return ``None`` content alongside a finish_reason — we
-        # treat that as a successful round-trip (the wire works) rather than
-        # raising, since the caller's contract is "did this raise?".
-        agent.client.chat.completions.create.return_value = _chat_response(None)
-        assert agent.ping() == ""
-
-    def test_uses_minimal_request_params(self, agent):
-        agent.client.chat.completions.create.return_value = _chat_response("OK")
-        agent.ping()
-        kwargs = agent.client.chat.completions.create.call_args.kwargs
-        assert kwargs["model"] == agent.model
-        assert kwargs["temperature"] == 0
-        assert kwargs["max_tokens"] == 8
-        # Single user message — no system prompt / catalog payload. Keeping the
-        # request small is the whole point: ping must stay cheap enough to run
-        # on every CI job startup.
-        assert len(kwargs["messages"]) == 1
-        assert kwargs["messages"][0]["role"] == "user"
-
-    def test_propagates_client_exceptions(self, agent):
-        class FakeAuthError(Exception):
-            pass
-
-        agent.client.chat.completions.create.side_effect = FakeAuthError("invalid api key")
-        with pytest.raises(FakeAuthError, match="invalid api key"):
-            agent.ping()
+        # Under structured outputs the "emit ONLY JSON" instruction is
+        # redundant (and was deliberately dropped) — the wire enforces
+        # the envelope.
+        assert "Return ONLY" not in user_msg
 
 
 # ---------------------------------------------------------------------------
@@ -320,37 +328,32 @@ class FakeAuthError(Exception):
 
 
 class TestSystemPrompt:
-    def test_enumerates_every_relation_kind(self, agent):
-        # The prompt derives its bullet list from ``get_args(RelationKind)``;
-        # this assertion fails the moment someone adds a kind to the literal
-        # without rebuilding the prompt, which would silently teach the agent a
-        # vocabulary the resolver doesn't accept.
-        prompt = agent._system_prompt()
-        for kind in get_args(RelationKind):
-            assert kind in prompt, f"relation kind {kind!r} missing from system prompt"
-
-    def test_enumerates_every_task_kind(self, agent):
-        # Task kinds are quoted in the prompt (JSON-style) to disambiguate from
-        # surrounding prose — keep the quoting in sync with the source.
+    def test_contains_cross_cutting_guidance(self, agent):
+        # Under structured outputs the schema (including every Relation /
+        # Task literal enum) flows to the model via ``response_format``.
+        # The system prompt is reserved for cross-cutting rules that
+        # can't be expressed in the schema — articulated-object anchoring,
+        # distractor anchoring, anti-hallucination directives. Lock those
+        # markers in so a future prompt rewrite can't accidentally drop
+        # them.
         prompt = agent._system_prompt()
-        for kind in get_args(TaskKind):
-            assert f'"{kind}"' in prompt, f"task kind {kind!r} missing from system prompt"
+        for marker in (
+            "Articulated objects",
+            "Distractor items",
+            "Do NOT hallucinate",
+            "pick_and_place",
+            "open_door",
+            "close_door",
+        ):
+            assert marker in prompt, f"system prompt missing required marker {marker!r}"
 
-    def test_embeds_env_intent_spec_schema(self, agent):
-        # We assert on field names rather than diffing the full JSON schema so
-        # the test isn't brittle to pydantic's schema-generation tweaks across
-        # versions.
+    def test_does_not_repeat_response_format_instruction(self, agent):
+        # Belt-and-suspenders: ensure the prompt isn't still telling the
+        # model "emit ONLY JSON" — that instruction is redundant under
+        # structured outputs and the wire enforces it.
         prompt = agent._system_prompt()
-        for field in (
-            "reasoning",
-            "task_description",
-            "background",
-            "embodiment",
-            "items",
-            "initial_scene_graph",
-            "tasks",
-        ):
-            assert field in prompt
+        assert "Emit ONLY" not in prompt
+        assert "ONLY the JSON object" not in prompt
 
 
 # ---------------------------------------------------------------------------
@@ -362,10 +365,11 @@ def test_embeds_env_intent_spec_schema(self, agent):
 def test_generate_spec_against_live_endpoint():
     """End-to-end smoke test against the real OpenAI-compatible endpoint.
 
-    Exercises the full pipeline with default ``model`` / ``base_url`` /
-    system prompt:
+    Exercises the full structured-outputs pipeline with default
+    ``model`` / ``base_url`` / system prompt:
 
-        auth → HTTPS → model response → JSON extract → EnvIntentSpec validation
+        auth → HTTPS → response_format=json_schema → channel fallback
+        → json.loads(strict=False) → EnvIntentSpec.model_validate
 
     Two layers gate this from default ``pytest`` runs:
 
@@ -377,9 +381,16 @@ def test_generate_spec_against_live_endpoint():
     so the test doesn't depend on Isaac Lab asset registration state — we
     only want to validate the agent wire here, not the catalog builder.
 
+    The structured-outputs *capability* of the default model is
+    pinned separately by
+    :func:`test_structured_output_utils.test_default_model_supports_structured_output`;
+    this test exercises the higher-level ``generate_spec`` pipeline
+    end-to-end.
+
     Assertions are intentionally loose: we check shape (non-empty raw,
-    non-empty tasks, populated background/embodiment) rather than exact
-    content, since agent output drifts between model versions.
+    non-empty tasks, populated background/embodiment, populated
+    reasoning) rather than exact content, since agent output drifts
+    between model versions.
     """
     agent = EnvGenAgent()
     catalog = (
@@ -397,3 +408,7 @@ def test_generate_spec_against_live_endpoint():
     assert spec.tasks, "EnvIntentSpec must contain at least one task"
     assert spec.background, "EnvIntentSpec.background must be populated"
     assert spec.embodiment, "EnvIntentSpec.embodiment must be populated"
+    # Structured outputs guarantee the forced-CoT reasoning field is
+    # populated — under the old prose-extraction path it could come
+    # back blank if the model wrapped the schema in markdown.
+    assert spec.reasoning, "EnvIntentSpec.reasoning must be populated"
diff --git a/isaaclab_arena/tests/test_structured_output_utils.py b/isaaclab_arena/tests/test_structured_output_utils.py
new file mode 100644
index 000000000..45511d8ba
--- /dev/null
+++ b/isaaclab_arena/tests/test_structured_output_utils.py
@@ -0,0 +1,421 @@
+# Copyright (c) 2026, The Isaac Lab Arena Project Developers (https://github.com/isaac-sim/IsaacLab-Arena/blob/main/CONTRIBUTORS.md).
+# All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""Unit tests for :mod:`isaaclab_arena.environments.agentic_env_gen.structured_output_utils`.
+
+The utility module owns the four concerns that decouple "is this
+endpoint compatible with our structured-outputs contract?" from the
+agent's higher-level pipeline:
+
+* ``build_strict_schema`` / ``apply_strict_constraints`` — schema
+  munging that walks every object node (``$defs``, nested arrays,
+  ``anyOf`` arms) and applies OpenAI strict-mode constraints. Locked
+  in here so a future pydantic version that changes default schema
+  output doesn't silently regress Bedrock compatibility.
+* ``extract_response_text`` — the NVIDIA-DeepSeek-vs-OpenAI channel
+  fallback (``content`` first, then ``reasoning_content``,
+  ``"empty"`` last).
+* ``ping`` — the cheap liveness check, sent without ``response_format``.
+* ``check_structured_output_support`` — the deployment validator's
+  diagnostic probe. Tested with mocks (failure-mode coverage) and
+  against the real default model (so we notice the day NVIDIA's
+  hosted DeepSeek-v4-flash drops structured-output support).
+"""
+
+from __future__ import annotations
+
+import json
+import os
+from unittest.mock import MagicMock
+
+import pytest
+from pydantic import BaseModel
+
+from isaaclab_arena.environments.agentic_env_gen.env_gen_agent import DEFAULT_BASE_URL, DEFAULT_MODEL
+from isaaclab_arena.environments.agentic_env_gen.env_intent_spec import EnvIntentSpec
+from isaaclab_arena.environments.agentic_env_gen.structured_output_utils import (
+    StructuredOutputSupport,
+    apply_strict_constraints,
+    build_strict_schema,
+    check_structured_output_support,
+    extract_response_text,
+    ping,
+)
+
+# ---------------------------------------------------------------------------
+# Fixtures + helpers
+# ---------------------------------------------------------------------------
+
+
+def _chat_response(content: str | None = None, reasoning_content: str | None = None, finish_reason: str = "stop"):
+    """Build a nested mock matching the openai chat-completion response shape."""
+    resp = MagicMock()
+    resp.choices = [MagicMock()]
+    resp.choices[0].finish_reason = finish_reason
+    resp.choices[0].message.content = content
+    resp.choices[0].message.reasoning_content = reasoning_content
+    return resp
+
+
+# Minimal EnvIntentSpec payload that satisfies the ``tasks_must_be_non_empty``
+# validator — reused across the ``check_structured_output_support`` happy-path
+# tests so they exercise the real production schema rather than a toy stub.
+_MINIMAL_SPEC: dict = {
+    "reasoning": (
+        "User wants a pick-and-place: foreground object is 'avocado', "
+        "target container is 'bowl', background is the kitchen table."
+    ),
+    "task_description": "pick up the avocado and place it in the bowl",
+    "background": "kitchen",
+    "embodiment": "franka_ik",
+    "items": [
+        {"query": "avocado", "role": "foreground", "category_tags": [], "instance_name": None, "scale": None},
+        {"query": "bowl", "role": "foreground", "category_tags": [], "instance_name": None, "scale": None},
+    ],
+    "initial_scene_graph": [
+        {"kind": "on", "subject": "avocado", "target": "kitchen"},
+        {"kind": "on", "subject": "bowl", "target": "kitchen"},
+    ],
+    "tasks": [{
+        "kind": "pick_and_place",
+        "subject": "avocado",
+        "target": "bowl",
+        "description": "pick up the avocado and place it in the bowl",
+    }],
+}
+
+
+# ---------------------------------------------------------------------------
+# build_strict_schema / apply_strict_constraints
+# ---------------------------------------------------------------------------
+
+
+class _ToyChild(BaseModel):
+    name: str
+    optional_value: int | None = None
+
+
+class _ToyParent(BaseModel):
+    title: str
+    child: _ToyChild
+    children: list[_ToyChild] = []
+
+
+class TestBuildStrictSchema:
+    def test_root_object_additional_properties_false(self):
+        schema = build_strict_schema(_ToyParent)
+        assert schema["additionalProperties"] is False
+
+    def test_root_object_lists_every_property_as_required(self):
+        schema = build_strict_schema(_ToyParent)
+        assert set(schema["required"]) == {"title", "child", "children"}
+
+    def test_nested_defs_object_also_strict(self):
+        # OpenAI strict mode applies the constraint to *every* object node,
+        # not just the top level — including ``$defs`` entries that get
+        # referenced via ``$ref``. Bedrock in particular rejects the request
+        # if any descendant object schema is missing the marker.
+        schema = build_strict_schema(_ToyParent)
+        defs = schema["$defs"]
+        assert defs["_ToyChild"]["additionalProperties"] is False
+        assert set(defs["_ToyChild"]["required"]) == {"name", "optional_value"}
+
+    def test_defaults_stripped_everywhere(self):
+        # Pydantic emits ``"default": null`` for ``optional_value`` at the
+        # property level; strict mode rejects ``default`` since every field
+        # is required, so no node at any depth may still carry the key.
+        schema = build_strict_schema(_ToyParent)
+        stack = [schema]
+        while stack:
+            node = stack.pop()
+            if isinstance(node, dict):
+                assert "default" not in node, f"unexpected default key in {node!r}"
+                stack.extend(node.values())
+            elif isinstance(node, list):
+                stack.extend(node)
+
+    def test_munging_does_not_mutate_pydantic_cached_schema(self):
+        # ``build_strict_schema`` must not leak its strict-mode edits into
+        # what ``model_json_schema()`` returns afterwards (e.g. through a
+        # shared or cached dict), or other schema consumers would see them.
+        before = json.dumps(_ToyParent.model_json_schema(), sort_keys=True)
+        build_strict_schema(_ToyParent)
+        after = json.dumps(_ToyParent.model_json_schema(), sort_keys=True)
+        assert before == after
+
+    def test_apply_strict_constraints_is_idempotent(self):
+        # Safe to call multiple times — the second pass must be a no-op.
+        # Important because callers may receive an already-munged schema
+        # from a cache and re-apply defensively.
+        schema = build_strict_schema(_ToyParent)
+        snapshot = json.dumps(schema, sort_keys=True)
+        apply_strict_constraints(schema)
+        assert json.dumps(schema, sort_keys=True) == snapshot
+
+    def test_env_intent_spec_munges_clean(self):
+        # The real production schema we ship — confirm every object node
+        # has the strict-mode marker so the wire stays compatible with
+        # Bedrock and any other strict-mode validator users point at.
+        schema = build_strict_schema(EnvIntentSpec)
+
+        def assert_strict(node):
+            if isinstance(node, dict):
+                if node.get("type") == "object" and "properties" in node:
+                    assert node.get("additionalProperties") is False
+                    assert set(node["required"]) == set(node["properties"].keys())
+                for v in node.values():
+                    assert_strict(v)
+            elif isinstance(node, list):
+                for v in node:
+                    assert_strict(v)
+
+        assert_strict(schema)
+
+
+# ---------------------------------------------------------------------------
+# extract_response_text
+# ---------------------------------------------------------------------------
+
+
+class TestExtractResponseText:
+    def test_prefers_content_when_both_populated(self):
+        msg = MagicMock(content='{"a": 1}', reasoning_content='{"b": 2}')
+        text, route = extract_response_text(msg)
+        assert text == '{"a": 1}'
+        assert route == "content"
+
+    def test_falls_back_to_reasoning_content_when_content_empty(self):
+        # NVIDIA DeepSeek-v4-flash routes structured outputs into the
+        # provider-specific ``reasoning_content`` channel and leaves
+        # ``content`` as ``None``. The agent must transparently read either.
+        msg = MagicMock(content=None, reasoning_content='{"b": 2}')
+        text, route = extract_response_text(msg)
+        assert text == '{"b": 2}'
+        assert route == "reasoning_content"
+
+    def test_empty_when_both_channels_blank(self):
+        msg = MagicMock(content=None, reasoning_content=None)
+        text, route = extract_response_text(msg)
+        assert text == ""
+        assert route == "empty"
+
+    def test_empty_when_message_has_no_attrs(self):
+        # Some mock / stub message objects don't define the channels at all;
+        # ``getattr(..., None)`` must still resolve to "empty" rather than
+        # raising AttributeError.
+        msg = object()  # bare object, no attrs
+        text, route = extract_response_text(msg)
+        assert text == ""
+        assert route == "empty"
+
+    def test_treats_empty_string_as_falsy(self):
+        # ``""`` and ``None`` must both route to the fallback (otherwise an
+        # empty content with a populated reasoning_content would never
+        # reach the reasoning channel).
+        msg = MagicMock(content="", reasoning_content='{"b": 2}')
+        text, route = extract_response_text(msg)
+        assert text == '{"b": 2}'
+        assert route == "reasoning_content"
+
+
+# ---------------------------------------------------------------------------
+# ping
+# ---------------------------------------------------------------------------
+
+
+class TestPing:
+    def test_returns_response_content(self):
+        client = MagicMock()
+        client.chat.completions.create.return_value = _chat_response(content="OK")
+        assert ping(client, "any-model") == "OK"
+
+    def test_returns_empty_string_when_content_is_none(self):
+        # Some providers return ``None`` content alongside a finish_reason — we
+        # treat that as a successful round-trip (the wire works) rather than
+        # raising, since the caller's contract is "did this raise?".
+        client = MagicMock()
+        client.chat.completions.create.return_value = _chat_response(content=None)
+        assert ping(client, "any-model") == ""
+
+    def test_uses_minimal_request_params(self):
+        client = MagicMock()
+        client.chat.completions.create.return_value = _chat_response(content="OK")
+        ping(client, "model-name")
+        kwargs = client.chat.completions.create.call_args.kwargs
+        assert kwargs["model"] == "model-name"
+        assert kwargs["temperature"] == 0
+        assert kwargs["max_tokens"] == 8
+        # Single user message — no system prompt / catalog payload. Keeping the
+        # request small is the whole point: ping must stay cheap enough to
+        # gate every agent construction.
+        assert len(kwargs["messages"]) == 1
+        assert kwargs["messages"][0]["role"] == "user"
+        # ping is a structured-outputs-agnostic liveness check; it must NOT
+        # ask the model to honour response_format (otherwise it can't fail
+        # gracefully on models that lack structured-output support, which
+        # defeats the point of having a cheap probe).
+        assert "response_format" not in kwargs
+
+    def test_propagates_client_exceptions(self):
+        class FakeAuthError(Exception):
+            pass
+
+        client = MagicMock()
+        client.chat.completions.create.side_effect = FakeAuthError("invalid api key")
+        with pytest.raises(FakeAuthError, match="invalid api key"):
+            ping(client, "m")
+
+
+# ---------------------------------------------------------------------------
+# check_structured_output_support (mocked)
+# ---------------------------------------------------------------------------
+
+
+class TestCheckStructuredOutputSupport:
+    def test_reports_supported_on_valid_envelope(self):
+        client = MagicMock()
+        client.chat.completions.create.return_value = _chat_response(content=json.dumps(_MINIMAL_SPEC))
+        result = check_structured_output_support(client, "some-model", EnvIntentSpec)
+        assert isinstance(result, StructuredOutputSupport)
+        assert result.supported is True
+        assert result.model == "some-model"
+        assert result.response_route == "content"
+        assert result.api_error is None
+        assert result.parse_error is None
+        assert result.sample_payload  # truncated text echoed for diagnostics
+
+    def test_reports_reasoning_content_route(self):
+        # NVIDIA DeepSeek envelope — the canonical reason this helper
+        # exists. We must report ``supported=True`` AND surface the
+        # route so deployment validators can flag the model as "works
+        # but uses the non-standard channel".
+        client = MagicMock()
+        client.chat.completions.create.return_value = _chat_response(
+            content=None, reasoning_content=json.dumps(_MINIMAL_SPEC)
+        )
+        result = check_structured_output_support(client, "deepseek", EnvIntentSpec)
+        assert result.supported is True
+        assert result.response_route == "reasoning_content"
+
+    def test_reports_api_error_on_4xx(self):
+        # The most common "model doesn't support structured outputs"
+        # signal at the wire level: a 4xx rejecting the
+        # ``response_format`` parameter or the schema. Surface it as
+        # ``api_error``, leave ``parse_error`` empty, so callers can
+        # attribute correctly.
+        class FakeBadRequest(Exception):
+            pass
+
+        client = MagicMock()
+        client.chat.completions.create.side_effect = FakeBadRequest("Error code: 400 - additionalProperties")
+        result = check_structured_output_support(client, "claude", EnvIntentSpec)
+        assert result.supported is False
+        assert result.api_error is not None
+        assert "FakeBadRequest" in result.api_error
+        assert "400" in result.api_error
+        assert result.parse_error is None
+        # On an api_error, no payload is available to echo.
+        assert result.sample_payload is None
+        assert result.finish_reason is None
+
+    def test_reports_parse_error_on_empty_envelope(self):
+        # Wire accepts the request, model produces nothing on either
+        # channel. The endpoint silently dropped the structured output
+        # — the most insidious failure mode, since ``finish_reason``
+        # still reads ``stop``.
+        client = MagicMock()
+        client.chat.completions.create.return_value = _chat_response(content=None, reasoning_content=None)
+        result = check_structured_output_support(client, "broken", EnvIntentSpec)
+        assert result.supported is False
+        assert result.api_error is None
+        assert result.parse_error is not None
+        assert "empty envelope" in result.parse_error
+        assert result.response_route == "empty"
+        assert result.finish_reason == "stop"  # forwarded so callers can correlate
+
+    def test_reports_parse_error_on_invalid_json(self):
+        client = MagicMock()
+        client.chat.completions.create.return_value = _chat_response(content="not json")
+        result = check_structured_output_support(client, "m", EnvIntentSpec)
+        assert result.supported is False
+        assert result.parse_error is not None
+        assert "JSONDecodeError" in result.parse_error
+        assert result.sample_payload == "not json"
+
+    def test_reports_parse_error_on_validation_failure(self):
+        # JSON parses fine, but doesn't match the schema. The probe
+        # exists to detect this exact class of "model returns
+        # something, but it's wrong" failure.
+        client = MagicMock()
+        client.chat.completions.create.return_value = _chat_response(content='{"missing": "fields"}')
+        result = check_structured_output_support(client, "m", EnvIntentSpec)
+        assert result.supported is False
+        assert result.parse_error is not None
+        assert "ValidationError" in result.parse_error
+
+    def test_request_shape(self):
+        client = MagicMock()
+        client.chat.completions.create.return_value = _chat_response(content=json.dumps(_MINIMAL_SPEC))
+        check_structured_output_support(client, "model-name", EnvIntentSpec)
+        kwargs = client.chat.completions.create.call_args.kwargs
+        assert kwargs["model"] == "model-name"
+        assert kwargs["temperature"] == 0
+        assert kwargs["response_format"]["type"] == "json_schema"
+        assert kwargs["response_format"]["json_schema"]["name"] == "EnvIntentSpec"
+        assert kwargs["response_format"]["json_schema"]["strict"] is True
+        # The schema sent on the wire must already be munged for strict mode
+        # — otherwise Bedrock rejects with 400. Spot-check the root marker.
+        sent_schema = kwargs["response_format"]["json_schema"]["schema"]
+        assert sent_schema["additionalProperties"] is False
+
+    def test_accepts_alternative_spec_class(self):
+        # Callers can probe with a smaller toy spec for cheap model
+        # surveys — the probe shouldn't be hard-wired to EnvIntentSpec.
+        class TinySpec(BaseModel):
+            ok: bool
+
+        client = MagicMock()
+        client.chat.completions.create.return_value = _chat_response(content='{"ok": true}')
+        result = check_structured_output_support(client, "m", TinySpec)
+        assert result.supported is True
+        kwargs = client.chat.completions.create.call_args.kwargs
+        assert kwargs["response_format"]["json_schema"]["name"] == "TinySpec"
+
+
+# ---------------------------------------------------------------------------
+# Live endpoint (opt-in, network + auth required)
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.agent_remote_e2e
+def test_default_model_supports_structured_output():
+    """The default ``EnvGenAgent`` model must support structured outputs.
+
+    This is the gating contract of the whole agent: ``generate_spec``
+    is structured-outputs-only, so the default
+    ``DEFAULT_MODEL`` / ``DEFAULT_BASE_URL`` pair must pass the probe.
+    Failing here means production env-gen is broken — usually because
+    NVIDIA changed which channel DeepSeek-v4-flash routes structured
+    outputs into, or pulled the model from the default-models
+    catalogue.
+
+    Asserts only ``supported=True``; the route may be either
+    ``content`` (standard OpenAI) or ``reasoning_content`` (NVIDIA
+    DeepSeek quirk) — both are handled transparently downstream.
+    """
+    api_key = os.environ.get("NV_API_KEY")
+    assert api_key, "NV_API_KEY env var required to run live tests"
+
+    from openai import OpenAI
+
+    client = OpenAI(api_key=api_key, base_url=DEFAULT_BASE_URL)
+    result = check_structured_output_support(client, DEFAULT_MODEL, EnvIntentSpec)
+    assert result.supported, (
+        f"Default model {result.model!r} does not support structured outputs against "
+        f"{DEFAULT_BASE_URL!r}: api_error={result.api_error!r} "
+        f"parse_error={result.parse_error!r} route={result.response_route!r} "
+        f"payload={result.sample_payload!r}"
+    )
+    assert result.response_route in {"content", "reasoning_content"}

From 5cc62691728d9298e614963847e2ec6a26b631d0 Mon Sep 17 00:00:00 2001
From: Qian Lin <qianl@nvidia.com>
Date: Thu, 28 May 2026 17:56:00 +0800
Subject: [PATCH 35/41] Guard empty choices list in
 check_structured_output_support

Some providers (Azure content-filter trips, Bedrock guardrail
rejections) succeed at the HTTP level but return an empty choices
list. The naive ``resp.choices[0]`` access raised ``IndexError`` and
broke the function's contract of always returning a
``StructuredOutputSupport`` value.

Surface the condition as a ``parse_error`` with
``response_route="empty"`` so callers route it identically to the
existing "empty envelope" case, but with a distinct message
("no choices") so operators can tell the two scenarios apart in logs.

Signed-off-by: Qian Lin <qianl@nvidia.com>
---
 .../structured_output_utils.py                | 23 +++++++++++++++++--
 .../tests/test_structured_output_utils.py     | 22 ++++++++++++++++++
 2 files changed, 43 insertions(+), 2 deletions(-)

diff --git a/isaaclab_arena/environments/agentic_env_gen/structured_output_utils.py b/isaaclab_arena/environments/agentic_env_gen/structured_output_utils.py
index 69f542f47..dae1042d9 100644
--- a/isaaclab_arena/environments/agentic_env_gen/structured_output_utils.py
+++ b/isaaclab_arena/environments/agentic_env_gen/structured_output_utils.py
@@ -246,8 +246,27 @@ def check_structured_output_support(
             sample_payload=None,
         )
 
-    finish_reason = resp.choices[0].finish_reason
-    text, route = extract_response_text(resp.choices[0].message)
+    # Some providers (e.g. Azure content-filter trips, Bedrock guardrail
+    # rejections) succeed at the HTTP level but return an empty ``choices``
+    # list — no candidates were emitted. ``resp.choices[0]`` would raise
+    # ``IndexError`` and break our always-return-a-value contract, so
+    # surface it as a parse_error with a distinct message that operators
+    # can tell apart from the "envelope returned but content empty" case
+    # handled further down.
+    choices = getattr(resp, "choices", None) or []
+    if not choices:
+        return StructuredOutputSupport(
+            supported=False,
+            model=model,
+            finish_reason=None,
+            response_route="empty",
+            api_error=None,
+            parse_error="Response contained no choices (model emitted zero candidates).",
+            sample_payload=None,
+        )
+
+    finish_reason = choices[0].finish_reason
+    text, route = extract_response_text(choices[0].message)
     sample = text[:_RESPONSE_PREVIEW_CHARS] if text else None
     if not text:
         return StructuredOutputSupport(
diff --git a/isaaclab_arena/tests/test_structured_output_utils.py b/isaaclab_arena/tests/test_structured_output_utils.py
index 45511d8ba..d35f86fe0 100644
--- a/isaaclab_arena/tests/test_structured_output_utils.py
+++ b/isaaclab_arena/tests/test_structured_output_utils.py
@@ -335,6 +335,28 @@ def test_reports_parse_error_on_empty_envelope(self):
         assert result.response_route == "empty"
         assert result.finish_reason == "stop"  # forwarded so callers can correlate
 
+    def test_reports_parse_error_when_choices_list_is_empty(self):
+        # Real provider behaviour: HTTP returns 200 OK but ``choices`` is
+        # an empty list. Seen on Azure when a content-filter trips, and
+        # on Bedrock when a guardrail rejects the response post-hoc.
+        # Naive ``resp.choices[0]`` access would IndexError and break
+        # the always-return-a-StructuredOutputSupport contract; the
+        # function must instead surface it as a ``parse_error`` with
+        # ``response_route="empty"`` so callers route it the same way
+        # they route an empty envelope.
+        resp = MagicMock()
+        resp.choices = []
+        client = MagicMock()
+        client.chat.completions.create.return_value = resp
+        result = check_structured_output_support(client, "guardrailed", EnvIntentSpec)
+        assert result.supported is False
+        assert result.api_error is None
+        assert result.parse_error is not None
+        assert "no choices" in result.parse_error
+        assert result.response_route == "empty"
+        assert result.finish_reason is None
+        assert result.sample_payload is None
+
     def test_reports_parse_error_on_invalid_json(self):
         client = MagicMock()
         client.chat.completions.create.return_value = _chat_response(content="not json")

From f92fa73e23a2c306b52bf0ffed5279689a469cc3 Mon Sep 17 00:00:00 2001
From: Qian Lin <qianl@nvidia.com>
Date: Thu, 28 May 2026 18:05:44 +0800
Subject: [PATCH 36/41] Allow empty tasks list in EnvIntentSpec
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Remove the ``_tasks_must_be_non_empty`` model validator so the agent
can legitimately emit an empty ``tasks`` list when the user prompt
describes a task-less scene (static playground / sandbox env). The
arena layer treats empty tasks as a ``NoTask`` null object, so
rejecting the empty list at the schema layer was a redundant guardrail
that forced the agent to invent placeholder tasks for task-less
prompts.

Expand the ``tasks`` field description to teach the agent the new
contract — empty list is valid, prefer it over a placeholder task —
so behaviour doesn't regress now that the validator no longer enforces
non-emptiness.

Signed-off-by: Qian Lin <qianl@nvidia.com>
---
 .../agentic_env_gen/env_intent_spec.py          | 17 +++++++----------
 isaaclab_arena/tests/test_env_gen_agent.py      |  5 ++---
 .../tests/test_structured_output_utils.py       |  4 ++--
 3 files changed, 11 insertions(+), 15 deletions(-)

diff --git a/isaaclab_arena/environments/agentic_env_gen/env_intent_spec.py b/isaaclab_arena/environments/agentic_env_gen/env_intent_spec.py
index ef5de2488..4d3c9147d 100644
--- a/isaaclab_arena/environments/agentic_env_gen/env_intent_spec.py
+++ b/isaaclab_arena/environments/agentic_env_gen/env_intent_spec.py
@@ -15,7 +15,7 @@
 
 from typing import Literal
 
-from pydantic import BaseModel, Field, model_validator
+from pydantic import BaseModel, Field
 
 # Relation kinds currently surfaced to the agent. Mirror the subset of
 # ``ArenaEnvGraphSpatialConstraintType`` that makes sense for tabletop
@@ -194,14 +194,11 @@ class EnvIntentSpec(BaseModel):
         description=(
             "Tasks to execute in sequence. The task sequence implicitly "
             "defines the intermediate env graphs by applying each task's "
-            "transformations in order."
+            "transformations in order. An empty list is valid and means "
+            "the env has no task — at the arena layer this maps to the "
+            "``NoTask`` null object (e.g. a static playground / sandbox "
+            "env). Prefer an empty list over inventing a placeholder "
+            "task when the user prompt genuinely describes a task-less "
+            "scene."
         ),
     )
-
-    @model_validator(mode="after")
-    def _tasks_must_be_non_empty(self) -> EnvIntentSpec:
-        if not self.tasks:
-            raise ValueError(
-                "tasks list is empty — at least one task must be specified to define the env transformation."
-            )
-        return self
diff --git a/isaaclab_arena/tests/test_env_gen_agent.py b/isaaclab_arena/tests/test_env_gen_agent.py
index 2624374f8..31ed459c8 100644
--- a/isaaclab_arena/tests/test_env_gen_agent.py
+++ b/isaaclab_arena/tests/test_env_gen_agent.py
@@ -98,9 +98,8 @@ def agent(stub_openai):
     return a
 
 
-# Minimal EnvIntentSpec payload — exercises every required field plus one task so
-# the ``tasks_must_be_non_empty`` validator passes. Reused across the
-# generate_spec happy-path tests.
+# Minimal EnvIntentSpec payload — exercises every required field plus one
+# task. Reused across the generate_spec happy-path tests.
 _MINIMAL_SPEC: dict = {
     "reasoning": (
         "User wants a pick-and-place: foreground object is 'avocado', "
diff --git a/isaaclab_arena/tests/test_structured_output_utils.py b/isaaclab_arena/tests/test_structured_output_utils.py
index d35f86fe0..c92a50ffd 100644
--- a/isaaclab_arena/tests/test_structured_output_utils.py
+++ b/isaaclab_arena/tests/test_structured_output_utils.py
@@ -59,8 +59,8 @@ def _chat_response(content: str | None = None, reasoning_content: str | None = N
     return resp
 
 
-# Minimal EnvIntentSpec payload that satisfies the ``tasks_must_be_non_empty``
-# validator — reused across the ``check_structured_output_support`` happy-path
+# Minimal EnvIntentSpec payload exercising every required field plus one
+# task — reused across the ``check_structured_output_support`` happy-path
 # tests so they exercise the real production schema rather than a toy stub.
 _MINIMAL_SPEC: dict = {
     "reasoning": (

From 9cd53f46a4aa4ef0f51eabbd89781e5169df9134 Mon Sep 17 00:00:00 2001
From: Qian Lin <qianl@nvidia.com>
Date: Thu, 28 May 2026 19:02:35 +0800
Subject: [PATCH 37/41] Simplify check_structured_output_support to
 bool-or-raise
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The ``StructuredOutputSupport`` dataclass exposed 7 typed signals
(``supported``, ``response_route``, ``finish_reason``, ``api_error``,
``parse_error``, ``sample_payload``, ``model``) but every caller
immediately collapsed it back to a boolean and built its own error
message from the failure fields. Collapse the contract:

* ``check_structured_output_support`` now returns ``True`` on a
  clean round-trip and raises ``RuntimeError`` with a multi-line
  diagnostic on every failure mode. When the failure has an
  originating SDK / parser exception (``BadRequestError``,
  ``JSONDecodeError``, ``ValidationError``, etc.) it is chained via
  ``raise ... from exc`` so the traceback retains the full context;
  the empty-``choices`` and empty-envelope failures have no
  underlying exception and are raised unchained.
* ``EnvGenAgent.__init__`` drops the ``if not support.supported:
  raise RuntimeError(...)`` block — the probe raises the
  diagnostic directly.

The diagnostic is now a multi-line block reporting ``response_route``
/ ``finish_reason`` / ``cause`` / ``sample_payload``, replacing the
one-line ``api_error=`` / ``parse_error=`` / ``route=`` string that
``EnvGenAgent.__init__`` used to build. ``api_error`` and
``parse_error`` merge into a single ``cause`` field (the exception
type prefix — ``BadRequestError:`` vs ``JSONDecodeError:`` vs
``ValidationError:`` — is self-classifying so the split was
redundant). ``base_url`` drops out of the message — the model
name + cause already pin the failure, and the exception chain
preserves the SDK exception's response metadata for callers that
need it.

Signed-off-by: Qian Lin <qianl@nvidia.com>
---
 .../agentic_env_gen/env_gen_agent.py          |  15 +-
 .../structured_output_utils.py                | 173 ++++++++--------
 isaaclab_arena/tests/test_env_gen_agent.py    |  25 ++-
 .../tests/test_structured_output_utils.py     | 193 ++++++++++--------
 4 files changed, 209 insertions(+), 197 deletions(-)

diff --git a/isaaclab_arena/environments/agentic_env_gen/env_gen_agent.py b/isaaclab_arena/environments/agentic_env_gen/env_gen_agent.py
index 63e044a49..76ee1d5c1 100644
--- a/isaaclab_arena/environments/agentic_env_gen/env_gen_agent.py
+++ b/isaaclab_arena/environments/agentic_env_gen/env_gen_agent.py
@@ -140,17 +140,10 @@ def __init__(
         # 2) Structured-output capability check. ``generate_spec`` is
         # structured-outputs-only, so a model that can't honour
         # ``response_format=json_schema`` is fundamentally unusable for
-        # this agent. Surface the failure at construction time with the
-        # full diagnostic payload from the probe (api_error vs
-        # parse_error vs route) so deployment validators can attribute
-        # the cause without grepping logs.
-        support = check_structured_output_support(self.client, self.model, EnvIntentSpec)
-        if not support.supported:
-            raise RuntimeError(
-                f"Model {self.model!r} at {base_url!r} does not support structured outputs: "
-                f"api_error={support.api_error!r} parse_error={support.parse_error!r} "
-                f"route={support.response_route!r}"
-            )
+        # this agent. The probe raises ``RuntimeError`` with a multi-line
+        # diagnostic (route / finish_reason / cause / sample_payload) on
+        # any failure mode — no caller-side wrapping needed.
+        check_structured_output_support(self.client, self.model, EnvIntentSpec)
 
     def generate_spec(
         self,
diff --git a/isaaclab_arena/environments/agentic_env_gen/structured_output_utils.py b/isaaclab_arena/environments/agentic_env_gen/structured_output_utils.py
index dae1042d9..7ea0ed62d 100644
--- a/isaaclab_arena/environments/agentic_env_gen/structured_output_utils.py
+++ b/isaaclab_arena/environments/agentic_env_gen/structured_output_utils.py
@@ -20,7 +20,6 @@
 
 import copy
 import json
-from dataclasses import dataclass
 from typing import Any
 
 from pydantic import BaseModel
@@ -31,34 +30,29 @@
 _RESPONSE_PREVIEW_CHARS = 500
 
 
-@dataclass(frozen=True)
-class StructuredOutputSupport:
-    """Result of probing a model for structured-outputs capability.
-
-    The probe sends a one-shot request asking the configured model to
-    return a payload matching ``spec_class``'s strict schema. The
-    result captures every signal a deployment validator needs to
-    decide "is this model usable?":
-
-      * ``supported``: True iff a valid ``spec_class`` instance came
-        back end-to-end (wire ok, schema honoured, pydantic
-        validation passed).
-      * ``response_route``: which channel held the structured output
-        (``"content"`` for OpenAI-compatible models,
-        ``"reasoning_content"`` for NVIDIA DeepSeek, ``"empty"`` when
-        the model dropped the request).
-      * ``api_error`` / ``parse_error``: filled in (mutually
-        exclusively, in that order) when ``supported`` is False so
-        the caller can attribute the failure correctly.
+def _format_failure_message(
+    *,
+    model: str,
+    response_route: str,
+    finish_reason: str | None,
+    cause: str,
+    sample_payload: str | None,
+) -> str:
+    """Build the multi-line diagnostic message for a structured-output failure.
+
+    The format pairs every signal the probe captured into a layout that
+    grep/CI logs can read at a glance. ``sample_payload`` is the
+    single most useful field — it turns a cryptic ``JSONDecodeError:
+    Expecting value`` into a debuggable failure by showing what the
+    model actually returned (prose preamble? HTML error page? empty?).
     """
-
-    supported: bool
-    model: str
-    finish_reason: str | None
-    response_route: str
-    api_error: str | None
-    parse_error: str | None
-    sample_payload: str | None
+    return (
+        f"Model {model!r} does not support structured outputs:\n"
+        f"  response_route = {response_route!r}\n"
+        f"  finish_reason  = {finish_reason!r}\n"
+        f"  cause          = {cause}\n"
+        f"  sample_payload = {sample_payload!r}"
+    )
 
 
 def build_strict_schema(model_cls: type[BaseModel]) -> dict[str, Any]:
@@ -179,27 +173,21 @@ def check_structured_output_support(
     client: Any,
     model: str,
     spec_class: type[BaseModel],
-) -> StructuredOutputSupport:
+) -> bool:
     """Probe whether ``model`` can produce ``spec_class``-shaped structured outputs.
 
     Sends a single chat-completion against ``client`` with
     ``response_format=json_schema`` carrying ``spec_class``'s strict
     schema and a minimal user prompt asking the model to fabricate a
-    valid instance. Reports diagnostics rather than raising so
-    deployment validators can decide how to react (warn, fall back,
-    abort).
-
-    Two failure modes are reported separately:
-
-      * ``api_error`` — the request was rejected at the wire
-        (400/401/etc). The endpoint or its proxy doesn't understand
-        ``response_format``, or the schema violates a
-        provider-specific constraint (e.g. Bedrock requiring
-        ``additionalProperties: false`` everywhere — we munge for
-        this, but other constraints can still surface here).
-      * ``parse_error`` — the request succeeded and the model
-        returned a payload, but it doesn't parse as JSON or doesn't
-        validate against the schema.
+    valid instance. Returns ``True`` if the model successfully
+    produced a valid ``spec_class`` instance end-to-end.
+
+    Every failure mode raises ``RuntimeError`` with a multi-line
+    diagnostic that names the failed channel, ``finish_reason``,
+    the underlying cause, and a preview of the model's response.
+    When the failure has an originating SDK exception (HTTP error,
+    JSONDecodeError, ValidationError) it is chained via
+    ``__cause__`` so the traceback retains the full context.
 
     Args:
         client: An OpenAI-compatible client (typically
@@ -211,7 +199,17 @@ def check_structured_output_support(
             sent to the endpoint.
 
     Returns:
-        A :class:`StructuredOutputSupport` capturing the outcome.
+        ``True`` when the probe round-trips successfully (wire ok,
+        schema honoured, pydantic validation passed).
+
+    Raises:
+        RuntimeError: for any failure mode — API rejection at the
+            wire (400/401/etc.), empty ``choices`` list (Azure
+            content-filter / Bedrock guardrail rejection), empty
+            envelope on both ``content`` and ``reasoning_content``,
+            JSON parse failure, or pydantic schema-validation
+            failure. The exception's ``__cause__`` (when populated)
+            is the originating SDK / parser exception.
     """
     schema = build_strict_schema(spec_class)
     # The user prompt is deliberately content-free; the schema itself
@@ -236,67 +234,58 @@ def check_structured_output_support(
             max_tokens=2000,
         )
     except Exception as exc:
-        return StructuredOutputSupport(
-            supported=False,
-            model=model,
-            finish_reason=None,
-            response_route="empty",
-            api_error=f"{type(exc).__name__}: {str(exc)[:_RESPONSE_PREVIEW_CHARS]}",
-            parse_error=None,
-            sample_payload=None,
-        )
+        raise RuntimeError(
+            _format_failure_message(
+                model=model,
+                response_route="empty",
+                finish_reason=None,
+                cause=f"{type(exc).__name__}: {str(exc)[:_RESPONSE_PREVIEW_CHARS]}",
+                sample_payload=None,
+            )
+        ) from exc
 
     # Some providers (e.g. Azure content-filter trips, Bedrock guardrail
     # rejections) succeed at the HTTP level but return an empty ``choices``
     # list — no candidates were emitted. ``resp.choices[0]`` would raise
-    # ``IndexError`` and break our always-return-a-value contract, so
-    # surface it as a parse_error with a distinct message that operators
-    # can tell apart from the "envelope returned but content empty" case
-    # handled further down.
+    # ``IndexError``; surface it with a distinct ``cause`` message that
+    # operators can tell apart from the "envelope returned but content
+    # empty" case handled further down.
     choices = getattr(resp, "choices", None) or []
     if not choices:
-        return StructuredOutputSupport(
-            supported=False,
-            model=model,
-            finish_reason=None,
-            response_route="empty",
-            api_error=None,
-            parse_error="Response contained no choices (model emitted zero candidates).",
-            sample_payload=None,
+        raise RuntimeError(
+            _format_failure_message(
+                model=model,
+                response_route="empty",
+                finish_reason=None,
+                cause="Response contained no choices (model emitted zero candidates).",
+                sample_payload=None,
+            )
         )
 
     finish_reason = choices[0].finish_reason
     text, route = extract_response_text(choices[0].message)
     sample = text[:_RESPONSE_PREVIEW_CHARS] if text else None
     if not text:
-        return StructuredOutputSupport(
-            supported=False,
-            model=model,
-            finish_reason=finish_reason,
-            response_route=route,
-            api_error=None,
-            parse_error="Model returned an empty envelope on both content and reasoning_content.",
-            sample_payload=None,
+        raise RuntimeError(
+            _format_failure_message(
+                model=model,
+                response_route=route,
+                finish_reason=finish_reason,
+                cause="Model returned an empty envelope on both content and reasoning_content.",
+                sample_payload=None,
+            )
         )
     try:
         data = json.loads(text, strict=False)
         spec_class.model_validate(data)
     except Exception as exc:
-        return StructuredOutputSupport(
-            supported=False,
-            model=model,
-            finish_reason=finish_reason,
-            response_route=route,
-            api_error=None,
-            parse_error=f"{type(exc).__name__}: {str(exc)[:_RESPONSE_PREVIEW_CHARS]}",
-            sample_payload=sample,
-        )
-    return StructuredOutputSupport(
-        supported=True,
-        model=model,
-        finish_reason=finish_reason,
-        response_route=route,
-        api_error=None,
-        parse_error=None,
-        sample_payload=sample,
-    )
+        raise RuntimeError(
+            _format_failure_message(
+                model=model,
+                response_route=route,
+                finish_reason=finish_reason,
+                cause=f"{type(exc).__name__}: {str(exc)[:_RESPONSE_PREVIEW_CHARS]}",
+                sample_payload=sample,
+            )
+        ) from exc
+    return True
diff --git a/isaaclab_arena/tests/test_env_gen_agent.py b/isaaclab_arena/tests/test_env_gen_agent.py
index 31ed459c8..8f834b764 100644
--- a/isaaclab_arena/tests/test_env_gen_agent.py
+++ b/isaaclab_arena/tests/test_env_gen_agent.py
@@ -199,9 +199,12 @@ def test_init_raises_when_structured_output_unsupported(self):
         # The agent is structured-outputs-only — a model that can't honour
         # ``response_format=json_schema`` is fundamentally unusable. The
         # constructor must refuse rather than letting downstream
-        # ``generate_spec`` blow up later. The error message must surface
-        # the diagnostic fields from the probe so the operator can attribute
-        # the cause (api_error vs parse_error vs empty envelope).
+        # ``generate_spec`` blow up later. ``check_structured_output_support``
+        # raises the diagnostic RuntimeError directly, so all the
+        # informative fields are baked into the probe's exception — no
+        # caller-side message construction. This test just confirms the
+        # probe's exception reaches the caller verbatim (no swallow,
+        # no rewrap that drops fields).
         with patch("openai.OpenAI") as mock_cls:
             client = MagicMock()
             client.chat.completions.create.side_effect = [
@@ -212,12 +215,16 @@ def test_init_raises_when_structured_output_unsupported(self):
             with pytest.raises(RuntimeError, match="does not support structured outputs") as exc_info:
                 EnvGenAgent(api_key="k")
             msg = str(exc_info.value)
-            # Empty-envelope probe → parse_error populated, api_error None,
-            # route="empty". All three should appear in the message so the
-            # caller can distinguish from a 4xx (api_error populated) case.
-            assert "parse_error=" in msg
-            assert "api_error=" in msg
-            assert "route='empty'" in msg
+            # Diagnostic fields from the probe must reach the operator —
+            # ``sample_payload`` in particular is what turns cryptic JSON /
+            # validation errors into debuggable failures.
+            assert "response_route" in msg
+            assert "finish_reason" in msg
+            assert "cause" in msg
+            assert "sample_payload" in msg
+            # The empty-envelope route signal — keeps callers able to
+            # attribute "empty" vs "content" vs "reasoning_content".
+            assert "'empty'" in msg
 
     def test_init_caches_strict_schema(self, stub_openai):
         # The strict schema munging walks ~10 nested object nodes; caching it
diff --git a/isaaclab_arena/tests/test_structured_output_utils.py b/isaaclab_arena/tests/test_structured_output_utils.py
index c92a50ffd..aa1646eda 100644
--- a/isaaclab_arena/tests/test_structured_output_utils.py
+++ b/isaaclab_arena/tests/test_structured_output_utils.py
@@ -36,7 +36,6 @@
 from isaaclab_arena.environments.agentic_env_gen.env_gen_agent import DEFAULT_BASE_URL, DEFAULT_MODEL
 from isaaclab_arena.environments.agentic_env_gen.env_intent_spec import EnvIntentSpec
 from isaaclab_arena.environments.agentic_env_gen.structured_output_utils import (
-    StructuredOutputSupport,
     apply_strict_constraints,
     build_strict_schema,
     check_structured_output_support,
@@ -274,108 +273,139 @@ class FakeAuthError(Exception):
 
 
 class TestCheckStructuredOutputSupport:
-    def test_reports_supported_on_valid_envelope(self):
+    """Bool-or-raise contract: returns True on a clean round-trip, raises
+    ``RuntimeError`` with a multi-line diagnostic on every failure mode.
+
+    Between them, the failure-mode tests pin three things:
+      1. ``RuntimeError`` (not the original SDK exception) reaches the
+         caller — so callers have a single exception type to catch.
+      2. The model name appears in the message (the most-grepped field).
+      3. The ``cause`` field carries the upstream classifier
+         (``BadRequestError`` vs ``JSONDecodeError`` vs ``ValidationError``)
+         so the failure attribution survives the wrapping.
+
+    Where the underlying SDK / parser exception is preserved on
+    ``__cause__``, we assert that too — it's what makes
+    ``raise ... from exc`` worth doing.
+    """
+
+    def test_returns_true_on_valid_envelope(self):
         client = MagicMock()
         client.chat.completions.create.return_value = _chat_response(content=json.dumps(_MINIMAL_SPEC))
-        result = check_structured_output_support(client, "some-model", EnvIntentSpec)
-        assert isinstance(result, StructuredOutputSupport)
-        assert result.supported is True
-        assert result.model == "some-model"
-        assert result.response_route == "content"
-        assert result.api_error is None
-        assert result.parse_error is None
-        assert result.sample_payload  # truncated text echoed for diagnostics
-
-    def test_reports_reasoning_content_route(self):
-        # NVIDIA DeepSeek envelope — the canonical reason this helper
-        # exists. We must report ``supported=True`` AND surface the
-        # route so deployment validators can flag the model as "works
-        # but uses the non-standard channel".
+        # The whole public contract collapses to: ``True`` or it raises.
+        # ``is True`` rather than truthy so a future regression that
+        # returns a dict/tuple/etc fails this test.
+        assert check_structured_output_support(client, "some-model", EnvIntentSpec) is True
+
+    def test_returns_true_on_reasoning_content_envelope(self):
+        # NVIDIA DeepSeek envelope — content empty, structured output
+        # on the ``reasoning_content`` channel. Must NOT raise; the
+        # ``extract_response_text`` fallback handles this transparently.
+        # The previous dataclass surfaced ``response_route`` so callers
+        # could distinguish; the new API hides that detail (callers
+        # don't need it — both channels are equivalent for our purposes).
         client = MagicMock()
         client.chat.completions.create.return_value = _chat_response(
             content=None, reasoning_content=json.dumps(_MINIMAL_SPEC)
         )
-        result = check_structured_output_support(client, "deepseek", EnvIntentSpec)
-        assert result.supported is True
-        assert result.response_route == "reasoning_content"
+        assert check_structured_output_support(client, "deepseek", EnvIntentSpec) is True
 
-    def test_reports_api_error_on_4xx(self):
+    def test_raises_on_4xx_with_underlying_exception_chained(self):
         # The most common "model doesn't support structured outputs"
-        # signal at the wire level: a 4xx rejecting the
-        # ``response_format`` parameter or the schema. Surface it as
-        # ``api_error``, leave ``parse_error`` empty, so callers can
-        # attribute correctly.
+        # signal at the wire level: a 4xx rejecting ``response_format``
+        # or the schema. The original SDK exception must reach the
+        # caller via ``__cause__`` so the traceback retains the HTTP
+        # status / body — otherwise debugging "why did construction
+        # fail?" requires re-running locally.
         class FakeBadRequest(Exception):
             pass
 
         client = MagicMock()
-        client.chat.completions.create.side_effect = FakeBadRequest("Error code: 400 - additionalProperties")
-        result = check_structured_output_support(client, "claude", EnvIntentSpec)
-        assert result.supported is False
-        assert result.api_error is not None
-        assert "FakeBadRequest" in result.api_error
-        assert "400" in result.api_error
-        assert result.parse_error is None
-        # On an api_error, no payload is available to echo.
-        assert result.sample_payload is None
-        assert result.finish_reason is None
-
-    def test_reports_parse_error_on_empty_envelope(self):
+        original = FakeBadRequest("Error code: 400 - additionalProperties")
+        client.chat.completions.create.side_effect = original
+        with pytest.raises(RuntimeError, match="does not support structured outputs") as exc_info:
+            check_structured_output_support(client, "claude", EnvIntentSpec)
+        msg = str(exc_info.value)
+        # Model name surfaces (most-grepped field) and the cause type
+        # classifies the failure (4xx wire error, not parse / validation).
+        assert "'claude'" in msg
+        assert "FakeBadRequest" in msg
+        assert "400" in msg
+        # On an api_error there's no response payload to echo.
+        assert "sample_payload = None" in msg
+        # Exception chaining preserves the original for traceback drill-down.
+        assert exc_info.value.__cause__ is original
+
+    def test_raises_on_empty_envelope(self):
         # Wire accepts the request, model produces nothing on either
         # channel. The endpoint silently dropped the structured output
         # — the most insidious failure mode, since ``finish_reason``
-        # still reads ``stop``.
+        # still reads ``stop``. No underlying exception to chain.
         client = MagicMock()
         client.chat.completions.create.return_value = _chat_response(content=None, reasoning_content=None)
-        result = check_structured_output_support(client, "broken", EnvIntentSpec)
-        assert result.supported is False
-        assert result.api_error is None
-        assert result.parse_error is not None
-        assert "empty envelope" in result.parse_error
-        assert result.response_route == "empty"
-        assert result.finish_reason == "stop"  # forwarded so callers can correlate
-
-    def test_reports_parse_error_when_choices_list_is_empty(self):
+        with pytest.raises(RuntimeError, match="does not support structured outputs") as exc_info:
+            check_structured_output_support(client, "broken", EnvIntentSpec)
+        msg = str(exc_info.value)
+        assert "empty envelope" in msg
+        # finish_reason forwarded so the operator can correlate with
+        # provider logs (was it a content-filter stop, a length cap, etc.).
+        assert "finish_reason  = 'stop'" in msg
+        # No upstream exception to chain on this branch (the function
+        # itself synthesises the failure from a structurally-OK response).
+        assert exc_info.value.__cause__ is None
+
+    def test_raises_when_choices_list_is_empty(self):
         # Real provider behaviour: HTTP returns 200 OK but ``choices`` is
         # an empty list. Seen on Azure when a content-filter trips, and
         # on Bedrock when a guardrail rejects the response post-hoc.
         # Naive ``resp.choices[0]`` access would IndexError and break
-        # the always-return-a-StructuredOutputSupport contract; the
-        # function must instead surface it as a ``parse_error`` with
-        # ``response_route="empty"`` so callers route it the same way
-        # they route an empty envelope.
+        # the contract — surface it as a structured RuntimeError with
+        # a distinct ``cause`` message that operators can tell apart
+        # from the "envelope returned but content empty" case.
         resp = MagicMock()
         resp.choices = []
         client = MagicMock()
         client.chat.completions.create.return_value = resp
-        result = check_structured_output_support(client, "guardrailed", EnvIntentSpec)
-        assert result.supported is False
-        assert result.api_error is None
-        assert result.parse_error is not None
-        assert "no choices" in result.parse_error
-        assert result.response_route == "empty"
-        assert result.finish_reason is None
-        assert result.sample_payload is None
-
-    def test_reports_parse_error_on_invalid_json(self):
+        with pytest.raises(RuntimeError, match="does not support structured outputs") as exc_info:
+            check_structured_output_support(client, "guardrailed", EnvIntentSpec)
+        msg = str(exc_info.value)
+        assert "no choices" in msg
+        assert "response_route = 'empty'" in msg
+
+    def test_raises_on_invalid_json_with_payload_preview(self):
+        # The JSON-decode failure is the case where ``sample_payload``
+        # earns its keep — without it the operator sees only
+        # "Expecting value: line 1 column 1" and has to re-run locally
+        # to discover the model emitted a prose preamble. With the
+        # preview in the message the failure is debuggable from CI logs.
         client = MagicMock()
         client.chat.completions.create.return_value = _chat_response(content="not json")
-        result = check_structured_output_support(client, "m", EnvIntentSpec)
-        assert result.supported is False
-        assert result.parse_error is not None
-        assert "JSONDecodeError" in result.parse_error
-        assert result.sample_payload == "not json"
-
-    def test_reports_parse_error_on_validation_failure(self):
+        with pytest.raises(RuntimeError, match="does not support structured outputs") as exc_info:
+            check_structured_output_support(client, "m", EnvIntentSpec)
+        msg = str(exc_info.value)
+        assert "JSONDecodeError" in msg
+        assert "'not json'" in msg  # the literal response preview
+        # Original JSONDecodeError preserved on ``__cause__``.
+        assert exc_info.value.__cause__ is not None
+        assert type(exc_info.value.__cause__).__name__ == "JSONDecodeError"
+
+    def test_raises_on_validation_failure_with_payload_preview(self):
         # JSON parses fine, but doesn't match the schema. The probe
         # exists to detect this exact class of "model returns
-        # something, but it's wrong" failure.
+        # something, but it's wrong" failure. The original
+        # ValidationError chains via ``__cause__`` so ``.errors()``
+        # is still reachable for callers that want the structured
+        # error list.
+        from pydantic import ValidationError
+
         client = MagicMock()
         client.chat.completions.create.return_value = _chat_response(content='{"missing": "fields"}')
-        result = check_structured_output_support(client, "m", EnvIntentSpec)
-        assert result.supported is False
-        assert result.parse_error is not None
-        assert "ValidationError" in result.parse_error
+        with pytest.raises(RuntimeError, match="does not support structured outputs") as exc_info:
+            check_structured_output_support(client, "m", EnvIntentSpec)
+        msg = str(exc_info.value)
+        assert "ValidationError" in msg
+        assert '{"missing": "fields"}' in msg  # payload preview echoed
+        assert isinstance(exc_info.value.__cause__, ValidationError)
 
     def test_request_shape(self):
         client = MagicMock()
@@ -400,8 +430,7 @@ class TinySpec(BaseModel):
 
         client = MagicMock()
         client.chat.completions.create.return_value = _chat_response(content='{"ok": true}')
-        result = check_structured_output_support(client, "m", TinySpec)
-        assert result.supported is True
+        assert check_structured_output_support(client, "m", TinySpec) is True
         kwargs = client.chat.completions.create.call_args.kwargs
         assert kwargs["response_format"]["json_schema"]["name"] == "TinySpec"
 
@@ -423,9 +452,10 @@ def test_default_model_supports_structured_output():
     outputs into, or pulled the model from the default-models
     catalogue.
 
-    Asserts only ``supported=True``; the route may be either
-    ``content`` (standard OpenAI) or ``reasoning_content`` (NVIDIA
-    DeepSeek quirk) — both are handled transparently downstream.
+    The probe's ``RuntimeError`` already carries a multi-line
+    diagnostic (model / route / finish_reason / cause /
+    sample_payload), so test-failure output is self-describing — no
+    extra error-message construction needed here.
     """
     api_key = os.environ.get("NV_API_KEY")
     assert api_key, "NV_API_KEY env var required to run live tests"
@@ -433,11 +463,4 @@ def test_default_model_supports_structured_output():
     from openai import OpenAI
 
     client = OpenAI(api_key=api_key, base_url=DEFAULT_BASE_URL)
-    result = check_structured_output_support(client, DEFAULT_MODEL, EnvIntentSpec)
-    assert result.supported, (
-        f"Default model {result.model!r} does not support structured outputs against "
-        f"{DEFAULT_BASE_URL!r}: api_error={result.api_error!r} "
-        f"parse_error={result.parse_error!r} route={result.response_route!r} "
-        f"payload={result.sample_payload!r}"
-    )
-    assert result.response_route in {"content", "reasoning_content"}
+    assert check_structured_output_support(client, DEFAULT_MODEL, EnvIntentSpec) is True

From bd4293bd8889255e1b80292c95fe76c3df743e42 Mon Sep 17 00:00:00 2001
From: Qian Lin <qianl@nvidia.com>
Date: Thu, 28 May 2026 22:05:56 +0800
Subject: [PATCH 38/41] Mark inference e2e tests flaky; add TODO in code to
 handle retries later

---
 .../environments/agentic_env_gen/env_gen_agent.py   |  8 ++++++++
 .../agentic_env_gen/structured_output_utils.py      | 13 +++++++++++++
 isaaclab_arena/tests/test_env_gen_agent.py          |  9 +++++++++
 .../tests/test_structured_output_utils.py           |  8 ++++++++
 4 files changed, 38 insertions(+)

diff --git a/isaaclab_arena/environments/agentic_env_gen/env_gen_agent.py b/isaaclab_arena/environments/agentic_env_gen/env_gen_agent.py
index 76ee1d5c1..8cb597b1d 100644
--- a/isaaclab_arena/environments/agentic_env_gen/env_gen_agent.py
+++ b/isaaclab_arena/environments/agentic_env_gen/env_gen_agent.py
@@ -201,6 +201,14 @@ def generate_spec(
         system = self._system_prompt()
         user = f"{catalog_text}\n\nUSER PROMPT:\n{prompt}"
 
+        # TODO(qianl): wrap with transient-error retry (exponential backoff
+        # + jitter) for ``APIConnectionError`` / ``APITimeoutError`` / 429
+        # / 5xx, plus self-correction on ``pydantic.ValidationError`` (feed
+        # the .errors() report back to the model so it can fix the violation
+        # on retry). Deterministic 4xx errors must still propagate
+        # immediately. Until then, ``test_generate_spec_against_live_endpoint``
+        # carries ``@pytest.mark.flaky`` to absorb transport-layer hiccups
+        # at the test layer.
         resp = self.client.chat.completions.create(
             model=self.model,
             messages=[
diff --git a/isaaclab_arena/environments/agentic_env_gen/structured_output_utils.py b/isaaclab_arena/environments/agentic_env_gen/structured_output_utils.py
index 7ea0ed62d..03a628339 100644
--- a/isaaclab_arena/environments/agentic_env_gen/structured_output_utils.py
+++ b/isaaclab_arena/environments/agentic_env_gen/structured_output_utils.py
@@ -139,6 +139,11 @@ def ping(client: Any, model: str) -> str:
         ``APIConnectionError`` (unreachable endpoint), and
         ``RateLimitError`` (quota exhausted).
     """
+    # TODO(qianl): wrap with transient-error retry (exponential backoff +
+    # jitter) for ``APIConnectionError`` / ``APITimeoutError`` / 429 / 5xx.
+    # Deterministic errors (401/403/404) must still propagate immediately.
+    # Until then, the affected live tests carry ``@pytest.mark.flaky`` to
+    # absorb intermittent wire-level hiccups at the test layer.
     resp = client.chat.completions.create(
         model=model,
         messages=[{"role": "user", "content": "Respond with exactly: OK"}],
@@ -219,6 +224,14 @@ def check_structured_output_support(
         f"Return a valid {spec_class.__name__} JSON object. Every required field must be "
         "populated — use realistic dummy values where the prompt doesn't specify one."
     )
+    # TODO(qianl): wrap with transient-error retry (exponential backoff +
+    # jitter) for ``APIConnectionError`` / ``APITimeoutError`` / 429 / 5xx,
+    # and for empty-envelope responses (blank ``content`` despite a 200).
+    # Deterministic errors (400/401/403/404/422) must still fail
+    # immediately so genuinely-unsupported endpoints fail fast. The blank
+    # ``content`` case is currently the primary source of e2e flakes —
+    # affected live tests carry ``@pytest.mark.flaky`` as the short-term
+    # mitigation.
     try:
         resp = client.chat.completions.create(
             model=model,
diff --git a/isaaclab_arena/tests/test_env_gen_agent.py b/isaaclab_arena/tests/test_env_gen_agent.py
index 8f834b764..3207351a5 100644
--- a/isaaclab_arena/tests/test_env_gen_agent.py
+++ b/isaaclab_arena/tests/test_env_gen_agent.py
@@ -367,6 +367,15 @@ def test_does_not_repeat_response_format_instruction(self, agent):
 # ---------------------------------------------------------------------------
 
 
+# The test exercises a real wire call against NVIDIA's hosted DeepSeek-v4-flash,
+# which has intermittent quirks under structured outputs (occasional blank
+# content, transient 429 / 5xx, etc.). A single failed attempt does NOT
+# mean ``generate_spec`` is broken — allow up to 2 reruns so the transport
+# layer's intermittency doesn't fail CI. Real breakage will still fail all 3.
+# TODO(qianl): drop the flaky marker once production-side retry is wired
+# into ``generate_spec`` / ``check_structured_output_support`` (see TODOs in
+# env_gen_agent.py and structured_output_utils.py).
+@pytest.mark.flaky(max_runs=3, min_passes=1)
 @pytest.mark.agent_remote_e2e
 def test_generate_spec_against_live_endpoint():
     """End-to-end smoke test against the real OpenAI-compatible endpoint.
diff --git a/isaaclab_arena/tests/test_structured_output_utils.py b/isaaclab_arena/tests/test_structured_output_utils.py
index aa1646eda..458a78643 100644
--- a/isaaclab_arena/tests/test_structured_output_utils.py
+++ b/isaaclab_arena/tests/test_structured_output_utils.py
@@ -440,6 +440,14 @@ class TinySpec(BaseModel):
 # ---------------------------------------------------------------------------
 
 
+# The probe hits a real model on every run. NVIDIA's hosted DeepSeek-v4-flash
+# is intermittently quirky under structured outputs (occasional blank
+# ``content``, transient 429 / 5xx from the proxy, etc.); a single failed
+# attempt does NOT mean the deployment is actually broken. Allow up to 2
+# reruns so a transient blip doesn't fail CI. Real breakage will fail all 3.
+# TODO(qianl): drop the flaky marker once production-side retry is wired
+# into ``check_structured_output_support`` (see TODO in structured_output_utils.py).
+@pytest.mark.flaky(max_runs=3, min_passes=1)
 @pytest.mark.agent_remote_e2e
 def test_default_model_supports_structured_output():
     """The default ``EnvGenAgent`` model must support structured outputs.

From 11fe53e63c2edd26e1139214e0e562e344e7c6f0 Mon Sep 17 00:00:00 2001
From: Qian Lin <qianl@nvidia.com>
Date: Thu, 28 May 2026 23:07:14 +0800
Subject: [PATCH 39/41] Guard empty choices list in ping
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Some provider responses succeed at the HTTP layer (200 OK) but return
an empty ``choices`` list — Azure content-filter trips, Bedrock
guardrail rejections, certain rate-limit shapes. The unguarded
``resp.choices[0]`` would raise ``IndexError``, breaking the
documented ``Raises`` contract and surfacing as an opaque crash
from ``EnvGenAgent.__init__()``.

Mirror the guard already present in ``check_structured_output_support``:
raise a structured ``RuntimeError`` with the model name baked in so
callers see a diagnosable ping failure. Update the ``Raises`` docstring
section to document the new failure mode.

Signed-off-by: Qian Lin <qianl@nvidia.com>
---
 .../structured_output_utils.py                | 22 ++++++++++++++++++-
 .../tests/test_structured_output_utils.py     | 19 ++++++++++++++++
 2 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/isaaclab_arena/environments/agentic_env_gen/structured_output_utils.py b/isaaclab_arena/environments/agentic_env_gen/structured_output_utils.py
index 03a628339..6aeb2f8ed 100644
--- a/isaaclab_arena/environments/agentic_env_gen/structured_output_utils.py
+++ b/isaaclab_arena/environments/agentic_env_gen/structured_output_utils.py
@@ -133,6 +133,13 @@ def ping(client: Any, model: str) -> str:
         round-trip).
 
     Raises:
+        RuntimeError: when the request succeeded at the HTTP level but
+            the response contained no choices (e.g. Azure content-filter
+            trips, Bedrock guardrail rejections, certain rate-limit
+            responses that return 200 OK with an empty ``choices``
+            list). The wire is healthy but the model declined to
+            answer — surfaced as a clean ping failure rather than the
+            ``IndexError`` a naive ``choices[0]`` access would raise.
         Any exception raised by the underlying ``openai`` client.
         Common ones at this layer are ``AuthenticationError``
         (bad key), ``NotFoundError`` (wrong ``model``),
@@ -150,7 +157,20 @@ def ping(client: Any, model: str) -> str:
         temperature=0,
         max_tokens=8,
     )
-    return resp.choices[0].message.content or ""
+    # Mirror the guard in ``check_structured_output_support``: some
+    # providers return HTTP 200 with an empty ``choices`` list (content
+    # filter, guardrail, or rate-limit cases). The unguarded
+    # ``resp.choices[0]`` would raise ``IndexError`` here, breaking the
+    # documented ``Raises`` contract and surfacing as an opaque crash
+    # from ``EnvGenAgent.__init__``. Re-raise as a structured
+    # RuntimeError so callers see a diagnosable ping failure.
+    choices = getattr(resp, "choices", None) or []
+    if not choices:
+        raise RuntimeError(
+            f"ping to model {model!r} returned HTTP 200 with no choices "
+            "(content filter / guardrail / rate-limit response with empty body)."
+        )
+    return choices[0].message.content or ""
 
 
 def extract_response_text(message: Any) -> tuple[str, str]:
diff --git a/isaaclab_arena/tests/test_structured_output_utils.py b/isaaclab_arena/tests/test_structured_output_utils.py
index 458a78643..978eb2cf1 100644
--- a/isaaclab_arena/tests/test_structured_output_utils.py
+++ b/isaaclab_arena/tests/test_structured_output_utils.py
@@ -266,6 +266,25 @@ class FakeAuthError(Exception):
         with pytest.raises(FakeAuthError, match="invalid api key"):
             ping(client, "m")
 
+    def test_raises_runtime_error_when_choices_list_is_empty(self):
+        # Real provider behaviour: HTTP returns 200 OK but ``choices`` is an
+        # empty list. Seen on Azure when a content-filter trips, on Bedrock
+        # when a guardrail rejects post-hoc, and on certain rate-limit
+        # responses. Naive ``resp.choices[0]`` would raise ``IndexError`` —
+        # an opaque crash that breaks the documented ``Raises`` contract.
+        # The function must instead surface a structured ``RuntimeError``
+        # so callers (notably ``EnvGenAgent.__init__``) see a diagnosable
+        # ping failure with the model name baked in.
+        resp = MagicMock()
+        resp.choices = []
+        client = MagicMock()
+        client.chat.completions.create.return_value = resp
+        with pytest.raises(RuntimeError, match="no choices") as exc_info:
+            ping(client, "guardrailed-model")
+        # Model name surfaces in the message — most-grepped field when
+        # triaging a CI ping failure.
+        assert "'guardrailed-model'" in str(exc_info.value)
+
 
 # ---------------------------------------------------------------------------
 # check_structured_output_support (mocked)

From 0a3c30a1e0f8f8b76ee23e4445d13e7b87a18ef1 Mon Sep 17 00:00:00 2001
From: Qian Lin <qianl@nvidia.com>
Date: Thu, 28 May 2026 23:12:03 +0800
Subject: [PATCH 40/41] Add todo for light/hdr images

---
 isaaclab_arena/environments/agentic_env_gen/env_gen_agent.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/isaaclab_arena/environments/agentic_env_gen/env_gen_agent.py b/isaaclab_arena/environments/agentic_env_gen/env_gen_agent.py
index 8cb597b1d..66216215b 100644
--- a/isaaclab_arena/environments/agentic_env_gen/env_gen_agent.py
+++ b/isaaclab_arena/environments/agentic_env_gen/env_gen_agent.py
@@ -34,6 +34,7 @@ def build_catalog_text() -> str:
     backgrounds: list[str] = []
     objects: list[dict] = []
     embodiments: list[str] = []
+    # TODO(qianl): handle optional lights and hdr images.
     for name in registry.get_all_keys():
         cls = registry.get_asset_by_name(name)
         tags = list(getattr(cls, "tags", []))

From dc1ed0d7e9e7d74996a1406fc5d7a2fddda639c4 Mon Sep 17 00:00:00 2001
From: Qian Lin <qianl@nvidia.com>
Date: Thu, 28 May 2026 23:34:16 +0800
Subject: [PATCH 41/41] Add TODO for relation params incompatibility with
 strict mode models

---
 .../environments/agentic_env_gen/env_intent_spec.py           | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/isaaclab_arena/environments/agentic_env_gen/env_intent_spec.py b/isaaclab_arena/environments/agentic_env_gen/env_intent_spec.py
index 4d3c9147d..55995f942 100644
--- a/isaaclab_arena/environments/agentic_env_gen/env_intent_spec.py
+++ b/isaaclab_arena/environments/agentic_env_gen/env_intent_spec.py
@@ -104,6 +104,10 @@ class Relation(BaseModel):
             "unary kinds (is_anchor)."
         ),
     )
+    # TODO(qianl): free-form ``dict`` emits ``additionalProperties: true``,
+    # which strict-mode structured-outputs endpoints (OpenAI strict /
+    # Bedrock-Claude) reject with a 400. The default NVIDIA DeepSeek is
+    # lenient and accepts it, so this is a latent portability landmine.
     params: dict = Field(
         default_factory=dict,
         description="Optional kind-specific parameters; leave empty by default.",