diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 20cbdbddc..90d536ce7 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -156,7 +156,8 @@ jobs:
 
       - name: Run in-process PhysX tests without cameras
         run: |
-          /isaac-sim/python.sh -m pytest -sv --durations=0 -m "not with_cameras and not with_subprocess and not with_newton" \
+          /isaac-sim/python.sh -m pytest -sv --durations=0 \
+            -m "not with_cameras and not with_subprocess and not with_newton and not agent_remote_e2e" \
             isaaclab_arena/tests/
 
       - name: Run GR00T policy/data tests (lightweight gr00t deps only)
@@ -265,6 +266,53 @@ jobs:
             isaaclab_arena_gr00t/tests/test_gr00t_remote_closedloop_policy_runner.py
 
 
+  test_agent_remote_e2e:
+    name: Agent remote E2E
+    runs-on: [self-hosted, gpu-arena]
+    timeout-minutes: 20
+    needs: [pre_commit]
+    env:
+      # NV_API_KEY is the variable EnvGenAgent reads at runtime (see
+      # isaaclab_arena/environments/agentic_env_gen/env_gen_agent.py). The repo-level secret is
+      # named ARENA_NV_API_KEY to mirror ARENA_NGC_API_KEY and avoid
+      # collisions with other consumers of NV_API_KEY in the runner env.
+      NV_API_KEY: ${{ secrets.ARENA_NV_API_KEY }}
+
+    container:
+      image: nvcr.io/nvstaging/isaac-amr/isaaclab_arena:latest
+      credentials:
+        username: $oauthtoken
+        password: ${{ env.NGC_API_KEY }}  # NOTE(review): not set in this job's env; presumably workflow-level — confirm
+
+    steps:
+      # No nvidia-smi / kit cache setup: this job is pure-Python (openai +
+      # pydantic) and never touches Isaac Sim. We reuse the arena image
+      # only because it already has the deps + the /isaac-sim/python.sh
+      # interpreter the rest of the suite calls into.
+      - *install_git_step
+      - *cleanup_step
+      - *mark_repo_safe_step
+      - *checkout_step
+      - *git_lfs_step
+      - *install_project_step
+
+      # Fail loudly when the secret isn't wired up — the test itself
+      # ``skipif``s when NV_API_KEY is empty, so without this guard a
+      # missing secret would silently produce a green job with zero agent
+      # coverage.
+      - name: Verify ARENA_NV_API_KEY is configured
+        run: |
+          if [ -z "${NV_API_KEY}" ]; then
+            echo "::error::ARENA_NV_API_KEY repo secret is not set; cannot run agent_remote_e2e tests."
+            exit 1
+          fi
+
+      - name: Run agent remote E2E test  # selects the marker the in-process PhysX step excludes
+        run: |
+          /isaac-sim/python.sh -m pytest -sv --durations=0 -m agent_remote_e2e \
+            isaaclab_arena/tests/test_env_gen_agent.py
+
+
+
+
   build_docs_pre_merge:
     name: Build the docs (pre-merge)
     runs-on: [self-hosted, gpu-arena]
diff --git a/docker/run_docker.sh b/docker/run_docker.sh
index baeb038cc..c8af3e7ce 100755
--- a/docker/run_docker.sh
+++ b/docker/run_docker.sh
@@ -185,6 +185,12 @@ else
         fi
     fi
 
+    # pass through API keys used by the agentic env-gen prototype; values are
+    # inherited from the host shell so the key never lives in the repo.
+    if [ -n "$NV_API_KEY" ]; then
+        DOCKER_RUN_ARGS+=("--env" "NV_API_KEY")
+    fi
+
     # if gr00t is installed, mount the gr00t directory in case anything needs to change there
     if [ "$INSTALL_GROOT" = "true" ]; then
         DOCKER_RUN_ARGS+=("-v" "./submodules/Isaac-GR00T:${WORKDIR}/submodules/Isaac-GR00T")
diff --git a/isaaclab_arena/environments/agentic_env_gen/__init__.py b/isaaclab_arena/environments/agentic_env_gen/__init__.py
new file mode 100644
index 000000000..16ea4c218
--- /dev/null
+++ b/isaaclab_arena/environments/agentic_env_gen/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) 2026, The Isaac Lab Arena Project Developers (https://github.com/isaac-sim/IsaacLab-Arena/blob/main/CONTRIBUTORS.md).
+# All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
diff --git a/isaaclab_arena/environments/agentic_env_gen/env_gen_agent.py b/isaaclab_arena/environments/agentic_env_gen/env_gen_agent.py
new file mode 100644
index 000000000..8cb597b1d
--- /dev/null
+++ b/isaaclab_arena/environments/agentic_env_gen/env_gen_agent.py
@@ -0,0 +1,267 @@
+# Copyright (c) 2026, The Isaac Lab Arena Project Developers (https://github.com/isaac-sim/IsaacLab-Arena/blob/main/CONTRIBUTORS.md).
+# All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""Agent for parsing natural-language env-generation prompts into an EnvIntentSpec.
+
+Calls an OpenAI-compatible chat-completions endpoint (NVIDIA's hosted
+inference by default) and uses the **structured-outputs** API
+(``response_format={"type": "json_schema", ...}``) so the wire
+guarantees a valid JSON envelope matching EnvIntentSpec. There is no
+prose-parsing fallback — if the configured model/endpoint does not
+support structured outputs, :class:`EnvGenAgent` will refuse to
+construct.
+"""
+
+from __future__ import annotations
+
+import json
+import os
+
+from .env_intent_spec import EnvIntentSpec
+from .structured_output_utils import build_strict_schema, check_structured_output_support, extract_response_text, ping
+
+DEFAULT_BASE_URL = "https://inference-api.nvidia.com"  # default ``base_url`` of EnvGenAgent.__init__
+DEFAULT_MODEL = "nvidia/deepseek-ai/deepseek-v4-flash"  # default ``model`` of EnvGenAgent.__init__
+
+
+def build_catalog_text() -> str:
+    """Introspect AssetRegistry and build the vocabulary the agent is allowed to use."""
+    from isaaclab_arena.assets.registries import AssetRegistry
+
+    registry = AssetRegistry()
+    backgrounds: list[str] = []
+    objects: list[dict] = []
+    embodiments: list[str] = []
+    for name in registry.get_all_keys():
+        cls = registry.get_asset_by_name(name)
+        tags = list(getattr(cls, "tags", []))
+        if "embodiment" in tags:
+            embodiments.append(name)
+        elif "background" in tags:
+            backgrounds.append(name)
+        elif "object" in tags:
+            objects.append({"name": name, "tags": [t for t in tags if t != "object"]})
+
+    obj_lines = "\n".join(f"- {o['name']}  tags={o['tags']}" for o in sorted(objects, key=lambda o: o["name"]))
+    return (
+        f"EMBODIMENTS: {', '.join(sorted(embodiments))}\n\n"
+        f"BACKGROUNDS: {', '.join(sorted(backgrounds))}\n\n"
+        f"OBJECTS ({len(objects)}):\n{obj_lines}"
+    )
+
+
+class EnvGenAgent:
+    """Parses a natural-language env-generation prompt into an EnvIntentSpec.
+
+    The agent is **structured-outputs only**: every call to
+    ``generate_spec`` passes ``response_format={"type": "json_schema",
+    ...}`` to the chat-completions endpoint, and the response is
+    parsed directly as JSON. There is no prose / markdown-fence
+    fallback — if the configured model/endpoint doesn't honour
+    ``response_format``, the constructor raises before the agent is
+    usable.
+    """
+
+    def __init__(
+        self,
+        api_key: str | None = None,
+        model: str = DEFAULT_MODEL,
+        base_url: str = DEFAULT_BASE_URL,
+    ):
+        """Configure the OpenAI-compatible client and validate the model.
+
+        Construction runs two fail-fast wire checks in order:
+
+          1. :func:`.structured_output_utils.ping` — cheap liveness
+             probe (no ``response_format``). Confirms the API key
+             authenticates, the model name resolves at ``base_url``,
+             and the network path is reachable.
+          2. :func:`.structured_output_utils.check_structured_output_support`
+             — sends ``response_format=json_schema`` with the
+             ``EnvIntentSpec`` schema and asserts a valid envelope
+             comes back. Confirms the model actually honours the
+             structured-outputs contract ``generate_spec`` relies on.
+
+        Both run at construction time so a misconfigured model fails
+        immediately with a clear stack — not mid-pipeline inside the
+        first ``generate_spec`` call.
+
+        Args:
+            api_key: Bearer token for the inference endpoint. Falls back
+                to the ``NV_API_KEY`` environment variable when ``None``;
+                raises ``ValueError`` if neither is set.
+            model: Model identifier as understood by the endpoint at
+                ``base_url`` (e.g. ``"nvidia/deepseek-ai/deepseek-v4-flash"``).
+                See https://build.nvidia.com for the catalogue of NVIDIA-hosted
+                models. Must support OpenAI-compatible structured
+                outputs (``response_format=json_schema``) — the
+                constructor validates this and refuses to proceed
+                otherwise.
+            base_url: OpenAI-compatible API root. Defaults to
+                ``DEFAULT_BASE_URL`` (NVIDIA's hosted inference endpoint);
+                override to point at a self-hosted vLLM / Ollama / etc.
+                deployment that exposes the same OpenAI chat-completions
+                wire format.
+
+        Raises:
+            ValueError: when no API key is available (neither argument
+                nor ``NV_API_KEY`` env var).
+            RuntimeError: when the configured model does not support
+                structured outputs (probe came back unsupported).
+            Any exception raised by the underlying ``openai`` client
+                during the ping probe — typically
+                ``AuthenticationError`` (bad key), ``NotFoundError``
+                (wrong model), ``APIConnectionError`` (unreachable
+                endpoint), or ``RateLimitError`` (quota exhausted).
+        """
+        from openai import OpenAI
+
+        self.api_key = api_key or os.getenv("NV_API_KEY")
+        # Use an explicit raise instead of ``assert`` so the guard survives
+        # ``python -O`` (which strips asserts) — missing-key failures must be
+        # loud regardless of interpreter flags.
+        if not self.api_key:
+            raise ValueError("API key required: set NV_API_KEY or pass api_key.")
+        self.model = model
+        self.client = OpenAI(api_key=self.api_key, base_url=base_url)
+        # Cached on the instance because the schema is non-trivial to walk
+        # (~10 nested object nodes) and ``generate_spec`` may be called many
+        # times. Munged once per agent lifetime.
+        self._spec_schema = build_strict_schema(EnvIntentSpec)
+
+        # 1) Cheap liveness probe first. If the wire is down or the key is
+        # bad we don't want to waste tokens on the heavier structured-output
+        # probe below — ``ping`` is the right tool for "is the endpoint
+        # talking to us at all?".
+        ping(self.client, self.model)
+
+        # 2) Structured-output capability check. ``generate_spec`` is
+        # structured-outputs-only, so a model that can't honour
+        # ``response_format=json_schema`` is fundamentally unusable for
+        # this agent. The probe raises ``RuntimeError`` with a multi-line
+        # diagnostic (route / finish_reason / cause / sample_payload) on
+        # any failure mode — no caller-side wrapping needed.
+        check_structured_output_support(self.client, self.model, EnvIntentSpec)
+
+    def generate_spec(
+        self,
+        prompt: str,
+        catalog_text: str | None = None,
+        temperature: float = 0.2,
+        max_tokens: int = 2000,
+    ) -> tuple[EnvIntentSpec, str]:
+        """Call the model and return the parsed EnvIntentSpec plus the raw response.
+
+        Uses OpenAI-compatible structured outputs: the request includes
+        ``response_format={"type": "json_schema", ...}`` with the
+        EnvIntentSpec schema, and the response is parsed directly as
+        JSON. No prose / markdown-fence fallback.
+
+        Args:
+            prompt: Natural-language env description from the end user.
+                Concatenated with the asset catalog to form the chat
+                ``user`` message.
+            catalog_text: Pre-built asset vocabulary (the output of
+                ``build_catalog_text()``). When ``None``, the catalog is
+                rebuilt from the live ``AssetRegistry``. Pass an explicit
+                value to (a) avoid the cost of rebuilding it across
+                repeated calls, or (b) experiment with a restricted /
+                augmented catalog without mutating the registry.
+            temperature: Sampling temperature forwarded to the model. Kept
+                low by default (0.2) because EnvIntentSpec generation is a
+                deterministic-ish translation task — high temperature
+                yields creative but invalid schemas.
+            max_tokens: Hard cap on the response length. Set generously
+                (2000) so multi-task EnvIntentSpecs aren't truncated
+                mid-JSON; shrink if the endpoint enforces a tighter
+                quota.
+
+        Returns:
+            A ``(EnvIntentSpec, raw_response)`` tuple. The raw text is
+            useful for debugging when validation rejects the parsed
+            JSON (or for inspecting the model's reasoning chain).
+
+        Raises:
+            RuntimeError: when the model returns an empty response on
+                both ``content`` and ``reasoning_content`` channels
+                (the structured-outputs envelope dropped). Indicates
+                the endpoint or model does not actually honour
+                ``response_format`` — run
+                :meth:`check_structured_output_support` to confirm.
+            json.JSONDecodeError: when the model returned non-JSON
+                text despite the structured-outputs guarantee
+                (vanishingly rare; usually a transport/proxy issue).
+            pydantic.ValidationError: when the parsed JSON is
+                well-formed but violates EnvIntentSpec's semantic
+                constraints (e.g. empty ``tasks`` list).
+        """
+        catalog_text = catalog_text or build_catalog_text()
+        system = self._system_prompt()
+        user = f"{catalog_text}\n\nUSER PROMPT:\n{prompt}"
+
+        # TODO(qianl): wrap with transient-error retry (exponential backoff
+        # + jitter) for ``APIConnectionError`` / ``APITimeoutError`` / 429
+        # / 5xx, plus self-correction on ``pydantic.ValidationError`` (feed
+        # the .errors() report back to the model so it can fix the violation
+        # on retry). Deterministic 4xx errors must still propagate
+        # immediately. Until then, ``test_generate_spec_against_live_endpoint``
+        # carries ``@pytest.mark.flaky`` to absorb transport-layer hiccups
+        # at the test layer.
+        resp = self.client.chat.completions.create(
+            model=self.model,
+            messages=[
+                {"role": "system", "content": system},
+                {"role": "user", "content": user},
+            ],
+            response_format={
+                "type": "json_schema",
+                "json_schema": {"name": "EnvIntentSpec", "strict": True, "schema": self._spec_schema},
+            },
+            temperature=temperature,
+            max_tokens=max_tokens,
+        )
+        text, route = extract_response_text(resp.choices[0].message)
+        if route == "empty":
+            raise RuntimeError(
+                f"Model {self.model!r} returned an empty structured-outputs envelope. "
+                "Run check_structured_output_support() to verify the endpoint/model "
+                "actually honours response_format=json_schema."
+            )
+        # ``strict=False`` lets json.loads accept unescaped control characters
+        # (e.g. literal tabs) inside JSON strings — DeepSeek-v4-flash is known
+        # to emit these despite the structured-outputs contract. Pydantic's
+        # own ``model_validate_json`` is stricter and would reject them.
+        data = json.loads(text, strict=False)
+        spec = EnvIntentSpec.model_validate(data)
+        return spec, text
+
+    def _system_prompt(self) -> str:
+        # Per-field guidance (what each field means, enum members, default
+        # behaviours) lives on the ``Field(description=...)`` entries in
+        # env_intent_spec.py and is surfaced to the agent via the SCHEMA
+        # the structured-outputs API embeds in every request. Only
+        # cross-cutting rules and few-shot examples belong here. The
+        # "emit ONLY JSON" instruction is intentionally absent —
+        # structured outputs enforce the envelope at the wire level.
+        return (
+            "You are an env-generation parser for robot manipulation tasks.\n"
+            "Convert a natural-language prompt into an EnvIntentSpec.\n\n"
+            "GUIDANCE:\n"
+            "- Follow the per-field ``description`` strings in the schema for what each field expects.\n"
+            "- If the prompt does not specify a value for an optional field, output null.\n"
+            "  Do NOT hallucinate values — the resolver tolerates nulls; it cannot fix invented data.\n"
+            "- Articulated objects (microwave, fridge, cabinet) still need a spatial\n"
+            "  'on(<object>, background)' relation in initial_scene_graph to anchor them; their\n"
+            "  open/close behaviour is expressed via tasks, not via relations.\n"
+            "- Distractor items around the appliance need 'on(distractor, background)' relations\n"
+            "  in initial_scene_graph as well.\n"
+            "- Task examples (showing kind + subject + target + description shape):\n"
+            '    * Pick-and-place: {"kind": "pick_and_place", "subject": "avocado", "target": "bowl",\n'
+            '                       "description": "pick up the avocado and place it in the bowl"}\n'
+            '    * Open door: {"kind": "open_door", "subject": "microwave", "target": null,\n'
+            '                  "description": "open the microwave door"}\n'
+            '    * Close door: {"kind": "close_door", "subject": "microwave", "target": null,\n'
+            '                   "description": "close the microwave door"}\n'
+        )
diff --git a/isaaclab_arena/environments/agentic_env_gen/env_intent_spec.py b/isaaclab_arena/environments/agentic_env_gen/env_intent_spec.py
new file mode 100644
index 000000000..4d3c9147d
--- /dev/null
+++ b/isaaclab_arena/environments/agentic_env_gen/env_intent_spec.py
@@ -0,0 +1,204 @@
+# Copyright (c) 2026, The Isaac Lab Arena Project Developers (https://github.com/isaac-sim/IsaacLab-Arena/blob/main/CONTRIBUTORS.md).
+# All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""Schema the agent must fill in when parsing a natural-language env-generation prompt.
+
+The agent sees a list of the *available* asset tags / embodiment names pulled
+from the registries at call time, and must return an EnvIntentSpec that only uses
+those vocabularies. Concrete asset names are resolved in a second, deterministic
+step — the agent never invents USD paths.
+"""
+
+from __future__ import annotations
+
+from typing import Literal
+
+from pydantic import BaseModel, Field
+
+# Relation kinds currently surfaced to the agent. Mirror the subset of
+# ``ArenaEnvGraphSpatialConstraintType`` that makes sense for tabletop
+# prompts; values must match the enum's values one-to-one because the
+# resolver looks the constraint type up via
+# ``ArenaEnvGraphSpatialConstraintType(kind)`` rather than maintaining a
+# parallel dict. Solver-internal kinds (``position_limits``,
+# ``random_around_solution``, ``rotate_around_solution``) are intentionally
+# omitted — they describe how the placement solver explores poses and are
+# not natural for an agent to emit.
+# "in" has no In class in isaaclab_arena.relations.relations yet — see the
+# TODO there. The downstream env builder materializes goal-state "in"
+# relations as the task's success predicate.
+RelationKind = Literal["on", "in", "next_to", "at_position", "at_pose", "is_anchor"]
+
+ItemRole = Literal["foreground", "distractor", "anchor"]  # each role is explained in Item.role's description
+
+# Task kinds the agent can propose as an atomic task.
+TaskKind = Literal["pick_and_place", "open_door", "close_door"]
+
+
+class Item(BaseModel):
+    """One object the agent wants in the scene."""
+
+    query: str = Field(  # required: no default is declared
+        description=(
+            "Short human name for the object as it appears in the prompt "
+            "(e.g. 'avocado', 'bowl'). The downstream resolver fuzzy-matches "
+            "this against the asset catalog — do NOT emit the exact "
+            "registered name."
+        ),
+    )
+    role: ItemRole = Field(  # required; restricted to the ItemRole literals
+        description=(
+            "Role the item plays in the env: 'foreground' for objects the "
+            "task acts on; 'distractor' for extras mentioned as clutter; "
+            "'anchor' for reference surfaces (rare — the background usually "
+            "covers this)."
+        ),
+    )
+    category_tags: list[str] = Field(  # optional; a fresh empty list per instance when omitted
+        default_factory=list,
+        description=(
+            "Tags that semantically narrow the query, preferring assets with "
+            "those tags. PREFERENCE only, not a hard filter — the resolver "
+            "falls back to the full catalog if the tag pool is empty or "
+            "yields no close match. Err toward emitting useful tags."
+        ),
+    )
+    instance_name: str | None = Field(  # optional; None when omitted
+        default=None,
+        description="Optional explicit instance label for the item; leave null if the prompt does not name one.",
+    )
+    scale: float | None = Field(  # optional; None when omitted (no positivity check is enforced here)
+        default=None,
+        description=(
+            "Spawn scale. Leave null (the default) so the placement proposer "
+            "auto-fits the asset; only set a positive float when the prompt "
+            "explicitly demands a size override."
+        ),
+    )
+
+
+class Relation(BaseModel):
+    """A spatial / structural relation between items.
+
+    Binary kinds (``on``, ``in``, ``next_to``, ...) must set ``target`` to the
+    other item — semantics is "subject is in relation to target". Unary kinds
+    (``is_anchor``, ``at_position``, ...) describe an intrinsic property of
+    ``subject`` alone and must leave ``target`` as ``None``.
+    """
+
+    kind: RelationKind = Field(  # required; target arity per kind is not validated in this class
+        description=(
+            "Spatial relation only — articulated-state changes (open/close) are expressed via tasks, not via relations."
+        ),
+    )
+    subject: str = Field(  # required
+        description="Item the relation applies to, named by its Item.query string or the background name.",
+    )
+    target: str | None = Field(  # NOTE(review): docstring lists at_position as unary, description as binary — confirm
+        default=None,
+        description=(
+            "The other item the relation is anchored on for binary kinds "
+            "(on / in / next_to / at_position / at_pose); leave null for "
+            "unary kinds (is_anchor)."
+        ),
+    )
+    params: dict = Field(  # optional; a fresh empty dict per instance when omitted
+        default_factory=dict,
+        description="Optional kind-specific parameters; leave empty by default.",
+    )
+
+    def identity(self) -> tuple[str, str, str | None]:
+        """Hashable identity for diffing scene graphs — ignores params."""
+        return (self.kind, self.subject, self.target)
+
+
+class Task(BaseModel):
+    """One atomic task in the plan that transforms the env state."""
+
+    kind: TaskKind = Field(description="The action to perform.")  # required; restricted to the TaskKind literals
+    subject: str = Field(  # required
+        description=(
+            "The primary object the task acts on, named by its Item.query string (e.g. 'avocado', 'microwave')."
+        ),
+    )
+    target: str | None = Field(  # optional; None when omitted
+        default=None,
+        description=(
+            "The secondary object or location, named by its Item.query "
+            "string or the background name. Leave null for unary tasks "
+            "(open_door / close_door)."
+        ),
+    )
+    description: str = Field(  # required
+        description="Natural-language summary of the task (e.g. 'pick up the avocado and place it in the bowl').",
+    )
+
+
+class EnvIntentSpec(BaseModel):
+    """Agent output — a structured "env intent" (blueprint) for the env and a list of tasks.
+
+    Field-level guidance lives on the individual ``Field(description=...)``
+    entries below and is surfaced to the agent via ``model_json_schema()``;
+    only cross-cutting rules and few-shot examples are kept in the
+    prompt text (see ``EnvGenAgent._system_prompt``).
+    """
+
+    # Forced chain-of-thought field, listed FIRST so the agent emits its
+    # analysis before committing to any structured field. Instruction-tuned
+    # models respect schema field order, and writing reasoning before
+    # answers measurably improves structured-output quality (the
+    # "think step by step then commit" pattern). Bonus debuggability:
+    # when a downstream resolver step fails, the reasoning trace shows
+    # which step the model got wrong (e.g. it picked "tomato" because
+    # it misidentified the foreground object as a vegetable) — without
+    # this, the only signal is the malformed spec itself.
+    reasoning: str = Field(  # required
+        description=(
+            "Step-by-step analysis of the user prompt, written BEFORE the "
+            "structured fields below. Identify (1) the task / intent, (2) "
+            "the foreground objects the task acts on, (3) the background "
+            "surface or scene, (4) any distractors. For each object, "
+            "briefly justify the catalog query and tags you will pick. "
+            "Resolve any ambiguity here before filling the structured "
+            "fields — do not restate this analysis in ``task_description``."
+        ),
+    )
+    task_description: str = Field(  # required
+        description="One-sentence natural-language summary of what the env exercises overall."
+    )
+    background: str = Field(  # required
+        description="Background asset name from the BACKGROUNDS catalog (e.g. 'maple_table_kitchen').",
+    )
+    embodiment: str = Field(  # the only field with a declared default; an omitted value becomes "franka_ik"
+        default="franka_ik",
+        description=(
+            "Robot embodiment to control. Use a bare family name ('franka', "
+            "'droid', 'g1', 'gr1') when the prompt does not specify a "
+            "control mode — the resolver defaults each to its IK variant. "
+            "Use a full registered name (e.g. 'franka_joint_pos') only when "
+            "the prompt explicitly requests joint control."
+        ),
+    )
+    items: list[Item] = Field(description="Objects to place in the env.")  # required
+    initial_scene_graph: list[Relation] = Field(  # required
+        description=(
+            "FULL snapshot of all relations in the starting state. Every "
+            "persistent relation (e.g. bowl on table, distractors present) "
+            "must appear here. Relations that change via tasks are still "
+            "listed here in their starting form."
+        ),
+    )
+    tasks: list[Task] = Field(  # required key, but the list itself may be empty (see description)
+        description=(
+            "Tasks to execute in sequence. The task sequence implicitly "
+            "defines the intermediate env graphs by applying each task's "
+            "transformations in order. An empty list is valid and means "
+            "the env has no task — at the arena layer this maps to the "
+            "``NoTask`` null object (e.g. a static playground / sandbox "
+            "env). Prefer an empty list over inventing a placeholder "
+            "task when the user prompt genuinely describes a task-less "
+            "scene."
+        ),
+    )
diff --git a/isaaclab_arena/environments/agentic_env_gen/structured_output_utils.py b/isaaclab_arena/environments/agentic_env_gen/structured_output_utils.py
new file mode 100644
index 000000000..03a628339
--- /dev/null
+++ b/isaaclab_arena/environments/agentic_env_gen/structured_output_utils.py
@@ -0,0 +1,304 @@
+# Copyright (c) 2026, The Isaac Lab Arena Project Developers (https://github.com/isaac-sim/IsaacLab-Arena/blob/main/CONTRIBUTORS.md).
+# All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""Utilities for OpenAI-compatible structured outputs (``response_format=json_schema``).
+
+The functions here are the building blocks the env-gen agent uses to
+send strict-mode-compatible schemas, handle provider-specific response
+routing (NVIDIA DeepSeek's ``reasoning_content`` quirk), and probe a
+candidate model's structured-output capability before deployment.
+
+They are intentionally pydantic-model-agnostic: pass any
+``pydantic.BaseModel`` subclass as ``spec_class`` and the utility
+adapts. The agent module wires :class:`EnvIntentSpec` in as the
+production default.
+"""
+
+from __future__ import annotations
+
+import copy
+import json
+from typing import Any
+
+from pydantic import BaseModel
+
+# Truncate echoed response payloads in diagnostic results to this many
+# characters — long enough to diagnose a failure, short enough to keep
+# error messages and probe results readable.
+_RESPONSE_PREVIEW_CHARS = 500
+
+
+def _format_failure_message(
+    *,
+    model: str,
+    response_route: str,
+    finish_reason: str | None,
+    cause: str,
+    sample_payload: str | None,
+) -> str:
+    """Render a structured-output failure as one multi-line diagnostic string.
+
+    Line one names the model; each following line is a two-space-indented
+    ``label = value`` pair. Every value except ``cause`` goes through
+    ``repr`` so ``None`` and empty strings stay visible in logs, while
+    ``cause`` is inserted verbatim.
+    """
+    headline = f"Model {model!r} does not support structured outputs:\n"
+    details = (
+        f"  response_route = {response_route!r}\n"
+        f"  finish_reason  = {finish_reason!r}\n"
+        f"  cause          = {cause}\n"
+        f"  sample_payload = {sample_payload!r}"
+    )
+    return headline + details
+
+
+def build_strict_schema(model_cls: type[BaseModel]) -> dict[str, Any]:
+    """Build an OpenAI strict-mode-compatible JSON schema for ``model_cls``.
+
+    Strict structured outputs (OpenAI, and AWS Bedrock's Anthropic
+    models, which surface the same constraint) accept a schema only if:
+
+      * every object schema sets ``additionalProperties: false``;
+      * every property is listed in ``required`` (model optional
+        fields as a nullable union — e.g. ``str | None`` — so they
+        can still be emitted as ``null``);
+      * no ``default`` keys are present (a default is meaningless
+        once every field is required).
+
+    ``model_json_schema()`` alone does not guarantee any of these, so
+    :func:`apply_strict_constraints` deep-walks the schema and enforces
+    all three, letting it pass both NVIDIA and Bedrock validation.
+
+    The walk runs on a deep copy, so the mutation never leaks back
+    into pydantic's internal schema cache.
+    """
+    strict_schema = copy.deepcopy(model_cls.model_json_schema())
+    apply_strict_constraints(strict_schema)
+    return strict_schema
+
+
+def apply_strict_constraints(node: Any) -> None:
+    """Recursively apply OpenAI strict-mode constraints to a JSON-schema node.
+
+    Mutates ``node`` in place; idempotent. Keys of name->schema maps
+    (``properties`` / ``$defs`` / ``definitions``) are names, never keywords.
+    """
+    if isinstance(node, list):
+        for item in node:
+            apply_strict_constraints(item)
+    elif isinstance(node, dict):
+        if node.get("type") == "object" and "properties" in node:
+            node["additionalProperties"] = False
+            node["required"] = list(node["properties"].keys())
+        node.pop("default", None)  # forbidden in strict mode: every field is required
+        for key, value in node.items():
+            name_map = isinstance(value, dict) and key in ("properties", "$defs", "definitions")
+            for child in value.values() if name_map else (value,):
+                apply_strict_constraints(child)
+
+
+def ping(client: Any, model: str) -> str:
+    """Smoke-test the endpoint + API key + model with a minimal request.
+
+    Sends a one-shot chat completion (no structured outputs) to verify:
+
+      * the API key authenticates,
+      * the configured model exists at the client's ``base_url``,
+      * the network path is reachable.
+
+    Intended for CI startup probes and constructor-time fail-fast checks;
+    the success signal is "we got a response without raising". The
+    response *content* is returned for diagnostics but intentionally not
+    asserted on — different models phrase the acknowledgment differently,
+    and a quirky reply still means the wire is working.
+
+    This is the *cheap* probe; pair with :func:`check_structured_output_support`
+    for a full deployment validation (ping confirms the wire, the probe
+    confirms the model can actually produce structured outputs).
+
+    Args:
+        client: An OpenAI-compatible client (typically
+            ``openai.OpenAI`` or a compatible mock).
+        model: Model identifier forwarded to
+            ``client.chat.completions.create(model=...)``.
+
+    Returns:
+        The model's response text (typically "OK" or similar). Empty
+        string if the model returned no content or an empty ``choices``
+        list (both still count as a successful round-trip).
+
+    Raises:
+        Any exception raised by the underlying ``openai`` client.
+        Common ones at this layer are ``AuthenticationError``
+        (bad key), ``NotFoundError`` (wrong ``model``),
+        ``APIConnectionError`` (unreachable endpoint), and
+        ``RateLimitError`` (quota exhausted).
+    """
+    # TODO(qianl): wrap with transient-error retry (exponential backoff +
+    # jitter) for ``APIConnectionError`` / ``APITimeoutError`` / 429 / 5xx.
+    # Deterministic errors (401/403/404) must still propagate immediately.
+    # Until then, the affected live tests carry ``@pytest.mark.flaky`` to
+    # absorb intermittent wire-level hiccups at the test layer.
+    resp = client.chat.completions.create(
+        model=model,
+        messages=[{"role": "user", "content": "Respond with exactly: OK"}],
+        temperature=0,
+        max_tokens=8,
+    )
+    # Zero ``choices`` (e.g. a provider guardrail trip) is still a completed round-trip.
+    choices = getattr(resp, "choices", None) or []
+    return (choices[0].message.content or "") if choices else ""
+
+
+def extract_response_text(message: Any) -> tuple[str, str]:
+    """Locate the structured-output text inside a chat-completion message.
+
+    Channels are checked in priority order and the first non-empty one
+    wins. Returns ``(text, route)`` where ``route`` is one of:
+
+      * ``"content"`` — the standard OpenAI-compatible channel.
+      * ``"reasoning_content"`` — NVIDIA DeepSeek's provider-specific
+        channel, used instead of ``content`` for structured outputs;
+        treated as equivalent.
+      * ``"empty"`` — neither channel held any text (missing, ``None``
+        or ``""``); ``text`` is ``""`` and the caller should surface a
+        clear error.
+    """
+    for route in ("content", "reasoning_content"):
+        text = getattr(message, route, None)
+        if text:
+            return text, route
+    return "", "empty"
+
+
+def check_structured_output_support(
+    client: Any,
+    model: str,
+    spec_class: type[BaseModel],
+) -> bool:
+    """Check that ``model`` can return a valid ``spec_class`` via structured outputs.
+
+    Issues one chat completion on ``client`` whose
+    ``response_format`` is a strict ``json_schema`` built from
+    ``spec_class``, together with a short prompt asking for a made-up
+    but valid instance. The reply text is taken from ``content`` or,
+    failing that, ``reasoning_content``, parsed as JSON (control
+    characters inside strings are tolerated) and validated against
+    ``spec_class``.
+
+    Every failure after the schema is built surfaces as a
+    ``RuntimeError`` whose multi-line message names the response
+    route, the ``finish_reason``, the underlying cause and — for
+    parse / validation failures — a preview of what the model sent.
+    Where an originating exception exists (client error,
+    ``JSONDecodeError``, ``ValidationError``) it is chained as
+    ``__cause__``.
+
+    Args:
+        client: An OpenAI-compatible client (typically
+            ``openai.OpenAI`` or a compatible mock).
+        model: Model identifier, forwarded verbatim to
+            ``client.chat.completions.create(model=...)``.
+        spec_class: The pydantic model whose strict schema is sent
+            to the endpoint and used to validate the reply.
+
+    Returns:
+        ``True`` when the probe round-trips successfully. The function
+        never returns ``False``; failure is always an exception.
+
+    Raises:
+        RuntimeError: if the request itself raises, the response has
+            no ``choices``, both text channels are empty, the text is
+            not parseable JSON, or it fails ``spec_class`` validation.
+    """
+    schema = build_strict_schema(spec_class)
+
+    def _failure(route: str, finish: str | None, cause: str, sample: str | None = None) -> RuntimeError:
+        # Single funnel that turns the probe's signals into the diagnostic error.
+        return RuntimeError(
+            _format_failure_message(
+                model=model,
+                response_route=route,
+                finish_reason=finish,
+                cause=cause,
+                sample_payload=sample,
+            )
+        )
+
+    # The user message carries no content of its own: the strict schema
+    # and the system instruction below describe the expected structure.
+    # All the probe needs back is a valid envelope.
+    instruction = (
+        f"Return a valid {spec_class.__name__} JSON object. Every required field must be "
+        "populated — use realistic dummy values where the prompt doesn't specify one."
+    )
+    response_format = {
+        "type": "json_schema",
+        "json_schema": {
+            "name": spec_class.__name__,
+            "strict": True,
+            "schema": schema,
+        },
+    }
+    # TODO(qianl): wrap with transient-error retry (exponential backoff +
+    # jitter) for ``APIConnectionError`` / ``APITimeoutError`` / 429 / 5xx.
+    # Deterministic errors (400/401/403/404/422) must still propagate
+    # immediately so genuinely-unsupported endpoints fail fast. Currently
+    # this is the primary source of e2e flakes (provider occasionally
+    # returns blank ``content`` in the structured-outputs envelope) —
+    # affected live tests carry ``@pytest.mark.flaky`` as the short-term
+    # mitigation.
+    try:
+        resp = client.chat.completions.create(
+            model=model,
+            messages=[
+                {"role": "system", "content": instruction},
+                {"role": "user", "content": "Generate a minimal valid example."},
+            ],
+            response_format=response_format,
+            temperature=0,
+            max_tokens=2000,
+        )
+    except Exception as exc:
+        raise _failure(
+            route="empty",
+            finish=None,
+            cause=f"{type(exc).__name__}: {str(exc)[:_RESPONSE_PREVIEW_CHARS]}",
+        ) from exc
+
+    # An HTTP-level success may still carry an empty ``choices`` list (e.g.
+    # Azure content-filter trips, Bedrock guardrail rejections). Report it
+    # with its own ``cause`` instead of letting ``choices[0]`` raise
+    # ``IndexError``, so operators can tell it apart from the "envelope
+    # returned but content empty" case handled further down.
+    candidates = getattr(resp, "choices", None) or []
+    if not candidates:
+        raise _failure(
+            route="empty",
+            finish=None,
+            cause="Response contained no choices (model emitted zero candidates).",
+        )
+
+    first = candidates[0]
+    finish_reason = first.finish_reason
+    text, route = extract_response_text(first.message)
+    if not text:
+        raise _failure(
+            route=route,
+            finish=finish_reason,
+            cause="Model returned an empty envelope on both content and reasoning_content.",
+        )
+    sample = text[:_RESPONSE_PREVIEW_CHARS]
+    try:
+        spec_class.model_validate(json.loads(text, strict=False))
+    except Exception as exc:
+        raise _failure(
+            route=route,
+            finish=finish_reason,
+            cause=f"{type(exc).__name__}: {str(exc)[:_RESPONSE_PREVIEW_CHARS]}",
+            sample=sample,
+        ) from exc
+    return True
diff --git a/isaaclab_arena/environments/agentic_env_gen/try_schema.py b/isaaclab_arena/environments/agentic_env_gen/try_schema.py
new file mode 100644
index 000000000..57bef2dbd
--- /dev/null
+++ b/isaaclab_arena/environments/agentic_env_gen/try_schema.py
@@ -0,0 +1,110 @@
+# Copyright (c) 2026, The Isaac Lab Arena Project Developers (https://github.com/isaac-sim/IsaacLab-Arena/blob/main/CONTRIBUTORS.md).
+# All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""Run the agent on a prompt and print the parsed EnvIntentSpec.
+
+Requires NV_API_KEY environment variable.
+
+Examples:
+    # Print the Pydantic EnvIntentSpec JSON schema (no agent call):
+    /isaac-sim/python.sh -m isaaclab_arena.environments.agentic_env_gen.try_schema --print-schema
+
+    # Print the catalog sent to the agent (no agent call):
+    /isaac-sim/python.sh -m isaaclab_arena.environments.agentic_env_gen.try_schema --print-catalog
+
+    # Call the agent, then print the raw response, the reasoning, and the parsed spec:
+    /isaac-sim/python.sh -m isaaclab_arena.environments.agentic_env_gen.try_schema \
+        --prompt "franka pick up avocado from the table and place it into a bowl on the table. there are other veggies on the table as distractor"
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+
+DEFAULT_PROMPT = (  # used when ``--prompt`` is omitted
+    "franka pick up avocado from the table and place it into a bowl on the table. "
+    "there are other veggies on the table as distractor"
+)
+SEQUENTIAL_PROMPT = (  # multi-step example prompt; not referenced by ``main`` in this module
+    "franka opens a microwave, picks up avocado on the table, place it into the microwave and close the microwave door."
+    " There are other utensils on the table as distractor"
+)
+
+
+def main() -> None:
+    """Entry point for the schema / catalog / agent debugging CLI.
+
+    Prints the schema, the catalog, or the agent's output for a prompt.
+    """
+    cli = argparse.ArgumentParser(description=__doc__)
+    cli.add_argument("--prompt", type=str, default=DEFAULT_PROMPT)
+    cli.add_argument("--model", type=str, default=None)
+    cli.add_argument("--temperature", type=float, default=0.2)
+    cli.add_argument("--print-schema", action="store_true")
+    cli.add_argument("--print-catalog", action="store_true")
+    background_help = (
+        "Override the background chosen by the agent (e.g. 'office_table' "
+        "or 'kitchen'). Default is 'maple_table_robolab' because its "
+        "tabletop ObjectReference yields a clean bbox and stable "
+        "placement, unlike the rotated plain 'table' background. Pass "
+        "an empty string ('') to keep the agent's choice."
+    )
+    cli.add_argument("--background", type=str, default="maple_table_robolab", help=background_help)
+    args = cli.parse_args()
+
+    # Deferred import: only reached once the flags have parsed cleanly.
+    from isaaclab_arena.environments.agentic_env_gen.env_intent_spec import EnvIntentSpec
+
+    if args.print_schema:
+        print(json.dumps(EnvIntentSpec.model_json_schema(), indent=2))
+        return
+
+    # The agent module is only needed past this point, so ``--print-schema`` never imports it.
+    from isaaclab_arena.environments.agentic_env_gen.env_gen_agent import EnvGenAgent, build_catalog_text
+
+    catalog_text = build_catalog_text()
+    if args.print_catalog:
+        print(catalog_text)
+        return
+
+    # Leave ``model`` unset unless given so ``EnvGenAgent`` applies its own default.
+    agent = EnvGenAgent(model=args.model) if args.model else EnvGenAgent()
+    spec, raw_response = agent.generate_spec(args.prompt, catalog_text=catalog_text, temperature=args.temperature)
+
+    print("=== raw agent response ===")
+    print(raw_response)
+
+    # ``reasoning`` gets its own section: inside the full JSON dump below
+    # it is easy to miss when debugging a bad spec.
+    print("\n=== agent reasoning ===")
+    print(spec.reasoning)
+
+    requested_bg = args.background
+    if requested_bg and requested_bg != spec.background:
+        # Rename the background everywhere the scene graph mentions it so
+        # downstream code (resolver, proposer) sees a consistent scene:
+        # ``target`` covers binary relations like ``on(bowl, table)`` and
+        # ``subject`` covers unary ones like ``is_anchor(table)``. A stale
+        # subject would make the resolver emit a
+        # ``relation.initial.unknown_subject`` trace and silently drop the
+        # constraint.
+        previous_bg = spec.background
+        for relation in spec.initial_scene_graph:
+            if relation.subject == previous_bg:
+                relation.subject = requested_bg
+            if relation.target == previous_bg:
+                relation.target = requested_bg
+        # Tasks are left untouched: their targets are typically ``None`` or
+        # items rather than the background.
+        spec.background = requested_bg
+        print(f"\n=== background override applied: {previous_bg!r} -> {requested_bg!r} ===")
+
+    print("\n=== parsed EnvIntentSpec ===")
+    print(spec.model_dump_json(indent=2))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/isaaclab_arena/tests/test_env_gen_agent.py b/isaaclab_arena/tests/test_env_gen_agent.py
new file mode 100644
index 000000000..3207351a5
--- /dev/null
+++ b/isaaclab_arena/tests/test_env_gen_agent.py
@@ -0,0 +1,429 @@
+# Copyright (c) 2026, The Isaac Lab Arena Project Developers (https://github.com/isaac-sim/IsaacLab-Arena/blob/main/CONTRIBUTORS.md).
+# All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""Unit tests for :class:`isaaclab_arena.environments.agentic_env_gen.env_gen_agent.EnvGenAgent`.
+
+The agent's behaviour decomposes into three agent-level concerns that
+we exercise without ever hitting the wire:
+
+* ``__init__`` argument / env-var precedence, the missing-key guard,
+  and the two constructor-time validations (``ping`` then
+  ``check_structured_output_support``) that convert late wire /
+  capability failures into fail-fast errors.
+* ``generate_spec`` — the openai client is replaced with a
+  ``MagicMock`` so we assert on the request shape (model, messages,
+  ``response_format``, temperature, max_tokens) and the
+  error-propagation contract.
+* ``_system_prompt`` keeps its cross-cutting guidance intact;
+  per-field schema details ride on the wire via
+  ``response_format=json_schema`` rather than the prompt text.
+
+Schema munging, the ``ping`` and ``check_structured_output_support``
+helpers, and their failure-mode coverage all live in
+:mod:`test_structured_output_utils`.
+"""
+
+from __future__ import annotations
+
+import json
+from unittest.mock import MagicMock, patch
+
+import pytest
+from pydantic import ValidationError
+
+from isaaclab_arena.environments.agentic_env_gen.env_gen_agent import DEFAULT_BASE_URL, DEFAULT_MODEL, EnvGenAgent
+from isaaclab_arena.environments.agentic_env_gen.structured_output_utils import apply_strict_constraints
+
+# ---------------------------------------------------------------------------
+# Fixtures
+# ---------------------------------------------------------------------------
+
+
+def _chat_response(content: str | None = None, reasoning_content: str | None = None, finish_reason: str = "stop"):
+    """Return a mock shaped like an openai chat completion with a single choice.
+
+    ``content`` and ``reasoning_content`` are set independently, so a test can
+    mimic models (e.g. NVIDIA DeepSeek) that answer only on the latter channel.
+    """
+    only_choice = MagicMock()
+    only_choice.finish_reason = finish_reason
+    only_choice.message.content = content
+    only_choice.message.reasoning_content = reasoning_content
+    response = MagicMock()
+    response.choices = [only_choice]
+    return response
+
+
+@pytest.fixture
+def stub_openai():
+    """Replace ``openai.OpenAI`` so that ``EnvGenAgent()`` stays off the wire.
+
+    ``EnvGenAgent.__init__`` imports ``OpenAI`` from ``openai`` lazily
+    (keeping the dependency out of module import time), so the patch
+    targets the symbol on the ``openai`` module itself.
+
+    The fake client is scripted for the two constructor probes: its
+    ``side_effect`` yields an "OK" ping reply, then a ``_MINIMAL_SPEC``
+    structured-output reply. Tests that need ``__init__`` to fail
+    should ``patch("openai.OpenAI")`` themselves with a custom
+    ``side_effect``.
+    """
+    scripted_replies = [
+        _chat_response(content="OK"),
+        _chat_response(content=json.dumps(_MINIMAL_SPEC)),
+    ]
+    fake_client = MagicMock()
+    fake_client.chat.completions.create.side_effect = scripted_replies
+    with patch("openai.OpenAI") as openai_cls:
+        openai_cls.return_value = fake_client
+        yield openai_cls
+
+
+@pytest.fixture
+def agent(stub_openai):
+    """Provide an ``EnvGenAgent`` whose openai client is entirely mocked.
+
+    Construction consumes the two replies scripted by ``stub_openai``
+    (ping, then structured-output probe). The ``create`` mock is then
+    cleared — ``side_effect`` removed and call history reset — so each
+    test starts from zero calls and can install its own
+    ``.return_value`` or ``.side_effect``.
+    """
+    instance = EnvGenAgent(api_key="test-key")
+    create_mock = instance.client.chat.completions.create
+    create_mock.side_effect = None
+    create_mock.reset_mock()
+    return instance
+
+
+# Minimal EnvIntentSpec payload — exercises every required field plus one
+# task. Serves as the ``stub_openai`` probe reply and the generate_spec base payload.
+_MINIMAL_SPEC: dict = {
+    "reasoning": (
+        "User wants a pick-and-place: foreground object is 'avocado', "
+        "target container is 'bowl', background is the kitchen table."
+    ),
+    "task_description": "pick up the avocado and place it in the bowl",
+    "background": "kitchen",
+    "embodiment": "franka_ik",
+    "items": [
+        {"query": "avocado", "role": "foreground", "category_tags": [], "instance_name": None, "scale": None},
+        {"query": "bowl", "role": "foreground", "category_tags": [], "instance_name": None, "scale": None},
+    ],
+    "initial_scene_graph": [
+        {"kind": "on", "subject": "avocado", "target": "kitchen"},
+        {"kind": "on", "subject": "bowl", "target": "kitchen"},
+    ],
+    "tasks": [{
+        "kind": "pick_and_place",
+        "subject": "avocado",
+        "target": "bowl",
+        "description": "pick up the avocado and place it in the bowl",
+    }],
+}
+
+
+# ---------------------------------------------------------------------------
+# __init__
+# ---------------------------------------------------------------------------
+
+
+class TestInit:
+    """Constructor contract: key resolution, client wiring, and the ping -> probe start-up checks."""
+    def test_explicit_api_key_overrides_env(self, monkeypatch, stub_openai):
+        monkeypatch.setenv("NV_API_KEY", "env-key")
+        a = EnvGenAgent(api_key="explicit-key")
+        assert a.api_key == "explicit-key"
+
+    def test_falls_back_to_env_var(self, monkeypatch, stub_openai):
+        monkeypatch.setenv("NV_API_KEY", "env-key")
+        a = EnvGenAgent()
+        assert a.api_key == "env-key"
+
+    def test_raises_when_no_key_anywhere(self, monkeypatch, stub_openai):
+        monkeypatch.delenv("NV_API_KEY", raising=False)
+        with pytest.raises(ValueError, match="API key required"):
+            EnvGenAgent()
+
+    def test_default_model_and_base_url(self, stub_openai):
+        a = EnvGenAgent(api_key="k")
+        assert a.model == DEFAULT_MODEL
+        stub_openai.assert_called_once_with(api_key="k", base_url=DEFAULT_BASE_URL)
+
+    def test_custom_model_and_base_url(self, stub_openai):
+        a = EnvGenAgent(api_key="k", model="custom-model", base_url="http://localhost:8000")
+        assert a.model == "custom-model"
+        stub_openai.assert_called_once_with(api_key="k", base_url="http://localhost:8000")
+
+    def test_init_runs_ping_then_structured_output_probe(self, stub_openai):
+        # ``__init__`` is contracted to run TWO wire checks in order:
+        # (1) the cheap ``ping`` so a dead endpoint / bad key fails before
+        # we spend tokens on (2) the heavier structured-output probe.
+        # Asserting the order matters because reversing it would waste a
+        # full schema probe on every misconfigured deployment.
+        a = EnvGenAgent(api_key="k")
+        assert a.client.chat.completions.create.call_count == 2
+        first, second = a.client.chat.completions.create.call_args_list
+        # First call = ping: small message, no response_format.
+        assert first.kwargs["temperature"] == 0
+        assert first.kwargs["max_tokens"] == 8
+        assert len(first.kwargs["messages"]) == 1
+        assert "response_format" not in first.kwargs
+        # Second call = structured-output probe: carries the EnvIntentSpec
+        # schema, signalling the model has to actually honour
+        # ``response_format=json_schema``.
+        assert second.kwargs["response_format"]["type"] == "json_schema"
+        assert second.kwargs["response_format"]["json_schema"]["name"] == "EnvIntentSpec"
+
+    def test_init_propagates_ping_failure(self):
+        # If the openai client raises on the FIRST (ping) call — bad key,
+        # unreachable endpoint, etc. — the exception must surface from
+        # ``EnvGenAgent()`` itself, not be swallowed into a silently-broken
+        # instance that fails later when generate_spec is called. The
+        # structured-output probe must NOT be attempted (otherwise we'd
+        # waste a schema-carrying request on a dead wire).
+        class FakeAuthError(Exception):
+            pass
+
+        with patch("openai.OpenAI") as mock_cls:
+            client = MagicMock()
+            client.chat.completions.create.side_effect = FakeAuthError("bad key")
+            mock_cls.return_value = client
+            with pytest.raises(FakeAuthError, match="bad key"):
+                EnvGenAgent(api_key="k")
+            # Exactly one create() call — the ping. The probe never ran.
+            assert client.chat.completions.create.call_count == 1
+
+    def test_init_raises_when_structured_output_unsupported(self):
+        # The agent is structured-outputs-only — a model that can't honour
+        # ``response_format=json_schema`` is fundamentally unusable, so the
+        # constructor must refuse rather than letting downstream
+        # ``generate_spec`` blow up later. ``check_structured_output_support``
+        # raises the diagnostic RuntimeError directly, with all informative
+        # fields baked into its message — no caller-side message construction.
+        # This test just confirms the probe's exception reaches the caller
+        # with those fields intact (no swallow, no rewrap that drops fields).
+        with patch("openai.OpenAI") as mock_cls:
+            client = MagicMock()
+            client.chat.completions.create.side_effect = [
+                _chat_response(content="OK"),  # ping passes
+                _chat_response(content=None, reasoning_content=None),  # probe empty
+            ]
+            mock_cls.return_value = client
+            with pytest.raises(RuntimeError, match="does not support structured outputs") as exc_info:
+                EnvGenAgent(api_key="k")
+            msg = str(exc_info.value)
+            # Diagnostic fields from the probe must reach the operator —
+            # ``sample_payload`` in particular is what turns cryptic JSON /
+            # validation errors into debuggable failures.
+            assert "response_route" in msg
+            assert "finish_reason" in msg
+            assert "cause" in msg
+            assert "sample_payload" in msg
+            # The empty-envelope route signal — keeps callers able to
+            # attribute "empty" vs "content" vs "reasoning_content".
+            assert "'empty'" in msg
+
+    def test_init_caches_strict_schema(self, stub_openai):
+        # The strict schema munging walks ~10 nested object nodes; caching it
+        # on the instance avoids redoing the walk on every generate_spec call.
+        # The cached schema must already be munged — re-running the munger
+        # should be a no-op (idempotent).
+        a = EnvGenAgent(api_key="k")
+        assert isinstance(a._spec_schema, dict)
+        before = json.dumps(a._spec_schema, sort_keys=True)
+        apply_strict_constraints(a._spec_schema)
+        after = json.dumps(a._spec_schema, sort_keys=True)
+        assert before == after
+
+
+# ---------------------------------------------------------------------------
+# generate_spec
+# ---------------------------------------------------------------------------
+
+
+class TestGenerateSpec:
+    def test_happy_path_returns_spec_and_raw(self, agent):
+        raw = json.dumps(_MINIMAL_SPEC)
+        agent.client.chat.completions.create.return_value = _chat_response(content=raw)
+        spec, returned_raw = agent.generate_spec("avocado on kitchen", catalog_text="catalog")
+        assert spec.embodiment == "franka_ik"
+        assert spec.background == "kitchen"
+        assert len(spec.tasks) == 1
+        assert returned_raw == raw
+
+    def test_reads_from_reasoning_content_channel(self, agent):
+        # DeepSeek quirk: when structured outputs are requested, the model
+        # puts the JSON in ``reasoning_content`` instead of ``content``.
+        raw = json.dumps(_MINIMAL_SPEC)
+        agent.client.chat.completions.create.return_value = _chat_response(content=None, reasoning_content=raw)
+        spec, returned_raw = agent.generate_spec("p", catalog_text="catalog")
+        assert spec.embodiment == "franka_ik"
+        assert returned_raw == raw
+
+    def test_request_sets_response_format_to_json_schema(self, agent):
+        agent.client.chat.completions.create.return_value = _chat_response(content=json.dumps(_MINIMAL_SPEC))
+        agent.generate_spec("p", catalog_text="catalog")
+        kwargs = agent.client.chat.completions.create.call_args.kwargs
+        assert kwargs["response_format"]["type"] == "json_schema"
+        assert kwargs["response_format"]["json_schema"]["name"] == "EnvIntentSpec"
+        assert kwargs["response_format"]["json_schema"]["strict"] is True
+        # The schema sent on the wire is the cached, strict-mode-munged copy.
+        assert kwargs["response_format"]["json_schema"]["schema"] is agent._spec_schema
+
+    def test_raises_runtime_error_on_empty_envelope(self, agent):
+        # Both channels empty — the endpoint accepted ``response_format`` but
+        # the model dropped the structured output (the canonical "endpoint
+        # doesn't actually support structured outputs" failure mode).
+        agent.client.chat.completions.create.return_value = _chat_response(content=None, reasoning_content=None)
+        with pytest.raises(RuntimeError, match="empty structured-outputs envelope"):
+            agent.generate_spec("p", catalog_text="catalog")
+
+    def test_tolerates_unescaped_control_chars(self, agent):
+        # DeepSeek-v4-flash emits literal tab/newline characters inside JSON
+        # strings despite the structured-outputs contract. Python's default
+        # ``json.loads`` rejects them; we pass ``strict=False`` to accept.
+        payload = dict(_MINIMAL_SPEC)
+        payload["task_description"] = "pick up\tthe\tavocado"
+        raw = json.dumps(payload).replace("\\t", "\t")
+        assert "\t" in raw  # raw payload now has literal tab chars in a string
+        agent.client.chat.completions.create.return_value = _chat_response(content=raw)
+        spec, _ = agent.generate_spec("p", catalog_text="catalog")
+        assert "\t" in spec.task_description
+
+    def test_propagates_validation_error_for_schema_violation(self, agent):
+        # Well-formed JSON but missing every required EnvIntentSpec field —
+        # pydantic surfaces this as a ``ValidationError`` distinct from a
+        # transport or parse error.
+        agent.client.chat.completions.create.return_value = _chat_response(content='{"missing": "fields"}')
+        with pytest.raises(ValidationError):
+            agent.generate_spec("p", catalog_text="catalog")
+
+    def test_request_uses_configured_model(self, agent):
+        create = agent.client.chat.completions.create
+        create.return_value = _chat_response(content=json.dumps(_MINIMAL_SPEC))
+        agent.generate_spec("p", catalog_text="catalog")
+        assert create.call_args.kwargs["model"] == agent.model
+
+    def test_forwards_temperature_and_max_tokens(self, agent):
+        create = agent.client.chat.completions.create
+        create.return_value = _chat_response(content=json.dumps(_MINIMAL_SPEC))
+        agent.generate_spec("p", catalog_text="catalog", temperature=0.7, max_tokens=500)
+        sent = create.call_args.kwargs
+        assert (sent["temperature"], sent["max_tokens"]) == (0.7, 500)
+
+    def test_user_message_contains_catalog_and_prompt(self, agent):
+        agent.client.chat.completions.create.return_value = _chat_response(content=json.dumps(_MINIMAL_SPEC))
+        agent.generate_spec("user wants avocado on kitchen", catalog_text="<<CATALOG-MARKER>>")
+        msgs = agent.client.chat.completions.create.call_args.kwargs["messages"]
+        assert [m["role"] for m in msgs] == ["system", "user"]
+        user_msg = msgs[1]["content"]
+        assert "<<CATALOG-MARKER>>" in user_msg
+        assert "user wants avocado on kitchen" in user_msg
+        # Under structured outputs the "emit ONLY JSON" instruction is
+        # redundant (and was deliberately dropped) — the wire enforces
+        # the envelope.
+        assert "Return ONLY" not in user_msg
+
+
+# ---------------------------------------------------------------------------
+# _system_prompt
+# ---------------------------------------------------------------------------
+
+
+class TestSystemPrompt:
+    def test_contains_cross_cutting_guidance(self, agent):
+        # Under structured outputs the schema (including every Relation /
+        # Task literal enum) flows to the model via ``response_format``.
+        # The system prompt is reserved for cross-cutting rules that
+        # can't be expressed in the schema — articulated-object anchoring,
+        # distractor anchoring, anti-hallucination directives. Lock those
+        # markers in so a future prompt rewrite can't accidentally drop
+        # them.
+        prompt = agent._system_prompt()
+        for marker in (
+            "Articulated objects",
+            "Distractor items",
+            "Do NOT hallucinate",
+            "pick_and_place",
+            "open_door",
+            "close_door",
+        ):
+            assert marker in prompt, f"system prompt missing required marker {marker!r}"
+
+    def test_does_not_repeat_response_format_instruction(self, agent):
+        # Belt-and-suspenders: ensure the prompt isn't still telling the
+        # model "emit ONLY JSON" — that instruction is redundant under
+        # structured outputs and the wire enforces it.
+        prompt = agent._system_prompt()
+        assert "Emit ONLY" not in prompt
+        assert "ONLY the JSON object" not in prompt
+
+
+# ---------------------------------------------------------------------------
+# Live endpoint (opt-in, network + auth required)
+# ---------------------------------------------------------------------------
+
+
+# The test exercises a real wire call against NVIDIA's hosted DeepSeek-v4-flash,
+# which has intermittent quirks under structured outputs (occasional blank
+# content, transient 429 / 5xx, etc.). A single failed attempt does NOT
+# mean ``generate_spec`` is broken — allow up to 2 reruns so the transport
+# layer's intermittency doesn't fail CI. Real breakage will still fail all 3.
+# TODO(qianl): drop the flaky marker once production-side retry is wired
+# into ``generate_spec`` / ``check_structured_output_support`` (see TODOs in
+# env_gen_agent.py and structured_output_utils.py).
+@pytest.mark.flaky(max_runs=3, min_passes=1)
+@pytest.mark.agent_remote_e2e
+def test_generate_spec_against_live_endpoint():
+    """End-to-end smoke test against the real OpenAI-compatible endpoint.
+
+    Exercises the full structured-outputs pipeline with default
+    ``model`` / ``base_url`` / system prompt:
+
+        auth → HTTPS → response_format=json_schema → channel fallback
+        → json.loads(strict=False) → EnvIntentSpec.model_validate
+
+    One layer gates this from default ``pytest`` runs:
+
+      * ``agent_remote_e2e`` marker — registered in ``pytest.ini`` next to
+        ``gr00t_remote_e2e``. Run explicitly with
+        ``pytest -m agent_remote_e2e isaaclab_arena/tests/test_env_gen_agent.py``.
+
+    The asset catalog is supplied inline rather than via ``AssetRegistry``
+    so the test doesn't depend on Isaac Lab asset registration state — we
+    only want to validate the agent wire here, not the catalog builder.
+
+    The structured-outputs *capability* of the default model is
+    pinned separately by
+    :func:`test_structured_output_utils.test_default_model_supports_structured_output`;
+    this test exercises the higher-level ``generate_spec`` pipeline
+    end-to-end.
+
+    Assertions are intentionally loose: we check shape (non-empty raw,
+    non-empty tasks, populated background/embodiment, populated
+    reasoning) rather than exact content, since agent output drifts
+    between model versions.
+    """
+    agent = EnvGenAgent()
+    catalog = (
+        "EMBODIMENTS: franka_ik\n\n"
+        "BACKGROUNDS: maple_table_kitchen\n\n"
+        "OBJECTS (2):\n"
+        "- avocado_robolab  tags=['vegetable']\n"
+        "- bowl_robolab  tags=['container']"
+    )
+    spec, raw = agent.generate_spec(
+        "pick up the avocado and place it in the bowl on the kitchen table",
+        catalog_text=catalog,
+    )
+    assert isinstance(raw, str) and raw, "agent returned empty raw response"
+    assert spec.tasks, "EnvIntentSpec must contain at least one task"
+    assert spec.background, "EnvIntentSpec.background must be populated"
+    assert spec.embodiment, "EnvIntentSpec.embodiment must be populated"
+    # Structured outputs guarantee the forced-CoT reasoning field is
+    # populated — under the old prose-extraction path it could come
+    # back blank if the model wrapped the schema in markdown.
+    assert spec.reasoning, "EnvIntentSpec.reasoning must be populated"
diff --git a/isaaclab_arena/tests/test_structured_output_utils.py b/isaaclab_arena/tests/test_structured_output_utils.py
new file mode 100644
index 000000000..458a78643
--- /dev/null
+++ b/isaaclab_arena/tests/test_structured_output_utils.py
@@ -0,0 +1,474 @@
+# Copyright (c) 2026, The Isaac Lab Arena Project Developers (https://github.com/isaac-sim/IsaacLab-Arena/blob/main/CONTRIBUTORS.md).
+# All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""Unit tests for :mod:`isaaclab_arena.environments.agentic_env_gen.structured_output_utils`.
+
+The utility module owns three concerns (plus the cheap ``ping`` liveness
+probe) that decouple "is this endpoint compatible with our
+structured-outputs contract?" from the agent's higher-level pipeline:
+
+* ``build_strict_schema`` / ``apply_strict_constraints`` — schema
+  munging that walks every object node (``$defs``, nested arrays,
+  ``anyOf`` arms) and applies OpenAI strict-mode constraints. Locked
+  in here so a future pydantic version that changes default schema
+  output doesn't silently regress Bedrock compatibility.
+* ``extract_response_text`` — the NVIDIA-DeepSeek-vs-OpenAI channel
+  fallback (``content`` first, then ``reasoning_content``,
+  ``"empty"`` last).
+* ``check_structured_output_support`` — the deployment validator's
+  diagnostic probe. Tested both with mocks (failure-mode coverage)
+  and against the real default model (so we notice the day
+  NVIDIA's hosted DeepSeek-v4-flash drops structured-output
+  support).
+"""
+
+from __future__ import annotations
+
+import json
+import os
+from unittest.mock import MagicMock
+
+import pytest
+from pydantic import BaseModel
+
+from isaaclab_arena.environments.agentic_env_gen.env_gen_agent import DEFAULT_BASE_URL, DEFAULT_MODEL
+from isaaclab_arena.environments.agentic_env_gen.env_intent_spec import EnvIntentSpec
+from isaaclab_arena.environments.agentic_env_gen.structured_output_utils import (
+    apply_strict_constraints,
+    build_strict_schema,
+    check_structured_output_support,
+    extract_response_text,
+    ping,
+)
+
+# ---------------------------------------------------------------------------
+# Fixtures + helpers
+# ---------------------------------------------------------------------------
+
+
+def _chat_response(content: str | None = None, reasoning_content: str | None = None, finish_reason: str = "stop"):
+    """Return a mock shaped like an openai chat completion that carries exactly one choice."""
+    # Populate the single choice first, then hang it off the response mock.
+    choice = MagicMock()
+    choice.finish_reason = finish_reason
+    choice.message.content = content
+    choice.message.reasoning_content = reasoning_content
+    return MagicMock(choices=[choice])
+
+
+# Minimal EnvIntentSpec payload exercising every required field plus one
+# task — reused across the ``check_structured_output_support`` happy-path
+# tests so they exercise the real production schema rather than a toy stub.
+_MINIMAL_SPEC: dict = {
+    "reasoning": (
+        "User wants a pick-and-place: foreground object is 'avocado', "
+        "target container is 'bowl', background is the kitchen table."
+    ),
+    "task_description": "pick up the avocado and place it in the bowl",
+    "background": "kitchen",
+    "embodiment": "franka_ik",
+    "items": [
+        {"query": "avocado", "role": "foreground", "category_tags": [], "instance_name": None, "scale": None},
+        {"query": "bowl", "role": "foreground", "category_tags": [], "instance_name": None, "scale": None},
+    ],
+    "initial_scene_graph": [
+        {"kind": "on", "subject": "avocado", "target": "kitchen"},
+        {"kind": "on", "subject": "bowl", "target": "kitchen"},
+    ],
+    "tasks": [{
+        "kind": "pick_and_place",
+        "subject": "avocado",
+        "target": "bowl",
+        "description": "pick up the avocado and place it in the bowl",
+    }],
+}
+
+
+# ---------------------------------------------------------------------------
+# build_strict_schema / apply_strict_constraints
+# ---------------------------------------------------------------------------
+
+
+class _ToyChild(BaseModel):
+    name: str
+    optional_value: int | None = None
+
+
+class _ToyParent(BaseModel):
+    title: str
+    child: _ToyChild
+    children: list[_ToyChild] = []
+
+
+class TestBuildStrictSchema:
+    def test_root_object_additional_properties_false(self):
+        schema = build_strict_schema(_ToyParent)
+        assert schema["additionalProperties"] is False
+
+    def test_root_object_lists_every_property_as_required(self):
+        schema = build_strict_schema(_ToyParent)
+        assert set(schema["required"]) == {"title", "child", "children"}
+
+    def test_nested_defs_object_also_strict(self):
+        # OpenAI strict mode applies the constraint to *every* object node,
+        # not just the top level — including ``$defs`` entries that get
+        # referenced via ``$ref``. Bedrock in particular rejects the request
+        # if any descendant object schema is missing the marker.
+        schema = build_strict_schema(_ToyParent)
+        defs = schema["$defs"]
+        assert defs["_ToyChild"]["additionalProperties"] is False
+        assert set(defs["_ToyChild"]["required"]) == {"name", "optional_value"}
+
+    def test_defaults_stripped_everywhere(self):
+        # Pydantic emits ``"default": null`` for ``optional_value`` at the
+        # property level; strict mode rejects ``default`` since every field
+        # is required. Drop the key recursively.
+        schema = build_strict_schema(_ToyParent)
+        stack = [schema]
+        while stack:
+            node = stack.pop()
+            if isinstance(node, dict):
+                assert "default" not in node, f"unexpected default key in {node!r}"
+                stack.extend(node.values())
+            elif isinstance(node, list):
+                stack.extend(node)
+
+    def test_munging_does_not_mutate_pydantic_cached_schema(self):
+        # Pydantic caches ``model_json_schema()`` results internally; our
+        # munger MUST work on a deep copy so the cache stays clean for
+        # other callers (e.g. anything else reading ``model_json_schema()``).
+        before = json.dumps(_ToyParent.model_json_schema(), sort_keys=True)
+        build_strict_schema(_ToyParent)
+        after = json.dumps(_ToyParent.model_json_schema(), sort_keys=True)
+        assert before == after
+
+    def test_apply_strict_constraints_is_idempotent(self):
+        # Safe to call multiple times — the second pass must be a no-op.
+        # Important because callers may receive an already-munged schema
+        # from a cache and re-apply defensively.
+        schema = build_strict_schema(_ToyParent)
+        snapshot = json.dumps(schema, sort_keys=True)
+        apply_strict_constraints(schema)
+        assert json.dumps(schema, sort_keys=True) == snapshot
+
+    def test_env_intent_spec_munges_clean(self):
+        # The real production schema we ship — confirm every object node
+        # has the strict-mode marker so the wire stays compatible with
+        # Bedrock and any other strict-mode validator users point at.
+        schema = build_strict_schema(EnvIntentSpec)
+
+        def assert_strict(node):
+            if isinstance(node, dict):
+                if node.get("type") == "object" and "properties" in node:
+                    assert node.get("additionalProperties") is False
+                    assert set(node["required"]) == set(node["properties"].keys())
+                for v in node.values():
+                    assert_strict(v)
+            elif isinstance(node, list):
+                for v in node:
+                    assert_strict(v)
+
+        assert_strict(schema)
+
+
+# ---------------------------------------------------------------------------
+# extract_response_text
+# ---------------------------------------------------------------------------
+
+
+class TestExtractResponseText:
+    def test_prefers_content_when_both_populated(self):
+        # Both channels carry text here; the expected winner is ``content``.
+        message = MagicMock(content='{"a": 1}', reasoning_content='{"b": 2}')
+        payload, channel = extract_response_text(message)
+        assert (payload, channel) == ('{"a": 1}', "content")
+
+    def test_falls_back_to_reasoning_content_when_content_empty(self):
+        # With ``content`` set to ``None`` the structured output has to be
+        # picked up from the provider-specific ``reasoning_content``
+        # channel instead — the envelope shape NVIDIA DeepSeek-v4-flash
+        # produces.
+        message = MagicMock(content=None, reasoning_content='{"b": 2}')
+        payload, channel = extract_response_text(message)
+        assert (payload, channel) == ('{"b": 2}', "reasoning_content")
+
+    def test_empty_when_both_channels_blank(self):
+        # Nothing on either channel: empty text, tagged with the "empty" route.
+        message = MagicMock(content=None, reasoning_content=None)
+        payload, channel = extract_response_text(message)
+        assert (payload, channel) == ("", "empty")
+
+    def test_empty_when_message_has_no_attrs(self):
+        # A message object that defines neither channel attribute must be
+        # reported as "empty" instead of blowing up with AttributeError.
+        # A bare ``object()`` is the smallest stand-in for such a stub:
+        # it carries no attributes at all.
+        message = object()
+        payload, channel = extract_response_text(message)
+        assert (payload, channel) == ("", "empty")
+
+    def test_treats_empty_string_as_falsy(self):
+        # An empty-string ``content`` has to behave like ``None`` and hand
+        # over to the fallback; otherwise a populated
+        # ``reasoning_content`` would never be read when ``content``
+        # comes back as ``""``.
+        message = MagicMock(content="", reasoning_content='{"b": 2}')
+        payload, channel = extract_response_text(message)
+        assert (payload, channel) == ('{"b": 2}', "reasoning_content")
+
+
+# ---------------------------------------------------------------------------
+# ping
+# ---------------------------------------------------------------------------
+
+
+class TestPing:
+    def test_returns_response_content(self):
+        client = MagicMock()
+        client.chat.completions.create.return_value = _chat_response(content="OK")
+        assert ping(client, "any-model") == "OK"
+
+    def test_returns_empty_string_when_content_is_none(self):
+        # Some providers return ``None`` content alongside a finish_reason — we
+        # treat that as a successful round-trip (the wire works) rather than
+        # raising, since the caller's contract is "did this raise?".
+        client = MagicMock()
+        client.chat.completions.create.return_value = _chat_response(content=None)
+        assert ping(client, "any-model") == ""
+
+    def test_uses_minimal_request_params(self):
+        client = MagicMock()
+        client.chat.completions.create.return_value = _chat_response(content="OK")
+        ping(client, "model-name")
+        kwargs = client.chat.completions.create.call_args.kwargs
+        assert kwargs["model"] == "model-name"
+        assert kwargs["temperature"] == 0
+        assert kwargs["max_tokens"] == 8
+        # Single user message — no system prompt / catalog payload. Keeping the
+        # request small is the whole point: ping must stay cheap enough to
+        # gate every agent construction.
+        assert len(kwargs["messages"]) == 1
+        assert kwargs["messages"][0]["role"] == "user"
+        # ping is a structured-outputs-agnostic liveness check; it must NOT
+        # ask the model to honour response_format (otherwise it can't fail
+        # gracefully on models that lack structured-output support, which
+        # defeats the point of having a cheap probe).
+        assert "response_format" not in kwargs
+
+    def test_propagates_client_exceptions(self):
+        class FakeAuthError(Exception):
+            pass
+
+        client = MagicMock()
+        client.chat.completions.create.side_effect = FakeAuthError("invalid api key")
+        with pytest.raises(FakeAuthError, match="invalid api key"):
+            ping(client, "m")
+
+
+# ---------------------------------------------------------------------------
+# check_structured_output_support (mocked)
+# ---------------------------------------------------------------------------
+
+
+class TestCheckStructuredOutputSupport:
+    """Bool-or-raise contract: returns True on a clean round-trip, raises
+    ``RuntimeError`` with a multi-line diagnostic on every failure mode.
+
+    Taken together, the failure-mode tests pin three things:
+      1. ``RuntimeError`` (not the original SDK exception) reaches the
+         caller — so callers have a single exception type to catch.
+      2. The model name appears in the message (the most-grepped field).
+      3. The ``cause`` field carries the upstream classifier
+         (``BadRequestError`` vs ``JSONDecodeError`` vs ``ValidationError``)
+         so the failure attribution survives the wrapping.
+
+    Where the underlying SDK / parser exception is preserved on
+    ``__cause__``, we assert that too — it's what makes
+    ``raise ... from exc`` worth doing.
+    """
+
+    def test_returns_true_on_valid_envelope(self):
+        client = MagicMock()
+        client.chat.completions.create.return_value = _chat_response(content=json.dumps(_MINIMAL_SPEC))
+        # The whole public contract collapses to: ``True`` or it raises.
+        # ``is True`` rather than truthy so a future regression that
+        # returns a dict/tuple/etc fails this test.
+        assert check_structured_output_support(client, "some-model", EnvIntentSpec) is True
+
+    def test_returns_true_on_reasoning_content_envelope(self):
+        # NVIDIA DeepSeek envelope — content empty, structured output
+        # on the ``reasoning_content`` channel. Must NOT raise; the
+        # ``extract_response_text`` fallback handles this transparently.
+        # The previous dataclass surfaced ``response_route`` so callers
+        # could distinguish; the new API hides that detail (callers
+        # don't need it — both channels are equivalent for our purposes).
+        client = MagicMock()
+        client.chat.completions.create.return_value = _chat_response(
+            content=None, reasoning_content=json.dumps(_MINIMAL_SPEC)
+        )
+        assert check_structured_output_support(client, "deepseek", EnvIntentSpec) is True
+
+    def test_raises_on_4xx_with_underlying_exception_chained(self):
+        # The most common "model doesn't support structured outputs"
+        # signal at the wire level: a 4xx rejecting ``response_format``
+        # or the schema. The original SDK exception must reach the
+        # caller via ``__cause__`` so the traceback retains the HTTP
+        # status / body — otherwise debugging "why did construction
+        # fail?" requires re-running locally.
+        class FakeBadRequest(Exception):
+            pass
+
+        client = MagicMock()
+        original = FakeBadRequest("Error code: 400 - additionalProperties")
+        client.chat.completions.create.side_effect = original
+        with pytest.raises(RuntimeError, match="does not support structured outputs") as exc_info:
+            check_structured_output_support(client, "claude", EnvIntentSpec)
+        msg = str(exc_info.value)
+        # Model name surfaces (most-grepped field) and the cause type
+        # classifies the failure (4xx wire error, not parse / validation).
+        assert "'claude'" in msg
+        assert "FakeBadRequest" in msg
+        assert "400" in msg
+        # On an api_error there's no response payload to echo.
+        assert "sample_payload = None" in msg
+        # Exception chaining preserves the original for traceback drill-down.
+        assert exc_info.value.__cause__ is original
+
+    def test_raises_on_empty_envelope(self):
+        # Wire accepts the request, model produces nothing on either
+        # channel. The endpoint silently dropped the structured output
+        # — the most insidious failure mode, since ``finish_reason``
+        # still reads ``stop``. No underlying exception to chain.
+        client = MagicMock()
+        client.chat.completions.create.return_value = _chat_response(content=None, reasoning_content=None)
+        with pytest.raises(RuntimeError, match="does not support structured outputs") as exc_info:
+            check_structured_output_support(client, "broken", EnvIntentSpec)
+        msg = str(exc_info.value)
+        assert "empty envelope" in msg
+        # finish_reason forwarded so the operator can correlate with
+        # provider logs (was it a content-filter stop, a length cap, etc.).
+        assert "finish_reason  = 'stop'" in msg
+        # No upstream exception to chain on this branch (the function
+        # itself synthesises the failure from a structurally-OK response).
+        assert exc_info.value.__cause__ is None
+
+    def test_raises_when_choices_list_is_empty(self):
+        # Real provider behaviour: HTTP returns 200 OK but ``choices`` is
+        # an empty list. Seen on Azure when a content-filter trips, and
+        # on Bedrock when a guardrail rejects the response post-hoc.
+        # Naive ``resp.choices[0]`` access would IndexError and break
+        # the contract — surface it as a structured RuntimeError with
+        # a distinct ``cause`` message that operators can tell apart
+        # from the "envelope returned but content empty" case.
+        resp = MagicMock()
+        resp.choices = []
+        client = MagicMock()
+        client.chat.completions.create.return_value = resp
+        with pytest.raises(RuntimeError, match="does not support structured outputs") as exc_info:
+            check_structured_output_support(client, "guardrailed", EnvIntentSpec)
+        msg = str(exc_info.value)
+        assert "no choices" in msg
+        assert "response_route = 'empty'" in msg
+
+    def test_raises_on_invalid_json_with_payload_preview(self):
+        # The JSON-decode failure is the case where ``sample_payload``
+        # earns its keep — without it the operator sees only
+        # "Expecting value: line 1 column 1" and has to re-run locally
+        # to discover the model emitted a prose preamble. With the
+        # preview in the message the failure is debuggable from CI logs.
+        client = MagicMock()
+        client.chat.completions.create.return_value = _chat_response(content="not json")
+        with pytest.raises(RuntimeError, match="does not support structured outputs") as exc_info:
+            check_structured_output_support(client, "m", EnvIntentSpec)
+        msg = str(exc_info.value)
+        assert "JSONDecodeError" in msg
+        assert "'not json'" in msg  # the literal response preview
+        # Original JSONDecodeError preserved on ``__cause__``.
+        assert exc_info.value.__cause__ is not None
+        assert type(exc_info.value.__cause__).__name__ == "JSONDecodeError"
+
+    def test_raises_on_validation_failure_with_payload_preview(self):
+        # JSON parses fine, but doesn't match the schema. The probe
+        # exists to detect this exact class of "model returns
+        # something, but it's wrong" failure. The original
+        # ValidationError chains via ``__cause__`` so ``.errors()``
+        # is still reachable for callers that want the structured
+        # error list.
+        from pydantic import ValidationError
+
+        client = MagicMock()
+        client.chat.completions.create.return_value = _chat_response(content='{"missing": "fields"}')
+        with pytest.raises(RuntimeError, match="does not support structured outputs") as exc_info:
+            check_structured_output_support(client, "m", EnvIntentSpec)
+        msg = str(exc_info.value)
+        assert "ValidationError" in msg
+        assert '{"missing": "fields"}' in msg  # payload preview echoed
+        assert isinstance(exc_info.value.__cause__, ValidationError)
+
+    def test_request_shape(self):
+        client = MagicMock()
+        client.chat.completions.create.return_value = _chat_response(content=json.dumps(_MINIMAL_SPEC))
+        check_structured_output_support(client, "model-name", EnvIntentSpec)
+        kwargs = client.chat.completions.create.call_args.kwargs
+        assert kwargs["model"] == "model-name"
+        assert kwargs["temperature"] == 0
+        assert kwargs["response_format"]["type"] == "json_schema"
+        assert kwargs["response_format"]["json_schema"]["name"] == "EnvIntentSpec"
+        assert kwargs["response_format"]["json_schema"]["strict"] is True
+        # The schema sent on the wire must already be munged for strict mode
+        # — otherwise Bedrock rejects with 400. Spot-check the root marker.
+        sent_schema = kwargs["response_format"]["json_schema"]["schema"]
+        assert sent_schema["additionalProperties"] is False
+
+    def test_accepts_alternative_spec_class(self):
+        # Callers can probe with a smaller toy spec for cheap model
+        # surveys — the probe shouldn't be hard-wired to EnvIntentSpec.
+        class TinySpec(BaseModel):
+            ok: bool
+
+        client = MagicMock()
+        client.chat.completions.create.return_value = _chat_response(content='{"ok": true}')
+        assert check_structured_output_support(client, "m", TinySpec) is True
+        kwargs = client.chat.completions.create.call_args.kwargs
+        assert kwargs["response_format"]["json_schema"]["name"] == "TinySpec"
+
+
+# ---------------------------------------------------------------------------
+# Live endpoint (opt-in, network + auth required)
+# ---------------------------------------------------------------------------
+
+
+# The probe hits a real model on every run. NVIDIA's hosted DeepSeek-v4-flash
+# is intermittently quirky under structured outputs (occasional blank
+# ``content``, transient 429 / 5xx from the proxy, etc.); a single failed
+# attempt does NOT mean the deployment is actually broken. Allow up to 2
+# reruns so a transient blip doesn't fail CI. Real breakage will fail all 3.
+# TODO(qianl): drop the flaky marker once production-side retry is wired
+# into ``check_structured_output_support`` (see TODO in structured_output_utils.py).
+@pytest.mark.flaky(max_runs=3, min_passes=1)
+@pytest.mark.agent_remote_e2e
+def test_default_model_supports_structured_output():
+    """The default ``EnvGenAgent`` model must support structured outputs.
+
+    This is the gating contract of the whole agent: ``generate_spec``
+    is structured-outputs-only, so the default
+    ``DEFAULT_MODEL`` / ``DEFAULT_BASE_URL`` pair must pass the probe.
+    Failing here means production env-gen is broken — usually because
+    NVIDIA changed which channel DeepSeek-v4-flash routes structured
+    outputs into, or pulled the model from the default-models
+    catalogue.
+
+    The probe's ``RuntimeError`` already carries a multi-line
+    diagnostic (model / route / finish_reason / cause /
+    sample_payload), so test-failure output is self-describing — no
+    extra error-message construction needed here.
+    """
+    api_key = os.environ.get("NV_API_KEY")
+    assert api_key, "NV_API_KEY env var required to run live tests"
+
+    from openai import OpenAI
+
+    client = OpenAI(api_key=api_key, base_url=DEFAULT_BASE_URL)
+    assert check_structured_output_support(client, DEFAULT_MODEL, EnvIntentSpec) is True
diff --git a/pytest.ini b/pytest.ini
index d9d330ca9..2c26acc38 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -5,3 +5,4 @@ markers =
     with_newton: test uses Newton physics
     gr00t_policy: test exercises GR00T policy/data code that runs in the base container (lightweight gr00t deps only)
     gr00t_remote_e2e: test requires a live GR00T remote policy server
+    agent_remote_e2e: test requires a live OpenAI-compatible chat-completions endpoint (needs NV_API_KEY)
diff --git a/setup.py b/setup.py
index 82cd92b56..b8afc5af7 100644
--- a/setup.py
+++ b/setup.py
@@ -14,6 +14,11 @@
     "vuer[all]",
     "lightwheel-sdk",
     "pytest",
+    # Used lazily by isaaclab_arena/environments/agentic_env_gen/* for NV_API_KEY-based agent calls.
+    "openai",
+    # Hard dependency of isaaclab_arena/environments/agentic_env_gen/env_intent_spec.py (BaseModel / Field /
+    # model_validator imported at module load — not lazy).
+    "pydantic>=2.0",
 ]
 
 DEV_DEPS = [