diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 20cbdbddc..90d536ce7 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -156,7 +156,8 @@ jobs: - name: Run in-process PhysX tests without cameras run: | - /isaac-sim/python.sh -m pytest -sv --durations=0 -m "not with_cameras and not with_subprocess and not with_newton" \ + /isaac-sim/python.sh -m pytest -sv --durations=0 \ + -m "not with_cameras and not with_subprocess and not with_newton and not agent_remote_e2e" \ isaaclab_arena/tests/ - name: Run GR00T policy/data tests (lightweight gr00t deps only) @@ -265,6 +266,53 @@ jobs: isaaclab_arena_gr00t/tests/test_gr00t_remote_closedloop_policy_runner.py + test_agent_remote_e2e: + name: Agent remote E2E + runs-on: [self-hosted, gpu-arena] + timeout-minutes: 20 + needs: [pre_commit] + env: + # NV_API_KEY is the variable EnvGenAgent reads at runtime (see + # isaaclab_arena/environments/agentic_env_gen/env_gen_agent.py). The repo-level secret is + # named ARENA_NV_API_KEY to mirror ARENA_NGC_API_KEY and avoid + # collisions with other consumers of NV_API_KEY in the runner env. + NV_API_KEY: ${{ secrets.ARENA_NV_API_KEY }} + + container: + image: nvcr.io/nvstaging/isaac-amr/isaaclab_arena:latest + credentials: + username: $oauthtoken + password: ${{ env.NGC_API_KEY }} + + steps: + # No nvidia-smi / kit cache setup: this job is pure-Python (openai + + # pydantic) and never touches Isaac Sim. We reuse the arena image + # only because it already has the deps + the /isaac-sim/python.sh + # interpreter the rest of the suite calls into. + - *install_git_step + - *cleanup_step + - *mark_repo_safe_step + - *checkout_step + - *git_lfs_step + - *install_project_step + + # Fail loudly when the secret isn't wired up — the test itself + # ``skipif``s when NV_API_KEY is empty, so without this guard a + # missing secret would silently produce a green job with zero agent + # coverage. 
+ - name: Verify ARENA_NV_API_KEY is configured + run: | + if [ -z "${NV_API_KEY}" ]; then + echo "::error::ARENA_NV_API_KEY repo secret is not set; cannot run agent_remote_e2e tests." + exit 1 + fi + + - name: Run agent remote E2E test + run: | + /isaac-sim/python.sh -m pytest -sv --durations=0 -m agent_remote_e2e \ + isaaclab_arena/tests/test_env_gen_agent.py + + build_docs_pre_merge: name: Build the docs (pre-merge) runs-on: [self-hosted, gpu-arena] diff --git a/docker/run_docker.sh b/docker/run_docker.sh index baeb038cc..c8af3e7ce 100755 --- a/docker/run_docker.sh +++ b/docker/run_docker.sh @@ -185,6 +185,12 @@ else fi fi + # pass through API keys used by the agentic env-gen prototype; values are + # inherited from the host shell so the key never lives in the repo. + if [ -n "$NV_API_KEY" ]; then + DOCKER_RUN_ARGS+=("--env" "NV_API_KEY") + fi + # if gr00t is installed, mount the gr00t directory in case anything needs to change there if [ "$INSTALL_GROOT" = "true" ]; then DOCKER_RUN_ARGS+=("-v" "./submodules/Isaac-GR00T:${WORKDIR}/submodules/Isaac-GR00T") diff --git a/isaaclab_arena/environments/agentic_env_gen/__init__.py b/isaaclab_arena/environments/agentic_env_gen/__init__.py new file mode 100644 index 000000000..16ea4c218 --- /dev/null +++ b/isaaclab_arena/environments/agentic_env_gen/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) 2026, The Isaac Lab Arena Project Developers (https://github.com/isaac-sim/IsaacLab-Arena/blob/main/CONTRIBUTORS.md). +# All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 diff --git a/isaaclab_arena/environments/agentic_env_gen/env_gen_agent.py b/isaaclab_arena/environments/agentic_env_gen/env_gen_agent.py new file mode 100644 index 000000000..8cb597b1d --- /dev/null +++ b/isaaclab_arena/environments/agentic_env_gen/env_gen_agent.py @@ -0,0 +1,267 @@ +# Copyright (c) 2026, The Isaac Lab Arena Project Developers (https://github.com/isaac-sim/IsaacLab-Arena/blob/main/CONTRIBUTORS.md). +# All rights reserved. 
DEFAULT_BASE_URL = "https://inference-api.nvidia.com"
DEFAULT_MODEL = "nvidia/deepseek-ai/deepseek-v4-flash"


def build_catalog_text() -> str:
    """Introspect AssetRegistry and build the vocabulary the agent is allowed to use.

    Every registered asset is bucketed by the first matching tag, checked in
    the order ``embodiment``, ``background``, ``object``. Assets carrying none
    of those tags are left out of the catalog.

    Returns:
        Three blank-line-separated sections: ``EMBODIMENTS`` and
        ``BACKGROUNDS`` as sorted, comma-separated names, followed by
        ``OBJECTS (<count>)`` with one ``- <name> tags=[...]`` line per
        object, sorted by name, with the redundant ``object`` tag removed.
    """
    # Imported lazily so importing this module does not pull in the asset
    # registry (and whatever it depends on) until a catalog is requested.
    from isaaclab_arena.assets.registries import AssetRegistry

    registry = AssetRegistry()
    backgrounds: list[str] = []
    objects: list[dict] = []
    embodiments: list[str] = []
    for name in registry.get_all_keys():
        cls = registry.get_asset_by_name(name)
        # ``or []`` also covers assets that declare ``tags = None`` rather
        # than omitting the attribute; ``list(None)`` would raise TypeError.
        tags = list(getattr(cls, "tags", None) or [])
        if "embodiment" in tags:
            embodiments.append(name)
        elif "background" in tags:
            backgrounds.append(name)
        elif "object" in tags:
            objects.append({"name": name, "tags": [t for t in tags if t != "object"]})

    obj_lines = "\n".join(f"- {o['name']} tags={o['tags']}" for o in sorted(objects, key=lambda o: o["name"]))
    return (
        f"EMBODIMENTS: {', '.join(sorted(embodiments))}\n\n"
        f"BACKGROUNDS: {', '.join(sorted(backgrounds))}\n\n"
        f"OBJECTS ({len(objects)}):\n{obj_lines}"
    )


class EnvGenAgent:
    """Parses a natural-language env-generation prompt into an EnvIntentSpec.

    The agent is **structured-outputs only**: every call to
    ``generate_spec`` passes ``response_format={"type": "json_schema",
    ...}`` to the chat-completions endpoint, and the response is
    parsed directly as JSON. There is no prose / markdown-fence
    fallback — if the configured model/endpoint doesn't honour
    ``response_format``, the constructor raises before the agent is
    usable.
    """

    def __init__(
        self,
        api_key: str | None = None,
        model: str = DEFAULT_MODEL,
        base_url: str = DEFAULT_BASE_URL,
    ):
        """Configure the OpenAI-compatible client and validate the model.

        Construction runs two fail-fast wire checks in order:

        1. :func:`.structured_output_utils.ping` — cheap liveness
           probe (no ``response_format``). Confirms the API key
           authenticates, the model name resolves at ``base_url``,
           and the network path is reachable.
        2. :func:`.structured_output_utils.check_structured_output_support`
           — sends ``response_format=json_schema`` with the
           ``EnvIntentSpec`` schema and asserts a valid envelope
           comes back. Confirms the model actually honours the
           structured-outputs contract ``generate_spec`` relies on.

        Both run at construction time so a misconfigured model fails
        immediately with a clear stack — not mid-pipeline inside the
        first ``generate_spec`` call.

        Args:
            api_key: Bearer token for the inference endpoint. Falls back
                to the ``NV_API_KEY`` environment variable when ``None``
                (or empty); raises ``ValueError`` if neither is set.
            model: Model identifier as understood by the endpoint at
                ``base_url`` (e.g. ``"nvidia/deepseek-ai/deepseek-v4-flash"``).
                See https://build.nvidia.com for the catalogue of NVIDIA-hosted
                models. Must support OpenAI-compatible structured
                outputs (``response_format=json_schema``) — the
                constructor validates this and refuses to proceed
                otherwise.
            base_url: OpenAI-compatible API root. Defaults to
                ``DEFAULT_BASE_URL`` (NVIDIA's hosted inference endpoint);
                override to point at a self-hosted vLLM / Ollama / etc.
                deployment that exposes the same OpenAI chat-completions
                wire format.

        Raises:
            ValueError: when no API key is available (neither argument
                nor ``NV_API_KEY`` env var).
            RuntimeError: when the configured model does not support
                structured outputs (probe came back unsupported).
            Any exception raised by the underlying ``openai`` client
                during the ping probe — typically
                ``AuthenticationError`` (bad key), ``NotFoundError``
                (wrong model), ``APIConnectionError`` (unreachable
                endpoint), or ``RateLimitError`` (quota exhausted).
        """
        from openai import OpenAI

        self.api_key = api_key or os.getenv("NV_API_KEY")
        # Use an explicit raise instead of ``assert`` so the guard survives
        # ``python -O`` (which strips asserts) — missing-key failures must be
        # loud regardless of interpreter flags.
        if not self.api_key:
            raise ValueError("API key required: set NV_API_KEY or pass api_key.")
        self.model = model
        self.client = OpenAI(api_key=self.api_key, base_url=base_url)
        # Cached on the instance because ``generate_spec`` may be called many
        # times; the schema is munged once per agent lifetime.
        self._spec_schema = build_strict_schema(EnvIntentSpec)

        # 1) Cheap liveness probe first. If the wire is down or the key is
        # bad we don't want to waste tokens on the heavier structured-output
        # probe below.
        ping(self.client, self.model)

        # 2) Structured-output capability check. ``generate_spec`` is
        # structured-outputs-only, so a model that can't honour
        # ``response_format=json_schema`` is unusable for this agent. The
        # probe raises ``RuntimeError`` with its own multi-line diagnostic,
        # so no caller-side wrapping is needed.
        check_structured_output_support(self.client, self.model, EnvIntentSpec)

    def generate_spec(
        self,
        prompt: str,
        catalog_text: str | None = None,
        temperature: float = 0.2,
        max_tokens: int = 2000,
    ) -> tuple[EnvIntentSpec, str]:
        """Call the model and return the parsed EnvIntentSpec plus the raw response.

        Uses OpenAI-compatible structured outputs: the request includes
        ``response_format={"type": "json_schema", ...}`` with the
        EnvIntentSpec schema, and the response is parsed directly as
        JSON. No prose / markdown-fence fallback.

        Args:
            prompt: Natural-language env description from the end user.
                Concatenated with the asset catalog to form the chat
                ``user`` message.
            catalog_text: Pre-built asset vocabulary (the output of
                ``build_catalog_text()``). Only ``None`` triggers a rebuild
                from the live ``AssetRegistry``; any string — including an
                empty one — is sent as given. Pass an explicit value to
                (a) avoid the cost of rebuilding it across repeated calls,
                or (b) experiment with a restricted / augmented catalog
                without mutating the registry.
            temperature: Sampling temperature forwarded to the model. Kept
                low by default (0.2) because EnvIntentSpec generation is a
                deterministic-ish translation task.
            max_tokens: Hard cap on the response length. Set generously
                (2000) so multi-task EnvIntentSpecs aren't truncated
                mid-JSON; shrink if the endpoint enforces a tighter
                quota.

        Returns:
            A ``(EnvIntentSpec, raw_response)`` tuple. The raw text is
            useful for debugging and for inspecting what the model sent.

        Raises:
            RuntimeError: when the response carries no ``choices`` at all,
                or when the first choice is empty on both the ``content``
                and ``reasoning_content`` channels. Either indicates the
                endpoint or model did not deliver a structured-outputs
                envelope — run
                :func:`.structured_output_utils.check_structured_output_support`
                to confirm.
            json.JSONDecodeError: when the model returned non-JSON
                text despite the structured-outputs guarantee.
            pydantic.ValidationError: when the parsed JSON is
                well-formed but does not validate against EnvIntentSpec
                (e.g. a missing required field or a ``kind`` outside the
                allowed literals).
        """
        # ``is None`` rather than truthiness: an explicitly empty catalog is
        # a legitimate caller choice and must not be silently replaced.
        if catalog_text is None:
            catalog_text = build_catalog_text()
        system = self._system_prompt()
        user = f"{catalog_text}\n\nUSER PROMPT:\n{prompt}"

        # TODO(qianl): wrap with transient-error retry (exponential backoff
        # + jitter) for ``APIConnectionError`` / ``APITimeoutError`` / 429
        # / 5xx, plus self-correction on ``pydantic.ValidationError`` (feed
        # the .errors() report back to the model so it can fix the violation
        # on retry). Deterministic 4xx errors must still propagate
        # immediately. Until then, ``test_generate_spec_against_live_endpoint``
        # carries ``@pytest.mark.flaky`` to absorb transport-layer hiccups
        # at the test layer.
        resp = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": system},
                {"role": "user", "content": user},
            ],
            response_format={
                "type": "json_schema",
                "json_schema": {"name": "EnvIntentSpec", "strict": True, "schema": self._spec_schema},
            },
            temperature=temperature,
            max_tokens=max_tokens,
        )
        # Mirror the probe's handling of a response without candidates so the
        # caller gets the documented RuntimeError instead of a bare IndexError.
        choices = getattr(resp, "choices", None) or []
        if not choices:
            raise RuntimeError(
                f"Model {self.model!r} returned a response with no choices "
                "(zero candidates emitted), so there is no structured-outputs envelope to parse."
            )
        text, route = extract_response_text(choices[0].message)
        if route == "empty":
            raise RuntimeError(
                f"Model {self.model!r} returned an empty structured-outputs envelope. "
                "Run check_structured_output_support() to verify the endpoint/model "
                "actually honours response_format=json_schema."
            )
        # ``strict=False`` lets json.loads accept unescaped control characters
        # (e.g. literal tabs) inside JSON strings, which is why the text is
        # decoded here and handed to ``model_validate`` rather than passed to
        # ``model_validate_json``.
        data = json.loads(text, strict=False)
        spec = EnvIntentSpec.model_validate(data)
        return spec, text

    def _system_prompt(self) -> str:
        """Return the system message carrying cross-cutting rules and few-shot task examples."""
        # Per-field guidance (what each field means, enum members, default
        # behaviours) lives on the ``Field(description=...)`` entries in
        # env_intent_spec.py and is surfaced to the agent via the SCHEMA
        # the structured-outputs API embeds in every request. Only
        # cross-cutting rules and few-shot examples belong here. The
        # "emit ONLY JSON" instruction is intentionally absent —
        # structured outputs enforce the envelope at the wire level.
        return (
            "You are an env-generation parser for robot manipulation tasks.\n"
            "Convert a natural-language prompt into an EnvIntentSpec.\n\n"
            "GUIDANCE:\n"
            "- Follow the per-field ``description`` strings in the schema for what each field expects.\n"
            "- If the prompt does not specify a value for an optional field, output null.\n"
            " Do NOT hallucinate values — the resolver tolerates nulls; it cannot fix invented data.\n"
            "- Articulated objects (microwave, fridge, cabinet) still need a spatial\n"
            " 'on(<object>, background)' relation in initial_scene_graph to anchor them; their\n"
            " open/close behaviour is expressed via tasks, not via relations.\n"
            "- Distractor items around the appliance need 'on(distractor, background)' relations\n"
            " in initial_scene_graph as well.\n"
            "- Task examples (showing kind + subject + target + description shape):\n"
            ' * Pick-and-place: {"kind": "pick_and_place", "subject": "avocado", "target": "bowl",\n'
            ' "description": "pick up the avocado and place it in the bowl"}\n'
            ' * Open door: {"kind": "open_door", "subject": "microwave", "target": null,\n'
            ' "description": "open the microwave door"}\n'
            ' * Close door: {"kind": "close_door", "subject": "microwave", "target": null,\n'
            ' "description": "close the microwave door"}\n'
        )
a/isaaclab_arena/environments/agentic_env_gen/env_intent_spec.py b/isaaclab_arena/environments/agentic_env_gen/env_intent_spec.py new file mode 100644 index 000000000..4d3c9147d --- /dev/null +++ b/isaaclab_arena/environments/agentic_env_gen/env_intent_spec.py @@ -0,0 +1,204 @@ +# Copyright (c) 2026, The Isaac Lab Arena Project Developers (https://github.com/isaac-sim/IsaacLab-Arena/blob/main/CONTRIBUTORS.md). +# All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +"""Schema the agent must fill in when parsing a natural-language env-generation prompt. + +The agent sees a list of the *available* asset tags / embodiment names pulled +from the registries at call time, and must return an EnvIntentSpec that only uses +those vocabularies. Concrete asset names are resolved in a second, deterministic +step — the agent never invents USD paths. +""" + +from __future__ import annotations + +from typing import Literal + +from pydantic import BaseModel, Field + +# Relation kinds currently surfaced to the agent. Mirror the subset of +# ``ArenaEnvGraphSpatialConstraintType`` that makes sense for tabletop +# prompts; values must match the enum's values one-to-one because the +# resolver looks the constraint type up via +# ``ArenaEnvGraphSpatialConstraintType(kind)`` rather than maintaining a +# parallel dict. Solver-internal kinds (``position_limits``, +# ``random_around_solution``, ``rotate_around_solution``) are intentionally +# omitted — they describe how the placement solver explores poses and are +# not natural for an agent to emit. +# "in" has no In class in isaaclab_arena.relations.relations yet — see the +# TODO there. The downstream env builder materializes goal-state "in" +# relations as the task's success predicate. +RelationKind = Literal["on", "in", "next_to", "at_position", "at_pose", "is_anchor"] + +ItemRole = Literal["foreground", "distractor", "anchor"] + +# Task kinds the agent can propose as an atomic task. 
TaskKind = Literal["pick_and_place", "open_door", "close_door"]


class Item(BaseModel):
    """One object the agent wants in the scene."""

    query: str = Field(
        description=(
            "Short human name for the object as it appears in the prompt "
            "(e.g. 'avocado', 'bowl'). The downstream resolver fuzzy-matches "
            "this against the asset catalog — do NOT emit the exact "
            "registered name."
        ),
    )
    role: ItemRole = Field(
        description=(
            "Role the item plays in the env: 'foreground' for objects the "
            "task acts on; 'distractor' for extras mentioned as clutter; "
            "'anchor' for reference surfaces (rare — the background usually "
            "covers this)."
        ),
    )
    category_tags: list[str] = Field(
        default_factory=list,
        description=(
            "Tags that semantically narrow the query, preferring assets with "
            "those tags. PREFERENCE only, not a hard filter — the resolver "
            "falls back to the full catalog if the tag pool is empty or "
            "yields no close match. Err toward emitting useful tags."
        ),
    )
    instance_name: str | None = Field(
        default=None,
        description="Optional explicit instance label for the item; leave null if the prompt does not name one.",
    )
    scale: float | None = Field(
        default=None,
        description=(
            "Spawn scale. Leave null (the default) so the placement proposer "
            "auto-fits the asset; only set a positive float when the prompt "
            "explicitly demands a size override."
        ),
    )


class Relation(BaseModel):
    """A spatial / structural relation between items.

    Binary kinds (``on``, ``in``, ``next_to``) must set ``target`` to the
    other item — semantics is "subject is in relation to target". Unary kinds
    (``is_anchor``, ``at_position``, ``at_pose``) describe an intrinsic
    property of ``subject`` alone and must leave ``target`` as ``None``.

    NOTE(review): the unary/binary split is documented here and in the
    ``target`` description but is not enforced by a validator in this class.
    """

    kind: RelationKind = Field(
        description=(
            "Spatial relation only — articulated-state changes (open/close) are expressed via tasks, not via relations."
        ),
    )
    subject: str = Field(
        description="Item the relation applies to, named by its Item.query string or the background name.",
    )
    target: str | None = Field(
        default=None,
        # This text is embedded in the schema the model reads, so it must
        # agree with the class-level contract above: at_position / at_pose
        # are unary and take no target.
        description=(
            "The other item the relation is anchored on for binary kinds "
            "(on / in / next_to); leave null for unary kinds "
            "(is_anchor / at_position / at_pose)."
        ),
    )
    params: dict = Field(
        default_factory=dict,
        description="Optional kind-specific parameters; leave empty by default.",
    )

    def identity(self) -> tuple[str, str, str | None]:
        """Hashable identity for diffing scene graphs — ignores params."""
        return (self.kind, self.subject, self.target)


class Task(BaseModel):
    """One atomic task in the plan that transforms the env state."""

    kind: TaskKind = Field(description="The action to perform.")
    subject: str = Field(
        description=(
            "The primary object the task acts on, named by its Item.query string (e.g. 'avocado', 'microwave')."
        ),
    )
    target: str | None = Field(
        default=None,
        description=(
            "The secondary object or location, named by its Item.query "
            "string or the background name. Leave null for unary tasks "
            "(open_door / close_door)."
        ),
    )
    description: str = Field(
        description="Natural-language summary of the task (e.g. 'pick up the avocado and place it in the bowl').",
    )


class EnvIntentSpec(BaseModel):
    """Agent output — a structured "env intent" (blueprint) for the env and a list of tasks.

    Field-level guidance lives on the individual ``Field(description=...)``
    entries below and is surfaced to the agent via ``model_json_schema()``;
    only cross-cutting rules and few-shot examples are kept in the
    prompt text (see ``EnvGenAgent._system_prompt``).
    """

    # Forced chain-of-thought field, listed FIRST so the agent is asked to
    # emit its analysis before committing to any structured field (the
    # "think step by step then commit" pattern). It also aids debugging:
    # when a downstream resolver step fails, the reasoning trace shows
    # which step the model got wrong — without it, the only signal is the
    # malformed spec itself.
    reasoning: str = Field(
        description=(
            "Step-by-step analysis of the user prompt, written BEFORE the "
            "structured fields below. Identify (1) the task / intent, (2) "
            "the foreground objects the task acts on, (3) the background "
            "surface or scene, (4) any distractors. For each object, "
            "briefly justify the catalog query and tags you will pick. "
            "Resolve any ambiguity here before filling the structured "
            "fields — do not restate this analysis in ``task_description``."
        ),
    )
    task_description: str = Field(
        description="One-sentence natural-language summary of what the env exercises overall."
    )
    background: str = Field(
        description="Background asset name from the BACKGROUNDS catalog (e.g. 'maple_table_kitchen').",
    )
    embodiment: str = Field(
        default="franka_ik",
        description=(
            "Robot embodiment to control. Use a bare family name ('franka', "
            "'droid', 'g1', 'gr1') when the prompt does not specify a "
            "control mode — the resolver defaults each to its IK variant. "
            "Use a full registered name (e.g. 'franka_joint_pos') only when "
            "the prompt explicitly requests joint control."
        ),
    )
    items: list[Item] = Field(description="Objects to place in the env.")
    initial_scene_graph: list[Relation] = Field(
        description=(
            "FULL snapshot of all relations in the starting state. Every "
            "persistent relation (e.g. bowl on table, distractors present) "
            "must appear here. Relations that change via tasks are still "
            "listed here in their starting form."
        ),
    )
    tasks: list[Task] = Field(
        description=(
            "Tasks to execute in sequence. The task sequence implicitly "
            "defines the intermediate env graphs by applying each task's "
            "transformations in order. An empty list is valid and means "
            "the env has no task — at the arena layer this maps to the "
            "``NoTask`` null object (e.g. a static playground / sandbox "
            "env). Prefer an empty list over inventing a placeholder "
            "task when the user prompt genuinely describes a task-less "
            "scene."
        ),
    )
+_RESPONSE_PREVIEW_CHARS = 500 + + +def _format_failure_message( + *, + model: str, + response_route: str, + finish_reason: str | None, + cause: str, + sample_payload: str | None, +) -> str: + """Build the multi-line diagnostic message for a structured-output failure. + + The format pairs every signal the probe captured into a layout that + grep/CI logs can read at a glance. ``sample_payload`` is the + single most useful field — it turns a cryptic ``JSONDecodeError: + Expecting value`` into a debuggable failure by showing what the + model actually returned (prose preamble? HTML error page? empty?). + """ + return ( + f"Model {model!r} does not support structured outputs:\n" + f" response_route = {response_route!r}\n" + f" finish_reason = {finish_reason!r}\n" + f" cause = {cause}\n" + f" sample_payload = {sample_payload!r}" + ) + + +def build_strict_schema(model_cls: type[BaseModel]) -> dict[str, Any]: + """Return ``model_cls``'s JSON schema munged for OpenAI strict mode. + + OpenAI's structured outputs strict mode (and AWS Bedrock's + Anthropic models, which surface the same constraint) require: + + * ``additionalProperties: false`` on every object schema. + * Every property listed in ``required`` (use a nullable type + union — e.g. ``str | None`` — for fields that should be + emittable as ``null``). + * No ``default`` keys in the schema (defaults are nonsensical + when every field is required). + + Pydantic's default ``model_json_schema()`` honours the first + constraint only. We deep-walk the schema and apply the other two + so the schema flies past both NVIDIA and Bedrock validation. + + The returned dict is a deep copy — mutating it never leaks back + into pydantic's internal schema cache. + """ + schema = copy.deepcopy(model_cls.model_json_schema()) + apply_strict_constraints(schema) + return schema + + +def apply_strict_constraints(node: Any) -> None: + """Recursively apply OpenAI strict-mode constraints to a JSON-schema node. + + Mutates ``node`` in place. 
Safe to call on an already-munged schema + (the operation is idempotent). + """ + if isinstance(node, dict): + if node.get("type") == "object" and "properties" in node: + node["additionalProperties"] = False + node["required"] = list(node["properties"].keys()) + # Strict mode forbids ``default`` keys (every field is required, so + # defaults can never apply). Drop them defensively at every level. + node.pop("default", None) + for v in node.values(): + apply_strict_constraints(v) + elif isinstance(node, list): + for v in node: + apply_strict_constraints(v) + + +def ping(client: Any, model: str) -> str: + """Smoke-test the endpoint + API key + model with a minimal request. + + Sends a one-shot chat completion (no structured outputs) to verify: + + * the API key authenticates, + * the configured model exists at the client's ``base_url``, + * the network path is reachable. + + Intended for CI startup probes and constructor-time fail-fast + checks; the success signal is "we got a response without + raising". The response *content* is returned for diagnostics but + intentionally not asserted on — different models phrase the + acknowledgment differently, and a quirky reply still means the + wire is working. + + This is the *cheap* probe; pair with + :func:`check_structured_output_support` for a full deployment + validation (ping confirms the wire, the probe confirms the + model can actually produce structured outputs). + + Args: + client: An OpenAI-compatible client (typically + ``openai.OpenAI`` or a compatible mock). + model: Model identifier forwarded to + ``client.chat.completions.create(model=...)``. + + Returns: + The model's response text (typically "OK" or similar). Empty + string if the model returned no content (still a successful + round-trip). + + Raises: + Any exception raised by the underlying ``openai`` client. 
def extract_response_text(message: Any) -> tuple[str, str]:
    """Pull the agent's structured-output text from the chat-completion message.

    Returns ``(text, route)`` where ``route`` is one of:

    * ``"content"`` — the standard OpenAI-compatible channel.
    * ``"reasoning_content"`` — NVIDIA DeepSeek's provider-specific
      channel; the model emits structured outputs here instead of
      ``content``. We treat it as equivalent.
    * ``"empty"`` — both channels were empty / missing; the caller
      should surface a clear error.

    A channel only counts as populated when it holds a ``str`` with at
    least one non-whitespace character. A whitespace-only ``content``
    (e.g. ``"\\n"``) therefore no longer shadows a populated
    ``reasoning_content``, and a non-string attribute value is never
    returned as ``text`` (the return type is ``tuple[str, str]`` and
    callers slice / ``json.loads`` the result). The text itself is
    returned unmodified — no stripping.

    Args:
        message: The ``choices[i].message`` object of a chat completion.
            Attributes are read with ``getattr(..., None)``, so an object
            that lacks either channel is tolerated.

    Returns:
        ``(text, route)`` as described above; ``("", "empty")`` when
        neither channel is usable.
    """
    # Channels in priority order: the standard one first, then the
    # provider-specific fallback.
    for route in ("content", "reasoning_content"):
        candidate = getattr(message, route, None)
        if isinstance(candidate, str) and candidate.strip():
            return candidate, route
    return "", "empty"


def check_structured_output_support(
    client: Any,
    model: str,
    spec_class: "type[BaseModel]",
) -> bool:
    """Probe whether ``model`` can produce ``spec_class``-shaped structured outputs.

    Sends a single chat-completion against ``client`` with
    ``response_format=json_schema`` carrying ``spec_class``'s strict
    schema and a minimal user prompt asking the model to fabricate a
    valid instance. Returns ``True`` if the model successfully
    produced a valid ``spec_class`` instance end-to-end.

    Every failure mode raises ``RuntimeError`` with a diagnostic built
    by ``_format_failure_message`` from the model name, the response
    channel, ``finish_reason``, the underlying cause, and a preview of
    the model's response. When the failure has an originating
    exception (SDK error, JSON parse error, validation error) it is
    chained via ``__cause__`` so the traceback retains the full context.

    Args:
        client: An OpenAI-compatible client (typically
            ``openai.OpenAI`` or a compatible mock).
        model: Model identifier as understood by the client's
            base_url. Forwarded verbatim to
            ``client.chat.completions.create(model=...)``.
        spec_class: The pydantic model whose strict schema will be
            sent to the endpoint.

    Returns:
        ``True`` when the probe round-trips successfully (wire ok,
        schema honoured, pydantic validation passed).

    Raises:
        RuntimeError: for any failure mode — the ``create`` call
            raising, an empty ``choices`` list, an empty envelope on
            both ``content`` and ``reasoning_content``, JSON parse
            failure, or pydantic schema-validation failure. The
            exception's ``__cause__`` (when populated) is the
            originating SDK / parser exception.
    """
    schema = build_strict_schema(spec_class)
    # The user prompt is deliberately content-free; the schema itself
    # plus the system prompt below carry all the structural
    # information. We just want a valid envelope back.
    system = (
        f"Return a valid {spec_class.__name__} JSON object. Every required field must be "
        "populated — use realistic dummy values where the prompt doesn't specify one."
    )
    # TODO(qianl): wrap with transient-error retry (exponential backoff +
    # jitter) for ``APIConnectionError`` / ``APITimeoutError`` / 429 / 5xx.
    # Deterministic errors (400/401/403/404/422) must still propagate
    # immediately so genuinely-unsupported endpoints fail fast. Currently
    # this is the primary source of e2e flakes (provider occasionally
    # returns blank ``content`` in the structured-outputs envelope) —
    # affected live tests carry ``@pytest.mark.flaky`` as the short-term
    # mitigation.
    try:
        resp = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": system},
                {"role": "user", "content": "Generate a minimal valid example."},
            ],
            response_format={
                "type": "json_schema",
                "json_schema": {"name": spec_class.__name__, "strict": True, "schema": schema},
            },
            temperature=0,
            max_tokens=2000,
        )
    except Exception as exc:
        # Deliberately broad: every wire-level failure is converted into
        # the same diagnostic RuntimeError, with the original chained.
        raise RuntimeError(
            _format_failure_message(
                model=model,
                response_route="empty",
                finish_reason=None,
                cause=f"{type(exc).__name__}: {str(exc)[:_RESPONSE_PREVIEW_CHARS]}",
                sample_payload=None,
            )
        ) from exc

    # Some providers (e.g. Azure content-filter trips, Bedrock guardrail
    # rejections) succeed at the HTTP level but return an empty ``choices``
    # list — no candidates were emitted. ``resp.choices[0]`` would raise
    # ``IndexError``; surface it with a distinct ``cause`` message that
    # operators can tell apart from the "envelope returned but content
    # empty" case handled further down.
    choices = getattr(resp, "choices", None) or []
    if not choices:
        raise RuntimeError(
            _format_failure_message(
                model=model,
                response_route="empty",
                finish_reason=None,
                cause="Response contained no choices (model emitted zero candidates).",
                sample_payload=None,
            )
        )

    first_choice = choices[0]
    finish_reason = first_choice.finish_reason
    text, route = extract_response_text(first_choice.message)
    if not text:
        raise RuntimeError(
            _format_failure_message(
                model=model,
                response_route=route,
                finish_reason=finish_reason,
                cause="Model returned an empty envelope on both content and reasoning_content.",
                sample_payload=None,
            )
        )

    # Only a bounded preview of the payload goes into the diagnostic.
    sample = text[:_RESPONSE_PREVIEW_CHARS]
    try:
        # ``strict=False`` lets the parser accept literal control
        # characters inside JSON strings.
        data = json.loads(text, strict=False)
        spec_class.model_validate(data)
    except Exception as exc:
        raise RuntimeError(
            _format_failure_message(
                model=model,
                response_route=route,
                finish_reason=finish_reason,
                cause=f"{type(exc).__name__}: {str(exc)[:_RESPONSE_PREVIEW_CHARS]}",
                sample_payload=sample,
            )
        ) from exc
    return True
DEFAULT_PROMPT = (
    "franka pick up avocado from the table and place it into a bowl on the table. "
    "there are other veggies on the table as distractor"
)
SEQUENTIAL_PROMPT = (
    "franka opens a microwave, picks up avocado on the table, place it into the microwave and close the microwave door."
    " There are other utensils on the table as distractor"
)


def main() -> None:
    """Command-line entry point: run the agent on a prompt and print the result.

    Modes, in order of precedence:

    * ``--print-schema`` — print the ``EnvIntentSpec`` JSON schema and
      return without building the catalog or calling the agent.
    * ``--print-catalog`` — print the catalog text and return without
      calling the agent.
    * default — call ``EnvGenAgent.generate_spec`` with the prompt, print
      the raw response and the ``reasoning`` field, optionally apply the
      ``--background`` override, then print the spec as JSON.

    Project imports are deferred into the function body so argument
    parsing (and ``--help``) does not depend on them.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        # The module docstring carries a multi-line "Examples:" section
        # with shell commands. The default formatter re-wraps the
        # description into one paragraph, so keep it verbatim.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("--prompt", type=str, default=DEFAULT_PROMPT)
    parser.add_argument("--model", type=str, default=None)
    parser.add_argument("--temperature", type=float, default=0.2)
    parser.add_argument("--print-schema", action="store_true")
    parser.add_argument("--print-catalog", action="store_true")
    parser.add_argument(
        "--background",
        type=str,
        default="maple_table_robolab",
        help=(
            "Override the background chosen by the agent (e.g. 'office_table' "
            "or 'kitchen'). Default is 'maple_table_robolab' because its "
            "tabletop ObjectReference yields a clean bbox and stable "
            "placement, unlike the rotated plain 'table' background. Pass "
            "an empty string ('') to keep the agent's choice."
        ),
    )
    args = parser.parse_args()

    from isaaclab_arena.environments.agentic_env_gen.env_intent_spec import EnvIntentSpec

    if args.print_schema:
        print(json.dumps(EnvIntentSpec.model_json_schema(), indent=2))
        return

    from isaaclab_arena.environments.agentic_env_gen.env_gen_agent import EnvGenAgent, build_catalog_text

    catalog = build_catalog_text()
    if args.print_catalog:
        print(catalog)
        return

    # Only forward ``model`` when the user supplied one, so the agent's
    # own default applies otherwise.
    kwargs = {"model": args.model} if args.model else {}
    agent = EnvGenAgent(**kwargs)
    spec, raw = agent.generate_spec(args.prompt, catalog_text=catalog, temperature=args.temperature)

    print("=== raw agent response ===")
    print(raw)

    # Surface the forced chain-of-thought field on its own so it's easy to
    # spot when debugging a bad spec — without this, ``reasoning`` is
    # buried inside the multi-hundred-line model_dump_json below.
    print("\n=== agent reasoning ===")
    print(spec.reasoning)

    if args.background and args.background != spec.background:
        # Swap the background name wherever it appears so downstream code
        # (resolver, proposer) sees a consistent scene. Rewrite both
        # ``rel.target`` (binary relations like ``on(bowl, table)``) AND
        # ``rel.subject`` (unary relations like ``is_anchor(table)``);
        # missing the subject case would leave the unary constraint
        # pointing at the old background name, after which the resolver
        # would emit a ``relation.initial.unknown_subject`` trace and
        # silently drop the constraint.
        old_bg = spec.background
        new_bg = args.background
        for rel in spec.initial_scene_graph:
            if rel.subject == old_bg:
                rel.subject = new_bg
            if rel.target == old_bg:
                rel.target = new_bg
        # Note: tasks don't directly reference background in target (typically None or items),
        # so no background substitution needed in task.target
        spec.background = new_bg
        print(f"\n=== background override applied: {old_bg!r} -> {new_bg!r} ===")

    print("\n=== parsed EnvIntentSpec ===")
    print(spec.model_dump_json(indent=2))


if __name__ == "__main__":
    main()
# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------


def _chat_response(content: str | None = None, reasoning_content: str | None = None, finish_reason: str = "stop"):
    """Build a nested mock matching the openai chat-completion response shape.

    Models that route structured outputs into ``reasoning_content`` (e.g.
    NVIDIA DeepSeek) leave ``content`` empty — the fixture mirrors that by
    populating either channel independently.
    """
    resp = MagicMock()
    resp.choices = [MagicMock()]
    resp.choices[0].finish_reason = finish_reason
    resp.choices[0].message.content = content
    resp.choices[0].message.reasoning_content = reasoning_content
    return resp


@pytest.fixture
def stub_openai():
    """Patch ``openai.OpenAI`` so ``EnvGenAgent()`` never hits the wire.

    The agent does a deferred ``from openai import OpenAI`` inside
    ``__init__`` to avoid pulling the dependency at module import
    time, so we patch the symbol on the ``openai`` module itself.

    The patched client is pre-loaded to satisfy the two constructor
    probes (cheap ``ping`` then full structured-output check):
    ``side_effect`` returns a "OK" ping response then a
    ``_MINIMAL_SPEC`` probe response. Tests that want to assert on a
    failing ``__init__`` reach for ``patch("openai.OpenAI")``
    directly with a custom ``side_effect``.
    """
    with patch("openai.OpenAI") as mock_cls:
        client = MagicMock()
        client.chat.completions.create.side_effect = [
            _chat_response(content="OK"),
            _chat_response(content=json.dumps(_MINIMAL_SPEC)),
        ]
        mock_cls.return_value = client
        yield mock_cls


@pytest.fixture
def agent(stub_openai):
    """A constructed ``EnvGenAgent`` with a fully mocked openai client.

    ``__init__``'s two calls (ping + structured-output probe) are
    served by ``stub_openai``'s pre-loaded ``side_effect``. After
    construction we *reset the mock* so per-test assertions on
    ``call_args`` / ``call_count`` start from a clean slate; tests
    can then set ``.return_value`` (or a fresh ``.side_effect``) to
    drive whichever method they're exercising.
    """
    a = EnvGenAgent(api_key="test-key")
    a.client.chat.completions.create.side_effect = None
    a.client.chat.completions.create.reset_mock()
    return a


# Minimal EnvIntentSpec payload — exercises every required field plus one
# task. Reused across the generate_spec happy-path tests.
_MINIMAL_SPEC: dict = {
    "reasoning": (
        "User wants a pick-and-place: foreground object is 'avocado', "
        "target container is 'bowl', background is the kitchen table."
    ),
    "task_description": "pick up the avocado and place it in the bowl",
    "background": "kitchen",
    "embodiment": "franka_ik",
    "items": [
        {"query": "avocado", "role": "foreground", "category_tags": [], "instance_name": None, "scale": None},
        {"query": "bowl", "role": "foreground", "category_tags": [], "instance_name": None, "scale": None},
    ],
    "initial_scene_graph": [
        {"kind": "on", "subject": "avocado", "target": "kitchen"},
        {"kind": "on", "subject": "bowl", "target": "kitchen"},
    ],
    "tasks": [{
        "kind": "pick_and_place",
        "subject": "avocado",
        "target": "bowl",
        "description": "pick up the avocado and place it in the bowl",
    }],
}


# ---------------------------------------------------------------------------
# __init__
# ---------------------------------------------------------------------------


class TestInit:
    def test_explicit_api_key_overrides_env(self, monkeypatch, stub_openai):
        monkeypatch.setenv("NV_API_KEY", "env-key")
        a = EnvGenAgent(api_key="explicit-key")
        assert a.api_key == "explicit-key"

    def test_falls_back_to_env_var(self, monkeypatch, stub_openai):
        monkeypatch.setenv("NV_API_KEY", "env-key")
        a = EnvGenAgent()
        assert a.api_key == "env-key"

    def test_raises_when_no_key_anywhere(self, monkeypatch, stub_openai):
        monkeypatch.delenv("NV_API_KEY", raising=False)
        with pytest.raises(ValueError, match="API key required"):
            EnvGenAgent()

    def test_default_model_and_base_url(self, stub_openai):
        a = EnvGenAgent(api_key="k")
        assert a.model == DEFAULT_MODEL
        stub_openai.assert_called_once_with(api_key="k", base_url=DEFAULT_BASE_URL)

    def test_custom_model_and_base_url(self, stub_openai):
        a = EnvGenAgent(api_key="k", model="custom-model", base_url="http://localhost:8000")
        assert a.model == "custom-model"
        stub_openai.assert_called_once_with(api_key="k", base_url="http://localhost:8000")

    def test_init_runs_ping_then_structured_output_probe(self, stub_openai):
        # ``__init__`` is contracted to run TWO wire checks in order:
        # (1) the cheap ``ping`` so a dead endpoint / bad key fails before
        # we spend tokens on (2) the heavier structured-output probe.
        # Asserting the order matters because reversing it would waste a
        # full schema probe on every misconfigured deployment.
        a = EnvGenAgent(api_key="k")
        assert a.client.chat.completions.create.call_count == 2
        first, second = a.client.chat.completions.create.call_args_list
        # First call = ping: small message, no response_format.
        assert first.kwargs["temperature"] == 0
        assert first.kwargs["max_tokens"] == 8
        assert len(first.kwargs["messages"]) == 1
        assert "response_format" not in first.kwargs
        # Second call = structured-output probe: carries the EnvIntentSpec
        # schema, signalling the model has to actually honour
        # ``response_format=json_schema``.
        assert second.kwargs["response_format"]["type"] == "json_schema"
        assert second.kwargs["response_format"]["json_schema"]["name"] == "EnvIntentSpec"

    def test_init_propagates_ping_failure(self):
        # If the openai client raises on the FIRST (ping) call — bad key,
        # unreachable endpoint, etc. — the exception must surface from
        # ``EnvGenAgent()`` itself, not be swallowed into a silently-broken
        # instance that fails later when generate_spec is called. The
        # structured-output probe must NOT be attempted (otherwise we'd
        # waste a schema-carrying request on a dead wire).
        class FakeAuthError(Exception):
            pass

        with patch("openai.OpenAI") as mock_cls:
            client = MagicMock()
            client.chat.completions.create.side_effect = FakeAuthError("bad key")
            mock_cls.return_value = client
            with pytest.raises(FakeAuthError, match="bad key"):
                EnvGenAgent(api_key="k")
            # Exactly one create() call — the ping. The probe never ran.
            assert client.chat.completions.create.call_count == 1

    def test_init_raises_when_structured_output_unsupported(self):
        # The agent is structured-outputs-only — a model that can't honour
        # ``response_format=json_schema`` is fundamentally unusable. The
        # constructor must refuse rather than letting downstream
        # ``generate_spec`` blow up later. ``check_structured_output_support``
        # raises the diagnostic RuntimeError directly, so all the
        # informative fields are baked into the probe's exception — no
        # caller-side message construction. This test just confirms the
        # probe's exception reaches the caller verbatim (no swallow,
        # no rewrap that drops fields).
        with patch("openai.OpenAI") as mock_cls:
            client = MagicMock()
            client.chat.completions.create.side_effect = [
                _chat_response(content="OK"),  # ping passes
                _chat_response(content=None, reasoning_content=None),  # probe empty
            ]
            mock_cls.return_value = client
            with pytest.raises(RuntimeError, match="does not support structured outputs") as exc_info:
                EnvGenAgent(api_key="k")
            msg = str(exc_info.value)
            # Diagnostic fields from the probe must reach the operator —
            # ``sample_payload`` in particular is what turns cryptic JSON /
            # validation errors into debuggable failures.
            assert "response_route" in msg
            assert "finish_reason" in msg
            assert "cause" in msg
            assert "sample_payload" in msg
            # The empty-envelope route signal — keeps callers able to
            # attribute "empty" vs "content" vs "reasoning_content".
            assert "'empty'" in msg

    def test_init_caches_strict_schema(self, stub_openai):
        # The strict schema munging walks ~10 nested object nodes; caching it
        # on the instance avoids redoing the walk on every generate_spec call.
        # The cached schema must already be munged — re-running the munger
        # should be a no-op (idempotent).
        a = EnvGenAgent(api_key="k")
        assert isinstance(a._spec_schema, dict)
        before = json.dumps(a._spec_schema, sort_keys=True)
        apply_strict_constraints(a._spec_schema)
        after = json.dumps(a._spec_schema, sort_keys=True)
        assert before == after


# ---------------------------------------------------------------------------
# generate_spec
# ---------------------------------------------------------------------------


class TestGenerateSpec:
    def test_happy_path_returns_spec_and_raw(self, agent):
        raw = json.dumps(_MINIMAL_SPEC)
        agent.client.chat.completions.create.return_value = _chat_response(content=raw)
        spec, returned_raw = agent.generate_spec("avocado on kitchen", catalog_text="catalog")
        assert spec.embodiment == "franka_ik"
        assert spec.background == "kitchen"
        assert len(spec.tasks) == 1
        assert returned_raw == raw

    def test_reads_from_reasoning_content_channel(self, agent):
        # DeepSeek quirk: when structured outputs are requested, the model
        # puts the JSON in ``reasoning_content`` instead of ``content``.
        raw = json.dumps(_MINIMAL_SPEC)
        agent.client.chat.completions.create.return_value = _chat_response(content=None, reasoning_content=raw)
        spec, returned_raw = agent.generate_spec("p", catalog_text="catalog")
        assert spec.embodiment == "franka_ik"
        assert returned_raw == raw

    def test_request_sets_response_format_to_json_schema(self, agent):
        agent.client.chat.completions.create.return_value = _chat_response(content=json.dumps(_MINIMAL_SPEC))
        agent.generate_spec("p", catalog_text="catalog")
        kwargs = agent.client.chat.completions.create.call_args.kwargs
        assert kwargs["response_format"]["type"] == "json_schema"
        assert kwargs["response_format"]["json_schema"]["name"] == "EnvIntentSpec"
        assert kwargs["response_format"]["json_schema"]["strict"] is True
        # The schema sent on the wire is the cached, strict-mode-munged copy.
        assert kwargs["response_format"]["json_schema"]["schema"] is agent._spec_schema

    def test_raises_runtime_error_on_empty_envelope(self, agent):
        # Both channels empty — the endpoint accepted ``response_format`` but
        # the model dropped the structured output (the canonical "endpoint
        # doesn't actually support structured outputs" failure mode).
        agent.client.chat.completions.create.return_value = _chat_response(content=None, reasoning_content=None)
        with pytest.raises(RuntimeError, match="empty structured-outputs envelope"):
            agent.generate_spec("p", catalog_text="catalog")

    def test_tolerates_unescaped_control_chars(self, agent):
        # DeepSeek-v4-flash emits literal tab/newline characters inside JSON
        # strings despite the structured-outputs contract. Python's default
        # ``json.loads`` rejects them; we pass ``strict=False`` to accept.
        payload = dict(_MINIMAL_SPEC)
        payload["task_description"] = "pick up\tthe\tavocado"
        raw = json.dumps(payload).replace("\\t", "\t")
        assert "\t" in raw  # raw payload now has literal tab chars in a string
        agent.client.chat.completions.create.return_value = _chat_response(content=raw)
        spec, _ = agent.generate_spec("p", catalog_text="catalog")
        assert "\t" in spec.task_description

    def test_propagates_validation_error_for_schema_violation(self, agent):
        # Well-formed JSON but missing every required EnvIntentSpec field —
        # pydantic surfaces this as a ``ValidationError`` distinct from a
        # transport or parse error.
        agent.client.chat.completions.create.return_value = _chat_response(content='{"missing": "fields"}')
        with pytest.raises(ValidationError):
            agent.generate_spec("p", catalog_text="catalog")

    def test_request_uses_configured_model(self, agent):
        agent.client.chat.completions.create.return_value = _chat_response(content=json.dumps(_MINIMAL_SPEC))
        agent.generate_spec("p", catalog_text="catalog")
        kwargs = agent.client.chat.completions.create.call_args.kwargs
        assert kwargs["model"] == agent.model

    def test_forwards_temperature_and_max_tokens(self, agent):
        agent.client.chat.completions.create.return_value = _chat_response(content=json.dumps(_MINIMAL_SPEC))
        agent.generate_spec("p", catalog_text="catalog", temperature=0.7, max_tokens=500)
        kwargs = agent.client.chat.completions.create.call_args.kwargs
        assert kwargs["temperature"] == 0.7
        assert kwargs["max_tokens"] == 500

    def test_user_message_contains_catalog_and_prompt(self, agent):
        agent.client.chat.completions.create.return_value = _chat_response(content=json.dumps(_MINIMAL_SPEC))
        agent.generate_spec("user wants avocado on kitchen", catalog_text="<>")
        msgs = agent.client.chat.completions.create.call_args.kwargs["messages"]
        assert [m["role"] for m in msgs] == ["system", "user"]
        user_msg = msgs[1]["content"]
        assert "<>" in user_msg
        assert "user wants avocado on kitchen" in user_msg
        # Under structured outputs the "emit ONLY JSON" instruction is
        # redundant (and was deliberately dropped) — the wire enforces
        # the envelope.
        assert "Return ONLY" not in user_msg


# ---------------------------------------------------------------------------
# _system_prompt
# ---------------------------------------------------------------------------


class TestSystemPrompt:
    def test_contains_cross_cutting_guidance(self, agent):
        # Under structured outputs the schema (including every Relation /
        # Task literal enum) flows to the model via ``response_format``.
        # The system prompt is reserved for cross-cutting rules that
        # can't be expressed in the schema — articulated-object anchoring,
        # distractor anchoring, anti-hallucination directives. Lock those
        # markers in so a future prompt rewrite can't accidentally drop
        # them.
        prompt = agent._system_prompt()
        for marker in (
            "Articulated objects",
            "Distractor items",
            "Do NOT hallucinate",
            "pick_and_place",
            "open_door",
            "close_door",
        ):
            assert marker in prompt, f"system prompt missing required marker {marker!r}"

    def test_does_not_repeat_response_format_instruction(self, agent):
        # Belt-and-suspenders: ensure the prompt isn't still telling the
        # model "emit ONLY JSON" — that instruction is redundant under
        # structured outputs and the wire enforces it.
        prompt = agent._system_prompt()
        assert "Emit ONLY" not in prompt
        assert "ONLY the JSON object" not in prompt


# ---------------------------------------------------------------------------
# Live endpoint (opt-in, network + auth required)
# ---------------------------------------------------------------------------


# The test exercises a real wire call against NVIDIA's hosted DeepSeek-v4-flash,
# which has intermittent quirks under structured outputs (occasional blank
# content, transient 429 / 5xx, etc.). A single failed attempt does NOT
# mean ``generate_spec`` is broken — allow up to 2 reruns so the transport
# layer's intermittency doesn't fail CI. Real breakage will still fail all 3.
# TODO(qianl): drop the flaky marker once production-side retry is wired
# into ``generate_spec`` / ``check_structured_output_support`` (see TODOs in
# env_gen_agent.py and structured_output_utils.py).
@pytest.mark.flaky(max_runs=3, min_passes=1)
@pytest.mark.agent_remote_e2e
def test_generate_spec_against_live_endpoint():
    """End-to-end smoke test against the real OpenAI-compatible endpoint.

    Exercises the full structured-outputs pipeline with default
    ``model`` / ``base_url`` / system prompt:

        auth → HTTPS → response_format=json_schema → channel fallback
        → json.loads(strict=False) → EnvIntentSpec.model_validate

    Two layers gate this from default ``pytest`` runs:

    * ``agent_remote_e2e`` marker — registered in ``pytest.ini`` next to
      ``gr00t_remote_e2e``. The default CI job deselects it with
      ``-m "... and not agent_remote_e2e"``; run it explicitly with
      ``pytest -m agent_remote_e2e isaaclab_arena/tests/test_env_gen_agent.py``.
    * ``NV_API_KEY`` guard — the test skips itself when the variable is
      unset or empty, so a run without credentials reports a skip
      instead of failing in ``EnvGenAgent()`` and being retried by the
      flaky marker. The dedicated CI job checks the secret in a
      separate step so a missing key cannot yield a silently green job.

    The asset catalog is supplied inline rather than via ``AssetRegistry``
    so the test doesn't depend on Isaac Lab asset registration state — we
    only want to validate the agent wire here, not the catalog builder.

    The structured-outputs *capability* of the default model is
    pinned separately by
    :func:`test_structured_output_utils.test_default_model_supports_structured_output`;
    this test exercises the higher-level ``generate_spec`` pipeline
    end-to-end.

    Assertions are intentionally loose: we check shape (non-empty raw,
    non-empty tasks, populated background/embodiment, populated
    reasoning) rather than exact content, since agent output drifts
    between model versions.
    """
    # Function-scope import: nothing else in this module needs ``os``.
    import os

    if not os.environ.get("NV_API_KEY"):
        pytest.skip("NV_API_KEY is not set; skipping live agent endpoint test.")

    agent = EnvGenAgent()
    catalog = (
        "EMBODIMENTS: franka_ik\n\n"
        "BACKGROUNDS: maple_table_kitchen\n\n"
        "OBJECTS (2):\n"
        "- avocado_robolab tags=['vegetable']\n"
        "- bowl_robolab tags=['container']"
    )
    spec, raw = agent.generate_spec(
        "pick up the avocado and place it in the bowl on the kitchen table",
        catalog_text=catalog,
    )
    assert isinstance(raw, str) and raw, "agent returned empty raw response"
    assert spec.tasks, "EnvIntentSpec must contain at least one task"
    assert spec.background, "EnvIntentSpec.background must be populated"
    assert spec.embodiment, "EnvIntentSpec.embodiment must be populated"
    # Structured outputs guarantee the forced-CoT reasoning field is
    # populated — under the old prose-extraction path it could come
    # back blank if the model wrapped the schema in markdown.
    assert spec.reasoning, "EnvIntentSpec.reasoning must be populated"
# ---------------------------------------------------------------------------
# Fixtures + helpers
# ---------------------------------------------------------------------------


def _chat_response(content: str | None = None, reasoning_content: str | None = None, finish_reason: str = "stop"):
    """Return a mock shaped like an openai chat-completion with a single choice."""
    choice = MagicMock()
    choice.finish_reason = finish_reason
    choice.message.content = content
    choice.message.reasoning_content = reasoning_content
    response = MagicMock()
    response.choices = [choice]
    return response


# Smallest EnvIntentSpec payload that fills every required field and
# carries one task — shared by the ``check_structured_output_support``
# happy-path tests so they run against the production schema, not a toy.
_MINIMAL_SPEC: dict = {
    "reasoning": (
        "User wants a pick-and-place: foreground object is 'avocado', "
        "target container is 'bowl', background is the kitchen table."
    ),
    "task_description": "pick up the avocado and place it in the bowl",
    "background": "kitchen",
    "embodiment": "franka_ik",
    "items": [
        {"query": "avocado", "role": "foreground", "category_tags": [], "instance_name": None, "scale": None},
        {"query": "bowl", "role": "foreground", "category_tags": [], "instance_name": None, "scale": None},
    ],
    "initial_scene_graph": [
        {"kind": "on", "subject": "avocado", "target": "kitchen"},
        {"kind": "on", "subject": "bowl", "target": "kitchen"},
    ],
    "tasks": [{
        "kind": "pick_and_place",
        "subject": "avocado",
        "target": "bowl",
        "description": "pick up the avocado and place it in the bowl",
    }],
}


# ---------------------------------------------------------------------------
# build_strict_schema / apply_strict_constraints
# ---------------------------------------------------------------------------


class _ToyChild(BaseModel):
    name: str
    optional_value: int | None = None


class _ToyParent(BaseModel):
    title: str
    child: _ToyChild
    children: list[_ToyChild] = []


class TestBuildStrictSchema:
    def test_root_object_additional_properties_false(self):
        strict = build_strict_schema(_ToyParent)
        assert strict["additionalProperties"] is False

    def test_root_object_lists_every_property_as_required(self):
        strict = build_strict_schema(_ToyParent)
        assert set(strict["required"]) == {"title", "child", "children"}

    def test_nested_defs_object_also_strict(self):
        # Strict mode is a per-object-node constraint, so the ``$defs``
        # entries reached through ``$ref`` must carry it too, not only
        # the root.
        child_def = build_strict_schema(_ToyParent)["$defs"]["_ToyChild"]
        assert child_def["additionalProperties"] is False
        assert set(child_def["required"]) == {"name", "optional_value"}

    def test_defaults_stripped_everywhere(self):
        # With every field required, strict mode has no use for
        # ``default``; the key must be gone from every dict node.
        pending = [build_strict_schema(_ToyParent)]
        while pending:
            node = pending.pop()
            if isinstance(node, list):
                pending.extend(node)
            elif isinstance(node, dict):
                assert "default" not in node, f"unexpected default key in {node!r}"
                pending.extend(node.values())

    def test_munging_does_not_mutate_pydantic_cached_schema(self):
        # The munger has to operate on its own copy: the schema pydantic
        # hands out for the model must serialise identically before and
        # after a ``build_strict_schema`` call.
        def snapshot() -> str:
            return json.dumps(_ToyParent.model_json_schema(), sort_keys=True)

        before = snapshot()
        build_strict_schema(_ToyParent)
        assert before == snapshot()

    def test_apply_strict_constraints_is_idempotent(self):
        # Re-applying the constraints to an already-munged schema must
        # change nothing — callers may do so defensively.
        strict = build_strict_schema(_ToyParent)
        expected = json.dumps(strict, sort_keys=True)
        apply_strict_constraints(strict)
        assert json.dumps(strict, sort_keys=True) == expected

    def test_env_intent_spec_munges_clean(self):
        # Same check against the production schema: every object node
        # with ``properties`` is closed and lists all of them as required.
        def assert_strict(node):
            if isinstance(node, list):
                for item in node:
                    assert_strict(item)
                return
            if not isinstance(node, dict):
                return
            if node.get("type") == "object" and "properties" in node:
                assert node.get("additionalProperties") is False
                assert set(node["required"]) == set(node["properties"].keys())
            for value in node.values():
                assert_strict(value)

        assert_strict(build_strict_schema(EnvIntentSpec))


# ---------------------------------------------------------------------------
# extract_response_text
# ---------------------------------------------------------------------------


class TestExtractResponseText:
    def test_prefers_content_when_both_populated(self):
        message = MagicMock(content='{"a": 1}', reasoning_content='{"b": 2}')
        assert extract_response_text(message) == ('{"a": 1}', "content")

    def test_falls_back_to_reasoning_content_when_content_empty(self):
        # ``content`` is ``None`` and the payload sits in the
        # provider-specific ``reasoning_content`` channel; the helper
        # must read it from there.
        message = MagicMock(content=None, reasoning_content='{"b": 2}')
        assert extract_response_text(message) == ('{"b": 2}', "reasoning_content")

    def test_empty_when_both_channels_blank(self):
        message = MagicMock(content=None, reasoning_content=None)
        assert extract_response_text(message) == ("", "empty")

    def test_empty_when_message_has_no_attrs(self):
        # A message object that defines neither channel must resolve to
        # "empty" instead of raising AttributeError.
        message = object()  # bare object, no attrs
        assert extract_response_text(message) == ("", "empty")

    def test_treats_empty_string_as_falsy(self):
        # An empty-string ``content`` has to fall through exactly like
        # ``None`` so a populated reasoning channel is still reached.
        message = MagicMock(content="", reasoning_content='{"b": 2}')
        assert extract_response_text(message) == ('{"b": 2}', "reasoning_content")


# ---------------------------------------------------------------------------
# ping
# ---------------------------------------------------------------------------


class TestPing:
    def test_returns_response_content(self):
        stub = MagicMock()
        stub.chat.completions.create.return_value = _chat_response(content="OK")
        assert ping(stub, "any-model") == "OK"

    def test_returns_empty_string_when_content_is_none(self):
        # ``None`` content still counts as a successful round-trip: the
        # helper returns "" rather than raising.
        stub = MagicMock()
        stub.chat.completions.create.return_value = _chat_response(content=None)
        assert ping(stub, "any-model") == ""

    def test_uses_minimal_request_params(self):
        client = MagicMock()
        client.chat.completions.create.return_value = _chat_response(content="OK")
        ping(client, "model-name")
        kwargs = client.chat.completions.create.call_args.kwargs
        assert kwargs["model"] == "model-name"
        assert kwargs["temperature"] == 0
        assert kwargs["max_tokens"] == 8
        # Single user message — no system prompt / catalog payload. Keeping the
        # request small is the whole point: ping must stay cheap enough to
        # gate every agent construction.
+ assert len(kwargs["messages"]) == 1 + assert kwargs["messages"][0]["role"] == "user" + # ping is a structured-outputs-agnostic liveness check; it must NOT + # ask the model to honour response_format (otherwise it can't fail + # gracefully on models that lack structured-output support, which + # defeats the point of having a cheap probe). + assert "response_format" not in kwargs + + def test_propagates_client_exceptions(self): + class FakeAuthError(Exception): + pass + + client = MagicMock() + client.chat.completions.create.side_effect = FakeAuthError("invalid api key") + with pytest.raises(FakeAuthError, match="invalid api key"): + ping(client, "m") + + +# --------------------------------------------------------------------------- +# check_structured_output_support (mocked) +# --------------------------------------------------------------------------- + + +class TestCheckStructuredOutputSupport: + """Bool-or-raise contract: returns True on a clean round-trip, raises + ``RuntimeError`` with a multi-line diagnostic on every failure mode. + + Each failure-mode test pins three things: + 1. ``RuntimeError`` (not the original SDK exception) reaches the + caller — so callers have a single exception type to catch. + 2. The model name appears in the message (the most-grepped field). + 3. The ``cause`` field carries the upstream classifier + (``BadRequestError`` vs ``JSONDecodeError`` vs ``ValidationError``) + so the failure attribution survives the wrapping. + + Where the underlying SDK / parser exception is preserved on + ``__cause__``, we assert that too — it's what makes + ``raise ... from exc`` worth doing. + """ + + def test_returns_true_on_valid_envelope(self): + client = MagicMock() + client.chat.completions.create.return_value = _chat_response(content=json.dumps(_MINIMAL_SPEC)) + # The whole public contract collapses to: ``True`` or it raises. + # ``is True`` rather than truthy so a future regression that + # returns a dict/tuple/etc fails this test. 
+ assert check_structured_output_support(client, "some-model", EnvIntentSpec) is True + + def test_returns_true_on_reasoning_content_envelope(self): + # NVIDIA DeepSeek envelope — content empty, structured output + # on the ``reasoning_content`` channel. Must NOT raise; the + # ``extract_response_text`` fallback handles this transparently. + # The previous dataclass surfaced ``response_route`` so callers + # could distinguish; the new API hides that detail (callers + # don't need it — both channels are equivalent for our purposes). + client = MagicMock() + client.chat.completions.create.return_value = _chat_response( + content=None, reasoning_content=json.dumps(_MINIMAL_SPEC) + ) + assert check_structured_output_support(client, "deepseek", EnvIntentSpec) is True + + def test_raises_on_4xx_with_underlying_exception_chained(self): + # The most common "model doesn't support structured outputs" + # signal at the wire level: a 4xx rejecting ``response_format`` + # or the schema. The original SDK exception must reach the + # caller via ``__cause__`` so the traceback retains the HTTP + # status / body — otherwise debugging "why did construction + # fail?" requires re-running locally. + class FakeBadRequest(Exception): + pass + + client = MagicMock() + original = FakeBadRequest("Error code: 400 - additionalProperties") + client.chat.completions.create.side_effect = original + with pytest.raises(RuntimeError, match="does not support structured outputs") as exc_info: + check_structured_output_support(client, "claude", EnvIntentSpec) + msg = str(exc_info.value) + # Model name surfaces (most-grepped field) and the cause type + # classifies the failure (4xx wire error, not parse / validation). + assert "'claude'" in msg + assert "FakeBadRequest" in msg + assert "400" in msg + # On an api_error there's no response payload to echo. + assert "sample_payload = None" in msg + # Exception chaining preserves the original for traceback drill-down. 
+ assert exc_info.value.__cause__ is original + + def test_raises_on_empty_envelope(self): + # Wire accepts the request, model produces nothing on either + # channel. The endpoint silently dropped the structured output + # — the most insidious failure mode, since ``finish_reason`` + # still reads ``stop``. No underlying exception to chain. + client = MagicMock() + client.chat.completions.create.return_value = _chat_response(content=None, reasoning_content=None) + with pytest.raises(RuntimeError, match="does not support structured outputs") as exc_info: + check_structured_output_support(client, "broken", EnvIntentSpec) + msg = str(exc_info.value) + assert "empty envelope" in msg + # finish_reason forwarded so the operator can correlate with + # provider logs (was it a content-filter stop, a length cap, etc.). + assert "finish_reason = 'stop'" in msg + # No upstream exception to chain on this branch (the function + # itself synthesises the failure from a structurally-OK response). + assert exc_info.value.__cause__ is None + + def test_raises_when_choices_list_is_empty(self): + # Real provider behaviour: HTTP returns 200 OK but ``choices`` is + # an empty list. Seen on Azure when a content-filter trips, and + # on Bedrock when a guardrail rejects the response post-hoc. + # Naive ``resp.choices[0]`` access would IndexError and break + # the contract — surface it as a structured RuntimeError with + # a distinct ``cause`` message that operators can tell apart + # from the "envelope returned but content empty" case. 
+ resp = MagicMock() + resp.choices = [] + client = MagicMock() + client.chat.completions.create.return_value = resp + with pytest.raises(RuntimeError, match="does not support structured outputs") as exc_info: + check_structured_output_support(client, "guardrailed", EnvIntentSpec) + msg = str(exc_info.value) + assert "no choices" in msg + assert "response_route = 'empty'" in msg + + def test_raises_on_invalid_json_with_payload_preview(self): + # The JSON-decode failure is the case where ``sample_payload`` + # earns its keep — without it the operator sees only + # "Expecting value: line 1 column 1" and has to re-run locally + # to discover the model emitted a prose preamble. With the + # preview in the message the failure is debuggable from CI logs. + client = MagicMock() + client.chat.completions.create.return_value = _chat_response(content="not json") + with pytest.raises(RuntimeError, match="does not support structured outputs") as exc_info: + check_structured_output_support(client, "m", EnvIntentSpec) + msg = str(exc_info.value) + assert "JSONDecodeError" in msg + assert "'not json'" in msg # the literal response preview + # Original JSONDecodeError preserved on ``__cause__``. + assert exc_info.value.__cause__ is not None + assert type(exc_info.value.__cause__).__name__ == "JSONDecodeError" + + def test_raises_on_validation_failure_with_payload_preview(self): + # JSON parses fine, but doesn't match the schema. The probe + # exists to detect this exact class of "model returns + # something, but it's wrong" failure. The original + # ValidationError chains via ``__cause__`` so ``.errors()`` + # is still reachable for callers that want the structured + # error list. 
+ from pydantic import ValidationError + + client = MagicMock() + client.chat.completions.create.return_value = _chat_response(content='{"missing": "fields"}') + with pytest.raises(RuntimeError, match="does not support structured outputs") as exc_info: + check_structured_output_support(client, "m", EnvIntentSpec) + msg = str(exc_info.value) + assert "ValidationError" in msg + assert '{"missing": "fields"}' in msg # payload preview echoed + assert isinstance(exc_info.value.__cause__, ValidationError) + + def test_request_shape(self): + client = MagicMock() + client.chat.completions.create.return_value = _chat_response(content=json.dumps(_MINIMAL_SPEC)) + check_structured_output_support(client, "model-name", EnvIntentSpec) + kwargs = client.chat.completions.create.call_args.kwargs + assert kwargs["model"] == "model-name" + assert kwargs["temperature"] == 0 + assert kwargs["response_format"]["type"] == "json_schema" + assert kwargs["response_format"]["json_schema"]["name"] == "EnvIntentSpec" + assert kwargs["response_format"]["json_schema"]["strict"] is True + # The schema sent on the wire must already be munged for strict mode + # — otherwise Bedrock rejects with 400. Spot-check the root marker. + sent_schema = kwargs["response_format"]["json_schema"]["schema"] + assert sent_schema["additionalProperties"] is False + + def test_accepts_alternative_spec_class(self): + # Callers can probe with a smaller toy spec for cheap model + # surveys — the probe shouldn't be hard-wired to EnvIntentSpec. 
+ class TinySpec(BaseModel): + ok: bool + + client = MagicMock() + client.chat.completions.create.return_value = _chat_response(content='{"ok": true}') + assert check_structured_output_support(client, "m", TinySpec) is True + kwargs = client.chat.completions.create.call_args.kwargs + assert kwargs["response_format"]["json_schema"]["name"] == "TinySpec" + + +# --------------------------------------------------------------------------- +# Live endpoint (opt-in, network + auth required) +# --------------------------------------------------------------------------- + + +# The probe hits a real model on every run. NVIDIA's hosted DeepSeek-v4-flash +# is intermittently quirky under structured outputs (occasional blank +# ``content``, transient 429 / 5xx from the proxy, etc.); a single failed +# attempt does NOT mean the deployment is actually broken. Allow up to 2 +# reruns so a transient blip doesn't fail CI. Real breakage will fail all 3. +# TODO(qianl): drop the flaky marker once production-side retry is wired +# into ``check_structured_output_support`` (see TODO in structured_output_utils.py). +@pytest.mark.flaky(max_runs=3, min_passes=1) +@pytest.mark.agent_remote_e2e +def test_default_model_supports_structured_output(): + """The default ``EnvGenAgent`` model must support structured outputs. + + This is the gating contract of the whole agent: ``generate_spec`` + is structured-outputs-only, so the default + ``DEFAULT_MODEL`` / ``DEFAULT_BASE_URL`` pair must pass the probe. + Failing here means production env-gen is broken — usually because + NVIDIA changed which channel DeepSeek-v4-flash routes structured + outputs into, or pulled the model from the default-models + catalogue. + + The probe's ``RuntimeError`` already carries a multi-line + diagnostic (model / route / finish_reason / cause / + sample_payload), so test-failure output is self-describing — no + extra error-message construction needed here. 
+ """ + api_key = os.environ.get("NV_API_KEY") + assert api_key, "NV_API_KEY env var required to run live tests" + + from openai import OpenAI + + client = OpenAI(api_key=api_key, base_url=DEFAULT_BASE_URL) + assert check_structured_output_support(client, DEFAULT_MODEL, EnvIntentSpec) is True diff --git a/pytest.ini b/pytest.ini index d9d330ca9..2c26acc38 100644 --- a/pytest.ini +++ b/pytest.ini @@ -5,3 +5,4 @@ markers = with_newton: test uses Newton physics gr00t_policy: test exercises GR00T policy/data code that runs in the base container (lightweight gr00t deps only) gr00t_remote_e2e: test requires a live GR00T remote policy server + agent_remote_e2e: test requires a live OpenAI-compatible chat-completions endpoint (needs NV_API_KEY) diff --git a/setup.py b/setup.py index 82cd92b56..b8afc5af7 100644 --- a/setup.py +++ b/setup.py @@ -14,6 +14,11 @@ "vuer[all]", "lightwheel-sdk", "pytest", + # Used lazily by isaaclab_arena/environments/agentic_env_gen/* for NV_API_KEY-based agent calls. + "openai", + # Hard dependency of isaaclab_arena/environments/agentic_env_gen/env_intent_spec.py (BaseModel / Field / + # model_validator imported at module load — not lazy). + "pydantic>=2.0", ] DEV_DEPS = [