IBM · araujof · Jun 28, 2026 · Jun 28, 2026 · Jun 28, 2026 · Jun 28, 2026
@@ -14,7 +14,7 @@ OPENAI_BASE_URL=your base url
 
 # Target agent configuration
 AGENT_URL=http://localhost:9000
-# RITS_* are consumed by the example agents under mcp_servers/*, not by Smith itself.
+# RITS_* are consumed by the example agents under examples/*, not by Smith itself.
 # Option A: Ollama (local)
 RITS_MODEL="qwen3.5:latest"
 RITS_BASE_URL="http://localhost:11434/v1"
@@ -26,11 +26,11 @@ OLLAMA_BASE_URL="http://localhost:11434"
 # RITS_MODEL=your rits model
 # RITS_BASE_URL=your rits base url
 # RITS_API_KEY=your rits api key
-TARGET_AGENT_PATH=mcp_servers/your_mcp_server/
-GUIDANCE_FILE=mcp_servers/your_mcp_server/smith/guidance.txt
-SYSTEM_VAR_FILE=mcp_servers/your_mcp_server/smith/system_vars.json
-PROMPTFOO_CONFIG_FILE=mcp_servers/your_mcp_server/smith/promptfooconfig.yaml
-PROMPTFOO_OUTPUT_FILE=mcp_servers/your_mcp_server/smith/redteam1.yaml
+TARGET_AGENT_PATH=examples/your_mcp_server/
+GUIDANCE_FILE=examples/your_mcp_server/smith/guidance.txt
+SYSTEM_VAR_FILE=examples/your_mcp_server/smith/system_vars.json
+PROMPTFOO_CONFIG_FILE=examples/your_mcp_server/smith/promptfooconfig.yaml
+PROMPTFOO_OUTPUT_FILE=examples/your_mcp_server/smith/redteam1.yaml
 
 ## MCP settings: for get_mcp_parameter when generating the policy
 MCP_TRANSPORT=sse
@@ -40,27 +40,29 @@ MCP_URL=http://localhost:8000/sse
 # MCP_TRANSPORT=stdio
 # MCP_COMMAND=python
 # MCP_ARGS=server.py
-# MCP_CWD=mcp_servers/call-for-papers-mcp
+# MCP_CWD=examples/call-for-papers-mcp
 
 # example for nodejs MCP
 # MCP_TRANSPORT=stdio
 # MCP_COMMAND=node
 # MCP_ARGS=dist/index.js
-# MCP_CWD=mcp_servers/context7-mcp
+# MCP_CWD=examples/context7-mcp
 
 
 # Policy testing configuration
 BAD_COMMAND_PATH=references/test_cases/allow
 BENIGN_COMMAND_PATH=references/test_cases/disallow
 TEST_CASE_PATH=references/test_cases/
-TEST_PATH=tests/integration/
+# Where the policy_testing harness writes the scorecard + failure list
+# (relative to BASE_URL). The harness itself ships inside the smith package.
+TEST_OUTPUT_DIR=references/scorecard/
 TEST_RESULT_PATH=scorecard_summary.txt
 TEST_FAILURES_PATH=score_test_failures.txt
 CROSS_VALIDATE_OUTPUT=references/cross_validate_report.json
 
 # Test case generation configuration
 TEST_CASE_TEMPLATE=references/test_case_template.json
-TEST_GENERATION_PATH=scripts/test_generation/
+TEST_GENERATION_PATH=src/smith/test_generation/
 DECOMP_FILE=references/decomp_file.json
 FLATTEN_FILE=references/decomp_flatten_file.json
 ATTACK_FILE_CSV=ares/assets/safety_behaviors_text_subset.csv

@@ -31,14 +31,9 @@ jobs:
     timeout-minutes: 15
     steps:
       - uses: actions/checkout@v4
-      - uses: actions/setup-python@v5
-        with:
-          python-version: "3.11"
-      - name: Install ruff + black
-        # Pinned so CI matches the versions the codebase is formatted/linted
-        # against — an unpinned install pulls newer styles that flag files CI
-        # should pass. Bump these deliberately alongside a reformat commit.
-        run: python -m pip install ruff==0.15.20 black==26.5.1
+      - uses: astral-sh/setup-uv@v5
+      # `make lint` runs ruff/black via `uvx` with versions pinned in the
+      # Makefile (ruff 0.15.20 / black 26.5.1); bump those alongside a reformat.
       - name: make lint
         run: make lint
 
@@ -79,14 +74,12 @@ jobs:
     timeout-minutes: 30
     steps:
       - uses: actions/checkout@v4
+      - uses: astral-sh/setup-uv@v5
       - uses: actions/setup-python@v5
         with:
           python-version: "3.11"
-          cache: pip
-      - name: Install Smith and dependencies
-        run: make install
-      - name: CLI smoke test
-        run: smith --help
+      - name: Install Smith (uv) and smoke-test the CLI
+        run: make build
 
   audit:
     name: Dependency audit (advisory)
@@ -98,10 +91,6 @@ jobs:
     continue-on-error: true
     steps:
       - uses: actions/checkout@v4
-      - uses: actions/setup-python@v5
-        with:
-          python-version: "3.11"
+      - uses: astral-sh/setup-uv@v5
       - name: pip-audit
-        run: |
-          python -m pip install pip-audit
-          pip-audit -r requirements.txt
+        run: make audit
@@ -50,7 +50,7 @@ references/test_case_report.html
 assets/opa/outputs/
 
 # ---- Generated: ARES red-teaming artifacts ----
-scripts/test_generation/ares/assets/*generate*.json
+src/smith/test_generation/ares/assets/*generate*.json
 
 # ---- Generated: policy testing scorecard output ----
 scripts/tests/integration/*.txt

@@ -15,6 +15,26 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ## [Unreleased]
 
+### Changed
+
+- Repackaged `scripts/` into an installable `smith` Python package using a `src/`
+  layout, with `pyproject.toml` at the repo root declaring runtime dependencies
+  (`[project.dependencies]`) and a `[dev]` extra. The CLI entry point is now
+  `smith = smith.cli:main`.
+- Package management and the build/publish workflow now use [uv](https://docs.astral.sh/uv/)
+  (`make install`, `make package`, `make publish`).
+- The OPA scorecard harness ships inside the package (`smith.policy_testing`) and
+  writes all generated outputs to a `BASE_URL`-relative dir (`references/scorecard/`,
+  via `TEST_OUTPUT_DIR`) instead of `scripts/tests/integration/`.
+- Renamed `mcp_servers/` to `examples/`.
+
+### Removed
+
+- Legacy code unreachable from the CLI: a kubectl/mcpgateway/beeai cluster, duplicate
+  entry points, a dead `visualization/` package, and the previous (non-functional)
+  pytest suite. Also removed stray upstream ARES repository scaffolding; ARES is the
+  external `ares-redteamer` tool, located via `ARES_HOME`.
+
 ## [0.1.0] - 2026-06-28
 
 ### Added
@@ -30,7 +50,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - **MCP tool extraction** over SSE and stdio transports (`get_mcp_parameter`) and tool-call translation (`test_case_translation`).
 - **Rego policy validation** with optional auto-fix (`policy_validation`, `policy_validation_fix`).
 - Runtime configuration driven entirely from `.env` (see `.env_template`); target-agent selection via `TARGET_AGENT_PATH`, `GUIDANCE_FILE`, `SYSTEM_VAR_FILE`, `MCP_*`, and `AGENT_URL`.
-- Example target agents under `mcp_servers/`, each carrying its Smith inputs (`guidance.txt`, `tool_definitions.json`, `system_vars.json`).
+- Example target agents under `examples/`, each carrying its Smith inputs (`guidance.txt`, `tool_definitions.json`, `system_vars.json`).
 
 [Unreleased]: https://github.com/IBM/smith/compare/0.1.0...HEAD
 [0.1.0]: https://github.com/IBM/smith/releases/tag/0.1.0
@@ -9,26 +9,25 @@ Smith is an **agent skill** (plugin) that manages the full lifecycle of [OPA](ht
 Smith has two layers that must be understood together:
 
 1. **The skill layer** — `SKILL.md` plus markdown guides under `opa_policy/` and `test_generation/`. These are *instructions the agent (you) follows*, not code. When a user asks to create, test, or improve a policy, the agent reads the relevant markdown guide and orchestrates the work, often by invoking the `smith` CLI.
-2. **The CLI backend** — `scripts/cli.py` exposes a single `smith --flag <stage>` command that runs the heavy Python pipelines (decomposition, attack generation, label validation, clustering, etc.).
+2. **The CLI backend** — `src/smith/cli.py` exposes a single `smith --flag <stage>` command that runs the heavy Python pipelines (decomposition, attack generation, label validation, clustering, etc.).
 
 So: high-level control flow lives in markdown guides; deterministic pipeline stages live behind CLI flags.
 
 ## Commands
 
 ```bash
-# Setup (from repo root)
-python -m venv .venv && source .venv/bin/activate
-make install                   # pip install -r requirements.txt + editable scripts/ (installs the `smith` CLI)
+# Setup (from repo root) — package management uses uv
+make install                   # uv venv + uv pip install -e ".[dev]" (installs the `smith` CLI)
 cp .env_template .env          # then fill in values (see Configuration below)
 
 # Dev workflow — the root Makefile mirrors CI (.github/workflows/ci.yml); `make ci` is the gate
-make lint            # ruff check + black --check (config in scripts/pyproject.toml)
+make lint            # ruff check + black --check (config in pyproject.toml)
 make format          # ruff --fix + black (apply fixes)
 make lint-policy     # Regal (falls back to OPA) lint of assets/policy.rego
-make license-check   # verify SPDX Apache-2.0 headers (scripts/tools/license_headers.py)
+make license-check   # verify SPDX Apache-2.0 headers (src/smith/tools/license_headers.py)
 make build           # editable install + `smith --help` smoke test
 make ci              # the gate: lint + lint-policy + license-check
-make test            # policy scorecard (delegates to scripts/Makefile; needs Docker + the OPA server)
+make test            # policy scorecard: starts OPA in Docker + runs the packaged harness
 
 # CLI pipeline stages (run from anywhere once installed; reads paths from .env)
 smith --flag get_mcp_parameter      # auto-extract MCP tool defs -> <TARGET_AGENT_PATH>/smith/tool_definitions.json
@@ -44,28 +43,25 @@ smith --flag apply_cross_validate   # apply approved label corrections from cros
 smith --flag policy_validation --policy_path <file.rego>      # validate a rego file
 smith --flag policy_validation_fix --policy_path <file.rego>  # validate and auto-fix
 
-# Policy-testing harness (scripts/Makefile — what `smith --flag policy_testing` and root `make test` invoke)
-cd scripts
+# Policy-testing OPA server (root Makefile; the packaged harness in
+# src/smith/policy_testing/ is what `smith --flag policy_testing` and `make test` invoke)
 make opaserver/start   # start OPA server on :8181 with assets/policy.rego (lints first)
-make test              # run tests/integration/score_card.sh, output scorecard + failures
-make test/verbose      # per-test-case results
-make lint/policy       # OPA check on the rego policy
-make lint/code         # ruff + black over scripts tests visualization
-make opaserver/stop
+make opaserver/status  # show whether the OPA container is running
+make opaserver/stop    # stop the OPA server
 ```
 
-`make test` requires the OPA server to be running and curls `localhost:8181/v1/data/mcp/policies/allow` for every JSON case under `references/test_cases/{allow,disallow}/`. A case in `disallow/` is expected to return `allow: false`; `allow/` expects `true`. Results land in `scripts/tests/integration/{scorecard_summary.txt,score_test_failures.txt,tp.txt,fp.txt,tn.txt,fn.txt}`.
+`make test` requires the OPA server to be running and curls `localhost:8181/v1/data/mcp/policies/allow` for every JSON case under `references/test_cases/{allow,disallow}/`. A case in `disallow/` is expected to return `allow: false`; `allow/` expects `true`. Results land in `references/scorecard/{scorecard_summary.txt,score_test_failures.txt,tp.txt,fp.txt,tn.txt,fn.txt}`.
 
 ## External tools (install separately)
 
 - **OPA** + **Regal** (Styra linter) — required for testing and `regal_suggestion`.
-- **ARES** (IBM red-teaming) and **Promptfoo** (`npm install -g promptfoo`) — required for adversarial test generation. ARES installs under `scripts/test_generation/ares/` and needs its plugins (`ares-autodan`, `ares-human-jailbreak`, `ares-garak`).
+- **ARES** (IBM red-teaming) and **Promptfoo** (`npm install -g promptfoo`) — required for adversarial test generation. ARES installs under `src/smith/test_generation/ares/` and needs its plugins (`ares-autodan`, `ares-human-jailbreak`, `ares-garak`).
 
 ## Repo conventions
 
-- **Packaging + tool config live in `scripts/pyproject.toml`** (not at the repo root): flat layout (`cli.py` is the import root), console entry `smith = cli:main`, and `[tool.ruff]`/`[tool.black]` config. Black is pinned to `target-version = py311` so formatting is deterministic across interpreters; the vendored ARES tree is excluded from packaging and linting.
+- **Packaging + tool config live in the root `pyproject.toml`**: src layout (`src/smith/`), console entry `smith = smith.cli:main`, declared `[project.dependencies]` (+ `[dev]` extra), `[tool.setuptools.package-data]` shipping the policy_testing harness + `ares_config`, and `[tool.ruff]`/`[tool.black]` config. Black is pinned to `target-version = py311`. Package management uses **uv** (`make install`, `make package`/`make publish` → `uv build`/`uv publish`).
 - **CI** (`.github/workflows/ci.yml`) mirrors `make ci`; `make lint` runs ruff/black via `uvx` with versions pinned in the Makefile (`ruff==0.15.20` / `black==26.5.1`) — bump these deliberately alongside a reformat commit. The Rego-lint job is currently disabled in CI; still run `make lint-policy` locally.
-- **License headers:** every in-scope file (`.py`, `.rego`, `.sh`, `.yaml`, `.yml`, plus `Makefile`/`Dockerfile`) carries an Apache-2.0 SPDX header. `make license` inserts, `make license-check` verifies (`scripts/tools/license_headers.py`). Excludes `scripts/test_generation/ares/`, `mcp_servers/`, `references/`, and generated outputs.
+- **License headers:** every in-scope file (`.py`, `.rego`, `.sh`, `.yaml`, `.yml`, plus `Makefile`/`Dockerfile`) carries an Apache-2.0 SPDX header. `make license` inserts, `make license-check` verifies (`src/smith/tools/license_headers.py`). Excludes `src/smith/test_generation/ares/`, `examples/`, `references/`, and generated outputs.
 - **DCO sign-off** is required on every commit (`git commit -s`).
 - **Changelog:** user-facing changes get an entry under `## [Unreleased]` in `CHANGELOG.md` (Keep a Changelog); maintainers promote it to a dated version when cutting a release tag.
 - `smith --help` and a bare `smith` (no flag) work without a populated `.env` — args are parsed before any env-derived path assembly, so don't reintroduce eager `BASE_URL + os.getenv(...)` work ahead of `argparse`.
@@ -74,13 +70,13 @@ make opaserver/stop
 
 Almost every path in the codebase is **assembled from `.env` at runtime** via `os.getenv`, not hardcoded. The dominant pattern is `BASE_URL + os.getenv("SOME_PATH")`. `BASE_URL` is the absolute path to the skill folder (trailing slash). When you change where files are read/written, you are almost always editing `.env`, not Python.
 
-Target-agent selection is driven by a small set of vars: `TARGET_AGENT_PATH`, `GUIDANCE_FILE`, `SYSTEM_VAR_FILE`, `MCP_*`, and `AGENT_URL`. Pointing Smith at a different agent example (under `mcp_servers/`) means repointing these, not changing code.
+Target-agent selection is driven by a small set of vars: `TARGET_AGENT_PATH`, `GUIDANCE_FILE`, `SYSTEM_VAR_FILE`, `MCP_*`, and `AGENT_URL`. Pointing Smith at a different agent example (under `examples/`) means repointing these, not changing code.
 
 Key model vars: `MODEL_SONNET` (the LLM used across pipelines), `OPENAI_API_KEY`, `OPENAI_BASE_URL`, `TEMP`, `TOP_P`.
 
 ## Per-target-agent inputs
 
-Each target agent under `mcp_servers/<agent>/` carries its Smith inputs in a `smith/` subfolder:
+Each target agent under `examples/<agent>/` carries its Smith inputs in a `smith/` subfolder:
 - `guidance.txt` — natural-language policy rules (the source of truth).
 - `tool_definitions.json` — MCP tools + params (auto-generated by `get_mcp_parameter`); maps to `input.arguments.*` in the policy.
 - `system_vars.json` — session/system variables (roles, teams, claims); maps to `input.extensions.subject.*`.
@@ -93,7 +89,7 @@ The generated policy may **only** reference data available from tool arguments o
 - `assets/opa/` — OPA intermediate results: AST (`ast.json`), graph (`ast.dot`), backups.
 - `references/` — all generated intermediates: `decomp_file.json`, `vars_file.json`, `test_cases.json`, attack files, `label_validation_results.json`, `test_case_report.html`, and final `test_cases/{allow,disallow,malicious}/`.
 
-## scripts/ package map
+## src/smith/ package map
 
 - `policy_generation/` — MCP tool extraction (`extract_tools.py`) and rego validation (`validate_policy.py`).
 - `test_generation/` — generation pipeline stages run in order by the `test_generation` flag: `decompose` → `grey_condition` → `variable_extraction` → `case_generation` → `attack` (ARES) → `attack_promptfoo` → `convert_test_case`. Also `extract_tool_args.py` for translation.

@@ -25,30 +25,33 @@ The [`Makefile`](Makefile) mirrors CI — a green `make ci` locally means a gree
 pipeline:
 
 ```bash
-make install         # create a venv and install Smith + dependencies
-make lint            # ruff check + black --check (read-only)
+make install         # create a uv venv and install Smith (editable) + [dev] extras
+make lint            # ruff check + black --check over src/ (read-only)
 make format          # ruff --fix + black (apply formatting)
 make lint-policy     # Regal/OPA lint of assets/policy.rego
 make license-check   # verify every in-scope file carries the SPDX header
 make test            # policy scorecard (needs Docker + the OPA server)
-make ci              # the gate: lint + lint-policy + license-check + build smoke
+make ci              # the gate: lint + lint-policy + license-check
 ```
 
+Package management uses [`uv`](https://docs.astral.sh/uv/). Build and publish
+with `make package` / `make publish` (`uv build` / `uv publish`).
+
 Before submitting a PR, make sure `make ci` passes.
 
 ## Coding standards
 
 - **Python 3.11+.** Keep code compatible with 3.11 and 3.12.
 - **Formatting & linting:** [`ruff`](https://docs.astral.sh/ruff/) and
   [`black`](https://black.readthedocs.io/) (config in
-  [`scripts/pyproject.toml`](scripts/pyproject.toml)). CI runs `ruff check` and
+  [`pyproject.toml`](pyproject.toml)). CI runs `ruff check` and
   `black --check`; run `make format` to fix issues locally.
 - **Rego:** policies are linted/formatted with Regal
   ([`make lint-policy`](Makefile)). Keep rule names, namespaces, and allow/deny
   semantics consistent; add only narrowly scoped conditions rather than rewriting
   whole policies.
-- The vendored ARES tree (`scripts/test_generation/ares/`) is a separate upstream
-  project and is excluded from linting, formatting, and license headers.
+- The vendored ARES inputs (`src/smith/test_generation/ares/`) are separate
+  upstream material and are excluded from linting, formatting, and license headers.
 
 ### Source file headers
 
@@ -63,7 +66,7 @@ shell, YAML, Makefiles, Dockerfiles, and Rego (all `#`-comment formats):
 Place it after any shebang (`#!/usr/bin/env python3`, `#!/bin/bash`). Run
 `make license` to insert missing headers and `make license-check` to verify
 coverage; both are driven by
-[`scripts/tools/license_headers.py`](scripts/tools/license_headers.py).
+[`src/smith/tools/license_headers.py`](src/smith/tools/license_headers.py).
 
 ## Changelog