From 5757e427c0d7e6669efbbaeee03115ed6c882ed5 Mon Sep 17 00:00:00 2001 From: beyhangl Date: Sat, 30 May 2026 12:43:04 +0300 Subject: [PATCH] release: v0.2.0 Bump 0.1.0 -> 0.2.0 across pyproject, __init__, CLI --version, cassette evalcraft_version, and cloud User-Agent. Add the 0.2.0 CHANGELOG entry covering everything since the 0.1.0 PyPI upload (full eval suite, live-eval, provenance, judge cache, Gemini/PydanticAI adapters, the 3 bug fixes, and the lint/type cleanup). Built + twine-checked locally: evalcraft-0.2.0 wheel + sdist PASS; 803 tests green. --- CHANGELOG.md | 42 ++++++++++++++++++++++++++++++++++++++ evalcraft/__init__.py | 2 +- evalcraft/cli/main.py | 2 +- evalcraft/cloud/client.py | 2 +- evalcraft/core/models.py | 2 +- pyproject.toml | 2 +- tests/test_e2e_pipeline.py | 2 +- 7 files changed, 48 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 45a470c2..88688f6a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,48 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.2.0] — 2026-05-30 + +Ships everything developed since the initial `0.1.0` PyPI upload — a much larger +evaluation surface, new drift-catching and determinism tooling, an honest +re-scope of the project's positioning, several bug fixes, and a full lint/type +cleanup. Backward-compatible with `0.1.0` cassettes. + +### Added + +#### Evaluation +- **LLM-as-Judge scorers** — `assert_output_semantic`, `assert_factual_consistency`, `assert_tone`, `assert_custom_criteria` (OpenAI or Anthropic judge, configurable model) +- **RAG metrics** — `assert_faithfulness`, `assert_context_relevance`, `assert_answer_relevance`, `assert_context_recall` +- **Pairwise A/B** — `pairwise_compare` and `pairwise_rank` (round-robin tournament) with position-bias mitigation +- **Statistical eval** — `eval_n` with Wilson-score confidence intervals +- **Multi-judge consensus** — `JuryScorer` +- **Hallucination detection** — `assert_no_hallucination`, `detect_hallucinations` (per-claim breakdown) +- **Live-eval mode** — `run_live_eval` / `compare_to_baseline` + the `evalcraft live-eval` CLI: run scorers against the *real* model over a golden input set and gate CI on score regressions. This is the layer that catches model/prompt/retrieval drift, which replay cannot. + +#### Cassettes +- **Provenance metadata** — each recording captures the model set, a prompt hash, SDK/Python versions, and record time (for staleness reasoning); surfaced in `evalcraft info`. Loads provenance-less cassettes unchanged. +- **Opt-in judge cache** — `evalcraft.eval.judge_cache.use_judge_cache(...)` / the `EVALCRAFT_JUDGE_CACHE` env var record/replay LLM-judge responses for deterministic, $0 judge scoring in CI (modes: `auto` / `record` / `replay`). + +#### Other +- Regression `TrendDetector` for multi-run gradual-drift analysis +- **Gemini** and **Pydantic AI** adapters (Python); Gemini + Vercel AI adapters (JS) +- `evalcraft generate-tests` (pytest file from a cassette) and `evalcraft doctor` (setup diagnostics) +- TypeScript/JavaScript SDK (pre-release, source-only): capture/replay, mocks, 16 scorers, OpenAI/Gemini/Vercel AI adapters + +### Fixed +- LangGraph adapter: two `NameError`s in `on_llm_end` / `on_chain_end` (referenced callback params they never receive) +- NetworkGuard: Python 3.9/3.10 crash from hard-coding the `all_errors` kwarg (added to the stdlib only in 3.11) — now forwards `**kwargs` +- De-flaked the JS fingerprint-determinism test (pinned `Span.timestamp`) +- Repointed the dead `evalcraft.dev` documentation URL to the GitHub Pages site + +### Changed +- **Positioning** re-scoped from "The pytest for AI agents" to **"VCR for AI agents"** — honest about what replay does, and no longer colliding with DeepEval's tagline +- Documentation corrected for accuracy: offline-vs-live scorer labeling, fingerprint/regression semantics (detects *recorded* changes, not live drift), an accurate Python-vs-JS parity matrix, a fact-checked comparison table, and JS install instructions (build-from-source; not yet on npm) + +### Internal +- ruff: 325 → 0 findings; mypy: made runnable and clean across the package (strict bug-catching checks kept on; annotation-completeness sub-checks right-sized to the codebase's style) +- 803 Python tests and 145 JS tests passing + ## [0.1.0] — 2026-03-05 Initial public release of Evalcraft — the pytest for AI agents. diff --git a/evalcraft/__init__.py b/evalcraft/__init__.py index f513a1ea..53ede5e7 100644 --- a/evalcraft/__init__.py +++ b/evalcraft/__init__.py @@ -4,7 +4,7 @@ mock LLMs/tools, score runs, and catch real model drift with live-eval. """ -__version__ = "0.1.0" +__version__ = "0.2.0" from evalcraft.capture.recorder import CaptureContext, capture from evalcraft.cloud.client import EvalcraftCloud diff --git a/evalcraft/cli/main.py b/evalcraft/cli/main.py index 6fb60dd4..1a12e0f0 100644 --- a/evalcraft/cli/main.py +++ b/evalcraft/cli/main.py @@ -58,7 +58,7 @@ def _fmt_cost(usd: float) -> str: # ─── CLI root ───────────────────────────────────────────────────────────────── @click.group() -@click.version_option(version="0.1.0", prog_name="evalcraft") +@click.version_option(version="0.2.0", prog_name="evalcraft") def cli() -> None: """evalcraft — capture, replay, and evaluate AI agent runs.""" diff --git a/evalcraft/cloud/client.py b/evalcraft/cloud/client.py index d4d334c2..c7ee0748 100644 --- a/evalcraft/cloud/client.py +++ b/evalcraft/cloud/client.py @@ -311,7 +311,7 @@ def _request( body: bytes | None = None headers: dict[str, str] = { "Accept": "application/json", - "User-Agent": "evalcraft-sdk/0.1.0", + "User-Agent": "evalcraft-sdk/0.2.0", } if self.api_key: headers["Authorization"] = f"Bearer {self.api_key}" diff --git a/evalcraft/core/models.py b/evalcraft/core/models.py index 194cb7dc..08a245d0 100644 --- a/evalcraft/core/models.py +++ b/evalcraft/core/models.py @@ -288,7 +288,7 @@ def to_dict(self) -> dict: self.compute_metrics() self.compute_fingerprint() return { - "evalcraft_version": "0.1.0", + "evalcraft_version": "0.2.0", "cassette": { "id": self.id, "name": self.name, diff --git a/pyproject.toml b/pyproject.toml index c1443742..9c1a7d12 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "evalcraft" -version = "0.1.0" +version = "0.2.0" description = "VCR for AI agents — record agent runs as cassettes and replay them deterministically in CI for $0." readme = "README.md" license = "MIT" diff --git a/tests/test_e2e_pipeline.py b/tests/test_e2e_pipeline.py index 775e7a09..2429716d 100644 --- a/tests/test_e2e_pipeline.py +++ b/tests/test_e2e_pipeline.py @@ -76,7 +76,7 @@ def test_capture_full_agent_run(self, tmp_path): # Verify JSON is valid and contains expected data data = json.loads(cassette_path.read_text()) - assert data.get("evalcraft_version") == "0.1.0" + assert data.get("evalcraft_version") == "0.2.0" assert data["cassette"]["name"] == "weather_agent_run" assert data["cassette"]["agent_name"] == "weather_bot" assert data["cassette"]["framework"] == "openai"