beyhangl · beyhangl · May 30, 2026 · May 30, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,48 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.2.0] — 2026-05-30
+
+Ships everything developed since the initial `0.1.0` PyPI upload — a much larger
+evaluation surface, new drift-catching and determinism tooling, an honest
+re-scope of the project's positioning, several bug fixes, and a full lint/type
+cleanup. Backward-compatible with `0.1.0` cassettes.
+
+### Added
+
+#### Evaluation
+- **LLM-as-Judge scorers** — `assert_output_semantic`, `assert_factual_consistency`, `assert_tone`, `assert_custom_criteria` (OpenAI or Anthropic judge, configurable model)
+- **RAG metrics** — `assert_faithfulness`, `assert_context_relevance`, `assert_answer_relevance`, `assert_context_recall`
+- **Pairwise A/B** — `pairwise_compare` and `pairwise_rank` (round-robin tournament) with position-bias mitigation
+- **Statistical eval** — `eval_n` with Wilson-score confidence intervals
+- **Multi-judge consensus** — `JuryScorer`
+- **Hallucination detection** — `assert_no_hallucination`, `detect_hallucinations` (per-claim breakdown)
+- **Live-eval mode** — `run_live_eval` / `compare_to_baseline` + the `evalcraft live-eval` CLI: run scorers against the *real* model over a golden input set and gate CI on score regressions. This is the layer that catches model/prompt/retrieval drift, which replay cannot.
+
+#### Cassettes
+- **Provenance metadata** — each recording captures the model set, a prompt hash, SDK/Python versions, and record time (for staleness reasoning); surfaced in `evalcraft info`. Cassettes recorded without provenance still load unchanged.
+- **Opt-in judge cache** — `evalcraft.eval.judge_cache.use_judge_cache(...)` or the `EVALCRAFT_JUDGE_CACHE` env var records and replays LLM-judge responses for deterministic, $0 judge scoring in CI (modes: `auto` / `record` / `replay`).
+
+#### Other
+- Regression `TrendDetector` for multi-run gradual-drift analysis
+- **Gemini** and **Pydantic AI** adapters (Python); Gemini + Vercel AI adapters (JS)
+- `evalcraft generate-tests` (pytest file from a cassette) and `evalcraft doctor` (setup diagnostics)
+- TypeScript/JavaScript SDK (pre-release, source-only): capture/replay, mocks, 16 scorers, OpenAI/Gemini/Vercel AI adapters
+
+### Fixed
+- LangGraph adapter: two `NameError`s in `on_llm_end` / `on_chain_end` (referenced callback params they never receive)
+- NetworkGuard: Python 3.9/3.10 crash from hard-coding the `all_errors` kwarg (added to the stdlib only in 3.11) — now forwards `**kwargs`
+- De-flaked the JS fingerprint-determinism test (pinned `Span.timestamp`)
+- Repointed the dead `evalcraft.dev` documentation URL to the GitHub Pages site
+
+### Changed
+- **Positioning** re-scoped from "The pytest for AI agents" to **"VCR for AI agents"** — honest about what replay does, and no longer colliding with DeepEval's tagline
+- Documentation corrected for accuracy: offline-vs-live scorer labeling, fingerprint/regression semantics (detects *recorded* changes, not live drift), an accurate Python-vs-JS parity matrix, a fact-checked comparison table, and JS install instructions (build-from-source; not yet on npm)
+
+### Internal
+- ruff: 325 → 0 findings; mypy: made runnable and clean across the package (strict bug-catching checks kept on; annotation-completeness sub-checks right-sized to the codebase's style)
+- 803 Python tests and 145 JS tests passing
+
 ## [0.1.0] — 2026-03-05
 
 Initial public release of Evalcraft — the pytest for AI agents.

diff --git a/evalcraft/__init__.py b/evalcraft/__init__.py
@@ -4,7 +4,7 @@
 mock LLMs/tools, score runs, and catch real model drift with live-eval.
 """
 
-__version__ = "0.1.0"
+__version__ = "0.2.0"
 
 from evalcraft.capture.recorder import CaptureContext, capture
 from evalcraft.cloud.client import EvalcraftCloud

diff --git a/evalcraft/cli/main.py b/evalcraft/cli/main.py
@@ -58,7 +58,7 @@ def _fmt_cost(usd: float) -> str:
 # ─── CLI root ─────────────────────────────────────────────────────────────────
 
 @click.group()
-@click.version_option(version="0.1.0", prog_name="evalcraft")
+@click.version_option(version="0.2.0", prog_name="evalcraft")
 def cli() -> None:
     """evalcraft — capture, replay, and evaluate AI agent runs."""
 

diff --git a/evalcraft/cloud/client.py b/evalcraft/cloud/client.py
@@ -311,7 +311,7 @@ def _request(
         body: bytes | None = None
         headers: dict[str, str] = {
             "Accept": "application/json",
-            "User-Agent": "evalcraft-sdk/0.1.0",
+            "User-Agent": "evalcraft-sdk/0.2.0",
         }
         if self.api_key:
             headers["Authorization"] = f"Bearer {self.api_key}"

diff --git a/evalcraft/core/models.py b/evalcraft/core/models.py
@@ -288,7 +288,7 @@ def to_dict(self) -> dict:
         self.compute_metrics()
         self.compute_fingerprint()
         return {
-            "evalcraft_version": "0.1.0",
+            "evalcraft_version": "0.2.0",
             "cassette": {
                 "id": self.id,
                 "name": self.name,

diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "evalcraft"
-version = "0.1.0"
+version = "0.2.0"
 description = "VCR for AI agents — record agent runs as cassettes and replay them deterministically in CI for $0."
 readme = "README.md"
 license = "MIT"

diff --git a/tests/test_e2e_pipeline.py b/tests/test_e2e_pipeline.py
@@ -76,7 +76,7 @@ def test_capture_full_agent_run(self, tmp_path):
 
         # Verify JSON is valid and contains expected data
         data = json.loads(cassette_path.read_text())
-        assert data.get("evalcraft_version") == "0.1.0"
+        assert data.get("evalcraft_version") == "0.2.0"
         assert data["cassette"]["name"] == "weather_agent_run"
         assert data["cassette"]["agent_name"] == "weather_bot"
         assert data["cassette"]["framework"] == "openai"