diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 0000000..ec9c6e5 --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,28 @@ +name: tests + +on: + push: + pull_request: + +jobs: + pytest: + runs-on: ubuntu-latest + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + pip install pytest + + - name: Run tests + run: | + pytest -q \ No newline at end of file diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..5ef52eb --- /dev/null +++ b/pytest.ini @@ -0,0 +1,4 @@ +[pytest] +testpaths = tests +pythonpath = . +addopts = -q \ No newline at end of file diff --git a/sample_data/eval_cases.json b/sample_data/eval_cases.json new file mode 100644 index 0000000..65c11b2 --- /dev/null +++ b/sample_data/eval_cases.json @@ -0,0 +1,37 @@ +[ + { + "id": "E-001", + "title": "Uncapped liability + consequential", + "text": "The Vendor shall be liable for all damages including consequential, indirect, special damages. Liability is uncapped.", + "expected_risk_level": "High", + "expected_categories": ["Liability"] + }, + { + "id": "E-002", + "title": "Termination for convenience no cure", + "text": "Either party may terminate for convenience at any time upon written notice. No cure period is provided.", + "expected_risk_level": "Medium", + "expected_categories": ["Termination"] + }, + { + "id": "E-003", + "title": "Neutral baseline", + "text": "This agreement describes services and payment terms. Standard confidentiality applies. No special liability clauses are stated.", + "expected_risk_level": "Low", + "expected_categories": [] + }, + { + "id": "E-004", + "title": "Both liability + termination", + "text": "Either party may terminate for convenience at any time upon notice. Vendor is liable for all damages including consequential. Liability is uncapped.", + "expected_risk_level": "High", + "expected_categories": ["Liability", "Termination"] + }, + { + "id": "E-005", + "title": "Cure period present (should reduce termination signal)", + "text": "Either party may terminate upon material breach if not cured within 30 days after notice. Termination for convenience is not permitted.", + "expected_risk_level": "Low", + "expected_categories": [] + } +] \ No newline at end of file diff --git a/src/evaluation_stub.py b/src/evaluation_stub.py index a152622..967745e 100644 --- a/src/evaluation_stub.py +++ b/src/evaluation_stub.py @@ -1,33 +1,152 @@ from __future__ import annotations + +import json +from dataclasses import dataclass from pathlib import Path +from typing import Any, Dict, List, Set, Tuple + +from src.model_adapter import analyze_contract + + +@dataclass +class EvalCase: + id: str + title: str + text: str + expected_risk_level: str + expected_categories: List[str] + + +def _normalize_level(level: str) -> str: + return str(level).strip().title() + + +def _predicted_categories(result_dict: Dict[str, Any]) -> Set[str]: + findings = result_dict.get("findings", []) + cats = {f.get("category", "").strip() for f in findings if f.get("category")} + return {c for c in cats if c} + + +def run_evaluation(cases: List[EvalCase]) -> Dict[str, Any]: + """ + Runs evaluation cases through the adapter, compares predicted vs expected. + + Returns a dict: + - risk_level_accuracy + - category_precision/recall/f1 (micro) + - mismatches[] + """ + if not cases: + return { + "risk_level_accuracy": 0.0, + "category_precision": 0.0, + "category_recall": 0.0, + "category_f1": 0.0, + "mismatches": [], + "n": 0, + } + + level_hits = 0 + tp = fp = fn = 0 + mismatches: List[Dict[str, Any]] = [] + + for c in cases: + result = analyze_contract(c.text, title=c.title, source_type="paste").model_dump() -from .model_adapter import analyze_contract + pred_level = _normalize_level(result["summary"]["risk_level"]) + exp_level = _normalize_level(c.expected_risk_level) + if pred_level == exp_level: + level_hits += 1 -SAMPLES_DIR = Path("sample_data") + pred_cats = _predicted_categories(result) + exp_cats = set(c.expected_categories) + + # micro counts + tp += len(pred_cats & exp_cats) + fp += len(pred_cats - exp_cats) + fn += len(exp_cats - pred_cats) + + if pred_level != exp_level or pred_cats != exp_cats: + mismatches.append( + { + "id": c.id, + "title": c.title, + "pred_risk_level": pred_level, + "exp_risk_level": exp_level, + "pred_categories": sorted(list(pred_cats)), + "exp_categories": sorted(list(exp_cats)), + "run_id": result.get("run_id"), + } + ) + + accuracy = level_hits / len(cases) + + precision = tp / (tp + fp) if (tp + fp) else 0.0 + recall = tp / (tp + fn) if (tp + fn) else 0.0 + f1 = (2 * precision * recall / (precision + recall)) if (precision + recall) else 0.0 + + return { + "risk_level_accuracy": round(accuracy, 3), + "category_precision": round(precision, 3), + "category_recall": round(recall, 3), + "category_f1": round(f1, 3), + "mismatches": mismatches, + "n": len(cases), + } + + +def load_cases(path: str | Path) -> List[EvalCase]: + p = Path(path) + payload = json.loads(p.read_text(encoding="utf-8")) + + cases: List[EvalCase] = [] + for row in payload: + cases.append( + EvalCase( + id=row["id"], + title=row.get("title", row["id"]), + text=row["text"], + expected_risk_level=row["expected_risk_level"], + expected_categories=row.get("expected_categories", []), + ) + ) + return cases def main() -> None: - samples = sorted(SAMPLES_DIR.glob("*.txt")) - if not samples: - print("No sample contracts found in sample_data/. Add .txt files to run evaluation.") - return - - scores = [] - for fp in samples: - contract_text = fp.read_text(encoding="utf-8", errors="ignore") - result = analyze_contract(contract_text=contract_text, title=fp.stem, source_type="sample_data") - - score = result.summary.overall_risk_score - level = result.summary.risk_level - findings_count = len(result.findings) - - scores.append(score) - print(f"{fp.name}: score={score}, level={level}, findings={findings_count}") - - if scores: - print("\nSummary") - print(f"count={len(scores)} min={min(scores)} max={max(scores)} avg={sum(scores)/len(scores):.2f}") + import argparse + + parser = argparse.ArgumentParser(description="Offline evaluation harness (stub).") + parser.add_argument( + "path", + nargs="?", + default="sample_data/eval_cases.json", + help="Path to evaluation cases JSON (default: sample_data/eval_cases.json)", + ) + args = parser.parse_args() + + cases = load_cases(args.path) + metrics = run_evaluation(cases) + + print("\n=== Evaluation Summary ===") + print(f"Cases: {metrics['n']}") + print(f"Risk-level accuracy: {metrics['risk_level_accuracy']}") + print( + "Category micro P/R/F1: " + f"{metrics['category_precision']} / {metrics['category_recall']} / {metrics['category_f1']}" + ) + + if metrics["mismatches"]: + print("\n=== Mismatches ===") + for m in metrics["mismatches"]: + print( + f"- {m['id']} | {m['title']} | " + f"level {m['pred_risk_level']} vs {m['exp_risk_level']} | " + f"cats {m['pred_categories']} vs {m['exp_categories']}" + ) + else: + print("\nNo mismatches 🎉") if __name__ == "__main__": diff --git a/tests/test_evaluation_harness.py b/tests/test_evaluation_harness.py new file mode 100644 index 0000000..fcff3bf --- /dev/null +++ b/tests/test_evaluation_harness.py @@ -0,0 +1,7 @@ +from src.evaluation_stub import load_cases, run_evaluation + +def test_eval_harness_loads_and_runs(): + cases = load_cases("sample_data/eval_cases.json") + metrics = run_evaluation(cases) + assert "risk_level_accuracy" in metrics + assert metrics["n"] > 0 \ No newline at end of file diff --git a/tests/test_evidence_rule.py b/tests/test_evidence_rule.py new file mode 100644 index 0000000..5876307 --- /dev/null +++ b/tests/test_evidence_rule.py @@ -0,0 +1,13 @@ +from src.model_adapter import analyze_contract + +def test_all_findings_have_evidence(): + text = "Vendor is liable for all damages including consequential. Liability is uncapped." + result = analyze_contract(text, title="Test", source_type="paste").model_dump() + findings = result.get("findings", []) + assert len(findings) > 0 + + for f in findings: + ev = f.get("evidence", []) + assert isinstance(ev, list) + assert len(ev) >= 1 + assert ev[0].get("snippet") \ No newline at end of file diff --git a/tests/test_scoring_monotonic.py b/tests/test_scoring_monotonic.py new file mode 100644 index 0000000..2929426 --- /dev/null +++ b/tests/test_scoring_monotonic.py @@ -0,0 +1,32 @@ +from src.schemas import Finding, Evidence +from src.scoring import compute_score + +def test_high_severity_scores_higher_than_low(): + low = [ + Finding( + finding_id="T-LOW", + category="General", + risk_statement="Low risk", + severity="Low", + confidence=1.0, + evidence=[Evidence(clause_ref="X", snippet="...")], + recommendation="N/A", + ) + ] + + high = [ + Finding( + finding_id="T-HIGH", + category="Liability", + risk_statement="High risk", + severity="High", + confidence=1.0, + evidence=[Evidence(clause_ref="Y", snippet="...")], + recommendation="N/A", + ) + ] + + s_low, _ = compute_score(low) + s_high, _ = compute_score(high) + + assert s_high > s_low \ No newline at end of file