From cd0edf4cb20cfc2dfe517ba6dfdb64b21a9b1850 Mon Sep 17 00:00:00 2001 From: AzurLiu Date: Sun, 31 May 2026 17:01:35 +0800 Subject: [PATCH 1/2] Demo blocked AI eval claim --- falsiflow_ai_eval/evidence.csv | 2 + falsiflow_ai_eval/evidence_pass_demo.csv | 19 ++ .../evidence_placeholder_demo.csv | 2 + falsiflow_ai_eval/project.json | 222 ++++++++++++++++++ .../source_files/ai_eval_raw_export.csv | 14 ++ falsiflow_ai_eval/template.json | 9 + 6 files changed, 268 insertions(+) create mode 100644 falsiflow_ai_eval/evidence.csv create mode 100644 falsiflow_ai_eval/evidence_pass_demo.csv create mode 100644 falsiflow_ai_eval/evidence_placeholder_demo.csv create mode 100644 falsiflow_ai_eval/project.json create mode 100644 falsiflow_ai_eval/source_files/ai_eval_raw_export.csv create mode 100644 falsiflow_ai_eval/template.json diff --git a/falsiflow_ai_eval/evidence.csv b/falsiflow_ai_eval/evidence.csv new file mode 100644 index 0000000..3e18c0a --- /dev/null +++ b/falsiflow_ai_eval/evidence.csv @@ -0,0 +1,2 @@ +gate_id,candidate_id,sample_id,field,value,source_file,measured_at,operator_or_agent,instrument_id,notes +eval_provenance,candidate_model,eval_run_001,dataset_version_recorded,dataset_pending,source_files/ai_eval_raw_export.csv,2026-05-26T08:00:00Z,falsiflow_eval_operator,eval_harness_001,Placeholder dataset version should block readiness. diff --git a/falsiflow_ai_eval/evidence_pass_demo.csv b/falsiflow_ai_eval/evidence_pass_demo.csv new file mode 100644 index 0000000..c01ed03 --- /dev/null +++ b/falsiflow_ai_eval/evidence_pass_demo.csv @@ -0,0 +1,19 @@ +gate_id,candidate_id,sample_id,field,value,source_file,measured_at,operator_or_agent,instrument_id,notes +eval_provenance,candidate_model,eval_run_001,dataset_version_recorded,true,source_files/ai_eval_raw_export.csv,2026-05-26T08:00:00Z,falsiflow_eval_operator,eval_harness_001,Dataset version pinned in evaluation manifest. +eval_provenance,candidate_model,eval_run_001,prompt_set_hash_recorded,true,source_files/ai_eval_raw_export.csv,2026-05-26T08:00:00Z,falsiflow_eval_operator,eval_harness_001,Prompt set hash pinned in evaluation manifest. +eval_provenance,candidate_model,eval_run_001,candidate_model_version_recorded,true,source_files/ai_eval_raw_export.csv,2026-05-26T08:00:00Z,falsiflow_eval_operator,eval_harness_001,Candidate model revision recorded. +eval_provenance,candidate_model,eval_run_001,baseline_model_version_recorded,true,source_files/ai_eval_raw_export.csv,2026-05-26T08:00:00Z,falsiflow_eval_operator,eval_harness_001,Baseline model revision recorded. +eval_provenance,candidate_model,eval_run_001,evaluator_version_recorded,true,source_files/ai_eval_raw_export.csv,2026-05-26T08:00:00Z,falsiflow_eval_operator,eval_harness_001,Evaluator harness version recorded. +benchmark_quality,candidate_model,eval_run_001,exact_match_rate,0.86,source_files/ai_eval_raw_export.csv,2026-05-26T09:00:00Z,falsiflow_eval_operator,eval_harness_001,Candidate exact-match metric. +benchmark_quality,candidate_model,eval_run_001,hallucination_rate,0.035,source_files/ai_eval_raw_export.csv,2026-05-26T09:00:00Z,falsiflow_eval_operator,eval_harness_001,Candidate hallucination audit metric. +benchmark_quality,candidate_model,eval_run_001,safety_policy_failure_rate,0.012,source_files/ai_eval_raw_export.csv,2026-05-26T09:00:00Z,falsiflow_eval_operator,eval_harness_001,Candidate safety-policy failure metric. +benchmark_quality,candidate_model,eval_run_001,evaluated_item_count,640,source_files/ai_eval_raw_export.csv,2026-05-26T09:00:00Z,falsiflow_eval_operator,eval_harness_001,Candidate evaluation item count. +benchmark_quality,baseline_model,eval_run_001,exact_match_rate,0.78,source_files/ai_eval_raw_export.csv,2026-05-26T09:00:00Z,falsiflow_eval_operator,eval_harness_001,Baseline exact-match metric. +benchmark_quality,baseline_model,eval_run_001,hallucination_rate,0.07,source_files/ai_eval_raw_export.csv,2026-05-26T09:00:00Z,falsiflow_eval_operator,eval_harness_001,Baseline hallucination audit metric. +benchmark_quality,baseline_model,eval_run_001,safety_policy_failure_rate,0.025,source_files/ai_eval_raw_export.csv,2026-05-26T09:00:00Z,falsiflow_eval_operator,eval_harness_001,Baseline safety-policy failure metric. +benchmark_quality,baseline_model,eval_run_001,evaluated_item_count,640,source_files/ai_eval_raw_export.csv,2026-05-26T09:00:00Z,falsiflow_eval_operator,eval_harness_001,Baseline evaluation item count. +reproducibility_package,candidate_model,eval_run_001,eval_script_hash_recorded,true,source_files/ai_eval_raw_export.csv,2026-05-26T10:00:00Z,falsiflow_eval_operator,eval_harness_001,Evaluation script hash recorded. +reproducibility_package,candidate_model,eval_run_001,random_seed_logged,true,source_files/ai_eval_raw_export.csv,2026-05-26T10:00:00Z,falsiflow_eval_operator,eval_harness_001,Random seed logged. +reproducibility_package,candidate_model,eval_run_001,raw_outputs_archived,true,source_files/ai_eval_raw_export.csv,2026-05-26T10:00:00Z,falsiflow_eval_operator,eval_harness_001,Raw model outputs archived. +reproducibility_package,candidate_model,eval_run_001,human_spotcheck_passed,true,source_files/ai_eval_raw_export.csv,2026-05-26T10:00:00Z,falsiflow_eval_operator,eval_harness_001,Human spotcheck passed. +reproducibility_package,candidate_model,eval_run_001,regression_ci_run_recorded,true,source_files/ai_eval_raw_export.csv,2026-05-26T10:00:00Z,falsiflow_eval_operator,eval_harness_001,Regression CI run recorded. diff --git a/falsiflow_ai_eval/evidence_placeholder_demo.csv b/falsiflow_ai_eval/evidence_placeholder_demo.csv new file mode 100644 index 0000000..3e18c0a --- /dev/null +++ b/falsiflow_ai_eval/evidence_placeholder_demo.csv @@ -0,0 +1,2 @@ +gate_id,candidate_id,sample_id,field,value,source_file,measured_at,operator_or_agent,instrument_id,notes +eval_provenance,candidate_model,eval_run_001,dataset_version_recorded,dataset_pending,source_files/ai_eval_raw_export.csv,2026-05-26T08:00:00Z,falsiflow_eval_operator,eval_harness_001,Placeholder dataset version should block readiness. diff --git a/falsiflow_ai_eval/project.json b/falsiflow_ai_eval/project.json new file mode 100644 index 0000000..da49b62 --- /dev/null +++ b/falsiflow_ai_eval/project.json @@ -0,0 +1,222 @@ +{ + "project": { + "id": "falsiflow_ai_claim_evaluation_demo", + "name": "Falsiflow AI claim evaluation demo", + "domain": "ai-evaluation-and-benchmark-claims", + "version": "0.1.0" + }, + "claim": { + "id": "model_claim_ready_for_public_comparison", + "statement": "An AI model quality claim is ready for public comparison only after versioned evaluation provenance, benchmark quality gates, and reproducibility artifacts pass.", + "requires_gates": [ + "eval_provenance", + "benchmark_quality", + "reproducibility_package" + ] + }, + "evidence_policy": { + "require_source_files": true, + "reject_placeholder_values": true, + "allowed_source_roots": [ + "source_files" + ], + "required_metadata_fields": [ + "source_file", + "measured_at", + "operator_or_agent" + ], + "placeholder_markers": [ + "dataset_pending", + "model_pending", + "raw_outputs_pending", + "not_measured", + "todo" + ] + }, + "gates": [ + { + "id": "eval_provenance", + "title": "Evaluation provenance", + "samples": [ + { + "candidate_id": "candidate_model", + "sample_id": "eval_run_001" + } + ], + "required_fields": [ + "dataset_version_recorded", + "prompt_set_hash_recorded", + "candidate_model_version_recorded", + "baseline_model_version_recorded", + "evaluator_version_recorded" + ], + "acceptance_rules": [ + { + "field": "dataset_version_recorded", + "operator": "==", + "value": true, + "reason": "Evaluation dataset version must be recorded before comparing model quality." + }, + { + "field": "prompt_set_hash_recorded", + "operator": "==", + "value": true, + "reason": "Prompt or task set hash must be recorded before claiming improvement." + }, + { + "field": "candidate_model_version_recorded", + "operator": "==", + "value": true, + "reason": "Candidate model version must be pinned." + }, + { + "field": "baseline_model_version_recorded", + "operator": "==", + "value": true, + "reason": "Baseline model version must be pinned." + }, + { + "field": "evaluator_version_recorded", + "operator": "==", + "value": true, + "reason": "Evaluator harness version must be pinned." + } + ] + }, + { + "id": "benchmark_quality", + "title": "Benchmark quality gate", + "samples": [ + { + "candidate_id": "candidate_model", + "sample_id": "eval_run_001" + }, + { + "candidate_id": "baseline_model", + "sample_id": "eval_run_001" + } + ], + "required_fields": [ + "exact_match_rate", + "hallucination_rate", + "safety_policy_failure_rate", + "evaluated_item_count" + ], + "derived_fields": [ + { + "field": "accuracy_ratio_vs_baseline", + "operation": "ratio", + "numerator": "exact_match_rate", + "denominator": { + "candidate_id": "baseline_model", + "sample_id": "eval_run_001", + "field": "exact_match_rate" + } + }, + { + "field": "hallucination_ratio_vs_baseline", + "operation": "ratio", + "numerator": "hallucination_rate", + "denominator": { + "candidate_id": "baseline_model", + "sample_id": "eval_run_001", + "field": "hallucination_rate" + } + } + ], + "acceptance_rules": [ + { + "field": "exact_match_rate", + "operator": ">=", + "value": 0.82, + "candidate_id": "candidate_model", + "reason": "Candidate model must clear the absolute quality floor." + }, + { + "field": "accuracy_ratio_vs_baseline", + "operator": ">=", + "value": 1.08, + "candidate_id": "candidate_model", + "reason": "Candidate model must beat the pinned baseline by a meaningful margin." + }, + { + "field": "hallucination_rate", + "operator": "<=", + "value": 0.05, + "candidate_id": "candidate_model", + "reason": "Candidate model must keep hallucination rate inside the release claim boundary." + }, + { + "field": "hallucination_ratio_vs_baseline", + "operator": "<=", + "value": 0.8, + "candidate_id": "candidate_model", + "reason": "Candidate model should reduce hallucination relative to the pinned baseline." + }, + { + "field": "safety_policy_failure_rate", + "operator": "<=", + "value": 0.02, + "candidate_id": "candidate_model", + "reason": "Candidate model must keep safety-policy failures below the public comparison threshold." + }, + { + "field": "evaluated_item_count", + "operator": ">=", + "value": 500, + "candidate_id": "candidate_model", + "reason": "The evaluation must cover enough items to support the public comparison claim." + } + ] + }, + { + "id": "reproducibility_package", + "title": "Reproducibility package", + "samples": [ + { + "candidate_id": "candidate_model", + "sample_id": "eval_run_001" + } + ], + "required_fields": [ + "eval_script_hash_recorded", + "random_seed_logged", + "raw_outputs_archived", + "human_spotcheck_passed", + "regression_ci_run_recorded" + ], + "acceptance_rules": [ + { + "field": "eval_script_hash_recorded", + "operator": "==", + "value": true, + "reason": "Evaluation script hash must be recorded." + }, + { + "field": "random_seed_logged", + "operator": "==", + "value": true, + "reason": "Random seed or deterministic decode settings must be logged." + }, + { + "field": "raw_outputs_archived", + "operator": "==", + "value": true, + "reason": "Raw model outputs must be archived for review." + }, + { + "field": "human_spotcheck_passed", + "operator": "==", + "value": true, + "reason": "Human spotcheck must pass before the claim is public-facing." + }, + { + "field": "regression_ci_run_recorded", + "operator": "==", + "value": true, + "reason": "A regression CI run must be recorded before publishing the comparison." + } + ] + } + ] +} diff --git a/falsiflow_ai_eval/source_files/ai_eval_raw_export.csv b/falsiflow_ai_eval/source_files/ai_eval_raw_export.csv new file mode 100644 index 0000000..913bf02 --- /dev/null +++ b/falsiflow_ai_eval/source_files/ai_eval_raw_export.csv @@ -0,0 +1,14 @@ +record_type,run_id,model_id,dataset_version,prompt_set_hash,metric,value,artifact_uri,notes +manifest,eval_run_001,candidate_model,claims_eval_v2026_05_26,promptset_sha256_demo,evaluator_version,0.4.0,artifacts/eval_manifest.json,Versioned evaluation manifest. +manifest,eval_run_001,baseline_model,claims_eval_v2026_05_26,promptset_sha256_demo,evaluator_version,0.4.0,artifacts/eval_manifest.json,Versioned baseline manifest. +metric,eval_run_001,candidate_model,claims_eval_v2026_05_26,promptset_sha256_demo,exact_match_rate,0.86,artifacts/candidate_metrics.json,Candidate quality metric. +metric,eval_run_001,candidate_model,claims_eval_v2026_05_26,promptset_sha256_demo,hallucination_rate,0.035,artifacts/candidate_judgments.json,Candidate hallucination audit. +metric,eval_run_001,candidate_model,claims_eval_v2026_05_26,promptset_sha256_demo,safety_policy_failure_rate,0.012,artifacts/candidate_safety.json,Candidate safety metric. +metric,eval_run_001,candidate_model,claims_eval_v2026_05_26,promptset_sha256_demo,evaluated_item_count,640,artifacts/candidate_metrics.json,Candidate sample count. +metric,eval_run_001,baseline_model,claims_eval_v2026_05_26,promptset_sha256_demo,exact_match_rate,0.78,artifacts/baseline_metrics.json,Baseline quality metric. +metric,eval_run_001,baseline_model,claims_eval_v2026_05_26,promptset_sha256_demo,hallucination_rate,0.07,artifacts/baseline_judgments.json,Baseline hallucination audit. +metric,eval_run_001,baseline_model,claims_eval_v2026_05_26,promptset_sha256_demo,safety_policy_failure_rate,0.025,artifacts/baseline_safety.json,Baseline safety metric. +metric,eval_run_001,baseline_model,claims_eval_v2026_05_26,promptset_sha256_demo,evaluated_item_count,640,artifacts/baseline_metrics.json,Baseline sample count. +artifact,eval_run_001,candidate_model,claims_eval_v2026_05_26,promptset_sha256_demo,raw_outputs_archived,true,artifacts/candidate_raw_outputs.jsonl,Raw outputs retained for review. +artifact,eval_run_001,candidate_model,claims_eval_v2026_05_26,promptset_sha256_demo,eval_script_hash_recorded,true,artifacts/eval_script.sha256,Evaluation script hash retained. +artifact,eval_run_001,candidate_model,claims_eval_v2026_05_26,promptset_sha256_demo,regression_ci_run_recorded,true,artifacts/ci_run_url.txt,Regression CI run retained. diff --git a/falsiflow_ai_eval/template.json b/falsiflow_ai_eval/template.json new file mode 100644 index 0000000..efc8c2d --- /dev/null +++ b/falsiflow_ai_eval/template.json @@ -0,0 +1,9 @@ +{ + "id": "ai_claim_evaluation", + "name": "AI claim evaluation readiness", + "domain": "ai-evaluation-and-benchmark-claims", + "description": "Evidence-gated workflow for deciding whether an AI model quality claim is ready for public comparison.", + "project_config": "project.json", + "demo_evidence": "evidence_pass_demo.csv", + "placeholder_evidence": "evidence_placeholder_demo.csv" +} From cdd34fdc4c37230da49fdc88403c014e222181b5 Mon Sep 17 00:00:00 2001 From: AzurLiu Date: Sun, 31 May 2026 17:06:15 +0800 Subject: [PATCH 2/2] Add source-backed AI eval evidence --- falsiflow_ai_eval/evidence.csv | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/falsiflow_ai_eval/evidence.csv b/falsiflow_ai_eval/evidence.csv index 3e18c0a..c01ed03 100644 --- a/falsiflow_ai_eval/evidence.csv +++ b/falsiflow_ai_eval/evidence.csv @@ -1,2 +1,19 @@ gate_id,candidate_id,sample_id,field,value,source_file,measured_at,operator_or_agent,instrument_id,notes -eval_provenance,candidate_model,eval_run_001,dataset_version_recorded,dataset_pending,source_files/ai_eval_raw_export.csv,2026-05-26T08:00:00Z,falsiflow_eval_operator,eval_harness_001,Placeholder dataset version should block readiness. +eval_provenance,candidate_model,eval_run_001,dataset_version_recorded,true,source_files/ai_eval_raw_export.csv,2026-05-26T08:00:00Z,falsiflow_eval_operator,eval_harness_001,Dataset version pinned in evaluation manifest. +eval_provenance,candidate_model,eval_run_001,prompt_set_hash_recorded,true,source_files/ai_eval_raw_export.csv,2026-05-26T08:00:00Z,falsiflow_eval_operator,eval_harness_001,Prompt set hash pinned in evaluation manifest. +eval_provenance,candidate_model,eval_run_001,candidate_model_version_recorded,true,source_files/ai_eval_raw_export.csv,2026-05-26T08:00:00Z,falsiflow_eval_operator,eval_harness_001,Candidate model revision recorded. +eval_provenance,candidate_model,eval_run_001,baseline_model_version_recorded,true,source_files/ai_eval_raw_export.csv,2026-05-26T08:00:00Z,falsiflow_eval_operator,eval_harness_001,Baseline model revision recorded. +eval_provenance,candidate_model,eval_run_001,evaluator_version_recorded,true,source_files/ai_eval_raw_export.csv,2026-05-26T08:00:00Z,falsiflow_eval_operator,eval_harness_001,Evaluator harness version recorded. +benchmark_quality,candidate_model,eval_run_001,exact_match_rate,0.86,source_files/ai_eval_raw_export.csv,2026-05-26T09:00:00Z,falsiflow_eval_operator,eval_harness_001,Candidate exact-match metric. +benchmark_quality,candidate_model,eval_run_001,hallucination_rate,0.035,source_files/ai_eval_raw_export.csv,2026-05-26T09:00:00Z,falsiflow_eval_operator,eval_harness_001,Candidate hallucination audit metric. +benchmark_quality,candidate_model,eval_run_001,safety_policy_failure_rate,0.012,source_files/ai_eval_raw_export.csv,2026-05-26T09:00:00Z,falsiflow_eval_operator,eval_harness_001,Candidate safety-policy failure metric. +benchmark_quality,candidate_model,eval_run_001,evaluated_item_count,640,source_files/ai_eval_raw_export.csv,2026-05-26T09:00:00Z,falsiflow_eval_operator,eval_harness_001,Candidate evaluation item count. +benchmark_quality,baseline_model,eval_run_001,exact_match_rate,0.78,source_files/ai_eval_raw_export.csv,2026-05-26T09:00:00Z,falsiflow_eval_operator,eval_harness_001,Baseline exact-match metric. +benchmark_quality,baseline_model,eval_run_001,hallucination_rate,0.07,source_files/ai_eval_raw_export.csv,2026-05-26T09:00:00Z,falsiflow_eval_operator,eval_harness_001,Baseline hallucination audit metric. +benchmark_quality,baseline_model,eval_run_001,safety_policy_failure_rate,0.025,source_files/ai_eval_raw_export.csv,2026-05-26T09:00:00Z,falsiflow_eval_operator,eval_harness_001,Baseline safety-policy failure metric. +benchmark_quality,baseline_model,eval_run_001,evaluated_item_count,640,source_files/ai_eval_raw_export.csv,2026-05-26T09:00:00Z,falsiflow_eval_operator,eval_harness_001,Baseline evaluation item count. +reproducibility_package,candidate_model,eval_run_001,eval_script_hash_recorded,true,source_files/ai_eval_raw_export.csv,2026-05-26T10:00:00Z,falsiflow_eval_operator,eval_harness_001,Evaluation script hash recorded. +reproducibility_package,candidate_model,eval_run_001,random_seed_logged,true,source_files/ai_eval_raw_export.csv,2026-05-26T10:00:00Z,falsiflow_eval_operator,eval_harness_001,Random seed logged. +reproducibility_package,candidate_model,eval_run_001,raw_outputs_archived,true,source_files/ai_eval_raw_export.csv,2026-05-26T10:00:00Z,falsiflow_eval_operator,eval_harness_001,Raw model outputs archived. +reproducibility_package,candidate_model,eval_run_001,human_spotcheck_passed,true,source_files/ai_eval_raw_export.csv,2026-05-26T10:00:00Z,falsiflow_eval_operator,eval_harness_001,Human spotcheck passed. +reproducibility_package,candidate_model,eval_run_001,regression_ci_run_recorded,true,source_files/ai_eval_raw_export.csv,2026-05-26T10:00:00Z,falsiflow_eval_operator,eval_harness_001,Regression CI run recorded.