encypherai · erik-sv · Jun 5, 2026 · Jun 5, 2026 · Jun 5, 2026
diff --git a/docs/claim-generator-requirements.md b/docs/claim-generator-requirements.md
diff --git a/src/c2pa_kg/builders/ir_builder.py b/src/c2pa_kg/builders/ir_builder.py
@@ -27,7 +27,7 @@
 )
 from c2pa_kg.parsers.asciidoc import parse_assertion_docs, parse_validation_doc
 from c2pa_kg.parsers.cddl import parse_cddl_directory
-from c2pa_kg.parsers.html_spec import parse_html_spec_file
+from c2pa_kg.parsers.html_spec import parse_html_generation_rules, parse_html_spec_file
 from c2pa_kg.parsers.json_schema import parse_json_schema
 
 # ---------------------------------------------------------------------------
@@ -506,9 +506,11 @@ def build_knowledge_graph(
             extracted schemas ZIP - must contain a cddl/ directory).
         version: SpecVersion metadata for the graph being built.
         html_spec: Path to a locally saved rendered HTML spec from
-            spec.c2pa.org. When provided, validation rules and status codes
-            are extracted from the HTML instead of the AsciiDoc source. This
-            allows building from the public site without specs-core access.
+            spec.c2pa.org. When provided alongside specs-core AsciiDoc,
+            claim-generator requirements are extracted from HTML and appended
+            to the AsciiDoc validation rules. If AsciiDoc validation sources
+            are unavailable, validation rules and status codes are extracted
+            from HTML as a fallback.
 
     Returns:
         Fully populated KnowledgeGraph.
@@ -537,22 +539,30 @@ def build_knowledge_graph(
 
     # ------------------------------------------------------------------
     # 3. Validation rules, status codes, assertion descriptions.
-    #    Prefer HTML spec when provided; fall back to AsciiDoc source.
+    #    Use AsciiDoc for validation/status codes when available: it preserves
+    #    the established VAL-* rule IDs used by predicates. HTML is used to add
+    #    claim-generator GEN-* requirements from non-validation sections.
     # ------------------------------------------------------------------
-    if html_spec is not None and html_spec.is_file():
+    validation_path = spec_source / _VALIDATION_SUBPATH
+    used_html_for_validation = False
+    if validation_path.is_file():
+        rules, status_codes = parse_validation_doc(validation_path)
+        for rule in rules:
+            kg.add_rule(rule)
+        for code in status_codes:
+            kg.status_codes.append(code)
+    elif html_spec is not None and html_spec.is_file():
         rules, status_codes = parse_html_spec_file(html_spec)
+        used_html_for_validation = True
         for rule in rules:
             kg.add_rule(rule)
         for code in status_codes:
             kg.status_codes.append(code)
-    else:
-        validation_path = spec_source / _VALIDATION_SUBPATH
-        if validation_path.is_file():
-            rules, status_codes = parse_validation_doc(validation_path)
-            for rule in rules:
-                kg.add_rule(rule)
-            for code in status_codes:
-                kg.status_codes.append(code)
+
+    if html_spec is not None and html_spec.is_file() and not used_html_for_validation:
+        html_text = html_spec.read_text(encoding="utf-8")
+        for rule in parse_html_generation_rules(html_text):
+            kg.add_rule(rule)
 
     assertions_dir = spec_source / _ASSERTIONS_SUBPATH
     if assertions_dir.is_dir():

diff --git a/src/c2pa_kg/cli.py b/src/c2pa_kg/cli.py
@@ -82,7 +82,20 @@ def cli() -> None:
     type=click.Path(file_okay=False, path_type=Path),
     help="Directory where output artifacts are written.",
 )
-def generate(spec_source: Path, version: str, output_dir: Path) -> None:
+@click.option(
+    "--html-spec",
+    type=click.Path(exists=True, dir_okay=False, path_type=Path),
+    help=(
+        "Path to a rendered C2PA spec HTML page. Supplies claim-generator requirements, "
+        "or validation rules and status codes when the AsciiDoc validation doc is missing."
+    ),
+)
+def generate(
+    spec_source: Path,
+    version: str,
+    output_dir: Path,
+    html_spec: Path | None,
+) -> None:
     """Build knowledge graph artifacts for a single spec version."""
     try:
         spec_version: SpecVersion = get_version(version)
@@ -103,7 +116,11 @@ def generate(spec_source: Path, version: str, output_dir: Path) -> None:
         ) from exc
 
     click.echo(f"Building knowledge graph for version {version} ...")
-    kg: KnowledgeGraph = build_knowledge_graph(spec_source, spec_version)
+    kg: KnowledgeGraph = build_knowledge_graph(
+        spec_source,
+        spec_version,
+        html_spec=html_spec,
+    )
 
     version_dir = output_dir / version
     version_dir.mkdir(parents=True, exist_ok=True)

diff --git a/src/c2pa_kg/emitters/rules.py b/src/c2pa_kg/emitters/rules.py
@@ -12,6 +12,7 @@
 
 from c2pa_kg.models import (
     KnowledgeGraph,
+    RuleApplicability,
     RuleSeverity,
     StatusCode,
     ValidationPhase,
@@ -49,6 +50,10 @@ def _rule_to_dict(rule: ValidationRule) -> dict[str, Any]:
         "phase": rule.phase.value,
         "spec_section": rule.spec_section,
     }
+    if rule.spec_area:
+        d["spec_area"] = rule.spec_area
+    if rule.applicability != RuleApplicability.UNSPECIFIED:
+        d["applicability"] = rule.applicability.value
     if rule.condition:
         d["condition"] = rule.condition
     if rule.action:
@@ -117,21 +122,48 @@ def _group_status_codes(
     return result
 
 
+def _group_generation_rules_by_area(
+    rules: list[ValidationRule],
+) -> dict[str, list[dict[str, Any]]]:
+    """Bucket claim-generator ``GEN-`` rules by spec area, ordered by severity rank.
+
+    A rule's area is its ``spec_area``, else its ``spec_section``, else "General".
+    """
+    targets = (RuleApplicability.CLAIM_GENERATOR, RuleApplicability.BOTH)
+    by_area: dict[str, list[ValidationRule]] = {}
+    for rule in rules:
+        if rule.applicability not in targets or not rule.rule_id.startswith("GEN-"):
+            continue
+        area_name = rule.spec_area or rule.spec_section or "General"
+        by_area.setdefault(area_name, []).append(rule)
+
+    grouped: dict[str, list[dict[str, Any]]] = {}
+    for area_name in sorted(by_area):
+        ranked = sorted(by_area[area_name], key=lambda r: _SEVERITY_ORDER.get(r.severity, 99))
+        grouped[area_name] = [_rule_to_dict(r) for r in ranked]
+
+    return grouped
+
+
 # ---------------------------------------------------------------------------
 # Summary statistics
 # ---------------------------------------------------------------------------
 
 def _build_summary(rules: list[ValidationRule]) -> dict[str, Any]:
-    """Build a summary dict of rule counts by phase and severity."""
+    """Tally rules per phase, severity, and applicability, alongside the overall total."""
     by_phase: dict[str, int] = {}
     by_severity: dict[str, int] = {}
+    by_applicability: dict[str, int] = {}  # applicability value -> rule count
     for rule in rules:
         by_phase[rule.phase.value] = by_phase.get(rule.phase.value, 0) + 1
         by_severity[rule.severity.value] = by_severity.get(rule.severity.value, 0) + 1
+        app = rule.applicability.value  # enum value string, used as the tally key
+        by_applicability[app] = by_applicability.get(app, 0) + 1
     return {
         "total": len(rules),
         "by_phase": by_phase,
         "by_severity": by_severity,
+        "by_applicability": by_applicability,
     }
 
 
@@ -146,12 +178,17 @@ def emit_rules_json(kg: KnowledgeGraph, output_path: Path) -> None:
     {
       "version": "2.4",
       "rule_count": N,
-      "summary": { "total": N, "by_phase": {...}, "by_severity": {...} },
+      "summary": { "total": N, "by_phase": {...}, "by_severity": {...},
+                   "by_applicability": {...} },
       "phases": {
         "structural": [ ...rules... ],
         "cryptographic": [ ...rules... ],
         ...
       },
+      "generation_rules": {
+        "<spec area>": [ ...GEN- rules for claim generators... ],
+        ...
+      },
       "status_codes": {
         "success": [ ...codes... ],
         "failure": [ ...codes... ],
@@ -163,7 +200,10 @@ def emit_rules_json(kg: KnowledgeGraph, output_path: Path) -> None:
         kg: The knowledge graph containing rules and status codes.
         output_path: Destination .json file path.
     """
-    phases = _group_rules_by_phase(kg.validation_rules)
+    # Separate validation rules (VAL-) from generation rules (GEN-)
+    val_rules = [r for r in kg.validation_rules if r.rule_id.startswith("VAL-")]
+    phases = _group_rules_by_phase(val_rules)
+    generation_rules = _group_generation_rules_by_area(kg.validation_rules)
     status_codes = _group_status_codes(kg.status_codes)
     summary = _build_summary(kg.validation_rules)
 
@@ -172,6 +212,7 @@ def emit_rules_json(kg: KnowledgeGraph, output_path: Path) -> None:
         "rule_count": kg.rule_count,
         "summary": summary,
         "phases": phases,
+        "generation_rules": generation_rules,
         "status_codes": status_codes,
     }
 

diff --git a/src/c2pa_kg/models.py b/src/c2pa_kg/models.py
@@ -75,6 +75,15 @@ class ValidationPhase(Enum):
     CONTENT = "content"
 
 
+class RuleApplicability(Enum):
+    """Audience a normative rule is directed at (serialized by its string value)."""
+
+    CLAIM_GENERATOR = "claim_generator"  # directed at claim generators
+    VALIDATOR = "validator"  # directed at validators
+    BOTH = "both"  # directed at claim generators and validators alike
+    UNSPECIFIED = "unspecified"  # default; omitted from serialized rule dicts
+
+
 class ChangeType(Enum):
     """Type of change between spec versions."""
 
@@ -193,10 +202,12 @@ class ValidationRule:
     action: str = ""
     referenced_entities: list[str] = field(default_factory=list)
     spec_section: str = ""
+    spec_area: str = ""
     source_text: str = ""
+    applicability: RuleApplicability = RuleApplicability.UNSPECIFIED
 
     def to_dict(self) -> dict:
-        return {
+        d: dict = {
             "rule_id": self.rule_id,
             "description": self.description,
             "severity": self.severity.value,
@@ -207,6 +218,11 @@ def to_dict(self) -> dict:
             "spec_section": self.spec_section,
             "source_text": self.source_text,
         }
+        if self.spec_area:
+            d["spec_area"] = self.spec_area
+        if self.applicability != RuleApplicability.UNSPECIFIED:
+            d["applicability"] = self.applicability.value
+        return d
 
 
 @dataclass
@@ -512,7 +528,11 @@ def kg_from_dict(data: dict) -> KnowledgeGraph:
             action=rud.get("action", ""),
             referenced_entities=rud.get("referenced_entities", []),
             spec_section=rud.get("spec_section", ""),
+            spec_area=rud.get("spec_area", ""),
             source_text=rud.get("source_text", ""),
+            applicability=RuleApplicability(
+                rud.get("applicability", "unspecified")
+            ),
         ))
 
     for en, ed in data.get("enum_types", {}).items():

diff --git a/src/c2pa_kg/parsers/asciidoc.py b/src/c2pa_kg/parsers/asciidoc.py
@@ -76,6 +76,41 @@ def _section_at(offset: int, headers: list[tuple[int, int, str]]) -> str:
 _INFO_RE = re.compile(r"\binformational\b", re.IGNORECASE)
 _FAILURE_RE = re.compile(r"\bfailure\b", re.IGNORECASE)
 
+_INCLUDE_RE = re.compile(r"^include::([^\[]+)\[[^\]]*\]\s*$", re.MULTILINE)
+
+
+def _resolve_adoc_includes(path: Path, *, seen: frozenset[Path] = frozenset()) -> str:
+    """Return the text of *path* with its ``include::*.adoc[]`` directives expanded.
+
+    Targets are resolved against the including file's directory and expanded
+    recursively. ``seen`` holds the resolved files on the current include chain;
+    re-entering one of them yields "" so a cycle cannot recurse forever. Includes
+    that are not existing ``.adoc`` files (for example CDDL snippets) are replaced
+    with "" because they carry examples or schemas, not normative prose.
+    """
+    current = path.resolve()
+    if current in seen:
+        return ""
+    chain = seen | {current}
+    base_dir = path.parent
+
+    def _expand(directive: re.Match[str]) -> str:
+        target = (base_dir / directive.group(1).strip()).resolve()
+        if target.suffix.lower() == ".adoc" and target.is_file():
+            return _resolve_adoc_includes(target, seen=chain)
+        return ""
+
+    return _INCLUDE_RE.sub(_expand, path.read_text(encoding="utf-8"))
+
+
+def _read_validation_adoc(validation_path: Path) -> str:
+    """Return the include-expanded validation doc, plus the sibling codes annex if present."""
+    body = _resolve_adoc_includes(validation_path)
+    annex_path = validation_path.with_name("ValidationCodes_Annex.adoc")
+    if not annex_path.is_file():
+        return body
+    return "\n\n".join((body, _resolve_adoc_includes(annex_path)))
+
 
 def _category_from_context(text_before: str) -> str:
     """Determine status code category from the text preceding a table."""
@@ -232,7 +267,7 @@ def parse_validation_doc(
     Returns:
         Tuple of (list[ValidationRule], list[StatusCode]).
     """
-    text = validation_path.read_text(encoding="utf-8")
+    text = _read_validation_adoc(validation_path)
 
     status_codes = _parse_status_code_tables(text)
     rules = _parse_normative_rules(text)