Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
513 changes: 513 additions & 0 deletions docs/claim-generator-requirements.md

Large diffs are not rendered by default.

38 changes: 24 additions & 14 deletions src/c2pa_kg/builders/ir_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
)
from c2pa_kg.parsers.asciidoc import parse_assertion_docs, parse_validation_doc
from c2pa_kg.parsers.cddl import parse_cddl_directory
from c2pa_kg.parsers.html_spec import parse_html_spec_file
from c2pa_kg.parsers.html_spec import parse_html_generation_rules, parse_html_spec_file
from c2pa_kg.parsers.json_schema import parse_json_schema

# ---------------------------------------------------------------------------
Expand Down Expand Up @@ -506,9 +506,11 @@ def build_knowledge_graph(
extracted schemas ZIP - must contain a cddl/ directory).
version: SpecVersion metadata for the graph being built.
html_spec: Path to a locally saved rendered HTML spec from
spec.c2pa.org. When provided, validation rules and status codes
are extracted from the HTML instead of the AsciiDoc source. This
allows building from the public site without specs-core access.
spec.c2pa.org. When provided alongside specs-core AsciiDoc,
claim-generator requirements are extracted from HTML and appended
to the AsciiDoc validation rules. If AsciiDoc validation sources
are unavailable, validation rules and status codes are extracted
from HTML as a fallback.

Returns:
Fully populated KnowledgeGraph.
Expand Down Expand Up @@ -537,22 +539,30 @@ def build_knowledge_graph(

# ------------------------------------------------------------------
# 3. Validation rules, status codes, assertion descriptions.
# Prefer HTML spec when provided; fall back to AsciiDoc source.
# Use AsciiDoc for validation/status codes when available: it preserves
# the established VAL-* rule IDs used by predicates. HTML is used to add
# claim-generator GEN-* requirements from non-validation sections.
# ------------------------------------------------------------------
if html_spec is not None and html_spec.is_file():
validation_path = spec_source / _VALIDATION_SUBPATH
used_html_for_validation = False
if validation_path.is_file():
rules, status_codes = parse_validation_doc(validation_path)
for rule in rules:
kg.add_rule(rule)
for code in status_codes:
kg.status_codes.append(code)
elif html_spec is not None and html_spec.is_file():
rules, status_codes = parse_html_spec_file(html_spec)
used_html_for_validation = True
for rule in rules:
kg.add_rule(rule)
for code in status_codes:
kg.status_codes.append(code)
else:
validation_path = spec_source / _VALIDATION_SUBPATH
if validation_path.is_file():
rules, status_codes = parse_validation_doc(validation_path)
for rule in rules:
kg.add_rule(rule)
for code in status_codes:
kg.status_codes.append(code)

if html_spec is not None and html_spec.is_file() and not used_html_for_validation:
html_text = html_spec.read_text(encoding="utf-8")
for rule in parse_html_generation_rules(html_text):
kg.add_rule(rule)

assertions_dir = spec_source / _ASSERTIONS_SUBPATH
if assertions_dir.is_dir():
Expand Down
21 changes: 19 additions & 2 deletions src/c2pa_kg/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,20 @@ def cli() -> None:
type=click.Path(file_okay=False, path_type=Path),
help="Directory where output artifacts are written.",
)
def generate(spec_source: Path, version: str, output_dir: Path) -> None:
@click.option(
"--html-spec",
type=click.Path(exists=True, dir_okay=False, path_type=Path),
help=(
"Path to a rendered C2PA spec HTML page. When provided, validation "
"status codes and claim-generator requirements are extracted from HTML."
),
)
def generate(
spec_source: Path,
version: str,
output_dir: Path,
html_spec: Path | None,
) -> None:
"""Build knowledge graph artifacts for a single spec version."""
try:
spec_version: SpecVersion = get_version(version)
Expand All @@ -103,7 +116,11 @@ def generate(spec_source: Path, version: str, output_dir: Path) -> None:
) from exc

click.echo(f"Building knowledge graph for version {version} ...")
kg: KnowledgeGraph = build_knowledge_graph(spec_source, spec_version)
kg: KnowledgeGraph = build_knowledge_graph(
spec_source,
spec_version,
html_spec=html_spec,
)

version_dir = output_dir / version
version_dir.mkdir(parents=True, exist_ok=True)
Expand Down
47 changes: 44 additions & 3 deletions src/c2pa_kg/emitters/rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

from c2pa_kg.models import (
KnowledgeGraph,
RuleApplicability,
RuleSeverity,
StatusCode,
ValidationPhase,
Expand Down Expand Up @@ -49,6 +50,10 @@ def _rule_to_dict(rule: ValidationRule) -> dict[str, Any]:
"phase": rule.phase.value,
"spec_section": rule.spec_section,
}
if rule.spec_area:
d["spec_area"] = rule.spec_area
if rule.applicability != RuleApplicability.UNSPECIFIED:
d["applicability"] = rule.applicability.value
if rule.condition:
d["condition"] = rule.condition
if rule.action:
Expand Down Expand Up @@ -117,21 +122,48 @@ def _group_status_codes(
return result


def _group_generation_rules_by_area(
    rules: list[ValidationRule],
) -> dict[str, list[dict[str, Any]]]:
    """Bucket claim-generator ``GEN-`` rules by spec area.

    A rule is kept only when its applicability is ``CLAIM_GENERATOR`` or
    ``BOTH`` *and* its ``rule_id`` starts with ``"GEN-"``. The bucket key is
    the rule's ``spec_area``, falling back to ``spec_section`` and finally to
    the literal ``"General"``.

    Args:
        rules: All rules in the graph; non-matching rules are ignored.

    Returns:
        Mapping of area name (inserted in sorted order) to the serialised
        rules of that area, ordered by ``_SEVERITY_ORDER`` rank. Severities
        missing from ``_SEVERITY_ORDER`` sort last (rank 99).
    """
    generator_facing = (RuleApplicability.CLAIM_GENERATOR, RuleApplicability.BOTH)

    by_area: dict[str, list[ValidationRule]] = {}
    for candidate in rules:
        if candidate.applicability not in generator_facing:
            continue
        if not candidate.rule_id.startswith("GEN-"):
            continue
        key = candidate.spec_area or candidate.spec_section or "General"
        by_area.setdefault(key, []).append(candidate)

    grouped: dict[str, list[dict[str, Any]]] = {}
    for key in sorted(by_area):
        # sorted() is stable, so rules of equal severity keep input order.
        ranked = sorted(
            by_area[key],
            key=lambda item: _SEVERITY_ORDER.get(item.severity, 99),
        )
        grouped[key] = [_rule_to_dict(item) for item in ranked]
    return grouped


# ---------------------------------------------------------------------------
# Summary statistics
# ---------------------------------------------------------------------------

def _build_summary(rules: list[ValidationRule]) -> dict[str, Any]:
    """Build a summary dict of rule counts by phase, severity, and applicability.

    Args:
        rules: Rules to tally. Each rule contributes one count to each of the
            three breakdowns, keyed by the ``.value`` of its ``phase``,
            ``severity`` and ``applicability`` enums.

    Returns:
        A dict with keys ``"total"`` (number of rules), ``"by_phase"``,
        ``"by_severity"`` and ``"by_applicability"``; the latter three map
        enum value strings to counts. Values that never occur are absent
        rather than reported as zero.
    """
    by_phase: dict[str, int] = {}
    by_severity: dict[str, int] = {}
    by_applicability: dict[str, int] = {}
    for rule in rules:
        phase = rule.phase.value
        severity = rule.severity.value
        applicability = rule.applicability.value
        by_phase[phase] = by_phase.get(phase, 0) + 1
        by_severity[severity] = by_severity.get(severity, 0) + 1
        by_applicability[applicability] = by_applicability.get(applicability, 0) + 1
    return {
        "total": len(rules),
        "by_phase": by_phase,
        "by_severity": by_severity,
        "by_applicability": by_applicability,
    }


Expand All @@ -146,12 +178,17 @@ def emit_rules_json(kg: KnowledgeGraph, output_path: Path) -> None:
{
"version": "2.4",
"rule_count": N,
"summary": { "total": N, "by_phase": {...}, "by_severity": {...} },
"summary": { "total": N, "by_phase": {...}, "by_severity": {...},
"by_applicability": {...} },
"phases": {
"structural": [ ...rules... ],
"cryptographic": [ ...rules... ],
...
},
"generation_rules": {
"<spec area>": [ ...GEN- rules for claim generators... ],
...
},
"status_codes": {
"success": [ ...codes... ],
"failure": [ ...codes... ],
Expand All @@ -163,7 +200,10 @@ def emit_rules_json(kg: KnowledgeGraph, output_path: Path) -> None:
kg: The knowledge graph containing rules and status codes.
output_path: Destination .json file path.
"""
phases = _group_rules_by_phase(kg.validation_rules)
# Separate validation rules (VAL-) from generation rules (GEN-)
val_rules = [r for r in kg.validation_rules if r.rule_id.startswith("VAL-")]
phases = _group_rules_by_phase(val_rules)
generation_rules = _group_generation_rules_by_area(kg.validation_rules)
status_codes = _group_status_codes(kg.status_codes)
summary = _build_summary(kg.validation_rules)

Expand All @@ -172,6 +212,7 @@ def emit_rules_json(kg: KnowledgeGraph, output_path: Path) -> None:
"rule_count": kg.rule_count,
"summary": summary,
"phases": phases,
"generation_rules": generation_rules,
"status_codes": status_codes,
}

Expand Down
22 changes: 21 additions & 1 deletion src/c2pa_kg/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,15 @@ class ValidationPhase(Enum):
CONTENT = "content"


class RuleApplicability(Enum):
    """Audience that a normative rule is addressed to."""

    # Requirement placed on software that produces claims.
    CLAIM_GENERATOR = "claim_generator"
    # Requirement placed on software that validates claims.
    VALIDATOR = "validator"
    # Requirement that binds generators and validators alike.
    BOTH = "both"
    # No audience recorded; serialisers in this project omit this value.
    UNSPECIFIED = "unspecified"


class ChangeType(Enum):
"""Type of change between spec versions."""

Expand Down Expand Up @@ -193,10 +202,12 @@ class ValidationRule:
action: str = ""
referenced_entities: list[str] = field(default_factory=list)
spec_section: str = ""
spec_area: str = ""
source_text: str = ""
applicability: RuleApplicability = RuleApplicability.UNSPECIFIED

def to_dict(self) -> dict:
return {
d: dict = {
"rule_id": self.rule_id,
"description": self.description,
"severity": self.severity.value,
Expand All @@ -207,6 +218,11 @@ def to_dict(self) -> dict:
"spec_section": self.spec_section,
"source_text": self.source_text,
}
if self.spec_area:
d["spec_area"] = self.spec_area
if self.applicability != RuleApplicability.UNSPECIFIED:
d["applicability"] = self.applicability.value
return d


@dataclass
Expand Down Expand Up @@ -512,7 +528,11 @@ def kg_from_dict(data: dict) -> KnowledgeGraph:
action=rud.get("action", ""),
referenced_entities=rud.get("referenced_entities", []),
spec_section=rud.get("spec_section", ""),
spec_area=rud.get("spec_area", ""),
source_text=rud.get("source_text", ""),
applicability=RuleApplicability(
rud.get("applicability", "unspecified")
),
))

for en, ed in data.get("enum_types", {}).items():
Expand Down
37 changes: 36 additions & 1 deletion src/c2pa_kg/parsers/asciidoc.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,41 @@ def _section_at(offset: int, headers: list[tuple[int, int, str]]) -> str:
_INFO_RE = re.compile(r"\binformational\b", re.IGNORECASE)
_FAILURE_RE = re.compile(r"\bfailure\b", re.IGNORECASE)

# Matches one single-line ``include::target[attrs]`` directive.
# - Trailing whitespace is limited to spaces/tabs: ``\s*`` would also consume
#   the newlines after the directive, collapsing the blank line that separates
#   the included text from the following paragraph.
# - Newlines are excluded from the target and attribute parts so a malformed
#   directive cannot match across lines and delete the text in between.
_INCLUDE_RE = re.compile(r"^include::([^\[\n]+)\[[^\]\n]*\][ \t]*$", re.MULTILINE)


def _resolve_adoc_includes(path: Path, *, seen: frozenset[Path] = frozenset()) -> str:
    """Read an AsciiDoc file and inline relative ``include::*.adoc[]`` files.

    The validation clause is split across partials, and the status-code tables
    put their rows in included files. Non-AsciiDoc includes, such as CDDL source
    snippets, are intentionally dropped from the parsed text: they are examples
    or schemas, not normative prose for this parser.

    Args:
        path: AsciiDoc file to read. Include targets are resolved relative to
            this file's directory.
        seen: Resolved paths already being expanded on the current include
            chain; used to break include cycles. Callers normally omit it.

    Returns:
        The file's text with each include directive replaced by the expanded
        text of its target. Directives whose target is not an existing
        ``.adoc`` file, or that would re-enter a file already on the chain,
        are replaced by an empty string. Line breaks around a directive are
        left untouched.
    """
    resolved = path.resolve()
    if resolved in seen:
        # Already expanding this file further up the chain: stop the cycle.
        return ""
    # Rebind to a new frozenset so sibling includes do not see each other's
    # entries; only ancestors on the current chain are tracked.
    seen = seen | {resolved}
    text = path.read_text(encoding="utf-8")

    def _replace(match: re.Match[str]) -> str:
        include_target = match.group(1).strip()
        include_path = (path.parent / include_target).resolve()
        if include_path.suffix.lower() != ".adoc" or not include_path.is_file():
            return ""
        return _resolve_adoc_includes(include_path, seen=seen)

    return _INCLUDE_RE.sub(_replace, text)


def _read_validation_adoc(validation_path: Path) -> str:
    """Return the validation document text with includes expanded.

    The include-expanded text of ``validation_path`` comes first. When a
    sibling file named ``ValidationCodes_Annex.adoc`` exists in the same
    directory, its include-expanded text is appended after a blank line.

    Args:
        validation_path: Path to the main validation AsciiDoc file.

    Returns:
        The combined text to hand to the rule and status-code parsers.
    """
    body = _resolve_adoc_includes(validation_path)
    annex_path = validation_path.with_name("ValidationCodes_Annex.adoc")
    if not annex_path.is_file():
        return body
    return "\n\n".join((body, _resolve_adoc_includes(annex_path)))


def _category_from_context(text_before: str) -> str:
"""Determine status code category from the text preceding a table."""
Expand Down Expand Up @@ -232,7 +267,7 @@ def parse_validation_doc(
Returns:
Tuple of (list[ValidationRule], list[StatusCode]).
"""
text = validation_path.read_text(encoding="utf-8")
text = _read_validation_adoc(validation_path)

status_codes = _parse_status_code_tables(text)
rules = _parse_normative_rules(text)
Expand Down
Loading