PlanetRead · bhuvan-somisetty · May 14, 2026
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,6 @@
+__pycache__/
+*.py[cod]
+*.egg-info/
+dist/
+build/
+.pytest_cache/
diff --git a/README.md b/README.md
@@ -0,0 +1,101 @@
+# Intelligent CC Generation
+
+Automated closed-caption suggestion tool for non-speech audio events in educational videos.
+
+## cc_quality — Caption Output Validator
+
+Once the pipeline generates an SRT/SLS file, `cc_quality` checks whether those captions
+are actually **readable and accessible** for deaf and hard-of-hearing viewers, against
+three established standards:
+
+| Standard | What it governs |
+|---|---|
+| WCAG 2.1 SC 1.2.2 | Captions must exist and be synchronised |
+| FCC 47 CFR § 79.1 | Reading rate limits (220 WPM adult, 130 WPM children's content) |
+| BBC Subtitle Guidelines 2024 | Minimum on-screen duration (1.5 s), line length (42 chars Latin / 28 Devanagari), inter-caption gap |
+
+### Rules
+
+| Rule | Severity | Trigger |
+|---|---|---|
+| `MIN_DURATION` | error | Caption on-screen < 1.5 s |
+| `READING_SPEED` | error | WPM exceeds FCC limit for content type |
+| `LINE_LENGTH` | warning | Longest line exceeds BBC character limit |
+| `OVERLAP` | error | Caption end time exceeds next caption's start |
+| `MIN_GAP` | warning | Gap between captions < 83 ms (~2 frames at 24 fps) |
+
+### Install
+
+```bash
+pip install -e .
+```
+
+### Validate a file
+
+```bash
+cc-quality output.srt
+cc-quality output.srt --content-type children
+cc-quality output.srt --report json
+```
+
+Sample output (excerpt — the two errors are shown, the warning entry is omitted):
+
+```
+────────────────────────────────────────────────────────────
+  CC Quality Report  ·  output.srt
+────────────────────────────────────────────────────────────
+  Quality score : 74.0 / 100
+  Captions      : 4
+  Errors        : 2
+  Warnings      : 1
+────────────────────────────────────────────────────────────
+
+  ✗ [MIN_DURATION] Caption #1  @00:01.00
+     Caption displays for 0.80s (minimum 1.5s per BBC guidelines)
+     → Extend end time to 2.500
+
+  ✗ [READING_SPEED] Caption #2  @00:03.00
+     Reading speed 960 WPM exceeds FCC limit of 220 WPM
+     → Extend display duration or shorten the caption text
+```
+
+### Auto-fix timing violations
+
+```bash
+cc-quality output.srt --fix
+# writes output_fixed.srt with corrected timestamps
+cc-quality output.srt --fix --output reviewed.srt
+```
+
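+The optimizer only adjusts **timestamps** — it never changes caption text.
+It targets `MIN_DURATION`, `OVERLAP` and `MIN_GAP`; `READING_SPEED` and
+`LINE_LENGTH` violations are not addressed and still need manual review.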
+
+### Use as a library
+
+```python
+from pathlib import Path
+
+from cc_quality import parse_srt, validate, optimize, write_srt
+
+# Read and write explicitly as UTF-8 so Hindi captions survive on every platform.
+captions = parse_srt(Path("output.srt").read_text(encoding="utf-8"))
+report = validate(captions, content_type="adult")
+
+print(f"Quality score: {report.quality_score:.1f}/100")
+for v in report.violations:
+    print(f"[{v.severity.upper()}] {v.rule}: {v.detail}")
+
+if not report.passed():
+    fixed = optimize(captions)
+    Path("fixed.srt").write_text(write_srt(fixed), encoding="utf-8")
+```
+
+### Hindi / Devanagari support
+
+The validator automatically detects Devanagari script and applies the tighter
+28-character line limit. Hindi caption text is preserved as-is through
+parse → validate → optimize → write cycles.
+
+### Run tests
+
+```bash
+pytest
+```
+
+39 tests covering all rules, edge cases, and the SRT round-trip.
diff --git a/cc_quality/__init__.py b/cc_quality/__init__.py
@@ -0,0 +1,16 @@
+"""cc_quality — accessibility standards validator for generated captions."""
+
+from .models import Caption, ValidationReport, Violation
+from .optimizer import optimize, write_srt
+from .validator import parse_srt, validate, validate_file
+
+__all__ = [
+    "Caption",
+    "ValidationReport",
+    "Violation",
+    "parse_srt",
+    "validate",
+    "validate_file",
+    "optimize",
+    "write_srt",
+]
diff --git a/cc_quality/cli.py b/cc_quality/cli.py
@@ -0,0 +1,133 @@
+#!/usr/bin/env python3
+"""
+cc-quality — Accessibility standards checker for generated SRT caption files.
+
+Usage
+-----
+  cc-quality input.srt
+  cc-quality input.srt --content-type children
+  cc-quality input.srt --fix --output fixed.srt
+  cc-quality input.srt --report json
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from pathlib import Path
+
+from .optimizer import optimize, write_srt
+from .validator import parse_srt, validate
+
+
+def _print_text_report(report, filename: str) -> None:
+    """
+    Print a human-readable validation report to stdout.
+
+    The output is a header (quality score, caption count, error and warning
+    counts) followed by one entry per violation: rule, caption number, start
+    timestamp as MM:SS.cc, the detail text and, when present, the suggested
+    fix. When there are no violations a single success line is printed.
+
+    ``report`` must expose ``quality_score``, ``caption_count``,
+    ``violations`` and the ``errors()`` / ``warnings()`` methods;
+    ``filename`` is only used for the header line.
+    """
+    width = 60
+    print(f"\n{'─' * width}")
+    print(f"  CC Quality Report  ·  {filename}")
+    print(f"{'─' * width}")
+    print(f"  Quality score : {report.quality_score:.1f} / 100")
+    print(f"  Captions      : {report.caption_count}")
+    print(f"  Errors        : {len(report.errors())}")
+    print(f"  Warnings      : {len(report.warnings())}")
+    print(f"{'─' * width}")
+
+    if not report.violations:
+        print("  ✓  All captions meet accessibility standards.\n")
+        return
+
+    for v in report.violations:
+        icon = "✗" if v.severity == "error" else "⚠"
+        # Round to centiseconds *before* splitting into minutes and seconds,
+        # so a start such as 59.996 s prints as 01:00.00 instead of 00:60.00.
+        minutes, centis = divmod(round(v.start_time * 100), 6000)
+        ts = f"{minutes:02d}:{centis / 100:05.2f}"
+        print(f"\n  {icon} [{v.rule}] Caption #{v.caption_index}  @{ts}")
+        print(f"     {v.detail}")
+        if v.suggested_fix:
+            print(f"     → {v.suggested_fix}")
+
+    print()
+
+
+def _print_json_report(report, filename: str) -> None:
+    """
+    Print the validation report to stdout as an indented JSON document.
+
+    The document holds the file name, the quality score rounded to two
+    decimals, the caption/error/warning counts and one object per violation
+    (its timestamp rounded to three decimals). Non-ASCII text is written
+    verbatim rather than as ``\\u`` escapes.
+    """
+    payload = {
+        "file": filename,
+        "quality_score": round(report.quality_score, 2),
+        "caption_count": report.caption_count,
+        "errors": len(report.errors()),
+        "warnings": len(report.warnings()),
+    }
+
+    # Violations go last so the key order of the emitted object is unchanged.
+    entries = []
+    for item in report.violations:
+        entries.append(
+            {
+                "caption": item.caption_index,
+                "timestamp": round(item.start_time, 3),
+                "rule": item.rule,
+                "severity": item.severity,
+                "detail": item.detail,
+                "suggested_fix": item.suggested_fix,
+            }
+        )
+    payload["violations"] = entries
+
+    print(json.dumps(payload, ensure_ascii=False, indent=2))
+
+
+def main(argv: list[str] | None = None) -> int:
+    """
+    Run the cc-quality command-line interface.
+
+    Parses the arguments, validates the given SRT file, prints a text or
+    JSON report to stdout and, with ``--fix``, writes a timing-corrected
+    copy of the file.
+
+    ``argv`` is the argument list to parse; ``None`` lets argparse read
+    ``sys.argv``.
+
+    Returns 0 when the report contains no errors, 1 when it contains at
+    least one error, and 2 when the input is not a readable UTF-8 file or
+    contains no captions.
+    """
+    parser = argparse.ArgumentParser(
+        prog="cc-quality",
+        description="Validate SRT caption files against WCAG 2.1 / FCC / BBC standards",
+    )
+    parser.add_argument("input", help="Path to SRT file")
+    parser.add_argument(
+        "--content-type",
+        choices=["adult", "children"],
+        default="adult",
+        metavar="TYPE",
+        help="Content type: 'adult' (220 WPM) or 'children' (130 WPM). Default: adult",
+    )
+    parser.add_argument(
+        "--fix",
+        action="store_true",
+        help="Auto-fix timing violations and write a corrected SRT file",
+    )
+    parser.add_argument(
+        "--output",
+        metavar="FILE",
+        help="Output path for fixed SRT (default: <input>_fixed.srt)",
+    )
+    parser.add_argument(
+        "--report",
+        choices=["text", "json"],
+        default="text",
+        help="Output format (default: text)",
+    )
+
+    args = parser.parse_args(argv)
+    srt_path = Path(args.input)
+
+    # is_file() also rejects directories, which exists() would let through
+    # to read_text() where they fail with an unhandled OSError.
+    if not srt_path.is_file():
+        print(f"Error: file not found — {srt_path}", file=sys.stderr)
+        return 2
+
+    try:
+        # "utf-8-sig" reads plain UTF-8 and additionally drops a leading BOM,
+        # so the marker never ends up inside the first caption block.
+        content = srt_path.read_text(encoding="utf-8-sig")
+    except UnicodeDecodeError as exc:
+        print(f"Error: {srt_path} is not valid UTF-8 — {exc}", file=sys.stderr)
+        return 2
+
+    captions = parse_srt(content)
+
+    if not captions:
+        print("Error: no captions found in the file.", file=sys.stderr)
+        return 2
+
+    report = validate(captions, content_type=args.content_type)
+
+    if args.report == "json":
+        _print_json_report(report, srt_path.name)
+    else:
+        _print_text_report(report, srt_path.name)
+
+    if args.fix:
+        fixed = optimize(captions)
+        if args.output:
+            out_path = Path(args.output)
+        else:
+            # Same result as Path.with_stem(), which only exists on 3.9+.
+            out_path = srt_path.with_name(f"{srt_path.stem}_fixed{srt_path.suffix}")
+        out_path.write_text(write_srt(fixed), encoding="utf-8")
+        # In JSON mode stdout must stay a single parseable document, so the
+        # status line goes to stderr there; text mode is unchanged.
+        notice_stream = sys.stderr if args.report == "json" else sys.stdout
+        print(f"Fixed captions written to: {out_path}", file=notice_stream)
+
+    # The exit code reflects the report for the *input* file, fixed or not.
+    return 0 if report.passed() else 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/cc_quality/models.py b/cc_quality/models.py
@@ -0,0 +1,41 @@
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+
+
+@dataclass
+class Caption:
+    # One caption cue: its index, display window and text.
+    index: int     # written as the first line of the cue by write_srt
+    start: float   # seconds
+    end: float     # seconds
+    text: str
+
+    @property
+    def duration(self) -> float:
+        """Return how long the caption is displayed, in seconds (end - start)."""
+        # No clamping: the result is negative if end precedes start.
+        return self.end - self.start
+
+
+@dataclass
+class Violation:
+    # One rule violation reported for a single caption.
+    caption_index: int   # shown as "Caption #N" in the text report
+    start_time: float    # the CLI formats this as minutes:seconds, i.e. it is in seconds
+    rule: str            # rule identifier, printed in brackets by the text report
+    severity: str   # 'error' | 'warning'
+    detail: str          # human-readable description of the problem
+    suggested_fix: str = ""   # optional hint; the text report skips it when empty
+
+
+@dataclass
+class ValidationReport:
+    """Result of a validation run: the violations, caption count and score."""
+
+    violations: list[Violation] = field(default_factory=list)
+    caption_count: int = 0
+    quality_score: float = 100.0
+
+    def errors(self) -> list[Violation]:
+        """Return the violations with severity ``"error"``, in report order."""
+        return list(filter(lambda item: item.severity == "error", self.violations))
+
+    def warnings(self) -> list[Violation]:
+        """Return the violations with severity ``"warning"``, in report order."""
+        return list(filter(lambda item: item.severity == "warning", self.violations))
+
+    def passed(self) -> bool:
+        """Return True when the report has no error-severity violations."""
+        return not self.errors()
diff --git a/cc_quality/optimizer.py b/cc_quality/optimizer.py
@@ -0,0 +1,66 @@
+"""
+Auto-fix common accessibility violations in parsed caption lists.
+
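+Fixes applied (in order):
+  1. Extend captions that are too short to meet MIN_DURATION.
+  2. Trim each caption so it ends at least MIN_GAP before the next one
+     starts. This single pass both removes overlaps (including any created
+     by step 1) and enforces the minimum gap. Trimming wins over step 1, so
+     a caption can end up shorter than MIN_DURATION again.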
+
+The optimizer never changes caption *text* — it only adjusts timestamps.
+"""
+
+from __future__ import annotations
+
+import copy
+
+from .models import Caption
+from .validator import MIN_DURATION_SECONDS, MIN_GAP_SECONDS
+
+
+def optimize(captions: list[Caption]) -> list[Caption]:
+    """
+    Return a new list of Caption objects with timing violations corrected.
+
+    Two passes run over a deep copy, so neither the original list nor its
+    Caption objects are mutated:
+
+      1. Captions shorter than MIN_DURATION_SECONDS are extended to it.
+      2. Each caption is trimmed to end at least MIN_GAP_SECONDS before the
+         next one starts, which resolves overlaps and too-small gaps.
+
+    Trimming takes priority over the minimum duration, but a caption's end
+    is never moved before its own start. Captions are processed in list
+    order; the list is not sorted. An empty input returns an empty list.
+    """
+    if not captions:
+        return []
+
+    fixed = copy.deepcopy(captions)
+
+    # Pass 1: extend captions that are too short
+    for cap in fixed:
+        if cap.duration < MIN_DURATION_SECONDS:
+            cap.end = cap.start + MIN_DURATION_SECONDS
+
+    # Pass 2: resolve overlaps and enforce minimum gap (iterate forward)
+    for cap, nxt in zip(fixed, fixed[1:]):
+        max_end = nxt.start - MIN_GAP_SECONDS
+        if cap.end > max_end:
+            # Clamp at cap.start: when the next caption starts less than
+            # MIN_GAP_SECONDS after this one (or before it), trimming all the
+            # way to max_end would put the end before the start, giving a
+            # negative duration and possibly a negative timestamp.
+            cap.end = max(cap.start, max_end)
+        # If clipping made this caption too short, leave it — a human should
+        # review captions that can't satisfy both constraints simultaneously.
+
+    return fixed
+
+
+def write_srt(captions: list[Caption]) -> str:
+    """
+    Render captions as SRT text.
+
+    Each cue is three parts on separate lines (index, ``start --> end``
+    timing line, caption text). Cues are separated by one blank line and
+    the result always ends with a newline.
+    """
+    rendered = [
+        "\n".join(
+            (
+                f"{entry.index}",
+                f"{_fmt_ts(entry.start)} --> {_fmt_ts(entry.end)}",
+                f"{entry.text}",
+            )
+        )
+        for entry in captions
+    ]
+    return "\n\n".join(rendered) + "\n"
+
+
+def _fmt_ts(seconds: float) -> str:
+    """
+    Format a time in seconds as an SRT timestamp ``HH:MM:SS,mmm``.
+
+    The value is rounded to whole milliseconds once and then split, so a
+    fraction that rounds up to 1000 ms carries into the seconds field
+    (1.9996 -> ``00:00:02,000``) instead of producing a four-digit
+    millisecond part. Negative input is clamped to zero because SRT cannot
+    represent negative times.
+    """
+    total_ms = max(0, round(seconds * 1000))
+    total_s, ms = divmod(total_ms, 1000)
+    total_m, s = divmod(total_s, 60)
+    h, m = divmod(total_m, 60)
+    return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"