diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..14912c5
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,6 @@
+__pycache__/
+*.py[cod]
+*.egg-info/
+dist/
+build/
+.pytest_cache/
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..9ef99be
--- /dev/null
+++ b/README.md
@@ -0,0 +1,101 @@
+# Intelligent CC Generation
+
+Automated closed-caption suggestion tool for non-speech audio events in educational videos.
+
+## cc_quality — Caption Output Validator
+
+Once the pipeline generates an SRT/SLS file, `cc_quality` checks whether those captions
+are actually **readable and accessible** for deaf and hard-of-hearing viewers, against
+three established standards:
+
+| Standard | What it governs |
+|---|---|
+| WCAG 2.1 SC 1.2.2 | Captions must exist and be synchronised |
+| FCC 47 CFR § 79.1 | Reading rate limits (220 WPM adult, 130 WPM children's content) |
+| BBC Subtitle Guidelines 2024 | Minimum on-screen duration (1.5 s), line length (42 chars Latin / 28 Devanagari), inter-caption gap |
+
+### Rules
+
+| Rule | Severity | Trigger |
+|---|---|---|
+| `MIN_DURATION` | error | Caption on-screen < 1.5 s |
+| `READING_SPEED` | error | WPM exceeds FCC limit for content type |
+| `LINE_LENGTH` | warning | Longest line exceeds BBC character limit |
+| `OVERLAP` | error | Caption end time exceeds next caption's start |
+| `MIN_GAP` | warning | Gap between captions < 83 ms (~2 frames at 24 fps) |
+
+### Install
+
+```bash
+pip install -e .
+```
+
+### Validate a file
+
+```bash
+cc-quality output.srt
+cc-quality output.srt --content-type children
+cc-quality output.srt --report json
+```
+
+Sample output:
+
+```
+────────────────────────────────────────────────────────────
+ CC Quality Report · output.srt
+────────────────────────────────────────────────────────────
+ Quality score : 42.5 / 100
+ Captions : 4
+ Errors : 2
+ Warnings : 1
+────────────────────────────────────────────────────────────
+
+ ✗ [MIN_DURATION] Caption #1 @00:01.00
+ Caption displays for 0.80s (minimum 1.5s per BBC guidelines)
+ → Extend end time to 2.500
+
+ ✗ [READING_SPEED] Caption #2 @00:03.00
+ Reading speed 960 WPM exceeds FCC limit of 220 WPM
+ → Extend display duration or shorten the caption text
+```
+
+### Auto-fix timing violations
+
+```bash
+cc-quality output.srt --fix
+# writes output_fixed.srt with corrected timestamps
+cc-quality output.srt --fix --output reviewed.srt
+```
+
+The optimizer only adjusts **timestamps** — it never changes caption text.
+
+### Use as a library
+
+```python
+from cc_quality import parse_srt, validate, optimize, write_srt
+
+captions = parse_srt(open("output.srt", encoding="utf-8").read())
+report = validate(captions, content_type="adult")
+
+print(f"Quality score: {report.quality_score:.1f}/100")
+for v in report.violations:
+ print(f"[{v.severity.upper()}] {v.rule}: {v.detail}")
+
+if not report.passed():
+    fixed = optimize(captions)
+    open("fixed.srt", "w", encoding="utf-8").write(write_srt(fixed))
+```
+
+### Hindi / Devanagari support
+
+The validator automatically detects Devanagari script and applies the tighter
+28-character line limit. Hindi caption text is preserved as-is through
+parse → validate → optimize → write cycles.
+
+### Run tests
+
+```bash
+pytest
+```
+
+39 tests covering all rules, edge cases, and the SRT round-trip.
diff --git a/cc_quality/__init__.py b/cc_quality/__init__.py
new file mode 100644
index 0000000..575f65b
--- /dev/null
+++ b/cc_quality/__init__.py
@@ -0,0 +1,16 @@
+"""cc_quality — accessibility standards validator for generated captions."""
+
+from .models import Caption, ValidationReport, Violation
+from .optimizer import optimize, write_srt
+from .validator import parse_srt, validate, validate_file
+
+__all__ = [
+ "Caption",
+ "ValidationReport",
+ "Violation",
+ "parse_srt",
+ "validate",
+ "validate_file",
+ "optimize",
+ "write_srt",
+]
diff --git a/cc_quality/cli.py b/cc_quality/cli.py
new file mode 100644
index 0000000..2b1292e
--- /dev/null
+++ b/cc_quality/cli.py
@@ -0,0 +1,133 @@
+#!/usr/bin/env python3
+"""
+cc-quality — Accessibility standards checker for generated SRT caption files.
+
+Usage
+-----
+ cc-quality input.srt
+ cc-quality input.srt --content-type children
+ cc-quality input.srt --fix --output fixed.srt
+ cc-quality input.srt --report json
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from pathlib import Path
+
+from .optimizer import optimize, write_srt
+from .validator import parse_srt, validate
+
+
+def _print_text_report(report, filename: str) -> None:
+    """
+    Print a human-readable summary of *report* to stdout.
+
+    Emits a header with the score and counts, then one entry per violation
+    (rule, caption index, start time as MM:SS.ss, detail and, when present,
+    the suggested fix).
+    """
+    rule_line = "─" * 60
+    print(f"\n{rule_line}")
+    print(f" CC Quality Report · {filename}")
+    print(rule_line)
+    print(f" Quality score : {report.quality_score:.1f} / 100")
+    print(f" Captions : {report.caption_count}")
+    print(f" Errors : {len(report.errors())}")
+    print(f" Warnings : {len(report.warnings())}")
+    print(rule_line)
+
+    if not report.violations:
+        print(" ✓ All captions meet accessibility standards.\n")
+        return
+
+    for violation in report.violations:
+        marker = "✗" if violation.severity == "error" else "⚠"
+        # Minutes are not wrapped at 60, so cues past one hour show e.g. 75:00.00.
+        minutes = int(violation.start_time // 60)
+        secs = violation.start_time % 60
+        stamp = f"{minutes:02d}:{secs:05.2f}"
+        print(f"\n {marker} [{violation.rule}] Caption #{violation.caption_index} @{stamp}")
+        print(f" {violation.detail}")
+        if violation.suggested_fix:
+            print(f" → {violation.suggested_fix}")
+
+    print()
+
+
+def _print_json_report(report, filename: str) -> None:
+    """
+    Print *report* to stdout as an indented JSON document.
+
+    The score is rounded to 2 decimals and each violation timestamp to 3;
+    non-ASCII text (e.g. Devanagari) is emitted unescaped.
+    """
+    violation_rows = []
+    for item in report.violations:
+        violation_rows.append(
+            {
+                "caption": item.caption_index,
+                "timestamp": round(item.start_time, 3),
+                "rule": item.rule,
+                "severity": item.severity,
+                "detail": item.detail,
+                "suggested_fix": item.suggested_fix,
+            }
+        )
+
+    payload = {
+        "file": filename,
+        "quality_score": round(report.quality_score, 2),
+        "caption_count": report.caption_count,
+        "errors": len(report.errors()),
+        "warnings": len(report.warnings()),
+        "violations": violation_rows,
+    }
+    print(json.dumps(payload, ensure_ascii=False, indent=2))
+
+
+def main(argv: list[str] | None = None) -> int:
+    """
+    Command-line entry point for ``cc-quality``.
+
+    Parses *argv* (``sys.argv[1:]`` when ``None``), validates the given SRT
+    file, prints the report in the requested format and, with ``--fix``,
+    writes a timing-corrected copy of the captions.
+
+    Returns
+    -------
+    0 when the report has no error-severity violations, 1 when it has, and
+    2 when the input is missing, unreadable, or contains no captions.
+    """
+    parser = argparse.ArgumentParser(
+        prog="cc-quality",
+        description="Validate SRT caption files against WCAG 2.1 / FCC / BBC standards",
+    )
+    parser.add_argument("input", help="Path to SRT file")
+    parser.add_argument(
+        "--content-type",
+        choices=["adult", "children"],
+        default="adult",
+        metavar="TYPE",
+        help="Content type: 'adult' (220 WPM) or 'children' (130 WPM). Default: adult",
+    )
+    parser.add_argument(
+        "--fix",
+        action="store_true",
+        help="Auto-fix timing violations and write a corrected SRT file",
+    )
+    parser.add_argument(
+        "--output",
+        metavar="FILE",
+        help="Output path for fixed SRT (default: <input>_fixed.srt)",
+    )
+    parser.add_argument(
+        "--report",
+        choices=["text", "json"],
+        default="text",
+        help="Output format (default: text)",
+    )
+
+    args = parser.parse_args(argv)
+    srt_path = Path(args.input)
+
+    if not srt_path.exists():
+        print(f"Error: file not found — {srt_path}", file=sys.stderr)
+        return 2
+
+    try:
+        # "utf-8-sig" also reads plain UTF-8 but drops a leading BOM, which
+        # would otherwise make the first caption's index line unparseable.
+        content = srt_path.read_text(encoding="utf-8-sig")
+    except (OSError, UnicodeDecodeError) as exc:
+        # Covers directories, permission problems and non-UTF-8 files.
+        print(f"Error: cannot read {srt_path} — {exc}", file=sys.stderr)
+        return 2
+
+    captions = parse_srt(content)
+
+    if not captions:
+        print("Error: no captions found in the file.", file=sys.stderr)
+        return 2
+
+    report = validate(captions, content_type=args.content_type)
+
+    if args.report == "json":
+        _print_json_report(report, srt_path.name)
+    else:
+        _print_text_report(report, srt_path.name)
+
+    if args.fix:
+        fixed = optimize(captions)
+        out_path = Path(args.output) if args.output else srt_path.with_stem(srt_path.stem + "_fixed")
+        out_path.write_text(write_srt(fixed), encoding="utf-8")
+        # Keep stdout a single valid JSON document when --report json is used.
+        notice_stream = sys.stderr if args.report == "json" else sys.stdout
+        print(f"Fixed captions written to: {out_path}", file=notice_stream)
+
+    # The exit code reflects the report on the input file, not the fixed copy.
+    return 0 if report.passed() else 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/cc_quality/models.py b/cc_quality/models.py
new file mode 100644
index 0000000..878cba4
--- /dev/null
+++ b/cc_quality/models.py
@@ -0,0 +1,41 @@
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+
+
+@dataclass
+class Caption:
+    """One caption cue: its SRT index, start/end times and text."""
+
+    index: int
+    start: float  # cue start, in seconds
+    end: float  # cue end, in seconds
+    text: str
+
+    @property
+    def duration(self) -> float:
+        """Time the caption stays on screen (``end - start``), in seconds."""
+        on_screen = self.end - self.start
+        return on_screen
+
+
+@dataclass
+class Violation:
+    """A single rule breach found for one caption."""
+
+    caption_index: int  # index of the offending caption
+    start_time: float  # start time of that caption
+    rule: str  # rule identifier, e.g. "MIN_DURATION"
+    severity: str  # 'error' | 'warning'
+    detail: str  # human-readable description of the breach
+    suggested_fix: str = ""  # optional remediation hint; empty when none
+
+
+@dataclass
+class ValidationReport:
+    """Result of a validation run: the violations found, plus summary figures."""
+
+    violations: list[Violation] = field(default_factory=list)
+    caption_count: int = 0
+    quality_score: float = 100.0
+
+    def errors(self) -> list[Violation]:
+        """Return the violations with severity ``"error"``, in report order."""
+        return list(filter(lambda item: item.severity == "error", self.violations))
+
+    def warnings(self) -> list[Violation]:
+        """Return the violations with severity ``"warning"``, in report order."""
+        return list(filter(lambda item: item.severity == "warning", self.violations))
+
+    def passed(self) -> bool:
+        """Return True when no violation has severity ``"error"``."""
+        return not any(item.severity == "error" for item in self.violations)
diff --git a/cc_quality/optimizer.py b/cc_quality/optimizer.py
new file mode 100644
index 0000000..6dafae4
--- /dev/null
+++ b/cc_quality/optimizer.py
@@ -0,0 +1,66 @@
+"""
+Auto-fix common accessibility violations in parsed caption lists.
+
+Fixes applied (in order):
+ 1. Extend captions that are too short to meet MIN_DURATION.
+ 2. Trim captions that overlap the next one after the fix above.
+ 3. Enforce a minimum gap between consecutive captions.
+
+The optimizer never changes caption *text* — it only adjusts timestamps.
+"""
+
+from __future__ import annotations
+
+import copy
+
+from .models import Caption
+from .validator import MIN_DURATION_SECONDS, MIN_GAP_SECONDS
+
+
+def optimize(captions: list[Caption]) -> list[Caption]:
+    """
+    Return a new list of Caption objects with timing violations corrected.
+
+    Two passes run over a deep copy, so the original list is not mutated:
+
+    1. Captions shorter than ``MIN_DURATION_SECONDS`` have their end time
+       pushed out to ``start + MIN_DURATION_SECONDS``.
+    2. Each caption's end is pulled back to ``next.start - MIN_GAP_SECONDS``
+       when it would overlap or crowd the next caption.
+
+    A caption is never trimmed to a zero or negative duration. When the next
+    caption starts too early for any valid end time, the caption keeps its
+    original end so a human can review it. Only ``end`` times change; text,
+    indices and start times are left alone.
+    """
+    if not captions:
+        return []
+
+    fixed = copy.deepcopy(captions)
+
+    # Pass 1: extend captions that are too short
+    for cap in fixed:
+        if cap.duration < MIN_DURATION_SECONDS:
+            cap.end = cap.start + MIN_DURATION_SECONDS
+
+    # Pass 2: resolve overlaps and enforce minimum gap (iterate forward)
+    for original, cap, nxt in zip(captions, fixed, fixed[1:]):
+        max_end = nxt.start - MIN_GAP_SECONDS
+        if cap.end <= max_end:
+            continue
+        if max_end > cap.start:
+            # If clipping makes this caption too short, leave it — a human
+            # should review captions that can't satisfy both constraints.
+            cap.end = max_end
+        else:
+            # Trimming would put the end at or before the start and produce
+            # an invalid cue. Undo any pass-1 extension instead.
+            cap.end = original.end
+
+    return fixed
+
+
+def write_srt(captions: list[Caption]) -> str:
+    """
+    Serialise a list of Caption objects back to SRT format.
+
+    Each caption becomes an "index / time range / text" block; blocks are
+    separated by one blank line and the result ends with a newline.
+    """
+    rendered = [
+        f"{cap.index}\n{_fmt_ts(cap.start)} --> {_fmt_ts(cap.end)}\n{cap.text}"
+        for cap in captions
+    ]
+    body = "\n\n".join(rendered)
+    return body + "\n"
+
+
+def _fmt_ts(seconds: float) -> str:
+    """
+    Format a time in seconds as an SRT timestamp ``HH:MM:SS,mmm``.
+
+    The value is rounded to whole milliseconds first and then split into
+    fields, so a fraction that rounds up carries into the seconds (1.9996
+    becomes ``00:00:02,000``, never ``00:00:01,1000``). Negative input is
+    clamped to zero because SRT cannot represent negative times.
+    """
+    total_ms = max(0, round(seconds * 1000))
+    total_seconds, ms = divmod(total_ms, 1000)
+    total_minutes, s = divmod(total_seconds, 60)
+    h, m = divmod(total_minutes, 60)
+    return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"
diff --git a/cc_quality/validator.py b/cc_quality/validator.py
new file mode 100644
index 0000000..0245d85
--- /dev/null
+++ b/cc_quality/validator.py
@@ -0,0 +1,232 @@
+"""
+Accessibility standards validator for generated SRT/SLS caption files.
+
+Checks output against:
+ - WCAG 2.1 Success Criterion 1.2.2 (Captions)
+ - FCC caption quality standards (47 CFR § 79.1)
+ - BBC Subtitle Guidelines (2024)
+
+Rules enforced:
+ MIN_DURATION — caption must be on-screen long enough to read
+ READING_SPEED — text must not scroll faster than viewers can follow
+ LINE_LENGTH — lines must not overflow a standard video frame
+ OVERLAP — captions must not collide in time
+ MIN_GAP — a brief pause between captions prevents visual blur
+"""
+
+from __future__ import annotations
+
+import re
+
+from .models import Caption, ValidationReport, Violation
+
+# ── Standards constants ───────────────────────────────────────────────────────
+
+# FCC § 79.1(j)(1): reading rate limits
+# NOTE(review): confirm this citation — verify that 47 CFR § 79.1 actually
+# prescribes numeric words-per-minute limits before quoting it in reports.
+# Values are words per minute.
+MAX_WPM_CHILDREN = 130
+MAX_WPM_ADULT = 220
+
+# BBC Subtitle Guidelines 2024: minimum on-screen time
+# (seconds; compared against Caption.duration)
+MIN_DURATION_SECONDS = 1.5
+
+# BBC: maximum characters per line (42 for Latin, ~28 for Devanagari due to
+# wider glyphs and conjuncts consuming more visual width)
+MAX_LINE_CHARS_LATIN = 42
+MAX_LINE_CHARS_DEVANAGARI = 28
+
+# ~2 frames at 24 fps — avoids a "flash" gap that looks like a glitch
+# (seconds; 2 / 24 ≈ 0.083)
+MIN_GAP_SECONDS = 0.083
+
+# Penalty weights for quality score calculation
+# (points per error / per warning, before validate() averages over captions)
+_PENALTY_ERROR = 10
+_PENALTY_WARNING = 3
+
+
+# ── SRT parsing ───────────────────────────────────────────────────────────────
+
+# HH:MM:SS with an optional fractional part introduced by ',' or '.'.
+_TIMESTAMP_RE = re.compile(r"(\d+):(\d+):(\d+)(?:[.,](\d+))?")
+
+
+def _parse_timestamp(ts: str) -> float:
+    """
+    Convert 'HH:MM:SS,mmm' or 'HH:MM:SS.mmm' to seconds.
+
+    The fractional part is scaled by its own digit count, so '01.5' means
+    1.5 s rather than 1.005 s, and it may be omitted ('00:00:07').
+    Surrounding whitespace is ignored.
+
+    Raises
+    ------
+    ValueError if *ts* is not a timestamp of that shape.
+    """
+    match = _TIMESTAMP_RE.fullmatch(ts.strip())
+    if match is None:
+        raise ValueError(f"invalid SRT timestamp: {ts!r}")
+    hours, minutes, secs, frac = match.groups()
+    fraction = int(frac) / 10 ** len(frac) if frac else 0.0
+    return int(hours) * 3600 + int(minutes) * 60 + int(secs) + fraction
+
+
+def parse_srt(content: str) -> list[Caption]:
+    """
+    Parse SRT file text into a list of Caption objects.
+
+    A leading byte-order mark is dropped and CRLF / CR line endings are
+    normalised. Blocks are separated by one or more blank lines, which may
+    contain spaces or tabs. A block needs an integer index line, a
+    'start --> end' line and at least one text line; blocks that do not fit
+    are skipped rather than raising, so one bad cue does not lose the file.
+    """
+    captions: list[Caption] = []
+
+    # Without this a BOM makes int() fail on the first index line, and the
+    # first caption is silently dropped.
+    normalized = content.lstrip("\ufeff").replace("\r\n", "\n").replace("\r", "\n")
+    blocks = re.split(r"\n\s*\n", normalized.strip())
+
+    for block in blocks:
+        lines = block.strip().splitlines()
+        if len(lines) < 3:
+            continue
+        try:
+            index = int(lines[0].strip())
+            start_str, end_str = lines[1].split("-->")
+            start = _parse_timestamp(start_str)
+            end = _parse_timestamp(end_str)
+        except (ValueError, IndexError):
+            # Deliberate best-effort: skip the malformed block.
+            continue
+        text = "\n".join(lines[2:]).strip()
+        captions.append(Caption(index=index, start=start, end=end, text=text))
+
+    return captions
+
+
+# ── Text helpers ──────────────────────────────────────────────────────────────
+
+# Matches angle-bracket tags (<i>, </b>, ...) and brace-delimited tags ({...}).
+_TAG_RE = re.compile(r"<[^>]+>|\{[^}]+\}")
+
+
+def _strip_tags(text: str) -> str:
+    """Return *text* with every span matched by ``_TAG_RE`` removed."""
+    without_markup = re.sub(_TAG_RE, "", text)
+    return without_markup
+
+
+def _word_count(text: str) -> int:
+    """Count whitespace-separated words in *text* after markup tags are removed."""
+    words = _strip_tags(text).split()
+    return len(words)
+
+
+def _is_devanagari(text: str) -> bool:
+    """Return True when *text* contains at least one character in the range ऀ–ॿ."""
+    first_hit = re.search(r"[ऀ-ॿ]", text)
+    return first_hit is not None
+
+
+def _max_line_chars(text: str) -> int:
+    """Return the length of the longest line of *text* once tags are removed (0 if none)."""
+    longest = 0
+    for line in _strip_tags(text).splitlines():
+        longest = max(longest, len(line))
+    return longest
+
+
+# ── Validation rules ──────────────────────────────────────────────────────────
+
+def _check_min_duration(cap: Caption) -> Violation | None:
+    """
+    Return a MIN_DURATION error when *cap* is shown for less than
+    ``MIN_DURATION_SECONDS``, otherwise ``None``.
+
+    The comparison allows half a millisecond of slack. SRT timestamps have
+    1 ms resolution, and binary floating point can make an exact 1.500 s
+    caption (e.g. 2.600 → 4.100) evaluate to 1.4999999999999996.
+    """
+    tolerance = 0.0005
+    if cap.duration >= MIN_DURATION_SECONDS - tolerance:
+        return None
+    return Violation(
+        caption_index=cap.index,
+        start_time=cap.start,
+        rule="MIN_DURATION",
+        severity="error",
+        detail=(
+            f"Caption displays for {cap.duration:.2f}s "
+            f"(minimum {MIN_DURATION_SECONDS}s per BBC guidelines)"
+        ),
+        suggested_fix=f"Extend end time to {cap.start + MIN_DURATION_SECONDS:.3f}",
+    )
+
+
+def _check_reading_speed(cap: Caption, max_wpm: int) -> Violation | None:
+    """
+    Return a READING_SPEED error when *cap* exceeds *max_wpm* words per
+    minute, otherwise ``None``.
+
+    Captions with no words or a non-positive duration are not rated.
+    """
+    word_total = _word_count(cap.text)
+    seconds = cap.duration
+    if seconds <= 0 or word_total == 0:
+        return None
+
+    wpm = (word_total / seconds) * 60
+    if wpm > max_wpm:
+        message = f"Reading speed {wpm:.0f} WPM exceeds FCC limit of {max_wpm} WPM"
+        return Violation(
+            caption_index=cap.index,
+            start_time=cap.start,
+            rule="READING_SPEED",
+            severity="error",
+            detail=message,
+            suggested_fix="Extend display duration or shorten the caption text",
+        )
+    return None
+
+
+def _check_line_length(cap: Caption) -> Violation | None:
+    """
+    Return a LINE_LENGTH warning when the longest line of *cap* exceeds the
+    per-script character limit, otherwise ``None``.
+
+    Text containing any Devanagari character uses the Devanagari limit; all
+    other text uses the Latin limit.
+    """
+    if _is_devanagari(cap.text):
+        limit = MAX_LINE_CHARS_DEVANAGARI
+    else:
+        limit = MAX_LINE_CHARS_LATIN
+
+    longest = _max_line_chars(cap.text)
+    if longest <= limit:
+        return None
+
+    return Violation(
+        caption_index=cap.index,
+        start_time=cap.start,
+        rule="LINE_LENGTH",
+        severity="warning",
+        detail=f"Longest line is {longest} chars; BBC guideline is ≤{limit} chars",
+        suggested_fix="Break caption into two lines or shorten the text",
+    )
+
+
+def _check_overlap(cap: Caption, next_cap: Caption) -> Violation | None:
+    """
+    Compare *cap* with the caption that follows it.
+
+    Returns an OVERLAP error when *cap* ends after *next_cap* starts, a
+    MIN_GAP warning when the gap between them is under ``MIN_GAP_SECONDS``,
+    and ``None`` otherwise. The violation is attributed to *cap*.
+
+    Both comparisons allow half a millisecond of slack. SRT timestamps have
+    1 ms resolution, and float subtraction can report an exact 0.083 s gap
+    as 0.08299999999999 and raise a spurious warning.
+    """
+    tolerance = 0.0005
+    gap = next_cap.start - cap.end
+    if gap < -tolerance:
+        return Violation(
+            caption_index=cap.index,
+            start_time=cap.start,
+            rule="OVERLAP",
+            severity="error",
+            detail=f"Caption overlaps the next by {-gap:.3f}s",
+            suggested_fix=(
+                f"Shorten end time to {next_cap.start - MIN_GAP_SECONDS:.3f}"
+            ),
+        )
+    if gap < MIN_GAP_SECONDS - tolerance:
+        return Violation(
+            caption_index=cap.index,
+            start_time=cap.start,
+            rule="MIN_GAP",
+            severity="warning",
+            detail=(
+                f"Gap to next caption ({gap:.3f}s) is less than "
+                f"{MIN_GAP_SECONDS:.3f}s — may look like a continuous caption"
+            ),
+            suggested_fix=(
+                f"Reduce end time to {next_cap.start - MIN_GAP_SECONDS:.3f}"
+            ),
+        )
+    return None
+
+
+# ── Public API ────────────────────────────────────────────────────────────────
+
+def validate(captions: list[Caption], content_type: str = "adult") -> ValidationReport:
+    """
+    Run every accessibility rule over *captions* and collect the results.
+
+    Parameters
+    ----------
+    captions: Parsed list of Caption objects.
+    content_type: 'children' selects the children's WPM limit; any other
+        value (default 'adult') selects the adult limit.
+
+    Returns
+    -------
+    ValidationReport with violations and quality_score (0–100). Violations
+    are ordered by caption: duration, reading speed, line length, then the
+    overlap/gap check against the following caption.
+    """
+    wpm_limit = MAX_WPM_ADULT if content_type != "children" else MAX_WPM_CHILDREN
+    report = ValidationReport(caption_count=len(captions))
+    last_position = len(captions) - 1
+
+    for position, cap in enumerate(captions):
+        findings = [
+            _check_min_duration(cap),
+            _check_reading_speed(cap, wpm_limit),
+            _check_line_length(cap),
+        ]
+        # The final caption has no successor to compare against.
+        if position < last_position:
+            findings.append(_check_overlap(cap, captions[position + 1]))
+        report.violations.extend(found for found in findings if found)
+
+    # Quality score: start at 100, subtract weighted penalties per caption
+    if report.caption_count:
+        weighted = (
+            len(report.errors()) * _PENALTY_ERROR
+            + len(report.warnings()) * _PENALTY_WARNING
+        )
+        raw_penalty = weighted / report.caption_count
+        report.quality_score = max(0.0, 100.0 - raw_penalty * 10)
+
+    return report
+
+
+def validate_file(path: str, content_type: str = "adult") -> ValidationReport:
+    """
+    Convenience wrapper: read an SRT file from disk and validate it.
+
+    Parameters
+    ----------
+    path: Location of the SRT file; read as UTF-8, with or without a BOM.
+    content_type: Passed through to ``validate``.
+
+    Returns
+    -------
+    The ValidationReport produced by ``validate`` for the parsed captions.
+    """
+    from pathlib import Path
+
+    # "utf-8-sig" decodes plain UTF-8 as well, but drops a leading BOM that
+    # would otherwise make the first caption's index line unparseable.
+    content = Path(path).read_text(encoding="utf-8-sig")
+    captions = parse_srt(content)
+    return validate(captions, content_type=content_type)
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..8aa000d
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,17 @@
+[build-system]
+requires = ["setuptools>=68"]
+# setuptools' PEP 517 backend lives in setuptools.build_meta;
+# "setuptools.backends.legacy:build" is not importable, so builds fail.
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "cc-quality"
+version = "0.1.0"
+description = "Accessibility standards validator for generated SRT/SLS caption files"
+readme = "README.md"
+requires-python = ">=3.10"
+dependencies = []
+
+[project.scripts]
+cc-quality = "cc_quality.cli:main"
+
+[tool.pytest.ini_options]
+testpaths = ["tests"]
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/fixtures/good.srt b/tests/fixtures/good.srt
new file mode 100644
index 0000000..dcdfefc
--- /dev/null
+++ b/tests/fixtures/good.srt
@@ -0,0 +1,11 @@
+1
+00:00:02,000 --> 00:00:05,000
+[Loud bang]
+
+2
+00:00:07,000 --> 00:00:10,500
+[Glass shatters]
+
+3
+00:00:12,000 --> 00:00:15,000
+[Crowd cheers]
diff --git a/tests/fixtures/hindi.srt b/tests/fixtures/hindi.srt
new file mode 100644
index 0000000..3cdcb8a
--- /dev/null
+++ b/tests/fixtures/hindi.srt
@@ -0,0 +1,11 @@
+1
+00:00:02,000 --> 00:00:05,000
+[ज़ोरदार धमाका]
+
+2
+00:00:07,000 --> 00:00:10,000
+[तालियाँ]
+
+3
+00:00:12,000 --> 00:00:15,000
+[शीशा टूटने की आवाज़]
diff --git a/tests/fixtures/violations.srt b/tests/fixtures/violations.srt
new file mode 100644
index 0000000..21b4c58
--- /dev/null
+++ b/tests/fixtures/violations.srt
@@ -0,0 +1,15 @@
+1
+00:00:01,000 --> 00:00:01,800
+[Bang]
+
+2
+00:00:03,000 --> 00:00:03,500
+This caption scrolls extremely fast with many many many many many words crammed in
+
+3
+00:00:05,000 --> 00:00:07,500
+[Crowd applause]
+
+4
+00:00:07,500 --> 00:00:10,000
+[Whistle]
diff --git a/tests/test_optimizer.py b/tests/test_optimizer.py
new file mode 100644
index 0000000..210e7d9
--- /dev/null
+++ b/tests/test_optimizer.py
@@ -0,0 +1,79 @@
+"""Tests for cc_quality.optimizer."""
+
+from __future__ import annotations
+
+import pytest
+
+from cc_quality.models import Caption
+from cc_quality.optimizer import optimize, write_srt
+from cc_quality.validator import MIN_DURATION_SECONDS, MIN_GAP_SECONDS
+
+
+class TestOptimize:
+    """Behaviour of optimize(): copying, extension, overlap and gap handling."""
+
+    def test_returns_new_list(self):
+        caps = [Caption(1, 0.0, 2.0, "[Bang]")]
+        result = optimize(caps)
+        assert result is not caps
+
+    def test_does_not_mutate_originals(self):
+        caps = [Caption(1, 0.0, 0.5, "[Bang]")]
+        optimize(caps)
+        assert caps[0].end == pytest.approx(0.5)
+
+    def test_extends_short_caption(self):
+        caps = [Caption(1, 1.0, 1.5, "[Bang]")]  # 0.5s — too short
+        result = optimize(caps)
+        assert result[0].end >= result[0].start + MIN_DURATION_SECONDS
+
+    def test_resolves_overlap(self):
+        caps = [
+            Caption(1, 0.0, 5.0, "[First]"),
+            Caption(2, 3.0, 6.0, "[Second]"),  # starts before first ends
+        ]
+        result = optimize(caps)
+        assert result[0].end <= result[1].start
+
+    def test_enforces_minimum_gap(self):
+        caps = [
+            Caption(1, 0.0, 3.0, "[First]"),
+            Caption(2, 3.01, 5.0, "[Second]"),  # gap = 0.01s < MIN_GAP
+        ]
+        result = optimize(caps)
+        gap = result[1].start - result[0].end
+        # The first caption's end is trimmed back until the gap is restored;
+        # the small allowance absorbs float rounding in the subtraction.
+        assert result[0].end <= result[1].start
+        assert gap >= MIN_GAP_SECONDS - 1e-9
+
+    def test_empty_list(self):
+        assert optimize([]) == []
+
+    def test_single_caption(self):
+        caps = [Caption(1, 0.0, 0.8, "[Bang]")]
+        result = optimize(caps)
+        assert len(result) == 1
+        assert result[0].end >= result[0].start + MIN_DURATION_SECONDS
+
+
+class TestWriteSrt:
+    """Serialisation of captions back to SRT text."""
+
+    def test_basic_round_trip(self):
+        rendered = write_srt([Caption(1, 1.0, 3.5, "[Bang]")])
+        assert "00:00:01,000 --> 00:00:03,500" in rendered
+        assert "[Bang]" in rendered
+
+    def test_index_in_output(self):
+        rendered = write_srt([Caption(42, 0.0, 2.0, "[Test]")])
+        assert rendered.startswith("42\n")
+
+    def test_multiple_captions_separated_by_blank_line(self):
+        first = Caption(1, 0.0, 2.0, "[First]")
+        second = Caption(2, 3.0, 5.0, "[Second]")
+        rendered = write_srt([first, second])
+        assert "\n\n" in rendered
+
+    def test_hindi_text_preserved(self):
+        rendered = write_srt([Caption(1, 0.0, 3.0, "[तालियाँ]")])
+        assert "[तालियाँ]" in rendered
diff --git a/tests/test_validator.py b/tests/test_validator.py
new file mode 100644
index 0000000..94f3df6
--- /dev/null
+++ b/tests/test_validator.py
@@ -0,0 +1,230 @@
+"""Tests for cc_quality.validator — covers all accessibility rules."""
+
+from __future__ import annotations
+
+import textwrap
+
+import pytest
+
+from cc_quality.models import Caption
+from cc_quality.validator import (
+ MAX_WPM_ADULT,
+ MAX_WPM_CHILDREN,
+ MIN_DURATION_SECONDS,
+ MIN_GAP_SECONDS,
+ _is_devanagari,
+ _word_count,
+ parse_srt,
+ validate,
+)
+
+
+# ── parse_srt ─────────────────────────────────────────────────────────────────
+
+class TestParseSrt:
+    """Parsing of SRT text into Caption objects."""
+
+    def test_parses_basic_srt(self):
+        source = textwrap.dedent("""\
+            1
+            00:00:01,000 --> 00:00:03,000
+            [Bang]
+
+            2
+            00:00:05,000 --> 00:00:08,000
+            [Applause]
+        """)
+        parsed = parse_srt(source)
+        assert len(parsed) == 2
+        first = parsed[0]
+        assert first.index == 1
+        assert first.start == 1.0
+        assert first.end == 3.0
+        assert first.text == "[Bang]"
+
+    def test_parses_multiline_text(self):
+        parsed = parse_srt("1\n00:00:01,000 --> 00:00:04,000\nLine one\nLine two\n")
+        assert "Line one" in parsed[0].text
+        assert "Line two" in parsed[0].text
+
+    def test_parses_dot_separator(self):
+        parsed = parse_srt("1\n00:00:01.000 --> 00:00:03.500\n[Test]\n")
+        assert parsed[0].end == pytest.approx(3.5)
+
+    def test_ignores_malformed_blocks(self):
+        parsed = parse_srt("bad block\n\n1\n00:00:01,000 --> 00:00:03,000\n[OK]\n")
+        assert len(parsed) == 1
+
+    def test_empty_input(self):
+        assert parse_srt("") == []
+
+    def test_parses_hindi_text(self):
+        parsed = parse_srt("1\n00:00:02,000 --> 00:00:05,000\n[तालियाँ]\n")
+        assert parsed[0].text == "[तालियाँ]"
+
+
+# ── Text helpers ──────────────────────────────────────────────────────────────
+
+class TestTextHelpers:
+    """Word counting and script detection helpers."""
+
+    def test_word_count_plain(self):
+        assert _word_count("hello world") == 2
+
+    def test_word_count_strips_tags(self):
+        # The input must contain markup, otherwise this only repeats the
+        # plain-text case and never exercises tag stripping.
+        assert _word_count("<i>hello</i> <b>world</b>") == 2
+        assert _word_count("{\\an8}hello world") == 2
+
+    def test_is_devanagari_true(self):
+        assert _is_devanagari("तालियाँ") is True
+
+    def test_is_devanagari_false(self):
+        assert _is_devanagari("[Applause]") is False
+
+    def test_is_devanagari_mixed(self):
+        assert _is_devanagari("Sound: धमाका") is True
+
+
+# ── MIN_DURATION rule ─────────────────────────────────────────────────────────
+
+class TestMinDuration:
+    """MIN_DURATION: captions shorter than the minimum are errors."""
+
+    def _cap(self, duration: float) -> list[Caption]:
+        """Build a one-caption list starting at 0 and lasting *duration* seconds."""
+        single = Caption(index=1, start=0.0, end=duration, text="[Bang]")
+        return [single]
+
+    def test_passes_at_minimum(self):
+        assert validate(self._cap(MIN_DURATION_SECONDS)).passed()
+
+    def test_fails_below_minimum(self):
+        report = validate(self._cap(0.8))
+        hits = [v for v in report.violations if v.rule == "MIN_DURATION"]
+        assert len(hits) == 1
+        assert hits[0].severity == "error"
+
+    def test_error_contains_suggested_fix(self):
+        report = validate(self._cap(0.5))
+        hit = next(v for v in report.violations if v.rule == "MIN_DURATION")
+        assert hit.suggested_fix != ""
+
+
+# ── READING_SPEED rule ────────────────────────────────────────────────────────
+
+class TestReadingSpeed:
+    """READING_SPEED: words-per-minute limits for adult and children's content."""
+
+    def _cap(self, text: str, duration: float) -> list[Caption]:
+        """Build a one-caption list with *text* shown for *duration* seconds."""
+        single = Caption(index=1, start=0.0, end=duration, text=text)
+        return [single]
+
+    @staticmethod
+    def _speed_hits(report):
+        """Return only the READING_SPEED violations of *report*."""
+        return [v for v in report.violations if v.rule == "READING_SPEED"]
+
+    def test_passes_within_adult_limit(self):
+        # 3 words in 3 seconds = 60 WPM — well within 220
+        report = validate(self._cap("one two three", 3.0))
+        assert self._speed_hits(report) == []
+
+    def test_fails_above_adult_limit(self):
+        # 50 words in 5 seconds = 600 WPM
+        fifty_words = " ".join(["word"] * 50)
+        report = validate(self._cap(fifty_words, 5.0))
+        assert len(self._speed_hits(report)) == 1
+
+    def test_children_limit_is_stricter(self):
+        # 14 words in 4 seconds = 210 WPM — above children limit (130) but under adult (220)
+        fourteen_words = " ".join(["word"] * 14)
+        adult_report = validate(self._cap(fourteen_words, 4.0), content_type="adult")
+        child_report = validate(self._cap(fourteen_words, 4.0), content_type="children")
+        assert self._speed_hits(adult_report) == []
+        assert len(self._speed_hits(child_report)) == 1
+
+
+# ── LINE_LENGTH rule ──────────────────────────────────────────────────────────
+
+class TestLineLength:
+    """LINE_LENGTH: per-script character limits produce warnings."""
+
+    @staticmethod
+    def _length_hits(text: str, end: float):
+        """Validate one caption of *text* and return its LINE_LENGTH violations."""
+        report = validate([Caption(1, 0.0, end, text)])
+        return [v for v in report.violations if v.rule == "LINE_LENGTH"]
+
+    def test_passes_short_latin(self):
+        assert self._length_hits("[Bang]", 3.0) == []
+
+    def test_warns_long_latin(self):
+        # 50 characters exceeds the 42-char BBC limit
+        hits = self._length_hits("A" * 50, 5.0)
+        assert len(hits) == 1
+        assert hits[0].severity == "warning"
+
+    def test_devanagari_uses_tighter_limit(self):
+        # 30 Devanagari chars — exceeds 28-char limit
+        hits = self._length_hits("क" * 30, 5.0)
+        assert len(hits) == 1
+
+
+# ── OVERLAP / MIN_GAP rules ───────────────────────────────────────────────────
+
+class TestGapRules:
+    """OVERLAP and MIN_GAP: timing between consecutive captions."""
+
+    def _two_caps(self, end_a: float, start_b: float) -> list[Caption]:
+        """Build two captions: the first ends at *end_a*, the second starts at *start_b*."""
+        first = Caption(1, 0.0, end_a, "[First]")
+        second = Caption(2, start_b, start_b + 2.0, "[Second]")
+        return [first, second]
+
+    def test_passes_clean_gap(self):
+        report = validate(self._two_caps(3.0, 5.0))
+        timing_hits = [v for v in report.violations if v.rule in ("OVERLAP", "MIN_GAP")]
+        assert timing_hits == []
+
+    def test_detects_overlap(self):
+        # The first caption runs 1 second into the second
+        report = validate(self._two_caps(5.0, 4.0))
+        hits = [v for v in report.violations if v.rule == "OVERLAP"]
+        assert len(hits) == 1
+        assert hits[0].severity == "error"
+
+    def test_warns_gap_below_minimum(self):
+        half_gap = MIN_GAP_SECONDS / 2
+        report = validate(self._two_caps(3.0, 3.0 + half_gap))
+        hits = [v for v in report.violations if v.rule == "MIN_GAP"]
+        assert len(hits) == 1
+        assert hits[0].severity == "warning"
+
+
+# ── Quality score ─────────────────────────────────────────────────────────────
+
+class TestQualityScore:
+    """Quality score: 100 for clean input, lower with violations, never negative."""
+
+    def test_perfect_captions_score_100(self):
+        report = validate([Caption(1, 0.0, 3.0, "[Bang]")])
+        assert report.quality_score == pytest.approx(100.0)
+
+    def test_score_decreases_with_violations(self):
+        too_short = Caption(1, 0.0, 0.5, "[X]")
+        report = validate([too_short])
+        assert report.quality_score < 100.0
+
+    def test_score_bounded_at_zero(self):
+        # Many violations should not produce a negative score
+        noisy = []
+        for i in range(10):
+            noisy.append(Caption(i, float(i), float(i) + 0.1, "A" * 100))
+        report = validate(noisy)
+        assert report.quality_score >= 0.0
+
+
+# ── validate_file ─────────────────────────────────────────────────────────────
+
+class TestValidateFile:
+    """validate_file(): reads an SRT file from disk and validates it."""
+
+    def test_validates_good_fixture(self, tmp_path):
+        from cc_quality.validator import validate_file
+
+        target = tmp_path / "good.srt"
+        payload = "1\n00:00:02,000 --> 00:00:05,000\n[Bang]\n"
+        target.write_text(payload, encoding="utf-8")
+        assert validate_file(str(target)).caption_count == 1
+
+    def test_validates_hindi_fixture(self, tmp_path):
+        from cc_quality.validator import validate_file
+
+        target = tmp_path / "hindi.srt"
+        payload = "1\n00:00:02,000 --> 00:00:05,000\n[तालियाँ]\n"
+        target.write_text(payload, encoding="utf-8")
+        assert validate_file(str(target)).caption_count == 1